1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 /* Copyright (c) 1990 Mentat Inc. 
*/ 27 28 #pragma ident "%Z%%M% %I% %E% SMI" 29 30 #include <sys/types.h> 31 #include <sys/stream.h> 32 #include <sys/dlpi.h> 33 #include <sys/stropts.h> 34 #include <sys/sysmacros.h> 35 #include <sys/strsubr.h> 36 #include <sys/strlog.h> 37 #include <sys/strsun.h> 38 #include <sys/zone.h> 39 #define _SUN_TPI_VERSION 2 40 #include <sys/tihdr.h> 41 #include <sys/xti_inet.h> 42 #include <sys/ddi.h> 43 #include <sys/sunddi.h> 44 #include <sys/cmn_err.h> 45 #include <sys/debug.h> 46 #include <sys/kobj.h> 47 #include <sys/modctl.h> 48 #include <sys/atomic.h> 49 #include <sys/policy.h> 50 #include <sys/priv.h> 51 52 #include <sys/systm.h> 53 #include <sys/param.h> 54 #include <sys/kmem.h> 55 #include <sys/socket.h> 56 #include <sys/vtrace.h> 57 #include <sys/isa_defs.h> 58 #include <net/if.h> 59 #include <net/if_arp.h> 60 #include <net/route.h> 61 #include <sys/sockio.h> 62 #include <netinet/in.h> 63 #include <net/if_dl.h> 64 65 #include <inet/common.h> 66 #include <inet/mi.h> 67 #include <inet/mib2.h> 68 #include <inet/nd.h> 69 #include <inet/arp.h> 70 #include <inet/snmpcom.h> 71 #include <inet/kstatcom.h> 72 73 #include <netinet/igmp_var.h> 74 #include <netinet/ip6.h> 75 #include <netinet/icmp6.h> 76 #include <netinet/sctp.h> 77 78 #include <inet/ip.h> 79 #include <inet/ip_impl.h> 80 #include <inet/ip6.h> 81 #include <inet/ip6_asp.h> 82 #include <inet/tcp.h> 83 #include <inet/tcp_impl.h> 84 #include <inet/ip_multi.h> 85 #include <inet/ip_if.h> 86 #include <inet/ip_ire.h> 87 #include <inet/ip_rts.h> 88 #include <inet/optcom.h> 89 #include <inet/ip_ndp.h> 90 #include <inet/ip_listutils.h> 91 #include <netinet/igmp.h> 92 #include <netinet/ip_mroute.h> 93 #include <inet/ipp_common.h> 94 95 #include <net/pfkeyv2.h> 96 #include <inet/ipsec_info.h> 97 #include <inet/sadb.h> 98 #include <inet/ipsec_impl.h> 99 #include <sys/iphada.h> 100 #include <inet/tun.h> 101 #include <inet/ipdrop.h> 102 103 #include <sys/ethernet.h> 104 #include <net/if_types.h> 105 #include <sys/cpuvar.h> 
106 107 #include <ipp/ipp.h> 108 #include <ipp/ipp_impl.h> 109 #include <ipp/ipgpc/ipgpc.h> 110 111 #include <sys/multidata.h> 112 #include <sys/pattr.h> 113 114 #include <inet/ipclassifier.h> 115 #include <inet/sctp_ip.h> 116 #include <inet/udp_impl.h> 117 118 #include <sys/tsol/label.h> 119 #include <sys/tsol/tnet.h> 120 121 #include <rpc/pmap_prot.h> 122 123 /* 124 * Values for squeue switch: 125 * IP_SQUEUE_ENTER_NODRAIN: squeue_enter_nodrain 126 * IP_SQUEUE_ENTER: squeue_enter 127 * IP_SQUEUE_FILL: squeue_fill 128 */ 129 int ip_squeue_enter = 2; 130 squeue_func_t ip_input_proc; 131 /* 132 * IP statistics. 133 */ 134 #define IP_STAT(x) (ip_statistics.x.value.ui64++) 135 #define IP_STAT_UPDATE(x, n) (ip_statistics.x.value.ui64 += (n)) 136 137 typedef struct ip_stat { 138 kstat_named_t ipsec_fanout_proto; 139 kstat_named_t ip_udp_fannorm; 140 kstat_named_t ip_udp_fanmb; 141 kstat_named_t ip_udp_fanothers; 142 kstat_named_t ip_udp_fast_path; 143 kstat_named_t ip_udp_slow_path; 144 kstat_named_t ip_udp_input_err; 145 kstat_named_t ip_tcppullup; 146 kstat_named_t ip_tcpoptions; 147 kstat_named_t ip_multipkttcp; 148 kstat_named_t ip_tcp_fast_path; 149 kstat_named_t ip_tcp_slow_path; 150 kstat_named_t ip_tcp_input_error; 151 kstat_named_t ip_db_ref; 152 kstat_named_t ip_notaligned1; 153 kstat_named_t ip_notaligned2; 154 kstat_named_t ip_multimblk3; 155 kstat_named_t ip_multimblk4; 156 kstat_named_t ip_ipoptions; 157 kstat_named_t ip_classify_fail; 158 kstat_named_t ip_opt; 159 kstat_named_t ip_udp_rput_local; 160 kstat_named_t ipsec_proto_ahesp; 161 kstat_named_t ip_conn_flputbq; 162 kstat_named_t ip_conn_walk_drain; 163 kstat_named_t ip_out_sw_cksum; 164 kstat_named_t ip_in_sw_cksum; 165 kstat_named_t ip_trash_ire_reclaim_calls; 166 kstat_named_t ip_trash_ire_reclaim_success; 167 kstat_named_t ip_ire_arp_timer_expired; 168 kstat_named_t ip_ire_redirect_timer_expired; 169 kstat_named_t ip_ire_pmtu_timer_expired; 170 kstat_named_t ip_input_multi_squeue; 171 
kstat_named_t ip_tcp_in_full_hw_cksum_err; 172 kstat_named_t ip_tcp_in_part_hw_cksum_err; 173 kstat_named_t ip_tcp_in_sw_cksum_err; 174 kstat_named_t ip_tcp_out_sw_cksum_bytes; 175 kstat_named_t ip_udp_in_full_hw_cksum_err; 176 kstat_named_t ip_udp_in_part_hw_cksum_err; 177 kstat_named_t ip_udp_in_sw_cksum_err; 178 kstat_named_t ip_udp_out_sw_cksum_bytes; 179 kstat_named_t ip_frag_mdt_pkt_out; 180 kstat_named_t ip_frag_mdt_discarded; 181 kstat_named_t ip_frag_mdt_allocfail; 182 kstat_named_t ip_frag_mdt_addpdescfail; 183 kstat_named_t ip_frag_mdt_allocd; 184 } ip_stat_t; 185 186 static ip_stat_t ip_statistics = { 187 { "ipsec_fanout_proto", KSTAT_DATA_UINT64 }, 188 { "ip_udp_fannorm", KSTAT_DATA_UINT64 }, 189 { "ip_udp_fanmb", KSTAT_DATA_UINT64 }, 190 { "ip_udp_fanothers", KSTAT_DATA_UINT64 }, 191 { "ip_udp_fast_path", KSTAT_DATA_UINT64 }, 192 { "ip_udp_slow_path", KSTAT_DATA_UINT64 }, 193 { "ip_udp_input_err", KSTAT_DATA_UINT64 }, 194 { "ip_tcppullup", KSTAT_DATA_UINT64 }, 195 { "ip_tcpoptions", KSTAT_DATA_UINT64 }, 196 { "ip_multipkttcp", KSTAT_DATA_UINT64 }, 197 { "ip_tcp_fast_path", KSTAT_DATA_UINT64 }, 198 { "ip_tcp_slow_path", KSTAT_DATA_UINT64 }, 199 { "ip_tcp_input_error", KSTAT_DATA_UINT64 }, 200 { "ip_db_ref", KSTAT_DATA_UINT64 }, 201 { "ip_notaligned1", KSTAT_DATA_UINT64 }, 202 { "ip_notaligned2", KSTAT_DATA_UINT64 }, 203 { "ip_multimblk3", KSTAT_DATA_UINT64 }, 204 { "ip_multimblk4", KSTAT_DATA_UINT64 }, 205 { "ip_ipoptions", KSTAT_DATA_UINT64 }, 206 { "ip_classify_fail", KSTAT_DATA_UINT64 }, 207 { "ip_opt", KSTAT_DATA_UINT64 }, 208 { "ip_udp_rput_local", KSTAT_DATA_UINT64 }, 209 { "ipsec_proto_ahesp", KSTAT_DATA_UINT64 }, 210 { "ip_conn_flputbq", KSTAT_DATA_UINT64 }, 211 { "ip_conn_walk_drain", KSTAT_DATA_UINT64 }, 212 { "ip_out_sw_cksum", KSTAT_DATA_UINT64 }, 213 { "ip_in_sw_cksum", KSTAT_DATA_UINT64 }, 214 { "ip_trash_ire_reclaim_calls", KSTAT_DATA_UINT64 }, 215 { "ip_trash_ire_reclaim_success", KSTAT_DATA_UINT64 }, 216 { "ip_ire_arp_timer_expired", 
KSTAT_DATA_UINT64 }, 217 { "ip_ire_redirect_timer_expired", KSTAT_DATA_UINT64 }, 218 { "ip_ire_pmtu_timer_expired", KSTAT_DATA_UINT64 }, 219 { "ip_input_multi_squeue", KSTAT_DATA_UINT64 }, 220 { "ip_tcp_in_full_hw_cksum_err", KSTAT_DATA_UINT64 }, 221 { "ip_tcp_in_part_hw_cksum_err", KSTAT_DATA_UINT64 }, 222 { "ip_tcp_in_sw_cksum_err", KSTAT_DATA_UINT64 }, 223 { "ip_tcp_out_sw_cksum_bytes", KSTAT_DATA_UINT64 }, 224 { "ip_udp_in_full_hw_cksum_err", KSTAT_DATA_UINT64 }, 225 { "ip_udp_in_part_hw_cksum_err", KSTAT_DATA_UINT64 }, 226 { "ip_udp_in_sw_cksum_err", KSTAT_DATA_UINT64 }, 227 { "ip_udp_out_sw_cksum_bytes", KSTAT_DATA_UINT64 }, 228 { "ip_frag_mdt_pkt_out", KSTAT_DATA_UINT64 }, 229 { "ip_frag_mdt_discarded", KSTAT_DATA_UINT64 }, 230 { "ip_frag_mdt_allocfail", KSTAT_DATA_UINT64 }, 231 { "ip_frag_mdt_addpdescfail", KSTAT_DATA_UINT64 }, 232 { "ip_frag_mdt_allocd", KSTAT_DATA_UINT64 }, 233 }; 234 235 static kstat_t *ip_kstat; 236 237 #define TCP6 "tcp6" 238 #define TCP "tcp" 239 #define SCTP "sctp" 240 #define SCTP6 "sctp6" 241 242 major_t TCP6_MAJ; 243 major_t TCP_MAJ; 244 major_t SCTP_MAJ; 245 major_t SCTP6_MAJ; 246 247 int ip_poll_normal_ms = 100; 248 int ip_poll_normal_ticks = 0; 249 250 /* 251 * Structure to represent a linked list of msgblks. Used by ip_snmp_ functions. 252 */ 253 254 struct listptr_s { 255 mblk_t *lp_head; /* pointer to the head of the list */ 256 mblk_t *lp_tail; /* pointer to the tail of the list */ 257 }; 258 259 typedef struct listptr_s listptr_t; 260 261 /* 262 * This is used by ip_snmp_get_mib2_ip_route_media and 263 * ip_snmp_get_mib2_ip6_route_media to carry the lists of return data. 264 */ 265 typedef struct iproutedata_s { 266 uint_t ird_idx; 267 listptr_t ird_route; /* ipRouteEntryTable */ 268 listptr_t ird_netmedia; /* ipNetToMediaEntryTable */ 269 listptr_t ird_attrs; /* ipRouteAttributeTable */ 270 } iproutedata_t; 271 272 /* 273 * Cluster specific hooks. 
These should be NULL when booted as a non-cluster 274 */ 275 276 /* 277 * Hook functions to enable cluster networking 278 * On non-clustered systems these vectors must always be NULL. 279 * 280 * Hook function to Check ip specified ip address is a shared ip address 281 * in the cluster 282 * 283 */ 284 int (*cl_inet_isclusterwide)(uint8_t protocol, 285 sa_family_t addr_family, uint8_t *laddrp) = NULL; 286 287 /* 288 * Hook function to generate cluster wide ip fragment identifier 289 */ 290 uint32_t (*cl_inet_ipident)(uint8_t protocol, sa_family_t addr_family, 291 uint8_t *laddrp, uint8_t *faddrp) = NULL; 292 293 /* 294 * Synchronization notes: 295 * 296 * IP is a fully D_MP STREAMS module/driver. Thus it does not depend on any 297 * MT level protection given by STREAMS. IP uses a combination of its own 298 * internal serialization mechanism and standard Solaris locking techniques. 299 * The internal serialization is per phyint (no IPMP) or per IPMP group. 300 * This is used to serialize plumbing operations, IPMP operations, certain 301 * multicast operations, most set ioctls, igmp/mld timers etc. 302 * 303 * Plumbing is a long sequence of operations involving message 304 * exchanges between IP, ARP and device drivers. Many set ioctls are typically 305 * involved in plumbing operations. A natural model is to serialize these 306 * ioctls one per ill. For example plumbing of hme0 and qfe0 can go on in 307 * parallel without any interference. But various set ioctls on hme0 are best 308 * serialized. However if the system uses IPMP, the operations are easier if 309 * they are serialized on a per IPMP group basis since IPMP operations 310 * happen across ill's of a group. Thus the lowest common denominator is to 311 * serialize most set ioctls, multicast join/leave operations, IPMP operations 312 * igmp/mld timer operations, and processing of DLPI control messages received 313 * from drivers on a per IPMP group basis. 
If the system does not employ 314 * IPMP the serialization is on a per phyint basis. This serialization is 315 * provided by the ipsq_t and primitives operating on this. Details can 316 * be found in ip_if.c above the core primitives operating on ipsq_t. 317 * 318 * Lookups of an ipif or ill by a thread return a refheld ipif / ill. 319 * Simiarly lookup of an ire by a thread also returns a refheld ire. 320 * In addition ipif's and ill's referenced by the ire are also indirectly 321 * refheld. Thus no ipif or ill can vanish nor can critical parameters like 322 * the ipif's address or netmask change as long as an ipif is refheld 323 * directly or indirectly. For example an SIOCLIFADDR ioctl that changes the 324 * address of an ipif has to go through the ipsq_t. This ensures that only 325 * 1 such exclusive operation proceeds at any time on the ipif. It then 326 * deletes all ires associated with this ipif, and waits for all refcnts 327 * associated with this ipif to come down to zero. The address is changed 328 * only after the ipif has been quiesced. Then the ipif is brought up again. 329 * More details are described above the comment in ip_sioctl_flags. 330 * 331 * Packet processing is based mostly on IREs and are fully multi-threaded 332 * using standard Solaris MT techniques. 333 * 334 * There are explicit locks in IP to handle: 335 * - The ip_g_head list maintained by mi_open_link() and friends. 336 * 337 * - The reassembly data structures (one lock per hash bucket) 338 * 339 * - conn_lock is meant to protect conn_t fields. The fields actually 340 * protected by conn_lock are documented in the conn_t definition. 341 * 342 * - ire_lock to protect some of the fields of the ire, IRE tables 343 * (one lock per hash bucket). Refer to ip_ire.c for details. 344 * 345 * - ndp_g_lock and nce_lock for protecting NCEs. 346 * 347 * - ill_lock protects fields of the ill and ipif. Details in ip.h 348 * 349 * - ill_g_lock: This is a global reader/writer lock. 
Protects the following 350 * * The AVL tree based global multi list of all ills. 351 * * The linked list of all ipifs of an ill 352 * * The <ill-ipsq> mapping 353 * * The ipsq->ipsq_phyint_list threaded by phyint_ipsq_next 354 * * The illgroup list threaded by ill_group_next. 355 * * <ill-phyint> association 356 * Insertion/deletion of an ill in the system, insertion/deletion of an ipif 357 * into an ill, changing the <ill-ipsq> mapping of an ill, insertion/deletion 358 * of an ill into the illgrp list, changing the <ill-phyint> assoc of an ill 359 * will all have to hold the ill_g_lock as writer for the actual duration 360 * of the insertion/deletion/change. More details about the <ill-ipsq> mapping 361 * may be found in the IPMP section. 362 * 363 * - ill_lock: This is a per ill mutex. 364 * It protects some members of the ill and is documented below. 365 * It also protects the <ill-ipsq> mapping 366 * It also protects the illgroup list threaded by ill_group_next. 367 * It also protects the <ill-phyint> assoc. 368 * It also protects the list of ipifs hanging off the ill. 369 * 370 * - ipsq_lock: This is a per ipsq_t mutex lock. 371 * This protects all the other members of the ipsq struct except 372 * ipsq_refs and ipsq_phyint_list which are protected by ill_g_lock 373 * 374 * - illgrp_lock: This is a per ill_group mutex lock. 375 * The only thing it protects is the illgrp_ill_schednext member of ill_group 376 * which dictates which is the next ill in an ill_group that is to be chosen 377 * for sending outgoing packets, through creation of an IRE_CACHE that 378 * references this ill. 379 * 380 * - phyint_lock: This is a per phyint mutex lock. Protects just the 381 * phyint_flags 382 * 383 * - ip_g_nd_lock: This is a global reader/writer lock. 384 * Any call to nd_load to load a new parameter to the ND table must hold the 385 * lock as writer. ND_GET/ND_SET routines that read the ND table hold the lock 386 * as reader. 
387 * 388 * - ip_addr_avail_lock: This is used to ensure the uniqueness of IP addresses. 389 * This lock is held in ipif_up_done and the ipif is marked IPIF_UP and the 390 * uniqueness check also done atomically. 391 * 392 * - ipsec_capab_ills_lock: This readers/writer lock protects the global 393 * lists of IPsec capable ills (ipsec_capab_ills_{ah,esp}). It is taken 394 * as a writer when adding or deleting elements from these lists, and 395 * as a reader when walking these lists to send a SADB update to the 396 * IPsec capable ills. 397 * 398 * - ill_g_usesrc_lock: This readers/writer lock protects the usesrc 399 * group list linked by ill_usesrc_grp_next. It also protects the 400 * ill_usesrc_ifindex field. It is taken as a writer when a member of the 401 * group is being added or deleted. This lock is taken as a reader when 402 * walking the list/group(eg: to get the number of members in a usesrc group). 403 * Note, it is only necessary to take this lock if the ill_usesrc_grp_next 404 * field is changing state i.e from NULL to non-NULL or vice-versa. For 405 * example, it is not necessary to take this lock in the initial portion 406 * of ip_sioctl_slifusesrc or at all in ip_sioctl_groupname and 407 * ip_sioctl_flags since the these operations are executed exclusively and 408 * that ensures that the "usesrc group state" cannot change. The "usesrc 409 * group state" change can happen only in the latter part of 410 * ip_sioctl_slifusesrc and in ill_delete. 411 * 412 * Changing <ill-phyint>, <ill-ipsq>, <ill-illgroup> assocications. 413 * 414 * To change the <ill-phyint> association, the ill_g_lock must be held 415 * as writer, and the ill_locks of both the v4 and v6 instance of the ill 416 * must be held. 417 * 418 * To change the <ill-ipsq> association the ill_g_lock must be held as writer 419 * and the ill_lock of the ill in question must be held. 
420 * 421 * To change the <ill-illgroup> association the ill_g_lock must be held as 422 * writer and the ill_lock of the ill in question must be held. 423 * 424 * To add or delete an ipif from the list of ipifs hanging off the ill, 425 * ill_g_lock (writer) and ill_lock must be held and the thread must be 426 * a writer on the associated ipsq,. 427 * 428 * To add or delete an ill to the system, the ill_g_lock must be held as 429 * writer and the thread must be a writer on the associated ipsq. 430 * 431 * To add or delete an ilm to an ill, the ill_lock must be held and the thread 432 * must be a writer on the associated ipsq. 433 * 434 * Lock hierarchy 435 * 436 * Some lock hierarchy scenarios are listed below. 437 * 438 * ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock 439 * ill_g_lock -> illgrp_lock -> ill_lock 440 * ill_g_lock -> ill_lock(s) -> phyint_lock 441 * ill_g_lock -> ndp_g_lock -> ill_lock -> nce_lock 442 * ill_g_lock -> ip_addr_avail_lock 443 * conn_lock -> irb_lock -> ill_lock -> ire_lock 444 * ill_g_lock -> ip_g_nd_lock 445 * 446 * When more than 1 ill lock is needed to be held, all ill lock addresses 447 * are sorted on address and locked starting from highest addressed lock 448 * downward. 449 * 450 * Mobile-IP scenarios 451 * 452 * irb_lock -> ill_lock -> ire_mrtun_lock 453 * irb_lock -> ill_lock -> ire_srcif_table_lock 454 * 455 * IPsec scenarios 456 * 457 * ipsa_lock -> ill_g_lock -> ill_lock 458 * ipsec_capab_ills_lock -> ill_g_lock -> ill_lock 459 * ipsec_capab_ills_lock -> ipsa_lock 460 * ill_g_usesrc_lock -> ill_g_lock -> ill_lock 461 * 462 * Trusted Solaris scenarios 463 * 464 * igsa_lock -> gcgrp_rwlock -> gcgrp_lock 465 * igsa_lock -> gcdb_lock 466 * gcgrp_rwlock -> ire_lock 467 * gcgrp_rwlock -> gcdb_lock 468 * 469 * IPSEC notes : 470 * 471 * IP interacts with the IPSEC code (AH/ESP) by tagging a M_CTL message 472 * in front of the actual packet. 
For outbound datagrams, the M_CTL 473 * contains a ipsec_out_t (defined in ipsec_info.h), which has the 474 * information used by the IPSEC code for applying the right level of 475 * protection. The information initialized by IP in the ipsec_out_t 476 * is determined by the per-socket policy or global policy in the system. 477 * For inbound datagrams, the M_CTL contains a ipsec_in_t (defined in 478 * ipsec_info.h) which starts out with nothing in it. It gets filled 479 * with the right information if it goes through the AH/ESP code, which 480 * happens if the incoming packet is secure. The information initialized 481 * by AH/ESP, is later used by IP(during fanouts to ULP) to see whether 482 * the policy requirements needed by per-socket policy or global policy 483 * is met or not. 484 * 485 * If there is both per-socket policy (set using setsockopt) and there 486 * is also global policy match for the 5 tuples of the socket, 487 * ipsec_override_policy() makes the decision of which one to use. 488 * 489 * For fully connected sockets i.e dst, src [addr, port] is known, 490 * conn_policy_cached is set indicating that policy has been cached. 491 * conn_in_enforce_policy may or may not be set depending on whether 492 * there is a global policy match or per-socket policy match. 493 * Policy inheriting happpens in ip_bind during the ipa_conn_t bind. 494 * Once the right policy is set on the conn_t, policy cannot change for 495 * this socket. This makes life simpler for TCP (UDP ?) where 496 * re-transmissions go out with the same policy. For symmetry, policy 497 * is cached for fully connected UDP sockets also. Thus if policy is cached, 498 * it also implies that policy is latched i.e policy cannot change 499 * on these sockets. As we have the right policy on the conn, we don't 500 * have to lookup global policy for every outbound and inbound datagram 501 * and thus serving as an optimization. 
Note that a global policy change 502 * does not affect fully connected sockets if they have policy. If fully 503 * connected sockets did not have any policy associated with it, global 504 * policy change may affect them. 505 * 506 * IP Flow control notes: 507 * 508 * Non-TCP streams are flow controlled by IP. On the send side, if the packet 509 * cannot be sent down to the driver by IP, because of a canput failure, IP 510 * does a putq on the conn_wq. This will cause ip_wsrv to run on the conn_wq. 511 * ip_wsrv in turn, inserts the conn in a list of conn's that need to be drained 512 * when the flowcontrol condition subsides. Ultimately STREAMS backenables the 513 * ip_wsrv on the IP module, which in turn does a qenable of the conn_wq of the 514 * first conn in the list of conn's to be drained. ip_wsrv on this conn drains 515 * the queued messages, and removes the conn from the drain list, if all 516 * messages were drained. It also qenables the next conn in the drain list to 517 * continue the drain process. 518 * 519 * In reality the drain list is not a single list, but a configurable number 520 * of lists. The ip_wsrv on the IP module, qenables the first conn in each 521 * list. If the ip_wsrv of the next qenabled conn does not run, because the 522 * stream closes, ip_close takes responsibility to qenable the next conn in 523 * the drain list. The directly called ip_wput path always does a putq, if 524 * it cannot putnext. Thus synchronization problems are handled between 525 * ip_wsrv and ip_close. conn_drain_insert and conn_drain_tail are the only 526 * functions that manipulate this drain list. Furthermore conn_drain_insert 527 * is called only from ip_wsrv, and there can be only 1 instance of ip_wsrv 528 * running on a queue at any time. conn_drain_tail can be simultaneously called 529 * from both ip_wsrv and ip_close. 530 * 531 * IPQOS notes: 532 * 533 * IPQoS Policies are applied to packets using IPPF (IP Policy framework) 534 * and IPQoS modules. 
IPPF includes hooks in IP at different control points 535 * (callout positions) which direct packets to IPQoS modules for policy 536 * processing. Policies, if present, are global. 537 * 538 * The callout positions are located in the following paths: 539 * o local_in (packets destined for this host) 540 * o local_out (packets orginating from this host ) 541 * o fwd_in (packets forwarded by this m/c - inbound) 542 * o fwd_out (packets forwarded by this m/c - outbound) 543 * Hooks at these callout points can be enabled/disabled using the ndd variable 544 * ip_policy_mask (a bit mask with the 4 LSB indicating the callout positions). 545 * By default all the callout positions are enabled. 546 * 547 * Outbound (local_out) 548 * Hooks are placed in ip_wput_ire and ipsec_out_process. 549 * 550 * Inbound (local_in) 551 * Hooks are placed in ip_proto_input, icmp_inbound, ip_fanout_proto and 552 * TCP and UDP fanout routines. 553 * 554 * Forwarding (in and out) 555 * Hooks are placed in ip_rput_forward and ip_mrtun_forward. 556 * 557 * IP Policy Framework processing (IPPF processing) 558 * Policy processing for a packet is initiated by ip_process, which ascertains 559 * that the classifier (ipgpc) is loaded and configured, failing which the 560 * packet resumes normal processing in IP. If the clasifier is present, the 561 * packet is acted upon by one or more IPQoS modules (action instances), per 562 * filters configured in ipgpc and resumes normal IP processing thereafter. 563 * An action instance can drop a packet in course of its processing. 564 * 565 * A boolean variable, ip_policy, is used in all the fanout routines that can 566 * invoke ip_process for a packet. This variable indicates if the packet should 567 * to be sent for policy processing. The variable is set to B_TRUE by default, 568 * i.e. when the routines are invoked in the normal ip procesing path for a 569 * packet. 
The two exceptions being ip_wput_local and icmp_inbound_error_fanout; 570 * ip_policy is set to B_FALSE for all the routines called in these two 571 * functions because, in the former case, we don't process loopback traffic 572 * currently while in the latter, the packets have already been processed in 573 * icmp_inbound. 574 * 575 * Zones notes: 576 * 577 * The partitioning rules for networking are as follows: 578 * 1) Packets coming from a zone must have a source address belonging to that 579 * zone. 580 * 2) Packets coming from a zone can only be sent on a physical interface on 581 * which the zone has an IP address. 582 * 3) Between two zones on the same machine, packet delivery is only allowed if 583 * there's a matching route for the destination and zone in the forwarding 584 * table. 585 * 4) The TCP and UDP port spaces are per-zone; that is, two processes in 586 * different zones can bind to the same port with the wildcard address 587 * (INADDR_ANY). 588 * 589 * The granularity of interface partitioning is at the logical interface level. 590 * Therefore, every zone has its own IP addresses, and incoming packets can be 591 * attributed to a zone unambiguously. A logical interface is placed into a zone 592 * using the SIOCSLIFZONE ioctl; this sets the ipif_zoneid field in the ipif_t 593 * structure. Rule (1) is implemented by modifying the source address selection 594 * algorithm so that the list of eligible addresses is filtered based on the 595 * sending process zone. 596 * 597 * The Internet Routing Entries (IREs) are either exclusive to a zone or shared 598 * across all zones, depending on their type. 
Here is the break-up: 599 * 600 * IRE type Shared/exclusive 601 * -------- ---------------- 602 * IRE_BROADCAST Exclusive 603 * IRE_DEFAULT (default routes) Shared (*) 604 * IRE_LOCAL Exclusive 605 * IRE_LOOPBACK Exclusive 606 * IRE_PREFIX (net routes) Shared (*) 607 * IRE_CACHE Exclusive 608 * IRE_IF_NORESOLVER (interface routes) Exclusive 609 * IRE_IF_RESOLVER (interface routes) Exclusive 610 * IRE_HOST (host routes) Shared (*) 611 * 612 * (*) A zone can only use a default or off-subnet route if the gateway is 613 * directly reachable from the zone, that is, if the gateway's address matches 614 * one of the zone's logical interfaces. 615 * 616 * Multiple zones can share a common broadcast address; typically all zones 617 * share the 255.255.255.255 address. Incoming as well as locally originated 618 * broadcast packets must be dispatched to all the zones on the broadcast 619 * network. For directed broadcasts (e.g. 10.16.72.255) this is not trivial 620 * since some zones may not be on the 10.16.72/24 network. To handle this, each 621 * zone has its own set of IRE_BROADCAST entries; then, broadcast packets are 622 * sent to every zone that has an IRE_BROADCAST entry for the destination 623 * address on the input ill, see conn_wantpacket(). 624 * 625 * Applications in different zones can join the same multicast group address. 626 * For IPv4, group memberships are per-logical interface, so they're already 627 * inherently part of a zone. For IPv6, group memberships are per-physical 628 * interface, so we distinguish IPv6 group memberships based on group address, 629 * interface and zoneid. In both cases, received multicast packets are sent to 630 * every zone for which a group membership entry exists. On IPv6 we need to 631 * check that the target zone still has an address on the receiving physical 632 * interface; it could have been removed since the application issued the 633 * IPV6_JOIN_GROUP. 634 */ 635 636 /* 637 * Squeue Fanout flags: 638 * 0: No fanout. 
639 * 1: Fanout across all squeues 640 */ 641 boolean_t ip_squeue_fanout = 0; 642 643 /* 644 * Maximum dups allowed per packet. 645 */ 646 uint_t ip_max_frag_dups = 10; 647 648 #define IS_SIMPLE_IPH(ipha) \ 649 ((ipha)->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION) 650 651 /* RFC1122 Conformance */ 652 #define IP_FORWARD_DEFAULT IP_FORWARD_NEVER 653 654 #define ILL_MAX_NAMELEN LIFNAMSIZ 655 656 /* Leave room for ip_newroute to tack on the src and target addresses */ 657 #define OK_RESOLVER_MP(mp) \ 658 ((mp) && ((mp)->b_wptr - (mp)->b_rptr) >= (2 * IP_ADDR_LEN)) 659 660 static int conn_set_held_ipif(conn_t *, ipif_t **, ipif_t *); 661 662 static mblk_t *ip_wput_attach_llhdr(mblk_t *, ire_t *, ip_proc_t, uint32_t); 663 static void ip_ipsec_out_prepend(mblk_t *, mblk_t *, ill_t *); 664 665 static void icmp_frag_needed(queue_t *, mblk_t *, int); 666 static void icmp_inbound(queue_t *, mblk_t *, boolean_t, ill_t *, int, 667 uint32_t, boolean_t, boolean_t, ill_t *, zoneid_t); 668 static boolean_t icmp_inbound_too_big(icmph_t *, ipha_t *); 669 static void icmp_inbound_error_fanout(queue_t *, ill_t *, mblk_t *, 670 icmph_t *, ipha_t *, int, int, boolean_t, boolean_t, 671 ill_t *, zoneid_t); 672 static void icmp_options_update(ipha_t *); 673 static void icmp_param_problem(queue_t *, mblk_t *, uint8_t); 674 static void icmp_pkt(queue_t *, mblk_t *, void *, size_t, boolean_t); 675 static mblk_t *icmp_pkt_err_ok(mblk_t *); 676 static void icmp_redirect(mblk_t *); 677 static void icmp_send_redirect(queue_t *, mblk_t *, ipaddr_t); 678 679 static void ip_arp_news(queue_t *, mblk_t *); 680 static boolean_t ip_bind_insert_ire(mblk_t *, ire_t *, iulp_t *); 681 mblk_t *ip_dlpi_alloc(size_t, t_uscalar_t); 682 char *ip_dot_addr(ipaddr_t, char *); 683 mblk_t *ip_carve_mp(mblk_t **, ssize_t); 684 int ip_close(queue_t *, int); 685 static char *ip_dot_saddr(uchar_t *, char *); 686 static void ip_fanout_proto(queue_t *, mblk_t *, ill_t *, ipha_t *, uint_t, 687 boolean_t, boolean_t, 
ill_t *, zoneid_t); 688 static void ip_fanout_tcp(queue_t *, mblk_t *, ill_t *, ipha_t *, uint_t, 689 boolean_t, boolean_t, zoneid_t); 690 static void ip_fanout_udp(queue_t *, mblk_t *, ill_t *, ipha_t *, uint32_t, 691 boolean_t, uint_t, boolean_t, boolean_t, ill_t *, zoneid_t); 692 static void ip_lrput(queue_t *, mblk_t *); 693 ipaddr_t ip_massage_options(ipha_t *); 694 static void ip_mrtun_forward(ire_t *, ill_t *, mblk_t *); 695 ipaddr_t ip_net_mask(ipaddr_t); 696 void ip_newroute(queue_t *, mblk_t *, ipaddr_t, ill_t *, conn_t *); 697 static void ip_newroute_ipif(queue_t *, mblk_t *, ipif_t *, ipaddr_t, 698 conn_t *, uint32_t); 699 static int ip_hdr_complete(ipha_t *, zoneid_t); 700 char *ip_nv_lookup(nv_t *, int); 701 static boolean_t ip_check_for_ipsec_opt(queue_t *, mblk_t *); 702 static int ip_param_get(queue_t *, mblk_t *, caddr_t, cred_t *); 703 static int ip_param_generic_get(queue_t *, mblk_t *, caddr_t, cred_t *); 704 static boolean_t ip_param_register(ipparam_t *, size_t, ipndp_t *, 705 size_t); 706 static int ip_param_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *); 707 void ip_rput(queue_t *, mblk_t *); 708 static void ip_rput_dlpi_writer(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp, 709 void *dummy_arg); 710 void ip_rput_forward(ire_t *, ipha_t *, mblk_t *, ill_t *); 711 static int ip_rput_forward_options(mblk_t *, ipha_t *, ire_t *); 712 static boolean_t ip_rput_local_options(queue_t *, mblk_t *, ipha_t *, 713 ire_t *); 714 static int ip_rput_options(queue_t *, mblk_t *, ipha_t *, ipaddr_t *); 715 static boolean_t ip_rput_fragment(queue_t *, mblk_t **, ipha_t *, uint32_t *, 716 uint16_t *); 717 int ip_snmp_get(queue_t *, mblk_t *); 718 static mblk_t *ip_snmp_get_mib2_ip(queue_t *, mblk_t *); 719 static mblk_t *ip_snmp_get_mib2_ip6(queue_t *, mblk_t *); 720 static mblk_t *ip_snmp_get_mib2_icmp(queue_t *, mblk_t *); 721 static mblk_t *ip_snmp_get_mib2_icmp6(queue_t *, mblk_t *); 722 static mblk_t *ip_snmp_get_mib2_igmp(queue_t *, mblk_t *); 723 
static mblk_t	*ip_snmp_get_mib2_multi(queue_t *, mblk_t *);
static mblk_t	*ip_snmp_get_mib2_ip_addr(queue_t *, mblk_t *);
static mblk_t	*ip_snmp_get_mib2_ip6_addr(queue_t *, mblk_t *);
static mblk_t	*ip_snmp_get_mib2_ip_group_mem(queue_t *, mblk_t *);
static mblk_t	*ip_snmp_get_mib2_ip6_group_mem(queue_t *, mblk_t *);
static mblk_t	*ip_snmp_get_mib2_ip_group_src(queue_t *, mblk_t *);
static mblk_t	*ip_snmp_get_mib2_ip6_group_src(queue_t *, mblk_t *);
static mblk_t	*ip_snmp_get_mib2_virt_multi(queue_t *, mblk_t *);
static mblk_t	*ip_snmp_get_mib2_multi_rtable(queue_t *, mblk_t *);
static mblk_t	*ip_snmp_get_mib2_ip_route_media(queue_t *, mblk_t *);
static mblk_t	*ip_snmp_get_mib2_ip6_route_media(queue_t *, mblk_t *);
static void	ip_snmp_get2_v4(ire_t *, iproutedata_t *);
static void	ip_snmp_get2_v6_route(ire_t *, iproutedata_t *);
static int	ip_snmp_get2_v6_media(nce_t *, iproutedata_t *);
int		ip_snmp_set(queue_t *, int, int, uchar_t *, int);
static boolean_t ip_source_routed(ipha_t *);
static boolean_t ip_source_route_included(ipha_t *);

static void	ip_wput_frag(ire_t *, mblk_t *, ip_pkt_t, uint32_t, uint32_t);
static mblk_t	*ip_wput_frag_copyhdr(uchar_t *, int, int);
static void	ip_wput_local_options(ipha_t *);
static int	ip_wput_options(queue_t *, mblk_t *, ipha_t *, boolean_t,
		    zoneid_t);

/* conn_t drain-list management (flow control). */
static void	conn_drain_init(void);
static void	conn_drain_fini(void);
static void	conn_drain_tail(conn_t *connp, boolean_t closing);

static void	conn_walk_drain(void);
static void	conn_walk_fanout_table(connf_t *, uint_t, pfv_t, void *,
		    zoneid_t);

static boolean_t conn_wantpacket(conn_t *, ill_t *, ipha_t *, int,
		    zoneid_t);
static void	ip_arp_done(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp,
		    void *dummy_arg);

static int	ip_forward_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *);

static int	ip_multirt_apply_membership(int (*fn)(conn_t *, boolean_t,
    ipaddr_t, ipaddr_t, uint_t *, mcast_record_t, ipaddr_t, mblk_t *), ire_t *,
    conn_t *, boolean_t, ipaddr_t, mcast_record_t, ipaddr_t, mblk_t *);
static void	ip_multirt_bad_mtu(ire_t *, uint32_t);

/* ndd get/set handlers wired into lcl_ndp_arr[] below. */
static int	ip_cgtp_filter_get(queue_t *, mblk_t *, caddr_t, cred_t *);
static int	ip_cgtp_filter_set(queue_t *, mblk_t *, char *,
    caddr_t, cred_t *);
extern int	ip_squeue_bind_set(queue_t *q, mblk_t *mp, char *value,
    caddr_t cp, cred_t *cr);
extern int	ip_squeue_profile_set(queue_t *, mblk_t *, char *, caddr_t,
    cred_t *);
static int	ip_input_proc_set(queue_t *q, mblk_t *mp, char *value,
    caddr_t cp, cred_t *cr);
static int	ip_int_set(queue_t *, mblk_t *, char *, caddr_t,
    cred_t *);
static squeue_func_t ip_squeue_switch(int);

static void	ip_kstat_init(void);
static void	ip_kstat_fini(void);
static int	ip_kstat_update(kstat_t *kp, int rw);
static void	icmp_kstat_init(void);
static void	icmp_kstat_fini(void);
static int	icmp_kstat_update(kstat_t *kp, int rw);

static int	ip_conn_report(queue_t *, mblk_t *, caddr_t, cred_t *);

static boolean_t ip_no_forward(ipha_t *, ill_t *);
static boolean_t ip_loopback_src_or_dst(ipha_t *, ill_t *);

static mblk_t	*ip_tcp_input(mblk_t *, ipha_t *, ill_t *, boolean_t,
    ire_t *, mblk_t *, uint_t, queue_t *, ill_rx_ring_t *);

void	ip_input(ill_t *, ill_rx_ring_t *, mblk_t *, size_t);

/* IRE cache expiry/flush bookkeeping. */
timeout_id_t ip_ire_expire_id;	/* IRE expiration timer. */
static clock_t ip_ire_arp_time_elapsed; /* Time since IRE cache last flushed */
static clock_t ip_ire_rd_time_elapsed;	/* ... redirect IREs last flushed */
static clock_t ip_ire_pmtu_time_elapsed; /* Time since path mtu increase */

uint_t	ip_ire_default_count;	/* Number of IPv4 IRE_DEFAULT entries. */
uint_t	ip_ire_default_index;	/* Walking index used to mod in */

ipaddr_t ip_g_all_ones = IP_HOST_MASK;
clock_t icmp_pkt_err_last = 0;	/* Time since last icmp_pkt_err */
uint_t	icmp_pkt_err_sent = 0;	/* Number of packets sent in burst */

/* How long, in seconds, we allow frags to hang around. */
#define	IP_FRAG_TIMEOUT	60

time_t	ip_g_frag_timeout = IP_FRAG_TIMEOUT;
clock_t	ip_g_frag_timo_ms = IP_FRAG_TIMEOUT * 1000;

/*
 * Threshold which determines whether MDT should be used when
 * generating IP fragments; payload size must be greater than
 * this threshold for MDT to take place.
 */
#define	IP_WPUT_FRAG_MDT_MIN	32768

int	ip_wput_frag_mdt_min = IP_WPUT_FRAG_MDT_MIN;

/* Protected by ip_mi_lock */
static void	*ip_g_head;	/* Instance Data List Head */
kmutex_t	ip_mi_lock;	/* Lock for list of instances */

/* Only modified during _init and _fini thus no locking is needed. */
caddr_t		ip_g_nd;	/* Named Dispatch List Head */


static long ip_rput_pullups;
int	dohwcksum = 1;	/* use h/w cksum if supported by the hardware */

vmem_t *ip_minor_arena;

/*
 * MIB-2 stuff for SNMP (both IP and ICMP)
 */
mib2_ip_t	ip_mib;
mib2_icmp_t	icmp_mib;

#ifdef DEBUG
uint32_t ipsechw_debug = 0;
#endif

kstat_t		*ip_mibkp;	/* kstat exporting ip_mib data */
kstat_t		*icmp_mibkp;	/* kstat exporting icmp_mib data */

uint_t	loopback_packets = 0;

/*
 * Multirouting/CGTP stuff
 */
cgtp_filter_ops_t	*ip_cgtp_filter_ops;	/* CGTP hooks */
int	ip_cgtp_filter_rev = CGTP_FILTER_REV;	/* CGTP hooks version */
boolean_t	ip_cgtp_filter;	/* Enable/disable CGTP hooks */
/* Interval (in ms) between consecutive 'bad MTU' warnings */
hrtime_t ip_multirt_log_interval = 1000;
/* Time since last warning issued. */
static hrtime_t	multirt_bad_mtu_last_time = 0;

kmutex_t ip_trash_timer_lock;
krwlock_t ip_g_nd_lock;

/*
 * XXX following really should only be in a header. Would need more
 * header and .c clean up first.
 */
extern optdb_obj_t	ip_opt_obj;

ulong_t ip_squeue_enter_unbound = 0;

/*
 * Named Dispatch Parameter Table.
 * All of these are alterable, within the min/max values given, at run time.
 *
 * NOTE(review): entry ORDER is load-bearing -- the ip_param_arr[n]
 * accessor #defines further below index this table by position.  Append
 * new entries at the end (before the #ifdef DEBUG block's counterpart)
 * and add a matching #define; never reorder.
 */
static ipparam_t	lcl_param_arr[] = {
	/* min	max	value	name */
	{  0,	1,	0,	"ip_respond_to_address_mask_broadcast"},
	{  0,	1,	1,	"ip_respond_to_echo_broadcast"},
	{  0,	1,	1,	"ip_respond_to_echo_multicast"},
	{  0,	1,	0,	"ip_respond_to_timestamp"},
	{  0,	1,	0,	"ip_respond_to_timestamp_broadcast"},
	{  0,	1,	1,	"ip_send_redirects"},
	{  0,	1,	0,	"ip_forward_directed_broadcasts"},
	{  0,	10,	0,	"ip_debug"},
	{  0,	10,	0,	"ip_mrtdebug"},
	{  5000, 999999999, 60000, "ip_ire_timer_interval" },
	{  60000, 999999999, 1200000, "ip_ire_arp_interval" },
	{  60000, 999999999, 60000, "ip_ire_redirect_interval" },
	{  1,	255,	255,	"ip_def_ttl" },
	{  0,	1,	0,	"ip_forward_src_routed"},
	{  0,	256,	32,	"ip_wroff_extra" },
	{  5000, 999999999, 600000, "ip_ire_pathmtu_interval" },
	{  8,	65536,  64,	"ip_icmp_return_data_bytes" },
	{  0,	1,	1,	"ip_path_mtu_discovery" },
	{  0,	240,	30,	"ip_ignore_delete_time" },
	{  0,	1,	0,	"ip_ignore_redirect" },
	{  0,	1,	1,	"ip_output_queue" },
	{  1,	254,	1,	"ip_broadcast_ttl" },
	{  0,	99999,	100,	"ip_icmp_err_interval" },
	{  1,	99999,	10,	"ip_icmp_err_burst" },
	{  0,	999999999, 1000000, "ip_reass_queue_bytes" },
	{  0,	1,	0,	"ip_strict_dst_multihoming" },
	{  1,	MAX_ADDRS_PER_IF, 256, "ip_addrs_per_if"},
	{  0,	1,	0,	"ipsec_override_persocket_policy" },
	{  0,	1,	1,	"icmp_accept_clear_messages" },
	{  0,	1,	1,	"igmp_accept_clear_messages" },
	{  2,	999999999, ND_DELAY_FIRST_PROBE_TIME,
				"ip_ndp_delay_first_probe_time"},
	{  1,	999999999, ND_MAX_UNICAST_SOLICIT,
				"ip_ndp_max_unicast_solicit"},
	{  1,	255,	IPV6_MAX_HOPS,	"ip6_def_hops" },
	{  8,	IPV6_MIN_MTU,	IPV6_MIN_MTU, "ip6_icmp_return_data_bytes" },
	{  0,	1,	0,	"ip6_forward_src_routed"},
	{  0,	1,	1,	"ip6_respond_to_echo_multicast"},
	{  0,	1,	1,	"ip6_send_redirects"},
	{  0,	1,	0,	"ip6_ignore_redirect" },
	{  0,	1,	0,	"ip6_strict_dst_multihoming" },

	{  1,	8,	3,	"ip_ire_reclaim_fraction" },

	{  0,	999999,	1000,	"ipsec_policy_log_interval" },

	{  0,	1,	1,	"pim_accept_clear_messages" },
	{  1000, 20000,	2000,	"ip_ndp_unsolicit_interval" },
	{  1,	20,	3,	"ip_ndp_unsolicit_count" },
	{  0,	1,	1,	"ip6_ignore_home_address_opt" },
	{  0,	15,	0,	"ip_policy_mask" },
	{  1000, 60000, 1000,	"ip_multirt_resolution_interval" },
	{  0,	255,	1,	"ip_multirt_ttl" },
	{  0,	1,	1,	"ip_multidata_outbound" },
#ifdef DEBUG
	{  0,	1,	0,	"ip6_drop_inbound_icmpv6" },
#endif
};

ipparam_t *ip_param_arr = lcl_param_arr;

/* Extended NDP table */
static ipndp_t	lcl_ndp_arr[] = {
	/* getf			setf		data		name */
	{  ip_param_generic_get,	ip_forward_set,	(caddr_t)&ip_g_forward,
	    "ip_forwarding" },
	{  ip_param_generic_get,	ip_forward_set,	(caddr_t)&ipv6_forward,
	    "ip6_forwarding" },
	{  ip_ill_report,	NULL,		NULL,
	    "ip_ill_status" },
	{  ip_ipif_report,	NULL,		NULL,
	    "ip_ipif_status" },
	{  ip_ire_report,	NULL,		NULL,
	    "ipv4_ire_status" },
	{  ip_ire_report_mrtun,	NULL,		NULL,
	    "ipv4_mrtun_ire_status" },
	{  ip_ire_report_srcif,	NULL,		NULL,
	    "ipv4_srcif_ire_status" },
	{  ip_ire_report_v6,	NULL,		NULL,
	    "ipv6_ire_status" },
	{  ip_conn_report,	NULL,		NULL,
	    "ip_conn_status" },
	{  nd_get_long,		nd_set_long,	(caddr_t)&ip_rput_pullups,
	    "ip_rput_pullups" },
	{  ndp_report,		NULL,		NULL,
	    "ip_ndp_cache_report" },
	{  ip_srcid_report,	NULL,		NULL,
	    "ip_srcid_status" },
	{  ip_param_generic_get, ip_squeue_profile_set,
	    (caddr_t)&ip_squeue_profile, "ip_squeue_profile" },
	{  ip_param_generic_get, ip_squeue_bind_set,
	    (caddr_t)&ip_squeue_bind, "ip_squeue_bind" },
	{  ip_param_generic_get, ip_input_proc_set,
	    (caddr_t)&ip_squeue_enter, "ip_squeue_enter" },
	{  ip_param_generic_get, ip_int_set,
	    (caddr_t)&ip_squeue_fanout, "ip_squeue_fanout" },
	{  ip_cgtp_filter_get,  ip_cgtp_filter_set, (caddr_t)&ip_cgtp_filter,
	    "ip_cgtp_filter" },
	{  ip_param_generic_get, ip_int_set,
	    (caddr_t)&ip_soft_rings_cnt, "ip_soft_rings_cnt" }
};

/*
 * ip_g_forward controls IP forwarding.  It takes two values:
 *	0: IP_FORWARD_NEVER	Don't forward packets ever.
 *	1: IP_FORWARD_ALWAYS	Forward packets for elsewhere.
 *
 * RFC1122 says there must be a configuration switch to control forwarding,
 * but that the default MUST be to not forward packets ever.  Implicit
 * control based on configuration of multiple interfaces MUST NOT be
 * implemented (Section 3.1).  SunOS 4.1 did provide the "automatic" capability
 * and, in fact, it was the default.  That capability is now provided in the
 * /etc/rc2.d/S69inet script.
 */
int ip_g_forward = IP_FORWARD_DEFAULT;

/* It also has an IPv6 counterpart. */

int ipv6_forward = IP_FORWARD_DEFAULT;

/* Following line is external, and in ip.h.  Normally marked with * *. */
#define	ip_respond_to_address_mask_broadcast ip_param_arr[0].ip_param_value
#define	ip_g_resp_to_echo_bcast		ip_param_arr[1].ip_param_value
#define	ip_g_resp_to_echo_mcast		ip_param_arr[2].ip_param_value
#define	ip_g_resp_to_timestamp		ip_param_arr[3].ip_param_value
#define	ip_g_resp_to_timestamp_bcast	ip_param_arr[4].ip_param_value
#define	ip_g_send_redirects		ip_param_arr[5].ip_param_value
#define	ip_g_forward_directed_bcast	ip_param_arr[6].ip_param_value
#define	ip_debug			ip_param_arr[7].ip_param_value	/* */
#define	ip_mrtdebug			ip_param_arr[8].ip_param_value	/* */
#define	ip_timer_interval		ip_param_arr[9].ip_param_value	/* */
#define	ip_ire_arp_interval		ip_param_arr[10].ip_param_value	/* */
#define	ip_ire_redir_interval		ip_param_arr[11].ip_param_value
#define	ip_def_ttl			ip_param_arr[12].ip_param_value
#define	ip_forward_src_routed		ip_param_arr[13].ip_param_value
#define	ip_wroff_extra			ip_param_arr[14].ip_param_value
#define	ip_ire_pathmtu_interval		ip_param_arr[15].ip_param_value
#define	ip_icmp_return			ip_param_arr[16].ip_param_value
#define	ip_path_mtu_discovery		ip_param_arr[17].ip_param_value	/* */
#define	ip_ignore_delete_time		ip_param_arr[18].ip_param_value	/* */
#define	ip_ignore_redirect		ip_param_arr[19].ip_param_value
#define	ip_output_queue			ip_param_arr[20].ip_param_value
#define	ip_broadcast_ttl		ip_param_arr[21].ip_param_value
#define	ip_icmp_err_interval		ip_param_arr[22].ip_param_value
#define	ip_icmp_err_burst		ip_param_arr[23].ip_param_value
#define	ip_reass_queue_bytes		ip_param_arr[24].ip_param_value
#define	ip_strict_dst_multihoming	ip_param_arr[25].ip_param_value
#define	ip_addrs_per_if			ip_param_arr[26].ip_param_value
#define	ipsec_override_persocket_policy	ip_param_arr[27].ip_param_value	/* */
#define	icmp_accept_clear_messages	ip_param_arr[28].ip_param_value
#define	igmp_accept_clear_messages	ip_param_arr[29].ip_param_value

/* IPv6 configuration knobs */
#define	delay_first_probe_time		ip_param_arr[30].ip_param_value
#define	max_unicast_solicit		ip_param_arr[31].ip_param_value
#define	ipv6_def_hops			ip_param_arr[32].ip_param_value
#define	ipv6_icmp_return		ip_param_arr[33].ip_param_value
#define	ipv6_forward_src_routed		ip_param_arr[34].ip_param_value
#define	ipv6_resp_echo_mcast		ip_param_arr[35].ip_param_value
#define	ipv6_send_redirects		ip_param_arr[36].ip_param_value
#define	ipv6_ignore_redirect		ip_param_arr[37].ip_param_value
#define	ipv6_strict_dst_multihoming	ip_param_arr[38].ip_param_value
#define	ip_ire_reclaim_fraction		ip_param_arr[39].ip_param_value
#define	ipsec_policy_log_interval	ip_param_arr[40].ip_param_value
#define	pim_accept_clear_messages	ip_param_arr[41].ip_param_value
#define	ip_ndp_unsolicit_interval	ip_param_arr[42].ip_param_value
#define	ip_ndp_unsolicit_count		ip_param_arr[43].ip_param_value
#define	ipv6_ignore_home_address_opt	ip_param_arr[44].ip_param_value
#define	ip_policy_mask			ip_param_arr[45].ip_param_value
#define	ip_multirt_resolution_interval	ip_param_arr[46].ip_param_value
#define	ip_multirt_ttl			ip_param_arr[47].ip_param_value
#define	ip_multidata_outbound		ip_param_arr[48].ip_param_value
#ifdef DEBUG
#define	ipv6_drop_inbound_icmpv6	ip_param_arr[49].ip_param_value
#else
#define	ipv6_drop_inbound_icmpv6	0
#endif


/*
 * Table of IP ioctls encoding the various properties of the ioctl and
 * indexed based on the last byte of the ioctl command. Occasionally there
 * is a clash, and there is more than 1 ioctl with the same last byte.
 * In such a case 1 ioctl is encoded in the ndx table and the remaining
 * ioctls are encoded in the misc table.
An entry in the ndx table is
 * retrieved by indexing on the last byte of the ioctl command and comparing
 * the ioctl command with the value in the ndx table. In the event of a
 * mismatch the misc table is then searched sequentially for the desired
 * ioctl command.
 *
 * Entry: <command> <copyin_size> <flags> <cmd_type> <function> <restart_func>
 *
 * NOTE(review): the array index MUST equal the low byte of the command
 * (that is what the /* NNN *\/ comments track); keep placeholders
 * (IPI_DONTCARE) for unused slots so later entries do not shift.
 */
ip_ioctl_cmd_t ip_ndx_ioctl_table[] = {
	/* 000 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 001 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 002 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 003 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 004 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 005 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 006 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 007 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 008 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 009 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },

	/* 010 */ { SIOCADDRT,	sizeof (struct rtentry), IPI_PRIV,
			MISC_CMD, ip_siocaddrt, NULL },
	/* 011 */ { SIOCDELRT,	sizeof (struct rtentry), IPI_PRIV,
			MISC_CMD, ip_siocdelrt, NULL },

	/* 012 */ { SIOCSIFADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
			IF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart },
	/* 013 */ { SIOCGIFADDR, sizeof (struct ifreq), IPI_GET_CMD | IPI_REPL,
			IF_CMD, ip_sioctl_get_addr, NULL },

	/* 014 */ { SIOCSIFDSTADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
			IF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart },
	/* 015 */ { SIOCGIFDSTADDR, sizeof (struct ifreq),
			IPI_GET_CMD | IPI_REPL,
			IF_CMD, ip_sioctl_get_dstaddr, NULL },

	/* 016 */ { SIOCSIFFLAGS, sizeof (struct ifreq),
			IPI_PRIV | IPI_WR | IPI_REPL,
			IF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart },
	/* 017 */ { SIOCGIFFLAGS, sizeof (struct ifreq),
			IPI_MODOK | IPI_GET_CMD | IPI_REPL,
			IF_CMD, ip_sioctl_get_flags, NULL },

	/* 018 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 019 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },

	/* copyin size cannot be coded for SIOCGIFCONF */
	/* 020 */ { O_SIOCGIFCONF, 0, IPI_GET_CMD | IPI_REPL,
			MISC_CMD, ip_sioctl_get_ifconf, NULL },

	/* 021 */ { SIOCSIFMTU,	sizeof (struct ifreq), IPI_PRIV | IPI_WR,
			IF_CMD, ip_sioctl_mtu, NULL },
	/* 022 */ { SIOCGIFMTU,	sizeof (struct ifreq), IPI_GET_CMD | IPI_REPL,
			IF_CMD, ip_sioctl_get_mtu, NULL },
	/* 023 */ { SIOCGIFBRDADDR, sizeof (struct ifreq),
			IPI_GET_CMD | IPI_REPL,
			IF_CMD, ip_sioctl_get_brdaddr, NULL },
	/* 024 */ { SIOCSIFBRDADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
			IF_CMD, ip_sioctl_brdaddr, NULL },
	/* 025 */ { SIOCGIFNETMASK, sizeof (struct ifreq),
			IPI_GET_CMD | IPI_REPL,
			IF_CMD, ip_sioctl_get_netmask, NULL },
	/* 026 */ { SIOCSIFNETMASK, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
			IF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart },
	/* 027 */ { SIOCGIFMETRIC, sizeof (struct ifreq),
			IPI_GET_CMD | IPI_REPL,
			IF_CMD, ip_sioctl_get_metric, NULL },
	/* 028 */ { SIOCSIFMETRIC, sizeof (struct ifreq), IPI_PRIV,
			IF_CMD, ip_sioctl_metric, NULL },
	/* 029 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },

	/* See 166-168 below for extended SIOC*XARP ioctls */
	/* 030 */ { SIOCSARP, sizeof (struct arpreq), IPI_PRIV,
			MISC_CMD, ip_sioctl_arp, NULL },
	/* 031 */ { SIOCGARP, sizeof (struct arpreq), IPI_GET_CMD | IPI_REPL,
			MISC_CMD, ip_sioctl_arp, NULL },
	/* 032 */ { SIOCDARP, sizeof (struct arpreq), IPI_PRIV,
			MISC_CMD, ip_sioctl_arp, NULL },

	/* 033 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 034 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 035 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 036 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 037 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 038 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 039 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 040 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 041 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 042 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 043 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 044 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 045 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 046 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 047 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 048 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 049 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 050 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 051 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 052 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 053 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },

	/* 054 */ { IF_UNITSEL,	sizeof (int), IPI_PRIV | IPI_WR | IPI_MODOK,
			MISC_CMD, if_unitsel, if_unitsel_restart },

	/* 055 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 056 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 057 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 058 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 059 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 060 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 061 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 062 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 063 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 064 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 065 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 066 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 067 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 068 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 069 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 070 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 071 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 072 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },

	/* 073 */ { SIOCSIFNAME, sizeof (struct ifreq),
			IPI_PRIV | IPI_WR | IPI_MODOK,
			IF_CMD, ip_sioctl_sifname, NULL },

	/* 074 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 075 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 076 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 077 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 078 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 079 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 080 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 081 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 082 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 083 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 084 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 085 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 086 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },

	/* 087 */ { SIOCGIFNUM, sizeof (int), IPI_GET_CMD | IPI_REPL,
			MISC_CMD, ip_sioctl_get_ifnum, NULL },
	/* 088 */ { SIOCGIFMUXID, sizeof (struct ifreq), IPI_GET_CMD | IPI_REPL,
			IF_CMD, ip_sioctl_get_muxid, NULL },
	/* 089 */ { SIOCSIFMUXID, sizeof (struct ifreq),
			IPI_PRIV | IPI_WR | IPI_REPL,
			IF_CMD, ip_sioctl_muxid, NULL },

	/* Both if and lif variants share same func */
	/* 090 */ { SIOCGIFINDEX, sizeof (struct ifreq), IPI_GET_CMD | IPI_REPL,
			IF_CMD, ip_sioctl_get_lifindex, NULL },
	/* Both if and lif variants share same func */
	/* 091 */ { SIOCSIFINDEX, sizeof (struct ifreq),
			IPI_PRIV | IPI_WR | IPI_REPL,
			IF_CMD, ip_sioctl_slifindex, NULL },

	/* copyin size cannot be coded for SIOCGIFCONF */
	/* 092 */ { SIOCGIFCONF, 0, IPI_GET_CMD | IPI_REPL,
			MISC_CMD, ip_sioctl_get_ifconf, NULL },
	/* 093 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 094 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 095 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 096 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 097 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 098 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 099 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 100 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 101 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 102 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 103 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 104 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 105 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 106 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 107 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 108 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 109 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },

	/* 110 */ { SIOCLIFREMOVEIF, sizeof (struct lifreq),
			IPI_PRIV | IPI_WR | IPI_REPL,
			LIF_CMD, ip_sioctl_removeif,
			ip_sioctl_removeif_restart },
	/* 111 */ { SIOCLIFADDIF, sizeof (struct lifreq),
			IPI_GET_CMD | IPI_PRIV | IPI_WR | IPI_REPL,
			LIF_CMD, ip_sioctl_addif, NULL },
#define	SIOCLIFADDR_NDX 112
	/* 112 */ { SIOCSLIFADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart },
	/* 113 */ { SIOCGLIFADDR, sizeof (struct lifreq),
			IPI_GET_CMD | IPI_REPL,
			LIF_CMD, ip_sioctl_get_addr, NULL },
	/* 114 */ { SIOCSLIFDSTADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart },
	/* 115 */ { SIOCGLIFDSTADDR, sizeof (struct lifreq),
			IPI_GET_CMD | IPI_REPL,
			LIF_CMD, ip_sioctl_get_dstaddr, NULL },
	/* 116 */ { SIOCSLIFFLAGS, sizeof (struct lifreq),
			IPI_PRIV | IPI_WR | IPI_REPL,
			LIF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart },
	/* 117 */ { SIOCGLIFFLAGS, sizeof (struct lifreq),
			IPI_GET_CMD | IPI_MODOK | IPI_REPL,
			LIF_CMD, ip_sioctl_get_flags, NULL },

	/* 118 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 119 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },

	/* 120 */ { O_SIOCGLIFCONF, 0, IPI_GET_CMD, MISC_CMD | IPI_REPL,
			ip_sioctl_get_lifconf, NULL },
	/* 121 */ { SIOCSLIFMTU, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_mtu, NULL },
	/* 122 */ { SIOCGLIFMTU, sizeof (struct lifreq), IPI_GET_CMD | IPI_REPL,
			LIF_CMD, ip_sioctl_get_mtu, NULL },
	/* 123 */ { SIOCGLIFBRDADDR, sizeof (struct lifreq),
			IPI_GET_CMD | IPI_REPL,
			LIF_CMD, ip_sioctl_get_brdaddr, NULL },
	/* 124 */ { SIOCSLIFBRDADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_brdaddr, NULL },
	/* 125 */ { SIOCGLIFNETMASK, sizeof (struct lifreq),
			IPI_GET_CMD | IPI_REPL,
			LIF_CMD, ip_sioctl_get_netmask, NULL },
	/* 126 */ { SIOCSLIFNETMASK, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart },
	/* 127 */ { SIOCGLIFMETRIC, sizeof (struct lifreq),
			IPI_GET_CMD | IPI_REPL,
			LIF_CMD, ip_sioctl_get_metric, NULL },
	/* 128 */ { SIOCSLIFMETRIC, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_metric, NULL },
	/* 129 */ { SIOCSLIFNAME, sizeof (struct lifreq),
			IPI_PRIV | IPI_WR | IPI_MODOK | IPI_REPL,
			LIF_CMD, ip_sioctl_slifname,
			ip_sioctl_slifname_restart },

	/* 130 */ { SIOCGLIFNUM, sizeof (struct lifnum), IPI_GET_CMD | IPI_REPL,
			MISC_CMD, ip_sioctl_get_lifnum, NULL },
	/* 131 */ { SIOCGLIFMUXID, sizeof (struct lifreq),
			IPI_GET_CMD | IPI_REPL,
			LIF_CMD, ip_sioctl_get_muxid, NULL },
	/* 132 */ { SIOCSLIFMUXID, sizeof (struct lifreq),
			IPI_PRIV | IPI_WR | IPI_REPL,
			LIF_CMD, ip_sioctl_muxid, NULL },
	/* 133 */ { SIOCGLIFINDEX, sizeof (struct lifreq),
			IPI_GET_CMD | IPI_REPL,
			LIF_CMD, ip_sioctl_get_lifindex, 0 },
	/* 134 */ { SIOCSLIFINDEX, sizeof (struct lifreq),
			IPI_PRIV | IPI_WR | IPI_REPL,
			LIF_CMD, ip_sioctl_slifindex, 0 },
	/* 135 */ { SIOCSLIFTOKEN, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_token, NULL },
	/* 136 */ { SIOCGLIFTOKEN, sizeof (struct lifreq),
			IPI_GET_CMD | IPI_REPL,
			LIF_CMD, ip_sioctl_get_token, NULL },
	/* 137 */ { SIOCSLIFSUBNET, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_subnet, ip_sioctl_subnet_restart },
	/* 138 */ { SIOCGLIFSUBNET, sizeof (struct lifreq),
			IPI_GET_CMD | IPI_REPL,
			LIF_CMD, ip_sioctl_get_subnet, NULL },
	/* 139 */ { SIOCSLIFLNKINFO, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_lnkinfo, NULL },

	/* 140 */ { SIOCGLIFLNKINFO, sizeof (struct lifreq),
			IPI_GET_CMD | IPI_REPL,
			LIF_CMD, ip_sioctl_get_lnkinfo, NULL },
	/* 141 */ { SIOCLIFDELND, sizeof (struct lifreq), IPI_PRIV,
			LIF_CMD, ip_siocdelndp_v6, NULL },
	/* 142 */ { SIOCLIFGETND, sizeof (struct lifreq), IPI_GET_CMD,
			LIF_CMD, ip_siocqueryndp_v6, NULL },
	/* 143 */ { SIOCLIFSETND, sizeof (struct lifreq), IPI_PRIV,
			LIF_CMD, ip_siocsetndp_v6, NULL },
	/* 144 */ { SIOCTMYADDR, sizeof (struct sioc_addrreq), IPI_GET_CMD,
			MISC_CMD, ip_sioctl_tmyaddr, NULL },
	/* 145 */ { SIOCTONLINK, sizeof (struct sioc_addrreq), IPI_GET_CMD,
			MISC_CMD, ip_sioctl_tonlink, NULL },
	/* 146 */ { SIOCTMYSITE, sizeof (struct sioc_addrreq), 0,
			MISC_CMD, ip_sioctl_tmysite, NULL },
	/* 147 */ { SIOCGTUNPARAM, sizeof (struct iftun_req), IPI_REPL,
			TUN_CMD, ip_sioctl_tunparam, NULL },
	/* 148 */ { SIOCSTUNPARAM, sizeof (struct iftun_req),
			IPI_PRIV | IPI_WR,
			TUN_CMD, ip_sioctl_tunparam, NULL },

	/* IPSECioctls handled in ip_sioctl_copyin_setup itself */
	/* 149 */ { SIOCFIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },
	/* 150 */ { SIOCSIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },
	/* 151 */ { SIOCDIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },
	/* 152 */ { SIOCLIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },

	/* 153 */ { SIOCLIFFAILOVER, sizeof (struct lifreq),
			IPI_PRIV | IPI_WR | IPI_REPL,
			LIF_CMD, ip_sioctl_move, ip_sioctl_move },
	/* 154 */ { SIOCLIFFAILBACK, sizeof (struct lifreq),
			IPI_PRIV | IPI_WR | IPI_REPL,
			LIF_CMD, ip_sioctl_move, ip_sioctl_move },
	/* 155 */ { SIOCSLIFGROUPNAME, sizeof (struct lifreq),
			IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_groupname, ip_sioctl_groupname },
	/* 156 */ { SIOCGLIFGROUPNAME, sizeof (struct lifreq),
			IPI_GET_CMD | IPI_REPL,
			LIF_CMD, ip_sioctl_get_groupname, NULL },
	/* 157 */ { SIOCGLIFOINDEX, sizeof (struct lifreq),
			IPI_GET_CMD | IPI_REPL,
			LIF_CMD, ip_sioctl_get_oindex, NULL },

	/* Leave 158-160 unused; used to be SIOC*IFARP ioctls */
	/* 158 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 159 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 160 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },

	/* 161 */ { SIOCSLIFOINDEX, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_slifoindex, NULL },

	/* These are handled in ip_sioctl_copyin_setup itself */
	/* 162 */ { SIOCGIP6ADDRPOLICY, 0, IPI_NULL_BCONT,
			MISC_CMD, NULL, NULL },
	/* 163 */ { SIOCSIP6ADDRPOLICY, 0, IPI_PRIV | IPI_NULL_BCONT,
			MISC_CMD, NULL, NULL },
	/* 164 */ { SIOCGDSTINFO, 0, IPI_GET_CMD, MISC_CMD, NULL, NULL },

	/* 165 */ { SIOCGLIFCONF, 0, IPI_GET_CMD, MISC_CMD | IPI_REPL,
			ip_sioctl_get_lifconf, NULL },

	/* 166 */ { SIOCSXARP, sizeof (struct xarpreq), IPI_PRIV,
			MISC_CMD, ip_sioctl_xarp, NULL },
	/* 167 */ { SIOCGXARP, sizeof (struct xarpreq), IPI_GET_CMD | IPI_REPL,
			MISC_CMD, ip_sioctl_xarp, NULL },
	/* 168 */ { SIOCDXARP, sizeof (struct xarpreq), IPI_PRIV,
			MISC_CMD, ip_sioctl_xarp, NULL },

	/* SIOCPOPSOCKFS is not handled by IP */
	/* 169 */ { IPI_DONTCARE /* SIOCPOPSOCKFS */, 0, 0, 0, NULL, NULL },

	/* 170 */ { SIOCGLIFZONE, sizeof (struct lifreq),
			IPI_GET_CMD | IPI_REPL,
			LIF_CMD, ip_sioctl_get_lifzone, NULL },
	/* 171 */ { SIOCSLIFZONE, sizeof (struct lifreq),
			IPI_PRIV | IPI_WR | IPI_REPL,
			LIF_CMD, ip_sioctl_slifzone,
			ip_sioctl_slifzone_restart },
	/* 172-174 are SCTP ioctls and not handled by IP */
	/* 172 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 173 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 174 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 175 */ { SIOCGLIFUSESRC, sizeof (struct lifreq),
			IPI_GET_CMD, LIF_CMD,
			ip_sioctl_get_lifusesrc, 0 },
	/* 176 */ { SIOCSLIFUSESRC, sizeof (struct lifreq),
			IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_slifusesrc,
			NULL },
	/* 177 */ { SIOCGLIFSRCOF, 0, IPI_GET_CMD, MISC_CMD,
			ip_sioctl_get_lifsrcof, NULL },
	/* 178 */ { SIOCGMSFILTER, sizeof (struct group_filter), IPI_GET_CMD,
			MISC_CMD, ip_sioctl_msfilter, NULL },
	/* 179 */ { SIOCSMSFILTER, sizeof (struct group_filter), IPI_WR,
			MISC_CMD, ip_sioctl_msfilter, NULL },
	/* 180 */ { SIOCGIPMSFILTER, sizeof (struct ip_msfilter), IPI_GET_CMD,
			MISC_CMD, ip_sioctl_msfilter, NULL },
	/* 181 */ { SIOCSIPMSFILTER, sizeof (struct ip_msfilter), IPI_WR,
			MISC_CMD, ip_sioctl_msfilter, NULL },
	/* 182 */ { SIOCSIPMPFAILBACK, sizeof (int), IPI_PRIV, MISC_CMD,
			ip_sioctl_set_ipmpfailback, NULL }
};

int ip_ndx_ioctl_count = sizeof (ip_ndx_ioctl_table) / sizeof (ip_ioctl_cmd_t);

/* Overflow table for ioctls whose low byte clashes with an ndx entry. */
ip_ioctl_cmd_t ip_misc_ioctl_table[] = {
	{ OSIOCGTUNPARAM, sizeof (struct old_iftun_req),
		IPI_GET_CMD | IPI_REPL, TUN_CMD, ip_sioctl_tunparam, NULL },
	{ OSIOCSTUNPARAM, sizeof (struct old_iftun_req), IPI_PRIV | IPI_WR,
		TUN_CMD, ip_sioctl_tunparam, NULL },
	{ I_LINK,	0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL },
	{ I_UNLINK,	0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL },
	{ I_PLINK,	0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL },
	{ I_PUNLINK,	0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL },
	{ ND_GET,	0, IPI_PASS_DOWN, 0, NULL, NULL },
	{ ND_SET,	0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL },
	{ IP_IOCTL,	0, 0, 0, NULL, NULL },
	{ SIOCGETVIFCNT, sizeof (struct
sioc_vif_req), IPI_REPL | IPI_GET_CMD, 1441 MISC_CMD, mrt_ioctl}, 1442 { SIOCGETSGCNT, sizeof (struct sioc_sg_req), IPI_REPL | IPI_GET_CMD, 1443 MISC_CMD, mrt_ioctl}, 1444 { SIOCGETLSGCNT, sizeof (struct sioc_lsg_req), IPI_REPL | IPI_GET_CMD, 1445 MISC_CMD, mrt_ioctl} 1446 }; 1447 1448 int ip_misc_ioctl_count = 1449 sizeof (ip_misc_ioctl_table) / sizeof (ip_ioctl_cmd_t); 1450 1451 static idl_t *conn_drain_list; /* The array of conn drain lists */ 1452 static uint_t conn_drain_list_cnt; /* Total count of conn_drain_list */ 1453 static int conn_drain_list_index; /* Next drain_list to be used */ 1454 int conn_drain_nthreads; /* Number of drainers reqd. */ 1455 /* Settable in /etc/system */ 1456 1457 /* Defined in ip_ire.c */ 1458 extern uint32_t ip_ire_max_bucket_cnt, ip6_ire_max_bucket_cnt; 1459 extern uint32_t ip_ire_min_bucket_cnt, ip6_ire_min_bucket_cnt; 1460 extern uint32_t ip_ire_mem_ratio, ip_ire_cpu_ratio; 1461 1462 static nv_t ire_nv_arr[] = { 1463 { IRE_BROADCAST, "BROADCAST" }, 1464 { IRE_LOCAL, "LOCAL" }, 1465 { IRE_LOOPBACK, "LOOPBACK" }, 1466 { IRE_CACHE, "CACHE" }, 1467 { IRE_DEFAULT, "DEFAULT" }, 1468 { IRE_PREFIX, "PREFIX" }, 1469 { IRE_IF_NORESOLVER, "IF_NORESOL" }, 1470 { IRE_IF_RESOLVER, "IF_RESOLV" }, 1471 { IRE_HOST, "HOST" }, 1472 { IRE_HOST_REDIRECT, "HOST_REDIRECT" }, 1473 { 0 } 1474 }; 1475 1476 nv_t *ire_nv_tbl = ire_nv_arr; 1477 1478 /* Defined in ip_if.c, protect the list of IPsec capable ills */ 1479 extern krwlock_t ipsec_capab_ills_lock; 1480 1481 /* Packet dropper for IP IPsec processing failures */ 1482 ipdropper_t ip_dropper; 1483 1484 /* Simple ICMP IP Header Template */ 1485 static ipha_t icmp_ipha = { 1486 IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP 1487 }; 1488 1489 struct module_info ip_mod_info = { 1490 IP_MOD_ID, IP_MOD_NAME, 1, INFPSZ, 65536, 1024 1491 }; 1492 1493 static struct qinit rinit = { 1494 (pfi_t)ip_rput, NULL, ip_open, ip_close, NULL, 1495 &ip_mod_info 1496 }; 1497 1498 static struct qinit winit = { 1499 
(pfi_t)ip_wput, (pfi_t)ip_wsrv, ip_open, ip_close, NULL, 1500 &ip_mod_info 1501 }; 1502 1503 static struct qinit lrinit = { 1504 (pfi_t)ip_lrput, NULL, ip_open, ip_close, NULL, 1505 &ip_mod_info 1506 }; 1507 1508 static struct qinit lwinit = { 1509 (pfi_t)ip_lwput, NULL, ip_open, ip_close, NULL, 1510 &ip_mod_info 1511 }; 1512 1513 struct streamtab ipinfo = { 1514 &rinit, &winit, &lrinit, &lwinit 1515 }; 1516 1517 #ifdef DEBUG 1518 static boolean_t skip_sctp_cksum = B_FALSE; 1519 #endif 1520 /* 1521 * Copy an M_CTL-tagged message, preserving reference counts appropriately. 1522 */ 1523 mblk_t * 1524 ip_copymsg(mblk_t *mp) 1525 { 1526 mblk_t *nmp; 1527 ipsec_info_t *in; 1528 1529 if (mp->b_datap->db_type != M_CTL) 1530 return (copymsg(mp)); 1531 1532 in = (ipsec_info_t *)mp->b_rptr; 1533 1534 /* 1535 * Note that M_CTL is also used for delivering ICMP error messages 1536 * upstream to transport layers. 1537 */ 1538 if (in->ipsec_info_type != IPSEC_OUT && 1539 in->ipsec_info_type != IPSEC_IN) 1540 return (copymsg(mp)); 1541 1542 nmp = copymsg(mp->b_cont); 1543 1544 if (in->ipsec_info_type == IPSEC_OUT) 1545 return (ipsec_out_tag(mp, nmp)); 1546 else 1547 return (ipsec_in_tag(mp, nmp)); 1548 } 1549 1550 /* Generate an ICMP fragmentation needed message. 
 */
/*
 * Build and send an ICMP "fragmentation needed" (dest-unreachable,
 * code 4) error for the packet in mp, advertising next-hop MTU `mtu`.
 * Consumes mp (and its M_CTL prefix, if present) in all cases; the
 * actual transmit and rate limiting are done by icmp_pkt()/
 * icmp_pkt_err_ok().
 */
static void
icmp_frag_needed(queue_t *q, mblk_t *mp, int mtu)
{
	icmph_t	icmph;
	mblk_t *first_mp;
	boolean_t mctl_present;

	/* Split off any leading M_CTL (IPsec state) from the data. */
	EXTRACT_PKT_MP(mp, first_mp, mctl_present);

	/* icmp_pkt_err_ok() rate-limits and may consume/replace mp. */
	if (!(mp = icmp_pkt_err_ok(mp))) {
		if (mctl_present)
			freeb(first_mp);
		return;
	}

	bzero(&icmph, sizeof (icmph_t));
	icmph.icmph_type = ICMP_DEST_UNREACHABLE;
	icmph.icmph_code = ICMP_FRAGMENTATION_NEEDED;
	icmph.icmph_du_mtu = htons((uint16_t)mtu);
	BUMP_MIB(&icmp_mib, icmpOutFragNeeded);
	BUMP_MIB(&icmp_mib, icmpOutDestUnreachs);
	icmp_pkt(q, first_mp, &icmph, sizeof (icmph_t), mctl_present);
}

/*
 * icmp_inbound deals with ICMP messages in the following ways.
 *
 * 1) It needs to send a reply back and possibly delivering it
 *    to the "interested" upper clients.
 * 2) It needs to send it to the upper clients only.
 * 3) It needs to change some values in IP only.
 * 4) It needs to change some values in IP and upper layers e.g TCP.
 *
 * We need to accomodate icmp messages coming in clear until we get
 * everything secure from the wire. If icmp_accept_clear_messages
 * is zero we check with the global policy and act accordingly. If
 * it is non-zero, we accept the message without any checks. But
 * *this does not mean* that this will be delivered to the upper
 * clients. By accepting we might send replies back, change our MTU
 * value etc. but delivery to the ULP/clients depends on their policy
 * dispositions.
 *
 * We handle the above 4 cases in the context of IPSEC in the
 * following way :
 *
 * 1) Send the reply back in the same way as the request came in.
 *    If it came in encrypted, it goes out encrypted. If it came in
 *    clear, it goes out in clear. Thus, this will prevent chosen
 *    plain text attack.
 * 2) The client may or may not expect things to come in secure.
 *    If it comes in secure, the policy constraints are checked
 *    before delivering it to the upper layers. If it comes in
 *    clear, ipsec_inbound_accept_clear will decide whether to
 *    accept this in clear or not. In both the cases, if the returned
 *    message (IP header + 8 bytes) that caused the icmp message has
 *    AH/ESP headers, it is sent up to AH/ESP for validation before
 *    sending up. If there are only 8 bytes of returned message, then
 *    upper client will not be notified.
 * 3) Check with global policy to see whether it matches the constaints.
 *    But this will be done only if icmp_accept_messages_in_clear is
 *    zero.
 * 4) If we need to change both in IP and ULP, then the decision taken
 *    while affecting the values in IP and while delivering up to TCP
 *    should be the same.
 *
 * There are two cases.
 *
 * a) If we reject data at the IP layer (ipsec_check_global_policy()
 *    failed), we will not deliver it to the ULP, even though they
 *    are *willing* to accept in *clear*. This is fine as our global
 *    disposition to icmp messages asks us reject the datagram.
 *
 * b) If we accept data at the IP layer (ipsec_check_global_policy()
 *    succeeded or icmp_accept_messages_in_clear is 1), and not able
 *    to deliver it to ULP (policy failed), it can lead to
 *    consistency problems. The cases known at this time are
 *    ICMP_DESTINATION_UNREACHABLE messages with following code
 *    values :
 *
 *    - ICMP_FRAGMENTATION_NEEDED : IP adapts to the new value
 *      and Upper layer rejects. Then the communication will
 *      come to a stop. This is solved by making similar decisions
 *      at both levels. Currently, when we are unable to deliver
 *      to the Upper Layer (due to policy failures) while IP has
 *      adjusted ire_max_frag, the next outbound datagram would
 *      generate a local ICMP_FRAGMENTATION_NEEDED message - which
 *      will be with the right level of protection. Thus the right
 *      value will be communicated even if we are not able to
 *      communicate when we get from the wire initially. But this
 *      assumes there would be at least one outbound datagram after
 *      IP has adjusted its ire_max_frag value. To make things
 *      simpler, we accept in clear after the validation of
 *      AH/ESP headers.
 *
 *    - Other ICMP ERRORS : We may not be able to deliver it to the
 *      upper layer depending on the level of protection the upper
 *      layer expects and the disposition in ipsec_inbound_accept_clear().
 *      ipsec_inbound_accept_clear() decides whether a given ICMP error
 *      should be accepted in clear when the Upper layer expects secure.
 *      Thus the communication may get aborted by some bad ICMP
 *      packets.
 *
 * IPQoS Notes:
 * The only instance when a packet is sent for processing is when there
 * isn't an ICMP client and if we are interested in it.
 * If there is a client, IPPF processing will take place in the
 * ip_fanout_proto routine.
 *
 * Zones notes:
 * The packet is only processed in the context of the specified zone: typically
 * only this zone will reply to an echo request, and only interested clients in
 * this zone will receive a copy of the packet. This means that the caller must
 * call icmp_inbound() for each relevant zone.
 */
/*
 * Main inbound ICMP handler; see the block comment above for the overall
 * design.  Parameters:
 *	q		- read queue the packet arrived on
 *	mp		- the packet (possibly preceded by an M_CTL IPSEC_IN
 *			  block when mctl_present is B_TRUE; then mp is the
 *			  M_CTL and the data is mp->b_cont)
 *	broadcast	- packet was received as a link/IP broadcast
 *	ill, recv_ill	- interface the packet is processed on / arrived on
 *	sum_valid, sum	- precomputed ICMP checksum, if valid
 *	ip_policy	- whether IPQoS IPP_LOCAL_IN processing is wanted
 *	zoneid		- zone context for replies and fanout
 * Consumes the message on all paths (reply, fanout, or free).
 */
static void
icmp_inbound(queue_t *q, mblk_t *mp, boolean_t broadcast, ill_t *ill,
    int sum_valid, uint32_t sum, boolean_t mctl_present, boolean_t ip_policy,
    ill_t *recv_ill, zoneid_t zoneid)
{
	icmph_t	*icmph;
	ipha_t	*ipha;
	int	iph_hdr_length;
	int	hdr_length;
	boolean_t	interested;
	uint32_t	ts;
	uchar_t	*wptr;
	ipif_t	*ipif;
	mblk_t *first_mp;
	ipsec_in_t *ii;
	ire_t *src_ire;
	boolean_t onlink;
	timestruc_t now;
	uint32_t ill_index;

	ASSERT(ill != NULL);

	first_mp = mp;
	if (mctl_present) {
		mp = first_mp->b_cont;
		ASSERT(mp != NULL);
	}

	ipha = (ipha_t *)mp->b_rptr;
	if (icmp_accept_clear_messages == 0) {
		first_mp = ipsec_check_global_policy(first_mp, NULL,
		    ipha, NULL, mctl_present);
		if (first_mp == NULL)
			return;
	}

	/*
	 * On a labeled system, we have to check whether the zone itself is
	 * permitted to receive raw traffic.
	 */
	if (is_system_labeled()) {
		if (zoneid == ALL_ZONES)
			zoneid = tsol_packet_to_zoneid(mp);
		if (!tsol_can_accept_raw(mp, B_FALSE)) {
			ip1dbg(("icmp_inbound: zone %d can't receive raw",
			    zoneid));
			BUMP_MIB(&icmp_mib, icmpInErrors);
			freemsg(first_mp);
			return;
		}
	}

	/*
	 * We have accepted the ICMP message. It means that we will
	 * respond to the packet if needed. It may not be delivered
	 * to the upper client depending on the policy constraints
	 * and the disposition in ipsec_inbound_accept_clear.
	 */

	ASSERT(ill != NULL);

	BUMP_MIB(&icmp_mib, icmpInMsgs);
	iph_hdr_length = IPH_HDR_LENGTH(ipha);
	if ((mp->b_wptr - mp->b_rptr) < (iph_hdr_length + ICMPH_SIZE)) {
		/* Last chance to get real. */
		if (!pullupmsg(mp, iph_hdr_length + ICMPH_SIZE)) {
			BUMP_MIB(&icmp_mib, icmpInErrors);
			freemsg(first_mp);
			return;
		}
		/* Refresh iph following the pullup. */
		ipha = (ipha_t *)mp->b_rptr;
	}
	/* ICMP header checksum, including checksum field, should be zero. */
	if (sum_valid ? (sum != 0 && sum != 0xFFFF) :
	    IP_CSUM(mp, iph_hdr_length, 0)) {
		BUMP_MIB(&icmp_mib, icmpInCksumErrs);
		freemsg(first_mp);
		return;
	}
	/* The IP header will always be a multiple of four bytes */
	icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
	ip2dbg(("icmp_inbound: type %d code %d\n", icmph->icmph_type,
	    icmph->icmph_code));
	wptr = (uchar_t *)icmph + ICMPH_SIZE;
	/* We will set "interested" to "true" if we want a copy */
	interested = B_FALSE;
	switch (icmph->icmph_type) {
	case ICMP_ECHO_REPLY:
		BUMP_MIB(&icmp_mib, icmpInEchoReps);
		break;
	case ICMP_DEST_UNREACHABLE:
		if (icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED)
			BUMP_MIB(&icmp_mib, icmpInFragNeeded);
		interested = B_TRUE;	/* Pass up to transport */
		BUMP_MIB(&icmp_mib, icmpInDestUnreachs);
		break;
	case ICMP_SOURCE_QUENCH:
		interested = B_TRUE;	/* Pass up to transport */
		BUMP_MIB(&icmp_mib, icmpInSrcQuenchs);
		break;
	case ICMP_REDIRECT:
		if (!ip_ignore_redirect)
			interested = B_TRUE;
		BUMP_MIB(&icmp_mib, icmpInRedirects);
		break;
	case ICMP_ECHO_REQUEST:
		/*
		 * Whether to respond to echo requests that come in as IP
		 * broadcasts or as IP multicast is subject to debate
		 * (what isn't?). We aim to please, you pick it.
		 * Default is do it.
		 */
		if (!broadcast && !CLASSD(ipha->ipha_dst)) {
			/* unicast: always respond */
			interested = B_TRUE;
		} else if (CLASSD(ipha->ipha_dst)) {
			/* multicast: respond based on tunable */
			interested = ip_g_resp_to_echo_mcast;
		} else if (broadcast) {
			/* broadcast: respond based on tunable */
			interested = ip_g_resp_to_echo_bcast;
		}
		BUMP_MIB(&icmp_mib, icmpInEchos);
		break;
	case ICMP_ROUTER_ADVERTISEMENT:
	case ICMP_ROUTER_SOLICITATION:
		break;
	case ICMP_TIME_EXCEEDED:
		interested = B_TRUE;	/* Pass up to transport */
		BUMP_MIB(&icmp_mib, icmpInTimeExcds);
		break;
	case ICMP_PARAM_PROBLEM:
		interested = B_TRUE;	/* Pass up to transport */
		BUMP_MIB(&icmp_mib, icmpInParmProbs);
		break;
	case ICMP_TIME_STAMP_REQUEST:
		/* Response to Time Stamp Requests is local policy. */
		if (ip_g_resp_to_timestamp &&
		    /* So is whether to respond if it was an IP broadcast. */
		    (!broadcast || ip_g_resp_to_timestamp_bcast)) {
			int tstamp_len = 3 * sizeof (uint32_t);

			if (wptr + tstamp_len > mp->b_wptr) {
				if (!pullupmsg(mp, wptr + tstamp_len -
				    mp->b_rptr)) {
					BUMP_MIB(&ip_mib, ipInDiscards);
					freemsg(first_mp);
					return;
				}
				/* Refresh ipha following the pullup. */
				ipha = (ipha_t *)mp->b_rptr;
				icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
				wptr = (uchar_t *)icmph + ICMPH_SIZE;
			}
			interested = B_TRUE;
		}
		BUMP_MIB(&icmp_mib, icmpInTimestamps);
		break;
	case ICMP_TIME_STAMP_REPLY:
		BUMP_MIB(&icmp_mib, icmpInTimestampReps);
		break;
	case ICMP_INFO_REQUEST:
		/* Per RFC 1122 3.2.2.7, ignore this. */
	case ICMP_INFO_REPLY:
		break;
	case ICMP_ADDRESS_MASK_REQUEST:
		if ((ip_respond_to_address_mask_broadcast || !broadcast) &&
		    /* TODO m_pullup of complete header? */
		    (mp->b_datap->db_lim - wptr) >= IP_ADDR_LEN)
			interested = B_TRUE;
		BUMP_MIB(&icmp_mib, icmpInAddrMasks);
		break;
	case ICMP_ADDRESS_MASK_REPLY:
		BUMP_MIB(&icmp_mib, icmpInAddrMaskReps);
		break;
	default:
		interested = B_TRUE;	/* Pass up to transport */
		BUMP_MIB(&icmp_mib, icmpInUnknowns);
		break;
	}
	/* See if there is an ICMP client. */
	if (ipcl_proto_search(IPPROTO_ICMP) != NULL) {
		/* If there is an ICMP client and we want one too, copy it. */
		mblk_t *first_mp1;

		if (!interested) {
			ip_fanout_proto(q, first_mp, ill, ipha, 0, mctl_present,
			    ip_policy, recv_ill, zoneid);
			return;
		}
		first_mp1 = ip_copymsg(first_mp);
		if (first_mp1 != NULL) {
			ip_fanout_proto(q, first_mp1, ill, ipha,
			    0, mctl_present, ip_policy, recv_ill, zoneid);
		}
	} else if (!interested) {
		freemsg(first_mp);
		return;
	} else {
		/*
		 * Initiate policy processing for this packet if ip_policy
		 * is true.
		 */
		if (IPP_ENABLED(IPP_LOCAL_IN) && ip_policy) {
			ill_index = ill->ill_phyint->phyint_ifindex;
			ip_process(IPP_LOCAL_IN, &mp, ill_index);
			if (mp == NULL) {
				if (mctl_present) {
					freeb(first_mp);
				}
				BUMP_MIB(&icmp_mib, icmpInErrors);
				return;
			}
		}
	}
	/* We want to do something with it. */
	/* Check db_ref to make sure we can modify the packet. */
	if (mp->b_datap->db_ref > 1) {
		mblk_t	*first_mp1;

		first_mp1 = ip_copymsg(first_mp);
		freemsg(first_mp);
		if (!first_mp1) {
			BUMP_MIB(&icmp_mib, icmpOutDrops);
			return;
		}
		first_mp = first_mp1;
		if (mctl_present) {
			mp = first_mp->b_cont;
			ASSERT(mp != NULL);
		} else {
			mp = first_mp;
		}
		ipha = (ipha_t *)mp->b_rptr;
		icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
		wptr = (uchar_t *)icmph + ICMPH_SIZE;
	}
	/*
	 * Requests we answer in place fall through to the transmit code
	 * after the switch; errors are fanned out from the default case.
	 */
	switch (icmph->icmph_type) {
	case ICMP_ADDRESS_MASK_REQUEST:
		ipif = ipif_lookup_remote(ill, ipha->ipha_src, zoneid);
		if (ipif == NULL) {
			freemsg(first_mp);
			return;
		}
		/*
		 * outgoing interface must be IPv4
		 */
		ASSERT(ipif != NULL && !ipif->ipif_isv6);
		icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY;
		bcopy(&ipif->ipif_net_mask, wptr, IP_ADDR_LEN);
		ipif_refrele(ipif);
		BUMP_MIB(&icmp_mib, icmpOutAddrMaskReps);
		break;
	case ICMP_ECHO_REQUEST:
		icmph->icmph_type = ICMP_ECHO_REPLY;
		BUMP_MIB(&icmp_mib, icmpOutEchoReps);
		break;
	case ICMP_TIME_STAMP_REQUEST: {
		uint32_t *tsp;

		icmph->icmph_type = ICMP_TIME_STAMP_REPLY;
		tsp = (uint32_t *)wptr;
		tsp++;		/* Skip past 'originate time' */
		/* Compute # of milliseconds since midnight */
		gethrestime(&now);
		ts = (now.tv_sec % (24 * 60 * 60)) * 1000 +
		    now.tv_nsec / (NANOSEC / MILLISEC);
		*tsp++ = htonl(ts);	/* Lay in 'receive time' */
		*tsp++ = htonl(ts);	/* Lay in 'send time' */
		BUMP_MIB(&icmp_mib, icmpOutTimestampReps);
		break;
	}
	default:
		/*
		 * ICMP error: make sure the embedded (offending) IP header
		 * is fully pulled up and sane before fanning out.
		 */
		ipha = (ipha_t *)&icmph[1];
		if ((uchar_t *)&ipha[1] > mp->b_wptr) {
			if (!pullupmsg(mp, (uchar_t *)&ipha[1] - mp->b_rptr)) {
				BUMP_MIB(&ip_mib, ipInDiscards);
				freemsg(first_mp);
				return;
			}
			icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
			ipha = (ipha_t *)&icmph[1];
		}
		if ((IPH_HDR_VERSION(ipha) != IPV4_VERSION)) {
			BUMP_MIB(&ip_mib, ipInDiscards);
			freemsg(first_mp);
			return;
		}
		hdr_length = IPH_HDR_LENGTH(ipha);
		if (hdr_length < sizeof (ipha_t)) {
			BUMP_MIB(&ip_mib, ipInDiscards);
			freemsg(first_mp);
			return;
		}
		if ((uchar_t *)ipha + hdr_length > mp->b_wptr) {
			if (!pullupmsg(mp,
			    (uchar_t *)ipha + hdr_length - mp->b_rptr)) {
				BUMP_MIB(&ip_mib, ipInDiscards);
				freemsg(first_mp);
				return;
			}
			icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
			ipha = (ipha_t *)&icmph[1];
		}
		switch (icmph->icmph_type) {
		case ICMP_REDIRECT:
			/*
			 * As there is no upper client to deliver, we don't
			 * need the first_mp any more.
			 */
			if (mctl_present) {
				freeb(first_mp);
			}
			icmp_redirect(mp);
			return;
		case ICMP_DEST_UNREACHABLE:
			if (icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED) {
				if (!icmp_inbound_too_big(icmph, ipha)) {
					freemsg(first_mp);
					return;
				}
			}
			/* FALLTHRU */
		default :
			/*
			 * IPQoS notes: Since we have already done IPQoS
			 * processing we don't want to do it again in
			 * the fanout routines called by
			 * icmp_inbound_error_fanout, hence the last
			 * argument, ip_policy, is B_FALSE.
			 */
			icmp_inbound_error_fanout(q, ill, first_mp, icmph,
			    ipha, iph_hdr_length, hdr_length, mctl_present,
			    B_FALSE, recv_ill, zoneid);
		}
		return;
	}
	/* Send out an ICMP packet */
	icmph->icmph_checksum = 0;
	icmph->icmph_checksum = IP_CSUM(mp, iph_hdr_length, 0);
	if (icmph->icmph_checksum == 0)
		icmph->icmph_checksum = 0xFFFF;
	if (broadcast || CLASSD(ipha->ipha_dst)) {
		ipif_t	*ipif_chosen;
		/*
		 * Make it look like it was directed to us, so we don't look
		 * like a fool with a broadcast or multicast source address.
		 */
		ipif = ipif_lookup_remote(ill, ipha->ipha_src, zoneid);
		/*
		 * Make sure that we haven't grabbed an interface that's DOWN.
		 */
		if (ipif != NULL) {
			ipif_chosen = ipif_select_source(ipif->ipif_ill,
			    ipha->ipha_src, zoneid);
			if (ipif_chosen != NULL) {
				ipif_refrele(ipif);
				ipif = ipif_chosen;
			}
		}
		if (ipif == NULL) {
			ip0dbg(("icmp_inbound: "
			    "No source for broadcast/multicast:\n"
			    "\tsrc 0x%x dst 0x%x ill %p "
			    "ipif_lcl_addr 0x%x\n",
			    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
			    (void *)ill,
			    ill->ill_ipif->ipif_lcl_addr));
			freemsg(first_mp);
			return;
		}
		ASSERT(ipif != NULL && !ipif->ipif_isv6);
		ipha->ipha_dst = ipif->ipif_src_addr;
		ipif_refrele(ipif);
	}
	/* Reset time to live. */
	ipha->ipha_ttl = ip_def_ttl;
	{
		/* Swap source and destination addresses */
		ipaddr_t tmp;

		tmp = ipha->ipha_src;
		ipha->ipha_src = ipha->ipha_dst;
		ipha->ipha_dst = tmp;
	}
	ipha->ipha_ident = 0;
	if (!IS_SIMPLE_IPH(ipha))
		icmp_options_update(ipha);

	/*
	 * ICMP echo replies should go out on the same interface
	 * the request came on as probes used by in.mpathd for detecting
	 * NIC failures are ECHO packets. We turn-off load spreading
	 * by setting ipsec_in_attach_if to B_TRUE, which is copied
	 * to ipsec_out_attach_if by ipsec_in_to_out called later in this
	 * function. This is in turn handled by ip_wput and ip_newroute
	 * to make sure that the packet goes out on the interface it came
	 * in on. If we don't turnoff load spreading, the packets might get
	 * dropped if there are no non-FAILED/INACTIVE interfaces for it
	 * to go out and in.mpathd would wrongly detect a failure or
	 * mis-detect a NIC failure for link failure. As load spreading
	 * can happen only if ill_group is not NULL, we do only for
	 * that case and this does not affect the normal case.
	 *
	 * We turn off load spreading only on echo packets that came from
	 * on-link hosts. If the interface route has been deleted, this will
	 * not be enforced as we can't do much. For off-link hosts, as the
	 * default routes in IPv4 does not typically have an ire_ipif
	 * pointer, we can't force MATCH_IRE_ILL in ip_wput/ip_newroute.
	 * Moreover, expecting a default route through this interface may
	 * not be correct. We use ipha_dst because of the swap above.
	 */
	onlink = B_FALSE;
	if (icmph->icmph_type == ICMP_ECHO_REPLY && ill->ill_group != NULL) {
		/*
		 * First, we need to make sure that it is not one of our
		 * local addresses. If we set onlink when it is one of
		 * our local addresses, we will end up creating IRE_CACHES
		 * for one of our local addresses. Then, we will never
		 * accept packets for them afterwards.
		 */
		src_ire = ire_ctable_lookup(ipha->ipha_dst, 0, IRE_LOCAL,
		    NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE);
		if (src_ire == NULL) {
			ipif = ipif_get_next_ipif(NULL, ill);
			if (ipif == NULL) {
				BUMP_MIB(&ip_mib, ipInDiscards);
				/*
				 * NOTE(review): only mp is freed here; if
				 * mctl_present, first_mp (the M_CTL block)
				 * appears to be leaked — confirm.
				 */
				freemsg(mp);
				return;
			}
			src_ire = ire_ftable_lookup(ipha->ipha_dst, 0, 0,
			    IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0,
			    NULL, MATCH_IRE_ILL | MATCH_IRE_TYPE);
			ipif_refrele(ipif);
			if (src_ire != NULL) {
				onlink = B_TRUE;
				ire_refrele(src_ire);
			}
		} else {
			ire_refrele(src_ire);
		}
	}
	if (!mctl_present) {
		/*
		 * This packet should go out the same way as it
		 * came in i.e in clear. To make sure that global
		 * policy will not be applied to this in ip_wput_ire,
		 * we attach a IPSEC_IN mp and clear ipsec_in_secure.
		 */
		ASSERT(first_mp == mp);
		if ((first_mp = ipsec_in_alloc(B_TRUE)) == NULL) {
			BUMP_MIB(&ip_mib, ipInDiscards);
			freemsg(mp);
			return;
		}
		ii = (ipsec_in_t *)first_mp->b_rptr;

		/* This is not a secure packet */
		ii->ipsec_in_secure = B_FALSE;
		if (onlink) {
			ii->ipsec_in_attach_if = B_TRUE;
			ii->ipsec_in_ill_index =
			    ill->ill_phyint->phyint_ifindex;
			ii->ipsec_in_rill_index =
			    recv_ill->ill_phyint->phyint_ifindex;
		}
		first_mp->b_cont = mp;
	} else if (onlink) {
		ii = (ipsec_in_t *)first_mp->b_rptr;
		ii->ipsec_in_attach_if = B_TRUE;
		ii->ipsec_in_ill_index = ill->ill_phyint->phyint_ifindex;
		ii->ipsec_in_rill_index = recv_ill->ill_phyint->phyint_ifindex;
	} else {
		ii = (ipsec_in_t *)first_mp->b_rptr;
	}
	ii->ipsec_in_zoneid = zoneid;
	ASSERT(zoneid != ALL_ZONES);
	if (!ipsec_in_to_out(first_mp, ipha, NULL)) {
		BUMP_MIB(&ip_mib, ipInDiscards);
		return;
	}
	BUMP_MIB(&icmp_mib, icmpOutMsgs);
	put(WR(q), first_mp);
}

/* Table from RFC 1191 */
static int icmp_frag_size_table[] =
{ 32000, 17914, 8166, 4352, 2002, 1496, 1006, 508, 296, 68 };

/*
 * Process received ICMP Packet too big.
 * After updating any IRE it does the fanout to any matching transport streams.
 * Assumes the message has been pulled up till the IP header that caused
 * the error.
 *
 * Returns B_FALSE on failure and B_TRUE on success.
 */

/*
 * icmph/ipha point at the ICMP error header and the embedded (offending)
 * IP header respectively; both must already be pulled up by the caller.
 * Walks every IRE_CACHE entry for the destination under the bucket lock
 * and lowers ire_max_frag, either to the router-advertised MTU or to the
 * next RFC 1191 plateau when no MTU was advertised (icmph_du_zero != 0).
 * Also rewrites icmph_du_mtu so the ULP fanout sees the adjusted value.
 */
static boolean_t
icmp_inbound_too_big(icmph_t *icmph, ipha_t *ipha)
{
	ire_t	*ire, *first_ire;
	int	mtu;
	int	hdr_length;

	ASSERT(icmph->icmph_type == ICMP_DEST_UNREACHABLE &&
	    icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED);

	hdr_length = IPH_HDR_LENGTH(ipha);

	first_ire = ire_ctable_lookup(ipha->ipha_dst, 0, IRE_CACHE, NULL,
	    ALL_ZONES, NULL, MATCH_IRE_TYPE);

	if (!first_ire) {
		ip1dbg(("icmp_inbound_too_big: no route for 0x%x\n",
		    ntohl(ipha->ipha_dst)));
		return (B_FALSE);
	}
	/* Drop if the original packet contained a source route */
	if (ip_source_route_included(ipha)) {
		ire_refrele(first_ire);
		return (B_FALSE);
	}
	/* Check for MTU discovery advice as described in RFC 1191 */
	mtu = ntohs(icmph->icmph_du_mtu);
	rw_enter(&first_ire->ire_bucket->irb_lock, RW_READER);
	for (ire = first_ire; ire != NULL && ire->ire_addr == ipha->ipha_dst;
	    ire = ire->ire_next) {
		mutex_enter(&ire->ire_lock);
		if (icmph->icmph_du_zero == 0 && mtu > 68) {
			/* Reduce the IRE max frag value as advised. */
			ip1dbg(("Received mtu from router: %d (was %d)\n",
			    mtu, ire->ire_max_frag));
			ire->ire_max_frag = MIN(ire->ire_max_frag, mtu);
		} else {
			uint32_t length;
			int	i;

			/*
			 * Use the table from RFC 1191 to figure out
			 * the next "plateau" based on the length in
			 * the original IP packet.
			 */
			length = ntohs(ipha->ipha_length);
			if (ire->ire_max_frag <= length &&
			    ire->ire_max_frag >= length - hdr_length) {
				/*
				 * Handle broken BSD 4.2 systems that
				 * return the wrong iph_length in ICMP
				 * errors.
				 */
				ip1dbg(("Wrong mtu: sent %d, ire %d\n",
				    length, ire->ire_max_frag));
				length -= hdr_length;
			}
			for (i = 0; i < A_CNT(icmp_frag_size_table); i++) {
				if (length > icmp_frag_size_table[i])
					break;
			}
			if (i == A_CNT(icmp_frag_size_table)) {
				/* Smaller than 68! */
				ip1dbg(("Too big for packet size %d\n",
				    length));
				ire->ire_max_frag = MIN(ire->ire_max_frag, 576);
				ire->ire_frag_flag = 0;
			} else {
				mtu = icmp_frag_size_table[i];
				ip1dbg(("Calculated mtu %d, packet size %d, "
				    "before %d", mtu, length,
				    ire->ire_max_frag));
				ire->ire_max_frag = MIN(ire->ire_max_frag, mtu);
				ip1dbg((", after %d\n", ire->ire_max_frag));
			}
			/* Record the new max frag size for the ULP. */
			icmph->icmph_du_zero = 0;
			icmph->icmph_du_mtu =
			    htons((uint16_t)ire->ire_max_frag);
		}
		mutex_exit(&ire->ire_lock);
	}
	rw_exit(&first_ire->ire_bucket->irb_lock);
	ire_refrele(first_ire);
	return (B_TRUE);
}

/*
 * If the packet in error is Self-Encapsulated, icmp_inbound_error_fanout
 * calls this function.
 *
 * Strips the outer IP header of a self-encapsulated (IP-in-IP to self)
 * packet embedded in an ICMP error by overlaying everything that follows
 * it onto it.  Pulls the whole message into one mblk first.  Returns the
 * (possibly reallocated) mblk, or NULL on allocation failure (mp is then
 * unusable; ipInDiscards has been bumped).
 */
static mblk_t *
icmp_inbound_self_encap_error(mblk_t *mp, int iph_hdr_length, int hdr_length)
{
	ipha_t *ipha;
	icmph_t *icmph;
	ipha_t *in_ipha;
	int length;

	ASSERT(mp->b_datap->db_type == M_DATA);

	/*
	 * For Self-encapsulated packets, we added an extra IP header
	 * without the options. Inner IP header is the one from which
	 * the outer IP header was formed. Thus, we need to remove the
	 * outer IP header. To do this, we pullup the whole message
	 * and overlay whatever follows the outer IP header over the
	 * outer IP header.
	 */

	if (!pullupmsg(mp, -1)) {
		BUMP_MIB(&ip_mib, ipInDiscards);
		return (NULL);
	}

	icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
	ipha = (ipha_t *)&icmph[1];
	in_ipha = (ipha_t *)((uchar_t *)ipha + hdr_length);

	/*
	 * The length that we want to overlay is following the inner
	 * IP header. Subtracting the IP header + icmp header + outer
	 * IP header's length should give us the length that we want to
	 * overlay.
	 */
	length = msgdsize(mp) - iph_hdr_length - sizeof (icmph_t) -
	    hdr_length;
	/*
	 * Overlay whatever follows the inner header over the
	 * outer header.
	 */
	bcopy((uchar_t *)in_ipha, (uchar_t *)ipha, length);

	/* Set the wptr to account for the outer header */
	mp->b_wptr -= hdr_length;
	return (mp);
}

/*
 * Try to pass the ICMP message upstream in case the ULP cares.
 *
 * If the packet that caused the ICMP error is secure, we send
 * it to AH/ESP to make sure that the attached packet has a
 * valid association. ipha in the code below points to the
 * IP header of the packet that caused the error.
 *
 * We handle ICMP_FRAGMENTATION_NEEDED(IFN) message differently
 * in the context of IPSEC. Normally we tell the upper layer
 * whenever we send the ire (including ip_bind), the IPSEC header
 * length in ire_ipsec_overhead. TCP can deduce the MSS as it
 * has both the MTU (ire_max_frag) and the ire_ipsec_overhead.
 * Similarly, we pass the new MTU icmph_du_mtu and TCP does the
 * same thing. As TCP has the IPSEC options size that needs to be
 * adjusted, we just pass the MTU unchanged.
 *
 * IFN could have been generated locally or by some router.
 *
 * LOCAL : *ip_wput_ire -> icmp_frag_needed could have generated this.
2321 * This happens because IP adjusted its value of MTU on an 2322 * earlier IFN message and could not tell the upper layer, 2323 * the new adjusted value of MTU e.g. Packet was encrypted 2324 * or there was not enough information to fanout to upper 2325 * layers. Thus on the next outbound datagram, ip_wput_ire 2326 * generates the IFN, where IPSEC processing has *not* been 2327 * done. 2328 * 2329 * *ip_wput_ire_fragmentit -> ip_wput_frag -> icmp_frag_needed 2330 * could have generated this. This happens because ire_max_frag 2331 * value in IP was set to a new value, while the IPSEC processing 2332 * was being done and after we made the fragmentation check in 2333 * ip_wput_ire. Thus on return from IPSEC processing, 2334 * ip_wput_ipsec_out finds that the new length is > ire_max_frag 2335 * and generates the IFN. As IPSEC processing is over, we fanout 2336 * to AH/ESP to remove the header. 2337 * 2338 * In both these cases, ipsec_in_loopback will be set indicating 2339 * that IFN was generated locally. 2340 * 2341 * ROUTER : IFN could be secure or non-secure. 2342 * 2343 * * SECURE : We use the IPSEC_IN to fanout to AH/ESP if the 2344 * packet in error has AH/ESP headers to validate the AH/ESP 2345 * headers. AH/ESP will verify whether there is a valid SA or 2346 * not and send it back. We will fanout again if we have more 2347 * data in the packet. 2348 * 2349 * If the packet in error does not have AH/ESP, we handle it 2350 * like any other case. 2351 * 2352 * * NON_SECURE : If the packet in error has AH/ESP headers, 2353 * we attach a dummy ipsec_in and send it up to AH/ESP 2354 * for validation. AH/ESP will verify whether there is a 2355 * valid SA or not and send it back. We will fanout again if 2356 * we have more data in the packet. 2357 * 2358 * If the packet in error does not have AH/ESP, we handle it 2359 * like any other case. 
 */
static void
icmp_inbound_error_fanout(queue_t *q, ill_t *ill, mblk_t *mp,
    icmph_t *icmph, ipha_t *ipha, int iph_hdr_length, int hdr_length,
    boolean_t mctl_present, boolean_t ip_policy, ill_t *recv_ill,
    zoneid_t zoneid)
{
	uint16_t *up;	/* Pointer to ports in ULP header */
	uint32_t ports;	/* reversed ports for fanout */
	ipha_t ripha;	/* With reversed addresses */
	mblk_t *first_mp;
	ipsec_in_t *ii;
	tcph_t	*tcph;
	conn_t	*connp;

	first_mp = mp;
	if (mctl_present) {
		/* An IPSEC_IN M_CTL precedes the data; point mp at data. */
		mp = first_mp->b_cont;
		ASSERT(mp != NULL);

		ii = (ipsec_in_t *)first_mp->b_rptr;
		ASSERT(ii->ipsec_in_type == IPSEC_IN);
	} else {
		ii = NULL;
	}

	/* Fan out on the transport protocol of the packet in error. */
	switch (ipha->ipha_protocol) {
	case IPPROTO_UDP:
		/*
		 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
		 * transport header.
		 */
		if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
		    mp->b_wptr) {
			if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length +
			    ICMP_MIN_TP_HDR_LEN - mp->b_rptr)) {
				BUMP_MIB(&ip_mib, ipInDiscards);
				goto drop_pkt;
			}
			/* pullupmsg may have moved data; recompute. */
			icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
			ipha = (ipha_t *)&icmph[1];
		}
		up = (uint16_t *)((uchar_t *)ipha + hdr_length);

		/*
		 * Attempt to find a client stream based on port.
		 * Note that we do a reverse lookup since the header is
		 * in the form we sent it out.
		 * The ripha header is only used for the IP_UDP_MATCH and we
		 * only set the src and dst addresses and protocol.
		 */
		ripha.ipha_src = ipha->ipha_dst;
		ripha.ipha_dst = ipha->ipha_src;
		ripha.ipha_protocol = ipha->ipha_protocol;
		((uint16_t *)&ports)[0] = up[1];
		((uint16_t *)&ports)[1] = up[0];
		ip2dbg(("icmp_inbound_error: UDP %x:%d to %x:%d: %d/%d\n",
		    ntohl(ipha->ipha_src), ntohs(up[0]),
		    ntohl(ipha->ipha_dst), ntohs(up[1]),
		    icmph->icmph_type, icmph->icmph_code));

		/* Have to change db_type after any pullupmsg */
		DB_TYPE(mp) = M_CTL;

		ip_fanout_udp(q, first_mp, ill, &ripha, ports, B_FALSE, 0,
		    mctl_present, ip_policy, recv_ill, zoneid);
		return;

	case IPPROTO_TCP:
		/*
		 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
		 * transport header.
		 */
		if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
		    mp->b_wptr) {
			if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length +
			    ICMP_MIN_TP_HDR_LEN - mp->b_rptr)) {
				BUMP_MIB(&ip_mib, ipInDiscards);
				goto drop_pkt;
			}
			icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
			ipha = (ipha_t *)&icmph[1];
		}
		/*
		 * Find a TCP client stream for this packet.
		 * Note that we do a reverse lookup since the header is
		 * in the form we sent it out.
		 */
		tcph = (tcph_t *)((uchar_t *)ipha + hdr_length);
		connp = ipcl_tcp_lookup_reversed_ipv4(ipha, tcph, TCPS_LISTEN);
		if (connp == NULL) {
			BUMP_MIB(&ip_mib, ipInDiscards);
			goto drop_pkt;
		}

		/* Have to change db_type after any pullupmsg */
		DB_TYPE(mp) = M_CTL;
		/* Deliver to the connection's squeue for serialization. */
		squeue_fill(connp->conn_sqp, first_mp, tcp_input,
		    connp, SQTAG_TCP_INPUT_ICMP_ERR);
		return;

	case IPPROTO_SCTP:
		/*
		 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
		 * transport header.
		 */
		if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
		    mp->b_wptr) {
			if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length +
			    ICMP_MIN_TP_HDR_LEN - mp->b_rptr)) {
				BUMP_MIB(&ip_mib, ipInDiscards);
				goto drop_pkt;
			}
			icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
			ipha = (ipha_t *)&icmph[1];
		}
		up = (uint16_t *)((uchar_t *)ipha + hdr_length);
		/*
		 * Find a SCTP client stream for this packet.
		 * Note that we do a reverse lookup since the header is
		 * in the form we sent it out.
		 * The ripha header is only used for the matching and we
		 * only set the src and dst addresses, protocol, and version.
		 */
		ripha.ipha_src = ipha->ipha_dst;
		ripha.ipha_dst = ipha->ipha_src;
		ripha.ipha_protocol = ipha->ipha_protocol;
		ripha.ipha_version_and_hdr_length =
		    ipha->ipha_version_and_hdr_length;
		((uint16_t *)&ports)[0] = up[1];
		((uint16_t *)&ports)[1] = up[0];

		/* Have to change db_type after any pullupmsg */
		DB_TYPE(mp) = M_CTL;
		ip_fanout_sctp(first_mp, recv_ill, &ripha, ports, 0,
		    mctl_present, ip_policy, 0, zoneid);
		return;

	case IPPROTO_ESP:
	case IPPROTO_AH: {
		int ipsec_rc;

		/*
		 * We need a IPSEC_IN in the front to fanout to AH/ESP.
		 * We will re-use the IPSEC_IN if it is already present as
		 * AH/ESP will not affect any fields in the IPSEC_IN for
		 * ICMP errors. If there is no IPSEC_IN, allocate a new
		 * one and attach it in the front.
		 */
		if (ii != NULL) {
			/*
			 * ip_fanout_proto_again converts the ICMP errors
			 * that come back from AH/ESP to M_DATA so that
			 * if it is non-AH/ESP and we do a pullupmsg in
			 * this function, it would work. Convert it back
			 * to M_CTL before we send up as this is a ICMP
			 * error. This could have been generated locally or
			 * by some router. Validate the inner IPSEC
			 * headers.
			 *
			 * NOTE : ill_index is used by ip_fanout_proto_again
			 * to locate the ill.
			 */
			ASSERT(ill != NULL);
			ii->ipsec_in_ill_index =
			    ill->ill_phyint->phyint_ifindex;
			ii->ipsec_in_rill_index =
			    recv_ill->ill_phyint->phyint_ifindex;
			DB_TYPE(first_mp->b_cont) = M_CTL;
		} else {
			/*
			 * IPSEC_IN is not present. We attach a ipsec_in
			 * message and send up to IPSEC for validating
			 * and removing the IPSEC headers. Clear
			 * ipsec_in_secure so that when we return
			 * from IPSEC, we don't mistakenly think that this
			 * is a secure packet came from the network.
			 *
			 * NOTE : ill_index is used by ip_fanout_proto_again
			 * to locate the ill.
			 */
			ASSERT(first_mp == mp);
			first_mp = ipsec_in_alloc(B_TRUE);
			if (first_mp == NULL) {
				freemsg(mp);
				BUMP_MIB(&ip_mib, ipInDiscards);
				return;
			}
			ii = (ipsec_in_t *)first_mp->b_rptr;

			/* This is not a secure packet */
			ii->ipsec_in_secure = B_FALSE;
			first_mp->b_cont = mp;
			DB_TYPE(mp) = M_CTL;
			ASSERT(ill != NULL);
			ii->ipsec_in_ill_index =
			    ill->ill_phyint->phyint_ifindex;
			ii->ipsec_in_rill_index =
			    recv_ill->ill_phyint->phyint_ifindex;
		}
		ip2dbg(("icmp_inbound_error: ipsec\n"));

		if (!ipsec_loaded()) {
			ip_proto_not_sup(q, first_mp, 0, zoneid);
			return;
		}

		if (ipha->ipha_protocol == IPPROTO_ESP)
			ipsec_rc = ipsecesp_icmp_error(first_mp);
		else
			ipsec_rc = ipsecah_icmp_error(first_mp);
		if (ipsec_rc == IPSEC_STATUS_FAILED)
			return;

		ip_fanout_proto_again(first_mp, ill, recv_ill, NULL);
		return;
	}
	default:
		/*
		 * The ripha header is only used for the lookup and we
		 * only set the src and dst addresses and protocol.
		 */
		ripha.ipha_src = ipha->ipha_dst;
		ripha.ipha_dst = ipha->ipha_src;
		ripha.ipha_protocol = ipha->ipha_protocol;
		ip2dbg(("icmp_inbound_error: proto %d %x to %x: %d/%d\n",
		    ripha.ipha_protocol, ntohl(ipha->ipha_src),
		    ntohl(ipha->ipha_dst),
		    icmph->icmph_type, icmph->icmph_code));
		if (ipha->ipha_protocol == IPPROTO_ENCAP) {
			ipha_t *in_ipha;

			if ((uchar_t *)ipha + hdr_length + sizeof (ipha_t) >
			    mp->b_wptr) {
				if (!pullupmsg(mp, (uchar_t *)ipha +
				    hdr_length + sizeof (ipha_t) -
				    mp->b_rptr)) {

					BUMP_MIB(&ip_mib, ipInDiscards);
					goto drop_pkt;
				}
				icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
				ipha = (ipha_t *)&icmph[1];
			}
			/*
			 * Caller has verified that length has to be
			 * at least the size of IP header.
			 */
			ASSERT(hdr_length >= sizeof (ipha_t));
			/*
			 * Check the sanity of the inner IP header like
			 * we did for the outer header.
			 */
			in_ipha = (ipha_t *)((uchar_t *)ipha + hdr_length);
			if ((IPH_HDR_VERSION(in_ipha) != IPV4_VERSION)) {
				BUMP_MIB(&ip_mib, ipInDiscards);
				goto drop_pkt;
			}
			if (IPH_HDR_LENGTH(in_ipha) < sizeof (ipha_t)) {
				BUMP_MIB(&ip_mib, ipInDiscards);
				goto drop_pkt;
			}
			/* Check for Self-encapsulated tunnels */
			if (in_ipha->ipha_src == ipha->ipha_src &&
			    in_ipha->ipha_dst == ipha->ipha_dst) {

				mp = icmp_inbound_self_encap_error(mp,
				    iph_hdr_length, hdr_length);
				if (mp == NULL)
					goto drop_pkt;
				icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
				ipha = (ipha_t *)&icmph[1];
				hdr_length = IPH_HDR_LENGTH(ipha);
				/*
				 * The packet in error is self-encapsulated.
				 * And we are finding it further encapsulated
				 * which we could not have possibly generated.
				 */
				if (ipha->ipha_protocol == IPPROTO_ENCAP) {
					BUMP_MIB(&ip_mib, ipInDiscards);
					goto drop_pkt;
				}
				/* Recurse on the de-encapsulated error. */
				icmp_inbound_error_fanout(q, ill, first_mp,
				    icmph, ipha, iph_hdr_length, hdr_length,
				    mctl_present, ip_policy, recv_ill, zoneid);
				return;
			}
		}
		if ((ipha->ipha_protocol == IPPROTO_ENCAP ||
		    ipha->ipha_protocol == IPPROTO_IPV6) &&
		    icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED &&
		    ii != NULL &&
		    ii->ipsec_in_loopback &&
		    ii->ipsec_in_secure) {
			/*
			 * For IP tunnels that get a looped-back
			 * ICMP_FRAGMENTATION_NEEDED message, adjust the
			 * reported new MTU to take into account the IPsec
			 * headers protecting this configured tunnel.
			 *
			 * This allows the tunnel module (tun.c) to blindly
			 * accept the MTU reported in an ICMP "too big"
			 * message.
			 *
			 * Non-looped back ICMP messages will just be
			 * handled by the security protocols (if needed),
			 * and the first subsequent packet will hit this
			 * path.
			 */
			icmph->icmph_du_mtu = htons(ntohs(icmph->icmph_du_mtu) -
			    ipsec_in_extra_length(first_mp));
		}
		/* Have to change db_type after any pullupmsg */
		DB_TYPE(mp) = M_CTL;

		ip_fanout_proto(q, first_mp, ill, &ripha, 0, mctl_present,
		    ip_policy, recv_ill, zoneid);
		return;
	}
	/* NOTREACHED */
drop_pkt:;
	ip1dbg(("icmp_inbound_error_fanout: drop pkt\n"));
	freemsg(first_mp);
}

/*
 * Common IP options parser.
 *
 * Setup routine: fill in *optp with options-parsing state, then
 * tail-call ipoptp_next to return the first option.
 */
uint8_t
ipoptp_first(ipoptp_t *optp, ipha_t *ipha)
{
	uint32_t totallen; /* total length of all options */

	/*
	 * Header length beyond the fixed 20-byte header, in 4-byte
	 * words; shift left by 2 to convert to bytes.
	 */
	totallen = ipha->ipha_version_and_hdr_length -
	    (uint8_t)((IP_VERSION << 4) + IP_SIMPLE_HDR_LENGTH_IN_WORDS);
	totallen <<= 2;
	optp->ipoptp_next = (uint8_t *)(&ipha[1]);
	optp->ipoptp_end = optp->ipoptp_next + totallen;
	optp->ipoptp_flags = 0;
	return (ipoptp_next(optp));
}

/*
 * Common IP options parser: extract next option.
 * Returns the option type, or IPOPT_EOL when the options are exhausted
 * or malformed (in which case IPOPTP_ERROR is set in ipoptp_flags).
 */
uint8_t
ipoptp_next(ipoptp_t *optp)
{
	uint8_t *end = optp->ipoptp_end;
	uint8_t *cur = optp->ipoptp_next;
	uint8_t opt, len, pointer;

	/*
	 * If cur > end already, then the ipoptp_end or ipoptp_next pointer
	 * has been corrupted.
	 */
	ASSERT(cur <= end);

	if (cur == end)
		return (IPOPT_EOL);

	opt = cur[IPOPT_OPTVAL];

	/*
	 * Skip any NOP options.
	 */
	while (opt == IPOPT_NOP) {
		cur++;
		if (cur == end)
			return (IPOPT_EOL);
		opt = cur[IPOPT_OPTVAL];
	}

	if (opt == IPOPT_EOL)
		return (IPOPT_EOL);

	/*
	 * Option requiring a length.
	 */
	if ((cur + 1) >= end) {
		optp->ipoptp_flags |= IPOPTP_ERROR;
		return (IPOPT_EOL);
	}
	len = cur[IPOPT_OLEN];
	if (len < 2) {
		optp->ipoptp_flags |= IPOPTP_ERROR;
		return (IPOPT_EOL);
	}
	optp->ipoptp_cur = cur;
	optp->ipoptp_len = len;
	optp->ipoptp_next = cur + len;
	if (cur + len > end) {
		/* Option overruns the header; flag and stop parsing. */
		optp->ipoptp_flags |= IPOPTP_ERROR;
		return (IPOPT_EOL);
	}

	/*
	 * For the options which require a pointer field, make sure
	 * its there, and make sure it points to either something
	 * inside this option, or the end of the option.
	 */
	switch (opt) {
	case IPOPT_RR:
	case IPOPT_TS:
	case IPOPT_LSRR:
	case IPOPT_SSRR:
		if (len <= IPOPT_OFFSET) {
			optp->ipoptp_flags |= IPOPTP_ERROR;
			return (opt);
		}
		pointer = cur[IPOPT_OFFSET];
		/*
		 * The pointer is one-origin (RFC 791); pointer - 1 is
		 * the zero-origin offset, which may be at most len
		 * (i.e., one past the last entry).
		 */
		if (pointer - 1 > len) {
			optp->ipoptp_flags |= IPOPTP_ERROR;
			return (opt);
		}
		break;
	}

	/*
	 * Sanity check the pointer field based on the type of the
	 * option.
	 */
	switch (opt) {
	case IPOPT_RR:
	case IPOPT_SSRR:
	case IPOPT_LSRR:
		if (pointer < IPOPT_MINOFF_SR)
			optp->ipoptp_flags |= IPOPTP_ERROR;
		break;
	case IPOPT_TS:
		if (pointer < IPOPT_MINOFF_IT)
			optp->ipoptp_flags |= IPOPTP_ERROR;
		/*
		 * Note that the Internet Timestamp option also
		 * contains two four bit fields (the Overflow field,
		 * and the Flag field), which follow the pointer
		 * field. We don't need to check that these fields
		 * fall within the length of the option because this
		 * was implicitly done above. We've checked that the
		 * pointer value is at least IPOPT_MINOFF_IT, and that
		 * it falls within the option. Since IPOPT_MINOFF_IT >
		 * IPOPT_POS_OV_FLG, we don't need the explicit check.
		 */
		ASSERT(len > IPOPT_POS_OV_FLG);
		break;
	}

	return (opt);
}

/*
 * Use the outgoing IP header to create an IP_OPTIONS option the way
 * it was passed down from the application.
 * Returns the length of the constructed options (padded to a multiple
 * of 4 bytes); buf must be large enough to hold them.
 */
int
ip_opt_get_user(const ipha_t *ipha, uchar_t *buf)
{
	ipoptp_t opts;
	const uchar_t *opt;
	uint8_t optval;
	uint8_t optlen;
	uint32_t len = 0;
	uchar_t *buf1 = buf;

	buf += IP_ADDR_LEN;	/* Leave room for final destination */
	len += IP_ADDR_LEN;
	bzero(buf1, IP_ADDR_LEN);

	/*
	 * OK to cast away const here, as we don't store through the returned
	 * opts.ipoptp_cur pointer.
	 */
	for (optval = ipoptp_first(&opts, (ipha_t *)ipha);
	    optval != IPOPT_EOL;
	    optval = ipoptp_next(&opts)) {
		int off;

		opt = opts.ipoptp_cur;
		optlen = opts.ipoptp_len;
		switch (optval) {
		case IPOPT_SSRR:
		case IPOPT_LSRR:

			/*
			 * Insert ipha_dst as the first entry in the source
			 * route and move down the entries one step.
			 * The last entry gets placed at buf1.
			 */
			buf[IPOPT_OPTVAL] = optval;
			buf[IPOPT_OLEN] = optlen;
			buf[IPOPT_OFFSET] = optlen;

			off = optlen - IP_ADDR_LEN;
			if (off < 0) {
				/* No entries in source route */
				break;
			}
			/* Last entry in source route */
			bcopy(opt + off, buf1, IP_ADDR_LEN);
			off -= IP_ADDR_LEN;

			/* Shift the remaining entries down one slot. */
			while (off > 0) {
				bcopy(opt + off,
				    buf + off + IP_ADDR_LEN,
				    IP_ADDR_LEN);
				off -= IP_ADDR_LEN;
			}
			/* ipha_dst into first slot */
			bcopy(&ipha->ipha_dst,
			    buf + off + IP_ADDR_LEN,
			    IP_ADDR_LEN);
			buf += optlen;
			len += optlen;
			break;

		case IPOPT_COMSEC:
		case IPOPT_SECURITY:
			/* if passing up a label is not ok, then remove */
			if (is_system_labeled())
				break;
			/* FALLTHROUGH */
		default:
			/* Copy the option through unchanged. */
			bcopy(opt, buf, optlen);
			buf += optlen;
			len += optlen;
			break;
		}
	}
	/* NOTE(review): no goto targets this label in this chunk. */
done:
	/* Pad the resulting options */
	while (len & 0x3) {
		*buf++ = IPOPT_EOL;
		len++;
	}
	return (len);
}

/*
 * Update any record route or timestamp options to include this host.
 * Reverse any source route option.
 * This routine assumes that the options are well formed i.e. that they
 * have already been checked.
 */
static void
icmp_options_update(ipha_t *ipha)
{
	ipoptp_t opts;
	uchar_t *opt;
	uint8_t optval;
	ipaddr_t src;	/* Our local address */
	ipaddr_t dst;

	ip2dbg(("icmp_options_update\n"));
	src = ipha->ipha_src;
	dst = ipha->ipha_dst;

	for (optval = ipoptp_first(&opts, ipha);
	    optval != IPOPT_EOL;
	    optval = ipoptp_next(&opts)) {
		ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0);
		opt = opts.ipoptp_cur;
		ip2dbg(("icmp_options_update: opt %d, len %d\n",
		    optval, opts.ipoptp_len));
		switch (optval) {
			int off1, off2;
		case IPOPT_SSRR:
		case IPOPT_LSRR:
			/*
			 * Reverse the source route. The first entry
			 * should be the next to last one in the current
			 * source route (the last entry is our address).
			 * The last entry should be the final destination.
			 */
			off1 = IPOPT_MINOFF_SR - 1;
			off2 = opt[IPOPT_OFFSET] - IP_ADDR_LEN - 1;
			if (off2 < 0) {
				/* No entries in source route */
				ip1dbg((
				    "icmp_options_update: bad src route\n"));
				break;
			}
			/*
			 * Swap the last route entry with ipha_dst:
			 * the old last hop becomes the new destination.
			 * (dst is reused here as scratch space.)
			 */
			bcopy((char *)opt + off2, &dst, IP_ADDR_LEN);
			bcopy(&ipha->ipha_dst, (char *)opt + off2, IP_ADDR_LEN);
			bcopy(&dst, &ipha->ipha_dst, IP_ADDR_LEN);
			off2 -= IP_ADDR_LEN;

			/*
			 * Reverse the remaining entries in place by
			 * swapping from both ends toward the middle
			 * (src is reused here as scratch space).
			 */
			while (off1 < off2) {
				bcopy((char *)opt + off1, &src, IP_ADDR_LEN);
				bcopy((char *)opt + off2, (char *)opt + off1,
				    IP_ADDR_LEN);
				bcopy(&src, (char *)opt + off2, IP_ADDR_LEN);
				off1 += IP_ADDR_LEN;
				off2 -= IP_ADDR_LEN;
			}
			/* Reset the pointer to the first entry. */
			opt[IPOPT_OFFSET] = IPOPT_MINOFF_SR;
			break;
		}
	}
}

/*
 * Process received ICMP Redirect messages.
 */
/* ARGSUSED */
static void
icmp_redirect(mblk_t *mp)
{
	ipha_t *ipha;
	int iph_hdr_length;
	icmph_t *icmph;
	ipha_t *ipha_err;	/* Header of the packet that was redirected */
	ire_t *ire;
	ire_t *prev_ire;
	ire_t *save_ire;
	ipaddr_t src, dst, gateway;
	iulp_t ulp_info = { 0 };
	int error;

	ipha = (ipha_t *)mp->b_rptr;
	iph_hdr_length = IPH_HDR_LENGTH(ipha);
	/* Must carry at least the ICMP header plus the embedded IP header. */
	if (((mp->b_wptr - mp->b_rptr) - iph_hdr_length) <
	    sizeof (icmph_t) + IP_SIMPLE_HDR_LENGTH) {
		BUMP_MIB(&icmp_mib, icmpInErrors);
		freemsg(mp);
		return;
	}
	icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
	ipha_err = (ipha_t *)&icmph[1];
	src = ipha->ipha_src;
	dst = ipha_err->ipha_dst;
	gateway = icmph->icmph_rd_gateway;
	/* Make sure the new gateway is reachable somehow. */
	ire = ire_route_lookup(gateway, 0, 0, IRE_INTERFACE, NULL, NULL,
	    ALL_ZONES, NULL, MATCH_IRE_TYPE);
	/*
	 * Make sure we had a route for the dest in question and that
	 * that route was pointing to the old gateway (the source of the
	 * redirect packet.)
	 */
	prev_ire = ire_route_lookup(dst, 0, src, 0, NULL, NULL, ALL_ZONES,
	    NULL, MATCH_IRE_GW);
	/*
	 * Check that
	 *	the redirect was not from ourselves
	 *	the new gateway and the old gateway are directly reachable
	 */
	if (!prev_ire ||
	    !ire ||
	    ire->ire_type == IRE_LOCAL) {
		BUMP_MIB(&icmp_mib, icmpInBadRedirects);
		freemsg(mp);
		if (ire != NULL)
			ire_refrele(ire);
		if (prev_ire != NULL)
			ire_refrele(prev_ire);
		return;
	}

	/*
	 * Should we use the old ULP info to create the new gateway? From
	 * a user's perspective, we should inherit the info so that it
	 * is a "smooth" transition. If we do not do that, then new
	 * connections going thru the new gateway will have no route metrics,
	 * which is counter-intuitive to user. From a network point of
	 * view, this may or may not make sense even though the new gateway
	 * is still directly connected to us so the route metrics should not
	 * change much.
	 *
	 * But if the old ire_uinfo is not initialized, we do another
	 * recursive lookup on the dest using the new gateway. There may
	 * be a route to that. If so, use it to initialize the redirect
	 * route.
	 */
	if (prev_ire->ire_uinfo.iulp_set) {
		bcopy(&prev_ire->ire_uinfo, &ulp_info, sizeof (iulp_t));
	} else {
		ire_t *tmp_ire;
		ire_t *sire;

		tmp_ire = ire_ftable_lookup(dst, 0, gateway, 0, NULL, &sire,
		    ALL_ZONES, 0, NULL,
		    (MATCH_IRE_RECURSIVE | MATCH_IRE_GW | MATCH_IRE_DEFAULT));
		if (sire != NULL) {
			bcopy(&sire->ire_uinfo, &ulp_info, sizeof (iulp_t));
			/*
			 * If sire != NULL, ire_ftable_lookup() should not
			 * return a NULL value.
			 */
			ASSERT(tmp_ire != NULL);
			ire_refrele(tmp_ire);
			ire_refrele(sire);
		} else if (tmp_ire != NULL) {
			bcopy(&tmp_ire->ire_uinfo, &ulp_info,
			    sizeof (iulp_t));
			ire_refrele(tmp_ire);
		}
	}
	/* The stale cache entry for the old gateway is no longer valid. */
	if (prev_ire->ire_type == IRE_CACHE)
		ire_delete(prev_ire);
	ire_refrele(prev_ire);
	/*
	 * TODO: more precise handling for cases 0, 2, 3, the latter two
	 * require TOS routing
	 */
	switch (icmph->icmph_code) {
	case 0:
	case 1:
		/* TODO: TOS specificity for cases 2 and 3 */
	case 2:
	case 3:
		break;
	default:
		freemsg(mp);
		BUMP_MIB(&icmp_mib, icmpInBadRedirects);
		ire_refrele(ire);
		return;
	}
	/*
	 * Create a Route Association. This will allow us to remember that
	 * someone we believe told us to use the particular gateway.
	 */
	save_ire = ire;
	ire = ire_create(
	    (uchar_t *)&dst,			/* dest addr */
	    (uchar_t *)&ip_g_all_ones,		/* mask */
	    (uchar_t *)&save_ire->ire_src_addr,	/* source addr */
	    (uchar_t *)&gateway,		/* gateway addr */
	    NULL,				/* no in_srcaddr */
	    &save_ire->ire_max_frag,		/* max frag */
	    NULL,				/* Fast Path header */
	    NULL,				/* no rfq */
	    NULL,				/* no stq */
	    IRE_HOST_REDIRECT,
	    NULL,
	    NULL,
	    NULL,
	    0,
	    0,
	    0,
	    (RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST),
	    &ulp_info,
	    NULL,
	    NULL);

	if (ire == NULL) {
		freemsg(mp);
		ire_refrele(save_ire);
		return;
	}
	error = ire_add(&ire, NULL, NULL, NULL);
	ire_refrele(save_ire);
	if (error == 0) {
		ire_refrele(ire);		/* Held in ire_add_v4 */
		/* tell routing sockets that we received a redirect */
		ip_rts_change(RTM_REDIRECT, dst, gateway, IP_HOST_MASK, 0, src,
		    (RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST), 0,
		    (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_AUTHOR));
	}

	/*
	 * Delete any existing IRE_HOST_REDIRECT for this destination.
	 * This together with the added IRE has the effect of
	 * modifying an existing redirect.
	 */
	prev_ire = ire_ftable_lookup(dst, 0, src, IRE_HOST_REDIRECT, NULL, NULL,
	    ALL_ZONES, 0, NULL, (MATCH_IRE_GW | MATCH_IRE_TYPE));
	if (prev_ire) {
		ire_delete(prev_ire);
		ire_refrele(prev_ire);
	}

	freemsg(mp);
}

/*
 * Generate an ICMP parameter problem message.
 */
static void
icmp_param_problem(queue_t *q, mblk_t *mp, uint8_t ptr)
{
	icmph_t icmph;
	boolean_t mctl_present;
	mblk_t *first_mp;

	EXTRACT_PKT_MP(mp, first_mp, mctl_present);

	/* Rate limit and policy check; frees mp and returns NULL on drop. */
	if (!(mp = icmp_pkt_err_ok(mp))) {
		if (mctl_present)
			freeb(first_mp);
		return;
	}

	bzero(&icmph, sizeof (icmph_t));
	icmph.icmph_type = ICMP_PARAM_PROBLEM;
	icmph.icmph_pp_ptr = ptr;	/* offset of the problem octet */
	BUMP_MIB(&icmp_mib, icmpOutParmProbs);
	icmp_pkt(q, first_mp, &icmph, sizeof (icmph_t), mctl_present);
}

/*
 * Build and ship an IPv4 ICMP message using the packet data in mp, and
 * the ICMP header pointed to by "stuff". (May be called as writer.)
 * Note: assumes that icmp_pkt_err_ok has been called to verify that
 * an icmp error packet can be sent.
 * Assigns an appropriate source address to the packet. If ipha_dst is
 * one of our addresses use it for source. Otherwise pick a source based
 * on a route lookup back to ipha_src.
 * Note that ipha_src must be set here since the
 * packet is likely to arrive on an ill queue in ip_wput() which will
 * not set a source address.
 */
static void
icmp_pkt(queue_t *q, mblk_t *mp, void *stuff, size_t len,
    boolean_t mctl_present)
{
	ipaddr_t dst;
	icmph_t *icmph;
	ipha_t *ipha;
	uint_t len_needed;
	size_t msg_len;
	mblk_t *mp1;
	ipaddr_t src;
	ire_t *ire;
	mblk_t *ipsec_mp;
	ipsec_out_t *io = NULL;
	boolean_t xmit_if_on = B_FALSE;
	zoneid_t zoneid;

	if (mctl_present) {
		/*
		 * If it is :
		 *
		 * 1) a IPSEC_OUT, then this is caused by outbound
		 *    datagram originating on this host. IPSEC processing
		 *    may or may not have been done. Refer to comments above
		 *    icmp_inbound_error_fanout for details.
		 *
		 * 2) a IPSEC_IN if we are generating a icmp_message
		 *    for an incoming datagram destined for us i.e called
		 *    from ip_fanout_send_icmp.
		 */
		ipsec_info_t *in;
		ipsec_mp = mp;
		mp = ipsec_mp->b_cont;

		in = (ipsec_info_t *)ipsec_mp->b_rptr;
		ipha = (ipha_t *)mp->b_rptr;

		ASSERT(in->ipsec_info_type == IPSEC_OUT ||
		    in->ipsec_info_type == IPSEC_IN);

		if (in->ipsec_info_type == IPSEC_IN) {
			/*
			 * Convert the IPSEC_IN to IPSEC_OUT.
			 */
			if (!ipsec_in_to_out(ipsec_mp, ipha, NULL)) {
				BUMP_MIB(&ip_mib, ipOutDiscards);
				return;
			}
			io = (ipsec_out_t *)ipsec_mp->b_rptr;
		} else {
			ASSERT(in->ipsec_info_type == IPSEC_OUT);
			io = (ipsec_out_t *)in;
			if (io->ipsec_out_xmit_if)
				xmit_if_on = B_TRUE;
			/*
			 * Clear out ipsec_out_proc_begin, so we do a fresh
			 * ire lookup.
			 */
			io->ipsec_out_proc_begin = B_FALSE;
		}
		zoneid = io->ipsec_out_zoneid;
		ASSERT(zoneid != ALL_ZONES);
	} else {
		/*
		 * This is in clear. The icmp message we are building
		 * here should go out in clear.
		 *
		 * Pardon the convolution of it all, but it's easier to
		 * allocate a "use cleartext" IPSEC_IN message and convert
		 * it than it is to allocate a new one.
		 */
		ipsec_in_t *ii;
		ASSERT(DB_TYPE(mp) == M_DATA);
		if ((ipsec_mp = ipsec_in_alloc(B_TRUE)) == NULL) {
			freemsg(mp);
			BUMP_MIB(&ip_mib, ipOutDiscards);
			return;
		}
		ii = (ipsec_in_t *)ipsec_mp->b_rptr;

		/* This is not a secure packet */
		ii->ipsec_in_secure = B_FALSE;
		if (CONN_Q(q)) {
			zoneid = Q_TO_CONN(q)->conn_zoneid;
		} else {
			zoneid = GLOBAL_ZONEID;
		}
		ii->ipsec_in_zoneid = zoneid;
		ASSERT(zoneid != ALL_ZONES);
		ipsec_mp->b_cont = mp;
		ipha = (ipha_t *)mp->b_rptr;
		/*
		 * Convert the IPSEC_IN to IPSEC_OUT.
		 */
		if (!ipsec_in_to_out(ipsec_mp, ipha, NULL)) {
			BUMP_MIB(&ip_mib, ipOutDiscards);
			return;
		}
		io = (ipsec_out_t *)ipsec_mp->b_rptr;
	}

	/* Remember our eventual destination */
	dst = ipha->ipha_src;

	/*
	 * Source selection: if the offending packet was addressed to one
	 * of our own addresses (in the right zone), reply from it.
	 */
	ire = ire_route_lookup(ipha->ipha_dst, 0, 0, (IRE_LOCAL|IRE_LOOPBACK),
	    NULL, NULL, zoneid, NULL, MATCH_IRE_TYPE);
	if (ire != NULL &&
	    (ire->ire_zoneid == zoneid || ire->ire_zoneid == ALL_ZONES)) {
		src = ipha->ipha_dst;
	} else if (!xmit_if_on) {
		/* Otherwise pick a source from the route back to the sender. */
		if (ire != NULL)
			ire_refrele(ire);
		ire = ire_route_lookup(dst, 0, 0, 0, NULL, NULL, zoneid, NULL,
		    (MATCH_IRE_DEFAULT|MATCH_IRE_RECURSIVE|MATCH_IRE_ZONEONLY));
		if (ire == NULL) {
			BUMP_MIB(&ip_mib, ipOutNoRoutes);
			freemsg(ipsec_mp);
			return;
		}
		src = ire->ire_src_addr;
	} else {
		ipif_t *ipif = NULL;
		ill_t *ill;
		/*
		 * This must be an ICMP error coming from
		 * ip_mrtun_forward(). The src addr should
		 * be equal to the IP-addr of the outgoing
		 * interface.
		 */
		if (io == NULL) {
			/* This is not a IPSEC_OUT type control msg */
			BUMP_MIB(&ip_mib, ipOutNoRoutes);
			freemsg(ipsec_mp);
			return;
		}
		ill = ill_lookup_on_ifindex(io->ipsec_out_ill_index, B_FALSE,
		    NULL, NULL, NULL, NULL);
		if (ill != NULL) {
			ipif = ipif_get_next_ipif(NULL, ill);
			ill_refrele(ill);
		}
		if (ipif == NULL) {
			BUMP_MIB(&ip_mib, ipOutNoRoutes);
			freemsg(ipsec_mp);
			return;
		}
		src = ipif->ipif_src_addr;
		ipif_refrele(ipif);
	}

	if (ire != NULL)
		ire_refrele(ire);

	/*
	 * Check if we can send back more then 8 bytes in addition
	 * to the IP header. We will include as much as 64 bytes.
	 */
	len_needed = IPH_HDR_LENGTH(ipha);
	if (ipha->ipha_protocol == IPPROTO_ENCAP &&
	    (uchar_t *)ipha + len_needed + 1 <= mp->b_wptr) {
		/* Include the inner (encapsulated) IP header too. */
		len_needed += IPH_HDR_LENGTH(((uchar_t *)ipha + len_needed));
	}
	len_needed += ip_icmp_return;
	msg_len = msgdsize(mp);
	if (msg_len > len_needed) {
		/* Trim the quoted packet down to len_needed bytes. */
		(void) adjmsg(mp, len_needed - msg_len);
		msg_len = len_needed;
	}
	/* New mblk for the outer IP header plus the ICMP header. */
	mp1 = allocb(sizeof (icmp_ipha) + len, BPRI_HI);
	if (mp1 == NULL) {
		BUMP_MIB(&icmp_mib, icmpOutErrors);
		freemsg(ipsec_mp);
		return;
	}
	/*
	 * On an unlabeled system, dblks don't necessarily have creds.
	 */
	ASSERT(!is_system_labeled() || DB_CRED(mp) != NULL);
	if (DB_CRED(mp) != NULL)
		mblk_setcred(mp1, DB_CRED(mp));
	mp1->b_cont = mp;
	mp = mp1;
	ASSERT(ipsec_mp->b_datap->db_type == M_CTL &&
	    ipsec_mp->b_rptr == (uint8_t *)io &&
	    io->ipsec_out_type == IPSEC_OUT);
	ipsec_mp->b_cont = mp;

	/*
	 * Set ipsec_out_icmp_loopback so we can let the ICMP messages this
	 * node generates be accepted in peace by all on-host destinations.
	 * If we do NOT assume that all on-host destinations trust
	 * self-generated ICMP messages, then rework here, ip6.c, and spd.c.
	 * (Look for ipsec_out_icmp_loopback).
	 */
	io->ipsec_out_icmp_loopback = B_TRUE;

	ipha = (ipha_t *)mp->b_rptr;
	mp1->b_wptr = (uchar_t *)ipha + (sizeof (icmp_ipha) + len);
	*ipha = icmp_ipha;	/* template header; fill in the specifics */
	ipha->ipha_src = src;
	ipha->ipha_dst = dst;
	ipha->ipha_ttl = ip_def_ttl;
	msg_len += sizeof (icmp_ipha) + len;
	if (msg_len > IP_MAXPACKET) {
		(void) adjmsg(mp, IP_MAXPACKET - msg_len);
		msg_len = IP_MAXPACKET;
	}
	ipha->ipha_length = htons((uint16_t)msg_len);
	icmph = (icmph_t *)&ipha[1];
	bcopy(stuff, icmph, len);
	icmph->icmph_checksum = 0;
	icmph->icmph_checksum = IP_CSUM(mp, (int32_t)sizeof (ipha_t), 0);
	/* A computed checksum of 0 is transmitted as all-ones. */
	if (icmph->icmph_checksum == 0)
		icmph->icmph_checksum = 0xFFFF;
	BUMP_MIB(&icmp_mib, icmpOutMsgs);
	put(q, ipsec_mp);
}

/*
 * Determine if an ICMP error packet can be sent given the rate limit.
 * The limit consists of an average frequency (icmp_pkt_err_interval measured
 * in milliseconds) and a burst size. Burst size number of packets can
 * be sent arbitrarily closely spaced.
 * The state is tracked using two variables to implement an approximate
 * token bucket filter:
 *	icmp_pkt_err_last - lbolt value when the last burst started
 *	icmp_pkt_err_sent - number of packets sent in current burst
 */
boolean_t
icmp_err_rate_limit(void)
{
	clock_t now = TICK_TO_MSEC(lbolt);
	uint_t refilled; /* Number of packets refilled in tbf since last */
	uint_t err_interval = ip_icmp_err_interval; /* Guard against changes */

	/* An interval of zero disables rate limiting entirely. */
	if (err_interval == 0)
		return (B_FALSE);

	if (icmp_pkt_err_last > now) {
		/* 100HZ lbolt in ms for 32bit arch wraps every 49.7 days */
		icmp_pkt_err_last = 0;
		icmp_pkt_err_sent = 0;
	}
	/*
	 * If we are in a burst update the token bucket filter.
	 * Update the "last" time to be close to "now" but make sure
	 * we don't lose precision.
	 */
	if (icmp_pkt_err_sent != 0) {
		refilled = (now - icmp_pkt_err_last)/err_interval;
		if (refilled > icmp_pkt_err_sent) {
			icmp_pkt_err_sent = 0;
		} else {
			icmp_pkt_err_sent -= refilled;
			icmp_pkt_err_last += refilled * err_interval;
		}
	}
	if (icmp_pkt_err_sent == 0) {
		/* Start of new burst */
		icmp_pkt_err_last = now;
	}
	if (icmp_pkt_err_sent < ip_icmp_err_burst) {
		icmp_pkt_err_sent++;
		ip1dbg(("icmp_err_rate_limit: %d sent in burst\n",
		    icmp_pkt_err_sent));
		return (B_FALSE);
	}
	ip1dbg(("icmp_err_rate_limit: dropped\n"));
	return (B_TRUE);
}

/*
 * Check if it is ok to send an IPv4 ICMP error packet in
 * response to the IPv4 packet in mp.
 * Free the message and return null if no
 * ICMP error packet should be sent.
 */
static mblk_t *
icmp_pkt_err_ok(mblk_t *mp)
{
	icmph_t *icmph;
	ipha_t *ipha;
	uint_t len_needed;
	ire_t *src_ire;
	ire_t *dst_ire;

	if (!mp)
		return (NULL);
	ipha = (ipha_t *)mp->b_rptr;
	if (ip_csum_hdr(ipha)) {
		BUMP_MIB(&ip_mib, ipInCksumErrs);
		freemsg(mp);
		return (NULL);
	}
	/* Never send errors in response to broadcasts or multicasts. */
	src_ire = ire_ctable_lookup(ipha->ipha_dst, 0, IRE_BROADCAST,
	    NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE);
	dst_ire = ire_ctable_lookup(ipha->ipha_src, 0, IRE_BROADCAST,
	    NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE);
	if (src_ire != NULL || dst_ire != NULL ||
	    CLASSD(ipha->ipha_dst) ||
	    CLASSD(ipha->ipha_src) ||
	    (ntohs(ipha->ipha_fragment_offset_and_flags) & IPH_OFFSET)) {
		/* Note: only errors to the fragment with offset 0 */
		BUMP_MIB(&icmp_mib, icmpOutDrops);
		freemsg(mp);
		if (src_ire != NULL)
			ire_refrele(src_ire);
		if (dst_ire != NULL)
			ire_refrele(dst_ire);
		return (NULL);
	}
	if (ipha->ipha_protocol == IPPROTO_ICMP) {
		/*
		 * Check the ICMP type.  RFC 1122 sez: don't send ICMP
		 * errors in response to any ICMP errors.
		 */
		len_needed = IPH_HDR_LENGTH(ipha) + ICMPH_SIZE;
		if (mp->b_wptr - mp->b_rptr < len_needed) {
			if (!pullupmsg(mp, len_needed)) {
				BUMP_MIB(&icmp_mib, icmpInErrors);
				freemsg(mp);
				return (NULL);
			}
			ipha = (ipha_t *)mp->b_rptr;
		}
		icmph = (icmph_t *)
		    (&((char *)ipha)[IPH_HDR_LENGTH(ipha)]);
		switch (icmph->icmph_type) {
		case ICMP_DEST_UNREACHABLE:
		case ICMP_SOURCE_QUENCH:
		case ICMP_TIME_EXCEEDED:
		case ICMP_PARAM_PROBLEM:
		case ICMP_REDIRECT:
			BUMP_MIB(&icmp_mib, icmpOutDrops);
			freemsg(mp);
			return (NULL);
		default:
			break;
		}
	}
	/*
	 * If this is a labeled system, then check to see if we're allowed to
	 * send a response to this particular sender. If not, then just drop.
	 */
	if (is_system_labeled() && !tsol_can_reply_error(mp)) {
		ip2dbg(("icmp_pkt_err_ok: can't respond to packet\n"));
		BUMP_MIB(&icmp_mib, icmpOutDrops);
		freemsg(mp);
		return (NULL);
	}
	if (icmp_err_rate_limit()) {
		/*
		 * Only send ICMP error packets every so often.
		 * This should be done on a per port/source basis,
		 * but for now this will suffice.
		 */
		freemsg(mp);
		return (NULL);
	}
	return (mp);
}

/*
 * Generate an ICMP redirect message.
 */
static void
icmp_send_redirect(queue_t *q, mblk_t *mp, ipaddr_t gateway)
{
	icmph_t icmph;

	/*
	 * We are called from ip_rput where we could
	 * not have attached an IPSEC_IN.
	 */
	ASSERT(mp->b_datap->db_type == M_DATA);

	if (!(mp = icmp_pkt_err_ok(mp))) {
		return;
	}

	bzero(&icmph, sizeof (icmph_t));
	icmph.icmph_type = ICMP_REDIRECT;
	icmph.icmph_code = 1;	/* redirect for host */
	icmph.icmph_rd_gateway = gateway;
	BUMP_MIB(&icmp_mib, icmpOutRedirects);
	icmp_pkt(q, mp, &icmph, sizeof (icmph_t), B_FALSE);
}

/*
 * Generate an ICMP time exceeded message.
 */
void
icmp_time_exceeded(queue_t *q, mblk_t *mp, uint8_t code)
{
	icmph_t icmph;
	boolean_t mctl_present;
	mblk_t *first_mp;

	EXTRACT_PKT_MP(mp, first_mp, mctl_present);

	if (!(mp = icmp_pkt_err_ok(mp))) {
		if (mctl_present)
			freeb(first_mp);
		return;
	}

	bzero(&icmph, sizeof (icmph_t));
	icmph.icmph_type = ICMP_TIME_EXCEEDED;
	icmph.icmph_code = code;
	BUMP_MIB(&icmp_mib, icmpOutTimeExcds);
	icmp_pkt(q, first_mp, &icmph, sizeof (icmph_t), mctl_present);
}

/*
 * Generate an ICMP unreachable message.
 */
void
icmp_unreachable(queue_t *q, mblk_t *mp, uint8_t code)
{
	icmph_t icmph;
	mblk_t *first_mp;
	boolean_t mctl_present;

	EXTRACT_PKT_MP(mp, first_mp, mctl_present);

	if (!(mp = icmp_pkt_err_ok(mp))) {
		if (mctl_present)
			freeb(first_mp);
		return;
	}

	bzero(&icmph, sizeof (icmph_t));
	icmph.icmph_type = ICMP_DEST_UNREACHABLE;
	icmph.icmph_code = code;
	BUMP_MIB(&icmp_mib, icmpOutDestUnreachs);
	ip2dbg(("send icmp destination unreachable code %d\n", code));
	icmp_pkt(q, first_mp, (char *)&icmph, sizeof (icmph_t), mctl_present);
}

/*
 * News from ARP. ARP sends notification of interesting events down
 * to its clients using M_CTL messages with the interesting ARP packet
 * attached via b_cont.
 * The interesting event from a device comes up the corresponding ARP-IP-DEV
 * queue as opposed to ARP sending the message to all the clients, i.e. all
 * its ARP-IP-DEV instances. Thus, for AR_CN_ANNOUNCE, we must walk the cache
 * table if a cache IRE is found to delete all the entries for the address in
 * the packet.
 *
 * The message is consumed in all cases (freed or passed along via putnext).
 */
static void
ip_arp_news(queue_t *q, mblk_t *mp)
{
	arcn_t *arcn;
	arh_t *arh;
	char *cp1;		/* walks the formatted hw-address string */
	uchar_t *cp2;		/* walks the raw hardware address bytes */
	ire_t *ire = NULL;
	int i1;
	char hbuf[128];		/* formatted hardware address */
	char sbuf[16];		/* dotted-decimal protocol address */
	ipaddr_t src;
	in6_addr_t v6src;
	boolean_t isv6 = B_FALSE;

	/* Too short to be an arcn_t, or no attached ARP packet: not ours. */
	if ((mp->b_wptr - mp->b_rptr) < sizeof (arcn_t) || !mp->b_cont) {
		if (q->q_next) {
			putnext(q, mp);
		} else
			freemsg(mp);
		return;
	}
	arh = (arh_t *)mp->b_cont->b_rptr;
	/* Is it one we are interested in? */
	if (BE16_TO_U16(arh->arh_proto) == IP6_DL_SAP) {
		isv6 = B_TRUE;
		/* Sender protocol address follows the sender hw address. */
		bcopy((char *)&arh[1] + (arh->arh_hlen & 0xFF), &v6src,
		    IPV6_ADDR_LEN);
	} else if (BE16_TO_U16(arh->arh_proto) == IP_ARP_PROTO_TYPE) {
		bcopy((char *)&arh[1] + (arh->arh_hlen & 0xFF), &src,
		    IP_ADDR_LEN);
	} else {
		freemsg(mp);
		return;
	}

	arcn = (arcn_t *)mp->b_rptr;
	switch (arcn->arcn_code) {
	case AR_CN_BOGON:
		/*
		 * Someone is sending ARP packets with a source protocol
		 * address which we have published. Either they are
		 * pretending to be us, or we have been asked to proxy
		 * for a machine that can do fine for itself, or two
		 * different machines are providing proxy service for the
		 * same protocol address, or something. We try and do
		 * something appropriate here.
		 */
		/* Format the sender hardware address as aa:bb:cc... */
		cp2 = (uchar_t *)&arh[1];
		cp1 = hbuf;
		*cp1 = '\0';
		for (i1 = arh->arh_hlen; i1--; cp1 += 3)
			(void) sprintf(cp1, "%02x:", *cp2++ & 0xff);
		if (cp1 != hbuf)
			cp1[-1] = '\0';	/* drop the trailing ':' */
		/*
		 * NOTE(review): when isv6 is B_TRUE, 'src' was never
		 * initialized above, so sbuf is formatted from garbage
		 * in the v6 case — confirm whether AR_CN_BOGON can
		 * arrive with an IP6_DL_SAP protocol.
		 */
		(void) ip_dot_addr(src, sbuf);
		if (isv6)
			ire = ire_cache_lookup_v6(&v6src, ALL_ZONES, NULL);
		else
			ire = ire_cache_lookup(src, ALL_ZONES, NULL);

		if (ire != NULL && IRE_IS_LOCAL(ire)) {
			cmn_err(CE_WARN,
			    "IP: Hardware address '%s' trying"
			    " to be our address %s!",
			    hbuf, sbuf);
		} else {
			cmn_err(CE_WARN,
			    "IP: Proxy ARP problem?  "
			    "Hardware address '%s' thinks it is %s",
			    hbuf, sbuf);
		}
		if (ire != NULL)
			ire_refrele(ire);
		break;
	case AR_CN_ANNOUNCE:
		if (isv6) {
			/*
			 * For XRESOLV interfaces.
			 * Delete the IRE cache entry and NCE for this
			 * v6 address
			 */
			ip_ire_clookup_and_delete_v6(&v6src);
			/*
			 * If v6src is a non-zero, it's a router address
			 * as below. Do the same sort of thing to clean
			 * out off-net IRE_CACHE entries that go through
			 * the router.
			 */
			if (!IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
				ire_walk_v6(ire_delete_cache_gw_v6,
				    (char *)&v6src, ALL_ZONES);
			}
			break;
		}
		/*
		 * ARP gives us a copy of any broadcast packet with identical
		 * sender and receiver protocol address, in
		 * case we want to intuit something from it.  Such a packet
		 * usually means that a machine has just come up on the net.
		 * If we have an IRE_CACHE, we blow it away.  This way we will
		 * immediately pick up the rare case of a host changing
		 * hardware address. ip_ire_clookup_and_delete achieves this.
		 *
		 * The address in "src" may be an entry for a router.
		 * (Default router, or non-default router.)  If
		 * that's true, then any off-net IRE_CACHE entries
		 * that go through the router with address "src"
		 * must be clobbered.  Use ire_walk to achieve this
		 * goal.
		 *
		 * It should be possible to determine if the address
		 * in src is or is not for a router.  This way,
		 * the ire_walk() isn't called all of the time here.
		 * Do not pass 'src' value of 0 to ire_delete_cache_gw,
		 * as it would remove all IRE_CACHE entries for onlink
		 * destinations. All onlink destinations have
		 * ire_gateway_addr == 0.
		 */
		if ((ip_ire_clookup_and_delete(src, NULL) ||
		    (ire = ire_ftable_lookup(src, 0, 0, 0, NULL, NULL, NULL,
		    0, NULL, MATCH_IRE_DSTONLY)) != NULL) && src != 0) {
			ire_walk_v4(ire_delete_cache_gw, (char *)&src,
			    ALL_ZONES);
		}
		/* From ire_ftable_lookup */
		if (ire != NULL)
			ire_refrele(ire);
		break;
	default:
		if (ire != NULL)
			ire_refrele(ire);
		break;
	}
	freemsg(mp);
}

/*
 * Create a mblk suitable for carrying the interface index and/or source link
 * address. This mblk is tagged as an M_CTL and is sent to ULP. This is used
 * when the IP_RECVIF and/or IP_RECVSLLA socket option is set by the user
 * application.
3765 */ 3766 mblk_t * 3767 ip_add_info(mblk_t *data_mp, ill_t *ill, uint_t flags) 3768 { 3769 mblk_t *mp; 3770 in_pktinfo_t *pinfo; 3771 ipha_t *ipha; 3772 struct ether_header *pether; 3773 3774 mp = allocb(sizeof (in_pktinfo_t), BPRI_MED); 3775 if (mp == NULL) { 3776 ip1dbg(("ip_add_info: allocation failure.\n")); 3777 return (data_mp); 3778 } 3779 3780 ipha = (ipha_t *)data_mp->b_rptr; 3781 pinfo = (in_pktinfo_t *)mp->b_rptr; 3782 bzero(pinfo, sizeof (in_pktinfo_t)); 3783 pinfo->in_pkt_flags = (uchar_t)flags; 3784 pinfo->in_pkt_ulp_type = IN_PKTINFO; /* Tell ULP what type of info */ 3785 3786 if (flags & IPF_RECVIF) 3787 pinfo->in_pkt_ifindex = ill->ill_phyint->phyint_ifindex; 3788 3789 pether = (struct ether_header *)((char *)ipha 3790 - sizeof (struct ether_header)); 3791 /* 3792 * Make sure the interface is an ethernet type, since this option 3793 * is currently supported only on this type of interface. Also make 3794 * sure we are pointing correctly above db_base. 3795 */ 3796 3797 if ((flags & IPF_RECVSLLA) && 3798 ((uchar_t *)pether >= data_mp->b_datap->db_base) && 3799 (ill->ill_type == IFT_ETHER) && 3800 (ill->ill_net_type == IRE_IF_RESOLVER)) { 3801 3802 pinfo->in_pkt_slla.sdl_type = IFT_ETHER; 3803 bcopy((uchar_t *)pether->ether_shost.ether_addr_octet, 3804 (uchar_t *)pinfo->in_pkt_slla.sdl_data, ETHERADDRL); 3805 } else { 3806 /* 3807 * Clear the bit. Indicate to upper layer that IP is not 3808 * sending this ancillary info. 3809 */ 3810 pinfo->in_pkt_flags = pinfo->in_pkt_flags & ~IPF_RECVSLLA; 3811 } 3812 3813 mp->b_datap->db_type = M_CTL; 3814 mp->b_wptr += sizeof (in_pktinfo_t); 3815 mp->b_cont = data_mp; 3816 3817 return (mp); 3818 } 3819 3820 /* 3821 * Latch in the IPsec state for a stream based on the ipsec_in_t passed in as 3822 * part of the bind request. 
3823 */ 3824 3825 boolean_t 3826 ip_bind_ipsec_policy_set(conn_t *connp, mblk_t *policy_mp) 3827 { 3828 ipsec_in_t *ii; 3829 3830 ASSERT(policy_mp != NULL); 3831 ASSERT(policy_mp->b_datap->db_type == IPSEC_POLICY_SET); 3832 3833 ii = (ipsec_in_t *)policy_mp->b_rptr; 3834 ASSERT(ii->ipsec_in_type == IPSEC_IN); 3835 3836 connp->conn_policy = ii->ipsec_in_policy; 3837 ii->ipsec_in_policy = NULL; 3838 3839 if (ii->ipsec_in_action != NULL) { 3840 if (connp->conn_latch == NULL) { 3841 connp->conn_latch = iplatch_create(); 3842 if (connp->conn_latch == NULL) 3843 return (B_FALSE); 3844 } 3845 ipsec_latch_inbound(connp->conn_latch, ii); 3846 } 3847 return (B_TRUE); 3848 } 3849 3850 /* 3851 * Upper level protocols (ULP) pass through bind requests to IP for inspection 3852 * and to arrange for power-fanout assist. The ULP is identified by 3853 * adding a single byte at the end of the original bind message. 3854 * A ULP other than UDP or TCP that wishes to be recognized passes 3855 * down a bind with a zero length address. 3856 * 3857 * The binding works as follows: 3858 * - A zero byte address means just bind to the protocol. 3859 * - A four byte address is treated as a request to validate 3860 * that the address is a valid local address, appropriate for 3861 * an application to bind to. This does not affect any fanout 3862 * information in IP. 3863 * - A sizeof sin_t byte address is used to bind to only the local address 3864 * and port. 3865 * - A sizeof ipa_conn_t byte address contains complete fanout information 3866 * consisting of local and remote addresses and ports. In 3867 * this case, the addresses are both validated as appropriate 3868 * for this operation, and, if so, the information is retained 3869 * for use in the inbound fanout. 3870 * 3871 * The ULP (except in the zero-length bind) can append an 3872 * additional mblk of db_type IRE_DB_REQ_TYPE or IPSEC_POLICY_SET to the 3873 * T_BIND_REQ/O_T_BIND_REQ. 
IRE_DB_REQ_TYPE indicates that the ULP wants
 * a copy of the source or destination IRE (source for local bind;
 * destination for complete bind). IPSEC_POLICY_SET indicates that the
 * policy information contained should be copied on to the conn.
 *
 * NOTE : Only one of IRE_DB_REQ_TYPE or IPSEC_POLICY_SET can be present.
 *
 * Returns the (possibly transformed) mp to send upstream as the
 * T_BIND_ACK or error ack, or NULL when processing was deferred
 * (EINPROGRESS) and the reply will be generated later.
 */
mblk_t *
ip_bind_v4(queue_t *q, mblk_t *mp, conn_t *connp)
{
	ssize_t		len;
	struct T_bind_req	*tbr;
	sin_t		*sin;
	ipa_conn_t	*ac;
	uchar_t		*ucp;
	mblk_t		*mp1;
	boolean_t	ire_requested;
	boolean_t	ipsec_policy_set = B_FALSE;
	int		error = 0;
	int		protocol;
	ipa_conn_x_t	*acx;

	ASSERT(!connp->conn_af_isv6);
	connp->conn_pkt_isv6 = B_FALSE;

	/* Must hold at least the T_bind_req plus the trailing protocol byte. */
	len = MBLKL(mp);
	if (len < (sizeof (*tbr) + 1)) {
		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
		    "ip_bind: bogus msg, len %ld", len);
		/* XXX: Need to return something better */
		goto bad_addr;
	}
	/* Back up and extract the protocol identifier. */
	mp->b_wptr--;
	protocol = *mp->b_wptr & 0xFF;
	tbr = (struct T_bind_req *)mp->b_rptr;
	/* Reset the message type in preparation for shipping it back. */
	DB_TYPE(mp) = M_PCPROTO;

	connp->conn_ulp = (uint8_t)protocol;

	/*
	 * Check for a zero length address.  This is from a protocol that
	 * wants to register to receive all packets of its type.
	 */
	if (tbr->ADDR_length == 0) {
		/*
		 * These protocols are now intercepted in ip_bind_v6().
		 * Reject protocol-level binds here for now.
		 *
		 * For SCTP raw socket, ICMP sends down a bind with sin_t
		 * so that the protocol type cannot be SCTP.
		 */
		if (protocol == IPPROTO_TCP || protocol == IPPROTO_AH ||
		    protocol == IPPROTO_ESP || protocol == IPPROTO_SCTP) {
			goto bad_addr;
		}

		/*
		 *
		 * The udp module never sends down a zero-length address,
		 * and allowing this on a labeled system will break MLP
		 * functionality.
		 */
		if (is_system_labeled() && protocol == IPPROTO_UDP)
			goto bad_addr;

		if (connp->conn_mac_exempt)
			goto bad_addr;

		/* No hash here really.  The table is big enough. */
		connp->conn_srcv6 = ipv6_all_zeros;

		ipcl_proto_insert(connp, protocol);

		tbr->PRIM_type = T_BIND_ACK;
		return (mp);
	}

	/* Extract the address pointer from the message. */
	ucp = (uchar_t *)mi_offset_param(mp, tbr->ADDR_offset,
	    tbr->ADDR_length);
	if (ucp == NULL) {
		ip1dbg(("ip_bind: no address\n"));
		goto bad_addr;
	}
	if (!OK_32PTR(ucp)) {
		ip1dbg(("ip_bind: unaligned address\n"));
		goto bad_addr;
	}
	/*
	 * Check for trailing mps.
	 */

	mp1 = mp->b_cont;
	ire_requested = (mp1 != NULL && DB_TYPE(mp1) == IRE_DB_REQ_TYPE);
	ipsec_policy_set = (mp1 != NULL && DB_TYPE(mp1) == IPSEC_POLICY_SET);

	/* Dispatch on address length: see the block comment above. */
	switch (tbr->ADDR_length) {
	default:
		ip1dbg(("ip_bind: bad address length %d\n",
		    (int)tbr->ADDR_length));
		goto bad_addr;

	case IP_ADDR_LEN:
		/* Verification of local address only */
		error = ip_bind_laddr(connp, mp, *(ipaddr_t *)ucp, 0,
		    ire_requested, ipsec_policy_set, B_FALSE);
		break;

	case sizeof (sin_t):
		/* Local address and port: insert into the fanout. */
		sin = (sin_t *)ucp;
		error = ip_bind_laddr(connp, mp, sin->sin_addr.s_addr,
		    sin->sin_port, ire_requested, ipsec_policy_set, B_TRUE);
		if (protocol == IPPROTO_TCP)
			connp->conn_recv = tcp_conn_request;
		break;

	case sizeof (ipa_conn_t):
		ac = (ipa_conn_t *)ucp;
		/* For raw socket, the local port is not set. */
		if (ac->ac_lport == 0)
			ac->ac_lport = connp->conn_lport;
		/* Always verify destination reachability. */
		error = ip_bind_connected(connp, mp, &ac->ac_laddr,
		    ac->ac_lport, ac->ac_faddr, ac->ac_fport, ire_requested,
		    ipsec_policy_set, B_TRUE, B_TRUE);
		if (protocol == IPPROTO_TCP)
			connp->conn_recv = tcp_input;
		break;

	case sizeof (ipa_conn_x_t):
		acx = (ipa_conn_x_t *)ucp;
		/*
		 * Whether or not to verify destination reachability depends
		 * on the setting of the ACX_VERIFY_DST flag in acx->acx_flags.
		 */
		error = ip_bind_connected(connp, mp, &acx->acx_conn.ac_laddr,
		    acx->acx_conn.ac_lport, acx->acx_conn.ac_faddr,
		    acx->acx_conn.ac_fport, ire_requested, ipsec_policy_set,
		    B_TRUE, (acx->acx_flags & ACX_VERIFY_DST) != 0);
		if (protocol == IPPROTO_TCP)
			connp->conn_recv = tcp_input;
		break;
	}
	if (error == EINPROGRESS)
		return (NULL);
	else if (error != 0)
		goto bad_addr;
	/*
	 * Pass the IPSEC headers size in ire_ipsec_overhead.
	 * We can't do this in ip_bind_insert_ire because the policy
	 * may not have been inherited at that point in time and hence
	 * conn_out_enforce_policy may not be set.
	 */
	mp1 = mp->b_cont;
	if (ire_requested && connp->conn_out_enforce_policy &&
	    mp1 != NULL && DB_TYPE(mp1) == IRE_DB_REQ_TYPE) {
		ire_t *ire = (ire_t *)mp1->b_rptr;
		ASSERT(MBLKL(mp1) >= sizeof (ire_t));
		ire->ire_ipsec_overhead = conn_ipsec_length(connp);
	}

	/* Send it home. */
	mp->b_datap->db_type = M_PCPROTO;
	tbr->PRIM_type = T_BIND_ACK;
	return (mp);

bad_addr:
	/*
	 * If error = -1 then we generate a TBADADDR - otherwise error is
	 * a unix errno.
	 */
	if (error > 0)
		mp = mi_tpi_err_ack_alloc(mp, TSYSERR, error);
	else
		mp = mi_tpi_err_ack_alloc(mp, TBADADDR, 0);
	return (mp);
}

/*
 * Here address is verified to be a valid local address.
 * If the IRE_DB_REQ_TYPE mp is present, a broadcast/multicast
 * address is also considered a valid local address.
 * In the case of a broadcast/multicast address, however, the
 * upper protocol is expected to reset the src address
 * to 0 if it sees a IRE_BROADCAST type returned so that
 * no packets are emitted with broadcast/multicast address as
 * source address (that violates hosts requirements RFC1122)
 * The addresses valid for bind are:
 *	(1) - INADDR_ANY (0)
 *	(2) - IP address of an UP interface
 *	(3) - IP address of a DOWN interface
 *	(4) - valid local IP broadcast addresses. In this case
 *	the conn will only receive packets destined to
 *	the specified broadcast address.
 *	(5) - a multicast address. In this case
 *	the conn will only receive packets destined to
 *	the specified multicast address. Note: the
 *	application still has to issue an
 *	IP_ADD_MEMBERSHIP socket option.
 *
 * On error, return -1 for TBADADDR otherwise pass the
 * errno with TSYSERR reply.
 *
 * In all the above cases, the bound address must be valid in the current zone.
 * When the address is loopback, multicast or broadcast, there might be many
 * matching IREs so bind has to look up based on the zone.
 *
 * Note: lport is in network byte order.
 */
/*
 * Returns 0 on success, EINPROGRESS when ipif_lookup_addr defers
 * processing, a positive errno, or -1 (caller maps it to TBADADDR).
 */
int
ip_bind_laddr(conn_t *connp, mblk_t *mp, ipaddr_t src_addr, uint16_t lport,
    boolean_t ire_requested, boolean_t ipsec_policy_set,
    boolean_t fanout_insert)
{
	int		error = 0;
	ire_t		*src_ire;
	mblk_t		*policy_mp;
	ipif_t		*ipif;
	zoneid_t	zoneid;

	/* policy_mp is only read below when ipsec_policy_set is B_TRUE. */
	if (ipsec_policy_set) {
		policy_mp = mp->b_cont;
	}

	/*
	 * If it was previously connected, conn_fully_bound would have
	 * been set.
	 */
	connp->conn_fully_bound = B_FALSE;

	src_ire = NULL;
	ipif = NULL;

	zoneid = connp->conn_zoneid;

	if (src_addr) {
		src_ire = ire_route_lookup(src_addr, 0, 0, 0,
		    NULL, NULL, zoneid, NULL, MATCH_IRE_ZONEONLY);
		/*
		 * If an address other than 0.0.0.0 is requested,
		 * we verify that it is a valid address for bind
		 * Note: Following code is in if-else-if form for
		 * readability compared to a condition check.
		 */
		/* LINTED - statement has no consequent */
		if (IRE_IS_LOCAL(src_ire)) {
			/*
			 * (2) Bind to address of local UP interface
			 */
		} else if (src_ire && src_ire->ire_type == IRE_BROADCAST) {
			/*
			 * (4) Bind to broadcast address
			 * Note: permitted only from transports that
			 * request IRE
			 */
			if (!ire_requested)
				error = EADDRNOTAVAIL;
		} else {
			/*
			 * (3) Bind to address of local DOWN interface
			 * (ipif_lookup_addr() looks up all interfaces
			 * but we do not get here for UP interfaces
			 * - case (2) above)
			 * We put the protocol byte back into the mblk
			 * since we may come back via ip_wput_nondata()
			 * later with this mblk if ipif_lookup_addr chooses
			 * to defer processing.
			 */
			*mp->b_wptr++ = (char)connp->conn_ulp;
			if ((ipif = ipif_lookup_addr(src_addr, NULL, zoneid,
			    CONNP_TO_WQ(connp), mp, ip_wput_nondata,
			    &error)) != NULL) {
				ipif_refrele(ipif);
			} else if (error == EINPROGRESS) {
				/* Deferred; ip_wput_nondata will resume. */
				if (src_ire != NULL)
					ire_refrele(src_ire);
				return (EINPROGRESS);
			} else if (CLASSD(src_addr)) {
				error = 0;
				if (src_ire != NULL)
					ire_refrele(src_ire);
				/*
				 * (5) bind to multicast address.
				 * Fake out the IRE returned to upper
				 * layer to be a broadcast IRE.
				 */
				src_ire = ire_ctable_lookup(
				    INADDR_BROADCAST, INADDR_ANY,
				    IRE_BROADCAST, NULL, zoneid, NULL,
				    (MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY));
				if (src_ire == NULL || !ire_requested)
					error = EADDRNOTAVAIL;
			} else {
				/*
				 * Not a valid address for bind
				 */
				error = EADDRNOTAVAIL;
			}
			/*
			 * Just to keep it consistent with the processing in
			 * ip_bind_v4()
			 */
			mp->b_wptr--;
		}
		if (error) {
			/* Red Alert!  Attempting to be a bogon! */
			ip1dbg(("ip_bind: bad src address 0x%x\n",
			    ntohl(src_addr)));
			goto bad_addr;
		}
	}

	/*
	 * Allow setting new policies. For example, disconnects come
	 * down as ipa_t bind. As we would have set conn_policy_cached
	 * to B_TRUE before, we should set it to B_FALSE, so that policy
	 * can change after the disconnect.
	 */
	connp->conn_policy_cached = B_FALSE;

	/*
	 * If not fanout_insert this was just an address verification
	 */
	if (fanout_insert) {
		/*
		 * The addresses have been verified. Time to insert in
		 * the correct fanout list.
		 */
		IN6_IPADDR_TO_V4MAPPED(src_addr, &connp->conn_srcv6);
		IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &connp->conn_remv6);
		connp->conn_lport = lport;
		connp->conn_fport = 0;
		/*
		 * Do we need to add a check to reject Multicast packets
		 */
		error = ipcl_bind_insert(connp, *mp->b_wptr, src_addr, lport);
	}

	if (error == 0) {
		if (ire_requested) {
			if (!ip_bind_insert_ire(mp, src_ire, NULL)) {
				error = -1;
				/* Falls through to bad_addr */
			}
		} else if (ipsec_policy_set) {
			if (!ip_bind_ipsec_policy_set(connp, policy_mp)) {
				error = -1;
				/* Falls through to bad_addr */
			}
		}
	}
bad_addr:
	if (error != 0) {
		/* Release any MLP anonymous-port reservation on failure. */
		if (connp->conn_anon_port) {
			(void) tsol_mlp_anon(crgetzone(connp->conn_cred),
			    connp->conn_mlp_type, connp->conn_ulp, ntohs(lport),
			    B_FALSE);
		}
		connp->conn_mlp_type = mlptSingle;
	}
	if (src_ire != NULL)
		IRE_REFRELE(src_ire);
	if (ipsec_policy_set) {
		ASSERT(policy_mp == mp->b_cont);
		ASSERT(policy_mp != NULL);
		freeb(policy_mp);
		/*
		 * As of now assume that nothing else accompanies
		 * IPSEC_POLICY_SET.
		 */
		mp->b_cont = NULL;
	}
	return (error);
}

/*
 * Verify that both the source and destination addresses
 * are valid.  If verify_dst is false, then the destination address may be
 * unreachable, i.e. have no route to it.  Protocols like TCP want to verify
 * destination reachability, while tunnels do not.
 * Note that we allow connect to broadcast and multicast
 * addresses when ire_requested is set. Thus the ULP
 * has to check for IRE_BROADCAST and multicast.
 *
 * Returns zero if ok.
 * On error: returns -1 to mean TBADADDR otherwise returns an errno
 * (for use with TSYSERR reply).
 *
 * Note: lport and fport are in network byte order.
4264 */ 4265 int 4266 ip_bind_connected(conn_t *connp, mblk_t *mp, ipaddr_t *src_addrp, 4267 uint16_t lport, ipaddr_t dst_addr, uint16_t fport, 4268 boolean_t ire_requested, boolean_t ipsec_policy_set, 4269 boolean_t fanout_insert, boolean_t verify_dst) 4270 { 4271 ire_t *src_ire; 4272 ire_t *dst_ire; 4273 int error = 0; 4274 int protocol; 4275 mblk_t *policy_mp; 4276 ire_t *sire = NULL; 4277 ire_t *md_dst_ire = NULL; 4278 ill_t *md_ill = NULL; 4279 zoneid_t zoneid; 4280 ipaddr_t src_addr = *src_addrp; 4281 4282 src_ire = dst_ire = NULL; 4283 protocol = *mp->b_wptr & 0xFF; 4284 4285 /* 4286 * If we never got a disconnect before, clear it now. 4287 */ 4288 connp->conn_fully_bound = B_FALSE; 4289 4290 if (ipsec_policy_set) { 4291 policy_mp = mp->b_cont; 4292 } 4293 4294 zoneid = connp->conn_zoneid; 4295 4296 if (CLASSD(dst_addr)) { 4297 /* Pick up an IRE_BROADCAST */ 4298 dst_ire = ire_route_lookup(ip_g_all_ones, 0, 0, 0, NULL, 4299 NULL, zoneid, MBLK_GETLABEL(mp), 4300 (MATCH_IRE_RECURSIVE | 4301 MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE | 4302 MATCH_IRE_SECATTR)); 4303 } else { 4304 /* 4305 * If conn_dontroute is set or if conn_nexthop_set is set, 4306 * and onlink ipif is not found set ENETUNREACH error. 4307 */ 4308 if (connp->conn_dontroute || connp->conn_nexthop_set) { 4309 ipif_t *ipif; 4310 4311 ipif = ipif_lookup_onlink_addr(connp->conn_dontroute ? 
4312 dst_addr : connp->conn_nexthop_v4, zoneid); 4313 if (ipif == NULL) { 4314 error = ENETUNREACH; 4315 goto bad_addr; 4316 } 4317 ipif_refrele(ipif); 4318 } 4319 4320 if (connp->conn_nexthop_set) { 4321 dst_ire = ire_route_lookup(connp->conn_nexthop_v4, 0, 4322 0, 0, NULL, NULL, zoneid, MBLK_GETLABEL(mp), 4323 MATCH_IRE_SECATTR); 4324 } else { 4325 dst_ire = ire_route_lookup(dst_addr, 0, 0, 0, NULL, 4326 &sire, zoneid, MBLK_GETLABEL(mp), 4327 (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | 4328 MATCH_IRE_PARENT | MATCH_IRE_RJ_BHOLE | 4329 MATCH_IRE_SECATTR)); 4330 } 4331 } 4332 /* 4333 * dst_ire can't be a broadcast when not ire_requested. 4334 * We also prevent ire's with src address INADDR_ANY to 4335 * be used, which are created temporarily for 4336 * sending out packets from endpoints that have 4337 * conn_unspec_src set. If verify_dst is true, the destination must be 4338 * reachable. If verify_dst is false, the destination needn't be 4339 * reachable. 4340 * 4341 * If we match on a reject or black hole, then we've got a 4342 * local failure. May as well fail out the connect() attempt, 4343 * since it's never going to succeed. 4344 */ 4345 if (dst_ire == NULL || dst_ire->ire_src_addr == INADDR_ANY || 4346 (dst_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || 4347 ((dst_ire->ire_type & IRE_BROADCAST) && !ire_requested)) { 4348 /* 4349 * If we're verifying destination reachability, we always want 4350 * to complain here. 4351 * 4352 * If we're not verifying destination reachability but the 4353 * destination has a route, we still want to fail on the 4354 * temporary address and broadcast address tests. 
4355 */ 4356 if (verify_dst || (dst_ire != NULL)) { 4357 if (ip_debug > 2) { 4358 pr_addr_dbg("ip_bind_connected: bad connected " 4359 "dst %s\n", AF_INET, &dst_addr); 4360 } 4361 if (dst_ire == NULL || !(dst_ire->ire_type & IRE_HOST)) 4362 error = ENETUNREACH; 4363 else 4364 error = EHOSTUNREACH; 4365 goto bad_addr; 4366 } 4367 } 4368 4369 /* 4370 * We now know that routing will allow us to reach the destination. 4371 * Check whether Trusted Solaris policy allows communication with this 4372 * host, and pretend that the destination is unreachable if not. 4373 * 4374 * This is never a problem for TCP, since that transport is known to 4375 * compute the label properly as part of the tcp_rput_other T_BIND_ACK 4376 * handling. If the remote is unreachable, it will be detected at that 4377 * point, so there's no reason to check it here. 4378 * 4379 * Note that for sendto (and other datagram-oriented friends), this 4380 * check is done as part of the data path label computation instead. 4381 * The check here is just to make non-TCP connect() report the right 4382 * error. 4383 */ 4384 if (dst_ire != NULL && is_system_labeled() && 4385 !IPCL_IS_TCP(connp) && 4386 tsol_compute_label(DB_CREDDEF(mp, connp->conn_cred), dst_addr, NULL, 4387 connp->conn_mac_exempt) != 0) { 4388 error = EHOSTUNREACH; 4389 if (ip_debug > 2) { 4390 pr_addr_dbg("ip_bind_connected: no label for dst %s\n", 4391 AF_INET, &dst_addr); 4392 } 4393 goto bad_addr; 4394 } 4395 4396 /* 4397 * If the app does a connect(), it means that it will most likely 4398 * send more than 1 packet to the destination. It makes sense 4399 * to clear the temporary flag. 
4400 */ 4401 if (dst_ire != NULL && dst_ire->ire_type == IRE_CACHE && 4402 (dst_ire->ire_marks & IRE_MARK_TEMPORARY)) { 4403 irb_t *irb = dst_ire->ire_bucket; 4404 4405 rw_enter(&irb->irb_lock, RW_WRITER); 4406 dst_ire->ire_marks &= ~IRE_MARK_TEMPORARY; 4407 irb->irb_tmp_ire_cnt--; 4408 rw_exit(&irb->irb_lock); 4409 } 4410 4411 /* 4412 * See if we should notify ULP about MDT; we do this whether or not 4413 * ire_requested is TRUE, in order to handle active connects; MDT 4414 * eligibility tests for passive connects are handled separately 4415 * through tcp_adapt_ire(). We do this before the source address 4416 * selection, because dst_ire may change after a call to 4417 * ipif_select_source(). This is a best-effort check, as the 4418 * packet for this connection may not actually go through 4419 * dst_ire->ire_stq, and the exact IRE can only be known after 4420 * calling ip_newroute(). This is why we further check on the 4421 * IRE during Multidata packet transmission in tcp_multisend(). 4422 */ 4423 if (ip_multidata_outbound && !ipsec_policy_set && dst_ire != NULL && 4424 !(dst_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK | IRE_BROADCAST)) && 4425 (md_ill = ire_to_ill(dst_ire), md_ill != NULL) && 4426 ILL_MDT_CAPABLE(md_ill)) { 4427 md_dst_ire = dst_ire; 4428 IRE_REFHOLD(md_dst_ire); 4429 } 4430 4431 if (dst_ire != NULL && 4432 dst_ire->ire_type == IRE_LOCAL && 4433 dst_ire->ire_zoneid != zoneid && dst_ire->ire_zoneid != ALL_ZONES) { 4434 /* 4435 * If the IRE belongs to a different zone, look for a matching 4436 * route in the forwarding table and use the source address from 4437 * that route. 
4438 */ 4439 src_ire = ire_ftable_lookup(dst_addr, 0, 0, 0, NULL, NULL, 4440 zoneid, 0, NULL, 4441 MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | 4442 MATCH_IRE_RJ_BHOLE); 4443 if (src_ire == NULL) { 4444 error = EHOSTUNREACH; 4445 goto bad_addr; 4446 } else if (src_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 4447 if (!(src_ire->ire_type & IRE_HOST)) 4448 error = ENETUNREACH; 4449 else 4450 error = EHOSTUNREACH; 4451 goto bad_addr; 4452 } 4453 if (src_addr == INADDR_ANY) 4454 src_addr = src_ire->ire_src_addr; 4455 ire_refrele(src_ire); 4456 src_ire = NULL; 4457 } else if ((src_addr == INADDR_ANY) && (dst_ire != NULL)) { 4458 if ((sire != NULL) && (sire->ire_flags & RTF_SETSRC)) { 4459 src_addr = sire->ire_src_addr; 4460 ire_refrele(dst_ire); 4461 dst_ire = sire; 4462 sire = NULL; 4463 } else { 4464 /* 4465 * Pick a source address so that a proper inbound 4466 * load spreading would happen. 4467 */ 4468 ill_t *dst_ill = dst_ire->ire_ipif->ipif_ill; 4469 ipif_t *src_ipif = NULL; 4470 ire_t *ipif_ire; 4471 4472 /* 4473 * Supply a local source address such that inbound 4474 * load spreading happens. 4475 * 4476 * Determine the best source address on this ill for 4477 * the destination. 4478 * 4479 * 1) For broadcast, we should return a broadcast ire 4480 * found above so that upper layers know that the 4481 * destination address is a broadcast address. 4482 * 4483 * 2) If this is part of a group, select a better 4484 * source address so that better inbound load 4485 * balancing happens. Do the same if the ipif 4486 * is DEPRECATED. 4487 * 4488 * 3) If the outgoing interface is part of a usesrc 4489 * group, then try selecting a source address from 4490 * the usesrc ILL. 
4491 */ 4492 if ((dst_ire->ire_zoneid != zoneid && 4493 dst_ire->ire_zoneid != ALL_ZONES) || 4494 (!(dst_ire->ire_type & IRE_BROADCAST) && 4495 ((dst_ill->ill_group != NULL) || 4496 (dst_ire->ire_ipif->ipif_flags & IPIF_DEPRECATED) || 4497 (dst_ill->ill_usesrc_ifindex != 0)))) { 4498 /* 4499 * If the destination is reachable via a 4500 * given gateway, the selected source address 4501 * should be in the same subnet as the gateway. 4502 * Otherwise, the destination is not reachable. 4503 * 4504 * If there are no interfaces on the same subnet 4505 * as the destination, ipif_select_source gives 4506 * first non-deprecated interface which might be 4507 * on a different subnet than the gateway. 4508 * This is not desirable. Hence pass the dst_ire 4509 * source address to ipif_select_source. 4510 * It is sure that the destination is reachable 4511 * with the dst_ire source address subnet. 4512 * So passing dst_ire source address to 4513 * ipif_select_source will make sure that the 4514 * selected source will be on the same subnet 4515 * as dst_ire source address. 4516 */ 4517 ipaddr_t saddr = 4518 dst_ire->ire_ipif->ipif_src_addr; 4519 src_ipif = ipif_select_source(dst_ill, 4520 saddr, zoneid); 4521 if (src_ipif != NULL) { 4522 if (IS_VNI(src_ipif->ipif_ill)) { 4523 /* 4524 * For VNI there is no 4525 * interface route 4526 */ 4527 src_addr = 4528 src_ipif->ipif_src_addr; 4529 } else { 4530 ipif_ire = 4531 ipif_to_ire(src_ipif); 4532 if (ipif_ire != NULL) { 4533 IRE_REFRELE(dst_ire); 4534 dst_ire = ipif_ire; 4535 } 4536 src_addr = 4537 dst_ire->ire_src_addr; 4538 } 4539 ipif_refrele(src_ipif); 4540 } else { 4541 src_addr = dst_ire->ire_src_addr; 4542 } 4543 } else { 4544 src_addr = dst_ire->ire_src_addr; 4545 } 4546 } 4547 } 4548 4549 /* 4550 * We do ire_route_lookup() here (and not 4551 * interface lookup as we assert that 4552 * src_addr should only come from an 4553 * UP interface for hard binding. 
4554 */ 4555 ASSERT(src_ire == NULL); 4556 src_ire = ire_route_lookup(src_addr, 0, 0, 0, NULL, 4557 NULL, zoneid, NULL, MATCH_IRE_ZONEONLY); 4558 /* src_ire must be a local|loopback */ 4559 if (!IRE_IS_LOCAL(src_ire)) { 4560 if (ip_debug > 2) { 4561 pr_addr_dbg("ip_bind_connected: bad connected " 4562 "src %s\n", AF_INET, &src_addr); 4563 } 4564 error = EADDRNOTAVAIL; 4565 goto bad_addr; 4566 } 4567 4568 /* 4569 * If the source address is a loopback address, the 4570 * destination had best be local or multicast. 4571 * The transports that can't handle multicast will reject 4572 * those addresses. 4573 */ 4574 if (src_ire->ire_type == IRE_LOOPBACK && 4575 !(IRE_IS_LOCAL(dst_ire) || CLASSD(dst_addr))) { 4576 ip1dbg(("ip_bind_connected: bad connected loopback\n")); 4577 error = -1; 4578 goto bad_addr; 4579 } 4580 4581 /* 4582 * Allow setting new policies. For example, disconnects come 4583 * down as ipa_t bind. As we would have set conn_policy_cached 4584 * to B_TRUE before, we should set it to B_FALSE, so that policy 4585 * can change after the disconnect. 4586 */ 4587 connp->conn_policy_cached = B_FALSE; 4588 4589 /* 4590 * Set the conn addresses/ports immediately, so the IPsec policy calls 4591 * can handle their passed-in conn's. 4592 */ 4593 4594 IN6_IPADDR_TO_V4MAPPED(src_addr, &connp->conn_srcv6); 4595 IN6_IPADDR_TO_V4MAPPED(dst_addr, &connp->conn_remv6); 4596 connp->conn_lport = lport; 4597 connp->conn_fport = fport; 4598 *src_addrp = src_addr; 4599 4600 ASSERT(!(ipsec_policy_set && ire_requested)); 4601 if (ire_requested) { 4602 iulp_t *ulp_info = NULL; 4603 4604 /* 4605 * Note that sire will not be NULL if this is an off-link 4606 * connection and there is not cache for that dest yet. 4607 * 4608 * XXX Because of an existing bug, if there are multiple 4609 * default routes, the IRE returned now may not be the actual 4610 * default route used (default routes are chosen in a 4611 * round robin fashion). 
So if the metrics for different 4612 * default routes are different, we may return the wrong 4613 * metrics. This will not be a problem if the existing 4614 * bug is fixed. 4615 */ 4616 if (sire != NULL) { 4617 ulp_info = &(sire->ire_uinfo); 4618 } 4619 if (!ip_bind_insert_ire(mp, dst_ire, ulp_info)) { 4620 error = -1; 4621 goto bad_addr; 4622 } 4623 } else if (ipsec_policy_set) { 4624 if (!ip_bind_ipsec_policy_set(connp, policy_mp)) { 4625 error = -1; 4626 goto bad_addr; 4627 } 4628 } 4629 4630 /* 4631 * Cache IPsec policy in this conn. If we have per-socket policy, 4632 * we'll cache that. If we don't, we'll inherit global policy. 4633 * 4634 * We can't insert until the conn reflects the policy. Note that 4635 * conn_policy_cached is set by ipsec_conn_cache_policy() even for 4636 * connections where we don't have a policy. This is to prevent 4637 * global policy lookups in the inbound path. 4638 * 4639 * If we insert before we set conn_policy_cached, 4640 * CONN_INBOUND_POLICY_PRESENT() check can still evaluate true 4641 * because global policy cound be non-empty. We normally call 4642 * ipsec_check_policy() for conn_policy_cached connections only if 4643 * ipc_in_enforce_policy is set. But in this case, 4644 * conn_policy_cached can get set anytime since we made the 4645 * CONN_INBOUND_POLICY_PRESENT() check and ipsec_check_policy() is 4646 * called, which will make the above assumption false. Thus, we 4647 * need to insert after we set conn_policy_cached. 4648 */ 4649 if ((error = ipsec_conn_cache_policy(connp, B_TRUE)) != 0) 4650 goto bad_addr; 4651 4652 if (fanout_insert) { 4653 /* 4654 * The addresses have been verified. Time to insert in 4655 * the correct fanout list. 
4656 */ 4657 error = ipcl_conn_insert(connp, protocol, src_addr, 4658 dst_addr, connp->conn_ports); 4659 } 4660 4661 if (error == 0) { 4662 connp->conn_fully_bound = B_TRUE; 4663 /* 4664 * Our initial checks for MDT have passed; the IRE is not 4665 * LOCAL/LOOPBACK/BROADCAST, and the link layer seems to 4666 * be supporting MDT. Pass the IRE, IPC and ILL into 4667 * ip_mdinfo_return(), which performs further checks 4668 * against them and upon success, returns the MDT info 4669 * mblk which we will attach to the bind acknowledgment. 4670 */ 4671 if (md_dst_ire != NULL) { 4672 mblk_t *mdinfo_mp; 4673 4674 ASSERT(md_ill != NULL); 4675 ASSERT(md_ill->ill_mdt_capab != NULL); 4676 if ((mdinfo_mp = ip_mdinfo_return(md_dst_ire, connp, 4677 md_ill->ill_name, md_ill->ill_mdt_capab)) != NULL) 4678 linkb(mp, mdinfo_mp); 4679 } 4680 } 4681 bad_addr: 4682 if (ipsec_policy_set) { 4683 ASSERT(policy_mp == mp->b_cont); 4684 ASSERT(policy_mp != NULL); 4685 freeb(policy_mp); 4686 /* 4687 * As of now assume that nothing else accompanies 4688 * IPSEC_POLICY_SET. 4689 */ 4690 mp->b_cont = NULL; 4691 } 4692 if (src_ire != NULL) 4693 IRE_REFRELE(src_ire); 4694 if (dst_ire != NULL) 4695 IRE_REFRELE(dst_ire); 4696 if (sire != NULL) 4697 IRE_REFRELE(sire); 4698 if (md_dst_ire != NULL) 4699 IRE_REFRELE(md_dst_ire); 4700 return (error); 4701 } 4702 4703 /* 4704 * Insert the ire in b_cont. Returns false if it fails (due to lack of space). 4705 * Prefers dst_ire over src_ire. 4706 */ 4707 static boolean_t 4708 ip_bind_insert_ire(mblk_t *mp, ire_t *ire, iulp_t *ulp_info) 4709 { 4710 mblk_t *mp1; 4711 ire_t *ret_ire = NULL; 4712 4713 mp1 = mp->b_cont; 4714 ASSERT(mp1 != NULL); 4715 4716 if (ire != NULL) { 4717 /* 4718 * mp1 initialized above to IRE_DB_REQ_TYPE 4719 * appended mblk. Its <upper protocol>'s 4720 * job to make sure there is room. 
4721 */ 4722 if ((mp1->b_datap->db_lim - mp1->b_rptr) < sizeof (ire_t)) 4723 return (0); 4724 4725 mp1->b_datap->db_type = IRE_DB_TYPE; 4726 mp1->b_wptr = mp1->b_rptr + sizeof (ire_t); 4727 bcopy(ire, mp1->b_rptr, sizeof (ire_t)); 4728 ret_ire = (ire_t *)mp1->b_rptr; 4729 /* 4730 * Pass the latest setting of the ip_path_mtu_discovery and 4731 * copy the ulp info if any. 4732 */ 4733 ret_ire->ire_frag_flag |= (ip_path_mtu_discovery) ? 4734 IPH_DF : 0; 4735 if (ulp_info != NULL) { 4736 bcopy(ulp_info, &(ret_ire->ire_uinfo), 4737 sizeof (iulp_t)); 4738 } 4739 ret_ire->ire_mp = mp1; 4740 } else { 4741 /* 4742 * No IRE was found. Remove IRE mblk. 4743 */ 4744 mp->b_cont = mp1->b_cont; 4745 freeb(mp1); 4746 } 4747 4748 return (1); 4749 } 4750 4751 /* 4752 * Carve "len" bytes out of an mblk chain, consuming any we empty, and duping 4753 * the final piece where we don't. Return a pointer to the first mblk in the 4754 * result, and update the pointer to the next mblk to chew on. If anything 4755 * goes wrong (i.e., dupb fails), we waste everything in sight and return a 4756 * NULL pointer. 4757 */ 4758 mblk_t * 4759 ip_carve_mp(mblk_t **mpp, ssize_t len) 4760 { 4761 mblk_t *mp0; 4762 mblk_t *mp1; 4763 mblk_t *mp2; 4764 4765 if (!len || !mpp || !(mp0 = *mpp)) 4766 return (NULL); 4767 /* If we aren't going to consume the first mblk, we need a dup. */ 4768 if (mp0->b_wptr - mp0->b_rptr > len) { 4769 mp1 = dupb(mp0); 4770 if (mp1) { 4771 /* Partition the data between the two mblks. */ 4772 mp1->b_wptr = mp1->b_rptr + len; 4773 mp0->b_rptr = mp1->b_wptr; 4774 /* 4775 * after adjustments if mblk not consumed is now 4776 * unaligned, try to align it. If this fails free 4777 * all messages and let upper layer recover. 4778 */ 4779 if (!OK_32PTR(mp0->b_rptr)) { 4780 if (!pullupmsg(mp0, -1)) { 4781 freemsg(mp0); 4782 freemsg(mp1); 4783 *mpp = NULL; 4784 return (NULL); 4785 } 4786 } 4787 } 4788 return (mp1); 4789 } 4790 /* Eat through as many mblks as we need to get len bytes. 
*/ 4791 len -= mp0->b_wptr - mp0->b_rptr; 4792 for (mp2 = mp1 = mp0; (mp2 = mp2->b_cont) != 0 && len; mp1 = mp2) { 4793 if (mp2->b_wptr - mp2->b_rptr > len) { 4794 /* 4795 * We won't consume the entire last mblk. Like 4796 * above, dup and partition it. 4797 */ 4798 mp1->b_cont = dupb(mp2); 4799 mp1 = mp1->b_cont; 4800 if (!mp1) { 4801 /* 4802 * Trouble. Rather than go to a lot of 4803 * trouble to clean up, we free the messages. 4804 * This won't be any worse than losing it on 4805 * the wire. 4806 */ 4807 freemsg(mp0); 4808 freemsg(mp2); 4809 *mpp = NULL; 4810 return (NULL); 4811 } 4812 mp1->b_wptr = mp1->b_rptr + len; 4813 mp2->b_rptr = mp1->b_wptr; 4814 /* 4815 * after adjustments if mblk not consumed is now 4816 * unaligned, try to align it. If this fails free 4817 * all messages and let upper layer recover. 4818 */ 4819 if (!OK_32PTR(mp2->b_rptr)) { 4820 if (!pullupmsg(mp2, -1)) { 4821 freemsg(mp0); 4822 freemsg(mp2); 4823 *mpp = NULL; 4824 return (NULL); 4825 } 4826 } 4827 *mpp = mp2; 4828 return (mp0); 4829 } 4830 /* Decrement len by the amount we just got. */ 4831 len -= mp2->b_wptr - mp2->b_rptr; 4832 } 4833 /* 4834 * len should be reduced to zero now. If not our caller has 4835 * screwed up. 4836 */ 4837 if (len) { 4838 /* Shouldn't happen! */ 4839 freemsg(mp0); 4840 *mpp = NULL; 4841 return (NULL); 4842 } 4843 /* 4844 * We consumed up to exactly the end of an mblk. Detach the part 4845 * we are returning from the rest of the chain. 4846 */ 4847 mp1->b_cont = NULL; 4848 *mpp = mp2; 4849 return (mp0); 4850 } 4851 4852 /* The ill stream is being unplumbed. Called from ip_close */ 4853 int 4854 ip_modclose(ill_t *ill) 4855 { 4856 4857 boolean_t success; 4858 ipsq_t *ipsq; 4859 ipif_t *ipif; 4860 queue_t *q = ill->ill_rq; 4861 4862 /* 4863 * Forcibly enter the ipsq after some delay. This is to take 4864 * care of the case when some ioctl does not complete because 4865 * we sent a control message to the driver and it did not 4866 * send us a reply. 
 We want to be able to at least unplumb
	 * and replumb rather than force the user to reboot the system.
	 */
	success = ipsq_enter(ill, B_FALSE);

	/*
	 * Open/close/push/pop is guaranteed to be single threaded
	 * per stream by STREAMS.  FS guarantees that all references
	 * from top are gone before close is called.  So there can't
	 * be another close thread that has set CONDEMNED on this ill
	 * and caused ipsq_enter to return failure.
	 */
	ASSERT(success);
	ipsq = ill->ill_phyint->phyint_ipsq;

	/*
	 * Mark it condemned. No new reference will be made to this ill.
	 * Lookup functions will return an error. Threads that try to
	 * increment the refcnt must check for ILL_CAN_LOOKUP. This ensures
	 * that the refcnt will drop down to zero.
	 */
	mutex_enter(&ill->ill_lock);
	ill->ill_state_flags |= ILL_CONDEMNED;
	/* Condemn every ipif hanging off this ill as well. */
	for (ipif = ill->ill_ipif; ipif != NULL;
	    ipif = ipif->ipif_next) {
		ipif->ipif_state_flags |= IPIF_CONDEMNED;
	}
	/*
	 * Wake up anybody waiting to enter the ipsq. ipsq_enter
	 * returns error if ILL_CONDEMNED is set
	 */
	cv_broadcast(&ill->ill_cv);
	mutex_exit(&ill->ill_lock);

	/*
	 * Shut down fragmentation reassembly.
	 * ill_frag_timer won't start a timer again.
	 * Now cancel any existing timer
	 */
	(void) untimeout(ill->ill_frag_timer_id);
	(void) ill_frag_timeout(ill, 0);

	/*
	 * If MOVE was in progress, clear the
	 * move_in_progress fields also.
	 */
	if (ill->ill_move_in_progress) {
		ILL_CLEAR_MOVE(ill);
	}

	/*
	 * Call ill_delete to bring down the ipifs, ilms and ill on
	 * this ill.  Then wait for the refcnts to drop to zero.
	 * ill_is_quiescent checks whether the ill is really quiescent.
	 * Then make sure that threads that are waiting to enter the
	 * ipsq have seen the error returned by ipsq_enter and have
	 * gone away.  Then we call ill_delete_tail which does the
	 * DL_UNBIND and DL_DETACH with the driver and then qprocsoff.
	 */
	ill_delete(ill);
	mutex_enter(&ill->ill_lock);
	/* Block until the ill is truly quiescent and no waiters remain. */
	while (!ill_is_quiescent(ill))
		cv_wait(&ill->ill_cv, &ill->ill_lock);
	while (ill->ill_waiters)
		cv_wait(&ill->ill_cv, &ill->ill_lock);

	mutex_exit(&ill->ill_lock);

	/* qprocsoff is called in ill_delete_tail */
	ill_delete_tail(ill);

	/*
	 * Walk through all upper (conn) streams and qenable
	 * those that have queued data.
	 * close synchronization needs this to
	 * be done to ensure that all upper layers blocked
	 * due to flow control to the closing device
	 * get unblocked.
	 */
	ip1dbg(("ip_wsrv: walking\n"));
	conn_walk_drain();

	/* Unlink this instance from the module's list under ip_mi_lock. */
	mutex_enter(&ip_mi_lock);
	mi_close_unlink(&ip_g_head, (IDP)ill);
	mutex_exit(&ip_mi_lock);

	/*
	 * credp could be null if the open didn't succeed and ip_modopen
	 * itself calls ip_close.
	 */
	if (ill->ill_credp != NULL)
		crfree(ill->ill_credp);

	mi_close_free((IDP)ill);
	q->q_ptr = WR(q)->q_ptr = NULL;

	ipsq_exit(ipsq, B_TRUE, B_TRUE);

	return (0);
}

/*
 * This is called as part of close() for both IP and UDP
 * in order to quiesce the conn.
 */
void
ip_quiesce_conn(conn_t *connp)
{
	boolean_t	drain_cleanup_reqd = B_FALSE;
	boolean_t	conn_ioctl_cleanup_reqd = B_FALSE;
	boolean_t	ilg_cleanup_reqd = B_FALSE;

	/* TCP conns are quiesced by their own close path, never here. */
	ASSERT(!IPCL_IS_TCP(connp));

	/*
	 * Mark the conn as closing, and this conn must not be
	 * inserted in future into any list. Eg. conn_drain_insert(),
	 * won't insert this conn into the conn_drain_list.
	 * Similarly ill_pending_mp_add() will not add any mp to
	 * the pending mp list, after this conn has started closing.
	 *
	 * conn_idl, conn_pending_ill, conn_down_pending_ill, conn_ilg
	 * cannot get set henceforth.
	 */
	mutex_enter(&connp->conn_lock);
	ASSERT(!(connp->conn_state_flags & CONN_QUIESCED));
	connp->conn_state_flags |= CONN_CLOSING;
	/* Snapshot, under conn_lock, which cleanup steps are needed below. */
	if (connp->conn_idl != NULL)
		drain_cleanup_reqd = B_TRUE;
	if (connp->conn_oper_pending_ill != NULL)
		conn_ioctl_cleanup_reqd = B_TRUE;
	if (connp->conn_ilg_inuse != 0)
		ilg_cleanup_reqd = B_TRUE;
	mutex_exit(&connp->conn_lock);

	if (IPCL_IS_UDP(connp))
		udp_quiesce_conn(connp);

	if (conn_ioctl_cleanup_reqd)
		conn_ioctl_cleanup(connp);

	/*
	 * On a labeled (Trusted Extensions) system, release any anonymous
	 * MLP port reservation this conn holds.
	 */
	if (is_system_labeled() && connp->conn_anon_port) {
		(void) tsol_mlp_anon(crgetzone(connp->conn_cred),
		    connp->conn_mlp_type, connp->conn_ulp,
		    ntohs(connp->conn_lport), B_FALSE);
		connp->conn_anon_port = 0;
	}
	connp->conn_mlp_type = mlptSingle;

	/*
	 * Remove this conn from any fanout list it is on.
	 * and then wait for any threads currently operating
	 * on this endpoint to finish
	 */
	ipcl_hash_remove(connp);

	/*
	 * Remove this conn from the drain list, and do
	 * any other cleanup that may be required.
	 * (Only non-tcp streams may have a non-null conn_idl.
	 * TCP streams are never flow controlled, and
	 * conn_idl will be null)
	 */
	if (drain_cleanup_reqd)
		conn_drain_tail(connp, B_TRUE);

	/* If this conn was the multicast router stream, shut that down. */
	if (connp->conn_rq == ip_g_mrouter || connp->conn_wq == ip_g_mrouter)
		(void) ip_mrouter_done(NULL);

	if (ilg_cleanup_reqd)
		ilg_delete_all(connp);

	conn_delete_ire(connp, NULL);

	/*
	 * Now conn refcnt can increase only thru CONN_INC_REF_LOCKED.
	 * callers from write side can't be there now because close
	 * is in progress. The only other caller is ipcl_walk
	 * which checks for the condemned flag.
	 */
	mutex_enter(&connp->conn_lock);
	connp->conn_state_flags |= CONN_CONDEMNED;
	/* Wait for all other references to this conn to go away. */
	while (connp->conn_ref != 1)
		cv_wait(&connp->conn_cv, &connp->conn_lock);
	connp->conn_state_flags |= CONN_QUIESCED;
	mutex_exit(&connp->conn_lock);
}

/* ARGSUSED */
int
ip_close(queue_t *q, int flags)
{
	conn_t		*connp;

	TRACE_1(TR_FAC_IP, TR_IP_CLOSE, "ip_close: q %p", q);

	/*
	 * Call the appropriate delete routine depending on whether this is
	 * a module or device.
	 */
	if (WR(q)->q_next != NULL) {
		/* This is a module close */
		return (ip_modclose((ill_t *)q->q_ptr));
	}

	connp = q->q_ptr;
	ip_quiesce_conn(connp);

	qprocsoff(q);

	/*
	 * Now we are truly single threaded on this stream, and can
	 * delete the things hanging off the connp, and finally the connp.
	 * We removed this connp from the fanout list, it cannot be
	 * accessed thru the fanouts, and we already waited for the
	 * conn_ref to drop to 0. We are already in close, so
	 * there cannot be any other thread from the top. qprocsoff
	 * has completed, and service has completed or won't run in
	 * future.
	 */
	ASSERT(connp->conn_ref == 1);

	/*
	 * A conn which was previously marked as IPCL_UDP cannot
	 * retain the flag because it would have been cleared by
	 * udp_close().
	 */
	ASSERT(!IPCL_IS_UDP(connp));

	/* Release cached IPsec state hanging off the conn, if any. */
	if (connp->conn_latch != NULL) {
		IPLATCH_REFRELE(connp->conn_latch);
		connp->conn_latch = NULL;
	}
	if (connp->conn_policy != NULL) {
		IPPH_REFRELE(connp->conn_policy);
		connp->conn_policy = NULL;
	}
	if (connp->conn_ipsec_opt_mp != NULL) {
		freemsg(connp->conn_ipsec_opt_mp);
		connp->conn_ipsec_opt_mp = NULL;
	}

	inet_minor_free(ip_minor_arena, connp->conn_dev);

	/* Drop the last reference and destroy the conn. */
	connp->conn_ref--;
	ipcl_conn_destroy(connp);

	q->q_ptr = WR(q)->q_ptr = NULL;
	return (0);
}

/*
 * Close routine for the TCP/UDP SNMP module instance (IPCL_TCPMOD or
 * IPCL_UDPMOD conns); tears down the queue pair and drops the conn ref.
 */
int
ip_snmpmod_close(queue_t *q)
{
	conn_t *connp = Q_TO_CONN(q);
	ASSERT(connp->conn_flags & (IPCL_TCPMOD | IPCL_UDPMOD));

	qprocsoff(q);

	if (connp->conn_flags & IPCL_UDPMOD)
		udp_close_free(connp);

	if (connp->conn_cred != NULL) {
		crfree(connp->conn_cred);
		connp->conn_cred = NULL;
	}
	CONN_DEC_REF(connp);
	q->q_ptr = WR(q)->q_ptr = NULL;
	return (0);
}

/*
 * Write side put procedure for TCP module or UDP module instance.  TCP/UDP
 * as a module is only used for MIB browsers that push TCP/UDP over IP or ARP.
 * The only supported primitives are T_SVR4_OPTMGMT_REQ and T_OPTMGMT_REQ.
 * M_FLUSH messages and ioctls are only passed downstream; we don't flush our
 * queues as we never enqueue messages there and we don't handle any ioctls.
 * Everything else is freed.
 */
void
ip_snmpmod_wput(queue_t *q, mblk_t *mp)
{
	conn_t		*connp = q->q_ptr;
	pfi_t		setfn;
	pfi_t		getfn;

	ASSERT(connp->conn_flags & (IPCL_TCPMOD | IPCL_UDPMOD));

	switch (DB_TYPE(mp)) {
	case M_PROTO:
	case M_PCPROTO:
		/* Accept only the two OPTMGMT TPI primitives. */
		if ((MBLKL(mp) >= sizeof (t_scalar_t)) &&
		    ((((union T_primitives *)mp->b_rptr)->type ==
		    T_SVR4_OPTMGMT_REQ) ||
		    (((union T_primitives *)mp->b_rptr)->type ==
		    T_OPTMGMT_REQ))) {
			/*
			 * This is the only TPI primitive supported. Its
			 * handling does not require tcp_t, but it does require
			 * conn_t to check permissions.
			 */
			cred_t	*cr = DB_CREDDEF(mp, connp->conn_cred);

			/* Dispatch to TCP or UDP SNMP handlers by conn type. */
			if (connp->conn_flags & IPCL_TCPMOD) {
				setfn = tcp_snmp_set;
				getfn = tcp_snmp_get;
			} else {
				setfn = udp_snmp_set;
				getfn = udp_snmp_get;
			}
			if (!snmpcom_req(q, mp, setfn, getfn, cr)) {
				freemsg(mp);
				return;
			}
		} else if ((mp = mi_tpi_err_ack_alloc(mp, TPROTO, ENOTSUP))
		    != NULL)
			qreply(q, mp);
		break;
	case M_FLUSH:
	case M_IOCTL:
		/* Pass flushes and ioctls downstream untouched. */
		putnext(q, mp);
		break;
	default:
		freemsg(mp);
		break;
	}
}

/* Return the IP checksum for the IP header at "iph".
 */
/*
 * One's-complement sum over the 20-byte base header (ten 16-bit words)
 * plus any options; opt_len counts option words beyond the minimal header.
 * Carries are folded back in, the result is complemented, and an all-ones
 * result is mapped to zero.
 */
uint16_t
ip_csum_hdr(ipha_t *ipha)
{
	uint16_t	*uph;
	uint32_t	sum;
	int		opt_len;

	/* Header length is in 32-bit words; subtract the minimal 5. */
	opt_len = (ipha->ipha_version_and_hdr_length & 0xF) -
	    IP_SIMPLE_HDR_LENGTH_IN_WORDS;
	uph = (uint16_t *)ipha;
	sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] +
	    uph[5] + uph[6] + uph[7] + uph[8] + uph[9];
	if (opt_len > 0) {
		/* Each option word contributes two 16-bit halves. */
		do {
			sum += uph[10];
			sum += uph[11];
			uph += 2;
		} while (--opt_len);
	}
	/* Fold the carries back into the low 16 bits, then complement. */
	sum = (sum & 0xFFFF) + (sum >> 16);
	sum = ~(sum + (sum >> 16)) & 0xFFFF;
	if (sum == 0xffff)
		sum = 0;
	return ((uint16_t)sum);
}

/*
 * Tear down IP's global state: protocol modules, IPsec machinery, ND
 * parameters, locks, caches and kstats set up by ip_ddi_init().
 */
void
ip_ddi_destroy(void)
{
	tnet_fini();
	tcp_ddi_destroy();
	sctp_ddi_destroy();
	ipsec_loader_destroy();
	ipsec_policy_destroy();
	ipsec_kstat_destroy();
	nd_free(&ip_g_nd);
	mutex_destroy(&igmp_timer_lock);
	mutex_destroy(&mld_timer_lock);
	mutex_destroy(&igmp_slowtimeout_lock);
	mutex_destroy(&mld_slowtimeout_lock);
	mutex_destroy(&ip_mi_lock);
	mutex_destroy(&rts_clients.connf_lock);
	ip_ire_fini();
	ip6_asp_free();
	conn_drain_fini();
	ipcl_destroy();
	inet_minor_destroy(ip_minor_arena);
	icmp_kstat_fini();
	ip_kstat_fini();
	rw_destroy(&ipsec_capab_ills_lock);
	rw_destroy(&ill_g_usesrc_lock);
	ip_drop_unregister(&ip_dropper);
}


/*
 * One-time initialization of IP's global state: major numbers, ND
 * parameters, IPsec, locks, minor arena, classifier, IRE/ipif subsystems,
 * TCP/SCTP and kstats.  Inverse of ip_ddi_destroy().
 */
void
ip_ddi_init(void)
{
	TCP6_MAJ = ddi_name_to_major(TCP6);
	TCP_MAJ = ddi_name_to_major(TCP);
	SCTP_MAJ = ddi_name_to_major(SCTP);
	SCTP6_MAJ = ddi_name_to_major(SCTP6);

	ip_input_proc = ip_squeue_switch(ip_squeue_enter);

	/* IP's IPsec code calls the packet dropper */
	ip_drop_register(&ip_dropper, "IP IPsec processing");

	/* Register the tunable/ND parameter tables once. */
	if (!ip_g_nd) {
		if (!ip_param_register(lcl_param_arr, A_CNT(lcl_param_arr),
		    lcl_ndp_arr, A_CNT(lcl_ndp_arr))) {
			nd_free(&ip_g_nd);
		}
	}

	ipsec_loader_init();
	ipsec_policy_init();
	ipsec_kstat_init();
	rw_init(&ip_g_nd_lock, NULL, RW_DEFAULT, NULL);
	mutex_init(&igmp_timer_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&mld_timer_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&igmp_slowtimeout_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&mld_slowtimeout_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&ip_mi_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&ip_addr_avail_lock, NULL, MUTEX_DEFAULT, NULL);
	rw_init(&ill_g_lock, NULL, RW_DEFAULT, NULL);
	rw_init(&ipsec_capab_ills_lock, NULL, RW_DEFAULT, NULL);
	rw_init(&ill_g_usesrc_lock, NULL, RW_DEFAULT, NULL);

	/*
	 * For IP and TCP the minor numbers should start from 2 since we have 4
	 * initial devices: ip, ip6, tcp, tcp6.
	 */
	if ((ip_minor_arena = inet_minor_create("ip_minor_arena",
	    INET_MIN_DEV + 2, KM_SLEEP)) == NULL) {
		cmn_err(CE_PANIC,
		    "ip_ddi_init: ip_minor_arena creation failed\n");
	}

	ipcl_init();
	mutex_init(&rts_clients.connf_lock, NULL, MUTEX_DEFAULT, NULL);
	ip_ire_init();
	ip6_asp_init();
	ipif_init();
	conn_drain_init();
	tcp_ddi_init();
	sctp_ddi_init();

	ip_poll_normal_ticks = MSEC_TO_TICK_ROUNDUP(ip_poll_normal_ms);

	if ((ip_kstat = kstat_create("ip", 0, "ipstat",
	    "net", KSTAT_TYPE_NAMED,
	    sizeof (ip_statistics) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL)) != NULL) {
		ip_kstat->ks_data = &ip_statistics;
		kstat_install(ip_kstat);
	}
	ip_kstat_init();
	ip6_kstat_init();
	icmp_kstat_init();
	ipsec_loader_start();
	tnet_init();
}

/*
 * Allocate and initialize a DLPI template of the specified length. (May be
 * called as writer.)
 */
mblk_t *
ip_dlpi_alloc(size_t len, t_uscalar_t prim)
{
	mblk_t	*mp;

	mp = allocb(len, BPRI_MED);
	if (!mp)
		return (NULL);

	/*
	 * DLPIv2 says that DL_INFO_REQ and DL_TOKEN_REQ (the latter
	 * of which we don't seem to use) are sent with M_PCPROTO, and
	 * that other DLPI are M_PROTO.
	 */
	if (prim == DL_INFO_REQ) {
		mp->b_datap->db_type = M_PCPROTO;
	} else {
		mp->b_datap->db_type = M_PROTO;
	}

	/* Zero the whole template and stamp the requested primitive. */
	mp->b_wptr = mp->b_rptr + len;
	bzero(mp->b_rptr, len);
	((dl_unitdata_req_t *)mp->b_rptr)->dl_primitive = prim;
	return (mp);
}

/*
 * Map a DLPI primitive code to its symbolic name, for debug/log output.
 */
const char *
dlpi_prim_str(int prim)
{
	switch (prim) {
	case DL_INFO_REQ:	return ("DL_INFO_REQ");
	case DL_INFO_ACK:	return ("DL_INFO_ACK");
	case DL_ATTACH_REQ:	return ("DL_ATTACH_REQ");
	case DL_DETACH_REQ:	return ("DL_DETACH_REQ");
	case DL_BIND_REQ:	return ("DL_BIND_REQ");
	case DL_BIND_ACK:	return ("DL_BIND_ACK");
	case DL_UNBIND_REQ:	return ("DL_UNBIND_REQ");
	case DL_OK_ACK:		return ("DL_OK_ACK");
	case DL_ERROR_ACK:	return ("DL_ERROR_ACK");
	case DL_ENABMULTI_REQ:	return ("DL_ENABMULTI_REQ");
	case DL_DISABMULTI_REQ:	return ("DL_DISABMULTI_REQ");
	case DL_PROMISCON_REQ:	return ("DL_PROMISCON_REQ");
	case DL_PROMISCOFF_REQ:	return ("DL_PROMISCOFF_REQ");
	case DL_UNITDATA_REQ:	return ("DL_UNITDATA_REQ");
	case DL_UNITDATA_IND:	return ("DL_UNITDATA_IND");
	case DL_UDERROR_IND:	return ("DL_UDERROR_IND");
	case DL_PHYS_ADDR_REQ:	return ("DL_PHYS_ADDR_REQ");
	case DL_PHYS_ADDR_ACK:	return ("DL_PHYS_ADDR_ACK");
	case DL_SET_PHYS_ADDR_REQ:	return ("DL_SET_PHYS_ADDR_REQ");
	case DL_NOTIFY_REQ:	return ("DL_NOTIFY_REQ");
	case DL_NOTIFY_ACK:	return ("DL_NOTIFY_ACK");
	case DL_NOTIFY_IND:	return ("DL_NOTIFY_IND");
	case DL_CAPABILITY_REQ:	return ("DL_CAPABILITY_REQ");
	case DL_CAPABILITY_ACK:	return ("DL_CAPABILITY_ACK");
	case DL_CONTROL_REQ:	return ("DL_CONTROL_REQ");
	case DL_CONTROL_ACK:	return ("DL_CONTROL_ACK");
	default:		return ("<unknown primitive>");
	}
}

/*
 * Map a DLPI error code to its symbolic name, for debug/log output.
 */
const char *
dlpi_err_str(int err)
{
	switch (err) {
	case DL_ACCESS:		return ("DL_ACCESS");
	case DL_BADADDR:	return ("DL_BADADDR");
	case DL_BADCORR:	return ("DL_BADCORR");
	case DL_BADDATA:	return ("DL_BADDATA");
	case DL_BADPPA:		return ("DL_BADPPA");
	case DL_BADPRIM:	return ("DL_BADPRIM");
	case DL_BADQOSPARAM:	return ("DL_BADQOSPARAM");
	case DL_BADQOSTYPE:	return ("DL_BADQOSTYPE");
	case DL_BADSAP:		return ("DL_BADSAP");
	case DL_BADTOKEN:	return ("DL_BADTOKEN");
	case DL_BOUND:		return ("DL_BOUND");
	case DL_INITFAILED:	return ("DL_INITFAILED");
	case DL_NOADDR:		return ("DL_NOADDR");
	case DL_NOTINIT:	return ("DL_NOTINIT");
	case DL_OUTSTATE:	return ("DL_OUTSTATE");
	case DL_SYSERR:		return ("DL_SYSERR");
	case DL_UNSUPPORTED:	return ("DL_UNSUPPORTED");
	case DL_UNDELIVERABLE:	return ("DL_UNDELIVERABLE");
	case DL_NOTSUPPORTED :	return ("DL_NOTSUPPORTED ");
	case DL_TOOMANY:	return ("DL_TOOMANY");
	case DL_NOTENAB:	return ("DL_NOTENAB");
	case DL_BUSY:		return ("DL_BUSY");
	case DL_NOAUTO:		return ("DL_NOAUTO");
	case DL_NOXIDAUTO:	return ("DL_NOXIDAUTO");
	case DL_NOTESTAUTO:	return ("DL_NOTESTAUTO");
	case DL_XIDAUTO:	return ("DL_XIDAUTO");
	case DL_TESTAUTO:	return ("DL_TESTAUTO");
	case DL_PENDING:	return ("DL_PENDING");
	default:		return ("<unknown error>");
	}
}

/*
 * Debug formatting routine. Returns a character string representation of the
 * addr in buf, of the form xxx.xxx.xxx.xxx.  This routine takes the address
 * in the form of a ipaddr_t and calls ip_dot_saddr with a pointer.
 */
char *
ip_dot_addr(ipaddr_t addr, char *buf)
{
	return (ip_dot_saddr((uchar_t *)&addr, buf));
}

/*
 * Debug formatting routine. Returns a character string representation of the
 * addr in buf, of the form xxx.xxx.xxx.xxx. This routine takes the address
 * as a pointer. The "xxx" parts including left zero padding so the final
 * string will fit easily in tables. It would be nice to take a padding
 * length argument instead.
 */
static char *
ip_dot_saddr(uchar_t *addr, char *buf)
{
	(void) mi_sprintf(buf, "%03d.%03d.%03d.%03d",
	    addr[0] & 0xFF, addr[1] & 0xFF, addr[2] & 0xFF, addr[3] & 0xFF);
	return (buf);
}

/*
 * Send an ICMP error after patching up the packet appropriately.  Returns
 * non-zero if the appropriate MIB should be bumped; zero otherwise.
 * Consumes the passed-in mblk chain on every path.
 */
static boolean_t
ip_fanout_send_icmp(queue_t *q, mblk_t *mp, uint_t flags,
    uint_t icmp_type, uint_t icmp_code, boolean_t mctl_present, zoneid_t zoneid)
{
	ipha_t *ipha;
	mblk_t *first_mp;
	boolean_t secure;
	unsigned char db_type;

	first_mp = mp;
	if (mctl_present) {
		/* M_CTL (IPSEC_IN) precedes the data; skip to the data mblk. */
		mp = mp->b_cont;
		secure = ipsec_in_is_secure(first_mp);
		ASSERT(mp != NULL);
	} else {
		/*
		 * If this is an ICMP error being reported - which goes
		 * up as M_CTLs, we need to convert them to M_DATA till
		 * we finish checking with global policy because
		 * ipsec_check_global_policy() assumes M_DATA as clear
		 * and M_CTL as secure.
		 */
		db_type = DB_TYPE(mp);
		DB_TYPE(mp) = M_DATA;
		secure = B_FALSE;
	}
	/*
	 * We are generating an icmp error for some inbound packet.
	 * Called from all ip_fanout_(udp, tcp, proto) functions.
	 * Before we generate an error, check with global policy
	 * to see whether this is allowed to enter the system. As
	 * there is no "conn", we are checking with global policy.
	 */
	ipha = (ipha_t *)mp->b_rptr;
	if (secure || ipsec_inbound_v4_policy_present) {
		first_mp = ipsec_check_global_policy(first_mp, NULL,
		    ipha, NULL, mctl_present);
		/* Policy dropped the packet; nothing left to send. */
		if (first_mp == NULL)
			return (B_FALSE);
	}

	/* Restore the original db_type we overrode above. */
	if (!mctl_present)
		DB_TYPE(mp) = db_type;

	if (flags & IP_FF_SEND_ICMP) {
		if (flags & IP_FF_HDR_COMPLETE) {
			if (ip_hdr_complete(ipha, zoneid)) {
				freemsg(first_mp);
				return (B_TRUE);
			}
		}
		if (flags & IP_FF_CKSUM) {
			/*
			 * Have to correct checksum since
			 * the packet might have been
			 * fragmented and the reassembly code in ip_rput
			 * does not restore the IP checksum.
			 */
			ipha->ipha_hdr_checksum = 0;
			ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
		}
		switch (icmp_type) {
		case ICMP_DEST_UNREACHABLE:
			icmp_unreachable(WR(q), first_mp, icmp_code);
			break;
		default:
			/* Only DEST_UNREACHABLE is supported here. */
			freemsg(first_mp);
			break;
		}
	} else {
		freemsg(first_mp);
		return (B_FALSE);
	}

	return (B_TRUE);
}

/*
 * Used to send an ICMP error message when a packet is received for
 * a protocol that is not supported. The mblk passed as argument
 * is consumed by this function.
 */
void
ip_proto_not_sup(queue_t *q, mblk_t *ipsec_mp, uint_t flags, zoneid_t zoneid)
{
	mblk_t *mp;
	ipha_t	*ipha;
	ill_t	*ill;
	ipsec_in_t *ii;

	ii = (ipsec_in_t *)ipsec_mp->b_rptr;
	ASSERT(ii->ipsec_in_type == IPSEC_IN);

	/* Detach the data from the IPSEC_IN control mblk. */
	mp = ipsec_mp->b_cont;
	ipsec_mp->b_cont = NULL;
	ipha = (ipha_t *)mp->b_rptr;
	if (IPH_HDR_VERSION(ipha) == IP_VERSION) {
		if (ip_fanout_send_icmp(q, mp, flags, ICMP_DEST_UNREACHABLE,
		    ICMP_PROTOCOL_UNREACHABLE, B_FALSE, zoneid)) {
			BUMP_MIB(&ip_mib, ipInUnknownProtos);
		}
	} else {
		/* Get ill from index in ipsec_in_t. */
		ill = ill_lookup_on_ifindex(ii->ipsec_in_ill_index,
		    B_TRUE, NULL, NULL, NULL, NULL);
		if (ill != NULL) {
			if (ip_fanout_send_icmp_v6(q, mp, flags,
			    ICMP6_PARAM_PROB, ICMP6_PARAMPROB_NEXTHEADER,
			    0, B_FALSE, zoneid)) {
				BUMP_MIB(ill->ill_ip6_mib, ipv6InUnknownProtos);
			}

			ill_refrele(ill);
		} else { /* re-link for the freemsg() below. */
			ipsec_mp->b_cont = mp;
		}
	}

	/* If ICMP delivered, ipsec_mp will be a singleton (b_cont == NULL). */
	freemsg(ipsec_mp);
}

/*
 * See if the inbound datagram has had IPsec processing applied to it.
 */
boolean_t
ipsec_in_is_secure(mblk_t *ipsec_mp)
{
	ipsec_in_t *ii;

	ii = (ipsec_in_t *)ipsec_mp->b_rptr;
	ASSERT(ii->ipsec_in_type == IPSEC_IN);

	/*
	 * Loopback packets carry an explicit flag; otherwise the packet is
	 * secure iff an AH or ESP SA was applied or it was decapsulated.
	 */
	if (ii->ipsec_in_loopback) {
		return (ii->ipsec_in_secure);
	} else {
		return (ii->ipsec_in_ah_sa != NULL ||
		    ii->ipsec_in_esp_sa != NULL ||
		    ii->ipsec_in_decaps);
	}
}

/*
 * Handle protocols with which IP is less intimate. There
 * can be more than one stream bound to a particular
 * protocol. When this is the case, normally each one gets a copy
 * of any incoming packets.
 *
 * IPSEC NOTE :
 *
 * Don't allow a secure packet going up a non-secure connection.
 * We don't allow this because
 *
 * 1) Reply might go out in clear which will be dropped at
 *    the sending side.
 * 2) If the reply goes out in clear it will give the
 *    adversary enough information for getting the key in
 *    most of the cases.
 *
 * Moreover getting a secure packet when we expect clear
 * implies that SA's were added without checking for
 * policy on both ends. This should not happen once ISAKMP
 * is used to negotiate SAs as SAs will be added only after
 * verifying the policy.
 *
 * NOTE : If the packet was tunneled and not multicast we only send
 * to it the first match. Unlike TCP and UDP fanouts this doesn't fall
 * back to delivering packets to AF_INET6 raw sockets.
 *
 * IPQoS Notes:
 * Once we have determined the client, invoke IPPF processing.
 * Policy processing takes place only if the callout_position, IPP_LOCAL_IN,
 * is enabled. If we get here from icmp_inbound_error_fanout or ip_wput_local
 * ip_policy will be false.
 *
 * Zones notes:
 * Currently only applications in the global zone can create raw sockets for
 * protocols other than ICMP. So unlike the broadcast / multicast case of
 * ip_fanout_udp(), we only send a copy of the packet to streams in the
 * specified zone. For ICMP, this is handled by the callers of icmp_inbound().
 */
static void
ip_fanout_proto(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha, uint_t flags,
    boolean_t mctl_present, boolean_t ip_policy, ill_t *recv_ill,
    zoneid_t zoneid)
{
	queue_t		*rq;
	mblk_t		*mp1, *first_mp1;
	uint_t		protocol = ipha->ipha_protocol;
	ipaddr_t	dst;
	boolean_t	one_only;
	mblk_t		*first_mp = mp;
	boolean_t	secure;
	uint32_t	ill_index;
	conn_t		*connp, *first_connp, *next_connp;
	connf_t		*connfp;
	boolean_t	shared_addr;

	if (mctl_present) {
		mp = first_mp->b_cont;
		secure = ipsec_in_is_secure(first_mp);
		ASSERT(mp != NULL);
	} else {
		secure = B_FALSE;
	}
	dst = ipha->ipha_dst;
	/*
	 * If the packet was tunneled and not multicast we only send to it
	 * the first match.
	 */
	one_only = ((protocol == IPPROTO_ENCAP || protocol == IPPROTO_IPV6) &&
	    !CLASSD(dst));

	shared_addr = (zoneid == ALL_ZONES);
	if (shared_addr) {
		/*
		 * We don't allow multilevel ports for raw IP, so no need to
		 * check for that here.
		 */
		zoneid = tsol_packet_to_zoneid(mp);
	}

	connfp = &ipcl_proto_fanout[protocol];
	mutex_enter(&connfp->connf_lock);
	/*
	 * NOTE(review): this assignment is redundant — the for-loop
	 * initializer immediately below repeats it.
	 */
	connp = connfp->connf_head;
	for (connp = connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		if (IPCL_PROTO_MATCH(connp, protocol, ipha, ill, flags,
		    zoneid) &&
		    (!is_system_labeled() ||
		    tsol_receive_local(mp, &dst, IPV4_VERSION, shared_addr,
		    connp)))
			break;
	}

	if (connp == NULL || connp->conn_upq == NULL) {
		/*
		 * No one bound to these addresses. Is
		 * there a client that wants all
		 * unclaimed datagrams?
		 */
		mutex_exit(&connfp->connf_lock);
		/*
		 * Check for IPPROTO_ENCAP...
		 */
		if (protocol == IPPROTO_ENCAP && ip_g_mrouter) {
			/*
			 * XXX If an IPsec mblk is here on a multicast
			 * tunnel (using ip_mroute stuff), what should
			 * I do?
			 *
			 * For now, just free the IPsec mblk before
			 * passing it up to the multicast routing
			 * stuff.
			 *
			 * BTW, If I match a configured IP-in-IP
			 * tunnel, ip_mroute_decap will never be
			 * called.
			 */
			if (mp != first_mp)
				freeb(first_mp);
			ip_mroute_decap(q, mp);
		} else {
			/*
			 * Otherwise send an ICMP protocol unreachable.
			 */
			if (ip_fanout_send_icmp(q, first_mp, flags,
			    ICMP_DEST_UNREACHABLE, ICMP_PROTOCOL_UNREACHABLE,
			    mctl_present, zoneid)) {
				BUMP_MIB(&ip_mib, ipInUnknownProtos);
			}
		}
		return;
	}
	CONN_INC_REF(connp);
	first_connp = connp;

	/*
	 * Only send message to one tunnel driver by immediately
	 * terminating the loop.
	 */
	connp = one_only ? NULL : connp->conn_next;

	/*
	 * Deliver a copy to every additional matching conn; the original
	 * message is reserved for the last match (first_connp) below.
	 */
	for (;;) {
		while (connp != NULL) {
			if (IPCL_PROTO_MATCH(connp, protocol, ipha, ill,
			    flags, zoneid) &&
			    (!is_system_labeled() ||
			    tsol_receive_local(mp, &dst, IPV4_VERSION,
			    shared_addr, connp)))
				break;
			connp = connp->conn_next;
		}

		/*
		 * Copy the packet.
		 */
		if (connp == NULL || connp->conn_upq == NULL ||
		    (((first_mp1 = dupmsg(first_mp)) == NULL) &&
		    ((first_mp1 = ip_copymsg(first_mp)) == NULL))) {
			/*
			 * No more interested clients or memory
			 * allocation failed
			 */
			connp = first_connp;
			break;
		}
		mp1 = mctl_present ? first_mp1->b_cont : first_mp1;
		CONN_INC_REF(connp);
		mutex_exit(&connfp->connf_lock);
		rq = connp->conn_rq;
		if (!canputnext(rq)) {
			/* Upstream is flow-controlled; count and drop. */
			if (flags & IP_FF_RAWIP) {
				BUMP_MIB(&ip_mib, rawipInOverflows);
			} else {
				BUMP_MIB(&icmp_mib, icmpInOverflows);
			}

			freemsg(first_mp1);
		} else {
			if (CONN_INBOUND_POLICY_PRESENT(connp) || secure) {
				first_mp1 = ipsec_check_inbound_policy
				    (first_mp1, connp, ipha, NULL,
				    mctl_present);
			}
			if (first_mp1 != NULL) {
				/*
				 * ip_fanout_proto also gets called from
				 * icmp_inbound_error_fanout, in which case
				 * the msg type is M_CTL. Don't add info
				 * in this case for the time being. In future
				 * when there is a need for knowing the
				 * inbound iface index for ICMP error msgs,
				 * then this can be changed.
				 */
				if ((connp->conn_recvif != 0) &&
				    (mp->b_datap->db_type != M_CTL)) {
					/*
					 * the actual data will be
					 * contained in b_cont upon
					 * successful return of the
					 * following call else
					 * original mblk is returned
					 */
					ASSERT(recv_ill != NULL);
					mp1 = ip_add_info(mp1, recv_ill,
					    IPF_RECVIF);
				}
				BUMP_MIB(&ip_mib, ipInDelivers);
				if (mctl_present)
					freeb(first_mp1);
				putnext(rq, mp1);
			}
		}
		mutex_enter(&connfp->connf_lock);
		/* Follow the next pointer before releasing the conn. */
		next_connp = connp->conn_next;
		CONN_DEC_REF(connp);
		connp = next_connp;
	}

	/* Last one. Send it upstream. */
	mutex_exit(&connfp->connf_lock);

	/*
	 * If this packet is coming from icmp_inbound_error_fanout ip_policy
	 * will be set to false.
	 */
	if (IPP_ENABLED(IPP_LOCAL_IN) && ip_policy) {
		ill_index = ill->ill_phyint->phyint_ifindex;
		ip_process(IPP_LOCAL_IN, &mp, ill_index);
		if (mp == NULL) {
			/* Packet dropped by IPQoS; release the last conn. */
			CONN_DEC_REF(connp);
			if (mctl_present) {
				freeb(first_mp);
			}
			return;
		}
	}

	rq = connp->conn_rq;
	if (!canputnext(rq)) {
		if (flags & IP_FF_RAWIP) {
			BUMP_MIB(&ip_mib, rawipInOverflows);
		} else {
			BUMP_MIB(&icmp_mib, icmpInOverflows);
		}

		freemsg(first_mp);
	} else {
		if (CONN_INBOUND_POLICY_PRESENT(connp) || secure) {
			first_mp = ipsec_check_inbound_policy(first_mp, connp,
			    ipha, NULL, mctl_present);
		}
		if (first_mp != NULL) {
			/*
			 * ip_fanout_proto also gets called
			 * from icmp_inbound_error_fanout, in
			 * which case the msg type is M_CTL.
			 * Don't add info in this case for time
			 * being. In future when there is a
			 * need for knowing the inbound iface
			 * index for ICMP error msgs, then this
			 * can be changed
			 */
			if ((connp->conn_recvif != 0) &&
			    (mp->b_datap->db_type != M_CTL)) {
				/*
				 * the actual data will be contained in
				 * b_cont upon successful return
				 * of the following call else original
				 * mblk is returned
				 */
				ASSERT(recv_ill != NULL);
				mp = ip_add_info(mp, recv_ill, IPF_RECVIF);
			}
			BUMP_MIB(&ip_mib, ipInDelivers);
			putnext(rq, mp);
			if (mctl_present)
				freeb(first_mp);
		}
	}
	CONN_DEC_REF(connp);
}

/*
 * Fanout for TCP packets
 * The caller puts <fport, lport> in the ports parameter.
 *
 * IPQoS Notes
 * Before sending it to the client, invoke IPPF processing.
 * Policy processing takes place only if the callout_position, IPP_LOCAL_IN,
 * is enabled. If we get here from icmp_inbound_error_fanout or ip_wput_local
 * ip_policy is false.
 */
static void
ip_fanout_tcp(queue_t *q, mblk_t *mp, ill_t *recv_ill, ipha_t *ipha,
    uint_t flags, boolean_t mctl_present, boolean_t ip_policy, zoneid_t zoneid)
{
	mblk_t		*first_mp;
	boolean_t	secure;
	uint32_t	ill_index;
	int		ip_hdr_len;
	tcph_t		*tcph;
	boolean_t	syn_present = B_FALSE;
	conn_t		*connp;

	first_mp = mp;
	if (mctl_present) {
		ASSERT(first_mp->b_datap->db_type == M_CTL);
		mp = first_mp->b_cont;
		secure = ipsec_in_is_secure(first_mp);
		ASSERT(mp != NULL);
	} else {
		secure = B_FALSE;
	}

	ip_hdr_len = IPH_HDR_LENGTH(mp->b_rptr);

	if ((connp = ipcl_classify_v4(mp, IPPROTO_TCP, ip_hdr_len, zoneid)) ==
	    NULL) {
		/*
		 * No connected connection or listener. Send a
		 * TH_RST via tcp_xmit_listeners_reset.
		 */

		/* Initiate IPPf processing, if needed. */
		if (IPP_ENABLED(IPP_LOCAL_IN)) {
			uint32_t ill_index;
			ill_index = recv_ill->ill_phyint->phyint_ifindex;
			ip_process(IPP_LOCAL_IN, &first_mp, ill_index);
			if (first_mp == NULL)
				return;
		}
		BUMP_MIB(&ip_mib, ipInDelivers);
		ip2dbg(("ip_fanout_tcp: no listener; send reset to zone %d\n",
		    zoneid));
		tcp_xmit_listeners_reset(first_mp, ip_hdr_len);
		return;
	}

	/*
	 * Allocate the SYN for the TCP connection here itself
	 */
	tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len];
	if ((tcph->th_flags[0] & (TH_SYN|TH_ACK|TH_RST|TH_URG)) == TH_SYN) {
		if (IPCL_IS_TCP(connp)) {
			squeue_t *sqp;

			/*
			 * For fused tcp loopback, assign the eager's
			 * squeue to be that of the active connect's.
			 * Note that we don't check for IP_FF_LOOPBACK
			 * here since this routine gets called only
			 * for loopback (unlike the IPv6 counterpart).
			 */
			ASSERT(Q_TO_CONN(q) != NULL);
			if (do_tcp_fusion &&
			    !CONN_INBOUND_POLICY_PRESENT(connp) && !secure &&
			    !IPP_ENABLED(IPP_LOCAL_IN) && !ip_policy &&
			    IPCL_IS_TCP(Q_TO_CONN(q))) {
				ASSERT(Q_TO_CONN(q)->conn_sqp != NULL);
				sqp = Q_TO_CONN(q)->conn_sqp;
			} else {
				sqp = IP_SQUEUE_GET(lbolt);
			}

			mp->b_datap->db_struioflag |= STRUIO_EAGER;
			DB_CKSUMSTART(mp) = (intptr_t)sqp;
			syn_present = B_TRUE;
		}
	}

	if (IPCL_IS_TCP(connp) && IPCL_IS_BOUND(connp) && !syn_present) {
		/*
		 * NOTE(review): this local 'flags' intentionally shadows the
		 * function's 'flags' parameter; within this block only the
		 * TCP header flags are examined.
		 */
		uint_t	flags = (unsigned int)tcph->th_flags[0] & 0xFF;
		if ((flags & TH_RST) || (flags & TH_URG)) {
			CONN_DEC_REF(connp);
			freemsg(first_mp);
			return;
		}
		if (flags & TH_ACK) {
			tcp_xmit_listeners_reset(first_mp, ip_hdr_len);
			CONN_DEC_REF(connp);
			return;
		}

		CONN_DEC_REF(connp);
		freemsg(first_mp);
		return;
	}

	if (CONN_INBOUND_POLICY_PRESENT(connp) || secure) {
		first_mp = ipsec_check_inbound_policy(first_mp, connp, ipha,
		    NULL, mctl_present);
		if (first_mp == NULL) {
			CONN_DEC_REF(connp);
			return;
		}
		if (IPCL_IS_TCP(connp) && IPCL_IS_BOUND(connp)) {
			ASSERT(syn_present);
			if (mctl_present) {
				ASSERT(first_mp != mp);
				first_mp->b_datap->db_struioflag |=
				    STRUIO_POLICY;
			} else {
				ASSERT(first_mp == mp);
				mp->b_datap->db_struioflag &=
				    ~STRUIO_EAGER;
				mp->b_datap->db_struioflag |=
				    STRUIO_POLICY;
			}
		} else {
			/*
			 * Discard first_mp early since we're dealing with a
			 * fully-connected conn_t and tcp doesn't do policy in
			 * this case.
			 */
			if (mctl_present) {
				freeb(first_mp);
				mctl_present = B_FALSE;
			}
			first_mp = mp;
		}
	}

	/*
	 * Initiate policy processing here if needed. If we get here from
	 * icmp_inbound_error_fanout, ip_policy is false.
	 */
	if (IPP_ENABLED(IPP_LOCAL_IN) && ip_policy) {
		ill_index = recv_ill->ill_phyint->phyint_ifindex;
		ip_process(IPP_LOCAL_IN, &mp, ill_index);
		if (mp == NULL) {
			CONN_DEC_REF(connp);
			if (mctl_present)
				freeb(first_mp);
			return;
		} else if (mctl_present) {
			ASSERT(first_mp != mp);
			first_mp->b_cont = mp;
		} else {
			first_mp = mp;
		}
	}

	/* Handle IPv6 socket options. */
	if (!syn_present &&
	    connp->conn_ipv6_recvpktinfo && (flags & IP_FF_IP6INFO)) {
		/* Add header */
		ASSERT(recv_ill != NULL);
		mp = ip_add_info(mp, recv_ill, IPF_RECVIF);
		if (mp == NULL) {
			CONN_DEC_REF(connp);
			if (mctl_present)
				freeb(first_mp);
			return;
		} else if (mctl_present) {
			/*
			 * ip_add_info might return a new mp.
			 */
			ASSERT(first_mp != mp);
			first_mp->b_cont = mp;
		} else {
			first_mp = mp;
		}
	}

	BUMP_MIB(&ip_mib, ipInDelivers);
	if (IPCL_IS_TCP(connp)) {
		/* TCP conns go through the squeue; it drops the conn ref. */
		(*ip_input_proc)(connp->conn_sqp, first_mp,
		    connp->conn_recv, connp, SQTAG_IP_FANOUT_TCP);
	} else {
		putnext(connp->conn_rq, first_mp);
		CONN_DEC_REF(connp);
	}
}

/*
 * Deliver a udp packet to the given conn, possibly applying ipsec policy.
 * We are responsible for disposing of mp, such as by freemsg() or putnext()
 * Caller is responsible for dropping references to the conn, and freeing
 * first_mp.
 *
 * IPQoS Notes
 * Before sending it to the client, invoke IPPF processing. Policy processing
 * takes place only if the callout_position, IPP_LOCAL_IN, is enabled and
 * ip_policy is true. If we get here from icmp_inbound_error_fanout or
 * ip_wput_local, ip_policy is false.
 */
static void
ip_fanout_udp_conn(conn_t *connp, mblk_t *first_mp, mblk_t *mp,
    boolean_t secure, ipha_t *ipha, uint_t flags, ill_t *recv_ill,
    boolean_t ip_policy)
{
	boolean_t mctl_present = (first_mp != NULL);
	uint32_t in_flags = 0; /* set to IP_RECVSLLA and/or IP_RECVIF */
	uint32_t ill_index;

	if (mctl_present)
		first_mp->b_cont = mp;
	else
		first_mp = mp;

	if (CONN_UDP_FLOWCTLD(connp)) {
		BUMP_MIB(&ip_mib, udpInOverflows);
		freemsg(first_mp);
		return;
	}

	if (CONN_INBOUND_POLICY_PRESENT(connp) || secure) {
		first_mp = ipsec_check_inbound_policy(first_mp, connp, ipha,
		    NULL, mctl_present);
		if (first_mp == NULL)
			return;	/* Freed by ipsec_check_inbound_policy(). */
	}
	/* Done with the control mblk; only the data (mp) goes upstream. */
	if (mctl_present)
		freeb(first_mp);

	if (connp->conn_recvif)
		in_flags = IPF_RECVIF;
	if (connp->conn_recvslla && !(flags & IP_FF_SEND_SLLA))
		in_flags |= IPF_RECVSLLA;

	/* Handle IPv6 options. */
	if (connp->conn_ipv6_recvpktinfo && (flags & IP_FF_IP6INFO))
		in_flags |= IPF_RECVIF;

	/*
	 * Initiate IPPF processing here, if needed. Note first_mp won't be
	 * freed if the packet is dropped. The caller will do so.
	 */
	if (IPP_ENABLED(IPP_LOCAL_IN) && ip_policy) {
		ill_index = recv_ill->ill_phyint->phyint_ifindex;
		ip_process(IPP_LOCAL_IN, &mp, ill_index);
		if (mp == NULL) {
			return;
		}
	}
	if ((in_flags != 0) &&
	    (mp->b_datap->db_type != M_CTL)) {
		/*
		 * The actual data will be contained in b_cont
		 * upon successful return of the following call
		 * else original mblk is returned
		 */
		ASSERT(recv_ill != NULL);
		mp = ip_add_info(mp, recv_ill, in_flags);
	}
	BUMP_MIB(&ip_mib, ipInDelivers);

	/* Send it upstream */
	CONN_UDP_RECV(connp, mp);
}

/*
 * Fanout for UDP packets.
 * The caller puts <fport, lport> in the ports parameter.
 *
 * If SO_REUSEADDR is set all multicast and broadcast packets
 * will be delivered to all streams bound to the same port.
 *
 * Zones notes:
 * Multicast and broadcast packets will be distributed to streams in all zones.
 * In the special case where an AF_INET socket binds to 0.0.0.0/<port> and an
 * AF_INET6 socket binds to ::/<port>, only the AF_INET socket receives the IPv4
 * packets. To maintain this behavior with multiple zones, the conns are grouped
 * by zone and the SO_REUSEADDR flag is checked for the first matching conn in
 * each zone. If unset, all the following conns in the same zone are skipped.
 */
static void
ip_fanout_udp(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha,
    uint32_t ports, boolean_t broadcast, uint_t flags, boolean_t mctl_present,
    boolean_t ip_policy, ill_t *recv_ill, zoneid_t zoneid)
{
	uint32_t	dstport, srcport;
	ipaddr_t	dst;
	mblk_t		*first_mp;
	boolean_t	secure;
	in6_addr_t	v6src;
	conn_t		*connp;
	connf_t		*connfp;
	conn_t		*first_connp;
	conn_t		*next_connp;
	mblk_t		*mp1, *first_mp1;
	ipaddr_t	src;
	zoneid_t	last_zoneid;
	boolean_t	reuseaddr;
	boolean_t	shared_addr;

	first_mp = mp;
	if (mctl_present) {
		mp = first_mp->b_cont;
		first_mp->b_cont = NULL;
		secure = ipsec_in_is_secure(first_mp);
		ASSERT(mp != NULL);
	} else {
		first_mp = NULL;
		secure = B_FALSE;
	}

	/*
	 * Extract ports in net byte order.
	 * 'ports' packs <fport, lport>: source port in the high 16 bits,
	 * destination port in the low 16 bits (host order before htons).
	 */
	dstport = htons(ntohl(ports) & 0xFFFF);
	srcport = htons(ntohl(ports) >> 16);
	dst = ipha->ipha_dst;
	src = ipha->ipha_src;

	shared_addr = (zoneid == ALL_ZONES);
	if (shared_addr) {
		/* Multilevel port: pick the zone owning this MLP, if any. */
		zoneid = tsol_mlp_findzone(IPPROTO_UDP, dstport);
		if (zoneid == ALL_ZONES)
			zoneid = tsol_packet_to_zoneid(mp);
	}

	connfp = &ipcl_udp_fanout[IPCL_UDP_HASH(dstport)];
	mutex_enter(&connfp->connf_lock);
	connp = connfp->connf_head;
	if (!broadcast && !CLASSD(dst)) {
		/*
		 * Not broadcast or multicast. Send to the one (first)
		 * client we find. No need to check conn_wantpacket()
		 * since IP_BOUND_IF/conn_incoming_ill does not apply to
		 * IPv4 unicast packets.
		 */
		while ((connp != NULL) &&
		    (!IPCL_UDP_MATCH(connp, dstport, dst,
		    srcport, src) || connp->conn_zoneid != zoneid)) {
			connp = connp->conn_next;
		}

		if (connp == NULL || connp->conn_upq == NULL)
			goto notfound;

		if (is_system_labeled() &&
		    !tsol_receive_local(mp, &dst, IPV4_VERSION, shared_addr,
		    connp))
			goto notfound;

		CONN_INC_REF(connp);
		mutex_exit(&connfp->connf_lock);
		ip_fanout_udp_conn(connp, first_mp, mp, secure, ipha, flags,
		    recv_ill, ip_policy);
		IP_STAT(ip_udp_fannorm);
		CONN_DEC_REF(connp);
		return;
	}

	/*
	 * Broadcast and multicast case
	 *
	 * Need to check conn_wantpacket().
	 * If SO_REUSEADDR has been set on the first we send the
	 * packet to all clients that have joined the group and
	 * match the port.
	 */

	while (connp != NULL) {
		if ((IPCL_UDP_MATCH(connp, dstport, dst, srcport, src)) &&
		    conn_wantpacket(connp, ill, ipha, flags, zoneid) &&
		    (!is_system_labeled() ||
		    tsol_receive_local(mp, &dst, IPV4_VERSION, shared_addr,
		    connp)))
			break;
		connp = connp->conn_next;
	}

	if (connp == NULL || connp->conn_upq == NULL)
		goto notfound;

	first_connp = connp;
	/*
	 * When SO_REUSEADDR is not set, send the packet only to the first
	 * matching connection in its zone by keeping track of the zoneid.
	 */
	reuseaddr = first_connp->conn_reuseaddr;
	last_zoneid = first_connp->conn_zoneid;

	CONN_INC_REF(connp);
	connp = connp->conn_next;
	for (;;) {
		while (connp != NULL) {
			if (IPCL_UDP_MATCH(connp, dstport, dst, srcport, src) &&
			    (reuseaddr || connp->conn_zoneid != last_zoneid) &&
			    conn_wantpacket(connp, ill, ipha, flags, zoneid) &&
			    (!is_system_labeled() ||
			    tsol_receive_local(mp, &dst, IPV4_VERSION,
			    shared_addr, connp)))
				break;
			connp = connp->conn_next;
		}
		/*
		 * Just copy the data part alone. The mctl part is
		 * needed just for verifying policy and it is never
		 * sent up.
		 */
		if (connp == NULL || (((mp1 = dupmsg(mp)) == NULL) &&
		    ((mp1 = copymsg(mp)) == NULL))) {
			/*
			 * No more interested clients or memory
			 * allocation failed
			 */
			connp = first_connp;
			break;
		}
		if (connp->conn_zoneid != last_zoneid) {
			/*
			 * Update the zoneid so that the packet isn't sent to
			 * any more conns in the same zone unless SO_REUSEADDR
			 * is set.
			 */
			reuseaddr = connp->conn_reuseaddr;
			last_zoneid = connp->conn_zoneid;
		}
		if (first_mp != NULL) {
			ASSERT(((ipsec_info_t *)first_mp->b_rptr)->
			    ipsec_info_type == IPSEC_IN);
			first_mp1 = ipsec_in_tag(first_mp, NULL);
			if (first_mp1 == NULL) {
				freemsg(mp1);
				connp = first_connp;
				break;
			}
		} else {
			first_mp1 = NULL;
		}
		CONN_INC_REF(connp);
		mutex_exit(&connfp->connf_lock);
		/*
		 * IPQoS notes: We don't send the packet for policy
		 * processing here, will do it for the last one (below).
		 * i.e. we do it per-packet now, but if we do policy
		 * processing per-conn, then we would need to do it
		 * here too.
		 */
		ip_fanout_udp_conn(connp, first_mp1, mp1, secure,
		    ipha, flags, recv_ill, B_FALSE);
		mutex_enter(&connfp->connf_lock);
		/* Follow the next pointer before releasing the conn. */
		next_connp = connp->conn_next;
		IP_STAT(ip_udp_fanmb);
		CONN_DEC_REF(connp);
		connp = next_connp;
	}

	/* Last one. Send it upstream. */
	mutex_exit(&connfp->connf_lock);
	ip_fanout_udp_conn(connp, first_mp, mp, secure, ipha, flags, recv_ill,
	    ip_policy);
	IP_STAT(ip_udp_fanmb);
	CONN_DEC_REF(connp);
	return;

notfound:

	mutex_exit(&connfp->connf_lock);
	IP_STAT(ip_udp_fanothers);
	/*
	 * IPv6 endpoints bound to unicast or multicast IPv4-mapped addresses
	 * have already been matched above, since they live in the IPv4
	 * fanout tables. This implies we only need to
	 * check for IPv6 in6addr_any endpoints here.
	 * Thus we compare using ipv6_all_zeros instead of the destination
	 * address, except for the multicast group membership lookup which
	 * uses the IPv4 destination.
	 */
	IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src);
	connfp = &ipcl_udp_fanout[IPCL_UDP_HASH(dstport)];
	mutex_enter(&connfp->connf_lock);
	connp = connfp->connf_head;
	if (!broadcast && !CLASSD(dst)) {
		while (connp != NULL) {
			if (IPCL_UDP_MATCH_V6(connp, dstport, ipv6_all_zeros,
			    srcport, v6src) && connp->conn_zoneid == zoneid &&
			    conn_wantpacket(connp, ill, ipha, flags, zoneid) &&
			    !connp->conn_ipv6_v6only)
				break;
			connp = connp->conn_next;
		}

		if (connp != NULL && is_system_labeled() &&
		    !tsol_receive_local(mp, &dst, IPV4_VERSION, shared_addr,
		    connp))
			connp = NULL;

		if (connp == NULL || connp->conn_upq == NULL) {
			/*
			 * No one bound to this port. Is
			 * there a client that wants all
			 * unclaimed datagrams?
			 */
			mutex_exit(&connfp->connf_lock);

			if (mctl_present)
				first_mp->b_cont = mp;
			else
				first_mp = mp;
			if (ipcl_proto_search(IPPROTO_UDP) != NULL) {
				ip_fanout_proto(q, first_mp, ill, ipha,
				    flags | IP_FF_RAWIP, mctl_present,
				    ip_policy, recv_ill, zoneid);
			} else {
				if (ip_fanout_send_icmp(q, first_mp, flags,
				    ICMP_DEST_UNREACHABLE,
				    ICMP_PORT_UNREACHABLE,
				    mctl_present, zoneid)) {
					BUMP_MIB(&ip_mib, udpNoPorts);
				}
			}
			return;
		}

		CONN_INC_REF(connp);
		mutex_exit(&connfp->connf_lock);
		ip_fanout_udp_conn(connp, first_mp, mp, secure, ipha, flags,
		    recv_ill, ip_policy);
		CONN_DEC_REF(connp);
		return;
	}
	/*
	 * IPv4 multicast packet being delivered to an AF_INET6
	 * in6addr_any endpoint.
	 * Need to check conn_wantpacket(). Note that we use conn_wantpacket()
	 * and not conn_wantpacket_v6() since any multicast membership is
	 * for an IPv4-mapped multicast address.
	 * The packet is sent to all clients in all zones that have joined the
	 * group and match the port.
	 */
	while (connp != NULL) {
		if (IPCL_UDP_MATCH_V6(connp, dstport, ipv6_all_zeros,
		    srcport, v6src) &&
		    conn_wantpacket(connp, ill, ipha, flags, zoneid) &&
		    (!is_system_labeled() ||
		    tsol_receive_local(mp, &dst, IPV4_VERSION, shared_addr,
		    connp)))
			break;
		connp = connp->conn_next;
	}

	if (connp == NULL || connp->conn_upq == NULL) {
		/*
		 * No one bound to this port. Is
		 * there a client that wants all
		 * unclaimed datagrams?
		 */
		mutex_exit(&connfp->connf_lock);

		if (mctl_present)
			first_mp->b_cont = mp;
		else
			first_mp = mp;
		if (ipcl_proto_search(IPPROTO_UDP) != NULL) {
			ip_fanout_proto(q, first_mp, ill, ipha,
			    flags | IP_FF_RAWIP, mctl_present, ip_policy,
			    recv_ill, zoneid);
		} else {
			/*
			 * We used to attempt to send an icmp error here, but
			 * since this is known to be a multicast packet
			 * and we don't send icmp errors in response to
			 * multicast, just drop the packet and give up sooner.
			 */
			BUMP_MIB(&ip_mib, udpNoPorts);
			freemsg(first_mp);
		}
		return;
	}

	first_connp = connp;

	CONN_INC_REF(connp);
	connp = connp->conn_next;
	for (;;) {
		while (connp != NULL) {
			if (IPCL_UDP_MATCH_V6(connp, dstport,
			    ipv6_all_zeros, srcport, v6src) &&
			    conn_wantpacket(connp, ill, ipha, flags, zoneid) &&
			    (!is_system_labeled() ||
			    tsol_receive_local(mp, &dst, IPV4_VERSION,
			    shared_addr, connp)))
				break;
			connp = connp->conn_next;
		}
		/*
		 * Just copy the data part alone. The mctl part is
		 * needed just for verifying policy and it is never
		 * sent up.
		 */
		if (connp == NULL || (((mp1 = dupmsg(mp)) == NULL) &&
		    ((mp1 = copymsg(mp)) == NULL))) {
			/*
			 * No more interested clients or memory
			 * allocation failed
			 */
			connp = first_connp;
			break;
		}
		if (first_mp != NULL) {
			ASSERT(((ipsec_info_t *)first_mp->b_rptr)->
			    ipsec_info_type == IPSEC_IN);
			first_mp1 = ipsec_in_tag(first_mp, NULL);
			if (first_mp1 == NULL) {
				freemsg(mp1);
				connp = first_connp;
				break;
			}
		} else {
			first_mp1 = NULL;
		}
		CONN_INC_REF(connp);
		mutex_exit(&connfp->connf_lock);
		/*
		 * IPQoS notes: We don't send the packet for policy
		 * processing here, will do it for the last one (below).
		 * i.e. we do it per-packet now, but if we do policy
		 * processing per-conn, then we would need to do it
		 * here too.
		 */
		ip_fanout_udp_conn(connp, first_mp1, mp1, secure,
		    ipha, flags, recv_ill, B_FALSE);
		mutex_enter(&connfp->connf_lock);
		/* Follow the next pointer before releasing the conn. */
		next_connp = connp->conn_next;
		CONN_DEC_REF(connp);
		connp = next_connp;
	}

	/* Last one. Send it upstream. */
	mutex_exit(&connfp->connf_lock);
	ip_fanout_udp_conn(connp, first_mp, mp, secure, ipha, flags, recv_ill,
	    ip_policy);
	CONN_DEC_REF(connp);
}

/*
 * Complete the ip_wput header so that it
 * is possible to generate ICMP
 * errors.
 * Returns 1 (and leaves the header untouched) when no local source
 * address can be found; 0 on success.
 */
static int
ip_hdr_complete(ipha_t *ipha, zoneid_t zoneid)
{
	ire_t *ire;

	if (ipha->ipha_src == INADDR_ANY) {
		ire = ire_lookup_local(zoneid);
		if (ire == NULL) {
			ip1dbg(("ip_hdr_complete: no source IRE\n"));
			return (1);
		}
		ipha->ipha_src = ire->ire_addr;
		ire_refrele(ire);
	}
	ipha->ipha_ttl = ip_def_ttl;
	ipha->ipha_hdr_checksum = 0;
	ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
	return (0);
}

/*
 * Nobody should be sending
 * packets up this stream
 */
static void
ip_lrput(queue_t *q, mblk_t *mp)
{
	mblk_t *mp1;

	switch (mp->b_datap->db_type) {
	case M_FLUSH:
		/* Turn around */
		if (*mp->b_rptr & FLUSHW) {
			*mp->b_rptr &= ~FLUSHR;
			qreply(q, mp);
			return;
		}
		break;
	}
	/* Could receive messages that passed through ar_rput */
	for (mp1 = mp; mp1; mp1 = mp1->b_cont)
		mp1->b_prev = mp1->b_next = NULL;
	freemsg(mp);
}

/* Nobody should be sending packets down this stream */
/* ARGSUSED */
void
ip_lwput(queue_t *q, mblk_t *mp)
{
	freemsg(mp);
}

/*
 * Move the first hop in any source route to ipha_dst
 * and remove that part of
 * the source route. Called by other protocols. Errors in option formatting
 * are ignored - will be handled by ip_wput_options Return the final
 * destination (either ipha_dst or the last entry in a source route.)
 */
ipaddr_t
ip_massage_options(ipha_t *ipha)
{
	ipoptp_t	opts;
	uchar_t		*opt;
	uint8_t		optval;
	uint8_t		optlen;
	ipaddr_t	dst;
	int		i;
	ire_t		*ire;

	ip2dbg(("ip_massage_options\n"));
	dst = ipha->ipha_dst;
	for (optval = ipoptp_first(&opts, ipha);
	    optval != IPOPT_EOL;
	    optval = ipoptp_next(&opts)) {
		opt = opts.ipoptp_cur;
		switch (optval) {
			/* 'off' is scoped to the whole switch body. */
			uint8_t off;
		case IPOPT_SSRR:
		case IPOPT_LSRR:
			if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
				ip1dbg(("ip_massage_options: bad src route\n"));
				break;
			}
			optlen = opts.ipoptp_len;
			off = opt[IPOPT_OFFSET];
			off--;	/* IPOPT_OFFSET is 1-based; make it 0-based. */
		redo_srr:
			if (optlen < IP_ADDR_LEN ||
			    off > optlen - IP_ADDR_LEN) {
				/* End of source route */
				ip1dbg(("ip_massage_options: end of SR\n"));
				break;
			}
			bcopy((char *)opt + off, &dst, IP_ADDR_LEN);
			ip1dbg(("ip_massage_options: next hop 0x%x\n",
			    ntohl(dst)));
			/*
			 * Check if our address is present more than
			 * once as consecutive hops in source route.
			 * XXX verify per-interface ip_forwarding
			 * for source route?
			 */
			ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, NULL,
			    ALL_ZONES, NULL, MATCH_IRE_TYPE);
			if (ire != NULL) {
				/* Next hop is one of our own; skip it. */
				ire_refrele(ire);
				off += IP_ADDR_LEN;
				goto redo_srr;
			}
			if (dst == htonl(INADDR_LOOPBACK)) {
				ip1dbg(("ip_massage_options: loopback addr in "
				    "source route!\n"));
				break;
			}
			/*
			 * Update ipha_dst to be the first hop and remove the
			 * first hop from the source route (by overwriting
			 * part of the option with NOP options).
			 */
			ipha->ipha_dst = dst;
			/* Put the last entry in dst */
			off = ((optlen - IP_ADDR_LEN - 3) & ~(IP_ADDR_LEN-1)) +
			    3;
			bcopy(&opt[off], &dst, IP_ADDR_LEN);

			ip1dbg(("ip_massage_options: last hop 0x%x\n",
			    ntohl(dst)));
			/* Move down and overwrite */
			opt[IP_ADDR_LEN] = opt[0];
			opt[IP_ADDR_LEN+1] = opt[IPOPT_OLEN] - IP_ADDR_LEN;
			opt[IP_ADDR_LEN+2] = opt[IPOPT_OFFSET];
			for (i = 0; i < IP_ADDR_LEN; i++)
				opt[i] = IPOPT_NOP;
			break;
		}
	}
	return (dst);
}

/*
 * This function's job is to forward data to the reverse tunnel (FA->HA)
 * after doing a few checks. It is assumed that the incoming interface
 * of the packet is always different than the outgoing interface and the
 * ire_type of the found ire has to be a non-resolver type.
 *
 * IPQoS notes
 * IP policy is invoked twice for a forwarded packet, once on the read side
 * and again on the write side if both, IPP_FWD_IN and IPP_FWD_OUT are
 * enabled.
 */
static void
ip_mrtun_forward(ire_t *ire, ill_t *in_ill, mblk_t *mp)
{
	ipha_t	*ipha;
	queue_t	*q;
	uint32_t pkt_len;
/* NOTE(review): rptr appears unused within this function body — confirm */
#define	rptr	((uchar_t *)ipha)
	uint32_t sum;
	uint32_t max_frag;
	mblk_t	*first_mp;
	uint32_t ill_index;

	ASSERT(ire != NULL);
	ASSERT(ire->ire_ipif->ipif_net_type == IRE_IF_NORESOLVER);
	ASSERT(ire->ire_stq != NULL);

	/* Initiate read side IPPF processing */
	if (IPP_ENABLED(IPP_FWD_IN)) {
		ill_index = in_ill->ill_phyint->phyint_ifindex;
		ip_process(IPP_FWD_IN, &mp, ill_index);
		if (mp == NULL) {
			ip2dbg(("ip_mrtun_forward: inbound pkt "
			    "dropped during IPPF processing\n"));
			return;
		}
	}

	/*
	 * Both the inbound and outbound ills must have ILLF_ROUTER set,
	 * and they must be different interfaces.
	 */
	if (((in_ill->ill_flags & ((ill_t *)ire->ire_stq->q_ptr)->ill_flags &
	    ILLF_ROUTER) == 0) ||
	    (in_ill == (ill_t *)ire->ire_stq->q_ptr)) {
		BUMP_MIB(&ip_mib, ipForwProhibits);
		ip0dbg(("ip_mrtun_forward: Can't forward :"
		    "forwarding is not turned on\n"));
		goto drop_pkt;
	}

	/*
	 * Don't forward if the interface is down
	 */
	if (ire->ire_ipif->ipif_ill->ill_ipif_up_count == 0) {
		BUMP_MIB(&ip_mib, ipInDiscards);
		goto drop_pkt;
	}

	ipha = (ipha_t *)mp->b_rptr;
	pkt_len = ntohs(ipha->ipha_length);
	/* Adjust the checksum to reflect the ttl decrement. */
	sum = (int)ipha->ipha_hdr_checksum + IP_HDR_CSUM_TTL_ADJUST;
	ipha->ipha_hdr_checksum = (uint16_t)(sum + (sum >> 16));
	if (ipha->ipha_ttl-- <= 1) {
		/*
		 * TTL expired.  Verify the inbound checksum before
		 * generating the ICMP time-exceeded error.
		 */
		if (ip_csum_hdr(ipha)) {
			BUMP_MIB(&ip_mib, ipInCksumErrs);
			goto drop_pkt;
		}
		q = ire->ire_stq;
		if ((first_mp = allocb(sizeof (ipsec_info_t),
		    BPRI_HI)) == NULL) {
			goto drop_pkt;
		}
		ip_ipsec_out_prepend(first_mp, mp, in_ill);
		icmp_time_exceeded(q, first_mp, ICMP_TTL_EXCEEDED);

		return;
	}

	/* Get the ill_index of the ILL */
	ill_index = ire->ire_ipif->ipif_ill->ill_phyint->phyint_ifindex;

	/*
	 * ip_mrtun_forward is only used by foreign agent to reverse
	 * tunnel the incoming packet. So it does not do any option
	 * processing for source routing.
	 */
	max_frag = ire->ire_max_frag;
	if (pkt_len > max_frag) {
		/*
		 * It needs fragging on its way out.  We haven't
		 * verified the header checksum yet.  Since we
		 * are going to put a surely good checksum in the
		 * outgoing header, we have to make sure that it
		 * was good coming in.
		 */
		if (ip_csum_hdr(ipha)) {
			BUMP_MIB(&ip_mib, ipInCksumErrs);
			goto drop_pkt;
		}

		/* Initiate write side IPPF processing */
		if (IPP_ENABLED(IPP_FWD_OUT)) {
			ip_process(IPP_FWD_OUT, &mp, ill_index);
			if (mp == NULL) {
				ip2dbg(("ip_mrtun_forward: outbound pkt "
				    "dropped/deferred during ip policy "
				    "processing\n"));
				return;
			}
		}
		if ((first_mp = allocb(sizeof (ipsec_info_t),
		    BPRI_HI)) == NULL) {
			goto drop_pkt;
		}
		ip_ipsec_out_prepend(first_mp, mp, in_ill);
		mp = first_mp;

		ip_wput_frag(ire, mp, IB_PKT, max_frag, 0);
		return;
	}

	ip2dbg(("ip_mrtun_forward: ire type (%d)\n", ire->ire_type));

	ASSERT(ire->ire_ipif != NULL);

	mp = ip_wput_attach_llhdr(mp, ire, IPP_FWD_OUT, ill_index);
	if (mp == NULL) {
		BUMP_MIB(&ip_mib, ipInDiscards);
		return;
	}

	/* Now send the packet to the tunnel interface */
	q = ire->ire_stq;
	UPDATE_IB_PKT_COUNT(ire);
	ire->ire_last_used_time = lbolt;
	BUMP_MIB(&ip_mib, ipForwDatagrams);
	putnext(q, mp);
	ip2dbg(("ip_mrtun_forward: sent packet to ill %p\n", q->q_ptr));
	return;

drop_pkt:;
	ip2dbg(("ip_mrtun_forward: dropping pkt\n"));
	freemsg(mp);
#undef rptr
}

/*
 * Fills the ipsec_out_t data structure with appropriate fields and
 * prepends it to mp which contains the IP hdr + data that was meant
 * to be forwarded. Please note that ipsec_out_info data structure
 * is used here to communicate the outgoing ill path at ip_wput()
 * for the ICMP error packet. This has nothing to do with ipsec IP
 * security. ipsec_out_t is really used to pass the info to the module
 * IP where this information cannot be extracted from conn.
 * This functions is called by ip_mrtun_forward().
 */
void
ip_ipsec_out_prepend(mblk_t *first_mp, mblk_t *mp, ill_t *xmit_ill)
{
	ipsec_out_t	*io;

	ASSERT(xmit_ill != NULL);
	first_mp->b_datap->db_type = M_CTL;
	first_mp->b_wptr += sizeof (ipsec_info_t);
	/*
	 * This is to pass info to ip_wput in absence of conn.
	 * ipsec_out_secure will be B_FALSE because of this.
	 * Thus ipsec_out_secure being B_FALSE indicates that
	 * this is not IPSEC security related information.
	 */
	bzero(first_mp->b_rptr, sizeof (ipsec_info_t));
	io = (ipsec_out_t *)first_mp->b_rptr;
	io->ipsec_out_type = IPSEC_OUT;
	io->ipsec_out_len = sizeof (ipsec_out_t);
	first_mp->b_cont = mp;
	io->ipsec_out_ill_index =
	    xmit_ill->ill_phyint->phyint_ifindex;
	io->ipsec_out_xmit_if = B_TRUE;
}

/*
 * Return the network mask
 * associated with the specified address.
 * Classful interpretation: class A/B/C prefixes get /8, /16, /24
 * masks; class D (multicast) gets a leading 0xF0 byte; anything
 * else (experimental) gets an all-zero mask.  Byte-wise stores keep
 * this endian-independent.
 */
ipaddr_t
ip_net_mask(ipaddr_t addr)
{
	uchar_t	*up = (uchar_t *)&addr;
	ipaddr_t mask = 0;
	uchar_t	*maskp = (uchar_t *)&mask;

#if defined(__i386) || defined(__amd64)
#define	TOTALLY_BRAIN_DAMAGED_C_COMPILER
#endif
#ifdef	TOTALLY_BRAIN_DAMAGED_C_COMPILER
	/* Work around compilers that mishandle the = 0 initializer above. */
	maskp[0] = maskp[1] = maskp[2] = maskp[3] = 0;
#endif
	if (CLASSD(addr)) {
		maskp[0] = 0xF0;
		return (mask);
	}
	if (addr == 0)
		return (0);
	maskp[0] = 0xFF;
	if ((up[0] & 0x80) == 0)
		return (mask);		/* Class A: /8 */

	maskp[1] = 0xFF;
	if ((up[0] & 0xC0) == 0x80)
		return (mask);		/* Class B: /16 */

	maskp[2] = 0xFF;
	if ((up[0] & 0xE0) == 0xC0)
		return (mask);		/* Class C: /24 */

	/* Must be experimental or multicast, indicate as much */
	return ((ipaddr_t)0);
}

/*
 * Select an ill for the packet by considering load spreading across
 * a different ill in the group if dst_ill is part of some group.
 * Returns a held ill, or NULL.
 */
static ill_t *
ip_newroute_get_dst_ill(ill_t *dst_ill)
{
	ill_t *ill;

	/*
	 * We schedule irrespective of whether the source address is
	 * INADDR_ANY or not. illgrp_scheduler returns a held ill.
	 */
	ill = illgrp_scheduler(dst_ill);
	if (ill == NULL)
		return (NULL);

	/*
	 * For groups with names ip_sioctl_groupname ensures that all
	 * ills are of same type. For groups without names, ifgrp_insert
	 * ensures this.
	 */
	ASSERT(dst_ill->ill_type == ill->ill_type);

	return (ill);
}

/*
 * Helper function for the IPIF_NOFAILOVER/ATTACH_IF interface attachment case.
 * Looks up the ill by ifindex and returns it held.  On failure (no such
 * ill, or the phyint is offline) the packet in first_mp is freed, the
 * appropriate out-discard MIB counter is bumped, and NULL is returned.
 */
ill_t *
ip_grab_attach_ill(ill_t *ill, mblk_t *first_mp, int ifindex, boolean_t isv6)
{
	ill_t *ret_ill;

	ASSERT(ifindex != 0);
	ret_ill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL, NULL);
	if (ret_ill == NULL ||
	    (ret_ill->ill_phyint->phyint_flags & PHYI_OFFLINE)) {
		if (isv6) {
			if (ill != NULL) {
				BUMP_MIB(ill->ill_ip6_mib, ipv6OutDiscards);
			} else {
				BUMP_MIB(&ip6_mib, ipv6OutDiscards);
			}
			ip1dbg(("ip_grab_attach_ill (IPv6): "
			    "bad ifindex %d.\n", ifindex));
		} else {
			BUMP_MIB(&ip_mib, ipOutDiscards);
			ip1dbg(("ip_grab_attach_ill (IPv4): "
			    "bad ifindex %d.\n", ifindex));
		}
		if (ret_ill != NULL)
			ill_refrele(ret_ill);
		freemsg(first_mp);
		return (NULL);
	}

	return (ret_ill);
}

/*
 * IPv4 -
 * ip_newroute is called by ip_rput or ip_wput whenever we need to send
 * out a packet to a destination address for which we do not have specific
 * (or sufficient) routing information.
 *
 * NOTE : These are the scopes of some of the variables that point at IRE,
 *	  which needs to be followed while making any future modifications
 *	  to avoid memory leaks.
6955 * 6956 * - ire and sire are the entries looked up initially by 6957 * ire_ftable_lookup. 6958 * - ipif_ire is used to hold the interface ire associated with 6959 * the new cache ire. But it's scope is limited, so we always REFRELE 6960 * it before branching out to error paths. 6961 * - save_ire is initialized before ire_create, so that ire returned 6962 * by ire_create will not over-write the ire. We REFRELE save_ire 6963 * before breaking out of the switch. 6964 * 6965 * Thus on failures, we have to REFRELE only ire and sire, if they 6966 * are not NULL. 6967 */ 6968 void 6969 ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, ill_t *in_ill, conn_t *connp) 6970 { 6971 areq_t *areq; 6972 ipaddr_t gw = 0; 6973 ire_t *ire = NULL; 6974 mblk_t *res_mp; 6975 ipaddr_t *addrp; 6976 ipaddr_t nexthop_addr; 6977 ipif_t *src_ipif = NULL; 6978 ill_t *dst_ill = NULL; 6979 ipha_t *ipha; 6980 ire_t *sire = NULL; 6981 mblk_t *first_mp; 6982 ire_t *save_ire; 6983 mblk_t *dlureq_mp; 6984 ill_t *attach_ill = NULL; /* Bind to IPIF_NOFAILOVER address */ 6985 ushort_t ire_marks = 0; 6986 boolean_t mctl_present; 6987 ipsec_out_t *io; 6988 mblk_t *saved_mp; 6989 ire_t *first_sire = NULL; 6990 mblk_t *copy_mp = NULL; 6991 mblk_t *xmit_mp = NULL; 6992 ipaddr_t save_dst; 6993 uint32_t multirt_flags = 6994 MULTIRT_CACHEGW | MULTIRT_USESTAMP | MULTIRT_SETSTAMP; 6995 boolean_t multirt_is_resolvable; 6996 boolean_t multirt_resolve_next; 6997 boolean_t do_attach_ill = B_FALSE; 6998 boolean_t ip_nexthop = B_FALSE; 6999 zoneid_t zoneid; 7000 tsol_ire_gw_secattr_t *attrp = NULL; 7001 tsol_gcgrp_t *gcgrp = NULL; 7002 tsol_gcgrp_addr_t ga; 7003 7004 if (ip_debug > 2) { 7005 /* ip1dbg */ 7006 pr_addr_dbg("ip_newroute: dst %s\n", AF_INET, &dst); 7007 } 7008 7009 EXTRACT_PKT_MP(mp, first_mp, mctl_present); 7010 if (mctl_present) { 7011 io = (ipsec_out_t *)first_mp->b_rptr; 7012 zoneid = io->ipsec_out_zoneid; 7013 ASSERT(zoneid != ALL_ZONES); 7014 } else if (connp != NULL) { 7015 zoneid = 
connp->conn_zoneid; 7016 } else { 7017 zoneid = GLOBAL_ZONEID; 7018 } 7019 7020 ipha = (ipha_t *)mp->b_rptr; 7021 7022 /* All multicast lookups come through ip_newroute_ipif() */ 7023 if (CLASSD(dst)) { 7024 ip0dbg(("ip_newroute: CLASSD 0x%x (b_prev %p, b_next %p)\n", 7025 ntohl(dst), (void *)mp->b_prev, (void *)mp->b_next)); 7026 freemsg(first_mp); 7027 return; 7028 } 7029 7030 if (ip_loopback_src_or_dst(ipha, NULL)) { 7031 goto icmp_err_ret; 7032 } 7033 7034 if (mctl_present && io->ipsec_out_attach_if) { 7035 /* ip_grab_attach_ill returns a held ill */ 7036 attach_ill = ip_grab_attach_ill(NULL, first_mp, 7037 io->ipsec_out_ill_index, B_FALSE); 7038 7039 /* Failure case frees things for us. */ 7040 if (attach_ill == NULL) 7041 return; 7042 7043 /* 7044 * Check if we need an ire that will not be 7045 * looked up by anybody else i.e. HIDDEN. 7046 */ 7047 if (ill_is_probeonly(attach_ill)) 7048 ire_marks = IRE_MARK_HIDDEN; 7049 } 7050 if (mctl_present && io->ipsec_out_ip_nexthop) { 7051 ip_nexthop = B_TRUE; 7052 nexthop_addr = io->ipsec_out_nexthop_addr; 7053 } 7054 /* 7055 * If this IRE is created for forwarding or it is not for 7056 * traffic for congestion controlled protocols, mark it as temporary. 7057 */ 7058 if (mp->b_prev != NULL || !IP_FLOW_CONTROLLED_ULP(ipha->ipha_protocol)) 7059 ire_marks |= IRE_MARK_TEMPORARY; 7060 7061 /* 7062 * Get what we can from ire_ftable_lookup which will follow an IRE 7063 * chain until it gets the most specific information available. 7064 * For example, we know that there is no IRE_CACHE for this dest, 7065 * but there may be an IRE_OFFSUBNET which specifies a gateway. 7066 * ire_ftable_lookup will look up the gateway, etc. 7067 * Check if in_ill != NULL. If it is true, the packet must be 7068 * from an incoming interface where RTA_SRCIFP is set. 7069 * Otherwise, given ire_ftable_lookup algorithm, only one among routes 7070 * to the destination, of equal netmask length in the forward table, 7071 * will be recursively explored. 
If no information is available 7072 * for the final gateway of that route, we force the returned ire 7073 * to be equal to sire using MATCH_IRE_PARENT. 7074 * At least, in this case we have a starting point (in the buckets) 7075 * to look for other routes to the destination in the forward table. 7076 * This is actually used only for multirouting, where a list 7077 * of routes has to be processed in sequence. 7078 */ 7079 if (in_ill != NULL) { 7080 ire = ire_srcif_table_lookup(dst, IRE_IF_RESOLVER, NULL, 7081 in_ill, MATCH_IRE_TYPE); 7082 } else if (ip_nexthop) { 7083 /* 7084 * The first time we come here, we look for an IRE_INTERFACE 7085 * entry for the specified nexthop, set the dst to be the 7086 * nexthop address and create an IRE_CACHE entry for the 7087 * nexthop. The next time around, we are able to find an 7088 * IRE_CACHE entry for the nexthop, set the gateway to be the 7089 * nexthop address and create an IRE_CACHE entry for the 7090 * destination address via the specified nexthop. 7091 */ 7092 ire = ire_cache_lookup(nexthop_addr, zoneid, 7093 MBLK_GETLABEL(mp)); 7094 if (ire != NULL) { 7095 gw = nexthop_addr; 7096 ire_marks |= IRE_MARK_PRIVATE_ADDR; 7097 } else { 7098 ire = ire_ftable_lookup(nexthop_addr, 0, 0, 7099 IRE_INTERFACE, NULL, NULL, zoneid, 0, 7100 MBLK_GETLABEL(mp), 7101 MATCH_IRE_TYPE | MATCH_IRE_SECATTR); 7102 if (ire != NULL) { 7103 dst = nexthop_addr; 7104 } 7105 } 7106 } else if (attach_ill == NULL) { 7107 ire = ire_ftable_lookup(dst, 0, 0, 0, 7108 NULL, &sire, zoneid, 0, MBLK_GETLABEL(mp), 7109 MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | 7110 MATCH_IRE_RJ_BHOLE | MATCH_IRE_PARENT | 7111 MATCH_IRE_SECATTR); 7112 } else { 7113 /* 7114 * attach_ill is set only for communicating with 7115 * on-link hosts. So, don't look for DEFAULT. 
7116 */ 7117 ipif_t *attach_ipif; 7118 7119 attach_ipif = ipif_get_next_ipif(NULL, attach_ill); 7120 if (attach_ipif == NULL) { 7121 ill_refrele(attach_ill); 7122 goto icmp_err_ret; 7123 } 7124 ire = ire_ftable_lookup(dst, 0, 0, 0, attach_ipif, 7125 &sire, zoneid, 0, MBLK_GETLABEL(mp), 7126 MATCH_IRE_RJ_BHOLE | MATCH_IRE_ILL | 7127 MATCH_IRE_SECATTR); 7128 ipif_refrele(attach_ipif); 7129 } 7130 ip3dbg(("ip_newroute: ire_ftable_lookup() " 7131 "returned ire %p, sire %p\n", (void *)ire, (void *)sire)); 7132 7133 /* 7134 * This loop is run only once in most cases. 7135 * We loop to resolve further routes only when the destination 7136 * can be reached through multiple RTF_MULTIRT-flagged ires. 7137 */ 7138 do { 7139 /* Clear the previous iteration's values */ 7140 if (src_ipif != NULL) { 7141 ipif_refrele(src_ipif); 7142 src_ipif = NULL; 7143 } 7144 if (dst_ill != NULL) { 7145 ill_refrele(dst_ill); 7146 dst_ill = NULL; 7147 } 7148 7149 multirt_resolve_next = B_FALSE; 7150 /* 7151 * We check if packets have to be multirouted. 7152 * In this case, given the current <ire, sire> couple, 7153 * we look for the next suitable <ire, sire>. 7154 * This check is done in ire_multirt_lookup(), 7155 * which applies various criteria to find the next route 7156 * to resolve. ire_multirt_lookup() leaves <ire, sire> 7157 * unchanged if it detects it has not been tried yet. 
7158 */ 7159 if ((sire != NULL) && (sire->ire_flags & RTF_MULTIRT)) { 7160 ip3dbg(("ip_newroute: starting next_resolution " 7161 "with first_mp %p, tag %d\n", 7162 (void *)first_mp, 7163 MULTIRT_DEBUG_TAGGED(first_mp))); 7164 7165 ASSERT(sire != NULL); 7166 multirt_is_resolvable = 7167 ire_multirt_lookup(&ire, &sire, multirt_flags, 7168 MBLK_GETLABEL(mp)); 7169 7170 ip3dbg(("ip_newroute: multirt_is_resolvable %d, " 7171 "ire %p, sire %p\n", 7172 multirt_is_resolvable, 7173 (void *)ire, (void *)sire)); 7174 7175 if (!multirt_is_resolvable) { 7176 /* 7177 * No more multirt route to resolve; give up 7178 * (all routes resolved or no more 7179 * resolvable routes). 7180 */ 7181 if (ire != NULL) { 7182 ire_refrele(ire); 7183 ire = NULL; 7184 } 7185 } else { 7186 ASSERT(sire != NULL); 7187 ASSERT(ire != NULL); 7188 /* 7189 * We simply use first_sire as a flag that 7190 * indicates if a resolvable multirt route 7191 * has already been found. 7192 * If it is not the case, we may have to send 7193 * an ICMP error to report that the 7194 * destination is unreachable. 7195 * We do not IRE_REFHOLD first_sire. 7196 */ 7197 if (first_sire == NULL) { 7198 first_sire = sire; 7199 } 7200 } 7201 } 7202 if (ire == NULL) { 7203 if (ip_debug > 3) { 7204 /* ip2dbg */ 7205 pr_addr_dbg("ip_newroute: " 7206 "can't resolve %s\n", AF_INET, &dst); 7207 } 7208 ip3dbg(("ip_newroute: " 7209 "ire %p, sire %p, first_sire %p\n", 7210 (void *)ire, (void *)sire, (void *)first_sire)); 7211 7212 if (sire != NULL) { 7213 ire_refrele(sire); 7214 sire = NULL; 7215 } 7216 7217 if (first_sire != NULL) { 7218 /* 7219 * At least one multirt route has been found 7220 * in the same call to ip_newroute(); 7221 * there is no need to report an ICMP error. 7222 * first_sire was not IRE_REFHOLDed. 
7223 */ 7224 MULTIRT_DEBUG_UNTAG(first_mp); 7225 freemsg(first_mp); 7226 return; 7227 } 7228 ip_rts_change(RTM_MISS, dst, 0, 0, 0, 0, 0, 0, 7229 RTA_DST); 7230 if (attach_ill != NULL) 7231 ill_refrele(attach_ill); 7232 goto icmp_err_ret; 7233 } 7234 7235 /* 7236 * When RTA_SRCIFP is used to add a route, then an interface 7237 * route is added in the source interface's routing table. 7238 * If the outgoing interface of this route is of type 7239 * IRE_IF_RESOLVER, then upon creation of the ire, 7240 * ire_dlureq_mp is set to NULL. Later, when this route is 7241 * first used for forwarding packet, ip_newroute() is called 7242 * to resolve the hardware address of the outgoing ipif. 7243 * We do not come here for IRE_IF_NORESOLVER entries in the 7244 * source interface based table. We only come here if the 7245 * outgoing interface is a resolver interface and we don't 7246 * have the ire_dlureq_mp information yet. 7247 * If in_ill is not null that means it is called from 7248 * ip_rput. 7249 */ 7250 7251 ASSERT(ire->ire_in_ill == NULL || 7252 (ire->ire_type == IRE_IF_RESOLVER && 7253 ire->ire_dlureq_mp == NULL)); 7254 7255 /* 7256 * Verify that the returned IRE does not have either 7257 * the RTF_REJECT or RTF_BLACKHOLE flags set and that the IRE is 7258 * either an IRE_CACHE, IRE_IF_NORESOLVER or IRE_IF_RESOLVER. 7259 */ 7260 if ((ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) || 7261 (ire->ire_type & (IRE_CACHE | IRE_INTERFACE)) == 0) { 7262 if (attach_ill != NULL) 7263 ill_refrele(attach_ill); 7264 goto icmp_err_ret; 7265 } 7266 /* 7267 * Increment the ire_ob_pkt_count field for ire if it is an 7268 * INTERFACE (IF_RESOLVER or IF_NORESOLVER) IRE type, and 7269 * increment the same for the parent IRE, sire, if it is some 7270 * sort of prefix IRE (which includes DEFAULT, PREFIX, HOST 7271 * and HOST_REDIRECT). 
7272 */ 7273 if ((ire->ire_type & IRE_INTERFACE) != 0) { 7274 UPDATE_OB_PKT_COUNT(ire); 7275 ire->ire_last_used_time = lbolt; 7276 } 7277 7278 if (sire != NULL) { 7279 gw = sire->ire_gateway_addr; 7280 ASSERT((sire->ire_type & (IRE_CACHETABLE | 7281 IRE_INTERFACE)) == 0); 7282 UPDATE_OB_PKT_COUNT(sire); 7283 sire->ire_last_used_time = lbolt; 7284 } 7285 /* 7286 * We have a route to reach the destination. 7287 * 7288 * 1) If the interface is part of ill group, try to get a new 7289 * ill taking load spreading into account. 7290 * 7291 * 2) After selecting the ill, get a source address that 7292 * might create good inbound load spreading. 7293 * ipif_select_source does this for us. 7294 * 7295 * If the application specified the ill (ifindex), we still 7296 * load spread. Only if the packets needs to go out 7297 * specifically on a given ill e.g. binding to 7298 * IPIF_NOFAILOVER address, then we don't try to use a 7299 * different ill for load spreading. 7300 */ 7301 if (attach_ill == NULL) { 7302 /* 7303 * Don't perform outbound load spreading in the 7304 * case of an RTF_MULTIRT route, as we actually 7305 * typically want to replicate outgoing packets 7306 * through particular interfaces. 7307 */ 7308 if ((sire != NULL) && (sire->ire_flags & RTF_MULTIRT)) { 7309 dst_ill = ire->ire_ipif->ipif_ill; 7310 /* for uniformity */ 7311 ill_refhold(dst_ill); 7312 } else { 7313 /* 7314 * If we are here trying to create an IRE_CACHE 7315 * for an offlink destination and have the 7316 * IRE_CACHE for the next hop and the latter is 7317 * using virtual IP source address selection i.e 7318 * it's ire->ire_ipif is pointing to a virtual 7319 * network interface (vni) then 7320 * ip_newroute_get_dst_ll() will return the vni 7321 * interface as the dst_ill. Since the vni is 7322 * virtual i.e not associated with any physical 7323 * interface, it cannot be the dst_ill, hence 7324 * in such a case call ip_newroute_get_dst_ll() 7325 * with the stq_ill instead of the ire_ipif ILL. 
7326 * The function returns a refheld ill. 7327 */ 7328 if ((ire->ire_type == IRE_CACHE) && 7329 IS_VNI(ire->ire_ipif->ipif_ill)) 7330 dst_ill = ip_newroute_get_dst_ill( 7331 ire->ire_stq->q_ptr); 7332 else 7333 dst_ill = ip_newroute_get_dst_ill( 7334 ire->ire_ipif->ipif_ill); 7335 } 7336 if (dst_ill == NULL) { 7337 if (ip_debug > 2) { 7338 pr_addr_dbg("ip_newroute: " 7339 "no dst ill for dst" 7340 " %s\n", AF_INET, &dst); 7341 } 7342 goto icmp_err_ret; 7343 } 7344 } else { 7345 dst_ill = ire->ire_ipif->ipif_ill; 7346 /* for uniformity */ 7347 ill_refhold(dst_ill); 7348 /* 7349 * We should have found a route matching ill as we 7350 * called ire_ftable_lookup with MATCH_IRE_ILL. 7351 * Rather than asserting, when there is a mismatch, 7352 * we just drop the packet. 7353 */ 7354 if (dst_ill != attach_ill) { 7355 ip0dbg(("ip_newroute: Packet dropped as " 7356 "IPIF_NOFAILOVER ill is %s, " 7357 "ire->ire_ipif->ipif_ill is %s\n", 7358 attach_ill->ill_name, 7359 dst_ill->ill_name)); 7360 ill_refrele(attach_ill); 7361 goto icmp_err_ret; 7362 } 7363 } 7364 /* attach_ill can't go in loop. IPMP and CGTP are disjoint */ 7365 if (attach_ill != NULL) { 7366 ill_refrele(attach_ill); 7367 attach_ill = NULL; 7368 do_attach_ill = B_TRUE; 7369 } 7370 ASSERT(dst_ill != NULL); 7371 ip2dbg(("ip_newroute: dst_ill %s\n", dst_ill->ill_name)); 7372 7373 /* 7374 * Pick the best source address from dst_ill. 7375 * 7376 * 1) If it is part of a multipathing group, we would 7377 * like to spread the inbound packets across different 7378 * interfaces. ipif_select_source picks a random source 7379 * across the different ills in the group. 7380 * 7381 * 2) If it is not part of a multipathing group, we try 7382 * to pick the source address from the destination 7383 * route. Clustering assumes that when we have multiple 7384 * prefixes hosted on an interface, the prefix of the 7385 * source address matches the prefix of the destination 7386 * route. 
We do this only if the address is not 7387 * DEPRECATED. 7388 * 7389 * 3) If the conn is in a different zone than the ire, we 7390 * need to pick a source address from the right zone. 7391 * 7392 * NOTE : If we hit case (1) above, the prefix of the source 7393 * address picked may not match the prefix of the 7394 * destination routes prefix as ipif_select_source 7395 * does not look at "dst" while picking a source 7396 * address. 7397 * If we want the same behavior as (2), we will need 7398 * to change the behavior of ipif_select_source. 7399 */ 7400 ASSERT(src_ipif == NULL); 7401 if ((sire != NULL) && (sire->ire_flags & RTF_SETSRC)) { 7402 /* 7403 * The RTF_SETSRC flag is set in the parent ire (sire). 7404 * Check that the ipif matching the requested source 7405 * address still exists. 7406 */ 7407 src_ipif = ipif_lookup_addr(sire->ire_src_addr, NULL, 7408 zoneid, NULL, NULL, NULL, NULL); 7409 } 7410 if (src_ipif == NULL) { 7411 ire_marks |= IRE_MARK_USESRC_CHECK; 7412 if ((dst_ill->ill_group != NULL) || 7413 (ire->ire_ipif->ipif_flags & IPIF_DEPRECATED) || 7414 (connp != NULL && ire->ire_zoneid != zoneid && 7415 ire->ire_zoneid != ALL_ZONES) || 7416 (dst_ill->ill_usesrc_ifindex != 0)) { 7417 /* 7418 * If the destination is reachable via a 7419 * given gateway, the selected source address 7420 * should be in the same subnet as the gateway. 7421 * Otherwise, the destination is not reachable. 7422 * 7423 * If there are no interfaces on the same subnet 7424 * as the destination, ipif_select_source gives 7425 * first non-deprecated interface which might be 7426 * on a different subnet than the gateway. 7427 * This is not desirable. Hence pass the dst_ire 7428 * source address to ipif_select_source. 7429 * It is sure that the destination is reachable 7430 * with the dst_ire source address subnet. 
7431 * So passing dst_ire source address to 7432 * ipif_select_source will make sure that the 7433 * selected source will be on the same subnet 7434 * as dst_ire source address. 7435 */ 7436 ipaddr_t saddr = ire->ire_ipif->ipif_src_addr; 7437 src_ipif = ipif_select_source(dst_ill, saddr, 7438 zoneid); 7439 if (src_ipif == NULL) { 7440 if (ip_debug > 2) { 7441 pr_addr_dbg("ip_newroute: " 7442 "no src for dst %s ", 7443 AF_INET, &dst); 7444 printf("through interface %s\n", 7445 dst_ill->ill_name); 7446 } 7447 goto icmp_err_ret; 7448 } 7449 } else { 7450 src_ipif = ire->ire_ipif; 7451 ASSERT(src_ipif != NULL); 7452 /* hold src_ipif for uniformity */ 7453 ipif_refhold(src_ipif); 7454 } 7455 } 7456 7457 /* 7458 * Assign a source address while we have the conn. 7459 * We can't have ip_wput_ire pick a source address when the 7460 * packet returns from arp since we need to look at 7461 * conn_unspec_src and conn_zoneid, and we lose the conn when 7462 * going through arp. 7463 * 7464 * NOTE : ip_newroute_v6 does not have this piece of code as 7465 * it uses ip6i to store this information. 7466 */ 7467 if (ipha->ipha_src == INADDR_ANY && 7468 (connp == NULL || !connp->conn_unspec_src)) { 7469 ipha->ipha_src = src_ipif->ipif_src_addr; 7470 } 7471 if (ip_debug > 3) { 7472 /* ip2dbg */ 7473 pr_addr_dbg("ip_newroute: first hop %s\n", 7474 AF_INET, &gw); 7475 } 7476 ip2dbg(("\tire type %s (%d)\n", 7477 ip_nv_lookup(ire_nv_tbl, ire->ire_type), ire->ire_type)); 7478 7479 /* 7480 * The TTL of multirouted packets is bounded by the 7481 * ip_multirt_ttl ndd variable. 
7482 */ 7483 if ((sire != NULL) && (sire->ire_flags & RTF_MULTIRT)) { 7484 /* Force TTL of multirouted packets */ 7485 if ((ip_multirt_ttl > 0) && 7486 (ipha->ipha_ttl > ip_multirt_ttl)) { 7487 ip2dbg(("ip_newroute: forcing multirt TTL " 7488 "to %d (was %d), dst 0x%08x\n", 7489 ip_multirt_ttl, ipha->ipha_ttl, 7490 ntohl(sire->ire_addr))); 7491 ipha->ipha_ttl = ip_multirt_ttl; 7492 } 7493 } 7494 /* 7495 * At this point in ip_newroute(), ire is either the 7496 * IRE_CACHE of the next-hop gateway for an off-subnet 7497 * destination or an IRE_INTERFACE type that should be used 7498 * to resolve an on-subnet destination or an on-subnet 7499 * next-hop gateway. 7500 * 7501 * In the IRE_CACHE case, we have the following : 7502 * 7503 * 1) src_ipif - used for getting a source address. 7504 * 7505 * 2) dst_ill - from which we derive ire_stq/ire_rfq. This 7506 * means packets using this IRE_CACHE will go out on 7507 * dst_ill. 7508 * 7509 * 3) The IRE sire will point to the prefix that is the 7510 * longest matching route for the destination. These 7511 * prefix types include IRE_DEFAULT, IRE_PREFIX, IRE_HOST, 7512 * and IRE_HOST_REDIRECT. 7513 * 7514 * The newly created IRE_CACHE entry for the off-subnet 7515 * destination is tied to both the prefix route and the 7516 * interface route used to resolve the next-hop gateway 7517 * via the ire_phandle and ire_ihandle fields, 7518 * respectively. 7519 * 7520 * In the IRE_INTERFACE case, we have the following : 7521 * 7522 * 1) src_ipif - used for getting a source address. 7523 * 7524 * 2) dst_ill - from which we derive ire_stq/ire_rfq. This 7525 * means packets using the IRE_CACHE that we will build 7526 * here will go out on dst_ill. 7527 * 7528 * 3) sire may or may not be NULL. But, the IRE_CACHE that is 7529 * to be created will only be tied to the IRE_INTERFACE 7530 * that was derived from the ire_ihandle field. 
7531 * 7532 * If sire is non-NULL, it means the destination is 7533 * off-link and we will first create the IRE_CACHE for the 7534 * gateway. Next time through ip_newroute, we will create 7535 * the IRE_CACHE for the final destination as described 7536 * above. 7537 * 7538 * In both cases, after the current resolution has been 7539 * completed (or possibly initialised, in the IRE_INTERFACE 7540 * case), the loop may be re-entered to attempt the resolution 7541 * of another RTF_MULTIRT route. 7542 * 7543 * When an IRE_CACHE entry for the off-subnet destination is 7544 * created, RTF_SETSRC and RTF_MULTIRT are inherited from sire, 7545 * for further processing in emission loops. 7546 */ 7547 save_ire = ire; 7548 switch (ire->ire_type) { 7549 case IRE_CACHE: { 7550 ire_t *ipif_ire; 7551 mblk_t *ire_fp_mp; 7552 7553 if (gw == 0) 7554 gw = ire->ire_gateway_addr; 7555 /* 7556 * We need 3 ire's to create a new cache ire for an 7557 * off-link destination from the cache ire of the 7558 * gateway. 7559 * 7560 * 1. The prefix ire 'sire' (Note that this does 7561 * not apply to the conn_nexthop_set case) 7562 * 2. The cache ire of the gateway 'ire' 7563 * 3. The interface ire 'ipif_ire' 7564 * 7565 * We have (1) and (2). We lookup (3) below. 7566 * 7567 * If there is no interface route to the gateway, 7568 * it is a race condition, where we found the cache 7569 * but the interface route has been deleted. 7570 */ 7571 if (ip_nexthop) { 7572 ipif_ire = ire_ihandle_lookup_onlink(ire); 7573 } else { 7574 ipif_ire = 7575 ire_ihandle_lookup_offlink(ire, sire); 7576 } 7577 if (ipif_ire == NULL) { 7578 ip1dbg(("ip_newroute: " 7579 "ire_ihandle_lookup_offlink failed\n")); 7580 goto icmp_err_ret; 7581 } 7582 /* 7583 * XXX We are using the same dlureq_mp 7584 * (DL_UNITDATA_REQ) though the save_ire is not 7585 * pointing at the same ill. 7586 * This is incorrect. We need to send it up to the 7587 * resolver to get the right dlureq_mp. 
For ethernets 7588 * this may be okay (ill_type == DL_ETHER). 7589 */ 7590 dlureq_mp = save_ire->ire_dlureq_mp; 7591 ire_fp_mp = NULL; 7592 /* 7593 * save_ire's ire_fp_mp can't change since it is 7594 * not an IRE_MIPRTUN or IRE_BROADCAST 7595 * LOCK_IRE_FP_MP does not do any useful work in 7596 * the case of IRE_CACHE. So we don't use it below. 7597 */ 7598 if (save_ire->ire_stq == dst_ill->ill_wq) 7599 ire_fp_mp = save_ire->ire_fp_mp; 7600 7601 /* 7602 * Check cached gateway IRE for any security 7603 * attributes; if found, associate the gateway 7604 * credentials group to the destination IRE. 7605 */ 7606 if ((attrp = save_ire->ire_gw_secattr) != NULL) { 7607 mutex_enter(&attrp->igsa_lock); 7608 if ((gcgrp = attrp->igsa_gcgrp) != NULL) 7609 GCGRP_REFHOLD(gcgrp); 7610 mutex_exit(&attrp->igsa_lock); 7611 } 7612 7613 ire = ire_create( 7614 (uchar_t *)&dst, /* dest address */ 7615 (uchar_t *)&ip_g_all_ones, /* mask */ 7616 (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ 7617 (uchar_t *)&gw, /* gateway address */ 7618 NULL, 7619 &save_ire->ire_max_frag, 7620 ire_fp_mp, /* Fast Path header */ 7621 dst_ill->ill_rq, /* recv-from queue */ 7622 dst_ill->ill_wq, /* send-to queue */ 7623 IRE_CACHE, /* IRE type */ 7624 save_ire->ire_dlureq_mp, 7625 src_ipif, 7626 in_ill, /* incoming ill */ 7627 (sire != NULL) ? 7628 sire->ire_mask : 0, /* Parent mask */ 7629 (sire != NULL) ? 7630 sire->ire_phandle : 0, /* Parent handle */ 7631 ipif_ire->ire_ihandle, /* Interface handle */ 7632 (sire != NULL) ? (sire->ire_flags & 7633 (RTF_SETSRC | RTF_MULTIRT)) : 0, /* flags */ 7634 (sire != NULL) ? 
7635 &(sire->ire_uinfo) : &(save_ire->ire_uinfo), 7636 NULL, 7637 gcgrp); 7638 7639 if (ire == NULL) { 7640 if (gcgrp != NULL) { 7641 GCGRP_REFRELE(gcgrp); 7642 gcgrp = NULL; 7643 } 7644 ire_refrele(ipif_ire); 7645 ire_refrele(save_ire); 7646 break; 7647 } 7648 7649 /* reference now held by IRE */ 7650 gcgrp = NULL; 7651 7652 ire->ire_marks |= ire_marks; 7653 7654 /* 7655 * Prevent sire and ipif_ire from getting deleted. 7656 * The newly created ire is tied to both of them via 7657 * the phandle and ihandle respectively. 7658 */ 7659 if (sire != NULL) { 7660 IRB_REFHOLD(sire->ire_bucket); 7661 /* Has it been removed already ? */ 7662 if (sire->ire_marks & IRE_MARK_CONDEMNED) { 7663 IRB_REFRELE(sire->ire_bucket); 7664 ire_refrele(ipif_ire); 7665 ire_refrele(save_ire); 7666 break; 7667 } 7668 } 7669 7670 IRB_REFHOLD(ipif_ire->ire_bucket); 7671 /* Has it been removed already ? */ 7672 if (ipif_ire->ire_marks & IRE_MARK_CONDEMNED) { 7673 IRB_REFRELE(ipif_ire->ire_bucket); 7674 if (sire != NULL) 7675 IRB_REFRELE(sire->ire_bucket); 7676 ire_refrele(ipif_ire); 7677 ire_refrele(save_ire); 7678 break; 7679 } 7680 7681 xmit_mp = first_mp; 7682 /* 7683 * In the case of multirouting, a copy 7684 * of the packet is done before its sending. 7685 * The copy is used to attempt another 7686 * route resolution, in a next loop. 7687 */ 7688 if (ire->ire_flags & RTF_MULTIRT) { 7689 copy_mp = copymsg(first_mp); 7690 if (copy_mp != NULL) { 7691 xmit_mp = copy_mp; 7692 MULTIRT_DEBUG_TAG(first_mp); 7693 } 7694 } 7695 ire_add_then_send(q, ire, xmit_mp); 7696 ire_refrele(save_ire); 7697 7698 /* Assert that sire is not deleted yet. */ 7699 if (sire != NULL) { 7700 ASSERT(sire->ire_ptpn != NULL); 7701 IRB_REFRELE(sire->ire_bucket); 7702 } 7703 7704 /* Assert that ipif_ire is not deleted yet. */ 7705 ASSERT(ipif_ire->ire_ptpn != NULL); 7706 IRB_REFRELE(ipif_ire->ire_bucket); 7707 ire_refrele(ipif_ire); 7708 7709 /* 7710 * If copy_mp is not NULL, multirouting was 7711 * requested. 
We loop to initiate a next 7712 * route resolution attempt, starting from sire. 7713 */ 7714 if (copy_mp != NULL) { 7715 /* 7716 * Search for the next unresolved 7717 * multirt route. 7718 */ 7719 copy_mp = NULL; 7720 ipif_ire = NULL; 7721 ire = NULL; 7722 multirt_resolve_next = B_TRUE; 7723 continue; 7724 } 7725 if (sire != NULL) 7726 ire_refrele(sire); 7727 ipif_refrele(src_ipif); 7728 ill_refrele(dst_ill); 7729 return; 7730 } 7731 case IRE_IF_NORESOLVER: { 7732 /* 7733 * We have what we need to build an IRE_CACHE. 7734 * 7735 * Create a new dlureq_mp with the IP gateway address 7736 * in destination address in the DLPI hdr if the 7737 * physical length is exactly 4 bytes. 7738 */ 7739 if (dst_ill->ill_phys_addr_length == IP_ADDR_LEN) { 7740 uchar_t *addr; 7741 7742 if (gw) 7743 addr = (uchar_t *)&gw; 7744 else 7745 addr = (uchar_t *)&dst; 7746 7747 dlureq_mp = ill_dlur_gen(addr, 7748 dst_ill->ill_phys_addr_length, 7749 dst_ill->ill_sap, 7750 dst_ill->ill_sap_length); 7751 } else { 7752 dlureq_mp = ire->ire_dlureq_mp; 7753 } 7754 7755 if (dlureq_mp == NULL) { 7756 ip1dbg(("ip_newroute: dlureq_mp NULL\n")); 7757 break; 7758 } 7759 7760 /* 7761 * TSol note: We are creating the ire cache for the 7762 * destination 'dst'. If 'dst' is offlink, going 7763 * through the first hop 'gw', the security attributes 7764 * of 'dst' must be set to point to the gateway 7765 * credentials of gateway 'gw'. If 'dst' is onlink, it 7766 * is possible that 'dst' is a potential gateway that is 7767 * referenced by some route that has some security 7768 * attributes. Thus in the former case, we need to do a 7769 * gcgrp_lookup of 'gw' while in the latter case we 7770 * need to do gcgrp_lookup of 'dst' itself. 7771 */ 7772 ga.ga_af = AF_INET; 7773 IN6_IPADDR_TO_V4MAPPED(gw != INADDR_ANY ? 
gw : dst, 7774 &ga.ga_addr); 7775 gcgrp = gcgrp_lookup(&ga, B_FALSE); 7776 7777 ire = ire_create( 7778 (uchar_t *)&dst, /* dest address */ 7779 (uchar_t *)&ip_g_all_ones, /* mask */ 7780 (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ 7781 (uchar_t *)&gw, /* gateway address */ 7782 NULL, 7783 &save_ire->ire_max_frag, 7784 NULL, /* Fast Path header */ 7785 dst_ill->ill_rq, /* recv-from queue */ 7786 dst_ill->ill_wq, /* send-to queue */ 7787 IRE_CACHE, 7788 dlureq_mp, 7789 src_ipif, 7790 in_ill, /* Incoming ill */ 7791 save_ire->ire_mask, /* Parent mask */ 7792 (sire != NULL) ? /* Parent handle */ 7793 sire->ire_phandle : 0, 7794 save_ire->ire_ihandle, /* Interface handle */ 7795 (sire != NULL) ? sire->ire_flags & 7796 (RTF_SETSRC | RTF_MULTIRT) : 0, /* flags */ 7797 &(save_ire->ire_uinfo), 7798 NULL, 7799 gcgrp); 7800 7801 if (dst_ill->ill_phys_addr_length == IP_ADDR_LEN) 7802 freeb(dlureq_mp); 7803 7804 if (ire == NULL) { 7805 if (gcgrp != NULL) { 7806 GCGRP_REFRELE(gcgrp); 7807 gcgrp = NULL; 7808 } 7809 ire_refrele(save_ire); 7810 break; 7811 } 7812 7813 /* reference now held by IRE */ 7814 gcgrp = NULL; 7815 7816 ire->ire_marks |= ire_marks; 7817 7818 /* Prevent save_ire from getting deleted */ 7819 IRB_REFHOLD(save_ire->ire_bucket); 7820 /* Has it been removed already ? */ 7821 if (save_ire->ire_marks & IRE_MARK_CONDEMNED) { 7822 IRB_REFRELE(save_ire->ire_bucket); 7823 ire_refrele(save_ire); 7824 break; 7825 } 7826 7827 /* 7828 * In the case of multirouting, a copy 7829 * of the packet is made before it is sent. 7830 * The copy is used in the next 7831 * loop to attempt another resolution. 7832 */ 7833 xmit_mp = first_mp; 7834 if ((sire != NULL) && 7835 (sire->ire_flags & RTF_MULTIRT)) { 7836 copy_mp = copymsg(first_mp); 7837 if (copy_mp != NULL) { 7838 xmit_mp = copy_mp; 7839 MULTIRT_DEBUG_TAG(first_mp); 7840 } 7841 } 7842 ire_add_then_send(q, ire, xmit_mp); 7843 7844 /* Assert that it is not deleted yet. 
*/ 7845 ASSERT(save_ire->ire_ptpn != NULL); 7846 IRB_REFRELE(save_ire->ire_bucket); 7847 ire_refrele(save_ire); 7848 7849 if (copy_mp != NULL) { 7850 /* 7851 * If we found a (no)resolver, we ignore any 7852 * trailing top priority IRE_CACHE in further 7853 * loops. This ensures that we do not omit any 7854 * (no)resolver. 7855 * This IRE_CACHE, if any, will be processed 7856 * by another thread entering ip_newroute(). 7857 * IRE_CACHE entries, if any, will be processed 7858 * by another thread entering ip_newroute(), 7859 * (upon resolver response, for instance). 7860 * This aims to force parallel multirt 7861 * resolutions as soon as a packet must be sent. 7862 * In the best case, after the tx of only one 7863 * packet, all reachable routes are resolved. 7864 * Otherwise, the resolution of all RTF_MULTIRT 7865 * routes would require several emissions. 7866 */ 7867 multirt_flags &= ~MULTIRT_CACHEGW; 7868 7869 /* 7870 * Search for the next unresolved multirt 7871 * route. 7872 */ 7873 copy_mp = NULL; 7874 save_ire = NULL; 7875 ire = NULL; 7876 multirt_resolve_next = B_TRUE; 7877 continue; 7878 } 7879 7880 /* 7881 * Don't need sire anymore 7882 */ 7883 if (sire != NULL) 7884 ire_refrele(sire); 7885 7886 ipif_refrele(src_ipif); 7887 ill_refrele(dst_ill); 7888 return; 7889 } 7890 case IRE_IF_RESOLVER: 7891 /* 7892 * We can't build an IRE_CACHE yet, but at least we 7893 * found a resolver that can help. 7894 */ 7895 res_mp = dst_ill->ill_resolver_mp; 7896 if (!OK_RESOLVER_MP(res_mp)) 7897 break; 7898 7899 /* 7900 * To be at this point in the code with a non-zero gw 7901 * means that dst is reachable through a gateway that 7902 * we have never resolved. By changing dst to the gw 7903 * addr we resolve the gateway first. 7904 * When ire_add_then_send() tries to put the IP dg 7905 * to dst, it will reenter ip_newroute() at which 7906 * time we will find the IRE_CACHE for the gw and 7907 * create another IRE_CACHE in case IRE_CACHE above. 
7908 */ 7909 if (gw != INADDR_ANY) { 7910 /* 7911 * The source ipif that was determined above was 7912 * relative to the destination address, not the 7913 * gateway's. If src_ipif was not taken out of 7914 * the IRE_IF_RESOLVER entry, we'll need to call 7915 * ipif_select_source() again. 7916 */ 7917 if (src_ipif != ire->ire_ipif) { 7918 ipif_refrele(src_ipif); 7919 src_ipif = ipif_select_source(dst_ill, 7920 gw, zoneid); 7921 if (src_ipif == NULL) { 7922 if (ip_debug > 2) { 7923 pr_addr_dbg( 7924 "ip_newroute: no " 7925 "src for gw %s ", 7926 AF_INET, &gw); 7927 printf("through " 7928 "interface %s\n", 7929 dst_ill->ill_name); 7930 } 7931 goto icmp_err_ret; 7932 } 7933 } 7934 save_dst = dst; 7935 dst = gw; 7936 gw = INADDR_ANY; 7937 } 7938 7939 /* 7940 * TSol note: Please see the corresponding note 7941 * of the IRE_IF_NORESOLVER case 7942 */ 7943 ga.ga_af = AF_INET; 7944 IN6_IPADDR_TO_V4MAPPED(dst, &ga.ga_addr); 7945 gcgrp = gcgrp_lookup(&ga, B_FALSE); 7946 7947 /* 7948 * We obtain a partial IRE_CACHE which we will pass 7949 * along with the resolver query. When the response 7950 * comes back it will be there ready for us to add. 7951 * The ire_max_frag is atomically set under the 7952 * irebucket lock in ire_add_v[46]. 
7953 */ 7954 ire = ire_create_mp( 7955 (uchar_t *)&dst, /* dest address */ 7956 (uchar_t *)&ip_g_all_ones, /* mask */ 7957 (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ 7958 (uchar_t *)&gw, /* gateway address */ 7959 NULL, /* no in_src_addr */ 7960 NULL, /* ire_max_frag */ 7961 NULL, /* Fast Path header */ 7962 dst_ill->ill_rq, /* recv-from queue */ 7963 dst_ill->ill_wq, /* send-to queue */ 7964 IRE_CACHE, 7965 res_mp, 7966 src_ipif, /* Interface ipif */ 7967 in_ill, /* Incoming ILL */ 7968 save_ire->ire_mask, /* Parent mask */ 7969 0, 7970 save_ire->ire_ihandle, /* Interface handle */ 7971 0, /* flags if any */ 7972 &(save_ire->ire_uinfo), 7973 NULL, 7974 gcgrp); 7975 7976 if (ire == NULL) { 7977 ire_refrele(save_ire); 7978 if (gcgrp != NULL) { 7979 GCGRP_REFRELE(gcgrp); 7980 gcgrp = NULL; 7981 } 7982 break; 7983 } 7984 7985 /* reference now held by IRE */ 7986 gcgrp = NULL; 7987 7988 if ((sire != NULL) && 7989 (sire->ire_flags & RTF_MULTIRT)) { 7990 copy_mp = copymsg(first_mp); 7991 if (copy_mp != NULL) 7992 MULTIRT_DEBUG_TAG(copy_mp); 7993 } 7994 7995 ire->ire_marks |= ire_marks; 7996 7997 /* 7998 * Construct message chain for the resolver 7999 * of the form: 8000 * ARP_REQ_MBLK-->IRE_MBLK-->Packet 8001 * Packet could contain a IPSEC_OUT mp. 8002 * 8003 * NOTE : ire will be added later when the response 8004 * comes back from ARP. If the response does not 8005 * come back, ARP frees the packet. For this reason, 8006 * we can't REFHOLD the bucket of save_ire to prevent 8007 * deletions. We may not be able to REFRELE the bucket 8008 * if the response never comes back. Thus, before 8009 * adding the ire, ire_add_v4 will make sure that the 8010 * interface route does not get deleted. This is the 8011 * only case unlike ip_newroute_v6, ip_newroute_ipif_v6 8012 * where we can always prevent deletions because of 8013 * the synchronous nature of adding IRES i.e 8014 * ire_add_then_send is called after creating the IRE. 
8015 */ 8016 ASSERT(ire->ire_mp != NULL); 8017 ire->ire_mp->b_cont = first_mp; 8018 /* Have saved_mp handy, for cleanup if canput fails */ 8019 saved_mp = mp; 8020 mp = ire->ire_dlureq_mp; 8021 ASSERT(mp != NULL); 8022 ire->ire_dlureq_mp = NULL; 8023 linkb(mp, ire->ire_mp); 8024 8025 8026 /* 8027 * Fill in the source and dest addrs for the resolver. 8028 * NOTE: this depends on memory layouts imposed by 8029 * ill_init(). 8030 */ 8031 areq = (areq_t *)mp->b_rptr; 8032 addrp = (ipaddr_t *)((char *)areq + 8033 areq->areq_sender_addr_offset); 8034 if (do_attach_ill) { 8035 /* 8036 * This is bind to no failover case. 8037 * arp packet also must go out on attach_ill. 8038 */ 8039 ASSERT(ipha->ipha_src != NULL); 8040 *addrp = ipha->ipha_src; 8041 } else { 8042 *addrp = save_ire->ire_src_addr; 8043 } 8044 8045 ire_refrele(save_ire); 8046 addrp = (ipaddr_t *)((char *)areq + 8047 areq->areq_target_addr_offset); 8048 *addrp = dst; 8049 /* Up to the resolver. */ 8050 if (canputnext(dst_ill->ill_rq)) { 8051 putnext(dst_ill->ill_rq, mp); 8052 ire = NULL; 8053 if (copy_mp != NULL) { 8054 /* 8055 * If we found a resolver, we ignore 8056 * any trailing top priority IRE_CACHE 8057 * in the further loops. This ensures 8058 * that we do not omit any resolver. 8059 * IRE_CACHE entries, if any, will be 8060 * processed next time we enter 8061 * ip_newroute(). 8062 */ 8063 multirt_flags &= ~MULTIRT_CACHEGW; 8064 /* 8065 * Search for the next unresolved 8066 * multirt route. 8067 */ 8068 first_mp = copy_mp; 8069 copy_mp = NULL; 8070 /* Prepare the next resolution loop. 
*/ 8071 mp = first_mp; 8072 EXTRACT_PKT_MP(mp, first_mp, 8073 mctl_present); 8074 if (mctl_present) 8075 io = (ipsec_out_t *) 8076 first_mp->b_rptr; 8077 ipha = (ipha_t *)mp->b_rptr; 8078 8079 ASSERT(sire != NULL); 8080 8081 dst = save_dst; 8082 multirt_resolve_next = B_TRUE; 8083 continue; 8084 } 8085 8086 if (sire != NULL) 8087 ire_refrele(sire); 8088 8089 /* 8090 * The response will come back in ip_wput 8091 * with db_type IRE_DB_TYPE. 8092 */ 8093 ipif_refrele(src_ipif); 8094 ill_refrele(dst_ill); 8095 return; 8096 } else { 8097 /* Prepare for cleanup */ 8098 ire->ire_dlureq_mp = mp; 8099 mp->b_cont = NULL; 8100 ire_delete(ire); 8101 mp = saved_mp; 8102 ire = NULL; 8103 if (copy_mp != NULL) { 8104 MULTIRT_DEBUG_UNTAG(copy_mp); 8105 freemsg(copy_mp); 8106 copy_mp = NULL; 8107 } 8108 break; 8109 } 8110 default: 8111 break; 8112 } 8113 } while (multirt_resolve_next); 8114 8115 ip1dbg(("ip_newroute: dropped\n")); 8116 /* Did this packet originate externally? */ 8117 if (mp->b_prev) { 8118 mp->b_next = NULL; 8119 mp->b_prev = NULL; 8120 BUMP_MIB(&ip_mib, ipInDiscards); 8121 } else { 8122 BUMP_MIB(&ip_mib, ipOutDiscards); 8123 } 8124 ASSERT(copy_mp == NULL); 8125 MULTIRT_DEBUG_UNTAG(first_mp); 8126 freemsg(first_mp); 8127 if (ire != NULL) 8128 ire_refrele(ire); 8129 if (sire != NULL) 8130 ire_refrele(sire); 8131 if (src_ipif != NULL) 8132 ipif_refrele(src_ipif); 8133 if (dst_ill != NULL) 8134 ill_refrele(dst_ill); 8135 return; 8136 8137 icmp_err_ret: 8138 ip1dbg(("ip_newroute: no route\n")); 8139 if (src_ipif != NULL) 8140 ipif_refrele(src_ipif); 8141 if (dst_ill != NULL) 8142 ill_refrele(dst_ill); 8143 if (sire != NULL) 8144 ire_refrele(sire); 8145 /* Did this packet originate externally? */ 8146 if (mp->b_prev) { 8147 mp->b_next = NULL; 8148 mp->b_prev = NULL; 8149 /* XXX ipInNoRoutes */ 8150 q = WR(q); 8151 } else { 8152 /* 8153 * Since ip_wput() isn't close to finished, we fill 8154 * in enough of the header for credible error reporting. 
8155 */ 8156 if (ip_hdr_complete(ipha, zoneid)) { 8157 /* Failed */ 8158 MULTIRT_DEBUG_UNTAG(first_mp); 8159 freemsg(first_mp); 8160 if (ire != NULL) 8161 ire_refrele(ire); 8162 return; 8163 } 8164 } 8165 BUMP_MIB(&ip_mib, ipOutNoRoutes); 8166 8167 /* 8168 * At this point we will have ire only if RTF_BLACKHOLE 8169 * or RTF_REJECT flags are set on the IRE. It will not 8170 * generate ICMP_HOST_UNREACHABLE if RTF_BLACKHOLE is set. 8171 */ 8172 if (ire != NULL) { 8173 if (ire->ire_flags & RTF_BLACKHOLE) { 8174 ire_refrele(ire); 8175 MULTIRT_DEBUG_UNTAG(first_mp); 8176 freemsg(first_mp); 8177 return; 8178 } 8179 ire_refrele(ire); 8180 } 8181 if (ip_source_routed(ipha)) { 8182 icmp_unreachable(q, first_mp, ICMP_SOURCE_ROUTE_FAILED); 8183 return; 8184 } 8185 icmp_unreachable(q, first_mp, ICMP_HOST_UNREACHABLE); 8186 } 8187 8188 /* 8189 * IPv4 - 8190 * ip_newroute_ipif is called by ip_wput_multicast and 8191 * ip_rput_forward_multicast whenever we need to send 8192 * out a packet to a destination address for which we do not have specific 8193 * routing information. It is used when the packet will be sent out 8194 * on a specific interface. It is also called by ip_wput() when IP_XMIT_IF 8195 * socket option is set or icmp error message wants to go out on a particular 8196 * interface for a unicast packet. 8197 * 8198 * In most cases, the destination address is resolved thanks to the ipif 8199 * intrinsic resolver. However, there are some cases where the call to 8200 * ip_newroute_ipif must take into account the potential presence of 8201 * RTF_SETSRC and/or RTF_MULITRT flags in an IRE_OFFSUBNET ire 8202 * that uses the interface. This is specified through flags, 8203 * which can be a combination of: 8204 * - RTF_SETSRC: if an IRE_OFFSUBNET ire exists that has the RTF_SETSRC 8205 * flag, the resulting ire will inherit the IRE_OFFSUBNET source address 8206 * and flags. Additionally, the packet source address has to be set to 8207 * the specified address. 
The caller is thus expected to set this flag 8208 * if the packet has no specific source address yet. 8209 * - RTF_MULTIRT: if an IRE_OFFSUBNET ire exists that has the RTF_MULTIRT 8210 * flag, the resulting ire will inherit the flag. All unresolved routes 8211 * to the destination must be explored in the same call to 8212 * ip_newroute_ipif(). 8213 */ 8214 static void 8215 ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst, 8216 conn_t *connp, uint32_t flags) 8217 { 8218 areq_t *areq; 8219 ire_t *ire = NULL; 8220 mblk_t *res_mp; 8221 ipaddr_t *addrp; 8222 mblk_t *first_mp; 8223 ire_t *save_ire = NULL; 8224 ill_t *attach_ill = NULL; /* Bind to IPIF_NOFAILOVER */ 8225 ipif_t *src_ipif = NULL; 8226 ushort_t ire_marks = 0; 8227 ill_t *dst_ill = NULL; 8228 boolean_t mctl_present; 8229 ipsec_out_t *io; 8230 ipha_t *ipha; 8231 int ihandle = 0; 8232 mblk_t *saved_mp; 8233 ire_t *fire = NULL; 8234 mblk_t *copy_mp = NULL; 8235 boolean_t multirt_resolve_next; 8236 ipaddr_t ipha_dst; 8237 zoneid_t zoneid = (connp != NULL ? connp->conn_zoneid : ALL_ZONES); 8238 8239 /* 8240 * CGTP goes in a loop which looks up a new ipif, do an ipif_refhold 8241 * here for uniformity 8242 */ 8243 ipif_refhold(ipif); 8244 8245 /* 8246 * This loop is run only once in most cases. 8247 * We loop to resolve further routes only when the destination 8248 * can be reached through multiple RTF_MULTIRT-flagged ires. 
8249 */ 8250 do { 8251 if (dst_ill != NULL) { 8252 ill_refrele(dst_ill); 8253 dst_ill = NULL; 8254 } 8255 if (src_ipif != NULL) { 8256 ipif_refrele(src_ipif); 8257 src_ipif = NULL; 8258 } 8259 multirt_resolve_next = B_FALSE; 8260 8261 ip1dbg(("ip_newroute_ipif: dst 0x%x, if %s\n", ntohl(dst), 8262 ipif->ipif_ill->ill_name)); 8263 8264 EXTRACT_PKT_MP(mp, first_mp, mctl_present); 8265 if (mctl_present) 8266 io = (ipsec_out_t *)first_mp->b_rptr; 8267 8268 ipha = (ipha_t *)mp->b_rptr; 8269 8270 /* 8271 * Save the packet destination address, we may need it after 8272 * the packet has been consumed. 8273 */ 8274 ipha_dst = ipha->ipha_dst; 8275 8276 /* 8277 * If the interface is a pt-pt interface we look for an 8278 * IRE_IF_RESOLVER or IRE_IF_NORESOLVER that matches both the 8279 * local_address and the pt-pt destination address. Otherwise 8280 * we just match the local address. 8281 * NOTE: dst could be different than ipha->ipha_dst in case 8282 * of sending igmp multicast packets over a point-to-point 8283 * connection. 8284 * Thus we must be careful enough to check ipha_dst to be a 8285 * multicast address, otherwise it will take xmit_if path for 8286 * multicast packets resulting into kernel stack overflow by 8287 * repeated calls to ip_newroute_ipif from ire_send(). 8288 */ 8289 if (CLASSD(ipha_dst) && 8290 !(ipif->ipif_ill->ill_flags & ILLF_MULTICAST)) { 8291 goto err_ret; 8292 } 8293 8294 /* 8295 * We check if an IRE_OFFSUBNET for the addr that goes through 8296 * ipif exists. We need it to determine if the RTF_SETSRC and/or 8297 * RTF_MULTIRT flags must be honored. This IRE_OFFSUBNET ire may 8298 * propagate its flags to the new ire. 
8299 */ 8300 if (CLASSD(ipha_dst) && (flags & (RTF_MULTIRT | RTF_SETSRC))) { 8301 fire = ipif_lookup_multi_ire(ipif, ipha_dst); 8302 ip2dbg(("ip_newroute_ipif: " 8303 "ipif_lookup_multi_ire(" 8304 "ipif %p, dst %08x) = fire %p\n", 8305 (void *)ipif, ntohl(dst), (void *)fire)); 8306 } 8307 8308 if (mctl_present && io->ipsec_out_attach_if) { 8309 attach_ill = ip_grab_attach_ill(NULL, first_mp, 8310 io->ipsec_out_ill_index, B_FALSE); 8311 8312 /* Failure case frees things for us. */ 8313 if (attach_ill == NULL) { 8314 ipif_refrele(ipif); 8315 if (fire != NULL) 8316 ire_refrele(fire); 8317 return; 8318 } 8319 8320 /* 8321 * Check if we need an ire that will not be 8322 * looked up by anybody else i.e. HIDDEN. 8323 */ 8324 if (ill_is_probeonly(attach_ill)) { 8325 ire_marks = IRE_MARK_HIDDEN; 8326 } 8327 /* 8328 * ip_wput passes the right ipif for IPIF_NOFAILOVER 8329 * case. 8330 */ 8331 dst_ill = ipif->ipif_ill; 8332 /* attach_ill has been refheld by ip_grab_attach_ill */ 8333 ASSERT(dst_ill == attach_ill); 8334 } else { 8335 /* 8336 * If this is set by IP_XMIT_IF, then make sure that 8337 * ipif is pointing to the same ill as the IP_XMIT_IF 8338 * specified ill. 8339 */ 8340 ASSERT((connp == NULL) || 8341 (connp->conn_xmit_if_ill == NULL) || 8342 (connp->conn_xmit_if_ill == ipif->ipif_ill)); 8343 /* 8344 * If the interface belongs to an interface group, 8345 * make sure the next possible interface in the group 8346 * is used. This encourages load spreading among 8347 * peers in an interface group. 8348 * Note: load spreading is disabled for RTF_MULTIRT 8349 * routes. 8350 */ 8351 if ((flags & RTF_MULTIRT) && (fire != NULL) && 8352 (fire->ire_flags & RTF_MULTIRT)) { 8353 /* 8354 * Don't perform outbound load spreading 8355 * in the case of an RTF_MULTIRT issued route, 8356 * we actually typically want to replicate 8357 * outgoing packets through particular 8358 * interfaces. 
8359 */ 8360 dst_ill = ipif->ipif_ill; 8361 ill_refhold(dst_ill); 8362 } else { 8363 dst_ill = ip_newroute_get_dst_ill( 8364 ipif->ipif_ill); 8365 } 8366 if (dst_ill == NULL) { 8367 if (ip_debug > 2) { 8368 pr_addr_dbg("ip_newroute_ipif: " 8369 "no dst ill for dst %s\n", 8370 AF_INET, &dst); 8371 } 8372 goto err_ret; 8373 } 8374 } 8375 8376 /* 8377 * Pick a source address preferring non-deprecated ones. 8378 * Unlike ip_newroute, we don't do any source address 8379 * selection here since for multicast it really does not help 8380 * in inbound load spreading as in the unicast case. 8381 */ 8382 if ((flags & RTF_SETSRC) && (fire != NULL) && 8383 (fire->ire_flags & RTF_SETSRC)) { 8384 /* 8385 * As requested by flags, an IRE_OFFSUBNET was looked up 8386 * on that interface. This ire has RTF_SETSRC flag, so 8387 * the source address of the packet must be changed. 8388 * Check that the ipif matching the requested source 8389 * address still exists. 8390 */ 8391 src_ipif = ipif_lookup_addr(fire->ire_src_addr, NULL, 8392 zoneid, NULL, NULL, NULL, NULL); 8393 } 8394 if (((ipif->ipif_flags & IPIF_DEPRECATED) || 8395 (connp != NULL && ipif->ipif_zoneid != zoneid && 8396 ipif->ipif_zoneid != ALL_ZONES)) && 8397 (src_ipif == NULL)) { 8398 src_ipif = ipif_select_source(dst_ill, dst, zoneid); 8399 if (src_ipif == NULL) { 8400 if (ip_debug > 2) { 8401 /* ip1dbg */ 8402 pr_addr_dbg("ip_newroute_ipif: " 8403 "no src for dst %s", 8404 AF_INET, &dst); 8405 } 8406 ip1dbg((" through interface %s\n", 8407 dst_ill->ill_name)); 8408 goto err_ret; 8409 } 8410 ipif_refrele(ipif); 8411 ipif = src_ipif; 8412 ipif_refhold(ipif); 8413 } 8414 if (src_ipif == NULL) { 8415 src_ipif = ipif; 8416 ipif_refhold(src_ipif); 8417 } 8418 8419 /* 8420 * Assign a source address while we have the conn. 8421 * We can't have ip_wput_ire pick a source address when the 8422 * packet returns from arp since conn_unspec_src might be set 8423 * and we loose the conn when going through arp. 
8424 */ 8425 if (ipha->ipha_src == INADDR_ANY && 8426 (connp == NULL || !connp->conn_unspec_src)) { 8427 ipha->ipha_src = src_ipif->ipif_src_addr; 8428 } 8429 8430 /* 8431 * In case of IP_XMIT_IF, it is possible that the outgoing 8432 * interface does not have an interface ire. 8433 * Example: Thousands of mobileip PPP interfaces to mobile 8434 * nodes. We don't want to create interface ires because 8435 * packets from other mobile nodes must not take the route 8436 * via interface ires to the visiting mobile node without 8437 * going through the home agent, in absence of mobileip 8438 * route optimization. 8439 */ 8440 if (CLASSD(ipha_dst) && (connp == NULL || 8441 connp->conn_xmit_if_ill == NULL)) { 8442 /* ipif_to_ire returns an held ire */ 8443 ire = ipif_to_ire(ipif); 8444 if (ire == NULL) 8445 goto err_ret; 8446 if (ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) 8447 goto err_ret; 8448 /* 8449 * ihandle is needed when the ire is added to 8450 * cache table. 8451 */ 8452 save_ire = ire; 8453 ihandle = save_ire->ire_ihandle; 8454 8455 ip2dbg(("ip_newroute_ipif: ire %p, ipif %p, " 8456 "flags %04x\n", 8457 (void *)ire, (void *)ipif, flags)); 8458 if ((flags & RTF_MULTIRT) && (fire != NULL) && 8459 (fire->ire_flags & RTF_MULTIRT)) { 8460 /* 8461 * As requested by flags, an IRE_OFFSUBNET was 8462 * looked up on that interface. This ire has 8463 * RTF_MULTIRT flag, so the resolution loop will 8464 * be re-entered to resolve additional routes on 8465 * other interfaces. For that purpose, a copy of 8466 * the packet is performed at this point. 8467 */ 8468 fire->ire_last_used_time = lbolt; 8469 copy_mp = copymsg(first_mp); 8470 if (copy_mp) { 8471 MULTIRT_DEBUG_TAG(copy_mp); 8472 } 8473 } 8474 if ((flags & RTF_SETSRC) && (fire != NULL) && 8475 (fire->ire_flags & RTF_SETSRC)) { 8476 /* 8477 * As requested by flags, an IRE_OFFSUBET was 8478 * looked up on that interface. This ire has 8479 * RTF_SETSRC flag, so the source address of the 8480 * packet must be changed. 
8481 */ 8482 ipha->ipha_src = fire->ire_src_addr; 8483 } 8484 } else { 8485 ASSERT((connp == NULL) || 8486 (connp->conn_xmit_if_ill != NULL) || 8487 (connp->conn_dontroute)); 8488 /* 8489 * The only ways we can come here are: 8490 * 1) IP_XMIT_IF socket option is set 8491 * 2) ICMP error message generated from 8492 * ip_mrtun_forward() routine and it needs 8493 * to go through the specified ill. 8494 * 3) SO_DONTROUTE socket option is set 8495 * In all cases, the new ire will not be added 8496 * into cache table. 8497 */ 8498 ire_marks |= IRE_MARK_NOADD; 8499 } 8500 8501 switch (ipif->ipif_net_type) { 8502 case IRE_IF_NORESOLVER: { 8503 /* We have what we need to build an IRE_CACHE. */ 8504 mblk_t *dlureq_mp; 8505 8506 /* 8507 * Create a new dlureq_mp with the 8508 * IP gateway address as destination address in the 8509 * DLPI hdr if the physical length is exactly 4 bytes. 8510 */ 8511 if (dst_ill->ill_phys_addr_length == IP_ADDR_LEN) { 8512 dlureq_mp = ill_dlur_gen((uchar_t *)&dst, 8513 dst_ill->ill_phys_addr_length, 8514 dst_ill->ill_sap, 8515 dst_ill->ill_sap_length); 8516 } else { 8517 /* use the value set in ip_ll_subnet_defaults */ 8518 dlureq_mp = ill_dlur_gen(NULL, 8519 dst_ill->ill_phys_addr_length, 8520 dst_ill->ill_sap, 8521 dst_ill->ill_sap_length); 8522 } 8523 8524 if (dlureq_mp == NULL) 8525 break; 8526 /* 8527 * The new ire inherits the IRE_OFFSUBNET flags 8528 * and source address, if this was requested. 8529 */ 8530 ire = ire_create( 8531 (uchar_t *)&dst, /* dest address */ 8532 (uchar_t *)&ip_g_all_ones, /* mask */ 8533 (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ 8534 NULL, /* gateway address */ 8535 NULL, 8536 &ipif->ipif_mtu, 8537 NULL, /* Fast Path header */ 8538 dst_ill->ill_rq, /* recv-from queue */ 8539 dst_ill->ill_wq, /* send-to queue */ 8540 IRE_CACHE, 8541 dlureq_mp, 8542 src_ipif, 8543 NULL, 8544 (save_ire != NULL ? save_ire->ire_mask : 0), 8545 (fire != NULL) ? 
/* Parent handle */ 8546 fire->ire_phandle : 0, 8547 ihandle, /* Interface handle */ 8548 (fire != NULL) ? 8549 (fire->ire_flags & 8550 (RTF_SETSRC | RTF_MULTIRT)) : 0, 8551 (save_ire == NULL ? &ire_uinfo_null : 8552 &save_ire->ire_uinfo), 8553 NULL, 8554 NULL); 8555 8556 freeb(dlureq_mp); 8557 8558 if (ire == NULL) { 8559 if (save_ire != NULL) 8560 ire_refrele(save_ire); 8561 break; 8562 } 8563 8564 ire->ire_marks |= ire_marks; 8565 8566 /* 8567 * If IRE_MARK_NOADD is set then we need to convert 8568 * the max_fragp to a useable value now. This is 8569 * normally done in ire_add_v[46]. 8570 */ 8571 if (ire->ire_marks & IRE_MARK_NOADD) { 8572 uint_t max_frag; 8573 8574 max_frag = *ire->ire_max_fragp; 8575 ire->ire_max_fragp = NULL; 8576 ire->ire_max_frag = max_frag; 8577 } 8578 8579 /* Prevent save_ire from getting deleted */ 8580 if (save_ire != NULL) { 8581 IRB_REFHOLD(save_ire->ire_bucket); 8582 /* Has it been removed already ? */ 8583 if (save_ire->ire_marks & IRE_MARK_CONDEMNED) { 8584 IRB_REFRELE(save_ire->ire_bucket); 8585 ire_refrele(save_ire); 8586 break; 8587 } 8588 } 8589 8590 ire_add_then_send(q, ire, first_mp); 8591 8592 /* Assert that save_ire is not deleted yet. */ 8593 if (save_ire != NULL) { 8594 ASSERT(save_ire->ire_ptpn != NULL); 8595 IRB_REFRELE(save_ire->ire_bucket); 8596 ire_refrele(save_ire); 8597 save_ire = NULL; 8598 } 8599 if (fire != NULL) { 8600 ire_refrele(fire); 8601 fire = NULL; 8602 } 8603 8604 /* 8605 * the resolution loop is re-entered if this 8606 * was requested through flags and if we 8607 * actually are in a multirouting case. 
8608 */ 8609 if ((flags & RTF_MULTIRT) && (copy_mp != NULL)) { 8610 boolean_t need_resolve = 8611 ire_multirt_need_resolve(ipha_dst, 8612 MBLK_GETLABEL(copy_mp)); 8613 if (!need_resolve) { 8614 MULTIRT_DEBUG_UNTAG(copy_mp); 8615 freemsg(copy_mp); 8616 copy_mp = NULL; 8617 } else { 8618 /* 8619 * ipif_lookup_group() calls 8620 * ire_lookup_multi() that uses 8621 * ire_ftable_lookup() to find 8622 * an IRE_INTERFACE for the group. 8623 * In the multirt case, 8624 * ire_lookup_multi() then invokes 8625 * ire_multirt_lookup() to find 8626 * the next resolvable ire. 8627 * As a result, we obtain an new 8628 * interface, derived from the 8629 * next ire. 8630 */ 8631 ipif_refrele(ipif); 8632 ipif = ipif_lookup_group(ipha_dst, 8633 zoneid); 8634 ip2dbg(("ip_newroute_ipif: " 8635 "multirt dst %08x, ipif %p\n", 8636 htonl(dst), (void *)ipif)); 8637 if (ipif != NULL) { 8638 mp = copy_mp; 8639 copy_mp = NULL; 8640 multirt_resolve_next = B_TRUE; 8641 continue; 8642 } else { 8643 freemsg(copy_mp); 8644 } 8645 } 8646 } 8647 if (ipif != NULL) 8648 ipif_refrele(ipif); 8649 ill_refrele(dst_ill); 8650 ipif_refrele(src_ipif); 8651 return; 8652 } 8653 case IRE_IF_RESOLVER: 8654 /* 8655 * We can't build an IRE_CACHE yet, but at least 8656 * we found a resolver that can help. 8657 */ 8658 res_mp = dst_ill->ill_resolver_mp; 8659 if (!OK_RESOLVER_MP(res_mp)) 8660 break; 8661 8662 /* 8663 * We obtain a partial IRE_CACHE which we will pass 8664 * along with the resolver query. When the response 8665 * comes back it will be there ready for us to add. 8666 * The new ire inherits the IRE_OFFSUBNET flags 8667 * and source address, if this was requested. 8668 * The ire_max_frag is atomically set under the 8669 * irebucket lock in ire_add_v[46]. Only in the 8670 * case of IRE_MARK_NOADD, we set it here itself. 
8671 */ 8672 ire = ire_create_mp( 8673 (uchar_t *)&dst, /* dest address */ 8674 (uchar_t *)&ip_g_all_ones, /* mask */ 8675 (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ 8676 NULL, /* gateway address */ 8677 NULL, /* no in_src_addr */ 8678 (ire_marks & IRE_MARK_NOADD) ? 8679 ipif->ipif_mtu : 0, /* max_frag */ 8680 NULL, /* Fast path header */ 8681 dst_ill->ill_rq, /* recv-from queue */ 8682 dst_ill->ill_wq, /* send-to queue */ 8683 IRE_CACHE, 8684 res_mp, 8685 src_ipif, 8686 NULL, 8687 (save_ire != NULL ? save_ire->ire_mask : 0), 8688 (fire != NULL) ? /* Parent handle */ 8689 fire->ire_phandle : 0, 8690 ihandle, /* Interface handle */ 8691 (fire != NULL) ? /* flags if any */ 8692 (fire->ire_flags & 8693 (RTF_SETSRC | RTF_MULTIRT)) : 0, 8694 (save_ire == NULL ? &ire_uinfo_null : 8695 &save_ire->ire_uinfo), 8696 NULL, 8697 NULL); 8698 8699 if (save_ire != NULL) { 8700 ire_refrele(save_ire); 8701 save_ire = NULL; 8702 } 8703 if (ire == NULL) 8704 break; 8705 8706 ire->ire_marks |= ire_marks; 8707 /* 8708 * Construct message chain for the resolver of the 8709 * form: 8710 * ARP_REQ_MBLK-->IRE_MBLK-->Packet 8711 * 8712 * NOTE : ire will be added later when the response 8713 * comes back from ARP. If the response does not 8714 * come back, ARP frees the packet. For this reason, 8715 * we can't REFHOLD the bucket of save_ire to prevent 8716 * deletions. We may not be able to REFRELE the 8717 * bucket if the response never comes back. 8718 * Thus, before adding the ire, ire_add_v4 will make 8719 * sure that the interface route does not get deleted. 8720 * This is the only case unlike ip_newroute_v6, 8721 * ip_newroute_ipif_v6 where we can always prevent 8722 * deletions because ire_add_then_send is called after 8723 * creating the IRE. 8724 * If IRE_MARK_NOADD is set, then ire_add_then_send 8725 * does not add this IRE into the IRE CACHE. 
8726 */ 8727 ASSERT(ire->ire_mp != NULL); 8728 ire->ire_mp->b_cont = first_mp; 8729 /* Have saved_mp handy, for cleanup if canput fails */ 8730 saved_mp = mp; 8731 mp = ire->ire_dlureq_mp; 8732 ASSERT(mp != NULL); 8733 ire->ire_dlureq_mp = NULL; 8734 linkb(mp, ire->ire_mp); 8735 8736 /* 8737 * Fill in the source and dest addrs for the resolver. 8738 * NOTE: this depends on memory layouts imposed by 8739 * ill_init(). 8740 */ 8741 areq = (areq_t *)mp->b_rptr; 8742 addrp = (ipaddr_t *)((char *)areq + 8743 areq->areq_sender_addr_offset); 8744 *addrp = ire->ire_src_addr; 8745 addrp = (ipaddr_t *)((char *)areq + 8746 areq->areq_target_addr_offset); 8747 *addrp = dst; 8748 /* Up to the resolver. */ 8749 if (canputnext(dst_ill->ill_rq)) { 8750 putnext(dst_ill->ill_rq, mp); 8751 /* 8752 * The response will come back in ip_wput 8753 * with db_type IRE_DB_TYPE. 8754 */ 8755 } else { 8756 ire->ire_dlureq_mp = mp; 8757 mp->b_cont = NULL; 8758 ire_delete(ire); 8759 saved_mp->b_next = NULL; 8760 saved_mp->b_prev = NULL; 8761 freemsg(first_mp); 8762 ip2dbg(("ip_newroute_ipif: dropped\n")); 8763 } 8764 8765 if (fire != NULL) { 8766 ire_refrele(fire); 8767 fire = NULL; 8768 } 8769 8770 8771 /* 8772 * The resolution loop is re-entered if this was 8773 * requested through flags and we actually are 8774 * in a multirouting case. 8775 */ 8776 if ((flags & RTF_MULTIRT) && (copy_mp != NULL)) { 8777 boolean_t need_resolve = 8778 ire_multirt_need_resolve(ipha_dst, 8779 MBLK_GETLABEL(copy_mp)); 8780 if (!need_resolve) { 8781 MULTIRT_DEBUG_UNTAG(copy_mp); 8782 freemsg(copy_mp); 8783 copy_mp = NULL; 8784 } else { 8785 /* 8786 * ipif_lookup_group() calls 8787 * ire_lookup_multi() that uses 8788 * ire_ftable_lookup() to find 8789 * an IRE_INTERFACE for the group. 8790 * In the multirt case, 8791 * ire_lookup_multi() then invokes 8792 * ire_multirt_lookup() to find 8793 * the next resolvable ire. 8794 * As a result, we obtain an new 8795 * interface, derived from the 8796 * next ire. 
 */
					ipif_refrele(ipif);
					ipif = ipif_lookup_group(ipha_dst,
					    zoneid);
					ip2dbg(("ip_newroute_ipif: "
					    "multirt dst %08x, ipif %p\n",
					    htonl(dst), (void *)ipif));
					if (ipif != NULL) {
						mp = copy_mp;
						copy_mp = NULL;
						multirt_resolve_next = B_TRUE;
						continue;
					} else {
						freemsg(copy_mp);
					}
				}
			}
			if (ipif != NULL)
				ipif_refrele(ipif);
			ill_refrele(dst_ill);
			ipif_refrele(src_ipif);
			return;
		default:
			break;
		}
	} while (multirt_resolve_next);

err_ret:
	ip2dbg(("ip_newroute_ipif: dropped\n"));
	if (fire != NULL)
		ire_refrele(fire);
	ipif_refrele(ipif);
	/* Did this packet originate externally? */
	if (dst_ill != NULL)
		ill_refrele(dst_ill);
	if (src_ipif != NULL)
		ipif_refrele(src_ipif);
	if (mp->b_prev || mp->b_next) {
		mp->b_next = NULL;
		mp->b_prev = NULL;
	} else {
		/*
		 * Since ip_wput() isn't close to finished, we fill
		 * in enough of the header for credible error reporting.
		 */
		if (ip_hdr_complete((ipha_t *)mp->b_rptr, zoneid)) {
			/* Failed */
			freemsg(first_mp);
			if (ire != NULL)
				ire_refrele(ire);
			return;
		}
	}
	/*
	 * At this point we will have ire only if RTF_BLACKHOLE
	 * or RTF_REJECT flags are set on the IRE. It will not
	 * generate ICMP_HOST_UNREACHABLE if RTF_BLACKHOLE is set.
	 */
	if (ire != NULL) {
		if (ire->ire_flags & RTF_BLACKHOLE) {
			/* Blackhole: discard silently, no ICMP error. */
			ire_refrele(ire);
			freemsg(first_mp);
			return;
		}
		ire_refrele(ire);
	}
	icmp_unreachable(q, first_mp, ICMP_HOST_UNREACHABLE);
}

/*
 * Name/Value Table Lookup Routine.
 *
 * Scan the nv_t array 'nv' (terminated by an entry whose nv_name is NULL)
 * for an entry whose nv_value equals 'value'.  Returns that entry's name,
 * the literal string "unknown" when no entry matches, or NULL when the
 * table pointer itself is NULL.
 */
char *
ip_nv_lookup(nv_t *nv, int value)
{
	if (!nv)
		return (NULL);
	for (; nv->nv_name; nv++) {
		if (nv->nv_value == value)
			return (nv->nv_name);
	}
	return ("unknown");
}

/*
 * one day it can be patched to 1 from /etc/system for machines that have few
 * fast network interfaces feeding multiple cpus.
 */
int ill_stream_putlocks = 0;

/*
 * This is a module open, i.e. this is a control stream for access
 * to a DLPI device.  We allocate an ill_t as the instance data in
 * this case.
 *
 * Returns 0 on success; EPERM for unprivileged callers, EINTR if the
 * wait for the DL_INFO_ACK is interrupted by a signal, or whatever error
 * ill_init(), the lower stream (via ill_error) or mi_open_link() reported.
 */
int
ip_modopen(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
{
	uint32_t mem_cnt;
	uint32_t cpu_cnt;
	uint32_t min_cnt;
	pgcnt_t mem_avail;
	extern uint32_t ip_cache_table_size, ip6_cache_table_size;
	ill_t	*ill;
	int	err;

	/*
	 * Prevent unprivileged processes from pushing IP so that
	 * they can't send raw IP.
	 */
	if (secpolicy_net_rawaccess(credp) != 0)
		return (EPERM);

	/*
	 * NOTE: result is not NULL-checked; the _sleep allocator is
	 * assumed to block until memory is available.
	 */
	ill = (ill_t *)mi_open_alloc_sleep(sizeof (ill_t));
	q->q_ptr = WR(q)->q_ptr = ill;

	/*
	 * ill_init initializes the ill fields and then sends
	 * down a DL_INFO_REQ after calling qprocson.
	 */
	err = ill_init(q, ill);
	if (err != 0) {
		mi_free(ill);
		q->q_ptr = NULL;
		WR(q)->q_ptr = NULL;
		return (err);
	}

	/* ill_init initializes the ipsq marking this thread as writer */
	ipsq_exit(ill->ill_phyint->phyint_ipsq, B_TRUE, B_TRUE);
	/* Wait for the DL_INFO_ACK */
	mutex_enter(&ill->ill_lock);
	while (ill->ill_state_flags & ILL_LL_SUBNET_PENDING) {
		/*
		 * Return value of 0 indicates a pending signal.
		 */
		err = cv_wait_sig(&ill->ill_cv, &ill->ill_lock);
		if (err == 0) {
			mutex_exit(&ill->ill_lock);
			(void) ip_close(q, 0);
			return (EINTR);
		}
	}
	mutex_exit(&ill->ill_lock);

	/*
	 * ip_rput_other could have set an error in ill_error on
	 * receipt of M_ERROR.
	 */

	err = ill->ill_error;
	if (err != 0) {
		(void) ip_close(q, 0);
		return (err);
	}

	/*
	 * ip_ire_max_bucket_cnt is sized below based on the memory
	 * size and the cpu speed of the machine.  This is upper
	 * bounded by the compile time value of ip_ire_max_bucket_cnt
	 * and is lower bounded by the compile time value of
	 * ip_ire_min_bucket_cnt.  Similar logic applies to
	 * ip6_ire_max_bucket_cnt.
	 */
	mem_avail = kmem_avail();
	mem_cnt = (mem_avail >> ip_ire_mem_ratio) /
	    ip_cache_table_size / sizeof (ire_t);
	cpu_cnt = CPU->cpu_type_info.pi_clock >> ip_ire_cpu_ratio;

	min_cnt = MIN(cpu_cnt, mem_cnt);
	if (min_cnt < ip_ire_min_bucket_cnt)
		min_cnt = ip_ire_min_bucket_cnt;
	if (ip_ire_max_bucket_cnt > min_cnt) {
		ip_ire_max_bucket_cnt = min_cnt;
	}

	/* Same sizing for the v6 cache buckets. */
	mem_cnt = (mem_avail >> ip_ire_mem_ratio) /
	    ip6_cache_table_size / sizeof (ire_t);
	min_cnt = MIN(cpu_cnt, mem_cnt);
	if (min_cnt < ip6_ire_min_bucket_cnt)
		min_cnt = ip6_ire_min_bucket_cnt;
	if (ip6_ire_max_bucket_cnt > min_cnt) {
		ip6_ire_max_bucket_cnt = min_cnt;
	}

	/* Credential is released at close; see ip_close/mi teardown. */
	ill->ill_credp = credp;
	crhold(credp);

	mutex_enter(&ip_mi_lock);
	err = mi_open_link(&ip_g_head, (IDP)ill, devp, flag, sflag, credp);
	mutex_exit(&ip_mi_lock);
	if (err) {
		(void) ip_close(q, 0);
		return (err);
	}
	return (0);
}

/* IP open routine. */
int
ip_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
{
	conn_t	*connp;
	major_t	maj;

	TRACE_1(TR_FAC_IP, TR_IP_OPEN, "ip_open: q %p", q);

	/* Allow reopen. */
	if (q->q_ptr != NULL)
		return (0);

	if (sflag & MODOPEN) {
		/* This is a module open */
		return (ip_modopen(q, devp, flag, sflag, credp));
	}

	/*
	 * We are opening as a device. This is an IP client stream, and we
	 * allocate a conn_t as the instance data.
 */
	connp = ipcl_conn_create(IPCL_IPCCONN, KM_SLEEP);
	connp->conn_upq = q;
	q->q_ptr = WR(q)->q_ptr = connp;

	if (flag & SO_SOCKSTR)
		connp->conn_flags |= IPCL_SOCKET;

	/* Minor tells us which /dev entry was opened */
	if (geteminor(*devp) == IPV6_MINOR) {
		connp->conn_flags |= IPCL_ISV6;
		connp->conn_af_isv6 = B_TRUE;
		ip_setqinfo(q, geteminor(*devp), B_FALSE);
		connp->conn_src_preferences = IPV6_PREFER_SRC_DEFAULT;
	} else {
		connp->conn_af_isv6 = B_FALSE;
		connp->conn_pkt_isv6 = B_FALSE;
	}

	if ((connp->conn_dev = inet_minor_alloc(ip_minor_arena)) == 0) {
		/* Out of minor numbers: undo and bail. */
		q->q_ptr = WR(q)->q_ptr = NULL;
		CONN_DEC_REF(connp);
		return (EBUSY);
	}

	maj = getemajor(*devp);
	*devp = makedevice(maj, (minor_t)connp->conn_dev);

	/*
	 * connp->conn_cred is crfree()ed in ipcl_conn_destroy()
	 */
	connp->conn_cred = credp;
	crhold(connp->conn_cred);

	/*
	 * If the caller has the process-wide flag set, then default to MAC
	 * exempt mode.  This allows read-down to unlabeled hosts.
	 */
	if (getpflags(NET_MAC_AWARE, credp) != 0)
		connp->conn_mac_exempt = B_TRUE;

	connp->conn_zoneid = getzoneid();

	/*
	 * This should only happen for ndd, netstat, raw socket or other SCTP
	 * administrative ops.  In these cases, we just need a normal conn_t
	 * with ulp set to IPPROTO_SCTP.  All other ops are trapped and
	 * an error will be returned.
	 */
	if (maj != SCTP_MAJ && maj != SCTP6_MAJ) {
		connp->conn_rq = q;
		connp->conn_wq = WR(q);
	} else {
		connp->conn_ulp = IPPROTO_SCTP;
		connp->conn_rq = connp->conn_wq = NULL;
	}
	/* Non-zero default values */
	connp->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;

	/*
	 * Make the conn globally visible to walkers
	 */
	mutex_enter(&connp->conn_lock);
	connp->conn_state_flags &= ~CONN_INCIPIENT;
	mutex_exit(&connp->conn_lock);
	ASSERT(connp->conn_ref == 1);

	qprocson(q);

	return (0);
}

/*
 * Change q_qinfo based on the value of isv6.
 * This cannot be called on an ill queue.
 * Note that there is no race since either q_qinfo works for conn queues - it
 * is just an optimization to enter the best wput routine directly.
 */
void
ip_setqinfo(queue_t *q, minor_t minor, boolean_t bump_mib)
{
	ASSERT(q->q_flag & QREADR);
	ASSERT(WR(q)->q_next == NULL);
	ASSERT(q->q_ptr != NULL);

	if (minor == IPV6_MINOR)  {
		if (bump_mib)
			BUMP_MIB(&ip6_mib, ipv6OutSwitchIPv4);
		q->q_qinfo = &rinit_ipv6;
		WR(q)->q_qinfo = &winit_ipv6;
		(Q_TO_CONN(q))->conn_pkt_isv6 = B_TRUE;
	} else {
		if (bump_mib)
			BUMP_MIB(&ip_mib, ipOutSwitchIPv6);
		q->q_qinfo = &rinit;
		WR(q)->q_qinfo = &winit;
		(Q_TO_CONN(q))->conn_pkt_isv6 = B_FALSE;
	}

}

/*
 * See if IPsec needs loading because of the options in mp.
 * Returns B_TRUE only when an IP_SEC_OPT/IPV6_SEC_OPT option is present
 * whose request has any non-IPSEC_PREF_NEVER bits set.
 */
static boolean_t
ipsec_opt_present(mblk_t *mp)
{
	uint8_t *optcp, *next_optcp, *opt_endcp;
	struct opthdr *opt;
	struct T_opthdr *topt;
	int opthdr_len;
	t_uscalar_t optname, optlevel;
	struct T_optmgmt_req *tor = (struct T_optmgmt_req *)mp->b_rptr;
	ipsec_req_t *ipsr;

	/*
	 * Walk through the mess, and find IP_SEC_OPT.  If it's there,
	 * return TRUE.
 */

	optcp = mi_offset_param(mp, tor->OPT_offset, tor->OPT_length);
	opt_endcp = optcp + tor->OPT_length;
	if (tor->PRIM_type == T_OPTMGMT_REQ) {
		opthdr_len = sizeof (struct T_opthdr);
	} else {		/* O_OPTMGMT_REQ */
		ASSERT(tor->PRIM_type == T_SVR4_OPTMGMT_REQ);
		opthdr_len = sizeof (struct opthdr);
	}
	for (; optcp < opt_endcp; optcp = next_optcp) {
		if (optcp + opthdr_len > opt_endcp)
			return (B_FALSE);	/* Not enough option header. */
		if (tor->PRIM_type == T_OPTMGMT_REQ) {
			topt = (struct T_opthdr *)optcp;
			optlevel = topt->level;
			optname = topt->name;
			next_optcp = optcp + _TPI_ALIGN_TOPT(topt->len);
		} else {
			opt = (struct opthdr *)optcp;
			optlevel = opt->level;
			optname = opt->name;
			next_optcp = optcp + opthdr_len +
			    _TPI_ALIGN_OPT(opt->len);
		}
		if ((next_optcp < optcp) || /* wraparound pointer space */
		    ((next_optcp >= opt_endcp) && /* last option bad len */
		    ((next_optcp - opt_endcp) >= __TPI_ALIGN_SIZE)))
			return (B_FALSE); /* bad option buffer */
		if ((optlevel == IPPROTO_IP && optname == IP_SEC_OPT) ||
		    (optlevel == IPPROTO_IPV6 && optname == IPV6_SEC_OPT)) {
			/*
			 * Check to see if it's an all-bypass or all-zeroes
			 * IPsec request.  Don't bother loading IPsec if
			 * the socket doesn't want to use it.  (A good example
			 * is a bypass request.)
			 *
			 * Basically, if any of the non-NEVER bits are set,
			 * load IPsec.
			 */
			ipsr = (ipsec_req_t *)(optcp + opthdr_len);
			if ((ipsr->ipsr_ah_req & ~IPSEC_PREF_NEVER) != 0 ||
			    (ipsr->ipsr_esp_req & ~IPSEC_PREF_NEVER) != 0 ||
			    (ipsr->ipsr_self_encap_req & ~IPSEC_PREF_NEVER)
			    != 0)
				return (B_TRUE);
		}
	}
	return (B_FALSE);
}

/*
 * If conn is waiting for ipsec to finish loading, kick it.
 */
/* ARGSUSED */
static void
conn_restart_ipsec_waiter(conn_t *connp, void *arg)
{
	t_scalar_t	optreq_prim;
	mblk_t	*mp;
	cred_t	*cr;
	int	err = 0;

	/*
	 * This function is called, after ipsec loading is complete.
	 * Since IP checks exclusively and atomically (i.e it prevents
	 * ipsec load from completing until ip_optcom_req completes)
	 * whether ipsec load is complete, there cannot be a race with IP
	 * trying to set the CONN_IPSEC_LOAD_WAIT flag on any conn now.
	 */
	mutex_enter(&connp->conn_lock);
	if (connp->conn_state_flags & CONN_IPSEC_LOAD_WAIT) {
		ASSERT(connp->conn_ipsec_opt_mp != NULL);
		/* Take ownership of the queued optmgmt mblk. */
		mp = connp->conn_ipsec_opt_mp;
		connp->conn_ipsec_opt_mp = NULL;
		connp->conn_state_flags &= ~CONN_IPSEC_LOAD_WAIT;
		cr = DB_CREDDEF(mp, GET_QUEUE_CRED(CONNP_TO_WQ(connp)));
		mutex_exit(&connp->conn_lock);

		ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);

		/* Resume the option processing that was deferred. */
		optreq_prim = ((union T_primitives *)mp->b_rptr)->type;
		if (optreq_prim == T_OPTMGMT_REQ) {
			err = tpi_optcom_req(CONNP_TO_WQ(connp), mp, cr,
			    &ip_opt_obj);
		} else {
			ASSERT(optreq_prim == T_SVR4_OPTMGMT_REQ);
			err = svr4_optcom_req(CONNP_TO_WQ(connp), mp, cr,
			    &ip_opt_obj);
		}
		if (err != EINPROGRESS)
			CONN_OPER_PENDING_DONE(connp);
		return;
	}
	mutex_exit(&connp->conn_lock);
}

/*
 * Called from the ipsec_loader thread, outside any perimeter, to tell
 * ip qenable any of the queues waiting for the ipsec loader to
 * complete.
 *
 * Use ip_mi_lock to be safe here: all modifications of the mi lists
 * are done with this lock held, so it's guaranteed that none of the
 * links will change along the way.
 */
void
ip_ipsec_load_complete()
{
	ipcl_walk(conn_restart_ipsec_waiter, NULL);
}

/*
 * Can't be used. Need to call svr4* -> optset directly.
the leaf routine
 * determines the grp on which it has to become exclusive, queues the mp
 * and sq draining restarts the optmgmt
 *
 * Returns B_TRUE if the mp was consumed (queued on the conn until the
 * IPsec loader finishes), B_FALSE if the caller should proceed with
 * normal option processing.
 */
static boolean_t
ip_check_for_ipsec_opt(queue_t *q, mblk_t *mp)
{
	conn_t	*connp;

	/*
	 * Take IPsec requests and treat them special.
	 */
	if (ipsec_opt_present(mp)) {
		/* First check if IPsec is loaded. */
		mutex_enter(&ipsec_loader_lock);
		if (ipsec_loader_state != IPSEC_LOADER_WAIT) {
			mutex_exit(&ipsec_loader_lock);
			return (B_FALSE);
		}
		connp = Q_TO_CONN(q);
		mutex_enter(&connp->conn_lock);
		connp->conn_state_flags |= CONN_IPSEC_LOAD_WAIT;

		/*
		 * Park the request on the conn; conn_restart_ipsec_waiter()
		 * picks it up once loading completes.
		 */
		ASSERT(connp->conn_ipsec_opt_mp == NULL);
		connp->conn_ipsec_opt_mp = mp;
		mutex_exit(&connp->conn_lock);
		mutex_exit(&ipsec_loader_lock);

		ipsec_loader_loadnow();
		return (B_TRUE);
	}
	return (B_FALSE);
}

/*
 * Set IPsec policy from an ipsec_req_t. If the req is not "zero" and valid,
 * all of them are copied to the conn_t. If the req is "zero", the policy is
 * zeroed out. A "zero" policy has zero ipsr_{ah,esp,self_encap}_req
 * fields.
 * We keep only the latest setting of the policy and thus policy setting
 * is not incremental/cumulative.
 *
 * Requests to set policies with multiple alternative actions will
 * go through a different API.
 */
int
ipsec_set_req(cred_t *cr, conn_t *connp, ipsec_req_t *req)
{
	uint_t ah_req = 0;
	uint_t esp_req = 0;
	uint_t se_req = 0;
	ipsec_selkey_t sel;
	ipsec_act_t *actp = NULL;
	uint_t nact;
	ipsec_policy_t *pin4 = NULL, *pout4 = NULL;
	ipsec_policy_t *pin6 = NULL, *pout6 = NULL;
	ipsec_policy_root_t *pr;
	ipsec_policy_head_t *ph;
	int fam;
	boolean_t is_pol_reset;
	int error = 0;

#define	REQ_MASK (IPSEC_PREF_REQUIRED|IPSEC_PREF_NEVER)

	/*
	 * The IP_SEC_OPT option does not allow variable length parameters,
	 * hence a request cannot be NULL.
	 */
	if (req == NULL)
		return (EINVAL);

	ah_req = req->ipsr_ah_req;
	esp_req = req->ipsr_esp_req;
	se_req = req->ipsr_self_encap_req;

	/*
	 * Are we dealing with a request to reset the policy (i.e.
	 * zero requests).
	 */
	is_pol_reset = ((ah_req & REQ_MASK) == 0 &&
	    (esp_req & REQ_MASK) == 0 &&
	    (se_req & REQ_MASK) == 0);

	if (!is_pol_reset) {
		/*
		 * If we couldn't load IPsec, fail with "protocol
		 * not supported".
		 * IPsec may not have been loaded for a request with zero
		 * policies, so we don't fail in this case.
		 */
		mutex_enter(&ipsec_loader_lock);
		if (ipsec_loader_state != IPSEC_LOADER_SUCCEEDED) {
			mutex_exit(&ipsec_loader_lock);
			return (EPROTONOSUPPORT);
		}
		mutex_exit(&ipsec_loader_lock);

		/*
		 * Test for valid requests. Invalid algorithms
		 * need to be tested by IPSEC code because new
		 * algorithms can be added dynamically.
		 */
		if ((ah_req & ~(REQ_MASK|IPSEC_PREF_UNIQUE)) != 0 ||
		    (esp_req & ~(REQ_MASK|IPSEC_PREF_UNIQUE)) != 0 ||
		    (se_req & ~(REQ_MASK|IPSEC_PREF_UNIQUE)) != 0) {
			return (EINVAL);
		}

		/*
		 * Only privileged users can issue these
		 * requests.
		 */
		if (((ah_req & IPSEC_PREF_NEVER) ||
		    (esp_req & IPSEC_PREF_NEVER) ||
		    (se_req & IPSEC_PREF_NEVER)) &&
		    secpolicy_net_config(cr, B_FALSE) != 0) {
			return (EPERM);
		}

		/*
		 * The IPSEC_PREF_REQUIRED and IPSEC_PREF_NEVER
		 * are mutually exclusive.
		 */
		if (((ah_req & REQ_MASK) == REQ_MASK) ||
		    ((esp_req & REQ_MASK) == REQ_MASK) ||
		    ((se_req & REQ_MASK) == REQ_MASK)) {
			/* Both of them are set */
			return (EINVAL);
		}
	}

	mutex_enter(&connp->conn_lock);

	/*
	 * If we have already cached policies in ip_bind_connected*(), don't
	 * let them change now. We cache policies for connections
	 * whose src,dst [addr, port] is known.  The exception to this is
	 * tunnels.  Tunnels are allowed to change policies after having
	 * become fully bound.
	 */
	if (connp->conn_policy_cached && !IPCL_IS_IPTUN(connp)) {
		mutex_exit(&connp->conn_lock);
		return (EINVAL);
	}

	/*
	 * We have zero policies, reset the connection policy if already
	 * set. This will cause the connection to inherit the
	 * global policy, if any.
	 */
	if (is_pol_reset) {
		if (connp->conn_policy != NULL) {
			IPPH_REFRELE(connp->conn_policy);
			connp->conn_policy = NULL;
		}
		connp->conn_flags &= ~IPCL_CHECK_POLICY;
		connp->conn_in_enforce_policy = B_FALSE;
		connp->conn_out_enforce_policy = B_FALSE;
		mutex_exit(&connp->conn_lock);
		return (0);
	}

	/*
	 * NOTE(review): ipsec_polhead_split() presumably hands back a
	 * private, writable policy head for this conn — confirm against
	 * its definition.
	 */
	ph = connp->conn_policy = ipsec_polhead_split(connp->conn_policy);
	if (ph == NULL)
		goto enomem;

	ipsec_actvec_from_req(req, &actp, &nact);
	if (actp == NULL)
		goto enomem;

	/*
	 * Always allocate IPv4 policy entries, since they can also
	 * apply to ipv6 sockets being used in ipv4-compat mode.
	 */
	bzero(&sel, sizeof (sel));
	sel.ipsl_valid = IPSL_IPV4;

	pin4 = ipsec_policy_create(&sel, actp, nact, IPSEC_PRIO_SOCKET);
	if (pin4 == NULL)
		goto enomem;

	pout4 = ipsec_policy_create(&sel, actp, nact, IPSEC_PRIO_SOCKET);
	if (pout4 == NULL)
		goto enomem;

	if (connp->conn_pkt_isv6) {
		/*
		 * We're looking at a v6 socket, also allocate the
		 * v6-specific entries...
		 */
		sel.ipsl_valid = IPSL_IPV6;
		pin6 = ipsec_policy_create(&sel, actp, nact,
		    IPSEC_PRIO_SOCKET);
		if (pin6 == NULL)
			goto enomem;

		pout6 = ipsec_policy_create(&sel, actp, nact,
		    IPSEC_PRIO_SOCKET);
		if (pout6 == NULL)
			goto enomem;

		/*
		 * .. and file them away in the right place.
		 */
		fam = IPSEC_AF_V6;
		pr = &ph->iph_root[IPSEC_TYPE_INBOUND];
		HASHLIST_INSERT(pin6, ipsp_hash, pr->ipr_nonhash[fam]);
		ipsec_insert_always(&ph->iph_rulebyid, pin6);
		pr = &ph->iph_root[IPSEC_TYPE_OUTBOUND];
		HASHLIST_INSERT(pout6, ipsp_hash, pr->ipr_nonhash[fam]);
		ipsec_insert_always(&ph->iph_rulebyid, pout6);
	}

	ipsec_actvec_free(actp, nact);

	/*
	 * File the v4 policies.
	 */
	fam = IPSEC_AF_V4;
	pr = &ph->iph_root[IPSEC_TYPE_INBOUND];
	HASHLIST_INSERT(pin4, ipsp_hash, pr->ipr_nonhash[fam]);
	ipsec_insert_always(&ph->iph_rulebyid, pin4);

	pr = &ph->iph_root[IPSEC_TYPE_OUTBOUND];
	HASHLIST_INSERT(pout4, ipsp_hash, pr->ipr_nonhash[fam]);
	ipsec_insert_always(&ph->iph_rulebyid, pout4);

	/*
	 * If the requests need security, set enforce_policy.
	 * If the requests are IPSEC_PREF_NEVER, one should
	 * still set conn_out_enforce_policy so that an ipsec_out
	 * gets attached in ip_wput. This is needed so that
	 * for connections that we don't cache policy in ip_bind,
	 * if global policy matches in ip_wput_attach_policy, we
	 * don't wrongly inherit global policy. Similarly, we need
	 * to set conn_in_enforce_policy also so that we don't verify
	 * policy wrongly.
	 */
	if ((ah_req & REQ_MASK) != 0 ||
	    (esp_req & REQ_MASK) != 0 ||
	    (se_req & REQ_MASK) != 0) {
		connp->conn_in_enforce_policy = B_TRUE;
		connp->conn_out_enforce_policy = B_TRUE;
		connp->conn_flags |= IPCL_CHECK_POLICY;
	}

	/*
	 * Tunnels are allowed to set policy after having been fully bound.
	 * If that's the case, cache policy here.
	 */
	if (IPCL_IS_IPTUN(connp) && connp->conn_fully_bound)
		error = ipsec_conn_cache_policy(connp, !connp->conn_af_isv6);

	mutex_exit(&connp->conn_lock);
	return (error);
#undef REQ_MASK

	/*
	 * Common memory-allocation-failure exit path.
	 */
enomem:
	mutex_exit(&connp->conn_lock);
	if (actp != NULL)
		ipsec_actvec_free(actp, nact);
	if (pin4 != NULL)
		IPPOL_REFRELE(pin4);
	if (pout4 != NULL)
		IPPOL_REFRELE(pout4);
	if (pin6 != NULL)
		IPPOL_REFRELE(pin6);
	if (pout6 != NULL)
		IPPOL_REFRELE(pout6);
	return (ENOMEM);
}

/*
 * Only for options that pass in an IP addr. Currently only V4 options
 * pass in an ipif. V6 options always pass an ifindex specifying the ill.
9520 * So this function assumes level is IPPROTO_IP 9521 */ 9522 int 9523 ip_opt_set_ipif(conn_t *connp, ipaddr_t addr, boolean_t checkonly, int option, 9524 mblk_t *first_mp) 9525 { 9526 ipif_t *ipif = NULL; 9527 int error; 9528 ill_t *ill; 9529 9530 ip2dbg(("ip_opt_set_ipif: ipaddr %X\n", addr)); 9531 9532 if (addr != INADDR_ANY || checkonly) { 9533 ASSERT(connp != NULL); 9534 if (option == IP_NEXTHOP) { 9535 ipif = 9536 ipif_lookup_onlink_addr(addr, connp->conn_zoneid); 9537 } else { 9538 ipif = ipif_lookup_addr(addr, NULL, connp->conn_zoneid, 9539 CONNP_TO_WQ(connp), first_mp, ip_restart_optmgmt, 9540 &error); 9541 } 9542 if (ipif == NULL) { 9543 if (error == EINPROGRESS) 9544 return (error); 9545 else if ((option == IP_MULTICAST_IF) || 9546 (option == IP_NEXTHOP)) 9547 return (EHOSTUNREACH); 9548 else 9549 return (EINVAL); 9550 } else if (checkonly) { 9551 if (option == IP_MULTICAST_IF) { 9552 ill = ipif->ipif_ill; 9553 /* not supported by the virtual network iface */ 9554 if (IS_VNI(ill)) { 9555 ipif_refrele(ipif); 9556 return (EINVAL); 9557 } 9558 } 9559 ipif_refrele(ipif); 9560 return (0); 9561 } 9562 ill = ipif->ipif_ill; 9563 mutex_enter(&connp->conn_lock); 9564 mutex_enter(&ill->ill_lock); 9565 if ((ill->ill_state_flags & ILL_CONDEMNED) || 9566 (ipif->ipif_state_flags & IPIF_CONDEMNED)) { 9567 mutex_exit(&ill->ill_lock); 9568 mutex_exit(&connp->conn_lock); 9569 ipif_refrele(ipif); 9570 return (option == IP_MULTICAST_IF ? 9571 EHOSTUNREACH : EINVAL); 9572 } 9573 } else { 9574 mutex_enter(&connp->conn_lock); 9575 } 9576 9577 /* None of the options below are supported on the VNI */ 9578 if (ipif != NULL && IS_VNI(ipif->ipif_ill)) { 9579 mutex_exit(&ill->ill_lock); 9580 mutex_exit(&connp->conn_lock); 9581 ipif_refrele(ipif); 9582 return (EINVAL); 9583 } 9584 9585 switch (option) { 9586 case IP_DONTFAILOVER_IF: 9587 /* 9588 * This option is used by in.mpathd to ensure 9589 * that IPMP probe packets only go out on the 9590 * test interfaces. 
in.mpathd sets this option 9591 * on the non-failover interfaces. 9592 * For backward compatibility, this option 9593 * implicitly sets IP_MULTICAST_IF, as used 9594 * be done in bind(), so that ip_wput gets 9595 * this ipif to send mcast packets. 9596 */ 9597 if (ipif != NULL) { 9598 ASSERT(addr != INADDR_ANY); 9599 connp->conn_nofailover_ill = ipif->ipif_ill; 9600 connp->conn_multicast_ipif = ipif; 9601 } else { 9602 ASSERT(addr == INADDR_ANY); 9603 connp->conn_nofailover_ill = NULL; 9604 connp->conn_multicast_ipif = NULL; 9605 } 9606 break; 9607 9608 case IP_MULTICAST_IF: 9609 connp->conn_multicast_ipif = ipif; 9610 break; 9611 case IP_NEXTHOP: 9612 connp->conn_nexthop_v4 = addr; 9613 connp->conn_nexthop_set = B_TRUE; 9614 break; 9615 } 9616 9617 if (ipif != NULL) { 9618 mutex_exit(&ill->ill_lock); 9619 mutex_exit(&connp->conn_lock); 9620 ipif_refrele(ipif); 9621 return (0); 9622 } 9623 mutex_exit(&connp->conn_lock); 9624 /* We succeded in cleared the option */ 9625 return (0); 9626 } 9627 9628 /* 9629 * For options that pass in an ifindex specifying the ill. V6 options always 9630 * pass in an ill. Some v4 options also pass in ifindex specifying the ill. 
 */
int
ip_opt_set_ill(conn_t *connp, int ifindex, boolean_t isv6, boolean_t checkonly,
    int level, int option, mblk_t *first_mp)
{
	ill_t	*ill = NULL;
	int	error = 0;

	ip2dbg(("ip_opt_set_ill: ifindex %d\n", ifindex));
	if (ifindex != 0) {
		ASSERT(connp != NULL);
		ill = ill_lookup_on_ifindex(ifindex, isv6, CONNP_TO_WQ(connp),
		    first_mp, ip_restart_optmgmt, &error);
		if (ill != NULL) {
			if (checkonly) {
				/* not supported by the virtual network iface */
				if (IS_VNI(ill)) {
					ill_refrele(ill);
					return (EINVAL);
				}
				ill_refrele(ill);
				return (0);
			}
			if (!ipif_lookup_zoneid_group(ill, connp->conn_zoneid,
			    0, NULL)) {
				/* No usable ipif in the caller's zone. */
				ill_refrele(ill);
				ill = NULL;
				mutex_enter(&connp->conn_lock);
				goto setit;
			}
			mutex_enter(&connp->conn_lock);
			mutex_enter(&ill->ill_lock);
			if (ill->ill_state_flags & ILL_CONDEMNED) {
				/*
				 * Interface is being torn down; drop it and
				 * proceed as if the lookup had failed.
				 */
				mutex_exit(&ill->ill_lock);
				mutex_exit(&connp->conn_lock);
				ill_refrele(ill);
				ill = NULL;
				mutex_enter(&connp->conn_lock);
			}
			goto setit;
		} else if (error == EINPROGRESS) {
			return (error);
		} else {
			error = 0;
		}
	}
	mutex_enter(&connp->conn_lock);
setit:
	ASSERT((level == IPPROTO_IP || level == IPPROTO_IPV6));

	/*
	 * The options below assume that the ILL (if any) transmits and/or
	 * receives traffic. Neither of which is true for the virtual network
	 * interface, so fail setting these on a VNI.
	 *
	 * NOTE(review): 'ill' may be NULL here (ifindex == 0 or condemned
	 * paths); IS_VNI() is presumably NULL-safe — the ASSERT below
	 * documents that a VNI match implies a non-NULL ill.  Confirm
	 * against the IS_VNI() definition.
	 */
	if (IS_VNI(ill)) {
		ASSERT(ill != NULL);
		mutex_exit(&ill->ill_lock);
		mutex_exit(&connp->conn_lock);
		ill_refrele(ill);
		return (EINVAL);
	}

	if (level == IPPROTO_IP) {
		switch (option) {
		case IP_BOUND_IF:
			connp->conn_incoming_ill = ill;
			connp->conn_outgoing_ill = ill;
			connp->conn_orig_bound_ifindex = (ill == NULL) ?
			    0 : ifindex;
			break;

		case IP_XMIT_IF:
			/*
			 * Similar to IP_BOUND_IF, but this only
			 * determines the outgoing interface for
			 * unicast packets. Also no IRE_CACHE entry
			 * is added for the destination of the
			 * outgoing packets. This feature is needed
			 * for mobile IP.
			 */
			connp->conn_xmit_if_ill = ill;
			connp->conn_orig_xmit_ifindex = (ill == NULL) ?
			    0 : ifindex;
			break;

		case IP_MULTICAST_IF:
			/*
			 * This option is an internal special. The socket
			 * level IP_MULTICAST_IF specifies an 'ipaddr' and
			 * is handled in ip_opt_set_ipif. IPV6_MULTICAST_IF
			 * specifies an ifindex and we try first on V6 ill's.
			 * If we don't find one, we then try on v4 ill's
			 * internally and we come here.
			 */
			if (!checkonly && ill != NULL) {
				ipif_t	*ipif;
				ipif = ill->ill_ipif;

				if (ipif->ipif_state_flags & IPIF_CONDEMNED) {
					mutex_exit(&ill->ill_lock);
					mutex_exit(&connp->conn_lock);
					ill_refrele(ill);
					ill = NULL;
					mutex_enter(&connp->conn_lock);
				} else {
					connp->conn_multicast_ipif = ipif;
				}
			}
			break;
		}
	} else {
		switch (option) {
		case IPV6_BOUND_IF:
			connp->conn_incoming_ill = ill;
			connp->conn_outgoing_ill = ill;
			connp->conn_orig_bound_ifindex = (ill == NULL) ?
			    0 : ifindex;
			break;

		case IPV6_BOUND_PIF:
			/*
			 * Limit all transmit to this ill.
			 * Unlike IPV6_BOUND_IF, using this option
			 * prevents load spreading and failover from
			 * happening when the interface is part of the
			 * group. That's why we don't need to remember
			 * the ifindex in orig_bound_ifindex as in
			 * IPV6_BOUND_IF.
			 */
			connp->conn_outgoing_pill = ill;
			break;

		case IPV6_DONTFAILOVER_IF:
			/*
			 * This option is used by in.mpathd to ensure
			 * that IPMP probe packets only go out on the
			 * test interfaces. in.mpathd sets this option
			 * on the non-failover interfaces.
			 */
			connp->conn_nofailover_ill = ill;
			/*
			 * For backward compatibility, this option
			 * implicitly sets ip_multicast_ill as used in
			 * IP_MULTICAST_IF so that ip_wput gets
			 * this ipif to send mcast packets.
			 */
			connp->conn_multicast_ill = ill;
			connp->conn_orig_multicast_ifindex = (ill == NULL) ?
			    0 : ifindex;
			break;

		case IPV6_MULTICAST_IF:
			/*
			 * Set conn_multicast_ill to be the IPv6 ill.
			 * Set conn_multicast_ipif to be an IPv4 ipif
			 * for ifindex to make IPv4 mapped addresses
			 * on PF_INET6 sockets honor IPV6_MULTICAST_IF.
			 * Even if no IPv6 ill exists for the ifindex
			 * we need to check for an IPv4 ifindex in order
			 * for this to work with mapped addresses. In that
			 * case only set conn_multicast_ipif.
			 */
			if (!checkonly) {
				if (ifindex == 0) {
					connp->conn_multicast_ill = NULL;
					connp->conn_orig_multicast_ifindex = 0;
					connp->conn_multicast_ipif = NULL;
				} else if (ill != NULL) {
					connp->conn_multicast_ill = ill;
					connp->conn_orig_multicast_ifindex =
					    ifindex;
				}
			}
			break;
		}
	}

	if (ill != NULL) {
		mutex_exit(&ill->ill_lock);
		mutex_exit(&connp->conn_lock);
		ill_refrele(ill);
		return (0);
	}
	mutex_exit(&connp->conn_lock);
	/*
	 * We succeeded in clearing the option (ifindex == 0) or failed to
	 * locate the ill and could not set the option (ifindex != 0)
	 */
	return (ifindex == 0 ? 0 : EINVAL);
}

/* This routine sets socket options.
 * Returns 0 on success, a positive errno on failure, or a negative value
 * (-EINVAL) when the option is not handled at this level and must be
 * processed by the module above IP.  EINPROGRESS is a soft error: the
 * operation has been queued for restart and *outlenp is left untouched.
 */
/* ARGSUSED */
int
ip_opt_set(queue_t *q, uint_t optset_context, int level, int name,
    uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
    void *dummy, cred_t *cr, mblk_t *first_mp)
{
	int	*i1 = (int *)invalp;
	conn_t	*connp = Q_TO_CONN(q);
	int	error = 0;
	boolean_t checkonly;
	ire_t	*ire;
	boolean_t found;

	switch (optset_context) {

	case SETFN_OPTCOM_CHECKONLY:
		checkonly = B_TRUE;
		/*
		 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
		 * inlen != 0 implies value supplied and
		 * we have to "pretend" to set it.
		 * inlen == 0 implies that there is no
		 * value part in T_CHECK request and just validation
		 * done elsewhere should be enough, we just return here.
		 */
		if (inlen == 0) {
			*outlenp = 0;
			return (0);
		}
		break;
	case SETFN_OPTCOM_NEGOTIATE:
	case SETFN_UD_NEGOTIATE:
	case SETFN_CONN_NEGOTIATE:
		checkonly = B_FALSE;
		break;
	default:
		/*
		 * We should never get here
		 */
		*outlenp = 0;
		return (EINVAL);
	}

	ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
	    (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));

	/*
	 * For fixed length options, no sanity check
	 * of passed in length is done. It is assumed *_optcom_req()
	 * routines do the right thing.
	 */

	switch (level) {
	case SOL_SOCKET:
		/*
		 * conn_lock protects the bitfields, and is used to
		 * set the fields atomically.
		 */
		switch (name) {
		case SO_BROADCAST:
			if (!checkonly) {
				/* TODO: use value someplace? */
				mutex_enter(&connp->conn_lock);
				connp->conn_broadcast = *i1 ? 1 : 0;
				mutex_exit(&connp->conn_lock);
			}
			break;	/* goto sizeof (int) option return */
		case SO_USELOOPBACK:
			if (!checkonly) {
				/* TODO: use value someplace? */
				mutex_enter(&connp->conn_lock);
				connp->conn_loopback = *i1 ? 1 : 0;
				mutex_exit(&connp->conn_lock);
			}
			break;	/* goto sizeof (int) option return */
		case SO_DONTROUTE:
			if (!checkonly) {
				mutex_enter(&connp->conn_lock);
				connp->conn_dontroute = *i1 ? 1 : 0;
				mutex_exit(&connp->conn_lock);
			}
			break;	/* goto sizeof (int) option return */
		case SO_REUSEADDR:
			if (!checkonly) {
				mutex_enter(&connp->conn_lock);
				connp->conn_reuseaddr = *i1 ? 1 : 0;
				mutex_exit(&connp->conn_lock);
			}
			break;	/* goto sizeof (int) option return */
		case SO_PROTOTYPE:
			if (!checkonly) {
				mutex_enter(&connp->conn_lock);
				connp->conn_proto = *i1;
				mutex_exit(&connp->conn_lock);
			}
			break;	/* goto sizeof (int) option return */
		case SO_ANON_MLP:
			if (!checkonly) {
				mutex_enter(&connp->conn_lock);
				connp->conn_anon_mlp = *i1 != 0 ? 1 : 0;
				mutex_exit(&connp->conn_lock);
			}
			break;	/* goto sizeof (int) option return */
		case SO_MAC_EXEMPT:
			/*
			 * Requires MAC-aware privilege and may not be
			 * changed once the conn is bound.
			 */
			if (secpolicy_net_mac_aware(cr) != 0 ||
			    IPCL_IS_BOUND(connp))
				return (EACCES);
			if (!checkonly) {
				mutex_enter(&connp->conn_lock);
				connp->conn_mac_exempt = *i1 != 0 ? 1 : 0;
				mutex_exit(&connp->conn_lock);
			}
			break;	/* goto sizeof (int) option return */
		default:
			/*
			 * "soft" error (negative)
			 * option not handled at this level
			 * Note: Do not modify *outlenp
			 */
			return (-EINVAL);
		}
		break;
	case IPPROTO_IP:
		switch (name) {
		case IP_NEXTHOP:
		case IP_MULTICAST_IF:
		case IP_DONTFAILOVER_IF: {
			/* These options take an IPv4 address argument. */
			ipaddr_t addr = *i1;

			error = ip_opt_set_ipif(connp, addr, checkonly, name,
			    first_mp);
			if (error != 0)
				return (error);
			break;	/* goto sizeof (int) option return */
		}

		case IP_MULTICAST_TTL:
			/* Recorded in transport above IP */
			*outvalp = *invalp;
			*outlenp = sizeof (uchar_t);
			return (0);
		case IP_MULTICAST_LOOP:
			if (!checkonly) {
				mutex_enter(&connp->conn_lock);
				connp->conn_multicast_loop = *invalp ? 1 : 0;
				mutex_exit(&connp->conn_lock);
			}
			*outvalp = *invalp;
			*outlenp = sizeof (uchar_t);
			return (0);
		case IP_ADD_MEMBERSHIP:
		case MCAST_JOIN_GROUP:
		case IP_DROP_MEMBERSHIP:
		case MCAST_LEAVE_GROUP: {
			struct ip_mreq *mreqp;
			struct group_req *greqp;
			ire_t *ire;
			boolean_t done = B_FALSE;
			ipaddr_t group, ifaddr;
			struct sockaddr_in *sin;
			uint32_t *ifindexp;
			boolean_t mcast_opt = B_TRUE;
			mcast_record_t fmode;
			int (*optfn)(conn_t *, boolean_t, ipaddr_t, ipaddr_t,
			    uint_t *, mcast_record_t, ipaddr_t, mblk_t *);

			/*
			 * Map the four option names onto a filter mode and
			 * an add/delete function; the IP_* variants use the
			 * old struct ip_mreq, the MCAST_* ones struct
			 * group_req.
			 */
			switch (name) {
			case IP_ADD_MEMBERSHIP:
				mcast_opt = B_FALSE;
				/* FALLTHRU */
			case MCAST_JOIN_GROUP:
				fmode = MODE_IS_EXCLUDE;
				optfn = ip_opt_add_group;
				break;

			case IP_DROP_MEMBERSHIP:
				mcast_opt = B_FALSE;
				/* FALLTHRU */
			case MCAST_LEAVE_GROUP:
				fmode = MODE_IS_INCLUDE;
				optfn = ip_opt_delete_group;
				break;
			}

			if (mcast_opt) {
				greqp = (struct group_req *)i1;
				sin = (struct sockaddr_in *)&greqp->gr_group;
				if (sin->sin_family != AF_INET) {
					*outlenp = 0;
					return (ENOPROTOOPT);
				}
				group = (ipaddr_t)sin->sin_addr.s_addr;
				ifaddr = INADDR_ANY;
				ifindexp = &greqp->gr_interface;
			} else {
				mreqp = (struct ip_mreq *)i1;
				group = (ipaddr_t)mreqp->imr_multiaddr.s_addr;
				ifaddr = (ipaddr_t)mreqp->imr_interface.s_addr;
				ifindexp = NULL;
			}

			/*
			 * In the multirouting case, we need to replicate
			 * the request on all interfaces that will take part
			 * in replication. We do so because multirouting is
			 * reflective, thus we will probably receive multi-
			 * casts on those interfaces.
			 * The ip_multirt_apply_membership() succeeds if the
			 * operation succeeds on at least one interface.
			 */
			ire = ire_ftable_lookup(group, IP_HOST_MASK, 0,
			    IRE_HOST, NULL, NULL, ALL_ZONES, 0, NULL,
			    MATCH_IRE_MASK | MATCH_IRE_TYPE);
			if (ire != NULL) {
				if (ire->ire_flags & RTF_MULTIRT) {
					error = ip_multirt_apply_membership(
					    optfn, ire, connp, checkonly, group,
					    fmode, INADDR_ANY, first_mp);
					done = B_TRUE;
				}
				ire_refrele(ire);
			}
			if (!done) {
				error = optfn(connp, checkonly, group, ifaddr,
				    ifindexp, fmode, INADDR_ANY, first_mp);
			}
			if (error) {
				/*
				 * EINPROGRESS is a soft error, needs retry
				 * so don't make *outlenp zero.
				 */
				if (error != EINPROGRESS)
					*outlenp = 0;
				return (error);
			}
			/* OK return - copy input buffer into output buffer */
			if (invalp != outvalp) {
				/* don't trust bcopy for identical src/dst */
				bcopy(invalp, outvalp, inlen);
			}
			*outlenp = inlen;
			return (0);
		}
		case IP_BLOCK_SOURCE:
		case IP_UNBLOCK_SOURCE:
		case IP_ADD_SOURCE_MEMBERSHIP:
		case IP_DROP_SOURCE_MEMBERSHIP:
		case MCAST_BLOCK_SOURCE:
		case MCAST_UNBLOCK_SOURCE:
		case MCAST_JOIN_SOURCE_GROUP:
		case MCAST_LEAVE_SOURCE_GROUP: {
			struct ip_mreq_source *imreqp;
			struct group_source_req *gsreqp;
			in_addr_t grp, src, ifaddr = INADDR_ANY;
			uint32_t ifindex = 0;
			mcast_record_t fmode;
			struct sockaddr_in *sin;
			ire_t *ire;
			boolean_t mcast_opt = B_TRUE, done = B_FALSE;
			int (*optfn)(conn_t *, boolean_t, ipaddr_t, ipaddr_t,
			    uint_t *, mcast_record_t, ipaddr_t, mblk_t *);

			switch (name) {
			case IP_BLOCK_SOURCE:
				mcast_opt = B_FALSE;
				/* FALLTHRU */
			case MCAST_BLOCK_SOURCE:
				fmode = MODE_IS_EXCLUDE;
				optfn = ip_opt_add_group;
				break;

			case IP_UNBLOCK_SOURCE:
				mcast_opt = B_FALSE;
				/* FALLTHRU */
			case MCAST_UNBLOCK_SOURCE:
				fmode = MODE_IS_EXCLUDE;
				optfn = ip_opt_delete_group;
				break;

			case IP_ADD_SOURCE_MEMBERSHIP:
				mcast_opt = B_FALSE;
				/* FALLTHRU */
			case MCAST_JOIN_SOURCE_GROUP:
				fmode = MODE_IS_INCLUDE;
				optfn = ip_opt_add_group;
				break;

			case IP_DROP_SOURCE_MEMBERSHIP:
				mcast_opt = B_FALSE;
				/* FALLTHRU */
			case MCAST_LEAVE_SOURCE_GROUP:
				fmode = MODE_IS_INCLUDE;
				optfn = ip_opt_delete_group;
				break;
			}

			if (mcast_opt) {
				gsreqp = (struct group_source_req *)i1;
				if (gsreqp->gsr_group.ss_family != AF_INET) {
					*outlenp = 0;
					return (ENOPROTOOPT);
				}
				sin = (struct sockaddr_in *)&gsreqp->gsr_group;
				grp = (ipaddr_t)sin->sin_addr.s_addr;
				sin = (struct sockaddr_in *)&gsreqp->gsr_source;
				src = (ipaddr_t)sin->sin_addr.s_addr;
				ifindex = gsreqp->gsr_interface;
			} else {
				imreqp = (struct ip_mreq_source *)i1;
				grp = (ipaddr_t)imreqp->imr_multiaddr.s_addr;
				src = (ipaddr_t)imreqp->imr_sourceaddr.s_addr;
				ifaddr = (ipaddr_t)imreqp->imr_interface.s_addr;
			}

			/*
			 * In the multirouting case, we need to replicate
			 * the request as noted in the mcast cases above.
			 */
			ire = ire_ftable_lookup(grp, IP_HOST_MASK, 0,
			    IRE_HOST, NULL, NULL, ALL_ZONES, 0, NULL,
			    MATCH_IRE_MASK | MATCH_IRE_TYPE);
			if (ire != NULL) {
				if (ire->ire_flags & RTF_MULTIRT) {
					error = ip_multirt_apply_membership(
					    optfn, ire, connp, checkonly, grp,
					    fmode, src, first_mp);
					done = B_TRUE;
				}
				ire_refrele(ire);
			}
			if (!done) {
				error = optfn(connp, checkonly, grp, ifaddr,
				    &ifindex, fmode, src, first_mp);
			}
			if (error != 0) {
				/*
				 * EINPROGRESS is a soft error, needs retry
				 * so don't make *outlenp zero.
				 */
				if (error != EINPROGRESS)
					*outlenp = 0;
				return (error);
			}
			/* OK return - copy input buffer into output buffer */
			if (invalp != outvalp) {
				bcopy(invalp, outvalp, inlen);
			}
			*outlenp = inlen;
			return (0);
		}
		case IP_SEC_OPT:
			error = ipsec_set_req(cr, connp, (ipsec_req_t *)invalp);
			if (error != 0) {
				*outlenp = 0;
				return (error);
			}
			break;
		case IP_HDRINCL:
		case IP_OPTIONS:
		case T_IP_OPTIONS:
		case IP_TOS:
		case T_IP_TOS:
		case IP_TTL:
		case IP_RECVDSTADDR:
		case IP_RECVOPTS:
			/* OK return - copy input buffer into output buffer */
			if (invalp != outvalp) {
				/* don't trust bcopy for identical src/dst */
				bcopy(invalp, outvalp, inlen);
			}
			*outlenp = inlen;
			return (0);
		case IP_RECVIF:
			/* Retrieve the inbound interface index */
			if (!checkonly) {
				mutex_enter(&connp->conn_lock);
				connp->conn_recvif = *i1 ? 1 : 0;
				mutex_exit(&connp->conn_lock);
			}
			break;	/* goto sizeof (int) option return */
		case IP_RECVSLLA:
			/* Retrieve the source link layer address */
			if (!checkonly) {
				mutex_enter(&connp->conn_lock);
				connp->conn_recvslla = *i1 ? 1 : 0;
				mutex_exit(&connp->conn_lock);
			}
			break;	/* goto sizeof (int) option return */
		case MRT_INIT:
		case MRT_DONE:
		case MRT_ADD_VIF:
		case MRT_DEL_VIF:
		case MRT_ADD_MFC:
		case MRT_DEL_MFC:
		case MRT_ASSERT:
			/* Multicast routing configuration needs privilege. */
			if ((error = secpolicy_net_config(cr, B_FALSE)) != 0) {
				*outlenp = 0;
				return (error);
			}
			error = ip_mrouter_set((int)name, q, checkonly,
			    (uchar_t *)invalp, inlen, first_mp);
			if (error) {
				*outlenp = 0;
				return (error);
			}
			/* OK return - copy input buffer into output buffer */
			if (invalp != outvalp) {
				/* don't trust bcopy for identical src/dst */
				bcopy(invalp, outvalp, inlen);
			}
			*outlenp = inlen;
			return (0);
		case IP_BOUND_IF:
		case IP_XMIT_IF:
			error = ip_opt_set_ill(connp, *i1, B_FALSE, checkonly,
			    level, name, first_mp);
			if (error != 0)
				return (error);
			break;	/* goto sizeof (int) option return */

		case IP_UNSPEC_SRC:
			/* Allow sending with a zero source address */
			if (!checkonly) {
				mutex_enter(&connp->conn_lock);
				connp->conn_unspec_src = *i1 ? 1 : 0;
				mutex_exit(&connp->conn_lock);
			}
			break;	/* goto sizeof (int) option return */
		default:
			/*
			 * "soft" error (negative)
			 * option not handled at this level
			 * Note: Do not modify *outlenp
			 */
			return (-EINVAL);
		}
		break;
	case IPPROTO_IPV6:
		switch (name) {
		case IPV6_BOUND_IF:
		case IPV6_BOUND_PIF:
		case IPV6_DONTFAILOVER_IF:
			error = ip_opt_set_ill(connp, *i1, B_TRUE, checkonly,
			    level, name, first_mp);
			if (error != 0)
				return (error);
			break;	/* goto sizeof (int) option return */

		case IPV6_MULTICAST_IF:
			/*
			 * The only possible errors are EINPROGRESS and
			 * EINVAL. EINPROGRESS will be restarted and is not
			 * a hard error. We call this option on both V4 and V6
			 * If both return EINVAL, then this call returns
			 * EINVAL. If at least one of them succeeds we
			 * return success.
			 */
			found = B_FALSE;
			error = ip_opt_set_ill(connp, *i1, B_TRUE, checkonly,
			    level, name, first_mp);
			if (error == EINPROGRESS)
				return (error);
			if (error == 0)
				found = B_TRUE;
			error = ip_opt_set_ill(connp, *i1, B_FALSE, checkonly,
			    IPPROTO_IP, IP_MULTICAST_IF, first_mp);
			if (error == 0)
				found = B_TRUE;
			if (!found)
				return (error);
			break;	/* goto sizeof (int) option return */

		case IPV6_MULTICAST_HOPS:
			/* Recorded in transport above IP */
			break;	/* goto sizeof (int) option return */
		case IPV6_MULTICAST_LOOP:
			if (!checkonly) {
				mutex_enter(&connp->conn_lock);
				connp->conn_multicast_loop = *i1;
				mutex_exit(&connp->conn_lock);
			}
			break;	/* goto sizeof (int) option return */
		case IPV6_JOIN_GROUP:
		case MCAST_JOIN_GROUP:
		case IPV6_LEAVE_GROUP:
		case MCAST_LEAVE_GROUP: {
			struct ipv6_mreq *ip_mreqp;
			struct group_req *greqp;
			ire_t *ire;
			boolean_t done = B_FALSE;
			in6_addr_t groupv6;
			uint32_t ifindex;
			boolean_t mcast_opt = B_TRUE;
			mcast_record_t fmode;
			int (*optfn)(conn_t *, boolean_t, const in6_addr_t *,
			    int, mcast_record_t, const in6_addr_t *, mblk_t *);

			switch (name) {
			case IPV6_JOIN_GROUP:
				mcast_opt = B_FALSE;
				/* FALLTHRU */
			case MCAST_JOIN_GROUP:
				fmode = MODE_IS_EXCLUDE;
				optfn = ip_opt_add_group_v6;
				break;

			case IPV6_LEAVE_GROUP:
				mcast_opt = B_FALSE;
				/* FALLTHRU */
			case MCAST_LEAVE_GROUP:
				fmode = MODE_IS_INCLUDE;
				optfn = ip_opt_delete_group_v6;
				break;
			}

			if (mcast_opt) {
				struct sockaddr_in *sin;
				struct sockaddr_in6 *sin6;
				greqp = (struct group_req *)i1;
				/*
				 * An AF_INET group is converted to a
				 * v4-mapped IPv6 address.
				 */
				if (greqp->gr_group.ss_family == AF_INET) {
					sin = (struct sockaddr_in *)
					    &(greqp->gr_group);
					IN6_INADDR_TO_V4MAPPED(&sin->sin_addr,
					    &groupv6);
				} else {
					sin6 = (struct sockaddr_in6 *)
					    &(greqp->gr_group);
					groupv6 = sin6->sin6_addr;
				}
				ifindex = greqp->gr_interface;
			} else {
				ip_mreqp = (struct ipv6_mreq *)i1;
				groupv6 = ip_mreqp->ipv6mr_multiaddr;
				ifindex = ip_mreqp->ipv6mr_interface;
			}
			/*
			 * In the multirouting case, we need to replicate
			 * the request on all interfaces that will take part
			 * in replication. We do so because multirouting is
			 * reflective, thus we will probably receive multi-
			 * casts on those interfaces.
			 * The ip_multirt_apply_membership_v6() succeeds if
			 * the operation succeeds on at least one interface.
			 */
			ire = ire_ftable_lookup_v6(&groupv6, &ipv6_all_ones, 0,
			    IRE_HOST, NULL, NULL, ALL_ZONES, 0, NULL,
			    MATCH_IRE_MASK | MATCH_IRE_TYPE);
			if (ire != NULL) {
				if (ire->ire_flags & RTF_MULTIRT) {
					error = ip_multirt_apply_membership_v6(
					    optfn, ire, connp, checkonly,
					    &groupv6, fmode, &ipv6_all_zeros,
					    first_mp);
					done = B_TRUE;
				}
				ire_refrele(ire);
			}
			if (!done) {
				error = optfn(connp, checkonly, &groupv6,
				    ifindex, fmode, &ipv6_all_zeros, first_mp);
			}
			if (error) {
				/*
				 * EINPROGRESS is a soft error, needs retry
				 * so don't make *outlenp zero.
				 */
				if (error != EINPROGRESS)
					*outlenp = 0;
				return (error);
			}
			/* OK return - copy input buffer into output buffer */
			if (invalp != outvalp) {
				/* don't trust bcopy for identical src/dst */
				bcopy(invalp, outvalp, inlen);
			}
			*outlenp = inlen;
			return (0);
		}
		case MCAST_BLOCK_SOURCE:
		case MCAST_UNBLOCK_SOURCE:
		case MCAST_JOIN_SOURCE_GROUP:
		case MCAST_LEAVE_SOURCE_GROUP: {
			struct group_source_req *gsreqp;
			in6_addr_t v6grp, v6src;
			uint32_t ifindex;
			mcast_record_t fmode;
			ire_t *ire;
			boolean_t done = B_FALSE;
			int (*optfn)(conn_t *, boolean_t, const in6_addr_t *,
			    int, mcast_record_t, const in6_addr_t *, mblk_t *);

			switch (name) {
			case MCAST_BLOCK_SOURCE:
				fmode = MODE_IS_EXCLUDE;
				optfn = ip_opt_add_group_v6;
				break;
			case MCAST_UNBLOCK_SOURCE:
				fmode = MODE_IS_EXCLUDE;
				optfn = ip_opt_delete_group_v6;
				break;
			case MCAST_JOIN_SOURCE_GROUP:
				fmode = MODE_IS_INCLUDE;
				optfn = ip_opt_add_group_v6;
				break;
			case MCAST_LEAVE_SOURCE_GROUP:
				fmode = MODE_IS_INCLUDE;
				optfn = ip_opt_delete_group_v6;
				break;
			}

			gsreqp = (struct group_source_req *)i1;
			ifindex = gsreqp->gsr_interface;
			if (gsreqp->gsr_group.ss_family == AF_INET) {
				/* v4-mapped both the group and the source */
				struct sockaddr_in *s;
				s = (struct sockaddr_in *)&gsreqp->gsr_group;
				IN6_INADDR_TO_V4MAPPED(&s->sin_addr, &v6grp);
				s = (struct sockaddr_in *)&gsreqp->gsr_source;
				IN6_INADDR_TO_V4MAPPED(&s->sin_addr, &v6src);
			} else {
				struct sockaddr_in6 *s6;
				s6 = (struct sockaddr_in6 *)&gsreqp->gsr_group;
				v6grp = s6->sin6_addr;
				s6 = (struct sockaddr_in6 *)&gsreqp->gsr_source;
				v6src = s6->sin6_addr;
			}

			/*
			 * In the multirouting case, we need to replicate
			 * the request as noted in the mcast cases above.
			 */
			ire = ire_ftable_lookup_v6(&v6grp, &ipv6_all_ones, 0,
			    IRE_HOST, NULL, NULL, ALL_ZONES, 0, NULL,
			    MATCH_IRE_MASK | MATCH_IRE_TYPE);
			if (ire != NULL) {
				if (ire->ire_flags & RTF_MULTIRT) {
					error = ip_multirt_apply_membership_v6(
					    optfn, ire, connp, checkonly,
					    &v6grp, fmode, &v6src, first_mp);
					done = B_TRUE;
				}
				ire_refrele(ire);
			}
			if (!done) {
				error = optfn(connp, checkonly, &v6grp,
				    ifindex, fmode, &v6src, first_mp);
			}
			if (error != 0) {
				/*
				 * EINPROGRESS is a soft error, needs retry
				 * so don't make *outlenp zero.
				 */
				if (error != EINPROGRESS)
					*outlenp = 0;
				return (error);
			}
			/* OK return - copy input buffer into output buffer */
			if (invalp != outvalp) {
				bcopy(invalp, outvalp, inlen);
			}
			*outlenp = inlen;
			return (0);
		}
		case IPV6_UNICAST_HOPS:
			/* Recorded in transport above IP */
			break;	/* goto sizeof (int) option return */
		case IPV6_UNSPEC_SRC:
			/* Allow sending with a zero source address */
			if (!checkonly) {
				mutex_enter(&connp->conn_lock);
				connp->conn_unspec_src = *i1 ? 1 : 0;
				mutex_exit(&connp->conn_lock);
			}
			break;	/* goto sizeof (int) option return */
		case IPV6_RECVPKTINFO:
			if (!checkonly) {
				mutex_enter(&connp->conn_lock);
				connp->conn_ipv6_recvpktinfo = *i1 ? 1 : 0;
				mutex_exit(&connp->conn_lock);
			}
			break;	/* goto sizeof (int) option return */
		case IPV6_RECVTCLASS:
			if (!checkonly) {
				if (*i1 < 0 || *i1 > 1) {
					return (EINVAL);
				}
				mutex_enter(&connp->conn_lock);
				connp->conn_ipv6_recvtclass = *i1;
				mutex_exit(&connp->conn_lock);
			}
			break;
		case IPV6_RECVPATHMTU:
			if (!checkonly) {
				if (*i1 < 0 || *i1 > 1) {
					return (EINVAL);
				}
				mutex_enter(&connp->conn_lock);
				connp->conn_ipv6_recvpathmtu = *i1;
				mutex_exit(&connp->conn_lock);
			}
			break;
		case IPV6_RECVHOPLIMIT:
			if (!checkonly) {
				mutex_enter(&connp->conn_lock);
				connp->conn_ipv6_recvhoplimit = *i1 ? 1 : 0;
				mutex_exit(&connp->conn_lock);
			}
			break;	/* goto sizeof (int) option return */
		case IPV6_RECVHOPOPTS:
			if (!checkonly) {
				mutex_enter(&connp->conn_lock);
				connp->conn_ipv6_recvhopopts = *i1 ? 1 : 0;
				mutex_exit(&connp->conn_lock);
			}
			break;	/* goto sizeof (int) option return */
		case IPV6_RECVDSTOPTS:
			if (!checkonly) {
				mutex_enter(&connp->conn_lock);
				connp->conn_ipv6_recvdstopts = *i1 ? 1 : 0;
				mutex_exit(&connp->conn_lock);
			}
			break;	/* goto sizeof (int) option return */
		case IPV6_RECVRTHDR:
			if (!checkonly) {
				mutex_enter(&connp->conn_lock);
				connp->conn_ipv6_recvrthdr = *i1 ? 1 : 0;
				mutex_exit(&connp->conn_lock);
			}
			break;	/* goto sizeof (int) option return */
		case IPV6_RECVRTHDRDSTOPTS:
			if (!checkonly) {
				mutex_enter(&connp->conn_lock);
				connp->conn_ipv6_recvrtdstopts = *i1 ? 1 : 0;
				mutex_exit(&connp->conn_lock);
			}
			break;	/* goto sizeof (int) option return */
		case IPV6_PKTINFO:
			if (inlen == 0)
				return (-EINVAL);	/* clearing option */
			error = ip6_set_pktinfo(cr, connp,
			    (struct in6_pktinfo *)invalp, first_mp);
			if (error != 0)
				*outlenp = 0;
			else
				*outlenp = inlen;
			return (error);
		case IPV6_NEXTHOP: {
			struct sockaddr_in6 *sin6;

			/* Verify that the nexthop is reachable */
			if (inlen == 0)
				return (-EINVAL);	/* clearing option */

			sin6 = (struct sockaddr_in6 *)invalp;
			ire = ire_route_lookup_v6(&sin6->sin6_addr,
			    0, 0, 0, NULL, NULL, connp->conn_zoneid,
			    NULL, MATCH_IRE_DEFAULT);

			if (ire == NULL) {
				*outlenp = 0;
				return (EHOSTUNREACH);
			}
			ire_refrele(ire);
			/*
			 * Reachability verified; hand the option back up as
			 * "not handled at this level" (soft error).
			 */
			return (-EINVAL);
		}
		case IPV6_SEC_OPT:
			error = ipsec_set_req(cr, connp, (ipsec_req_t *)invalp);
			if (error != 0) {
				*outlenp = 0;
				return (error);
			}
			break;
		case IPV6_SRC_PREFERENCES: {
			/*
			 * This is implemented strictly in the ip module
			 * (here and in tcp_opt_*() to accommodate tcp
			 * sockets). Modules above ip pass this option
			 * down here since ip is the only one that needs to
			 * be aware of source address preferences.
			 *
			 * This socket option only affects connected
			 * sockets that haven't already bound to a specific
			 * IPv6 address. In other words, sockets that
			 * don't call bind() with an address other than the
			 * unspecified address and that call connect().
			 * ip_bind_connected_v6() passes these preferences
			 * to the ipif_select_source_v6() function.
			 */
			if (inlen != sizeof (uint32_t))
				return (EINVAL);
			error = ip6_set_src_preferences(connp,
			    *(uint32_t *)invalp);
			if (error != 0) {
				*outlenp = 0;
				return (error);
			} else {
				*outlenp = sizeof (uint32_t);
			}
			break;
		}
		case IPV6_V6ONLY:
			if (*i1 < 0 || *i1 > 1) {
				return (EINVAL);
			}
			mutex_enter(&connp->conn_lock);
			connp->conn_ipv6_v6only = *i1;
			mutex_exit(&connp->conn_lock);
			break;
		default:
			return (-EINVAL);
		}
		break;
	default:
		/*
		 * "soft" error (negative)
		 * option not handled at this level
		 * Note: Do not modify *outlenp
		 */
		return (-EINVAL);
	}
	/*
	 * Common case of return from an option that is sizeof (int)
	 */
	*(int *)outvalp = *i1;
	*outlenp = sizeof (int);
	return (0);
}

/*
 * This routine gets default values of certain options whose default
 * values are maintained by protocol specific code.
 * Returns the size of the default value written at `ptr', or -1 when the
 * option has no default maintained here.
 */
/* ARGSUSED */
int
ip_opt_default(queue_t *q, int level, int name, uchar_t *ptr)
{
	int *i1 = (int *)ptr;

	switch (level) {
	case IPPROTO_IP:
		switch (name) {
		case IP_MULTICAST_TTL:
			*ptr = (uchar_t)IP_DEFAULT_MULTICAST_TTL;
			return (sizeof (uchar_t));
		case IP_MULTICAST_LOOP:
			*ptr = (uchar_t)IP_DEFAULT_MULTICAST_LOOP;
			return (sizeof (uchar_t));
		default:
			return (-1);
		}
	case IPPROTO_IPV6:
		switch (name) {
		case IPV6_UNICAST_HOPS:
			*i1 = ipv6_def_hops;
			return (sizeof (int));
		case IPV6_MULTICAST_HOPS:
			*i1 = IP_DEFAULT_MULTICAST_TTL;
			return (sizeof (int));
		case IPV6_MULTICAST_LOOP:
			*i1 = IP_DEFAULT_MULTICAST_LOOP;
			return (sizeof (int));
		case IPV6_V6ONLY:
			*i1 = 1;
			return (sizeof (int));
		default:
			return (-1);
		}
	default:
		return (-1);
	}
	/* NOTREACHED */
}

/*
 * Given a destination address and a pointer to where to put the information
 * this routine fills in the mtuinfo.  Returns -1 for an unspecified
 * destination, otherwise sizeof (struct ip6_mtuinfo).  If no cached IRE is
 * found for the destination, the minimum IPv6 MTU is reported.
 */
int
ip_fill_mtuinfo(struct in6_addr *in6, in_port_t port,
    struct ip6_mtuinfo *mtuinfo)
{
	ire_t *ire;

	if (IN6_IS_ADDR_UNSPECIFIED(in6))
		return (-1);

	bzero(mtuinfo, sizeof (*mtuinfo));
	mtuinfo->ip6m_addr.sin6_family = AF_INET6;
	mtuinfo->ip6m_addr.sin6_port = port;
	mtuinfo->ip6m_addr.sin6_addr = *in6;

	ire = ire_cache_lookup_v6(in6, ALL_ZONES, NULL);
	if (ire != NULL) {
		mtuinfo->ip6m_mtu = ire->ire_max_frag;
		ire_refrele(ire);
	} else {
		mtuinfo->ip6m_mtu = IPV6_MIN_MTU;
	}
	return (sizeof (struct ip6_mtuinfo));
}

/*
 * This routine gets socket options. For MRT_VERSION and MRT_ASSERT, error
 * checking of GET_QUEUE_CRED(q) and that ip_g_mrouter is set should be done and
 * isn't. This doesn't matter as the error checking is done properly for the
 * other MRT options coming in through ip_opt_set.
 * Returns the (positive) size of the value written at `ptr', 0 when the
 * option is recognized but currently unset (IP_NEXTHOP), or -1 when the
 * option is not handled at this level.
 */
int
ip_opt_get(queue_t *q, int level, int name, uchar_t *ptr)
{
	conn_t		*connp = Q_TO_CONN(q);
	ipsec_req_t	*req = (ipsec_req_t *)ptr;

	switch (level) {
	case IPPROTO_IP:
		switch (name) {
		case MRT_VERSION:
		case MRT_ASSERT:
			(void) ip_mrouter_get(name, q, ptr);
			return (sizeof (int));
		case IP_SEC_OPT:
			return (ipsec_req_from_conn(connp, req, IPSEC_AF_V4));
		case IP_NEXTHOP:
			if (connp->conn_nexthop_set) {
				*(ipaddr_t *)ptr = connp->conn_nexthop_v4;
				return (sizeof (ipaddr_t));
			} else
				return (0);
		default:
			break;
		}
		break;
	case IPPROTO_IPV6:
		switch (name) {
		case IPV6_SEC_OPT:
			return (ipsec_req_from_conn(connp, req, IPSEC_AF_V6));
		case IPV6_SRC_PREFERENCES: {
			return (ip6_get_src_preferences(connp,
			    (uint32_t *)ptr));
		}
		case IPV6_V6ONLY:
			*(int *)ptr = connp->conn_ipv6_v6only ? 1 : 0;
			return (sizeof (int));
		case IPV6_PATHMTU:
			/* Path MTU toward the peer of this conn */
			return (ip_fill_mtuinfo(&connp->conn_remv6, 0,
			    (struct ip6_mtuinfo *)ptr));
		default:
			break;
		}
		break;
	default:
		break;
	}
	return (-1);
}

/* Named Dispatch routine to get a current value out of our parameter table. */
/* ARGSUSED */
static int
ip_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *ioc_cr)
{
	ipparam_t *ippa = (ipparam_t *)cp;

	(void) mi_mpprintf(mp, "%d", ippa->ip_param_value);
	return (0);
}

/*
 * Named Dispatch get routine for parameters that are simply a bare int
 * (no ipparam_t wrapper); `cp' points directly at the int.
 */
/* ARGSUSED */
static int
ip_param_generic_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *ioc_cr)
{

	(void) mi_mpprintf(mp, "%d", *(int *)cp);
	return (0);
}

/*
 * Set ip{,6}_forwarding values. This means walking through all of the
 * ill's and toggling their forwarding values.
10802 */ 10803 /* ARGSUSED */ 10804 static int 10805 ip_forward_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *ioc_cr) 10806 { 10807 long new_value; 10808 int *forwarding_value = (int *)cp; 10809 ill_t *walker; 10810 boolean_t isv6 = (forwarding_value == &ipv6_forward); 10811 ill_walk_context_t ctx; 10812 10813 if (ddi_strtol(value, NULL, 10, &new_value) != 0 || 10814 new_value < 0 || new_value > 1) { 10815 return (EINVAL); 10816 } 10817 10818 *forwarding_value = new_value; 10819 10820 /* 10821 * Regardless of the current value of ip_forwarding, set all per-ill 10822 * values of ip_forwarding to the value being set. 10823 * 10824 * Bring all the ill's up to date with the new global value. 10825 */ 10826 rw_enter(&ill_g_lock, RW_READER); 10827 10828 if (isv6) 10829 walker = ILL_START_WALK_V6(&ctx); 10830 else 10831 walker = ILL_START_WALK_V4(&ctx); 10832 for (; walker != NULL; walker = ill_next(&ctx, walker)) { 10833 (void) ill_forward_set(q, mp, (new_value != 0), 10834 (caddr_t)walker); 10835 } 10836 rw_exit(&ill_g_lock); 10837 10838 return (0); 10839 } 10840 10841 /* 10842 * Walk through the param array specified registering each element with the 10843 * Named Dispatch handler. This is called only during init. 
So it is ok 10844 * not to acquire any locks 10845 */ 10846 static boolean_t 10847 ip_param_register(ipparam_t *ippa, size_t ippa_cnt, 10848 ipndp_t *ipnd, size_t ipnd_cnt) 10849 { 10850 for (; ippa_cnt-- > 0; ippa++) { 10851 if (ippa->ip_param_name && ippa->ip_param_name[0]) { 10852 if (!nd_load(&ip_g_nd, ippa->ip_param_name, 10853 ip_param_get, ip_param_set, (caddr_t)ippa)) { 10854 nd_free(&ip_g_nd); 10855 return (B_FALSE); 10856 } 10857 } 10858 } 10859 10860 for (; ipnd_cnt-- > 0; ipnd++) { 10861 if (ipnd->ip_ndp_name && ipnd->ip_ndp_name[0]) { 10862 if (!nd_load(&ip_g_nd, ipnd->ip_ndp_name, 10863 ipnd->ip_ndp_getf, ipnd->ip_ndp_setf, 10864 ipnd->ip_ndp_data)) { 10865 nd_free(&ip_g_nd); 10866 return (B_FALSE); 10867 } 10868 } 10869 } 10870 10871 return (B_TRUE); 10872 } 10873 10874 /* Named Dispatch routine to negotiate a new value for one of our parameters. */ 10875 /* ARGSUSED */ 10876 static int 10877 ip_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *ioc_cr) 10878 { 10879 long new_value; 10880 ipparam_t *ippa = (ipparam_t *)cp; 10881 10882 if (ddi_strtol(value, NULL, 10, &new_value) != 0 || 10883 new_value < ippa->ip_param_min || new_value > ippa->ip_param_max) { 10884 return (EINVAL); 10885 } 10886 ippa->ip_param_value = new_value; 10887 return (0); 10888 } 10889 10890 /* 10891 * Handles both IPv4 and IPv6 reassembly - doing the out-of-order cases, 10892 * When an ipf is passed here for the first time, if 10893 * we already have in-order fragments on the queue, we convert from the fast- 10894 * path reassembly scheme to the hard-case scheme. From then on, additional 10895 * fragments are reassembled here. We keep track of the start and end offsets 10896 * of each piece, and the number of holes in the chain. When the hole count 10897 * goes to zero, we are done! 
 *
 * The ipf_count will be updated to account for any mblk(s) added (pointed to
 * by mp) or subtracted (freeb()ed dups), upon return the caller must update
 * ipfb_count and ill_frag_count by the difference of ipf_count before and
 * after the call to ip_reassemble().
 */
/*
 * Returns one of:
 *	IP_REASS_COMPLETE - hole count reached zero; caller has the whole
 *	    datagram hanging off ipf->ipf_mp->b_cont.
 *	IP_REASS_PARTIAL  - fragment accepted but holes remain.
 *	IP_REASS_FAILED   - conflicting "last fragment" information or a
 *	    fragment beyond the known end; caller must tear down the list.
 * 'start' is the byte offset of this fragment within the datagram; 'more'
 * is the MF indication; 'msg_len' is the buffer space consumed by 'mp'
 * (already added to ipf_count below).
 * NOTE(review): callers appear responsible for any locking of the ipf
 * chain; no locks are taken here - confirm against caller context.
 */
int
ip_reassemble(mblk_t *mp, ipf_t *ipf, uint_t start, boolean_t more, ill_t *ill,
    size_t msg_len)
{
	uint_t	end;
	mblk_t	*next_mp;
	mblk_t	*mp1;
	uint_t	offset;
	boolean_t incr_dups = B_TRUE;
	boolean_t offset_zero_seen = B_FALSE;
	boolean_t pkt_boundary_checked = B_FALSE;

	/* If start == 0 then ipf_nf_hdr_len has to be set. */
	ASSERT(start != 0 || ipf->ipf_nf_hdr_len != 0);

	/* Add in byte count */
	ipf->ipf_count += msg_len;
	if (ipf->ipf_end) {
		/*
		 * We were part way through in-order reassembly, but now there
		 * is a hole.  We walk through messages already queued, and
		 * mark them for hard case reassembly.  We know that up till
		 * now they were in order starting from offset zero.
		 */
		offset = 0;
		for (mp1 = ipf->ipf_mp->b_cont; mp1; mp1 = mp1->b_cont) {
			IP_REASS_SET_START(mp1, offset);
			if (offset == 0) {
				ASSERT(ipf->ipf_nf_hdr_len != 0);
				/*
				 * First mblk carries the unfragmentable
				 * header; bias the running offset so the
				 * header bytes don't count as payload.
				 */
				offset = -ipf->ipf_nf_hdr_len;
			}
			offset += mp1->b_wptr - mp1->b_rptr;
			IP_REASS_SET_END(mp1, offset);
		}
		/* One hole at the end. */
		ipf->ipf_hole_cnt = 1;
		/* Brand it as a hard case, forever. */
		ipf->ipf_end = 0;
	}
	/* Walk through all the new pieces. */
	do {
		end = start + (mp->b_wptr - mp->b_rptr);
		/*
		 * If start is 0, decrease 'end' only for the first mblk of
		 * the fragment. Otherwise 'end' can get wrong value in the
		 * second pass of the loop if first mblk is exactly the
		 * size of ipf_nf_hdr_len.
		 */
		if (start == 0 && !offset_zero_seen) {
			/* First segment */
			ASSERT(ipf->ipf_nf_hdr_len != 0);
			end -= ipf->ipf_nf_hdr_len;
			offset_zero_seen = B_TRUE;
		}
		next_mp = mp->b_cont;
		/*
		 * We are checking to see if there is any interesing data
		 * to process.  If there isn't and the mblk isn't the
		 * one which carries the unfragmentable header then we
		 * drop it.  It's possible to have just the unfragmentable
		 * header come through without any data.  That needs to be
		 * saved.
		 *
		 * If the assert at the top of this function holds then the
		 * term "ipf->ipf_nf_hdr_len != 0" isn't needed.  This code
		 * is infrequently traveled enough that the test is left in
		 * to protect against future code changes which break that
		 * invariant.
		 */
		if (start == end && start != 0 && ipf->ipf_nf_hdr_len != 0) {
			/* Empty.  Blast it. */
			IP_REASS_SET_START(mp, 0);
			IP_REASS_SET_END(mp, 0);
			/*
			 * If the ipf points to the mblk we are about to free,
			 * update ipf to point to the next mblk (or NULL
			 * if none).
			 */
			if (ipf->ipf_mp->b_cont == mp)
				ipf->ipf_mp->b_cont = next_mp;
			freeb(mp);
			continue;
		}
		mp->b_cont = NULL;
		IP_REASS_SET_START(mp, start);
		IP_REASS_SET_END(mp, end);
		if (!ipf->ipf_tail_mp) {
			/* First piece queued for the hard case. */
			ipf->ipf_tail_mp = mp;
			ipf->ipf_mp->b_cont = mp;
			if (start == 0 || !more) {
				ipf->ipf_hole_cnt = 1;
				/*
				 * if the first fragment comes in more than one
				 * mblk, this loop will be executed for each
				 * mblk. Need to adjust hole count so exiting
				 * this routine will leave hole count at 1.
				 */
				if (next_mp)
					ipf->ipf_hole_cnt++;
			} else
				ipf->ipf_hole_cnt = 2;
			continue;
		} else if (ipf->ipf_last_frag_seen && !more &&
		    !pkt_boundary_checked) {
			/*
			 * We check datagram boundary only if this fragment
			 * claims to be the last fragment and we have seen a
			 * last fragment in the past too. We do this only
			 * once for a given fragment.
			 *
			 * start cannot be 0 here as fragments with start=0
			 * and MF=0 gets handled as a complete packet. These
			 * fragments should not reach here.
			 */

			if (start + msgdsize(mp) !=
			    IP_REASS_END(ipf->ipf_tail_mp)) {
				/*
				 * We have two fragments both of which claim
				 * to be the last fragment but gives conflicting
				 * information about the whole datagram size.
				 * Something fishy is going on. Drop the
				 * fragment and free up the reassembly list.
				 */
				return (IP_REASS_FAILED);
			}

			/*
			 * We shouldn't come to this code block again for this
			 * particular fragment.
			 */
			pkt_boundary_checked = B_TRUE;
		}

		/* New stuff at or beyond tail? */
		offset = IP_REASS_END(ipf->ipf_tail_mp);
		if (start >= offset) {
			if (ipf->ipf_last_frag_seen) {
				/* current fragment is beyond last fragment */
				return (IP_REASS_FAILED);
			}
			/* Link it on end. */
			ipf->ipf_tail_mp->b_cont = mp;
			ipf->ipf_tail_mp = mp;
			if (more) {
				if (start != offset)
					ipf->ipf_hole_cnt++;
			} else if (start == offset && next_mp == NULL)
				ipf->ipf_hole_cnt--;
			continue;
		}
		mp1 = ipf->ipf_mp->b_cont;
		offset = IP_REASS_START(mp1);
		/* New stuff at the front? */
		if (start < offset) {
			if (start == 0) {
				if (end >= offset) {
					/* Nailed the hole at the begining. */
					ipf->ipf_hole_cnt--;
				}
			} else if (end < offset) {
				/*
				 * A hole, stuff, and a hole where there used
				 * to be just a hole.
				 */
				ipf->ipf_hole_cnt++;
			}
			mp->b_cont = mp1;
			/* Check for overlap. */
			while (end > offset) {
				if (end < IP_REASS_END(mp1)) {
					/* Trim the overlap off 'mp'. */
					mp->b_wptr -= end - offset;
					IP_REASS_SET_END(mp, offset);
					if (ill->ill_isv6) {
						BUMP_MIB(ill->ill_ip6_mib,
						    ipv6ReasmPartDups);
					} else {
						BUMP_MIB(&ip_mib,
						    ipReasmPartDups);
					}
					break;
				}
				/* Did we cover another hole? */
				if ((mp1->b_cont &&
				    IP_REASS_END(mp1) !=
				    IP_REASS_START(mp1->b_cont) &&
				    end >= IP_REASS_START(mp1->b_cont)) ||
				    (!ipf->ipf_last_frag_seen && !more)) {
					ipf->ipf_hole_cnt--;
				}
				/* Clip out mp1. */
				if ((mp->b_cont = mp1->b_cont) == NULL) {
					/*
					 * After clipping out mp1, this guy
					 * is now hanging off the end.
					 */
					ipf->ipf_tail_mp = mp;
				}
				IP_REASS_SET_START(mp1, 0);
				IP_REASS_SET_END(mp1, 0);
				/* Subtract byte count */
				ipf->ipf_count -= mp1->b_datap->db_lim -
				    mp1->b_datap->db_base;
				freeb(mp1);
				if (ill->ill_isv6) {
					BUMP_MIB(ill->ill_ip6_mib,
					    ipv6ReasmPartDups);
				} else {
					BUMP_MIB(&ip_mib, ipReasmPartDups);
				}
				mp1 = mp->b_cont;
				if (!mp1)
					break;
				offset = IP_REASS_START(mp1);
			}
			ipf->ipf_mp->b_cont = mp;
			continue;
		}
		/*
		 * The new piece starts somewhere between the start of the head
		 * and before the end of the tail.
		 */
		for (; mp1; mp1 = mp1->b_cont) {
			offset = IP_REASS_END(mp1);
			if (start < offset) {
				if (end <= offset) {
					/* Nothing new. */
					IP_REASS_SET_START(mp, 0);
					IP_REASS_SET_END(mp, 0);
					/* Subtract byte count */
					ipf->ipf_count -= mp->b_datap->db_lim -
					    mp->b_datap->db_base;
					if (incr_dups) {
						ipf->ipf_num_dups++;
						incr_dups = B_FALSE;
					}
					freeb(mp);
					if (ill->ill_isv6) {
						BUMP_MIB(ill->ill_ip6_mib,
						    ipv6ReasmDuplicates);
					} else {
						BUMP_MIB(&ip_mib,
						    ipReasmDuplicates);
					}
					break;
				}
				/*
				 * Trim redundant stuff off beginning of new
				 * piece.
				 */
				IP_REASS_SET_START(mp, offset);
				mp->b_rptr += offset - start;
				if (ill->ill_isv6) {
					BUMP_MIB(ill->ill_ip6_mib,
					    ipv6ReasmPartDups);
				} else {
					BUMP_MIB(&ip_mib, ipReasmPartDups);
				}
				start = offset;
				if (!mp1->b_cont) {
					/*
					 * After trimming, this guy is now
					 * hanging off the end.
					 */
					mp1->b_cont = mp;
					ipf->ipf_tail_mp = mp;
					if (!more) {
						ipf->ipf_hole_cnt--;
					}
					break;
				}
			}
			if (start >= IP_REASS_START(mp1->b_cont))
				continue;
			/* Fill a hole */
			if (start > offset)
				ipf->ipf_hole_cnt++;
			mp->b_cont = mp1->b_cont;
			mp1->b_cont = mp;
			mp1 = mp->b_cont;
			offset = IP_REASS_START(mp1);
			if (end >= offset) {
				ipf->ipf_hole_cnt--;
				/* Check for overlap. */
				while (end > offset) {
					if (end < IP_REASS_END(mp1)) {
						mp->b_wptr -= end - offset;
						IP_REASS_SET_END(mp, offset);
						/*
						 * TODO we might bump
						 * this up twice if there is
						 * overlap at both ends.
						 */
						if (ill->ill_isv6) {
							BUMP_MIB(
							    ill->ill_ip6_mib,
							    ipv6ReasmPartDups);
						} else {
							BUMP_MIB(&ip_mib,
							    ipReasmPartDups);
						}
						break;
					}
					/* Did we cover another hole? */
					if ((mp1->b_cont &&
					    IP_REASS_END(mp1)
					    != IP_REASS_START(mp1->b_cont) &&
					    end >=
					    IP_REASS_START(mp1->b_cont)) ||
					    (!ipf->ipf_last_frag_seen &&
					    !more)) {
						ipf->ipf_hole_cnt--;
					}
					/* Clip out mp1. */
					if ((mp->b_cont = mp1->b_cont) ==
					    NULL) {
						/*
						 * After clipping out mp1,
						 * this guy is now hanging
						 * off the end.
						 */
						ipf->ipf_tail_mp = mp;
					}
					IP_REASS_SET_START(mp1, 0);
					IP_REASS_SET_END(mp1, 0);
					/* Subtract byte count */
					ipf->ipf_count -=
					    mp1->b_datap->db_lim -
					    mp1->b_datap->db_base;
					freeb(mp1);
					if (ill->ill_isv6) {
						BUMP_MIB(ill->ill_ip6_mib,
						    ipv6ReasmPartDups);
					} else {
						BUMP_MIB(&ip_mib,
						    ipReasmPartDups);
					}
					mp1 = mp->b_cont;
					if (!mp1)
						break;
					offset = IP_REASS_START(mp1);
				}
			}
			break;
		}
		/*
		 * Intentional comma expression: advance to the next mblk of
		 * this fragment; the loop ends when next_mp was NULL.
		 */
	} while (start = end, mp = next_mp);

	/* Fragment just processed could be the last one. Remember this fact */
	if (!more)
		ipf->ipf_last_frag_seen = B_TRUE;

	/* Still got holes? */
	if (ipf->ipf_hole_cnt)
		return (IP_REASS_PARTIAL);
	/* Clean up overloaded fields to avoid upstream disasters. */
	for (mp1 = ipf->ipf_mp->b_cont; mp1; mp1 = mp1->b_cont) {
		IP_REASS_SET_START(mp1, 0);
		IP_REASS_SET_END(mp1, 0);
	}
	return (IP_REASS_COMPLETE);
}

/*
 * ipsec processing for the fast path, used for input UDP Packets
 */
/*
 * Returns B_TRUE when the caller may deliver *mpp upstream; B_FALSE when
 * the packet was consumed (IPsec policy check failed or IPPF deferred/
 * dropped it).  *mpp and *first_mpp may be replaced on return.
 */
static boolean_t
ip_udp_check(queue_t *q, conn_t *connp, ill_t *ill, ipha_t *ipha,
    mblk_t **mpp, mblk_t **first_mpp, boolean_t mctl_present)
{
	uint32_t	ill_index;
	/*
	 * NOTE(review): in_flags is only assigned when conn_recvif or
	 * conn_ipv6_recvpktinfo is set; if only conn_recvslla is set the
	 * "|=" below reads an uninitialized value - verify against a later
	 * revision of this file.
	 */
	uint_t		in_flags;	/* IPF_RECVSLLA and/or IPF_RECVIF */

	ASSERT(ipha->ipha_protocol == IPPROTO_UDP);
	/* The ill_index of the incoming ILL */
	ill_index = ((ill_t *)q->q_ptr)->ill_phyint->phyint_ifindex;

	/* pass packet up to the transport */
	if (CONN_INBOUND_POLICY_PRESENT(connp) || mctl_present) {
		*first_mpp = ipsec_check_inbound_policy(*first_mpp, connp, ipha,
		    NULL, mctl_present);
		if (*first_mpp == NULL) {
			return (B_FALSE);
		}
	}

	/* Initiate IPPF processing for fastpath UDP */
	if (IPP_ENABLED(IPP_LOCAL_IN)) {
		ip_process(IPP_LOCAL_IN, mpp, ill_index);
		if (*mpp == NULL) {
			ip2dbg(("ip_input_ipsec_process: UDP pkt "
			    "deferred/dropped during IPPF processing\n"));
			return (B_FALSE);
		}
	}
	/*
	 * We make the checks as below since we are in the fast path
	 * and want to minimize the number of checks if the IP_RECVIF and/or
	 * IP_RECVSLLA and/or IPV6_RECVPKTINFO options are not set
	 */
	if (connp->conn_recvif || connp->conn_recvslla ||
	    connp->conn_ipv6_recvpktinfo) {
		if (connp->conn_recvif ||
		    connp->conn_ipv6_recvpktinfo) {
			in_flags = IPF_RECVIF;
		}
		if (connp->conn_recvslla) {
			in_flags |= IPF_RECVSLLA;
		}
		/*
		 * since in_flags are being set ill will be
		 * referenced in ip_add_info, so it better not
		 * be NULL.
11316 */ 11317 /* 11318 * the actual data will be contained in b_cont 11319 * upon successful return of the following call. 11320 * If the call fails then the original mblk is 11321 * returned. 11322 */ 11323 *mpp = ip_add_info(*mpp, ill, in_flags); 11324 } 11325 11326 return (B_TRUE); 11327 } 11328 11329 /* 11330 * Fragmentation reassembly. Each ILL has a hash table for 11331 * queuing packets undergoing reassembly for all IPIFs 11332 * associated with the ILL. The hash is based on the packet 11333 * IP ident field. The ILL frag hash table was allocated 11334 * as a timer block at the time the ILL was created. Whenever 11335 * there is anything on the reassembly queue, the timer will 11336 * be running. Returns B_TRUE if successful else B_FALSE; 11337 * frees mp on failure. 11338 */ 11339 static boolean_t 11340 ip_rput_fragment(queue_t *q, mblk_t **mpp, ipha_t *ipha, 11341 uint32_t *cksum_val, uint16_t *cksum_flags) 11342 { 11343 uint32_t frag_offset_flags; 11344 ill_t *ill = (ill_t *)q->q_ptr; 11345 mblk_t *mp = *mpp; 11346 mblk_t *t_mp; 11347 ipaddr_t dst; 11348 uint8_t proto = ipha->ipha_protocol; 11349 uint32_t sum_val; 11350 uint16_t sum_flags; 11351 ipf_t *ipf; 11352 ipf_t **ipfp; 11353 ipfb_t *ipfb; 11354 uint16_t ident; 11355 uint32_t offset; 11356 ipaddr_t src; 11357 uint_t hdr_length; 11358 uint32_t end; 11359 mblk_t *mp1; 11360 mblk_t *tail_mp; 11361 size_t count; 11362 size_t msg_len; 11363 uint8_t ecn_info = 0; 11364 uint32_t packet_size; 11365 boolean_t pruned = B_FALSE; 11366 11367 if (cksum_val != NULL) 11368 *cksum_val = 0; 11369 if (cksum_flags != NULL) 11370 *cksum_flags = 0; 11371 11372 /* 11373 * Drop the fragmented as early as possible, if 11374 * we don't have resource(s) to re-assemble. 
11375 */ 11376 if (ip_reass_queue_bytes == 0) { 11377 freemsg(mp); 11378 return (B_FALSE); 11379 } 11380 11381 /* Check for fragmentation offset; return if there's none */ 11382 if ((frag_offset_flags = ntohs(ipha->ipha_fragment_offset_and_flags) & 11383 (IPH_MF | IPH_OFFSET)) == 0) 11384 return (B_TRUE); 11385 11386 /* 11387 * We utilize hardware computed checksum info only for UDP since 11388 * IP fragmentation is a normal occurence for the protocol. In 11389 * addition, checksum offload support for IP fragments carrying 11390 * UDP payload is commonly implemented across network adapters. 11391 */ 11392 ASSERT(ill != NULL); 11393 if (proto == IPPROTO_UDP && dohwcksum && ILL_HCKSUM_CAPABLE(ill) && 11394 (DB_CKSUMFLAGS(mp) & (HCK_FULLCKSUM | HCK_PARTIALCKSUM))) { 11395 mblk_t *mp1 = mp->b_cont; 11396 int32_t len; 11397 11398 /* Record checksum information from the packet */ 11399 sum_val = (uint32_t)DB_CKSUM16(mp); 11400 sum_flags = DB_CKSUMFLAGS(mp); 11401 11402 /* IP payload offset from beginning of mblk */ 11403 offset = ((uchar_t *)ipha + IPH_HDR_LENGTH(ipha)) - mp->b_rptr; 11404 11405 if ((sum_flags & HCK_PARTIALCKSUM) && 11406 (mp1 == NULL || mp1->b_cont == NULL) && 11407 offset >= DB_CKSUMSTART(mp) && 11408 ((len = offset - DB_CKSUMSTART(mp)) & 1) == 0) { 11409 uint32_t adj; 11410 /* 11411 * Partial checksum has been calculated by hardware 11412 * and attached to the packet; in addition, any 11413 * prepended extraneous data is even byte aligned. 11414 * If any such data exists, we adjust the checksum; 11415 * this would also handle any postpended data. 
11416 */ 11417 IP_ADJCKSUM_PARTIAL(mp->b_rptr + DB_CKSUMSTART(mp), 11418 mp, mp1, len, adj); 11419 11420 /* One's complement subtract extraneous checksum */ 11421 if (adj >= sum_val) 11422 sum_val = ~(adj - sum_val) & 0xFFFF; 11423 else 11424 sum_val -= adj; 11425 } 11426 } else { 11427 sum_val = 0; 11428 sum_flags = 0; 11429 } 11430 11431 /* Clear hardware checksumming flag */ 11432 DB_CKSUMFLAGS(mp) = 0; 11433 11434 ident = ipha->ipha_ident; 11435 offset = (frag_offset_flags << 3) & 0xFFFF; 11436 src = ipha->ipha_src; 11437 dst = ipha->ipha_dst; 11438 hdr_length = IPH_HDR_LENGTH(ipha); 11439 end = ntohs(ipha->ipha_length) - hdr_length; 11440 11441 /* If end == 0 then we have a packet with no data, so just free it */ 11442 if (end == 0) { 11443 freemsg(mp); 11444 return (B_FALSE); 11445 } 11446 11447 /* Record the ECN field info. */ 11448 ecn_info = (ipha->ipha_type_of_service & 0x3); 11449 if (offset != 0) { 11450 /* 11451 * If this isn't the first piece, strip the header, and 11452 * add the offset to the end value. 11453 */ 11454 mp->b_rptr += hdr_length; 11455 end += offset; 11456 } 11457 11458 msg_len = MBLKSIZE(mp); 11459 tail_mp = mp; 11460 while (tail_mp->b_cont != NULL) { 11461 tail_mp = tail_mp->b_cont; 11462 msg_len += MBLKSIZE(tail_mp); 11463 } 11464 11465 /* If the reassembly list for this ILL will get too big, prune it */ 11466 if ((msg_len + sizeof (*ipf) + ill->ill_frag_count) >= 11467 ip_reass_queue_bytes) { 11468 ill_frag_prune(ill, 11469 (ip_reass_queue_bytes < msg_len) ? 0 : 11470 (ip_reass_queue_bytes - msg_len)); 11471 pruned = B_TRUE; 11472 } 11473 11474 ipfb = &ill->ill_frag_hash_tbl[ILL_FRAG_HASH(src, ident)]; 11475 mutex_enter(&ipfb->ipfb_lock); 11476 11477 ipfp = &ipfb->ipfb_ipf; 11478 /* Try to find an existing fragment queue for this packet. */ 11479 for (;;) { 11480 ipf = ipfp[0]; 11481 if (ipf != NULL) { 11482 /* 11483 * It has to match on ident and src/dst address. 
11484 */ 11485 if (ipf->ipf_ident == ident && 11486 ipf->ipf_src == src && 11487 ipf->ipf_dst == dst && 11488 ipf->ipf_protocol == proto) { 11489 /* 11490 * If we have received too many 11491 * duplicate fragments for this packet 11492 * free it. 11493 */ 11494 if (ipf->ipf_num_dups > ip_max_frag_dups) { 11495 ill_frag_free_pkts(ill, ipfb, ipf, 1); 11496 freemsg(mp); 11497 mutex_exit(&ipfb->ipfb_lock); 11498 return (B_FALSE); 11499 } 11500 /* Found it. */ 11501 break; 11502 } 11503 ipfp = &ipf->ipf_hash_next; 11504 continue; 11505 } 11506 11507 /* 11508 * If we pruned the list, do we want to store this new 11509 * fragment?. We apply an optimization here based on the 11510 * fact that most fragments will be received in order. 11511 * So if the offset of this incoming fragment is zero, 11512 * it is the first fragment of a new packet. We will 11513 * keep it. Otherwise drop the fragment, as we have 11514 * probably pruned the packet already (since the 11515 * packet cannot be found). 11516 */ 11517 if (pruned && offset != 0) { 11518 mutex_exit(&ipfb->ipfb_lock); 11519 freemsg(mp); 11520 return (B_FALSE); 11521 } 11522 11523 if (ipfb->ipfb_frag_pkts >= MAX_FRAG_PKTS) { 11524 /* 11525 * Too many fragmented packets in this hash 11526 * bucket. Free the oldest. 11527 */ 11528 ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf, 1); 11529 } 11530 11531 /* New guy. Allocate a frag message. */ 11532 mp1 = allocb(sizeof (*ipf), BPRI_MED); 11533 if (mp1 == NULL) { 11534 BUMP_MIB(&ip_mib, ipInDiscards); 11535 freemsg(mp); 11536 reass_done: 11537 mutex_exit(&ipfb->ipfb_lock); 11538 return (B_FALSE); 11539 } 11540 11541 11542 BUMP_MIB(&ip_mib, ipReasmReqds); 11543 mp1->b_cont = mp; 11544 11545 /* Initialize the fragment header. 
*/ 11546 ipf = (ipf_t *)mp1->b_rptr; 11547 ipf->ipf_mp = mp1; 11548 ipf->ipf_ptphn = ipfp; 11549 ipfp[0] = ipf; 11550 ipf->ipf_hash_next = NULL; 11551 ipf->ipf_ident = ident; 11552 ipf->ipf_protocol = proto; 11553 ipf->ipf_src = src; 11554 ipf->ipf_dst = dst; 11555 ipf->ipf_nf_hdr_len = 0; 11556 /* Record reassembly start time. */ 11557 ipf->ipf_timestamp = gethrestime_sec(); 11558 /* Record ipf generation and account for frag header */ 11559 ipf->ipf_gen = ill->ill_ipf_gen++; 11560 ipf->ipf_count = MBLKSIZE(mp1); 11561 ipf->ipf_last_frag_seen = B_FALSE; 11562 ipf->ipf_ecn = ecn_info; 11563 ipf->ipf_num_dups = 0; 11564 ipfb->ipfb_frag_pkts++; 11565 ipf->ipf_checksum = 0; 11566 ipf->ipf_checksum_flags = 0; 11567 11568 /* Store checksum value in fragment header */ 11569 if (sum_flags != 0) { 11570 sum_val = (sum_val & 0xFFFF) + (sum_val >> 16); 11571 sum_val = (sum_val & 0xFFFF) + (sum_val >> 16); 11572 ipf->ipf_checksum = sum_val; 11573 ipf->ipf_checksum_flags = sum_flags; 11574 } 11575 11576 /* 11577 * We handle reassembly two ways. In the easy case, 11578 * where all the fragments show up in order, we do 11579 * minimal bookkeeping, and just clip new pieces on 11580 * the end. If we ever see a hole, then we go off 11581 * to ip_reassemble which has to mark the pieces and 11582 * keep track of the number of holes, etc. Obviously, 11583 * the point of having both mechanisms is so we can 11584 * handle the easy case as efficiently as possible. 11585 */ 11586 if (offset == 0) { 11587 /* Easy case, in-order reassembly so far. */ 11588 ipf->ipf_count += msg_len; 11589 ipf->ipf_tail_mp = tail_mp; 11590 /* 11591 * Keep track of next expected offset in 11592 * ipf_end. 11593 */ 11594 ipf->ipf_end = end; 11595 ipf->ipf_nf_hdr_len = hdr_length; 11596 } else { 11597 /* Hard case, hole at the beginning. */ 11598 ipf->ipf_tail_mp = NULL; 11599 /* 11600 * ipf_end == 0 means that we have given up 11601 * on easy reassembly. 
11602 */ 11603 ipf->ipf_end = 0; 11604 11605 /* Forget checksum offload from now on */ 11606 ipf->ipf_checksum_flags = 0; 11607 11608 /* 11609 * ipf_hole_cnt is set by ip_reassemble. 11610 * ipf_count is updated by ip_reassemble. 11611 * No need to check for return value here 11612 * as we don't expect reassembly to complete 11613 * or fail for the first fragment itself. 11614 */ 11615 (void) ip_reassemble(mp, ipf, 11616 (frag_offset_flags & IPH_OFFSET) << 3, 11617 (frag_offset_flags & IPH_MF), ill, msg_len); 11618 } 11619 /* Update per ipfb and ill byte counts */ 11620 ipfb->ipfb_count += ipf->ipf_count; 11621 ASSERT(ipfb->ipfb_count > 0); /* Wraparound */ 11622 ill->ill_frag_count += ipf->ipf_count; 11623 ASSERT(ill->ill_frag_count > 0); /* Wraparound */ 11624 /* If the frag timer wasn't already going, start it. */ 11625 mutex_enter(&ill->ill_lock); 11626 ill_frag_timer_start(ill); 11627 mutex_exit(&ill->ill_lock); 11628 goto reass_done; 11629 } 11630 11631 /* 11632 * If the packet's flag has changed (it could be coming up 11633 * from an interface different than the previous, therefore 11634 * possibly different checksum capability), then forget about 11635 * any stored checksum states. Otherwise add the value to 11636 * the existing one stored in the fragment header. 11637 */ 11638 if (sum_flags != 0 && sum_flags == ipf->ipf_checksum_flags) { 11639 sum_val += ipf->ipf_checksum; 11640 sum_val = (sum_val & 0xFFFF) + (sum_val >> 16); 11641 sum_val = (sum_val & 0xFFFF) + (sum_val >> 16); 11642 ipf->ipf_checksum = sum_val; 11643 } else if (ipf->ipf_checksum_flags != 0) { 11644 /* Forget checksum offload from now on */ 11645 ipf->ipf_checksum_flags = 0; 11646 } 11647 11648 /* 11649 * We have a new piece of a datagram which is already being 11650 * reassembled. Update the ECN info if all IP fragments 11651 * are ECN capable. If there is one which is not, clear 11652 * all the info. 
If there is at least one which has CE 11653 * code point, IP needs to report that up to transport. 11654 */ 11655 if (ecn_info != IPH_ECN_NECT && ipf->ipf_ecn != IPH_ECN_NECT) { 11656 if (ecn_info == IPH_ECN_CE) 11657 ipf->ipf_ecn = IPH_ECN_CE; 11658 } else { 11659 ipf->ipf_ecn = IPH_ECN_NECT; 11660 } 11661 if (offset && ipf->ipf_end == offset) { 11662 /* The new fragment fits at the end */ 11663 ipf->ipf_tail_mp->b_cont = mp; 11664 /* Update the byte count */ 11665 ipf->ipf_count += msg_len; 11666 /* Update per ipfb and ill byte counts */ 11667 ipfb->ipfb_count += msg_len; 11668 ASSERT(ipfb->ipfb_count > 0); /* Wraparound */ 11669 ill->ill_frag_count += msg_len; 11670 ASSERT(ill->ill_frag_count > 0); /* Wraparound */ 11671 if (frag_offset_flags & IPH_MF) { 11672 /* More to come. */ 11673 ipf->ipf_end = end; 11674 ipf->ipf_tail_mp = tail_mp; 11675 goto reass_done; 11676 } 11677 } else { 11678 /* Go do the hard cases. */ 11679 int ret; 11680 11681 if (offset == 0) 11682 ipf->ipf_nf_hdr_len = hdr_length; 11683 11684 /* Save current byte count */ 11685 count = ipf->ipf_count; 11686 ret = ip_reassemble(mp, ipf, 11687 (frag_offset_flags & IPH_OFFSET) << 3, 11688 (frag_offset_flags & IPH_MF), ill, msg_len); 11689 /* Count of bytes added and subtracted (freeb()ed) */ 11690 count = ipf->ipf_count - count; 11691 if (count) { 11692 /* Update per ipfb and ill byte counts */ 11693 ipfb->ipfb_count += count; 11694 ASSERT(ipfb->ipfb_count > 0); /* Wraparound */ 11695 ill->ill_frag_count += count; 11696 ASSERT(ill->ill_frag_count > 0); 11697 } 11698 if (ret == IP_REASS_PARTIAL) { 11699 goto reass_done; 11700 } else if (ret == IP_REASS_FAILED) { 11701 /* Reassembly failed. 
Free up all resources */ 11702 ill_frag_free_pkts(ill, ipfb, ipf, 1); 11703 for (t_mp = mp; t_mp != NULL; t_mp = t_mp->b_cont) { 11704 IP_REASS_SET_START(t_mp, 0); 11705 IP_REASS_SET_END(t_mp, 0); 11706 } 11707 freemsg(mp); 11708 goto reass_done; 11709 } 11710 /* We will reach here iff 'ret' is IP_REASS_COMPLETE */ 11711 } 11712 /* 11713 * We have completed reassembly. Unhook the frag header from 11714 * the reassembly list. 11715 * 11716 * Before we free the frag header, record the ECN info 11717 * to report back to the transport. 11718 */ 11719 ecn_info = ipf->ipf_ecn; 11720 BUMP_MIB(&ip_mib, ipReasmOKs); 11721 ipfp = ipf->ipf_ptphn; 11722 11723 /* We need to supply these to caller */ 11724 if ((sum_flags = ipf->ipf_checksum_flags) != 0) 11725 sum_val = ipf->ipf_checksum; 11726 else 11727 sum_val = 0; 11728 11729 mp1 = ipf->ipf_mp; 11730 count = ipf->ipf_count; 11731 ipf = ipf->ipf_hash_next; 11732 if (ipf != NULL) 11733 ipf->ipf_ptphn = ipfp; 11734 ipfp[0] = ipf; 11735 ill->ill_frag_count -= count; 11736 ASSERT(ipfb->ipfb_count >= count); 11737 ipfb->ipfb_count -= count; 11738 ipfb->ipfb_frag_pkts--; 11739 mutex_exit(&ipfb->ipfb_lock); 11740 /* Ditch the frag header. */ 11741 mp = mp1->b_cont; 11742 11743 freeb(mp1); 11744 11745 /* Restore original IP length in header. */ 11746 packet_size = (uint32_t)msgdsize(mp); 11747 if (packet_size > IP_MAXPACKET) { 11748 freemsg(mp); 11749 BUMP_MIB(&ip_mib, ipInHdrErrors); 11750 return (B_FALSE); 11751 } 11752 11753 if (DB_REF(mp) > 1) { 11754 mblk_t *mp2 = copymsg(mp); 11755 11756 freemsg(mp); 11757 if (mp2 == NULL) { 11758 BUMP_MIB(&ip_mib, ipInDiscards); 11759 return (B_FALSE); 11760 } 11761 mp = mp2; 11762 } 11763 ipha = (ipha_t *)mp->b_rptr; 11764 11765 ipha->ipha_length = htons((uint16_t)packet_size); 11766 /* We're now complete, zip the frag state */ 11767 ipha->ipha_fragment_offset_and_flags = 0; 11768 /* Record the ECN info. 
	 */
	ipha->ipha_type_of_service &= 0xFC;
	ipha->ipha_type_of_service |= ecn_info;
	*mpp = mp;

	/* Reassembly is successful; return checksum information if needed */
	if (cksum_val != NULL)
		*cksum_val = sum_val;
	if (cksum_flags != NULL)
		*cksum_flags = sum_flags;

	return (B_TRUE);
}

/*
 * Verify the IP header checksum and process local IP options.
 * return B_TRUE if all is well, else return B_FALSE and release
 * the mp. caller is responsible for decrementing ire ref cnt.
 */
static boolean_t
ip_options_cksum(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire)
{
	mblk_t		*first_mp;
	boolean_t	mctl_present;
	uint16_t	sum;

	EXTRACT_PKT_MP(mp, first_mp, mctl_present);
	/*
	 * Don't do the checksum if it has gone through AH/ESP
	 * processing.
	 */
	if (!mctl_present) {
		/* A correct header checksums to zero. */
		sum = ip_csum_hdr(ipha);
		if (sum != 0) {
			BUMP_MIB(&ip_mib, ipInCksumErrs);
			freemsg(first_mp);
			return (B_FALSE);
		}
	}

	if (!ip_rput_local_options(q, mp, ipha, ire)) {
		/*
		 * ip_rput_local_options() consumed mp on failure; only
		 * the M_CTL wrapper (if any) is left for us to free.
		 */
		if (mctl_present)
			freeb(first_mp);
		return (B_FALSE);
	}

	return (B_TRUE);
}

/*
 * All UDP packets are delivered to the local host via this routine.
11819 */ 11820 void 11821 ip_udp_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, 11822 ill_t *recv_ill) 11823 { 11824 uint32_t sum; 11825 uint32_t u1; 11826 boolean_t mctl_present; 11827 conn_t *connp; 11828 mblk_t *first_mp; 11829 uint16_t *up; 11830 ill_t *ill = (ill_t *)q->q_ptr; 11831 uint16_t reass_hck_flags = 0; 11832 11833 #define rptr ((uchar_t *)ipha) 11834 11835 EXTRACT_PKT_MP(mp, first_mp, mctl_present); 11836 ASSERT(!mctl_present || ipsec_in_is_secure(first_mp)); 11837 ASSERT(ipha->ipha_protocol == IPPROTO_UDP); 11838 11839 /* 11840 * FAST PATH for udp packets 11841 */ 11842 11843 /* u1 is # words of IP options */ 11844 u1 = ipha->ipha_version_and_hdr_length - (uchar_t)((IP_VERSION << 4) + 11845 IP_SIMPLE_HDR_LENGTH_IN_WORDS); 11846 11847 /* IP options present */ 11848 if (u1 != 0) 11849 goto ipoptions; 11850 11851 /* Check the IP header checksum. */ 11852 if (IS_IP_HDR_HWCKSUM(mctl_present, mp, ill)) { 11853 /* Clear the IP header h/w cksum flag */ 11854 DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM; 11855 } else { 11856 #define uph ((uint16_t *)ipha) 11857 sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] + uph[5] + 11858 uph[6] + uph[7] + uph[8] + uph[9]; 11859 #undef uph 11860 /* finish doing IP checksum */ 11861 sum = (sum & 0xFFFF) + (sum >> 16); 11862 sum = ~(sum + (sum >> 16)) & 0xFFFF; 11863 /* 11864 * Don't verify header checksum if this packet is coming 11865 * back from AH/ESP as we already did it. 11866 */ 11867 if (!mctl_present && sum != 0 && sum != 0xFFFF) { 11868 BUMP_MIB(&ip_mib, ipInCksumErrs); 11869 freemsg(first_mp); 11870 return; 11871 } 11872 } 11873 11874 /* 11875 * Count for SNMP of inbound packets for ire. 11876 * if mctl is present this might be a secure packet and 11877 * has already been counted for in ip_proto_input(). 11878 */ 11879 if (!mctl_present) { 11880 UPDATE_IB_PKT_COUNT(ire); 11881 ire->ire_last_used_time = lbolt; 11882 } 11883 11884 /* packet part of fragmented IP packet? 
*/ 11885 u1 = ntohs(ipha->ipha_fragment_offset_and_flags); 11886 if (u1 & (IPH_MF | IPH_OFFSET)) { 11887 goto fragmented; 11888 } 11889 11890 /* u1 = IP header length (20 bytes) */ 11891 u1 = IP_SIMPLE_HDR_LENGTH; 11892 11893 /* packet does not contain complete IP & UDP headers */ 11894 if ((mp->b_wptr - rptr) < (IP_SIMPLE_HDR_LENGTH + UDPH_SIZE)) 11895 goto udppullup; 11896 11897 /* up points to UDP header */ 11898 up = (uint16_t *)((uchar_t *)ipha + IP_SIMPLE_HDR_LENGTH); 11899 #define iphs ((uint16_t *)ipha) 11900 11901 /* if udp hdr cksum != 0, then need to checksum udp packet */ 11902 if (up[3] != 0) { 11903 mblk_t *mp1 = mp->b_cont; 11904 boolean_t cksum_err; 11905 uint16_t hck_flags = 0; 11906 11907 /* Pseudo-header checksum */ 11908 u1 = IP_UDP_CSUM_COMP + iphs[6] + iphs[7] + iphs[8] + 11909 iphs[9] + up[2]; 11910 11911 /* 11912 * Revert to software checksum calculation if the interface 11913 * isn't capable of checksum offload or if IPsec is present. 11914 */ 11915 if (ILL_HCKSUM_CAPABLE(ill) && !mctl_present && dohwcksum) 11916 hck_flags = DB_CKSUMFLAGS(mp); 11917 11918 if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0) 11919 IP_STAT(ip_in_sw_cksum); 11920 11921 IP_CKSUM_RECV(hck_flags, u1, 11922 (uchar_t *)(rptr + DB_CKSUMSTART(mp)), 11923 (int32_t)((uchar_t *)up - rptr), 11924 mp, mp1, cksum_err); 11925 11926 if (cksum_err) { 11927 BUMP_MIB(&ip_mib, udpInCksumErrs); 11928 11929 if (hck_flags & HCK_FULLCKSUM) 11930 IP_STAT(ip_udp_in_full_hw_cksum_err); 11931 else if (hck_flags & HCK_PARTIALCKSUM) 11932 IP_STAT(ip_udp_in_part_hw_cksum_err); 11933 else 11934 IP_STAT(ip_udp_in_sw_cksum_err); 11935 11936 freemsg(first_mp); 11937 return; 11938 } 11939 } 11940 11941 /* Non-fragmented broadcast or multicast packet? 
*/ 11942 if (ire->ire_type == IRE_BROADCAST) 11943 goto udpslowpath; 11944 11945 if ((connp = ipcl_classify_v4(mp, IPPROTO_UDP, IP_SIMPLE_HDR_LENGTH, 11946 ire->ire_zoneid)) != NULL) { 11947 ASSERT(connp->conn_upq != NULL); 11948 IP_STAT(ip_udp_fast_path); 11949 11950 if (CONN_UDP_FLOWCTLD(connp)) { 11951 freemsg(mp); 11952 BUMP_MIB(&ip_mib, udpInOverflows); 11953 } else { 11954 if (!mctl_present) { 11955 BUMP_MIB(&ip_mib, ipInDelivers); 11956 } 11957 /* 11958 * mp and first_mp can change. 11959 */ 11960 if (ip_udp_check(q, connp, recv_ill, 11961 ipha, &mp, &first_mp, mctl_present)) { 11962 /* Send it upstream */ 11963 CONN_UDP_RECV(connp, mp); 11964 } 11965 } 11966 /* 11967 * freeb() cannot deal with null mblk being passed 11968 * in and first_mp can be set to null in the call 11969 * ipsec_input_fast_proc()->ipsec_check_inbound_policy. 11970 */ 11971 if (mctl_present && first_mp != NULL) { 11972 freeb(first_mp); 11973 } 11974 CONN_DEC_REF(connp); 11975 return; 11976 } 11977 11978 /* 11979 * if we got here we know the packet is not fragmented and 11980 * has no options. The classifier could not find a conn_t and 11981 * most likely its an icmp packet so send it through slow path. 11982 */ 11983 11984 goto udpslowpath; 11985 11986 ipoptions: 11987 if (!ip_options_cksum(q, mp, ipha, ire)) { 11988 goto slow_done; 11989 } 11990 11991 UPDATE_IB_PKT_COUNT(ire); 11992 ire->ire_last_used_time = lbolt; 11993 u1 = ntohs(ipha->ipha_fragment_offset_and_flags); 11994 if (u1 & (IPH_MF | IPH_OFFSET)) { 11995 fragmented: 11996 /* 11997 * "sum" and "reass_hck_flags" are non-zero if the 11998 * reassembled packet has a valid hardware computed 11999 * checksum information associated with it. 12000 */ 12001 if (!ip_rput_fragment(q, &mp, ipha, &sum, &reass_hck_flags)) 12002 goto slow_done; 12003 /* 12004 * Make sure that first_mp points back to mp as 12005 * the mp we came in with could have changed in 12006 * ip_rput_fragment(). 
12007 */ 12008 ASSERT(!mctl_present); 12009 ipha = (ipha_t *)mp->b_rptr; 12010 first_mp = mp; 12011 } 12012 12013 /* Now we have a complete datagram, destined for this machine. */ 12014 u1 = IPH_HDR_LENGTH(ipha); 12015 /* Pull up the UDP header, if necessary. */ 12016 if ((MBLKL(mp)) < (u1 + UDPH_SIZE)) { 12017 udppullup: 12018 if (!pullupmsg(mp, u1 + UDPH_SIZE)) { 12019 BUMP_MIB(&ip_mib, ipInDiscards); 12020 freemsg(first_mp); 12021 goto slow_done; 12022 } 12023 ipha = (ipha_t *)mp->b_rptr; 12024 } 12025 12026 /* 12027 * Validate the checksum for the reassembled packet; for the 12028 * pullup case we calculate the payload checksum in software. 12029 */ 12030 up = (uint16_t *)((uchar_t *)ipha + u1 + UDP_PORTS_OFFSET); 12031 if (up[3] != 0) { 12032 boolean_t cksum_err; 12033 12034 if ((reass_hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0) 12035 IP_STAT(ip_in_sw_cksum); 12036 12037 IP_CKSUM_RECV_REASS(reass_hck_flags, 12038 (int32_t)((uchar_t *)up - (uchar_t *)ipha), 12039 IP_UDP_CSUM_COMP + iphs[6] + iphs[7] + iphs[8] + 12040 iphs[9] + up[2], sum, cksum_err); 12041 12042 if (cksum_err) { 12043 BUMP_MIB(&ip_mib, udpInCksumErrs); 12044 12045 if (reass_hck_flags & HCK_FULLCKSUM) 12046 IP_STAT(ip_udp_in_full_hw_cksum_err); 12047 else if (reass_hck_flags & HCK_PARTIALCKSUM) 12048 IP_STAT(ip_udp_in_part_hw_cksum_err); 12049 else 12050 IP_STAT(ip_udp_in_sw_cksum_err); 12051 12052 freemsg(first_mp); 12053 goto slow_done; 12054 } 12055 } 12056 udpslowpath: 12057 12058 /* Clear hardware checksum flag to be safe */ 12059 DB_CKSUMFLAGS(mp) = 0; 12060 12061 ip_fanout_udp(q, first_mp, ill, ipha, *(uint32_t *)up, 12062 (ire->ire_type == IRE_BROADCAST), 12063 IP_FF_SEND_ICMP | IP_FF_CKSUM | IP_FF_IP6INFO, 12064 mctl_present, B_TRUE, recv_ill, ire->ire_zoneid); 12065 12066 slow_done: 12067 IP_STAT(ip_udp_slow_path); 12068 return; 12069 12070 #undef iphs 12071 #undef rptr 12072 } 12073 12074 /* ARGSUSED */ 12075 static mblk_t * 12076 ip_tcp_input(mblk_t *mp, ipha_t *ipha, ill_t 
    *recv_ill, boolean_t mctl_present,
    ire_t *ire, mblk_t *first_mp, uint_t flags, queue_t *q,
    ill_rx_ring_t *ill_ring)
{
	conn_t		*connp;
	uint32_t	sum;
	uint32_t	u1;
	uint16_t	*up;
	int		offset;
	ssize_t		len;
	mblk_t		*mp1;
	boolean_t	syn_present = B_FALSE;
	tcph_t		*tcph;
	uint_t		ip_hdr_len;
	ill_t		*ill = (ill_t *)q->q_ptr;
	zoneid_t	zoneid = ire->ire_zoneid;
	boolean_t	cksum_err;
	uint16_t	hck_flags = 0;

#define	rptr	((uchar_t *)ipha)

	ASSERT(ipha->ipha_protocol == IPPROTO_TCP);

	/*
	 * FAST PATH for tcp packets
	 */

	/* u1 is # words of IP options */
	u1 = ipha->ipha_version_and_hdr_length - (uchar_t)((IP_VERSION << 4)
	    + IP_SIMPLE_HDR_LENGTH_IN_WORDS);

	/* IP options present */
	if (u1) {
		goto ipoptions;
	} else {
		/* Check the IP header checksum. */
		if (IS_IP_HDR_HWCKSUM(mctl_present, mp, ill)) {
			/* Clear the IP header h/w cksum flag */
			DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM;
		} else {
			/* Software IP header checksum over the 10 words. */
#define	uph	((uint16_t *)ipha)
			sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] +
			    uph[5] + uph[6] + uph[7] + uph[8] + uph[9];
#undef	uph
			/* finish doing IP checksum */
			sum = (sum & 0xFFFF) + (sum >> 16);
			sum = ~(sum + (sum >> 16)) & 0xFFFF;
			/*
			 * Don't verify header checksum if this packet
			 * is coming back from AH/ESP as we already did it.
			 */
			if (!mctl_present && (sum != 0) && sum != 0xFFFF) {
				BUMP_MIB(&ip_mib, ipInCksumErrs);
				goto error;
			}
		}
	}

	if (!mctl_present) {
		UPDATE_IB_PKT_COUNT(ire);
		ire->ire_last_used_time = lbolt;
	}

	/* packet part of fragmented IP packet?
	 */
	u1 = ntohs(ipha->ipha_fragment_offset_and_flags);
	if (u1 & (IPH_MF | IPH_OFFSET)) {
		goto fragmented;
	}

	/* u1 = IP header length (20 bytes) */
	u1 = ip_hdr_len = IP_SIMPLE_HDR_LENGTH;

	/* does packet contain IP+TCP headers? */
	len = mp->b_wptr - rptr;
	if (len < (IP_SIMPLE_HDR_LENGTH + TCP_MIN_HEADER_LENGTH)) {
		IP_STAT(ip_tcppullup);
		goto tcppullup;
	}

	/* TCP options present? */
	offset = ((uchar_t *)ipha)[IP_SIMPLE_HDR_LENGTH + 12] >> 4;

	/*
	 * If options need to be pulled up, then goto tcpoptions.
	 * otherwise we are still in the fast path
	 */
	if (len < (offset << 2) + IP_SIMPLE_HDR_LENGTH) {
		IP_STAT(ip_tcpoptions);
		goto tcpoptions;
	}

	/* multiple mblks of tcp data? */
	if ((mp1 = mp->b_cont) != NULL) {
		/* more then two? */
		if (mp1->b_cont != NULL) {
			IP_STAT(ip_multipkttcp);
			goto multipkttcp;
		}
		len += mp1->b_wptr - mp1->b_rptr;
	}

	up = (uint16_t *)(rptr + IP_SIMPLE_HDR_LENGTH + TCP_PORTS_OFFSET);

	/* part of pseudo checksum */

	/* TCP datagram length */
	u1 = len - IP_SIMPLE_HDR_LENGTH;

#define	iphs    ((uint16_t *)ipha)

#ifdef	_BIG_ENDIAN
	u1 += IPPROTO_TCP;
#else
	u1 = ((u1 >> 8) & 0xFF) + (((u1 & 0xFF) + IPPROTO_TCP) << 8);
#endif
	/* Fold source and destination addresses into the pseudo-header. */
	u1 += iphs[6] + iphs[7] + iphs[8] + iphs[9];

	/*
	 * Revert to software checksum calculation if the interface
	 * isn't capable of checksum offload or if IPsec is present.
	 */
	if (ILL_HCKSUM_CAPABLE(ill) && !mctl_present && dohwcksum)
		hck_flags = DB_CKSUMFLAGS(mp);

	if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0)
		IP_STAT(ip_in_sw_cksum);

	IP_CKSUM_RECV(hck_flags, u1,
	    (uchar_t *)(rptr + DB_CKSUMSTART(mp)),
	    (int32_t)((uchar_t *)up - rptr),
	    mp, mp1, cksum_err);

	if (cksum_err) {
		BUMP_MIB(&ip_mib, tcpInErrs);

		if (hck_flags & HCK_FULLCKSUM)
			IP_STAT(ip_tcp_in_full_hw_cksum_err);
		else if (hck_flags & HCK_PARTIALCKSUM)
			IP_STAT(ip_tcp_in_part_hw_cksum_err);
		else
			IP_STAT(ip_tcp_in_sw_cksum_err);

		goto error;
	}

try_again:

	if ((connp = ipcl_classify_v4(mp, IPPROTO_TCP, ip_hdr_len, zoneid)) ==
	    NULL) {
		/* Send the TH_RST */
		goto no_conn;
	}

	/*
	 * TCP FAST PATH for AF_INET socket.
	 *
	 * TCP fast path to avoid extra work. An AF_INET socket type
	 * does not have facility to receive extra information via
	 * ip_process or ip_add_info. Also, when the connection was
	 * established, we made a check if this connection is impacted
	 * by any global IPSec policy or per connection policy (a
	 * policy that comes in effect later will not apply to this
	 * connection). Since all this can be determined at the
	 * connection establishment time, a quick check of flags
	 * can avoid extra work.
	 */
	if (IPCL_IS_TCP4_CONNECTED_NO_POLICY(connp) && !mctl_present &&
	    !IPP_ENABLED(IPP_LOCAL_IN)) {
		ASSERT(first_mp == mp);
		SET_SQUEUE(mp, tcp_rput_data, connp);
		return (mp);
	}

	tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len];
	if ((tcph->th_flags[0] & (TH_SYN|TH_ACK|TH_RST|TH_URG)) == TH_SYN) {
		/*
		 * Pure SYN: mark the mblk as an "eager" (incipient
		 * connection) and select an squeue for it via
		 * ip_squeue_get() before handing it to the listener.
		 */
		if (IPCL_IS_TCP(connp)) {
			mp->b_datap->db_struioflag |= STRUIO_EAGER;
			DB_CKSUMSTART(mp) =
			    (intptr_t)ip_squeue_get(ill_ring);
			if (IPCL_IS_FULLY_BOUND(connp) && !mctl_present &&
			    !CONN_INBOUND_POLICY_PRESENT(connp)) {
				SET_SQUEUE(mp, connp->conn_recv, connp);
				return (mp);
			} else if (IPCL_IS_BOUND(connp) && !mctl_present &&
			    !CONN_INBOUND_POLICY_PRESENT(connp)) {
				ip_squeue_enter_unbound++;
				SET_SQUEUE(mp, tcp_conn_request_unbound,
				    connp);
				return (mp);
			}
			syn_present = B_TRUE;
		}

	}

	if (IPCL_IS_TCP(connp) && IPCL_IS_BOUND(connp) && !syn_present) {
		/*
		 * Non-SYN segment for a bound (listening) endpoint:
		 * there is no established connection to give it to.
		 */
		uint_t	flags = (unsigned int)tcph->th_flags[0] & 0xFF;

		/* No need to send this packet to TCP */
		if ((flags & TH_RST) || (flags & TH_URG)) {
			CONN_DEC_REF(connp);
			freemsg(first_mp);
			return (NULL);
		}
		if (flags & TH_ACK) {
			/* Stray ACK: answer with a RST. */
			tcp_xmit_listeners_reset(first_mp, ip_hdr_len);
			CONN_DEC_REF(connp);
			return (NULL);
		}

		CONN_DEC_REF(connp);
		freemsg(first_mp);
		return (NULL);
	}

	if (CONN_INBOUND_POLICY_PRESENT(connp) || mctl_present) {
		/* IPsec policy applies; first_mp may be replaced or freed. */
		first_mp = ipsec_check_inbound_policy(first_mp, connp,
		    ipha, NULL, mctl_present);
		if (first_mp == NULL) {
			CONN_DEC_REF(connp);
			return (NULL);
		}
		if (IPCL_IS_TCP(connp) && IPCL_IS_BOUND(connp)) {
			ASSERT(syn_present);
			if (mctl_present) {
				ASSERT(first_mp != mp);
				first_mp->b_datap->db_struioflag |=
				    STRUIO_POLICY;
			} else {
				ASSERT(first_mp == mp);
				mp->b_datap->db_struioflag &= ~STRUIO_EAGER;
				mp->b_datap->db_struioflag |= STRUIO_POLICY;
			}
		} else {
			/*
			 * Discard first_mp early since we're dealing with a
			 * fully-connected conn_t and tcp doesn't do policy in
			 * this case.
			 */
			if (mctl_present) {
				freeb(first_mp);
				mctl_present = B_FALSE;
			}
			first_mp = mp;
		}
	}

	/* Initiate IPPF processing for fastpath */
	if (IPP_ENABLED(IPP_LOCAL_IN)) {
		uint32_t	ill_index;

		ill_index = recv_ill->ill_phyint->phyint_ifindex;
		ip_process(IPP_LOCAL_IN, &mp, ill_index);
		if (mp == NULL) {
			ip2dbg(("ip_input_ipsec_process: TCP pkt "
			    "deferred/dropped during IPPF processing\n"));
			CONN_DEC_REF(connp);
			if (mctl_present)
				freeb(first_mp);
			return (NULL);
		} else if (mctl_present) {
			/*
			 * ip_process might return a new mp.
			 */
			ASSERT(first_mp != mp);
			first_mp->b_cont = mp;
		} else {
			first_mp = mp;
		}

	}

	if (!syn_present && connp->conn_ipv6_recvpktinfo) {
		/* Attach ancillary packet info for IP_RECVPKTINFO users. */
		mp = ip_add_info(mp, recv_ill, flags);
		if (mp == NULL) {
			CONN_DEC_REF(connp);
			if (mctl_present)
				freeb(first_mp);
			return (NULL);
		} else if (mctl_present) {
			/*
			 * ip_add_info might return a new mp.
			 */
			ASSERT(first_mp != mp);
			first_mp->b_cont = mp;
		} else {
			first_mp = mp;
		}
	}

	if (IPCL_IS_TCP(connp)) {
		SET_SQUEUE(first_mp, connp->conn_recv, connp);
		return (first_mp);
	} else {
		/* Non-TCP (e.g. raw) endpoint: deliver via STREAMS. */
		putnext(connp->conn_rq, first_mp);
		CONN_DEC_REF(connp);
		return (NULL);
	}

no_conn:
	/* Initiate IPPf processing, if needed.
	 */
	if (IPP_ENABLED(IPP_LOCAL_IN)) {
		uint32_t ill_index;
		ill_index = recv_ill->ill_phyint->phyint_ifindex;
		ip_process(IPP_LOCAL_IN, &first_mp, ill_index);
		if (first_mp == NULL) {
			return (NULL);
		}
	}
	BUMP_MIB(&ip_mib, ipInDelivers);
	/* No conn_t matched: answer with a RST. */
	tcp_xmit_listeners_reset(first_mp, IPH_HDR_LENGTH(mp->b_rptr));
	return (NULL);
ipoptions:
	if (!ip_options_cksum(q, first_mp, ipha, ire)) {
		goto slow_done;
	}

	UPDATE_IB_PKT_COUNT(ire);
	ire->ire_last_used_time = lbolt;

	u1 = ntohs(ipha->ipha_fragment_offset_and_flags);
	if (u1 & (IPH_MF | IPH_OFFSET)) {
fragmented:
		if (!ip_rput_fragment(q, &mp, ipha, NULL, NULL)) {
			if (mctl_present)
				freeb(first_mp);
			goto slow_done;
		}
		/*
		 * Make sure that first_mp points back to mp as
		 * the mp we came in with could have changed in
		 * ip_rput_fragment().
		 */
		ASSERT(!mctl_present);
		ipha = (ipha_t *)mp->b_rptr;
		first_mp = mp;
	}

	/* Now we have a complete datagram, destined for this machine. */
	u1 = ip_hdr_len = IPH_HDR_LENGTH(ipha);

	len = mp->b_wptr - mp->b_rptr;
	/* Pull up a minimal TCP header, if necessary. */
	if (len < (u1 + 20)) {
tcppullup:
		if (!pullupmsg(mp, u1 + 20)) {
			BUMP_MIB(&ip_mib, ipInDiscards);
			goto error;
		}
		ipha = (ipha_t *)mp->b_rptr;
		len = mp->b_wptr - mp->b_rptr;
	}

	/*
	 * Extract the offset field from the TCP header. As usual, we
	 * try to help the compiler more than the reader.
	 */
	offset = ((uchar_t *)ipha)[u1 + 12] >> 4;
	if (offset != 5) {
tcpoptions:
		if (offset < 5) {
			BUMP_MIB(&ip_mib, ipInDiscards);
			goto error;
		}
		/*
		 * There must be TCP options.
		 * Make sure we can grab them.
		 */
		offset <<= 2;
		offset += u1;
		if (len < offset) {
			if (!pullupmsg(mp, offset)) {
				BUMP_MIB(&ip_mib, ipInDiscards);
				goto error;
			}
			ipha = (ipha_t *)mp->b_rptr;
			len = mp->b_wptr - rptr;
		}
	}

	/* Get the total packet length in len, including headers. */
	if (mp->b_cont) {
multipkttcp:
		len = msgdsize(mp);
	}

	/*
	 * Check the TCP checksum by pulling together the pseudo-
	 * header checksum, and passing it to ip_csum to be added in
	 * with the TCP datagram.
	 *
	 * Since we are not using the hwcksum if available we must
	 * clear the flag. We may come here via tcppullup or tcpoptions.
	 * If either of these fails along the way the mblk is freed.
	 * If this logic ever changes and mblk is reused to say send
	 * ICMP's back, then this flag may need to be cleared in
	 * other places as well.
	 */
	DB_CKSUMFLAGS(mp) = 0;

	up = (uint16_t *)(rptr + u1 + TCP_PORTS_OFFSET);

	u1 = (uint32_t)(len - u1);	/* TCP datagram length. */
#ifdef	_BIG_ENDIAN
	u1 += IPPROTO_TCP;
#else
	u1 = ((u1 >> 8) & 0xFF) + (((u1 & 0xFF) + IPPROTO_TCP) << 8);
#endif
	u1 += iphs[6] + iphs[7] + iphs[8] + iphs[9];
	/*
	 * Not M_DATA mblk or its a dup, so do the checksum now.
	 */
	IP_STAT(ip_in_sw_cksum);
	if (IP_CSUM(mp, (int32_t)((uchar_t *)up - rptr), u1) != 0) {
		BUMP_MIB(&ip_mib, tcpInErrs);
		goto error;
	}

	IP_STAT(ip_tcp_slow_path);
	goto try_again;
#undef	iphs
#undef	rptr

error:
	freemsg(first_mp);
slow_done:
	return (NULL);
}

/*
 * Input processing for SCTP packets arriving on an IPv4 ill: verify
 * the IP header and SCTP (software) checksum, locate the association
 * via sctp_fanout() and deliver with sctp_input(); unmatched packets
 * go to ip_fanout_sctp_raw() for raw-socket or out-of-the-blue
 * handling.  Consumes the message and the ire reference.
 */
/* ARGSUSED */
static void
ip_sctp_input(mblk_t *mp, ipha_t *ipha, ill_t *recv_ill, boolean_t mctl_present,
    ire_t *ire, mblk_t *first_mp, uint_t flags, queue_t *q, ipaddr_t dst)
{
	conn_t		*connp;
	uint32_t	sum;
	uint32_t	u1;
	ssize_t		len;
	sctp_hdr_t	*sctph;
	zoneid_t	zoneid = ire->ire_zoneid;
	uint32_t	pktsum;
	uint32_t	calcsum;
	uint32_t	ports;
	uint_t		ipif_seqid;
	in6_addr_t	map_src, map_dst;
	ill_t		*ill = (ill_t *)q->q_ptr;

#define	rptr	((uchar_t *)ipha)

	ASSERT(ipha->ipha_protocol == IPPROTO_SCTP);

	/* u1 is # words of IP options */
	u1 = ipha->ipha_version_and_hdr_length - (uchar_t)((IP_VERSION << 4)
	    + IP_SIMPLE_HDR_LENGTH_IN_WORDS);

	/* IP options present */
	if (u1 > 0) {
		goto ipoptions;
	} else {
		/* Check the IP header checksum. */
		if (!IS_IP_HDR_HWCKSUM(mctl_present, mp, ill)) {
#define	uph	((uint16_t *)ipha)
			sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] +
			    uph[5] + uph[6] + uph[7] + uph[8] + uph[9];
#undef	uph
			/* finish doing IP checksum */
			sum = (sum & 0xFFFF) + (sum >> 16);
			sum = ~(sum + (sum >> 16)) & 0xFFFF;
			/*
			 * Don't verify header checksum if this packet
			 * is coming back from AH/ESP as we already did it.
			 */
			if (!mctl_present && (sum != 0) && sum != 0xFFFF) {
				BUMP_MIB(&ip_mib, ipInCksumErrs);
				goto error;
			}
		}
		/*
		 * Since there is no SCTP h/w cksum support yet, just
		 * clear the flag.
		 */
		DB_CKSUMFLAGS(mp) = 0;
	}

	/*
	 * Don't verify header checksum if this packet is coming
	 * back from AH/ESP as we already did it.
	 */
	if (!mctl_present) {
		UPDATE_IB_PKT_COUNT(ire);
		ire->ire_last_used_time = lbolt;
	}

	/* packet part of fragmented IP packet? */
	u1 = ntohs(ipha->ipha_fragment_offset_and_flags);
	if (u1 & (IPH_MF | IPH_OFFSET))
		goto fragmented;

	/* u1 = IP header length (20 bytes) */
	u1 = IP_SIMPLE_HDR_LENGTH;

find_sctp_client:
	/* Pullup if we don't have the sctp common header. */
	len = MBLKL(mp);
	if (len < (u1 + SCTP_COMMON_HDR_LENGTH)) {
		if (mp->b_cont == NULL ||
		    !pullupmsg(mp, u1 + SCTP_COMMON_HDR_LENGTH)) {
			BUMP_MIB(&ip_mib, ipInDiscards);
			goto error;
		}
		ipha = (ipha_t *)mp->b_rptr;
		len = MBLKL(mp);
	}

	sctph = (sctp_hdr_t *)(rptr + u1);
	/*
	 * Verify the SCTP checksum in software: zero the header field,
	 * compute over the message, then restore the wire value.
	 * Note the #ifdef DEBUG braces below deliberately wrap only the
	 * "if (!skip_sctp_cksum)" so non-DEBUG builds always verify.
	 */
#ifdef	DEBUG
	if (!skip_sctp_cksum) {
#endif
		pktsum = sctph->sh_chksum;
		sctph->sh_chksum = 0;
		calcsum = sctp_cksum(mp, u1);
		if (calcsum != pktsum) {
			BUMP_MIB(&sctp_mib, sctpChecksumError);
			goto error;
		}
		sctph->sh_chksum = pktsum;
#ifdef	DEBUG	/* skip_sctp_cksum */
	}
#endif
	/* get the ports */
	ports = *(uint32_t *)&sctph->sh_sport;

	ipif_seqid = ire->ire_ipif->ipif_seqid;
	IRE_REFRELE(ire);
	/* sctp_fanout() takes IPv4-mapped IPv6 addresses. */
	IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &map_dst);
	IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &map_src);
	if ((connp = sctp_fanout(&map_src, &map_dst, ports, ipif_seqid, zoneid,
	    mp)) == NULL) {
		/* Check for raw socket or OOTB handling */
		goto no_conn;
	}

	/* Found a client; up it goes */
	BUMP_MIB(&ip_mib, ipInDelivers);
	sctp_input(connp, ipha, mp, first_mp, recv_ill, B_TRUE, mctl_present);
	return;

no_conn:
	ip_fanout_sctp_raw(first_mp, recv_ill, ipha, B_TRUE,
	    ports, mctl_present, flags, B_TRUE, ipif_seqid, zoneid);
	return;

ipoptions:
	DB_CKSUMFLAGS(mp) = 0;
	if (!ip_options_cksum(q, first_mp, ipha, ire))
		goto slow_done;

	UPDATE_IB_PKT_COUNT(ire);
	ire->ire_last_used_time = lbolt;

	u1 = ntohs(ipha->ipha_fragment_offset_and_flags);
	if (u1 & (IPH_MF | IPH_OFFSET)) {
fragmented:
		if (!ip_rput_fragment(q, &mp, ipha, NULL, NULL))
			goto slow_done;
		/*
		 * Make sure that first_mp points back to mp as
		 * the mp we came in with could have changed in
		 * ip_rput_fragment().
		 */
		ASSERT(!mctl_present);
		ipha = (ipha_t *)mp->b_rptr;
		first_mp = mp;
	}

	/* Now we have a complete datagram, destined for this machine. */
	u1 = IPH_HDR_LENGTH(ipha);
	goto find_sctp_client;
#undef	iphs
#undef	rptr

error:
	freemsg(first_mp);
slow_done:
	IRE_REFRELE(ire);
}

#define	VER_BITS	0xF0
#define	VERSION_6	0x60

/*
 * Validate and process IP options for a packet whose header may span
 * multiple mblks (or is otherwise off the fast path).  On success,
 * *iphapp is updated (the header may have moved due to pullupmsg) and
 * *dstp holds the next-hop destination, possibly changed by source
 * routing; returns B_TRUE.  On failure the message is freed and
 * B_FALSE is returned.
 */
static boolean_t
ip_rput_multimblk_ipoptions(queue_t *q, mblk_t *mp, ipha_t **iphapp,
    ipaddr_t *dstp)
{
	uint_t	opt_len;
	ipha_t	*ipha;
	ssize_t	len;
	uint_t	pkt_len;

	IP_STAT(ip_ipoptions);
	ipha = *iphapp;

#define	rptr	((uchar_t *)ipha)
	/* Assume no IPv6 packets arrive over the IPv4 queue */
	if (IPH_HDR_VERSION(ipha) == IPV6_VERSION) {
		BUMP_MIB(&ip_mib, ipInIPv6);
		freemsg(mp);
		return (B_FALSE);
	}

	/* multiple mblk or too short */
	pkt_len = ntohs(ipha->ipha_length);

	/* Get the number of words of IP options in the IP header. */
	opt_len = ipha->ipha_version_and_hdr_length - IP_SIMPLE_HDR_VERSION;
	if (opt_len) {
		/* IP Options present!  Validate and process. */
		/* Max header is 15 words; reject anything longer. */
		if (opt_len > (15 - IP_SIMPLE_HDR_LENGTH_IN_WORDS)) {
			BUMP_MIB(&ip_mib, ipInHdrErrors);
			goto done;
		}
		/*
		 * Recompute complete header length and make sure we
		 * have access to all of it.
		 */
		len = ((size_t)opt_len + IP_SIMPLE_HDR_LENGTH_IN_WORDS) << 2;
		if (len > (mp->b_wptr - rptr)) {
			if (len > pkt_len) {
				BUMP_MIB(&ip_mib, ipInHdrErrors);
				goto done;
			}
			if (!pullupmsg(mp, len)) {
				BUMP_MIB(&ip_mib, ipInDiscards);
				goto done;
			}
			ipha = (ipha_t *)mp->b_rptr;
		}
		/*
		 * Go off to ip_rput_options which returns the next hop
		 * destination address, which may have been affected
		 * by source routing.
		 */
		IP_STAT(ip_opt);
		if (ip_rput_options(q, mp, ipha, dstp) == -1) {
			/* ip_rput_options has already disposed of mp. */
			return (B_FALSE);
		}
	}
	*iphapp = ipha;
	return (B_TRUE);
done:
	/* clear b_prev - used by ip_mroute_decap */
	mp->b_prev = NULL;
	freemsg(mp);
	return (B_FALSE);
#undef	rptr
}

/*
 * Deal with the fact that there is no ire for the destination.
 * The incoming ill (in_ill) is passed in to ip_newroute only
 * in the case of packets coming from mobile ip forward tunnel.
 * It must be null otherwise.
 */
static void
ip_rput_noire(queue_t *q, ill_t *in_ill, mblk_t *mp, int ll_multicast,
    ipaddr_t dst)
{
	ipha_t	*ipha;
	ill_t	*ill;

	ipha = (ipha_t *)mp->b_rptr;
	ill = (ill_t *)q->q_ptr;

	ASSERT(ill != NULL);
	/*
	 * No IRE for this destination, so it can't be for us.
	 * Unless we are forwarding, drop the packet.
	 * We have to let source routed packets through
	 * since we don't yet know if they are 'ping -l'
	 * packets i.e. if they will go out over the
	 * same interface as they came in on.
	 */
	if (ll_multicast) {
		freemsg(mp);
		return;
	}
	if (!(ill->ill_flags & ILLF_ROUTER) && !ip_source_routed(ipha)) {
		BUMP_MIB(&ip_mib, ipForwProhibits);
		freemsg(mp);
		return;
	}

	/* Check for Martian addresses */
	if ((in_ill == NULL) && (ip_no_forward(ipha, ill))) {
		freemsg(mp);
		return;
	}

	/* Mark this packet as having originated externally */
	mp->b_prev = (mblk_t *)(uintptr_t)ill->ill_phyint->phyint_ifindex;

	/*
	 * Clear the indication that this may have a hardware checksum
	 * as we are not using it
	 */
	DB_CKSUMFLAGS(mp) = 0;

	/*
	 * Now hand the packet to ip_newroute.
	 */
	ip_newroute(q, mp, dst, in_ill, NULL);
}

/*
 * check ip header length and align it.
 */
static boolean_t
ip_check_and_align_header(queue_t *q, mblk_t *mp)
{
	ssize_t	len;
	ill_t	*ill;
	ipha_t	*ipha;

	len = MBLKL(mp);

	if (!OK_32PTR(mp->b_rptr) || len < IP_SIMPLE_HDR_LENGTH) {
		if (!OK_32PTR(mp->b_rptr))
			IP_STAT(ip_notaligned1);
		else
			IP_STAT(ip_notaligned2);
		/* Guard against bogus device drivers */
		if (len < 0) {
			/* clear b_prev - used by ip_mroute_decap */
			mp->b_prev = NULL;
			BUMP_MIB(&ip_mib, ipInHdrErrors);
			freemsg(mp);
			return (B_FALSE);
		}

		/* Log only the first occurrence to avoid flooding. */
		if (ip_rput_pullups++ == 0) {
			ill = (ill_t *)q->q_ptr;
			ipha = (ipha_t *)mp->b_rptr;
			(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
			    "ip_check_and_align_header: %s forced us to "
			    " pullup pkt, hdr len %ld, hdr addr %p",
			    ill->ill_name, len, ipha);
		}
		/* pullupmsg also re-aligns the data to a word boundary. */
		if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) {
			/* clear b_prev - used by ip_mroute_decap */
			mp->b_prev = NULL;
			BUMP_MIB(&ip_mib, ipInDiscards);
			freemsg(mp);
			return (B_FALSE);
		}
	}
	return (B_TRUE);
}

/*
 * Strict-multihoming check: decide whether a unicast packet that
 * arrived on 'ill' but whose IRE is associated with a different
 * interface should be dropped.  Returns B_TRUE if the packet was
 * consumed (dropped, ire released); otherwise updates *qp to the
 * IRE's receive queue and returns B_FALSE.
 */
static boolean_t
ip_rput_notforus(queue_t **qp, mblk_t *mp, ire_t *ire, ill_t *ill)
{
	ill_group_t	*ill_group;
	ill_group_t	*ire_group;
	queue_t		*q;
	ill_t		*ire_ill;
	uint_t		ill_ifindex;

	q = *qp;
	/*
	 * We need to check to make sure the packet came in
	 * on the queue associated with the destination IRE.
	 * Note that for multicast packets and broadcast packets sent to
	 * a broadcast address which is shared between multiple interfaces
	 * we should not do this since we just got a random broadcast ire.
	 */
	if (ire->ire_rfq && ire->ire_type != IRE_BROADCAST) {
		boolean_t	check_multi = B_TRUE;

		/*
		 * This packet came in on an interface other than the
		 * one associated with the destination address.
		 * "Gateway" it to the appropriate interface here.
		 * As long as the ills belong to the same group,
		 * we don't consider them to arriving on the wrong
		 * interface. Thus, when the switch is doing inbound
		 * load spreading, we won't drop packets when we
		 * are doing strict multihoming checks. Note, the
		 * same holds true for 'usesrc groups' where the
		 * destination address may belong to another interface
		 * to allow multipathing to happen
		 */
		ill_group = ill->ill_group;
		ire_ill = (ill_t *)(ire->ire_rfq)->q_ptr;
		ill_ifindex = ill->ill_usesrc_ifindex;
		ire_group = ire_ill->ill_group;

		/*
		 * If it's part of the same IPMP group, or if it's a legal
		 * address on the 'usesrc' interface, then bypass strict
		 * checks.
		 */
		if (ill_group != NULL && ill_group == ire_group) {
			check_multi = B_FALSE;
		} else if (ill_ifindex != 0 &&
		    ill_ifindex == ire_ill->ill_phyint->phyint_ifindex) {
			check_multi = B_FALSE;
		}

		if (check_multi &&
		    ip_strict_dst_multihoming &&
		    ((ill->ill_flags &
		    ire->ire_ipif->ipif_ill->ill_flags &
		    ILLF_ROUTER) == 0)) {
			/* Drop packet */
			BUMP_MIB(&ip_mib, ipForwProhibits);
			freemsg(mp);
			ire_refrele(ire);
			return (B_TRUE);
		}

		/*
		 * Change the queue (for non-virtual destination network
		 * interfaces) and ip_rput_local will be called with the right
		 * queue
		 */
		q = ire->ire_rfq;
	}
	/* Must be broadcast.  We'll take it. */
	*qp = q;
	return (B_FALSE);
}

/*
 * Forward a packet toward its outgoing interface: enforce the
 * ILLF_ROUTER and source-route policies, optionally emit an ICMP
 * redirect, apply flow control against the outbound device queue,
 * and finally hand off to ip_rput_forward().  Consumes the ire
 * reference on every path.
 */
static void
ip_rput_process_forward(queue_t *q, mblk_t *mp, ire_t *ire, ipha_t *ipha,
    ill_t *ill, int ll_multicast)
{
	ill_group_t	*ill_group;
	ill_group_t	*ire_group;
	queue_t		*dev_q;

	ASSERT(ire->ire_stq != NULL);
	if (ll_multicast != 0)
		goto drop_pkt;

	if (ip_no_forward(ipha, ill))
		goto drop_pkt;

	ill_group = ill->ill_group;
	ire_group = ((ill_t *)(ire->ire_rfq)->q_ptr)->ill_group;
	/*
	 * Check if we want to forward this one at this time.
	 * We allow source routed packets on a host provided that
	 * they go out the same interface or same interface group
	 * as they came in on.
	 *
	 * XXX To be quicker, we may wish to not chase pointers to
	 * get the ILLF_ROUTER flag and instead store the
	 * forwarding policy in the ire.  An unfortunate
	 * side-effect of that would be requiring an ire flush
	 * whenever the ILLF_ROUTER flag changes.
	 */
	if (((ill->ill_flags &
	    ((ill_t *)ire->ire_stq->q_ptr)->ill_flags &
	    ILLF_ROUTER) == 0) &&
	    !(ip_source_routed(ipha) && (ire->ire_rfq == q ||
	    (ill_group != NULL && ill_group == ire_group)))) {
		BUMP_MIB(&ip_mib, ipForwProhibits);
		if (ip_source_routed(ipha)) {
			q = WR(q);
			/*
			 * Clear the indication that this may have
			 * hardware checksum as we are not using it.
			 */
			DB_CKSUMFLAGS(mp) = 0;
			icmp_unreachable(q, mp,
			    ICMP_SOURCE_ROUTE_FAILED);
			ire_refrele(ire);
			return;
		}
		goto drop_pkt;
	}

	/* Packet is being forwarded. Turning off hwcksum flag. */
	DB_CKSUMFLAGS(mp) = 0;
	if (ip_g_send_redirects) {
		/*
		 * Check whether the incoming interface and outgoing
		 * interface is part of the same group. If so,
		 * send redirects.
		 *
		 * Check the source address to see if it originated
		 * on the same logical subnet it is going back out on.
		 * If so, we should be able to send it a redirect.
		 * Avoid sending a redirect if the destination
		 * is directly connected (gw_addr == 0),
		 * or if the packet was source routed out this
		 * interface.
		 */
		ipaddr_t	src;
		mblk_t		*mp1;
		ire_t		*src_ire = NULL;

		/*
		 * Check whether ire_rfq and q are from the same ill
		 * or if they are not same, they at least belong
		 * to the same group. If so, send redirects.
		 */
		if ((ire->ire_rfq == q ||
		    (ill_group != NULL && ill_group == ire_group)) &&
		    (ire->ire_gateway_addr != 0) &&
		    !ip_source_routed(ipha)) {

			src = ipha->ipha_src;
			src_ire = ire_ftable_lookup(src, 0, 0,
			    IRE_INTERFACE, ire->ire_ipif, NULL, ALL_ZONES,
			    0, NULL, MATCH_IRE_IPIF | MATCH_IRE_TYPE);

			if (src_ire != NULL) {
				/*
				 * The source is directly connected.
				 * Just copy the ip header (which is
				 * in the first mblk)
				 */
				mp1 = copyb(mp);
				if (mp1 != NULL) {
					icmp_send_redirect(WR(q), mp1,
					    ire->ire_gateway_addr);
				}
				ire_refrele(src_ire);
			}
		}
	}

	/* Flow control: drop rather than queue if the device is backed up. */
	dev_q = ire->ire_stq->q_next;
	if ((dev_q->q_next || dev_q->q_first) && !canput(dev_q)) {
		BUMP_MIB(&ip_mib, ipInDiscards);
		freemsg(mp);
		ire_refrele(ire);
		return;
	}

	ip_rput_forward(ire, ipha, mp, ill);
	IRE_REFRELE(ire);
	return;

drop_pkt:
	ire_refrele(ire);
	ip2dbg(("ip_rput_forward: drop pkt\n"));
	freemsg(mp);
}

/*
 * Process a received broadcast (or pseudo-broadcast) packet: filter
 * duplicate copies across an ill group via IRE_MARK_NORECV, handle
 * CGTP and multirouted cases, and optionally perform directed
 * broadcast forwarding.  Returns B_TRUE if the packet was consumed;
 * otherwise *qp/*irep are updated for continued local delivery and
 * B_FALSE is returned.
 */
static boolean_t
ip_rput_process_broadcast(queue_t **qp, mblk_t *mp, ire_t **irep, ipha_t *ipha,
    ill_t *ill, ipaddr_t dst, int cgtp_flt_pkt, int ll_multicast)
{
	queue_t		*q;
	ire_t		*ire;
	uint16_t	hcksumflags;

	q = *qp;
	ire = *irep;

	/*
	 * Clear the indication that this may have hardware
	 * checksum as we are not using it for forwarding.
	 */
	hcksumflags = DB_CKSUMFLAGS(mp);
	DB_CKSUMFLAGS(mp) = 0;

	/*
	 * Directed broadcast forwarding: if the packet came in over a
	 * different interface then it is routed out over we can forward it.
	 */
	if (ipha->ipha_protocol == IPPROTO_TCP) {
		/* TCP to a broadcast address is never legitimate. */
		ire_refrele(ire);
		freemsg(mp);
		BUMP_MIB(&ip_mib, ipInDiscards);
		return (B_TRUE);
	}
	/*
	 * For multicast we have set dst to be INADDR_BROADCAST
	 * for delivering to all STREAMS. IRE_MARK_NORECV is really
	 * only for broadcast packets.
	 */
	if (!CLASSD(ipha->ipha_dst)) {
		ire_t	*new_ire;
		ipif_t	*ipif;
		/*
		 * For ill groups, as the switch duplicates broadcasts
		 * across all the ports, we need to filter out and
		 * send up only one copy. There is one copy for every
		 * broadcast address on each ill.
Thus, we look for a 13068 * specific IRE on this ill and look at IRE_MARK_NORECV 13069 * later to see whether this ill is eligible to receive 13070 * them or not. ill_nominate_bcast_rcv() nominates only 13071 * one set of IREs for receiving. 13072 */ 13073 13074 ipif = ipif_get_next_ipif(NULL, ill); 13075 if (ipif == NULL) { 13076 ire_refrele(ire); 13077 freemsg(mp); 13078 BUMP_MIB(&ip_mib, ipInDiscards); 13079 return (B_TRUE); 13080 } 13081 new_ire = ire_ctable_lookup(dst, 0, 0, 13082 ipif, ALL_ZONES, NULL, MATCH_IRE_ILL); 13083 ipif_refrele(ipif); 13084 13085 if (new_ire != NULL) { 13086 if (new_ire->ire_marks & IRE_MARK_NORECV) { 13087 ire_refrele(ire); 13088 ire_refrele(new_ire); 13089 freemsg(mp); 13090 BUMP_MIB(&ip_mib, ipInDiscards); 13091 return (B_TRUE); 13092 } 13093 /* 13094 * In the special case of multirouted broadcast 13095 * packets, we unconditionally need to "gateway" 13096 * them to the appropriate interface here. 13097 * In the normal case, this cannot happen, because 13098 * there is no broadcast IRE tagged with the 13099 * RTF_MULTIRT flag. 13100 */ 13101 if (new_ire->ire_flags & RTF_MULTIRT) { 13102 ire_refrele(new_ire); 13103 if (ire->ire_rfq != NULL) { 13104 q = ire->ire_rfq; 13105 *qp = q; 13106 } 13107 } else { 13108 ire_refrele(ire); 13109 ire = new_ire; 13110 } 13111 } else if (cgtp_flt_pkt == CGTP_IP_PKT_NOT_CGTP) { 13112 if (!ip_g_forward_directed_bcast) { 13113 /* 13114 * Free the message if 13115 * ip_g_forward_directed_bcast is turned 13116 * off for non-local broadcast. 13117 */ 13118 ire_refrele(ire); 13119 freemsg(mp); 13120 BUMP_MIB(&ip_mib, ipInDiscards); 13121 return (B_TRUE); 13122 } 13123 } else { 13124 /* 13125 * This CGTP packet successfully passed the 13126 * CGTP filter, but the related CGTP 13127 * broadcast IRE has not been found, 13128 * meaning that the redundant ipif is 13129 * probably down. 
However, if we discarded 13130 * this packet, its duplicate would be 13131 * filtered out by the CGTP filter so none 13132 * of them would get through. So we keep 13133 * going with this one. 13134 */ 13135 ASSERT(cgtp_flt_pkt == CGTP_IP_PKT_PREMIUM); 13136 if (ire->ire_rfq != NULL) { 13137 q = ire->ire_rfq; 13138 *qp = q; 13139 } 13140 } 13141 } 13142 if (ip_g_forward_directed_bcast && ll_multicast == 0) { 13143 /* 13144 * Verify that there are not more then one 13145 * IRE_BROADCAST with this broadcast address which 13146 * has ire_stq set. 13147 * TODO: simplify, loop over all IRE's 13148 */ 13149 ire_t *ire1; 13150 int num_stq = 0; 13151 mblk_t *mp1; 13152 13153 /* Find the first one with ire_stq set */ 13154 rw_enter(&ire->ire_bucket->irb_lock, RW_READER); 13155 for (ire1 = ire; ire1 && 13156 !ire1->ire_stq && ire1->ire_addr == ire->ire_addr; 13157 ire1 = ire1->ire_next) 13158 ; 13159 if (ire1) { 13160 ire_refrele(ire); 13161 ire = ire1; 13162 IRE_REFHOLD(ire); 13163 } 13164 13165 /* Check if there are additional ones with stq set */ 13166 for (ire1 = ire; ire1; ire1 = ire1->ire_next) { 13167 if (ire->ire_addr != ire1->ire_addr) 13168 break; 13169 if (ire1->ire_stq) { 13170 num_stq++; 13171 break; 13172 } 13173 } 13174 rw_exit(&ire->ire_bucket->irb_lock); 13175 if (num_stq == 1 && ire->ire_stq != NULL) { 13176 ip1dbg(("ip_rput_process_broadcast: directed " 13177 "broadcast to 0x%x\n", 13178 ntohl(ire->ire_addr))); 13179 mp1 = copymsg(mp); 13180 if (mp1) { 13181 switch (ipha->ipha_protocol) { 13182 case IPPROTO_UDP: 13183 ip_udp_input(q, mp1, ipha, ire, ill); 13184 break; 13185 default: 13186 ip_proto_input(q, mp1, ipha, ire, ill); 13187 break; 13188 } 13189 } 13190 /* 13191 * Adjust ttl to 2 (1+1 - the forward engine 13192 * will decrement it by one. 
			 */
			if (ip_csum_hdr(ipha)) {
				BUMP_MIB(&ip_mib, ipInCksumErrs);
				ip2dbg(("ip_rput_broadcast:drop pkt\n"));
				freemsg(mp);
				ire_refrele(ire);
				return (B_TRUE);
			}
			/*
			 * The forwarding engine will decrement the TTL by
			 * one, so pre-increment it here and rewrite the
			 * header checksum to match.
			 */
			ipha->ipha_ttl = ip_broadcast_ttl + 1;
			ipha->ipha_hdr_checksum = 0;
			ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
			ip_rput_process_forward(q, mp, ire, ipha,
			    ill, ll_multicast);
			return (B_TRUE);
		}
		ip1dbg(("ip_rput: NO directed broadcast to 0x%x\n",
		    ntohl(ire->ire_addr)));
	}

	*irep = ire;

	/* Restore any hardware checksum flags */
	DB_CKSUMFLAGS(mp) = hcksumflags;
	return (B_FALSE);
}

/*
 * Process a received IPv4 multicast packet (destination is CLASSD);
 * called from ip_input(). If this node is acting as a multicast router
 * (ip_g_mrouter) and has joined allmulti on this ill, the packet is
 * first handed to ip_mforward() for possible forwarding. Returns B_TRUE
 * when the packet has been consumed (forwarded and/or freed here) and
 * B_FALSE when it should continue up the local delivery path; in the
 * latter case *dstp is rewritten to INADDR_BROADCAST and *ll_multicast
 * is set to 1 so the caller delivers to all interested streams.
 */
/* ARGSUSED */
static boolean_t
ip_rput_process_multicast(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha,
    int *ll_multicast, ipaddr_t *dstp)
{
	/*
	 * Forward packets only if we have joined the allmulti
	 * group on this interface.
	 */
	if (ip_g_mrouter && ill->ill_join_allmulti) {
		int retval;

		/*
		 * Clear the indication that this may have hardware
		 * checksum as we are not using it.
		 */
		DB_CKSUMFLAGS(mp) = 0;
		retval = ip_mforward(ill, ipha, mp);
		/* ip_mforward updates mib variables if needed */
		/* clear b_prev - used by ip_mroute_decap */
		mp->b_prev = NULL;

		switch (retval) {
		case 0:
			/*
			 * pkt is okay and arrived on phyint.
			 *
			 * If we are running as a multicast router
			 * we need to see all IGMP and/or PIM packets.
			 */
			if ((ipha->ipha_protocol == IPPROTO_IGMP) ||
			    (ipha->ipha_protocol == IPPROTO_PIM)) {
				goto done;
			}
			break;
		case -1:
			/* pkt is mal-formed, toss it */
			goto drop_pkt;
		case 1:
			/* pkt is okay and arrived on a tunnel */
			/*
			 * If we are running a multicast router
			 * we need to see all igmp packets.
			 */
			if (ipha->ipha_protocol == IPPROTO_IGMP) {
				*dstp = INADDR_BROADCAST;
				*ll_multicast = 1;
				return (B_FALSE);
			}

			goto drop_pkt;
		}
	}

	/*
	 * Deliver locally only if there is a membership for this group
	 * on the ill (any zone).
	 */
	ILM_WALKER_HOLD(ill);
	if (ilm_lookup_ill(ill, *dstp, ALL_ZONES) == NULL) {
		/*
		 * This might just be caused by the fact that
		 * multiple IP Multicast addresses map to the same
		 * link layer multicast - no need to increment counter!
		 */
		ILM_WALKER_RELE(ill);
		freemsg(mp);
		return (B_TRUE);
	}
	ILM_WALKER_RELE(ill);
done:
	ip2dbg(("ip_rput: multicast for us: 0x%x\n", ntohl(*dstp)));
	/*
	 * This assumes that we deliver to all streams for multicast
	 * and broadcast packets.
	 */
	*dstp = INADDR_BROADCAST;
	*ll_multicast = 1;
	return (B_FALSE);
drop_pkt:
	ip2dbg(("ip_rput: drop pkt\n"));
	freemsg(mp);
	return (B_TRUE);
}

/*
 * Handle a non-M_DATA message arriving on the ill read queue (DLPI
 * control messages, M_BREAK, M_IOCACK/M_IOCNAK, M_CTL, M_FLUSH, ...).
 * Returns B_TRUE if the message was consumed here and B_FALSE if the
 * caller should continue processing it as a data packet, in which case
 * *first_mpp/*mpp are updated to point past any stripped leading mblk.
 */
static boolean_t
ip_rput_process_notdata(queue_t *q, mblk_t **first_mpp, ill_t *ill,
    int *ll_multicast, mblk_t **mpp)
{
	mblk_t	*mp1, *from_mp, *to_mp, *mp, *first_mp;
	boolean_t must_copy = B_FALSE;
	struct iocblk	*iocp;
	ipha_t	*ipha;

#define	rptr	((uchar_t *)ipha)

	first_mp = *first_mpp;
	mp = *mpp;

	ASSERT(first_mp == mp);

	/*
	 * if db_ref > 1 then copymsg and free original. Packet may be
	 * changed and do not want other entity who has a reference to this
	 * message to trip over the changes. This is a blind change because
	 * trying to catch all places that might change packet is too
	 * difficult (since it may be a module above this one)
	 *
	 * This corresponds to the non-fast path case. We walk down the full
	 * chain in this case, and check the db_ref count of all the dblks,
	 * and do a copymsg if required.
It is possible that the db_ref counts 13326 * of the data blocks in the mblk chain can be different. 13327 * For Example, we can get a DL_UNITDATA_IND(M_PROTO) with a db_ref 13328 * count of 1, followed by a M_DATA block with a ref count of 2, if 13329 * 'snoop' is running. 13330 */ 13331 for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) { 13332 if (mp1->b_datap->db_ref > 1) { 13333 must_copy = B_TRUE; 13334 break; 13335 } 13336 } 13337 13338 if (must_copy) { 13339 mp1 = copymsg(mp); 13340 if (mp1 == NULL) { 13341 for (mp1 = mp; mp1 != NULL; 13342 mp1 = mp1->b_cont) { 13343 mp1->b_next = NULL; 13344 mp1->b_prev = NULL; 13345 } 13346 freemsg(mp); 13347 BUMP_MIB(&ip_mib, ipInDiscards); 13348 return (B_TRUE); 13349 } 13350 for (from_mp = mp, to_mp = mp1; from_mp != NULL; 13351 from_mp = from_mp->b_cont, to_mp = to_mp->b_cont) { 13352 /* Copy b_next - used in M_BREAK messages */ 13353 to_mp->b_next = from_mp->b_next; 13354 from_mp->b_next = NULL; 13355 /* Copy b_prev - used by ip_mroute_decap */ 13356 to_mp->b_prev = from_mp->b_prev; 13357 from_mp->b_prev = NULL; 13358 } 13359 *first_mpp = first_mp = mp1; 13360 freemsg(mp); 13361 mp = mp1; 13362 *mpp = mp1; 13363 } 13364 13365 ipha = (ipha_t *)mp->b_rptr; 13366 13367 /* 13368 * previous code has a case for M_DATA. 13369 * We want to check how that happens. 13370 */ 13371 ASSERT(first_mp->b_datap->db_type != M_DATA); 13372 switch (first_mp->b_datap->db_type) { 13373 case M_PROTO: 13374 case M_PCPROTO: 13375 if (((dl_unitdata_ind_t *)rptr)->dl_primitive != 13376 DL_UNITDATA_IND) { 13377 /* Go handle anything other than data elsewhere. */ 13378 ip_rput_dlpi(q, mp); 13379 return (B_TRUE); 13380 } 13381 *ll_multicast = ((dl_unitdata_ind_t *)rptr)->dl_group_address; 13382 /* Ditch the DLPI header. 
*/ 13383 mp1 = mp->b_cont; 13384 ASSERT(first_mp == mp); 13385 *first_mpp = mp1; 13386 freeb(mp); 13387 *mpp = mp1; 13388 return (B_FALSE); 13389 case M_BREAK: 13390 /* 13391 * A packet arrives as M_BREAK following a cycle through 13392 * ip_rput, ip_newroute, ... and finally ire_add_then_send. 13393 * This is an IP datagram sans lower level header. 13394 * M_BREAK are also used to pass back in multicast packets 13395 * that are encapsulated with a source route. 13396 */ 13397 /* Ditch the M_BREAK mblk */ 13398 mp1 = mp->b_cont; 13399 ASSERT(first_mp == mp); 13400 *first_mpp = mp1; 13401 freeb(mp); 13402 mp = mp1; 13403 mp->b_next = NULL; 13404 *mpp = mp; 13405 *ll_multicast = 0; 13406 return (B_FALSE); 13407 case M_IOCACK: 13408 ip1dbg(("got iocack ")); 13409 iocp = (struct iocblk *)mp->b_rptr; 13410 switch (iocp->ioc_cmd) { 13411 case DL_IOC_HDR_INFO: 13412 ill = (ill_t *)q->q_ptr; 13413 ill_fastpath_ack(ill, mp); 13414 return (B_TRUE); 13415 case SIOCSTUNPARAM: 13416 case OSIOCSTUNPARAM: 13417 /* Go through qwriter_ip */ 13418 break; 13419 case SIOCGTUNPARAM: 13420 case OSIOCGTUNPARAM: 13421 ip_rput_other(NULL, q, mp, NULL); 13422 return (B_TRUE); 13423 default: 13424 putnext(q, mp); 13425 return (B_TRUE); 13426 } 13427 /* FALLTHRU */ 13428 case M_ERROR: 13429 case M_HANGUP: 13430 /* 13431 * Since this is on the ill stream we unconditionally 13432 * bump up the refcount 13433 */ 13434 ill_refhold(ill); 13435 (void) qwriter_ip(NULL, ill, q, mp, ip_rput_other, CUR_OP, 13436 B_FALSE); 13437 return (B_TRUE); 13438 case M_CTL: 13439 if ((MBLKL(first_mp) >= sizeof (da_ipsec_t)) && 13440 (((da_ipsec_t *)first_mp->b_rptr)->da_type == 13441 IPHADA_M_CTL)) { 13442 /* 13443 * It's an IPsec accelerated packet. 13444 * Make sure that the ill from which we received the 13445 * packet has enabled IPsec hardware acceleration. 
13446 */ 13447 if (!(ill->ill_capabilities & 13448 (ILL_CAPAB_AH|ILL_CAPAB_ESP))) { 13449 /* IPsec kstats: bean counter */ 13450 freemsg(mp); 13451 return (B_TRUE); 13452 } 13453 13454 /* 13455 * Make mp point to the mblk following the M_CTL, 13456 * then process according to type of mp. 13457 * After this processing, first_mp will point to 13458 * the data-attributes and mp to the pkt following 13459 * the M_CTL. 13460 */ 13461 mp = first_mp->b_cont; 13462 if (mp == NULL) { 13463 freemsg(first_mp); 13464 return (B_TRUE); 13465 } 13466 /* 13467 * A Hardware Accelerated packet can only be M_DATA 13468 * ESP or AH packet. 13469 */ 13470 if (mp->b_datap->db_type != M_DATA) { 13471 /* non-M_DATA IPsec accelerated packet */ 13472 IPSECHW_DEBUG(IPSECHW_PKT, 13473 ("non-M_DATA IPsec accelerated pkt\n")); 13474 freemsg(first_mp); 13475 return (B_TRUE); 13476 } 13477 ipha = (ipha_t *)mp->b_rptr; 13478 if (ipha->ipha_protocol != IPPROTO_AH && 13479 ipha->ipha_protocol != IPPROTO_ESP) { 13480 IPSECHW_DEBUG(IPSECHW_PKT, 13481 ("non-M_DATA IPsec accelerated pkt\n")); 13482 freemsg(first_mp); 13483 return (B_TRUE); 13484 } 13485 *mpp = mp; 13486 return (B_FALSE); 13487 } 13488 putnext(q, mp); 13489 return (B_TRUE); 13490 case M_FLUSH: 13491 if (*mp->b_rptr & FLUSHW) { 13492 *mp->b_rptr &= ~FLUSHR; 13493 qreply(q, mp); 13494 return (B_TRUE); 13495 } 13496 freemsg(mp); 13497 return (B_TRUE); 13498 case M_IOCNAK: 13499 ip1dbg(("got iocnak ")); 13500 iocp = (struct iocblk *)mp->b_rptr; 13501 switch (iocp->ioc_cmd) { 13502 case DL_IOC_HDR_INFO: 13503 case SIOCSTUNPARAM: 13504 case OSIOCSTUNPARAM: 13505 /* 13506 * Since this is on the ill stream we unconditionally 13507 * bump up the refcount 13508 */ 13509 ill_refhold(ill); 13510 (void) qwriter_ip(NULL, ill, q, mp, ip_rput_other, 13511 CUR_OP, B_FALSE); 13512 return (B_TRUE); 13513 case SIOCGTUNPARAM: 13514 case OSIOCGTUNPARAM: 13515 ip_rput_other(NULL, q, mp, NULL); 13516 return (B_TRUE); 13517 default: 13518 break; 13519 } 13520 /* 
 FALLTHRU */
	default:
		putnext(q, mp);
		return (B_TRUE);
	}
}

/* Read side put procedure. Packets coming from the wire arrive here. */
void
ip_rput(queue_t *q, mblk_t *mp)
{
	ill_t	*ill;

	TRACE_1(TR_FAC_IP, TR_IP_RPUT_START, "ip_rput_start: q %p", q);

	ill = (ill_t *)q->q_ptr;

	if (ill->ill_state_flags & (ILL_CONDEMNED | ILL_LL_SUBNET_PENDING)) {
		union DL_primitives *dl;

		/*
		 * Things are opening or closing. Only accept DLPI control
		 * messages. In the open case, the ill->ill_ipif has not yet
		 * been created. In the close case, things hanging off the
		 * ill could have been freed already. In either case it
		 * may not be safe to proceed further.
		 */

		dl = (union DL_primitives *)mp->b_rptr;
		if ((mp->b_datap->db_type != M_PCPROTO) ||
		    (dl->dl_primitive == DL_UNITDATA_IND)) {
			/*
			 * Also SIOC[GS]TUN* ioctls can come here.
			 */
			inet_freemsg(mp);
			TRACE_2(TR_FAC_IP, TR_IP_RPUT_END,
			    "ip_input_end: q %p (%S)", q, "uninit");
			return;
		}
	}

	/*
	 * if db_ref > 1 then copymsg and free original. Packet may be
	 * changed and we do not want the other entity who has a reference to
	 * this message to trip over the changes. This is a blind change because
	 * trying to catch all places that might change the packet is too
	 * difficult.
	 *
	 * This corresponds to the fast path case, where we have a chain of
	 * M_DATA mblks. We check the db_ref count of only the 1st data block
	 * in the mblk chain. There doesn't seem to be a reason why a device
	 * driver would send up data with varying db_ref counts in the mblk
	 * chain. In any case the Fast path is a private interface, and our
	 * drivers don't do such a thing. Given the above assumption, there is
	 * no need to walk down the entire mblk chain (which could have a
	 * potential performance problem)
	 */
	if (mp->b_datap->db_ref > 1) {
		mblk_t	*mp1;
		boolean_t adjusted = B_FALSE;
		IP_STAT(ip_db_ref);

		/*
		 * The IP_RECVSLLA option depends on having the link layer
		 * header. First check that:
		 * a> the underlying device is of type ether, since this
		 * option is currently supported only over ethernet.
		 * b> there is enough room to copy over the link layer header.
		 *
		 * Once the checks are done, adjust rptr so that the link layer
		 * header will be copied via copymsg. Note that, IFT_ETHER may
		 * be returned by some non-ethernet drivers but in this case the
		 * second check will fail.
		 */
		if (ill->ill_type == IFT_ETHER &&
		    (mp->b_rptr - mp->b_datap->db_base) >=
		    sizeof (struct ether_header)) {
			mp->b_rptr -= sizeof (struct ether_header);
			adjusted = B_TRUE;
		}
		mp1 = copymsg(mp);
		if (mp1 == NULL) {
			/* Clear b_next - used in M_BREAK messages */
			mp->b_next = NULL;
			/* clear b_prev - used by ip_mroute_decap */
			mp->b_prev = NULL;
			freemsg(mp);
			BUMP_MIB(&ip_mib, ipInDiscards);
			TRACE_2(TR_FAC_IP, TR_IP_RPUT_END,
			    "ip_rput_end: q %p (%S)", q, "copymsg");
			return;
		}
		if (adjusted) {
			/*
			 * Copy is done. Restore the pointer in the _new_ mblk
			 */
			mp1->b_rptr += sizeof (struct ether_header);
		}
		/* Copy b_next - used in M_BREAK messages */
		mp1->b_next = mp->b_next;
		mp->b_next = NULL;
		/* Copy b_prev - used by ip_mroute_decap */
		mp1->b_prev = mp->b_prev;
		mp->b_prev = NULL;
		freemsg(mp);
		mp = mp1;
	}

	TRACE_2(TR_FAC_IP, TR_IP_RPUT_END,
	    "ip_rput_end: q %p (%S)", q, "end");

	ip_input(ill, NULL, mp, 0);
}

/*
 * Direct read side procedure capable of dealing with chains. GLDv3 based
 * drivers call this function directly with mblk chains while STREAMS
 * read side procedure ip_rput() calls this for single packet with ip_ring
 * set to NULL to process one packet at a time.
 *
 * The ill will always be valid if this function is called directly from
 * the driver.
 *
 * The hdrlen argument is unused here (ARGSUSED).
 */
/*ARGSUSED*/
void
ip_input(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain, size_t hdrlen)
{
	ipaddr_t		dst;
	ire_t			*ire;
	ipha_t			*ipha;
	uint_t			pkt_len;
	ssize_t			len;
	uint_t			opt_len;
	int			ll_multicast;
	int			cgtp_flt_pkt;
	queue_t			*q = ill->ill_rq;
	squeue_t		*curr_sqp = NULL;
	mblk_t			*head = NULL;
	mblk_t			*tail = NULL;
	mblk_t			*first_mp;
	mblk_t			*mp;
	int			cnt = 0;

	ASSERT(mp_chain != NULL);
	ASSERT(ill != NULL);

	TRACE_1(TR_FAC_IP, TR_IP_RPUT_START, "ip_input_start: q %p", q);

#define	rptr	((uchar_t *)ipha)

	while (mp_chain != NULL) {
		first_mp = mp = mp_chain;
		mp_chain = mp_chain->b_next;
		mp->b_next = NULL;
		ll_multicast = 0;
		ire = NULL;

		/*
		 * ip_input fast path
		 */

		/* mblk type is not M_DATA */
		if (mp->b_datap->db_type != M_DATA) {
			if (ip_rput_process_notdata(q, &first_mp, ill,
			    &ll_multicast, &mp))
				continue;
		}

13688 ASSERT(mp->b_datap->db_type == M_DATA); 13689 ASSERT(mp->b_datap->db_ref == 1); 13690 13691 /* 13692 * Invoke the CGTP (multirouting) filtering module to process 13693 * the incoming packet. Packets identified as duplicates 13694 * must be discarded. Filtering is active only if the 13695 * the ip_cgtp_filter ndd variable is non-zero. 13696 */ 13697 cgtp_flt_pkt = CGTP_IP_PKT_NOT_CGTP; 13698 if (ip_cgtp_filter && (ip_cgtp_filter_ops != NULL)) { 13699 cgtp_flt_pkt = 13700 ip_cgtp_filter_ops->cfo_filter_fp(q, mp); 13701 if (cgtp_flt_pkt == CGTP_IP_PKT_DUPLICATE) { 13702 freemsg(first_mp); 13703 continue; 13704 } 13705 } 13706 13707 ipha = (ipha_t *)mp->b_rptr; 13708 len = mp->b_wptr - rptr; 13709 13710 BUMP_MIB(&ip_mib, ipInReceives); 13711 13712 /* 13713 * IP header ptr not aligned? 13714 * OR IP header not complete in first mblk 13715 */ 13716 if (!OK_32PTR(rptr) || len < IP_SIMPLE_HDR_LENGTH) { 13717 if (!ip_check_and_align_header(q, mp)) 13718 continue; 13719 ipha = (ipha_t *)mp->b_rptr; 13720 len = mp->b_wptr - rptr; 13721 } 13722 13723 /* multiple mblk or too short */ 13724 pkt_len = ntohs(ipha->ipha_length); 13725 len -= pkt_len; 13726 if (len != 0) { 13727 /* 13728 * Make sure we have data length consistent 13729 * with the IP header. 
13730 */ 13731 if (mp->b_cont == NULL) { 13732 if (len < 0 || pkt_len < IP_SIMPLE_HDR_LENGTH) { 13733 BUMP_MIB(&ip_mib, ipInHdrErrors); 13734 ip2dbg(("ip_input: drop pkt\n")); 13735 freemsg(mp); 13736 continue; 13737 } 13738 mp->b_wptr = rptr + pkt_len; 13739 } else if (len += msgdsize(mp->b_cont)) { 13740 if (len < 0 || pkt_len < IP_SIMPLE_HDR_LENGTH) { 13741 BUMP_MIB(&ip_mib, ipInHdrErrors); 13742 ip2dbg(("ip_input: drop pkt\n")); 13743 freemsg(mp); 13744 continue; 13745 } 13746 (void) adjmsg(mp, -len); 13747 IP_STAT(ip_multimblk3); 13748 } 13749 } 13750 13751 if (ip_loopback_src_or_dst(ipha, ill)) { 13752 ip2dbg(("ip_input: drop pkt\n")); 13753 freemsg(mp); 13754 continue; 13755 } 13756 13757 /* 13758 * Attach any necessary label information to this packet. 13759 */ 13760 if (is_system_labeled() && 13761 !tsol_get_pkt_label(mp, IPV4_VERSION)) { 13762 BUMP_MIB(&ip_mib, ipInDiscards); 13763 freemsg(mp); 13764 continue; 13765 } 13766 13767 opt_len = ipha->ipha_version_and_hdr_length - 13768 IP_SIMPLE_HDR_VERSION; 13769 /* IP version bad or there are IP options */ 13770 if (opt_len) { 13771 if (len != 0) 13772 IP_STAT(ip_multimblk4); 13773 else 13774 IP_STAT(ip_ipoptions); 13775 if (!ip_rput_multimblk_ipoptions(q, mp, &ipha, &dst)) 13776 continue; 13777 } else { 13778 dst = ipha->ipha_dst; 13779 } 13780 13781 /* 13782 * If rsvpd is running, let RSVP daemon handle its processing 13783 * and forwarding of RSVP multicast/unicast packets. 13784 * If rsvpd is not running but mrouted is running, RSVP 13785 * multicast packets are forwarded as multicast traffic 13786 * and RSVP unicast packets are forwarded by unicast router. 13787 * If neither rsvpd nor mrouted is running, RSVP multicast 13788 * packets are not forwarded, but the unicast packets are 13789 * forwarded like unicast traffic. 13790 */ 13791 if (ipha->ipha_protocol == IPPROTO_RSVP && 13792 ipcl_proto_search(IPPROTO_RSVP) != NULL) { 13793 /* RSVP packet and rsvpd running. 
Treat as ours */ 13794 ip2dbg(("ip_input: RSVP for us: 0x%x\n", ntohl(dst))); 13795 /* 13796 * This assumes that we deliver to all streams for 13797 * multicast and broadcast packets. 13798 * We have to force ll_multicast to 1 to handle the 13799 * M_DATA messages passed in from ip_mroute_decap. 13800 */ 13801 dst = INADDR_BROADCAST; 13802 ll_multicast = 1; 13803 } else if (CLASSD(dst)) { 13804 /* packet is multicast */ 13805 mp->b_next = NULL; 13806 if (ip_rput_process_multicast(q, mp, ill, ipha, 13807 &ll_multicast, &dst)) 13808 continue; 13809 } 13810 13811 13812 /* 13813 * Check if the packet is coming from the Mobile IP 13814 * forward tunnel interface 13815 */ 13816 if (ill->ill_srcif_refcnt > 0) { 13817 ire = ire_srcif_table_lookup(dst, IRE_INTERFACE, 13818 NULL, ill, MATCH_IRE_TYPE); 13819 if (ire != NULL && ire->ire_dlureq_mp == NULL && 13820 ire->ire_ipif->ipif_net_type == 13821 IRE_IF_RESOLVER) { 13822 /* We need to resolve the link layer info */ 13823 ire_refrele(ire); 13824 ip_rput_noire(q, (ill_t *)q->q_ptr, mp, 13825 ll_multicast, dst); 13826 continue; 13827 } 13828 } 13829 13830 if (ire == NULL) { 13831 ire = ire_cache_lookup(dst, ALL_ZONES, 13832 MBLK_GETLABEL(mp)); 13833 } 13834 13835 /* 13836 * If mipagent is running and reverse tunnel is created as per 13837 * mobile node request, then any packet coming through the 13838 * incoming interface from the mobile-node, should be reverse 13839 * tunneled to it's home agent except those that are destined 13840 * to foreign agent only. 13841 * This needs source address based ire lookup. The routing 13842 * entries for source address based lookup are only created by 13843 * mipagent program only when a reverse tunnel is created. 
13844 * Reference : RFC2002, RFC2344 13845 */ 13846 if (ill->ill_mrtun_refcnt > 0) { 13847 ipaddr_t srcaddr; 13848 ire_t *tmp_ire; 13849 13850 tmp_ire = ire; /* Save, we might need it later */ 13851 if (ire == NULL || (ire->ire_type != IRE_LOCAL && 13852 ire->ire_type != IRE_BROADCAST)) { 13853 srcaddr = ipha->ipha_src; 13854 ire = ire_mrtun_lookup(srcaddr, ill); 13855 if (ire != NULL) { 13856 /* 13857 * Should not be getting iphada packet 13858 * here. we should only get those for 13859 * IRE_LOCAL traffic, excluded above. 13860 * Fail-safe (drop packet) in the event 13861 * hardware is misbehaving. 13862 */ 13863 if (first_mp != mp) { 13864 /* IPsec KSTATS: beancount me */ 13865 freemsg(first_mp); 13866 } else { 13867 /* 13868 * This packet must be forwarded 13869 * to Reverse Tunnel 13870 */ 13871 ip_mrtun_forward(ire, ill, mp); 13872 } 13873 ire_refrele(ire); 13874 if (tmp_ire != NULL) 13875 ire_refrele(tmp_ire); 13876 TRACE_2(TR_FAC_IP, TR_IP_RPUT_END, 13877 "ip_input_end: q %p (%S)", 13878 q, "uninit"); 13879 continue; 13880 } 13881 } 13882 /* 13883 * If this packet is from a non-mobilenode or a 13884 * mobile-node which does not request reverse 13885 * tunnel service 13886 */ 13887 ire = tmp_ire; 13888 } 13889 13890 13891 /* 13892 * If we reach here that means the incoming packet satisfies 13893 * one of the following conditions: 13894 * - packet is from a mobile node which does not request 13895 * reverse tunnel 13896 * - packet is from a non-mobile node, which is the most 13897 * common case 13898 * - packet is from a reverse tunnel enabled mobile node 13899 * and destined to foreign agent only 13900 */ 13901 13902 if (ire == NULL) { 13903 /* 13904 * No IRE for this destination, so it can't be for us. 13905 * Unless we are forwarding, drop the packet. 13906 * We have to let source routed packets through 13907 * since we don't yet know if they are 'ping -l' 13908 * packets i.e. if they will go out over the 13909 * same interface as they came in on. 
13910 */ 13911 ip_rput_noire(q, NULL, mp, ll_multicast, dst); 13912 continue; 13913 } 13914 13915 /* 13916 * Broadcast IRE may indicate either broadcast or 13917 * multicast packet 13918 */ 13919 if (ire->ire_type == IRE_BROADCAST) { 13920 /* 13921 * Skip broadcast checks if packet is UDP multicast; 13922 * we'd rather not enter ip_rput_process_broadcast() 13923 * unless the packet is broadcast for real, since 13924 * that routine is a no-op for multicast. 13925 */ 13926 if ((ipha->ipha_protocol != IPPROTO_UDP || 13927 !CLASSD(ipha->ipha_dst)) && 13928 ip_rput_process_broadcast(&q, mp, &ire, ipha, ill, 13929 dst, cgtp_flt_pkt, ll_multicast)) { 13930 continue; 13931 } 13932 } else if (ire->ire_stq != NULL) { 13933 /* fowarding? */ 13934 ip_rput_process_forward(q, mp, ire, ipha, ill, 13935 ll_multicast); 13936 continue; 13937 } 13938 13939 /* packet not for us */ 13940 if (ire->ire_rfq != q) { 13941 if (ip_rput_notforus(&q, mp, ire, ill)) { 13942 continue; 13943 } 13944 } 13945 13946 switch (ipha->ipha_protocol) { 13947 case IPPROTO_TCP: 13948 ASSERT(first_mp == mp); 13949 if ((mp = ip_tcp_input(mp, ipha, ill, B_FALSE, ire, 13950 mp, 0, q, ip_ring)) != NULL) { 13951 if (curr_sqp == NULL) { 13952 curr_sqp = GET_SQUEUE(mp); 13953 ASSERT(cnt == 0); 13954 cnt++; 13955 head = tail = mp; 13956 } else if (curr_sqp == GET_SQUEUE(mp)) { 13957 ASSERT(tail != NULL); 13958 cnt++; 13959 tail->b_next = mp; 13960 tail = mp; 13961 } else { 13962 /* 13963 * A different squeue. Send the 13964 * chain for the previous squeue on 13965 * its way. This shouldn't happen 13966 * often unless interrupt binding 13967 * changes. 
13968 */ 13969 IP_STAT(ip_input_multi_squeue); 13970 squeue_enter_chain(curr_sqp, head, 13971 tail, cnt, SQTAG_IP_INPUT); 13972 curr_sqp = GET_SQUEUE(mp); 13973 head = mp; 13974 tail = mp; 13975 cnt = 1; 13976 } 13977 } 13978 IRE_REFRELE(ire); 13979 continue; 13980 case IPPROTO_UDP: 13981 ASSERT(first_mp == mp); 13982 ip_udp_input(q, mp, ipha, ire, ill); 13983 IRE_REFRELE(ire); 13984 continue; 13985 case IPPROTO_SCTP: 13986 ASSERT(first_mp == mp); 13987 ip_sctp_input(mp, ipha, ill, B_FALSE, ire, mp, 0, 13988 q, dst); 13989 continue; 13990 default: 13991 ip_proto_input(q, first_mp, ipha, ire, ill); 13992 IRE_REFRELE(ire); 13993 continue; 13994 } 13995 } 13996 13997 if (head != NULL) 13998 squeue_enter_chain(curr_sqp, head, tail, cnt, SQTAG_IP_INPUT); 13999 14000 /* 14001 * This code is there just to make netperf/ttcp look good. 14002 * 14003 * Its possible that after being in polling mode (and having cleared 14004 * the backlog), squeues have turned the interrupt frequency higher 14005 * to improve latency at the expense of more CPU utilization (less 14006 * packets per interrupts or more number of interrupts). Workloads 14007 * like ttcp/netperf do manage to tickle polling once in a while 14008 * but for the remaining time, stay in higher interrupt mode since 14009 * their packet arrival rate is pretty uniform and this shows up 14010 * as higher CPU utilization. Since people care about CPU utilization 14011 * while running netperf/ttcp, turn the interrupt frequency back to 14012 * normal/default if polling has not been used in ip_poll_normal_ticks. 
	 */
	if (ip_ring != NULL && (ip_ring->rr_poll_state & ILL_POLLING)) {
		if (lbolt >= (ip_ring->rr_poll_time + ip_poll_normal_ticks)) {
			/*
			 * Polling hasn't been used within the last
			 * ip_poll_normal_ticks; restore the ring's normal
			 * interrupt blanking parameters.
			 */
			ip_ring->rr_poll_state &= ~ILL_POLLING;
			ip_ring->rr_blank(ip_ring->rr_handle,
			    ip_ring->rr_normal_blank_time,
			    ip_ring->rr_normal_pkt_cnt);
		}
	}

	TRACE_2(TR_FAC_IP, TR_IP_RPUT_END,
	    "ip_input_end: q %p (%S)", q, "end");
#undef	rptr
}

/*
 * Log a failed DLPI primitive via strlog on the ill's read queue.
 * For DL_SYSERR the UNIX errno `err' is reported; for any other DLPI
 * error code the symbolic form of `dl_err' is logged instead.
 */
static void
ip_dlpi_error(ill_t *ill, t_uscalar_t prim, t_uscalar_t dl_err,
    t_uscalar_t err)
{
	if (dl_err == DL_SYSERR) {
		(void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE,
		    "%s: %s failed: DL_SYSERR (errno %u)\n",
		    ill->ill_name, dlpi_prim_str(prim), err);
		return;
	}

	(void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE,
	    "%s: %s failed: %s\n", ill->ill_name, dlpi_prim_str(prim),
	    dlpi_err_str(dl_err));
}

/*
 * ip_rput_dlpi is called by ip_rput to handle all DLPI messages other
 * than DL_UNITDATA_IND messages. If we need to process this message
 * exclusively, we call qwriter_ip, in which case we also need to call
 * ill_refhold before that, since qwriter_ip does an ill_refrele.
 */
void
ip_rput_dlpi(queue_t *q, mblk_t *mp)
{
	dl_ok_ack_t	*dloa = (dl_ok_ack_t *)mp->b_rptr;
	dl_error_ack_t	*dlea = (dl_error_ack_t *)dloa;
	ill_t		*ill;

	ip1dbg(("ip_rput_dlpi"));
	ill = (ill_t *)q->q_ptr;
	switch (dloa->dl_primitive) {
	case DL_ERROR_ACK:
		ip2dbg(("ip_rput_dlpi(%s): DL_ERROR_ACK %s (0x%x): "
		    "%s (0x%x), unix %u\n", ill->ill_name,
		    dlpi_prim_str(dlea->dl_error_primitive),
		    dlea->dl_error_primitive,
		    dlpi_err_str(dlea->dl_errno),
		    dlea->dl_errno,
		    dlea->dl_unix_errno));
		switch (dlea->dl_error_primitive) {
		case DL_UNBIND_REQ:
			/*
			 * Even on failure, the unbind attempt is over;
			 * wake up anyone waiting in ipsq_flush et al.
			 */
			mutex_enter(&ill->ill_lock);
			ill->ill_state_flags &= ~ILL_DL_UNBIND_IN_PROGRESS;
			cv_signal(&ill->ill_cv);
			mutex_exit(&ill->ill_lock);
			/* FALLTHRU */
		case DL_NOTIFY_REQ:
		case DL_ATTACH_REQ:
		case DL_DETACH_REQ:
		case DL_INFO_REQ:
		case DL_BIND_REQ:
		case DL_ENABMULTI_REQ:
		case DL_PHYS_ADDR_REQ:
		case DL_CAPABILITY_REQ:
		case DL_CONTROL_REQ:
			/*
			 * Refhold the ill to match qwriter_ip which does a
			 * refrele. Since this is on the ill stream we
			 * unconditionally bump up the refcount without
			 * checking for ILL_CAN_LOOKUP
			 */
			ill_refhold(ill);
			(void) qwriter_ip(NULL, ill, q, mp, ip_rput_dlpi_writer,
			    CUR_OP, B_FALSE);
			return;
		case DL_DISABMULTI_REQ:
			freemsg(mp);	/* Don't want to pass this up */
			return;
		default:
			break;
		}
		ip_dlpi_error(ill, dlea->dl_error_primitive,
		    dlea->dl_errno, dlea->dl_unix_errno);
		freemsg(mp);
		return;
	case DL_INFO_ACK:
	case DL_BIND_ACK:
	case DL_PHYS_ADDR_ACK:
	case DL_NOTIFY_ACK:
	case DL_CAPABILITY_ACK:
	case DL_CONTROL_ACK:
		/*
		 * Refhold the ill to match qwriter_ip which does a refrele
		 * Since this is on the ill stream we unconditionally
		 * bump up the refcount without doing ILL_CAN_LOOKUP.
		 */
		ill_refhold(ill);
		(void) qwriter_ip(NULL, ill, q, mp, ip_rput_dlpi_writer,
		    CUR_OP, B_FALSE);
		return;
	case DL_NOTIFY_IND:
		ill_refhold(ill);
		/*
		 * The DL_NOTIFY_IND is an asynchronous message that has no
		 * relation to the current ioctl in progress (if any). Hence we
		 * pass in NEW_OP in this case.
		 */
		(void) qwriter_ip(NULL, ill, q, mp, ip_rput_dlpi_writer,
		    NEW_OP, B_FALSE);
		return;
	case DL_OK_ACK:
		ip1dbg(("ip_rput: DL_OK_ACK for %s\n",
		    dlpi_prim_str((int)dloa->dl_correct_primitive)));
		switch (dloa->dl_correct_primitive) {
		case DL_UNBIND_REQ:
			mutex_enter(&ill->ill_lock);
			ill->ill_state_flags &= ~ILL_DL_UNBIND_IN_PROGRESS;
			cv_signal(&ill->ill_cv);
			mutex_exit(&ill->ill_lock);
			/* FALLTHRU */
		case DL_ATTACH_REQ:
		case DL_DETACH_REQ:
			/*
			 * Refhold the ill to match qwriter_ip which does a
			 * refrele. Since this is on the ill stream we
			 * unconditionally bump up the refcount
			 */
			ill_refhold(ill);
			qwriter_ip(NULL, ill, q, mp, ip_rput_dlpi_writer,
			    CUR_OP, B_FALSE);
			return;
		case DL_ENABMULTI_REQ:
			if (ill->ill_dlpi_multicast_state == IDMS_INPROGRESS)
				ill->ill_dlpi_multicast_state = IDMS_OK;
			break;

		}
		break;
	default:
		break;
	}
	freemsg(mp);
}

/*
 * Handling of DLPI messages that require exclusive access to the ipsq.
 *
 * Need to do ill_pending_mp_release on ioctl completion, which could
 * happen here. (along with mi_copy_done)
 */
/* ARGSUSED */
static void
ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
{
	dl_ok_ack_t	*dloa = (dl_ok_ack_t *)mp->b_rptr;
	dl_error_ack_t	*dlea = (dl_error_ack_t *)dloa;
	int		err = 0;
	ill_t		*ill;
	ipif_t		*ipif = NULL;
	mblk_t		*mp1 = NULL;
	conn_t		*connp = NULL;
	t_uscalar_t	physaddr_req;
	mblk_t		*mp_hw;
	union DL_primitives *dlp;
	boolean_t	success;
	boolean_t	ioctl_aborted = B_FALSE;
	boolean_t	log = B_TRUE;

	ip1dbg(("ip_rput_dlpi_writer .."));
	ill = (ill_t *)q->q_ptr;
	ASSERT(ipsq == ill->ill_phyint->phyint_ipsq);

	ASSERT(IAM_WRITER_ILL(ill));

	/*
	 * ipsq_pending_mp and ipsq_pending_ipif track each other. i.e.
	 * both are null or non-null. However we can assert that only
	 * after grabbing the ipsq_lock. So we don't make any assertion
	 * here and in other places in the code.
	 */
	ipif = ipsq->ipsq_pending_ipif;
	/*
	 * The current ioctl could have been aborted by the user and a new
	 * ioctl to bring up another ill could have started. We could still
	 * get a response from the driver later.
14204 */ 14205 if (ipif != NULL && ipif->ipif_ill != ill) 14206 ioctl_aborted = B_TRUE; 14207 14208 switch (dloa->dl_primitive) { 14209 case DL_ERROR_ACK: 14210 switch (dlea->dl_error_primitive) { 14211 case DL_UNBIND_REQ: 14212 case DL_ATTACH_REQ: 14213 case DL_DETACH_REQ: 14214 case DL_INFO_REQ: 14215 ill_dlpi_done(ill, dlea->dl_error_primitive); 14216 break; 14217 case DL_NOTIFY_REQ: 14218 ill_dlpi_done(ill, DL_NOTIFY_REQ); 14219 log = B_FALSE; 14220 break; 14221 case DL_PHYS_ADDR_REQ: 14222 /* 14223 * For IPv6 only, there are two additional 14224 * phys_addr_req's sent to the driver to get the 14225 * IPv6 token and lla. This allows IP to acquire 14226 * the hardware address format for a given interface 14227 * without having built in knowledge of the hardware 14228 * address. ill_phys_addr_pend keeps track of the last 14229 * DL_PAR sent so we know which response we are 14230 * dealing with. ill_dlpi_done will update 14231 * ill_phys_addr_pend when it sends the next req. 14232 * We don't complete the IOCTL until all three DL_PARs 14233 * have been attempted, so set *_len to 0 and break. 14234 */ 14235 physaddr_req = ill->ill_phys_addr_pend; 14236 ill_dlpi_done(ill, DL_PHYS_ADDR_REQ); 14237 if (physaddr_req == DL_IPV6_TOKEN) { 14238 ill->ill_token_length = 0; 14239 log = B_FALSE; 14240 break; 14241 } else if (physaddr_req == DL_IPV6_LINK_LAYER_ADDR) { 14242 ill->ill_nd_lla_len = 0; 14243 log = B_FALSE; 14244 break; 14245 } 14246 /* 14247 * Something went wrong with the DL_PHYS_ADDR_REQ. 14248 * We presumably have an IOCTL hanging out waiting 14249 * for completion. Find it and complete the IOCTL 14250 * with the error noted. 14251 * However, ill_dl_phys was called on an ill queue 14252 * (from SIOCSLIFNAME), thus conn_pending_ill is not 14253 * set. But the ioctl is known to be pending on ill_wq. 
14254 */ 14255 if (!ill->ill_ifname_pending) 14256 break; 14257 ill->ill_ifname_pending = 0; 14258 if (!ioctl_aborted) 14259 mp1 = ipsq_pending_mp_get(ipsq, &connp); 14260 if (mp1 != NULL) { 14261 /* 14262 * This operation (SIOCSLIFNAME) must have 14263 * happened on the ill. Assert there is no conn 14264 */ 14265 ASSERT(connp == NULL); 14266 q = ill->ill_wq; 14267 } 14268 break; 14269 case DL_BIND_REQ: 14270 ill_dlpi_done(ill, DL_BIND_REQ); 14271 if (ill->ill_ifname_pending) 14272 break; 14273 /* 14274 * Something went wrong with the bind. We presumably 14275 * have an IOCTL hanging out waiting for completion. 14276 * Find it, take down the interface that was coming 14277 * up, and complete the IOCTL with the error noted. 14278 */ 14279 if (!ioctl_aborted) 14280 mp1 = ipsq_pending_mp_get(ipsq, &connp); 14281 if (mp1 != NULL) { 14282 /* 14283 * This operation (SIOCSLIFFLAGS) must have 14284 * happened from a conn. 14285 */ 14286 ASSERT(connp != NULL); 14287 q = CONNP_TO_WQ(connp); 14288 if (ill->ill_move_in_progress) { 14289 ILL_CLEAR_MOVE(ill); 14290 } 14291 (void) ipif_down(ipif, NULL, NULL); 14292 /* error is set below the switch */ 14293 } 14294 break; 14295 case DL_ENABMULTI_REQ: 14296 ip1dbg(("DL_ERROR_ACK to enabmulti\n")); 14297 14298 if (ill->ill_dlpi_multicast_state == IDMS_INPROGRESS) 14299 ill->ill_dlpi_multicast_state = IDMS_FAILED; 14300 if (ill->ill_dlpi_multicast_state == IDMS_FAILED) { 14301 ipif_t *ipif; 14302 14303 log = B_FALSE; 14304 printf("ip: joining multicasts failed (%d)" 14305 " on %s - will use link layer " 14306 "broadcasts for multicast\n", 14307 dlea->dl_errno, ill->ill_name); 14308 14309 /* 14310 * Set up the multicast mapping alone. 14311 * writer, so ok to access ill->ill_ipif 14312 * without any lock. 
14313 */ 14314 ipif = ill->ill_ipif; 14315 mutex_enter(&ill->ill_phyint->phyint_lock); 14316 ill->ill_phyint->phyint_flags |= 14317 PHYI_MULTI_BCAST; 14318 mutex_exit(&ill->ill_phyint->phyint_lock); 14319 14320 if (!ill->ill_isv6) { 14321 (void) ipif_arp_setup_multicast(ipif, 14322 NULL); 14323 } else { 14324 (void) ipif_ndp_setup_multicast(ipif, 14325 NULL); 14326 } 14327 } 14328 freemsg(mp); /* Don't want to pass this up */ 14329 return; 14330 case DL_CAPABILITY_REQ: 14331 case DL_CONTROL_REQ: 14332 ip1dbg(("ip_rput_dlpi_writer: got DL_ERROR_ACK for " 14333 "DL_CAPABILITY/CONTROL REQ\n")); 14334 ill_dlpi_done(ill, dlea->dl_error_primitive); 14335 ill->ill_capab_state = IDMS_FAILED; 14336 freemsg(mp); 14337 return; 14338 } 14339 /* 14340 * Note the error for IOCTL completion (mp1 is set when 14341 * ready to complete ioctl). If ill_ifname_pending_err is 14342 * set, an error occured during plumbing (ill_ifname_pending), 14343 * so we want to report that error. 14344 * 14345 * NOTE: there are two addtional DL_PHYS_ADDR_REQ's 14346 * (DL_IPV6_TOKEN and DL_IPV6_LINK_LAYER_ADDR) that are 14347 * expected to get errack'd if the driver doesn't support 14348 * these flags (e.g. ethernet). log will be set to B_FALSE 14349 * if these error conditions are encountered. 14350 */ 14351 if (mp1 != NULL) { 14352 if (ill->ill_ifname_pending_err != 0) { 14353 err = ill->ill_ifname_pending_err; 14354 ill->ill_ifname_pending_err = 0; 14355 } else { 14356 err = dlea->dl_unix_errno ? 14357 dlea->dl_unix_errno : ENXIO; 14358 } 14359 /* 14360 * If we're plumbing an interface and an error hasn't already 14361 * been saved, set ill_ifname_pending_err to the error passed 14362 * up. Ignore the error if log is B_FALSE (see comment above). 14363 */ 14364 } else if (log && ill->ill_ifname_pending && 14365 ill->ill_ifname_pending_err == 0) { 14366 ill->ill_ifname_pending_err = dlea->dl_unix_errno ? 
14367 dlea->dl_unix_errno : ENXIO; 14368 } 14369 14370 if (log) 14371 ip_dlpi_error(ill, dlea->dl_error_primitive, 14372 dlea->dl_errno, dlea->dl_unix_errno); 14373 break; 14374 case DL_CAPABILITY_ACK: { 14375 boolean_t reneg_flag = B_FALSE; 14376 /* Call a routine to handle this one. */ 14377 ill_dlpi_done(ill, DL_CAPABILITY_REQ); 14378 /* 14379 * Check if the ACK is due to renegotiation case since we 14380 * will need to send a new CAPABILITY_REQ later. 14381 */ 14382 if (ill->ill_capab_state == IDMS_RENEG) { 14383 /* This is the ack for a renogiation case */ 14384 reneg_flag = B_TRUE; 14385 ill->ill_capab_state = IDMS_UNKNOWN; 14386 } 14387 ill_capability_ack(ill, mp); 14388 if (reneg_flag) 14389 ill_capability_probe(ill); 14390 break; 14391 } 14392 case DL_CONTROL_ACK: 14393 /* We treat all of these as "fire and forget" */ 14394 ill_dlpi_done(ill, DL_CONTROL_REQ); 14395 break; 14396 case DL_INFO_ACK: 14397 /* Call a routine to handle this one. */ 14398 ill_dlpi_done(ill, DL_INFO_REQ); 14399 ip_ll_subnet_defaults(ill, mp); 14400 ASSERT(!MUTEX_HELD(&ill->ill_phyint->phyint_ipsq->ipsq_lock)); 14401 return; 14402 case DL_BIND_ACK: 14403 /* 14404 * We should have an IOCTL waiting on this unless 14405 * sent by ill_dl_phys, in which case just return 14406 */ 14407 ill_dlpi_done(ill, DL_BIND_REQ); 14408 if (ill->ill_ifname_pending) 14409 break; 14410 14411 if (!ioctl_aborted) 14412 mp1 = ipsq_pending_mp_get(ipsq, &connp); 14413 if (mp1 == NULL) 14414 break; 14415 ASSERT(connp != NULL); 14416 q = CONNP_TO_WQ(connp); 14417 14418 /* 14419 * We are exclusive. So nothing can change even after 14420 * we get the pending mp. If need be we can put it back 14421 * and restart, as in calling ipif_arp_up() below. 
14422 */ 14423 ip1dbg(("ip_rput_dlpi: bind_ack %s\n", ill->ill_name)); 14424 14425 mutex_enter(&ill->ill_lock); 14426 ill->ill_dl_up = 1; 14427 mutex_exit(&ill->ill_lock); 14428 14429 /* 14430 * Now bring up the resolver, when that is 14431 * done we'll create IREs and we are done. 14432 */ 14433 if (ill->ill_isv6) { 14434 /* 14435 * v6 interfaces. 14436 * Unlike ARP which has to do another bind 14437 * and attach, once we get here we are 14438 * done withh NDP. Except in the case of 14439 * ILLF_XRESOLV, in which case we send an 14440 * AR_INTERFACE_UP to the external resolver. 14441 * If all goes well, the ioctl will complete 14442 * in ip_rput(). If there's an error, we 14443 * complete it here. 14444 */ 14445 err = ipif_ndp_up(ipif, &ipif->ipif_v6lcl_addr, 14446 B_FALSE); 14447 if (err == 0) { 14448 if (ill->ill_flags & ILLF_XRESOLV) { 14449 mutex_enter(&connp->conn_lock); 14450 mutex_enter(&ill->ill_lock); 14451 success = ipsq_pending_mp_add( 14452 connp, ipif, q, mp1, 0); 14453 mutex_exit(&ill->ill_lock); 14454 mutex_exit(&connp->conn_lock); 14455 if (success) { 14456 err = ipif_resolver_up(ipif, 14457 B_FALSE); 14458 if (err == EINPROGRESS) { 14459 freemsg(mp); 14460 return; 14461 } 14462 ASSERT(err != 0); 14463 mp1 = ipsq_pending_mp_get(ipsq, 14464 &connp); 14465 ASSERT(mp1 != NULL); 14466 } else { 14467 /* conn has started closing */ 14468 err = EINTR; 14469 } 14470 } else { /* Non XRESOLV interface */ 14471 err = ipif_up_done_v6(ipif); 14472 } 14473 } 14474 } else if (ill->ill_net_type == IRE_IF_RESOLVER) { 14475 /* 14476 * ARP and other v4 external resolvers. 14477 * Leave the pending mblk intact so that 14478 * the ioctl completes in ip_rput(). 
14479 */ 14480 mutex_enter(&connp->conn_lock); 14481 mutex_enter(&ill->ill_lock); 14482 success = ipsq_pending_mp_add(connp, ipif, q, mp1, 0); 14483 mutex_exit(&ill->ill_lock); 14484 mutex_exit(&connp->conn_lock); 14485 if (success) { 14486 err = ipif_resolver_up(ipif, B_FALSE); 14487 if (err == EINPROGRESS) { 14488 freemsg(mp); 14489 return; 14490 } 14491 ASSERT(err != 0); 14492 mp1 = ipsq_pending_mp_get(ipsq, &connp); 14493 } else { 14494 /* The conn has started closing */ 14495 err = EINTR; 14496 } 14497 } else { 14498 /* 14499 * This one is complete. Reply to pending ioctl. 14500 */ 14501 err = ipif_up_done(ipif); 14502 } 14503 14504 if ((err == 0) && (ill->ill_up_ipifs)) { 14505 err = ill_up_ipifs(ill, q, mp1); 14506 if (err == EINPROGRESS) { 14507 freemsg(mp); 14508 return; 14509 } 14510 } 14511 14512 if (ill->ill_up_ipifs) { 14513 ill_group_cleanup(ill); 14514 } 14515 14516 break; 14517 case DL_NOTIFY_IND: { 14518 dl_notify_ind_t *notify = (dl_notify_ind_t *)mp->b_rptr; 14519 ire_t *ire; 14520 boolean_t need_ire_walk_v4 = B_FALSE; 14521 boolean_t need_ire_walk_v6 = B_FALSE; 14522 14523 /* 14524 * Change the address everywhere we need to. 14525 * What we're getting here is a link-level addr or phys addr. 14526 * The new addr is at notify + notify->dl_addr_offset 14527 * The address length is notify->dl_addr_length; 14528 */ 14529 switch (notify->dl_notification) { 14530 case DL_NOTE_PHYS_ADDR: 14531 mp_hw = copyb(mp); 14532 if (mp_hw == NULL) { 14533 err = ENOMEM; 14534 break; 14535 } 14536 dlp = (union DL_primitives *)mp_hw->b_rptr; 14537 /* 14538 * We currently don't support changing 14539 * the token via DL_NOTIFY_IND. 14540 * When we do support it, we have to consider 14541 * what the implications are with respect to 14542 * the token and the link local address. 
14543 */ 14544 mutex_enter(&ill->ill_lock); 14545 if (dlp->notify_ind.dl_data == 14546 DL_IPV6_LINK_LAYER_ADDR) { 14547 if (ill->ill_nd_lla_mp != NULL) 14548 freemsg(ill->ill_nd_lla_mp); 14549 ill->ill_nd_lla_mp = mp_hw; 14550 ill->ill_nd_lla = (uchar_t *)mp_hw->b_rptr + 14551 dlp->notify_ind.dl_addr_offset; 14552 ill->ill_nd_lla_len = 14553 dlp->notify_ind.dl_addr_length - 14554 ABS(ill->ill_sap_length); 14555 mutex_exit(&ill->ill_lock); 14556 break; 14557 } else if (dlp->notify_ind.dl_data == 14558 DL_CURR_PHYS_ADDR) { 14559 if (ill->ill_phys_addr_mp != NULL) 14560 freemsg(ill->ill_phys_addr_mp); 14561 ill->ill_phys_addr_mp = mp_hw; 14562 ill->ill_phys_addr = (uchar_t *)mp_hw->b_rptr + 14563 dlp->notify_ind.dl_addr_offset; 14564 ill->ill_phys_addr_length = 14565 dlp->notify_ind.dl_addr_length - 14566 ABS(ill->ill_sap_length); 14567 if (ill->ill_isv6 && 14568 !(ill->ill_flags & ILLF_XRESOLV)) { 14569 if (ill->ill_nd_lla_mp != NULL) 14570 freemsg(ill->ill_nd_lla_mp); 14571 ill->ill_nd_lla_mp = copyb(mp_hw); 14572 ill->ill_nd_lla = (uchar_t *) 14573 ill->ill_nd_lla_mp->b_rptr + 14574 dlp->notify_ind.dl_addr_offset; 14575 ill->ill_nd_lla_len = 14576 ill->ill_phys_addr_length; 14577 } 14578 } 14579 mutex_exit(&ill->ill_lock); 14580 /* 14581 * Send out gratuitous arp request for our new 14582 * hardware address. 14583 */ 14584 for (ipif = ill->ill_ipif; ipif != NULL; 14585 ipif = ipif->ipif_next) { 14586 if (!(ipif->ipif_flags & IPIF_UP)) 14587 continue; 14588 if (ill->ill_isv6) { 14589 ipif_ndp_down(ipif); 14590 /* 14591 * Set B_TRUE to enable 14592 * ipif_ndp_up() to send out 14593 * unsolicited advertisements. 
14594 */ 14595 err = ipif_ndp_up(ipif, 14596 &ipif->ipif_v6lcl_addr, 14597 B_TRUE); 14598 if (err) { 14599 ip1dbg(( 14600 "ip_rput_dlpi_writer: " 14601 "Failed to update ndp " 14602 "err %d\n", err)); 14603 } 14604 } else { 14605 /* 14606 * IPv4 ARP case 14607 * 14608 * Set B_TRUE, as we only want 14609 * ipif_resolver_up to send an 14610 * AR_ENTRY_ADD request up to 14611 * ARP. 14612 */ 14613 err = ipif_resolver_up(ipif, 14614 B_TRUE); 14615 if (err) { 14616 ip1dbg(( 14617 "ip_rput_dlpi_writer: " 14618 "Failed to update arp " 14619 "err %d\n", err)); 14620 } 14621 } 14622 } 14623 /* 14624 * Allow "fall through" to the DL_NOTE_FASTPATH_FLUSH 14625 * case so that all old fastpath information can be 14626 * purged from IRE caches. 14627 */ 14628 /* FALLTHRU */ 14629 case DL_NOTE_FASTPATH_FLUSH: 14630 /* 14631 * Any fastpath probe sent henceforth will get the 14632 * new fp mp. So we first delete any ires that are 14633 * waiting for the fastpath. Then walk all ires and 14634 * delete the ire or delete the fp mp. In the case of 14635 * IRE_MIPRTUN and IRE_BROADCAST it is difficult to 14636 * recreate the ire's without going through a complex 14637 * ipif up/down dance. So we don't delete the ire 14638 * itself, but just the ire_fp_mp for these 2 ire's 14639 * In the case of the other ire's we delete the ire's 14640 * themselves. Access to ire_fp_mp is completely 14641 * protected by ire_lock for IRE_MIPRTUN and 14642 * IRE_BROADCAST. Deleting the ire is preferable in the 14643 * other cases for performance. 
14644 */ 14645 if (ill->ill_isv6) { 14646 nce_fastpath_list_dispatch(ill, NULL, NULL); 14647 ndp_walk(ill, (pfi_t)ndp_fastpath_flush, 14648 NULL); 14649 } else { 14650 ire_fastpath_list_dispatch(ill, NULL, NULL); 14651 ire_walk_ill_v4(MATCH_IRE_WQ | MATCH_IRE_TYPE, 14652 IRE_CACHE | IRE_BROADCAST, 14653 ire_fastpath_flush, NULL, ill); 14654 mutex_enter(&ire_mrtun_lock); 14655 if (ire_mrtun_count != 0) { 14656 mutex_exit(&ire_mrtun_lock); 14657 ire_walk_ill_mrtun(MATCH_IRE_WQ, 14658 IRE_MIPRTUN, ire_fastpath_flush, 14659 NULL, ill); 14660 } else { 14661 mutex_exit(&ire_mrtun_lock); 14662 } 14663 } 14664 break; 14665 case DL_NOTE_SDU_SIZE: 14666 /* 14667 * Change the MTU size of the interface, of all 14668 * attached ipif's, and of all relevant ire's. The 14669 * new value's a uint32_t at notify->dl_data. 14670 * Mtu change Vs. new ire creation - protocol below. 14671 * 14672 * a Mark the ipif as IPIF_CHANGING. 14673 * b Set the new mtu in the ipif. 14674 * c Change the ire_max_frag on all affected ires 14675 * d Unmark the IPIF_CHANGING 14676 * 14677 * To see how the protocol works, assume an interface 14678 * route is also being added simultaneously by 14679 * ip_rt_add and let 'ipif' be the ipif referenced by 14680 * the ire. If the ire is created before step a, 14681 * it will be cleaned up by step c. If the ire is 14682 * created after step d, it will see the new value of 14683 * ipif_mtu. Any attempt to create the ire between 14684 * steps a to d will fail because of the IPIF_CHANGING 14685 * flag. Note that ire_create() is passed a pointer to 14686 * the ipif_mtu, and not the value. During ire_add 14687 * under the bucket lock, the ire_max_frag of the 14688 * new ire being created is set from the ipif/ire from 14689 * which it is being derived. 
14690 */ 14691 mutex_enter(&ill->ill_lock); 14692 ill->ill_max_frag = (uint_t)notify->dl_data; 14693 14694 /* 14695 * If an SIOCSLIFLNKINFO has changed the ill_max_mtu 14696 * leave it alone 14697 */ 14698 if (ill->ill_mtu_userspecified) { 14699 mutex_exit(&ill->ill_lock); 14700 break; 14701 } 14702 ill->ill_max_mtu = ill->ill_max_frag; 14703 if (ill->ill_isv6) { 14704 if (ill->ill_max_mtu < IPV6_MIN_MTU) 14705 ill->ill_max_mtu = IPV6_MIN_MTU; 14706 } else { 14707 if (ill->ill_max_mtu < IP_MIN_MTU) 14708 ill->ill_max_mtu = IP_MIN_MTU; 14709 } 14710 for (ipif = ill->ill_ipif; ipif != NULL; 14711 ipif = ipif->ipif_next) { 14712 /* 14713 * Don't override the mtu if the user 14714 * has explicitly set it. 14715 */ 14716 if (ipif->ipif_flags & IPIF_FIXEDMTU) 14717 continue; 14718 ipif->ipif_mtu = (uint_t)notify->dl_data; 14719 if (ipif->ipif_isv6) 14720 ire = ipif_to_ire_v6(ipif); 14721 else 14722 ire = ipif_to_ire(ipif); 14723 if (ire != NULL) { 14724 ire->ire_max_frag = ipif->ipif_mtu; 14725 ire_refrele(ire); 14726 } 14727 if (ipif->ipif_flags & IPIF_UP) { 14728 if (ill->ill_isv6) 14729 need_ire_walk_v6 = B_TRUE; 14730 else 14731 need_ire_walk_v4 = B_TRUE; 14732 } 14733 } 14734 mutex_exit(&ill->ill_lock); 14735 if (need_ire_walk_v4) 14736 ire_walk_v4(ill_mtu_change, (char *)ill, 14737 ALL_ZONES); 14738 if (need_ire_walk_v6) 14739 ire_walk_v6(ill_mtu_change, (char *)ill, 14740 ALL_ZONES); 14741 break; 14742 case DL_NOTE_LINK_UP: 14743 case DL_NOTE_LINK_DOWN: { 14744 /* 14745 * We are writer. ill / phyint / ipsq assocs stable. 14746 * The RUNNING flag reflects the state of the link. 14747 */ 14748 phyint_t *phyint = ill->ill_phyint; 14749 uint64_t new_phyint_flags; 14750 boolean_t changed = B_FALSE; 14751 14752 mutex_enter(&phyint->phyint_lock); 14753 new_phyint_flags = 14754 (notify->dl_notification == DL_NOTE_LINK_UP) ? 
14755 phyint->phyint_flags | PHYI_RUNNING : 14756 phyint->phyint_flags & ~PHYI_RUNNING; 14757 if (new_phyint_flags != phyint->phyint_flags) { 14758 phyint->phyint_flags = new_phyint_flags; 14759 changed = B_TRUE; 14760 } 14761 mutex_exit(&phyint->phyint_lock); 14762 /* 14763 * If the flags have changed, send a message to 14764 * the routing socket. 14765 */ 14766 if (changed) { 14767 if (phyint->phyint_illv4 != NULL) { 14768 ip_rts_ifmsg( 14769 phyint->phyint_illv4->ill_ipif); 14770 } 14771 if (phyint->phyint_illv6 != NULL) { 14772 ip_rts_ifmsg( 14773 phyint->phyint_illv6->ill_ipif); 14774 } 14775 } 14776 break; 14777 } 14778 case DL_NOTE_PROMISC_ON_PHYS: 14779 IPSECHW_DEBUG(IPSECHW_PKT, ("ip_rput_dlpi_writer: " 14780 "got a DL_NOTE_PROMISC_ON_PHYS\n")); 14781 mutex_enter(&ill->ill_lock); 14782 ill->ill_promisc_on_phys = B_TRUE; 14783 mutex_exit(&ill->ill_lock); 14784 break; 14785 case DL_NOTE_PROMISC_OFF_PHYS: 14786 IPSECHW_DEBUG(IPSECHW_PKT, ("ip_rput_dlpi_writer: " 14787 "got a DL_NOTE_PROMISC_OFF_PHYS\n")); 14788 mutex_enter(&ill->ill_lock); 14789 ill->ill_promisc_on_phys = B_FALSE; 14790 mutex_exit(&ill->ill_lock); 14791 break; 14792 case DL_NOTE_CAPAB_RENEG: 14793 /* 14794 * Something changed on the driver side. 14795 * It wants us to renegotiate the capabilities 14796 * on this ill. The most likely cause is the 14797 * aggregation interface under us where a 14798 * port got added or went away. 14799 * 14800 * We reset the capabilities and set the 14801 * state to IDMS_RENG so that when the ack 14802 * comes back, we can start the 14803 * renegotiation process. 
14804 */ 14805 ill_capability_reset(ill); 14806 ill->ill_capab_state = IDMS_RENEG; 14807 break; 14808 default: 14809 ip0dbg(("ip_rput_dlpi_writer: unknown notification " 14810 "type 0x%x for DL_NOTIFY_IND\n", 14811 notify->dl_notification)); 14812 break; 14813 } 14814 14815 /* 14816 * As this is an asynchronous operation, we 14817 * should not call ill_dlpi_done 14818 */ 14819 break; 14820 } 14821 case DL_NOTIFY_ACK: 14822 /* 14823 * Don't really need to check for what notifications 14824 * are supported; we'll process what gets sent upstream, 14825 * and we know it'll be something we support changing 14826 * based on our DL_NOTIFY_REQ. 14827 */ 14828 ill_dlpi_done(ill, DL_NOTIFY_REQ); 14829 break; 14830 case DL_PHYS_ADDR_ACK: { 14831 /* 14832 * We should have an IOCTL waiting on this when request 14833 * sent by ill_dl_phys. 14834 * However, ill_dl_phys was called on an ill queue (from 14835 * SIOCSLIFNAME), thus conn_pending_ill is not set. But the 14836 * ioctl is known to be pending on ill_wq. 14837 * There are two additional phys_addr_req's sent to the 14838 * driver to get the token and lla. ill_phys_addr_pend 14839 * keeps track of the last one sent so we know which 14840 * response we are dealing with. ill_dlpi_done will 14841 * update ill_phys_addr_pend when it sends the next req. 14842 * We don't complete the IOCTL until all three DL_PARs 14843 * have been attempted. 14844 * 14845 * We don't need any lock to update ill_nd_lla* fields, 14846 * since the ill is not yet up, We grab the lock just 14847 * for uniformity with other code that accesses ill_nd_lla. 
14848 */ 14849 physaddr_req = ill->ill_phys_addr_pend; 14850 ill_dlpi_done(ill, DL_PHYS_ADDR_REQ); 14851 if (physaddr_req == DL_IPV6_TOKEN || 14852 physaddr_req == DL_IPV6_LINK_LAYER_ADDR) { 14853 if (physaddr_req == DL_IPV6_TOKEN) { 14854 /* 14855 * bcopy to low-order bits of ill_token 14856 * 14857 * XXX Temporary hack - currently, 14858 * all known tokens are 64 bits, 14859 * so I'll cheat for the moment. 14860 */ 14861 dlp = (union DL_primitives *)mp->b_rptr; 14862 14863 mutex_enter(&ill->ill_lock); 14864 bcopy((uchar_t *)(mp->b_rptr + 14865 dlp->physaddr_ack.dl_addr_offset), 14866 (void *)&ill->ill_token.s6_addr32[2], 14867 dlp->physaddr_ack.dl_addr_length); 14868 ill->ill_token_length = 14869 dlp->physaddr_ack.dl_addr_length; 14870 mutex_exit(&ill->ill_lock); 14871 } else { 14872 ASSERT(ill->ill_nd_lla_mp == NULL); 14873 mp_hw = copyb(mp); 14874 if (mp_hw == NULL) { 14875 err = ENOMEM; 14876 break; 14877 } 14878 dlp = (union DL_primitives *)mp_hw->b_rptr; 14879 mutex_enter(&ill->ill_lock); 14880 ill->ill_nd_lla_mp = mp_hw; 14881 ill->ill_nd_lla = (uchar_t *)mp_hw->b_rptr + 14882 dlp->physaddr_ack.dl_addr_offset; 14883 ill->ill_nd_lla_len = 14884 dlp->physaddr_ack.dl_addr_length; 14885 mutex_exit(&ill->ill_lock); 14886 } 14887 break; 14888 } 14889 ASSERT(physaddr_req == DL_CURR_PHYS_ADDR); 14890 ASSERT(ill->ill_phys_addr_mp == NULL); 14891 if (!ill->ill_ifname_pending) 14892 break; 14893 ill->ill_ifname_pending = 0; 14894 if (!ioctl_aborted) 14895 mp1 = ipsq_pending_mp_get(ipsq, &connp); 14896 if (mp1 != NULL) { 14897 ASSERT(connp == NULL); 14898 q = ill->ill_wq; 14899 } 14900 /* 14901 * If any error acks received during the plumbing sequence, 14902 * ill_ifname_pending_err will be set. Break out and send up 14903 * the error to the pending ioctl. 14904 */ 14905 if (ill->ill_ifname_pending_err != 0) { 14906 err = ill->ill_ifname_pending_err; 14907 ill->ill_ifname_pending_err = 0; 14908 break; 14909 } 14910 /* 14911 * Get the interface token. 
If the zeroth interface 14912 * address is zero then set the address to the link local 14913 * address 14914 */ 14915 mp_hw = copyb(mp); 14916 if (mp_hw == NULL) { 14917 err = ENOMEM; 14918 break; 14919 } 14920 dlp = (union DL_primitives *)mp_hw->b_rptr; 14921 ill->ill_phys_addr_mp = mp_hw; 14922 ill->ill_phys_addr = (uchar_t *)mp_hw->b_rptr + 14923 dlp->physaddr_ack.dl_addr_offset; 14924 if (dlp->physaddr_ack.dl_addr_length == 0 || 14925 ill->ill_phys_addr_length == 0 || 14926 ill->ill_phys_addr_length == IP_ADDR_LEN) { 14927 /* 14928 * Compatibility: atun driver returns a length of 0. 14929 * ipdptp has an ill_phys_addr_length of zero(from 14930 * DL_BIND_ACK) but a non-zero length here. 14931 * ipd has an ill_phys_addr_length of 4(from 14932 * DL_BIND_ACK) but a non-zero length here. 14933 */ 14934 ill->ill_phys_addr = NULL; 14935 } else if (dlp->physaddr_ack.dl_addr_length != 14936 ill->ill_phys_addr_length) { 14937 ip0dbg(("DL_PHYS_ADDR_ACK: " 14938 "Address length mismatch %d %d\n", 14939 dlp->physaddr_ack.dl_addr_length, 14940 ill->ill_phys_addr_length)); 14941 err = EINVAL; 14942 break; 14943 } 14944 mutex_enter(&ill->ill_lock); 14945 if (ill->ill_nd_lla_mp == NULL) { 14946 ill->ill_nd_lla_mp = copyb(mp_hw); 14947 if (ill->ill_nd_lla_mp == NULL) { 14948 err = ENOMEM; 14949 mutex_exit(&ill->ill_lock); 14950 break; 14951 } 14952 ill->ill_nd_lla = 14953 (uchar_t *)ill->ill_nd_lla_mp->b_rptr + 14954 dlp->physaddr_ack.dl_addr_offset; 14955 ill->ill_nd_lla_len = ill->ill_phys_addr_length; 14956 } 14957 mutex_exit(&ill->ill_lock); 14958 if (IN6_IS_ADDR_UNSPECIFIED(&ill->ill_token)) 14959 (void) ill_setdefaulttoken(ill); 14960 14961 /* 14962 * If the ill zero interface has a zero address assign 14963 * it the proper link local address. 
14964 */ 14965 ASSERT(ill->ill_ipif->ipif_id == 0); 14966 if (ipif != NULL && 14967 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) 14968 (void) ipif_setlinklocal(ipif); 14969 break; 14970 } 14971 case DL_OK_ACK: 14972 ip2dbg(("DL_OK_ACK %s (0x%x)\n", 14973 dlpi_prim_str((int)dloa->dl_correct_primitive), 14974 dloa->dl_correct_primitive)); 14975 switch (dloa->dl_correct_primitive) { 14976 case DL_UNBIND_REQ: 14977 case DL_ATTACH_REQ: 14978 case DL_DETACH_REQ: 14979 ill_dlpi_done(ill, dloa->dl_correct_primitive); 14980 break; 14981 } 14982 break; 14983 default: 14984 break; 14985 } 14986 14987 freemsg(mp); 14988 if (mp1) { 14989 struct iocblk *iocp; 14990 int mode; 14991 14992 /* 14993 * Complete the waiting IOCTL. For SIOCLIFADDIF or 14994 * SIOCSLIFNAME do a copyout. 14995 */ 14996 iocp = (struct iocblk *)mp1->b_rptr; 14997 14998 if (iocp->ioc_cmd == SIOCLIFADDIF || 14999 iocp->ioc_cmd == SIOCSLIFNAME) 15000 mode = COPYOUT; 15001 else 15002 mode = NO_COPYOUT; 15003 /* 15004 * The ioctl must complete now without EINPROGRESS 15005 * since ipsq_pending_mp_get has removed the ioctl mblk 15006 * from ipsq_pending_mp. Otherwise the ioctl will be 15007 * stuck for ever in the ipsq. 15008 */ 15009 ASSERT(err != EINPROGRESS); 15010 ip_ioctl_finish(q, mp1, err, mode, ipif, ipsq); 15011 15012 } 15013 } 15014 15015 /* 15016 * ip_rput_other is called by ip_rput to handle messages modifying the global 15017 * state in IP. Normally called as writer. Exception SIOCGTUNPARAM (shared) 15018 */ 15019 /* ARGSUSED */ 15020 void 15021 ip_rput_other(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) 15022 { 15023 ill_t *ill; 15024 struct iocblk *iocp; 15025 mblk_t *mp1; 15026 conn_t *connp = NULL; 15027 15028 ip1dbg(("ip_rput_other ")); 15029 ill = (ill_t *)q->q_ptr; 15030 /* 15031 * This routine is not a writer in the case of SIOCGTUNPARAM 15032 * in which case ipsq is NULL. 
15033 */ 15034 if (ipsq != NULL) { 15035 ASSERT(IAM_WRITER_IPSQ(ipsq)); 15036 ASSERT(ipsq == ill->ill_phyint->phyint_ipsq); 15037 } 15038 15039 switch (mp->b_datap->db_type) { 15040 case M_ERROR: 15041 case M_HANGUP: 15042 /* 15043 * The device has a problem. We force the ILL down. It can 15044 * be brought up again manually using SIOCSIFFLAGS (via 15045 * ifconfig or equivalent). 15046 */ 15047 ASSERT(ipsq != NULL); 15048 if (mp->b_rptr < mp->b_wptr) 15049 ill->ill_error = (int)(*mp->b_rptr & 0xFF); 15050 if (ill->ill_error == 0) 15051 ill->ill_error = ENXIO; 15052 if (!ill_down_start(q, mp)) 15053 return; 15054 ipif_all_down_tail(ipsq, q, mp, NULL); 15055 break; 15056 case M_IOCACK: 15057 iocp = (struct iocblk *)mp->b_rptr; 15058 ASSERT(iocp->ioc_cmd != DL_IOC_HDR_INFO); 15059 switch (iocp->ioc_cmd) { 15060 case SIOCSTUNPARAM: 15061 case OSIOCSTUNPARAM: 15062 ASSERT(ipsq != NULL); 15063 /* 15064 * Finish socket ioctl passed through to tun. 15065 * We should have an IOCTL waiting on this. 15066 */ 15067 mp1 = ipsq_pending_mp_get(ipsq, &connp); 15068 if (ill->ill_isv6) { 15069 struct iftun_req *ta; 15070 15071 /* 15072 * if a source or destination is 15073 * being set, try and set the link 15074 * local address for the tunnel 15075 */ 15076 ta = (struct iftun_req *)mp->b_cont-> 15077 b_cont->b_rptr; 15078 if (ta->ifta_flags & (IFTUN_SRC | IFTUN_DST)) { 15079 ipif_set_tun_llink(ill, ta); 15080 } 15081 15082 } 15083 if (mp1 != NULL) { 15084 /* 15085 * Now copy back the b_next/b_prev used by 15086 * mi code for the mi_copy* functions. 15087 * See ip_sioctl_tunparam() for the reason. 15088 * Also protect against missing b_cont. 
15089 */ 15090 if (mp->b_cont != NULL) { 15091 mp->b_cont->b_next = 15092 mp1->b_cont->b_next; 15093 mp->b_cont->b_prev = 15094 mp1->b_cont->b_prev; 15095 } 15096 inet_freemsg(mp1); 15097 ASSERT(ipsq->ipsq_current_ipif != NULL); 15098 ASSERT(connp != NULL); 15099 ip_ioctl_finish(CONNP_TO_WQ(connp), mp, 15100 iocp->ioc_error, NO_COPYOUT, 15101 ipsq->ipsq_current_ipif, ipsq); 15102 } else { 15103 ASSERT(connp == NULL); 15104 putnext(q, mp); 15105 } 15106 break; 15107 case SIOCGTUNPARAM: 15108 case OSIOCGTUNPARAM: 15109 /* 15110 * This is really M_IOCDATA from the tunnel driver. 15111 * convert back and complete the ioctl. 15112 * We should have an IOCTL waiting on this. 15113 */ 15114 mp1 = ill_pending_mp_get(ill, &connp, iocp->ioc_id); 15115 if (mp1) { 15116 /* 15117 * Now copy back the b_next/b_prev used by 15118 * mi code for the mi_copy* functions. 15119 * See ip_sioctl_tunparam() for the reason. 15120 * Also protect against missing b_cont. 15121 */ 15122 if (mp->b_cont != NULL) { 15123 mp->b_cont->b_next = 15124 mp1->b_cont->b_next; 15125 mp->b_cont->b_prev = 15126 mp1->b_cont->b_prev; 15127 } 15128 inet_freemsg(mp1); 15129 if (iocp->ioc_error == 0) 15130 mp->b_datap->db_type = M_IOCDATA; 15131 ASSERT(connp != NULL); 15132 ip_ioctl_finish(CONNP_TO_WQ(connp), mp, 15133 iocp->ioc_error, COPYOUT, NULL, NULL); 15134 } else { 15135 ASSERT(connp == NULL); 15136 putnext(q, mp); 15137 } 15138 break; 15139 default: 15140 break; 15141 } 15142 break; 15143 case M_IOCNAK: 15144 iocp = (struct iocblk *)mp->b_rptr; 15145 15146 switch (iocp->ioc_cmd) { 15147 int mode; 15148 ipif_t *ipif; 15149 15150 case DL_IOC_HDR_INFO: 15151 /* 15152 * If this was the first attempt turn of the 15153 * fastpath probing. 
15154 */ 15155 mutex_enter(&ill->ill_lock); 15156 if (ill->ill_dlpi_fastpath_state == IDMS_INPROGRESS) { 15157 ill->ill_dlpi_fastpath_state = IDMS_FAILED; 15158 mutex_exit(&ill->ill_lock); 15159 ill_fastpath_nack(ill); 15160 ip1dbg(("ip_rput: DLPI fastpath off on " 15161 "interface %s\n", 15162 ill->ill_name)); 15163 } else { 15164 mutex_exit(&ill->ill_lock); 15165 } 15166 freemsg(mp); 15167 break; 15168 case SIOCSTUNPARAM: 15169 case OSIOCSTUNPARAM: 15170 ASSERT(ipsq != NULL); 15171 /* 15172 * Finish socket ioctl passed through to tun 15173 * We should have an IOCTL waiting on this. 15174 */ 15175 /* FALLTHRU */ 15176 case SIOCGTUNPARAM: 15177 case OSIOCGTUNPARAM: 15178 /* 15179 * This is really M_IOCDATA from the tunnel driver. 15180 * convert back and complete the ioctl. 15181 * We should have an IOCTL waiting on this. 15182 */ 15183 if (iocp->ioc_cmd == SIOCGTUNPARAM || 15184 iocp->ioc_cmd == OSIOCGTUNPARAM) { 15185 mp1 = ill_pending_mp_get(ill, &connp, 15186 iocp->ioc_id); 15187 mode = COPYOUT; 15188 ipsq = NULL; 15189 ipif = NULL; 15190 } else { 15191 mp1 = ipsq_pending_mp_get(ipsq, &connp); 15192 mode = NO_COPYOUT; 15193 ASSERT(ipsq->ipsq_current_ipif != NULL); 15194 ipif = ipsq->ipsq_current_ipif; 15195 } 15196 if (mp1 != NULL) { 15197 /* 15198 * Now copy back the b_next/b_prev used by 15199 * mi code for the mi_copy* functions. 15200 * See ip_sioctl_tunparam() for the reason. 15201 * Also protect against missing b_cont. 
15202 */ 15203 if (mp->b_cont != NULL) { 15204 mp->b_cont->b_next = 15205 mp1->b_cont->b_next; 15206 mp->b_cont->b_prev = 15207 mp1->b_cont->b_prev; 15208 } 15209 inet_freemsg(mp1); 15210 if (iocp->ioc_error == 0) 15211 iocp->ioc_error = EINVAL; 15212 ASSERT(connp != NULL); 15213 ip_ioctl_finish(CONNP_TO_WQ(connp), mp, 15214 iocp->ioc_error, mode, ipif, ipsq); 15215 } else { 15216 ASSERT(connp == NULL); 15217 putnext(q, mp); 15218 } 15219 break; 15220 default: 15221 break; 15222 } 15223 default: 15224 break; 15225 } 15226 } 15227 15228 /* 15229 * NOTE : This function does not ire_refrele the ire argument passed in. 15230 * 15231 * IPQoS notes 15232 * IP policy is invoked twice for a forwarded packet, once on the read side 15233 * and again on the write side if both, IPP_FWD_IN and IPP_FWD_OUT are 15234 * enabled. An additional parameter, in_ill, has been added for this purpose. 15235 * Note that in_ill could be NULL when called from ip_rput_forward_multicast 15236 * because ip_mroute drops this information. 15237 * 15238 */ 15239 void 15240 ip_rput_forward(ire_t *ire, ipha_t *ipha, mblk_t *mp, ill_t *in_ill) 15241 { 15242 uint32_t pkt_len; 15243 queue_t *q; 15244 uint32_t sum; 15245 #define rptr ((uchar_t *)ipha) 15246 uint32_t max_frag; 15247 uint32_t ill_index; 15248 15249 /* Get the ill_index of the incoming ILL */ 15250 ill_index = (in_ill != NULL) ? in_ill->ill_phyint->phyint_ifindex : 0; 15251 15252 /* Initiate Read side IPPF processing */ 15253 if (IPP_ENABLED(IPP_FWD_IN)) { 15254 ip_process(IPP_FWD_IN, &mp, ill_index); 15255 if (mp == NULL) { 15256 ip2dbg(("ip_rput_forward: pkt dropped/deferred "\ 15257 "during IPPF processing\n")); 15258 return; 15259 } 15260 } 15261 pkt_len = ntohs(ipha->ipha_length); 15262 15263 /* Adjust the checksum to reflect the ttl decrement. 
	 */
	sum = (int)ipha->ipha_hdr_checksum + IP_HDR_CSUM_TTL_ADJUST;
	ipha->ipha_hdr_checksum = (uint16_t)(sum + (sum >> 16));

	if (ipha->ipha_ttl-- <= 1) {
		/* TTL expired: verify the header before generating ICMP */
		if (ip_csum_hdr(ipha)) {
			BUMP_MIB(&ip_mib, ipInCksumErrs);
			goto drop_pkt;
		}
		/*
		 * Note: ire_stq this will be NULL for multicast
		 * datagrams using the long path through arp (the IRE
		 * is not an IRE_CACHE). This should not cause
		 * problems since we don't generate ICMP errors for
		 * multicast packets.
		 */
		q = ire->ire_stq;
		if (q)
			icmp_time_exceeded(q, mp, ICMP_TTL_EXCEEDED);
		else
			freemsg(mp);
		return;
	}

	/*
	 * Don't forward if the interface is down
	 */
	if (ire->ire_ipif->ipif_ill->ill_ipif_up_count == 0) {
		BUMP_MIB(&ip_mib, ipInDiscards);
		goto drop_pkt;
	}

	/* Get the ill_index of the outgoing ILL */
	ill_index = ire->ire_ipif->ipif_ill->ill_phyint->phyint_ifindex;

	if (is_system_labeled()) {
		mblk_t *mp1;

		if ((mp1 = tsol_ip_forward(ire, mp)) == NULL) {
			BUMP_MIB(&ip_mib, ipForwProhibits);
			goto drop_pkt;
		}
		/* Size may have changed */
		mp = mp1;
		ipha = (ipha_t *)mp->b_rptr;
		pkt_len = ntohs(ipha->ipha_length);
	}

	/* Check if there are options to update */
	if (!IS_SIMPLE_IPH(ipha)) {
		if (ip_csum_hdr(ipha)) {
			BUMP_MIB(&ip_mib, ipInCksumErrs);
			goto drop_pkt;
		}
		if (ip_rput_forward_options(mp, ipha, ire)) {
			return;
		}

		ipha->ipha_hdr_checksum = 0;
		ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
	}
	max_frag = ire->ire_max_frag;
	if (pkt_len > max_frag) {
		/*
		 * It needs fragging on its way out. We haven't
		 * verified the header checksum yet. Since we
		 * are going to put a surely good checksum in the
		 * outgoing header, we have to make sure that it
		 * was good coming in.
		 */
		if (ip_csum_hdr(ipha)) {
			BUMP_MIB(&ip_mib, ipInCksumErrs);
			goto drop_pkt;
		}
		/* Initiate Write side IPPF processing */
		if (IPP_ENABLED(IPP_FWD_OUT)) {
			ip_process(IPP_FWD_OUT, &mp, ill_index);
			if (mp == NULL) {
				ip2dbg(("ip_rput_forward: pkt dropped/deferred"\
				    " during IPPF processing\n"));
				return;
			}
		}
		ip_wput_frag(ire, mp, IB_PKT, max_frag, 0);
		return;
	}

	mp = ip_wput_attach_llhdr(mp, ire, IPP_FWD_OUT, ill_index);
	if (mp == NULL) {
		BUMP_MIB(&ip_mib, ipInDiscards);
		return;
	}

	q = ire->ire_stq;
	UPDATE_IB_PKT_COUNT(ire);
	ire->ire_last_used_time = lbolt;
	BUMP_MIB(&ip_mib, ipForwDatagrams);
	putnext(q, mp);
	return;

drop_pkt:;
	ip1dbg(("ip_rput_forward: drop pkt\n"));
	freemsg(mp);
#undef	rptr
}

/*
 * Forward a multicast packet out via 'ipif'.  Looks up (or kicks off
 * creation of) the needed IRE_CACHE, then hands off to ip_rput_forward().
 * IPv4 only (asserted below).
 */
void
ip_rput_forward_multicast(ipaddr_t dst, mblk_t *mp, ipif_t *ipif)
{
	ire_t	*ire;

	ASSERT(!ipif->ipif_isv6);
	/*
	 * Find an IRE which matches the destination and the outgoing
	 * queue in the cache table. All we need is an IRE_CACHE which
	 * is pointing at ipif->ipif_ill. If it is part of some ill group,
	 * then it is enough to have some IRE_CACHE in the group.
	 */
	if (ipif->ipif_flags & IPIF_POINTOPOINT)
		dst = ipif->ipif_pp_dst_addr;
	ire = ire_ctable_lookup(dst, 0, 0, ipif, ALL_ZONES, MBLK_GETLABEL(mp),
	    MATCH_IRE_ILL_GROUP | MATCH_IRE_SECATTR);
	if (ire == NULL) {
		/*
		 * Mark this packet to make it be delivered to
		 * ip_rput_forward after the new ire has been
		 * created.
		 */
		mp->b_prev = NULL;
		mp->b_next = mp;
		ip_newroute_ipif(ipif->ipif_ill->ill_wq, mp, ipif, dst,
		    NULL, 0);
	} else {
		ip_rput_forward(ire, (ipha_t *)mp->b_rptr, mp, NULL);
		IRE_REFRELE(ire);
	}
}

/* Update any source route, record route or timestamp options */
static int
ip_rput_forward_options(mblk_t *mp, ipha_t *ipha, ire_t *ire)
{
	ipoptp_t	opts;
	uchar_t		*opt;
	uint8_t		optval;
	uint8_t		optlen;
	ipaddr_t	dst;
	uint32_t	ts;
	ire_t		*dst_ire = NULL;
	ire_t		*tmp_ire = NULL;
	timestruc_t	now;

	ip2dbg(("ip_rput_forward_options\n"));
	dst = ipha->ipha_dst;
	for (optval = ipoptp_first(&opts, ipha);
	    optval != IPOPT_EOL;
	    optval = ipoptp_next(&opts)) {
		ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0);
		opt = opts.ipoptp_cur;
		optlen = opts.ipoptp_len;
		ip2dbg(("ip_rput_forward_options: opt %d, len %d\n",
		    optval, opts.ipoptp_len));
		switch (optval) {
		uint32_t off;
		case IPOPT_SSRR:
		case IPOPT_LSRR:
			/* Check if administratively disabled */
			if (!ip_forward_src_routed) {
				BUMP_MIB(&ip_mib, ipForwProhibits);
				if (ire->ire_stq)
					icmp_unreachable(ire->ire_stq, mp,
					    ICMP_SOURCE_ROUTE_FAILED);
				else {
					ip0dbg(("ip_rput_forward_options: "
					    "unable to send unreach\n"));
					freemsg(mp);
				}
				return (-1);
			}

			dst_ire = ire_ctable_lookup(dst, 0, IRE_LOCAL,
			    NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE);
			if (dst_ire == NULL) {
				/*
				 * Must be partial since ip_rput_options
				 * checked for strict.
				 */
				break;
			}
			off = opt[IPOPT_OFFSET];
			off--;
		redo_srr:
			if (optlen < IP_ADDR_LEN ||
			    off > optlen - IP_ADDR_LEN) {
				/* End of source route */
				ip1dbg((
				    "ip_rput_forward_options: end of SR\n"));
				ire_refrele(dst_ire);
				break;
			}
			/*
			 * Consume the next hop: it becomes the new
			 * destination, and our source address is recorded
			 * in its place in the option.
			 */
			bcopy((char *)opt + off, &dst, IP_ADDR_LEN);
			bcopy(&ire->ire_src_addr, (char *)opt + off,
			    IP_ADDR_LEN);
			ip1dbg(("ip_rput_forward_options: next hop 0x%x\n",
			    ntohl(dst)));

			/*
			 * Check if our address is present more than
			 * once as consecutive hops in source route.
			 */
			tmp_ire = ire_ctable_lookup(dst, 0, IRE_LOCAL,
			    NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE);
			if (tmp_ire != NULL) {
				ire_refrele(tmp_ire);
				off += IP_ADDR_LEN;
				opt[IPOPT_OFFSET] += IP_ADDR_LEN;
				goto redo_srr;
			}
			ipha->ipha_dst = dst;
			opt[IPOPT_OFFSET] += IP_ADDR_LEN;
			ire_refrele(dst_ire);
			break;
		case IPOPT_RR:
			off = opt[IPOPT_OFFSET];
			off--;
			if (optlen < IP_ADDR_LEN ||
			    off > optlen - IP_ADDR_LEN) {
				/* No more room - ignore */
				ip1dbg((
				    "ip_rput_forward_options: end of RR\n"));
				break;
			}
			bcopy(&ire->ire_src_addr, (char *)opt + off,
			    IP_ADDR_LEN);
			opt[IPOPT_OFFSET] += IP_ADDR_LEN;
			break;
		case IPOPT_TS:
			/* Insert timestamp if there is room */
			switch (opt[IPOPT_POS_OV_FLG] & 0x0F) {
			case IPOPT_TS_TSONLY:
				off = IPOPT_TS_TIMELEN;
				break;
			case IPOPT_TS_PRESPEC:
			case IPOPT_TS_PRESPEC_RFC791:
				/* Verify that the address matched */
				off = opt[IPOPT_OFFSET] - 1;
				bcopy((char *)opt + off, &dst, IP_ADDR_LEN);
				dst_ire = ire_ctable_lookup(dst, 0,
				    IRE_LOCAL, NULL, ALL_ZONES, NULL,
				    MATCH_IRE_TYPE);

				if (dst_ire == NULL) {
					/* Not for us */
					break;
				}
				ire_refrele(dst_ire);
				/* FALLTHRU */
			case IPOPT_TS_TSANDADDR:
				off = IP_ADDR_LEN + IPOPT_TS_TIMELEN;
				break;
			default:
				/*
				 * ip_*put_options should have already
				 * dropped this packet.
				 */
				cmn_err(CE_PANIC, "ip_rput_forward_options: "
				    "unknown IT - bug in ip_rput_options?\n");
				return (0);	/* Keep "lint" happy */
			}
			if (opt[IPOPT_OFFSET] - 1 + off > optlen) {
				/* Increase overflow counter */
				off = (opt[IPOPT_POS_OV_FLG] >> 4) + 1;
				opt[IPOPT_POS_OV_FLG] =
				    (uint8_t)((opt[IPOPT_POS_OV_FLG] & 0x0F) |
				    (off << 4));
				break;
			}
			off = opt[IPOPT_OFFSET] - 1;
			switch (opt[IPOPT_POS_OV_FLG] & 0x0F) {
			case IPOPT_TS_PRESPEC:
			case IPOPT_TS_PRESPEC_RFC791:
			case IPOPT_TS_TSANDADDR:
				bcopy(&ire->ire_src_addr,
				    (char *)opt + off, IP_ADDR_LEN);
				opt[IPOPT_OFFSET] += IP_ADDR_LEN;
				/* FALLTHRU */
			case IPOPT_TS_TSONLY:
				off = opt[IPOPT_OFFSET] - 1;
				/* Compute # of milliseconds since midnight */
				gethrestime(&now);
				ts = (now.tv_sec % (24 * 60 * 60)) * 1000 +
				    now.tv_nsec / (NANOSEC / MILLISEC);
				bcopy(&ts, (char *)opt + off, IPOPT_TS_TIMELEN);
				opt[IPOPT_OFFSET] += IPOPT_TS_TIMELEN;
				break;
			}
			break;
		}
	}
	return (0);
}

/*
 * This is called after processing at least one of AH/ESP headers.
 *
 * NOTE: the ill corresponding to ipsec_in_ill_index may not be
 * the actual, physical interface on which the packet was received,
 * but, when ip_strict_dst_multihoming is set to 1, could be the
 * interface which had the ipha_dst configured when the packet went
 * through ip_rput.
 * The ill_index corresponding to the recv_ill
 * is saved in ipsec_in_rill_index
 *
 * If ill/recv_ill are NULL they are looked up from the indices stored in
 * the IPSEC_IN block and released before returning; an ire looked up here
 * is likewise released, but an ire passed in by the caller is not.
 */
void
ip_fanout_proto_again(mblk_t *ipsec_mp, ill_t *ill, ill_t *recv_ill, ire_t *ire)
{
	mblk_t *mp;
	ipaddr_t dst;
	in6_addr_t *v6dstp;
	ipha_t *ipha;
	ip6_t *ip6h;
	ipsec_in_t *ii;
	boolean_t ill_need_rele = B_FALSE;
	boolean_t rill_need_rele = B_FALSE;
	boolean_t ire_need_rele = B_FALSE;

	ii = (ipsec_in_t *)ipsec_mp->b_rptr;
	ASSERT(ii->ipsec_in_ill_index != 0);

	/* The actual datagram follows the IPSEC_IN control block */
	mp = ipsec_mp->b_cont;
	ASSERT(mp != NULL);

	if (ill == NULL) {
		ASSERT(recv_ill == NULL);
		/*
		 * We need to get the original queue on which ip_rput_local
		 * or ip_rput_data_v6 was called.
		 */
		ill = ill_lookup_on_ifindex(ii->ipsec_in_ill_index,
		    !ii->ipsec_in_v4, NULL, NULL, NULL, NULL);
		ill_need_rele = B_TRUE;

		if (ii->ipsec_in_ill_index != ii->ipsec_in_rill_index) {
			recv_ill = ill_lookup_on_ifindex(
			    ii->ipsec_in_rill_index, !ii->ipsec_in_v4,
			    NULL, NULL, NULL, NULL);
			rill_need_rele = B_TRUE;
		} else {
			recv_ill = ill;
		}

		if ((ill == NULL) || (recv_ill == NULL)) {
			ip0dbg(("ip_fanout_proto_again: interface "
			    "disappeared\n"));
			if (ill != NULL)
				ill_refrele(ill);
			if (recv_ill != NULL)
				ill_refrele(recv_ill);
			freemsg(ipsec_mp);
			return;
		}
	}

	ASSERT(ill != NULL && recv_ill != NULL);

	if (mp->b_datap->db_type == M_CTL) {
		/*
		 * AH/ESP is returning the ICMP message after
		 * removing their headers. Fanout again till
		 * it gets to the right protocol.
		 */
		if (ii->ipsec_in_v4) {
			icmph_t *icmph;
			int iph_hdr_length;
			int hdr_length;

			ipha = (ipha_t *)mp->b_rptr;
			iph_hdr_length = IPH_HDR_LENGTH(ipha);
			icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
			/* Inner (offending) IP header follows the ICMP hdr */
			ipha = (ipha_t *)&icmph[1];
			hdr_length = IPH_HDR_LENGTH(ipha);
			/*
			 * icmp_inbound_error_fanout may need to do pullupmsg.
			 * Reset the type to M_DATA.
			 */
			mp->b_datap->db_type = M_DATA;
			icmp_inbound_error_fanout(ill->ill_rq, ill, ipsec_mp,
			    icmph, ipha, iph_hdr_length, hdr_length, B_TRUE,
			    B_FALSE, ill, ii->ipsec_in_zoneid);
		} else {
			icmp6_t *icmp6;
			int hdr_length;

			ip6h = (ip6_t *)mp->b_rptr;
			/* Don't call hdr_length_v6() unless you have to. */
			if (ip6h->ip6_nxt != IPPROTO_ICMPV6)
				hdr_length = ip_hdr_length_v6(mp, ip6h);
			else
				hdr_length = IPV6_HDR_LEN;

			icmp6 = (icmp6_t *)(&mp->b_rptr[hdr_length]);
			/*
			 * icmp_inbound_error_fanout_v6 may need to do
			 * pullupmsg. Reset the type to M_DATA.
			 */
			mp->b_datap->db_type = M_DATA;
			icmp_inbound_error_fanout_v6(ill->ill_rq, ipsec_mp,
			    ip6h, icmp6, ill, B_TRUE, ii->ipsec_in_zoneid);
		}
		if (ill_need_rele)
			ill_refrele(ill);
		if (rill_need_rele)
			ill_refrele(recv_ill);
		return;
	}

	if (ii->ipsec_in_v4) {
		ipha = (ipha_t *)mp->b_rptr;
		dst = ipha->ipha_dst;
		if (CLASSD(dst)) {
			/*
			 * Multicast has to be delivered to all streams.
			 */
			dst = INADDR_BROADCAST;
		}

		if (ire == NULL) {
			ire = ire_cache_lookup(dst, ii->ipsec_in_zoneid,
			    MBLK_GETLABEL(mp));
			if (ire == NULL) {
				if (ill_need_rele)
					ill_refrele(ill);
				if (rill_need_rele)
					ill_refrele(recv_ill);
				ip1dbg(("ip_fanout_proto_again: "
				    "IRE not found"));
				freemsg(ipsec_mp);
				return;
			}
			/* We own this reference and must release it below */
			ire_need_rele = B_TRUE;
		}

		switch (ipha->ipha_protocol) {
		case IPPROTO_UDP:
			ip_udp_input(ill->ill_rq, ipsec_mp, ipha, ire,
			    recv_ill);
			if (ire_need_rele)
				ire_refrele(ire);
			break;
		case IPPROTO_TCP:
			/*
			 * ip_tcp_input consumes a reference; take one if we
			 * are passing through the caller's ire.
			 */
			if (!ire_need_rele)
				IRE_REFHOLD(ire);
			mp = ip_tcp_input(mp, ipha, ill, B_TRUE,
			    ire, ipsec_mp, 0, ill->ill_rq, NULL);
			IRE_REFRELE(ire);
			if (mp != NULL)
				squeue_enter_chain(GET_SQUEUE(mp), mp,
				    mp, 1, SQTAG_IP_PROTO_AGAIN);
			break;
		case IPPROTO_SCTP:
			if (!ire_need_rele)
				IRE_REFHOLD(ire);
			ip_sctp_input(mp, ipha, ill, B_TRUE, ire,
			    ipsec_mp, 0, ill->ill_rq, dst);
			break;
		default:
			ip_proto_input(ill->ill_rq, ipsec_mp, ipha, ire,
			    recv_ill);
			if (ire_need_rele)
				ire_refrele(ire);
			break;
		}
	} else {
		uint32_t rput_flags = 0;

		ip6h = (ip6_t *)mp->b_rptr;
		v6dstp = &ip6h->ip6_dst;
		/*
		 * XXX Assumes ip_rput_v6 sets ll_multicast only for multicast
		 * address.
		 *
		 * Currently, we don't store that state in the IPSEC_IN
		 * message, and we may need to.
		 */
		rput_flags |= (IN6_IS_ADDR_MULTICAST(v6dstp) ?
		    IP6_IN_LLMCAST : 0);
		ip_rput_data_v6(ill->ill_rq, ill, ipsec_mp, ip6h, rput_flags,
		    NULL);
	}
	if (ill_need_rele)
		ill_refrele(ill);
	if (rill_need_rele)
		ill_refrele(recv_ill);
}

/*
 * Call ill_frag_timeout to do garbage collection.
 * ill_frag_timeout
 * returns 'true' if there are still fragments left on the queue, in
 * which case we restart the timer.
 *
 * This is the timeout(9F) handler for per-ill IP reassembly garbage
 * collection; it coordinates with ill_frag_timer_start() through
 * ill_fragtimer_executing/ill_fragtimer_needrestart under ill_lock.
 */
void
ill_frag_timer(void *arg)
{
	ill_t	*ill = (ill_t *)arg;
	boolean_t frag_pending;

	mutex_enter(&ill->ill_lock);
	ASSERT(!ill->ill_fragtimer_executing);
	if (ill->ill_state_flags & ILL_CONDEMNED) {
		/* The ill is going away; don't reschedule */
		ill->ill_frag_timer_id = 0;
		mutex_exit(&ill->ill_lock);
		return;
	}
	ill->ill_fragtimer_executing = 1;
	mutex_exit(&ill->ill_lock);

	/* Run the timeout scan without holding ill_lock */
	frag_pending = ill_frag_timeout(ill, ip_g_frag_timeout);

	/*
	 * Restart the timer, if we have fragments pending or if someone
	 * wanted us to be scheduled again.
	 */
	mutex_enter(&ill->ill_lock);
	ill->ill_fragtimer_executing = 0;
	ill->ill_frag_timer_id = 0;
	if (frag_pending || ill->ill_fragtimer_needrestart)
		ill_frag_timer_start(ill);
	mutex_exit(&ill->ill_lock);
}

/*
 * Arrange for ill_frag_timer() to run.  Caller must hold ill_lock.
 * If the handler is currently executing we only record the restart
 * request; the handler itself reschedules before returning.
 */
void
ill_frag_timer_start(ill_t *ill)
{
	ASSERT(MUTEX_HELD(&ill->ill_lock));

	/* If the ill is closing or opening don't proceed */
	if (ill->ill_state_flags & ILL_CONDEMNED)
		return;

	if (ill->ill_fragtimer_executing) {
		/*
		 * ill_frag_timer is currently executing. Just record the
		 * fact that we want the timer to be restarted.
		 * ill_frag_timer will post a timeout before it returns,
		 * ensuring it will be called again.
		 */
		ill->ill_fragtimer_needrestart = 1;
		return;
	}

	if (ill->ill_frag_timer_id == 0) {
		/*
		 * The timer is neither running nor is the timeout handler
		 * executing. Post a timeout so that ill_frag_timer will be
		 * called
		 */
		ill->ill_frag_timer_id = timeout(ill_frag_timer, ill,
		    MSEC_TO_TICK(ip_g_frag_timo_ms >> 1));
		ill->ill_fragtimer_needrestart = 0;
	}
}

/*
 * Deliver a locally-destined datagram for protocols other than TCP/UDP:
 * verify the header checksum, reassemble fragments, then fan out to
 * ICMP/IGMP/PIM, handle self-encapsulated IP-in-IP, hand AH/ESP packets
 * to IPsec, and finally deliver anything else to raw-protocol listeners.
 *
 * This routine is needed for loopback when forwarding multicasts.
 *
 * IPQoS Notes:
 * IPPF processing is done in fanout routines.
 * Policy processing is done only if IPP_LOCAL_IN is enabled. Further,
 * processing for IPSec packets is done when it comes back in clear.
 * NOTE : The callers of this function need to do the ire_refrele for the
 * ire that is being passed in.
 */
void
ip_proto_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire,
    ill_t *recv_ill)
{
	ill_t	*ill = (ill_t *)q->q_ptr;
	uint32_t	sum;
	uint32_t	u1;
	uint32_t	u2;
	int		hdr_length;
	boolean_t	mctl_present;
	mblk_t		*first_mp = mp;
	mblk_t		*hada_mp = NULL;
	ipha_t		*inner_ipha;

	TRACE_1(TR_FAC_IP, TR_IP_RPUT_LOCL_START,
	    "ip_rput_locl_start: q %p", q);

	ASSERT(ire->ire_ipversion == IPV4_VERSION);

#define	rptr	((uchar_t *)ipha)
#define	iphs	((uint16_t *)ipha)

	/*
	 * no UDP or TCP packet should come here anymore.
	 */
	ASSERT((ipha->ipha_protocol != IPPROTO_TCP) &&
	    (ipha->ipha_protocol != IPPROTO_UDP));

	/* Split off an M_CTL (IPSEC_IN) block if one is present */
	EXTRACT_PKT_MP(mp, first_mp, mctl_present);
	if (mctl_present &&
	    ((da_ipsec_t *)first_mp->b_rptr)->da_type == IPHADA_M_CTL) {
		ASSERT(MBLKL(first_mp) >= sizeof (da_ipsec_t));

		/*
		 * It's an IPsec accelerated packet.
		 * Keep a pointer to the data attributes around until
		 * we allocate the ipsec_info_t.
		 */
		IPSECHW_DEBUG(IPSECHW_PKT,
		    ("ip_rput_local: inbound HW accelerated IPsec pkt\n"));
		hada_mp = first_mp;
		hada_mp->b_cont = NULL;
		/*
		 * Since it is accelerated, it comes directly from
		 * the ill and the data attributes is followed by
		 * the packet data.
		 */
		ASSERT(mp->b_datap->db_type != M_CTL);
		first_mp = mp;
		mctl_present = B_FALSE;
	}

	/*
	 * IF M_CTL is not present, then ipsec_in_is_secure
	 * should return B_TRUE. There is a case where loopback
	 * packets has an M_CTL in the front with all the
	 * IPSEC options set to IPSEC_PREF_NEVER - which means
	 * ipsec_in_is_secure will return B_FALSE. As loopback
	 * packets never comes here, it is safe to ASSERT the
	 * following.
	 */
	ASSERT(!mctl_present || ipsec_in_is_secure(first_mp));

	/* u1 is # words of IP options */
	u1 = ipha->ipha_version_and_hdr_length - (uchar_t)((IP_VERSION << 4)
	    + IP_SIMPLE_HDR_LENGTH_IN_WORDS);

	if (u1) {
		/* Options present: validate them (checksum included) */
		if (!ip_options_cksum(q, mp, ipha, ire)) {
			if (hada_mp != NULL)
				freemsg(hada_mp);
			return;
		}
	} else {
		/* Check the IP header checksum. */
#define	uph	((uint16_t *)ipha)
		sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] + uph[5] +
		    uph[6] + uph[7] + uph[8] + uph[9];
#undef	uph
		/* finish doing IP checksum */
		sum = (sum & 0xFFFF) + (sum >> 16);
		sum = ~(sum + (sum >> 16)) & 0xFFFF;
		/*
		 * Don't verify header checksum if this packet is coming
		 * back from AH/ESP as we already did it.
		 */
		if (!mctl_present && (sum && sum != 0xFFFF)) {
			BUMP_MIB(&ip_mib, ipInCksumErrs);
			goto drop_pkt;
		}
	}

	/*
	 * Count for SNMP of inbound packets for ire. As ip_proto_input
	 * might be called more than once for secure packets, count only
	 * the first time.
	 */
	if (!mctl_present) {
		UPDATE_IB_PKT_COUNT(ire);
		ire->ire_last_used_time = lbolt;
	}

	/* Check for fragmentation offset. */
	u2 = ntohs(ipha->ipha_fragment_offset_and_flags);
	u1 = u2 & (IPH_MF | IPH_OFFSET);
	if (u1) {
		/*
		 * We re-assemble fragments before we do the AH/ESP
		 * processing. Thus, M_CTL should not be present
		 * while we are re-assembling.
		 */
		ASSERT(!mctl_present);
		ASSERT(first_mp == mp);
		if (!ip_rput_fragment(q, &mp, ipha, NULL, NULL)) {
			return;
		}
		/*
		 * Make sure that first_mp points back to mp as
		 * the mp we came in with could have changed in
		 * ip_rput_fragment().
		 */
		ipha = (ipha_t *)mp->b_rptr;
		first_mp = mp;
	}

	/*
	 * Clear hardware checksumming flag as it is currently only
	 * used by TCP and UDP.
	 */
	DB_CKSUMFLAGS(mp) = 0;

	/* Now we have a complete datagram, destined for this machine. */
	u1 = IPH_HDR_LENGTH(ipha);
	switch (ipha->ipha_protocol) {
	case IPPROTO_ICMP: {
		ire_t		*ire_zone;
		ilm_t		*ilm;
		mblk_t		*mp1;
		zoneid_t	last_zoneid;

		if (CLASSD(ipha->ipha_dst) &&
		    !(recv_ill->ill_phyint->phyint_flags & PHYI_LOOPBACK)) {
			ASSERT(ire->ire_type == IRE_BROADCAST);
			/*
			 * In the multicast case, applications may have joined
			 * the group from different zones, so we need to deliver
			 * the packet to each of them. Loop through the
			 * multicast memberships structures (ilm) on the receive
			 * ill and send a copy of the packet up each matching
			 * one. However, we don't do this for multicasts sent on
			 * the loopback interface (PHYI_LOOPBACK flag set) as
			 * they must stay in the sender's zone.
			 *
			 * ilm_add_v6() ensures that ilms in the same zone are
			 * contiguous in the ill_ilm list. We use this property
			 * to avoid sending duplicates needed when two
			 * applications in the same zone join the same group on
			 * different logical interfaces: we ignore the ilm if
			 * its zoneid is the same as the last matching one.
			 * In addition, the sending of the packet for
			 * ire_zoneid is delayed until all of the other ilms
			 * have been exhausted.
			 */
			last_zoneid = -1;
			ILM_WALKER_HOLD(recv_ill);
			for (ilm = recv_ill->ill_ilm; ilm != NULL;
			    ilm = ilm->ilm_next) {
				if ((ilm->ilm_flags & ILM_DELETED) ||
				    ipha->ipha_dst != ilm->ilm_addr ||
				    ilm->ilm_zoneid == last_zoneid ||
				    ilm->ilm_zoneid == ire->ire_zoneid ||
				    ilm->ilm_zoneid == ALL_ZONES ||
				    !(ilm->ilm_ipif->ipif_flags & IPIF_UP))
					continue;
				mp1 = ip_copymsg(first_mp);
				if (mp1 == NULL)
					continue;
				icmp_inbound(q, mp1, B_TRUE, ill,
				    0, sum, mctl_present, B_TRUE,
				    recv_ill, ilm->ilm_zoneid);
				last_zoneid = ilm->ilm_zoneid;
			}
			ILM_WALKER_RELE(recv_ill);
		} else if (ire->ire_type == IRE_BROADCAST) {
			/*
			 * In the broadcast case, there may be many zones
			 * which need a copy of the packet delivered to them.
			 * There is one IRE_BROADCAST per broadcast address
			 * and per zone; we walk those using a helper function.
			 * In addition, the sending of the packet for ire is
			 * delayed until all of the other ires have been
			 * processed.
			 */
			IRB_REFHOLD(ire->ire_bucket);
			ire_zone = NULL;
			while ((ire_zone = ire_get_next_bcast_ire(ire_zone,
			    ire)) != NULL) {
				mp1 = ip_copymsg(first_mp);
				if (mp1 == NULL)
					continue;

				UPDATE_IB_PKT_COUNT(ire_zone);
				ire_zone->ire_last_used_time = lbolt;
				icmp_inbound(q, mp1, B_TRUE, ill,
				    0, sum, mctl_present, B_TRUE,
				    recv_ill, ire_zone->ire_zoneid);
			}
			IRB_REFRELE(ire->ire_bucket);
		}
		/* Finally deliver the original message for ire's own zone */
		icmp_inbound(q, first_mp, (ire->ire_type == IRE_BROADCAST),
		    ill, 0, sum, mctl_present, B_TRUE, recv_ill,
		    ire->ire_zoneid);
		TRACE_2(TR_FAC_IP, TR_IP_RPUT_LOCL_END,
		    "ip_rput_locl_end: q %p (%S)", q, "icmp");
		return;
	}
	case IPPROTO_IGMP:
		/*
		 * If we are not willing to accept IGMP packets in clear,
		 * then check with global policy.
		 */
		if (igmp_accept_clear_messages == 0) {
			first_mp = ipsec_check_global_policy(first_mp, NULL,
			    ipha, NULL, mctl_present);
			if (first_mp == NULL)
				return;
		}
		if (is_system_labeled() && !tsol_can_accept_raw(mp, B_TRUE)) {
			freemsg(first_mp);
			ip1dbg(("ip_proto_input: zone all cannot accept raw"));
			BUMP_MIB(&ip_mib, ipInDiscards);
			return;
		}
		if (igmp_input(q, mp, ill)) {
			/* Bad packet - discarded by igmp_input */
			TRACE_2(TR_FAC_IP, TR_IP_RPUT_LOCL_END,
			    "ip_rput_locl_end: q %p (%S)", q, "igmp");
			if (mctl_present)
				freeb(first_mp);
			return;
		}
		/*
		 * igmp_input() may have pulled up the message so ipha needs to
		 * be reinitialized.
		 */
		ipha = (ipha_t *)mp->b_rptr;
		if (ipcl_proto_search(ipha->ipha_protocol) == NULL) {
			/* No user-level listener for IGMP packets */
			goto drop_pkt;
		}
		/* deliver to local raw users */
		break;
	case IPPROTO_PIM:
		/*
		 * If we are not willing to accept PIM packets in clear,
		 * then check with global policy.
		 */
		if (pim_accept_clear_messages == 0) {
			first_mp = ipsec_check_global_policy(first_mp, NULL,
			    ipha, NULL, mctl_present);
			if (first_mp == NULL)
				return;
		}
		if (is_system_labeled() && !tsol_can_accept_raw(mp, B_TRUE)) {
			freemsg(first_mp);
			ip1dbg(("ip_proto_input: zone all cannot accept PIM"));
			BUMP_MIB(&ip_mib, ipInDiscards);
			return;
		}
		if (pim_input(q, mp) != 0) {
			/* Bad packet - discarded by pim_input */
			TRACE_2(TR_FAC_IP, TR_IP_RPUT_LOCL_END,
			    "ip_rput_locl_end: q %p (%S)", q, "pim");
			if (mctl_present)
				freeb(first_mp);
			return;
		}

		/*
		 * pim_input() may have pulled up the message so ipha needs to
		 * be reinitialized.
		 */
		ipha = (ipha_t *)mp->b_rptr;
		if (ipcl_proto_search(ipha->ipha_protocol) == NULL) {
			/* No user-level listener for PIM packets */
			goto drop_pkt;
		}
		/* deliver to local raw users */
		break;
	case IPPROTO_ENCAP:
		/*
		 * Handle self-encapsulated packets (IP-in-IP where
		 * the inner addresses == the outer addresses).
		 */
		hdr_length = IPH_HDR_LENGTH(ipha);
		/* Make sure the inner header is contiguous in this mblk */
		if ((uchar_t *)ipha + hdr_length + sizeof (ipha_t) >
		    mp->b_wptr) {
			if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length +
			    sizeof (ipha_t) - mp->b_rptr)) {
				BUMP_MIB(&ip_mib, ipInDiscards);
				freemsg(first_mp);
				return;
			}
			ipha = (ipha_t *)mp->b_rptr;
		}
		inner_ipha = (ipha_t *)((uchar_t *)ipha + hdr_length);
		/*
		 * Check the sanity of the inner IP header.
		 */
		if ((IPH_HDR_VERSION(inner_ipha) != IPV4_VERSION)) {
			BUMP_MIB(&ip_mib, ipInDiscards);
			freemsg(first_mp);
			return;
		}
		if (IPH_HDR_LENGTH(inner_ipha) < sizeof (ipha_t)) {
			BUMP_MIB(&ip_mib, ipInDiscards);
			freemsg(first_mp);
			return;
		}
		if (inner_ipha->ipha_src == ipha->ipha_src &&
		    inner_ipha->ipha_dst == ipha->ipha_dst) {
			ipsec_in_t *ii;

			/*
			 * Self-encapsulated tunnel packet. Remove
			 * the outer IP header and fanout again.
			 * We also need to make sure that the inner
			 * header is pulled up until options.
			 */
			mp->b_rptr = (uchar_t *)inner_ipha;
			ipha = inner_ipha;
			hdr_length = IPH_HDR_LENGTH(ipha);
			if ((uchar_t *)ipha + hdr_length > mp->b_wptr) {
				if (!pullupmsg(mp, (uchar_t *)ipha +
				    + hdr_length - mp->b_rptr)) {
					freemsg(first_mp);
					return;
				}
				ipha = (ipha_t *)mp->b_rptr;
			}
			if (!mctl_present) {
				ASSERT(first_mp == mp);
				/*
				 * This means that somebody is sending
				 * self-encapsulated packets without AH/ESP.
				 * If AH/ESP was present, we would have already
				 * allocated the first_mp.
				 */
				if ((first_mp = ipsec_in_alloc(B_TRUE)) ==
				    NULL) {
					ip1dbg(("ip_proto_input: IPSEC_IN "
					    "allocation failure.\n"));
					BUMP_MIB(&ip_mib, ipInDiscards);
					freemsg(mp);
					return;
				}
				first_mp->b_cont = mp;
			}
			/*
			 * We generally store the ill_index if we need to
			 * do IPSEC processing as we lose the ill queue when
			 * we come back. But in this case, we never should
			 * have to store the ill_index here as it should have
			 * been stored previously when we processed the
			 * AH/ESP header in this routine or for non-ipsec
			 * cases, we still have the queue. But for some bad
			 * packets from the wire, we can get to IPSEC after
			 * this and we better store the index for that case.
			 */
			ill = (ill_t *)q->q_ptr;
			ii = (ipsec_in_t *)first_mp->b_rptr;
			ii->ipsec_in_ill_index =
			    ill->ill_phyint->phyint_ifindex;
			ii->ipsec_in_rill_index =
			    recv_ill->ill_phyint->phyint_ifindex;
			if (ii->ipsec_in_decaps) {
				/*
				 * This packet is self-encapsulated multiple
				 * times. We don't want to recurse infinitely.
				 * To keep it simple, drop the packet.
				 */
				BUMP_MIB(&ip_mib, ipInDiscards);
				freemsg(first_mp);
				return;
			}
			ii->ipsec_in_decaps = B_TRUE;
			/* Recurse exactly once for the decapsulated packet */
			ip_proto_input(q, first_mp, ipha, ire, recv_ill);
			return;
		}
		break;
	case IPPROTO_AH:
	case IPPROTO_ESP: {
		/*
		 * Fast path for AH/ESP. If this is the first time
		 * we are sending a datagram to AH/ESP, allocate
		 * a IPSEC_IN message and prepend it. Otherwise,
		 * just fanout.
		 */

		int ipsec_rc;
		ipsec_in_t *ii;

		IP_STAT(ipsec_proto_ahesp);
		if (!mctl_present) {
			ASSERT(first_mp == mp);
			if ((first_mp = ipsec_in_alloc(B_TRUE)) == NULL) {
				ip1dbg(("ip_proto_input: IPSEC_IN "
				    "allocation failure.\n"));
				freemsg(hada_mp); /* okay ifnull */
				BUMP_MIB(&ip_mib, ipInDiscards);
				freemsg(mp);
				return;
			}
			/*
			 * Store the ill_index so that when we come back
			 * from IPSEC we ride on the same queue.
			 */
			ill = (ill_t *)q->q_ptr;
			ii = (ipsec_in_t *)first_mp->b_rptr;
			ii->ipsec_in_ill_index =
			    ill->ill_phyint->phyint_ifindex;
			ii->ipsec_in_rill_index =
			    recv_ill->ill_phyint->phyint_ifindex;
			first_mp->b_cont = mp;
			/*
			 * Cache hardware acceleration info.
			 */
			if (hada_mp != NULL) {
				IPSECHW_DEBUG(IPSECHW_PKT,
				    ("ip_rput_local: caching data attr.\n"));
				ii->ipsec_in_accelerated = B_TRUE;
				ii->ipsec_in_da = hada_mp;
				hada_mp = NULL;
			}
		} else {
			ii = (ipsec_in_t *)first_mp->b_rptr;
		}

		if (!ipsec_loaded()) {
			ip_proto_not_sup(q, first_mp, IP_FF_SEND_ICMP,
			    ire->ire_zoneid);
			return;
		}

		/* select inbound SA and have IPsec process the pkt */
		if (ipha->ipha_protocol == IPPROTO_ESP) {
			esph_t *esph = ipsec_inbound_esp_sa(first_mp);
			if (esph == NULL)
				return;
			ASSERT(ii->ipsec_in_esp_sa != NULL);
			ASSERT(ii->ipsec_in_esp_sa->ipsa_input_func != NULL);
			ipsec_rc = ii->ipsec_in_esp_sa->ipsa_input_func(
			    first_mp, esph);
		} else {
			ah_t *ah = ipsec_inbound_ah_sa(first_mp);
			if (ah == NULL)
				return;
			ASSERT(ii->ipsec_in_ah_sa != NULL);
			ASSERT(ii->ipsec_in_ah_sa->ipsa_input_func != NULL);
			ipsec_rc = ii->ipsec_in_ah_sa->ipsa_input_func(
			    first_mp, ah);
		}

		switch (ipsec_rc) {
		case IPSEC_STATUS_SUCCESS:
			break;
		case IPSEC_STATUS_FAILED:
			BUMP_MIB(&ip_mib, ipInDiscards);
			/* FALLTHRU */
		case IPSEC_STATUS_PENDING:
			return;
		}
		/* we're done with IPsec processing, send it up */
		ip_fanout_proto_again(first_mp, ill, recv_ill, ire);
		return;
	}
	default:
		break;
	}
	if (is_system_labeled() && !tsol_can_accept_raw(mp, B_FALSE)) {
		ip1dbg(("ip_proto_input: zone %d cannot accept raw IP",
		    ire->ire_zoneid));
		goto drop_pkt;
	}
	/*
	 * Handle protocols with which IP is less intimate. There
	 * can be more than one stream bound to a particular
	 * protocol. When this is the case, each one gets a copy
	 * of any incoming packets.
 */
	/*
	 * Tail of ip_proto_input(): hand the packet to every stream bound
	 * to this protocol number; each bound stream receives its own copy.
	 */
	ip_fanout_proto(q, first_mp, ill, ipha,
	    IP_FF_SEND_ICMP | IP_FF_CKSUM | IP_FF_RAWIP, mctl_present,
	    B_TRUE, recv_ill, ire->ire_zoneid);
	TRACE_2(TR_FAC_IP, TR_IP_RPUT_LOCL_END,
	    "ip_rput_locl_end: q %p (%S)", q, "ip_fanout_proto");
	return;

drop_pkt:
	freemsg(first_mp);
	if (hada_mp != NULL)
		freeb(hada_mp);	/* hardware-accel. data-attribute mblk */
	TRACE_2(TR_FAC_IP, TR_IP_RPUT_LOCL_END,
	    "ip_rput_locl_end: q %p (%S)", q, "droppkt");
#undef	rptr
#undef	iphs

}

/*
 * Update any source route, record route or timestamp options.
 * Check that we are at end of strict source route.
 * The options have already been checked for sanity in ip_rput_options().
 *
 * Returns B_TRUE if the packet may be delivered locally; B_FALSE if a
 * strict-source-route violation was detected, in which case an
 * ICMP_SOURCE_ROUTE_FAILED unreachable is generated via bad_src_route.
 * Note this mutates the option bytes of the packet header in place.
 */
static boolean_t
ip_rput_local_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire)
{
	ipoptp_t	opts;
	uchar_t		*opt;
	uint8_t		optval;
	uint8_t		optlen;
	ipaddr_t	dst;
	uint32_t	ts;
	ire_t		*dst_ire;
	timestruc_t	now;

	ASSERT(ire->ire_ipversion == IPV4_VERSION);

	ip2dbg(("ip_rput_local_options\n"));

	for (optval = ipoptp_first(&opts, ipha);
	    optval != IPOPT_EOL;
	    optval = ipoptp_next(&opts)) {
		/* ip_rput_options() already validated; errors impossible */
		ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0);
		opt = opts.ipoptp_cur;
		optlen = opts.ipoptp_len;
		ip2dbg(("ip_rput_local_options: opt %d, len %d\n",
		    optval, optlen));
		switch (optval) {
			uint32_t off;
		case IPOPT_SSRR:
		case IPOPT_LSRR:
			off = opt[IPOPT_OFFSET];
			off--;	/* IPOPT_OFFSET is 1-based */
			if (optlen < IP_ADDR_LEN ||
			    off > optlen - IP_ADDR_LEN) {
				/* End of source route */
				ip1dbg(("ip_rput_local_options: end of SR\n"));
				break;
			}
			/*
			 * This will only happen if two consecutive entries
			 * in the source route contains our address or if
			 * it is a packet with a loose source route which
			 * reaches us before consuming the whole source route
			 */
			ip1dbg(("ip_rput_local_options: not end of SR\n"));
			if (optval == IPOPT_SSRR) {
				goto bad_src_route;
			}
			/*
			 * Hack: instead of dropping the packet truncate the
			 * source route to what has been used by filling the
			 * rest with IPOPT_NOP.
			 */
			opt[IPOPT_OLEN] = (uint8_t)off;
			while (off < optlen) {
				opt[off++] = IPOPT_NOP;
			}
			break;
		case IPOPT_RR:
			off = opt[IPOPT_OFFSET];
			off--;	/* IPOPT_OFFSET is 1-based */
			if (optlen < IP_ADDR_LEN ||
			    off > optlen - IP_ADDR_LEN) {
				/* No more room - ignore */
				ip1dbg((
				    "ip_rput_local_options: end of RR\n"));
				break;
			}
			/* Record our source address at the current slot */
			bcopy(&ire->ire_src_addr, (char *)opt + off,
			    IP_ADDR_LEN);
			opt[IPOPT_OFFSET] += IP_ADDR_LEN;
			break;
		case IPOPT_TS:
			/* Insert timestamp if there is room */
			switch (opt[IPOPT_POS_OV_FLG] & 0x0F) {
			case IPOPT_TS_TSONLY:
				off = IPOPT_TS_TIMELEN;
				break;
			case IPOPT_TS_PRESPEC:
			case IPOPT_TS_PRESPEC_RFC791:
				/* Verify that the address matched */
				off = opt[IPOPT_OFFSET] - 1;
				bcopy((char *)opt + off, &dst, IP_ADDR_LEN);
				dst_ire = ire_ctable_lookup(dst, 0, IRE_LOCAL,
				    NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE);
				if (dst_ire == NULL) {
					/* Not for us */
					break;
				}
				ire_refrele(dst_ire);
				/* FALLTHRU */
			case IPOPT_TS_TSANDADDR:
				off = IP_ADDR_LEN + IPOPT_TS_TIMELEN;
				break;
			default:
				/*
				 * ip_*put_options should have already
				 * dropped this packet.
 */
				cmn_err(CE_PANIC, "ip_rput_local_options: "
				    "unknown IT - bug in ip_rput_options?\n");
				return (B_TRUE);	/* Keep "lint" happy */
			}
			if (opt[IPOPT_OFFSET] - 1 + off > optlen) {
				/* Increase overflow counter (high nibble) */
				off = (opt[IPOPT_POS_OV_FLG] >> 4) + 1;
				opt[IPOPT_POS_OV_FLG] =
				    (uint8_t)((opt[IPOPT_POS_OV_FLG] & 0x0F) |
				    (off << 4));
				break;
			}
			off = opt[IPOPT_OFFSET] - 1;
			switch (opt[IPOPT_POS_OV_FLG] & 0x0F) {
			case IPOPT_TS_PRESPEC:
			case IPOPT_TS_PRESPEC_RFC791:
			case IPOPT_TS_TSANDADDR:
				/* Store our address before the timestamp */
				bcopy(&ire->ire_src_addr, (char *)opt + off,
				    IP_ADDR_LEN);
				opt[IPOPT_OFFSET] += IP_ADDR_LEN;
				/* FALLTHRU */
			case IPOPT_TS_TSONLY:
				off = opt[IPOPT_OFFSET] - 1;
				/* Compute # of milliseconds since midnight */
				gethrestime(&now);
				ts = (now.tv_sec % (24 * 60 * 60)) * 1000 +
				    now.tv_nsec / (NANOSEC / MILLISEC);
				bcopy(&ts, (char *)opt + off, IPOPT_TS_TIMELEN);
				opt[IPOPT_OFFSET] += IPOPT_TS_TIMELEN;
				break;
			}
			break;
		}
	}
	return (B_TRUE);

bad_src_route:
	q = WR(q);
	/* make sure we clear any indication of a hardware checksum */
	DB_CKSUMFLAGS(mp) = 0;
	icmp_unreachable(q, mp, ICMP_SOURCE_ROUTE_FAILED);
	return (B_FALSE);

}

/*
 * Process IP options in an inbound packet. If an option affects the
 * effective destination address, return the next hop address via dstp.
 * Returns -1 if something fails in which case an ICMP error has been sent
 * and mp freed.
 */
static int
ip_rput_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ipaddr_t *dstp)
{
	ipoptp_t	opts;
	uchar_t		*opt;
	uint8_t		optval;
	uint8_t		optlen;
	ipaddr_t	dst;
	intptr_t	code = 0;	/* byte offset of bad option for ICMP */
	ire_t		*ire = NULL;

	ip2dbg(("ip_rput_options\n"));
	dst = ipha->ipha_dst;
	for (optval = ipoptp_first(&opts, ipha);
	    optval != IPOPT_EOL;
	    optval = ipoptp_next(&opts)) {
		opt = opts.ipoptp_cur;
		optlen = opts.ipoptp_len;
		ip2dbg(("ip_rput_options: opt %d, len %d\n",
		    optval, optlen));
		/*
		 * Note: we need to verify the checksum before we
		 * modify anything thus this routine only extracts the next
		 * hop dst from any source route.
		 */
		switch (optval) {
			uint32_t off;
		case IPOPT_SSRR:
		case IPOPT_LSRR:
			/* Is the current dst one of our local addresses? */
			ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, NULL,
			    ALL_ZONES, NULL, MATCH_IRE_TYPE);
			if (ire == NULL) {
				if (optval == IPOPT_SSRR) {
					ip1dbg(("ip_rput_options: not next"
					    " strict source route 0x%x\n",
					    ntohl(dst)));
					code = (char *)&ipha->ipha_dst -
					    (char *)ipha;
					goto param_prob; /* RouterReq's */
				}
				ip2dbg(("ip_rput_options: "
				    "not next source route 0x%x\n",
				    ntohl(dst)));
				break;
			}
			ire_refrele(ire);

			if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
				ip1dbg((
				    "ip_rput_options: bad option offset\n"));
				code = (char *)&opt[IPOPT_OLEN] -
				    (char *)ipha;
				goto param_prob;
			}
			off = opt[IPOPT_OFFSET];
			off--;	/* IPOPT_OFFSET is 1-based */
		redo_srr:
			if (optlen < IP_ADDR_LEN ||
			    off > optlen - IP_ADDR_LEN) {
				/* End of source route */
				ip1dbg(("ip_rput_options: end of SR\n"));
				break;
			}
			bcopy((char *)opt + off, &dst, IP_ADDR_LEN);
			ip1dbg(("ip_rput_options: next hop 0x%x\n",
			    ntohl(dst)));

			/*
			 * Check if our address is present more than
			 * once as consecutive hops in source route.
			 * XXX verify per-interface ip_forwarding
			 * for source route?
			 */
			ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, NULL,
			    ALL_ZONES, NULL, MATCH_IRE_TYPE);

			if (ire != NULL) {
				/* Skip over our own address; try next hop */
				ire_refrele(ire);
				off += IP_ADDR_LEN;
				goto redo_srr;
			}

			if (dst == htonl(INADDR_LOOPBACK)) {
				ip1dbg(("ip_rput_options: loopback addr in "
				    "source route!\n"));
				goto bad_src_route;
			}
			/*
			 * For strict: verify that dst is directly
			 * reachable.
			 */
			if (optval == IPOPT_SSRR) {
				ire = ire_ftable_lookup(dst, 0, 0,
				    IRE_INTERFACE, NULL, NULL, ALL_ZONES, 0,
				    MBLK_GETLABEL(mp),
				    MATCH_IRE_TYPE | MATCH_IRE_SECATTR);
				if (ire == NULL) {
					ip1dbg(("ip_rput_options: SSRR not "
					    "directly reachable: 0x%x\n",
					    ntohl(dst)));
					goto bad_src_route;
				}
				ire_refrele(ire);
			}
			/*
			 * Defer update of the offset and the record route
			 * until the packet is forwarded.
			 */
			break;
		case IPOPT_RR:
			if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
				ip1dbg((
				    "ip_rput_options: bad option offset\n"));
				code = (char *)&opt[IPOPT_OLEN] -
				    (char *)ipha;
				goto param_prob;
			}
			break;
		case IPOPT_TS:
			/*
			 * Verify that length >= 5 and that there is either
			 * room for another timestamp or that the overflow
			 * counter is not maxed out.
			 */
			code = (char *)&opt[IPOPT_OLEN] - (char *)ipha;
			if (optlen < IPOPT_MINLEN_IT) {
				goto param_prob;
			}
			if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
				ip1dbg((
				    "ip_rput_options: bad option offset\n"));
				code = (char *)&opt[IPOPT_OFFSET] -
				    (char *)ipha;
				goto param_prob;
			}
			switch (opt[IPOPT_POS_OV_FLG] & 0x0F) {
			case IPOPT_TS_TSONLY:
				off = IPOPT_TS_TIMELEN;
				break;
			case IPOPT_TS_TSANDADDR:
			case IPOPT_TS_PRESPEC:
			case IPOPT_TS_PRESPEC_RFC791:
				off = IP_ADDR_LEN + IPOPT_TS_TIMELEN;
				break;
			default:
				code = (char *)&opt[IPOPT_POS_OV_FLG] -
				    (char *)ipha;
				goto param_prob;
			}
			if (opt[IPOPT_OFFSET] - 1 + off > optlen &&
			    (opt[IPOPT_POS_OV_FLG] & 0xF0) == 0xF0) {
				/*
				 * No room and the overflow counter is 15
				 * already.
				 */
				goto param_prob;
			}
			break;
		}
	}

	/* Success: hand back the (possibly source-routed) next hop */
	if ((opts.ipoptp_flags & IPOPTP_ERROR) == 0) {
		*dstp = dst;
		return (0);
	}

	ip1dbg(("ip_rput_options: error processing IP options."));
	code = (char *)&opt[IPOPT_OFFSET] - (char *)ipha;

param_prob:
	q = WR(q);
	/* make sure we clear any indication of a hardware checksum */
	DB_CKSUMFLAGS(mp) = 0;
	icmp_param_problem(q, mp, (uint8_t)code);
	return (-1);

bad_src_route:
	q = WR(q);
	/* make sure we clear any indication of a hardware checksum */
	DB_CKSUMFLAGS(mp) = 0;
	icmp_unreachable(q, mp, ICMP_SOURCE_ROUTE_FAILED);
	return (-1);
}

/*
 * IP & ICMP info in >=14 msg's ...
16684 * - ip fixed part (mib2_ip_t) 16685 * - icmp fixed part (mib2_icmp_t) 16686 * - ipAddrEntryTable (ip 20) all IPv4 ipifs 16687 * - ipRouteEntryTable (ip 21) all IPv4 IREs 16688 * - ipNetToMediaEntryTable (ip 22) IPv4 IREs for on-link destinations 16689 * - ipRouteAttributeTable (ip 102) labeled routes 16690 * - ip multicast membership (ip_member_t) 16691 * - ip multicast source filtering (ip_grpsrc_t) 16692 * - igmp fixed part (struct igmpstat) 16693 * - multicast routing stats (struct mrtstat) 16694 * - multicast routing vifs (array of struct vifctl) 16695 * - multicast routing routes (array of struct mfcctl) 16696 * - ip6 fixed part (mib2_ipv6IfStatsEntry_t) 16697 * One per ill plus one generic 16698 * - icmp6 fixed part (mib2_ipv6IfIcmpEntry_t) 16699 * One per ill plus one generic 16700 * - ipv6RouteEntry all IPv6 IREs 16701 * - ipv6RouteAttributeTable (ip6 102) labeled routes 16702 * - ipv6NetToMediaEntry all Neighbor Cache entries 16703 * - ipv6AddrEntry all IPv6 ipifs 16704 * - ipv6 multicast membership (ipv6_member_t) 16705 * - ipv6 multicast source filtering (ipv6_grpsrc_t) 16706 * 16707 * IP_ROUTE and IP_MEDIA are augmented in arp to include arp cache entries not 16708 * already present. 16709 * NOTE: original mpctl is copied for msg's 2..N, since its ctl part is 16710 * already filled in by the caller. 16711 * Return value of 0 indicates that no messages were sent and caller 16712 * should free mpctl. 
 */
/*
 * Each ip_snmp_get_mib2_* helper sends its table upstream with qreply()
 * and returns a fresh copy of the ctl message for the next helper; a
 * NULL return means the copy failed and the chain must stop.
 */
int
ip_snmp_get(queue_t *q, mblk_t *mpctl)
{

	if (mpctl == NULL || mpctl->b_cont == NULL) {
		return (0);
	}

	if ((mpctl = ip_snmp_get_mib2_ip(q, mpctl)) == NULL) {
		return (1);
	}

	if ((mpctl = ip_snmp_get_mib2_ip6(q, mpctl)) == NULL) {
		return (1);
	}

	if ((mpctl = ip_snmp_get_mib2_icmp(q, mpctl)) == NULL) {
		return (1);
	}

	if ((mpctl = ip_snmp_get_mib2_icmp6(q, mpctl)) == NULL) {
		return (1);
	}

	if ((mpctl = ip_snmp_get_mib2_igmp(q, mpctl)) == NULL) {
		return (1);
	}

	if ((mpctl = ip_snmp_get_mib2_multi(q, mpctl)) == NULL) {
		return (1);
	}

	if ((mpctl = ip_snmp_get_mib2_ip_addr(q, mpctl)) == NULL) {
		return (1);
	}

	if ((mpctl = ip_snmp_get_mib2_ip6_addr(q, mpctl)) == NULL) {
		return (1);
	}

	if ((mpctl = ip_snmp_get_mib2_ip_group_mem(q, mpctl)) == NULL) {
		return (1);
	}

	if ((mpctl = ip_snmp_get_mib2_ip6_group_mem(q, mpctl)) == NULL) {
		return (1);
	}

	if ((mpctl = ip_snmp_get_mib2_ip_group_src(q, mpctl)) == NULL) {
		return (1);
	}

	if ((mpctl = ip_snmp_get_mib2_ip6_group_src(q, mpctl)) == NULL) {
		return (1);
	}

	if ((mpctl = ip_snmp_get_mib2_virt_multi(q, mpctl)) == NULL) {
		return (1);
	}

	if ((mpctl = ip_snmp_get_mib2_multi_rtable(q, mpctl)) == NULL) {
		return (1);
	}

	if ((mpctl = ip_snmp_get_mib2_ip_route_media(q, mpctl)) == NULL) {
		return (1);
	}

	if ((mpctl = ip_snmp_get_mib2_ip6_route_media(q, mpctl)) == NULL) {
		return (1);
	}

	if ((mpctl = sctp_snmp_get_mib2(q, mpctl)) == NULL) {
		return (1);
	}
	/* Last copy is unused; messages were already sent upstream */
	freemsg(mpctl);
	return (1);
}


/*
 * Get global IPv4 statistics (MIB2_IP fixed part).  Sends the table
 * upstream with qreply() and returns a copy of the original ctl message
 * for the caller's continued use (NULL if copymsg() failed).
 */
static mblk_t *
ip_snmp_get_mib2_ip(queue_t *q, mblk_t *mpctl)
{
	struct opthdr		*optp;
	mblk_t			*mp2ctl;

	/*
	 * make a copy of the original message
	 */
	mp2ctl = copymsg(mpctl);

	/* fixed length IP structure... */
	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP;
	optp->name = 0;
	/* ipForwarding: 1 = acting as gateway, 2 = not forwarding */
	SET_MIB(ip_mib.ipForwarding,
	    (WE_ARE_FORWARDING ? 1 : 2));
	SET_MIB(ip_mib.ipDefaultTTL,
	    (uint32_t)ip_def_ttl);
	SET_MIB(ip_mib.ipReasmTimeout,
	    ip_g_frag_timeout);
	/* Advertise the entry sizes so consumers can walk the tables */
	SET_MIB(ip_mib.ipAddrEntrySize,
	    sizeof (mib2_ipAddrEntry_t));
	SET_MIB(ip_mib.ipRouteEntrySize,
	    sizeof (mib2_ipRouteEntry_t));
	SET_MIB(ip_mib.ipNetToMediaEntrySize,
	    sizeof (mib2_ipNetToMediaEntry_t));
	SET_MIB(ip_mib.ipMemberEntrySize, sizeof (ip_member_t));
	SET_MIB(ip_mib.ipGroupSourceEntrySize, sizeof (ip_grpsrc_t));
	SET_MIB(ip_mib.ipRouteAttributeSize, sizeof (mib2_ipAttributeEntry_t));
	SET_MIB(ip_mib.transportMLPSize, sizeof (mib2_transportMLPEntry_t));
	if (!snmp_append_data(mpctl->b_cont, (char *)&ip_mib,
	    (int)sizeof (ip_mib))) {
		ip1dbg(("ip_snmp_get_mib2_ip: failed to allocate %u bytes\n",
		    (uint_t)sizeof (ip_mib)));
	}

	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get_mib2_ip: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);
	return (mp2ctl);
}

/* Global IPv4 ICMP statistics */
static mblk_t *
ip_snmp_get_mib2_icmp(queue_t *q, mblk_t *mpctl)
{
	struct opthdr		*optp;
	mblk_t			*mp2ctl;

	/*
	 * Make a copy of the original message
	 */
	mp2ctl = copymsg(mpctl);

	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_ICMP;
	optp->name = 0;
	if (!snmp_append_data(mpctl->b_cont, (char *)&icmp_mib,
	    (int)sizeof (icmp_mib))) {
		ip1dbg(("ip_snmp_get_mib2_icmp: failed to allocate %u bytes\n",
		    (uint_t)sizeof (icmp_mib)));
	}
	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get_mib2_icmp: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);
	return (mp2ctl);
}

/* Global IPv4 IGMP statistics */
static mblk_t *
ip_snmp_get_mib2_igmp(queue_t *q, mblk_t *mpctl)
{
	struct opthdr		*optp;
	mblk_t			*mp2ctl;

	/*
	 * make a copy of the original message
	 */
	mp2ctl = copymsg(mpctl);

	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
	optp->level = EXPER_IGMP;
	optp->name = 0;
	if (!snmp_append_data(mpctl->b_cont, (char *)&igmpstat,
	    (int)sizeof (igmpstat))) {
		ip1dbg(("ip_snmp_get_mib2_igmp: failed to allocate %u bytes\n",
		    (uint_t)sizeof (igmpstat)));
	}
	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get_mib2_igmp: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);
	return (mp2ctl);
}

/* Global IPv4 Multicast Routing statistics */
static mblk_t *
ip_snmp_get_mib2_multi(queue_t *q, mblk_t *mpctl)
{
	struct opthdr		*optp;
	mblk_t			*mp2ctl;

	/*
	 * make a copy of the original message
	 */
	mp2ctl = copymsg(mpctl);

	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
	optp->level = EXPER_DVMRP;
	optp->name = 0;
	if (!ip_mroute_stats(mpctl->b_cont)) {
		ip0dbg(("ip_mroute_stats: failed\n"));
	}
	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get_mib2_multi: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);
	return (mp2ctl);
}

/*
 * IPv4 address information (ipAddrEntryTable): one mib2_ipAddrEntry_t per
 * ipif visible to the caller's zone.  Walks all V4 ills under ill_g_lock.
 */
static mblk_t *
ip_snmp_get_mib2_ip_addr(queue_t *q, mblk_t *mpctl)
{
	struct opthdr		*optp;
	mblk_t			*mp2ctl;
	mblk_t			*mp_tail = NULL;
	ill_t			*ill;
	ipif_t			*ipif;
	uint_t			bitval;
	mib2_ipAddrEntry_t	mae;
	zoneid_t		zoneid;
	ill_walk_context_t	ctx;

	/*
	 * make a copy of the original message
	 */
	mp2ctl = copymsg(mpctl);

	/* ipAddrEntryTable */

	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP;
	optp->name = MIB2_IP_ADDR;
	zoneid = Q_TO_CONN(q)->conn_zoneid;

	rw_enter(&ill_g_lock, RW_READER);
	ill = ILL_START_WALK_V4(&ctx);
	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
		for (ipif = ill->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			/* Only report ipifs visible to the caller's zone */
			if (ipif->ipif_zoneid != zoneid &&
			    ipif->ipif_zoneid != ALL_ZONES)
				continue;
			mae.ipAdEntInfo.ae_ibcnt = ipif->ipif_ib_pkt_count;
			mae.ipAdEntInfo.ae_obcnt = ipif->ipif_ob_pkt_count;
			mae.ipAdEntInfo.ae_focnt = ipif->ipif_fo_pkt_count;

			(void) ipif_get_name(ipif,
			    mae.ipAdEntIfIndex.o_bytes,
			    OCTET_LENGTH);
			mae.ipAdEntIfIndex.o_length =
			    mi_strlen(mae.ipAdEntIfIndex.o_bytes);
			mae.ipAdEntAddr = ipif->ipif_lcl_addr;
			mae.ipAdEntNetMask = ipif->ipif_net_mask;
			mae.ipAdEntInfo.ae_subnet = ipif->ipif_subnet;
			mae.ipAdEntInfo.ae_subnet_len =
			    ip_mask_to_plen(ipif->ipif_net_mask);
			mae.ipAdEntInfo.ae_src_addr = ipif->ipif_src_addr;
			/* Find lowest set bit of the broadcast address */
			for (bitval = 1;
			    bitval &&
			    !(bitval & ipif->ipif_brd_addr);
			    bitval <<= 1)
				noop;
			mae.ipAdEntBcastAddr = bitval;
			mae.ipAdEntReasmMaxSize = 65535;
			mae.ipAdEntInfo.ae_mtu = ipif->ipif_mtu;
			mae.ipAdEntInfo.ae_metric = ipif->ipif_metric;
			mae.ipAdEntInfo.ae_broadcast_addr =
			    ipif->ipif_brd_addr;
			mae.ipAdEntInfo.ae_pp_dst_addr =
			    ipif->ipif_pp_dst_addr;
			mae.ipAdEntInfo.ae_flags = ipif->ipif_flags |
			    ill->ill_flags | ill->ill_phyint->phyint_flags;

			if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
			    (char *)&mae, (int)sizeof (mib2_ipAddrEntry_t))) {
				ip1dbg(("ip_snmp_get_mib2_ip_addr: failed to "
				    "allocate %u bytes\n",
				    (uint_t)sizeof (mib2_ipAddrEntry_t)));
			}
		}
	}
	rw_exit(&ill_g_lock);

	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get_mib2_ip_addr: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);
	return (mp2ctl);
}

/*
 * IPv6 address information (ipv6AddrEntryTable): one entry per ipif
 * visible to the caller's zone, walking all V6 ills under ill_g_lock.
 */
static mblk_t *
ip_snmp_get_mib2_ip6_addr(queue_t *q, mblk_t *mpctl)
{
	struct opthdr		*optp;
	mblk_t			*mp2ctl;
	mblk_t			*mp_tail = NULL;
	ill_t			*ill;
	ipif_t			*ipif;
	mib2_ipv6AddrEntry_t	mae6;
	zoneid_t		zoneid;
	ill_walk_context_t	ctx;

	/*
	 * make a copy of the original message
	 */
	mp2ctl = copymsg(mpctl);

	/* ipv6AddrEntryTable */

	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP6;
	optp->name = MIB2_IP6_ADDR;
	zoneid = Q_TO_CONN(q)->conn_zoneid;

	rw_enter(&ill_g_lock, RW_READER);
	ill = ILL_START_WALK_V6(&ctx);
	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
		for (ipif = ill->ill_ipif; ipif; ipif = ipif->ipif_next) {
			/* Only report ipifs visible to the caller's zone */
			if (ipif->ipif_zoneid != zoneid &&
			    ipif->ipif_zoneid != ALL_ZONES)
				continue;
			mae6.ipv6AddrInfo.ae_ibcnt = ipif->ipif_ib_pkt_count;
			mae6.ipv6AddrInfo.ae_obcnt = ipif->ipif_ob_pkt_count;
			mae6.ipv6AddrInfo.ae_focnt = ipif->ipif_fo_pkt_count;

			(void) ipif_get_name(ipif,
			    mae6.ipv6AddrIfIndex.o_bytes,
			    OCTET_LENGTH);
			mae6.ipv6AddrIfIndex.o_length =
			    mi_strlen(mae6.ipv6AddrIfIndex.o_bytes);
			mae6.ipv6AddrAddress = ipif->ipif_v6lcl_addr;
			mae6.ipv6AddrPfxLength =
			    ip_mask_to_plen_v6(&ipif->ipif_v6net_mask);
			mae6.ipv6AddrInfo.ae_subnet = ipif->ipif_v6subnet;
			mae6.ipv6AddrInfo.ae_subnet_len =
			    mae6.ipv6AddrPfxLength;
			mae6.ipv6AddrInfo.ae_src_addr = ipif->ipif_v6src_addr;

			/* Type: stateless(1), stateful(2), unknown(3) */
			if (ipif->ipif_flags & IPIF_ADDRCONF)
				mae6.ipv6AddrType = 1;
			else
				mae6.ipv6AddrType = 2;
			/* Anycast: true(1), false(2) */
			if (ipif->ipif_flags & IPIF_ANYCAST)
				mae6.ipv6AddrAnycastFlag = 1;
			else
				mae6.ipv6AddrAnycastFlag = 2;

			/*
			 * Address status: preferred(1), deprecated(2),
			 * invalid(3), inaccessible(4), unknown(5)
			 */
			if (ipif->ipif_flags & IPIF_NOLOCAL)
				mae6.ipv6AddrStatus = 3;
			else if (ipif->ipif_flags & IPIF_DEPRECATED)
				mae6.ipv6AddrStatus = 2;
			else
				mae6.ipv6AddrStatus = 1;
			mae6.ipv6AddrInfo.ae_mtu = ipif->ipif_mtu;
			mae6.ipv6AddrInfo.ae_metric = ipif->ipif_metric;
			mae6.ipv6AddrInfo.ae_pp_dst_addr =
			    ipif->ipif_v6pp_dst_addr;
			mae6.ipv6AddrInfo.ae_flags = ipif->ipif_flags |
			    ill->ill_flags | ill->ill_phyint->phyint_flags;
			if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
			    (char *)&mae6,
			    (int)sizeof (mib2_ipv6AddrEntry_t))) {
				ip1dbg(("ip_snmp_get_mib2_ip6_addr: failed to "
				    "allocate %u bytes\n",
				    (uint_t)sizeof (mib2_ipv6AddrEntry_t)));
			}
		}
	}
	rw_exit(&ill_g_lock);

	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get_mib2_ip6_addr: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);
	return (mp2ctl);
}

/*
 * IPv4 multicast group membership.
 */
static mblk_t *
ip_snmp_get_mib2_ip_group_mem(queue_t *q, mblk_t *mpctl)
{
	struct opthdr		*optp;
	mblk_t			*mp2ctl;
	ill_t			*ill;
	ipif_t			*ipif;
	ilm_t			*ilm;
	ip_member_t		ipm;
	mblk_t			*mp_tail = NULL;
	ill_walk_context_t	ctx;
	zoneid_t		zoneid;

	/*
	 * make a copy of the original message
	 */
	mp2ctl = copymsg(mpctl);
	zoneid = Q_TO_CONN(q)->conn_zoneid;

	/* ipGroupMember table */
	optp = (struct opthdr *)&mpctl->b_rptr[
	    sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP;
	optp->name = EXPER_IP_GROUP_MEMBERSHIP;

	rw_enter(&ill_g_lock, RW_READER);
	ill = ILL_START_WALK_V4(&ctx);
	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
		/* Keep the ilm list stable while we walk it */
		ILM_WALKER_HOLD(ill);
		for (ipif = ill->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			if (ipif->ipif_zoneid != zoneid &&
			    ipif->ipif_zoneid != ALL_ZONES)
				continue;	/* not this zone */
			(void) ipif_get_name(ipif,
			    ipm.ipGroupMemberIfIndex.o_bytes,
			    OCTET_LENGTH);
			ipm.ipGroupMemberIfIndex.o_length =
			    mi_strlen(ipm.ipGroupMemberIfIndex.o_bytes);
			for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
				/* V4 memberships hang off ipifs, not ills */
				ASSERT(ilm->ilm_ipif != NULL);
				ASSERT(ilm->ilm_ill == NULL);
				if (ilm->ilm_ipif != ipif)
					continue;
				ipm.ipGroupMemberAddress = ilm->ilm_addr;
				ipm.ipGroupMemberRefCnt = ilm->ilm_refcnt;
				ipm.ipGroupMemberFilterMode = ilm->ilm_fmode;
				if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
				    (char *)&ipm, (int)sizeof (ipm))) {
					ip1dbg(("ip_snmp_get_mib2_ip_group: "
					    "failed to allocate %u bytes\n",
					    (uint_t)sizeof (ipm)));
				}
			}
		}
		ILM_WALKER_RELE(ill);
	}
	rw_exit(&ill_g_lock);
	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);
	return (mp2ctl);
}

/* IPv6 multicast group membership. */
static mblk_t *
ip_snmp_get_mib2_ip6_group_mem(queue_t *q, mblk_t *mpctl)
{
	struct opthdr		*optp;
	mblk_t			*mp2ctl;
	ill_t			*ill;
	ilm_t			*ilm;
	ipv6_member_t		ipm6;
	mblk_t			*mp_tail = NULL;
	ill_walk_context_t	ctx;
	zoneid_t		zoneid;

	/*
	 * make a copy of the original message
	 */
	mp2ctl = copymsg(mpctl);
	zoneid = Q_TO_CONN(q)->conn_zoneid;

	/* ip6GroupMember table */
	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP6;
	optp->name = EXPER_IP6_GROUP_MEMBERSHIP;

	rw_enter(&ill_g_lock, RW_READER);
	ill = ILL_START_WALK_V6(&ctx);
	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
		/* Keep the ilm list stable while we walk it */
		ILM_WALKER_HOLD(ill);
		ipm6.ipv6GroupMemberIfIndex = ill->ill_phyint->phyint_ifindex;
		for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
			/* V6 memberships hang off ills, not ipifs */
			ASSERT(ilm->ilm_ipif == NULL);
			ASSERT(ilm->ilm_ill != NULL);
			if (ilm->ilm_zoneid != zoneid)
				continue;	/* not this zone */
			ipm6.ipv6GroupMemberAddress = ilm->ilm_v6addr;
			ipm6.ipv6GroupMemberRefCnt = ilm->ilm_refcnt;
			ipm6.ipv6GroupMemberFilterMode = ilm->ilm_fmode;
			if (!snmp_append_data2(mpctl->b_cont,
			    &mp_tail,
			    (char *)&ipm6, (int)sizeof (ipm6))) {
				ip1dbg(("ip_snmp_get_mib2_ip6_group: "
				    "failed to allocate %u bytes\n",
				    (uint_t)sizeof (ipm6)));
			}
		}
		ILM_WALKER_RELE(ill);
	}
	rw_exit(&ill_g_lock);

	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);
	return (mp2ctl);
}

/* IP multicast filtered sources */
static mblk_t *
ip_snmp_get_mib2_ip_group_src(queue_t *q, mblk_t *mpctl)
{
	struct opthdr		*optp;
	mblk_t			*mp2ctl;
	ill_t			*ill;
	ipif_t			*ipif;
	ilm_t			*ilm;
	ip_grpsrc_t		ips;
	mblk_t			*mp_tail = NULL;
	ill_walk_context_t	ctx;
	zoneid_t		zoneid;
	int			i;
	slist_t			*sl;

	/*
	 * make a copy of the original message
	 */
	mp2ctl = copymsg(mpctl);
	zoneid = Q_TO_CONN(q)->conn_zoneid;

	/* ipGroupSource table */
	optp = (struct opthdr *)&mpctl->b_rptr[
	    sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP;
	optp->name = EXPER_IP_GROUP_SOURCES;

	rw_enter(&ill_g_lock, RW_READER);
	ill = ILL_START_WALK_V4(&ctx);
	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
		/* Keep the ilm list stable while we walk it */
		ILM_WALKER_HOLD(ill);
		for (ipif = ill->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			if (ipif->ipif_zoneid != zoneid)
				continue;	/* not this zone */
			(void) ipif_get_name(ipif,
			    ips.ipGroupSourceIfIndex.o_bytes,
			    OCTET_LENGTH);
			ips.ipGroupSourceIfIndex.o_length =
			    mi_strlen(ips.ipGroupSourceIfIndex.o_bytes);
			for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
				ASSERT(ilm->ilm_ipif != NULL);
				ASSERT(ilm->ilm_ill == NULL);
				sl = ilm->ilm_filter;
				/* Only memberships with a source filter */
				if (ilm->ilm_ipif != ipif || SLIST_IS_EMPTY(sl))
					continue;
				ips.ipGroupSourceGroup = ilm->ilm_addr;
				for (i = 0; i < sl->sl_numsrc; i++) {
					/* Report only V4-mapped sources */
					if (!IN6_IS_ADDR_V4MAPPED(
					    &sl->sl_addr[i]))
						continue;
					IN6_V4MAPPED_TO_IPADDR(&sl->sl_addr[i],
					    ips.ipGroupSourceAddress);
					if (snmp_append_data2(mpctl->b_cont,
					    &mp_tail, (char *)&ips,
					    (int)sizeof (ips)) == 0) {
						ip1dbg(("ip_snmp_get_mib2_"
						    "ip_group_src: failed to "
						    "allocate %u bytes\n",
						    (uint_t)sizeof (ips)));
					}
				}
			}
		}
		ILM_WALKER_RELE(ill);
	}
	rw_exit(&ill_g_lock);
	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);
	return (mp2ctl);
}

/* IPv6 multicast filtered sources. */
static mblk_t *
ip_snmp_get_mib2_ip6_group_src(queue_t *q, mblk_t *mpctl)
{
	struct opthdr		*optp;
	mblk_t			*mp2ctl;
	ill_t			*ill;
	ilm_t			*ilm;
	ipv6_grpsrc_t		ips6;
	mblk_t			*mp_tail = NULL;
	ill_walk_context_t	ctx;
	zoneid_t		zoneid;
	int			i;
	slist_t			*sl;

	/*
	 * make a copy of the original message
	 */
	mp2ctl = copymsg(mpctl);
	zoneid = Q_TO_CONN(q)->conn_zoneid;

	/* ip6GroupMember table */
	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP6;
	optp->name = EXPER_IP6_GROUP_SOURCES;

	rw_enter(&ill_g_lock, RW_READER);
	ill = ILL_START_WALK_V6(&ctx);
	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
		/* Keep the ilm list stable while we walk it */
		ILM_WALKER_HOLD(ill);
		ips6.ipv6GroupSourceIfIndex = ill->ill_phyint->phyint_ifindex;
		for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
			ASSERT(ilm->ilm_ipif == NULL);
			ASSERT(ilm->ilm_ill != NULL);
			sl = ilm->ilm_filter;
			/* Only memberships with a source filter */
			if (ilm->ilm_zoneid != zoneid || SLIST_IS_EMPTY(sl))
				continue;
			ips6.ipv6GroupSourceGroup = ilm->ilm_v6addr;
			for (i = 0; i < sl->sl_numsrc; i++) {
				ips6.ipv6GroupSourceAddress = sl->sl_addr[i];
				if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
				    (char *)&ips6, (int)sizeof (ips6))) {
					ip1dbg(("ip_snmp_get_mib2_ip6_"
					    "group_src: failed to allocate "
					    "%u bytes\n",
					    (uint_t)sizeof (ips6)));
				}
			}
		}
		ILM_WALKER_RELE(ill);
	}
	rw_exit(&ill_g_lock);

	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);
	return (mp2ctl);
}

/*
 * Multicast routing virtual interface table.
 */
static mblk_t *
ip_snmp_get_mib2_virt_multi(queue_t *q, mblk_t *mpctl)
{
	struct opthdr		*optp;
	mblk_t			*mp2ctl;

	/*
	 * make a copy of the original message
	 */
	mp2ctl = copymsg(mpctl);

	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
	optp->level = EXPER_DVMRP;
	optp->name = EXPER_DVMRP_VIF;
	/* ip_mroute_vif() appends the vif table to the data mblk */
	if (!ip_mroute_vif(mpctl->b_cont)) {
		ip0dbg(("ip_mroute_vif: failed\n"));
	}
	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get_mib2_virt_multi: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);
	return (mp2ctl);
}

/* Multicast routing table. */
static mblk_t *
ip_snmp_get_mib2_multi_rtable(queue_t *q, mblk_t *mpctl)
{
	struct opthdr		*optp;
	mblk_t			*mp2ctl;

	/*
	 * make a copy of the original message
	 */
	mp2ctl = copymsg(mpctl);

	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
	optp->level = EXPER_DVMRP;
	optp->name = EXPER_DVMRP_MRT;
	/* ip_mroute_mrt() appends the multicast routes to the data mblk */
	if (!ip_mroute_mrt(mpctl->b_cont)) {
		ip0dbg(("ip_mroute_mrt: failed\n"));
	}
	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get_mib2_multi_rtable: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);
	return (mp2ctl);
}

/*
 * Return ipRouteEntryTable, ipNetToMediaEntryTable, and ipRouteAttributeTable
 * in one IRE walk.
 */
static mblk_t *
ip_snmp_get_mib2_ip_route_media(queue_t *q, mblk_t *mpctl)
{
	struct opthdr		*optp;
	mblk_t			*mp2ctl;	/* Returned */
	mblk_t			*mp3ctl;	/* nettomedia */
	mblk_t			*mp4ctl;	/* routeattrs */
	iproutedata_t		ird;
	zoneid_t		zoneid;

	/*
	 * make copies of the original message
	 *	- mp2ctl is returned unchanged to the caller for his use
	 *	- mpctl is sent upstream as ipRouteEntryTable
	 *	- mp3ctl is sent upstream as ipNetToMediaEntryTable
	 *	- mp4ctl is sent upstream as ipRouteAttributeTable
	 */
	mp2ctl = copymsg(mpctl);
	mp3ctl = copymsg(mpctl);
	mp4ctl = copymsg(mpctl);
	if (mp3ctl == NULL || mp4ctl == NULL) {
		/* freemsg() is a no-op on NULL, so this is safe */
		freemsg(mp4ctl);
		freemsg(mp3ctl);
		freemsg(mp2ctl);
		freemsg(mpctl);
		return (NULL);
	}

	bzero(&ird, sizeof (ird));

	/* The IRE walk callback appends to these three lists */
	ird.ird_route.lp_head = mpctl->b_cont;
	ird.ird_netmedia.lp_head = mp3ctl->b_cont;
	ird.ird_attrs.lp_head = mp4ctl->b_cont;

	zoneid = Q_TO_CONN(q)->conn_zoneid;
	ire_walk_v4(ip_snmp_get2_v4, &ird, zoneid);
	if (zoneid == GLOBAL_ZONEID) {
		/*
		 * Those IREs are used by Mobile-IP; since mipagent(1M)
		 * requires the sys_net_config privilege, it can only run in
		 * the global zone, so we don't display these IREs in the
		 * other zones.
		 */
		ire_walk_srcif_table_v4(ip_snmp_get2_v4, &ird);
		ire_walk_ill_mrtun(0, 0, ip_snmp_get2_v4, &ird, NULL);
	}

	/* ipRouteEntryTable in mpctl */
	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP;
	optp->name = MIB2_IP_ROUTE;
	optp->len = msgdsize(ird.ird_route.lp_head);
	ip3dbg(("ip_snmp_get_mib2_ip_route_media: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);

	/* ipNetToMediaEntryTable in mp3ctl */
	optp = (struct opthdr *)&mp3ctl->b_rptr[sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP;
	optp->name = MIB2_IP_MEDIA;
	optp->len = msgdsize(ird.ird_netmedia.lp_head);
	ip3dbg(("ip_snmp_get_mib2_ip_route_media: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mp3ctl);

	/* ipRouteAttributeTable in mp4ctl */
	optp = (struct opthdr *)&mp4ctl->b_rptr[sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP;
	optp->name = EXPER_IP_RTATTR;
	optp->len = msgdsize(ird.ird_attrs.lp_head);
	ip3dbg(("ip_snmp_get_mib2_ip_route_media: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	/* Attribute table is optional: only send it if non-empty */
	if (optp->len == 0)
		freemsg(mp4ctl);
	else
		qreply(q, mp4ctl);

	return (mp2ctl);
}

/*
 * Return ipv6RouteEntryTable and ipv6RouteAttributeTable in one IRE walk, and
 * ipv6NetToMediaEntryTable in an NDP walk.
 */
static mblk_t *
ip_snmp_get_mib2_ip6_route_media(queue_t *q, mblk_t *mpctl)
{
	struct opthdr	*optp;
	mblk_t		*mp2ctl;	/* Returned */
	mblk_t		*mp3ctl;	/* nettomedia */
	mblk_t		*mp4ctl;	/* routeattrs */
	iproutedata_t	ird;
	zoneid_t	zoneid;

	/*
	 * make copies of the original message
	 *	- mp2ctl is returned unchanged to the caller for his use
	 *	- mpctl is sent upstream as ipv6RouteEntryTable
	 *	- mp3ctl is sent upstream as ipv6NetToMediaEntryTable
	 *	- mp4ctl is sent upstream as ipv6RouteAttributeTable
	 */
	mp2ctl = copymsg(mpctl);
	mp3ctl = copymsg(mpctl);
	mp4ctl = copymsg(mpctl);
	if (mp3ctl == NULL || mp4ctl == NULL) {
		/* freemsg() is a no-op on NULL, so partial failure is fine */
		freemsg(mp4ctl);
		freemsg(mp3ctl);
		freemsg(mp2ctl);
		freemsg(mpctl);
		return (NULL);
	}

	bzero(&ird, sizeof (ird));

	ird.ird_route.lp_head = mpctl->b_cont;
	ird.ird_netmedia.lp_head = mp3ctl->b_cont;
	ird.ird_attrs.lp_head = mp4ctl->b_cont;

	/* Routes and attributes come from a single v6 IRE walk. */
	zoneid = Q_TO_CONN(q)->conn_zoneid;
	ire_walk_v6(ip_snmp_get2_v6_route, &ird, zoneid);

	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP6;
	optp->name = MIB2_IP6_ROUTE;
	optp->len = msgdsize(ird.ird_route.lp_head);
	ip3dbg(("ip_snmp_get_mib2_ip6_route_media: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);

	/* ipv6NetToMediaEntryTable in mp3ctl (filled by an NDP walk) */
	ndp_walk(NULL, ip_snmp_get2_v6_media, &ird);

	optp = (struct opthdr *)&mp3ctl->b_rptr[sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP6;
	optp->name = MIB2_IP6_MEDIA;
	optp->len = msgdsize(ird.ird_netmedia.lp_head);
	ip3dbg(("ip_snmp_get_mib2_ip6_route_media: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mp3ctl);

	/* ipv6RouteAttributeTable in mp4ctl */
	optp = (struct opthdr *)&mp4ctl->b_rptr[sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP6;
	optp->name = EXPER_IP_RTATTR;
	optp->len = msgdsize(ird.ird_attrs.lp_head);
	ip3dbg(("ip_snmp_get_mib2_ip6_route_media: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	/* The attribute table is experimental; only send it if non-empty. */
	if (optp->len == 0)
		freemsg(mp4ctl);
	else
		qreply(q, mp4ctl);

	return (mp2ctl);
}

/*
 * IPv6 mib: One per ill
 * (NOTE(review): original header said "ICMPv6 mib" — this function reports
 * the MIB2_IP6 group; the ICMPv6 version is ip_snmp_get_mib2_icmp6 below.)
 */
static mblk_t *
ip_snmp_get_mib2_ip6(queue_t *q, mblk_t *mpctl)
{
	struct opthdr		*optp;
	mblk_t			*mp2ctl;
	ill_t			*ill;
	ill_walk_context_t	ctx;
	mblk_t			*mp_tail = NULL;

	/*
	 * Make a copy of the original message
	 */
	mp2ctl = copymsg(mpctl);

	/* fixed length IPv6 structure ... */

	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP6;
	optp->name = 0;
	/* Include "unknown interface" ip6_mib */
	ip6_mib.ipv6IfIndex = 0;	/* Flag to netstat */
	SET_MIB(ip6_mib.ipv6Forwarding, ipv6_forward ? 1 : 2);
	SET_MIB(ip6_mib.ipv6DefaultHopLimit, ipv6_def_hops);
	/*
	 * Entry sizes let a userland consumer built against a different
	 * mib2.h still step through the variable-length tables.
	 */
	SET_MIB(ip6_mib.ipv6IfStatsEntrySize,
	    sizeof (mib2_ipv6IfStatsEntry_t));
	SET_MIB(ip6_mib.ipv6AddrEntrySize, sizeof (mib2_ipv6AddrEntry_t));
	SET_MIB(ip6_mib.ipv6RouteEntrySize, sizeof (mib2_ipv6RouteEntry_t));
	SET_MIB(ip6_mib.ipv6NetToMediaEntrySize,
	    sizeof (mib2_ipv6NetToMediaEntry_t));
	SET_MIB(ip6_mib.ipv6MemberEntrySize, sizeof (ipv6_member_t));
	SET_MIB(ip6_mib.ipv6GroupSourceEntrySize, sizeof (ipv6_grpsrc_t));
	if (!snmp_append_data2(mpctl->b_cont, &mp_tail, (char *)&ip6_mib,
	    (int)sizeof (ip6_mib))) {
		ip1dbg(("ip_snmp_get_mib2_ip6: failed to allocate %u bytes\n",
		    (uint_t)sizeof (ip6_mib)));
	}

	/* One per-ill entry for every IPv6 ill, under the global ill lock. */
	rw_enter(&ill_g_lock, RW_READER);
	ill = ILL_START_WALK_V6(&ctx);
	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
		ill->ill_ip6_mib->ipv6IfIndex =
		    ill->ill_phyint->phyint_ifindex;
		SET_MIB(ill->ill_ip6_mib->ipv6Forwarding,
		    ipv6_forward ? 1 : 2);
		SET_MIB(ill->ill_ip6_mib->ipv6DefaultHopLimit,
		    ill->ill_max_hops);
		SET_MIB(ill->ill_ip6_mib->ipv6IfStatsEntrySize,
		    sizeof (mib2_ipv6IfStatsEntry_t));
		SET_MIB(ill->ill_ip6_mib->ipv6AddrEntrySize,
		    sizeof (mib2_ipv6AddrEntry_t));
		SET_MIB(ill->ill_ip6_mib->ipv6RouteEntrySize,
		    sizeof (mib2_ipv6RouteEntry_t));
		SET_MIB(ill->ill_ip6_mib->ipv6NetToMediaEntrySize,
		    sizeof (mib2_ipv6NetToMediaEntry_t));
		SET_MIB(ill->ill_ip6_mib->ipv6MemberEntrySize,
		    sizeof (ipv6_member_t));

		if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
		    (char *)ill->ill_ip6_mib,
		    (int)sizeof (*ill->ill_ip6_mib))) {
			ip1dbg(("ip_snmp_get_mib2_ip6: failed to allocate "
			"%u bytes\n",
			    (uint_t)sizeof (*ill->ill_ip6_mib)));
		}
	}
	rw_exit(&ill_g_lock);

	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get_mib2_ip6: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);
	return (mp2ctl);
}

/*
 * ICMPv6 mib: One per ill
 */
static mblk_t *
ip_snmp_get_mib2_icmp6(queue_t *q, mblk_t *mpctl)
{
	struct opthdr		*optp;
	mblk_t			*mp2ctl;
	ill_t			*ill;
	ill_walk_context_t	ctx;
	mblk_t			*mp_tail = NULL;
	/*
	 * Make a copy of the original message
	 */
	mp2ctl = copymsg(mpctl);

	/* fixed length ICMPv6 structure ... */

	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_ICMP6;
	optp->name = 0;
	/* Include "unknown interface" icmp6_mib */
	icmp6_mib.ipv6IfIcmpIfIndex = 0;	/* Flag to netstat */
	icmp6_mib.ipv6IfIcmpEntrySize = sizeof (mib2_ipv6IfIcmpEntry_t);
	if (!snmp_append_data2(mpctl->b_cont, &mp_tail, (char *)&icmp6_mib,
	    (int)sizeof (icmp6_mib))) {
		ip1dbg(("ip_snmp_get_mib2_icmp6: failed to allocate %u bytes\n",
		    (uint_t)sizeof (icmp6_mib)));
	}

	/* One per-ill entry for every IPv6 ill, under the global ill lock. */
	rw_enter(&ill_g_lock, RW_READER);
	ill = ILL_START_WALK_V6(&ctx);
	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
		ill->ill_icmp6_mib->ipv6IfIcmpIfIndex =
		    ill->ill_phyint->phyint_ifindex;
		ill->ill_icmp6_mib->ipv6IfIcmpEntrySize =
		    sizeof (mib2_ipv6IfIcmpEntry_t);
		if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
		    (char *)ill->ill_icmp6_mib,
		    (int)sizeof (*ill->ill_icmp6_mib))) {
			ip1dbg(("ip_snmp_get_mib2_icmp6: failed to allocate "
			    "%u bytes\n",
			    (uint_t)sizeof (*ill->ill_icmp6_mib)));
		}
	}
	rw_exit(&ill_g_lock);

	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get_mib2_icmp6: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);
	return (mp2ctl);
}

/*
 * ire_walk routine to create both ipRouteEntryTable and
 * ipNetToMediaEntryTable in one IRE walk
 */
static void
ip_snmp_get2_v4(ire_t *ire, iproutedata_t *ird)
{
	ill_t				*ill;
	ipif_t				*ipif;
	mblk_t				*llmp;
	dl_unitdata_req_t		*dlup;
	mib2_ipRouteEntry_t		*re;
	mib2_ipNetToMediaEntry_t	ntme;
	mib2_ipAttributeEntry_t		*iae, *iaeptr;
	ipaddr_t			gw_addr;
	tsol_ire_gw_secattr_t		*attrp;
	tsol_gc_t			*gc = NULL;
	tsol_gcgrp_t			*gcgrp = NULL;
	uint_t				sacnt = 0;
	int				i;

	ASSERT(ire->ire_ipversion == IPV4_VERSION);

	if ((re = kmem_zalloc(sizeof (*re), KM_NOSLEEP)) == NULL)
		return;

	/*
	 * Snapshot the Trusted Extensions gateway security attributes, if
	 * any.  On exit from this if-block either gc/gcgrp are both set
	 * (with gcgrp's rwlock held as reader) or both are NULL.
	 */
	if ((attrp = ire->ire_gw_secattr) != NULL) {
		mutex_enter(&attrp->igsa_lock);
		if ((gc = attrp->igsa_gc) != NULL) {
			gcgrp = gc->gc_grp;
			ASSERT(gcgrp != NULL);
			rw_enter(&gcgrp->gcgrp_rwlock, RW_READER);
			sacnt = 1;
		} else if ((gcgrp = attrp->igsa_gcgrp) != NULL) {
			rw_enter(&gcgrp->gcgrp_rwlock, RW_READER);
			gc = gcgrp->gcgrp_head;
			sacnt = gcgrp->gcgrp_count;
		}
		mutex_exit(&attrp->igsa_lock);

		/* do nothing if there's no gc to report */
		if (gc == NULL) {
			ASSERT(sacnt == 0);
			if (gcgrp != NULL) {
				/* we might as well drop the lock now */
				rw_exit(&gcgrp->gcgrp_rwlock);
				gcgrp = NULL;
			}
			attrp = NULL;
		}

		ASSERT(gc == NULL || (gcgrp != NULL &&
		    RW_LOCK_HELD(&gcgrp->gcgrp_rwlock)));
	}
	ASSERT(sacnt == 0 || gc != NULL);

	if (sacnt != 0 &&
	    (iae = kmem_alloc(sacnt * sizeof (*iae), KM_NOSLEEP)) == NULL) {
		kmem_free(re, sizeof (*re));
		rw_exit(&gcgrp->gcgrp_rwlock);
		return;
	}

	/*
	 * Return all IRE types for route table... let caller pick and choose
	 */
	re->ipRouteDest = ire->ire_addr;
	ipif = ire->ire_ipif;
	re->ipRouteIfIndex.o_length = 0;
	if (ire->ire_type == IRE_CACHE) {
		ill = (ill_t *)ire->ire_stq->q_ptr;
		/* ill_name is NUL-terminated; drop the terminator */
		re->ipRouteIfIndex.o_length =
		    ill->ill_name_length == 0 ? 0 :
		    MIN(OCTET_LENGTH, ill->ill_name_length - 1);
		bcopy(ill->ill_name, re->ipRouteIfIndex.o_bytes,
		    re->ipRouteIfIndex.o_length);
	} else if (ipif != NULL) {
		(void) ipif_get_name(ipif, re->ipRouteIfIndex.o_bytes,
		    OCTET_LENGTH);
		re->ipRouteIfIndex.o_length =
		    mi_strlen(re->ipRouteIfIndex.o_bytes);
	}
	re->ipRouteMetric1 = -1;
	re->ipRouteMetric2 = -1;
	re->ipRouteMetric3 = -1;
	re->ipRouteMetric4 = -1;

	gw_addr = ire->ire_gateway_addr;

	if (ire->ire_type & (IRE_INTERFACE|IRE_LOOPBACK|IRE_BROADCAST))
		re->ipRouteNextHop = ire->ire_src_addr;
	else
		re->ipRouteNextHop = gw_addr;
	/* indirect(4), direct(3), or invalid(2) */
	if (ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE))
		re->ipRouteType = 2;
	else
		re->ipRouteType = (gw_addr != 0) ? 4 : 3;
	re->ipRouteProto = -1;
	re->ipRouteAge = gethrestime_sec() - ire->ire_create_time;
	re->ipRouteMask = ire->ire_mask;
	re->ipRouteMetric5 = -1;
	re->ipRouteInfo.re_max_frag = ire->ire_max_frag;
	re->ipRouteInfo.re_frag_flag = ire->ire_frag_flag;
	re->ipRouteInfo.re_rtt = ire->ire_uinfo.iulp_rtt;
	/* llmp is reused below to build the net-to-media entry */
	llmp = ire->ire_dlureq_mp;
	re->ipRouteInfo.re_ref = ire->ire_refcnt;
	re->ipRouteInfo.re_src_addr = ire->ire_src_addr;
	re->ipRouteInfo.re_ire_type = ire->ire_type;
	re->ipRouteInfo.re_obpkt = ire->ire_ob_pkt_count;
	re->ipRouteInfo.re_ibpkt = ire->ire_ib_pkt_count;
	re->ipRouteInfo.re_flags = ire->ire_flags;
	re->ipRouteInfo.re_in_ill.o_length = 0;
	if (ire->ire_in_ill != NULL) {
		re->ipRouteInfo.re_in_ill.o_length =
		    ire->ire_in_ill->ill_name_length == 0 ? 0 :
		    MIN(OCTET_LENGTH, ire->ire_in_ill->ill_name_length - 1);
		bcopy(ire->ire_in_ill->ill_name,
		    re->ipRouteInfo.re_in_ill.o_bytes,
		    re->ipRouteInfo.re_in_ill.o_length);
	}
	re->ipRouteInfo.re_in_src_addr = ire->ire_in_src_addr;

	if (!snmp_append_data2(ird->ird_route.lp_head, &ird->ird_route.lp_tail,
	    (char *)re, (int)sizeof (*re))) {
		ip1dbg(("ip_snmp_get2_v4: failed to allocate %u bytes\n",
		    (uint_t)sizeof (*re)));
	}

	for (iaeptr = iae, i = 0; i < sacnt; i++, iaeptr++, gc = gc->gc_next) {
		iaeptr->iae_routeidx = ird->ird_idx;
		iaeptr->iae_doi = gc->gc_db->gcdb_doi;
		iaeptr->iae_slrange = gc->gc_db->gcdb_slrange;
	}

	/*
	 * NOTE(review): when sacnt == 0, iae was never assigned and is
	 * passed here uninitialized (with a zero length, so it is not
	 * dereferenced by the callee) — technically reads an indeterminate
	 * pointer value; worth initializing iae to NULL.
	 */
	if (!snmp_append_data2(ird->ird_attrs.lp_head, &ird->ird_attrs.lp_tail,
	    (char *)iae, sacnt * sizeof (*iae))) {
		ip1dbg(("ip_snmp_get2_v4: failed to allocate %u bytes\n",
		    (unsigned)(sacnt * sizeof (*iae))));
	}

	if (ire->ire_type != IRE_CACHE || gw_addr != 0)
		goto done;
	/*
	 * only IRE_CACHE entries that are for a directly connected subnet
	 * get appended to net -> phys addr table
	 * (others in arp)
	 */
	ntme.ipNetToMediaIfIndex.o_length = 0;
	ill = ire_to_ill(ire);
	ASSERT(ill != NULL);
	ntme.ipNetToMediaIfIndex.o_length =
	    ill->ill_name_length == 0 ? 0 :
	    MIN(OCTET_LENGTH, ill->ill_name_length - 1);
	bcopy(ill->ill_name, ntme.ipNetToMediaIfIndex.o_bytes,
	    ntme.ipNetToMediaIfIndex.o_length);

	ntme.ipNetToMediaPhysAddress.o_length = 0;
	if (llmp) {
		uchar_t	*addr;

		dlup = (dl_unitdata_req_t *)llmp->b_rptr;
		/* Remove sap from address */
		if (ill->ill_sap_length < 0)
			addr = llmp->b_rptr + dlup->dl_dest_addr_offset;
		else
			addr = llmp->b_rptr + dlup->dl_dest_addr_offset +
			    ill->ill_sap_length;

		ntme.ipNetToMediaPhysAddress.o_length =
		    MIN(OCTET_LENGTH, ill->ill_phys_addr_length);
		bcopy(addr, ntme.ipNetToMediaPhysAddress.o_bytes,
		    ntme.ipNetToMediaPhysAddress.o_length);
	}
	ntme.ipNetToMediaNetAddress = ire->ire_addr;
	/* assume dynamic (may be changed in arp) */
	ntme.ipNetToMediaType = 3;
	ntme.ipNetToMediaInfo.ntm_mask.o_length = sizeof (uint32_t);
	bcopy(&ire->ire_mask, ntme.ipNetToMediaInfo.ntm_mask.o_bytes,
	    ntme.ipNetToMediaInfo.ntm_mask.o_length);
	ntme.ipNetToMediaInfo.ntm_flags = ACE_F_RESOLVED;
	if (!snmp_append_data2(ird->ird_netmedia.lp_head,
	    &ird->ird_netmedia.lp_tail, (char *)&ntme, sizeof (ntme))) {
		ip1dbg(("ip_snmp_get2_v4: failed to allocate %u bytes\n",
		    (uint_t)sizeof (ntme)));
	}
done:
	/* bump route index for next pass */
	ird->ird_idx++;

	kmem_free(re, sizeof (*re));
	if (sacnt != 0)
		kmem_free(iae, sacnt * sizeof (*iae));

	if (gcgrp != NULL)
		rw_exit(&gcgrp->gcgrp_rwlock);
}

/*
 * ire_walk routine to create ipv6RouteEntryTable and ipRouteEntryTable.
 */
static void
ip_snmp_get2_v6_route(ire_t *ire, iproutedata_t *ird)
{
	ill_t				*ill;
	ipif_t				*ipif;
	mib2_ipv6RouteEntry_t		*re;
	mib2_ipAttributeEntry_t		*iae, *iaeptr;
	in6_addr_t			gw_addr_v6;
	tsol_ire_gw_secattr_t		*attrp;
	tsol_gc_t			*gc = NULL;
	tsol_gcgrp_t			*gcgrp = NULL;
	uint_t				sacnt = 0;
	int				i;

	ASSERT(ire->ire_ipversion == IPV6_VERSION);

	if ((re = kmem_zalloc(sizeof (*re), KM_NOSLEEP)) == NULL)
		return;

	/*
	 * Snapshot the Trusted Extensions gateway security attributes, if
	 * any.  On exit from this if-block either gc/gcgrp are both set
	 * (with gcgrp's rwlock held as reader) or both are NULL.
	 */
	if ((attrp = ire->ire_gw_secattr) != NULL) {
		mutex_enter(&attrp->igsa_lock);
		if ((gc = attrp->igsa_gc) != NULL) {
			gcgrp = gc->gc_grp;
			ASSERT(gcgrp != NULL);
			rw_enter(&gcgrp->gcgrp_rwlock, RW_READER);
			sacnt = 1;
		} else if ((gcgrp = attrp->igsa_gcgrp) != NULL) {
			rw_enter(&gcgrp->gcgrp_rwlock, RW_READER);
			gc = gcgrp->gcgrp_head;
			sacnt = gcgrp->gcgrp_count;
		}
		mutex_exit(&attrp->igsa_lock);

		/* do nothing if there's no gc to report */
		if (gc == NULL) {
			ASSERT(sacnt == 0);
			if (gcgrp != NULL) {
				/* we might as well drop the lock now */
				rw_exit(&gcgrp->gcgrp_rwlock);
				gcgrp = NULL;
			}
			attrp = NULL;
		}

		ASSERT(gc == NULL || (gcgrp != NULL &&
		    RW_LOCK_HELD(&gcgrp->gcgrp_rwlock)));
	}
	ASSERT(sacnt == 0 || gc != NULL);

	if (sacnt != 0 &&
	    (iae = kmem_alloc(sacnt * sizeof (*iae), KM_NOSLEEP)) == NULL) {
		kmem_free(re, sizeof (*re));
		rw_exit(&gcgrp->gcgrp_rwlock);
		return;
	}

	/*
	 * Return all IRE types for route table... let caller pick and choose
	 */
	re->ipv6RouteDest = ire->ire_addr_v6;
	re->ipv6RoutePfxLength = ip_mask_to_plen_v6(&ire->ire_mask_v6);
	re->ipv6RouteIndex = 0;	/* Unique when multiple with same dest/plen */
	re->ipv6RouteIfIndex.o_length = 0;
	ipif = ire->ire_ipif;
	if (ire->ire_type == IRE_CACHE) {
		ill = (ill_t *)ire->ire_stq->q_ptr;
		/* ill_name is NUL-terminated; drop the terminator */
		re->ipv6RouteIfIndex.o_length =
		    ill->ill_name_length == 0 ? 0 :
		    MIN(OCTET_LENGTH, ill->ill_name_length - 1);
		bcopy(ill->ill_name, re->ipv6RouteIfIndex.o_bytes,
		    re->ipv6RouteIfIndex.o_length);
	} else if (ipif != NULL) {
		(void) ipif_get_name(ipif, re->ipv6RouteIfIndex.o_bytes,
		    OCTET_LENGTH);
		re->ipv6RouteIfIndex.o_length =
		    mi_strlen(re->ipv6RouteIfIndex.o_bytes);
	}

	ASSERT(!(ire->ire_type & IRE_BROADCAST));

	/* ire_gateway_addr_v6 can change outside ire_lock; snapshot it */
	mutex_enter(&ire->ire_lock);
	gw_addr_v6 = ire->ire_gateway_addr_v6;
	mutex_exit(&ire->ire_lock);

	if (ire->ire_type & (IRE_INTERFACE|IRE_LOOPBACK))
		re->ipv6RouteNextHop = ire->ire_src_addr_v6;
	else
		re->ipv6RouteNextHop = gw_addr_v6;

	/* remote(4), local(3), or discard(2) */
	if (ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE))
		re->ipv6RouteType = 2;
	else if (IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6))
		re->ipv6RouteType = 3;
	else
		re->ipv6RouteType = 4;

	re->ipv6RouteProtocol = -1;
	re->ipv6RoutePolicy = 0;
	re->ipv6RouteAge = gethrestime_sec() - ire->ire_create_time;
	re->ipv6RouteNextHopRDI = 0;
	re->ipv6RouteWeight = 0;
	re->ipv6RouteMetric = 0;
	re->ipv6RouteInfo.re_max_frag = ire->ire_max_frag;
	re->ipv6RouteInfo.re_frag_flag = ire->ire_frag_flag;
	re->ipv6RouteInfo.re_rtt = ire->ire_uinfo.iulp_rtt;
	re->ipv6RouteInfo.re_src_addr = ire->ire_src_addr_v6;
	re->ipv6RouteInfo.re_ire_type = ire->ire_type;
	re->ipv6RouteInfo.re_obpkt = ire->ire_ob_pkt_count;
	re->ipv6RouteInfo.re_ibpkt = ire->ire_ib_pkt_count;
	re->ipv6RouteInfo.re_ref = ire->ire_refcnt;
	re->ipv6RouteInfo.re_flags = ire->ire_flags;

	if (!snmp_append_data2(ird->ird_route.lp_head, &ird->ird_route.lp_tail,
	    (char *)re, (int)sizeof (*re))) {
		ip1dbg(("ip_snmp_get2_v6: failed to allocate %u bytes\n",
		    (uint_t)sizeof (*re)));
	}

	for (iaeptr = iae, i = 0; i < sacnt; i++, iaeptr++, gc = gc->gc_next) {
		iaeptr->iae_routeidx = ird->ird_idx;
		iaeptr->iae_doi = gc->gc_db->gcdb_doi;
		iaeptr->iae_slrange = gc->gc_db->gcdb_slrange;
	}

	/*
	 * NOTE(review): as in ip_snmp_get2_v4, iae is uninitialized here
	 * when sacnt == 0 (zero length, so not dereferenced).
	 */
	if (!snmp_append_data2(ird->ird_attrs.lp_head, &ird->ird_attrs.lp_tail,
	    (char *)iae, sacnt * sizeof (*iae))) {
		ip1dbg(("ip_snmp_get2_v6: failed to allocate %u bytes\n",
		    (unsigned)(sacnt * sizeof (*iae))));
	}

	/* bump route index for next pass */
	ird->ird_idx++;

	kmem_free(re, sizeof (*re));
	if (sacnt != 0)
		kmem_free(iae, sacnt * sizeof (*iae));

	if (gcgrp != NULL)
		rw_exit(&gcgrp->gcgrp_rwlock);
}

/*
 * ndp_walk routine to create ipv6NetToMediaEntryTable
 */
static int
ip_snmp_get2_v6_media(nce_t *nce, iproutedata_t *ird)
{
	ill_t				*ill;
	mib2_ipv6NetToMediaEntry_t	ntme;
	dl_unitdata_req_t		*dl;

	ill = nce->nce_ill;
	ASSERT(ill->ill_isv6);

	/*
	 * Neighbor cache entry attached to IRE with on-link
	 * destination.
	 */
	ntme.ipv6NetToMediaIfIndex = ill->ill_phyint->phyint_ifindex;
	ntme.ipv6NetToMediaNetAddress = nce->nce_addr;
	if ((ill->ill_flags & ILLF_XRESOLV) &&
	    (nce->nce_res_mp != NULL)) {
		/* external resolver: take the length from the DLPI request */
		dl = (dl_unitdata_req_t *)(nce->nce_res_mp->b_rptr);
		ntme.ipv6NetToMediaPhysAddress.o_length =
		    dl->dl_dest_addr_length;
	} else {
		ntme.ipv6NetToMediaPhysAddress.o_length =
		    ill->ill_phys_addr_length;
	}
	if (nce->nce_res_mp != NULL) {
		bcopy((char *)nce->nce_res_mp->b_rptr +
		    NCE_LL_ADDR_OFFSET(ill),
		    ntme.ipv6NetToMediaPhysAddress.o_bytes,
		    ntme.ipv6NetToMediaPhysAddress.o_length);
	} else {
		/* unresolved entry: report an all-zero physical address */
		bzero(ntme.ipv6NetToMediaPhysAddress.o_bytes,
		    ill->ill_phys_addr_length);
	}
	/*
	 * Note: Returns ND_* states. Should be:
	 * reachable(1), stale(2), delay(3), probe(4),
	 * invalid(5), unknown(6)
	 */
	ntme.ipv6NetToMediaState = nce->nce_state;
	ntme.ipv6NetToMediaLastUpdated = 0;

	/* other(1), dynamic(2), static(3), local(4) */
	if (IN6_IS_ADDR_LOOPBACK(&nce->nce_addr)) {
		ntme.ipv6NetToMediaType = 4;
	} else if (IN6_IS_ADDR_MULTICAST(&nce->nce_addr)) {
		ntme.ipv6NetToMediaType = 1;
	} else {
		ntme.ipv6NetToMediaType = 2;
	}

	if (!snmp_append_data2(ird->ird_netmedia.lp_head,
	    &ird->ird_netmedia.lp_tail, (char *)&ntme, sizeof (ntme))) {
		ip1dbg(("ip_snmp_get2_v6_media: failed to allocate %u bytes\n",
		    (uint_t)sizeof (ntme)));
	}
	/* always continue the walk */
	return (0);
}

/*
 * return (0) if invalid set request, 1 otherwise, including non-tcp requests
 */
/* ARGSUSED */
int
ip_snmp_set(queue_t *q, int level, int name, uchar_t *ptr, int len)
{
	/*
	 * No IP/ICMP MIB variables are currently settable; every request
	 * is accepted (and ignored) so that snmpcom processing continues.
	 */
	switch (level) {
	case MIB2_IP:
	case MIB2_ICMP:
		switch (name) {
		default:
			break;
		}
		return (1);
	default:
		return (1);
	}
}

/*
 * Called before the options are updated to check if this packet will
 * be source routed from here.
 * This routine assumes that the options are well formed i.e. that they
 * have already been checked.
 *
 * Returns B_TRUE only if an SSRR/LSRR option is present, the current
 * destination is one of our local addresses, and there are hops left
 * in the source route.
 */
static boolean_t
ip_source_routed(ipha_t *ipha)
{
	ipoptp_t	opts;
	uchar_t		*opt;
	uint8_t		optval;
	uint8_t		optlen;
	ipaddr_t	dst;
	ire_t		*ire;

	if (IS_SIMPLE_IPH(ipha)) {
		/* No options, hence no source routing */
		ip2dbg(("not source routed\n"));
		return (B_FALSE);
	}
	dst = ipha->ipha_dst;
	for (optval = ipoptp_first(&opts, ipha);
	    optval != IPOPT_EOL;
	    optval = ipoptp_next(&opts)) {
		ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0);
		opt = opts.ipoptp_cur;
		optlen = opts.ipoptp_len;
		ip2dbg(("ip_source_routed: opt %d, len %d\n",
		    optval, optlen));
		switch (optval) {
		uint32_t off;
		case IPOPT_SSRR:
		case IPOPT_LSRR:
			/*
			 * If dst is one of our addresses and there are some
			 * entries left in the source route return (true).
			 */
			ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, NULL,
			    ALL_ZONES, NULL, MATCH_IRE_TYPE);
			if (ire == NULL) {
				/* dst is not ours; we only forward it */
				ip2dbg(("ip_source_routed: not next"
				    " source route 0x%x\n",
				    ntohl(dst)));
				return (B_FALSE);
			}
			ire_refrele(ire);
			/* off is 1-based in the option; make it 0-based */
			off = opt[IPOPT_OFFSET];
			off--;
			if (optlen < IP_ADDR_LEN ||
			    off > optlen - IP_ADDR_LEN) {
				/* End of source route */
				ip1dbg(("ip_source_routed: end of SR\n"));
				return (B_FALSE);
			}
			return (B_TRUE);
		}
	}
	ip2dbg(("not source routed\n"));
	return (B_FALSE);
}

/*
 * Check if the packet contains any source route.
 */
static boolean_t
ip_source_route_included(ipha_t *ipha)
{
	ipoptp_t	opts;
	uint8_t		optval;

	/* A minimal header cannot carry options, hence no source route */
	if (IS_SIMPLE_IPH(ipha))
		return (B_FALSE);
	for (optval = ipoptp_first(&opts, ipha);
	    optval != IPOPT_EOL;
	    optval = ipoptp_next(&opts)) {
		switch (optval) {
		case IPOPT_SSRR:
		case IPOPT_LSRR:
			return (B_TRUE);
		}
	}
	return (B_FALSE);
}

/*
 * Called when the IRE expiration timer fires.
 * Flushes stale IRE_CACHE/redirect entries and re-arms itself; the three
 * elapsed-time counters decide which categories are flushed on this tick.
 */
/* ARGSUSED */
void
ip_trash_timer_expire(void *args)
{
	int	flush_flag = 0;

	/*
	 * ip_ire_expire_id is protected by ip_trash_timer_lock.
	 * This lock makes sure that a new invocation of this function
	 * that occurs due to an almost immediate timer firing will not
	 * progress beyond this point until the current invocation is done
	 */
	mutex_enter(&ip_trash_timer_lock);
	ip_ire_expire_id = 0;
	mutex_exit(&ip_trash_timer_lock);

	/* Periodic timer */
	if (ip_ire_arp_time_elapsed >= ip_ire_arp_interval) {
		/*
		 * Remove all IRE_CACHE entries since they might
		 * contain arp information.
		 */
		flush_flag |= FLUSH_ARP_TIME;
		ip_ire_arp_time_elapsed = 0;
		IP_STAT(ip_ire_arp_timer_expired);
	}
	if (ip_ire_rd_time_elapsed >= ip_ire_redir_interval) {
		/* Remove all redirects */
		flush_flag |= FLUSH_REDIRECT_TIME;
		ip_ire_rd_time_elapsed = 0;
		IP_STAT(ip_ire_redirect_timer_expired);
	}
	if (ip_ire_pmtu_time_elapsed >= ip_ire_pathmtu_interval) {
		/* Increase path mtu */
		flush_flag |= FLUSH_MTU_TIME;
		ip_ire_pmtu_time_elapsed = 0;
		IP_STAT(ip_ire_pmtu_timer_expired);
	}
	if (flush_flag != 0) {
		/* Walk all IPv4 IRE's and update them */
		ire_walk_v4(ire_expire, (char *)(uintptr_t)flush_flag,
		    ALL_ZONES);
	}
	if (flush_flag & FLUSH_MTU_TIME) {
		/*
		 * Walk all IPv6 IRE's and update them
		 * Note that ARP and redirect timers are not
		 * needed since NUD handles stale entries.
		 */
		flush_flag = FLUSH_MTU_TIME;
		ire_walk_v6(ire_expire, (char *)(uintptr_t)flush_flag,
		    ALL_ZONES);
	}

	ip_ire_arp_time_elapsed += ip_timer_interval;
	ip_ire_rd_time_elapsed += ip_timer_interval;
	ip_ire_pmtu_time_elapsed += ip_timer_interval;

	/*
	 * Hold the lock to serialize timeout calls and prevent
	 * stale values in ip_ire_expire_id. Otherwise it is possible
	 * for the timer to fire and a new invocation of this function
	 * to start before the return value of timeout has been stored
	 * in ip_ire_expire_id by the current invocation.
	 */
	mutex_enter(&ip_trash_timer_lock);
	ip_ire_expire_id = timeout(ip_trash_timer_expire, NULL,
	    MSEC_TO_TICK(ip_timer_interval));
	mutex_exit(&ip_trash_timer_lock);
}

/*
 * Called by the memory allocator subsystem directly, when the system
 * is running low on memory.
 */
/* ARGSUSED */
void
ip_trash_ire_reclaim(void *args)
{
	ire_cache_count_t icc;
	ire_cache_reclaim_t icr;
	ncc_cache_count_t ncc;
	nce_cache_reclaim_t ncr;
	uint_t	delete_cnt;
	/*
	 * Memory reclaim call back.
	 * Count unused, offlink, pmtu, and onlink IRE_CACHE entries.
	 * Then, with a target of freeing 1/Nth of IRE_CACHE
	 * entries, determine what fraction to free for
	 * each category of IRE_CACHE entries giving absolute priority
	 * in the order of onlink, pmtu, offlink, unused (e.g. no pmtu
	 * entry will be freed unless all offlink entries are freed).
	 */
	icc.icc_total = 0;
	icc.icc_unused = 0;
	icc.icc_offlink = 0;
	icc.icc_pmtu = 0;
	icc.icc_onlink = 0;
	ire_walk(ire_cache_count, (char *)&icc);

	/*
	 * Free NCEs for IPv6 like the onlink ires.
	 */
	ncc.ncc_total = 0;
	ncc.ncc_host = 0;
	ndp_walk(NULL, (pfi_t)ndp_cache_count, (uchar_t *)&ncc);

	ASSERT(icc.icc_total == icc.icc_unused + icc.icc_offlink +
	    icc.icc_pmtu + icc.icc_onlink);
	delete_cnt = icc.icc_total/ip_ire_reclaim_fraction;
	IP_STAT(ip_trash_ire_reclaim_calls);
	if (delete_cnt == 0)
		return;
	IP_STAT(ip_trash_ire_reclaim_success);
	/* Always delete all unused offlink entries */
	icr.icr_unused = 1;
	if (delete_cnt <= icc.icc_unused) {
		/*
		 * Only need to free unused entries.  In other words,
		 * there are enough unused entries to free to meet our
		 * target number of freed ire cache entries.
		 */
		icr.icr_offlink = icr.icr_pmtu = icr.icr_onlink = 0;
		ncr.ncr_host = 0;
	} else if (delete_cnt <= icc.icc_unused + icc.icc_offlink) {
		/*
		 * Only need to free unused entries, plus a fraction of offlink
		 * entries.  It follows from the first if statement that
		 * icc_offlink is non-zero, and that delete_cnt != icc_unused.
		 *
		 * The icr_* fields are "1 in N" fractions: the walker frees
		 * every Nth entry of that category.
		 */
		delete_cnt -= icc.icc_unused;
		/* Round up # deleted by truncating fraction */
		icr.icr_offlink = icc.icc_offlink / delete_cnt;
		icr.icr_pmtu = icr.icr_onlink = 0;
		ncr.ncr_host = 0;
	} else if (delete_cnt <=
	    icc.icc_unused + icc.icc_offlink + icc.icc_pmtu) {
		/*
		 * Free all unused and offlink entries, plus a fraction of
		 * pmtu entries.  It follows from the previous if statement
		 * that icc_pmtu is non-zero, and that
		 * delete_cnt != icc_unused + icc_offlink.
		 */
		icr.icr_offlink = 1;
		delete_cnt -= icc.icc_unused + icc.icc_offlink;
		/* Round up # deleted by truncating fraction */
		icr.icr_pmtu = icc.icc_pmtu / delete_cnt;
		icr.icr_onlink = 0;
		ncr.ncr_host = 0;
	} else {
		/*
		 * Free all unused, offlink, and pmtu entries, plus a fraction
		 * of onlink entries.  If we're here, then we know that
		 * icc_onlink is non-zero, and that
		 * delete_cnt != icc_unused + icc_offlink + icc_pmtu.
		 */
		icr.icr_offlink = icr.icr_pmtu = 1;
		delete_cnt -= icc.icc_unused + icc.icc_offlink +
		    icc.icc_pmtu;
		/* Round up # deleted by truncating fraction */
		icr.icr_onlink = icc.icc_onlink / delete_cnt;
		/* Using the same delete fraction as for onlink IREs */
		ncr.ncr_host = ncc.ncc_host / delete_cnt;
	}
#ifdef DEBUG
	ip1dbg(("IP reclaim: target %d out of %d current %d/%d/%d/%d "
	    "fractions %d/%d/%d/%d\n",
	    icc.icc_total/ip_ire_reclaim_fraction, icc.icc_total,
	    icc.icc_unused, icc.icc_offlink,
	    icc.icc_pmtu, icc.icc_onlink,
	    icr.icr_unused, icr.icr_offlink,
	    icr.icr_pmtu, icr.icr_onlink));
#endif
	ire_walk(ire_cache_reclaim, (char *)&icr);
	if (ncr.ncr_host != 0)
		ndp_walk(NULL, (pfi_t)ndp_cache_reclaim,
		    (uchar_t *)&ncr);
#ifdef DEBUG
	/* Re-count to report how effective the reclaim pass was */
	icc.icc_total = 0; icc.icc_unused = 0; icc.icc_offlink = 0;
	icc.icc_pmtu = 0; icc.icc_onlink = 0;
	ire_walk(ire_cache_count, (char *)&icc);
	ip1dbg(("IP reclaim: result total %d %d/%d/%d/%d\n",
	    icc.icc_total, icc.icc_unused, icc.icc_offlink,
	    icc.icc_pmtu, icc.icc_onlink));
#endif
}

/*
 * ip_unbind is called when a copy of an unbind request is received from the
 * upper level protocol. We remove this conn from any fanout hash list it is
 * on, and zero out the bind information. No reply is expected up above.
 */
mblk_t *
ip_unbind(queue_t *q, mblk_t *mp)
{
	conn_t	*connp = Q_TO_CONN(q);

	/* Caller must not hold conn_lock across this call. */
	ASSERT(!MUTEX_HELD(&connp->conn_lock));

	/*
	 * On a labeled system, release any anonymous multilevel port (MLP)
	 * reservation held by this conn before the binding goes away.
	 */
	if (is_system_labeled() && connp->conn_anon_port) {
		(void) tsol_mlp_anon(crgetzone(connp->conn_cred),
		    connp->conn_mlp_type, connp->conn_ulp,
		    ntohs(connp->conn_lport), B_FALSE);
		connp->conn_anon_port = 0;
	}
	connp->conn_mlp_type = mlptSingle;

	/* Take the conn off whatever fanout hash list it is on. */
	ipcl_hash_remove(connp);

	ASSERT(mp->b_cont == NULL);
	/*
	 * Convert mp into a T_OK_ACK
	 */
	mp = mi_tpi_ok_ack_alloc(mp);

	/*
	 * Allocation failure should not happen in practice... T_OK_ACK is
	 * smaller than the original message.
	 */
	if (mp == NULL)
		return (NULL);

	/*
	 * Don't bzero the ports if it's TCP since TCP still needs the
	 * lport to remove it from its own bind hash.  TCP will do the
	 * cleanup.
	 */
	if (!IPCL_IS_TCP(connp))
		bzero(&connp->u_port, sizeof (connp->u_port));

	return (mp);
}

/*
 * Write side put procedure.  Outbound data, IOCTLs, responses from
 * resolvers, etc, come down through here.
 */
void
ip_output(void *arg, mblk_t *mp, void *arg2, int caller)
{
	conn_t		*connp = NULL;
	queue_t		*q = (queue_t *)arg2;
	ipha_t		*ipha;
#define	rptr	((uchar_t *)ipha)
	ire_t		*ire = NULL;
	ire_t		*sctp_ire = NULL;
	uint32_t	v_hlen_tos_len;
	ipaddr_t	dst;
	mblk_t		*first_mp = NULL;
	boolean_t	mctl_present;
	ipsec_out_t	*io;
	int		match_flags;
	ill_t		*attach_ill = NULL;
					/* Bind to IPIF_NOFAILOVER ill etc. */
	ill_t		*xmit_ill = NULL;	/* IP_XMIT_IF etc.
*/ 18457 ipif_t *dst_ipif; 18458 boolean_t multirt_need_resolve = B_FALSE; 18459 mblk_t *copy_mp = NULL; 18460 int err; 18461 zoneid_t zoneid; 18462 int adjust; 18463 uint16_t iplen; 18464 boolean_t need_decref = B_FALSE; 18465 boolean_t ignore_dontroute = B_FALSE; 18466 boolean_t ignore_nexthop = B_FALSE; 18467 boolean_t ip_nexthop = B_FALSE; 18468 ipaddr_t nexthop_addr; 18469 18470 #ifdef _BIG_ENDIAN 18471 #define V_HLEN (v_hlen_tos_len >> 24) 18472 #else 18473 #define V_HLEN (v_hlen_tos_len & 0xFF) 18474 #endif 18475 18476 TRACE_1(TR_FAC_IP, TR_IP_WPUT_START, 18477 "ip_wput_start: q %p", q); 18478 18479 /* 18480 * ip_wput fast path 18481 */ 18482 18483 /* is packet from ARP ? */ 18484 if (q->q_next != NULL) 18485 goto qnext; 18486 18487 connp = (conn_t *)arg; 18488 zoneid = (connp != NULL ? connp->conn_zoneid : ALL_ZONES); 18489 18490 /* is queue flow controlled? */ 18491 if ((q->q_first != NULL || connp->conn_draining) && 18492 (caller == IP_WPUT)) { 18493 ASSERT(!need_decref); 18494 (void) putq(q, mp); 18495 return; 18496 } 18497 18498 /* Multidata transmit? */ 18499 if (DB_TYPE(mp) == M_MULTIDATA) { 18500 /* 18501 * We should never get here, since all Multidata messages 18502 * originating from tcp should have been directed over to 18503 * tcp_multisend() in the first place. 
18504 */ 18505 BUMP_MIB(&ip_mib, ipOutDiscards); 18506 freemsg(mp); 18507 return; 18508 } else if (DB_TYPE(mp) != M_DATA) 18509 goto notdata; 18510 18511 if (mp->b_flag & MSGHASREF) { 18512 ASSERT(connp->conn_ulp == IPPROTO_SCTP); 18513 mp->b_flag &= ~MSGHASREF; 18514 SCTP_EXTRACT_IPINFO(mp, sctp_ire); 18515 need_decref = B_TRUE; 18516 } 18517 ipha = (ipha_t *)mp->b_rptr; 18518 18519 /* is IP header non-aligned or mblk smaller than basic IP header */ 18520 #ifndef SAFETY_BEFORE_SPEED 18521 if (!OK_32PTR(rptr) || 18522 (mp->b_wptr - rptr) < IP_SIMPLE_HDR_LENGTH) 18523 goto hdrtoosmall; 18524 #endif 18525 18526 ASSERT(OK_32PTR(ipha)); 18527 18528 /* 18529 * This function assumes that mp points to an IPv4 packet. If it's the 18530 * wrong version, we'll catch it again in ip_output_v6. 18531 * 18532 * Note that this is *only* locally-generated output here, and never 18533 * forwarded data, and that we need to deal only with transports that 18534 * don't know how to label. (TCP, UDP, and ICMP/raw-IP all know how to 18535 * label.) 18536 */ 18537 if (is_system_labeled() && 18538 (ipha->ipha_version_and_hdr_length & 0xf0) == (IPV4_VERSION << 4) && 18539 !connp->conn_ulp_labeled) { 18540 err = tsol_check_label(BEST_CRED(mp, connp), &mp, &adjust, 18541 connp->conn_mac_exempt); 18542 ipha = (ipha_t *)mp->b_rptr; 18543 if (err != 0) { 18544 first_mp = mp; 18545 if (err == EINVAL) 18546 goto icmp_parameter_problem; 18547 ip2dbg(("ip_wput: label check failed (%d)\n", err)); 18548 goto drop_pkt; 18549 } 18550 iplen = ntohs(ipha->ipha_length) + adjust; 18551 ipha->ipha_length = htons(iplen); 18552 } 18553 18554 /* 18555 * If there is a policy, try to attach an ipsec_out in 18556 * the front. At the end, first_mp either points to a 18557 * M_DATA message or IPSEC_OUT message linked to a 18558 * M_DATA message. We have to do it now as we might 18559 * lose the "conn" if we go through ip_newroute. 
18560 */ 18561 if (connp->conn_out_enforce_policy || (connp->conn_latch != NULL)) { 18562 if (((mp = ipsec_attach_ipsec_out(mp, connp, NULL, 18563 ipha->ipha_protocol)) == NULL)) { 18564 if (need_decref) 18565 CONN_DEC_REF(connp); 18566 return; 18567 } else { 18568 ASSERT(mp->b_datap->db_type == M_CTL); 18569 first_mp = mp; 18570 mp = mp->b_cont; 18571 mctl_present = B_TRUE; 18572 } 18573 } else { 18574 first_mp = mp; 18575 mctl_present = B_FALSE; 18576 } 18577 18578 v_hlen_tos_len = ((uint32_t *)ipha)[0]; 18579 18580 /* is wrong version or IP options present */ 18581 if (V_HLEN != IP_SIMPLE_HDR_VERSION) 18582 goto version_hdrlen_check; 18583 dst = ipha->ipha_dst; 18584 18585 if (connp->conn_nofailover_ill != NULL) { 18586 attach_ill = conn_get_held_ill(connp, 18587 &connp->conn_nofailover_ill, &err); 18588 if (err == ILL_LOOKUP_FAILED) { 18589 if (need_decref) 18590 CONN_DEC_REF(connp); 18591 freemsg(first_mp); 18592 return; 18593 } 18594 } 18595 18596 /* is packet multicast? */ 18597 if (CLASSD(dst)) 18598 goto multicast; 18599 18600 if ((connp->conn_dontroute) || (connp->conn_xmit_if_ill != NULL) || 18601 (connp->conn_nexthop_set)) { 18602 /* 18603 * If the destination is a broadcast or a loopback 18604 * address, SO_DONTROUTE, IP_XMIT_IF and IP_NEXTHOP go 18605 * through the standard path. But in the case of local 18606 * destination only SO_DONTROUTE and IP_NEXTHOP go through 18607 * the standard path not IP_XMIT_IF. 18608 */ 18609 ire = ire_cache_lookup(dst, zoneid, MBLK_GETLABEL(mp)); 18610 if ((ire == NULL) || ((ire->ire_type != IRE_BROADCAST) && 18611 (ire->ire_type != IRE_LOOPBACK))) { 18612 if ((connp->conn_dontroute || 18613 connp->conn_nexthop_set) && (ire != NULL) && 18614 (ire->ire_type == IRE_LOCAL)) 18615 goto standard_path; 18616 18617 if (ire != NULL) { 18618 ire_refrele(ire); 18619 /* No more access to ire */ 18620 ire = NULL; 18621 } 18622 /* 18623 * bypass routing checks and go directly to 18624 * interface. 
18625 */ 18626 if (connp->conn_dontroute) { 18627 goto dontroute; 18628 } else if (connp->conn_nexthop_set) { 18629 ip_nexthop = B_TRUE; 18630 nexthop_addr = connp->conn_nexthop_v4; 18631 goto send_from_ill; 18632 } 18633 18634 /* 18635 * If IP_XMIT_IF socket option is set, 18636 * then we allow unicast and multicast 18637 * packets to go through the ill. It is 18638 * quite possible that the destination 18639 * is not in the ire cache table and we 18640 * do not want to go to ip_newroute() 18641 * instead we call ip_newroute_ipif. 18642 */ 18643 xmit_ill = conn_get_held_ill(connp, 18644 &connp->conn_xmit_if_ill, &err); 18645 if (err == ILL_LOOKUP_FAILED) { 18646 if (attach_ill != NULL) 18647 ill_refrele(attach_ill); 18648 if (need_decref) 18649 CONN_DEC_REF(connp); 18650 freemsg(first_mp); 18651 return; 18652 } 18653 goto send_from_ill; 18654 } 18655 standard_path: 18656 /* Must be a broadcast, a loopback or a local ire */ 18657 if (ire != NULL) { 18658 ire_refrele(ire); 18659 /* No more access to ire */ 18660 ire = NULL; 18661 } 18662 } 18663 18664 if (attach_ill != NULL) 18665 goto send_from_ill; 18666 18667 /* 18668 * We cache IRE_CACHEs to avoid lookups. We don't do 18669 * this for the tcp global queue and listen end point 18670 * as it does not really have a real destination to 18671 * talk to. This is also true for SCTP. 18672 */ 18673 if (IP_FLOW_CONTROLLED_ULP(connp->conn_ulp) && 18674 !connp->conn_fully_bound) { 18675 ire = ire_cache_lookup(dst, zoneid, MBLK_GETLABEL(mp)); 18676 if (ire == NULL) 18677 goto noirefound; 18678 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 18679 "ip_wput_end: q %p (%S)", q, "end"); 18680 18681 /* 18682 * Check if the ire has the RTF_MULTIRT flag, inherited 18683 * from an IRE_OFFSUBNET ire entry in ip_newroute(). 18684 */ 18685 if (ire->ire_flags & RTF_MULTIRT) { 18686 18687 /* 18688 * Force the TTL of multirouted packets if required. 18689 * The TTL of such packets is bounded by the 18690 * ip_multirt_ttl ndd variable. 
18691 */ 18692 if ((ip_multirt_ttl > 0) && 18693 (ipha->ipha_ttl > ip_multirt_ttl)) { 18694 ip2dbg(("ip_wput: forcing multirt TTL to %d " 18695 "(was %d), dst 0x%08x\n", 18696 ip_multirt_ttl, ipha->ipha_ttl, 18697 ntohl(ire->ire_addr))); 18698 ipha->ipha_ttl = ip_multirt_ttl; 18699 } 18700 /* 18701 * We look at this point if there are pending 18702 * unresolved routes. ire_multirt_resolvable() 18703 * checks in O(n) that all IRE_OFFSUBNET ire 18704 * entries for the packet's destination and 18705 * flagged RTF_MULTIRT are currently resolved. 18706 * If some remain unresolved, we make a copy 18707 * of the current message. It will be used 18708 * to initiate additional route resolutions. 18709 */ 18710 multirt_need_resolve = 18711 ire_multirt_need_resolve(ire->ire_addr, 18712 MBLK_GETLABEL(first_mp)); 18713 ip2dbg(("ip_wput[TCP]: ire %p, " 18714 "multirt_need_resolve %d, first_mp %p\n", 18715 (void *)ire, multirt_need_resolve, 18716 (void *)first_mp)); 18717 if (multirt_need_resolve) { 18718 copy_mp = copymsg(first_mp); 18719 if (copy_mp != NULL) { 18720 MULTIRT_DEBUG_TAG(copy_mp); 18721 } 18722 } 18723 } 18724 18725 ip_wput_ire(q, first_mp, ire, connp, caller); 18726 18727 /* 18728 * Try to resolve another multiroute if 18729 * ire_multirt_need_resolve() deemed it necessary. 18730 */ 18731 if (copy_mp != NULL) { 18732 ip_newroute(q, copy_mp, dst, NULL, connp); 18733 } 18734 if (need_decref) 18735 CONN_DEC_REF(connp); 18736 return; 18737 } 18738 18739 /* 18740 * Access to conn_ire_cache. (protected by conn_lock) 18741 * 18742 * IRE_MARK_CONDEMNED is marked in ire_delete. We don't grab 18743 * the ire bucket lock here to check for CONDEMNED as it is okay to 18744 * send a packet or two with the IRE_CACHE that is going away. 18745 * Access to the ire requires an ire refhold on the ire prior to 18746 * its use since an interface unplumb thread may delete the cached 18747 * ire and release the refhold at any time. 
18748 * 18749 * Caching an ire in the conn_ire_cache 18750 * 18751 * o Caching an ire pointer in the conn requires a strict check for 18752 * IRE_MARK_CONDEMNED. An interface unplumb thread deletes all relevant 18753 * ires before cleaning up the conns. So the caching of an ire pointer 18754 * in the conn is done after making sure under the bucket lock that the 18755 * ire has not yet been marked CONDEMNED. Otherwise we will end up 18756 * caching an ire after the unplumb thread has cleaned up the conn. 18757 * If the conn does not send a packet subsequently the unplumb thread 18758 * will be hanging waiting for the ire count to drop to zero. 18759 * 18760 * o We also need to atomically test for a null conn_ire_cache and 18761 * set the conn_ire_cache under the the protection of the conn_lock 18762 * to avoid races among concurrent threads trying to simultaneously 18763 * cache an ire in the conn_ire_cache. 18764 */ 18765 mutex_enter(&connp->conn_lock); 18766 ire = sctp_ire != NULL ? sctp_ire : connp->conn_ire_cache; 18767 18768 if (ire != NULL && ire->ire_addr == dst && 18769 !(ire->ire_marks & IRE_MARK_CONDEMNED)) { 18770 18771 IRE_REFHOLD(ire); 18772 mutex_exit(&connp->conn_lock); 18773 18774 } else { 18775 boolean_t cached = B_FALSE; 18776 connp->conn_ire_cache = NULL; 18777 mutex_exit(&connp->conn_lock); 18778 /* Release the old ire */ 18779 if (ire != NULL && sctp_ire == NULL) 18780 IRE_REFRELE_NOTR(ire); 18781 18782 ire = ire_cache_lookup(dst, zoneid, MBLK_GETLABEL(mp)); 18783 if (ire == NULL) 18784 goto noirefound; 18785 IRE_REFHOLD_NOTR(ire); 18786 18787 mutex_enter(&connp->conn_lock); 18788 if (!(connp->conn_state_flags & CONN_CLOSING) && 18789 connp->conn_ire_cache == NULL) { 18790 rw_enter(&ire->ire_bucket->irb_lock, RW_READER); 18791 if (!(ire->ire_marks & IRE_MARK_CONDEMNED)) { 18792 connp->conn_ire_cache = ire; 18793 cached = B_TRUE; 18794 } 18795 rw_exit(&ire->ire_bucket->irb_lock); 18796 } 18797 mutex_exit(&connp->conn_lock); 18798 18799 /* 18800 * 
We can continue to use the ire but since it was 18801 * not cached, we should drop the extra reference. 18802 */ 18803 if (!cached) 18804 IRE_REFRELE_NOTR(ire); 18805 } 18806 18807 18808 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 18809 "ip_wput_end: q %p (%S)", q, "end"); 18810 18811 /* 18812 * Check if the ire has the RTF_MULTIRT flag, inherited 18813 * from an IRE_OFFSUBNET ire entry in ip_newroute(). 18814 */ 18815 if (ire->ire_flags & RTF_MULTIRT) { 18816 18817 /* 18818 * Force the TTL of multirouted packets if required. 18819 * The TTL of such packets is bounded by the 18820 * ip_multirt_ttl ndd variable. 18821 */ 18822 if ((ip_multirt_ttl > 0) && 18823 (ipha->ipha_ttl > ip_multirt_ttl)) { 18824 ip2dbg(("ip_wput: forcing multirt TTL to %d " 18825 "(was %d), dst 0x%08x\n", 18826 ip_multirt_ttl, ipha->ipha_ttl, 18827 ntohl(ire->ire_addr))); 18828 ipha->ipha_ttl = ip_multirt_ttl; 18829 } 18830 18831 /* 18832 * At this point, we check to see if there are any pending 18833 * unresolved routes. ire_multirt_resolvable() 18834 * checks in O(n) that all IRE_OFFSUBNET ire 18835 * entries for the packet's destination and 18836 * flagged RTF_MULTIRT are currently resolved. 18837 * If some remain unresolved, we make a copy 18838 * of the current message. It will be used 18839 * to initiate additional route resolutions. 
18840 */ 18841 multirt_need_resolve = ire_multirt_need_resolve(ire->ire_addr, 18842 MBLK_GETLABEL(first_mp)); 18843 ip2dbg(("ip_wput[not TCP]: ire %p, " 18844 "multirt_need_resolve %d, first_mp %p\n", 18845 (void *)ire, multirt_need_resolve, (void *)first_mp)); 18846 if (multirt_need_resolve) { 18847 copy_mp = copymsg(first_mp); 18848 if (copy_mp != NULL) { 18849 MULTIRT_DEBUG_TAG(copy_mp); 18850 } 18851 } 18852 } 18853 18854 ip_wput_ire(q, first_mp, ire, connp, caller); 18855 18856 /* 18857 * Try to resolve another multiroute if 18858 * ire_multirt_resolvable() deemed it necessary 18859 */ 18860 if (copy_mp != NULL) { 18861 ip_newroute(q, copy_mp, dst, NULL, connp); 18862 } 18863 if (need_decref) 18864 CONN_DEC_REF(connp); 18865 return; 18866 18867 qnext: 18868 /* 18869 * Upper Level Protocols pass down complete IP datagrams 18870 * as M_DATA messages. Everything else is a sideshow. 18871 * 18872 * 1) We could be re-entering ip_wput because of ip_neworute 18873 * in which case we could have a IPSEC_OUT message. We 18874 * need to pass through ip_wput like other datagrams and 18875 * hence cannot branch to ip_wput_nondata. 18876 * 18877 * 2) ARP, AH, ESP, and other clients who are on the module 18878 * instance of IP stream, give us something to deal with. 18879 * We will handle AH and ESP here and rest in ip_wput_nondata. 18880 * 18881 * 3) ICMP replies also could come here. 18882 */ 18883 if (DB_TYPE(mp) != M_DATA) { 18884 notdata: 18885 if (DB_TYPE(mp) == M_CTL) { 18886 /* 18887 * M_CTL messages are used by ARP, AH and ESP to 18888 * communicate with IP. We deal with IPSEC_IN and 18889 * IPSEC_OUT here. ip_wput_nondata handles other 18890 * cases. 
18891 */ 18892 ipsec_info_t *ii = (ipsec_info_t *)mp->b_rptr; 18893 if (mp->b_cont && (mp->b_cont->b_flag & MSGHASREF)) { 18894 first_mp = mp->b_cont; 18895 first_mp->b_flag &= ~MSGHASREF; 18896 ASSERT(connp->conn_ulp == IPPROTO_SCTP); 18897 SCTP_EXTRACT_IPINFO(first_mp, sctp_ire); 18898 CONN_DEC_REF(connp); 18899 connp = NULL; 18900 } 18901 if (ii->ipsec_info_type == IPSEC_IN) { 18902 /* 18903 * Either this message goes back to 18904 * IPSEC for further processing or to 18905 * ULP after policy checks. 18906 */ 18907 ip_fanout_proto_again(mp, NULL, NULL, NULL); 18908 return; 18909 } else if (ii->ipsec_info_type == IPSEC_OUT) { 18910 io = (ipsec_out_t *)ii; 18911 if (io->ipsec_out_proc_begin) { 18912 /* 18913 * IPSEC processing has already started. 18914 * Complete it. 18915 * IPQoS notes: We don't care what is 18916 * in ipsec_out_ill_index since this 18917 * won't be processed for IPQoS policies 18918 * in ipsec_out_process. 18919 */ 18920 ipsec_out_process(q, mp, NULL, 18921 io->ipsec_out_ill_index); 18922 return; 18923 } else { 18924 connp = (q->q_next != NULL) ? 18925 NULL : Q_TO_CONN(q); 18926 first_mp = mp; 18927 mp = mp->b_cont; 18928 mctl_present = B_TRUE; 18929 } 18930 zoneid = io->ipsec_out_zoneid; 18931 ASSERT(zoneid != ALL_ZONES); 18932 } else if (ii->ipsec_info_type == IPSEC_CTL) { 18933 /* 18934 * It's an IPsec control message requesting 18935 * an SADB update to be sent to the IPsec 18936 * hardware acceleration capable ills. 18937 */ 18938 ipsec_ctl_t *ipsec_ctl = 18939 (ipsec_ctl_t *)mp->b_rptr; 18940 ipsa_t *sa = (ipsa_t *)ipsec_ctl->ipsec_ctl_sa; 18941 uint_t satype = ipsec_ctl->ipsec_ctl_sa_type; 18942 mblk_t *cmp = mp->b_cont; 18943 18944 ASSERT(MBLKL(mp) >= sizeof (ipsec_ctl_t)); 18945 ASSERT(cmp != NULL); 18946 18947 freeb(mp); 18948 ill_ipsec_capab_send_all(satype, cmp, sa); 18949 return; 18950 } else { 18951 /* 18952 * This must be ARP or special TSOL signaling. 
18953 */ 18954 ip_wput_nondata(NULL, q, mp, NULL); 18955 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 18956 "ip_wput_end: q %p (%S)", q, "nondata"); 18957 return; 18958 } 18959 } else { 18960 /* 18961 * This must be non-(ARP/AH/ESP) messages. 18962 */ 18963 ASSERT(!need_decref); 18964 ip_wput_nondata(NULL, q, mp, NULL); 18965 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 18966 "ip_wput_end: q %p (%S)", q, "nondata"); 18967 return; 18968 } 18969 } else { 18970 first_mp = mp; 18971 mctl_present = B_FALSE; 18972 } 18973 18974 ASSERT(first_mp != NULL); 18975 /* 18976 * ICMP echo replies attach an ipsec_out and set ipsec_out_attach_if 18977 * to make sure that this packet goes out on the same interface it 18978 * came in. We handle that here. 18979 */ 18980 if (mctl_present) { 18981 uint_t ifindex; 18982 18983 io = (ipsec_out_t *)first_mp->b_rptr; 18984 if (io->ipsec_out_attach_if || 18985 io->ipsec_out_xmit_if || 18986 io->ipsec_out_ip_nexthop) { 18987 ill_t *ill; 18988 18989 /* 18990 * We may have lost the conn context if we are 18991 * coming here from ip_newroute(). Copy the 18992 * nexthop information. 18993 */ 18994 if (io->ipsec_out_ip_nexthop) { 18995 ip_nexthop = B_TRUE; 18996 nexthop_addr = io->ipsec_out_nexthop_addr; 18997 18998 ipha = (ipha_t *)mp->b_rptr; 18999 dst = ipha->ipha_dst; 19000 goto send_from_ill; 19001 } else { 19002 ASSERT(io->ipsec_out_ill_index != 0); 19003 ifindex = io->ipsec_out_ill_index; 19004 ill = ill_lookup_on_ifindex(ifindex, B_FALSE, 19005 NULL, NULL, NULL, NULL); 19006 /* 19007 * ipsec_out_xmit_if bit is used to tell 19008 * ip_wput to use the ill to send outgoing data 19009 * as we have no conn when data comes from ICMP 19010 * error msg routines. Currently this feature is 19011 * only used by ip_mrtun_forward routine. 
19012 */ 19013 if (io->ipsec_out_xmit_if) { 19014 xmit_ill = ill; 19015 if (xmit_ill == NULL) { 19016 ip1dbg(("ip_output:bad ifindex " 19017 "for xmit_ill %d\n", 19018 ifindex)); 19019 freemsg(first_mp); 19020 BUMP_MIB(&ip_mib, 19021 ipOutDiscards); 19022 ASSERT(!need_decref); 19023 return; 19024 } 19025 /* Free up the ipsec_out_t mblk */ 19026 ASSERT(first_mp->b_cont == mp); 19027 first_mp->b_cont = NULL; 19028 freeb(first_mp); 19029 /* Just send the IP header+ICMP+data */ 19030 first_mp = mp; 19031 ipha = (ipha_t *)mp->b_rptr; 19032 dst = ipha->ipha_dst; 19033 goto send_from_ill; 19034 } else { 19035 attach_ill = ill; 19036 } 19037 19038 if (attach_ill == NULL) { 19039 ASSERT(xmit_ill == NULL); 19040 ip1dbg(("ip_output: bad ifindex for " 19041 "(BIND TO IPIF_NOFAILOVER) %d\n", 19042 ifindex)); 19043 freemsg(first_mp); 19044 BUMP_MIB(&ip_mib, ipOutDiscards); 19045 ASSERT(!need_decref); 19046 return; 19047 } 19048 } 19049 } 19050 } 19051 19052 ASSERT(xmit_ill == NULL); 19053 19054 /* We have a complete IP datagram heading outbound. */ 19055 ipha = (ipha_t *)mp->b_rptr; 19056 19057 #ifndef SPEED_BEFORE_SAFETY 19058 /* 19059 * Make sure we have a full-word aligned message and that at least 19060 * a simple IP header is accessible in the first message. If not, 19061 * try a pullup. 19062 */ 19063 if (!OK_32PTR(rptr) || 19064 (mp->b_wptr - rptr) < IP_SIMPLE_HDR_LENGTH) { 19065 hdrtoosmall: 19066 if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) { 19067 BUMP_MIB(&ip_mib, ipOutDiscards); 19068 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 19069 "ip_wput_end: q %p (%S)", q, "pullupfailed"); 19070 if (first_mp == NULL) 19071 first_mp = mp; 19072 goto drop_pkt; 19073 } 19074 19075 /* This function assumes that mp points to an IPv4 packet. 
*/ 19076 if (is_system_labeled() && q->q_next == NULL && 19077 (*mp->b_rptr & 0xf0) == (IPV4_VERSION << 4) && 19078 !connp->conn_ulp_labeled) { 19079 err = tsol_check_label(BEST_CRED(mp, connp), &mp, 19080 &adjust, connp->conn_mac_exempt); 19081 ipha = (ipha_t *)mp->b_rptr; 19082 if (first_mp != NULL) 19083 first_mp->b_cont = mp; 19084 if (err != 0) { 19085 if (first_mp == NULL) 19086 first_mp = mp; 19087 if (err == EINVAL) 19088 goto icmp_parameter_problem; 19089 ip2dbg(("ip_wput: label check failed (%d)\n", 19090 err)); 19091 goto drop_pkt; 19092 } 19093 iplen = ntohs(ipha->ipha_length) + adjust; 19094 ipha->ipha_length = htons(iplen); 19095 } 19096 19097 ipha = (ipha_t *)mp->b_rptr; 19098 if (first_mp == NULL) { 19099 ASSERT(attach_ill == NULL && xmit_ill == NULL); 19100 /* 19101 * If we got here because of "goto hdrtoosmall" 19102 * We need to attach a IPSEC_OUT. 19103 */ 19104 if (connp->conn_out_enforce_policy) { 19105 if (((mp = ipsec_attach_ipsec_out(mp, connp, 19106 NULL, ipha->ipha_protocol)) == NULL)) { 19107 if (need_decref) 19108 CONN_DEC_REF(connp); 19109 return; 19110 } else { 19111 ASSERT(mp->b_datap->db_type == M_CTL); 19112 first_mp = mp; 19113 mp = mp->b_cont; 19114 mctl_present = B_TRUE; 19115 } 19116 } else { 19117 first_mp = mp; 19118 mctl_present = B_FALSE; 19119 } 19120 } 19121 } 19122 #endif 19123 19124 /* Most of the code below is written for speed, not readability */ 19125 v_hlen_tos_len = ((uint32_t *)ipha)[0]; 19126 19127 /* 19128 * If ip_newroute() fails, we're going to need a full 19129 * header for the icmp wraparound. 19130 */ 19131 if (V_HLEN != IP_SIMPLE_HDR_VERSION) { 19132 uint_t v_hlen; 19133 version_hdrlen_check: 19134 ASSERT(first_mp != NULL); 19135 v_hlen = V_HLEN; 19136 /* 19137 * siphon off IPv6 packets coming down from transport 19138 * layer modules here. 
19139 * Note: high-order bit carries NUD reachability confirmation 19140 */ 19141 if (((v_hlen >> 4) & 0x7) == IPV6_VERSION) { 19142 /* 19143 * XXX implement a IPv4 and IPv6 packet counter per 19144 * conn and switch when ratio exceeds e.g. 10:1 19145 */ 19146 #ifdef notyet 19147 if (q->q_next == NULL) /* Avoid ill queue */ 19148 ip_setqinfo(RD(q), B_TRUE, B_TRUE); 19149 #endif 19150 BUMP_MIB(&ip_mib, ipOutIPv6); 19151 ASSERT(xmit_ill == NULL); 19152 if (attach_ill != NULL) 19153 ill_refrele(attach_ill); 19154 if (need_decref) 19155 mp->b_flag |= MSGHASREF; 19156 (void) ip_output_v6(connp, first_mp, q, caller); 19157 return; 19158 } 19159 19160 if ((v_hlen >> 4) != IP_VERSION) { 19161 BUMP_MIB(&ip_mib, ipOutDiscards); 19162 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 19163 "ip_wput_end: q %p (%S)", q, "badvers"); 19164 goto drop_pkt; 19165 } 19166 /* 19167 * Is the header length at least 20 bytes? 19168 * 19169 * Are there enough bytes accessible in the header? If 19170 * not, try a pullup. 19171 */ 19172 v_hlen &= 0xF; 19173 v_hlen <<= 2; 19174 if (v_hlen < IP_SIMPLE_HDR_LENGTH) { 19175 BUMP_MIB(&ip_mib, ipOutDiscards); 19176 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 19177 "ip_wput_end: q %p (%S)", q, "badlen"); 19178 goto drop_pkt; 19179 } 19180 if (v_hlen > (mp->b_wptr - rptr)) { 19181 if (!pullupmsg(mp, v_hlen)) { 19182 BUMP_MIB(&ip_mib, ipOutDiscards); 19183 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 19184 "ip_wput_end: q %p (%S)", q, "badpullup2"); 19185 goto drop_pkt; 19186 } 19187 ipha = (ipha_t *)mp->b_rptr; 19188 } 19189 /* 19190 * Move first entry from any source route into ipha_dst and 19191 * verify the options 19192 */ 19193 if (ip_wput_options(q, first_mp, ipha, mctl_present, zoneid)) { 19194 ASSERT(xmit_ill == NULL); 19195 if (attach_ill != NULL) 19196 ill_refrele(attach_ill); 19197 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 19198 "ip_wput_end: q %p (%S)", q, "badopts"); 19199 if (need_decref) 19200 CONN_DEC_REF(connp); 19201 return; 19202 } 19203 } 19204 dst = ipha->ipha_dst; 
19205 19206 /* 19207 * Try to get an IRE_CACHE for the destination address. If we can't, 19208 * we have to run the packet through ip_newroute which will take 19209 * the appropriate action to arrange for an IRE_CACHE, such as querying 19210 * a resolver, or assigning a default gateway, etc. 19211 */ 19212 if (CLASSD(dst)) { 19213 ipif_t *ipif; 19214 uint32_t setsrc = 0; 19215 19216 multicast: 19217 ASSERT(first_mp != NULL); 19218 ASSERT(xmit_ill == NULL); 19219 ip2dbg(("ip_wput: CLASSD\n")); 19220 if (connp == NULL) { 19221 /* 19222 * Use the first good ipif on the ill. 19223 * XXX Should this ever happen? (Appears 19224 * to show up with just ppp and no ethernet due 19225 * to in.rdisc.) 19226 * However, ire_send should be able to 19227 * call ip_wput_ire directly. 19228 * 19229 * XXX Also, this can happen for ICMP and other packets 19230 * with multicast source addresses. Perhaps we should 19231 * fix things so that we drop the packet in question, 19232 * but for now, just run with it. 19233 */ 19234 ill_t *ill = (ill_t *)q->q_ptr; 19235 19236 /* 19237 * Don't honor attach_if for this case. If ill 19238 * is part of the group, ipif could belong to 19239 * any ill and we cannot maintain attach_ill 19240 * and ipif_ill same anymore and the assert 19241 * below would fail. 19242 */ 19243 if (mctl_present) { 19244 io->ipsec_out_ill_index = 0; 19245 io->ipsec_out_attach_if = B_FALSE; 19246 ASSERT(attach_ill != NULL); 19247 ill_refrele(attach_ill); 19248 attach_ill = NULL; 19249 } 19250 19251 ASSERT(attach_ill == NULL); 19252 ipif = ipif_select_source(ill, dst, GLOBAL_ZONEID); 19253 if (ipif == NULL) { 19254 if (need_decref) 19255 CONN_DEC_REF(connp); 19256 freemsg(first_mp); 19257 return; 19258 } 19259 ip1dbg(("ip_wput: CLASSD no CONN: dst 0x%x on %s\n", 19260 ntohl(dst), ill->ill_name)); 19261 } else { 19262 /* 19263 * If both IP_MULTICAST_IF and IP_XMIT_IF are set, 19264 * IP_XMIT_IF is honoured. 
19265 * Block comment above this function explains the 19266 * locking mechanism used here 19267 */ 19268 xmit_ill = conn_get_held_ill(connp, 19269 &connp->conn_xmit_if_ill, &err); 19270 if (err == ILL_LOOKUP_FAILED) { 19271 ip1dbg(("ip_wput: No ill for IP_XMIT_IF\n")); 19272 goto drop_pkt; 19273 } 19274 if (xmit_ill == NULL) { 19275 ipif = conn_get_held_ipif(connp, 19276 &connp->conn_multicast_ipif, &err); 19277 if (err == IPIF_LOOKUP_FAILED) { 19278 ip1dbg(("ip_wput: No ipif for " 19279 "multicast\n")); 19280 BUMP_MIB(&ip_mib, ipOutNoRoutes); 19281 goto drop_pkt; 19282 } 19283 } 19284 if (xmit_ill != NULL) { 19285 ipif = ipif_get_next_ipif(NULL, xmit_ill); 19286 if (ipif == NULL) { 19287 ip1dbg(("ip_wput: No ipif for " 19288 "IP_XMIT_IF\n")); 19289 BUMP_MIB(&ip_mib, ipOutNoRoutes); 19290 goto drop_pkt; 19291 } 19292 } else if (ipif == NULL || ipif->ipif_isv6) { 19293 /* 19294 * We must do this ipif determination here 19295 * else we could pass through ip_newroute 19296 * and come back here without the conn context. 19297 * 19298 * Note: we do late binding i.e. we bind to 19299 * the interface when the first packet is sent. 19300 * For performance reasons we do not rebind on 19301 * each packet but keep the binding until the 19302 * next IP_MULTICAST_IF option. 19303 * 19304 * conn_multicast_{ipif,ill} are shared between 19305 * IPv4 and IPv6 and AF_INET6 sockets can 19306 * send both IPv4 and IPv6 packets. Hence 19307 * we have to check that "isv6" matches above. 
19308 */ 19309 if (ipif != NULL) 19310 ipif_refrele(ipif); 19311 ipif = ipif_lookup_group(dst, zoneid); 19312 if (ipif == NULL) { 19313 ip1dbg(("ip_wput: No ipif for " 19314 "multicast\n")); 19315 BUMP_MIB(&ip_mib, ipOutNoRoutes); 19316 goto drop_pkt; 19317 } 19318 err = conn_set_held_ipif(connp, 19319 &connp->conn_multicast_ipif, ipif); 19320 if (err == IPIF_LOOKUP_FAILED) { 19321 ipif_refrele(ipif); 19322 ip1dbg(("ip_wput: No ipif for " 19323 "multicast\n")); 19324 BUMP_MIB(&ip_mib, ipOutNoRoutes); 19325 goto drop_pkt; 19326 } 19327 } 19328 } 19329 ASSERT(!ipif->ipif_isv6); 19330 /* 19331 * As we may lose the conn by the time we reach ip_wput_ire, 19332 * we copy conn_multicast_loop and conn_dontroute on to an 19333 * ipsec_out. In case if this datagram goes out secure, 19334 * we need the ill_index also. Copy that also into the 19335 * ipsec_out. 19336 */ 19337 if (mctl_present) { 19338 io = (ipsec_out_t *)first_mp->b_rptr; 19339 ASSERT(first_mp->b_datap->db_type == M_CTL); 19340 ASSERT(io->ipsec_out_type == IPSEC_OUT); 19341 } else { 19342 ASSERT(mp == first_mp); 19343 if ((first_mp = allocb(sizeof (ipsec_info_t), 19344 BPRI_HI)) == NULL) { 19345 ipif_refrele(ipif); 19346 first_mp = mp; 19347 goto drop_pkt; 19348 } 19349 first_mp->b_datap->db_type = M_CTL; 19350 first_mp->b_wptr += sizeof (ipsec_info_t); 19351 /* ipsec_out_secure is B_FALSE now */ 19352 bzero(first_mp->b_rptr, sizeof (ipsec_info_t)); 19353 io = (ipsec_out_t *)first_mp->b_rptr; 19354 io->ipsec_out_type = IPSEC_OUT; 19355 io->ipsec_out_len = sizeof (ipsec_out_t); 19356 io->ipsec_out_use_global_policy = B_TRUE; 19357 first_mp->b_cont = mp; 19358 mctl_present = B_TRUE; 19359 } 19360 if (attach_ill != NULL) { 19361 ASSERT(attach_ill == ipif->ipif_ill); 19362 match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR; 19363 19364 /* 19365 * Check if we need an ire that will not be 19366 * looked up by anybody else i.e. HIDDEN. 
19367 */ 19368 if (ill_is_probeonly(attach_ill)) { 19369 match_flags |= MATCH_IRE_MARK_HIDDEN; 19370 } 19371 io->ipsec_out_ill_index = 19372 attach_ill->ill_phyint->phyint_ifindex; 19373 io->ipsec_out_attach_if = B_TRUE; 19374 } else { 19375 match_flags = MATCH_IRE_ILL_GROUP | MATCH_IRE_SECATTR; 19376 io->ipsec_out_ill_index = 19377 ipif->ipif_ill->ill_phyint->phyint_ifindex; 19378 } 19379 if (connp != NULL) { 19380 io->ipsec_out_multicast_loop = 19381 connp->conn_multicast_loop; 19382 io->ipsec_out_dontroute = connp->conn_dontroute; 19383 io->ipsec_out_zoneid = connp->conn_zoneid; 19384 } 19385 /* 19386 * If the application uses IP_MULTICAST_IF with 19387 * different logical addresses of the same ILL, we 19388 * need to make sure that the soruce address of 19389 * the packet matches the logical IP address used 19390 * in the option. We do it by initializing ipha_src 19391 * here. This should keep IPSEC also happy as 19392 * when we return from IPSEC processing, we don't 19393 * have to worry about getting the right address on 19394 * the packet. Thus it is sufficient to look for 19395 * IRE_CACHE using MATCH_IRE_ILL rathen than 19396 * MATCH_IRE_IPIF. 19397 * 19398 * NOTE : We need to do it for non-secure case also as 19399 * this might go out secure if there is a global policy 19400 * match in ip_wput_ire. For bind to IPIF_NOFAILOVER 19401 * address, the source should be initialized already and 19402 * hence we won't be initializing here. 19403 * 19404 * As we do not have the ire yet, it is possible that 19405 * we set the source address here and then later discover 19406 * that the ire implies the source address to be assigned 19407 * through the RTF_SETSRC flag. 19408 * In that case, the setsrc variable will remind us 19409 * that overwritting the source address by the one 19410 * of the RTF_SETSRC-flagged ire is allowed. 
19411 */ 19412 if (ipha->ipha_src == INADDR_ANY && 19413 (connp == NULL || !connp->conn_unspec_src)) { 19414 ipha->ipha_src = ipif->ipif_src_addr; 19415 setsrc = RTF_SETSRC; 19416 } 19417 /* 19418 * Find an IRE which matches the destination and the outgoing 19419 * queue (i.e. the outgoing interface.) 19420 * For loopback use a unicast IP address for 19421 * the ire lookup. 19422 */ 19423 if (ipif->ipif_ill->ill_phyint->phyint_flags & 19424 PHYI_LOOPBACK) { 19425 dst = ipif->ipif_lcl_addr; 19426 } 19427 /* 19428 * If IP_XMIT_IF is set, we branch out to ip_newroute_ipif. 19429 * We don't need to lookup ire in ctable as the packet 19430 * needs to be sent to the destination through the specified 19431 * ill irrespective of ires in the cache table. 19432 */ 19433 ire = NULL; 19434 if (xmit_ill == NULL) { 19435 ire = ire_ctable_lookup(dst, 0, 0, ipif, 19436 zoneid, MBLK_GETLABEL(mp), match_flags); 19437 } 19438 19439 /* 19440 * refrele attach_ill as its not needed anymore. 19441 */ 19442 if (attach_ill != NULL) { 19443 ill_refrele(attach_ill); 19444 attach_ill = NULL; 19445 } 19446 19447 if (ire == NULL) { 19448 /* 19449 * Multicast loopback and multicast forwarding is 19450 * done in ip_wput_ire. 19451 * 19452 * Mark this packet to make it be delivered to 19453 * ip_wput_ire after the new ire has been 19454 * created. 19455 * 19456 * The call to ip_newroute_ipif takes into account 19457 * the setsrc reminder. In any case, we take care 19458 * of the RTF_MULTIRT flag. 
19459 */ 19460 mp->b_prev = mp->b_next = NULL; 19461 if (xmit_ill == NULL || 19462 xmit_ill->ill_ipif_up_count > 0) { 19463 ip_newroute_ipif(q, first_mp, ipif, dst, connp, 19464 setsrc | RTF_MULTIRT); 19465 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 19466 "ip_wput_end: q %p (%S)", q, "noire"); 19467 } else { 19468 freemsg(first_mp); 19469 } 19470 ipif_refrele(ipif); 19471 if (xmit_ill != NULL) 19472 ill_refrele(xmit_ill); 19473 if (need_decref) 19474 CONN_DEC_REF(connp); 19475 return; 19476 } 19477 19478 ipif_refrele(ipif); 19479 ipif = NULL; 19480 ASSERT(xmit_ill == NULL); 19481 19482 /* 19483 * Honor the RTF_SETSRC flag for multicast packets, 19484 * if allowed by the setsrc reminder. 19485 */ 19486 if ((ire->ire_flags & RTF_SETSRC) && setsrc) { 19487 ipha->ipha_src = ire->ire_src_addr; 19488 } 19489 19490 /* 19491 * Unconditionally force the TTL to 1 for 19492 * multirouted multicast packets: 19493 * multirouted multicast should not cross 19494 * multicast routers. 19495 */ 19496 if (ire->ire_flags & RTF_MULTIRT) { 19497 if (ipha->ipha_ttl > 1) { 19498 ip2dbg(("ip_wput: forcing multicast " 19499 "multirt TTL to 1 (was %d), dst 0x%08x\n", 19500 ipha->ipha_ttl, ntohl(ire->ire_addr))); 19501 ipha->ipha_ttl = 1; 19502 } 19503 } 19504 } else { 19505 ire = ire_cache_lookup(dst, zoneid, MBLK_GETLABEL(mp)); 19506 if ((ire != NULL) && (ire->ire_type & 19507 (IRE_BROADCAST | IRE_LOCAL | IRE_LOOPBACK))) { 19508 ignore_dontroute = B_TRUE; 19509 ignore_nexthop = B_TRUE; 19510 } 19511 if (ire != NULL) { 19512 ire_refrele(ire); 19513 ire = NULL; 19514 } 19515 /* 19516 * Guard against coming in from arp in which case conn is NULL. 19517 * Also guard against non M_DATA with dontroute set but 19518 * destined to local, loopback or broadcast addresses. 
19519 */ 19520 if (connp != NULL && connp->conn_dontroute && 19521 !ignore_dontroute) { 19522 dontroute: 19523 /* 19524 * Set TTL to 1 if SO_DONTROUTE is set to prevent 19525 * routing protocols from seeing false direct 19526 * connectivity. 19527 */ 19528 ipha->ipha_ttl = 1; 19529 /* 19530 * If IP_XMIT_IF is also set (conn_xmit_if_ill != NULL) 19531 * along with SO_DONTROUTE, higher precedence is 19532 * given to IP_XMIT_IF and the IP_XMIT_IF ipif is used. 19533 */ 19534 if (connp->conn_xmit_if_ill == NULL) { 19535 /* If suitable ipif not found, drop packet */ 19536 dst_ipif = ipif_lookup_onlink_addr(dst, zoneid); 19537 if (dst_ipif == NULL) { 19538 ip1dbg(("ip_wput: no route for " 19539 "dst using SO_DONTROUTE\n")); 19540 BUMP_MIB(&ip_mib, ipOutNoRoutes); 19541 mp->b_prev = mp->b_next = NULL; 19542 if (first_mp == NULL) 19543 first_mp = mp; 19544 goto drop_pkt; 19545 } else { 19546 /* 19547 * If suitable ipif has been found, set 19548 * xmit_ill to the corresponding 19549 * ipif_ill because we'll be following 19550 * the IP_XMIT_IF logic. 19551 */ 19552 ASSERT(xmit_ill == NULL); 19553 xmit_ill = dst_ipif->ipif_ill; 19554 mutex_enter(&xmit_ill->ill_lock); 19555 if (!ILL_CAN_LOOKUP(xmit_ill)) { 19556 mutex_exit(&xmit_ill->ill_lock); 19557 xmit_ill = NULL; 19558 ipif_refrele(dst_ipif); 19559 ip1dbg(("ip_wput: no route for" 19560 " dst using" 19561 " SO_DONTROUTE\n")); 19562 BUMP_MIB(&ip_mib, 19563 ipOutNoRoutes); 19564 mp->b_prev = mp->b_next = NULL; 19565 if (first_mp == NULL) 19566 first_mp = mp; 19567 goto drop_pkt; 19568 } 19569 ill_refhold_locked(xmit_ill); 19570 mutex_exit(&xmit_ill->ill_lock); 19571 ipif_refrele(dst_ipif); 19572 } 19573 } 19574 19575 } 19576 /* 19577 * If we are bound to IPIF_NOFAILOVER address, look for 19578 * an IRE_CACHE matching the ill. 
19579 */ 19580 send_from_ill: 19581 if (attach_ill != NULL) { 19582 ipif_t *attach_ipif; 19583 19584 match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR; 19585 19586 /* 19587 * Check if we need an ire that will not be 19588 * looked up by anybody else i.e. HIDDEN. 19589 */ 19590 if (ill_is_probeonly(attach_ill)) { 19591 match_flags |= MATCH_IRE_MARK_HIDDEN; 19592 } 19593 19594 attach_ipif = ipif_get_next_ipif(NULL, attach_ill); 19595 if (attach_ipif == NULL) { 19596 ip1dbg(("ip_wput: No ipif for attach_ill\n")); 19597 goto drop_pkt; 19598 } 19599 ire = ire_ctable_lookup(dst, 0, 0, attach_ipif, 19600 zoneid, MBLK_GETLABEL(mp), match_flags); 19601 ipif_refrele(attach_ipif); 19602 } else if (xmit_ill != NULL || (connp != NULL && 19603 connp->conn_xmit_if_ill != NULL)) { 19604 /* 19605 * Mark this packet as originated locally 19606 */ 19607 mp->b_prev = mp->b_next = NULL; 19608 /* 19609 * xmit_ill could be NULL if SO_DONTROUTE 19610 * is also set. 19611 */ 19612 if (xmit_ill == NULL) { 19613 xmit_ill = conn_get_held_ill(connp, 19614 &connp->conn_xmit_if_ill, &err); 19615 if (err == ILL_LOOKUP_FAILED) { 19616 if (need_decref) 19617 CONN_DEC_REF(connp); 19618 freemsg(first_mp); 19619 return; 19620 } 19621 if (xmit_ill == NULL) { 19622 if (connp->conn_dontroute) 19623 goto dontroute; 19624 goto send_from_ill; 19625 } 19626 } 19627 /* 19628 * could be SO_DONTROUTE case also. 
19629 * check at least one interface is UP as 19630 * spcified by this ILL, and then call 19631 * ip_newroute_ipif() 19632 */ 19633 if (xmit_ill->ill_ipif_up_count > 0) { 19634 ipif_t *ipif; 19635 19636 ipif = ipif_get_next_ipif(NULL, xmit_ill); 19637 if (ipif != NULL) { 19638 ip_newroute_ipif(q, first_mp, ipif, 19639 dst, connp, 0); 19640 ipif_refrele(ipif); 19641 ip1dbg(("ip_wput: ip_unicast_if\n")); 19642 } 19643 } else { 19644 freemsg(first_mp); 19645 } 19646 ill_refrele(xmit_ill); 19647 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 19648 "ip_wput_end: q %p (%S)", q, "unicast_if"); 19649 if (need_decref) 19650 CONN_DEC_REF(connp); 19651 return; 19652 } else if (ip_nexthop || (connp != NULL && 19653 (connp->conn_nexthop_set)) && !ignore_nexthop) { 19654 if (!ip_nexthop) { 19655 ip_nexthop = B_TRUE; 19656 nexthop_addr = connp->conn_nexthop_v4; 19657 } 19658 match_flags = MATCH_IRE_MARK_PRIVATE_ADDR | 19659 MATCH_IRE_GW; 19660 ire = ire_ctable_lookup(dst, nexthop_addr, 0, 19661 NULL, zoneid, MBLK_GETLABEL(mp), match_flags); 19662 } else { 19663 ire = ire_cache_lookup(dst, zoneid, MBLK_GETLABEL(mp)); 19664 } 19665 if (!ire) { 19666 /* 19667 * Make sure we don't load spread if this 19668 * is IPIF_NOFAILOVER case. 
19669 */ 19670 if ((attach_ill != NULL) || 19671 (ip_nexthop && !ignore_nexthop)) { 19672 if (mctl_present) { 19673 io = (ipsec_out_t *)first_mp->b_rptr; 19674 ASSERT(first_mp->b_datap->db_type == 19675 M_CTL); 19676 ASSERT(io->ipsec_out_type == IPSEC_OUT); 19677 } else { 19678 ASSERT(mp == first_mp); 19679 first_mp = allocb( 19680 sizeof (ipsec_info_t), BPRI_HI); 19681 if (first_mp == NULL) { 19682 first_mp = mp; 19683 goto drop_pkt; 19684 } 19685 first_mp->b_datap->db_type = M_CTL; 19686 first_mp->b_wptr += 19687 sizeof (ipsec_info_t); 19688 /* ipsec_out_secure is B_FALSE now */ 19689 bzero(first_mp->b_rptr, 19690 sizeof (ipsec_info_t)); 19691 io = (ipsec_out_t *)first_mp->b_rptr; 19692 io->ipsec_out_type = IPSEC_OUT; 19693 io->ipsec_out_len = 19694 sizeof (ipsec_out_t); 19695 io->ipsec_out_use_global_policy = 19696 B_TRUE; 19697 first_mp->b_cont = mp; 19698 mctl_present = B_TRUE; 19699 } 19700 if (attach_ill != NULL) { 19701 io->ipsec_out_ill_index = attach_ill-> 19702 ill_phyint->phyint_ifindex; 19703 io->ipsec_out_attach_if = B_TRUE; 19704 } else { 19705 io->ipsec_out_ip_nexthop = ip_nexthop; 19706 io->ipsec_out_nexthop_addr = 19707 nexthop_addr; 19708 } 19709 } 19710 noirefound: 19711 /* 19712 * Mark this packet as having originated on 19713 * this machine. This will be noted in 19714 * ire_add_then_send, which needs to know 19715 * whether to run it back through ip_wput or 19716 * ip_rput following successful resolution. 19717 */ 19718 mp->b_prev = NULL; 19719 mp->b_next = NULL; 19720 ip_newroute(q, first_mp, dst, NULL, connp); 19721 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 19722 "ip_wput_end: q %p (%S)", q, "newroute"); 19723 if (attach_ill != NULL) 19724 ill_refrele(attach_ill); 19725 if (xmit_ill != NULL) 19726 ill_refrele(xmit_ill); 19727 if (need_decref) 19728 CONN_DEC_REF(connp); 19729 return; 19730 } 19731 } 19732 19733 /* We now know where we are going with it. 
*/ 19734 19735 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 19736 "ip_wput_end: q %p (%S)", q, "end"); 19737 19738 /* 19739 * Check if the ire has the RTF_MULTIRT flag, inherited 19740 * from an IRE_OFFSUBNET ire entry in ip_newroute. 19741 */ 19742 if (ire->ire_flags & RTF_MULTIRT) { 19743 /* 19744 * Force the TTL of multirouted packets if required. 19745 * The TTL of such packets is bounded by the 19746 * ip_multirt_ttl ndd variable. 19747 */ 19748 if ((ip_multirt_ttl > 0) && 19749 (ipha->ipha_ttl > ip_multirt_ttl)) { 19750 ip2dbg(("ip_wput: forcing multirt TTL to %d " 19751 "(was %d), dst 0x%08x\n", 19752 ip_multirt_ttl, ipha->ipha_ttl, 19753 ntohl(ire->ire_addr))); 19754 ipha->ipha_ttl = ip_multirt_ttl; 19755 } 19756 /* 19757 * At this point, we check to see if there are any pending 19758 * unresolved routes. ire_multirt_resolvable() 19759 * checks in O(n) that all IRE_OFFSUBNET ire 19760 * entries for the packet's destination and 19761 * flagged RTF_MULTIRT are currently resolved. 19762 * If some remain unresolved, we make a copy 19763 * of the current message. It will be used 19764 * to initiate additional route resolutions. 19765 */ 19766 multirt_need_resolve = ire_multirt_need_resolve(ire->ire_addr, 19767 MBLK_GETLABEL(first_mp)); 19768 ip2dbg(("ip_wput[noirefound]: ire %p, " 19769 "multirt_need_resolve %d, first_mp %p\n", 19770 (void *)ire, multirt_need_resolve, (void *)first_mp)); 19771 if (multirt_need_resolve) { 19772 copy_mp = copymsg(first_mp); 19773 if (copy_mp != NULL) { 19774 MULTIRT_DEBUG_TAG(copy_mp); 19775 } 19776 } 19777 } 19778 19779 ip_wput_ire(q, first_mp, ire, connp, caller); 19780 /* 19781 * Try to resolve another multiroute if 19782 * ire_multirt_resolvable() deemed it necessary. 19783 * At this point, we need to distinguish 19784 * multicasts from other packets. For multicasts, 19785 * we call ip_newroute_ipif() and request that both 19786 * multirouting and setsrc flags are checked. 
 */
	if (copy_mp != NULL) {
		/*
		 * A copy was made above because ire_multirt_need_resolve()
		 * reported unresolved RTF_MULTIRT routes; kick off the
		 * additional resolution now.
		 */
		if (CLASSD(dst)) {
			/* Multicast: resolve via an interface in the group. */
			ipif_t *ipif = ipif_lookup_group(dst, zoneid);
			if (ipif) {
				ip_newroute_ipif(q, copy_mp, ipif, dst, connp,
				    RTF_SETSRC | RTF_MULTIRT);
				ipif_refrele(ipif);
			} else {
				/* No ipif for the group; drop the copy. */
				MULTIRT_DEBUG_UNTAG(copy_mp);
				freemsg(copy_mp);
				copy_mp = NULL;
			}
		} else {
			/* Unicast: ordinary route resolution for the copy. */
			ip_newroute(q, copy_mp, dst, NULL, connp);
		}
	}
	/* Release any interface references held across the send. */
	if (attach_ill != NULL)
		ill_refrele(attach_ill);
	if (xmit_ill != NULL)
		ill_refrele(xmit_ill);
	if (need_decref)
		CONN_DEC_REF(connp);
	return;

icmp_parameter_problem:
	/* could not have originated externally */
	ASSERT(mp->b_prev == NULL);
	if (ip_hdr_complete(ipha, zoneid) == 0) {
		BUMP_MIB(&ip_mib, ipOutNoRoutes);
		/* it's the IP header length that's in trouble */
		icmp_param_problem(q, first_mp, 0);
		first_mp = NULL;
	}
	/*
	 * Falls through to drop_pkt; if icmp_param_problem() consumed the
	 * message, first_mp is NULL and freemsg(NULL) below is a no-op.
	 */

drop_pkt:
	ip1dbg(("ip_wput: dropped packet\n"));
	if (ire != NULL)
		ire_refrele(ire);
	if (need_decref)
		CONN_DEC_REF(connp);
	freemsg(first_mp);
	if (attach_ill != NULL)
		ill_refrele(attach_ill);
	if (xmit_ill != NULL)
		ill_refrele(xmit_ill);
	TRACE_2(TR_FAC_IP, TR_IP_WPUT_END,
	    "ip_wput_end: q %p (%S)", q, "droppkt");
}

/*
 * STREAMS write-side put procedure for IP: forwards to ip_output()
 * with the conn recovered from the queue.
 */
void
ip_wput(queue_t *q, mblk_t *mp)
{
	ip_output(Q_TO_CONN(q), mp, q, IP_WPUT);
}

/*
 *
 * The following rules must be observed when accessing any ipif or ill
 * that has been cached in the conn. Typically conn_nofailover_ill,
 * conn_xmit_if_ill, conn_multicast_ipif and conn_multicast_ill.
 *
 * Access: The ipif or ill pointed to from the conn can be accessed under
 * the protection of the conn_lock or after it has been refheld under the
 * protection of the conn lock.
In addition the IPIF_CAN_LOOKUP or 19852 * ILL_CAN_LOOKUP macros must be used before actually doing the refhold. 19853 * The reason for this is that a concurrent unplumb could actually be 19854 * cleaning up these cached pointers by walking the conns and might have 19855 * finished cleaning up the conn in question. The macros check that an 19856 * unplumb has not yet started on the ipif or ill. 19857 * 19858 * Caching: An ipif or ill pointer may be cached in the conn only after 19859 * making sure that an unplumb has not started. So the caching is done 19860 * while holding both the conn_lock and the ill_lock and after using the 19861 * ILL_CAN_LOOKUP/IPIF_CAN_LOOKUP macro. An unplumb will set the ILL_CONDEMNED 19862 * flag before starting the cleanup of conns. 19863 * 19864 * The list of ipifs hanging off the ill is protected by ill_g_lock and ill_lock 19865 * On the other hand to access ipif->ipif_ill, we need one of either ill_g_lock 19866 * or a reference to the ipif or a reference to an ire that references the 19867 * ipif. An ipif does not change its ill except for failover/failback. Since 19868 * failover/failback happens only after bringing down the ipif and making sure 19869 * the ipif refcnt has gone to zero and holding the ill_g_lock and ill_lock 19870 * the above holds. 
 */
/*
 * Return the ipif cached at *ipifp (e.g. conn_multicast_ipif) with a
 * reference held, or NULL. On failure *err is set to IPIF_LOOKUP_FAILED
 * when the ipif exists but is being unplumbed (IPIF_CAN_LOOKUP fails),
 * and left 0 when *ipifp is simply NULL.
 *
 * Lock order: ill_g_lock (reader) -> conn_lock -> ill_lock. ill_g_lock
 * keeps the ipif->ipif_ill association stable while we take ill_lock
 * (see the block comment above).
 */
ipif_t *
conn_get_held_ipif(conn_t *connp, ipif_t **ipifp, int *err)
{
	ipif_t	*ipif;
	ill_t	*ill;

	*err = 0;
	rw_enter(&ill_g_lock, RW_READER);
	mutex_enter(&connp->conn_lock);
	ipif = *ipifp;
	if (ipif != NULL) {
		ill = ipif->ipif_ill;
		mutex_enter(&ill->ill_lock);
		if (IPIF_CAN_LOOKUP(ipif)) {
			/* Refhold under ill_lock, per the access rules. */
			ipif_refhold_locked(ipif);
			mutex_exit(&ill->ill_lock);
			mutex_exit(&connp->conn_lock);
			rw_exit(&ill_g_lock);
			return (ipif);
		} else {
			/* Unplumb in progress; caller must not use ipif. */
			*err = IPIF_LOOKUP_FAILED;
		}
		mutex_exit(&ill->ill_lock);
	}
	mutex_exit(&connp->conn_lock);
	rw_exit(&ill_g_lock);
	return (NULL);
}

/*
 * Return the ill cached at *illp (e.g. conn_xmit_if_ill) with a reference
 * held, or NULL. *err is set to ILL_LOOKUP_FAILED when the cached ill is
 * condemned (ILL_CAN_LOOKUP fails); it stays 0 when *illp is NULL.
 * No ill_g_lock needed here: we dereference only the ill itself.
 */
ill_t *
conn_get_held_ill(conn_t *connp, ill_t **illp, int *err)
{
	ill_t	*ill;

	*err = 0;
	mutex_enter(&connp->conn_lock);
	ill = *illp;
	if (ill != NULL) {
		mutex_enter(&ill->ill_lock);
		if (ILL_CAN_LOOKUP(ill)) {
			ill_refhold_locked(ill);
			mutex_exit(&ill->ill_lock);
			mutex_exit(&connp->conn_lock);
			return (ill);
		} else {
			*err = ILL_LOOKUP_FAILED;
		}
		mutex_exit(&ill->ill_lock);
	}
	mutex_exit(&connp->conn_lock);
	return (NULL);
}

/*
 * Cache ipif at *ipifp (a conn field) after verifying no unplumb has
 * begun. Caching is done holding both conn_lock and ill_lock, per the
 * rules in the block comment above. Returns 0 on success or
 * IPIF_LOOKUP_FAILED if the ipif is already condemned.
 */
static int
conn_set_held_ipif(conn_t *connp, ipif_t **ipifp, ipif_t *ipif)
{
	ill_t	*ill;

	ill = ipif->ipif_ill;
	mutex_enter(&connp->conn_lock);
	mutex_enter(&ill->ill_lock);
	if (IPIF_CAN_LOOKUP(ipif)) {
		*ipifp = ipif;
		mutex_exit(&ill->ill_lock);
		mutex_exit(&connp->conn_lock);
		return (0);
	}
	mutex_exit(&ill->ill_lock);
	mutex_exit(&connp->conn_lock);
	return (IPIF_LOOKUP_FAILED);
}

/*
 * This is called if the outbound datagram needs fragmentation.
 *
 * NOTE : This function does not ire_refrele the ire argument passed in.
 */
/*
 * Fragment an outbound IPv4 datagram that exceeds the ire's MTU.
 * ipsec_mp may be either the bare data mblk or an M_CTL (ipsec_out_t)
 * with the data chained on b_cont. Performs a sanity check that the
 * header's length field matches the mblk length, then hands the packet
 * to ip_wput_frag(). Consumes ipsec_mp on the error path; does NOT
 * ire_refrele the ire.
 */
static void
ip_wput_ire_fragmentit(mblk_t *ipsec_mp, ire_t *ire)
{
	ipha_t		*ipha;
	mblk_t		*mp;
	uint32_t	v_hlen_tos_len;
	uint32_t	max_frag;
	uint32_t	frag_flag;
	boolean_t	dont_use;

	/* Skip over a leading IPSEC_OUT control block, if any. */
	if (ipsec_mp->b_datap->db_type == M_CTL) {
		mp = ipsec_mp->b_cont;
	} else {
		mp = ipsec_mp;
	}

	ipha = (ipha_t *)mp->b_rptr;
	/* First 32 bits of the header: version/hlen, TOS, total length. */
	v_hlen_tos_len = ((uint32_t *)ipha)[0];

#ifdef	_BIG_ENDIAN
#define	V_HLEN	(v_hlen_tos_len >> 24)
#define	LENGTH	(v_hlen_tos_len & 0xFFFF)
#else
#define	V_HLEN	(v_hlen_tos_len & 0xFF)
#define	LENGTH	((v_hlen_tos_len >> 24) | ((v_hlen_tos_len >> 8) & 0xFF00))
#endif

#ifndef SPEED_BEFORE_SAFETY
	/*
	 * Check that ipha_length is consistent with
	 * the mblk length.
	 *
	 * NOTE(review): "rptr" here relies on a #define rptr
	 * ((uchar_t *)ipha) earlier in this file being in scope —
	 * confirm; otherwise this #ifndef arm would not compile.
	 */
	if (LENGTH != (mp->b_cont ? msgdsize(mp) : mp->b_wptr - rptr)) {
		ip0dbg(("Packet length mismatch: %d, %ld\n",
		    LENGTH, msgdsize(mp)));
		freemsg(ipsec_mp);
		TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END,
		    "ip_wput_ire_fragmentit: mp %p (%S)", mp,
		    "packet length mismatch");
		return;
	}
#endif
	/*
	 * Don't use frag_flag if pre-built packet or source
	 * routed or if multicast (since multicast packets do not solicit
	 * ICMP "packet too big" messages). Get the values of
	 * max_frag and frag_flag atomically by acquiring the
	 * ire_lock.
	 */
	mutex_enter(&ire->ire_lock);
	max_frag = ire->ire_max_frag;
	frag_flag = ire->ire_frag_flag;
	mutex_exit(&ire->ire_lock);

	dont_use = ((ipha->ipha_ident == IP_HDR_INCLUDED) ||
	    (V_HLEN != IP_SIMPLE_HDR_VERSION &&
	    ip_source_route_included(ipha)) || CLASSD(ipha->ipha_dst));

	ip_wput_frag(ire, ipsec_mp, OB_PKT, max_frag,
	    (dont_use ? 0 : frag_flag));
}

/*
 * Used for deciding the MSS size for the upper layer.
Thus 20013 * we need to check the outbound policy values in the conn. 20014 */ 20015 int 20016 conn_ipsec_length(conn_t *connp) 20017 { 20018 ipsec_latch_t *ipl; 20019 20020 ipl = connp->conn_latch; 20021 if (ipl == NULL) 20022 return (0); 20023 20024 if (ipl->ipl_out_policy == NULL) 20025 return (0); 20026 20027 return (ipl->ipl_out_policy->ipsp_act->ipa_ovhd); 20028 } 20029 20030 /* 20031 * Returns an estimate of the IPSEC headers size. This is used if 20032 * we don't want to call into IPSEC to get the exact size. 20033 */ 20034 int 20035 ipsec_out_extra_length(mblk_t *ipsec_mp) 20036 { 20037 ipsec_out_t *io = (ipsec_out_t *)ipsec_mp->b_rptr; 20038 ipsec_action_t *a; 20039 20040 ASSERT(io->ipsec_out_type == IPSEC_OUT); 20041 if (!io->ipsec_out_secure) 20042 return (0); 20043 20044 a = io->ipsec_out_act; 20045 20046 if (a == NULL) { 20047 ASSERT(io->ipsec_out_policy != NULL); 20048 a = io->ipsec_out_policy->ipsp_act; 20049 } 20050 ASSERT(a != NULL); 20051 20052 return (a->ipa_ovhd); 20053 } 20054 20055 /* 20056 * Returns an estimate of the IPSEC headers size. This is used if 20057 * we don't want to call into IPSEC to get the exact size. 20058 */ 20059 int 20060 ipsec_in_extra_length(mblk_t *ipsec_mp) 20061 { 20062 ipsec_in_t *ii = (ipsec_in_t *)ipsec_mp->b_rptr; 20063 ipsec_action_t *a; 20064 20065 ASSERT(ii->ipsec_in_type == IPSEC_IN); 20066 20067 a = ii->ipsec_in_action; 20068 return (a == NULL ? 0 : a->ipa_ovhd); 20069 } 20070 20071 /* 20072 * If there are any source route options, return the true final 20073 * destination. Otherwise, return the destination. 
20074 */ 20075 ipaddr_t 20076 ip_get_dst(ipha_t *ipha) 20077 { 20078 ipoptp_t opts; 20079 uchar_t *opt; 20080 uint8_t optval; 20081 uint8_t optlen; 20082 ipaddr_t dst; 20083 uint32_t off; 20084 20085 dst = ipha->ipha_dst; 20086 20087 if (IS_SIMPLE_IPH(ipha)) 20088 return (dst); 20089 20090 for (optval = ipoptp_first(&opts, ipha); 20091 optval != IPOPT_EOL; 20092 optval = ipoptp_next(&opts)) { 20093 opt = opts.ipoptp_cur; 20094 optlen = opts.ipoptp_len; 20095 ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0); 20096 switch (optval) { 20097 case IPOPT_SSRR: 20098 case IPOPT_LSRR: 20099 off = opt[IPOPT_OFFSET]; 20100 /* 20101 * If one of the conditions is true, it means 20102 * end of options and dst already has the right 20103 * value. 20104 */ 20105 if (!(optlen < IP_ADDR_LEN || off > optlen - 3)) { 20106 off = optlen - IP_ADDR_LEN; 20107 bcopy(&opt[off], &dst, IP_ADDR_LEN); 20108 } 20109 return (dst); 20110 default: 20111 break; 20112 } 20113 } 20114 20115 return (dst); 20116 } 20117 20118 mblk_t * 20119 ip_wput_ire_parse_ipsec_out(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h, ire_t *ire, 20120 conn_t *connp, boolean_t unspec_src) 20121 { 20122 ipsec_out_t *io; 20123 mblk_t *first_mp; 20124 boolean_t policy_present; 20125 20126 first_mp = mp; 20127 if (mp->b_datap->db_type == M_CTL) { 20128 io = (ipsec_out_t *)first_mp->b_rptr; 20129 /* 20130 * ip_wput[_v6] attaches an IPSEC_OUT in two cases. 20131 * 20132 * 1) There is per-socket policy (including cached global 20133 * policy). 20134 * 2) There is no per-socket policy, but it is 20135 * a multicast packet that needs to go out 20136 * on a specific interface. This is the case 20137 * where (ip_wput and ip_wput_multicast) attaches 20138 * an IPSEC_OUT and sets ipsec_out_secure B_FALSE. 20139 * 20140 * In case (2) we check with global policy to 20141 * see if there is a match and set the ill_index 20142 * appropriately so that we can lookup the ire 20143 * properly in ip_wput_ipsec_out. 
20144 */ 20145 20146 /* 20147 * ipsec_out_use_global_policy is set to B_FALSE 20148 * in ipsec_in_to_out(). Refer to that function for 20149 * details. 20150 */ 20151 if ((io->ipsec_out_latch == NULL) && 20152 (io->ipsec_out_use_global_policy)) { 20153 return (ip_wput_attach_policy(first_mp, ipha, ip6h, 20154 ire, connp, unspec_src)); 20155 } 20156 if (!io->ipsec_out_secure) { 20157 /* 20158 * If this is not a secure packet, drop 20159 * the IPSEC_OUT mp and treat it as a clear 20160 * packet. This happens when we are sending 20161 * a ICMP reply back to a clear packet. See 20162 * ipsec_in_to_out() for details. 20163 */ 20164 mp = first_mp->b_cont; 20165 freeb(first_mp); 20166 } 20167 return (mp); 20168 } 20169 /* 20170 * See whether we need to attach a global policy here. We 20171 * don't depend on the conn (as it could be null) for deciding 20172 * what policy this datagram should go through because it 20173 * should have happened in ip_wput if there was some 20174 * policy. This normally happens for connections which are not 20175 * fully bound preventing us from caching policies in 20176 * ip_bind. Packets coming from the TCP listener/global queue 20177 * - which are non-hard_bound - could also be affected by 20178 * applying policy here. 20179 * 20180 * If this packet is coming from tcp global queue or listener, 20181 * we will be applying policy here. This may not be *right* 20182 * if these packets are coming from the detached connection as 20183 * it could have gone in clear before. This happens only if a 20184 * TCP connection started when there is no policy and somebody 20185 * added policy before it became detached. Thus packets of the 20186 * detached connection could go out secure and the other end 20187 * would drop it because it will be expecting in clear. 
The 20188 * converse is not true i.e if somebody starts a TCP 20189 * connection and deletes the policy, all the packets will 20190 * still go out with the policy that existed before deleting 20191 * because ip_unbind sends up policy information which is used 20192 * by TCP on subsequent ip_wputs. The right solution is to fix 20193 * TCP to attach a dummy IPSEC_OUT and set 20194 * ipsec_out_use_global_policy to B_FALSE. As this might 20195 * affect performance for normal cases, we are not doing it. 20196 * Thus, set policy before starting any TCP connections. 20197 * 20198 * NOTE - We might apply policy even for a hard bound connection 20199 * - for which we cached policy in ip_bind - if somebody added 20200 * global policy after we inherited the policy in ip_bind. 20201 * This means that the packets that were going out in clear 20202 * previously would start going secure and hence get dropped 20203 * on the other side. To fix this, TCP attaches a dummy 20204 * ipsec_out and make sure that we don't apply global policy. 
20205 */ 20206 if (ipha != NULL) 20207 policy_present = ipsec_outbound_v4_policy_present; 20208 else 20209 policy_present = ipsec_outbound_v6_policy_present; 20210 if (!policy_present) 20211 return (mp); 20212 20213 return (ip_wput_attach_policy(mp, ipha, ip6h, ire, connp, unspec_src)); 20214 } 20215 20216 ire_t * 20217 conn_set_outgoing_ill(conn_t *connp, ire_t *ire, ill_t **conn_outgoing_ill) 20218 { 20219 ipaddr_t addr; 20220 ire_t *save_ire; 20221 irb_t *irb; 20222 ill_group_t *illgrp; 20223 int err; 20224 20225 save_ire = ire; 20226 addr = ire->ire_addr; 20227 20228 ASSERT(ire->ire_type == IRE_BROADCAST); 20229 20230 illgrp = connp->conn_outgoing_ill->ill_group; 20231 if (illgrp == NULL) { 20232 *conn_outgoing_ill = conn_get_held_ill(connp, 20233 &connp->conn_outgoing_ill, &err); 20234 if (err == ILL_LOOKUP_FAILED) { 20235 ire_refrele(save_ire); 20236 return (NULL); 20237 } 20238 return (save_ire); 20239 } 20240 /* 20241 * If IP_BOUND_IF has been done, conn_outgoing_ill will be set. 20242 * If it is part of the group, we need to send on the ire 20243 * that has been cleared of IRE_MARK_NORECV and that belongs 20244 * to this group. This is okay as IP_BOUND_IF really means 20245 * any ill in the group. We depend on the fact that the 20246 * first ire in the group is always cleared of IRE_MARK_NORECV 20247 * if such an ire exists. This is possible only if you have 20248 * at least one ill in the group that has not failed. 20249 * 20250 * First get to the ire that matches the address and group. 20251 * 20252 * We don't look for an ire with a matching zoneid because a given zone 20253 * won't always have broadcast ires on all ills in the group. 
20254 */ 20255 irb = ire->ire_bucket; 20256 rw_enter(&irb->irb_lock, RW_READER); 20257 if (ire->ire_marks & IRE_MARK_NORECV) { 20258 /* 20259 * If the current zone only has an ire broadcast for this 20260 * address marked NORECV, the ire we want is ahead in the 20261 * bucket, so we look it up deliberately ignoring the zoneid. 20262 */ 20263 for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { 20264 if (ire->ire_addr != addr) 20265 continue; 20266 /* skip over deleted ires */ 20267 if (ire->ire_marks & IRE_MARK_CONDEMNED) 20268 continue; 20269 } 20270 } 20271 while (ire != NULL) { 20272 /* 20273 * If a new interface is coming up, we could end up 20274 * seeing the loopback ire and the non-loopback ire 20275 * may not have been added yet. So check for ire_stq 20276 */ 20277 if (ire->ire_stq != NULL && (ire->ire_addr != addr || 20278 ire->ire_ipif->ipif_ill->ill_group == illgrp)) { 20279 break; 20280 } 20281 ire = ire->ire_next; 20282 } 20283 if (ire != NULL && ire->ire_addr == addr && 20284 ire->ire_ipif->ipif_ill->ill_group == illgrp) { 20285 IRE_REFHOLD(ire); 20286 rw_exit(&irb->irb_lock); 20287 ire_refrele(save_ire); 20288 *conn_outgoing_ill = ire_to_ill(ire); 20289 /* 20290 * Refhold the ill to make the conn_outgoing_ill 20291 * independent of the ire. ip_wput_ire goes in a loop 20292 * and may refrele the ire. Since we have an ire at this 20293 * point we don't need to use ILL_CAN_LOOKUP on the ill. 20294 */ 20295 ill_refhold(*conn_outgoing_ill); 20296 return (ire); 20297 } 20298 rw_exit(&irb->irb_lock); 20299 ip1dbg(("conn_set_outgoing_ill: No matching ire\n")); 20300 /* 20301 * If we can't find a suitable ire, return the original ire. 20302 */ 20303 return (save_ire); 20304 } 20305 20306 /* 20307 * This function does the ire_refrele of the ire passed in as the 20308 * argument. As this function looks up more ires i.e broadcast ires, 20309 * it needs to REFRELE them. 
Currently, for simplicity we don't 20310 * differentiate the one passed in and looked up here. We always 20311 * REFRELE. 20312 * IPQoS Notes: 20313 * IP policy is invoked if IPP_LOCAL_OUT is enabled. Processing for 20314 * IPSec packets are done in ipsec_out_process. 20315 * 20316 */ 20317 void 20318 ip_wput_ire(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, int caller) 20319 { 20320 ipha_t *ipha; 20321 #define rptr ((uchar_t *)ipha) 20322 mblk_t *mp1; 20323 queue_t *stq; 20324 #define Q_TO_INDEX(stq) (((ill_t *)stq->q_ptr)->ill_phyint->phyint_ifindex) 20325 uint32_t v_hlen_tos_len; 20326 uint32_t ttl_protocol; 20327 ipaddr_t src; 20328 ipaddr_t dst; 20329 uint32_t cksum; 20330 ipaddr_t orig_src; 20331 ire_t *ire1; 20332 mblk_t *next_mp; 20333 uint_t hlen; 20334 uint16_t *up; 20335 uint32_t max_frag = ire->ire_max_frag; 20336 ill_t *ill = ire_to_ill(ire); 20337 int clusterwide; 20338 uint16_t ip_hdr_included; /* IP header included by ULP? */ 20339 int ipsec_len; 20340 mblk_t *first_mp; 20341 ipsec_out_t *io; 20342 boolean_t conn_dontroute; /* conn value for multicast */ 20343 boolean_t conn_multicast_loop; /* conn value for multicast */ 20344 boolean_t multicast_forward; /* Should we forward ? */ 20345 boolean_t unspec_src; 20346 ill_t *conn_outgoing_ill = NULL; 20347 ill_t *ire_ill; 20348 ill_t *ire1_ill; 20349 uint32_t ill_index = 0; 20350 boolean_t multirt_send = B_FALSE; 20351 int err; 20352 zoneid_t zoneid; 20353 20354 TRACE_1(TR_FAC_IP, TR_IP_WPUT_IRE_START, 20355 "ip_wput_ire_start: q %p", q); 20356 20357 multicast_forward = B_FALSE; 20358 unspec_src = (connp != NULL && connp->conn_unspec_src); 20359 20360 if (ire->ire_flags & RTF_MULTIRT) { 20361 /* 20362 * Multirouting case. The bucket where ire is stored 20363 * probably holds other RTF_MULTIRT flagged ire 20364 * to the destination. In this call to ip_wput_ire, 20365 * we attempt to send the packet through all 20366 * those ires. 
Thus, we first ensure that ire is the 20367 * first RTF_MULTIRT ire in the bucket, 20368 * before walking the ire list. 20369 */ 20370 ire_t *first_ire; 20371 irb_t *irb = ire->ire_bucket; 20372 ASSERT(irb != NULL); 20373 20374 /* Make sure we do not omit any multiroute ire. */ 20375 IRB_REFHOLD(irb); 20376 for (first_ire = irb->irb_ire; 20377 first_ire != NULL; 20378 first_ire = first_ire->ire_next) { 20379 if ((first_ire->ire_flags & RTF_MULTIRT) && 20380 (first_ire->ire_addr == ire->ire_addr) && 20381 !(first_ire->ire_marks & 20382 (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN))) 20383 break; 20384 } 20385 20386 if ((first_ire != NULL) && (first_ire != ire)) { 20387 IRE_REFHOLD(first_ire); 20388 ire_refrele(ire); 20389 ire = first_ire; 20390 ill = ire_to_ill(ire); 20391 } 20392 IRB_REFRELE(irb); 20393 } 20394 20395 /* 20396 * conn_outgoing_ill is used only in the broadcast loop. 20397 * for performance we don't grab the mutexs in the fastpath 20398 */ 20399 if ((connp != NULL) && 20400 (connp->conn_xmit_if_ill == NULL) && 20401 (ire->ire_type == IRE_BROADCAST) && 20402 ((connp->conn_nofailover_ill != NULL) || 20403 (connp->conn_outgoing_ill != NULL))) { 20404 /* 20405 * Bind to IPIF_NOFAILOVER address overrides IP_BOUND_IF 20406 * option. So, see if this endpoint is bound to a 20407 * IPIF_NOFAILOVER address. If so, honor it. This implies 20408 * that if the interface is failed, we will still send 20409 * the packet on the same ill which is what we want. 20410 */ 20411 conn_outgoing_ill = conn_get_held_ill(connp, 20412 &connp->conn_nofailover_ill, &err); 20413 if (err == ILL_LOOKUP_FAILED) { 20414 ire_refrele(ire); 20415 freemsg(mp); 20416 return; 20417 } 20418 if (conn_outgoing_ill == NULL) { 20419 /* 20420 * Choose a good ill in the group to send the 20421 * packets on. 
20422 */ 20423 ire = conn_set_outgoing_ill(connp, ire, 20424 &conn_outgoing_ill); 20425 if (ire == NULL) { 20426 freemsg(mp); 20427 return; 20428 } 20429 } 20430 } 20431 20432 if (mp->b_datap->db_type != M_CTL) { 20433 ipha = (ipha_t *)mp->b_rptr; 20434 zoneid = (connp != NULL ? connp->conn_zoneid : ALL_ZONES); 20435 } else { 20436 io = (ipsec_out_t *)mp->b_rptr; 20437 ASSERT(io->ipsec_out_type == IPSEC_OUT); 20438 zoneid = io->ipsec_out_zoneid; 20439 ASSERT(zoneid != ALL_ZONES); 20440 ipha = (ipha_t *)mp->b_cont->b_rptr; 20441 dst = ipha->ipha_dst; 20442 /* 20443 * For the multicast case, ipsec_out carries conn_dontroute and 20444 * conn_multicast_loop as conn may not be available here. We 20445 * need this for multicast loopback and forwarding which is done 20446 * later in the code. 20447 */ 20448 if (CLASSD(dst)) { 20449 conn_dontroute = io->ipsec_out_dontroute; 20450 conn_multicast_loop = io->ipsec_out_multicast_loop; 20451 /* 20452 * If conn_dontroute is not set or conn_multicast_loop 20453 * is set, we need to do forwarding/loopback. For 20454 * datagrams from ip_wput_multicast, conn_dontroute is 20455 * set to B_TRUE and conn_multicast_loop is set to 20456 * B_FALSE so that we neither do forwarding nor 20457 * loopback. 20458 */ 20459 if (!conn_dontroute || conn_multicast_loop) 20460 multicast_forward = B_TRUE; 20461 } 20462 } 20463 20464 if (ire->ire_type == IRE_LOCAL && ire->ire_zoneid != zoneid && 20465 ire->ire_zoneid != ALL_ZONES) { 20466 /* 20467 * When a zone sends a packet to another zone, we try to deliver 20468 * the packet under the same conditions as if the destination 20469 * was a real node on the network. To do so, we look for a 20470 * matching route in the forwarding table. 20471 * RTF_REJECT and RTF_BLACKHOLE are handled just like 20472 * ip_newroute() does. 
20473 */ 20474 ire_t *src_ire = ire_ftable_lookup(ipha->ipha_dst, 0, 0, 0, 20475 NULL, NULL, zoneid, 0, NULL, (MATCH_IRE_RECURSIVE | 20476 MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE)); 20477 if (src_ire != NULL && 20478 !(src_ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE))) { 20479 if (ipha->ipha_src == INADDR_ANY && !unspec_src) 20480 ipha->ipha_src = src_ire->ire_src_addr; 20481 ire_refrele(src_ire); 20482 } else { 20483 ire_refrele(ire); 20484 if (conn_outgoing_ill != NULL) 20485 ill_refrele(conn_outgoing_ill); 20486 BUMP_MIB(&ip_mib, ipOutNoRoutes); 20487 if (src_ire != NULL) { 20488 if (src_ire->ire_flags & RTF_BLACKHOLE) { 20489 ire_refrele(src_ire); 20490 freemsg(mp); 20491 return; 20492 } 20493 ire_refrele(src_ire); 20494 } 20495 if (ip_hdr_complete(ipha, zoneid)) { 20496 /* Failed */ 20497 freemsg(mp); 20498 return; 20499 } 20500 icmp_unreachable(q, mp, ICMP_HOST_UNREACHABLE); 20501 return; 20502 } 20503 } 20504 20505 if (mp->b_datap->db_type == M_CTL || 20506 ipsec_outbound_v4_policy_present) { 20507 mp = ip_wput_ire_parse_ipsec_out(mp, ipha, NULL, ire, connp, 20508 unspec_src); 20509 if (mp == NULL) { 20510 ire_refrele(ire); 20511 if (conn_outgoing_ill != NULL) 20512 ill_refrele(conn_outgoing_ill); 20513 return; 20514 } 20515 } 20516 20517 first_mp = mp; 20518 ipsec_len = 0; 20519 20520 if (first_mp->b_datap->db_type == M_CTL) { 20521 io = (ipsec_out_t *)first_mp->b_rptr; 20522 ASSERT(io->ipsec_out_type == IPSEC_OUT); 20523 mp = first_mp->b_cont; 20524 ipsec_len = ipsec_out_extra_length(first_mp); 20525 ASSERT(ipsec_len >= 0); 20526 zoneid = io->ipsec_out_zoneid; 20527 ASSERT(zoneid != ALL_ZONES); 20528 20529 /* 20530 * Drop M_CTL here if IPsec processing is not needed. 20531 * (Non-IPsec use of M_CTL extracted any information it 20532 * needed above). 
20533 */ 20534 if (ipsec_len == 0) { 20535 freeb(first_mp); 20536 first_mp = mp; 20537 } 20538 } 20539 20540 /* 20541 * Fast path for ip_wput_ire 20542 */ 20543 20544 ipha = (ipha_t *)mp->b_rptr; 20545 v_hlen_tos_len = ((uint32_t *)ipha)[0]; 20546 dst = ipha->ipha_dst; 20547 20548 /* 20549 * ICMP(RAWIP) module should set the ipha_ident to IP_HDR_INCLUDED 20550 * if the socket is a SOCK_RAW type. The transport checksum should 20551 * be provided in the pre-built packet, so we don't need to compute it. 20552 * Also, other application set flags, like DF, should not be altered. 20553 * Other transport MUST pass down zero. 20554 */ 20555 ip_hdr_included = ipha->ipha_ident; 20556 ASSERT(ipha->ipha_ident == 0 || ipha->ipha_ident == IP_HDR_INCLUDED); 20557 20558 if (CLASSD(dst)) { 20559 ip1dbg(("ip_wput_ire: to 0x%x ire %s addr 0x%x\n", 20560 ntohl(dst), 20561 ip_nv_lookup(ire_nv_tbl, ire->ire_type), 20562 ntohl(ire->ire_addr))); 20563 } 20564 20565 /* Macros to extract header fields from data already in registers */ 20566 #ifdef _BIG_ENDIAN 20567 #define V_HLEN (v_hlen_tos_len >> 24) 20568 #define LENGTH (v_hlen_tos_len & 0xFFFF) 20569 #define PROTO (ttl_protocol & 0xFF) 20570 #else 20571 #define V_HLEN (v_hlen_tos_len & 0xFF) 20572 #define LENGTH ((v_hlen_tos_len >> 24) | ((v_hlen_tos_len >> 8) & 0xFF00)) 20573 #define PROTO (ttl_protocol >> 8) 20574 #endif 20575 20576 20577 orig_src = src = ipha->ipha_src; 20578 /* (The loop back to "another" is explained down below.) */ 20579 another:; 20580 /* 20581 * Assign an ident value for this packet. We assign idents on 20582 * a per destination basis out of the IRE. There could be 20583 * other threads targeting the same destination, so we have to 20584 * arrange for a atomic increment. Note that we use a 32-bit 20585 * atomic add because it has better performance than its 20586 * 16-bit sibling. 
20587 * 20588 * If running in cluster mode and if the source address 20589 * belongs to a replicated service then vector through 20590 * cl_inet_ipident vector to allocate ip identifier 20591 * NOTE: This is a contract private interface with the 20592 * clustering group. 20593 */ 20594 clusterwide = 0; 20595 if (cl_inet_ipident) { 20596 ASSERT(cl_inet_isclusterwide); 20597 if ((*cl_inet_isclusterwide)(IPPROTO_IP, 20598 AF_INET, (uint8_t *)(uintptr_t)src)) { 20599 ipha->ipha_ident = (*cl_inet_ipident)(IPPROTO_IP, 20600 AF_INET, (uint8_t *)(uintptr_t)src, 20601 (uint8_t *)(uintptr_t)dst); 20602 clusterwide = 1; 20603 } 20604 } 20605 if (!clusterwide) { 20606 ipha->ipha_ident = 20607 (uint16_t)atomic_add_32_nv(&ire->ire_ident, 1); 20608 } 20609 20610 #ifndef _BIG_ENDIAN 20611 ipha->ipha_ident = (ipha->ipha_ident << 8) | (ipha->ipha_ident >> 8); 20612 #endif 20613 20614 /* 20615 * Set source address unless sent on an ill or conn_unspec_src is set. 20616 * This is needed to obey conn_unspec_src when packets go through 20617 * ip_newroute + arp. 20618 * Assumes ip_newroute{,_multi} sets the source address as well. 20619 */ 20620 if (src == INADDR_ANY && !unspec_src) { 20621 /* 20622 * Assign the appropriate source address from the IRE if none 20623 * was specified. 20624 */ 20625 ASSERT(ire->ire_ipversion == IPV4_VERSION); 20626 20627 /* 20628 * With IP multipathing, broadcast packets are sent on the ire 20629 * that has been cleared of IRE_MARK_NORECV and that belongs to 20630 * the group. However, this ire might not be in the same zone so 20631 * we can't always use its source address. We look for a 20632 * broadcast ire in the same group and in the right zone. 
20633 */ 20634 if (ire->ire_type == IRE_BROADCAST && 20635 ire->ire_zoneid != zoneid) { 20636 ire_t *src_ire = ire_ctable_lookup(dst, 0, 20637 IRE_BROADCAST, ire->ire_ipif, zoneid, NULL, 20638 (MATCH_IRE_TYPE | MATCH_IRE_ILL_GROUP)); 20639 if (src_ire != NULL) { 20640 src = src_ire->ire_src_addr; 20641 ire_refrele(src_ire); 20642 } else { 20643 ire_refrele(ire); 20644 if (conn_outgoing_ill != NULL) 20645 ill_refrele(conn_outgoing_ill); 20646 freemsg(first_mp); 20647 BUMP_MIB(&ip_mib, ipOutDiscards); 20648 return; 20649 } 20650 } else { 20651 src = ire->ire_src_addr; 20652 } 20653 20654 if (connp == NULL) { 20655 ip1dbg(("ip_wput_ire: no connp and no src " 20656 "address for dst 0x%x, using src 0x%x\n", 20657 ntohl(dst), 20658 ntohl(src))); 20659 } 20660 ipha->ipha_src = src; 20661 } 20662 stq = ire->ire_stq; 20663 20664 /* 20665 * We only allow ire chains for broadcasts since there will 20666 * be multiple IRE_CACHE entries for the same multicast 20667 * address (one per ipif). 20668 */ 20669 next_mp = NULL; 20670 20671 /* broadcast packet */ 20672 if (ire->ire_type == IRE_BROADCAST) 20673 goto broadcast; 20674 20675 /* loopback ? 
*/ 20676 if (stq == NULL) 20677 goto nullstq; 20678 20679 /* The ill_index for outbound ILL */ 20680 ill_index = Q_TO_INDEX(stq); 20681 20682 BUMP_MIB(&ip_mib, ipOutRequests); 20683 ttl_protocol = ((uint16_t *)ipha)[4]; 20684 20685 /* pseudo checksum (do it in parts for IP header checksum) */ 20686 cksum = (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF); 20687 20688 if (!IP_FLOW_CONTROLLED_ULP(PROTO)) { 20689 queue_t *dev_q = stq->q_next; 20690 20691 /* flow controlled */ 20692 if ((dev_q->q_next || dev_q->q_first) && 20693 !canput(dev_q)) 20694 goto blocked; 20695 if ((PROTO == IPPROTO_UDP) && 20696 (ip_hdr_included != IP_HDR_INCLUDED)) { 20697 hlen = (V_HLEN & 0xF) << 2; 20698 up = IPH_UDPH_CHECKSUMP(ipha, hlen); 20699 if (*up != 0) { 20700 IP_CKSUM_XMIT(ill, ire, mp, ipha, up, PROTO, 20701 hlen, LENGTH, max_frag, ipsec_len, cksum); 20702 /* Software checksum? */ 20703 if (DB_CKSUMFLAGS(mp) == 0) { 20704 IP_STAT(ip_out_sw_cksum); 20705 IP_STAT_UPDATE( 20706 ip_udp_out_sw_cksum_bytes, 20707 LENGTH - hlen); 20708 } 20709 } 20710 } 20711 } else if (ip_hdr_included != IP_HDR_INCLUDED) { 20712 hlen = (V_HLEN & 0xF) << 2; 20713 if (PROTO == IPPROTO_TCP) { 20714 up = IPH_TCPH_CHECKSUMP(ipha, hlen); 20715 /* 20716 * The packet header is processed once and for all, even 20717 * in the multirouting case. We disable hardware 20718 * checksum if the packet is multirouted, as it will be 20719 * replicated via several interfaces, and not all of 20720 * them may have this capability. 20721 */ 20722 IP_CKSUM_XMIT(ill, ire, mp, ipha, up, PROTO, hlen, 20723 LENGTH, max_frag, ipsec_len, cksum); 20724 /* Software checksum? 
*/ 20725 if (DB_CKSUMFLAGS(mp) == 0) { 20726 IP_STAT(ip_out_sw_cksum); 20727 IP_STAT_UPDATE(ip_tcp_out_sw_cksum_bytes, 20728 LENGTH - hlen); 20729 } 20730 } else { 20731 sctp_hdr_t *sctph; 20732 20733 ASSERT(PROTO == IPPROTO_SCTP); 20734 ASSERT(MBLKL(mp) >= (hlen + sizeof (*sctph))); 20735 sctph = (sctp_hdr_t *)(mp->b_rptr + hlen); 20736 /* 20737 * Zero out the checksum field to ensure proper 20738 * checksum calculation. 20739 */ 20740 sctph->sh_chksum = 0; 20741 #ifdef DEBUG 20742 if (!skip_sctp_cksum) 20743 #endif 20744 sctph->sh_chksum = sctp_cksum(mp, hlen); 20745 } 20746 } 20747 20748 /* 20749 * If this is a multicast packet and originated from ip_wput 20750 * we need to do loopback and forwarding checks. If it comes 20751 * from ip_wput_multicast, we SHOULD not do this. 20752 */ 20753 if (CLASSD(ipha->ipha_dst) && multicast_forward) goto multi_loopback; 20754 20755 /* checksum */ 20756 cksum += ttl_protocol; 20757 20758 /* fragment the packet */ 20759 if (max_frag < (uint_t)(LENGTH + ipsec_len)) 20760 goto fragmentit; 20761 /* 20762 * Don't use frag_flag if packet is pre-built or source 20763 * routed or if multicast (since multicast packets do 20764 * not solicit ICMP "packet too big" messages). 
20765 */ 20766 if ((ip_hdr_included != IP_HDR_INCLUDED) && 20767 (V_HLEN == IP_SIMPLE_HDR_VERSION || 20768 !ip_source_route_included(ipha)) && 20769 !CLASSD(ipha->ipha_dst)) 20770 ipha->ipha_fragment_offset_and_flags |= 20771 htons(ire->ire_frag_flag); 20772 20773 if (!(DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM)) { 20774 /* calculate IP header checksum */ 20775 cksum += ipha->ipha_ident; 20776 cksum += (v_hlen_tos_len >> 16)+(v_hlen_tos_len & 0xFFFF); 20777 cksum += ipha->ipha_fragment_offset_and_flags; 20778 20779 /* IP options present */ 20780 hlen = (V_HLEN & 0xF) - IP_SIMPLE_HDR_LENGTH_IN_WORDS; 20781 if (hlen) 20782 goto checksumoptions; 20783 20784 /* calculate hdr checksum */ 20785 cksum = ((cksum & 0xFFFF) + (cksum >> 16)); 20786 cksum = ~(cksum + (cksum >> 16)); 20787 ipha->ipha_hdr_checksum = (uint16_t)cksum; 20788 } 20789 if (ipsec_len != 0) { 20790 /* 20791 * We will do the rest of the processing after 20792 * we come back from IPSEC in ip_wput_ipsec_out(). 20793 */ 20794 ASSERT(MBLKL(first_mp) >= sizeof (ipsec_out_t)); 20795 20796 io = (ipsec_out_t *)first_mp->b_rptr; 20797 io->ipsec_out_ill_index = ((ill_t *)stq->q_ptr)-> 20798 ill_phyint->phyint_ifindex; 20799 20800 ipsec_out_process(q, first_mp, ire, ill_index); 20801 ire_refrele(ire); 20802 if (conn_outgoing_ill != NULL) 20803 ill_refrele(conn_outgoing_ill); 20804 return; 20805 } 20806 20807 /* 20808 * In most cases, the emission loop below is entered only 20809 * once. Only in the case where the ire holds the 20810 * RTF_MULTIRT flag, do we loop to process all RTF_MULTIRT 20811 * flagged ires in the bucket, and send the packet 20812 * through all crossed RTF_MULTIRT routes. 20813 */ 20814 if (ire->ire_flags & RTF_MULTIRT) { 20815 multirt_send = B_TRUE; 20816 } 20817 do { 20818 if (multirt_send) { 20819 irb_t *irb; 20820 /* 20821 * We are in a multiple send case, need to get 20822 * the next ire and make a duplicate of the packet. 20823 * ire1 holds here the next ire to process in the 20824 * bucket. 
If multirouting is expected, 20825 * any non-RTF_MULTIRT ire that has the 20826 * right destination address is ignored. 20827 */ 20828 irb = ire->ire_bucket; 20829 ASSERT(irb != NULL); 20830 20831 IRB_REFHOLD(irb); 20832 for (ire1 = ire->ire_next; 20833 ire1 != NULL; 20834 ire1 = ire1->ire_next) { 20835 if ((ire1->ire_flags & RTF_MULTIRT) == 0) 20836 continue; 20837 if (ire1->ire_addr != ire->ire_addr) 20838 continue; 20839 if (ire1->ire_marks & 20840 (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN)) 20841 continue; 20842 20843 /* Got one */ 20844 IRE_REFHOLD(ire1); 20845 break; 20846 } 20847 IRB_REFRELE(irb); 20848 20849 if (ire1 != NULL) { 20850 next_mp = copyb(mp); 20851 if ((next_mp == NULL) || 20852 ((mp->b_cont != NULL) && 20853 ((next_mp->b_cont = 20854 dupmsg(mp->b_cont)) == NULL))) { 20855 freemsg(next_mp); 20856 next_mp = NULL; 20857 ire_refrele(ire1); 20858 ire1 = NULL; 20859 } 20860 } 20861 20862 /* Last multiroute ire; don't loop anymore. */ 20863 if (ire1 == NULL) { 20864 multirt_send = B_FALSE; 20865 } 20866 } 20867 mp = ip_wput_attach_llhdr(mp, ire, IPP_LOCAL_OUT, ill_index); 20868 if (mp == NULL) { 20869 BUMP_MIB(&ip_mib, ipOutDiscards); 20870 ip2dbg(("ip_wput_ire: fastpath wput pkt dropped "\ 20871 "during IPPF processing\n")); 20872 ire_refrele(ire); 20873 if (next_mp != NULL) { 20874 freemsg(next_mp); 20875 ire_refrele(ire1); 20876 } 20877 if (conn_outgoing_ill != NULL) 20878 ill_refrele(conn_outgoing_ill); 20879 return; 20880 } 20881 UPDATE_OB_PKT_COUNT(ire); 20882 ire->ire_last_used_time = lbolt; 20883 20884 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 20885 "ip_wput_ire_end: q %p (%S)", 20886 q, "last copy out"); 20887 putnext(stq, mp); 20888 IRE_REFRELE(ire); 20889 20890 if (multirt_send) { 20891 ASSERT(ire1); 20892 /* 20893 * Proceed with the next RTF_MULTIRT ire, 20894 * Also set up the send-to queue accordingly. 
20895 */ 20896 ire = ire1; 20897 ire1 = NULL; 20898 stq = ire->ire_stq; 20899 mp = next_mp; 20900 next_mp = NULL; 20901 ipha = (ipha_t *)mp->b_rptr; 20902 ill_index = Q_TO_INDEX(stq); 20903 } 20904 } while (multirt_send); 20905 if (conn_outgoing_ill != NULL) 20906 ill_refrele(conn_outgoing_ill); 20907 return; 20908 20909 /* 20910 * ire->ire_type == IRE_BROADCAST (minimize diffs) 20911 */ 20912 broadcast: 20913 { 20914 /* 20915 * Avoid broadcast storms by setting the ttl to 1 20916 * for broadcasts. This parameter can be set 20917 * via ndd, so make sure that for the SO_DONTROUTE 20918 * case that ipha_ttl is always set to 1. 20919 * In the event that we are replying to incoming 20920 * ICMP packets, conn could be NULL. 20921 */ 20922 if ((connp != NULL) && connp->conn_dontroute) 20923 ipha->ipha_ttl = 1; 20924 else 20925 ipha->ipha_ttl = ip_broadcast_ttl; 20926 20927 /* 20928 * Note that we are not doing a IRB_REFHOLD here. 20929 * Actually we don't care if the list changes i.e 20930 * if somebody deletes an IRE from the list while 20931 * we drop the lock, the next time we come around 20932 * ire_next will be NULL and hence we won't send 20933 * out multiple copies which is fine. 
20934 */ 20935 rw_enter(&ire->ire_bucket->irb_lock, RW_READER); 20936 ire1 = ire->ire_next; 20937 if (conn_outgoing_ill != NULL) { 20938 while (ire->ire_ipif->ipif_ill != conn_outgoing_ill) { 20939 ASSERT(ire1 == ire->ire_next); 20940 if (ire1 != NULL && ire1->ire_addr == dst) { 20941 ire_refrele(ire); 20942 ire = ire1; 20943 IRE_REFHOLD(ire); 20944 ire1 = ire->ire_next; 20945 continue; 20946 } 20947 rw_exit(&ire->ire_bucket->irb_lock); 20948 /* Did not find a matching ill */ 20949 ip1dbg(("ip_wput_ire: broadcast with no " 20950 "matching IP_BOUND_IF ill %s\n", 20951 conn_outgoing_ill->ill_name)); 20952 freemsg(first_mp); 20953 if (ire != NULL) 20954 ire_refrele(ire); 20955 ill_refrele(conn_outgoing_ill); 20956 return; 20957 } 20958 } else if (ire1 != NULL && ire1->ire_addr == dst) { 20959 /* 20960 * If the next IRE has the same address and is not one 20961 * of the two copies that we need to send, try to see 20962 * whether this copy should be sent at all. This 20963 * assumes that we insert loopbacks first and then 20964 * non-loopbacks. This is achieved by inserting the 20965 * loopback always before non-loopback. 20966 * This is used to send a single copy of a broadcast 20967 * packet out all physical interfaces that have a 20968 * matching IRE_BROADCAST while also looping 20969 * back one copy (to ip_wput_local) for each 20970 * matching physical interface. However, we avoid 20971 * sending packets out different logical interfaces that match by 20972 * having ipif_up/ipif_down suppress duplicate 20973 * IRE_BROADCASTS. 20974 * 20975 * This feature is currently used to get broadcasts 20976 * sent to multiple interfaces, when the broadcast 20977 * address being used applies to multiple interfaces. 20978 * For example, a whole net broadcast will be 20979 * replicated on every connected subnet of 20980 * the target net.
20981 * 20982 * Each zone has its own set of IRE_BROADCASTs, so that 20983 * we're able to distribute inbound packets to multiple 20984 * zones who share a broadcast address. We avoid looping 20985 * back outbound packets in different zones but on the 20986 * same ill, as the application would see duplicates. 20987 * 20988 * If the interfaces are part of the same group, 20989 * we would want to send only one copy out for 20990 * whole group. 20991 * 20992 * This logic assumes that ire_add_v4() groups the 20993 * IRE_BROADCAST entries so that those with the same 20994 * ire_addr and ill_group are kept together. 20995 */ 20996 ire_ill = ire->ire_ipif->ipif_ill; 20997 if (ire->ire_stq == NULL && ire1->ire_stq != NULL) { 20998 if (ire_ill->ill_group != NULL && 20999 (ire->ire_marks & IRE_MARK_NORECV)) { 21000 /* 21001 * If the current zone only has an ire 21002 * broadcast for this address marked 21003 * NORECV, the ire we want is ahead in 21004 * the bucket, so we look it up 21005 * deliberately ignoring the zoneid. 
21006 */ 21007 for (ire1 = ire->ire_bucket->irb_ire; 21008 ire1 != NULL; 21009 ire1 = ire1->ire_next) { 21010 ire1_ill = 21011 ire1->ire_ipif->ipif_ill; 21012 if (ire1->ire_addr != dst) 21013 continue; 21014 /* skip over the current ire */ 21015 if (ire1 == ire) 21016 continue; 21017 /* skip over deleted ires */ 21018 if (ire1->ire_marks & 21019 IRE_MARK_CONDEMNED) 21020 continue; 21021 /* 21022 * non-loopback ire in our 21023 * group: use it for the next 21024 * pass in the loop 21025 */ 21026 if (ire1->ire_stq != NULL && 21027 ire1_ill->ill_group == 21028 ire_ill->ill_group) 21029 break; 21030 } 21031 } 21032 } else { 21033 while (ire1 != NULL && ire1->ire_addr == dst) { 21034 ire1_ill = ire1->ire_ipif->ipif_ill; 21035 /* 21036 * We can have two broadcast ires on the 21037 * same ill in different zones; here 21038 * we'll send a copy of the packet on 21039 * each ill and the fanout code will 21040 * call conn_wantpacket() to check that 21041 * the zone has the broadcast address 21042 * configured on the ill. If the two 21043 * ires are in the same group we only 21044 * send one copy up. 21045 */ 21046 if (ire1_ill != ire_ill && 21047 (ire1_ill->ill_group == NULL || 21048 ire_ill->ill_group == NULL || 21049 ire1_ill->ill_group != 21050 ire_ill->ill_group)) { 21051 break; 21052 } 21053 ire1 = ire1->ire_next; 21054 } 21055 } 21056 } 21057 ASSERT(multirt_send == B_FALSE); 21058 if (ire1 != NULL && ire1->ire_addr == dst) { 21059 if ((ire->ire_flags & RTF_MULTIRT) && 21060 (ire1->ire_flags & RTF_MULTIRT)) { 21061 /* 21062 * We are in the multirouting case. 21063 * The message must be sent at least 21064 * on both ires. These ires have been 21065 * inserted AFTER the standard ones 21066 * in ip_rt_add(). There are thus no 21067 * other ire entries for the destination 21068 * address in the rest of the bucket 21069 * that do not have the RTF_MULTIRT 21070 * flag. We don't process a copy 21071 * of the message here. This will be 21072 * done in the final sending loop. 
21073 */ 21074 multirt_send = B_TRUE; 21075 } else { 21076 next_mp = ip_copymsg(first_mp); 21077 if (next_mp != NULL) 21078 IRE_REFHOLD(ire1); 21079 } 21080 } 21081 rw_exit(&ire->ire_bucket->irb_lock); 21082 } 21083 21084 if (stq) { 21085 /* 21086 * A non-NULL send-to queue means this packet is going 21087 * out of this machine. 21088 */ 21089 21090 BUMP_MIB(&ip_mib, ipOutRequests); 21091 ttl_protocol = ((uint16_t *)ipha)[4]; 21092 /* 21093 * We accumulate the pseudo header checksum in cksum. 21094 * This is pretty hairy code, so watch close. One 21095 * thing to keep in mind is that UDP and TCP have 21096 * stored their respective datagram lengths in their 21097 * checksum fields. This lines things up real nice. 21098 */ 21099 cksum = (dst >> 16) + (dst & 0xFFFF) + 21100 (src >> 16) + (src & 0xFFFF); 21101 /* 21102 * We assume the udp checksum field contains the 21103 * length, so to compute the pseudo header checksum, 21104 * all we need is the protocol number and src/dst. 21105 */ 21106 /* Provide the checksums for UDP and TCP. 
*/ 21107 if ((PROTO == IPPROTO_TCP) && 21108 (ip_hdr_included != IP_HDR_INCLUDED)) { 21109 /* hlen gets the number of uchar_ts in the IP header */ 21110 hlen = (V_HLEN & 0xF) << 2; 21111 up = IPH_TCPH_CHECKSUMP(ipha, hlen); 21112 IP_STAT(ip_out_sw_cksum); 21113 IP_STAT_UPDATE(ip_tcp_out_sw_cksum_bytes, 21114 LENGTH - hlen); 21115 *up = IP_CSUM(mp, hlen, cksum + IP_TCP_CSUM_COMP); 21116 if (*up == 0) 21117 *up = 0xFFFF; 21118 } else if (PROTO == IPPROTO_SCTP && 21119 (ip_hdr_included != IP_HDR_INCLUDED)) { 21120 sctp_hdr_t *sctph; 21121 21122 hlen = (V_HLEN & 0xF) << 2; 21123 ASSERT(MBLKL(mp) >= (hlen + sizeof (*sctph))); 21124 sctph = (sctp_hdr_t *)(mp->b_rptr + hlen); 21125 sctph->sh_chksum = 0; 21126 #ifdef DEBUG 21127 if (!skip_sctp_cksum) 21128 #endif 21129 sctph->sh_chksum = sctp_cksum(mp, hlen); 21130 } else { 21131 queue_t *dev_q = stq->q_next; 21132 21133 if ((dev_q->q_next || dev_q->q_first) && 21134 !canput(dev_q)) { 21135 blocked: 21136 ipha->ipha_ident = ip_hdr_included; 21137 /* 21138 * If we don't have a conn to apply 21139 * backpressure, free the message. 21140 * In the ire_send path, we don't know 21141 * the position to requeue the packet. Rather 21142 * than reorder packets, we just drop this 21143 * packet. 21144 */ 21145 if (ip_output_queue && connp != NULL && 21146 caller != IRE_SEND) { 21147 if (caller == IP_WSRV) { 21148 connp->conn_did_putbq = 1; 21149 (void) putbq(connp->conn_wq, 21150 first_mp); 21151 conn_drain_insert(connp); 21152 /* 21153 * This is the service thread, 21154 * and the queue is already 21155 * noenabled. The check for 21156 * canput and the putbq is not 21157 * atomic. So we need to check 21158 * again. 21159 */ 21160 if (canput(stq->q_next)) 21161 connp->conn_did_putbq 21162 = 0; 21163 IP_STAT(ip_conn_flputbq); 21164 } else { 21165 /* 21166 * We are not the service proc. 21167 * ip_wsrv will be scheduled or 21168 * is already running. 
21169 */ 21170 (void) putq(connp->conn_wq, 21171 first_mp); 21172 } 21173 } else { 21174 BUMP_MIB(&ip_mib, ipOutDiscards); 21175 freemsg(first_mp); 21176 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 21177 "ip_wput_ire_end: q %p (%S)", 21178 q, "discard"); 21179 } 21180 ire_refrele(ire); 21181 if (next_mp) { 21182 ire_refrele(ire1); 21183 freemsg(next_mp); 21184 } 21185 if (conn_outgoing_ill != NULL) 21186 ill_refrele(conn_outgoing_ill); 21187 return; 21188 } 21189 if ((PROTO == IPPROTO_UDP) && 21190 (ip_hdr_included != IP_HDR_INCLUDED)) { 21191 /* 21192 * hlen gets the number of uchar_ts in the 21193 * IP header 21194 */ 21195 hlen = (V_HLEN & 0xF) << 2; 21196 up = IPH_UDPH_CHECKSUMP(ipha, hlen); 21197 max_frag = ire->ire_max_frag; 21198 if (*up != 0) { 21199 IP_CKSUM_XMIT(ire_ill, ire, mp, ipha, 21200 up, PROTO, hlen, LENGTH, max_frag, 21201 ipsec_len, cksum); 21202 /* Software checksum? */ 21203 if (DB_CKSUMFLAGS(mp) == 0) { 21204 IP_STAT(ip_out_sw_cksum); 21205 IP_STAT_UPDATE( 21206 ip_udp_out_sw_cksum_bytes, 21207 LENGTH - hlen); 21208 } 21209 } 21210 } 21211 } 21212 /* 21213 * Need to do this even when fragmenting. The local 21214 * loopback can be done without computing checksums 21215 * but forwarding out other interface must be done 21216 * after the IP checksum (and ULP checksums) have been 21217 * computed. 21218 * 21219 * NOTE : multicast_forward is set only if this packet 21220 * originated from ip_wput. For packets originating from 21221 * ip_wput_multicast, it is not set. 21222 */ 21223 if (CLASSD(ipha->ipha_dst) && multicast_forward) { 21224 multi_loopback: 21225 ip2dbg(("ip_wput: multicast, loop %d\n", 21226 conn_multicast_loop)); 21227 21228 /* Forget header checksum offload */ 21229 DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM; 21230 21231 /* 21232 * Local loopback of multicasts? Check the 21233 * ill. 
21234 * 21235 * Note that the loopback function will not come 21236 * in through ip_rput - it will only do the 21237 * client fanout thus we need to do an mforward 21238 * as well. This is different from the BSD 21239 * logic. 21240 */ 21241 if (ill != NULL) { 21242 ilm_t *ilm; 21243 21244 ILM_WALKER_HOLD(ill); 21245 ilm = ilm_lookup_ill(ill, ipha->ipha_dst, 21246 ALL_ZONES); 21247 ILM_WALKER_RELE(ill); 21248 if (ilm != NULL) { 21249 /* 21250 * Pass along the virtual output q. 21251 * ip_wput_local() will distribute the 21252 * packet to all the matching zones, 21253 * except the sending zone when 21254 * IP_MULTICAST_LOOP is false. 21255 */ 21256 ip_multicast_loopback(q, ill, first_mp, 21257 conn_multicast_loop ? 0 : 21258 IP_FF_NO_MCAST_LOOP, zoneid); 21259 } 21260 } 21261 if (ipha->ipha_ttl == 0) { 21262 /* 21263 * 0 => only to this host i.e. we are 21264 * done. We are also done if this was the 21265 * loopback interface since it is sufficient 21266 * to loopback one copy of a multicast packet. 21267 */ 21268 freemsg(first_mp); 21269 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 21270 "ip_wput_ire_end: q %p (%S)", 21271 q, "loopback"); 21272 ire_refrele(ire); 21273 if (conn_outgoing_ill != NULL) 21274 ill_refrele(conn_outgoing_ill); 21275 return; 21276 } 21277 /* 21278 * ILLF_MULTICAST is checked in ip_newroute 21279 * i.e. we don't need to check it here since 21280 * all IRE_CACHEs come from ip_newroute. 21281 * For multicast traffic, SO_DONTROUTE is interpreted 21282 * to mean only send the packet out the interface 21283 * (optionally specified with IP_MULTICAST_IF) 21284 * and do not forward it out additional interfaces. 21285 * RSVP and the rsvp daemon is an example of a 21286 * protocol and user level process that 21287 * handles its own routing. Hence, it uses the 21288 * SO_DONTROUTE option to accomplish this.
21289 */ 21290 21291 if (ip_g_mrouter && !conn_dontroute && ill != NULL) { 21292 /* Unconditionally redo the checksum */ 21293 ipha->ipha_hdr_checksum = 0; 21294 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 21295 21296 /* 21297 * If this needs to go out secure, we need 21298 * to wait till we finish the IPSEC 21299 * processing. 21300 */ 21301 if (ipsec_len == 0 && 21302 ip_mforward(ill, ipha, mp)) { 21303 freemsg(first_mp); 21304 ip1dbg(("ip_wput: mforward failed\n")); 21305 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 21306 "ip_wput_ire_end: q %p (%S)", 21307 q, "mforward failed"); 21308 ire_refrele(ire); 21309 if (conn_outgoing_ill != NULL) 21310 ill_refrele(conn_outgoing_ill); 21311 return; 21312 } 21313 } 21314 } 21315 max_frag = ire->ire_max_frag; 21316 cksum += ttl_protocol; 21317 if (max_frag >= (uint_t)(LENGTH + ipsec_len)) { 21318 /* No fragmentation required for this one. */ 21319 /* 21320 * Don't use frag_flag if packet is pre-built or source 21321 * routed or if multicast (since multicast packets do 21322 * not solicit ICMP "packet too big" messages). 21323 */ 21324 if ((ip_hdr_included != IP_HDR_INCLUDED) && 21325 (V_HLEN == IP_SIMPLE_HDR_VERSION || 21326 !ip_source_route_included(ipha)) && 21327 !CLASSD(ipha->ipha_dst)) 21328 ipha->ipha_fragment_offset_and_flags |= 21329 htons(ire->ire_frag_flag); 21330 21331 if (!(DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM)) { 21332 /* Complete the IP header checksum. */ 21333 cksum += ipha->ipha_ident; 21334 cksum += (v_hlen_tos_len >> 16)+ 21335 (v_hlen_tos_len & 0xFFFF); 21336 cksum += ipha->ipha_fragment_offset_and_flags; 21337 hlen = (V_HLEN & 0xF) - 21338 IP_SIMPLE_HDR_LENGTH_IN_WORDS; 21339 if (hlen) { 21340 checksumoptions: 21341 /* 21342 * Account for the IP Options in the IP 21343 * header checksum. 
21344 */ 21345 up = (uint16_t *)(rptr+ 21346 IP_SIMPLE_HDR_LENGTH); 21347 do { 21348 cksum += up[0]; 21349 cksum += up[1]; 21350 up += 2; 21351 } while (--hlen); 21352 } 21353 cksum = ((cksum & 0xFFFF) + (cksum >> 16)); 21354 cksum = ~(cksum + (cksum >> 16)); 21355 ipha->ipha_hdr_checksum = (uint16_t)cksum; 21356 } 21357 if (ipsec_len != 0) { 21358 ipsec_out_process(q, first_mp, ire, ill_index); 21359 if (!next_mp) { 21360 ire_refrele(ire); 21361 if (conn_outgoing_ill != NULL) 21362 ill_refrele(conn_outgoing_ill); 21363 return; 21364 } 21365 goto next; 21366 } 21367 21368 /* 21369 * multirt_send has already been handled 21370 * for broadcast, but not yet for multicast 21371 * or IP options. 21372 */ 21373 if (next_mp == NULL) { 21374 if (ire->ire_flags & RTF_MULTIRT) { 21375 multirt_send = B_TRUE; 21376 } 21377 } 21378 21379 /* 21380 * In most cases, the emission loop below is 21381 * entered only once. Only in the case where 21382 * the ire holds the RTF_MULTIRT flag, do we loop 21383 * to process all RTF_MULTIRT ires in the bucket, 21384 * and send the packet through all crossed 21385 * RTF_MULTIRT routes. 21386 */ 21387 do { 21388 if (multirt_send) { 21389 irb_t *irb; 21390 21391 irb = ire->ire_bucket; 21392 ASSERT(irb != NULL); 21393 /* 21394 * We are in a multiple send case, 21395 * need to get the next IRE and make 21396 * a duplicate of the packet. 
21397 */ 21398 IRB_REFHOLD(irb); 21399 for (ire1 = ire->ire_next; 21400 ire1 != NULL; 21401 ire1 = ire1->ire_next) { 21402 if (!(ire1->ire_flags & 21403 RTF_MULTIRT)) 21404 continue; 21405 if (ire1->ire_addr != 21406 ire->ire_addr) 21407 continue; 21408 if (ire1->ire_marks & 21409 (IRE_MARK_CONDEMNED| 21410 IRE_MARK_HIDDEN)) 21411 continue; 21412 21413 /* Got one */ 21414 IRE_REFHOLD(ire1); 21415 break; 21416 } 21417 IRB_REFRELE(irb); 21418 21419 if (ire1 != NULL) { 21420 next_mp = copyb(mp); 21421 if ((next_mp == NULL) || 21422 ((mp->b_cont != NULL) && 21423 ((next_mp->b_cont = 21424 dupmsg(mp->b_cont)) 21425 == NULL))) { 21426 freemsg(next_mp); 21427 next_mp = NULL; 21428 ire_refrele(ire1); 21429 ire1 = NULL; 21430 } 21431 } 21432 21433 /* 21434 * Last multiroute ire; don't loop 21435 * anymore. The emission is over 21436 * and next_mp is NULL. 21437 */ 21438 if (ire1 == NULL) { 21439 multirt_send = B_FALSE; 21440 } 21441 } 21442 21443 ASSERT(ipsec_len == 0); 21444 mp1 = ip_wput_attach_llhdr(mp, ire, 21445 IPP_LOCAL_OUT, ill_index); 21446 if (mp1 == NULL) { 21447 BUMP_MIB(&ip_mib, ipOutDiscards); 21448 if (next_mp) { 21449 freemsg(next_mp); 21450 ire_refrele(ire1); 21451 } 21452 ire_refrele(ire); 21453 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 21454 "ip_wput_ire_end: q %p (%S)", 21455 q, "discard MDATA"); 21456 if (conn_outgoing_ill != NULL) 21457 ill_refrele(conn_outgoing_ill); 21458 return; 21459 } 21460 UPDATE_OB_PKT_COUNT(ire); 21461 ire->ire_last_used_time = lbolt; 21462 21463 if (multirt_send) { 21464 /* 21465 * We are in a multiple send case, 21466 * need to re-enter the sending loop 21467 * using the next ire. 21468 */ 21469 putnext(stq, mp1); 21470 ire_refrele(ire); 21471 ire = ire1; 21472 stq = ire->ire_stq; 21473 mp = next_mp; 21474 next_mp = NULL; 21475 ipha = (ipha_t *)mp->b_rptr; 21476 ill_index = Q_TO_INDEX(stq); 21477 } 21478 } while (multirt_send); 21479 21480 if (!next_mp) { 21481 /* 21482 * Last copy going out (the ultra-common 21483 * case). 
Note that we intentionally replicate 21484 * the putnext rather than calling it before 21485 * the next_mp check in hopes of a little 21486 * tail-call action out of the compiler. 21487 */ 21488 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 21489 "ip_wput_ire_end: q %p (%S)", 21490 q, "last copy out(1)"); 21491 putnext(stq, mp1); 21492 ire_refrele(ire); 21493 if (conn_outgoing_ill != NULL) 21494 ill_refrele(conn_outgoing_ill); 21495 return; 21496 } 21497 /* More copies going out below. */ 21498 putnext(stq, mp1); 21499 } else { 21500 int offset; 21501 fragmentit: 21502 offset = ntohs(ipha->ipha_fragment_offset_and_flags); 21503 /* 21504 * If this would generate a icmp_frag_needed message, 21505 * we need to handle it before we do the IPSEC 21506 * processing. Otherwise, we need to strip the IPSEC 21507 * headers before we send up the message to the ULPs 21508 * which becomes messy and difficult. 21509 */ 21510 if (ipsec_len != 0) { 21511 if ((max_frag < (unsigned int)(LENGTH + 21512 ipsec_len)) && (offset & IPH_DF)) { 21513 21514 BUMP_MIB(&ip_mib, ipFragFails); 21515 ipha->ipha_hdr_checksum = 0; 21516 ipha->ipha_hdr_checksum = 21517 (uint16_t)ip_csum_hdr(ipha); 21518 icmp_frag_needed(ire->ire_stq, first_mp, 21519 max_frag); 21520 if (!next_mp) { 21521 ire_refrele(ire); 21522 if (conn_outgoing_ill != NULL) { 21523 ill_refrele( 21524 conn_outgoing_ill); 21525 } 21526 return; 21527 } 21528 } else { 21529 /* 21530 * This won't cause a icmp_frag_needed 21531 * message. to be gnerated. Send it on 21532 * the wire. Note that this could still 21533 * cause fragmentation and all we 21534 * do is the generation of the message 21535 * to the ULP if needed before IPSEC. 
21536 */ 21537 if (!next_mp) { 21538 ipsec_out_process(q, first_mp, 21539 ire, ill_index); 21540 TRACE_2(TR_FAC_IP, 21541 TR_IP_WPUT_IRE_END, 21542 "ip_wput_ire_end: q %p " 21543 "(%S)", q, 21544 "last ipsec_out_process"); 21545 ire_refrele(ire); 21546 if (conn_outgoing_ill != NULL) { 21547 ill_refrele( 21548 conn_outgoing_ill); 21549 } 21550 return; 21551 } 21552 ipsec_out_process(q, first_mp, 21553 ire, ill_index); 21554 } 21555 } else { 21556 /* Initiate IPPF processing */ 21557 if (IPP_ENABLED(IPP_LOCAL_OUT)) { 21558 ip_process(IPP_LOCAL_OUT, &mp, 21559 ill_index); 21560 if (mp == NULL) { 21561 BUMP_MIB(&ip_mib, 21562 ipOutDiscards); 21563 if (next_mp != NULL) { 21564 freemsg(next_mp); 21565 ire_refrele(ire1); 21566 } 21567 ire_refrele(ire); 21568 TRACE_2(TR_FAC_IP, 21569 TR_IP_WPUT_IRE_END, 21570 "ip_wput_ire: q %p (%S)", 21571 q, "discard MDATA"); 21572 if (conn_outgoing_ill != NULL) { 21573 ill_refrele( 21574 conn_outgoing_ill); 21575 } 21576 return; 21577 } 21578 } 21579 if (!next_mp) { 21580 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 21581 "ip_wput_ire_end: q %p (%S)", 21582 q, "last fragmentation"); 21583 ip_wput_ire_fragmentit(mp, ire); 21584 ire_refrele(ire); 21585 if (conn_outgoing_ill != NULL) 21586 ill_refrele(conn_outgoing_ill); 21587 return; 21588 } 21589 ip_wput_ire_fragmentit(mp, ire); 21590 } 21591 } 21592 } else { 21593 nullstq: 21594 /* A NULL stq means the destination address is local. 
*/ 21595 UPDATE_OB_PKT_COUNT(ire); 21596 ire->ire_last_used_time = lbolt; 21597 ASSERT(ire->ire_ipif != NULL); 21598 if (!next_mp) { 21599 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 21600 "ip_wput_ire_end: q %p (%S)", 21601 q, "local address"); 21602 ip_wput_local(q, ire->ire_ipif->ipif_ill, ipha, 21603 first_mp, ire, 0, ire->ire_zoneid); 21604 ire_refrele(ire); 21605 if (conn_outgoing_ill != NULL) 21606 ill_refrele(conn_outgoing_ill); 21607 return; 21608 } 21609 ip_wput_local(q, ire->ire_ipif->ipif_ill, ipha, first_mp, 21610 ire, 0, ire->ire_zoneid); 21611 } 21612 next: 21613 /* 21614 * More copies going out to additional interfaces. 21615 * ire1 has already been held. We don't need the 21616 * "ire" anymore. 21617 */ 21618 ire_refrele(ire); 21619 ire = ire1; 21620 ASSERT(ire != NULL && ire->ire_refcnt >= 1 && next_mp != NULL); 21621 mp = next_mp; 21622 ASSERT(ire->ire_ipversion == IPV4_VERSION); 21623 ill = ire_to_ill(ire); 21624 first_mp = mp; 21625 if (ipsec_len != 0) { 21626 ASSERT(first_mp->b_datap->db_type == M_CTL); 21627 mp = mp->b_cont; 21628 } 21629 dst = ire->ire_addr; 21630 ipha = (ipha_t *)mp->b_rptr; 21631 /* 21632 * Restore src so that we will pick up ire->ire_src_addr if src was 0. 21633 * Restore ipha_ident "no checksum" flag. 21634 */ 21635 src = orig_src; 21636 ipha->ipha_ident = ip_hdr_included; 21637 goto another; 21638 21639 #undef rptr 21640 #undef Q_TO_INDEX 21641 } 21642 21643 /* 21644 * Routine to allocate a message that is used to notify the ULP about MDT. 21645 * The caller may provide a pointer to the link-layer MDT capabilities, 21646 * or NULL if MDT is to be disabled on the stream. 
21647 */ 21648 mblk_t * 21649 ip_mdinfo_alloc(ill_mdt_capab_t *isrc) 21650 { 21651 mblk_t *mp; 21652 ip_mdt_info_t *mdti; 21653 ill_mdt_capab_t *idst; 21654 21655 if ((mp = allocb(sizeof (*mdti), BPRI_HI)) != NULL) { 21656 DB_TYPE(mp) = M_CTL; 21657 mp->b_wptr = mp->b_rptr + sizeof (*mdti); 21658 mdti = (ip_mdt_info_t *)mp->b_rptr; 21659 mdti->mdt_info_id = MDT_IOC_INFO_UPDATE; 21660 idst = &(mdti->mdt_capab); 21661 21662 /* 21663 * If the caller provides us with the capability, copy 21664 * it over into our notification message; otherwise 21665 * we zero out the capability portion. 21666 */ 21667 if (isrc != NULL) 21668 bcopy((caddr_t)isrc, (caddr_t)idst, sizeof (*idst)); 21669 else 21670 bzero((caddr_t)idst, sizeof (*idst)); 21671 } 21672 return (mp); 21673 } 21674 21675 /* 21676 * Routine which determines whether MDT can be enabled on the destination 21677 * IRE and IPC combination, and if so, allocates and returns the MDT 21678 * notification mblk that may be used by ULP. We also check if we need to 21679 * turn MDT back to 'on' when certain restrictions prohibiting us to allow 21680 * MDT usage in the past have been lifted. This gets called during IP 21681 * and ULP binding. 21682 */ 21683 mblk_t * 21684 ip_mdinfo_return(ire_t *dst_ire, conn_t *connp, char *ill_name, 21685 ill_mdt_capab_t *mdt_cap) 21686 { 21687 mblk_t *mp; 21688 boolean_t rc = B_FALSE; 21689 21690 ASSERT(dst_ire != NULL); 21691 ASSERT(connp != NULL); 21692 ASSERT(mdt_cap != NULL); 21693 21694 /* 21695 * Currently, we only support simple TCP/{IPv4,IPv6} with 21696 * Multidata, which is handled in tcp_multisend(). This 21697 * is the reason why we do all these checks here, to ensure 21698 * that we don't enable Multidata for the cases which we 21699 * can't handle at the moment. 21700 */ 21701 do { 21702 /* Only do TCP at the moment */ 21703 if (connp->conn_ulp != IPPROTO_TCP) 21704 break; 21705 21706 /* 21707 * IPSEC outbound policy present? 
Note that we get here 21708 * after calling ipsec_conn_cache_policy() where the global 21709 * policy checking is performed. conn_latch will be 21710 * non-NULL as long as there's a policy defined, 21711 * i.e. conn_out_enforce_policy may be NULL in such case 21712 * when the connection is non-secure, and hence we check 21713 * further if the latch refers to an outbound policy. 21714 */ 21715 if (CONN_IPSEC_OUT_ENCAPSULATED(connp)) 21716 break; 21717 21718 /* CGTP (multiroute) is enabled? */ 21719 if (dst_ire->ire_flags & RTF_MULTIRT) 21720 break; 21721 21722 /* Outbound IPQoS enabled? */ 21723 if (IPP_ENABLED(IPP_LOCAL_OUT)) { 21724 /* 21725 * In this case, we disable MDT for this and all 21726 * future connections going over the interface. 21727 */ 21728 mdt_cap->ill_mdt_on = 0; 21729 break; 21730 } 21731 21732 /* socket option(s) present? */ 21733 if (!CONN_IS_MD_FASTPATH(connp)) 21734 break; 21735 21736 rc = B_TRUE; 21737 /* CONSTCOND */ 21738 } while (0); 21739 21740 /* Remember the result */ 21741 connp->conn_mdt_ok = rc; 21742 21743 if (!rc) 21744 return (NULL); 21745 else if (!mdt_cap->ill_mdt_on) { 21746 /* 21747 * If MDT has been previously turned off in the past, and we 21748 * currently can do MDT (due to IPQoS policy removal, etc.) 21749 * then enable it for this interface. 21750 */ 21751 mdt_cap->ill_mdt_on = 1; 21752 ip1dbg(("ip_mdinfo_return: reenabling MDT for " 21753 "interface %s\n", ill_name)); 21754 } 21755 21756 /* Allocate the MDT info mblk */ 21757 if ((mp = ip_mdinfo_alloc(mdt_cap)) == NULL) { 21758 ip0dbg(("ip_mdinfo_return: can't enable Multidata for " 21759 "conn %p on %s (ENOMEM)\n", (void *)connp, ill_name)); 21760 return (NULL); 21761 } 21762 return (mp); 21763 } 21764 21765 /* 21766 * Create destination address attribute, and fill it with the physical 21767 * destination address and SAP taken from the template DL_UNITDATA_REQ 21768 * message block. 
21769 */ 21770 boolean_t 21771 ip_md_addr_attr(multidata_t *mmd, pdesc_t *pd, const mblk_t *dlmp) 21772 { 21773 dl_unitdata_req_t *dlurp; 21774 pattr_t *pa; 21775 pattrinfo_t pa_info; 21776 pattr_addr_t **das = (pattr_addr_t **)&pa_info.buf; 21777 uint_t das_len, das_off; 21778 21779 ASSERT(dlmp != NULL); 21780 21781 dlurp = (dl_unitdata_req_t *)dlmp->b_rptr; 21782 das_len = dlurp->dl_dest_addr_length; 21783 das_off = dlurp->dl_dest_addr_offset; 21784 21785 pa_info.type = PATTR_DSTADDRSAP; 21786 pa_info.len = sizeof (**das) + das_len - 1; 21787 21788 /* create and associate the attribute */ 21789 pa = mmd_addpattr(mmd, pd, &pa_info, B_TRUE, KM_NOSLEEP); 21790 if (pa != NULL) { 21791 ASSERT(*das != NULL); 21792 (*das)->addr_is_group = 0; 21793 (*das)->addr_len = (uint8_t)das_len; 21794 bcopy((caddr_t)dlurp + das_off, (*das)->addr, das_len); 21795 } 21796 21797 return (pa != NULL); 21798 } 21799 21800 /* 21801 * Create hardware checksum attribute and fill it with the values passed. 21802 */ 21803 boolean_t 21804 ip_md_hcksum_attr(multidata_t *mmd, pdesc_t *pd, uint32_t start_offset, 21805 uint32_t stuff_offset, uint32_t end_offset, uint32_t flags) 21806 { 21807 pattr_t *pa; 21808 pattrinfo_t pa_info; 21809 21810 ASSERT(mmd != NULL); 21811 21812 pa_info.type = PATTR_HCKSUM; 21813 pa_info.len = sizeof (pattr_hcksum_t); 21814 21815 /* create and associate the attribute */ 21816 pa = mmd_addpattr(mmd, pd, &pa_info, B_TRUE, KM_NOSLEEP); 21817 if (pa != NULL) { 21818 pattr_hcksum_t *hck = (pattr_hcksum_t *)pa_info.buf; 21819 21820 hck->hcksum_start_offset = start_offset; 21821 hck->hcksum_stuff_offset = stuff_offset; 21822 hck->hcksum_end_offset = end_offset; 21823 hck->hcksum_flags = flags; 21824 } 21825 return (pa != NULL); 21826 } 21827 21828 /* 21829 * Create zerocopy attribute and fill it with the specified flags 21830 */ 21831 boolean_t 21832 ip_md_zcopy_attr(multidata_t *mmd, pdesc_t *pd, uint_t flags) 21833 { 21834 pattr_t *pa; 21835 pattrinfo_t pa_info; 21836 
21837 ASSERT(mmd != NULL); 21838 pa_info.type = PATTR_ZCOPY; 21839 pa_info.len = sizeof (pattr_zcopy_t); 21840 21841 /* create and associate the attribute */ 21842 pa = mmd_addpattr(mmd, pd, &pa_info, B_TRUE, KM_NOSLEEP); 21843 if (pa != NULL) { 21844 pattr_zcopy_t *zcopy = (pattr_zcopy_t *)pa_info.buf; 21845 21846 zcopy->zcopy_flags = flags; 21847 } 21848 return (pa != NULL); 21849 } 21850 21851 /* 21852 * Check if ip_wput_frag_mdt() and ip_wput_frag_mdt_v6() can handle a message 21853 * block chain. We could rewrite to handle arbitrary message block chains but 21854 * that would make the code complicated and slow. Right now there three 21855 * restrictions: 21856 * 21857 * 1. The first message block must contain the complete IP header and 21858 * at least 1 byte of payload data. 21859 * 2. At most MULTIDATA_MAX_PBUFS non-empty message blocks are allowed 21860 * so that we can use a single Multidata message. 21861 * 3. No frag must be distributed over two or more message blocks so 21862 * that we don't need more than two packet descriptors per frag. 21863 * 21864 * The above restrictions allow us to support userland applications (which 21865 * will send down a single message block) and NFS over UDP (which will 21866 * send down a chain of at most three message blocks). 21867 * 21868 * We also don't use MDT for payloads with less than or equal to 21869 * ip_wput_frag_mdt_min bytes because it would cause too much overhead. 21870 */ 21871 boolean_t 21872 ip_can_frag_mdt(mblk_t *mp, ssize_t hdr_len, ssize_t len) 21873 { 21874 int blocks; 21875 ssize_t total, missing, size; 21876 21877 ASSERT(mp != NULL); 21878 ASSERT(hdr_len > 0); 21879 21880 size = MBLKL(mp) - hdr_len; 21881 if (size <= 0) 21882 return (B_FALSE); 21883 21884 /* The first mblk contains the header and some payload. */ 21885 blocks = 1; 21886 total = size; 21887 size %= len; 21888 missing = (size == 0) ? 
0 : (len - size); 21889 mp = mp->b_cont; 21890 21891 while (mp != NULL) { 21892 /* 21893 * Give up if we encounter a zero length message block. 21894 * In practice, this should rarely happen and therefore 21895 * not worth the trouble of freeing and re-linking the 21896 * mblk from the chain to handle such case. 21897 */ 21898 if ((size = MBLKL(mp)) == 0) 21899 return (B_FALSE); 21900 21901 /* Too many payload buffers for a single Multidata message? */ 21902 if (++blocks > MULTIDATA_MAX_PBUFS) 21903 return (B_FALSE); 21904 21905 total += size; 21906 /* Is a frag distributed over two or more message blocks? */ 21907 if (missing > size) 21908 return (B_FALSE); 21909 size -= missing; 21910 21911 size %= len; 21912 missing = (size == 0) ? 0 : (len - size); 21913 21914 mp = mp->b_cont; 21915 } 21916 21917 return (total > ip_wput_frag_mdt_min); 21918 } 21919 21920 /* 21921 * Outbound IPv4 fragmentation routine using MDT. 21922 */ 21923 static void 21924 ip_wput_frag_mdt(ire_t *ire, mblk_t *mp, ip_pkt_t pkt_type, int len, 21925 uint32_t frag_flag, int offset) 21926 { 21927 ipha_t *ipha_orig; 21928 int i1, ip_data_end; 21929 uint_t pkts, wroff, hdr_chunk_len, pbuf_idx; 21930 mblk_t *hdr_mp, *md_mp = NULL; 21931 unsigned char *hdr_ptr, *pld_ptr; 21932 multidata_t *mmd; 21933 ip_pdescinfo_t pdi; 21934 21935 ASSERT(DB_TYPE(mp) == M_DATA); 21936 ASSERT(MBLKL(mp) > sizeof (ipha_t)); 21937 21938 ipha_orig = (ipha_t *)mp->b_rptr; 21939 mp->b_rptr += sizeof (ipha_t); 21940 21941 /* Calculate how many packets we will send out */ 21942 i1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgsize(mp); 21943 pkts = (i1 + len - 1) / len; 21944 ASSERT(pkts > 1); 21945 21946 /* Allocate a message block which will hold all the IP Headers. */ 21947 wroff = ip_wroff_extra; 21948 hdr_chunk_len = wroff + IP_SIMPLE_HDR_LENGTH; 21949 21950 i1 = pkts * hdr_chunk_len; 21951 /* 21952 * Create the header buffer, Multidata and destination address 21953 * and SAP attribute that should be associated with it. 
21954 */ 21955 if ((hdr_mp = allocb(i1, BPRI_HI)) == NULL || 21956 ((hdr_mp->b_wptr += i1), 21957 (mmd = mmd_alloc(hdr_mp, &md_mp, KM_NOSLEEP)) == NULL) || 21958 !ip_md_addr_attr(mmd, NULL, ire->ire_dlureq_mp)) { 21959 freemsg(mp); 21960 if (md_mp == NULL) { 21961 freemsg(hdr_mp); 21962 } else { 21963 free_mmd: IP_STAT(ip_frag_mdt_discarded); 21964 freemsg(md_mp); 21965 } 21966 IP_STAT(ip_frag_mdt_allocfail); 21967 UPDATE_MIB(&ip_mib, ipOutDiscards, pkts); 21968 return; 21969 } 21970 IP_STAT(ip_frag_mdt_allocd); 21971 21972 /* 21973 * Add a payload buffer to the Multidata; this operation must not 21974 * fail, or otherwise our logic in this routine is broken. There 21975 * is no memory allocation done by the routine, so any returned 21976 * failure simply tells us that we've done something wrong. 21977 * 21978 * A failure tells us that either we're adding the same payload 21979 * buffer more than once, or we're trying to add more buffers than 21980 * allowed. None of the above cases should happen, and we panic 21981 * because either there's horrible heap corruption, and/or 21982 * programming mistake. 21983 */ 21984 if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0) 21985 goto pbuf_panic; 21986 21987 hdr_ptr = hdr_mp->b_rptr; 21988 pld_ptr = mp->b_rptr; 21989 21990 /* Establish the ending byte offset, based on the starting offset. */ 21991 offset <<= 3; 21992 ip_data_end = offset + ntohs(ipha_orig->ipha_length) - 21993 IP_SIMPLE_HDR_LENGTH; 21994 21995 pdi.flags = PDESC_HBUF_REF | PDESC_PBUF_REF; 21996 21997 while (pld_ptr < mp->b_wptr) { 21998 ipha_t *ipha; 21999 uint16_t offset_and_flags; 22000 uint16_t ip_len; 22001 int error; 22002 22003 ASSERT((hdr_ptr + hdr_chunk_len) <= hdr_mp->b_wptr); 22004 ipha = (ipha_t *)(hdr_ptr + wroff); 22005 ASSERT(OK_32PTR(ipha)); 22006 *ipha = *ipha_orig; 22007 22008 if (ip_data_end - offset > len) { 22009 offset_and_flags = IPH_MF; 22010 } else { 22011 /* 22012 * Last frag. Set len to the length of this last piece. 
22013 */ 22014 len = ip_data_end - offset; 22015 /* A frag of a frag might have IPH_MF non-zero */ 22016 offset_and_flags = 22017 ntohs(ipha->ipha_fragment_offset_and_flags) & 22018 IPH_MF; 22019 } 22020 offset_and_flags |= (uint16_t)(offset >> 3); 22021 offset_and_flags |= (uint16_t)frag_flag; 22022 /* Store the offset and flags in the IP header. */ 22023 ipha->ipha_fragment_offset_and_flags = htons(offset_and_flags); 22024 22025 /* Store the length in the IP header. */ 22026 ip_len = (uint16_t)(len + IP_SIMPLE_HDR_LENGTH); 22027 ipha->ipha_length = htons(ip_len); 22028 22029 /* 22030 * Set the IP header checksum. Note that mp is just 22031 * the header, so this is easy to pass to ip_csum. 22032 */ 22033 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 22034 22035 /* 22036 * Record offset and size of header and data of the next packet 22037 * in the multidata message. 22038 */ 22039 PDESC_HDR_ADD(&pdi, hdr_ptr, wroff, IP_SIMPLE_HDR_LENGTH, 0); 22040 PDESC_PLD_INIT(&pdi); 22041 i1 = MIN(mp->b_wptr - pld_ptr, len); 22042 ASSERT(i1 > 0); 22043 PDESC_PLD_SPAN_ADD(&pdi, pbuf_idx, pld_ptr, i1); 22044 if (i1 == len) { 22045 pld_ptr += len; 22046 } else { 22047 i1 = len - i1; 22048 mp = mp->b_cont; 22049 ASSERT(mp != NULL); 22050 ASSERT(MBLKL(mp) >= i1); 22051 /* 22052 * Attach the next payload message block to the 22053 * multidata message. 22054 */ 22055 if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0) 22056 goto pbuf_panic; 22057 PDESC_PLD_SPAN_ADD(&pdi, pbuf_idx, mp->b_rptr, i1); 22058 pld_ptr = mp->b_rptr + i1; 22059 } 22060 22061 if ((mmd_addpdesc(mmd, (pdescinfo_t *)&pdi, &error, 22062 KM_NOSLEEP)) == NULL) { 22063 /* 22064 * Any failure other than ENOMEM indicates that we 22065 * have passed in invalid pdesc info or parameters 22066 * to mmd_addpdesc, which must not happen. 22067 * 22068 * EINVAL is a result of failure on boundary checks 22069 * against the pdesc info contents. 
It should not 22070 * happen, and we panic because either there's 22071 * horrible heap corruption, and/or programming 22072 * mistake. 22073 */ 22074 if (error != ENOMEM) { 22075 cmn_err(CE_PANIC, "ip_wput_frag_mdt: " 22076 "pdesc logic error detected for " 22077 "mmd %p pinfo %p (%d)\n", 22078 (void *)mmd, (void *)&pdi, error); 22079 /* NOTREACHED */ 22080 } 22081 IP_STAT(ip_frag_mdt_addpdescfail); 22082 /* Free unattached payload message blocks as well */ 22083 md_mp->b_cont = mp->b_cont; 22084 goto free_mmd; 22085 } 22086 22087 /* Advance fragment offset. */ 22088 offset += len; 22089 22090 /* Advance to location for next header in the buffer. */ 22091 hdr_ptr += hdr_chunk_len; 22092 22093 /* Did we reach the next payload message block? */ 22094 if (pld_ptr == mp->b_wptr && mp->b_cont != NULL) { 22095 mp = mp->b_cont; 22096 /* 22097 * Attach the next message block with payload 22098 * data to the multidata message. 22099 */ 22100 if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0) 22101 goto pbuf_panic; 22102 pld_ptr = mp->b_rptr; 22103 } 22104 } 22105 22106 ASSERT(hdr_mp->b_wptr == hdr_ptr); 22107 ASSERT(mp->b_wptr == pld_ptr); 22108 22109 /* Update IP statistics */ 22110 UPDATE_MIB(&ip_mib, ipFragCreates, pkts); 22111 BUMP_MIB(&ip_mib, ipFragOKs); 22112 IP_STAT_UPDATE(ip_frag_mdt_pkt_out, pkts); 22113 22114 if (pkt_type == OB_PKT) { 22115 ire->ire_ob_pkt_count += pkts; 22116 if (ire->ire_ipif != NULL) 22117 atomic_add_32(&ire->ire_ipif->ipif_ob_pkt_count, pkts); 22118 } else { 22119 /* 22120 * The type is IB_PKT in the forwarding path and in 22121 * the mobile IP case when the packet is being reverse- 22122 * tunneled to the home agent. 
22123 */ 22124 ire->ire_ib_pkt_count += pkts; 22125 ASSERT(!IRE_IS_LOCAL(ire)); 22126 if (ire->ire_type & IRE_BROADCAST) 22127 atomic_add_32(&ire->ire_ipif->ipif_ib_pkt_count, pkts); 22128 else 22129 atomic_add_32(&ire->ire_ipif->ipif_fo_pkt_count, pkts); 22130 } 22131 ire->ire_last_used_time = lbolt; 22132 /* Send it down */ 22133 putnext(ire->ire_stq, md_mp); 22134 return; 22135 22136 pbuf_panic: 22137 cmn_err(CE_PANIC, "ip_wput_frag_mdt: payload buffer logic " 22138 "error for mmd %p pbuf %p (%d)", (void *)mmd, (void *)mp, 22139 pbuf_idx); 22140 /* NOTREACHED */ 22141 } 22142 22143 /* 22144 * Outbound IP fragmentation routine. 22145 * 22146 * NOTE : This routine does not ire_refrele the ire that is passed in 22147 * as the argument. 22148 */ 22149 static void 22150 ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag, 22151 uint32_t frag_flag) 22152 { 22153 int i1; 22154 mblk_t *ll_hdr_mp; 22155 int ll_hdr_len; 22156 int hdr_len; 22157 mblk_t *hdr_mp; 22158 ipha_t *ipha; 22159 int ip_data_end; 22160 int len; 22161 mblk_t *mp = mp_orig; 22162 int offset; 22163 queue_t *q; 22164 uint32_t v_hlen_tos_len; 22165 mblk_t *first_mp; 22166 boolean_t mctl_present; 22167 ill_t *ill; 22168 mblk_t *xmit_mp; 22169 mblk_t *carve_mp; 22170 ire_t *ire1 = NULL; 22171 ire_t *save_ire = NULL; 22172 mblk_t *next_mp = NULL; 22173 boolean_t last_frag = B_FALSE; 22174 boolean_t multirt_send = B_FALSE; 22175 ire_t *first_ire = NULL; 22176 irb_t *irb = NULL; 22177 22178 TRACE_0(TR_FAC_IP, TR_IP_WPUT_FRAG_START, 22179 "ip_wput_frag_start:"); 22180 22181 if (mp->b_datap->db_type == M_CTL) { 22182 first_mp = mp; 22183 mp_orig = mp = mp->b_cont; 22184 mctl_present = B_TRUE; 22185 } else { 22186 first_mp = mp; 22187 mctl_present = B_FALSE; 22188 } 22189 22190 ASSERT(MBLKL(mp) >= sizeof (ipha_t)); 22191 ipha = (ipha_t *)mp->b_rptr; 22192 22193 /* 22194 * If the Don't Fragment flag is on, generate an ICMP destination 22195 * unreachable, fragmentation needed. 
22196 */ 22197 offset = ntohs(ipha->ipha_fragment_offset_and_flags); 22198 if (offset & IPH_DF) { 22199 BUMP_MIB(&ip_mib, ipFragFails); 22200 /* 22201 * Need to compute hdr checksum if called from ip_wput_ire. 22202 * Note that ip_rput_forward verifies the checksum before 22203 * calling this routine so in that case this is a noop. 22204 */ 22205 ipha->ipha_hdr_checksum = 0; 22206 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 22207 icmp_frag_needed(ire->ire_stq, first_mp, max_frag); 22208 TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, 22209 "ip_wput_frag_end:(%S)", 22210 "don't fragment"); 22211 return; 22212 } 22213 if (mctl_present) 22214 freeb(first_mp); 22215 /* 22216 * Establish the starting offset. May not be zero if we are fragging 22217 * a fragment that is being forwarded. 22218 */ 22219 offset = offset & IPH_OFFSET; 22220 22221 /* TODO why is this test needed? */ 22222 v_hlen_tos_len = ((uint32_t *)ipha)[0]; 22223 if (((max_frag - LENGTH) & ~7) < 8) { 22224 /* TODO: notify ulp somehow */ 22225 BUMP_MIB(&ip_mib, ipFragFails); 22226 freemsg(mp); 22227 TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, 22228 "ip_wput_frag_end:(%S)", 22229 "len < 8"); 22230 return; 22231 } 22232 22233 hdr_len = (V_HLEN & 0xF) << 2; 22234 22235 ipha->ipha_hdr_checksum = 0; 22236 22237 /* 22238 * Establish the number of bytes maximum per frag, after putting 22239 * in the header. 22240 */ 22241 len = (max_frag - hdr_len) & ~7; 22242 22243 /* Check if we can use MDT to send out the frags. 
*/ 22244 ASSERT(!IRE_IS_LOCAL(ire)); 22245 if (hdr_len == IP_SIMPLE_HDR_LENGTH && ip_multidata_outbound && 22246 !(ire->ire_flags & RTF_MULTIRT) && !IPP_ENABLED(IPP_LOCAL_OUT) && 22247 (ill = ire_to_ill(ire)) != NULL && ILL_MDT_CAPABLE(ill) && 22248 IP_CAN_FRAG_MDT(mp, IP_SIMPLE_HDR_LENGTH, len)) { 22249 ASSERT(ill->ill_mdt_capab != NULL); 22250 if (!ill->ill_mdt_capab->ill_mdt_on) { 22251 /* 22252 * If MDT has been previously turned off in the past, 22253 * and we currently can do MDT (due to IPQoS policy 22254 * removal, etc.) then enable it for this interface. 22255 */ 22256 ill->ill_mdt_capab->ill_mdt_on = 1; 22257 ip1dbg(("ip_wput_frag: enabled MDT for interface %s\n", 22258 ill->ill_name)); 22259 } 22260 ip_wput_frag_mdt(ire, mp, pkt_type, len, frag_flag, 22261 offset); 22262 return; 22263 } 22264 22265 /* Get a copy of the header for the trailing frags */ 22266 hdr_mp = ip_wput_frag_copyhdr((uchar_t *)ipha, hdr_len, offset); 22267 if (!hdr_mp) { 22268 BUMP_MIB(&ip_mib, ipOutDiscards); 22269 freemsg(mp); 22270 TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, 22271 "ip_wput_frag_end:(%S)", 22272 "couldn't copy hdr"); 22273 return; 22274 } 22275 if (DB_CRED(mp) != NULL) 22276 mblk_setcred(hdr_mp, DB_CRED(mp)); 22277 22278 /* Store the starting offset, with the MoreFrags flag. */ 22279 i1 = offset | IPH_MF | frag_flag; 22280 ipha->ipha_fragment_offset_and_flags = htons((uint16_t)i1); 22281 22282 /* Establish the ending byte offset, based on the starting offset. */ 22283 offset <<= 3; 22284 ip_data_end = offset + ntohs(ipha->ipha_length) - hdr_len; 22285 22286 /* Store the length of the first fragment in the IP header. */ 22287 i1 = len + hdr_len; 22288 ASSERT(i1 <= IP_MAXPACKET); 22289 ipha->ipha_length = htons((uint16_t)i1); 22290 22291 /* 22292 * Compute the IP header checksum for the first frag. We have to 22293 * watch out that we stop at the end of the header. 
22294 */ 22295 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 22296 22297 /* 22298 * Now carve off the first frag. Note that this will include the 22299 * original IP header. 22300 */ 22301 if (!(mp = ip_carve_mp(&mp_orig, i1))) { 22302 BUMP_MIB(&ip_mib, ipOutDiscards); 22303 freeb(hdr_mp); 22304 freemsg(mp_orig); 22305 TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, 22306 "ip_wput_frag_end:(%S)", 22307 "couldn't carve first"); 22308 return; 22309 } 22310 22311 /* 22312 * Multirouting case. Each fragment is replicated 22313 * via all non-condemned RTF_MULTIRT routes 22314 * currently resolved. 22315 * We ensure that first_ire is the first RTF_MULTIRT 22316 * ire in the bucket. 22317 */ 22318 if (ire->ire_flags & RTF_MULTIRT) { 22319 irb = ire->ire_bucket; 22320 ASSERT(irb != NULL); 22321 22322 multirt_send = B_TRUE; 22323 22324 /* Make sure we do not omit any multiroute ire. */ 22325 IRB_REFHOLD(irb); 22326 for (first_ire = irb->irb_ire; 22327 first_ire != NULL; 22328 first_ire = first_ire->ire_next) { 22329 if ((first_ire->ire_flags & RTF_MULTIRT) && 22330 (first_ire->ire_addr == ire->ire_addr) && 22331 !(first_ire->ire_marks & 22332 (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN))) 22333 break; 22334 } 22335 22336 if (first_ire != NULL) { 22337 if (first_ire != ire) { 22338 IRE_REFHOLD(first_ire); 22339 /* 22340 * Do not release the ire passed in 22341 * as the argument. 22342 */ 22343 ire = first_ire; 22344 } else { 22345 first_ire = NULL; 22346 } 22347 } 22348 IRB_REFRELE(irb); 22349 22350 /* 22351 * Save the first ire; we will need to restore it 22352 * for the trailing frags. 22353 * We REFHOLD save_ire, as each iterated ire will be 22354 * REFRELEd. 22355 */ 22356 save_ire = ire; 22357 IRE_REFHOLD(save_ire); 22358 } 22359 22360 /* 22361 * First fragment emission loop. 22362 * In most cases, the emission loop below is entered only 22363 * once. 
Only in the case where the ire holds the RTF_MULTIRT 22364 * flag, do we loop to process all RTF_MULTIRT ires in the 22365 * bucket, and send the fragment through all crossed 22366 * RTF_MULTIRT routes. 22367 */ 22368 do { 22369 if (ire->ire_flags & RTF_MULTIRT) { 22370 /* 22371 * We are in a multiple send case, need to get 22372 * the next ire and make a copy of the packet. 22373 * ire1 holds here the next ire to process in the 22374 * bucket. If multirouting is expected, 22375 * any non-RTF_MULTIRT ire that has the 22376 * right destination address is ignored. 22377 * 22378 * We have to take into account the MTU of 22379 * each walked ire. max_frag is set by the 22380 * the caller and generally refers to 22381 * the primary ire entry. Here we ensure that 22382 * no route with a lower MTU will be used, as 22383 * fragments are carved once for all ires, 22384 * then replicated. 22385 */ 22386 ASSERT(irb != NULL); 22387 IRB_REFHOLD(irb); 22388 for (ire1 = ire->ire_next; 22389 ire1 != NULL; 22390 ire1 = ire1->ire_next) { 22391 if ((ire1->ire_flags & RTF_MULTIRT) == 0) 22392 continue; 22393 if (ire1->ire_addr != ire->ire_addr) 22394 continue; 22395 if (ire1->ire_marks & 22396 (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN)) 22397 continue; 22398 /* 22399 * Ensure we do not exceed the MTU 22400 * of the next route. 22401 */ 22402 if (ire1->ire_max_frag < max_frag) { 22403 ip_multirt_bad_mtu(ire1, max_frag); 22404 continue; 22405 } 22406 22407 /* Got one. */ 22408 IRE_REFHOLD(ire1); 22409 break; 22410 } 22411 IRB_REFRELE(irb); 22412 22413 if (ire1 != NULL) { 22414 next_mp = copyb(mp); 22415 if ((next_mp == NULL) || 22416 ((mp->b_cont != NULL) && 22417 ((next_mp->b_cont = 22418 dupmsg(mp->b_cont)) == NULL))) { 22419 freemsg(next_mp); 22420 next_mp = NULL; 22421 ire_refrele(ire1); 22422 ire1 = NULL; 22423 } 22424 } 22425 22426 /* Last multiroute ire; don't loop anymore. 
*/ 22427 if (ire1 == NULL) { 22428 multirt_send = B_FALSE; 22429 } 22430 } 22431 22432 ll_hdr_len = 0; 22433 LOCK_IRE_FP_MP(ire); 22434 ll_hdr_mp = ire->ire_fp_mp; 22435 if (ll_hdr_mp != NULL) { 22436 ASSERT(ll_hdr_mp->b_datap->db_type == M_DATA); 22437 ll_hdr_len = ll_hdr_mp->b_wptr - ll_hdr_mp->b_rptr; 22438 } else { 22439 ll_hdr_mp = ire->ire_dlureq_mp; 22440 } 22441 22442 /* If there is a transmit header, get a copy for this frag. */ 22443 /* 22444 * TODO: should check db_ref before calling ip_carve_mp since 22445 * it might give us a dup. 22446 */ 22447 if (!ll_hdr_mp) { 22448 /* No xmit header. */ 22449 xmit_mp = mp; 22450 } else if (mp->b_datap->db_ref == 1 && 22451 ll_hdr_len != 0 && 22452 ll_hdr_len <= mp->b_rptr - mp->b_datap->db_base) { 22453 /* M_DATA fastpath */ 22454 mp->b_rptr -= ll_hdr_len; 22455 bcopy(ll_hdr_mp->b_rptr, mp->b_rptr, ll_hdr_len); 22456 xmit_mp = mp; 22457 } else if (!(xmit_mp = copyb(ll_hdr_mp))) { 22458 UNLOCK_IRE_FP_MP(ire); 22459 BUMP_MIB(&ip_mib, ipOutDiscards); 22460 freeb(hdr_mp); 22461 freemsg(mp); 22462 freemsg(mp_orig); 22463 TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, 22464 "ip_wput_frag_end:(%S)", 22465 "discard"); 22466 22467 if (multirt_send) { 22468 ASSERT(ire1); 22469 ASSERT(next_mp); 22470 22471 freemsg(next_mp); 22472 ire_refrele(ire1); 22473 } 22474 if (save_ire != NULL) 22475 IRE_REFRELE(save_ire); 22476 22477 if (first_ire != NULL) 22478 ire_refrele(first_ire); 22479 return; 22480 } else { 22481 xmit_mp->b_cont = mp; 22482 if (DB_CRED(mp) != NULL) 22483 mblk_setcred(xmit_mp, DB_CRED(mp)); 22484 /* Get priority marking, if any. */ 22485 if (DB_TYPE(xmit_mp) == M_DATA) 22486 xmit_mp->b_band = mp->b_band; 22487 } 22488 UNLOCK_IRE_FP_MP(ire); 22489 q = ire->ire_stq; 22490 BUMP_MIB(&ip_mib, ipFragCreates); 22491 putnext(q, xmit_mp); 22492 if (pkt_type != OB_PKT) { 22493 /* 22494 * Update the packet count of trailing 22495 * RTF_MULTIRT ires. 
22496 */ 22497 UPDATE_OB_PKT_COUNT(ire); 22498 } 22499 22500 if (multirt_send) { 22501 /* 22502 * We are in a multiple send case; look for 22503 * the next ire and re-enter the loop. 22504 */ 22505 ASSERT(ire1); 22506 ASSERT(next_mp); 22507 /* REFRELE the current ire before looping */ 22508 ire_refrele(ire); 22509 ire = ire1; 22510 ire1 = NULL; 22511 mp = next_mp; 22512 next_mp = NULL; 22513 } 22514 } while (multirt_send); 22515 22516 ASSERT(ire1 == NULL); 22517 22518 /* Restore the original ire; we need it for the trailing frags */ 22519 if (save_ire != NULL) { 22520 /* REFRELE the last iterated ire */ 22521 ire_refrele(ire); 22522 /* save_ire has been REFHOLDed */ 22523 ire = save_ire; 22524 save_ire = NULL; 22525 q = ire->ire_stq; 22526 } 22527 22528 if (pkt_type == OB_PKT) { 22529 UPDATE_OB_PKT_COUNT(ire); 22530 } else { 22531 UPDATE_IB_PKT_COUNT(ire); 22532 } 22533 22534 /* Advance the offset to the second frag starting point. */ 22535 offset += len; 22536 /* 22537 * Update hdr_len from the copied header - there might be less options 22538 * in the later fragments. 22539 */ 22540 hdr_len = IPH_HDR_LENGTH(hdr_mp->b_rptr); 22541 /* Loop until done. */ 22542 for (;;) { 22543 uint16_t offset_and_flags; 22544 uint16_t ip_len; 22545 22546 if (ip_data_end - offset > len) { 22547 /* 22548 * Carve off the appropriate amount from the original 22549 * datagram. 22550 */ 22551 if (!(carve_mp = ip_carve_mp(&mp_orig, len))) { 22552 mp = NULL; 22553 break; 22554 } 22555 /* 22556 * More frags after this one. Get another copy 22557 * of the header. 
22558 */ 22559 if (carve_mp->b_datap->db_ref == 1 && 22560 hdr_mp->b_wptr - hdr_mp->b_rptr < 22561 carve_mp->b_rptr - carve_mp->b_datap->db_base) { 22562 /* Inline IP header */ 22563 carve_mp->b_rptr -= hdr_mp->b_wptr - 22564 hdr_mp->b_rptr; 22565 bcopy(hdr_mp->b_rptr, carve_mp->b_rptr, 22566 hdr_mp->b_wptr - hdr_mp->b_rptr); 22567 mp = carve_mp; 22568 } else { 22569 if (!(mp = copyb(hdr_mp))) { 22570 freemsg(carve_mp); 22571 break; 22572 } 22573 /* Get priority marking, if any. */ 22574 mp->b_band = carve_mp->b_band; 22575 mp->b_cont = carve_mp; 22576 } 22577 ipha = (ipha_t *)mp->b_rptr; 22578 offset_and_flags = IPH_MF; 22579 } else { 22580 /* 22581 * Last frag. Consume the header. Set len to 22582 * the length of this last piece. 22583 */ 22584 len = ip_data_end - offset; 22585 22586 /* 22587 * Carve off the appropriate amount from the original 22588 * datagram. 22589 */ 22590 if (!(carve_mp = ip_carve_mp(&mp_orig, len))) { 22591 mp = NULL; 22592 break; 22593 } 22594 if (carve_mp->b_datap->db_ref == 1 && 22595 hdr_mp->b_wptr - hdr_mp->b_rptr < 22596 carve_mp->b_rptr - carve_mp->b_datap->db_base) { 22597 /* Inline IP header */ 22598 carve_mp->b_rptr -= hdr_mp->b_wptr - 22599 hdr_mp->b_rptr; 22600 bcopy(hdr_mp->b_rptr, carve_mp->b_rptr, 22601 hdr_mp->b_wptr - hdr_mp->b_rptr); 22602 mp = carve_mp; 22603 freeb(hdr_mp); 22604 hdr_mp = mp; 22605 } else { 22606 mp = hdr_mp; 22607 /* Get priority marking, if any. */ 22608 mp->b_band = carve_mp->b_band; 22609 mp->b_cont = carve_mp; 22610 } 22611 ipha = (ipha_t *)mp->b_rptr; 22612 /* A frag of a frag might have IPH_MF non-zero */ 22613 offset_and_flags = 22614 ntohs(ipha->ipha_fragment_offset_and_flags) & 22615 IPH_MF; 22616 } 22617 offset_and_flags |= (uint16_t)(offset >> 3); 22618 offset_and_flags |= (uint16_t)frag_flag; 22619 /* Store the offset and flags in the IP header. */ 22620 ipha->ipha_fragment_offset_and_flags = htons(offset_and_flags); 22621 22622 /* Store the length in the IP header. 
*/ 22623 ip_len = (uint16_t)(len + hdr_len); 22624 ipha->ipha_length = htons(ip_len); 22625 22626 /* 22627 * Set the IP header checksum. Note that mp is just 22628 * the header, so this is easy to pass to ip_csum. 22629 */ 22630 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 22631 22632 /* Attach a transmit header, if any, and ship it. */ 22633 if (pkt_type == OB_PKT) { 22634 UPDATE_OB_PKT_COUNT(ire); 22635 } else { 22636 UPDATE_IB_PKT_COUNT(ire); 22637 } 22638 22639 if (ire->ire_flags & RTF_MULTIRT) { 22640 irb = ire->ire_bucket; 22641 ASSERT(irb != NULL); 22642 22643 multirt_send = B_TRUE; 22644 22645 /* 22646 * Save the original ire; we will need to restore it 22647 * for the tailing frags. 22648 */ 22649 save_ire = ire; 22650 IRE_REFHOLD(save_ire); 22651 } 22652 /* 22653 * Emission loop for this fragment, similar 22654 * to what is done for the first fragment. 22655 */ 22656 do { 22657 if (multirt_send) { 22658 /* 22659 * We are in a multiple send case, need to get 22660 * the next ire and make a copy of the packet. 22661 */ 22662 ASSERT(irb != NULL); 22663 IRB_REFHOLD(irb); 22664 for (ire1 = ire->ire_next; 22665 ire1 != NULL; 22666 ire1 = ire1->ire_next) { 22667 if (!(ire1->ire_flags & RTF_MULTIRT)) 22668 continue; 22669 if (ire1->ire_addr != ire->ire_addr) 22670 continue; 22671 if (ire1->ire_marks & 22672 (IRE_MARK_CONDEMNED| 22673 IRE_MARK_HIDDEN)) 22674 continue; 22675 /* 22676 * Ensure we do not exceed the MTU 22677 * of the next route. 22678 */ 22679 if (ire1->ire_max_frag < max_frag) { 22680 ip_multirt_bad_mtu(ire1, 22681 max_frag); 22682 continue; 22683 } 22684 22685 /* Got one. 
*/ 22686 IRE_REFHOLD(ire1); 22687 break; 22688 } 22689 IRB_REFRELE(irb); 22690 22691 if (ire1 != NULL) { 22692 next_mp = copyb(mp); 22693 if ((next_mp == NULL) || 22694 ((mp->b_cont != NULL) && 22695 ((next_mp->b_cont = 22696 dupmsg(mp->b_cont)) == NULL))) { 22697 freemsg(next_mp); 22698 next_mp = NULL; 22699 ire_refrele(ire1); 22700 ire1 = NULL; 22701 } 22702 } 22703 22704 /* Last multiroute ire; don't loop anymore. */ 22705 if (ire1 == NULL) { 22706 multirt_send = B_FALSE; 22707 } 22708 } 22709 22710 /* Update transmit header */ 22711 ll_hdr_len = 0; 22712 LOCK_IRE_FP_MP(ire); 22713 ll_hdr_mp = ire->ire_fp_mp; 22714 if (ll_hdr_mp != NULL) { 22715 ASSERT(ll_hdr_mp->b_datap->db_type == M_DATA); 22716 ll_hdr_len = MBLKL(ll_hdr_mp); 22717 } else { 22718 ll_hdr_mp = ire->ire_dlureq_mp; 22719 } 22720 22721 if (!ll_hdr_mp) { 22722 xmit_mp = mp; 22723 } else if (mp->b_datap->db_ref == 1 && 22724 ll_hdr_len != 0 && 22725 ll_hdr_len <= mp->b_rptr - mp->b_datap->db_base) { 22726 /* M_DATA fastpath */ 22727 mp->b_rptr -= ll_hdr_len; 22728 bcopy(ll_hdr_mp->b_rptr, mp->b_rptr, 22729 ll_hdr_len); 22730 xmit_mp = mp; 22731 } else if ((xmit_mp = copyb(ll_hdr_mp)) != NULL) { 22732 xmit_mp->b_cont = mp; 22733 if (DB_CRED(mp) != NULL) 22734 mblk_setcred(xmit_mp, DB_CRED(mp)); 22735 /* Get priority marking, if any. */ 22736 if (DB_TYPE(xmit_mp) == M_DATA) 22737 xmit_mp->b_band = mp->b_band; 22738 } else { 22739 /* 22740 * Exit both the replication and 22741 * fragmentation loops. 22742 */ 22743 UNLOCK_IRE_FP_MP(ire); 22744 goto drop_pkt; 22745 } 22746 UNLOCK_IRE_FP_MP(ire); 22747 BUMP_MIB(&ip_mib, ipFragCreates); 22748 putnext(q, xmit_mp); 22749 22750 if (pkt_type != OB_PKT) { 22751 /* 22752 * Update the packet count of trailing 22753 * RTF_MULTIRT ires. 22754 */ 22755 UPDATE_OB_PKT_COUNT(ire); 22756 } 22757 22758 /* All done if we just consumed the hdr_mp. 
*/ 22759 if (mp == hdr_mp) { 22760 last_frag = B_TRUE; 22761 } 22762 22763 if (multirt_send) { 22764 /* 22765 * We are in a multiple send case; look for 22766 * the next ire and re-enter the loop. 22767 */ 22768 ASSERT(ire1); 22769 ASSERT(next_mp); 22770 /* REFRELE the current ire before looping */ 22771 ire_refrele(ire); 22772 ire = ire1; 22773 ire1 = NULL; 22774 q = ire->ire_stq; 22775 mp = next_mp; 22776 next_mp = NULL; 22777 } 22778 } while (multirt_send); 22779 /* 22780 * Restore the original ire; we need it for the 22781 * trailing frags 22782 */ 22783 if (save_ire != NULL) { 22784 ASSERT(ire1 == NULL); 22785 /* REFRELE the last iterated ire */ 22786 ire_refrele(ire); 22787 /* save_ire has been REFHOLDed */ 22788 ire = save_ire; 22789 q = ire->ire_stq; 22790 save_ire = NULL; 22791 } 22792 22793 if (last_frag) { 22794 BUMP_MIB(&ip_mib, ipFragOKs); 22795 TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, 22796 "ip_wput_frag_end:(%S)", 22797 "consumed hdr_mp"); 22798 22799 if (first_ire != NULL) 22800 ire_refrele(first_ire); 22801 return; 22802 } 22803 /* Otherwise, advance and loop. */ 22804 offset += len; 22805 } 22806 22807 drop_pkt: 22808 /* Clean up following allocation failure. 
*/ 22809 BUMP_MIB(&ip_mib, ipOutDiscards); 22810 freemsg(mp); 22811 if (mp != hdr_mp) 22812 freeb(hdr_mp); 22813 if (mp != mp_orig) 22814 freemsg(mp_orig); 22815 22816 if (save_ire != NULL) 22817 IRE_REFRELE(save_ire); 22818 if (first_ire != NULL) 22819 ire_refrele(first_ire); 22820 22821 TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, 22822 "ip_wput_frag_end:(%S)", 22823 "end--alloc failure"); 22824 } 22825 22826 /* 22827 * Copy the header plus those options which have the copy bit set 22828 */ 22829 static mblk_t * 22830 ip_wput_frag_copyhdr(uchar_t *rptr, int hdr_len, int offset) 22831 { 22832 mblk_t *mp; 22833 uchar_t *up; 22834 22835 /* 22836 * Quick check if we need to look for options without the copy bit 22837 * set 22838 */ 22839 mp = allocb(ip_wroff_extra + hdr_len, BPRI_HI); 22840 if (!mp) 22841 return (mp); 22842 mp->b_rptr += ip_wroff_extra; 22843 if (hdr_len == IP_SIMPLE_HDR_LENGTH || offset != 0) { 22844 bcopy(rptr, mp->b_rptr, hdr_len); 22845 mp->b_wptr += hdr_len + ip_wroff_extra; 22846 return (mp); 22847 } 22848 up = mp->b_rptr; 22849 bcopy(rptr, up, IP_SIMPLE_HDR_LENGTH); 22850 up += IP_SIMPLE_HDR_LENGTH; 22851 rptr += IP_SIMPLE_HDR_LENGTH; 22852 hdr_len -= IP_SIMPLE_HDR_LENGTH; 22853 while (hdr_len > 0) { 22854 uint32_t optval; 22855 uint32_t optlen; 22856 22857 optval = *rptr; 22858 if (optval == IPOPT_EOL) 22859 break; 22860 if (optval == IPOPT_NOP) 22861 optlen = 1; 22862 else 22863 optlen = rptr[1]; 22864 if (optval & IPOPT_COPY) { 22865 bcopy(rptr, up, optlen); 22866 up += optlen; 22867 } 22868 rptr += optlen; 22869 hdr_len -= optlen; 22870 } 22871 /* 22872 * Make sure that we drop an even number of words by filling 22873 * with EOL to the next word boundary. 
22874 */ 22875 for (hdr_len = up - (mp->b_rptr + IP_SIMPLE_HDR_LENGTH); 22876 hdr_len & 0x3; hdr_len++) 22877 *up++ = IPOPT_EOL; 22878 mp->b_wptr = up; 22879 /* Update header length */ 22880 mp->b_rptr[0] = (uint8_t)((IP_VERSION << 4) | ((up - mp->b_rptr) >> 2)); 22881 return (mp); 22882 } 22883 22884 /* 22885 * Delivery to local recipients including fanout to multiple recipients. 22886 * Does not do checksumming of UDP/TCP. 22887 * Note: q should be the read side queue for either the ill or conn. 22888 * Note: rq should be the read side q for the lower (ill) stream. 22889 * We don't send packets to IPPF processing, thus the last argument 22890 * to all the fanout calls are B_FALSE. 22891 */ 22892 void 22893 ip_wput_local(queue_t *q, ill_t *ill, ipha_t *ipha, mblk_t *mp, ire_t *ire, 22894 int fanout_flags, zoneid_t zoneid) 22895 { 22896 uint32_t protocol; 22897 mblk_t *first_mp; 22898 boolean_t mctl_present; 22899 int ire_type; 22900 #define rptr ((uchar_t *)ipha) 22901 22902 TRACE_1(TR_FAC_IP, TR_IP_WPUT_LOCAL_START, 22903 "ip_wput_local_start: q %p", q); 22904 22905 if (ire != NULL) { 22906 ire_type = ire->ire_type; 22907 } else { 22908 /* 22909 * Only ip_multicast_loopback() calls us with a NULL ire. If the 22910 * packet is not multicast, we can't tell the ire type. 22911 */ 22912 ASSERT(CLASSD(ipha->ipha_dst)); 22913 ire_type = IRE_BROADCAST; 22914 } 22915 22916 first_mp = mp; 22917 if (first_mp->b_datap->db_type == M_CTL) { 22918 ipsec_out_t *io = (ipsec_out_t *)first_mp->b_rptr; 22919 if (!io->ipsec_out_secure) { 22920 /* 22921 * This ipsec_out_t was allocated in ip_wput 22922 * for multicast packets to store the ill_index. 22923 * As this is being delivered locally, we don't 22924 * need this anymore. 
             */
            mp = first_mp->b_cont;
            freeb(first_mp);
            first_mp = mp;
            mctl_present = B_FALSE;
        } else {
            mctl_present = B_TRUE;
            mp = first_mp->b_cont;
            ASSERT(mp != NULL);
            /* Flip the M_CTL to inbound form for local delivery. */
            ipsec_out_to_in(first_mp);
        }
    } else {
        mctl_present = B_FALSE;
    }

    loopback_packets++;

    ip2dbg(("ip_wput_local: from 0x%x to 0x%x in zone %d\n",
        ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst), zoneid));
    if (!IS_SIMPLE_IPH(ipha)) {
        ip_wput_local_options(ipha);
    }

    protocol = ipha->ipha_protocol;
    switch (protocol) {
    case IPPROTO_ICMP: {
        ire_t *ire_zone;
        ilm_t *ilm;
        mblk_t *mp1;
        zoneid_t last_zoneid;

        if (CLASSD(ipha->ipha_dst) &&
            !(ill->ill_phyint->phyint_flags & PHYI_LOOPBACK)) {
            ASSERT(ire_type == IRE_BROADCAST);
            /*
             * In the multicast case, applications may have joined
             * the group from different zones, so we need to deliver
             * the packet to each of them. Loop through the
             * multicast memberships structures (ilm) on the receive
             * ill and send a copy of the packet up each matching
             * one. However, we don't do this for multicasts sent on
             * the loopback interface (PHYI_LOOPBACK flag set) as
             * they must stay in the sender's zone.
             *
             * ilm_add_v6() ensures that ilms in the same zone are
             * contiguous in the ill_ilm list. We use this property
             * to avoid sending duplicates needed when two
             * applications in the same zone join the same group on
             * different logical interfaces: we ignore the ilm if
             * its zoneid is the same as the last matching one.
             * In addition, the sending of the packet for
             * ire_zoneid is delayed until all of the other ilms
             * have been exhausted.
             */
            last_zoneid = -1;
            ILM_WALKER_HOLD(ill);
            for (ilm = ill->ill_ilm; ilm != NULL;
                ilm = ilm->ilm_next) {
                if ((ilm->ilm_flags & ILM_DELETED) ||
                    ipha->ipha_dst != ilm->ilm_addr ||
                    ilm->ilm_zoneid == last_zoneid ||
                    ilm->ilm_zoneid == zoneid ||
                    !(ilm->ilm_ipif->ipif_flags & IPIF_UP))
                    continue;
                mp1 = ip_copymsg(first_mp);
                if (mp1 == NULL)
                    continue;
                icmp_inbound(q, mp1, B_TRUE, ill, 0, 0,
                    mctl_present, B_FALSE, ill,
                    ilm->ilm_zoneid);
                last_zoneid = ilm->ilm_zoneid;
            }
            ILM_WALKER_RELE(ill);
            /*
             * Loopback case: the sending endpoint has
             * IP_MULTICAST_LOOP disabled, therefore we don't
             * dispatch the multicast packet to the sending zone.
             */
            if (fanout_flags & IP_FF_NO_MCAST_LOOP) {
                freemsg(first_mp);
                return;
            }
        } else if (ire_type == IRE_BROADCAST) {
            /*
             * In the broadcast case, there may be many zones
             * which need a copy of the packet delivered to them.
             * There is one IRE_BROADCAST per broadcast address
             * and per zone; we walk those using a helper function.
             * In addition, the sending of the packet for zoneid is
             * delayed until all of the other ires have been
             * processed.
             */
            IRB_REFHOLD(ire->ire_bucket);
            ire_zone = NULL;
            while ((ire_zone = ire_get_next_bcast_ire(ire_zone,
                ire)) != NULL) {
                mp1 = ip_copymsg(first_mp);
                if (mp1 == NULL)
                    continue;

                UPDATE_IB_PKT_COUNT(ire_zone);
                ire_zone->ire_last_used_time = lbolt;
                icmp_inbound(q, mp1, B_TRUE, ill, 0, 0,
                    mctl_present, B_FALSE, ill,
                    ire_zone->ire_zoneid);
            }
            IRB_REFRELE(ire->ire_bucket);
        }
        /* Finally, the copy for the zone the packet was sent to. */
        icmp_inbound(q, first_mp, (ire_type == IRE_BROADCAST), ill, 0,
            0, mctl_present, B_FALSE, ill, zoneid);
        TRACE_2(TR_FAC_IP, TR_IP_WPUT_LOCAL_END,
            "ip_wput_local_end: q %p (%S)",
            q, "icmp");
        return;
    }
    case IPPROTO_IGMP:
        if (igmp_input(q, mp, ill)) {
            /* Bad packet - discarded by igmp_input */
            TRACE_2(TR_FAC_IP, TR_IP_WPUT_LOCAL_END,
                "ip_wput_local_end: q %p (%S)",
                q, "igmp_input--bad packet");
            /* igmp_input consumed mp; free the M_CTL if present. */
            if (mctl_present)
                freeb(first_mp);
            return;
        }
        /*
         * igmp_input() may have pulled up the message so ipha needs to
         * be reinitialized.
         */
        ipha = (ipha_t *)mp->b_rptr;
        /* deliver to local raw users */
        break;
    case IPPROTO_ENCAP:
        /*
         * This case is covered by either ip_fanout_proto, or by
         * the above security processing for self-tunneled packets.
         */
        break;
    case IPPROTO_UDP: {
        uint16_t *up;
        uint32_t ports;

        up = (uint16_t *)(rptr + IPH_HDR_LENGTH(ipha) +
            UDP_PORTS_OFFSET);
        /*
         * Force a 'valid' checksum: loopback delivery never
         * computed one, and 0 means "no checksum" for UDP.
         */
        up[3] = 0;

        /* Source and destination port packed into one word. */
        ports = *(uint32_t *)up;
        ip_fanout_udp(q, first_mp, ill, ipha, ports,
            (ire_type == IRE_BROADCAST),
            fanout_flags | IP_FF_SEND_ICMP | IP_FF_HDR_COMPLETE |
            IP_FF_SEND_SLLA | IP_FF_IP6INFO, mctl_present, B_FALSE,
            ill, zoneid);
        TRACE_2(TR_FAC_IP, TR_IP_WPUT_LOCAL_END,
            "ip_wput_local_end: q %p (%S)", q, "ip_fanout_udp");
        return;
    }
    case IPPROTO_TCP: {

        /*
         * For TCP, discard broadcast packets.
         */
        if ((ushort_t)ire_type == IRE_BROADCAST) {
            freemsg(first_mp);
            BUMP_MIB(&ip_mib, ipInDiscards);
            ip2dbg(("ip_wput_local: discard broadcast\n"));
            return;
        }

        if (mp->b_datap->db_type == M_DATA) {
            /*
             * M_DATA mblk, so init mblk (chain) for no struio().
             */
            mblk_t *mp1 = mp;

            do
                mp1->b_datap->db_struioflag = 0;
            while ((mp1 = mp1->b_cont) != NULL);
        }
        ASSERT((rptr + IPH_HDR_LENGTH(ipha) + TCP_PORTS_OFFSET + 4)
            <= mp->b_wptr);
        ip_fanout_tcp(q, first_mp, ill, ipha,
            fanout_flags | IP_FF_SEND_ICMP | IP_FF_HDR_COMPLETE |
            IP_FF_SYN_ADDIRE | IP_FF_IP6INFO,
            mctl_present, B_FALSE, zoneid);
        TRACE_2(TR_FAC_IP, TR_IP_WPUT_LOCAL_END,
            "ip_wput_local_end: q %p (%S)", q, "ip_fanout_tcp");
        return;
    }
    case IPPROTO_SCTP:
    {
        uint32_t ports;

        bcopy(rptr + IPH_HDR_LENGTH(ipha), &ports, sizeof (ports));
        ip_fanout_sctp(first_mp, ill, ipha, ports,
            fanout_flags | IP_FF_SEND_ICMP | IP_FF_HDR_COMPLETE |
            IP_FF_IP6INFO,
            mctl_present, B_FALSE, 0, zoneid);
        return;
    }

    default:
        break;
    }
    /*
     * Find a client for some other protocol. We give
     * copies to multiple clients, if more than one is
     * bound.
     */
    ip_fanout_proto(q, first_mp, ill, ipha,
        fanout_flags | IP_FF_SEND_ICMP | IP_FF_HDR_COMPLETE | IP_FF_RAWIP,
        mctl_present, B_FALSE, ill, zoneid);
    TRACE_2(TR_FAC_IP, TR_IP_WPUT_LOCAL_END,
        "ip_wput_local_end: q %p (%S)", q, "ip_fanout_proto");
#undef rptr
}

/*
 * Update any source route, record route, or timestamp options.
 * Check that we are at end of strict source route.
 * The options have been sanity checked by ip_wput_options().
 *
 * Rewrites the options in place in the header pointed to by 'ipha';
 * no return value.
 */
static void
ip_wput_local_options(ipha_t *ipha)
{
    ipoptp_t opts;
    uchar_t *opt;
    uint8_t optval;
    uint8_t optlen;
    ipaddr_t dst;
    uint32_t ts;
    ire_t *ire;
    timestruc_t now;

    ip2dbg(("ip_wput_local_options\n"));
    for (optval = ipoptp_first(&opts, ipha);
        optval != IPOPT_EOL;
        optval = ipoptp_next(&opts)) {
        opt = opts.ipoptp_cur;
        optlen = opts.ipoptp_len;
        ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0);
        switch (optval) {
        uint32_t off;
        case IPOPT_SSRR:
        case IPOPT_LSRR:
            off = opt[IPOPT_OFFSET];
            off--;
            if (optlen < IP_ADDR_LEN ||
                off > optlen - IP_ADDR_LEN) {
                /* End of source route */
                break;
            }
            /*
             * This will only happen if two consecutive entries
             * in the source route contains our address or if
             * it is a packet with a loose source route which
             * reaches us before consuming the whole source route
             */
            ip1dbg(("ip_wput_local_options: not end of SR\n"));
            if (optval == IPOPT_SSRR) {
                return;
            }
            /*
             * Hack: instead of dropping the packet truncate the
             * source route to what has been used by filling the
             * rest with IPOPT_NOP.
             */
            opt[IPOPT_OLEN] = (uint8_t)off;
            while (off < optlen) {
                opt[off++] = IPOPT_NOP;
            }
            break;
        case IPOPT_RR:
            off = opt[IPOPT_OFFSET];
            off--;
            if (optlen < IP_ADDR_LEN ||
                off > optlen - IP_ADDR_LEN) {
                /* No more room - ignore */
                /*
                 * NOTE(review): the message below names
                 * "ip_wput_forward_options" but this is
                 * ip_wput_local_options - looks like a
                 * copy/paste artifact in the debug string.
                 */
                ip1dbg((
                    "ip_wput_forward_options: end of RR\n"));
                break;
            }
            /* Local delivery: record the loopback address. */
            dst = htonl(INADDR_LOOPBACK);
            bcopy(&dst, (char *)opt + off, IP_ADDR_LEN);
            opt[IPOPT_OFFSET] += IP_ADDR_LEN;
            break;
        case IPOPT_TS:
            /* Insert timestamp if there is room */
            switch (opt[IPOPT_POS_OV_FLG] & 0x0F) {
            case IPOPT_TS_TSONLY:
                off = IPOPT_TS_TIMELEN;
                break;
            case IPOPT_TS_PRESPEC:
            case IPOPT_TS_PRESPEC_RFC791:
                /* Verify that the address matched */
                off = opt[IPOPT_OFFSET] - 1;
                bcopy((char *)opt + off, &dst, IP_ADDR_LEN);
                ire = ire_ctable_lookup(dst, 0, IRE_LOCAL,
                    NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE);
                if (ire == NULL) {
                    /* Not for us */
                    break;
                }
                ire_refrele(ire);
                /* FALLTHRU */
            case IPOPT_TS_TSANDADDR:
                off = IP_ADDR_LEN + IPOPT_TS_TIMELEN;
                break;
            default:
                /*
                 * ip_*put_options should have already
                 * dropped this packet.
                 */
                cmn_err(CE_PANIC, "ip_wput_local_options: "
                    "unknown IT - bug in ip_wput_options?\n");
                return;    /* Keep "lint" happy */
            }
            if (opt[IPOPT_OFFSET] - 1 + off > optlen) {
                /* Increase overflow counter */
                off = (opt[IPOPT_POS_OV_FLG] >> 4) + 1;
                opt[IPOPT_POS_OV_FLG] = (uint8_t)
                    (opt[IPOPT_POS_OV_FLG] & 0x0F) |
                    (off << 4);
                break;
            }
            off = opt[IPOPT_OFFSET] - 1;
            switch (opt[IPOPT_POS_OV_FLG] & 0x0F) {
            case IPOPT_TS_PRESPEC:
            case IPOPT_TS_PRESPEC_RFC791:
            case IPOPT_TS_TSANDADDR:
                /* Address-bearing formats record loopback first. */
                dst = htonl(INADDR_LOOPBACK);
                bcopy(&dst, (char *)opt + off, IP_ADDR_LEN);
                opt[IPOPT_OFFSET] += IP_ADDR_LEN;
                /* FALLTHRU */
            case IPOPT_TS_TSONLY:
                off = opt[IPOPT_OFFSET] - 1;
                /* Compute # of milliseconds since midnight */
                gethrestime(&now);
                ts = (now.tv_sec % (24 * 60 * 60)) * 1000 +
                    now.tv_nsec / (NANOSEC / MILLISEC);
                bcopy(&ts, (char *)opt + off, IPOPT_TS_TIMELEN);
                opt[IPOPT_OFFSET] += IPOPT_TS_TIMELEN;
                break;
            }
            break;
        }
    }
}

/*
 * Send out a multicast packet on interface ipif.
 * The sender does not have a conn.
 * Caller verifies that this isn't a PHYI_LOOPBACK.
 */
void
ip_wput_multicast(queue_t *q, mblk_t *mp, ipif_t *ipif)
{
    ipha_t *ipha;
    ire_t *ire;
    ipaddr_t dst;
    mblk_t *first_mp;

    /* igmp_sendpkt always allocates a ipsec_out_t */
    ASSERT(mp->b_datap->db_type == M_CTL);
    ASSERT(!ipif->ipif_isv6);
    ASSERT(!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_LOOPBACK));

    first_mp = mp;
    mp = first_mp->b_cont;
    ASSERT(mp->b_datap->db_type == M_DATA);
    ipha = (ipha_t *)mp->b_rptr;

    /*
     * Find an IRE which matches the destination and the outgoing
     * queue (i.e. the outgoing interface.)
     */
    if (ipif->ipif_flags & IPIF_POINTOPOINT)
        dst = ipif->ipif_pp_dst_addr;
    else
        dst = ipha->ipha_dst;
    /*
     * The source address has already been initialized by the
     * caller and hence matching on ILL (MATCH_IRE_ILL) would
     * be sufficient rather than MATCH_IRE_IPIF.
     *
     * This function is used for sending IGMP packets. We need
     * to make sure that we send the packet out of the interface
     * (ipif->ipif_ill) where we joined the group. This is to
     * prevent from switches doing IGMP snooping to send us multicast
     * packets for a given group on the interface we have joined.
     * If we can't find an ire, igmp_sendpkt has already initialized
     * ipsec_out_attach_if so that this will not be load spread in
     * ip_newroute_ipif.
     */
    ire = ire_ctable_lookup(dst, 0, 0, ipif, ALL_ZONES, NULL,
        MATCH_IRE_ILL);
    if (!ire) {
        /*
         * Mark this packet to make it be delivered to
         * ip_wput_ire after the new ire has been
         * created.
         */
        mp->b_prev = NULL;
        mp->b_next = NULL;
        ip_newroute_ipif(q, first_mp, ipif, dst, NULL, RTF_SETSRC);
        return;
    }

    /*
     * Honor the RTF_SETSRC flag; this is the only case
     * where we force this addr whatever the current src addr is,
     * because this address is set by igmp_sendpkt(), and
     * cannot be specified by any user.
     */
    if (ire->ire_flags & RTF_SETSRC) {
        ipha->ipha_src = ire->ire_src_addr;
    }

    /*
     * NOTE(review): the ire reference from the lookup above is
     * presumably consumed by ip_wput_ire() - confirm; there is no
     * ire_refrele() on this path.
     */
    ip_wput_ire(q, first_mp, ire, NULL, B_FALSE);
}

/*
 * NOTE : This function does not ire_refrele the ire argument passed in.
 *
 * Copy the link layer header and do IPQoS if needed. Frees the mblk on
 * failure. The ire_fp_mp can vanish any time in the case of IRE_MIPRTUN
 * and IRE_BROADCAST due to DL_NOTE_FASTPATH_FLUSH. Hence we have to hold
 * the ire_lock to access the ire_fp_mp in this case.
 * IPQoS assumes that the first M_DATA contains the IP header. So, if we are
 * prepending a fastpath message IPQoS processing must precede it, we also set
 * the b_band of the fastpath message to that of the mblk returned by IPQoS
 * (IPQoS might have set the b_band for CoS marking).
 * However, if we are prepending DL_UNITDATA_REQ message, IPQoS processing
 * must follow it so that IPQoS can mark the dl_priority field for CoS
 * marking, if needed.
 *
 * Returns the mblk to transmit (fastpath header prepended or pulled in
 * front of the packet), or NULL on allocation/IPP failure.
 */
static mblk_t *
ip_wput_attach_llhdr(mblk_t *mp, ire_t *ire, ip_proc_t proc, uint32_t ill_index)
{
    uint_t hlen;
    ipha_t *ipha;
    mblk_t *mp1;
    boolean_t qos_done = B_FALSE;
    uchar_t *ll_hdr;

#define rptr ((uchar_t *)ipha)

    ipha = (ipha_t *)mp->b_rptr;
    hlen = 0;
    LOCK_IRE_FP_MP(ire);
    if ((mp1 = ire->ire_fp_mp) != NULL) {
        ASSERT(DB_TYPE(mp1) == M_DATA);
        /* Initiate IPPF processing */
        if ((proc != 0) && IPP_ENABLED(proc)) {
            /* Must drop the lock across ip_process(). */
            UNLOCK_IRE_FP_MP(ire);
            ip_process(proc, &mp, ill_index);
            if (mp == NULL)
                return (NULL);

            ipha = (ipha_t *)mp->b_rptr;
            LOCK_IRE_FP_MP(ire);
            /* ire_fp_mp may have been flushed meanwhile. */
            if ((mp1 = ire->ire_fp_mp) == NULL) {
                qos_done = B_TRUE;
                goto no_fp_mp;
            }
            ASSERT(DB_TYPE(mp1) == M_DATA);
        }
        hlen = MBLKL(mp1);
        /*
         * Check if we have enough room to prepend fastpath
         * header
         */
        if (hlen != 0 && (rptr - mp->b_datap->db_base) >= hlen) {
            ll_hdr = rptr - hlen;
            bcopy(mp1->b_rptr, ll_hdr, hlen);
            /* XXX ipha is not aligned here */
            ipha = (ipha_t *)(rptr - hlen);
            /*
             * Set the b_rptr to the start of the link layer
             * header (rptr re-expands via the new ipha).
             */
            mp->b_rptr = rptr;
            mp1 = mp;
        } else {
            mp1 = copyb(mp1);
            if (mp1 == NULL)
                goto unlock_err;
            mp1->b_band = mp->b_band;
            mp1->b_cont = mp;
            /*
             * certain system generated traffic may not
             * have cred/label in ip header block. This
             * is true even for a labeled system. But for
             * labeled traffic, inherit the label in the
             * new header.
             */
            if (DB_CRED(mp) != NULL)
                mblk_setcred(mp1, DB_CRED(mp));
            /*
             * XXX disable ICK_VALID and compute checksum
             * here; can happen if ire_fp_mp changes and
             * it can't be copied now due to insufficient
             * space. (unlikely, fp mp can change, but it
             * does not increase in length)
             */
        }
        UNLOCK_IRE_FP_MP(ire);
    } else {
    no_fp_mp:
        /* No fastpath header: prepend the DL_UNITDATA_REQ. */
        mp1 = copyb(ire->ire_dlureq_mp);
        if (mp1 == NULL) {
        unlock_err:
            UNLOCK_IRE_FP_MP(ire);
            freemsg(mp);
            return (NULL);
        }
        UNLOCK_IRE_FP_MP(ire);
        mp1->b_cont = mp;
        /*
         * certain system generated traffic may not
         * have cred/label in ip header block. This
         * is true even for a labeled system. But for
         * labeled traffic, inherit the label in the
         * new header.
         */
        if (DB_CRED(mp) != NULL)
            mblk_setcred(mp1, DB_CRED(mp));
        if (!qos_done && (proc != 0) && IPP_ENABLED(proc)) {
            /* IPQoS runs after the DLPI header is in place. */
            ip_process(proc, &mp1, ill_index);
            if (mp1 == NULL)
                return (NULL);
        }
    }
    return (mp1);
#undef rptr
}

/*
 * Finish the outbound IPsec processing for an IPv6 packet. This function
 * is called from ipsec_out_process() if the IPsec packet was processed
 * synchronously, or from {ah,esp}_kcf_callback() if it was processed
 * asynchronously.
 */
void
ip_wput_ipsec_out_v6(queue_t *q, mblk_t *ipsec_mp, ip6_t *ip6h, ill_t *ill,
    ire_t *ire_arg)
{
    in6_addr_t *v6dstp;
    ire_t *ire;
    mblk_t *mp;
    uint_t ill_index;
    ipsec_out_t *io;
    boolean_t attach_if, hwaccel;
    uint32_t flags = IP6_NO_IPPOLICY;
    int match_flags;
    zoneid_t zoneid;
    boolean_t ill_need_rele = B_FALSE;
    boolean_t ire_need_rele = B_FALSE;

    mp = ipsec_mp->b_cont;
    io = (ipsec_out_t *)ipsec_mp->b_rptr;
    ill_index = io->ipsec_out_ill_index;
    if (io->ipsec_out_reachable) {
        flags |= IPV6_REACHABILITY_CONFIRMATION;
    }
    attach_if = io->ipsec_out_attach_if;
    hwaccel = io->ipsec_out_accelerated;
    zoneid = io->ipsec_out_zoneid;
    ASSERT(zoneid != ALL_ZONES);
    match_flags = MATCH_IRE_ILL_GROUP | MATCH_IRE_SECATTR;
    /* Multicast addresses should have non-zero ill_index. */
    v6dstp = &ip6h->ip6_dst;
    ASSERT(ip6h->ip6_nxt != IPPROTO_RAW);
    ASSERT(!IN6_IS_ADDR_MULTICAST(v6dstp) || ill_index != 0);
    ASSERT(!attach_if || ill_index != 0);
    if (ill_index != 0) {
        if (ill == NULL) {
            ill = ip_grab_attach_ill(NULL, ipsec_mp, ill_index,
                B_TRUE);

            /* Failure case frees things for us. */
            if (ill == NULL)
                return;

            ill_need_rele = B_TRUE;
        }
        /*
         * If this packet needs to go out on a particular interface
         * honor it.
         */
        if (attach_if) {
            match_flags = MATCH_IRE_ILL;

            /*
             * Check if we need an ire that will not be
             * looked up by anybody else i.e. HIDDEN.
             */
            if (ill_is_probeonly(ill)) {
                match_flags |= MATCH_IRE_MARK_HIDDEN;
            }
        }
    }
    ASSERT(mp != NULL);

    if (IN6_IS_ADDR_MULTICAST(v6dstp)) {
        boolean_t unspec_src;
        ipif_t *ipif;

        /*
         * Use the ill_index to get the right ill.
         */
        unspec_src = io->ipsec_out_unspec_src;
        (void) ipif_lookup_zoneid(ill, zoneid, 0, &ipif);
        if (ipif == NULL) {
            if (ill_need_rele)
                ill_refrele(ill);
            freemsg(ipsec_mp);
            return;
        }

        if (ire_arg != NULL) {
            ire = ire_arg;
        } else {
            ire = ire_ctable_lookup_v6(v6dstp, 0, 0, ipif,
                zoneid, MBLK_GETLABEL(mp), match_flags);
            ire_need_rele = B_TRUE;
        }
        if (ire != NULL) {
            ipif_refrele(ipif);
            /*
             * XXX Do the multicast forwarding now, as the IPSEC
             * processing has been done.
             */
            goto send;
        }

        ip0dbg(("ip_wput_ipsec_out_v6: multicast: IRE disappeared\n"));
        mp->b_prev = NULL;
        mp->b_next = NULL;

        /*
         * If the IPsec packet was processed asynchronously,
         * drop it now.
         * NOTE(review): this return does not ipif_refrele(ipif),
         * unlike the other exits from this branch - possible
         * ipif reference leak; confirm ipif_lookup_zoneid()
         * reference semantics.
         */
        if (q == NULL) {
            if (ill_need_rele)
                ill_refrele(ill);
            freemsg(ipsec_mp);
            return;
        }

        ip_newroute_ipif_v6(q, ipsec_mp, ipif, *v6dstp,
            unspec_src, zoneid);
        ipif_refrele(ipif);
    } else {
        if (attach_if) {
            ipif_t *ipif;

            ipif = ipif_get_next_ipif(NULL, ill);
            if (ipif == NULL) {
                if (ill_need_rele)
                    ill_refrele(ill);
                freemsg(ipsec_mp);
                return;
            }
            ire = ire_ctable_lookup_v6(v6dstp, 0, 0, ipif,
                zoneid, MBLK_GETLABEL(mp), match_flags);
            ire_need_rele = B_TRUE;
            ipif_refrele(ipif);
        } else {
            if (ire_arg != NULL) {
                ire = ire_arg;
            } else {
                ire = ire_cache_lookup_v6(v6dstp, zoneid, NULL);
                ire_need_rele = B_TRUE;
            }
        }
        if (ire != NULL)
            goto send;
        /*
         * ire disappeared underneath.
         *
         * What we need to do here is the ip_newroute
         * logic to get the ire without doing the IPSEC
         * processing. Follow the same old path. But this
         * time, ip_wput or ire_add_then_send will call us
         * directly as all the IPSEC operations are done.
         */
        ip1dbg(("ip_wput_ipsec_out_v6: IRE disappeared\n"));
        mp->b_prev = NULL;
        mp->b_next = NULL;

        /*
         * If the IPsec packet was processed asynchronously,
         * drop it now.
         */
        if (q == NULL) {
            if (ill_need_rele)
                ill_refrele(ill);
            freemsg(ipsec_mp);
            return;
        }

        ip_newroute_v6(q, ipsec_mp, v6dstp, &ip6h->ip6_src, ill,
            zoneid);
    }
    if (ill != NULL && ill_need_rele)
        ill_refrele(ill);
    return;
send:
    if (ill != NULL && ill_need_rele)
        ill_refrele(ill);

    /* Local delivery */
    if (ire->ire_stq == NULL) {
        ASSERT(q != NULL);
        ip_wput_local_v6(RD(q), ire->ire_ipif->ipif_ill, ip6h, ipsec_mp,
            ire, 0);
        if (ire_need_rele)
            ire_refrele(ire);
        return;
    }
    /*
     * Everything is done. Send it out on the wire.
     * We force the insertion of a fragment header using the
     * IPH_FRAG_HDR flag in two cases:
     * - after reception of an ICMPv6 "packet too big" message
     *   with a MTU < 1280 (cf. RFC 2460 section 5)
     * - for multirouted IPv6 packets, so that the receiver can
     *   discard duplicates according to their fragment identifier
     */
    /* XXX fix flow control problems. */
    if (ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN > ire->ire_max_frag ||
        (ire->ire_frag_flag & IPH_FRAG_HDR)) {
        if (hwaccel) {
            /*
             * hardware acceleration does not handle these
             * "slow path" cases.
             */
            /* IPsec KSTATS: should bump bean counter here. */
            if (ire_need_rele)
                ire_refrele(ire);
            freemsg(ipsec_mp);
            return;
        }
        /* Sanity-check ip6_plen against the actual message size. */
        if (ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN !=
            (mp->b_cont ? msgdsize(mp) :
            mp->b_wptr - (uchar_t *)ip6h)) {
            /* IPsec KSTATS: should bump bean counter here. */
            ip0dbg(("Packet length mismatch: %d, %ld\n",
                ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN,
                msgdsize(mp)));
            if (ire_need_rele)
                ire_refrele(ire);
            freemsg(ipsec_mp);
            return;
        }
        ASSERT(mp->b_prev == NULL);
        ip2dbg(("Fragmenting Size = %d, mtu = %d\n",
            ntohs(ip6h->ip6_plen) +
            IPV6_HDR_LEN, ire->ire_max_frag));
        ip_wput_frag_v6(mp, ire, flags, NULL, B_FALSE,
            ire->ire_max_frag);
    } else {
        UPDATE_OB_PKT_COUNT(ire);
        ire->ire_last_used_time = lbolt;
        ip_xmit_v6(mp, ire, flags, NULL, B_FALSE, hwaccel ? io : NULL);
    }
    if (ire_need_rele)
        ire_refrele(ire);
    /* The data (mp) was consumed above; free only the M_CTL shell. */
    freeb(ipsec_mp);
}

/*
 * Hand an IPsec-accelerated packet down to the ill, prefixed with an
 * IPHADA_M_CTL attributes block; drops the packet if the ill has no
 * AH/ESP hardware capability.
 */
void
ipsec_hw_putnext(queue_t *q, mblk_t *mp)
{
    mblk_t *hada_mp;    /* attributes M_CTL mblk */
    da_ipsec_t *hada;    /* data attributes */
    ill_t *ill = (ill_t *)q->q_ptr;

    IPSECHW_DEBUG(IPSECHW_PKT, ("ipsec_hw_putnext: accelerated packet\n"));

    if ((ill->ill_capabilities & (ILL_CAPAB_AH | ILL_CAPAB_ESP)) == 0) {
        /* IPsec KSTATS: Bump lose counter here! */
        freemsg(mp);
        return;
    }

    /*
     * It's an IPsec packet that must be
     * accelerated by the Provider, and the
     * outbound ill is IPsec acceleration capable.
     * Prepends the mblk with an IPHADA_M_CTL, and ship it
     * to the ill.
     * IPsec KSTATS: should bump packet counter here.
     */

    hada_mp = allocb(sizeof (da_ipsec_t), BPRI_HI);
    if (hada_mp == NULL) {
        /* IPsec KSTATS: should bump packet counter here.
*/ 23725 freemsg(mp); 23726 return; 23727 } 23728 23729 hada_mp->b_datap->db_type = M_CTL; 23730 hada_mp->b_wptr = hada_mp->b_rptr + sizeof (*hada); 23731 hada_mp->b_cont = mp; 23732 23733 hada = (da_ipsec_t *)hada_mp->b_rptr; 23734 bzero(hada, sizeof (da_ipsec_t)); 23735 hada->da_type = IPHADA_M_CTL; 23736 23737 putnext(q, hada_mp); 23738 } 23739 23740 /* 23741 * Finish the outbound IPsec processing. This function is called from 23742 * ipsec_out_process() if the IPsec packet was processed 23743 * synchronously, or from {ah,esp}_kcf_callback() if it was processed 23744 * asynchronously. 23745 */ 23746 void 23747 ip_wput_ipsec_out(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha, ill_t *ill, 23748 ire_t *ire_arg) 23749 { 23750 uint32_t v_hlen_tos_len; 23751 ipaddr_t dst; 23752 ipif_t *ipif = NULL; 23753 ire_t *ire; 23754 ire_t *ire1 = NULL; 23755 mblk_t *next_mp = NULL; 23756 uint32_t max_frag; 23757 boolean_t multirt_send = B_FALSE; 23758 mblk_t *mp; 23759 mblk_t *mp1; 23760 uint_t ill_index; 23761 ipsec_out_t *io; 23762 boolean_t attach_if; 23763 int match_flags, offset; 23764 irb_t *irb = NULL; 23765 boolean_t ill_need_rele = B_FALSE, ire_need_rele = B_TRUE; 23766 zoneid_t zoneid; 23767 uint32_t cksum; 23768 uint16_t *up; 23769 #ifdef _BIG_ENDIAN 23770 #define LENGTH (v_hlen_tos_len & 0xFFFF) 23771 #else 23772 #define LENGTH ((v_hlen_tos_len >> 24) | ((v_hlen_tos_len >> 8) & 0xFF00)) 23773 #endif 23774 23775 mp = ipsec_mp->b_cont; 23776 ASSERT(mp != NULL); 23777 v_hlen_tos_len = ((uint32_t *)ipha)[0]; 23778 dst = ipha->ipha_dst; 23779 23780 io = (ipsec_out_t *)ipsec_mp->b_rptr; 23781 ill_index = io->ipsec_out_ill_index; 23782 attach_if = io->ipsec_out_attach_if; 23783 zoneid = io->ipsec_out_zoneid; 23784 ASSERT(zoneid != ALL_ZONES); 23785 match_flags = MATCH_IRE_ILL_GROUP | MATCH_IRE_SECATTR; 23786 if (ill_index != 0) { 23787 if (ill == NULL) { 23788 ill = ip_grab_attach_ill(NULL, ipsec_mp, 23789 ill_index, B_FALSE); 23790 23791 /* Failure case frees things for us. 
*/ 23792 if (ill == NULL) 23793 return; 23794 23795 ill_need_rele = B_TRUE; 23796 } 23797 /* 23798 * If this packet needs to go out on a particular interface 23799 * honor it. 23800 */ 23801 if (attach_if) { 23802 match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR; 23803 23804 /* 23805 * Check if we need an ire that will not be 23806 * looked up by anybody else i.e. HIDDEN. 23807 */ 23808 if (ill_is_probeonly(ill)) { 23809 match_flags |= MATCH_IRE_MARK_HIDDEN; 23810 } 23811 } 23812 } 23813 23814 if (CLASSD(dst)) { 23815 boolean_t conn_dontroute; 23816 /* 23817 * Use the ill_index to get the right ipif. 23818 */ 23819 conn_dontroute = io->ipsec_out_dontroute; 23820 if (ill_index == 0) 23821 ipif = ipif_lookup_group(dst, zoneid); 23822 else 23823 (void) ipif_lookup_zoneid(ill, zoneid, 0, &ipif); 23824 if (ipif == NULL) { 23825 ip1dbg(("ip_wput_ipsec_out: No ipif for" 23826 " multicast\n")); 23827 BUMP_MIB(&ip_mib, ipOutNoRoutes); 23828 freemsg(ipsec_mp); 23829 goto done; 23830 } 23831 /* 23832 * ipha_src has already been intialized with the 23833 * value of the ipif in ip_wput. All we need now is 23834 * an ire to send this downstream. 23835 */ 23836 ire = ire_ctable_lookup(dst, 0, 0, ipif, zoneid, 23837 MBLK_GETLABEL(mp), match_flags); 23838 if (ire != NULL) { 23839 ill_t *ill1; 23840 /* 23841 * Do the multicast forwarding now, as the IPSEC 23842 * processing has been done. 23843 */ 23844 if (ip_g_mrouter && !conn_dontroute && 23845 (ill1 = ire_to_ill(ire))) { 23846 if (ip_mforward(ill1, ipha, mp)) { 23847 freemsg(ipsec_mp); 23848 ip1dbg(("ip_wput_ipsec_out: mforward " 23849 "failed\n")); 23850 ire_refrele(ire); 23851 goto done; 23852 } 23853 } 23854 goto send; 23855 } 23856 23857 ip0dbg(("ip_wput_ipsec_out: multicast: IRE disappeared\n")); 23858 mp->b_prev = NULL; 23859 mp->b_next = NULL; 23860 23861 /* 23862 * If the IPsec packet was processed asynchronously, 23863 * drop it now. 
23864 */ 23865 if (q == NULL) { 23866 freemsg(ipsec_mp); 23867 goto done; 23868 } 23869 23870 /* 23871 * We may be using a wrong ipif to create the ire. 23872 * But it is okay as the source address is assigned 23873 * for the packet already. Next outbound packet would 23874 * create the IRE with the right IPIF in ip_wput. 23875 * 23876 * Also handle RTF_MULTIRT routes. 23877 */ 23878 ip_newroute_ipif(q, ipsec_mp, ipif, dst, NULL, RTF_MULTIRT); 23879 } else { 23880 if (attach_if) { 23881 ire = ire_ctable_lookup(dst, 0, 0, ill->ill_ipif, 23882 zoneid, MBLK_GETLABEL(mp), match_flags); 23883 } else { 23884 if (ire_arg != NULL) { 23885 ire = ire_arg; 23886 ire_need_rele = B_FALSE; 23887 } else { 23888 ire = ire_cache_lookup(dst, zoneid, 23889 MBLK_GETLABEL(mp)); 23890 } 23891 } 23892 if (ire != NULL) { 23893 goto send; 23894 } 23895 23896 /* 23897 * ire disappeared underneath. 23898 * 23899 * What we need to do here is the ip_newroute 23900 * logic to get the ire without doing the IPSEC 23901 * processing. Follow the same old path. But this 23902 * time, ip_wput or ire_add_then_put will call us 23903 * directly as all the IPSEC operations are done. 23904 */ 23905 ip1dbg(("ip_wput_ipsec_out: IRE disappeared\n")); 23906 mp->b_prev = NULL; 23907 mp->b_next = NULL; 23908 23909 /* 23910 * If the IPsec packet was processed asynchronously, 23911 * drop it now. 23912 */ 23913 if (q == NULL) { 23914 freemsg(ipsec_mp); 23915 goto done; 23916 } 23917 23918 /* 23919 * Since we're going through ip_newroute() again, we 23920 * need to make sure we don't: 23921 * 23922 * 1.) Trigger the ASSERT() with the ipha_ident 23923 * overloading. 23924 * 2.) Redo transport-layer checksumming, since we've 23925 * already done all that to get this far. 23926 * 23927 * The easiest way not do either of the above is to set 23928 * the ipha_ident field to IP_HDR_INCLUDED. 23929 */ 23930 ipha->ipha_ident = IP_HDR_INCLUDED; 23931 ip_newroute(q, ipsec_mp, dst, NULL, 23932 (CONN_Q(q) ? 
Q_TO_CONN(q) : NULL)); 23933 } 23934 goto done; 23935 send: 23936 if (ipha->ipha_protocol == IPPROTO_UDP && udp_compute_checksum()) { 23937 /* 23938 * ESP NAT-Traversal packet. 23939 * 23940 * Just do software checksum for now. 23941 */ 23942 23943 offset = IP_SIMPLE_HDR_LENGTH + UDP_CHECKSUM_OFFSET; 23944 IP_STAT(ip_out_sw_cksum); 23945 IP_STAT_UPDATE(ip_udp_out_sw_cksum_bytes, 23946 ntohs(htons(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH)); 23947 #define iphs ((uint16_t *)ipha) 23948 cksum = IP_UDP_CSUM_COMP + iphs[6] + iphs[7] + iphs[8] + 23949 iphs[9] + ntohs(htons(ipha->ipha_length) - 23950 IP_SIMPLE_HDR_LENGTH); 23951 #undef iphs 23952 if ((cksum = IP_CSUM(mp, IP_SIMPLE_HDR_LENGTH, cksum)) == 0) 23953 cksum = 0xFFFF; 23954 for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) 23955 if (mp1->b_wptr - mp1->b_rptr >= 23956 offset + sizeof (uint16_t)) { 23957 up = (uint16_t *)(mp1->b_rptr + offset); 23958 *up = cksum; 23959 break; /* out of for loop */ 23960 } else { 23961 offset -= (mp->b_wptr - mp->b_rptr); 23962 } 23963 } /* Otherwise, just keep the all-zero checksum. */ 23964 23965 if (ire->ire_stq == NULL) { 23966 /* 23967 * Loopbacks go through ip_wput_local except for one case. 23968 * We come here if we generate a icmp_frag_needed message 23969 * after IPSEC processing is over. When this function calls 23970 * ip_wput_ire_fragmentit, ip_wput_frag might end up calling 23971 * icmp_frag_needed. The message generated comes back here 23972 * through icmp_frag_needed -> icmp_pkt -> ip_wput -> 23973 * ipsec_out_process -> ip_wput_ipsec_out. We need to set the 23974 * source address as it is usually set in ip_wput_ire. As 23975 * ipsec_out_proc_begin is set, ip_wput calls ipsec_out_process 23976 * and we end up here. We can't enter ip_wput_ire once the 23977 * IPSEC processing is over and hence we need to do it here. 
23978 */ 23979 ASSERT(q != NULL); 23980 UPDATE_OB_PKT_COUNT(ire); 23981 ire->ire_last_used_time = lbolt; 23982 if (ipha->ipha_src == 0) 23983 ipha->ipha_src = ire->ire_src_addr; 23984 ip_wput_local(RD(q), ire->ire_ipif->ipif_ill, ipha, ipsec_mp, 23985 ire, 0, zoneid); 23986 if (ire_need_rele) 23987 ire_refrele(ire); 23988 goto done; 23989 } 23990 23991 if (ire->ire_max_frag < (unsigned int)LENGTH) { 23992 /* 23993 * We are through with IPSEC processing. 23994 * Fragment this and send it on the wire. 23995 */ 23996 if (io->ipsec_out_accelerated) { 23997 /* 23998 * The packet has been accelerated but must 23999 * be fragmented. This should not happen 24000 * since AH and ESP must not accelerate 24001 * packets that need fragmentation, however 24002 * the configuration could have changed 24003 * since the AH or ESP processing. 24004 * Drop packet. 24005 * IPsec KSTATS: bump bean counter here. 24006 */ 24007 IPSECHW_DEBUG(IPSECHW_PKT, ("ipsec_wput_ipsec_out: " 24008 "fragmented accelerated packet!\n")); 24009 freemsg(ipsec_mp); 24010 } else { 24011 ip_wput_ire_fragmentit(ipsec_mp, ire); 24012 } 24013 if (ire_need_rele) 24014 ire_refrele(ire); 24015 goto done; 24016 } 24017 24018 ip2dbg(("ip_wput_ipsec_out: ipsec_mp %p, ire %p, ire_ipif %p, " 24019 "ipif %p\n", (void *)ipsec_mp, (void *)ire, 24020 (void *)ire->ire_ipif, (void *)ipif)); 24021 24022 /* 24023 * Multiroute the secured packet, unless IPsec really 24024 * requires the packet to go out only through a particular 24025 * interface. 24026 */ 24027 if ((ire->ire_flags & RTF_MULTIRT) && !attach_if) { 24028 ire_t *first_ire; 24029 irb = ire->ire_bucket; 24030 ASSERT(irb != NULL); 24031 /* 24032 * This ire has been looked up as the one that 24033 * goes through the given ipif; 24034 * make sure we do not omit any other multiroute ire 24035 * that may be present in the bucket before this one. 
24036 */ 24037 IRB_REFHOLD(irb); 24038 for (first_ire = irb->irb_ire; 24039 first_ire != NULL; 24040 first_ire = first_ire->ire_next) { 24041 if ((first_ire->ire_flags & RTF_MULTIRT) && 24042 (first_ire->ire_addr == ire->ire_addr) && 24043 !(first_ire->ire_marks & 24044 (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN))) 24045 break; 24046 } 24047 24048 if ((first_ire != NULL) && (first_ire != ire)) { 24049 /* 24050 * Don't change the ire if the packet must 24051 * be fragmented if sent via this new one. 24052 */ 24053 if (first_ire->ire_max_frag >= (unsigned int)LENGTH) { 24054 IRE_REFHOLD(first_ire); 24055 if (ire_need_rele) 24056 ire_refrele(ire); 24057 else 24058 ire_need_rele = B_TRUE; 24059 ire = first_ire; 24060 } 24061 } 24062 IRB_REFRELE(irb); 24063 24064 multirt_send = B_TRUE; 24065 max_frag = ire->ire_max_frag; 24066 } else { 24067 if ((ire->ire_flags & RTF_MULTIRT) && attach_if) { 24068 ip1dbg(("ip_wput_ipsec_out: ignoring multirouting " 24069 "flag, attach_if %d\n", attach_if)); 24070 } 24071 } 24072 24073 /* 24074 * In most cases, the emission loop below is entered only once. 24075 * Only in the case where the ire holds the RTF_MULTIRT 24076 * flag, we loop to process all RTF_MULTIRT ires in the 24077 * bucket, and send the packet through all crossed 24078 * RTF_MULTIRT routes. 24079 */ 24080 do { 24081 if (multirt_send) { 24082 /* 24083 * ire1 holds here the next ire to process in the 24084 * bucket. If multirouting is expected, 24085 * any non-RTF_MULTIRT ire that has the 24086 * right destination address is ignored. 
24087 */ 24088 ASSERT(irb != NULL); 24089 IRB_REFHOLD(irb); 24090 for (ire1 = ire->ire_next; 24091 ire1 != NULL; 24092 ire1 = ire1->ire_next) { 24093 if ((ire1->ire_flags & RTF_MULTIRT) == 0) 24094 continue; 24095 if (ire1->ire_addr != ire->ire_addr) 24096 continue; 24097 if (ire1->ire_marks & 24098 (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN)) 24099 continue; 24100 /* No loopback here */ 24101 if (ire1->ire_stq == NULL) 24102 continue; 24103 /* 24104 * Ensure we do not exceed the MTU 24105 * of the next route. 24106 */ 24107 if (ire1->ire_max_frag < (unsigned int)LENGTH) { 24108 ip_multirt_bad_mtu(ire1, max_frag); 24109 continue; 24110 } 24111 24112 IRE_REFHOLD(ire1); 24113 break; 24114 } 24115 IRB_REFRELE(irb); 24116 if (ire1 != NULL) { 24117 /* 24118 * We are in a multiple send case, need to 24119 * make a copy of the packet. 24120 */ 24121 next_mp = copymsg(ipsec_mp); 24122 if (next_mp == NULL) { 24123 ire_refrele(ire1); 24124 ire1 = NULL; 24125 } 24126 } 24127 } 24128 24129 /* Everything is done. Send it out on the wire */ 24130 mp1 = ip_wput_attach_llhdr(mp, ire, 0, 0); 24131 if (mp1 == NULL) { 24132 BUMP_MIB(&ip_mib, ipOutDiscards); 24133 freemsg(ipsec_mp); 24134 if (ire_need_rele) 24135 ire_refrele(ire); 24136 if (ire1 != NULL) { 24137 ire_refrele(ire1); 24138 freemsg(next_mp); 24139 } 24140 goto done; 24141 } 24142 UPDATE_OB_PKT_COUNT(ire); 24143 ire->ire_last_used_time = lbolt; 24144 if (!io->ipsec_out_accelerated) { 24145 putnext(ire->ire_stq, mp1); 24146 } else { 24147 /* 24148 * Safety Pup says: make sure this is going to 24149 * the right interface! 
24150 */ 24151 ill_t *ill1 = (ill_t *)ire->ire_stq->q_ptr; 24152 int ifindex = ill1->ill_phyint->phyint_ifindex; 24153 24154 if (ifindex != io->ipsec_out_capab_ill_index) { 24155 /* IPsec kstats: bump lose counter */ 24156 freemsg(mp1); 24157 } else { 24158 ipsec_hw_putnext(ire->ire_stq, mp1); 24159 } 24160 } 24161 24162 freeb(ipsec_mp); 24163 if (ire_need_rele) 24164 ire_refrele(ire); 24165 24166 if (ire1 != NULL) { 24167 ire = ire1; 24168 ire_need_rele = B_TRUE; 24169 ASSERT(next_mp); 24170 ipsec_mp = next_mp; 24171 mp = ipsec_mp->b_cont; 24172 ire1 = NULL; 24173 next_mp = NULL; 24174 io = (ipsec_out_t *)ipsec_mp->b_rptr; 24175 } else { 24176 multirt_send = B_FALSE; 24177 } 24178 } while (multirt_send); 24179 done: 24180 if (ill != NULL && ill_need_rele) 24181 ill_refrele(ill); 24182 if (ipif != NULL) 24183 ipif_refrele(ipif); 24184 } 24185 24186 /* 24187 * Get the ill corresponding to the specified ire, and compare its 24188 * capabilities with the protocol and algorithms specified by the 24189 * the SA obtained from ipsec_out. If they match, annotate the 24190 * ipsec_out structure to indicate that the packet needs acceleration. 24191 * 24192 * 24193 * A packet is eligible for outbound hardware acceleration if the 24194 * following conditions are satisfied: 24195 * 24196 * 1. the packet will not be fragmented 24197 * 2. the provider supports the algorithm 24198 * 3. there is no pending control message being exchanged 24199 * 4. snoop is not attached 24200 * 5. the destination address is not a broadcast or multicast address. 24201 * 24202 * Rationale: 24203 * - Hardware drivers do not support fragmentation with 24204 * the current interface. 24205 * - snoop, multicast, and broadcast may result in exposure of 24206 * a cleartext datagram. 24207 * We check all five of these conditions here. 
24208 * 24209 * XXX would like to nuke "ire_t *" parameter here; problem is that 24210 * IRE is only way to figure out if a v4 address is a broadcast and 24211 * thus ineligible for acceleration... 24212 */ 24213 static void 24214 ipsec_out_is_accelerated(mblk_t *ipsec_mp, ipsa_t *sa, ill_t *ill, ire_t *ire) 24215 { 24216 ipsec_out_t *io; 24217 mblk_t *data_mp; 24218 uint_t plen, overhead; 24219 24220 if ((sa->ipsa_flags & IPSA_F_HW) == 0) 24221 return; 24222 24223 if (ill == NULL) 24224 return; 24225 24226 /* 24227 * Destination address is a broadcast or multicast. Punt. 24228 */ 24229 if ((ire != NULL) && (ire->ire_type & (IRE_BROADCAST|IRE_LOOPBACK| 24230 IRE_LOCAL))) 24231 return; 24232 24233 data_mp = ipsec_mp->b_cont; 24234 24235 if (ill->ill_isv6) { 24236 ip6_t *ip6h = (ip6_t *)data_mp->b_rptr; 24237 24238 if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) 24239 return; 24240 24241 plen = ip6h->ip6_plen; 24242 } else { 24243 ipha_t *ipha = (ipha_t *)data_mp->b_rptr; 24244 24245 if (CLASSD(ipha->ipha_dst)) 24246 return; 24247 24248 plen = ipha->ipha_length; 24249 } 24250 /* 24251 * Is there a pending DLPI control message being exchanged 24252 * between IP/IPsec and the DLS Provider? If there is, it 24253 * could be a SADB update, and the state of the DLS Provider 24254 * SADB might not be in sync with the SADB maintained by 24255 * IPsec. To avoid dropping packets or using the wrong keying 24256 * material, we do not accelerate this packet. 24257 */ 24258 if (ill->ill_dlpi_pending != DL_PRIM_INVAL) { 24259 IPSECHW_DEBUG(IPSECHW_PKT, ("ipsec_out_check_is_accelerated: " 24260 "ill_dlpi_pending! don't accelerate packet\n")); 24261 return; 24262 } 24263 24264 /* 24265 * Is the Provider in promiscous mode? If it does, we don't 24266 * accelerate the packet since it will bounce back up to the 24267 * listeners in the clear. 
24268 */ 24269 if (ill->ill_promisc_on_phys) { 24270 IPSECHW_DEBUG(IPSECHW_PKT, ("ipsec_out_check_is_accelerated: " 24271 "ill in promiscous mode, don't accelerate packet\n")); 24272 return; 24273 } 24274 24275 /* 24276 * Will the packet require fragmentation? 24277 */ 24278 24279 /* 24280 * IPsec ESP note: this is a pessimistic estimate, but the same 24281 * as is used elsewhere. 24282 * SPI + sequence + MAC + IV(blocksize) + padding(blocksize-1) 24283 * + 2-byte trailer 24284 */ 24285 overhead = (sa->ipsa_type == SADB_SATYPE_AH) ? IPSEC_MAX_AH_HDR_SIZE : 24286 IPSEC_BASE_ESP_HDR_SIZE(sa); 24287 24288 if ((plen + overhead) > ill->ill_max_mtu) 24289 return; 24290 24291 io = (ipsec_out_t *)ipsec_mp->b_rptr; 24292 24293 /* 24294 * Can the ill accelerate this IPsec protocol and algorithm 24295 * specified by the SA? 24296 */ 24297 if (!ipsec_capab_match(ill, io->ipsec_out_capab_ill_index, 24298 ill->ill_isv6, sa)) { 24299 return; 24300 } 24301 24302 /* 24303 * Tell AH or ESP that the outbound ill is capable of 24304 * accelerating this packet. 24305 */ 24306 io->ipsec_out_is_capab_ill = B_TRUE; 24307 } 24308 24309 /* 24310 * Select which AH & ESP SA's to use (if any) for the outbound packet. 24311 * 24312 * If this function returns B_TRUE, the requested SA's have been filled 24313 * into the ipsec_out_*_sa pointers. 24314 * 24315 * If the function returns B_FALSE, the packet has been "consumed", most 24316 * likely by an ACQUIRE sent up via PF_KEY to a key management daemon. 24317 * 24318 * The SA references created by the protocol-specific "select" 24319 * function will be released when the ipsec_mp is freed, thanks to the 24320 * ipsec_out_free destructor -- see spd.c. 
24321 */ 24322 static boolean_t 24323 ipsec_out_select_sa(mblk_t *ipsec_mp) 24324 { 24325 boolean_t need_ah_acquire = B_FALSE, need_esp_acquire = B_FALSE; 24326 ipsec_out_t *io; 24327 ipsec_policy_t *pp; 24328 ipsec_action_t *ap; 24329 io = (ipsec_out_t *)ipsec_mp->b_rptr; 24330 ASSERT(io->ipsec_out_type == IPSEC_OUT); 24331 ASSERT(io->ipsec_out_len == sizeof (ipsec_out_t)); 24332 24333 if (!io->ipsec_out_secure) { 24334 /* 24335 * We came here by mistake. 24336 * Don't bother with ipsec processing 24337 * We should "discourage" this path in the future. 24338 */ 24339 ASSERT(io->ipsec_out_proc_begin == B_FALSE); 24340 return (B_FALSE); 24341 } 24342 ASSERT(io->ipsec_out_need_policy == B_FALSE); 24343 ASSERT((io->ipsec_out_policy != NULL) || 24344 (io->ipsec_out_act != NULL)); 24345 24346 ASSERT(io->ipsec_out_failed == B_FALSE); 24347 24348 /* 24349 * IPSEC processing has started. 24350 */ 24351 io->ipsec_out_proc_begin = B_TRUE; 24352 ap = io->ipsec_out_act; 24353 if (ap == NULL) { 24354 pp = io->ipsec_out_policy; 24355 ASSERT(pp != NULL); 24356 ap = pp->ipsp_act; 24357 ASSERT(ap != NULL); 24358 } 24359 24360 /* 24361 * We have an action. now, let's select SA's. 24362 * (In the future, we can cache this in the conn_t..) 24363 */ 24364 if (ap->ipa_want_esp) { 24365 if (io->ipsec_out_esp_sa == NULL) { 24366 need_esp_acquire = !ipsec_outbound_sa(ipsec_mp, 24367 IPPROTO_ESP); 24368 } 24369 ASSERT(need_esp_acquire || io->ipsec_out_esp_sa != NULL); 24370 } 24371 24372 if (ap->ipa_want_ah) { 24373 if (io->ipsec_out_ah_sa == NULL) { 24374 need_ah_acquire = !ipsec_outbound_sa(ipsec_mp, 24375 IPPROTO_AH); 24376 } 24377 ASSERT(need_ah_acquire || io->ipsec_out_ah_sa != NULL); 24378 /* 24379 * The ESP and AH processing order needs to be preserved 24380 * when both protocols are required (ESP should be applied 24381 * before AH for an outbound packet). Force an ESP ACQUIRE 24382 * when both ESP and AH are required, and an AH ACQUIRE 24383 * is needed. 
24384 */ 24385 if (ap->ipa_want_esp && need_ah_acquire) 24386 need_esp_acquire = B_TRUE; 24387 } 24388 24389 /* 24390 * Send an ACQUIRE (extended, regular, or both) if we need one. 24391 * Release SAs that got referenced, but will not be used until we 24392 * acquire _all_ of the SAs we need. 24393 */ 24394 if (need_ah_acquire || need_esp_acquire) { 24395 if (io->ipsec_out_ah_sa != NULL) { 24396 IPSA_REFRELE(io->ipsec_out_ah_sa); 24397 io->ipsec_out_ah_sa = NULL; 24398 } 24399 if (io->ipsec_out_esp_sa != NULL) { 24400 IPSA_REFRELE(io->ipsec_out_esp_sa); 24401 io->ipsec_out_esp_sa = NULL; 24402 } 24403 24404 sadb_acquire(ipsec_mp, io, need_ah_acquire, need_esp_acquire); 24405 return (B_FALSE); 24406 } 24407 24408 return (B_TRUE); 24409 } 24410 24411 /* 24412 * Process an IPSEC_OUT message and see what you can 24413 * do with it. 24414 * IPQoS Notes: 24415 * We do IPPF processing if IPP_LOCAL_OUT is enabled before processing for 24416 * IPSec. 24417 * XXX would like to nuke ire_t. 24418 * XXX ill_index better be "real" 24419 */ 24420 void 24421 ipsec_out_process(queue_t *q, mblk_t *ipsec_mp, ire_t *ire, uint_t ill_index) 24422 { 24423 ipsec_out_t *io; 24424 ipsec_policy_t *pp; 24425 ipsec_action_t *ap; 24426 ipha_t *ipha; 24427 ip6_t *ip6h; 24428 mblk_t *mp; 24429 ill_t *ill; 24430 zoneid_t zoneid; 24431 ipsec_status_t ipsec_rc; 24432 boolean_t ill_need_rele = B_FALSE; 24433 24434 io = (ipsec_out_t *)ipsec_mp->b_rptr; 24435 ASSERT(io->ipsec_out_type == IPSEC_OUT); 24436 ASSERT(io->ipsec_out_len == sizeof (ipsec_out_t)); 24437 mp = ipsec_mp->b_cont; 24438 24439 /* 24440 * Initiate IPPF processing. We do it here to account for packets 24441 * coming here that don't have any policy (i.e. !io->ipsec_out_secure). 24442 * We can check for ipsec_out_proc_begin even for such packets, as 24443 * they will always be false (asserted below). 
	 */
	if (IPP_ENABLED(IPP_LOCAL_OUT) && !io->ipsec_out_proc_begin) {
		ip_process(IPP_LOCAL_OUT, &mp, io->ipsec_out_ill_index != 0 ?
		    io->ipsec_out_ill_index : ill_index);
		if (mp == NULL) {
			ip2dbg(("ipsec_out_process: packet dropped "\
			    "during IPPF processing\n"));
			freeb(ipsec_mp);
			BUMP_MIB(&ip_mib, ipOutDiscards);
			return;
		}
	}

	if (!io->ipsec_out_secure) {
		/*
		 * We came here by mistake.
		 * Don't bother with ipsec processing
		 * Should "discourage" this path in the future.
		 *
		 * NOTE(review): on this path `ill` is never assigned
		 * before the jump to done: -- verify the declaration is
		 * initialized, otherwise done: reads an uninitialized
		 * pointer.
		 */
		ASSERT(io->ipsec_out_proc_begin == B_FALSE);
		goto done;
	}
	ASSERT(io->ipsec_out_need_policy == B_FALSE);
	ASSERT((io->ipsec_out_policy != NULL) ||
	    (io->ipsec_out_act != NULL));
	ASSERT(io->ipsec_out_failed == B_FALSE);

	/* IPsec modules not loaded: account for the drop per IP version. */
	if (!ipsec_loaded()) {
		ipha = (ipha_t *)ipsec_mp->b_cont->b_rptr;
		if (IPH_HDR_VERSION(ipha) == IP_VERSION) {
			BUMP_MIB(&ip_mib, ipOutDiscards);
		} else {
			BUMP_MIB(&ip6_mib, ipv6OutDiscards);
		}
		ip_drop_packet(ipsec_mp, B_FALSE, NULL, ire,
		    &ipdrops_ip_ipsec_not_loaded, &ip_dropper);
		return;
	}

	/*
	 * IPSEC processing has started.
	 */
	io->ipsec_out_proc_begin = B_TRUE;
	ap = io->ipsec_out_act;
	if (ap == NULL) {
		pp = io->ipsec_out_policy;
		ASSERT(pp != NULL);
		ap = pp->ipsp_act;
		ASSERT(ap != NULL);
	}

	/*
	 * Save the outbound ill index. When the packet comes back
	 * from IPsec, we make sure the ill hasn't changed or disappeared
	 * before sending it the accelerated packet.
	 */
	if ((ire != NULL) && (io->ipsec_out_capab_ill_index == 0)) {
		int ifindex;
		ill = ire_to_ill(ire);
		ifindex = ill->ill_phyint->phyint_ifindex;
		io->ipsec_out_capab_ill_index = ifindex;
	}

	/*
	 * The order of processing is first insert a IP header if needed.
	 * Then insert the ESP header and then the AH header.
	 */
	if ((io->ipsec_out_se_done == B_FALSE) &&
	    (ap->ipa_want_se)) {
		/*
		 * First get the outer IP header before sending
		 * it to ESP.
		 */
		ipha_t *oipha, *iipha;
		mblk_t *outer_mp, *inner_mp;

		if ((outer_mp = allocb(sizeof (ipha_t), BPRI_HI)) == NULL) {
			(void) mi_strlog(q, 0, SL_ERROR|SL_TRACE|SL_CONSOLE,
			    "ipsec_out_process: "
			    "Self-Encapsulation failed: Out of memory\n");
			freemsg(ipsec_mp);
			BUMP_MIB(&ip_mib, ipOutDiscards);
			return;
		}
		/* Copy the inner header, then fix it up as IPPROTO_ENCAP. */
		inner_mp = ipsec_mp->b_cont;
		ASSERT(inner_mp->b_datap->db_type == M_DATA);
		oipha = (ipha_t *)outer_mp->b_rptr;
		iipha = (ipha_t *)inner_mp->b_rptr;
		*oipha = *iipha;
		outer_mp->b_wptr += sizeof (ipha_t);
		oipha->ipha_length = htons(ntohs(iipha->ipha_length) +
		    sizeof (ipha_t));
		oipha->ipha_protocol = IPPROTO_ENCAP;
		oipha->ipha_version_and_hdr_length =
		    IP_SIMPLE_HDR_VERSION;
		oipha->ipha_hdr_checksum = 0;
		oipha->ipha_hdr_checksum = ip_csum_hdr(oipha);
		outer_mp->b_cont = inner_mp;
		ipsec_mp->b_cont = outer_mp;

		io->ipsec_out_se_done = B_TRUE;
		io->ipsec_out_encaps = B_TRUE;
	}

	if (((ap->ipa_want_ah && (io->ipsec_out_ah_sa == NULL)) ||
	    (ap->ipa_want_esp && (io->ipsec_out_esp_sa == NULL))) &&
	    !ipsec_out_select_sa(ipsec_mp))
		return;

	/*
	 * By now, we know what SA's to use. Toss over to ESP & AH
	 * to do the heavy lifting.
	 */
	zoneid = io->ipsec_out_zoneid;
	ASSERT(zoneid != ALL_ZONES);
	if ((io->ipsec_out_esp_done == B_FALSE) && (ap->ipa_want_esp)) {
		ASSERT(io->ipsec_out_esp_sa != NULL);
		io->ipsec_out_esp_done = B_TRUE;
		/*
		 * Note that since hw accel can only apply one transform,
		 * not two, we skip hw accel for ESP if we also have AH
		 * This is an design limitation of the interface
		 * which should be revisited.
		 */
		ASSERT(ire != NULL);
		if (io->ipsec_out_ah_sa == NULL) {
			/*
			 * NOTE(review): ire->ire_stq is dereferenced
			 * without a NULL check; the ASSERT above covers
			 * only ire itself -- confirm loopback ires
			 * (ire_stq == NULL) cannot reach this path.
			 */
			ill = (ill_t *)ire->ire_stq->q_ptr;
			ipsec_out_is_accelerated(ipsec_mp,
			    io->ipsec_out_esp_sa, ill, ire);
		}

		ipsec_rc = io->ipsec_out_esp_sa->ipsa_output_func(ipsec_mp);
		switch (ipsec_rc) {
		case IPSEC_STATUS_SUCCESS:
			break;
		case IPSEC_STATUS_FAILED:
			BUMP_MIB(&ip_mib, ipOutDiscards);
			/* FALLTHRU */
		case IPSEC_STATUS_PENDING:
			return;
		}
	}

	if ((io->ipsec_out_ah_done == B_FALSE) && (ap->ipa_want_ah)) {
		ASSERT(io->ipsec_out_ah_sa != NULL);
		io->ipsec_out_ah_done = B_TRUE;
		if (ire == NULL) {
			int idx = io->ipsec_out_capab_ill_index;
			ill = ill_lookup_on_ifindex(idx, B_FALSE,
			    NULL, NULL, NULL, NULL);
			ill_need_rele = B_TRUE;
		} else {
			ill = (ill_t *)ire->ire_stq->q_ptr;
		}
		ipsec_out_is_accelerated(ipsec_mp, io->ipsec_out_ah_sa, ill,
		    ire);

		ipsec_rc = io->ipsec_out_ah_sa->ipsa_output_func(ipsec_mp);
		switch (ipsec_rc) {
		case IPSEC_STATUS_SUCCESS:
			break;
		case IPSEC_STATUS_FAILED:
			BUMP_MIB(&ip_mib, ipOutDiscards);
			/* FALLTHRU */
		case IPSEC_STATUS_PENDING:
			if (ill != NULL && ill_need_rele)
				ill_refrele(ill);
			return;
		}
	}
	/*
	 * We are done with IPSEC processing. Send it over
	 * the wire.
	 */
done:
	mp = ipsec_mp->b_cont;
	ipha = (ipha_t *)mp->b_rptr;
	if (IPH_HDR_VERSION(ipha) == IP_VERSION) {
		ip_wput_ipsec_out(q, ipsec_mp, ipha, ill, ire);
	} else {
		ip6h = (ip6_t *)ipha;
		ip_wput_ipsec_out_v6(q, ipsec_mp, ip6h, ill, ire);
	}
	if (ill != NULL && ill_need_rele)
		ill_refrele(ill);
}

/*
 * Restart a previously-deferred T_SVR4_OPTMGMT_REQ / T_OPTMGMT_REQ on
 * behalf of the conn associated with queue q.  first_mp is the M_CTL
 * opt_restart_t saved when the original request returned EINPROGRESS.
 */
/* ARGSUSED */
void
ip_restart_optmgmt(ipsq_t *dummy_sq, queue_t *q, mblk_t *first_mp, void *dummy)
{
	opt_restart_t *or;
	int err;
	conn_t *connp;

	ASSERT(CONN_Q(q));
	connp = Q_TO_CONN(q);

	ASSERT(first_mp->b_datap->db_type == M_CTL);
	or = (opt_restart_t *)first_mp->b_rptr;
	/*
	 * We don't need to pass any credentials here since this is just
	 * a restart. The credentials are passed in when svr4_optcom_req
	 * is called the first time (from ip_wput_nondata).
	 */
	if (or->or_type == T_SVR4_OPTMGMT_REQ) {
		err = svr4_optcom_req(q, first_mp, NULL,
		    &ip_opt_obj);
	} else {
		ASSERT(or->or_type == T_OPTMGMT_REQ);
		err = tpi_optcom_req(q, first_mp, NULL,
		    &ip_opt_obj);
	}
	if (err != EINPROGRESS) {
		/* operation is done */
		CONN_OPER_PENDING_DONE(connp);
	}
}

/*
 * ioctls that go through a down/up sequence may need to wait for the down
 * to complete. This involves waiting for the ire and ipif refcnts to go down
 * to zero. Subsequently the ioctl is restarted from ipif_ill_refrele_tail.
 */
/* ARGSUSED */
void
ip_reprocess_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
{
	struct iocblk *iocp;
	mblk_t *mp1;
	ipif_t *ipif;
	ip_ioctl_cmd_t *ipip;
	int err;
	sin_t *sin;
	struct lifreq *lifr;
	struct ifreq *ifr;

	iocp = (struct iocblk *)mp->b_rptr;
	ASSERT(ipsq != NULL);
	/* Existence of mp1 verified in ip_wput_nondata */
	mp1 = mp->b_cont->b_cont;
	/*
	 * NOTE(review): the lookup result is dereferenced without a NULL
	 * check -- presumably the command was validated when the ioctl
	 * was first processed, so the lookup cannot fail here; confirm.
	 */
	ipip = ip_sioctl_lookup(iocp->ioc_cmd);
	if (ipip->ipi_cmd == SIOCSLIFNAME || ipip->ipi_cmd == IF_UNITSEL) {
		ill_t *ill;
		/*
		 * Special case where ipsq_current_ipif may not be set.
		 * ill_phyint_reinit merged the v4 and v6 into a single ipsq.
		 * ill could also have become part of a ipmp group in the
		 * process, we are here as were not able to complete the
		 * operation in ipif_set_values because we could not become
		 * exclusive on the new ipsq, In such a case ipsq_current_ipif
		 * will not be set so we need to set it.
		 */
		ill = (ill_t *)q->q_ptr;
		ipsq->ipsq_current_ipif = ill->ill_ipif;
		ipsq->ipsq_last_cmd = ipip->ipi_cmd;
	}

	ipif = ipsq->ipsq_current_ipif;
	ASSERT(ipif != NULL);
	if (ipip->ipi_cmd_type == IF_CMD) {
		/* This a old style SIOC[GS]IF* command */
		ifr = (struct ifreq *)mp1->b_rptr;
		sin = (sin_t *)&ifr->ifr_addr;
	} else if (ipip->ipi_cmd_type == LIF_CMD) {
		/* This a new style SIOC[GS]LIF* command */
		lifr = (struct lifreq *)mp1->b_rptr;
		sin = (sin_t *)&lifr->lifr_addr;
	} else {
		/* TUN_CMD / MISC_CMD: no socket address to extract. */
		sin = NULL;
	}

	/* Re-invoke the command's restart handler with the copied-in data. */
	err = (*ipip->ipi_func_restart)(ipif, sin, q, mp, ipip,
	    (void *)mp1->b_rptr);

	/* SIOCLIFREMOVEIF could have removed the ipif */
	ip_ioctl_finish(q, mp, err,
	    ipip->ipi_flags & IPI_GET_CMD ? COPYOUT : NO_COPYOUT,
	    ipip->ipi_cmd == SIOCLIFREMOVEIF ? NULL : ipif, ipsq);
}

/*
 * ioctl processing
 *
 * ioctl processing starts with ip_sioctl_copyin_setup which looks up
 * the ioctl command in the ioctl tables and determines the copyin data size
 * from the ioctl property ipi_copyin_size, and does an mi_copyin() of that
 * size.
 *
 * ioctl processing then continues when the M_IOCDATA makes its way down.
 * Now the ioctl is looked up again in the ioctl table, and its properties are
 * extracted. The associated 'conn' is then refheld till the end of the ioctl
 * and the general ioctl processing function ip_process_ioctl is called.
 * ip_process_ioctl determines if the ioctl needs to be serialized, and if
 * so goes thru the serialization primitive ipsq_try_enter. Then the
 * appropriate function to handle the ioctl is called based on the entry in
 * the ioctl table. ioctl completion is encapsulated in ip_ioctl_finish
 * which also refreleases the 'conn' that was refheld at the start of the
 * ioctl. Finally ipsq_exit is called if needed to exit the ipsq.
 * ip_extract_lifreq_cmn extracts the interface name from the lifreq/ifreq
 * struct and looks up the ipif. ip_extract_tunreq handles the case of tunnel.
 *
 * Many exclusive ioctls go thru an internal down up sequence as part of
 * the operation. For example an attempt to change the IP address of an
 * ipif entails ipif_down, set address, ipif_up. Bringing down the interface
 * does all the cleanup such as deleting all ires that use this address.
 * Then we need to wait till all references to the interface go away.
 */
void
ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg)
{
	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
	ip_ioctl_cmd_t *ipip = (ip_ioctl_cmd_t *)arg;
	cmd_info_t ci;
	int err;
	boolean_t entered_ipsq = B_FALSE;

	ip3dbg(("ip_process_ioctl: ioctl %X\n", iocp->ioc_cmd));

	if (ipip == NULL)
		ipip = ip_sioctl_lookup(iocp->ioc_cmd);

	/*
	 * SIOCLIFADDIF needs to go thru a special path since the
	 * ill may not exist yet. This happens in the case of lo0
	 * which is created using this ioctl.
	 */
	if (ipip->ipi_cmd == SIOCLIFADDIF) {
		err = ip_sioctl_addif(NULL, NULL, q, mp, NULL, NULL);
		ip_ioctl_finish(q, mp, err,
		    ipip->ipi_flags & IPI_GET_CMD ? COPYOUT : NO_COPYOUT,
		    NULL, NULL);
		return;
	}

	ci.ci_ipif = NULL;
	switch (ipip->ipi_cmd_type) {
	case IF_CMD:
	case LIF_CMD:
		/*
		 * ioctls that pass in a [l]ifreq appear here.
		 * ip_extract_lifreq_cmn returns a refheld ipif in
		 * ci.ci_ipif
		 */
		err = ip_extract_lifreq_cmn(q, mp, ipip->ipi_cmd_type,
		    ipip->ipi_flags, &ci, ip_process_ioctl);
		if (err != 0) {
			ip_ioctl_finish(q, mp, err,
			    ipip->ipi_flags & IPI_GET_CMD ?
			    COPYOUT : NO_COPYOUT, NULL, NULL);
			return;
		}
		ASSERT(ci.ci_ipif != NULL);
		break;

	case TUN_CMD:
		/*
		 * SIOC[GS]TUNPARAM appear here. ip_extract_tunreq returns
		 * a refheld ipif in ci.ci_ipif
		 */
		err = ip_extract_tunreq(q, mp, &ci.ci_ipif, ip_process_ioctl);
		if (err != 0) {
			ip_ioctl_finish(q, mp, err,
			    ipip->ipi_flags & IPI_GET_CMD ?
			    COPYOUT : NO_COPYOUT, NULL, NULL);
			return;
		}
		ASSERT(ci.ci_ipif != NULL);
		break;

	case MISC_CMD:
		/*
		 * ioctls that neither pass in [l]ifreq or iftun_req come here
		 * For eg. SIOCGLIFCONF will appear here.
		 */
		switch (ipip->ipi_cmd) {
		case IF_UNITSEL:
			/* ioctl comes down the ill */
			ci.ci_ipif = ((ill_t *)q->q_ptr)->ill_ipif;
			ipif_refhold(ci.ci_ipif);
			break;
		case SIOCGMSFILTER:
		case SIOCSMSFILTER:
		case SIOCGIPMSFILTER:
		case SIOCSIPMSFILTER:
			err = ip_extract_msfilter(q, mp, &ci.ci_ipif,
			    ip_process_ioctl);
			if (err != 0) {
				ip_ioctl_finish(q, mp, err,
				    ipip->ipi_flags & IPI_GET_CMD ?
				    COPYOUT : NO_COPYOUT, NULL, NULL);
				return;
			}
			break;
		}
		err = 0;
		ci.ci_sin = NULL;
		ci.ci_sin6 = NULL;
		ci.ci_lifr = NULL;
		break;
	}

	/*
	 * If ipsq is non-null, we are already being called exclusively
	 */
	ASSERT(ipsq == NULL || IAM_WRITER_IPSQ(ipsq));
	if (!(ipip->ipi_flags & IPI_WR)) {
		/*
		 * A return value of EINPROGRESS means the ioctl is
		 * either queued and waiting for some reason or has
		 * already completed.
		 */
		err = (*ipip->ipi_func)(ci.ci_ipif, ci.ci_sin, q, mp, ipip,
		    ci.ci_lifr);
		if (ci.ci_ipif != NULL)
			ipif_refrele(ci.ci_ipif);
		ip_ioctl_finish(q, mp, err,
		    ipip->ipi_flags & IPI_GET_CMD ? COPYOUT : NO_COPYOUT,
		    NULL, NULL);
		return;
	}

	/* Exclusive (IPI_WR) ioctl: serialize through the ipsq. */
	ASSERT(ci.ci_ipif != NULL);

	if (ipsq == NULL) {
		ipsq = ipsq_try_enter(ci.ci_ipif, NULL, q, mp,
		    ip_process_ioctl, NEW_OP, B_TRUE);
		entered_ipsq = B_TRUE;
	}
	/*
	 * Release the ipif so that ipif_down and friends that wait for
	 * references to go away are not misled about the current ipif_refcnt
	 * values. We are writer so we can access the ipif even after releasing
	 * the ipif.
	 */
	ipif_refrele(ci.ci_ipif);
	/*
	 * NOTE(review): a NULL ipsq here presumably means ipsq_try_enter
	 * could not get exclusive access and queued mp for a later restart
	 * of ip_process_ioctl -- confirm against ipsq_try_enter.
	 */
	if (ipsq == NULL)
		return;

	mutex_enter(&ipsq->ipsq_lock);
	ASSERT(ipsq->ipsq_current_ipif == NULL);
	ipsq->ipsq_current_ipif = ci.ci_ipif;
	ipsq->ipsq_last_cmd = ipip->ipi_cmd;
	mutex_exit(&ipsq->ipsq_lock);
	mutex_enter(&(ci.ci_ipif)->ipif_ill->ill_lock);
	/*
	 * For most set ioctls that come here, this serves as a single point
	 * where we set the IPIF_CHANGING flag. This ensures that there won't
	 * be any new references to the ipif. This helps functions that go
	 * through this path and end up trying to wait for the refcnts
	 * associated with the ipif to go down to zero. Some exceptions are
	 * Failover, Failback, and Groupname commands that operate on more than
	 * just the ci.ci_ipif. These commands internally determine the
	 * set of ipif's they operate on and set and clear the IPIF_CHANGING
	 * flags on that set. Another exception is the Removeif command that
	 * sets the IPIF_CONDEMNED flag internally after identifying the right
	 * ipif to operate on.
	 */
	if (ipip->ipi_cmd != SIOCLIFREMOVEIF &&
	    ipip->ipi_cmd != SIOCLIFFAILOVER &&
	    ipip->ipi_cmd != SIOCLIFFAILBACK &&
	    ipip->ipi_cmd != SIOCSLIFGROUPNAME)
		(ci.ci_ipif)->ipif_state_flags |= IPIF_CHANGING;
	mutex_exit(&(ci.ci_ipif)->ipif_ill->ill_lock);

	/*
	 * A return value of EINPROGRESS means the ioctl is
	 * either queued and waiting for some reason or has
	 * already completed.
	 */
	err = (*ipip->ipi_func)(ci.ci_ipif, ci.ci_sin, q, mp, ipip,
	    ci.ci_lifr);

	/* SIOCLIFREMOVEIF could have removed the ipif */
	ip_ioctl_finish(q, mp, err,
	    ipip->ipi_flags & IPI_GET_CMD ? COPYOUT : NO_COPYOUT,
	    ipip->ipi_cmd == SIOCLIFREMOVEIF ? NULL : ci.ci_ipif, ipsq);

	if (entered_ipsq)
		ipsq_exit(ipsq, B_TRUE, B_TRUE);
}

/*
 * Complete the ioctl.
Typically ioctls use the mi package and need to
 * do mi_copyout/mi_copy_done.
 */
void
ip_ioctl_finish(queue_t *q, mblk_t *mp, int err, int mode,
    ipif_t *ipif, ipsq_t *ipsq)
{
	conn_t	*connp = NULL;

	/* Ioctl still in flight; completion will happen later. */
	if (err == EINPROGRESS)
		return;

	if (CONN_Q(q)) {
		connp = Q_TO_CONN(q);
		ASSERT(connp->conn_ref >= 2);
	}

	/*
	 * Send the reply: a successful COPYOUT ioctl copies its data back
	 * to the user; any other COPYOUT/NO_COPYOUT outcome is completed
	 * via mi_copy_done. Any other mode means the ioctl was aborted
	 * (e.g. through a conn close), so nothing is sent.
	 */
	if (mode == COPYOUT && err == 0)
		mi_copyout(q, mp);
	else if (mode == COPYOUT || mode == NO_COPYOUT)
		mi_copy_done(q, mp, err);

	/*
	 * The refhold placed at the start of the ioctl is released here.
	 */
	if (connp != NULL)
		CONN_OPER_PENDING_DONE(connp);

	/*
	 * An exclusive ioctl set IPIF_CHANGING at its start; undo that now
	 * that the operation is complete.
	 */
	if (ipif != NULL) {
		mutex_enter(&ipif->ipif_ill->ill_lock);
		ipif->ipif_state_flags &= ~IPIF_CHANGING;
		mutex_exit(&ipif->ipif_ill->ill_lock);
	}

	/*
	 * Clear the current ipif in the ipsq at the completion of the ioctl.
	 * Note that a non-null ipsq_current_ipif prevents new ioctls from
	 * entering the ipsq
	 */
	if (ipsq != NULL) {
		mutex_enter(&ipsq->ipsq_lock);
		ipsq->ipsq_current_ipif = NULL;
		mutex_exit(&ipsq->ipsq_lock);
	}
}

/*
 * This is called from ip_wput_nondata to resume a deferred TCP bind.
 */
/* ARGSUSED */
void
ip_resume_tcp_bind(void *arg, mblk_t *mp, void *arg2)
{
	/* squeue callback: 'arg' is the conn_t queued via squeue_fill(). */
	conn_t *connp = arg;
	tcp_t *tcp;

	ASSERT(connp != NULL && IPCL_IS_TCP(connp) && connp->conn_tcp != NULL);
	tcp = connp->conn_tcp;

	/* If the endpoint closed while the bind was deferred, drop it. */
	if (connp->conn_tcp->tcp_state == TCPS_CLOSED)
		freemsg(mp);
	else
		tcp_rput_other(tcp, mp);
	/* Drop the refhold taken when the bind retry was queued. */
	CONN_OPER_PENDING_DONE(connp);
}

/*
 * Called from ip_wput for all non data messages.  Dispatches on the
 * STREAMS message type: M_IOCTL/M_IOCDATA ioctl plumbing, M_FLUSH,
 * ARP M_CTL notifications, TPI M_PROTO/M_PCPROTO primitives, and
 * IRE_DB_TYPE resolver replies.  Anything unrecognized is passed on
 * downstream (module) or freed (driver).
 */
/* ARGSUSED */
void
ip_wput_nondata(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
{
	mblk_t *mp1;
	ire_t *ire;
	ill_t *ill;
	struct iocblk *iocp;
	ip_ioctl_cmd_t *ipip;
	cred_t *cr;
	conn_t *connp = NULL;
	int cmd, err;

	if (CONN_Q(q))
		connp = Q_TO_CONN(q);

	cr = DB_CREDDEF(mp, GET_QUEUE_CRED(q));

	/* Check if it is a queue to /dev/sctp. */
	if (connp != NULL && connp->conn_ulp == IPPROTO_SCTP &&
	    connp->conn_rq == NULL) {
		sctp_wput(q, mp);
		return;
	}

	switch (DB_TYPE(mp)) {
	case M_IOCTL:
		/*
		 * IOCTL processing begins in ip_sioctl_copyin_setup which
		 * will arrange to copy in associated control structures.
		 */
		ip_sioctl_copyin_setup(q, mp);
		return;
	case M_IOCDATA:
		/*
		 * Ensure that this is associated with one of our trans-
		 * parent ioctls. If it's not ours, discard it if we're
		 * running as a driver, or pass it on if we're a module.
		 */
		iocp = (struct iocblk *)mp->b_rptr;
		ipip = ip_sioctl_lookup(iocp->ioc_cmd);
		if (ipip == NULL) {
			if (q->q_next == NULL) {
				goto nak;
			} else {
				putnext(q, mp);
			}
			return;
		} else if ((q->q_next != NULL) &&
		    !(ipip->ipi_flags & IPI_MODOK)) {
			/*
			 * the ioctl is one we recognise, but is not
			 * consumed by IP as a module, pass M_IOCDATA
			 * for processing downstream, but only for
			 * common Streams ioctls.
			 */
			if (ipip->ipi_flags & IPI_PASS_DOWN) {
				putnext(q, mp);
				return;
			} else {
				goto nak;
			}
		}

		/* IOCTL continuation following copyin or copyout. */
		if (mi_copy_state(q, mp, NULL) == -1) {
			/*
			 * The copy operation failed. mi_copy_state already
			 * cleaned up, so we're out of here.
			 */
			return;
		}
		/*
		 * If we just completed a copy in, we become writer and
		 * continue processing in ip_sioctl_copyin_done. If it
		 * was a copy out, we call mi_copyout again. If there is
		 * nothing more to copy out, it will complete the IOCTL.
		 */
		if (MI_COPY_DIRECTION(mp) == MI_COPY_IN) {
			if (!(mp1 = mp->b_cont) || !(mp1 = mp1->b_cont)) {
				mi_copy_done(q, mp, EPROTO);
				return;
			}
			/*
			 * Check for cases that need more copying. A return
			 * value of 0 means a second copyin has been started,
			 * so we return; a return value of 1 means no more
			 * copying is needed, so we continue.
			 */
			cmd = iocp->ioc_cmd;
			if ((cmd == SIOCGMSFILTER || cmd == SIOCSMSFILTER ||
			    cmd == SIOCGIPMSFILTER || cmd == SIOCSIPMSFILTER) &&
			    MI_COPY_COUNT(mp) == 1) {
				if (ip_copyin_msfilter(q, mp) == 0)
					return;
			}
			/*
			 * Refhold the conn, till the ioctl completes. This is
			 * needed in case the ioctl ends up in the pending mp
			 * list. Every mp in the ill_pending_mp list and
			 * the ipsq_pending_mp must have a refhold on the conn
			 * to resume processing. The refhold is released when
			 * the ioctl completes. (normally or abnormally)
			 * In all cases ip_ioctl_finish is called to finish
			 * the ioctl.
			 */
			if (connp != NULL) {
				/* This is not a reentry */
				ASSERT(ipsq == NULL);
				CONN_INC_REF(connp);
			} else {
				if (!(ipip->ipi_flags & IPI_MODOK)) {
					mi_copy_done(q, mp, EINVAL);
					return;
				}
			}

			ip_process_ioctl(ipsq, q, mp, ipip);

		} else {
			mi_copyout(q, mp);
		}
		return;
nak:
		/* Reuse the M_IOCDATA mblk as an M_IOCNAK reply. */
		iocp->ioc_error = EINVAL;
		mp->b_datap->db_type = M_IOCNAK;
		iocp->ioc_count = 0;
		qreply(q, mp);
		return;

	case M_IOCNAK:
		/*
		 * The only way we could get here is if a resolver didn't like
		 * an IOCTL we sent it. This shouldn't happen.
		 */
		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
		    "ip_wput: unexpected M_IOCNAK, ioc_cmd 0x%x",
		    ((struct iocblk *)mp->b_rptr)->ioc_cmd);
		freemsg(mp);
		return;
	case M_IOCACK:
		/* Finish socket ioctls passed through to ARP. */
		ip_sioctl_iocack(q, mp);
		return;
	case M_FLUSH:
		if (*mp->b_rptr & FLUSHW)
			flushq(q, FLUSHALL);
		if (q->q_next) {
			/*
			 * M_FLUSH is sent up to IP by some drivers during
			 * unbind. ip_rput has already replied to it. We are
			 * here for the M_FLUSH that we originated in IP
			 * before sending the unbind request to the driver.
			 * Just free it as we don't queue packets in IP
			 * on the write side of the device instance.
			 */
			freemsg(mp);
			return;
		}
		if (*mp->b_rptr & FLUSHR) {
			/* Turn around the read-side flush request. */
			*mp->b_rptr &= ~FLUSHW;
			qreply(q, mp);
			return;
		}
		freemsg(mp);
		return;
	case IRE_DB_REQ_TYPE:
		/* An Upper Level Protocol wants a copy of an IRE. */
		ip_ire_req(q, mp);
		return;
	case M_CTL:
		if (mp->b_wptr - mp->b_rptr < sizeof (uint32_t))
			break;

		if (connp != NULL && *(uint32_t *)mp->b_rptr ==
		    IP_ULP_OUT_LABELED) {
			out_labeled_t *olp;

			if (mp->b_wptr - mp->b_rptr != sizeof (*olp))
				break;
			olp = (out_labeled_t *)mp->b_rptr;
			connp->conn_ulp_labeled = olp->out_qnext == q;
			freemsg(mp);
			return;
		}

		/* M_CTL messages are used by ARP to tell us things. */
		if ((mp->b_wptr - mp->b_rptr) < sizeof (arc_t))
			break;
		switch (((arc_t *)mp->b_rptr)->arc_cmd) {
		case AR_ENTRY_SQUERY:
			ip_wput_ctl(q, mp);
			return;
		case AR_CLIENT_NOTIFY:
			ip_arp_news(q, mp);
			return;
		case AR_DLPIOP_DONE:
			ASSERT(q->q_next != NULL);
			ill = (ill_t *)q->q_ptr;
			/* qwriter_ip releases the refhold */
			/* refhold on ill stream is ok without ILL_CAN_LOOKUP */
			ill_refhold(ill);
			(void) qwriter_ip(NULL, ill, q, mp, ip_arp_done,
			    CUR_OP, B_FALSE);
			return;
		case AR_ARP_CLOSING:
			/*
			 * ARP (above us) is closing. If no ARP bringup is
			 * currently pending, ack the message so that ARP
			 * can complete its close. Also mark ill_arp_closing
			 * so that new ARP bringups will fail. If any
			 * ARP bringup is currently in progress, we will
			 * ack this when the current ARP bringup completes.
			 */
			ASSERT(q->q_next != NULL);
			ill = (ill_t *)q->q_ptr;
			mutex_enter(&ill->ill_lock);
			ill->ill_arp_closing = 1;
			if (!ill->ill_arp_bringup_pending) {
				mutex_exit(&ill->ill_lock);
				qreply(q, mp);
			} else {
				mutex_exit(&ill->ill_lock);
				freemsg(mp);
			}
			return;
		default:
			break;
		}
		break;
	case M_PROTO:
	case M_PCPROTO:
		/*
		 * The only PROTO messages we expect are ULP binds and
		 * copies of option negotiation acknowledgements.
		 */
		switch (((union T_primitives *)mp->b_rptr)->type) {
		case O_T_BIND_REQ:
		case T_BIND_REQ: {
			/* Request can get queued in bind */
			ASSERT(connp != NULL);
			/*
			 * Both TCP and UDP call ip_bind_{v4,v6}() directly
			 * instead of going through this path. We only get
			 * here in the following cases:
			 *
			 * a. Bind retries, where ipsq is non-NULL.
			 * b. T_BIND_REQ is issued from non TCP/UDP
			 *    transport, e.g. icmp for raw socket,
			 *    in which case ipsq will be NULL.
			 */
			ASSERT(ipsq != NULL ||
			    (!IPCL_IS_TCP(connp) && !IPCL_IS_UDP(connp)));

			/* Don't increment refcnt if this is a re-entry */
			if (ipsq == NULL)
				CONN_INC_REF(connp);
			mp = connp->conn_af_isv6 ? ip_bind_v6(q, mp,
			    connp, NULL) : ip_bind_v4(q, mp, connp);
			if (mp == NULL)
				return;
			if (IPCL_IS_TCP(connp)) {
				/*
				 * In the case of TCP endpoint we
				 * come here only for bind retries
				 */
				ASSERT(ipsq != NULL);
				CONN_INC_REF(connp);
				squeue_fill(connp->conn_sqp, mp,
				    ip_resume_tcp_bind, connp,
				    SQTAG_BIND_RETRY);
				return;
			} else if (IPCL_IS_UDP(connp)) {
				/*
				 * In the case of UDP endpoint we
				 * come here only for bind retries
				 */
				ASSERT(ipsq != NULL);
				udp_resume_bind(connp, mp);
				return;
			}
			qreply(q, mp);
			CONN_OPER_PENDING_DONE(connp);
			return;
		}
		case T_SVR4_OPTMGMT_REQ:
			ip2dbg(("ip_wput: T_SVR4_OPTMGMT_REQ flags %x\n",
			    ((struct T_optmgmt_req *)mp->b_rptr)->MGMT_flags));

			ASSERT(connp != NULL);
			if (!snmpcom_req(q, mp, ip_snmp_set,
			    ip_snmp_get, cr)) {
				/*
				 * Call svr4_optcom_req so that it can
				 * generate the ack. We don't come here
				 * if this operation is being restarted.
				 * ip_restart_optmgmt will drop the conn ref.
				 * In the case of ipsec option after the ipsec
				 * load is complete conn_restart_ipsec_waiter
				 * drops the conn ref.
				 */
				ASSERT(ipsq == NULL);
				CONN_INC_REF(connp);
				if (ip_check_for_ipsec_opt(q, mp))
					return;
				err = svr4_optcom_req(q, mp, cr, &ip_opt_obj);
				if (err != EINPROGRESS) {
					/* Operation is done */
					CONN_OPER_PENDING_DONE(connp);
				}
			}
			return;
		case T_OPTMGMT_REQ:
			ip2dbg(("ip_wput: T_OPTMGMT_REQ\n"));
			/*
			 * Note: No snmpcom_req support through new
			 * T_OPTMGMT_REQ.
			 * Call tpi_optcom_req so that it can
			 * generate the ack.
			 */
			ASSERT(connp != NULL);
			ASSERT(ipsq == NULL);
			/*
			 * We don't come here for restart. ip_restart_optmgmt
			 * will drop the conn ref. In the case of ipsec option
			 * after the ipsec load is complete
			 * conn_restart_ipsec_waiter drops the conn ref.
			 */
			CONN_INC_REF(connp);
			if (ip_check_for_ipsec_opt(q, mp))
				return;
			err = tpi_optcom_req(q, mp, cr, &ip_opt_obj);
			if (err != EINPROGRESS) {
				/* Operation is done */
				CONN_OPER_PENDING_DONE(connp);
			}
			return;
		case T_UNBIND_REQ:
			mp = ip_unbind(q, mp);
			qreply(q, mp);
			return;
		default:
			/*
			 * Have to drop any DLPI messages coming down from
			 * arp (such as an info_req which would cause ip
			 * to receive an extra info_ack if it was passed
			 * through.
			 */
			ip1dbg(("ip_wput_nondata: dropping M_PROTO %d\n",
			    (int)*(uint_t *)mp->b_rptr));
			freemsg(mp);
			return;
		}
		/* NOTREACHED */
	case IRE_DB_TYPE: {
		nce_t *nce;
		ill_t *ill;
		in6_addr_t gw_addr_v6;


		/*
		 * This is a response back from a resolver. It
		 * consists of a message chain containing:
		 *	IRE_MBLK-->LL_HDR_MBLK->pkt
		 * The IRE_MBLK is the one we allocated in ip_newroute.
		 * The LL_HDR_MBLK is the DLPI header to use to get
		 * the attached packet, and subsequent ones for the
		 * same destination, transmitted.
		 */
		if ((mp->b_wptr - mp->b_rptr) != sizeof (ire_t))	/* ire */
			break;
		/*
		 * First, check to make sure the resolution succeeded.
		 * If it failed, the second mblk will be empty.
		 * If it is, free the chain, dropping the packet.
		 * (We must ire_delete the ire; that frees the ire mblk)
		 * We're doing this now to support PVCs for ATM; it's
		 * a partial xresolv implementation. When we fully implement
		 * xresolv interfaces, instead of freeing everything here
		 * we'll initiate neighbor discovery.
		 *
		 * For v4 (ARP and other external resolvers) the resolver
		 * frees the message, so no check is needed. This check
		 * is required, though, for a full xresolve implementation.
		 * Including this code here now both shows how external
		 * resolvers can NACK a resolution request using an
		 * existing design that has no specific provisions for NACKs,
		 * and also takes into account that the current non-ARP
		 * external resolver has been coded to use this method of
		 * NACKing for all IPv6 (xresolv) cases,
		 * whether our xresolv implementation is complete or not.
		 *
		 */
		ire = (ire_t *)mp->b_rptr;
		ill = ire_to_ill(ire);
		mp1 = mp->b_cont;		/* dl_unitdata_req */
		if (mp1->b_rptr == mp1->b_wptr) {
			/* Empty second mblk: the resolution was NACKed. */
			if (ire->ire_ipversion == IPV6_VERSION) {
				/*
				 * XRESOLV interface.
				 */
				ASSERT(ill->ill_flags & ILLF_XRESOLV);
				mutex_enter(&ire->ire_lock);
				gw_addr_v6 = ire->ire_gateway_addr_v6;
				mutex_exit(&ire->ire_lock);
				if (IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6)) {
					nce = ndp_lookup(ill,
					    &ire->ire_addr_v6, B_FALSE);
				} else {
					nce = ndp_lookup(ill, &gw_addr_v6,
					    B_FALSE);
				}
				if (nce != NULL) {
					nce_resolv_failed(nce);
					ndp_delete(nce);
					NCE_REFRELE(nce);
				}
			}
			mp->b_cont = NULL;
			freemsg(mp1);		/* frees the pkt as well */
			ire_delete((ire_t *)mp->b_rptr);
			return;
		}
		/*
		 * Split them into IRE_MBLK and pkt and feed it into
		 * ire_add_then_send. Then in ire_add_then_send
		 * the IRE will be added, and then the packet will be
		 * run back through ip_wput. This time it will make
		 * it to the wire.
		 */
		mp->b_cont = NULL;
		mp = mp1->b_cont;		/* now, mp points to pkt */
		mp1->b_cont = NULL;
		ip1dbg(("ip_wput_nondata: reply from external resolver \n"));
		if (ire->ire_ipversion == IPV6_VERSION) {
			/*
			 * XRESOLV interface. Find the nce and put a copy
			 * of the dl_unitdata_req in nce_res_mp
			 */
			ASSERT(ill->ill_flags & ILLF_XRESOLV);
			mutex_enter(&ire->ire_lock);
			gw_addr_v6 = ire->ire_gateway_addr_v6;
			mutex_exit(&ire->ire_lock);
			if (IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6)) {
				nce = ndp_lookup(ill, &ire->ire_addr_v6,
				    B_FALSE);
			} else {
				nce = ndp_lookup(ill, &gw_addr_v6, B_FALSE);
			}
			if (nce != NULL) {
				/*
				 * We have to protect nce_res_mp here
				 * from being accessed by other threads
				 * while we change the mblk pointer.
				 * Other functions will also lock the nce when
				 * accessing nce_res_mp.
				 *
				 * The reason we change the mblk pointer
				 * here rather than copying the resolved address
				 * into the template is that, unlike with
				 * ethernet, we have no guarantee that the
				 * resolved address length will be
				 * smaller than or equal to the lla length
				 * with which the template was allocated,
				 * (for ethernet, they're equal)
				 * so we have to use the actual resolved
				 * address mblk - which holds the real
				 * dl_unitdata_req with the resolved address.
				 *
				 * Doing this is the same behavior as was
				 * previously used in the v4 ARP case.
				 */
				mutex_enter(&nce->nce_lock);
				if (nce->nce_res_mp != NULL)
					freemsg(nce->nce_res_mp);
				nce->nce_res_mp = mp1;
				mutex_exit(&nce->nce_lock);
				/*
				 * We do a fastpath probe here because
				 * we have resolved the address without
				 * using Neighbor Discovery.
				 * In the non-XRESOLV v6 case, the fastpath
				 * probe is done right after neighbor
				 * discovery completes.
				 */
				if (nce->nce_res_mp != NULL) {
					int res;
					nce_fastpath_list_add(nce);
					res = ill_fastpath_probe(ill,
					    nce->nce_res_mp);
					if (res != 0 && res != EAGAIN)
						nce_fastpath_list_delete(nce);
				}

				ire_add_then_send(q, ire, mp);
				/*
				 * Now we have to clean out any packets
				 * that may have been queued on the nce
				 * while it was waiting for address resolution
				 * to complete.
				 */
				mutex_enter(&nce->nce_lock);
				mp1 = nce->nce_qd_mp;
				nce->nce_qd_mp = NULL;
				mutex_exit(&nce->nce_lock);
				while (mp1 != NULL) {
					mblk_t *nxt_mp;
					queue_t *fwdq = NULL;
					ill_t   *inbound_ill;
					uint_t ifindex;

					nxt_mp = mp1->b_next;
					mp1->b_next = NULL;
					/*
					 * Retrieve ifindex stored in
					 * ip_rput_data_v6()
					 */
					ifindex =
					    (uint_t)(uintptr_t)mp1->b_prev;
					inbound_ill =
					    ill_lookup_on_ifindex(ifindex,
					    B_TRUE, NULL, NULL, NULL,
					    NULL);
					mp1->b_prev = NULL;
					if (inbound_ill != NULL)
						fwdq = inbound_ill->ill_rq;

					if (fwdq != NULL) {
						put(fwdq, mp1);
						ill_refrele(inbound_ill);
					} else
						put(WR(ill->ill_rq), mp1);
					mp1 = nxt_mp;
				}
				NCE_REFRELE(nce);
			} else {	/* nce is NULL; clean up */
				ire_delete(ire);
				freemsg(mp);
				freemsg(mp1);
				return;
			}
		} else {
			/* v4: attach the resolved DLPI header to the ire. */
			ire->ire_dlureq_mp = mp1;
			ire_add_then_send(q, ire, mp);
		}
		return;	/* All is well, the packet has been sent. */
	}
	default:
		break;
	}
	if (q->q_next) {
		putnext(q, mp);
	} else
		freemsg(mp);
}

/*
 * Process IP options in an outbound packet. Modify the destination if there
 * is a source route option.
 * Returns non-zero if something fails in which case an ICMP error has been
 * sent and mp freed.
 */
static int
ip_wput_options(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha,
    boolean_t mctl_present, zoneid_t zoneid)
{
	ipoptp_t	opts;
	uchar_t		*opt;
	uint8_t		optval;
	uint8_t		optlen;
	ipaddr_t	dst;
	intptr_t	code = 0;	/* offset of the bad octet, for ICMP */
	mblk_t		*mp;
	ire_t		*ire = NULL;

	ip2dbg(("ip_wput_options\n"));
	/* With an M_CTL present, the packet itself is in b_cont. */
	mp = ipsec_mp;
	if (mctl_present) {
		mp = ipsec_mp->b_cont;
	}

	dst = ipha->ipha_dst;
	for (optval = ipoptp_first(&opts, ipha);
	    optval != IPOPT_EOL;
	    optval = ipoptp_next(&opts)) {
		opt = opts.ipoptp_cur;
		optlen = opts.ipoptp_len;
		ip2dbg(("ip_wput_options: opt %d, len %d\n",
		    optval, optlen));
		switch (optval) {
			uint32_t off;
		case IPOPT_SSRR:
		case IPOPT_LSRR:
			if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
				ip1dbg((
				    "ip_wput_options: bad option offset\n"));
				code = (char *)&opt[IPOPT_OLEN] -
				    (char *)ipha;
				goto param_prob;
			}
			off = opt[IPOPT_OFFSET];
			ip1dbg(("ip_wput_options: next hop 0x%x\n",
			    ntohl(dst)));
			/*
			 * For strict: verify that dst is directly
			 * reachable.
			 */
			if (optval == IPOPT_SSRR) {
				ire = ire_ftable_lookup(dst, 0, 0,
				    IRE_INTERFACE, NULL, NULL, ALL_ZONES, 0,
				    MBLK_GETLABEL(mp),
				    MATCH_IRE_TYPE | MATCH_IRE_SECATTR);
				if (ire == NULL) {
					ip1dbg(("ip_wput_options: SSRR not"
					    " directly reachable: 0x%x\n",
					    ntohl(dst)));
					goto bad_src_route;
				}
				ire_refrele(ire);
			}
			break;
		case IPOPT_RR:
			if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
				ip1dbg((
				    "ip_wput_options: bad option offset\n"));
				code = (char *)&opt[IPOPT_OLEN] -
				    (char *)ipha;
				goto param_prob;
			}
			break;
		case IPOPT_TS:
			/*
			 * Verify that length >=5 and that there is either
			 * room for another timestamp or that the overflow
			 * counter is not maxed out.
			 */
			code = (char *)&opt[IPOPT_OLEN] - (char *)ipha;
			if (optlen < IPOPT_MINLEN_IT) {
				goto param_prob;
			}
			if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
				ip1dbg((
				    "ip_wput_options: bad option offset\n"));
				code = (char *)&opt[IPOPT_OFFSET] -
				    (char *)ipha;
				goto param_prob;
			}
			switch (opt[IPOPT_POS_OV_FLG] & 0x0F) {
			case IPOPT_TS_TSONLY:
				off = IPOPT_TS_TIMELEN;
				break;
			case IPOPT_TS_TSANDADDR:
			case IPOPT_TS_PRESPEC:
			case IPOPT_TS_PRESPEC_RFC791:
				off = IP_ADDR_LEN + IPOPT_TS_TIMELEN;
				break;
			default:
				code = (char *)&opt[IPOPT_POS_OV_FLG] -
				    (char *)ipha;
				goto param_prob;
			}
			if (opt[IPOPT_OFFSET] - 1 + off > optlen &&
			    (opt[IPOPT_POS_OV_FLG] & 0xF0) == 0xF0) {
				/*
				 * No room and the overflow counter is 15
				 * already.
				 */
				goto param_prob;
			}
			break;
		}
	}

	if ((opts.ipoptp_flags & IPOPTP_ERROR) == 0)
		return (0);

	ip1dbg(("ip_wput_options: error processing IP options."));
	code = (char *)&opt[IPOPT_OFFSET] - (char *)ipha;

param_prob:
	/*
	 * Since ip_wput() isn't close to finished, we fill
	 * in enough of the header for credible error reporting.
	 */
	if (ip_hdr_complete((ipha_t *)mp->b_rptr, zoneid)) {
		/* Failed */
		freemsg(ipsec_mp);
		return (-1);
	}
	icmp_param_problem(q, ipsec_mp, (uint8_t)code);
	return (-1);

bad_src_route:
	/*
	 * Since ip_wput() isn't close to finished, we fill
	 * in enough of the header for credible error reporting.
	 */
	if (ip_hdr_complete((ipha_t *)mp->b_rptr, zoneid)) {
		/* Failed */
		freemsg(ipsec_mp);
		return (-1);
	}
	icmp_unreachable(q, ipsec_mp, ICMP_SOURCE_ROUTE_FAILED);
	return (-1);
}

/*
 * The maximum value of conn_drain_list_cnt is CONN_MAXDRAINCNT.
 * conn_drain_list_cnt can be changed by setting conn_drain_nthreads
 * thru /etc/system.
 */
#define	CONN_MAXDRAINCNT	64

/*
 * Allocate and initialize the global array of conn drain lists and their
 * per-list locks.  The list count comes from the conn_drain_nthreads
 * tunable, falling back to MIN(ncpus, 8) when unset or out of range.
 */
static void
conn_drain_init(void)
{
	int i;

	conn_drain_list_cnt = conn_drain_nthreads;

	if ((conn_drain_list_cnt == 0) ||
	    (conn_drain_list_cnt > CONN_MAXDRAINCNT)) {
		/*
		 * Default value of the number of drainers is the
		 * number of cpus, subject to maximum of 8 drainers.
		 */
		if (boot_max_ncpus != -1)
			conn_drain_list_cnt = MIN(boot_max_ncpus, 8);
		else
			conn_drain_list_cnt = MIN(max_ncpus, 8);
	}

	conn_drain_list = kmem_zalloc(conn_drain_list_cnt * sizeof (idl_t),
	    KM_SLEEP);

	for (i = 0; i < conn_drain_list_cnt; i++) {
		mutex_init(&conn_drain_list[i].idl_lock, NULL,
		    MUTEX_DEFAULT, NULL);
	}
}

/*
 * Tear down what conn_drain_init() set up: destroy the per-list locks and
 * free the drain list array.
 */
static void
conn_drain_fini(void)
{
	int i;

	for (i = 0; i < conn_drain_list_cnt; i++)
		mutex_destroy(&conn_drain_list[i].idl_lock);
	kmem_free(conn_drain_list, conn_drain_list_cnt * sizeof (idl_t));
	conn_drain_list = NULL;
}

/*
 * Note: For an overview of how flowcontrol is handled in IP please see the
 * IP Flowcontrol notes at the top of this file.
 *
 * Flow control has blocked us from proceeding. Insert the given conn in one
 * of the conn drain lists. These conn wq's will be qenabled later on when
 * STREAMS flow control does a backenable. conn_walk_drain will enable
 * the first conn in each of these drain lists. Each of these qenabled conns
 * in turn enables the next in the list, after it runs, or when it closes,
 * thus sustaining the drain process.
 *
 * The only possible calling sequence is ip_wsrv (on conn) -> ip_wput ->
 * conn_drain_insert. Thus there can be only 1 instance of conn_drain_insert
 * running at any time, on a given conn, since there can be only 1 service proc
 * running on a queue at any time.
 */
void
conn_drain_insert(conn_t *connp)
{
	idl_t	*idl;
	uint_t	index;

	mutex_enter(&connp->conn_lock);
	if (connp->conn_state_flags & CONN_CLOSING) {
		/*
		 * The conn is closing as a result of which CONN_CLOSING
		 * is set. Return.
		 */
		mutex_exit(&connp->conn_lock);
		return;
	} else if (connp->conn_idl == NULL) {
		/*
		 * Assign the next drain list round robin. We dont' use
		 * a lock, and thus it may not be strictly round robin.
		 * Atomicity of load/stores is enough to make sure that
		 * conn_drain_list_index is always within bounds.
		 */
		index = conn_drain_list_index;
		ASSERT(index < conn_drain_list_cnt);
		connp->conn_idl = &conn_drain_list[index];
		index++;
		if (index == conn_drain_list_cnt)
			index = 0;
		conn_drain_list_index = index;
	}
	mutex_exit(&connp->conn_lock);

	mutex_enter(CONN_DRAIN_LIST_LOCK(connp));
	if ((connp->conn_drain_prev != NULL) ||
	    (connp->conn_state_flags & CONN_CLOSING)) {
		/*
		 * The conn is already in the drain list, OR
		 * the conn is closing. We need to check again for
		 * the closing case again since close can happen
		 * after we drop the conn_lock, and before we
		 * acquire the CONN_DRAIN_LIST_LOCK.
		 */
		mutex_exit(CONN_DRAIN_LIST_LOCK(connp));
		return;
	} else {
		idl = connp->conn_idl;
	}

	/*
	 * The conn is not in the drain list. Insert it at the
	 * tail of the drain list. The drain list is circular
	 * and doubly linked. idl_conn points to the 1st element
	 * in the list.
	 */
	if (idl->idl_conn == NULL) {
		idl->idl_conn = connp;
		connp->conn_drain_next = connp;
		connp->conn_drain_prev = connp;
	} else {
		conn_t *head = idl->idl_conn;

		connp->conn_drain_next = head;
		connp->conn_drain_prev = head->conn_drain_prev;
		head->conn_drain_prev->conn_drain_next = connp;
		head->conn_drain_prev = connp;
	}
	mutex_exit(CONN_DRAIN_LIST_LOCK(connp));
}

/*
 * This conn is closing, and we are called from ip_close.
 * OR
 * This conn has been serviced by ip_wsrv, and we need to do the tail
 * processing.
 * If this conn is part of the drain list, we may need to sustain the drain
 * process by qenabling the next conn in the drain list. We may also need to
 * remove this conn from the list, if it is done.
 */
static void
conn_drain_tail(conn_t *connp, boolean_t closing)
{
	idl_t *idl;

	/*
	 * connp->conn_idl is stable at this point, and no lock is needed
	 * to check it. If we are called from ip_close, close has already
	 * set CONN_CLOSING, thus freezing the value of conn_idl, and
	 * called us only because conn_idl is non-null. If we are called thru
	 * service, conn_idl could be null, but it cannot change because
	 * service is single-threaded per queue, and there cannot be another
	 * instance of service trying to call conn_drain_insert on this conn
	 * now.
	 */
	ASSERT(!closing || (connp->conn_idl != NULL));

	/*
	 * If connp->conn_idl is null, the conn has not been inserted into any
	 * drain list even once since creation of the conn. Just return.
	 */
	if (connp->conn_idl == NULL)
		return;

	mutex_enter(CONN_DRAIN_LIST_LOCK(connp));

	if (connp->conn_drain_prev == NULL) {
		/* This conn is currently not in the drain list. */
		mutex_exit(CONN_DRAIN_LIST_LOCK(connp));
		return;
	}
	idl = connp->conn_idl;
	if (idl->idl_conn_draining == connp) {
		/*
		 * This conn is the current drainer. If this is the last conn
		 * in the drain list, we need to do more checks, in the 'if'
		 * below. Otherwise we need to just qenable the next conn,
		 * to sustain the draining, and is handled in the 'else'
		 * below.
		 */
		if (connp->conn_drain_next == idl->idl_conn) {
			/*
			 * This conn is the last in this list. This round
			 * of draining is complete. If idl_repeat is set,
			 * it means another flow enabling has happened from
			 * the driver/streams and we need to do another round
			 * of draining.
			 * If there are more than 2 conns in the drain list,
			 * do a left rotate by 1, so that all conns except the
			 * conn at the head move towards the head by 1, and the
			 * conn at the head goes to the tail. This attempts
			 * a more even share for all queues that are being
			 * drained.
			 */
			if ((connp->conn_drain_next != connp) &&
			    (idl->idl_conn->conn_drain_next != connp)) {
				idl->idl_conn = idl->idl_conn->conn_drain_next;
			}
			if (idl->idl_repeat) {
				qenable(idl->idl_conn->conn_wq);
				idl->idl_conn_draining = idl->idl_conn;
				idl->idl_repeat = 0;
			} else {
				idl->idl_conn_draining = NULL;
			}
		} else {
			/*
			 * If the next queue that we are now qenable'ing,
			 * is closing, it will remove itself from this list
			 * and qenable the subsequent queue in ip_close().
			 * Serialization is achieved thru idl_lock.
			 */
			qenable(connp->conn_drain_next->conn_wq);
			idl->idl_conn_draining = connp->conn_drain_next;
		}
	}
	if (!connp->conn_did_putbq || closing) {
		/*
		 * Remove ourself from the drain list, if we did not do
		 * a putbq, or if the conn is closing.
		 * Note: It is possible that q->q_first is non-null. It means
		 * that these messages landed after we did a enableok() in
		 * ip_wsrv. Thus STREAMS will call ip_wsrv once again to
		 * service them.
		 */
		if (connp->conn_drain_next == connp) {
			/* Singleton in the list */
			ASSERT(connp->conn_drain_prev == connp);
			idl->idl_conn = NULL;
			idl->idl_conn_draining = NULL;
		} else {
			connp->conn_drain_prev->conn_drain_next =
			    connp->conn_drain_next;
			connp->conn_drain_next->conn_drain_prev =
			    connp->conn_drain_prev;
			if (idl->idl_conn == connp)
				idl->idl_conn = connp->conn_drain_next;
			ASSERT(idl->idl_conn_draining != connp);

		}
		connp->conn_drain_next = NULL;
		connp->conn_drain_prev = NULL;
	}
	mutex_exit(CONN_DRAIN_LIST_LOCK(connp));
}

/*
 * Write service routine. Shared perimeter entry point.
 * ip_wsrv can be called in any of the following ways.
 * 1. The device queue's messages has fallen below the low water mark
 *    and STREAMS has backenabled the ill_wq. We walk thru all the
 *    drain lists and backenable the first conn in each list.
 * 2. The above causes STREAMS to run ip_wsrv on the conn_wq of the
 *    qenabled non-tcp upper layers. We start dequeing messages and call
 *    ip_wput for each message.
 */

void
ip_wsrv(queue_t *q)
{
	conn_t	*connp;
	ill_t	*ill;
	mblk_t	*mp;

	if (q->q_next) {
		ill = (ill_t *)q->q_ptr;
		if (ill->ill_state_flags == 0) {
			/*
			 * The device flow control has opened up.
			 * Walk through conn drain lists and qenable the
			 * first conn in each list. This makes sense only
			 * if the stream is fully plumbed and setup.
			 * Hence the if check above.
			 */
			ip1dbg(("ip_wsrv: walking\n"));
			conn_walk_drain();
		}
		return;
	}

	connp = Q_TO_CONN(q);
	ip1dbg(("ip_wsrv: %p %p\n", (void *)q, (void *)connp));

	/*
	 * 1. Set conn_draining flag to signal that service is active.
	 *
	 * 2. ip_output determines whether it has been called from service,
	 *    based on the last parameter. If it is IP_WSRV it concludes it
	 *    has been called from service.
	 *
	 * 3. Message ordering is preserved by the following logic.
	 *    i. A directly called ip_output (i.e. not thru service) will queue
	 *    the message at the tail, if conn_draining is set (i.e. service
	 *    is running) or if q->q_first is non-null.
	 *
	 *    ii. If ip_output is called from service, and if ip_output cannot
	 *    putnext due to flow control, it does a putbq.
	 *
	 * 4. noenable the queue so that a putbq from ip_wsrv does not reenable
	 *    (causing an infinite loop).
	 */
	ASSERT(!connp->conn_did_putbq);
	while ((q->q_first != NULL) && !connp->conn_did_putbq) {
		connp->conn_draining = 1;
		noenable(q);
		while ((mp = getq(q)) != NULL) {
			ip_output(Q_TO_CONN(q), mp, q, IP_WSRV);
			if (connp->conn_did_putbq) {
				/* ip_wput did a putbq */
				break;
			}
		}
		/*
		 * At this point, a thread coming down from top, calling
		 * ip_wput, may end up queueing the message. We have not yet
		 * enabled the queue, so ip_wsrv won't be called again.
		 * To avoid this race, check q->q_first again (in the loop)
		 * If the other thread queued the message before we call
		 * enableok(), we will catch it in the q->q_first check.
		 * If the other thread queues the message after we call
		 * enableok(), ip_wsrv will be called again by STREAMS.
		 */
		connp->conn_draining = 0;
		enableok(q);
	}

	/* Enable the next conn for draining */
	conn_drain_tail(connp, B_FALSE);

	connp->conn_did_putbq = 0;
}

/*
 * Walk the list of all conn's calling the function provided with the
 * specified argument for each. Note that this only walks conn's that
 * have been bound.
 * Applies to both IPv4 and IPv6.
 */
static void
conn_walk_fanout(pfv_t func, void *arg, zoneid_t zoneid)
{
	/* Visit every fanout table a bound conn can live in. */
	conn_walk_fanout_table(ipcl_udp_fanout, ipcl_udp_fanout_size,
	    func, arg, zoneid);
	conn_walk_fanout_table(ipcl_conn_fanout, ipcl_conn_fanout_size,
	    func, arg, zoneid);
	conn_walk_fanout_table(ipcl_bind_fanout, ipcl_bind_fanout_size,
	    func, arg, zoneid);
	conn_walk_fanout_table(ipcl_proto_fanout,
	    A_CNT(ipcl_proto_fanout), func, arg, zoneid);
	conn_walk_fanout_table(ipcl_proto_fanout_v6,
	    A_CNT(ipcl_proto_fanout_v6), func, arg, zoneid);
}

/*
 * Flowcontrol has relieved, and STREAMS has backenabled us. For each list
 * of conns that need to be drained, check if drain is already in progress.
 * If so set the idl_repeat bit, indicating that the last conn in the list
 * needs to reinitiate the drain once again, for the list. If drain is not
 * in progress for the list, initiate the draining, by qenabling the 1st
 * conn in the list. The drain is self-sustaining, each qenabled conn will
 * in turn qenable the next conn, when it is done/blocked/closing.
 */
static void
conn_walk_drain(void)
{
	int i;
	idl_t *idl;

	IP_STAT(ip_conn_walk_drain);

	for (i = 0; i < conn_drain_list_cnt; i++) {
		idl = &conn_drain_list[i];
		mutex_enter(&idl->idl_lock);
		if (idl->idl_conn == NULL) {
			mutex_exit(&idl->idl_lock);
			continue;
		}
		/*
		 * If this list is not being drained currently by
		 * an ip_wsrv thread, start the process.
		 */
		if (idl->idl_conn_draining == NULL) {
			ASSERT(idl->idl_repeat == 0);
			qenable(idl->idl_conn->conn_wq);
			idl->idl_conn_draining = idl->idl_conn;
		} else {
			idl->idl_repeat = 1;
		}
		mutex_exit(&idl->idl_lock);
	}
}

/*
 * Walk a conn hash table of `count' buckets, calling func for each entry.
26108 */ 26109 static void 26110 conn_walk_fanout_table(connf_t *connfp, uint_t count, pfv_t func, void *arg, 26111 zoneid_t zoneid) 26112 { 26113 conn_t *connp; 26114 26115 while (count-- > 0) { 26116 mutex_enter(&connfp->connf_lock); 26117 for (connp = connfp->connf_head; connp != NULL; 26118 connp = connp->conn_next) { 26119 if (zoneid == GLOBAL_ZONEID || 26120 zoneid == connp->conn_zoneid) { 26121 CONN_INC_REF(connp); 26122 mutex_exit(&connfp->connf_lock); 26123 (*func)(connp, arg); 26124 mutex_enter(&connfp->connf_lock); 26125 CONN_DEC_REF(connp); 26126 } 26127 } 26128 mutex_exit(&connfp->connf_lock); 26129 connfp++; 26130 } 26131 } 26132 26133 /* ipcl_walk routine invoked for ip_conn_report for each conn. */ 26134 static void 26135 conn_report1(conn_t *connp, void *mp) 26136 { 26137 char buf1[INET6_ADDRSTRLEN]; 26138 char buf2[INET6_ADDRSTRLEN]; 26139 uint_t print_len, buf_len; 26140 26141 ASSERT(connp != NULL); 26142 26143 buf_len = ((mblk_t *)mp)->b_datap->db_lim - ((mblk_t *)mp)->b_wptr; 26144 if (buf_len <= 0) 26145 return; 26146 (void) inet_ntop(AF_INET6, &connp->conn_srcv6, buf1, sizeof (buf1)), 26147 (void) inet_ntop(AF_INET6, &connp->conn_remv6, buf2, sizeof (buf2)), 26148 print_len = snprintf((char *)((mblk_t *)mp)->b_wptr, buf_len, 26149 MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR 26150 "%5d %s/%05d %s/%05d\n", 26151 (void *)connp, (void *)CONNP_TO_RQ(connp), 26152 (void *)CONNP_TO_WQ(connp), connp->conn_zoneid, 26153 buf1, connp->conn_lport, 26154 buf2, connp->conn_fport); 26155 if (print_len < buf_len) { 26156 ((mblk_t *)mp)->b_wptr += print_len; 26157 } else { 26158 ((mblk_t *)mp)->b_wptr += buf_len; 26159 } 26160 } 26161 26162 /* 26163 * Named Dispatch routine to produce a formatted report on all conns 26164 * that are listed in one of the fanout tables. 26165 * This report is accessed by using the ndd utility to "get" ND variable 26166 * "ip_conn_status". 
 */
/* ARGSUSED */
static int
ip_conn_report(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr)
{
	(void) mi_mpprintf(mp,
	    "CONN " MI_COL_HDRPAD_STR
	    "rfq " MI_COL_HDRPAD_STR
	    "stq " MI_COL_HDRPAD_STR
	    " zone local remote");

	/*
	 * Because of the ndd constraint, at most we can have 64K buffer
	 * to put in all conn info. So to be more efficient, just
	 * allocate a 64K buffer here, assuming we need that large buffer.
	 * This should be OK as only privileged processes can do ndd /dev/ip.
	 */
	if ((mp->b_cont = allocb(ND_MAX_BUF_LEN, BPRI_HI)) == NULL) {
		/* The following may work even if we cannot get a large buf. */
		(void) mi_mpprintf(mp, "<< Out of buffer >>\n");
		return (0);
	}

	/* Report only conns visible from the caller's zone. */
	conn_walk_fanout(conn_report1, mp->b_cont, Q_TO_CONN(q)->conn_zoneid);
	return (0);
}

/*
 * Determine if the ill and multicast aspects of that packets
 * "matches" the conn.
 */
boolean_t
conn_wantpacket(conn_t *connp, ill_t *ill, ipha_t *ipha, int fanout_flags,
    zoneid_t zoneid)
{
	ill_t *in_ill;
	boolean_t found;
	ipif_t *ipif;
	ire_t *ire;
	ipaddr_t dst, src;

	dst = ipha->ipha_dst;
	src = ipha->ipha_src;

	/*
	 * conn_incoming_ill is set by IP_BOUND_IF which limits
	 * unicast, broadcast and multicast reception to
	 * conn_incoming_ill. conn_wantpacket itself is called
	 * only for BROADCAST and multicast.
	 *
	 * 1) ip_rput suppresses duplicate broadcasts if the ill
	 *    is part of a group. Hence, we should be receiving
	 *    just one copy of broadcast for the whole group.
	 *    Thus, if it is part of the group the packet could
	 *    come on any ill of the group and hence we need a
	 *    match on the group. Otherwise, match on ill should
	 *    be sufficient.
	 *
	 * 2) ip_rput does not suppress duplicate multicast packets.
	 *    If there are two interfaces in a ill group and we have
	 *    2 applications (conns) joined a multicast group G on
	 *    both the interfaces, ilm_lookup_ill filter in ip_rput
	 *    will give us two packets because we join G on both the
	 *    interfaces rather than nominating just one interface
	 *    for receiving multicast like broadcast above. So, we
	 *    have to call ilg_lookup_ill to filter out duplicate
	 *    copies, if ill is part of a group.
	 */
	in_ill = connp->conn_incoming_ill;
	if (in_ill != NULL) {
		if (in_ill->ill_group == NULL) {
			if (in_ill != ill)
				return (B_FALSE);
		} else if (in_ill->ill_group != ill->ill_group) {
			return (B_FALSE);
		}
	}

	if (!CLASSD(dst)) {
		/* Broadcast (not multicast) destination. */
		if (connp->conn_zoneid == zoneid)
			return (B_TRUE);
		/*
		 * The conn is in a different zone; we need to check that this
		 * broadcast address is configured in the application's zone
		 * and on one ill in the group.
		 */
		ipif = ipif_get_next_ipif(NULL, ill);
		if (ipif == NULL)
			return (B_FALSE);
		ire = ire_ctable_lookup(dst, 0, IRE_BROADCAST, ipif,
		    connp->conn_zoneid, NULL,
		    (MATCH_IRE_TYPE | MATCH_IRE_ILL_GROUP));
		ipif_refrele(ipif);
		if (ire != NULL) {
			ire_refrele(ire);
			return (B_TRUE);
		} else {
			return (B_FALSE);
		}
	}

	if ((fanout_flags & IP_FF_NO_MCAST_LOOP) &&
	    connp->conn_zoneid == zoneid) {
		/*
		 * Loopback case: the sending endpoint has IP_MULTICAST_LOOP
		 * disabled, therefore we don't dispatch the multicast packet
		 * to the sending zone.
		 */
		return (B_FALSE);
	}

	if ((ill->ill_phyint->phyint_flags & PHYI_LOOPBACK) &&
	    connp->conn_zoneid != zoneid) {
		/*
		 * Multicast packet on the loopback interface: we only match
		 * conns who joined the group in the specified zone.
		 */
		return (B_FALSE);
	}

	if (connp->conn_multi_router) {
		/* multicast packet and multicast router socket: send up */
		return (B_TRUE);
	}

	/* Finally, check group membership (with source filtering). */
	mutex_enter(&connp->conn_lock);
	found = (ilg_lookup_ill_withsrc(connp, dst, src, ill) != NULL);
	mutex_exit(&connp->conn_lock);
	return (found);
}

/*
 * Finish processing of "arp_up" when AR_DLPIOP_DONE is received from arp.
 */
/* ARGSUSED */
static void
ip_arp_done(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp, void *dummy_arg)
{
	ill_t	*ill = (ill_t *)q->q_ptr;
	mblk_t	*mp1, *mp2;
	ipif_t	*ipif;
	int	err = 0;
	conn_t	*connp = NULL;
	ipsq_t	*ipsq;
	arc_t	*arc;

	ip1dbg(("ip_arp_done(%s)\n", ill->ill_name));

	ASSERT((mp->b_wptr - mp->b_rptr) >= sizeof (arc_t));
	ASSERT(((arc_t *)mp->b_rptr)->arc_cmd == AR_DLPIOP_DONE);

	ASSERT(IAM_WRITER_ILL(ill));
	mp2 = mp->b_cont;
	mp->b_cont = NULL;

	/*
	 * We have now received the arp bringup completion message
	 * from ARP. Mark the arp bringup as done. Also if the arp
	 * stream has already started closing, send up the AR_ARP_CLOSING
	 * ack now since ARP is waiting in close for this ack.
	 */
	mutex_enter(&ill->ill_lock);
	ill->ill_arp_bringup_pending = 0;
	if (ill->ill_arp_closing) {
		mutex_exit(&ill->ill_lock);
		/* Let's reuse the mp for sending the ack */
		arc = (arc_t *)mp->b_rptr;
		mp->b_wptr = mp->b_rptr + sizeof (arc_t);
		arc->arc_cmd = AR_ARP_CLOSING;
		qreply(q, mp);
	} else {
		mutex_exit(&ill->ill_lock);
		freeb(mp);
	}

	/* We should have an IOCTL waiting on this. */
	ipsq = ill->ill_phyint->phyint_ipsq;
	ipif = ipsq->ipsq_pending_ipif;
	mp1 = ipsq_pending_mp_get(ipsq, &connp);
	/* mp1 and ipif must be both NULL or both non-NULL. */
	ASSERT(!((mp1 != NULL) ^ (ipif != NULL)));
	if (mp1 == NULL) {
		/* bringup was aborted by the user */
		freemsg(mp2);
		return;
	}
	ASSERT(connp != NULL);
	q = CONNP_TO_WQ(connp);
	/*
	 * If the DL_BIND_REQ fails, it is noted
	 * in arc_name_offset.
	 */
	err = *((int *)mp2->b_rptr);
	if (err == 0) {
		if (ipif->ipif_isv6) {
			if ((err = ipif_up_done_v6(ipif)) != 0)
				ip0dbg(("ip_arp_done: init failed\n"));
		} else {
			if ((err = ipif_up_done(ipif)) != 0)
				ip0dbg(("ip_arp_done: init failed\n"));
		}
	} else {
		ip0dbg(("ip_arp_done: DL_BIND_REQ failed\n"));
	}

	freemsg(mp2);

	if ((err == 0) && (ill->ill_up_ipifs)) {
		err = ill_up_ipifs(ill, q, mp1);
		if (err == EINPROGRESS)
			return;
	}

	if (ill->ill_up_ipifs) {
		ill_group_cleanup(ill);
	}

	/*
	 * The ioctl must complete now without EINPROGRESS
	 * since ipsq_pending_mp_get has removed the ioctl mblk
	 * from ipsq_pending_mp. Otherwise the ioctl will be
	 * stuck for ever in the ipsq.
	 */
	ASSERT(err != EINPROGRESS);
	ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipif, ipsq);
}

/* Allocate the private structure */
static int
ip_priv_alloc(void **bufp)
{
	void	*buf;

	if ((buf = kmem_alloc(sizeof (ip_priv_t), KM_NOSLEEP)) == NULL)
		return (ENOMEM);

	*bufp = buf;
	return (0);
}

/* Function to delete the private structure */
void
ip_priv_free(void *buf)
{
	ASSERT(buf != NULL);
	kmem_free(buf, sizeof (ip_priv_t));
}

/*
 * The entry point for IPPF processing.
 * If the classifier (IPGPC_CLASSIFY) is not loaded and configured, the
 * routine just returns.
 *
 * When called, ip_process generates an ipp_packet_t structure
 * which holds the state information for this packet and invokes the
 * classifier (via ipp_packet_process). The classification, depending on
 * configured filters, results in a list of actions for this packet. Invoking
 * an action may cause the packet to be dropped, in which case the resulting
 * mblk (*mpp) is NULL. proc indicates the callout position for
 * this packet and ill_index is the interface this packet arrived on or will
 * leave on (inbound and outbound resp.).
 */
void
ip_process(ip_proc_t proc, mblk_t **mpp, uint32_t ill_index)
{
	mblk_t		*mp;
	ip_priv_t	*priv;
	ipp_action_id_t	aid;
	int		rc = 0;
	ipp_packet_t	*pp;
#define	IP_CLASS	"ip"

	/* If the classifier is not loaded, return */
	if ((aid = ipp_action_lookup(IPGPC_CLASSIFY)) == IPP_ACTION_INVAL) {
		return;
	}

	mp = *mpp;
	ASSERT(mp != NULL);

	/* Allocate the packet structure */
	rc = ipp_packet_alloc(&pp, IP_CLASS, aid);
	if (rc != 0) {
		/* On any allocation failure the packet is dropped. */
		*mpp = NULL;
		freemsg(mp);
		return;
	}

	/* Allocate the private structure */
	rc = ip_priv_alloc((void **)&priv);
	if (rc != 0) {
		*mpp = NULL;
		freemsg(mp);
		ipp_packet_free(pp);
		return;
	}
	priv->proc = proc;
	priv->ill_index = ill_index;
	/* ip_priv_free is the destructor invoked when pp is freed. */
	ipp_packet_set_private(pp, priv, ip_priv_free);
	ipp_packet_set_data(pp, mp);

	/* Invoke the classifier */
	rc = ipp_packet_process(&pp);
	if (pp != NULL) {
		mp = ipp_packet_get_data(pp);
		ipp_packet_free(pp);
		if (rc != 0) {
			freemsg(mp);
			*mpp = NULL;
		}
	} else {
		/* An action consumed the packet. */
		*mpp = NULL;
	}
#undef	IP_CLASS
}

/*
 * Propagate a multicast group membership operation (add/drop) on
 * all the interfaces crossed by the related multirt routes.
 * The call is considered successful if the operation succeeds
 * on at least one interface.
 */
static int
ip_multirt_apply_membership(int (*fn)(conn_t *, boolean_t, ipaddr_t, ipaddr_t,
    uint_t *, mcast_record_t, ipaddr_t, mblk_t *), ire_t *ire, conn_t *connp,
    boolean_t checkonly, ipaddr_t group, mcast_record_t fmode, ipaddr_t src,
    mblk_t *first_mp)
{
	ire_t		*ire_gw;
	irb_t		*irb;
	int		error = 0;
	opt_restart_t	*or;

	irb = ire->ire_bucket;
	ASSERT(irb != NULL);

	ASSERT(DB_TYPE(first_mp) == M_CTL);

	/* or_private records whether any interface has succeeded so far. */
	or = (opt_restart_t *)first_mp->b_rptr;
	IRB_REFHOLD(irb);
	for (; ire != NULL; ire = ire->ire_next) {
		if ((ire->ire_flags & RTF_MULTIRT) == 0)
			continue;
		if (ire->ire_addr != group)
			continue;

		ire_gw = ire_ftable_lookup(ire->ire_gateway_addr, 0, 0,
		    IRE_INTERFACE, NULL, NULL, ALL_ZONES, 0, NULL,
		    MATCH_IRE_RECURSIVE | MATCH_IRE_TYPE);
		/* No resolver exists for the gateway; skip this ire. */
		if (ire_gw == NULL)
			continue;

		/*
		 * This function can return EINPROGRESS. If so the operation
		 * will be restarted from ip_restart_optmgmt which will
		 * call ip_opt_set and option processing will restart for
		 * this option. So we may end up calling 'fn' more than once.
		 * This requires that 'fn' is idempotent except for the
		 * return value. The operation is considered a success if
		 * it succeeds at least once on any one interface.
		 */
		error = fn(connp, checkonly, group, ire_gw->ire_src_addr,
		    NULL, fmode, src, first_mp);
		if (error == 0)
			or->or_private = CGTP_MCAST_SUCCESS;

		if (ip_debug > 0) {
			ulong_t	off;
			char	*ksym;
			ksym = kobj_getsymname((uintptr_t)fn, &off);
			ip2dbg(("ip_multirt_apply_membership: "
			    "called %s, multirt group 0x%08x via itf 0x%08x, "
			    "error %d [success %u]\n",
			    ksym ? ksym : "?",
			    ntohl(group), ntohl(ire_gw->ire_src_addr),
			    error, or->or_private));
		}

		ire_refrele(ire_gw);
		if (error == EINPROGRESS) {
			IRB_REFRELE(irb);
			return (error);
		}
	}
	IRB_REFRELE(irb);
	/*
	 * Consider the call as successful if we succeeded on at least
	 * one interface. Otherwise, return the last encountered error.
	 */
	return (or->or_private == CGTP_MCAST_SUCCESS ? 0 : error);
}


/*
 * Issue a warning regarding a route crossing an interface with an
 * incorrect MTU. Only one message every 'ip_multirt_log_interval'
 * amount of time is logged.
 */
static void
ip_multirt_bad_mtu(ire_t *ire, uint32_t max_frag)
{
	hrtime_t	current = gethrtime();
	char		buf[16];

	/* Convert interval in ms to hrtime in ns */
	if (multirt_bad_mtu_last_time +
	    ((hrtime_t)ip_multirt_log_interval * (hrtime_t)1000000) <=
	    current) {
		cmn_err(CE_WARN, "ip: ignoring multiroute "
		    "to %s, incorrect MTU %u (expected %u)\n",
		    ip_dot_addr(ire->ire_addr, buf),
		    ire->ire_max_frag, max_frag);

		multirt_bad_mtu_last_time = current;
	}
}


/*
 * Get the CGTP (multirouting) filtering status.
 * If 0, the CGTP hooks are transparent.
 */
/* ARGSUSED */
static int
ip_cgtp_filter_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *ioc_cr)
{
	boolean_t *ip_cgtp_filter_value = (boolean_t *)cp;

	(void) mi_mpprintf(mp, "%d", (int)*ip_cgtp_filter_value);
	return (0);
}


/*
 * Set the CGTP (multirouting) filtering status.
 * If the status is changed from active to transparent
 * or from transparent to active, forward the new status
 * to the filtering module (if loaded).
 */
/* ARGSUSED */
static int
ip_cgtp_filter_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
    cred_t *ioc_cr)
{
	long		new_value;
	boolean_t	*ip_cgtp_filter_value = (boolean_t *)cp;

	/* Only "0" and "1" are accepted. */
	if (ddi_strtol(value, NULL, 10, &new_value) != 0 ||
	    new_value < 0 || new_value > 1) {
		return (EINVAL);
	}

	/*
	 * Do not enable CGTP filtering - thus preventing the hooks
	 * from being invoked - if the version number of the
	 * filtering module hooks does not match.
	 */
	if ((ip_cgtp_filter_ops != NULL) &&
	    (ip_cgtp_filter_ops->cfo_filter_rev != CGTP_FILTER_REV)) {
		cmn_err(CE_WARN, "IP: CGTP filtering version mismatch "
		    "(module hooks version %d, expecting %d)\n",
		    ip_cgtp_filter_ops->cfo_filter_rev, CGTP_FILTER_REV);
		return (ENOTSUP);
	}

	/* Log state transitions in both directions. */
	if ((!*ip_cgtp_filter_value) && new_value) {
		cmn_err(CE_NOTE, "IP: enabling CGTP filtering%s",
		    ip_cgtp_filter_ops == NULL ?
		    " (module not loaded)" : "");
	}
	if (*ip_cgtp_filter_value && (!new_value)) {
		cmn_err(CE_NOTE, "IP: disabling CGTP filtering%s",
		    ip_cgtp_filter_ops == NULL ?
		    " (module not loaded)" : "");
	}

	if (ip_cgtp_filter_ops != NULL) {
		int	res;
		if ((res = ip_cgtp_filter_ops->cfo_change_state(new_value))) {
			return (res);
		}
	}

	*ip_cgtp_filter_value = (boolean_t)new_value;

	return (0);
}


/*
 * Return the expected CGTP hooks version number.
 */
int
ip_cgtp_filter_supported(void)
{
	return (ip_cgtp_filter_rev);
}


/*
 * CGTP hooks can be registered by directly touching ip_cgtp_filter_ops
 * or by invoking this function. In the first case, the version number
 * of the registered structure is checked at hooks activation time
 * in ip_cgtp_filter_set().
 */
int
ip_cgtp_filter_register(cgtp_filter_ops_t *ops)
{
	if (ops->cfo_filter_rev != CGTP_FILTER_REV)
		return (ENOTSUP);

	ip_cgtp_filter_ops = ops;
	return (0);
}

/*
 * Map the ip_squeue_enter tunable value to the corresponding squeue
 * entry function; unknown values fall back to squeue_fill.
 */
static squeue_func_t
ip_squeue_switch(int val)
{
	squeue_func_t rval = squeue_fill;

	switch (val) {
	case IP_SQUEUE_ENTER_NODRAIN:
		rval = squeue_enter_nodrain;
		break;
	case IP_SQUEUE_ENTER:
		rval = squeue_enter;
		break;
	default:
		break;
	}
	return (rval);
}

/*
 * ndd set routine for ip_input_proc: updates both the stored integer and
 * the cached squeue entry function pointer.
 */
/* ARGSUSED */
static int
ip_input_proc_set(queue_t *q, mblk_t *mp, char *value,
    caddr_t addr, cred_t *cr)
{
	int *v = (int *)addr;
	long new_value;

	if (ddi_strtol(value, NULL, 10, &new_value) != 0)
		return (EINVAL);

	ip_input_proc = ip_squeue_switch(new_value);
	*v = new_value;
	return (0);
}

/*
 * Generic ndd set routine for an int-valued parameter; no range check
 * beyond numeric parsing.
 */
/* ARGSUSED */
static int
ip_int_set(queue_t *q, mblk_t *mp, char *value,
    caddr_t addr, cred_t *cr)
{
	int *v = (int *)addr;
	long new_value;

	if (ddi_strtol(value, NULL, 10, &new_value) != 0)
		return (EINVAL);

	*v = new_value;
	return (0);
}

/*
 * Create and install the "ip:0:ip" named kstat exporting the MIB-II
 * ip group counters (snapshotted by ip_kstat_update).
 */
static void
ip_kstat_init(void)
{
	ip_named_kstat_t template = {
		{ "forwarding",		KSTAT_DATA_UINT32, 0 },
		{ "defaultTTL",		KSTAT_DATA_UINT32, 0 },
		{ "inReceives",		KSTAT_DATA_UINT32, 0 },
		{ "inHdrErrors",	KSTAT_DATA_UINT32, 0 },
		{ "inAddrErrors",	KSTAT_DATA_UINT32, 0 },
		{ "forwDatagrams",	KSTAT_DATA_UINT32, 0 },
		{ "inUnknownProtos",	KSTAT_DATA_UINT32, 0 },
		{ "inDiscards",		KSTAT_DATA_UINT32, 0 },
		{ "inDelivers",		KSTAT_DATA_UINT32, 0 },
		{ "outRequests",	KSTAT_DATA_UINT32, 0 },
		{ "outDiscards",	KSTAT_DATA_UINT32, 0 },
		{ "outNoRoutes",	KSTAT_DATA_UINT32, 0 },
		{ "reasmTimeout",	KSTAT_DATA_UINT32, 0 },
		{ "reasmReqds",		KSTAT_DATA_UINT32, 0 },
		{ "reasmOKs",		KSTAT_DATA_UINT32, 0 },
		{ "reasmFails",		KSTAT_DATA_UINT32, 0 },
		{ "fragOKs",		KSTAT_DATA_UINT32, 0 },
		{ "fragFails",		KSTAT_DATA_UINT32, 0 },
		{ "fragCreates",	KSTAT_DATA_UINT32, 0 },
		{ "addrEntrySize",	KSTAT_DATA_INT32, 0 },
		{ "routeEntrySize",	KSTAT_DATA_INT32, 0 },
		{ "netToMediaEntrySize",	KSTAT_DATA_INT32, 0 },
		{ "routingDiscards",	KSTAT_DATA_UINT32, 0 },
		{ "inErrs",		KSTAT_DATA_UINT32, 0 },
		{ "noPorts",		KSTAT_DATA_UINT32, 0 },
		{ "inCksumErrs",	KSTAT_DATA_UINT32, 0 },
		{ "reasmDuplicates",	KSTAT_DATA_UINT32, 0 },
		{ "reasmPartDups",	KSTAT_DATA_UINT32, 0 },
		{ "forwProhibits",	KSTAT_DATA_UINT32, 0 },
		{ "udpInCksumErrs",	KSTAT_DATA_UINT32, 0 },
		{ "udpInOverflows",	KSTAT_DATA_UINT32, 0 },
		{ "rawipInOverflows",	KSTAT_DATA_UINT32, 0 },
		{ "ipsecInSucceeded",	KSTAT_DATA_UINT32, 0 },
		{ "ipsecInFailed",	KSTAT_DATA_INT32, 0 },
		{ "memberEntrySize",	KSTAT_DATA_INT32, 0 },
		{ "inIPv6",		KSTAT_DATA_UINT32, 0 },
		{ "outIPv6",		KSTAT_DATA_UINT32, 0 },
		{ "outSwitchIPv6",	KSTAT_DATA_UINT32, 0 },
	};

	ip_mibkp = kstat_create("ip", 0, "ip", "mib2", KSTAT_TYPE_NAMED,
	    NUM_OF_FIELDS(ip_named_kstat_t),
	    0);
	if (!ip_mibkp)
		return;

	/* Pre-seed the values that do not change between updates. */
	template.forwarding.value.ui32 = WE_ARE_FORWARDING ? 1:2;
	template.defaultTTL.value.ui32 = (uint32_t)ip_def_ttl;
	template.reasmTimeout.value.ui32 = ip_g_frag_timeout;
	template.addrEntrySize.value.i32 = sizeof (mib2_ipAddrEntry_t);
	template.routeEntrySize.value.i32 = sizeof (mib2_ipRouteEntry_t);

	template.netToMediaEntrySize.value.i32 =
	    sizeof (mib2_ipNetToMediaEntry_t);

	template.memberEntrySize.value.i32 = sizeof (ipv6_member_t);

	bcopy(&template, ip_mibkp->ks_data, sizeof (template));

	ip_mibkp->ks_update = ip_kstat_update;

	kstat_install(ip_mibkp);
}

/* Delete the MIB-II ip kstat created by ip_kstat_init(). */
static void
ip_kstat_fini(void)
{

	if (ip_mibkp != NULL) {
		kstat_delete(ip_mibkp);
		ip_mibkp = NULL;
	}
}

/*
 * ks_update callback for the ip kstat: snapshot the global ip_mib
 * counters into the named kstat. Read-only (KSTAT_WRITE is refused).
 */
static int
ip_kstat_update(kstat_t *kp, int rw)
{
	ip_named_kstat_t *ipkp;

	if (!kp || !kp->ks_data)
		return (EIO);

	if (rw == KSTAT_WRITE)
		return (EACCES);

	ipkp = (ip_named_kstat_t *)kp->ks_data;

	ipkp->forwarding.value.ui32 = ip_mib.ipForwarding;
	ipkp->defaultTTL.value.ui32 = ip_mib.ipDefaultTTL;
	ipkp->inReceives.value.ui32 = ip_mib.ipInReceives;
	ipkp->inHdrErrors.value.ui32 = ip_mib.ipInHdrErrors;
	ipkp->inAddrErrors.value.ui32 = ip_mib.ipInAddrErrors;
	ipkp->forwDatagrams.value.ui32 = ip_mib.ipForwDatagrams;
	ipkp->inUnknownProtos.value.ui32 = ip_mib.ipInUnknownProtos;
	ipkp->inDiscards.value.ui32 = ip_mib.ipInDiscards;
	ipkp->inDelivers.value.ui32 = ip_mib.ipInDelivers;
	ipkp->outRequests.value.ui32 = ip_mib.ipOutRequests;
	ipkp->outDiscards.value.ui32 = ip_mib.ipOutDiscards;
	ipkp->outNoRoutes.value.ui32 = ip_mib.ipOutNoRoutes;
	ipkp->reasmTimeout.value.ui32 = ip_mib.ipReasmTimeout;
	ipkp->reasmReqds.value.ui32 = ip_mib.ipReasmReqds;
	ipkp->reasmOKs.value.ui32 = ip_mib.ipReasmOKs;
	ipkp->reasmFails.value.ui32 = ip_mib.ipReasmFails;

	ipkp->fragOKs.value.ui32 = ip_mib.ipFragOKs;
	ipkp->fragFails.value.ui32 = ip_mib.ipFragFails;
	ipkp->fragCreates.value.ui32 = ip_mib.ipFragCreates;

	ipkp->routingDiscards.value.ui32 = ip_mib.ipRoutingDiscards;
	ipkp->inErrs.value.ui32 = ip_mib.tcpInErrs;
	ipkp->noPorts.value.ui32 = ip_mib.udpNoPorts;
	ipkp->inCksumErrs.value.ui32 = ip_mib.ipInCksumErrs;
	ipkp->reasmDuplicates.value.ui32 = ip_mib.ipReasmDuplicates;
	ipkp->reasmPartDups.value.ui32 = ip_mib.ipReasmPartDups;
	ipkp->forwProhibits.value.ui32 = ip_mib.ipForwProhibits;
	ipkp->udpInCksumErrs.value.ui32 = ip_mib.udpInCksumErrs;
	ipkp->udpInOverflows.value.ui32 = ip_mib.udpInOverflows;
	ipkp->rawipInOverflows.value.ui32 = ip_mib.rawipInOverflows;
	ipkp->ipsecInSucceeded.value.ui32 = ip_mib.ipsecInSucceeded;
	ipkp->ipsecInFailed.value.i32 = ip_mib.ipsecInFailed;

	ipkp->inIPv6.value.ui32 = ip_mib.ipInIPv6;
	ipkp->outIPv6.value.ui32 = ip_mib.ipOutIPv6;
	ipkp->outSwitchIPv6.value.ui32 = ip_mib.ipOutSwitchIPv6;

	return (0);
}

/*
 * Create the "ip:0:icmp" named kstat exporting the MIB-II icmp group
 * counters.
 */
static void
icmp_kstat_init(void)
{
	icmp_named_kstat_t template = {
		{ "inMsgs",		KSTAT_DATA_UINT32 },
		{ "inErrors",		KSTAT_DATA_UINT32 },
		{ "inDestUnreachs",	KSTAT_DATA_UINT32 },
		{ "inTimeExcds",	KSTAT_DATA_UINT32 },
		{ "inParmProbs",	KSTAT_DATA_UINT32 },
		{ "inSrcQuenchs",	KSTAT_DATA_UINT32 },
		{ "inRedirects",	KSTAT_DATA_UINT32 },
		{ "inEchos",		KSTAT_DATA_UINT32 },
		{ "inEchoReps",		KSTAT_DATA_UINT32 },
		{ "inTimestamps",	KSTAT_DATA_UINT32 },
		{ "inTimestampReps",	KSTAT_DATA_UINT32 },
		{ "inAddrMasks",	KSTAT_DATA_UINT32 },
		{ "inAddrMaskReps",	KSTAT_DATA_UINT32 },
		{ "outMsgs",		KSTAT_DATA_UINT32 },
		{ "outErrors",		KSTAT_DATA_UINT32 },
		{ "outDestUnreachs",	KSTAT_DATA_UINT32 },
		{ "outTimeExcds",	KSTAT_DATA_UINT32 },
		{ "outParmProbs",	KSTAT_DATA_UINT32 },
26884 { "outSrcQuenchs", KSTAT_DATA_UINT32 }, 26885 { "outRedirects", KSTAT_DATA_UINT32 }, 26886 { "outEchos", KSTAT_DATA_UINT32 }, 26887 { "outEchoReps", KSTAT_DATA_UINT32 }, 26888 { "outTimestamps", KSTAT_DATA_UINT32 }, 26889 { "outTimestampReps", KSTAT_DATA_UINT32 }, 26890 { "outAddrMasks", KSTAT_DATA_UINT32 }, 26891 { "outAddrMaskReps", KSTAT_DATA_UINT32 }, 26892 { "inChksumErrs", KSTAT_DATA_UINT32 }, 26893 { "inUnknowns", KSTAT_DATA_UINT32 }, 26894 { "inFragNeeded", KSTAT_DATA_UINT32 }, 26895 { "outFragNeeded", KSTAT_DATA_UINT32 }, 26896 { "outDrops", KSTAT_DATA_UINT32 }, 26897 { "inOverFlows", KSTAT_DATA_UINT32 }, 26898 { "inBadRedirects", KSTAT_DATA_UINT32 }, 26899 }; 26900 26901 icmp_mibkp = kstat_create("ip", 0, "icmp", "mib2", KSTAT_TYPE_NAMED, 26902 NUM_OF_FIELDS(icmp_named_kstat_t), 26903 0); 26904 if (icmp_mibkp == NULL) 26905 return; 26906 26907 bcopy(&template, icmp_mibkp->ks_data, sizeof (template)); 26908 26909 icmp_mibkp->ks_update = icmp_kstat_update; 26910 26911 kstat_install(icmp_mibkp); 26912 } 26913 26914 static void 26915 icmp_kstat_fini(void) 26916 { 26917 26918 if (icmp_mibkp != NULL) { 26919 kstat_delete(icmp_mibkp); 26920 icmp_mibkp = NULL; 26921 } 26922 } 26923 26924 static int 26925 icmp_kstat_update(kstat_t *kp, int rw) 26926 { 26927 icmp_named_kstat_t *icmpkp; 26928 26929 if ((kp == NULL) || (kp->ks_data == NULL)) 26930 return (EIO); 26931 26932 if (rw == KSTAT_WRITE) 26933 return (EACCES); 26934 26935 icmpkp = (icmp_named_kstat_t *)kp->ks_data; 26936 26937 icmpkp->inMsgs.value.ui32 = icmp_mib.icmpInMsgs; 26938 icmpkp->inErrors.value.ui32 = icmp_mib.icmpInErrors; 26939 icmpkp->inDestUnreachs.value.ui32 = icmp_mib.icmpInDestUnreachs; 26940 icmpkp->inTimeExcds.value.ui32 = icmp_mib.icmpInTimeExcds; 26941 icmpkp->inParmProbs.value.ui32 = icmp_mib.icmpInParmProbs; 26942 icmpkp->inSrcQuenchs.value.ui32 = icmp_mib.icmpInSrcQuenchs; 26943 icmpkp->inRedirects.value.ui32 = icmp_mib.icmpInRedirects; 26944 icmpkp->inEchos.value.ui32 = 
icmp_mib.icmpInEchos; 26945 icmpkp->inEchoReps.value.ui32 = icmp_mib.icmpInEchoReps; 26946 icmpkp->inTimestamps.value.ui32 = icmp_mib.icmpInTimestamps; 26947 icmpkp->inTimestampReps.value.ui32 = icmp_mib.icmpInTimestampReps; 26948 icmpkp->inAddrMasks.value.ui32 = icmp_mib.icmpInAddrMasks; 26949 icmpkp->inAddrMaskReps.value.ui32 = icmp_mib.icmpInAddrMaskReps; 26950 icmpkp->outMsgs.value.ui32 = icmp_mib.icmpOutMsgs; 26951 icmpkp->outErrors.value.ui32 = icmp_mib.icmpOutErrors; 26952 icmpkp->outDestUnreachs.value.ui32 = icmp_mib.icmpOutDestUnreachs; 26953 icmpkp->outTimeExcds.value.ui32 = icmp_mib.icmpOutTimeExcds; 26954 icmpkp->outParmProbs.value.ui32 = icmp_mib.icmpOutParmProbs; 26955 icmpkp->outSrcQuenchs.value.ui32 = icmp_mib.icmpOutSrcQuenchs; 26956 icmpkp->outRedirects.value.ui32 = icmp_mib.icmpOutRedirects; 26957 icmpkp->outEchos.value.ui32 = icmp_mib.icmpOutEchos; 26958 icmpkp->outEchoReps.value.ui32 = icmp_mib.icmpOutEchoReps; 26959 icmpkp->outTimestamps.value.ui32 = icmp_mib.icmpOutTimestamps; 26960 icmpkp->outTimestampReps.value.ui32 = icmp_mib.icmpOutTimestampReps; 26961 icmpkp->outAddrMasks.value.ui32 = icmp_mib.icmpOutAddrMasks; 26962 icmpkp->outAddrMaskReps.value.ui32 = icmp_mib.icmpOutAddrMaskReps; 26963 icmpkp->inCksumErrs.value.ui32 = icmp_mib.icmpInCksumErrs; 26964 icmpkp->inUnknowns.value.ui32 = icmp_mib.icmpInUnknowns; 26965 icmpkp->inFragNeeded.value.ui32 = icmp_mib.icmpInFragNeeded; 26966 icmpkp->outFragNeeded.value.ui32 = icmp_mib.icmpOutFragNeeded; 26967 icmpkp->outDrops.value.ui32 = icmp_mib.icmpOutDrops; 26968 icmpkp->inOverflows.value.ui32 = icmp_mib.icmpInOverflows; 26969 icmpkp->inBadRedirects.value.ui32 = icmp_mib.icmpInBadRedirects; 26970 26971 return (0); 26972 } 26973 26974 /* 26975 * This is the fanout function for raw socket opened for SCTP. Note 26976 * that it is called after SCTP checks that there is no socket which 26977 * wants a packet. 
 Then before SCTP handles this out of the blue packet,
 * this function is called to see if there is any raw socket for SCTP.
 * If there is and it is bound to the correct address, the packet will
 * be sent to that socket. Note that only one raw socket can be bound to
 * a port. This is assured in ipcl_sctp_hash_insert();
 */
void
ip_fanout_sctp_raw(mblk_t *mp, ill_t *recv_ill, ipha_t *ipha, boolean_t isv4,
    uint32_t ports, boolean_t mctl_present, uint_t flags, boolean_t ip_policy,
    uint_t ipif_seqid, zoneid_t zoneid)
{
	conn_t *connp;
	queue_t *rq;
	mblk_t *first_mp;
	boolean_t secure;
	ip6_t *ip6h;

	/*
	 * With an M_CTL present, first_mp is the IPSEC_IN control block
	 * and the actual data message is chained behind it.
	 */
	first_mp = mp;
	if (mctl_present) {
		mp = first_mp->b_cont;
		secure = ipsec_in_is_secure(first_mp);
		ASSERT(mp != NULL);
	} else {
		secure = B_FALSE;
	}
	/* For IPv6 the caller passes the ip6_t through the ipha argument. */
	ip6h = (isv4) ? NULL : (ip6_t *)ipha;

	connp = ipcl_classify_raw(mp, IPPROTO_SCTP, zoneid, ports, ipha);
	if (connp == NULL) {
		/* No raw socket either; let SCTP deal with the OOTB packet. */
		sctp_ootb_input(first_mp, recv_ill, ipif_seqid, zoneid,
		    mctl_present);
		return;
	}
	rq = connp->conn_rq;
	if (!canputnext(rq)) {
		/* Receiver is flow-controlled; count the drop and bail. */
		CONN_DEC_REF(connp);
		BUMP_MIB(&ip_mib, rawipInOverflows);
		freemsg(first_mp);
		return;
	}
	if ((isv4 ? CONN_INBOUND_POLICY_PRESENT(connp) :
	    CONN_INBOUND_POLICY_PRESENT_V6(connp)) || secure) {
		/* On policy failure the message has already been consumed. */
		first_mp = ipsec_check_inbound_policy(first_mp, connp,
		    (isv4 ? ipha : NULL), ip6h, mctl_present);
		if (first_mp == NULL) {
			CONN_DEC_REF(connp);
			return;
		}
	}
	/*
	 * We probably should not send M_CTL message up to
	 * raw socket.
	 */
	if (mctl_present)
		freeb(first_mp);

	/* Initiate IPPF processing here if needed. */
	if ((isv4 && IPP_ENABLED(IPP_LOCAL_IN) && ip_policy) ||
	    (!isv4 && IP6_IN_IPP(flags))) {
		/* ip_process() may consume mp on drop. */
		ip_process(IPP_LOCAL_IN, &mp,
		    recv_ill->ill_phyint->phyint_ifindex);
		if (mp == NULL) {
			CONN_DEC_REF(connp);
			return;
		}
	}

	/*
	 * Attach ancillary data (receive interface, source link-layer
	 * address, v6 packet info) if the socket asked for any of it.
	 */
	if (connp->conn_recvif || connp->conn_recvslla ||
	    ((connp->conn_ipv6_recvpktinfo ||
	    (!isv4 && IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src))) &&
	    (flags & IP_FF_IP6INFO))) {
		int in_flags = 0;

		if (connp->conn_recvif || connp->conn_ipv6_recvpktinfo) {
			in_flags = IPF_RECVIF;
		}
		if (connp->conn_recvslla) {
			in_flags |= IPF_RECVSLLA;
		}
		if (isv4) {
			mp = ip_add_info(mp, recv_ill, in_flags);
		} else {
			mp = ip_add_info_v6(mp, recv_ill, &ip6h->ip6_dst);
			if (mp == NULL) {
				CONN_DEC_REF(connp);
				return;
			}
		}
	}

	BUMP_MIB(&ip_mib, ipInDelivers);
	/*
	 * We are sending the IPSEC_IN message also up. Refer
	 * to comments above this function.
	 * NOTE(review): this comment looks stale — when mctl_present the
	 * IPSEC_IN block was freed via freeb() above and only the data
	 * mblk is passed up; confirm and update.
	 */
	putnext(rq, mp);
	CONN_DEC_REF(connp);
}

/*
 * Martian Address Filtering [RFC 1812, Section 5.3.7]
 *
 * Returns B_TRUE (and bumps ipForwProhibits) if the packet must not be
 * forwarded: dst of INADDR_ANY, multicast (class D) source, source in
 * the 127/8 loopback net, "bad class" destination, or a source address
 * that matches one of our broadcast addresses.  Returns B_FALSE when
 * forwarding may proceed.
 */
static boolean_t
ip_no_forward(ipha_t *ipha, ill_t *ill)
{
	ipaddr_t ip_src, ip_dst;
	ire_t *src_ire = NULL;

	ip_src = ntohl(ipha->ipha_src);
	ip_dst = ntohl(ipha->ipha_dst);

	if (ip_dst == INADDR_ANY)
		goto dont_forward;

	if (IN_CLASSD(ip_src))
		goto dont_forward;

	if ((ip_src >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)
		goto dont_forward;

	if (IN_BADCLASS(ip_dst))
		goto dont_forward;

	/* Refuse to forward packets claiming one of our broadcast addrs. */
	src_ire = ire_ctable_lookup(ipha->ipha_src, 0, IRE_BROADCAST, NULL,
	    ALL_ZONES, NULL, MATCH_IRE_TYPE);
	if (src_ire != NULL) {
		ire_refrele(src_ire);
		goto dont_forward;
	}

	return (B_FALSE);

dont_forward:
	if (ip_debug > 2) {
		printf("ip_no_forward: dropping packet received on %s\n",
		    ill->ill_name);
		pr_addr_dbg("ip_no_forward: from src %s\n",
		    AF_INET, &ipha->ipha_src);
		pr_addr_dbg("ip_no_forward: to dst %s\n",
		    AF_INET, &ipha->ipha_dst);
	}
	BUMP_MIB(&ip_mib, ipForwProhibits);
	return (B_TRUE);
}

/*
 * Returns B_TRUE (and bumps ipInAddrErrors) if either the source or
 * the destination address lies in the 127/8 loopback network; such
 * packets must not appear on the wire.  ill may be NULL (only used
 * for the debug message).
 */
static boolean_t
ip_loopback_src_or_dst(ipha_t *ipha, ill_t *ill)
{
	if (((ntohl(ipha->ipha_src) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) ||
	    ((ntohl(ipha->ipha_dst) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)) {
		if (ip_debug > 2) {
			if (ill != NULL) {
				printf("ip_loopback_src_or_dst: "
				    "dropping packet received on %s\n",
				    ill->ill_name);
			} else {
				printf("ip_loopback_src_or_dst: "
				    "dropping packet\n");
			}

			pr_addr_dbg(
			    "ip_loopback_src_or_dst: from src %s\n",
			    AF_INET, &ipha->ipha_src);
			pr_addr_dbg(
			    "ip_loopback_src_or_dst: to dst %s\n",
			    AF_INET, &ipha->ipha_dst);
		}

		BUMP_MIB(&ip_mib, ipInAddrErrors);
		return (B_TRUE);
	}
	return (B_FALSE);
}

/*
 * Return B_TRUE if the buffers differ in length or content.
 * This is used for comparing extension header buffers.
 * Note that an extension header would be declared different
 * even if all that changed was the next header value in that header i.e.
 * what really changed is the next extension header.
 *
 * When b_valid is false the second buffer is treated as zero-length.
 * NOTE(review): bcmp() returns an arbitrary non-zero int on mismatch,
 * not strictly B_TRUE; callers must treat the result as a truth value
 * only, never compare it against B_TRUE.
 */
boolean_t
ip_cmpbuf(const void *abuf, uint_t alen, boolean_t b_valid, const void *bbuf,
    uint_t blen)
{
	if (!b_valid)
		blen = 0;

	if (alen != blen)
		return (B_TRUE);
	if (alen == 0)
		return (B_FALSE);	/* Both zero length */
	return (bcmp(abuf, bbuf, alen));
}

/*
 * Preallocate memory for ip_savebuf(). Returns B_TRUE if ok.
 * Return B_FALSE if memory allocation fails - don't change any state!
 *
 * On success *dstp points to a fresh buffer of srclen bytes (or NULL
 * for an empty source) and *dstlenp is set accordingly; any previous
 * *dstp buffer is freed.
 */
boolean_t
ip_allocbuf(void **dstp, uint_t *dstlenp, boolean_t src_valid,
    const void *src, uint_t srclen)
{
	void *dst;

	if (!src_valid)
		srclen = 0;

	ASSERT(*dstlenp == 0);
	if (src != NULL && srclen != 0) {
		dst = mi_alloc(srclen, BPRI_MED);
		if (dst == NULL)
			return (B_FALSE);
	} else {
		dst = NULL;
	}
	if (*dstp != NULL)
		mi_free(*dstp);
	*dstp = dst;
	*dstlenp = dst == NULL ? 0 : srclen;
	return (B_TRUE);
}

/*
 * Replace what is in *dst, *dstlen with the source.
 * Assumes ip_allocbuf has already been called.
 */
void
ip_savebuf(void **dstp, uint_t *dstlenp, boolean_t src_valid,
    const void *src, uint_t srclen)
{
	if (!src_valid)
		srclen = 0;

	/* ip_allocbuf() must have sized the destination already. */
	ASSERT(*dstlenp == srclen);
	if (src != NULL && srclen != 0)
		bcopy(src, *dstp, srclen);
}

/*
 * Free the storage pointed to by the members of an ip6_pkt_t.
27218 */ 27219 void 27220 ip6_pkt_free(ip6_pkt_t *ipp) 27221 { 27222 ASSERT(ipp->ipp_pathmtu == NULL && !(ipp->ipp_fields & IPPF_PATHMTU)); 27223 27224 if (ipp->ipp_fields & IPPF_HOPOPTS) { 27225 kmem_free(ipp->ipp_hopopts, ipp->ipp_hopoptslen); 27226 ipp->ipp_hopopts = NULL; 27227 ipp->ipp_hopoptslen = 0; 27228 } 27229 if (ipp->ipp_fields & IPPF_RTDSTOPTS) { 27230 kmem_free(ipp->ipp_rtdstopts, ipp->ipp_rtdstoptslen); 27231 ipp->ipp_rtdstopts = NULL; 27232 ipp->ipp_rtdstoptslen = 0; 27233 } 27234 if (ipp->ipp_fields & IPPF_DSTOPTS) { 27235 kmem_free(ipp->ipp_dstopts, ipp->ipp_dstoptslen); 27236 ipp->ipp_dstopts = NULL; 27237 ipp->ipp_dstoptslen = 0; 27238 } 27239 if (ipp->ipp_fields & IPPF_RTHDR) { 27240 kmem_free(ipp->ipp_rthdr, ipp->ipp_rthdrlen); 27241 ipp->ipp_rthdr = NULL; 27242 ipp->ipp_rthdrlen = 0; 27243 } 27244 ipp->ipp_fields &= ~(IPPF_HOPOPTS | IPPF_RTDSTOPTS | IPPF_DSTOPTS | 27245 IPPF_RTHDR); 27246 } 27247