1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 /* Copyright (c) 1990 Mentat Inc. 
*/ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include <sys/types.h> 30 #include <sys/stream.h> 31 #include <sys/dlpi.h> 32 #include <sys/stropts.h> 33 #include <sys/sysmacros.h> 34 #include <sys/strsubr.h> 35 #include <sys/strlog.h> 36 #include <sys/strsun.h> 37 #include <sys/zone.h> 38 #define _SUN_TPI_VERSION 2 39 #include <sys/tihdr.h> 40 #include <sys/xti_inet.h> 41 #include <sys/ddi.h> 42 #include <sys/sunddi.h> 43 #include <sys/cmn_err.h> 44 #include <sys/debug.h> 45 #include <sys/kobj.h> 46 #include <sys/modctl.h> 47 #include <sys/atomic.h> 48 #include <sys/policy.h> 49 #include <sys/priv.h> 50 51 #include <sys/systm.h> 52 #include <sys/param.h> 53 #include <sys/kmem.h> 54 #include <sys/socket.h> 55 #include <sys/vtrace.h> 56 #include <sys/isa_defs.h> 57 #include <net/if.h> 58 #include <net/if_arp.h> 59 #include <net/route.h> 60 #include <sys/sockio.h> 61 #include <netinet/in.h> 62 #include <net/if_dl.h> 63 64 #include <inet/common.h> 65 #include <inet/mi.h> 66 #include <inet/mib2.h> 67 #include <inet/nd.h> 68 #include <inet/arp.h> 69 #include <inet/snmpcom.h> 70 #include <inet/kstatcom.h> 71 72 #include <netinet/igmp_var.h> 73 #include <netinet/ip6.h> 74 #include <netinet/icmp6.h> 75 #include <netinet/sctp.h> 76 77 #include <inet/ip.h> 78 #include <inet/ip_impl.h> 79 #include <inet/ip6.h> 80 #include <inet/ip6_asp.h> 81 #include <inet/tcp.h> 82 #include <inet/tcp_impl.h> 83 #include <inet/ip_multi.h> 84 #include <inet/ip_if.h> 85 #include <inet/ip_ire.h> 86 #include <inet/ip_rts.h> 87 #include <inet/optcom.h> 88 #include <inet/ip_ndp.h> 89 #include <inet/ip_listutils.h> 90 #include <netinet/igmp.h> 91 #include <netinet/ip_mroute.h> 92 #include <inet/ipp_common.h> 93 94 #include <net/pfkeyv2.h> 95 #include <inet/ipsec_info.h> 96 #include <inet/sadb.h> 97 #include <inet/ipsec_impl.h> 98 #include <sys/iphada.h> 99 #include <inet/tun.h> 100 #include <inet/ipdrop.h> 101 102 #include <sys/ethernet.h> 103 #include <net/if_types.h> 104 #include <sys/cpuvar.h> 
105 106 #include <ipp/ipp.h> 107 #include <ipp/ipp_impl.h> 108 #include <ipp/ipgpc/ipgpc.h> 109 110 #include <sys/multidata.h> 111 #include <sys/pattr.h> 112 113 #include <inet/ipclassifier.h> 114 #include <inet/sctp_ip.h> 115 #include <inet/udp_impl.h> 116 117 #include <sys/tsol/label.h> 118 #include <sys/tsol/tnet.h> 119 120 #include <rpc/pmap_prot.h> 121 122 /* 123 * Values for squeue switch: 124 * IP_SQUEUE_ENTER_NODRAIN: squeue_enter_nodrain 125 * IP_SQUEUE_ENTER: squeue_enter 126 * IP_SQUEUE_FILL: squeue_fill 127 */ 128 int ip_squeue_enter = 2; 129 squeue_func_t ip_input_proc; 130 /* 131 * IP statistics. 132 */ 133 #define IP_STAT(x) (ip_statistics.x.value.ui64++) 134 #define IP_STAT_UPDATE(x, n) (ip_statistics.x.value.ui64 += (n)) 135 136 typedef struct ip_stat { 137 kstat_named_t ipsec_fanout_proto; 138 kstat_named_t ip_udp_fannorm; 139 kstat_named_t ip_udp_fanmb; 140 kstat_named_t ip_udp_fanothers; 141 kstat_named_t ip_udp_fast_path; 142 kstat_named_t ip_udp_slow_path; 143 kstat_named_t ip_udp_input_err; 144 kstat_named_t ip_tcppullup; 145 kstat_named_t ip_tcpoptions; 146 kstat_named_t ip_multipkttcp; 147 kstat_named_t ip_tcp_fast_path; 148 kstat_named_t ip_tcp_slow_path; 149 kstat_named_t ip_tcp_input_error; 150 kstat_named_t ip_db_ref; 151 kstat_named_t ip_notaligned1; 152 kstat_named_t ip_notaligned2; 153 kstat_named_t ip_multimblk3; 154 kstat_named_t ip_multimblk4; 155 kstat_named_t ip_ipoptions; 156 kstat_named_t ip_classify_fail; 157 kstat_named_t ip_opt; 158 kstat_named_t ip_udp_rput_local; 159 kstat_named_t ipsec_proto_ahesp; 160 kstat_named_t ip_conn_flputbq; 161 kstat_named_t ip_conn_walk_drain; 162 kstat_named_t ip_out_sw_cksum; 163 kstat_named_t ip_in_sw_cksum; 164 kstat_named_t ip_trash_ire_reclaim_calls; 165 kstat_named_t ip_trash_ire_reclaim_success; 166 kstat_named_t ip_ire_arp_timer_expired; 167 kstat_named_t ip_ire_redirect_timer_expired; 168 kstat_named_t ip_ire_pmtu_timer_expired; 169 kstat_named_t ip_input_multi_squeue; 170 
kstat_named_t ip_tcp_in_full_hw_cksum_err; 171 kstat_named_t ip_tcp_in_part_hw_cksum_err; 172 kstat_named_t ip_tcp_in_sw_cksum_err; 173 kstat_named_t ip_tcp_out_sw_cksum_bytes; 174 kstat_named_t ip_udp_in_full_hw_cksum_err; 175 kstat_named_t ip_udp_in_part_hw_cksum_err; 176 kstat_named_t ip_udp_in_sw_cksum_err; 177 kstat_named_t ip_udp_out_sw_cksum_bytes; 178 kstat_named_t ip_frag_mdt_pkt_out; 179 kstat_named_t ip_frag_mdt_discarded; 180 kstat_named_t ip_frag_mdt_allocfail; 181 kstat_named_t ip_frag_mdt_addpdescfail; 182 kstat_named_t ip_frag_mdt_allocd; 183 } ip_stat_t; 184 185 static ip_stat_t ip_statistics = { 186 { "ipsec_fanout_proto", KSTAT_DATA_UINT64 }, 187 { "ip_udp_fannorm", KSTAT_DATA_UINT64 }, 188 { "ip_udp_fanmb", KSTAT_DATA_UINT64 }, 189 { "ip_udp_fanothers", KSTAT_DATA_UINT64 }, 190 { "ip_udp_fast_path", KSTAT_DATA_UINT64 }, 191 { "ip_udp_slow_path", KSTAT_DATA_UINT64 }, 192 { "ip_udp_input_err", KSTAT_DATA_UINT64 }, 193 { "ip_tcppullup", KSTAT_DATA_UINT64 }, 194 { "ip_tcpoptions", KSTAT_DATA_UINT64 }, 195 { "ip_multipkttcp", KSTAT_DATA_UINT64 }, 196 { "ip_tcp_fast_path", KSTAT_DATA_UINT64 }, 197 { "ip_tcp_slow_path", KSTAT_DATA_UINT64 }, 198 { "ip_tcp_input_error", KSTAT_DATA_UINT64 }, 199 { "ip_db_ref", KSTAT_DATA_UINT64 }, 200 { "ip_notaligned1", KSTAT_DATA_UINT64 }, 201 { "ip_notaligned2", KSTAT_DATA_UINT64 }, 202 { "ip_multimblk3", KSTAT_DATA_UINT64 }, 203 { "ip_multimblk4", KSTAT_DATA_UINT64 }, 204 { "ip_ipoptions", KSTAT_DATA_UINT64 }, 205 { "ip_classify_fail", KSTAT_DATA_UINT64 }, 206 { "ip_opt", KSTAT_DATA_UINT64 }, 207 { "ip_udp_rput_local", KSTAT_DATA_UINT64 }, 208 { "ipsec_proto_ahesp", KSTAT_DATA_UINT64 }, 209 { "ip_conn_flputbq", KSTAT_DATA_UINT64 }, 210 { "ip_conn_walk_drain", KSTAT_DATA_UINT64 }, 211 { "ip_out_sw_cksum", KSTAT_DATA_UINT64 }, 212 { "ip_in_sw_cksum", KSTAT_DATA_UINT64 }, 213 { "ip_trash_ire_reclaim_calls", KSTAT_DATA_UINT64 }, 214 { "ip_trash_ire_reclaim_success", KSTAT_DATA_UINT64 }, 215 { "ip_ire_arp_timer_expired", 
KSTAT_DATA_UINT64 }, 216 { "ip_ire_redirect_timer_expired", KSTAT_DATA_UINT64 }, 217 { "ip_ire_pmtu_timer_expired", KSTAT_DATA_UINT64 }, 218 { "ip_input_multi_squeue", KSTAT_DATA_UINT64 }, 219 { "ip_tcp_in_full_hw_cksum_err", KSTAT_DATA_UINT64 }, 220 { "ip_tcp_in_part_hw_cksum_err", KSTAT_DATA_UINT64 }, 221 { "ip_tcp_in_sw_cksum_err", KSTAT_DATA_UINT64 }, 222 { "ip_tcp_out_sw_cksum_bytes", KSTAT_DATA_UINT64 }, 223 { "ip_udp_in_full_hw_cksum_err", KSTAT_DATA_UINT64 }, 224 { "ip_udp_in_part_hw_cksum_err", KSTAT_DATA_UINT64 }, 225 { "ip_udp_in_sw_cksum_err", KSTAT_DATA_UINT64 }, 226 { "ip_udp_out_sw_cksum_bytes", KSTAT_DATA_UINT64 }, 227 { "ip_frag_mdt_pkt_out", KSTAT_DATA_UINT64 }, 228 { "ip_frag_mdt_discarded", KSTAT_DATA_UINT64 }, 229 { "ip_frag_mdt_allocfail", KSTAT_DATA_UINT64 }, 230 { "ip_frag_mdt_addpdescfail", KSTAT_DATA_UINT64 }, 231 { "ip_frag_mdt_allocd", KSTAT_DATA_UINT64 }, 232 }; 233 234 static kstat_t *ip_kstat; 235 236 #define TCP6 "tcp6" 237 #define TCP "tcp" 238 #define SCTP "sctp" 239 #define SCTP6 "sctp6" 240 241 major_t TCP6_MAJ; 242 major_t TCP_MAJ; 243 major_t SCTP_MAJ; 244 major_t SCTP6_MAJ; 245 246 int ip_poll_normal_ms = 100; 247 int ip_poll_normal_ticks = 0; 248 249 /* 250 * Structure to represent a linked list of msgblks. Used by ip_snmp_ functions. 251 */ 252 253 struct listptr_s { 254 mblk_t *lp_head; /* pointer to the head of the list */ 255 mblk_t *lp_tail; /* pointer to the tail of the list */ 256 }; 257 258 typedef struct listptr_s listptr_t; 259 260 /* 261 * This is used by ip_snmp_get_mib2_ip_route_media and 262 * ip_snmp_get_mib2_ip6_route_media to carry the lists of return data. 263 */ 264 typedef struct iproutedata_s { 265 uint_t ird_idx; 266 listptr_t ird_route; /* ipRouteEntryTable */ 267 listptr_t ird_netmedia; /* ipNetToMediaEntryTable */ 268 listptr_t ird_attrs; /* ipRouteAttributeTable */ 269 } iproutedata_t; 270 271 /* 272 * Cluster specific hooks. 
These should be NULL when booted as a non-cluster 273 */ 274 275 /* 276 * Hook functions to enable cluster networking 277 * On non-clustered systems these vectors must always be NULL. 278 * 279 * Hook function to Check ip specified ip address is a shared ip address 280 * in the cluster 281 * 282 */ 283 int (*cl_inet_isclusterwide)(uint8_t protocol, 284 sa_family_t addr_family, uint8_t *laddrp) = NULL; 285 286 /* 287 * Hook function to generate cluster wide ip fragment identifier 288 */ 289 uint32_t (*cl_inet_ipident)(uint8_t protocol, sa_family_t addr_family, 290 uint8_t *laddrp, uint8_t *faddrp) = NULL; 291 292 /* 293 * Synchronization notes: 294 * 295 * IP is a fully D_MP STREAMS module/driver. Thus it does not depend on any 296 * MT level protection given by STREAMS. IP uses a combination of its own 297 * internal serialization mechanism and standard Solaris locking techniques. 298 * The internal serialization is per phyint (no IPMP) or per IPMP group. 299 * This is used to serialize plumbing operations, IPMP operations, certain 300 * multicast operations, most set ioctls, igmp/mld timers etc. 301 * 302 * Plumbing is a long sequence of operations involving message 303 * exchanges between IP, ARP and device drivers. Many set ioctls are typically 304 * involved in plumbing operations. A natural model is to serialize these 305 * ioctls one per ill. For example plumbing of hme0 and qfe0 can go on in 306 * parallel without any interference. But various set ioctls on hme0 are best 307 * serialized. However if the system uses IPMP, the operations are easier if 308 * they are serialized on a per IPMP group basis since IPMP operations 309 * happen across ill's of a group. Thus the lowest common denominator is to 310 * serialize most set ioctls, multicast join/leave operations, IPMP operations 311 * igmp/mld timer operations, and processing of DLPI control messages received 312 * from drivers on a per IPMP group basis. 
 * If the system does not employ
 * IPMP the serialization is on a per phyint basis. This serialization is
 * provided by the ipsq_t and primitives operating on this. Details can
 * be found in ip_if.c above the core primitives operating on ipsq_t.
 *
 * Lookups of an ipif or ill by a thread return a refheld ipif / ill.
 * Similarly lookup of an ire by a thread also returns a refheld ire.
 * In addition ipif's and ill's referenced by the ire are also indirectly
 * refheld. Thus no ipif or ill can vanish nor can critical parameters like
 * the ipif's address or netmask change as long as an ipif is refheld
 * directly or indirectly. For example an SIOCLIFADDR ioctl that changes the
 * address of an ipif has to go through the ipsq_t. This ensures that only
 * 1 such exclusive operation proceeds at any time on the ipif. It then
 * deletes all ires associated with this ipif, and waits for all refcnts
 * associated with this ipif to come down to zero. The address is changed
 * only after the ipif has been quiesced. Then the ipif is brought up again.
 * More details are described above the comment in ip_sioctl_flags.
 *
 * Packet processing is based mostly on IREs and is fully multi-threaded
 * using standard Solaris MT techniques.
 *
 * There are explicit locks in IP to handle:
 * - The ip_g_head list maintained by mi_open_link() and friends.
 *
 * - The reassembly data structures (one lock per hash bucket)
 *
 * - conn_lock is meant to protect conn_t fields. The fields actually
 *   protected by conn_lock are documented in the conn_t definition.
 *
 * - ire_lock to protect some of the fields of the ire, IRE tables
 *   (one lock per hash bucket). Refer to ip_ire.c for details.
 *
 * - ndp_g_lock and nce_lock for protecting NCEs.
 *
 * - ill_lock protects fields of the ill and ipif. Details in ip.h
 *
 * - ill_g_lock: This is a global reader/writer lock.
Protects the following 349 * * The AVL tree based global multi list of all ills. 350 * * The linked list of all ipifs of an ill 351 * * The <ill-ipsq> mapping 352 * * The ipsq->ipsq_phyint_list threaded by phyint_ipsq_next 353 * * The illgroup list threaded by ill_group_next. 354 * * <ill-phyint> association 355 * Insertion/deletion of an ill in the system, insertion/deletion of an ipif 356 * into an ill, changing the <ill-ipsq> mapping of an ill, insertion/deletion 357 * of an ill into the illgrp list, changing the <ill-phyint> assoc of an ill 358 * will all have to hold the ill_g_lock as writer for the actual duration 359 * of the insertion/deletion/change. More details about the <ill-ipsq> mapping 360 * may be found in the IPMP section. 361 * 362 * - ill_lock: This is a per ill mutex. 363 * It protects some members of the ill and is documented below. 364 * It also protects the <ill-ipsq> mapping 365 * It also protects the illgroup list threaded by ill_group_next. 366 * It also protects the <ill-phyint> assoc. 367 * It also protects the list of ipifs hanging off the ill. 368 * 369 * - ipsq_lock: This is a per ipsq_t mutex lock. 370 * This protects all the other members of the ipsq struct except 371 * ipsq_refs and ipsq_phyint_list which are protected by ill_g_lock 372 * 373 * - illgrp_lock: This is a per ill_group mutex lock. 374 * The only thing it protects is the illgrp_ill_schednext member of ill_group 375 * which dictates which is the next ill in an ill_group that is to be chosen 376 * for sending outgoing packets, through creation of an IRE_CACHE that 377 * references this ill. 378 * 379 * - phyint_lock: This is a per phyint mutex lock. Protects just the 380 * phyint_flags 381 * 382 * - ip_g_nd_lock: This is a global reader/writer lock. 383 * Any call to nd_load to load a new parameter to the ND table must hold the 384 * lock as writer. ND_GET/ND_SET routines that read the ND table hold the lock 385 * as reader. 
 *
 * - ip_addr_avail_lock: This is used to ensure the uniqueness of IP addresses.
 *   This lock is held in ipif_up_done and the ipif is marked IPIF_UP and the
 *   uniqueness check also done atomically.
 *
 * - ipsec_capab_ills_lock: This readers/writer lock protects the global
 *   lists of IPsec capable ills (ipsec_capab_ills_{ah,esp}). It is taken
 *   as a writer when adding or deleting elements from these lists, and
 *   as a reader when walking these lists to send a SADB update to the
 *   IPsec capable ills.
 *
 * - ill_g_usesrc_lock: This readers/writer lock protects the usesrc
 *   group list linked by ill_usesrc_grp_next. It also protects the
 *   ill_usesrc_ifindex field. It is taken as a writer when a member of the
 *   group is being added or deleted. This lock is taken as a reader when
 *   walking the list/group(eg: to get the number of members in a usesrc group).
 *   Note, it is only necessary to take this lock if the ill_usesrc_grp_next
 *   field is changing state i.e from NULL to non-NULL or vice-versa. For
 *   example, it is not necessary to take this lock in the initial portion
 *   of ip_sioctl_slifusesrc or at all in ip_sioctl_groupname and
 *   ip_sioctl_flags since these operations are executed exclusively and
 *   that ensures that the "usesrc group state" cannot change. The "usesrc
 *   group state" change can happen only in the latter part of
 *   ip_sioctl_slifusesrc and in ill_delete.
 *
 * Changing <ill-phyint>, <ill-ipsq>, <ill-illgroup> associations.
 *
 * To change the <ill-phyint> association, the ill_g_lock must be held
 * as writer, and the ill_locks of both the v4 and v6 instance of the ill
 * must be held.
 *
 * To change the <ill-ipsq> association the ill_g_lock must be held as writer
 * and the ill_lock of the ill in question must be held.
419 * 420 * To change the <ill-illgroup> association the ill_g_lock must be held as 421 * writer and the ill_lock of the ill in question must be held. 422 * 423 * To add or delete an ipif from the list of ipifs hanging off the ill, 424 * ill_g_lock (writer) and ill_lock must be held and the thread must be 425 * a writer on the associated ipsq,. 426 * 427 * To add or delete an ill to the system, the ill_g_lock must be held as 428 * writer and the thread must be a writer on the associated ipsq. 429 * 430 * To add or delete an ilm to an ill, the ill_lock must be held and the thread 431 * must be a writer on the associated ipsq. 432 * 433 * Lock hierarchy 434 * 435 * Some lock hierarchy scenarios are listed below. 436 * 437 * ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock 438 * ill_g_lock -> illgrp_lock -> ill_lock 439 * ill_g_lock -> ill_lock(s) -> phyint_lock 440 * ill_g_lock -> ndp_g_lock -> ill_lock -> nce_lock 441 * ill_g_lock -> ip_addr_avail_lock 442 * conn_lock -> irb_lock -> ill_lock -> ire_lock 443 * ill_g_lock -> ip_g_nd_lock 444 * 445 * When more than 1 ill lock is needed to be held, all ill lock addresses 446 * are sorted on address and locked starting from highest addressed lock 447 * downward. 448 * 449 * Mobile-IP scenarios 450 * 451 * irb_lock -> ill_lock -> ire_mrtun_lock 452 * irb_lock -> ill_lock -> ire_srcif_table_lock 453 * 454 * IPsec scenarios 455 * 456 * ipsa_lock -> ill_g_lock -> ill_lock 457 * ipsec_capab_ills_lock -> ill_g_lock -> ill_lock 458 * ipsec_capab_ills_lock -> ipsa_lock 459 * ill_g_usesrc_lock -> ill_g_lock -> ill_lock 460 * 461 * Trusted Solaris scenarios 462 * 463 * igsa_lock -> gcgrp_rwlock -> gcgrp_lock 464 * igsa_lock -> gcdb_lock 465 * gcgrp_rwlock -> ire_lock 466 * gcgrp_rwlock -> gcdb_lock 467 * 468 * IPSEC notes : 469 * 470 * IP interacts with the IPSEC code (AH/ESP) by tagging a M_CTL message 471 * in front of the actual packet. 
 * For outbound datagrams, the M_CTL
 * contains an ipsec_out_t (defined in ipsec_info.h), which has the
 * information used by the IPSEC code for applying the right level of
 * protection. The information initialized by IP in the ipsec_out_t
 * is determined by the per-socket policy or global policy in the system.
 * For inbound datagrams, the M_CTL contains an ipsec_in_t (defined in
 * ipsec_info.h) which starts out with nothing in it. It gets filled
 * with the right information if it goes through the AH/ESP code, which
 * happens if the incoming packet is secure. The information initialized
 * by AH/ESP, is later used by IP (during fanouts to ULP) to see whether
 * the policy requirements needed by per-socket policy or global policy
 * is met or not.
 *
 * If there is both per-socket policy (set using setsockopt) and there
 * is also global policy match for the 5 tuples of the socket,
 * ipsec_override_policy() makes the decision of which one to use.
 *
 * For fully connected sockets i.e dst, src [addr, port] is known,
 * conn_policy_cached is set indicating that policy has been cached.
 * conn_in_enforce_policy may or may not be set depending on whether
 * there is a global policy match or per-socket policy match.
 * Policy inheriting happens in ip_bind during the ipa_conn_t bind.
 * Once the right policy is set on the conn_t, policy cannot change for
 * this socket. This makes life simpler for TCP (UDP ?) where
 * re-transmissions go out with the same policy. For symmetry, policy
 * is cached for fully connected UDP sockets also. Thus if policy is cached,
 * it also implies that policy is latched i.e policy cannot change
 * on these sockets. As we have the right policy on the conn, we don't
 * have to lookup global policy for every outbound and inbound datagram
 * and thus serving as an optimization.
Note that a global policy change 501 * does not affect fully connected sockets if they have policy. If fully 502 * connected sockets did not have any policy associated with it, global 503 * policy change may affect them. 504 * 505 * IP Flow control notes: 506 * 507 * Non-TCP streams are flow controlled by IP. On the send side, if the packet 508 * cannot be sent down to the driver by IP, because of a canput failure, IP 509 * does a putq on the conn_wq. This will cause ip_wsrv to run on the conn_wq. 510 * ip_wsrv in turn, inserts the conn in a list of conn's that need to be drained 511 * when the flowcontrol condition subsides. Ultimately STREAMS backenables the 512 * ip_wsrv on the IP module, which in turn does a qenable of the conn_wq of the 513 * first conn in the list of conn's to be drained. ip_wsrv on this conn drains 514 * the queued messages, and removes the conn from the drain list, if all 515 * messages were drained. It also qenables the next conn in the drain list to 516 * continue the drain process. 517 * 518 * In reality the drain list is not a single list, but a configurable number 519 * of lists. The ip_wsrv on the IP module, qenables the first conn in each 520 * list. If the ip_wsrv of the next qenabled conn does not run, because the 521 * stream closes, ip_close takes responsibility to qenable the next conn in 522 * the drain list. The directly called ip_wput path always does a putq, if 523 * it cannot putnext. Thus synchronization problems are handled between 524 * ip_wsrv and ip_close. conn_drain_insert and conn_drain_tail are the only 525 * functions that manipulate this drain list. Furthermore conn_drain_insert 526 * is called only from ip_wsrv, and there can be only 1 instance of ip_wsrv 527 * running on a queue at any time. conn_drain_tail can be simultaneously called 528 * from both ip_wsrv and ip_close. 529 * 530 * IPQOS notes: 531 * 532 * IPQoS Policies are applied to packets using IPPF (IP Policy framework) 533 * and IPQoS modules. 
 * IPPF includes hooks in IP at different control points
 * (callout positions) which direct packets to IPQoS modules for policy
 * processing. Policies, if present, are global.
 *
 * The callout positions are located in the following paths:
 * o local_in (packets destined for this host)
 * o local_out (packets originating from this host)
 * o fwd_in (packets forwarded by this m/c - inbound)
 * o fwd_out (packets forwarded by this m/c - outbound)
 * Hooks at these callout points can be enabled/disabled using the ndd variable
 * ip_policy_mask (a bit mask with the 4 LSB indicating the callout positions).
 * By default all the callout positions are enabled.
 *
 * Outbound (local_out)
 * Hooks are placed in ip_wput_ire and ipsec_out_process.
 *
 * Inbound (local_in)
 * Hooks are placed in ip_proto_input, icmp_inbound, ip_fanout_proto and
 * TCP and UDP fanout routines.
 *
 * Forwarding (in and out)
 * Hooks are placed in ip_rput_forward and ip_mrtun_forward.
 *
 * IP Policy Framework processing (IPPF processing)
 * Policy processing for a packet is initiated by ip_process, which ascertains
 * that the classifier (ipgpc) is loaded and configured, failing which the
 * packet resumes normal processing in IP. If the classifier is present, the
 * packet is acted upon by one or more IPQoS modules (action instances), per
 * filters configured in ipgpc and resumes normal IP processing thereafter.
 * An action instance can drop a packet in course of its processing.
 *
 * A boolean variable, ip_policy, is used in all the fanout routines that can
 * invoke ip_process for a packet. This variable indicates if the packet should
 * be sent for policy processing. The variable is set to B_TRUE by default,
 * i.e. when the routines are invoked in the normal ip processing path for a
 * packet.
The two exceptions being ip_wput_local and icmp_inbound_error_fanout; 569 * ip_policy is set to B_FALSE for all the routines called in these two 570 * functions because, in the former case, we don't process loopback traffic 571 * currently while in the latter, the packets have already been processed in 572 * icmp_inbound. 573 * 574 * Zones notes: 575 * 576 * The partitioning rules for networking are as follows: 577 * 1) Packets coming from a zone must have a source address belonging to that 578 * zone. 579 * 2) Packets coming from a zone can only be sent on a physical interface on 580 * which the zone has an IP address. 581 * 3) Between two zones on the same machine, packet delivery is only allowed if 582 * there's a matching route for the destination and zone in the forwarding 583 * table. 584 * 4) The TCP and UDP port spaces are per-zone; that is, two processes in 585 * different zones can bind to the same port with the wildcard address 586 * (INADDR_ANY). 587 * 588 * The granularity of interface partitioning is at the logical interface level. 589 * Therefore, every zone has its own IP addresses, and incoming packets can be 590 * attributed to a zone unambiguously. A logical interface is placed into a zone 591 * using the SIOCSLIFZONE ioctl; this sets the ipif_zoneid field in the ipif_t 592 * structure. Rule (1) is implemented by modifying the source address selection 593 * algorithm so that the list of eligible addresses is filtered based on the 594 * sending process zone. 595 * 596 * The Internet Routing Entries (IREs) are either exclusive to a zone or shared 597 * across all zones, depending on their type. 
Here is the break-up: 598 * 599 * IRE type Shared/exclusive 600 * -------- ---------------- 601 * IRE_BROADCAST Exclusive 602 * IRE_DEFAULT (default routes) Shared (*) 603 * IRE_LOCAL Exclusive 604 * IRE_LOOPBACK Exclusive 605 * IRE_PREFIX (net routes) Shared (*) 606 * IRE_CACHE Exclusive 607 * IRE_IF_NORESOLVER (interface routes) Exclusive 608 * IRE_IF_RESOLVER (interface routes) Exclusive 609 * IRE_HOST (host routes) Shared (*) 610 * 611 * (*) A zone can only use a default or off-subnet route if the gateway is 612 * directly reachable from the zone, that is, if the gateway's address matches 613 * one of the zone's logical interfaces. 614 * 615 * Multiple zones can share a common broadcast address; typically all zones 616 * share the 255.255.255.255 address. Incoming as well as locally originated 617 * broadcast packets must be dispatched to all the zones on the broadcast 618 * network. For directed broadcasts (e.g. 10.16.72.255) this is not trivial 619 * since some zones may not be on the 10.16.72/24 network. To handle this, each 620 * zone has its own set of IRE_BROADCAST entries; then, broadcast packets are 621 * sent to every zone that has an IRE_BROADCAST entry for the destination 622 * address on the input ill, see conn_wantpacket(). 623 * 624 * Applications in different zones can join the same multicast group address. 625 * For IPv4, group memberships are per-logical interface, so they're already 626 * inherently part of a zone. For IPv6, group memberships are per-physical 627 * interface, so we distinguish IPv6 group memberships based on group address, 628 * interface and zoneid. In both cases, received multicast packets are sent to 629 * every zone for which a group membership entry exists. On IPv6 we need to 630 * check that the target zone still has an address on the receiving physical 631 * interface; it could have been removed since the application issued the 632 * IPV6_JOIN_GROUP. 633 */ 634 635 /* 636 * Squeue Fanout flags: 637 * 0: No fanout. 
 * 1: Fanout across all squeues
 */
boolean_t	ip_squeue_fanout = 0;

/*
 * Maximum dups allowed per packet.
 */
uint_t ip_max_frag_dups = 10;

/* True when the header carries no IP options (5-word header, version 4). */
#define	IS_SIMPLE_IPH(ipha)						\
	((ipha)->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION)

/* RFC1122 Conformance */
#define	IP_FORWARD_DEFAULT	IP_FORWARD_NEVER

#define	ILL_MAX_NAMELEN	LIFNAMSIZ

/* Leave room for ip_newroute to tack on the src and target addresses */
#define	OK_RESOLVER_MP(mp)						\
	((mp) && ((mp)->b_wptr - (mp)->b_rptr) >= (2 * IP_ADDR_LEN))

static int	conn_set_held_ipif(conn_t *, ipif_t **, ipif_t *);

static mblk_t	*ip_wput_attach_llhdr(mblk_t *, ire_t *, ip_proc_t, uint32_t);
static void	ip_ipsec_out_prepend(mblk_t *, mblk_t *, ill_t *);

/* ICMP inbound/outbound handling */
static void	icmp_frag_needed(queue_t *, mblk_t *, int);
static void	icmp_inbound(queue_t *, mblk_t *, boolean_t, ill_t *, int,
    uint32_t, boolean_t, boolean_t, ill_t *, zoneid_t);
static boolean_t icmp_inbound_too_big(icmph_t *, ipha_t *);
static void	icmp_inbound_error_fanout(queue_t *, ill_t *, mblk_t *,
    icmph_t *, ipha_t *, int, int, boolean_t, boolean_t,
    ill_t *, zoneid_t);
static void	icmp_options_update(ipha_t *);
static void	icmp_param_problem(queue_t *, mblk_t *, uint8_t);
static void	icmp_pkt(queue_t *, mblk_t *, void *, size_t, boolean_t);
static mblk_t	*icmp_pkt_err_ok(mblk_t *);
static void	icmp_redirect(mblk_t *);
static void	icmp_send_redirect(queue_t *, mblk_t *, ipaddr_t);

/* IP receive/transmit path and fanout helpers */
static void	ip_arp_news(queue_t *, mblk_t *);
static boolean_t ip_bind_insert_ire(mblk_t *, ire_t *, iulp_t *);
mblk_t		*ip_dlpi_alloc(size_t, t_uscalar_t);
char		*ip_dot_addr(ipaddr_t, char *);
mblk_t		*ip_carve_mp(mblk_t **, ssize_t);
int		ip_close(queue_t *, int);
static char	*ip_dot_saddr(uchar_t *, char *);
static void	ip_fanout_proto(queue_t *, mblk_t *, ill_t *, ipha_t *, uint_t,
    boolean_t, boolean_t, ill_t *, zoneid_t);
static void	ip_fanout_tcp(queue_t *, mblk_t *, ill_t *, ipha_t *, uint_t,
    boolean_t, boolean_t, zoneid_t);
static void	ip_fanout_udp(queue_t *, mblk_t *, ill_t *, ipha_t *, uint32_t,
    boolean_t, uint_t, boolean_t, boolean_t, ill_t *, zoneid_t);
static void	ip_lrput(queue_t *, mblk_t *);
ipaddr_t	ip_massage_options(ipha_t *);
static void	ip_mrtun_forward(ire_t *, ill_t *, mblk_t *);
ipaddr_t	ip_net_mask(ipaddr_t);
void		ip_newroute(queue_t *, mblk_t *, ipaddr_t, ill_t *, conn_t *);
static void	ip_newroute_ipif(queue_t *, mblk_t *, ipif_t *, ipaddr_t,
    conn_t *, uint32_t);
static int	ip_hdr_complete(ipha_t *, zoneid_t);
char		*ip_nv_lookup(nv_t *, int);
static boolean_t	ip_check_for_ipsec_opt(queue_t *, mblk_t *);
static int	ip_param_get(queue_t *, mblk_t *, caddr_t, cred_t *);
static int	ip_param_generic_get(queue_t *, mblk_t *, caddr_t, cred_t *);
static boolean_t	ip_param_register(ipparam_t *, size_t, ipndp_t *,
    size_t);
static int	ip_param_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *);
void	ip_rput(queue_t *, mblk_t *);
static void	ip_rput_dlpi_writer(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp,
    void *dummy_arg);
void	ip_rput_forward(ire_t *, ipha_t *, mblk_t *, ill_t *);
static int	ip_rput_forward_options(mblk_t *, ipha_t *, ire_t *);
static boolean_t	ip_rput_local_options(queue_t *, mblk_t *, ipha_t *,
    ire_t *);
static int	ip_rput_options(queue_t *, mblk_t *, ipha_t *, ipaddr_t *);
static boolean_t ip_rput_fragment(queue_t *, mblk_t **, ipha_t *, uint32_t *,
    uint16_t *);
/* SNMP MIB-2 report generators */
int		ip_snmp_get(queue_t *, mblk_t *);
static mblk_t	*ip_snmp_get_mib2_ip(queue_t *, mblk_t *);
static mblk_t	*ip_snmp_get_mib2_ip6(queue_t *, mblk_t *);
static mblk_t	*ip_snmp_get_mib2_icmp(queue_t *, mblk_t *);
static mblk_t	*ip_snmp_get_mib2_icmp6(queue_t *, mblk_t *);
static mblk_t	*ip_snmp_get_mib2_igmp(queue_t *, mblk_t *);
static mblk_t	*ip_snmp_get_mib2_multi(queue_t *, mblk_t *);
static mblk_t	*ip_snmp_get_mib2_ip_addr(queue_t *, mblk_t *);
static mblk_t	*ip_snmp_get_mib2_ip6_addr(queue_t *, mblk_t *);
static mblk_t	*ip_snmp_get_mib2_ip_group_mem(queue_t *, mblk_t *);
static mblk_t	*ip_snmp_get_mib2_ip6_group_mem(queue_t *, mblk_t *);
static mblk_t	*ip_snmp_get_mib2_ip_group_src(queue_t *, mblk_t *);
static mblk_t	*ip_snmp_get_mib2_ip6_group_src(queue_t *, mblk_t *);
static mblk_t	*ip_snmp_get_mib2_virt_multi(queue_t *, mblk_t *);
static mblk_t	*ip_snmp_get_mib2_multi_rtable(queue_t *, mblk_t *);
static mblk_t	*ip_snmp_get_mib2_ip_route_media(queue_t *, mblk_t *);
static mblk_t	*ip_snmp_get_mib2_ip6_route_media(queue_t *, mblk_t *);
static void	ip_snmp_get2_v4(ire_t *, iproutedata_t *);
static void	ip_snmp_get2_v6_route(ire_t *, iproutedata_t *);
static int	ip_snmp_get2_v6_media(nce_t *, iproutedata_t *);
int		ip_snmp_set(queue_t *, int, int, uchar_t *, int);
static boolean_t	ip_source_routed(ipha_t *);
static boolean_t	ip_source_route_included(ipha_t *);

static void	ip_wput_frag(ire_t *, mblk_t *, ip_pkt_t, uint32_t, uint32_t);
static mblk_t	*ip_wput_frag_copyhdr(uchar_t *, int, int);
static void	ip_wput_local_options(ipha_t *);
static int	ip_wput_options(queue_t *, mblk_t *, ipha_t *, boolean_t,
    zoneid_t);

/* conn_t drain-list (flow control) management */
static void	conn_drain_init(void);
static void	conn_drain_fini(void);
static void	conn_drain_tail(conn_t *connp, boolean_t closing);

static void	conn_walk_drain(void);
static void	conn_walk_fanout_table(connf_t *, uint_t, pfv_t, void *,
    zoneid_t);

static boolean_t	conn_wantpacket(conn_t *, ill_t *, ipha_t *, int,
    zoneid_t);
static void	ip_arp_done(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp,
    void *dummy_arg);

static int	ip_forward_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *);

static int	ip_multirt_apply_membership(int (*fn)(conn_t *, boolean_t,
    ipaddr_t, ipaddr_t, uint_t *, mcast_record_t, ipaddr_t, mblk_t *), ire_t *,
    conn_t *, boolean_t, ipaddr_t, mcast_record_t, ipaddr_t, mblk_t *);
static void	ip_multirt_bad_mtu(ire_t *, uint32_t);

/* NDD get/set handlers for CGTP and squeue tunables */
static int	ip_cgtp_filter_get(queue_t *, mblk_t *, caddr_t, cred_t *);
static int	ip_cgtp_filter_set(queue_t *, mblk_t *, char *,
    caddr_t, cred_t *);
extern int	ip_squeue_bind_set(queue_t *q, mblk_t *mp, char *value,
    caddr_t cp, cred_t *cr);
extern int	ip_squeue_profile_set(queue_t *, mblk_t *, char *, caddr_t,
    cred_t *);
static int	ip_input_proc_set(queue_t *q, mblk_t *mp, char *value,
    caddr_t cp, cred_t *cr);
static int	ip_int_set(queue_t *, mblk_t *, char *, caddr_t,
    cred_t *);
static squeue_func_t ip_squeue_switch(int);

static void	ip_kstat_init(void);
static void	ip_kstat_fini(void);
static int	ip_kstat_update(kstat_t *kp, int rw);
static void	icmp_kstat_init(void);
static void	icmp_kstat_fini(void);
static int	icmp_kstat_update(kstat_t *kp, int rw);

static int	ip_conn_report(queue_t *, mblk_t *, caddr_t, cred_t *);

static boolean_t	ip_no_forward(ipha_t *, ill_t *);
static boolean_t	ip_loopback_src_or_dst(ipha_t *, ill_t *);

static mblk_t	*ip_tcp_input(mblk_t *, ipha_t *, ill_t *, boolean_t,
    ire_t *, mblk_t *, uint_t, queue_t *, ill_rx_ring_t *);

void	ip_input(ill_t *, ill_rx_ring_t *, mblk_t *, size_t);

/* IRE garbage-collection timer state */
timeout_id_t ip_ire_expire_id;	/* IRE expiration timer. */
static clock_t ip_ire_arp_time_elapsed; /* Time since IRE cache last flushed */
static clock_t ip_ire_rd_time_elapsed;	/* ... redirect IREs last flushed */
static clock_t ip_ire_pmtu_time_elapsed; /* Time since path mtu increase */

uint_t	ip_ire_default_count;	/* Number of IPv4 IRE_DEFAULT entries.
 */
uint_t	ip_ire_default_index;	/* Walking index used to mod in */

ipaddr_t	ip_g_all_ones = IP_HOST_MASK;
clock_t icmp_pkt_err_last = 0;	/* Time since last icmp_pkt_err */
uint_t	icmp_pkt_err_sent = 0;	/* Number of packets sent in burst */

/* How long, in seconds, we allow frags to hang around. */
#define	IP_FRAG_TIMEOUT	60

time_t	ip_g_frag_timeout = IP_FRAG_TIMEOUT;
clock_t	ip_g_frag_timo_ms = IP_FRAG_TIMEOUT * 1000;

/*
 * Threshold which determines whether MDT should be used when
 * generating IP fragments; payload size must be greater than
 * this threshold for MDT to take place.
 */
#define	IP_WPUT_FRAG_MDT_MIN	32768

int	ip_wput_frag_mdt_min = IP_WPUT_FRAG_MDT_MIN;

/* Protected by ip_mi_lock */
static void	*ip_g_head;	/* Instance Data List Head */
kmutex_t	ip_mi_lock;	/* Lock for list of instances */

/* Only modified during _init and _fini thus no locking is needed. */
caddr_t		ip_g_nd;	/* Named Dispatch List Head */


static long ip_rput_pullups;
int	dohwcksum = 1;	/* use h/w cksum if supported by the hardware */

vmem_t *ip_minor_arena;

/*
 * MIB-2 stuff for SNMP (both IP and ICMP)
 */
mib2_ip_t	ip_mib;
mib2_icmp_t	icmp_mib;

#ifdef DEBUG
uint32_t ipsechw_debug = 0;
#endif

kstat_t		*ip_mibkp;	/* kstat exporting ip_mib data */
kstat_t		*icmp_mibkp;	/* kstat exporting icmp_mib data */

uint_t	loopback_packets = 0;

/*
 * Multirouting/CGTP stuff
 */
cgtp_filter_ops_t	*ip_cgtp_filter_ops;	/* CGTP hooks */
int	ip_cgtp_filter_rev = CGTP_FILTER_REV;	/* CGTP hooks version */
boolean_t	ip_cgtp_filter;	/* Enable/disable CGTP hooks */
/* Interval (in ms) between consecutive 'bad MTU' warnings */
hrtime_t ip_multirt_log_interval = 1000;
/* Time since last warning issued.
 */
static hrtime_t multirt_bad_mtu_last_time = 0;

kmutex_t ip_trash_timer_lock;
krwlock_t ip_g_nd_lock;

/*
 * XXX following really should only be in a header. Would need more
 * header and .c clean up first.
 */
extern optdb_obj_t ip_opt_obj;

ulong_t ip_squeue_enter_unbound = 0;

/*
 * Named Dispatch Parameter Table.
 * All of these are alterable, within the min/max values given, at run time.
 *
 * NOTE: entry order is load-bearing -- the ip_param_arr[N] accessor
 * #defines below index this array by position.  Add new entries at the
 * end (before the #ifdef DEBUG entry) and keep the #defines in sync.
 */
static ipparam_t	lcl_param_arr[] = {
	/* min	max	value	name */
	{  0,	1,	0,	"ip_respond_to_address_mask_broadcast"},
	{  0,	1,	1,	"ip_respond_to_echo_broadcast"},
	{  0,	1,	1,	"ip_respond_to_echo_multicast"},
	{  0,	1,	0,	"ip_respond_to_timestamp"},
	{  0,	1,	0,	"ip_respond_to_timestamp_broadcast"},
	{  0,	1,	1,	"ip_send_redirects"},
	{  0,	1,	0,	"ip_forward_directed_broadcasts"},
	{  0,	10,	0,	"ip_debug"},
	{  0,	10,	0,	"ip_mrtdebug"},
	{  5000, 999999999, 60000, "ip_ire_timer_interval" },
	{  60000, 999999999, 1200000, "ip_ire_arp_interval" },
	{  60000, 999999999, 60000, "ip_ire_redirect_interval" },
	{  1,	255,	255,	"ip_def_ttl" },
	{  0,	1,	0,	"ip_forward_src_routed"},
	{  0,	256,	32,	"ip_wroff_extra" },
	{  5000, 999999999, 600000, "ip_ire_pathmtu_interval" },
	{  8,	65536,	64,	"ip_icmp_return_data_bytes" },
	{  0,	1,	1,	"ip_path_mtu_discovery" },
	{  0,	240,	30,	"ip_ignore_delete_time" },
	{  0,	1,	0,	"ip_ignore_redirect" },
	{  0,	1,	1,	"ip_output_queue" },
	{  1,	254,	1,	"ip_broadcast_ttl" },
	{  0,	99999,	100,	"ip_icmp_err_interval" },
	{  1,	99999,	10,	"ip_icmp_err_burst" },
	{  0,	999999999, 1000000, "ip_reass_queue_bytes" },
	{  0,	1,	0,	"ip_strict_dst_multihoming" },
	{  1,	MAX_ADDRS_PER_IF, 256, "ip_addrs_per_if"},
	{  0,	1,	0,	"ipsec_override_persocket_policy" },
	{  0,	1,	1,	"icmp_accept_clear_messages" },
	{  0,	1,	1,	"igmp_accept_clear_messages" },
	{  2,	999999999, ND_DELAY_FIRST_PROBE_TIME,
				"ip_ndp_delay_first_probe_time"},
	{  1,	999999999, ND_MAX_UNICAST_SOLICIT,
				"ip_ndp_max_unicast_solicit"},
	{  1,	255,	IPV6_MAX_HOPS,	"ip6_def_hops" },
	{  8,	IPV6_MIN_MTU,	IPV6_MIN_MTU, "ip6_icmp_return_data_bytes" },
	{  0,	1,	0,	"ip6_forward_src_routed"},
	{  0,	1,	1,	"ip6_respond_to_echo_multicast"},
	{  0,	1,	1,	"ip6_send_redirects"},
	{  0,	1,	0,	"ip6_ignore_redirect" },
	{  0,	1,	0,	"ip6_strict_dst_multihoming" },

	{  1,	8,	3,	"ip_ire_reclaim_fraction" },

	{  0,	999999,	1000,	"ipsec_policy_log_interval" },

	{  0,	1,	1,	"pim_accept_clear_messages" },
	{  1000, 20000,	2000,	"ip_ndp_unsolicit_interval" },
	{  1,	20,	3,	"ip_ndp_unsolicit_count" },
	{  0,	1,	1,	"ip6_ignore_home_address_opt" },
	{  0,	15,	0,	"ip_policy_mask" },
	{  1000, 60000,	1000,	"ip_multirt_resolution_interval" },
	{  0,	255,	1,	"ip_multirt_ttl" },
	{  0,	1,	1,	"ip_multidata_outbound" },
#ifdef DEBUG
	{  0,	1,	0,	"ip6_drop_inbound_icmpv6" },
#endif
};

ipparam_t	*ip_param_arr = lcl_param_arr;

/* Extended NDP table */
static ipndp_t	lcl_ndp_arr[] = {
	/* getf			setf		data		name */
	{  ip_param_generic_get, ip_forward_set, (caddr_t)&ip_g_forward,
	    "ip_forwarding" },
	{  ip_param_generic_get, ip_forward_set, (caddr_t)&ipv6_forward,
	    "ip6_forwarding" },
	{  ip_ill_report,	NULL,		NULL,
	    "ip_ill_status" },
	{  ip_ipif_report,	NULL,		NULL,
	    "ip_ipif_status" },
	{  ip_ire_report,	NULL,		NULL,
	    "ipv4_ire_status" },
	{  ip_ire_report_mrtun,	NULL,		NULL,
	    "ipv4_mrtun_ire_status" },
	{  ip_ire_report_srcif,	NULL,		NULL,
	    "ipv4_srcif_ire_status" },
	{  ip_ire_report_v6,	NULL,		NULL,
	    "ipv6_ire_status" },
	{  ip_conn_report,	NULL,		NULL,
	    "ip_conn_status" },
	{  nd_get_long,		nd_set_long,	(caddr_t)&ip_rput_pullups,
	    "ip_rput_pullups" },
	{  ndp_report,		NULL,		NULL,
	    "ip_ndp_cache_report" },
	{  ip_srcid_report,	NULL,		NULL,
	    "ip_srcid_status" },
	{ ip_param_generic_get, ip_squeue_profile_set,
	    (caddr_t)&ip_squeue_profile, "ip_squeue_profile" },
	{ ip_param_generic_get, ip_squeue_bind_set,
	    (caddr_t)&ip_squeue_bind, "ip_squeue_bind" },
	{ ip_param_generic_get, ip_input_proc_set,
	    (caddr_t)&ip_squeue_enter, "ip_squeue_enter" },
	{ ip_param_generic_get, ip_int_set,
	    (caddr_t)&ip_squeue_fanout, "ip_squeue_fanout" },
	{ ip_cgtp_filter_get, ip_cgtp_filter_set, (caddr_t)&ip_cgtp_filter,
	    "ip_cgtp_filter" },
	{ ip_param_generic_get, ip_int_set,
	    (caddr_t)&ip_soft_rings_cnt, "ip_soft_rings_cnt" }
};

/*
 * ip_g_forward controls IP forwarding.  It takes two values:
 *	0: IP_FORWARD_NEVER	Don't forward packets ever.
 *	1: IP_FORWARD_ALWAYS	Forward packets for elsewhere.
 *
 * RFC1122 says there must be a configuration switch to control forwarding,
 * but that the default MUST be to not forward packets ever.  Implicit
 * control based on configuration of multiple interfaces MUST NOT be
 * implemented (Section 3.1).  SunOS 4.1 did provide the "automatic" capability
 * and, in fact, it was the default.  That capability is now provided in the
 * /etc/rc2.d/S69inet script.
 */
int	ip_g_forward = IP_FORWARD_DEFAULT;

/* It also has an IPv6 counterpart. */

int	ipv6_forward = IP_FORWARD_DEFAULT;

/* Following line is external, and in ip.h.  Normally marked with * *.
 */
/*
 * Positional accessors into lcl_param_arr (via ip_param_arr).  The index
 * in each #define must match the entry's position in lcl_param_arr above.
 */
#define	ip_respond_to_address_mask_broadcast ip_param_arr[0].ip_param_value
#define	ip_g_resp_to_echo_bcast		ip_param_arr[1].ip_param_value
#define	ip_g_resp_to_echo_mcast		ip_param_arr[2].ip_param_value
#define	ip_g_resp_to_timestamp		ip_param_arr[3].ip_param_value
#define	ip_g_resp_to_timestamp_bcast	ip_param_arr[4].ip_param_value
#define	ip_g_send_redirects		ip_param_arr[5].ip_param_value
#define	ip_g_forward_directed_bcast	ip_param_arr[6].ip_param_value
#define	ip_debug			ip_param_arr[7].ip_param_value	/* */
#define	ip_mrtdebug			ip_param_arr[8].ip_param_value	/* */
#define	ip_timer_interval		ip_param_arr[9].ip_param_value	/* */
#define	ip_ire_arp_interval		ip_param_arr[10].ip_param_value	/* */
#define	ip_ire_redir_interval		ip_param_arr[11].ip_param_value
#define	ip_def_ttl			ip_param_arr[12].ip_param_value
#define	ip_forward_src_routed		ip_param_arr[13].ip_param_value
#define	ip_wroff_extra			ip_param_arr[14].ip_param_value
#define	ip_ire_pathmtu_interval		ip_param_arr[15].ip_param_value
#define	ip_icmp_return			ip_param_arr[16].ip_param_value
#define	ip_path_mtu_discovery		ip_param_arr[17].ip_param_value	/* */
#define	ip_ignore_delete_time		ip_param_arr[18].ip_param_value	/* */
#define	ip_ignore_redirect		ip_param_arr[19].ip_param_value
#define	ip_output_queue			ip_param_arr[20].ip_param_value
#define	ip_broadcast_ttl		ip_param_arr[21].ip_param_value
#define	ip_icmp_err_interval		ip_param_arr[22].ip_param_value
#define	ip_icmp_err_burst		ip_param_arr[23].ip_param_value
#define	ip_reass_queue_bytes		ip_param_arr[24].ip_param_value
#define	ip_strict_dst_multihoming	ip_param_arr[25].ip_param_value
#define	ip_addrs_per_if			ip_param_arr[26].ip_param_value
#define	ipsec_override_persocket_policy	ip_param_arr[27].ip_param_value	/* */
#define	icmp_accept_clear_messages	ip_param_arr[28].ip_param_value
#define	igmp_accept_clear_messages	ip_param_arr[29].ip_param_value

/* IPv6 configuration knobs */
#define	delay_first_probe_time		ip_param_arr[30].ip_param_value
#define	max_unicast_solicit		ip_param_arr[31].ip_param_value
#define	ipv6_def_hops			ip_param_arr[32].ip_param_value
#define	ipv6_icmp_return		ip_param_arr[33].ip_param_value
#define	ipv6_forward_src_routed		ip_param_arr[34].ip_param_value
#define	ipv6_resp_echo_mcast		ip_param_arr[35].ip_param_value
#define	ipv6_send_redirects		ip_param_arr[36].ip_param_value
#define	ipv6_ignore_redirect		ip_param_arr[37].ip_param_value
#define	ipv6_strict_dst_multihoming	ip_param_arr[38].ip_param_value
#define	ip_ire_reclaim_fraction		ip_param_arr[39].ip_param_value
#define	ipsec_policy_log_interval	ip_param_arr[40].ip_param_value
#define	pim_accept_clear_messages	ip_param_arr[41].ip_param_value
#define	ip_ndp_unsolicit_interval	ip_param_arr[42].ip_param_value
#define	ip_ndp_unsolicit_count		ip_param_arr[43].ip_param_value
#define	ipv6_ignore_home_address_opt	ip_param_arr[44].ip_param_value
#define	ip_policy_mask			ip_param_arr[45].ip_param_value
#define	ip_multirt_resolution_interval	ip_param_arr[46].ip_param_value
#define	ip_multirt_ttl			ip_param_arr[47].ip_param_value
#define	ip_multidata_outbound		ip_param_arr[48].ip_param_value
#ifdef DEBUG
#define	ipv6_drop_inbound_icmpv6	ip_param_arr[49].ip_param_value
#else
/* Non-DEBUG kernels never drop inbound ICMPv6 via this knob. */
#define	ipv6_drop_inbound_icmpv6	0
#endif


/*
 * Table of IP ioctls encoding the various properties of the ioctl and
 * indexed based on the last byte of the ioctl command. Occasionally there
 * is a clash, and there is more than 1 ioctl with the same last byte.
 * In such a case 1 ioctl is encoded in the ndx table and the remaining
 * ioctls are encoded in the misc table.
 * An entry in the ndx table is
 * retrieved by indexing on the last byte of the ioctl command and comparing
 * the ioctl command with the value in the ndx table. In the event of a
 * mismatch the misc table is then searched sequentially for the desired
 * ioctl command.
 *
 * Entry: <command> <copyin_size> <flags> <cmd_type> <function> <restart_func>
 *
 * NOTE: the /* NNN *-style index comments are load-bearing documentation:
 * entry NNN must sit at array index NNN (the low byte of the ioctl command).
 * Unused slots are filled with IPI_DONTCARE placeholders.
 */
ip_ioctl_cmd_t ip_ndx_ioctl_table[] = {
	/* 000 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 001 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 002 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 003 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 004 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 005 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 006 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 007 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 008 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 009 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },

	/* 010 */ { SIOCADDRT,	sizeof (struct rtentry), IPI_PRIV,
			MISC_CMD, ip_siocaddrt, NULL },
	/* 011 */ { SIOCDELRT,	sizeof (struct rtentry), IPI_PRIV,
			MISC_CMD, ip_siocdelrt, NULL },

	/* 012 */ { SIOCSIFADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
			IF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart },
	/* 013 */ { SIOCGIFADDR, sizeof (struct ifreq), IPI_GET_CMD | IPI_REPL,
			IF_CMD, ip_sioctl_get_addr, NULL },

	/* 014 */ { SIOCSIFDSTADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
			IF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart },
	/* 015 */ { SIOCGIFDSTADDR, sizeof (struct ifreq),
			IPI_GET_CMD | IPI_REPL,
			IF_CMD, ip_sioctl_get_dstaddr, NULL },

	/* 016 */ { SIOCSIFFLAGS, sizeof (struct ifreq),
			IPI_PRIV | IPI_WR | IPI_REPL,
			IF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart },
	/* 017 */ { SIOCGIFFLAGS, sizeof (struct ifreq),
			IPI_MODOK | IPI_GET_CMD | IPI_REPL,
			IF_CMD, ip_sioctl_get_flags, NULL },

	/* 018 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 019 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },

	/* copyin size cannot be coded for SIOCGIFCONF */
	/* 020 */ { O_SIOCGIFCONF, 0, IPI_GET_CMD | IPI_REPL,
			MISC_CMD, ip_sioctl_get_ifconf, NULL },

	/* 021 */ { SIOCSIFMTU,	sizeof (struct ifreq), IPI_PRIV | IPI_WR,
			IF_CMD, ip_sioctl_mtu, NULL },
	/* 022 */ { SIOCGIFMTU,	sizeof (struct ifreq), IPI_GET_CMD | IPI_REPL,
			IF_CMD, ip_sioctl_get_mtu, NULL },
	/* 023 */ { SIOCGIFBRDADDR, sizeof (struct ifreq),
			IPI_GET_CMD | IPI_REPL,
			IF_CMD, ip_sioctl_get_brdaddr, NULL },
	/* 024 */ { SIOCSIFBRDADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
			IF_CMD, ip_sioctl_brdaddr, NULL },
	/* 025 */ { SIOCGIFNETMASK, sizeof (struct ifreq),
			IPI_GET_CMD | IPI_REPL,
			IF_CMD, ip_sioctl_get_netmask, NULL },
	/* 026 */ { SIOCSIFNETMASK, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
			IF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart },
	/* 027 */ { SIOCGIFMETRIC, sizeof (struct ifreq),
			IPI_GET_CMD | IPI_REPL,
			IF_CMD, ip_sioctl_get_metric, NULL },
	/* 028 */ { SIOCSIFMETRIC, sizeof (struct ifreq), IPI_PRIV,
			IF_CMD, ip_sioctl_metric, NULL },
	/* 029 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },

	/* See 166-168 below for extended SIOC*XARP ioctls */
	/* 030 */ { SIOCSARP, sizeof (struct arpreq), IPI_PRIV,
			MISC_CMD, ip_sioctl_arp, NULL },
	/* 031 */ { SIOCGARP, sizeof (struct arpreq), IPI_GET_CMD | IPI_REPL,
			MISC_CMD, ip_sioctl_arp, NULL },
	/* 032 */ { SIOCDARP, sizeof (struct arpreq), IPI_PRIV,
			MISC_CMD, ip_sioctl_arp, NULL },

	/* 033 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 034 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 035 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 036 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 037 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 038 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 039 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 040 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 041 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 042 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 043 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 044 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 045 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 046 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 047 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 048 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 049 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 050 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 051 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 052 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 053 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },

	/* 054 */ { IF_UNITSEL,	sizeof (int), IPI_PRIV | IPI_WR | IPI_MODOK,
			MISC_CMD, if_unitsel, if_unitsel_restart },

	/* 055 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 056 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 057 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 058 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 059 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 060 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 061 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 062 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 063 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 064 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 065 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 066 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 067 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 068 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 069 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 070 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 071 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 072 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },

	/* 073 */ { SIOCSIFNAME, sizeof (struct ifreq),
			IPI_PRIV | IPI_WR | IPI_MODOK,
			IF_CMD, ip_sioctl_sifname, NULL },

	/* 074 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 075 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 076 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 077 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 078 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 079 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 080 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 081 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 082 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 083 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 084 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 085 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 086 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },

	/* 087 */ { SIOCGIFNUM, sizeof (int), IPI_GET_CMD | IPI_REPL,
			MISC_CMD, ip_sioctl_get_ifnum, NULL },
	/* 088 */ { SIOCGIFMUXID, sizeof (struct ifreq), IPI_GET_CMD | IPI_REPL,
			IF_CMD, ip_sioctl_get_muxid, NULL },
	/* 089 */ { SIOCSIFMUXID, sizeof (struct ifreq),
			IPI_PRIV | IPI_WR | IPI_REPL,
			IF_CMD, ip_sioctl_muxid, NULL },

	/* Both if and lif variants share same func */
	/* 090 */ { SIOCGIFINDEX, sizeof (struct ifreq), IPI_GET_CMD | IPI_REPL,
			IF_CMD, ip_sioctl_get_lifindex, NULL },
	/* Both if and lif variants share same func */
	/* 091 */ { SIOCSIFINDEX, sizeof (struct ifreq),
			IPI_PRIV | IPI_WR | IPI_REPL,
			IF_CMD, ip_sioctl_slifindex, NULL },

	/* copyin size cannot be coded for SIOCGIFCONF */
	/* 092 */ { SIOCGIFCONF, 0, IPI_GET_CMD | IPI_REPL,
			MISC_CMD, ip_sioctl_get_ifconf, NULL },
	/* 093 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 094 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 095 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 096 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 097 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 098 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 099 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 100 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 101 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 102 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 103 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 104 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 105 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 106 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 107 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 108 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 109 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },

	/* 110 */ { SIOCLIFREMOVEIF, sizeof (struct lifreq),
			IPI_PRIV | IPI_WR | IPI_REPL,
			LIF_CMD, ip_sioctl_removeif,
			ip_sioctl_removeif_restart },
	/* 111 */ { SIOCLIFADDIF, sizeof (struct lifreq),
			IPI_GET_CMD | IPI_PRIV | IPI_WR | IPI_REPL,
			LIF_CMD, ip_sioctl_addif, NULL },
#define	SIOCLIFADDR_NDX 112
	/* 112 */ { SIOCSLIFADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart },
	/* 113 */ { SIOCGLIFADDR, sizeof (struct lifreq),
			IPI_GET_CMD | IPI_REPL,
			LIF_CMD, ip_sioctl_get_addr, NULL },
	/* 114 */ { SIOCSLIFDSTADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart },
	/* 115 */ { SIOCGLIFDSTADDR, sizeof (struct lifreq),
			IPI_GET_CMD | IPI_REPL,
			LIF_CMD, ip_sioctl_get_dstaddr, NULL },
	/* 116 */ { SIOCSLIFFLAGS, sizeof (struct lifreq),
			IPI_PRIV | IPI_WR | IPI_REPL,
			LIF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart },
	/* 117 */ { SIOCGLIFFLAGS, sizeof (struct lifreq),
			IPI_GET_CMD | IPI_MODOK | IPI_REPL,
			LIF_CMD, ip_sioctl_get_flags, NULL },

	/* 118 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 119 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },

	/*
	 * NOTE(review): for 120 and 165 the IPI_REPL flag is OR-ed into the
	 * cmd_type field rather than the flags field -- looks like a latent
	 * inconsistency; confirm against ip_ioctl_cmd_t consumers before
	 * changing.
	 */
	/* 120 */ { O_SIOCGLIFCONF, 0, IPI_GET_CMD, MISC_CMD | IPI_REPL,
			ip_sioctl_get_lifconf, NULL },
	/* 121 */ { SIOCSLIFMTU, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_mtu, NULL },
	/* 122 */ { SIOCGLIFMTU, sizeof (struct lifreq), IPI_GET_CMD | IPI_REPL,
			LIF_CMD, ip_sioctl_get_mtu, NULL },
	/* 123 */ { SIOCGLIFBRDADDR, sizeof (struct lifreq),
			IPI_GET_CMD | IPI_REPL,
			LIF_CMD, ip_sioctl_get_brdaddr, NULL },
	/* 124 */ { SIOCSLIFBRDADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_brdaddr, NULL },
	/* 125 */ { SIOCGLIFNETMASK, sizeof (struct lifreq),
			IPI_GET_CMD | IPI_REPL,
			LIF_CMD, ip_sioctl_get_netmask, NULL },
	/* 126 */ { SIOCSLIFNETMASK, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart },
	/* 127 */ { SIOCGLIFMETRIC, sizeof (struct lifreq),
			IPI_GET_CMD | IPI_REPL,
			LIF_CMD, ip_sioctl_get_metric, NULL },
	/* 128 */ { SIOCSLIFMETRIC, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_metric, NULL },
	/* 129 */ { SIOCSLIFNAME, sizeof (struct lifreq),
			IPI_PRIV | IPI_WR | IPI_MODOK | IPI_REPL,
			LIF_CMD, ip_sioctl_slifname,
			ip_sioctl_slifname_restart },

	/* 130 */ { SIOCGLIFNUM, sizeof (struct lifnum), IPI_GET_CMD | IPI_REPL,
			MISC_CMD, ip_sioctl_get_lifnum, NULL },
	/* 131 */ { SIOCGLIFMUXID, sizeof (struct lifreq),
			IPI_GET_CMD | IPI_REPL,
			LIF_CMD, ip_sioctl_get_muxid, NULL },
	/* 132 */ { SIOCSLIFMUXID, sizeof (struct lifreq),
			IPI_PRIV | IPI_WR | IPI_REPL,
			LIF_CMD, ip_sioctl_muxid, NULL },
	/* 133 */ { SIOCGLIFINDEX, sizeof (struct lifreq),
			IPI_GET_CMD | IPI_REPL,
			LIF_CMD, ip_sioctl_get_lifindex, 0 },
	/* 134 */ { SIOCSLIFINDEX, sizeof (struct lifreq),
			IPI_PRIV | IPI_WR | IPI_REPL,
			LIF_CMD, ip_sioctl_slifindex, 0 },
	/* 135 */ { SIOCSLIFTOKEN, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_token, NULL },
	/* 136 */ { SIOCGLIFTOKEN, sizeof (struct lifreq),
			IPI_GET_CMD | IPI_REPL,
			LIF_CMD, ip_sioctl_get_token, NULL },
	/* 137 */ { SIOCSLIFSUBNET, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_subnet, ip_sioctl_subnet_restart },
	/* 138 */ { SIOCGLIFSUBNET, sizeof (struct lifreq),
			IPI_GET_CMD | IPI_REPL,
			LIF_CMD, ip_sioctl_get_subnet, NULL },
	/* 139 */ { SIOCSLIFLNKINFO, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_lnkinfo, NULL },

	/* 140 */ { SIOCGLIFLNKINFO, sizeof (struct lifreq),
			IPI_GET_CMD | IPI_REPL,
			LIF_CMD, ip_sioctl_get_lnkinfo, NULL },
	/* 141 */ { SIOCLIFDELND, sizeof (struct lifreq), IPI_PRIV,
			LIF_CMD, ip_siocdelndp_v6, NULL },
	/* 142 */ { SIOCLIFGETND, sizeof (struct lifreq), IPI_GET_CMD,
			LIF_CMD, ip_siocqueryndp_v6, NULL },
	/* 143 */ { SIOCLIFSETND, sizeof (struct lifreq), IPI_PRIV,
			LIF_CMD, ip_siocsetndp_v6, NULL },
	/* 144 */ { SIOCTMYADDR, sizeof (struct sioc_addrreq), IPI_GET_CMD,
			MISC_CMD, ip_sioctl_tmyaddr, NULL },
	/* 145 */ { SIOCTONLINK, sizeof (struct sioc_addrreq), IPI_GET_CMD,
			MISC_CMD, ip_sioctl_tonlink, NULL },
	/* 146 */ { SIOCTMYSITE, sizeof (struct sioc_addrreq), 0,
			MISC_CMD, ip_sioctl_tmysite, NULL },
	/* 147 */ { SIOCGTUNPARAM, sizeof (struct iftun_req), IPI_REPL,
			TUN_CMD, ip_sioctl_tunparam, NULL },
	/* 148 */ { SIOCSTUNPARAM, sizeof (struct iftun_req),
			IPI_PRIV | IPI_WR,
			TUN_CMD, ip_sioctl_tunparam, NULL },

	/* IPSECioctls handled in ip_sioctl_copyin_setup itself */
	/* 149 */ { SIOCFIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },
	/* 150 */ { SIOCSIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },
	/* 151 */ { SIOCDIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },
	/* 152 */ { SIOCLIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },

	/* 153 */ { SIOCLIFFAILOVER, sizeof (struct lifreq),
			IPI_PRIV | IPI_WR | IPI_REPL,
			LIF_CMD, ip_sioctl_move, ip_sioctl_move },
	/* 154 */ { SIOCLIFFAILBACK, sizeof (struct lifreq),
			IPI_PRIV | IPI_WR | IPI_REPL,
			LIF_CMD, ip_sioctl_move, ip_sioctl_move },
	/* 155 */ { SIOCSLIFGROUPNAME, sizeof (struct lifreq),
			IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_groupname, ip_sioctl_groupname },
	/* 156 */ { SIOCGLIFGROUPNAME, sizeof (struct lifreq),
			IPI_GET_CMD | IPI_REPL,
			LIF_CMD, ip_sioctl_get_groupname, NULL },
	/* 157 */ { SIOCGLIFOINDEX, sizeof (struct lifreq),
			IPI_GET_CMD | IPI_REPL,
			LIF_CMD, ip_sioctl_get_oindex, NULL },

	/* Leave 158-160 unused; used to be SIOC*IFARP ioctls */
	/* 158 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 159 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 160 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },

	/* 161 */ { SIOCSLIFOINDEX, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_slifoindex, NULL },

	/* These are handled in ip_sioctl_copyin_setup itself */
	/* 162 */ { SIOCGIP6ADDRPOLICY, 0, IPI_NULL_BCONT,
			MISC_CMD, NULL, NULL },
	/* 163 */ { SIOCSIP6ADDRPOLICY, 0, IPI_PRIV | IPI_NULL_BCONT,
			MISC_CMD, NULL, NULL },
	/* 164 */ { SIOCGDSTINFO, 0, IPI_GET_CMD, MISC_CMD, NULL, NULL },

	/* 165 */ { SIOCGLIFCONF, 0, IPI_GET_CMD, MISC_CMD | IPI_REPL,
			ip_sioctl_get_lifconf, NULL },

	/* 166 */ { SIOCSXARP, sizeof (struct xarpreq), IPI_PRIV,
			MISC_CMD, ip_sioctl_xarp, NULL },
	/* 167 */ { SIOCGXARP, sizeof (struct xarpreq), IPI_GET_CMD | IPI_REPL,
			MISC_CMD, ip_sioctl_xarp, NULL },
	/* 168 */ { SIOCDXARP, sizeof (struct xarpreq), IPI_PRIV,
			MISC_CMD, ip_sioctl_xarp, NULL },

	/* SIOCPOPSOCKFS is not handled by IP */
	/* 169 */ { IPI_DONTCARE /* SIOCPOPSOCKFS */, 0, 0, 0, NULL, NULL },

	/* 170 */ { SIOCGLIFZONE, sizeof (struct lifreq),
			IPI_GET_CMD | IPI_REPL,
			LIF_CMD, ip_sioctl_get_lifzone, NULL },
	/* 171 */ { SIOCSLIFZONE, sizeof (struct lifreq),
			IPI_PRIV | IPI_WR | IPI_REPL,
			LIF_CMD, ip_sioctl_slifzone,
			ip_sioctl_slifzone_restart },
	/* 172-174 are SCTP ioctls and not handled by IP */
	/* 172 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 173 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 174 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 175 */ { SIOCGLIFUSESRC, sizeof (struct lifreq),
			IPI_GET_CMD, LIF_CMD,
			ip_sioctl_get_lifusesrc, 0 },
	/* 176 */ { SIOCSLIFUSESRC, sizeof (struct lifreq),
			IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_slifusesrc,
			NULL },
	/* 177 */ { SIOCGLIFSRCOF, 0, IPI_GET_CMD, MISC_CMD,
			ip_sioctl_get_lifsrcof, NULL },
	/* 178 */ { SIOCGMSFILTER, sizeof (struct group_filter), IPI_GET_CMD,
			MISC_CMD, ip_sioctl_msfilter, NULL },
	/* 179 */ { SIOCSMSFILTER, sizeof (struct group_filter), IPI_WR,
			MISC_CMD, ip_sioctl_msfilter, NULL },
	/* 180 */ { SIOCGIPMSFILTER, sizeof (struct ip_msfilter), IPI_GET_CMD,
			MISC_CMD, ip_sioctl_msfilter, NULL },
	/* 181 */ { SIOCSIPMSFILTER, sizeof (struct ip_msfilter), IPI_WR,
			MISC_CMD, ip_sioctl_msfilter, NULL },
	/* 182 */ { SIOCSIPMPFAILBACK, sizeof (int), IPI_PRIV, MISC_CMD,
			ip_sioctl_set_ipmpfailback, NULL }
};

int ip_ndx_ioctl_count = sizeof (ip_ndx_ioctl_table) / sizeof (ip_ioctl_cmd_t);

/*
 * Overflow table for ioctls whose low command byte clashes with an entry
 * in ip_ndx_ioctl_table; searched sequentially on an ndx-table mismatch.
 */
ip_ioctl_cmd_t ip_misc_ioctl_table[] = {
	{ OSIOCGTUNPARAM, sizeof (struct old_iftun_req),
		IPI_GET_CMD | IPI_REPL, TUN_CMD, ip_sioctl_tunparam, NULL },
	{ OSIOCSTUNPARAM, sizeof (struct old_iftun_req), IPI_PRIV | IPI_WR,
		TUN_CMD, ip_sioctl_tunparam, NULL },
	{ I_LINK,	0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL },
	{ I_UNLINK,	0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL },
	{ I_PLINK,	0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL },
	{ I_PUNLINK,	0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL },
	{ ND_GET,	0, IPI_PASS_DOWN, 0, NULL, NULL },
	{ ND_SET,	0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL },
	{ IP_IOCTL,	0, 0, 0, NULL, NULL },
	{ SIOCGETVIFCNT, sizeof (struct
sioc_vif_req), IPI_REPL | IPI_GET_CMD,
	    MISC_CMD, mrt_ioctl},
	{ SIOCGETSGCNT, sizeof (struct sioc_sg_req), IPI_REPL | IPI_GET_CMD,
	    MISC_CMD, mrt_ioctl},
	{ SIOCGETLSGCNT, sizeof (struct sioc_lsg_req), IPI_REPL | IPI_GET_CMD,
	    MISC_CMD, mrt_ioctl}
};

/* Number of entries in ip_misc_ioctl_table[] above. */
int ip_misc_ioctl_count =
    sizeof (ip_misc_ioctl_table) / sizeof (ip_ioctl_cmd_t);

static idl_t *conn_drain_list;		/* The array of conn drain lists */
static uint_t conn_drain_list_cnt;	/* Total count of conn_drain_list */
static int	conn_drain_list_index;	/* Next drain_list to be used */
int	conn_drain_nthreads;		/* Number of drainers reqd. */
					/* Settable in /etc/system */

/* Defined in ip_ire.c */
extern uint32_t ip_ire_max_bucket_cnt, ip6_ire_max_bucket_cnt;
extern uint32_t ip_ire_min_bucket_cnt, ip6_ire_min_bucket_cnt;
extern uint32_t ip_ire_mem_ratio, ip_ire_cpu_ratio;

/* IRE type bit -> printable name mappings; terminated by a zero entry. */
static nv_t	ire_nv_arr[] = {
	{ IRE_BROADCAST, "BROADCAST" },
	{ IRE_LOCAL, "LOCAL" },
	{ IRE_LOOPBACK, "LOOPBACK" },
	{ IRE_CACHE, "CACHE" },
	{ IRE_DEFAULT, "DEFAULT" },
	{ IRE_PREFIX, "PREFIX" },
	{ IRE_IF_NORESOLVER, "IF_NORESOL" },
	{ IRE_IF_RESOLVER, "IF_RESOLV" },
	{ IRE_HOST, "HOST" },
	{ IRE_HOST_REDIRECT, "HOST_REDIRECT" },
	{ 0 }
};

nv_t *ire_nv_tbl = ire_nv_arr;

/* Defined in ip_if.c, protect the list of IPsec capable ills */
extern krwlock_t ipsec_capab_ills_lock;

/* Packet dropper for IP IPsec processing failures */
ipdropper_t ip_dropper;

/* Simple ICMP IP Header Template */
static ipha_t icmp_ipha = {
	IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
};

struct module_info ip_mod_info = {
	IP_MOD_ID, IP_MOD_NAME, 1, INFPSZ, 65536, 1024
};

/* Read-side STREAMS entry points for the IP module. */
static struct qinit rinit = {
	(pfi_t)ip_rput, NULL, ip_open, ip_close, NULL,
	&ip_mod_info
};

static struct qinit winit = {
(pfi_t)ip_wput, (pfi_t)ip_wsrv, ip_open, ip_close, NULL, 1499 &ip_mod_info 1500 }; 1501 1502 static struct qinit lrinit = { 1503 (pfi_t)ip_lrput, NULL, ip_open, ip_close, NULL, 1504 &ip_mod_info 1505 }; 1506 1507 static struct qinit lwinit = { 1508 (pfi_t)ip_lwput, NULL, ip_open, ip_close, NULL, 1509 &ip_mod_info 1510 }; 1511 1512 struct streamtab ipinfo = { 1513 &rinit, &winit, &lrinit, &lwinit 1514 }; 1515 1516 #ifdef DEBUG 1517 static boolean_t skip_sctp_cksum = B_FALSE; 1518 #endif 1519 /* 1520 * Copy an M_CTL-tagged message, preserving reference counts appropriately. 1521 */ 1522 mblk_t * 1523 ip_copymsg(mblk_t *mp) 1524 { 1525 mblk_t *nmp; 1526 ipsec_info_t *in; 1527 1528 if (mp->b_datap->db_type != M_CTL) 1529 return (copymsg(mp)); 1530 1531 in = (ipsec_info_t *)mp->b_rptr; 1532 1533 /* 1534 * Note that M_CTL is also used for delivering ICMP error messages 1535 * upstream to transport layers. 1536 */ 1537 if (in->ipsec_info_type != IPSEC_OUT && 1538 in->ipsec_info_type != IPSEC_IN) 1539 return (copymsg(mp)); 1540 1541 nmp = copymsg(mp->b_cont); 1542 1543 if (in->ipsec_info_type == IPSEC_OUT) 1544 return (ipsec_out_tag(mp, nmp)); 1545 else 1546 return (ipsec_in_tag(mp, nmp)); 1547 } 1548 1549 /* Generate an ICMP fragmentation needed message. 
 */
static void
icmp_frag_needed(queue_t *q, mblk_t *mp, int mtu)
{
	icmph_t	icmph;
	mblk_t *first_mp;
	boolean_t mctl_present;

	EXTRACT_PKT_MP(mp, first_mp, mctl_present);

	/* Rate/validity check; on failure release any M_CTL prefix too. */
	if (!(mp = icmp_pkt_err_ok(mp))) {
		if (mctl_present)
			freeb(first_mp);
		return;
	}

	bzero(&icmph, sizeof (icmph_t));
	icmph.icmph_type = ICMP_DEST_UNREACHABLE;
	icmph.icmph_code = ICMP_FRAGMENTATION_NEEDED;
	/* NOTE(review): mtu is truncated to 16 bits here — RFC 1191 field */
	icmph.icmph_du_mtu = htons((uint16_t)mtu);
	BUMP_MIB(&icmp_mib, icmpOutFragNeeded);
	BUMP_MIB(&icmp_mib, icmpOutDestUnreachs);
	icmp_pkt(q, first_mp, &icmph, sizeof (icmph_t), mctl_present);
}

/*
 * icmp_inbound deals with ICMP messages in the following ways.
 *
 * 1) It needs to send a reply back and possibly delivering it
 *    to the "interested" upper clients.
 * 2) It needs to send it to the upper clients only.
 * 3) It needs to change some values in IP only.
 * 4) It needs to change some values in IP and upper layers e.g. TCP.
 *
 * We need to accommodate icmp messages coming in clear until we get
 * everything secure from the wire. If icmp_accept_clear_messages
 * is zero we check with the global policy and act accordingly. If
 * it is non-zero, we accept the message without any checks. But
 * *this does not mean* that this will be delivered to the upper
 * clients. By accepting we might send replies back, change our MTU
 * value etc. but delivery to the ULP/clients depends on their policy
 * dispositions.
 *
 * We handle the above 4 cases in the context of IPSEC in the
 * following way :
 *
 * 1) Send the reply back in the same way as the request came in.
 *    If it came in encrypted, it goes out encrypted. If it came in
 *    clear, it goes out in clear. Thus, this will prevent chosen
 *    plain text attack.
 * 2) The client may or may not expect things to come in secure.
 *    If it comes in secure, the policy constraints are checked
 *    before delivering it to the upper layers. If it comes in
 *    clear, ipsec_inbound_accept_clear will decide whether to
 *    accept this in clear or not. In both the cases, if the returned
 *    message (IP header + 8 bytes) that caused the icmp message has
 *    AH/ESP headers, it is sent up to AH/ESP for validation before
 *    sending up. If there are only 8 bytes of returned message, then
 *    upper client will not be notified.
 * 3) Check with global policy to see whether it matches the constraints.
 *    But this will be done only if icmp_accept_messages_in_clear is
 *    zero.
 * 4) If we need to change both in IP and ULP, then the decision taken
 *    while affecting the values in IP and while delivering up to TCP
 *    should be the same.
 *
 * There are two cases.
 *
 * a) If we reject data at the IP layer (ipsec_check_global_policy()
 *    failed), we will not deliver it to the ULP, even though they
 *    are *willing* to accept in *clear*. This is fine as our global
 *    disposition to icmp messages asks us to reject the datagram.
 *
 * b) If we accept data at the IP layer (ipsec_check_global_policy()
 *    succeeded or icmp_accept_messages_in_clear is 1), and not able
 *    to deliver it to ULP (policy failed), it can lead to
 *    consistency problems. The cases known at this time are
 *    ICMP_DESTINATION_UNREACHABLE messages with following code
 *    values :
 *
 *    - ICMP_FRAGMENTATION_NEEDED : IP adapts to the new value
 *      and Upper layer rejects. Then the communication will
 *      come to a stop. This is solved by making similar decisions
 *      at both levels. Currently, when we are unable to deliver
 *      to the Upper Layer (due to policy failures) while IP has
 *      adjusted ire_max_frag, the next outbound datagram would
 *      generate a local ICMP_FRAGMENTATION_NEEDED message - which
 *      will be with the right level of protection. Thus the right
 *      value will be communicated even if we are not able to
 *      communicate when we get from the wire initially. But this
 *      assumes there would be at least one outbound datagram after
 *      IP has adjusted its ire_max_frag value. To make things
 *      simpler, we accept in clear after the validation of
 *      AH/ESP headers.
 *
 *    - Other ICMP ERRORS : We may not be able to deliver it to the
 *      upper layer depending on the level of protection the upper
 *      layer expects and the disposition in ipsec_inbound_accept_clear().
 *      ipsec_inbound_accept_clear() decides whether a given ICMP error
 *      should be accepted in clear when the Upper layer expects secure.
 *      Thus the communication may get aborted by some bad ICMP
 *      packets.
 *
 * IPQoS Notes:
 * The only instance when a packet is sent for processing is when there
 * isn't an ICMP client and if we are interested in it.
 * If there is a client, IPPF processing will take place in the
 * ip_fanout_proto routine.
 *
 * Zones notes:
 * The packet is only processed in the context of the specified zone: typically
 * only this zone will reply to an echo request, and only interested clients in
 * this zone will receive a copy of the packet. This means that the caller must
 * call icmp_inbound() for each relevant zone.
 */
/*
 * Core IPv4 ICMP input path: validates the ICMP header and checksum,
 * updates MIB counters, fans copies out to interested ULP clients, and
 * either acts on the message itself (redirect, MTU update, error fanout)
 * or rewrites request types into replies sent back down the write queue.
 */
static void
icmp_inbound(queue_t *q, mblk_t *mp, boolean_t broadcast, ill_t *ill,
    int sum_valid, uint32_t sum, boolean_t mctl_present, boolean_t ip_policy,
    ill_t *recv_ill, zoneid_t zoneid)
{
	icmph_t	*icmph;
	ipha_t	*ipha;
	int	iph_hdr_length;
	int	hdr_length;
	boolean_t	interested;
	uint32_t	ts;
	uchar_t	*wptr;
	ipif_t	*ipif;
	mblk_t *first_mp;
	ipsec_in_t *ii;
	ire_t *src_ire;
	boolean_t onlink;
	timestruc_t now;
	uint32_t ill_index;

	ASSERT(ill != NULL);

	/* first_mp keeps the whole chain (possible M_CTL + data). */
	first_mp = mp;
	if (mctl_present) {
		mp = first_mp->b_cont;
		ASSERT(mp != NULL);
	}

	ipha = (ipha_t *)mp->b_rptr;
	if (icmp_accept_clear_messages == 0) {
		first_mp = ipsec_check_global_policy(first_mp, NULL,
		    ipha, NULL, mctl_present);
		if (first_mp == NULL)
			return;
	}

	/*
	 * On a labeled system, we have to check whether the zone itself is
	 * permitted to receive raw traffic.
	 */
	if (is_system_labeled()) {
		if (zoneid == ALL_ZONES)
			zoneid = tsol_packet_to_zoneid(mp);
		if (!tsol_can_accept_raw(mp, B_FALSE)) {
			ip1dbg(("icmp_inbound: zone %d can't receive raw",
			    zoneid));
			BUMP_MIB(&icmp_mib, icmpInErrors);
			freemsg(first_mp);
			return;
		}
	}

	/*
	 * We have accepted the ICMP message. It means that we will
	 * respond to the packet if needed. It may not be delivered
	 * to the upper client depending on the policy constraints
	 * and the disposition in ipsec_inbound_accept_clear.
	 */

	ASSERT(ill != NULL);

	BUMP_MIB(&icmp_mib, icmpInMsgs);
	iph_hdr_length = IPH_HDR_LENGTH(ipha);
	if ((mp->b_wptr - mp->b_rptr) < (iph_hdr_length + ICMPH_SIZE)) {
		/* Last chance to get real. */
		if (!pullupmsg(mp, iph_hdr_length + ICMPH_SIZE)) {
			BUMP_MIB(&icmp_mib, icmpInErrors);
			freemsg(first_mp);
			return;
		}
		/* Refresh iph following the pullup. */
		ipha = (ipha_t *)mp->b_rptr;
	}
	/* ICMP header checksum, including checksum field, should be zero. */
	if (sum_valid ? (sum != 0 && sum != 0xFFFF) :
	    IP_CSUM(mp, iph_hdr_length, 0)) {
		BUMP_MIB(&icmp_mib, icmpInCksumErrs);
		freemsg(first_mp);
		return;
	}
	/* The IP header will always be a multiple of four bytes */
	icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
	ip2dbg(("icmp_inbound: type %d code %d\n", icmph->icmph_type,
	    icmph->icmph_code));
	wptr = (uchar_t *)icmph + ICMPH_SIZE;
	/* We will set "interested" to "true" if we want a copy */
	interested = B_FALSE;
	switch (icmph->icmph_type) {
	case ICMP_ECHO_REPLY:
		BUMP_MIB(&icmp_mib, icmpInEchoReps);
		break;
	case ICMP_DEST_UNREACHABLE:
		if (icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED)
			BUMP_MIB(&icmp_mib, icmpInFragNeeded);
		interested = B_TRUE;	/* Pass up to transport */
		BUMP_MIB(&icmp_mib, icmpInDestUnreachs);
		break;
	case ICMP_SOURCE_QUENCH:
		interested = B_TRUE;	/* Pass up to transport */
		BUMP_MIB(&icmp_mib, icmpInSrcQuenchs);
		break;
	case ICMP_REDIRECT:
		if (!ip_ignore_redirect)
			interested = B_TRUE;
		BUMP_MIB(&icmp_mib, icmpInRedirects);
		break;
	case ICMP_ECHO_REQUEST:
		/*
		 * Whether to respond to echo requests that come in as IP
		 * broadcasts or as IP multicast is subject to debate
		 * (what isn't?). We aim to please, you pick it.
		 * Default is do it.
		 */
		if (!broadcast && !CLASSD(ipha->ipha_dst)) {
			/* unicast: always respond */
			interested = B_TRUE;
		} else if (CLASSD(ipha->ipha_dst)) {
			/* multicast: respond based on tunable */
			interested = ip_g_resp_to_echo_mcast;
		} else if (broadcast) {
			/* broadcast: respond based on tunable */
			interested = ip_g_resp_to_echo_bcast;
		}
		BUMP_MIB(&icmp_mib, icmpInEchos);
		break;
	case ICMP_ROUTER_ADVERTISEMENT:
	case ICMP_ROUTER_SOLICITATION:
		break;
	case ICMP_TIME_EXCEEDED:
		interested = B_TRUE;	/* Pass up to transport */
		BUMP_MIB(&icmp_mib, icmpInTimeExcds);
		break;
	case ICMP_PARAM_PROBLEM:
		interested = B_TRUE;	/* Pass up to transport */
		BUMP_MIB(&icmp_mib, icmpInParmProbs);
		break;
	case ICMP_TIME_STAMP_REQUEST:
		/* Response to Time Stamp Requests is local policy. */
		if (ip_g_resp_to_timestamp &&
		    /* So is whether to respond if it was an IP broadcast. */
		    (!broadcast || ip_g_resp_to_timestamp_bcast)) {
			int tstamp_len = 3 * sizeof (uint32_t);

			if (wptr + tstamp_len > mp->b_wptr) {
				if (!pullupmsg(mp, wptr + tstamp_len -
				    mp->b_rptr)) {
					BUMP_MIB(&ip_mib, ipInDiscards);
					freemsg(first_mp);
					return;
				}
				/* Refresh ipha following the pullup. */
				ipha = (ipha_t *)mp->b_rptr;
				icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
				wptr = (uchar_t *)icmph + ICMPH_SIZE;
			}
			interested = B_TRUE;
		}
		BUMP_MIB(&icmp_mib, icmpInTimestamps);
		break;
	case ICMP_TIME_STAMP_REPLY:
		BUMP_MIB(&icmp_mib, icmpInTimestampReps);
		break;
	case ICMP_INFO_REQUEST:
		/* Per RFC 1122 3.2.2.7, ignore this. */
	case ICMP_INFO_REPLY:
		break;
	case ICMP_ADDRESS_MASK_REQUEST:
		if ((ip_respond_to_address_mask_broadcast || !broadcast) &&
		    /* TODO m_pullup of complete header? */
		    (mp->b_datap->db_lim - wptr) >= IP_ADDR_LEN)
			interested = B_TRUE;
		BUMP_MIB(&icmp_mib, icmpInAddrMasks);
		break;
	case ICMP_ADDRESS_MASK_REPLY:
		BUMP_MIB(&icmp_mib, icmpInAddrMaskReps);
		break;
	default:
		interested = B_TRUE;	/* Pass up to transport */
		BUMP_MIB(&icmp_mib, icmpInUnknowns);
		break;
	}
	/* See if there is an ICMP client. */
	if (ipcl_proto_search(IPPROTO_ICMP) != NULL) {
		/* If there is an ICMP client and we want one too, copy it. */
		mblk_t *first_mp1;

		if (!interested) {
			ip_fanout_proto(q, first_mp, ill, ipha, 0, mctl_present,
			    ip_policy, recv_ill, zoneid);
			return;
		}
		first_mp1 = ip_copymsg(first_mp);
		if (first_mp1 != NULL) {
			ip_fanout_proto(q, first_mp1, ill, ipha,
			    0, mctl_present, ip_policy, recv_ill, zoneid);
		}
	} else if (!interested) {
		freemsg(first_mp);
		return;
	} else {
		/*
		 * Initiate policy processing for this packet if ip_policy
		 * is true.
		 */
		if (IPP_ENABLED(IPP_LOCAL_IN) && ip_policy) {
			ill_index = ill->ill_phyint->phyint_ifindex;
			ip_process(IPP_LOCAL_IN, &mp, ill_index);
			if (mp == NULL) {
				if (mctl_present) {
					freeb(first_mp);
				}
				BUMP_MIB(&icmp_mib, icmpInErrors);
				return;
			}
		}
	}
	/* We want to do something with it. */
	/* Check db_ref to make sure we can modify the packet. */
	if (mp->b_datap->db_ref > 1) {
		mblk_t	*first_mp1;

		first_mp1 = ip_copymsg(first_mp);
		freemsg(first_mp);
		if (!first_mp1) {
			BUMP_MIB(&icmp_mib, icmpOutDrops);
			return;
		}
		first_mp = first_mp1;
		if (mctl_present) {
			mp = first_mp->b_cont;
			ASSERT(mp != NULL);
		} else {
			mp = first_mp;
		}
		/* Re-derive header pointers into the fresh copy. */
		ipha = (ipha_t *)mp->b_rptr;
		icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
		wptr = (uchar_t *)icmph + ICMPH_SIZE;
	}
	switch (icmph->icmph_type) {
	case ICMP_ADDRESS_MASK_REQUEST:
		ipif = ipif_lookup_remote(ill, ipha->ipha_src, zoneid);
		if (ipif == NULL) {
			freemsg(first_mp);
			return;
		}
		/*
		 * outgoing interface must be IPv4
		 */
		ASSERT(ipif != NULL && !ipif->ipif_isv6);
		icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY;
		bcopy(&ipif->ipif_net_mask, wptr, IP_ADDR_LEN);
		ipif_refrele(ipif);
		BUMP_MIB(&icmp_mib, icmpOutAddrMaskReps);
		break;
	case ICMP_ECHO_REQUEST:
		icmph->icmph_type = ICMP_ECHO_REPLY;
		BUMP_MIB(&icmp_mib, icmpOutEchoReps);
		break;
	case ICMP_TIME_STAMP_REQUEST: {
		uint32_t *tsp;

		icmph->icmph_type = ICMP_TIME_STAMP_REPLY;
		tsp = (uint32_t *)wptr;
		tsp++;		/* Skip past 'originate time' */
		/* Compute # of milliseconds since midnight */
		gethrestime(&now);
		ts = (now.tv_sec % (24 * 60 * 60)) * 1000 +
		    now.tv_nsec / (NANOSEC / MILLISEC);
		*tsp++ = htonl(ts);	/* Lay in 'receive time' */
		*tsp++ = htonl(ts);	/* Lay in 'send time' */
		BUMP_MIB(&icmp_mib, icmpOutTimestampReps);
		break;
	}
	default:
		/* Error types: ipha now points at the embedded IP header. */
		ipha = (ipha_t *)&icmph[1];
		if ((uchar_t *)&ipha[1] > mp->b_wptr) {
			if (!pullupmsg(mp, (uchar_t *)&ipha[1] - mp->b_rptr)) {
				BUMP_MIB(&ip_mib, ipInDiscards);
				freemsg(first_mp);
				return;
			}
			icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
			ipha = (ipha_t *)&icmph[1];
		}
		if ((IPH_HDR_VERSION(ipha) != IPV4_VERSION)) {
			BUMP_MIB(&ip_mib, ipInDiscards);
			freemsg(first_mp);
			return;
		}
		hdr_length = IPH_HDR_LENGTH(ipha);
		if (hdr_length < sizeof (ipha_t)) {
			BUMP_MIB(&ip_mib, ipInDiscards);
			freemsg(first_mp);
			return;
		}
		if ((uchar_t *)ipha + hdr_length > mp->b_wptr) {
			if (!pullupmsg(mp,
			    (uchar_t *)ipha + hdr_length - mp->b_rptr)) {
				BUMP_MIB(&ip_mib, ipInDiscards);
				freemsg(first_mp);
				return;
			}
			icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
			ipha = (ipha_t *)&icmph[1];
		}
		switch (icmph->icmph_type) {
		case ICMP_REDIRECT:
			/*
			 * As there is no upper client to deliver, we don't
			 * need the first_mp any more.
			 */
			if (mctl_present) {
				freeb(first_mp);
			}
			icmp_redirect(mp);
			return;
		case ICMP_DEST_UNREACHABLE:
			if (icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED) {
				if (!icmp_inbound_too_big(icmph, ipha)) {
					freemsg(first_mp);
					return;
				}
			}
			/* FALLTHRU */
		default :
			/*
			 * IPQoS notes: Since we have already done IPQoS
			 * processing we don't want to do it again in
			 * the fanout routines called by
			 * icmp_inbound_error_fanout, hence the last
			 * argument, ip_policy, is B_FALSE.
			 */
			icmp_inbound_error_fanout(q, ill, first_mp, icmph,
			    ipha, iph_hdr_length, hdr_length, mctl_present,
			    B_FALSE, recv_ill, zoneid);
		}
		return;
	}
	/* Send out an ICMP packet */
	icmph->icmph_checksum = 0;
	icmph->icmph_checksum = IP_CSUM(mp, iph_hdr_length, 0);
	if (icmph->icmph_checksum == 0)
		icmph->icmph_checksum = 0xFFFF;
	if (broadcast || CLASSD(ipha->ipha_dst)) {
		ipif_t	*ipif_chosen;
		/*
		 * Make it look like it was directed to us, so we don't look
		 * like a fool with a broadcast or multicast source address.
		 */
		ipif = ipif_lookup_remote(ill, ipha->ipha_src, zoneid);
		/*
		 * Make sure that we haven't grabbed an interface that's DOWN.
		 */
		if (ipif != NULL) {
			ipif_chosen = ipif_select_source(ipif->ipif_ill,
			    ipha->ipha_src, zoneid);
			if (ipif_chosen != NULL) {
				ipif_refrele(ipif);
				ipif = ipif_chosen;
			}
		}
		if (ipif == NULL) {
			ip0dbg(("icmp_inbound: "
			    "No source for broadcast/multicast:\n"
			    "\tsrc 0x%x dst 0x%x ill %p "
			    "ipif_lcl_addr 0x%x\n",
			    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
			    (void *)ill,
			    ill->ill_ipif->ipif_lcl_addr));
			freemsg(first_mp);
			return;
		}
		ASSERT(ipif != NULL && !ipif->ipif_isv6);
		ipha->ipha_dst = ipif->ipif_src_addr;
		ipif_refrele(ipif);
	}
	/* Reset time to live. */
	ipha->ipha_ttl = ip_def_ttl;
	{
		/* Swap source and destination addresses */
		ipaddr_t tmp;

		tmp = ipha->ipha_src;
		ipha->ipha_src = ipha->ipha_dst;
		ipha->ipha_dst = tmp;
	}
	ipha->ipha_ident = 0;
	if (!IS_SIMPLE_IPH(ipha))
		icmp_options_update(ipha);

	/*
	 * ICMP echo replies should go out on the same interface
	 * the request came on as probes used by in.mpathd for detecting
	 * NIC failures are ECHO packets. We turn-off load spreading
	 * by setting ipsec_in_attach_if to B_TRUE, which is copied
	 * to ipsec_out_attach_if by ipsec_in_to_out called later in this
	 * function. This is in turn handled by ip_wput and ip_newroute
	 * to make sure that the packet goes out on the interface it came
	 * in on. If we don't turnoff load spreading, the packets might get
	 * dropped if there are no non-FAILED/INACTIVE interfaces for it
	 * to go out and in.mpathd would wrongly detect a failure or
	 * mis-detect a NIC failure for link failure. As load spreading
	 * can happen only if ill_group is not NULL, we do only for
	 * that case and this does not affect the normal case.
	 *
	 * We turn off load spreading only on echo packets that came from
	 * on-link hosts. If the interface route has been deleted, this will
	 * not be enforced as we can't do much. For off-link hosts, as the
	 * default routes in IPv4 does not typically have an ire_ipif
	 * pointer, we can't force MATCH_IRE_ILL in ip_wput/ip_newroute.
	 * Moreover, expecting a default route through this interface may
	 * not be correct. We use ipha_dst because of the swap above.
	 */
	onlink = B_FALSE;
	if (icmph->icmph_type == ICMP_ECHO_REPLY && ill->ill_group != NULL) {
		/*
		 * First, we need to make sure that it is not one of our
		 * local addresses. If we set onlink when it is one of
		 * our local addresses, we will end up creating IRE_CACHES
		 * for one of our local addresses. Then, we will never
		 * accept packets for them afterwards.
		 */
		src_ire = ire_ctable_lookup(ipha->ipha_dst, 0, IRE_LOCAL,
		    NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE);
		if (src_ire == NULL) {
			ipif = ipif_get_next_ipif(NULL, ill);
			if (ipif == NULL) {
				BUMP_MIB(&ip_mib, ipInDiscards);
				freemsg(mp);
				return;
			}
			src_ire = ire_ftable_lookup(ipha->ipha_dst, 0, 0,
			    IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0,
			    NULL, MATCH_IRE_ILL | MATCH_IRE_TYPE);
			ipif_refrele(ipif);
			if (src_ire != NULL) {
				onlink = B_TRUE;
				ire_refrele(src_ire);
			}
		} else {
			ire_refrele(src_ire);
		}
	}
	if (!mctl_present) {
		/*
		 * This packet should go out the same way as it
		 * came in i.e in clear. To make sure that global
		 * policy will not be applied to this in ip_wput_ire,
		 * we attach a IPSEC_IN mp and clear ipsec_in_secure.
		 */
		ASSERT(first_mp == mp);
		if ((first_mp = ipsec_in_alloc(B_TRUE)) == NULL) {
			BUMP_MIB(&ip_mib, ipInDiscards);
			freemsg(mp);
			return;
		}
		ii = (ipsec_in_t *)first_mp->b_rptr;

		/* This is not a secure packet */
		ii->ipsec_in_secure = B_FALSE;
		if (onlink) {
			ii->ipsec_in_attach_if = B_TRUE;
			ii->ipsec_in_ill_index =
			    ill->ill_phyint->phyint_ifindex;
			ii->ipsec_in_rill_index =
			    recv_ill->ill_phyint->phyint_ifindex;
		}
		first_mp->b_cont = mp;
	} else if (onlink) {
		ii = (ipsec_in_t *)first_mp->b_rptr;
		ii->ipsec_in_attach_if = B_TRUE;
		ii->ipsec_in_ill_index = ill->ill_phyint->phyint_ifindex;
		ii->ipsec_in_rill_index = recv_ill->ill_phyint->phyint_ifindex;
	} else {
		ii = (ipsec_in_t *)first_mp->b_rptr;
	}
	ii->ipsec_in_zoneid = zoneid;
	ASSERT(zoneid != ALL_ZONES);
	if (!ipsec_in_to_out(first_mp, ipha, NULL)) {
		BUMP_MIB(&ip_mib, ipInDiscards);
		return;
	}
	BUMP_MIB(&icmp_mib, icmpOutMsgs);
	put(WR(q), first_mp);
}

/* Table from RFC 1191 */
static int icmp_frag_size_table[] =
{ 32000, 17914, 8166, 4352, 2002, 1496, 1006, 508, 296, 68 };

/*
 * Process received ICMP Packet too big.
 * After updating any IRE it does the fanout to any matching transport streams.
 * Assumes the message has been pulled up till the IP header that caused
 * the error.
 *
 * Returns B_FALSE on failure and B_TRUE on success.
 */
static boolean_t
icmp_inbound_too_big(icmph_t *icmph, ipha_t *ipha)
{
	ire_t	*ire, *first_ire;
	int	mtu;
	int	hdr_length;

	ASSERT(icmph->icmph_type == ICMP_DEST_UNREACHABLE &&
	    icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED);

	hdr_length = IPH_HDR_LENGTH(ipha);

	first_ire = ire_ctable_lookup(ipha->ipha_dst, 0, IRE_CACHE, NULL,
	    ALL_ZONES, NULL, MATCH_IRE_TYPE);

	if (!first_ire) {
		ip1dbg(("icmp_inbound_too_big: no route for 0x%x\n",
		    ntohl(ipha->ipha_dst)));
		return (B_FALSE);
	}
	/* Drop if the original packet contained a source route */
	if (ip_source_route_included(ipha)) {
		ire_refrele(first_ire);
		return (B_FALSE);
	}
	/* Check for MTU discovery advice as described in RFC 1191 */
	mtu = ntohs(icmph->icmph_du_mtu);
	/* Walk all cache entries in the bucket that match this destination. */
	rw_enter(&first_ire->ire_bucket->irb_lock, RW_READER);
	for (ire = first_ire; ire != NULL && ire->ire_addr == ipha->ipha_dst;
	    ire = ire->ire_next) {
		mutex_enter(&ire->ire_lock);
		if (icmph->icmph_du_zero == 0 && mtu > 68) {
			/* Reduce the IRE max frag value as advised. */
			ip1dbg(("Received mtu from router: %d (was %d)\n",
			    mtu, ire->ire_max_frag));
			ire->ire_max_frag = MIN(ire->ire_max_frag, mtu);
		} else {
			uint32_t length;
			int	i;

			/*
			 * Use the table from RFC 1191 to figure out
			 * the next "plateau" based on the length in
			 * the original IP packet.
			 */
			length = ntohs(ipha->ipha_length);
			if (ire->ire_max_frag <= length &&
			    ire->ire_max_frag >= length - hdr_length) {
				/*
				 * Handle broken BSD 4.2 systems that
				 * return the wrong iph_length in ICMP
				 * errors.
				 */
				ip1dbg(("Wrong mtu: sent %d, ire %d\n",
				    length, ire->ire_max_frag));
				length -= hdr_length;
			}
			for (i = 0; i < A_CNT(icmp_frag_size_table); i++) {
				if (length > icmp_frag_size_table[i])
					break;
			}
			if (i == A_CNT(icmp_frag_size_table)) {
				/* Smaller than 68! */
				ip1dbg(("Too big for packet size %d\n",
				    length));
				ire->ire_max_frag = MIN(ire->ire_max_frag, 576);
				ire->ire_frag_flag = 0;
			} else {
				mtu = icmp_frag_size_table[i];
				ip1dbg(("Calculated mtu %d, packet size %d, "
				    "before %d", mtu, length,
				    ire->ire_max_frag));
				ire->ire_max_frag = MIN(ire->ire_max_frag, mtu);
				ip1dbg((", after %d\n", ire->ire_max_frag));
			}
			/* Record the new max frag size for the ULP. */
			icmph->icmph_du_zero = 0;
			icmph->icmph_du_mtu =
			    htons((uint16_t)ire->ire_max_frag);
		}
		mutex_exit(&ire->ire_lock);
	}
	rw_exit(&first_ire->ire_bucket->irb_lock);
	ire_refrele(first_ire);
	return (B_TRUE);
}

/*
 * If the packet in error is Self-Encapsulated, icmp_inbound_error_fanout
 * calls this function.
 *
 * Strips the outer (option-less) IP header that was prepended to a
 * self-encapsulated packet, returning the pulled-up mblk on success or
 * NULL (after bumping ipInDiscards) if the pullup fails.
 */
static mblk_t *
icmp_inbound_self_encap_error(mblk_t *mp, int iph_hdr_length, int hdr_length)
{
	ipha_t *ipha;
	icmph_t *icmph;
	ipha_t *in_ipha;
	int length;

	ASSERT(mp->b_datap->db_type == M_DATA);

	/*
	 * For Self-encapsulated packets, we added an extra IP header
	 * without the options. Inner IP header is the one from which
	 * the outer IP header was formed. Thus, we need to remove the
	 * outer IP header. To do this, we pullup the whole message
	 * and overlay whatever follows the outer IP header over the
	 * outer IP header.
	 */

	if (!pullupmsg(mp, -1)) {
		BUMP_MIB(&ip_mib, ipInDiscards);
		return (NULL);
	}

	icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
	ipha = (ipha_t *)&icmph[1];
	in_ipha = (ipha_t *)((uchar_t *)ipha + hdr_length);

	/*
	 * The length that we want to overlay is following the inner
	 * IP header. Subtracting the IP header + icmp header + outer
	 * IP header's length should give us the length that we want to
	 * overlay.
	 */
	length = msgdsize(mp) - iph_hdr_length - sizeof (icmph_t) -
	    hdr_length;
	/*
	 * Overlay whatever follows the inner header over the
	 * outer header.
	 */
	bcopy((uchar_t *)in_ipha, (uchar_t *)ipha, length);

	/* Set the wptr to account for the outer header */
	mp->b_wptr -= hdr_length;
	return (mp);
}

/*
 * Try to pass the ICMP message upstream in case the ULP cares.
 *
 * If the packet that caused the ICMP error is secure, we send
 * it to AH/ESP to make sure that the attached packet has a
 * valid association. ipha in the code below points to the
 * IP header of the packet that caused the error.
 *
 * We handle ICMP_FRAGMENTATION_NEEDED(IFN) message differently
 * in the context of IPSEC. Normally we tell the upper layer
 * whenever we send the ire (including ip_bind), the IPSEC header
 * length in ire_ipsec_overhead. TCP can deduce the MSS as it
 * has both the MTU (ire_max_frag) and the ire_ipsec_overhead.
 * Similarly, we pass the new MTU icmph_du_mtu and TCP does the
 * same thing. As TCP has the IPSEC options size that needs to be
 * adjusted, we just pass the MTU unchanged.
 *
 * IFN could have been generated locally or by some router.
 *
 * LOCAL : *ip_wput_ire -> icmp_frag_needed could have generated this.
2320 * This happens because IP adjusted its value of MTU on an 2321 * earlier IFN message and could not tell the upper layer, 2322 * the new adjusted value of MTU e.g. Packet was encrypted 2323 * or there was not enough information to fanout to upper 2324 * layers. Thus on the next outbound datagram, ip_wput_ire 2325 * generates the IFN, where IPSEC processing has *not* been 2326 * done. 2327 * 2328 * *ip_wput_ire_fragmentit -> ip_wput_frag -> icmp_frag_needed 2329 * could have generated this. This happens because ire_max_frag 2330 * value in IP was set to a new value, while the IPSEC processing 2331 * was being done and after we made the fragmentation check in 2332 * ip_wput_ire. Thus on return from IPSEC processing, 2333 * ip_wput_ipsec_out finds that the new length is > ire_max_frag 2334 * and generates the IFN. As IPSEC processing is over, we fanout 2335 * to AH/ESP to remove the header. 2336 * 2337 * In both these cases, ipsec_in_loopback will be set indicating 2338 * that IFN was generated locally. 2339 * 2340 * ROUTER : IFN could be secure or non-secure. 2341 * 2342 * * SECURE : We use the IPSEC_IN to fanout to AH/ESP if the 2343 * packet in error has AH/ESP headers to validate the AH/ESP 2344 * headers. AH/ESP will verify whether there is a valid SA or 2345 * not and send it back. We will fanout again if we have more 2346 * data in the packet. 2347 * 2348 * If the packet in error does not have AH/ESP, we handle it 2349 * like any other case. 2350 * 2351 * * NON_SECURE : If the packet in error has AH/ESP headers, 2352 * we attach a dummy ipsec_in and send it up to AH/ESP 2353 * for validation. AH/ESP will verify whether there is a 2354 * valid SA or not and send it back. We will fanout again if 2355 * we have more data in the packet. 2356 * 2357 * If the packet in error does not have AH/ESP, we handle it 2358 * like any other case. 
 */
static void
icmp_inbound_error_fanout(queue_t *q, ill_t *ill, mblk_t *mp,
    icmph_t *icmph, ipha_t *ipha, int iph_hdr_length, int hdr_length,
    boolean_t mctl_present, boolean_t ip_policy, ill_t *recv_ill,
    zoneid_t zoneid)
{
	uint16_t *up;	/* Pointer to ports in ULP header */
	uint32_t ports;	/* reversed ports for fanout */
	ipha_t ripha;	/* With reversed addresses */
	mblk_t *first_mp;
	ipsec_in_t *ii;
	tcph_t	*tcph;
	conn_t	*connp;

	/*
	 * first_mp is the start of the chain: either the M_CTL carrying
	 * the ipsec_in_t (when mctl_present) or the data mblk itself.
	 */
	first_mp = mp;
	if (mctl_present) {
		mp = first_mp->b_cont;
		ASSERT(mp != NULL);

		ii = (ipsec_in_t *)first_mp->b_rptr;
		ASSERT(ii->ipsec_in_type == IPSEC_IN);
	} else {
		ii = NULL;
	}

	/* Fan out on the transport protocol of the packet in error. */
	switch (ipha->ipha_protocol) {
	case IPPROTO_UDP:
		/*
		 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
		 * transport header.
		 */
		if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
		    mp->b_wptr) {
			if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length +
			    ICMP_MIN_TP_HDR_LEN - mp->b_rptr)) {
				BUMP_MIB(&ip_mib, ipInDiscards);
				goto drop_pkt;
			}
			/*
			 * pullupmsg may have reallocated the data block;
			 * recompute the header pointers from b_rptr.
			 */
			icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
			ipha = (ipha_t *)&icmph[1];
		}
		up = (uint16_t *)((uchar_t *)ipha + hdr_length);

		/*
		 * Attempt to find a client stream based on port.
		 * Note that we do a reverse lookup since the header is
		 * in the form we sent it out.
		 * The ripha header is only used for the IP_UDP_MATCH and we
		 * only set the src and dst addresses and protocol.
		 */
		ripha.ipha_src = ipha->ipha_dst;
		ripha.ipha_dst = ipha->ipha_src;
		ripha.ipha_protocol = ipha->ipha_protocol;
		/* Swap the port pair as well for the reverse match. */
		((uint16_t *)&ports)[0] = up[1];
		((uint16_t *)&ports)[1] = up[0];
		ip2dbg(("icmp_inbound_error: UDP %x:%d to %x:%d: %d/%d\n",
		    ntohl(ipha->ipha_src), ntohs(up[0]),
		    ntohl(ipha->ipha_dst), ntohs(up[1]),
		    icmph->icmph_type, icmph->icmph_code));

		/* Have to change db_type after any pullupmsg */
		DB_TYPE(mp) = M_CTL;

		ip_fanout_udp(q, first_mp, ill, &ripha, ports, B_FALSE, 0,
		    mctl_present, ip_policy, recv_ill, zoneid);
		return;

	case IPPROTO_TCP:
		/*
		 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
		 * transport header.
		 */
		if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
		    mp->b_wptr) {
			if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length +
			    ICMP_MIN_TP_HDR_LEN - mp->b_rptr)) {
				BUMP_MIB(&ip_mib, ipInDiscards);
				goto drop_pkt;
			}
			/* Recompute pointers after possible reallocation. */
			icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
			ipha = (ipha_t *)&icmph[1];
		}
		/*
		 * Find a TCP client stream for this packet.
		 * Note that we do a reverse lookup since the header is
		 * in the form we sent it out.
		 */
		tcph = (tcph_t *)((uchar_t *)ipha + hdr_length);
		connp = ipcl_tcp_lookup_reversed_ipv4(ipha, tcph, TCPS_LISTEN);
		if (connp == NULL) {
			BUMP_MIB(&ip_mib, ipInDiscards);
			goto drop_pkt;
		}

		/* Have to change db_type after any pullupmsg */
		DB_TYPE(mp) = M_CTL;
		/* Hand off to the connection's squeue for TCP processing. */
		squeue_fill(connp->conn_sqp, first_mp, tcp_input,
		    connp, SQTAG_TCP_INPUT_ICMP_ERR);
		return;

	case IPPROTO_SCTP:
		/*
		 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
		 * transport header.
		 */
		if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
		    mp->b_wptr) {
			if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length +
			    ICMP_MIN_TP_HDR_LEN - mp->b_rptr)) {
				BUMP_MIB(&ip_mib, ipInDiscards);
				goto drop_pkt;
			}
			/* Recompute pointers after possible reallocation. */
			icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
			ipha = (ipha_t *)&icmph[1];
		}
		up = (uint16_t *)((uchar_t *)ipha + hdr_length);
		/*
		 * Find a SCTP client stream for this packet.
		 * Note that we do a reverse lookup since the header is
		 * in the form we sent it out.
		 * The ripha header is only used for the matching and we
		 * only set the src and dst addresses, protocol, and version.
		 */
		ripha.ipha_src = ipha->ipha_dst;
		ripha.ipha_dst = ipha->ipha_src;
		ripha.ipha_protocol = ipha->ipha_protocol;
		ripha.ipha_version_and_hdr_length =
		    ipha->ipha_version_and_hdr_length;
		((uint16_t *)&ports)[0] = up[1];
		((uint16_t *)&ports)[1] = up[0];

		/* Have to change db_type after any pullupmsg */
		DB_TYPE(mp) = M_CTL;
		ip_fanout_sctp(first_mp, recv_ill, &ripha, ports, 0,
		    mctl_present, ip_policy, 0, zoneid);
		return;

	case IPPROTO_ESP:
	case IPPROTO_AH: {
		int ipsec_rc;

		/*
		 * We need a IPSEC_IN in the front to fanout to AH/ESP.
		 * We will re-use the IPSEC_IN if it is already present as
		 * AH/ESP will not affect any fields in the IPSEC_IN for
		 * ICMP errors. If there is no IPSEC_IN, allocate a new
		 * one and attach it in the front.
		 */
		if (ii != NULL) {
			/*
			 * ip_fanout_proto_again converts the ICMP errors
			 * that come back from AH/ESP to M_DATA so that
			 * if it is non-AH/ESP and we do a pullupmsg in
			 * this function, it would work. Convert it back
			 * to M_CTL before we send up as this is a ICMP
			 * error. This could have been generated locally or
			 * by some router. Validate the inner IPSEC
			 * headers.
			 *
			 * NOTE : ill_index is used by ip_fanout_proto_again
			 * to locate the ill.
			 */
			ASSERT(ill != NULL);
			ii->ipsec_in_ill_index =
			    ill->ill_phyint->phyint_ifindex;
			ii->ipsec_in_rill_index =
			    recv_ill->ill_phyint->phyint_ifindex;
			DB_TYPE(first_mp->b_cont) = M_CTL;
		} else {
			/*
			 * IPSEC_IN is not present. We attach a ipsec_in
			 * message and send up to IPSEC for validating
			 * and removing the IPSEC headers. Clear
			 * ipsec_in_secure so that when we return
			 * from IPSEC, we don't mistakenly think that this
			 * is a secure packet came from the network.
			 *
			 * NOTE : ill_index is used by ip_fanout_proto_again
			 * to locate the ill.
			 */
			ASSERT(first_mp == mp);
			first_mp = ipsec_in_alloc(B_TRUE);
			if (first_mp == NULL) {
				freemsg(mp);
				BUMP_MIB(&ip_mib, ipInDiscards);
				return;
			}
			ii = (ipsec_in_t *)first_mp->b_rptr;

			/* This is not a secure packet */
			ii->ipsec_in_secure = B_FALSE;
			first_mp->b_cont = mp;
			DB_TYPE(mp) = M_CTL;
			ASSERT(ill != NULL);
			ii->ipsec_in_ill_index =
			    ill->ill_phyint->phyint_ifindex;
			ii->ipsec_in_rill_index =
			    recv_ill->ill_phyint->phyint_ifindex;
		}
		ip2dbg(("icmp_inbound_error: ipsec\n"));

		if (!ipsec_loaded()) {
			ip_proto_not_sup(q, first_mp, 0, zoneid);
			return;
		}

		if (ipha->ipha_protocol == IPPROTO_ESP)
			ipsec_rc = ipsecesp_icmp_error(first_mp);
		else
			ipsec_rc = ipsecah_icmp_error(first_mp);
		/* On failure AH/ESP has consumed the message. */
		if (ipsec_rc == IPSEC_STATUS_FAILED)
			return;

		ip_fanout_proto_again(first_mp, ill, recv_ill, NULL);
		return;
	}
	default:
		/*
		 * The ripha header is only used for the lookup and we
		 * only set the src and dst addresses and protocol.
		 */
		ripha.ipha_src = ipha->ipha_dst;
		ripha.ipha_dst = ipha->ipha_src;
		ripha.ipha_protocol = ipha->ipha_protocol;
		ip2dbg(("icmp_inbound_error: proto %d %x to %x: %d/%d\n",
		    ripha.ipha_protocol, ntohl(ipha->ipha_src),
		    ntohl(ipha->ipha_dst),
		    icmph->icmph_type, icmph->icmph_code));
		if (ipha->ipha_protocol == IPPROTO_ENCAP) {
			ipha_t *in_ipha;

			/* Make sure the full inner IP header is pulled up. */
			if ((uchar_t *)ipha + hdr_length + sizeof (ipha_t) >
			    mp->b_wptr) {
				if (!pullupmsg(mp, (uchar_t *)ipha +
				    hdr_length + sizeof (ipha_t) -
				    mp->b_rptr)) {

					BUMP_MIB(&ip_mib, ipInDiscards);
					goto drop_pkt;
				}
				icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
				ipha = (ipha_t *)&icmph[1];
			}
			/*
			 * Caller has verified that length has to be
			 * at least the size of IP header.
			 */
			ASSERT(hdr_length >= sizeof (ipha_t));
			/*
			 * Check the sanity of the inner IP header like
			 * we did for the outer header.
			 */
			in_ipha = (ipha_t *)((uchar_t *)ipha + hdr_length);
			if ((IPH_HDR_VERSION(in_ipha) != IPV4_VERSION)) {
				BUMP_MIB(&ip_mib, ipInDiscards);
				goto drop_pkt;
			}
			if (IPH_HDR_LENGTH(in_ipha) < sizeof (ipha_t)) {
				BUMP_MIB(&ip_mib, ipInDiscards);
				goto drop_pkt;
			}
			/* Check for Self-encapsulated tunnels */
			if (in_ipha->ipha_src == ipha->ipha_src &&
			    in_ipha->ipha_dst == ipha->ipha_dst) {

				/* Strip one level of self-encapsulation. */
				mp = icmp_inbound_self_encap_error(mp,
				    iph_hdr_length, hdr_length);
				if (mp == NULL)
					goto drop_pkt;
				icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
				ipha = (ipha_t *)&icmph[1];
				hdr_length = IPH_HDR_LENGTH(ipha);
				/*
				 * The packet in error is self-encapsualted.
				 * And we are finding it further encapsulated
				 * which we could not have possibly generated.
				 */
				if (ipha->ipha_protocol == IPPROTO_ENCAP) {
					BUMP_MIB(&ip_mib, ipInDiscards);
					goto drop_pkt;
				}
				/* Recurse to fan out the inner packet. */
				icmp_inbound_error_fanout(q, ill, first_mp,
				    icmph, ipha, iph_hdr_length, hdr_length,
				    mctl_present, ip_policy, recv_ill, zoneid);
				return;
			}
		}
		if ((ipha->ipha_protocol == IPPROTO_ENCAP ||
		    ipha->ipha_protocol == IPPROTO_IPV6) &&
		    icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED &&
		    ii != NULL &&
		    ii->ipsec_in_loopback &&
		    ii->ipsec_in_secure) {
			/*
			 * For IP tunnels that get a looped-back
			 * ICMP_FRAGMENTATION_NEEDED message, adjust the
			 * reported new MTU to take into account the IPsec
			 * headers protecting this configured tunnel.
			 *
			 * This allows the tunnel module (tun.c) to blindly
			 * accept the MTU reported in an ICMP "too big"
			 * message.
			 *
			 * Non-looped back ICMP messages will just be
			 * handled by the security protocols (if needed),
			 * and the first subsequent packet will hit this
			 * path.
			 */
			icmph->icmph_du_mtu = htons(ntohs(icmph->icmph_du_mtu) -
			    ipsec_in_extra_length(first_mp));
		}
		/* Have to change db_type after any pullupmsg */
		DB_TYPE(mp) = M_CTL;

		ip_fanout_proto(q, first_mp, ill, &ripha, 0, mctl_present,
		    ip_policy, recv_ill, zoneid);
		return;
	}
	/* NOTREACHED */
drop_pkt:;
	ip1dbg(("icmp_inbound_error_fanout: drop pkt\n"));
	freemsg(first_mp);
}

/*
 * Common IP options parser.
 *
 * Setup routine: fill in *optp with options-parsing state, then
 * tail-call ipoptp_next to return the first option.
2689 */ 2690 uint8_t 2691 ipoptp_first(ipoptp_t *optp, ipha_t *ipha) 2692 { 2693 uint32_t totallen; /* total length of all options */ 2694 2695 totallen = ipha->ipha_version_and_hdr_length - 2696 (uint8_t)((IP_VERSION << 4) + IP_SIMPLE_HDR_LENGTH_IN_WORDS); 2697 totallen <<= 2; 2698 optp->ipoptp_next = (uint8_t *)(&ipha[1]); 2699 optp->ipoptp_end = optp->ipoptp_next + totallen; 2700 optp->ipoptp_flags = 0; 2701 return (ipoptp_next(optp)); 2702 } 2703 2704 /* 2705 * Common IP options parser: extract next option. 2706 */ 2707 uint8_t 2708 ipoptp_next(ipoptp_t *optp) 2709 { 2710 uint8_t *end = optp->ipoptp_end; 2711 uint8_t *cur = optp->ipoptp_next; 2712 uint8_t opt, len, pointer; 2713 2714 /* 2715 * If cur > end already, then the ipoptp_end or ipoptp_next pointer 2716 * has been corrupted. 2717 */ 2718 ASSERT(cur <= end); 2719 2720 if (cur == end) 2721 return (IPOPT_EOL); 2722 2723 opt = cur[IPOPT_OPTVAL]; 2724 2725 /* 2726 * Skip any NOP options. 2727 */ 2728 while (opt == IPOPT_NOP) { 2729 cur++; 2730 if (cur == end) 2731 return (IPOPT_EOL); 2732 opt = cur[IPOPT_OPTVAL]; 2733 } 2734 2735 if (opt == IPOPT_EOL) 2736 return (IPOPT_EOL); 2737 2738 /* 2739 * Option requiring a length. 2740 */ 2741 if ((cur + 1) >= end) { 2742 optp->ipoptp_flags |= IPOPTP_ERROR; 2743 return (IPOPT_EOL); 2744 } 2745 len = cur[IPOPT_OLEN]; 2746 if (len < 2) { 2747 optp->ipoptp_flags |= IPOPTP_ERROR; 2748 return (IPOPT_EOL); 2749 } 2750 optp->ipoptp_cur = cur; 2751 optp->ipoptp_len = len; 2752 optp->ipoptp_next = cur + len; 2753 if (cur + len > end) { 2754 optp->ipoptp_flags |= IPOPTP_ERROR; 2755 return (IPOPT_EOL); 2756 } 2757 2758 /* 2759 * For the options which require a pointer field, make sure 2760 * its there, and make sure it points to either something 2761 * inside this option, or the end of the option. 
2762 */ 2763 switch (opt) { 2764 case IPOPT_RR: 2765 case IPOPT_TS: 2766 case IPOPT_LSRR: 2767 case IPOPT_SSRR: 2768 if (len <= IPOPT_OFFSET) { 2769 optp->ipoptp_flags |= IPOPTP_ERROR; 2770 return (opt); 2771 } 2772 pointer = cur[IPOPT_OFFSET]; 2773 if (pointer - 1 > len) { 2774 optp->ipoptp_flags |= IPOPTP_ERROR; 2775 return (opt); 2776 } 2777 break; 2778 } 2779 2780 /* 2781 * Sanity check the pointer field based on the type of the 2782 * option. 2783 */ 2784 switch (opt) { 2785 case IPOPT_RR: 2786 case IPOPT_SSRR: 2787 case IPOPT_LSRR: 2788 if (pointer < IPOPT_MINOFF_SR) 2789 optp->ipoptp_flags |= IPOPTP_ERROR; 2790 break; 2791 case IPOPT_TS: 2792 if (pointer < IPOPT_MINOFF_IT) 2793 optp->ipoptp_flags |= IPOPTP_ERROR; 2794 /* 2795 * Note that the Internet Timestamp option also 2796 * contains two four bit fields (the Overflow field, 2797 * and the Flag field), which follow the pointer 2798 * field. We don't need to check that these fields 2799 * fall within the length of the option because this 2800 * was implicitely done above. We've checked that the 2801 * pointer value is at least IPOPT_MINOFF_IT, and that 2802 * it falls within the option. Since IPOPT_MINOFF_IT > 2803 * IPOPT_POS_OV_FLG, we don't need the explicit check. 2804 */ 2805 ASSERT(len > IPOPT_POS_OV_FLG); 2806 break; 2807 } 2808 2809 return (opt); 2810 } 2811 2812 /* 2813 * Use the outgoing IP header to create an IP_OPTIONS option the way 2814 * it was passed down from the application. 2815 */ 2816 int 2817 ip_opt_get_user(const ipha_t *ipha, uchar_t *buf) 2818 { 2819 ipoptp_t opts; 2820 const uchar_t *opt; 2821 uint8_t optval; 2822 uint8_t optlen; 2823 uint32_t len = 0; 2824 uchar_t *buf1 = buf; 2825 2826 buf += IP_ADDR_LEN; /* Leave room for final destination */ 2827 len += IP_ADDR_LEN; 2828 bzero(buf1, IP_ADDR_LEN); 2829 2830 /* 2831 * OK to cast away const here, as we don't store through the returned 2832 * opts.ipoptp_cur pointer. 
2833 */ 2834 for (optval = ipoptp_first(&opts, (ipha_t *)ipha); 2835 optval != IPOPT_EOL; 2836 optval = ipoptp_next(&opts)) { 2837 int off; 2838 2839 opt = opts.ipoptp_cur; 2840 optlen = opts.ipoptp_len; 2841 switch (optval) { 2842 case IPOPT_SSRR: 2843 case IPOPT_LSRR: 2844 2845 /* 2846 * Insert ipha_dst as the first entry in the source 2847 * route and move down the entries on step. 2848 * The last entry gets placed at buf1. 2849 */ 2850 buf[IPOPT_OPTVAL] = optval; 2851 buf[IPOPT_OLEN] = optlen; 2852 buf[IPOPT_OFFSET] = optlen; 2853 2854 off = optlen - IP_ADDR_LEN; 2855 if (off < 0) { 2856 /* No entries in source route */ 2857 break; 2858 } 2859 /* Last entry in source route */ 2860 bcopy(opt + off, buf1, IP_ADDR_LEN); 2861 off -= IP_ADDR_LEN; 2862 2863 while (off > 0) { 2864 bcopy(opt + off, 2865 buf + off + IP_ADDR_LEN, 2866 IP_ADDR_LEN); 2867 off -= IP_ADDR_LEN; 2868 } 2869 /* ipha_dst into first slot */ 2870 bcopy(&ipha->ipha_dst, 2871 buf + off + IP_ADDR_LEN, 2872 IP_ADDR_LEN); 2873 buf += optlen; 2874 len += optlen; 2875 break; 2876 2877 case IPOPT_COMSEC: 2878 case IPOPT_SECURITY: 2879 /* if passing up a label is not ok, then remove */ 2880 if (is_system_labeled()) 2881 break; 2882 /* FALLTHROUGH */ 2883 default: 2884 bcopy(opt, buf, optlen); 2885 buf += optlen; 2886 len += optlen; 2887 break; 2888 } 2889 } 2890 done: 2891 /* Pad the resulting options */ 2892 while (len & 0x3) { 2893 *buf++ = IPOPT_EOL; 2894 len++; 2895 } 2896 return (len); 2897 } 2898 2899 /* 2900 * Update any record route or timestamp options to include this host. 2901 * Reverse any source route option. 2902 * This routine assumes that the options are well formed i.e. that they 2903 * have already been checked. 
 */
static void
icmp_options_update(ipha_t *ipha)
{
	ipoptp_t	opts;
	uchar_t		*opt;
	uint8_t		optval;
	ipaddr_t	src;		/* Our local address */
	ipaddr_t	dst;

	ip2dbg(("icmp_options_update\n"));
	src = ipha->ipha_src;
	dst = ipha->ipha_dst;

	for (optval = ipoptp_first(&opts, ipha);
	    optval != IPOPT_EOL;
	    optval = ipoptp_next(&opts)) {
		/* Caller guarantees the options are well formed. */
		ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0);
		opt = opts.ipoptp_cur;
		ip2dbg(("icmp_options_update: opt %d, len %d\n",
		    optval, opts.ipoptp_len));
		switch (optval) {
			int off1, off2;
		case IPOPT_SSRR:
		case IPOPT_LSRR:
			/*
			 * Reverse the source route. The first entry
			 * should be the next to last one in the current
			 * source route (the last entry is our address).
			 * The last entry should be the final destination.
			 */
			off1 = IPOPT_MINOFF_SR - 1;
			off2 = opt[IPOPT_OFFSET] - IP_ADDR_LEN - 1;
			if (off2 < 0) {
				/* No entries in source route */
				ip1dbg((
				    "icmp_options_update: bad src route\n"));
				break;
			}
			/*
			 * Swap ipha_dst with the last source-route entry,
			 * using dst as the temporary.
			 */
			bcopy((char *)opt + off2, &dst, IP_ADDR_LEN);
			bcopy(&ipha->ipha_dst, (char *)opt + off2, IP_ADDR_LEN);
			bcopy(&dst, &ipha->ipha_dst, IP_ADDR_LEN);
			off2 -= IP_ADDR_LEN;

			/* Reverse the remaining entries in place, pairwise. */
			while (off1 < off2) {
				bcopy((char *)opt + off1, &src, IP_ADDR_LEN);
				bcopy((char *)opt + off2, (char *)opt + off1,
				    IP_ADDR_LEN);
				bcopy(&src, (char *)opt + off2, IP_ADDR_LEN);
				off1 += IP_ADDR_LEN;
				off2 -= IP_ADDR_LEN;
			}
			/* Reset the pointer to the start of the route. */
			opt[IPOPT_OFFSET] = IPOPT_MINOFF_SR;
			break;
		}
	}
}

/*
 * Process received ICMP Redirect messages.
 */
/* ARGSUSED */
static void
icmp_redirect(mblk_t *mp)
{
	ipha_t	*ipha;
	int	iph_hdr_length;
	icmph_t	*icmph;
	ipha_t	*ipha_err;
	ire_t	*ire;
	ire_t	*prev_ire;
	ire_t	*save_ire;
	ipaddr_t  src, dst, gateway;
	iulp_t	ulp_info = { 0 };
	int	error;

	ipha = (ipha_t *)mp->b_rptr;
	iph_hdr_length = IPH_HDR_LENGTH(ipha);
	/* Need the ICMP header plus the embedded IP header of the packet. */
	if (((mp->b_wptr - mp->b_rptr) - iph_hdr_length) <
	    sizeof (icmph_t) + IP_SIMPLE_HDR_LENGTH) {
		BUMP_MIB(&icmp_mib, icmpInErrors);
		freemsg(mp);
		return;
	}
	icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
	ipha_err = (ipha_t *)&icmph[1];
	src = ipha->ipha_src;		/* the old gateway (sender) */
	dst = ipha_err->ipha_dst;	/* destination being redirected */
	gateway = icmph->icmph_rd_gateway;
	/* Make sure the new gateway is reachable somehow. */
	ire = ire_route_lookup(gateway, 0, 0, IRE_INTERFACE, NULL, NULL,
	    ALL_ZONES, NULL, MATCH_IRE_TYPE);
	/*
	 * Make sure we had a route for the dest in question and that
	 * that route was pointing to the old gateway (the source of the
	 * redirect packet.)
	 */
	prev_ire = ire_route_lookup(dst, 0, src, 0, NULL, NULL, ALL_ZONES,
	    NULL, MATCH_IRE_GW);
	/*
	 * Check that
	 *	the redirect was not from ourselves
	 *	the new gateway and the old gateway are directly reachable
	 */
	if (!prev_ire ||
	    !ire ||
	    ire->ire_type == IRE_LOCAL) {
		BUMP_MIB(&icmp_mib, icmpInBadRedirects);
		freemsg(mp);
		if (ire != NULL)
			ire_refrele(ire);
		if (prev_ire != NULL)
			ire_refrele(prev_ire);
		return;
	}

	/*
	 * Should we use the old ULP info to create the new gateway?  From
	 * a user's perspective, we should inherit the info so that it
	 * is a "smooth" transition.  If we do not do that, then new
	 * connections going thru the new gateway will have no route metrics,
	 * which is counter-intuitive to user.  From a network point of
	 * view, this may or may not make sense even though the new gateway
	 * is still directly connected to us so the route metrics should not
	 * change much.
	 *
	 * But if the old ire_uinfo is not initialized, we do another
	 * recursive lookup on the dest using the new gateway.  There may
	 * be a route to that.  If so, use it to initialize the redirect
	 * route.
	 */
	if (prev_ire->ire_uinfo.iulp_set) {
		bcopy(&prev_ire->ire_uinfo, &ulp_info, sizeof (iulp_t));
	} else {
		ire_t *tmp_ire;
		ire_t *sire;

		tmp_ire = ire_ftable_lookup(dst, 0, gateway, 0, NULL, &sire,
		    ALL_ZONES, 0, NULL,
		    (MATCH_IRE_RECURSIVE | MATCH_IRE_GW | MATCH_IRE_DEFAULT));
		if (sire != NULL) {
			bcopy(&sire->ire_uinfo, &ulp_info, sizeof (iulp_t));
			/*
			 * If sire != NULL, ire_ftable_lookup() should not
			 * return a NULL value.
			 */
			ASSERT(tmp_ire != NULL);
			ire_refrele(tmp_ire);
			ire_refrele(sire);
		} else if (tmp_ire != NULL) {
			bcopy(&tmp_ire->ire_uinfo, &ulp_info,
			    sizeof (iulp_t));
			ire_refrele(tmp_ire);
		}
	}
	/* The stale cache entry for the old path is no longer valid. */
	if (prev_ire->ire_type == IRE_CACHE)
		ire_delete(prev_ire);
	ire_refrele(prev_ire);
	/*
	 * TODO: more precise handling for cases 0, 2, 3, the latter two
	 * require TOS routing
	 */
	switch (icmph->icmph_code) {
	case 0:
	case 1:
		/* TODO: TOS specificity for cases 2 and 3 */
	case 2:
	case 3:
		break;
	default:
		freemsg(mp);
		BUMP_MIB(&icmp_mib, icmpInBadRedirects);
		ire_refrele(ire);
		return;
	}
	/*
	 * Create a Route Association.  This will allow us to remember that
	 * someone we believe told us to use the particular gateway.
	 */
	save_ire = ire;
	ire = ire_create(
		(uchar_t *)&dst,			/* dest addr */
		(uchar_t *)&ip_g_all_ones,		/* mask */
		(uchar_t *)&save_ire->ire_src_addr,	/* source addr */
		(uchar_t *)&gateway,			/* gateway addr */
		NULL,					/* no in_srcaddr */
		&save_ire->ire_max_frag,		/* max frag */
		NULL,					/* Fast Path header */
		NULL,					/* no rfq */
		NULL,					/* no stq */
		IRE_HOST_REDIRECT,
		NULL,
		NULL,
		NULL,
		0,
		0,
		0,
		(RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST),
		&ulp_info,
		NULL,
		NULL);

	if (ire == NULL) {
		freemsg(mp);
		ire_refrele(save_ire);
		return;
	}
	error = ire_add(&ire, NULL, NULL, NULL);
	ire_refrele(save_ire);
	if (error == 0) {
		ire_refrele(ire);		/* Held in ire_add_v4 */
		/* tell routing sockets that we received a redirect */
		ip_rts_change(RTM_REDIRECT, dst, gateway, IP_HOST_MASK, 0, src,
		    (RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST), 0,
		    (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_AUTHOR));
	}

	/*
	 * Delete any existing IRE_HOST_REDIRECT for this destination.
	 * This together with the added IRE has the effect of
	 * modifying an existing redirect.
	 */
	prev_ire = ire_ftable_lookup(dst, 0, src, IRE_HOST_REDIRECT, NULL, NULL,
	    ALL_ZONES, 0, NULL, (MATCH_IRE_GW | MATCH_IRE_TYPE));
	if (prev_ire) {
		ire_delete(prev_ire);
		ire_refrele(prev_ire);
	}

	freemsg(mp);
}

/*
 * Generate an ICMP parameter problem message.
3138 */ 3139 static void 3140 icmp_param_problem(queue_t *q, mblk_t *mp, uint8_t ptr) 3141 { 3142 icmph_t icmph; 3143 boolean_t mctl_present; 3144 mblk_t *first_mp; 3145 3146 EXTRACT_PKT_MP(mp, first_mp, mctl_present); 3147 3148 if (!(mp = icmp_pkt_err_ok(mp))) { 3149 if (mctl_present) 3150 freeb(first_mp); 3151 return; 3152 } 3153 3154 bzero(&icmph, sizeof (icmph_t)); 3155 icmph.icmph_type = ICMP_PARAM_PROBLEM; 3156 icmph.icmph_pp_ptr = ptr; 3157 BUMP_MIB(&icmp_mib, icmpOutParmProbs); 3158 icmp_pkt(q, first_mp, &icmph, sizeof (icmph_t), mctl_present); 3159 } 3160 3161 /* 3162 * Build and ship an IPv4 ICMP message using the packet data in mp, and 3163 * the ICMP header pointed to by "stuff". (May be called as writer.) 3164 * Note: assumes that icmp_pkt_err_ok has been called to verify that 3165 * an icmp error packet can be sent. 3166 * Assigns an appropriate source address to the packet. If ipha_dst is 3167 * one of our addresses use it for source. Otherwise pick a source based 3168 * on a route lookup back to ipha_src. 3169 * Note that ipha_src must be set here since the 3170 * packet is likely to arrive on an ill queue in ip_wput() which will 3171 * not set a source address. 3172 */ 3173 static void 3174 icmp_pkt(queue_t *q, mblk_t *mp, void *stuff, size_t len, 3175 boolean_t mctl_present) 3176 { 3177 ipaddr_t dst; 3178 icmph_t *icmph; 3179 ipha_t *ipha; 3180 uint_t len_needed; 3181 size_t msg_len; 3182 mblk_t *mp1; 3183 ipaddr_t src; 3184 ire_t *ire; 3185 mblk_t *ipsec_mp; 3186 ipsec_out_t *io = NULL; 3187 boolean_t xmit_if_on = B_FALSE; 3188 zoneid_t zoneid; 3189 3190 if (mctl_present) { 3191 /* 3192 * If it is : 3193 * 3194 * 1) a IPSEC_OUT, then this is caused by outbound 3195 * datagram originating on this host. IPSEC processing 3196 * may or may not have been done. Refer to comments above 3197 * icmp_inbound_error_fanout for details. 
3198 * 3199 * 2) a IPSEC_IN if we are generating a icmp_message 3200 * for an incoming datagram destined for us i.e called 3201 * from ip_fanout_send_icmp. 3202 */ 3203 ipsec_info_t *in; 3204 ipsec_mp = mp; 3205 mp = ipsec_mp->b_cont; 3206 3207 in = (ipsec_info_t *)ipsec_mp->b_rptr; 3208 ipha = (ipha_t *)mp->b_rptr; 3209 3210 ASSERT(in->ipsec_info_type == IPSEC_OUT || 3211 in->ipsec_info_type == IPSEC_IN); 3212 3213 if (in->ipsec_info_type == IPSEC_IN) { 3214 /* 3215 * Convert the IPSEC_IN to IPSEC_OUT. 3216 */ 3217 if (!ipsec_in_to_out(ipsec_mp, ipha, NULL)) { 3218 BUMP_MIB(&ip_mib, ipOutDiscards); 3219 return; 3220 } 3221 io = (ipsec_out_t *)ipsec_mp->b_rptr; 3222 } else { 3223 ASSERT(in->ipsec_info_type == IPSEC_OUT); 3224 io = (ipsec_out_t *)in; 3225 if (io->ipsec_out_xmit_if) 3226 xmit_if_on = B_TRUE; 3227 /* 3228 * Clear out ipsec_out_proc_begin, so we do a fresh 3229 * ire lookup. 3230 */ 3231 io->ipsec_out_proc_begin = B_FALSE; 3232 } 3233 zoneid = io->ipsec_out_zoneid; 3234 ASSERT(zoneid != ALL_ZONES); 3235 } else { 3236 /* 3237 * This is in clear. The icmp message we are building 3238 * here should go out in clear. 3239 * 3240 * Pardon the convolution of it all, but it's easier to 3241 * allocate a "use cleartext" IPSEC_IN message and convert 3242 * it than it is to allocate a new one. 3243 */ 3244 ipsec_in_t *ii; 3245 ASSERT(DB_TYPE(mp) == M_DATA); 3246 if ((ipsec_mp = ipsec_in_alloc(B_TRUE)) == NULL) { 3247 freemsg(mp); 3248 BUMP_MIB(&ip_mib, ipOutDiscards); 3249 return; 3250 } 3251 ii = (ipsec_in_t *)ipsec_mp->b_rptr; 3252 3253 /* This is not a secure packet */ 3254 ii->ipsec_in_secure = B_FALSE; 3255 if (CONN_Q(q)) { 3256 zoneid = Q_TO_CONN(q)->conn_zoneid; 3257 } else { 3258 zoneid = GLOBAL_ZONEID; 3259 } 3260 ii->ipsec_in_zoneid = zoneid; 3261 ASSERT(zoneid != ALL_ZONES); 3262 ipsec_mp->b_cont = mp; 3263 ipha = (ipha_t *)mp->b_rptr; 3264 /* 3265 * Convert the IPSEC_IN to IPSEC_OUT. 
3266 */ 3267 if (!ipsec_in_to_out(ipsec_mp, ipha, NULL)) { 3268 BUMP_MIB(&ip_mib, ipOutDiscards); 3269 return; 3270 } 3271 io = (ipsec_out_t *)ipsec_mp->b_rptr; 3272 } 3273 3274 /* Remember our eventual destination */ 3275 dst = ipha->ipha_src; 3276 3277 ire = ire_route_lookup(ipha->ipha_dst, 0, 0, (IRE_LOCAL|IRE_LOOPBACK), 3278 NULL, NULL, zoneid, NULL, MATCH_IRE_TYPE); 3279 if (ire != NULL && 3280 (ire->ire_zoneid == zoneid || ire->ire_zoneid == ALL_ZONES)) { 3281 src = ipha->ipha_dst; 3282 } else if (!xmit_if_on) { 3283 if (ire != NULL) 3284 ire_refrele(ire); 3285 ire = ire_route_lookup(dst, 0, 0, 0, NULL, NULL, zoneid, NULL, 3286 (MATCH_IRE_DEFAULT|MATCH_IRE_RECURSIVE|MATCH_IRE_ZONEONLY)); 3287 if (ire == NULL) { 3288 BUMP_MIB(&ip_mib, ipOutNoRoutes); 3289 freemsg(ipsec_mp); 3290 return; 3291 } 3292 src = ire->ire_src_addr; 3293 } else { 3294 ipif_t *ipif = NULL; 3295 ill_t *ill; 3296 /* 3297 * This must be an ICMP error coming from 3298 * ip_mrtun_forward(). The src addr should 3299 * be equal to the IP-addr of the outgoing 3300 * interface. 3301 */ 3302 if (io == NULL) { 3303 /* This is not a IPSEC_OUT type control msg */ 3304 BUMP_MIB(&ip_mib, ipOutNoRoutes); 3305 freemsg(ipsec_mp); 3306 return; 3307 } 3308 ill = ill_lookup_on_ifindex(io->ipsec_out_ill_index, B_FALSE, 3309 NULL, NULL, NULL, NULL); 3310 if (ill != NULL) { 3311 ipif = ipif_get_next_ipif(NULL, ill); 3312 ill_refrele(ill); 3313 } 3314 if (ipif == NULL) { 3315 BUMP_MIB(&ip_mib, ipOutNoRoutes); 3316 freemsg(ipsec_mp); 3317 return; 3318 } 3319 src = ipif->ipif_src_addr; 3320 ipif_refrele(ipif); 3321 } 3322 3323 if (ire != NULL) 3324 ire_refrele(ire); 3325 3326 /* 3327 * Check if we can send back more then 8 bytes in addition 3328 * to the IP header. We will include as much as 64 bytes. 
3329 */ 3330 len_needed = IPH_HDR_LENGTH(ipha); 3331 if (ipha->ipha_protocol == IPPROTO_ENCAP && 3332 (uchar_t *)ipha + len_needed + 1 <= mp->b_wptr) { 3333 len_needed += IPH_HDR_LENGTH(((uchar_t *)ipha + len_needed)); 3334 } 3335 len_needed += ip_icmp_return; 3336 msg_len = msgdsize(mp); 3337 if (msg_len > len_needed) { 3338 (void) adjmsg(mp, len_needed - msg_len); 3339 msg_len = len_needed; 3340 } 3341 mp1 = allocb(sizeof (icmp_ipha) + len, BPRI_HI); 3342 if (mp1 == NULL) { 3343 BUMP_MIB(&icmp_mib, icmpOutErrors); 3344 freemsg(ipsec_mp); 3345 return; 3346 } 3347 /* 3348 * On an unlabeled system, dblks don't necessarily have creds. 3349 */ 3350 ASSERT(!is_system_labeled() || DB_CRED(mp) != NULL); 3351 if (DB_CRED(mp) != NULL) 3352 mblk_setcred(mp1, DB_CRED(mp)); 3353 mp1->b_cont = mp; 3354 mp = mp1; 3355 ASSERT(ipsec_mp->b_datap->db_type == M_CTL && 3356 ipsec_mp->b_rptr == (uint8_t *)io && 3357 io->ipsec_out_type == IPSEC_OUT); 3358 ipsec_mp->b_cont = mp; 3359 3360 /* 3361 * Set ipsec_out_icmp_loopback so we can let the ICMP messages this 3362 * node generates be accepted in peace by all on-host destinations. 3363 * If we do NOT assume that all on-host destinations trust 3364 * self-generated ICMP messages, then rework here, ip6.c, and spd.c. 3365 * (Look for ipsec_out_icmp_loopback). 
	 */
	io->ipsec_out_icmp_loopback = B_TRUE;

	ipha = (ipha_t *)mp->b_rptr;
	mp1->b_wptr = (uchar_t *)ipha + (sizeof (icmp_ipha) + len);
	*ipha = icmp_ipha;
	ipha->ipha_src = src;
	ipha->ipha_dst = dst;
	ipha->ipha_ttl = ip_def_ttl;
	msg_len += sizeof (icmp_ipha) + len;
	if (msg_len > IP_MAXPACKET) {
		/* Truncate the payload so the ICMP packet fits in an IP datagram */
		(void) adjmsg(mp, IP_MAXPACKET - msg_len);
		msg_len = IP_MAXPACKET;
	}
	ipha->ipha_length = htons((uint16_t)msg_len);
	icmph = (icmph_t *)&ipha[1];
	bcopy(stuff, icmph, len);
	icmph->icmph_checksum = 0;
	icmph->icmph_checksum = IP_CSUM(mp, (int32_t)sizeof (ipha_t), 0);
	/* A computed checksum of 0 is transmitted as all-ones */
	if (icmph->icmph_checksum == 0)
		icmph->icmph_checksum = 0xFFFF;
	BUMP_MIB(&icmp_mib, icmpOutMsgs);
	put(q, ipsec_mp);
}

/*
 * Determine if an ICMP error packet can be sent given the rate limit.
 * The limit consists of an average frequency (icmp_pkt_err_interval measured
 * in milliseconds) and a burst size. Burst size number of packets can
 * be sent arbitrarily closely spaced.
 * The state is tracked using two variables to implement an approximate
 * token bucket filter:
 *	icmp_pkt_err_last - lbolt value when the last burst started
 *	icmp_pkt_err_sent - number of packets sent in current burst
 *
 * Returns B_TRUE if the caller should drop (rate limit exceeded),
 * B_FALSE if the packet may be sent.  Setting ip_icmp_err_interval
 * to 0 disables rate limiting entirely.
 */
boolean_t
icmp_err_rate_limit(void)
{
	clock_t now = TICK_TO_MSEC(lbolt);
	uint_t refilled; /* Number of packets refilled in tbf since last */
	/* Snapshot the tunable so a concurrent change can't divide by zero */
	uint_t err_interval = ip_icmp_err_interval; /* Guard against changes */

	if (err_interval == 0)
		return (B_FALSE);

	if (icmp_pkt_err_last > now) {
		/* 100HZ lbolt in ms for 32bit arch wraps every 49.7 days */
		icmp_pkt_err_last = 0;
		icmp_pkt_err_sent = 0;
	}
	/*
	 * If we are in a burst update the token bucket filter.
	 * Update the "last" time to be close to "now" but make sure
	 * we don't lose precision.
	 */
	if (icmp_pkt_err_sent != 0) {
		refilled = (now - icmp_pkt_err_last)/err_interval;
		if (refilled > icmp_pkt_err_sent) {
			icmp_pkt_err_sent = 0;
		} else {
			icmp_pkt_err_sent -= refilled;
			/* Advance "last" only by whole intervals consumed */
			icmp_pkt_err_last += refilled * err_interval;
		}
	}
	if (icmp_pkt_err_sent == 0) {
		/* Start of new burst */
		icmp_pkt_err_last = now;
	}
	if (icmp_pkt_err_sent < ip_icmp_err_burst) {
		icmp_pkt_err_sent++;
		ip1dbg(("icmp_err_rate_limit: %d sent in burst\n",
		    icmp_pkt_err_sent));
		return (B_FALSE);
	}
	ip1dbg(("icmp_err_rate_limit: dropped\n"));
	return (B_TRUE);
}

/*
 * Check if it is ok to send an IPv4 ICMP error packet in
 * response to the IPv4 packet in mp.
 * Free the message and return null if no
 * ICMP error packet should be sent.
 *
 * On success the (possibly pulled-up) mp is returned and ownership
 * stays with the caller; on every refusal path mp is consumed here.
 */
static mblk_t *
icmp_pkt_err_ok(mblk_t *mp)
{
	icmph_t	*icmph;
	ipha_t	*ipha;
	uint_t	len_needed;
	ire_t	*src_ire;
	ire_t	*dst_ire;

	if (!mp)
		return (NULL);
	ipha = (ipha_t *)mp->b_rptr;
	if (ip_csum_hdr(ipha)) {
		BUMP_MIB(&ip_mib, ipInCksumErrs);
		freemsg(mp);
		return (NULL);
	}
	/*
	 * Never send ICMP errors in response to packets to or from a
	 * broadcast address: look up both addresses as IRE_BROADCAST.
	 */
	src_ire = ire_ctable_lookup(ipha->ipha_dst, 0, IRE_BROADCAST,
	    NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE);
	dst_ire = ire_ctable_lookup(ipha->ipha_src, 0, IRE_BROADCAST,
	    NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE);
	if (src_ire != NULL || dst_ire != NULL ||
	    CLASSD(ipha->ipha_dst) ||
	    CLASSD(ipha->ipha_src) ||
	    (ntohs(ipha->ipha_fragment_offset_and_flags) & IPH_OFFSET)) {
		/* Note: only errors to the fragment with offset 0 */
		BUMP_MIB(&icmp_mib, icmpOutDrops);
		freemsg(mp);
		if (src_ire != NULL)
			ire_refrele(src_ire);
		if (dst_ire != NULL)
			ire_refrele(dst_ire);
		return (NULL);
	}
	if (ipha->ipha_protocol == IPPROTO_ICMP) {
		/*
		 * Check the ICMP type.  RFC 1122 sez: don't send ICMP
		 * errors in response to any ICMP errors.
		 */
		len_needed = IPH_HDR_LENGTH(ipha) + ICMPH_SIZE;
		if (mp->b_wptr - mp->b_rptr < len_needed) {
			if (!pullupmsg(mp, len_needed)) {
				BUMP_MIB(&icmp_mib, icmpInErrors);
				freemsg(mp);
				return (NULL);
			}
			/* pullupmsg may have reallocated; refetch header */
			ipha = (ipha_t *)mp->b_rptr;
		}
		icmph = (icmph_t *)
		    (&((char *)ipha)[IPH_HDR_LENGTH(ipha)]);
		switch (icmph->icmph_type) {
		case ICMP_DEST_UNREACHABLE:
		case ICMP_SOURCE_QUENCH:
		case ICMP_TIME_EXCEEDED:
		case ICMP_PARAM_PROBLEM:
		case ICMP_REDIRECT:
			BUMP_MIB(&icmp_mib, icmpOutDrops);
			freemsg(mp);
			return (NULL);
		default:
			break;
		}
	}
	/*
	 * If this is a labeled system, then check to see if we're allowed to
	 * send a response to this particular sender.  If not, then just drop.
	 */
	if (is_system_labeled() && !tsol_can_reply_error(mp)) {
		ip2dbg(("icmp_pkt_err_ok: can't respond to packet\n"));
		BUMP_MIB(&icmp_mib, icmpOutDrops);
		freemsg(mp);
		return (NULL);
	}
	if (icmp_err_rate_limit()) {
		/*
		 * Only send ICMP error packets every so often.
		 * This should be done on a per port/source basis,
		 * but for now this will suffice.
		 */
		freemsg(mp);
		return (NULL);
	}
	return (mp);
}

/*
 * Generate an ICMP redirect message.
 */
static void
icmp_send_redirect(queue_t *q, mblk_t *mp, ipaddr_t gateway)
{
	icmph_t	icmph;

	/*
	 * We are called from ip_rput where we could
	 * not have attached an IPSEC_IN.
	 */
	ASSERT(mp->b_datap->db_type == M_DATA);

	if (!(mp = icmp_pkt_err_ok(mp))) {
		return;
	}

	bzero(&icmph, sizeof (icmph_t));
	icmph.icmph_type = ICMP_REDIRECT;
	/* code 1: redirect datagrams for the host (RFC 792) */
	icmph.icmph_code = 1;
	icmph.icmph_rd_gateway = gateway;
	BUMP_MIB(&icmp_mib, icmpOutRedirects);
	icmp_pkt(q, mp, &icmph, sizeof (icmph_t), B_FALSE);
}

/*
 * Generate an ICMP time exceeded message.
 * The mblk (and any attached IPSEC_IN control mblk) is consumed.
 */
void
icmp_time_exceeded(queue_t *q, mblk_t *mp, uint8_t code)
{
	icmph_t	icmph;
	boolean_t mctl_present;
	mblk_t *first_mp;

	EXTRACT_PKT_MP(mp, first_mp, mctl_present);

	if (!(mp = icmp_pkt_err_ok(mp))) {
		/* Data mblk was freed; release the control mblk too */
		if (mctl_present)
			freeb(first_mp);
		return;
	}

	bzero(&icmph, sizeof (icmph_t));
	icmph.icmph_type = ICMP_TIME_EXCEEDED;
	icmph.icmph_code = code;
	BUMP_MIB(&icmp_mib, icmpOutTimeExcds);
	icmp_pkt(q, first_mp, &icmph, sizeof (icmph_t), mctl_present);
}

/*
 * Generate an ICMP unreachable message.
 * The mblk (and any attached IPSEC_IN control mblk) is consumed.
 */
void
icmp_unreachable(queue_t *q, mblk_t *mp, uint8_t code)
{
	icmph_t	icmph;
	mblk_t *first_mp;
	boolean_t mctl_present;

	EXTRACT_PKT_MP(mp, first_mp, mctl_present);

	if (!(mp = icmp_pkt_err_ok(mp))) {
		/* Data mblk was freed; release the control mblk too */
		if (mctl_present)
			freeb(first_mp);
		return;
	}

	bzero(&icmph, sizeof (icmph_t));
	icmph.icmph_type = ICMP_DEST_UNREACHABLE;
	icmph.icmph_code = code;
	BUMP_MIB(&icmp_mib, icmpOutDestUnreachs);
	ip2dbg(("send icmp destination unreachable code %d\n", code));
	icmp_pkt(q, first_mp, (char *)&icmph, sizeof (icmph_t), mctl_present);
}

/*
 * News from ARP.  ARP sends notification of interesting events down
 * to its clients using M_CTL messages with the interesting ARP packet
 * attached via b_cont.
 * The interesting event from a device comes up the corresponding ARP-IP-DEV
 * queue as opposed to ARP sending the message to all the clients, i.e. all
 * its ARP-IP-DEV instances. Thus, for AR_CN_ANNOUNCE, we must walk the cache
 * table if a cache IRE is found to delete all the entries for the address in
 * the packet.
 */
static void
ip_arp_news(queue_t *q, mblk_t *mp)
{
	arcn_t *arcn;
	arh_t *arh;
	char *cp1;
	uchar_t *cp2;
	ire_t *ire = NULL;
	int i1;
	char hbuf[128];		/* formatted hardware address string */
	char sbuf[16];		/* formatted dotted-quad IP string */
	ipaddr_t src;
	in6_addr_t v6src;
	boolean_t isv6 = B_FALSE;

	/* Too short to be an arcn_t, or no attached ARP packet: not for us */
	if ((mp->b_wptr - mp->b_rptr) < sizeof (arcn_t) || !mp->b_cont) {
		if (q->q_next) {
			putnext(q, mp);
		} else
			freemsg(mp);
		return;
	}
	arh = (arh_t *)mp->b_cont->b_rptr;
	/* Is it one we are interested in? */
	if (BE16_TO_U16(arh->arh_proto) == IP6_DL_SAP) {
		isv6 = B_TRUE;
		/* Sender protocol address follows the sender hardware addr */
		bcopy((char *)&arh[1] + (arh->arh_hlen & 0xFF), &v6src,
		    IPV6_ADDR_LEN);
	} else if (BE16_TO_U16(arh->arh_proto) == IP_ARP_PROTO_TYPE) {
		bcopy((char *)&arh[1] + (arh->arh_hlen & 0xFF), &src,
		    IP_ADDR_LEN);
	} else {
		freemsg(mp);
		return;
	}

	arcn = (arcn_t *)mp->b_rptr;
	switch (arcn->arcn_code) {
	case AR_CN_BOGON:
		/*
		 * Someone is sending ARP packets with a source protocol
		 * address which we have published.  Either they are
		 * pretending to be us, or we have been asked to proxy
		 * for a machine that can do fine for itself, or two
		 * different machines are providing proxy service for the
		 * same protocol address, or something. We try and do
		 * something appropriate here.
		 */
		cp2 = (uchar_t *)&arh[1];
		cp1 = hbuf;
		*cp1 = '\0';
		/* Format the sender hardware address as aa:bb:cc:... */
		for (i1 = arh->arh_hlen; i1--; cp1 += 3)
			(void) sprintf(cp1, "%02x:", *cp2++ & 0xff);
		if (cp1 != hbuf)
			cp1[-1] = '\0';
		/*
		 * NOTE(review): when isv6 is true, "src" was never
		 * initialized above (only v6src was), so sbuf may contain
		 * garbage in the v6 warning messages -- confirm intent.
		 */
		(void) ip_dot_addr(src, sbuf);
		if (isv6)
			ire = ire_cache_lookup_v6(&v6src, ALL_ZONES, NULL);
		else
			ire = ire_cache_lookup(src, ALL_ZONES, NULL);

		if (ire != NULL && IRE_IS_LOCAL(ire)) {
			cmn_err(CE_WARN,
			    "IP: Hardware address '%s' trying"
			    " to be our address %s!",
			    hbuf, sbuf);
		} else {
			cmn_err(CE_WARN,
			    "IP: Proxy ARP problem? "
			    "Hardware address '%s' thinks it is %s",
			    hbuf, sbuf);
		}
		if (ire != NULL)
			ire_refrele(ire);
		break;
	case AR_CN_ANNOUNCE:
		if (isv6) {
			/*
			 * For XRESOLV interfaces.
			 * Delete the IRE cache entry and NCE for this
			 * v6 address
			 */
			ip_ire_clookup_and_delete_v6(&v6src);
			/*
			 * If v6src is a non-zero, it's a router address
			 * as below. Do the same sort of thing to clean
			 * out off-net IRE_CACHE entries that go through
			 * the router.
			 */
			if (!IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
				ire_walk_v6(ire_delete_cache_gw_v6,
				    (char *)&v6src, ALL_ZONES);
			}
			break;
		}
		/*
		 * ARP gives us a copy of any broadcast packet with identical
		 * sender and receiver protocol address, in
		 * case we want to intuit something from it.  Such a packet
		 * usually means that a machine has just come up on the net.
		 * If we have an IRE_CACHE, we blow it away.  This way we will
		 * immediately pick up the rare case of a host changing
		 * hardware address. ip_ire_clookup_and_delete achieves this.
		 *
		 * The address in "src" may be an entry for a router.
		 * (Default router, or non-default router.)  If
		 * that's true, then any off-net IRE_CACHE entries
		 * that go through the router with address "src"
		 * must be clobbered.  Use ire_walk to achieve this
		 * goal.
		 *
		 * It should be possible to determine if the address
		 * in src is or is not for a router.  This way,
		 * the ire_walk() isn't called all of the time here.
		 * Do not pass 'src' value of 0 to ire_delete_cache_gw,
		 * as it would remove all IRE_CACHE entries for onlink
		 * destinations.  All onlink destinations have
		 * ire_gateway_addr == 0.
		 */
		if ((ip_ire_clookup_and_delete(src, NULL) ||
		    (ire = ire_ftable_lookup(src, 0, 0, 0, NULL, NULL, NULL,
		    0, NULL, MATCH_IRE_DSTONLY)) != NULL) && src != 0) {
			ire_walk_v4(ire_delete_cache_gw, (char *)&src,
			    ALL_ZONES);
		}
		/* From ire_ftable_lookup */
		if (ire != NULL)
			ire_refrele(ire);
		break;
	default:
		if (ire != NULL)
			ire_refrele(ire);
		break;
	}
	freemsg(mp);
}

/*
 * Create a mblk suitable for carrying the interface index and/or source link
 * address. This mblk is tagged as an M_CTL and is sent to ULP. This is used
 * when the IP_RECVIF and/or IP_RECVSLLA socket option is set by the user
 * application.
 */
mblk_t *
ip_add_info(mblk_t *data_mp, ill_t *ill, uint_t flags)
{
	mblk_t		*mp;
	in_pktinfo_t	*pinfo;
	ipha_t		*ipha;
	struct ether_header *pether;

	mp = allocb(sizeof (in_pktinfo_t), BPRI_MED);
	if (mp == NULL) {
		/* On allocation failure, hand back the data untouched */
		ip1dbg(("ip_add_info: allocation failure.\n"));
		return (data_mp);
	}

	ipha = (ipha_t *)data_mp->b_rptr;
	pinfo = (in_pktinfo_t *)mp->b_rptr;
	bzero(pinfo, sizeof (in_pktinfo_t));
	pinfo->in_pkt_flags = (uchar_t)flags;
	pinfo->in_pkt_ulp_type = IN_PKTINFO;	/* Tell ULP what type of info */

	if (flags & IPF_RECVIF)
		pinfo->in_pkt_ifindex = ill->ill_phyint->phyint_ifindex;

	/*
	 * Assumes the ethernet header immediately precedes the IP header
	 * in the same dblk; the db_base check below guards against
	 * stepping before the start of the buffer.
	 */
	pether = (struct ether_header *)((char *)ipha
	    - sizeof (struct ether_header));
	/*
	 * Make sure the interface is an ethernet type, since this option
	 * is currently supported only on this type of interface. Also make
	 * sure we are pointing correctly above db_base.
	 */

	if ((flags & IPF_RECVSLLA) &&
	    ((uchar_t *)pether >= data_mp->b_datap->db_base) &&
	    (ill->ill_type == IFT_ETHER) &&
	    (ill->ill_net_type == IRE_IF_RESOLVER)) {

		pinfo->in_pkt_slla.sdl_type = IFT_ETHER;
		bcopy((uchar_t *)pether->ether_shost.ether_addr_octet,
		    (uchar_t *)pinfo->in_pkt_slla.sdl_data, ETHERADDRL);
	} else {
		/*
		 * Clear the bit. Indicate to upper layer that IP is not
		 * sending this ancillary info.
		 */
		pinfo->in_pkt_flags = pinfo->in_pkt_flags & ~IPF_RECVSLLA;
	}

	/* Prepend the info mblk as an M_CTL in front of the data */
	mp->b_datap->db_type = M_CTL;
	mp->b_wptr += sizeof (in_pktinfo_t);
	mp->b_cont = data_mp;

	return (mp);
}

/*
 * Latch in the IPsec state for a stream based on the ipsec_in_t passed in as
 * part of the bind request.
 */

boolean_t
ip_bind_ipsec_policy_set(conn_t *connp, mblk_t *policy_mp)
{
	ipsec_in_t *ii;

	ASSERT(policy_mp != NULL);
	ASSERT(policy_mp->b_datap->db_type == IPSEC_POLICY_SET);

	ii = (ipsec_in_t *)policy_mp->b_rptr;
	ASSERT(ii->ipsec_in_type == IPSEC_IN);

	/* Steal the policy reference from the message into the conn */
	connp->conn_policy = ii->ipsec_in_policy;
	ii->ipsec_in_policy = NULL;

	if (ii->ipsec_in_action != NULL) {
		if (connp->conn_latch == NULL) {
			connp->conn_latch = iplatch_create();
			if (connp->conn_latch == NULL)
				return (B_FALSE);
		}
		ipsec_latch_inbound(connp->conn_latch, ii);
	}
	return (B_TRUE);
}

/*
 * Upper level protocols (ULP) pass through bind requests to IP for inspection
 * and to arrange for power-fanout assist.  The ULP is identified by
 * adding a single byte at the end of the original bind message.
 * A ULP other than UDP or TCP that wishes to be recognized passes
 * down a bind with a zero length address.
 *
 * The binding works as follows:
 * - A zero byte address means just bind to the protocol.
 * - A four byte address is treated as a request to validate
 *   that the address is a valid local address, appropriate for
 *   an application to bind to. This does not affect any fanout
 *   information in IP.
 * - A sizeof sin_t byte address is used to bind to only the local address
 *   and port.
 * - A sizeof ipa_conn_t byte address contains complete fanout information
 *   consisting of local and remote addresses and ports.  In
 *   this case, the addresses are both validated as appropriate
 *   for this operation, and, if so, the information is retained
 *   for use in the inbound fanout.
 *
 * The ULP (except in the zero-length bind) can append an
 * additional mblk of db_type IRE_DB_REQ_TYPE or IPSEC_POLICY_SET to the
 * T_BIND_REQ/O_T_BIND_REQ.
 * IRE_DB_REQ_TYPE indicates that the ULP wants
 * a copy of the source or destination IRE (source for local bind;
 * destination for complete bind). IPSEC_POLICY_SET indicates that the
 * policy information contained should be copied on to the conn.
 *
 * NOTE : Only one of IRE_DB_REQ_TYPE or IPSEC_POLICY_SET can be present.
 */
mblk_t *
ip_bind_v4(queue_t *q, mblk_t *mp, conn_t *connp)
{
	ssize_t		len;
	struct T_bind_req	*tbr;
	sin_t		*sin;
	ipa_conn_t	*ac;
	uchar_t		*ucp;
	mblk_t		*mp1;
	boolean_t	ire_requested;
	boolean_t	ipsec_policy_set = B_FALSE;
	int		error = 0;
	int		protocol;
	ipa_conn_x_t	*acx;

	ASSERT(!connp->conn_af_isv6);
	connp->conn_pkt_isv6 = B_FALSE;

	len = MBLKL(mp);
	/* Must hold at least the T_bind_req plus the trailing protocol byte */
	if (len < (sizeof (*tbr) + 1)) {
		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
		    "ip_bind: bogus msg, len %ld", len);
		/* XXX: Need to return something better */
		goto bad_addr;
	}
	/* Back up and extract the protocol identifier. */
	mp->b_wptr--;
	protocol = *mp->b_wptr & 0xFF;
	tbr = (struct T_bind_req *)mp->b_rptr;
	/* Reset the message type in preparation for shipping it back. */
	DB_TYPE(mp) = M_PCPROTO;

	connp->conn_ulp = (uint8_t)protocol;

	/*
	 * Check for a zero length address.  This is from a protocol that
	 * wants to register to receive all packets of its type.
	 */
	if (tbr->ADDR_length == 0) {
		/*
		 * These protocols are now intercepted in ip_bind_v6().
		 * Reject protocol-level binds here for now.
		 *
		 * For SCTP raw socket, ICMP sends down a bind with sin_t
		 * so that the protocol type cannot be SCTP.
		 */
		if (protocol == IPPROTO_TCP || protocol == IPPROTO_AH ||
		    protocol == IPPROTO_ESP || protocol == IPPROTO_SCTP) {
			goto bad_addr;
		}

		/*
		 *
		 * The udp module never sends down a zero-length address,
		 * and allowing this on a labeled system will break MLP
		 * functionality.
		 */
		if (is_system_labeled() && protocol == IPPROTO_UDP)
			goto bad_addr;

		if (connp->conn_mac_exempt)
			goto bad_addr;

		/* No hash here really.  The table is big enough. */
		connp->conn_srcv6 = ipv6_all_zeros;

		ipcl_proto_insert(connp, protocol);

		tbr->PRIM_type = T_BIND_ACK;
		return (mp);
	}

	/* Extract the address pointer from the message. */
	ucp = (uchar_t *)mi_offset_param(mp, tbr->ADDR_offset,
	    tbr->ADDR_length);
	if (ucp == NULL) {
		ip1dbg(("ip_bind: no address\n"));
		goto bad_addr;
	}
	if (!OK_32PTR(ucp)) {
		ip1dbg(("ip_bind: unaligned address\n"));
		goto bad_addr;
	}
	/*
	 * Check for trailing mps.
	 */

	mp1 = mp->b_cont;
	ire_requested = (mp1 != NULL && DB_TYPE(mp1) == IRE_DB_REQ_TYPE);
	ipsec_policy_set = (mp1 != NULL && DB_TYPE(mp1) == IPSEC_POLICY_SET);

	/* Dispatch on the address length supplied by the ULP (see above) */
	switch (tbr->ADDR_length) {
	default:
		ip1dbg(("ip_bind: bad address length %d\n",
		    (int)tbr->ADDR_length));
		goto bad_addr;

	case IP_ADDR_LEN:
		/* Verification of local address only */
		error = ip_bind_laddr(connp, mp, *(ipaddr_t *)ucp, 0,
		    ire_requested, ipsec_policy_set, B_FALSE);
		break;

	case sizeof (sin_t):
		sin = (sin_t *)ucp;
		error = ip_bind_laddr(connp, mp, sin->sin_addr.s_addr,
		    sin->sin_port, ire_requested, ipsec_policy_set, B_TRUE);
		if (protocol == IPPROTO_TCP)
			connp->conn_recv = tcp_conn_request;
		break;

	case sizeof (ipa_conn_t):
		ac = (ipa_conn_t *)ucp;
		/* For raw socket, the local port is not set. */
		if (ac->ac_lport == 0)
			ac->ac_lport = connp->conn_lport;
		/* Always verify destination reachability. */
		error = ip_bind_connected(connp, mp, &ac->ac_laddr,
		    ac->ac_lport, ac->ac_faddr, ac->ac_fport, ire_requested,
		    ipsec_policy_set, B_TRUE, B_TRUE);
		if (protocol == IPPROTO_TCP)
			connp->conn_recv = tcp_input;
		break;

	case sizeof (ipa_conn_x_t):
		acx = (ipa_conn_x_t *)ucp;
		/*
		 * Whether or not to verify destination reachability depends
		 * on the setting of the ACX_VERIFY_DST flag in acx->acx_flags.
		 */
		error = ip_bind_connected(connp, mp, &acx->acx_conn.ac_laddr,
		    acx->acx_conn.ac_lport, acx->acx_conn.ac_faddr,
		    acx->acx_conn.ac_fport, ire_requested, ipsec_policy_set,
		    B_TRUE, (acx->acx_flags & ACX_VERIFY_DST) != 0);
		if (protocol == IPPROTO_TCP)
			connp->conn_recv = tcp_input;
		break;
	}
	/* EINPROGRESS means processing was deferred; reply comes later */
	if (error == EINPROGRESS)
		return (NULL);
	else if (error != 0)
		goto bad_addr;
	/*
	 * Pass the IPSEC headers size in ire_ipsec_overhead.
	 * We can't do this in ip_bind_insert_ire because the policy
	 * may not have been inherited at that point in time and hence
	 * conn_out_enforce_policy may not be set.
	 */
	mp1 = mp->b_cont;
	if (ire_requested && connp->conn_out_enforce_policy &&
	    mp1 != NULL && DB_TYPE(mp1) == IRE_DB_REQ_TYPE) {
		ire_t *ire = (ire_t *)mp1->b_rptr;
		ASSERT(MBLKL(mp1) >= sizeof (ire_t));
		ire->ire_ipsec_overhead = conn_ipsec_length(connp);
	}

	/* Send it home. */
	mp->b_datap->db_type = M_PCPROTO;
	tbr->PRIM_type = T_BIND_ACK;
	return (mp);

bad_addr:
	/*
	 * If error = -1 then we generate a TBADADDR - otherwise error is
	 * a unix errno.
	 */
	if (error > 0)
		mp = mi_tpi_err_ack_alloc(mp, TSYSERR, error);
	else
		mp = mi_tpi_err_ack_alloc(mp, TBADADDR, 0);
	return (mp);
}

/*
 * Here address is verified to be a valid local address.
 * If the IRE_DB_REQ_TYPE mp is present, a broadcast/multicast
 * address is also considered a valid local address.
 * In the case of a broadcast/multicast address, however, the
 * upper protocol is expected to reset the src address
 * to 0 if it sees a IRE_BROADCAST type returned so that
 * no packets are emitted with broadcast/multicast address as
 * source address (that violates hosts requirements RFC1122)
 * The addresses valid for bind are:
 *	(1) - INADDR_ANY (0)
 *	(2) - IP address of an UP interface
 *	(3) - IP address of a DOWN interface
 *	(4) - valid local IP broadcast addresses. In this case
 *	the conn will only receive packets destined to
 *	the specified broadcast address.
 *	(5) - a multicast address. In this case
 *	the conn will only receive packets destined to
 *	the specified multicast address. Note: the
 *	application still has to issue an
 *	IP_ADD_MEMBERSHIP socket option.
 *
 * On error, return -1 for TBADADDR otherwise pass the
 * errno with TSYSERR reply.
 *
 * In all the above cases, the bound address must be valid in the current zone.
 * When the address is loopback, multicast or broadcast, there might be many
 * matching IREs so bind has to look up based on the zone.
 *
 * Note: lport is in network byte order.
 */
int
ip_bind_laddr(conn_t *connp, mblk_t *mp, ipaddr_t src_addr, uint16_t lport,
    boolean_t ire_requested, boolean_t ipsec_policy_set,
    boolean_t fanout_insert)
{
	int		error = 0;
	ire_t		*src_ire;
	mblk_t		*policy_mp;
	ipif_t		*ipif;
	zoneid_t	zoneid;

	/* policy_mp is only set and used when ipsec_policy_set is true */
	if (ipsec_policy_set) {
		policy_mp = mp->b_cont;
	}

	/*
	 * If it was previously connected, conn_fully_bound would have
	 * been set.
	 */
	connp->conn_fully_bound = B_FALSE;

	src_ire = NULL;
	ipif = NULL;

	zoneid = connp->conn_zoneid;

	if (src_addr) {
		src_ire = ire_route_lookup(src_addr, 0, 0, 0,
		    NULL, NULL, zoneid, NULL, MATCH_IRE_ZONEONLY);
		/*
		 * If an address other than 0.0.0.0 is requested,
		 * we verify that it is a valid address for bind
		 * Note: Following code is in if-else-if form for
		 * readability compared to a condition check.
		 */
		/* LINTED - statement has no consequent */
		if (IRE_IS_LOCAL(src_ire)) {
			/*
			 * (2) Bind to address of local UP interface
			 */
		} else if (src_ire && src_ire->ire_type == IRE_BROADCAST) {
			/*
			 * (4) Bind to broadcast address
			 * Note: permitted only from transports that
			 * request IRE
			 */
			if (!ire_requested)
				error = EADDRNOTAVAIL;
		} else {
			/*
			 * (3) Bind to address of local DOWN interface
			 * (ipif_lookup_addr() looks up all interfaces
			 * but we do not get here for UP interfaces
			 * - case (2) above)
			 * We put the protocol byte back into the mblk
			 * since we may come back via ip_wput_nondata()
			 * later with this mblk if ipif_lookup_addr chooses
			 * to defer processing.
			 */
			*mp->b_wptr++ = (char)connp->conn_ulp;
			if ((ipif = ipif_lookup_addr(src_addr, NULL, zoneid,
			    CONNP_TO_WQ(connp), mp, ip_wput_nondata,
			    &error)) != NULL) {
				ipif_refrele(ipif);
			} else if (error == EINPROGRESS) {
				/* Deferred: ip_wput_nondata will resume */
				if (src_ire != NULL)
					ire_refrele(src_ire);
				return (EINPROGRESS);
			} else if (CLASSD(src_addr)) {
				error = 0;
				if (src_ire != NULL)
					ire_refrele(src_ire);
				/*
				 * (5) bind to multicast address.
				 * Fake out the IRE returned to upper
				 * layer to be a broadcast IRE.
				 */
				src_ire = ire_ctable_lookup(
				    INADDR_BROADCAST, INADDR_ANY,
				    IRE_BROADCAST, NULL, zoneid, NULL,
				    (MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY));
				if (src_ire == NULL || !ire_requested)
					error = EADDRNOTAVAIL;
			} else {
				/*
				 * Not a valid address for bind
				 */
				error = EADDRNOTAVAIL;
			}
			/*
			 * Just to keep it consistent with the processing in
			 * ip_bind_v4()
			 */
			mp->b_wptr--;
		}
		if (error) {
			/* Red Alert!  Attempting to be a bogon! */
			ip1dbg(("ip_bind: bad src address 0x%x\n",
			    ntohl(src_addr)));
			goto bad_addr;
		}
	}

	/*
	 * Allow setting new policies. For example, disconnects come
	 * down as ipa_t bind. As we would have set conn_policy_cached
	 * to B_TRUE before, we should set it to B_FALSE, so that policy
	 * can change after the disconnect.
	 */
	connp->conn_policy_cached = B_FALSE;

	/*
	 * If not fanout_insert this was just an address verification
	 */
	if (fanout_insert) {
		/*
		 * The addresses have been verified. Time to insert in
		 * the correct fanout list.
		 */
		IN6_IPADDR_TO_V4MAPPED(src_addr, &connp->conn_srcv6);
		IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &connp->conn_remv6);
		connp->conn_lport = lport;
		connp->conn_fport = 0;
		/*
		 * Do we need to add a check to reject Multicast packets
		 */
		error = ipcl_bind_insert(connp, *mp->b_wptr, src_addr, lport);
	}

	if (error == 0) {
		if (ire_requested) {
			if (!ip_bind_insert_ire(mp, src_ire, NULL)) {
				error = -1;
				/* Falls through to bad_addr */
			}
		} else if (ipsec_policy_set) {
			if (!ip_bind_ipsec_policy_set(connp, policy_mp)) {
				error = -1;
				/* Falls through to bad_addr */
			}
		}
	}
bad_addr:
	if (error != 0) {
		if (connp->conn_anon_port) {
			(void) tsol_mlp_anon(crgetzone(connp->conn_cred),
			    connp->conn_mlp_type, connp->conn_ulp, ntohs(lport),
			    B_FALSE);
		}
		connp->conn_mlp_type = mlptSingle;
	}
	if (src_ire != NULL)
		IRE_REFRELE(src_ire);
	if (ipsec_policy_set) {
		ASSERT(policy_mp == mp->b_cont);
		ASSERT(policy_mp != NULL);
		freeb(policy_mp);
		/*
		 * As of now assume that nothing else accompanies
		 * IPSEC_POLICY_SET.
		 */
		mp->b_cont = NULL;
	}
	return (error);
}

/*
 * Verify that both the source and destination addresses
 * are valid. If verify_dst is false, then the destination address may be
 * unreachable, i.e. have no route to it.  Protocols like TCP want to verify
 * destination reachability, while tunnels do not.
 * Note that we allow connect to broadcast and multicast
 * addresses when ire_requested is set. Thus the ULP
 * has to check for IRE_BROADCAST and multicast.
 *
 * Returns zero if ok.
 * On error: returns -1 to mean TBADADDR otherwise returns an errno
 * (for use with TSYSERR reply).
 *
 * Note: lport and fport are in network byte order.
4263 */ 4264 int 4265 ip_bind_connected(conn_t *connp, mblk_t *mp, ipaddr_t *src_addrp, 4266 uint16_t lport, ipaddr_t dst_addr, uint16_t fport, 4267 boolean_t ire_requested, boolean_t ipsec_policy_set, 4268 boolean_t fanout_insert, boolean_t verify_dst) 4269 { 4270 ire_t *src_ire; 4271 ire_t *dst_ire; 4272 int error = 0; 4273 int protocol; 4274 mblk_t *policy_mp; 4275 ire_t *sire = NULL; 4276 ire_t *md_dst_ire = NULL; 4277 ill_t *md_ill = NULL; 4278 zoneid_t zoneid; 4279 ipaddr_t src_addr = *src_addrp; 4280 4281 src_ire = dst_ire = NULL; 4282 protocol = *mp->b_wptr & 0xFF; 4283 4284 /* 4285 * If we never got a disconnect before, clear it now. 4286 */ 4287 connp->conn_fully_bound = B_FALSE; 4288 4289 if (ipsec_policy_set) { 4290 policy_mp = mp->b_cont; 4291 } 4292 4293 zoneid = connp->conn_zoneid; 4294 4295 if (CLASSD(dst_addr)) { 4296 /* Pick up an IRE_BROADCAST */ 4297 dst_ire = ire_route_lookup(ip_g_all_ones, 0, 0, 0, NULL, 4298 NULL, zoneid, MBLK_GETLABEL(mp), 4299 (MATCH_IRE_RECURSIVE | 4300 MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE | 4301 MATCH_IRE_SECATTR)); 4302 } else { 4303 /* 4304 * If conn_dontroute is set or if conn_nexthop_set is set, 4305 * and onlink ipif is not found set ENETUNREACH error. 4306 */ 4307 if (connp->conn_dontroute || connp->conn_nexthop_set) { 4308 ipif_t *ipif; 4309 4310 ipif = ipif_lookup_onlink_addr(connp->conn_dontroute ? 
4311 dst_addr : connp->conn_nexthop_v4, zoneid); 4312 if (ipif == NULL) { 4313 error = ENETUNREACH; 4314 goto bad_addr; 4315 } 4316 ipif_refrele(ipif); 4317 } 4318 4319 if (connp->conn_nexthop_set) { 4320 dst_ire = ire_route_lookup(connp->conn_nexthop_v4, 0, 4321 0, 0, NULL, NULL, zoneid, MBLK_GETLABEL(mp), 4322 MATCH_IRE_SECATTR); 4323 } else { 4324 dst_ire = ire_route_lookup(dst_addr, 0, 0, 0, NULL, 4325 &sire, zoneid, MBLK_GETLABEL(mp), 4326 (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | 4327 MATCH_IRE_PARENT | MATCH_IRE_RJ_BHOLE | 4328 MATCH_IRE_SECATTR)); 4329 } 4330 } 4331 /* 4332 * dst_ire can't be a broadcast when not ire_requested. 4333 * We also prevent ire's with src address INADDR_ANY to 4334 * be used, which are created temporarily for 4335 * sending out packets from endpoints that have 4336 * conn_unspec_src set. If verify_dst is true, the destination must be 4337 * reachable. If verify_dst is false, the destination needn't be 4338 * reachable. 4339 * 4340 * If we match on a reject or black hole, then we've got a 4341 * local failure. May as well fail out the connect() attempt, 4342 * since it's never going to succeed. 4343 */ 4344 if (dst_ire == NULL || dst_ire->ire_src_addr == INADDR_ANY || 4345 (dst_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || 4346 ((dst_ire->ire_type & IRE_BROADCAST) && !ire_requested)) { 4347 /* 4348 * If we're verifying destination reachability, we always want 4349 * to complain here. 4350 * 4351 * If we're not verifying destination reachability but the 4352 * destination has a route, we still want to fail on the 4353 * temporary address and broadcast address tests. 
4354 */ 4355 if (verify_dst || (dst_ire != NULL)) { 4356 if (ip_debug > 2) { 4357 pr_addr_dbg("ip_bind_connected: bad connected " 4358 "dst %s\n", AF_INET, &dst_addr); 4359 } 4360 if (dst_ire == NULL || !(dst_ire->ire_type & IRE_HOST)) 4361 error = ENETUNREACH; 4362 else 4363 error = EHOSTUNREACH; 4364 goto bad_addr; 4365 } 4366 } 4367 4368 /* 4369 * We now know that routing will allow us to reach the destination. 4370 * Check whether Trusted Solaris policy allows communication with this 4371 * host, and pretend that the destination is unreachable if not. 4372 * 4373 * This is never a problem for TCP, since that transport is known to 4374 * compute the label properly as part of the tcp_rput_other T_BIND_ACK 4375 * handling. If the remote is unreachable, it will be detected at that 4376 * point, so there's no reason to check it here. 4377 * 4378 * Note that for sendto (and other datagram-oriented friends), this 4379 * check is done as part of the data path label computation instead. 4380 * The check here is just to make non-TCP connect() report the right 4381 * error. 4382 */ 4383 if (dst_ire != NULL && is_system_labeled() && 4384 !IPCL_IS_TCP(connp) && 4385 tsol_compute_label(DB_CREDDEF(mp, connp->conn_cred), dst_addr, NULL, 4386 connp->conn_mac_exempt) != 0) { 4387 error = EHOSTUNREACH; 4388 if (ip_debug > 2) { 4389 pr_addr_dbg("ip_bind_connected: no label for dst %s\n", 4390 AF_INET, &dst_addr); 4391 } 4392 goto bad_addr; 4393 } 4394 4395 /* 4396 * If the app does a connect(), it means that it will most likely 4397 * send more than 1 packet to the destination. It makes sense 4398 * to clear the temporary flag. 
4399 */ 4400 if (dst_ire != NULL && dst_ire->ire_type == IRE_CACHE && 4401 (dst_ire->ire_marks & IRE_MARK_TEMPORARY)) { 4402 irb_t *irb = dst_ire->ire_bucket; 4403 4404 rw_enter(&irb->irb_lock, RW_WRITER); 4405 dst_ire->ire_marks &= ~IRE_MARK_TEMPORARY; 4406 irb->irb_tmp_ire_cnt--; 4407 rw_exit(&irb->irb_lock); 4408 } 4409 4410 /* 4411 * See if we should notify ULP about MDT; we do this whether or not 4412 * ire_requested is TRUE, in order to handle active connects; MDT 4413 * eligibility tests for passive connects are handled separately 4414 * through tcp_adapt_ire(). We do this before the source address 4415 * selection, because dst_ire may change after a call to 4416 * ipif_select_source(). This is a best-effort check, as the 4417 * packet for this connection may not actually go through 4418 * dst_ire->ire_stq, and the exact IRE can only be known after 4419 * calling ip_newroute(). This is why we further check on the 4420 * IRE during Multidata packet transmission in tcp_multisend(). 4421 */ 4422 if (ip_multidata_outbound && !ipsec_policy_set && dst_ire != NULL && 4423 !(dst_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK | IRE_BROADCAST)) && 4424 (md_ill = ire_to_ill(dst_ire), md_ill != NULL) && 4425 ILL_MDT_CAPABLE(md_ill)) { 4426 md_dst_ire = dst_ire; 4427 IRE_REFHOLD(md_dst_ire); 4428 } 4429 4430 if (dst_ire != NULL && 4431 dst_ire->ire_type == IRE_LOCAL && 4432 dst_ire->ire_zoneid != zoneid && dst_ire->ire_zoneid != ALL_ZONES) { 4433 /* 4434 * If the IRE belongs to a different zone, look for a matching 4435 * route in the forwarding table and use the source address from 4436 * that route. 
4437 */ 4438 src_ire = ire_ftable_lookup(dst_addr, 0, 0, 0, NULL, NULL, 4439 zoneid, 0, NULL, 4440 MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | 4441 MATCH_IRE_RJ_BHOLE); 4442 if (src_ire == NULL) { 4443 error = EHOSTUNREACH; 4444 goto bad_addr; 4445 } else if (src_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 4446 if (!(src_ire->ire_type & IRE_HOST)) 4447 error = ENETUNREACH; 4448 else 4449 error = EHOSTUNREACH; 4450 goto bad_addr; 4451 } 4452 if (src_addr == INADDR_ANY) 4453 src_addr = src_ire->ire_src_addr; 4454 ire_refrele(src_ire); 4455 src_ire = NULL; 4456 } else if ((src_addr == INADDR_ANY) && (dst_ire != NULL)) { 4457 if ((sire != NULL) && (sire->ire_flags & RTF_SETSRC)) { 4458 src_addr = sire->ire_src_addr; 4459 ire_refrele(dst_ire); 4460 dst_ire = sire; 4461 sire = NULL; 4462 } else { 4463 /* 4464 * Pick a source address so that a proper inbound 4465 * load spreading would happen. 4466 */ 4467 ill_t *dst_ill = dst_ire->ire_ipif->ipif_ill; 4468 ipif_t *src_ipif = NULL; 4469 ire_t *ipif_ire; 4470 4471 /* 4472 * Supply a local source address such that inbound 4473 * load spreading happens. 4474 * 4475 * Determine the best source address on this ill for 4476 * the destination. 4477 * 4478 * 1) For broadcast, we should return a broadcast ire 4479 * found above so that upper layers know that the 4480 * destination address is a broadcast address. 4481 * 4482 * 2) If this is part of a group, select a better 4483 * source address so that better inbound load 4484 * balancing happens. Do the same if the ipif 4485 * is DEPRECATED. 4486 * 4487 * 3) If the outgoing interface is part of a usesrc 4488 * group, then try selecting a source address from 4489 * the usesrc ILL. 
4490 */ 4491 if ((dst_ire->ire_zoneid != zoneid && 4492 dst_ire->ire_zoneid != ALL_ZONES) || 4493 (!(dst_ire->ire_type & IRE_BROADCAST) && 4494 ((dst_ill->ill_group != NULL) || 4495 (dst_ire->ire_ipif->ipif_flags & IPIF_DEPRECATED) || 4496 (dst_ill->ill_usesrc_ifindex != 0)))) { 4497 /* 4498 * If the destination is reachable via a 4499 * given gateway, the selected source address 4500 * should be in the same subnet as the gateway. 4501 * Otherwise, the destination is not reachable. 4502 * 4503 * If there are no interfaces on the same subnet 4504 * as the destination, ipif_select_source gives 4505 * first non-deprecated interface which might be 4506 * on a different subnet than the gateway. 4507 * This is not desirable. Hence pass the dst_ire 4508 * source address to ipif_select_source. 4509 * It is sure that the destination is reachable 4510 * with the dst_ire source address subnet. 4511 * So passing dst_ire source address to 4512 * ipif_select_source will make sure that the 4513 * selected source will be on the same subnet 4514 * as dst_ire source address. 4515 */ 4516 ipaddr_t saddr = 4517 dst_ire->ire_ipif->ipif_src_addr; 4518 src_ipif = ipif_select_source(dst_ill, 4519 saddr, zoneid); 4520 if (src_ipif != NULL) { 4521 if (IS_VNI(src_ipif->ipif_ill)) { 4522 /* 4523 * For VNI there is no 4524 * interface route 4525 */ 4526 src_addr = 4527 src_ipif->ipif_src_addr; 4528 } else { 4529 ipif_ire = 4530 ipif_to_ire(src_ipif); 4531 if (ipif_ire != NULL) { 4532 IRE_REFRELE(dst_ire); 4533 dst_ire = ipif_ire; 4534 } 4535 src_addr = 4536 dst_ire->ire_src_addr; 4537 } 4538 ipif_refrele(src_ipif); 4539 } else { 4540 src_addr = dst_ire->ire_src_addr; 4541 } 4542 } else { 4543 src_addr = dst_ire->ire_src_addr; 4544 } 4545 } 4546 } 4547 4548 /* 4549 * We do ire_route_lookup() here (and not 4550 * interface lookup as we assert that 4551 * src_addr should only come from an 4552 * UP interface for hard binding. 
4553 */ 4554 ASSERT(src_ire == NULL); 4555 src_ire = ire_route_lookup(src_addr, 0, 0, 0, NULL, 4556 NULL, zoneid, NULL, MATCH_IRE_ZONEONLY); 4557 /* src_ire must be a local|loopback */ 4558 if (!IRE_IS_LOCAL(src_ire)) { 4559 if (ip_debug > 2) { 4560 pr_addr_dbg("ip_bind_connected: bad connected " 4561 "src %s\n", AF_INET, &src_addr); 4562 } 4563 error = EADDRNOTAVAIL; 4564 goto bad_addr; 4565 } 4566 4567 /* 4568 * If the source address is a loopback address, the 4569 * destination had best be local or multicast. 4570 * The transports that can't handle multicast will reject 4571 * those addresses. 4572 */ 4573 if (src_ire->ire_type == IRE_LOOPBACK && 4574 !(IRE_IS_LOCAL(dst_ire) || CLASSD(dst_addr))) { 4575 ip1dbg(("ip_bind_connected: bad connected loopback\n")); 4576 error = -1; 4577 goto bad_addr; 4578 } 4579 4580 /* 4581 * Allow setting new policies. For example, disconnects come 4582 * down as ipa_t bind. As we would have set conn_policy_cached 4583 * to B_TRUE before, we should set it to B_FALSE, so that policy 4584 * can change after the disconnect. 4585 */ 4586 connp->conn_policy_cached = B_FALSE; 4587 4588 /* 4589 * Set the conn addresses/ports immediately, so the IPsec policy calls 4590 * can handle their passed-in conn's. 4591 */ 4592 4593 IN6_IPADDR_TO_V4MAPPED(src_addr, &connp->conn_srcv6); 4594 IN6_IPADDR_TO_V4MAPPED(dst_addr, &connp->conn_remv6); 4595 connp->conn_lport = lport; 4596 connp->conn_fport = fport; 4597 *src_addrp = src_addr; 4598 4599 ASSERT(!(ipsec_policy_set && ire_requested)); 4600 if (ire_requested) { 4601 iulp_t *ulp_info = NULL; 4602 4603 /* 4604 * Note that sire will not be NULL if this is an off-link 4605 * connection and there is not cache for that dest yet. 4606 * 4607 * XXX Because of an existing bug, if there are multiple 4608 * default routes, the IRE returned now may not be the actual 4609 * default route used (default routes are chosen in a 4610 * round robin fashion). 
So if the metrics for different 4611 * default routes are different, we may return the wrong 4612 * metrics. This will not be a problem if the existing 4613 * bug is fixed. 4614 */ 4615 if (sire != NULL) { 4616 ulp_info = &(sire->ire_uinfo); 4617 } 4618 if (!ip_bind_insert_ire(mp, dst_ire, ulp_info)) { 4619 error = -1; 4620 goto bad_addr; 4621 } 4622 } else if (ipsec_policy_set) { 4623 if (!ip_bind_ipsec_policy_set(connp, policy_mp)) { 4624 error = -1; 4625 goto bad_addr; 4626 } 4627 } 4628 4629 /* 4630 * Cache IPsec policy in this conn. If we have per-socket policy, 4631 * we'll cache that. If we don't, we'll inherit global policy. 4632 * 4633 * We can't insert until the conn reflects the policy. Note that 4634 * conn_policy_cached is set by ipsec_conn_cache_policy() even for 4635 * connections where we don't have a policy. This is to prevent 4636 * global policy lookups in the inbound path. 4637 * 4638 * If we insert before we set conn_policy_cached, 4639 * CONN_INBOUND_POLICY_PRESENT() check can still evaluate true 4640 * because global policy cound be non-empty. We normally call 4641 * ipsec_check_policy() for conn_policy_cached connections only if 4642 * ipc_in_enforce_policy is set. But in this case, 4643 * conn_policy_cached can get set anytime since we made the 4644 * CONN_INBOUND_POLICY_PRESENT() check and ipsec_check_policy() is 4645 * called, which will make the above assumption false. Thus, we 4646 * need to insert after we set conn_policy_cached. 4647 */ 4648 if ((error = ipsec_conn_cache_policy(connp, B_TRUE)) != 0) 4649 goto bad_addr; 4650 4651 if (fanout_insert) { 4652 /* 4653 * The addresses have been verified. Time to insert in 4654 * the correct fanout list. 
4655 */ 4656 error = ipcl_conn_insert(connp, protocol, src_addr, 4657 dst_addr, connp->conn_ports); 4658 } 4659 4660 if (error == 0) { 4661 connp->conn_fully_bound = B_TRUE; 4662 /* 4663 * Our initial checks for MDT have passed; the IRE is not 4664 * LOCAL/LOOPBACK/BROADCAST, and the link layer seems to 4665 * be supporting MDT. Pass the IRE, IPC and ILL into 4666 * ip_mdinfo_return(), which performs further checks 4667 * against them and upon success, returns the MDT info 4668 * mblk which we will attach to the bind acknowledgment. 4669 */ 4670 if (md_dst_ire != NULL) { 4671 mblk_t *mdinfo_mp; 4672 4673 ASSERT(md_ill != NULL); 4674 ASSERT(md_ill->ill_mdt_capab != NULL); 4675 if ((mdinfo_mp = ip_mdinfo_return(md_dst_ire, connp, 4676 md_ill->ill_name, md_ill->ill_mdt_capab)) != NULL) 4677 linkb(mp, mdinfo_mp); 4678 } 4679 } 4680 bad_addr: 4681 if (ipsec_policy_set) { 4682 ASSERT(policy_mp == mp->b_cont); 4683 ASSERT(policy_mp != NULL); 4684 freeb(policy_mp); 4685 /* 4686 * As of now assume that nothing else accompanies 4687 * IPSEC_POLICY_SET. 4688 */ 4689 mp->b_cont = NULL; 4690 } 4691 if (src_ire != NULL) 4692 IRE_REFRELE(src_ire); 4693 if (dst_ire != NULL) 4694 IRE_REFRELE(dst_ire); 4695 if (sire != NULL) 4696 IRE_REFRELE(sire); 4697 if (md_dst_ire != NULL) 4698 IRE_REFRELE(md_dst_ire); 4699 return (error); 4700 } 4701 4702 /* 4703 * Insert the ire in b_cont. Returns false if it fails (due to lack of space). 4704 * Prefers dst_ire over src_ire. 4705 */ 4706 static boolean_t 4707 ip_bind_insert_ire(mblk_t *mp, ire_t *ire, iulp_t *ulp_info) 4708 { 4709 mblk_t *mp1; 4710 ire_t *ret_ire = NULL; 4711 4712 mp1 = mp->b_cont; 4713 ASSERT(mp1 != NULL); 4714 4715 if (ire != NULL) { 4716 /* 4717 * mp1 initialized above to IRE_DB_REQ_TYPE 4718 * appended mblk. Its <upper protocol>'s 4719 * job to make sure there is room. 
4720 */ 4721 if ((mp1->b_datap->db_lim - mp1->b_rptr) < sizeof (ire_t)) 4722 return (0); 4723 4724 mp1->b_datap->db_type = IRE_DB_TYPE; 4725 mp1->b_wptr = mp1->b_rptr + sizeof (ire_t); 4726 bcopy(ire, mp1->b_rptr, sizeof (ire_t)); 4727 ret_ire = (ire_t *)mp1->b_rptr; 4728 /* 4729 * Pass the latest setting of the ip_path_mtu_discovery and 4730 * copy the ulp info if any. 4731 */ 4732 ret_ire->ire_frag_flag |= (ip_path_mtu_discovery) ? 4733 IPH_DF : 0; 4734 if (ulp_info != NULL) { 4735 bcopy(ulp_info, &(ret_ire->ire_uinfo), 4736 sizeof (iulp_t)); 4737 } 4738 ret_ire->ire_mp = mp1; 4739 } else { 4740 /* 4741 * No IRE was found. Remove IRE mblk. 4742 */ 4743 mp->b_cont = mp1->b_cont; 4744 freeb(mp1); 4745 } 4746 4747 return (1); 4748 } 4749 4750 /* 4751 * Carve "len" bytes out of an mblk chain, consuming any we empty, and duping 4752 * the final piece where we don't. Return a pointer to the first mblk in the 4753 * result, and update the pointer to the next mblk to chew on. If anything 4754 * goes wrong (i.e., dupb fails), we waste everything in sight and return a 4755 * NULL pointer. 4756 */ 4757 mblk_t * 4758 ip_carve_mp(mblk_t **mpp, ssize_t len) 4759 { 4760 mblk_t *mp0; 4761 mblk_t *mp1; 4762 mblk_t *mp2; 4763 4764 if (!len || !mpp || !(mp0 = *mpp)) 4765 return (NULL); 4766 /* If we aren't going to consume the first mblk, we need a dup. */ 4767 if (mp0->b_wptr - mp0->b_rptr > len) { 4768 mp1 = dupb(mp0); 4769 if (mp1) { 4770 /* Partition the data between the two mblks. */ 4771 mp1->b_wptr = mp1->b_rptr + len; 4772 mp0->b_rptr = mp1->b_wptr; 4773 /* 4774 * after adjustments if mblk not consumed is now 4775 * unaligned, try to align it. If this fails free 4776 * all messages and let upper layer recover. 4777 */ 4778 if (!OK_32PTR(mp0->b_rptr)) { 4779 if (!pullupmsg(mp0, -1)) { 4780 freemsg(mp0); 4781 freemsg(mp1); 4782 *mpp = NULL; 4783 return (NULL); 4784 } 4785 } 4786 } 4787 return (mp1); 4788 } 4789 /* Eat through as many mblks as we need to get len bytes. 
*/ 4790 len -= mp0->b_wptr - mp0->b_rptr; 4791 for (mp2 = mp1 = mp0; (mp2 = mp2->b_cont) != 0 && len; mp1 = mp2) { 4792 if (mp2->b_wptr - mp2->b_rptr > len) { 4793 /* 4794 * We won't consume the entire last mblk. Like 4795 * above, dup and partition it. 4796 */ 4797 mp1->b_cont = dupb(mp2); 4798 mp1 = mp1->b_cont; 4799 if (!mp1) { 4800 /* 4801 * Trouble. Rather than go to a lot of 4802 * trouble to clean up, we free the messages. 4803 * This won't be any worse than losing it on 4804 * the wire. 4805 */ 4806 freemsg(mp0); 4807 freemsg(mp2); 4808 *mpp = NULL; 4809 return (NULL); 4810 } 4811 mp1->b_wptr = mp1->b_rptr + len; 4812 mp2->b_rptr = mp1->b_wptr; 4813 /* 4814 * after adjustments if mblk not consumed is now 4815 * unaligned, try to align it. If this fails free 4816 * all messages and let upper layer recover. 4817 */ 4818 if (!OK_32PTR(mp2->b_rptr)) { 4819 if (!pullupmsg(mp2, -1)) { 4820 freemsg(mp0); 4821 freemsg(mp2); 4822 *mpp = NULL; 4823 return (NULL); 4824 } 4825 } 4826 *mpp = mp2; 4827 return (mp0); 4828 } 4829 /* Decrement len by the amount we just got. */ 4830 len -= mp2->b_wptr - mp2->b_rptr; 4831 } 4832 /* 4833 * len should be reduced to zero now. If not our caller has 4834 * screwed up. 4835 */ 4836 if (len) { 4837 /* Shouldn't happen! */ 4838 freemsg(mp0); 4839 *mpp = NULL; 4840 return (NULL); 4841 } 4842 /* 4843 * We consumed up to exactly the end of an mblk. Detach the part 4844 * we are returning from the rest of the chain. 4845 */ 4846 mp1->b_cont = NULL; 4847 *mpp = mp2; 4848 return (mp0); 4849 } 4850 4851 /* The ill stream is being unplumbed. Called from ip_close */ 4852 int 4853 ip_modclose(ill_t *ill) 4854 { 4855 4856 boolean_t success; 4857 ipsq_t *ipsq; 4858 ipif_t *ipif; 4859 queue_t *q = ill->ill_rq; 4860 4861 /* 4862 * Forcibly enter the ipsq after some delay. This is to take 4863 * care of the case when some ioctl does not complete because 4864 * we sent a control message to the driver and it did not 4865 * send us a reply. 
	 * We want to be able to at least unplumb
	 * and replumb rather than force the user to reboot the system.
	 */
	success = ipsq_enter(ill, B_FALSE);

	/*
	 * Open/close/push/pop is guaranteed to be single threaded
	 * per stream by STREAMS. FS guarantees that all references
	 * from top are gone before close is called. So there can't
	 * be another close thread that has set CONDEMNED on this ill.
	 * and cause ipsq_enter to return failure.
	 */
	ASSERT(success);
	ipsq = ill->ill_phyint->phyint_ipsq;

	/*
	 * Mark it condemned. No new reference will be made to this ill.
	 * Lookup functions will return an error. Threads that try to
	 * increment the refcnt must check for ILL_CAN_LOOKUP. This ensures
	 * that the refcnt will drop down to zero.
	 */
	mutex_enter(&ill->ill_lock);
	ill->ill_state_flags |= ILL_CONDEMNED;
	/* Condemn every ipif hanging off this ill as well. */
	for (ipif = ill->ill_ipif; ipif != NULL;
	    ipif = ipif->ipif_next) {
		ipif->ipif_state_flags |= IPIF_CONDEMNED;
	}
	/*
	 * Wake up anybody waiting to enter the ipsq. ipsq_enter
	 * returns error if ILL_CONDEMNED is set
	 */
	cv_broadcast(&ill->ill_cv);
	mutex_exit(&ill->ill_lock);

	/*
	 * Shut down fragmentation reassembly.
	 * ill_frag_timer won't start a timer again.
	 * Now cancel any existing timer
	 */
	(void) untimeout(ill->ill_frag_timer_id);
	(void) ill_frag_timeout(ill, 0);

	/*
	 * If MOVE was in progress, clear the
	 * move_in_progress fields also.
	 */
	if (ill->ill_move_in_progress) {
		ILL_CLEAR_MOVE(ill);
	}

	/*
	 * Call ill_delete to bring down the ipifs, ilms and ill on
	 * this ill. Then wait for the refcnts to drop to zero.
	 * ill_is_quiescent checks whether the ill is really quiescent.
	 * Then make sure that threads that are waiting to enter the
	 * ipsq have seen the error returned by ipsq_enter and have
	 * gone away.  Then we call ill_delete_tail which does the
	 * DL_UNBIND and DL_DETACH with the driver and then qprocsoff.
	 */
	ill_delete(ill);
	mutex_enter(&ill->ill_lock);
	/* Wait for the ill's refcnts to drain (see ill_is_quiescent). */
	while (!ill_is_quiescent(ill))
		cv_wait(&ill->ill_cv, &ill->ill_lock);
	/* Then wait for waiters woken above to observe the error and leave. */
	while (ill->ill_waiters)
		cv_wait(&ill->ill_cv, &ill->ill_lock);

	mutex_exit(&ill->ill_lock);

	/* qprocsoff is called in ill_delete_tail */
	ill_delete_tail(ill);

	/*
	 * Walk through all upper (conn) streams and qenable
	 * those that have queued data.
	 * close synchronization needs this to
	 * be done to ensure that all upper layers blocked
	 * due to flow control to the closing device
	 * get unblocked.
	 */
	ip1dbg(("ip_wsrv: walking\n"));
	conn_walk_drain();

	/* Unlink the ill from the global mi list of IP streams. */
	mutex_enter(&ip_mi_lock);
	mi_close_unlink(&ip_g_head, (IDP)ill);
	mutex_exit(&ip_mi_lock);

	/*
	 * credp could be null if the open didn't succeed and ip_modopen
	 * itself calls ip_close.
	 */
	if (ill->ill_credp != NULL)
		crfree(ill->ill_credp);

	mi_close_free((IDP)ill);
	q->q_ptr = WR(q)->q_ptr = NULL;

	ipsq_exit(ipsq, B_TRUE, B_TRUE);

	return (0);
}

/*
 * This is called as part of close() for both IP and UDP
 * in order to quiesce the conn.
 */
void
ip_quiesce_conn(conn_t *connp)
{
	boolean_t	drain_cleanup_reqd = B_FALSE;
	boolean_t	conn_ioctl_cleanup_reqd = B_FALSE;
	boolean_t	ilg_cleanup_reqd = B_FALSE;

	ASSERT(!IPCL_IS_TCP(connp));

	/*
	 * Mark the conn as closing, and this conn must not be
	 * inserted in future into any list. Eg. conn_drain_insert(),
	 * won't insert this conn into the conn_drain_list.
	 * Similarly ill_pending_mp_add() will not add any mp to
	 * the pending mp list, after this conn has started closing.
	 */
	mutex_enter(&connp->conn_lock);
	connp->conn_state_flags |= CONN_CONDEMNED;
	/* Wait until ours is the only remaining reference to the conn. */
	while (connp->conn_ref != 1)
		cv_wait(&connp->conn_cv, &connp->conn_lock);
	connp->conn_state_flags |= CONN_QUIESCED;
	mutex_exit(&connp->conn_lock);
}

/*
 * STREAMS close routine for IP devices and modules.  A module (ill)
 * close is handed off to ip_modclose(); a device close quiesces and
 * then destroys the conn hanging off the queue.  Always returns 0.
 */
/* ARGSUSED */
int
ip_close(queue_t *q, int flags)
{
	conn_t		*connp;

	TRACE_1(TR_FAC_IP, TR_IP_CLOSE, "ip_close: q %p", q);

	/*
	 * Call the appropriate delete routine depending on whether this is
	 * a module or device.
	 */
	if (WR(q)->q_next != NULL) {
		/* This is a module close */
		return (ip_modclose((ill_t *)q->q_ptr));
	}

	connp = q->q_ptr;
	ip_quiesce_conn(connp);

	qprocsoff(q);

	/*
	 * Now we are truly single threaded on this stream, and can
	 * delete the things hanging off the connp, and finally the connp.
	 * We removed this connp from the fanout list, it cannot be
	 * accessed thru the fanouts, and we already waited for the
	 * conn_ref to drop to 0. We are already in close, so
	 * there cannot be any other thread from the top. qprocsoff
	 * has completed, and service has completed or won't run in
	 * future.
	 */
	ASSERT(connp->conn_ref == 1);

	/*
	 * A conn which was previously marked as IPCL_UDP cannot
	 * retain the flag because it would have been cleared by
	 * udp_close().
	 */
	ASSERT(!IPCL_IS_UDP(connp));

	/* Release cached IPsec state hanging off the conn. */
	if (connp->conn_latch != NULL) {
		IPLATCH_REFRELE(connp->conn_latch);
		connp->conn_latch = NULL;
	}
	if (connp->conn_policy != NULL) {
		IPPH_REFRELE(connp->conn_policy);
		connp->conn_policy = NULL;
	}
	if (connp->conn_ipsec_opt_mp != NULL) {
		freemsg(connp->conn_ipsec_opt_mp);
		connp->conn_ipsec_opt_mp = NULL;
	}

	inet_minor_free(ip_minor_arena, connp->conn_dev);

	/* Drop the last reference (held since open) and free the conn. */
	connp->conn_ref--;
	ipcl_conn_destroy(connp);

	q->q_ptr = WR(q)->q_ptr = NULL;
	return (0);
}

/*
 * Close routine for the TCP/UDP SNMP access module instances
 * (IPCL_TCPMOD/IPCL_UDPMOD conns; see ip_snmpmod_wput below).
 */
int
ip_snmpmod_close(queue_t *q)
{
	conn_t *connp = Q_TO_CONN(q);
	ASSERT(connp->conn_flags & (IPCL_TCPMOD | IPCL_UDPMOD));

	qprocsoff(q);

	/* UDP module conns carry extra state that udp_close_free releases. */
	if (connp->conn_flags & IPCL_UDPMOD)
		udp_close_free(connp);

	if (connp->conn_cred != NULL) {
		crfree(connp->conn_cred);
		connp->conn_cred = NULL;
	}
	CONN_DEC_REF(connp);
	q->q_ptr = WR(q)->q_ptr = NULL;
	return (0);
}

/*
 * Write side put procedure for TCP module or UDP module instance.  TCP/UDP
 * as a module is only used for MIB browsers that push TCP/UDP over IP or ARP.
 * The only supported primitives are T_SVR4_OPTMGMT_REQ and T_OPTMGMT_REQ.
 * M_FLUSH messages and ioctls are only passed downstream; we don't flush our
 * queues as we never enqueue messages there and we don't handle any ioctls.
 * Everything else is freed.
 */
void
ip_snmpmod_wput(queue_t *q, mblk_t *mp)
{
	conn_t *connp = q->q_ptr;
	pfi_t setfn;
	pfi_t getfn;

	ASSERT(connp->conn_flags & (IPCL_TCPMOD | IPCL_UDPMOD));

	switch (DB_TYPE(mp)) {
	case M_PROTO:
	case M_PCPROTO:
		if ((MBLKL(mp) >= sizeof (t_scalar_t)) &&
		    ((((union T_primitives *)mp->b_rptr)->type ==
		    T_SVR4_OPTMGMT_REQ) ||
		    (((union T_primitives *)mp->b_rptr)->type ==
		    T_OPTMGMT_REQ))) {
			/*
			 * This is the only TPI primitive supported. Its
			 * handling does not require tcp_t, but it does require
			 * conn_t to check permissions.
			 */
			cred_t *cr = DB_CREDDEF(mp, connp->conn_cred);

			/* Dispatch to the TCP or UDP MIB handlers. */
			if (connp->conn_flags & IPCL_TCPMOD) {
				setfn = tcp_snmp_set;
				getfn = tcp_snmp_get;
			} else {
				setfn = udp_snmp_set;
				getfn = udp_snmp_get;
			}
			if (!snmpcom_req(q, mp, setfn, getfn, cr)) {
				freemsg(mp);
				return;
			}
		} else if ((mp = mi_tpi_err_ack_alloc(mp, TPROTO, ENOTSUP))
		    != NULL)
			/* Unsupported TPI primitive: NAK it back upstream. */
			qreply(q, mp);
		break;
	case M_FLUSH:
	case M_IOCTL:
		putnext(q, mp);
		break;
	default:
		freemsg(mp);
		break;
	}
}

/*
 * Return the IP checksum for the IP header at "iph".
 * Standard ones-complement checksum over the 20-byte base header plus
 * any options indicated by the header-length field.
 */
uint16_t
ip_csum_hdr(ipha_t *ipha)
{
	uint16_t	*uph;
	uint32_t	sum;
	int	opt_len;

	/* Option length in 32-bit words beyond the 5-word base header. */
	opt_len = (ipha->ipha_version_and_hdr_length & 0xF) -
	    IP_SIMPLE_HDR_LENGTH_IN_WORDS;
	uph = (uint16_t *)ipha;
	/* Sum the ten 16-bit words of the base header... */
	sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] +
	    uph[5] + uph[6] + uph[7] + uph[8] + uph[9];
	/* ...then two 16-bit words per 32-bit option word. */
	if (opt_len > 0) {
		do {
			sum += uph[10];
			sum += uph[11];
			uph += 2;
		} while (--opt_len);
	}
	/* Fold the carries and take the ones complement. */
	sum = (sum & 0xFFFF) + (sum >> 16);
	sum = ~(sum + (sum >> 16)) & 0xFFFF;
	if (sum == 0xffff)
		sum = 0;
	return ((uint16_t)sum);
}

/*
 * Tear down IP's global state: ULP modules, IPsec machinery, nd
 * parameters, locks, IRE/conn caches and kstats.  Counterpart of
 * ip_ddi_init() below; teardown order mirrors the init order.
 */
void
ip_ddi_destroy(void)
{
	tnet_fini();
	tcp_ddi_destroy();
	sctp_ddi_destroy();
	ipsec_loader_destroy();
	ipsec_policy_destroy();
	ipsec_kstat_destroy();
	nd_free(&ip_g_nd);
	mutex_destroy(&igmp_timer_lock);
	mutex_destroy(&mld_timer_lock);
	mutex_destroy(&igmp_slowtimeout_lock);
	mutex_destroy(&mld_slowtimeout_lock);
	mutex_destroy(&ip_mi_lock);
	mutex_destroy(&rts_clients.connf_lock);
	ip_ire_fini();
	ip6_asp_free();
	conn_drain_fini();
	ipcl_destroy();
	inet_minor_destroy(ip_minor_arena);
	icmp_kstat_fini();
	ip_kstat_fini();
	rw_destroy(&ipsec_capab_ills_lock);
	rw_destroy(&ill_g_usesrc_lock);
	ip_drop_unregister(&ip_dropper);
}


/*
 * One-time initialization of IP's global state: major numbers, nd
 * parameters, IPsec machinery, locks, minor arena, classifier, IRE
 * caches, ULP modules and kstats.
 */
void
ip_ddi_init(void)
{
	TCP6_MAJ = ddi_name_to_major(TCP6);
	TCP_MAJ = ddi_name_to_major(TCP);
	SCTP_MAJ = ddi_name_to_major(SCTP);
	SCTP6_MAJ = ddi_name_to_major(SCTP6);

	ip_input_proc = ip_squeue_switch(ip_squeue_enter);

	/* IP's IPsec code calls the packet dropper */
	ip_drop_register(&ip_dropper, "IP IPsec processing");

	if (!ip_g_nd) {
		/* Register tunables; on failure release the half-built list. */
		if (!ip_param_register(lcl_param_arr, A_CNT(lcl_param_arr),
		    lcl_ndp_arr, A_CNT(lcl_ndp_arr))) {
			nd_free(&ip_g_nd);
		}
	}

	ipsec_loader_init();
	ipsec_policy_init();
	ipsec_kstat_init();
	rw_init(&ip_g_nd_lock, NULL, RW_DEFAULT, NULL);
	mutex_init(&igmp_timer_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&mld_timer_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&igmp_slowtimeout_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&mld_slowtimeout_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&ip_mi_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&ip_addr_avail_lock, NULL, MUTEX_DEFAULT, NULL);
	rw_init(&ill_g_lock, NULL, RW_DEFAULT, NULL);
	rw_init(&ipsec_capab_ills_lock, NULL, RW_DEFAULT, NULL);
	rw_init(&ill_g_usesrc_lock, NULL, RW_DEFAULT, NULL);

	/*
	 * For IP and TCP the minor numbers should start from 2 since we have 4
	 * initial devices: ip, ip6, tcp, tcp6.
	 */
	if ((ip_minor_arena = inet_minor_create("ip_minor_arena",
	    INET_MIN_DEV + 2, KM_SLEEP)) == NULL) {
		cmn_err(CE_PANIC,
		    "ip_ddi_init: ip_minor_arena creation failed\n");
	}

	ipcl_init();
	mutex_init(&rts_clients.connf_lock, NULL, MUTEX_DEFAULT, NULL);
	ip_ire_init();
	ip6_asp_init();
	ipif_init();
	conn_drain_init();
	tcp_ddi_init();
	sctp_ddi_init();

	ip_poll_normal_ticks = MSEC_TO_TICK_ROUNDUP(ip_poll_normal_ms);

	if ((ip_kstat = kstat_create("ip", 0, "ipstat",
	    "net", KSTAT_TYPE_NAMED,
	    sizeof (ip_statistics) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL)) != NULL) {
		ip_kstat->ks_data = &ip_statistics;
		kstat_install(ip_kstat);
	}
	ip_kstat_init();
	ip6_kstat_init();
	icmp_kstat_init();
	ipsec_loader_start();
	tnet_init();
}

/*
 * Allocate and initialize a DLPI template of the specified length.  (May be
 * called as writer.)
5321 */ 5322 mblk_t * 5323 ip_dlpi_alloc(size_t len, t_uscalar_t prim) 5324 { 5325 mblk_t *mp; 5326 5327 mp = allocb(len, BPRI_MED); 5328 if (!mp) 5329 return (NULL); 5330 5331 /* 5332 * DLPIv2 says that DL_INFO_REQ and DL_TOKEN_REQ (the latter 5333 * of which we don't seem to use) are sent with M_PCPROTO, and 5334 * that other DLPI are M_PROTO. 5335 */ 5336 if (prim == DL_INFO_REQ) { 5337 mp->b_datap->db_type = M_PCPROTO; 5338 } else { 5339 mp->b_datap->db_type = M_PROTO; 5340 } 5341 5342 mp->b_wptr = mp->b_rptr + len; 5343 bzero(mp->b_rptr, len); 5344 ((dl_unitdata_req_t *)mp->b_rptr)->dl_primitive = prim; 5345 return (mp); 5346 } 5347 5348 const char * 5349 dlpi_prim_str(int prim) 5350 { 5351 switch (prim) { 5352 case DL_INFO_REQ: return ("DL_INFO_REQ"); 5353 case DL_INFO_ACK: return ("DL_INFO_ACK"); 5354 case DL_ATTACH_REQ: return ("DL_ATTACH_REQ"); 5355 case DL_DETACH_REQ: return ("DL_DETACH_REQ"); 5356 case DL_BIND_REQ: return ("DL_BIND_REQ"); 5357 case DL_BIND_ACK: return ("DL_BIND_ACK"); 5358 case DL_UNBIND_REQ: return ("DL_UNBIND_REQ"); 5359 case DL_OK_ACK: return ("DL_OK_ACK"); 5360 case DL_ERROR_ACK: return ("DL_ERROR_ACK"); 5361 case DL_ENABMULTI_REQ: return ("DL_ENABMULTI_REQ"); 5362 case DL_DISABMULTI_REQ: return ("DL_DISABMULTI_REQ"); 5363 case DL_PROMISCON_REQ: return ("DL_PROMISCON_REQ"); 5364 case DL_PROMISCOFF_REQ: return ("DL_PROMISCOFF_REQ"); 5365 case DL_UNITDATA_REQ: return ("DL_UNITDATA_REQ"); 5366 case DL_UNITDATA_IND: return ("DL_UNITDATA_IND"); 5367 case DL_UDERROR_IND: return ("DL_UDERROR_IND"); 5368 case DL_PHYS_ADDR_REQ: return ("DL_PHYS_ADDR_REQ"); 5369 case DL_PHYS_ADDR_ACK: return ("DL_PHYS_ADDR_ACK"); 5370 case DL_SET_PHYS_ADDR_REQ: return ("DL_SET_PHYS_ADDR_REQ"); 5371 case DL_NOTIFY_REQ: return ("DL_NOTIFY_REQ"); 5372 case DL_NOTIFY_ACK: return ("DL_NOTIFY_ACK"); 5373 case DL_NOTIFY_IND: return ("DL_NOTIFY_IND"); 5374 case DL_CAPABILITY_REQ: return ("DL_CAPABILITY_REQ"); 5375 case DL_CAPABILITY_ACK: return ("DL_CAPABILITY_ACK"); 
5376 case DL_CONTROL_REQ: return ("DL_CONTROL_REQ"); 5377 case DL_CONTROL_ACK: return ("DL_CONTROL_ACK"); 5378 default: return ("<unknown primitive>"); 5379 } 5380 } 5381 5382 const char * 5383 dlpi_err_str(int err) 5384 { 5385 switch (err) { 5386 case DL_ACCESS: return ("DL_ACCESS"); 5387 case DL_BADADDR: return ("DL_BADADDR"); 5388 case DL_BADCORR: return ("DL_BADCORR"); 5389 case DL_BADDATA: return ("DL_BADDATA"); 5390 case DL_BADPPA: return ("DL_BADPPA"); 5391 case DL_BADPRIM: return ("DL_BADPRIM"); 5392 case DL_BADQOSPARAM: return ("DL_BADQOSPARAM"); 5393 case DL_BADQOSTYPE: return ("DL_BADQOSTYPE"); 5394 case DL_BADSAP: return ("DL_BADSAP"); 5395 case DL_BADTOKEN: return ("DL_BADTOKEN"); 5396 case DL_BOUND: return ("DL_BOUND"); 5397 case DL_INITFAILED: return ("DL_INITFAILED"); 5398 case DL_NOADDR: return ("DL_NOADDR"); 5399 case DL_NOTINIT: return ("DL_NOTINIT"); 5400 case DL_OUTSTATE: return ("DL_OUTSTATE"); 5401 case DL_SYSERR: return ("DL_SYSERR"); 5402 case DL_UNSUPPORTED: return ("DL_UNSUPPORTED"); 5403 case DL_UNDELIVERABLE: return ("DL_UNDELIVERABLE"); 5404 case DL_NOTSUPPORTED : return ("DL_NOTSUPPORTED "); 5405 case DL_TOOMANY: return ("DL_TOOMANY"); 5406 case DL_NOTENAB: return ("DL_NOTENAB"); 5407 case DL_BUSY: return ("DL_BUSY"); 5408 case DL_NOAUTO: return ("DL_NOAUTO"); 5409 case DL_NOXIDAUTO: return ("DL_NOXIDAUTO"); 5410 case DL_NOTESTAUTO: return ("DL_NOTESTAUTO"); 5411 case DL_XIDAUTO: return ("DL_XIDAUTO"); 5412 case DL_TESTAUTO: return ("DL_TESTAUTO"); 5413 case DL_PENDING: return ("DL_PENDING"); 5414 default: return ("<unknown error>"); 5415 } 5416 } 5417 5418 /* 5419 * Debug formatting routine. Returns a character string representation of the 5420 * addr in buf, of the form xxx.xxx.xxx.xxx. This routine takes the address 5421 * in the form of a ipaddr_t and calls ip_dot_saddr with a pointer. 
 */
char *
ip_dot_addr(ipaddr_t addr, char *buf)
{
	return (ip_dot_saddr((uchar_t *)&addr, buf));
}

/*
 * Debug formatting routine.  Returns a character string representation of the
 * addr in buf, of the form xxx.xxx.xxx.xxx.  This routine takes the address
 * as a pointer.  The "xxx" parts including left zero padding so the final
 * string will fit easily in tables.  It would be nice to take a padding
 * length argument instead.
 */
static char *
ip_dot_saddr(uchar_t *addr, char *buf)
{
	(void) mi_sprintf(buf, "%03d.%03d.%03d.%03d",
	    addr[0] & 0xFF, addr[1] & 0xFF, addr[2] & 0xFF, addr[3] & 0xFF);
	return (buf);
}

/*
 * Send an ICMP error after patching up the packet appropriately.  Returns
 * non-zero if the appropriate MIB should be bumped; zero otherwise.
 * Consumes 'mp' (and its M_CTL, when mctl_present) on every path.
 */
static boolean_t
ip_fanout_send_icmp(queue_t *q, mblk_t *mp, uint_t flags,
    uint_t icmp_type, uint_t icmp_code, boolean_t mctl_present, zoneid_t zoneid)
{
	ipha_t *ipha;
	mblk_t *first_mp;
	boolean_t secure;
	unsigned char db_type;	/* only valid when !mctl_present */

	first_mp = mp;
	if (mctl_present) {
		mp = mp->b_cont;
		secure = ipsec_in_is_secure(first_mp);
		ASSERT(mp != NULL);
	} else {
		/*
		 * If this is an ICMP error being reported - which goes
		 * up as M_CTLs, we need to convert them to M_DATA till
		 * we finish checking with global policy because
		 * ipsec_check_global_policy() assumes M_DATA as clear
		 * and M_CTL as secure.
		 */
		db_type = DB_TYPE(mp);
		DB_TYPE(mp) = M_DATA;
		secure = B_FALSE;
	}
	/*
	 * We are generating an icmp error for some inbound packet.
	 * Called from all ip_fanout_(udp, tcp, proto) functions.
	 * Before we generate an error, check with global policy
	 * to see whether this is allowed to enter the system. As
	 * there is no "conn", we are checking with global policy.
	 */
	ipha = (ipha_t *)mp->b_rptr;
	if (secure || ipsec_inbound_v4_policy_present) {
		first_mp = ipsec_check_global_policy(first_mp, NULL,
		    ipha, NULL, mctl_present);
		/* Policy dropped the packet; nothing more to do. */
		if (first_mp == NULL)
			return (B_FALSE);
	}

	/* Restore the db_type temporarily forced to M_DATA above. */
	if (!mctl_present)
		DB_TYPE(mp) = db_type;

	if (flags & IP_FF_SEND_ICMP) {
		if (flags & IP_FF_HDR_COMPLETE) {
			if (ip_hdr_complete(ipha, zoneid)) {
				freemsg(first_mp);
				return (B_TRUE);
			}
		}
		if (flags & IP_FF_CKSUM) {
			/*
			 * Have to correct checksum since
			 * the packet might have been
			 * fragmented and the reassembly code in ip_rput
			 * does not restore the IP checksum.
			 */
			ipha->ipha_hdr_checksum = 0;
			ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
		}
		switch (icmp_type) {
		case ICMP_DEST_UNREACHABLE:
			icmp_unreachable(WR(q), first_mp, icmp_code);
			break;
		default:
			/* No other ICMP types are generated here. */
			freemsg(first_mp);
			break;
		}
	} else {
		freemsg(first_mp);
		return (B_FALSE);
	}

	return (B_TRUE);
}

/*
 * Used to send an ICMP error message when a packet is received for
 * a protocol that is not supported. The mblk passed as argument
 * is consumed by this function.
 */
void
ip_proto_not_sup(queue_t *q, mblk_t *ipsec_mp, uint_t flags, zoneid_t zoneid)
{
	mblk_t *mp;
	ipha_t *ipha;
	ill_t *ill;
	ipsec_in_t *ii;

	ii = (ipsec_in_t *)ipsec_mp->b_rptr;
	ASSERT(ii->ipsec_in_type == IPSEC_IN);

	/* Detach the data from the IPSEC_IN control mblk. */
	mp = ipsec_mp->b_cont;
	ipsec_mp->b_cont = NULL;
	ipha = (ipha_t *)mp->b_rptr;
	if (IPH_HDR_VERSION(ipha) == IP_VERSION) {
		if (ip_fanout_send_icmp(q, mp, flags, ICMP_DEST_UNREACHABLE,
		    ICMP_PROTOCOL_UNREACHABLE, B_FALSE, zoneid)) {
			BUMP_MIB(&ip_mib, ipInUnknownProtos);
		}
	} else {
		/* Get ill from index in ipsec_in_t. */
		ill = ill_lookup_on_ifindex(ii->ipsec_in_ill_index,
		    B_TRUE, NULL, NULL, NULL, NULL);
		if (ill != NULL) {
			if (ip_fanout_send_icmp_v6(q, mp, flags,
			    ICMP6_PARAM_PROB, ICMP6_PARAMPROB_NEXTHEADER,
			    0, B_FALSE, zoneid)) {
				BUMP_MIB(ill->ill_ip6_mib, ipv6InUnknownProtos);
			}

			ill_refrele(ill);
		} else { /* re-link for the freemsg() below. */
			ipsec_mp->b_cont = mp;
		}
	}

	/* If ICMP delivered, ipsec_mp will be a singleton (b_cont == NULL). */
	freemsg(ipsec_mp);
}

/*
 * See if the inbound datagram has had IPsec processing applied to it.
 * Loopback packets carry an explicit flag; otherwise the presence of an
 * AH/ESP SA or decapsulation state marks the packet as secure.
 */
boolean_t
ipsec_in_is_secure(mblk_t *ipsec_mp)
{
	ipsec_in_t *ii;

	ii = (ipsec_in_t *)ipsec_mp->b_rptr;
	ASSERT(ii->ipsec_in_type == IPSEC_IN);

	if (ii->ipsec_in_loopback) {
		return (ii->ipsec_in_secure);
	} else {
		return (ii->ipsec_in_ah_sa != NULL ||
		    ii->ipsec_in_esp_sa != NULL ||
		    ii->ipsec_in_decaps);
	}
}

/*
 * Handle protocols with which IP is less intimate.  There
 * can be more than one stream bound to a particular
 * protocol.  When this is the case, normally each one gets a copy
 * of any incoming packets.
 *
 * IPSEC NOTE :
 *
 * Don't allow a secure packet going up a non-secure connection.
 * We don't allow this because
 *
 * 1) Reply might go out in clear which will be dropped at
 *    the sending side.
 * 2) If the reply goes out in clear it will give the
 *    adversary enough information for getting the key in
 *    most of the cases.
 *
 * Moreover getting a secure packet when we expect clear
 * implies that SA's were added without checking for
 * policy on both ends. This should not happen once ISAKMP
 * is used to negotiate SAs as SAs will be added only after
 * verifying the policy.
 *
 * NOTE : If the packet was tunneled and not multicast we only send
 * to it the first match. Unlike TCP and UDP fanouts this doesn't fall
 * back to delivering packets to AF_INET6 raw sockets.
 *
 * IPQoS Notes:
 * Once we have determined the client, invoke IPPF processing.
 * Policy processing takes place only if the callout_position, IPP_LOCAL_IN,
 * is enabled. If we get here from icmp_inbound_error_fanout or ip_wput_local
 * ip_policy will be false.
 *
 * Zones notes:
 * Currently only applications in the global zone can create raw sockets for
 * protocols other than ICMP. So unlike the broadcast / multicast case of
 * ip_fanout_udp(), we only send a copy of the packet to streams in the
 * specified zone. For ICMP, this is handled by the callers of icmp_inbound().
 */
static void
ip_fanout_proto(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha, uint_t flags,
    boolean_t mctl_present, boolean_t ip_policy, ill_t *recv_ill,
    zoneid_t zoneid)
{
	queue_t		*rq;
	mblk_t		*mp1, *first_mp1;
	uint_t		protocol = ipha->ipha_protocol;
	ipaddr_t	dst;
	boolean_t	one_only;
	mblk_t		*first_mp = mp;	/* head: M_CTL if mctl_present */
	boolean_t	secure;
	uint32_t	ill_index;
	conn_t		*connp, *first_connp, *next_connp;
	connf_t		*connfp;
	boolean_t	shared_addr;

	if (mctl_present) {
		/* Data follows the ipsec_in M_CTL. */
		mp = first_mp->b_cont;
		secure = ipsec_in_is_secure(first_mp);
		ASSERT(mp != NULL);
	} else {
		secure = B_FALSE;
	}
	dst = ipha->ipha_dst;
	/*
	 * If the packet was tunneled and not multicast we only send to it
	 * the first match.
	 */
	one_only = ((protocol == IPPROTO_ENCAP || protocol == IPPROTO_IPV6) &&
	    !CLASSD(dst));

	shared_addr = (zoneid == ALL_ZONES);
	if (shared_addr) {
		/*
		 * We don't allow multilevel ports for raw IP, so no need to
		 * check for that here.
		 */
		zoneid = tsol_packet_to_zoneid(mp);
	}

	connfp = &ipcl_proto_fanout[protocol];
	mutex_enter(&connfp->connf_lock);
	/*
	 * NOTE(review): this assignment is redundant — the for initializer
	 * immediately below repeats it.
	 */
	connp = connfp->connf_head;
	for (connp = connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		if (IPCL_PROTO_MATCH(connp, protocol, ipha, ill, flags,
		    zoneid) &&
		    (!is_system_labeled() ||
		    tsol_receive_local(mp, &dst, IPV4_VERSION, shared_addr,
		    connp)))
			break;
	}

	if (connp == NULL || connp->conn_upq == NULL) {
		/*
		 * No one bound to these addresses. Is
		 * there a client that wants all
		 * unclaimed datagrams?
		 */
		mutex_exit(&connfp->connf_lock);
		/*
		 * Check for IPPROTO_ENCAP...
		 */
		if (protocol == IPPROTO_ENCAP && ip_g_mrouter) {
			/*
			 * XXX If an IPsec mblk is here on a multicast
			 * tunnel (using ip_mroute stuff), what should
			 * I do?
			 *
			 * For now, just free the IPsec mblk before
			 * passing it up to the multicast routing
			 * stuff.
			 *
			 * BTW,  If I match a configured IP-in-IP
			 * tunnel, ip_mroute_decap will never be
			 * called.
			 */
			if (mp != first_mp)
				freeb(first_mp);
			ip_mroute_decap(q, mp);
		} else {
			/*
			 * Otherwise send an ICMP protocol unreachable.
			 */
			if (ip_fanout_send_icmp(q, first_mp, flags,
			    ICMP_DEST_UNREACHABLE, ICMP_PROTOCOL_UNREACHABLE,
			    mctl_present, zoneid)) {
				BUMP_MIB(&ip_mib, ipInUnknownProtos);
			}
		}
		return;
	}
	/* Hold the first match; it receives the original (sent last, below). */
	CONN_INC_REF(connp);
	first_connp = connp;

	/*
	 * Only send message to one tunnel driver by immediately
	 * terminating the loop.
	 */
	connp = one_only ? NULL : connp->conn_next;

	/*
	 * Fan out copies to every additional matching conn. The fanout
	 * lock is dropped around putnext() and re-taken each iteration;
	 * the conn ref held across that window keeps connp live.
	 */
	for (;;) {
		while (connp != NULL) {
			if (IPCL_PROTO_MATCH(connp, protocol, ipha, ill,
			    flags, zoneid) &&
			    (!is_system_labeled() ||
			    tsol_receive_local(mp, &dst, IPV4_VERSION,
			    shared_addr, connp)))
				break;
			connp = connp->conn_next;
		}

		/*
		 * Copy the packet.
		 */
		if (connp == NULL || connp->conn_upq == NULL ||
		    (((first_mp1 = dupmsg(first_mp)) == NULL) &&
		    ((first_mp1 = ip_copymsg(first_mp)) == NULL))) {
			/*
			 * No more interested clients or memory
			 * allocation failed
			 */
			connp = first_connp;
			break;
		}
		mp1 = mctl_present ? first_mp1->b_cont : first_mp1;
		CONN_INC_REF(connp);
		mutex_exit(&connfp->connf_lock);
		rq = connp->conn_rq;
		if (!canputnext(rq)) {
			/* Stream is flow-controlled: count and drop copy. */
			if (flags & IP_FF_RAWIP) {
				BUMP_MIB(&ip_mib, rawipInOverflows);
			} else {
				BUMP_MIB(&icmp_mib, icmpInOverflows);
			}

			freemsg(first_mp1);
		} else {
			if (CONN_INBOUND_POLICY_PRESENT(connp) || secure) {
				first_mp1 = ipsec_check_inbound_policy
				    (first_mp1, connp, ipha, NULL,
				    mctl_present);
			}
			if (first_mp1 != NULL) {
				/*
				 * ip_fanout_proto also gets called from
				 * icmp_inbound_error_fanout, in which case
				 * the msg type is M_CTL.  Don't add info
				 * in this case for the time being. In future
				 * when there is a need for knowing the
				 * inbound iface index for ICMP error msgs,
				 * then this can be changed.
				 */
				if ((connp->conn_recvif != 0) &&
				    (mp->b_datap->db_type != M_CTL)) {
					/*
					 * the actual data will be
					 * contained in b_cont upon
					 * successful return of the
					 * following call else
					 * original mblk is returned
					 */
					ASSERT(recv_ill != NULL);
					mp1 = ip_add_info(mp1, recv_ill,
					    IPF_RECVIF);
				}
				BUMP_MIB(&ip_mib, ipInDelivers);
				/* Strip the M_CTL; only data goes upstream. */
				if (mctl_present)
					freeb(first_mp1);
				putnext(rq, mp1);
			}
		}
		mutex_enter(&connfp->connf_lock);
		/* Follow the next pointer before releasing the conn. */
		next_connp = connp->conn_next;
		CONN_DEC_REF(connp);
		connp = next_connp;
	}

	/* Last one.  Send it upstream. */
	mutex_exit(&connfp->connf_lock);

	/*
	 * If this packet is coming from icmp_inbound_error_fanout ip_policy
	 * will be set to false.
	 */
	if (IPP_ENABLED(IPP_LOCAL_IN) && ip_policy) {
		ill_index = ill->ill_phyint->phyint_ifindex;
		ip_process(IPP_LOCAL_IN, &mp, ill_index);
		if (mp == NULL) {
			/* Packet dropped by IPQoS; release and bail. */
			CONN_DEC_REF(connp);
			if (mctl_present) {
				freeb(first_mp);
			}
			return;
		}
	}

	rq = connp->conn_rq;
	if (!canputnext(rq)) {
		if (flags & IP_FF_RAWIP) {
			BUMP_MIB(&ip_mib, rawipInOverflows);
		} else {
			BUMP_MIB(&icmp_mib, icmpInOverflows);
		}

		freemsg(first_mp);
	} else {
		if (CONN_INBOUND_POLICY_PRESENT(connp) || secure) {
			first_mp = ipsec_check_inbound_policy(first_mp, connp,
			    ipha, NULL, mctl_present);
		}
		if (first_mp != NULL) {
			/*
			 * ip_fanout_proto also gets called
			 * from icmp_inbound_error_fanout, in
			 * which case the msg type is M_CTL.
			 * Don't add info in this case for time
			 * being. In future when there is a
			 * need for knowing the inbound iface
			 * index for ICMP error msgs, then this
			 * can be changed
			 */
			if ((connp->conn_recvif != 0) &&
			    (mp->b_datap->db_type != M_CTL)) {
				/*
				 * the actual data will be contained in
				 * b_cont upon successful return
				 * of the following call else original
				 * mblk is returned
				 */
				ASSERT(recv_ill != NULL);
				mp = ip_add_info(mp, recv_ill, IPF_RECVIF);
			}
			BUMP_MIB(&ip_mib, ipInDelivers);
			putnext(rq, mp);
			if (mctl_present)
				freeb(first_mp);
		}
	}
	CONN_DEC_REF(connp);
}

/*
 * Fanout for TCP packets
 * The caller puts <fport, lport> in the ports parameter.
 *
 * IPQoS Notes
 * Before sending it to the client, invoke IPPF processing.
 * Policy processing takes place only if the callout_position, IPP_LOCAL_IN,
 * is enabled. If we get here from icmp_inbound_error_fanout or ip_wput_local
 * ip_policy is false.
 */
static void
ip_fanout_tcp(queue_t *q, mblk_t *mp, ill_t *recv_ill, ipha_t *ipha,
    uint_t flags, boolean_t mctl_present, boolean_t ip_policy, zoneid_t zoneid)
{
	mblk_t		*first_mp;
	boolean_t	secure;
	uint32_t	ill_index;
	int		ip_hdr_len;
	tcph_t		*tcph;
	boolean_t	syn_present = B_FALSE;
	conn_t		*connp;

	first_mp = mp;
	if (mctl_present) {
		ASSERT(first_mp->b_datap->db_type == M_CTL);
		mp = first_mp->b_cont;
		secure = ipsec_in_is_secure(first_mp);
		ASSERT(mp != NULL);
	} else {
		secure = B_FALSE;
	}

	ip_hdr_len = IPH_HDR_LENGTH(mp->b_rptr);

	if ((connp = ipcl_classify_v4(mp, IPPROTO_TCP, ip_hdr_len, zoneid)) ==
	    NULL) {
		/*
		 * No connected connection or listener. Send a
		 * TH_RST via tcp_xmit_listeners_reset.
		 */

		/* Initiate IPPf processing, if needed. */
		if (IPP_ENABLED(IPP_LOCAL_IN)) {
			uint32_t ill_index;
			ill_index = recv_ill->ill_phyint->phyint_ifindex;
			ip_process(IPP_LOCAL_IN, &first_mp, ill_index);
			if (first_mp == NULL)
				return;
		}
		BUMP_MIB(&ip_mib, ipInDelivers);
		ip2dbg(("ip_fanout_tcp: no listener; send reset to zone %d\n",
		    zoneid));
		tcp_xmit_listeners_reset(first_mp, ip_hdr_len);
		return;
	}

	/*
	 * Allocate the SYN for the TCP connection here itself
	 */
	tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len];
	if ((tcph->th_flags[0] & (TH_SYN|TH_ACK|TH_RST|TH_URG)) == TH_SYN) {
		if (IPCL_IS_TCP(connp)) {
			squeue_t *sqp;

			/*
			 * For fused tcp loopback, assign the eager's
			 * squeue to be that of the active connect's.
			 * Note that we don't check for IP_FF_LOOPBACK
			 * here since this routine gets called only
			 * for loopback (unlike the IPv6 counterpart).
			 */
			ASSERT(Q_TO_CONN(q) != NULL);
			if (do_tcp_fusion &&
			    !CONN_INBOUND_POLICY_PRESENT(connp) && !secure &&
			    !IPP_ENABLED(IPP_LOCAL_IN) && !ip_policy &&
			    IPCL_IS_TCP(Q_TO_CONN(q))) {
				ASSERT(Q_TO_CONN(q)->conn_sqp != NULL);
				sqp = Q_TO_CONN(q)->conn_sqp;
			} else {
				sqp = IP_SQUEUE_GET(lbolt);
			}

			/*
			 * Mark the mblk as an eager SYN and stash the
			 * chosen squeue pointer in the checksum-start
			 * field for tcp to pick up.
			 */
			mp->b_datap->db_struioflag |= STRUIO_EAGER;
			DB_CKSUMSTART(mp) = (intptr_t)sqp;
			syn_present = B_TRUE;
		}
	}

	if (IPCL_IS_TCP(connp) && IPCL_IS_BOUND(connp) && !syn_present) {
		/*
		 * Non-SYN segment for a bound (listener) tcp conn:
		 * RST/URG are dropped, a bare ACK draws a reset, and
		 * anything else is dropped as well.
		 */
		uint_t	flags = (unsigned int)tcph->th_flags[0] & 0xFF;
		if ((flags & TH_RST) || (flags & TH_URG)) {
			CONN_DEC_REF(connp);
			freemsg(first_mp);
			return;
		}
		if (flags & TH_ACK) {
			tcp_xmit_listeners_reset(first_mp, ip_hdr_len);
			CONN_DEC_REF(connp);
			return;
		}

		CONN_DEC_REF(connp);
		freemsg(first_mp);
		return;
	}

	if (CONN_INBOUND_POLICY_PRESENT(connp) || secure) {
		first_mp = ipsec_check_inbound_policy(first_mp, connp, ipha,
		    NULL, mctl_present);
		if (first_mp == NULL) {
			CONN_DEC_REF(connp);
			return;
		}
		if (IPCL_IS_TCP(connp) && IPCL_IS_BOUND(connp)) {
			ASSERT(syn_present);
			if (mctl_present) {
				ASSERT(first_mp != mp);
				first_mp->b_datap->db_struioflag |=
				    STRUIO_POLICY;
			} else {
				ASSERT(first_mp == mp);
				mp->b_datap->db_struioflag &=
				    ~STRUIO_EAGER;
				mp->b_datap->db_struioflag |=
				    STRUIO_POLICY;
			}
		} else {
			/*
			 * Discard first_mp early since we're dealing with a
			 * fully-connected conn_t and tcp doesn't do policy in
			 * this case.
			 */
			if (mctl_present) {
				freeb(first_mp);
				mctl_present = B_FALSE;
			}
			first_mp = mp;
		}
	}

	/*
	 * Initiate policy processing here if needed. If we get here from
	 * icmp_inbound_error_fanout, ip_policy is false.
	 */
	if (IPP_ENABLED(IPP_LOCAL_IN) && ip_policy) {
		ill_index = recv_ill->ill_phyint->phyint_ifindex;
		ip_process(IPP_LOCAL_IN, &mp, ill_index);
		if (mp == NULL) {
			CONN_DEC_REF(connp);
			if (mctl_present)
				freeb(first_mp);
			return;
		} else if (mctl_present) {
			/* ip_process may replace mp; re-link under M_CTL. */
			ASSERT(first_mp != mp);
			first_mp->b_cont = mp;
		} else {
			first_mp = mp;
		}
	}



	/* Handle IPv6 socket options. */
	if (!syn_present &&
	    connp->conn_ipv6_recvpktinfo && (flags & IP_FF_IP6INFO)) {
		/* Add header */
		ASSERT(recv_ill != NULL);
		mp = ip_add_info(mp, recv_ill, IPF_RECVIF);
		if (mp == NULL) {
			CONN_DEC_REF(connp);
			if (mctl_present)
				freeb(first_mp);
			return;
		} else if (mctl_present) {
			/*
			 * ip_add_info might return a new mp.
			 */
			ASSERT(first_mp != mp);
			first_mp->b_cont = mp;
		} else {
			first_mp = mp;
		}
	}

	BUMP_MIB(&ip_mib, ipInDelivers);
	if (IPCL_IS_TCP(connp)) {
		/*
		 * Hand off to tcp via the squeue; the squeue framework
		 * takes over the conn reference.
		 */
		(*ip_input_proc)(connp->conn_sqp, first_mp,
		    connp->conn_recv, connp, SQTAG_IP_FANOUT_TCP);
	} else {
		putnext(connp->conn_rq, first_mp);
		CONN_DEC_REF(connp);
	}
}

/*
 * Deliver a udp packet to the given conn, possibly applying ipsec policy.
 * We are responsible for disposing of mp, such as by freemsg() or putnext()
 * Caller is responsible for dropping references to the conn, and freeing
 * first_mp.
 *
 * IPQoS Notes
 * Before sending it to the client, invoke IPPF processing. Policy processing
 * takes place only if the callout_position, IPP_LOCAL_IN, is enabled and
 * ip_policy is true. If we get here from icmp_inbound_error_fanout or
 * ip_wput_local, ip_policy is false.
 *
 * first_mp == NULL means no attached ipsec_in M_CTL (mctl_present below).
 */
static void
ip_fanout_udp_conn(conn_t *connp, mblk_t *first_mp, mblk_t *mp,
    boolean_t secure, ipha_t *ipha, uint_t flags, ill_t *recv_ill,
    boolean_t ip_policy)
{
	boolean_t	mctl_present = (first_mp != NULL);
	uint32_t	in_flags = 0;	/* set to IP_RECVSLLA and/or IP_RECVIF */
	uint32_t	ill_index;

	if (mctl_present)
		first_mp->b_cont = mp;
	else
		first_mp = mp;

	if (CONN_UDP_FLOWCTLD(connp)) {
		/* Receiver is flow-controlled; count the drop. */
		BUMP_MIB(&ip_mib, udpInOverflows);
		freemsg(first_mp);
		return;
	}

	if (CONN_INBOUND_POLICY_PRESENT(connp) || secure) {
		first_mp = ipsec_check_inbound_policy(first_mp, connp, ipha,
		    NULL, mctl_present);
		if (first_mp == NULL)
			return;	/* Freed by ipsec_check_inbound_policy(). */
	}
	/* Policy passed: the ipsec_in M_CTL is no longer needed. */
	if (mctl_present)
		freeb(first_mp);

	if (connp->conn_recvif)
		in_flags = IPF_RECVIF;
	if (connp->conn_recvslla && !(flags & IP_FF_SEND_SLLA))
		in_flags |= IPF_RECVSLLA;

	/* Handle IPv6 options. */
	if (connp->conn_ipv6_recvpktinfo && (flags & IP_FF_IP6INFO))
		in_flags |= IPF_RECVIF;

	/*
	 * Initiate IPPF processing here, if needed. Note first_mp won't be
	 * freed if the packet is dropped. The caller will do so.
	 *
	 * NOTE(review): when mctl_present, first_mp was already freeb()'d
	 * above — this comment appears to apply only to the !mctl case
	 * where first_mp == mp; confirm against the callers.
	 */
	if (IPP_ENABLED(IPP_LOCAL_IN) && ip_policy) {
		ill_index = recv_ill->ill_phyint->phyint_ifindex;
		ip_process(IPP_LOCAL_IN, &mp, ill_index);
		if (mp == NULL) {
			return;
		}
	}
	if ((in_flags != 0) &&
	    (mp->b_datap->db_type != M_CTL)) {
		/*
		 * The actual data will be contained in b_cont
		 * upon successful return of the following call
		 * else original mblk is returned
		 */
		ASSERT(recv_ill != NULL);
		mp = ip_add_info(mp, recv_ill, in_flags);
	}
	BUMP_MIB(&ip_mib, ipInDelivers);

	/* Send it upstream */
	CONN_UDP_RECV(connp, mp);
}

/*
 * Fanout for UDP packets.
 * The caller puts <fport, lport> in the ports parameter.
 *
 * If SO_REUSEADDR is set all multicast and broadcast packets
 * will be delivered to all streams bound to the same port.
 *
 * Zones notes:
 * Multicast and broadcast packets will be distributed to streams in all zones.
 * In the special case where an AF_INET socket binds to 0.0.0.0/<port> and an
 * AF_INET6 socket binds to ::/<port>, only the AF_INET socket receives the IPv4
 * packets. To maintain this behavior with multiple zones, the conns are grouped
 * by zone and the SO_REUSEADDR flag is checked for the first matching conn in
 * each zone. If unset, all the following conns in the same zone are skipped.
 */
static void
ip_fanout_udp(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha,
    uint32_t ports, boolean_t broadcast, uint_t flags, boolean_t mctl_present,
    boolean_t ip_policy, ill_t *recv_ill, zoneid_t zoneid)
{
	uint32_t	dstport, srcport;
	ipaddr_t	dst;
	mblk_t		*first_mp;
	boolean_t	secure;
	in6_addr_t	v6src;
	conn_t		*connp;
	connf_t		*connfp;
	conn_t		*first_connp;
	conn_t		*next_connp;
	mblk_t		*mp1, *first_mp1;
	ipaddr_t	src;
	zoneid_t	last_zoneid;
	boolean_t	reuseaddr;
	boolean_t	shared_addr;

	first_mp = mp;
	if (mctl_present) {
		/*
		 * Detach the ipsec_in M_CTL from the data; from here on
		 * first_mp is the (detached) M_CTL, mp is the data.
		 */
		mp = first_mp->b_cont;
		first_mp->b_cont = NULL;
		secure = ipsec_in_is_secure(first_mp);
		ASSERT(mp != NULL);
	} else {
		/* first_mp == NULL signals "no M_CTL" to the helpers. */
		first_mp = NULL;
		secure = B_FALSE;
	}

	/* Extract ports in net byte order */
	dstport = htons(ntohl(ports) & 0xFFFF);
	srcport = htons(ntohl(ports) >> 16);
	dst = ipha->ipha_dst;
	src = ipha->ipha_src;

	shared_addr = (zoneid == ALL_ZONES);
	if (shared_addr) {
		/* Trusted Extensions: resolve which zone owns the MLP/port. */
		zoneid = tsol_mlp_findzone(IPPROTO_UDP, dstport);
		if (zoneid == ALL_ZONES)
			zoneid = tsol_packet_to_zoneid(mp);
	}

	connfp = &ipcl_udp_fanout[IPCL_UDP_HASH(dstport)];
	mutex_enter(&connfp->connf_lock);
	connp = connfp->connf_head;
	if (!broadcast && !CLASSD(dst)) {
		/*
		 * Not broadcast or multicast. Send to the one (first)
		 * client we find. No need to check conn_wantpacket()
		 * since IP_BOUND_IF/conn_incoming_ill does not apply to
		 * IPv4 unicast packets.
		 */
		while ((connp != NULL) &&
		    (!IPCL_UDP_MATCH(connp, dstport, dst,
		    srcport, src) || connp->conn_zoneid != zoneid)) {
			connp = connp->conn_next;
		}

		if (connp == NULL || connp->conn_upq == NULL)
			goto notfound;

		if (is_system_labeled() &&
		    !tsol_receive_local(mp, &dst, IPV4_VERSION, shared_addr,
		    connp))
			goto notfound;

		CONN_INC_REF(connp);
		mutex_exit(&connfp->connf_lock);
		ip_fanout_udp_conn(connp, first_mp, mp, secure, ipha, flags,
		    recv_ill, ip_policy);
		IP_STAT(ip_udp_fannorm);
		CONN_DEC_REF(connp);
		return;
	}

	/*
	 * Broadcast and multicast case
	 *
	 * Need to check conn_wantpacket().
	 * If SO_REUSEADDR has been set on the first we send the
	 * packet to all clients that have joined the group and
	 * match the port.
	 */

	while (connp != NULL) {
		if ((IPCL_UDP_MATCH(connp, dstport, dst, srcport, src)) &&
		    conn_wantpacket(connp, ill, ipha, flags, zoneid) &&
		    (!is_system_labeled() ||
		    tsol_receive_local(mp, &dst, IPV4_VERSION, shared_addr,
		    connp)))
			break;
		connp = connp->conn_next;
	}

	if (connp == NULL || connp->conn_upq == NULL)
		goto notfound;

	first_connp = connp;
	/*
	 * When SO_REUSEADDR is not set, send the packet only to the first
	 * matching connection in its zone by keeping track of the zoneid.
	 */
	reuseaddr = first_connp->conn_reuseaddr;
	last_zoneid = first_connp->conn_zoneid;

	CONN_INC_REF(connp);
	connp = connp->conn_next;
	/*
	 * Deliver copies to every further matching conn; the original
	 * message goes to first_connp last (after the loop). The fanout
	 * lock is dropped around each delivery; the held conn ref keeps
	 * connp valid across that window.
	 */
	for (;;) {
		while (connp != NULL) {
			if (IPCL_UDP_MATCH(connp, dstport, dst, srcport, src) &&
			    (reuseaddr || connp->conn_zoneid != last_zoneid) &&
			    conn_wantpacket(connp, ill, ipha, flags, zoneid) &&
			    (!is_system_labeled() ||
			    tsol_receive_local(mp, &dst, IPV4_VERSION,
			    shared_addr, connp)))
				break;
			connp = connp->conn_next;
		}
		/*
		 * Just copy the data part alone. The mctl part is
		 * needed just for verifying policy and it is never
		 * sent up.
		 */
		if (connp == NULL || (((mp1 = dupmsg(mp)) == NULL) &&
		    ((mp1 = copymsg(mp)) == NULL))) {
			/*
			 * No more interested clients or memory
			 * allocation failed
			 */
			connp = first_connp;
			break;
		}
		if (connp->conn_zoneid != last_zoneid) {
			/*
			 * Update the zoneid so that the packet isn't sent to
			 * any more conns in the same zone unless SO_REUSEADDR
			 * is set.
			 */
			reuseaddr = connp->conn_reuseaddr;
			last_zoneid = connp->conn_zoneid;
		}
		if (first_mp != NULL) {
			/* Clone the ipsec_in M_CTL for this copy. */
			ASSERT(((ipsec_info_t *)first_mp->b_rptr)->
			    ipsec_info_type == IPSEC_IN);
			first_mp1 = ipsec_in_tag(first_mp, NULL);
			if (first_mp1 == NULL) {
				freemsg(mp1);
				connp = first_connp;
				break;
			}
		} else {
			first_mp1 = NULL;
		}
		CONN_INC_REF(connp);
		mutex_exit(&connfp->connf_lock);
		/*
		 * IPQoS notes: We don't send the packet for policy
		 * processing here, will do it for the last one (below).
		 * i.e. we do it per-packet now, but if we do policy
		 * processing per-conn, then we would need to do it
		 * here too.
		 */
		ip_fanout_udp_conn(connp, first_mp1, mp1, secure,
		    ipha, flags, recv_ill, B_FALSE);
		mutex_enter(&connfp->connf_lock);
		/* Follow the next pointer before releasing the conn. */
		next_connp = connp->conn_next;
		IP_STAT(ip_udp_fanmb);
		CONN_DEC_REF(connp);
		connp = next_connp;
	}

	/* Last one.  Send it upstream. */
	mutex_exit(&connfp->connf_lock);
	ip_fanout_udp_conn(connp, first_mp, mp, secure, ipha, flags, recv_ill,
	    ip_policy);
	IP_STAT(ip_udp_fanmb);
	CONN_DEC_REF(connp);
	return;

notfound:

	mutex_exit(&connfp->connf_lock);
	IP_STAT(ip_udp_fanothers);
	/*
	 * IPv6 endpoints bound to unicast or multicast IPv4-mapped addresses
	 * have already been matched above, since they live in the IPv4
	 * fanout tables. This implies we only need to
	 * check for IPv6 in6addr_any endpoints here.
	 * Thus we compare using ipv6_all_zeros instead of the destination
	 * address, except for the multicast group membership lookup which
	 * uses the IPv4 destination.
	 */
	IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src);
	connfp = &ipcl_udp_fanout[IPCL_UDP_HASH(dstport)];
	mutex_enter(&connfp->connf_lock);
	connp = connfp->connf_head;
	if (!broadcast && !CLASSD(dst)) {
		while (connp != NULL) {
			if (IPCL_UDP_MATCH_V6(connp, dstport, ipv6_all_zeros,
			    srcport, v6src) && connp->conn_zoneid == zoneid &&
			    conn_wantpacket(connp, ill, ipha, flags, zoneid) &&
			    !connp->conn_ipv6_v6only)
				break;
			connp = connp->conn_next;
		}

		if (connp != NULL && is_system_labeled() &&
		    !tsol_receive_local(mp, &dst, IPV4_VERSION, shared_addr,
		    connp))
			connp = NULL;

		if (connp == NULL || connp->conn_upq == NULL) {
			/*
			 * No one bound to this port. Is
			 * there a client that wants all
			 * unclaimed datagrams?
			 */
			mutex_exit(&connfp->connf_lock);

			/* Re-link M_CTL and data before handing off. */
			if (mctl_present)
				first_mp->b_cont = mp;
			else
				first_mp = mp;
			if (ipcl_proto_search(IPPROTO_UDP) != NULL) {
				/* A raw-IP listener gets the datagram. */
				ip_fanout_proto(q, first_mp, ill, ipha,
				    flags | IP_FF_RAWIP, mctl_present,
				    ip_policy, recv_ill, zoneid);
			} else {
				if (ip_fanout_send_icmp(q, first_mp, flags,
				    ICMP_DEST_UNREACHABLE,
				    ICMP_PORT_UNREACHABLE,
				    mctl_present, zoneid)) {
					BUMP_MIB(&ip_mib, udpNoPorts);
				}
			}
			return;
		}

		CONN_INC_REF(connp);
		mutex_exit(&connfp->connf_lock);
		ip_fanout_udp_conn(connp, first_mp, mp, secure, ipha, flags,
		    recv_ill, ip_policy);
		CONN_DEC_REF(connp);
		return;
	}
	/*
	 * IPv4 multicast packet being delivered to an AF_INET6
	 * in6addr_any endpoint.
	 * Need to check conn_wantpacket(). Note that we use conn_wantpacket()
	 * and not conn_wantpacket_v6() since any multicast membership is
	 * for an IPv4-mapped multicast address.
	 * The packet is sent to all clients in all zones that have joined the
	 * group and match the port.
	 */
	while (connp != NULL) {
		if (IPCL_UDP_MATCH_V6(connp, dstport, ipv6_all_zeros,
		    srcport, v6src) &&
		    conn_wantpacket(connp, ill, ipha, flags, zoneid) &&
		    (!is_system_labeled() ||
		    tsol_receive_local(mp, &dst, IPV4_VERSION, shared_addr,
		    connp)))
			break;
		connp = connp->conn_next;
	}

	if (connp == NULL || connp->conn_upq == NULL) {
		/*
		 * No one bound to this port. Is
		 * there a client that wants all
		 * unclaimed datagrams?
		 */
		mutex_exit(&connfp->connf_lock);

		if (mctl_present)
			first_mp->b_cont = mp;
		else
			first_mp = mp;
		if (ipcl_proto_search(IPPROTO_UDP) != NULL) {
			ip_fanout_proto(q, first_mp, ill, ipha,
			    flags | IP_FF_RAWIP, mctl_present, ip_policy,
			    recv_ill, zoneid);
		} else {
			/*
			 * We used to attempt to send an icmp error here, but
			 * since this is known to be a multicast packet
			 * and we don't send icmp errors in response to
			 * multicast, just drop the packet and give up sooner.
			 */
			BUMP_MIB(&ip_mib, udpNoPorts);
			freemsg(first_mp);
		}
		return;
	}

	first_connp = connp;

	CONN_INC_REF(connp);
	connp = connp->conn_next;
	/* Same copy-per-conn fanout as the IPv4 multicast loop above. */
	for (;;) {
		while (connp != NULL) {
			if (IPCL_UDP_MATCH_V6(connp, dstport,
			    ipv6_all_zeros, srcport, v6src) &&
			    conn_wantpacket(connp, ill, ipha, flags, zoneid) &&
			    (!is_system_labeled() ||
			    tsol_receive_local(mp, &dst, IPV4_VERSION,
			    shared_addr, connp)))
				break;
			connp = connp->conn_next;
		}
		/*
		 * Just copy the data part alone. The mctl part is
		 * needed just for verifying policy and it is never
		 * sent up.
		 */
		if (connp == NULL || (((mp1 = dupmsg(mp)) == NULL) &&
		    ((mp1 = copymsg(mp)) == NULL))) {
			/*
			 * No more interested clients or memory
			 * allocation failed
			 */
			connp = first_connp;
			break;
		}
		if (first_mp != NULL) {
			ASSERT(((ipsec_info_t *)first_mp->b_rptr)->
			    ipsec_info_type == IPSEC_IN);
			first_mp1 = ipsec_in_tag(first_mp, NULL);
			if (first_mp1 == NULL) {
				freemsg(mp1);
				connp = first_connp;
				break;
			}
		} else {
			first_mp1 = NULL;
		}
		CONN_INC_REF(connp);
		mutex_exit(&connfp->connf_lock);
		/*
		 * IPQoS notes: We don't send the packet for policy
		 * processing here, will do it for the last one (below).
		 * i.e. we do it per-packet now, but if we do policy
		 * processing per-conn, then we would need to do it
		 * here too.
		 */
		ip_fanout_udp_conn(connp, first_mp1, mp1, secure,
		    ipha, flags, recv_ill, B_FALSE);
		mutex_enter(&connfp->connf_lock);
		/* Follow the next pointer before releasing the conn. */
		next_connp = connp->conn_next;
		CONN_DEC_REF(connp);
		connp = next_connp;
	}

	/* Last one.  Send it upstream. */
	mutex_exit(&connfp->connf_lock);
	ip_fanout_udp_conn(connp, first_mp, mp, secure, ipha, flags, recv_ill,
	    ip_policy);
	CONN_DEC_REF(connp);
}

/*
 * Complete the ip_wput header so that it
 * is possible to generate ICMP
 * errors.
 *
 * Fills in a source address (a local IRE's address) when the header has
 * none, resets the TTL to the default, and recomputes the header checksum.
 * Returns 0 on success, 1 if no local source address could be found.
 */
static int
ip_hdr_complete(ipha_t *ipha, zoneid_t zoneid)
{
	ire_t *ire;

	if (ipha->ipha_src == INADDR_ANY) {
		ire = ire_lookup_local(zoneid);
		if (ire == NULL) {
			ip1dbg(("ip_hdr_complete: no source IRE\n"));
			return (1);
		}
		ipha->ipha_src = ire->ire_addr;
		ire_refrele(ire);
	}
	ipha->ipha_ttl = ip_def_ttl;
	ipha->ipha_hdr_checksum = 0;
	ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
	return (0);
}

/*
 * Nobody should be sending
 * packets up this stream
 *
 * Read-side put procedure for the lower stream: turn M_FLUSH around,
 * sanitize b_prev/b_next on anything else, and discard it.
 */
static void
ip_lrput(queue_t *q, mblk_t *mp)
{
	mblk_t *mp1;

	switch (mp->b_datap->db_type) {
	case M_FLUSH:
		/* Turn around */
		if (*mp->b_rptr & FLUSHW) {
			*mp->b_rptr &= ~FLUSHR;
			qreply(q, mp);
			return;
		}
		break;
	}
	/* Could receive messages that passed through ar_rput */
	for (mp1 = mp; mp1; mp1 = mp1->b_cont)
		mp1->b_prev = mp1->b_next = NULL;
	freemsg(mp);
}

/* Nobody should be sending packets down this stream */
/* ARGSUSED */
void
ip_lwput(queue_t *q, mblk_t *mp)
{
	freemsg(mp);
}

/*
 * Move the first hop in any source route to ipha_dst
 * and remove that part of
 * the source route. Called by other protocols. Errors in option formatting
 * are ignored - will be handled by ip_wput_options Return the final
 * destination (either ipha_dst or the last entry in a source route.)
 */
ipaddr_t
ip_massage_options(ipha_t *ipha)
{
	ipoptp_t	opts;
	uchar_t		*opt;
	uint8_t		optval;
	uint8_t		optlen;
	ipaddr_t	dst;
	int		i;
	ire_t		*ire;

	ip2dbg(("ip_massage_options\n"));
	dst = ipha->ipha_dst;
	/* Walk all IP options; only SSRR/LSRR are acted upon. */
	for (optval = ipoptp_first(&opts, ipha);
	    optval != IPOPT_EOL;
	    optval = ipoptp_next(&opts)) {
		opt = opts.ipoptp_cur;
		switch (optval) {
			uint8_t off;
		case IPOPT_SSRR:
		case IPOPT_LSRR:
			if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
				ip1dbg(("ip_massage_options: bad src route\n"));
				break;
			}
			optlen = opts.ipoptp_len;
			/* Option offset is 1-based; make it 0-based. */
			off = opt[IPOPT_OFFSET];
			off--;
		redo_srr:
			if (optlen < IP_ADDR_LEN ||
			    off > optlen - IP_ADDR_LEN) {
				/* End of source route */
				ip1dbg(("ip_massage_options: end of SR\n"));
				break;
			}
			bcopy((char *)opt + off, &dst, IP_ADDR_LEN);
			ip1dbg(("ip_massage_options: next hop 0x%x\n",
			    ntohl(dst)));
			/*
			 * Check if our address is present more than
			 * once as consecutive hops in source route.
			 * XXX verify per-interface ip_forwarding
			 * for source route?
			 */
			ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, NULL,
			    ALL_ZONES, NULL, MATCH_IRE_TYPE);
			if (ire != NULL) {
				/* Hop is one of our own addresses: skip it. */
				ire_refrele(ire);
				off += IP_ADDR_LEN;
				goto redo_srr;
			}
			if (dst == htonl(INADDR_LOOPBACK)) {
				ip1dbg(("ip_massage_options: loopback addr in "
				    "source route!\n"));
				break;
			}
			/*
			 * Update ipha_dst to be the first hop and remove the
			 * first hop from the source route (by overwriting
			 * part of the option with NOP options).
			 */
			ipha->ipha_dst = dst;
			/* Put the last entry in dst */
			off = ((optlen - IP_ADDR_LEN - 3) & ~(IP_ADDR_LEN-1)) +
			    3;
			bcopy(&opt[off], &dst, IP_ADDR_LEN);

			ip1dbg(("ip_massage_options: last hop 0x%x\n",
			    ntohl(dst)));
			/* Move down and overwrite */
			opt[IP_ADDR_LEN] = opt[0];
			opt[IP_ADDR_LEN+1] = opt[IPOPT_OLEN] - IP_ADDR_LEN;
			opt[IP_ADDR_LEN+2] = opt[IPOPT_OFFSET];
			for (i = 0; i < IP_ADDR_LEN; i++)
				opt[i] = IPOPT_NOP;
			break;
		}
	}
	return (dst);
}

/*
 * This function's job is to forward data to the reverse tunnel (FA->HA)
 * after doing a few checks. It is assumed that the incoming interface
 * of the packet is always different than the outgoing interface and the
 * ire_type of the found ire has to be a non-resolver type.
 *
 * IPQoS notes
 * IP policy is invoked twice for a forwarded packet, once on the read side
 * and again on the write side if both, IPP_FWD_IN and IPP_FWD_OUT are
 * enabled.
 */
static void
ip_mrtun_forward(ire_t *ire, ill_t *in_ill, mblk_t *mp)
{
	ipha_t		*ipha;
	queue_t		*q;
	uint32_t	pkt_len;
#define	rptr	((uchar_t *)ipha)
	uint32_t	sum;
	uint32_t	max_frag;
	mblk_t		*first_mp;
	uint32_t	ill_index;

	ASSERT(ire != NULL);
	ASSERT(ire->ire_ipif->ipif_net_type == IRE_IF_NORESOLVER);
	ASSERT(ire->ire_stq != NULL);

	/* Initiate read side IPPF processing */
	if (IPP_ENABLED(IPP_FWD_IN)) {
		ill_index = in_ill->ill_phyint->phyint_ifindex;
		ip_process(IPP_FWD_IN, &mp, ill_index);
		if (mp == NULL) {
			/* Policy processing consumed the packet. */
			ip2dbg(("ip_mrtun_forward: inbound pkt "
			    "dropped during IPPF processing\n"));
			return;
		}
	}

	/*
	 * Both the inbound and outbound ills must have ILLF_ROUTER set,
	 * and the packet must not be forwarded back out its incoming
	 * interface.
	 */
	if (((in_ill->ill_flags & ((ill_t *)ire->ire_stq->q_ptr)->ill_flags &
	    ILLF_ROUTER) == 0) ||
	    (in_ill == (ill_t *)ire->ire_stq->q_ptr)) {
		BUMP_MIB(&ip_mib, ipForwProhibits);
		ip0dbg(("ip_mrtun_forward: Can't forward :"
		    "forwarding is not turned on\n"));
		goto drop_pkt;
	}

	/*
	 * Don't forward if the interface is down
	 */
	if (ire->ire_ipif->ipif_ill->ill_ipif_up_count == 0) {
		BUMP_MIB(&ip_mib, ipInDiscards);
		goto drop_pkt;
	}

	ipha = (ipha_t *)mp->b_rptr;
	pkt_len = ntohs(ipha->ipha_length);
	/* Adjust the checksum to reflect the ttl decrement. */
	sum = (int)ipha->ipha_hdr_checksum + IP_HDR_CSUM_TTL_ADJUST;
	ipha->ipha_hdr_checksum = (uint16_t)(sum + (sum >> 16));
	if (ipha->ipha_ttl-- <= 1) {
		/*
		 * TTL has expired.  Verify the inbound header checksum
		 * before trusting the header to build an ICMP error.
		 */
		if (ip_csum_hdr(ipha)) {
			BUMP_MIB(&ip_mib, ipInCksumErrs);
			goto drop_pkt;
		}
		q = ire->ire_stq;
		if ((first_mp = allocb(sizeof (ipsec_info_t),
		    BPRI_HI)) == NULL) {
			goto drop_pkt;
		}
		ip_ipsec_out_prepend(first_mp, mp, in_ill);
		icmp_time_exceeded(q, first_mp, ICMP_TTL_EXCEEDED);

		return;
	}

	/* Get the ill_index of the ILL */
	ill_index = ire->ire_ipif->ipif_ill->ill_phyint->phyint_ifindex;

	/*
	 * ip_mrtun_forward is only used by foreign agent to reverse
	 * tunnel the incoming packet. So it does not do any option
	 * processing for source routing.
	 */
	max_frag = ire->ire_max_frag;
	if (pkt_len > max_frag) {
		/*
		 * It needs fragging on its way out.  We haven't
		 * verified the header checksum yet.  Since we
		 * are going to put a surely good checksum in the
		 * outgoing header, we have to make sure that it
		 * was good coming in.
		 */
		if (ip_csum_hdr(ipha)) {
			BUMP_MIB(&ip_mib, ipInCksumErrs);
			goto drop_pkt;
		}

		/* Initiate write side IPPF processing */
		if (IPP_ENABLED(IPP_FWD_OUT)) {
			ip_process(IPP_FWD_OUT, &mp, ill_index);
			if (mp == NULL) {
				ip2dbg(("ip_mrtun_forward: outbound pkt "\
				    "dropped/deferred during ip policy "\
				    "processing\n"));
				return;
			}
		}
		if ((first_mp = allocb(sizeof (ipsec_info_t),
		    BPRI_HI)) == NULL) {
			goto drop_pkt;
		}
		ip_ipsec_out_prepend(first_mp, mp, in_ill);
		mp = first_mp;

		ip_wput_frag(ire, mp, IB_PKT, max_frag, 0);
		return;
	}

	ip2dbg(("ip_mrtun_forward: ire type (%d)\n", ire->ire_type));

	ASSERT(ire->ire_ipif != NULL);

	/* Attach the link-layer header before transmitting. */
	mp = ip_wput_attach_llhdr(mp, ire, IPP_FWD_OUT, ill_index);
	if (mp == NULL) {
		BUMP_MIB(&ip_mib, ipInDiscards);
		return;
	}

	/* Now send the packet to the tunnel interface */
	q = ire->ire_stq;
	UPDATE_IB_PKT_COUNT(ire);
	ire->ire_last_used_time = lbolt;
	BUMP_MIB(&ip_mib, ipForwDatagrams);
	putnext(q, mp);
	ip2dbg(("ip_mrtun_forward: sent packet to ill %p\n", q->q_ptr));
	return;

drop_pkt:;
	ip2dbg(("ip_mrtun_forward: dropping pkt\n"));
	freemsg(mp);
#undef	rptr
}

/*
 * Fills the ipsec_out_t data structure with appropriate fields and
 * prepends it to mp which contains the IP hdr + data that was meant
 * to be forwarded. Please note that ipsec_out_info data structure
 * is used here to communicate the outgoing ill path at ip_wput()
 * for the ICMP error packet. This has nothing to do with ipsec IP
 * security. ipsec_out_t is really used to pass the info to the module
 * IP where this information cannot be extracted from conn.
 * This function is called by ip_mrtun_forward().
 */
void
ip_ipsec_out_prepend(mblk_t *first_mp, mblk_t *mp, ill_t *xmit_ill)
{
	ipsec_out_t	*io;

	/*
	 * NOTE(review): assumes first_mp was allocated with room for an
	 * ipsec_info_t -- callers (e.g. ip_mrtun_forward) do
	 * allocb(sizeof (ipsec_info_t), ...) before calling here.
	 */
	ASSERT(xmit_ill != NULL);
	first_mp->b_datap->db_type = M_CTL;
	first_mp->b_wptr += sizeof (ipsec_info_t);
	/*
	 * This is to pass info to ip_wput in absence of conn.
	 * ipsec_out_secure will be B_FALSE because of this.
	 * Thus ipsec_out_secure being B_FALSE indicates that
	 * this is not IPSEC security related information.
	 */
	bzero(first_mp->b_rptr, sizeof (ipsec_info_t));
	io = (ipsec_out_t *)first_mp->b_rptr;
	io->ipsec_out_type = IPSEC_OUT;
	io->ipsec_out_len = sizeof (ipsec_out_t);
	first_mp->b_cont = mp;
	io->ipsec_out_ill_index =
	    xmit_ill->ill_phyint->phyint_ifindex;
	io->ipsec_out_xmit_if = B_TRUE;
}

/*
 * Return the network mask
 * associated with the specified address.
 *
 * The classful (A/B/C) mask is derived from the leading bits of the
 * first byte of the address.  The mask is built bytewise through a
 * uchar_t pointer, so it comes back in the same byte order as the
 * address itself (presumably network byte order, as with on-the-wire
 * addresses -- TODO confirm at call sites).  Class D (multicast) gets
 * a /4 mask; 0 and class E addresses get a zero mask.
 */
ipaddr_t
ip_net_mask(ipaddr_t addr)
{
	uchar_t	*up = (uchar_t *)&addr;
	ipaddr_t mask = 0;
	uchar_t	*maskp = (uchar_t *)&mask;

#if defined(__i386) || defined(__amd64)
#define	TOTALLY_BRAIN_DAMAGED_C_COMPILER
#endif
#ifdef TOTALLY_BRAIN_DAMAGED_C_COMPILER
	/*
	 * Redundant explicit re-zeroing of the mask bytes; apparently a
	 * workaround for a compiler defect on x86 (hence the macro name).
	 */
	maskp[0] = maskp[1] = maskp[2] = maskp[3] = 0;
#endif
	if (CLASSD(addr)) {
		maskp[0] = 0xF0;
		return (mask);
	}
	if (addr == 0)
		return (0);
	maskp[0] = 0xFF;
	if ((up[0] & 0x80) == 0)
		return (mask);

	maskp[1] = 0xFF;
	if ((up[0] & 0xC0) == 0x80)
		return (mask);

	maskp[2] = 0xFF;
	if ((up[0] & 0xE0) == 0xC0)
		return (mask);

	/* Must be experimental or multicast, indicate as much */
	return ((ipaddr_t)0);
}

/*
 * Select an ill for the packet by considering load spreading across
 * a different ill in the group if dst_ill is part of some group.
6887 */ 6888 static ill_t * 6889 ip_newroute_get_dst_ill(ill_t *dst_ill) 6890 { 6891 ill_t *ill; 6892 6893 /* 6894 * We schedule irrespective of whether the source address is 6895 * INADDR_ANY or not. illgrp_scheduler returns a held ill. 6896 */ 6897 ill = illgrp_scheduler(dst_ill); 6898 if (ill == NULL) 6899 return (NULL); 6900 6901 /* 6902 * For groups with names ip_sioctl_groupname ensures that all 6903 * ills are of same type. For groups without names, ifgrp_insert 6904 * ensures this. 6905 */ 6906 ASSERT(dst_ill->ill_type == ill->ill_type); 6907 6908 return (ill); 6909 } 6910 6911 /* 6912 * Helper function for the IPIF_NOFAILOVER/ATTACH_IF interface attachment case. 6913 */ 6914 ill_t * 6915 ip_grab_attach_ill(ill_t *ill, mblk_t *first_mp, int ifindex, boolean_t isv6) 6916 { 6917 ill_t *ret_ill; 6918 6919 ASSERT(ifindex != 0); 6920 ret_ill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL, NULL); 6921 if (ret_ill == NULL || 6922 (ret_ill->ill_phyint->phyint_flags & PHYI_OFFLINE)) { 6923 if (isv6) { 6924 if (ill != NULL) { 6925 BUMP_MIB(ill->ill_ip6_mib, ipv6OutDiscards); 6926 } else { 6927 BUMP_MIB(&ip6_mib, ipv6OutDiscards); 6928 } 6929 ip1dbg(("ip_grab_attach_ill (IPv6): " 6930 "bad ifindex %d.\n", ifindex)); 6931 } else { 6932 BUMP_MIB(&ip_mib, ipOutDiscards); 6933 ip1dbg(("ip_grab_attach_ill (IPv4): " 6934 "bad ifindex %d.\n", ifindex)); 6935 } 6936 if (ret_ill != NULL) 6937 ill_refrele(ret_ill); 6938 freemsg(first_mp); 6939 return (NULL); 6940 } 6941 6942 return (ret_ill); 6943 } 6944 6945 /* 6946 * IPv4 - 6947 * ip_newroute is called by ip_rput or ip_wput whenever we need to send 6948 * out a packet to a destination address for which we do not have specific 6949 * (or sufficient) routing information. 6950 * 6951 * NOTE : These are the scopes of some of the variables that point at IRE, 6952 * which needs to be followed while making any future modifications 6953 * to avoid memory leaks. 
6954 * 6955 * - ire and sire are the entries looked up initially by 6956 * ire_ftable_lookup. 6957 * - ipif_ire is used to hold the interface ire associated with 6958 * the new cache ire. But it's scope is limited, so we always REFRELE 6959 * it before branching out to error paths. 6960 * - save_ire is initialized before ire_create, so that ire returned 6961 * by ire_create will not over-write the ire. We REFRELE save_ire 6962 * before breaking out of the switch. 6963 * 6964 * Thus on failures, we have to REFRELE only ire and sire, if they 6965 * are not NULL. 6966 */ 6967 void 6968 ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, ill_t *in_ill, conn_t *connp) 6969 { 6970 areq_t *areq; 6971 ipaddr_t gw = 0; 6972 ire_t *ire = NULL; 6973 mblk_t *res_mp; 6974 ipaddr_t *addrp; 6975 ipaddr_t nexthop_addr; 6976 ipif_t *src_ipif = NULL; 6977 ill_t *dst_ill = NULL; 6978 ipha_t *ipha; 6979 ire_t *sire = NULL; 6980 mblk_t *first_mp; 6981 ire_t *save_ire; 6982 mblk_t *dlureq_mp; 6983 ill_t *attach_ill = NULL; /* Bind to IPIF_NOFAILOVER address */ 6984 ushort_t ire_marks = 0; 6985 boolean_t mctl_present; 6986 ipsec_out_t *io; 6987 mblk_t *saved_mp; 6988 ire_t *first_sire = NULL; 6989 mblk_t *copy_mp = NULL; 6990 mblk_t *xmit_mp = NULL; 6991 ipaddr_t save_dst; 6992 uint32_t multirt_flags = 6993 MULTIRT_CACHEGW | MULTIRT_USESTAMP | MULTIRT_SETSTAMP; 6994 boolean_t multirt_is_resolvable; 6995 boolean_t multirt_resolve_next; 6996 boolean_t do_attach_ill = B_FALSE; 6997 boolean_t ip_nexthop = B_FALSE; 6998 zoneid_t zoneid; 6999 tsol_ire_gw_secattr_t *attrp = NULL; 7000 tsol_gcgrp_t *gcgrp = NULL; 7001 tsol_gcgrp_addr_t ga; 7002 7003 if (ip_debug > 2) { 7004 /* ip1dbg */ 7005 pr_addr_dbg("ip_newroute: dst %s\n", AF_INET, &dst); 7006 } 7007 7008 EXTRACT_PKT_MP(mp, first_mp, mctl_present); 7009 if (mctl_present) { 7010 io = (ipsec_out_t *)first_mp->b_rptr; 7011 zoneid = io->ipsec_out_zoneid; 7012 ASSERT(zoneid != ALL_ZONES); 7013 } else if (connp != NULL) { 7014 zoneid = 
connp->conn_zoneid; 7015 } else { 7016 zoneid = GLOBAL_ZONEID; 7017 } 7018 7019 ipha = (ipha_t *)mp->b_rptr; 7020 7021 /* All multicast lookups come through ip_newroute_ipif() */ 7022 if (CLASSD(dst)) { 7023 ip0dbg(("ip_newroute: CLASSD 0x%x (b_prev %p, b_next %p)\n", 7024 ntohl(dst), (void *)mp->b_prev, (void *)mp->b_next)); 7025 freemsg(first_mp); 7026 return; 7027 } 7028 7029 if (ip_loopback_src_or_dst(ipha, NULL)) { 7030 goto icmp_err_ret; 7031 } 7032 7033 if (mctl_present && io->ipsec_out_attach_if) { 7034 /* ip_grab_attach_ill returns a held ill */ 7035 attach_ill = ip_grab_attach_ill(NULL, first_mp, 7036 io->ipsec_out_ill_index, B_FALSE); 7037 7038 /* Failure case frees things for us. */ 7039 if (attach_ill == NULL) 7040 return; 7041 7042 /* 7043 * Check if we need an ire that will not be 7044 * looked up by anybody else i.e. HIDDEN. 7045 */ 7046 if (ill_is_probeonly(attach_ill)) 7047 ire_marks = IRE_MARK_HIDDEN; 7048 } 7049 if (mctl_present && io->ipsec_out_ip_nexthop) { 7050 ip_nexthop = B_TRUE; 7051 nexthop_addr = io->ipsec_out_nexthop_addr; 7052 } 7053 /* 7054 * If this IRE is created for forwarding or it is not for 7055 * traffic for congestion controlled protocols, mark it as temporary. 7056 */ 7057 if (mp->b_prev != NULL || !IP_FLOW_CONTROLLED_ULP(ipha->ipha_protocol)) 7058 ire_marks |= IRE_MARK_TEMPORARY; 7059 7060 /* 7061 * Get what we can from ire_ftable_lookup which will follow an IRE 7062 * chain until it gets the most specific information available. 7063 * For example, we know that there is no IRE_CACHE for this dest, 7064 * but there may be an IRE_OFFSUBNET which specifies a gateway. 7065 * ire_ftable_lookup will look up the gateway, etc. 7066 * Check if in_ill != NULL. If it is true, the packet must be 7067 * from an incoming interface where RTA_SRCIFP is set. 7068 * Otherwise, given ire_ftable_lookup algorithm, only one among routes 7069 * to the destination, of equal netmask length in the forward table, 7070 * will be recursively explored. 
If no information is available 7071 * for the final gateway of that route, we force the returned ire 7072 * to be equal to sire using MATCH_IRE_PARENT. 7073 * At least, in this case we have a starting point (in the buckets) 7074 * to look for other routes to the destination in the forward table. 7075 * This is actually used only for multirouting, where a list 7076 * of routes has to be processed in sequence. 7077 */ 7078 if (in_ill != NULL) { 7079 ire = ire_srcif_table_lookup(dst, IRE_IF_RESOLVER, NULL, 7080 in_ill, MATCH_IRE_TYPE); 7081 } else if (ip_nexthop) { 7082 /* 7083 * The first time we come here, we look for an IRE_INTERFACE 7084 * entry for the specified nexthop, set the dst to be the 7085 * nexthop address and create an IRE_CACHE entry for the 7086 * nexthop. The next time around, we are able to find an 7087 * IRE_CACHE entry for the nexthop, set the gateway to be the 7088 * nexthop address and create an IRE_CACHE entry for the 7089 * destination address via the specified nexthop. 7090 */ 7091 ire = ire_cache_lookup(nexthop_addr, zoneid, 7092 MBLK_GETLABEL(mp)); 7093 if (ire != NULL) { 7094 gw = nexthop_addr; 7095 ire_marks |= IRE_MARK_PRIVATE_ADDR; 7096 } else { 7097 ire = ire_ftable_lookup(nexthop_addr, 0, 0, 7098 IRE_INTERFACE, NULL, NULL, zoneid, 0, 7099 MBLK_GETLABEL(mp), 7100 MATCH_IRE_TYPE | MATCH_IRE_SECATTR); 7101 if (ire != NULL) { 7102 dst = nexthop_addr; 7103 } 7104 } 7105 } else if (attach_ill == NULL) { 7106 ire = ire_ftable_lookup(dst, 0, 0, 0, 7107 NULL, &sire, zoneid, 0, MBLK_GETLABEL(mp), 7108 MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | 7109 MATCH_IRE_RJ_BHOLE | MATCH_IRE_PARENT | 7110 MATCH_IRE_SECATTR); 7111 } else { 7112 /* 7113 * attach_ill is set only for communicating with 7114 * on-link hosts. So, don't look for DEFAULT. 
7115 */ 7116 ipif_t *attach_ipif; 7117 7118 attach_ipif = ipif_get_next_ipif(NULL, attach_ill); 7119 if (attach_ipif == NULL) { 7120 ill_refrele(attach_ill); 7121 goto icmp_err_ret; 7122 } 7123 ire = ire_ftable_lookup(dst, 0, 0, 0, attach_ipif, 7124 &sire, zoneid, 0, MBLK_GETLABEL(mp), 7125 MATCH_IRE_RJ_BHOLE | MATCH_IRE_ILL | 7126 MATCH_IRE_SECATTR); 7127 ipif_refrele(attach_ipif); 7128 } 7129 ip3dbg(("ip_newroute: ire_ftable_lookup() " 7130 "returned ire %p, sire %p\n", (void *)ire, (void *)sire)); 7131 7132 /* 7133 * This loop is run only once in most cases. 7134 * We loop to resolve further routes only when the destination 7135 * can be reached through multiple RTF_MULTIRT-flagged ires. 7136 */ 7137 do { 7138 /* Clear the previous iteration's values */ 7139 if (src_ipif != NULL) { 7140 ipif_refrele(src_ipif); 7141 src_ipif = NULL; 7142 } 7143 if (dst_ill != NULL) { 7144 ill_refrele(dst_ill); 7145 dst_ill = NULL; 7146 } 7147 7148 multirt_resolve_next = B_FALSE; 7149 /* 7150 * We check if packets have to be multirouted. 7151 * In this case, given the current <ire, sire> couple, 7152 * we look for the next suitable <ire, sire>. 7153 * This check is done in ire_multirt_lookup(), 7154 * which applies various criteria to find the next route 7155 * to resolve. ire_multirt_lookup() leaves <ire, sire> 7156 * unchanged if it detects it has not been tried yet. 
7157 */ 7158 if ((sire != NULL) && (sire->ire_flags & RTF_MULTIRT)) { 7159 ip3dbg(("ip_newroute: starting next_resolution " 7160 "with first_mp %p, tag %d\n", 7161 (void *)first_mp, 7162 MULTIRT_DEBUG_TAGGED(first_mp))); 7163 7164 ASSERT(sire != NULL); 7165 multirt_is_resolvable = 7166 ire_multirt_lookup(&ire, &sire, multirt_flags, 7167 MBLK_GETLABEL(mp)); 7168 7169 ip3dbg(("ip_newroute: multirt_is_resolvable %d, " 7170 "ire %p, sire %p\n", 7171 multirt_is_resolvable, 7172 (void *)ire, (void *)sire)); 7173 7174 if (!multirt_is_resolvable) { 7175 /* 7176 * No more multirt route to resolve; give up 7177 * (all routes resolved or no more 7178 * resolvable routes). 7179 */ 7180 if (ire != NULL) { 7181 ire_refrele(ire); 7182 ire = NULL; 7183 } 7184 } else { 7185 ASSERT(sire != NULL); 7186 ASSERT(ire != NULL); 7187 /* 7188 * We simply use first_sire as a flag that 7189 * indicates if a resolvable multirt route 7190 * has already been found. 7191 * If it is not the case, we may have to send 7192 * an ICMP error to report that the 7193 * destination is unreachable. 7194 * We do not IRE_REFHOLD first_sire. 7195 */ 7196 if (first_sire == NULL) { 7197 first_sire = sire; 7198 } 7199 } 7200 } 7201 if (ire == NULL) { 7202 if (ip_debug > 3) { 7203 /* ip2dbg */ 7204 pr_addr_dbg("ip_newroute: " 7205 "can't resolve %s\n", AF_INET, &dst); 7206 } 7207 ip3dbg(("ip_newroute: " 7208 "ire %p, sire %p, first_sire %p\n", 7209 (void *)ire, (void *)sire, (void *)first_sire)); 7210 7211 if (sire != NULL) { 7212 ire_refrele(sire); 7213 sire = NULL; 7214 } 7215 7216 if (first_sire != NULL) { 7217 /* 7218 * At least one multirt route has been found 7219 * in the same call to ip_newroute(); 7220 * there is no need to report an ICMP error. 7221 * first_sire was not IRE_REFHOLDed. 
7222 */ 7223 MULTIRT_DEBUG_UNTAG(first_mp); 7224 freemsg(first_mp); 7225 return; 7226 } 7227 ip_rts_change(RTM_MISS, dst, 0, 0, 0, 0, 0, 0, 7228 RTA_DST); 7229 if (attach_ill != NULL) 7230 ill_refrele(attach_ill); 7231 goto icmp_err_ret; 7232 } 7233 7234 /* 7235 * When RTA_SRCIFP is used to add a route, then an interface 7236 * route is added in the source interface's routing table. 7237 * If the outgoing interface of this route is of type 7238 * IRE_IF_RESOLVER, then upon creation of the ire, 7239 * ire_dlureq_mp is set to NULL. Later, when this route is 7240 * first used for forwarding packet, ip_newroute() is called 7241 * to resolve the hardware address of the outgoing ipif. 7242 * We do not come here for IRE_IF_NORESOLVER entries in the 7243 * source interface based table. We only come here if the 7244 * outgoing interface is a resolver interface and we don't 7245 * have the ire_dlureq_mp information yet. 7246 * If in_ill is not null that means it is called from 7247 * ip_rput. 7248 */ 7249 7250 ASSERT(ire->ire_in_ill == NULL || 7251 (ire->ire_type == IRE_IF_RESOLVER && 7252 ire->ire_dlureq_mp == NULL)); 7253 7254 /* 7255 * Verify that the returned IRE does not have either 7256 * the RTF_REJECT or RTF_BLACKHOLE flags set and that the IRE is 7257 * either an IRE_CACHE, IRE_IF_NORESOLVER or IRE_IF_RESOLVER. 7258 */ 7259 if ((ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) || 7260 (ire->ire_type & (IRE_CACHE | IRE_INTERFACE)) == 0) { 7261 if (attach_ill != NULL) 7262 ill_refrele(attach_ill); 7263 goto icmp_err_ret; 7264 } 7265 /* 7266 * Increment the ire_ob_pkt_count field for ire if it is an 7267 * INTERFACE (IF_RESOLVER or IF_NORESOLVER) IRE type, and 7268 * increment the same for the parent IRE, sire, if it is some 7269 * sort of prefix IRE (which includes DEFAULT, PREFIX, HOST 7270 * and HOST_REDIRECT). 
7271 */ 7272 if ((ire->ire_type & IRE_INTERFACE) != 0) { 7273 UPDATE_OB_PKT_COUNT(ire); 7274 ire->ire_last_used_time = lbolt; 7275 } 7276 7277 if (sire != NULL) { 7278 gw = sire->ire_gateway_addr; 7279 ASSERT((sire->ire_type & (IRE_CACHETABLE | 7280 IRE_INTERFACE)) == 0); 7281 UPDATE_OB_PKT_COUNT(sire); 7282 sire->ire_last_used_time = lbolt; 7283 } 7284 /* 7285 * We have a route to reach the destination. 7286 * 7287 * 1) If the interface is part of ill group, try to get a new 7288 * ill taking load spreading into account. 7289 * 7290 * 2) After selecting the ill, get a source address that 7291 * might create good inbound load spreading. 7292 * ipif_select_source does this for us. 7293 * 7294 * If the application specified the ill (ifindex), we still 7295 * load spread. Only if the packets needs to go out 7296 * specifically on a given ill e.g. binding to 7297 * IPIF_NOFAILOVER address, then we don't try to use a 7298 * different ill for load spreading. 7299 */ 7300 if (attach_ill == NULL) { 7301 /* 7302 * Don't perform outbound load spreading in the 7303 * case of an RTF_MULTIRT route, as we actually 7304 * typically want to replicate outgoing packets 7305 * through particular interfaces. 7306 */ 7307 if ((sire != NULL) && (sire->ire_flags & RTF_MULTIRT)) { 7308 dst_ill = ire->ire_ipif->ipif_ill; 7309 /* for uniformity */ 7310 ill_refhold(dst_ill); 7311 } else { 7312 /* 7313 * If we are here trying to create an IRE_CACHE 7314 * for an offlink destination and have the 7315 * IRE_CACHE for the next hop and the latter is 7316 * using virtual IP source address selection i.e 7317 * it's ire->ire_ipif is pointing to a virtual 7318 * network interface (vni) then 7319 * ip_newroute_get_dst_ll() will return the vni 7320 * interface as the dst_ill. Since the vni is 7321 * virtual i.e not associated with any physical 7322 * interface, it cannot be the dst_ill, hence 7323 * in such a case call ip_newroute_get_dst_ll() 7324 * with the stq_ill instead of the ire_ipif ILL. 
7325 * The function returns a refheld ill. 7326 */ 7327 if ((ire->ire_type == IRE_CACHE) && 7328 IS_VNI(ire->ire_ipif->ipif_ill)) 7329 dst_ill = ip_newroute_get_dst_ill( 7330 ire->ire_stq->q_ptr); 7331 else 7332 dst_ill = ip_newroute_get_dst_ill( 7333 ire->ire_ipif->ipif_ill); 7334 } 7335 if (dst_ill == NULL) { 7336 if (ip_debug > 2) { 7337 pr_addr_dbg("ip_newroute: " 7338 "no dst ill for dst" 7339 " %s\n", AF_INET, &dst); 7340 } 7341 goto icmp_err_ret; 7342 } 7343 } else { 7344 dst_ill = ire->ire_ipif->ipif_ill; 7345 /* for uniformity */ 7346 ill_refhold(dst_ill); 7347 /* 7348 * We should have found a route matching ill as we 7349 * called ire_ftable_lookup with MATCH_IRE_ILL. 7350 * Rather than asserting, when there is a mismatch, 7351 * we just drop the packet. 7352 */ 7353 if (dst_ill != attach_ill) { 7354 ip0dbg(("ip_newroute: Packet dropped as " 7355 "IPIF_NOFAILOVER ill is %s, " 7356 "ire->ire_ipif->ipif_ill is %s\n", 7357 attach_ill->ill_name, 7358 dst_ill->ill_name)); 7359 ill_refrele(attach_ill); 7360 goto icmp_err_ret; 7361 } 7362 } 7363 /* attach_ill can't go in loop. IPMP and CGTP are disjoint */ 7364 if (attach_ill != NULL) { 7365 ill_refrele(attach_ill); 7366 attach_ill = NULL; 7367 do_attach_ill = B_TRUE; 7368 } 7369 ASSERT(dst_ill != NULL); 7370 ip2dbg(("ip_newroute: dst_ill %s\n", dst_ill->ill_name)); 7371 7372 /* 7373 * Pick the best source address from dst_ill. 7374 * 7375 * 1) If it is part of a multipathing group, we would 7376 * like to spread the inbound packets across different 7377 * interfaces. ipif_select_source picks a random source 7378 * across the different ills in the group. 7379 * 7380 * 2) If it is not part of a multipathing group, we try 7381 * to pick the source address from the destination 7382 * route. Clustering assumes that when we have multiple 7383 * prefixes hosted on an interface, the prefix of the 7384 * source address matches the prefix of the destination 7385 * route. 
We do this only if the address is not 7386 * DEPRECATED. 7387 * 7388 * 3) If the conn is in a different zone than the ire, we 7389 * need to pick a source address from the right zone. 7390 * 7391 * NOTE : If we hit case (1) above, the prefix of the source 7392 * address picked may not match the prefix of the 7393 * destination routes prefix as ipif_select_source 7394 * does not look at "dst" while picking a source 7395 * address. 7396 * If we want the same behavior as (2), we will need 7397 * to change the behavior of ipif_select_source. 7398 */ 7399 ASSERT(src_ipif == NULL); 7400 if ((sire != NULL) && (sire->ire_flags & RTF_SETSRC)) { 7401 /* 7402 * The RTF_SETSRC flag is set in the parent ire (sire). 7403 * Check that the ipif matching the requested source 7404 * address still exists. 7405 */ 7406 src_ipif = ipif_lookup_addr(sire->ire_src_addr, NULL, 7407 zoneid, NULL, NULL, NULL, NULL); 7408 } 7409 if (src_ipif == NULL) { 7410 ire_marks |= IRE_MARK_USESRC_CHECK; 7411 if ((dst_ill->ill_group != NULL) || 7412 (ire->ire_ipif->ipif_flags & IPIF_DEPRECATED) || 7413 (connp != NULL && ire->ire_zoneid != zoneid && 7414 ire->ire_zoneid != ALL_ZONES) || 7415 (dst_ill->ill_usesrc_ifindex != 0)) { 7416 /* 7417 * If the destination is reachable via a 7418 * given gateway, the selected source address 7419 * should be in the same subnet as the gateway. 7420 * Otherwise, the destination is not reachable. 7421 * 7422 * If there are no interfaces on the same subnet 7423 * as the destination, ipif_select_source gives 7424 * first non-deprecated interface which might be 7425 * on a different subnet than the gateway. 7426 * This is not desirable. Hence pass the dst_ire 7427 * source address to ipif_select_source. 7428 * It is sure that the destination is reachable 7429 * with the dst_ire source address subnet. 
7430 * So passing dst_ire source address to 7431 * ipif_select_source will make sure that the 7432 * selected source will be on the same subnet 7433 * as dst_ire source address. 7434 */ 7435 ipaddr_t saddr = ire->ire_ipif->ipif_src_addr; 7436 src_ipif = ipif_select_source(dst_ill, saddr, 7437 zoneid); 7438 if (src_ipif == NULL) { 7439 if (ip_debug > 2) { 7440 pr_addr_dbg("ip_newroute: " 7441 "no src for dst %s ", 7442 AF_INET, &dst); 7443 printf("through interface %s\n", 7444 dst_ill->ill_name); 7445 } 7446 goto icmp_err_ret; 7447 } 7448 } else { 7449 src_ipif = ire->ire_ipif; 7450 ASSERT(src_ipif != NULL); 7451 /* hold src_ipif for uniformity */ 7452 ipif_refhold(src_ipif); 7453 } 7454 } 7455 7456 /* 7457 * Assign a source address while we have the conn. 7458 * We can't have ip_wput_ire pick a source address when the 7459 * packet returns from arp since we need to look at 7460 * conn_unspec_src and conn_zoneid, and we lose the conn when 7461 * going through arp. 7462 * 7463 * NOTE : ip_newroute_v6 does not have this piece of code as 7464 * it uses ip6i to store this information. 7465 */ 7466 if (ipha->ipha_src == INADDR_ANY && 7467 (connp == NULL || !connp->conn_unspec_src)) { 7468 ipha->ipha_src = src_ipif->ipif_src_addr; 7469 } 7470 if (ip_debug > 3) { 7471 /* ip2dbg */ 7472 pr_addr_dbg("ip_newroute: first hop %s\n", 7473 AF_INET, &gw); 7474 } 7475 ip2dbg(("\tire type %s (%d)\n", 7476 ip_nv_lookup(ire_nv_tbl, ire->ire_type), ire->ire_type)); 7477 7478 /* 7479 * The TTL of multirouted packets is bounded by the 7480 * ip_multirt_ttl ndd variable. 
7481 */ 7482 if ((sire != NULL) && (sire->ire_flags & RTF_MULTIRT)) { 7483 /* Force TTL of multirouted packets */ 7484 if ((ip_multirt_ttl > 0) && 7485 (ipha->ipha_ttl > ip_multirt_ttl)) { 7486 ip2dbg(("ip_newroute: forcing multirt TTL " 7487 "to %d (was %d), dst 0x%08x\n", 7488 ip_multirt_ttl, ipha->ipha_ttl, 7489 ntohl(sire->ire_addr))); 7490 ipha->ipha_ttl = ip_multirt_ttl; 7491 } 7492 } 7493 /* 7494 * At this point in ip_newroute(), ire is either the 7495 * IRE_CACHE of the next-hop gateway for an off-subnet 7496 * destination or an IRE_INTERFACE type that should be used 7497 * to resolve an on-subnet destination or an on-subnet 7498 * next-hop gateway. 7499 * 7500 * In the IRE_CACHE case, we have the following : 7501 * 7502 * 1) src_ipif - used for getting a source address. 7503 * 7504 * 2) dst_ill - from which we derive ire_stq/ire_rfq. This 7505 * means packets using this IRE_CACHE will go out on 7506 * dst_ill. 7507 * 7508 * 3) The IRE sire will point to the prefix that is the 7509 * longest matching route for the destination. These 7510 * prefix types include IRE_DEFAULT, IRE_PREFIX, IRE_HOST, 7511 * and IRE_HOST_REDIRECT. 7512 * 7513 * The newly created IRE_CACHE entry for the off-subnet 7514 * destination is tied to both the prefix route and the 7515 * interface route used to resolve the next-hop gateway 7516 * via the ire_phandle and ire_ihandle fields, 7517 * respectively. 7518 * 7519 * In the IRE_INTERFACE case, we have the following : 7520 * 7521 * 1) src_ipif - used for getting a source address. 7522 * 7523 * 2) dst_ill - from which we derive ire_stq/ire_rfq. This 7524 * means packets using the IRE_CACHE that we will build 7525 * here will go out on dst_ill. 7526 * 7527 * 3) sire may or may not be NULL. But, the IRE_CACHE that is 7528 * to be created will only be tied to the IRE_INTERFACE 7529 * that was derived from the ire_ihandle field. 
7530 * 7531 * If sire is non-NULL, it means the destination is 7532 * off-link and we will first create the IRE_CACHE for the 7533 * gateway. Next time through ip_newroute, we will create 7534 * the IRE_CACHE for the final destination as described 7535 * above. 7536 * 7537 * In both cases, after the current resolution has been 7538 * completed (or possibly initialised, in the IRE_INTERFACE 7539 * case), the loop may be re-entered to attempt the resolution 7540 * of another RTF_MULTIRT route. 7541 * 7542 * When an IRE_CACHE entry for the off-subnet destination is 7543 * created, RTF_SETSRC and RTF_MULTIRT are inherited from sire, 7544 * for further processing in emission loops. 7545 */ 7546 save_ire = ire; 7547 switch (ire->ire_type) { 7548 case IRE_CACHE: { 7549 ire_t *ipif_ire; 7550 mblk_t *ire_fp_mp; 7551 7552 if (gw == 0) 7553 gw = ire->ire_gateway_addr; 7554 /* 7555 * We need 3 ire's to create a new cache ire for an 7556 * off-link destination from the cache ire of the 7557 * gateway. 7558 * 7559 * 1. The prefix ire 'sire' (Note that this does 7560 * not apply to the conn_nexthop_set case) 7561 * 2. The cache ire of the gateway 'ire' 7562 * 3. The interface ire 'ipif_ire' 7563 * 7564 * We have (1) and (2). We lookup (3) below. 7565 * 7566 * If there is no interface route to the gateway, 7567 * it is a race condition, where we found the cache 7568 * but the interface route has been deleted. 7569 */ 7570 if (ip_nexthop) { 7571 ipif_ire = ire_ihandle_lookup_onlink(ire); 7572 } else { 7573 ipif_ire = 7574 ire_ihandle_lookup_offlink(ire, sire); 7575 } 7576 if (ipif_ire == NULL) { 7577 ip1dbg(("ip_newroute: " 7578 "ire_ihandle_lookup_offlink failed\n")); 7579 goto icmp_err_ret; 7580 } 7581 /* 7582 * XXX We are using the same dlureq_mp 7583 * (DL_UNITDATA_REQ) though the save_ire is not 7584 * pointing at the same ill. 7585 * This is incorrect. We need to send it up to the 7586 * resolver to get the right dlureq_mp. 
For ethernets 7587 * this may be okay (ill_type == DL_ETHER). 7588 */ 7589 dlureq_mp = save_ire->ire_dlureq_mp; 7590 ire_fp_mp = NULL; 7591 /* 7592 * save_ire's ire_fp_mp can't change since it is 7593 * not an IRE_MIPRTUN or IRE_BROADCAST 7594 * LOCK_IRE_FP_MP does not do any useful work in 7595 * the case of IRE_CACHE. So we don't use it below. 7596 */ 7597 if (save_ire->ire_stq == dst_ill->ill_wq) 7598 ire_fp_mp = save_ire->ire_fp_mp; 7599 7600 /* 7601 * Check cached gateway IRE for any security 7602 * attributes; if found, associate the gateway 7603 * credentials group to the destination IRE. 7604 */ 7605 if ((attrp = save_ire->ire_gw_secattr) != NULL) { 7606 mutex_enter(&attrp->igsa_lock); 7607 if ((gcgrp = attrp->igsa_gcgrp) != NULL) 7608 GCGRP_REFHOLD(gcgrp); 7609 mutex_exit(&attrp->igsa_lock); 7610 } 7611 7612 ire = ire_create( 7613 (uchar_t *)&dst, /* dest address */ 7614 (uchar_t *)&ip_g_all_ones, /* mask */ 7615 (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ 7616 (uchar_t *)&gw, /* gateway address */ 7617 NULL, 7618 &save_ire->ire_max_frag, 7619 ire_fp_mp, /* Fast Path header */ 7620 dst_ill->ill_rq, /* recv-from queue */ 7621 dst_ill->ill_wq, /* send-to queue */ 7622 IRE_CACHE, /* IRE type */ 7623 save_ire->ire_dlureq_mp, 7624 src_ipif, 7625 in_ill, /* incoming ill */ 7626 (sire != NULL) ? 7627 sire->ire_mask : 0, /* Parent mask */ 7628 (sire != NULL) ? 7629 sire->ire_phandle : 0, /* Parent handle */ 7630 ipif_ire->ire_ihandle, /* Interface handle */ 7631 (sire != NULL) ? (sire->ire_flags & 7632 (RTF_SETSRC | RTF_MULTIRT)) : 0, /* flags */ 7633 (sire != NULL) ? 
7634 &(sire->ire_uinfo) : &(save_ire->ire_uinfo), 7635 NULL, 7636 gcgrp); 7637 7638 if (ire == NULL) { 7639 if (gcgrp != NULL) { 7640 GCGRP_REFRELE(gcgrp); 7641 gcgrp = NULL; 7642 } 7643 ire_refrele(ipif_ire); 7644 ire_refrele(save_ire); 7645 break; 7646 } 7647 7648 /* reference now held by IRE */ 7649 gcgrp = NULL; 7650 7651 ire->ire_marks |= ire_marks; 7652 7653 /* 7654 * Prevent sire and ipif_ire from getting deleted. 7655 * The newly created ire is tied to both of them via 7656 * the phandle and ihandle respectively. 7657 */ 7658 if (sire != NULL) { 7659 IRB_REFHOLD(sire->ire_bucket); 7660 /* Has it been removed already ? */ 7661 if (sire->ire_marks & IRE_MARK_CONDEMNED) { 7662 IRB_REFRELE(sire->ire_bucket); 7663 ire_refrele(ipif_ire); 7664 ire_refrele(save_ire); 7665 break; 7666 } 7667 } 7668 7669 IRB_REFHOLD(ipif_ire->ire_bucket); 7670 /* Has it been removed already ? */ 7671 if (ipif_ire->ire_marks & IRE_MARK_CONDEMNED) { 7672 IRB_REFRELE(ipif_ire->ire_bucket); 7673 if (sire != NULL) 7674 IRB_REFRELE(sire->ire_bucket); 7675 ire_refrele(ipif_ire); 7676 ire_refrele(save_ire); 7677 break; 7678 } 7679 7680 xmit_mp = first_mp; 7681 /* 7682 * In the case of multirouting, a copy 7683 * of the packet is done before its sending. 7684 * The copy is used to attempt another 7685 * route resolution, in a next loop. 7686 */ 7687 if (ire->ire_flags & RTF_MULTIRT) { 7688 copy_mp = copymsg(first_mp); 7689 if (copy_mp != NULL) { 7690 xmit_mp = copy_mp; 7691 MULTIRT_DEBUG_TAG(first_mp); 7692 } 7693 } 7694 ire_add_then_send(q, ire, xmit_mp); 7695 ire_refrele(save_ire); 7696 7697 /* Assert that sire is not deleted yet. */ 7698 if (sire != NULL) { 7699 ASSERT(sire->ire_ptpn != NULL); 7700 IRB_REFRELE(sire->ire_bucket); 7701 } 7702 7703 /* Assert that ipif_ire is not deleted yet. */ 7704 ASSERT(ipif_ire->ire_ptpn != NULL); 7705 IRB_REFRELE(ipif_ire->ire_bucket); 7706 ire_refrele(ipif_ire); 7707 7708 /* 7709 * If copy_mp is not NULL, multirouting was 7710 * requested. 
We loop to initiate a next 7711 * route resolution attempt, starting from sire. 7712 */ 7713 if (copy_mp != NULL) { 7714 /* 7715 * Search for the next unresolved 7716 * multirt route. 7717 */ 7718 copy_mp = NULL; 7719 ipif_ire = NULL; 7720 ire = NULL; 7721 multirt_resolve_next = B_TRUE; 7722 continue; 7723 } 7724 if (sire != NULL) 7725 ire_refrele(sire); 7726 ipif_refrele(src_ipif); 7727 ill_refrele(dst_ill); 7728 return; 7729 } 7730 case IRE_IF_NORESOLVER: { 7731 /* 7732 * We have what we need to build an IRE_CACHE. 7733 * 7734 * Create a new dlureq_mp with the IP gateway address 7735 * in destination address in the DLPI hdr if the 7736 * physical length is exactly 4 bytes. 7737 */ 7738 if (dst_ill->ill_phys_addr_length == IP_ADDR_LEN) { 7739 uchar_t *addr; 7740 7741 if (gw) 7742 addr = (uchar_t *)&gw; 7743 else 7744 addr = (uchar_t *)&dst; 7745 7746 dlureq_mp = ill_dlur_gen(addr, 7747 dst_ill->ill_phys_addr_length, 7748 dst_ill->ill_sap, 7749 dst_ill->ill_sap_length); 7750 } else { 7751 dlureq_mp = ire->ire_dlureq_mp; 7752 } 7753 7754 if (dlureq_mp == NULL) { 7755 ip1dbg(("ip_newroute: dlureq_mp NULL\n")); 7756 break; 7757 } 7758 7759 /* 7760 * TSol note: We are creating the ire cache for the 7761 * destination 'dst'. If 'dst' is offlink, going 7762 * through the first hop 'gw', the security attributes 7763 * of 'dst' must be set to point to the gateway 7764 * credentials of gateway 'gw'. If 'dst' is onlink, it 7765 * is possible that 'dst' is a potential gateway that is 7766 * referenced by some route that has some security 7767 * attributes. Thus in the former case, we need to do a 7768 * gcgrp_lookup of 'gw' while in the latter case we 7769 * need to do gcgrp_lookup of 'dst' itself. 7770 */ 7771 ga.ga_af = AF_INET; 7772 IN6_IPADDR_TO_V4MAPPED(gw != INADDR_ANY ? 
gw : dst, 7773 &ga.ga_addr); 7774 gcgrp = gcgrp_lookup(&ga, B_FALSE); 7775 7776 ire = ire_create( 7777 (uchar_t *)&dst, /* dest address */ 7778 (uchar_t *)&ip_g_all_ones, /* mask */ 7779 (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ 7780 (uchar_t *)&gw, /* gateway address */ 7781 NULL, 7782 &save_ire->ire_max_frag, 7783 NULL, /* Fast Path header */ 7784 dst_ill->ill_rq, /* recv-from queue */ 7785 dst_ill->ill_wq, /* send-to queue */ 7786 IRE_CACHE, 7787 dlureq_mp, 7788 src_ipif, 7789 in_ill, /* Incoming ill */ 7790 save_ire->ire_mask, /* Parent mask */ 7791 (sire != NULL) ? /* Parent handle */ 7792 sire->ire_phandle : 0, 7793 save_ire->ire_ihandle, /* Interface handle */ 7794 (sire != NULL) ? sire->ire_flags & 7795 (RTF_SETSRC | RTF_MULTIRT) : 0, /* flags */ 7796 &(save_ire->ire_uinfo), 7797 NULL, 7798 gcgrp); 7799 7800 if (dst_ill->ill_phys_addr_length == IP_ADDR_LEN) 7801 freeb(dlureq_mp); 7802 7803 if (ire == NULL) { 7804 if (gcgrp != NULL) { 7805 GCGRP_REFRELE(gcgrp); 7806 gcgrp = NULL; 7807 } 7808 ire_refrele(save_ire); 7809 break; 7810 } 7811 7812 /* reference now held by IRE */ 7813 gcgrp = NULL; 7814 7815 ire->ire_marks |= ire_marks; 7816 7817 /* Prevent save_ire from getting deleted */ 7818 IRB_REFHOLD(save_ire->ire_bucket); 7819 /* Has it been removed already ? */ 7820 if (save_ire->ire_marks & IRE_MARK_CONDEMNED) { 7821 IRB_REFRELE(save_ire->ire_bucket); 7822 ire_refrele(save_ire); 7823 break; 7824 } 7825 7826 /* 7827 * In the case of multirouting, a copy 7828 * of the packet is made before it is sent. 7829 * The copy is used in the next 7830 * loop to attempt another resolution. 7831 */ 7832 xmit_mp = first_mp; 7833 if ((sire != NULL) && 7834 (sire->ire_flags & RTF_MULTIRT)) { 7835 copy_mp = copymsg(first_mp); 7836 if (copy_mp != NULL) { 7837 xmit_mp = copy_mp; 7838 MULTIRT_DEBUG_TAG(first_mp); 7839 } 7840 } 7841 ire_add_then_send(q, ire, xmit_mp); 7842 7843 /* Assert that it is not deleted yet. 
*/ 7844 ASSERT(save_ire->ire_ptpn != NULL); 7845 IRB_REFRELE(save_ire->ire_bucket); 7846 ire_refrele(save_ire); 7847 7848 if (copy_mp != NULL) { 7849 /* 7850 * If we found a (no)resolver, we ignore any 7851 * trailing top priority IRE_CACHE in further 7852 * loops. This ensures that we do not omit any 7853 * (no)resolver. 7854 * This IRE_CACHE, if any, will be processed 7855 * by another thread entering ip_newroute(). 7856 * IRE_CACHE entries, if any, will be processed 7857 * by another thread entering ip_newroute(), 7858 * (upon resolver response, for instance). 7859 * This aims to force parallel multirt 7860 * resolutions as soon as a packet must be sent. 7861 * In the best case, after the tx of only one 7862 * packet, all reachable routes are resolved. 7863 * Otherwise, the resolution of all RTF_MULTIRT 7864 * routes would require several emissions. 7865 */ 7866 multirt_flags &= ~MULTIRT_CACHEGW; 7867 7868 /* 7869 * Search for the next unresolved multirt 7870 * route. 7871 */ 7872 copy_mp = NULL; 7873 save_ire = NULL; 7874 ire = NULL; 7875 multirt_resolve_next = B_TRUE; 7876 continue; 7877 } 7878 7879 /* 7880 * Don't need sire anymore 7881 */ 7882 if (sire != NULL) 7883 ire_refrele(sire); 7884 7885 ipif_refrele(src_ipif); 7886 ill_refrele(dst_ill); 7887 return; 7888 } 7889 case IRE_IF_RESOLVER: 7890 /* 7891 * We can't build an IRE_CACHE yet, but at least we 7892 * found a resolver that can help. 7893 */ 7894 res_mp = dst_ill->ill_resolver_mp; 7895 if (!OK_RESOLVER_MP(res_mp)) 7896 break; 7897 7898 /* 7899 * To be at this point in the code with a non-zero gw 7900 * means that dst is reachable through a gateway that 7901 * we have never resolved. By changing dst to the gw 7902 * addr we resolve the gateway first. 7903 * When ire_add_then_send() tries to put the IP dg 7904 * to dst, it will reenter ip_newroute() at which 7905 * time we will find the IRE_CACHE for the gw and 7906 * create another IRE_CACHE in case IRE_CACHE above. 
7907 */ 7908 if (gw != INADDR_ANY) { 7909 /* 7910 * The source ipif that was determined above was 7911 * relative to the destination address, not the 7912 * gateway's. If src_ipif was not taken out of 7913 * the IRE_IF_RESOLVER entry, we'll need to call 7914 * ipif_select_source() again. 7915 */ 7916 if (src_ipif != ire->ire_ipif) { 7917 ipif_refrele(src_ipif); 7918 src_ipif = ipif_select_source(dst_ill, 7919 gw, zoneid); 7920 if (src_ipif == NULL) { 7921 if (ip_debug > 2) { 7922 pr_addr_dbg( 7923 "ip_newroute: no " 7924 "src for gw %s ", 7925 AF_INET, &gw); 7926 printf("through " 7927 "interface %s\n", 7928 dst_ill->ill_name); 7929 } 7930 goto icmp_err_ret; 7931 } 7932 } 7933 save_dst = dst; 7934 dst = gw; 7935 gw = INADDR_ANY; 7936 } 7937 7938 /* 7939 * TSol note: Please see the corresponding note 7940 * of the IRE_IF_NORESOLVER case 7941 */ 7942 ga.ga_af = AF_INET; 7943 IN6_IPADDR_TO_V4MAPPED(dst, &ga.ga_addr); 7944 gcgrp = gcgrp_lookup(&ga, B_FALSE); 7945 7946 /* 7947 * We obtain a partial IRE_CACHE which we will pass 7948 * along with the resolver query. When the response 7949 * comes back it will be there ready for us to add. 7950 * The ire_max_frag is atomically set under the 7951 * irebucket lock in ire_add_v[46]. 
7952 */ 7953 ire = ire_create_mp( 7954 (uchar_t *)&dst, /* dest address */ 7955 (uchar_t *)&ip_g_all_ones, /* mask */ 7956 (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ 7957 (uchar_t *)&gw, /* gateway address */ 7958 NULL, /* no in_src_addr */ 7959 NULL, /* ire_max_frag */ 7960 NULL, /* Fast Path header */ 7961 dst_ill->ill_rq, /* recv-from queue */ 7962 dst_ill->ill_wq, /* send-to queue */ 7963 IRE_CACHE, 7964 res_mp, 7965 src_ipif, /* Interface ipif */ 7966 in_ill, /* Incoming ILL */ 7967 save_ire->ire_mask, /* Parent mask */ 7968 0, 7969 save_ire->ire_ihandle, /* Interface handle */ 7970 0, /* flags if any */ 7971 &(save_ire->ire_uinfo), 7972 NULL, 7973 gcgrp); 7974 7975 if (ire == NULL) { 7976 ire_refrele(save_ire); 7977 if (gcgrp != NULL) { 7978 GCGRP_REFRELE(gcgrp); 7979 gcgrp = NULL; 7980 } 7981 break; 7982 } 7983 7984 /* reference now held by IRE */ 7985 gcgrp = NULL; 7986 7987 if ((sire != NULL) && 7988 (sire->ire_flags & RTF_MULTIRT)) { 7989 copy_mp = copymsg(first_mp); 7990 if (copy_mp != NULL) 7991 MULTIRT_DEBUG_TAG(copy_mp); 7992 } 7993 7994 ire->ire_marks |= ire_marks; 7995 7996 /* 7997 * Construct message chain for the resolver 7998 * of the form: 7999 * ARP_REQ_MBLK-->IRE_MBLK-->Packet 8000 * Packet could contain a IPSEC_OUT mp. 8001 * 8002 * NOTE : ire will be added later when the response 8003 * comes back from ARP. If the response does not 8004 * come back, ARP frees the packet. For this reason, 8005 * we can't REFHOLD the bucket of save_ire to prevent 8006 * deletions. We may not be able to REFRELE the bucket 8007 * if the response never comes back. Thus, before 8008 * adding the ire, ire_add_v4 will make sure that the 8009 * interface route does not get deleted. This is the 8010 * only case unlike ip_newroute_v6, ip_newroute_ipif_v6 8011 * where we can always prevent deletions because of 8012 * the synchronous nature of adding IRES i.e 8013 * ire_add_then_send is called after creating the IRE. 
8014 */ 8015 ASSERT(ire->ire_mp != NULL); 8016 ire->ire_mp->b_cont = first_mp; 8017 /* Have saved_mp handy, for cleanup if canput fails */ 8018 saved_mp = mp; 8019 mp = ire->ire_dlureq_mp; 8020 ASSERT(mp != NULL); 8021 ire->ire_dlureq_mp = NULL; 8022 linkb(mp, ire->ire_mp); 8023 8024 8025 /* 8026 * Fill in the source and dest addrs for the resolver. 8027 * NOTE: this depends on memory layouts imposed by 8028 * ill_init(). 8029 */ 8030 areq = (areq_t *)mp->b_rptr; 8031 addrp = (ipaddr_t *)((char *)areq + 8032 areq->areq_sender_addr_offset); 8033 if (do_attach_ill) { 8034 /* 8035 * This is bind to no failover case. 8036 * arp packet also must go out on attach_ill. 8037 */ 8038 ASSERT(ipha->ipha_src != NULL); 8039 *addrp = ipha->ipha_src; 8040 } else { 8041 *addrp = save_ire->ire_src_addr; 8042 } 8043 8044 ire_refrele(save_ire); 8045 addrp = (ipaddr_t *)((char *)areq + 8046 areq->areq_target_addr_offset); 8047 *addrp = dst; 8048 /* Up to the resolver. */ 8049 if (canputnext(dst_ill->ill_rq)) { 8050 putnext(dst_ill->ill_rq, mp); 8051 ire = NULL; 8052 if (copy_mp != NULL) { 8053 /* 8054 * If we found a resolver, we ignore 8055 * any trailing top priority IRE_CACHE 8056 * in the further loops. This ensures 8057 * that we do not omit any resolver. 8058 * IRE_CACHE entries, if any, will be 8059 * processed next time we enter 8060 * ip_newroute(). 8061 */ 8062 multirt_flags &= ~MULTIRT_CACHEGW; 8063 /* 8064 * Search for the next unresolved 8065 * multirt route. 8066 */ 8067 first_mp = copy_mp; 8068 copy_mp = NULL; 8069 /* Prepare the next resolution loop. 
*/ 8070 mp = first_mp; 8071 EXTRACT_PKT_MP(mp, first_mp, 8072 mctl_present); 8073 if (mctl_present) 8074 io = (ipsec_out_t *) 8075 first_mp->b_rptr; 8076 ipha = (ipha_t *)mp->b_rptr; 8077 8078 ASSERT(sire != NULL); 8079 8080 dst = save_dst; 8081 multirt_resolve_next = B_TRUE; 8082 continue; 8083 } 8084 8085 if (sire != NULL) 8086 ire_refrele(sire); 8087 8088 /* 8089 * The response will come back in ip_wput 8090 * with db_type IRE_DB_TYPE. 8091 */ 8092 ipif_refrele(src_ipif); 8093 ill_refrele(dst_ill); 8094 return; 8095 } else { 8096 /* Prepare for cleanup */ 8097 ire->ire_dlureq_mp = mp; 8098 mp->b_cont = NULL; 8099 ire_delete(ire); 8100 mp = saved_mp; 8101 ire = NULL; 8102 if (copy_mp != NULL) { 8103 MULTIRT_DEBUG_UNTAG(copy_mp); 8104 freemsg(copy_mp); 8105 copy_mp = NULL; 8106 } 8107 break; 8108 } 8109 default: 8110 break; 8111 } 8112 } while (multirt_resolve_next); 8113 8114 ip1dbg(("ip_newroute: dropped\n")); 8115 /* Did this packet originate externally? */ 8116 if (mp->b_prev) { 8117 mp->b_next = NULL; 8118 mp->b_prev = NULL; 8119 BUMP_MIB(&ip_mib, ipInDiscards); 8120 } else { 8121 BUMP_MIB(&ip_mib, ipOutDiscards); 8122 } 8123 ASSERT(copy_mp == NULL); 8124 MULTIRT_DEBUG_UNTAG(first_mp); 8125 freemsg(first_mp); 8126 if (ire != NULL) 8127 ire_refrele(ire); 8128 if (sire != NULL) 8129 ire_refrele(sire); 8130 if (src_ipif != NULL) 8131 ipif_refrele(src_ipif); 8132 if (dst_ill != NULL) 8133 ill_refrele(dst_ill); 8134 return; 8135 8136 icmp_err_ret: 8137 ip1dbg(("ip_newroute: no route\n")); 8138 if (src_ipif != NULL) 8139 ipif_refrele(src_ipif); 8140 if (dst_ill != NULL) 8141 ill_refrele(dst_ill); 8142 if (sire != NULL) 8143 ire_refrele(sire); 8144 /* Did this packet originate externally? */ 8145 if (mp->b_prev) { 8146 mp->b_next = NULL; 8147 mp->b_prev = NULL; 8148 /* XXX ipInNoRoutes */ 8149 q = WR(q); 8150 } else { 8151 /* 8152 * Since ip_wput() isn't close to finished, we fill 8153 * in enough of the header for credible error reporting. 
8154 */ 8155 if (ip_hdr_complete(ipha, zoneid)) { 8156 /* Failed */ 8157 MULTIRT_DEBUG_UNTAG(first_mp); 8158 freemsg(first_mp); 8159 if (ire != NULL) 8160 ire_refrele(ire); 8161 return; 8162 } 8163 } 8164 BUMP_MIB(&ip_mib, ipOutNoRoutes); 8165 8166 /* 8167 * At this point we will have ire only if RTF_BLACKHOLE 8168 * or RTF_REJECT flags are set on the IRE. It will not 8169 * generate ICMP_HOST_UNREACHABLE if RTF_BLACKHOLE is set. 8170 */ 8171 if (ire != NULL) { 8172 if (ire->ire_flags & RTF_BLACKHOLE) { 8173 ire_refrele(ire); 8174 MULTIRT_DEBUG_UNTAG(first_mp); 8175 freemsg(first_mp); 8176 return; 8177 } 8178 ire_refrele(ire); 8179 } 8180 if (ip_source_routed(ipha)) { 8181 icmp_unreachable(q, first_mp, ICMP_SOURCE_ROUTE_FAILED); 8182 return; 8183 } 8184 icmp_unreachable(q, first_mp, ICMP_HOST_UNREACHABLE); 8185 } 8186 8187 /* 8188 * IPv4 - 8189 * ip_newroute_ipif is called by ip_wput_multicast and 8190 * ip_rput_forward_multicast whenever we need to send 8191 * out a packet to a destination address for which we do not have specific 8192 * routing information. It is used when the packet will be sent out 8193 * on a specific interface. It is also called by ip_wput() when IP_XMIT_IF 8194 * socket option is set or icmp error message wants to go out on a particular 8195 * interface for a unicast packet. 8196 * 8197 * In most cases, the destination address is resolved thanks to the ipif 8198 * intrinsic resolver. However, there are some cases where the call to 8199 * ip_newroute_ipif must take into account the potential presence of 8200 * RTF_SETSRC and/or RTF_MULITRT flags in an IRE_OFFSUBNET ire 8201 * that uses the interface. This is specified through flags, 8202 * which can be a combination of: 8203 * - RTF_SETSRC: if an IRE_OFFSUBNET ire exists that has the RTF_SETSRC 8204 * flag, the resulting ire will inherit the IRE_OFFSUBNET source address 8205 * and flags. Additionally, the packet source address has to be set to 8206 * the specified address. 
The caller is thus expected to set this flag
 * if the packet has no specific source address yet.
 * - RTF_MULTIRT: if an IRE_OFFSUBNET ire exists that has the RTF_MULTIRT
 *   flag, the resulting ire will inherit the flag. All unresolved routes
 *   to the destination must be explored in the same call to
 *   ip_newroute_ipif().
 */
static void
ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst,
    conn_t *connp, uint32_t flags)
{
	areq_t	*areq;
	ire_t	*ire = NULL;
	mblk_t	*res_mp;
	ipaddr_t *addrp;
	mblk_t	*first_mp;
	ire_t	*save_ire = NULL;
	ill_t	*attach_ill = NULL;	/* Bind to IPIF_NOFAILOVER */
	ipif_t	*src_ipif = NULL;
	ushort_t ire_marks = 0;
	ill_t	*dst_ill = NULL;
	boolean_t mctl_present;
	ipsec_out_t *io;
	ipha_t	*ipha;
	int	ihandle = 0;
	mblk_t	*saved_mp;
	ire_t	*fire = NULL;	/* IRE_OFFSUBNET supplying RTF_* inheritance */
	mblk_t	*copy_mp = NULL;	/* packet copy for multirt re-entry */
	boolean_t multirt_resolve_next;
	ipaddr_t ipha_dst;
	zoneid_t zoneid = (connp != NULL ? connp->conn_zoneid : ALL_ZONES);

	/*
	 * CGTP goes in a loop which looks up a new ipif, do an ipif_refhold
	 * here for uniformity
	 */
	ipif_refhold(ipif);

	/*
	 * This loop is run only once in most cases.
	 * We loop to resolve further routes only when the destination
	 * can be reached through multiple RTF_MULTIRT-flagged ires.
	 */
	do {
		/* Drop holds carried over from the previous iteration. */
		if (dst_ill != NULL) {
			ill_refrele(dst_ill);
			dst_ill = NULL;
		}
		if (src_ipif != NULL) {
			ipif_refrele(src_ipif);
			src_ipif = NULL;
		}
		multirt_resolve_next = B_FALSE;

		ip1dbg(("ip_newroute_ipif: dst 0x%x, if %s\n", ntohl(dst),
		    ipif->ipif_ill->ill_name));

		EXTRACT_PKT_MP(mp, first_mp, mctl_present);
		if (mctl_present)
			io = (ipsec_out_t *)first_mp->b_rptr;

		ipha = (ipha_t *)mp->b_rptr;

		/*
		 * Save the packet destination address, we may need it after
		 * the packet has been consumed.
		 */
		ipha_dst = ipha->ipha_dst;

		/*
		 * If the interface is a pt-pt interface we look for an
		 * IRE_IF_RESOLVER or IRE_IF_NORESOLVER that matches both the
		 * local_address and the pt-pt destination address. Otherwise
		 * we just match the local address.
		 * NOTE: dst could be different than ipha->ipha_dst in case
		 * of sending igmp multicast packets over a point-to-point
		 * connection.
		 * Thus we must be careful enough to check ipha_dst to be a
		 * multicast address, otherwise it will take xmit_if path for
		 * multicast packets resulting into kernel stack overflow by
		 * repeated calls to ip_newroute_ipif from ire_send().
		 */
		if (CLASSD(ipha_dst) &&
		    !(ipif->ipif_ill->ill_flags & ILLF_MULTICAST)) {
			goto err_ret;
		}

		/*
		 * We check if an IRE_OFFSUBNET for the addr that goes through
		 * ipif exists. We need it to determine if the RTF_SETSRC and/or
		 * RTF_MULTIRT flags must be honored. This IRE_OFFSUBNET ire may
		 * propagate its flags to the new ire.
		 */
		if (CLASSD(ipha_dst) && (flags & (RTF_MULTIRT | RTF_SETSRC))) {
			fire = ipif_lookup_multi_ire(ipif, ipha_dst);
			ip2dbg(("ip_newroute_ipif: "
			    "ipif_lookup_multi_ire("
			    "ipif %p, dst %08x) = fire %p\n",
			    (void *)ipif, ntohl(dst), (void *)fire));
		}

		if (mctl_present && io->ipsec_out_attach_if) {
			attach_ill = ip_grab_attach_ill(NULL, first_mp,
			    io->ipsec_out_ill_index, B_FALSE);

			/* Failure case frees things for us. */
			if (attach_ill == NULL) {
				ipif_refrele(ipif);
				if (fire != NULL)
					ire_refrele(fire);
				return;
			}

			/*
			 * Check if we need an ire that will not be
			 * looked up by anybody else i.e. HIDDEN.
			 */
			if (ill_is_probeonly(attach_ill)) {
				ire_marks = IRE_MARK_HIDDEN;
			}
			/*
			 * ip_wput passes the right ipif for IPIF_NOFAILOVER
			 * case.
			 */
			dst_ill = ipif->ipif_ill;
			/*
			 * attach_ill has been refheld by ip_grab_attach_ill.
			 * NOTE(review): no matching ill_refrele(attach_ill)
			 * is visible in this function — presumably the ref
			 * is balanced elsewhere; verify against the
			 * ip_grab_attach_ill() contract.
			 */
			ASSERT(dst_ill == attach_ill);
		} else {
			/*
			 * If this is set by IP_XMIT_IF, then make sure that
			 * ipif is pointing to the same ill as the IP_XMIT_IF
			 * specified ill.
			 */
			ASSERT((connp == NULL) ||
			    (connp->conn_xmit_if_ill == NULL) ||
			    (connp->conn_xmit_if_ill == ipif->ipif_ill));
			/*
			 * If the interface belongs to an interface group,
			 * make sure the next possible interface in the group
			 * is used. This encourages load spreading among
			 * peers in an interface group.
			 * Note: load spreading is disabled for RTF_MULTIRT
			 * routes.
			 */
			if ((flags & RTF_MULTIRT) && (fire != NULL) &&
			    (fire->ire_flags & RTF_MULTIRT)) {
				/*
				 * Don't perform outbound load spreading
				 * in the case of an RTF_MULTIRT issued route,
				 * we actually typically want to replicate
				 * outgoing packets through particular
				 * interfaces.
				 */
				dst_ill = ipif->ipif_ill;
				ill_refhold(dst_ill);
			} else {
				dst_ill = ip_newroute_get_dst_ill(
				    ipif->ipif_ill);
			}
			if (dst_ill == NULL) {
				if (ip_debug > 2) {
					pr_addr_dbg("ip_newroute_ipif: "
					    "no dst ill for dst %s\n",
					    AF_INET, &dst);
				}
				goto err_ret;
			}
		}

		/*
		 * Pick a source address preferring non-deprecated ones.
		 * Unlike ip_newroute, we don't do any source address
		 * selection here since for multicast it really does not help
		 * in inbound load spreading as in the unicast case.
		 */
		if ((flags & RTF_SETSRC) && (fire != NULL) &&
		    (fire->ire_flags & RTF_SETSRC)) {
			/*
			 * As requested by flags, an IRE_OFFSUBNET was looked up
			 * on that interface. This ire has RTF_SETSRC flag, so
			 * the source address of the packet must be changed.
			 * Check that the ipif matching the requested source
			 * address still exists.
			 */
			src_ipif = ipif_lookup_addr(fire->ire_src_addr, NULL,
			    zoneid, NULL, NULL, NULL, NULL);
		}
		if (((ipif->ipif_flags & IPIF_DEPRECATED) ||
		    (connp != NULL && ipif->ipif_zoneid != zoneid &&
		    ipif->ipif_zoneid != ALL_ZONES)) &&
		    (src_ipif == NULL)) {
			src_ipif = ipif_select_source(dst_ill, dst, zoneid);
			if (src_ipif == NULL) {
				if (ip_debug > 2) {
					/* ip1dbg */
					pr_addr_dbg("ip_newroute_ipif: "
					    "no src for dst %s",
					    AF_INET, &dst);
				}
				ip1dbg((" through interface %s\n",
				    dst_ill->ill_name));
				goto err_ret;
			}
			/* Switch our working ipif to the selected source. */
			ipif_refrele(ipif);
			ipif = src_ipif;
			ipif_refhold(ipif);
		}
		if (src_ipif == NULL) {
			src_ipif = ipif;
			ipif_refhold(src_ipif);
		}

		/*
		 * Assign a source address while we have the conn.
		 * We can't have ip_wput_ire pick a source address when the
		 * packet returns from arp since conn_unspec_src might be set
		 * and we lose the conn when going through arp.
		 */
		if (ipha->ipha_src == INADDR_ANY &&
		    (connp == NULL || !connp->conn_unspec_src)) {
			ipha->ipha_src = src_ipif->ipif_src_addr;
		}

		/*
		 * In case of IP_XMIT_IF, it is possible that the outgoing
		 * interface does not have an interface ire.
		 * Example: Thousands of mobileip PPP interfaces to mobile
		 * nodes. We don't want to create interface ires because
		 * packets from other mobile nodes must not take the route
		 * via interface ires to the visiting mobile node without
		 * going through the home agent, in absence of mobileip
		 * route optimization.
		 */
		if (CLASSD(ipha_dst) && (connp == NULL ||
		    connp->conn_xmit_if_ill == NULL)) {
			/* ipif_to_ire returns an held ire */
			ire = ipif_to_ire(ipif);
			if (ire == NULL)
				goto err_ret;
			if (ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE))
				goto err_ret;
			/*
			 * ihandle is needed when the ire is added to
			 * cache table.
			 */
			save_ire = ire;
			ihandle = save_ire->ire_ihandle;

			ip2dbg(("ip_newroute_ipif: ire %p, ipif %p, "
			    "flags %04x\n",
			    (void *)ire, (void *)ipif, flags));
			if ((flags & RTF_MULTIRT) && (fire != NULL) &&
			    (fire->ire_flags & RTF_MULTIRT)) {
				/*
				 * As requested by flags, an IRE_OFFSUBNET was
				 * looked up on that interface. This ire has
				 * RTF_MULTIRT flag, so the resolution loop will
				 * be re-entered to resolve additional routes on
				 * other interfaces. For that purpose, a copy of
				 * the packet is performed at this point.
				 */
				fire->ire_last_used_time = lbolt;
				copy_mp = copymsg(first_mp);
				if (copy_mp) {
					MULTIRT_DEBUG_TAG(copy_mp);
				}
			}
			if ((flags & RTF_SETSRC) && (fire != NULL) &&
			    (fire->ire_flags & RTF_SETSRC)) {
				/*
				 * As requested by flags, an IRE_OFFSUBNET was
				 * looked up on that interface. This ire has
				 * RTF_SETSRC flag, so the source address of the
				 * packet must be changed.
				 */
				ipha->ipha_src = fire->ire_src_addr;
			}
		} else {
			ASSERT((connp == NULL) ||
			    (connp->conn_xmit_if_ill != NULL) ||
			    (connp->conn_dontroute));
			/*
			 * The only ways we can come here are:
			 * 1) IP_XMIT_IF socket option is set
			 * 2) ICMP error message generated from
			 *    ip_mrtun_forward() routine and it needs
			 *    to go through the specified ill.
			 * 3) SO_DONTROUTE socket option is set
			 * In all cases, the new ire will not be added
			 * into cache table.
			 */
			ire_marks |= IRE_MARK_NOADD;
		}

		switch (ipif->ipif_net_type) {
		case IRE_IF_NORESOLVER: {
			/* We have what we need to build an IRE_CACHE. */
			mblk_t	*dlureq_mp;

			/*
			 * Create a new dlureq_mp with the
			 * IP gateway address as destination address in the
			 * DLPI hdr if the physical length is exactly 4 bytes.
			 */
			if (dst_ill->ill_phys_addr_length == IP_ADDR_LEN) {
				dlureq_mp = ill_dlur_gen((uchar_t *)&dst,
				    dst_ill->ill_phys_addr_length,
				    dst_ill->ill_sap,
				    dst_ill->ill_sap_length);
			} else {
				/* use the value set in ip_ll_subnet_defaults */
				dlureq_mp = ill_dlur_gen(NULL,
				    dst_ill->ill_phys_addr_length,
				    dst_ill->ill_sap,
				    dst_ill->ill_sap_length);
			}

			if (dlureq_mp == NULL)
				break;
			/*
			 * The new ire inherits the IRE_OFFSUBNET flags
			 * and source address, if this was requested.
			 */
			ire = ire_create(
			    (uchar_t *)&dst,		/* dest address */
			    (uchar_t *)&ip_g_all_ones,	/* mask */
			    (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */
			    NULL,			/* gateway address */
			    NULL,
			    &ipif->ipif_mtu,
			    NULL,			/* Fast Path header */
			    dst_ill->ill_rq,		/* recv-from queue */
			    dst_ill->ill_wq,		/* send-to queue */
			    IRE_CACHE,
			    dlureq_mp,
			    src_ipif,
			    NULL,
			    (save_ire != NULL ? save_ire->ire_mask : 0),
			    (fire != NULL) ?		/* Parent handle */
				fire->ire_phandle : 0,
			    ihandle,			/* Interface handle */
			    (fire != NULL) ?
				(fire->ire_flags &
				(RTF_SETSRC | RTF_MULTIRT)) : 0,
			    (save_ire == NULL ? &ire_uinfo_null :
				&save_ire->ire_uinfo),
			    NULL,
			    NULL);

			freeb(dlureq_mp);

			if (ire == NULL) {
				if (save_ire != NULL)
					ire_refrele(save_ire);
				break;
			}

			ire->ire_marks |= ire_marks;

			/*
			 * If IRE_MARK_NOADD is set then we need to convert
			 * the max_fragp to a useable value now. This is
			 * normally done in ire_add_v[46].
			 */
			if (ire->ire_marks & IRE_MARK_NOADD) {
				uint_t	max_frag;

				max_frag = *ire->ire_max_fragp;
				ire->ire_max_fragp = NULL;
				ire->ire_max_frag = max_frag;
			}

			/* Prevent save_ire from getting deleted */
			if (save_ire != NULL) {
				IRB_REFHOLD(save_ire->ire_bucket);
				/* Has it been removed already ? */
				if (save_ire->ire_marks & IRE_MARK_CONDEMNED) {
					IRB_REFRELE(save_ire->ire_bucket);
					ire_refrele(save_ire);
					break;
				}
			}

			/* Consumes first_mp (and adds ire unless NOADD). */
			ire_add_then_send(q, ire, first_mp);

			/* Assert that save_ire is not deleted yet. */
			if (save_ire != NULL) {
				ASSERT(save_ire->ire_ptpn != NULL);
				IRB_REFRELE(save_ire->ire_bucket);
				ire_refrele(save_ire);
				save_ire = NULL;
			}
			if (fire != NULL) {
				ire_refrele(fire);
				fire = NULL;
			}

			/*
			 * The resolution loop is re-entered if this
			 * was requested through flags and if we
			 * actually are in a multirouting case.
			 */
			if ((flags & RTF_MULTIRT) && (copy_mp != NULL)) {
				boolean_t need_resolve =
				    ire_multirt_need_resolve(ipha_dst,
					MBLK_GETLABEL(copy_mp));
				if (!need_resolve) {
					MULTIRT_DEBUG_UNTAG(copy_mp);
					freemsg(copy_mp);
					copy_mp = NULL;
				} else {
					/*
					 * ipif_lookup_group() calls
					 * ire_lookup_multi() that uses
					 * ire_ftable_lookup() to find
					 * an IRE_INTERFACE for the group.
					 * In the multirt case,
					 * ire_lookup_multi() then invokes
					 * ire_multirt_lookup() to find
					 * the next resolvable ire.
					 * As a result, we obtain a new
					 * interface, derived from the
					 * next ire.
					 */
					ipif_refrele(ipif);
					ipif = ipif_lookup_group(ipha_dst,
					    zoneid);
					ip2dbg(("ip_newroute_ipif: "
					    "multirt dst %08x, ipif %p\n",
					    htonl(dst), (void *)ipif));
					if (ipif != NULL) {
						mp = copy_mp;
						copy_mp = NULL;
						multirt_resolve_next = B_TRUE;
						continue;
					} else {
						freemsg(copy_mp);
					}
				}
			}
			if (ipif != NULL)
				ipif_refrele(ipif);
			ill_refrele(dst_ill);
			ipif_refrele(src_ipif);
			return;
		}
		case IRE_IF_RESOLVER:
			/*
			 * We can't build an IRE_CACHE yet, but at least
			 * we found a resolver that can help.
			 */
			res_mp = dst_ill->ill_resolver_mp;
			if (!OK_RESOLVER_MP(res_mp))
				break;

			/*
			 * We obtain a partial IRE_CACHE which we will pass
			 * along with the resolver query. When the response
			 * comes back it will be there ready for us to add.
			 * The new ire inherits the IRE_OFFSUBNET flags
			 * and source address, if this was requested.
			 * The ire_max_frag is atomically set under the
			 * irebucket lock in ire_add_v[46]. Only in the
			 * case of IRE_MARK_NOADD, we set it here itself.
			 */
			ire = ire_create_mp(
			    (uchar_t *)&dst,		/* dest address */
			    (uchar_t *)&ip_g_all_ones,	/* mask */
			    (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */
			    NULL,			/* gateway address */
			    NULL,			/* no in_src_addr */
			    (ire_marks & IRE_MARK_NOADD) ?
				ipif->ipif_mtu : 0,	/* max_frag */
			    NULL,			/* Fast path header */
			    dst_ill->ill_rq,		/* recv-from queue */
			    dst_ill->ill_wq,		/* send-to queue */
			    IRE_CACHE,
			    res_mp,
			    src_ipif,
			    NULL,
			    (save_ire != NULL ? save_ire->ire_mask : 0),
			    (fire != NULL) ?		/* Parent handle */
				fire->ire_phandle : 0,
			    ihandle,			/* Interface handle */
			    (fire != NULL) ?		/* flags if any */
				(fire->ire_flags &
				(RTF_SETSRC | RTF_MULTIRT)) : 0,
			    (save_ire == NULL ? &ire_uinfo_null :
				&save_ire->ire_uinfo),
			    NULL,
			    NULL);

			if (save_ire != NULL) {
				ire_refrele(save_ire);
				save_ire = NULL;
			}
			if (ire == NULL)
				break;

			ire->ire_marks |= ire_marks;
			/*
			 * Construct message chain for the resolver of the
			 * form:
			 *	ARP_REQ_MBLK-->IRE_MBLK-->Packet
			 *
			 * NOTE : ire will be added later when the response
			 * comes back from ARP. If the response does not
			 * come back, ARP frees the packet. For this reason,
			 * we can't REFHOLD the bucket of save_ire to prevent
			 * deletions. We may not be able to REFRELE the
			 * bucket if the response never comes back.
			 * Thus, before adding the ire, ire_add_v4 will make
			 * sure that the interface route does not get deleted.
			 * This is the only case unlike ip_newroute_v6,
			 * ip_newroute_ipif_v6 where we can always prevent
			 * deletions because ire_add_then_send is called after
			 * creating the IRE.
			 * If IRE_MARK_NOADD is set, then ire_add_then_send
			 * does not add this IRE into the IRE CACHE.
			 */
			ASSERT(ire->ire_mp != NULL);
			ire->ire_mp->b_cont = first_mp;
			/* Have saved_mp handy, for cleanup if canput fails */
			saved_mp = mp;
			mp = ire->ire_dlureq_mp;
			ASSERT(mp != NULL);
			ire->ire_dlureq_mp = NULL;
			linkb(mp, ire->ire_mp);

			/*
			 * Fill in the source and dest addrs for the resolver.
			 * NOTE: this depends on memory layouts imposed by
			 * ill_init().
			 */
			areq = (areq_t *)mp->b_rptr;
			addrp = (ipaddr_t *)((char *)areq +
			    areq->areq_sender_addr_offset);
			*addrp = ire->ire_src_addr;
			addrp = (ipaddr_t *)((char *)areq +
			    areq->areq_target_addr_offset);
			*addrp = dst;
			/* Up to the resolver. */
			if (canputnext(dst_ill->ill_rq)) {
				putnext(dst_ill->ill_rq, mp);
				/*
				 * The response will come back in ip_wput
				 * with db_type IRE_DB_TYPE.
				 */
			} else {
				/* Flow-controlled: undo and drop the packet. */
				ire->ire_dlureq_mp = mp;
				mp->b_cont = NULL;
				ire_delete(ire);
				saved_mp->b_next = NULL;
				saved_mp->b_prev = NULL;
				freemsg(first_mp);
				ip2dbg(("ip_newroute_ipif: dropped\n"));
			}

			if (fire != NULL) {
				ire_refrele(fire);
				fire = NULL;
			}

			/*
			 * The resolution loop is re-entered if this was
			 * requested through flags and we actually are
			 * in a multirouting case.
			 */
			if ((flags & RTF_MULTIRT) && (copy_mp != NULL)) {
				boolean_t need_resolve =
				    ire_multirt_need_resolve(ipha_dst,
					MBLK_GETLABEL(copy_mp));
				if (!need_resolve) {
					MULTIRT_DEBUG_UNTAG(copy_mp);
					freemsg(copy_mp);
					copy_mp = NULL;
				} else {
					/*
					 * ipif_lookup_group() calls
					 * ire_lookup_multi() that uses
					 * ire_ftable_lookup() to find
					 * an IRE_INTERFACE for the group.
					 * In the multirt case,
					 * ire_lookup_multi() then invokes
					 * ire_multirt_lookup() to find
					 * the next resolvable ire.
					 * As a result, we obtain a new
					 * interface, derived from the
					 * next ire.
					 */
					ipif_refrele(ipif);
					ipif = ipif_lookup_group(ipha_dst,
					    zoneid);
					if (ipif != NULL) {
						mp = copy_mp;
						copy_mp = NULL;
						multirt_resolve_next = B_TRUE;
						continue;
					} else {
						freemsg(copy_mp);
					}
				}
			}
			if (ipif != NULL)
				ipif_refrele(ipif);
			ill_refrele(dst_ill);
			ipif_refrele(src_ipif);
			return;
		default:
			break;
		}
	} while (multirt_resolve_next);

err_ret:
	ip2dbg(("ip_newroute_ipif: dropped\n"));
	if (fire != NULL)
		ire_refrele(fire);
	ipif_refrele(ipif);
	if (dst_ill != NULL)
		ill_refrele(dst_ill);
	if (src_ipif != NULL)
		ipif_refrele(src_ipif);
	/* Did this packet originate externally? */
	if (mp->b_prev || mp->b_next) {
		mp->b_next = NULL;
		mp->b_prev = NULL;
	} else {
		/*
		 * Since ip_wput() isn't close to finished, we fill
		 * in enough of the header for credible error reporting.
		 */
		if (ip_hdr_complete((ipha_t *)mp->b_rptr, zoneid)) {
			/* Failed */
			freemsg(first_mp);
			if (ire != NULL)
				ire_refrele(ire);
			return;
		}
	}
	/*
	 * At this point we will have ire only if RTF_BLACKHOLE
	 * or RTF_REJECT flags are set on the IRE. It will not
	 * generate ICMP_HOST_UNREACHABLE if RTF_BLACKHOLE is set.
	 */
	if (ire != NULL) {
		if (ire->ire_flags & RTF_BLACKHOLE) {
			ire_refrele(ire);
			freemsg(first_mp);
			return;
		}
		ire_refrele(ire);
	}
	icmp_unreachable(q, first_mp, ICMP_HOST_UNREACHABLE);
}

/*
 * Name/Value Table Lookup Routine
 *
 * Scan the NULL-name-terminated table 'nv' for an entry whose nv_value
 * equals 'value'. Returns the matching nv_name, the string "unknown" if
 * no entry matches, or NULL if 'nv' itself is NULL. The returned string
 * is owned by the table (or is a literal); the caller must not free it.
 */
char *
ip_nv_lookup(nv_t *nv, int value)
{
	if (!nv)
		return (NULL);
	for (; nv->nv_name; nv++) {
		if (nv->nv_value == value)
			return (nv->nv_name);
	}
	return ("unknown");
}

/*
 * one day it can be patched to 1 from /etc/system for machines that have few
 * fast network interfaces feeding multiple cpus.
8878 */ 8879 int ill_stream_putlocks = 0; 8880 8881 /* 8882 * This is a module open, i.e. this is a control stream for access 8883 * to a DLPI device. We allocate an ill_t as the instance data in 8884 * this case. 8885 */ 8886 int 8887 ip_modopen(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) 8888 { 8889 uint32_t mem_cnt; 8890 uint32_t cpu_cnt; 8891 uint32_t min_cnt; 8892 pgcnt_t mem_avail; 8893 extern uint32_t ip_cache_table_size, ip6_cache_table_size; 8894 ill_t *ill; 8895 int err; 8896 8897 /* 8898 * Prevent unprivileged processes from pushing IP so that 8899 * they can't send raw IP. 8900 */ 8901 if (secpolicy_net_rawaccess(credp) != 0) 8902 return (EPERM); 8903 8904 ill = (ill_t *)mi_open_alloc_sleep(sizeof (ill_t)); 8905 q->q_ptr = WR(q)->q_ptr = ill; 8906 8907 /* 8908 * ill_init initializes the ill fields and then sends down 8909 * down a DL_INFO_REQ after calling qprocson. 8910 */ 8911 err = ill_init(q, ill); 8912 if (err != 0) { 8913 mi_free(ill); 8914 q->q_ptr = NULL; 8915 WR(q)->q_ptr = NULL; 8916 return (err); 8917 } 8918 8919 /* ill_init initializes the ipsq marking this thread as writer */ 8920 ipsq_exit(ill->ill_phyint->phyint_ipsq, B_TRUE, B_TRUE); 8921 /* Wait for the DL_INFO_ACK */ 8922 mutex_enter(&ill->ill_lock); 8923 while (ill->ill_state_flags & ILL_LL_SUBNET_PENDING) { 8924 /* 8925 * Return value of 0 indicates a pending signal. 8926 */ 8927 err = cv_wait_sig(&ill->ill_cv, &ill->ill_lock); 8928 if (err == 0) { 8929 mutex_exit(&ill->ill_lock); 8930 (void) ip_close(q, 0); 8931 return (EINTR); 8932 } 8933 } 8934 mutex_exit(&ill->ill_lock); 8935 8936 /* 8937 * ip_rput_other could have set an error in ill_error on 8938 * receipt of M_ERROR. 8939 */ 8940 8941 err = ill->ill_error; 8942 if (err != 0) { 8943 (void) ip_close(q, 0); 8944 return (err); 8945 } 8946 8947 /* 8948 * ip_ire_max_bucket_cnt is sized below based on the memory 8949 * size and the cpu speed of the machine. 
This is upper 8950 * bounded by the compile time value of ip_ire_max_bucket_cnt 8951 * and is lower bounded by the compile time value of 8952 * ip_ire_min_bucket_cnt. Similar logic applies to 8953 * ip6_ire_max_bucket_cnt. 8954 */ 8955 mem_avail = kmem_avail(); 8956 mem_cnt = (mem_avail >> ip_ire_mem_ratio) / 8957 ip_cache_table_size / sizeof (ire_t); 8958 cpu_cnt = CPU->cpu_type_info.pi_clock >> ip_ire_cpu_ratio; 8959 8960 min_cnt = MIN(cpu_cnt, mem_cnt); 8961 if (min_cnt < ip_ire_min_bucket_cnt) 8962 min_cnt = ip_ire_min_bucket_cnt; 8963 if (ip_ire_max_bucket_cnt > min_cnt) { 8964 ip_ire_max_bucket_cnt = min_cnt; 8965 } 8966 8967 mem_cnt = (mem_avail >> ip_ire_mem_ratio) / 8968 ip6_cache_table_size / sizeof (ire_t); 8969 min_cnt = MIN(cpu_cnt, mem_cnt); 8970 if (min_cnt < ip6_ire_min_bucket_cnt) 8971 min_cnt = ip6_ire_min_bucket_cnt; 8972 if (ip6_ire_max_bucket_cnt > min_cnt) { 8973 ip6_ire_max_bucket_cnt = min_cnt; 8974 } 8975 8976 ill->ill_credp = credp; 8977 crhold(credp); 8978 8979 mutex_enter(&ip_mi_lock); 8980 err = mi_open_link(&ip_g_head, (IDP)ill, devp, flag, sflag, credp); 8981 mutex_exit(&ip_mi_lock); 8982 if (err) { 8983 (void) ip_close(q, 0); 8984 return (err); 8985 } 8986 return (0); 8987 } 8988 8989 /* IP open routine. */ 8990 int 8991 ip_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) 8992 { 8993 conn_t *connp; 8994 major_t maj; 8995 8996 TRACE_1(TR_FAC_IP, TR_IP_OPEN, "ip_open: q %p", q); 8997 8998 /* Allow reopen. */ 8999 if (q->q_ptr != NULL) 9000 return (0); 9001 9002 if (sflag & MODOPEN) { 9003 /* This is a module open */ 9004 return (ip_modopen(q, devp, flag, sflag, credp)); 9005 } 9006 9007 /* 9008 * We are opening as a device. This is an IP client stream, and we 9009 * allocate an conn_t as the instance data. 
9010 */ 9011 connp = ipcl_conn_create(IPCL_IPCCONN, KM_SLEEP); 9012 connp->conn_upq = q; 9013 q->q_ptr = WR(q)->q_ptr = connp; 9014 9015 if (flag & SO_SOCKSTR) 9016 connp->conn_flags |= IPCL_SOCKET; 9017 9018 /* Minor tells us which /dev entry was opened */ 9019 if (geteminor(*devp) == IPV6_MINOR) { 9020 connp->conn_flags |= IPCL_ISV6; 9021 connp->conn_af_isv6 = B_TRUE; 9022 ip_setqinfo(q, geteminor(*devp), B_FALSE); 9023 connp->conn_src_preferences = IPV6_PREFER_SRC_DEFAULT; 9024 } else { 9025 connp->conn_af_isv6 = B_FALSE; 9026 connp->conn_pkt_isv6 = B_FALSE; 9027 } 9028 9029 if ((connp->conn_dev = inet_minor_alloc(ip_minor_arena)) == 0) { 9030 q->q_ptr = WR(q)->q_ptr = NULL; 9031 CONN_DEC_REF(connp); 9032 return (EBUSY); 9033 } 9034 9035 maj = getemajor(*devp); 9036 *devp = makedevice(maj, (minor_t)connp->conn_dev); 9037 9038 /* 9039 * connp->conn_cred is crfree()ed in ipcl_conn_destroy() 9040 */ 9041 connp->conn_cred = credp; 9042 crhold(connp->conn_cred); 9043 9044 /* 9045 * If the caller has the process-wide flag set, then default to MAC 9046 * exempt mode. This allows read-down to unlabeled hosts. 9047 */ 9048 if (getpflags(NET_MAC_AWARE, credp) != 0) 9049 connp->conn_mac_exempt = B_TRUE; 9050 9051 connp->conn_zoneid = getzoneid(); 9052 9053 /* 9054 * This should only happen for ndd, netstat, raw socket or other SCTP 9055 * administrative ops. In these cases, we just need a normal conn_t 9056 * with ulp set to IPPROTO_SCTP. All other ops are trapped and 9057 * an error will be returned. 
9058 */ 9059 if (maj != SCTP_MAJ && maj != SCTP6_MAJ) { 9060 connp->conn_rq = q; 9061 connp->conn_wq = WR(q); 9062 } else { 9063 connp->conn_ulp = IPPROTO_SCTP; 9064 connp->conn_rq = connp->conn_wq = NULL; 9065 } 9066 /* Non-zero default values */ 9067 connp->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; 9068 9069 /* 9070 * Make the conn globally visible to walkers 9071 */ 9072 mutex_enter(&connp->conn_lock); 9073 connp->conn_state_flags &= ~CONN_INCIPIENT; 9074 mutex_exit(&connp->conn_lock); 9075 ASSERT(connp->conn_ref == 1); 9076 9077 qprocson(q); 9078 9079 return (0); 9080 } 9081 9082 /* 9083 * Change q_qinfo based on the value of isv6. 9084 * This can not called on an ill queue. 9085 * Note that there is no race since either q_qinfo works for conn queues - it 9086 * is just an optimization to enter the best wput routine directly. 9087 */ 9088 void 9089 ip_setqinfo(queue_t *q, minor_t minor, boolean_t bump_mib) 9090 { 9091 ASSERT(q->q_flag & QREADR); 9092 ASSERT(WR(q)->q_next == NULL); 9093 ASSERT(q->q_ptr != NULL); 9094 9095 if (minor == IPV6_MINOR) { 9096 if (bump_mib) 9097 BUMP_MIB(&ip6_mib, ipv6OutSwitchIPv4); 9098 q->q_qinfo = &rinit_ipv6; 9099 WR(q)->q_qinfo = &winit_ipv6; 9100 (Q_TO_CONN(q))->conn_pkt_isv6 = B_TRUE; 9101 } else { 9102 if (bump_mib) 9103 BUMP_MIB(&ip_mib, ipOutSwitchIPv6); 9104 q->q_qinfo = &rinit; 9105 WR(q)->q_qinfo = &winit; 9106 (Q_TO_CONN(q))->conn_pkt_isv6 = B_FALSE; 9107 } 9108 9109 } 9110 9111 /* 9112 * See if IPsec needs loading because of the options in mp. 9113 */ 9114 static boolean_t 9115 ipsec_opt_present(mblk_t *mp) 9116 { 9117 uint8_t *optcp, *next_optcp, *opt_endcp; 9118 struct opthdr *opt; 9119 struct T_opthdr *topt; 9120 int opthdr_len; 9121 t_uscalar_t optname, optlevel; 9122 struct T_optmgmt_req *tor = (struct T_optmgmt_req *)mp->b_rptr; 9123 ipsec_req_t *ipsr; 9124 9125 /* 9126 * Walk through the mess, and find IP_SEC_OPT. If it's there, 9127 * return TRUE. 
9128 */ 9129 9130 optcp = mi_offset_param(mp, tor->OPT_offset, tor->OPT_length); 9131 opt_endcp = optcp + tor->OPT_length; 9132 if (tor->PRIM_type == T_OPTMGMT_REQ) { 9133 opthdr_len = sizeof (struct T_opthdr); 9134 } else { /* O_OPTMGMT_REQ */ 9135 ASSERT(tor->PRIM_type == T_SVR4_OPTMGMT_REQ); 9136 opthdr_len = sizeof (struct opthdr); 9137 } 9138 for (; optcp < opt_endcp; optcp = next_optcp) { 9139 if (optcp + opthdr_len > opt_endcp) 9140 return (B_FALSE); /* Not enough option header. */ 9141 if (tor->PRIM_type == T_OPTMGMT_REQ) { 9142 topt = (struct T_opthdr *)optcp; 9143 optlevel = topt->level; 9144 optname = topt->name; 9145 next_optcp = optcp + _TPI_ALIGN_TOPT(topt->len); 9146 } else { 9147 opt = (struct opthdr *)optcp; 9148 optlevel = opt->level; 9149 optname = opt->name; 9150 next_optcp = optcp + opthdr_len + 9151 _TPI_ALIGN_OPT(opt->len); 9152 } 9153 if ((next_optcp < optcp) || /* wraparound pointer space */ 9154 ((next_optcp >= opt_endcp) && /* last option bad len */ 9155 ((next_optcp - opt_endcp) >= __TPI_ALIGN_SIZE))) 9156 return (B_FALSE); /* bad option buffer */ 9157 if ((optlevel == IPPROTO_IP && optname == IP_SEC_OPT) || 9158 (optlevel == IPPROTO_IPV6 && optname == IPV6_SEC_OPT)) { 9159 /* 9160 * Check to see if it's an all-bypass or all-zeroes 9161 * IPsec request. Don't bother loading IPsec if 9162 * the socket doesn't want to use it. (A good example 9163 * is a bypass request.) 9164 * 9165 * Basically, if any of the non-NEVER bits are set, 9166 * load IPsec. 9167 */ 9168 ipsr = (ipsec_req_t *)(optcp + opthdr_len); 9169 if ((ipsr->ipsr_ah_req & ~IPSEC_PREF_NEVER) != 0 || 9170 (ipsr->ipsr_esp_req & ~IPSEC_PREF_NEVER) != 0 || 9171 (ipsr->ipsr_self_encap_req & ~IPSEC_PREF_NEVER) 9172 != 0) 9173 return (B_TRUE); 9174 } 9175 } 9176 return (B_FALSE); 9177 } 9178 9179 /* 9180 * If conn is is waiting for ipsec to finish loading, kick it. 
9181 */ 9182 /* ARGSUSED */ 9183 static void 9184 conn_restart_ipsec_waiter(conn_t *connp, void *arg) 9185 { 9186 t_scalar_t optreq_prim; 9187 mblk_t *mp; 9188 cred_t *cr; 9189 int err = 0; 9190 9191 /* 9192 * This function is called, after ipsec loading is complete. 9193 * Since IP checks exclusively and atomically (i.e it prevents 9194 * ipsec load from completing until ip_optcom_req completes) 9195 * whether ipsec load is complete, there cannot be a race with IP 9196 * trying to set the CONN_IPSEC_LOAD_WAIT flag on any conn now. 9197 */ 9198 mutex_enter(&connp->conn_lock); 9199 if (connp->conn_state_flags & CONN_IPSEC_LOAD_WAIT) { 9200 ASSERT(connp->conn_ipsec_opt_mp != NULL); 9201 mp = connp->conn_ipsec_opt_mp; 9202 connp->conn_ipsec_opt_mp = NULL; 9203 connp->conn_state_flags &= ~CONN_IPSEC_LOAD_WAIT; 9204 cr = DB_CREDDEF(mp, GET_QUEUE_CRED(CONNP_TO_WQ(connp))); 9205 mutex_exit(&connp->conn_lock); 9206 9207 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); 9208 9209 optreq_prim = ((union T_primitives *)mp->b_rptr)->type; 9210 if (optreq_prim == T_OPTMGMT_REQ) { 9211 err = tpi_optcom_req(CONNP_TO_WQ(connp), mp, cr, 9212 &ip_opt_obj); 9213 } else { 9214 ASSERT(optreq_prim == T_SVR4_OPTMGMT_REQ); 9215 err = svr4_optcom_req(CONNP_TO_WQ(connp), mp, cr, 9216 &ip_opt_obj); 9217 } 9218 if (err != EINPROGRESS) 9219 CONN_OPER_PENDING_DONE(connp); 9220 return; 9221 } 9222 mutex_exit(&connp->conn_lock); 9223 } 9224 9225 /* 9226 * Called from the ipsec_loader thread, outside any perimeter, to tell 9227 * ip qenable any of the queues waiting for the ipsec loader to 9228 * complete. 9229 * 9230 * Use ip_mi_lock to be safe here: all modifications of the mi lists 9231 * are done with this lock held, so it's guaranteed that none of the 9232 * links will change along the way. 9233 */ 9234 void 9235 ip_ipsec_load_complete() 9236 { 9237 ipcl_walk(conn_restart_ipsec_waiter, NULL); 9238 } 9239 9240 /* 9241 * Can't be used. Need to call svr4* -> optset directly. 
the leaf routine 9242 * determines the grp on which it has to become exclusive, queues the mp 9243 * and sq draining restarts the optmgmt 9244 */ 9245 static boolean_t 9246 ip_check_for_ipsec_opt(queue_t *q, mblk_t *mp) 9247 { 9248 conn_t *connp; 9249 9250 /* 9251 * Take IPsec requests and treat them special. 9252 */ 9253 if (ipsec_opt_present(mp)) { 9254 /* First check if IPsec is loaded. */ 9255 mutex_enter(&ipsec_loader_lock); 9256 if (ipsec_loader_state != IPSEC_LOADER_WAIT) { 9257 mutex_exit(&ipsec_loader_lock); 9258 return (B_FALSE); 9259 } 9260 connp = Q_TO_CONN(q); 9261 mutex_enter(&connp->conn_lock); 9262 connp->conn_state_flags |= CONN_IPSEC_LOAD_WAIT; 9263 9264 ASSERT(connp->conn_ipsec_opt_mp == NULL); 9265 connp->conn_ipsec_opt_mp = mp; 9266 mutex_exit(&connp->conn_lock); 9267 mutex_exit(&ipsec_loader_lock); 9268 9269 ipsec_loader_loadnow(); 9270 return (B_TRUE); 9271 } 9272 return (B_FALSE); 9273 } 9274 9275 /* 9276 * Set IPsec policy from an ipsec_req_t. If the req is not "zero" and valid, 9277 * all of them are copied to the conn_t. If the req is "zero", the policy is 9278 * zeroed out. A "zero" policy has zero ipsr_{ah,req,self_encap}_req 9279 * fields. 9280 * We keep only the latest setting of the policy and thus policy setting 9281 * is not incremental/cumulative. 9282 * 9283 * Requests to set policies with multiple alternative actions will 9284 * go through a different API. 
9285 */ 9286 int 9287 ipsec_set_req(cred_t *cr, conn_t *connp, ipsec_req_t *req) 9288 { 9289 uint_t ah_req = 0; 9290 uint_t esp_req = 0; 9291 uint_t se_req = 0; 9292 ipsec_selkey_t sel; 9293 ipsec_act_t *actp = NULL; 9294 uint_t nact; 9295 ipsec_policy_t *pin4 = NULL, *pout4 = NULL; 9296 ipsec_policy_t *pin6 = NULL, *pout6 = NULL; 9297 ipsec_policy_root_t *pr; 9298 ipsec_policy_head_t *ph; 9299 int fam; 9300 boolean_t is_pol_reset; 9301 int error = 0; 9302 9303 #define REQ_MASK (IPSEC_PREF_REQUIRED|IPSEC_PREF_NEVER) 9304 9305 /* 9306 * The IP_SEC_OPT option does not allow variable length parameters, 9307 * hence a request cannot be NULL. 9308 */ 9309 if (req == NULL) 9310 return (EINVAL); 9311 9312 ah_req = req->ipsr_ah_req; 9313 esp_req = req->ipsr_esp_req; 9314 se_req = req->ipsr_self_encap_req; 9315 9316 /* 9317 * Are we dealing with a request to reset the policy (i.e. 9318 * zero requests). 9319 */ 9320 is_pol_reset = ((ah_req & REQ_MASK) == 0 && 9321 (esp_req & REQ_MASK) == 0 && 9322 (se_req & REQ_MASK) == 0); 9323 9324 if (!is_pol_reset) { 9325 /* 9326 * If we couldn't load IPsec, fail with "protocol 9327 * not supported". 9328 * IPsec may not have been loaded for a request with zero 9329 * policies, so we don't fail in this case. 9330 */ 9331 mutex_enter(&ipsec_loader_lock); 9332 if (ipsec_loader_state != IPSEC_LOADER_SUCCEEDED) { 9333 mutex_exit(&ipsec_loader_lock); 9334 return (EPROTONOSUPPORT); 9335 } 9336 mutex_exit(&ipsec_loader_lock); 9337 9338 /* 9339 * Test for valid requests. Invalid algorithms 9340 * need to be tested by IPSEC code because new 9341 * algorithms can be added dynamically. 9342 */ 9343 if ((ah_req & ~(REQ_MASK|IPSEC_PREF_UNIQUE)) != 0 || 9344 (esp_req & ~(REQ_MASK|IPSEC_PREF_UNIQUE)) != 0 || 9345 (se_req & ~(REQ_MASK|IPSEC_PREF_UNIQUE)) != 0) { 9346 return (EINVAL); 9347 } 9348 9349 /* 9350 * Only privileged users can issue these 9351 * requests. 
9352 */ 9353 if (((ah_req & IPSEC_PREF_NEVER) || 9354 (esp_req & IPSEC_PREF_NEVER) || 9355 (se_req & IPSEC_PREF_NEVER)) && 9356 secpolicy_net_config(cr, B_FALSE) != 0) { 9357 return (EPERM); 9358 } 9359 9360 /* 9361 * The IPSEC_PREF_REQUIRED and IPSEC_PREF_NEVER 9362 * are mutually exclusive. 9363 */ 9364 if (((ah_req & REQ_MASK) == REQ_MASK) || 9365 ((esp_req & REQ_MASK) == REQ_MASK) || 9366 ((se_req & REQ_MASK) == REQ_MASK)) { 9367 /* Both of them are set */ 9368 return (EINVAL); 9369 } 9370 } 9371 9372 mutex_enter(&connp->conn_lock); 9373 9374 /* 9375 * If we have already cached policies in ip_bind_connected*(), don't 9376 * let them change now. We cache policies for connections 9377 * whose src,dst [addr, port] is known. The exception to this is 9378 * tunnels. Tunnels are allowed to change policies after having 9379 * become fully bound. 9380 */ 9381 if (connp->conn_policy_cached && !IPCL_IS_IPTUN(connp)) { 9382 mutex_exit(&connp->conn_lock); 9383 return (EINVAL); 9384 } 9385 9386 /* 9387 * We have a zero policies, reset the connection policy if already 9388 * set. This will cause the connection to inherit the 9389 * global policy, if any. 9390 */ 9391 if (is_pol_reset) { 9392 if (connp->conn_policy != NULL) { 9393 IPPH_REFRELE(connp->conn_policy); 9394 connp->conn_policy = NULL; 9395 } 9396 connp->conn_flags &= ~IPCL_CHECK_POLICY; 9397 connp->conn_in_enforce_policy = B_FALSE; 9398 connp->conn_out_enforce_policy = B_FALSE; 9399 mutex_exit(&connp->conn_lock); 9400 return (0); 9401 } 9402 9403 ph = connp->conn_policy = ipsec_polhead_split(connp->conn_policy); 9404 if (ph == NULL) 9405 goto enomem; 9406 9407 ipsec_actvec_from_req(req, &actp, &nact); 9408 if (actp == NULL) 9409 goto enomem; 9410 9411 /* 9412 * Always allocate IPv4 policy entries, since they can also 9413 * apply to ipv6 sockets being used in ipv4-compat mode. 
9414 */ 9415 bzero(&sel, sizeof (sel)); 9416 sel.ipsl_valid = IPSL_IPV4; 9417 9418 pin4 = ipsec_policy_create(&sel, actp, nact, IPSEC_PRIO_SOCKET); 9419 if (pin4 == NULL) 9420 goto enomem; 9421 9422 pout4 = ipsec_policy_create(&sel, actp, nact, IPSEC_PRIO_SOCKET); 9423 if (pout4 == NULL) 9424 goto enomem; 9425 9426 if (connp->conn_pkt_isv6) { 9427 /* 9428 * We're looking at a v6 socket, also allocate the 9429 * v6-specific entries... 9430 */ 9431 sel.ipsl_valid = IPSL_IPV6; 9432 pin6 = ipsec_policy_create(&sel, actp, nact, 9433 IPSEC_PRIO_SOCKET); 9434 if (pin6 == NULL) 9435 goto enomem; 9436 9437 pout6 = ipsec_policy_create(&sel, actp, nact, 9438 IPSEC_PRIO_SOCKET); 9439 if (pout6 == NULL) 9440 goto enomem; 9441 9442 /* 9443 * .. and file them away in the right place. 9444 */ 9445 fam = IPSEC_AF_V6; 9446 pr = &ph->iph_root[IPSEC_TYPE_INBOUND]; 9447 HASHLIST_INSERT(pin6, ipsp_hash, pr->ipr_nonhash[fam]); 9448 ipsec_insert_always(&ph->iph_rulebyid, pin6); 9449 pr = &ph->iph_root[IPSEC_TYPE_OUTBOUND]; 9450 HASHLIST_INSERT(pout6, ipsp_hash, pr->ipr_nonhash[fam]); 9451 ipsec_insert_always(&ph->iph_rulebyid, pout6); 9452 } 9453 9454 ipsec_actvec_free(actp, nact); 9455 9456 /* 9457 * File the v4 policies. 9458 */ 9459 fam = IPSEC_AF_V4; 9460 pr = &ph->iph_root[IPSEC_TYPE_INBOUND]; 9461 HASHLIST_INSERT(pin4, ipsp_hash, pr->ipr_nonhash[fam]); 9462 ipsec_insert_always(&ph->iph_rulebyid, pin4); 9463 9464 pr = &ph->iph_root[IPSEC_TYPE_OUTBOUND]; 9465 HASHLIST_INSERT(pout4, ipsp_hash, pr->ipr_nonhash[fam]); 9466 ipsec_insert_always(&ph->iph_rulebyid, pout4); 9467 9468 /* 9469 * If the requests need security, set enforce_policy. 9470 * If the requests are IPSEC_PREF_NEVER, one should 9471 * still set conn_out_enforce_policy so that an ipsec_out 9472 * gets attached in ip_wput. This is needed so that 9473 * for connections that we don't cache policy in ip_bind, 9474 * if global policy matches in ip_wput_attach_policy, we 9475 * don't wrongly inherit global policy. 
Similarly, we need 9476 * to set conn_in_enforce_policy also so that we don't verify 9477 * policy wrongly. 9478 */ 9479 if ((ah_req & REQ_MASK) != 0 || 9480 (esp_req & REQ_MASK) != 0 || 9481 (se_req & REQ_MASK) != 0) { 9482 connp->conn_in_enforce_policy = B_TRUE; 9483 connp->conn_out_enforce_policy = B_TRUE; 9484 connp->conn_flags |= IPCL_CHECK_POLICY; 9485 } 9486 9487 /* 9488 * Tunnels are allowed to set policy after having been fully bound. 9489 * If that's the case, cache policy here. 9490 */ 9491 if (IPCL_IS_IPTUN(connp) && connp->conn_fully_bound) 9492 error = ipsec_conn_cache_policy(connp, !connp->conn_af_isv6); 9493 9494 mutex_exit(&connp->conn_lock); 9495 return (error); 9496 #undef REQ_MASK 9497 9498 /* 9499 * Common memory-allocation-failure exit path. 9500 */ 9501 enomem: 9502 mutex_exit(&connp->conn_lock); 9503 if (actp != NULL) 9504 ipsec_actvec_free(actp, nact); 9505 if (pin4 != NULL) 9506 IPPOL_REFRELE(pin4); 9507 if (pout4 != NULL) 9508 IPPOL_REFRELE(pout4); 9509 if (pin6 != NULL) 9510 IPPOL_REFRELE(pin6); 9511 if (pout6 != NULL) 9512 IPPOL_REFRELE(pout6); 9513 return (ENOMEM); 9514 } 9515 9516 /* 9517 * Only for options that pass in an IP addr. Currently only V4 options 9518 * pass in an ipif. V6 options always pass an ifindex specifying the ill. 
9519 * So this function assumes level is IPPROTO_IP 9520 */ 9521 int 9522 ip_opt_set_ipif(conn_t *connp, ipaddr_t addr, boolean_t checkonly, int option, 9523 mblk_t *first_mp) 9524 { 9525 ipif_t *ipif = NULL; 9526 int error; 9527 ill_t *ill; 9528 9529 ip2dbg(("ip_opt_set_ipif: ipaddr %X\n", addr)); 9530 9531 if (addr != INADDR_ANY || checkonly) { 9532 ASSERT(connp != NULL); 9533 if (option == IP_NEXTHOP) { 9534 ipif = 9535 ipif_lookup_onlink_addr(addr, connp->conn_zoneid); 9536 } else { 9537 ipif = ipif_lookup_addr(addr, NULL, connp->conn_zoneid, 9538 CONNP_TO_WQ(connp), first_mp, ip_restart_optmgmt, 9539 &error); 9540 } 9541 if (ipif == NULL) { 9542 if (error == EINPROGRESS) 9543 return (error); 9544 else if ((option == IP_MULTICAST_IF) || 9545 (option == IP_NEXTHOP)) 9546 return (EHOSTUNREACH); 9547 else 9548 return (EINVAL); 9549 } else if (checkonly) { 9550 if (option == IP_MULTICAST_IF) { 9551 ill = ipif->ipif_ill; 9552 /* not supported by the virtual network iface */ 9553 if (IS_VNI(ill)) { 9554 ipif_refrele(ipif); 9555 return (EINVAL); 9556 } 9557 } 9558 ipif_refrele(ipif); 9559 return (0); 9560 } 9561 ill = ipif->ipif_ill; 9562 mutex_enter(&connp->conn_lock); 9563 mutex_enter(&ill->ill_lock); 9564 if ((ill->ill_state_flags & ILL_CONDEMNED) || 9565 (ipif->ipif_state_flags & IPIF_CONDEMNED)) { 9566 mutex_exit(&ill->ill_lock); 9567 mutex_exit(&connp->conn_lock); 9568 ipif_refrele(ipif); 9569 return (option == IP_MULTICAST_IF ? 9570 EHOSTUNREACH : EINVAL); 9571 } 9572 } else { 9573 mutex_enter(&connp->conn_lock); 9574 } 9575 9576 /* None of the options below are supported on the VNI */ 9577 if (ipif != NULL && IS_VNI(ipif->ipif_ill)) { 9578 mutex_exit(&ill->ill_lock); 9579 mutex_exit(&connp->conn_lock); 9580 ipif_refrele(ipif); 9581 return (EINVAL); 9582 } 9583 9584 switch (option) { 9585 case IP_DONTFAILOVER_IF: 9586 /* 9587 * This option is used by in.mpathd to ensure 9588 * that IPMP probe packets only go out on the 9589 * test interfaces. 
in.mpathd sets this option 9590 * on the non-failover interfaces. 9591 * For backward compatibility, this option 9592 * implicitly sets IP_MULTICAST_IF, as used 9593 * be done in bind(), so that ip_wput gets 9594 * this ipif to send mcast packets. 9595 */ 9596 if (ipif != NULL) { 9597 ASSERT(addr != INADDR_ANY); 9598 connp->conn_nofailover_ill = ipif->ipif_ill; 9599 connp->conn_multicast_ipif = ipif; 9600 } else { 9601 ASSERT(addr == INADDR_ANY); 9602 connp->conn_nofailover_ill = NULL; 9603 connp->conn_multicast_ipif = NULL; 9604 } 9605 break; 9606 9607 case IP_MULTICAST_IF: 9608 connp->conn_multicast_ipif = ipif; 9609 break; 9610 case IP_NEXTHOP: 9611 connp->conn_nexthop_v4 = addr; 9612 connp->conn_nexthop_set = B_TRUE; 9613 break; 9614 } 9615 9616 if (ipif != NULL) { 9617 mutex_exit(&ill->ill_lock); 9618 mutex_exit(&connp->conn_lock); 9619 ipif_refrele(ipif); 9620 return (0); 9621 } 9622 mutex_exit(&connp->conn_lock); 9623 /* We succeded in cleared the option */ 9624 return (0); 9625 } 9626 9627 /* 9628 * For options that pass in an ifindex specifying the ill. V6 options always 9629 * pass in an ill. Some v4 options also pass in ifindex specifying the ill. 
9630 */ 9631 int 9632 ip_opt_set_ill(conn_t *connp, int ifindex, boolean_t isv6, boolean_t checkonly, 9633 int level, int option, mblk_t *first_mp) 9634 { 9635 ill_t *ill = NULL; 9636 int error = 0; 9637 9638 ip2dbg(("ip_opt_set_ill: ifindex %d\n", ifindex)); 9639 if (ifindex != 0) { 9640 ASSERT(connp != NULL); 9641 ill = ill_lookup_on_ifindex(ifindex, isv6, CONNP_TO_WQ(connp), 9642 first_mp, ip_restart_optmgmt, &error); 9643 if (ill != NULL) { 9644 if (checkonly) { 9645 /* not supported by the virtual network iface */ 9646 if (IS_VNI(ill)) { 9647 ill_refrele(ill); 9648 return (EINVAL); 9649 } 9650 ill_refrele(ill); 9651 return (0); 9652 } 9653 if (!ipif_lookup_zoneid_group(ill, connp->conn_zoneid, 9654 0, NULL)) { 9655 ill_refrele(ill); 9656 ill = NULL; 9657 mutex_enter(&connp->conn_lock); 9658 goto setit; 9659 } 9660 mutex_enter(&connp->conn_lock); 9661 mutex_enter(&ill->ill_lock); 9662 if (ill->ill_state_flags & ILL_CONDEMNED) { 9663 mutex_exit(&ill->ill_lock); 9664 mutex_exit(&connp->conn_lock); 9665 ill_refrele(ill); 9666 ill = NULL; 9667 mutex_enter(&connp->conn_lock); 9668 } 9669 goto setit; 9670 } else if (error == EINPROGRESS) { 9671 return (error); 9672 } else { 9673 error = 0; 9674 } 9675 } 9676 mutex_enter(&connp->conn_lock); 9677 setit: 9678 ASSERT((level == IPPROTO_IP || level == IPPROTO_IPV6)); 9679 9680 /* 9681 * The options below assume that the ILL (if any) transmits and/or 9682 * receives traffic. Neither of which is true for the virtual network 9683 * interface, so fail setting these on a VNI. 9684 */ 9685 if (IS_VNI(ill)) { 9686 ASSERT(ill != NULL); 9687 mutex_exit(&ill->ill_lock); 9688 mutex_exit(&connp->conn_lock); 9689 ill_refrele(ill); 9690 return (EINVAL); 9691 } 9692 9693 if (level == IPPROTO_IP) { 9694 switch (option) { 9695 case IP_BOUND_IF: 9696 connp->conn_incoming_ill = ill; 9697 connp->conn_outgoing_ill = ill; 9698 connp->conn_orig_bound_ifindex = (ill == NULL) ? 
9699 0 : ifindex; 9700 break; 9701 9702 case IP_XMIT_IF: 9703 /* 9704 * Similar to IP_BOUND_IF, but this only 9705 * determines the outgoing interface for 9706 * unicast packets. Also no IRE_CACHE entry 9707 * is added for the destination of the 9708 * outgoing packets. This feature is needed 9709 * for mobile IP. 9710 */ 9711 connp->conn_xmit_if_ill = ill; 9712 connp->conn_orig_xmit_ifindex = (ill == NULL) ? 9713 0 : ifindex; 9714 break; 9715 9716 case IP_MULTICAST_IF: 9717 /* 9718 * This option is an internal special. The socket 9719 * level IP_MULTICAST_IF specifies an 'ipaddr' and 9720 * is handled in ip_opt_set_ipif. IPV6_MULTICAST_IF 9721 * specifies an ifindex and we try first on V6 ill's. 9722 * If we don't find one, we they try using on v4 ill's 9723 * intenally and we come here. 9724 */ 9725 if (!checkonly && ill != NULL) { 9726 ipif_t *ipif; 9727 ipif = ill->ill_ipif; 9728 9729 if (ipif->ipif_state_flags & IPIF_CONDEMNED) { 9730 mutex_exit(&ill->ill_lock); 9731 mutex_exit(&connp->conn_lock); 9732 ill_refrele(ill); 9733 ill = NULL; 9734 mutex_enter(&connp->conn_lock); 9735 } else { 9736 connp->conn_multicast_ipif = ipif; 9737 } 9738 } 9739 break; 9740 } 9741 } else { 9742 switch (option) { 9743 case IPV6_BOUND_IF: 9744 connp->conn_incoming_ill = ill; 9745 connp->conn_outgoing_ill = ill; 9746 connp->conn_orig_bound_ifindex = (ill == NULL) ? 9747 0 : ifindex; 9748 break; 9749 9750 case IPV6_BOUND_PIF: 9751 /* 9752 * Limit all transmit to this ill. 9753 * Unlike IPV6_BOUND_IF, using this option 9754 * prevents load spreading and failover from 9755 * happening when the interface is part of the 9756 * group. That's why we don't need to remember 9757 * the ifindex in orig_bound_ifindex as in 9758 * IPV6_BOUND_IF. 9759 */ 9760 connp->conn_outgoing_pill = ill; 9761 break; 9762 9763 case IPV6_DONTFAILOVER_IF: 9764 /* 9765 * This option is used by in.mpathd to ensure 9766 * that IPMP probe packets only go out on the 9767 * test interfaces. 
in.mpathd sets this option 9768 * on the non-failover interfaces. 9769 */ 9770 connp->conn_nofailover_ill = ill; 9771 /* 9772 * For backward compatibility, this option 9773 * implicitly sets ip_multicast_ill as used in 9774 * IP_MULTICAST_IF so that ip_wput gets 9775 * this ipif to send mcast packets. 9776 */ 9777 connp->conn_multicast_ill = ill; 9778 connp->conn_orig_multicast_ifindex = (ill == NULL) ? 9779 0 : ifindex; 9780 break; 9781 9782 case IPV6_MULTICAST_IF: 9783 /* 9784 * Set conn_multicast_ill to be the IPv6 ill. 9785 * Set conn_multicast_ipif to be an IPv4 ipif 9786 * for ifindex to make IPv4 mapped addresses 9787 * on PF_INET6 sockets honor IPV6_MULTICAST_IF. 9788 * Even if no IPv6 ill exists for the ifindex 9789 * we need to check for an IPv4 ifindex in order 9790 * for this to work with mapped addresses. In that 9791 * case only set conn_multicast_ipif. 9792 */ 9793 if (!checkonly) { 9794 if (ifindex == 0) { 9795 connp->conn_multicast_ill = NULL; 9796 connp->conn_orig_multicast_ifindex = 0; 9797 connp->conn_multicast_ipif = NULL; 9798 } else if (ill != NULL) { 9799 connp->conn_multicast_ill = ill; 9800 connp->conn_orig_multicast_ifindex = 9801 ifindex; 9802 } 9803 } 9804 break; 9805 } 9806 } 9807 9808 if (ill != NULL) { 9809 mutex_exit(&ill->ill_lock); 9810 mutex_exit(&connp->conn_lock); 9811 ill_refrele(ill); 9812 return (0); 9813 } 9814 mutex_exit(&connp->conn_lock); 9815 /* 9816 * We succeeded in clearing the option (ifindex == 0) or failed to 9817 * locate the ill and could not set the option (ifindex != 0) 9818 */ 9819 return (ifindex == 0 ? 0 : EINVAL); 9820 } 9821 9822 /* This routine sets socket options. 
 */
/*
 * Set an IP-level (or socket-level) option on behalf of a transport.
 *
 * optset_context distinguishes a T_CHECK-style validation pass
 * (SETFN_OPTCOM_CHECKONLY) from the *_NEGOTIATE contexts that actually
 * apply the option.  On success the option value is echoed back through
 * outvalp/outlenp.  A negative return (-EINVAL) is a "soft" error meaning
 * the option is not handled at this level; EINPROGRESS means the operation
 * needs a retry and *outlenp is deliberately left untouched in that case.
 */
/* ARGSUSED */
int
ip_opt_set(queue_t *q, uint_t optset_context, int level, int name,
    uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
    void *dummy, cred_t *cr, mblk_t *first_mp)
{
	int	*i1 = (int *)invalp;
	conn_t	*connp = Q_TO_CONN(q);
	int	error = 0;
	boolean_t checkonly;
	ire_t	*ire;
	boolean_t found;

	switch (optset_context) {

	case SETFN_OPTCOM_CHECKONLY:
		checkonly = B_TRUE;
		/*
		 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
		 * inlen != 0 implies value supplied and
		 * we have to "pretend" to set it.
		 * inlen == 0 implies that there is no
		 * value part in T_CHECK request and just validation
		 * done elsewhere should be enough, we just return here.
		 */
		if (inlen == 0) {
			*outlenp = 0;
			return (0);
		}
		break;
	case SETFN_OPTCOM_NEGOTIATE:
	case SETFN_UD_NEGOTIATE:
	case SETFN_CONN_NEGOTIATE:
		checkonly = B_FALSE;
		break;
	default:
		/*
		 * We should never get here
		 */
		*outlenp = 0;
		return (EINVAL);
	}

	ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
	    (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));

	/*
	 * For fixed length options, no sanity check
	 * of passed in length is done. It is assumed *_optcom_req()
	 * routines do the right thing.
	 */

	switch (level) {
	case SOL_SOCKET:
		/*
		 * conn_lock protects the bitfields, and is used to
		 * set the fields atomically.
		 */
		switch (name) {
		case SO_BROADCAST:
			if (!checkonly) {
				/* TODO: use value someplace? */
				mutex_enter(&connp->conn_lock);
				connp->conn_broadcast = *i1 ? 1 : 0;
				mutex_exit(&connp->conn_lock);
			}
			break;	/* goto sizeof (int) option return */
		case SO_USELOOPBACK:
			if (!checkonly) {
				/* TODO: use value someplace? */
				mutex_enter(&connp->conn_lock);
				connp->conn_loopback = *i1 ? 1 : 0;
				mutex_exit(&connp->conn_lock);
			}
			break;	/* goto sizeof (int) option return */
		case SO_DONTROUTE:
			if (!checkonly) {
				mutex_enter(&connp->conn_lock);
				connp->conn_dontroute = *i1 ? 1 : 0;
				mutex_exit(&connp->conn_lock);
			}
			break;	/* goto sizeof (int) option return */
		case SO_REUSEADDR:
			if (!checkonly) {
				mutex_enter(&connp->conn_lock);
				connp->conn_reuseaddr = *i1 ? 1 : 0;
				mutex_exit(&connp->conn_lock);
			}
			break;	/* goto sizeof (int) option return */
		case SO_PROTOTYPE:
			if (!checkonly) {
				mutex_enter(&connp->conn_lock);
				connp->conn_proto = *i1;
				mutex_exit(&connp->conn_lock);
			}
			break;	/* goto sizeof (int) option return */
		case SO_ANON_MLP:
			if (!checkonly) {
				mutex_enter(&connp->conn_lock);
				connp->conn_anon_mlp = *i1 != 0 ? 1 : 0;
				mutex_exit(&connp->conn_lock);
			}
			break;	/* goto sizeof (int) option return */
		case SO_MAC_EXEMPT:
			/* Requires privilege and an unbound conn. */
			if (secpolicy_net_mac_aware(cr) != 0 ||
			    IPCL_IS_BOUND(connp))
				return (EACCES);
			if (!checkonly) {
				mutex_enter(&connp->conn_lock);
				connp->conn_mac_exempt = *i1 != 0 ? 1 : 0;
				mutex_exit(&connp->conn_lock);
			}
			break;	/* goto sizeof (int) option return */
		default:
			/*
			 * "soft" error (negative)
			 * option not handled at this level
			 * Note: Do not modify *outlenp
			 */
			return (-EINVAL);
		}
		break;
	case IPPROTO_IP:
		switch (name) {
		case IP_NEXTHOP:
		case IP_MULTICAST_IF:
		case IP_DONTFAILOVER_IF: {
			ipaddr_t addr = *i1;

			error = ip_opt_set_ipif(connp, addr, checkonly, name,
			    first_mp);
			if (error != 0)
				return (error);
			break;	/* goto sizeof (int) option return */
		}

		case IP_MULTICAST_TTL:
			/* Recorded in transport above IP */
			*outvalp = *invalp;
			*outlenp = sizeof (uchar_t);
			return (0);
		case IP_MULTICAST_LOOP:
			if (!checkonly) {
				mutex_enter(&connp->conn_lock);
				connp->conn_multicast_loop = *invalp ? 1 : 0;
				mutex_exit(&connp->conn_lock);
			}
			*outvalp = *invalp;
			*outlenp = sizeof (uchar_t);
			return (0);
		case IP_ADD_MEMBERSHIP:
		case MCAST_JOIN_GROUP:
		case IP_DROP_MEMBERSHIP:
		case MCAST_LEAVE_GROUP: {
			struct ip_mreq *mreqp;
			struct group_req *greqp;
			ire_t *ire;
			boolean_t done = B_FALSE;
			ipaddr_t group, ifaddr;
			struct sockaddr_in *sin;
			uint32_t *ifindexp;
			boolean_t mcast_opt = B_TRUE;
			mcast_record_t fmode;
			int (*optfn)(conn_t *, boolean_t, ipaddr_t, ipaddr_t,
			    uint_t *, mcast_record_t, ipaddr_t, mblk_t *);

			/*
			 * The IP_* forms take a struct ip_mreq; the MCAST_*
			 * forms take a struct group_req.  Map each to the
			 * common join/leave helper and filter mode.
			 */
			switch (name) {
			case IP_ADD_MEMBERSHIP:
				mcast_opt = B_FALSE;
				/* FALLTHRU */
			case MCAST_JOIN_GROUP:
				fmode = MODE_IS_EXCLUDE;
				optfn = ip_opt_add_group;
				break;

			case IP_DROP_MEMBERSHIP:
				mcast_opt = B_FALSE;
				/* FALLTHRU */
			case MCAST_LEAVE_GROUP:
				fmode = MODE_IS_INCLUDE;
				optfn = ip_opt_delete_group;
				break;
			}

			if (mcast_opt) {
				greqp = (struct group_req *)i1;
				sin = (struct sockaddr_in *)&greqp->gr_group;
				if (sin->sin_family != AF_INET) {
					*outlenp = 0;
					return (ENOPROTOOPT);
				}
				group = (ipaddr_t)sin->sin_addr.s_addr;
				ifaddr = INADDR_ANY;
				ifindexp = &greqp->gr_interface;
			} else {
				mreqp = (struct ip_mreq *)i1;
				group = (ipaddr_t)mreqp->imr_multiaddr.s_addr;
				ifaddr = (ipaddr_t)mreqp->imr_interface.s_addr;
				ifindexp = NULL;
			}

			/*
			 * In the multirouting case, we need to replicate
			 * the request on all interfaces that will take part
			 * in replication. We do so because multirouting is
			 * reflective, thus we will probably receive multi-
			 * casts on those interfaces.
			 * The ip_multirt_apply_membership() succeeds if the
			 * operation succeeds on at least one interface.
			 */
			ire = ire_ftable_lookup(group, IP_HOST_MASK, 0,
			    IRE_HOST, NULL, NULL, ALL_ZONES, 0, NULL,
			    MATCH_IRE_MASK | MATCH_IRE_TYPE);
			if (ire != NULL) {
				if (ire->ire_flags & RTF_MULTIRT) {
					error = ip_multirt_apply_membership(
					    optfn, ire, connp, checkonly, group,
					    fmode, INADDR_ANY, first_mp);
					done = B_TRUE;
				}
				ire_refrele(ire);
			}
			if (!done) {
				error = optfn(connp, checkonly, group, ifaddr,
				    ifindexp, fmode, INADDR_ANY, first_mp);
			}
			if (error) {
				/*
				 * EINPROGRESS is a soft error, needs retry
				 * so don't make *outlenp zero.
				 */
				if (error != EINPROGRESS)
					*outlenp = 0;
				return (error);
			}
			/* OK return - copy input buffer into output buffer */
			if (invalp != outvalp) {
				/* don't trust bcopy for identical src/dst */
				bcopy(invalp, outvalp, inlen);
			}
			*outlenp = inlen;
			return (0);
		}
		case IP_BLOCK_SOURCE:
		case IP_UNBLOCK_SOURCE:
		case IP_ADD_SOURCE_MEMBERSHIP:
		case IP_DROP_SOURCE_MEMBERSHIP:
		case MCAST_BLOCK_SOURCE:
		case MCAST_UNBLOCK_SOURCE:
		case MCAST_JOIN_SOURCE_GROUP:
		case MCAST_LEAVE_SOURCE_GROUP: {
			struct ip_mreq_source *imreqp;
			struct group_source_req *gsreqp;
			in_addr_t grp, src, ifaddr = INADDR_ANY;
			uint32_t ifindex = 0;
			mcast_record_t fmode;
			struct sockaddr_in *sin;
			ire_t *ire;
			boolean_t mcast_opt = B_TRUE, done = B_FALSE;
			int (*optfn)(conn_t *, boolean_t, ipaddr_t, ipaddr_t,
			    uint_t *, mcast_record_t, ipaddr_t, mblk_t *);

			switch (name) {
			case IP_BLOCK_SOURCE:
				mcast_opt = B_FALSE;
				/* FALLTHRU */
			case MCAST_BLOCK_SOURCE:
				fmode = MODE_IS_EXCLUDE;
				optfn = ip_opt_add_group;
				break;

			case IP_UNBLOCK_SOURCE:
				mcast_opt = B_FALSE;
				/* FALLTHRU */
			case MCAST_UNBLOCK_SOURCE:
				fmode = MODE_IS_EXCLUDE;
				optfn = ip_opt_delete_group;
				break;

			case IP_ADD_SOURCE_MEMBERSHIP:
				mcast_opt = B_FALSE;
				/* FALLTHRU */
			case MCAST_JOIN_SOURCE_GROUP:
				fmode = MODE_IS_INCLUDE;
				optfn = ip_opt_add_group;
				break;

			case IP_DROP_SOURCE_MEMBERSHIP:
				mcast_opt = B_FALSE;
				/* FALLTHRU */
			case MCAST_LEAVE_SOURCE_GROUP:
				fmode = MODE_IS_INCLUDE;
				optfn = ip_opt_delete_group;
				break;
			}

			if (mcast_opt) {
				gsreqp = (struct group_source_req *)i1;
				if (gsreqp->gsr_group.ss_family != AF_INET) {
					*outlenp = 0;
					return (ENOPROTOOPT);
				}
				sin = (struct sockaddr_in *)&gsreqp->gsr_group;
				grp = (ipaddr_t)sin->sin_addr.s_addr;
				sin = (struct sockaddr_in *)&gsreqp->gsr_source;
				src = (ipaddr_t)sin->sin_addr.s_addr;
				ifindex = gsreqp->gsr_interface;
			} else {
				imreqp = (struct ip_mreq_source *)i1;
				grp = (ipaddr_t)imreqp->imr_multiaddr.s_addr;
				src = (ipaddr_t)imreqp->imr_sourceaddr.s_addr;
				ifaddr = (ipaddr_t)imreqp->imr_interface.s_addr;
			}

			/*
			 * In the multirouting case, we need to replicate
			 * the request as noted in the mcast cases above.
			 */
			ire = ire_ftable_lookup(grp, IP_HOST_MASK, 0,
			    IRE_HOST, NULL, NULL, ALL_ZONES, 0, NULL,
			    MATCH_IRE_MASK | MATCH_IRE_TYPE);
			if (ire != NULL) {
				if (ire->ire_flags & RTF_MULTIRT) {
					error = ip_multirt_apply_membership(
					    optfn, ire, connp, checkonly, grp,
					    fmode, src, first_mp);
					done = B_TRUE;
				}
				ire_refrele(ire);
			}
			if (!done) {
				error = optfn(connp, checkonly, grp, ifaddr,
				    &ifindex, fmode, src, first_mp);
			}
			if (error != 0) {
				/*
				 * EINPROGRESS is a soft error, needs retry
				 * so don't make *outlenp zero.
				 */
				if (error != EINPROGRESS)
					*outlenp = 0;
				return (error);
			}
			/* OK return - copy input buffer into output buffer */
			if (invalp != outvalp) {
				bcopy(invalp, outvalp, inlen);
			}
			*outlenp = inlen;
			return (0);
		}
		case IP_SEC_OPT:
			error = ipsec_set_req(cr, connp, (ipsec_req_t *)invalp);
			if (error != 0) {
				*outlenp = 0;
				return (error);
			}
			break;
		case IP_HDRINCL:
		case IP_OPTIONS:
		case T_IP_OPTIONS:
		case IP_TOS:
		case T_IP_TOS:
		case IP_TTL:
		case IP_RECVDSTADDR:
		case IP_RECVOPTS:
			/* OK return - copy input buffer into output buffer */
			if (invalp != outvalp) {
				/* don't trust bcopy for identical src/dst */
				bcopy(invalp, outvalp, inlen);
			}
			*outlenp = inlen;
			return (0);
		case IP_RECVIF:
			/* Retrieve the inbound interface index */
			if (!checkonly) {
				mutex_enter(&connp->conn_lock);
				connp->conn_recvif = *i1 ? 1 : 0;
				mutex_exit(&connp->conn_lock);
			}
			break;	/* goto sizeof (int) option return */
		case IP_RECVSLLA:
			/* Retrieve the source link layer address */
			if (!checkonly) {
				mutex_enter(&connp->conn_lock);
				connp->conn_recvslla = *i1 ? 1 : 0;
				mutex_exit(&connp->conn_lock);
			}
			break;	/* goto sizeof (int) option return */
		case MRT_INIT:
		case MRT_DONE:
		case MRT_ADD_VIF:
		case MRT_DEL_VIF:
		case MRT_ADD_MFC:
		case MRT_DEL_MFC:
		case MRT_ASSERT:
			/* Multicast-routing ops require net_config privilege */
			if ((error = secpolicy_net_config(cr, B_FALSE)) != 0) {
				*outlenp = 0;
				return (error);
			}
			error = ip_mrouter_set((int)name, q, checkonly,
			    (uchar_t *)invalp, inlen, first_mp);
			if (error) {
				*outlenp = 0;
				return (error);
			}
			/* OK return - copy input buffer into output buffer */
			if (invalp != outvalp) {
				/* don't trust bcopy for identical src/dst */
				bcopy(invalp, outvalp, inlen);
			}
			*outlenp = inlen;
			return (0);
		case IP_BOUND_IF:
		case IP_XMIT_IF:
			error = ip_opt_set_ill(connp, *i1, B_FALSE, checkonly,
			    level, name, first_mp);
			if (error != 0)
				return (error);
			break;	/* goto sizeof (int) option return */

		case IP_UNSPEC_SRC:
			/* Allow sending with a zero source address */
			if (!checkonly) {
				mutex_enter(&connp->conn_lock);
				connp->conn_unspec_src = *i1 ? 1 : 0;
				mutex_exit(&connp->conn_lock);
			}
			break;	/* goto sizeof (int) option return */
		default:
			/*
			 * "soft" error (negative)
			 * option not handled at this level
			 * Note: Do not modify *outlenp
			 */
			return (-EINVAL);
		}
		break;
	case IPPROTO_IPV6:
		switch (name) {
		case IPV6_BOUND_IF:
		case IPV6_BOUND_PIF:
		case IPV6_DONTFAILOVER_IF:
			error = ip_opt_set_ill(connp, *i1, B_TRUE, checkonly,
			    level, name, first_mp);
			if (error != 0)
				return (error);
			break;	/* goto sizeof (int) option return */

		case IPV6_MULTICAST_IF:
			/*
			 * The only possible errors are EINPROGRESS and
			 * EINVAL. EINPROGRESS will be restarted and is not
			 * a hard error. We call this option on both V4 and V6
			 * If both return EINVAL, then this call returns
			 * EINVAL. If at least one of them succeeds we
			 * return success.
			 */
			found = B_FALSE;
			error = ip_opt_set_ill(connp, *i1, B_TRUE, checkonly,
			    level, name, first_mp);
			if (error == EINPROGRESS)
				return (error);
			if (error == 0)
				found = B_TRUE;
			error = ip_opt_set_ill(connp, *i1, B_FALSE, checkonly,
			    IPPROTO_IP, IP_MULTICAST_IF, first_mp);
			if (error == 0)
				found = B_TRUE;
			if (!found)
				return (error);
			break;	/* goto sizeof (int) option return */

		case IPV6_MULTICAST_HOPS:
			/* Recorded in transport above IP */
			break;	/* goto sizeof (int) option return */
		case IPV6_MULTICAST_LOOP:
			if (!checkonly) {
				mutex_enter(&connp->conn_lock);
				connp->conn_multicast_loop = *i1;
				mutex_exit(&connp->conn_lock);
			}
			break;	/* goto sizeof (int) option return */
		case IPV6_JOIN_GROUP:
		case MCAST_JOIN_GROUP:
		case IPV6_LEAVE_GROUP:
		case MCAST_LEAVE_GROUP: {
			struct ipv6_mreq *ip_mreqp;
			struct group_req *greqp;
			ire_t *ire;
			boolean_t done = B_FALSE;
			in6_addr_t groupv6;
			uint32_t ifindex;
			boolean_t mcast_opt = B_TRUE;
			mcast_record_t fmode;
			int (*optfn)(conn_t *, boolean_t, const in6_addr_t *,
			    int, mcast_record_t, const in6_addr_t *, mblk_t *);

			switch (name) {
			case IPV6_JOIN_GROUP:
				mcast_opt = B_FALSE;
				/* FALLTHRU */
			case MCAST_JOIN_GROUP:
				fmode = MODE_IS_EXCLUDE;
				optfn = ip_opt_add_group_v6;
				break;

			case IPV6_LEAVE_GROUP:
				mcast_opt = B_FALSE;
				/* FALLTHRU */
			case MCAST_LEAVE_GROUP:
				fmode = MODE_IS_INCLUDE;
				optfn = ip_opt_delete_group_v6;
				break;
			}

			if (mcast_opt) {
				struct sockaddr_in *sin;
				struct sockaddr_in6 *sin6;
				greqp = (struct group_req *)i1;
				/* AF_INET groups are handled as v4-mapped */
				if (greqp->gr_group.ss_family == AF_INET) {
					sin = (struct sockaddr_in *)
					    &(greqp->gr_group);
					IN6_INADDR_TO_V4MAPPED(&sin->sin_addr,
					    &groupv6);
				} else {
					sin6 = (struct sockaddr_in6 *)
					    &(greqp->gr_group);
					groupv6 = sin6->sin6_addr;
				}
				ifindex = greqp->gr_interface;
			} else {
				ip_mreqp = (struct ipv6_mreq *)i1;
				groupv6 = ip_mreqp->ipv6mr_multiaddr;
				ifindex = ip_mreqp->ipv6mr_interface;
			}
			/*
			 * In the multirouting case, we need to replicate
			 * the request on all interfaces that will take part
			 * in replication. We do so because multirouting is
			 * reflective, thus we will probably receive multi-
			 * casts on those interfaces.
			 * The ip_multirt_apply_membership_v6() succeeds if
			 * the operation succeeds on at least one interface.
			 */
			ire = ire_ftable_lookup_v6(&groupv6, &ipv6_all_ones, 0,
			    IRE_HOST, NULL, NULL, ALL_ZONES, 0, NULL,
			    MATCH_IRE_MASK | MATCH_IRE_TYPE);
			if (ire != NULL) {
				if (ire->ire_flags & RTF_MULTIRT) {
					error = ip_multirt_apply_membership_v6(
					    optfn, ire, connp, checkonly,
					    &groupv6, fmode, &ipv6_all_zeros,
					    first_mp);
					done = B_TRUE;
				}
				ire_refrele(ire);
			}
			if (!done) {
				error = optfn(connp, checkonly, &groupv6,
				    ifindex, fmode, &ipv6_all_zeros, first_mp);
			}
			if (error) {
				/*
				 * EINPROGRESS is a soft error, needs retry
				 * so don't make *outlenp zero.
				 */
				if (error != EINPROGRESS)
					*outlenp = 0;
				return (error);
			}
			/* OK return - copy input buffer into output buffer */
			if (invalp != outvalp) {
				/* don't trust bcopy for identical src/dst */
				bcopy(invalp, outvalp, inlen);
			}
			*outlenp = inlen;
			return (0);
		}
		case MCAST_BLOCK_SOURCE:
		case MCAST_UNBLOCK_SOURCE:
		case MCAST_JOIN_SOURCE_GROUP:
		case MCAST_LEAVE_SOURCE_GROUP: {
			struct group_source_req *gsreqp;
			in6_addr_t v6grp, v6src;
			uint32_t ifindex;
			mcast_record_t fmode;
			ire_t *ire;
			boolean_t done = B_FALSE;
			int (*optfn)(conn_t *, boolean_t, const in6_addr_t *,
			    int, mcast_record_t, const in6_addr_t *, mblk_t *);

			switch (name) {
			case MCAST_BLOCK_SOURCE:
				fmode = MODE_IS_EXCLUDE;
				optfn = ip_opt_add_group_v6;
				break;
			case MCAST_UNBLOCK_SOURCE:
				fmode = MODE_IS_EXCLUDE;
				optfn = ip_opt_delete_group_v6;
				break;
			case MCAST_JOIN_SOURCE_GROUP:
				fmode = MODE_IS_INCLUDE;
				optfn = ip_opt_add_group_v6;
				break;
			case MCAST_LEAVE_SOURCE_GROUP:
				fmode = MODE_IS_INCLUDE;
				optfn = ip_opt_delete_group_v6;
				break;
			}

			gsreqp = (struct group_source_req *)i1;
			ifindex = gsreqp->gsr_interface;
			if (gsreqp->gsr_group.ss_family == AF_INET) {
				/* AF_INET group and source: use v4-mapped */
				struct sockaddr_in *s;
				s = (struct sockaddr_in *)&gsreqp->gsr_group;
				IN6_INADDR_TO_V4MAPPED(&s->sin_addr, &v6grp);
				s = (struct sockaddr_in *)&gsreqp->gsr_source;
				IN6_INADDR_TO_V4MAPPED(&s->sin_addr, &v6src);
			} else {
				struct sockaddr_in6 *s6;
				s6 = (struct sockaddr_in6 *)&gsreqp->gsr_group;
				v6grp = s6->sin6_addr;
				s6 = (struct sockaddr_in6 *)&gsreqp->gsr_source;
				v6src = s6->sin6_addr;
			}

			/*
			 * In the multirouting case, we need to replicate
			 * the request as noted in the mcast cases above.
			 */
			ire = ire_ftable_lookup_v6(&v6grp, &ipv6_all_ones, 0,
			    IRE_HOST, NULL, NULL, ALL_ZONES, 0, NULL,
			    MATCH_IRE_MASK | MATCH_IRE_TYPE);
			if (ire != NULL) {
				if (ire->ire_flags & RTF_MULTIRT) {
					error = ip_multirt_apply_membership_v6(
					    optfn, ire, connp, checkonly,
					    &v6grp, fmode, &v6src, first_mp);
					done = B_TRUE;
				}
				ire_refrele(ire);
			}
			if (!done) {
				error = optfn(connp, checkonly, &v6grp,
				    ifindex, fmode, &v6src, first_mp);
			}
			if (error != 0) {
				/*
				 * EINPROGRESS is a soft error, needs retry
				 * so don't make *outlenp zero.
				 */
				if (error != EINPROGRESS)
					*outlenp = 0;
				return (error);
			}
			/* OK return - copy input buffer into output buffer */
			if (invalp != outvalp) {
				bcopy(invalp, outvalp, inlen);
			}
			*outlenp = inlen;
			return (0);
		}
		case IPV6_UNICAST_HOPS:
			/* Recorded in transport above IP */
			break;	/* goto sizeof (int) option return */
		case IPV6_UNSPEC_SRC:
			/* Allow sending with a zero source address */
			if (!checkonly) {
				mutex_enter(&connp->conn_lock);
				connp->conn_unspec_src = *i1 ? 1 : 0;
				mutex_exit(&connp->conn_lock);
			}
			break;	/* goto sizeof (int) option return */
		case IPV6_RECVPKTINFO:
			if (!checkonly) {
				mutex_enter(&connp->conn_lock);
				connp->conn_ipv6_recvpktinfo = *i1 ? 1 : 0;
				mutex_exit(&connp->conn_lock);
			}
			break;	/* goto sizeof (int) option return */
		case IPV6_RECVTCLASS:
			if (!checkonly) {
				if (*i1 < 0 || *i1 > 1) {
					return (EINVAL);
				}
				mutex_enter(&connp->conn_lock);
				connp->conn_ipv6_recvtclass = *i1;
				mutex_exit(&connp->conn_lock);
			}
			break;
		case IPV6_RECVPATHMTU:
			if (!checkonly) {
				if (*i1 < 0 || *i1 > 1) {
					return (EINVAL);
				}
				mutex_enter(&connp->conn_lock);
				connp->conn_ipv6_recvpathmtu = *i1;
				mutex_exit(&connp->conn_lock);
			}
			break;
		case IPV6_RECVHOPLIMIT:
			if (!checkonly) {
				mutex_enter(&connp->conn_lock);
				connp->conn_ipv6_recvhoplimit = *i1 ? 1 : 0;
				mutex_exit(&connp->conn_lock);
			}
			break;	/* goto sizeof (int) option return */
		case IPV6_RECVHOPOPTS:
			if (!checkonly) {
				mutex_enter(&connp->conn_lock);
				connp->conn_ipv6_recvhopopts = *i1 ? 1 : 0;
				mutex_exit(&connp->conn_lock);
			}
			break;	/* goto sizeof (int) option return */
		case IPV6_RECVDSTOPTS:
			if (!checkonly) {
				mutex_enter(&connp->conn_lock);
				connp->conn_ipv6_recvdstopts = *i1 ? 1 : 0;
				mutex_exit(&connp->conn_lock);
			}
			break;	/* goto sizeof (int) option return */
		case IPV6_RECVRTHDR:
			if (!checkonly) {
				mutex_enter(&connp->conn_lock);
				connp->conn_ipv6_recvrthdr = *i1 ? 1 : 0;
				mutex_exit(&connp->conn_lock);
			}
			break;	/* goto sizeof (int) option return */
		case IPV6_RECVRTHDRDSTOPTS:
			if (!checkonly) {
				mutex_enter(&connp->conn_lock);
				connp->conn_ipv6_recvrtdstopts = *i1 ? 1 : 0;
				mutex_exit(&connp->conn_lock);
			}
			break;	/* goto sizeof (int) option return */
		case IPV6_PKTINFO:
			if (inlen == 0)
				return (-EINVAL);	/* clearing option */
			error = ip6_set_pktinfo(cr, connp,
			    (struct in6_pktinfo *)invalp, first_mp);
			if (error != 0)
				*outlenp = 0;
			else
				*outlenp = inlen;
			return (error);
		case IPV6_NEXTHOP: {
			struct sockaddr_in6 *sin6;

			/* Verify that the nexthop is reachable */
			if (inlen == 0)
				return (-EINVAL);	/* clearing option */

			sin6 = (struct sockaddr_in6 *)invalp;
			ire = ire_route_lookup_v6(&sin6->sin6_addr,
			    0, 0, 0, NULL, NULL, connp->conn_zoneid,
			    NULL, MATCH_IRE_DEFAULT);

			if (ire == NULL) {
				*outlenp = 0;
				return (EHOSTUNREACH);
			}
			ire_refrele(ire);
			/*
			 * Nexthop verified; soft error (-EINVAL) passes the
			 * option up to be recorded above this level.
			 */
			return (-EINVAL);
		}
		case IPV6_SEC_OPT:
			error = ipsec_set_req(cr, connp, (ipsec_req_t *)invalp);
			if (error != 0) {
				*outlenp = 0;
				return (error);
			}
			break;
		case IPV6_SRC_PREFERENCES: {
			/*
			 * This is implemented strictly in the ip module
			 * (here and in tcp_opt_*() to accommodate tcp
			 * sockets). Modules above ip pass this option
			 * down here since ip is the only one that needs to
			 * be aware of source address preferences.
			 *
			 * This socket option only affects connected
			 * sockets that haven't already bound to a specific
			 * IPv6 address. In other words, sockets that
			 * don't call bind() with an address other than the
			 * unspecified address and that call connect().
			 * ip_bind_connected_v6() passes these preferences
			 * to the ipif_select_source_v6() function.
			 */
			if (inlen != sizeof (uint32_t))
				return (EINVAL);
			error = ip6_set_src_preferences(connp,
			    *(uint32_t *)invalp);
			if (error != 0) {
				*outlenp = 0;
				return (error);
			} else {
				*outlenp = sizeof (uint32_t);
			}
			break;
		}
		case IPV6_V6ONLY:
			if (*i1 < 0 || *i1 > 1) {
				return (EINVAL);
			}
			mutex_enter(&connp->conn_lock);
			connp->conn_ipv6_v6only = *i1;
			mutex_exit(&connp->conn_lock);
			break;
		default:
			return (-EINVAL);
		}
		break;
	default:
		/*
		 * "soft" error (negative)
		 * option not handled at this level
		 * Note: Do not modify *outlenp
		 */
		return (-EINVAL);
	}
	/*
	 * Common case of return from an option that is sizeof (int)
	 */
	*(int *)outvalp = *i1;
	*outlenp = sizeof (int);
	return (0);
}

/*
 * This routine gets default values of certain options whose default
 * values are maintained by protocol specific code
 * Returns the number of bytes written to ptr, or -1 if the option has
 * no default maintained here.
 */
/* ARGSUSED */
int
ip_opt_default(queue_t *q, int level, int name, uchar_t *ptr)
{
	int *i1 = (int *)ptr;

	switch (level) {
	case IPPROTO_IP:
		switch (name) {
		case IP_MULTICAST_TTL:
			*ptr = (uchar_t)IP_DEFAULT_MULTICAST_TTL;
			return (sizeof (uchar_t));
		case IP_MULTICAST_LOOP:
			*ptr = (uchar_t)IP_DEFAULT_MULTICAST_LOOP;
			return (sizeof (uchar_t));
		default:
			return (-1);
		}
	case IPPROTO_IPV6:
		switch (name) {
		case IPV6_UNICAST_HOPS:
			*i1 = ipv6_def_hops;
			return (sizeof (int));
		case IPV6_MULTICAST_HOPS:
			*i1 = IP_DEFAULT_MULTICAST_TTL;
			return (sizeof (int));
		case IPV6_MULTICAST_LOOP:
			*i1 = IP_DEFAULT_MULTICAST_LOOP;
			return (sizeof (int));
		case IPV6_V6ONLY:
			*i1 = 1;
			return (sizeof (int));
		default:
			return (-1);
		}
	default:
		return (-1);
	}
	/* NOTREACHED */
}

/*
 * Given a destination address and a pointer to where to put the information
 * this routine fills in the mtuinfo.
 * Returns -1 for the unspecified address; otherwise the size of the
 * filled-in ip6_mtuinfo.  The MTU comes from the cached IRE for the
 * destination, or IPV6_MIN_MTU when no cache entry exists.
 */
int
ip_fill_mtuinfo(struct in6_addr *in6, in_port_t port,
    struct ip6_mtuinfo *mtuinfo)
{
	ire_t *ire;

	if (IN6_IS_ADDR_UNSPECIFIED(in6))
		return (-1);

	bzero(mtuinfo, sizeof (*mtuinfo));
	mtuinfo->ip6m_addr.sin6_family = AF_INET6;
	mtuinfo->ip6m_addr.sin6_port = port;
	mtuinfo->ip6m_addr.sin6_addr = *in6;

	ire = ire_cache_lookup_v6(in6, ALL_ZONES, NULL);
	if (ire != NULL) {
		mtuinfo->ip6m_mtu = ire->ire_max_frag;
		ire_refrele(ire);
	} else {
		mtuinfo->ip6m_mtu = IPV6_MIN_MTU;
	}
	return (sizeof (struct ip6_mtuinfo));
}

/*
 * This routine gets socket options. For MRT_VERSION and MRT_ASSERT, error
 * checking of GET_QUEUE_CRED(q) and that ip_g_mrouter is set should be done and
 * isn't. This doesn't matter as the error checking is done properly for the
 * other MRT options coming in through ip_opt_set.
 */
int
ip_opt_get(queue_t *q, int level, int name, uchar_t *ptr)
{
	conn_t		*connp = Q_TO_CONN(q);
	ipsec_req_t	*req = (ipsec_req_t *)ptr;

	switch (level) {
	case IPPROTO_IP:
		switch (name) {
		case MRT_VERSION:
		case MRT_ASSERT:
			(void) ip_mrouter_get(name, q, ptr);
			return (sizeof (int));
		case IP_SEC_OPT:
			return (ipsec_req_from_conn(connp, req, IPSEC_AF_V4));
		case IP_NEXTHOP:
			/* Returns 0 bytes when no nexthop has been set */
			if (connp->conn_nexthop_set) {
				*(ipaddr_t *)ptr = connp->conn_nexthop_v4;
				return (sizeof (ipaddr_t));
			} else
				return (0);
		default:
			break;
		}
		break;
	case IPPROTO_IPV6:
		switch (name) {
		case IPV6_SEC_OPT:
			return (ipsec_req_from_conn(connp, req, IPSEC_AF_V6));
		case IPV6_SRC_PREFERENCES: {
			return (ip6_get_src_preferences(connp,
			    (uint32_t *)ptr));
		}
		case IPV6_V6ONLY:
			*(int *)ptr = connp->conn_ipv6_v6only ? 1 : 0;
			return (sizeof (int));
		case IPV6_PATHMTU:
			return (ip_fill_mtuinfo(&connp->conn_remv6, 0,
			    (struct ip6_mtuinfo *)ptr));
		default:
			break;
		}
		break;
	default:
		break;
	}
	/* Option not handled at this level. */
	return (-1);
}

/* Named Dispatch routine to get a current value out of our parameter table. */
/* ARGSUSED */
static int
ip_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *ioc_cr)
{
	ipparam_t *ippa = (ipparam_t *)cp;

	/* cp points at the ipparam_t entry registered via nd_load() */
	(void) mi_mpprintf(mp, "%d", ippa->ip_param_value);
	return (0);
}

/*
 * Named Dispatch "get" routine for parameters whose registered data is a
 * bare int rather than an ipparam_t.
 */
/* ARGSUSED */
static int
ip_param_generic_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *ioc_cr)
{

	(void) mi_mpprintf(mp, "%d", *(int *)cp);
	return (0);
}

/*
 * Set ip{,6}_forwarding values. This means walking through all of the
 * ill's and toggling their forwarding values.
10801 */ 10802 /* ARGSUSED */ 10803 static int 10804 ip_forward_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *ioc_cr) 10805 { 10806 long new_value; 10807 int *forwarding_value = (int *)cp; 10808 ill_t *walker; 10809 boolean_t isv6 = (forwarding_value == &ipv6_forward); 10810 ill_walk_context_t ctx; 10811 10812 if (ddi_strtol(value, NULL, 10, &new_value) != 0 || 10813 new_value < 0 || new_value > 1) { 10814 return (EINVAL); 10815 } 10816 10817 *forwarding_value = new_value; 10818 10819 /* 10820 * Regardless of the current value of ip_forwarding, set all per-ill 10821 * values of ip_forwarding to the value being set. 10822 * 10823 * Bring all the ill's up to date with the new global value. 10824 */ 10825 rw_enter(&ill_g_lock, RW_READER); 10826 10827 if (isv6) 10828 walker = ILL_START_WALK_V6(&ctx); 10829 else 10830 walker = ILL_START_WALK_V4(&ctx); 10831 for (; walker != NULL; walker = ill_next(&ctx, walker)) { 10832 (void) ill_forward_set(q, mp, (new_value != 0), 10833 (caddr_t)walker); 10834 } 10835 rw_exit(&ill_g_lock); 10836 10837 return (0); 10838 } 10839 10840 /* 10841 * Walk through the param array specified registering each element with the 10842 * Named Dispatch handler. This is called only during init. 
So it is ok 10843 * not to acquire any locks 10844 */ 10845 static boolean_t 10846 ip_param_register(ipparam_t *ippa, size_t ippa_cnt, 10847 ipndp_t *ipnd, size_t ipnd_cnt) 10848 { 10849 for (; ippa_cnt-- > 0; ippa++) { 10850 if (ippa->ip_param_name && ippa->ip_param_name[0]) { 10851 if (!nd_load(&ip_g_nd, ippa->ip_param_name, 10852 ip_param_get, ip_param_set, (caddr_t)ippa)) { 10853 nd_free(&ip_g_nd); 10854 return (B_FALSE); 10855 } 10856 } 10857 } 10858 10859 for (; ipnd_cnt-- > 0; ipnd++) { 10860 if (ipnd->ip_ndp_name && ipnd->ip_ndp_name[0]) { 10861 if (!nd_load(&ip_g_nd, ipnd->ip_ndp_name, 10862 ipnd->ip_ndp_getf, ipnd->ip_ndp_setf, 10863 ipnd->ip_ndp_data)) { 10864 nd_free(&ip_g_nd); 10865 return (B_FALSE); 10866 } 10867 } 10868 } 10869 10870 return (B_TRUE); 10871 } 10872 10873 /* Named Dispatch routine to negotiate a new value for one of our parameters. */ 10874 /* ARGSUSED */ 10875 static int 10876 ip_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *ioc_cr) 10877 { 10878 long new_value; 10879 ipparam_t *ippa = (ipparam_t *)cp; 10880 10881 if (ddi_strtol(value, NULL, 10, &new_value) != 0 || 10882 new_value < ippa->ip_param_min || new_value > ippa->ip_param_max) { 10883 return (EINVAL); 10884 } 10885 ippa->ip_param_value = new_value; 10886 return (0); 10887 } 10888 10889 /* 10890 * Handles both IPv4 and IPv6 reassembly - doing the out-of-order cases, 10891 * When an ipf is passed here for the first time, if 10892 * we already have in-order fragments on the queue, we convert from the fast- 10893 * path reassembly scheme to the hard-case scheme. From then on, additional 10894 * fragments are reassembled here. We keep track of the start and end offsets 10895 * of each piece, and the number of holes in the chain. When the hole count 10896 * goes to zero, we are done! 
 *
 * The ipf_count will be updated to account for any mblk(s) added (pointed to
 * by mp) or subtracted (freeb()ed dups), upon return the caller must update
 * ipfb_count and ill_frag_count by the difference of ipf_count before and
 * after the call to ip_reassemble().
 *
 * Returns IP_REASS_COMPLETE once the hole count reaches zero,
 * IP_REASS_PARTIAL while holes remain, or IP_REASS_FAILED when a fragment
 * contradicts a previously seen last fragment (conflicting datagram size,
 * or data beyond the recorded end).
 */
int
ip_reassemble(mblk_t *mp, ipf_t *ipf, uint_t start, boolean_t more, ill_t *ill,
    size_t msg_len)
{
	uint_t	end;		/* one past the last byte of the piece */
	mblk_t	*next_mp;
	mblk_t	*mp1;
	uint_t	offset;
	boolean_t incr_dups = B_TRUE;
	boolean_t offset_zero_seen = B_FALSE;
	boolean_t pkt_boundary_checked = B_FALSE;

	/* If start == 0 then ipf_nf_hdr_len has to be set. */
	ASSERT(start != 0 || ipf->ipf_nf_hdr_len != 0);

	/* Add in byte count */
	ipf->ipf_count += msg_len;
	if (ipf->ipf_end) {
		/*
		 * We were part way through in-order reassembly, but now there
		 * is a hole.  We walk through messages already queued, and
		 * mark them for hard case reassembly.  We know that up till
		 * now they were in order starting from offset zero.
		 */
		offset = 0;
		for (mp1 = ipf->ipf_mp->b_cont; mp1; mp1 = mp1->b_cont) {
			IP_REASS_SET_START(mp1, offset);
			if (offset == 0) {
				/*
				 * The first mblk still carries the
				 * unfragmentable header; back the offset up
				 * so data offsets come out right.
				 */
				ASSERT(ipf->ipf_nf_hdr_len != 0);
				offset = -ipf->ipf_nf_hdr_len;
			}
			offset += mp1->b_wptr - mp1->b_rptr;
			IP_REASS_SET_END(mp1, offset);
		}
		/* One hole at the end. */
		ipf->ipf_hole_cnt = 1;
		/* Brand it as a hard case, forever. */
		ipf->ipf_end = 0;
	}
	/*
	 * Walk through all the new pieces.  Note the comma expression in
	 * the loop condition below: each iteration advances to the
	 * fragment's next mblk and carries the running offset forward in
	 * 'start'; the loop terminates when next_mp is NULL.
	 */
	do {
		end = start + (mp->b_wptr - mp->b_rptr);
		/*
		 * If start is 0, decrease 'end' only for the first mblk of
		 * the fragment. Otherwise 'end' can get wrong value in the
		 * second pass of the loop if first mblk is exactly the
		 * size of ipf_nf_hdr_len.
		 */
		if (start == 0 && !offset_zero_seen) {
			/* First segment */
			ASSERT(ipf->ipf_nf_hdr_len != 0);
			end -= ipf->ipf_nf_hdr_len;
			offset_zero_seen = B_TRUE;
		}
		next_mp = mp->b_cont;
		/*
		 * We are checking to see if there is any interesting data
		 * to process.  If there isn't and the mblk isn't the
		 * one which carries the unfragmentable header then we
		 * drop it.  It's possible to have just the unfragmentable
		 * header come through without any data.  That needs to be
		 * saved.
		 *
		 * If the assert at the top of this function holds then the
		 * term "ipf->ipf_nf_hdr_len != 0" isn't needed.  This code
		 * is infrequently traveled enough that the test is left in
		 * to protect against future code changes which break that
		 * invariant.
		 */
		if (start == end && start != 0 && ipf->ipf_nf_hdr_len != 0) {
			/* Empty.  Blast it. */
			IP_REASS_SET_START(mp, 0);
			IP_REASS_SET_END(mp, 0);
			/*
			 * If the ipf points to the mblk we are about to free,
			 * update ipf to point to the next mblk (or NULL
			 * if none).
			 */
			if (ipf->ipf_mp->b_cont == mp)
				ipf->ipf_mp->b_cont = next_mp;
			freeb(mp);
			continue;
		}
		mp->b_cont = NULL;
		IP_REASS_SET_START(mp, start);
		IP_REASS_SET_END(mp, end);
		if (!ipf->ipf_tail_mp) {
			/* First piece on the hard-case queue. */
			ipf->ipf_tail_mp = mp;
			ipf->ipf_mp->b_cont = mp;
			if (start == 0 || !more) {
				ipf->ipf_hole_cnt = 1;
				/*
				 * if the first fragment comes in more than one
				 * mblk, this loop will be executed for each
				 * mblk. Need to adjust hole count so exiting
				 * this routine will leave hole count at 1.
				 */
				if (next_mp)
					ipf->ipf_hole_cnt++;
			} else
				ipf->ipf_hole_cnt = 2;
			continue;
		} else if (ipf->ipf_last_frag_seen && !more &&
		    !pkt_boundary_checked) {
			/*
			 * We check datagram boundary only if this fragment
			 * claims to be the last fragment and we have seen a
			 * last fragment in the past too. We do this only
			 * once for a given fragment.
			 *
			 * start cannot be 0 here as fragments with start=0
			 * and MF=0 gets handled as a complete packet. These
			 * fragments should not reach here.
			 */

			if (start + msgdsize(mp) !=
			    IP_REASS_END(ipf->ipf_tail_mp)) {
				/*
				 * We have two fragments both of which claim
				 * to be the last fragment but gives conflicting
				 * information about the whole datagram size.
				 * Something fishy is going on. Drop the
				 * fragment and free up the reassembly list.
				 */
				return (IP_REASS_FAILED);
			}

			/*
			 * We shouldn't come to this code block again for this
			 * particular fragment.
			 */
			pkt_boundary_checked = B_TRUE;
		}

		/* New stuff at or beyond tail? */
		offset = IP_REASS_END(ipf->ipf_tail_mp);
		if (start >= offset) {
			if (ipf->ipf_last_frag_seen) {
				/* current fragment is beyond last fragment */
				return (IP_REASS_FAILED);
			}
			/* Link it on end. */
			ipf->ipf_tail_mp->b_cont = mp;
			ipf->ipf_tail_mp = mp;
			if (more) {
				/* A gap before this piece opens a new hole. */
				if (start != offset)
					ipf->ipf_hole_cnt++;
			} else if (start == offset && next_mp == NULL)
				ipf->ipf_hole_cnt--;
			continue;
		}
		mp1 = ipf->ipf_mp->b_cont;
		offset = IP_REASS_START(mp1);
		/* New stuff at the front? */
		if (start < offset) {
			if (start == 0) {
				if (end >= offset) {
					/* Nailed the hole at the beginning. */
					ipf->ipf_hole_cnt--;
				}
			} else if (end < offset) {
				/*
				 * A hole, stuff, and a hole where there used
				 * to be just a hole.
				 */
				ipf->ipf_hole_cnt++;
			}
			mp->b_cont = mp1;
			/* Check for overlap. */
			while (end > offset) {
				if (end < IP_REASS_END(mp1)) {
					/* Trim the overlap off our tail. */
					mp->b_wptr -= end - offset;
					IP_REASS_SET_END(mp, offset);
					if (ill->ill_isv6) {
						BUMP_MIB(ill->ill_ip6_mib,
						    ipv6ReasmPartDups);
					} else {
						BUMP_MIB(&ip_mib,
						    ipReasmPartDups);
					}
					break;
				}
				/* Did we cover another hole? */
				if ((mp1->b_cont &&
				    IP_REASS_END(mp1) !=
				    IP_REASS_START(mp1->b_cont) &&
				    end >= IP_REASS_START(mp1->b_cont)) ||
				    (!ipf->ipf_last_frag_seen && !more)) {
					ipf->ipf_hole_cnt--;
				}
				/* Clip out mp1. */
				if ((mp->b_cont = mp1->b_cont) == NULL) {
					/*
					 * After clipping out mp1, this guy
					 * is now hanging off the end.
					 */
					ipf->ipf_tail_mp = mp;
				}
				IP_REASS_SET_START(mp1, 0);
				IP_REASS_SET_END(mp1, 0);
				/* Subtract byte count */
				ipf->ipf_count -= mp1->b_datap->db_lim -
				    mp1->b_datap->db_base;
				freeb(mp1);
				if (ill->ill_isv6) {
					BUMP_MIB(ill->ill_ip6_mib,
					    ipv6ReasmPartDups);
				} else {
					BUMP_MIB(&ip_mib, ipReasmPartDups);
				}
				mp1 = mp->b_cont;
				if (!mp1)
					break;
				offset = IP_REASS_START(mp1);
			}
			ipf->ipf_mp->b_cont = mp;
			continue;
		}
		/*
		 * The new piece starts somewhere between the start of the head
		 * and before the end of the tail.
		 */
		for (; mp1; mp1 = mp1->b_cont) {
			offset = IP_REASS_END(mp1);
			if (start < offset) {
				if (end <= offset) {
					/* Nothing new. */
					IP_REASS_SET_START(mp, 0);
					IP_REASS_SET_END(mp, 0);
					/* Subtract byte count */
					ipf->ipf_count -= mp->b_datap->db_lim -
					    mp->b_datap->db_base;
					if (incr_dups) {
						ipf->ipf_num_dups++;
						incr_dups = B_FALSE;
					}
					freeb(mp);
					if (ill->ill_isv6) {
						BUMP_MIB(ill->ill_ip6_mib,
						    ipv6ReasmDuplicates);
					} else {
						BUMP_MIB(&ip_mib,
						    ipReasmDuplicates);
					}
					break;
				}
				/*
				 * Trim redundant stuff off beginning of new
				 * piece.
				 */
				IP_REASS_SET_START(mp, offset);
				mp->b_rptr += offset - start;
				if (ill->ill_isv6) {
					BUMP_MIB(ill->ill_ip6_mib,
					    ipv6ReasmPartDups);
				} else {
					BUMP_MIB(&ip_mib, ipReasmPartDups);
				}
				start = offset;
				if (!mp1->b_cont) {
					/*
					 * After trimming, this guy is now
					 * hanging off the end.
					 */
					mp1->b_cont = mp;
					ipf->ipf_tail_mp = mp;
					if (!more) {
						ipf->ipf_hole_cnt--;
					}
					break;
				}
			}
			if (start >= IP_REASS_START(mp1->b_cont))
				continue;
			/* Fill a hole */
			if (start > offset)
				ipf->ipf_hole_cnt++;
			mp->b_cont = mp1->b_cont;
			mp1->b_cont = mp;
			mp1 = mp->b_cont;
			offset = IP_REASS_START(mp1);
			if (end >= offset) {
				ipf->ipf_hole_cnt--;
				/* Check for overlap. */
				while (end > offset) {
					if (end < IP_REASS_END(mp1)) {
						mp->b_wptr -= end - offset;
						IP_REASS_SET_END(mp, offset);
						/*
						 * TODO we might bump
						 * this up twice if there is
						 * overlap at both ends.
						 */
						if (ill->ill_isv6) {
							BUMP_MIB(
							    ill->ill_ip6_mib,
							    ipv6ReasmPartDups);
						} else {
							BUMP_MIB(&ip_mib,
							    ipReasmPartDups);
						}
						break;
					}
					/* Did we cover another hole? */
					if ((mp1->b_cont &&
					    IP_REASS_END(mp1)
					    != IP_REASS_START(mp1->b_cont) &&
					    end >=
					    IP_REASS_START(mp1->b_cont)) ||
					    (!ipf->ipf_last_frag_seen &&
					    !more)) {
						ipf->ipf_hole_cnt--;
					}
					/* Clip out mp1. */
					if ((mp->b_cont = mp1->b_cont) ==
					    NULL) {
						/*
						 * After clipping out mp1,
						 * this guy is now hanging
						 * off the end.
						 */
						ipf->ipf_tail_mp = mp;
					}
					IP_REASS_SET_START(mp1, 0);
					IP_REASS_SET_END(mp1, 0);
					/* Subtract byte count */
					ipf->ipf_count -=
					    mp1->b_datap->db_lim -
					    mp1->b_datap->db_base;
					freeb(mp1);
					if (ill->ill_isv6) {
						BUMP_MIB(ill->ill_ip6_mib,
						    ipv6ReasmPartDups);
					} else {
						BUMP_MIB(&ip_mib,
						    ipReasmPartDups);
					}
					mp1 = mp->b_cont;
					if (!mp1)
						break;
					offset = IP_REASS_START(mp1);
				}
			}
			break;
		}
	} while (start = end, mp = next_mp);

	/* Fragment just processed could be the last one.  Remember this fact */
	if (!more)
		ipf->ipf_last_frag_seen = B_TRUE;

	/* Still got holes? */
	if (ipf->ipf_hole_cnt)
		return (IP_REASS_PARTIAL);
	/* Clean up overloaded fields to avoid upstream disasters.
*/ 11258 for (mp1 = ipf->ipf_mp->b_cont; mp1; mp1 = mp1->b_cont) { 11259 IP_REASS_SET_START(mp1, 0); 11260 IP_REASS_SET_END(mp1, 0); 11261 } 11262 return (IP_REASS_COMPLETE); 11263 } 11264 11265 /* 11266 * ipsec processing for the fast path, used for input UDP Packets 11267 */ 11268 static boolean_t 11269 ip_udp_check(queue_t *q, conn_t *connp, ill_t *ill, ipha_t *ipha, 11270 mblk_t **mpp, mblk_t **first_mpp, boolean_t mctl_present) 11271 { 11272 uint32_t ill_index; 11273 uint_t in_flags; /* IPF_RECVSLLA and/or IPF_RECVIF */ 11274 11275 ASSERT(ipha->ipha_protocol == IPPROTO_UDP); 11276 /* The ill_index of the incoming ILL */ 11277 ill_index = ((ill_t *)q->q_ptr)->ill_phyint->phyint_ifindex; 11278 11279 /* pass packet up to the transport */ 11280 if (CONN_INBOUND_POLICY_PRESENT(connp) || mctl_present) { 11281 *first_mpp = ipsec_check_inbound_policy(*first_mpp, connp, ipha, 11282 NULL, mctl_present); 11283 if (*first_mpp == NULL) { 11284 return (B_FALSE); 11285 } 11286 } 11287 11288 /* Initiate IPPF processing for fastpath UDP */ 11289 if (IPP_ENABLED(IPP_LOCAL_IN)) { 11290 ip_process(IPP_LOCAL_IN, mpp, ill_index); 11291 if (*mpp == NULL) { 11292 ip2dbg(("ip_input_ipsec_process: UDP pkt " 11293 "deferred/dropped during IPPF processing\n")); 11294 return (B_FALSE); 11295 } 11296 } 11297 /* 11298 * We make the checks as below since we are in the fast path 11299 * and want to minimize the number of checks if the IP_RECVIF and/or 11300 * IP_RECVSLLA and/or IPV6_RECVPKTINFO options are not set 11301 */ 11302 if (connp->conn_recvif || connp->conn_recvslla || 11303 connp->conn_ipv6_recvpktinfo) { 11304 if (connp->conn_recvif || 11305 connp->conn_ipv6_recvpktinfo) { 11306 in_flags = IPF_RECVIF; 11307 } 11308 if (connp->conn_recvslla) { 11309 in_flags |= IPF_RECVSLLA; 11310 } 11311 /* 11312 * since in_flags are being set ill will be 11313 * referenced in ip_add_info, so it better not 11314 * be NULL. 
11315 */ 11316 /* 11317 * the actual data will be contained in b_cont 11318 * upon successful return of the following call. 11319 * If the call fails then the original mblk is 11320 * returned. 11321 */ 11322 *mpp = ip_add_info(*mpp, ill, in_flags); 11323 } 11324 11325 return (B_TRUE); 11326 } 11327 11328 /* 11329 * Fragmentation reassembly. Each ILL has a hash table for 11330 * queuing packets undergoing reassembly for all IPIFs 11331 * associated with the ILL. The hash is based on the packet 11332 * IP ident field. The ILL frag hash table was allocated 11333 * as a timer block at the time the ILL was created. Whenever 11334 * there is anything on the reassembly queue, the timer will 11335 * be running. Returns B_TRUE if successful else B_FALSE; 11336 * frees mp on failure. 11337 */ 11338 static boolean_t 11339 ip_rput_fragment(queue_t *q, mblk_t **mpp, ipha_t *ipha, 11340 uint32_t *cksum_val, uint16_t *cksum_flags) 11341 { 11342 uint32_t frag_offset_flags; 11343 ill_t *ill = (ill_t *)q->q_ptr; 11344 mblk_t *mp = *mpp; 11345 mblk_t *t_mp; 11346 ipaddr_t dst; 11347 uint8_t proto = ipha->ipha_protocol; 11348 uint32_t sum_val; 11349 uint16_t sum_flags; 11350 ipf_t *ipf; 11351 ipf_t **ipfp; 11352 ipfb_t *ipfb; 11353 uint16_t ident; 11354 uint32_t offset; 11355 ipaddr_t src; 11356 uint_t hdr_length; 11357 uint32_t end; 11358 mblk_t *mp1; 11359 mblk_t *tail_mp; 11360 size_t count; 11361 size_t msg_len; 11362 uint8_t ecn_info = 0; 11363 uint32_t packet_size; 11364 boolean_t pruned = B_FALSE; 11365 11366 if (cksum_val != NULL) 11367 *cksum_val = 0; 11368 if (cksum_flags != NULL) 11369 *cksum_flags = 0; 11370 11371 /* 11372 * Drop the fragmented as early as possible, if 11373 * we don't have resource(s) to re-assemble. 
11374 */ 11375 if (ip_reass_queue_bytes == 0) { 11376 freemsg(mp); 11377 return (B_FALSE); 11378 } 11379 11380 /* Check for fragmentation offset; return if there's none */ 11381 if ((frag_offset_flags = ntohs(ipha->ipha_fragment_offset_and_flags) & 11382 (IPH_MF | IPH_OFFSET)) == 0) 11383 return (B_TRUE); 11384 11385 /* 11386 * We utilize hardware computed checksum info only for UDP since 11387 * IP fragmentation is a normal occurence for the protocol. In 11388 * addition, checksum offload support for IP fragments carrying 11389 * UDP payload is commonly implemented across network adapters. 11390 */ 11391 ASSERT(ill != NULL); 11392 if (proto == IPPROTO_UDP && dohwcksum && ILL_HCKSUM_CAPABLE(ill) && 11393 (DB_CKSUMFLAGS(mp) & (HCK_FULLCKSUM | HCK_PARTIALCKSUM))) { 11394 mblk_t *mp1 = mp->b_cont; 11395 int32_t len; 11396 11397 /* Record checksum information from the packet */ 11398 sum_val = (uint32_t)DB_CKSUM16(mp); 11399 sum_flags = DB_CKSUMFLAGS(mp); 11400 11401 /* IP payload offset from beginning of mblk */ 11402 offset = ((uchar_t *)ipha + IPH_HDR_LENGTH(ipha)) - mp->b_rptr; 11403 11404 if ((sum_flags & HCK_PARTIALCKSUM) && 11405 (mp1 == NULL || mp1->b_cont == NULL) && 11406 offset >= DB_CKSUMSTART(mp) && 11407 ((len = offset - DB_CKSUMSTART(mp)) & 1) == 0) { 11408 uint32_t adj; 11409 /* 11410 * Partial checksum has been calculated by hardware 11411 * and attached to the packet; in addition, any 11412 * prepended extraneous data is even byte aligned. 11413 * If any such data exists, we adjust the checksum; 11414 * this would also handle any postpended data. 
			 */
			IP_ADJCKSUM_PARTIAL(mp->b_rptr + DB_CKSUMSTART(mp),
			    mp, mp1, len, adj);

			/* One's complement subtract extraneous checksum */
			if (adj >= sum_val)
				sum_val = ~(adj - sum_val) & 0xFFFF;
			else
				sum_val -= adj;
		}
	} else {
		/* No usable hardware checksum for this fragment. */
		sum_val = 0;
		sum_flags = 0;
	}

	/* Clear hardware checksumming flag */
	DB_CKSUMFLAGS(mp) = 0;

	ident = ipha->ipha_ident;
	/* Fragment offset field is in 8-byte units; convert to bytes. */
	offset = (frag_offset_flags << 3) & 0xFFFF;
	src = ipha->ipha_src;
	dst = ipha->ipha_dst;
	hdr_length = IPH_HDR_LENGTH(ipha);
	end = ntohs(ipha->ipha_length) - hdr_length;

	/* If end == 0 then we have a packet with no data, so just free it */
	if (end == 0) {
		freemsg(mp);
		return (B_FALSE);
	}

	/* Record the ECN field info. */
	ecn_info = (ipha->ipha_type_of_service & 0x3);
	if (offset != 0) {
		/*
		 * If this isn't the first piece, strip the header, and
		 * add the offset to the end value.
		 */
		mp->b_rptr += hdr_length;
		end += offset;
	}

	/* Total allocated size of this fragment's mblk chain. */
	msg_len = MBLKSIZE(mp);
	tail_mp = mp;
	while (tail_mp->b_cont != NULL) {
		tail_mp = tail_mp->b_cont;
		msg_len += MBLKSIZE(tail_mp);
	}

	/* If the reassembly list for this ILL will get too big, prune it */
	if ((msg_len + sizeof (*ipf) + ill->ill_frag_count) >=
	    ip_reass_queue_bytes) {
		ill_frag_prune(ill,
		    (ip_reass_queue_bytes < msg_len) ? 0 :
		    (ip_reass_queue_bytes - msg_len));
		pruned = B_TRUE;
	}

	ipfb = &ill->ill_frag_hash_tbl[ILL_FRAG_HASH(src, ident)];
	mutex_enter(&ipfb->ipfb_lock);

	ipfp = &ipfb->ipfb_ipf;
	/* Try to find an existing fragment queue for this packet. */
	for (;;) {
		ipf = ipfp[0];
		if (ipf != NULL) {
			/*
			 * It has to match on ident and src/dst address.
			 */
			if (ipf->ipf_ident == ident &&
			    ipf->ipf_src == src &&
			    ipf->ipf_dst == dst &&
			    ipf->ipf_protocol == proto) {
				/*
				 * If we have received too many
				 * duplicate fragments for this packet
				 * free it.
				 */
				if (ipf->ipf_num_dups > ip_max_frag_dups) {
					ill_frag_free_pkts(ill, ipfb, ipf, 1);
					freemsg(mp);
					mutex_exit(&ipfb->ipfb_lock);
					return (B_FALSE);
				}
				/* Found it. */
				break;
			}
			ipfp = &ipf->ipf_hash_next;
			continue;
		}

		/*
		 * If we pruned the list, do we want to store this new
		 * fragment?. We apply an optimization here based on the
		 * fact that most fragments will be received in order.
		 * So if the offset of this incoming fragment is zero,
		 * it is the first fragment of a new packet. We will
		 * keep it.  Otherwise drop the fragment, as we have
		 * probably pruned the packet already (since the
		 * packet cannot be found).
		 */
		if (pruned && offset != 0) {
			mutex_exit(&ipfb->ipfb_lock);
			freemsg(mp);
			return (B_FALSE);
		}

		if (ipfb->ipfb_frag_pkts >= MAX_FRAG_PKTS)  {
			/*
			 * Too many fragmented packets in this hash
			 * bucket. Free the oldest.
			 */
			ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf, 1);
		}

		/* New guy.  Allocate a frag message. */
		mp1 = allocb(sizeof (*ipf), BPRI_MED);
		if (mp1 == NULL) {
			BUMP_MIB(&ip_mib, ipInDiscards);
			freemsg(mp);
		/*
		 * reass_done is also the common exit for the "packet
		 * stored, reassembly not yet complete" paths below; it
		 * releases the bucket lock and tells the caller not to
		 * continue processing this message.
		 */
reass_done:
			mutex_exit(&ipfb->ipfb_lock);
			return (B_FALSE);
		}


		BUMP_MIB(&ip_mib, ipReasmReqds);
		mp1->b_cont = mp;

		/* Initialize the fragment header. */
		ipf = (ipf_t *)mp1->b_rptr;
		ipf->ipf_mp = mp1;
		ipf->ipf_ptphn = ipfp;
		ipfp[0] = ipf;
		ipf->ipf_hash_next = NULL;
		ipf->ipf_ident = ident;
		ipf->ipf_protocol = proto;
		ipf->ipf_src = src;
		ipf->ipf_dst = dst;
		ipf->ipf_nf_hdr_len = 0;
		/* Record reassembly start time. */
		ipf->ipf_timestamp = gethrestime_sec();
		/* Record ipf generation and account for frag header */
		ipf->ipf_gen = ill->ill_ipf_gen++;
		ipf->ipf_count = MBLKSIZE(mp1);
		ipf->ipf_last_frag_seen = B_FALSE;
		ipf->ipf_ecn = ecn_info;
		ipf->ipf_num_dups = 0;
		ipfb->ipfb_frag_pkts++;
		ipf->ipf_checksum = 0;
		ipf->ipf_checksum_flags = 0;

		/* Store checksum value in fragment header */
		if (sum_flags != 0) {
			/* Fold the 32-bit sum into 16 bits (ones complement) */
			sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
			sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
			ipf->ipf_checksum = sum_val;
			ipf->ipf_checksum_flags = sum_flags;
		}

		/*
		 * We handle reassembly two ways.  In the easy case,
		 * where all the fragments show up in order, we do
		 * minimal bookkeeping, and just clip new pieces on
		 * the end.  If we ever see a hole, then we go off
		 * to ip_reassemble which has to mark the pieces and
		 * keep track of the number of holes, etc.  Obviously,
		 * the point of having both mechanisms is so we can
		 * handle the easy case as efficiently as possible.
		 */
		if (offset == 0) {
			/* Easy case, in-order reassembly so far. */
			ipf->ipf_count += msg_len;
			ipf->ipf_tail_mp = tail_mp;
			/*
			 * Keep track of next expected offset in
			 * ipf_end.
			 */
			ipf->ipf_end = end;
			ipf->ipf_nf_hdr_len = hdr_length;
		} else {
			/* Hard case, hole at the beginning. */
			ipf->ipf_tail_mp = NULL;
			/*
			 * ipf_end == 0 means that we have given up
			 * on easy reassembly.
			 */
			ipf->ipf_end = 0;

			/* Forget checksum offload from now on */
			ipf->ipf_checksum_flags = 0;

			/*
			 * ipf_hole_cnt is set by ip_reassemble.
			 * ipf_count is updated by ip_reassemble.
			 * No need to check for return value here
			 * as we don't expect reassembly to complete
			 * or fail for the first fragment itself.
			 */
			(void) ip_reassemble(mp, ipf,
			    (frag_offset_flags & IPH_OFFSET) << 3,
			    (frag_offset_flags & IPH_MF), ill, msg_len);
		}
		/* Update per ipfb and ill byte counts */
		ipfb->ipfb_count += ipf->ipf_count;
		ASSERT(ipfb->ipfb_count > 0);	/* Wraparound */
		ill->ill_frag_count += ipf->ipf_count;
		ASSERT(ill->ill_frag_count > 0);	/* Wraparound */
		/* If the frag timer wasn't already going, start it. */
		mutex_enter(&ill->ill_lock);
		ill_frag_timer_start(ill);
		mutex_exit(&ill->ill_lock);
		goto reass_done;
	}

	/*
	 * If the packet's flag has changed (it could be coming up
	 * from an interface different than the previous, therefore
	 * possibly different checksum capability), then forget about
	 * any stored checksum states.  Otherwise add the value to
	 * the existing one stored in the fragment header.
	 */
	if (sum_flags != 0 && sum_flags == ipf->ipf_checksum_flags) {
		sum_val += ipf->ipf_checksum;
		sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
		sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
		ipf->ipf_checksum = sum_val;
	} else if (ipf->ipf_checksum_flags != 0) {
		/* Forget checksum offload from now on */
		ipf->ipf_checksum_flags = 0;
	}

	/*
	 * We have a new piece of a datagram which is already being
	 * reassembled.  Update the ECN info if all IP fragments
	 * are ECN capable.  If there is one which is not, clear
	 * all the info.  If there is at least one which has CE
	 * code point, IP needs to report that up to transport.
	 */
	if (ecn_info != IPH_ECN_NECT && ipf->ipf_ecn != IPH_ECN_NECT) {
		if (ecn_info == IPH_ECN_CE)
			ipf->ipf_ecn = IPH_ECN_CE;
	} else {
		ipf->ipf_ecn = IPH_ECN_NECT;
	}
	if (offset && ipf->ipf_end == offset) {
		/* The new fragment fits at the end */
		ipf->ipf_tail_mp->b_cont = mp;
		/* Update the byte count */
		ipf->ipf_count += msg_len;
		/* Update per ipfb and ill byte counts */
		ipfb->ipfb_count += msg_len;
		ASSERT(ipfb->ipfb_count > 0);	/* Wraparound */
		ill->ill_frag_count += msg_len;
		ASSERT(ill->ill_frag_count > 0);	/* Wraparound */
		if (frag_offset_flags & IPH_MF) {
			/* More to come. */
			ipf->ipf_end = end;
			ipf->ipf_tail_mp = tail_mp;
			goto reass_done;
		}
	} else {
		/* Go do the hard cases. */
		int ret;

		if (offset == 0)
			ipf->ipf_nf_hdr_len = hdr_length;

		/* Save current byte count */
		count = ipf->ipf_count;
		ret = ip_reassemble(mp, ipf,
		    (frag_offset_flags & IPH_OFFSET) << 3,
		    (frag_offset_flags & IPH_MF), ill, msg_len);
		/* Count of bytes added and subtracted (freeb()ed) */
		count = ipf->ipf_count - count;
		if (count) {
			/* Update per ipfb and ill byte counts */
			ipfb->ipfb_count += count;
			ASSERT(ipfb->ipfb_count > 0);	/* Wraparound */
			ill->ill_frag_count += count;
			ASSERT(ill->ill_frag_count > 0);
		}
		if (ret == IP_REASS_PARTIAL) {
			goto reass_done;
		} else if (ret == IP_REASS_FAILED) {
			/* Reassembly failed.  Free up all resources */
			ill_frag_free_pkts(ill, ipfb, ipf, 1);
			for (t_mp = mp; t_mp != NULL; t_mp = t_mp->b_cont) {
				IP_REASS_SET_START(t_mp, 0);
				IP_REASS_SET_END(t_mp, 0);
			}
			freemsg(mp);
			goto reass_done;
		}
		/* We will reach here iff 'ret' is IP_REASS_COMPLETE */
	}
	/*
	 * We have completed reassembly.  Unhook the frag header from
	 * the reassembly list.
	 *
	 * Before we free the frag header, record the ECN info
	 * to report back to the transport.
	 */
	ecn_info = ipf->ipf_ecn;
	BUMP_MIB(&ip_mib, ipReasmOKs);
	ipfp = ipf->ipf_ptphn;

	/* We need to supply these to caller */
	if ((sum_flags = ipf->ipf_checksum_flags) != 0)
		sum_val = ipf->ipf_checksum;
	else
		sum_val = 0;

	mp1 = ipf->ipf_mp;
	count = ipf->ipf_count;
	ipf = ipf->ipf_hash_next;
	if (ipf != NULL)
		ipf->ipf_ptphn = ipfp;
	ipfp[0] = ipf;
	ill->ill_frag_count -= count;
	ASSERT(ipfb->ipfb_count >= count);
	ipfb->ipfb_count -= count;
	ipfb->ipfb_frag_pkts--;
	mutex_exit(&ipfb->ipfb_lock);
	/* Ditch the frag header. */
	mp = mp1->b_cont;

	freeb(mp1);

	/* Restore original IP length in header. */
	packet_size = (uint32_t)msgdsize(mp);
	if (packet_size > IP_MAXPACKET) {
		freemsg(mp);
		BUMP_MIB(&ip_mib, ipInHdrErrors);
		return (B_FALSE);
	}

	if (DB_REF(mp) > 1) {
		/* Copy before writing the header below, since it's shared. */
		mblk_t *mp2 = copymsg(mp);

		freemsg(mp);
		if (mp2 == NULL) {
			BUMP_MIB(&ip_mib, ipInDiscards);
			return (B_FALSE);
		}
		mp = mp2;
	}
	ipha = (ipha_t *)mp->b_rptr;

	ipha->ipha_length = htons((uint16_t)packet_size);
	/* We're now complete, zip the frag state */
	ipha->ipha_fragment_offset_and_flags = 0;
	/* Record the ECN info.
 */
	ipha->ipha_type_of_service &= 0xFC;
	ipha->ipha_type_of_service |= ecn_info;
	*mpp = mp;

	/* Reassembly is successful; return checksum information if needed */
	if (cksum_val != NULL)
		*cksum_val = sum_val;
	if (cksum_flags != NULL)
		*cksum_flags = sum_flags;

	return (B_TRUE);
}

/*
 * Perform IP header checksum and process local options.
 * Return B_TRUE if all is well, else return B_FALSE and release
 * the mp.  Caller is responsible for decrementing the ire ref cnt.
 */
static boolean_t
ip_options_cksum(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire)
{
	mblk_t		*first_mp;
	boolean_t	mctl_present;
	uint16_t	sum;

	/* Peel off a leading M_CTL (IPsec info), if any. */
	EXTRACT_PKT_MP(mp, first_mp, mctl_present);
	/*
	 * Don't do the checksum if it has gone through AH/ESP
	 * processing.
	 */
	if (!mctl_present) {
		sum = ip_csum_hdr(ipha);
		if (sum != 0) {
			BUMP_MIB(&ip_mib, ipInCksumErrs);
			freemsg(first_mp);
			return (B_FALSE);
		}
	}

	if (!ip_rput_local_options(q, mp, ipha, ire)) {
		/*
		 * NOTE(review): only the leading M_CTL is freed here —
		 * presumably ip_rput_local_options disposes of mp itself
		 * on failure; confirm against its implementation.
		 */
		if (mctl_present)
			freeb(first_mp);
		return (B_FALSE);
	}

	return (B_TRUE);
}

/*
 * All UDP packets are delivered to the local host via this routine.
 */
void
ip_udp_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire,
    ill_t *recv_ill)
{
	uint32_t	sum;
	uint32_t	u1;		/* scratch: options words, hdr len, frag flags */
	boolean_t	mctl_present;
	conn_t		*connp;
	mblk_t		*first_mp;
	uint16_t	*up;		/* points at the UDP header as uint16_t[] */
	ill_t		*ill = (ill_t *)q->q_ptr;
	uint16_t	reass_hck_flags = 0;

/* Local shorthand for the start of the IP header; #undef'ed at end. */
#define	rptr    ((uchar_t *)ipha)

	EXTRACT_PKT_MP(mp, first_mp, mctl_present);
	ASSERT(!mctl_present || ipsec_in_is_secure(first_mp));
	ASSERT(ipha->ipha_protocol == IPPROTO_UDP);

	/*
	 * FAST PATH for udp packets
	 */

	/* u1 is # words of IP options */
	u1 = ipha->ipha_version_and_hdr_length - (uchar_t)((IP_VERSION << 4) +
	    IP_SIMPLE_HDR_LENGTH_IN_WORDS);

	/* IP options present */
	if (u1 != 0)
		goto ipoptions;

	/* Check the IP header checksum. */
	if (IS_IP_HDR_HWCKSUM(mctl_present, mp, ill)) {
		/* Clear the IP header h/w cksum flag */
		DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM;
	} else {
/* Treat the simple (20-byte) IP header as ten 16-bit words. */
#define	uph	((uint16_t *)ipha)
		sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] + uph[5] +
		    uph[6] + uph[7] + uph[8] + uph[9];
#undef	uph
		/* finish doing IP checksum */
		sum = (sum & 0xFFFF) + (sum >> 16);
		sum = ~(sum + (sum >> 16)) & 0xFFFF;
		/*
		 * Don't verify header checksum if this packet is coming
		 * back from AH/ESP as we already did it.
		 */
		if (!mctl_present && sum != 0 && sum != 0xFFFF) {
			BUMP_MIB(&ip_mib, ipInCksumErrs);
			freemsg(first_mp);
			return;
		}
	}

	/*
	 * Count for SNMP of inbound packets for ire.
	 * if mctl is present this might be a secure packet and
	 * has already been counted for in ip_proto_input().
	 */
	if (!mctl_present) {
		UPDATE_IB_PKT_COUNT(ire);
		ire->ire_last_used_time = lbolt;
	}

	/* packet part of fragmented IP packet? */
	u1 = ntohs(ipha->ipha_fragment_offset_and_flags);
	if (u1 & (IPH_MF | IPH_OFFSET)) {
		goto fragmented;
	}

	/* u1 = IP header length (20 bytes) */
	u1 = IP_SIMPLE_HDR_LENGTH;

	/* packet does not contain complete IP & UDP headers */
	if ((mp->b_wptr - rptr) < (IP_SIMPLE_HDR_LENGTH + UDPH_SIZE))
		goto udppullup;

	/* up points to UDP header */
	up = (uint16_t *)((uchar_t *)ipha + IP_SIMPLE_HDR_LENGTH);
/* Shorthand: the IP header as 16-bit words (for pseudo-header sums). */
#define	iphs    ((uint16_t *)ipha)

	/* if udp hdr cksum != 0, then need to checksum udp packet */
	if (up[3] != 0) {
		mblk_t *mp1 = mp->b_cont;
		boolean_t cksum_err;
		uint16_t hck_flags = 0;

		/* Pseudo-header checksum */
		u1 = IP_UDP_CSUM_COMP + iphs[6] + iphs[7] + iphs[8] +
		    iphs[9] + up[2];

		/*
		 * Revert to software checksum calculation if the interface
		 * isn't capable of checksum offload or if IPsec is present.
		 */
		if (ILL_HCKSUM_CAPABLE(ill) && !mctl_present && dohwcksum)
			hck_flags = DB_CKSUMFLAGS(mp);

		if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0)
			IP_STAT(ip_in_sw_cksum);

		IP_CKSUM_RECV(hck_flags, u1,
		    (uchar_t *)(rptr + DB_CKSUMSTART(mp)),
		    (int32_t)((uchar_t *)up - rptr),
		    mp, mp1, cksum_err);

		if (cksum_err) {
			BUMP_MIB(&ip_mib, udpInCksumErrs);

			if (hck_flags & HCK_FULLCKSUM)
				IP_STAT(ip_udp_in_full_hw_cksum_err);
			else if (hck_flags & HCK_PARTIALCKSUM)
				IP_STAT(ip_udp_in_part_hw_cksum_err);
			else
				IP_STAT(ip_udp_in_sw_cksum_err);

			freemsg(first_mp);
			return;
		}
	}

	/* Non-fragmented broadcast or multicast packet? */
	if (ire->ire_type == IRE_BROADCAST)
		goto udpslowpath;

	if ((connp = ipcl_classify_v4(mp, IPPROTO_UDP, IP_SIMPLE_HDR_LENGTH,
	    ire->ire_zoneid)) != NULL) {
		ASSERT(connp->conn_upq != NULL);
		IP_STAT(ip_udp_fast_path);

		if (CONN_UDP_FLOWCTLD(connp)) {
			/* Receiver is flow-controlled; drop and count. */
			freemsg(mp);
			BUMP_MIB(&ip_mib, udpInOverflows);
		} else {
			if (!mctl_present) {
				BUMP_MIB(&ip_mib, ipInDelivers);
			}
			/*
			 * mp and first_mp can change.
			 */
			if (ip_udp_check(q, connp, recv_ill,
			    ipha, &mp, &first_mp, mctl_present)) {
				/* Send it upstream */
				CONN_UDP_RECV(connp, mp);
			}
		}
		/*
		 * freeb() cannot deal with null mblk being passed
		 * in and first_mp can be set to null in the call
		 * ipsec_input_fast_proc()->ipsec_check_inbound_policy.
		 */
		if (mctl_present && first_mp != NULL) {
			freeb(first_mp);
		}
		CONN_DEC_REF(connp);
		return;
	}

	/*
	 * if we got here we know the packet is not fragmented and
	 * has no options. The classifier could not find a conn_t and
	 * most likely its an icmp packet so send it through slow path.
	 */

	goto udpslowpath;

ipoptions:
	if (!ip_options_cksum(q, mp, ipha, ire)) {
		goto slow_done;
	}

	UPDATE_IB_PKT_COUNT(ire);
	ire->ire_last_used_time = lbolt;
	u1 = ntohs(ipha->ipha_fragment_offset_and_flags);
	if (u1 & (IPH_MF | IPH_OFFSET)) {
fragmented:
		/*
		 * "sum" and "reass_hck_flags" are non-zero if the
		 * reassembled packet has a valid hardware computed
		 * checksum information associated with it.
		 */
		if (!ip_rput_fragment(q, &mp, ipha, &sum, &reass_hck_flags))
			goto slow_done;
		/*
		 * Make sure that first_mp points back to mp as
		 * the mp we came in with could have changed in
		 * ip_rput_fragment().
		 */
		ASSERT(!mctl_present);
		ipha = (ipha_t *)mp->b_rptr;
		first_mp = mp;
	}

	/* Now we have a complete datagram, destined for this machine. */
	u1 = IPH_HDR_LENGTH(ipha);
	/* Pull up the UDP header, if necessary. */
	if ((MBLKL(mp)) < (u1 + UDPH_SIZE)) {
udppullup:
		if (!pullupmsg(mp, u1 + UDPH_SIZE)) {
			BUMP_MIB(&ip_mib, ipInDiscards);
			freemsg(first_mp);
			goto slow_done;
		}
		ipha = (ipha_t *)mp->b_rptr;
	}

	/*
	 * Validate the checksum for the reassembled packet; for the
	 * pullup case we calculate the payload checksum in software.
	 */
	up = (uint16_t *)((uchar_t *)ipha + u1 + UDP_PORTS_OFFSET);
	if (up[3] != 0) {
		boolean_t cksum_err;

		if ((reass_hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0)
			IP_STAT(ip_in_sw_cksum);

		IP_CKSUM_RECV_REASS(reass_hck_flags,
		    (int32_t)((uchar_t *)up - (uchar_t *)ipha),
		    IP_UDP_CSUM_COMP + iphs[6] + iphs[7] + iphs[8] +
		    iphs[9] + up[2], sum, cksum_err);

		if (cksum_err) {
			BUMP_MIB(&ip_mib, udpInCksumErrs);

			if (reass_hck_flags & HCK_FULLCKSUM)
				IP_STAT(ip_udp_in_full_hw_cksum_err);
			else if (reass_hck_flags & HCK_PARTIALCKSUM)
				IP_STAT(ip_udp_in_part_hw_cksum_err);
			else
				IP_STAT(ip_udp_in_sw_cksum_err);

			freemsg(first_mp);
			goto slow_done;
		}
	}
udpslowpath:

	/* Clear hardware checksum flag to be safe */
	DB_CKSUMFLAGS(mp) = 0;

	ip_fanout_udp(q, first_mp, ill, ipha, *(uint32_t *)up,
	    (ire->ire_type == IRE_BROADCAST),
	    IP_FF_SEND_ICMP | IP_FF_CKSUM | IP_FF_IP6INFO,
	    mctl_present, B_TRUE, recv_ill, ire->ire_zoneid);

slow_done:
	IP_STAT(ip_udp_slow_path);
	return;

#undef  iphs
#undef  rptr
}

/* ARGSUSED */
static mblk_t *
ip_tcp_input(mblk_t *mp, ipha_t *ipha, ill_t
*recv_ill, boolean_t mctl_present, 12076 ire_t *ire, mblk_t *first_mp, uint_t flags, queue_t *q, 12077 ill_rx_ring_t *ill_ring) 12078 { 12079 conn_t *connp; 12080 uint32_t sum; 12081 uint32_t u1; 12082 uint16_t *up; 12083 int offset; 12084 ssize_t len; 12085 mblk_t *mp1; 12086 boolean_t syn_present = B_FALSE; 12087 tcph_t *tcph; 12088 uint_t ip_hdr_len; 12089 ill_t *ill = (ill_t *)q->q_ptr; 12090 zoneid_t zoneid = ire->ire_zoneid; 12091 boolean_t cksum_err; 12092 uint16_t hck_flags = 0; 12093 12094 #define rptr ((uchar_t *)ipha) 12095 12096 ASSERT(ipha->ipha_protocol == IPPROTO_TCP); 12097 12098 /* 12099 * FAST PATH for tcp packets 12100 */ 12101 12102 /* u1 is # words of IP options */ 12103 u1 = ipha->ipha_version_and_hdr_length - (uchar_t)((IP_VERSION << 4) 12104 + IP_SIMPLE_HDR_LENGTH_IN_WORDS); 12105 12106 /* IP options present */ 12107 if (u1) { 12108 goto ipoptions; 12109 } else { 12110 /* Check the IP header checksum. */ 12111 if (IS_IP_HDR_HWCKSUM(mctl_present, mp, ill)) { 12112 /* Clear the IP header h/w cksum flag */ 12113 DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM; 12114 } else { 12115 #define uph ((uint16_t *)ipha) 12116 sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] + 12117 uph[5] + uph[6] + uph[7] + uph[8] + uph[9]; 12118 #undef uph 12119 /* finish doing IP checksum */ 12120 sum = (sum & 0xFFFF) + (sum >> 16); 12121 sum = ~(sum + (sum >> 16)) & 0xFFFF; 12122 /* 12123 * Don't verify header checksum if this packet 12124 * is coming back from AH/ESP as we already did it. 12125 */ 12126 if (!mctl_present && (sum != 0) && sum != 0xFFFF) { 12127 BUMP_MIB(&ip_mib, ipInCksumErrs); 12128 goto error; 12129 } 12130 } 12131 } 12132 12133 if (!mctl_present) { 12134 UPDATE_IB_PKT_COUNT(ire); 12135 ire->ire_last_used_time = lbolt; 12136 } 12137 12138 /* packet part of fragmented IP packet? 
*/ 12139 u1 = ntohs(ipha->ipha_fragment_offset_and_flags); 12140 if (u1 & (IPH_MF | IPH_OFFSET)) { 12141 goto fragmented; 12142 } 12143 12144 /* u1 = IP header length (20 bytes) */ 12145 u1 = ip_hdr_len = IP_SIMPLE_HDR_LENGTH; 12146 12147 /* does packet contain IP+TCP headers? */ 12148 len = mp->b_wptr - rptr; 12149 if (len < (IP_SIMPLE_HDR_LENGTH + TCP_MIN_HEADER_LENGTH)) { 12150 IP_STAT(ip_tcppullup); 12151 goto tcppullup; 12152 } 12153 12154 /* TCP options present? */ 12155 offset = ((uchar_t *)ipha)[IP_SIMPLE_HDR_LENGTH + 12] >> 4; 12156 12157 /* 12158 * If options need to be pulled up, then goto tcpoptions. 12159 * otherwise we are still in the fast path 12160 */ 12161 if (len < (offset << 2) + IP_SIMPLE_HDR_LENGTH) { 12162 IP_STAT(ip_tcpoptions); 12163 goto tcpoptions; 12164 } 12165 12166 /* multiple mblks of tcp data? */ 12167 if ((mp1 = mp->b_cont) != NULL) { 12168 /* more then two? */ 12169 if (mp1->b_cont != NULL) { 12170 IP_STAT(ip_multipkttcp); 12171 goto multipkttcp; 12172 } 12173 len += mp1->b_wptr - mp1->b_rptr; 12174 } 12175 12176 up = (uint16_t *)(rptr + IP_SIMPLE_HDR_LENGTH + TCP_PORTS_OFFSET); 12177 12178 /* part of pseudo checksum */ 12179 12180 /* TCP datagram length */ 12181 u1 = len - IP_SIMPLE_HDR_LENGTH; 12182 12183 #define iphs ((uint16_t *)ipha) 12184 12185 #ifdef _BIG_ENDIAN 12186 u1 += IPPROTO_TCP; 12187 #else 12188 u1 = ((u1 >> 8) & 0xFF) + (((u1 & 0xFF) + IPPROTO_TCP) << 8); 12189 #endif 12190 u1 += iphs[6] + iphs[7] + iphs[8] + iphs[9]; 12191 12192 /* 12193 * Revert to software checksum calculation if the interface 12194 * isn't capable of checksum offload or if IPsec is present. 
12195 */ 12196 if (ILL_HCKSUM_CAPABLE(ill) && !mctl_present && dohwcksum) 12197 hck_flags = DB_CKSUMFLAGS(mp); 12198 12199 if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0) 12200 IP_STAT(ip_in_sw_cksum); 12201 12202 IP_CKSUM_RECV(hck_flags, u1, 12203 (uchar_t *)(rptr + DB_CKSUMSTART(mp)), 12204 (int32_t)((uchar_t *)up - rptr), 12205 mp, mp1, cksum_err); 12206 12207 if (cksum_err) { 12208 BUMP_MIB(&ip_mib, tcpInErrs); 12209 12210 if (hck_flags & HCK_FULLCKSUM) 12211 IP_STAT(ip_tcp_in_full_hw_cksum_err); 12212 else if (hck_flags & HCK_PARTIALCKSUM) 12213 IP_STAT(ip_tcp_in_part_hw_cksum_err); 12214 else 12215 IP_STAT(ip_tcp_in_sw_cksum_err); 12216 12217 goto error; 12218 } 12219 12220 try_again: 12221 12222 if ((connp = ipcl_classify_v4(mp, IPPROTO_TCP, ip_hdr_len, zoneid)) == 12223 NULL) { 12224 /* Send the TH_RST */ 12225 goto no_conn; 12226 } 12227 12228 /* 12229 * TCP FAST PATH for AF_INET socket. 12230 * 12231 * TCP fast path to avoid extra work. An AF_INET socket type 12232 * does not have facility to receive extra information via 12233 * ip_process or ip_add_info. Also, when the connection was 12234 * established, we made a check if this connection is impacted 12235 * by any global IPSec policy or per connection policy (a 12236 * policy that comes in effect later will not apply to this 12237 * connection). Since all this can be determined at the 12238 * connection establishment time, a quick check of flags 12239 * can avoid extra work. 
12240 */ 12241 if (IPCL_IS_TCP4_CONNECTED_NO_POLICY(connp) && !mctl_present && 12242 !IPP_ENABLED(IPP_LOCAL_IN)) { 12243 ASSERT(first_mp == mp); 12244 SET_SQUEUE(mp, tcp_rput_data, connp); 12245 return (mp); 12246 } 12247 12248 tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; 12249 if ((tcph->th_flags[0] & (TH_SYN|TH_ACK|TH_RST|TH_URG)) == TH_SYN) { 12250 if (IPCL_IS_TCP(connp)) { 12251 mp->b_datap->db_struioflag |= STRUIO_EAGER; 12252 DB_CKSUMSTART(mp) = 12253 (intptr_t)ip_squeue_get(ill_ring); 12254 if (IPCL_IS_FULLY_BOUND(connp) && !mctl_present && 12255 !CONN_INBOUND_POLICY_PRESENT(connp)) { 12256 SET_SQUEUE(mp, connp->conn_recv, connp); 12257 return (mp); 12258 } else if (IPCL_IS_BOUND(connp) && !mctl_present && 12259 !CONN_INBOUND_POLICY_PRESENT(connp)) { 12260 ip_squeue_enter_unbound++; 12261 SET_SQUEUE(mp, tcp_conn_request_unbound, 12262 connp); 12263 return (mp); 12264 } 12265 syn_present = B_TRUE; 12266 } 12267 12268 } 12269 12270 if (IPCL_IS_TCP(connp) && IPCL_IS_BOUND(connp) && !syn_present) { 12271 uint_t flags = (unsigned int)tcph->th_flags[0] & 0xFF; 12272 12273 /* No need to send this packet to TCP */ 12274 if ((flags & TH_RST) || (flags & TH_URG)) { 12275 CONN_DEC_REF(connp); 12276 freemsg(first_mp); 12277 return (NULL); 12278 } 12279 if (flags & TH_ACK) { 12280 tcp_xmit_listeners_reset(first_mp, ip_hdr_len); 12281 CONN_DEC_REF(connp); 12282 return (NULL); 12283 } 12284 12285 CONN_DEC_REF(connp); 12286 freemsg(first_mp); 12287 return (NULL); 12288 } 12289 12290 if (CONN_INBOUND_POLICY_PRESENT(connp) || mctl_present) { 12291 first_mp = ipsec_check_inbound_policy(first_mp, connp, 12292 ipha, NULL, mctl_present); 12293 if (first_mp == NULL) { 12294 CONN_DEC_REF(connp); 12295 return (NULL); 12296 } 12297 if (IPCL_IS_TCP(connp) && IPCL_IS_BOUND(connp)) { 12298 ASSERT(syn_present); 12299 if (mctl_present) { 12300 ASSERT(first_mp != mp); 12301 first_mp->b_datap->db_struioflag |= 12302 STRUIO_POLICY; 12303 } else { 12304 ASSERT(first_mp == mp); 12305 
mp->b_datap->db_struioflag &= ~STRUIO_EAGER; 12306 mp->b_datap->db_struioflag |= STRUIO_POLICY; 12307 } 12308 } else { 12309 /* 12310 * Discard first_mp early since we're dealing with a 12311 * fully-connected conn_t and tcp doesn't do policy in 12312 * this case. 12313 */ 12314 if (mctl_present) { 12315 freeb(first_mp); 12316 mctl_present = B_FALSE; 12317 } 12318 first_mp = mp; 12319 } 12320 } 12321 12322 /* Initiate IPPF processing for fastpath */ 12323 if (IPP_ENABLED(IPP_LOCAL_IN)) { 12324 uint32_t ill_index; 12325 12326 ill_index = recv_ill->ill_phyint->phyint_ifindex; 12327 ip_process(IPP_LOCAL_IN, &mp, ill_index); 12328 if (mp == NULL) { 12329 ip2dbg(("ip_input_ipsec_process: TCP pkt " 12330 "deferred/dropped during IPPF processing\n")); 12331 CONN_DEC_REF(connp); 12332 if (mctl_present) 12333 freeb(first_mp); 12334 return (NULL); 12335 } else if (mctl_present) { 12336 /* 12337 * ip_process might return a new mp. 12338 */ 12339 ASSERT(first_mp != mp); 12340 first_mp->b_cont = mp; 12341 } else { 12342 first_mp = mp; 12343 } 12344 12345 } 12346 12347 if (!syn_present && connp->conn_ipv6_recvpktinfo) { 12348 mp = ip_add_info(mp, recv_ill, flags); 12349 if (mp == NULL) { 12350 CONN_DEC_REF(connp); 12351 if (mctl_present) 12352 freeb(first_mp); 12353 return (NULL); 12354 } else if (mctl_present) { 12355 /* 12356 * ip_add_info might return a new mp. 12357 */ 12358 ASSERT(first_mp != mp); 12359 first_mp->b_cont = mp; 12360 } else { 12361 first_mp = mp; 12362 } 12363 } 12364 12365 if (IPCL_IS_TCP(connp)) { 12366 SET_SQUEUE(first_mp, connp->conn_recv, connp); 12367 return (first_mp); 12368 } else { 12369 putnext(connp->conn_rq, first_mp); 12370 CONN_DEC_REF(connp); 12371 return (NULL); 12372 } 12373 12374 no_conn: 12375 /* Initiate IPPf processing, if needed. 
*/ 12376 if (IPP_ENABLED(IPP_LOCAL_IN)) { 12377 uint32_t ill_index; 12378 ill_index = recv_ill->ill_phyint->phyint_ifindex; 12379 ip_process(IPP_LOCAL_IN, &first_mp, ill_index); 12380 if (first_mp == NULL) { 12381 return (NULL); 12382 } 12383 } 12384 BUMP_MIB(&ip_mib, ipInDelivers); 12385 tcp_xmit_listeners_reset(first_mp, IPH_HDR_LENGTH(mp->b_rptr)); 12386 return (NULL); 12387 ipoptions: 12388 if (!ip_options_cksum(q, first_mp, ipha, ire)) { 12389 goto slow_done; 12390 } 12391 12392 UPDATE_IB_PKT_COUNT(ire); 12393 ire->ire_last_used_time = lbolt; 12394 12395 u1 = ntohs(ipha->ipha_fragment_offset_and_flags); 12396 if (u1 & (IPH_MF | IPH_OFFSET)) { 12397 fragmented: 12398 if (!ip_rput_fragment(q, &mp, ipha, NULL, NULL)) { 12399 if (mctl_present) 12400 freeb(first_mp); 12401 goto slow_done; 12402 } 12403 /* 12404 * Make sure that first_mp points back to mp as 12405 * the mp we came in with could have changed in 12406 * ip_rput_fragment(). 12407 */ 12408 ASSERT(!mctl_present); 12409 ipha = (ipha_t *)mp->b_rptr; 12410 first_mp = mp; 12411 } 12412 12413 /* Now we have a complete datagram, destined for this machine. */ 12414 u1 = ip_hdr_len = IPH_HDR_LENGTH(ipha); 12415 12416 len = mp->b_wptr - mp->b_rptr; 12417 /* Pull up a minimal TCP header, if necessary. */ 12418 if (len < (u1 + 20)) { 12419 tcppullup: 12420 if (!pullupmsg(mp, u1 + 20)) { 12421 BUMP_MIB(&ip_mib, ipInDiscards); 12422 goto error; 12423 } 12424 ipha = (ipha_t *)mp->b_rptr; 12425 len = mp->b_wptr - mp->b_rptr; 12426 } 12427 12428 /* 12429 * Extract the offset field from the TCP header. As usual, we 12430 * try to help the compiler more than the reader. 12431 */ 12432 offset = ((uchar_t *)ipha)[u1 + 12] >> 4; 12433 if (offset != 5) { 12434 tcpoptions: 12435 if (offset < 5) { 12436 BUMP_MIB(&ip_mib, ipInDiscards); 12437 goto error; 12438 } 12439 /* 12440 * There must be TCP options. 12441 * Make sure we can grab them. 
12442 */ 12443 offset <<= 2; 12444 offset += u1; 12445 if (len < offset) { 12446 if (!pullupmsg(mp, offset)) { 12447 BUMP_MIB(&ip_mib, ipInDiscards); 12448 goto error; 12449 } 12450 ipha = (ipha_t *)mp->b_rptr; 12451 len = mp->b_wptr - rptr; 12452 } 12453 } 12454 12455 /* Get the total packet length in len, including headers. */ 12456 if (mp->b_cont) { 12457 multipkttcp: 12458 len = msgdsize(mp); 12459 } 12460 12461 /* 12462 * Check the TCP checksum by pulling together the pseudo- 12463 * header checksum, and passing it to ip_csum to be added in 12464 * with the TCP datagram. 12465 * 12466 * Since we are not using the hwcksum if available we must 12467 * clear the flag. We may come here via tcppullup or tcpoptions. 12468 * If either of these fails along the way the mblk is freed. 12469 * If this logic ever changes and mblk is reused to say send 12470 * ICMP's back, then this flag may need to be cleared in 12471 * other places as well. 12472 */ 12473 DB_CKSUMFLAGS(mp) = 0; 12474 12475 up = (uint16_t *)(rptr + u1 + TCP_PORTS_OFFSET); 12476 12477 u1 = (uint32_t)(len - u1); /* TCP datagram length. */ 12478 #ifdef _BIG_ENDIAN 12479 u1 += IPPROTO_TCP; 12480 #else 12481 u1 = ((u1 >> 8) & 0xFF) + (((u1 & 0xFF) + IPPROTO_TCP) << 8); 12482 #endif 12483 u1 += iphs[6] + iphs[7] + iphs[8] + iphs[9]; 12484 /* 12485 * Not M_DATA mblk or its a dup, so do the checksum now. 
/*
 * ip_sctp_input() - inbound path for a locally-destined SCTP packet.
 *
 * Verifies the IP header checksum (unless already done by h/w or by
 * AH/ESP), reassembles fragments, verifies the SCTP checksum, and fans
 * the packet out to the matching SCTP conn via sctp_fanout()/sctp_input().
 * If no association matches, the packet goes to ip_fanout_sctp_raw() for
 * raw-socket or out-of-the-blue handling.
 *
 * The caller's reference on ire is consumed on every path: either by the
 * explicit IRE_REFRELE before sctp_fanout(), or at slow_done.  dst is
 * unused (ARGSUSED).
 */
/* ARGSUSED */
static void
ip_sctp_input(mblk_t *mp, ipha_t *ipha, ill_t *recv_ill, boolean_t mctl_present,
    ire_t *ire, mblk_t *first_mp, uint_t flags, queue_t *q, ipaddr_t dst)
{
        conn_t          *connp;
        uint32_t        sum;
        uint32_t        u1;
        ssize_t         len;
        sctp_hdr_t      *sctph;
        zoneid_t        zoneid = ire->ire_zoneid;
        uint32_t        pktsum;
        uint32_t        calcsum;
        uint32_t        ports;
        uint_t          ipif_seqid;
        in6_addr_t      map_src, map_dst;
        ill_t           *ill = (ill_t *)q->q_ptr;

#define rptr    ((uchar_t *)ipha)

        ASSERT(ipha->ipha_protocol == IPPROTO_SCTP);

        /* u1 is # words of IP options */
        u1 = ipha->ipha_version_and_hdr_length - (uchar_t)((IP_VERSION << 4)
            + IP_SIMPLE_HDR_LENGTH_IN_WORDS);

        /* IP options present */
        if (u1 > 0) {
                goto ipoptions;
        } else {
                /* Check the IP header checksum. */
                if (IS_IP_HDR_HWCKSUM(mctl_present, mp, ill)) {
                        /*
                         * Since there is no SCTP h/w cksum support yet, just
                         * clear the flag.
                         */
                        DB_CKSUMFLAGS(mp) = 0;
                } else {
#define uph     ((uint16_t *)ipha)
                        sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] +
                            uph[5] + uph[6] + uph[7] + uph[8] + uph[9];
#undef  uph
                        /* finish doing IP checksum */
                        sum = (sum & 0xFFFF) + (sum >> 16);
                        sum = ~(sum + (sum >> 16)) & 0xFFFF;
                        /*
                         * Don't verify header checksum if this packet
                         * is coming back from AH/ESP as we already did it.
                         */
                        if (!mctl_present && (sum != 0) && sum != 0xFFFF) {
                                BUMP_MIB(&ip_mib, ipInCksumErrs);
                                goto error;
                        }
                }
        }

        /*
         * Don't verify header checksum if this packet is coming
         * back from AH/ESP as we already did it.
         */
        if (!mctl_present) {
                UPDATE_IB_PKT_COUNT(ire);
                ire->ire_last_used_time = lbolt;
        }

        /* packet part of fragmented IP packet? */
        u1 = ntohs(ipha->ipha_fragment_offset_and_flags);
        if (u1 & (IPH_MF | IPH_OFFSET))
                goto fragmented;

        /* u1 = IP header length (20 bytes) */
        u1 = IP_SIMPLE_HDR_LENGTH;

find_sctp_client:
        /* Pullup if we don't have the sctp common header. */
        len = MBLKL(mp);
        if (len < (u1 + SCTP_COMMON_HDR_LENGTH)) {
                if (mp->b_cont == NULL ||
                    !pullupmsg(mp, u1 + SCTP_COMMON_HDR_LENGTH)) {
                        BUMP_MIB(&ip_mib, ipInDiscards);
                        goto error;
                }
                /* pullupmsg may have reallocated the data block */
                ipha = (ipha_t *)mp->b_rptr;
                len = MBLKL(mp);
        }

        sctph = (sctp_hdr_t *)(rptr + u1);
#ifdef  DEBUG
        if (!skip_sctp_cksum) {
#endif
                /*
                 * Verify the SCTP checksum: zero the header field, compute
                 * over the whole datagram, then restore the original value.
                 */
                pktsum = sctph->sh_chksum;
                sctph->sh_chksum = 0;
                calcsum = sctp_cksum(mp, u1);
                if (calcsum != pktsum) {
                        BUMP_MIB(&sctp_mib, sctpChecksumError);
                        goto error;
                }
                sctph->sh_chksum = pktsum;
#ifdef  DEBUG   /* skip_sctp_cksum */
        }
#endif
        /* get the ports (source and destination, as one 32-bit load) */
        ports = *(uint32_t *)&sctph->sh_sport;

        /* Capture what we need from the ire, then drop our reference. */
        ipif_seqid = ire->ire_ipif->ipif_seqid;
        IRE_REFRELE(ire);
        IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &map_dst);
        IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &map_src);
        if ((connp = sctp_fanout(&map_src, &map_dst, ports, ipif_seqid, zoneid,
            mp)) == NULL) {
                /* Check for raw socket or OOTB handling */
                goto no_conn;
        }

        /* Found a client; up it goes */
        BUMP_MIB(&ip_mib, ipInDelivers);
        sctp_input(connp, ipha, mp, first_mp, recv_ill, B_TRUE, mctl_present);
        return;

no_conn:
        ip_fanout_sctp_raw(first_mp, recv_ill, ipha, B_TRUE,
            ports, mctl_present, flags, B_TRUE, ipif_seqid, zoneid);
        return;

ipoptions:
        DB_CKSUMFLAGS(mp) = 0;
        if (!ip_options_cksum(q, first_mp, ipha, ire))
                goto slow_done;

        UPDATE_IB_PKT_COUNT(ire);
        ire->ire_last_used_time = lbolt;

        u1 = ntohs(ipha->ipha_fragment_offset_and_flags);
        if (u1 & (IPH_MF | IPH_OFFSET)) {
fragmented:
                if (!ip_rput_fragment(q, &mp, ipha, NULL, NULL))
                        goto slow_done;
                /*
                 * Make sure that first_mp points back to mp as
                 * the mp we came in with could have changed in
                 * ip_rput_fragment().
                 */
                ASSERT(!mctl_present);
                ipha = (ipha_t *)mp->b_rptr;
                first_mp = mp;
        }

        /* Now we have a complete datagram, destined for this machine. */
        u1 = IPH_HDR_LENGTH(ipha);
        goto find_sctp_client;
#undef  iphs
#undef  rptr

error:
        freemsg(first_mp);
slow_done:
        IRE_REFRELE(ire);
}
recv_ill, B_TRUE, mctl_present); 12621 return; 12622 12623 no_conn: 12624 ip_fanout_sctp_raw(first_mp, recv_ill, ipha, B_TRUE, 12625 ports, mctl_present, flags, B_TRUE, ipif_seqid, zoneid); 12626 return; 12627 12628 ipoptions: 12629 DB_CKSUMFLAGS(mp) = 0; 12630 if (!ip_options_cksum(q, first_mp, ipha, ire)) 12631 goto slow_done; 12632 12633 UPDATE_IB_PKT_COUNT(ire); 12634 ire->ire_last_used_time = lbolt; 12635 12636 u1 = ntohs(ipha->ipha_fragment_offset_and_flags); 12637 if (u1 & (IPH_MF | IPH_OFFSET)) { 12638 fragmented: 12639 if (!ip_rput_fragment(q, &mp, ipha, NULL, NULL)) 12640 goto slow_done; 12641 /* 12642 * Make sure that first_mp points back to mp as 12643 * the mp we came in with could have changed in 12644 * ip_rput_fragment(). 12645 */ 12646 ASSERT(!mctl_present); 12647 ipha = (ipha_t *)mp->b_rptr; 12648 first_mp = mp; 12649 } 12650 12651 /* Now we have a complete datagram, destined for this machine. */ 12652 u1 = IPH_HDR_LENGTH(ipha); 12653 goto find_sctp_client; 12654 #undef iphs 12655 #undef rptr 12656 12657 error: 12658 freemsg(first_mp); 12659 slow_done: 12660 IRE_REFRELE(ire); 12661 } 12662 12663 #define VER_BITS 0xF0 12664 #define VERSION_6 0x60 12665 12666 static boolean_t 12667 ip_rput_multimblk_ipoptions(queue_t *q, mblk_t *mp, ipha_t **iphapp, 12668 ipaddr_t *dstp) 12669 { 12670 uint_t opt_len; 12671 ipha_t *ipha; 12672 ssize_t len; 12673 uint_t pkt_len; 12674 12675 IP_STAT(ip_ipoptions); 12676 ipha = *iphapp; 12677 12678 #define rptr ((uchar_t *)ipha) 12679 /* Assume no IPv6 packets arrive over the IPv4 queue */ 12680 if (IPH_HDR_VERSION(ipha) == IPV6_VERSION) { 12681 BUMP_MIB(&ip_mib, ipInIPv6); 12682 freemsg(mp); 12683 return (B_FALSE); 12684 } 12685 12686 /* multiple mblk or too short */ 12687 pkt_len = ntohs(ipha->ipha_length); 12688 12689 /* Get the number of words of IP options in the IP header. */ 12690 opt_len = ipha->ipha_version_and_hdr_length - IP_SIMPLE_HDR_VERSION; 12691 if (opt_len) { 12692 /* IP Options present! 
Validate and process. */ 12693 if (opt_len > (15 - IP_SIMPLE_HDR_LENGTH_IN_WORDS)) { 12694 BUMP_MIB(&ip_mib, ipInHdrErrors); 12695 goto done; 12696 } 12697 /* 12698 * Recompute complete header length and make sure we 12699 * have access to all of it. 12700 */ 12701 len = ((size_t)opt_len + IP_SIMPLE_HDR_LENGTH_IN_WORDS) << 2; 12702 if (len > (mp->b_wptr - rptr)) { 12703 if (len > pkt_len) { 12704 BUMP_MIB(&ip_mib, ipInHdrErrors); 12705 goto done; 12706 } 12707 if (!pullupmsg(mp, len)) { 12708 BUMP_MIB(&ip_mib, ipInDiscards); 12709 goto done; 12710 } 12711 ipha = (ipha_t *)mp->b_rptr; 12712 } 12713 /* 12714 * Go off to ip_rput_options which returns the next hop 12715 * destination address, which may have been affected 12716 * by source routing. 12717 */ 12718 IP_STAT(ip_opt); 12719 if (ip_rput_options(q, mp, ipha, dstp) == -1) { 12720 return (B_FALSE); 12721 } 12722 } 12723 *iphapp = ipha; 12724 return (B_TRUE); 12725 done: 12726 /* clear b_prev - used by ip_mroute_decap */ 12727 mp->b_prev = NULL; 12728 freemsg(mp); 12729 return (B_FALSE); 12730 #undef rptr 12731 } 12732 12733 /* 12734 * Deal with the fact that there is no ire for the destination. 12735 * The incoming ill (in_ill) is passed in to ip_newroute only 12736 * in the case of packets coming from mobile ip forward tunnel. 12737 * It must be null otherwise. 12738 */ 12739 static void 12740 ip_rput_noire(queue_t *q, ill_t *in_ill, mblk_t *mp, int ll_multicast, 12741 ipaddr_t dst) 12742 { 12743 ipha_t *ipha; 12744 ill_t *ill; 12745 12746 ipha = (ipha_t *)mp->b_rptr; 12747 ill = (ill_t *)q->q_ptr; 12748 12749 ASSERT(ill != NULL); 12750 /* 12751 * No IRE for this destination, so it can't be for us. 12752 * Unless we are forwarding, drop the packet. 12753 * We have to let source routed packets through 12754 * since we don't yet know if they are 'ping -l' 12755 * packets i.e. if they will go out over the 12756 * same interface as they came in on. 
12757 */ 12758 if (ll_multicast) { 12759 freemsg(mp); 12760 return; 12761 } 12762 if (!(ill->ill_flags & ILLF_ROUTER) && !ip_source_routed(ipha)) { 12763 BUMP_MIB(&ip_mib, ipForwProhibits); 12764 freemsg(mp); 12765 return; 12766 } 12767 12768 /* Check for Martian addresses */ 12769 if ((in_ill == NULL) && (ip_no_forward(ipha, ill))) { 12770 freemsg(mp); 12771 return; 12772 } 12773 12774 /* Mark this packet as having originated externally */ 12775 mp->b_prev = (mblk_t *)(uintptr_t)ill->ill_phyint->phyint_ifindex; 12776 12777 /* 12778 * Clear the indication that this may have a hardware checksum 12779 * as we are not using it 12780 */ 12781 DB_CKSUMFLAGS(mp) = 0; 12782 12783 /* 12784 * Now hand the packet to ip_newroute. 12785 */ 12786 ip_newroute(q, mp, dst, in_ill, NULL); 12787 } 12788 12789 /* 12790 * check ip header length and align it. 12791 */ 12792 static boolean_t 12793 ip_check_and_align_header(queue_t *q, mblk_t *mp) 12794 { 12795 ssize_t len; 12796 ill_t *ill; 12797 ipha_t *ipha; 12798 12799 len = MBLKL(mp); 12800 12801 if (!OK_32PTR(mp->b_rptr) || len < IP_SIMPLE_HDR_LENGTH) { 12802 if (!OK_32PTR(mp->b_rptr)) 12803 IP_STAT(ip_notaligned1); 12804 else 12805 IP_STAT(ip_notaligned2); 12806 /* Guard against bogus device drivers */ 12807 if (len < 0) { 12808 /* clear b_prev - used by ip_mroute_decap */ 12809 mp->b_prev = NULL; 12810 BUMP_MIB(&ip_mib, ipInHdrErrors); 12811 freemsg(mp); 12812 return (B_FALSE); 12813 } 12814 12815 if (ip_rput_pullups++ == 0) { 12816 ill = (ill_t *)q->q_ptr; 12817 ipha = (ipha_t *)mp->b_rptr; 12818 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, 12819 "ip_check_and_align_header: %s forced us to " 12820 " pullup pkt, hdr len %ld, hdr addr %p", 12821 ill->ill_name, len, ipha); 12822 } 12823 if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) { 12824 /* clear b_prev - used by ip_mroute_decap */ 12825 mp->b_prev = NULL; 12826 BUMP_MIB(&ip_mib, ipInDiscards); 12827 freemsg(mp); 12828 return (B_FALSE); 12829 } 12830 } 12831 return (B_TRUE); 12832 } 
12833 12834 static boolean_t 12835 ip_rput_notforus(queue_t **qp, mblk_t *mp, ire_t *ire, ill_t *ill) 12836 { 12837 ill_group_t *ill_group; 12838 ill_group_t *ire_group; 12839 queue_t *q; 12840 ill_t *ire_ill; 12841 uint_t ill_ifindex; 12842 12843 q = *qp; 12844 /* 12845 * We need to check to make sure the packet came in 12846 * on the queue associated with the destination IRE. 12847 * Note that for multicast packets and broadcast packets sent to 12848 * a broadcast address which is shared between multiple interfaces 12849 * we should not do this since we just got a random broadcast ire. 12850 */ 12851 if (ire->ire_rfq && ire->ire_type != IRE_BROADCAST) { 12852 boolean_t check_multi = B_TRUE; 12853 12854 /* 12855 * This packet came in on an interface other than the 12856 * one associated with the destination address. 12857 * "Gateway" it to the appropriate interface here. 12858 * As long as the ills belong to the same group, 12859 * we don't consider them to arriving on the wrong 12860 * interface. Thus, when the switch is doing inbound 12861 * load spreading, we won't drop packets when we 12862 * are doing strict multihoming checks. Note, the 12863 * same holds true for 'usesrc groups' where the 12864 * destination address may belong to another interface 12865 * to allow multipathing to happen 12866 */ 12867 ill_group = ill->ill_group; 12868 ire_ill = (ill_t *)(ire->ire_rfq)->q_ptr; 12869 ill_ifindex = ill->ill_usesrc_ifindex; 12870 ire_group = ire_ill->ill_group; 12871 12872 /* 12873 * If it's part of the same IPMP group, or if it's a legal 12874 * address on the 'usesrc' interface, then bypass strict 12875 * checks. 
/*
 * Forward a received unicast packet out ire->ire_stq, enforcing the
 * forwarding policy (ILLF_ROUTER on both in and out interfaces, or a
 * source route going back out the incoming interface/group), sending an
 * ICMP redirect when warranted, and applying downstream flow control.
 *
 * Consumes the caller's reference on ire on every path, and consumes mp
 * (forwarded, redirected into icmp_unreachable, or freed).
 */
static void
ip_rput_process_forward(queue_t *q, mblk_t *mp, ire_t *ire, ipha_t *ipha,
    ill_t *ill, int ll_multicast)
{
        ill_group_t     *ill_group;
        ill_group_t     *ire_group;
        queue_t         *dev_q;

        ASSERT(ire->ire_stq != NULL);
        /* Never forward link-layer multicast. */
        if (ll_multicast != 0)
                goto drop_pkt;

        /* Martian-source and related sanity checks. */
        if (ip_no_forward(ipha, ill))
                goto drop_pkt;

        ill_group = ill->ill_group;
        ire_group = ((ill_t *)(ire->ire_rfq)->q_ptr)->ill_group;
        /*
         * Check if we want to forward this one at this time.
         * We allow source routed packets on a host provided that
         * they go out the same interface or same interface group
         * as they came in on.
         *
         * XXX To be quicker, we may wish to not chase pointers to
         * get the ILLF_ROUTER flag and instead store the
         * forwarding policy in the ire.  An unfortunate
         * side-effect of that would be requiring an ire flush
         * whenever the ILLF_ROUTER flag changes.
         */
        if (((ill->ill_flags &
            ((ill_t *)ire->ire_stq->q_ptr)->ill_flags &
            ILLF_ROUTER) == 0) &&
            !(ip_source_routed(ipha) && (ire->ire_rfq == q ||
            (ill_group != NULL && ill_group == ire_group)))) {
                BUMP_MIB(&ip_mib, ipForwProhibits);
                if (ip_source_routed(ipha)) {
                        /* Tell the sender its source route failed. */
                        q = WR(q);
                        /*
                         * Clear the indication that this may have
                         * hardware checksum as we are not using it.
                         */
                        DB_CKSUMFLAGS(mp) = 0;
                        icmp_unreachable(q, mp,
                            ICMP_SOURCE_ROUTE_FAILED);
                        ire_refrele(ire);
                        return;
                }
                goto drop_pkt;
        }

        /* Packet is being forwarded. Turning off hwcksum flag. */
        DB_CKSUMFLAGS(mp) = 0;
        if (ip_g_send_redirects) {
                /*
                 * Check whether the incoming interface and outgoing
                 * interface is part of the same group. If so,
                 * send redirects.
                 *
                 * Check the source address to see if it originated
                 * on the same logical subnet it is going back out on.
                 * If so, we should be able to send it a redirect.
                 * Avoid sending a redirect if the destination
                 * is directly connected (gw_addr == 0),
                 * or if the packet was source routed out this
                 * interface.
                 */
                ipaddr_t        src;
                mblk_t          *mp1;
                ire_t           *src_ire = NULL;

                /*
                 * Check whether ire_rfq and q are from the same ill
                 * or if they are not same, they at least belong
                 * to the same group. If so, send redirects.
                 */
                if ((ire->ire_rfq == q ||
                    (ill_group != NULL && ill_group == ire_group)) &&
                    (ire->ire_gateway_addr != 0) &&
                    !ip_source_routed(ipha)) {

                        src = ipha->ipha_src;
                        src_ire = ire_ftable_lookup(src, 0, 0,
                            IRE_INTERFACE, ire->ire_ipif, NULL, ALL_ZONES,
                            0, NULL, MATCH_IRE_IPIF | MATCH_IRE_TYPE);

                        if (src_ire != NULL) {
                                /*
                                 * The source is directly connected.
                                 * Just copy the ip header (which is
                                 * in the first mblk)
                                 */
                                mp1 = copyb(mp);
                                if (mp1 != NULL) {
                                        icmp_send_redirect(WR(q), mp1,
                                            ire->ire_gateway_addr);
                                }
                                ire_refrele(src_ire);
                        }
                }
        }

        /* Downstream flow control: drop rather than queue when congested. */
        dev_q = ire->ire_stq->q_next;
        if ((dev_q->q_next || dev_q->q_first) && !canput(dev_q)) {
                BUMP_MIB(&ip_mib, ipInDiscards);
                freemsg(mp);
                ire_refrele(ire);
                return;
        }

        ip_rput_forward(ire, ipha, mp, ill);
        IRE_REFRELE(ire);
        return;

drop_pkt:
        ire_refrele(ire);
        ip2dbg(("ip_rput_forward: drop pkt\n"));
        freemsg(mp);
}
/*
 * Process a received broadcast packet: filter duplicate copies delivered
 * across an IPMP group, handle CGTP and multirouted broadcast corner
 * cases, and optionally perform directed-broadcast forwarding (one copy
 * up locally, one copy forwarded).
 *
 * Returns B_TRUE when the packet was consumed here (dropped or handed to
 * the forwarding path); B_FALSE when the caller should continue local
 * delivery, with *qp/*irep possibly rewritten and the saved hardware
 * checksum flags restored on mp.
 */
static boolean_t
ip_rput_process_broadcast(queue_t **qp, mblk_t *mp, ire_t **irep, ipha_t *ipha,
    ill_t *ill, ipaddr_t dst, int cgtp_flt_pkt, int ll_multicast)
{
        queue_t         *q;
        ire_t           *ire;
        uint16_t        hcksumflags;

        q = *qp;
        ire = *irep;

        /*
         * Clear the indication that this may have hardware
         * checksum as we are not using it for forwarding.
         * (Saved and restored at the bottom for the local-delivery path.)
         */
        hcksumflags = DB_CKSUMFLAGS(mp);
        DB_CKSUMFLAGS(mp) = 0;

        /*
         * Directed broadcast forwarding: if the packet came in over a
         * different interface than it is going out over, we can forward it.
         */
        /* TCP is never delivered to a broadcast address; discard. */
        if (ipha->ipha_protocol == IPPROTO_TCP) {
                ire_refrele(ire);
                freemsg(mp);
                BUMP_MIB(&ip_mib, ipInDiscards);
                return (B_TRUE);
        }
        /*
         * For multicast we have set dst to be INADDR_BROADCAST
         * for delivering to all STREAMS. IRE_MARK_NORECV is really
         * only for broadcast packets.
         */
        if (!CLASSD(ipha->ipha_dst)) {
                ire_t *new_ire;
                ipif_t *ipif;
                /*
                 * For ill groups, as the switch duplicates broadcasts
                 * across all the ports, we need to filter out and
                 * send up only one copy. There is one copy for every
                 * broadcast address on each ill. Thus, we look for a
                 * specific IRE on this ill and look at IRE_MARK_NORECV
                 * later to see whether this ill is eligible to receive
                 * them or not. ill_nominate_bcast_rcv() nominates only
                 * one set of IREs for receiving.
                 */

                ipif = ipif_get_next_ipif(NULL, ill);
                if (ipif == NULL) {
                        ire_refrele(ire);
                        freemsg(mp);
                        BUMP_MIB(&ip_mib, ipInDiscards);
                        return (B_TRUE);
                }
                new_ire = ire_ctable_lookup(dst, 0, 0,
                    ipif, ALL_ZONES, NULL, MATCH_IRE_ILL);
                ipif_refrele(ipif);

                if (new_ire != NULL) {
                        /* This ill was not nominated to receive the copy. */
                        if (new_ire->ire_marks & IRE_MARK_NORECV) {
                                ire_refrele(ire);
                                ire_refrele(new_ire);
                                freemsg(mp);
                                BUMP_MIB(&ip_mib, ipInDiscards);
                                return (B_TRUE);
                        }
                        /*
                         * In the special case of multirouted broadcast
                         * packets, we unconditionally need to "gateway"
                         * them to the appropriate interface here.
                         * In the normal case, this cannot happen, because
                         * there is no broadcast IRE tagged with the
                         * RTF_MULTIRT flag.
                         */
                        if (new_ire->ire_flags & RTF_MULTIRT) {
                                ire_refrele(new_ire);
                                if (ire->ire_rfq != NULL) {
                                        q = ire->ire_rfq;
                                        *qp = q;
                                }
                        } else {
                                ire_refrele(ire);
                                ire = new_ire;
                        }
                } else if (cgtp_flt_pkt == CGTP_IP_PKT_NOT_CGTP) {
                        if (!ip_g_forward_directed_bcast) {
                                /*
                                 * Free the message if
                                 * ip_g_forward_directed_bcast is turned
                                 * off for non-local broadcast.
                                 */
                                ire_refrele(ire);
                                freemsg(mp);
                                BUMP_MIB(&ip_mib, ipInDiscards);
                                return (B_TRUE);
                        }
                } else {
                        /*
                         * This CGTP packet successfully passed the
                         * CGTP filter, but the related CGTP
                         * broadcast IRE has not been found,
                         * meaning that the redundant ipif is
                         * probably down. However, if we discarded
                         * this packet, its duplicate would be
                         * filtered out by the CGTP filter so none
                         * of them would get through. So we keep
                         * going with this one.
                         */
                        ASSERT(cgtp_flt_pkt == CGTP_IP_PKT_PREMIUM);
                        if (ire->ire_rfq != NULL) {
                                q = ire->ire_rfq;
                                *qp = q;
                        }
                }
        }
        if (ip_g_forward_directed_bcast && ll_multicast == 0) {
                /*
                 * Verify that there is not more than one
                 * IRE_BROADCAST with this broadcast address which
                 * has ire_stq set.
                 * TODO: simplify, loop over all IRE's
                 *
                 * NOTE(review): the second loop below breaks as soon as
                 * the first ire_stq is found, so num_stq can never
                 * exceed 1 - it does not actually verify uniqueness.
                 * Confirm this is the intended behavior.
                 */
                ire_t   *ire1;
                int     num_stq = 0;
                mblk_t  *mp1;

                /* Find the first one with ire_stq set */
                rw_enter(&ire->ire_bucket->irb_lock, RW_READER);
                for (ire1 = ire; ire1 &&
                    !ire1->ire_stq && ire1->ire_addr == ire->ire_addr;
                    ire1 = ire1->ire_next)
                        ;
                if (ire1) {
                        ire_refrele(ire);
                        ire = ire1;
                        IRE_REFHOLD(ire);
                }

                /* Check if there are additional ones with stq set */
                for (ire1 = ire; ire1; ire1 = ire1->ire_next) {
                        if (ire->ire_addr != ire1->ire_addr)
                                break;
                        if (ire1->ire_stq) {
                                num_stq++;
                                break;
                        }
                }
                rw_exit(&ire->ire_bucket->irb_lock);
                if (num_stq == 1 && ire->ire_stq != NULL) {
                        ip1dbg(("ip_rput_process_broadcast: directed "
                            "broadcast to 0x%x\n",
                            ntohl(ire->ire_addr)));
                        /* Deliver a copy locally before forwarding. */
                        mp1 = copymsg(mp);
                        if (mp1) {
                                switch (ipha->ipha_protocol) {
                                case IPPROTO_UDP:
                                        ip_udp_input(q, mp1, ipha, ire, ill);
                                        break;
                                default:
                                        ip_proto_input(q, mp1, ipha, ire, ill);
                                        break;
                                }
                        }
                        /* Verify the header checksum before rewriting it. */
                        if (ip_csum_hdr(ipha)) {
                                BUMP_MIB(&ip_mib, ipInCksumErrs);
                                ip2dbg(("ip_rput_broadcast:drop pkt\n"));
                                freemsg(mp);
                                ire_refrele(ire);
                                return (B_TRUE);
                        }
                        /*
                         * Adjust ttl to ip_broadcast_ttl + 1 - the forward
                         * engine will decrement it by one.
                         */
                        ipha->ipha_ttl = ip_broadcast_ttl + 1;
                        ipha->ipha_hdr_checksum = 0;
                        ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
                        ip_rput_process_forward(q, mp, ire, ipha,
                            ill, ll_multicast);
                        return (B_TRUE);
                }
                ip1dbg(("ip_rput: NO directed broadcast to 0x%x\n",
                    ntohl(ire->ire_addr)));
        }

        *irep = ire;

        /* Restore any hardware checksum flags */
        DB_CKSUMFLAGS(mp) = hcksumflags;
        return (B_FALSE);
}
13193 */ 13194 if (ip_csum_hdr(ipha)) { 13195 BUMP_MIB(&ip_mib, ipInCksumErrs); 13196 ip2dbg(("ip_rput_broadcast:drop pkt\n")); 13197 freemsg(mp); 13198 ire_refrele(ire); 13199 return (B_TRUE); 13200 } 13201 ipha->ipha_ttl = ip_broadcast_ttl + 1; 13202 ipha->ipha_hdr_checksum = 0; 13203 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 13204 ip_rput_process_forward(q, mp, ire, ipha, 13205 ill, ll_multicast); 13206 return (B_TRUE); 13207 } 13208 ip1dbg(("ip_rput: NO directed broadcast to 0x%x\n", 13209 ntohl(ire->ire_addr))); 13210 } 13211 13212 *irep = ire; 13213 13214 /* Restore any hardware checksum flags */ 13215 DB_CKSUMFLAGS(mp) = hcksumflags; 13216 return (B_FALSE); 13217 } 13218 13219 /* ARGSUSED */ 13220 static boolean_t 13221 ip_rput_process_multicast(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha, 13222 int *ll_multicast, ipaddr_t *dstp) 13223 { 13224 /* 13225 * Forward packets only if we have joined the allmulti 13226 * group on this interface. 13227 */ 13228 if (ip_g_mrouter && ill->ill_join_allmulti) { 13229 int retval; 13230 13231 /* 13232 * Clear the indication that this may have hardware 13233 * checksum as we are not using it. 13234 */ 13235 DB_CKSUMFLAGS(mp) = 0; 13236 retval = ip_mforward(ill, ipha, mp); 13237 /* ip_mforward updates mib variables if needed */ 13238 /* clear b_prev - used by ip_mroute_decap */ 13239 mp->b_prev = NULL; 13240 13241 switch (retval) { 13242 case 0: 13243 /* 13244 * pkt is okay and arrived on phyint. 13245 * 13246 * If we are running as a multicast router 13247 * we need to see all IGMP and/or PIM packets. 13248 */ 13249 if ((ipha->ipha_protocol == IPPROTO_IGMP) || 13250 (ipha->ipha_protocol == IPPROTO_PIM)) { 13251 goto done; 13252 } 13253 break; 13254 case -1: 13255 /* pkt is mal-formed, toss it */ 13256 goto drop_pkt; 13257 case 1: 13258 /* pkt is okay and arrived on a tunnel */ 13259 /* 13260 * If we are running a multicast router 13261 * we need to see all igmp packets. 
13262 */ 13263 if (ipha->ipha_protocol == IPPROTO_IGMP) { 13264 *dstp = INADDR_BROADCAST; 13265 *ll_multicast = 1; 13266 return (B_FALSE); 13267 } 13268 13269 goto drop_pkt; 13270 } 13271 } 13272 13273 ILM_WALKER_HOLD(ill); 13274 if (ilm_lookup_ill(ill, *dstp, ALL_ZONES) == NULL) { 13275 /* 13276 * This might just be caused by the fact that 13277 * multiple IP Multicast addresses map to the same 13278 * link layer multicast - no need to increment counter! 13279 */ 13280 ILM_WALKER_RELE(ill); 13281 freemsg(mp); 13282 return (B_TRUE); 13283 } 13284 ILM_WALKER_RELE(ill); 13285 done: 13286 ip2dbg(("ip_rput: multicast for us: 0x%x\n", ntohl(*dstp))); 13287 /* 13288 * This assumes the we deliver to all streams for multicast 13289 * and broadcast packets. 13290 */ 13291 *dstp = INADDR_BROADCAST; 13292 *ll_multicast = 1; 13293 return (B_FALSE); 13294 drop_pkt: 13295 ip2dbg(("ip_rput: drop pkt\n")); 13296 freemsg(mp); 13297 return (B_TRUE); 13298 } 13299 13300 static boolean_t 13301 ip_rput_process_notdata(queue_t *q, mblk_t **first_mpp, ill_t *ill, 13302 int *ll_multicast, mblk_t **mpp) 13303 { 13304 mblk_t *mp1, *from_mp, *to_mp, *mp, *first_mp; 13305 boolean_t must_copy = B_FALSE; 13306 struct iocblk *iocp; 13307 ipha_t *ipha; 13308 13309 #define rptr ((uchar_t *)ipha) 13310 13311 first_mp = *first_mpp; 13312 mp = *mpp; 13313 13314 ASSERT(first_mp == mp); 13315 13316 /* 13317 * if db_ref > 1 then copymsg and free original. Packet may be 13318 * changed and do not want other entity who has a reference to this 13319 * message to trip over the changes. This is a blind change because 13320 * trying to catch all places that might change packet is too 13321 * difficult (since it may be a module above this one) 13322 * 13323 * This corresponds to the non-fast path case. We walk down the full 13324 * chain in this case, and check the db_ref count of all the dblks, 13325 * and do a copymsg if required. 
	 * It is possible that the db_ref counts
	 * of the data blocks in the mblk chain can be different.
	 * For example, we can get a DL_UNITDATA_IND(M_PROTO) with a db_ref
	 * count of 1, followed by a M_DATA block with a ref count of 2, if
	 * 'snoop' is running.
	 */
	for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) {
		if (mp1->b_datap->db_ref > 1) {
			must_copy = B_TRUE;
			break;
		}
	}

	if (must_copy) {
		mp1 = copymsg(mp);
		if (mp1 == NULL) {
			/* Copy failed: clear chain linkage and drop */
			for (mp1 = mp; mp1 != NULL;
			    mp1 = mp1->b_cont) {
				mp1->b_next = NULL;
				mp1->b_prev = NULL;
			}
			freemsg(mp);
			BUMP_MIB(&ip_mib, ipInDiscards);
			return (B_TRUE);
		}
		for (from_mp = mp, to_mp = mp1; from_mp != NULL;
		    from_mp = from_mp->b_cont, to_mp = to_mp->b_cont) {
			/* Copy b_next - used in M_BREAK messages */
			to_mp->b_next = from_mp->b_next;
			from_mp->b_next = NULL;
			/* Copy b_prev - used by ip_mroute_decap */
			to_mp->b_prev = from_mp->b_prev;
			from_mp->b_prev = NULL;
		}
		*first_mpp = first_mp = mp1;
		freemsg(mp);
		mp = mp1;
		*mpp = mp1;
	}

	ipha = (ipha_t *)mp->b_rptr;

	/*
	 * previous code has a case for M_DATA.
	 * We want to check how that happens.
	 */
	ASSERT(first_mp->b_datap->db_type != M_DATA);
	switch (first_mp->b_datap->db_type) {
	case M_PROTO:
	case M_PCPROTO:
		if (((dl_unitdata_ind_t *)rptr)->dl_primitive !=
		    DL_UNITDATA_IND) {
			/* Go handle anything other than data elsewhere. */
			ip_rput_dlpi(q, mp);
			return (B_TRUE);
		}
		*ll_multicast = ((dl_unitdata_ind_t *)rptr)->dl_group_address;
		/* Ditch the DLPI header. */
		mp1 = mp->b_cont;
		ASSERT(first_mp == mp);
		*first_mpp = mp1;
		freeb(mp);
		*mpp = mp1;
		return (B_FALSE);
	case M_BREAK:
		/*
		 * A packet arrives as M_BREAK following a cycle through
		 * ip_rput, ip_newroute, ... and finally ire_add_then_send.
		 * This is an IP datagram sans lower level header.
		 * M_BREAK are also used to pass back in multicast packets
		 * that are encapsulated with a source route.
		 */
		/* Ditch the M_BREAK mblk */
		mp1 = mp->b_cont;
		ASSERT(first_mp == mp);
		*first_mpp = mp1;
		freeb(mp);
		mp = mp1;
		mp->b_next = NULL;
		*mpp = mp;
		*ll_multicast = 0;
		return (B_FALSE);
	case M_IOCACK:
		ip1dbg(("got iocack "));
		iocp = (struct iocblk *)mp->b_rptr;
		switch (iocp->ioc_cmd) {
		case DL_IOC_HDR_INFO:
			ill = (ill_t *)q->q_ptr;
			ill_fastpath_ack(ill, mp);
			return (B_TRUE);
		case SIOCSTUNPARAM:
		case OSIOCSTUNPARAM:
			/* Go through qwriter_ip */
			break;
		case SIOCGTUNPARAM:
		case OSIOCGTUNPARAM:
			ip_rput_other(NULL, q, mp, NULL);
			return (B_TRUE);
		default:
			putnext(q, mp);
			return (B_TRUE);
		}
		/* FALLTHRU */
	case M_ERROR:
	case M_HANGUP:
		/*
		 * Since this is on the ill stream we unconditionally
		 * bump up the refcount
		 */
		ill_refhold(ill);
		(void) qwriter_ip(NULL, ill, q, mp, ip_rput_other, CUR_OP,
		    B_FALSE);
		return (B_TRUE);
	case M_CTL:
		if ((MBLKL(first_mp) >= sizeof (da_ipsec_t)) &&
		    (((da_ipsec_t *)first_mp->b_rptr)->da_type ==
		    IPHADA_M_CTL)) {
			/*
			 * It's an IPsec accelerated packet.
			 * Make sure that the ill from which we received the
			 * packet has enabled IPsec hardware acceleration.
			 */
			if (!(ill->ill_capabilities &
			    (ILL_CAPAB_AH|ILL_CAPAB_ESP))) {
				/* IPsec kstats: bean counter */
				freemsg(mp);
				return (B_TRUE);
			}

			/*
			 * Make mp point to the mblk following the M_CTL,
			 * then process according to type of mp.
			 * After this processing, first_mp will point to
			 * the data-attributes and mp to the pkt following
			 * the M_CTL.
			 */
			mp = first_mp->b_cont;
			if (mp == NULL) {
				freemsg(first_mp);
				return (B_TRUE);
			}
			/*
			 * A Hardware Accelerated packet can only be M_DATA
			 * ESP or AH packet.
			 */
			if (mp->b_datap->db_type != M_DATA) {
				/* non-M_DATA IPsec accelerated packet */
				IPSECHW_DEBUG(IPSECHW_PKT,
				    ("non-M_DATA IPsec accelerated pkt\n"));
				freemsg(first_mp);
				return (B_TRUE);
			}
			ipha = (ipha_t *)mp->b_rptr;
			if (ipha->ipha_protocol != IPPROTO_AH &&
			    ipha->ipha_protocol != IPPROTO_ESP) {
				IPSECHW_DEBUG(IPSECHW_PKT,
				    ("non-M_DATA IPsec accelerated pkt\n"));
				freemsg(first_mp);
				return (B_TRUE);
			}
			*mpp = mp;
			return (B_FALSE);
		}
		putnext(q, mp);
		return (B_TRUE);
	case M_FLUSH:
		if (*mp->b_rptr & FLUSHW) {
			*mp->b_rptr &= ~FLUSHR;
			qreply(q, mp);
			return (B_TRUE);
		}
		freemsg(mp);
		return (B_TRUE);
	case M_IOCNAK:
		ip1dbg(("got iocnak "));
		iocp = (struct iocblk *)mp->b_rptr;
		switch (iocp->ioc_cmd) {
		case DL_IOC_HDR_INFO:
		case SIOCSTUNPARAM:
		case OSIOCSTUNPARAM:
			/*
			 * Since this is on the ill stream we unconditionally
			 * bump up the refcount
			 */
			ill_refhold(ill);
			(void) qwriter_ip(NULL, ill, q, mp, ip_rput_other,
			    CUR_OP, B_FALSE);
			return (B_TRUE);
		case SIOCGTUNPARAM:
		case OSIOCGTUNPARAM:
			ip_rput_other(NULL, q, mp, NULL);
			return (B_TRUE);
		default:
			break;
		}
		/* FALLTHRU */
	default:
		putnext(q, mp);
		return (B_TRUE);
	}
}

/* Read side put procedure. Packets coming from the wire arrive here. */
void
ip_rput(queue_t *q, mblk_t *mp)
{
	ill_t	*ill;

	TRACE_1(TR_FAC_IP, TR_IP_RPUT_START, "ip_rput_start: q %p", q);

	ill = (ill_t *)q->q_ptr;

	if (ill->ill_state_flags & (ILL_CONDEMNED | ILL_LL_SUBNET_PENDING)) {
		union DL_primitives *dl;

		/*
		 * Things are opening or closing. Only accept DLPI control
		 * messages. In the open case, the ill->ill_ipif has not yet
		 * been created. In the close case, things hanging off the
		 * ill could have been freed already. In either case it
		 * may not be safe to proceed further.
		 */

		dl = (union DL_primitives *)mp->b_rptr;
		if ((mp->b_datap->db_type != M_PCPROTO) ||
		    (dl->dl_primitive == DL_UNITDATA_IND)) {
			/*
			 * Also SIOC[GS]TUN* ioctls can come here.
			 */
			inet_freemsg(mp);
			TRACE_2(TR_FAC_IP, TR_IP_RPUT_END,
			    "ip_input_end: q %p (%S)", q, "uninit");
			return;
		}
	}

	/*
	 * if db_ref > 1 then copymsg and free original. Packet may be
	 * changed and we do not want the other entity who has a reference to
	 * this message to trip over the changes. This is a blind change because
	 * trying to catch all places that might change the packet is too
	 * difficult.
	 *
	 * This corresponds to the fast path case, where we have a chain of
	 * M_DATA mblks. We check the db_ref count of only the 1st data block
	 * in the mblk chain. There doesn't seem to be a reason why a device
	 * driver would send up data with varying db_ref counts in the mblk
	 * chain. In any case the Fast path is a private interface, and our
	 * drivers don't do such a thing.
	 * Given the above assumption, there is
	 * no need to walk down the entire mblk chain (which could have a
	 * potential performance problem)
	 */
	if (mp->b_datap->db_ref > 1) {
		mblk_t	*mp1;
		boolean_t adjusted = B_FALSE;
		IP_STAT(ip_db_ref);

		/*
		 * The IP_RECVSLLA option depends on having the link layer
		 * header. First check that:
		 * a> the underlying device is of type ether, since this
		 * option is currently supported only over ethernet.
		 * b> there is enough room to copy over the link layer header.
		 *
		 * Once the checks are done, adjust rptr so that the link layer
		 * header will be copied via copymsg. Note that, IFT_ETHER may
		 * be returned by some non-ethernet drivers but in this case the
		 * second check will fail.
		 */
		if (ill->ill_type == IFT_ETHER &&
		    (mp->b_rptr - mp->b_datap->db_base) >=
		    sizeof (struct ether_header)) {
			mp->b_rptr -= sizeof (struct ether_header);
			adjusted = B_TRUE;
		}
		mp1 = copymsg(mp);
		if (mp1 == NULL) {
			/* Clear b_next - used in M_BREAK messages */
			mp->b_next = NULL;
			/* clear b_prev - used by ip_mroute_decap */
			mp->b_prev = NULL;
			freemsg(mp);
			BUMP_MIB(&ip_mib, ipInDiscards);
			TRACE_2(TR_FAC_IP, TR_IP_RPUT_END,
			    "ip_rput_end: q %p (%S)", q, "copymsg");
			return;
		}
		if (adjusted) {
			/*
			 * Copy is done. Restore the pointer in the _new_ mblk
			 */
			mp1->b_rptr += sizeof (struct ether_header);
		}
		/* Copy b_next - used in M_BREAK messages */
		mp1->b_next = mp->b_next;
		mp->b_next = NULL;
		/* Copy b_prev - used by ip_mroute_decap */
		mp1->b_prev = mp->b_prev;
		mp->b_prev = NULL;
		freemsg(mp);
		mp = mp1;
	}

	TRACE_2(TR_FAC_IP, TR_IP_RPUT_END,
	    "ip_rput_end: q %p (%S)", q, "end");

	ip_input(ill, NULL, mp, 0);
}

/*
 * Direct read side procedure capable of dealing with chains. GLDv3 based
 * drivers call this function directly with mblk chains while STREAMS
 * read side procedure ip_rput() calls this for single packet with ip_ring
 * set to NULL to process one packet at a time.
 *
 * The ill will always be valid if this function is called directly from
 * the driver.
 */
/*ARGSUSED*/
void
ip_input(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain, size_t hdrlen)
{
	ipaddr_t dst;
	ire_t	*ire;
	ipha_t	*ipha;
	uint_t	pkt_len;
	ssize_t	len;
	uint_t	opt_len;
	int	ll_multicast;
	int	cgtp_flt_pkt;
	queue_t	*q = ill->ill_rq;
	squeue_t	*curr_sqp = NULL;
	mblk_t	*head = NULL;
	mblk_t	*tail = NULL;
	mblk_t	*first_mp;
	mblk_t	*mp;
	int	cnt = 0;

	ASSERT(mp_chain != NULL);
	ASSERT(ill != NULL);

	TRACE_1(TR_FAC_IP, TR_IP_RPUT_START, "ip_input_start: q %p", q);

#define	rptr	((uchar_t *)ipha)

	while (mp_chain != NULL) {
		first_mp = mp = mp_chain;
		mp_chain = mp_chain->b_next;
		mp->b_next = NULL;
		ll_multicast = 0;
		ire = NULL;

		/*
		 * ip_input fast path
		 */

		/* mblk type is not M_DATA */
		if (mp->b_datap->db_type != M_DATA) {
			if (ip_rput_process_notdata(q, &first_mp, ill,
			    &ll_multicast, &mp))
				continue;
		}
		ASSERT(mp->b_datap->db_type == M_DATA);
		ASSERT(mp->b_datap->db_ref == 1);

		/*
		 * Invoke the CGTP (multirouting) filtering module to process
		 * the incoming packet. Packets identified as duplicates
		 * must be discarded. Filtering is active only if the
		 * the ip_cgtp_filter ndd variable is non-zero.
		 */
		cgtp_flt_pkt = CGTP_IP_PKT_NOT_CGTP;
		if (ip_cgtp_filter && (ip_cgtp_filter_ops != NULL)) {
			cgtp_flt_pkt =
			    ip_cgtp_filter_ops->cfo_filter_fp(q, mp);
			if (cgtp_flt_pkt == CGTP_IP_PKT_DUPLICATE) {
				freemsg(first_mp);
				continue;
			}
		}

		ipha = (ipha_t *)mp->b_rptr;
		len = mp->b_wptr - rptr;

		BUMP_MIB(&ip_mib, ipInReceives);

		/*
		 * IP header ptr not aligned?
		 * OR IP header not complete in first mblk
		 */
		if (!OK_32PTR(rptr) || len < IP_SIMPLE_HDR_LENGTH) {
			if (!ip_check_and_align_header(q, mp))
				continue;
			ipha = (ipha_t *)mp->b_rptr;
			len = mp->b_wptr - rptr;
		}

		/* multiple mblk or too short */
		pkt_len = ntohs(ipha->ipha_length);
		len -= pkt_len;
		if (len != 0) {
			/*
			 * Make sure we have data length consistent
			 * with the IP header.
			 */
			if (mp->b_cont == NULL) {
				if (len < 0 || pkt_len < IP_SIMPLE_HDR_LENGTH) {
					BUMP_MIB(&ip_mib, ipInHdrErrors);
					ip2dbg(("ip_input: drop pkt\n"));
					freemsg(mp);
					continue;
				}
				mp->b_wptr = rptr + pkt_len;
			} else if (len += msgdsize(mp->b_cont)) {
				/*
				 * NOTE: assignment-in-condition is
				 * intentional; len accumulates the excess
				 * over the IP header length.
				 */
				if (len < 0 || pkt_len < IP_SIMPLE_HDR_LENGTH) {
					BUMP_MIB(&ip_mib, ipInHdrErrors);
					ip2dbg(("ip_input: drop pkt\n"));
					freemsg(mp);
					continue;
				}
				(void) adjmsg(mp, -len);
				IP_STAT(ip_multimblk3);
			}
		}

		if (ip_loopback_src_or_dst(ipha, ill)) {
			ip2dbg(("ip_input: drop pkt\n"));
			freemsg(mp);
			continue;
		}

		/*
		 * Attach any necessary label information to this packet.
		 */
		if (is_system_labeled() &&
		    !tsol_get_pkt_label(mp, IPV4_VERSION)) {
			BUMP_MIB(&ip_mib, ipInDiscards);
			freemsg(mp);
			continue;
		}

		opt_len = ipha->ipha_version_and_hdr_length -
		    IP_SIMPLE_HDR_VERSION;
		/* IP version bad or there are IP options */
		if (opt_len) {
			if (len != 0)
				IP_STAT(ip_multimblk4);
			else
				IP_STAT(ip_ipoptions);
			if (!ip_rput_multimblk_ipoptions(q, mp, &ipha, &dst))
				continue;
		} else {
			dst = ipha->ipha_dst;
		}

		/*
		 * If rsvpd is running, let RSVP daemon handle its processing
		 * and forwarding of RSVP multicast/unicast packets.
		 * If rsvpd is not running but mrouted is running, RSVP
		 * multicast packets are forwarded as multicast traffic
		 * and RSVP unicast packets are forwarded by unicast router.
		 * If neither rsvpd nor mrouted is running, RSVP multicast
		 * packets are not forwarded, but the unicast packets are
		 * forwarded like unicast traffic.
		 */
		if (ipha->ipha_protocol == IPPROTO_RSVP &&
		    ipcl_proto_search(IPPROTO_RSVP) != NULL) {
			/* RSVP packet and rsvpd running. Treat as ours */
			ip2dbg(("ip_input: RSVP for us: 0x%x\n", ntohl(dst)));
			/*
			 * This assumes that we deliver to all streams for
			 * multicast and broadcast packets.
			 * We have to force ll_multicast to 1 to handle the
			 * M_DATA messages passed in from ip_mroute_decap.
			 */
			dst = INADDR_BROADCAST;
			ll_multicast = 1;
		} else if (CLASSD(dst)) {
			/* packet is multicast */
			mp->b_next = NULL;
			if (ip_rput_process_multicast(q, mp, ill, ipha,
			    &ll_multicast, &dst))
				continue;
		}


		/*
		 * Check if the packet is coming from the Mobile IP
		 * forward tunnel interface
		 */
		if (ill->ill_srcif_refcnt > 0) {
			ire = ire_srcif_table_lookup(dst, IRE_INTERFACE,
			    NULL, ill, MATCH_IRE_TYPE);
			if (ire != NULL && ire->ire_dlureq_mp == NULL &&
			    ire->ire_ipif->ipif_net_type ==
			    IRE_IF_RESOLVER) {
				/* We need to resolve the link layer info */
				ire_refrele(ire);
				ip_rput_noire(q, (ill_t *)q->q_ptr, mp,
				    ll_multicast, dst);
				continue;
			}
		}

		if (ire == NULL) {
			ire = ire_cache_lookup(dst, ALL_ZONES,
			    MBLK_GETLABEL(mp));
		}

		/*
		 * If mipagent is running and reverse tunnel is created as per
		 * mobile node request, then any packet coming through the
		 * incoming interface from the mobile-node, should be reverse
		 * tunneled to it's home agent except those that are destined
		 * to foreign agent only.
		 * This needs source address based ire lookup. The routing
		 * entries for source address based lookup are only created by
		 * mipagent program only when a reverse tunnel is created.
		 * Reference : RFC2002, RFC2344
		 */
		if (ill->ill_mrtun_refcnt > 0) {
			ipaddr_t	srcaddr;
			ire_t *tmp_ire;

			tmp_ire = ire;	/* Save, we might need it later */
			if (ire == NULL || (ire->ire_type != IRE_LOCAL &&
			    ire->ire_type != IRE_BROADCAST)) {
				srcaddr = ipha->ipha_src;
				ire = ire_mrtun_lookup(srcaddr, ill);
				if (ire != NULL) {
					/*
					 * Should not be getting iphada packet
					 * here. we should only get those for
					 * IRE_LOCAL traffic, excluded above.
					 * Fail-safe (drop packet) in the event
					 * hardware is misbehaving.
					 */
					if (first_mp != mp) {
						/* IPsec KSTATS: beancount me */
						freemsg(first_mp);
					} else {
						/*
						 * This packet must be forwarded
						 * to Reverse Tunnel
						 */
						ip_mrtun_forward(ire, ill, mp);
					}
					ire_refrele(ire);
					if (tmp_ire != NULL)
						ire_refrele(tmp_ire);
					TRACE_2(TR_FAC_IP, TR_IP_RPUT_END,
					    "ip_input_end: q %p (%S)",
					    q, "uninit");
					continue;
				}
			}
			/*
			 * If this packet is from a non-mobilenode or a
			 * mobile-node which does not request reverse
			 * tunnel service
			 */
			ire = tmp_ire;
		}


		/*
		 * If we reach here that means the incoming packet satisfies
		 * one of the following conditions:
		 * - packet is from a mobile node which does not request
		 * reverse tunnel
		 * - packet is from a non-mobile node, which is the most
		 * common case
		 * - packet is from a reverse tunnel enabled mobile node
		 * and destined to foreign agent only
		 */

		if (ire == NULL) {
			/*
			 * No IRE for this destination, so it can't be for us.
			 * Unless we are forwarding, drop the packet.
			 * We have to let source routed packets through
			 * since we don't yet know if they are 'ping -l'
			 * packets i.e. if they will go out over the
			 * same interface as they came in on.
			 */
			ip_rput_noire(q, NULL, mp, ll_multicast, dst);
			continue;
		}

		/*
		 * Broadcast IRE may indicate either broadcast or
		 * multicast packet
		 */
		if (ire->ire_type == IRE_BROADCAST) {
			/*
			 * Skip broadcast checks if packet is UDP multicast;
			 * we'd rather not enter ip_rput_process_broadcast()
			 * unless the packet is broadcast for real, since
			 * that routine is a no-op for multicast.
			 */
			if ((ipha->ipha_protocol != IPPROTO_UDP ||
			    !CLASSD(ipha->ipha_dst)) &&
			    ip_rput_process_broadcast(&q, mp, &ire, ipha, ill,
			    dst, cgtp_flt_pkt, ll_multicast)) {
				continue;
			}
		} else if (ire->ire_stq != NULL) {
			/* forwarding? */
			ip_rput_process_forward(q, mp, ire, ipha, ill,
			    ll_multicast);
			continue;
		}

		/* packet not for us */
		if (ire->ire_rfq != q) {
			if (ip_rput_notforus(&q, mp, ire, ill)) {
				continue;
			}
		}

		switch (ipha->ipha_protocol) {
		case IPPROTO_TCP:
			ASSERT(first_mp == mp);
			/* Batch TCP packets per squeue for local delivery */
			if ((mp = ip_tcp_input(mp, ipha, ill, B_FALSE, ire,
			    mp, 0, q, ip_ring)) != NULL) {
				if (curr_sqp == NULL) {
					curr_sqp = GET_SQUEUE(mp);
					ASSERT(cnt == 0);
					cnt++;
					head = tail = mp;
				} else if (curr_sqp == GET_SQUEUE(mp)) {
					ASSERT(tail != NULL);
					cnt++;
					tail->b_next = mp;
					tail = mp;
				} else {
					/*
					 * A different squeue. Send the
					 * chain for the previous squeue on
					 * its way. This shouldn't happen
					 * often unless interrupt binding
					 * changes.
					 */
					IP_STAT(ip_input_multi_squeue);
					squeue_enter_chain(curr_sqp, head,
					    tail, cnt, SQTAG_IP_INPUT);
					curr_sqp = GET_SQUEUE(mp);
					head = mp;
					tail = mp;
					cnt = 1;
				}
			}
			IRE_REFRELE(ire);
			continue;
		case IPPROTO_UDP:
			ASSERT(first_mp == mp);
			ip_udp_input(q, mp, ipha, ire, ill);
			IRE_REFRELE(ire);
			continue;
		case IPPROTO_SCTP:
			ASSERT(first_mp == mp);
			ip_sctp_input(mp, ipha, ill, B_FALSE, ire, mp, 0,
			    q, dst);
			continue;
		default:
			ip_proto_input(q, first_mp, ipha, ire, ill);
			IRE_REFRELE(ire);
			continue;
		}
	}

	if (head != NULL)
		squeue_enter_chain(curr_sqp, head, tail, cnt, SQTAG_IP_INPUT);

	/*
	 * This code is there just to make netperf/ttcp look good.
	 *
	 * Its possible that after being in polling mode (and having cleared
	 * the backlog), squeues have turned the interrupt frequency higher
	 * to improve latency at the expense of more CPU utilization (less
	 * packets per interrupts or more number of interrupts). Workloads
	 * like ttcp/netperf do manage to tickle polling once in a while
	 * but for the remaining time, stay in higher interrupt mode since
	 * their packet arrival rate is pretty uniform and this shows up
	 * as higher CPU utilization. Since people care about CPU utilization
	 * while running netperf/ttcp, turn the interrupt frequency back to
	 * normal/default if polling has not been used in ip_poll_normal_ticks.
	 */
	if (ip_ring != NULL && (ip_ring->rr_poll_state & ILL_POLLING)) {
		if (lbolt >= (ip_ring->rr_poll_time + ip_poll_normal_ticks)) {
			ip_ring->rr_poll_state &= ~ILL_POLLING;
			ip_ring->rr_blank(ip_ring->rr_handle,
			    ip_ring->rr_normal_blank_time,
			    ip_ring->rr_normal_pkt_cnt);
		}
	}

	TRACE_2(TR_FAC_IP, TR_IP_RPUT_END,
	    "ip_input_end: q %p (%S)", q, "end");
#undef	rptr
}

/*
 * Log a DLPI failure for primitive 'prim' on this ill; 'err' is the unix
 * errno reported with DL_SYSERR, otherwise 'dl_err' is the DLPI error code.
 */
static void
ip_dlpi_error(ill_t *ill, t_uscalar_t prim, t_uscalar_t dl_err,
    t_uscalar_t err)
{
	if (dl_err == DL_SYSERR) {
		(void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE,
		    "%s: %s failed: DL_SYSERR (errno %u)\n",
		    ill->ill_name, dlpi_prim_str(prim), err);
		return;
	}

	(void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE,
	    "%s: %s failed: %s\n", ill->ill_name, dlpi_prim_str(prim),
	    dlpi_err_str(dl_err));
}

/*
 * ip_rput_dlpi is called by ip_rput to handle all DLPI messages other
 * than DL_UNITDATA_IND messages. If we need to process this message
 * exclusively, we call qwriter_ip, in which case we also need to call
 * ill_refhold before that, since qwriter_ip does an ill_refrele.
 */
void
ip_rput_dlpi(queue_t *q, mblk_t *mp)
{
	dl_ok_ack_t	*dloa = (dl_ok_ack_t *)mp->b_rptr;
	dl_error_ack_t	*dlea = (dl_error_ack_t *)dloa;
	ill_t		*ill;

	ip1dbg(("ip_rput_dlpi"));
	ill = (ill_t *)q->q_ptr;
	switch (dloa->dl_primitive) {
	case DL_ERROR_ACK:
		ip2dbg(("ip_rput_dlpi(%s): DL_ERROR_ACK %s (0x%x): "
		    "%s (0x%x), unix %u\n", ill->ill_name,
		    dlpi_prim_str(dlea->dl_error_primitive),
		    dlea->dl_error_primitive,
		    dlpi_err_str(dlea->dl_errno),
		    dlea->dl_errno,
		    dlea->dl_unix_errno));
		switch (dlea->dl_error_primitive) {
		case DL_UNBIND_REQ:
			mutex_enter(&ill->ill_lock);
			ill->ill_state_flags &= ~ILL_DL_UNBIND_IN_PROGRESS;
			cv_signal(&ill->ill_cv);
			mutex_exit(&ill->ill_lock);
			/* FALLTHRU */
		case DL_NOTIFY_REQ:
		case DL_ATTACH_REQ:
		case DL_DETACH_REQ:
		case DL_INFO_REQ:
		case DL_BIND_REQ:
		case DL_ENABMULTI_REQ:
		case DL_PHYS_ADDR_REQ:
		case DL_CAPABILITY_REQ:
		case DL_CONTROL_REQ:
			/*
			 * Refhold the ill to match qwriter_ip which does a
			 * refrele. Since this is on the ill stream we
			 * unconditionally bump up the refcount without
			 * checking for ILL_CAN_LOOKUP
			 */
			ill_refhold(ill);
			(void) qwriter_ip(NULL, ill, q, mp, ip_rput_dlpi_writer,
			    CUR_OP, B_FALSE);
			return;
		case DL_DISABMULTI_REQ:
			freemsg(mp);	/* Don't want to pass this up */
			return;
		default:
			break;
		}
		ip_dlpi_error(ill, dlea->dl_error_primitive,
		    dlea->dl_errno, dlea->dl_unix_errno);
		freemsg(mp);
		return;
	case DL_INFO_ACK:
	case DL_BIND_ACK:
	case DL_PHYS_ADDR_ACK:
	case DL_NOTIFY_ACK:
	case DL_CAPABILITY_ACK:
	case DL_CONTROL_ACK:
		/*
		 * Refhold the ill to match qwriter_ip which does a refrele
		 * Since this is on the ill stream we unconditionally
		 * bump up the refcount without doing ILL_CAN_LOOKUP.
		 */
		ill_refhold(ill);
		(void) qwriter_ip(NULL, ill, q, mp, ip_rput_dlpi_writer,
		    CUR_OP, B_FALSE);
		return;
	case DL_NOTIFY_IND:
		ill_refhold(ill);
		/*
		 * The DL_NOTIFY_IND is an asynchronous message that has no
		 * relation to the current ioctl in progress (if any). Hence we
		 * pass in NEW_OP in this case.
		 */
		(void) qwriter_ip(NULL, ill, q, mp, ip_rput_dlpi_writer,
		    NEW_OP, B_FALSE);
		return;
	case DL_OK_ACK:
		ip1dbg(("ip_rput: DL_OK_ACK for %s\n",
		    dlpi_prim_str((int)dloa->dl_correct_primitive)));
		switch (dloa->dl_correct_primitive) {
		case DL_UNBIND_REQ:
			mutex_enter(&ill->ill_lock);
			ill->ill_state_flags &= ~ILL_DL_UNBIND_IN_PROGRESS;
			cv_signal(&ill->ill_cv);
			mutex_exit(&ill->ill_lock);
			/* FALLTHRU */
		case DL_ATTACH_REQ:
		case DL_DETACH_REQ:
			/*
			 * Refhold the ill to match qwriter_ip which does a
			 * refrele. Since this is on the ill stream we
			 * unconditionally bump up the refcount
			 */
			ill_refhold(ill);
			qwriter_ip(NULL, ill, q, mp, ip_rput_dlpi_writer,
			    CUR_OP, B_FALSE);
			return;
		case DL_ENABMULTI_REQ:
			if (ill->ill_dlpi_multicast_state == IDMS_INPROGRESS)
				ill->ill_dlpi_multicast_state = IDMS_OK;
			break;

		}
		break;
	default:
		break;
	}
	freemsg(mp);
}

/*
 * Handling of DLPI messages that require exclusive access to the ipsq.
 *
 * Need to do ill_pending_mp_release on ioctl completion, which could
 * happen here. (along with mi_copy_done)
 */
/* ARGSUSED */
static void
ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
{
	dl_ok_ack_t	*dloa = (dl_ok_ack_t *)mp->b_rptr;
	dl_error_ack_t	*dlea = (dl_error_ack_t *)dloa;
	int		err = 0;
	ill_t		*ill;
	ipif_t		*ipif = NULL;
	mblk_t		*mp1 = NULL;
	conn_t		*connp = NULL;
	t_uscalar_t	physaddr_req;
	mblk_t		*mp_hw;
	union DL_primitives *dlp;
	boolean_t	success;
	boolean_t	ioctl_aborted = B_FALSE;
	boolean_t	log = B_TRUE;

	ip1dbg(("ip_rput_dlpi_writer .."));
	ill = (ill_t *)q->q_ptr;
	ASSERT(ipsq == ill->ill_phyint->phyint_ipsq);

	ASSERT(IAM_WRITER_ILL(ill));

	/*
	 * ipsq_pending_mp and ipsq_pending_ipif track each other. i.e.
	 * both are null or non-null. However we can assert that only
	 * after grabbing the ipsq_lock. So we don't make any assertion
	 * here and in other places in the code.
	 */
	ipif = ipsq->ipsq_pending_ipif;
	/*
	 * The current ioctl could have been aborted by the user and a new
	 * ioctl to bring up another ill could have started. We could still
	 * get a response from the driver later.
	 */
	if (ipif != NULL && ipif->ipif_ill != ill)
		ioctl_aborted = B_TRUE;

	switch (dloa->dl_primitive) {
	case DL_ERROR_ACK:
		switch (dlea->dl_error_primitive) {
		case DL_UNBIND_REQ:
		case DL_ATTACH_REQ:
		case DL_DETACH_REQ:
		case DL_INFO_REQ:
			ill_dlpi_done(ill, dlea->dl_error_primitive);
			break;
		case DL_NOTIFY_REQ:
			ill_dlpi_done(ill, DL_NOTIFY_REQ);
			log = B_FALSE;
			break;
		case DL_PHYS_ADDR_REQ:
			/*
			 * For IPv6 only, there are two additional
			 * phys_addr_req's sent to the driver to get the
			 * IPv6 token and lla. This allows IP to acquire
			 * the hardware address format for a given interface
			 * without having built in knowledge of the hardware
			 * address. ill_phys_addr_pend keeps track of the last
			 * DL_PAR sent so we know which response we are
			 * dealing with. ill_dlpi_done will update
			 * ill_phys_addr_pend when it sends the next req.
			 * We don't complete the IOCTL until all three DL_PARs
			 * have been attempted, so set *_len to 0 and break.
			 */
			physaddr_req = ill->ill_phys_addr_pend;
			ill_dlpi_done(ill, DL_PHYS_ADDR_REQ);
			if (physaddr_req == DL_IPV6_TOKEN) {
				ill->ill_token_length = 0;
				log = B_FALSE;
				break;
			} else if (physaddr_req == DL_IPV6_LINK_LAYER_ADDR) {
				ill->ill_nd_lla_len = 0;
				log = B_FALSE;
				break;
			}
			/*
			 * Something went wrong with the DL_PHYS_ADDR_REQ.
			 * We presumably have an IOCTL hanging out waiting
			 * for completion. Find it and complete the IOCTL
			 * with the error noted.
			 * However, ill_dl_phys was called on an ill queue
			 * (from SIOCSLIFNAME), thus conn_pending_ill is not
			 * set. But the ioctl is known to be pending on ill_wq.
			 */
			if (!ill->ill_ifname_pending)
				break;
			ill->ill_ifname_pending = 0;
			if (!ioctl_aborted)
				mp1 = ipsq_pending_mp_get(ipsq, &connp);
			if (mp1 != NULL) {
				/*
				 * This operation (SIOCSLIFNAME) must have
				 * happened on the ill. Assert there is no conn
				 */
				ASSERT(connp == NULL);
				q = ill->ill_wq;
			}
			break;
		case DL_BIND_REQ:
			ill_dlpi_done(ill, DL_BIND_REQ);
			if (ill->ill_ifname_pending)
				break;
			/*
			 * Something went wrong with the bind. We presumably
			 * have an IOCTL hanging out waiting for completion.
			 * Find it, take down the interface that was coming
			 * up, and complete the IOCTL with the error noted.
			 */
			if (!ioctl_aborted)
				mp1 = ipsq_pending_mp_get(ipsq, &connp);
			if (mp1 != NULL) {
				/*
				 * This operation (SIOCSLIFFLAGS) must have
				 * happened from a conn.
				 */
				ASSERT(connp != NULL);
				q = CONNP_TO_WQ(connp);
				if (ill->ill_move_in_progress) {
					ILL_CLEAR_MOVE(ill);
				}
				(void) ipif_down(ipif, NULL, NULL);
				/* error is set below the switch */
			}
			break;
		case DL_ENABMULTI_REQ:
			ip1dbg(("DL_ERROR_ACK to enabmulti\n"));

			if (ill->ill_dlpi_multicast_state == IDMS_INPROGRESS)
				ill->ill_dlpi_multicast_state = IDMS_FAILED;
			if (ill->ill_dlpi_multicast_state == IDMS_FAILED) {
				ipif_t *ipif;

				log = B_FALSE;
				printf("ip: joining multicasts failed (%d)"
				    " on %s - will use link layer "
				    "broadcasts for multicast\n",
				    dlea->dl_errno, ill->ill_name);

				/*
				 * Set up the multicast mapping alone.
				 * writer, so ok to access ill->ill_ipif
				 * without any lock.
				 */
				ipif = ill->ill_ipif;
				mutex_enter(&ill->ill_phyint->phyint_lock);
				ill->ill_phyint->phyint_flags |=
				    PHYI_MULTI_BCAST;
				mutex_exit(&ill->ill_phyint->phyint_lock);

				if (!ill->ill_isv6) {
					(void) ipif_arp_setup_multicast(ipif,
					    NULL);
				} else {
					(void) ipif_ndp_setup_multicast(ipif,
					    NULL);
				}
			}
			freemsg(mp);	/* Don't want to pass this up */
			return;
		case DL_CAPABILITY_REQ:
		case DL_CONTROL_REQ:
			ip1dbg(("ip_rput_dlpi_writer: got DL_ERROR_ACK for "
			    "DL_CAPABILITY/CONTROL REQ\n"));
			ill_dlpi_done(ill, dlea->dl_error_primitive);
			ill->ill_capab_state = IDMS_FAILED;
			freemsg(mp);
			return;
		}
		/*
		 * Note the error for IOCTL completion (mp1 is set when
		 * ready to complete ioctl). If ill_ifname_pending_err is
		 * set, an error occured during plumbing (ill_ifname_pending),
		 * so we want to report that error.
		 *
		 * NOTE: there are two addtional DL_PHYS_ADDR_REQ's
		 * (DL_IPV6_TOKEN and DL_IPV6_LINK_LAYER_ADDR) that are
		 * expected to get errack'd if the driver doesn't support
		 * these flags (e.g. ethernet). log will be set to B_FALSE
		 * if these error conditions are encountered.
		 */
		if (mp1 != NULL) {
			if (ill->ill_ifname_pending_err != 0) {
				err = ill->ill_ifname_pending_err;
				ill->ill_ifname_pending_err = 0;
			} else {
				err = dlea->dl_unix_errno ?
				    dlea->dl_unix_errno : ENXIO;
			}
		/*
		 * If we're plumbing an interface and an error hasn't already
		 * been saved, set ill_ifname_pending_err to the error passed
		 * up. Ignore the error if log is B_FALSE (see comment above).
		 */
		} else if (log && ill->ill_ifname_pending &&
		    ill->ill_ifname_pending_err == 0) {
			ill->ill_ifname_pending_err = dlea->dl_unix_errno ?
14367 dlea->dl_unix_errno : ENXIO; 14368 } 14369 14370 if (log) 14371 ip_dlpi_error(ill, dlea->dl_error_primitive, 14372 dlea->dl_errno, dlea->dl_unix_errno); 14373 break; 14374 case DL_CAPABILITY_ACK: { 14375 boolean_t reneg_flag = B_FALSE; 14376 /* Call a routine to handle this one. */ 14377 ill_dlpi_done(ill, DL_CAPABILITY_REQ); 14378 /* 14379 * Check if the ACK is due to renegotiation case since we 14380 * will need to send a new CAPABILITY_REQ later. 14381 */ 14382 if (ill->ill_capab_state == IDMS_RENEG) { 14383 /* This is the ack for a renogiation case */ 14384 reneg_flag = B_TRUE; 14385 ill->ill_capab_state = IDMS_UNKNOWN; 14386 } 14387 ill_capability_ack(ill, mp); 14388 if (reneg_flag) 14389 ill_capability_probe(ill); 14390 break; 14391 } 14392 case DL_CONTROL_ACK: 14393 /* We treat all of these as "fire and forget" */ 14394 ill_dlpi_done(ill, DL_CONTROL_REQ); 14395 break; 14396 case DL_INFO_ACK: 14397 /* Call a routine to handle this one. */ 14398 ill_dlpi_done(ill, DL_INFO_REQ); 14399 ip_ll_subnet_defaults(ill, mp); 14400 ASSERT(!MUTEX_HELD(&ill->ill_phyint->phyint_ipsq->ipsq_lock)); 14401 return; 14402 case DL_BIND_ACK: 14403 /* 14404 * We should have an IOCTL waiting on this unless 14405 * sent by ill_dl_phys, in which case just return 14406 */ 14407 ill_dlpi_done(ill, DL_BIND_REQ); 14408 if (ill->ill_ifname_pending) 14409 break; 14410 14411 if (!ioctl_aborted) 14412 mp1 = ipsq_pending_mp_get(ipsq, &connp); 14413 if (mp1 == NULL) 14414 break; 14415 ASSERT(connp != NULL); 14416 q = CONNP_TO_WQ(connp); 14417 14418 /* 14419 * We are exclusive. So nothing can change even after 14420 * we get the pending mp. If need be we can put it back 14421 * and restart, as in calling ipif_arp_up() below. 
14422 */ 14423 ip1dbg(("ip_rput_dlpi: bind_ack %s\n", ill->ill_name)); 14424 14425 mutex_enter(&ill->ill_lock); 14426 ill->ill_dl_up = 1; 14427 mutex_exit(&ill->ill_lock); 14428 14429 /* 14430 * Now bring up the resolver, when that is 14431 * done we'll create IREs and we are done. 14432 */ 14433 if (ill->ill_isv6) { 14434 /* 14435 * v6 interfaces. 14436 * Unlike ARP which has to do another bind 14437 * and attach, once we get here we are 14438 * done withh NDP. Except in the case of 14439 * ILLF_XRESOLV, in which case we send an 14440 * AR_INTERFACE_UP to the external resolver. 14441 * If all goes well, the ioctl will complete 14442 * in ip_rput(). If there's an error, we 14443 * complete it here. 14444 */ 14445 err = ipif_ndp_up(ipif, &ipif->ipif_v6lcl_addr, 14446 B_FALSE); 14447 if (err == 0) { 14448 if (ill->ill_flags & ILLF_XRESOLV) { 14449 mutex_enter(&connp->conn_lock); 14450 mutex_enter(&ill->ill_lock); 14451 success = ipsq_pending_mp_add( 14452 connp, ipif, q, mp1, 0); 14453 mutex_exit(&ill->ill_lock); 14454 mutex_exit(&connp->conn_lock); 14455 if (success) { 14456 err = ipif_resolver_up(ipif, 14457 B_FALSE); 14458 if (err == EINPROGRESS) { 14459 freemsg(mp); 14460 return; 14461 } 14462 ASSERT(err != 0); 14463 mp1 = ipsq_pending_mp_get(ipsq, 14464 &connp); 14465 ASSERT(mp1 != NULL); 14466 } else { 14467 /* conn has started closing */ 14468 err = EINTR; 14469 } 14470 } else { /* Non XRESOLV interface */ 14471 err = ipif_up_done_v6(ipif); 14472 } 14473 } 14474 } else if (ill->ill_net_type == IRE_IF_RESOLVER) { 14475 /* 14476 * ARP and other v4 external resolvers. 14477 * Leave the pending mblk intact so that 14478 * the ioctl completes in ip_rput(). 
14479 */ 14480 mutex_enter(&connp->conn_lock); 14481 mutex_enter(&ill->ill_lock); 14482 success = ipsq_pending_mp_add(connp, ipif, q, mp1, 0); 14483 mutex_exit(&ill->ill_lock); 14484 mutex_exit(&connp->conn_lock); 14485 if (success) { 14486 err = ipif_resolver_up(ipif, B_FALSE); 14487 if (err == EINPROGRESS) { 14488 freemsg(mp); 14489 return; 14490 } 14491 ASSERT(err != 0); 14492 mp1 = ipsq_pending_mp_get(ipsq, &connp); 14493 } else { 14494 /* The conn has started closing */ 14495 err = EINTR; 14496 } 14497 } else { 14498 /* 14499 * This one is complete. Reply to pending ioctl. 14500 */ 14501 err = ipif_up_done(ipif); 14502 } 14503 14504 if ((err == 0) && (ill->ill_up_ipifs)) { 14505 err = ill_up_ipifs(ill, q, mp1); 14506 if (err == EINPROGRESS) { 14507 freemsg(mp); 14508 return; 14509 } 14510 } 14511 14512 if (ill->ill_up_ipifs) { 14513 ill_group_cleanup(ill); 14514 } 14515 14516 break; 14517 case DL_NOTIFY_IND: { 14518 dl_notify_ind_t *notify = (dl_notify_ind_t *)mp->b_rptr; 14519 ire_t *ire; 14520 boolean_t need_ire_walk_v4 = B_FALSE; 14521 boolean_t need_ire_walk_v6 = B_FALSE; 14522 14523 /* 14524 * Change the address everywhere we need to. 14525 * What we're getting here is a link-level addr or phys addr. 14526 * The new addr is at notify + notify->dl_addr_offset 14527 * The address length is notify->dl_addr_length; 14528 */ 14529 switch (notify->dl_notification) { 14530 case DL_NOTE_PHYS_ADDR: 14531 mp_hw = copyb(mp); 14532 if (mp_hw == NULL) { 14533 err = ENOMEM; 14534 break; 14535 } 14536 dlp = (union DL_primitives *)mp_hw->b_rptr; 14537 /* 14538 * We currently don't support changing 14539 * the token via DL_NOTIFY_IND. 14540 * When we do support it, we have to consider 14541 * what the implications are with respect to 14542 * the token and the link local address. 
14543 */ 14544 mutex_enter(&ill->ill_lock); 14545 if (dlp->notify_ind.dl_data == 14546 DL_IPV6_LINK_LAYER_ADDR) { 14547 if (ill->ill_nd_lla_mp != NULL) 14548 freemsg(ill->ill_nd_lla_mp); 14549 ill->ill_nd_lla_mp = mp_hw; 14550 ill->ill_nd_lla = (uchar_t *)mp_hw->b_rptr + 14551 dlp->notify_ind.dl_addr_offset; 14552 ill->ill_nd_lla_len = 14553 dlp->notify_ind.dl_addr_length - 14554 ABS(ill->ill_sap_length); 14555 mutex_exit(&ill->ill_lock); 14556 break; 14557 } else if (dlp->notify_ind.dl_data == 14558 DL_CURR_PHYS_ADDR) { 14559 if (ill->ill_phys_addr_mp != NULL) 14560 freemsg(ill->ill_phys_addr_mp); 14561 ill->ill_phys_addr_mp = mp_hw; 14562 ill->ill_phys_addr = (uchar_t *)mp_hw->b_rptr + 14563 dlp->notify_ind.dl_addr_offset; 14564 ill->ill_phys_addr_length = 14565 dlp->notify_ind.dl_addr_length - 14566 ABS(ill->ill_sap_length); 14567 if (ill->ill_isv6 && 14568 !(ill->ill_flags & ILLF_XRESOLV)) { 14569 if (ill->ill_nd_lla_mp != NULL) 14570 freemsg(ill->ill_nd_lla_mp); 14571 ill->ill_nd_lla_mp = copyb(mp_hw); 14572 ill->ill_nd_lla = (uchar_t *) 14573 ill->ill_nd_lla_mp->b_rptr + 14574 dlp->notify_ind.dl_addr_offset; 14575 ill->ill_nd_lla_len = 14576 ill->ill_phys_addr_length; 14577 } 14578 } 14579 mutex_exit(&ill->ill_lock); 14580 /* 14581 * Send out gratuitous arp request for our new 14582 * hardware address. 14583 */ 14584 for (ipif = ill->ill_ipif; ipif != NULL; 14585 ipif = ipif->ipif_next) { 14586 if (!(ipif->ipif_flags & IPIF_UP)) 14587 continue; 14588 if (ill->ill_isv6) { 14589 ipif_ndp_down(ipif); 14590 /* 14591 * Set B_TRUE to enable 14592 * ipif_ndp_up() to send out 14593 * unsolicited advertisements. 
14594 */ 14595 err = ipif_ndp_up(ipif, 14596 &ipif->ipif_v6lcl_addr, 14597 B_TRUE); 14598 if (err) { 14599 ip1dbg(( 14600 "ip_rput_dlpi_writer: " 14601 "Failed to update ndp " 14602 "err %d\n", err)); 14603 } 14604 } else { 14605 /* 14606 * IPv4 ARP case 14607 * 14608 * Set B_TRUE, as we only want 14609 * ipif_resolver_up to send an 14610 * AR_ENTRY_ADD request up to 14611 * ARP. 14612 */ 14613 err = ipif_resolver_up(ipif, 14614 B_TRUE); 14615 if (err) { 14616 ip1dbg(( 14617 "ip_rput_dlpi_writer: " 14618 "Failed to update arp " 14619 "err %d\n", err)); 14620 } 14621 } 14622 } 14623 /* 14624 * Allow "fall through" to the DL_NOTE_FASTPATH_FLUSH 14625 * case so that all old fastpath information can be 14626 * purged from IRE caches. 14627 */ 14628 /* FALLTHRU */ 14629 case DL_NOTE_FASTPATH_FLUSH: 14630 /* 14631 * Any fastpath probe sent henceforth will get the 14632 * new fp mp. So we first delete any ires that are 14633 * waiting for the fastpath. Then walk all ires and 14634 * delete the ire or delete the fp mp. In the case of 14635 * IRE_MIPRTUN and IRE_BROADCAST it is difficult to 14636 * recreate the ire's without going through a complex 14637 * ipif up/down dance. So we don't delete the ire 14638 * itself, but just the ire_fp_mp for these 2 ire's 14639 * In the case of the other ire's we delete the ire's 14640 * themselves. Access to ire_fp_mp is completely 14641 * protected by ire_lock for IRE_MIPRTUN and 14642 * IRE_BROADCAST. Deleting the ire is preferable in the 14643 * other cases for performance. 
14644 */ 14645 if (ill->ill_isv6) { 14646 nce_fastpath_list_dispatch(ill, NULL, NULL); 14647 ndp_walk(ill, (pfi_t)ndp_fastpath_flush, 14648 NULL); 14649 } else { 14650 ire_fastpath_list_dispatch(ill, NULL, NULL); 14651 ire_walk_ill_v4(MATCH_IRE_WQ | MATCH_IRE_TYPE, 14652 IRE_CACHE | IRE_BROADCAST, 14653 ire_fastpath_flush, NULL, ill); 14654 mutex_enter(&ire_mrtun_lock); 14655 if (ire_mrtun_count != 0) { 14656 mutex_exit(&ire_mrtun_lock); 14657 ire_walk_ill_mrtun(MATCH_IRE_WQ, 14658 IRE_MIPRTUN, ire_fastpath_flush, 14659 NULL, ill); 14660 } else { 14661 mutex_exit(&ire_mrtun_lock); 14662 } 14663 } 14664 break; 14665 case DL_NOTE_SDU_SIZE: 14666 /* 14667 * Change the MTU size of the interface, of all 14668 * attached ipif's, and of all relevant ire's. The 14669 * new value's a uint32_t at notify->dl_data. 14670 * Mtu change Vs. new ire creation - protocol below. 14671 * 14672 * a Mark the ipif as IPIF_CHANGING. 14673 * b Set the new mtu in the ipif. 14674 * c Change the ire_max_frag on all affected ires 14675 * d Unmark the IPIF_CHANGING 14676 * 14677 * To see how the protocol works, assume an interface 14678 * route is also being added simultaneously by 14679 * ip_rt_add and let 'ipif' be the ipif referenced by 14680 * the ire. If the ire is created before step a, 14681 * it will be cleaned up by step c. If the ire is 14682 * created after step d, it will see the new value of 14683 * ipif_mtu. Any attempt to create the ire between 14684 * steps a to d will fail because of the IPIF_CHANGING 14685 * flag. Note that ire_create() is passed a pointer to 14686 * the ipif_mtu, and not the value. During ire_add 14687 * under the bucket lock, the ire_max_frag of the 14688 * new ire being created is set from the ipif/ire from 14689 * which it is being derived. 
14690 */ 14691 mutex_enter(&ill->ill_lock); 14692 ill->ill_max_frag = (uint_t)notify->dl_data; 14693 14694 /* 14695 * If an SIOCSLIFLNKINFO has changed the ill_max_mtu 14696 * leave it alone 14697 */ 14698 if (ill->ill_mtu_userspecified) { 14699 mutex_exit(&ill->ill_lock); 14700 break; 14701 } 14702 ill->ill_max_mtu = ill->ill_max_frag; 14703 if (ill->ill_isv6) { 14704 if (ill->ill_max_mtu < IPV6_MIN_MTU) 14705 ill->ill_max_mtu = IPV6_MIN_MTU; 14706 } else { 14707 if (ill->ill_max_mtu < IP_MIN_MTU) 14708 ill->ill_max_mtu = IP_MIN_MTU; 14709 } 14710 for (ipif = ill->ill_ipif; ipif != NULL; 14711 ipif = ipif->ipif_next) { 14712 /* 14713 * Don't override the mtu if the user 14714 * has explicitly set it. 14715 */ 14716 if (ipif->ipif_flags & IPIF_FIXEDMTU) 14717 continue; 14718 ipif->ipif_mtu = (uint_t)notify->dl_data; 14719 if (ipif->ipif_isv6) 14720 ire = ipif_to_ire_v6(ipif); 14721 else 14722 ire = ipif_to_ire(ipif); 14723 if (ire != NULL) { 14724 ire->ire_max_frag = ipif->ipif_mtu; 14725 ire_refrele(ire); 14726 } 14727 if (ipif->ipif_flags & IPIF_UP) { 14728 if (ill->ill_isv6) 14729 need_ire_walk_v6 = B_TRUE; 14730 else 14731 need_ire_walk_v4 = B_TRUE; 14732 } 14733 } 14734 mutex_exit(&ill->ill_lock); 14735 if (need_ire_walk_v4) 14736 ire_walk_v4(ill_mtu_change, (char *)ill, 14737 ALL_ZONES); 14738 if (need_ire_walk_v6) 14739 ire_walk_v6(ill_mtu_change, (char *)ill, 14740 ALL_ZONES); 14741 break; 14742 case DL_NOTE_LINK_UP: 14743 case DL_NOTE_LINK_DOWN: { 14744 /* 14745 * We are writer. ill / phyint / ipsq assocs stable. 14746 * The RUNNING flag reflects the state of the link. 14747 */ 14748 phyint_t *phyint = ill->ill_phyint; 14749 uint64_t new_phyint_flags; 14750 boolean_t changed = B_FALSE; 14751 14752 mutex_enter(&phyint->phyint_lock); 14753 new_phyint_flags = 14754 (notify->dl_notification == DL_NOTE_LINK_UP) ? 
14755 phyint->phyint_flags | PHYI_RUNNING : 14756 phyint->phyint_flags & ~PHYI_RUNNING; 14757 if (new_phyint_flags != phyint->phyint_flags) { 14758 phyint->phyint_flags = new_phyint_flags; 14759 changed = B_TRUE; 14760 } 14761 mutex_exit(&phyint->phyint_lock); 14762 /* 14763 * If the flags have changed, send a message to 14764 * the routing socket. 14765 */ 14766 if (changed) { 14767 if (phyint->phyint_illv4 != NULL) { 14768 ip_rts_ifmsg( 14769 phyint->phyint_illv4->ill_ipif); 14770 } 14771 if (phyint->phyint_illv6 != NULL) { 14772 ip_rts_ifmsg( 14773 phyint->phyint_illv6->ill_ipif); 14774 } 14775 } 14776 break; 14777 } 14778 case DL_NOTE_PROMISC_ON_PHYS: 14779 IPSECHW_DEBUG(IPSECHW_PKT, ("ip_rput_dlpi_writer: " 14780 "got a DL_NOTE_PROMISC_ON_PHYS\n")); 14781 mutex_enter(&ill->ill_lock); 14782 ill->ill_promisc_on_phys = B_TRUE; 14783 mutex_exit(&ill->ill_lock); 14784 break; 14785 case DL_NOTE_PROMISC_OFF_PHYS: 14786 IPSECHW_DEBUG(IPSECHW_PKT, ("ip_rput_dlpi_writer: " 14787 "got a DL_NOTE_PROMISC_OFF_PHYS\n")); 14788 mutex_enter(&ill->ill_lock); 14789 ill->ill_promisc_on_phys = B_FALSE; 14790 mutex_exit(&ill->ill_lock); 14791 break; 14792 case DL_NOTE_CAPAB_RENEG: 14793 /* 14794 * Something changed on the driver side. 14795 * It wants us to renegotiate the capabilities 14796 * on this ill. The most likely cause is the 14797 * aggregation interface under us where a 14798 * port got added or went away. 14799 * 14800 * We reset the capabilities and set the 14801 * state to IDMS_RENG so that when the ack 14802 * comes back, we can start the 14803 * renegotiation process. 
14804 */ 14805 ill_capability_reset(ill); 14806 ill->ill_capab_state = IDMS_RENEG; 14807 break; 14808 default: 14809 ip0dbg(("ip_rput_dlpi_writer: unknown notification " 14810 "type 0x%x for DL_NOTIFY_IND\n", 14811 notify->dl_notification)); 14812 break; 14813 } 14814 14815 /* 14816 * As this is an asynchronous operation, we 14817 * should not call ill_dlpi_done 14818 */ 14819 break; 14820 } 14821 case DL_NOTIFY_ACK: 14822 /* 14823 * Don't really need to check for what notifications 14824 * are supported; we'll process what gets sent upstream, 14825 * and we know it'll be something we support changing 14826 * based on our DL_NOTIFY_REQ. 14827 */ 14828 ill_dlpi_done(ill, DL_NOTIFY_REQ); 14829 break; 14830 case DL_PHYS_ADDR_ACK: { 14831 /* 14832 * We should have an IOCTL waiting on this when request 14833 * sent by ill_dl_phys. 14834 * However, ill_dl_phys was called on an ill queue (from 14835 * SIOCSLIFNAME), thus conn_pending_ill is not set. But the 14836 * ioctl is known to be pending on ill_wq. 14837 * There are two additional phys_addr_req's sent to the 14838 * driver to get the token and lla. ill_phys_addr_pend 14839 * keeps track of the last one sent so we know which 14840 * response we are dealing with. ill_dlpi_done will 14841 * update ill_phys_addr_pend when it sends the next req. 14842 * We don't complete the IOCTL until all three DL_PARs 14843 * have been attempted. 14844 * 14845 * We don't need any lock to update ill_nd_lla* fields, 14846 * since the ill is not yet up, We grab the lock just 14847 * for uniformity with other code that accesses ill_nd_lla. 
14848 */ 14849 physaddr_req = ill->ill_phys_addr_pend; 14850 ill_dlpi_done(ill, DL_PHYS_ADDR_REQ); 14851 if (physaddr_req == DL_IPV6_TOKEN || 14852 physaddr_req == DL_IPV6_LINK_LAYER_ADDR) { 14853 if (physaddr_req == DL_IPV6_TOKEN) { 14854 /* 14855 * bcopy to low-order bits of ill_token 14856 * 14857 * XXX Temporary hack - currently, 14858 * all known tokens are 64 bits, 14859 * so I'll cheat for the moment. 14860 */ 14861 dlp = (union DL_primitives *)mp->b_rptr; 14862 14863 mutex_enter(&ill->ill_lock); 14864 bcopy((uchar_t *)(mp->b_rptr + 14865 dlp->physaddr_ack.dl_addr_offset), 14866 (void *)&ill->ill_token.s6_addr32[2], 14867 dlp->physaddr_ack.dl_addr_length); 14868 ill->ill_token_length = 14869 dlp->physaddr_ack.dl_addr_length; 14870 mutex_exit(&ill->ill_lock); 14871 } else { 14872 ASSERT(ill->ill_nd_lla_mp == NULL); 14873 mp_hw = copyb(mp); 14874 if (mp_hw == NULL) { 14875 err = ENOMEM; 14876 break; 14877 } 14878 dlp = (union DL_primitives *)mp_hw->b_rptr; 14879 mutex_enter(&ill->ill_lock); 14880 ill->ill_nd_lla_mp = mp_hw; 14881 ill->ill_nd_lla = (uchar_t *)mp_hw->b_rptr + 14882 dlp->physaddr_ack.dl_addr_offset; 14883 ill->ill_nd_lla_len = 14884 dlp->physaddr_ack.dl_addr_length; 14885 mutex_exit(&ill->ill_lock); 14886 } 14887 break; 14888 } 14889 ASSERT(physaddr_req == DL_CURR_PHYS_ADDR); 14890 ASSERT(ill->ill_phys_addr_mp == NULL); 14891 if (!ill->ill_ifname_pending) 14892 break; 14893 ill->ill_ifname_pending = 0; 14894 if (!ioctl_aborted) 14895 mp1 = ipsq_pending_mp_get(ipsq, &connp); 14896 if (mp1 != NULL) { 14897 ASSERT(connp == NULL); 14898 q = ill->ill_wq; 14899 } 14900 /* 14901 * If any error acks received during the plumbing sequence, 14902 * ill_ifname_pending_err will be set. Break out and send up 14903 * the error to the pending ioctl. 14904 */ 14905 if (ill->ill_ifname_pending_err != 0) { 14906 err = ill->ill_ifname_pending_err; 14907 ill->ill_ifname_pending_err = 0; 14908 break; 14909 } 14910 /* 14911 * Get the interface token. 
If the zeroth interface 14912 * address is zero then set the address to the link local 14913 * address 14914 */ 14915 mp_hw = copyb(mp); 14916 if (mp_hw == NULL) { 14917 err = ENOMEM; 14918 break; 14919 } 14920 dlp = (union DL_primitives *)mp_hw->b_rptr; 14921 ill->ill_phys_addr_mp = mp_hw; 14922 ill->ill_phys_addr = (uchar_t *)mp_hw->b_rptr + 14923 dlp->physaddr_ack.dl_addr_offset; 14924 if (dlp->physaddr_ack.dl_addr_length == 0 || 14925 ill->ill_phys_addr_length == 0 || 14926 ill->ill_phys_addr_length == IP_ADDR_LEN) { 14927 /* 14928 * Compatibility: atun driver returns a length of 0. 14929 * ipdptp has an ill_phys_addr_length of zero(from 14930 * DL_BIND_ACK) but a non-zero length here. 14931 * ipd has an ill_phys_addr_length of 4(from 14932 * DL_BIND_ACK) but a non-zero length here. 14933 */ 14934 ill->ill_phys_addr = NULL; 14935 } else if (dlp->physaddr_ack.dl_addr_length != 14936 ill->ill_phys_addr_length) { 14937 ip0dbg(("DL_PHYS_ADDR_ACK: " 14938 "Address length mismatch %d %d\n", 14939 dlp->physaddr_ack.dl_addr_length, 14940 ill->ill_phys_addr_length)); 14941 err = EINVAL; 14942 break; 14943 } 14944 mutex_enter(&ill->ill_lock); 14945 if (ill->ill_nd_lla_mp == NULL) { 14946 ill->ill_nd_lla_mp = copyb(mp_hw); 14947 if (ill->ill_nd_lla_mp == NULL) { 14948 err = ENOMEM; 14949 mutex_exit(&ill->ill_lock); 14950 break; 14951 } 14952 ill->ill_nd_lla = 14953 (uchar_t *)ill->ill_nd_lla_mp->b_rptr + 14954 dlp->physaddr_ack.dl_addr_offset; 14955 ill->ill_nd_lla_len = ill->ill_phys_addr_length; 14956 } 14957 mutex_exit(&ill->ill_lock); 14958 if (IN6_IS_ADDR_UNSPECIFIED(&ill->ill_token)) 14959 (void) ill_setdefaulttoken(ill); 14960 14961 /* 14962 * If the ill zero interface has a zero address assign 14963 * it the proper link local address. 
14964 */ 14965 ASSERT(ill->ill_ipif->ipif_id == 0); 14966 if (ipif != NULL && 14967 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) 14968 (void) ipif_setlinklocal(ipif); 14969 break; 14970 } 14971 case DL_OK_ACK: 14972 ip2dbg(("DL_OK_ACK %s (0x%x)\n", 14973 dlpi_prim_str((int)dloa->dl_correct_primitive), 14974 dloa->dl_correct_primitive)); 14975 switch (dloa->dl_correct_primitive) { 14976 case DL_UNBIND_REQ: 14977 case DL_ATTACH_REQ: 14978 case DL_DETACH_REQ: 14979 ill_dlpi_done(ill, dloa->dl_correct_primitive); 14980 break; 14981 } 14982 break; 14983 default: 14984 break; 14985 } 14986 14987 freemsg(mp); 14988 if (mp1) { 14989 struct iocblk *iocp; 14990 int mode; 14991 14992 /* 14993 * Complete the waiting IOCTL. For SIOCLIFADDIF or 14994 * SIOCSLIFNAME do a copyout. 14995 */ 14996 iocp = (struct iocblk *)mp1->b_rptr; 14997 14998 if (iocp->ioc_cmd == SIOCLIFADDIF || 14999 iocp->ioc_cmd == SIOCSLIFNAME) 15000 mode = COPYOUT; 15001 else 15002 mode = NO_COPYOUT; 15003 /* 15004 * The ioctl must complete now without EINPROGRESS 15005 * since ipsq_pending_mp_get has removed the ioctl mblk 15006 * from ipsq_pending_mp. Otherwise the ioctl will be 15007 * stuck forever in the ipsq. 15008 */ 15009 ASSERT(err != EINPROGRESS); 15010 ip_ioctl_finish(q, mp1, err, mode, ipif, ipsq); 15011 15012 } 15013 } 15014 15015 /* 15016 * ip_rput_other is called by ip_rput to handle messages modifying the global 15017 * state in IP. Normally called as writer. Exception SIOCGTUNPARAM (shared) 15018 */ 15019 /* ARGSUSED */ /* NOTE(review): ipsq is NULL only for the shared-access SIOCGTUNPARAM/OSIOCGTUNPARAM path; every other arm requires being the ipsq writer (asserted below). */ 15020 void 15021 ip_rput_other(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) 15022 { 15023 ill_t *ill; 15024 struct iocblk *iocp; 15025 mblk_t *mp1; 15026 conn_t *connp = NULL; 15027 15028 ip1dbg(("ip_rput_other ")); 15029 ill = (ill_t *)q->q_ptr; 15030 /* 15031 * This routine is not a writer in the case of SIOCGTUNPARAM 15032 * in which case ipsq is NULL.
15033 */ 15034 if (ipsq != NULL) { 15035 ASSERT(IAM_WRITER_IPSQ(ipsq)); 15036 ASSERT(ipsq == ill->ill_phyint->phyint_ipsq); 15037 } 15038 15039 switch (mp->b_datap->db_type) { 15040 case M_ERROR: 15041 case M_HANGUP: 15042 /* 15043 * The device has a problem. We force the ILL down. It can 15044 * be brought up again manually using SIOCSIFFLAGS (via 15045 * ifconfig or equivalent). 15046 */ 15047 ASSERT(ipsq != NULL); 15048 if (mp->b_rptr < mp->b_wptr) 15049 ill->ill_error = (int)(*mp->b_rptr & 0xFF); 15050 if (ill->ill_error == 0) 15051 ill->ill_error = ENXIO; 15052 if (!ill_down_start(q, mp)) 15053 return; 15054 ipif_all_down_tail(ipsq, q, mp, NULL); 15055 break; 15056 case M_IOCACK: 15057 iocp = (struct iocblk *)mp->b_rptr; 15058 ASSERT(iocp->ioc_cmd != DL_IOC_HDR_INFO); 15059 switch (iocp->ioc_cmd) { 15060 case SIOCSTUNPARAM: 15061 case OSIOCSTUNPARAM: 15062 ASSERT(ipsq != NULL); 15063 /* 15064 * Finish socket ioctl passed through to tun. 15065 * We should have an IOCTL waiting on this. 15066 */ 15067 mp1 = ipsq_pending_mp_get(ipsq, &connp); 15068 if (ill->ill_isv6) { 15069 struct iftun_req *ta; 15070 15071 /* 15072 * if a source or destination is 15073 * being set, try and set the link 15074 * local address for the tunnel 15075 */ 15076 ta = (struct iftun_req *)mp->b_cont-> 15077 b_cont->b_rptr; 15078 if (ta->ifta_flags & (IFTUN_SRC | IFTUN_DST)) { 15079 ipif_set_tun_llink(ill, ta); 15080 } 15081 15082 } 15083 if (mp1 != NULL) { 15084 /* 15085 * Now copy back the b_next/b_prev used by 15086 * mi code for the mi_copy* functions. 15087 * See ip_sioctl_tunparam() for the reason. 15088 * Also protect against missing b_cont.
15089 */ 15090 if (mp->b_cont != NULL) { 15091 mp->b_cont->b_next = 15092 mp1->b_cont->b_next; 15093 mp->b_cont->b_prev = 15094 mp1->b_cont->b_prev; 15095 } 15096 inet_freemsg(mp1); 15097 ASSERT(ipsq->ipsq_current_ipif != NULL); 15098 ASSERT(connp != NULL); 15099 ip_ioctl_finish(CONNP_TO_WQ(connp), mp, 15100 iocp->ioc_error, NO_COPYOUT, 15101 ipsq->ipsq_current_ipif, ipsq); 15102 } else { 15103 ASSERT(connp == NULL); 15104 putnext(q, mp); 15105 } 15106 break; 15107 case SIOCGTUNPARAM: 15108 case OSIOCGTUNPARAM: 15109 /* 15110 * This is really M_IOCDATA from the tunnel driver. 15111 * convert back and complete the ioctl. 15112 * We should have an IOCTL waiting on this. 15113 */ 15114 mp1 = ill_pending_mp_get(ill, &connp, iocp->ioc_id); 15115 if (mp1) { 15116 /* 15117 * Now copy back the b_next/b_prev used by 15118 * mi code for the mi_copy* functions. 15119 * See ip_sioctl_tunparam() for the reason. 15120 * Also protect against missing b_cont. 15121 */ 15122 if (mp->b_cont != NULL) { 15123 mp->b_cont->b_next = 15124 mp1->b_cont->b_next; 15125 mp->b_cont->b_prev = 15126 mp1->b_cont->b_prev; 15127 } 15128 inet_freemsg(mp1); 15129 if (iocp->ioc_error == 0) 15130 mp->b_datap->db_type = M_IOCDATA; 15131 ASSERT(connp != NULL); 15132 ip_ioctl_finish(CONNP_TO_WQ(connp), mp, 15133 iocp->ioc_error, COPYOUT, NULL, NULL); 15134 } else { 15135 ASSERT(connp == NULL); 15136 putnext(q, mp); 15137 } 15138 break; 15139 default: 15140 break; 15141 } 15142 break; 15143 case M_IOCNAK: 15144 iocp = (struct iocblk *)mp->b_rptr; 15145 15146 switch (iocp->ioc_cmd) { 15147 int mode; 15148 ipif_t *ipif; 15149 15150 case DL_IOC_HDR_INFO: 15151 /* 15152 * If this was the first attempt turn off the 15153 * fastpath probing.
15154 */ 15155 mutex_enter(&ill->ill_lock); 15156 if (ill->ill_dlpi_fastpath_state == IDMS_INPROGRESS) { 15157 ill->ill_dlpi_fastpath_state = IDMS_FAILED; 15158 mutex_exit(&ill->ill_lock); 15159 ill_fastpath_nack(ill); 15160 ip1dbg(("ip_rput: DLPI fastpath off on " 15161 "interface %s\n", 15162 ill->ill_name)); 15163 } else { 15164 mutex_exit(&ill->ill_lock); 15165 } 15166 freemsg(mp); 15167 break; 15168 case SIOCSTUNPARAM: 15169 case OSIOCSTUNPARAM: 15170 ASSERT(ipsq != NULL); 15171 /* 15172 * Finish socket ioctl passed through to tun 15173 * We should have an IOCTL waiting on this. 15174 */ 15175 /* FALLTHRU */ 15176 case SIOCGTUNPARAM: 15177 case OSIOCGTUNPARAM: 15178 /* 15179 * This is really M_IOCDATA from the tunnel driver. 15180 * convert back and complete the ioctl. 15181 * We should have an IOCTL waiting on this. 15182 */ 15183 if (iocp->ioc_cmd == SIOCGTUNPARAM || 15184 iocp->ioc_cmd == OSIOCGTUNPARAM) { 15185 mp1 = ill_pending_mp_get(ill, &connp, 15186 iocp->ioc_id); 15187 mode = COPYOUT; 15188 ipsq = NULL; 15189 ipif = NULL; 15190 } else { 15191 mp1 = ipsq_pending_mp_get(ipsq, &connp); 15192 mode = NO_COPYOUT; 15193 ASSERT(ipsq->ipsq_current_ipif != NULL); 15194 ipif = ipsq->ipsq_current_ipif; 15195 } 15196 if (mp1 != NULL) { 15197 /* 15198 * Now copy back the b_next/b_prev used by 15199 * mi code for the mi_copy* functions. 15200 * See ip_sioctl_tunparam() for the reason. 15201 * Also protect against missing b_cont.
15202 */ 15203 if (mp->b_cont != NULL) { 15204 mp->b_cont->b_next = 15205 mp1->b_cont->b_next; 15206 mp->b_cont->b_prev = 15207 mp1->b_cont->b_prev; 15208 } 15209 inet_freemsg(mp1); 15210 if (iocp->ioc_error == 0) 15211 iocp->ioc_error = EINVAL; 15212 ASSERT(connp != NULL); 15213 ip_ioctl_finish(CONNP_TO_WQ(connp), mp, 15214 iocp->ioc_error, mode, ipif, ipsq); 15215 } else { 15216 ASSERT(connp == NULL); 15217 putnext(q, mp); 15218 } 15219 break; 15220 default: 15221 break; 15222 } 15223 default: 15224 break; 15225 } 15226 } 15227 15228 /* 15229 * NOTE : This function does not ire_refrele the ire argument passed in. 15230 * 15231 * IPQoS notes 15232 * IP policy is invoked twice for a forwarded packet, once on the read side 15233 * and again on the write side if both, IPP_FWD_IN and IPP_FWD_OUT are 15234 * enabled. An additional parameter, in_ill, has been added for this purpose. 15235 * Note that in_ill could be NULL when called from ip_rput_forward_multicast 15236 * because ip_mroute drops this information. 15237 * 15238 */ /* NOTE(review): forwards one IPv4 datagram out via 'ire' — decrements the TTL (patching the header checksum incrementally), enforces interface-up and MTU, applies IPPF and MAC-label policy, updates IP options, then fragments or transmits; 'ire' is not refrele'd here (see note above). */ 15239 void 15240 ip_rput_forward(ire_t *ire, ipha_t *ipha, mblk_t *mp, ill_t *in_ill) 15241 { 15242 uint32_t pkt_len; 15243 queue_t *q; 15244 uint32_t sum; 15245 #define rptr ((uchar_t *)ipha) 15246 uint32_t max_frag; 15247 uint32_t ill_index; 15248 15249 /* Get the ill_index of the incoming ILL */ 15250 ill_index = (in_ill != NULL) ? in_ill->ill_phyint->phyint_ifindex : 0; 15251 15252 /* Initiate Read side IPPF processing */ 15253 if (IPP_ENABLED(IPP_FWD_IN)) { 15254 ip_process(IPP_FWD_IN, &mp, ill_index); 15255 if (mp == NULL) { 15256 ip2dbg(("ip_rput_forward: pkt dropped/deferred "\ 15257 "during IPPF processing\n")); 15258 return; 15259 } 15260 } 15261 pkt_len = ntohs(ipha->ipha_length); 15262 15263 /* Adjust the checksum to reflect the ttl decrement.
*/ 15264 sum = (int)ipha->ipha_hdr_checksum + IP_HDR_CSUM_TTL_ADJUST; 15265 ipha->ipha_hdr_checksum = (uint16_t)(sum + (sum >> 16)); 15266 15267 if (ipha->ipha_ttl-- <= 1) { /* TTL expired: validate the header checksum before generating ICMP time exceeded */ 15268 if (ip_csum_hdr(ipha)) { 15269 BUMP_MIB(&ip_mib, ipInCksumErrs); 15270 goto drop_pkt; 15271 } 15272 /* 15273 * Note: ire_stq this will be NULL for multicast 15274 * datagrams using the long path through arp (the IRE 15275 * is not an IRE_CACHE). This should not cause 15276 * problems since we don't generate ICMP errors for 15277 * multicast packets. 15278 */ 15279 q = ire->ire_stq; 15280 if (q) 15281 icmp_time_exceeded(q, mp, ICMP_TTL_EXCEEDED); 15282 else 15283 freemsg(mp); 15284 return; 15285 } 15286 15287 /* 15288 * Don't forward if the interface is down 15289 */ 15290 if (ire->ire_ipif->ipif_ill->ill_ipif_up_count == 0) { 15291 BUMP_MIB(&ip_mib, ipInDiscards); 15292 goto drop_pkt; 15293 } 15294 15295 /* Get the ill_index of the outgoing ILL */ 15296 ill_index = ire->ire_ipif->ipif_ill->ill_phyint->phyint_ifindex; 15297 15298 if (is_system_labeled()) { 15299 mblk_t *mp1; 15300 15301 if ((mp1 = tsol_ip_forward(ire, mp)) == NULL) { 15302 BUMP_MIB(&ip_mib, ipForwProhibits); 15303 goto drop_pkt; 15304 } 15305 /* Size may have changed */ 15306 mp = mp1; 15307 ipha = (ipha_t *)mp->b_rptr; 15308 pkt_len = ntohs(ipha->ipha_length); 15309 } 15310 15311 /* Check if there are options to update */ 15312 if (!IS_SIMPLE_IPH(ipha)) { 15313 if (ip_csum_hdr(ipha)) { 15314 BUMP_MIB(&ip_mib, ipInCksumErrs); 15315 goto drop_pkt; 15316 } 15317 if (ip_rput_forward_options(mp, ipha, ire)) { 15318 return; 15319 } 15320 15321 ipha->ipha_hdr_checksum = 0; 15322 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 15323 } 15324 max_frag = ire->ire_max_frag; 15325 if (pkt_len > max_frag) { 15326 /* 15327 * It needs fragging on its way out. We haven't 15328 * verified the header checksum yet.
Since we 15329 * are going to put a surely good checksum in the 15330 * outgoing header, we have to make sure that it 15331 * was good coming in. 15332 */ 15333 if (ip_csum_hdr(ipha)) { 15334 BUMP_MIB(&ip_mib, ipInCksumErrs); 15335 goto drop_pkt; 15336 } 15337 /* Initiate Write side IPPF processing */ 15338 if (IPP_ENABLED(IPP_FWD_OUT)) { 15339 ip_process(IPP_FWD_OUT, &mp, ill_index); 15340 if (mp == NULL) { 15341 ip2dbg(("ip_rput_forward: pkt dropped/deferred"\ 15342 " during IPPF processing\n")); 15343 return; 15344 } 15345 } 15346 ip_wput_frag(ire, mp, IB_PKT, max_frag, 0); 15347 return; 15348 } 15349 15350 mp = ip_wput_attach_llhdr(mp, ire, IPP_FWD_OUT, ill_index); 15351 if (mp == NULL) { 15352 BUMP_MIB(&ip_mib, ipInDiscards); 15353 return; 15354 } 15355 15356 q = ire->ire_stq; 15357 UPDATE_IB_PKT_COUNT(ire); 15358 ire->ire_last_used_time = lbolt; 15359 BUMP_MIB(&ip_mib, ipForwDatagrams); 15360 putnext(q, mp); 15361 return; 15362 15363 drop_pkt:; 15364 ip1dbg(("ip_rput_forward: drop pkt\n")); 15365 freemsg(mp); 15366 #undef rptr 15367 } 15368 /* NOTE(review): look up (or trigger creation of) an IRE_CACHE for 'dst' on 'ipif' and forward the packet via ip_rput_forward; IPv4 only (asserted below). */ 15369 void 15370 ip_rput_forward_multicast(ipaddr_t dst, mblk_t *mp, ipif_t *ipif) 15371 { 15372 ire_t *ire; 15373 15374 ASSERT(!ipif->ipif_isv6); 15375 /* 15376 * Find an IRE which matches the destination and the outgoing 15377 * queue in the cache table. All we need is an IRE_CACHE which 15378 * is pointing at ipif->ipif_ill. If it is part of some ill group, 15379 * then it is enough to have some IRE_CACHE in the group. 15380 */ 15381 if (ipif->ipif_flags & IPIF_POINTOPOINT) 15382 dst = ipif->ipif_pp_dst_addr; 15383 ire = ire_ctable_lookup(dst, 0, 0, ipif, ALL_ZONES, MBLK_GETLABEL(mp), 15384 MATCH_IRE_ILL_GROUP | MATCH_IRE_SECATTR); 15385 if (ire == NULL) { 15386 /* 15387 * Mark this packet to make it be delivered to 15388 * ip_rput_forward after the new ire has been 15389 * created.
15390 */ 15391 mp->b_prev = NULL; 15392 mp->b_next = mp; /* b_next pointing at itself marks the queued packet for re-delivery once the new ire exists */ 15393 ip_newroute_ipif(ipif->ipif_ill->ill_wq, mp, ipif, dst, 15394 NULL, 0); 15395 } else { 15396 ip_rput_forward(ire, (ipha_t *)mp->b_rptr, mp, NULL); 15397 IRE_REFRELE(ire); 15398 } 15399 } 15400 15401 /* Update any source route, record route or timestamp options */ 15402 static int 15403 ip_rput_forward_options(mblk_t *mp, ipha_t *ipha, ire_t *ire) 15404 { 15405 ipoptp_t opts; 15406 uchar_t *opt; 15407 uint8_t optval; 15408 uint8_t optlen; 15409 ipaddr_t dst; 15410 uint32_t ts; 15411 ire_t *dst_ire = NULL; 15412 ire_t *tmp_ire = NULL; 15413 timestruc_t now; 15414 15415 ip2dbg(("ip_rput_forward_options\n")); 15416 dst = ipha->ipha_dst; 15417 for (optval = ipoptp_first(&opts, ipha); 15418 optval != IPOPT_EOL; 15419 optval = ipoptp_next(&opts)) { 15420 ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0); 15421 opt = opts.ipoptp_cur; 15422 optlen = opts.ipoptp_len; 15423 ip2dbg(("ip_rput_forward_options: opt %d, len %d\n", 15424 optval, opts.ipoptp_len)); 15425 switch (optval) { 15426 uint32_t off; 15427 case IPOPT_SSRR: 15428 case IPOPT_LSRR: 15429 /* Check if administratively disabled */ 15430 if (!ip_forward_src_routed) { 15431 BUMP_MIB(&ip_mib, ipForwProhibits); 15432 if (ire->ire_stq) 15433 icmp_unreachable(ire->ire_stq, mp, 15434 ICMP_SOURCE_ROUTE_FAILED); 15435 else { 15436 ip0dbg(("ip_rput_forward_options: " 15437 "unable to send unreach\n")); 15438 freemsg(mp); 15439 } 15440 return (-1); 15441 } 15442 15443 dst_ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, 15444 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE); 15445 if (dst_ire == NULL) { 15446 /* 15447 * Must be partial since ip_rput_options 15448 * checked for strict.
15449 */ 15450 break; 15451 } 15452 off = opt[IPOPT_OFFSET]; 15453 off--; 15454 redo_srr: 15455 if (optlen < IP_ADDR_LEN || 15456 off > optlen - IP_ADDR_LEN) { 15457 /* End of source route */ 15458 ip1dbg(( 15459 "ip_rput_forward_options: end of SR\n")); 15460 ire_refrele(dst_ire); 15461 break; 15462 } 15463 bcopy((char *)opt + off, &dst, IP_ADDR_LEN); 15464 bcopy(&ire->ire_src_addr, (char *)opt + off, 15465 IP_ADDR_LEN); 15466 ip1dbg(("ip_rput_forward_options: next hop 0x%x\n", 15467 ntohl(dst))); 15468 15469 /* 15470 * Check if our address is present more than 15471 * once as consecutive hops in source route. 15472 */ 15473 tmp_ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, 15474 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE); 15475 if (tmp_ire != NULL) { 15476 ire_refrele(tmp_ire); 15477 off += IP_ADDR_LEN; 15478 opt[IPOPT_OFFSET] += IP_ADDR_LEN; 15479 goto redo_srr; 15480 } 15481 ipha->ipha_dst = dst; 15482 opt[IPOPT_OFFSET] += IP_ADDR_LEN; 15483 ire_refrele(dst_ire); 15484 break; 15485 case IPOPT_RR: 15486 off = opt[IPOPT_OFFSET]; 15487 off--; 15488 if (optlen < IP_ADDR_LEN || 15489 off > optlen - IP_ADDR_LEN) { 15490 /* No more room - ignore */ 15491 ip1dbg(( 15492 "ip_rput_forward_options: end of RR\n")); 15493 break; 15494 } 15495 bcopy(&ire->ire_src_addr, (char *)opt + off, 15496 IP_ADDR_LEN); 15497 opt[IPOPT_OFFSET] += IP_ADDR_LEN; 15498 break; 15499 case IPOPT_TS: 15500 /* Insert timestamp if there is room */ 15501 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) { 15502 case IPOPT_TS_TSONLY: 15503 off = IPOPT_TS_TIMELEN; 15504 break; 15505 case IPOPT_TS_PRESPEC: 15506 case IPOPT_TS_PRESPEC_RFC791: 15507 /* Verify that the address matched */ 15508 off = opt[IPOPT_OFFSET] - 1; 15509 bcopy((char *)opt + off, &dst, IP_ADDR_LEN); 15510 dst_ire = ire_ctable_lookup(dst, 0, 15511 IRE_LOCAL, NULL, ALL_ZONES, NULL, 15512 MATCH_IRE_TYPE); 15513 15514 if (dst_ire == NULL) { 15515 /* Not for us */ 15516 break; 15517 } 15518 ire_refrele(dst_ire); 15519 /* FALLTHRU */ 15520 case 
IPOPT_TS_TSANDADDR: 15521 off = IP_ADDR_LEN + IPOPT_TS_TIMELEN; 15522 break; 15523 default: 15524 /* 15525 * ip_*put_options should have already 15526 * dropped this packet. 15527 */ 15528 cmn_err(CE_PANIC, "ip_rput_forward_options: " 15529 "unknown IT - bug in ip_rput_options?\n"); 15530 return (0); /* Keep "lint" happy */ 15531 } 15532 if (opt[IPOPT_OFFSET] - 1 + off > optlen) { 15533 /* Increase overflow counter */ 15534 off = (opt[IPOPT_POS_OV_FLG] >> 4) + 1; 15535 opt[IPOPT_POS_OV_FLG] = 15536 (uint8_t)((opt[IPOPT_POS_OV_FLG] & 0x0F) | 15537 (off << 4)); 15538 break; 15539 } 15540 off = opt[IPOPT_OFFSET] - 1; 15541 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) { 15542 case IPOPT_TS_PRESPEC: 15543 case IPOPT_TS_PRESPEC_RFC791: 15544 case IPOPT_TS_TSANDADDR: 15545 bcopy(&ire->ire_src_addr, 15546 (char *)opt + off, IP_ADDR_LEN); 15547 opt[IPOPT_OFFSET] += IP_ADDR_LEN; 15548 /* FALLTHRU */ 15549 case IPOPT_TS_TSONLY: 15550 off = opt[IPOPT_OFFSET] - 1; 15551 /* Compute # of milliseconds since midnight */ 15552 gethrestime(&now); 15553 ts = (now.tv_sec % (24 * 60 * 60)) * 1000 + 15554 now.tv_nsec / (NANOSEC / MILLISEC); 15555 bcopy(&ts, (char *)opt + off, IPOPT_TS_TIMELEN); 15556 opt[IPOPT_OFFSET] += IPOPT_TS_TIMELEN; 15557 break; 15558 } 15559 break; 15560 } 15561 } 15562 return (0); 15563 } 15564 15565 /* 15566 * This is called after processing at least one of AH/ESP headers. 15567 * 15568 * NOTE: the ill corresponding to ipsec_in_ill_index may not be 15569 * the actual, physical interface on which the packet was received, 15570 * but, when ip_strict_dst_multihoming is set to 1, could be the 15571 * interface which had the ipha_dst configured when the packet went 15572 * through ip_rput. 
The ill_index corresponding to the recv_ill
 * is saved in ipsec_in_rill_index
 */
void
ip_fanout_proto_again(mblk_t *ipsec_mp, ill_t *ill, ill_t *recv_ill, ire_t *ire)
{
	mblk_t *mp;
	ipaddr_t dst;
	in6_addr_t *v6dstp;
	ipha_t *ipha;
	ip6_t *ip6h;
	ipsec_in_t *ii;
	boolean_t ill_need_rele = B_FALSE;
	boolean_t rill_need_rele = B_FALSE;
	boolean_t ire_need_rele = B_FALSE;

	ii = (ipsec_in_t *)ipsec_mp->b_rptr;
	ASSERT(ii->ipsec_in_ill_index != 0);

	mp = ipsec_mp->b_cont;
	ASSERT(mp != NULL);


	if (ill == NULL) {
		ASSERT(recv_ill == NULL);
		/*
		 * We need to get the original queue on which ip_rput_local
		 * or ip_rput_data_v6 was called.
		 */
		ill = ill_lookup_on_ifindex(ii->ipsec_in_ill_index,
		    !ii->ipsec_in_v4, NULL, NULL, NULL, NULL);
		ill_need_rele = B_TRUE;

		if (ii->ipsec_in_ill_index != ii->ipsec_in_rill_index) {
			recv_ill = ill_lookup_on_ifindex(
			    ii->ipsec_in_rill_index, !ii->ipsec_in_v4,
			    NULL, NULL, NULL, NULL);
			rill_need_rele = B_TRUE;
		} else {
			recv_ill = ill;
		}

		if ((ill == NULL) || (recv_ill == NULL)) {
			ip0dbg(("ip_fanout_proto_again: interface "
			    "disappeared\n"));
			if (ill != NULL)
				ill_refrele(ill);
			if (recv_ill != NULL)
				ill_refrele(recv_ill);
			freemsg(ipsec_mp);
			return;
		}
	}

	ASSERT(ill != NULL && recv_ill != NULL);

	if (mp->b_datap->db_type == M_CTL) {
		/*
		 * AH/ESP is returning the ICMP message after
		 * removing their headers. Fanout again till
		 * it gets to the right protocol.
		 */
		if (ii->ipsec_in_v4) {
			icmph_t *icmph;
			int iph_hdr_length;
			int hdr_length;

			ipha = (ipha_t *)mp->b_rptr;
			iph_hdr_length = IPH_HDR_LENGTH(ipha);
			icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
			/* inner (offending) IP header follows the icmph */
			ipha = (ipha_t *)&icmph[1];
			hdr_length = IPH_HDR_LENGTH(ipha);
			/*
			 * icmp_inbound_error_fanout may need to do pullupmsg.
			 * Reset the type to M_DATA.
			 */
			mp->b_datap->db_type = M_DATA;
			icmp_inbound_error_fanout(ill->ill_rq, ill, ipsec_mp,
			    icmph, ipha, iph_hdr_length, hdr_length, B_TRUE,
			    B_FALSE, ill, ii->ipsec_in_zoneid);
		} else {
			icmp6_t *icmp6;
			int hdr_length;

			ip6h = (ip6_t *)mp->b_rptr;
			/* Don't call hdr_length_v6() unless you have to. */
			if (ip6h->ip6_nxt != IPPROTO_ICMPV6)
				hdr_length = ip_hdr_length_v6(mp, ip6h);
			else
				hdr_length = IPV6_HDR_LEN;

			icmp6 = (icmp6_t *)(&mp->b_rptr[hdr_length]);
			/*
			 * icmp_inbound_error_fanout_v6 may need to do
			 * pullupmsg. Reset the type to M_DATA.
			 */
			mp->b_datap->db_type = M_DATA;
			icmp_inbound_error_fanout_v6(ill->ill_rq, ipsec_mp,
			    ip6h, icmp6, ill, B_TRUE, ii->ipsec_in_zoneid);
		}
		if (ill_need_rele)
			ill_refrele(ill);
		if (rill_need_rele)
			ill_refrele(recv_ill);
		return;
	}

	if (ii->ipsec_in_v4) {
		ipha = (ipha_t *)mp->b_rptr;
		dst = ipha->ipha_dst;
		if (CLASSD(dst)) {
			/*
			 * Multicast has to be delivered to all streams.
			 */
			dst = INADDR_BROADCAST;
		}

		if (ire == NULL) {
			ire = ire_cache_lookup(dst, ii->ipsec_in_zoneid,
			    MBLK_GETLABEL(mp));
			if (ire == NULL) {
				if (ill_need_rele)
					ill_refrele(ill);
				if (rill_need_rele)
					ill_refrele(recv_ill);
				ip1dbg(("ip_fanout_proto_again: "
				    "IRE not found"));
				freemsg(ipsec_mp);
				return;
			}
			/* We looked it up, so we are responsible to release */
			ire_need_rele = B_TRUE;
		}

		switch (ipha->ipha_protocol) {
		case IPPROTO_UDP:
			ip_udp_input(ill->ill_rq, ipsec_mp, ipha, ire,
			    recv_ill);
			if (ire_need_rele)
				ire_refrele(ire);
			break;
		case IPPROTO_TCP:
			if (!ire_need_rele)
				IRE_REFHOLD(ire);
			mp = ip_tcp_input(mp, ipha, ill, B_TRUE,
			    ire, ipsec_mp, 0, ill->ill_rq, NULL);
			IRE_REFRELE(ire);
			if (mp != NULL)
				squeue_enter_chain(GET_SQUEUE(mp), mp,
				    mp, 1, SQTAG_IP_PROTO_AGAIN);
			break;
		case IPPROTO_SCTP:
			if (!ire_need_rele)
				IRE_REFHOLD(ire);
			ip_sctp_input(mp, ipha, ill, B_TRUE, ire,
			    ipsec_mp, 0, ill->ill_rq, dst);
			break;
		default:
			ip_proto_input(ill->ill_rq, ipsec_mp, ipha, ire,
			    recv_ill);
			if (ire_need_rele)
				ire_refrele(ire);
			break;
		}
	} else {
		uint32_t rput_flags = 0;

		ip6h = (ip6_t *)mp->b_rptr;
		v6dstp = &ip6h->ip6_dst;
		/*
		 * XXX Assumes ip_rput_v6 sets ll_multicast only for multicast
		 * address.
		 *
		 * Currently, we don't store that state in the IPSEC_IN
		 * message, and we may need to.
		 */
		rput_flags |= (IN6_IS_ADDR_MULTICAST(v6dstp) ?
		    IP6_IN_LLMCAST : 0);
		ip_rput_data_v6(ill->ill_rq, ill, ipsec_mp, ip6h, rput_flags,
		    NULL);
	}
	if (ill_need_rele)
		ill_refrele(ill);
	if (rill_need_rele)
		ill_refrele(recv_ill);
}

/*
 * Call ill_frag_timeout to do garbage collection.
ill_frag_timeout 15760 * returns 'true' if there are still fragments left on the queue, in 15761 * which case we restart the timer. 15762 */ 15763 void 15764 ill_frag_timer(void *arg) 15765 { 15766 ill_t *ill = (ill_t *)arg; 15767 boolean_t frag_pending; 15768 15769 mutex_enter(&ill->ill_lock); 15770 ASSERT(!ill->ill_fragtimer_executing); 15771 if (ill->ill_state_flags & ILL_CONDEMNED) { 15772 ill->ill_frag_timer_id = 0; 15773 mutex_exit(&ill->ill_lock); 15774 return; 15775 } 15776 ill->ill_fragtimer_executing = 1; 15777 mutex_exit(&ill->ill_lock); 15778 15779 frag_pending = ill_frag_timeout(ill, ip_g_frag_timeout); 15780 15781 /* 15782 * Restart the timer, if we have fragments pending or if someone 15783 * wanted us to be scheduled again. 15784 */ 15785 mutex_enter(&ill->ill_lock); 15786 ill->ill_fragtimer_executing = 0; 15787 ill->ill_frag_timer_id = 0; 15788 if (frag_pending || ill->ill_fragtimer_needrestart) 15789 ill_frag_timer_start(ill); 15790 mutex_exit(&ill->ill_lock); 15791 } 15792 15793 void 15794 ill_frag_timer_start(ill_t *ill) 15795 { 15796 ASSERT(MUTEX_HELD(&ill->ill_lock)); 15797 15798 /* If the ill is closing or opening don't proceed */ 15799 if (ill->ill_state_flags & ILL_CONDEMNED) 15800 return; 15801 15802 if (ill->ill_fragtimer_executing) { 15803 /* 15804 * ill_frag_timer is currently executing. Just record the 15805 * the fact that we want the timer to be restarted. 15806 * ill_frag_timer will post a timeout before it returns, 15807 * ensuring it will be called again. 15808 */ 15809 ill->ill_fragtimer_needrestart = 1; 15810 return; 15811 } 15812 15813 if (ill->ill_frag_timer_id == 0) { 15814 /* 15815 * The timer is neither running nor is the timeout handler 15816 * executing. 
Post a timeout so that ill_frag_timer will be 15817 * called 15818 */ 15819 ill->ill_frag_timer_id = timeout(ill_frag_timer, ill, 15820 MSEC_TO_TICK(ip_g_frag_timo_ms >> 1)); 15821 ill->ill_fragtimer_needrestart = 0; 15822 } 15823 } 15824 15825 /* 15826 * This routine is needed for loopback when forwarding multicasts. 15827 * 15828 * IPQoS Notes: 15829 * IPPF processing is done in fanout routines. 15830 * Policy processing is done only if IPP_lOCAL_IN is enabled. Further, 15831 * processing for IPSec packets is done when it comes back in clear. 15832 * NOTE : The callers of this function need to do the ire_refrele for the 15833 * ire that is being passed in. 15834 */ 15835 void 15836 ip_proto_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, 15837 ill_t *recv_ill) 15838 { 15839 ill_t *ill = (ill_t *)q->q_ptr; 15840 uint32_t sum; 15841 uint32_t u1; 15842 uint32_t u2; 15843 int hdr_length; 15844 boolean_t mctl_present; 15845 mblk_t *first_mp = mp; 15846 mblk_t *hada_mp = NULL; 15847 ipha_t *inner_ipha; 15848 15849 TRACE_1(TR_FAC_IP, TR_IP_RPUT_LOCL_START, 15850 "ip_rput_locl_start: q %p", q); 15851 15852 ASSERT(ire->ire_ipversion == IPV4_VERSION); 15853 15854 15855 #define rptr ((uchar_t *)ipha) 15856 #define iphs ((uint16_t *)ipha) 15857 15858 /* 15859 * no UDP or TCP packet should come here anymore. 15860 */ 15861 ASSERT((ipha->ipha_protocol != IPPROTO_TCP) && 15862 (ipha->ipha_protocol != IPPROTO_UDP)); 15863 15864 EXTRACT_PKT_MP(mp, first_mp, mctl_present); 15865 if (mctl_present && 15866 ((da_ipsec_t *)first_mp->b_rptr)->da_type == IPHADA_M_CTL) { 15867 ASSERT(MBLKL(first_mp) >= sizeof (da_ipsec_t)); 15868 15869 /* 15870 * It's an IPsec accelerated packet. 15871 * Keep a pointer to the data attributes around until 15872 * we allocate the ipsec_info_t. 
15873 */ 15874 IPSECHW_DEBUG(IPSECHW_PKT, 15875 ("ip_rput_local: inbound HW accelerated IPsec pkt\n")); 15876 hada_mp = first_mp; 15877 hada_mp->b_cont = NULL; 15878 /* 15879 * Since it is accelerated, it comes directly from 15880 * the ill and the data attributes is followed by 15881 * the packet data. 15882 */ 15883 ASSERT(mp->b_datap->db_type != M_CTL); 15884 first_mp = mp; 15885 mctl_present = B_FALSE; 15886 } 15887 15888 /* 15889 * IF M_CTL is not present, then ipsec_in_is_secure 15890 * should return B_TRUE. There is a case where loopback 15891 * packets has an M_CTL in the front with all the 15892 * IPSEC options set to IPSEC_PREF_NEVER - which means 15893 * ipsec_in_is_secure will return B_FALSE. As loopback 15894 * packets never comes here, it is safe to ASSERT the 15895 * following. 15896 */ 15897 ASSERT(!mctl_present || ipsec_in_is_secure(first_mp)); 15898 15899 15900 /* u1 is # words of IP options */ 15901 u1 = ipha->ipha_version_and_hdr_length - (uchar_t)((IP_VERSION << 4) 15902 + IP_SIMPLE_HDR_LENGTH_IN_WORDS); 15903 15904 if (u1) { 15905 if (!ip_options_cksum(q, mp, ipha, ire)) { 15906 if (hada_mp != NULL) 15907 freemsg(hada_mp); 15908 return; 15909 } 15910 } else { 15911 /* Check the IP header checksum. */ 15912 #define uph ((uint16_t *)ipha) 15913 sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] + uph[5] + 15914 uph[6] + uph[7] + uph[8] + uph[9]; 15915 #undef uph 15916 /* finish doing IP checksum */ 15917 sum = (sum & 0xFFFF) + (sum >> 16); 15918 sum = ~(sum + (sum >> 16)) & 0xFFFF; 15919 /* 15920 * Don't verify header checksum if this packet is coming 15921 * back from AH/ESP as we already did it. 15922 */ 15923 if (!mctl_present && (sum && sum != 0xFFFF)) { 15924 BUMP_MIB(&ip_mib, ipInCksumErrs); 15925 goto drop_pkt; 15926 } 15927 } 15928 15929 /* 15930 * Count for SNMP of inbound packets for ire. As ip_proto_input 15931 * might be called more than once for secure packets, count only 15932 * the first time. 
15933 */ 15934 if (!mctl_present) { 15935 UPDATE_IB_PKT_COUNT(ire); 15936 ire->ire_last_used_time = lbolt; 15937 } 15938 15939 /* Check for fragmentation offset. */ 15940 u2 = ntohs(ipha->ipha_fragment_offset_and_flags); 15941 u1 = u2 & (IPH_MF | IPH_OFFSET); 15942 if (u1) { 15943 /* 15944 * We re-assemble fragments before we do the AH/ESP 15945 * processing. Thus, M_CTL should not be present 15946 * while we are re-assembling. 15947 */ 15948 ASSERT(!mctl_present); 15949 ASSERT(first_mp == mp); 15950 if (!ip_rput_fragment(q, &mp, ipha, NULL, NULL)) { 15951 return; 15952 } 15953 /* 15954 * Make sure that first_mp points back to mp as 15955 * the mp we came in with could have changed in 15956 * ip_rput_fragment(). 15957 */ 15958 ipha = (ipha_t *)mp->b_rptr; 15959 first_mp = mp; 15960 } 15961 15962 /* 15963 * Clear hardware checksumming flag as it is currently only 15964 * used by TCP and UDP. 15965 */ 15966 DB_CKSUMFLAGS(mp) = 0; 15967 15968 /* Now we have a complete datagram, destined for this machine. */ 15969 u1 = IPH_HDR_LENGTH(ipha); 15970 switch (ipha->ipha_protocol) { 15971 case IPPROTO_ICMP: { 15972 ire_t *ire_zone; 15973 ilm_t *ilm; 15974 mblk_t *mp1; 15975 zoneid_t last_zoneid; 15976 15977 if (CLASSD(ipha->ipha_dst) && 15978 !(recv_ill->ill_phyint->phyint_flags & PHYI_LOOPBACK)) { 15979 ASSERT(ire->ire_type == IRE_BROADCAST); 15980 /* 15981 * In the multicast case, applications may have joined 15982 * the group from different zones, so we need to deliver 15983 * the packet to each of them. Loop through the 15984 * multicast memberships structures (ilm) on the receive 15985 * ill and send a copy of the packet up each matching 15986 * one. However, we don't do this for multicasts sent on 15987 * the loopback interface (PHYI_LOOPBACK flag set) as 15988 * they must stay in the sender's zone. 15989 * 15990 * ilm_add_v6() ensures that ilms in the same zone are 15991 * contiguous in the ill_ilm list. 
We use this property 15992 * to avoid sending duplicates needed when two 15993 * applications in the same zone join the same group on 15994 * different logical interfaces: we ignore the ilm if 15995 * its zoneid is the same as the last matching one. 15996 * In addition, the sending of the packet for 15997 * ire_zoneid is delayed until all of the other ilms 15998 * have been exhausted. 15999 */ 16000 last_zoneid = -1; 16001 ILM_WALKER_HOLD(recv_ill); 16002 for (ilm = recv_ill->ill_ilm; ilm != NULL; 16003 ilm = ilm->ilm_next) { 16004 if ((ilm->ilm_flags & ILM_DELETED) || 16005 ipha->ipha_dst != ilm->ilm_addr || 16006 ilm->ilm_zoneid == last_zoneid || 16007 ilm->ilm_zoneid == ire->ire_zoneid || 16008 ilm->ilm_zoneid == ALL_ZONES || 16009 !(ilm->ilm_ipif->ipif_flags & IPIF_UP)) 16010 continue; 16011 mp1 = ip_copymsg(first_mp); 16012 if (mp1 == NULL) 16013 continue; 16014 icmp_inbound(q, mp1, B_TRUE, ill, 16015 0, sum, mctl_present, B_TRUE, 16016 recv_ill, ilm->ilm_zoneid); 16017 last_zoneid = ilm->ilm_zoneid; 16018 } 16019 ILM_WALKER_RELE(recv_ill); 16020 } else if (ire->ire_type == IRE_BROADCAST) { 16021 /* 16022 * In the broadcast case, there may be many zones 16023 * which need a copy of the packet delivered to them. 16024 * There is one IRE_BROADCAST per broadcast address 16025 * and per zone; we walk those using a helper function. 16026 * In addition, the sending of the packet for ire is 16027 * delayed until all of the other ires have been 16028 * processed. 
16029 */ 16030 IRB_REFHOLD(ire->ire_bucket); 16031 ire_zone = NULL; 16032 while ((ire_zone = ire_get_next_bcast_ire(ire_zone, 16033 ire)) != NULL) { 16034 mp1 = ip_copymsg(first_mp); 16035 if (mp1 == NULL) 16036 continue; 16037 16038 UPDATE_IB_PKT_COUNT(ire_zone); 16039 ire_zone->ire_last_used_time = lbolt; 16040 icmp_inbound(q, mp1, B_TRUE, ill, 16041 0, sum, mctl_present, B_TRUE, 16042 recv_ill, ire_zone->ire_zoneid); 16043 } 16044 IRB_REFRELE(ire->ire_bucket); 16045 } 16046 icmp_inbound(q, first_mp, (ire->ire_type == IRE_BROADCAST), 16047 ill, 0, sum, mctl_present, B_TRUE, recv_ill, 16048 ire->ire_zoneid); 16049 TRACE_2(TR_FAC_IP, TR_IP_RPUT_LOCL_END, 16050 "ip_rput_locl_end: q %p (%S)", q, "icmp"); 16051 return; 16052 } 16053 case IPPROTO_IGMP: 16054 /* 16055 * If we are not willing to accept IGMP packets in clear, 16056 * then check with global policy. 16057 */ 16058 if (igmp_accept_clear_messages == 0) { 16059 first_mp = ipsec_check_global_policy(first_mp, NULL, 16060 ipha, NULL, mctl_present); 16061 if (first_mp == NULL) 16062 return; 16063 } 16064 if (is_system_labeled() && !tsol_can_accept_raw(mp, B_TRUE)) { 16065 freemsg(first_mp); 16066 ip1dbg(("ip_proto_input: zone all cannot accept raw")); 16067 BUMP_MIB(&ip_mib, ipInDiscards); 16068 return; 16069 } 16070 if (igmp_input(q, mp, ill)) { 16071 /* Bad packet - discarded by igmp_input */ 16072 TRACE_2(TR_FAC_IP, TR_IP_RPUT_LOCL_END, 16073 "ip_rput_locl_end: q %p (%S)", q, "igmp"); 16074 if (mctl_present) 16075 freeb(first_mp); 16076 return; 16077 } 16078 /* 16079 * igmp_input() may have pulled up the message so ipha needs to 16080 * be reinitialized. 
16081 */ 16082 ipha = (ipha_t *)mp->b_rptr; 16083 if (ipcl_proto_search(ipha->ipha_protocol) == NULL) { 16084 /* No user-level listener for IGMP packets */ 16085 goto drop_pkt; 16086 } 16087 /* deliver to local raw users */ 16088 break; 16089 case IPPROTO_PIM: 16090 /* 16091 * If we are not willing to accept PIM packets in clear, 16092 * then check with global policy. 16093 */ 16094 if (pim_accept_clear_messages == 0) { 16095 first_mp = ipsec_check_global_policy(first_mp, NULL, 16096 ipha, NULL, mctl_present); 16097 if (first_mp == NULL) 16098 return; 16099 } 16100 if (is_system_labeled() && !tsol_can_accept_raw(mp, B_TRUE)) { 16101 freemsg(first_mp); 16102 ip1dbg(("ip_proto_input: zone all cannot accept PIM")); 16103 BUMP_MIB(&ip_mib, ipInDiscards); 16104 return; 16105 } 16106 if (pim_input(q, mp) != 0) { 16107 /* Bad packet - discarded by pim_input */ 16108 TRACE_2(TR_FAC_IP, TR_IP_RPUT_LOCL_END, 16109 "ip_rput_locl_end: q %p (%S)", q, "pim"); 16110 if (mctl_present) 16111 freeb(first_mp); 16112 return; 16113 } 16114 16115 /* 16116 * pim_input() may have pulled up the message so ipha needs to 16117 * be reinitialized. 16118 */ 16119 ipha = (ipha_t *)mp->b_rptr; 16120 if (ipcl_proto_search(ipha->ipha_protocol) == NULL) { 16121 /* No user-level listener for PIM packets */ 16122 goto drop_pkt; 16123 } 16124 /* deliver to local raw users */ 16125 break; 16126 case IPPROTO_ENCAP: 16127 /* 16128 * Handle self-encapsulated packets (IP-in-IP where 16129 * the inner addresses == the outer addresses). 16130 */ 16131 hdr_length = IPH_HDR_LENGTH(ipha); 16132 if ((uchar_t *)ipha + hdr_length + sizeof (ipha_t) > 16133 mp->b_wptr) { 16134 if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length + 16135 sizeof (ipha_t) - mp->b_rptr)) { 16136 BUMP_MIB(&ip_mib, ipInDiscards); 16137 freemsg(first_mp); 16138 return; 16139 } 16140 ipha = (ipha_t *)mp->b_rptr; 16141 } 16142 inner_ipha = (ipha_t *)((uchar_t *)ipha + hdr_length); 16143 /* 16144 * Check the sanity of the inner IP header. 
16145 */ 16146 if ((IPH_HDR_VERSION(inner_ipha) != IPV4_VERSION)) { 16147 BUMP_MIB(&ip_mib, ipInDiscards); 16148 freemsg(first_mp); 16149 return; 16150 } 16151 if (IPH_HDR_LENGTH(inner_ipha) < sizeof (ipha_t)) { 16152 BUMP_MIB(&ip_mib, ipInDiscards); 16153 freemsg(first_mp); 16154 return; 16155 } 16156 if (inner_ipha->ipha_src == ipha->ipha_src && 16157 inner_ipha->ipha_dst == ipha->ipha_dst) { 16158 ipsec_in_t *ii; 16159 16160 /* 16161 * Self-encapsulated tunnel packet. Remove 16162 * the outer IP header and fanout again. 16163 * We also need to make sure that the inner 16164 * header is pulled up until options. 16165 */ 16166 mp->b_rptr = (uchar_t *)inner_ipha; 16167 ipha = inner_ipha; 16168 hdr_length = IPH_HDR_LENGTH(ipha); 16169 if ((uchar_t *)ipha + hdr_length > mp->b_wptr) { 16170 if (!pullupmsg(mp, (uchar_t *)ipha + 16171 + hdr_length - mp->b_rptr)) { 16172 freemsg(first_mp); 16173 return; 16174 } 16175 ipha = (ipha_t *)mp->b_rptr; 16176 } 16177 if (!mctl_present) { 16178 ASSERT(first_mp == mp); 16179 /* 16180 * This means that somebody is sending 16181 * Self-encapsualted packets without AH/ESP. 16182 * If AH/ESP was present, we would have already 16183 * allocated the first_mp. 16184 */ 16185 if ((first_mp = ipsec_in_alloc(B_TRUE)) == 16186 NULL) { 16187 ip1dbg(("ip_proto_input: IPSEC_IN " 16188 "allocation failure.\n")); 16189 BUMP_MIB(&ip_mib, ipInDiscards); 16190 freemsg(mp); 16191 return; 16192 } 16193 first_mp->b_cont = mp; 16194 } 16195 /* 16196 * We generally store the ill_index if we need to 16197 * do IPSEC processing as we lose the ill queue when 16198 * we come back. But in this case, we never should 16199 * have to store the ill_index here as it should have 16200 * been stored previously when we processed the 16201 * AH/ESP header in this routine or for non-ipsec 16202 * cases, we still have the queue. But for some bad 16203 * packets from the wire, we can get to IPSEC after 16204 * this and we better store the index for that case. 
16205 */ 16206 ill = (ill_t *)q->q_ptr; 16207 ii = (ipsec_in_t *)first_mp->b_rptr; 16208 ii->ipsec_in_ill_index = 16209 ill->ill_phyint->phyint_ifindex; 16210 ii->ipsec_in_rill_index = 16211 recv_ill->ill_phyint->phyint_ifindex; 16212 if (ii->ipsec_in_decaps) { 16213 /* 16214 * This packet is self-encapsulated multiple 16215 * times. We don't want to recurse infinitely. 16216 * To keep it simple, drop the packet. 16217 */ 16218 BUMP_MIB(&ip_mib, ipInDiscards); 16219 freemsg(first_mp); 16220 return; 16221 } 16222 ii->ipsec_in_decaps = B_TRUE; 16223 ip_proto_input(q, first_mp, ipha, ire, recv_ill); 16224 return; 16225 } 16226 break; 16227 case IPPROTO_AH: 16228 case IPPROTO_ESP: { 16229 /* 16230 * Fast path for AH/ESP. If this is the first time 16231 * we are sending a datagram to AH/ESP, allocate 16232 * a IPSEC_IN message and prepend it. Otherwise, 16233 * just fanout. 16234 */ 16235 16236 int ipsec_rc; 16237 ipsec_in_t *ii; 16238 16239 IP_STAT(ipsec_proto_ahesp); 16240 if (!mctl_present) { 16241 ASSERT(first_mp == mp); 16242 if ((first_mp = ipsec_in_alloc(B_TRUE)) == NULL) { 16243 ip1dbg(("ip_proto_input: IPSEC_IN " 16244 "allocation failure.\n")); 16245 freemsg(hada_mp); /* okay ifnull */ 16246 BUMP_MIB(&ip_mib, ipInDiscards); 16247 freemsg(mp); 16248 return; 16249 } 16250 /* 16251 * Store the ill_index so that when we come back 16252 * from IPSEC we ride on the same queue. 16253 */ 16254 ill = (ill_t *)q->q_ptr; 16255 ii = (ipsec_in_t *)first_mp->b_rptr; 16256 ii->ipsec_in_ill_index = 16257 ill->ill_phyint->phyint_ifindex; 16258 ii->ipsec_in_rill_index = 16259 recv_ill->ill_phyint->phyint_ifindex; 16260 first_mp->b_cont = mp; 16261 /* 16262 * Cache hardware acceleration info. 
16263 */ 16264 if (hada_mp != NULL) { 16265 IPSECHW_DEBUG(IPSECHW_PKT, 16266 ("ip_rput_local: caching data attr.\n")); 16267 ii->ipsec_in_accelerated = B_TRUE; 16268 ii->ipsec_in_da = hada_mp; 16269 hada_mp = NULL; 16270 } 16271 } else { 16272 ii = (ipsec_in_t *)first_mp->b_rptr; 16273 } 16274 16275 if (!ipsec_loaded()) { 16276 ip_proto_not_sup(q, first_mp, IP_FF_SEND_ICMP, 16277 ire->ire_zoneid); 16278 return; 16279 } 16280 16281 /* select inbound SA and have IPsec process the pkt */ 16282 if (ipha->ipha_protocol == IPPROTO_ESP) { 16283 esph_t *esph = ipsec_inbound_esp_sa(first_mp); 16284 if (esph == NULL) 16285 return; 16286 ASSERT(ii->ipsec_in_esp_sa != NULL); 16287 ASSERT(ii->ipsec_in_esp_sa->ipsa_input_func != NULL); 16288 ipsec_rc = ii->ipsec_in_esp_sa->ipsa_input_func( 16289 first_mp, esph); 16290 } else { 16291 ah_t *ah = ipsec_inbound_ah_sa(first_mp); 16292 if (ah == NULL) 16293 return; 16294 ASSERT(ii->ipsec_in_ah_sa != NULL); 16295 ASSERT(ii->ipsec_in_ah_sa->ipsa_input_func != NULL); 16296 ipsec_rc = ii->ipsec_in_ah_sa->ipsa_input_func( 16297 first_mp, ah); 16298 } 16299 16300 switch (ipsec_rc) { 16301 case IPSEC_STATUS_SUCCESS: 16302 break; 16303 case IPSEC_STATUS_FAILED: 16304 BUMP_MIB(&ip_mib, ipInDiscards); 16305 /* FALLTHRU */ 16306 case IPSEC_STATUS_PENDING: 16307 return; 16308 } 16309 /* we're done with IPsec processing, send it up */ 16310 ip_fanout_proto_again(first_mp, ill, recv_ill, ire); 16311 return; 16312 } 16313 default: 16314 break; 16315 } 16316 if (is_system_labeled() && !tsol_can_accept_raw(mp, B_FALSE)) { 16317 ip1dbg(("ip_proto_input: zone %d cannot accept raw IP", 16318 ire->ire_zoneid)); 16319 goto drop_pkt; 16320 } 16321 /* 16322 * Handle protocols with which IP is less intimate. There 16323 * can be more than one stream bound to a particular 16324 * protocol. When this is the case, each one gets a copy 16325 * of any incoming packets. 
 */
	ip_fanout_proto(q, first_mp, ill, ipha,
	    IP_FF_SEND_ICMP | IP_FF_CKSUM | IP_FF_RAWIP, mctl_present,
	    B_TRUE, recv_ill, ire->ire_zoneid);
	TRACE_2(TR_FAC_IP, TR_IP_RPUT_LOCL_END,
	    "ip_rput_locl_end: q %p (%S)", q, "ip_fanout_proto");
	return;

drop_pkt:
	freemsg(first_mp);
	if (hada_mp != NULL)
		freeb(hada_mp);
	TRACE_2(TR_FAC_IP, TR_IP_RPUT_LOCL_END,
	    "ip_rput_locl_end: q %p (%S)", q, "droppkt");
#undef	rptr
#undef	iphs

}

/*
 * Update any source route, record route or timestamp options.
 * Check that we are at end of strict source route.
 * The options have already been checked for sanity in ip_rput_options().
 *
 * q	- read-side queue (switched to its write side via WR() before an
 *	  ICMP error is generated)
 * mp	- the packet; option bytes within its IPv4 header may be rewritten
 *	  in place
 * ipha	- IPv4 header inside mp
 * ire	- IRE for the local destination; ire_src_addr is what gets recorded
 *	  into record-route and timestamp slots
 *
 * Returns B_TRUE on success; returns B_FALSE after consuming mp via
 * icmp_unreachable() when a strict source route is not yet exhausted.
 */
static boolean_t
ip_rput_local_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire)
{
	ipoptp_t	opts;
	uchar_t		*opt;
	uint8_t		optval;
	uint8_t		optlen;
	ipaddr_t	dst;
	uint32_t	ts;
	ire_t		*dst_ire;
	timestruc_t	now;

	ASSERT(ire->ire_ipversion == IPV4_VERSION);

	ip2dbg(("ip_rput_local_options\n"));

	for (optval = ipoptp_first(&opts, ipha);
	    optval != IPOPT_EOL;
	    optval = ipoptp_next(&opts)) {
		/* ip_rput_options() already rejected malformed options */
		ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0);
		opt = opts.ipoptp_cur;
		optlen = opts.ipoptp_len;
		ip2dbg(("ip_rput_local_options: opt %d, len %d\n",
		    optval, optlen));
		switch (optval) {
		uint32_t off;
		case IPOPT_SSRR:
		case IPOPT_LSRR:
			/* convert the 1-based pointer to a 0-based index */
			off = opt[IPOPT_OFFSET];
			off--;
			if (optlen < IP_ADDR_LEN ||
			    off > optlen - IP_ADDR_LEN) {
				/* End of source route */
				ip1dbg(("ip_rput_local_options: end of SR\n"));
				break;
			}
			/*
			 * This will only happen if two consecutive entries
			 * in the source route contains our address or if
			 * it is a packet with a loose source route which
			 * reaches us before consuming the whole source route
			 */
			ip1dbg(("ip_rput_local_options: not end of SR\n"));
			if (optval == IPOPT_SSRR) {
				goto bad_src_route;
			}
			/*
			 * Hack: instead of dropping the packet truncate the
			 * source route to what has been used by filling the
			 * rest with IPOPT_NOP.
			 */
			opt[IPOPT_OLEN] = (uint8_t)off;
			while (off < optlen) {
				opt[off++] = IPOPT_NOP;
			}
			break;
		case IPOPT_RR:
			/* convert the 1-based pointer to a 0-based index */
			off = opt[IPOPT_OFFSET];
			off--;
			if (optlen < IP_ADDR_LEN ||
			    off > optlen - IP_ADDR_LEN) {
				/* No more room - ignore */
				ip1dbg((
				    "ip_rput_local_options: end of RR\n"));
				break;
			}
			/* record our source address in the next free slot */
			bcopy(&ire->ire_src_addr, (char *)opt + off,
			    IP_ADDR_LEN);
			opt[IPOPT_OFFSET] += IP_ADDR_LEN;
			break;
		case IPOPT_TS:
			/* Insert timestamp if there is room */
			switch (opt[IPOPT_POS_OV_FLG] & 0x0F) {
			case IPOPT_TS_TSONLY:
				off = IPOPT_TS_TIMELEN;
				break;
			case IPOPT_TS_PRESPEC:
			case IPOPT_TS_PRESPEC_RFC791:
				/* Verify that the address matched */
				off = opt[IPOPT_OFFSET] - 1;
				bcopy((char *)opt + off, &dst, IP_ADDR_LEN);
				dst_ire = ire_ctable_lookup(dst, 0, IRE_LOCAL,
				    NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE);
				if (dst_ire == NULL) {
					/* Not for us */
					break;
				}
				ire_refrele(dst_ire);
				/* FALLTHRU */
			case IPOPT_TS_TSANDADDR:
				off = IP_ADDR_LEN + IPOPT_TS_TIMELEN;
				break;
			default:
				/*
				 * ip_*put_options should have already
				 * dropped this packet.
				 */
				cmn_err(CE_PANIC, "ip_rput_local_options: "
				    "unknown IT - bug in ip_rput_options?\n");
				return (B_TRUE);	/* Keep "lint" happy */
			}
			if (opt[IPOPT_OFFSET] - 1 + off > optlen) {
				/*
				 * Increase overflow counter, kept in the
				 * high nibble of IPOPT_POS_OV_FLG.
				 */
				off = (opt[IPOPT_POS_OV_FLG] >> 4) + 1;
				opt[IPOPT_POS_OV_FLG] =
				    (uint8_t)((opt[IPOPT_POS_OV_FLG] & 0x0F) |
				    (off << 4));
				break;
			}
			off = opt[IPOPT_OFFSET] - 1;
			switch (opt[IPOPT_POS_OV_FLG] & 0x0F) {
			case IPOPT_TS_PRESPEC:
			case IPOPT_TS_PRESPEC_RFC791:
			case IPOPT_TS_TSANDADDR:
				/* address-bearing variants: store addr first */
				bcopy(&ire->ire_src_addr, (char *)opt + off,
				    IP_ADDR_LEN);
				opt[IPOPT_OFFSET] += IP_ADDR_LEN;
				/* FALLTHRU */
			case IPOPT_TS_TSONLY:
				off = opt[IPOPT_OFFSET] - 1;
				/* Compute # of milliseconds since midnight */
				gethrestime(&now);
				ts = (now.tv_sec % (24 * 60 * 60)) * 1000 +
				    now.tv_nsec / (NANOSEC / MILLISEC);
				bcopy(&ts, (char *)opt + off, IPOPT_TS_TIMELEN);
				opt[IPOPT_OFFSET] += IPOPT_TS_TIMELEN;
				break;
			}
			break;
		}
	}
	return (B_TRUE);

bad_src_route:
	q = WR(q);
	/* make sure we clear any indication of a hardware checksum */
	DB_CKSUMFLAGS(mp) = 0;
	icmp_unreachable(q, mp, ICMP_SOURCE_ROUTE_FAILED);
	return (B_FALSE);

}

/*
 * Process IP options in an inbound packet. If an option affects the
 * effective destination address, return the next hop address via dstp.
 * Returns -1 if something fails in which case an ICMP error has been sent
 * and mp freed.
 */
static int
ip_rput_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ipaddr_t *dstp)
{
	ipoptp_t	opts;
	uchar_t		*opt;
	uint8_t		optval;
	uint8_t		optlen;
	ipaddr_t	dst;
	/*
	 * "code" is the byte offset (from the start of the IP header) of the
	 * offending field; it becomes the pointer value of the ICMP
	 * parameter-problem message sent from param_prob below.
	 */
	intptr_t	code = 0;
	ire_t		*ire = NULL;

	ip2dbg(("ip_rput_options\n"));
	dst = ipha->ipha_dst;
	for (optval = ipoptp_first(&opts, ipha);
	    optval != IPOPT_EOL;
	    optval = ipoptp_next(&opts)) {
		opt = opts.ipoptp_cur;
		optlen = opts.ipoptp_len;
		ip2dbg(("ip_rput_options: opt %d, len %d\n",
		    optval, optlen));
		/*
		 * Note: we need to verify the checksum before we
		 * modify anything thus this routine only extracts the next
		 * hop dst from any source route.
		 */
		switch (optval) {
		uint32_t off;
		case IPOPT_SSRR:
		case IPOPT_LSRR:
			/* source routes only advance if dst is one of ours */
			ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, NULL,
			    ALL_ZONES, NULL, MATCH_IRE_TYPE);
			if (ire == NULL) {
				if (optval == IPOPT_SSRR) {
					ip1dbg(("ip_rput_options: not next"
					    " strict source route 0x%x\n",
					    ntohl(dst)));
					code = (char *)&ipha->ipha_dst -
					    (char *)ipha;
					goto param_prob; /* RouterReq's */
				}
				ip2dbg(("ip_rput_options: "
				    "not next source route 0x%x\n",
				    ntohl(dst)));
				break;
			}
			ire_refrele(ire);

			if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
				ip1dbg((
				    "ip_rput_options: bad option offset\n"));
				code = (char *)&opt[IPOPT_OLEN] -
				    (char *)ipha;
				goto param_prob;
			}
			off = opt[IPOPT_OFFSET];
			off--;
		redo_srr:
			if (optlen < IP_ADDR_LEN ||
			    off > optlen - IP_ADDR_LEN) {
				/* End of source route */
				ip1dbg(("ip_rput_options: end of SR\n"));
				break;
			}
			bcopy((char *)opt + off, &dst, IP_ADDR_LEN);
			ip1dbg(("ip_rput_options: next hop 0x%x\n",
			    ntohl(dst)));

			/*
			 * Check if our address is present more than
			 * once as consecutive hops in source route.
			 * XXX verify per-interface ip_forwarding
			 * for source route?
			 */
			ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, NULL,
			    ALL_ZONES, NULL, MATCH_IRE_TYPE);

			if (ire != NULL) {
				/* skip over our own address and re-check */
				ire_refrele(ire);
				off += IP_ADDR_LEN;
				goto redo_srr;
			}

			if (dst == htonl(INADDR_LOOPBACK)) {
				ip1dbg(("ip_rput_options: loopback addr in "
				    "source route!\n"));
				goto bad_src_route;
			}
			/*
			 * For strict: verify that dst is directly
			 * reachable.
			 */
			if (optval == IPOPT_SSRR) {
				ire = ire_ftable_lookup(dst, 0, 0,
				    IRE_INTERFACE, NULL, NULL, ALL_ZONES, 0,
				    MBLK_GETLABEL(mp),
				    MATCH_IRE_TYPE | MATCH_IRE_SECATTR);
				if (ire == NULL) {
					ip1dbg(("ip_rput_options: SSRR not "
					    "directly reachable: 0x%x\n",
					    ntohl(dst)));
					goto bad_src_route;
				}
				ire_refrele(ire);
			}
			/*
			 * Defer update of the offset and the record route
			 * until the packet is forwarded.
			 */
			break;
		case IPOPT_RR:
			if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
				ip1dbg((
				    "ip_rput_options: bad option offset\n"));
				code = (char *)&opt[IPOPT_OLEN] -
				    (char *)ipha;
				goto param_prob;
			}
			break;
		case IPOPT_TS:
			/*
			 * Verify that length >= 5 and that there is either
			 * room for another timestamp or that the overflow
			 * counter is not maxed out.
			 */
			code = (char *)&opt[IPOPT_OLEN] - (char *)ipha;
			if (optlen < IPOPT_MINLEN_IT) {
				goto param_prob;
			}
			if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
				ip1dbg((
				    "ip_rput_options: bad option offset\n"));
				code = (char *)&opt[IPOPT_OFFSET] -
				    (char *)ipha;
				goto param_prob;
			}
			switch (opt[IPOPT_POS_OV_FLG] & 0x0F) {
			case IPOPT_TS_TSONLY:
				off = IPOPT_TS_TIMELEN;
				break;
			case IPOPT_TS_TSANDADDR:
			case IPOPT_TS_PRESPEC:
			case IPOPT_TS_PRESPEC_RFC791:
				off = IP_ADDR_LEN + IPOPT_TS_TIMELEN;
				break;
			default:
				code = (char *)&opt[IPOPT_POS_OV_FLG] -
				    (char *)ipha;
				goto param_prob;
			}
			if (opt[IPOPT_OFFSET] - 1 + off > optlen &&
			    (opt[IPOPT_POS_OV_FLG] & 0xF0) == 0xF0) {
				/*
				 * No room and the overflow counter is 15
				 * already.
				 */
				goto param_prob;
			}
			break;
		}
	}

	if ((opts.ipoptp_flags & IPOPTP_ERROR) == 0) {
		*dstp = dst;
		return (0);
	}

	ip1dbg(("ip_rput_options: error processing IP options."));
	code = (char *)&opt[IPOPT_OFFSET] - (char *)ipha;

param_prob:
	/* ICMP errors go out the write side of the queue */
	q = WR(q);
	/* make sure we clear any indication of a hardware checksum */
	DB_CKSUMFLAGS(mp) = 0;
	icmp_param_problem(q, mp, (uint8_t)code);
	return (-1);

bad_src_route:
	q = WR(q);
	/* make sure we clear any indication of a hardware checksum */
	DB_CKSUMFLAGS(mp) = 0;
	icmp_unreachable(q, mp, ICMP_SOURCE_ROUTE_FAILED);
	return (-1);
}

/*
 * IP & ICMP info in >=14 msg's ...
 * - ip fixed part (mib2_ip_t)
 * - icmp fixed part (mib2_icmp_t)
 * - ipAddrEntryTable (ip 20)		all IPv4 ipifs
 * - ipRouteEntryTable (ip 21)		all IPv4 IREs
 * - ipNetToMediaEntryTable (ip 22)	IPv4 IREs for on-link destinations
 * - ipRouteAttributeTable (ip 102)	labeled routes
 * - ip multicast membership (ip_member_t)
 * - ip multicast source filtering (ip_grpsrc_t)
 * - igmp fixed part (struct igmpstat)
 * - multicast routing stats (struct mrtstat)
 * - multicast routing vifs (array of struct vifctl)
 * - multicast routing routes (array of struct mfcctl)
 * - ip6 fixed part (mib2_ipv6IfStatsEntry_t)
 *					One per ill plus one generic
 * - icmp6 fixed part (mib2_ipv6IfIcmpEntry_t)
 *					One per ill plus one generic
 * - ipv6RouteEntry			all IPv6 IREs
 * - ipv6RouteAttributeTable (ip6 102)	labeled routes
 * - ipv6NetToMediaEntry		all Neighbor Cache entries
 * - ipv6AddrEntry			all IPv6 ipifs
 * - ipv6 multicast membership (ipv6_member_t)
 * - ipv6 multicast source filtering (ipv6_grpsrc_t)
 *
 * IP_ROUTE and IP_MEDIA are augmented in arp to include arp cache entries not
 * already present.
 * NOTE: original mpctl is copied for msg's 2..N, since its ctl part is
 * already filled in by the caller.
 * Return value of 0 indicates that no messages were sent and caller
 * should free mpctl.
 */
int
ip_snmp_get(queue_t *q, mblk_t *mpctl)
{

	if (mpctl == NULL || mpctl->b_cont == NULL) {
		return (0);
	}

	/*
	 * Each helper sends its table upstream and hands back a fresh copy
	 * of the control message for the next helper; a NULL return means
	 * the copy failed and the chain stops (the helper has consumed the
	 * message in that case, so we still return 1).
	 */
	if ((mpctl = ip_snmp_get_mib2_ip(q, mpctl)) == NULL) {
		return (1);
	}

	if ((mpctl = ip_snmp_get_mib2_ip6(q, mpctl)) == NULL) {
		return (1);
	}

	if ((mpctl = ip_snmp_get_mib2_icmp(q, mpctl)) == NULL) {
		return (1);
	}

	if ((mpctl = ip_snmp_get_mib2_icmp6(q, mpctl)) == NULL) {
		return (1);
	}

	if ((mpctl = ip_snmp_get_mib2_igmp(q, mpctl)) == NULL) {
		return (1);
	}

	if ((mpctl = ip_snmp_get_mib2_multi(q, mpctl)) == NULL) {
		return (1);
	}

	if ((mpctl = ip_snmp_get_mib2_ip_addr(q, mpctl)) == NULL) {
		return (1);
	}

	if ((mpctl = ip_snmp_get_mib2_ip6_addr(q, mpctl)) == NULL) {
		return (1);
	}

	if ((mpctl = ip_snmp_get_mib2_ip_group_mem(q, mpctl)) == NULL) {
		return (1);
	}

	if ((mpctl = ip_snmp_get_mib2_ip6_group_mem(q, mpctl)) == NULL) {
		return (1);
	}

	if ((mpctl = ip_snmp_get_mib2_ip_group_src(q, mpctl)) == NULL) {
		return (1);
	}

	if ((mpctl = ip_snmp_get_mib2_ip6_group_src(q, mpctl)) == NULL) {
		return (1);
	}

	if ((mpctl = ip_snmp_get_mib2_virt_multi(q, mpctl)) == NULL) {
		return (1);
	}

	if ((mpctl = ip_snmp_get_mib2_multi_rtable(q, mpctl)) == NULL) {
		return (1);
	}

	if ((mpctl = ip_snmp_get_mib2_ip_route_media(q, mpctl)) == NULL) {
		return (1);
	}

	if ((mpctl = ip_snmp_get_mib2_ip6_route_media(q, mpctl)) == NULL) {
		return (1);
	}

	if ((mpctl = sctp_snmp_get_mib2(q, mpctl)) == NULL) {
		return (1);
	}
	/* the final leftover copy is not sent; release it */
	freemsg(mpctl);
	return (1);
}


/* Get global IPv4 statistics */
static mblk_t *
ip_snmp_get_mib2_ip(queue_t
*q, mblk_t *mpctl)
{
	struct opthdr		*optp;
	mblk_t			*mp2ctl;

	/*
	 * make a copy of the original message
	 * (the copy is returned to the caller; mpctl itself is filled in
	 * and sent upstream via qreply below)
	 */
	mp2ctl = copymsg(mpctl);

	/* fixed length IP structure... */
	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP;
	optp->name = 0;
	SET_MIB(ip_mib.ipForwarding,
	    (WE_ARE_FORWARDING ? 1 : 2));
	SET_MIB(ip_mib.ipDefaultTTL,
	    (uint32_t)ip_def_ttl);
	SET_MIB(ip_mib.ipReasmTimeout,
	    ip_g_frag_timeout);
	SET_MIB(ip_mib.ipAddrEntrySize,
	    sizeof (mib2_ipAddrEntry_t));
	SET_MIB(ip_mib.ipRouteEntrySize,
	    sizeof (mib2_ipRouteEntry_t));
	SET_MIB(ip_mib.ipNetToMediaEntrySize,
	    sizeof (mib2_ipNetToMediaEntry_t));
	SET_MIB(ip_mib.ipMemberEntrySize, sizeof (ip_member_t));
	SET_MIB(ip_mib.ipGroupSourceEntrySize, sizeof (ip_grpsrc_t));
	SET_MIB(ip_mib.ipRouteAttributeSize, sizeof (mib2_ipAttributeEntry_t));
	SET_MIB(ip_mib.transportMLPSize, sizeof (mib2_transportMLPEntry_t));
	if (!snmp_append_data(mpctl->b_cont, (char *)&ip_mib,
	    (int)sizeof (ip_mib))) {
		ip1dbg(("ip_snmp_get_mib2_ip: failed to allocate %u bytes\n",
		    (uint_t)sizeof (ip_mib)));
	}

	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get_mib2_ip: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);
	return (mp2ctl);
}

/* Global IPv4 ICMP statistics */
static mblk_t *
ip_snmp_get_mib2_icmp(queue_t *q, mblk_t *mpctl)
{
	struct opthdr		*optp;
	mblk_t			*mp2ctl;

	/*
	 * Make a copy of the original message
	 */
	mp2ctl = copymsg(mpctl);

	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_ICMP;
	optp->name = 0;
	if (!snmp_append_data(mpctl->b_cont, (char *)&icmp_mib,
	    (int)sizeof (icmp_mib))) {
		ip1dbg(("ip_snmp_get_mib2_icmp: failed to allocate %u bytes\n",
		    (uint_t)sizeof (icmp_mib)));
	}
	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get_mib2_icmp: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);
	return (mp2ctl);
}

/* Global IPv4 IGMP statistics */
static mblk_t *
ip_snmp_get_mib2_igmp(queue_t *q, mblk_t *mpctl)
{
	struct opthdr		*optp;
	mblk_t			*mp2ctl;

	/*
	 * make a copy of the original message
	 */
	mp2ctl = copymsg(mpctl);

	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
	optp->level = EXPER_IGMP;
	optp->name = 0;
	if (!snmp_append_data(mpctl->b_cont, (char *)&igmpstat,
	    (int)sizeof (igmpstat))) {
		ip1dbg(("ip_snmp_get_mib2_igmp: failed to allocate %u bytes\n",
		    (uint_t)sizeof (igmpstat)));
	}
	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get_mib2_igmp: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);
	return (mp2ctl);
}

/* Global IPv4 Multicast Routing statistics */
static mblk_t *
ip_snmp_get_mib2_multi(queue_t *q, mblk_t *mpctl)
{
	struct opthdr		*optp;
	mblk_t			*mp2ctl;

	/*
	 * make a copy of the original message
	 */
	mp2ctl = copymsg(mpctl);

	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
	optp->level = EXPER_DVMRP;
	optp->name = 0;
	if (!ip_mroute_stats(mpctl->b_cont)) {
		ip0dbg(("ip_mroute_stats: failed\n"));
	}
	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get_mib2_multi: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);
	return (mp2ctl);
}

/* IPv4 address information */
static mblk_t *
ip_snmp_get_mib2_ip_addr(queue_t *q, mblk_t *mpctl)
{
	struct opthdr		*optp;
	mblk_t			*mp2ctl;
	mblk_t			*mp_tail = NULL;
	ill_t			*ill;
	ipif_t			*ipif;
	uint_t			bitval;
	mib2_ipAddrEntry_t	mae;
	zoneid_t		zoneid;
	ill_walk_context_t ctx;

	/*
	 * make a copy of the original message
	 */
	mp2ctl = copymsg(mpctl);

	/* ipAddrEntryTable */

	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP;
	optp->name = MIB2_IP_ADDR;
	zoneid = Q_TO_CONN(q)->conn_zoneid;

	rw_enter(&ill_g_lock, RW_READER);
	ill = ILL_START_WALK_V4(&ctx);
	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
		for (ipif = ill->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			/* only report ipifs visible to the caller's zone */
			if (ipif->ipif_zoneid != zoneid &&
			    ipif->ipif_zoneid != ALL_ZONES)
				continue;
			mae.ipAdEntInfo.ae_ibcnt = ipif->ipif_ib_pkt_count;
			mae.ipAdEntInfo.ae_obcnt = ipif->ipif_ob_pkt_count;
			mae.ipAdEntInfo.ae_focnt = ipif->ipif_fo_pkt_count;

			(void) ipif_get_name(ipif,
			    mae.ipAdEntIfIndex.o_bytes,
			    OCTET_LENGTH);
			mae.ipAdEntIfIndex.o_length =
			    mi_strlen(mae.ipAdEntIfIndex.o_bytes);
			mae.ipAdEntAddr = ipif->ipif_lcl_addr;
			mae.ipAdEntNetMask = ipif->ipif_net_mask;
			mae.ipAdEntInfo.ae_subnet = ipif->ipif_subnet;
			mae.ipAdEntInfo.ae_subnet_len =
			    ip_mask_to_plen(ipif->ipif_net_mask);
			mae.ipAdEntInfo.ae_src_addr = ipif->ipif_src_addr;
			/* find lowest set bit of the broadcast address */
			for (bitval = 1;
			    bitval &&
			    !(bitval & ipif->ipif_brd_addr);
			    bitval <<= 1)
				noop;
			mae.ipAdEntBcastAddr = bitval;
			mae.ipAdEntReasmMaxSize = 65535;
			mae.ipAdEntInfo.ae_mtu = ipif->ipif_mtu;
			mae.ipAdEntInfo.ae_metric = ipif->ipif_metric;
			mae.ipAdEntInfo.ae_broadcast_addr =
			    ipif->ipif_brd_addr;
			mae.ipAdEntInfo.ae_pp_dst_addr =
			    ipif->ipif_pp_dst_addr;
			mae.ipAdEntInfo.ae_flags = ipif->ipif_flags |
			    ill->ill_flags | ill->ill_phyint->phyint_flags;

			if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
			    (char *)&mae, (int)sizeof (mib2_ipAddrEntry_t))) {
				ip1dbg(("ip_snmp_get_mib2_ip_addr: failed to "
				    "allocate %u bytes\n",
				    (uint_t)sizeof (mib2_ipAddrEntry_t)));
			}
		}
	}
	rw_exit(&ill_g_lock);

	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get_mib2_ip_addr: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);
	return (mp2ctl);
}

/* IPv6 address information */
static mblk_t *
ip_snmp_get_mib2_ip6_addr(queue_t *q, mblk_t *mpctl)
{
	struct opthdr		*optp;
	mblk_t			*mp2ctl;
	mblk_t			*mp_tail = NULL;
	ill_t			*ill;
	ipif_t			*ipif;
	mib2_ipv6AddrEntry_t	mae6;
	zoneid_t		zoneid;
	ill_walk_context_t	ctx;

	/*
	 * make a copy of the original message
	 */
	mp2ctl = copymsg(mpctl);

	/* ipv6AddrEntryTable */

	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP6;
	optp->name = MIB2_IP6_ADDR;
	zoneid = Q_TO_CONN(q)->conn_zoneid;

	rw_enter(&ill_g_lock, RW_READER);
	ill = ILL_START_WALK_V6(&ctx);
	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
		for (ipif = ill->ill_ipif; ipif; ipif = ipif->ipif_next) {
			/* only report ipifs visible to the caller's zone */
			if (ipif->ipif_zoneid != zoneid &&
			    ipif->ipif_zoneid != ALL_ZONES)
				continue;
			mae6.ipv6AddrInfo.ae_ibcnt = ipif->ipif_ib_pkt_count;
			mae6.ipv6AddrInfo.ae_obcnt = ipif->ipif_ob_pkt_count;
			mae6.ipv6AddrInfo.ae_focnt = ipif->ipif_fo_pkt_count;

			(void) ipif_get_name(ipif,
			    mae6.ipv6AddrIfIndex.o_bytes,
			    OCTET_LENGTH);
			mae6.ipv6AddrIfIndex.o_length =
			    mi_strlen(mae6.ipv6AddrIfIndex.o_bytes);
			mae6.ipv6AddrAddress = ipif->ipif_v6lcl_addr;
			mae6.ipv6AddrPfxLength =
			    ip_mask_to_plen_v6(&ipif->ipif_v6net_mask);
			mae6.ipv6AddrInfo.ae_subnet = ipif->ipif_v6subnet;
			mae6.ipv6AddrInfo.ae_subnet_len =
			    mae6.ipv6AddrPfxLength;
			mae6.ipv6AddrInfo.ae_src_addr = ipif->ipif_v6src_addr;

			/* Type: stateless(1), stateful(2), unknown(3) */
			if (ipif->ipif_flags & IPIF_ADDRCONF)
				mae6.ipv6AddrType = 1;
			else
				mae6.ipv6AddrType = 2;
			/* Anycast: true(1), false(2) */
			if (ipif->ipif_flags & IPIF_ANYCAST)
				mae6.ipv6AddrAnycastFlag = 1;
			else
				mae6.ipv6AddrAnycastFlag = 2;

			/*
			 * Address status: preferred(1), deprecated(2),
			 * invalid(3), inaccessible(4), unknown(5)
			 */
			if (ipif->ipif_flags & IPIF_NOLOCAL)
				mae6.ipv6AddrStatus = 3;
			else if (ipif->ipif_flags & IPIF_DEPRECATED)
				mae6.ipv6AddrStatus = 2;
			else
				mae6.ipv6AddrStatus = 1;
			mae6.ipv6AddrInfo.ae_mtu = ipif->ipif_mtu;
			mae6.ipv6AddrInfo.ae_metric = ipif->ipif_metric;
			mae6.ipv6AddrInfo.ae_pp_dst_addr =
			    ipif->ipif_v6pp_dst_addr;
			mae6.ipv6AddrInfo.ae_flags = ipif->ipif_flags |
			    ill->ill_flags | ill->ill_phyint->phyint_flags;
			if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
			    (char *)&mae6,
			    (int)sizeof (mib2_ipv6AddrEntry_t))) {
				ip1dbg(("ip_snmp_get_mib2_ip6_addr: failed to "
				    "allocate %u bytes\n",
				    (uint_t)sizeof (mib2_ipv6AddrEntry_t)));
			}
		}
	}
	rw_exit(&ill_g_lock);

	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get_mib2_ip6_addr: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);
	return (mp2ctl);
}

/* IPv4 multicast group membership. */
static mblk_t *
ip_snmp_get_mib2_ip_group_mem(queue_t *q, mblk_t *mpctl)
{
	struct opthdr		*optp;
	mblk_t			*mp2ctl;
	ill_t			*ill;
	ipif_t			*ipif;
	ilm_t			*ilm;
	ip_member_t		ipm;
	mblk_t			*mp_tail = NULL;
	ill_walk_context_t	ctx;
	zoneid_t		zoneid;

	/*
	 * make a copy of the original message
	 */
	mp2ctl = copymsg(mpctl);
	zoneid = Q_TO_CONN(q)->conn_zoneid;

	/* ipGroupMember table */
	optp = (struct opthdr *)&mpctl->b_rptr[
	    sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP;
	optp->name = EXPER_IP_GROUP_MEMBERSHIP;

	rw_enter(&ill_g_lock, RW_READER);
	ill = ILL_START_WALK_V4(&ctx);
	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
		/* hold off ilm list changes while we walk it */
		ILM_WALKER_HOLD(ill);
		for (ipif = ill->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			if (ipif->ipif_zoneid != zoneid &&
			    ipif->ipif_zoneid != ALL_ZONES)
				continue;	/* not this zone */
			(void) ipif_get_name(ipif,
			    ipm.ipGroupMemberIfIndex.o_bytes,
			    OCTET_LENGTH);
			ipm.ipGroupMemberIfIndex.o_length =
			    mi_strlen(ipm.ipGroupMemberIfIndex.o_bytes);
			for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
				/* v4 memberships hang off an ipif */
				ASSERT(ilm->ilm_ipif != NULL);
				ASSERT(ilm->ilm_ill == NULL);
				if (ilm->ilm_ipif != ipif)
					continue;
				ipm.ipGroupMemberAddress = ilm->ilm_addr;
				ipm.ipGroupMemberRefCnt = ilm->ilm_refcnt;
				ipm.ipGroupMemberFilterMode = ilm->ilm_fmode;
				if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
				    (char *)&ipm, (int)sizeof (ipm))) {
					ip1dbg(("ip_snmp_get_mib2_ip_group: "
					    "failed to allocate %u bytes\n",
					    (uint_t)sizeof (ipm)));
				}
			}
		}
		ILM_WALKER_RELE(ill);
	}
	rw_exit(&ill_g_lock);
	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);
	return (mp2ctl);
}

/* IPv6 multicast group membership. */
static mblk_t *
ip_snmp_get_mib2_ip6_group_mem(queue_t *q, mblk_t *mpctl)
{
	struct opthdr		*optp;
	mblk_t			*mp2ctl;
	ill_t			*ill;
	ilm_t			*ilm;
	ipv6_member_t		ipm6;
	mblk_t			*mp_tail = NULL;
	ill_walk_context_t	ctx;
	zoneid_t		zoneid;

	/*
	 * make a copy of the original message
	 */
	mp2ctl = copymsg(mpctl);
	zoneid = Q_TO_CONN(q)->conn_zoneid;

	/* ip6GroupMember table */
	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP6;
	optp->name = EXPER_IP6_GROUP_MEMBERSHIP;

	rw_enter(&ill_g_lock, RW_READER);
	ill = ILL_START_WALK_V6(&ctx);
	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
		/* hold off ilm list changes while we walk it */
		ILM_WALKER_HOLD(ill);
		ipm6.ipv6GroupMemberIfIndex = ill->ill_phyint->phyint_ifindex;
		for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
			/* v6 memberships hang off the ill, not an ipif */
			ASSERT(ilm->ilm_ipif == NULL);
			ASSERT(ilm->ilm_ill != NULL);
			if (ilm->ilm_zoneid != zoneid)
				continue;	/* not this zone */
			ipm6.ipv6GroupMemberAddress = ilm->ilm_v6addr;
			ipm6.ipv6GroupMemberRefCnt = ilm->ilm_refcnt;
			ipm6.ipv6GroupMemberFilterMode = ilm->ilm_fmode;
			if (!snmp_append_data2(mpctl->b_cont,
			    &mp_tail,
			    (char *)&ipm6, (int)sizeof (ipm6))) {
				ip1dbg(("ip_snmp_get_mib2_ip6_group: "
				    "failed to allocate %u bytes\n",
				    (uint_t)sizeof (ipm6)));
			}
		}
		ILM_WALKER_RELE(ill);
	}
	rw_exit(&ill_g_lock);

	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);
	return (mp2ctl);
}

/* IP multicast filtered sources */
static mblk_t *
ip_snmp_get_mib2_ip_group_src(queue_t *q, mblk_t *mpctl)
{
	struct opthdr		*optp;
	mblk_t			*mp2ctl;
	ill_t			*ill;
	ipif_t			*ipif;
	ilm_t			*ilm;
	ip_grpsrc_t		ips;
	mblk_t			*mp_tail = NULL;
	ill_walk_context_t	ctx;
	zoneid_t		zoneid;
	int			i;
	slist_t			*sl;

	/*
	 * make a copy of the original message
	 */
	mp2ctl = copymsg(mpctl);
	zoneid = Q_TO_CONN(q)->conn_zoneid;

	/* ipGroupSource table */
	optp = (struct opthdr *)&mpctl->b_rptr[
	    sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP;
	optp->name = EXPER_IP_GROUP_SOURCES;

	rw_enter(&ill_g_lock, RW_READER);
	ill = ILL_START_WALK_V4(&ctx);
	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
		ILM_WALKER_HOLD(ill);
		for (ipif = ill->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			if (ipif->ipif_zoneid != zoneid)
				continue;	/* not this zone */
			(void) ipif_get_name(ipif,
			    ips.ipGroupSourceIfIndex.o_bytes,
			    OCTET_LENGTH);
			ips.ipGroupSourceIfIndex.o_length =
			    mi_strlen(ips.ipGroupSourceIfIndex.o_bytes);
			for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
				ASSERT(ilm->ilm_ipif != NULL);
				ASSERT(ilm->ilm_ill == NULL);
				sl = ilm->ilm_filter;
				/* skip memberships with no source filter */
				if (ilm->ilm_ipif != ipif || SLIST_IS_EMPTY(sl))
					continue;
				ips.ipGroupSourceGroup = ilm->ilm_addr;
				for (i = 0; i < sl->sl_numsrc; i++) {
					/* this table is v4-only sources */
					if (!IN6_IS_ADDR_V4MAPPED(
					    &sl->sl_addr[i]))
						continue;
					IN6_V4MAPPED_TO_IPADDR(&sl->sl_addr[i],
					    ips.ipGroupSourceAddress);
					if (snmp_append_data2(mpctl->b_cont,
					    &mp_tail, (char *)&ips,
					    (int)sizeof (ips)) == 0) {
						ip1dbg(("ip_snmp_get_mib2_"
						    "ip_group_src: failed to "
						    "allocate %u bytes\n",
						    (uint_t)sizeof (ips)));
					}
				}
			}
		}
		ILM_WALKER_RELE(ill);
	}
	rw_exit(&ill_g_lock);
	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);
	return (mp2ctl);
}

/* IPv6 multicast filtered sources. */
static mblk_t *
ip_snmp_get_mib2_ip6_group_src(queue_t *q, mblk_t *mpctl)
{
	struct opthdr		*optp;
	mblk_t			*mp2ctl;
	ill_t			*ill;
	ilm_t			*ilm;
	ipv6_grpsrc_t		ips6;
	mblk_t			*mp_tail = NULL;
	ill_walk_context_t	ctx;
	zoneid_t		zoneid;
	int			i;
	slist_t			*sl;

	/*
	 * make a copy of the original message
	 */
	mp2ctl = copymsg(mpctl);
	zoneid = Q_TO_CONN(q)->conn_zoneid;

	/* ip6GroupMember table */
	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP6;
	optp->name = EXPER_IP6_GROUP_SOURCES;

	rw_enter(&ill_g_lock, RW_READER);
	ill = ILL_START_WALK_V6(&ctx);
	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
		/* hold off ilm list changes while we walk it */
		ILM_WALKER_HOLD(ill);
		ips6.ipv6GroupSourceIfIndex = ill->ill_phyint->phyint_ifindex;
		for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
			ASSERT(ilm->ilm_ipif == NULL);
			ASSERT(ilm->ilm_ill != NULL);
			sl = ilm->ilm_filter;
			/* skip other zones and unfiltered memberships */
			if (ilm->ilm_zoneid != zoneid || SLIST_IS_EMPTY(sl))
				continue;
			ips6.ipv6GroupSourceGroup = ilm->ilm_v6addr;
			for (i = 0; i < sl->sl_numsrc; i++) {
				ips6.ipv6GroupSourceAddress = sl->sl_addr[i];
				if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
				    (char *)&ips6, (int)sizeof (ips6))) {
					ip1dbg(("ip_snmp_get_mib2_ip6_"
					    "group_src: failed to allocate "
					    "%u bytes\n",
					    (uint_t)sizeof (ips6)));
				}
			}
		}
		ILM_WALKER_RELE(ill);
	}
	rw_exit(&ill_g_lock);

	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);
	return (mp2ctl);
}

/* Multicast routing virtual interface table. */
static mblk_t *
ip_snmp_get_mib2_virt_multi(queue_t *q, mblk_t *mpctl)
{
	struct opthdr		*optp;
	mblk_t			*mp2ctl;

	/*
	 * make a copy of the original message
	 */
	mp2ctl = copymsg(mpctl);

	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
	optp->level = EXPER_DVMRP;
	optp->name = EXPER_DVMRP_VIF;
	if (!ip_mroute_vif(mpctl->b_cont)) {
		ip0dbg(("ip_mroute_vif: failed\n"));
	}
	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get_mib2_virt_multi: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);
	return (mp2ctl);
}

/* Multicast routing table. */
static mblk_t *
ip_snmp_get_mib2_multi_rtable(queue_t *q, mblk_t *mpctl)
{
	struct opthdr		*optp;
	mblk_t			*mp2ctl;

	/*
	 * make a copy of the original message
	 */
	mp2ctl = copymsg(mpctl);

	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
	optp->level = EXPER_DVMRP;
	optp->name = EXPER_DVMRP_MRT;
	if (!ip_mroute_mrt(mpctl->b_cont)) {
		ip0dbg(("ip_mroute_mrt: failed\n"));
	}
	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get_mib2_multi_rtable: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);
	return (mp2ctl);
}

/*
 * Return ipRouteEntryTable, ipNetToMediaEntryTable, and ipRouteAttributeTable
 * in one IRE walk.
 */
static mblk_t *
ip_snmp_get_mib2_ip_route_media(queue_t *q, mblk_t *mpctl)
{
	struct opthdr	*optp;
	mblk_t		*mp2ctl;	/* Returned */
	mblk_t		*mp3ctl;	/* nettomedia */
	mblk_t		*mp4ctl;	/* routeattrs */
	iproutedata_t	ird;
	zoneid_t	zoneid;

	/*
	 * make copies of the original message
	 *	- mp2ctl is returned unchanged to the caller for his use
	 *	- mpctl is sent upstream as ipRouteEntryTable
	 *	- mp3ctl is sent upstream as ipNetToMediaEntryTable
	 *	- mp4ctl is sent upstream as ipRouteAttributeTable
	 */
	mp2ctl = copymsg(mpctl);
	mp3ctl = copymsg(mpctl);
	mp4ctl = copymsg(mpctl);
	if (mp3ctl == NULL || mp4ctl == NULL) {
		/* freemsg() on a NULL mblk is a no-op, so this is safe */
		freemsg(mp4ctl);
		freemsg(mp3ctl);
		freemsg(mp2ctl);
		freemsg(mpctl);
		return (NULL);
	}

	bzero(&ird, sizeof (ird));

	/* each list head collects one of the three tables during the walk */
	ird.ird_route.lp_head = mpctl->b_cont;
	ird.ird_netmedia.lp_head = mp3ctl->b_cont;
	ird.ird_attrs.lp_head = mp4ctl->b_cont;

	zoneid = Q_TO_CONN(q)->conn_zoneid;
	ire_walk_v4(ip_snmp_get2_v4, &ird, zoneid);
	if (zoneid == GLOBAL_ZONEID) {
		/*
		 * Those IREs are used by Mobile-IP; since mipagent(1M)
		 * requires the sys_net_config privilege, it can only run in
		 * the global zone, so we don't display these IREs in the
		 * other zones.
		 */
		ire_walk_srcif_table_v4(ip_snmp_get2_v4, &ird);
		ire_walk_ill_mrtun(0, 0, ip_snmp_get2_v4, &ird, NULL);
	}

	/* ipRouteEntryTable in mpctl */
	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP;
	optp->name = MIB2_IP_ROUTE;
	optp->len = msgdsize(ird.ird_route.lp_head);
	ip3dbg(("ip_snmp_get_mib2_ip_route_media: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);

	/* ipNetToMediaEntryTable in mp3ctl */
	optp = (struct opthdr *)&mp3ctl->b_rptr[sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP;
	optp->name = MIB2_IP_MEDIA;
	optp->len = msgdsize(ird.ird_netmedia.lp_head);
	ip3dbg(("ip_snmp_get_mib2_ip_route_media: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mp3ctl);

	/* ipRouteAttributeTable in mp4ctl */
	optp = (struct opthdr *)&mp4ctl->b_rptr[sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP;
	optp->name = EXPER_IP_RTATTR;
	optp->len = msgdsize(ird.ird_attrs.lp_head);
	ip3dbg(("ip_snmp_get_mib2_ip_route_media: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	/* an empty attribute table is not sent upstream at all */
	if (optp->len == 0)
		freemsg(mp4ctl);
	else
		qreply(q, mp4ctl);

	return (mp2ctl);
}

/*
 * Return ipv6RouteEntryTable and ipv6RouteAttributeTable in one IRE walk, and
 * ipv6NetToMediaEntryTable in an NDP walk.
 */
static mblk_t *
ip_snmp_get_mib2_ip6_route_media(queue_t *q, mblk_t *mpctl)
{
	struct opthdr	*optp;
	mblk_t		*mp2ctl;	/* Returned */
	mblk_t		*mp3ctl;	/* nettomedia */
	mblk_t		*mp4ctl;	/* routeattrs */
	iproutedata_t	ird;
	zoneid_t	zoneid;

	/*
	 * make copies of the original message
	 *	- mp2ctl is returned unchanged to the caller for his use
	 *	- mpctl is sent upstream as ipv6RouteEntryTable
	 *	- mp3ctl is sent upstream as ipv6NetToMediaEntryTable
	 *	- mp4ctl is sent upstream as ipv6RouteAttributeTable
	 */
	mp2ctl = copymsg(mpctl);
	mp3ctl = copymsg(mpctl);
	mp4ctl = copymsg(mpctl);
	if (mp3ctl == NULL || mp4ctl == NULL) {
		/* freemsg() is a no-op on NULL. */
		freemsg(mp4ctl);
		freemsg(mp3ctl);
		freemsg(mp2ctl);
		freemsg(mpctl);
		return (NULL);
	}

	bzero(&ird, sizeof (ird));

	/* Each table accumulates into the b_cont chain of its own message. */
	ird.ird_route.lp_head = mpctl->b_cont;
	ird.ird_netmedia.lp_head = mp3ctl->b_cont;
	ird.ird_attrs.lp_head = mp4ctl->b_cont;

	/* Only report routes visible to the requesting conn's zone. */
	zoneid = Q_TO_CONN(q)->conn_zoneid;
	ire_walk_v6(ip_snmp_get2_v6_route, &ird, zoneid);

	/* ipv6RouteEntryTable in mpctl */
	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP6;
	optp->name = MIB2_IP6_ROUTE;
	optp->len = msgdsize(ird.ird_route.lp_head);
	ip3dbg(("ip_snmp_get_mib2_ip6_route_media: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);

	/* ipv6NetToMediaEntryTable in mp3ctl */
	ndp_walk(NULL, ip_snmp_get2_v6_media, &ird);

	optp = (struct opthdr *)&mp3ctl->b_rptr[sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP6;
	optp->name = MIB2_IP6_MEDIA;
	optp->len = msgdsize(ird.ird_netmedia.lp_head);
	ip3dbg(("ip_snmp_get_mib2_ip6_route_media: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mp3ctl);

	/* ipv6RouteAttributeTable in mp4ctl */
	optp = (struct opthdr *)&mp4ctl->b_rptr[sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP6;
	optp->name = EXPER_IP_RTATTR;
	optp->len = msgdsize(ird.ird_attrs.lp_head);
	ip3dbg(("ip_snmp_get_mib2_ip6_route_media: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	/* The attribute table is only sent if any entries were collected. */
	if (optp->len == 0)
		freemsg(mp4ctl);
	else
		qreply(q, mp4ctl);

	return (mp2ctl);
}

/*
 * IPv6 mib: One per ill
 * (Header comment fixed: this fills the MIB2_IP6 ip6_mib, not ICMPv6.)
 */
static mblk_t *
ip_snmp_get_mib2_ip6(queue_t *q, mblk_t *mpctl)
{
	struct opthdr		*optp;
	mblk_t			*mp2ctl;
	ill_t			*ill;
	ill_walk_context_t	ctx;
	mblk_t			*mp_tail = NULL;

	/*
	 * Make a copy of the original message
	 */
	mp2ctl = copymsg(mpctl);

	/* fixed length IPv6 structure ... */

	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP6;
	optp->name = 0;
	/* Include "unknown interface" ip6_mib */
	ip6_mib.ipv6IfIndex = 0;	/* Flag to netstat */
	SET_MIB(ip6_mib.ipv6Forwarding, ipv6_forward ? 1 : 2);
	SET_MIB(ip6_mib.ipv6DefaultHopLimit, ipv6_def_hops);
	/*
	 * Record the structure sizes so userland consumers can parse the
	 * variable-size tables that follow in the other requests.
	 */
	SET_MIB(ip6_mib.ipv6IfStatsEntrySize,
	    sizeof (mib2_ipv6IfStatsEntry_t));
	SET_MIB(ip6_mib.ipv6AddrEntrySize, sizeof (mib2_ipv6AddrEntry_t));
	SET_MIB(ip6_mib.ipv6RouteEntrySize, sizeof (mib2_ipv6RouteEntry_t));
	SET_MIB(ip6_mib.ipv6NetToMediaEntrySize,
	    sizeof (mib2_ipv6NetToMediaEntry_t));
	SET_MIB(ip6_mib.ipv6MemberEntrySize, sizeof (ipv6_member_t));
	SET_MIB(ip6_mib.ipv6GroupSourceEntrySize, sizeof (ipv6_grpsrc_t));
	if (!snmp_append_data2(mpctl->b_cont, &mp_tail, (char *)&ip6_mib,
	    (int)sizeof (ip6_mib))) {
		ip1dbg(("ip_snmp_get_mib2_ip6: failed to allocate %u bytes\n",
		    (uint_t)sizeof (ip6_mib)));
	}

	/* One per-interface entry for every IPv6 ill. */
	rw_enter(&ill_g_lock, RW_READER);
	ill = ILL_START_WALK_V6(&ctx);
	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
		ill->ill_ip6_mib->ipv6IfIndex =
		    ill->ill_phyint->phyint_ifindex;
		SET_MIB(ill->ill_ip6_mib->ipv6Forwarding,
		    ipv6_forward ? 1 : 2);
		SET_MIB(ill->ill_ip6_mib->ipv6DefaultHopLimit,
		    ill->ill_max_hops);
		SET_MIB(ill->ill_ip6_mib->ipv6IfStatsEntrySize,
		    sizeof (mib2_ipv6IfStatsEntry_t));
		SET_MIB(ill->ill_ip6_mib->ipv6AddrEntrySize,
		    sizeof (mib2_ipv6AddrEntry_t));
		SET_MIB(ill->ill_ip6_mib->ipv6RouteEntrySize,
		    sizeof (mib2_ipv6RouteEntry_t));
		SET_MIB(ill->ill_ip6_mib->ipv6NetToMediaEntrySize,
		    sizeof (mib2_ipv6NetToMediaEntry_t));
		SET_MIB(ill->ill_ip6_mib->ipv6MemberEntrySize,
		    sizeof (ipv6_member_t));

		if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
		    (char *)ill->ill_ip6_mib,
		    (int)sizeof (*ill->ill_ip6_mib))) {
			ip1dbg(("ip_snmp_get_mib2_ip6: failed to allocate "
			    "%u bytes\n",
			    (uint_t)sizeof (*ill->ill_ip6_mib)));
		}
	}
	rw_exit(&ill_g_lock);

	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get_mib2_ip6: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);
	return (mp2ctl);
}

/*
 * ICMPv6 mib: One per ill
 */
static mblk_t *
ip_snmp_get_mib2_icmp6(queue_t *q, mblk_t *mpctl)
{
	struct opthdr		*optp;
	mblk_t			*mp2ctl;
	ill_t			*ill;
	ill_walk_context_t	ctx;
	mblk_t			*mp_tail = NULL;

	/*
	 * Make a copy of the original message
	 */
	mp2ctl = copymsg(mpctl);

	/* fixed length ICMPv6 structure ... */

	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_ICMP6;
	optp->name = 0;
	/* Include "unknown interface" icmp6_mib */
	icmp6_mib.ipv6IfIcmpIfIndex = 0;	/* Flag to netstat */
	icmp6_mib.ipv6IfIcmpEntrySize = sizeof (mib2_ipv6IfIcmpEntry_t);
	if (!snmp_append_data2(mpctl->b_cont, &mp_tail, (char *)&icmp6_mib,
	    (int)sizeof (icmp6_mib))) {
		ip1dbg(("ip_snmp_get_mib2_icmp6: failed to allocate %u bytes\n",
		    (uint_t)sizeof (icmp6_mib)));
	}

	/* One per-interface ICMPv6 entry for every IPv6 ill. */
	rw_enter(&ill_g_lock, RW_READER);
	ill = ILL_START_WALK_V6(&ctx);
	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
		ill->ill_icmp6_mib->ipv6IfIcmpIfIndex =
		    ill->ill_phyint->phyint_ifindex;
		ill->ill_icmp6_mib->ipv6IfIcmpEntrySize =
		    sizeof (mib2_ipv6IfIcmpEntry_t);
		if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
		    (char *)ill->ill_icmp6_mib,
		    (int)sizeof (*ill->ill_icmp6_mib))) {
			ip1dbg(("ip_snmp_get_mib2_icmp6: failed to allocate "
			    "%u bytes\n",
			    (uint_t)sizeof (*ill->ill_icmp6_mib)));
		}
	}
	rw_exit(&ill_g_lock);

	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get_mib2_icmp6: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);
	return (mp2ctl);
}

/*
 * ire_walk routine to create both ipRouteEntryTable and
 * ipNetToMediaEntryTable in one IRE walk
 */
static void
ip_snmp_get2_v4(ire_t *ire, iproutedata_t *ird)
{
	ill_t				*ill;
	ipif_t				*ipif;
	mblk_t				*llmp;
	dl_unitdata_req_t		*dlup;
	mib2_ipRouteEntry_t		*re;
	mib2_ipNetToMediaEntry_t	ntme;
	mib2_ipAttributeEntry_t		*iae, *iaeptr;
	ipaddr_t			gw_addr;
	tsol_ire_gw_secattr_t		*attrp;
	tsol_gc_t			*gc = NULL;
	tsol_gcgrp_t			*gcgrp = NULL;
	uint_t				sacnt = 0;
	int				i;

	ASSERT(ire->ire_ipversion == IPV4_VERSION);

	/* Best-effort: silently skip this IRE if memory is tight. */
	if ((re = kmem_zalloc(sizeof (*re), KM_NOSLEEP)) == NULL)
		return;

	/*
	 * Snapshot any Trusted Extensions gateway security attributes.
	 * On exit from this clause either gc == NULL (nothing to report)
	 * or gcgrp's rwlock is held as READER until the end of the
	 * function, keeping the gc list stable while we copy it.
	 */
	if ((attrp = ire->ire_gw_secattr) != NULL) {
		mutex_enter(&attrp->igsa_lock);
		if ((gc = attrp->igsa_gc) != NULL) {
			gcgrp = gc->gc_grp;
			ASSERT(gcgrp != NULL);
			rw_enter(&gcgrp->gcgrp_rwlock, RW_READER);
			sacnt = 1;
		} else if ((gcgrp = attrp->igsa_gcgrp) != NULL) {
			rw_enter(&gcgrp->gcgrp_rwlock, RW_READER);
			gc = gcgrp->gcgrp_head;
			sacnt = gcgrp->gcgrp_count;
		}
		mutex_exit(&attrp->igsa_lock);

		/* do nothing if there's no gc to report */
		if (gc == NULL) {
			ASSERT(sacnt == 0);
			if (gcgrp != NULL) {
				/* we might as well drop the lock now */
				rw_exit(&gcgrp->gcgrp_rwlock);
				gcgrp = NULL;
			}
			attrp = NULL;
		}

		ASSERT(gc == NULL || (gcgrp != NULL &&
		    RW_LOCK_HELD(&gcgrp->gcgrp_rwlock)));
	}
	ASSERT(sacnt == 0 || gc != NULL);

	if (sacnt != 0 &&
	    (iae = kmem_alloc(sacnt * sizeof (*iae), KM_NOSLEEP)) == NULL) {
		kmem_free(re, sizeof (*re));
		rw_exit(&gcgrp->gcgrp_rwlock);
		return;
	}

	/*
	 * Return all IRE types for route table... let caller pick and choose
	 */
	re->ipRouteDest = ire->ire_addr;
	ipif = ire->ire_ipif;
	re->ipRouteIfIndex.o_length = 0;
	if (ire->ire_type == IRE_CACHE) {
		/* For cache entries use the outgoing ill's name. */
		ill = (ill_t *)ire->ire_stq->q_ptr;
		re->ipRouteIfIndex.o_length =
		    ill->ill_name_length == 0 ? 0 :
		    MIN(OCTET_LENGTH, ill->ill_name_length - 1);
		bcopy(ill->ill_name, re->ipRouteIfIndex.o_bytes,
		    re->ipRouteIfIndex.o_length);
	} else if (ipif != NULL) {
		(void) ipif_get_name(ipif, re->ipRouteIfIndex.o_bytes,
		    OCTET_LENGTH);
		re->ipRouteIfIndex.o_length =
		    mi_strlen(re->ipRouteIfIndex.o_bytes);
	}
	/* MIB-II metrics are not maintained; report "not used" (-1). */
	re->ipRouteMetric1 = -1;
	re->ipRouteMetric2 = -1;
	re->ipRouteMetric3 = -1;
	re->ipRouteMetric4 = -1;

	gw_addr = ire->ire_gateway_addr;

	if (ire->ire_type & (IRE_INTERFACE|IRE_LOOPBACK|IRE_BROADCAST))
		re->ipRouteNextHop = ire->ire_src_addr;
	else
		re->ipRouteNextHop = gw_addr;
	/* indirect(4), direct(3), or invalid(2) */
	if (ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE))
		re->ipRouteType = 2;
	else
		re->ipRouteType = (gw_addr != 0) ? 4 : 3;
	re->ipRouteProto = -1;
	re->ipRouteAge = gethrestime_sec() - ire->ire_create_time;
	re->ipRouteMask = ire->ire_mask;
	re->ipRouteMetric5 = -1;
	re->ipRouteInfo.re_max_frag = ire->ire_max_frag;
	re->ipRouteInfo.re_frag_flag = ire->ire_frag_flag;
	re->ipRouteInfo.re_rtt = ire->ire_uinfo.iulp_rtt;
	/* Link-layer resolution info, used below for ipNetToMediaTable. */
	llmp = ire->ire_dlureq_mp;
	re->ipRouteInfo.re_ref = ire->ire_refcnt;
	re->ipRouteInfo.re_src_addr = ire->ire_src_addr;
	re->ipRouteInfo.re_ire_type = ire->ire_type;
	re->ipRouteInfo.re_obpkt = ire->ire_ob_pkt_count;
	re->ipRouteInfo.re_ibpkt = ire->ire_ib_pkt_count;
	re->ipRouteInfo.re_flags = ire->ire_flags;
	re->ipRouteInfo.re_in_ill.o_length = 0;
	if (ire->ire_in_ill != NULL) {
		/* Incoming ill for reverse-tunnel (srcif) entries. */
		re->ipRouteInfo.re_in_ill.o_length =
		    ire->ire_in_ill->ill_name_length == 0 ? 0 :
		    MIN(OCTET_LENGTH, ire->ire_in_ill->ill_name_length - 1);
		bcopy(ire->ire_in_ill->ill_name,
		    re->ipRouteInfo.re_in_ill.o_bytes,
		    re->ipRouteInfo.re_in_ill.o_length);
	}
	re->ipRouteInfo.re_in_src_addr = ire->ire_in_src_addr;

	if (!snmp_append_data2(ird->ird_route.lp_head, &ird->ird_route.lp_tail,
	    (char *)re, (int)sizeof (*re))) {
		ip1dbg(("ip_snmp_get2_v4: failed to allocate %u bytes\n",
		    (uint_t)sizeof (*re)));
	}

	for (iaeptr = iae, i = 0; i < sacnt; i++, iaeptr++, gc = gc->gc_next) {
		iaeptr->iae_routeidx = ird->ird_idx;
		iaeptr->iae_doi = gc->gc_db->gcdb_doi;
		iaeptr->iae_slrange = gc->gc_db->gcdb_slrange;
	}

	/*
	 * NOTE(review): when sacnt == 0, iae was never assigned; it is
	 * passed here with a zero length, so it is presumably never
	 * dereferenced by snmp_append_data2() — confirm, since the pointer
	 * value itself is uninitialized in that case.
	 */
	if (!snmp_append_data2(ird->ird_attrs.lp_head, &ird->ird_attrs.lp_tail,
	    (char *)iae, sacnt * sizeof (*iae))) {
		ip1dbg(("ip_snmp_get2_v4: failed to allocate %u bytes\n",
		    (unsigned)(sacnt * sizeof (*iae))));
	}

	if (ire->ire_type != IRE_CACHE || gw_addr != 0)
		goto done;
	/*
	 * only IRE_CACHE entries that are for a directly connected subnet
	 * get appended to net -> phys addr table
	 * (others in arp)
	 */
	ntme.ipNetToMediaIfIndex.o_length = 0;
	ill = ire_to_ill(ire);
	ASSERT(ill != NULL);
	ntme.ipNetToMediaIfIndex.o_length =
	    ill->ill_name_length == 0 ? 0 :
	    MIN(OCTET_LENGTH, ill->ill_name_length - 1);
	bcopy(ill->ill_name, ntme.ipNetToMediaIfIndex.o_bytes,
	    ntme.ipNetToMediaIfIndex.o_length);

	ntme.ipNetToMediaPhysAddress.o_length = 0;
	if (llmp) {
		uchar_t	*addr;

		dlup = (dl_unitdata_req_t *)llmp->b_rptr;
		/* Remove sap from address */
		if (ill->ill_sap_length < 0)
			addr = llmp->b_rptr + dlup->dl_dest_addr_offset;
		else
			addr = llmp->b_rptr + dlup->dl_dest_addr_offset +
			    ill->ill_sap_length;

		ntme.ipNetToMediaPhysAddress.o_length =
		    MIN(OCTET_LENGTH, ill->ill_phys_addr_length);
		bcopy(addr, ntme.ipNetToMediaPhysAddress.o_bytes,
		    ntme.ipNetToMediaPhysAddress.o_length);
	}
	ntme.ipNetToMediaNetAddress = ire->ire_addr;
	/* assume dynamic (may be changed in arp) */
	ntme.ipNetToMediaType = 3;
	ntme.ipNetToMediaInfo.ntm_mask.o_length = sizeof (uint32_t);
	bcopy(&ire->ire_mask, ntme.ipNetToMediaInfo.ntm_mask.o_bytes,
	    ntme.ipNetToMediaInfo.ntm_mask.o_length);
	ntme.ipNetToMediaInfo.ntm_flags = ACE_F_RESOLVED;
	if (!snmp_append_data2(ird->ird_netmedia.lp_head,
	    &ird->ird_netmedia.lp_tail, (char *)&ntme, sizeof (ntme))) {
		ip1dbg(("ip_snmp_get2_v4: failed to allocate %u bytes\n",
		    (uint_t)sizeof (ntme)));
	}
done:
	/* bump route index for next pass */
	ird->ird_idx++;

	kmem_free(re, sizeof (*re));
	if (sacnt != 0)
		kmem_free(iae, sacnt * sizeof (*iae));

	/* Release the gcgrp rwlock taken in the secattr snapshot above. */
	if (gcgrp != NULL)
		rw_exit(&gcgrp->gcgrp_rwlock);
}

/*
 * ire_walk routine to create ipv6RouteEntryTable and ipv6RouteAttributeTable.
 */
static void
ip_snmp_get2_v6_route(ire_t *ire, iproutedata_t *ird)
{
	ill_t				*ill;
	ipif_t				*ipif;
	mib2_ipv6RouteEntry_t		*re;
	mib2_ipAttributeEntry_t		*iae, *iaeptr;
	in6_addr_t			gw_addr_v6;
	tsol_ire_gw_secattr_t		*attrp;
	tsol_gc_t			*gc = NULL;
	tsol_gcgrp_t			*gcgrp = NULL;
	uint_t				sacnt = 0;
	int				i;

	ASSERT(ire->ire_ipversion == IPV6_VERSION);

	/* Best-effort: silently skip this IRE if memory is tight. */
	if ((re = kmem_zalloc(sizeof (*re), KM_NOSLEEP)) == NULL)
		return;

	/*
	 * Snapshot any Trusted Extensions gateway security attributes;
	 * same locking protocol as ip_snmp_get2_v4: if gc != NULL on exit,
	 * gcgrp's rwlock is held as READER until the end of the function.
	 */
	if ((attrp = ire->ire_gw_secattr) != NULL) {
		mutex_enter(&attrp->igsa_lock);
		if ((gc = attrp->igsa_gc) != NULL) {
			gcgrp = gc->gc_grp;
			ASSERT(gcgrp != NULL);
			rw_enter(&gcgrp->gcgrp_rwlock, RW_READER);
			sacnt = 1;
		} else if ((gcgrp = attrp->igsa_gcgrp) != NULL) {
			rw_enter(&gcgrp->gcgrp_rwlock, RW_READER);
			gc = gcgrp->gcgrp_head;
			sacnt = gcgrp->gcgrp_count;
		}
		mutex_exit(&attrp->igsa_lock);

		/* do nothing if there's no gc to report */
		if (gc == NULL) {
			ASSERT(sacnt == 0);
			if (gcgrp != NULL) {
				/* we might as well drop the lock now */
				rw_exit(&gcgrp->gcgrp_rwlock);
				gcgrp = NULL;
			}
			attrp = NULL;
		}

		ASSERT(gc == NULL || (gcgrp != NULL &&
		    RW_LOCK_HELD(&gcgrp->gcgrp_rwlock)));
	}
	ASSERT(sacnt == 0 || gc != NULL);

	if (sacnt != 0 &&
	    (iae = kmem_alloc(sacnt * sizeof (*iae), KM_NOSLEEP)) == NULL) {
		kmem_free(re, sizeof (*re));
		rw_exit(&gcgrp->gcgrp_rwlock);
		return;
	}

	/*
	 * Return all IRE types for route table... let caller pick and choose
	 */
	re->ipv6RouteDest = ire->ire_addr_v6;
	re->ipv6RoutePfxLength = ip_mask_to_plen_v6(&ire->ire_mask_v6);
	re->ipv6RouteIndex = 0;	/* Unique when multiple with same dest/plen */
	re->ipv6RouteIfIndex.o_length = 0;
	ipif = ire->ire_ipif;
	if (ire->ire_type == IRE_CACHE) {
		/* For cache entries use the outgoing ill's name. */
		ill = (ill_t *)ire->ire_stq->q_ptr;
		re->ipv6RouteIfIndex.o_length =
		    ill->ill_name_length == 0 ? 0 :
		    MIN(OCTET_LENGTH, ill->ill_name_length - 1);
		bcopy(ill->ill_name, re->ipv6RouteIfIndex.o_bytes,
		    re->ipv6RouteIfIndex.o_length);
	} else if (ipif != NULL) {
		(void) ipif_get_name(ipif, re->ipv6RouteIfIndex.o_bytes,
		    OCTET_LENGTH);
		re->ipv6RouteIfIndex.o_length =
		    mi_strlen(re->ipv6RouteIfIndex.o_bytes);
	}

	ASSERT(!(ire->ire_type & IRE_BROADCAST));

	/* ire_gateway_addr_v6 is protected by ire_lock. */
	mutex_enter(&ire->ire_lock);
	gw_addr_v6 = ire->ire_gateway_addr_v6;
	mutex_exit(&ire->ire_lock);

	if (ire->ire_type & (IRE_INTERFACE|IRE_LOOPBACK))
		re->ipv6RouteNextHop = ire->ire_src_addr_v6;
	else
		re->ipv6RouteNextHop = gw_addr_v6;

	/* remote(4), local(3), or discard(2) */
	if (ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE))
		re->ipv6RouteType = 2;
	else if (IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6))
		re->ipv6RouteType = 3;
	else
		re->ipv6RouteType = 4;

	re->ipv6RouteProtocol = -1;
	re->ipv6RoutePolicy = 0;
	re->ipv6RouteAge = gethrestime_sec() - ire->ire_create_time;
	re->ipv6RouteNextHopRDI = 0;
	re->ipv6RouteWeight = 0;
	re->ipv6RouteMetric = 0;
	re->ipv6RouteInfo.re_max_frag = ire->ire_max_frag;
	re->ipv6RouteInfo.re_frag_flag = ire->ire_frag_flag;
	re->ipv6RouteInfo.re_rtt = ire->ire_uinfo.iulp_rtt;
	re->ipv6RouteInfo.re_src_addr = ire->ire_src_addr_v6;
	re->ipv6RouteInfo.re_ire_type = ire->ire_type;
	re->ipv6RouteInfo.re_obpkt = ire->ire_ob_pkt_count;
	re->ipv6RouteInfo.re_ibpkt = ire->ire_ib_pkt_count;
	re->ipv6RouteInfo.re_ref = ire->ire_refcnt;
	re->ipv6RouteInfo.re_flags = ire->ire_flags;

	if (!snmp_append_data2(ird->ird_route.lp_head, &ird->ird_route.lp_tail,
	    (char *)re, (int)sizeof (*re))) {
		ip1dbg(("ip_snmp_get2_v6: failed to allocate %u bytes\n",
		    (uint_t)sizeof (*re)));
	}

	for (iaeptr = iae, i = 0; i < sacnt; i++, iaeptr++, gc = gc->gc_next) {
		iaeptr->iae_routeidx = ird->ird_idx;
		iaeptr->iae_doi = gc->gc_db->gcdb_doi;
		iaeptr->iae_slrange = gc->gc_db->gcdb_slrange;
	}

	/*
	 * NOTE(review): as in ip_snmp_get2_v4, iae is uninitialized when
	 * sacnt == 0 but is passed with a zero length — confirm
	 * snmp_append_data2() never dereferences it in that case.
	 */
	if (!snmp_append_data2(ird->ird_attrs.lp_head, &ird->ird_attrs.lp_tail,
	    (char *)iae, sacnt * sizeof (*iae))) {
		ip1dbg(("ip_snmp_get2_v6: failed to allocate %u bytes\n",
		    (unsigned)(sacnt * sizeof (*iae))));
	}

	/* bump route index for next pass */
	ird->ird_idx++;

	kmem_free(re, sizeof (*re));
	if (sacnt != 0)
		kmem_free(iae, sacnt * sizeof (*iae));

	if (gcgrp != NULL)
		rw_exit(&gcgrp->gcgrp_rwlock);
}

/*
 * ndp_walk routine to create ipv6NetToMediaEntryTable
 */
static int
ip_snmp_get2_v6_media(nce_t *nce, iproutedata_t *ird)
{
	ill_t				*ill;
	mib2_ipv6NetToMediaEntry_t	ntme;
	dl_unitdata_req_t		*dl;

	ill = nce->nce_ill;
	ASSERT(ill->ill_isv6);

	/*
	 * Neighbor cache entry attached to IRE with on-link
	 * destination.
	 */
	ntme.ipv6NetToMediaIfIndex = ill->ill_phyint->phyint_ifindex;
	ntme.ipv6NetToMediaNetAddress = nce->nce_addr;
	/*
	 * With external resolvers (ILLF_XRESOLV) the link-layer address
	 * length comes from the DL_UNITDATA_REQ itself; otherwise use the
	 * interface's fixed physical address length.
	 */
	if ((ill->ill_flags & ILLF_XRESOLV) &&
	    (nce->nce_res_mp != NULL)) {
		dl = (dl_unitdata_req_t *)(nce->nce_res_mp->b_rptr);
		ntme.ipv6NetToMediaPhysAddress.o_length =
		    dl->dl_dest_addr_length;
	} else {
		ntme.ipv6NetToMediaPhysAddress.o_length =
		    ill->ill_phys_addr_length;
	}
	if (nce->nce_res_mp != NULL) {
		bcopy((char *)nce->nce_res_mp->b_rptr +
		    NCE_LL_ADDR_OFFSET(ill),
		    ntme.ipv6NetToMediaPhysAddress.o_bytes,
		    ntme.ipv6NetToMediaPhysAddress.o_length);
	} else {
		/* Unresolved entry: report an all-zero physical address. */
		bzero(ntme.ipv6NetToMediaPhysAddress.o_bytes,
		    ill->ill_phys_addr_length);
	}
	/*
	 * Note: Returns ND_* states. Should be:
	 * reachable(1), stale(2), delay(3), probe(4),
	 * invalid(5), unknown(6)
	 */
	ntme.ipv6NetToMediaState = nce->nce_state;
	ntme.ipv6NetToMediaLastUpdated = 0;

	/* other(1), dynamic(2), static(3), local(4) */
	if (IN6_IS_ADDR_LOOPBACK(&nce->nce_addr)) {
		ntme.ipv6NetToMediaType = 4;
	} else if (IN6_IS_ADDR_MULTICAST(&nce->nce_addr)) {
		ntme.ipv6NetToMediaType = 1;
	} else {
		ntme.ipv6NetToMediaType = 2;
	}

	if (!snmp_append_data2(ird->ird_netmedia.lp_head,
	    &ird->ird_netmedia.lp_tail, (char *)&ntme, sizeof (ntme))) {
		ip1dbg(("ip_snmp_get2_v6_media: failed to allocate %u bytes\n",
		    (uint_t)sizeof (ntme)));
	}
	/* Always continue the ndp walk. */
	return (0);
}

/*
 * return (0) if invalid set request, 1 otherwise, including non-tcp requests
 *
 * No MIB2_IP/MIB2_ICMP objects are currently settable here; every request
 * is accepted (returns 1).
 */
/* ARGSUSED */
int
ip_snmp_set(queue_t *q, int level, int name, uchar_t *ptr, int len)
{
	switch (level) {
	case MIB2_IP:
	case MIB2_ICMP:
		switch (name) {
		default:
			break;
		}
		return (1);
	default:
		return (1);
	}
}

/*
 *
Called before the options are updated to check if this packet will
 * be source routed from here.
 * This routine assumes that the options are well formed i.e. that they
 * have already been checked.
 *
 * Returns B_TRUE only when the packet carries an SSRR/LSRR option, the
 * current destination is one of our local addresses (IRE_LOCAL), and the
 * option still has entries left — i.e. we will forward along the route.
 */
static boolean_t
ip_source_routed(ipha_t *ipha)
{
	ipoptp_t	opts;
	uchar_t		*opt;
	uint8_t		optval;
	uint8_t		optlen;
	ipaddr_t	dst;
	ire_t		*ire;

	/* A minimal header has no options, hence no source route. */
	if (IS_SIMPLE_IPH(ipha)) {
		ip2dbg(("not source routed\n"));
		return (B_FALSE);
	}
	dst = ipha->ipha_dst;
	for (optval = ipoptp_first(&opts, ipha);
	    optval != IPOPT_EOL;
	    optval = ipoptp_next(&opts)) {
		ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0);
		opt = opts.ipoptp_cur;
		optlen = opts.ipoptp_len;
		ip2dbg(("ip_source_routed: opt %d, len %d\n",
		    optval, optlen));
		switch (optval) {
		uint32_t off;
		case IPOPT_SSRR:
		case IPOPT_LSRR:
			/*
			 * If dst is one of our addresses and there are some
			 * entries left in the source route return (true).
			 */
			ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, NULL,
			    ALL_ZONES, NULL, MATCH_IRE_TYPE);
			if (ire == NULL) {
				ip2dbg(("ip_source_routed: not next"
				    " source route 0x%x\n",
				    ntohl(dst)));
				return (B_FALSE);
			}
			ire_refrele(ire);
			/* IPOPT_OFFSET is 1-based; normalize to 0-based. */
			off = opt[IPOPT_OFFSET];
			off--;
			if (optlen < IP_ADDR_LEN ||
			    off > optlen - IP_ADDR_LEN) {
				/* End of source route */
				ip1dbg(("ip_source_routed: end of SR\n"));
				return (B_FALSE);
			}
			return (B_TRUE);
		}
	}
	ip2dbg(("not source routed\n"));
	return (B_FALSE);
}

/*
 * Check if the packet contains any source route.
 * Returns B_TRUE if an SSRR or LSRR option is present at all, regardless
 * of whether it still has entries left.
 */
static boolean_t
ip_source_route_included(ipha_t *ipha)
{
	ipoptp_t	opts;
	uint8_t		optval;

	if (IS_SIMPLE_IPH(ipha))
		return (B_FALSE);
	for (optval = ipoptp_first(&opts, ipha);
	    optval != IPOPT_EOL;
	    optval = ipoptp_next(&opts)) {
		switch (optval) {
		case IPOPT_SSRR:
		case IPOPT_LSRR:
			return (B_TRUE);
		}
	}
	return (B_FALSE);
}

/*
 * Called when the IRE expiration timer fires.
 * Flushes/ages IRE_CACHE entries (ARP info, redirects, path MTU) on the
 * periodic ip_timer_interval schedule and then rearms itself.
 */
/* ARGSUSED */
void
ip_trash_timer_expire(void *args)
{
	int	flush_flag = 0;

	/*
	 * ip_ire_expire_id is protected by ip_trash_timer_lock.
	 * This lock makes sure that a new invocation of this function
	 * that occurs due to an almost immediate timer firing will not
	 * progress beyond this point until the current invocation is done
	 */
	mutex_enter(&ip_trash_timer_lock);
	ip_ire_expire_id = 0;
	mutex_exit(&ip_trash_timer_lock);

	/* Periodic timer */
	if (ip_ire_arp_time_elapsed >= ip_ire_arp_interval) {
		/*
		 * Remove all IRE_CACHE entries since they might
		 * contain arp information.
		 */
		flush_flag |= FLUSH_ARP_TIME;
		ip_ire_arp_time_elapsed = 0;
		IP_STAT(ip_ire_arp_timer_expired);
	}
	if (ip_ire_rd_time_elapsed >= ip_ire_redir_interval) {
		/* Remove all redirects */
		flush_flag |= FLUSH_REDIRECT_TIME;
		ip_ire_rd_time_elapsed = 0;
		IP_STAT(ip_ire_redirect_timer_expired);
	}
	if (ip_ire_pmtu_time_elapsed >= ip_ire_pathmtu_interval) {
		/* Increase path mtu */
		flush_flag |= FLUSH_MTU_TIME;
		ip_ire_pmtu_time_elapsed = 0;
		IP_STAT(ip_ire_pmtu_timer_expired);
	}
	if (flush_flag != 0) {
		/* Walk all IPv4 IRE's and update them */
		ire_walk_v4(ire_expire, (char *)(uintptr_t)flush_flag,
		    ALL_ZONES);
	}
	if (flush_flag & FLUSH_MTU_TIME) {
		/*
		 * Walk all IPv6 IRE's and update them
		 * Note that ARP and redirect timers are not
		 * needed since NUD handles stale entries.
		 */
		flush_flag = FLUSH_MTU_TIME;
		ire_walk_v6(ire_expire, (char *)(uintptr_t)flush_flag,
		    ALL_ZONES);
	}

	/* Accumulate elapsed time for the next firing. */
	ip_ire_arp_time_elapsed += ip_timer_interval;
	ip_ire_rd_time_elapsed += ip_timer_interval;
	ip_ire_pmtu_time_elapsed += ip_timer_interval;

	/*
	 * Hold the lock to serialize timeout calls and prevent
	 * stale values in ip_ire_expire_id. Otherwise it is possible
	 * for the timer to fire and a new invocation of this function
	 * to start before the return value of timeout has been stored
	 * in ip_ire_expire_id by the current invocation.
	 */
	mutex_enter(&ip_trash_timer_lock);
	ip_ire_expire_id = timeout(ip_trash_timer_expire, NULL,
	    MSEC_TO_TICK(ip_timer_interval));
	mutex_exit(&ip_trash_timer_lock);
}

/*
 * Called by the memory allocator subsystem directly, when the system
 * is running low on memory.
 */
/* ARGSUSED */
void
ip_trash_ire_reclaim(void *args)
{
	ire_cache_count_t icc;
	ire_cache_reclaim_t icr;
	ncc_cache_count_t ncc;
	nce_cache_reclaim_t ncr;
	uint_t delete_cnt;
	/*
	 * Memory reclaim call back.
	 * Count unused, offlink, pmtu, and onlink IRE_CACHE entries.
	 * Then, with a target of freeing 1/Nth of IRE_CACHE
	 * entries, determine what fraction to free for
	 * each category of IRE_CACHE entries giving absolute priority
	 * in the order of onlink, pmtu, offlink, unused (e.g. no pmtu
	 * entry will be freed unless all offlink entries are freed).
	 */
	icc.icc_total = 0;
	icc.icc_unused = 0;
	icc.icc_offlink = 0;
	icc.icc_pmtu = 0;
	icc.icc_onlink = 0;
	ire_walk(ire_cache_count, (char *)&icc);

	/*
	 * Free NCEs for IPv6 like the onlink ires.
	 */
	ncc.ncc_total = 0;
	ncc.ncc_host = 0;
	ndp_walk(NULL, (pfi_t)ndp_cache_count, (uchar_t *)&ncc);

	ASSERT(icc.icc_total == icc.icc_unused + icc.icc_offlink +
	    icc.icc_pmtu + icc.icc_onlink);
	/* Target: free 1/ip_ire_reclaim_fraction of all IRE_CACHE entries. */
	delete_cnt = icc.icc_total/ip_ire_reclaim_fraction;
	IP_STAT(ip_trash_ire_reclaim_calls);
	if (delete_cnt == 0)
		return;
	IP_STAT(ip_trash_ire_reclaim_success);
	/* Always delete all unused offlink entries */
	icr.icr_unused = 1;
	/*
	 * The icr_* fields are "delete 1 out of N" fractions: 0 = keep all,
	 * 1 = delete all, k > 1 = delete roughly 1/k of the category.
	 */
	if (delete_cnt <= icc.icc_unused) {
		/*
		 * Only need to free unused entries.  In other words,
		 * there are enough unused entries to free to meet our
		 * target number of freed ire cache entries.
		 */
		icr.icr_offlink = icr.icr_pmtu = icr.icr_onlink = 0;
		ncr.ncr_host = 0;
	} else if (delete_cnt <= icc.icc_unused + icc.icc_offlink) {
		/*
		 * Only need to free unused entries, plus a fraction of offlink
		 * entries.  It follows from the first if statement that
		 * icc_offlink is non-zero, and that delete_cnt != icc_unused.
		 */
		delete_cnt -= icc.icc_unused;
		/* Round up # deleted by truncating fraction */
		icr.icr_offlink = icc.icc_offlink / delete_cnt;
		icr.icr_pmtu = icr.icr_onlink = 0;
		ncr.ncr_host = 0;
	} else if (delete_cnt <=
	    icc.icc_unused + icc.icc_offlink + icc.icc_pmtu) {
		/*
		 * Free all unused and offlink entries, plus a fraction of
		 * pmtu entries.  It follows from the previous if statement
		 * that icc_pmtu is non-zero, and that
		 * delete_cnt != icc_unused + icc_offlink.
		 */
		icr.icr_offlink = 1;
		delete_cnt -= icc.icc_unused + icc.icc_offlink;
		/* Round up # deleted by truncating fraction */
		icr.icr_pmtu = icc.icc_pmtu / delete_cnt;
		icr.icr_onlink = 0;
		ncr.ncr_host = 0;
	} else {
		/*
		 * Free all unused, offlink, and pmtu entries, plus a fraction
		 * of onlink entries.  If we're here, then we know that
		 * icc_onlink is non-zero, and that
		 * delete_cnt != icc_unused + icc_offlink + icc_pmtu.
		 */
		icr.icr_offlink = icr.icr_pmtu = 1;
		delete_cnt -= icc.icc_unused + icc.icc_offlink +
		    icc.icc_pmtu;
		/* Round up # deleted by truncating fraction */
		icr.icr_onlink = icc.icc_onlink / delete_cnt;
		/* Using the same delete fraction as for onlink IREs */
		ncr.ncr_host = ncc.ncc_host / delete_cnt;
	}
#ifdef DEBUG
	ip1dbg(("IP reclaim: target %d out of %d current %d/%d/%d/%d "
	    "fractions %d/%d/%d/%d\n",
	    icc.icc_total/ip_ire_reclaim_fraction, icc.icc_total,
	    icc.icc_unused, icc.icc_offlink,
	    icc.icc_pmtu, icc.icc_onlink,
	    icr.icr_unused, icr.icr_offlink,
	    icr.icr_pmtu, icr.icr_onlink));
#endif
	ire_walk(ire_cache_reclaim, (char *)&icr);
	if (ncr.ncr_host != 0)
		ndp_walk(NULL, (pfi_t)ndp_cache_reclaim,
		    (uchar_t *)&ncr);
#ifdef DEBUG
	/* Re-count after the reclaim pass so the result can be logged. */
	icc.icc_total = 0; icc.icc_unused = 0; icc.icc_offlink = 0;
	icc.icc_pmtu = 0; icc.icc_onlink = 0;
	ire_walk(ire_cache_count, (char *)&icc);
	ip1dbg(("IP reclaim: result total %d %d/%d/%d/%d\n",
	    icc.icc_total, icc.icc_unused, icc.icc_offlink,
	    icc.icc_pmtu, icc.icc_onlink));
#endif
}

/*
 * ip_unbind is called when a copy of an unbind request is received from the
 * upper level protocol.  We remove this conn from any fanout hash list it is
 * on, and zero out the bind information.  No reply is expected up above.
18393 */ 18394 mblk_t * 18395 ip_unbind(queue_t *q, mblk_t *mp) 18396 { 18397 conn_t *connp = Q_TO_CONN(q); 18398 18399 ASSERT(!MUTEX_HELD(&connp->conn_lock)); /* caller must not already hold conn_lock */ 18400 18401 if (is_system_labeled() && connp->conn_anon_port) { /* Trusted Extensions: tear down the conn's anonymous MLP port reservation (B_FALSE presumably means release -- confirm against tsol_mlp_anon) */ 18402 (void) tsol_mlp_anon(crgetzone(connp->conn_cred), 18403 connp->conn_mlp_type, connp->conn_ulp, 18404 ntohs(connp->conn_lport), B_FALSE); 18405 connp->conn_anon_port = 0; 18406 } 18407 connp->conn_mlp_type = mlptSingle; 18408 18409 ipcl_hash_remove(connp); /* take the conn off any fanout hash list, per the block comment above */ 18410 18411 ASSERT(mp->b_cont == NULL); 18412 /* 18413 * Convert mp into a T_OK_ACK 18414 */ 18415 mp = mi_tpi_ok_ack_alloc(mp); 18416 18417 /* 18418 * should not happen in practice... T_OK_ACK is smaller than the 18419 * original message. 18420 */ 18421 if (mp == NULL) 18422 return (NULL); /* allocation failed: no ack message is sent upstream */ 18423 18424 /* 18425 * Don't bzero the ports if its TCP since TCP still needs the 18426 * lport to remove it from its own bind hash. TCP will do the 18427 * cleanup. 18428 */ 18429 if (!IPCL_IS_TCP(connp)) 18430 bzero(&connp->u_port, sizeof (connp->u_port)); 18431 18432 return (mp); 18433 } 18434 18435 /* 18436 * Write side put procedure. Outbound data, IOCTLs, responses from 18437 * resolvers, etc, come down through here. 18438 */ 18439 void 18440 ip_output(void *arg, mblk_t *mp, void *arg2, int caller) 18441 { 18442 conn_t *connp = NULL; 18443 queue_t *q = (queue_t *)arg2; 18444 ipha_t *ipha; 18445 #define rptr ((uchar_t *)ipha) 18446 ire_t *ire = NULL; 18447 ire_t *sctp_ire = NULL; 18448 uint32_t v_hlen_tos_len; 18449 ipaddr_t dst; 18450 mblk_t *first_mp = NULL; 18451 boolean_t mctl_present; 18452 ipsec_out_t *io; 18453 int match_flags; 18454 ill_t *attach_ill = NULL; 18455 /* Bind to IPIF_NOFAILOVER ill etc. */ 18456 ill_t *xmit_ill = NULL; /* IP_XMIT_IF etc.
*/ 18457 ipif_t *dst_ipif; 18458 boolean_t multirt_need_resolve = B_FALSE; 18459 mblk_t *copy_mp = NULL; 18460 int err; 18461 zoneid_t zoneid; 18462 int adjust; 18463 uint16_t iplen; 18464 boolean_t need_decref = B_FALSE; 18465 boolean_t ignore_dontroute = B_FALSE; 18466 boolean_t ignore_nexthop = B_FALSE; 18467 boolean_t ip_nexthop = B_FALSE; 18468 ipaddr_t nexthop_addr; 18469 18470 #ifdef _BIG_ENDIAN 18471 #define V_HLEN (v_hlen_tos_len >> 24) 18472 #else 18473 #define V_HLEN (v_hlen_tos_len & 0xFF) 18474 #endif 18475 18476 TRACE_1(TR_FAC_IP, TR_IP_WPUT_START, 18477 "ip_wput_start: q %p", q); 18478 18479 /* 18480 * ip_wput fast path 18481 */ 18482 18483 /* is packet from ARP ? */ 18484 if (q->q_next != NULL) 18485 goto qnext; 18486 18487 connp = (conn_t *)arg; 18488 zoneid = (connp != NULL ? connp->conn_zoneid : ALL_ZONES); 18489 18490 /* is queue flow controlled? */ 18491 if ((q->q_first != NULL || connp->conn_draining) && 18492 (caller == IP_WPUT)) { 18493 ASSERT(!need_decref); 18494 (void) putq(q, mp); 18495 return; 18496 } 18497 18498 /* Multidata transmit? */ 18499 if (DB_TYPE(mp) == M_MULTIDATA) { 18500 /* 18501 * We should never get here, since all Multidata messages 18502 * originating from tcp should have been directed over to 18503 * tcp_multisend() in the first place. 
18504 */ 18505 BUMP_MIB(&ip_mib, ipOutDiscards); 18506 freemsg(mp); 18507 return; 18508 } else if (DB_TYPE(mp) != M_DATA) 18509 goto notdata; 18510 18511 if (mp->b_flag & MSGHASREF) { 18512 ASSERT(connp->conn_ulp == IPPROTO_SCTP); 18513 mp->b_flag &= ~MSGHASREF; 18514 SCTP_EXTRACT_IPINFO(mp, sctp_ire); 18515 need_decref = B_TRUE; 18516 } 18517 ipha = (ipha_t *)mp->b_rptr; 18518 18519 /* is IP header non-aligned or mblk smaller than basic IP header */ 18520 #ifndef SAFETY_BEFORE_SPEED 18521 if (!OK_32PTR(rptr) || 18522 (mp->b_wptr - rptr) < IP_SIMPLE_HDR_LENGTH) 18523 goto hdrtoosmall; 18524 #endif 18525 18526 ASSERT(OK_32PTR(ipha)); 18527 18528 /* 18529 * This function assumes that mp points to an IPv4 packet. If it's the 18530 * wrong version, we'll catch it again in ip_output_v6. 18531 * 18532 * Note that this is *only* locally-generated output here, and never 18533 * forwarded data, and that we need to deal only with transports that 18534 * don't know how to label. (TCP, UDP, and ICMP/raw-IP all know how to 18535 * label.) 18536 */ 18537 if (is_system_labeled() && 18538 (ipha->ipha_version_and_hdr_length & 0xf0) == (IPV4_VERSION << 4) && 18539 !connp->conn_ulp_labeled) { 18540 err = tsol_check_label(BEST_CRED(mp, connp), &mp, &adjust, 18541 connp->conn_mac_exempt); 18542 ipha = (ipha_t *)mp->b_rptr; 18543 if (err != 0) { 18544 first_mp = mp; 18545 if (err == EINVAL) 18546 goto icmp_parameter_problem; 18547 ip2dbg(("ip_wput: label check failed (%d)\n", err)); 18548 goto drop_pkt; 18549 } 18550 iplen = ntohs(ipha->ipha_length) + adjust; 18551 ipha->ipha_length = htons(iplen); 18552 } 18553 18554 /* 18555 * If there is a policy, try to attach an ipsec_out in 18556 * the front. At the end, first_mp either points to a 18557 * M_DATA message or IPSEC_OUT message linked to a 18558 * M_DATA message. We have to do it now as we might 18559 * lose the "conn" if we go through ip_newroute. 
18560 */ 18561 if (connp->conn_out_enforce_policy || (connp->conn_latch != NULL)) { 18562 if (((mp = ipsec_attach_ipsec_out(mp, connp, NULL, 18563 ipha->ipha_protocol)) == NULL)) { 18564 if (need_decref) 18565 CONN_DEC_REF(connp); 18566 return; 18567 } else { 18568 ASSERT(mp->b_datap->db_type == M_CTL); 18569 first_mp = mp; 18570 mp = mp->b_cont; 18571 mctl_present = B_TRUE; 18572 } 18573 } else { 18574 first_mp = mp; 18575 mctl_present = B_FALSE; 18576 } 18577 18578 v_hlen_tos_len = ((uint32_t *)ipha)[0]; 18579 18580 /* is wrong version or IP options present */ 18581 if (V_HLEN != IP_SIMPLE_HDR_VERSION) 18582 goto version_hdrlen_check; 18583 dst = ipha->ipha_dst; 18584 18585 if (connp->conn_nofailover_ill != NULL) { 18586 attach_ill = conn_get_held_ill(connp, 18587 &connp->conn_nofailover_ill, &err); 18588 if (err == ILL_LOOKUP_FAILED) { 18589 if (need_decref) 18590 CONN_DEC_REF(connp); 18591 freemsg(first_mp); 18592 return; 18593 } 18594 } 18595 18596 /* is packet multicast? */ 18597 if (CLASSD(dst)) 18598 goto multicast; 18599 18600 if ((connp->conn_dontroute) || (connp->conn_xmit_if_ill != NULL) || 18601 (connp->conn_nexthop_set)) { 18602 /* 18603 * If the destination is a broadcast or a loopback 18604 * address, SO_DONTROUTE, IP_XMIT_IF and IP_NEXTHOP go 18605 * through the standard path. But in the case of local 18606 * destination only SO_DONTROUTE and IP_NEXTHOP go through 18607 * the standard path not IP_XMIT_IF. 18608 */ 18609 ire = ire_cache_lookup(dst, zoneid, MBLK_GETLABEL(mp)); 18610 if ((ire == NULL) || ((ire->ire_type != IRE_BROADCAST) && 18611 (ire->ire_type != IRE_LOOPBACK))) { 18612 if ((connp->conn_dontroute || 18613 connp->conn_nexthop_set) && (ire != NULL) && 18614 (ire->ire_type == IRE_LOCAL)) 18615 goto standard_path; 18616 18617 if (ire != NULL) { 18618 ire_refrele(ire); 18619 /* No more access to ire */ 18620 ire = NULL; 18621 } 18622 /* 18623 * bypass routing checks and go directly to 18624 * interface. 
18625 */ 18626 if (connp->conn_dontroute) { 18627 goto dontroute; 18628 } else if (connp->conn_nexthop_set) { 18629 ip_nexthop = B_TRUE; 18630 nexthop_addr = connp->conn_nexthop_v4; 18631 goto send_from_ill; 18632 } 18633 18634 /* 18635 * If IP_XMIT_IF socket option is set, 18636 * then we allow unicast and multicast 18637 * packets to go through the ill. It is 18638 * quite possible that the destination 18639 * is not in the ire cache table and we 18640 * do not want to go to ip_newroute() 18641 * instead we call ip_newroute_ipif. 18642 */ 18643 xmit_ill = conn_get_held_ill(connp, 18644 &connp->conn_xmit_if_ill, &err); 18645 if (err == ILL_LOOKUP_FAILED) { 18646 if (attach_ill != NULL) 18647 ill_refrele(attach_ill); 18648 if (need_decref) 18649 CONN_DEC_REF(connp); 18650 freemsg(first_mp); 18651 return; 18652 } 18653 goto send_from_ill; 18654 } 18655 standard_path: 18656 /* Must be a broadcast, a loopback or a local ire */ 18657 if (ire != NULL) { 18658 ire_refrele(ire); 18659 /* No more access to ire */ 18660 ire = NULL; 18661 } 18662 } 18663 18664 if (attach_ill != NULL) 18665 goto send_from_ill; 18666 18667 /* 18668 * We cache IRE_CACHEs to avoid lookups. We don't do 18669 * this for the tcp global queue and listen end point 18670 * as it does not really have a real destination to 18671 * talk to. This is also true for SCTP. 18672 */ 18673 if (IP_FLOW_CONTROLLED_ULP(connp->conn_ulp) && 18674 !connp->conn_fully_bound) { 18675 ire = ire_cache_lookup(dst, zoneid, MBLK_GETLABEL(mp)); 18676 if (ire == NULL) 18677 goto noirefound; 18678 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 18679 "ip_wput_end: q %p (%S)", q, "end"); 18680 18681 /* 18682 * Check if the ire has the RTF_MULTIRT flag, inherited 18683 * from an IRE_OFFSUBNET ire entry in ip_newroute(). 18684 */ 18685 if (ire->ire_flags & RTF_MULTIRT) { 18686 18687 /* 18688 * Force the TTL of multirouted packets if required. 18689 * The TTL of such packets is bounded by the 18690 * ip_multirt_ttl ndd variable. 
18691 */ 18692 if ((ip_multirt_ttl > 0) && 18693 (ipha->ipha_ttl > ip_multirt_ttl)) { 18694 ip2dbg(("ip_wput: forcing multirt TTL to %d " 18695 "(was %d), dst 0x%08x\n", 18696 ip_multirt_ttl, ipha->ipha_ttl, 18697 ntohl(ire->ire_addr))); 18698 ipha->ipha_ttl = ip_multirt_ttl; 18699 } 18700 /* 18701 * We look at this point if there are pending 18702 * unresolved routes. ire_multirt_resolvable() 18703 * checks in O(n) that all IRE_OFFSUBNET ire 18704 * entries for the packet's destination and 18705 * flagged RTF_MULTIRT are currently resolved. 18706 * If some remain unresolved, we make a copy 18707 * of the current message. It will be used 18708 * to initiate additional route resolutions. 18709 */ 18710 multirt_need_resolve = 18711 ire_multirt_need_resolve(ire->ire_addr, 18712 MBLK_GETLABEL(first_mp)); 18713 ip2dbg(("ip_wput[TCP]: ire %p, " 18714 "multirt_need_resolve %d, first_mp %p\n", 18715 (void *)ire, multirt_need_resolve, 18716 (void *)first_mp)); 18717 if (multirt_need_resolve) { 18718 copy_mp = copymsg(first_mp); 18719 if (copy_mp != NULL) { 18720 MULTIRT_DEBUG_TAG(copy_mp); 18721 } 18722 } 18723 } 18724 18725 ip_wput_ire(q, first_mp, ire, connp, caller); 18726 18727 /* 18728 * Try to resolve another multiroute if 18729 * ire_multirt_need_resolve() deemed it necessary. 18730 */ 18731 if (copy_mp != NULL) { 18732 ip_newroute(q, copy_mp, dst, NULL, connp); 18733 } 18734 if (need_decref) 18735 CONN_DEC_REF(connp); 18736 return; 18737 } 18738 18739 /* 18740 * Access to conn_ire_cache. (protected by conn_lock) 18741 * 18742 * IRE_MARK_CONDEMNED is marked in ire_delete. We don't grab 18743 * the ire bucket lock here to check for CONDEMNED as it is okay to 18744 * send a packet or two with the IRE_CACHE that is going away. 18745 * Access to the ire requires an ire refhold on the ire prior to 18746 * its use since an interface unplumb thread may delete the cached 18747 * ire and release the refhold at any time. 
18748 * 18749 * Caching an ire in the conn_ire_cache 18750 * 18751 * o Caching an ire pointer in the conn requires a strict check for 18752 * IRE_MARK_CONDEMNED. An interface unplumb thread deletes all relevant 18753 * ires before cleaning up the conns. So the caching of an ire pointer 18754 * in the conn is done after making sure under the bucket lock that the 18755 * ire has not yet been marked CONDEMNED. Otherwise we will end up 18756 * caching an ire after the unplumb thread has cleaned up the conn. 18757 * If the conn does not send a packet subsequently the unplumb thread 18758 * will be hanging waiting for the ire count to drop to zero. 18759 * 18760 * o We also need to atomically test for a null conn_ire_cache and 18761 * set the conn_ire_cache under the the protection of the conn_lock 18762 * to avoid races among concurrent threads trying to simultaneously 18763 * cache an ire in the conn_ire_cache. 18764 */ 18765 mutex_enter(&connp->conn_lock); 18766 ire = sctp_ire != NULL ? sctp_ire : connp->conn_ire_cache; 18767 18768 if (ire != NULL && ire->ire_addr == dst && 18769 !(ire->ire_marks & IRE_MARK_CONDEMNED)) { 18770 18771 IRE_REFHOLD(ire); 18772 mutex_exit(&connp->conn_lock); 18773 18774 } else { 18775 boolean_t cached = B_FALSE; 18776 connp->conn_ire_cache = NULL; 18777 mutex_exit(&connp->conn_lock); 18778 /* Release the old ire */ 18779 if (ire != NULL && sctp_ire == NULL) 18780 IRE_REFRELE_NOTR(ire); 18781 18782 ire = ire_cache_lookup(dst, zoneid, MBLK_GETLABEL(mp)); 18783 if (ire == NULL) 18784 goto noirefound; 18785 IRE_REFHOLD_NOTR(ire); 18786 18787 mutex_enter(&connp->conn_lock); 18788 if (!(connp->conn_state_flags & CONN_CLOSING) && 18789 connp->conn_ire_cache == NULL) { 18790 rw_enter(&ire->ire_bucket->irb_lock, RW_READER); 18791 if (!(ire->ire_marks & IRE_MARK_CONDEMNED)) { 18792 connp->conn_ire_cache = ire; 18793 cached = B_TRUE; 18794 } 18795 rw_exit(&ire->ire_bucket->irb_lock); 18796 } 18797 mutex_exit(&connp->conn_lock); 18798 18799 /* 18800 * 
We can continue to use the ire but since it was 18801 * not cached, we should drop the extra reference. 18802 */ 18803 if (!cached) 18804 IRE_REFRELE_NOTR(ire); 18805 } 18806 18807 18808 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 18809 "ip_wput_end: q %p (%S)", q, "end"); 18810 18811 /* 18812 * Check if the ire has the RTF_MULTIRT flag, inherited 18813 * from an IRE_OFFSUBNET ire entry in ip_newroute(). 18814 */ 18815 if (ire->ire_flags & RTF_MULTIRT) { 18816 18817 /* 18818 * Force the TTL of multirouted packets if required. 18819 * The TTL of such packets is bounded by the 18820 * ip_multirt_ttl ndd variable. 18821 */ 18822 if ((ip_multirt_ttl > 0) && 18823 (ipha->ipha_ttl > ip_multirt_ttl)) { 18824 ip2dbg(("ip_wput: forcing multirt TTL to %d " 18825 "(was %d), dst 0x%08x\n", 18826 ip_multirt_ttl, ipha->ipha_ttl, 18827 ntohl(ire->ire_addr))); 18828 ipha->ipha_ttl = ip_multirt_ttl; 18829 } 18830 18831 /* 18832 * At this point, we check to see if there are any pending 18833 * unresolved routes. ire_multirt_resolvable() 18834 * checks in O(n) that all IRE_OFFSUBNET ire 18835 * entries for the packet's destination and 18836 * flagged RTF_MULTIRT are currently resolved. 18837 * If some remain unresolved, we make a copy 18838 * of the current message. It will be used 18839 * to initiate additional route resolutions. 
18840 */ 18841 multirt_need_resolve = ire_multirt_need_resolve(ire->ire_addr, 18842 MBLK_GETLABEL(first_mp)); 18843 ip2dbg(("ip_wput[not TCP]: ire %p, " 18844 "multirt_need_resolve %d, first_mp %p\n", 18845 (void *)ire, multirt_need_resolve, (void *)first_mp)); 18846 if (multirt_need_resolve) { 18847 copy_mp = copymsg(first_mp); 18848 if (copy_mp != NULL) { 18849 MULTIRT_DEBUG_TAG(copy_mp); 18850 } 18851 } 18852 } 18853 18854 ip_wput_ire(q, first_mp, ire, connp, caller); 18855 18856 /* 18857 * Try to resolve another multiroute if 18858 * ire_multirt_resolvable() deemed it necessary 18859 */ 18860 if (copy_mp != NULL) { 18861 ip_newroute(q, copy_mp, dst, NULL, connp); 18862 } 18863 if (need_decref) 18864 CONN_DEC_REF(connp); 18865 return; 18866 18867 qnext: 18868 /* 18869 * Upper Level Protocols pass down complete IP datagrams 18870 * as M_DATA messages. Everything else is a sideshow. 18871 * 18872 * 1) We could be re-entering ip_wput because of ip_neworute 18873 * in which case we could have a IPSEC_OUT message. We 18874 * need to pass through ip_wput like other datagrams and 18875 * hence cannot branch to ip_wput_nondata. 18876 * 18877 * 2) ARP, AH, ESP, and other clients who are on the module 18878 * instance of IP stream, give us something to deal with. 18879 * We will handle AH and ESP here and rest in ip_wput_nondata. 18880 * 18881 * 3) ICMP replies also could come here. 18882 */ 18883 if (DB_TYPE(mp) != M_DATA) { 18884 notdata: 18885 if (DB_TYPE(mp) == M_CTL) { 18886 /* 18887 * M_CTL messages are used by ARP, AH and ESP to 18888 * communicate with IP. We deal with IPSEC_IN and 18889 * IPSEC_OUT here. ip_wput_nondata handles other 18890 * cases. 
18891 */ 18892 ipsec_info_t *ii = (ipsec_info_t *)mp->b_rptr; 18893 if (mp->b_cont && (mp->b_cont->b_flag & MSGHASREF)) { 18894 first_mp = mp->b_cont; 18895 first_mp->b_flag &= ~MSGHASREF; 18896 ASSERT(connp->conn_ulp == IPPROTO_SCTP); 18897 SCTP_EXTRACT_IPINFO(first_mp, sctp_ire); 18898 CONN_DEC_REF(connp); 18899 connp = NULL; 18900 } 18901 if (ii->ipsec_info_type == IPSEC_IN) { 18902 /* 18903 * Either this message goes back to 18904 * IPSEC for further processing or to 18905 * ULP after policy checks. 18906 */ 18907 ip_fanout_proto_again(mp, NULL, NULL, NULL); 18908 return; 18909 } else if (ii->ipsec_info_type == IPSEC_OUT) { 18910 io = (ipsec_out_t *)ii; 18911 if (io->ipsec_out_proc_begin) { 18912 /* 18913 * IPSEC processing has already started. 18914 * Complete it. 18915 * IPQoS notes: We don't care what is 18916 * in ipsec_out_ill_index since this 18917 * won't be processed for IPQoS policies 18918 * in ipsec_out_process. 18919 */ 18920 ipsec_out_process(q, mp, NULL, 18921 io->ipsec_out_ill_index); 18922 return; 18923 } else { 18924 connp = (q->q_next != NULL) ? 18925 NULL : Q_TO_CONN(q); 18926 first_mp = mp; 18927 mp = mp->b_cont; 18928 mctl_present = B_TRUE; 18929 } 18930 zoneid = io->ipsec_out_zoneid; 18931 ASSERT(zoneid != ALL_ZONES); 18932 } else if (ii->ipsec_info_type == IPSEC_CTL) { 18933 /* 18934 * It's an IPsec control message requesting 18935 * an SADB update to be sent to the IPsec 18936 * hardware acceleration capable ills. 18937 */ 18938 ipsec_ctl_t *ipsec_ctl = 18939 (ipsec_ctl_t *)mp->b_rptr; 18940 ipsa_t *sa = (ipsa_t *)ipsec_ctl->ipsec_ctl_sa; 18941 uint_t satype = ipsec_ctl->ipsec_ctl_sa_type; 18942 mblk_t *cmp = mp->b_cont; 18943 18944 ASSERT(MBLKL(mp) >= sizeof (ipsec_ctl_t)); 18945 ASSERT(cmp != NULL); 18946 18947 freeb(mp); 18948 ill_ipsec_capab_send_all(satype, cmp, sa); 18949 return; 18950 } else { 18951 /* 18952 * This must be ARP or special TSOL signaling. 
18953 */ 18954 ip_wput_nondata(NULL, q, mp, NULL); 18955 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 18956 "ip_wput_end: q %p (%S)", q, "nondata"); 18957 return; 18958 } 18959 } else { 18960 /* 18961 * This must be non-(ARP/AH/ESP) messages. 18962 */ 18963 ASSERT(!need_decref); 18964 ip_wput_nondata(NULL, q, mp, NULL); 18965 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 18966 "ip_wput_end: q %p (%S)", q, "nondata"); 18967 return; 18968 } 18969 } else { 18970 first_mp = mp; 18971 mctl_present = B_FALSE; 18972 } 18973 18974 ASSERT(first_mp != NULL); 18975 /* 18976 * ICMP echo replies attach an ipsec_out and set ipsec_out_attach_if 18977 * to make sure that this packet goes out on the same interface it 18978 * came in. We handle that here. 18979 */ 18980 if (mctl_present) { 18981 uint_t ifindex; 18982 18983 io = (ipsec_out_t *)first_mp->b_rptr; 18984 if (io->ipsec_out_attach_if || 18985 io->ipsec_out_xmit_if || 18986 io->ipsec_out_ip_nexthop) { 18987 ill_t *ill; 18988 18989 /* 18990 * We may have lost the conn context if we are 18991 * coming here from ip_newroute(). Copy the 18992 * nexthop information. 18993 */ 18994 if (io->ipsec_out_ip_nexthop) { 18995 ip_nexthop = B_TRUE; 18996 nexthop_addr = io->ipsec_out_nexthop_addr; 18997 18998 ipha = (ipha_t *)mp->b_rptr; 18999 dst = ipha->ipha_dst; 19000 goto send_from_ill; 19001 } else { 19002 ASSERT(io->ipsec_out_ill_index != 0); 19003 ifindex = io->ipsec_out_ill_index; 19004 ill = ill_lookup_on_ifindex(ifindex, B_FALSE, 19005 NULL, NULL, NULL, NULL); 19006 /* 19007 * ipsec_out_xmit_if bit is used to tell 19008 * ip_wput to use the ill to send outgoing data 19009 * as we have no conn when data comes from ICMP 19010 * error msg routines. Currently this feature is 19011 * only used by ip_mrtun_forward routine. 
19012 */ 19013 if (io->ipsec_out_xmit_if) { 19014 xmit_ill = ill; 19015 if (xmit_ill == NULL) { 19016 ip1dbg(("ip_output:bad ifindex " 19017 "for xmit_ill %d\n", 19018 ifindex)); 19019 freemsg(first_mp); 19020 BUMP_MIB(&ip_mib, 19021 ipOutDiscards); 19022 ASSERT(!need_decref); 19023 return; 19024 } 19025 /* Free up the ipsec_out_t mblk */ 19026 ASSERT(first_mp->b_cont == mp); 19027 first_mp->b_cont = NULL; 19028 freeb(first_mp); 19029 /* Just send the IP header+ICMP+data */ 19030 first_mp = mp; 19031 ipha = (ipha_t *)mp->b_rptr; 19032 dst = ipha->ipha_dst; 19033 goto send_from_ill; 19034 } else { 19035 attach_ill = ill; 19036 } 19037 19038 if (attach_ill == NULL) { 19039 ASSERT(xmit_ill == NULL); 19040 ip1dbg(("ip_output: bad ifindex for " 19041 "(BIND TO IPIF_NOFAILOVER) %d\n", 19042 ifindex)); 19043 freemsg(first_mp); 19044 BUMP_MIB(&ip_mib, ipOutDiscards); 19045 ASSERT(!need_decref); 19046 return; 19047 } 19048 } 19049 } 19050 } 19051 19052 ASSERT(xmit_ill == NULL); 19053 19054 /* We have a complete IP datagram heading outbound. */ 19055 ipha = (ipha_t *)mp->b_rptr; 19056 19057 #ifndef SPEED_BEFORE_SAFETY 19058 /* 19059 * Make sure we have a full-word aligned message and that at least 19060 * a simple IP header is accessible in the first message. If not, 19061 * try a pullup. 19062 */ 19063 if (!OK_32PTR(rptr) || 19064 (mp->b_wptr - rptr) < IP_SIMPLE_HDR_LENGTH) { 19065 hdrtoosmall: 19066 if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) { 19067 BUMP_MIB(&ip_mib, ipOutDiscards); 19068 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 19069 "ip_wput_end: q %p (%S)", q, "pullupfailed"); 19070 if (first_mp == NULL) 19071 first_mp = mp; 19072 goto drop_pkt; 19073 } 19074 19075 /* This function assumes that mp points to an IPv4 packet. 
*/ 19076 if (is_system_labeled() && q->q_next == NULL && 19077 (*mp->b_rptr & 0xf0) == (IPV4_VERSION << 4) && 19078 !connp->conn_ulp_labeled) { 19079 err = tsol_check_label(BEST_CRED(mp, connp), &mp, 19080 &adjust, connp->conn_mac_exempt); 19081 ipha = (ipha_t *)mp->b_rptr; 19082 if (first_mp != NULL) 19083 first_mp->b_cont = mp; 19084 if (err != 0) { 19085 if (first_mp == NULL) 19086 first_mp = mp; 19087 if (err == EINVAL) 19088 goto icmp_parameter_problem; 19089 ip2dbg(("ip_wput: label check failed (%d)\n", 19090 err)); 19091 goto drop_pkt; 19092 } 19093 iplen = ntohs(ipha->ipha_length) + adjust; 19094 ipha->ipha_length = htons(iplen); 19095 } 19096 19097 ipha = (ipha_t *)mp->b_rptr; 19098 if (first_mp == NULL) { 19099 ASSERT(attach_ill == NULL && xmit_ill == NULL); 19100 /* 19101 * If we got here because of "goto hdrtoosmall" 19102 * We need to attach a IPSEC_OUT. 19103 */ 19104 if (connp->conn_out_enforce_policy) { 19105 if (((mp = ipsec_attach_ipsec_out(mp, connp, 19106 NULL, ipha->ipha_protocol)) == NULL)) { 19107 if (need_decref) 19108 CONN_DEC_REF(connp); 19109 return; 19110 } else { 19111 ASSERT(mp->b_datap->db_type == M_CTL); 19112 first_mp = mp; 19113 mp = mp->b_cont; 19114 mctl_present = B_TRUE; 19115 } 19116 } else { 19117 first_mp = mp; 19118 mctl_present = B_FALSE; 19119 } 19120 } 19121 } 19122 #endif 19123 19124 /* Most of the code below is written for speed, not readability */ 19125 v_hlen_tos_len = ((uint32_t *)ipha)[0]; 19126 19127 /* 19128 * If ip_newroute() fails, we're going to need a full 19129 * header for the icmp wraparound. 19130 */ 19131 if (V_HLEN != IP_SIMPLE_HDR_VERSION) { 19132 uint_t v_hlen; 19133 version_hdrlen_check: 19134 ASSERT(first_mp != NULL); 19135 v_hlen = V_HLEN; 19136 /* 19137 * siphon off IPv6 packets coming down from transport 19138 * layer modules here. 
19139 * Note: high-order bit carries NUD reachability confirmation 19140 */ 19141 if (((v_hlen >> 4) & 0x7) == IPV6_VERSION) { 19142 /* 19143 * XXX implement a IPv4 and IPv6 packet counter per 19144 * conn and switch when ratio exceeds e.g. 10:1 19145 */ 19146 #ifdef notyet 19147 if (q->q_next == NULL) /* Avoid ill queue */ 19148 ip_setqinfo(RD(q), B_TRUE, B_TRUE); 19149 #endif 19150 BUMP_MIB(&ip_mib, ipOutIPv6); 19151 ASSERT(xmit_ill == NULL); 19152 if (attach_ill != NULL) 19153 ill_refrele(attach_ill); 19154 if (need_decref) 19155 mp->b_flag |= MSGHASREF; 19156 (void) ip_output_v6(connp, first_mp, q, caller); 19157 return; 19158 } 19159 19160 if ((v_hlen >> 4) != IP_VERSION) { 19161 BUMP_MIB(&ip_mib, ipOutDiscards); 19162 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 19163 "ip_wput_end: q %p (%S)", q, "badvers"); 19164 goto drop_pkt; 19165 } 19166 /* 19167 * Is the header length at least 20 bytes? 19168 * 19169 * Are there enough bytes accessible in the header? If 19170 * not, try a pullup. 19171 */ 19172 v_hlen &= 0xF; 19173 v_hlen <<= 2; 19174 if (v_hlen < IP_SIMPLE_HDR_LENGTH) { 19175 BUMP_MIB(&ip_mib, ipOutDiscards); 19176 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 19177 "ip_wput_end: q %p (%S)", q, "badlen"); 19178 goto drop_pkt; 19179 } 19180 if (v_hlen > (mp->b_wptr - rptr)) { 19181 if (!pullupmsg(mp, v_hlen)) { 19182 BUMP_MIB(&ip_mib, ipOutDiscards); 19183 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 19184 "ip_wput_end: q %p (%S)", q, "badpullup2"); 19185 goto drop_pkt; 19186 } 19187 ipha = (ipha_t *)mp->b_rptr; 19188 } 19189 /* 19190 * Move first entry from any source route into ipha_dst and 19191 * verify the options 19192 */ 19193 if (ip_wput_options(q, first_mp, ipha, mctl_present, zoneid)) { 19194 ASSERT(xmit_ill == NULL); 19195 if (attach_ill != NULL) 19196 ill_refrele(attach_ill); 19197 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 19198 "ip_wput_end: q %p (%S)", q, "badopts"); 19199 if (need_decref) 19200 CONN_DEC_REF(connp); 19201 return; 19202 } 19203 } 19204 dst = ipha->ipha_dst; 
19205 19206 /* 19207 * Try to get an IRE_CACHE for the destination address. If we can't, 19208 * we have to run the packet through ip_newroute which will take 19209 * the appropriate action to arrange for an IRE_CACHE, such as querying 19210 * a resolver, or assigning a default gateway, etc. 19211 */ 19212 if (CLASSD(dst)) { 19213 ipif_t *ipif; 19214 uint32_t setsrc = 0; 19215 19216 multicast: 19217 ASSERT(first_mp != NULL); 19218 ASSERT(xmit_ill == NULL); 19219 ip2dbg(("ip_wput: CLASSD\n")); 19220 if (connp == NULL) { 19221 /* 19222 * Use the first good ipif on the ill. 19223 * XXX Should this ever happen? (Appears 19224 * to show up with just ppp and no ethernet due 19225 * to in.rdisc.) 19226 * However, ire_send should be able to 19227 * call ip_wput_ire directly. 19228 * 19229 * XXX Also, this can happen for ICMP and other packets 19230 * with multicast source addresses. Perhaps we should 19231 * fix things so that we drop the packet in question, 19232 * but for now, just run with it. 19233 */ 19234 ill_t *ill = (ill_t *)q->q_ptr; 19235 19236 /* 19237 * Don't honor attach_if for this case. If ill 19238 * is part of the group, ipif could belong to 19239 * any ill and we cannot maintain attach_ill 19240 * and ipif_ill same anymore and the assert 19241 * below would fail. 19242 */ 19243 if (mctl_present) { 19244 io->ipsec_out_ill_index = 0; 19245 io->ipsec_out_attach_if = B_FALSE; 19246 ASSERT(attach_ill != NULL); 19247 ill_refrele(attach_ill); 19248 attach_ill = NULL; 19249 } 19250 19251 ASSERT(attach_ill == NULL); 19252 ipif = ipif_select_source(ill, dst, GLOBAL_ZONEID); 19253 if (ipif == NULL) { 19254 if (need_decref) 19255 CONN_DEC_REF(connp); 19256 freemsg(first_mp); 19257 return; 19258 } 19259 ip1dbg(("ip_wput: CLASSD no CONN: dst 0x%x on %s\n", 19260 ntohl(dst), ill->ill_name)); 19261 } else { 19262 /* 19263 * If both IP_MULTICAST_IF and IP_XMIT_IF are set, 19264 * IP_XMIT_IF is honoured. 
19265 * Block comment above this function explains the 19266 * locking mechanism used here 19267 */ 19268 xmit_ill = conn_get_held_ill(connp, 19269 &connp->conn_xmit_if_ill, &err); 19270 if (err == ILL_LOOKUP_FAILED) { 19271 ip1dbg(("ip_wput: No ill for IP_XMIT_IF\n")); 19272 goto drop_pkt; 19273 } 19274 if (xmit_ill == NULL) { 19275 ipif = conn_get_held_ipif(connp, 19276 &connp->conn_multicast_ipif, &err); 19277 if (err == IPIF_LOOKUP_FAILED) { 19278 ip1dbg(("ip_wput: No ipif for " 19279 "multicast\n")); 19280 BUMP_MIB(&ip_mib, ipOutNoRoutes); 19281 goto drop_pkt; 19282 } 19283 } 19284 if (xmit_ill != NULL) { 19285 ipif = ipif_get_next_ipif(NULL, xmit_ill); 19286 if (ipif == NULL) { 19287 ip1dbg(("ip_wput: No ipif for " 19288 "IP_XMIT_IF\n")); 19289 BUMP_MIB(&ip_mib, ipOutNoRoutes); 19290 goto drop_pkt; 19291 } 19292 } else if (ipif == NULL || ipif->ipif_isv6) { 19293 /* 19294 * We must do this ipif determination here 19295 * else we could pass through ip_newroute 19296 * and come back here without the conn context. 19297 * 19298 * Note: we do late binding i.e. we bind to 19299 * the interface when the first packet is sent. 19300 * For performance reasons we do not rebind on 19301 * each packet but keep the binding until the 19302 * next IP_MULTICAST_IF option. 19303 * 19304 * conn_multicast_{ipif,ill} are shared between 19305 * IPv4 and IPv6 and AF_INET6 sockets can 19306 * send both IPv4 and IPv6 packets. Hence 19307 * we have to check that "isv6" matches above. 
19308 */ 19309 if (ipif != NULL) 19310 ipif_refrele(ipif); 19311 ipif = ipif_lookup_group(dst, zoneid); 19312 if (ipif == NULL) { 19313 ip1dbg(("ip_wput: No ipif for " 19314 "multicast\n")); 19315 BUMP_MIB(&ip_mib, ipOutNoRoutes); 19316 goto drop_pkt; 19317 } 19318 err = conn_set_held_ipif(connp, 19319 &connp->conn_multicast_ipif, ipif); 19320 if (err == IPIF_LOOKUP_FAILED) { 19321 ipif_refrele(ipif); 19322 ip1dbg(("ip_wput: No ipif for " 19323 "multicast\n")); 19324 BUMP_MIB(&ip_mib, ipOutNoRoutes); 19325 goto drop_pkt; 19326 } 19327 } 19328 } 19329 ASSERT(!ipif->ipif_isv6); 19330 /* 19331 * As we may lose the conn by the time we reach ip_wput_ire, 19332 * we copy conn_multicast_loop and conn_dontroute on to an 19333 * ipsec_out. In case if this datagram goes out secure, 19334 * we need the ill_index also. Copy that also into the 19335 * ipsec_out. 19336 */ 19337 if (mctl_present) { 19338 io = (ipsec_out_t *)first_mp->b_rptr; 19339 ASSERT(first_mp->b_datap->db_type == M_CTL); 19340 ASSERT(io->ipsec_out_type == IPSEC_OUT); 19341 } else { 19342 ASSERT(mp == first_mp); 19343 if ((first_mp = allocb(sizeof (ipsec_info_t), 19344 BPRI_HI)) == NULL) { 19345 ipif_refrele(ipif); 19346 first_mp = mp; 19347 goto drop_pkt; 19348 } 19349 first_mp->b_datap->db_type = M_CTL; 19350 first_mp->b_wptr += sizeof (ipsec_info_t); 19351 /* ipsec_out_secure is B_FALSE now */ 19352 bzero(first_mp->b_rptr, sizeof (ipsec_info_t)); 19353 io = (ipsec_out_t *)first_mp->b_rptr; 19354 io->ipsec_out_type = IPSEC_OUT; 19355 io->ipsec_out_len = sizeof (ipsec_out_t); 19356 io->ipsec_out_use_global_policy = B_TRUE; 19357 first_mp->b_cont = mp; 19358 mctl_present = B_TRUE; 19359 } 19360 if (attach_ill != NULL) { 19361 ASSERT(attach_ill == ipif->ipif_ill); 19362 match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR; 19363 19364 /* 19365 * Check if we need an ire that will not be 19366 * looked up by anybody else i.e. HIDDEN. 
19367 */ 19368 if (ill_is_probeonly(attach_ill)) { 19369 match_flags |= MATCH_IRE_MARK_HIDDEN; 19370 } 19371 io->ipsec_out_ill_index = 19372 attach_ill->ill_phyint->phyint_ifindex; 19373 io->ipsec_out_attach_if = B_TRUE; 19374 } else { 19375 match_flags = MATCH_IRE_ILL_GROUP | MATCH_IRE_SECATTR; 19376 io->ipsec_out_ill_index = 19377 ipif->ipif_ill->ill_phyint->phyint_ifindex; 19378 } 19379 if (connp != NULL) { 19380 io->ipsec_out_multicast_loop = 19381 connp->conn_multicast_loop; 19382 io->ipsec_out_dontroute = connp->conn_dontroute; 19383 io->ipsec_out_zoneid = connp->conn_zoneid; 19384 } 19385 /* 19386 * If the application uses IP_MULTICAST_IF with 19387 * different logical addresses of the same ILL, we 19388 * need to make sure that the soruce address of 19389 * the packet matches the logical IP address used 19390 * in the option. We do it by initializing ipha_src 19391 * here. This should keep IPSEC also happy as 19392 * when we return from IPSEC processing, we don't 19393 * have to worry about getting the right address on 19394 * the packet. Thus it is sufficient to look for 19395 * IRE_CACHE using MATCH_IRE_ILL rathen than 19396 * MATCH_IRE_IPIF. 19397 * 19398 * NOTE : We need to do it for non-secure case also as 19399 * this might go out secure if there is a global policy 19400 * match in ip_wput_ire. For bind to IPIF_NOFAILOVER 19401 * address, the source should be initialized already and 19402 * hence we won't be initializing here. 19403 * 19404 * As we do not have the ire yet, it is possible that 19405 * we set the source address here and then later discover 19406 * that the ire implies the source address to be assigned 19407 * through the RTF_SETSRC flag. 19408 * In that case, the setsrc variable will remind us 19409 * that overwritting the source address by the one 19410 * of the RTF_SETSRC-flagged ire is allowed. 
19411 */ 19412 if (ipha->ipha_src == INADDR_ANY && 19413 (connp == NULL || !connp->conn_unspec_src)) { 19414 ipha->ipha_src = ipif->ipif_src_addr; 19415 setsrc = RTF_SETSRC; 19416 } 19417 /* 19418 * Find an IRE which matches the destination and the outgoing 19419 * queue (i.e. the outgoing interface.) 19420 * For loopback use a unicast IP address for 19421 * the ire lookup. 19422 */ 19423 if (ipif->ipif_ill->ill_phyint->phyint_flags & 19424 PHYI_LOOPBACK) { 19425 dst = ipif->ipif_lcl_addr; 19426 } 19427 /* 19428 * If IP_XMIT_IF is set, we branch out to ip_newroute_ipif. 19429 * We don't need to lookup ire in ctable as the packet 19430 * needs to be sent to the destination through the specified 19431 * ill irrespective of ires in the cache table. 19432 */ 19433 ire = NULL; 19434 if (xmit_ill == NULL) { 19435 ire = ire_ctable_lookup(dst, 0, 0, ipif, 19436 zoneid, MBLK_GETLABEL(mp), match_flags); 19437 } 19438 19439 /* 19440 * refrele attach_ill as its not needed anymore. 19441 */ 19442 if (attach_ill != NULL) { 19443 ill_refrele(attach_ill); 19444 attach_ill = NULL; 19445 } 19446 19447 if (ire == NULL) { 19448 /* 19449 * Multicast loopback and multicast forwarding is 19450 * done in ip_wput_ire. 19451 * 19452 * Mark this packet to make it be delivered to 19453 * ip_wput_ire after the new ire has been 19454 * created. 19455 * 19456 * The call to ip_newroute_ipif takes into account 19457 * the setsrc reminder. In any case, we take care 19458 * of the RTF_MULTIRT flag. 
19459 */ 19460 mp->b_prev = mp->b_next = NULL; 19461 if (xmit_ill == NULL || 19462 xmit_ill->ill_ipif_up_count > 0) { 19463 ip_newroute_ipif(q, first_mp, ipif, dst, connp, 19464 setsrc | RTF_MULTIRT); 19465 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 19466 "ip_wput_end: q %p (%S)", q, "noire"); 19467 } else { 19468 freemsg(first_mp); 19469 } 19470 ipif_refrele(ipif); 19471 if (xmit_ill != NULL) 19472 ill_refrele(xmit_ill); 19473 if (need_decref) 19474 CONN_DEC_REF(connp); 19475 return; 19476 } 19477 19478 ipif_refrele(ipif); 19479 ipif = NULL; 19480 ASSERT(xmit_ill == NULL); 19481 19482 /* 19483 * Honor the RTF_SETSRC flag for multicast packets, 19484 * if allowed by the setsrc reminder. 19485 */ 19486 if ((ire->ire_flags & RTF_SETSRC) && setsrc) { 19487 ipha->ipha_src = ire->ire_src_addr; 19488 } 19489 19490 /* 19491 * Unconditionally force the TTL to 1 for 19492 * multirouted multicast packets: 19493 * multirouted multicast should not cross 19494 * multicast routers. 19495 */ 19496 if (ire->ire_flags & RTF_MULTIRT) { 19497 if (ipha->ipha_ttl > 1) { 19498 ip2dbg(("ip_wput: forcing multicast " 19499 "multirt TTL to 1 (was %d), dst 0x%08x\n", 19500 ipha->ipha_ttl, ntohl(ire->ire_addr))); 19501 ipha->ipha_ttl = 1; 19502 } 19503 } 19504 } else { 19505 ire = ire_cache_lookup(dst, zoneid, MBLK_GETLABEL(mp)); 19506 if ((ire != NULL) && (ire->ire_type & 19507 (IRE_BROADCAST | IRE_LOCAL | IRE_LOOPBACK))) { 19508 ignore_dontroute = B_TRUE; 19509 ignore_nexthop = B_TRUE; 19510 } 19511 if (ire != NULL) { 19512 ire_refrele(ire); 19513 ire = NULL; 19514 } 19515 /* 19516 * Guard against coming in from arp in which case conn is NULL. 19517 * Also guard against non M_DATA with dontroute set but 19518 * destined to local, loopback or broadcast addresses. 
19519 */ 19520 if (connp != NULL && connp->conn_dontroute && 19521 !ignore_dontroute) { 19522 dontroute: 19523 /* 19524 * Set TTL to 1 if SO_DONTROUTE is set to prevent 19525 * routing protocols from seeing false direct 19526 * connectivity. 19527 */ 19528 ipha->ipha_ttl = 1; 19529 /* 19530 * If IP_XMIT_IF is also set (conn_xmit_if_ill != NULL) 19531 * along with SO_DONTROUTE, higher precedence is 19532 * given to IP_XMIT_IF and the IP_XMIT_IF ipif is used. 19533 */ 19534 if (connp->conn_xmit_if_ill == NULL) { 19535 /* If suitable ipif not found, drop packet */ 19536 dst_ipif = ipif_lookup_onlink_addr(dst, zoneid); 19537 if (dst_ipif == NULL) { 19538 ip1dbg(("ip_wput: no route for " 19539 "dst using SO_DONTROUTE\n")); 19540 BUMP_MIB(&ip_mib, ipOutNoRoutes); 19541 mp->b_prev = mp->b_next = NULL; 19542 if (first_mp == NULL) 19543 first_mp = mp; 19544 goto drop_pkt; 19545 } else { 19546 /* 19547 * If suitable ipif has been found, set 19548 * xmit_ill to the corresponding 19549 * ipif_ill because we'll be following 19550 * the IP_XMIT_IF logic. 19551 */ 19552 ASSERT(xmit_ill == NULL); 19553 xmit_ill = dst_ipif->ipif_ill; 19554 mutex_enter(&xmit_ill->ill_lock); 19555 if (!ILL_CAN_LOOKUP(xmit_ill)) { 19556 mutex_exit(&xmit_ill->ill_lock); 19557 xmit_ill = NULL; 19558 ipif_refrele(dst_ipif); 19559 ip1dbg(("ip_wput: no route for" 19560 " dst using" 19561 " SO_DONTROUTE\n")); 19562 BUMP_MIB(&ip_mib, 19563 ipOutNoRoutes); 19564 mp->b_prev = mp->b_next = NULL; 19565 if (first_mp == NULL) 19566 first_mp = mp; 19567 goto drop_pkt; 19568 } 19569 ill_refhold_locked(xmit_ill); 19570 mutex_exit(&xmit_ill->ill_lock); 19571 ipif_refrele(dst_ipif); 19572 } 19573 } 19574 19575 } 19576 /* 19577 * If we are bound to IPIF_NOFAILOVER address, look for 19578 * an IRE_CACHE matching the ill. 
19579 */ 19580 send_from_ill: 19581 if (attach_ill != NULL) { 19582 ipif_t *attach_ipif; 19583 19584 match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR; 19585 19586 /* 19587 * Check if we need an ire that will not be 19588 * looked up by anybody else i.e. HIDDEN. 19589 */ 19590 if (ill_is_probeonly(attach_ill)) { 19591 match_flags |= MATCH_IRE_MARK_HIDDEN; 19592 } 19593 19594 attach_ipif = ipif_get_next_ipif(NULL, attach_ill); 19595 if (attach_ipif == NULL) { 19596 ip1dbg(("ip_wput: No ipif for attach_ill\n")); 19597 goto drop_pkt; 19598 } 19599 ire = ire_ctable_lookup(dst, 0, 0, attach_ipif, 19600 zoneid, MBLK_GETLABEL(mp), match_flags); 19601 ipif_refrele(attach_ipif); 19602 } else if (xmit_ill != NULL || (connp != NULL && 19603 connp->conn_xmit_if_ill != NULL)) { 19604 /* 19605 * Mark this packet as originated locally 19606 */ 19607 mp->b_prev = mp->b_next = NULL; 19608 /* 19609 * xmit_ill could be NULL if SO_DONTROUTE 19610 * is also set. 19611 */ 19612 if (xmit_ill == NULL) { 19613 xmit_ill = conn_get_held_ill(connp, 19614 &connp->conn_xmit_if_ill, &err); 19615 if (err == ILL_LOOKUP_FAILED) { 19616 if (need_decref) 19617 CONN_DEC_REF(connp); 19618 freemsg(first_mp); 19619 return; 19620 } 19621 if (xmit_ill == NULL) { 19622 if (connp->conn_dontroute) 19623 goto dontroute; 19624 goto send_from_ill; 19625 } 19626 } 19627 /* 19628 * could be SO_DONTROUTE case also. 
19629 * check at least one interface is UP as 19630 * spcified by this ILL, and then call 19631 * ip_newroute_ipif() 19632 */ 19633 if (xmit_ill->ill_ipif_up_count > 0) { 19634 ipif_t *ipif; 19635 19636 ipif = ipif_get_next_ipif(NULL, xmit_ill); 19637 if (ipif != NULL) { 19638 ip_newroute_ipif(q, first_mp, ipif, 19639 dst, connp, 0); 19640 ipif_refrele(ipif); 19641 ip1dbg(("ip_wput: ip_unicast_if\n")); 19642 } 19643 } else { 19644 freemsg(first_mp); 19645 } 19646 ill_refrele(xmit_ill); 19647 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 19648 "ip_wput_end: q %p (%S)", q, "unicast_if"); 19649 if (need_decref) 19650 CONN_DEC_REF(connp); 19651 return; 19652 } else if (ip_nexthop || (connp != NULL && 19653 (connp->conn_nexthop_set)) && !ignore_nexthop) { 19654 if (!ip_nexthop) { 19655 ip_nexthop = B_TRUE; 19656 nexthop_addr = connp->conn_nexthop_v4; 19657 } 19658 match_flags = MATCH_IRE_MARK_PRIVATE_ADDR | 19659 MATCH_IRE_GW; 19660 ire = ire_ctable_lookup(dst, nexthop_addr, 0, 19661 NULL, zoneid, MBLK_GETLABEL(mp), match_flags); 19662 } else { 19663 ire = ire_cache_lookup(dst, zoneid, MBLK_GETLABEL(mp)); 19664 } 19665 if (!ire) { 19666 /* 19667 * Make sure we don't load spread if this 19668 * is IPIF_NOFAILOVER case. 
19669 */ 19670 if ((attach_ill != NULL) || 19671 (ip_nexthop && !ignore_nexthop)) { 19672 if (mctl_present) { 19673 io = (ipsec_out_t *)first_mp->b_rptr; 19674 ASSERT(first_mp->b_datap->db_type == 19675 M_CTL); 19676 ASSERT(io->ipsec_out_type == IPSEC_OUT); 19677 } else { 19678 ASSERT(mp == first_mp); 19679 first_mp = allocb( 19680 sizeof (ipsec_info_t), BPRI_HI); 19681 if (first_mp == NULL) { 19682 first_mp = mp; 19683 goto drop_pkt; 19684 } 19685 first_mp->b_datap->db_type = M_CTL; 19686 first_mp->b_wptr += 19687 sizeof (ipsec_info_t); 19688 /* ipsec_out_secure is B_FALSE now */ 19689 bzero(first_mp->b_rptr, 19690 sizeof (ipsec_info_t)); 19691 io = (ipsec_out_t *)first_mp->b_rptr; 19692 io->ipsec_out_type = IPSEC_OUT; 19693 io->ipsec_out_len = 19694 sizeof (ipsec_out_t); 19695 io->ipsec_out_use_global_policy = 19696 B_TRUE; 19697 first_mp->b_cont = mp; 19698 mctl_present = B_TRUE; 19699 } 19700 if (attach_ill != NULL) { 19701 io->ipsec_out_ill_index = attach_ill-> 19702 ill_phyint->phyint_ifindex; 19703 io->ipsec_out_attach_if = B_TRUE; 19704 } else { 19705 io->ipsec_out_ip_nexthop = ip_nexthop; 19706 io->ipsec_out_nexthop_addr = 19707 nexthop_addr; 19708 } 19709 } 19710 noirefound: 19711 /* 19712 * Mark this packet as having originated on 19713 * this machine. This will be noted in 19714 * ire_add_then_send, which needs to know 19715 * whether to run it back through ip_wput or 19716 * ip_rput following successful resolution. 19717 */ 19718 mp->b_prev = NULL; 19719 mp->b_next = NULL; 19720 ip_newroute(q, first_mp, dst, NULL, connp); 19721 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 19722 "ip_wput_end: q %p (%S)", q, "newroute"); 19723 if (attach_ill != NULL) 19724 ill_refrele(attach_ill); 19725 if (xmit_ill != NULL) 19726 ill_refrele(xmit_ill); 19727 if (need_decref) 19728 CONN_DEC_REF(connp); 19729 return; 19730 } 19731 } 19732 19733 /* We now know where we are going with it. 
*/ 19734 19735 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 19736 "ip_wput_end: q %p (%S)", q, "end"); 19737 19738 /* 19739 * Check if the ire has the RTF_MULTIRT flag, inherited 19740 * from an IRE_OFFSUBNET ire entry in ip_newroute. 19741 */ 19742 if (ire->ire_flags & RTF_MULTIRT) { 19743 /* 19744 * Force the TTL of multirouted packets if required. 19745 * The TTL of such packets is bounded by the 19746 * ip_multirt_ttl ndd variable. 19747 */ 19748 if ((ip_multirt_ttl > 0) && 19749 (ipha->ipha_ttl > ip_multirt_ttl)) { 19750 ip2dbg(("ip_wput: forcing multirt TTL to %d " 19751 "(was %d), dst 0x%08x\n", 19752 ip_multirt_ttl, ipha->ipha_ttl, 19753 ntohl(ire->ire_addr))); 19754 ipha->ipha_ttl = ip_multirt_ttl; 19755 } 19756 /* 19757 * At this point, we check to see if there are any pending 19758 * unresolved routes. ire_multirt_resolvable() 19759 * checks in O(n) that all IRE_OFFSUBNET ire 19760 * entries for the packet's destination and 19761 * flagged RTF_MULTIRT are currently resolved. 19762 * If some remain unresolved, we make a copy 19763 * of the current message. It will be used 19764 * to initiate additional route resolutions. 19765 */ 19766 multirt_need_resolve = ire_multirt_need_resolve(ire->ire_addr, 19767 MBLK_GETLABEL(first_mp)); 19768 ip2dbg(("ip_wput[noirefound]: ire %p, " 19769 "multirt_need_resolve %d, first_mp %p\n", 19770 (void *)ire, multirt_need_resolve, (void *)first_mp)); 19771 if (multirt_need_resolve) { 19772 copy_mp = copymsg(first_mp); 19773 if (copy_mp != NULL) { 19774 MULTIRT_DEBUG_TAG(copy_mp); 19775 } 19776 } 19777 } 19778 19779 ip_wput_ire(q, first_mp, ire, connp, caller); 19780 /* 19781 * Try to resolve another multiroute if 19782 * ire_multirt_resolvable() deemed it necessary. 19783 * At this point, we need to distinguish 19784 * multicasts from other packets. For multicasts, 19785 * we call ip_newroute_ipif() and request that both 19786 * multirouting and setsrc flags are checked. 
19787 */ 19788 if (copy_mp != NULL) { 19789 if (CLASSD(dst)) { 19790 ipif_t *ipif = ipif_lookup_group(dst, zoneid); 19791 if (ipif) { 19792 ip_newroute_ipif(q, copy_mp, ipif, dst, connp, 19793 RTF_SETSRC | RTF_MULTIRT); 19794 ipif_refrele(ipif); 19795 } else { 19796 MULTIRT_DEBUG_UNTAG(copy_mp); 19797 freemsg(copy_mp); 19798 copy_mp = NULL; 19799 } 19800 } else { 19801 ip_newroute(q, copy_mp, dst, NULL, connp); 19802 } 19803 } 19804 if (attach_ill != NULL) 19805 ill_refrele(attach_ill); 19806 if (xmit_ill != NULL) 19807 ill_refrele(xmit_ill); 19808 if (need_decref) 19809 CONN_DEC_REF(connp); 19810 return; 19811 19812 icmp_parameter_problem: 19813 /* could not have originated externally */ 19814 ASSERT(mp->b_prev == NULL); 19815 if (ip_hdr_complete(ipha, zoneid) == 0) { 19816 BUMP_MIB(&ip_mib, ipOutNoRoutes); 19817 /* it's the IP header length that's in trouble */ 19818 icmp_param_problem(q, first_mp, 0); 19819 first_mp = NULL; 19820 } 19821 19822 drop_pkt: 19823 ip1dbg(("ip_wput: dropped packet\n")); 19824 if (ire != NULL) 19825 ire_refrele(ire); 19826 if (need_decref) 19827 CONN_DEC_REF(connp); 19828 freemsg(first_mp); 19829 if (attach_ill != NULL) 19830 ill_refrele(attach_ill); 19831 if (xmit_ill != NULL) 19832 ill_refrele(xmit_ill); 19833 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 19834 "ip_wput_end: q %p (%S)", q, "droppkt"); 19835 } 19836 19837 void 19838 ip_wput(queue_t *q, mblk_t *mp) 19839 { 19840 ip_output(Q_TO_CONN(q), mp, q, IP_WPUT); 19841 } 19842 19843 /* 19844 * 19845 * The following rules must be observed when accessing any ipif or ill 19846 * that has been cached in the conn. Typically conn_nofailover_ill, 19847 * conn_xmit_if_ill, conn_multicast_ipif and conn_multicast_ill. 19848 * 19849 * Access: The ipif or ill pointed to from the conn can be accessed under 19850 * the protection of the conn_lock or after it has been refheld under the 19851 * protection of the conn lock. 
In addition the IPIF_CAN_LOOKUP or 19852 * ILL_CAN_LOOKUP macros must be used before actually doing the refhold. 19853 * The reason for this is that a concurrent unplumb could actually be 19854 * cleaning up these cached pointers by walking the conns and might have 19855 * finished cleaning up the conn in question. The macros check that an 19856 * unplumb has not yet started on the ipif or ill. 19857 * 19858 * Caching: An ipif or ill pointer may be cached in the conn only after 19859 * making sure that an unplumb has not started. So the caching is done 19860 * while holding both the conn_lock and the ill_lock and after using the 19861 * ILL_CAN_LOOKUP/IPIF_CAN_LOOKUP macro. An unplumb will set the ILL_CONDEMNED 19862 * flag before starting the cleanup of conns. 19863 * 19864 * The list of ipifs hanging off the ill is protected by ill_g_lock and ill_lock 19865 * On the other hand to access ipif->ipif_ill, we need one of either ill_g_lock 19866 * or a reference to the ipif or a reference to an ire that references the 19867 * ipif. An ipif does not change its ill except for failover/failback. Since 19868 * failover/failback happens only after bringing down the ipif and making sure 19869 * the ipif refcnt has gone to zero and holding the ill_g_lock and ill_lock 19870 * the above holds. 
19871 */ 19872 ipif_t * 19873 conn_get_held_ipif(conn_t *connp, ipif_t **ipifp, int *err) 19874 { 19875 ipif_t *ipif; 19876 ill_t *ill; 19877 19878 *err = 0; 19879 rw_enter(&ill_g_lock, RW_READER); 19880 mutex_enter(&connp->conn_lock); 19881 ipif = *ipifp; 19882 if (ipif != NULL) { 19883 ill = ipif->ipif_ill; 19884 mutex_enter(&ill->ill_lock); 19885 if (IPIF_CAN_LOOKUP(ipif)) { 19886 ipif_refhold_locked(ipif); 19887 mutex_exit(&ill->ill_lock); 19888 mutex_exit(&connp->conn_lock); 19889 rw_exit(&ill_g_lock); 19890 return (ipif); 19891 } else { 19892 *err = IPIF_LOOKUP_FAILED; 19893 } 19894 mutex_exit(&ill->ill_lock); 19895 } 19896 mutex_exit(&connp->conn_lock); 19897 rw_exit(&ill_g_lock); 19898 return (NULL); 19899 } 19900 19901 ill_t * 19902 conn_get_held_ill(conn_t *connp, ill_t **illp, int *err) 19903 { 19904 ill_t *ill; 19905 19906 *err = 0; 19907 mutex_enter(&connp->conn_lock); 19908 ill = *illp; 19909 if (ill != NULL) { 19910 mutex_enter(&ill->ill_lock); 19911 if (ILL_CAN_LOOKUP(ill)) { 19912 ill_refhold_locked(ill); 19913 mutex_exit(&ill->ill_lock); 19914 mutex_exit(&connp->conn_lock); 19915 return (ill); 19916 } else { 19917 *err = ILL_LOOKUP_FAILED; 19918 } 19919 mutex_exit(&ill->ill_lock); 19920 } 19921 mutex_exit(&connp->conn_lock); 19922 return (NULL); 19923 } 19924 19925 static int 19926 conn_set_held_ipif(conn_t *connp, ipif_t **ipifp, ipif_t *ipif) 19927 { 19928 ill_t *ill; 19929 19930 ill = ipif->ipif_ill; 19931 mutex_enter(&connp->conn_lock); 19932 mutex_enter(&ill->ill_lock); 19933 if (IPIF_CAN_LOOKUP(ipif)) { 19934 *ipifp = ipif; 19935 mutex_exit(&ill->ill_lock); 19936 mutex_exit(&connp->conn_lock); 19937 return (0); 19938 } 19939 mutex_exit(&ill->ill_lock); 19940 mutex_exit(&connp->conn_lock); 19941 return (IPIF_LOOKUP_FAILED); 19942 } 19943 19944 /* 19945 * This is called if the outbound datagram needs fragmentation. 19946 * 19947 * NOTE : This function does not ire_refrele the ire argument passed in. 
19948 */ 19949 static void 19950 ip_wput_ire_fragmentit(mblk_t *ipsec_mp, ire_t *ire) 19951 { 19952 ipha_t *ipha; 19953 mblk_t *mp; 19954 uint32_t v_hlen_tos_len; 19955 uint32_t max_frag; 19956 uint32_t frag_flag; 19957 boolean_t dont_use; 19958 19959 if (ipsec_mp->b_datap->db_type == M_CTL) { 19960 mp = ipsec_mp->b_cont; 19961 } else { 19962 mp = ipsec_mp; 19963 } 19964 19965 ipha = (ipha_t *)mp->b_rptr; 19966 v_hlen_tos_len = ((uint32_t *)ipha)[0]; 19967 19968 #ifdef _BIG_ENDIAN 19969 #define V_HLEN (v_hlen_tos_len >> 24) 19970 #define LENGTH (v_hlen_tos_len & 0xFFFF) 19971 #else 19972 #define V_HLEN (v_hlen_tos_len & 0xFF) 19973 #define LENGTH ((v_hlen_tos_len >> 24) | ((v_hlen_tos_len >> 8) & 0xFF00)) 19974 #endif 19975 19976 #ifndef SPEED_BEFORE_SAFETY 19977 /* 19978 * Check that ipha_length is consistent with 19979 * the mblk length 19980 */ 19981 if (LENGTH != (mp->b_cont ? msgdsize(mp) : mp->b_wptr - rptr)) { 19982 ip0dbg(("Packet length mismatch: %d, %ld\n", 19983 LENGTH, msgdsize(mp))); 19984 freemsg(ipsec_mp); 19985 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 19986 "ip_wput_ire_fragmentit: mp %p (%S)", mp, 19987 "packet length mismatch"); 19988 return; 19989 } 19990 #endif 19991 /* 19992 * Don't use frag_flag if pre-built packet or source 19993 * routed or if multicast (since multicast packets do not solicit 19994 * ICMP "packet too big" messages). Get the values of 19995 * max_frag and frag_flag atomically by acquiring the 19996 * ire_lock. 19997 */ 19998 mutex_enter(&ire->ire_lock); 19999 max_frag = ire->ire_max_frag; 20000 frag_flag = ire->ire_frag_flag; 20001 mutex_exit(&ire->ire_lock); 20002 20003 dont_use = ((ipha->ipha_ident == IP_HDR_INCLUDED) || 20004 (V_HLEN != IP_SIMPLE_HDR_VERSION && 20005 ip_source_route_included(ipha)) || CLASSD(ipha->ipha_dst)); 20006 20007 ip_wput_frag(ire, ipsec_mp, OB_PKT, max_frag, 20008 (dont_use ? 0 : frag_flag)); 20009 } 20010 20011 /* 20012 * Used for deciding the MSS size for the upper layer. 
Thus 20013 * we need to check the outbound policy values in the conn. 20014 */ 20015 int 20016 conn_ipsec_length(conn_t *connp) 20017 { 20018 ipsec_latch_t *ipl; 20019 20020 ipl = connp->conn_latch; 20021 if (ipl == NULL) 20022 return (0); 20023 20024 if (ipl->ipl_out_policy == NULL) 20025 return (0); 20026 20027 return (ipl->ipl_out_policy->ipsp_act->ipa_ovhd); 20028 } 20029 20030 /* 20031 * Returns an estimate of the IPSEC headers size. This is used if 20032 * we don't want to call into IPSEC to get the exact size. 20033 */ 20034 int 20035 ipsec_out_extra_length(mblk_t *ipsec_mp) 20036 { 20037 ipsec_out_t *io = (ipsec_out_t *)ipsec_mp->b_rptr; 20038 ipsec_action_t *a; 20039 20040 ASSERT(io->ipsec_out_type == IPSEC_OUT); 20041 if (!io->ipsec_out_secure) 20042 return (0); 20043 20044 a = io->ipsec_out_act; 20045 20046 if (a == NULL) { 20047 ASSERT(io->ipsec_out_policy != NULL); 20048 a = io->ipsec_out_policy->ipsp_act; 20049 } 20050 ASSERT(a != NULL); 20051 20052 return (a->ipa_ovhd); 20053 } 20054 20055 /* 20056 * Returns an estimate of the IPSEC headers size. This is used if 20057 * we don't want to call into IPSEC to get the exact size. 20058 */ 20059 int 20060 ipsec_in_extra_length(mblk_t *ipsec_mp) 20061 { 20062 ipsec_in_t *ii = (ipsec_in_t *)ipsec_mp->b_rptr; 20063 ipsec_action_t *a; 20064 20065 ASSERT(ii->ipsec_in_type == IPSEC_IN); 20066 20067 a = ii->ipsec_in_action; 20068 return (a == NULL ? 0 : a->ipa_ovhd); 20069 } 20070 20071 /* 20072 * If there are any source route options, return the true final 20073 * destination. Otherwise, return the destination. 
20074 */ 20075 ipaddr_t 20076 ip_get_dst(ipha_t *ipha) 20077 { 20078 ipoptp_t opts; 20079 uchar_t *opt; 20080 uint8_t optval; 20081 uint8_t optlen; 20082 ipaddr_t dst; 20083 uint32_t off; 20084 20085 dst = ipha->ipha_dst; 20086 20087 if (IS_SIMPLE_IPH(ipha)) 20088 return (dst); 20089 20090 for (optval = ipoptp_first(&opts, ipha); 20091 optval != IPOPT_EOL; 20092 optval = ipoptp_next(&opts)) { 20093 opt = opts.ipoptp_cur; 20094 optlen = opts.ipoptp_len; 20095 ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0); 20096 switch (optval) { 20097 case IPOPT_SSRR: 20098 case IPOPT_LSRR: 20099 off = opt[IPOPT_OFFSET]; 20100 /* 20101 * If one of the conditions is true, it means 20102 * end of options and dst already has the right 20103 * value. 20104 */ 20105 if (!(optlen < IP_ADDR_LEN || off > optlen - 3)) { 20106 off = optlen - IP_ADDR_LEN; 20107 bcopy(&opt[off], &dst, IP_ADDR_LEN); 20108 } 20109 return (dst); 20110 default: 20111 break; 20112 } 20113 } 20114 20115 return (dst); 20116 } 20117 20118 mblk_t * 20119 ip_wput_ire_parse_ipsec_out(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h, ire_t *ire, 20120 conn_t *connp, boolean_t unspec_src) 20121 { 20122 ipsec_out_t *io; 20123 mblk_t *first_mp; 20124 boolean_t policy_present; 20125 20126 first_mp = mp; 20127 if (mp->b_datap->db_type == M_CTL) { 20128 io = (ipsec_out_t *)first_mp->b_rptr; 20129 /* 20130 * ip_wput[_v6] attaches an IPSEC_OUT in two cases. 20131 * 20132 * 1) There is per-socket policy (including cached global 20133 * policy). 20134 * 2) There is no per-socket policy, but it is 20135 * a multicast packet that needs to go out 20136 * on a specific interface. This is the case 20137 * where (ip_wput and ip_wput_multicast) attaches 20138 * an IPSEC_OUT and sets ipsec_out_secure B_FALSE. 20139 * 20140 * In case (2) we check with global policy to 20141 * see if there is a match and set the ill_index 20142 * appropriately so that we can lookup the ire 20143 * properly in ip_wput_ipsec_out. 
20144 */ 20145 20146 /* 20147 * ipsec_out_use_global_policy is set to B_FALSE 20148 * in ipsec_in_to_out(). Refer to that function for 20149 * details. 20150 */ 20151 if ((io->ipsec_out_latch == NULL) && 20152 (io->ipsec_out_use_global_policy)) { 20153 return (ip_wput_attach_policy(first_mp, ipha, ip6h, 20154 ire, connp, unspec_src)); 20155 } 20156 if (!io->ipsec_out_secure) { 20157 /* 20158 * If this is not a secure packet, drop 20159 * the IPSEC_OUT mp and treat it as a clear 20160 * packet. This happens when we are sending 20161 * a ICMP reply back to a clear packet. See 20162 * ipsec_in_to_out() for details. 20163 */ 20164 mp = first_mp->b_cont; 20165 freeb(first_mp); 20166 } 20167 return (mp); 20168 } 20169 /* 20170 * See whether we need to attach a global policy here. We 20171 * don't depend on the conn (as it could be null) for deciding 20172 * what policy this datagram should go through because it 20173 * should have happened in ip_wput if there was some 20174 * policy. This normally happens for connections which are not 20175 * fully bound preventing us from caching policies in 20176 * ip_bind. Packets coming from the TCP listener/global queue 20177 * - which are non-hard_bound - could also be affected by 20178 * applying policy here. 20179 * 20180 * If this packet is coming from tcp global queue or listener, 20181 * we will be applying policy here. This may not be *right* 20182 * if these packets are coming from the detached connection as 20183 * it could have gone in clear before. This happens only if a 20184 * TCP connection started when there is no policy and somebody 20185 * added policy before it became detached. Thus packets of the 20186 * detached connection could go out secure and the other end 20187 * would drop it because it will be expecting in clear. 
The 20188 * converse is not true i.e if somebody starts a TCP 20189 * connection and deletes the policy, all the packets will 20190 * still go out with the policy that existed before deleting 20191 * because ip_unbind sends up policy information which is used 20192 * by TCP on subsequent ip_wputs. The right solution is to fix 20193 * TCP to attach a dummy IPSEC_OUT and set 20194 * ipsec_out_use_global_policy to B_FALSE. As this might 20195 * affect performance for normal cases, we are not doing it. 20196 * Thus, set policy before starting any TCP connections. 20197 * 20198 * NOTE - We might apply policy even for a hard bound connection 20199 * - for which we cached policy in ip_bind - if somebody added 20200 * global policy after we inherited the policy in ip_bind. 20201 * This means that the packets that were going out in clear 20202 * previously would start going secure and hence get dropped 20203 * on the other side. To fix this, TCP attaches a dummy 20204 * ipsec_out and make sure that we don't apply global policy. 
20205 */ 20206 if (ipha != NULL) 20207 policy_present = ipsec_outbound_v4_policy_present; 20208 else 20209 policy_present = ipsec_outbound_v6_policy_present; 20210 if (!policy_present) 20211 return (mp); 20212 20213 return (ip_wput_attach_policy(mp, ipha, ip6h, ire, connp, unspec_src)); 20214 } 20215 20216 ire_t * 20217 conn_set_outgoing_ill(conn_t *connp, ire_t *ire, ill_t **conn_outgoing_ill) 20218 { 20219 ipaddr_t addr; 20220 ire_t *save_ire; 20221 irb_t *irb; 20222 ill_group_t *illgrp; 20223 int err; 20224 20225 save_ire = ire; 20226 addr = ire->ire_addr; 20227 20228 ASSERT(ire->ire_type == IRE_BROADCAST); 20229 20230 illgrp = connp->conn_outgoing_ill->ill_group; 20231 if (illgrp == NULL) { 20232 *conn_outgoing_ill = conn_get_held_ill(connp, 20233 &connp->conn_outgoing_ill, &err); 20234 if (err == ILL_LOOKUP_FAILED) { 20235 ire_refrele(save_ire); 20236 return (NULL); 20237 } 20238 return (save_ire); 20239 } 20240 /* 20241 * If IP_BOUND_IF has been done, conn_outgoing_ill will be set. 20242 * If it is part of the group, we need to send on the ire 20243 * that has been cleared of IRE_MARK_NORECV and that belongs 20244 * to this group. This is okay as IP_BOUND_IF really means 20245 * any ill in the group. We depend on the fact that the 20246 * first ire in the group is always cleared of IRE_MARK_NORECV 20247 * if such an ire exists. This is possible only if you have 20248 * at least one ill in the group that has not failed. 20249 * 20250 * First get to the ire that matches the address and group. 20251 * 20252 * We don't look for an ire with a matching zoneid because a given zone 20253 * won't always have broadcast ires on all ills in the group. 
20254 */ 20255 irb = ire->ire_bucket; 20256 rw_enter(&irb->irb_lock, RW_READER); 20257 if (ire->ire_marks & IRE_MARK_NORECV) { 20258 /* 20259 * If the current zone only has an ire broadcast for this 20260 * address marked NORECV, the ire we want is ahead in the 20261 * bucket, so we look it up deliberately ignoring the zoneid. 20262 */ 20263 for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { 20264 if (ire->ire_addr != addr) 20265 continue; 20266 /* skip over deleted ires */ 20267 if (ire->ire_marks & IRE_MARK_CONDEMNED) 20268 continue; 20269 } 20270 } 20271 while (ire != NULL) { 20272 /* 20273 * If a new interface is coming up, we could end up 20274 * seeing the loopback ire and the non-loopback ire 20275 * may not have been added yet. So check for ire_stq 20276 */ 20277 if (ire->ire_stq != NULL && (ire->ire_addr != addr || 20278 ire->ire_ipif->ipif_ill->ill_group == illgrp)) { 20279 break; 20280 } 20281 ire = ire->ire_next; 20282 } 20283 if (ire != NULL && ire->ire_addr == addr && 20284 ire->ire_ipif->ipif_ill->ill_group == illgrp) { 20285 IRE_REFHOLD(ire); 20286 rw_exit(&irb->irb_lock); 20287 ire_refrele(save_ire); 20288 *conn_outgoing_ill = ire_to_ill(ire); 20289 /* 20290 * Refhold the ill to make the conn_outgoing_ill 20291 * independent of the ire. ip_wput_ire goes in a loop 20292 * and may refrele the ire. Since we have an ire at this 20293 * point we don't need to use ILL_CAN_LOOKUP on the ill. 20294 */ 20295 ill_refhold(*conn_outgoing_ill); 20296 return (ire); 20297 } 20298 rw_exit(&irb->irb_lock); 20299 ip1dbg(("conn_set_outgoing_ill: No matching ire\n")); 20300 /* 20301 * If we can't find a suitable ire, return the original ire. 20302 */ 20303 return (save_ire); 20304 } 20305 20306 /* 20307 * This function does the ire_refrele of the ire passed in as the 20308 * argument. As this function looks up more ires i.e broadcast ires, 20309 * it needs to REFRELE them. 
Currently, for simplicity we don't 20310 * differentiate the one passed in and looked up here. We always 20311 * REFRELE. 20312 * IPQoS Notes: 20313 * IP policy is invoked if IPP_LOCAL_OUT is enabled. Processing for 20314 * IPSec packets are done in ipsec_out_process. 20315 * 20316 */ 20317 void 20318 ip_wput_ire(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, int caller) 20319 { 20320 ipha_t *ipha; 20321 #define rptr ((uchar_t *)ipha) 20322 mblk_t *mp1; 20323 queue_t *stq; 20324 #define Q_TO_INDEX(stq) (((ill_t *)stq->q_ptr)->ill_phyint->phyint_ifindex) 20325 uint32_t v_hlen_tos_len; 20326 uint32_t ttl_protocol; 20327 ipaddr_t src; 20328 ipaddr_t dst; 20329 uint32_t cksum; 20330 ipaddr_t orig_src; 20331 ire_t *ire1; 20332 mblk_t *next_mp; 20333 uint_t hlen; 20334 uint16_t *up; 20335 uint32_t max_frag = ire->ire_max_frag; 20336 ill_t *ill = ire_to_ill(ire); 20337 int clusterwide; 20338 uint16_t ip_hdr_included; /* IP header included by ULP? */ 20339 int ipsec_len; 20340 mblk_t *first_mp; 20341 ipsec_out_t *io; 20342 boolean_t conn_dontroute; /* conn value for multicast */ 20343 boolean_t conn_multicast_loop; /* conn value for multicast */ 20344 boolean_t multicast_forward; /* Should we forward ? */ 20345 boolean_t unspec_src; 20346 ill_t *conn_outgoing_ill = NULL; 20347 ill_t *ire_ill; 20348 ill_t *ire1_ill; 20349 uint32_t ill_index = 0; 20350 boolean_t multirt_send = B_FALSE; 20351 int err; 20352 zoneid_t zoneid; 20353 20354 TRACE_1(TR_FAC_IP, TR_IP_WPUT_IRE_START, 20355 "ip_wput_ire_start: q %p", q); 20356 20357 multicast_forward = B_FALSE; 20358 unspec_src = (connp != NULL && connp->conn_unspec_src); 20359 20360 if (ire->ire_flags & RTF_MULTIRT) { 20361 /* 20362 * Multirouting case. The bucket where ire is stored 20363 * probably holds other RTF_MULTIRT flagged ire 20364 * to the destination. In this call to ip_wput_ire, 20365 * we attempt to send the packet through all 20366 * those ires. 
Thus, we first ensure that ire is the 20367 * first RTF_MULTIRT ire in the bucket, 20368 * before walking the ire list. 20369 */ 20370 ire_t *first_ire; 20371 irb_t *irb = ire->ire_bucket; 20372 ASSERT(irb != NULL); 20373 20374 /* Make sure we do not omit any multiroute ire. */ 20375 IRB_REFHOLD(irb); 20376 for (first_ire = irb->irb_ire; 20377 first_ire != NULL; 20378 first_ire = first_ire->ire_next) { 20379 if ((first_ire->ire_flags & RTF_MULTIRT) && 20380 (first_ire->ire_addr == ire->ire_addr) && 20381 !(first_ire->ire_marks & 20382 (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN))) 20383 break; 20384 } 20385 20386 if ((first_ire != NULL) && (first_ire != ire)) { 20387 IRE_REFHOLD(first_ire); 20388 ire_refrele(ire); 20389 ire = first_ire; 20390 ill = ire_to_ill(ire); 20391 } 20392 IRB_REFRELE(irb); 20393 } 20394 20395 /* 20396 * conn_outgoing_ill is used only in the broadcast loop. 20397 * for performance we don't grab the mutexs in the fastpath 20398 */ 20399 if ((connp != NULL) && 20400 (connp->conn_xmit_if_ill == NULL) && 20401 (ire->ire_type == IRE_BROADCAST) && 20402 ((connp->conn_nofailover_ill != NULL) || 20403 (connp->conn_outgoing_ill != NULL))) { 20404 /* 20405 * Bind to IPIF_NOFAILOVER address overrides IP_BOUND_IF 20406 * option. So, see if this endpoint is bound to a 20407 * IPIF_NOFAILOVER address. If so, honor it. This implies 20408 * that if the interface is failed, we will still send 20409 * the packet on the same ill which is what we want. 20410 */ 20411 conn_outgoing_ill = conn_get_held_ill(connp, 20412 &connp->conn_nofailover_ill, &err); 20413 if (err == ILL_LOOKUP_FAILED) { 20414 ire_refrele(ire); 20415 freemsg(mp); 20416 return; 20417 } 20418 if (conn_outgoing_ill == NULL) { 20419 /* 20420 * Choose a good ill in the group to send the 20421 * packets on. 
20422 */ 20423 ire = conn_set_outgoing_ill(connp, ire, 20424 &conn_outgoing_ill); 20425 if (ire == NULL) { 20426 freemsg(mp); 20427 return; 20428 } 20429 } 20430 } 20431 20432 if (mp->b_datap->db_type != M_CTL) { 20433 ipha = (ipha_t *)mp->b_rptr; 20434 zoneid = (connp != NULL ? connp->conn_zoneid : ALL_ZONES); 20435 } else { 20436 io = (ipsec_out_t *)mp->b_rptr; 20437 ASSERT(io->ipsec_out_type == IPSEC_OUT); 20438 zoneid = io->ipsec_out_zoneid; 20439 ASSERT(zoneid != ALL_ZONES); 20440 ipha = (ipha_t *)mp->b_cont->b_rptr; 20441 dst = ipha->ipha_dst; 20442 /* 20443 * For the multicast case, ipsec_out carries conn_dontroute and 20444 * conn_multicast_loop as conn may not be available here. We 20445 * need this for multicast loopback and forwarding which is done 20446 * later in the code. 20447 */ 20448 if (CLASSD(dst)) { 20449 conn_dontroute = io->ipsec_out_dontroute; 20450 conn_multicast_loop = io->ipsec_out_multicast_loop; 20451 /* 20452 * If conn_dontroute is not set or conn_multicast_loop 20453 * is set, we need to do forwarding/loopback. For 20454 * datagrams from ip_wput_multicast, conn_dontroute is 20455 * set to B_TRUE and conn_multicast_loop is set to 20456 * B_FALSE so that we neither do forwarding nor 20457 * loopback. 20458 */ 20459 if (!conn_dontroute || conn_multicast_loop) 20460 multicast_forward = B_TRUE; 20461 } 20462 } 20463 20464 if (ire->ire_type == IRE_LOCAL && ire->ire_zoneid != zoneid && 20465 ire->ire_zoneid != ALL_ZONES) { 20466 /* 20467 * When a zone sends a packet to another zone, we try to deliver 20468 * the packet under the same conditions as if the destination 20469 * was a real node on the network. To do so, we look for a 20470 * matching route in the forwarding table. 20471 * RTF_REJECT and RTF_BLACKHOLE are handled just like 20472 * ip_newroute() does. 
20473 */ 20474 ire_t *src_ire = ire_ftable_lookup(ipha->ipha_dst, 0, 0, 0, 20475 NULL, NULL, zoneid, 0, NULL, (MATCH_IRE_RECURSIVE | 20476 MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE)); 20477 if (src_ire != NULL && 20478 !(src_ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE))) { 20479 if (ipha->ipha_src == INADDR_ANY && !unspec_src) 20480 ipha->ipha_src = src_ire->ire_src_addr; 20481 ire_refrele(src_ire); 20482 } else { 20483 ire_refrele(ire); 20484 if (conn_outgoing_ill != NULL) 20485 ill_refrele(conn_outgoing_ill); 20486 BUMP_MIB(&ip_mib, ipOutNoRoutes); 20487 if (src_ire != NULL) { 20488 if (src_ire->ire_flags & RTF_BLACKHOLE) { 20489 ire_refrele(src_ire); 20490 freemsg(mp); 20491 return; 20492 } 20493 ire_refrele(src_ire); 20494 } 20495 if (ip_hdr_complete(ipha, zoneid)) { 20496 /* Failed */ 20497 freemsg(mp); 20498 return; 20499 } 20500 icmp_unreachable(q, mp, ICMP_HOST_UNREACHABLE); 20501 return; 20502 } 20503 } 20504 20505 if (mp->b_datap->db_type == M_CTL || 20506 ipsec_outbound_v4_policy_present) { 20507 mp = ip_wput_ire_parse_ipsec_out(mp, ipha, NULL, ire, connp, 20508 unspec_src); 20509 if (mp == NULL) { 20510 ire_refrele(ire); 20511 if (conn_outgoing_ill != NULL) 20512 ill_refrele(conn_outgoing_ill); 20513 return; 20514 } 20515 } 20516 20517 first_mp = mp; 20518 ipsec_len = 0; 20519 20520 if (first_mp->b_datap->db_type == M_CTL) { 20521 io = (ipsec_out_t *)first_mp->b_rptr; 20522 ASSERT(io->ipsec_out_type == IPSEC_OUT); 20523 mp = first_mp->b_cont; 20524 ipsec_len = ipsec_out_extra_length(first_mp); 20525 ASSERT(ipsec_len >= 0); 20526 zoneid = io->ipsec_out_zoneid; 20527 ASSERT(zoneid != ALL_ZONES); 20528 20529 /* 20530 * Drop M_CTL here if IPsec processing is not needed. 20531 * (Non-IPsec use of M_CTL extracted any information it 20532 * needed above). 
20533 */ 20534 if (ipsec_len == 0) { 20535 freeb(first_mp); 20536 first_mp = mp; 20537 } 20538 } 20539 20540 /* 20541 * Fast path for ip_wput_ire 20542 */ 20543 20544 ipha = (ipha_t *)mp->b_rptr; 20545 v_hlen_tos_len = ((uint32_t *)ipha)[0]; 20546 dst = ipha->ipha_dst; 20547 20548 /* 20549 * ICMP(RAWIP) module should set the ipha_ident to IP_HDR_INCLUDED 20550 * if the socket is a SOCK_RAW type. The transport checksum should 20551 * be provided in the pre-built packet, so we don't need to compute it. 20552 * Also, other application set flags, like DF, should not be altered. 20553 * Other transport MUST pass down zero. 20554 */ 20555 ip_hdr_included = ipha->ipha_ident; 20556 ASSERT(ipha->ipha_ident == 0 || ipha->ipha_ident == IP_HDR_INCLUDED); 20557 20558 if (CLASSD(dst)) { 20559 ip1dbg(("ip_wput_ire: to 0x%x ire %s addr 0x%x\n", 20560 ntohl(dst), 20561 ip_nv_lookup(ire_nv_tbl, ire->ire_type), 20562 ntohl(ire->ire_addr))); 20563 } 20564 20565 /* Macros to extract header fields from data already in registers */ 20566 #ifdef _BIG_ENDIAN 20567 #define V_HLEN (v_hlen_tos_len >> 24) 20568 #define LENGTH (v_hlen_tos_len & 0xFFFF) 20569 #define PROTO (ttl_protocol & 0xFF) 20570 #else 20571 #define V_HLEN (v_hlen_tos_len & 0xFF) 20572 #define LENGTH ((v_hlen_tos_len >> 24) | ((v_hlen_tos_len >> 8) & 0xFF00)) 20573 #define PROTO (ttl_protocol >> 8) 20574 #endif 20575 20576 20577 orig_src = src = ipha->ipha_src; 20578 /* (The loop back to "another" is explained down below.) */ 20579 another:; 20580 /* 20581 * Assign an ident value for this packet. We assign idents on 20582 * a per destination basis out of the IRE. There could be 20583 * other threads targeting the same destination, so we have to 20584 * arrange for a atomic increment. Note that we use a 32-bit 20585 * atomic add because it has better performance than its 20586 * 16-bit sibling. 
20587 * 20588 * If running in cluster mode and if the source address 20589 * belongs to a replicated service then vector through 20590 * cl_inet_ipident vector to allocate ip identifier 20591 * NOTE: This is a contract private interface with the 20592 * clustering group. 20593 */ 20594 clusterwide = 0; 20595 if (cl_inet_ipident) { 20596 ASSERT(cl_inet_isclusterwide); 20597 if ((*cl_inet_isclusterwide)(IPPROTO_IP, 20598 AF_INET, (uint8_t *)(uintptr_t)src)) { 20599 ipha->ipha_ident = (*cl_inet_ipident)(IPPROTO_IP, 20600 AF_INET, (uint8_t *)(uintptr_t)src, 20601 (uint8_t *)(uintptr_t)dst); 20602 clusterwide = 1; 20603 } 20604 } 20605 if (!clusterwide) { 20606 ipha->ipha_ident = 20607 (uint16_t)atomic_add_32_nv(&ire->ire_ident, 1); 20608 } 20609 20610 #ifndef _BIG_ENDIAN 20611 ipha->ipha_ident = (ipha->ipha_ident << 8) | (ipha->ipha_ident >> 8); 20612 #endif 20613 20614 /* 20615 * Set source address unless sent on an ill or conn_unspec_src is set. 20616 * This is needed to obey conn_unspec_src when packets go through 20617 * ip_newroute + arp. 20618 * Assumes ip_newroute{,_multi} sets the source address as well. 20619 */ 20620 if (src == INADDR_ANY && !unspec_src) { 20621 /* 20622 * Assign the appropriate source address from the IRE if none 20623 * was specified. 20624 */ 20625 ASSERT(ire->ire_ipversion == IPV4_VERSION); 20626 20627 /* 20628 * With IP multipathing, broadcast packets are sent on the ire 20629 * that has been cleared of IRE_MARK_NORECV and that belongs to 20630 * the group. However, this ire might not be in the same zone so 20631 * we can't always use its source address. We look for a 20632 * broadcast ire in the same group and in the right zone. 
20633 */ 20634 if (ire->ire_type == IRE_BROADCAST && 20635 ire->ire_zoneid != zoneid) { 20636 ire_t *src_ire = ire_ctable_lookup(dst, 0, 20637 IRE_BROADCAST, ire->ire_ipif, zoneid, NULL, 20638 (MATCH_IRE_TYPE | MATCH_IRE_ILL_GROUP)); 20639 if (src_ire != NULL) { 20640 src = src_ire->ire_src_addr; 20641 ire_refrele(src_ire); 20642 } else { 20643 ire_refrele(ire); 20644 if (conn_outgoing_ill != NULL) 20645 ill_refrele(conn_outgoing_ill); 20646 freemsg(first_mp); 20647 BUMP_MIB(&ip_mib, ipOutDiscards); 20648 return; 20649 } 20650 } else { 20651 src = ire->ire_src_addr; 20652 } 20653 20654 if (connp == NULL) { 20655 ip1dbg(("ip_wput_ire: no connp and no src " 20656 "address for dst 0x%x, using src 0x%x\n", 20657 ntohl(dst), 20658 ntohl(src))); 20659 } 20660 ipha->ipha_src = src; 20661 } 20662 stq = ire->ire_stq; 20663 20664 /* 20665 * We only allow ire chains for broadcasts since there will 20666 * be multiple IRE_CACHE entries for the same multicast 20667 * address (one per ipif). 20668 */ 20669 next_mp = NULL; 20670 20671 /* broadcast packet */ 20672 if (ire->ire_type == IRE_BROADCAST) 20673 goto broadcast; 20674 20675 /* loopback ? 
*/ 20676 if (stq == NULL) 20677 goto nullstq; 20678 20679 /* The ill_index for outbound ILL */ 20680 ill_index = Q_TO_INDEX(stq); 20681 20682 BUMP_MIB(&ip_mib, ipOutRequests); 20683 ttl_protocol = ((uint16_t *)ipha)[4]; 20684 20685 /* pseudo checksum (do it in parts for IP header checksum) */ 20686 cksum = (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF); 20687 20688 if (!IP_FLOW_CONTROLLED_ULP(PROTO)) { 20689 queue_t *dev_q = stq->q_next; 20690 20691 /* flow controlled */ 20692 if ((dev_q->q_next || dev_q->q_first) && 20693 !canput(dev_q)) 20694 goto blocked; 20695 if ((PROTO == IPPROTO_UDP) && 20696 (ip_hdr_included != IP_HDR_INCLUDED)) { 20697 hlen = (V_HLEN & 0xF) << 2; 20698 up = IPH_UDPH_CHECKSUMP(ipha, hlen); 20699 if (*up != 0) { 20700 IP_CKSUM_XMIT(ill, ire, mp, ipha, up, PROTO, 20701 hlen, LENGTH, max_frag, ipsec_len, cksum); 20702 /* Software checksum? */ 20703 if (DB_CKSUMFLAGS(mp) == 0) { 20704 IP_STAT(ip_out_sw_cksum); 20705 IP_STAT_UPDATE( 20706 ip_udp_out_sw_cksum_bytes, 20707 LENGTH - hlen); 20708 } 20709 } 20710 } 20711 } else if (ip_hdr_included != IP_HDR_INCLUDED) { 20712 hlen = (V_HLEN & 0xF) << 2; 20713 if (PROTO == IPPROTO_TCP) { 20714 up = IPH_TCPH_CHECKSUMP(ipha, hlen); 20715 /* 20716 * The packet header is processed once and for all, even 20717 * in the multirouting case. We disable hardware 20718 * checksum if the packet is multirouted, as it will be 20719 * replicated via several interfaces, and not all of 20720 * them may have this capability. 20721 */ 20722 IP_CKSUM_XMIT(ill, ire, mp, ipha, up, PROTO, hlen, 20723 LENGTH, max_frag, ipsec_len, cksum); 20724 /* Software checksum? 
*/ 20725 if (DB_CKSUMFLAGS(mp) == 0) { 20726 IP_STAT(ip_out_sw_cksum); 20727 IP_STAT_UPDATE(ip_tcp_out_sw_cksum_bytes, 20728 LENGTH - hlen); 20729 } 20730 } else { 20731 sctp_hdr_t *sctph; 20732 20733 ASSERT(PROTO == IPPROTO_SCTP); 20734 ASSERT(MBLKL(mp) >= (hlen + sizeof (*sctph))); 20735 sctph = (sctp_hdr_t *)(mp->b_rptr + hlen); 20736 /* 20737 * Zero out the checksum field to ensure proper 20738 * checksum calculation. 20739 */ 20740 sctph->sh_chksum = 0; 20741 #ifdef DEBUG 20742 if (!skip_sctp_cksum) 20743 #endif 20744 sctph->sh_chksum = sctp_cksum(mp, hlen); 20745 } 20746 } 20747 20748 /* 20749 * If this is a multicast packet and originated from ip_wput 20750 * we need to do loopback and forwarding checks. If it comes 20751 * from ip_wput_multicast, we SHOULD not do this. 20752 */ 20753 if (CLASSD(ipha->ipha_dst) && multicast_forward) goto multi_loopback; 20754 20755 /* checksum */ 20756 cksum += ttl_protocol; 20757 20758 /* fragment the packet */ 20759 if (max_frag < (uint_t)(LENGTH + ipsec_len)) 20760 goto fragmentit; 20761 /* 20762 * Don't use frag_flag if packet is pre-built or source 20763 * routed or if multicast (since multicast packets do 20764 * not solicit ICMP "packet too big" messages). 
20765 */ 20766 if ((ip_hdr_included != IP_HDR_INCLUDED) && 20767 (V_HLEN == IP_SIMPLE_HDR_VERSION || 20768 !ip_source_route_included(ipha)) && 20769 !CLASSD(ipha->ipha_dst)) 20770 ipha->ipha_fragment_offset_and_flags |= 20771 htons(ire->ire_frag_flag); 20772 20773 if (!(DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM)) { 20774 /* calculate IP header checksum */ 20775 cksum += ipha->ipha_ident; 20776 cksum += (v_hlen_tos_len >> 16)+(v_hlen_tos_len & 0xFFFF); 20777 cksum += ipha->ipha_fragment_offset_and_flags; 20778 20779 /* IP options present */ 20780 hlen = (V_HLEN & 0xF) - IP_SIMPLE_HDR_LENGTH_IN_WORDS; 20781 if (hlen) 20782 goto checksumoptions; 20783 20784 /* calculate hdr checksum */ 20785 cksum = ((cksum & 0xFFFF) + (cksum >> 16)); 20786 cksum = ~(cksum + (cksum >> 16)); 20787 ipha->ipha_hdr_checksum = (uint16_t)cksum; 20788 } 20789 if (ipsec_len != 0) { 20790 /* 20791 * We will do the rest of the processing after 20792 * we come back from IPSEC in ip_wput_ipsec_out(). 20793 */ 20794 ASSERT(MBLKL(first_mp) >= sizeof (ipsec_out_t)); 20795 20796 io = (ipsec_out_t *)first_mp->b_rptr; 20797 io->ipsec_out_ill_index = ((ill_t *)stq->q_ptr)-> 20798 ill_phyint->phyint_ifindex; 20799 20800 ipsec_out_process(q, first_mp, ire, ill_index); 20801 ire_refrele(ire); 20802 if (conn_outgoing_ill != NULL) 20803 ill_refrele(conn_outgoing_ill); 20804 return; 20805 } 20806 20807 /* 20808 * In most cases, the emission loop below is entered only 20809 * once. Only in the case where the ire holds the 20810 * RTF_MULTIRT flag, do we loop to process all RTF_MULTIRT 20811 * flagged ires in the bucket, and send the packet 20812 * through all crossed RTF_MULTIRT routes. 20813 */ 20814 if (ire->ire_flags & RTF_MULTIRT) { 20815 multirt_send = B_TRUE; 20816 } 20817 do { 20818 if (multirt_send) { 20819 irb_t *irb; 20820 /* 20821 * We are in a multiple send case, need to get 20822 * the next ire and make a duplicate of the packet. 20823 * ire1 holds here the next ire to process in the 20824 * bucket. 
If multirouting is expected, 20825 * any non-RTF_MULTIRT ire that has the 20826 * right destination address is ignored. 20827 */ 20828 irb = ire->ire_bucket; 20829 ASSERT(irb != NULL); 20830 20831 IRB_REFHOLD(irb); 20832 for (ire1 = ire->ire_next; 20833 ire1 != NULL; 20834 ire1 = ire1->ire_next) { 20835 if ((ire1->ire_flags & RTF_MULTIRT) == 0) 20836 continue; 20837 if (ire1->ire_addr != ire->ire_addr) 20838 continue; 20839 if (ire1->ire_marks & 20840 (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN)) 20841 continue; 20842 20843 /* Got one */ 20844 IRE_REFHOLD(ire1); 20845 break; 20846 } 20847 IRB_REFRELE(irb); 20848 20849 if (ire1 != NULL) { 20850 next_mp = copyb(mp); 20851 if ((next_mp == NULL) || 20852 ((mp->b_cont != NULL) && 20853 ((next_mp->b_cont = 20854 dupmsg(mp->b_cont)) == NULL))) { 20855 freemsg(next_mp); 20856 next_mp = NULL; 20857 ire_refrele(ire1); 20858 ire1 = NULL; 20859 } 20860 } 20861 20862 /* Last multiroute ire; don't loop anymore. */ 20863 if (ire1 == NULL) { 20864 multirt_send = B_FALSE; 20865 } 20866 } 20867 mp = ip_wput_attach_llhdr(mp, ire, IPP_LOCAL_OUT, ill_index); 20868 if (mp == NULL) { 20869 BUMP_MIB(&ip_mib, ipOutDiscards); 20870 ip2dbg(("ip_wput_ire: fastpath wput pkt dropped "\ 20871 "during IPPF processing\n")); 20872 ire_refrele(ire); 20873 if (next_mp != NULL) { 20874 freemsg(next_mp); 20875 ire_refrele(ire1); 20876 } 20877 if (conn_outgoing_ill != NULL) 20878 ill_refrele(conn_outgoing_ill); 20879 return; 20880 } 20881 UPDATE_OB_PKT_COUNT(ire); 20882 ire->ire_last_used_time = lbolt; 20883 20884 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 20885 "ip_wput_ire_end: q %p (%S)", 20886 q, "last copy out"); 20887 putnext(stq, mp); 20888 IRE_REFRELE(ire); 20889 20890 if (multirt_send) { 20891 ASSERT(ire1); 20892 /* 20893 * Proceed with the next RTF_MULTIRT ire, 20894 * Also set up the send-to queue accordingly. 
20895 */ 20896 ire = ire1; 20897 ire1 = NULL; 20898 stq = ire->ire_stq; 20899 mp = next_mp; 20900 next_mp = NULL; 20901 ipha = (ipha_t *)mp->b_rptr; 20902 ill_index = Q_TO_INDEX(stq); 20903 } 20904 } while (multirt_send); 20905 if (conn_outgoing_ill != NULL) 20906 ill_refrele(conn_outgoing_ill); 20907 return; 20908 20909 /* 20910 * ire->ire_type == IRE_BROADCAST (minimize diffs) 20911 */ 20912 broadcast: 20913 { 20914 /* 20915 * Avoid broadcast storms by setting the ttl to 1 20916 * for broadcasts. This parameter can be set 20917 * via ndd, so make sure that for the SO_DONTROUTE 20918 * case that ipha_ttl is always set to 1. 20919 * In the event that we are replying to incoming 20920 * ICMP packets, conn could be NULL. 20921 */ 20922 if ((connp != NULL) && connp->conn_dontroute) 20923 ipha->ipha_ttl = 1; 20924 else 20925 ipha->ipha_ttl = ip_broadcast_ttl; 20926 20927 /* 20928 * Note that we are not doing a IRB_REFHOLD here. 20929 * Actually we don't care if the list changes i.e 20930 * if somebody deletes an IRE from the list while 20931 * we drop the lock, the next time we come around 20932 * ire_next will be NULL and hence we won't send 20933 * out multiple copies which is fine. 
20934 */ 20935 rw_enter(&ire->ire_bucket->irb_lock, RW_READER); 20936 ire1 = ire->ire_next; 20937 if (conn_outgoing_ill != NULL) { 20938 while (ire->ire_ipif->ipif_ill != conn_outgoing_ill) { 20939 ASSERT(ire1 == ire->ire_next); 20940 if (ire1 != NULL && ire1->ire_addr == dst) { 20941 ire_refrele(ire); 20942 ire = ire1; 20943 IRE_REFHOLD(ire); 20944 ire1 = ire->ire_next; 20945 continue; 20946 } 20947 rw_exit(&ire->ire_bucket->irb_lock); 20948 /* Did not find a matching ill */ 20949 ip1dbg(("ip_wput_ire: broadcast with no " 20950 "matching IP_BOUND_IF ill %s\n", 20951 conn_outgoing_ill->ill_name)); 20952 freemsg(first_mp); 20953 if (ire != NULL) 20954 ire_refrele(ire); 20955 ill_refrele(conn_outgoing_ill); 20956 return; 20957 } 20958 } else if (ire1 != NULL && ire1->ire_addr == dst) { 20959 /* 20960 * If the next IRE has the same address and is not one 20961 * of the two copies that we need to send, try to see 20962 * whether this copy should be sent at all. This 20963 * assumes that we insert loopbacks first and then 20964 * non-loopbacks. This is acheived by inserting the 20965 * loopback always before non-loopback. 20966 * This is used to send a single copy of a broadcast 20967 * packet out all physical interfaces that have an 20968 * matching IRE_BROADCAST while also looping 20969 * back one copy (to ip_wput_local) for each 20970 * matching physical interface. However, we avoid 20971 * sending packets out different logical that match by 20972 * having ipif_up/ipif_down supress duplicate 20973 * IRE_BROADCASTS. 20974 * 20975 * This feature is currently used to get broadcasts 20976 * sent to multiple interfaces, when the broadcast 20977 * address being used applies to multiple interfaces. 20978 * For example, a whole net broadcast will be 20979 * replicated on every connected subnet of 20980 * the target net. 
20981 * 20982 * Each zone has its own set of IRE_BROADCASTs, so that 20983 * we're able to distribute inbound packets to multiple 20984 * zones who share a broadcast address. We avoid looping 20985 * back outbound packets in different zones but on the 20986 * same ill, as the application would see duplicates. 20987 * 20988 * If the interfaces are part of the same group, 20989 * we would want to send only one copy out for 20990 * whole group. 20991 * 20992 * This logic assumes that ire_add_v4() groups the 20993 * IRE_BROADCAST entries so that those with the same 20994 * ire_addr and ill_group are kept together. 20995 */ 20996 ire_ill = ire->ire_ipif->ipif_ill; 20997 if (ire->ire_stq == NULL && ire1->ire_stq != NULL) { 20998 if (ire_ill->ill_group != NULL && 20999 (ire->ire_marks & IRE_MARK_NORECV)) { 21000 /* 21001 * If the current zone only has an ire 21002 * broadcast for this address marked 21003 * NORECV, the ire we want is ahead in 21004 * the bucket, so we look it up 21005 * deliberately ignoring the zoneid. 
21006 */ 21007 for (ire1 = ire->ire_bucket->irb_ire; 21008 ire1 != NULL; 21009 ire1 = ire1->ire_next) { 21010 ire1_ill = 21011 ire1->ire_ipif->ipif_ill; 21012 if (ire1->ire_addr != dst) 21013 continue; 21014 /* skip over the current ire */ 21015 if (ire1 == ire) 21016 continue; 21017 /* skip over deleted ires */ 21018 if (ire1->ire_marks & 21019 IRE_MARK_CONDEMNED) 21020 continue; 21021 /* 21022 * non-loopback ire in our 21023 * group: use it for the next 21024 * pass in the loop 21025 */ 21026 if (ire1->ire_stq != NULL && 21027 ire1_ill->ill_group == 21028 ire_ill->ill_group) 21029 break; 21030 } 21031 } 21032 } else { 21033 while (ire1 != NULL && ire1->ire_addr == dst) { 21034 ire1_ill = ire1->ire_ipif->ipif_ill; 21035 /* 21036 * We can have two broadcast ires on the 21037 * same ill in different zones; here 21038 * we'll send a copy of the packet on 21039 * each ill and the fanout code will 21040 * call conn_wantpacket() to check that 21041 * the zone has the broadcast address 21042 * configured on the ill. If the two 21043 * ires are in the same group we only 21044 * send one copy up. 21045 */ 21046 if (ire1_ill != ire_ill && 21047 (ire1_ill->ill_group == NULL || 21048 ire_ill->ill_group == NULL || 21049 ire1_ill->ill_group != 21050 ire_ill->ill_group)) { 21051 break; 21052 } 21053 ire1 = ire1->ire_next; 21054 } 21055 } 21056 } 21057 ASSERT(multirt_send == B_FALSE); 21058 if (ire1 != NULL && ire1->ire_addr == dst) { 21059 if ((ire->ire_flags & RTF_MULTIRT) && 21060 (ire1->ire_flags & RTF_MULTIRT)) { 21061 /* 21062 * We are in the multirouting case. 21063 * The message must be sent at least 21064 * on both ires. These ires have been 21065 * inserted AFTER the standard ones 21066 * in ip_rt_add(). There are thus no 21067 * other ire entries for the destination 21068 * address in the rest of the bucket 21069 * that do not have the RTF_MULTIRT 21070 * flag. We don't process a copy 21071 * of the message here. This will be 21072 * done in the final sending loop. 
21073 */ 21074 multirt_send = B_TRUE; 21075 } else { 21076 next_mp = ip_copymsg(first_mp); 21077 if (next_mp != NULL) 21078 IRE_REFHOLD(ire1); 21079 } 21080 } 21081 rw_exit(&ire->ire_bucket->irb_lock); 21082 } 21083 21084 if (stq) { 21085 /* 21086 * A non-NULL send-to queue means this packet is going 21087 * out of this machine. 21088 */ 21089 21090 BUMP_MIB(&ip_mib, ipOutRequests); 21091 ttl_protocol = ((uint16_t *)ipha)[4]; 21092 /* 21093 * We accumulate the pseudo header checksum in cksum. 21094 * This is pretty hairy code, so watch close. One 21095 * thing to keep in mind is that UDP and TCP have 21096 * stored their respective datagram lengths in their 21097 * checksum fields. This lines things up real nice. 21098 */ 21099 cksum = (dst >> 16) + (dst & 0xFFFF) + 21100 (src >> 16) + (src & 0xFFFF); 21101 /* 21102 * We assume the udp checksum field contains the 21103 * length, so to compute the pseudo header checksum, 21104 * all we need is the protocol number and src/dst. 21105 */ 21106 /* Provide the checksums for UDP and TCP. 
*/ 21107 if ((PROTO == IPPROTO_TCP) && 21108 (ip_hdr_included != IP_HDR_INCLUDED)) { 21109 /* hlen gets the number of uchar_ts in the IP header */ 21110 hlen = (V_HLEN & 0xF) << 2; 21111 up = IPH_TCPH_CHECKSUMP(ipha, hlen); 21112 IP_STAT(ip_out_sw_cksum); 21113 IP_STAT_UPDATE(ip_tcp_out_sw_cksum_bytes, 21114 LENGTH - hlen); 21115 *up = IP_CSUM(mp, hlen, cksum + IP_TCP_CSUM_COMP); 21116 if (*up == 0) 21117 *up = 0xFFFF; 21118 } else if (PROTO == IPPROTO_SCTP && 21119 (ip_hdr_included != IP_HDR_INCLUDED)) { 21120 sctp_hdr_t *sctph; 21121 21122 hlen = (V_HLEN & 0xF) << 2; 21123 ASSERT(MBLKL(mp) >= (hlen + sizeof (*sctph))); 21124 sctph = (sctp_hdr_t *)(mp->b_rptr + hlen); 21125 sctph->sh_chksum = 0; 21126 #ifdef DEBUG 21127 if (!skip_sctp_cksum) 21128 #endif 21129 sctph->sh_chksum = sctp_cksum(mp, hlen); 21130 } else { 21131 queue_t *dev_q = stq->q_next; 21132 21133 if ((dev_q->q_next || dev_q->q_first) && 21134 !canput(dev_q)) { 21135 blocked: 21136 ipha->ipha_ident = ip_hdr_included; 21137 /* 21138 * If we don't have a conn to apply 21139 * backpressure, free the message. 21140 * In the ire_send path, we don't know 21141 * the position to requeue the packet. Rather 21142 * than reorder packets, we just drop this 21143 * packet. 21144 */ 21145 if (ip_output_queue && connp != NULL && 21146 caller != IRE_SEND) { 21147 if (caller == IP_WSRV) { 21148 connp->conn_did_putbq = 1; 21149 (void) putbq(connp->conn_wq, 21150 first_mp); 21151 conn_drain_insert(connp); 21152 /* 21153 * This is the service thread, 21154 * and the queue is already 21155 * noenabled. The check for 21156 * canput and the putbq is not 21157 * atomic. So we need to check 21158 * again. 21159 */ 21160 if (canput(stq->q_next)) 21161 connp->conn_did_putbq 21162 = 0; 21163 IP_STAT(ip_conn_flputbq); 21164 } else { 21165 /* 21166 * We are not the service proc. 21167 * ip_wsrv will be scheduled or 21168 * is already running. 
21169 */ 21170 (void) putq(connp->conn_wq, 21171 first_mp); 21172 } 21173 } else { 21174 BUMP_MIB(&ip_mib, ipOutDiscards); 21175 freemsg(first_mp); 21176 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 21177 "ip_wput_ire_end: q %p (%S)", 21178 q, "discard"); 21179 } 21180 ire_refrele(ire); 21181 if (next_mp) { 21182 ire_refrele(ire1); 21183 freemsg(next_mp); 21184 } 21185 if (conn_outgoing_ill != NULL) 21186 ill_refrele(conn_outgoing_ill); 21187 return; 21188 } 21189 if ((PROTO == IPPROTO_UDP) && 21190 (ip_hdr_included != IP_HDR_INCLUDED)) { 21191 /* 21192 * hlen gets the number of uchar_ts in the 21193 * IP header 21194 */ 21195 hlen = (V_HLEN & 0xF) << 2; 21196 up = IPH_UDPH_CHECKSUMP(ipha, hlen); 21197 max_frag = ire->ire_max_frag; 21198 if (*up != 0) { 21199 IP_CKSUM_XMIT(ire_ill, ire, mp, ipha, 21200 up, PROTO, hlen, LENGTH, max_frag, 21201 ipsec_len, cksum); 21202 /* Software checksum? */ 21203 if (DB_CKSUMFLAGS(mp) == 0) { 21204 IP_STAT(ip_out_sw_cksum); 21205 IP_STAT_UPDATE( 21206 ip_udp_out_sw_cksum_bytes, 21207 LENGTH - hlen); 21208 } 21209 } 21210 } 21211 } 21212 /* 21213 * Need to do this even when fragmenting. The local 21214 * loopback can be done without computing checksums 21215 * but forwarding out other interface must be done 21216 * after the IP checksum (and ULP checksums) have been 21217 * computed. 21218 * 21219 * NOTE : multicast_forward is set only if this packet 21220 * originated from ip_wput. For packets originating from 21221 * ip_wput_multicast, it is not set. 21222 */ 21223 if (CLASSD(ipha->ipha_dst) && multicast_forward) { 21224 multi_loopback: 21225 ip2dbg(("ip_wput: multicast, loop %d\n", 21226 conn_multicast_loop)); 21227 21228 /* Forget header checksum offload */ 21229 DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM; 21230 21231 /* 21232 * Local loopback of multicasts? Check the 21233 * ill. 
21234 * 21235 * Note that the loopback function will not come 21236 * in through ip_rput - it will only do the 21237 * client fanout thus we need to do an mforward 21238 * as well. The is different from the BSD 21239 * logic. 21240 */ 21241 if (ill != NULL) { 21242 ilm_t *ilm; 21243 21244 ILM_WALKER_HOLD(ill); 21245 ilm = ilm_lookup_ill(ill, ipha->ipha_dst, 21246 ALL_ZONES); 21247 ILM_WALKER_RELE(ill); 21248 if (ilm != NULL) { 21249 /* 21250 * Pass along the virtual output q. 21251 * ip_wput_local() will distribute the 21252 * packet to all the matching zones, 21253 * except the sending zone when 21254 * IP_MULTICAST_LOOP is false. 21255 */ 21256 ip_multicast_loopback(q, ill, first_mp, 21257 conn_multicast_loop ? 0 : 21258 IP_FF_NO_MCAST_LOOP, zoneid); 21259 } 21260 } 21261 if (ipha->ipha_ttl == 0) { 21262 /* 21263 * 0 => only to this host i.e. we are 21264 * done. We are also done if this was the 21265 * loopback interface since it is sufficient 21266 * to loopback one copy of a multicast packet. 21267 */ 21268 freemsg(first_mp); 21269 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 21270 "ip_wput_ire_end: q %p (%S)", 21271 q, "loopback"); 21272 ire_refrele(ire); 21273 if (conn_outgoing_ill != NULL) 21274 ill_refrele(conn_outgoing_ill); 21275 return; 21276 } 21277 /* 21278 * ILLF_MULTICAST is checked in ip_newroute 21279 * i.e. we don't need to check it here since 21280 * all IRE_CACHEs come from ip_newroute. 21281 * For multicast traffic, SO_DONTROUTE is interpreted 21282 * to mean only send the packet out the interface 21283 * (optionally specified with IP_MULTICAST_IF) 21284 * and do not forward it out additional interfaces. 21285 * RSVP and the rsvp daemon is an example of a 21286 * protocol and user level process that 21287 * handles it's own routing. Hence, it uses the 21288 * SO_DONTROUTE option to accomplish this. 
21289 */ 21290 21291 if (ip_g_mrouter && !conn_dontroute && ill != NULL) { 21292 /* Unconditionally redo the checksum */ 21293 ipha->ipha_hdr_checksum = 0; 21294 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 21295 21296 /* 21297 * If this needs to go out secure, we need 21298 * to wait till we finish the IPSEC 21299 * processing. 21300 */ 21301 if (ipsec_len == 0 && 21302 ip_mforward(ill, ipha, mp)) { 21303 freemsg(first_mp); 21304 ip1dbg(("ip_wput: mforward failed\n")); 21305 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 21306 "ip_wput_ire_end: q %p (%S)", 21307 q, "mforward failed"); 21308 ire_refrele(ire); 21309 if (conn_outgoing_ill != NULL) 21310 ill_refrele(conn_outgoing_ill); 21311 return; 21312 } 21313 } 21314 } 21315 max_frag = ire->ire_max_frag; 21316 cksum += ttl_protocol; 21317 if (max_frag >= (uint_t)(LENGTH + ipsec_len)) { 21318 /* No fragmentation required for this one. */ 21319 /* 21320 * Don't use frag_flag if packet is pre-built or source 21321 * routed or if multicast (since multicast packets do 21322 * not solicit ICMP "packet too big" messages). 21323 */ 21324 if ((ip_hdr_included != IP_HDR_INCLUDED) && 21325 (V_HLEN == IP_SIMPLE_HDR_VERSION || 21326 !ip_source_route_included(ipha)) && 21327 !CLASSD(ipha->ipha_dst)) 21328 ipha->ipha_fragment_offset_and_flags |= 21329 htons(ire->ire_frag_flag); 21330 21331 if (!(DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM)) { 21332 /* Complete the IP header checksum. */ 21333 cksum += ipha->ipha_ident; 21334 cksum += (v_hlen_tos_len >> 16)+ 21335 (v_hlen_tos_len & 0xFFFF); 21336 cksum += ipha->ipha_fragment_offset_and_flags; 21337 hlen = (V_HLEN & 0xF) - 21338 IP_SIMPLE_HDR_LENGTH_IN_WORDS; 21339 if (hlen) { 21340 checksumoptions: 21341 /* 21342 * Account for the IP Options in the IP 21343 * header checksum. 
21344 */ 21345 up = (uint16_t *)(rptr+ 21346 IP_SIMPLE_HDR_LENGTH); 21347 do { 21348 cksum += up[0]; 21349 cksum += up[1]; 21350 up += 2; 21351 } while (--hlen); 21352 } 21353 cksum = ((cksum & 0xFFFF) + (cksum >> 16)); 21354 cksum = ~(cksum + (cksum >> 16)); 21355 ipha->ipha_hdr_checksum = (uint16_t)cksum; 21356 } 21357 if (ipsec_len != 0) { 21358 ipsec_out_process(q, first_mp, ire, ill_index); 21359 if (!next_mp) { 21360 ire_refrele(ire); 21361 if (conn_outgoing_ill != NULL) 21362 ill_refrele(conn_outgoing_ill); 21363 return; 21364 } 21365 goto next; 21366 } 21367 21368 /* 21369 * multirt_send has already been handled 21370 * for broadcast, but not yet for multicast 21371 * or IP options. 21372 */ 21373 if (next_mp == NULL) { 21374 if (ire->ire_flags & RTF_MULTIRT) { 21375 multirt_send = B_TRUE; 21376 } 21377 } 21378 21379 /* 21380 * In most cases, the emission loop below is 21381 * entered only once. Only in the case where 21382 * the ire holds the RTF_MULTIRT flag, do we loop 21383 * to process all RTF_MULTIRT ires in the bucket, 21384 * and send the packet through all crossed 21385 * RTF_MULTIRT routes. 21386 */ 21387 do { 21388 if (multirt_send) { 21389 irb_t *irb; 21390 21391 irb = ire->ire_bucket; 21392 ASSERT(irb != NULL); 21393 /* 21394 * We are in a multiple send case, 21395 * need to get the next IRE and make 21396 * a duplicate of the packet. 
21397 */ 21398 IRB_REFHOLD(irb); 21399 for (ire1 = ire->ire_next; 21400 ire1 != NULL; 21401 ire1 = ire1->ire_next) { 21402 if (!(ire1->ire_flags & 21403 RTF_MULTIRT)) 21404 continue; 21405 if (ire1->ire_addr != 21406 ire->ire_addr) 21407 continue; 21408 if (ire1->ire_marks & 21409 (IRE_MARK_CONDEMNED| 21410 IRE_MARK_HIDDEN)) 21411 continue; 21412 21413 /* Got one */ 21414 IRE_REFHOLD(ire1); 21415 break; 21416 } 21417 IRB_REFRELE(irb); 21418 21419 if (ire1 != NULL) { 21420 next_mp = copyb(mp); 21421 if ((next_mp == NULL) || 21422 ((mp->b_cont != NULL) && 21423 ((next_mp->b_cont = 21424 dupmsg(mp->b_cont)) 21425 == NULL))) { 21426 freemsg(next_mp); 21427 next_mp = NULL; 21428 ire_refrele(ire1); 21429 ire1 = NULL; 21430 } 21431 } 21432 21433 /* 21434 * Last multiroute ire; don't loop 21435 * anymore. The emission is over 21436 * and next_mp is NULL. 21437 */ 21438 if (ire1 == NULL) { 21439 multirt_send = B_FALSE; 21440 } 21441 } 21442 21443 ASSERT(ipsec_len == 0); 21444 mp1 = ip_wput_attach_llhdr(mp, ire, 21445 IPP_LOCAL_OUT, ill_index); 21446 if (mp1 == NULL) { 21447 BUMP_MIB(&ip_mib, ipOutDiscards); 21448 if (next_mp) { 21449 freemsg(next_mp); 21450 ire_refrele(ire1); 21451 } 21452 ire_refrele(ire); 21453 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 21454 "ip_wput_ire_end: q %p (%S)", 21455 q, "discard MDATA"); 21456 if (conn_outgoing_ill != NULL) 21457 ill_refrele(conn_outgoing_ill); 21458 return; 21459 } 21460 UPDATE_OB_PKT_COUNT(ire); 21461 ire->ire_last_used_time = lbolt; 21462 21463 if (multirt_send) { 21464 /* 21465 * We are in a multiple send case, 21466 * need to re-enter the sending loop 21467 * using the next ire. 21468 */ 21469 putnext(stq, mp1); 21470 ire_refrele(ire); 21471 ire = ire1; 21472 stq = ire->ire_stq; 21473 mp = next_mp; 21474 next_mp = NULL; 21475 ipha = (ipha_t *)mp->b_rptr; 21476 ill_index = Q_TO_INDEX(stq); 21477 } 21478 } while (multirt_send); 21479 21480 if (!next_mp) { 21481 /* 21482 * Last copy going out (the ultra-common 21483 * case). 
Note that we intentionally replicate 21484 * the putnext rather than calling it before 21485 * the next_mp check in hopes of a little 21486 * tail-call action out of the compiler. 21487 */ 21488 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 21489 "ip_wput_ire_end: q %p (%S)", 21490 q, "last copy out(1)"); 21491 putnext(stq, mp1); 21492 ire_refrele(ire); 21493 if (conn_outgoing_ill != NULL) 21494 ill_refrele(conn_outgoing_ill); 21495 return; 21496 } 21497 /* More copies going out below. */ 21498 putnext(stq, mp1); 21499 } else { 21500 int offset; 21501 fragmentit: 21502 offset = ntohs(ipha->ipha_fragment_offset_and_flags); 21503 /* 21504 * If this would generate a icmp_frag_needed message, 21505 * we need to handle it before we do the IPSEC 21506 * processing. Otherwise, we need to strip the IPSEC 21507 * headers before we send up the message to the ULPs 21508 * which becomes messy and difficult. 21509 */ 21510 if (ipsec_len != 0) { 21511 if ((max_frag < (unsigned int)(LENGTH + 21512 ipsec_len)) && (offset & IPH_DF)) { 21513 21514 BUMP_MIB(&ip_mib, ipFragFails); 21515 ipha->ipha_hdr_checksum = 0; 21516 ipha->ipha_hdr_checksum = 21517 (uint16_t)ip_csum_hdr(ipha); 21518 icmp_frag_needed(ire->ire_stq, first_mp, 21519 max_frag); 21520 if (!next_mp) { 21521 ire_refrele(ire); 21522 if (conn_outgoing_ill != NULL) { 21523 ill_refrele( 21524 conn_outgoing_ill); 21525 } 21526 return; 21527 } 21528 } else { 21529 /* 21530 * This won't cause a icmp_frag_needed 21531 * message. to be gnerated. Send it on 21532 * the wire. Note that this could still 21533 * cause fragmentation and all we 21534 * do is the generation of the message 21535 * to the ULP if needed before IPSEC. 
21536 */ 21537 if (!next_mp) { 21538 ipsec_out_process(q, first_mp, 21539 ire, ill_index); 21540 TRACE_2(TR_FAC_IP, 21541 TR_IP_WPUT_IRE_END, 21542 "ip_wput_ire_end: q %p " 21543 "(%S)", q, 21544 "last ipsec_out_process"); 21545 ire_refrele(ire); 21546 if (conn_outgoing_ill != NULL) { 21547 ill_refrele( 21548 conn_outgoing_ill); 21549 } 21550 return; 21551 } 21552 ipsec_out_process(q, first_mp, 21553 ire, ill_index); 21554 } 21555 } else { 21556 /* Initiate IPPF processing */ 21557 if (IPP_ENABLED(IPP_LOCAL_OUT)) { 21558 ip_process(IPP_LOCAL_OUT, &mp, 21559 ill_index); 21560 if (mp == NULL) { 21561 BUMP_MIB(&ip_mib, 21562 ipOutDiscards); 21563 if (next_mp != NULL) { 21564 freemsg(next_mp); 21565 ire_refrele(ire1); 21566 } 21567 ire_refrele(ire); 21568 TRACE_2(TR_FAC_IP, 21569 TR_IP_WPUT_IRE_END, 21570 "ip_wput_ire: q %p (%S)", 21571 q, "discard MDATA"); 21572 if (conn_outgoing_ill != NULL) { 21573 ill_refrele( 21574 conn_outgoing_ill); 21575 } 21576 return; 21577 } 21578 } 21579 if (!next_mp) { 21580 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 21581 "ip_wput_ire_end: q %p (%S)", 21582 q, "last fragmentation"); 21583 ip_wput_ire_fragmentit(mp, ire); 21584 ire_refrele(ire); 21585 if (conn_outgoing_ill != NULL) 21586 ill_refrele(conn_outgoing_ill); 21587 return; 21588 } 21589 ip_wput_ire_fragmentit(mp, ire); 21590 } 21591 } 21592 } else { 21593 nullstq: 21594 /* A NULL stq means the destination address is local. 
*/ 21595 UPDATE_OB_PKT_COUNT(ire); 21596 ire->ire_last_used_time = lbolt; 21597 ASSERT(ire->ire_ipif != NULL); 21598 if (!next_mp) { 21599 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 21600 "ip_wput_ire_end: q %p (%S)", 21601 q, "local address"); 21602 ip_wput_local(q, ire->ire_ipif->ipif_ill, ipha, 21603 first_mp, ire, 0, ire->ire_zoneid); 21604 ire_refrele(ire); 21605 if (conn_outgoing_ill != NULL) 21606 ill_refrele(conn_outgoing_ill); 21607 return; 21608 } 21609 ip_wput_local(q, ire->ire_ipif->ipif_ill, ipha, first_mp, 21610 ire, 0, ire->ire_zoneid); 21611 } 21612 next: 21613 /* 21614 * More copies going out to additional interfaces. 21615 * ire1 has already been held. We don't need the 21616 * "ire" anymore. 21617 */ 21618 ire_refrele(ire); 21619 ire = ire1; 21620 ASSERT(ire != NULL && ire->ire_refcnt >= 1 && next_mp != NULL); 21621 mp = next_mp; 21622 ASSERT(ire->ire_ipversion == IPV4_VERSION); 21623 ill = ire_to_ill(ire); 21624 first_mp = mp; 21625 if (ipsec_len != 0) { 21626 ASSERT(first_mp->b_datap->db_type == M_CTL); 21627 mp = mp->b_cont; 21628 } 21629 dst = ire->ire_addr; 21630 ipha = (ipha_t *)mp->b_rptr; 21631 /* 21632 * Restore src so that we will pick up ire->ire_src_addr if src was 0. 21633 * Restore ipha_ident "no checksum" flag. 21634 */ 21635 src = orig_src; 21636 ipha->ipha_ident = ip_hdr_included; 21637 goto another; 21638 21639 #undef rptr 21640 #undef Q_TO_INDEX 21641 } 21642 21643 /* 21644 * Routine to allocate a message that is used to notify the ULP about MDT. 21645 * The caller may provide a pointer to the link-layer MDT capabilities, 21646 * or NULL if MDT is to be disabled on the stream. 
21647 */ 21648 mblk_t * 21649 ip_mdinfo_alloc(ill_mdt_capab_t *isrc) 21650 { 21651 mblk_t *mp; 21652 ip_mdt_info_t *mdti; 21653 ill_mdt_capab_t *idst; 21654 21655 if ((mp = allocb(sizeof (*mdti), BPRI_HI)) != NULL) { 21656 DB_TYPE(mp) = M_CTL; 21657 mp->b_wptr = mp->b_rptr + sizeof (*mdti); 21658 mdti = (ip_mdt_info_t *)mp->b_rptr; 21659 mdti->mdt_info_id = MDT_IOC_INFO_UPDATE; 21660 idst = &(mdti->mdt_capab); 21661 21662 /* 21663 * If the caller provides us with the capability, copy 21664 * it over into our notification message; otherwise 21665 * we zero out the capability portion. 21666 */ 21667 if (isrc != NULL) 21668 bcopy((caddr_t)isrc, (caddr_t)idst, sizeof (*idst)); 21669 else 21670 bzero((caddr_t)idst, sizeof (*idst)); 21671 } 21672 return (mp); 21673 } 21674 21675 /* 21676 * Routine which determines whether MDT can be enabled on the destination 21677 * IRE and IPC combination, and if so, allocates and returns the MDT 21678 * notification mblk that may be used by ULP. We also check if we need to 21679 * turn MDT back to 'on' when certain restrictions prohibiting us to allow 21680 * MDT usage in the past have been lifted. This gets called during IP 21681 * and ULP binding. 21682 */ 21683 mblk_t * 21684 ip_mdinfo_return(ire_t *dst_ire, conn_t *connp, char *ill_name, 21685 ill_mdt_capab_t *mdt_cap) 21686 { 21687 mblk_t *mp; 21688 boolean_t rc = B_FALSE; 21689 21690 ASSERT(dst_ire != NULL); 21691 ASSERT(connp != NULL); 21692 ASSERT(mdt_cap != NULL); 21693 21694 /* 21695 * Currently, we only support simple TCP/{IPv4,IPv6} with 21696 * Multidata, which is handled in tcp_multisend(). This 21697 * is the reason why we do all these checks here, to ensure 21698 * that we don't enable Multidata for the cases which we 21699 * can't handle at the moment. 21700 */ 21701 do { 21702 /* Only do TCP at the moment */ 21703 if (connp->conn_ulp != IPPROTO_TCP) 21704 break; 21705 21706 /* 21707 * IPSEC outbound policy present? 
Note that we get here 21708 * after calling ipsec_conn_cache_policy() where the global 21709 * policy checking is performed. conn_latch will be 21710 * non-NULL as long as there's a policy defined, 21711 * i.e. conn_out_enforce_policy may be NULL in such case 21712 * when the connection is non-secure, and hence we check 21713 * further if the latch refers to an outbound policy. 21714 */ 21715 if (CONN_IPSEC_OUT_ENCAPSULATED(connp)) 21716 break; 21717 21718 /* CGTP (multiroute) is enabled? */ 21719 if (dst_ire->ire_flags & RTF_MULTIRT) 21720 break; 21721 21722 /* Outbound IPQoS enabled? */ 21723 if (IPP_ENABLED(IPP_LOCAL_OUT)) { 21724 /* 21725 * In this case, we disable MDT for this and all 21726 * future connections going over the interface. 21727 */ 21728 mdt_cap->ill_mdt_on = 0; 21729 break; 21730 } 21731 21732 /* socket option(s) present? */ 21733 if (!CONN_IS_MD_FASTPATH(connp)) 21734 break; 21735 21736 rc = B_TRUE; 21737 /* CONSTCOND */ 21738 } while (0); 21739 21740 /* Remember the result */ 21741 connp->conn_mdt_ok = rc; 21742 21743 if (!rc) 21744 return (NULL); 21745 else if (!mdt_cap->ill_mdt_on) { 21746 /* 21747 * If MDT has been previously turned off in the past, and we 21748 * currently can do MDT (due to IPQoS policy removal, etc.) 21749 * then enable it for this interface. 21750 */ 21751 mdt_cap->ill_mdt_on = 1; 21752 ip1dbg(("ip_mdinfo_return: reenabling MDT for " 21753 "interface %s\n", ill_name)); 21754 } 21755 21756 /* Allocate the MDT info mblk */ 21757 if ((mp = ip_mdinfo_alloc(mdt_cap)) == NULL) { 21758 ip0dbg(("ip_mdinfo_return: can't enable Multidata for " 21759 "conn %p on %s (ENOMEM)\n", (void *)connp, ill_name)); 21760 return (NULL); 21761 } 21762 return (mp); 21763 } 21764 21765 /* 21766 * Create destination address attribute, and fill it with the physical 21767 * destination address and SAP taken from the template DL_UNITDATA_REQ 21768 * message block. 
21769 */ 21770 boolean_t 21771 ip_md_addr_attr(multidata_t *mmd, pdesc_t *pd, const mblk_t *dlmp) 21772 { 21773 dl_unitdata_req_t *dlurp; 21774 pattr_t *pa; 21775 pattrinfo_t pa_info; 21776 pattr_addr_t **das = (pattr_addr_t **)&pa_info.buf; 21777 uint_t das_len, das_off; 21778 21779 ASSERT(dlmp != NULL); 21780 21781 dlurp = (dl_unitdata_req_t *)dlmp->b_rptr; 21782 das_len = dlurp->dl_dest_addr_length; 21783 das_off = dlurp->dl_dest_addr_offset; 21784 21785 pa_info.type = PATTR_DSTADDRSAP; 21786 pa_info.len = sizeof (**das) + das_len - 1; 21787 21788 /* create and associate the attribute */ 21789 pa = mmd_addpattr(mmd, pd, &pa_info, B_TRUE, KM_NOSLEEP); 21790 if (pa != NULL) { 21791 ASSERT(*das != NULL); 21792 (*das)->addr_is_group = 0; 21793 (*das)->addr_len = (uint8_t)das_len; 21794 bcopy((caddr_t)dlurp + das_off, (*das)->addr, das_len); 21795 } 21796 21797 return (pa != NULL); 21798 } 21799 21800 /* 21801 * Create hardware checksum attribute and fill it with the values passed. 21802 */ 21803 boolean_t 21804 ip_md_hcksum_attr(multidata_t *mmd, pdesc_t *pd, uint32_t start_offset, 21805 uint32_t stuff_offset, uint32_t end_offset, uint32_t flags) 21806 { 21807 pattr_t *pa; 21808 pattrinfo_t pa_info; 21809 21810 ASSERT(mmd != NULL); 21811 21812 pa_info.type = PATTR_HCKSUM; 21813 pa_info.len = sizeof (pattr_hcksum_t); 21814 21815 /* create and associate the attribute */ 21816 pa = mmd_addpattr(mmd, pd, &pa_info, B_TRUE, KM_NOSLEEP); 21817 if (pa != NULL) { 21818 pattr_hcksum_t *hck = (pattr_hcksum_t *)pa_info.buf; 21819 21820 hck->hcksum_start_offset = start_offset; 21821 hck->hcksum_stuff_offset = stuff_offset; 21822 hck->hcksum_end_offset = end_offset; 21823 hck->hcksum_flags = flags; 21824 } 21825 return (pa != NULL); 21826 } 21827 21828 /* 21829 * Create zerocopy attribute and fill it with the specified flags 21830 */ 21831 boolean_t 21832 ip_md_zcopy_attr(multidata_t *mmd, pdesc_t *pd, uint_t flags) 21833 { 21834 pattr_t *pa; 21835 pattrinfo_t pa_info; 21836 
21837 ASSERT(mmd != NULL); 21838 pa_info.type = PATTR_ZCOPY; 21839 pa_info.len = sizeof (pattr_zcopy_t); 21840 21841 /* create and associate the attribute */ 21842 pa = mmd_addpattr(mmd, pd, &pa_info, B_TRUE, KM_NOSLEEP); 21843 if (pa != NULL) { 21844 pattr_zcopy_t *zcopy = (pattr_zcopy_t *)pa_info.buf; 21845 21846 zcopy->zcopy_flags = flags; 21847 } 21848 return (pa != NULL); 21849 } 21850 21851 /* 21852 * Check if ip_wput_frag_mdt() and ip_wput_frag_mdt_v6() can handle a message 21853 * block chain. We could rewrite to handle arbitrary message block chains but 21854 * that would make the code complicated and slow. Right now there three 21855 * restrictions: 21856 * 21857 * 1. The first message block must contain the complete IP header and 21858 * at least 1 byte of payload data. 21859 * 2. At most MULTIDATA_MAX_PBUFS non-empty message blocks are allowed 21860 * so that we can use a single Multidata message. 21861 * 3. No frag must be distributed over two or more message blocks so 21862 * that we don't need more than two packet descriptors per frag. 21863 * 21864 * The above restrictions allow us to support userland applications (which 21865 * will send down a single message block) and NFS over UDP (which will 21866 * send down a chain of at most three message blocks). 21867 * 21868 * We also don't use MDT for payloads with less than or equal to 21869 * ip_wput_frag_mdt_min bytes because it would cause too much overhead. 21870 */ 21871 boolean_t 21872 ip_can_frag_mdt(mblk_t *mp, ssize_t hdr_len, ssize_t len) 21873 { 21874 int blocks; 21875 ssize_t total, missing, size; 21876 21877 ASSERT(mp != NULL); 21878 ASSERT(hdr_len > 0); 21879 21880 size = MBLKL(mp) - hdr_len; 21881 if (size <= 0) 21882 return (B_FALSE); 21883 21884 /* The first mblk contains the header and some payload. */ 21885 blocks = 1; 21886 total = size; 21887 size %= len; 21888 missing = (size == 0) ? 
0 : (len - size); 21889 mp = mp->b_cont; 21890 21891 while (mp != NULL) { 21892 /* 21893 * Give up if we encounter a zero length message block. 21894 * In practice, this should rarely happen and therefore 21895 * not worth the trouble of freeing and re-linking the 21896 * mblk from the chain to handle such case. 21897 */ 21898 if ((size = MBLKL(mp)) == 0) 21899 return (B_FALSE); 21900 21901 /* Too many payload buffers for a single Multidata message? */ 21902 if (++blocks > MULTIDATA_MAX_PBUFS) 21903 return (B_FALSE); 21904 21905 total += size; 21906 /* Is a frag distributed over two or more message blocks? */ 21907 if (missing > size) 21908 return (B_FALSE); 21909 size -= missing; 21910 21911 size %= len; 21912 missing = (size == 0) ? 0 : (len - size); 21913 21914 mp = mp->b_cont; 21915 } 21916 21917 return (total > ip_wput_frag_mdt_min); 21918 } 21919 21920 /* 21921 * Outbound IPv4 fragmentation routine using MDT. 21922 */ 21923 static void 21924 ip_wput_frag_mdt(ire_t *ire, mblk_t *mp, ip_pkt_t pkt_type, int len, 21925 uint32_t frag_flag, int offset) 21926 { 21927 ipha_t *ipha_orig; 21928 int i1, ip_data_end; 21929 uint_t pkts, wroff, hdr_chunk_len, pbuf_idx; 21930 mblk_t *hdr_mp, *md_mp = NULL; 21931 unsigned char *hdr_ptr, *pld_ptr; 21932 multidata_t *mmd; 21933 ip_pdescinfo_t pdi; 21934 21935 ASSERT(DB_TYPE(mp) == M_DATA); 21936 ASSERT(MBLKL(mp) > sizeof (ipha_t)); 21937 21938 ipha_orig = (ipha_t *)mp->b_rptr; 21939 mp->b_rptr += sizeof (ipha_t); 21940 21941 /* Calculate how many packets we will send out */ 21942 i1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgsize(mp); 21943 pkts = (i1 + len - 1) / len; 21944 ASSERT(pkts > 1); 21945 21946 /* Allocate a message block which will hold all the IP Headers. */ 21947 wroff = ip_wroff_extra; 21948 hdr_chunk_len = wroff + IP_SIMPLE_HDR_LENGTH; 21949 21950 i1 = pkts * hdr_chunk_len; 21951 /* 21952 * Create the header buffer, Multidata and destination address 21953 * and SAP attribute that should be associated with it. 
21954 */ 21955 if ((hdr_mp = allocb(i1, BPRI_HI)) == NULL || 21956 ((hdr_mp->b_wptr += i1), 21957 (mmd = mmd_alloc(hdr_mp, &md_mp, KM_NOSLEEP)) == NULL) || 21958 !ip_md_addr_attr(mmd, NULL, ire->ire_dlureq_mp)) { 21959 freemsg(mp); 21960 if (md_mp == NULL) { 21961 freemsg(hdr_mp); 21962 } else { 21963 free_mmd: IP_STAT(ip_frag_mdt_discarded); 21964 freemsg(md_mp); 21965 } 21966 IP_STAT(ip_frag_mdt_allocfail); 21967 UPDATE_MIB(&ip_mib, ipOutDiscards, pkts); 21968 return; 21969 } 21970 IP_STAT(ip_frag_mdt_allocd); 21971 21972 /* 21973 * Add a payload buffer to the Multidata; this operation must not 21974 * fail, or otherwise our logic in this routine is broken. There 21975 * is no memory allocation done by the routine, so any returned 21976 * failure simply tells us that we've done something wrong. 21977 * 21978 * A failure tells us that either we're adding the same payload 21979 * buffer more than once, or we're trying to add more buffers than 21980 * allowed. None of the above cases should happen, and we panic 21981 * because either there's horrible heap corruption, and/or 21982 * programming mistake. 21983 */ 21984 if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0) 21985 goto pbuf_panic; 21986 21987 hdr_ptr = hdr_mp->b_rptr; 21988 pld_ptr = mp->b_rptr; 21989 21990 /* Establish the ending byte offset, based on the starting offset. */ 21991 offset <<= 3; 21992 ip_data_end = offset + ntohs(ipha_orig->ipha_length) - 21993 IP_SIMPLE_HDR_LENGTH; 21994 21995 pdi.flags = PDESC_HBUF_REF | PDESC_PBUF_REF; 21996 21997 while (pld_ptr < mp->b_wptr) { 21998 ipha_t *ipha; 21999 uint16_t offset_and_flags; 22000 uint16_t ip_len; 22001 int error; 22002 22003 ASSERT((hdr_ptr + hdr_chunk_len) <= hdr_mp->b_wptr); 22004 ipha = (ipha_t *)(hdr_ptr + wroff); 22005 ASSERT(OK_32PTR(ipha)); 22006 *ipha = *ipha_orig; 22007 22008 if (ip_data_end - offset > len) { 22009 offset_and_flags = IPH_MF; 22010 } else { 22011 /* 22012 * Last frag. Set len to the length of this last piece. 
22013 */ 22014 len = ip_data_end - offset; 22015 /* A frag of a frag might have IPH_MF non-zero */ 22016 offset_and_flags = 22017 ntohs(ipha->ipha_fragment_offset_and_flags) & 22018 IPH_MF; 22019 } 22020 offset_and_flags |= (uint16_t)(offset >> 3); 22021 offset_and_flags |= (uint16_t)frag_flag; 22022 /* Store the offset and flags in the IP header. */ 22023 ipha->ipha_fragment_offset_and_flags = htons(offset_and_flags); 22024 22025 /* Store the length in the IP header. */ 22026 ip_len = (uint16_t)(len + IP_SIMPLE_HDR_LENGTH); 22027 ipha->ipha_length = htons(ip_len); 22028 22029 /* 22030 * Set the IP header checksum. Note that mp is just 22031 * the header, so this is easy to pass to ip_csum. 22032 */ 22033 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 22034 22035 /* 22036 * Record offset and size of header and data of the next packet 22037 * in the multidata message. 22038 */ 22039 PDESC_HDR_ADD(&pdi, hdr_ptr, wroff, IP_SIMPLE_HDR_LENGTH, 0); 22040 PDESC_PLD_INIT(&pdi); 22041 i1 = MIN(mp->b_wptr - pld_ptr, len); 22042 ASSERT(i1 > 0); 22043 PDESC_PLD_SPAN_ADD(&pdi, pbuf_idx, pld_ptr, i1); 22044 if (i1 == len) { 22045 pld_ptr += len; 22046 } else { 22047 i1 = len - i1; 22048 mp = mp->b_cont; 22049 ASSERT(mp != NULL); 22050 ASSERT(MBLKL(mp) >= i1); 22051 /* 22052 * Attach the next payload message block to the 22053 * multidata message. 22054 */ 22055 if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0) 22056 goto pbuf_panic; 22057 PDESC_PLD_SPAN_ADD(&pdi, pbuf_idx, mp->b_rptr, i1); 22058 pld_ptr = mp->b_rptr + i1; 22059 } 22060 22061 if ((mmd_addpdesc(mmd, (pdescinfo_t *)&pdi, &error, 22062 KM_NOSLEEP)) == NULL) { 22063 /* 22064 * Any failure other than ENOMEM indicates that we 22065 * have passed in invalid pdesc info or parameters 22066 * to mmd_addpdesc, which must not happen. 22067 * 22068 * EINVAL is a result of failure on boundary checks 22069 * against the pdesc info contents. 
It should not 22070 * happen, and we panic because either there's 22071 * horrible heap corruption, and/or programming 22072 * mistake. 22073 */ 22074 if (error != ENOMEM) { 22075 cmn_err(CE_PANIC, "ip_wput_frag_mdt: " 22076 "pdesc logic error detected for " 22077 "mmd %p pinfo %p (%d)\n", 22078 (void *)mmd, (void *)&pdi, error); 22079 /* NOTREACHED */ 22080 } 22081 IP_STAT(ip_frag_mdt_addpdescfail); 22082 /* Free unattached payload message blocks as well */ 22083 md_mp->b_cont = mp->b_cont; 22084 goto free_mmd; 22085 } 22086 22087 /* Advance fragment offset. */ 22088 offset += len; 22089 22090 /* Advance to location for next header in the buffer. */ 22091 hdr_ptr += hdr_chunk_len; 22092 22093 /* Did we reach the next payload message block? */ 22094 if (pld_ptr == mp->b_wptr && mp->b_cont != NULL) { 22095 mp = mp->b_cont; 22096 /* 22097 * Attach the next message block with payload 22098 * data to the multidata message. 22099 */ 22100 if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0) 22101 goto pbuf_panic; 22102 pld_ptr = mp->b_rptr; 22103 } 22104 } 22105 22106 ASSERT(hdr_mp->b_wptr == hdr_ptr); 22107 ASSERT(mp->b_wptr == pld_ptr); 22108 22109 /* Update IP statistics */ 22110 UPDATE_MIB(&ip_mib, ipFragCreates, pkts); 22111 BUMP_MIB(&ip_mib, ipFragOKs); 22112 IP_STAT_UPDATE(ip_frag_mdt_pkt_out, pkts); 22113 22114 if (pkt_type == OB_PKT) { 22115 ire->ire_ob_pkt_count += pkts; 22116 if (ire->ire_ipif != NULL) 22117 atomic_add_32(&ire->ire_ipif->ipif_ob_pkt_count, pkts); 22118 } else { 22119 /* 22120 * The type is IB_PKT in the forwarding path and in 22121 * the mobile IP case when the packet is being reverse- 22122 * tunneled to the home agent. 
22123 */ 22124 ire->ire_ib_pkt_count += pkts; 22125 ASSERT(!IRE_IS_LOCAL(ire)); 22126 if (ire->ire_type & IRE_BROADCAST) 22127 atomic_add_32(&ire->ire_ipif->ipif_ib_pkt_count, pkts); 22128 else 22129 atomic_add_32(&ire->ire_ipif->ipif_fo_pkt_count, pkts); 22130 } 22131 ire->ire_last_used_time = lbolt; 22132 /* Send it down */ 22133 putnext(ire->ire_stq, md_mp); 22134 return; 22135 22136 pbuf_panic: 22137 cmn_err(CE_PANIC, "ip_wput_frag_mdt: payload buffer logic " 22138 "error for mmd %p pbuf %p (%d)", (void *)mmd, (void *)mp, 22139 pbuf_idx); 22140 /* NOTREACHED */ 22141 } 22142 22143 /* 22144 * Outbound IP fragmentation routine. 22145 * 22146 * NOTE : This routine does not ire_refrele the ire that is passed in 22147 * as the argument. 22148 */ 22149 static void 22150 ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag, 22151 uint32_t frag_flag) 22152 { 22153 int i1; 22154 mblk_t *ll_hdr_mp; 22155 int ll_hdr_len; 22156 int hdr_len; 22157 mblk_t *hdr_mp; 22158 ipha_t *ipha; 22159 int ip_data_end; 22160 int len; 22161 mblk_t *mp = mp_orig; 22162 int offset; 22163 queue_t *q; 22164 uint32_t v_hlen_tos_len; 22165 mblk_t *first_mp; 22166 boolean_t mctl_present; 22167 ill_t *ill; 22168 mblk_t *xmit_mp; 22169 mblk_t *carve_mp; 22170 ire_t *ire1 = NULL; 22171 ire_t *save_ire = NULL; 22172 mblk_t *next_mp = NULL; 22173 boolean_t last_frag = B_FALSE; 22174 boolean_t multirt_send = B_FALSE; 22175 ire_t *first_ire = NULL; 22176 irb_t *irb = NULL; 22177 22178 TRACE_0(TR_FAC_IP, TR_IP_WPUT_FRAG_START, 22179 "ip_wput_frag_start:"); 22180 22181 if (mp->b_datap->db_type == M_CTL) { 22182 first_mp = mp; 22183 mp_orig = mp = mp->b_cont; 22184 mctl_present = B_TRUE; 22185 } else { 22186 first_mp = mp; 22187 mctl_present = B_FALSE; 22188 } 22189 22190 ASSERT(MBLKL(mp) >= sizeof (ipha_t)); 22191 ipha = (ipha_t *)mp->b_rptr; 22192 22193 /* 22194 * If the Don't Fragment flag is on, generate an ICMP destination 22195 * unreachable, fragmentation needed. 
22196 */ 22197 offset = ntohs(ipha->ipha_fragment_offset_and_flags); 22198 if (offset & IPH_DF) { 22199 BUMP_MIB(&ip_mib, ipFragFails); 22200 /* 22201 * Need to compute hdr checksum if called from ip_wput_ire. 22202 * Note that ip_rput_forward verifies the checksum before 22203 * calling this routine so in that case this is a noop. 22204 */ 22205 ipha->ipha_hdr_checksum = 0; 22206 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 22207 icmp_frag_needed(ire->ire_stq, first_mp, max_frag); 22208 TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, 22209 "ip_wput_frag_end:(%S)", 22210 "don't fragment"); 22211 return; 22212 } 22213 if (mctl_present) 22214 freeb(first_mp); 22215 /* 22216 * Establish the starting offset. May not be zero if we are fragging 22217 * a fragment that is being forwarded. 22218 */ 22219 offset = offset & IPH_OFFSET; 22220 22221 /* TODO why is this test needed? */ 22222 v_hlen_tos_len = ((uint32_t *)ipha)[0]; 22223 if (((max_frag - LENGTH) & ~7) < 8) { 22224 /* TODO: notify ulp somehow */ 22225 BUMP_MIB(&ip_mib, ipFragFails); 22226 freemsg(mp); 22227 TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, 22228 "ip_wput_frag_end:(%S)", 22229 "len < 8"); 22230 return; 22231 } 22232 22233 hdr_len = (V_HLEN & 0xF) << 2; 22234 22235 ipha->ipha_hdr_checksum = 0; 22236 22237 /* 22238 * Establish the number of bytes maximum per frag, after putting 22239 * in the header. 22240 */ 22241 len = (max_frag - hdr_len) & ~7; 22242 22243 /* Check if we can use MDT to send out the frags. 
*/ 22244 ASSERT(!IRE_IS_LOCAL(ire)); 22245 if (hdr_len == IP_SIMPLE_HDR_LENGTH && ip_multidata_outbound && 22246 !(ire->ire_flags & RTF_MULTIRT) && !IPP_ENABLED(IPP_LOCAL_OUT) && 22247 (ill = ire_to_ill(ire)) != NULL && ILL_MDT_CAPABLE(ill) && 22248 IP_CAN_FRAG_MDT(mp, IP_SIMPLE_HDR_LENGTH, len)) { 22249 ASSERT(ill->ill_mdt_capab != NULL); 22250 if (!ill->ill_mdt_capab->ill_mdt_on) { 22251 /* 22252 * If MDT has been previously turned off in the past, 22253 * and we currently can do MDT (due to IPQoS policy 22254 * removal, etc.) then enable it for this interface. 22255 */ 22256 ill->ill_mdt_capab->ill_mdt_on = 1; 22257 ip1dbg(("ip_wput_frag: enabled MDT for interface %s\n", 22258 ill->ill_name)); 22259 } 22260 ip_wput_frag_mdt(ire, mp, pkt_type, len, frag_flag, 22261 offset); 22262 return; 22263 } 22264 22265 /* Get a copy of the header for the trailing frags */ 22266 hdr_mp = ip_wput_frag_copyhdr((uchar_t *)ipha, hdr_len, offset); 22267 if (!hdr_mp) { 22268 BUMP_MIB(&ip_mib, ipOutDiscards); 22269 freemsg(mp); 22270 TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, 22271 "ip_wput_frag_end:(%S)", 22272 "couldn't copy hdr"); 22273 return; 22274 } 22275 if (DB_CRED(mp) != NULL) 22276 mblk_setcred(hdr_mp, DB_CRED(mp)); 22277 22278 /* Store the starting offset, with the MoreFrags flag. */ 22279 i1 = offset | IPH_MF | frag_flag; 22280 ipha->ipha_fragment_offset_and_flags = htons((uint16_t)i1); 22281 22282 /* Establish the ending byte offset, based on the starting offset. */ 22283 offset <<= 3; 22284 ip_data_end = offset + ntohs(ipha->ipha_length) - hdr_len; 22285 22286 /* Store the length of the first fragment in the IP header. */ 22287 i1 = len + hdr_len; 22288 ASSERT(i1 <= IP_MAXPACKET); 22289 ipha->ipha_length = htons((uint16_t)i1); 22290 22291 /* 22292 * Compute the IP header checksum for the first frag. We have to 22293 * watch out that we stop at the end of the header. 
22294 */ 22295 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 22296 22297 /* 22298 * Now carve off the first frag. Note that this will include the 22299 * original IP header. 22300 */ 22301 if (!(mp = ip_carve_mp(&mp_orig, i1))) { 22302 BUMP_MIB(&ip_mib, ipOutDiscards); 22303 freeb(hdr_mp); 22304 freemsg(mp_orig); 22305 TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, 22306 "ip_wput_frag_end:(%S)", 22307 "couldn't carve first"); 22308 return; 22309 } 22310 22311 /* 22312 * Multirouting case. Each fragment is replicated 22313 * via all non-condemned RTF_MULTIRT routes 22314 * currently resolved. 22315 * We ensure that first_ire is the first RTF_MULTIRT 22316 * ire in the bucket. 22317 */ 22318 if (ire->ire_flags & RTF_MULTIRT) { 22319 irb = ire->ire_bucket; 22320 ASSERT(irb != NULL); 22321 22322 multirt_send = B_TRUE; 22323 22324 /* Make sure we do not omit any multiroute ire. */ 22325 IRB_REFHOLD(irb); 22326 for (first_ire = irb->irb_ire; 22327 first_ire != NULL; 22328 first_ire = first_ire->ire_next) { 22329 if ((first_ire->ire_flags & RTF_MULTIRT) && 22330 (first_ire->ire_addr == ire->ire_addr) && 22331 !(first_ire->ire_marks & 22332 (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN))) 22333 break; 22334 } 22335 22336 if (first_ire != NULL) { 22337 if (first_ire != ire) { 22338 IRE_REFHOLD(first_ire); 22339 /* 22340 * Do not release the ire passed in 22341 * as the argument. 22342 */ 22343 ire = first_ire; 22344 } else { 22345 first_ire = NULL; 22346 } 22347 } 22348 IRB_REFRELE(irb); 22349 22350 /* 22351 * Save the first ire; we will need to restore it 22352 * for the trailing frags. 22353 * We REFHOLD save_ire, as each iterated ire will be 22354 * REFRELEd. 22355 */ 22356 save_ire = ire; 22357 IRE_REFHOLD(save_ire); 22358 } 22359 22360 /* 22361 * First fragment emission loop. 22362 * In most cases, the emission loop below is entered only 22363 * once. 
Only in the case where the ire holds the RTF_MULTIRT 22364 * flag, do we loop to process all RTF_MULTIRT ires in the 22365 * bucket, and send the fragment through all crossed 22366 * RTF_MULTIRT routes. 22367 */ 22368 do { 22369 if (ire->ire_flags & RTF_MULTIRT) { 22370 /* 22371 * We are in a multiple send case, need to get 22372 * the next ire and make a copy of the packet. 22373 * ire1 holds here the next ire to process in the 22374 * bucket. If multirouting is expected, 22375 * any non-RTF_MULTIRT ire that has the 22376 * right destination address is ignored. 22377 * 22378 * We have to take into account the MTU of 22379 * each walked ire. max_frag is set by the 22380 * the caller and generally refers to 22381 * the primary ire entry. Here we ensure that 22382 * no route with a lower MTU will be used, as 22383 * fragments are carved once for all ires, 22384 * then replicated. 22385 */ 22386 ASSERT(irb != NULL); 22387 IRB_REFHOLD(irb); 22388 for (ire1 = ire->ire_next; 22389 ire1 != NULL; 22390 ire1 = ire1->ire_next) { 22391 if ((ire1->ire_flags & RTF_MULTIRT) == 0) 22392 continue; 22393 if (ire1->ire_addr != ire->ire_addr) 22394 continue; 22395 if (ire1->ire_marks & 22396 (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN)) 22397 continue; 22398 /* 22399 * Ensure we do not exceed the MTU 22400 * of the next route. 22401 */ 22402 if (ire1->ire_max_frag < max_frag) { 22403 ip_multirt_bad_mtu(ire1, max_frag); 22404 continue; 22405 } 22406 22407 /* Got one. */ 22408 IRE_REFHOLD(ire1); 22409 break; 22410 } 22411 IRB_REFRELE(irb); 22412 22413 if (ire1 != NULL) { 22414 next_mp = copyb(mp); 22415 if ((next_mp == NULL) || 22416 ((mp->b_cont != NULL) && 22417 ((next_mp->b_cont = 22418 dupmsg(mp->b_cont)) == NULL))) { 22419 freemsg(next_mp); 22420 next_mp = NULL; 22421 ire_refrele(ire1); 22422 ire1 = NULL; 22423 } 22424 } 22425 22426 /* Last multiroute ire; don't loop anymore. 
*/ 22427 if (ire1 == NULL) { 22428 multirt_send = B_FALSE; 22429 } 22430 } 22431 22432 ll_hdr_len = 0; 22433 LOCK_IRE_FP_MP(ire); 22434 ll_hdr_mp = ire->ire_fp_mp; 22435 if (ll_hdr_mp != NULL) { 22436 ASSERT(ll_hdr_mp->b_datap->db_type == M_DATA); 22437 ll_hdr_len = ll_hdr_mp->b_wptr - ll_hdr_mp->b_rptr; 22438 } else { 22439 ll_hdr_mp = ire->ire_dlureq_mp; 22440 } 22441 22442 /* If there is a transmit header, get a copy for this frag. */ 22443 /* 22444 * TODO: should check db_ref before calling ip_carve_mp since 22445 * it might give us a dup. 22446 */ 22447 if (!ll_hdr_mp) { 22448 /* No xmit header. */ 22449 xmit_mp = mp; 22450 } else if (mp->b_datap->db_ref == 1 && 22451 ll_hdr_len != 0 && 22452 ll_hdr_len <= mp->b_rptr - mp->b_datap->db_base) { 22453 /* M_DATA fastpath */ 22454 mp->b_rptr -= ll_hdr_len; 22455 bcopy(ll_hdr_mp->b_rptr, mp->b_rptr, ll_hdr_len); 22456 xmit_mp = mp; 22457 } else if (!(xmit_mp = copyb(ll_hdr_mp))) { 22458 UNLOCK_IRE_FP_MP(ire); 22459 BUMP_MIB(&ip_mib, ipOutDiscards); 22460 freeb(hdr_mp); 22461 freemsg(mp); 22462 freemsg(mp_orig); 22463 TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, 22464 "ip_wput_frag_end:(%S)", 22465 "discard"); 22466 22467 if (multirt_send) { 22468 ASSERT(ire1); 22469 ASSERT(next_mp); 22470 22471 freemsg(next_mp); 22472 ire_refrele(ire1); 22473 } 22474 if (save_ire != NULL) 22475 IRE_REFRELE(save_ire); 22476 22477 if (first_ire != NULL) 22478 ire_refrele(first_ire); 22479 return; 22480 } else { 22481 xmit_mp->b_cont = mp; 22482 if (DB_CRED(mp) != NULL) 22483 mblk_setcred(xmit_mp, DB_CRED(mp)); 22484 /* Get priority marking, if any. */ 22485 if (DB_TYPE(xmit_mp) == M_DATA) 22486 xmit_mp->b_band = mp->b_band; 22487 } 22488 UNLOCK_IRE_FP_MP(ire); 22489 q = ire->ire_stq; 22490 BUMP_MIB(&ip_mib, ipFragCreates); 22491 putnext(q, xmit_mp); 22492 if (pkt_type != OB_PKT) { 22493 /* 22494 * Update the packet count of trailing 22495 * RTF_MULTIRT ires. 
22496 */ 22497 UPDATE_OB_PKT_COUNT(ire); 22498 } 22499 22500 if (multirt_send) { 22501 /* 22502 * We are in a multiple send case; look for 22503 * the next ire and re-enter the loop. 22504 */ 22505 ASSERT(ire1); 22506 ASSERT(next_mp); 22507 /* REFRELE the current ire before looping */ 22508 ire_refrele(ire); 22509 ire = ire1; 22510 ire1 = NULL; 22511 mp = next_mp; 22512 next_mp = NULL; 22513 } 22514 } while (multirt_send); 22515 22516 ASSERT(ire1 == NULL); 22517 22518 /* Restore the original ire; we need it for the trailing frags */ 22519 if (save_ire != NULL) { 22520 /* REFRELE the last iterated ire */ 22521 ire_refrele(ire); 22522 /* save_ire has been REFHOLDed */ 22523 ire = save_ire; 22524 save_ire = NULL; 22525 q = ire->ire_stq; 22526 } 22527 22528 if (pkt_type == OB_PKT) { 22529 UPDATE_OB_PKT_COUNT(ire); 22530 } else { 22531 UPDATE_IB_PKT_COUNT(ire); 22532 } 22533 22534 /* Advance the offset to the second frag starting point. */ 22535 offset += len; 22536 /* 22537 * Update hdr_len from the copied header - there might be less options 22538 * in the later fragments. 22539 */ 22540 hdr_len = IPH_HDR_LENGTH(hdr_mp->b_rptr); 22541 /* Loop until done. */ 22542 for (;;) { 22543 uint16_t offset_and_flags; 22544 uint16_t ip_len; 22545 22546 if (ip_data_end - offset > len) { 22547 /* 22548 * Carve off the appropriate amount from the original 22549 * datagram. 22550 */ 22551 if (!(carve_mp = ip_carve_mp(&mp_orig, len))) { 22552 mp = NULL; 22553 break; 22554 } 22555 /* 22556 * More frags after this one. Get another copy 22557 * of the header. 
22558 */ 22559 if (carve_mp->b_datap->db_ref == 1 && 22560 hdr_mp->b_wptr - hdr_mp->b_rptr < 22561 carve_mp->b_rptr - carve_mp->b_datap->db_base) { 22562 /* Inline IP header */ 22563 carve_mp->b_rptr -= hdr_mp->b_wptr - 22564 hdr_mp->b_rptr; 22565 bcopy(hdr_mp->b_rptr, carve_mp->b_rptr, 22566 hdr_mp->b_wptr - hdr_mp->b_rptr); 22567 mp = carve_mp; 22568 } else { 22569 if (!(mp = copyb(hdr_mp))) { 22570 freemsg(carve_mp); 22571 break; 22572 } 22573 /* Get priority marking, if any. */ 22574 mp->b_band = carve_mp->b_band; 22575 mp->b_cont = carve_mp; 22576 } 22577 ipha = (ipha_t *)mp->b_rptr; 22578 offset_and_flags = IPH_MF; 22579 } else { 22580 /* 22581 * Last frag. Consume the header. Set len to 22582 * the length of this last piece. 22583 */ 22584 len = ip_data_end - offset; 22585 22586 /* 22587 * Carve off the appropriate amount from the original 22588 * datagram. 22589 */ 22590 if (!(carve_mp = ip_carve_mp(&mp_orig, len))) { 22591 mp = NULL; 22592 break; 22593 } 22594 if (carve_mp->b_datap->db_ref == 1 && 22595 hdr_mp->b_wptr - hdr_mp->b_rptr < 22596 carve_mp->b_rptr - carve_mp->b_datap->db_base) { 22597 /* Inline IP header */ 22598 carve_mp->b_rptr -= hdr_mp->b_wptr - 22599 hdr_mp->b_rptr; 22600 bcopy(hdr_mp->b_rptr, carve_mp->b_rptr, 22601 hdr_mp->b_wptr - hdr_mp->b_rptr); 22602 mp = carve_mp; 22603 freeb(hdr_mp); 22604 hdr_mp = mp; 22605 } else { 22606 mp = hdr_mp; 22607 /* Get priority marking, if any. */ 22608 mp->b_band = carve_mp->b_band; 22609 mp->b_cont = carve_mp; 22610 } 22611 ipha = (ipha_t *)mp->b_rptr; 22612 /* A frag of a frag might have IPH_MF non-zero */ 22613 offset_and_flags = 22614 ntohs(ipha->ipha_fragment_offset_and_flags) & 22615 IPH_MF; 22616 } 22617 offset_and_flags |= (uint16_t)(offset >> 3); 22618 offset_and_flags |= (uint16_t)frag_flag; 22619 /* Store the offset and flags in the IP header. */ 22620 ipha->ipha_fragment_offset_and_flags = htons(offset_and_flags); 22621 22622 /* Store the length in the IP header. 
*/ 22623 ip_len = (uint16_t)(len + hdr_len); 22624 ipha->ipha_length = htons(ip_len); 22625 22626 /* 22627 * Set the IP header checksum. Note that mp is just 22628 * the header, so this is easy to pass to ip_csum. 22629 */ 22630 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 22631 22632 /* Attach a transmit header, if any, and ship it. */ 22633 if (pkt_type == OB_PKT) { 22634 UPDATE_OB_PKT_COUNT(ire); 22635 } else { 22636 UPDATE_IB_PKT_COUNT(ire); 22637 } 22638 22639 if (ire->ire_flags & RTF_MULTIRT) { 22640 irb = ire->ire_bucket; 22641 ASSERT(irb != NULL); 22642 22643 multirt_send = B_TRUE; 22644 22645 /* 22646 * Save the original ire; we will need to restore it 22647 * for the tailing frags. 22648 */ 22649 save_ire = ire; 22650 IRE_REFHOLD(save_ire); 22651 } 22652 /* 22653 * Emission loop for this fragment, similar 22654 * to what is done for the first fragment. 22655 */ 22656 do { 22657 if (multirt_send) { 22658 /* 22659 * We are in a multiple send case, need to get 22660 * the next ire and make a copy of the packet. 22661 */ 22662 ASSERT(irb != NULL); 22663 IRB_REFHOLD(irb); 22664 for (ire1 = ire->ire_next; 22665 ire1 != NULL; 22666 ire1 = ire1->ire_next) { 22667 if (!(ire1->ire_flags & RTF_MULTIRT)) 22668 continue; 22669 if (ire1->ire_addr != ire->ire_addr) 22670 continue; 22671 if (ire1->ire_marks & 22672 (IRE_MARK_CONDEMNED| 22673 IRE_MARK_HIDDEN)) 22674 continue; 22675 /* 22676 * Ensure we do not exceed the MTU 22677 * of the next route. 22678 */ 22679 if (ire1->ire_max_frag < max_frag) { 22680 ip_multirt_bad_mtu(ire1, 22681 max_frag); 22682 continue; 22683 } 22684 22685 /* Got one. 
*/ 22686 IRE_REFHOLD(ire1); 22687 break; 22688 } 22689 IRB_REFRELE(irb); 22690 22691 if (ire1 != NULL) { 22692 next_mp = copyb(mp); 22693 if ((next_mp == NULL) || 22694 ((mp->b_cont != NULL) && 22695 ((next_mp->b_cont = 22696 dupmsg(mp->b_cont)) == NULL))) { 22697 freemsg(next_mp); 22698 next_mp = NULL; 22699 ire_refrele(ire1); 22700 ire1 = NULL; 22701 } 22702 } 22703 22704 /* Last multiroute ire; don't loop anymore. */ 22705 if (ire1 == NULL) { 22706 multirt_send = B_FALSE; 22707 } 22708 } 22709 22710 /* Update transmit header */ 22711 ll_hdr_len = 0; 22712 LOCK_IRE_FP_MP(ire); 22713 ll_hdr_mp = ire->ire_fp_mp; 22714 if (ll_hdr_mp != NULL) { 22715 ASSERT(ll_hdr_mp->b_datap->db_type == M_DATA); 22716 ll_hdr_len = MBLKL(ll_hdr_mp); 22717 } else { 22718 ll_hdr_mp = ire->ire_dlureq_mp; 22719 } 22720 22721 if (!ll_hdr_mp) { 22722 xmit_mp = mp; 22723 } else if (mp->b_datap->db_ref == 1 && 22724 ll_hdr_len != 0 && 22725 ll_hdr_len <= mp->b_rptr - mp->b_datap->db_base) { 22726 /* M_DATA fastpath */ 22727 mp->b_rptr -= ll_hdr_len; 22728 bcopy(ll_hdr_mp->b_rptr, mp->b_rptr, 22729 ll_hdr_len); 22730 xmit_mp = mp; 22731 } else if ((xmit_mp = copyb(ll_hdr_mp)) != NULL) { 22732 xmit_mp->b_cont = mp; 22733 if (DB_CRED(mp) != NULL) 22734 mblk_setcred(xmit_mp, DB_CRED(mp)); 22735 /* Get priority marking, if any. */ 22736 if (DB_TYPE(xmit_mp) == M_DATA) 22737 xmit_mp->b_band = mp->b_band; 22738 } else { 22739 /* 22740 * Exit both the replication and 22741 * fragmentation loops. 22742 */ 22743 UNLOCK_IRE_FP_MP(ire); 22744 goto drop_pkt; 22745 } 22746 UNLOCK_IRE_FP_MP(ire); 22747 BUMP_MIB(&ip_mib, ipFragCreates); 22748 putnext(q, xmit_mp); 22749 22750 if (pkt_type != OB_PKT) { 22751 /* 22752 * Update the packet count of trailing 22753 * RTF_MULTIRT ires. 22754 */ 22755 UPDATE_OB_PKT_COUNT(ire); 22756 } 22757 22758 /* All done if we just consumed the hdr_mp. 
*/ 22759 if (mp == hdr_mp) { 22760 last_frag = B_TRUE; 22761 } 22762 22763 if (multirt_send) { 22764 /* 22765 * We are in a multiple send case; look for 22766 * the next ire and re-enter the loop. 22767 */ 22768 ASSERT(ire1); 22769 ASSERT(next_mp); 22770 /* REFRELE the current ire before looping */ 22771 ire_refrele(ire); 22772 ire = ire1; 22773 ire1 = NULL; 22774 q = ire->ire_stq; 22775 mp = next_mp; 22776 next_mp = NULL; 22777 } 22778 } while (multirt_send); 22779 /* 22780 * Restore the original ire; we need it for the 22781 * trailing frags 22782 */ 22783 if (save_ire != NULL) { 22784 ASSERT(ire1 == NULL); 22785 /* REFRELE the last iterated ire */ 22786 ire_refrele(ire); 22787 /* save_ire has been REFHOLDed */ 22788 ire = save_ire; 22789 q = ire->ire_stq; 22790 save_ire = NULL; 22791 } 22792 22793 if (last_frag) { 22794 BUMP_MIB(&ip_mib, ipFragOKs); 22795 TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, 22796 "ip_wput_frag_end:(%S)", 22797 "consumed hdr_mp"); 22798 22799 if (first_ire != NULL) 22800 ire_refrele(first_ire); 22801 return; 22802 } 22803 /* Otherwise, advance and loop. */ 22804 offset += len; 22805 } 22806 22807 drop_pkt: 22808 /* Clean up following allocation failure. 
 */
	/*
	 * mp may alias hdr_mp or mp_orig here; the guards below keep us
	 * from freeing the same message twice.
	 */
	BUMP_MIB(&ip_mib, ipOutDiscards);
	freemsg(mp);
	if (mp != hdr_mp)
		freeb(hdr_mp);
	if (mp != mp_orig)
		freemsg(mp_orig);

	if (save_ire != NULL)
		IRE_REFRELE(save_ire);
	if (first_ire != NULL)
		ire_refrele(first_ire);

	TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END,
	    "ip_wput_frag_end:(%S)",
	    "end--alloc failure");
}

/*
 * Copy the header plus those options which have the copy bit set
 *
 * Allocates a fresh mblk (with ip_wroff_extra bytes of headroom) and
 * returns it, or NULL on allocation failure; the caller owns the result.
 * When the datagram is itself already a fragment (offset != 0) the whole
 * header is copied unchanged -- presumably the options were already pruned
 * when the original first fragment was built (TODO: confirm with caller).
 */
static mblk_t *
ip_wput_frag_copyhdr(uchar_t *rptr, int hdr_len, int offset)
{
	mblk_t	*mp;
	uchar_t	*up;

	/*
	 * Quick check if we need to look for options without the copy bit
	 * set
	 */
	mp = allocb(ip_wroff_extra + hdr_len, BPRI_HI);
	if (!mp)
		return (mp);
	mp->b_rptr += ip_wroff_extra;
	if (hdr_len == IP_SIMPLE_HDR_LENGTH || offset != 0) {
		/* No options to filter: copy the header verbatim. */
		bcopy(rptr, mp->b_rptr, hdr_len);
		mp->b_wptr += hdr_len + ip_wroff_extra;
		return (mp);
	}
	/* Copy the fixed header, then walk the options one by one. */
	up = mp->b_rptr;
	bcopy(rptr, up, IP_SIMPLE_HDR_LENGTH);
	up += IP_SIMPLE_HDR_LENGTH;
	rptr += IP_SIMPLE_HDR_LENGTH;
	hdr_len -= IP_SIMPLE_HDR_LENGTH;
	while (hdr_len > 0) {
		uint32_t optval;
		uint32_t optlen;

		optval = *rptr;
		if (optval == IPOPT_EOL)
			break;
		if (optval == IPOPT_NOP)
			optlen = 1;
		else
			optlen = rptr[1];
		/* Only options with the copy bit set go into the copy. */
		if (optval & IPOPT_COPY) {
			bcopy(rptr, up, optlen);
			up += optlen;
		}
		rptr += optlen;
		hdr_len -= optlen;
	}
	/*
	 * Make sure that we drop an even number of words by filling
	 * with EOL to the next word boundary.
	 */
	for (hdr_len = up - (mp->b_rptr + IP_SIMPLE_HDR_LENGTH);
	    hdr_len & 0x3; hdr_len++)
		*up++ = IPOPT_EOL;
	mp->b_wptr = up;
	/* Update header length (IHL is in 32-bit words; keep the version) */
	mp->b_rptr[0] = (uint8_t)((IP_VERSION << 4) | ((up - mp->b_rptr) >> 2));
	return (mp);
}

/*
 * Delivery to local recipients including fanout to multiple recipients.
 * Does not do checksumming of UDP/TCP.
 * Note: q should be the read side queue for either the ill or conn.
 * Note: rq should be the read side q for the lower (ill) stream.
 * We don't send packets to IPPF processing, thus the last argument
 * to all the fanout calls are B_FALSE.
 */
void
ip_wput_local(queue_t *q, ill_t *ill, ipha_t *ipha, mblk_t *mp, ire_t *ire,
    int fanout_flags, zoneid_t zoneid)
{
	uint32_t protocol;
	mblk_t	*first_mp;
	boolean_t mctl_present;
	int ire_type;
#define	rptr	((uchar_t *)ipha)

	TRACE_1(TR_FAC_IP, TR_IP_WPUT_LOCAL_START,
	    "ip_wput_local_start: q %p", q);

	if (ire != NULL) {
		ire_type = ire->ire_type;
	} else {
		/*
		 * Only ip_multicast_loopback() calls us with a NULL ire. If the
		 * packet is not multicast, we can't tell the ire type.
		 */
		ASSERT(CLASSD(ipha->ipha_dst));
		ire_type = IRE_BROADCAST;
	}

	first_mp = mp;
	if (first_mp->b_datap->db_type == M_CTL) {
		ipsec_out_t *io = (ipsec_out_t *)first_mp->b_rptr;
		if (!io->ipsec_out_secure) {
			/*
			 * This ipsec_out_t was allocated in ip_wput
			 * for multicast packets to store the ill_index.
			 * As this is being delivered locally, we don't
			 * need this anymore.
22925 */ 22926 mp = first_mp->b_cont; 22927 freeb(first_mp); 22928 first_mp = mp; 22929 mctl_present = B_FALSE; 22930 } else { 22931 mctl_present = B_TRUE; 22932 mp = first_mp->b_cont; 22933 ASSERT(mp != NULL); 22934 ipsec_out_to_in(first_mp); 22935 } 22936 } else { 22937 mctl_present = B_FALSE; 22938 } 22939 22940 loopback_packets++; 22941 22942 ip2dbg(("ip_wput_local: from 0x%x to 0x%x in zone %d\n", 22943 ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst), zoneid)); 22944 if (!IS_SIMPLE_IPH(ipha)) { 22945 ip_wput_local_options(ipha); 22946 } 22947 22948 protocol = ipha->ipha_protocol; 22949 switch (protocol) { 22950 case IPPROTO_ICMP: { 22951 ire_t *ire_zone; 22952 ilm_t *ilm; 22953 mblk_t *mp1; 22954 zoneid_t last_zoneid; 22955 22956 if (CLASSD(ipha->ipha_dst) && 22957 !(ill->ill_phyint->phyint_flags & PHYI_LOOPBACK)) { 22958 ASSERT(ire_type == IRE_BROADCAST); 22959 /* 22960 * In the multicast case, applications may have joined 22961 * the group from different zones, so we need to deliver 22962 * the packet to each of them. Loop through the 22963 * multicast memberships structures (ilm) on the receive 22964 * ill and send a copy of the packet up each matching 22965 * one. However, we don't do this for multicasts sent on 22966 * the loopback interface (PHYI_LOOPBACK flag set) as 22967 * they must stay in the sender's zone. 22968 * 22969 * ilm_add_v6() ensures that ilms in the same zone are 22970 * contiguous in the ill_ilm list. We use this property 22971 * to avoid sending duplicates needed when two 22972 * applications in the same zone join the same group on 22973 * different logical interfaces: we ignore the ilm if 22974 * its zoneid is the same as the last matching one. 22975 * In addition, the sending of the packet for 22976 * ire_zoneid is delayed until all of the other ilms 22977 * have been exhausted. 
22978 */ 22979 last_zoneid = -1; 22980 ILM_WALKER_HOLD(ill); 22981 for (ilm = ill->ill_ilm; ilm != NULL; 22982 ilm = ilm->ilm_next) { 22983 if ((ilm->ilm_flags & ILM_DELETED) || 22984 ipha->ipha_dst != ilm->ilm_addr || 22985 ilm->ilm_zoneid == last_zoneid || 22986 ilm->ilm_zoneid == zoneid || 22987 !(ilm->ilm_ipif->ipif_flags & IPIF_UP)) 22988 continue; 22989 mp1 = ip_copymsg(first_mp); 22990 if (mp1 == NULL) 22991 continue; 22992 icmp_inbound(q, mp1, B_TRUE, ill, 0, 0, 22993 mctl_present, B_FALSE, ill, 22994 ilm->ilm_zoneid); 22995 last_zoneid = ilm->ilm_zoneid; 22996 } 22997 ILM_WALKER_RELE(ill); 22998 /* 22999 * Loopback case: the sending endpoint has 23000 * IP_MULTICAST_LOOP disabled, therefore we don't 23001 * dispatch the multicast packet to the sending zone. 23002 */ 23003 if (fanout_flags & IP_FF_NO_MCAST_LOOP) { 23004 freemsg(first_mp); 23005 return; 23006 } 23007 } else if (ire_type == IRE_BROADCAST) { 23008 /* 23009 * In the broadcast case, there may be many zones 23010 * which need a copy of the packet delivered to them. 23011 * There is one IRE_BROADCAST per broadcast address 23012 * and per zone; we walk those using a helper function. 23013 * In addition, the sending of the packet for zoneid is 23014 * delayed until all of the other ires have been 23015 * processed. 
23016 */ 23017 IRB_REFHOLD(ire->ire_bucket); 23018 ire_zone = NULL; 23019 while ((ire_zone = ire_get_next_bcast_ire(ire_zone, 23020 ire)) != NULL) { 23021 mp1 = ip_copymsg(first_mp); 23022 if (mp1 == NULL) 23023 continue; 23024 23025 UPDATE_IB_PKT_COUNT(ire_zone); 23026 ire_zone->ire_last_used_time = lbolt; 23027 icmp_inbound(q, mp1, B_TRUE, ill, 0, 0, 23028 mctl_present, B_FALSE, ill, 23029 ire_zone->ire_zoneid); 23030 } 23031 IRB_REFRELE(ire->ire_bucket); 23032 } 23033 icmp_inbound(q, first_mp, (ire_type == IRE_BROADCAST), ill, 0, 23034 0, mctl_present, B_FALSE, ill, zoneid); 23035 TRACE_2(TR_FAC_IP, TR_IP_WPUT_LOCAL_END, 23036 "ip_wput_local_end: q %p (%S)", 23037 q, "icmp"); 23038 return; 23039 } 23040 case IPPROTO_IGMP: 23041 if (igmp_input(q, mp, ill)) { 23042 /* Bad packet - discarded by igmp_input */ 23043 TRACE_2(TR_FAC_IP, TR_IP_WPUT_LOCAL_END, 23044 "ip_wput_local_end: q %p (%S)", 23045 q, "igmp_input--bad packet"); 23046 if (mctl_present) 23047 freeb(first_mp); 23048 return; 23049 } 23050 /* 23051 * igmp_input() may have pulled up the message so ipha needs to 23052 * be reinitialized. 23053 */ 23054 ipha = (ipha_t *)mp->b_rptr; 23055 /* deliver to local raw users */ 23056 break; 23057 case IPPROTO_ENCAP: 23058 /* 23059 * This case is covered by either ip_fanout_proto, or by 23060 * the above security processing for self-tunneled packets. 23061 */ 23062 break; 23063 case IPPROTO_UDP: { 23064 uint16_t *up; 23065 uint32_t ports; 23066 23067 up = (uint16_t *)(rptr + IPH_HDR_LENGTH(ipha) + 23068 UDP_PORTS_OFFSET); 23069 /* Force a 'valid' checksum. 
*/ 23070 up[3] = 0; 23071 23072 ports = *(uint32_t *)up; 23073 ip_fanout_udp(q, first_mp, ill, ipha, ports, 23074 (ire_type == IRE_BROADCAST), 23075 fanout_flags | IP_FF_SEND_ICMP | IP_FF_HDR_COMPLETE | 23076 IP_FF_SEND_SLLA | IP_FF_IP6INFO, mctl_present, B_FALSE, 23077 ill, zoneid); 23078 TRACE_2(TR_FAC_IP, TR_IP_WPUT_LOCAL_END, 23079 "ip_wput_local_end: q %p (%S)", q, "ip_fanout_udp"); 23080 return; 23081 } 23082 case IPPROTO_TCP: { 23083 23084 /* 23085 * For TCP, discard broadcast packets. 23086 */ 23087 if ((ushort_t)ire_type == IRE_BROADCAST) { 23088 freemsg(first_mp); 23089 BUMP_MIB(&ip_mib, ipInDiscards); 23090 ip2dbg(("ip_wput_local: discard broadcast\n")); 23091 return; 23092 } 23093 23094 if (mp->b_datap->db_type == M_DATA) { 23095 /* 23096 * M_DATA mblk, so init mblk (chain) for no struio(). 23097 */ 23098 mblk_t *mp1 = mp; 23099 23100 do 23101 mp1->b_datap->db_struioflag = 0; 23102 while ((mp1 = mp1->b_cont) != NULL); 23103 } 23104 ASSERT((rptr + IPH_HDR_LENGTH(ipha) + TCP_PORTS_OFFSET + 4) 23105 <= mp->b_wptr); 23106 ip_fanout_tcp(q, first_mp, ill, ipha, 23107 fanout_flags | IP_FF_SEND_ICMP | IP_FF_HDR_COMPLETE | 23108 IP_FF_SYN_ADDIRE | IP_FF_IP6INFO, 23109 mctl_present, B_FALSE, zoneid); 23110 TRACE_2(TR_FAC_IP, TR_IP_WPUT_LOCAL_END, 23111 "ip_wput_local_end: q %p (%S)", q, "ip_fanout_tcp"); 23112 return; 23113 } 23114 case IPPROTO_SCTP: 23115 { 23116 uint32_t ports; 23117 23118 bcopy(rptr + IPH_HDR_LENGTH(ipha), &ports, sizeof (ports)); 23119 ip_fanout_sctp(first_mp, ill, ipha, ports, 23120 fanout_flags | IP_FF_SEND_ICMP | IP_FF_HDR_COMPLETE | 23121 IP_FF_IP6INFO, 23122 mctl_present, B_FALSE, 0, zoneid); 23123 return; 23124 } 23125 23126 default: 23127 break; 23128 } 23129 /* 23130 * Find a client for some other protocol. We give 23131 * copies to multiple clients, if more than one is 23132 * bound. 
23133 */ 23134 ip_fanout_proto(q, first_mp, ill, ipha, 23135 fanout_flags | IP_FF_SEND_ICMP | IP_FF_HDR_COMPLETE | IP_FF_RAWIP, 23136 mctl_present, B_FALSE, ill, zoneid); 23137 TRACE_2(TR_FAC_IP, TR_IP_WPUT_LOCAL_END, 23138 "ip_wput_local_end: q %p (%S)", q, "ip_fanout_proto"); 23139 #undef rptr 23140 } 23141 23142 /* 23143 * Update any source route, record route, or timestamp options. 23144 * Check that we are at end of strict source route. 23145 * The options have been sanity checked by ip_wput_options(). 23146 */ 23147 static void 23148 ip_wput_local_options(ipha_t *ipha) 23149 { 23150 ipoptp_t opts; 23151 uchar_t *opt; 23152 uint8_t optval; 23153 uint8_t optlen; 23154 ipaddr_t dst; 23155 uint32_t ts; 23156 ire_t *ire; 23157 timestruc_t now; 23158 23159 ip2dbg(("ip_wput_local_options\n")); 23160 for (optval = ipoptp_first(&opts, ipha); 23161 optval != IPOPT_EOL; 23162 optval = ipoptp_next(&opts)) { 23163 opt = opts.ipoptp_cur; 23164 optlen = opts.ipoptp_len; 23165 ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0); 23166 switch (optval) { 23167 uint32_t off; 23168 case IPOPT_SSRR: 23169 case IPOPT_LSRR: 23170 off = opt[IPOPT_OFFSET]; 23171 off--; 23172 if (optlen < IP_ADDR_LEN || 23173 off > optlen - IP_ADDR_LEN) { 23174 /* End of source route */ 23175 break; 23176 } 23177 /* 23178 * This will only happen if two consecutive entries 23179 * in the source route contains our address or if 23180 * it is a packet with a loose source route which 23181 * reaches us before consuming the whole source route 23182 */ 23183 ip1dbg(("ip_wput_local_options: not end of SR\n")); 23184 if (optval == IPOPT_SSRR) { 23185 return; 23186 } 23187 /* 23188 * Hack: instead of dropping the packet truncate the 23189 * source route to what has been used by filling the 23190 * rest with IPOPT_NOP. 
23191 */ 23192 opt[IPOPT_OLEN] = (uint8_t)off; 23193 while (off < optlen) { 23194 opt[off++] = IPOPT_NOP; 23195 } 23196 break; 23197 case IPOPT_RR: 23198 off = opt[IPOPT_OFFSET]; 23199 off--; 23200 if (optlen < IP_ADDR_LEN || 23201 off > optlen - IP_ADDR_LEN) { 23202 /* No more room - ignore */ 23203 ip1dbg(( 23204 "ip_wput_forward_options: end of RR\n")); 23205 break; 23206 } 23207 dst = htonl(INADDR_LOOPBACK); 23208 bcopy(&dst, (char *)opt + off, IP_ADDR_LEN); 23209 opt[IPOPT_OFFSET] += IP_ADDR_LEN; 23210 break; 23211 case IPOPT_TS: 23212 /* Insert timestamp if there is romm */ 23213 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) { 23214 case IPOPT_TS_TSONLY: 23215 off = IPOPT_TS_TIMELEN; 23216 break; 23217 case IPOPT_TS_PRESPEC: 23218 case IPOPT_TS_PRESPEC_RFC791: 23219 /* Verify that the address matched */ 23220 off = opt[IPOPT_OFFSET] - 1; 23221 bcopy((char *)opt + off, &dst, IP_ADDR_LEN); 23222 ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, 23223 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE); 23224 if (ire == NULL) { 23225 /* Not for us */ 23226 break; 23227 } 23228 ire_refrele(ire); 23229 /* FALLTHRU */ 23230 case IPOPT_TS_TSANDADDR: 23231 off = IP_ADDR_LEN + IPOPT_TS_TIMELEN; 23232 break; 23233 default: 23234 /* 23235 * ip_*put_options should have already 23236 * dropped this packet. 
23237 */ 23238 cmn_err(CE_PANIC, "ip_wput_local_options: " 23239 "unknown IT - bug in ip_wput_options?\n"); 23240 return; /* Keep "lint" happy */ 23241 } 23242 if (opt[IPOPT_OFFSET] - 1 + off > optlen) { 23243 /* Increase overflow counter */ 23244 off = (opt[IPOPT_POS_OV_FLG] >> 4) + 1; 23245 opt[IPOPT_POS_OV_FLG] = (uint8_t) 23246 (opt[IPOPT_POS_OV_FLG] & 0x0F) | 23247 (off << 4); 23248 break; 23249 } 23250 off = opt[IPOPT_OFFSET] - 1; 23251 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) { 23252 case IPOPT_TS_PRESPEC: 23253 case IPOPT_TS_PRESPEC_RFC791: 23254 case IPOPT_TS_TSANDADDR: 23255 dst = htonl(INADDR_LOOPBACK); 23256 bcopy(&dst, (char *)opt + off, IP_ADDR_LEN); 23257 opt[IPOPT_OFFSET] += IP_ADDR_LEN; 23258 /* FALLTHRU */ 23259 case IPOPT_TS_TSONLY: 23260 off = opt[IPOPT_OFFSET] - 1; 23261 /* Compute # of milliseconds since midnight */ 23262 gethrestime(&now); 23263 ts = (now.tv_sec % (24 * 60 * 60)) * 1000 + 23264 now.tv_nsec / (NANOSEC / MILLISEC); 23265 bcopy(&ts, (char *)opt + off, IPOPT_TS_TIMELEN); 23266 opt[IPOPT_OFFSET] += IPOPT_TS_TIMELEN; 23267 break; 23268 } 23269 break; 23270 } 23271 } 23272 } 23273 23274 /* 23275 * Send out a multicast packet on interface ipif. 23276 * The sender does not have an conn. 23277 * Caller verifies that this isn't a PHYI_LOOPBACK. 23278 */ 23279 void 23280 ip_wput_multicast(queue_t *q, mblk_t *mp, ipif_t *ipif) 23281 { 23282 ipha_t *ipha; 23283 ire_t *ire; 23284 ipaddr_t dst; 23285 mblk_t *first_mp; 23286 23287 /* igmp_sendpkt always allocates a ipsec_out_t */ 23288 ASSERT(mp->b_datap->db_type == M_CTL); 23289 ASSERT(!ipif->ipif_isv6); 23290 ASSERT(!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_LOOPBACK)); 23291 23292 first_mp = mp; 23293 mp = first_mp->b_cont; 23294 ASSERT(mp->b_datap->db_type == M_DATA); 23295 ipha = (ipha_t *)mp->b_rptr; 23296 23297 /* 23298 * Find an IRE which matches the destination and the outgoing 23299 * queue (i.e. the outgoing interface.) 
23300 */ 23301 if (ipif->ipif_flags & IPIF_POINTOPOINT) 23302 dst = ipif->ipif_pp_dst_addr; 23303 else 23304 dst = ipha->ipha_dst; 23305 /* 23306 * The source address has already been initialized by the 23307 * caller and hence matching on ILL (MATCH_IRE_ILL) would 23308 * be sufficient rather than MATCH_IRE_IPIF. 23309 * 23310 * This function is used for sending IGMP packets. We need 23311 * to make sure that we send the packet out of the interface 23312 * (ipif->ipif_ill) where we joined the group. This is to 23313 * prevent from switches doing IGMP snooping to send us multicast 23314 * packets for a given group on the interface we have joined. 23315 * If we can't find an ire, igmp_sendpkt has already initialized 23316 * ipsec_out_attach_if so that this will not be load spread in 23317 * ip_newroute_ipif. 23318 */ 23319 ire = ire_ctable_lookup(dst, 0, 0, ipif, ALL_ZONES, NULL, 23320 MATCH_IRE_ILL); 23321 if (!ire) { 23322 /* 23323 * Mark this packet to make it be delivered to 23324 * ip_wput_ire after the new ire has been 23325 * created. 23326 */ 23327 mp->b_prev = NULL; 23328 mp->b_next = NULL; 23329 ip_newroute_ipif(q, first_mp, ipif, dst, NULL, RTF_SETSRC); 23330 return; 23331 } 23332 23333 /* 23334 * Honor the RTF_SETSRC flag; this is the only case 23335 * where we force this addr whatever the current src addr is, 23336 * because this address is set by igmp_sendpkt(), and 23337 * cannot be specified by any user. 23338 */ 23339 if (ire->ire_flags & RTF_SETSRC) { 23340 ipha->ipha_src = ire->ire_src_addr; 23341 } 23342 23343 ip_wput_ire(q, first_mp, ire, NULL, B_FALSE); 23344 } 23345 23346 /* 23347 * NOTE : This function does not ire_refrele the ire argument passed in. 23348 * 23349 * Copy the link layer header and do IPQoS if needed. Frees the mblk on 23350 * failure. The ire_fp_mp can vanish any time in the case of IRE_MIPRTUN 23351 * and IRE_BROADCAST due to DL_NOTE_FASTPATH_FLUSH. 
Hence we have to hold 23352 * the ire_lock to access the ire_fp_mp in this case. 23353 * IPQoS assumes that the first M_DATA contains the IP header. So, if we are 23354 * prepending a fastpath message IPQoS processing must precede it, we also set 23355 * the b_band of the fastpath message to that of the mblk returned by IPQoS 23356 * (IPQoS might have set the b_band for CoS marking). 23357 * However, if we are prepending DL_UNITDATA_REQ message, IPQoS processing 23358 * must follow it so that IPQoS can mark the dl_priority field for CoS 23359 * marking, if needed. 23360 */ 23361 static mblk_t * 23362 ip_wput_attach_llhdr(mblk_t *mp, ire_t *ire, ip_proc_t proc, uint32_t ill_index) 23363 { 23364 uint_t hlen; 23365 ipha_t *ipha; 23366 mblk_t *mp1; 23367 boolean_t qos_done = B_FALSE; 23368 uchar_t *ll_hdr; 23369 23370 #define rptr ((uchar_t *)ipha) 23371 23372 ipha = (ipha_t *)mp->b_rptr; 23373 hlen = 0; 23374 LOCK_IRE_FP_MP(ire); 23375 if ((mp1 = ire->ire_fp_mp) != NULL) { 23376 ASSERT(DB_TYPE(mp1) == M_DATA); 23377 /* Initiate IPPF processing */ 23378 if ((proc != 0) && IPP_ENABLED(proc)) { 23379 UNLOCK_IRE_FP_MP(ire); 23380 ip_process(proc, &mp, ill_index); 23381 if (mp == NULL) 23382 return (NULL); 23383 23384 ipha = (ipha_t *)mp->b_rptr; 23385 LOCK_IRE_FP_MP(ire); 23386 if ((mp1 = ire->ire_fp_mp) == NULL) { 23387 qos_done = B_TRUE; 23388 goto no_fp_mp; 23389 } 23390 ASSERT(DB_TYPE(mp1) == M_DATA); 23391 } 23392 hlen = MBLKL(mp1); 23393 /* 23394 * Check if we have enough room to prepend fastpath 23395 * header 23396 */ 23397 if (hlen != 0 && (rptr - mp->b_datap->db_base) >= hlen) { 23398 ll_hdr = rptr - hlen; 23399 bcopy(mp1->b_rptr, ll_hdr, hlen); 23400 /* XXX ipha is not aligned here */ 23401 ipha = (ipha_t *)(rptr - hlen); 23402 /* 23403 * Set the b_rptr to the start of the link layer 23404 * header 23405 */ 23406 mp->b_rptr = rptr; 23407 mp1 = mp; 23408 } else { 23409 mp1 = copyb(mp1); 23410 if (mp1 == NULL) 23411 goto unlock_err; 23412 mp1->b_band = 
mp->b_band; 23413 mp1->b_cont = mp; 23414 /* 23415 * certain system generated traffic may not 23416 * have cred/label in ip header block. This 23417 * is true even for a labeled system. But for 23418 * labeled traffic, inherit the label in the 23419 * new header. 23420 */ 23421 if (DB_CRED(mp) != NULL) 23422 mblk_setcred(mp1, DB_CRED(mp)); 23423 /* 23424 * XXX disable ICK_VALID and compute checksum 23425 * here; can happen if ire_fp_mp changes and 23426 * it can't be copied now due to insufficient 23427 * space. (unlikely, fp mp can change, but it 23428 * does not increase in length) 23429 */ 23430 } 23431 UNLOCK_IRE_FP_MP(ire); 23432 } else { 23433 no_fp_mp: 23434 mp1 = copyb(ire->ire_dlureq_mp); 23435 if (mp1 == NULL) { 23436 unlock_err: 23437 UNLOCK_IRE_FP_MP(ire); 23438 freemsg(mp); 23439 return (NULL); 23440 } 23441 UNLOCK_IRE_FP_MP(ire); 23442 mp1->b_cont = mp; 23443 /* 23444 * certain system generated traffic may not 23445 * have cred/label in ip header block. This 23446 * is true even for a labeled system. But for 23447 * labeled traffic, inherit the label in the 23448 * new header. 23449 */ 23450 if (DB_CRED(mp) != NULL) 23451 mblk_setcred(mp1, DB_CRED(mp)); 23452 if (!qos_done && (proc != 0) && IPP_ENABLED(proc)) { 23453 ip_process(proc, &mp1, ill_index); 23454 if (mp1 == NULL) 23455 return (NULL); 23456 } 23457 } 23458 return (mp1); 23459 #undef rptr 23460 } 23461 23462 /* 23463 * Finish the outbound IPsec processing for an IPv6 packet. This function 23464 * is called from ipsec_out_process() if the IPsec packet was processed 23465 * synchronously, or from {ah,esp}_kcf_callback() if it was processed 23466 * asynchronously. 
23467 */ 23468 void 23469 ip_wput_ipsec_out_v6(queue_t *q, mblk_t *ipsec_mp, ip6_t *ip6h, ill_t *ill, 23470 ire_t *ire_arg) 23471 { 23472 in6_addr_t *v6dstp; 23473 ire_t *ire; 23474 mblk_t *mp; 23475 uint_t ill_index; 23476 ipsec_out_t *io; 23477 boolean_t attach_if, hwaccel; 23478 uint32_t flags = IP6_NO_IPPOLICY; 23479 int match_flags; 23480 zoneid_t zoneid; 23481 boolean_t ill_need_rele = B_FALSE; 23482 boolean_t ire_need_rele = B_FALSE; 23483 23484 mp = ipsec_mp->b_cont; 23485 io = (ipsec_out_t *)ipsec_mp->b_rptr; 23486 ill_index = io->ipsec_out_ill_index; 23487 if (io->ipsec_out_reachable) { 23488 flags |= IPV6_REACHABILITY_CONFIRMATION; 23489 } 23490 attach_if = io->ipsec_out_attach_if; 23491 hwaccel = io->ipsec_out_accelerated; 23492 zoneid = io->ipsec_out_zoneid; 23493 ASSERT(zoneid != ALL_ZONES); 23494 match_flags = MATCH_IRE_ILL_GROUP | MATCH_IRE_SECATTR; 23495 /* Multicast addresses should have non-zero ill_index. */ 23496 v6dstp = &ip6h->ip6_dst; 23497 ASSERT(ip6h->ip6_nxt != IPPROTO_RAW); 23498 ASSERT(!IN6_IS_ADDR_MULTICAST(v6dstp) || ill_index != 0); 23499 ASSERT(!attach_if || ill_index != 0); 23500 if (ill_index != 0) { 23501 if (ill == NULL) { 23502 ill = ip_grab_attach_ill(NULL, ipsec_mp, ill_index, 23503 B_TRUE); 23504 23505 /* Failure case frees things for us. */ 23506 if (ill == NULL) 23507 return; 23508 23509 ill_need_rele = B_TRUE; 23510 } 23511 /* 23512 * If this packet needs to go out on a particular interface 23513 * honor it. 23514 */ 23515 if (attach_if) { 23516 match_flags = MATCH_IRE_ILL; 23517 23518 /* 23519 * Check if we need an ire that will not be 23520 * looked up by anybody else i.e. HIDDEN. 23521 */ 23522 if (ill_is_probeonly(ill)) { 23523 match_flags |= MATCH_IRE_MARK_HIDDEN; 23524 } 23525 } 23526 } 23527 ASSERT(mp != NULL); 23528 23529 if (IN6_IS_ADDR_MULTICAST(v6dstp)) { 23530 boolean_t unspec_src; 23531 ipif_t *ipif; 23532 23533 /* 23534 * Use the ill_index to get the right ill. 
23535 */ 23536 unspec_src = io->ipsec_out_unspec_src; 23537 (void) ipif_lookup_zoneid(ill, zoneid, 0, &ipif); 23538 if (ipif == NULL) { 23539 if (ill_need_rele) 23540 ill_refrele(ill); 23541 freemsg(ipsec_mp); 23542 return; 23543 } 23544 23545 if (ire_arg != NULL) { 23546 ire = ire_arg; 23547 } else { 23548 ire = ire_ctable_lookup_v6(v6dstp, 0, 0, ipif, 23549 zoneid, MBLK_GETLABEL(mp), match_flags); 23550 ire_need_rele = B_TRUE; 23551 } 23552 if (ire != NULL) { 23553 ipif_refrele(ipif); 23554 /* 23555 * XXX Do the multicast forwarding now, as the IPSEC 23556 * processing has been done. 23557 */ 23558 goto send; 23559 } 23560 23561 ip0dbg(("ip_wput_ipsec_out_v6: multicast: IRE disappeared\n")); 23562 mp->b_prev = NULL; 23563 mp->b_next = NULL; 23564 23565 /* 23566 * If the IPsec packet was processed asynchronously, 23567 * drop it now. 23568 */ 23569 if (q == NULL) { 23570 if (ill_need_rele) 23571 ill_refrele(ill); 23572 freemsg(ipsec_mp); 23573 return; 23574 } 23575 23576 ip_newroute_ipif_v6(q, ipsec_mp, ipif, *v6dstp, 23577 unspec_src, zoneid); 23578 ipif_refrele(ipif); 23579 } else { 23580 if (attach_if) { 23581 ipif_t *ipif; 23582 23583 ipif = ipif_get_next_ipif(NULL, ill); 23584 if (ipif == NULL) { 23585 if (ill_need_rele) 23586 ill_refrele(ill); 23587 freemsg(ipsec_mp); 23588 return; 23589 } 23590 ire = ire_ctable_lookup_v6(v6dstp, 0, 0, ipif, 23591 zoneid, MBLK_GETLABEL(mp), match_flags); 23592 ire_need_rele = B_TRUE; 23593 ipif_refrele(ipif); 23594 } else { 23595 if (ire_arg != NULL) { 23596 ire = ire_arg; 23597 } else { 23598 ire = ire_cache_lookup_v6(v6dstp, zoneid, NULL); 23599 ire_need_rele = B_TRUE; 23600 } 23601 } 23602 if (ire != NULL) 23603 goto send; 23604 /* 23605 * ire disappeared underneath. 23606 * 23607 * What we need to do here is the ip_newroute 23608 * logic to get the ire without doing the IPSEC 23609 * processing. Follow the same old path. 
But this 23610 * time, ip_wput or ire_add_then_send will call us 23611 * directly as all the IPSEC operations are done. 23612 */ 23613 ip1dbg(("ip_wput_ipsec_out_v6: IRE disappeared\n")); 23614 mp->b_prev = NULL; 23615 mp->b_next = NULL; 23616 23617 /* 23618 * If the IPsec packet was processed asynchronously, 23619 * drop it now. 23620 */ 23621 if (q == NULL) { 23622 if (ill_need_rele) 23623 ill_refrele(ill); 23624 freemsg(ipsec_mp); 23625 return; 23626 } 23627 23628 ip_newroute_v6(q, ipsec_mp, v6dstp, &ip6h->ip6_src, ill, 23629 zoneid); 23630 } 23631 if (ill != NULL && ill_need_rele) 23632 ill_refrele(ill); 23633 return; 23634 send: 23635 if (ill != NULL && ill_need_rele) 23636 ill_refrele(ill); 23637 23638 /* Local delivery */ 23639 if (ire->ire_stq == NULL) { 23640 ASSERT(q != NULL); 23641 ip_wput_local_v6(RD(q), ire->ire_ipif->ipif_ill, ip6h, ipsec_mp, 23642 ire, 0); 23643 if (ire_need_rele) 23644 ire_refrele(ire); 23645 return; 23646 } 23647 /* 23648 * Everything is done. Send it out on the wire. 23649 * We force the insertion of a fragment header using the 23650 * IPH_FRAG_HDR flag in two cases: 23651 * - after reception of an ICMPv6 "packet too big" message 23652 * with a MTU < 1280 (cf. RFC 2460 section 5) 23653 * - for multirouted IPv6 packets, so that the receiver can 23654 * discard duplicates according to their fragment identifier 23655 */ 23656 /* XXX fix flow control problems. */ 23657 if (ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN > ire->ire_max_frag || 23658 (ire->ire_frag_flag & IPH_FRAG_HDR)) { 23659 if (hwaccel) { 23660 /* 23661 * hardware acceleration does not handle these 23662 * "slow path" cases. 23663 */ 23664 /* IPsec KSTATS: should bump bean counter here. */ 23665 if (ire_need_rele) 23666 ire_refrele(ire); 23667 freemsg(ipsec_mp); 23668 return; 23669 } 23670 if (ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN != 23671 (mp->b_cont ? msgdsize(mp) : 23672 mp->b_wptr - (uchar_t *)ip6h)) { 23673 /* IPsec KSTATS: should bump bean counter here. 
*/ 23674 ip0dbg(("Packet length mismatch: %d, %ld\n", 23675 ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN, 23676 msgdsize(mp))); 23677 if (ire_need_rele) 23678 ire_refrele(ire); 23679 freemsg(ipsec_mp); 23680 return; 23681 } 23682 ASSERT(mp->b_prev == NULL); 23683 ip2dbg(("Fragmenting Size = %d, mtu = %d\n", 23684 ntohs(ip6h->ip6_plen) + 23685 IPV6_HDR_LEN, ire->ire_max_frag)); 23686 ip_wput_frag_v6(mp, ire, flags, NULL, B_FALSE, 23687 ire->ire_max_frag); 23688 } else { 23689 UPDATE_OB_PKT_COUNT(ire); 23690 ire->ire_last_used_time = lbolt; 23691 ip_xmit_v6(mp, ire, flags, NULL, B_FALSE, hwaccel ? io : NULL); 23692 } 23693 if (ire_need_rele) 23694 ire_refrele(ire); 23695 freeb(ipsec_mp); 23696 } 23697 23698 void 23699 ipsec_hw_putnext(queue_t *q, mblk_t *mp) 23700 { 23701 mblk_t *hada_mp; /* attributes M_CTL mblk */ 23702 da_ipsec_t *hada; /* data attributes */ 23703 ill_t *ill = (ill_t *)q->q_ptr; 23704 23705 IPSECHW_DEBUG(IPSECHW_PKT, ("ipsec_hw_putnext: accelerated packet\n")); 23706 23707 if ((ill->ill_capabilities & (ILL_CAPAB_AH | ILL_CAPAB_ESP)) == 0) { 23708 /* IPsec KSTATS: Bump lose counter here! */ 23709 freemsg(mp); 23710 return; 23711 } 23712 23713 /* 23714 * It's an IPsec packet that must be 23715 * accelerated by the Provider, and the 23716 * outbound ill is IPsec acceleration capable. 23717 * Prepends the mblk with an IPHADA_M_CTL, and ship it 23718 * to the ill. 23719 * IPsec KSTATS: should bump packet counter here. 23720 */ 23721 23722 hada_mp = allocb(sizeof (da_ipsec_t), BPRI_HI); 23723 if (hada_mp == NULL) { 23724 /* IPsec KSTATS: should bump packet counter here. 
*/ 23725 freemsg(mp); 23726 return; 23727 } 23728 23729 hada_mp->b_datap->db_type = M_CTL; 23730 hada_mp->b_wptr = hada_mp->b_rptr + sizeof (*hada); 23731 hada_mp->b_cont = mp; 23732 23733 hada = (da_ipsec_t *)hada_mp->b_rptr; 23734 bzero(hada, sizeof (da_ipsec_t)); 23735 hada->da_type = IPHADA_M_CTL; 23736 23737 putnext(q, hada_mp); 23738 } 23739 23740 /* 23741 * Finish the outbound IPsec processing. This function is called from 23742 * ipsec_out_process() if the IPsec packet was processed 23743 * synchronously, or from {ah,esp}_kcf_callback() if it was processed 23744 * asynchronously. 23745 */ 23746 void 23747 ip_wput_ipsec_out(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha, ill_t *ill, 23748 ire_t *ire_arg) 23749 { 23750 uint32_t v_hlen_tos_len; 23751 ipaddr_t dst; 23752 ipif_t *ipif = NULL; 23753 ire_t *ire; 23754 ire_t *ire1 = NULL; 23755 mblk_t *next_mp = NULL; 23756 uint32_t max_frag; 23757 boolean_t multirt_send = B_FALSE; 23758 mblk_t *mp; 23759 mblk_t *mp1; 23760 uint_t ill_index; 23761 ipsec_out_t *io; 23762 boolean_t attach_if; 23763 int match_flags, offset; 23764 irb_t *irb = NULL; 23765 boolean_t ill_need_rele = B_FALSE, ire_need_rele = B_TRUE; 23766 zoneid_t zoneid; 23767 uint32_t cksum; 23768 uint16_t *up; 23769 #ifdef _BIG_ENDIAN 23770 #define LENGTH (v_hlen_tos_len & 0xFFFF) 23771 #else 23772 #define LENGTH ((v_hlen_tos_len >> 24) | ((v_hlen_tos_len >> 8) & 0xFF00)) 23773 #endif 23774 23775 mp = ipsec_mp->b_cont; 23776 ASSERT(mp != NULL); 23777 v_hlen_tos_len = ((uint32_t *)ipha)[0]; 23778 dst = ipha->ipha_dst; 23779 23780 io = (ipsec_out_t *)ipsec_mp->b_rptr; 23781 ill_index = io->ipsec_out_ill_index; 23782 attach_if = io->ipsec_out_attach_if; 23783 zoneid = io->ipsec_out_zoneid; 23784 ASSERT(zoneid != ALL_ZONES); 23785 match_flags = MATCH_IRE_ILL_GROUP | MATCH_IRE_SECATTR; 23786 if (ill_index != 0) { 23787 if (ill == NULL) { 23788 ill = ip_grab_attach_ill(NULL, ipsec_mp, 23789 ill_index, B_FALSE); 23790 23791 /* Failure case frees things for us. 
*/ 23792 if (ill == NULL) 23793 return; 23794 23795 ill_need_rele = B_TRUE; 23796 } 23797 /* 23798 * If this packet needs to go out on a particular interface 23799 * honor it. 23800 */ 23801 if (attach_if) { 23802 match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR; 23803 23804 /* 23805 * Check if we need an ire that will not be 23806 * looked up by anybody else i.e. HIDDEN. 23807 */ 23808 if (ill_is_probeonly(ill)) { 23809 match_flags |= MATCH_IRE_MARK_HIDDEN; 23810 } 23811 } 23812 } 23813 23814 if (CLASSD(dst)) { 23815 boolean_t conn_dontroute; 23816 /* 23817 * Use the ill_index to get the right ipif. 23818 */ 23819 conn_dontroute = io->ipsec_out_dontroute; 23820 if (ill_index == 0) 23821 ipif = ipif_lookup_group(dst, zoneid); 23822 else 23823 (void) ipif_lookup_zoneid(ill, zoneid, 0, &ipif); 23824 if (ipif == NULL) { 23825 ip1dbg(("ip_wput_ipsec_out: No ipif for" 23826 " multicast\n")); 23827 BUMP_MIB(&ip_mib, ipOutNoRoutes); 23828 freemsg(ipsec_mp); 23829 goto done; 23830 } 23831 /* 23832 * ipha_src has already been intialized with the 23833 * value of the ipif in ip_wput. All we need now is 23834 * an ire to send this downstream. 23835 */ 23836 ire = ire_ctable_lookup(dst, 0, 0, ipif, zoneid, 23837 MBLK_GETLABEL(mp), match_flags); 23838 if (ire != NULL) { 23839 ill_t *ill1; 23840 /* 23841 * Do the multicast forwarding now, as the IPSEC 23842 * processing has been done. 23843 */ 23844 if (ip_g_mrouter && !conn_dontroute && 23845 (ill1 = ire_to_ill(ire))) { 23846 if (ip_mforward(ill1, ipha, mp)) { 23847 freemsg(ipsec_mp); 23848 ip1dbg(("ip_wput_ipsec_out: mforward " 23849 "failed\n")); 23850 ire_refrele(ire); 23851 goto done; 23852 } 23853 } 23854 goto send; 23855 } 23856 23857 ip0dbg(("ip_wput_ipsec_out: multicast: IRE disappeared\n")); 23858 mp->b_prev = NULL; 23859 mp->b_next = NULL; 23860 23861 /* 23862 * If the IPsec packet was processed asynchronously, 23863 * drop it now. 
23864 */ 23865 if (q == NULL) { 23866 freemsg(ipsec_mp); 23867 goto done; 23868 } 23869 23870 /* 23871 * We may be using a wrong ipif to create the ire. 23872 * But it is okay as the source address is assigned 23873 * for the packet already. Next outbound packet would 23874 * create the IRE with the right IPIF in ip_wput. 23875 * 23876 * Also handle RTF_MULTIRT routes. 23877 */ 23878 ip_newroute_ipif(q, ipsec_mp, ipif, dst, NULL, RTF_MULTIRT); 23879 } else { 23880 if (attach_if) { 23881 ire = ire_ctable_lookup(dst, 0, 0, ill->ill_ipif, 23882 zoneid, MBLK_GETLABEL(mp), match_flags); 23883 } else { 23884 if (ire_arg != NULL) { 23885 ire = ire_arg; 23886 ire_need_rele = B_FALSE; 23887 } else { 23888 ire = ire_cache_lookup(dst, zoneid, 23889 MBLK_GETLABEL(mp)); 23890 } 23891 } 23892 if (ire != NULL) { 23893 goto send; 23894 } 23895 23896 /* 23897 * ire disappeared underneath. 23898 * 23899 * What we need to do here is the ip_newroute 23900 * logic to get the ire without doing the IPSEC 23901 * processing. Follow the same old path. But this 23902 * time, ip_wput or ire_add_then_put will call us 23903 * directly as all the IPSEC operations are done. 23904 */ 23905 ip1dbg(("ip_wput_ipsec_out: IRE disappeared\n")); 23906 mp->b_prev = NULL; 23907 mp->b_next = NULL; 23908 23909 /* 23910 * If the IPsec packet was processed asynchronously, 23911 * drop it now. 23912 */ 23913 if (q == NULL) { 23914 freemsg(ipsec_mp); 23915 goto done; 23916 } 23917 23918 /* 23919 * Since we're going through ip_newroute() again, we 23920 * need to make sure we don't: 23921 * 23922 * 1.) Trigger the ASSERT() with the ipha_ident 23923 * overloading. 23924 * 2.) Redo transport-layer checksumming, since we've 23925 * already done all that to get this far. 23926 * 23927 * The easiest way not do either of the above is to set 23928 * the ipha_ident field to IP_HDR_INCLUDED. 23929 */ 23930 ipha->ipha_ident = IP_HDR_INCLUDED; 23931 ip_newroute(q, ipsec_mp, dst, NULL, 23932 (CONN_Q(q) ? 
Q_TO_CONN(q) : NULL)); 23933 } 23934 goto done; 23935 send: 23936 if (ipha->ipha_protocol == IPPROTO_UDP && udp_compute_checksum()) { 23937 /* 23938 * ESP NAT-Traversal packet. 23939 * 23940 * Just do software checksum for now. 23941 */ 23942 23943 offset = IP_SIMPLE_HDR_LENGTH + UDP_CHECKSUM_OFFSET; 23944 IP_STAT(ip_out_sw_cksum); 23945 IP_STAT_UPDATE(ip_udp_out_sw_cksum_bytes, 23946 ntohs(htons(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH)); 23947 #define iphs ((uint16_t *)ipha) 23948 cksum = IP_UDP_CSUM_COMP + iphs[6] + iphs[7] + iphs[8] + 23949 iphs[9] + ntohs(htons(ipha->ipha_length) - 23950 IP_SIMPLE_HDR_LENGTH); 23951 #undef iphs 23952 if ((cksum = IP_CSUM(mp, IP_SIMPLE_HDR_LENGTH, cksum)) == 0) 23953 cksum = 0xFFFF; 23954 for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) 23955 if (mp1->b_wptr - mp1->b_rptr >= 23956 offset + sizeof (uint16_t)) { 23957 up = (uint16_t *)(mp1->b_rptr + offset); 23958 *up = cksum; 23959 break; /* out of for loop */ 23960 } else { 23961 offset -= (mp->b_wptr - mp->b_rptr); 23962 } 23963 } /* Otherwise, just keep the all-zero checksum. */ 23964 23965 if (ire->ire_stq == NULL) { 23966 /* 23967 * Loopbacks go through ip_wput_local except for one case. 23968 * We come here if we generate a icmp_frag_needed message 23969 * after IPSEC processing is over. When this function calls 23970 * ip_wput_ire_fragmentit, ip_wput_frag might end up calling 23971 * icmp_frag_needed. The message generated comes back here 23972 * through icmp_frag_needed -> icmp_pkt -> ip_wput -> 23973 * ipsec_out_process -> ip_wput_ipsec_out. We need to set the 23974 * source address as it is usually set in ip_wput_ire. As 23975 * ipsec_out_proc_begin is set, ip_wput calls ipsec_out_process 23976 * and we end up here. We can't enter ip_wput_ire once the 23977 * IPSEC processing is over and hence we need to do it here. 
23978 */ 23979 ASSERT(q != NULL); 23980 UPDATE_OB_PKT_COUNT(ire); 23981 ire->ire_last_used_time = lbolt; 23982 if (ipha->ipha_src == 0) 23983 ipha->ipha_src = ire->ire_src_addr; 23984 ip_wput_local(RD(q), ire->ire_ipif->ipif_ill, ipha, ipsec_mp, 23985 ire, 0, zoneid); 23986 if (ire_need_rele) 23987 ire_refrele(ire); 23988 goto done; 23989 } 23990 23991 if (ire->ire_max_frag < (unsigned int)LENGTH) { 23992 /* 23993 * We are through with IPSEC processing. 23994 * Fragment this and send it on the wire. 23995 */ 23996 if (io->ipsec_out_accelerated) { 23997 /* 23998 * The packet has been accelerated but must 23999 * be fragmented. This should not happen 24000 * since AH and ESP must not accelerate 24001 * packets that need fragmentation, however 24002 * the configuration could have changed 24003 * since the AH or ESP processing. 24004 * Drop packet. 24005 * IPsec KSTATS: bump bean counter here. 24006 */ 24007 IPSECHW_DEBUG(IPSECHW_PKT, ("ipsec_wput_ipsec_out: " 24008 "fragmented accelerated packet!\n")); 24009 freemsg(ipsec_mp); 24010 } else { 24011 ip_wput_ire_fragmentit(ipsec_mp, ire); 24012 } 24013 if (ire_need_rele) 24014 ire_refrele(ire); 24015 goto done; 24016 } 24017 24018 ip2dbg(("ip_wput_ipsec_out: ipsec_mp %p, ire %p, ire_ipif %p, " 24019 "ipif %p\n", (void *)ipsec_mp, (void *)ire, 24020 (void *)ire->ire_ipif, (void *)ipif)); 24021 24022 /* 24023 * Multiroute the secured packet, unless IPsec really 24024 * requires the packet to go out only through a particular 24025 * interface. 24026 */ 24027 if ((ire->ire_flags & RTF_MULTIRT) && !attach_if) { 24028 ire_t *first_ire; 24029 irb = ire->ire_bucket; 24030 ASSERT(irb != NULL); 24031 /* 24032 * This ire has been looked up as the one that 24033 * goes through the given ipif; 24034 * make sure we do not omit any other multiroute ire 24035 * that may be present in the bucket before this one. 
24036 */ 24037 IRB_REFHOLD(irb); 24038 for (first_ire = irb->irb_ire; 24039 first_ire != NULL; 24040 first_ire = first_ire->ire_next) { 24041 if ((first_ire->ire_flags & RTF_MULTIRT) && 24042 (first_ire->ire_addr == ire->ire_addr) && 24043 !(first_ire->ire_marks & 24044 (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN))) 24045 break; 24046 } 24047 24048 if ((first_ire != NULL) && (first_ire != ire)) { 24049 /* 24050 * Don't change the ire if the packet must 24051 * be fragmented if sent via this new one. 24052 */ 24053 if (first_ire->ire_max_frag >= (unsigned int)LENGTH) { 24054 IRE_REFHOLD(first_ire); 24055 if (ire_need_rele) 24056 ire_refrele(ire); 24057 else 24058 ire_need_rele = B_TRUE; 24059 ire = first_ire; 24060 } 24061 } 24062 IRB_REFRELE(irb); 24063 24064 multirt_send = B_TRUE; 24065 max_frag = ire->ire_max_frag; 24066 } else { 24067 if ((ire->ire_flags & RTF_MULTIRT) && attach_if) { 24068 ip1dbg(("ip_wput_ipsec_out: ignoring multirouting " 24069 "flag, attach_if %d\n", attach_if)); 24070 } 24071 } 24072 24073 /* 24074 * In most cases, the emission loop below is entered only once. 24075 * Only in the case where the ire holds the RTF_MULTIRT 24076 * flag, we loop to process all RTF_MULTIRT ires in the 24077 * bucket, and send the packet through all crossed 24078 * RTF_MULTIRT routes. 24079 */ 24080 do { 24081 if (multirt_send) { 24082 /* 24083 * ire1 holds here the next ire to process in the 24084 * bucket. If multirouting is expected, 24085 * any non-RTF_MULTIRT ire that has the 24086 * right destination address is ignored. 
24087 */ 24088 ASSERT(irb != NULL); 24089 IRB_REFHOLD(irb); 24090 for (ire1 = ire->ire_next; 24091 ire1 != NULL; 24092 ire1 = ire1->ire_next) { 24093 if ((ire1->ire_flags & RTF_MULTIRT) == 0) 24094 continue; 24095 if (ire1->ire_addr != ire->ire_addr) 24096 continue; 24097 if (ire1->ire_marks & 24098 (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN)) 24099 continue; 24100 /* No loopback here */ 24101 if (ire1->ire_stq == NULL) 24102 continue; 24103 /* 24104 * Ensure we do not exceed the MTU 24105 * of the next route. 24106 */ 24107 if (ire1->ire_max_frag < (unsigned int)LENGTH) { 24108 ip_multirt_bad_mtu(ire1, max_frag); 24109 continue; 24110 } 24111 24112 IRE_REFHOLD(ire1); 24113 break; 24114 } 24115 IRB_REFRELE(irb); 24116 if (ire1 != NULL) { 24117 /* 24118 * We are in a multiple send case, need to 24119 * make a copy of the packet. 24120 */ 24121 next_mp = copymsg(ipsec_mp); 24122 if (next_mp == NULL) { 24123 ire_refrele(ire1); 24124 ire1 = NULL; 24125 } 24126 } 24127 } 24128 24129 /* Everything is done. Send it out on the wire */ 24130 mp1 = ip_wput_attach_llhdr(mp, ire, 0, 0); 24131 if (mp1 == NULL) { 24132 BUMP_MIB(&ip_mib, ipOutDiscards); 24133 freemsg(ipsec_mp); 24134 if (ire_need_rele) 24135 ire_refrele(ire); 24136 if (ire1 != NULL) { 24137 ire_refrele(ire1); 24138 freemsg(next_mp); 24139 } 24140 goto done; 24141 } 24142 UPDATE_OB_PKT_COUNT(ire); 24143 ire->ire_last_used_time = lbolt; 24144 if (!io->ipsec_out_accelerated) { 24145 putnext(ire->ire_stq, mp1); 24146 } else { 24147 /* 24148 * Safety Pup says: make sure this is going to 24149 * the right interface! 
24150 */ 24151 ill_t *ill1 = (ill_t *)ire->ire_stq->q_ptr; 24152 int ifindex = ill1->ill_phyint->phyint_ifindex; 24153 24154 if (ifindex != io->ipsec_out_capab_ill_index) { 24155 /* IPsec kstats: bump lose counter */ 24156 freemsg(mp1); 24157 } else { 24158 ipsec_hw_putnext(ire->ire_stq, mp1); 24159 } 24160 } 24161 24162 freeb(ipsec_mp); 24163 if (ire_need_rele) 24164 ire_refrele(ire); 24165 24166 if (ire1 != NULL) { 24167 ire = ire1; 24168 ire_need_rele = B_TRUE; 24169 ASSERT(next_mp); 24170 ipsec_mp = next_mp; 24171 mp = ipsec_mp->b_cont; 24172 ire1 = NULL; 24173 next_mp = NULL; 24174 io = (ipsec_out_t *)ipsec_mp->b_rptr; 24175 } else { 24176 multirt_send = B_FALSE; 24177 } 24178 } while (multirt_send); 24179 done: 24180 if (ill != NULL && ill_need_rele) 24181 ill_refrele(ill); 24182 if (ipif != NULL) 24183 ipif_refrele(ipif); 24184 } 24185 24186 /* 24187 * Get the ill corresponding to the specified ire, and compare its 24188 * capabilities with the protocol and algorithms specified by the 24189 * the SA obtained from ipsec_out. If they match, annotate the 24190 * ipsec_out structure to indicate that the packet needs acceleration. 24191 * 24192 * 24193 * A packet is eligible for outbound hardware acceleration if the 24194 * following conditions are satisfied: 24195 * 24196 * 1. the packet will not be fragmented 24197 * 2. the provider supports the algorithm 24198 * 3. there is no pending control message being exchanged 24199 * 4. snoop is not attached 24200 * 5. the destination address is not a broadcast or multicast address. 24201 * 24202 * Rationale: 24203 * - Hardware drivers do not support fragmentation with 24204 * the current interface. 24205 * - snoop, multicast, and broadcast may result in exposure of 24206 * a cleartext datagram. 24207 * We check all five of these conditions here. 
24208 * 24209 * XXX would like to nuke "ire_t *" parameter here; problem is that 24210 * IRE is only way to figure out if a v4 address is a broadcast and 24211 * thus ineligible for acceleration... 24212 */ 24213 static void 24214 ipsec_out_is_accelerated(mblk_t *ipsec_mp, ipsa_t *sa, ill_t *ill, ire_t *ire) 24215 { 24216 ipsec_out_t *io; 24217 mblk_t *data_mp; 24218 uint_t plen, overhead; 24219 24220 if ((sa->ipsa_flags & IPSA_F_HW) == 0) 24221 return; 24222 24223 if (ill == NULL) 24224 return; 24225 24226 /* 24227 * Destination address is a broadcast or multicast. Punt. 24228 */ 24229 if ((ire != NULL) && (ire->ire_type & (IRE_BROADCAST|IRE_LOOPBACK| 24230 IRE_LOCAL))) 24231 return; 24232 24233 data_mp = ipsec_mp->b_cont; 24234 24235 if (ill->ill_isv6) { 24236 ip6_t *ip6h = (ip6_t *)data_mp->b_rptr; 24237 24238 if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) 24239 return; 24240 24241 plen = ip6h->ip6_plen; 24242 } else { 24243 ipha_t *ipha = (ipha_t *)data_mp->b_rptr; 24244 24245 if (CLASSD(ipha->ipha_dst)) 24246 return; 24247 24248 plen = ipha->ipha_length; 24249 } 24250 /* 24251 * Is there a pending DLPI control message being exchanged 24252 * between IP/IPsec and the DLS Provider? If there is, it 24253 * could be a SADB update, and the state of the DLS Provider 24254 * SADB might not be in sync with the SADB maintained by 24255 * IPsec. To avoid dropping packets or using the wrong keying 24256 * material, we do not accelerate this packet. 24257 */ 24258 if (ill->ill_dlpi_pending != DL_PRIM_INVAL) { 24259 IPSECHW_DEBUG(IPSECHW_PKT, ("ipsec_out_check_is_accelerated: " 24260 "ill_dlpi_pending! don't accelerate packet\n")); 24261 return; 24262 } 24263 24264 /* 24265 * Is the Provider in promiscous mode? If it does, we don't 24266 * accelerate the packet since it will bounce back up to the 24267 * listeners in the clear. 
24268 */ 24269 if (ill->ill_promisc_on_phys) { 24270 IPSECHW_DEBUG(IPSECHW_PKT, ("ipsec_out_check_is_accelerated: " 24271 "ill in promiscous mode, don't accelerate packet\n")); 24272 return; 24273 } 24274 24275 /* 24276 * Will the packet require fragmentation? 24277 */ 24278 24279 /* 24280 * IPsec ESP note: this is a pessimistic estimate, but the same 24281 * as is used elsewhere. 24282 * SPI + sequence + MAC + IV(blocksize) + padding(blocksize-1) 24283 * + 2-byte trailer 24284 */ 24285 overhead = (sa->ipsa_type == SADB_SATYPE_AH) ? IPSEC_MAX_AH_HDR_SIZE : 24286 IPSEC_BASE_ESP_HDR_SIZE(sa); 24287 24288 if ((plen + overhead) > ill->ill_max_mtu) 24289 return; 24290 24291 io = (ipsec_out_t *)ipsec_mp->b_rptr; 24292 24293 /* 24294 * Can the ill accelerate this IPsec protocol and algorithm 24295 * specified by the SA? 24296 */ 24297 if (!ipsec_capab_match(ill, io->ipsec_out_capab_ill_index, 24298 ill->ill_isv6, sa)) { 24299 return; 24300 } 24301 24302 /* 24303 * Tell AH or ESP that the outbound ill is capable of 24304 * accelerating this packet. 24305 */ 24306 io->ipsec_out_is_capab_ill = B_TRUE; 24307 } 24308 24309 /* 24310 * Select which AH & ESP SA's to use (if any) for the outbound packet. 24311 * 24312 * If this function returns B_TRUE, the requested SA's have been filled 24313 * into the ipsec_out_*_sa pointers. 24314 * 24315 * If the function returns B_FALSE, the packet has been "consumed", most 24316 * likely by an ACQUIRE sent up via PF_KEY to a key management daemon. 24317 * 24318 * The SA references created by the protocol-specific "select" 24319 * function will be released when the ipsec_mp is freed, thanks to the 24320 * ipsec_out_free destructor -- see spd.c. 
24321 */ 24322 static boolean_t 24323 ipsec_out_select_sa(mblk_t *ipsec_mp) 24324 { 24325 boolean_t need_ah_acquire = B_FALSE, need_esp_acquire = B_FALSE; 24326 ipsec_out_t *io; 24327 ipsec_policy_t *pp; 24328 ipsec_action_t *ap; 24329 io = (ipsec_out_t *)ipsec_mp->b_rptr; 24330 ASSERT(io->ipsec_out_type == IPSEC_OUT); 24331 ASSERT(io->ipsec_out_len == sizeof (ipsec_out_t)); 24332 24333 if (!io->ipsec_out_secure) { 24334 /* 24335 * We came here by mistake. 24336 * Don't bother with ipsec processing 24337 * We should "discourage" this path in the future. 24338 */ 24339 ASSERT(io->ipsec_out_proc_begin == B_FALSE); 24340 return (B_FALSE); 24341 } 24342 ASSERT(io->ipsec_out_need_policy == B_FALSE); 24343 ASSERT((io->ipsec_out_policy != NULL) || 24344 (io->ipsec_out_act != NULL)); 24345 24346 ASSERT(io->ipsec_out_failed == B_FALSE); 24347 24348 /* 24349 * IPSEC processing has started. 24350 */ 24351 io->ipsec_out_proc_begin = B_TRUE; 24352 ap = io->ipsec_out_act; 24353 if (ap == NULL) { 24354 pp = io->ipsec_out_policy; 24355 ASSERT(pp != NULL); 24356 ap = pp->ipsp_act; 24357 ASSERT(ap != NULL); 24358 } 24359 24360 /* 24361 * We have an action. now, let's select SA's. 24362 * (In the future, we can cache this in the conn_t..) 24363 */ 24364 if (ap->ipa_want_esp) { 24365 if (io->ipsec_out_esp_sa == NULL) { 24366 need_esp_acquire = !ipsec_outbound_sa(ipsec_mp, 24367 IPPROTO_ESP); 24368 } 24369 ASSERT(need_esp_acquire || io->ipsec_out_esp_sa != NULL); 24370 } 24371 24372 if (ap->ipa_want_ah) { 24373 if (io->ipsec_out_ah_sa == NULL) { 24374 need_ah_acquire = !ipsec_outbound_sa(ipsec_mp, 24375 IPPROTO_AH); 24376 } 24377 ASSERT(need_ah_acquire || io->ipsec_out_ah_sa != NULL); 24378 /* 24379 * The ESP and AH processing order needs to be preserved 24380 * when both protocols are required (ESP should be applied 24381 * before AH for an outbound packet). Force an ESP ACQUIRE 24382 * when both ESP and AH are required, and an AH ACQUIRE 24383 * is needed. 
24384 */ 24385 if (ap->ipa_want_esp && need_ah_acquire) 24386 need_esp_acquire = B_TRUE; 24387 } 24388 24389 /* 24390 * Send an ACQUIRE (extended, regular, or both) if we need one. 24391 * Release SAs that got referenced, but will not be used until we 24392 * acquire _all_ of the SAs we need. 24393 */ 24394 if (need_ah_acquire || need_esp_acquire) { 24395 if (io->ipsec_out_ah_sa != NULL) { 24396 IPSA_REFRELE(io->ipsec_out_ah_sa); 24397 io->ipsec_out_ah_sa = NULL; 24398 } 24399 if (io->ipsec_out_esp_sa != NULL) { 24400 IPSA_REFRELE(io->ipsec_out_esp_sa); 24401 io->ipsec_out_esp_sa = NULL; 24402 } 24403 24404 sadb_acquire(ipsec_mp, io, need_ah_acquire, need_esp_acquire); 24405 return (B_FALSE); 24406 } 24407 24408 return (B_TRUE); 24409 } 24410 24411 /* 24412 * Process an IPSEC_OUT message and see what you can 24413 * do with it. 24414 * IPQoS Notes: 24415 * We do IPPF processing if IPP_LOCAL_OUT is enabled before processing for 24416 * IPSec. 24417 * XXX would like to nuke ire_t. 24418 * XXX ill_index better be "real" 24419 */ 24420 void 24421 ipsec_out_process(queue_t *q, mblk_t *ipsec_mp, ire_t *ire, uint_t ill_index) 24422 { 24423 ipsec_out_t *io; 24424 ipsec_policy_t *pp; 24425 ipsec_action_t *ap; 24426 ipha_t *ipha; 24427 ip6_t *ip6h; 24428 mblk_t *mp; 24429 ill_t *ill; 24430 zoneid_t zoneid; 24431 ipsec_status_t ipsec_rc; 24432 boolean_t ill_need_rele = B_FALSE; 24433 24434 io = (ipsec_out_t *)ipsec_mp->b_rptr; 24435 ASSERT(io->ipsec_out_type == IPSEC_OUT); 24436 ASSERT(io->ipsec_out_len == sizeof (ipsec_out_t)); 24437 mp = ipsec_mp->b_cont; 24438 24439 /* 24440 * Initiate IPPF processing. We do it here to account for packets 24441 * coming here that don't have any policy (i.e. !io->ipsec_out_secure). 24442 * We can check for ipsec_out_proc_begin even for such packets, as 24443 * they will always be false (asserted below). 
24444 */ 24445 if (IPP_ENABLED(IPP_LOCAL_OUT) && !io->ipsec_out_proc_begin) { 24446 ip_process(IPP_LOCAL_OUT, &mp, io->ipsec_out_ill_index != 0 ? 24447 io->ipsec_out_ill_index : ill_index); 24448 if (mp == NULL) { 24449 ip2dbg(("ipsec_out_process: packet dropped "\ 24450 "during IPPF processing\n")); 24451 freeb(ipsec_mp); 24452 BUMP_MIB(&ip_mib, ipOutDiscards); 24453 return; 24454 } 24455 } 24456 24457 if (!io->ipsec_out_secure) { 24458 /* 24459 * We came here by mistake. 24460 * Don't bother with ipsec processing 24461 * Should "discourage" this path in the future. 24462 */ 24463 ASSERT(io->ipsec_out_proc_begin == B_FALSE); 24464 goto done; 24465 } 24466 ASSERT(io->ipsec_out_need_policy == B_FALSE); 24467 ASSERT((io->ipsec_out_policy != NULL) || 24468 (io->ipsec_out_act != NULL)); 24469 ASSERT(io->ipsec_out_failed == B_FALSE); 24470 24471 if (!ipsec_loaded()) { 24472 ipha = (ipha_t *)ipsec_mp->b_cont->b_rptr; 24473 if (IPH_HDR_VERSION(ipha) == IP_VERSION) { 24474 BUMP_MIB(&ip_mib, ipOutDiscards); 24475 } else { 24476 BUMP_MIB(&ip6_mib, ipv6OutDiscards); 24477 } 24478 ip_drop_packet(ipsec_mp, B_FALSE, NULL, ire, 24479 &ipdrops_ip_ipsec_not_loaded, &ip_dropper); 24480 return; 24481 } 24482 24483 /* 24484 * IPSEC processing has started. 24485 */ 24486 io->ipsec_out_proc_begin = B_TRUE; 24487 ap = io->ipsec_out_act; 24488 if (ap == NULL) { 24489 pp = io->ipsec_out_policy; 24490 ASSERT(pp != NULL); 24491 ap = pp->ipsp_act; 24492 ASSERT(ap != NULL); 24493 } 24494 24495 /* 24496 * Save the outbound ill index. When the packet comes back 24497 * from IPsec, we make sure the ill hasn't changed or disappeared 24498 * before sending it the accelerated packet. 24499 */ 24500 if ((ire != NULL) && (io->ipsec_out_capab_ill_index == 0)) { 24501 int ifindex; 24502 ill = ire_to_ill(ire); 24503 ifindex = ill->ill_phyint->phyint_ifindex; 24504 io->ipsec_out_capab_ill_index = ifindex; 24505 } 24506 24507 /* 24508 * The order of processing is first insert a IP header if needed. 
24509 * Then insert the ESP header and then the AH header. 24510 */ 24511 if ((io->ipsec_out_se_done == B_FALSE) && 24512 (ap->ipa_want_se)) { 24513 /* 24514 * First get the outer IP header before sending 24515 * it to ESP. 24516 */ 24517 ipha_t *oipha, *iipha; 24518 mblk_t *outer_mp, *inner_mp; 24519 24520 if ((outer_mp = allocb(sizeof (ipha_t), BPRI_HI)) == NULL) { 24521 (void) mi_strlog(q, 0, SL_ERROR|SL_TRACE|SL_CONSOLE, 24522 "ipsec_out_process: " 24523 "Self-Encapsulation failed: Out of memory\n"); 24524 freemsg(ipsec_mp); 24525 BUMP_MIB(&ip_mib, ipOutDiscards); 24526 return; 24527 } 24528 inner_mp = ipsec_mp->b_cont; 24529 ASSERT(inner_mp->b_datap->db_type == M_DATA); 24530 oipha = (ipha_t *)outer_mp->b_rptr; 24531 iipha = (ipha_t *)inner_mp->b_rptr; 24532 *oipha = *iipha; 24533 outer_mp->b_wptr += sizeof (ipha_t); 24534 oipha->ipha_length = htons(ntohs(iipha->ipha_length) + 24535 sizeof (ipha_t)); 24536 oipha->ipha_protocol = IPPROTO_ENCAP; 24537 oipha->ipha_version_and_hdr_length = 24538 IP_SIMPLE_HDR_VERSION; 24539 oipha->ipha_hdr_checksum = 0; 24540 oipha->ipha_hdr_checksum = ip_csum_hdr(oipha); 24541 outer_mp->b_cont = inner_mp; 24542 ipsec_mp->b_cont = outer_mp; 24543 24544 io->ipsec_out_se_done = B_TRUE; 24545 io->ipsec_out_encaps = B_TRUE; 24546 } 24547 24548 if (((ap->ipa_want_ah && (io->ipsec_out_ah_sa == NULL)) || 24549 (ap->ipa_want_esp && (io->ipsec_out_esp_sa == NULL))) && 24550 !ipsec_out_select_sa(ipsec_mp)) 24551 return; 24552 24553 /* 24554 * By now, we know what SA's to use. Toss over to ESP & AH 24555 * to do the heavy lifting. 
24556 */ 24557 zoneid = io->ipsec_out_zoneid; 24558 ASSERT(zoneid != ALL_ZONES); 24559 if ((io->ipsec_out_esp_done == B_FALSE) && (ap->ipa_want_esp)) { 24560 ASSERT(io->ipsec_out_esp_sa != NULL); 24561 io->ipsec_out_esp_done = B_TRUE; 24562 /* 24563 * Note that since hw accel can only apply one transform, 24564 * not two, we skip hw accel for ESP if we also have AH 24565 * This is an design limitation of the interface 24566 * which should be revisited. 24567 */ 24568 ASSERT(ire != NULL); 24569 if (io->ipsec_out_ah_sa == NULL) { 24570 ill = (ill_t *)ire->ire_stq->q_ptr; 24571 ipsec_out_is_accelerated(ipsec_mp, 24572 io->ipsec_out_esp_sa, ill, ire); 24573 } 24574 24575 ipsec_rc = io->ipsec_out_esp_sa->ipsa_output_func(ipsec_mp); 24576 switch (ipsec_rc) { 24577 case IPSEC_STATUS_SUCCESS: 24578 break; 24579 case IPSEC_STATUS_FAILED: 24580 BUMP_MIB(&ip_mib, ipOutDiscards); 24581 /* FALLTHRU */ 24582 case IPSEC_STATUS_PENDING: 24583 return; 24584 } 24585 } 24586 24587 if ((io->ipsec_out_ah_done == B_FALSE) && (ap->ipa_want_ah)) { 24588 ASSERT(io->ipsec_out_ah_sa != NULL); 24589 io->ipsec_out_ah_done = B_TRUE; 24590 if (ire == NULL) { 24591 int idx = io->ipsec_out_capab_ill_index; 24592 ill = ill_lookup_on_ifindex(idx, B_FALSE, 24593 NULL, NULL, NULL, NULL); 24594 ill_need_rele = B_TRUE; 24595 } else { 24596 ill = (ill_t *)ire->ire_stq->q_ptr; 24597 } 24598 ipsec_out_is_accelerated(ipsec_mp, io->ipsec_out_ah_sa, ill, 24599 ire); 24600 24601 ipsec_rc = io->ipsec_out_ah_sa->ipsa_output_func(ipsec_mp); 24602 switch (ipsec_rc) { 24603 case IPSEC_STATUS_SUCCESS: 24604 break; 24605 case IPSEC_STATUS_FAILED: 24606 BUMP_MIB(&ip_mib, ipOutDiscards); 24607 /* FALLTHRU */ 24608 case IPSEC_STATUS_PENDING: 24609 if (ill != NULL && ill_need_rele) 24610 ill_refrele(ill); 24611 return; 24612 } 24613 } 24614 /* 24615 * We are done with IPSEC processing. Send it over 24616 * the wire. 
24617 */ 24618 done: 24619 mp = ipsec_mp->b_cont; 24620 ipha = (ipha_t *)mp->b_rptr; 24621 if (IPH_HDR_VERSION(ipha) == IP_VERSION) { 24622 ip_wput_ipsec_out(q, ipsec_mp, ipha, ill, ire); 24623 } else { 24624 ip6h = (ip6_t *)ipha; 24625 ip_wput_ipsec_out_v6(q, ipsec_mp, ip6h, ill, ire); 24626 } 24627 if (ill != NULL && ill_need_rele) 24628 ill_refrele(ill); 24629 } 24630 24631 /* ARGSUSED */ 24632 void 24633 ip_restart_optmgmt(ipsq_t *dummy_sq, queue_t *q, mblk_t *first_mp, void *dummy) 24634 { 24635 opt_restart_t *or; 24636 int err; 24637 conn_t *connp; 24638 24639 ASSERT(CONN_Q(q)); 24640 connp = Q_TO_CONN(q); 24641 24642 ASSERT(first_mp->b_datap->db_type == M_CTL); 24643 or = (opt_restart_t *)first_mp->b_rptr; 24644 /* 24645 * We don't need to pass any credentials here since this is just 24646 * a restart. The credentials are passed in when svr4_optcom_req 24647 * is called the first time (from ip_wput_nondata). 24648 */ 24649 if (or->or_type == T_SVR4_OPTMGMT_REQ) { 24650 err = svr4_optcom_req(q, first_mp, NULL, 24651 &ip_opt_obj); 24652 } else { 24653 ASSERT(or->or_type == T_OPTMGMT_REQ); 24654 err = tpi_optcom_req(q, first_mp, NULL, 24655 &ip_opt_obj); 24656 } 24657 if (err != EINPROGRESS) { 24658 /* operation is done */ 24659 CONN_OPER_PENDING_DONE(connp); 24660 } 24661 } 24662 24663 /* 24664 * ioctls that go through a down/up sequence may need to wait for the down 24665 * to complete. This involves waiting for the ire and ipif refcnts to go down 24666 * to zero. Subsequently the ioctl is restarted from ipif_ill_refrele_tail. 
 */
/*
 * Restart an exclusive ioctl that was deferred pending an interface
 * down/up sequence (see the comment above).  Runs in the ipsq; looks the
 * command back up in the ioctl table, rebuilds the sin/lifr/ifr argument
 * pointers from the copied-in data, and invokes the command's restart
 * function before finishing the ioctl.
 */
/* ARGSUSED */
void
ip_reprocess_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
{
	struct iocblk *iocp;
	mblk_t *mp1;
	ipif_t *ipif;
	ip_ioctl_cmd_t *ipip;
	int err;
	sin_t *sin;
	struct lifreq *lifr;
	struct ifreq *ifr;

	iocp = (struct iocblk *)mp->b_rptr;
	ASSERT(ipsq != NULL);
	/* Existence of mp1 verified in ip_wput_nondata */
	mp1 = mp->b_cont->b_cont;
	ipip = ip_sioctl_lookup(iocp->ioc_cmd);
	if (ipip->ipi_cmd == SIOCSLIFNAME || ipip->ipi_cmd == IF_UNITSEL) {
		ill_t *ill;
		/*
		 * Special case where ipsq_current_ipif may not be set.
		 * ill_phyint_reinit merged the v4 and v6 into a single ipsq.
		 * ill could also have become part of a ipmp group in the
		 * process, we are here as were not able to complete the
		 * operation in ipif_set_values because we could not become
		 * exclusive on the new ipsq, In such a case ipsq_current_ipif
		 * will not be set so we need to set it.
		 */
		ill = (ill_t *)q->q_ptr;
		ipsq->ipsq_current_ipif = ill->ill_ipif;
		ipsq->ipsq_last_cmd = ipip->ipi_cmd;
	}

	ipif = ipsq->ipsq_current_ipif;
	ASSERT(ipif != NULL);
	if (ipip->ipi_cmd_type == IF_CMD) {
		/* This is an old style SIOC[GS]IF* command */
		ifr = (struct ifreq *)mp1->b_rptr;
		sin = (sin_t *)&ifr->ifr_addr;
	} else if (ipip->ipi_cmd_type == LIF_CMD) {
		/* This is a new style SIOC[GS]LIF* command */
		lifr = (struct lifreq *)mp1->b_rptr;
		sin = (sin_t *)&lifr->lifr_addr;
	} else {
		/* Neither IF nor LIF style: no address argument to pass. */
		sin = NULL;
	}

	err = (*ipip->ipi_func_restart)(ipif, sin, q, mp, ipip,
	    (void *)mp1->b_rptr);

	/* SIOCLIFREMOVEIF could have removed the ipif */
	ip_ioctl_finish(q, mp, err,
	    ipip->ipi_flags & IPI_GET_CMD ? COPYOUT : NO_COPYOUT,
	    ipip->ipi_cmd == SIOCLIFREMOVEIF ? NULL : ipif, ipsq);
}

/*
 * ioctl processing
 *
 * ioctl processing starts with ip_sioctl_copyin_setup which looks up
 * the ioctl command in the ioctl tables and determines the copyin data size
 * from the ioctl property ipi_copyin_size, and does an mi_copyin() of that
 * size.
 *
 * ioctl processing then continues when the M_IOCDATA makes its way down.
 * Now the ioctl is looked up again in the ioctl table, and its properties are
 * extracted. The associated 'conn' is then refheld till the end of the ioctl
 * and the general ioctl processing function ip_process_ioctl is called.
 * ip_process_ioctl determines if the ioctl needs to be serialized, and if
 * so goes thru the serialization primitive ipsq_try_enter. Then the
 * appropriate function to handle the ioctl is called based on the entry in
 * the ioctl table. ioctl completion is encapsulated in ip_ioctl_finish
 * which also refreleases the 'conn' that was refheld at the start of the
 * ioctl. Finally ipsq_exit is called if needed to exit the ipsq.
 * ip_extract_lifreq_cmn extracts the interface name from the lifreq/ifreq
 * struct and looks up the ipif. ip_extract_tunreq handles the case of tunnel.
 *
 * Many exclusive ioctls go thru an internal down up sequence as part of
 * the operation. For example an attempt to change the IP address of an
 * ipif entails ipif_down, set address, ipif_up. Bringing down the interface
 * does all the cleanup such as deleting all ires that use this address.
 * Then we need to wait till all references to the interface go away.
 */
void
ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg)
{
	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
	ip_ioctl_cmd_t *ipip = (ip_ioctl_cmd_t *)arg;
	cmd_info_t ci;
	int err;
	boolean_t entered_ipsq = B_FALSE;

	ip3dbg(("ip_process_ioctl: ioctl %X\n", iocp->ioc_cmd));

	/* arg is NULL on the initial call; look the command up ourselves. */
	if (ipip == NULL)
		ipip = ip_sioctl_lookup(iocp->ioc_cmd);

	/*
	 * SIOCLIFADDIF needs to go thru a special path since the
	 * ill may not exist yet. This happens in the case of lo0
	 * which is created using this ioctl.
	 */
	if (ipip->ipi_cmd == SIOCLIFADDIF) {
		err = ip_sioctl_addif(NULL, NULL, q, mp, NULL, NULL);
		ip_ioctl_finish(q, mp, err,
		    ipip->ipi_flags & IPI_GET_CMD ? COPYOUT : NO_COPYOUT,
		    NULL, NULL);
		return;
	}

	/* Extract the target ipif (and addr info) per the command family. */
	ci.ci_ipif = NULL;
	switch (ipip->ipi_cmd_type) {
	case IF_CMD:
	case LIF_CMD:
		/*
		 * ioctls that pass in a [l]ifreq appear here.
		 * ip_extract_lifreq_cmn returns a refheld ipif in
		 * ci.ci_ipif
		 */
		err = ip_extract_lifreq_cmn(q, mp, ipip->ipi_cmd_type,
		    ipip->ipi_flags, &ci, ip_process_ioctl);
		if (err != 0) {
			ip_ioctl_finish(q, mp, err,
			    ipip->ipi_flags & IPI_GET_CMD ?
			    COPYOUT : NO_COPYOUT, NULL, NULL);
			return;
		}
		ASSERT(ci.ci_ipif != NULL);
		break;

	case TUN_CMD:
		/*
		 * SIOC[GS]TUNPARAM appear here. ip_extract_tunreq returns
		 * a refheld ipif in ci.ci_ipif
		 */
		err = ip_extract_tunreq(q, mp, &ci.ci_ipif, ip_process_ioctl);
		if (err != 0) {
			ip_ioctl_finish(q, mp, err,
			    ipip->ipi_flags & IPI_GET_CMD ?
			    COPYOUT : NO_COPYOUT, NULL, NULL);
			return;
		}
		ASSERT(ci.ci_ipif != NULL);
		break;

	case MISC_CMD:
		/*
		 * ioctls that neither pass in [l]ifreq or iftun_req come here
		 * For eg. SIOCGLIFCONF will appear here.
		 */
		switch (ipip->ipi_cmd) {
		case IF_UNITSEL:
			/* ioctl comes down the ill */
			ci.ci_ipif = ((ill_t *)q->q_ptr)->ill_ipif;
			ipif_refhold(ci.ci_ipif);
			break;
		case SIOCGMSFILTER:
		case SIOCSMSFILTER:
		case SIOCGIPMSFILTER:
		case SIOCSIPMSFILTER:
			err = ip_extract_msfilter(q, mp, &ci.ci_ipif,
			    ip_process_ioctl);
			if (err != 0) {
				ip_ioctl_finish(q, mp, err,
				    ipip->ipi_flags & IPI_GET_CMD ?
				    COPYOUT : NO_COPYOUT, NULL, NULL);
				return;
			}
			break;
		}
		err = 0;
		ci.ci_sin = NULL;
		ci.ci_sin6 = NULL;
		ci.ci_lifr = NULL;
		break;
	}

	/*
	 * If ipsq is non-null, we are already being called exclusively
	 */
	ASSERT(ipsq == NULL || IAM_WRITER_IPSQ(ipsq));
	if (!(ipip->ipi_flags & IPI_WR)) {
		/*
		 * Non-exclusive command: dispatch directly, no ipsq needed.
		 * A return value of EINPROGRESS means the ioctl is
		 * either queued and waiting for some reason or has
		 * already completed.
		 */
		err = (*ipip->ipi_func)(ci.ci_ipif, ci.ci_sin, q, mp, ipip,
		    ci.ci_lifr);
		if (ci.ci_ipif != NULL)
			ipif_refrele(ci.ci_ipif);
		ip_ioctl_finish(q, mp, err,
		    ipip->ipi_flags & IPI_GET_CMD ? COPYOUT : NO_COPYOUT,
		    NULL, NULL);
		return;
	}

	ASSERT(ci.ci_ipif != NULL);

	/* Exclusive command: serialize on the ipif's ipsq. */
	if (ipsq == NULL) {
		ipsq = ipsq_try_enter(ci.ci_ipif, NULL, q, mp,
		    ip_process_ioctl, NEW_OP, B_TRUE);
		entered_ipsq = B_TRUE;
	}
	/*
	 * Release the ipif so that ipif_down and friends that wait for
	 * references to go away are not misled about the current ipif_refcnt
	 * values. We are writer so we can access the ipif even after releasing
	 * the ipif.
	 */
	ipif_refrele(ci.ci_ipif);
	/* ipsq_try_enter failed: the request was queued for later redo. */
	if (ipsq == NULL)
		return;

	mutex_enter(&ipsq->ipsq_lock);
	ASSERT(ipsq->ipsq_current_ipif == NULL);
	ipsq->ipsq_current_ipif = ci.ci_ipif;
	ipsq->ipsq_last_cmd = ipip->ipi_cmd;
	mutex_exit(&ipsq->ipsq_lock);
	mutex_enter(&(ci.ci_ipif)->ipif_ill->ill_lock);
	/*
	 * For most set ioctls that come here, this serves as a single point
	 * where we set the IPIF_CHANGING flag. This ensures that there won't
	 * be any new references to the ipif. This helps functions that go
	 * through this path and end up trying to wait for the refcnts
	 * associated with the ipif to go down to zero. Some exceptions are
	 * Failover, Failback, and Groupname commands that operate on more than
	 * just the ci.ci_ipif. These commands internally determine the
	 * set of ipif's they operate on and set and clear the IPIF_CHANGING
	 * flags on that set. Another exception is the Removeif command that
	 * sets the IPIF_CONDEMNED flag internally after identifying the right
	 * ipif to operate on.
	 */
	if (ipip->ipi_cmd != SIOCLIFREMOVEIF &&
	    ipip->ipi_cmd != SIOCLIFFAILOVER &&
	    ipip->ipi_cmd != SIOCLIFFAILBACK &&
	    ipip->ipi_cmd != SIOCSLIFGROUPNAME)
		(ci.ci_ipif)->ipif_state_flags |= IPIF_CHANGING;
	mutex_exit(&(ci.ci_ipif)->ipif_ill->ill_lock);

	/*
	 * A return value of EINPROGRESS means the ioctl is
	 * either queued and waiting for some reason or has
	 * already completed.
	 */
	err = (*ipip->ipi_func)(ci.ci_ipif, ci.ci_sin, q, mp, ipip,
	    ci.ci_lifr);

	/* SIOCLIFREMOVEIF could have removed the ipif */
	ip_ioctl_finish(q, mp, err,
	    ipip->ipi_flags & IPI_GET_CMD ? COPYOUT : NO_COPYOUT,
	    ipip->ipi_cmd == SIOCLIFREMOVEIF ? NULL : ci.ci_ipif, ipsq);

	if (entered_ipsq)
		ipsq_exit(ipsq, B_TRUE, B_TRUE);
}

/*
 * Complete the ioctl.
Typically ioctls use the mi package and need to
 * do mi_copyout/mi_copy_done.
 */
void
ip_ioctl_finish(queue_t *q, mblk_t *mp, int err, int mode,
    ipif_t *ipif, ipsq_t *ipsq)
{
	conn_t	*conn = NULL;

	/* EINPROGRESS: the ioctl is still queued somewhere; finish later. */
	if (err == EINPROGRESS)
		return;

	if (CONN_Q(q)) {
		conn = Q_TO_CONN(q);
		ASSERT(conn->conn_ref >= 2);
	}

	/*
	 * Ack the ioctl back to the requestor: a successful get-style
	 * command starts the copyout of its results, every other replied
	 * mode completes via mi_copy_done.  Any other mode value (e.g. an
	 * ioctl aborted through a conn close) sends no reply at all.
	 */
	if (mode == COPYOUT && err == 0)
		mi_copyout(q, mp);
	else if (mode == COPYOUT || mode == NO_COPYOUT)
		mi_copy_done(q, mp, err);

	/* Drop the conn refhold that was placed at the start of the ioctl. */
	if (conn != NULL)
		CONN_OPER_PENDING_DONE(conn);

	/*
	 * An exclusive ioctl set IPIF_CHANGING when it started; undo that
	 * here, under the owning ill's lock.
	 */
	if (ipif != NULL) {
		ill_t	*ill = ipif->ipif_ill;

		mutex_enter(&ill->ill_lock);
		ipif->ipif_state_flags &= ~IPIF_CHANGING;
		mutex_exit(&ill->ill_lock);
	}

	/*
	 * Clear the current ipif in the ipsq now that the ioctl is done;
	 * a non-null ipsq_current_ipif keeps new ioctls out of the ipsq.
	 */
	if (ipsq != NULL) {
		mutex_enter(&ipsq->ipsq_lock);
		ipsq->ipsq_current_ipif = NULL;
		mutex_exit(&ipsq->ipsq_lock);
	}
}

/*
 * This is called from ip_wput_nondata to resume a deferred TCP bind.
 */
/* ARGSUSED */
void
ip_resume_tcp_bind(void *arg, mblk_t *mp, void *arg2)
{
	conn_t *connp = arg;
	tcp_t	*tcp;

	ASSERT(connp != NULL && IPCL_IS_TCP(connp) && connp->conn_tcp != NULL);
	tcp = connp->conn_tcp;

	/* If the endpoint closed while the bind was queued, just drop mp. */
	if (connp->conn_tcp->tcp_state == TCPS_CLOSED)
		freemsg(mp);
	else
		tcp_rput_other(tcp, mp);
	/* Drop the refhold taken when the bind retry was queued. */
	CONN_OPER_PENDING_DONE(connp);
}

/* Called from ip_wput for all non data messages */
/* ARGSUSED */
void
ip_wput_nondata(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
{
	mblk_t		*mp1;
	ire_t		*ire;
	ill_t		*ill;
	struct iocblk	*iocp;
	ip_ioctl_cmd_t	*ipip;
	cred_t		*cr;
	conn_t		*connp = NULL;
	int		cmd, err;

	if (CONN_Q(q))
		connp = Q_TO_CONN(q);

	cr = DB_CREDDEF(mp, GET_QUEUE_CRED(q));

	/* Check if it is a queue to /dev/sctp. */
	if (connp != NULL && connp->conn_ulp == IPPROTO_SCTP &&
	    connp->conn_rq == NULL) {
		sctp_wput(q, mp);
		return;
	}

	/* Dispatch on the STREAMS message type. */
	switch (DB_TYPE(mp)) {
	case M_IOCTL:
		/*
		 * IOCTL processing begins in ip_sioctl_copyin_setup which
		 * will arrange to copy in associated control structures.
		 */
		ip_sioctl_copyin_setup(q, mp);
		return;
	case M_IOCDATA:
		/*
		 * Ensure that this is associated with one of our trans-
		 * parent ioctls. If it's not ours, discard it if we're
		 * running as a driver, or pass it on if we're a module.
		 */
		iocp = (struct iocblk *)mp->b_rptr;
		ipip = ip_sioctl_lookup(iocp->ioc_cmd);
		if (ipip == NULL) {
			if (q->q_next == NULL) {
				goto nak;
			} else {
				putnext(q, mp);
			}
			return;
		} else if ((q->q_next != NULL) &&
		    !(ipip->ipi_flags & IPI_MODOK)) {
			/*
			 * the ioctl is one we recognise, but is not
			 * consumed by IP as a module, pass M_IOCDATA
			 * for processing downstream, but only for
			 * common Streams ioctls.
			 */
			if (ipip->ipi_flags & IPI_PASS_DOWN) {
				putnext(q, mp);
				return;
			} else {
				goto nak;
			}
		}

		/* IOCTL continuation following copyin or copyout. */
		if (mi_copy_state(q, mp, NULL) == -1) {
			/*
			 * The copy operation failed. mi_copy_state already
			 * cleaned up, so we're out of here.
			 */
			return;
		}
		/*
		 * If we just completed a copy in, we become writer and
		 * continue processing in ip_sioctl_copyin_done. If it
		 * was a copy out, we call mi_copyout again. If there is
		 * nothing more to copy out, it will complete the IOCTL.
		 */
		if (MI_COPY_DIRECTION(mp) == MI_COPY_IN) {
			if (!(mp1 = mp->b_cont) || !(mp1 = mp1->b_cont)) {
				mi_copy_done(q, mp, EPROTO);
				return;
			}
			/*
			 * Check for cases that need more copying. A return
			 * value of 0 means a second copyin has been started,
			 * so we return; a return value of 1 means no more
			 * copying is needed, so we continue.
			 */
			cmd = iocp->ioc_cmd;
			if ((cmd == SIOCGMSFILTER || cmd == SIOCSMSFILTER ||
			    cmd == SIOCGIPMSFILTER || cmd == SIOCSIPMSFILTER) &&
			    MI_COPY_COUNT(mp) == 1) {
				if (ip_copyin_msfilter(q, mp) == 0)
					return;
			}
			/*
			 * Refhold the conn, till the ioctl completes. This is
			 * needed in case the ioctl ends up in the pending mp
			 * list. Every mp in the ill_pending_mp list and
			 * the ipsq_pending_mp must have a refhold on the conn
			 * to resume processing. The refhold is released when
			 * the ioctl completes. (normally or abnormally)
			 * In all cases ip_ioctl_finish is called to finish
			 * the ioctl.
			 */
			if (connp != NULL) {
				/* This is not a reentry */
				ASSERT(ipsq == NULL);
				CONN_INC_REF(connp);
			} else {
				if (!(ipip->ipi_flags & IPI_MODOK)) {
					mi_copy_done(q, mp, EINVAL);
					return;
				}
			}

			ip_process_ioctl(ipsq, q, mp, ipip);

		} else {
			mi_copyout(q, mp);
		}
		return;
nak:
		/* Unrecognized transparent ioctl: nak it back upstream. */
		iocp->ioc_error = EINVAL;
		mp->b_datap->db_type = M_IOCNAK;
		iocp->ioc_count = 0;
		qreply(q, mp);
		return;

	case M_IOCNAK:
		/*
		 * The only way we could get here is if a resolver didn't like
		 * an IOCTL we sent it. This shouldn't happen.
		 */
		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
		    "ip_wput: unexpected M_IOCNAK, ioc_cmd 0x%x",
		    ((struct iocblk *)mp->b_rptr)->ioc_cmd);
		freemsg(mp);
		return;
	case M_IOCACK:
		/* Finish socket ioctls passed through to ARP. */
		ip_sioctl_iocack(q, mp);
		return;
	case M_FLUSH:
		if (*mp->b_rptr & FLUSHW)
			flushq(q, FLUSHALL);
		if (q->q_next) {
			/*
			 * M_FLUSH is sent up to IP by some drivers during
			 * unbind. ip_rput has already replied to it. We are
			 * here for the M_FLUSH that we originated in IP
			 * before sending the unbind request to the driver.
			 * Just free it as we don't queue packets in IP
			 * on the write side of the device instance.
			 */
			freemsg(mp);
			return;
		}
		if (*mp->b_rptr & FLUSHR) {
			*mp->b_rptr &= ~FLUSHW;
			qreply(q, mp);
			return;
		}
		freemsg(mp);
		return;
	case IRE_DB_REQ_TYPE:
		/* An Upper Level Protocol wants a copy of an IRE. */
		ip_ire_req(q, mp);
		return;
	case M_CTL:
		if (mp->b_wptr - mp->b_rptr < sizeof (uint32_t))
			break;

		/* Label notification from a labeled ULP below us. */
		if (connp != NULL && *(uint32_t *)mp->b_rptr ==
		    IP_ULP_OUT_LABELED) {
			out_labeled_t *olp;

			if (mp->b_wptr - mp->b_rptr != sizeof (*olp))
				break;
			olp = (out_labeled_t *)mp->b_rptr;
			connp->conn_ulp_labeled = olp->out_qnext == q;
			freemsg(mp);
			return;
		}

		/* M_CTL messages are used by ARP to tell us things. */
		if ((mp->b_wptr - mp->b_rptr) < sizeof (arc_t))
			break;
		switch (((arc_t *)mp->b_rptr)->arc_cmd) {
		case AR_ENTRY_SQUERY:
			ip_wput_ctl(q, mp);
			return;
		case AR_CLIENT_NOTIFY:
			ip_arp_news(q, mp);
			return;
		case AR_DLPIOP_DONE:
			ASSERT(q->q_next != NULL);
			ill = (ill_t *)q->q_ptr;
			/* qwriter_ip releases the refhold */
			/* refhold on ill stream is ok without ILL_CAN_LOOKUP */
			ill_refhold(ill);
			(void) qwriter_ip(NULL, ill, q, mp, ip_arp_done,
			    CUR_OP, B_FALSE);
			return;
		case AR_ARP_CLOSING:
			/*
			 * ARP (above us) is closing. If no ARP bringup is
			 * currently pending, ack the message so that ARP
			 * can complete its close. Also mark ill_arp_closing
			 * so that new ARP bringups will fail. If any
			 * ARP bringup is currently in progress, we will
			 * ack this when the current ARP bringup completes.
			 */
			ASSERT(q->q_next != NULL);
			ill = (ill_t *)q->q_ptr;
			mutex_enter(&ill->ill_lock);
			ill->ill_arp_closing = 1;
			if (!ill->ill_arp_bringup_pending) {
				mutex_exit(&ill->ill_lock);
				qreply(q, mp);
			} else {
				mutex_exit(&ill->ill_lock);
				freemsg(mp);
			}
			return;
		default:
			break;
		}
		break;
	case M_PROTO:
	case M_PCPROTO:
		/*
		 * The only PROTO messages we expect are ULP binds and
		 * copies of option negotiation acknowledgements.
		 */
		switch (((union T_primitives *)mp->b_rptr)->type) {
		case O_T_BIND_REQ:
		case T_BIND_REQ: {
			/* Request can get queued in bind */
			ASSERT(connp != NULL);
			/*
			 * Both TCP and UDP call ip_bind_{v4,v6}() directly
			 * instead of going through this path. We only get
			 * here in the following cases:
			 *
			 * a. Bind retries, where ipsq is non-NULL.
			 * b. T_BIND_REQ is issued from non TCP/UDP
			 *    transport, e.g. icmp for raw socket,
			 *    in which case ipsq will be NULL.
			 */
			ASSERT(ipsq != NULL ||
			    (!IPCL_IS_TCP(connp) && !IPCL_IS_UDP(connp)));

			/* Don't increment refcnt if this is a re-entry */
			if (ipsq == NULL)
				CONN_INC_REF(connp);
			mp = connp->conn_af_isv6 ? ip_bind_v6(q, mp,
			    connp, NULL) : ip_bind_v4(q, mp, connp);
			/* NULL mp means the bind was queued for later. */
			if (mp == NULL)
				return;
			if (IPCL_IS_TCP(connp)) {
				/*
				 * In the case of TCP endpoint we
				 * come here only for bind retries
				 */
				ASSERT(ipsq != NULL);
				CONN_INC_REF(connp);
				squeue_fill(connp->conn_sqp, mp,
				    ip_resume_tcp_bind, connp,
				    SQTAG_BIND_RETRY);
				return;
			} else if (IPCL_IS_UDP(connp)) {
				/*
				 * In the case of UDP endpoint we
				 * come here only for bind retries
				 */
				ASSERT(ipsq != NULL);
				udp_resume_bind(connp, mp);
				return;
			}
			qreply(q, mp);
			CONN_OPER_PENDING_DONE(connp);
			return;
		}
		case T_SVR4_OPTMGMT_REQ:
			ip2dbg(("ip_wput: T_SVR4_OPTMGMT_REQ flags %x\n",
			    ((struct T_optmgmt_req *)mp->b_rptr)->MGMT_flags));

			ASSERT(connp != NULL);
			if (!snmpcom_req(q, mp, ip_snmp_set,
			    ip_snmp_get, cr)) {
				/*
				 * Call svr4_optcom_req so that it can
				 * generate the ack. We don't come here
				 * if this operation is being restarted.
				 * ip_restart_optmgmt will drop the conn ref.
				 * In the case of ipsec option after the ipsec
				 * load is complete conn_restart_ipsec_waiter
				 * drops the conn ref.
				 */
				ASSERT(ipsq == NULL);
				CONN_INC_REF(connp);
				if (ip_check_for_ipsec_opt(q, mp))
					return;
				err = svr4_optcom_req(q, mp, cr, &ip_opt_obj);
				if (err != EINPROGRESS) {
					/* Operation is done */
					CONN_OPER_PENDING_DONE(connp);
				}
			}
			return;
		case T_OPTMGMT_REQ:
			ip2dbg(("ip_wput: T_OPTMGMT_REQ\n"));
			/*
			 * Note: No snmpcom_req support through new
			 * T_OPTMGMT_REQ.
			 * Call tpi_optcom_req so that it can
			 * generate the ack.
			 */
			ASSERT(connp != NULL);
			ASSERT(ipsq == NULL);
			/*
			 * We don't come here for restart. ip_restart_optmgmt
			 * will drop the conn ref. In the case of ipsec option
			 * after the ipsec load is complete
			 * conn_restart_ipsec_waiter drops the conn ref.
			 */
			CONN_INC_REF(connp);
			if (ip_check_for_ipsec_opt(q, mp))
				return;
			err = tpi_optcom_req(q, mp, cr, &ip_opt_obj);
			if (err != EINPROGRESS) {
				/* Operation is done */
				CONN_OPER_PENDING_DONE(connp);
			}
			return;
		case T_UNBIND_REQ:
			mp = ip_unbind(q, mp);
			qreply(q, mp);
			return;
		default:
			/*
			 * Have to drop any DLPI messages coming down from
			 * arp (such as an info_req which would cause ip
			 * to receive an extra info_ack if it was passed
			 * through.
			 */
			ip1dbg(("ip_wput_nondata: dropping M_PROTO %d\n",
			    (int)*(uint_t *)mp->b_rptr));
			freemsg(mp);
			return;
		}
		/* NOTREACHED */
	case IRE_DB_TYPE: {
		nce_t		*nce;
		ill_t		*ill;
		in6_addr_t	gw_addr_v6;


		/*
		 * This is a response back from a resolver. It
		 * consists of a message chain containing:
		 *	IRE_MBLK-->LL_HDR_MBLK->pkt
		 * The IRE_MBLK is the one we allocated in ip_newroute.
		 * The LL_HDR_MBLK is the DLPI header to use to get
		 * the attached packet, and subsequent ones for the
		 * same destination, transmitted.
		 */
		if ((mp->b_wptr - mp->b_rptr) != sizeof (ire_t))	/* ire */
			break;
		/*
		 * First, check to make sure the resolution succeeded.
		 * If it failed, the second mblk will be empty.
		 * If it is, free the chain, dropping the packet.
		 * (We must ire_delete the ire; that frees the ire mblk)
		 * We're doing this now to support PVCs for ATM; it's
		 * a partial xresolv implementation. When we fully implement
		 * xresolv interfaces, instead of freeing everything here
		 * we'll initiate neighbor discovery.
		 *
		 * For v4 (ARP and other external resolvers) the resolver
		 * frees the message, so no check is needed. This check
		 * is required, though, for a full xresolve implementation.
		 * Including this code here now both shows how external
		 * resolvers can NACK a resolution request using an
		 * existing design that has no specific provisions for NACKs,
		 * and also takes into account that the current non-ARP
		 * external resolver has been coded to use this method of
		 * NACKing for all IPv6 (xresolv) cases,
		 * whether our xresolv implementation is complete or not.
		 *
		 */
		ire = (ire_t *)mp->b_rptr;
		ill = ire_to_ill(ire);
		mp1 = mp->b_cont;		/* dl_unitdata_req */
		if (mp1->b_rptr == mp1->b_wptr) {
			/* Empty header mblk: the resolution was NACKed. */
			if (ire->ire_ipversion == IPV6_VERSION) {
				/*
				 * XRESOLV interface.
				 */
				ASSERT(ill->ill_flags & ILLF_XRESOLV);
				mutex_enter(&ire->ire_lock);
				gw_addr_v6 = ire->ire_gateway_addr_v6;
				mutex_exit(&ire->ire_lock);
				if (IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6)) {
					nce = ndp_lookup(ill,
					    &ire->ire_addr_v6, B_FALSE);
				} else {
					nce = ndp_lookup(ill, &gw_addr_v6,
					    B_FALSE);
				}
				if (nce != NULL) {
					nce_resolv_failed(nce);
					ndp_delete(nce);
					NCE_REFRELE(nce);
				}
			}
			mp->b_cont = NULL;
			freemsg(mp1);		/* frees the pkt as well */
			ire_delete((ire_t *)mp->b_rptr);
			return;
		}
		/*
		 * Split them into IRE_MBLK and pkt and feed it into
		 * ire_add_then_send. Then in ire_add_then_send
		 * the IRE will be added, and then the packet will be
		 * run back through ip_wput. This time it will make
		 * it to the wire.
		 */
		mp->b_cont = NULL;
		mp = mp1->b_cont;		/* now, mp points to pkt */
		mp1->b_cont = NULL;
		ip1dbg(("ip_wput_nondata: reply from external resolver \n"));
		if (ire->ire_ipversion == IPV6_VERSION) {
			/*
			 * XRESOLV interface. Find the nce and put a copy
			 * of the dl_unitdata_req in nce_res_mp
			 */
			ASSERT(ill->ill_flags & ILLF_XRESOLV);
			mutex_enter(&ire->ire_lock);
			gw_addr_v6 = ire->ire_gateway_addr_v6;
			mutex_exit(&ire->ire_lock);
			if (IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6)) {
				nce = ndp_lookup(ill, &ire->ire_addr_v6,
				    B_FALSE);
			} else {
				nce = ndp_lookup(ill, &gw_addr_v6, B_FALSE);
			}
			if (nce != NULL) {
				/*
				 * We have to protect nce_res_mp here
				 * from being accessed by other threads
				 * while we change the mblk pointer.
				 * Other functions will also lock the nce when
				 * accessing nce_res_mp.
				 *
				 * The reason we change the mblk pointer
				 * here rather than copying the resolved address
				 * into the template is that, unlike with
				 * ethernet, we have no guarantee that the
				 * resolved address length will be
				 * smaller than or equal to the lla length
				 * with which the template was allocated,
				 * (for ethernet, they're equal)
				 * so we have to use the actual resolved
				 * address mblk - which holds the real
				 * dl_unitdata_req with the resolved address.
				 *
				 * Doing this is the same behavior as was
				 * previously used in the v4 ARP case.
				 */
				mutex_enter(&nce->nce_lock);
				if (nce->nce_res_mp != NULL)
					freemsg(nce->nce_res_mp);
				nce->nce_res_mp = mp1;
				mutex_exit(&nce->nce_lock);
				/*
				 * We do a fastpath probe here because
				 * we have resolved the address without
				 * using Neighbor Discovery.
				 * In the non-XRESOLV v6 case, the fastpath
				 * probe is done right after neighbor
				 * discovery completes.
				 */
				if (nce->nce_res_mp != NULL) {
					int res;
					nce_fastpath_list_add(nce);
					res = ill_fastpath_probe(ill,
					    nce->nce_res_mp);
					if (res != 0 && res != EAGAIN)
						nce_fastpath_list_delete(nce);
				}

				ire_add_then_send(q, ire, mp);
				/*
				 * Now we have to clean out any packets
				 * that may have been queued on the nce
				 * while it was waiting for address resolution
				 * to complete.
				 */
				mutex_enter(&nce->nce_lock);
				mp1 = nce->nce_qd_mp;
				nce->nce_qd_mp = NULL;
				mutex_exit(&nce->nce_lock);
				while (mp1 != NULL) {
					mblk_t *nxt_mp;
					queue_t *fwdq = NULL;
					ill_t   *inbound_ill;
					uint_t ifindex;

					nxt_mp = mp1->b_next;
					mp1->b_next = NULL;
					/*
					 * Retrieve ifindex stored in
					 * ip_rput_data_v6()
					 */
					ifindex =
					    (uint_t)(uintptr_t)mp1->b_prev;
					inbound_ill =
					    ill_lookup_on_ifindex(ifindex,
					    B_TRUE, NULL, NULL, NULL,
					    NULL);
					mp1->b_prev = NULL;
					if (inbound_ill != NULL)
						fwdq = inbound_ill->ill_rq;

					if (fwdq != NULL) {
						put(fwdq, mp1);
						ill_refrele(inbound_ill);
					} else
						put(WR(ill->ill_rq), mp1);
					mp1 = nxt_mp;
				}
				NCE_REFRELE(nce);
			} else {	/* nce is NULL; clean up */
				ire_delete(ire);
				freemsg(mp);
				freemsg(mp1);
				return;
			}
		} else {
			ire->ire_dlureq_mp = mp1;
			ire_add_then_send(q, ire, mp);
		}
		return;	/* All is well, the packet has been sent. */
	}
	default:
		break;
	}
	/* Fell out of the switch: pass on if module, else drop. */
	if (q->q_next) {
		putnext(q, mp);
	} else
		freemsg(mp);
}

/*
 * Process IP options in an outbound packet. Modify the destination if there
 * is a source route option.
 * Returns non-zero if something fails in which case an ICMP error has been
 * sent and mp freed.
 */
static int
ip_wput_options(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha,
    boolean_t mctl_present, zoneid_t zoneid)
{
	ipoptp_t	opts;
	uchar_t		*opt;
	uint8_t		optval;
	uint8_t		optlen;
	ipaddr_t	dst;
	intptr_t	code = 0;
	mblk_t		*mp;
	ire_t		*ire = NULL;

	ip2dbg(("ip_wput_options\n"));
	/* When an M_CTL (ipsec info) leads the chain, the packet follows it. */
	mp = ipsec_mp;
	if (mctl_present) {
		mp = ipsec_mp->b_cont;
	}

	dst = ipha->ipha_dst;
	/* Walk all IP options; ipoptp_* flags any malformed option. */
	for (optval = ipoptp_first(&opts, ipha);
	    optval != IPOPT_EOL;
	    optval = ipoptp_next(&opts)) {
		opt = opts.ipoptp_cur;
		optlen = opts.ipoptp_len;
		ip2dbg(("ip_wput_options: opt %d, len %d\n",
		    optval, optlen));
		switch (optval) {
			uint32_t off;
		case IPOPT_SSRR:
		case IPOPT_LSRR:
			if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
				ip1dbg((
				    "ip_wput_options: bad option offset\n"));
				/* code = offset of the bad octet, for ICMP */
				code = (char *)&opt[IPOPT_OLEN] -
				    (char *)ipha;
				goto param_prob;
			}
			off = opt[IPOPT_OFFSET];
			ip1dbg(("ip_wput_options: next hop 0x%x\n",
			    ntohl(dst)));
			/*
			 * For strict: verify that dst is directly
			 * reachable.
			 */
			if (optval == IPOPT_SSRR) {
				ire = ire_ftable_lookup(dst, 0, 0,
				    IRE_INTERFACE, NULL, NULL, ALL_ZONES, 0,
				    MBLK_GETLABEL(mp),
				    MATCH_IRE_TYPE | MATCH_IRE_SECATTR);
				if (ire == NULL) {
					ip1dbg(("ip_wput_options: SSRR not"
					    " directly reachable: 0x%x\n",
					    ntohl(dst)));
					goto bad_src_route;
				}
				ire_refrele(ire);
			}
			break;
		case IPOPT_RR:
			if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
				ip1dbg((
				    "ip_wput_options: bad option offset\n"));
				code = (char *)&opt[IPOPT_OLEN] -
				    (char *)ipha;
				goto param_prob;
			}
			break;
		case IPOPT_TS:
			/*
			 * Verify that length >=5 and that there is either
			 * room for another timestamp or that the overflow
			 * counter is not maxed out.
			 */
			code = (char *)&opt[IPOPT_OLEN] - (char *)ipha;
			if (optlen < IPOPT_MINLEN_IT) {
				goto param_prob;
			}
			if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
				ip1dbg((
				    "ip_wput_options: bad option offset\n"));
				code = (char *)&opt[IPOPT_OFFSET] -
				    (char *)ipha;
				goto param_prob;
			}
			/* Entry size depends on the timestamp flag variant. */
			switch (opt[IPOPT_POS_OV_FLG] & 0x0F) {
			case IPOPT_TS_TSONLY:
				off = IPOPT_TS_TIMELEN;
				break;
			case IPOPT_TS_TSANDADDR:
			case IPOPT_TS_PRESPEC:
			case IPOPT_TS_PRESPEC_RFC791:
				off = IP_ADDR_LEN + IPOPT_TS_TIMELEN;
				break;
			default:
				code = (char *)&opt[IPOPT_POS_OV_FLG] -
				    (char *)ipha;
				goto param_prob;
			}
			if (opt[IPOPT_OFFSET] - 1 + off > optlen &&
			    (opt[IPOPT_POS_OV_FLG] & 0xF0) == 0xF0) {
				/*
				 * No room and the overflow counter is 15
				 * already.
				 */
				goto param_prob;
			}
			break;
		}
	}

	if ((opts.ipoptp_flags & IPOPTP_ERROR) == 0)
		return (0);

	ip1dbg(("ip_wput_options: error processing IP options."));
	code = (char *)&opt[IPOPT_OFFSET] - (char *)ipha;

param_prob:
	/*
	 * Since ip_wput() isn't close to finished, we fill
	 * in enough of the header for credible error reporting.
	 */
	if (ip_hdr_complete((ipha_t *)mp->b_rptr, zoneid)) {
		/* Failed */
		freemsg(ipsec_mp);
		return (-1);
	}
	icmp_param_problem(q, ipsec_mp, (uint8_t)code);
	return (-1);

bad_src_route:
	/*
	 * Since ip_wput() isn't close to finished, we fill
	 * in enough of the header for credible error reporting.
	 */
	if (ip_hdr_complete((ipha_t *)mp->b_rptr, zoneid)) {
		/* Failed */
		freemsg(ipsec_mp);
		return (-1);
	}
	icmp_unreachable(q, ipsec_mp, ICMP_SOURCE_ROUTE_FAILED);
	return (-1);
}

/*
 * The maximum value of conn_drain_list_cnt is CONN_MAXDRAINCNT.
 * conn_drain_list_cnt can be changed by setting conn_drain_nthreads
 * thru /etc/system.
 */
#define	CONN_MAXDRAINCNT	64

static void
conn_drain_init(void)
{
	int i;

	/* Start from the administrator-tunable value (0 = pick a default). */
	conn_drain_list_cnt = conn_drain_nthreads;

	if ((conn_drain_list_cnt == 0) ||
	    (conn_drain_list_cnt > CONN_MAXDRAINCNT)) {
		/*
		 * Default value of the number of drainers is the
		 * number of cpus, subject to maximum of 8 drainers.
		 */
		if (boot_max_ncpus != -1)
			conn_drain_list_cnt = MIN(boot_max_ncpus, 8);
		else
			conn_drain_list_cnt = MIN(max_ncpus, 8);
	}

	conn_drain_list = kmem_zalloc(conn_drain_list_cnt * sizeof (idl_t),
	    KM_SLEEP);

	for (i = 0; i < conn_drain_list_cnt; i++) {
		mutex_init(&conn_drain_list[i].idl_lock, NULL,
		    MUTEX_DEFAULT, NULL);
	}
}

/* Tear down the drain lists built by conn_drain_init(). */
static void
conn_drain_fini(void)
{
	int i;

	for (i = 0; i < conn_drain_list_cnt; i++)
		mutex_destroy(&conn_drain_list[i].idl_lock);
	kmem_free(conn_drain_list, conn_drain_list_cnt * sizeof (idl_t));
	conn_drain_list = NULL;
}

/*
 * Note: For an overview of how flowcontrol is handled in IP please see the
 * IP Flowcontrol notes at the top of this file.
 *
 * Flow control has blocked us from proceeding. Insert the given conn in one
 * of the conn drain lists. These conn wq's will be qenabled later on when
 * STREAMS flow control does a backenable. conn_walk_drain will enable
 * the first conn in each of these drain lists. Each of these qenabled conns
 * in turn enables the next in the list, after it runs, or when it closes,
 * thus sustaining the drain process.
 *
 * The only possible calling sequence is ip_wsrv (on conn) -> ip_wput ->
 * conn_drain_insert. Thus there can be only 1 instance of conn_drain_insert
 * running at any time, on a given conn, since there can be only 1 service proc
 * running on a queue at any time.
 */
void
conn_drain_insert(conn_t *connp)
{
	idl_t	*idl;
	uint_t	index;

	mutex_enter(&connp->conn_lock);
	if (connp->conn_state_flags & CONN_CLOSING) {
		/*
		 * The conn is closing as a result of which CONN_CLOSING
		 * is set. Return.
		 */
		mutex_exit(&connp->conn_lock);
		return;
	} else if (connp->conn_idl == NULL) {
		/*
		 * Assign the next drain list round robin. We dont' use
		 * a lock, and thus it may not be strictly round robin.
		 * Atomicity of load/stores is enough to make sure that
		 * conn_drain_list_index is always within bounds.
		 */
		index = conn_drain_list_index;
		ASSERT(index < conn_drain_list_cnt);
		connp->conn_idl = &conn_drain_list[index];
		index++;
		if (index == conn_drain_list_cnt)
			index = 0;
		conn_drain_list_index = index;
	}
	mutex_exit(&connp->conn_lock);

	mutex_enter(CONN_DRAIN_LIST_LOCK(connp));
	if ((connp->conn_drain_prev != NULL) ||
	    (connp->conn_state_flags & CONN_CLOSING)) {
		/*
		 * The conn is already in the drain list, OR
		 * the conn is closing. We need to check again for
		 * the closing case again since close can happen
		 * after we drop the conn_lock, and before we
		 * acquire the CONN_DRAIN_LIST_LOCK.
		 */
		mutex_exit(CONN_DRAIN_LIST_LOCK(connp));
		return;
	} else {
		idl = connp->conn_idl;
	}

	/*
	 * The conn is not in the drain list. Insert it at the
	 * tail of the drain list. The drain list is circular
	 * and doubly linked. idl_conn points to the 1st element
	 * in the list.
	 */
	if (idl->idl_conn == NULL) {
		/* Empty list: the conn becomes the sole, self-linked entry. */
		idl->idl_conn = connp;
		connp->conn_drain_next = connp;
		connp->conn_drain_prev = connp;
	} else {
		conn_t *head = idl->idl_conn;

		connp->conn_drain_next = head;
		connp->conn_drain_prev = head->conn_drain_prev;
		head->conn_drain_prev->conn_drain_next = connp;
		head->conn_drain_prev = connp;
	}
	mutex_exit(CONN_DRAIN_LIST_LOCK(connp));
}

/*
 * This conn is closing, and we are called from ip_close.
OR
 * This conn has been serviced by ip_wsrv, and we need to do the tail
 * processing.
 * If this conn is part of the drain list, we may need to sustain the drain
 * process by qenabling the next conn in the drain list. We may also need to
 * remove this conn from the list, if it is done.
 */
static void
conn_drain_tail(conn_t *connp, boolean_t closing)
{
	idl_t *idl;

	/*
	 * connp->conn_idl is stable at this point, and no lock is needed
	 * to check it. If we are called from ip_close, close has already
	 * set CONN_CLOSING, thus freezing the value of conn_idl, and
	 * called us only because conn_idl is non-null. If we are called thru
	 * service, conn_idl could be null, but it cannot change because
	 * service is single-threaded per queue, and there cannot be another
	 * instance of service trying to call conn_drain_insert on this conn
	 * now.
	 */
	ASSERT(!closing || (connp->conn_idl != NULL));

	/*
	 * If connp->conn_idl is null, the conn has not been inserted into any
	 * drain list even once since creation of the conn. Just return.
	 */
	if (connp->conn_idl == NULL)
		return;

	mutex_enter(CONN_DRAIN_LIST_LOCK(connp));

	if (connp->conn_drain_prev == NULL) {
		/* This conn is currently not in the drain list. */
		mutex_exit(CONN_DRAIN_LIST_LOCK(connp));
		return;
	}
	idl = connp->conn_idl;
	if (idl->idl_conn_draining == connp) {
		/*
		 * This conn is the current drainer. If this is the last conn
		 * in the drain list, we need to do more checks, in the 'if'
		 * below. Otherwise we need to just qenable the next conn,
		 * to sustain the draining, and is handled in the 'else'
		 * below.
		 */
		if (connp->conn_drain_next == idl->idl_conn) {
			/*
			 * This conn is the last in this list. This round
			 * of draining is complete. If idl_repeat is set,
			 * it means another flow enabling has happened from
			 * the driver/streams and we need another round
			 * of draining.
			 * If there are more than 2 conns in the drain list,
			 * do a left rotate by 1, so that all conns except the
			 * conn at the head move towards the head by 1, and
			 * the conn at the head goes to the tail. This attempts
			 * a more even share for all queues that are being
			 * drained.
			 */
			if ((connp->conn_drain_next != connp) &&
			    (idl->idl_conn->conn_drain_next != connp)) {
				idl->idl_conn = idl->idl_conn->conn_drain_next;
			}
			if (idl->idl_repeat) {
				qenable(idl->idl_conn->conn_wq);
				idl->idl_conn_draining = idl->idl_conn;
				idl->idl_repeat = 0;
			} else {
				idl->idl_conn_draining = NULL;
			}
		} else {
			/*
			 * If the next queue that we are now qenable'ing,
			 * is closing, it will remove itself from this list
			 * and qenable the subsequent queue in ip_close().
			 * Serialization is achieved thru idl_lock.
			 */
			qenable(connp->conn_drain_next->conn_wq);
			idl->idl_conn_draining = connp->conn_drain_next;
		}
	}
	if (!connp->conn_did_putbq || closing) {
		/*
		 * Remove ourself from the drain list, if we did not do
		 * a putbq, or if the conn is closing.
		 * Note: It is possible that q->q_first is non-null. It means
		 * that these messages landed after we did a enableok() in
		 * ip_wsrv. Thus STREAMS will call ip_wsrv once again to
		 * service them.
		 */
		if (connp->conn_drain_next == connp) {
			/* Singleton in the list */
			ASSERT(connp->conn_drain_prev == connp);
			idl->idl_conn = NULL;
			idl->idl_conn_draining = NULL;
		} else {
			connp->conn_drain_prev->conn_drain_next =
			    connp->conn_drain_next;
			connp->conn_drain_next->conn_drain_prev =
			    connp->conn_drain_prev;
			if (idl->idl_conn == connp)
				idl->idl_conn = connp->conn_drain_next;
			ASSERT(idl->idl_conn_draining != connp);

		}
		connp->conn_drain_next = NULL;
		connp->conn_drain_prev = NULL;
	}
	mutex_exit(CONN_DRAIN_LIST_LOCK(connp));
}

/*
 * Write service routine. Shared perimeter entry point.
 * ip_wsrv can be called in any of the following ways.
 * 1. The device queue's messages has fallen below the low water mark
 *    and STREAMS has backenabled the ill_wq. We walk thru all the
 *    the drain lists and backenable the first conn in each list.
 * 2. The above causes STREAMS to run ip_wsrv on the conn_wq of the
 *    qenabled non-tcp upper layers. We start dequeueing messages and call
 *    ip_wput for each message.
 */

void
ip_wsrv(queue_t *q)
{
	conn_t	*connp;
	ill_t	*ill;
	mblk_t	*mp;

	if (q->q_next) {
		/* Module instance (ill write queue), not a device conn. */
		ill = (ill_t *)q->q_ptr;
		if (ill->ill_state_flags == 0) {
			/*
			 * The device flow control has opened up.
			 * Walk through conn drain lists and qenable the
			 * first conn in each list. This makes sense only
			 * if the stream is fully plumbed and setup.
			 * Hence the if check above.
			 */
			ip1dbg(("ip_wsrv: walking\n"));
			conn_walk_drain();
		}
		return;
	}

	connp = Q_TO_CONN(q);
	ip1dbg(("ip_wsrv: %p %p\n", (void *)q, (void *)connp));

	/*
	 * 1. Set conn_draining flag to signal that service is active.
	 *
	 * 2. ip_output determines whether it has been called from service,
	 *    based on the last parameter. If it is IP_WSRV it concludes it
	 *    has been called from service.
	 *
	 * 3. Message ordering is preserved by the following logic.
	 *    i. A directly called ip_output (i.e. not thru service) will queue
	 *    the message at the tail, if conn_draining is set (i.e. service
	 *    is running) or if q->q_first is non-null.
	 *
	 *    ii. If ip_output is called from service, and if ip_output cannot
	 *    putnext due to flow control, it does a putbq.
	 *
	 * 4. noenable the queue so that a putbq from ip_wsrv does not reenable
	 *    (causing an infinite loop).
	 */
	ASSERT(!connp->conn_did_putbq);
	while ((q->q_first != NULL) && !connp->conn_did_putbq) {
		connp->conn_draining = 1;
		noenable(q);
		while ((mp = getq(q)) != NULL) {
			ip_output(Q_TO_CONN(q), mp, q, IP_WSRV);
			if (connp->conn_did_putbq) {
				/* ip_wput did a putbq */
				break;
			}
		}
		/*
		 * At this point, a thread coming down from top, calling
		 * ip_wput, may end up queueing the message. We have not yet
		 * enabled the queue, so ip_wsrv won't be called again.
		 * To avoid this race, check q->q_first again (in the loop)
		 * If the other thread queued the message before we call
		 * enableok(), we will catch it in the q->q_first check.
		 * If the other thread queues the message after we call
		 * enableok(), ip_wsrv will be called again by STREAMS.
		 */
		connp->conn_draining = 0;
		enableok(q);
	}

	/* Enable the next conn for draining */
	conn_drain_tail(connp, B_FALSE);

	connp->conn_did_putbq = 0;
}

/*
 * Walk the list of all conn's calling the function provided with the
 * specified argument for each. Note that this only walks conn's that
 * have been bound.
 * Applies to both IPv4 and IPv6.
 */
static void
conn_walk_fanout(pfv_t func, void *arg, zoneid_t zoneid)
{
	/* Visit each fanout table that can contain a bound conn. */
	conn_walk_fanout_table(ipcl_udp_fanout, ipcl_udp_fanout_size,
	    func, arg, zoneid);
	conn_walk_fanout_table(ipcl_conn_fanout, ipcl_conn_fanout_size,
	    func, arg, zoneid);
	conn_walk_fanout_table(ipcl_bind_fanout, ipcl_bind_fanout_size,
	    func, arg, zoneid);
	conn_walk_fanout_table(ipcl_proto_fanout,
	    A_CNT(ipcl_proto_fanout), func, arg, zoneid);
	conn_walk_fanout_table(ipcl_proto_fanout_v6,
	    A_CNT(ipcl_proto_fanout_v6), func, arg, zoneid);
}

/*
 * Flowcontrol has relieved, and STREAMS has backenabled us. For each list
 * of conns that need to be drained, check if drain is already in progress.
 * If so set the idl_repeat bit, indicating that the last conn in the list
 * needs to reinitiate the drain once again, for the list. If drain is not
 * in progress for the list, initiate the draining, by qenabling the 1st
 * conn in the list. The drain is self-sustaining, each qenabled conn will
 * in turn qenable the next conn, when it is done/blocked/closing.
 */
static void
conn_walk_drain(void)
{
	int i;
	idl_t *idl;

	IP_STAT(ip_conn_walk_drain);

	for (i = 0; i < conn_drain_list_cnt; i++) {
		idl = &conn_drain_list[i];
		mutex_enter(&idl->idl_lock);
		if (idl->idl_conn == NULL) {
			/* Nothing queued on this drain list. */
			mutex_exit(&idl->idl_lock);
			continue;
		}
		/*
		 * If this list is not being drained currently by
		 * an ip_wsrv thread, start the process.
		 */
		if (idl->idl_conn_draining == NULL) {
			ASSERT(idl->idl_repeat == 0);
			qenable(idl->idl_conn->conn_wq);
			idl->idl_conn_draining = idl->idl_conn;
		} else {
			idl->idl_repeat = 1;
		}
		mutex_exit(&idl->idl_lock);
	}
}

/*
 * Walk a conn hash table of `count' buckets, calling func for each entry.
 */
static void
conn_walk_fanout_table(connf_t *connfp, uint_t count, pfv_t func, void *arg,
    zoneid_t zoneid)
{
	conn_t	*connp;

	while (count-- > 0) {
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (zoneid == GLOBAL_ZONEID ||
			    zoneid == connp->conn_zoneid) {
				/*
				 * Hold the conn so it cannot go away while
				 * the bucket lock is dropped around the
				 * callback.
				 */
				CONN_INC_REF(connp);
				mutex_exit(&connfp->connf_lock);
				(*func)(connp, arg);
				mutex_enter(&connfp->connf_lock);
				CONN_DEC_REF(connp);
			}
		}
		mutex_exit(&connfp->connf_lock);
		connfp++;
	}
}

/*
 * ipcl_walk routine invoked for ip_conn_report for each conn.
 * Appends one formatted line describing the conn to the mblk passed
 * as 'mp'; silently stops appending once the buffer is full.
 */
static void
conn_report1(conn_t *connp, void *mp)
{
	char	buf1[INET6_ADDRSTRLEN];
	char	buf2[INET6_ADDRSTRLEN];
	uint_t	print_len, buf_len;

	ASSERT(connp != NULL);

	buf_len = ((mblk_t *)mp)->b_datap->db_lim - ((mblk_t *)mp)->b_wptr;
	/*
	 * NOTE(review): buf_len is unsigned, so "<= 0" only catches a full
	 * buffer (== 0); a negative db_lim - b_wptr difference would wrap
	 * to a huge value. Presumably b_wptr never exceeds db_lim here --
	 * confirm against the allocator in ip_conn_report.
	 */
	if (buf_len <= 0)
		return;
	/*
	 * The comma operators below deliberately chain the two inet_ntop
	 * calls and the snprintf assignment into a single statement.
	 */
	(void) inet_ntop(AF_INET6, &connp->conn_srcv6, buf1, sizeof (buf1)),
	(void) inet_ntop(AF_INET6, &connp->conn_remv6, buf2, sizeof (buf2)),
	print_len = snprintf((char *)((mblk_t *)mp)->b_wptr, buf_len,
	    MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR
	    "%5d %s/%05d %s/%05d\n",
	    (void *)connp, (void *)CONNP_TO_RQ(connp),
	    (void *)CONNP_TO_WQ(connp), connp->conn_zoneid,
	    buf1, connp->conn_lport,
	    buf2, connp->conn_fport);
	if (print_len < buf_len) {
		((mblk_t *)mp)->b_wptr += print_len;
	} else {
		/* snprintf truncated; advance only to the end of the buf. */
		((mblk_t *)mp)->b_wptr += buf_len;
	}
}

/*
 * Named Dispatch routine to produce a formatted report on all conns
 * that are listed in one of the fanout tables.
 * This report is accessed by using the ndd utility to "get" ND variable
 * "ip_conn_status".
 */
/* ARGSUSED */
static int
ip_conn_report(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr)
{
	/* Column headers for the report appended below by conn_report1. */
	(void) mi_mpprintf(mp,
	    "CONN " MI_COL_HDRPAD_STR
	    "rfq " MI_COL_HDRPAD_STR
	    "stq " MI_COL_HDRPAD_STR
	    " zone local remote");

	/*
	 * Because of the ndd constraint, at most we can have 64K buffer
	 * to put in all conn info. So to be more efficient, just
	 * allocate a 64K buffer here, assuming we need that large buffer.
	 * This should be OK as only privileged processes can do ndd /dev/ip.
	 */
	if ((mp->b_cont = allocb(ND_MAX_BUF_LEN, BPRI_HI)) == NULL) {
		/* The following may work even if we cannot get a large buf. */
		(void) mi_mpprintf(mp, "<< Out of buffer >>\n");
		return (0);
	}

	conn_walk_fanout(conn_report1, mp->b_cont, Q_TO_CONN(q)->conn_zoneid);
	return (0);
}

/*
 * Determine if the ill and multicast aspects of that packets
 * "matches" the conn.
 */
boolean_t
conn_wantpacket(conn_t *connp, ill_t *ill, ipha_t *ipha, int fanout_flags,
    zoneid_t zoneid)
{
	ill_t *in_ill;
	boolean_t found;
	ipif_t *ipif;
	ire_t *ire;
	ipaddr_t dst, src;

	dst = ipha->ipha_dst;
	src = ipha->ipha_src;

	/*
	 * conn_incoming_ill is set by IP_BOUND_IF which limits
	 * unicast, broadcast and multicast reception to
	 * conn_incoming_ill. conn_wantpacket itself is called
	 * only for BROADCAST and multicast.
	 *
	 * 1) ip_rput suppresses duplicate broadcasts if the ill
	 *    is part of a group. Hence, we should be receiving
	 *    just one copy of broadcast for the whole group.
	 *    Thus, if it is part of the group the packet could
	 *    come on any ill of the group and hence we need a
	 *    match on the group. Otherwise, match on ill should
	 *    be sufficient.
	 *
	 * 2) ip_rput does not suppress duplicate multicast packets.
	 *    If there are two interfaces in a ill group and we have
	 *    2 applications (conns) joined a multicast group G on
	 *    both the interfaces, ilm_lookup_ill filter in ip_rput
	 *    will give us two packets because we join G on both the
	 *    interfaces rather than nominating just one interface
	 *    for receiving multicast like broadcast above. So,
	 *    we have to call ilg_lookup_ill to filter out duplicate
	 *    copies, if ill is part of a group.
	 */
	in_ill = connp->conn_incoming_ill;
	if (in_ill != NULL) {
		if (in_ill->ill_group == NULL) {
			if (in_ill != ill)
				return (B_FALSE);
		} else if (in_ill->ill_group != ill->ill_group) {
			return (B_FALSE);
		}
	}

	if (!CLASSD(dst)) {
		/* Broadcast (non class-D) destination. */
		if (connp->conn_zoneid == zoneid)
			return (B_TRUE);
		/*
		 * The conn is in a different zone; we need to check that this
		 * broadcast address is configured in the application's zone
		 * and on one ill in the group.
		 */
		ipif = ipif_get_next_ipif(NULL, ill);
		if (ipif == NULL)
			return (B_FALSE);
		ire = ire_ctable_lookup(dst, 0, IRE_BROADCAST, ipif,
		    connp->conn_zoneid, NULL,
		    (MATCH_IRE_TYPE | MATCH_IRE_ILL_GROUP));
		ipif_refrele(ipif);
		if (ire != NULL) {
			ire_refrele(ire);
			return (B_TRUE);
		} else {
			return (B_FALSE);
		}
	}

	if ((fanout_flags & IP_FF_NO_MCAST_LOOP) &&
	    connp->conn_zoneid == zoneid) {
		/*
		 * Loopback case: the sending endpoint has IP_MULTICAST_LOOP
		 * disabled, therefore we don't dispatch the multicast packet
		 * to the sending zone.
		 */
		return (B_FALSE);
	}

	if ((ill->ill_phyint->phyint_flags & PHYI_LOOPBACK) &&
	    connp->conn_zoneid != zoneid) {
		/*
		 * Multicast packet on the loopback interface: we only match
		 * conns who joined the group in the specified zone.
		 */
		return (B_FALSE);
	}

	if (connp->conn_multi_router) {
		/* multicast packet and multicast router socket: send up */
		return (B_TRUE);
	}

	mutex_enter(&connp->conn_lock);
	found = (ilg_lookup_ill_withsrc(connp, dst, src, ill) != NULL);
	mutex_exit(&connp->conn_lock);
	return (found);
}

/*
 * Finish processing of "arp_up" when AR_DLPIOP_DONE is received from arp.
 */
/* ARGSUSED */
static void
ip_arp_done(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp, void *dummy_arg)
{
	ill_t	*ill = (ill_t *)q->q_ptr;
	mblk_t	*mp1, *mp2;
	ipif_t	*ipif;
	int	err = 0;
	conn_t	*connp = NULL;
	ipsq_t	*ipsq;
	arc_t	*arc;

	ip1dbg(("ip_arp_done(%s)\n", ill->ill_name));

	ASSERT((mp->b_wptr - mp->b_rptr) >= sizeof (arc_t));
	ASSERT(((arc_t *)mp->b_rptr)->arc_cmd == AR_DLPIOP_DONE);

	ASSERT(IAM_WRITER_ILL(ill));
	mp2 = mp->b_cont;
	mp->b_cont = NULL;

	/*
	 * We have now received the arp bringup completion message
	 * from ARP. Mark the arp bringup as done. Also if the arp
	 * stream has already started closing, send up the AR_ARP_CLOSING
	 * ack now since ARP is waiting in close for this ack.
	 */
	mutex_enter(&ill->ill_lock);
	ill->ill_arp_bringup_pending = 0;
	if (ill->ill_arp_closing) {
		mutex_exit(&ill->ill_lock);
		/* Let's reuse the mp for sending the ack */
		arc = (arc_t *)mp->b_rptr;
		mp->b_wptr = mp->b_rptr + sizeof (arc_t);
		arc->arc_cmd = AR_ARP_CLOSING;
		qreply(q, mp);
	} else {
		mutex_exit(&ill->ill_lock);
		freeb(mp);
	}

	/* We should have an IOCTL waiting on this. */
	ipsq = ill->ill_phyint->phyint_ipsq;
	ipif = ipsq->ipsq_pending_ipif;
	mp1 = ipsq_pending_mp_get(ipsq, &connp);
	/* mp1 and ipif must be both set or both NULL. */
	ASSERT(!((mp1 != NULL) ^ (ipif != NULL)));
	if (mp1 == NULL) {
		/* bringup was aborted by the user */
		freemsg(mp2);
		return;
	}
	ASSERT(connp != NULL);
	q = CONNP_TO_WQ(connp);
	/*
	 * If the DL_BIND_REQ fails, it is noted
	 * in arc_name_offset.
	 */
	err = *((int *)mp2->b_rptr);
	if (err == 0) {
		if (ipif->ipif_isv6) {
			if ((err = ipif_up_done_v6(ipif)) != 0)
				ip0dbg(("ip_arp_done: init failed\n"));
		} else {
			if ((err = ipif_up_done(ipif)) != 0)
				ip0dbg(("ip_arp_done: init failed\n"));
		}
	} else {
		ip0dbg(("ip_arp_done: DL_BIND_REQ failed\n"));
	}

	freemsg(mp2);

	if ((err == 0) && (ill->ill_up_ipifs)) {
		err = ill_up_ipifs(ill, q, mp1);
		if (err == EINPROGRESS)
			return;
	}

	if (ill->ill_up_ipifs) {
		ill_group_cleanup(ill);
	}

	/*
	 * The ioctl must complete now without EINPROGRESS
	 * since ipsq_pending_mp_get has removed the ioctl mblk
	 * from ipsq_pending_mp. Otherwise the ioctl will be
	 * stuck for ever in the ipsq.
	 */
	ASSERT(err != EINPROGRESS);
	ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipif, ipsq);
}

/*
 * Allocate the private structure (ip_priv_t) used to carry IPP callout
 * state on a packet. Returns 0 on success or ENOMEM.
 */
static int
ip_priv_alloc(void **bufp)
{
	void	*buf;

	if ((buf = kmem_alloc(sizeof (ip_priv_t), KM_NOSLEEP)) == NULL)
		return (ENOMEM);

	*bufp = buf;
	return (0);
}

/* Function to delete the private structure */
void
ip_priv_free(void *buf)
{
	ASSERT(buf != NULL);
	kmem_free(buf, sizeof (ip_priv_t));
}

/*
 * The entry point for IPPF processing.
 * If the classifier (IPGPC_CLASSIFY) is not loaded and configured, the
 * routine just returns.
 *
 * When called, ip_process generates an ipp_packet_t structure
 * which holds the state information for this packet and invokes the
 * the classifier (via ipp_packet_process). The classification, depending on
 * configured filters, results in a list of actions for this packet. Invoking
 * an action may cause the packet to be dropped, in which case the resulting
 * mblk (*mpp) is NULL. proc indicates the callout position for
 * this packet and ill_index is the interface this packet on or will leave
 * on (inbound and outbound resp.).
26427 */ 26428 void 26429 ip_process(ip_proc_t proc, mblk_t **mpp, uint32_t ill_index) 26430 { 26431 mblk_t *mp; 26432 ip_priv_t *priv; 26433 ipp_action_id_t aid; 26434 int rc = 0; 26435 ipp_packet_t *pp; 26436 #define IP_CLASS "ip" 26437 26438 /* If the classifier is not loaded, return */ 26439 if ((aid = ipp_action_lookup(IPGPC_CLASSIFY)) == IPP_ACTION_INVAL) { 26440 return; 26441 } 26442 26443 mp = *mpp; 26444 ASSERT(mp != NULL); 26445 26446 /* Allocate the packet structure */ 26447 rc = ipp_packet_alloc(&pp, IP_CLASS, aid); 26448 if (rc != 0) { 26449 *mpp = NULL; 26450 freemsg(mp); 26451 return; 26452 } 26453 26454 /* Allocate the private structure */ 26455 rc = ip_priv_alloc((void **)&priv); 26456 if (rc != 0) { 26457 *mpp = NULL; 26458 freemsg(mp); 26459 ipp_packet_free(pp); 26460 return; 26461 } 26462 priv->proc = proc; 26463 priv->ill_index = ill_index; 26464 ipp_packet_set_private(pp, priv, ip_priv_free); 26465 ipp_packet_set_data(pp, mp); 26466 26467 /* Invoke the classifier */ 26468 rc = ipp_packet_process(&pp); 26469 if (pp != NULL) { 26470 mp = ipp_packet_get_data(pp); 26471 ipp_packet_free(pp); 26472 if (rc != 0) { 26473 freemsg(mp); 26474 *mpp = NULL; 26475 } 26476 } else { 26477 *mpp = NULL; 26478 } 26479 #undef IP_CLASS 26480 } 26481 26482 /* 26483 * Propagate a multicast group membership operation (add/drop) on 26484 * all the interfaces crossed by the related multirt routes. 26485 * The call is considered successful if the operation succeeds 26486 * on at least one interface. 
 */
static int
ip_multirt_apply_membership(int (*fn)(conn_t *, boolean_t, ipaddr_t, ipaddr_t,
    uint_t *, mcast_record_t, ipaddr_t, mblk_t *), ire_t *ire, conn_t *connp,
    boolean_t checkonly, ipaddr_t group, mcast_record_t fmode, ipaddr_t src,
    mblk_t *first_mp)
{
	ire_t		*ire_gw;
	irb_t		*irb;
	int		error = 0;
	opt_restart_t	*or;

	irb = ire->ire_bucket;
	ASSERT(irb != NULL);

	ASSERT(DB_TYPE(first_mp) == M_CTL);

	or = (opt_restart_t *)first_mp->b_rptr;
	/* Hold the bucket so ires cannot be freed while we walk the chain. */
	IRB_REFHOLD(irb);
	for (; ire != NULL; ire = ire->ire_next) {
		if ((ire->ire_flags & RTF_MULTIRT) == 0)
			continue;
		if (ire->ire_addr != group)
			continue;

		ire_gw = ire_ftable_lookup(ire->ire_gateway_addr, 0, 0,
		    IRE_INTERFACE, NULL, NULL, ALL_ZONES, 0, NULL,
		    MATCH_IRE_RECURSIVE | MATCH_IRE_TYPE);
		/* No resolver exists for the gateway; skip this ire. */
		if (ire_gw == NULL)
			continue;

		/*
		 * This function can return EINPROGRESS. If so the operation
		 * will be restarted from ip_restart_optmgmt which will
		 * call ip_opt_set and option processing will restart for
		 * this option. So we may end up calling 'fn' more than once.
		 * This requires that 'fn' is idempotent except for the
		 * return value. The operation is considered a success if
		 * it succeeds at least once on any one interface.
		 */
		error = fn(connp, checkonly, group, ire_gw->ire_src_addr,
		    NULL, fmode, src, first_mp);
		if (error == 0)
			or->or_private = CGTP_MCAST_SUCCESS;

		if (ip_debug > 0) {
			ulong_t	off;
			char	*ksym;
			ksym = kobj_getsymname((uintptr_t)fn, &off);
			ip2dbg(("ip_multirt_apply_membership: "
			    "called %s, multirt group 0x%08x via itf 0x%08x, "
			    "error %d [success %u]\n",
			    ksym ? ksym : "?",
			    ntohl(group), ntohl(ire_gw->ire_src_addr),
			    error, or->or_private));
		}

		ire_refrele(ire_gw);
		if (error == EINPROGRESS) {
			IRB_REFRELE(irb);
			return (error);
		}
	}
	IRB_REFRELE(irb);
	/*
	 * Consider the call as successful if we succeeded on at least
	 * one interface. Otherwise, return the last encountered error.
	 */
	return (or->or_private == CGTP_MCAST_SUCCESS ? 0 : error);
}


/*
 * Issue a warning regarding a route crossing an interface with an
 * incorrect MTU. Only one message every 'ip_multirt_log_interval'
 * amount of time is logged.
 */
static void
ip_multirt_bad_mtu(ire_t *ire, uint32_t max_frag)
{
	hrtime_t current = gethrtime();
	char buf[16];

	/* Convert interval in ms to hrtime in ns */
	if (multirt_bad_mtu_last_time +
	    ((hrtime_t)ip_multirt_log_interval * (hrtime_t)1000000) <=
	    current) {
		cmn_err(CE_WARN, "ip: ignoring multiroute "
		    "to %s, incorrect MTU %u (expected %u)\n",
		    ip_dot_addr(ire->ire_addr, buf),
		    ire->ire_max_frag, max_frag);

		multirt_bad_mtu_last_time = current;
	}
}


/*
 * Get the CGTP (multirouting) filtering status.
 * If 0, the CGTP hooks are transparent.
 */
/* ARGSUSED */
static int
ip_cgtp_filter_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *ioc_cr)
{
	boolean_t *ip_cgtp_filter_value = (boolean_t *)cp;

	(void) mi_mpprintf(mp, "%d", (int)*ip_cgtp_filter_value);
	return (0);
}


/*
 * Set the CGTP (multirouting) filtering status.
 * If the status is changed from active to transparent
 * or from transparent to active, forward the new status
 * to the filtering module (if loaded).
 */
/* ARGSUSED */
static int
ip_cgtp_filter_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
    cred_t *ioc_cr)
{
	long		new_value;
	boolean_t	*ip_cgtp_filter_value = (boolean_t *)cp;

	/* Only "0" (transparent) and "1" (filtering) are accepted. */
	if (ddi_strtol(value, NULL, 10, &new_value) != 0 ||
	    new_value < 0 || new_value > 1) {
		return (EINVAL);
	}

	/*
	 * Do not enable CGTP filtering - thus preventing the hooks
	 * from being invoked - if the version number of the
	 * filtering module hooks does not match.
	 */
	if ((ip_cgtp_filter_ops != NULL) &&
	    (ip_cgtp_filter_ops->cfo_filter_rev != CGTP_FILTER_REV)) {
		cmn_err(CE_WARN, "IP: CGTP filtering version mismatch "
		    "(module hooks version %d, expecting %d)\n",
		    ip_cgtp_filter_ops->cfo_filter_rev, CGTP_FILTER_REV);
		return (ENOTSUP);
	}

	/* Log state transitions in both directions. */
	if ((!*ip_cgtp_filter_value) && new_value) {
		cmn_err(CE_NOTE, "IP: enabling CGTP filtering%s",
		    ip_cgtp_filter_ops == NULL ?
		    " (module not loaded)" : "");
	}
	if (*ip_cgtp_filter_value && (!new_value)) {
		cmn_err(CE_NOTE, "IP: disabling CGTP filtering%s",
		    ip_cgtp_filter_ops == NULL ?
		    " (module not loaded)" : "");
	}

	if (ip_cgtp_filter_ops != NULL) {
		int	res;
		/* Forward the new state to the filtering module. */
		if ((res = ip_cgtp_filter_ops->cfo_change_state(new_value))) {
			return (res);
		}
	}

	*ip_cgtp_filter_value = (boolean_t)new_value;

	return (0);
}


/*
 * Return the expected CGTP hooks version number.
 */
int
ip_cgtp_filter_supported(void)
{
	return (ip_cgtp_filter_rev);
}


/*
 * CGTP hooks can be registered by directly touching ip_cgtp_filter_ops
 * or by invoking this function. In the first case, the version number
 * of the registered structure is checked at hooks activation time
 * in ip_cgtp_filter_set().
 */
int
ip_cgtp_filter_register(cgtp_filter_ops_t *ops)
{
	/* Reject hook structures built against a different revision. */
	if (ops->cfo_filter_rev != CGTP_FILTER_REV)
		return (ENOTSUP);

	ip_cgtp_filter_ops = ops;
	return (0);
}

/*
 * Map the ip_squeue_enter tunable value to the corresponding squeue
 * entry function; unrecognized values fall back to squeue_fill.
 */
static squeue_func_t
ip_squeue_switch(int val)
{
	squeue_func_t rval = squeue_fill;

	switch (val) {
	case IP_SQUEUE_ENTER_NODRAIN:
		rval = squeue_enter_nodrain;
		break;
	case IP_SQUEUE_ENTER:
		rval = squeue_enter;
		break;
	default:
		break;
	}
	return (rval);
}

/*
 * ndd set routine: parse the new value and switch the global
 * ip_input_proc squeue entry function accordingly.
 */
/* ARGSUSED */
static int
ip_input_proc_set(queue_t *q, mblk_t *mp, char *value,
    caddr_t addr, cred_t *cr)
{
	int *v = (int *)addr;
	long new_value;

	if (ddi_strtol(value, NULL, 10, &new_value) != 0)
		return (EINVAL);

	ip_input_proc = ip_squeue_switch(new_value);
	*v = new_value;
	return (0);
}

/*
 * Generic ndd set routine for an int-valued variable; performs no
 * range checking beyond what ddi_strtol enforces.
 */
/* ARGSUSED */
static int
ip_int_set(queue_t *q, mblk_t *mp, char *value,
    caddr_t addr, cred_t *cr)
{
	int *v = (int *)addr;
	long new_value;

	if (ddi_strtol(value, NULL, 10, &new_value) != 0)
		return (EINVAL);

	*v = new_value;
	return (0);
}

/*
 * Create and install the "ip" mib2 named kstat, seeding the template
 * with the current forwarding/TTL settings and the fixed entry sizes.
 */
static void
ip_kstat_init(void)
{
	ip_named_kstat_t template = {
		{ "forwarding",		KSTAT_DATA_UINT32, 0 },
		{ "defaultTTL",		KSTAT_DATA_UINT32, 0 },
		{ "inReceives",		KSTAT_DATA_UINT32, 0 },
		{ "inHdrErrors",	KSTAT_DATA_UINT32, 0 },
		{ "inAddrErrors",	KSTAT_DATA_UINT32, 0 },
		{ "forwDatagrams",	KSTAT_DATA_UINT32, 0 },
		{ "inUnknownProtos",	KSTAT_DATA_UINT32, 0 },
		{ "inDiscards",		KSTAT_DATA_UINT32, 0 },
		{ "inDelivers",		KSTAT_DATA_UINT32, 0 },
		{ "outRequests",	KSTAT_DATA_UINT32, 0 },
		{ "outDiscards",	KSTAT_DATA_UINT32, 0 },
		{ "outNoRoutes",	KSTAT_DATA_UINT32, 0 },
		{ "reasmTimeout",	KSTAT_DATA_UINT32, 0 },
		{ "reasmReqds",		KSTAT_DATA_UINT32, 0 },
		{ "reasmOKs",		KSTAT_DATA_UINT32, 0 },
		{ "reasmFails",		KSTAT_DATA_UINT32, 0 },
		{ "fragOKs",		KSTAT_DATA_UINT32, 0 },
		{ "fragFails",		KSTAT_DATA_UINT32, 0 },
		{ "fragCreates",	KSTAT_DATA_UINT32, 0 },
		{ "addrEntrySize",	KSTAT_DATA_INT32, 0 },
		{ "routeEntrySize",	KSTAT_DATA_INT32, 0 },
		{ "netToMediaEntrySize",	KSTAT_DATA_INT32, 0 },
		{ "routingDiscards",	KSTAT_DATA_UINT32, 0 },
		{ "inErrs",		KSTAT_DATA_UINT32, 0 },
		{ "noPorts",		KSTAT_DATA_UINT32, 0 },
		{ "inCksumErrs",	KSTAT_DATA_UINT32, 0 },
		{ "reasmDuplicates",	KSTAT_DATA_UINT32, 0 },
		{ "reasmPartDups",	KSTAT_DATA_UINT32, 0 },
		{ "forwProhibits",	KSTAT_DATA_UINT32, 0 },
		{ "udpInCksumErrs",	KSTAT_DATA_UINT32, 0 },
		{ "udpInOverflows",	KSTAT_DATA_UINT32, 0 },
		{ "rawipInOverflows",	KSTAT_DATA_UINT32, 0 },
		{ "ipsecInSucceeded",	KSTAT_DATA_UINT32, 0 },
		{ "ipsecInFailed",	KSTAT_DATA_INT32, 0 },
		{ "memberEntrySize",	KSTAT_DATA_INT32, 0 },
		{ "inIPv6",		KSTAT_DATA_UINT32, 0 },
		{ "outIPv6",		KSTAT_DATA_UINT32, 0 },
		{ "outSwitchIPv6",	KSTAT_DATA_UINT32, 0 },
	};

	ip_mibkp = kstat_create("ip", 0, "ip", "mib2", KSTAT_TYPE_NAMED,
	    NUM_OF_FIELDS(ip_named_kstat_t),
	    0);
	if (!ip_mibkp)
		return;

	/* mib2 ipForwarding convention: 1 == forwarding, 2 == not. */
	template.forwarding.value.ui32 = WE_ARE_FORWARDING ? 1:2;
	template.defaultTTL.value.ui32 = (uint32_t)ip_def_ttl;
	template.reasmTimeout.value.ui32 = ip_g_frag_timeout;
	template.addrEntrySize.value.i32 = sizeof (mib2_ipAddrEntry_t);
	template.routeEntrySize.value.i32 = sizeof (mib2_ipRouteEntry_t);

	template.netToMediaEntrySize.value.i32 =
	    sizeof (mib2_ipNetToMediaEntry_t);

	template.memberEntrySize.value.i32 = sizeof (ipv6_member_t);

	bcopy(&template, ip_mibkp->ks_data, sizeof (template));

	/* Counters are refreshed from ip_mib on each kstat read. */
	ip_mibkp->ks_update = ip_kstat_update;

	kstat_install(ip_mibkp);
}

/*
 * Remove the "ip" mib2 kstat installed by ip_kstat_init().
 */
static void
ip_kstat_fini(void)
{

	if (ip_mibkp != NULL) {
		kstat_delete(ip_mibkp);
		ip_mibkp = NULL;
	}
}

/*
 * kstat update callback: copy the current ip_mib counters into the
 * named kstat. Read-only; writes are rejected with EACCES.
 */
static int
ip_kstat_update(kstat_t *kp, int rw)
{
	ip_named_kstat_t *ipkp;

	if (!kp || !kp->ks_data)
		return (EIO);

	if (rw == KSTAT_WRITE)
		return (EACCES);

	ipkp = (ip_named_kstat_t *)kp->ks_data;

	ipkp->forwarding.value.ui32 =		ip_mib.ipForwarding;
	ipkp->defaultTTL.value.ui32 =		ip_mib.ipDefaultTTL;
	ipkp->inReceives.value.ui32 =		ip_mib.ipInReceives;
	ipkp->inHdrErrors.value.ui32 =		ip_mib.ipInHdrErrors;
	ipkp->inAddrErrors.value.ui32 =		ip_mib.ipInAddrErrors;
	ipkp->forwDatagrams.value.ui32 =	ip_mib.ipForwDatagrams;
	ipkp->inUnknownProtos.value.ui32 =	ip_mib.ipInUnknownProtos;
	ipkp->inDiscards.value.ui32 =		ip_mib.ipInDiscards;
	ipkp->inDelivers.value.ui32 =		ip_mib.ipInDelivers;
	ipkp->outRequests.value.ui32 =		ip_mib.ipOutRequests;
	ipkp->outDiscards.value.ui32 =		ip_mib.ipOutDiscards;
	ipkp->outNoRoutes.value.ui32 =		ip_mib.ipOutNoRoutes;
	ipkp->reasmTimeout.value.ui32 =		ip_mib.ipReasmTimeout;
	ipkp->reasmReqds.value.ui32 =		ip_mib.ipReasmReqds;
	ipkp->reasmOKs.value.ui32 =		ip_mib.ipReasmOKs;
	ipkp->reasmFails.value.ui32 =		ip_mib.ipReasmFails;
	ipkp->fragOKs.value.ui32 =		ip_mib.ipFragOKs;
	ipkp->fragFails.value.ui32 =		ip_mib.ipFragFails;
	ipkp->fragCreates.value.ui32 =		ip_mib.ipFragCreates;

	ipkp->routingDiscards.value.ui32 =	ip_mib.ipRoutingDiscards;
	ipkp->inErrs.value.ui32 =		ip_mib.tcpInErrs;
	ipkp->noPorts.value.ui32 =		ip_mib.udpNoPorts;
	ipkp->inCksumErrs.value.ui32 =		ip_mib.ipInCksumErrs;
	ipkp->reasmDuplicates.value.ui32 =	ip_mib.ipReasmDuplicates;
	ipkp->reasmPartDups.value.ui32 =	ip_mib.ipReasmPartDups;
	ipkp->forwProhibits.value.ui32 =	ip_mib.ipForwProhibits;
	ipkp->udpInCksumErrs.value.ui32 =	ip_mib.udpInCksumErrs;
	ipkp->udpInOverflows.value.ui32 =	ip_mib.udpInOverflows;
	ipkp->rawipInOverflows.value.ui32 =	ip_mib.rawipInOverflows;
	ipkp->ipsecInSucceeded.value.ui32 =	ip_mib.ipsecInSucceeded;
	ipkp->ipsecInFailed.value.i32 =		ip_mib.ipsecInFailed;

	ipkp->inIPv6.value.ui32 =		ip_mib.ipInIPv6;
	ipkp->outIPv6.value.ui32 =		ip_mib.ipOutIPv6;
	ipkp->outSwitchIPv6.value.ui32 =	ip_mib.ipOutSwitchIPv6;

	return (0);
}

static void
icmp_kstat_init(void)
{
	icmp_named_kstat_t template = {
		{ "inMsgs",		KSTAT_DATA_UINT32 },
		{ "inErrors",		KSTAT_DATA_UINT32 },
		{ "inDestUnreachs",	KSTAT_DATA_UINT32 },
		{ "inTimeExcds",	KSTAT_DATA_UINT32 },
		{ "inParmProbs",	KSTAT_DATA_UINT32 },
		{ "inSrcQuenchs",	KSTAT_DATA_UINT32 },
		{ "inRedirects",	KSTAT_DATA_UINT32 },
		{ "inEchos",		KSTAT_DATA_UINT32 },
		{ "inEchoReps",		KSTAT_DATA_UINT32 },
		{ "inTimestamps",	KSTAT_DATA_UINT32 },
		{ "inTimestampReps",	KSTAT_DATA_UINT32 },
		{ "inAddrMasks",	KSTAT_DATA_UINT32 },
		{ "inAddrMaskReps",	KSTAT_DATA_UINT32 },
		{ "outMsgs",		KSTAT_DATA_UINT32 },
		{ "outErrors",		KSTAT_DATA_UINT32 },
		{ "outDestUnreachs",	KSTAT_DATA_UINT32 },
		{ "outTimeExcds",	KSTAT_DATA_UINT32 },
		{ "outParmProbs",	KSTAT_DATA_UINT32 },
26884 { "outSrcQuenchs", KSTAT_DATA_UINT32 }, 26885 { "outRedirects", KSTAT_DATA_UINT32 }, 26886 { "outEchos", KSTAT_DATA_UINT32 }, 26887 { "outEchoReps", KSTAT_DATA_UINT32 }, 26888 { "outTimestamps", KSTAT_DATA_UINT32 }, 26889 { "outTimestampReps", KSTAT_DATA_UINT32 }, 26890 { "outAddrMasks", KSTAT_DATA_UINT32 }, 26891 { "outAddrMaskReps", KSTAT_DATA_UINT32 }, 26892 { "inChksumErrs", KSTAT_DATA_UINT32 }, 26893 { "inUnknowns", KSTAT_DATA_UINT32 }, 26894 { "inFragNeeded", KSTAT_DATA_UINT32 }, 26895 { "outFragNeeded", KSTAT_DATA_UINT32 }, 26896 { "outDrops", KSTAT_DATA_UINT32 }, 26897 { "inOverFlows", KSTAT_DATA_UINT32 }, 26898 { "inBadRedirects", KSTAT_DATA_UINT32 }, 26899 }; 26900 26901 icmp_mibkp = kstat_create("ip", 0, "icmp", "mib2", KSTAT_TYPE_NAMED, 26902 NUM_OF_FIELDS(icmp_named_kstat_t), 26903 0); 26904 if (icmp_mibkp == NULL) 26905 return; 26906 26907 bcopy(&template, icmp_mibkp->ks_data, sizeof (template)); 26908 26909 icmp_mibkp->ks_update = icmp_kstat_update; 26910 26911 kstat_install(icmp_mibkp); 26912 } 26913 26914 static void 26915 icmp_kstat_fini(void) 26916 { 26917 26918 if (icmp_mibkp != NULL) { 26919 kstat_delete(icmp_mibkp); 26920 icmp_mibkp = NULL; 26921 } 26922 } 26923 26924 static int 26925 icmp_kstat_update(kstat_t *kp, int rw) 26926 { 26927 icmp_named_kstat_t *icmpkp; 26928 26929 if ((kp == NULL) || (kp->ks_data == NULL)) 26930 return (EIO); 26931 26932 if (rw == KSTAT_WRITE) 26933 return (EACCES); 26934 26935 icmpkp = (icmp_named_kstat_t *)kp->ks_data; 26936 26937 icmpkp->inMsgs.value.ui32 = icmp_mib.icmpInMsgs; 26938 icmpkp->inErrors.value.ui32 = icmp_mib.icmpInErrors; 26939 icmpkp->inDestUnreachs.value.ui32 = icmp_mib.icmpInDestUnreachs; 26940 icmpkp->inTimeExcds.value.ui32 = icmp_mib.icmpInTimeExcds; 26941 icmpkp->inParmProbs.value.ui32 = icmp_mib.icmpInParmProbs; 26942 icmpkp->inSrcQuenchs.value.ui32 = icmp_mib.icmpInSrcQuenchs; 26943 icmpkp->inRedirects.value.ui32 = icmp_mib.icmpInRedirects; 26944 icmpkp->inEchos.value.ui32 = 
icmp_mib.icmpInEchos; 26945 icmpkp->inEchoReps.value.ui32 = icmp_mib.icmpInEchoReps; 26946 icmpkp->inTimestamps.value.ui32 = icmp_mib.icmpInTimestamps; 26947 icmpkp->inTimestampReps.value.ui32 = icmp_mib.icmpInTimestampReps; 26948 icmpkp->inAddrMasks.value.ui32 = icmp_mib.icmpInAddrMasks; 26949 icmpkp->inAddrMaskReps.value.ui32 = icmp_mib.icmpInAddrMaskReps; 26950 icmpkp->outMsgs.value.ui32 = icmp_mib.icmpOutMsgs; 26951 icmpkp->outErrors.value.ui32 = icmp_mib.icmpOutErrors; 26952 icmpkp->outDestUnreachs.value.ui32 = icmp_mib.icmpOutDestUnreachs; 26953 icmpkp->outTimeExcds.value.ui32 = icmp_mib.icmpOutTimeExcds; 26954 icmpkp->outParmProbs.value.ui32 = icmp_mib.icmpOutParmProbs; 26955 icmpkp->outSrcQuenchs.value.ui32 = icmp_mib.icmpOutSrcQuenchs; 26956 icmpkp->outRedirects.value.ui32 = icmp_mib.icmpOutRedirects; 26957 icmpkp->outEchos.value.ui32 = icmp_mib.icmpOutEchos; 26958 icmpkp->outEchoReps.value.ui32 = icmp_mib.icmpOutEchoReps; 26959 icmpkp->outTimestamps.value.ui32 = icmp_mib.icmpOutTimestamps; 26960 icmpkp->outTimestampReps.value.ui32 = icmp_mib.icmpOutTimestampReps; 26961 icmpkp->outAddrMasks.value.ui32 = icmp_mib.icmpOutAddrMasks; 26962 icmpkp->outAddrMaskReps.value.ui32 = icmp_mib.icmpOutAddrMaskReps; 26963 icmpkp->inCksumErrs.value.ui32 = icmp_mib.icmpInCksumErrs; 26964 icmpkp->inUnknowns.value.ui32 = icmp_mib.icmpInUnknowns; 26965 icmpkp->inFragNeeded.value.ui32 = icmp_mib.icmpInFragNeeded; 26966 icmpkp->outFragNeeded.value.ui32 = icmp_mib.icmpOutFragNeeded; 26967 icmpkp->outDrops.value.ui32 = icmp_mib.icmpOutDrops; 26968 icmpkp->inOverflows.value.ui32 = icmp_mib.icmpInOverflows; 26969 icmpkp->inBadRedirects.value.ui32 = icmp_mib.icmpInBadRedirects; 26970 26971 return (0); 26972 } 26973 26974 /* 26975 * This is the fanout function for raw socket opened for SCTP. Note 26976 * that it is called after SCTP checks that there is no socket which 26977 * wants a packet. 
Then before SCTP handles this out of the blue packet, 26978 * this function is called to see if there is any raw socket for SCTP. 26979 * If there is and it is bound to the correct address, the packet will 26980 * be sent to that socket. Note that only one raw socket can be bound to 26981 * a port. This is assured in ipcl_sctp_hash_insert(); 26982 */ 26983 void 26984 ip_fanout_sctp_raw(mblk_t *mp, ill_t *recv_ill, ipha_t *ipha, boolean_t isv4, 26985 uint32_t ports, boolean_t mctl_present, uint_t flags, boolean_t ip_policy, 26986 uint_t ipif_seqid, zoneid_t zoneid) 26987 { 26988 conn_t *connp; 26989 queue_t *rq; 26990 mblk_t *first_mp; 26991 boolean_t secure; 26992 ip6_t *ip6h; 26993 26994 first_mp = mp; 26995 if (mctl_present) { 26996 mp = first_mp->b_cont; 26997 secure = ipsec_in_is_secure(first_mp); 26998 ASSERT(mp != NULL); 26999 } else { 27000 secure = B_FALSE; 27001 } 27002 ip6h = (isv4) ? NULL : (ip6_t *)ipha; 27003 27004 connp = ipcl_classify_raw(mp, IPPROTO_SCTP, zoneid, ports, ipha); 27005 if (connp == NULL) { 27006 sctp_ootb_input(first_mp, recv_ill, ipif_seqid, zoneid, 27007 mctl_present); 27008 return; 27009 } 27010 rq = connp->conn_rq; 27011 if (!canputnext(rq)) { 27012 CONN_DEC_REF(connp); 27013 BUMP_MIB(&ip_mib, rawipInOverflows); 27014 freemsg(first_mp); 27015 return; 27016 } 27017 if ((isv4 ? CONN_INBOUND_POLICY_PRESENT(connp) : 27018 CONN_INBOUND_POLICY_PRESENT_V6(connp)) || secure) { 27019 first_mp = ipsec_check_inbound_policy(first_mp, connp, 27020 (isv4 ? ipha : NULL), ip6h, mctl_present); 27021 if (first_mp == NULL) { 27022 CONN_DEC_REF(connp); 27023 return; 27024 } 27025 } 27026 /* 27027 * We probably should not send M_CTL message up to 27028 * raw socket. 27029 */ 27030 if (mctl_present) 27031 freeb(first_mp); 27032 27033 /* Initiate IPPF processing here if needed. 
*/ 27034 if ((isv4 && IPP_ENABLED(IPP_LOCAL_IN) && ip_policy) || 27035 (!isv4 && IP6_IN_IPP(flags))) { 27036 ip_process(IPP_LOCAL_IN, &mp, 27037 recv_ill->ill_phyint->phyint_ifindex); 27038 if (mp == NULL) { 27039 CONN_DEC_REF(connp); 27040 return; 27041 } 27042 } 27043 27044 if (connp->conn_recvif || connp->conn_recvslla || 27045 ((connp->conn_ipv6_recvpktinfo || 27046 (!isv4 && IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src))) && 27047 (flags & IP_FF_IP6INFO))) { 27048 int in_flags = 0; 27049 27050 if (connp->conn_recvif || connp->conn_ipv6_recvpktinfo) { 27051 in_flags = IPF_RECVIF; 27052 } 27053 if (connp->conn_recvslla) { 27054 in_flags |= IPF_RECVSLLA; 27055 } 27056 if (isv4) { 27057 mp = ip_add_info(mp, recv_ill, in_flags); 27058 } else { 27059 mp = ip_add_info_v6(mp, recv_ill, &ip6h->ip6_dst); 27060 if (mp == NULL) { 27061 CONN_DEC_REF(connp); 27062 return; 27063 } 27064 } 27065 } 27066 27067 BUMP_MIB(&ip_mib, ipInDelivers); 27068 /* 27069 * We are sending the IPSEC_IN message also up. Refer 27070 * to comments above this function. 
27071 */ 27072 putnext(rq, mp); 27073 CONN_DEC_REF(connp); 27074 } 27075 27076 /* 27077 * Martian Address Filtering [RFC 1812, Section 5.3.7] 27078 */ 27079 static boolean_t 27080 ip_no_forward(ipha_t *ipha, ill_t *ill) 27081 { 27082 ipaddr_t ip_src, ip_dst; 27083 ire_t *src_ire = NULL; 27084 27085 ip_src = ntohl(ipha->ipha_src); 27086 ip_dst = ntohl(ipha->ipha_dst); 27087 27088 if (ip_dst == INADDR_ANY) 27089 goto dont_forward; 27090 27091 if (IN_CLASSD(ip_src)) 27092 goto dont_forward; 27093 27094 if ((ip_src >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) 27095 goto dont_forward; 27096 27097 if (IN_BADCLASS(ip_dst)) 27098 goto dont_forward; 27099 27100 src_ire = ire_ctable_lookup(ipha->ipha_src, 0, IRE_BROADCAST, NULL, 27101 ALL_ZONES, NULL, MATCH_IRE_TYPE); 27102 if (src_ire != NULL) { 27103 ire_refrele(src_ire); 27104 goto dont_forward; 27105 } 27106 27107 return (B_FALSE); 27108 27109 dont_forward: 27110 if (ip_debug > 2) { 27111 printf("ip_no_forward: dropping packet received on %s\n", 27112 ill->ill_name); 27113 pr_addr_dbg("ip_no_forward: from src %s\n", 27114 AF_INET, &ipha->ipha_src); 27115 pr_addr_dbg("ip_no_forward: to dst %s\n", 27116 AF_INET, &ipha->ipha_dst); 27117 } 27118 BUMP_MIB(&ip_mib, ipForwProhibits); 27119 return (B_TRUE); 27120 } 27121 27122 static boolean_t 27123 ip_loopback_src_or_dst(ipha_t *ipha, ill_t *ill) 27124 { 27125 if (((ntohl(ipha->ipha_src) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) || 27126 ((ntohl(ipha->ipha_dst) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)) { 27127 if (ip_debug > 2) { 27128 if (ill != NULL) { 27129 printf("ip_loopback_src_or_dst: " 27130 "dropping packet received on %s\n", 27131 ill->ill_name); 27132 } else { 27133 printf("ip_loopback_src_or_dst: " 27134 "dropping packet\n"); 27135 } 27136 27137 pr_addr_dbg( 27138 "ip_loopback_src_or_dst: from src %s\n", 27139 AF_INET, &ipha->ipha_src); 27140 pr_addr_dbg( 27141 "ip_loopback_src_or_dst: to dst %s\n", 27142 AF_INET, &ipha->ipha_dst); 27143 } 27144 27145 BUMP_MIB(&ip_mib, 
ipInAddrErrors); 27146 return (B_TRUE); 27147 } 27148 return (B_FALSE); 27149 } 27150 27151 /* 27152 * Return B_TRUE if the buffers differ in length or content. 27153 * This is used for comparing extension header buffers. 27154 * Note that an extension header would be declared different 27155 * even if all that changed was the next header value in that header i.e. 27156 * what really changed is the next extension header. 27157 */ 27158 boolean_t 27159 ip_cmpbuf(const void *abuf, uint_t alen, boolean_t b_valid, const void *bbuf, 27160 uint_t blen) 27161 { 27162 if (!b_valid) 27163 blen = 0; 27164 27165 if (alen != blen) 27166 return (B_TRUE); 27167 if (alen == 0) 27168 return (B_FALSE); /* Both zero length */ 27169 return (bcmp(abuf, bbuf, alen)); 27170 } 27171 27172 /* 27173 * Preallocate memory for ip_savebuf(). Returns B_TRUE if ok. 27174 * Return B_FALSE if memory allocation fails - don't change any state! 27175 */ 27176 boolean_t 27177 ip_allocbuf(void **dstp, uint_t *dstlenp, boolean_t src_valid, 27178 const void *src, uint_t srclen) 27179 { 27180 void *dst; 27181 27182 if (!src_valid) 27183 srclen = 0; 27184 27185 ASSERT(*dstlenp == 0); 27186 if (src != NULL && srclen != 0) { 27187 dst = mi_alloc(srclen, BPRI_MED); 27188 if (dst == NULL) 27189 return (B_FALSE); 27190 } else { 27191 dst = NULL; 27192 } 27193 if (*dstp != NULL) 27194 mi_free(*dstp); 27195 *dstp = dst; 27196 *dstlenp = dst == NULL ? 0 : srclen; 27197 return (B_TRUE); 27198 } 27199 27200 /* 27201 * Replace what is in *dst, *dstlen with the source. 27202 * Assumes ip_allocbuf has already been called. 27203 */ 27204 void 27205 ip_savebuf(void **dstp, uint_t *dstlenp, boolean_t src_valid, 27206 const void *src, uint_t srclen) 27207 { 27208 if (!src_valid) 27209 srclen = 0; 27210 27211 ASSERT(*dstlenp == srclen); 27212 if (src != NULL && srclen != 0) 27213 bcopy(src, *dstp, srclen); 27214 } 27215 27216 /* 27217 * Free the storage pointed to by the members of an ip6_pkt_t. 
27218 */ 27219 void 27220 ip6_pkt_free(ip6_pkt_t *ipp) 27221 { 27222 ASSERT(ipp->ipp_pathmtu == NULL && !(ipp->ipp_fields & IPPF_PATHMTU)); 27223 27224 if (ipp->ipp_fields & IPPF_HOPOPTS) { 27225 kmem_free(ipp->ipp_hopopts, ipp->ipp_hopoptslen); 27226 ipp->ipp_hopopts = NULL; 27227 ipp->ipp_hopoptslen = 0; 27228 } 27229 if (ipp->ipp_fields & IPPF_RTDSTOPTS) { 27230 kmem_free(ipp->ipp_rtdstopts, ipp->ipp_rtdstoptslen); 27231 ipp->ipp_rtdstopts = NULL; 27232 ipp->ipp_rtdstoptslen = 0; 27233 } 27234 if (ipp->ipp_fields & IPPF_DSTOPTS) { 27235 kmem_free(ipp->ipp_dstopts, ipp->ipp_dstoptslen); 27236 ipp->ipp_dstopts = NULL; 27237 ipp->ipp_dstoptslen = 0; 27238 } 27239 if (ipp->ipp_fields & IPPF_RTHDR) { 27240 kmem_free(ipp->ipp_rthdr, ipp->ipp_rthdrlen); 27241 ipp->ipp_rthdr = NULL; 27242 ipp->ipp_rthdrlen = 0; 27243 } 27244 ipp->ipp_fields &= ~(IPPF_HOPOPTS | IPPF_RTDSTOPTS | IPPF_DSTOPTS | 27245 IPPF_RTHDR); 27246 } 27247