/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */
/* Copyright (c) 1990 Mentat Inc. */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * This file contains routines that manipulate Internet Routing Entries (IREs).
 */

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/stropts.h>
#include <sys/strsun.h>
#include <sys/ddi.h>
#include <sys/cmn_err.h>
#include <sys/policy.h>

#include <sys/systm.h>
#include <sys/kmem.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <net/if.h>
#include <net/route.h>
#include <netinet/in.h>
#include <net/if_dl.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>

#include <inet/common.h>
#include <inet/mi.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/ip_ndp.h>
#include <inet/arp.h>
#include <inet/ip_if.h>
#include <inet/ip_ire.h>
#include <inet/ip_ftable.h>
#include <inet/ip_rts.h>
#include <inet/nd.h>

#include <net/pfkeyv2.h>
#include <inet/ipsec_info.h>
#include <inet/sadb.h>
#include <sys/kmem.h>
#include <inet/tcp.h>
#include <inet/ipclassifier.h>
#include <sys/zone.h>
#include <sys/cpuvar.h>

#include <sys/tsol/label.h>
#include <sys/tsol/tnet.h>
#include <sys/dlpi.h>

struct kmem_cache *rt_entry_cache;

/*
 * Synchronization notes:
 *
 * The fields of the ire_t struct are protected in the following way:
 *
 * ire_next/ire_ptpn
 *
 *	- bucket lock of the respective tables (cache or forwarding tables).
 *
 * ire_mp, ire_rfq, ire_stq, ire_u *except* ire_gateway_addr[v6], ire_mask,
 * ire_type, ire_create_time, ire_masklen, ire_ipversion, ire_flags, ire_ipif,
 * ire_ihandle, ire_phandle, ire_nce, ire_bucket, ire_in_ill, ire_in_src_addr
 *
 *	- Set in ire_create_v4/v6 and never change after that. Thus,
 *	  we don't need a lock whenever these fields are accessed.
 *
 *	- ire_bucket and ire_masklen (also set in ire_create) are set in
 *	  ire_add_v4/ire_add_v6 before inserting in the bucket and never
 *	  change after that. Thus we don't need a lock whenever these
 *	  fields are accessed.
 *
 * ire_gateway_addr_v4[v6]
 *
 *	- ire_gateway_addr_v4[v6] is set during ire_create and later modified
 *	  by rts_setgwr[v6]. As ire_gateway_addr is a uint32_t, updates to
 *	  it are assumed to be atomic and hence the other parts of the code
 *	  do not use any locks. ire_gateway_addr_v6 updates are not atomic
 *	  and hence any access to it uses ire_lock to get/set the right value.
 *
 * ire_ident, ire_refcnt
 *
 *	- Updated atomically using atomic_add_32
 *
 * ire_ssthresh, ire_rtt_sd, ire_rtt, ire_ib_pkt_count, ire_ob_pkt_count
 *
 *	- Assumes that 32 bit writes are atomic. No locks. ire_lock is
 *	  used to serialize updates to ire_ssthresh, ire_rtt_sd, ire_rtt.
 *
 * ire_max_frag, ire_frag_flag
 *
 *	- ire_lock is used to set/read both of them together.
 *
 * ire_tire_mark
 *
 *	- Set in ire_create and updated in ire_expire, which is called
 *	  by only one function, namely ip_trash_timer_expire. Thus only
 *	  one function updates and examines the value.
 *
 * ire_marks
 *	- bucket lock protects this.
 *
 * ire_ipsec_overhead/ire_ll_hdr_length
 *
 *	- Place holder for returning the information to the upper layers
 *	  when IRE_DB_REQ comes down.
 *
 *
 * ipv6_ire_default_count is protected by the bucket lock of
 * ip_forwarding_table_v6[0][0].
 *
 * ipv6_ire_default_index is not protected as it is just a hint
 * at which default gateway to use. There is nothing
 * wrong in using the same gateway for two different connections.
 *
 * As we always hold the bucket locks in all the places while accessing
 * the above values, it is natural to use them for protecting them.
 *
 * We have a separate cache table and forwarding table for IPv4 and IPv6.
 * The cache table (ip_cache_table/ip_cache_table_v6) is a pointer to an
 * array of irb_t structures and the forwarding table (ip_forwarding_table/
 * ip_forwarding_table_v6) is an array of pointers to arrays of irb_t
 * structures. ip_forwarding_table_v6 is allocated dynamically in
 * ire_add_v6. ire_ft_init_lock is used to serialize multiple threads
 * initializing the same bucket. Once a bucket is initialized, it is never
 * de-allocated. This assumption enables us to access
 * ip_forwarding_table_v6[i] without any locks.
 *
 * Each irb_t - ire bucket structure - has a lock to protect
 * a bucket, and the ires residing in the bucket have a back pointer to
 * the bucket structure. It also has a reference count for the number
 * of threads walking the bucket - irb_refcnt, which is bumped up
 * using the IRB_REFHOLD macro. The flags irb_flags can be
 * set to IRE_MARK_CONDEMNED indicating that there are some ires
 * in this bucket that are marked with IRE_MARK_CONDEMNED, and the
 * last thread to leave the bucket should delete the ires. Usually
 * this is done by the IRB_REFRELE macro which is used to decrement
 * the reference count on a bucket.
 *
 * The IRE_REFHOLD/IRE_REFRELE macros operate on the ire, incrementing/
 * decrementing the reference count, ire_refcnt, atomically on the ire.
 * ire_refcnt is modified only using these macros. Operations on the IRE
 * can be described as follows:
 *
 * CREATE an ire with reference count initialized to 1.
 *
 * ADDITION of an ire holds the bucket lock, checks for duplicates
 * and then adds the ire. ire_add_v4/ire_add_v6 returns the ire after
 * bumping it up once more, i.e. the reference count is 2. This is to avoid
 * an extra lookup in the functions calling ire_add, which want to
 * work with the ire after adding.
 *
 * LOOKUP of an ire bumps up the reference count using the IRE_REFHOLD
 * macro. It is valid to bump up the reference count of the IRE
 * after the lookup has returned an ire.
 * Following are the lookup
 * functions that return a HELD ire:
 *
 * ire_lookup_local[_v6], ire_ctable_lookup[_v6], ire_ftable_lookup[_v6],
 * ire_cache_lookup[_v6], ire_lookup_multi[_v6], ire_route_lookup[_v6],
 * ipif_to_ire[_v6], ire_mrtun_lookup, ire_srcif_table_lookup.
 *
 * DELETION of an ire holds the bucket lock, removes it from the list
 * and then decrements the reference count for having removed it from the list
 * by using the IRE_REFRELE macro. If some other thread has looked up
 * the ire, the reference count would have been bumped up and hence
 * this ire will not be freed once deleted. It will be freed once the
 * reference count drops to zero.
 *
 * Add and Delete acquire the bucket lock as RW_WRITER, while all the
 * lookups acquire the bucket lock as RW_READER.
 *
 * NOTE : The only functions that do the IRE_REFRELE when an ire is
 *	  passed as an argument are:
 *
 *	  1) ip_wput_ire : This is because it IRE_REFHOLD/RELEs the
 *			   broadcast ires it looks up internally within
 *			   the function. Currently, for simplicity it does
 *			   not differentiate the one that is passed in and
 *			   the ones it looks up internally. It always
 *			   IRE_REFRELEs.
 *	  2) ire_send
 *	     ire_send_v6 : As ire_send calls ip_wput_ire and other functions
 *			   that take an ire as an argument, it has to
 *			   selectively IRE_REFRELE the ire. To maintain
 *			   symmetry, ire_send_v6 does the same.
 *
 * Otherwise, the general rule is to do the IRE_REFRELE in the function
 * that is passing the ire as an argument.
 *
 * In trying to locate ires the following points are to be noted.
 *
 * IRE_MARK_CONDEMNED signifies that the ire has been logically deleted and is
 * to be ignored when walking the ires using ire_next.
 *
 * IRE_MARK_HIDDEN signifies that the ire is a special ire typically for the
 * benefit of in.mpathd, which needs to probe interfaces for failures. Normal
 * applications should not be seeing this ire, and hence this ire is ignored
 * in most cases in the search using ire_next.
 *
 * Zones note:
 *	Walking IREs within a given zone also walks certain ires in other
 *	zones. This is done intentionally. IRE walks with a specified
 *	zoneid are used only when doing informational reports, and
 *	zone users want to see things that they can access. See block
 *	comment in ire_walk_ill_match().
 */
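/*
 * Illustrative sketch of the lookup/refrele pattern described above; this
 * is not part of the original code (the guard macro and function name are
 * hypothetical). The lookup functions listed above return a HELD ire, so
 * the caller owns one reference and must ire_refrele() it when done.
 */
#ifdef IRE_EXAMPLE_SKETCH
static void
ire_example_lookup(ipaddr_t dst, zoneid_t zoneid, ip_stack_t *ipst)
{
	ire_t *ire;

	/* Lookup returns the ire with ire_refcnt already bumped up. */
	ire = ire_cache_lookup(dst, zoneid, NULL, ipst);
	if (ire == NULL)
		return;

	/* ... use the ire; it cannot be freed while we hold the ref ... */

	/* Drop the reference taken by the lookup. */
	ire_refrele(ire);
}
#endif	/* IRE_EXAMPLE_SKETCH */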
/*
 * A per-interface routing table is created (if not present)
 * when the first entry is added to this special routing table.
 * This special routing table is accessed through the ill data structure.
 * The routing table looks like the cache table. For example, currently it
 * is used by the mobile-ip foreign agent to forward data that only comes from
 * the home agent tunnel for a mobile node. Thus if the outgoing interface
 * is a RESOLVER interface, IP may need to resolve the hardware address for
 * the outgoing interface. The routing entries in this table are not updated
 * in IRE_CACHE. When an MCTL msg comes back from ARP, the incoming ill
 * information is lost as the write queue is passed to ip_wput.
 * But, before sending the packet out, the hardware information must be updated
 * in the special forwarding table. ire_srcif_table_count keeps track of the
 * total number of ires that are in interface based tables. Each interface
 * based table hangs off of the incoming ill and each ill_t also keeps a
 * refcnt of ires in that table.
 */

/*
 * The minimum size of the IRE cache table. It will be recalculated in
 * ip_ire_init().
 * Settable in /etc/system
 */
uint32_t ip_cache_table_size = IP_CACHE_TABLE_SIZE;
uint32_t ip6_cache_table_size = IP6_CACHE_TABLE_SIZE;

/*
 * The size of the forwarding table. We will make sure that it is a
 * power of 2 in ip_ire_init().
 * Settable in /etc/system
 */
uint32_t ip6_ftable_hash_size = IP6_FTABLE_HASH_SIZE;

struct kmem_cache *ire_cache;
static ire_t ire_null;

/*
 * The threshold number of IREs in a bucket at which the IREs are
 * cleaned up. This threshold is calculated later in ip_open()
 * based on the speed of the CPU and available memory. This default
 * value is the maximum.
 *
 * We have two kinds of cached IRE, temporary and
 * non-temporary. Temporary IREs are marked with
 * IRE_MARK_TEMPORARY. They are IREs created for non
 * TCP traffic and for forwarding purposes. All others
 * are non-temporary IREs. We don't mark IREs created for
 * TCP as temporary because TCP is stateful and there is
 * info stored in the IRE which can be shared by other TCP
 * connections to the same destination. For connected
 * endpoints, we also don't want to mark the IRE used as
 * temporary because the same IRE will be used frequently;
 * otherwise, the app should not do a connect(). We change
 * the marking at ip_bind_connected_*() if necessary.
 *
 * We want to keep the cache IRE hash bucket length reasonably
 * short, otherwise IRE lookup functions will take "forever."
 * We use the "crude" function that the IRE bucket
 * length should be based on the CPU speed, which is 1 entry
 * per x MHz, depending on the shift factor ip_ire_cpu_ratio
 * (n). This means that with a 750MHz CPU, the max bucket
 * length can be (750 >> n) entries.
 *
 * Note that this threshold is separate for temp and non-temp
 * IREs. This means that the actual bucket length can be
 * twice that. And while we try to keep temporary IRE
 * length at most at the threshold value, we do not attempt to
 * make the length for non-temporary IREs fixed, for the
 * reason stated above. Instead, we start trying to find
 * "unused" non-temporary IREs when the bucket length reaches
 * this threshold and clean them up.
 *
 * We also want to limit the amount of memory used by
 * IREs. So if we are allowed to use ~3% of memory (M)
 * for those IREs, each bucket should not have more than
 *
 *	M / num of cache bucket / sizeof (ire_t)
 *
 * Again the above memory uses are separate for temp and
 * non-temp cached IREs.
 *
 * We may also want the limit to be a function of the number
 * of interfaces and number of CPUs. Doing the initialization
 * in ip_open() means that every time an interface is plumbed,
 * the max is re-calculated. Right now, we don't do anything
 * different. In the future, when we have more experience, we
 * may want to change this behavior.
 */
uint32_t ip_ire_max_bucket_cnt = 10;	/* Settable in /etc/system */
uint32_t ip6_ire_max_bucket_cnt = 10;
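/*
 * Worked example of the heuristics above (an illustrative sketch, not
 * part of the original code; the guard macro and IRE_EXAMPLE_MAX_BUCKET
 * are hypothetical): with the default shift factor ip_ire_cpu_ratio = 7,
 * a 750MHz CPU gives a max bucket length of 750 >> 7 = 5 entries per
 * class. Temporary and non-temporary IREs are counted separately, so
 * the total bucket length can be twice that.
 */
#ifdef IRE_EXAMPLE_SKETCH
#define	IRE_EXAMPLE_MAX_BUCKET(cpu_mhz)	((cpu_mhz) >> ip_ire_cpu_ratio)
#endif	/* IRE_EXAMPLE_SKETCH */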
/*
 * The minimum of the temporary IRE bucket count. We do not want
 * the length of each bucket to be too short. This may hurt
 * performance of some apps as the temporary IREs are removed too
 * often.
 */
uint32_t ip_ire_min_bucket_cnt = 3;	/* /etc/system - not used */
uint32_t ip6_ire_min_bucket_cnt = 3;

/*
 * The ratio of memory consumed by IREs used for temporary purposes to
 * available memory. This is a shift factor, so 6 means the ratio 1 to 64.
 * This value can be changed in /etc/system. 6 is a reasonable number.
 */
uint32_t ip_ire_mem_ratio = 6;	/* /etc/system */
/* The shift factor for CPU speed to calculate the max IRE bucket length. */
uint32_t ip_ire_cpu_ratio = 7;	/* /etc/system */

typedef struct nce_clookup_s {
	ipaddr_t ncecl_addr;
	boolean_t ncecl_found;
} nce_clookup_t;

/*
 * The maximum number of buckets in the IRE cache table. In the future, we
 * may want to make it a dynamic hash table. For the moment, we fix the
 * size and allocate the table in ip_ire_init() when IP is first loaded.
 * We take into account the amount of memory a system has.
 */
#define	IP_MAX_CACHE_TABLE_SIZE	4096

/* Settable in /etc/system */
static uint32_t	ip_max_cache_table_size = IP_MAX_CACHE_TABLE_SIZE;
static uint32_t	ip6_max_cache_table_size = IP_MAX_CACHE_TABLE_SIZE;

#define	NUM_ILLS	3	/* To build the ILL list to unlock */

/* Zero iulp_t for initialization. */
const iulp_t	ire_uinfo_null	= { 0 };

static int	ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp,
    ipsq_func_t func, boolean_t);
static int	ire_add_srcif_v4(ire_t **ire_p, queue_t *q, mblk_t *mp,
    ipsq_func_t func);
static ire_t	*ire_update_srcif_v4(ire_t *ire);
static void	ire_delete_v4(ire_t *ire);
static void	ire_report_ctable(ire_t *ire, char *mp);
static void	ire_report_mrtun_table(ire_t *ire, char *mp);
static void	ire_report_srcif_table(ire_t *ire, char *mp, ip_stack_t *ipst);
static void	ire_walk_ipvers(pfv_t func, void *arg, uchar_t vers,
    zoneid_t zoneid, ip_stack_t *);
static void	ire_walk_ill_ipvers(uint_t match_flags, uint_t ire_type,
    pfv_t func, void *arg, uchar_t vers, ill_t *ill);
static void	ire_cache_cleanup(irb_t *irb, uint32_t threshold, int cnt);
extern void	ill_unlock_ills(ill_t **list, int cnt);
static void	ip_nce_clookup_and_delete(nce_t *nce, void *arg);
extern void	th_trace_rrecord(th_trace_t *);
#ifdef IRE_DEBUG
static void	ire_trace_inactive(ire_t *);
#endif

/*
 * To avoid bloating the code, we call this function instead of
 * using the macro IRE_REFRELE. Use the macro only in performance
 * critical paths.
 *
 * Must not be called while holding any locks. Otherwise, if this is
 * the last reference to be released, there is a chance of recursive mutex
 * panic due to ire_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying
 * to restart an ioctl. The one exception is when the caller is sure that
 * this is not the last reference to be released, e.g. if the caller is
 * sure that the ire has not been deleted and won't be deleted.
 */
void
ire_refrele(ire_t *ire)
{
	IRE_REFRELE(ire);
}

void
ire_refrele_notr(ire_t *ire)
{
	IRE_REFRELE_NOTR(ire);
}

/*
 * kmem_cache_alloc constructor for IRE in kma space.
 * Note that when ire_mp is set the IRE is stored in that mblk and
 * not in this cache.
 */
/* ARGSUSED */
static int
ip_ire_constructor(void *buf, void *cdrarg, int kmflags)
{
	ire_t	*ire = buf;

	ire->ire_nce = NULL;

	return (0);
}

/* ARGSUSED1 */
static void
ip_ire_destructor(void *buf, void *cdrarg)
{
	ire_t	*ire = buf;

	ASSERT(ire->ire_nce == NULL);
}

/*
 * This function is associated with the IP_IOC_IRE_ADVISE_NO_REPLY
 * IOCTL. It is used by TCP (or other ULPs) to supply revised information
 * for an existing CACHED IRE.
 */
/* ARGSUSED */
int
ip_ire_advise(queue_t *q, mblk_t *mp, cred_t *ioc_cr)
{
	uchar_t	*addr_ucp;
	ipic_t	*ipic;
	ire_t	*ire;
	ipaddr_t	addr;
	in6_addr_t	v6addr;
	irb_t	*irb;
	zoneid_t	zoneid;
	ip_stack_t	*ipst = CONNQ_TO_IPST(q);

	ASSERT(q->q_next == NULL);
	zoneid = Q_TO_CONN(q)->conn_zoneid;

	/*
	 * Check privilege using the ioctl credential; if it is NULL
	 * then this is a kernel message and therefore privileged.
	 */
	if (ioc_cr != NULL && secpolicy_ip_config(ioc_cr, B_FALSE) != 0)
		return (EPERM);

	ipic = (ipic_t *)mp->b_rptr;
	if (!(addr_ucp = mi_offset_param(mp, ipic->ipic_addr_offset,
	    ipic->ipic_addr_length))) {
		return (EINVAL);
	}
	if (!OK_32PTR(addr_ucp))
		return (EINVAL);
	switch (ipic->ipic_addr_length) {
	case IP_ADDR_LEN: {
		/* Extract the destination address. */
		addr = *(ipaddr_t *)addr_ucp;
		/* Find the corresponding IRE. */
		ire = ire_cache_lookup(addr, zoneid, NULL, ipst);
		break;
	}
	case IPV6_ADDR_LEN: {
		/* Extract the destination address. */
		v6addr = *(in6_addr_t *)addr_ucp;
		/* Find the corresponding IRE. */
		ire = ire_cache_lookup_v6(&v6addr, zoneid, NULL, ipst);
		break;
	}
	default:
		return (EINVAL);
	}

	if (ire == NULL)
		return (ENOENT);
	/*
	 * Update the round trip time estimate and/or the max frag size
	 * and/or the slow start threshold.
	 *
	 * We serialize multiple advises using ire_lock.
	 */
	mutex_enter(&ire->ire_lock);
	if (ipic->ipic_rtt) {
		/*
		 * If there are no old cached values, initialize them
		 * conservatively. Set them to be (1.5 * new value).
		 */
		if (ire->ire_uinfo.iulp_rtt != 0) {
			ire->ire_uinfo.iulp_rtt = (ire->ire_uinfo.iulp_rtt +
			    ipic->ipic_rtt) >> 1;
		} else {
			ire->ire_uinfo.iulp_rtt = ipic->ipic_rtt +
			    (ipic->ipic_rtt >> 1);
		}
		if (ire->ire_uinfo.iulp_rtt_sd != 0) {
			ire->ire_uinfo.iulp_rtt_sd =
			    (ire->ire_uinfo.iulp_rtt_sd +
			    ipic->ipic_rtt_sd) >> 1;
		} else {
			ire->ire_uinfo.iulp_rtt_sd = ipic->ipic_rtt_sd +
			    (ipic->ipic_rtt_sd >> 1);
		}
	}
	if (ipic->ipic_max_frag)
		ire->ire_max_frag = MIN(ipic->ipic_max_frag, IP_MAXPACKET);
	if (ipic->ipic_ssthresh != 0) {
		if (ire->ire_uinfo.iulp_ssthresh != 0)
			ire->ire_uinfo.iulp_ssthresh =
			    (ipic->ipic_ssthresh +
			    ire->ire_uinfo.iulp_ssthresh) >> 1;
		else
			ire->ire_uinfo.iulp_ssthresh = ipic->ipic_ssthresh;
	}
	/*
	 * Don't need the ire_lock below this. ire_type does not change
	 * after initialization. ire_marks is protected by irb_lock.
	 */
	mutex_exit(&ire->ire_lock);

	if (ipic->ipic_ire_marks != 0 && ire->ire_type == IRE_CACHE) {
		/*
		 * Only increment the temporary IRE count if the original
		 * IRE is not already marked temporary.
		 */
		irb = ire->ire_bucket;
		rw_enter(&irb->irb_lock, RW_WRITER);
		if ((ipic->ipic_ire_marks & IRE_MARK_TEMPORARY) &&
		    !(ire->ire_marks & IRE_MARK_TEMPORARY)) {
			irb->irb_tmp_ire_cnt++;
		}
		ire->ire_marks |= ipic->ipic_ire_marks;
		rw_exit(&irb->irb_lock);
	}

	ire_refrele(ire);
	return (0);
}
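/*
 * Worked example of the smoothing in ip_ire_advise() above (an
 * illustrative sketch, not part of the original code; the guard macro,
 * helper name and its use of clock_t are hypothetical): with no cached
 * value, an advised RTT of 100ms initializes iulp_rtt to
 * 100 + (100 >> 1) = 150ms; a later sample of 80ms then averages to
 * (150 + 80) >> 1 = 115ms.
 */
#ifdef IRE_EXAMPLE_SKETCH
static clock_t
ire_example_smooth(clock_t cached, clock_t sample)
{
	if (cached != 0)
		return ((cached + sample) >> 1);	/* average with cache */
	return (sample + (sample >> 1));	/* conservative 1.5 * sample */
}
#endif	/* IRE_EXAMPLE_SKETCH */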
/*
 * This function is associated with the IP_IOC_IRE_DELETE[_NO_REPLY]
 * IOCTL[s]. The NO_REPLY form is used by TCP to delete a route IRE
 * for a host that is not responding. This will force an attempt to
 * establish a new route, if available, and flush out the ARP entry so
 * it will re-resolve. Management processes may want to use the
 * version that generates a reply.
 *
 * This function does not support IPv6 since Neighbor Unreachability
 * Detection means that negative advice like this is useless.
 */
/* ARGSUSED */
int
ip_ire_delete(queue_t *q, mblk_t *mp, cred_t *ioc_cr)
{
	uchar_t		*addr_ucp;
	ipaddr_t	addr;
	ire_t		*ire;
	ipid_t		*ipid;
	boolean_t	routing_sock_info = B_FALSE;	/* Sent info? */
	zoneid_t	zoneid;
	ire_t		*gire = NULL;
	ill_t		*ill;
	mblk_t		*arp_mp;
	ip_stack_t	*ipst;

	ASSERT(q->q_next == NULL);
	zoneid = Q_TO_CONN(q)->conn_zoneid;
	ipst = CONNQ_TO_IPST(q);

	/*
	 * Check privilege using the ioctl credential; if it is NULL
	 * then this is a kernel message and therefore privileged.
	 */
	if (ioc_cr != NULL && secpolicy_ip_config(ioc_cr, B_FALSE) != 0)
		return (EPERM);

	ipid = (ipid_t *)mp->b_rptr;

	/* Only actions on IRE_CACHEs are acceptable at present. */
	if (ipid->ipid_ire_type != IRE_CACHE)
		return (EINVAL);

	addr_ucp = mi_offset_param(mp, ipid->ipid_addr_offset,
	    ipid->ipid_addr_length);
	if (addr_ucp == NULL || !OK_32PTR(addr_ucp))
		return (EINVAL);
	switch (ipid->ipid_addr_length) {
	case IP_ADDR_LEN:
		/* addr_ucp points at IP addr */
		break;
	case sizeof (sin_t): {
		sin_t	*sin;
		/*
		 * got complete (sockaddr) address - increment addr_ucp to
		 * point at the ip_addr field.
		 */
		sin = (sin_t *)addr_ucp;
		addr_ucp = (uchar_t *)&sin->sin_addr.s_addr;
		break;
	}
	default:
		return (EINVAL);
	}
	/* Extract the destination address. */
	bcopy(addr_ucp, &addr, IP_ADDR_LEN);

	/* Try to find the CACHED IRE. */
	ire = ire_cache_lookup(addr, zoneid, NULL, ipst);

	/* Nail it. */
	if (ire) {
		/* Allow delete only on CACHE entries */
		if (ire->ire_type != IRE_CACHE) {
			ire_refrele(ire);
			return (EINVAL);
		}

		/*
		 * Verify that the IRE has been around for a while.
		 * This is to protect against transport protocols
		 * that are too eager in sending delete messages.
		 */
		if (gethrestime_sec() <
		    ire->ire_create_time + ipst->ips_ip_ignore_delete_time) {
			ire_refrele(ire);
			return (EINVAL);
		}
		/*
		 * Now we have a potentially dead cache entry. We need
		 * to remove it.
		 * If this cache entry is generated from a
		 * default route (i.e., ire_cmask == 0),
		 * search the default list and mark it dead and some
		 * background process will try to activate it.
		 */
		if ((ire->ire_gateway_addr != 0) && (ire->ire_cmask == 0)) {
			/*
			 * Make sure that we pick a different
			 * IRE_DEFAULT next time.
			 */
			ire_t *gw_ire;
			irb_t *irb = NULL;
			uint_t match_flags;

			match_flags = (MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE);

			gire = ire_ftable_lookup(ire->ire_addr,
			    ire->ire_cmask, 0, 0,
			    ire->ire_ipif, NULL, zoneid, 0, NULL, match_flags,
			    ipst);

			ip3dbg(("ire_ftable_lookup() returned gire %p\n",
			    (void *)gire));

			if (gire != NULL) {
				irb = gire->ire_bucket;

				/*
				 * We grab it as writer just to serialize
				 * multiple threads trying to bump up
				 * irb_rr_origin
				 */
				rw_enter(&irb->irb_lock, RW_WRITER);
				if ((gw_ire = irb->irb_rr_origin) == NULL) {
					rw_exit(&irb->irb_lock);
					goto done;
				}

				DTRACE_PROBE1(ip__ire__del__origin,
				    (ire_t *), gw_ire);

				/* Skip past the potentially bad gateway */
				if (ire->ire_gateway_addr ==
				    gw_ire->ire_gateway_addr) {
					ire_t *next = gw_ire->ire_next;

					DTRACE_PROBE2(ip__ire__del,
					    (ire_t *), gw_ire, (irb_t *), irb);
					IRE_FIND_NEXT_ORIGIN(next);
					irb->irb_rr_origin = next;
				}
				rw_exit(&irb->irb_lock);
			}
		}
done:
		if (gire != NULL)
			IRE_REFRELE(gire);
		/* report the bad route to routing sockets */
		ip_rts_change(RTM_LOSING, ire->ire_addr, ire->ire_gateway_addr,
		    ire->ire_mask, ire->ire_src_addr, 0, 0, 0,
		    (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_IFA), ipst);
		routing_sock_info = B_TRUE;

		/*
		 * TCP is really telling us to start over completely, and it
		 * expects that we'll resend the ARP query. Tell ARP to
		 * discard the entry, if this is a local destination.
		 */
		ill = ire->ire_stq->q_ptr;
		if (ire->ire_gateway_addr == 0 &&
		    (arp_mp = ill_ared_alloc(ill, addr)) != NULL) {
			putnext(ill->ill_rq, arp_mp);
		}

		ire_delete(ire);
		ire_refrele(ire);
	}
	/*
	 * Also look for an IRE_HOST type redirect ire and
	 * remove it if present.
	 */
	ire = ire_route_lookup(addr, 0, 0, IRE_HOST, NULL, NULL,
	    ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);

	/* Nail it. */
	if (ire != NULL) {
		if (ire->ire_flags & RTF_DYNAMIC) {
			if (!routing_sock_info) {
				ip_rts_change(RTM_LOSING, ire->ire_addr,
				    ire->ire_gateway_addr, ire->ire_mask,
				    ire->ire_src_addr, 0, 0, 0,
				    (RTA_DST | RTA_GATEWAY | RTA_NETMASK |
				    RTA_IFA), ipst);
			}
			ire_delete(ire);
		}
		ire_refrele(ire);
	}
	return (0);
}

/*
 * Named Dispatch routine to produce a formatted report on all IREs.
 * This report is accessed by using the ndd utility to "get" ND variable
 * "ipv4_ire_status".
 */
/* ARGSUSED */
int
ip_ire_report(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr)
{
	zoneid_t zoneid;
	ip_stack_t *ipst;

	if (CONN_Q(q))
		ipst = CONNQ_TO_IPST(q);
	else
		ipst = ILLQ_TO_IPST(q);

	(void) mi_mpprintf(mp,
	    "IRE " MI_COL_HDRPAD_STR
	    /* 01234567[89ABCDEF] */
	    "rfq " MI_COL_HDRPAD_STR
	    /* 01234567[89ABCDEF] */
	    "stq " MI_COL_HDRPAD_STR
	    /* 01234567[89ABCDEF] */
	    " zone "
	    /* 12345 */
	    "addr mask "
	    /* 123.123.123.123 123.123.123.123 */
	    "src gateway mxfrg rtt rtt_sd ssthresh ref "
	    /* 123.123.123.123 123.123.123.123 12345 12345 123456 12345678 123 */
	    "rtomax tstamp_ok wscale_ok ecn_ok pmtud_ok sack sendpipe "
	    /* 123456 123456789 123456789 123456 12345678 1234 12345678 */
	    "recvpipe in/out/forward type");
	    /* 12345678 in/out/forward xxxxxxxxxx */

	/*
	 * Because of the ndd constraint, at most we can have a 64K buffer
	 * to put in all the IRE info.
	 * So to be more efficient, just
	 * allocate a 64K buffer here, assuming we need that large buffer.
	 * This should be OK as only root can do ndd /dev/ip.
	 */
	if ((mp->b_cont = allocb(ND_MAX_BUF_LEN, BPRI_HI)) == NULL) {
		/* The following may work even if we cannot get a large buf. */
		(void) mi_mpprintf(mp, "<< Out of buffer >>\n");
		return (0);
	}

	zoneid = Q_TO_CONN(q)->conn_zoneid;
	if (zoneid == GLOBAL_ZONEID)
		zoneid = ALL_ZONES;

	ire_walk_v4(ire_report_ftable, mp->b_cont, zoneid, ipst);
	ire_walk_v4(ire_report_ctable, mp->b_cont, zoneid, ipst);

	return (0);
}

/* ire_walk routine invoked for ip_ire_report for each cached IRE. */
static void
ire_report_ctable(ire_t *ire, char *mp)
{
	char	buf1[16];
	char	buf2[16];
	char	buf3[16];
	char	buf4[16];
	uint_t	fo_pkt_count;
	uint_t	ib_pkt_count;
	int	ref;
	uint_t	print_len, buf_len;

	if ((ire->ire_type & IRE_CACHETABLE) == 0)
		return;
	buf_len = ((mblk_t *)mp)->b_datap->db_lim - ((mblk_t *)mp)->b_wptr;
	if (buf_len <= 0)
		return;

	/* Number of active references of this ire */
	ref = ire->ire_refcnt;
	/* "inbound" to a non local address is a forward */
	ib_pkt_count = ire->ire_ib_pkt_count;
	fo_pkt_count = 0;
	if (!(ire->ire_type & (IRE_LOCAL|IRE_BROADCAST))) {
		fo_pkt_count = ib_pkt_count;
		ib_pkt_count = 0;
	}
	print_len = snprintf((char *)((mblk_t *)mp)->b_wptr, buf_len,
	    MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR "%5d "
	    "%s %s %s %s %05d %05ld %06ld %08d %03d %06d %09d %09d %06d %08d "
	    "%04d %08d %08d %d/%d/%d %s\n",
	    (void *)ire, (void *)ire->ire_rfq, (void *)ire->ire_stq,
	    (int)ire->ire_zoneid,
	    ip_dot_addr(ire->ire_addr, buf1), ip_dot_addr(ire->ire_mask, buf2),
	    ip_dot_addr(ire->ire_src_addr, buf3),
	    ip_dot_addr(ire->ire_gateway_addr, buf4),
	    ire->ire_max_frag, ire->ire_uinfo.iulp_rtt,
	    ire->ire_uinfo.iulp_rtt_sd, ire->ire_uinfo.iulp_ssthresh, ref,
	    ire->ire_uinfo.iulp_rtomax,
	    (ire->ire_uinfo.iulp_tstamp_ok ? 1: 0),
	    (ire->ire_uinfo.iulp_wscale_ok ? 1: 0),
	    (ire->ire_uinfo.iulp_ecn_ok ? 1: 0),
	    (ire->ire_uinfo.iulp_pmtud_ok ?
	    1: 0),
	    ire->ire_uinfo.iulp_sack,
	    ire->ire_uinfo.iulp_spipe, ire->ire_uinfo.iulp_rpipe,
	    ib_pkt_count, ire->ire_ob_pkt_count, fo_pkt_count,
	    ip_nv_lookup(ire_nv_tbl, (int)ire->ire_type));
	if (print_len < buf_len) {
		((mblk_t *)mp)->b_wptr += print_len;
	} else {
		((mblk_t *)mp)->b_wptr += buf_len;
	}
}

/* ARGSUSED */
int
ip_ire_report_mrtun(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr)
{
	ip_stack_t	*ipst;

	if (CONN_Q(q))
		ipst = CONNQ_TO_IPST(q);
	else
		ipst = ILLQ_TO_IPST(q);

	(void) mi_mpprintf(mp,
	    "IRE " MI_COL_HDRPAD_STR
	    /* 01234567[89ABCDEF] */
	    "stq " MI_COL_HDRPAD_STR
	    /* 01234567[89ABCDEF] */
	    "in_ill " MI_COL_HDRPAD_STR
	    /* 01234567[89ABCDEF] */
	    "in_src_addr "
	    /* 123.123.123.123 */
	    "max_frag "
	    /* 12345 */
	    "ref ");
	    /* 123 */

	ire_walk_ill_mrtun(0, 0, ire_report_mrtun_table, mp, NULL,
	    ipst);
	return (0);
}

/* mrtun report table - supports ipv4_mrtun_ire_status ndd variable */
static void
ire_report_mrtun_table(ire_t *ire, char *mp)
{
	char	buf1[INET_ADDRSTRLEN];
	int	ref;

	/* Number of active references of this ire */
	ref = ire->ire_refcnt;
	ASSERT(ire->ire_type == IRE_MIPRTUN);
	(void) mi_mpprintf((mblk_t *)mp,
	    MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR
	    "%s %05d %03d",
	    (void *)ire, (void *)ire->ire_stq,
	    (void *)ire->ire_in_ill,
	    ip_dot_addr(ire->ire_in_src_addr, buf1),
	    ire->ire_max_frag, ref);
}

/*
 * Dispatch routine to format ires in the interface based routing table
 */
/* ARGSUSED */
int
ip_ire_report_srcif(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr)
{
	ip_stack_t	*ipst;

	if (CONN_Q(q))
		ipst = CONNQ_TO_IPST(q);
	else
		ipst = ILLQ_TO_IPST(q);

	/* Report all interface based ires */

	(void) mi_mpprintf(mp,
	    "IRE " MI_COL_HDRPAD_STR
	    /* 01234567[89ABCDEF] */
	    "stq " MI_COL_HDRPAD_STR
	    /* 01234567[89ABCDEF] */
	    "in_ill " MI_COL_HDRPAD_STR
	    /* 01234567[89ABCDEF] */
	    "addr "
	    /* 123.123.123.123 */
	    "gateway "
	    /* 123.123.123.123 */
	    "max_frag "
	    /* 12345 */
	    "ref "
	    /* 123 */
	    "type "
	    /* ABCDEFGH */
	    "in/out/forward");
	ire_walk_srcif_table_v4(ire_report_srcif_table, mp, ipst);
	return (0);
}

/* Reports the interface table ires */
/* ARGSUSED2 */
static void
ire_report_srcif_table(ire_t *ire, char *mp, ip_stack_t *ipst)
{
	char	buf1[INET_ADDRSTRLEN];
	char	buf2[INET_ADDRSTRLEN];
	int	ref;

	ref = ire->ire_refcnt;
	(void) mi_mpprintf((mblk_t *)mp,
	    MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR
	    "%s %s %05d %03d %s %d",
	    (void *)ire, (void *)ire->ire_stq,
	    (void *)ire->ire_in_ill,
	    ip_dot_addr(ire->ire_addr, buf1),
	    ip_dot_addr(ire->ire_gateway_addr, buf2),
	    ire->ire_max_frag, ref,
	    ip_nv_lookup(ire_nv_tbl, (int)ire->ire_type),
	    ire->ire_ib_pkt_count);
}

/*
 * ip_ire_req is called by ip_wput when an IRE_DB_REQ_TYPE message is handed
 * down from the Upper Level Protocol to request a copy of the IRE (to check
 * its type or to extract information like round-trip time estimates or the
 * MTU.)
 * The address is assumed to be in the ire_addr field. If no IRE is found
 * an IRE is returned with ire_type being zero.
 * Note that the upper level protocol has to check for broadcast
 * (IRE_BROADCAST) and multicast (CLASSD(addr)).
 * If there is a b_cont the resulting IRE_DB_TYPE mblk is placed at the
 * end of the returned message.
 *
 * TCP sends down a message of this type with a connection request packet
 * chained on. UDP and ICMP send it down to verify that a route exists for
 * the destination address when they get connected.
 */
void
ip_ire_req(queue_t *q, mblk_t *mp)
{
	ire_t	*inire;
	ire_t	*ire;
	mblk_t	*mp1;
	ire_t	*sire = NULL;
	zoneid_t zoneid = Q_TO_CONN(q)->conn_zoneid;
	ip_stack_t	*ipst = CONNQ_TO_IPST(q);

	ASSERT(q->q_next == NULL);

	if ((mp->b_wptr - mp->b_rptr) < sizeof (ire_t) ||
	    !OK_32PTR(mp->b_rptr)) {
		freemsg(mp);
		return;
	}
	inire = (ire_t *)mp->b_rptr;
	/*
	 * Got it, now take our best shot at an IRE.
	 */
	if (inire->ire_ipversion == IPV6_VERSION) {
		ire = ire_route_lookup_v6(&inire->ire_addr_v6, 0, 0, 0,
		    NULL, &sire, zoneid, NULL,
		    (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT), ipst);
	} else {
		ASSERT(inire->ire_ipversion == IPV4_VERSION);
		ire = ire_route_lookup(inire->ire_addr, 0, 0, 0,
		    NULL, &sire, zoneid, NULL,
		    (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT), ipst);
	}

	/*
	 * We prevent returning IREs with source address INADDR_ANY
	 * as these were temporarily created for sending packets
	 * from endpoints that have conn_unspec_src set.
	 */
	if (ire == NULL ||
	    (ire->ire_ipversion == IPV4_VERSION &&
	    ire->ire_src_addr == INADDR_ANY) ||
	    (ire->ire_ipversion == IPV6_VERSION &&
	    IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6))) {
		inire->ire_type = 0;
	} else {
		bcopy(ire, inire, sizeof (ire_t));
		/* Copy the route metrics from the parent. */
		if (sire != NULL) {
			bcopy(&(sire->ire_uinfo), &(inire->ire_uinfo),
			    sizeof (iulp_t));
		}

		/*
		 * As we don't lookup global policy here, we may not
		 * pass the right size if per-socket policy is not
		 * present. For these cases, path mtu discovery will
		 * do the right thing.
		 */
		inire->ire_ipsec_overhead = conn_ipsec_length(Q_TO_CONN(q));

		/* Pass the latest setting of the ip_path_mtu_discovery */
		inire->ire_frag_flag |=
		    (ipst->ips_ip_path_mtu_discovery) ? IPH_DF : 0;
	}
	if (ire != NULL)
		ire_refrele(ire);
	if (sire != NULL)
		ire_refrele(sire);
	mp->b_wptr = &mp->b_rptr[sizeof (ire_t)];
	mp->b_datap->db_type = IRE_DB_TYPE;

	/* Put the IRE_DB_TYPE mblk last in the chain */
	mp1 = mp->b_cont;
	if (mp1 != NULL) {
		mp->b_cont = NULL;
		linkb(mp1, mp);
		mp = mp1;
	}
	qreply(q, mp);
}
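/*
 * Illustrative sketch (not part of the original code; the guard macro
 * and function name are hypothetical) of how a ULP might build the
 * IRE_DB_REQ_TYPE message that ip_ire_req() consumes: an mblk at least
 * sizeof (ire_t) long with the destination in ire_addr and the version
 * in ire_ipversion. The reply comes back as an IRE_DB_TYPE mblk, with
 * ire_type == 0 meaning no route was found.
 */
#ifdef IRE_EXAMPLE_SKETCH
static mblk_t *
ire_example_build_req(ipaddr_t dst)
{
	mblk_t	*mp;
	ire_t	*inire;

	if ((mp = allocb(sizeof (ire_t), BPRI_HI)) == NULL)
		return (NULL);
	mp->b_wptr = mp->b_rptr + sizeof (ire_t);
	mp->b_datap->db_type = IRE_DB_REQ_TYPE;
	inire = (ire_t *)mp->b_rptr;
	bzero(inire, sizeof (ire_t));
	inire->ire_ipversion = IPV4_VERSION;
	inire->ire_addr = dst;
	return (mp);
}
#endif	/* IRE_EXAMPLE_SKETCH */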
/*
 * Send a packet using the specified IRE.
 * If ire_src_addr_v6 is all zero then discard the IRE after
 * the packet has been sent.
 */
static void
ire_send(queue_t *q, mblk_t *pkt, ire_t *ire)
{
	mblk_t *ipsec_mp;
	boolean_t is_secure;
	uint_t ifindex;
	ill_t	*ill;
	zoneid_t zoneid = ire->ire_zoneid;
	ip_stack_t	*ipst = ire->ire_ipst;

	ASSERT(ire->ire_ipversion == IPV4_VERSION);
	ASSERT(!(ire->ire_type & IRE_LOCAL)); /* Has different ire_zoneid */
	ipsec_mp = pkt;
	is_secure = (pkt->b_datap->db_type == M_CTL);
	if (is_secure) {
		ipsec_out_t *io;

		pkt = pkt->b_cont;
		io = (ipsec_out_t *)ipsec_mp->b_rptr;
		if (io->ipsec_out_type == IPSEC_OUT)
			zoneid = io->ipsec_out_zoneid;
	}

	/* If the packet originated externally then */
	if (pkt->b_prev) {
		ire_refrele(ire);
		/*
		 * Extract the ifindex from b_prev (set in ip_rput_noire).
		 * Look up the interface to see if it still exists (it could
		 * have been unplumbed by the time the reply came back from
		 * ARP).
		 */
		ifindex = (uint_t)(uintptr_t)pkt->b_prev;
		ill = ill_lookup_on_ifindex(ifindex, B_FALSE,
		    NULL, NULL, NULL, NULL, ipst);
		if (ill == NULL) {
			pkt->b_prev = NULL;
			pkt->b_next = NULL;
			freemsg(ipsec_mp);
			return;
		}
		q = ill->ill_rq;
		pkt->b_prev = NULL;
		/*
		 * This packet has not gone through IPSEC processing
		 * and hence we should not have any IPSEC message
		 * prepended.
		 */
		ASSERT(ipsec_mp == pkt);
		put(q, pkt);
		ill_refrele(ill);
	} else if (pkt->b_next) {
		/* Packets from multicast router */
		pkt->b_next = NULL;
		/*
		 * We never get the IPSEC_OUT while forwarding the
		 * packet for multicast router.
		 */
		ASSERT(ipsec_mp == pkt);
		ip_rput_forward(ire, (ipha_t *)pkt->b_rptr, ipsec_mp, NULL);
		ire_refrele(ire);
	} else {
		/* Locally originated packets */
		boolean_t is_inaddr_any;
		ipha_t *ipha = (ipha_t *)pkt->b_rptr;

		/*
		 * We need to do an ire_delete below for which
		 * we need to make sure that the IRE will be
		 * around even after calling ip_wput_ire -
		 * which does ire_refrele. Otherwise somebody
		 * could potentially delete this ire and hence
		 * free this ire and we will be calling ire_delete
		 * on a freed ire below.
		 */
		is_inaddr_any = (ire->ire_src_addr == INADDR_ANY);
		if (is_inaddr_any) {
			IRE_REFHOLD(ire);
		}
		/*
		 * If we were resolving a router we can not use the
		 * router's IRE for sending the packet (since it would
		 * violate the uniqueness of the IP idents), thus we
		 * make another pass through ip_wput to create the IRE_CACHE
		 * for the destination.
		 * When IRE_MARK_NOADD is set, ire_add() is not called.
		 * Thus ip_wput() would never find an ire, resulting in an
		 * infinite loop. Thus we check whether IRE_MARK_NOADD
		 * is set. This also implies that IRE_MARK_NOADD can only be
		 * used to send packets to directly connected hosts.
		 */
		if (ipha->ipha_dst != ire->ire_addr &&
		    !(ire->ire_marks & IRE_MARK_NOADD)) {
			ire_refrele(ire);	/* Held in ire_add */
			if (CONN_Q(q)) {
				(void) ip_output(Q_TO_CONN(q), ipsec_mp, q,
				    IRE_SEND);
			} else {
				(void) ip_output((void *)(uintptr_t)zoneid,
				    ipsec_mp, q, IRE_SEND);
			}
		} else {
			if (is_secure) {
				ipsec_out_t *oi;
				ipha_t *ipha;

				oi = (ipsec_out_t *)ipsec_mp->b_rptr;
				ipha = (ipha_t *)ipsec_mp->b_cont->b_rptr;
				if (oi->ipsec_out_proc_begin) {
					/*
					 * This is the case where
					 * ip_wput_ipsec_out could not find
					 * the IRE and recreated a new one.
					 * As ip_wput_ipsec_out does ire
					 * lookups, ire_refrele for the extra
					 * bump in ire_add.
					 */
					ire_refrele(ire);
					ip_wput_ipsec_out(q, ipsec_mp, ipha,
					    NULL, NULL);
				} else {
					/*
					 * IRE_REFRELE will be done in
					 * ip_wput_ire.
					 */
					ip_wput_ire(q, ipsec_mp, ire, NULL,
					    IRE_SEND, zoneid);
				}
			} else {
				/*
				 * IRE_REFRELE will be done in ip_wput_ire.
				 */
				ip_wput_ire(q, ipsec_mp, ire, NULL,
				    IRE_SEND, zoneid);
			}
		}
		/*
		 * Special code to support sending a single packet with
		 * conn_unspec_src using an IRE which has no source address.
		 * The IRE is deleted here after sending the packet to avoid
		 * having other code trip on it. But before we delete the
		 * ire, somebody could have looked up this ire.
		 * We prevent returning/using this IRE by the upper layers
		 * by making checks for a NULL source address in other places
		 * like e.g. ip_ire_append, ip_ire_req and ip_bind_connected.
		 * Though this does not completely prevent other threads
		 * from using this ire, it should not cause any problems.
		 *
		 * NOTE : We use is_inaddr_any instead of using ire_src_addr
		 * because for the normal case, i.e. !is_inaddr_any, the
		 * ire_refrele above could have potentially freed the ire.
		 */
		if (is_inaddr_any) {
			/*
			 * If this IRE has been deleted by another thread, then
			 * ire_bucket won't be NULL, but ire_ptpn will be NULL.
			 * Thus, ire_delete will do nothing. This check
			 * guards against calling ire_delete when the IRE was
			 * never inserted in the table, which is handled by
			 * ire_delete as dropping another reference.
			 */
			if (ire->ire_bucket != NULL) {
				ip1dbg(("ire_send: delete IRE\n"));
				ire_delete(ire);
			}
			ire_refrele(ire);	/* Held above */
		}
	}
}

/*
 * Send a packet using the specified IRE.
 * If ire_src_addr_v6 is all zero then discard the IRE after
 * the packet has been sent.
 */
static void
ire_send_v6(queue_t *q, mblk_t *pkt, ire_t *ire)
{
	mblk_t *ipsec_mp;
	boolean_t secure;
	uint_t ifindex;
	zoneid_t zoneid = ire->ire_zoneid;
	ip_stack_t	*ipst = ire->ire_ipst;

	ASSERT(ire->ire_ipversion == IPV6_VERSION);
	ASSERT(!(ire->ire_type & IRE_LOCAL)); /* Has different ire_zoneid */
	if (pkt->b_datap->db_type == M_CTL) {
		ipsec_out_t *io;

		ipsec_mp = pkt;
		pkt = pkt->b_cont;
		secure = B_TRUE;
		io = (ipsec_out_t *)ipsec_mp->b_rptr;
		if (io->ipsec_out_type == IPSEC_OUT)
			zoneid = io->ipsec_out_zoneid;
	} else {
		ipsec_mp = pkt;
		secure = B_FALSE;
	}

	/* If the packet originated externally then */
	if (pkt->b_prev) {
		ill_t *ill;
		/*
		 * Extract the ifindex from b_prev (set in ip_rput_data_v6).
		 * Look up the interface to see if it still exists (it could
		 * have been unplumbed by the time the reply came back from
		 * the resolver).
		 */
		ifindex = (uint_t)(uintptr_t)pkt->b_prev;
		ill = ill_lookup_on_ifindex(ifindex, B_TRUE,
		    NULL, NULL, NULL, NULL, ipst);
		if (ill == NULL) {
			pkt->b_prev = NULL;
			pkt->b_next = NULL;
			freemsg(ipsec_mp);
			ire_refrele(ire);	/* Held in ire_add */
			return;
		}
		q = ill->ill_rq;
		pkt->b_prev = NULL;
		/*
		 * This packet has not gone through IPSEC processing
		 * and hence we should not have any IPSEC message
		 * prepended.
		 */
		ASSERT(ipsec_mp == pkt);
		put(q, pkt);
		ill_refrele(ill);
	} else if (pkt->b_next) {
		/* Packets from multicast router */
		pkt->b_next = NULL;
		/*
		 * We never get the IPSEC_OUT while forwarding the
		 * packet for multicast router.
		 */
		ASSERT(ipsec_mp == pkt);
		/*
		 * XXX TODO IPv6.
		 */
		freemsg(pkt);
#ifdef XXX
		ip_rput_forward(ire, (ipha_t *)pkt->b_rptr, pkt, NULL);
#endif
	} else {
		if (secure) {
			ipsec_out_t *oi;
			ip6_t *ip6h;

			oi = (ipsec_out_t *)ipsec_mp->b_rptr;
			ip6h = (ip6_t *)ipsec_mp->b_cont->b_rptr;
			if (oi->ipsec_out_proc_begin) {
				/*
				 * This is the case where
				 * ip_wput_ipsec_out could not find
				 * the IRE and recreated a new one.
				 */
				ip_wput_ipsec_out_v6(q, ipsec_mp, ip6h,
				    NULL, NULL);
			} else {
				if (CONN_Q(q)) {
					(void) ip_output_v6(Q_TO_CONN(q),
					    ipsec_mp, q, IRE_SEND);
				} else {
					(void) ip_output_v6(
					    (void *)(uintptr_t)zoneid,
					    ipsec_mp, q, IRE_SEND);
				}
			}
		} else {
			/*
			 * Send packets through ip_output_v6 so that any
			 * ip6_info header can be processed again.
			 */
			if (CONN_Q(q)) {
				(void) ip_output_v6(Q_TO_CONN(q), ipsec_mp, q,
				    IRE_SEND);
			} else {
				(void) ip_output_v6((void *)(uintptr_t)zoneid,
				    ipsec_mp, q, IRE_SEND);
			}
		}
		/*
		 * Special code to support sending a single packet with
		 * conn_unspec_src using an IRE which has no source address.
		 * The IRE is deleted here after sending the packet to avoid
		 * having other code trip on it. But before we delete the
		 * ire, somebody could have looked up this ire.
		 * We prevent returning/using this IRE by the upper layers
		 * by making checks for a NULL source address in other places
		 * like e.g. ip_ire_append_v6, ip_ire_req and
		 * ip_bind_connected_v6. Though this does not completely
		 * prevent other threads from using this ire, it should
		 * not cause any problems.
		 */
		if (IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6)) {
			ip1dbg(("ire_send_v6: delete IRE\n"));
			ire_delete(ire);
		}
	}
	ire_refrele(ire);	/* Held in ire_add */
}

/*
 * Make sure that the IRE bucket does not get too long.
 * This can cause lockup because ire_cache_lookup()
 * may take "forever" to finish.
 *
 * We just remove cnt IREs each time. This means that
 * the bucket length will stay approximately constant,
 * depending on cnt. This should be enough to defend
 * against DoS attacks based on creating temporary IREs
 * (for forwarding and non-TCP traffic).
 *
 * Note that a new IRE is normally added at the tail of the
 * bucket. This means that we are removing the "oldest"
 * temporary IRE added.
 * Only if there are IREs with
 * the same ire_addr do we not add it at the tail. Refer
 * to ire_add_v*(). It should be OK for our purpose.
 *
 * For non-temporary cached IREs, we make sure that they
 * have not been used for some time (defined below), they
 * are non-local destinations, and there is no one using
 * them at the moment (refcnt == 1).
 *
 * The above means that the IRE bucket length may become
 * very long, consisting of mostly non-temporary IREs.
 * This can happen when the hash function does a bad job
 * so that most TCP connections cluster to a specific bucket.
 * This "hopefully" should never happen. It can also
 * happen if most TCP connections have very long lives.
 * Even with the minimal hash table size of 256, there
 * have to be a lot of such connections to make the bucket
 * length unreasonably long. This should probably not
 * happen either. The third case when this can happen is
 * when the machine is under attack, such as SYN flooding.
 * TCP should already have the proper mechanism to protect
 * against that. So we should be safe.
 *
 * This function is called by ire_add_then_send() after
 * a new IRE is added and the packet is sent.
 *
 * The idle cutoff interval is set to 60s. It can be
 * changed using /etc/system.
 */
uint32_t ire_idle_cutoff_interval = 60000;

static void
ire_cache_cleanup(irb_t *irb, uint32_t threshold, int cnt)
{
	ire_t *ire;
	int tmp_cnt = cnt;
	clock_t cut_off = drv_usectohz(ire_idle_cutoff_interval * 1000);

	/*
	 * irb is NULL if the IRE is not added to the hash. This
	 * happens when IRE_MARK_NOADD is set in ire_add_then_send()
	 * and when ires are returned from the ire_update_srcif_v4() routine.
	 */
	if (irb == NULL)
		return;

	IRB_REFHOLD(irb);
	if (irb->irb_tmp_ire_cnt > threshold) {
		for (ire = irb->irb_ire; ire != NULL && tmp_cnt > 0;
		    ire = ire->ire_next) {
			if (ire->ire_marks & IRE_MARK_CONDEMNED)
				continue;
			if (ire->ire_marks & IRE_MARK_TEMPORARY) {
				ASSERT(ire->ire_type == IRE_CACHE);
				ire_delete(ire);
				tmp_cnt--;
			}
		}
	}
	if (irb->irb_ire_cnt - irb->irb_tmp_ire_cnt > threshold) {
		for (ire = irb->irb_ire; ire != NULL && cnt > 0;
		    ire = ire->ire_next) {
			if (ire->ire_marks & IRE_MARK_CONDEMNED)
				continue;
			if (ire->ire_ipversion == IPV4_VERSION) {
				if (ire->ire_gateway_addr == 0)
					continue;
			} else {
				if (IN6_IS_ADDR_UNSPECIFIED(
				    &ire->ire_gateway_addr_v6))
					continue;
			}
			if ((ire->ire_type == IRE_CACHE) &&
			    (lbolt - ire->ire_last_used_time > cut_off) &&
			    (ire->ire_refcnt == 1)) {
				ire_delete(ire);
				cnt--;
			}
		}
	}
	IRB_REFRELE(irb);
}
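/*
 * Illustrative arithmetic for the idle cutoff above (a sketch, not part
 * of the original code): with the default ire_idle_cutoff_interval of
 * 60000ms, cut_off is drv_usectohz(60000 * 1000) ticks, i.e. 60 seconds
 * expressed in lbolt ticks (6000 ticks with the common 100Hz clock), so
 * an unreferenced, non-local IRE_CACHE entry must have been idle for at
 * least a minute before ire_cache_cleanup() will delete it.
 */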
/*
 * ire_add_then_send is called when a new IRE has been created in order to
 * route an outgoing packet. Typically, it is called from ip_wput when
 * a response comes back down from a resolver. We add the IRE, and then
 * possibly run the packet through ip_wput or ip_rput, as appropriate.
 * However, we do not add the newly created IRE in the cache when
 * IRE_MARK_NOADD is set in the IRE. IRE_MARK_NOADD is set at
 * ip_newroute_ipif(). The ires with IRE_MARK_NOADD and ires returned
 * by ire_update_srcif_v4() are ire_refrele'd by ip_wput_ire() and get
 * deleted.
 * Multirouting support: the packet is silently discarded when the new IRE
 * holds the RTF_MULTIRT flag, but is not the first IRE to be added with the
 * RTF_MULTIRT flag for the same destination address.
 * In this case, we just want to register this additional ire without
 * sending the packet, as it has already been replicated through
 * existing multirt routes in ip_wput().
 */
void
ire_add_then_send(queue_t *q, ire_t *ire, mblk_t *mp)
{
	irb_t *irb;
	boolean_t drop = B_FALSE;
	/* LINTED : set but not used in function */
	boolean_t mctl_present;
	mblk_t *first_mp = NULL;
	mblk_t *save_mp = NULL;
	ire_t *dst_ire;
	ipha_t *ipha;
	ip6_t *ip6h;
	ip_stack_t	*ipst = ire->ire_ipst;

	if (mp != NULL) {
		/*
		 * We first have to retrieve the destination address carried
		 * by the packet.
		 * We can't rely on the ire as it can be related to a gateway.
		 * The destination address will help in determining if
		 * other RTF_MULTIRT ires are already registered.
		 *
		 * We first need to know where we are going: v4 or v6.
		 * The ire version is enough, as there is no risk that
		 * we resolve an IPv6 address with an IPv4 ire
		 * or vice versa.
		 */
		if (ire->ire_ipversion == IPV4_VERSION) {
			EXTRACT_PKT_MP(mp, first_mp, mctl_present);
			ipha = (ipha_t *)mp->b_rptr;
			save_mp = mp;
			mp = first_mp;

			dst_ire = ire_cache_lookup(ipha->ipha_dst,
			    ire->ire_zoneid, MBLK_GETLABEL(mp), ipst);
		} else {
			ASSERT(ire->ire_ipversion == IPV6_VERSION);
			/*
			 * Get a pointer to the beginning of the IPv6 header.
			 * Ignore leading IPsec control mblks.
			 */
			first_mp = mp;
			if (mp->b_datap->db_type == M_CTL) {
				mp = mp->b_cont;
			}
			ip6h = (ip6_t *)mp->b_rptr;
			save_mp = mp;
			mp = first_mp;
			dst_ire = ire_cache_lookup_v6(&ip6h->ip6_dst,
			    ire->ire_zoneid, MBLK_GETLABEL(mp), ipst);
		}
		if (dst_ire != NULL) {
			if (dst_ire->ire_flags & RTF_MULTIRT) {
				/*
				 * At least one resolved multirt route
				 * already exists for the destination;
				 * don't send this packet: either drop it
				 * or complete the pending resolution,
				 * depending on the ire.
				 */
				drop = B_TRUE;
			}
			ip1dbg(("ire_add_then_send: dst_ire %p "
			    "[dst %08x, gw %08x], drop %d\n",
			    (void *)dst_ire,
			    (dst_ire->ire_ipversion == IPV4_VERSION) ? \
			    ntohl(dst_ire->ire_addr) : \
			    ntohl(V4_PART_OF_V6(dst_ire->ire_addr_v6)),
			    (dst_ire->ire_ipversion == IPV4_VERSION) ? \
			    ntohl(dst_ire->ire_gateway_addr) : \
			    ntohl(V4_PART_OF_V6(
			    dst_ire->ire_gateway_addr_v6)),
			    drop));
			ire_refrele(dst_ire);
		}
	}

	if (!(ire->ire_marks & IRE_MARK_NOADD)) {
		/*
		 * Regular packets with cache bound ires and
		 * the packets from ARP responses for ires which
		 * belong to the ire_srcif_v4 table, are here.
		 */
		if (ire->ire_in_ill == NULL) {
			/* Add the ire */
			(void) ire_add(&ire, NULL, NULL, NULL, B_FALSE);
		} else {
			/*
			 * This must be an ARP response for an ire in the
			 * interface based table. Note that we don't add them
			 * in the cache table; instead we update the existing
			 * table with the dlureq_mp information. The reverse
			 * tunnel ires do not come here, as the reverse tunnel
			 * is a non-resolver interface.
			 * XXX - another design alternative was to mark the
			 * ires in the interface based table with a special
			 * mark to make absolutely sure that we operate on the
			 * right ires. This idea was not implemented as part
			 * of a code review suggestion, as ire_in_ill suffices
			 * to distinguish between the regular ires and the
			 * interface based ires now, and thus we save a bit in
			 * the ire_marks.
			 */
			ire = ire_update_srcif_v4(ire);
		}

		if (ire == NULL) {
			mp->b_prev = NULL;
			mp->b_next = NULL;
			MULTIRT_DEBUG_UNTAG(mp);
			freemsg(mp);
			return;
		}
		if (mp == NULL) {
			ire_refrele(ire);	/* Held in ire_add_v4/v6 */
			return;
		}
	}
	if (drop) {
		/*
		 * If we're adding an RTF_MULTIRT ire, the resolution
		 * is over: we just drop the packet.
		 */
		if (ire->ire_flags & RTF_MULTIRT) {
			if (save_mp) {
				save_mp->b_prev = NULL;
				save_mp->b_next = NULL;
			}
			MULTIRT_DEBUG_UNTAG(mp);
			freemsg(mp);
		} else {
			/*
			 * Otherwise, we're adding the ire to a gateway
			 * for a multirt route.
			 * Invoke ip_newroute() to complete the resolution
			 * of the route. We will then come back here and
			 * finally drop this packet in the above code.
			 */
			if (ire->ire_ipversion == IPV4_VERSION) {
				/*
				 * TODO: in order for CGTP to work in non-global
				 * zones, ip_newroute() must create the IRE
				 * cache in the zone indicated by
				 * ire->ire_zoneid.
				 */
				ip_newroute(q, mp, ipha->ipha_dst, 0,
				    (CONN_Q(q) ? Q_TO_CONN(q) : NULL),
				    ire->ire_zoneid, ipst);
			} else {
				ASSERT(ire->ire_ipversion == IPV6_VERSION);
				ip_newroute_v6(q, mp, &ip6h->ip6_dst, NULL,
				    NULL, ire->ire_zoneid, ipst);
			}
		}

		ire_refrele(ire);	/* As done by ire_send(). */
		return;
	}
	/*
	 * Need to remember ire_bucket here as ire_send*() may delete
	 * the ire so we cannot reference it after that.
	 */
	irb = ire->ire_bucket;
	if (ire->ire_ipversion == IPV6_VERSION) {
		ire_send_v6(q, mp, ire);
		/*
		 * Clean up more than 1 IRE so that the clean up does not
		 * need to be done every time a new IRE is added and
		 * the threshold is reached.
		 */
		ire_cache_cleanup(irb, ip6_ire_max_bucket_cnt, 2);
	} else {
		ire_send(q, mp, ire);
		ire_cache_cleanup(irb, ip_ire_max_bucket_cnt, 2);
	}
}

/*
 * Initialize the ire that is specific to the IPv4 part and call
 * ire_init_common to finish it.
 */
ire_t *
ire_init(ire_t *ire, uchar_t *addr, uchar_t *mask, uchar_t *src_addr,
    uchar_t *gateway, uchar_t *in_src_addr, uint_t *max_fragp, mblk_t *fp_mp,
    queue_t *rfq, queue_t *stq, ushort_t type, mblk_t *dlureq_mp, ipif_t *ipif,
    ill_t *in_ill, ipaddr_t cmask, uint32_t phandle, uint32_t ihandle,
    uint32_t flags, const iulp_t *ulp_info, tsol_gc_t *gc, tsol_gcgrp_t *gcgrp,
    ip_stack_t *ipst)
{
	/*
	 * Reject IRE security attribute creation/initialization
	 * if the system is not running in Trusted mode.
	 */
	if ((gc != NULL || gcgrp != NULL) && !is_system_labeled())
		return (NULL);

	if (fp_mp != NULL) {
		/*
		 * We can't dupb() here as multiple threads could be
		 * calling dupb on the same mp, which is incorrect.
		 * First dupb() should be called only by one thread.
		 */
		fp_mp = copyb(fp_mp);
		if (fp_mp == NULL)
			return (NULL);
	}

	if (dlureq_mp != NULL) {
		/*
		 * We can't dupb() here as multiple threads could be
		 * calling dupb on the same mp, which is incorrect.
		 * First dupb() should be called only by one thread.
		 */
		dlureq_mp = copyb(dlureq_mp);
		if (dlureq_mp == NULL) {
			if (fp_mp != NULL)
				freeb(fp_mp);
			return (NULL);
		}
	}

	/*
	 * Check that IRE_IF_RESOLVER and IRE_IF_NORESOLVER have a
	 * dlureq_mp, which is the ill_resolver_mp for IRE_IF_RESOLVER
	 * and the DL_UNITDATA_REQ for IRE_IF_NORESOLVER.
	 */
	if ((type & IRE_INTERFACE) &&
	    dlureq_mp == NULL) {
		ASSERT(fp_mp == NULL);
		ip0dbg(("ire_init: no dlureq_mp\n"));
		return (NULL);
	}

	BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_alloced);

	if (addr != NULL)
		bcopy(addr, &ire->ire_addr, IP_ADDR_LEN);
	if (src_addr != NULL)
		bcopy(src_addr, &ire->ire_src_addr, IP_ADDR_LEN);
	if (mask != NULL) {
		bcopy(mask, &ire->ire_mask, IP_ADDR_LEN);
		ire->ire_masklen = ip_mask_to_plen(ire->ire_mask);
	}
	if (gateway != NULL) {
		bcopy(gateway, &ire->ire_gateway_addr, IP_ADDR_LEN);
	}
	if (in_src_addr != NULL) {
		bcopy(in_src_addr, &ire->ire_in_src_addr, IP_ADDR_LEN);
	}

	if (type == IRE_CACHE)
		ire->ire_cmask = cmask;

	/* ire_init_common will free the mblks upon encountering any failure */
	if (!ire_init_common(ire, max_fragp, fp_mp, rfq, stq, type, dlureq_mp,
	    ipif, in_ill, phandle, ihandle, flags, IPV4_VERSION, ulp_info,
	    gc, gcgrp, ipst))
		return (NULL);

	return (ire);
}

/*
 * Similar to ire_create except that it is called only when
 * we want to allocate the ire as an mblk, e.g. when we have an external
 * resolver ARP.
 */
ire_t *
ire_create_mp(uchar_t *addr, uchar_t *mask, uchar_t *src_addr, uchar_t *gateway,
    uchar_t *in_src_addr, uint_t max_frag, mblk_t *fp_mp, queue_t *rfq,
    queue_t *stq, ushort_t type, mblk_t *dlureq_mp, ipif_t *ipif, ill_t *in_ill,
    ipaddr_t cmask, uint32_t phandle, uint32_t ihandle, uint32_t flags,
    const iulp_t *ulp_info, tsol_gc_t *gc, tsol_gcgrp_t *gcgrp,
    ip_stack_t *ipst)
{
	ire_t	*ire, *buf;
	ire_t	*ret_ire;
	mblk_t	*mp;
	size_t	bufsize;
	frtn_t	*frtnp;
	ill_t	*ill;

	bufsize = sizeof (ire_t) + sizeof (frtn_t);
	buf = kmem_alloc(bufsize, KM_NOSLEEP);
	if (buf == NULL) {
		ip1dbg(("ire_create_mp: alloc failed\n"));
		return (NULL);
	}
	frtnp = (frtn_t *)(buf + 1);
	frtnp->free_arg = (caddr_t)buf;
	frtnp->free_func = ire_freemblk;

	/*
	 * Allocate the new IRE. The ire created will hold a ref on
	 * an nce_t after ire_nce_init, and this ref must either be
	 * (a) transferred to the ire_cache entry created when ire_add_v4
	 *     is called after successful arp resolution, or,
	 * (b) released, when arp resolution fails
	 * Case (b) is handled in ire_freemblk() which will be called
	 * when mp is freed as a result of failed arp.
	 */
	mp = esballoc((unsigned char *)buf, bufsize, BPRI_MED, frtnp);
	if (mp == NULL) {
		ip1dbg(("ire_create_mp: alloc failed\n"));
		kmem_free(buf, bufsize);
		return (NULL);
	}
	ire = (ire_t *)mp->b_rptr;
	mp->b_wptr = (uchar_t *)&ire[1];

	/* Start clean. */
*/ 1797 *ire = ire_null; 1798 ire->ire_mp = mp; 1799 mp->b_datap->db_type = IRE_DB_TYPE; 1800 ire->ire_marks |= IRE_MARK_UNCACHED; 1801 1802 ret_ire = ire_init(ire, addr, mask, src_addr, gateway, in_src_addr, 1803 NULL, fp_mp, rfq, stq, type, dlureq_mp, ipif, in_ill, cmask, 1804 phandle, ihandle, flags, ulp_info, gc, gcgrp, ipst); 1805 1806 ill = (ill_t *)(stq->q_ptr); 1807 if (ret_ire == NULL) { 1808 /* ire_freemblk needs these set */ 1809 ire->ire_stq_ifindex = ill->ill_phyint->phyint_ifindex; 1810 ire->ire_ipst = ipst; 1811 freeb(ire->ire_mp); 1812 return (NULL); 1813 } 1814 ret_ire->ire_stq_ifindex = ill->ill_phyint->phyint_ifindex; 1815 ASSERT(ret_ire == ire); 1816 /* 1817 * ire_max_frag is normally zero here and is atomically set 1818 * under the irebucket lock in ire_add_v[46] except for the 1819 * case of IRE_MARK_NOADD. In that event the ire_max_frag 1820 * is non-zero here. 1821 */ 1822 ire->ire_max_frag = max_frag; 1823 return (ire); 1824 } 1825 1826 /* 1827 * ire_create is called to allocate and initialize a new IRE. 1828 * 1829 * NOTE : This is sometimes called as writer, though that is not required 1830 * by this function. 1831 */ 1832 ire_t * 1833 ire_create(uchar_t *addr, uchar_t *mask, uchar_t *src_addr, uchar_t *gateway, 1834 uchar_t *in_src_addr, uint_t *max_fragp, mblk_t *fp_mp, queue_t *rfq, 1835 queue_t *stq, ushort_t type, mblk_t *dlureq_mp, ipif_t *ipif, ill_t *in_ill, 1836 ipaddr_t cmask, uint32_t phandle, uint32_t ihandle, uint32_t flags, 1837 const iulp_t *ulp_info, tsol_gc_t *gc, tsol_gcgrp_t *gcgrp, 1838 ip_stack_t *ipst) 1839 { 1840 ire_t *ire; 1841 ire_t *ret_ire; 1842 1843 ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP); 1844 if (ire == NULL) { 1845 ip1dbg(("ire_create: alloc failed\n")); 1846 return (NULL); 1847 } 1848 *ire = ire_null; 1849 1850 ret_ire = ire_init(ire, addr, mask, src_addr, gateway, in_src_addr, 1851 max_fragp, fp_mp, rfq, stq, type, dlureq_mp, ipif, in_ill, cmask, 1852 phandle, ihandle, flags, ulp_info, gc, gcgrp, ipst); 1853 1854 if (ret_ire == NULL) { 1855 kmem_cache_free(ire_cache, ire); 1856 return (NULL); 1857 } 1858 ASSERT(ret_ire == ire); 1859 return (ire); 1860 } 1861 1862 1863 /* 1864 * Common to IPv4 and IPv6 1865 */ 1866 boolean_t 1867 ire_init_common(ire_t *ire, uint_t *max_fragp, mblk_t *fp_mp, 1868 queue_t *rfq, queue_t *stq, ushort_t type, 1869 mblk_t *dlureq_mp, ipif_t *ipif, ill_t *in_ill, uint32_t phandle, 1870 uint32_t ihandle, uint32_t flags, uchar_t ipversion, 1871 const iulp_t *ulp_info, tsol_gc_t *gc, tsol_gcgrp_t *gcgrp, 1872 ip_stack_t *ipst) 1873 { 1874 ire->ire_max_fragp = max_fragp; 1875 ire->ire_frag_flag |= (ipst->ips_ip_path_mtu_discovery) ? IPH_DF : 0; 1876 1877 ASSERT(fp_mp == NULL || fp_mp->b_datap->db_type == M_DATA); 1878 #ifdef DEBUG 1879 if (ipif != NULL) { 1880 if (ipif->ipif_isv6) 1881 ASSERT(ipversion == IPV6_VERSION); 1882 else 1883 ASSERT(ipversion == IPV4_VERSION); 1884 } 1885 #endif /* DEBUG */ 1886 1887 /* 1888 * Create/initialize IRE security attribute only in Trusted mode; 1889 * if the passed in gc/gcgrp is non-NULL, we expect that the caller 1890 * has held a reference to it and will release it when this routine 1891 * returns a failure, otherwise we own the reference. We do this 1892 * prior to initializing the rest of the IRE fields. 1893 * 1894 * Don't allocate ire_gw_secattr for the resolver case to prevent 1895 * a memory leak (in case of external resolution failure). We'll 1896 * allocate it after a successful external resolution, in ire_add(). 
* Note that ire->ire_mp != NULL here means this ire is headed 1898 * to an external resolver. 1899 */ 1900 if (is_system_labeled()) { 1901 if ((type & (IRE_LOCAL | IRE_LOOPBACK | IRE_BROADCAST | 1902 IRE_INTERFACE)) != 0) { 1903 /* release references on behalf of caller */ 1904 if (gc != NULL) 1905 GC_REFRELE(gc); 1906 if (gcgrp != NULL) 1907 GCGRP_REFRELE(gcgrp); 1908 } else if ((ire->ire_mp == NULL) && 1909 tsol_ire_init_gwattr(ire, ipversion, gc, gcgrp) != 0) { 1910 /* free any caller-allocated mblks upon failure */ 1911 if (fp_mp != NULL) 1912 freeb(fp_mp); 1913 if (dlureq_mp != NULL) 1914 freeb(dlureq_mp); 1915 return (B_FALSE); 1916 } 1917 } 1918 1919 ire->ire_stq = stq; 1920 ire->ire_rfq = rfq; 1921 ire->ire_type = type; 1922 ire->ire_flags = RTF_UP | flags; 1923 ire->ire_ident = TICK_TO_MSEC(lbolt); 1924 bcopy(ulp_info, &ire->ire_uinfo, sizeof (iulp_t)); 1925 1926 ire->ire_tire_mark = ire->ire_ob_pkt_count + ire->ire_ib_pkt_count; 1927 ire->ire_last_used_time = lbolt; 1928 ire->ire_create_time = (uint32_t)gethrestime_sec(); 1929 1930 /* 1931 * If this IRE is an IRE_CACHE, inherit the handles from the 1932 * parent IREs. For others in the forwarding table, assign appropriate 1933 * new ones. 1934 * 1935 * ire_handle is protected by a mutex because ire_create is not always 1936 * called as a writer. 1937 */ 1938 if (ire->ire_type & IRE_OFFSUBNET) { 1939 mutex_enter(&ipst->ips_ire_handle_lock); 1940 ire->ire_phandle = (uint32_t)ipst->ips_ire_handle++; 1941 mutex_exit(&ipst->ips_ire_handle_lock); 1942 } else if (ire->ire_type & IRE_INTERFACE) { 1943 mutex_enter(&ipst->ips_ire_handle_lock); 1944 ire->ire_ihandle = (uint32_t)ipst->ips_ire_handle++; 1945 mutex_exit(&ipst->ips_ire_handle_lock); 1946 } else if (ire->ire_type == IRE_CACHE) { 1947 ire->ire_phandle = phandle; 1948 ire->ire_ihandle = ihandle; 1949 } 1950 ire->ire_in_ill = in_ill; 1951 ire->ire_ipif = ipif; 1952 if (ipif != NULL) { 1953 ire->ire_ipif_seqid = ipif->ipif_seqid; 1954 ire->ire_zoneid = ipif->ipif_zoneid; 1955 } else { 1956 ire->ire_zoneid = GLOBAL_ZONEID; 1957 } 1958 ire->ire_ipversion = ipversion; 1959 mutex_init(&ire->ire_lock, NULL, MUTEX_DEFAULT, NULL); 1960 if (ipversion == IPV4_VERSION) { 1961 if (ire_nce_init(ire, fp_mp, dlureq_mp) != 0) { 1962 /* some failure occurred; propagate the error back */ 1963 return (B_FALSE); 1964 } 1965 } else { 1966 ASSERT(ipversion == IPV6_VERSION); 1967 /* 1968 * IPv6 initializes the ire_nce in ire_add_v6, 1969 * which expects to find the ire_nce to be null 1970 * when it is called. 1971 */ 1972 if (dlureq_mp) 1973 freemsg(dlureq_mp); 1974 if (fp_mp) 1975 freemsg(fp_mp); 1976 } 1977 ire->ire_refcnt = 1; 1978 ire->ire_ipst = ipst; /* No netstack_hold */ 1979 1980 #ifdef IRE_DEBUG 1981 bzero(ire->ire_trace, sizeof (th_trace_t *) * IP_TR_HASH_MAX); 1982 #endif 1983 1984 return (B_TRUE); 1985 } 1986 1987 /* 1988 * This routine is called repeatedly by ipif_up to create broadcast IREs. 1989 * It is passed a pointer to a slot in an IRE pointer array into which to 1990 * place the pointer to the new IRE, if indeed we create one. If the 1991 * IRE corresponding to the address passed in would be a duplicate of an 1992 * existing one, we don't create the new one. irep is incremented before 1993 * return only if we do create a new IRE. (Always called as writer.) 1994 * 1995 * Note that with the "match_flags" parameter, we can match on either 1996 * a particular logical interface (MATCH_IRE_IPIF) or all logical 1997 * interfaces for a given physical interface (MATCH_IRE_ILL). 
Currently, 1998 * we only create broadcast ire's on a per physical interface basis. If 1999 * someone is going to be mucking with logical interfaces, it is important 2000 * to call "ipif_check_bcast_ires()" to make sure that any change to a 2001 * logical interface will not cause critical broadcast IRE's to be deleted. 2002 */ 2003 ire_t ** 2004 ire_check_and_create_bcast(ipif_t *ipif, ipaddr_t addr, ire_t **irep, 2005 int match_flags) 2006 { 2007 ire_t *ire; 2008 uint64_t check_flags = IPIF_DEPRECATED | IPIF_NOLOCAL | IPIF_ANYCAST; 2009 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 2010 2011 /* 2012 * No broadcast IREs for the LOOPBACK interface 2013 * or others such as point to point and IPIF_NOXMIT. 2014 */ 2015 if (!(ipif->ipif_flags & IPIF_BROADCAST) || 2016 (ipif->ipif_flags & IPIF_NOXMIT)) 2017 return (irep); 2018 2019 /* If this would be a duplicate, don't bother. */ 2020 if ((ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ipif, 2021 ipif->ipif_zoneid, NULL, match_flags, ipst)) != NULL) { 2022 /* 2023 * We look for non-deprecated (and non-anycast, non-nolocal) 2024 * ipifs as the best choice. ipifs with check_flags matching 2025 * (deprecated, etc) are used only if non-deprecated ipifs 2026 * are not available. If the existing ire's ipif is deprecated 2027 * and the new ipif is non-deprecated, switch to the new ipif. 2028 */ 2029 if ((!(ire->ire_ipif->ipif_flags & check_flags)) || 2030 (ipif->ipif_flags & check_flags)) { 2031 ire_refrele(ire); 2032 return (irep); 2033 } 2034 /* 2035 * Bcast ires exist in pairs. Both have to be deleted. 2036 * Since we are exclusive, we can make the above assertion. 2037 * The 1st has to be refrele'd since it was ctable_lookup'd. 2038 */ 2039 ASSERT(IAM_WRITER_IPIF(ipif)); 2040 ASSERT(ire->ire_next->ire_addr == ire->ire_addr); 2041 ire_delete(ire->ire_next); 2042 ire_delete(ire); 2043 ire_refrele(ire); 2044 } 2045 2046 irep = ire_create_bcast(ipif, addr, irep); 2047 2048 return (irep); 2049 } 2050 2051 uint_t ip_loopback_mtu = IP_LOOPBACK_MTU; 2052 2053 /* 2054 * This routine is called from ipif_check_bcast_ires and 2055 * ire_check_and_create_bcast above. It leaves all the verifying and deleting 2056 * to those routines. So it always creates 2 bcast ires and chains them into the ire array passed in. 
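 *
 * As a caller-side sketch only (the array name below is hypothetical),
 * a caller reserves two slots and gets back the advanced pointer, with
 * the wire copy first and the loopback copy second:
 *
 *	ire_t	*bcast_ires[2];
 *	ire_t	**irep = bcast_ires;
 *
 *	irep = ire_create_bcast(ipif, addr, irep);
 *	(bcast_ires[0] now holds the wire copy with a non-NULL ire_stq,
 *	bcast_ires[1] the loopback copy with a NULL send-to queue)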
*/ 2058 ire_t ** 2059 ire_create_bcast(ipif_t *ipif, ipaddr_t addr, ire_t **irep) 2060 { 2061 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 2062 2063 *irep++ = ire_create( 2064 (uchar_t *)&addr, /* dest addr */ 2065 (uchar_t *)&ip_g_all_ones, /* mask */ 2066 (uchar_t *)&ipif->ipif_src_addr, /* source addr */ 2067 NULL, /* no gateway */ 2068 NULL, /* no in_src_addr */ 2069 &ipif->ipif_mtu, /* max frag */ 2070 NULL, /* fast path header */ 2071 ipif->ipif_rq, /* recv-from queue */ 2072 ipif->ipif_wq, /* send-to queue */ 2073 IRE_BROADCAST, 2074 ipif->ipif_bcast_mp, /* xmit header */ 2075 ipif, 2076 NULL, 2077 0, 2078 0, 2079 0, 2080 0, 2081 &ire_uinfo_null, 2082 NULL, 2083 NULL, 2084 ipst); 2085 2086 *irep++ = ire_create( 2087 (uchar_t *)&addr, /* dest address */ 2088 (uchar_t *)&ip_g_all_ones, /* mask */ 2089 (uchar_t *)&ipif->ipif_src_addr, /* source address */ 2090 NULL, /* no gateway */ 2091 NULL, /* no in_src_addr */ 2092 &ip_loopback_mtu, /* max frag size */ 2093 NULL, /* Fast Path header */ 2094 ipif->ipif_rq, /* recv-from queue */ 2095 NULL, /* no send-to queue */ 2096 IRE_BROADCAST, /* Needed for fanout in wput */ 2097 NULL, 2098 ipif, 2099 NULL, 2100 0, 2101 0, 2102 0, 2103 0, 2104 &ire_uinfo_null, 2105 NULL, 2106 NULL, 2107 ipst); 2108 2109 return (irep); 2110 } 2111 2112 /* 2113 * ire_walk routine to delete or update any IRE_CACHE that might contain 2114 * stale information. 2115 * The flags state which entries to delete or update. 2116 * Garbage collection is done separately using kmem alloc callbacks to 2117 * ip_trash_ire_reclaim. 2118 * Used for both IPv4 and IPv6. However, IPv6 only uses FLUSH_MTU_TIME 2119 * since other stale information is cleaned up using NUD. 2120 */ 2121 void 2122 ire_expire(ire_t *ire, char *arg) 2123 { 2124 ire_expire_arg_t *ieap = (ire_expire_arg_t *)(uintptr_t)arg; 2125 ill_t *stq_ill; 2126 int flush_flags = ieap->iea_flush_flag; 2127 ip_stack_t *ipst = ieap->iea_ipst; 2128 2129 if ((flush_flags & FLUSH_REDIRECT_TIME) && 2130 (ire->ire_flags & RTF_DYNAMIC)) { 2131 /* Make sure we delete the corresponding IRE_CACHE */ 2132 ip1dbg(("ire_expire: all redirects\n")); 2133 ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst); 2134 ire_delete(ire); 2135 atomic_dec_32(&ipst->ips_ip_redirect_cnt); 2136 return; 2137 } 2138 if (ire->ire_type != IRE_CACHE) 2139 return; 2140 2141 if (flush_flags & FLUSH_ARP_TIME) { 2142 /* 2143 * Remove this IRE_CACHE if its create time is 2144 * more than 2145 * ip_ire_arp_interval milliseconds ago. 2146 */ 2147 if (NCE_EXPIRED(ire->ire_nce, ipst)) { 2148 ire_delete(ire); 2149 return; 2150 } 2151 } 2152 2153 if (ipst->ips_ip_path_mtu_discovery && (flush_flags & FLUSH_MTU_TIME) && 2154 (ire->ire_ipif != NULL)) { 2155 /* Increase pmtu if it is less than the interface mtu */ 2156 mutex_enter(&ire->ire_lock); 2157 /* 2158 * If the ipif is a vni (whose mtu is 0, since it's virtual) 2159 * get the mtu from the sending interface's ipif 2160 */ 2161 if (IS_VNI(ire->ire_ipif->ipif_ill)) { 2162 stq_ill = ire->ire_stq->q_ptr; 2163 ire->ire_max_frag = MIN(stq_ill->ill_ipif->ipif_mtu, 2164 IP_MAXPACKET); 2165 } else { 2166 ire->ire_max_frag = MIN(ire->ire_ipif->ipif_mtu, 2167 IP_MAXPACKET); 2168 } 2169 ire->ire_frag_flag |= IPH_DF; 2170 mutex_exit(&ire->ire_lock); 2171 } 2172 } 2173 2174 /* 2175 * Return any local address. We use this to target ourselves 2176 * when the src address was specified as 'default'. 2177 * Preference is given to IRE_LOCAL entries. 
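 *
 * The IRE returned here has been IRE_REFHOLD'd, so the caller is
 * expected to release it when done; a minimal sketch (the variable
 * name is illustrative):
 *
 *	ire_t	*lire = ire_lookup_local(zoneid, ipst);
 *
 *	if (lire != NULL) {
 *		... use lire ...
 *		ire_refrele(lire);
 *	}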
*/ 2179 ire_t * 2180 ire_lookup_local(zoneid_t zoneid, ip_stack_t *ipst) 2181 { 2182 ire_t *ire; 2183 irb_t *irb; 2184 ire_t *maybe = NULL; 2185 int i; 2186 2187 for (i = 0; i < ipst->ips_ip_cache_table_size; i++) { 2188 irb = &ipst->ips_ip_cache_table[i]; 2189 if (irb->irb_ire == NULL) 2190 continue; 2191 rw_enter(&irb->irb_lock, RW_READER); 2192 for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { 2193 if ((ire->ire_marks & IRE_MARK_CONDEMNED) || 2194 (ire->ire_zoneid != zoneid && 2195 ire->ire_zoneid != ALL_ZONES)) 2196 continue; 2197 switch (ire->ire_type) { 2198 case IRE_LOOPBACK: 2199 if (maybe == NULL) { 2200 IRE_REFHOLD(ire); 2201 maybe = ire; 2202 } 2203 break; 2204 case IRE_LOCAL: 2205 if (maybe != NULL) { 2206 ire_refrele(maybe); 2207 } 2208 IRE_REFHOLD(ire); 2209 rw_exit(&irb->irb_lock); 2210 return (ire); 2211 } 2212 } 2213 rw_exit(&irb->irb_lock); 2214 } 2215 return (maybe); 2216 } 2217 2218 /* 2219 * If the specified IRE is associated with a particular ILL, return 2220 * that ILL pointer (may be called as writer). 2221 * 2222 * NOTE : This is not a generic function that can always be used. 2223 * This function always returns the ill of the outgoing packets 2224 * if this ire is used. 2225 */ 2226 ill_t * 2227 ire_to_ill(const ire_t *ire) 2228 { 2229 ill_t *ill = NULL; 2230 2231 /* 2232 * 1) For an IRE_CACHE, ire_ipif is the one where it obtained 2233 * the source address from. ire_stq is the one where the 2234 * packets will be sent out on. We return that here. 2235 * 2236 * 2) IRE_BROADCAST normally has a loopback and a non-loopback 2237 * copy and they always exist next to each other with loopback 2238 * copy being the first one. If we are called on the non-loopback 2239 * copy, return the one pointed by ire_stq. If it was called on 2240 * a loopback copy, we still return the one pointed by the next 2241 * ire's ire_stq pointer i.e. the one pointed by the non-loopback 2242 * copy. We don't want to use ire_ipif as it might represent the 2243 * source address (if we borrow source addresses for 2244 * IRE_BROADCASTS in the future). 2245 * However if an interface is currently coming up, the above 2246 * condition may not hold during that period since the ires 2247 * are added one at a time. Thus one of the pair could have been 2248 * added and the other not yet added. 2249 * 3) For many other IREs (e.g., IRE_LOCAL), ire_rfq indicates the ill. 2250 * 4) For all others return the ones pointed by ire_ipif->ipif_ill. 2251 * That handles IRE_LOOPBACK. 2252 */ 2253 2254 if (ire->ire_type == IRE_CACHE) { 2255 ill = (ill_t *)ire->ire_stq->q_ptr; 2256 } else if (ire->ire_type == IRE_BROADCAST) { 2257 if (ire->ire_stq != NULL) { 2258 ill = (ill_t *)ire->ire_stq->q_ptr; 2259 } else { 2260 ire_t *ire_next; 2261 2262 ire_next = ire->ire_next; 2263 if (ire_next != NULL && 2264 ire_next->ire_type == IRE_BROADCAST && 2265 ire_next->ire_addr == ire->ire_addr && 2266 ire_next->ire_ipif == ire->ire_ipif) { 2267 ill = (ill_t *)ire_next->ire_stq->q_ptr; 2268 } 2269 } 2270 } else if (ire->ire_rfq != NULL) { 2271 ill = ire->ire_rfq->q_ptr; 2272 } else if (ire->ire_ipif != NULL) { 2273 ill = ire->ire_ipif->ipif_ill; 2274 } 2275 return (ill); 2276 } 2277 2278 /* Arrange to call the specified function for every IRE in the world. 
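 *
 * A minimal usage sketch (the callback name is hypothetical): every
 * IRE is handed to the callback together with the opaque argument:
 *
 *	static void
 *	ire_count_cb(ire_t *ire, char *arg)
 *	{
 *		(*(uint_t *)arg)++;
 *	}
 *
 *	uint_t nires = 0;
 *	ire_walk(ire_count_cb, &nires, ipst);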
*/ 2279 void 2280 ire_walk(pfv_t func, void *arg, ip_stack_t *ipst) 2281 { 2282 ire_walk_ipvers(func, arg, 0, ALL_ZONES, ipst); 2283 } 2284 2285 void 2286 ire_walk_v4(pfv_t func, void *arg, zoneid_t zoneid, ip_stack_t *ipst) 2287 { 2288 ire_walk_ipvers(func, arg, IPV4_VERSION, zoneid, ipst); 2289 } 2290 2291 void 2292 ire_walk_v6(pfv_t func, void *arg, zoneid_t zoneid, ip_stack_t *ipst) 2293 { 2294 ire_walk_ipvers(func, arg, IPV6_VERSION, zoneid, ipst); 2295 } 2296 2297 /* 2298 * Walk a particular version. version == 0 means both v4 and v6. 2299 */ 2300 static void 2301 ire_walk_ipvers(pfv_t func, void *arg, uchar_t vers, zoneid_t zoneid, 2302 ip_stack_t *ipst) 2303 { 2304 if (vers != IPV6_VERSION) { 2305 /* 2306 * ip_forwarding_table variable doesn't matter for IPv4 since 2307 * ire_walk_ill_tables uses ips_ip_ftable for IPv4. 2308 */ 2309 ire_walk_ill_tables(0, 0, func, arg, IP_MASK_TABLE_SIZE, 2310 0, NULL, 2311 ipst->ips_ip_cache_table_size, ipst->ips_ip_cache_table, 2312 NULL, zoneid, ipst); 2313 } 2314 if (vers != IPV4_VERSION) { 2315 ire_walk_ill_tables(0, 0, func, arg, IP6_MASK_TABLE_SIZE, 2316 ipst->ips_ip6_ftable_hash_size, 2317 ipst->ips_ip_forwarding_table_v6, 2318 ipst->ips_ip6_cache_table_size, 2319 ipst->ips_ip_cache_table_v6, NULL, zoneid, ipst); 2320 } 2321 } 2322 2323 /* 2324 * Arrange to call the specified 2325 * function for every IRE that matches the ill. 2326 */ 2327 void 2328 ire_walk_ill(uint_t match_flags, uint_t ire_type, pfv_t func, void *arg, 2329 ill_t *ill) 2330 { 2331 ire_walk_ill_ipvers(match_flags, ire_type, func, arg, 0, ill); 2332 } 2333 2334 void 2335 ire_walk_ill_v4(uint_t match_flags, uint_t ire_type, pfv_t func, void *arg, 2336 ill_t *ill) 2337 { 2338 ire_walk_ill_ipvers(match_flags, ire_type, func, arg, IPV4_VERSION, 2339 ill); 2340 } 2341 2342 void 2343 ire_walk_ill_v6(uint_t match_flags, uint_t ire_type, pfv_t func, void *arg, 2344 ill_t *ill) 2345 { 2346 ire_walk_ill_ipvers(match_flags, ire_type, func, arg, IPV6_VERSION, 2347 ill); 2348 } 2349 2350 /* 2351 * Walk a particular ill and version. version == 0 means both v4 and v6. 2352 */ 2353 static void 2354 ire_walk_ill_ipvers(uint_t match_flags, uint_t ire_type, pfv_t func, 2355 void *arg, uchar_t vers, ill_t *ill) 2356 { 2357 ip_stack_t *ipst = ill->ill_ipst; 2358 2359 if (vers != IPV6_VERSION) { 2360 ire_walk_ill_tables(match_flags, ire_type, func, arg, 2361 IP_MASK_TABLE_SIZE, 0, 2362 NULL, ipst->ips_ip_cache_table_size, 2363 ipst->ips_ip_cache_table, ill, ALL_ZONES, ipst); 2364 } 2365 if (vers != IPV4_VERSION) { 2366 ire_walk_ill_tables(match_flags, ire_type, func, arg, 2367 IP6_MASK_TABLE_SIZE, ipst->ips_ip6_ftable_hash_size, 2368 ipst->ips_ip_forwarding_table_v6, 2369 ipst->ips_ip6_cache_table_size, 2370 ipst->ips_ip_cache_table_v6, ill, ALL_ZONES, ipst); 2371 } 2372 } 2373 2374 boolean_t 2375 ire_walk_ill_match(uint_t match_flags, uint_t ire_type, ire_t *ire, 2376 ill_t *ill, zoneid_t zoneid, ip_stack_t *ipst) 2377 { 2378 ill_t *ire_stq_ill = NULL; 2379 ill_t *ire_ipif_ill = NULL; 2380 ill_group_t *ire_ill_group = NULL; 2381 2382 ASSERT(match_flags != 0 || zoneid != ALL_ZONES); 2383 /* 2384 * 1) MATCH_IRE_WQ : Used specifically to match on ire_stq. 2385 * The fast path update uses this to make sure it does not 2386 * update the fast path header of interface X with the fast 2387 * path updates it received on interface Y. Similar handling 2388 * is done for DL_NOTE_FASTPATH_FLUSH. 2389 * 2390 * 2) MATCH_IRE_ILL/MATCH_IRE_ILL_GROUP : We match both on ill 2391 * pointed by ire_stq and ire_ipif. 
Only in the case of 2392 * IRE_CACHEs can ire_stq and ire_ipif be pointing to 2393 * different ills. But we want to keep this function generic 2394 * enough for future use. So, we always try to match on both. 2395 * The only caller of this function, ire_walk_ill_tables, will 2396 * call "func" after we return from this function. We expect 2397 * "func" to do the right filtering of ires in this case. 2398 * 2399 * NOTE : In the case of MATCH_IRE_ILL_GROUP, groups 2400 * pointed by ire_stq and ire_ipif should always be the same. 2401 * So, we just match on only one of them. 2402 */ 2403 if (match_flags & (MATCH_IRE_ILL|MATCH_IRE_ILL_GROUP)) { 2404 if (ire->ire_stq != NULL) 2405 ire_stq_ill = (ill_t *)ire->ire_stq->q_ptr; 2406 if (ire->ire_ipif != NULL) 2407 ire_ipif_ill = ire->ire_ipif->ipif_ill; 2408 if (ire_stq_ill != NULL) 2409 ire_ill_group = ire_stq_ill->ill_group; 2410 if ((ire_ill_group == NULL) && (ire_ipif_ill != NULL)) 2411 ire_ill_group = ire_ipif_ill->ill_group; 2412 } 2413 2414 if (zoneid != ALL_ZONES) { 2415 /* 2416 * We're walking the IREs for a specific zone. The only relevant 2417 * IREs are: 2418 * - all IREs with a matching ire_zoneid 2419 * - all IRE_OFFSUBNETs as they're shared across all zones 2420 * - IRE_INTERFACE IREs for interfaces with a usable source addr 2421 * with a matching zone 2422 * - IRE_DEFAULTs with a gateway reachable from the zone 2423 * We should really match on IRE_OFFSUBNETs and IRE_DEFAULTs 2424 * using the same rule; but the above rules are consistent with 2425 * the behavior of ire_ftable_lookup[_v6]() so that all the 2426 * routes that can be matched during lookup are also matched 2427 * here. 2428 */ 2429 if (zoneid != ire->ire_zoneid && ire->ire_zoneid != ALL_ZONES) { 2430 /* 2431 * Note, IRE_INTERFACE can have the stq as NULL. For 2432 * example, if the default multicast route is tied to 2433 * the loopback address. 2434 */ 2435 if ((ire->ire_type & IRE_INTERFACE) && 2436 (ire->ire_stq != NULL)) { 2437 ire_stq_ill = (ill_t *)ire->ire_stq->q_ptr; 2438 if (ire->ire_ipversion == IPV4_VERSION) { 2439 if (!ipif_usesrc_avail(ire_stq_ill, 2440 zoneid)) 2441 /* No usable src addr in zone */ 2442 return (B_FALSE); 2443 } else if (ire_stq_ill->ill_usesrc_ifindex 2444 != 0) { 2445 /* 2446 * For IPv6 use ipif_select_source_v6() 2447 * so the right scope selection is done 2448 */ 2449 ipif_t *src_ipif; 2450 src_ipif = 2451 ipif_select_source_v6(ire_stq_ill, 2452 &ire->ire_addr_v6, RESTRICT_TO_NONE, 2453 IPV6_PREFER_SRC_DEFAULT, 2454 zoneid); 2455 if (src_ipif != NULL) { 2456 ipif_refrele(src_ipif); 2457 } else { 2458 return (B_FALSE); 2459 } 2460 } else { 2461 return (B_FALSE); 2462 } 2463 2464 } else if (!(ire->ire_type & IRE_OFFSUBNET)) { 2465 return (B_FALSE); 2466 } 2467 } 2468 2469 /* 2470 * Match all default routes from the global zone, irrespective 2471 * of reachability. For a non-global zone only match those 2472 * where ire_gateway_addr has an IRE_INTERFACE for the zoneid. 
2473 */ 2474 if (ire->ire_type == IRE_DEFAULT && zoneid != GLOBAL_ZONEID) { 2475 int ire_match_flags = 0; 2476 in6_addr_t gw_addr_v6; 2477 ire_t *rire; 2478 2479 ire_match_flags |= MATCH_IRE_TYPE; 2480 if (ire->ire_ipif != NULL) { 2481 ire_match_flags |= MATCH_IRE_ILL_GROUP; 2482 } 2483 if (ire->ire_ipversion == IPV4_VERSION) { 2484 rire = ire_route_lookup(ire->ire_gateway_addr, 2485 0, 0, IRE_INTERFACE, ire->ire_ipif, NULL, 2486 zoneid, NULL, ire_match_flags, ipst); 2487 } else { 2488 ASSERT(ire->ire_ipversion == IPV6_VERSION); 2489 mutex_enter(&ire->ire_lock); 2490 gw_addr_v6 = ire->ire_gateway_addr_v6; 2491 mutex_exit(&ire->ire_lock); 2492 rire = ire_route_lookup_v6(&gw_addr_v6, 2493 NULL, NULL, IRE_INTERFACE, ire->ire_ipif, 2494 NULL, zoneid, NULL, ire_match_flags, ipst); 2495 } 2496 if (rire == NULL) { 2497 return (B_FALSE); 2498 } 2499 ire_refrele(rire); 2500 } 2501 } 2502 2503 if (((!(match_flags & MATCH_IRE_TYPE)) || 2504 (ire->ire_type & ire_type)) && 2505 ((!(match_flags & MATCH_IRE_WQ)) || 2506 (ire->ire_stq == ill->ill_wq)) && 2507 ((!(match_flags & MATCH_IRE_ILL)) || 2508 (ire_stq_ill == ill || ire_ipif_ill == ill)) && 2509 ((!(match_flags & MATCH_IRE_ILL_GROUP)) || 2510 (ire_stq_ill == ill) || (ire_ipif_ill == ill) || 2511 (ire_ill_group != NULL && 2512 ire_ill_group == ill->ill_group))) { 2513 return (B_TRUE); 2514 } 2515 return (B_FALSE); 2516 } 2517 2518 int 2519 rtfunc(struct radix_node *rn, void *arg) 2520 { 2521 struct rtfuncarg *rtf = arg; 2522 struct rt_entry *rt; 2523 irb_t *irb; 2524 ire_t *ire; 2525 boolean_t ret; 2526 2527 rt = (struct rt_entry *)rn; 2528 ASSERT(rt != NULL); 2529 irb = &rt->rt_irb; 2530 for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { 2531 if ((rtf->rt_match_flags != 0) || 2532 (rtf->rt_zoneid != ALL_ZONES)) { 2533 ret = ire_walk_ill_match(rtf->rt_match_flags, 2534 rtf->rt_ire_type, ire, 2535 rtf->rt_ill, rtf->rt_zoneid, rtf->rt_ipst); 2536 } else 2537 ret = B_TRUE; 2538 if (ret) 2539 (*rtf->rt_func)(ire, rtf->rt_arg); 2540 } 2541 return (0); 2542 } 2543 2544 /* 2545 * Walk the ftable and the ctable entries that match the ill. 2546 */ 2547 void 2548 ire_walk_ill_tables(uint_t match_flags, uint_t ire_type, pfv_t func, 2549 void *arg, size_t ftbl_sz, size_t htbl_sz, irb_t **ipftbl, 2550 size_t ctbl_sz, irb_t *ipctbl, ill_t *ill, zoneid_t zoneid, 2551 ip_stack_t *ipst) 2552 { 2553 irb_t *irb_ptr; 2554 irb_t *irb; 2555 ire_t *ire; 2556 int i, j; 2557 boolean_t ret; 2558 struct rtfuncarg rtfarg; 2559 2560 ASSERT((!(match_flags & (MATCH_IRE_WQ | MATCH_IRE_ILL | 2561 MATCH_IRE_ILL_GROUP))) || (ill != NULL)); 2562 ASSERT(!(match_flags & MATCH_IRE_TYPE) || (ire_type != 0)); 2563 /* 2564 * Optimize by not looking at the forwarding table if there 2565 * is a MATCH_IRE_TYPE specified with no IRE_FORWARDTABLE 2566 * specified in ire_type. 
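 *
 * For example, a walk invoked with MATCH_IRE_TYPE and ire_type set to
 * IRE_CACHE has nothing to find here, so the whole forwarding-table
 * pass below is skipped and only the cache table is scanned.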
*/ 2568 if (!(match_flags & MATCH_IRE_TYPE) || 2569 ((ire_type & IRE_FORWARDTABLE) != 0)) { 2570 /* This flattened-table walk applies only to the v6 case */ 2571 if (ipftbl == ipst->ips_ip_forwarding_table_v6) { 2572 for (i = (ftbl_sz - 1); i >= 0; i--) { 2573 if ((irb_ptr = ipftbl[i]) == NULL) 2574 continue; 2575 for (j = 0; j < htbl_sz; j++) { 2576 irb = &irb_ptr[j]; 2577 if (irb->irb_ire == NULL) 2578 continue; 2579 2580 IRB_REFHOLD(irb); 2581 for (ire = irb->irb_ire; ire != NULL; 2582 ire = ire->ire_next) { 2583 if (match_flags == 0 && 2584 zoneid == ALL_ZONES) { 2585 ret = B_TRUE; 2586 } else { 2587 ret = 2588 ire_walk_ill_match( 2589 match_flags, 2590 ire_type, ire, ill, 2591 zoneid, ipst); 2592 } 2593 if (ret) 2594 (*func)(ire, arg); 2595 } 2596 IRB_REFRELE(irb); 2597 } 2598 } 2599 } else { 2600 (void) memset(&rtfarg, 0, sizeof (rtfarg)); 2601 rtfarg.rt_func = func; 2602 rtfarg.rt_arg = arg; 2603 if (match_flags != 0) { 2604 rtfarg.rt_match_flags = match_flags; 2605 } 2606 rtfarg.rt_ire_type = ire_type; 2607 rtfarg.rt_ill = ill; 2608 rtfarg.rt_zoneid = zoneid; 2609 rtfarg.rt_ipst = ipst; /* No netstack_hold */ 2610 (void) ipst->ips_ip_ftable->rnh_walktree_mt( 2611 ipst->ips_ip_ftable, 2612 rtfunc, &rtfarg, irb_refhold_rn, irb_refrele_rn); 2613 } 2614 } 2615 2616 /* 2617 * Optimize by not looking at the cache table if there 2618 * is a MATCH_IRE_TYPE specified with no IRE_CACHETABLE 2619 * specified in ire_type. 2620 */ 2621 if (!(match_flags & MATCH_IRE_TYPE) || 2622 ((ire_type & IRE_CACHETABLE) != 0)) { 2623 for (i = 0; i < ctbl_sz; i++) { 2624 irb = &ipctbl[i]; 2625 if (irb->irb_ire == NULL) 2626 continue; 2627 IRB_REFHOLD(irb); 2628 for (ire = irb->irb_ire; ire != NULL; 2629 ire = ire->ire_next) { 2630 if (match_flags == 0 && zoneid == ALL_ZONES) { 2631 ret = B_TRUE; 2632 } else { 2633 ret = ire_walk_ill_match( 2634 match_flags, ire_type, 2635 ire, ill, zoneid, ipst); 2636 } 2637 if (ret) 2638 (*func)(ire, arg); 2639 } 2640 IRB_REFRELE(irb); 2641 } 2642 } 2643 } 2644 2645 /* 2646 * This routine walks through the ill chain to find out if any ire is linked 2647 * to the ill's interface based forwarding table. 2648 * The arg could be ill or mp. This routine is called when an ill goes 2649 * down or is deleted, or when the 'ipv4_ire_srcif_status' report is printed. 2650 */ 2651 void 2652 ire_walk_srcif_table_v4(pfv_t func, void *arg, ip_stack_t *ipst) 2653 { 2654 irb_t *irb; 2655 ire_t *ire; 2656 ill_t *ill, *next_ill; 2657 int i; 2658 int total_count; 2659 ill_walk_context_t ctx; 2660 2661 /* 2662 * Take care of ire's in other ill's per-interface forwarding 2663 * table. Check if any ire in any of the ill's ill_srcif_table 2664 * is pointing to this ill. 
*/ 2666 mutex_enter(&ipst->ips_ire_srcif_table_lock); 2667 if (ipst->ips_ire_srcif_table_count == 0) { 2668 mutex_exit(&ipst->ips_ire_srcif_table_lock); 2669 return; 2670 } 2671 mutex_exit(&ipst->ips_ire_srcif_table_lock); 2672 2673 #ifdef DEBUG 2674 /* Keep accounting of all interface based table ires */ 2675 total_count = 0; 2676 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 2677 ill = ILL_START_WALK_V4(&ctx, ipst); 2678 while (ill != NULL) { 2679 mutex_enter(&ill->ill_lock); 2680 total_count += ill->ill_srcif_refcnt; 2681 next_ill = ill_next(&ctx, ill); 2682 mutex_exit(&ill->ill_lock); 2683 ill = next_ill; 2684 } 2685 rw_exit(&ipst->ips_ill_g_lock); 2686 2687 /* Hold lock here to make sure ire_srcif_table_count is stable */ 2688 mutex_enter(&ipst->ips_ire_srcif_table_lock); 2689 i = ipst->ips_ire_srcif_table_count; 2690 mutex_exit(&ipst->ips_ire_srcif_table_lock); 2691 ip1dbg(("ire_walk_srcif_v4: ire_srcif_table_count %d " 2692 "total ill_srcif_refcnt %d\n", i, total_count)); 2693 #endif 2694 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 2695 ill = ILL_START_WALK_V4(&ctx, ipst); 2696 while (ill != NULL) { 2697 mutex_enter(&ill->ill_lock); 2698 if ((ill->ill_srcif_refcnt == 0) || !ILL_CAN_LOOKUP(ill)) { 2699 next_ill = ill_next(&ctx, ill); 2700 mutex_exit(&ill->ill_lock); 2701 ill = next_ill; 2702 continue; 2703 } 2704 ill_refhold_locked(ill); 2705 mutex_exit(&ill->ill_lock); 2706 rw_exit(&ipst->ips_ill_g_lock); 2707 if (ill->ill_srcif_table != NULL) { 2708 for (i = 0; i < IP_SRCIF_TABLE_SIZE; i++) { 2709 irb = &(ill->ill_srcif_table[i]); 2710 if (irb->irb_ire == NULL) 2711 continue; 2712 IRB_REFHOLD(irb); 2713 for (ire = irb->irb_ire; ire != NULL; 2714 ire = ire->ire_next) { 2715 (*func)(ire, arg); 2716 } 2717 IRB_REFRELE(irb); 2718 } 2719 } 2720 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 2721 next_ill = ill_next(&ctx, ill); 2722 ill_refrele(ill); 2723 ill = next_ill; 2724 } 2725 rw_exit(&ipst->ips_ill_g_lock); 2726 } 2727 2728 /* 2729 * This function takes a mask and returns the 2730 * number of bits set in the mask. If no 2731 * bit is set, it returns 0. 2732 * It assumes a contiguous mask. 2733 */ 2734 int 2735 ip_mask_to_plen(ipaddr_t mask) 2736 { 2737 return (mask == 0 ? 0 : IP_ABITS - (ffs(ntohl(mask)) -1)); 2738 } 2739 2740 /* 2741 * Convert a mask length to the corresponding mask. 2742 */ 2743 ipaddr_t 2744 ip_plen_to_mask(uint_t masklen) 2745 { 2746 return (htonl(IP_HOST_MASK << (IP_ABITS - masklen))); 2747 } 2748 2749 void 2750 ire_atomic_end(irb_t *irb_ptr, ire_t *ire) 2751 { 2752 ill_t *ill_list[NUM_ILLS]; 2753 ip_stack_t *ipst = ire->ire_ipst; 2754 2755 ill_list[0] = ire->ire_stq != NULL ? ire->ire_stq->q_ptr : NULL; 2756 ill_list[1] = ire->ire_ipif != NULL ? ire->ire_ipif->ipif_ill : NULL; 2757 ill_list[2] = ire->ire_in_ill; 2758 ill_unlock_ills(ill_list, NUM_ILLS); 2759 rw_exit(&irb_ptr->irb_lock); 2760 rw_exit(&ipst->ips_ill_g_usesrc_lock); 2761 } 2762 2763 /* 2764 * ire_add_v[46] atomically make sure that the ipif or ill associated 2765 * with the new ire being added is stable and not IPIF_CHANGING or ILL_CHANGING 2766 * before adding the ire to the table. This ensures that we don't create 2767 * new IRE_CACHEs with stale values for parameters that are passed to 2768 * ire_create such as ire_max_frag. Note that ire_create() is passed a pointer 2769 * to the ipif_mtu, and not the value. The actual value is derived from the 2770 * parent ire or ipif under the bucket lock. 
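 *
 * As a concrete sketch of that pointer-versus-value contract (this is
 * what ire_add_v4() below does once the bucket lock is held):
 *
 *	uint_t	max_frag;
 *
 *	max_frag = *ire->ire_max_fragp;		(read the current ipif_mtu)
 *	ire->ire_max_fragp = NULL;
 *	ire->ire_max_frag = max_frag;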
*/ 2772 int 2773 ire_atomic_start(irb_t *irb_ptr, ire_t *ire, queue_t *q, mblk_t *mp, 2774 ipsq_func_t func) 2775 { 2776 ill_t *stq_ill; 2777 ill_t *ipif_ill; 2778 ill_t *in_ill; 2779 ill_t *ill_list[NUM_ILLS]; 2780 int cnt = NUM_ILLS; 2781 int error = 0; 2782 ill_t *ill = NULL; 2783 ip_stack_t *ipst = ire->ire_ipst; 2784 2785 ill_list[0] = stq_ill = ire->ire_stq != 2786 NULL ? ire->ire_stq->q_ptr : NULL; 2787 ill_list[1] = ipif_ill = ire->ire_ipif != 2788 NULL ? ire->ire_ipif->ipif_ill : NULL; 2789 ill_list[2] = in_ill = ire->ire_in_ill; 2790 2791 ASSERT((q != NULL && mp != NULL && func != NULL) || 2792 (q == NULL && mp == NULL && func == NULL)); 2793 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER); 2794 GRAB_CONN_LOCK(q); 2795 rw_enter(&irb_ptr->irb_lock, RW_WRITER); 2796 ill_lock_ills(ill_list, cnt); 2797 2798 /* 2799 * While the IRE is in the process of being added, a user may have 2800 * invoked the ifconfig usesrc option on the stq_ill to make it a 2801 * usesrc client ILL. Check for this possibility here; if it is true, 2802 * we fail adding the IRE_CACHE. Another check is to make sure 2803 * that an ipif_ill of an IRE_CACHE being added is not part of a usesrc 2804 * group. The ill_g_usesrc_lock is released in ire_atomic_end. 2805 */ 2806 if ((ire->ire_type & IRE_CACHE) && 2807 (ire->ire_marks & IRE_MARK_USESRC_CHECK)) { 2808 if (stq_ill->ill_usesrc_ifindex != 0) { 2809 ASSERT(stq_ill->ill_usesrc_grp_next != NULL); 2810 if ((ipif_ill->ill_phyint->phyint_ifindex != 2811 stq_ill->ill_usesrc_ifindex) || 2812 (ipif_ill->ill_usesrc_grp_next == NULL) || 2813 (ipif_ill->ill_usesrc_ifindex != 0)) { 2814 error = EINVAL; 2815 goto done; 2816 } 2817 } else if (ipif_ill->ill_usesrc_grp_next != NULL) { 2818 error = EINVAL; 2819 goto done; 2820 } 2821 } 2822 2823 /* 2824 * IPMP flag settings happen without taking the exclusive route 2825 * in ip_sioctl_flags. So we need to make an atomic check here 2826 * for FAILED/OFFLINE/INACTIVE flags or if it has hit the 2827 * FAILBACK=no case. 2828 */ 2829 if ((stq_ill != NULL) && !IAM_WRITER_ILL(stq_ill)) { 2830 if (stq_ill->ill_state_flags & ILL_CHANGING) { 2831 ill = stq_ill; 2832 error = EAGAIN; 2833 } else if ((stq_ill->ill_phyint->phyint_flags & PHYI_OFFLINE) || 2834 (ill_is_probeonly(stq_ill) && 2835 !(ire->ire_marks & IRE_MARK_HIDDEN))) { 2836 error = EINVAL; 2837 } 2838 goto done; 2839 } 2840 2841 /* 2842 * We don't check for OFFLINE/FAILED in this case because 2843 * the source address selection logic (ipif_select_source) 2844 * may still select a source address from such an ill. The 2845 * assumption is that these addresses will be moved by in.mpathd 2846 * soon (i.e. this is a race). However link local addresses 2847 * will not move and hence ipif_select_source_v6 tries to avoid 2848 * FAILED ills. 
Please see ipif_select_source_v6 for more info. 2849 */ 2850 if ((ipif_ill != NULL) && !IAM_WRITER_ILL(ipif_ill) && 2851 (ipif_ill->ill_state_flags & ILL_CHANGING)) { 2852 ill = ipif_ill; 2853 error = EAGAIN; 2854 goto done; 2855 } 2856 2857 if ((in_ill != NULL) && !IAM_WRITER_ILL(in_ill) && 2858 (in_ill->ill_state_flags & ILL_CHANGING)) { 2859 ill = in_ill; 2860 error = EAGAIN; 2861 goto done; 2862 } 2863 2864 if ((ire->ire_ipif != NULL) && !IAM_WRITER_IPIF(ire->ire_ipif) && 2865 (ire->ire_ipif->ipif_state_flags & IPIF_CHANGING)) { 2866 ill = ire->ire_ipif->ipif_ill; 2867 ASSERT(ill != NULL); 2868 error = EAGAIN; 2869 goto done; 2870 } 2871 2872 done: 2873 if (error == EAGAIN && ILL_CAN_WAIT(ill, q)) { 2874 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; 2875 mutex_enter(&ipsq->ipsq_lock); 2876 ire_atomic_end(irb_ptr, ire); 2877 ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); 2878 mutex_exit(&ipsq->ipsq_lock); 2879 error = EINPROGRESS; 2880 } else if (error != 0) { 2881 ire_atomic_end(irb_ptr, ire); 2882 } 2883 2884 RELEASE_CONN_LOCK(q); 2885 return (error); 2886 } 2887 2888 /* 2889 * Add a fully initialized IRE to an appropriate table based on 2890 * ire_type. 2891 * 2892 * allow_unresolved == B_FALSE indicates a legacy code-path call 2893 * that has prohibited the addition of incomplete ire's. In that 2894 * case, if we find an nce that is in a state other 2895 * than ND_REACHABLE, we fail the add. Note that nce_state could be 2896 * something other than ND_REACHABLE if nce_reinit has just 2897 * kicked in and reset the nce. 2898 */ 2899 int 2900 ire_add(ire_t **irep, queue_t *q, mblk_t *mp, ipsq_func_t func, 2901 boolean_t allow_unresolved) 2902 { 2903 ire_t *ire1; 2904 ill_t *stq_ill = NULL; 2905 ill_t *ill; 2906 ipif_t *ipif = NULL; 2907 ill_walk_context_t ctx; 2908 ire_t *ire = *irep; 2909 int error; 2910 boolean_t ire_is_mblk = B_FALSE; 2911 tsol_gcgrp_t *gcgrp = NULL; 2912 tsol_gcgrp_addr_t ga; 2913 ip_stack_t *ipst = ire->ire_ipst; 2914 2915 ASSERT(ire->ire_type != IRE_MIPRTUN); 2916 2917 /* get ready for the day when the original ire is not created as an mblk */ 2918 if (ire->ire_mp != NULL) { 2919 ire_is_mblk = B_TRUE; 2920 /* Copy the ire to a kmem_alloc'ed area */ 2921 ire1 = kmem_cache_alloc(ire_cache, KM_NOSLEEP); 2922 if (ire1 == NULL) { 2923 ip1dbg(("ire_add: alloc failed\n")); 2924 ire_delete(ire); 2925 *irep = NULL; 2926 return (ENOMEM); 2927 } 2928 ire->ire_marks &= ~IRE_MARK_UNCACHED; 2929 *ire1 = *ire; 2930 ire1->ire_mp = NULL; 2931 ire1->ire_stq_ifindex = 0; 2932 freeb(ire->ire_mp); 2933 ire = ire1; 2934 } 2935 if (ire->ire_stq != NULL) 2936 stq_ill = (ill_t *)ire->ire_stq->q_ptr; 2937 2938 if (ire->ire_type == IRE_CACHE) { 2939 /* 2940 * If this interface is FAILED, or INACTIVE or has hit 2941 * the FAILBACK=no case, we create IRE_CACHES marked 2942 * HIDDEN for some special cases e.g. bind to 2943 * IPIF_NOFAILOVER address etc. So, if this interface 2944 * is FAILED or INACTIVE or has hit the FAILBACK=no case, and 2945 * we are not creating hidden ires, we should not allow the add. 2946 * This happens because the state of the interface 2947 * changed while we were waiting in ARP. If this is the 2948 * daemon sending probes, the next probe will create 2949 * HIDDEN ires and we will create an ire then. This 2950 * cannot happen with NDP currently because IRE is 2951 * never queued in NDP. But it can happen in the 2952 * future when we have external resolvers with IPv6. 2953 * If the interface gets marked with OFFLINE while we 2954 * are waiting in ARP, don't add the ire. 
*/ 2956 if ((stq_ill->ill_phyint->phyint_flags & PHYI_OFFLINE) || 2957 (ill_is_probeonly(stq_ill) && 2958 !(ire->ire_marks & IRE_MARK_HIDDEN))) { 2959 /* 2960 * We don't know whether it is a valid ipif or not 2961 * unless we do the check below. So, set it to NULL. 2962 */ 2963 ire->ire_ipif = NULL; 2964 ire_delete(ire); 2965 *irep = NULL; 2966 return (EINVAL); 2967 } 2968 } 2969 2970 if (stq_ill != NULL && ire->ire_type == IRE_CACHE && 2971 stq_ill->ill_net_type == IRE_IF_RESOLVER) { 2972 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 2973 ill = ILL_START_WALK_ALL(&ctx, ipst); 2974 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 2975 mutex_enter(&ill->ill_lock); 2976 if (ill->ill_state_flags & ILL_CONDEMNED) { 2977 mutex_exit(&ill->ill_lock); 2978 continue; 2979 } 2980 /* 2981 * We need to make sure that the ipif is a valid one 2982 * before adding the IRE_CACHE. This happens only 2983 * with IRE_CACHE when there is an external resolver. 2984 * 2985 * We can unplumb a logical interface while the 2986 * packet is waiting in ARP with the IRE. Then, 2987 * later on when we feed the IRE back, the ipif 2988 * has to be re-checked. This can't happen with 2989 * NDP currently, as we never queue the IRE with 2990 * the packet. We always try to recreate the IRE 2991 * when the resolution is completed. But, we do 2992 * it for IPv6 also here so that in future if 2993 * we have external resolvers, it will work without 2994 * any change. 2995 */ 2996 ipif = ipif_lookup_seqid(ill, ire->ire_ipif_seqid); 2997 if (ipif != NULL) { 2998 ipif_refhold_locked(ipif); 2999 mutex_exit(&ill->ill_lock); 3000 break; 3001 } 3002 mutex_exit(&ill->ill_lock); 3003 } 3004 rw_exit(&ipst->ips_ill_g_lock); 3005 if (ipif == NULL || 3006 (ipif->ipif_isv6 && 3007 !IN6_ARE_ADDR_EQUAL(&ire->ire_src_addr_v6, 3008 &ipif->ipif_v6src_addr)) || 3009 (!ipif->ipif_isv6 && 3010 ire->ire_src_addr != ipif->ipif_src_addr) || 3011 ire->ire_zoneid != ipif->ipif_zoneid) { 3012 3013 if (ipif != NULL) 3014 ipif_refrele(ipif); 3015 ire->ire_ipif = NULL; 3016 ire_delete(ire); 3017 *irep = NULL; 3018 return (EINVAL); 3019 } 3020 3021 3022 ASSERT(ill != NULL); 3023 /* 3024 * If this group was dismantled while this packet was 3025 * queued in ARP, don't add it here. 3026 */ 3027 if (ire->ire_ipif->ipif_ill->ill_group != ill->ill_group) { 3028 /* We don't want ire_inactive to bump stats for this */ 3029 ipif_refrele(ipif); 3030 ire->ire_ipif = NULL; 3031 ire_delete(ire); 3032 *irep = NULL; 3033 return (EINVAL); 3034 } 3035 3036 /* 3037 * Since we didn't attach label security attributes to the 3038 * ire for the resolver case, we need to add it now. (only 3039 * for v4 resolver and v6 xresolv case). 3040 */ 3041 if (is_system_labeled() && ire_is_mblk) { 3042 if (ire->ire_ipversion == IPV4_VERSION) { 3043 ga.ga_af = AF_INET; 3044 IN6_IPADDR_TO_V4MAPPED(ire->ire_gateway_addr != 3045 INADDR_ANY ? ire->ire_gateway_addr : 3046 ire->ire_addr, &ga.ga_addr); 3047 } else { 3048 ga.ga_af = AF_INET6; 3049 ga.ga_addr = IN6_IS_ADDR_UNSPECIFIED( 3050 &ire->ire_gateway_addr_v6) ? 
3051 ire->ire_addr_v6 : 3052 ire->ire_gateway_addr_v6; 3053 } 3054 gcgrp = gcgrp_lookup(&ga, B_FALSE); 3055 error = tsol_ire_init_gwattr(ire, ire->ire_ipversion, 3056 NULL, gcgrp); 3057 if (error != 0) { 3058 if (gcgrp != NULL) { 3059 GCGRP_REFRELE(gcgrp); 3060 gcgrp = NULL; 3061 } 3062 ipif_refrele(ipif); 3063 ire->ire_ipif = NULL; 3064 ire_delete(ire); 3065 *irep = NULL; 3066 return (error); 3067 } 3068 } 3069 } 3070 3071 /* 3072 * In case the ire was changed 3073 */ 3074 *irep = ire; 3075 if (ire->ire_ipversion == IPV6_VERSION) { 3076 error = ire_add_v6(irep, q, mp, func); 3077 } else { 3078 if (ire->ire_in_ill == NULL) 3079 error = ire_add_v4(irep, q, mp, func, allow_unresolved); 3080 else 3081 error = ire_add_srcif_v4(irep, q, mp, func); 3082 } 3083 if (ipif != NULL) 3084 ipif_refrele(ipif); 3085 return (error); 3086 } 3087 3088 /* 3089 * Add an initialized IRE to an appropriate table based on ire_type. 3090 * 3091 * The forward table contains IRE_PREFIX/IRE_HOST and 3092 * IRE_IF_RESOLVER/IRE_IF_NORESOLVER and IRE_DEFAULT. 3093 * 3094 * The cache table contains IRE_BROADCAST/IRE_LOCAL/IRE_LOOPBACK 3095 * and IRE_CACHE. 3096 * 3097 * NOTE : This function is called as writer, though that is not required 3098 * by this function. 3099 */ 3100 static int 3101 ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func, 3102 boolean_t allow_unresolved) 3103 { 3104 ire_t *ire1; 3105 irb_t *irb_ptr; 3106 ire_t **irep; 3107 int flags; 3108 ire_t *pire = NULL; 3109 ill_t *stq_ill; 3110 ire_t *ire = *ire_p; 3111 int error; 3112 boolean_t need_refrele = B_FALSE; 3113 nce_t *nce; 3114 ip_stack_t *ipst = ire->ire_ipst; 3115 3116 if (ire->ire_ipif != NULL) 3117 ASSERT(!MUTEX_HELD(&ire->ire_ipif->ipif_ill->ill_lock)); 3118 if (ire->ire_stq != NULL) 3119 ASSERT(!MUTEX_HELD( 3120 &((ill_t *)(ire->ire_stq->q_ptr))->ill_lock)); 3121 ASSERT(ire->ire_ipversion == IPV4_VERSION); 3122 ASSERT(ire->ire_mp == NULL); /* Calls should go through ire_add */ 3123 ASSERT(ire->ire_in_ill == NULL); /* No srcif entries */ 3124 3125 /* Find the appropriate list head. */ 3126 switch (ire->ire_type) { 3127 case IRE_HOST: 3128 ire->ire_mask = IP_HOST_MASK; 3129 ire->ire_masklen = IP_ABITS; 3130 if ((ire->ire_flags & RTF_SETSRC) == 0) 3131 ire->ire_src_addr = 0; 3132 break; 3133 case IRE_CACHE: 3134 case IRE_BROADCAST: 3135 case IRE_LOCAL: 3136 case IRE_LOOPBACK: 3137 ire->ire_mask = IP_HOST_MASK; 3138 ire->ire_masklen = IP_ABITS; 3139 break; 3140 case IRE_PREFIX: 3141 if ((ire->ire_flags & RTF_SETSRC) == 0) 3142 ire->ire_src_addr = 0; 3143 break; 3144 case IRE_DEFAULT: 3145 if ((ire->ire_flags & RTF_SETSRC) == 0) 3146 ire->ire_src_addr = 0; 3147 break; 3148 case IRE_IF_RESOLVER: 3149 case IRE_IF_NORESOLVER: 3150 break; 3151 default: 3152 ip0dbg(("ire_add_v4: ire %p has unrecognized IRE type (%d)\n", 3153 (void *)ire, ire->ire_type)); 3154 ire_delete(ire); 3155 *ire_p = NULL; 3156 return (EINVAL); 3157 } 3158 3159 /* Make sure the address is properly masked. */ 3160 ire->ire_addr &= ire->ire_mask; 3161 3162 /* 3163 * ip_newroute/ip_newroute_multi are unable to prevent the deletion 3164 * of the interface route while adding an IRE_CACHE for an on-link 3165 * destination in the IRE_IF_RESOLVER case, since the ire has to 3166 * go to ARP and return. We can't do a REFHOLD on the 3167 * associated interface ire for fear of ARP freeing the message. 3168 * Here we look up the interface ire in the forwarding table and 3169 * make sure that the interface route has not been deleted. 
*/ 3171 if (ire->ire_type == IRE_CACHE && ire->ire_gateway_addr == 0 && 3172 ((ill_t *)ire->ire_stq->q_ptr)->ill_net_type == IRE_IF_RESOLVER) { 3173 3174 ASSERT(ire->ire_max_fragp == NULL); 3175 if (CLASSD(ire->ire_addr) && !(ire->ire_flags & RTF_SETSRC)) { 3176 /* 3177 * The ihandle that we used in ip_newroute_multi 3178 * comes from the interface route corresponding 3179 * to ire_ipif. Look it up here to see if it 3180 * still exists. 3181 * If the ire has a source address assigned using 3182 * RTF_SETSRC, ire_ipif is the logical interface holding 3183 * this source address, so we can't use it to check for 3184 * the existence of the interface route. Instead we rely 3185 * on the brute force ihandle search in 3186 * ire_ihandle_lookup_onlink() below. 3187 */ 3188 pire = ipif_to_ire(ire->ire_ipif); 3189 if (pire == NULL) { 3190 ire_delete(ire); 3191 *ire_p = NULL; 3192 return (EINVAL); 3193 } else if (pire->ire_ihandle != ire->ire_ihandle) { 3194 ire_refrele(pire); 3195 ire_delete(ire); 3196 *ire_p = NULL; 3197 return (EINVAL); 3198 } 3199 } else { 3200 pire = ire_ihandle_lookup_onlink(ire); 3201 if (pire == NULL) { 3202 ire_delete(ire); 3203 *ire_p = NULL; 3204 return (EINVAL); 3205 } 3206 } 3207 /* Prevent pire from getting deleted */ 3208 IRB_REFHOLD(pire->ire_bucket); 3209 /* Has it been removed already? */ 3210 if (pire->ire_marks & IRE_MARK_CONDEMNED) { 3211 IRB_REFRELE(pire->ire_bucket); 3212 ire_refrele(pire); 3213 ire_delete(ire); 3214 *ire_p = NULL; 3215 return (EINVAL); 3216 } 3217 } else { 3218 ASSERT(ire->ire_max_fragp != NULL); 3219 } 3220 flags = (MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW); 3221 3222 if (ire->ire_ipif != NULL) { 3223 /* 3224 * We use MATCH_IRE_IPIF while adding IRE_CACHES only 3225 * for historic reasons and to maintain symmetry with 3226 * IPv6 code path. Historically this was used by 3227 * multicast code to create multiple IRE_CACHES on 3228 * a single ill with different ipifs. This was used 3229 * so that multicast packets leaving the node had the 3230 * right source address. This is no longer needed as 3231 * ip_wput initializes the address correctly. 3232 */ 3233 flags |= MATCH_IRE_IPIF; 3234 /* 3235 * If we are creating hidden ires, make sure we search on 3236 * this ill (MATCH_IRE_ILL) and for a hidden ire, 3237 * while we are searching for duplicates below. Otherwise we 3238 * could potentially find an IRE on some other interface 3239 * and it may not be an IRE marked with IRE_MARK_HIDDEN. We 3240 * shouldn't do that, as it would lead to an infinite loop 3241 * (if we get to ip_wput again), since eventually we need a 3242 * hidden ire for this packet to go out. MATCH_IRE_ILL is 3243 * explicitly done below. 3244 */ 3245 if (ire->ire_type == IRE_CACHE && 3246 (ire->ire_marks & IRE_MARK_HIDDEN)) 3247 flags |= (MATCH_IRE_MARK_HIDDEN); 3248 } 3249 if ((ire->ire_type & IRE_CACHETABLE) == 0) { 3250 irb_ptr = ire_get_bucket(ire); 3251 need_refrele = B_TRUE; 3252 if (irb_ptr == NULL) { 3253 /* 3254 * This assumes that the ire has not added 3255 * a reference to the ipif. 3256 */ 3257 ire->ire_ipif = NULL; 3258 ire_delete(ire); 3259 if (pire != NULL) { 3260 IRB_REFRELE(pire->ire_bucket); 3261 ire_refrele(pire); 3262 } 3263 *ire_p = NULL; 3264 return (EINVAL); 3265 } 3266 } else { 3267 irb_ptr = &(ipst->ips_ip_cache_table[IRE_ADDR_HASH( 3268 ire->ire_addr, ipst->ips_ip_cache_table_size)]); 3269 } 3270 3271 /* 3272 * Start the atomic add of the ire. Grab the ill locks, 3273 * ill_g_usesrc_lock and the bucket lock. 
Check for condemned or changing ills and ipifs. 3274 * 3275 * If ipif or ill is changing ire_atomic_start() may queue the 3276 * request and return EINPROGRESS. 3277 * To avoid lock order problems, get the ndp4->ndp_g_lock. 3278 */ 3279 mutex_enter(&ipst->ips_ndp4->ndp_g_lock); 3280 error = ire_atomic_start(irb_ptr, ire, q, mp, func); 3281 if (error != 0) { 3282 mutex_exit(&ipst->ips_ndp4->ndp_g_lock); 3283 /* 3284 * We don't know whether it is a valid ipif or not. 3285 * So, set it to NULL. This assumes that the ire has not added 3286 * a reference to the ipif. 3287 */ 3288 ire->ire_ipif = NULL; 3289 ire_delete(ire); 3290 if (pire != NULL) { 3291 IRB_REFRELE(pire->ire_bucket); 3292 ire_refrele(pire); 3293 } 3294 *ire_p = NULL; 3295 if (need_refrele) 3296 IRB_REFRELE(irb_ptr); 3297 return (error); 3298 } 3299 /* 3300 * To avoid creating ires having stale values for the ire_max_frag 3301 * we get the latest value atomically here. For more details 3302 * see the block comment in ip_sioctl_mtu and in DL_NOTE_SDU_CHANGE 3303 * in ip_rput_dlpi_writer. 3304 */ 3305 if (ire->ire_max_fragp == NULL) { 3306 if (CLASSD(ire->ire_addr)) 3307 ire->ire_max_frag = ire->ire_ipif->ipif_mtu; 3308 else 3309 ire->ire_max_frag = pire->ire_max_frag; 3310 } else { 3311 uint_t max_frag; 3312 3313 max_frag = *ire->ire_max_fragp; 3314 ire->ire_max_fragp = NULL; 3315 ire->ire_max_frag = max_frag; 3316 } 3317 /* 3318 * Atomically check for duplicate and insert in the table. 3319 */ 3320 for (ire1 = irb_ptr->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) { 3321 if (ire1->ire_marks & IRE_MARK_CONDEMNED) 3322 continue; 3323 if (ire->ire_ipif != NULL) { 3324 /* 3325 * We do MATCH_IRE_ILL implicitly here for IREs 3326 * with a non-null ire_ipif, including IRE_CACHEs. 3327 * As ire_ipif and ire_stq could point to two 3328 * different ills, we can't pass just ire_ipif to 3329 * ire_match_args and get a match on both ills. 3330 * This is just needed for duplicate checks here and 3331 * so we don't add an extra argument to 3332 * ire_match_args for this. Do it locally. 3333 * 3334 * NOTE : Currently there is no part of the code 3335 * that asks for both a MATCH_IRE_IPIF and MATCH_IRE_ILL 3336 * match for IRE_CACHEs. Thus we don't want to 3337 * extend the arguments to ire_match_args. 3338 */ 3339 if (ire1->ire_stq != ire->ire_stq) 3340 continue; 3341 /* 3342 * Multiroute IRE_CACHEs for a given destination can 3343 * have the same ire_ipif, typically if their source 3344 * address is forced using RTF_SETSRC, and the same 3345 * send-to queue. We differentiate them using the parent 3346 * handle. 3347 */ 3348 if (ire->ire_type == IRE_CACHE && 3349 (ire1->ire_flags & RTF_MULTIRT) && 3350 (ire->ire_flags & RTF_MULTIRT) && 3351 (ire1->ire_phandle != ire->ire_phandle)) 3352 continue; 3353 } 3354 if (ire1->ire_zoneid != ire->ire_zoneid) 3355 continue; 3356 if (ire_match_args(ire1, ire->ire_addr, ire->ire_mask, 3357 ire->ire_gateway_addr, ire->ire_type, ire->ire_ipif, 3358 ire->ire_zoneid, 0, NULL, flags)) { 3359 /* 3360 * Return the old ire after doing a REFHOLD. 3361 * As most of the callers continue to use the IRE 3362 * after adding, we return a held ire. This will 3363 * avoid a lookup in the caller again. If the callers 3364 * don't want to use it, they need to do a REFRELE. 
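 *
 * A caller-side sketch (variable names are illustrative): whether the
 * new ire or a held duplicate comes back, exactly one reference must
 * be released when the caller is done with it:
 *
 *	error = ire_add(&ire, NULL, NULL, NULL, B_FALSE);
 *	if (error == 0) {
 *		... use ire ...
 *		ire_refrele(ire);
 *	}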
3365 */ 3366 ip1dbg(("found dup ire existing %p new %p", 3367 (void *)ire1, (void *)ire)); 3368 IRE_REFHOLD(ire1); 3369 ire_atomic_end(irb_ptr, ire); 3370 mutex_exit(&ipst->ips_ndp4->ndp_g_lock); 3371 ire_delete(ire); 3372 if (pire != NULL) { 3373 /* 3374 * Assert that it is not removed from the 3375 * list yet. 3376 */ 3377 ASSERT(pire->ire_ptpn != NULL); 3378 IRB_REFRELE(pire->ire_bucket); 3379 ire_refrele(pire); 3380 } 3381 *ire_p = ire1; 3382 if (need_refrele) 3383 IRB_REFRELE(irb_ptr); 3384 return (0); 3385 } 3386 } 3387 if (ire->ire_type & IRE_CACHE) { 3388 ASSERT(ire->ire_stq != NULL); 3389 nce = ndp_lookup_v4(ire_to_ill(ire), 3390 ((ire->ire_gateway_addr != INADDR_ANY) ? 3391 &ire->ire_gateway_addr : &ire->ire_addr), 3392 B_TRUE); 3393 if (nce != NULL) 3394 mutex_enter(&nce->nce_lock); 3395 /* 3396 * if the nce is NCE_F_CONDEMNED, or if it is not ND_REACHABLE 3397 * and the caller has prohibited the addition of incomplete 3398 * ire's, we fail the add. Note that nce_state could be 3399 * something other than ND_REACHABLE if nce_reinit has just 3400 * kicked in and reset the nce. 3401 */ 3402 if ((nce == NULL) || 3403 (nce->nce_flags & NCE_F_CONDEMNED) || 3404 (!allow_unresolved && 3405 (nce->nce_state != ND_REACHABLE))) { 3406 if (nce != NULL) 3407 mutex_exit(&nce->nce_lock); 3408 ire_atomic_end(irb_ptr, ire); 3409 mutex_exit(&ipst->ips_ndp4->ndp_g_lock); 3410 if (nce != NULL) 3411 NCE_REFRELE(nce); 3412 DTRACE_PROBE1(ire__no__nce, ire_t *, ire); 3413 ire_delete(ire); 3414 if (pire != NULL) { 3415 IRB_REFRELE(pire->ire_bucket); 3416 ire_refrele(pire); 3417 } 3418 *ire_p = NULL; 3419 if (need_refrele) 3420 IRB_REFRELE(irb_ptr); 3421 return (EINVAL); 3422 } else { 3423 ire->ire_nce = nce; 3424 mutex_exit(&nce->nce_lock); 3425 /* 3426 * We are associating this nce to the ire, so 3427 * change the nce ref taken in ndp_lookup_v4() from 3428 * NCE_REFHOLD to NCE_REFHOLD_NOTR 3429 */ 3430 NCE_REFHOLD_TO_REFHOLD_NOTR(ire->ire_nce); 3431 } 3432 } 3433 /* 3434 * Make it easy for ip_wput_ire() to hit multiple broadcast ires by 3435 * grouping identical addresses together on the hash chain. We also 3436 * don't want to send multiple copies out if there are two ills part 3437 * of the same group. Thus we group the ires with same addr and same 3438 * ill group together so that ip_wput_ire can easily skip all the 3439 * ires with same addr and same group after sending the first copy. 3440 * We do this only for IRE_BROADCASTs as ip_wput_ire is currently 3441 * interested in such groupings only for broadcasts. 3442 * 3443 * NOTE : If the interfaces are brought up first and then grouped, 3444 * illgrp_insert will handle it. We come here when the interfaces 3445 * are already in group and we are bringing them UP. 3446 * 3447 * Find the first entry that matches ire_addr. *irep will be null 3448 * if no match. 3449 */ 3450 irep = (ire_t **)irb_ptr; 3451 while ((ire1 = *irep) != NULL && ire->ire_addr != ire1->ire_addr) 3452 irep = &ire1->ire_next; 3453 if (ire->ire_type == IRE_BROADCAST && *irep != NULL) { 3454 /* 3455 * We found some ire (i.e *irep) with a matching addr. We 3456 * want to group ires with same addr and same ill group 3457 * together. 3458 * 3459 * First get to the entry that matches our address and 3460 * ill group i.e stop as soon as we find the first ire 3461 * matching the ill group and address. If there is only 3462 * an address match, we should walk and look for some 3463 * group match. 
These are some of the possible scenarios : 3464 * 3465 * 1) There are no groups at all i.e. all ire's ill_group 3466 * are NULL. In that case we will essentially group 3467 * all the ires with the same addr together. Same as 3468 * the "else" block of this "if". 3469 * 3470 * 2) There are some groups and this ire's ill_group is 3471 * NULL. In this case, we will first find the group 3472 * that matches the address and a NULL group. Then 3473 * we will insert the ire at the end of that group. 3474 * 3475 * 3) There are some groups and this ire's ill_group is 3476 * non-NULL. In this case we will first find the group 3477 * that matches the address and the ill_group. Then 3478 * we will insert the ire at the end of that group. 3479 */ 3480 /* LINTED : constant in conditional context */ 3481 while (1) { 3482 ire1 = *irep; 3483 if ((ire1->ire_next == NULL) || 3484 (ire1->ire_next->ire_addr != ire->ire_addr) || 3485 (ire1->ire_type != IRE_BROADCAST) || 3486 (ire1->ire_ipif->ipif_ill->ill_group == 3487 ire->ire_ipif->ipif_ill->ill_group)) 3488 break; 3489 irep = &ire1->ire_next; 3490 } 3491 ASSERT(*irep != NULL); 3492 irep = &((*irep)->ire_next); 3493 3494 /* 3495 * Either we have hit the end of the list or the address 3496 * did not match or the group *matched*. If we found 3497 * a match on the group, skip to the end of the group. 3498 */ 3499 while (*irep != NULL) { 3500 ire1 = *irep; 3501 if ((ire1->ire_addr != ire->ire_addr) || 3502 (ire1->ire_type != IRE_BROADCAST) || 3503 (ire1->ire_ipif->ipif_ill->ill_group != 3504 ire->ire_ipif->ipif_ill->ill_group)) 3505 break; 3506 if (ire1->ire_ipif->ipif_ill->ill_group == NULL && 3507 ire1->ire_ipif == ire->ire_ipif) { 3508 irep = &ire1->ire_next; 3509 break; 3510 } 3511 irep = &ire1->ire_next; 3512 } 3513 } else if (*irep != NULL) { 3514 /* 3515 * Find the last ire which matches ire_addr. 3516 * Needed to do tail insertion among entries with the same 3517 * ire_addr. 3518 */ 3519 while (ire->ire_addr == ire1->ire_addr) { 3520 irep = &ire1->ire_next; 3521 ire1 = *irep; 3522 if (ire1 == NULL) 3523 break; 3524 } 3525 } 3526 3527 /* Insert at *irep */ 3528 ire1 = *irep; 3529 if (ire1 != NULL) 3530 ire1->ire_ptpn = &ire->ire_next; 3531 ire->ire_next = ire1; 3532 /* Link the new one in. */ 3533 ire->ire_ptpn = irep; 3534 3535 /* 3536 * ire_walk routines de-reference ire_next without holding 3537 * a lock. Before we point to the new ire, we want to make 3538 * sure the store that sets the ire_next of the new ire 3539 * reaches global visibility, so that ire_walk routines 3540 * don't see a truncated list of ires, i.e. if the ire_next 3541 * of the new ire got set after we did "*irep = ire" due 3542 * to re-ordering, the ire_walk thread would see a NULL 3543 * once it accessed the ire_next of the new ire. 3544 * membar_producer() makes sure that the following store 3545 * happens *after* all of the above stores. 3546 */ 3547 membar_producer(); 3548 *irep = ire; 3549 ire->ire_bucket = irb_ptr; 3550 /* 3551 * We return a bumped up IRE above. Keep it symmetrical 3552 * so that the callers will always have to release. This 3553 * helps the callers of this function because they continue 3554 * to use the IRE after adding and hence they don't have to 3555 * look it up again after we return the IRE. 3556 * 3557 * NOTE : We don't have to use atomics as this is appearing 3558 * in the list for the first time and no one else can bump 3559 * up the reference count on this yet. 
3560	 */
3561	IRE_REFHOLD_LOCKED(ire);
3562	BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_inserted);
3563
3564	irb_ptr->irb_ire_cnt++;
3565	if (irb_ptr->irb_marks & IRB_MARK_FTABLE)
3566		irb_ptr->irb_nire++;
3567
3568	if (ire->ire_marks & IRE_MARK_TEMPORARY)
3569		irb_ptr->irb_tmp_ire_cnt++;
3570
3571	if (ire->ire_ipif != NULL) {
3572		ire->ire_ipif->ipif_ire_cnt++;
3573		if (ire->ire_stq != NULL) {
3574			stq_ill = (ill_t *)ire->ire_stq->q_ptr;
3575			stq_ill->ill_ire_cnt++;
3576		}
3577	} else {
3578		ASSERT(ire->ire_stq == NULL);
3579	}
3580
3581	ire_atomic_end(irb_ptr, ire);
3582	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3583
3584	if (pire != NULL) {
3585		/* Assert that it is not removed from the list yet */
3586		ASSERT(pire->ire_ptpn != NULL);
3587		IRB_REFRELE(pire->ire_bucket);
3588		ire_refrele(pire);
3589	}
3590
3591	if (ire->ire_type != IRE_CACHE) {
3592		/*
3593		 * For ire's with a host mask, see if there is an entry
3594		 * in the cache. If there is one, flush the whole cache, as
3595		 * there might be multiple entries due to RTF_MULTIRT (CGTP).
3596		 * If no entry is found, then there is no need to flush the
3597		 * cache.
3598		 */
3599		if (ire->ire_mask == IP_HOST_MASK) {
3600			ire_t *lire;
3601			lire = ire_ctable_lookup(ire->ire_addr, NULL, IRE_CACHE,
3602			    NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
3603			if (lire != NULL) {
3604				ire_refrele(lire);
3605				ire_flush_cache_v4(ire, IRE_FLUSH_ADD);
3606			}
3607		} else {
3608			ire_flush_cache_v4(ire, IRE_FLUSH_ADD);
3609		}
3610	}
3611	/*
3612	 * We had to delay the fast path probe until the ire is inserted
3613	 * in the list. Otherwise the fast path ack won't find the ire in
3614	 * the table.
3615	 */
3616	if (ire->ire_type == IRE_CACHE ||
3617	    (ire->ire_type == IRE_BROADCAST && ire->ire_stq != NULL)) {
3618		ASSERT(ire->ire_nce != NULL);
3619		nce_fastpath(ire->ire_nce);
3620	}
3621	if (ire->ire_ipif != NULL)
3622		ASSERT(!MUTEX_HELD(&ire->ire_ipif->ipif_ill->ill_lock));
3623	*ire_p = ire;
3624	if (need_refrele) {
3625		IRB_REFRELE(irb_ptr);
3626	}
3627	return (0);
3628 }
3629
3630 /*
3631  * IRB_REFRELE is the only caller of this function; it runs it on the
3632  * list returned by ire_unlink() to do the final cleanup for each ire.
3633  */
3634 void
3635 ire_cleanup(ire_t *ire)
3636 {
3637	ire_t *ire_next;
3638	ip_stack_t *ipst = ire->ire_ipst;
3639
3640	ASSERT(ire != NULL);
3641
3642	while (ire != NULL) {
3643		ire_next = ire->ire_next;
3644		if (ire->ire_ipversion == IPV4_VERSION) {
3645			ire_delete_v4(ire);
3646			BUMP_IRE_STATS(ipst->ips_ire_stats_v4,
3647			    ire_stats_deleted);
3648		} else {
3649			ASSERT(ire->ire_ipversion == IPV6_VERSION);
3650			ire_delete_v6(ire);
3651			BUMP_IRE_STATS(ipst->ips_ire_stats_v6,
3652			    ire_stats_deleted);
3653		}
3654		/*
3655		 * Now it's really out of the list. Before doing the
3656		 * REFRELE, set ire_next to NULL, as ire_inactive
3657		 * asserts that it is.
3658		 */
3659		ire->ire_next = NULL;
3660		IRE_REFRELE_NOTR(ire);
3661		ire = ire_next;
3662	}
3663 }
3664
3665 /*
3666  * IRB_REFRELE is the only caller of this function. It is called to unlink
3667  * all the CONDEMNED ires from this bucket.
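  * The unlinked ires are handed back to the caller chained through their
  * ire_next pointers, so that ire_cleanup() can be run on them once the
  * bucket lock has been dropped.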
3668  */
3669 ire_t *
3670 ire_unlink(irb_t *irb)
3671 {
3672	ire_t *ire;
3673	ire_t *ire1;
3674	ire_t **ptpn;
3675	ire_t *ire_list = NULL;
3676
3677	ASSERT(RW_WRITE_HELD(&irb->irb_lock));
3678	ASSERT(((irb->irb_marks & IRB_MARK_FTABLE) && irb->irb_refcnt == 1) ||
3679	    (irb->irb_refcnt == 0));
3680	ASSERT(irb->irb_marks & IRB_MARK_CONDEMNED);
3681	ASSERT(irb->irb_ire != NULL);
3682
3683	for (ire = irb->irb_ire; ire != NULL; ire = ire1) {
3684		ip_stack_t *ipst = ire->ire_ipst;
3685
3686		ire1 = ire->ire_next;
3687		if (ire->ire_marks & IRE_MARK_CONDEMNED) {
3688			ptpn = ire->ire_ptpn;
3689			ire1 = ire->ire_next;
3690			if (ire1)
3691				ire1->ire_ptpn = ptpn;
3692			*ptpn = ire1;
3693			ire->ire_ptpn = NULL;
3694			ire->ire_next = NULL;
3695			if (ire->ire_type == IRE_DEFAULT) {
3696				/*
3697				 * IRE is out of the list. We need to adjust
3698				 * the accounting before the caller drops
3699				 * the lock.
3700				 */
3701				if (ire->ire_ipversion == IPV6_VERSION) {
3702					ASSERT(ipst->
3703					    ips_ipv6_ire_default_count !=
3704					    0);
3705					ipst->ips_ipv6_ire_default_count--;
3706				}
3707			}
3708			/*
3709			 * We need to call ire_delete_v4 or ire_delete_v6
3710			 * to clean up the cache or the redirects pointing at
3711			 * the default gateway. We need to drop the lock,
3712			 * as ire_flush_cache/ire_delete_host_redirects require
3713			 * it. But we can't drop the lock, as ire_unlink needs
3714			 * to atomically remove the ires from the list.
3715			 * So, create a temporary list of CONDEMNED ires
3716			 * for doing ire_delete_v4/ire_delete_v6 operations
3717			 * later on.
3718			 */
3719			ire->ire_next = ire_list;
3720			ire_list = ire;
3721		}
3722	}
3723	irb->irb_marks &= ~IRB_MARK_CONDEMNED;
3724	return (ire_list);
3725 }
3726
3727 /*
3728  * Delete all the cache entries with this 'addr'. When IP gets a gratuitous
3729  * ARP message on any of its interface queues, it scans the nce table and
3730  * calls ndp_delete() for the appropriate nce; this action
3731  * also deletes all the neighbor/ire cache entries for that address.
3732  * This function is called from ip_arp_news in ip.c and also for
3733  * ARP ioctl processing in ip_if.c. ip_ire_clookup_and_delete returns
3734  * true if it finds a nce entry which is used by ip_arp_news to determine if
3735  * it needs to do an ire_walk_v4. The return value is also used for the
3736  * same purpose by ARP IOCTL processing in ip_if.c when deleting
3737  * ARP entries. For SIOC*IFARP ioctls in addition to the address,
3738  * ipif->ipif_ill also needs to be matched.
3739  */
3740 boolean_t
3741 ip_ire_clookup_and_delete(ipaddr_t addr, ipif_t *ipif, ip_stack_t *ipst)
3742 {
3743	ill_t *ill;
3744	nce_t *nce;
3745
3746	ill = (ipif ? ipif->ipif_ill : NULL);
3747
3748	if (ill != NULL) {
3749		/*
3750		 * clean up the nce (and any relevant ire's) that matches
3751		 * on addr and ill.
3752		 */
3753		nce = ndp_lookup_v4(ill, &addr, B_FALSE);
3754		if (nce != NULL) {
3755			ndp_delete(nce);
3756			return (B_TRUE);
3757		}
3758	} else {
3759		/*
3760		 * ill is wildcard. clean up all nce's and
3761		 * ire's that match on addr
3762		 */
3763		nce_clookup_t cl;
3764
3765		cl.ncecl_addr = addr;
3766		cl.ncecl_found = B_FALSE;
3767
3768		ndp_walk_common(ipst->ips_ndp4, NULL,
3769		    (pfi_t)ip_nce_clookup_and_delete, (uchar_t *)&cl, B_TRUE);
3770
3771		/*
3772		 * ncecl_found would be set by ip_nce_clookup_and_delete if
3773		 * we found a matching nce.
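		 * Note that the walk visits every nce in the stack, so all
		 * entries matching the address are deleted, not just the
		 * first one found.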
3774 */ 3775 return (cl.ncecl_found); 3776 } 3777 return (B_FALSE); 3778 3779 } 3780 3781 /* Delete the supplied nce if its nce_addr matches the supplied address */ 3782 static void 3783 ip_nce_clookup_and_delete(nce_t *nce, void *arg) 3784 { 3785 nce_clookup_t *cl = (nce_clookup_t *)arg; 3786 ipaddr_t nce_addr; 3787 3788 IN6_V4MAPPED_TO_IPADDR(&nce->nce_addr, nce_addr); 3789 if (nce_addr == cl->ncecl_addr) { 3790 cl->ncecl_found = B_TRUE; 3791 /* clean up the nce (and any relevant ire's) */ 3792 ndp_delete(nce); 3793 } 3794 } 3795 3796 /* 3797 * Clean up the radix node for this ire. Must be called by IRB_REFRELE 3798 * when there are no ire's left in the bucket. Returns TRUE if the bucket 3799 * is deleted and freed. 3800 */ 3801 boolean_t 3802 irb_inactive(irb_t *irb) 3803 { 3804 struct rt_entry *rt; 3805 struct radix_node *rn; 3806 ip_stack_t *ipst = irb->irb_ipst; 3807 3808 ASSERT(irb->irb_ipst != NULL); 3809 3810 rt = IRB2RT(irb); 3811 rn = (struct radix_node *)rt; 3812 3813 /* first remove it from the radix tree. */ 3814 RADIX_NODE_HEAD_WLOCK(ipst->ips_ip_ftable); 3815 rw_enter(&irb->irb_lock, RW_WRITER); 3816 if (irb->irb_refcnt == 1 && irb->irb_nire == 0) { 3817 rn = ipst->ips_ip_ftable->rnh_deladdr(rn->rn_key, rn->rn_mask, 3818 ipst->ips_ip_ftable); 3819 DTRACE_PROBE1(irb__free, rt_t *, rt); 3820 ASSERT((void *)rn == (void *)rt); 3821 Free(rt, rt_entry_cache); 3822 /* irb_lock is freed */ 3823 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 3824 return (B_TRUE); 3825 } 3826 rw_exit(&irb->irb_lock); 3827 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 3828 return (B_FALSE); 3829 } 3830 3831 /* 3832 * Delete the specified IRE. 3833 */ 3834 void 3835 ire_delete(ire_t *ire) 3836 { 3837 ire_t *ire1; 3838 ire_t **ptpn; 3839 irb_t *irb; 3840 ip_stack_t *ipst = ire->ire_ipst; 3841 3842 if ((irb = ire->ire_bucket) == NULL) { 3843 /* 3844 * It was never inserted in the list. Should call REFRELE 3845 * to free this IRE. 3846 */ 3847 IRE_REFRELE_NOTR(ire); 3848 return; 3849 } 3850 3851 rw_enter(&irb->irb_lock, RW_WRITER); 3852 3853 if (irb->irb_rr_origin == ire) { 3854 irb->irb_rr_origin = NULL; 3855 } 3856 3857 /* 3858 * In case of V4 we might still be waiting for fastpath ack. 3859 */ 3860 if (ire->ire_ipversion == IPV4_VERSION && 3861 (ire->ire_type == IRE_CACHE || 3862 (ire->ire_type == IRE_BROADCAST && ire->ire_stq != NULL))) { 3863 ASSERT(ire->ire_nce != NULL); 3864 nce_fastpath_list_delete(ire->ire_nce); 3865 } 3866 3867 if (ire->ire_ptpn == NULL) { 3868 /* 3869 * Some other thread has removed us from the list. 3870 * It should have done the REFRELE for us. 3871 */ 3872 rw_exit(&irb->irb_lock); 3873 return; 3874 } 3875 3876 if (irb->irb_refcnt != 0) { 3877 /* 3878 * The last thread to leave this bucket will 3879 * delete this ire. 3880 */ 3881 if (!(ire->ire_marks & IRE_MARK_CONDEMNED)) { 3882 irb->irb_ire_cnt--; 3883 if (ire->ire_marks & IRE_MARK_TEMPORARY) 3884 irb->irb_tmp_ire_cnt--; 3885 ire->ire_marks |= IRE_MARK_CONDEMNED; 3886 } 3887 irb->irb_marks |= IRB_MARK_CONDEMNED; 3888 rw_exit(&irb->irb_lock); 3889 return; 3890 } 3891 3892 /* 3893 * Normally to delete an ire, we walk the bucket. While we 3894 * walk the bucket, we normally bump up irb_refcnt and hence 3895 * we return from above where we mark CONDEMNED and the ire 3896 * gets deleted from ire_unlink. This case is where somebody 3897 * knows the ire e.g by doing a lookup, and wants to delete the 3898 * IRE. irb_refcnt would be 0 in this case if nobody is walking 3899 * the bucket. 
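 * Since irb_refcnt is 0 and irb_lock is held as WRITER, it is safe to
 * unlink the ire from the bucket inline below instead of deferring the
 * work to ire_unlink().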
3900	 */
3901	ptpn = ire->ire_ptpn;
3902	ire1 = ire->ire_next;
3903	if (ire1 != NULL)
3904		ire1->ire_ptpn = ptpn;
3905	ASSERT(ptpn != NULL);
3906	*ptpn = ire1;
3907	ire->ire_ptpn = NULL;
3908	ire->ire_next = NULL;
3909	if (ire->ire_ipversion == IPV6_VERSION) {
3910		BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_deleted);
3911	} else {
3912		BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_deleted);
3913	}
3914	/*
3915	 * ip_wput/ip_wput_v6 check this flag to see whether
3916	 * they should still use the cached ire or not.
3917	 */
3918	ire->ire_marks |= IRE_MARK_CONDEMNED;
3919	if (ire->ire_type == IRE_DEFAULT) {
3920		/*
3921		 * IRE is out of the list. We need to adjust the
3922		 * accounting before we drop the lock.
3923		 */
3924		if (ire->ire_ipversion == IPV6_VERSION) {
3925			ASSERT(ipst->ips_ipv6_ire_default_count != 0);
3926			ipst->ips_ipv6_ire_default_count--;
3927		}
3928	}
3929	irb->irb_ire_cnt--;
3930
3931	if (ire->ire_marks & IRE_MARK_TEMPORARY)
3932		irb->irb_tmp_ire_cnt--;
3933	rw_exit(&irb->irb_lock);
3934
3935	if (ire->ire_ipversion == IPV6_VERSION) {
3936		ire_delete_v6(ire);
3937	} else {
3938		ire_delete_v4(ire);
3939	}
3940	/*
3941	 * We removed it from the list. Decrement the
3942	 * reference count.
3943	 */
3944	IRE_REFRELE_NOTR(ire);
3945 }
3946
3947 /*
3948  * Delete the specified IRE.
3949  * All calls should use ire_delete().
3950  * Sometimes called as writer though not required by this function.
3951  *
3952  * NOTE : This function is called only if the ire was added
3953  * in the list.
3954  */
3955 static void
3956 ire_delete_v4(ire_t *ire)
3957 {
3958	ip_stack_t *ipst = ire->ire_ipst;
3959
3960	ASSERT(ire->ire_refcnt >= 1);
3961	ASSERT(ire->ire_ipversion == IPV4_VERSION);
3962
3963	if (ire->ire_type != IRE_CACHE)
3964		ire_flush_cache_v4(ire, IRE_FLUSH_DELETE);
3965	if (ire->ire_type == IRE_DEFAULT) {
3966		/*
3967		 * When a default gateway is going away,
3968		 * delete all the host redirects pointing at that
3969		 * gateway.
3970		 */
3971		ire_delete_host_redirects(ire->ire_gateway_addr, ipst);
3972	}
3973 }
3974
3975 /*
3976  * IRE_REFRELE/ire_refrele are the only callers of this function. It is
3977  * called to free the ire when the reference count goes to zero.
3978  */
3979 void
3980 ire_inactive(ire_t *ire)
3981 {
3982	nce_t *nce;
3983	ill_t *ill = NULL;
3984	ill_t *stq_ill = NULL;
3985	ill_t *in_ill = NULL;
3986	ipif_t *ipif;
3987	boolean_t need_wakeup = B_FALSE;
3988	irb_t *irb;
3989	ip_stack_t *ipst = ire->ire_ipst;
3990
3991	ASSERT(ire->ire_refcnt == 0);
3992	ASSERT(ire->ire_ptpn == NULL);
3993	ASSERT(ire->ire_next == NULL);
3994
3995	if (ire->ire_gw_secattr != NULL) {
3996		ire_gw_secattr_free(ire->ire_gw_secattr);
3997		ire->ire_gw_secattr = NULL;
3998	}
3999
4000	if (ire->ire_mp != NULL) {
4001		ASSERT(ire->ire_bucket == NULL);
4002		mutex_destroy(&ire->ire_lock);
4003		BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_freed);
4004		if (ire->ire_nce != NULL)
4005			NCE_REFRELE_NOTR(ire->ire_nce);
4006		freeb(ire->ire_mp);
4007		return;
4008	}
4009
4010	if ((nce = ire->ire_nce) != NULL) {
4011		NCE_REFRELE_NOTR(nce);
4012		ire->ire_nce = NULL;
4013	}
4014
4015	if (ire->ire_ipif == NULL)
4016		goto end;
4017
4018	ipif = ire->ire_ipif;
4019	ill = ipif->ipif_ill;
4020
4021	if (ire->ire_bucket == NULL) {
4022		/* The ire was never inserted in the table. */
4023		goto end;
4024	}
4025
4026	/*
4027	 * ipif_ire_cnt on this ipif goes down by 1. If the ire_stq is
4028	 * non-null, ill_ire_cnt also goes down by 1. If the in_ill is
4029	 * non-null, either ill_mrtun_refcnt or ill_srcif_refcnt goes down by 1.
4030 * 4031 * The ipif that is associated with an ire is ire->ire_ipif and 4032 * hence when the ire->ire_ipif->ipif_ire_cnt drops to zero we call 4033 * ipif_ill_refrele_tail. Usually stq_ill is null or the same as 4034 * ire->ire_ipif->ipif_ill. So nothing more needs to be done. Only 4035 * in the case of IRE_CACHES when IPMP is used, stq_ill can be 4036 * different. If this is different from ire->ire_ipif->ipif_ill and 4037 * if the ill_ire_cnt on the stq_ill also has dropped to zero, we call 4038 * ipif_ill_refrele_tail on the stq_ill. If mobile ip is in use 4039 * in_ill could be non-null. If it is a reverse tunnel related ire 4040 * ill_mrtun_refcnt is non-zero. If it is forward tunnel related ire 4041 * ill_srcif_refcnt is non-null. 4042 */ 4043 4044 if (ire->ire_stq != NULL) 4045 stq_ill = (ill_t *)ire->ire_stq->q_ptr; 4046 if (ire->ire_in_ill != NULL) 4047 in_ill = ire->ire_in_ill; 4048 4049 if ((stq_ill == NULL || stq_ill == ill) && (in_ill == NULL)) { 4050 /* Optimize the most common case */ 4051 mutex_enter(&ill->ill_lock); 4052 ASSERT(ipif->ipif_ire_cnt != 0); 4053 ipif->ipif_ire_cnt--; 4054 if (ipif->ipif_ire_cnt == 0) 4055 need_wakeup = B_TRUE; 4056 if (stq_ill != NULL) { 4057 ASSERT(stq_ill->ill_ire_cnt != 0); 4058 stq_ill->ill_ire_cnt--; 4059 if (stq_ill->ill_ire_cnt == 0) 4060 need_wakeup = B_TRUE; 4061 } 4062 if (need_wakeup) { 4063 /* Drops the ill lock */ 4064 ipif_ill_refrele_tail(ill); 4065 } else { 4066 mutex_exit(&ill->ill_lock); 4067 } 4068 } else { 4069 /* 4070 * We can't grab all the ill locks at the same time. 4071 * It can lead to recursive lock enter in the call to 4072 * ipif_ill_refrele_tail and later. Instead do it 1 at 4073 * a time. 4074 */ 4075 mutex_enter(&ill->ill_lock); 4076 ASSERT(ipif->ipif_ire_cnt != 0); 4077 ipif->ipif_ire_cnt--; 4078 if (ipif->ipif_ire_cnt == 0) { 4079 /* Drops the lock */ 4080 ipif_ill_refrele_tail(ill); 4081 } else { 4082 mutex_exit(&ill->ill_lock); 4083 } 4084 if (stq_ill != NULL) { 4085 mutex_enter(&stq_ill->ill_lock); 4086 ASSERT(stq_ill->ill_ire_cnt != 0); 4087 stq_ill->ill_ire_cnt--; 4088 if (stq_ill->ill_ire_cnt == 0) { 4089 /* Drops the ill lock */ 4090 ipif_ill_refrele_tail(stq_ill); 4091 } else { 4092 mutex_exit(&stq_ill->ill_lock); 4093 } 4094 } 4095 if (in_ill != NULL) { 4096 mutex_enter(&in_ill->ill_lock); 4097 if (ire->ire_type == IRE_MIPRTUN) { 4098 /* 4099 * Mobile IP reverse tunnel ire. 4100 * Decrement table count and the 4101 * ill reference count. This signifies 4102 * mipagent is deleting reverse tunnel 4103 * route for a particular mobile node. 
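				 * The global ire_mrtun_count is protected
				 * by ips_ire_mrtun_lock, while
				 * ill_mrtun_refcnt is covered by the
				 * in_ill->ill_lock taken above.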
4104 */ 4105 mutex_enter(&ipst->ips_ire_mrtun_lock); 4106 ipst->ips_ire_mrtun_count--; 4107 mutex_exit(&ipst->ips_ire_mrtun_lock); 4108 ASSERT(in_ill->ill_mrtun_refcnt != 0); 4109 in_ill->ill_mrtun_refcnt--; 4110 if (in_ill->ill_mrtun_refcnt == 0) { 4111 /* Drops the ill lock */ 4112 ipif_ill_refrele_tail(in_ill); 4113 } else { 4114 mutex_exit(&in_ill->ill_lock); 4115 } 4116 } else { 4117 mutex_enter(&ipst->ips_ire_srcif_table_lock); 4118 ipst->ips_ire_srcif_table_count--; 4119 mutex_exit(&ipst->ips_ire_srcif_table_lock); 4120 ASSERT(in_ill->ill_srcif_refcnt != 0); 4121 in_ill->ill_srcif_refcnt--; 4122 if (in_ill->ill_srcif_refcnt == 0) { 4123 /* Drops the ill lock */ 4124 ipif_ill_refrele_tail(in_ill); 4125 } else { 4126 mutex_exit(&in_ill->ill_lock); 4127 } 4128 } 4129 } 4130 } 4131 end: 4132 /* This should be true for both V4 and V6 */ 4133 4134 if ((ire->ire_type & IRE_FORWARDTABLE) && 4135 (ire->ire_ipversion == IPV4_VERSION) && 4136 ((irb = ire->ire_bucket) != NULL)) { 4137 rw_enter(&irb->irb_lock, RW_WRITER); 4138 irb->irb_nire--; 4139 /* 4140 * Instead of examining the conditions for freeing 4141 * the radix node here, we do it by calling 4142 * IRB_REFRELE which is a single point in the code 4143 * that embeds that logic. Bump up the refcnt to 4144 * be able to call IRB_REFRELE 4145 */ 4146 IRB_REFHOLD_LOCKED(irb); 4147 rw_exit(&irb->irb_lock); 4148 IRB_REFRELE(irb); 4149 } 4150 ire->ire_ipif = NULL; 4151 4152 if (ire->ire_in_ill != NULL) { 4153 ire->ire_in_ill = NULL; 4154 } 4155 4156 #ifdef IRE_DEBUG 4157 ire_trace_inactive(ire); 4158 #endif 4159 mutex_destroy(&ire->ire_lock); 4160 if (ire->ire_ipversion == IPV6_VERSION) { 4161 BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_freed); 4162 } else { 4163 BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_freed); 4164 } 4165 ASSERT(ire->ire_mp == NULL); 4166 /* Has been allocated out of the cache */ 4167 kmem_cache_free(ire_cache, ire); 4168 } 4169 4170 /* 4171 * ire_walk routine to delete all IRE_CACHE/IRE_HOST types redirect 4172 * entries that have a given gateway address. 4173 */ 4174 void 4175 ire_delete_cache_gw(ire_t *ire, char *cp) 4176 { 4177 ipaddr_t gw_addr; 4178 4179 if (!(ire->ire_type & IRE_CACHE) && 4180 !(ire->ire_flags & RTF_DYNAMIC)) 4181 return; 4182 4183 bcopy(cp, &gw_addr, sizeof (gw_addr)); 4184 if (ire->ire_gateway_addr == gw_addr) { 4185 ip1dbg(("ire_delete_cache_gw: deleted 0x%x type %d to 0x%x\n", 4186 (int)ntohl(ire->ire_addr), ire->ire_type, 4187 (int)ntohl(ire->ire_gateway_addr))); 4188 ire_delete(ire); 4189 } 4190 } 4191 4192 /* 4193 * Remove all IRE_CACHE entries that match the ire specified. 4194 * 4195 * The flag argument indicates if the flush request is due to addition 4196 * of new route (IRE_FLUSH_ADD) or deletion of old route (IRE_FLUSH_DELETE). 4197 * 4198 * This routine takes only the IREs from the forwarding table and flushes 4199 * the corresponding entries from the cache table. 4200 * 4201 * When flushing due to the deletion of an old route, it 4202 * just checks the cache handles (ire_phandle and ire_ihandle) and 4203 * deletes the ones that match. 4204 * 4205 * When flushing due to the creation of a new route, it checks 4206 * if a cache entry's address matches the one in the IRE and 4207 * that the cache entry's parent has a less specific mask than the 4208 * one in IRE. The destination of such a cache entry could be the 4209 * gateway for other cache entries, so we need to flush those as 4210 * well by looking for gateway addresses matching the IRE's address. 
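 *
 * A concrete (purely illustrative) example of the IRE_FLUSH_ADD case:
 * adding a route for 10.1.2.0/24 flushes a cache entry for 10.1.2.7 that
 * was derived from a less specific prefix such as 10.1.0.0/16, and also
 * flushes any cache entry whose ire_gateway_addr lies in 10.1.2.0/24,
 * since that gateway may now resolve differently.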
4211 */ 4212 void 4213 ire_flush_cache_v4(ire_t *ire, int flag) 4214 { 4215 int i; 4216 ire_t *cire; 4217 irb_t *irb; 4218 ip_stack_t *ipst = ire->ire_ipst; 4219 4220 if (ire->ire_type & IRE_CACHE) 4221 return; 4222 4223 /* 4224 * If a default is just created, there is no point 4225 * in going through the cache, as there will not be any 4226 * cached ires. 4227 */ 4228 if (ire->ire_type == IRE_DEFAULT && flag == IRE_FLUSH_ADD) 4229 return; 4230 if (flag == IRE_FLUSH_ADD) { 4231 /* 4232 * This selective flush is due to the addition of 4233 * new IRE. 4234 */ 4235 for (i = 0; i < ipst->ips_ip_cache_table_size; i++) { 4236 irb = &ipst->ips_ip_cache_table[i]; 4237 if ((cire = irb->irb_ire) == NULL) 4238 continue; 4239 IRB_REFHOLD(irb); 4240 for (cire = irb->irb_ire; cire != NULL; 4241 cire = cire->ire_next) { 4242 if (cire->ire_type != IRE_CACHE) 4243 continue; 4244 /* 4245 * If 'cire' belongs to the same subnet 4246 * as the new ire being added, and 'cire' 4247 * is derived from a prefix that is less 4248 * specific than the new ire being added, 4249 * we need to flush 'cire'; for instance, 4250 * when a new interface comes up. 4251 */ 4252 if (((cire->ire_addr & ire->ire_mask) == 4253 (ire->ire_addr & ire->ire_mask)) && 4254 (ip_mask_to_plen(cire->ire_cmask) <= 4255 ire->ire_masklen)) { 4256 ire_delete(cire); 4257 continue; 4258 } 4259 /* 4260 * This is the case when the ire_gateway_addr 4261 * of 'cire' belongs to the same subnet as 4262 * the new ire being added. 4263 * Flushing such ires is sometimes required to 4264 * avoid misrouting: say we have a machine with 4265 * two interfaces (I1 and I2), a default router 4266 * R on the I1 subnet, and a host route to an 4267 * off-link destination D with a gateway G on 4268 * the I2 subnet. 4269 * Under normal operation, we will have an 4270 * on-link cache entry for G and an off-link 4271 * cache entry for D with G as ire_gateway_addr, 4272 * traffic to D will reach its destination 4273 * through gateway G. 4274 * If the administrator does 'ifconfig I2 down', 4275 * the cache entries for D and G will be 4276 * flushed. However, G will now be resolved as 4277 * an off-link destination using R (the default 4278 * router) as gateway. Then D will also be 4279 * resolved as an off-link destination using G 4280 * as gateway - this behavior is due to 4281 * compatibility reasons, see comment in 4282 * ire_ihandle_lookup_offlink(). Traffic to D 4283 * will go to the router R and probably won't 4284 * reach the destination. 4285 * The administrator then does 'ifconfig I2 up'. 4286 * Since G is on the I2 subnet, this routine 4287 * will flush its cache entry. It must also 4288 * flush the cache entry for D, otherwise 4289 * traffic will stay misrouted until the IRE 4290 * times out. 4291 */ 4292 if ((cire->ire_gateway_addr & ire->ire_mask) == 4293 (ire->ire_addr & ire->ire_mask)) { 4294 ire_delete(cire); 4295 continue; 4296 } 4297 } 4298 IRB_REFRELE(irb); 4299 } 4300 } else { 4301 /* 4302 * delete the cache entries based on 4303 * handle in the IRE as this IRE is 4304 * being deleted/changed. 
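		 * A cache entry is deleted when its non-zero ire_phandle
		 * or its non-zero ire_ihandle matches the corresponding
		 * handle of the IRE being deleted or changed.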
4305 */ 4306 for (i = 0; i < ipst->ips_ip_cache_table_size; i++) { 4307 irb = &ipst->ips_ip_cache_table[i]; 4308 if ((cire = irb->irb_ire) == NULL) 4309 continue; 4310 IRB_REFHOLD(irb); 4311 for (cire = irb->irb_ire; cire != NULL; 4312 cire = cire->ire_next) { 4313 if (cire->ire_type != IRE_CACHE) 4314 continue; 4315 if ((cire->ire_phandle == 0 || 4316 cire->ire_phandle != ire->ire_phandle) && 4317 (cire->ire_ihandle == 0 || 4318 cire->ire_ihandle != ire->ire_ihandle)) 4319 continue; 4320 ire_delete(cire); 4321 } 4322 IRB_REFRELE(irb); 4323 } 4324 } 4325 } 4326 4327 /* 4328 * Matches the arguments passed with the values in the ire. 4329 * 4330 * Note: for match types that match using "ipif" passed in, ipif 4331 * must be checked for non-NULL before calling this routine. 4332 */ 4333 boolean_t 4334 ire_match_args(ire_t *ire, ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway, 4335 int type, const ipif_t *ipif, zoneid_t zoneid, uint32_t ihandle, 4336 const ts_label_t *tsl, int match_flags) 4337 { 4338 ill_t *ire_ill = NULL, *dst_ill; 4339 ill_t *ipif_ill = NULL; 4340 ill_group_t *ire_ill_group = NULL; 4341 ill_group_t *ipif_ill_group = NULL; 4342 4343 ASSERT(ire->ire_ipversion == IPV4_VERSION); 4344 ASSERT((ire->ire_addr & ~ire->ire_mask) == 0); 4345 ASSERT((!(match_flags & (MATCH_IRE_ILL|MATCH_IRE_ILL_GROUP))) || 4346 (ipif != NULL && !ipif->ipif_isv6)); 4347 ASSERT(!(match_flags & MATCH_IRE_WQ)); 4348 4349 /* 4350 * HIDDEN cache entries have to be looked up specifically with 4351 * MATCH_IRE_MARK_HIDDEN. MATCH_IRE_MARK_HIDDEN is usually set 4352 * when the interface is FAILED or INACTIVE. In that case, 4353 * any IRE_CACHES that exists should be marked with 4354 * IRE_MARK_HIDDEN. So, we don't really need to match below 4355 * for IRE_MARK_HIDDEN. But we do so for consistency. 4356 */ 4357 if (!(match_flags & MATCH_IRE_MARK_HIDDEN) && 4358 (ire->ire_marks & IRE_MARK_HIDDEN)) 4359 return (B_FALSE); 4360 4361 /* 4362 * MATCH_IRE_MARK_PRIVATE_ADDR is set when IP_NEXTHOP option 4363 * is used. In that case the routing table is bypassed and the 4364 * packets are sent directly to the specified nexthop. The 4365 * IRE_CACHE entry representing this route should be marked 4366 * with IRE_MARK_PRIVATE_ADDR. 4367 */ 4368 4369 if (!(match_flags & MATCH_IRE_MARK_PRIVATE_ADDR) && 4370 (ire->ire_marks & IRE_MARK_PRIVATE_ADDR)) 4371 return (B_FALSE); 4372 4373 if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid && 4374 ire->ire_zoneid != ALL_ZONES) { 4375 /* 4376 * If MATCH_IRE_ZONEONLY has been set and the supplied zoneid is 4377 * valid and does not match that of ire_zoneid, a failure to 4378 * match is reported at this point. Otherwise, since some IREs 4379 * that are available in the global zone can be used in local 4380 * zones, additional checks need to be performed: 4381 * 4382 * IRE_BROADCAST, IRE_CACHE and IRE_LOOPBACK 4383 * entries should never be matched in this situation. 4384 * 4385 * IRE entries that have an interface associated with them 4386 * should in general not match unless they are an IRE_LOCAL 4387 * or in the case when MATCH_IRE_DEFAULT has been set in 4388 * the caller. In the case of the former, checking of the 4389 * other fields supplied should take place. 4390 * 4391 * In the case where MATCH_IRE_DEFAULT has been set, 4392 * all of the ipif's associated with the IRE's ill are 4393 * checked to see if there is a matching zoneid. If any 4394 * one ipif has a matching zoneid, this IRE is a 4395 * potential candidate so checking of the other fields 4396 * takes place. 
4397 * 4398 * In the case where the IRE_INTERFACE has a usable source 4399 * address (indicated by ill_usesrc_ifindex) in the 4400 * correct zone then it's permitted to return this IRE 4401 */ 4402 if (match_flags & MATCH_IRE_ZONEONLY) 4403 return (B_FALSE); 4404 if (ire->ire_type & (IRE_BROADCAST | IRE_CACHE | IRE_LOOPBACK)) 4405 return (B_FALSE); 4406 /* 4407 * Note, IRE_INTERFACE can have the stq as NULL. For 4408 * example, if the default multicast route is tied to 4409 * the loopback address. 4410 */ 4411 if ((ire->ire_type & IRE_INTERFACE) && 4412 (ire->ire_stq != NULL)) { 4413 dst_ill = (ill_t *)ire->ire_stq->q_ptr; 4414 /* 4415 * If there is a usable source address in the 4416 * zone, then it's ok to return an 4417 * IRE_INTERFACE 4418 */ 4419 if (ipif_usesrc_avail(dst_ill, zoneid)) { 4420 ip3dbg(("ire_match_args: dst_ill %p match %d\n", 4421 (void *)dst_ill, 4422 (ire->ire_addr == (addr & mask)))); 4423 } else { 4424 ip3dbg(("ire_match_args: src_ipif NULL" 4425 " dst_ill %p\n", (void *)dst_ill)); 4426 return (B_FALSE); 4427 } 4428 } 4429 if (ire->ire_ipif != NULL && ire->ire_type != IRE_LOCAL && 4430 !(ire->ire_type & IRE_INTERFACE)) { 4431 ipif_t *tipif; 4432 4433 if ((match_flags & MATCH_IRE_DEFAULT) == 0) { 4434 return (B_FALSE); 4435 } 4436 mutex_enter(&ire->ire_ipif->ipif_ill->ill_lock); 4437 for (tipif = ire->ire_ipif->ipif_ill->ill_ipif; 4438 tipif != NULL; tipif = tipif->ipif_next) { 4439 if (IPIF_CAN_LOOKUP(tipif) && 4440 (tipif->ipif_flags & IPIF_UP) && 4441 (tipif->ipif_zoneid == zoneid || 4442 tipif->ipif_zoneid == ALL_ZONES)) 4443 break; 4444 } 4445 mutex_exit(&ire->ire_ipif->ipif_ill->ill_lock); 4446 if (tipif == NULL) { 4447 return (B_FALSE); 4448 } 4449 } 4450 } 4451 4452 /* 4453 * For IRE_CACHES, MATCH_IRE_ILL/ILL_GROUP really means that 4454 * somebody wants to send out on a particular interface which 4455 * is given by ire_stq and hence use ire_stq to derive the ill 4456 * value. ire_ipif for IRE_CACHES is just the means of getting 4457 * a source address i.e ire_src_addr = ire->ire_ipif->ipif_src_addr. 4458 * ire_to_ill does the right thing for this. 
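	 * (For non-IRE_CACHE entries, ire_to_ill() falls back to the ill
	 * of ire_ipif.)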
4459	 */
4460	if (match_flags & (MATCH_IRE_ILL|MATCH_IRE_ILL_GROUP)) {
4461		ire_ill = ire_to_ill(ire);
4462		if (ire_ill != NULL)
4463			ire_ill_group = ire_ill->ill_group;
4464		ipif_ill = ipif->ipif_ill;
4465		ipif_ill_group = ipif_ill->ill_group;
4466	}
4467
4468	if ((ire->ire_addr == (addr & mask)) &&
4469	    ((!(match_flags & MATCH_IRE_GW)) ||
4470	    (ire->ire_gateway_addr == gateway)) &&
4471	    ((!(match_flags & MATCH_IRE_TYPE)) ||
4472	    (ire->ire_type & type)) &&
4473	    ((!(match_flags & MATCH_IRE_SRC)) ||
4474	    (ire->ire_src_addr == ipif->ipif_src_addr)) &&
4475	    ((!(match_flags & MATCH_IRE_IPIF)) ||
4476	    (ire->ire_ipif == ipif)) &&
4477	    ((!(match_flags & MATCH_IRE_MARK_HIDDEN)) ||
4478	    (ire->ire_type != IRE_CACHE ||
4479	    ire->ire_marks & IRE_MARK_HIDDEN)) &&
4480	    ((!(match_flags & MATCH_IRE_MARK_PRIVATE_ADDR)) ||
4481	    (ire->ire_type != IRE_CACHE ||
4482	    ire->ire_marks & IRE_MARK_PRIVATE_ADDR)) &&
4483	    ((!(match_flags & MATCH_IRE_ILL)) ||
4484	    (ire_ill == ipif_ill)) &&
4485	    ((!(match_flags & MATCH_IRE_IHANDLE)) ||
4486	    (ire->ire_ihandle == ihandle)) &&
4487	    ((!(match_flags & MATCH_IRE_MASK)) ||
4488	    (ire->ire_mask == mask)) &&
4489	    ((!(match_flags & MATCH_IRE_ILL_GROUP)) ||
4490	    (ire_ill == ipif_ill) ||
4491	    (ire_ill_group != NULL &&
4492	    ire_ill_group == ipif_ill_group)) &&
4493	    ((!(match_flags & MATCH_IRE_SECATTR)) ||
4494	    (!is_system_labeled()) ||
4495	    (tsol_ire_match_gwattr(ire, tsl) == 0))) {
4496		/* We found the matched IRE */
4497		return (B_TRUE);
4498	}
4499	return (B_FALSE);
4500 }
4501
4502
4503 /*
4504  * Look up a route in all the tables.
4505  */
4506 ire_t *
4507 ire_route_lookup(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway,
4508     int type, const ipif_t *ipif, ire_t **pire, zoneid_t zoneid,
4509     const ts_label_t *tsl, int flags, ip_stack_t *ipst)
4510 {
4511	ire_t *ire = NULL;
4512
4513	/*
4514	 * ire_match_args() will dereference ipif if MATCH_IRE_SRC or
4515	 * MATCH_IRE_ILL/MATCH_IRE_ILL_GROUP is set.
4516	 */
4517	if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL | MATCH_IRE_ILL_GROUP)) &&
4518	    (ipif == NULL))
4519		return (NULL);
4520
4521	/*
4522	 * The caller might be asking for a cache lookup. This is not the
4523	 * best way to look up the cache; such callers should use
4524	 * ire_cache_lookup() directly.
4525	 *
4526	 * We look first in the cache table and then in the forwarding table,
4527	 * skipping a table when MATCH_IRE_TYPE lacks its type flags.
4528	 */
4529	if ((flags & MATCH_IRE_TYPE) == 0 || (type & IRE_CACHETABLE) != 0) {
4530		ire = ire_ctable_lookup(addr, gateway, type, ipif, zoneid,
4531		    tsl, flags, ipst);
4532		if (ire != NULL)
4533			return (ire);
4534	}
4535	if ((flags & MATCH_IRE_TYPE) == 0 || (type & IRE_FORWARDTABLE) != 0) {
4536		ire = ire_ftable_lookup(addr, mask, gateway, type, ipif, pire,
4537		    zoneid, 0, tsl, flags, ipst);
4538	}
4539	return (ire);
4540 }
4541
4542
4543 /*
4544  * Delete the IRE cache for the gateway and all IRE caches whose
4545  * ire_gateway_addr points to this gateway, and allow them to
4546  * be created on demand by ip_newroute.
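  * Both the hash bucket for 'addr' itself and, via ire_walk_v4() with
  * ire_delete_cache_gw(), the entire cache table are scanned.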
4547  */
4548 void
4549 ire_clookup_delete_cache_gw(ipaddr_t addr, zoneid_t zoneid, ip_stack_t *ipst)
4550 {
4551	irb_t *irb;
4552	ire_t *ire;
4553
4554	irb = &ipst->ips_ip_cache_table[IRE_ADDR_HASH(addr,
4555	    ipst->ips_ip_cache_table_size)];
4556	IRB_REFHOLD(irb);
4557	for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
4558		if (ire->ire_marks & IRE_MARK_CONDEMNED)
4559			continue;
4560
4561		ASSERT(ire->ire_mask == IP_HOST_MASK);
4562		ASSERT(ire->ire_type != IRE_MIPRTUN && ire->ire_in_ill == NULL);
4563		if (ire_match_args(ire, addr, ire->ire_mask, 0, IRE_CACHE,
4564		    NULL, zoneid, 0, NULL, MATCH_IRE_TYPE)) {
4565			ire_delete(ire);
4566		}
4567	}
4568	IRB_REFRELE(irb);
4569
4570	ire_walk_v4(ire_delete_cache_gw, &addr, zoneid, ipst);
4571 }
4572
4573 /*
4574  * Looks up the cache table for a route. A
4575  * specific lookup can be requested by
4576  * passing the MATCH_* flags and the
4577  * necessary parameters.
4578  */
4579 ire_t *
4580 ire_ctable_lookup(ipaddr_t addr, ipaddr_t gateway, int type, const ipif_t *ipif,
4581     zoneid_t zoneid, const ts_label_t *tsl, int flags, ip_stack_t *ipst)
4582 {
4583	irb_t *irb_ptr;
4584	ire_t *ire;
4585
4586	/*
4587	 * ire_match_args() will dereference ipif if MATCH_IRE_SRC or
4588	 * MATCH_IRE_ILL/MATCH_IRE_ILL_GROUP is set.
4589	 */
4590	if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL | MATCH_IRE_ILL_GROUP)) &&
4591	    (ipif == NULL))
4592		return (NULL);
4593
4594	irb_ptr = &ipst->ips_ip_cache_table[IRE_ADDR_HASH(addr,
4595	    ipst->ips_ip_cache_table_size)];
4596	rw_enter(&irb_ptr->irb_lock, RW_READER);
4597	for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) {
4598		if (ire->ire_marks & IRE_MARK_CONDEMNED)
4599			continue;
4600		ASSERT(ire->ire_mask == IP_HOST_MASK);
4601		ASSERT(ire->ire_type != IRE_MIPRTUN && ire->ire_in_ill == NULL);
4602		if (ire_match_args(ire, addr, ire->ire_mask, gateway, type,
4603		    ipif, zoneid, 0, tsl, flags)) {
4604			IRE_REFHOLD(ire);
4605			rw_exit(&irb_ptr->irb_lock);
4606			return (ire);
4607		}
4608	}
4609	rw_exit(&irb_ptr->irb_lock);
4610	return (NULL);
4611 }
4612
4613 /*
4614  * Check whether the IRE_LOCAL and the IRE potentially used to transmit
4615  * (could be an IRE_CACHE, IRE_BROADCAST, or IRE_INTERFACE) are part of
4616  * the same ill group.
4617  */
4618 boolean_t
4619 ire_local_same_ill_group(ire_t *ire_local, ire_t *xmit_ire)
4620 {
4621	ill_t *recv_ill, *xmit_ill;
4622	ill_group_t *recv_group, *xmit_group;
4623
4624	ASSERT(ire_local->ire_type & (IRE_LOCAL|IRE_LOOPBACK));
4625	ASSERT(xmit_ire->ire_type & (IRE_CACHETABLE|IRE_INTERFACE));
4626
4627	recv_ill = ire_to_ill(ire_local);
4628	xmit_ill = ire_to_ill(xmit_ire);
4629
4630	ASSERT(recv_ill != NULL);
4631	ASSERT(xmit_ill != NULL);
4632
4633	if (recv_ill == xmit_ill)
4634		return (B_TRUE);
4635
4636	recv_group = recv_ill->ill_group;
4637	xmit_group = xmit_ill->ill_group;
4638
4639	if (recv_group != NULL && recv_group == xmit_group)
4640		return (B_TRUE);
4641
4642	return (B_FALSE);
4643 }
4644
4645 /*
4646  * Check if the IRE_LOCAL uses the same ill (group) as another route would use.
4647  * If there is no alternate route, or the alternate is a REJECT or BLACKHOLE,
4648  * then we don't allow this IRE_LOCAL to be used.
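  * In other words, B_TRUE is returned only when the alternate route
  * would transmit on the same ill (or ill group) as the one the
  * IRE_LOCAL is associated with.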
4649 */ 4650 boolean_t 4651 ire_local_ok_across_zones(ire_t *ire_local, zoneid_t zoneid, void *addr, 4652 const ts_label_t *tsl, ip_stack_t *ipst) 4653 { 4654 ire_t *alt_ire; 4655 boolean_t rval; 4656 4657 if (ire_local->ire_ipversion == IPV4_VERSION) { 4658 alt_ire = ire_ftable_lookup(*((ipaddr_t *)addr), 0, 0, 0, NULL, 4659 NULL, zoneid, 0, tsl, 4660 MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | 4661 MATCH_IRE_RJ_BHOLE, ipst); 4662 } else { 4663 alt_ire = ire_ftable_lookup_v6((in6_addr_t *)addr, NULL, NULL, 4664 0, NULL, NULL, zoneid, 0, tsl, 4665 MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | 4666 MATCH_IRE_RJ_BHOLE, ipst); 4667 } 4668 4669 if (alt_ire == NULL) 4670 return (B_FALSE); 4671 4672 if (alt_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 4673 ire_refrele(alt_ire); 4674 return (B_FALSE); 4675 } 4676 rval = ire_local_same_ill_group(ire_local, alt_ire); 4677 4678 ire_refrele(alt_ire); 4679 return (rval); 4680 } 4681 4682 /* 4683 * Lookup cache. Don't return IRE_MARK_HIDDEN entries. Callers 4684 * should use ire_ctable_lookup with MATCH_IRE_MARK_HIDDEN to get 4685 * to the hidden ones. 4686 * 4687 * In general the zoneid has to match (where ALL_ZONES match all of them). 4688 * But for IRE_LOCAL we also need to handle the case where L2 should 4689 * conceptually loop back the packet. This is necessary since neither 4690 * Ethernet drivers nor Ethernet hardware loops back packets sent to their 4691 * own MAC address. This loopback is needed when the normal 4692 * routes (ignoring IREs with different zoneids) would send out the packet on 4693 * the same ill (or ill group) as the ill with which this IRE_LOCAL is 4694 * associated. 4695 * 4696 * Earlier versions of this code always matched an IRE_LOCAL independently of 4697 * the zoneid. We preserve that earlier behavior when 4698 * ip_restrict_interzone_loopback is turned off. 4699 */ 4700 ire_t * 4701 ire_cache_lookup(ipaddr_t addr, zoneid_t zoneid, const ts_label_t *tsl, 4702 ip_stack_t *ipst) 4703 { 4704 irb_t *irb_ptr; 4705 ire_t *ire; 4706 4707 irb_ptr = &ipst->ips_ip_cache_table[IRE_ADDR_HASH(addr, 4708 ipst->ips_ip_cache_table_size)]; 4709 rw_enter(&irb_ptr->irb_lock, RW_READER); 4710 for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { 4711 if (ire->ire_marks & (IRE_MARK_CONDEMNED | 4712 IRE_MARK_HIDDEN | IRE_MARK_PRIVATE_ADDR)) { 4713 continue; 4714 } 4715 if (ire->ire_addr == addr) { 4716 /* 4717 * Finally, check if the security policy has any 4718 * restriction on using this route for the specified 4719 * message. 4720 */ 4721 if (tsl != NULL && 4722 ire->ire_gw_secattr != NULL && 4723 tsol_ire_match_gwattr(ire, tsl) != 0) { 4724 continue; 4725 } 4726 4727 if (zoneid == ALL_ZONES || ire->ire_zoneid == zoneid || 4728 ire->ire_zoneid == ALL_ZONES) { 4729 IRE_REFHOLD(ire); 4730 rw_exit(&irb_ptr->irb_lock); 4731 return (ire); 4732 } 4733 4734 if (ire->ire_type == IRE_LOCAL) { 4735 if (ipst->ips_ip_restrict_interzone_loopback && 4736 !ire_local_ok_across_zones(ire, zoneid, 4737 &addr, tsl, ipst)) 4738 continue; 4739 4740 IRE_REFHOLD(ire); 4741 rw_exit(&irb_ptr->irb_lock); 4742 return (ire); 4743 } 4744 } 4745 } 4746 rw_exit(&irb_ptr->irb_lock); 4747 return (NULL); 4748 } 4749 4750 /* 4751 * Locate the interface ire that is tied to the cache ire 'cire' via 4752 * cire->ire_ihandle. 4753 * 4754 * We are trying to create the cache ire for an offlink destn based 4755 * on the cache ire of the gateway in 'cire'. 'pire' is the prefix ire 4756 * as found by ip_newroute(). 
We are called from ip_newroute() in
4757  * the IRE_CACHE case.
4758  */
4759 ire_t *
4760 ire_ihandle_lookup_offlink(ire_t *cire, ire_t *pire)
4761 {
4762	ire_t *ire;
4763	int match_flags;
4764	ipaddr_t gw_addr;
4765	ipif_t *gw_ipif;
4766	ip_stack_t *ipst = cire->ire_ipst;
4767
4768	ASSERT(cire != NULL && pire != NULL);
4769
4770	/*
4771	 * We don't need to specify the zoneid to ire_ftable_lookup() below
4772	 * because the ihandle refers to an ipif which can be in only one zone.
4773	 */
4774	match_flags = MATCH_IRE_TYPE | MATCH_IRE_IHANDLE | MATCH_IRE_MASK;
4775	/*
4776	 * ip_newroute calls ire_ftable_lookup with MATCH_IRE_ILL only
4777	 * for on-link hosts. We should never be here for onlink.
4778	 * Thus, use MATCH_IRE_ILL_GROUP.
4779	 */
4780	if (pire->ire_ipif != NULL)
4781		match_flags |= MATCH_IRE_ILL_GROUP;
4782	/*
4783	 * We know that the mask of the interface ire equals cire->ire_cmask.
4784	 * (When ip_newroute() created 'cire' for the gateway it set its
4785	 * cmask from the interface ire's mask)
4786	 */
4787	ire = ire_ftable_lookup(cire->ire_addr, cire->ire_cmask, 0,
4788	    IRE_INTERFACE, pire->ire_ipif, NULL, ALL_ZONES, cire->ire_ihandle,
4789	    NULL, match_flags, ipst);
4790	if (ire != NULL)
4791		return (ire);
4792	/*
4793	 * If we didn't find an interface ire above, we can't declare failure.
4794	 * For backwards compatibility, we need to support prefix routes
4795	 * pointing to next hop gateways that are not on-link.
4796	 *
4797	 * Assume we are trying to ping some offlink destn, and we have the
4798	 * routing table below.
4799	 *
4800	 * Eg.	default	- gw1		<--- pire	(line 1)
4801	 *	gw1	- gw2			(line 2)
4802	 *	gw2	- hme0			(line 3)
4803	 *
4804	 * If we already have a cache ire for gw1 in 'cire', the
4805	 * ire_ftable_lookup above would have failed, since there is no
4806	 * interface ire to reach gw1. We will fallthru below.
4807	 *
4808	 * Here we duplicate the steps that ire_ftable_lookup() did in
4809	 * getting 'cire' from 'pire', in the MATCH_IRE_RECURSIVE case.
4810	 * The differences are the following:
4811	 * i.   We want the interface ire only, so we call ire_ftable_lookup()
4812	 *      instead of ire_route_lookup()
4813	 * ii.  We look for only prefix routes in the 1st call below.
4814	 * iii. We want to match on the ihandle in the 2nd call below.
4815	 */
4816	match_flags = MATCH_IRE_TYPE;
4817	if (pire->ire_ipif != NULL)
4818		match_flags |= MATCH_IRE_ILL_GROUP;
4819	ire = ire_ftable_lookup(pire->ire_gateway_addr, 0, 0, IRE_OFFSUBNET,
4820	    pire->ire_ipif, NULL, ALL_ZONES, 0, NULL, match_flags, ipst);
4821	if (ire == NULL)
4822		return (NULL);
4823	/*
4824	 * At this point 'ire' corresponds to the entry shown in line 2.
4825	 * gw_addr is 'gw2' in the example above.
4826	 */
4827	gw_addr = ire->ire_gateway_addr;
4828	gw_ipif = ire->ire_ipif;
4829	ire_refrele(ire);
4830
4831	match_flags |= MATCH_IRE_IHANDLE;
4832	ire = ire_ftable_lookup(gw_addr, 0, 0, IRE_INTERFACE,
4833	    gw_ipif, NULL, ALL_ZONES, cire->ire_ihandle, NULL, match_flags,
4834	    ipst);
4835	return (ire);
4836 }
4837
4838 /*
4839  * ire_mrtun_lookup() is called by ip_rput() when a packet is to be
4840  * tunneled through reverse tunnel.
This is only supported for 4841 * IPv4 packets 4842 */ 4843 4844 ire_t * 4845 ire_mrtun_lookup(ipaddr_t srcaddr, ill_t *ill) 4846 { 4847 irb_t *irb_ptr; 4848 ire_t *ire; 4849 ip_stack_t *ipst = ill->ill_ipst; 4850 4851 ASSERT(ill != NULL); 4852 ASSERT(!(ill->ill_isv6)); 4853 4854 if (ipst->ips_ip_mrtun_table == NULL) 4855 return (NULL); 4856 irb_ptr = &ipst->ips_ip_mrtun_table[IRE_ADDR_HASH(srcaddr, 4857 IP_MRTUN_TABLE_SIZE)]; 4858 rw_enter(&irb_ptr->irb_lock, RW_READER); 4859 for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { 4860 if (ire->ire_marks & IRE_MARK_CONDEMNED) 4861 continue; 4862 if ((ire->ire_in_src_addr == srcaddr) && 4863 ire->ire_in_ill == ill) { 4864 IRE_REFHOLD(ire); 4865 rw_exit(&irb_ptr->irb_lock); 4866 return (ire); 4867 } 4868 } 4869 rw_exit(&irb_ptr->irb_lock); 4870 return (NULL); 4871 } 4872 4873 /* 4874 * Return the IRE_LOOPBACK, IRE_IF_RESOLVER or IRE_IF_NORESOLVER 4875 * ire associated with the specified ipif. 4876 * 4877 * This might occasionally be called when IPIF_UP is not set since 4878 * the IP_MULTICAST_IF as well as creating interface routes 4879 * allows specifying a down ipif (ipif_lookup* match ipifs that are down). 4880 * 4881 * Note that if IPIF_NOLOCAL, IPIF_NOXMIT, or IPIF_DEPRECATED is set on 4882 * the ipif, this routine might return NULL. 4883 */ 4884 ire_t * 4885 ipif_to_ire(const ipif_t *ipif) 4886 { 4887 ire_t *ire; 4888 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 4889 4890 ASSERT(!ipif->ipif_isv6); 4891 if (ipif->ipif_ire_type == IRE_LOOPBACK) { 4892 ire = ire_ctable_lookup(ipif->ipif_lcl_addr, 0, IRE_LOOPBACK, 4893 ipif, ALL_ZONES, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF), 4894 ipst); 4895 } else if (ipif->ipif_flags & IPIF_POINTOPOINT) { 4896 /* In this case we need to lookup destination address. */ 4897 ire = ire_ftable_lookup(ipif->ipif_pp_dst_addr, IP_HOST_MASK, 0, 4898 IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, NULL, 4899 (MATCH_IRE_TYPE | MATCH_IRE_IPIF | MATCH_IRE_MASK), ipst); 4900 } else { 4901 ire = ire_ftable_lookup(ipif->ipif_subnet, 4902 ipif->ipif_net_mask, 0, IRE_INTERFACE, ipif, NULL, 4903 ALL_ZONES, 0, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF | 4904 MATCH_IRE_MASK), ipst); 4905 } 4906 return (ire); 4907 } 4908 4909 /* 4910 * ire_walk function. 4911 * Count the number of IRE_CACHE entries in different categories. 4912 */ 4913 void 4914 ire_cache_count(ire_t *ire, char *arg) 4915 { 4916 ire_cache_count_t *icc = (ire_cache_count_t *)arg; 4917 4918 if (ire->ire_type != IRE_CACHE) 4919 return; 4920 4921 icc->icc_total++; 4922 4923 if (ire->ire_ipversion == IPV6_VERSION) { 4924 mutex_enter(&ire->ire_lock); 4925 if (IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6)) { 4926 mutex_exit(&ire->ire_lock); 4927 icc->icc_onlink++; 4928 return; 4929 } 4930 mutex_exit(&ire->ire_lock); 4931 } else { 4932 if (ire->ire_gateway_addr == 0) { 4933 icc->icc_onlink++; 4934 return; 4935 } 4936 } 4937 4938 ASSERT(ire->ire_ipif != NULL); 4939 if (ire->ire_max_frag < ire->ire_ipif->ipif_mtu) 4940 icc->icc_pmtu++; 4941 else if (ire->ire_tire_mark != ire->ire_ob_pkt_count + 4942 ire->ire_ib_pkt_count) 4943 icc->icc_offlink++; 4944 else 4945 icc->icc_unused++; 4946 } 4947 4948 /* 4949 * ire_walk function called by ip_trash_ire_reclaim(). 4950 * Free a fraction of the IRE_CACHE cache entries. The fractions are 4951 * different for different categories of IRE_CACHE entries. 4952 * A fraction of zero means to not free any in that category. 4953 * Use the hash bucket id plus lbolt as a random number. 
Thus if the fraction
4954  * is N then every Nth hash bucket chain will be freed.
4955  */
4956 void
4957 ire_cache_reclaim(ire_t *ire, char *arg)
4958 {
4959	ire_cache_reclaim_t *icr = (ire_cache_reclaim_t *)arg;
4960	uint_t rand;
4961	ip_stack_t *ipst = icr->icr_ipst;
4962
4963	if (ire->ire_type != IRE_CACHE)
4964		return;
4965
4966	if (ire->ire_ipversion == IPV6_VERSION) {
4967		rand = (uint_t)lbolt +
4968		    IRE_ADDR_HASH_V6(ire->ire_addr_v6,
4969		    ipst->ips_ip6_cache_table_size);
4970		mutex_enter(&ire->ire_lock);
4971		if (IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6)) {
4972			mutex_exit(&ire->ire_lock);
4973			if (icr->icr_onlink != 0 &&
4974			    (rand/icr->icr_onlink)*icr->icr_onlink == rand) {
4975				ire_delete(ire);
4976				return;
4977			}
4978			goto done;
4979		}
4980		mutex_exit(&ire->ire_lock);
4981	} else {
4982		rand = (uint_t)lbolt +
4983		    IRE_ADDR_HASH(ire->ire_addr, ipst->ips_ip_cache_table_size);
4984		if (ire->ire_gateway_addr == 0) {
4985			if (icr->icr_onlink != 0 &&
4986			    (rand/icr->icr_onlink)*icr->icr_onlink == rand) {
4987				ire_delete(ire);
4988				return;
4989			}
4990			goto done;
4991		}
4992	}
4993	/* Not onlink IRE */
4994	ASSERT(ire->ire_ipif != NULL);
4995	if (ire->ire_max_frag < ire->ire_ipif->ipif_mtu) {
4996		/* Use pmtu fraction */
4997		if (icr->icr_pmtu != 0 &&
4998		    (rand/icr->icr_pmtu)*icr->icr_pmtu == rand) {
4999			ire_delete(ire);
5000			return;
5001		}
5002	} else if (ire->ire_tire_mark != ire->ire_ob_pkt_count +
5003	    ire->ire_ib_pkt_count) {
5004		/* Use offlink fraction */
5005		if (icr->icr_offlink != 0 &&
5006		    (rand/icr->icr_offlink)*icr->icr_offlink == rand) {
5007			ire_delete(ire);
5008			return;
5009		}
5010	} else {
5011		/* Use unused fraction */
5012		if (icr->icr_unused != 0 &&
5013		    (rand/icr->icr_unused)*icr->icr_unused == rand) {
5014			ire_delete(ire);
5015			return;
5016		}
5017	}
5018 done:
5019	/*
5020	 * Update tire_mark so that those that haven't been used since this
5021	 * reclaim will be considered unused next time we reclaim.
5022	 */
5023	ire->ire_tire_mark = ire->ire_ob_pkt_count + ire->ire_ib_pkt_count;
5024 }
5025
5026 static void
5027 power2_roundup(uint32_t *value)
5028 {
5029	int i;
5030
5031	for (i = 1; i < 31; i++) {
5032		if (*value <= (1 << i))
5033			break;
5034	}
5035	*value = (1 << i);
5036 }
5037
5038 /* Global init for all zones */
5039 void
5040 ip_ire_g_init()
5041 {
5042	/*
5043	 * Create the ire caches; ire_reclaim()
5044	 * will give IRE_CACHE entries back to the system when needed.
5045	 * This needs to be done here before anything else, since
5046	 * ire_add() expects the cache to be created.
5047	 */
5048	ire_cache = kmem_cache_create("ire_cache",
5049	    sizeof (ire_t), 0, ip_ire_constructor,
5050	    ip_ire_destructor, ip_trash_ire_reclaim, NULL, NULL, 0);
5051
5052	rt_entry_cache = kmem_cache_create("rt_entry",
5053	    sizeof (struct rt_entry), 0, NULL, NULL, NULL, NULL, NULL, 0);
5054
5055	/*
5056	 * Have radix code setup kmem caches etc.
5057	 */
5058	rn_init();
5059 }
5060
5061 void
5062 ip_ire_init(ip_stack_t *ipst)
5063 {
5064	int i;
5065	uint32_t mem_cnt;
5066	uint32_t cpu_cnt;
5067	uint32_t min_cnt;
5068	pgcnt_t mem_avail;
5069
5070	/*
5071	 * ip_ire_max_bucket_cnt is sized below based on the memory
5072	 * size and the cpu speed of the machine. This is upper
5073	 * bounded by the compile time value of ip_ire_max_bucket_cnt
5074	 * and is lower bounded by the compile time value of
5075	 * ip_ire_min_bucket_cnt. Similar logic applies to
5076	 * ip6_ire_max_bucket_cnt.
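	 *
	 * A purely illustrative example of the arithmetic below, assuming
	 * an ip_ire_mem_ratio shift of, say, 6: with a mem_avail of 1 GB,
	 * about 16 MB worth of ire_t structures are budgeted, and dividing
	 * that by ip_cache_table_size and sizeof (ire_t) yields mem_cnt,
	 * the per-bucket limit implied by memory alone.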
5077	 *
5078	 * We calculate this for each IP instance in order to use
5079	 * the kmem_avail and ip_ire_{min,max}_bucket_cnt that are
5080	 * in effect when the zone is booted.
5081	 */
5082	mem_avail = kmem_avail();
5083	mem_cnt = (mem_avail >> ip_ire_mem_ratio) /
5084	    ip_cache_table_size / sizeof (ire_t);
5085	cpu_cnt = CPU->cpu_type_info.pi_clock >> ip_ire_cpu_ratio;
5086
5087	min_cnt = MIN(cpu_cnt, mem_cnt);
5088	if (min_cnt < ip_ire_min_bucket_cnt)
5089		min_cnt = ip_ire_min_bucket_cnt;
5090	if (ip_ire_max_bucket_cnt > min_cnt) {
5091		ip_ire_max_bucket_cnt = min_cnt;
5092	}
5093
5094	mem_cnt = (mem_avail >> ip_ire_mem_ratio) /
5095	    ip6_cache_table_size / sizeof (ire_t);
5096	min_cnt = MIN(cpu_cnt, mem_cnt);
5097	if (min_cnt < ip6_ire_min_bucket_cnt)
5098		min_cnt = ip6_ire_min_bucket_cnt;
5099	if (ip6_ire_max_bucket_cnt > min_cnt) {
5100		ip6_ire_max_bucket_cnt = min_cnt;
5101	}
5102
5103	mutex_init(&ipst->ips_ire_ft_init_lock, NULL, MUTEX_DEFAULT, 0);
5104	mutex_init(&ipst->ips_ire_handle_lock, NULL, MUTEX_DEFAULT, NULL);
5105	mutex_init(&ipst->ips_ire_mrtun_lock, NULL, MUTEX_DEFAULT, NULL);
5106	mutex_init(&ipst->ips_ire_srcif_table_lock, NULL, MUTEX_DEFAULT, NULL);
5107
5108	(void) rn_inithead((void **)&ipst->ips_ip_ftable, 32);
5109
5110
5111	/* Calculate the IPv4 cache table size. */
5112	ipst->ips_ip_cache_table_size = MAX(ip_cache_table_size,
5113	    ((mem_avail >> ip_ire_mem_ratio) / sizeof (ire_t) /
5114	    ip_ire_max_bucket_cnt));
5115	if (ipst->ips_ip_cache_table_size > ip_max_cache_table_size)
5116		ipst->ips_ip_cache_table_size = ip_max_cache_table_size;
5117	/*
5118	 * Make sure that the table size is always a power of 2. The
5119	 * hash macro IRE_ADDR_HASH() depends on that.
5120	 */
5121	power2_roundup(&ipst->ips_ip_cache_table_size);
5122
5123	ipst->ips_ip_cache_table = kmem_zalloc(ipst->ips_ip_cache_table_size *
5124	    sizeof (irb_t), KM_SLEEP);
5125
5126	for (i = 0; i < ipst->ips_ip_cache_table_size; i++) {
5127		rw_init(&ipst->ips_ip_cache_table[i].irb_lock, NULL,
5128		    RW_DEFAULT, NULL);
5129	}
5130
5131	/* Calculate the IPv6 cache table size. */
5132	ipst->ips_ip6_cache_table_size = MAX(ip6_cache_table_size,
5133	    ((mem_avail >> ip_ire_mem_ratio) / sizeof (ire_t) /
5134	    ip6_ire_max_bucket_cnt));
5135	if (ipst->ips_ip6_cache_table_size > ip6_max_cache_table_size)
5136		ipst->ips_ip6_cache_table_size = ip6_max_cache_table_size;
5137	/*
5138	 * Make sure that the table size is always a power of 2. The
5139	 * hash macro IRE_ADDR_HASH_V6() depends on that.
5140	 */
5141	power2_roundup(&ipst->ips_ip6_cache_table_size);
5142
5143	ipst->ips_ip_cache_table_v6 = kmem_zalloc(
5144	    ipst->ips_ip6_cache_table_size * sizeof (irb_t), KM_SLEEP);
5145
5146	for (i = 0; i < ipst->ips_ip6_cache_table_size; i++) {
5147		rw_init(&ipst->ips_ip_cache_table_v6[i].irb_lock, NULL,
5148		    RW_DEFAULT, NULL);
5149	}
5150
5151	/*
5152	 * Initialize ip_mrtun_table to NULL now; it will be
5153	 * populated by ip_rt_add if a reverse tunnel is created.
5154	 */
5155	ipst->ips_ip_mrtun_table = NULL;
5156
5157	/*
5158	 * Make sure that the forwarding table size is a power of 2.
5159	 * The IRE*_ADDR_HASH() macros depend on that.
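	 * power2_roundup() rounds up to the next power of two, so a
	 * requested size of e.g. 600 becomes 1024.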
5160 */ 5161 ipst->ips_ip6_ftable_hash_size = ip6_ftable_hash_size; 5162 power2_roundup(&ipst->ips_ip6_ftable_hash_size); 5163 5164 ipst->ips_ire_handle = 1; 5165 } 5166 5167 void 5168 ip_ire_g_fini(void) 5169 { 5170 kmem_cache_destroy(ire_cache); 5171 kmem_cache_destroy(rt_entry_cache); 5172 5173 rn_fini(); 5174 } 5175 5176 void 5177 ip_ire_fini(ip_stack_t *ipst) 5178 { 5179 int i; 5180 5181 /* 5182 * Delete all IREs - assumes that the ill/ipifs have 5183 * been removed so what remains are just the ftable and IRE_CACHE. 5184 */ 5185 ire_walk_ill_mrtun(0, 0, ire_delete, NULL, NULL, ipst); 5186 ire_walk(ire_delete, NULL, ipst); 5187 5188 rn_freehead(ipst->ips_ip_ftable); 5189 ipst->ips_ip_ftable = NULL; 5190 5191 mutex_destroy(&ipst->ips_ire_ft_init_lock); 5192 mutex_destroy(&ipst->ips_ire_handle_lock); 5193 mutex_destroy(&ipst->ips_ire_mrtun_lock); 5194 mutex_destroy(&ipst->ips_ire_srcif_table_lock); 5195 5196 for (i = 0; i < ipst->ips_ip_cache_table_size; i++) { 5197 ASSERT(ipst->ips_ip_cache_table[i].irb_ire == NULL); 5198 rw_destroy(&ipst->ips_ip_cache_table[i].irb_lock); 5199 } 5200 kmem_free(ipst->ips_ip_cache_table, 5201 ipst->ips_ip_cache_table_size * sizeof (irb_t)); 5202 ipst->ips_ip_cache_table = NULL; 5203 5204 for (i = 0; i < ipst->ips_ip6_cache_table_size; i++) { 5205 ASSERT(ipst->ips_ip_cache_table_v6[i].irb_ire == NULL); 5206 rw_destroy(&ipst->ips_ip_cache_table_v6[i].irb_lock); 5207 } 5208 kmem_free(ipst->ips_ip_cache_table_v6, 5209 ipst->ips_ip6_cache_table_size * sizeof (irb_t)); 5210 ipst->ips_ip_cache_table_v6 = NULL; 5211 5212 if (ipst->ips_ip_mrtun_table != NULL) { 5213 for (i = 0; i < IP_MRTUN_TABLE_SIZE; i++) { 5214 ASSERT(ipst->ips_ip_mrtun_table[i].irb_ire == NULL); 5215 rw_destroy(&ipst->ips_ip_mrtun_table[i].irb_lock); 5216 } 5217 kmem_free(ipst->ips_ip_mrtun_table, 5218 IP_MRTUN_TABLE_SIZE * sizeof (irb_t)); 5219 ipst->ips_ip_mrtun_table = NULL; 5220 } 5221 5222 for (i = 0; i < IP6_MASK_TABLE_SIZE; i++) { 5223 irb_t *ptr; 5224 int j; 5225 5226 if ((ptr = ipst->ips_ip_forwarding_table_v6[i]) == NULL) 5227 continue; 5228 5229 for (j = 0; j < ipst->ips_ip6_ftable_hash_size; j++) { 5230 ASSERT(ptr[j].irb_ire == NULL); 5231 rw_destroy(&ptr[j].irb_lock); 5232 } 5233 mi_free(ptr); 5234 ipst->ips_ip_forwarding_table_v6[i] = NULL; 5235 } 5236 } 5237 5238 int 5239 ire_add_mrtun(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func) 5240 { 5241 ire_t *ire1; 5242 irb_t *irb_ptr; 5243 ire_t **irep; 5244 ire_t *ire = *ire_p; 5245 int i; 5246 uint_t max_frag; 5247 ill_t *stq_ill; 5248 int error; 5249 ip_stack_t *ipst = ire->ire_ipst; 5250 5251 ASSERT(ire->ire_ipversion == IPV4_VERSION); 5252 /* Is ip_mrtun_table empty ? 
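	 * It is created lazily on first use; the NULL check is repeated
	 * below under ips_ire_mrtun_lock so that only one thread
	 * allocates it.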
*/ 5253 5254 if (ipst->ips_ip_mrtun_table == NULL) { 5255 /* create the mrtun table */ 5256 mutex_enter(&ipst->ips_ire_mrtun_lock); 5257 if (ipst->ips_ip_mrtun_table == NULL) { 5258 ipst->ips_ip_mrtun_table = kmem_zalloc( 5259 IP_MRTUN_TABLE_SIZE * sizeof (irb_t), KM_NOSLEEP); 5260 5261 if (ipst->ips_ip_mrtun_table == NULL) { 5262 ip2dbg(("ire_add_mrtun: allocation failure\n")); 5263 mutex_exit(&ipst->ips_ire_mrtun_lock); 5264 ire_refrele(ire); 5265 *ire_p = NULL; 5266 return (ENOMEM); 5267 } 5268 5269 for (i = 0; i < IP_MRTUN_TABLE_SIZE; i++) { 5270 rw_init(&ipst->ips_ip_mrtun_table[i].irb_lock, NULL, 5271 RW_DEFAULT, NULL); 5272 } 5273 ip2dbg(("ire_add_mrtun: mrtun table is created\n")); 5274 } 5275 /* some other thread got it and created the table */ 5276 mutex_exit(&ipst->ips_ire_mrtun_lock); 5277 } 5278 5279 /* 5280 * Check for duplicate in the bucket and insert in the table 5281 */ 5282 irb_ptr = &(ipst->ips_ip_mrtun_table[IRE_ADDR_HASH(ire->ire_in_src_addr, 5283 IP_MRTUN_TABLE_SIZE)]); 5284 5285 /* 5286 * Start the atomic add of the ire. Grab the ill locks, 5287 * ill_g_usesrc_lock and the bucket lock. 5288 * 5289 * If ipif or ill is changing ire_atomic_start() may queue the 5290 * request and return EINPROGRESS. 5291 */ 5292 error = ire_atomic_start(irb_ptr, ire, q, mp, func); 5293 if (error != 0) { 5294 /* 5295 * We don't know whether it is a valid ipif or not. 5296 * So, set it to NULL. This assumes that the ire has not added 5297 * a reference to the ipif. 5298 */ 5299 ire->ire_ipif = NULL; 5300 ire_delete(ire); 5301 ip1dbg(("ire_add_mrtun: ire_atomic_start failed\n")); 5302 *ire_p = NULL; 5303 return (error); 5304 } 5305 for (ire1 = irb_ptr->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) { 5306 if (ire1->ire_marks & IRE_MARK_CONDEMNED) 5307 continue; 5308 /* has anyone inserted the route in the meanwhile ? */ 5309 if (ire1->ire_in_ill == ire->ire_in_ill && 5310 ire1->ire_in_src_addr == ire->ire_in_src_addr) { 5311 ip1dbg(("ire_add_mrtun: Duplicate entry exists\n")); 5312 IRE_REFHOLD(ire1); 5313 ire_atomic_end(irb_ptr, ire); 5314 ire_delete(ire); 5315 /* Return the old ire */ 5316 *ire_p = ire1; 5317 return (0); 5318 } 5319 } 5320 5321 /* Atomically set the ire_max_frag */ 5322 max_frag = *ire->ire_max_fragp; 5323 ire->ire_max_fragp = NULL; 5324 ire->ire_max_frag = MIN(max_frag, IP_MAXPACKET); 5325 ASSERT(ire->ire_type != IRE_CACHE); 5326 irep = (ire_t **)irb_ptr; 5327 if (*irep != NULL) { 5328 /* Find the last ire which matches ire_in_src_addr */ 5329 ire1 = *irep; 5330 while (ire1->ire_in_src_addr == ire->ire_in_src_addr) { 5331 irep = &ire1->ire_next; 5332 ire1 = *irep; 5333 if (ire1 == NULL) 5334 break; 5335 } 5336 } 5337 ire1 = *irep; 5338 if (ire1 != NULL) 5339 ire1->ire_ptpn = &ire->ire_next; 5340 ire->ire_next = ire1; 5341 /* Link the new one in. 
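	 * As in ire_add_v4(), the membar_producer() below ensures the new
	 * ire is fully visible before *irep points to it, for the benefit
	 * of ire_walk routines that read ire_next without holding a lock.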
*/ 5342 ire->ire_ptpn = irep; 5343 membar_producer(); 5344 *irep = ire; 5345 ire->ire_bucket = irb_ptr; 5346 IRE_REFHOLD_LOCKED(ire); 5347 5348 ip2dbg(("ire_add_mrtun: created and linked ire %p\n", (void *)*irep)); 5349 5350 /* 5351 * Protect ire_mrtun_count and ill_mrtun_refcnt from 5352 * another thread trying to add ire in the table 5353 */ 5354 mutex_enter(&ipst->ips_ire_mrtun_lock); 5355 ipst->ips_ire_mrtun_count++; 5356 mutex_exit(&ipst->ips_ire_mrtun_lock); 5357 /* 5358 * ill_mrtun_refcnt is protected by the ill_lock held via 5359 * ire_atomic_start 5360 */ 5361 ire->ire_in_ill->ill_mrtun_refcnt++; 5362 5363 if (ire->ire_ipif != NULL) { 5364 ire->ire_ipif->ipif_ire_cnt++; 5365 if (ire->ire_stq != NULL) { 5366 stq_ill = (ill_t *)ire->ire_stq->q_ptr; 5367 stq_ill->ill_ire_cnt++; 5368 } 5369 } else { 5370 ASSERT(ire->ire_stq == NULL); 5371 } 5372 5373 ire_atomic_end(irb_ptr, ire); 5374 nce_fastpath(ire->ire_nce); 5375 *ire_p = ire; 5376 return (0); 5377 } 5378 5379 5380 /* Walks down the mrtun table */ 5381 5382 void 5383 ire_walk_ill_mrtun(uint_t match_flags, uint_t ire_type, pfv_t func, void *arg, 5384 ill_t *ill, ip_stack_t *ipst) 5385 { 5386 irb_t *irb; 5387 ire_t *ire; 5388 int i; 5389 int ret; 5390 5391 ASSERT((!(match_flags & (MATCH_IRE_WQ | MATCH_IRE_ILL | 5392 MATCH_IRE_ILL_GROUP))) || (ill != NULL)); 5393 ASSERT(match_flags == 0 || ire_type == IRE_MIPRTUN); 5394 5395 mutex_enter(&ipst->ips_ire_mrtun_lock); 5396 if (ipst->ips_ire_mrtun_count == 0) { 5397 mutex_exit(&ipst->ips_ire_mrtun_lock); 5398 return; 5399 } 5400 mutex_exit(&ipst->ips_ire_mrtun_lock); 5401 5402 ip2dbg(("ire_walk_ill_mrtun:walking the reverse tunnel table \n")); 5403 for (i = 0; i < IP_MRTUN_TABLE_SIZE; i++) { 5404 5405 irb = &(ipst->ips_ip_mrtun_table[i]); 5406 if (irb->irb_ire == NULL) 5407 continue; 5408 IRB_REFHOLD(irb); 5409 for (ire = irb->irb_ire; ire != NULL; 5410 ire = ire->ire_next) { 5411 ASSERT(ire->ire_ipversion == IPV4_VERSION); 5412 if (match_flags != 0) { 5413 ret = ire_walk_ill_match( 5414 match_flags, ire_type, 5415 ire, ill, ALL_ZONES, ipst); 5416 } 5417 if (match_flags == 0 || ret) 5418 (*func)(ire, arg); 5419 } 5420 IRB_REFRELE(irb); 5421 } 5422 } 5423 5424 /* 5425 * Source interface based lookup routine (IPV4 only). 5426 * This routine is called only when RTA_SRCIFP bitflag is set 5427 * by routing socket while adding/deleting the route and it is 5428 * also called from ip_rput() when packets arrive from an interface 5429 * for which ill_srcif_ref_cnt is positive. This function is useful 5430 * when a packet coming from one interface must be forwarded to another 5431 * designated interface to reach the correct node. This function is also 5432 * called from ip_newroute when the link-layer address of an ire is resolved. 5433 * We need to make sure that ip_newroute searches for IRE_IF_RESOLVER type 5434 * ires--thus the ire_type parameter is needed. 
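 * If the caller passes no MATCH_IRE_TYPE flag, the lookup below defaults
 * to matching any IRE_INTERFACE entry.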
/*
 * Source interface based lookup routine (IPv4 only).
 * This routine is called only when the RTA_SRCIFP bitflag is set
 * by the routing socket while adding/deleting a route, and it is
 * also called from ip_rput() when packets arrive from an interface
 * for which ill_srcif_ref_cnt is positive. This function is useful
 * when a packet coming from one interface must be forwarded to another
 * designated interface to reach the correct node. This function is also
 * called from ip_newroute when the link-layer address of an ire is resolved.
 * We need to make sure that ip_newroute searches for IRE_IF_RESOLVER type
 * ires; thus the ire_type parameter is needed.
 */

ire_t *
ire_srcif_table_lookup(ipaddr_t dst_addr, int ire_type, ipif_t *ipif,
    ill_t *in_ill, int flags)
{
	irb_t *irb_ptr;
	ire_t *ire;
	irb_t *ire_srcif_table;

	ASSERT(in_ill != NULL && !in_ill->ill_isv6);
	ASSERT(!(flags & (MATCH_IRE_ILL|MATCH_IRE_ILL_GROUP)) ||
	    (ipif != NULL && !ipif->ipif_isv6));

	/*
	 * No need to lock the ill since it is refheld by the caller of this
	 * function.
	 */
	if (in_ill->ill_srcif_table == NULL) {
		return (NULL);
	}

	if (!(flags & MATCH_IRE_TYPE)) {
		flags |= MATCH_IRE_TYPE;
		ire_type = IRE_INTERFACE;
	}
	ire_srcif_table = in_ill->ill_srcif_table;
	irb_ptr = &ire_srcif_table[IRE_ADDR_HASH(dst_addr,
	    IP_SRCIF_TABLE_SIZE)];
	rw_enter(&irb_ptr->irb_lock, RW_READER);
	for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) {
		if (ire->ire_marks & IRE_MARK_CONDEMNED)
			continue;
		if (ire_match_args(ire, dst_addr, ire->ire_mask, 0,
		    ire_type, ipif, ire->ire_zoneid, 0, NULL, flags)) {
			IRE_REFHOLD(ire);
			rw_exit(&irb_ptr->irb_lock);
			return (ire);
		}
	}
	/* Not found */
	rw_exit(&irb_ptr->irb_lock);
	return (NULL);
}
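/*
 * Usage sketch (illustrative; dst and in_ill are whatever the caller
 * has in hand): the receive path can ask whether an interface route
 * for dst exists in in_ill's source-interface table before falling
 * back to the normal forwarding table:
 *
 *	ire = ire_srcif_table_lookup(dst, IRE_IF_RESOLVER, NULL,
 *	    in_ill, MATCH_IRE_TYPE);
 *	if (ire != NULL) {
 *		... forward using ire ...
 *		ire_refrele(ire);
 *	}
 *
 * The lookup returns a held ire (IRE_REFHOLD), so every successful
 * call must be balanced with an ire_refrele().
 */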
/*
 * Adds the ire into the special routing table which is hanging off of
 * the src_ipif->ipif_ill. It also increments the refcnt in the ill.
 * The forward table contains only IRE_IF_RESOLVER and IRE_IF_NORESOLVER,
 * i.e. IRE_INTERFACE, entries. Initially the dlureq_mp field is NULL
 * for an IRE_IF_RESOLVER entry because we do not have the dst_addr's
 * link-layer address at the time of addition.
 * Upon resolving the address from ARP, the dlureq_mp field is updated with
 * the proper information in ire_update_srcif_v4.
 */
static int
ire_add_srcif_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func)
{
	ire_t *ire1;
	irb_t *ire_srcifp_table = NULL;
	irb_t *irb_ptr = NULL;
	ire_t **irep;
	ire_t *ire;
	int flags;
	int i;
	ill_t *stq_ill;
	uint_t max_frag;
	int error = 0;
	ip_stack_t *ipst;

	ire = *ire_p;
	ipst = ire->ire_ipst;
	ASSERT(ire->ire_in_ill != NULL);
	ASSERT(ire->ire_ipversion == IPV4_VERSION);
	ASSERT(ire->ire_type == IRE_IF_NORESOLVER ||
	    ire->ire_type == IRE_IF_RESOLVER);

	ire->ire_mask = IP_HOST_MASK;
	/*
	 * Update ire_nce->nce_res_mp with a NULL value upon creation;
	 * first free the default res_mp created by ire_nce_init.
	 */
	freeb(ire->ire_nce->nce_res_mp);
	if (ire->ire_type == IRE_IF_RESOLVER) {
		/*
		 * Assign NULL now; it will be updated
		 * with the correct value upon returning from
		 * ARP.
		 */
		ire->ire_nce->nce_res_mp = NULL;
	} else {
		ire->ire_nce->nce_res_mp = ill_dlur_gen(NULL,
		    ire->ire_ipif->ipif_ill->ill_phys_addr_length,
		    ire->ire_ipif->ipif_ill->ill_sap,
		    ire->ire_ipif->ipif_ill->ill_sap_length);
	}
	/* Make sure the address is properly masked. */
	ire->ire_addr &= ire->ire_mask;

	ASSERT(ire->ire_max_fragp != NULL);
	max_frag = *ire->ire_max_fragp;
	ire->ire_max_fragp = NULL;
	ire->ire_max_frag = MIN(max_frag, IP_MAXPACKET);

	mutex_enter(&ire->ire_in_ill->ill_lock);
	if (ire->ire_in_ill->ill_srcif_table == NULL) {
		/* create the incoming interface based table */
		ire->ire_in_ill->ill_srcif_table = kmem_zalloc(
		    IP_SRCIF_TABLE_SIZE * sizeof (irb_t), KM_NOSLEEP);
		if (ire->ire_in_ill->ill_srcif_table == NULL) {
			ip1dbg(("ire_add_srcif_v4: allocation failure\n"));
			mutex_exit(&ire->ire_in_ill->ill_lock);
			ire_delete(ire);
			*ire_p = NULL;
			return (ENOMEM);
		}
		ire_srcifp_table = ire->ire_in_ill->ill_srcif_table;
		for (i = 0; i < IP_SRCIF_TABLE_SIZE; i++) {
			rw_init(&ire_srcifp_table[i].irb_lock, NULL,
			    RW_DEFAULT, NULL);
		}
		ip2dbg(("ire_add_srcif_v4: table created for ill %p\n",
		    (void *)ire->ire_in_ill));
	}
	/* Check for a duplicate and insert */
	ASSERT(ire->ire_in_ill->ill_srcif_table != NULL);
	irb_ptr =
	    &(ire->ire_in_ill->ill_srcif_table[IRE_ADDR_HASH(ire->ire_addr,
	    IP_SRCIF_TABLE_SIZE)]);
	mutex_exit(&ire->ire_in_ill->ill_lock);
	flags = (MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW);
	flags |= MATCH_IRE_IPIF;
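	/*
	 * For illustration: with the flag set assembled above, the
	 * duplicate scan below treats an existing ire1 as a match only
	 * if all of the following hold (see ire_match_args):
	 *
	 *	MATCH_IRE_MASK	- same netmask (IP_HOST_MASK here),
	 *	MATCH_IRE_TYPE	- same ire_type,
	 *	MATCH_IRE_GW	- same gateway (0 in this call),
	 *	MATCH_IRE_IPIF	- same ipif.
	 */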
	/*
	 * Start the atomic add of the ire. Grab the ill locks,
	 * ill_g_usesrc_lock and the bucket lock.
	 *
	 * If ipif or ill is changing ire_atomic_start() may queue the
	 * request and return EINPROGRESS.
	 */
	error = ire_atomic_start(irb_ptr, ire, q, mp, func);
	if (error != 0) {
		/*
		 * We don't know whether it is a valid ipif or not.
		 * So, set it to NULL. This assumes that the ire has not added
		 * a reference to the ipif.
		 */
		ire->ire_ipif = NULL;
		ire_delete(ire);
		ip1dbg(("ire_add_srcif_v4: ire_atomic_start failed\n"));
		*ire_p = NULL;
		return (error);
	}
	for (ire1 = irb_ptr->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
		if (ire1->ire_marks & IRE_MARK_CONDEMNED)
			continue;
		if (ire1->ire_zoneid != ire->ire_zoneid)
			continue;
		/* Has anyone inserted the route in the meantime? */
		if (ire_match_args(ire1, ire->ire_addr, ire->ire_mask, 0,
		    ire->ire_type, ire->ire_ipif, ire->ire_zoneid, 0, NULL,
		    flags)) {
			ip1dbg(("ire_add_srcif_v4: duplicate entry exists\n"));
			IRE_REFHOLD(ire1);
			ire_atomic_end(irb_ptr, ire);
			ire_delete(ire);
			/* Return the old ire as in ire_add_v4 */
			*ire_p = ire1;
			return (0);
		}
	}
	irep = (ire_t **)irb_ptr;
	if (*irep != NULL) {
		/* Find the last ire which matches ire_addr */
		ire1 = *irep;
		while (ire1->ire_addr == ire->ire_addr) {
			irep = &ire1->ire_next;
			ire1 = *irep;
			if (ire1 == NULL)
				break;
		}
	}
	ire1 = *irep;
	if (ire1 != NULL)
		ire1->ire_ptpn = &ire->ire_next;
	ire->ire_next = ire1;
	/* Link the new one in. */
	ire->ire_ptpn = irep;
	membar_producer();
	*irep = ire;
	ire->ire_bucket = irb_ptr;
	IRE_REFHOLD_LOCKED(ire);

	/*
	 * Protect ire_in_ill->ill_srcif_refcnt and the table reference count.
	 * Note that ire_atomic_start already grabbed ire_in_ill->ill_lock,
	 * so ill_srcif_refcnt is already protected.
	 */
	ire->ire_in_ill->ill_srcif_refcnt++;
	mutex_enter(&ipst->ips_ire_srcif_table_lock);
	ipst->ips_ire_srcif_table_count++;
	mutex_exit(&ipst->ips_ire_srcif_table_lock);
	irb_ptr->irb_ire_cnt++;
	if (ire->ire_ipif != NULL) {
		ire->ire_ipif->ipif_ire_cnt++;
		if (ire->ire_stq != NULL) {
			stq_ill = (ill_t *)ire->ire_stq->q_ptr;
			stq_ill->ill_ire_cnt++;
		}
	} else {
		ASSERT(ire->ire_stq == NULL);
	}

	ire_atomic_end(irb_ptr, ire);
	*ire_p = ire;
	return (0);
}
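/*
 * Resolution flow for source-interface routes (a sketch of the
 * sequence described in the surrounding comments; exact entry points
 * in the send path may vary):
 *
 *	ire_add_srcif_v4()	IRE_IF_RESOLVER inserted with a NULL
 *				nce_res_mp (link-layer address unknown).
 *	ARP request/reply	ip_newroute drives resolution of dst.
 *	ire_add_then_send()	receives the resolved ire mblk and calls
 *	ire_update_srcif_v4()	which copies the resolver info into the
 *				entry that is already in the table.
 */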
/*
 * This function is called by ire_add_then_send when the ARP request comes
 * back to ip_wput->ire_add_then_send for a resolved ire in the interface
 * based routing table. At this point, it only needs to update the resolver
 * information for the ire. The passed-in ire is returned to the caller as it
 * is the ire which was created as an mblk.
 */
static ire_t *
ire_update_srcif_v4(ire_t *ire)
{
	ire_t *ire1;
	irb_t *irb;
	int error;

	ASSERT(ire->ire_type != IRE_MIPRTUN &&
	    ire->ire_ipif->ipif_net_type == IRE_IF_RESOLVER);
	ASSERT(ire->ire_ipversion == IPV4_VERSION);

	/*
	 * This ire is from ARP. Update
	 * the ire_nce->nce_res_mp info.
	 */
	ire1 = ire_srcif_table_lookup(ire->ire_addr,
	    IRE_IF_RESOLVER, ire->ire_ipif,
	    ire->ire_in_ill,
	    MATCH_IRE_ILL | MATCH_IRE_TYPE);
	if (ire1 == NULL) {
		/* Mobile node registration expired? */
		ire_delete(ire);
		return (NULL);
	}
	irb = ire1->ire_bucket;
	ASSERT(irb != NULL);
	/*
	 * Start the atomic add of the ire. Grab the ill locks,
	 * ill_g_usesrc_lock and the bucket lock.
	 */
	error = ire_atomic_start(irb, ire1, NULL, NULL, NULL);
	if (error != 0) {
		/*
		 * We don't know whether it is a valid ipif or not.
		 * So, set it to NULL. This assumes that the ire has not added
		 * a reference to the ipif.
		 */
		ire->ire_ipif = NULL;
		ire_delete(ire);
		ip1dbg(("ire_update_srcif_v4: ire_atomic_start failed\n"));
		/* Drop the hold taken by ire_srcif_table_lookup. */
		ire_refrele(ire1);
		return (NULL);
	}
	ASSERT(ire->ire_max_fragp == NULL);
	ire->ire_max_frag = ire1->ire_max_frag;
	/*
	 * Update the resolver information and
	 * the send-to queue.
	 */
	ASSERT(ire->ire_nce->nce_res_mp != NULL);
	ire1->ire_nce->nce_res_mp = copyb(ire->ire_nce->nce_res_mp);
	if (ire1->ire_nce->nce_res_mp == NULL) {
		ip0dbg(("ire_update_srcif_v4: copyb failed\n"));
		ire_atomic_end(irb, ire1);
		ire_refrele(ire1);
		ire_refrele(ire);
		return (NULL);
	}
	ire1->ire_stq = ire->ire_stq;

	ASSERT(ire->ire_nce->nce_fp_mp == NULL);

	ire_atomic_end(irb, ire1);
	ire_refrele(ire1);
	/* Return the passed-in ire */
	return (ire);	/* Update done */
}


/*
 * Check if another multirt route resolution is needed.
 * B_TRUE is returned if there remains a resolvable route,
 * or if no route for that dst has been resolved yet.
 * B_FALSE is returned if all routes for that dst are resolved,
 * or if the remaining unresolved routes are actually not
 * resolvable.
 * This only works in the global zone.
 */
boolean_t
ire_multirt_need_resolve(ipaddr_t dst, const ts_label_t *tsl, ip_stack_t *ipst)
{
	ire_t *first_fire;
	ire_t *first_cire;
	ire_t *fire;
	ire_t *cire;
	irb_t *firb;
	irb_t *cirb;
	int unres_cnt = 0;
	boolean_t resolvable = B_FALSE;

	/* Retrieve the first IRE_HOST that matches the destination. */
	first_fire = ire_ftable_lookup(dst, IP_HOST_MASK, 0, IRE_HOST, NULL,
	    NULL, ALL_ZONES, 0, tsl,
	    MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_SECATTR, ipst);

	/* No route at all */
	if (first_fire == NULL) {
		return (B_TRUE);
	}

	firb = first_fire->ire_bucket;
	ASSERT(firb != NULL);

	/* Retrieve the first IRE_CACHE ire for that destination. */
	first_cire = ire_cache_lookup(dst, GLOBAL_ZONEID, tsl, ipst);

	/* No resolved route. */
	if (first_cire == NULL) {
		ire_refrele(first_fire);
		return (B_TRUE);
	}

	/*
	 * At least one route is resolved. Here we look through the forward
	 * and cache tables, to compare the number of declared routes
	 * with the number of resolved routes. The search for a resolvable
	 * route is performed only if at least one route remains
	 * unresolved.
	 */
	cirb = first_cire->ire_bucket;
	ASSERT(cirb != NULL);

	/* Count the number of routes to that dest that are declared. */
	IRB_REFHOLD(firb);
	for (fire = first_fire; fire != NULL; fire = fire->ire_next) {
		if (!(fire->ire_flags & RTF_MULTIRT))
			continue;
		if (fire->ire_addr != dst)
			continue;
		unres_cnt++;
	}
	IRB_REFRELE(firb);

	/* Then subtract the number of routes to that dst that are resolved */
	IRB_REFHOLD(cirb);
	for (cire = first_cire; cire != NULL; cire = cire->ire_next) {
		if (!(cire->ire_flags & RTF_MULTIRT))
			continue;
		if (cire->ire_addr != dst)
			continue;
		if (cire->ire_marks & (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN))
			continue;
		unres_cnt--;
	}
	IRB_REFRELE(cirb);

	/* At least one route is unresolved; search for a resolvable route. */
	if (unres_cnt > 0)
		resolvable = ire_multirt_lookup(&first_cire, &first_fire,
		    MULTIRT_USESTAMP | MULTIRT_CACHEGW, tsl, ipst);

	if (first_fire != NULL)
		ire_refrele(first_fire);

	if (first_cire != NULL)
		ire_refrele(first_cire);

	return (resolvable);
}
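/*
 * Worked example (hypothetical numbers): if three RTF_MULTIRT IRE_HOST
 * routes to dst are declared and exactly one IRE_CACHE entry for dst is
 * already resolved, the two loops in ire_multirt_need_resolve leave
 * unres_cnt == 3 - 1 == 2, so ire_multirt_lookup() is invoked to look
 * for one of the two remaining routes that can still be resolved.
 */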
/*
 * Explore a forward_table bucket, starting from fire_arg.
 * fire_arg MUST be an IRE_HOST entry.
 *
 * Return B_TRUE and update *ire_arg and *fire_arg
 * if at least one resolvable route is found. *ire_arg
 * is the IRE entry for *fire_arg's gateway.
 *
 * Return B_FALSE otherwise (all routes are resolved or
 * the remaining unresolved routes are all unresolvable).
 *
 * The IRE selection relies on a priority mechanism
 * driven by the flags passed in by the caller.
 * The caller, such as ip_newroute_ipif(), can get the most
 * relevant ire at each stage of a multiple route resolution.
 *
 * The rules are (see also the worked example following this comment):
 *
 * - if MULTIRT_CACHEGW is specified in flags, IRE_CACHETABLE
 * ires are preferred for the gateway. This gives the highest
 * priority to routes that can be resolved without using
 * a resolver.
 *
 * - if MULTIRT_CACHEGW is not specified, or if MULTIRT_CACHEGW
 * is specified but no IRE_CACHETABLE ire entry for the gateway
 * is found, the following rules apply.
 *
 * - if MULTIRT_USESTAMP is specified in flags, IRE_INTERFACE
 * ires for the gateway that have not been tried for
 * a configurable amount of time are preferred.
 * This applies when a resolver must be invoked for
 * a missing route, but we don't want to use the resolver
 * upon each packet emission. If no such resolver is found,
 * B_FALSE is returned.
 * The MULTIRT_USESTAMP flag can be combined with
 * MULTIRT_CACHEGW.
 *
 * - if MULTIRT_USESTAMP is not specified in flags, the first
 * unresolved but resolvable route is selected.
 *
 * - otherwise, there is no resolvable route, and
 * B_FALSE is returned.
 *
 * Finally, MULTIRT_SETSTAMP can be specified in flags to
 * request that the timestamp of unresolvable routes be
 * refreshed. This prevents the useless exploration
 * of those routes for a while, when MULTIRT_USESTAMP is used.
 *
 * This only works in the global zone.
 */
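/*
 * Worked example of the priority rules above (hypothetical situation):
 * with flags == (MULTIRT_CACHEGW | MULTIRT_USESTAMP), a route whose
 * gateway already has an IRE_CACHETABLE entry wins immediately; failing
 * that, a route whose gateway resolver has not been tried within
 * ip_multirt_resolution_interval is chosen; a route tried more recently
 * is skipped in this pass; and with MULTIRT_SETSTAMP an unresolvable
 * route is timestamped so that it is not reconsidered until the
 * interval elapses.
 */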
boolean_t
ire_multirt_lookup(ire_t **ire_arg, ire_t **fire_arg, uint32_t flags,
    const ts_label_t *tsl, ip_stack_t *ipst)
{
	clock_t delta;
	ire_t *best_fire = NULL;
	ire_t *best_cire = NULL;
	ire_t *first_fire;
	ire_t *first_cire;
	ire_t *fire;
	ire_t *cire;
	irb_t *firb = NULL;
	irb_t *cirb = NULL;
	ire_t *gw_ire;
	boolean_t already_resolved;
	boolean_t res;
	ipaddr_t dst;
	ipaddr_t gw;

	ip2dbg(("ire_multirt_lookup: *ire_arg %p, *fire_arg %p, flags %04x\n",
	    (void *)*ire_arg, (void *)*fire_arg, flags));

	ASSERT(ire_arg != NULL);
	ASSERT(fire_arg != NULL);

	/* Not an IRE_HOST ire; give up. */
	if ((*fire_arg == NULL) || ((*fire_arg)->ire_type != IRE_HOST)) {
		return (B_FALSE);
	}

	/* This is the first IRE_HOST ire for that destination. */
	first_fire = *fire_arg;
	firb = first_fire->ire_bucket;
	ASSERT(firb != NULL);

	dst = first_fire->ire_addr;

	ip2dbg(("ire_multirt_lookup: dst %08x\n", ntohl(dst)));

	/*
	 * Retrieve the first IRE_CACHE ire for that destination;
	 * if we don't find one, no route for that dest is
	 * resolved yet.
	 */
	first_cire = ire_cache_lookup(dst, GLOBAL_ZONEID, tsl, ipst);
	if (first_cire != NULL) {
		cirb = first_cire->ire_bucket;
	}

	ip2dbg(("ire_multirt_lookup: first_cire %p\n", (void *)first_cire));

	/*
	 * Search for a resolvable route, giving the top priority
	 * to routes that can be resolved without any call to the resolver.
	 */
	IRB_REFHOLD(firb);

	if (!CLASSD(dst)) {
		/*
		 * For all multiroute IRE_HOST ires for that destination,
		 * check if the route via the IRE_HOST's gateway is
		 * resolved yet.
		 */
		for (fire = first_fire; fire != NULL; fire = fire->ire_next) {

			if (!(fire->ire_flags & RTF_MULTIRT))
				continue;
			if (fire->ire_addr != dst)
				continue;

			if (fire->ire_gw_secattr != NULL &&
			    tsol_ire_match_gwattr(fire, tsl) != 0) {
				continue;
			}

			gw = fire->ire_gateway_addr;

			ip2dbg(("ire_multirt_lookup: fire %p, "
			    "ire_addr %08x, ire_gateway_addr %08x\n",
			    (void *)fire, ntohl(fire->ire_addr), ntohl(gw)));

			already_resolved = B_FALSE;

			if (first_cire != NULL) {
				ASSERT(cirb != NULL);

				IRB_REFHOLD(cirb);
				/*
				 * For all IRE_CACHE ires for that
				 * destination.
				 */
				for (cire = first_cire;
				    cire != NULL;
				    cire = cire->ire_next) {

					if (!(cire->ire_flags & RTF_MULTIRT))
						continue;
					if (cire->ire_addr != dst)
						continue;
					if (cire->ire_marks &
					    (IRE_MARK_CONDEMNED |
					    IRE_MARK_HIDDEN))
						continue;

					if (cire->ire_gw_secattr != NULL &&
					    tsol_ire_match_gwattr(cire,
					    tsl) != 0) {
						continue;
					}

					/*
					 * Check if the IRE_CACHE's gateway
					 * matches the IRE_HOST's gateway.
					 */
					if (cire->ire_gateway_addr == gw) {
						already_resolved = B_TRUE;
						break;
					}
				}
				IRB_REFRELE(cirb);
			}

			/*
			 * This route is already resolved;
			 * proceed with the next one.
			 */
			if (already_resolved) {
				ip2dbg(("ire_multirt_lookup: found cire %p, "
				    "already resolved\n", (void *)cire));
				continue;
			}

			/*
			 * The route is unresolved; is it actually
			 * resolvable, i.e. is there a cache or a resolver
			 * for the gateway?
			 */
			gw_ire = ire_route_lookup(gw, 0, 0, 0, NULL, NULL,
			    ALL_ZONES, tsl,
			    MATCH_IRE_RECURSIVE | MATCH_IRE_SECATTR, ipst);

			ip2dbg(("ire_multirt_lookup: looked up gw_ire %p\n",
			    (void *)gw_ire));

			/*
			 * If gw_ire is of type IRE_CACHETABLE,
			 * this route can be resolved without any call to the
			 * resolver. If the MULTIRT_CACHEGW flag is set,
			 * give the top priority to this ire and exit the
			 * loop.
			 * This is typically the case when an ARP reply
			 * is processed through ip_wput_nondata().
			 */
			if ((flags & MULTIRT_CACHEGW) &&
			    (gw_ire != NULL) &&
			    (gw_ire->ire_type & IRE_CACHETABLE)) {
				ASSERT(gw_ire->ire_nce == NULL ||
				    gw_ire->ire_nce->nce_state ==
				    ND_REACHABLE);
				/*
				 * Release the resolver associated with the
				 * previous candidate best ire, if any.
				 */
				if (best_cire != NULL) {
					ire_refrele(best_cire);
					ASSERT(best_fire != NULL);
				}

				best_fire = fire;
				best_cire = gw_ire;

				ip2dbg(("ire_multirt_lookup: found top prio "
				    "best_fire %p, best_cire %p\n",
				    (void *)best_fire, (void *)best_cire));
				break;
			}

			/*
			 * Compute the time elapsed since our preceding
			 * attempt to resolve that route.
			 * If the MULTIRT_USESTAMP flag is set, we take that
			 * route into account only if this time interval
			 * exceeds ip_multirt_resolution_interval;
			 * this prevents us from attempting to resolve a
			 * broken route upon each sending of a packet.
			 */
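			/*
			 * Scale note (illustrative): lbolt counts clock
			 * ticks, so with the common hz value of 100 a
			 * delta of 500 ticks converts to
			 * TICK_TO_MSEC(500) == 5000 ms, which is then
			 * compared against the interval (in ms).
			 */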
			delta = lbolt - fire->ire_last_used_time;
			delta = TICK_TO_MSEC(delta);

			res = (boolean_t)((delta >
			    ipst->ips_ip_multirt_resolution_interval) ||
			    (!(flags & MULTIRT_USESTAMP)));

			ip2dbg(("ire_multirt_lookup: fire %p, delta %lu, "
			    "res %d\n",
			    (void *)fire, delta, res));

			if (res) {
				/*
				 * We are here if the MULTIRT_USESTAMP flag
				 * is set and the resolver for fire's gateway
				 * has not been tried within
				 * ip_multirt_resolution_interval, or if
				 * MULTIRT_USESTAMP is not set but gw_ire did
				 * not fill the conditions for
				 * MULTIRT_CACHEGW, or if neither
				 * MULTIRT_USESTAMP nor MULTIRT_CACHEGW
				 * are set.
				 */
				if (gw_ire != NULL) {
					if (best_fire == NULL) {
						ASSERT(best_cire == NULL);

						best_fire = fire;
						best_cire = gw_ire;

						ip2dbg(("ire_multirt_lookup:"
						    "found candidate "
						    "best_fire %p, "
						    "best_cire %p\n",
						    (void *)best_fire,
						    (void *)best_cire));

						/*
						 * If MULTIRT_CACHEGW is not
						 * set, we ignore the top
						 * priority ires that can
						 * be resolved without any
						 * call to the resolver;
						 * in that case, there is
						 * actually no need
						 * to continue the loop.
						 */
						if (!(flags &
						    MULTIRT_CACHEGW)) {
							break;
						}
						continue;
					}
				} else {
					/*
					 * No resolver for the gateway: the
					 * route is not resolvable.
					 * If the MULTIRT_SETSTAMP flag is
					 * set, we stamp the IRE_HOST ire,
					 * so we will not select it again
					 * during this resolution interval.
					 */
					if (flags & MULTIRT_SETSTAMP)
						fire->ire_last_used_time =
						    lbolt;
				}
			}

			if (gw_ire != NULL)
				ire_refrele(gw_ire);
		}
	} else { /* CLASSD(dst) */

		for (fire = first_fire;
		    fire != NULL;
		    fire = fire->ire_next) {

			if (!(fire->ire_flags & RTF_MULTIRT))
				continue;
			if (fire->ire_addr != dst)
				continue;

			if (fire->ire_gw_secattr != NULL &&
			    tsol_ire_match_gwattr(fire, tsl) != 0) {
				continue;
			}

			already_resolved = B_FALSE;

			gw = fire->ire_gateway_addr;

			gw_ire = ire_ftable_lookup(gw, 0, 0, IRE_INTERFACE,
			    NULL, NULL, ALL_ZONES, 0, tsl,
			    MATCH_IRE_RECURSIVE | MATCH_IRE_TYPE |
			    MATCH_IRE_SECATTR, ipst);

			/* No resolver for the gateway; we skip this ire. */
			if (gw_ire == NULL) {
				continue;
			}
			ASSERT(gw_ire->ire_nce == NULL ||
			    gw_ire->ire_nce->nce_state == ND_REACHABLE);

			if (first_cire != NULL) {

				IRB_REFHOLD(cirb);
				/*
				 * For all IRE_CACHE ires for that
				 * destination.
				 */
				for (cire = first_cire;
				    cire != NULL;
				    cire = cire->ire_next) {

					if (!(cire->ire_flags & RTF_MULTIRT))
						continue;
					if (cire->ire_addr != dst)
						continue;
					if (cire->ire_marks &
					    (IRE_MARK_CONDEMNED |
					    IRE_MARK_HIDDEN))
						continue;

					if (cire->ire_gw_secattr != NULL &&
					    tsol_ire_match_gwattr(cire,
					    tsl) != 0) {
						continue;
					}

					/*
					 * Cache entries are linked to the
					 * parent routes using the parent
					 * handle (ire_phandle). If no cache
					 * entry has the same handle as fire,
					 * fire is still unresolved.
					 */
					ASSERT(cire->ire_phandle != 0);
					if (cire->ire_phandle ==
					    fire->ire_phandle) {
						already_resolved = B_TRUE;
						break;
					}
				}
				IRB_REFRELE(cirb);
			}

			/*
			 * This route is already resolved; proceed with
			 * the next one.
			 */
			if (already_resolved) {
				ire_refrele(gw_ire);
				continue;
			}

			/*
			 * Compute the time elapsed since our preceding
			 * attempt to resolve that route.
			 * If the MULTIRT_USESTAMP flag is set, we take
			 * that route into account only if this time
			 * interval exceeds ip_multirt_resolution_interval;
			 * this prevents us from attempting to resolve a
			 * broken route upon each sending of a packet.
			 */
			delta = lbolt - fire->ire_last_used_time;
			delta = TICK_TO_MSEC(delta);

			res = (boolean_t)((delta >
			    ipst->ips_ip_multirt_resolution_interval) ||
			    (!(flags & MULTIRT_USESTAMP)));

			ip3dbg(("ire_multirt_lookup: fire %p, delta %lx, "
			    "flags %04x, res %d\n",
			    (void *)fire, delta, flags, res));

			if (res) {
				if (best_cire != NULL) {
					/*
					 * Release the resolver associated
					 * with the preceding candidate best
					 * ire, if any.
					 */
					ire_refrele(best_cire);
					ASSERT(best_fire != NULL);
				}
				best_fire = fire;
				best_cire = gw_ire;
				continue;
			}

			ire_refrele(gw_ire);
		}
	}

	if (best_fire != NULL) {
		IRE_REFHOLD(best_fire);
	}
	IRB_REFRELE(firb);

	/* Release the first IRE_CACHE we initially looked up, if any. */
	if (first_cire != NULL)
		ire_refrele(first_cire);

	/* Found a resolvable route. */
	if (best_fire != NULL) {
		ASSERT(best_cire != NULL);

		if (*fire_arg != NULL)
			ire_refrele(*fire_arg);
		if (*ire_arg != NULL)
			ire_refrele(*ire_arg);

		/*
		 * Update the passed-in arguments with the
		 * resolvable multirt route we found.
		 */
		*fire_arg = best_fire;
		*ire_arg = best_cire;

		ip2dbg(("ire_multirt_lookup: returning B_TRUE, "
		    "*fire_arg %p, *ire_arg %p\n",
		    (void *)best_fire, (void *)best_cire));

		return (B_TRUE);
	}

	ASSERT(best_cire == NULL);

	ip2dbg(("ire_multirt_lookup: returning B_FALSE, *fire_arg %p, "
	    "*ire_arg %p\n",
	    (void *)*fire_arg, (void *)*ire_arg));

	/* No resolvable route. */
	return (B_FALSE);
}
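/*
 * Caller-side sketch (illustrative; error handling and the surrounding
 * resolution machinery are omitted): a sender that keeps resolving
 * multirt routes until none remain might loop as follows, where fire
 * and cire are held ires from earlier lookups, or NULL:
 *
 *	while (ire_multirt_lookup(&cire, &fire,
 *	    MULTIRT_CACHEGW | MULTIRT_USESTAMP, tsl, ipst)) {
 *		... trigger resolution of fire's gateway via cire ...
 *	}
 *
 * On a B_TRUE return the previous *fire_arg and *ire_arg holds have
 * been released and replaced with held pointers to the selected route
 * and its gateway ire, so the caller must eventually ire_refrele both.
 */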
6330 */ 6331 continue; 6332 } 6333 if (ire_to_ill(curr) != ill) { 6334 /* skip over IREs going through a different interface */ 6335 continue; 6336 } 6337 if (curr->ire_marks & IRE_MARK_CONDEMNED) { 6338 /* skip over deleted IREs */ 6339 continue; 6340 } 6341 return (curr); 6342 } 6343 return (NULL); 6344 } 6345 6346 #ifdef IRE_DEBUG 6347 th_trace_t * 6348 th_trace_ire_lookup(ire_t *ire) 6349 { 6350 int bucket_id; 6351 th_trace_t *th_trace; 6352 6353 ASSERT(MUTEX_HELD(&ire->ire_lock)); 6354 6355 bucket_id = IP_TR_HASH(curthread); 6356 ASSERT(bucket_id < IP_TR_HASH_MAX); 6357 6358 for (th_trace = ire->ire_trace[bucket_id]; th_trace != NULL; 6359 th_trace = th_trace->th_next) { 6360 if (th_trace->th_id == curthread) 6361 return (th_trace); 6362 } 6363 return (NULL); 6364 } 6365 6366 void 6367 ire_trace_ref(ire_t *ire) 6368 { 6369 int bucket_id; 6370 th_trace_t *th_trace; 6371 6372 /* 6373 * Attempt to locate the trace buffer for the curthread. 6374 * If it does not exist, then allocate a new trace buffer 6375 * and link it in list of trace bufs for this ipif, at the head 6376 */ 6377 mutex_enter(&ire->ire_lock); 6378 if (ire->ire_trace_disable == B_TRUE) { 6379 mutex_exit(&ire->ire_lock); 6380 return; 6381 } 6382 th_trace = th_trace_ire_lookup(ire); 6383 if (th_trace == NULL) { 6384 bucket_id = IP_TR_HASH(curthread); 6385 th_trace = (th_trace_t *)kmem_zalloc(sizeof (th_trace_t), 6386 KM_NOSLEEP); 6387 if (th_trace == NULL) { 6388 ire->ire_trace_disable = B_TRUE; 6389 mutex_exit(&ire->ire_lock); 6390 ire_trace_inactive(ire); 6391 return; 6392 } 6393 6394 th_trace->th_id = curthread; 6395 th_trace->th_next = ire->ire_trace[bucket_id]; 6396 th_trace->th_prev = &ire->ire_trace[bucket_id]; 6397 if (th_trace->th_next != NULL) 6398 th_trace->th_next->th_prev = &th_trace->th_next; 6399 ire->ire_trace[bucket_id] = th_trace; 6400 } 6401 ASSERT(th_trace->th_refcnt < TR_BUF_MAX - 1); 6402 th_trace->th_refcnt++; 6403 th_trace_rrecord(th_trace); 6404 mutex_exit(&ire->ire_lock); 6405 } 6406 6407 void 6408 ire_trace_free(th_trace_t *th_trace) 6409 { 6410 /* unlink th_trace and free it */ 6411 *th_trace->th_prev = th_trace->th_next; 6412 if (th_trace->th_next != NULL) 6413 th_trace->th_next->th_prev = th_trace->th_prev; 6414 th_trace->th_next = NULL; 6415 th_trace->th_prev = NULL; 6416 kmem_free(th_trace, sizeof (th_trace_t)); 6417 } 6418 6419 void 6420 ire_untrace_ref(ire_t *ire) 6421 { 6422 th_trace_t *th_trace; 6423 6424 mutex_enter(&ire->ire_lock); 6425 6426 if (ire->ire_trace_disable == B_TRUE) { 6427 mutex_exit(&ire->ire_lock); 6428 return; 6429 } 6430 6431 th_trace = th_trace_ire_lookup(ire); 6432 ASSERT(th_trace != NULL && th_trace->th_refcnt > 0); 6433 th_trace_rrecord(th_trace); 6434 th_trace->th_refcnt--; 6435 6436 if (th_trace->th_refcnt == 0) 6437 ire_trace_free(th_trace); 6438 6439 mutex_exit(&ire->ire_lock); 6440 } 6441 6442 static void 6443 ire_trace_inactive(ire_t *ire) 6444 { 6445 th_trace_t *th_trace; 6446 int i; 6447 6448 mutex_enter(&ire->ire_lock); 6449 for (i = 0; i < IP_TR_HASH_MAX; i++) { 6450 while (ire->ire_trace[i] != NULL) { 6451 th_trace = ire->ire_trace[i]; 6452 6453 /* unlink th_trace and free it */ 6454 ire->ire_trace[i] = th_trace->th_next; 6455 if (th_trace->th_next != NULL) 6456 th_trace->th_next->th_prev = 6457 &ire->ire_trace[i]; 6458 6459 th_trace->th_next = NULL; 6460 th_trace->th_prev = NULL; 6461 kmem_free(th_trace, sizeof (th_trace_t)); 6462 } 6463 } 6464 6465 mutex_exit(&ire->ire_lock); 6466 } 6467 6468 /* ARGSUSED */ 6469 void 6470 ire_thread_exit(ire_t *ire, 
#ifdef IRE_DEBUG
th_trace_t *
th_trace_ire_lookup(ire_t *ire)
{
	int bucket_id;
	th_trace_t *th_trace;

	ASSERT(MUTEX_HELD(&ire->ire_lock));

	bucket_id = IP_TR_HASH(curthread);
	ASSERT(bucket_id < IP_TR_HASH_MAX);

	for (th_trace = ire->ire_trace[bucket_id]; th_trace != NULL;
	    th_trace = th_trace->th_next) {
		if (th_trace->th_id == curthread)
			return (th_trace);
	}
	return (NULL);
}

void
ire_trace_ref(ire_t *ire)
{
	int bucket_id;
	th_trace_t *th_trace;

	/*
	 * Attempt to locate the trace buffer for curthread.
	 * If it does not exist, then allocate a new trace buffer
	 * and link it in the list of trace buffers for this ire,
	 * at the head.
	 */
	mutex_enter(&ire->ire_lock);
	if (ire->ire_trace_disable == B_TRUE) {
		mutex_exit(&ire->ire_lock);
		return;
	}
	th_trace = th_trace_ire_lookup(ire);
	if (th_trace == NULL) {
		bucket_id = IP_TR_HASH(curthread);
		th_trace = (th_trace_t *)kmem_zalloc(sizeof (th_trace_t),
		    KM_NOSLEEP);
		if (th_trace == NULL) {
			ire->ire_trace_disable = B_TRUE;
			mutex_exit(&ire->ire_lock);
			ire_trace_inactive(ire);
			return;
		}

		th_trace->th_id = curthread;
		th_trace->th_next = ire->ire_trace[bucket_id];
		th_trace->th_prev = &ire->ire_trace[bucket_id];
		if (th_trace->th_next != NULL)
			th_trace->th_next->th_prev = &th_trace->th_next;
		ire->ire_trace[bucket_id] = th_trace;
	}
	ASSERT(th_trace->th_refcnt < TR_BUF_MAX - 1);
	th_trace->th_refcnt++;
	th_trace_rrecord(th_trace);
	mutex_exit(&ire->ire_lock);
}

void
ire_trace_free(th_trace_t *th_trace)
{
	/* unlink th_trace and free it */
	*th_trace->th_prev = th_trace->th_next;
	if (th_trace->th_next != NULL)
		th_trace->th_next->th_prev = th_trace->th_prev;
	th_trace->th_next = NULL;
	th_trace->th_prev = NULL;
	kmem_free(th_trace, sizeof (th_trace_t));
}

void
ire_untrace_ref(ire_t *ire)
{
	th_trace_t *th_trace;

	mutex_enter(&ire->ire_lock);

	if (ire->ire_trace_disable == B_TRUE) {
		mutex_exit(&ire->ire_lock);
		return;
	}

	th_trace = th_trace_ire_lookup(ire);
	ASSERT(th_trace != NULL && th_trace->th_refcnt > 0);
	th_trace_rrecord(th_trace);
	th_trace->th_refcnt--;

	if (th_trace->th_refcnt == 0)
		ire_trace_free(th_trace);

	mutex_exit(&ire->ire_lock);
}

static void
ire_trace_inactive(ire_t *ire)
{
	th_trace_t *th_trace;
	int i;

	mutex_enter(&ire->ire_lock);
	for (i = 0; i < IP_TR_HASH_MAX; i++) {
		while (ire->ire_trace[i] != NULL) {
			th_trace = ire->ire_trace[i];

			/* unlink th_trace and free it */
			ire->ire_trace[i] = th_trace->th_next;
			if (th_trace->th_next != NULL)
				th_trace->th_next->th_prev =
				    &ire->ire_trace[i];

			th_trace->th_next = NULL;
			th_trace->th_prev = NULL;
			kmem_free(th_trace, sizeof (th_trace_t));
		}
	}

	mutex_exit(&ire->ire_lock);
}

/* ARGSUSED */
void
ire_thread_exit(ire_t *ire, caddr_t arg)
{
	th_trace_t *th_trace;

	mutex_enter(&ire->ire_lock);
	th_trace = th_trace_ire_lookup(ire);
	if (th_trace == NULL) {
		mutex_exit(&ire->ire_lock);
		return;
	}
	ASSERT(th_trace->th_refcnt == 0);

	ire_trace_free(th_trace);
	mutex_exit(&ire->ire_lock);
}

#endif
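/*
 * Summary of the tracing facility above (compiled in only when
 * IRE_DEBUG is defined): each thread that holds a reference on an ire
 * gets its own th_trace_t, hashed by IP_TR_HASH(curthread); every
 * refhold and refrele is recorded there via th_trace_rrecord(), so a
 * reference leak can be attributed to the thread that omitted the
 * matching release by inspecting the suspect ire's trace buckets.
 */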
6564 */ 6565 *ire = ire_null; 6566 ire->ire_u = in_ire->ire_u; 6567 ire->ire_ipif_seqid = in_ire->ire_ipif_seqid; 6568 ire->ire_ipif = in_ire->ire_ipif; 6569 ire->ire_stq = in_ire->ire_stq; 6570 ill = ire_to_ill(ire); 6571 ire->ire_stq_ifindex = ill->ill_phyint->phyint_ifindex; 6572 ire->ire_zoneid = in_ire->ire_zoneid; 6573 ire->ire_ipst = ipst; 6574 6575 /* 6576 * ire_freemblk will be called when ire_mp is freed, both for 6577 * successful and failed arp resolution. IRE_MARK_UNCACHED will be set 6578 * when the arp resolution failed. 6579 */ 6580 ire->ire_marks |= IRE_MARK_UNCACHED; 6581 ire->ire_mp = ire_mp; 6582 ire_mp->b_wptr = (uchar_t *)&ire[1]; 6583 ire_mp->b_cont = NULL; 6584 ASSERT(dlureq_mp != NULL); 6585 linkb(dlureq_mp, ire_mp); 6586 6587 /* 6588 * Fill in the source and dest addrs for the resolver. 6589 * NOTE: this depends on memory layouts imposed by 6590 * ill_init(). 6591 */ 6592 areq = (areq_t *)dlureq_mp->b_rptr; 6593 addrp = (ipaddr_t *)((char *)areq + areq->areq_sender_addr_offset); 6594 *addrp = ire->ire_src_addr; 6595 6596 addrp = (ipaddr_t *)((char *)areq + areq->areq_target_addr_offset); 6597 if (ire->ire_gateway_addr != INADDR_ANY) { 6598 *addrp = ire->ire_gateway_addr; 6599 } else { 6600 *addrp = ire->ire_addr; 6601 } 6602 6603 /* Up to the resolver. */ 6604 if (canputnext(dst_ill->ill_rq)) { 6605 putnext(dst_ill->ill_rq, dlureq_mp); 6606 } else { 6607 /* Prepare for cleanup */ 6608 freemsg(dlureq_mp); 6609 } 6610 } 6611 6612 /* 6613 * Esballoc free function for AR_ENTRY_QUERY request to clean up any 6614 * unresolved ire_t and/or nce_t structures when ARP resolution fails. 6615 * 6616 * This function can be called by ARP via free routine for ire_mp or 6617 * by IPv4(both host and forwarding path) via ire_delete 6618 * in case ARP resolution fails. 6619 * NOTE: Since IP is MT, ARP can call into IP but not vice versa 6620 * (for IP to talk to ARP, it still has to send AR* messages). 6621 * 6622 * Note that the ARP/IP merge should replace the functioanlity by providing 6623 * direct function calls to clean up unresolved entries in ire/nce lists. 6624 */ 6625 void 6626 ire_freemblk(ire_t *ire_mp) 6627 { 6628 nce_t *nce = NULL; 6629 ill_t *ill; 6630 ip_stack_t *ipst; 6631 6632 ASSERT(ire_mp != NULL); 6633 6634 if ((ire_mp->ire_addr == NULL) && (ire_mp->ire_gateway_addr == NULL)) { 6635 ip1dbg(("ire_freemblk(0x%p) ire_addr is NULL\n", 6636 (void *)ire_mp)); 6637 goto cleanup; 6638 } 6639 if ((ire_mp->ire_marks & IRE_MARK_UNCACHED) == 0) { 6640 goto cleanup; /* everything succeeded. just free and return */ 6641 } 6642 6643 /* 6644 * the arp information corresponding to this ire_mp was not 6645 * transferred to a ire_cache entry. Need 6646 * to clean up incomplete ire's and nce, if necessary. 6647 */ 6648 ASSERT(ire_mp->ire_stq != NULL); 6649 ASSERT(ire_mp->ire_stq_ifindex != 0); 6650 ASSERT(ire_mp->ire_ipst != NULL); 6651 6652 ipst = ire_mp->ire_ipst; 6653 6654 /* 6655 * Get any nce's corresponding to this ire_mp. We first have to 6656 * make sure that the ill is still around. 6657 */ 6658 ill = ill_lookup_on_ifindex(ire_mp->ire_stq_ifindex, 6659 B_FALSE, NULL, NULL, NULL, NULL, ipst); 6660 if (ill == NULL || (ire_mp->ire_stq != ill->ill_wq) || 6661 (ill->ill_state_flags & ILL_CONDEMNED)) { 6662 /* 6663 * ill went away. no nce to clean up. 6664 * Note that the ill_state_flags could be set to 6665 * ILL_CONDEMNED after this point, but if we know 6666 * that it is CONDEMNED now, we just bail out quickly. 
	/* Up to the resolver. */
	if (canputnext(dst_ill->ill_rq)) {
		putnext(dst_ill->ill_rq, dlureq_mp);
	} else {
		/* Prepare for cleanup */
		freemsg(dlureq_mp);
	}
}

/*
 * Esballoc free function for AR_ENTRY_QUERY request to clean up any
 * unresolved ire_t and/or nce_t structures when ARP resolution fails.
 *
 * This function can be called by ARP via the free routine for ire_mp or
 * by IPv4 (both host and forwarding paths) via ire_delete
 * in case ARP resolution fails.
 * NOTE: Since IP is MT, ARP can call into IP but not vice versa
 * (for IP to talk to ARP, it still has to send AR* messages).
 *
 * Note that the ARP/IP merge should replace this functionality by providing
 * direct function calls to clean up unresolved entries in ire/nce lists.
 */
void
ire_freemblk(ire_t *ire_mp)
{
	nce_t *nce = NULL;
	ill_t *ill;
	ip_stack_t *ipst;

	ASSERT(ire_mp != NULL);

	if ((ire_mp->ire_addr == INADDR_ANY) &&
	    (ire_mp->ire_gateway_addr == INADDR_ANY)) {
		ip1dbg(("ire_freemblk(0x%p) ire_addr is NULL\n",
		    (void *)ire_mp));
		goto cleanup;
	}
	if ((ire_mp->ire_marks & IRE_MARK_UNCACHED) == 0) {
		goto cleanup; /* everything succeeded; just free and return */
	}

	/*
	 * The ARP information corresponding to this ire_mp was not
	 * transferred to an ire_cache entry. We need to clean up
	 * incomplete ires and nces, if necessary.
	 */
	ASSERT(ire_mp->ire_stq != NULL);
	ASSERT(ire_mp->ire_stq_ifindex != 0);
	ASSERT(ire_mp->ire_ipst != NULL);

	ipst = ire_mp->ire_ipst;

	/*
	 * Get any nce's corresponding to this ire_mp. We first have to
	 * make sure that the ill is still around.
	 */
	ill = ill_lookup_on_ifindex(ire_mp->ire_stq_ifindex,
	    B_FALSE, NULL, NULL, NULL, NULL, ipst);
	if (ill == NULL || (ire_mp->ire_stq != ill->ill_wq) ||
	    (ill->ill_state_flags & ILL_CONDEMNED)) {
		/*
		 * The ill went away; there is no nce to clean up.
		 * Note that the ill_state_flags could be set to
		 * ILL_CONDEMNED after this point, but if we know
		 * that it is CONDEMNED now, we just bail out quickly.
		 */
		if (ill != NULL)
			ill_refrele(ill);
		goto cleanup;
	}
	nce = ndp_lookup_v4(ill,
	    ((ire_mp->ire_gateway_addr != INADDR_ANY) ?
	    &ire_mp->ire_gateway_addr : &ire_mp->ire_addr),
	    B_FALSE);
	ill_refrele(ill);

	if ((nce != NULL) && (nce->nce_state != ND_REACHABLE)) {
		/*
		 * An incomplete nce was found.
		 */
		DTRACE_PROBE2(ire__freemblk__arp__resolv__fail,
		    nce_t *, nce, ire_t *, ire_mp);
		/*
		 * Send the icmp_unreachable messages for the queued mblks in
		 * ire->ire_nce->nce_qd_mp, since ARP resolution failed
		 * for this ire.
		 */
		arp_resolv_failed(nce);
		/*
		 * Delete the nce and clean up all ires pointing at this nce
		 * in the cache table.
		 */
		ndp_delete(nce);
	}
	if (nce != NULL)
		NCE_REFRELE(nce); /* release the ref taken by ndp_lookup_v4 */

cleanup:
	/*
	 * Get rid of the ire buffer.
	 * We call kmem_free here (instead of ire_delete()), since
	 * this is freeb's callback.
	 */
	kmem_free(ire_mp, sizeof (ire_t) + sizeof (frtn_t));
}

/*
 * The mp passed to this function is typically res_mp.
 * Note that the res_mp field can contain the request (i.e. AR_ENTRY_QUERY)
 * or the response (i.e. DL_UNITDATA_REQ). So in the case of the
 * forwarding path, there is a small window of time between the two
 * when the forwarding path creates an unresolved nce and the
 * ip_newroute path finds this and uses this in the creation of an
 * ire cache. To account for this possible race we check
 * for DL_UNITDATA_REQ to make sure this is indeed the response.
 */
boolean_t
ire_nce_valid_dlureq_mp(mblk_t *mp)
{
	dl_unitdata_req_t *dlur;

	if (mp == NULL)
		return (B_FALSE);
	dlur = (dl_unitdata_req_t *)mp->b_rptr;
	if ((DB_TYPE(mp) == M_PROTO) &&
	    (dlur->dl_primitive == DL_UNITDATA_REQ)) {
		return (B_TRUE);
	} else {
		return (B_FALSE);
	}
}
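/*
 * Usage sketch (mirrors the check in ire_nce_init below): the initial
 * neighbor state for a resolver interface can be derived directly from
 * whether res_mp is already the DL_UNITDATA_REQ response:
 *
 *	nce_state = ire_nce_valid_dlureq_mp(res_mp) ?
 *	    ND_REACHABLE : ND_INITIAL;
 */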
/*
 * Create the neighbor cache entry nce_t for IRE_CACHE and
 * non-loopback IRE_BROADCAST ires. Note that IRE_BROADCAST
 * (non-loopback) entries have the nce_res_mp set to the
 * template passed in (generated from ill_bcast_mp); IRE_CACHE ires
 * contain the information for the nexthop (ire_gateway_addr) in the
 * case of indirect routes, and for the dst itself (ire_addr) in the
 * case of direct routes, with the nce_res_mp containing a template
 * DL_UNITDATA request.
 *
 * This function always consumes res_mp and fp_mp.
 *
 * The actual association of the ire_nce to the nce created here is
 * typically done in ire_add_v4 for IRE_CACHE entries. Exceptions
 * to this rule are SO_DONTROUTE ires (IRE_MARK_NO_ADD), for which
 * the ire_nce assignment is done in ire_add_then_send, and mobile-ip,
 * where the assignment is done in ire_add_mrtun().
 */
int
ire_nce_init(ire_t *ire, mblk_t *fp_mp, mblk_t *res_mp)
{
	in_addr_t addr4, mask4;
	int err;
	nce_t *arpce = NULL;
	ill_t *ire_ill;
	uint16_t nce_state, nce_flags;
	ip_stack_t *ipst;

	if (ire->ire_stq == NULL) {
		if (res_mp)
			freemsg(res_mp);
		if (fp_mp)
			freemsg(fp_mp);
		return (0); /* no need to create an nce for local/loopback */
	}

	mask4 = IP_HOST_MASK;
	switch (ire->ire_type) {
	case IRE_CACHE:
		if (ire->ire_gateway_addr != INADDR_ANY)
			addr4 = ire->ire_gateway_addr; /* 'G' route */
		else
			addr4 = ire->ire_addr; /* direct route */
		break;
	case IRE_BROADCAST:
		addr4 = ire->ire_addr;
		break;
	default:
		if (res_mp)
			freemsg(res_mp);
		if (fp_mp)
			freemsg(fp_mp);
		return (0);
	}

	/*
	 * ire_ipif is picked based on RTF_SETSRC, usesrc etc.
	 * rules in ire_forward_src_ipif. We want the dlureq_mp
	 * for the outgoing interface, which we get from the ire_stq.
	 */
	ire_ill = ire_to_ill(ire);
	ipst = ire_ill->ill_ipst;

	/*
	 * If we are creating an nce for the first time, and this is
	 * a NORESOLVER interface, atomically create the nce in the
	 * REACHABLE state; else create it in the ND_INITIAL state.
	 */
	if (ire_ill->ill_net_type == IRE_IF_NORESOLVER) {
		nce_state = ND_REACHABLE;
		nce_flags = NCE_F_PERMANENT;
	} else {
		/* Make sure we have the response and not the request. */
		if (ire_nce_valid_dlureq_mp(res_mp)) {
			nce_state = ND_REACHABLE;
		} else {
			nce_state = ND_INITIAL;
			if (fp_mp)
				freemsg(fp_mp);
			fp_mp = NULL;
		}
		nce_flags = 0;
	}

	err = ndp_lookup_then_add(ire_ill, NULL,
	    &addr4, &mask4, NULL, 0, nce_flags, nce_state, &arpce,
	    fp_mp, res_mp);

	ip1dbg(("ire 0x%p addr 0x%lx mask 0x%lx type 0x%x; "
	    "found nce 0x%p err %d\n", (void *)ire, (ulong_t)addr4,
	    (ulong_t)mask4, ire->ire_type, (void *)arpce, err));

	switch (err) {
	case 0:
		break;
	case EEXIST:
		/*
		 * Return a pointer to an existing nce_t;
		 * note that the ire-nce mapping is many-to-one, i.e.,
		 * multiple ires could point to the same nce_t.
		 */
		if (fp_mp != NULL) {
			freemsg(fp_mp);
		}
		if (res_mp != NULL) {
			freemsg(res_mp);
		}
		break;
	default:
		DTRACE_PROBE2(nce__init__fail, ill_t *, ire_ill, int, err);
		if (res_mp)
			freemsg(res_mp);
		if (fp_mp)
			freemsg(fp_mp);
		return (EINVAL);
	}
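	/*
	 * Illustration of the many-to-one mapping (hypothetical
	 * addresses): IRE_CACHE entries for 192.168.1.5 and 192.168.1.6
	 * reached through the same gateway 10.0.0.1 both resolve
	 * addr4 == 10.0.0.1 above, so the second ndp_lookup_then_add
	 * call returns EEXIST and both ires end up sharing one nce_t.
	 */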
#if DEBUG
	/*
	 * If an nce_fp_mp was passed in by ndp_lookup_then_add()
	 * we should be picking up an existing nce_t in
	 * the ND_REACHABLE state.
	 */
	mutex_enter(&arpce->nce_lock);
	ASSERT(arpce->nce_fp_mp == NULL || arpce->nce_state == ND_REACHABLE);
	mutex_exit(&arpce->nce_lock);
#endif
	if (ire->ire_type == IRE_BROADCAST) {
		/*
		 * Two bcast ires are created for each interface:
		 * 1. the loopback copy (which does not have an
		 * ire_stq, and therefore has no ire_nce), and
		 * 2. the non-loopback copy, which has the nce_res_mp
		 * initialized to a copy of the ill_bcast_mp, and
		 * is marked as ND_REACHABLE at this point.
		 * This nce does not undergo any further state changes,
		 * and exists as long as the interface is plumbed.
		 * Note: we do the ire_nce assignment here for IRE_BROADCAST
		 * because some functions like ill_mark_bcast() inline the
		 * ire_add functionality.
		 */
		mutex_enter(&arpce->nce_lock);
		arpce->nce_state = ND_REACHABLE;
		arpce->nce_flags |= (NCE_F_PERMANENT | NCE_F_BCAST);
		arpce->nce_last = TICK_TO_MSEC(lbolt64);
		ire->ire_nce = arpce;
		mutex_exit(&arpce->nce_lock);
		/*
		 * We are associating this nce with the ire,
		 * so change the nce ref taken in
		 * ndp_lookup_then_add_v4() from
		 * NCE_REFHOLD to NCE_REFHOLD_NOTR
		 */
		NCE_REFHOLD_TO_REFHOLD_NOTR(ire->ire_nce);
	} else {
		if (NCE_EXPIRED(arpce, ipst))
			arpce = nce_reinit(arpce);
		if (arpce != NULL) {
			/*
			 * We are not using this nce_t just yet so release
			 * the ref taken in ndp_lookup_then_add_v4()
			 */
			NCE_REFRELE(arpce);
		} else {
			ip0dbg(("can't reinit arpce for ill 0x%p;\n",
			    (void *)ire_ill));
		}
	}
	return (0);
}