1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 /* Copyright (c) 1990 Mentat Inc. */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 30 /* 31 * This file contains routines that manipulate Internet Routing Entries (IREs). 
32 */ 33 34 #include <sys/types.h> 35 #include <sys/stream.h> 36 #include <sys/stropts.h> 37 #include <sys/strsun.h> 38 #include <sys/ddi.h> 39 #include <sys/cmn_err.h> 40 #include <sys/policy.h> 41 42 #include <sys/systm.h> 43 #include <sys/kmem.h> 44 #include <sys/param.h> 45 #include <sys/socket.h> 46 #include <net/if.h> 47 #include <net/route.h> 48 #include <netinet/in.h> 49 #include <net/if_dl.h> 50 #include <netinet/ip6.h> 51 #include <netinet/icmp6.h> 52 53 #include <inet/common.h> 54 #include <inet/mi.h> 55 #include <inet/ip.h> 56 #include <inet/ip6.h> 57 #include <inet/ip_ndp.h> 58 #include <inet/arp.h> 59 #include <inet/ip_if.h> 60 #include <inet/ip_ire.h> 61 #include <inet/ip_ftable.h> 62 #include <inet/ip_rts.h> 63 #include <inet/nd.h> 64 65 #include <net/pfkeyv2.h> 66 #include <inet/ipsec_info.h> 67 #include <inet/sadb.h> 68 #include <sys/kmem.h> 69 #include <inet/tcp.h> 70 #include <inet/ipclassifier.h> 71 #include <sys/zone.h> 72 #include <sys/tsol/label.h> 73 #include <sys/tsol/tnet.h> 74 75 struct kmem_cache *rt_entry_cache; 76 77 78 /* 79 * Synchronization notes: 80 * 81 * The fields of the ire_t struct are protected in the following way : 82 * 83 * ire_next/ire_ptpn 84 * 85 * - bucket lock of the respective tables (cache or forwarding tables). 86 * 87 * ire_mp, ire_rfq, ire_stq, ire_u *except* ire_gateway_addr[v6], ire_mask, 88 * ire_type, ire_create_time, ire_masklen, ire_ipversion, ire_flags, ire_ipif, 89 * ire_ihandle, ire_phandle, ire_nce, ire_bucket, ire_in_ill, ire_in_src_addr 90 * 91 * - Set in ire_create_v4/v6 and never changes after that. Thus, 92 * we don't need a lock whenever these fields are accessed. 93 * 94 * - ire_bucket and ire_masklen (also set in ire_create) is set in 95 * ire_add_v4/ire_add_v6 before inserting in the bucket and never 96 * changes after that. Thus we don't need a lock whenever these 97 * fields are accessed. 
98 * 99 * ire_gateway_addr_v4[v6] 100 * 101 * - ire_gateway_addr_v4[v6] is set during ire_create and later modified 102 * by rts_setgwr[v6]. As ire_gateway_addr is a uint32_t, updates to 103 * it are assumed to be atomic and hence the other parts of the code 104 * do not use any locks. ire_gateway_addr_v6 updates are not atomic 105 * and hence any access to it uses ire_lock to get/set the right value. 106 * 107 * ire_ident, ire_refcnt 108 * 109 * - Updated atomically using atomic_add_32 110 * 111 * ire_ssthresh, ire_rtt_sd, ire_rtt, ire_ib_pkt_count, ire_ob_pkt_count 112 * 113 * - Assumes that 32 bit writes are atomic. No locks. ire_lock is 114 * used to serialize updates to ire_ssthresh, ire_rtt_sd, ire_rtt. 115 * 116 * ire_max_frag, ire_frag_flag 117 * 118 * - ire_lock is used to set/read both of them together. 119 * 120 * ire_tire_mark 121 * 122 * - Set in ire_create and updated in ire_expire, which is called 123 * by only one function namely ip_trash_timer_expire. Thus only 124 * one function updates and examines the value. 125 * 126 * ire_marks 127 * - bucket lock protects this. 128 * 129 * ire_ipsec_overhead/ire_ll_hdr_length 130 * 131 * - Place holder for returning the information to the upper layers 132 * when IRE_DB_REQ comes down. 133 * 134 * 135 * ipv6_ire_default_count is protected by the bucket lock of 136 * ip_forwarding_table_v6[0][0]. 137 * 138 * ipv6_ire_default_index is not protected as it is just a hint 139 * at which default gateway to use. There is nothing 140 * wrong in using the same gateway for two different connections. 141 * 142 * As we always hold the bucket locks in all the places while accessing 143 * the above values, it is natural to use them for protecting them. 144 * 145 * We have a separate cache table and forwarding table for IPv4 and IPv6. 
146 * Cache table (ip_cache_table/ip_cache_table_v6) is a pointer to an 147 * array of irb_t structures and forwarding table (ip_forwarding_table/ 148 * ip_forwarding_table_v6) is an array of pointers to arrays of irb_t 149 * structures. ip_forwarding_table[_v6] is allocated dynamically in 150 * ire_add_v4/v6. ire_ft_init_lock is used to serialize multiple threads 151 * initializing the same bucket. Once a bucket is initialized, it is never 152 * de-allocated. This assumption enables us to access ip_forwarding_table[i] 153 * or ip_forwarding_table_v6[i] without any locks. 154 * 155 * Each irb_t - ire bucket structure has a lock to protect 156 * a bucket and the ires residing in the bucket have a back pointer to 157 * the bucket structure. It also has a reference count for the number 158 * of threads walking the bucket - irb_refcnt which is bumped up 159 * using the IRB_REFHOLD macro. The flags irb_flags can be 160 * set to IRE_MARK_CONDEMNED indicating that there are some ires 161 * in this bucket that are marked with IRE_MARK_CONDEMNED and the 162 * last thread to leave the bucket should delete the ires. Usually 163 * this is done by the IRB_REFRELE macro which is used to decrement 164 * the reference count on a bucket. 165 * 166 * IRE_REFHOLD/IRE_REFRELE macros operate on the ire and increment/ 167 * decrement the reference count, ire_refcnt, atomically on the ire. 168 * ire_refcnt is modified only using these macros. Operations on the IRE 169 * could be described as follows : 170 * 171 * CREATE an ire with reference count initialized to 1. 172 * 173 * ADDITION of an ire holds the bucket lock, checks for duplicates 174 * and then adds the ire. ire_add_v4/ire_add_v6 returns the ire after 175 * bumping up once more i.e. the reference count is 2. This is to avoid 176 * an extra lookup in the functions calling ire_add which want to 177 * work with the ire after adding. 178 * 179 * LOOKUP of an ire bumps up the reference count using IRE_REFHOLD 180 * macro. 
It is valid to bump up the reference count of the IRE, 181 * after the lookup has returned an ire. Following are the lookup 182 * functions that return a HELD ire : 183 * 184 * ire_lookup_local[_v6], ire_ctable_lookup[_v6], ire_ftable_lookup[_v6], 185 * ire_cache_lookup[_v6], ire_lookup_multi[_v6], ire_route_lookup[_v6], 186 * ipif_to_ire[_v6], ire_mrtun_lookup, ire_srcif_table_lookup. 187 * 188 * DELETION of an ire holds the bucket lock, removes it from the list 189 * and then decrements the reference count for having removed it from the list 190 * by using the IRE_REFRELE macro. If some other thread has looked up 191 * the ire, the reference count would have been bumped up and hence 192 * this ire will not be freed once deleted. It will be freed once the 193 * reference count drops to zero. 194 * 195 * Add and Delete acquire the bucket lock as RW_WRITER, while all the 196 * lookups acquire the bucket lock as RW_READER. 197 * 198 * NOTE : The only functions that do the IRE_REFRELE when an ire is 199 * passed as an argument are : 200 * 201 * 1) ip_wput_ire : This is because it IRE_REFHOLD/RELEs the 202 * broadcast ires it looks up internally within 203 * the function. Currently, for simplicity it does 204 * not differentiate the one that is passed in and 205 * the ones it looks up internally. It always 206 * IRE_REFRELEs. 207 * 2) ire_send 208 * ire_send_v6 : As ire_send calls ip_wput_ire and other functions 209 * that take ire as an argument, it has to selectively 210 * IRE_REFRELE the ire. To maintain symmetry, 211 * ire_send_v6 does the same. 212 * 213 * Otherwise, the general rule is to do the IRE_REFRELE in the function 214 * that is passing the ire as an argument. 215 * 216 * In trying to locate ires the following points are to be noted. 217 * 218 * IRE_MARK_CONDEMNED signifies that the ire has been logically deleted and is 219 * to be ignored when walking the ires using ire_next. 
220 * 221 * IRE_MARK_HIDDEN signifies that the ire is a special ire typically for the 222 * benefit of in.mpathd which needs to probe interfaces for failures. Normal 223 * applications should not be seeing this ire and hence this ire is ignored 224 * in most cases in the search using ire_next. 225 * 226 * Zones note: 227 * Walking IREs within a given zone also walks certain ires in other 228 * zones. This is done intentionally. IRE walks with a specified 229 * zoneid are used only when doing informational reports, and 230 * zone users want to see things that they can access. See block 231 * comment in ire_walk_ill_match(). 232 */ 233 234 /* This is dynamically allocated in ip_ire_init */ 235 irb_t *ip_cache_table; 236 /* This is dynamically allocated in ire_add_mrtun */ 237 irb_t *ip_mrtun_table; 238 239 uint32_t ire_handle = 1; 240 /* 241 * ire_ft_init_lock is used while initializing ip_forwarding_table 242 * dynamically in ire_add. 243 */ 244 kmutex_t ire_ft_init_lock; 245 kmutex_t ire_mrtun_lock; /* Protects creation of table and it's count */ 246 kmutex_t ire_srcif_table_lock; /* Same as above */ 247 /* 248 * The following counts are used to determine whether a walk is 249 * needed through the reverse tunnel table or through ills 250 */ 251 kmutex_t ire_handle_lock; /* Protects ire_handle */ 252 uint_t ire_mrtun_count; /* Number of ires in reverse tun table */ 253 254 /* 255 * A per-interface routing table is created ( if not present) 256 * when the first entry is added to this special routing table. 257 * This special routing table is accessed through the ill data structure. 258 * The routing table looks like cache table. For example, currently it 259 * is used by mobile-ip foreign agent to forward data that only comes from 260 * the home agent tunnel for a mobile node. Thus if the outgoing interface 261 * is a RESOLVER interface, IP may need to resolve the hardware address for 262 * the outgoing interface. 
The routing entries in this table are not updated 263 * in IRE_CACHE. When MCTL msg comes back from ARP, the incoming ill informa- 264 * tion is lost as the write queue is passed to ip_wput. 265 * But, before sending the packet out, the hardware information must be updated 266 * in the special forwarding table. ire_srcif_table_count keeps track of total 267 * number of ires that are in interface based tables. Each interface based 268 * table hangs off of the incoming ill and each ill_t also keeps a refcnt 269 * of ires in that table. 270 */ 271 272 uint_t ire_srcif_table_count; /* Number of ires in all srcif tables */ 273 274 /* 275 * The minimum size of IRE cache table. It will be recalcuated in 276 * ip_ire_init(). 277 */ 278 uint32_t ip_cache_table_size = IP_CACHE_TABLE_SIZE; 279 uint32_t ip6_cache_table_size = IP6_CACHE_TABLE_SIZE; 280 281 /* 282 * The size of the forwarding table. We will make sure that it is a 283 * power of 2 in ip_ire_init(). 284 */ 285 uint32_t ip6_ftable_hash_size = IP6_FTABLE_HASH_SIZE; 286 287 struct kmem_cache *ire_cache; 288 static ire_t ire_null; 289 290 ire_stats_t ire_stats_v4; /* IPv4 ire statistics */ 291 ire_stats_t ire_stats_v6; /* IPv6 ire statistics */ 292 293 /* 294 * The threshold number of IRE in a bucket when the IREs are 295 * cleaned up. This threshold is calculated later in ip_open() 296 * based on the speed of CPU and available memory. This default 297 * value is the maximum. 298 * 299 * We have two kinds of cached IRE, temporary and 300 * non-temporary. Temporary IREs are marked with 301 * IRE_MARK_TEMPORARY. They are IREs created for non 302 * TCP traffic and for forwarding purposes. All others 303 * are non-temporary IREs. We don't mark IRE created for 304 * TCP as temporary because TCP is stateful and there are 305 * info stored in the IRE which can be shared by other TCP 306 * connections to the same destination. 
For connected 307 * endpoint, we also don't want to mark the IRE used as 308 * temporary because the same IRE will be used frequently, 309 * otherwise, the app should not do a connect(). We change 310 * the marking at ip_bind_connected_*() if necessary. 311 * 312 * We want to keep the cache IRE hash bucket length reasonably 313 * short, otherwise IRE lookup functions will take "forever." 314 * We use the "crude" function that the IRE bucket 315 * length should be based on the CPU speed, which is 1 entry 316 * per x MHz, depending on the shift factor ip_ire_cpu_ratio 317 * (n). This means that with a 750MHz CPU, the max bucket 318 * length can be (750 >> n) entries. 319 * 320 * Note that this threshold is separate for temp and non-temp 321 * IREs. This means that the actual bucket length can be 322 * twice as that. And while we try to keep temporary IRE 323 * length at most at the threshold value, we do not attempt to 324 * make the length for non-temporary IREs fixed, for the 325 * reason stated above. Instead, we start trying to find 326 * "unused" non-temporary IREs when the bucket length reaches 327 * this threshold and clean them up. 328 * 329 * We also want to limit the amount of memory used by 330 * IREs. So if we are allowed to use ~3% of memory (M) 331 * for those IREs, each bucket should not have more than 332 * 333 * M / num of cache bucket / sizeof (ire_t) 334 * 335 * Again the above memory uses are separate for temp and 336 * non-temp cached IREs. 337 * 338 * We may also want the limit to be a function of the number 339 * of interfaces and number of CPUs. Doing the initialization 340 * in ip_open() means that every time an interface is plumbed, 341 * the max is re-calculated. Right now, we don't do anything 342 * different. In future, when we have more experience, we 343 * may want to change this behavior. 
344 */ 345 uint32_t ip_ire_max_bucket_cnt = 10; 346 uint32_t ip6_ire_max_bucket_cnt = 10; 347 348 /* 349 * The minimum of the temporary IRE bucket count. We do not want 350 * the length of each bucket to be too short. This may hurt 351 * performance of some apps as the temporary IREs are removed too 352 * often. 353 */ 354 uint32_t ip_ire_min_bucket_cnt = 3; 355 uint32_t ip6_ire_min_bucket_cnt = 3; 356 357 /* 358 * The ratio of memory consumed by IRE used for temporary to available 359 * memory. This is a shift factor, so 6 means the ratio 1 to 64. This 360 * value can be changed in /etc/system. 6 is a reasonable number. 361 */ 362 uint32_t ip_ire_mem_ratio = 6; 363 /* The shift factor for CPU speed to calculate the max IRE bucket length. */ 364 uint32_t ip_ire_cpu_ratio = 7; 365 366 typedef struct nce_clookup_s { 367 ipaddr_t ncecl_addr; 368 boolean_t ncecl_found; 369 } nce_clookup_t; 370 371 /* 372 * The maximum number of buckets in IRE cache table. In future, we may 373 * want to make it a dynamic hash table. For the moment, we fix the 374 * size and allocate the table in ip_ire_init() when IP is first loaded. 375 * We take into account the amount of memory a system has. 376 */ 377 #define IP_MAX_CACHE_TABLE_SIZE 4096 378 379 static uint32_t ip_max_cache_table_size = IP_MAX_CACHE_TABLE_SIZE; 380 static uint32_t ip6_max_cache_table_size = IP_MAX_CACHE_TABLE_SIZE; 381 382 #define NUM_ILLS 3 /* To build the ILL list to unlock */ 383 384 /* Zero iulp_t for initialization. 
*/ 385 const iulp_t ire_uinfo_null = { 0 }; 386 387 static int ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, 388 ipsq_func_t func, boolean_t); 389 static int ire_add_srcif_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, 390 ipsq_func_t func); 391 static ire_t *ire_update_srcif_v4(ire_t *ire); 392 static void ire_delete_v4(ire_t *ire); 393 static void ire_report_ctable(ire_t *ire, char *mp); 394 static void ire_report_mrtun_table(ire_t *ire, char *mp); 395 static void ire_report_srcif_table(ire_t *ire, char *mp); 396 static void ire_walk_ipvers(pfv_t func, void *arg, uchar_t vers, 397 zoneid_t zoneid); 398 static void ire_walk_ill_ipvers(uint_t match_flags, uint_t ire_type, 399 pfv_t func, void *arg, uchar_t vers, ill_t *ill); 400 static void ire_cache_cleanup(irb_t *irb, uint32_t threshold, int cnt); 401 extern void ill_unlock_ills(ill_t **list, int cnt); 402 static void ire_fastpath_list_add(ill_t *ill, ire_t *ire); 403 static void ip_nce_clookup_and_delete(nce_t *nce, void *arg); 404 extern void th_trace_rrecord(th_trace_t *); 405 #ifdef IRE_DEBUG 406 static void ire_trace_inactive(ire_t *); 407 #endif 408 409 /* 410 * To avoid bloating the code, we call this function instead of 411 * using the macro IRE_REFRELE. Use macro only in performance 412 * critical paths. 413 * 414 * Must not be called while holding any locks. Otherwise if this is 415 * the last reference to be released there is a chance of recursive mutex 416 * panic due to ire_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying 417 * to restart an ioctl. The one exception is when the caller is sure that 418 * this is not the last reference to be released. Eg. if the caller is 419 * sure that the ire has not been deleted and won't be deleted. 420 */ 421 void 422 ire_refrele(ire_t *ire) 423 { 424 IRE_REFRELE(ire); 425 } 426 427 void 428 ire_refrele_notr(ire_t *ire) 429 { 430 IRE_REFRELE_NOTR(ire); 431 } 432 433 /* 434 * kmem_cache_alloc constructor for IRE in kma space. 
435 * Note that when ire_mp is set the IRE is stored in that mblk and 436 * not in this cache. 437 */ 438 /* ARGSUSED */ 439 static int 440 ip_ire_constructor(void *buf, void *cdrarg, int kmflags) 441 { 442 ire_t *ire = buf; 443 444 ire->ire_nce = NULL; 445 446 return (0); 447 } 448 449 /* ARGSUSED1 */ 450 static void 451 ip_ire_destructor(void *buf, void *cdrarg) 452 { 453 ire_t *ire = buf; 454 455 ASSERT(ire->ire_nce == NULL); 456 } 457 458 /* 459 * This function is associated with the IP_IOC_IRE_ADVISE_NO_REPLY 460 * IOCTL. It is used by TCP (or other ULPs) to supply revised information 461 * for an existing CACHED IRE. 462 */ 463 /* ARGSUSED */ 464 int 465 ip_ire_advise(queue_t *q, mblk_t *mp, cred_t *ioc_cr) 466 { 467 uchar_t *addr_ucp; 468 ipic_t *ipic; 469 ire_t *ire; 470 ipaddr_t addr; 471 in6_addr_t v6addr; 472 irb_t *irb; 473 zoneid_t zoneid; 474 475 ASSERT(q->q_next == NULL); 476 zoneid = Q_TO_CONN(q)->conn_zoneid; 477 478 /* 479 * Check privilege using the ioctl credential; if it is NULL 480 * then this is a kernel message and therefor privileged. 481 */ 482 if (ioc_cr != NULL && secpolicy_net_config(ioc_cr, B_FALSE) != 0) 483 return (EPERM); 484 485 ipic = (ipic_t *)mp->b_rptr; 486 if (!(addr_ucp = mi_offset_param(mp, ipic->ipic_addr_offset, 487 ipic->ipic_addr_length))) { 488 return (EINVAL); 489 } 490 if (!OK_32PTR(addr_ucp)) 491 return (EINVAL); 492 switch (ipic->ipic_addr_length) { 493 case IP_ADDR_LEN: { 494 /* Extract the destination address. */ 495 addr = *(ipaddr_t *)addr_ucp; 496 /* Find the corresponding IRE. */ 497 ire = ire_cache_lookup(addr, zoneid, NULL); 498 break; 499 } 500 case IPV6_ADDR_LEN: { 501 /* Extract the destination address. */ 502 v6addr = *(in6_addr_t *)addr_ucp; 503 /* Find the corresponding IRE. 
*/ 504 ire = ire_cache_lookup_v6(&v6addr, zoneid, NULL); 505 break; 506 } 507 default: 508 return (EINVAL); 509 } 510 511 if (ire == NULL) 512 return (ENOENT); 513 /* 514 * Update the round trip time estimate and/or the max frag size 515 * and/or the slow start threshold. 516 * 517 * We serialize multiple advises using ire_lock. 518 */ 519 mutex_enter(&ire->ire_lock); 520 if (ipic->ipic_rtt) { 521 /* 522 * If there is no old cached values, initialize them 523 * conservatively. Set them to be (1.5 * new value). 524 */ 525 if (ire->ire_uinfo.iulp_rtt != 0) { 526 ire->ire_uinfo.iulp_rtt = (ire->ire_uinfo.iulp_rtt + 527 ipic->ipic_rtt) >> 1; 528 } else { 529 ire->ire_uinfo.iulp_rtt = ipic->ipic_rtt + 530 (ipic->ipic_rtt >> 1); 531 } 532 if (ire->ire_uinfo.iulp_rtt_sd != 0) { 533 ire->ire_uinfo.iulp_rtt_sd = 534 (ire->ire_uinfo.iulp_rtt_sd + 535 ipic->ipic_rtt_sd) >> 1; 536 } else { 537 ire->ire_uinfo.iulp_rtt_sd = ipic->ipic_rtt_sd + 538 (ipic->ipic_rtt_sd >> 1); 539 } 540 } 541 if (ipic->ipic_max_frag) 542 ire->ire_max_frag = MIN(ipic->ipic_max_frag, IP_MAXPACKET); 543 if (ipic->ipic_ssthresh != 0) { 544 if (ire->ire_uinfo.iulp_ssthresh != 0) 545 ire->ire_uinfo.iulp_ssthresh = 546 (ipic->ipic_ssthresh + 547 ire->ire_uinfo.iulp_ssthresh) >> 1; 548 else 549 ire->ire_uinfo.iulp_ssthresh = ipic->ipic_ssthresh; 550 } 551 /* 552 * Don't need the ire_lock below this. ire_type does not change 553 * after initialization. ire_marks is protected by irb_lock. 554 */ 555 mutex_exit(&ire->ire_lock); 556 557 if (ipic->ipic_ire_marks != 0 && ire->ire_type == IRE_CACHE) { 558 /* 559 * Only increment the temporary IRE count if the original 560 * IRE is not already marked temporary. 
561 */ 562 irb = ire->ire_bucket; 563 rw_enter(&irb->irb_lock, RW_WRITER); 564 if ((ipic->ipic_ire_marks & IRE_MARK_TEMPORARY) && 565 !(ire->ire_marks & IRE_MARK_TEMPORARY)) { 566 irb->irb_tmp_ire_cnt++; 567 } 568 ire->ire_marks |= ipic->ipic_ire_marks; 569 rw_exit(&irb->irb_lock); 570 } 571 572 ire_refrele(ire); 573 return (0); 574 } 575 576 /* 577 * This function is associated with the IP_IOC_IRE_DELETE[_NO_REPLY] 578 * IOCTL[s]. The NO_REPLY form is used by TCP to delete a route IRE 579 * for a host that is not responding. This will force an attempt to 580 * establish a new route, if available, and flush out the ARP entry so 581 * it will re-resolve. Management processes may want to use the 582 * version that generates a reply. 583 * 584 * This function does not support IPv6 since Neighbor Unreachability Detection 585 * means that negative advise like this is useless. 586 */ 587 /* ARGSUSED */ 588 int 589 ip_ire_delete(queue_t *q, mblk_t *mp, cred_t *ioc_cr) 590 { 591 uchar_t *addr_ucp; 592 ipaddr_t addr; 593 ire_t *ire; 594 ipid_t *ipid; 595 boolean_t routing_sock_info = B_FALSE; /* Sent info? */ 596 zoneid_t zoneid; 597 ire_t *gire = NULL; 598 ill_t *ill; 599 mblk_t *arp_mp; 600 601 ASSERT(q->q_next == NULL); 602 zoneid = Q_TO_CONN(q)->conn_zoneid; 603 604 /* 605 * Check privilege using the ioctl credential; if it is NULL 606 * then this is a kernel message and therefor privileged. 607 */ 608 if (ioc_cr != NULL && secpolicy_net_config(ioc_cr, B_FALSE) != 0) 609 return (EPERM); 610 611 ipid = (ipid_t *)mp->b_rptr; 612 613 /* Only actions on IRE_CACHEs are acceptable at present. 
*/ 614 if (ipid->ipid_ire_type != IRE_CACHE) 615 return (EINVAL); 616 617 addr_ucp = mi_offset_param(mp, ipid->ipid_addr_offset, 618 ipid->ipid_addr_length); 619 if (addr_ucp == NULL || !OK_32PTR(addr_ucp)) 620 return (EINVAL); 621 switch (ipid->ipid_addr_length) { 622 case IP_ADDR_LEN: 623 /* addr_ucp points at IP addr */ 624 break; 625 case sizeof (sin_t): { 626 sin_t *sin; 627 /* 628 * got complete (sockaddr) address - increment addr_ucp to point 629 * at the ip_addr field. 630 */ 631 sin = (sin_t *)addr_ucp; 632 addr_ucp = (uchar_t *)&sin->sin_addr.s_addr; 633 break; 634 } 635 default: 636 return (EINVAL); 637 } 638 /* Extract the destination address. */ 639 bcopy(addr_ucp, &addr, IP_ADDR_LEN); 640 641 /* Try to find the CACHED IRE. */ 642 ire = ire_cache_lookup(addr, zoneid, NULL); 643 644 /* Nail it. */ 645 if (ire) { 646 /* Allow delete only on CACHE entries */ 647 if (ire->ire_type != IRE_CACHE) { 648 ire_refrele(ire); 649 return (EINVAL); 650 } 651 652 /* 653 * Verify that the IRE has been around for a while. 654 * This is to protect against transport protocols 655 * that are too eager in sending delete messages. 656 */ 657 if (gethrestime_sec() < 658 ire->ire_create_time + ip_ignore_delete_time) { 659 ire_refrele(ire); 660 return (EINVAL); 661 } 662 /* 663 * Now we have a potentially dead cache entry. We need 664 * to remove it. 665 * If this cache entry is generated from a 666 * default route (i.e., ire_cmask == 0), 667 * search the default list and mark it dead and some 668 * background process will try to activate it. 669 */ 670 if ((ire->ire_gateway_addr != 0) && (ire->ire_cmask == 0)) { 671 /* 672 * Make sure that we pick a different 673 * IRE_DEFAULT next time. 
674 */ 675 ire_t *gw_ire; 676 irb_t *irb = NULL; 677 uint_t match_flags; 678 679 match_flags = (MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE); 680 681 gire = ire_ftable_lookup(ire->ire_addr, 682 ire->ire_cmask, 0, 0, 683 ire->ire_ipif, NULL, zoneid, 0, NULL, match_flags); 684 685 ip3dbg(("ire_ftable_lookup() returned gire %p\n", 686 (void *)gire)); 687 688 if (gire != NULL) { 689 irb = gire->ire_bucket; 690 691 /* 692 * We grab it as writer just to serialize 693 * multiple threads trying to bump up 694 * irb_rr_origin 695 */ 696 rw_enter(&irb->irb_lock, RW_WRITER); 697 if ((gw_ire = irb->irb_rr_origin) == NULL) { 698 rw_exit(&irb->irb_lock); 699 goto done; 700 } 701 702 703 /* Skip past the potentially bad gateway */ 704 if (ire->ire_gateway_addr == 705 gw_ire->ire_gateway_addr) 706 irb->irb_rr_origin = gw_ire->ire_next; 707 708 rw_exit(&irb->irb_lock); 709 } 710 } 711 done: 712 if (gire != NULL) 713 IRE_REFRELE(gire); 714 /* report the bad route to routing sockets */ 715 ip_rts_change(RTM_LOSING, ire->ire_addr, ire->ire_gateway_addr, 716 ire->ire_mask, ire->ire_src_addr, 0, 0, 0, 717 (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_IFA)); 718 routing_sock_info = B_TRUE; 719 720 /* 721 * TCP is really telling us to start over completely, and it 722 * expects that we'll resend the ARP query. Tell ARP to 723 * discard the entry, if this is a local destination. 724 */ 725 ill = ire->ire_stq->q_ptr; 726 if (ire->ire_gateway_addr == 0 && 727 (arp_mp = ill_ared_alloc(ill, addr)) != NULL) { 728 putnext(ill->ill_rq, arp_mp); 729 } 730 731 ire_delete(ire); 732 ire_refrele(ire); 733 } 734 /* Also look for an IRE_HOST_REDIRECT and remove it if present */ 735 ire = ire_route_lookup(addr, 0, 0, IRE_HOST_REDIRECT, NULL, NULL, 736 ALL_ZONES, NULL, MATCH_IRE_TYPE); 737 738 /* Nail it. 
*/ 739 if (ire) { 740 if (!routing_sock_info) { 741 ip_rts_change(RTM_LOSING, ire->ire_addr, 742 ire->ire_gateway_addr, ire->ire_mask, 743 ire->ire_src_addr, 0, 0, 0, 744 (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_IFA)); 745 } 746 ire_delete(ire); 747 ire_refrele(ire); 748 } 749 return (0); 750 } 751 752 /* 753 * Named Dispatch routine to produce a formatted report on all IREs. 754 * This report is accessed by using the ndd utility to "get" ND variable 755 * "ipv4_ire_status". 756 */ 757 /* ARGSUSED */ 758 int 759 ip_ire_report(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr) 760 { 761 zoneid_t zoneid; 762 763 (void) mi_mpprintf(mp, 764 "IRE " MI_COL_HDRPAD_STR 765 /* 01234567[89ABCDEF] */ 766 "rfq " MI_COL_HDRPAD_STR 767 /* 01234567[89ABCDEF] */ 768 "stq " MI_COL_HDRPAD_STR 769 /* 01234567[89ABCDEF] */ 770 " zone " 771 /* 12345 */ 772 "addr mask " 773 /* 123.123.123.123 123.123.123.123 */ 774 "src gateway mxfrg rtt rtt_sd ssthresh ref " 775 /* 123.123.123.123 123.123.123.123 12345 12345 123456 12345678 123 */ 776 "rtomax tstamp_ok wscale_ok ecn_ok pmtud_ok sack sendpipe " 777 /* 123456 123456789 123456789 123456 12345678 1234 12345678 */ 778 "recvpipe in/out/forward type"); 779 /* 12345678 in/out/forward xxxxxxxxxx */ 780 781 /* 782 * Because of the ndd constraint, at most we can have 64K buffer 783 * to put in all IRE info. So to be more efficient, just 784 * allocate a 64K buffer here, assuming we need that large buffer. 785 * This should be OK as only root can do ndd /dev/ip. 786 */ 787 if ((mp->b_cont = allocb(ND_MAX_BUF_LEN, BPRI_HI)) == NULL) { 788 /* The following may work even if we cannot get a large buf. 
*/ 789 (void) mi_mpprintf(mp, "<< Out of buffer >>\n"); 790 return (0); 791 } 792 793 zoneid = Q_TO_CONN(q)->conn_zoneid; 794 if (zoneid == GLOBAL_ZONEID) 795 zoneid = ALL_ZONES; 796 797 ire_walk_v4(ire_report_ftable, mp->b_cont, zoneid); 798 ire_walk_v4(ire_report_ctable, mp->b_cont, zoneid); 799 800 return (0); 801 } 802 803 804 /* ire_walk routine invoked for ip_ire_report for each cached IRE. */ 805 static void 806 ire_report_ctable(ire_t *ire, char *mp) 807 { 808 char buf1[16]; 809 char buf2[16]; 810 char buf3[16]; 811 char buf4[16]; 812 uint_t fo_pkt_count; 813 uint_t ib_pkt_count; 814 int ref; 815 uint_t print_len, buf_len; 816 817 if ((ire->ire_type & IRE_CACHETABLE) == 0) 818 return; 819 buf_len = ((mblk_t *)mp)->b_datap->db_lim - ((mblk_t *)mp)->b_wptr; 820 if (buf_len <= 0) 821 return; 822 823 /* Number of active references of this ire */ 824 ref = ire->ire_refcnt; 825 /* "inbound" to a non local address is a forward */ 826 ib_pkt_count = ire->ire_ib_pkt_count; 827 fo_pkt_count = 0; 828 if (!(ire->ire_type & (IRE_LOCAL|IRE_BROADCAST))) { 829 fo_pkt_count = ib_pkt_count; 830 ib_pkt_count = 0; 831 } 832 print_len = snprintf((char *)((mblk_t *)mp)->b_wptr, buf_len, 833 MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR "%5d " 834 "%s %s %s %s %05d %05ld %06ld %08d %03d %06d %09d %09d %06d %08d " 835 "%04d %08d %08d %d/%d/%d %s\n", 836 (void *)ire, (void *)ire->ire_rfq, (void *)ire->ire_stq, 837 (int)ire->ire_zoneid, 838 ip_dot_addr(ire->ire_addr, buf1), ip_dot_addr(ire->ire_mask, buf2), 839 ip_dot_addr(ire->ire_src_addr, buf3), 840 ip_dot_addr(ire->ire_gateway_addr, buf4), 841 ire->ire_max_frag, ire->ire_uinfo.iulp_rtt, 842 ire->ire_uinfo.iulp_rtt_sd, ire->ire_uinfo.iulp_ssthresh, ref, 843 ire->ire_uinfo.iulp_rtomax, 844 (ire->ire_uinfo.iulp_tstamp_ok ? 1: 0), 845 (ire->ire_uinfo.iulp_wscale_ok ? 1: 0), 846 (ire->ire_uinfo.iulp_ecn_ok ? 1: 0), 847 (ire->ire_uinfo.iulp_pmtud_ok ? 
1: 0), 848 ire->ire_uinfo.iulp_sack, 849 ire->ire_uinfo.iulp_spipe, ire->ire_uinfo.iulp_rpipe, 850 ib_pkt_count, ire->ire_ob_pkt_count, fo_pkt_count, 851 ip_nv_lookup(ire_nv_tbl, (int)ire->ire_type)); 852 if (print_len < buf_len) { 853 ((mblk_t *)mp)->b_wptr += print_len; 854 } else { 855 ((mblk_t *)mp)->b_wptr += buf_len; 856 } 857 } 858 859 /* ARGSUSED */ 860 int 861 ip_ire_report_mrtun(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr) 862 { 863 (void) mi_mpprintf(mp, 864 "IRE " MI_COL_HDRPAD_STR 865 /* 01234567[89ABCDEF] */ 866 "stq " MI_COL_HDRPAD_STR 867 /* 01234567[89ABCDEF] */ 868 "in_ill " MI_COL_HDRPAD_STR 869 /* 01234567[89ABCDEF] */ 870 "in_src_addr " 871 /* 123.123.123.123 */ 872 "max_frag " 873 /* 12345 */ 874 "ref "); 875 /* 123 */ 876 877 ire_walk_ill_mrtun(0, 0, ire_report_mrtun_table, mp, NULL); 878 return (0); 879 } 880 881 /* mrtun report table - supports ipv4_mrtun_ire_status ndd variable */ 882 883 static void 884 ire_report_mrtun_table(ire_t *ire, char *mp) 885 { 886 char buf1[INET_ADDRSTRLEN]; 887 int ref; 888 889 /* Number of active references of this ire */ 890 ref = ire->ire_refcnt; 891 ASSERT(ire->ire_type == IRE_MIPRTUN); 892 (void) mi_mpprintf((mblk_t *)mp, 893 MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR 894 "%s %05d %03d", 895 (void *)ire, (void *)ire->ire_stq, 896 (void *)ire->ire_in_ill, 897 ip_dot_addr(ire->ire_in_src_addr, buf1), 898 ire->ire_max_frag, ref); 899 } 900 901 /* 902 * Dispatch routine to format ires in interface based routine 903 */ 904 /* ARGSUSED */ 905 int 906 ip_ire_report_srcif(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr) 907 { 908 909 /* Report all interface based ires */ 910 911 (void) mi_mpprintf(mp, 912 "IRE " MI_COL_HDRPAD_STR 913 /* 01234567[89ABCDEF] */ 914 "stq " MI_COL_HDRPAD_STR 915 /* 01234567[89ABCDEF] */ 916 "in_ill " MI_COL_HDRPAD_STR 917 /* 01234567[89ABCDEF] */ 918 "addr " 919 /* 123.123.123.123 */ 920 "gateway " 921 /* 123.123.123.123 */ 922 "max_frag " 923 /* 12345 */ 924 
"ref " 925 /* 123 */ 926 "type " 927 /* ABCDEFGH */ 928 "in/out/forward"); 929 ire_walk_srcif_table_v4(ire_report_srcif_table, mp); 930 return (0); 931 } 932 933 /* Reports the interface table ires */ 934 static void 935 ire_report_srcif_table(ire_t *ire, char *mp) 936 { 937 char buf1[INET_ADDRSTRLEN]; 938 char buf2[INET_ADDRSTRLEN]; 939 int ref; 940 941 ref = ire->ire_refcnt; 942 (void) mi_mpprintf((mblk_t *)mp, 943 MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR 944 "%s %s %05d %03d %s %d", 945 (void *)ire, (void *)ire->ire_stq, 946 (void *)ire->ire_in_ill, 947 ip_dot_addr(ire->ire_addr, buf1), 948 ip_dot_addr(ire->ire_gateway_addr, buf2), 949 ire->ire_max_frag, ref, 950 ip_nv_lookup(ire_nv_tbl, (int)ire->ire_type), 951 ire->ire_ib_pkt_count); 952 953 } 954 /* 955 * ip_ire_req is called by ip_wput when an IRE_DB_REQ_TYPE message is handed 956 * down from the Upper Level Protocol to request a copy of the IRE (to check 957 * its type or to extract information like round-trip time estimates or the 958 * MTU.) 959 * The address is assumed to be in the ire_addr field. If no IRE is found 960 * an IRE is returned with ire_type being zero. 961 * Note that the upper lavel protocol has to check for broadcast 962 * (IRE_BROADCAST) and multicast (CLASSD(addr)). 963 * If there is a b_cont the resulting IRE_DB_TYPE mblk is placed at the 964 * end of the returned message. 965 * 966 * TCP sends down a message of this type with a connection request packet 967 * chained on. UDP and ICMP send it down to verify that a route exists for 968 * the destination address when they get connected. 969 */ 970 void 971 ip_ire_req(queue_t *q, mblk_t *mp) 972 { 973 ire_t *inire; 974 ire_t *ire; 975 mblk_t *mp1; 976 ire_t *sire = NULL; 977 zoneid_t zoneid = Q_TO_CONN(q)->conn_zoneid; 978 979 if ((mp->b_wptr - mp->b_rptr) < sizeof (ire_t) || 980 !OK_32PTR(mp->b_rptr)) { 981 freemsg(mp); 982 return; 983 } 984 inire = (ire_t *)mp->b_rptr; 985 /* 986 * Got it, now take our best shot at an IRE. 
987 */ 988 if (inire->ire_ipversion == IPV6_VERSION) { 989 ire = ire_route_lookup_v6(&inire->ire_addr_v6, 0, 0, 0, 990 NULL, &sire, zoneid, NULL, 991 (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT)); 992 } else { 993 ASSERT(inire->ire_ipversion == IPV4_VERSION); 994 ire = ire_route_lookup(inire->ire_addr, 0, 0, 0, 995 NULL, &sire, zoneid, NULL, 996 (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT)); 997 } 998 999 /* 1000 * We prevent returning IRES with source address INADDR_ANY 1001 * as these were temporarily created for sending packets 1002 * from endpoints that have conn_unspec_src set. 1003 */ 1004 if (ire == NULL || 1005 (ire->ire_ipversion == IPV4_VERSION && 1006 ire->ire_src_addr == INADDR_ANY) || 1007 (ire->ire_ipversion == IPV6_VERSION && 1008 IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6))) { 1009 inire->ire_type = 0; 1010 } else { 1011 bcopy(ire, inire, sizeof (ire_t)); 1012 /* Copy the route metrics from the parent. */ 1013 if (sire != NULL) { 1014 bcopy(&(sire->ire_uinfo), &(inire->ire_uinfo), 1015 sizeof (iulp_t)); 1016 } 1017 1018 /* 1019 * As we don't lookup global policy here, we may not 1020 * pass the right size if per-socket policy is not 1021 * present. For these cases, path mtu discovery will 1022 * do the right thing. 1023 */ 1024 inire->ire_ipsec_overhead = conn_ipsec_length(Q_TO_CONN(q)); 1025 1026 /* Pass the latest setting of the ip_path_mtu_discovery */ 1027 inire->ire_frag_flag |= (ip_path_mtu_discovery) ? IPH_DF : 0; 1028 } 1029 if (ire != NULL) 1030 ire_refrele(ire); 1031 if (sire != NULL) 1032 ire_refrele(sire); 1033 mp->b_wptr = &mp->b_rptr[sizeof (ire_t)]; 1034 mp->b_datap->db_type = IRE_DB_TYPE; 1035 1036 /* Put the IRE_DB_TYPE mblk last in the chain */ 1037 mp1 = mp->b_cont; 1038 if (mp1 != NULL) { 1039 mp->b_cont = NULL; 1040 linkb(mp1, mp); 1041 mp = mp1; 1042 } 1043 qreply(q, mp); 1044 } 1045 1046 /* 1047 * Send a packet using the specified IRE. 
1048 * If ire_src_addr_v6 is all zero then discard the IRE after 1049 * the packet has been sent. 1050 */ 1051 static void 1052 ire_send(queue_t *q, mblk_t *pkt, ire_t *ire) 1053 { 1054 mblk_t *ipsec_mp; 1055 boolean_t is_secure; 1056 uint_t ifindex; 1057 ill_t *ill; 1058 zoneid_t zoneid = ire->ire_zoneid; 1059 1060 ASSERT(ire->ire_ipversion == IPV4_VERSION); 1061 ASSERT(!(ire->ire_type & IRE_LOCAL)); /* Has different ire_zoneid */ 1062 ipsec_mp = pkt; 1063 is_secure = (pkt->b_datap->db_type == M_CTL); 1064 if (is_secure) { 1065 ipsec_out_t *io; 1066 1067 pkt = pkt->b_cont; 1068 io = (ipsec_out_t *)ipsec_mp->b_rptr; 1069 if (io->ipsec_out_type == IPSEC_OUT) 1070 zoneid = io->ipsec_out_zoneid; 1071 } 1072 1073 /* If the packet originated externally then */ 1074 if (pkt->b_prev) { 1075 ire_refrele(ire); 1076 /* 1077 * Extract the ifindex from b_prev (set in ip_rput_noire). 1078 * Look up interface to see if it still exists (it could have 1079 * been unplumbed by the time the reply came back from ARP) 1080 */ 1081 ifindex = (uint_t)(uintptr_t)pkt->b_prev; 1082 ill = ill_lookup_on_ifindex(ifindex, B_FALSE, 1083 NULL, NULL, NULL, NULL); 1084 if (ill == NULL) { 1085 pkt->b_prev = NULL; 1086 pkt->b_next = NULL; 1087 freemsg(ipsec_mp); 1088 return; 1089 } 1090 q = ill->ill_rq; 1091 pkt->b_prev = NULL; 1092 /* 1093 * This packet has not gone through IPSEC processing 1094 * and hence we should not have any IPSEC message 1095 * prepended. 1096 */ 1097 ASSERT(ipsec_mp == pkt); 1098 put(q, pkt); 1099 ill_refrele(ill); 1100 } else if (pkt->b_next) { 1101 /* Packets from multicast router */ 1102 pkt->b_next = NULL; 1103 /* 1104 * We never get the IPSEC_OUT while forwarding the 1105 * packet for multicast router. 
1106 */ 1107 ASSERT(ipsec_mp == pkt); 1108 ip_rput_forward(ire, (ipha_t *)pkt->b_rptr, ipsec_mp, NULL); 1109 ire_refrele(ire); 1110 } else { 1111 /* Locally originated packets */ 1112 boolean_t is_inaddr_any; 1113 ipha_t *ipha = (ipha_t *)pkt->b_rptr; 1114 1115 /* 1116 * We need to do an ire_delete below for which 1117 * we need to make sure that the IRE will be 1118 * around even after calling ip_wput_ire - 1119 * which does ire_refrele. Otherwise somebody 1120 * could potentially delete this ire and hence 1121 * free this ire and we will be calling ire_delete 1122 * on a freed ire below. 1123 */ 1124 is_inaddr_any = (ire->ire_src_addr == INADDR_ANY); 1125 if (is_inaddr_any) { 1126 IRE_REFHOLD(ire); 1127 } 1128 /* 1129 * If we were resolving a router we can not use the 1130 * routers IRE for sending the packet (since it would 1131 * violate the uniqness of the IP idents) thus we 1132 * make another pass through ip_wput to create the IRE_CACHE 1133 * for the destination. 1134 * When IRE_MARK_NOADD is set, ire_add() is not called. 1135 * Thus ip_wput() will never find a ire and result in an 1136 * infinite loop. Thus we check whether IRE_MARK_NOADD is 1137 * is set. This also implies that IRE_MARK_NOADD can only be 1138 * used to send packets to directly connected hosts. 1139 */ 1140 if (ipha->ipha_dst != ire->ire_addr && 1141 !(ire->ire_marks & IRE_MARK_NOADD)) { 1142 ire_refrele(ire); /* Held in ire_add */ 1143 if (CONN_Q(q)) { 1144 (void) ip_output(Q_TO_CONN(q), ipsec_mp, q, 1145 IRE_SEND); 1146 } else { 1147 (void) ip_output((void *)(uintptr_t)zoneid, 1148 ipsec_mp, q, IRE_SEND); 1149 } 1150 } else { 1151 if (is_secure) { 1152 ipsec_out_t *oi; 1153 ipha_t *ipha; 1154 1155 oi = (ipsec_out_t *)ipsec_mp->b_rptr; 1156 ipha = (ipha_t *)ipsec_mp->b_cont->b_rptr; 1157 if (oi->ipsec_out_proc_begin) { 1158 /* 1159 * This is the case where 1160 * ip_wput_ipsec_out could not find 1161 * the IRE and recreated a new one. 
1162 * As ip_wput_ipsec_out does ire 1163 * lookups, ire_refrele for the extra 1164 * bump in ire_add. 1165 */ 1166 ire_refrele(ire); 1167 ip_wput_ipsec_out(q, ipsec_mp, ipha, 1168 NULL, NULL); 1169 } else { 1170 /* 1171 * IRE_REFRELE will be done in 1172 * ip_wput_ire. 1173 */ 1174 ip_wput_ire(q, ipsec_mp, ire, NULL, 1175 IRE_SEND, zoneid); 1176 } 1177 } else { 1178 /* 1179 * IRE_REFRELE will be done in ip_wput_ire. 1180 */ 1181 ip_wput_ire(q, ipsec_mp, ire, NULL, 1182 IRE_SEND, zoneid); 1183 } 1184 } 1185 /* 1186 * Special code to support sending a single packet with 1187 * conn_unspec_src using an IRE which has no source address. 1188 * The IRE is deleted here after sending the packet to avoid 1189 * having other code trip on it. But before we delete the 1190 * ire, somebody could have looked up this ire. 1191 * We prevent returning/using this IRE by the upper layers 1192 * by making checks to NULL source address in other places 1193 * like e.g ip_ire_append, ip_ire_req and ip_bind_connected. 1194 * Though, this does not completely prevent other threads 1195 * from using this ire, this should not cause any problems. 1196 * 1197 * NOTE : We use is_inaddr_any instead of using ire_src_addr 1198 * because for the normal case i.e !is_inaddr_any, ire_refrele 1199 * above could have potentially freed the ire. 1200 */ 1201 if (is_inaddr_any) { 1202 /* 1203 * If this IRE has been deleted by another thread, then 1204 * ire_bucket won't be NULL, but ire_ptpn will be NULL. 1205 * Thus, ire_delete will do nothing. This check 1206 * guards against calling ire_delete when the IRE was 1207 * never inserted in the table, which is handled by 1208 * ire_delete as dropping another reference. 1209 */ 1210 if (ire->ire_bucket != NULL) { 1211 ip1dbg(("ire_send: delete IRE\n")); 1212 ire_delete(ire); 1213 } 1214 ire_refrele(ire); /* Held above */ 1215 } 1216 } 1217 } 1218 1219 /* 1220 * Send a packet using the specified IRE. 
1221 * If ire_src_addr_v6 is all zero then discard the IRE after 1222 * the packet has been sent. 1223 */ 1224 static void 1225 ire_send_v6(queue_t *q, mblk_t *pkt, ire_t *ire) 1226 { 1227 mblk_t *ipsec_mp; 1228 boolean_t secure; 1229 uint_t ifindex; 1230 zoneid_t zoneid = ire->ire_zoneid; 1231 1232 ASSERT(ire->ire_ipversion == IPV6_VERSION); 1233 ASSERT(!(ire->ire_type & IRE_LOCAL)); /* Has different ire_zoneid */ 1234 if (pkt->b_datap->db_type == M_CTL) { 1235 ipsec_out_t *io; 1236 1237 ipsec_mp = pkt; 1238 pkt = pkt->b_cont; 1239 secure = B_TRUE; 1240 io = (ipsec_out_t *)ipsec_mp->b_rptr; 1241 if (io->ipsec_out_type == IPSEC_OUT) 1242 zoneid = io->ipsec_out_zoneid; 1243 } else { 1244 ipsec_mp = pkt; 1245 secure = B_FALSE; 1246 } 1247 1248 /* If the packet originated externally then */ 1249 if (pkt->b_prev) { 1250 ill_t *ill; 1251 /* 1252 * Extract the ifindex from b_prev (set in ip_rput_data_v6). 1253 * Look up interface to see if it still exists (it could have 1254 * been unplumbed by the time the reply came back from the 1255 * resolver). 1256 */ 1257 ifindex = (uint_t)(uintptr_t)pkt->b_prev; 1258 ill = ill_lookup_on_ifindex(ifindex, B_TRUE, 1259 NULL, NULL, NULL, NULL); 1260 if (ill == NULL) { 1261 pkt->b_prev = NULL; 1262 pkt->b_next = NULL; 1263 freemsg(ipsec_mp); 1264 ire_refrele(ire); /* Held in ire_add */ 1265 return; 1266 } 1267 q = ill->ill_rq; 1268 pkt->b_prev = NULL; 1269 /* 1270 * This packet has not gone through IPSEC processing 1271 * and hence we should not have any IPSEC message 1272 * prepended. 1273 */ 1274 ASSERT(ipsec_mp == pkt); 1275 put(q, pkt); 1276 ill_refrele(ill); 1277 } else if (pkt->b_next) { 1278 /* Packets from multicast router */ 1279 pkt->b_next = NULL; 1280 /* 1281 * We never get the IPSEC_OUT while forwarding the 1282 * packet for multicast router. 1283 */ 1284 ASSERT(ipsec_mp == pkt); 1285 /* 1286 * XXX TODO IPv6. 
1287 */ 1288 freemsg(pkt); 1289 #ifdef XXX 1290 ip_rput_forward(ire, (ipha_t *)pkt->b_rptr, pkt, NULL); 1291 #endif 1292 } else { 1293 if (secure) { 1294 ipsec_out_t *oi; 1295 ip6_t *ip6h; 1296 1297 oi = (ipsec_out_t *)ipsec_mp->b_rptr; 1298 ip6h = (ip6_t *)ipsec_mp->b_cont->b_rptr; 1299 if (oi->ipsec_out_proc_begin) { 1300 /* 1301 * This is the case where 1302 * ip_wput_ipsec_out could not find 1303 * the IRE and recreated a new one. 1304 */ 1305 ip_wput_ipsec_out_v6(q, ipsec_mp, ip6h, 1306 NULL, NULL); 1307 } else { 1308 if (CONN_Q(q)) { 1309 (void) ip_output_v6(Q_TO_CONN(q), 1310 ipsec_mp, q, IRE_SEND); 1311 } else { 1312 (void) ip_output_v6( 1313 (void *)(uintptr_t)zoneid, 1314 ipsec_mp, q, IRE_SEND); 1315 } 1316 } 1317 } else { 1318 /* 1319 * Send packets through ip_output_v6 so that any 1320 * ip6_info header can be processed again. 1321 */ 1322 if (CONN_Q(q)) { 1323 (void) ip_output_v6(Q_TO_CONN(q), ipsec_mp, q, 1324 IRE_SEND); 1325 } else { 1326 (void) ip_output_v6((void *)(uintptr_t)zoneid, 1327 ipsec_mp, q, IRE_SEND); 1328 } 1329 } 1330 /* 1331 * Special code to support sending a single packet with 1332 * conn_unspec_src using an IRE which has no source address. 1333 * The IRE is deleted here after sending the packet to avoid 1334 * having other code trip on it. But before we delete the 1335 * ire, somebody could have looked up this ire. 1336 * We prevent returning/using this IRE by the upper layers 1337 * by making checks to NULL source address in other places 1338 * like e.g ip_ire_append_v6, ip_ire_req and 1339 * ip_bind_connected_v6. Though, this does not completely 1340 * prevent other threads from using this ire, this should 1341 * not cause any problems. 1342 */ 1343 if (IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6)) { 1344 ip1dbg(("ire_send_v6: delete IRE\n")); 1345 ire_delete(ire); 1346 } 1347 } 1348 ire_refrele(ire); /* Held in ire_add */ 1349 } 1350 1351 /* 1352 * Make sure that IRE bucket does not get too long. 
1353 * This can cause lock up because ire_cache_lookup() 1354 * may take "forever" to finish. 1355 * 1356 * We just remove cnt IREs each time. This means that 1357 * the bucket length will stay approximately constant, 1358 * depending on cnt. This should be enough to defend 1359 * against DoS attack based on creating temporary IREs 1360 * (for forwarding and non-TCP traffic). 1361 * 1362 * Note that new IRE is normally added at the tail of the 1363 * bucket. This means that we are removing the "oldest" 1364 * temporary IRE added. Only if there are IREs with 1365 * the same ire_addr, do we not add it at the tail. Refer 1366 * to ire_add_v*(). It should be OK for our purpose. 1367 * 1368 * For non-temporary cached IREs, we make sure that they 1369 * have not been used for some time (defined below), they 1370 * are non-local destinations, and there is no one using 1371 * them at the moment (refcnt == 1). 1372 * 1373 * The above means that the IRE bucket length may become 1374 * very long, consisting of mostly non-temporary IREs. 1375 * This can happen when the hash function does a bad job 1376 * so that most TCP connections cluster to a specific bucket. 1377 * This "hopefully" should never happen. It can also 1378 * happen if most TCP connections have very long lives. 1379 * Even with the minimal hash table size of 256, there 1380 * has to be a lot of such connections to make the bucket 1381 * length unreasonably long. This should probably not 1382 * happen either. The third can when this can happen is 1383 * when the machine is under attack, such as SYN flooding. 1384 * TCP should already have the proper mechanism to protect 1385 * that. So we should be safe. 1386 * 1387 * This function is called by ire_add_then_send() after 1388 * a new IRE is added and the packet is sent. 1389 * 1390 * The idle cutoff interval is set to 60s. It can be 1391 * changed using /etc/system. 
1392 */ 1393 uint32_t ire_idle_cutoff_interval = 60000; 1394 1395 static void 1396 ire_cache_cleanup(irb_t *irb, uint32_t threshold, int cnt) 1397 { 1398 ire_t *ire; 1399 int tmp_cnt = cnt; 1400 clock_t cut_off = drv_usectohz(ire_idle_cutoff_interval * 1000); 1401 1402 /* 1403 * irb is NULL if the IRE is not added to the hash. This 1404 * happens when IRE_MARK_NOADD is set in ire_add_then_send() 1405 * and when ires are returned from ire_update_srcif_v4() routine. 1406 */ 1407 if (irb == NULL) 1408 return; 1409 1410 IRB_REFHOLD(irb); 1411 if (irb->irb_tmp_ire_cnt > threshold) { 1412 for (ire = irb->irb_ire; ire != NULL && tmp_cnt > 0; 1413 ire = ire->ire_next) { 1414 if (ire->ire_marks & IRE_MARK_CONDEMNED) 1415 continue; 1416 if (ire->ire_marks & IRE_MARK_TEMPORARY) { 1417 ASSERT(ire->ire_type == IRE_CACHE); 1418 ire_delete(ire); 1419 tmp_cnt--; 1420 } 1421 } 1422 } 1423 if (irb->irb_ire_cnt - irb->irb_tmp_ire_cnt > threshold) { 1424 for (ire = irb->irb_ire; ire != NULL && cnt > 0; 1425 ire = ire->ire_next) { 1426 if (ire->ire_marks & IRE_MARK_CONDEMNED || 1427 ire->ire_gateway_addr == 0) { 1428 continue; 1429 } 1430 if ((ire->ire_type == IRE_CACHE) && 1431 (lbolt - ire->ire_last_used_time > cut_off) && 1432 (ire->ire_refcnt == 1)) { 1433 ire_delete(ire); 1434 cnt--; 1435 } 1436 } 1437 } 1438 IRB_REFRELE(irb); 1439 } 1440 1441 /* 1442 * ire_add_then_send is called when a new IRE has been created in order to 1443 * route an outgoing packet. Typically, it is called from ip_wput when 1444 * a response comes back down from a resolver. We add the IRE, and then 1445 * possibly run the packet through ip_wput or ip_rput, as appropriate. 1446 * However, we do not add the newly created IRE in the cache when 1447 * IRE_MARK_NOADD is set in the IRE. IRE_MARK_NOADD is set at 1448 * ip_newroute_ipif(). The ires with IRE_MARK_NOADD and ires returned 1449 * by ire_update_srcif_v4() are ire_refrele'd by ip_wput_ire() and get 1450 * deleted. 
1451 * Multirouting support: the packet is silently discarded when the new IRE 1452 * holds the RTF_MULTIRT flag, but is not the first IRE to be added with the 1453 * RTF_MULTIRT flag for the same destination address. 1454 * In this case, we just want to register this additional ire without 1455 * sending the packet, as it has already been replicated through 1456 * existing multirt routes in ip_wput(). 1457 */ 1458 void 1459 ire_add_then_send(queue_t *q, ire_t *ire, mblk_t *mp) 1460 { 1461 irb_t *irb; 1462 boolean_t drop = B_FALSE; 1463 /* LINTED : set but not used in function */ 1464 boolean_t mctl_present; 1465 mblk_t *first_mp = NULL; 1466 mblk_t *save_mp = NULL; 1467 ire_t *dst_ire; 1468 ipha_t *ipha; 1469 ip6_t *ip6h; 1470 1471 if (mp != NULL) { 1472 /* 1473 * We first have to retrieve the destination address carried 1474 * by the packet. 1475 * We can't rely on ire as it can be related to a gateway. 1476 * The destination address will help in determining if 1477 * other RTF_MULTIRT ires are already registered. 1478 * 1479 * We first need to know where we are going : v4 or V6. 1480 * the ire version is enough, as there is no risk that 1481 * we resolve an IPv6 address with an IPv4 ire 1482 * or vice versa. 1483 */ 1484 if (ire->ire_ipversion == IPV4_VERSION) { 1485 EXTRACT_PKT_MP(mp, first_mp, mctl_present); 1486 ipha = (ipha_t *)mp->b_rptr; 1487 save_mp = mp; 1488 mp = first_mp; 1489 1490 dst_ire = ire_cache_lookup(ipha->ipha_dst, 1491 ire->ire_zoneid, MBLK_GETLABEL(mp)); 1492 } else { 1493 ASSERT(ire->ire_ipversion == IPV6_VERSION); 1494 /* 1495 * Get a pointer to the beginning of the IPv6 header. 1496 * Ignore leading IPsec control mblks. 
1497 */ 1498 first_mp = mp; 1499 if (mp->b_datap->db_type == M_CTL) { 1500 mp = mp->b_cont; 1501 } 1502 ip6h = (ip6_t *)mp->b_rptr; 1503 save_mp = mp; 1504 mp = first_mp; 1505 dst_ire = ire_cache_lookup_v6(&ip6h->ip6_dst, 1506 ire->ire_zoneid, MBLK_GETLABEL(mp)); 1507 } 1508 if (dst_ire != NULL) { 1509 if (dst_ire->ire_flags & RTF_MULTIRT) { 1510 /* 1511 * At least one resolved multirt route 1512 * already exists for the destination, 1513 * don't sent this packet: either drop it 1514 * or complete the pending resolution, 1515 * depending on the ire. 1516 */ 1517 drop = B_TRUE; 1518 } 1519 ip1dbg(("ire_add_then_send: dst_ire %p " 1520 "[dst %08x, gw %08x], drop %d\n", 1521 (void *)dst_ire, 1522 (dst_ire->ire_ipversion == IPV4_VERSION) ? \ 1523 ntohl(dst_ire->ire_addr) : \ 1524 ntohl(V4_PART_OF_V6(dst_ire->ire_addr_v6)), 1525 (dst_ire->ire_ipversion == IPV4_VERSION) ? \ 1526 ntohl(dst_ire->ire_gateway_addr) : \ 1527 ntohl(V4_PART_OF_V6( 1528 dst_ire->ire_gateway_addr_v6)), 1529 drop)); 1530 ire_refrele(dst_ire); 1531 } 1532 } 1533 1534 if (!(ire->ire_marks & IRE_MARK_NOADD)) { 1535 /* 1536 * Regular packets with cache bound ires and 1537 * the packets from ARP response for ires which 1538 * belong to the ire_srcif_v4 table, are here. 1539 */ 1540 if (ire->ire_in_ill == NULL) { 1541 /* Add the ire */ 1542 (void) ire_add(&ire, NULL, NULL, NULL, B_FALSE); 1543 } else { 1544 /* 1545 * This must be ARP response for ire in interface based 1546 * table. Note that we don't add them in cache table, 1547 * instead we update the existing table with dlureq_mp 1548 * information. The reverse tunnel ires do not come 1549 * here, as reverse tunnel is non-resolver interface. 1550 * XXX- another design alternative was to mark the 1551 * ires in interface based table with a special mark to 1552 * make absolutely sure that we operate in right ires. 
1553 * This idea was not implemented as part of code review 1554 * suggestion, as ire_in_ill suffice to distinguish 1555 * between the regular ires and interface based 1556 * ires now and thus we save a bit in the ire_marks. 1557 */ 1558 ire = ire_update_srcif_v4(ire); 1559 } 1560 1561 if (ire == NULL) { 1562 mp->b_prev = NULL; 1563 mp->b_next = NULL; 1564 MULTIRT_DEBUG_UNTAG(mp); 1565 freemsg(mp); 1566 return; 1567 } 1568 if (mp == NULL) { 1569 ire_refrele(ire); /* Held in ire_add_v4/v6 */ 1570 return; 1571 } 1572 } 1573 if (drop) { 1574 /* 1575 * If we're adding an RTF_MULTIRT ire, the resolution 1576 * is over: we just drop the packet. 1577 */ 1578 if (ire->ire_flags & RTF_MULTIRT) { 1579 if (save_mp) { 1580 save_mp->b_prev = NULL; 1581 save_mp->b_next = NULL; 1582 } 1583 MULTIRT_DEBUG_UNTAG(mp); 1584 freemsg(mp); 1585 } else { 1586 /* 1587 * Otherwise, we're adding the ire to a gateway 1588 * for a multirt route. 1589 * Invoke ip_newroute() to complete the resolution 1590 * of the route. We will then come back here and 1591 * finally drop this packet in the above code. 1592 */ 1593 if (ire->ire_ipversion == IPV4_VERSION) { 1594 /* 1595 * TODO: in order for CGTP to work in non-global 1596 * zones, ip_newroute() must create the IRE 1597 * cache in the zone indicated by 1598 * ire->ire_zoneid. 1599 */ 1600 ip_newroute(q, mp, ipha->ipha_dst, 0, 1601 (CONN_Q(q) ? Q_TO_CONN(q) : NULL), 1602 ire->ire_zoneid); 1603 } else { 1604 ASSERT(ire->ire_ipversion == IPV6_VERSION); 1605 ip_newroute_v6(q, mp, &ip6h->ip6_dst, NULL, 1606 NULL, ire->ire_zoneid); 1607 } 1608 } 1609 1610 ire_refrele(ire); /* As done by ire_send(). */ 1611 return; 1612 } 1613 /* 1614 * Need to remember ire_bucket here as ire_send*() may delete 1615 * the ire so we cannot reference it after that. 
1616 */ 1617 irb = ire->ire_bucket; 1618 if (ire->ire_ipversion == IPV6_VERSION) { 1619 ire_send_v6(q, mp, ire); 1620 /* 1621 * Clean up more than 1 IRE so that the clean up does not 1622 * need to be done every time when a new IRE is added and 1623 * the threshold is reached. 1624 */ 1625 ire_cache_cleanup(irb, ip6_ire_max_bucket_cnt, 2); 1626 } else { 1627 ire_send(q, mp, ire); 1628 ire_cache_cleanup(irb, ip_ire_max_bucket_cnt, 2); 1629 } 1630 } 1631 1632 /* 1633 * Initialize the ire that is specific to IPv4 part and call 1634 * ire_init_common to finish it. 1635 */ 1636 ire_t * 1637 ire_init(ire_t *ire, uchar_t *addr, uchar_t *mask, uchar_t *src_addr, 1638 uchar_t *gateway, uchar_t *in_src_addr, uint_t *max_fragp, mblk_t *fp_mp, 1639 queue_t *rfq, queue_t *stq, ushort_t type, mblk_t *dlureq_mp, ipif_t *ipif, 1640 ill_t *in_ill, ipaddr_t cmask, uint32_t phandle, uint32_t ihandle, 1641 uint32_t flags, const iulp_t *ulp_info, tsol_gc_t *gc, tsol_gcgrp_t *gcgrp) 1642 { 1643 /* 1644 * Reject IRE security attribute creation/initialization 1645 * if system is not running in Trusted mode. 1646 */ 1647 if ((gc != NULL || gcgrp != NULL) && !is_system_labeled()) 1648 return (NULL); 1649 1650 if (fp_mp != NULL) { 1651 /* 1652 * We can't dupb() here as multiple threads could be 1653 * calling dupb on the same mp which is incorrect. 1654 * First dupb() should be called only by one thread. 1655 */ 1656 fp_mp = copyb(fp_mp); 1657 if (fp_mp == NULL) 1658 return (NULL); 1659 } 1660 1661 if (dlureq_mp != NULL) { 1662 /* 1663 * We can't dupb() here as multiple threads could be 1664 * calling dupb on the same mp which is incorrect. 1665 * First dupb() should be called only by one thread. 
1666 */ 1667 dlureq_mp = copyb(dlureq_mp); 1668 if (dlureq_mp == NULL) { 1669 if (fp_mp != NULL) 1670 freeb(fp_mp); 1671 return (NULL); 1672 } 1673 } 1674 1675 /* 1676 * Check that IRE_IF_RESOLVER and IRE_IF_NORESOLVER have a 1677 * dlureq_mp which is the ill_resolver_mp for IRE_IF_RESOLVER 1678 * and DL_UNITDATA_REQ for IRE_IF_NORESOLVER. 1679 */ 1680 if ((type & IRE_INTERFACE) && 1681 dlureq_mp == NULL) { 1682 ASSERT(fp_mp == NULL); 1683 ip0dbg(("ire_init: no dlureq_mp\n")); 1684 return (NULL); 1685 } 1686 1687 BUMP_IRE_STATS(ire_stats_v4, ire_stats_alloced); 1688 1689 if (addr != NULL) 1690 bcopy(addr, &ire->ire_addr, IP_ADDR_LEN); 1691 if (src_addr != NULL) 1692 bcopy(src_addr, &ire->ire_src_addr, IP_ADDR_LEN); 1693 if (mask != NULL) { 1694 bcopy(mask, &ire->ire_mask, IP_ADDR_LEN); 1695 ire->ire_masklen = ip_mask_to_plen(ire->ire_mask); 1696 } 1697 if (gateway != NULL) { 1698 bcopy(gateway, &ire->ire_gateway_addr, IP_ADDR_LEN); 1699 } 1700 if (in_src_addr != NULL) { 1701 bcopy(in_src_addr, &ire->ire_in_src_addr, IP_ADDR_LEN); 1702 } 1703 1704 if (type == IRE_CACHE) 1705 ire->ire_cmask = cmask; 1706 1707 /* ire_init_common will free the mblks upon encountering any failure */ 1708 if (!ire_init_common(ire, max_fragp, fp_mp, rfq, stq, type, dlureq_mp, 1709 ipif, in_ill, phandle, ihandle, flags, IPV4_VERSION, ulp_info, 1710 gc, gcgrp)) 1711 return (NULL); 1712 1713 return (ire); 1714 } 1715 1716 /* 1717 * Similar to ire_create except that it is called only when 1718 * we want to allocate ire as an mblk e.g. we have an external 1719 * resolver ARP. 
1720 */ 1721 ire_t * 1722 ire_create_mp(uchar_t *addr, uchar_t *mask, uchar_t *src_addr, uchar_t *gateway, 1723 uchar_t *in_src_addr, uint_t max_frag, mblk_t *fp_mp, queue_t *rfq, 1724 queue_t *stq, ushort_t type, mblk_t *dlureq_mp, ipif_t *ipif, ill_t *in_ill, 1725 ipaddr_t cmask, uint32_t phandle, uint32_t ihandle, uint32_t flags, 1726 const iulp_t *ulp_info, tsol_gc_t *gc, tsol_gcgrp_t *gcgrp) 1727 { 1728 ire_t *ire, *buf; 1729 ire_t *ret_ire; 1730 mblk_t *mp; 1731 size_t bufsize; 1732 frtn_t *frtnp; 1733 ill_t *ill; 1734 1735 bufsize = sizeof (ire_t) + sizeof (frtn_t); 1736 buf = kmem_alloc(bufsize, KM_NOSLEEP); 1737 if (buf == NULL) { 1738 ip1dbg(("ire_create_mp: alloc failed\n")); 1739 return (NULL); 1740 } 1741 frtnp = (frtn_t *)(buf + 1); 1742 frtnp->free_arg = (caddr_t)buf; 1743 frtnp->free_func = ire_freemblk; 1744 1745 /* 1746 * Allocate the new IRE. The ire created will hold a ref on 1747 * an nce_t after ire_nce_init, and this ref must either be 1748 * (a) transferred to the ire_cache entry created when ire_add_v4 1749 * is called after successful arp resolution, or, 1750 * (b) released, when arp resolution fails 1751 * Case (b) is handled in ire_freemblk() which will be called 1752 * when mp is freed as a result of failed arp. 1753 */ 1754 mp = esballoc((unsigned char *)buf, bufsize, BPRI_MED, frtnp); 1755 if (mp == NULL) { 1756 ip1dbg(("ire_create_mp: alloc failed\n")); 1757 kmem_free(buf, bufsize); 1758 return (NULL); 1759 } 1760 ire = (ire_t *)mp->b_rptr; 1761 mp->b_wptr = (uchar_t *)&ire[1]; 1762 1763 /* Start clean. 
*/ 1764 *ire = ire_null; 1765 ire->ire_mp = mp; 1766 mp->b_datap->db_type = IRE_DB_TYPE; 1767 ire->ire_marks |= IRE_MARK_UNCACHED; 1768 1769 ret_ire = ire_init(ire, addr, mask, src_addr, gateway, in_src_addr, 1770 NULL, fp_mp, rfq, stq, type, dlureq_mp, ipif, in_ill, cmask, 1771 phandle, ihandle, flags, ulp_info, gc, gcgrp); 1772 1773 ill = (ill_t *)(stq->q_ptr); 1774 if (ret_ire == NULL) { 1775 ire->ire_stq_ifindex = ill->ill_phyint->phyint_ifindex; 1776 freeb(ire->ire_mp); 1777 return (NULL); 1778 } 1779 ret_ire->ire_stq_ifindex = ill->ill_phyint->phyint_ifindex; 1780 ASSERT(ret_ire == ire); 1781 /* 1782 * ire_max_frag is normally zero here and is atomically set 1783 * under the irebucket lock in ire_add_v[46] except for the 1784 * case of IRE_MARK_NOADD. In that event the the ire_max_frag 1785 * is non-zero here. 1786 */ 1787 ire->ire_max_frag = max_frag; 1788 return (ire); 1789 } 1790 1791 /* 1792 * ire_create is called to allocate and initialize a new IRE. 1793 * 1794 * NOTE : This is called as writer sometimes though not required 1795 * by this function. 
1796 */ 1797 ire_t * 1798 ire_create(uchar_t *addr, uchar_t *mask, uchar_t *src_addr, uchar_t *gateway, 1799 uchar_t *in_src_addr, uint_t *max_fragp, mblk_t *fp_mp, queue_t *rfq, 1800 queue_t *stq, ushort_t type, mblk_t *dlureq_mp, ipif_t *ipif, ill_t *in_ill, 1801 ipaddr_t cmask, uint32_t phandle, uint32_t ihandle, uint32_t flags, 1802 const iulp_t *ulp_info, tsol_gc_t *gc, tsol_gcgrp_t *gcgrp) 1803 { 1804 ire_t *ire; 1805 ire_t *ret_ire; 1806 1807 ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP); 1808 if (ire == NULL) { 1809 ip1dbg(("ire_create: alloc failed\n")); 1810 return (NULL); 1811 } 1812 *ire = ire_null; 1813 1814 ret_ire = ire_init(ire, addr, mask, src_addr, gateway, in_src_addr, 1815 max_fragp, fp_mp, rfq, stq, type, dlureq_mp, ipif, in_ill, cmask, 1816 phandle, ihandle, flags, ulp_info, gc, gcgrp); 1817 1818 if (ret_ire == NULL) { 1819 kmem_cache_free(ire_cache, ire); 1820 return (NULL); 1821 } 1822 ASSERT(ret_ire == ire); 1823 return (ire); 1824 } 1825 1826 1827 /* 1828 * Common to IPv4 and IPv6 1829 */ 1830 boolean_t 1831 ire_init_common(ire_t *ire, uint_t *max_fragp, mblk_t *fp_mp, 1832 queue_t *rfq, queue_t *stq, ushort_t type, 1833 mblk_t *dlureq_mp, ipif_t *ipif, ill_t *in_ill, uint32_t phandle, 1834 uint32_t ihandle, uint32_t flags, uchar_t ipversion, 1835 const iulp_t *ulp_info, tsol_gc_t *gc, tsol_gcgrp_t *gcgrp) 1836 { 1837 ire->ire_max_fragp = max_fragp; 1838 ire->ire_frag_flag |= (ip_path_mtu_discovery) ? 
IPH_DF : 0; 1839 1840 ASSERT(fp_mp == NULL || fp_mp->b_datap->db_type == M_DATA); 1841 #ifdef DEBUG 1842 if (ipif != NULL) { 1843 if (ipif->ipif_isv6) 1844 ASSERT(ipversion == IPV6_VERSION); 1845 else 1846 ASSERT(ipversion == IPV4_VERSION); 1847 } 1848 #endif /* DEBUG */ 1849 1850 /* 1851 * Create/initialize IRE security attribute only in Trusted mode; 1852 * if the passed in gc/gcgrp is non-NULL, we expect that the caller 1853 * has held a reference to it and will release it when this routine 1854 * returns a failure, otherwise we own the reference. We do this 1855 * prior to initializing the rest IRE fields. 1856 * 1857 * Don't allocate ire_gw_secattr for the resolver case to prevent 1858 * memory leak (in case of external resolution failure). We'll 1859 * allocate it after a successful external resolution, in ire_add(). 1860 * Note that ire->ire_mp != NULL here means this ire is headed 1861 * to an external resolver. 1862 */ 1863 if (is_system_labeled()) { 1864 if ((type & (IRE_LOCAL | IRE_LOOPBACK | IRE_BROADCAST | 1865 IRE_INTERFACE)) != 0) { 1866 /* release references on behalf of caller */ 1867 if (gc != NULL) 1868 GC_REFRELE(gc); 1869 if (gcgrp != NULL) 1870 GCGRP_REFRELE(gcgrp); 1871 } else if ((ire->ire_mp == NULL) && 1872 tsol_ire_init_gwattr(ire, ipversion, gc, gcgrp) != 0) { 1873 /* free any caller-allocated mblks upon failure */ 1874 if (fp_mp != NULL) 1875 freeb(fp_mp); 1876 if (dlureq_mp != NULL) 1877 freeb(dlureq_mp); 1878 return (B_FALSE); 1879 } 1880 } 1881 1882 ire->ire_stq = stq; 1883 ire->ire_rfq = rfq; 1884 ire->ire_type = type; 1885 ire->ire_flags = RTF_UP | flags; 1886 ire->ire_ident = TICK_TO_MSEC(lbolt); 1887 bcopy(ulp_info, &ire->ire_uinfo, sizeof (iulp_t)); 1888 1889 ire->ire_tire_mark = ire->ire_ob_pkt_count + ire->ire_ib_pkt_count; 1890 ire->ire_last_used_time = lbolt; 1891 ire->ire_create_time = (uint32_t)gethrestime_sec(); 1892 1893 /* 1894 * If this IRE is an IRE_CACHE, inherit the handles from the 1895 * parent IREs. 
For others in the forwarding table, assign appropriate 1896 * new ones. 1897 * 1898 * The mutex protecting ire_handle is because ire_create is not always 1899 * called as a writer. 1900 */ 1901 if (ire->ire_type & IRE_OFFSUBNET) { 1902 mutex_enter(&ire_handle_lock); 1903 ire->ire_phandle = (uint32_t)ire_handle++; 1904 mutex_exit(&ire_handle_lock); 1905 } else if (ire->ire_type & IRE_INTERFACE) { 1906 mutex_enter(&ire_handle_lock); 1907 ire->ire_ihandle = (uint32_t)ire_handle++; 1908 mutex_exit(&ire_handle_lock); 1909 } else if (ire->ire_type == IRE_CACHE) { 1910 ire->ire_phandle = phandle; 1911 ire->ire_ihandle = ihandle; 1912 } 1913 ire->ire_in_ill = in_ill; 1914 ire->ire_ipif = ipif; 1915 if (ipif != NULL) { 1916 ire->ire_ipif_seqid = ipif->ipif_seqid; 1917 ire->ire_zoneid = ipif->ipif_zoneid; 1918 } else { 1919 ire->ire_zoneid = GLOBAL_ZONEID; 1920 } 1921 ire->ire_ipversion = ipversion; 1922 mutex_init(&ire->ire_lock, NULL, MUTEX_DEFAULT, NULL); 1923 if (ipversion == IPV4_VERSION) { 1924 if (ire_nce_init(ire, fp_mp, dlureq_mp) != 0) { 1925 /* some failure occurred. propagate error back */ 1926 return (B_FALSE); 1927 } 1928 } else { 1929 ASSERT(ipversion == IPV6_VERSION); 1930 /* 1931 * IPv6 initializes the ire_nce in ire_add_v6, 1932 * which expects to find the ire_nce to be null when 1933 * when it is called. 1934 */ 1935 if (dlureq_mp) 1936 freemsg(dlureq_mp); 1937 if (fp_mp) 1938 freemsg(fp_mp); 1939 } 1940 ire->ire_refcnt = 1; 1941 1942 #ifdef IRE_DEBUG 1943 bzero(ire->ire_trace, sizeof (th_trace_t *) * IP_TR_HASH_MAX); 1944 #endif 1945 1946 return (B_TRUE); 1947 } 1948 1949 /* 1950 * This routine is called repeatedly by ipif_up to create broadcast IREs. 1951 * It is passed a pointer to a slot in an IRE pointer array into which to 1952 * place the pointer to the new IRE, if indeed we create one. If the 1953 * IRE corresponding to the address passed in would be a duplicate of an 1954 * existing one, we don't create the new one. 
irep is incremented before 1955 * return only if we do create a new IRE. (Always called as writer.) 1956 * 1957 * Note that with the "match_flags" parameter, we can match on either 1958 * a particular logical interface (MATCH_IRE_IPIF) or for all logical 1959 * interfaces for a given physical interface (MATCH_IRE_ILL). Currently, 1960 * we only create broadcast ire's on a per physical interface basis. If 1961 * someone is going to be mucking with logical interfaces, it is important 1962 * to call "ipif_check_bcast_ires()" to make sure that any change to a 1963 * logical interface will not cause critical broadcast IRE's to be deleted. 1964 */ 1965 ire_t ** 1966 ire_check_and_create_bcast(ipif_t *ipif, ipaddr_t addr, ire_t **irep, 1967 int match_flags) 1968 { 1969 ire_t *ire; 1970 uint64_t check_flags = IPIF_DEPRECATED | IPIF_NOLOCAL | IPIF_ANYCAST; 1971 1972 /* 1973 * No broadcast IREs for the LOOPBACK interface 1974 * or others such as point to point and IPIF_NOXMIT. 1975 */ 1976 if (!(ipif->ipif_flags & IPIF_BROADCAST) || 1977 (ipif->ipif_flags & IPIF_NOXMIT)) 1978 return (irep); 1979 1980 /* If this would be a duplicate, don't bother. */ 1981 if ((ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ipif, 1982 ipif->ipif_zoneid, NULL, match_flags)) != NULL) { 1983 /* 1984 * We look for non-deprecated (and non-anycast, non-nolocal) 1985 * ipifs as the best choice. ipifs with check_flags matching 1986 * (deprecated, etc) are used only if non-deprecated ipifs 1987 * are not available. if the existing ire's ipif is deprecated 1988 * and the new ipif is non-deprecated, switch to the new ipif 1989 */ 1990 if ((!(ire->ire_ipif->ipif_flags & check_flags)) || 1991 (ipif->ipif_flags & check_flags)) { 1992 ire_refrele(ire); 1993 return (irep); 1994 } 1995 /* 1996 * Bcast ires exist in pairs. Both have to be deleted, 1997 * Since we are exclusive we can make the above assertion. 1998 * The 1st has to be refrele'd since it was ctable_lookup'd. 
1999 */ 2000 ASSERT(IAM_WRITER_IPIF(ipif)); 2001 ASSERT(ire->ire_next->ire_addr == ire->ire_addr); 2002 ire_delete(ire->ire_next); 2003 ire_delete(ire); 2004 ire_refrele(ire); 2005 } 2006 2007 irep = ire_create_bcast(ipif, addr, irep); 2008 2009 return (irep); 2010 } 2011 2012 uint_t ip_loopback_mtu = IP_LOOPBACK_MTU; 2013 2014 /* 2015 * This routine is called from ipif_check_bcast_ires and ire_check_bcast. 2016 * It leaves all the verifying and deleting to those routines. So it always 2017 * creates 2 bcast ires and chains them into the ire array passed in. 2018 */ 2019 ire_t ** 2020 ire_create_bcast(ipif_t *ipif, ipaddr_t addr, ire_t **irep) 2021 { 2022 *irep++ = ire_create( 2023 (uchar_t *)&addr, /* dest addr */ 2024 (uchar_t *)&ip_g_all_ones, /* mask */ 2025 (uchar_t *)&ipif->ipif_src_addr, /* source addr */ 2026 NULL, /* no gateway */ 2027 NULL, /* no in_src_addr */ 2028 &ipif->ipif_mtu, /* max frag */ 2029 NULL, /* fast path header */ 2030 ipif->ipif_rq, /* recv-from queue */ 2031 ipif->ipif_wq, /* send-to queue */ 2032 IRE_BROADCAST, 2033 ipif->ipif_bcast_mp, /* xmit header */ 2034 ipif, 2035 NULL, 2036 0, 2037 0, 2038 0, 2039 0, 2040 &ire_uinfo_null, 2041 NULL, 2042 NULL); 2043 2044 *irep++ = ire_create( 2045 (uchar_t *)&addr, /* dest address */ 2046 (uchar_t *)&ip_g_all_ones, /* mask */ 2047 (uchar_t *)&ipif->ipif_src_addr, /* source address */ 2048 NULL, /* no gateway */ 2049 NULL, /* no in_src_addr */ 2050 &ip_loopback_mtu, /* max frag size */ 2051 NULL, /* Fast Path header */ 2052 ipif->ipif_rq, /* recv-from queue */ 2053 NULL, /* no send-to queue */ 2054 IRE_BROADCAST, /* Needed for fanout in wput */ 2055 NULL, 2056 ipif, 2057 NULL, 2058 0, 2059 0, 2060 0, 2061 0, 2062 &ire_uinfo_null, 2063 NULL, 2064 NULL); 2065 2066 return (irep); 2067 } 2068 2069 /* 2070 * ire_walk routine to delete or update any IRE_CACHE that might contain 2071 * stale information. 2072 * The flags state which entries to delete or update. 
2073 * Garbage collection is done separately using kmem alloc callbacks to 2074 * ip_trash_ire_reclaim. 2075 * Used for both IPv4 and IPv6. However, IPv6 only uses FLUSH_MTU_TIME 2076 * since other stale information is cleaned up using NUD. 2077 */ 2078 void 2079 ire_expire(ire_t *ire, char *arg) 2080 { 2081 int flush_flags = (int)(uintptr_t)arg; 2082 ill_t *stq_ill; 2083 2084 if ((flush_flags & FLUSH_REDIRECT_TIME) && 2085 ire->ire_type == IRE_HOST_REDIRECT) { 2086 /* Make sure we delete the corresponding IRE_CACHE */ 2087 ip1dbg(("ire_expire: all redirects\n")); 2088 ip_rts_rtmsg(RTM_DELETE, ire, 0); 2089 ire_delete(ire); 2090 atomic_dec_32(&ip_redirect_cnt); 2091 return; 2092 } 2093 if (ire->ire_type != IRE_CACHE) 2094 return; 2095 2096 if (flush_flags & FLUSH_ARP_TIME) { 2097 /* 2098 * Remove all IRE_CACHE. 2099 * Verify that create time is more than 2100 * ip_ire_arp_interval milliseconds ago. 2101 */ 2102 if (NCE_EXPIRED(ire->ire_nce)) { 2103 ire_delete(ire); 2104 return; 2105 } 2106 } 2107 2108 if (ip_path_mtu_discovery && (flush_flags & FLUSH_MTU_TIME) && 2109 (ire->ire_ipif != NULL)) { 2110 /* Increase pmtu if it is less than the interface mtu */ 2111 mutex_enter(&ire->ire_lock); 2112 /* 2113 * If the ipif is a vni (whose mtu is 0, since it's virtual) 2114 * get the mtu from the sending interfaces' ipif 2115 */ 2116 if (IS_VNI(ire->ire_ipif->ipif_ill)) { 2117 stq_ill = ire->ire_stq->q_ptr; 2118 ire->ire_max_frag = MIN(stq_ill->ill_ipif->ipif_mtu, 2119 IP_MAXPACKET); 2120 } else { 2121 ire->ire_max_frag = MIN(ire->ire_ipif->ipif_mtu, 2122 IP_MAXPACKET); 2123 } 2124 ire->ire_frag_flag |= IPH_DF; 2125 mutex_exit(&ire->ire_lock); 2126 } 2127 } 2128 2129 /* 2130 * Do fast path probing if necessary. 
*/
void
ire_fastpath(ire_t *ire)
{
	ill_t	*ill;
	int res;

	if (ire->ire_nce == NULL || ire->ire_nce->nce_fp_mp != NULL ||
	    ire->ire_nce->nce_state != ND_REACHABLE ||
	    ire->ire_nce->nce_res_mp == NULL) {

		/*
		 * Already contains fastpath info or
		 * doesn't have DL_UNITDATA_REQ header
		 * or is an incomplete ire in the ire table
		 * or is a loopback broadcast ire i.e. no stq.
		 */
		return;
	}
	/* ire_to_ill() yields the outgoing ill; NULL means nothing to probe */
	ill = ire_to_ill(ire);
	if (ill == NULL)
		return;
	/* Queue the ire on the ill's fastpath list before sending the probe */
	ire_fastpath_list_add(ill, ire);
	res = ill_fastpath_probe(ill, ire->ire_nce->nce_res_mp);
	/*
	 * EAGAIN is an indication of a transient error
	 * i.e. allocation failure etc. leave the ire in the list it will
	 * be updated when another probe happens for another ire if not
	 * it will be taken out of the list when the ire is deleted.
	 */
	if (res != 0 && res != EAGAIN)
		ire_fastpath_list_delete(ill, ire);
}

/*
 * Update all IRE's that are not in fastpath mode and
 * have an dlureq_mp that matches mp. mp->b_cont contains
 * the fastpath header.
 *
 * "arg" is the mblk (mp) described above.  The signature matches the
 * callback type taken by ire_fastpath_list_dispatch().
 *
 * Returns TRUE if entry should be dequeued, or FALSE otherwise.
 */
boolean_t
ire_fastpath_update(ire_t *ire, void *arg)
{
	mblk_t 	*mp, *fp_mp;
	uchar_t 	*up, *up2;
	ptrdiff_t	cmplen;
	nce_t		*arpce;

	ASSERT((ire->ire_type & (IRE_CACHE | IRE_BROADCAST |
	    IRE_MIPRTUN)) != 0);

	/*
	 * Already contains fastpath info or doesn't have
	 * DL_UNITDATA_REQ header or is an incomplete ire.
	 * This first look is made without holding nce_lock.
	 */
	if (ire->ire_nce == NULL || ire->ire_nce->nce_res_mp == NULL ||
	    ire->ire_nce->nce_fp_mp != NULL ||
	    ire->ire_nce->nce_state != ND_REACHABLE)
		return (B_TRUE);

	ip2dbg(("ire_fastpath_update: trying\n"));
	mp = arg;
	up = mp->b_rptr;
	cmplen = mp->b_wptr - up;
	/* Serialize multiple fast path updates */
	mutex_enter(&ire->ire_nce->nce_lock);
	up2 = ire->ire_nce->nce_res_mp->b_rptr;
	ASSERT(cmplen >= 0);
	/* Compare length and bytes of mp against this ire's nce_res_mp */
	if (ire->ire_nce->nce_res_mp->b_wptr - up2 != cmplen ||
	    bcmp(up, up2, cmplen) != 0) {
		mutex_exit(&ire->ire_nce->nce_lock);
		/*
		 * Don't take the ire off the fastpath list yet,
		 * since the response may come later.
		 */
		return (B_FALSE);
	}
	arpce = ire->ire_nce;
	/* Matched - install mp as the nce_fp_mp */
	ip1dbg(("ire_fastpath_update: match\n"));
	fp_mp = dupb(mp->b_cont);
	/* If dupb() fails nothing is installed, but B_TRUE is still returned */
	if (fp_mp) {
		/*
		 * We checked nce_fp_mp above. Check it again with the
		 * lock. Update fp_mp only if it has not been done
		 * already.
		 */
		if (arpce->nce_fp_mp == NULL) {
			/*
			 * ire_ll_hdr_length is just an optimization to
			 * store the length. It is used to return the
			 * fast path header length to the upper layers.
			 */
			arpce->nce_fp_mp = fp_mp;
			ire->ire_ll_hdr_length =
			    (uint_t)(fp_mp->b_wptr - fp_mp->b_rptr);
		} else {
			freeb(fp_mp);
		}
	}
	mutex_exit(&ire->ire_nce->nce_lock);
	return (B_TRUE);
}

/*
 * This function handles the DL_NOTE_FASTPATH_FLUSH notification from the
 * driver.
 *
 * IRE_CACHE entries are deleted.  IRE_MIPRTUN and IRE_BROADCAST entries
 * have their fastpath header freed and, when they have an stq, are probed
 * again.  "arg" is unused.
 */
/* ARGSUSED */
void
ire_fastpath_flush(ire_t *ire, void *arg)
{
	ill_t	*ill;
	int	res;

	/* No fastpath info? */
	if (ire->ire_nce == NULL ||
	    ire->ire_nce->nce_fp_mp == NULL || ire->ire_nce->nce_res_mp == NULL)
		return;

	/*
	 * Just remove the IRE if it is for non-broadcast dest. Then
	 * we will create another one which will have the correct
	 * fastpath info.
	 */
	switch (ire->ire_type) {
	case IRE_CACHE:
		ire_delete(ire);
		break;
	case IRE_MIPRTUN:
	case IRE_BROADCAST:
		/*
		 * We can't delete the ire since it is difficult to
		 * recreate these ire's without going through the
		 * ipif down/up dance. The nce_fp_mp is protected by the
		 * nce_lock in the case of IRE_MIPRTUN and IRE_BROADCAST.
		 * All access to ire->ire_nce->nce_fp_mp in the case of these
		 * 2 ire types is protected by nce_lock.
		 */
		mutex_enter(&ire->ire_nce->nce_lock);
		/* Re-test under the lock; the check above was unlocked */
		if (ire->ire_nce->nce_fp_mp != NULL) {
			freeb(ire->ire_nce->nce_fp_mp);
			ire->ire_nce->nce_fp_mp = NULL;
			mutex_exit(&ire->ire_nce->nce_lock);
			/*
			 * No fastpath probe if there is no stq
			 * i.e. the case of loopback broadcast ire.
			 */
			if (ire->ire_stq == NULL)
				break;
			ill = (ill_t *)((ire->ire_stq)->q_ptr);
			ire_fastpath_list_add(ill, ire);
			res = ill_fastpath_probe(ill, ire->ire_nce->nce_res_mp);
			/*
			 * EAGAIN is an indication of a transient error
			 * i.e. allocation failure etc. leave the ire in the
			 * list it will be updated when another probe happens
			 * for another ire if not it will be taken out of the
			 * list when the ire is deleted.
			 */
			if (res != 0 && res != EAGAIN)
				ire_fastpath_list_delete(ill, ire);
		} else {
			mutex_exit(&ire->ire_nce->nce_lock);
		}
		break;
	default:
		/* This should not happen! */
		ip0dbg(("ire_fastpath_flush: Wrong ire type %s\n",
		    ip_nv_lookup(ire_nv_tbl, (int)ire->ire_type)));
		break;
	}
}

/*
 * Drain the list of ire's waiting for fastpath response.
2308 */ 2309 void 2310 ire_fastpath_list_dispatch(ill_t *ill, boolean_t (*func)(ire_t *, void *), 2311 void *arg) 2312 { 2313 ire_t *next_ire; 2314 ire_t *current_ire; 2315 ire_t *first_ire; 2316 ire_t *prev_ire = NULL; 2317 2318 ASSERT(ill != NULL); 2319 2320 mutex_enter(&ill->ill_lock); 2321 first_ire = current_ire = (ire_t *)ill->ill_fastpath_list; 2322 while (current_ire != (ire_t *)&ill->ill_fastpath_list) { 2323 next_ire = current_ire->ire_fastpath; 2324 /* 2325 * Take it off the list if we're flushing, or if the callback 2326 * routine tells us to do so. Otherwise, leave the ire in the 2327 * fastpath list to handle any pending response from the lower 2328 * layer. We can't drain the list when the callback routine 2329 * comparison failed, because the response is asynchronous in 2330 * nature, and may not arrive in the same order as the list 2331 * insertion. 2332 */ 2333 if (func == NULL || func(current_ire, arg)) { 2334 current_ire->ire_fastpath = NULL; 2335 if (current_ire == first_ire) 2336 ill->ill_fastpath_list = first_ire = next_ire; 2337 else 2338 prev_ire->ire_fastpath = next_ire; 2339 } else { 2340 /* previous element that is still in the list */ 2341 prev_ire = current_ire; 2342 } 2343 current_ire = next_ire; 2344 } 2345 mutex_exit(&ill->ill_lock); 2346 } 2347 2348 /* 2349 * Add ire to the ire fastpath list. 2350 */ 2351 static void 2352 ire_fastpath_list_add(ill_t *ill, ire_t *ire) 2353 { 2354 ASSERT(ill != NULL); 2355 ASSERT(ire->ire_stq != NULL); 2356 2357 rw_enter(&ire->ire_bucket->irb_lock, RW_READER); 2358 mutex_enter(&ill->ill_lock); 2359 2360 /* 2361 * if ire has not been deleted and 2362 * is not already in the list add it. 
2363 */ 2364 if (((ire->ire_marks & IRE_MARK_CONDEMNED) == 0) && 2365 (ire->ire_fastpath == NULL)) { 2366 ire->ire_fastpath = (ire_t *)ill->ill_fastpath_list; 2367 ill->ill_fastpath_list = ire; 2368 } 2369 2370 mutex_exit(&ill->ill_lock); 2371 rw_exit(&ire->ire_bucket->irb_lock); 2372 } 2373 2374 /* 2375 * remove ire from the ire fastpath list. 2376 */ 2377 void 2378 ire_fastpath_list_delete(ill_t *ill, ire_t *ire) 2379 { 2380 ire_t *ire_ptr; 2381 2382 ASSERT(ire->ire_stq != NULL && ill != NULL); 2383 2384 mutex_enter(&ill->ill_lock); 2385 if (ire->ire_fastpath == NULL) 2386 goto done; 2387 2388 ASSERT(ill->ill_fastpath_list != &ill->ill_fastpath_list); 2389 2390 if (ill->ill_fastpath_list == ire) { 2391 ill->ill_fastpath_list = ire->ire_fastpath; 2392 } else { 2393 ire_ptr = ill->ill_fastpath_list; 2394 while (ire_ptr != (ire_t *)&ill->ill_fastpath_list) { 2395 if (ire_ptr->ire_fastpath == ire) { 2396 ire_ptr->ire_fastpath = ire->ire_fastpath; 2397 break; 2398 } 2399 ire_ptr = ire_ptr->ire_fastpath; 2400 } 2401 } 2402 ire->ire_fastpath = NULL; 2403 done: 2404 mutex_exit(&ill->ill_lock); 2405 } 2406 2407 /* 2408 * Return any local address. We use this to target ourselves 2409 * when the src address was specified as 'default'. 2410 * Preference for IRE_LOCAL entries. 
2411 */ 2412 ire_t * 2413 ire_lookup_local(zoneid_t zoneid) 2414 { 2415 ire_t *ire; 2416 irb_t *irb; 2417 ire_t *maybe = NULL; 2418 int i; 2419 2420 for (i = 0; i < ip_cache_table_size; i++) { 2421 irb = &ip_cache_table[i]; 2422 if (irb->irb_ire == NULL) 2423 continue; 2424 rw_enter(&irb->irb_lock, RW_READER); 2425 for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { 2426 if ((ire->ire_marks & IRE_MARK_CONDEMNED) || 2427 (ire->ire_zoneid != zoneid && 2428 ire->ire_zoneid != ALL_ZONES)) 2429 continue; 2430 switch (ire->ire_type) { 2431 case IRE_LOOPBACK: 2432 if (maybe == NULL) { 2433 IRE_REFHOLD(ire); 2434 maybe = ire; 2435 } 2436 break; 2437 case IRE_LOCAL: 2438 if (maybe != NULL) { 2439 ire_refrele(maybe); 2440 } 2441 IRE_REFHOLD(ire); 2442 rw_exit(&irb->irb_lock); 2443 return (ire); 2444 } 2445 } 2446 rw_exit(&irb->irb_lock); 2447 } 2448 return (maybe); 2449 } 2450 2451 /* 2452 * If the specified IRE is associated with a particular ILL, return 2453 * that ILL pointer (May be called as writer.). 2454 * 2455 * NOTE : This is not a generic function that can be used always. 2456 * This function always returns the ill of the outgoing packets 2457 * if this ire is used. 2458 */ 2459 ill_t * 2460 ire_to_ill(const ire_t *ire) 2461 { 2462 ill_t *ill = NULL; 2463 2464 /* 2465 * 1) For an IRE_CACHE, ire_ipif is the one where it obtained 2466 * the source address from. ire_stq is the one where the 2467 * packets will be sent out on. We return that here. 2468 * 2469 * 2) IRE_BROADCAST normally has a loopback and a non-loopback 2470 * copy and they always exist next to each other with loopback 2471 * copy being the first one. If we are called on the non-loopback 2472 * copy, return the one pointed by ire_stq. If it was called on 2473 * a loopback copy, we still return the one pointed by the next 2474 * ire's ire_stq pointer i.e the one pointed by the non-loopback 2475 * copy. 
We don't want use ire_ipif as it might represent the 2476 * source address (if we borrow source addresses for 2477 * IRE_BROADCASTS in the future). 2478 * However if an interface is currently coming up, the above 2479 * condition may not hold during that period since the ires 2480 * are added one at a time. Thus one of the pair could have been 2481 * added and the other not yet added. 2482 * 3) For all others return the ones pointed by ire_ipif->ipif_ill. 2483 */ 2484 2485 if (ire->ire_type == IRE_CACHE) { 2486 ill = (ill_t *)ire->ire_stq->q_ptr; 2487 } else if (ire->ire_type == IRE_BROADCAST) { 2488 if (ire->ire_stq != NULL) { 2489 ill = (ill_t *)ire->ire_stq->q_ptr; 2490 } else { 2491 ire_t *ire_next; 2492 2493 ire_next = ire->ire_next; 2494 if (ire_next != NULL && 2495 ire_next->ire_type == IRE_BROADCAST && 2496 ire_next->ire_addr == ire->ire_addr && 2497 ire_next->ire_ipif == ire->ire_ipif) { 2498 ill = (ill_t *)ire_next->ire_stq->q_ptr; 2499 } 2500 } 2501 } else if (ire->ire_ipif != NULL) { 2502 ill = ire->ire_ipif->ipif_ill; 2503 } 2504 return (ill); 2505 } 2506 2507 /* Arrange to call the specified function for every IRE in the world. */ 2508 void 2509 ire_walk(pfv_t func, void *arg) 2510 { 2511 ire_walk_ipvers(func, arg, 0, ALL_ZONES); 2512 } 2513 2514 void 2515 ire_walk_v4(pfv_t func, void *arg, zoneid_t zoneid) 2516 { 2517 ire_walk_ipvers(func, arg, IPV4_VERSION, zoneid); 2518 } 2519 2520 void 2521 ire_walk_v6(pfv_t func, void *arg, zoneid_t zoneid) 2522 { 2523 ire_walk_ipvers(func, arg, IPV6_VERSION, zoneid); 2524 } 2525 2526 /* 2527 * Walk a particular version. version == 0 means both v4 and v6. 
2528 */ 2529 static void 2530 ire_walk_ipvers(pfv_t func, void *arg, uchar_t vers, zoneid_t zoneid) 2531 { 2532 if (vers != IPV6_VERSION) { 2533 /* 2534 * ip_forwarding_table variable doesn't matter for IPv4 since 2535 * ire_walk_ill_tables directly calls with the ip_ftable global 2536 */ 2537 ire_walk_ill_tables(0, 0, func, arg, IP_MASK_TABLE_SIZE, 2538 0, NULL, 2539 ip_cache_table_size, ip_cache_table, NULL, zoneid); 2540 } 2541 if (vers != IPV4_VERSION) { 2542 ire_walk_ill_tables(0, 0, func, arg, IP6_MASK_TABLE_SIZE, 2543 ip6_ftable_hash_size, ip_forwarding_table_v6, 2544 ip6_cache_table_size, ip_cache_table_v6, NULL, zoneid); 2545 } 2546 } 2547 2548 /* 2549 * Arrange to call the specified 2550 * function for every IRE that matches the ill. 2551 */ 2552 void 2553 ire_walk_ill(uint_t match_flags, uint_t ire_type, pfv_t func, void *arg, 2554 ill_t *ill) 2555 { 2556 ire_walk_ill_ipvers(match_flags, ire_type, func, arg, 0, ill); 2557 } 2558 2559 void 2560 ire_walk_ill_v4(uint_t match_flags, uint_t ire_type, pfv_t func, void *arg, 2561 ill_t *ill) 2562 { 2563 ire_walk_ill_ipvers(match_flags, ire_type, func, arg, IPV4_VERSION, 2564 ill); 2565 } 2566 2567 void 2568 ire_walk_ill_v6(uint_t match_flags, uint_t ire_type, pfv_t func, void *arg, 2569 ill_t *ill) 2570 { 2571 ire_walk_ill_ipvers(match_flags, ire_type, func, arg, IPV6_VERSION, 2572 ill); 2573 } 2574 2575 /* 2576 * Walk a particular ill and version. version == 0 means both v4 and v6. 
*/
static void
ire_walk_ill_ipvers(uint_t match_flags, uint_t ire_type, pfv_t func,
    void *arg, uchar_t vers, ill_t *ill)
{
	/* IPv4: no forwarding table is passed (hash size 0, table NULL) */
	if (vers != IPV6_VERSION) {
		ire_walk_ill_tables(match_flags, ire_type, func, arg,
		    IP_MASK_TABLE_SIZE, 0,
		    NULL, ip_cache_table_size,
		    ip_cache_table, ill, ALL_ZONES);
	}
	if (vers != IPV4_VERSION) {
		ire_walk_ill_tables(match_flags, ire_type, func, arg,
		    IP6_MASK_TABLE_SIZE, ip6_ftable_hash_size,
		    ip_forwarding_table_v6, ip6_cache_table_size,
		    ip_cache_table_v6, ill, ALL_ZONES);
	}
}

/*
 * Decide whether "ire" should be handed to the walker callback.
 *
 * match_flags selects which of the MATCH_IRE_TYPE, MATCH_IRE_WQ,
 * MATCH_IRE_ILL and MATCH_IRE_ILL_GROUP tests are applied against
 * ire_type and ill.  When zoneid is not ALL_ZONES the ire must also be
 * relevant to that zone (see the rules in the body).
 *
 * Returns B_TRUE when every requested test passes, B_FALSE otherwise.
 * Callers must pass a non-zero match_flags or a specific zoneid (see
 * the ASSERT).
 */
boolean_t
ire_walk_ill_match(uint_t match_flags, uint_t ire_type, ire_t *ire,
    ill_t *ill, zoneid_t zoneid)
{
	ill_t *ire_stq_ill = NULL;
	ill_t *ire_ipif_ill = NULL;
	ill_group_t *ire_ill_group = NULL;

	ASSERT(match_flags != 0 || zoneid != ALL_ZONES);
	/*
	 * 1) MATCH_IRE_WQ : Used specifically to match on ire_stq.
	 *    The fast path update uses this to make sure it does not
	 *    update the fast path header of interface X with the fast
	 *    path updates it received on interface Y.  It is similar
	 *    in handling DL_NOTE_FASTPATH_FLUSH.
	 *
	 * 2) MATCH_IRE_ILL/MATCH_IRE_ILL_GROUP : We match both on ill
	 *    pointed by ire_stq and ire_ipif. Only in the case of
	 *    IRE_CACHEs can ire_stq and ire_ipif be pointing to
	 *    different ills. But we want to keep this function generic
	 *    enough for future use. So, we always try to match on both.
	 *    The only caller of this function ire_walk_ill_tables, will
	 *    call "func" after we return from this function. We expect
	 *    "func" to do the right filtering of ires in this case.
	 *
	 * NOTE : In the case of MATCH_IRE_ILL_GROUP, groups
	 * pointed by ire_stq and ire_ipif should always be the same.
	 * So, we just match on only one of them.
	 */
	if (match_flags & (MATCH_IRE_ILL|MATCH_IRE_ILL_GROUP)) {
		if (ire->ire_stq != NULL)
			ire_stq_ill = (ill_t *)ire->ire_stq->q_ptr;
		if (ire->ire_ipif != NULL)
			ire_ipif_ill = ire->ire_ipif->ipif_ill;
		/* Prefer the stq ill's group; fall back to the ipif ill's */
		if (ire_stq_ill != NULL)
			ire_ill_group = ire_stq_ill->ill_group;
		if ((ire_ill_group == NULL) && (ire_ipif_ill != NULL))
			ire_ill_group = ire_ipif_ill->ill_group;
	}

	if (zoneid != ALL_ZONES) {
		/*
		 * We're walking the IREs for a specific zone. The only relevant
		 * IREs are:
		 * - all IREs with a matching ire_zoneid
		 * - all IRE_OFFSUBNETs as they're shared across all zones
		 * - IRE_INTERFACE IREs for interfaces with a usable source addr
		 *   with a matching zone
		 * - IRE_DEFAULTs with a gateway reachable from the zone
		 * We should really match on IRE_OFFSUBNETs and IRE_DEFAULTs
		 * using the same rule; but the above rules are consistent with
		 * the behavior of ire_ftable_lookup[_v6]() so that all the
		 * routes that can be matched during lookup are also matched
		 * here.
		 */
		if (zoneid != ire->ire_zoneid && ire->ire_zoneid != ALL_ZONES) {
			/*
			 * Note, IRE_INTERFACE can have the stq as NULL. For
			 * example, if the default multicast route is tied to
			 * the loopback address.
			 */
			if ((ire->ire_type & IRE_INTERFACE) &&
			    (ire->ire_stq != NULL)) {
				ire_stq_ill = (ill_t *)ire->ire_stq->q_ptr;
				if (ire->ire_ipversion == IPV4_VERSION) {
					if (!ipif_usesrc_avail(ire_stq_ill,
					    zoneid))
						/* No usable src addr in zone */
						return (B_FALSE);
				} else if (ire_stq_ill->ill_usesrc_ifindex
				    != 0) {
					/*
					 * For IPv6 use ipif_select_source_v6()
					 * so the right scope selection is done
					 */
					ipif_t *src_ipif;
					src_ipif =
					    ipif_select_source_v6(ire_stq_ill,
					    &ire->ire_addr_v6, RESTRICT_TO_NONE,
					    IPV6_PREFER_SRC_DEFAULT,
					    zoneid);
					/* Only existence matters; drop the hold */
					if (src_ipif != NULL) {
						ipif_refrele(src_ipif);
					} else {
						return (B_FALSE);
					}
				} else {
					return (B_FALSE);
				}

			} else if (!(ire->ire_type & IRE_OFFSUBNET)) {
				return (B_FALSE);
			}
		}

		/*
		 * Match all default routes from the global zone, irrespective
		 * of reachability. For a non-global zone only match those
		 * where ire_gateway_addr has a IRE_INTERFACE for the zoneid.
		 */
		if (ire->ire_type == IRE_DEFAULT && zoneid != GLOBAL_ZONEID) {
			int ire_match_flags = 0;
			in6_addr_t gw_addr_v6;
			ire_t *rire;

			ire_match_flags |= MATCH_IRE_TYPE;
			if (ire->ire_ipif != NULL) {
				ire_match_flags |= MATCH_IRE_ILL_GROUP;
			}
			if (ire->ire_ipversion == IPV4_VERSION) {
				rire = ire_route_lookup(ire->ire_gateway_addr,
				    0, 0, IRE_INTERFACE, ire->ire_ipif, NULL,
				    zoneid, NULL, ire_match_flags);
			} else {
				ASSERT(ire->ire_ipversion == IPV6_VERSION);
				/* Copy the v6 gateway out under ire_lock */
				mutex_enter(&ire->ire_lock);
				gw_addr_v6 = ire->ire_gateway_addr_v6;
				mutex_exit(&ire->ire_lock);
				rire = ire_route_lookup_v6(&gw_addr_v6,
				    NULL, NULL, IRE_INTERFACE, ire->ire_ipif,
				    NULL, zoneid, NULL, ire_match_flags);
			}
			if (rire == NULL) {
				return (B_FALSE);
			}
			/* The lookup result is only an existence test */
			ire_refrele(rire);
		}
	}

	/* Each test is skipped (treated as passed) when its flag is clear */
	if (((!(match_flags & MATCH_IRE_TYPE)) ||
		(ire->ire_type & ire_type)) &&
	    ((!(match_flags & MATCH_IRE_WQ)) ||
		(ire->ire_stq == ill->ill_wq)) &&
	    ((!(match_flags & MATCH_IRE_ILL)) ||
		(ire_stq_ill == ill || ire_ipif_ill == ill)) &&
	    ((!(match_flags & MATCH_IRE_ILL_GROUP)) ||
		(ire_stq_ill == ill) || (ire_ipif_ill == ill) ||
		(ire_ill_group != NULL &&
		ire_ill_group == ill->ill_group))) {
		return (B_TRUE);
	}
	return (B_FALSE);
}

/*
 * Callback passed to ip_ftable->rnh_walktree() by ire_walk_ill_tables().
 *
 * "rn" is treated as a struct rt_entry and "arg" as a struct rtfuncarg.
 * Every ire on the entry's bucket is offered to ire_walk_ill_match()
 * when match flags or a specific zone were requested, and rt_func is
 * called for those that pass (or for all of them when no filtering was
 * requested).  Always returns 0.
 */
int
rtfunc(struct radix_node *rn, void *arg)
{
	struct rtfuncarg *rtf = arg;
	struct rt_entry *rt;
	irb_t *irb;
	ire_t *ire;
	boolean_t ret;

	rt = (struct rt_entry *)rn;
	ASSERT(rt != NULL);
	irb = &rt->rt_irb;
	for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
		if ((rtf->rt_match_flags != 0) ||
		    (rtf->rt_zoneid != ALL_ZONES)) {
			ret = ire_walk_ill_match(rtf->rt_match_flags,
			    rtf->rt_ire_type, ire,
			    rtf->rt_ill, rtf->rt_zoneid);
		} else
			ret = B_TRUE;
		if (ret)
			(*rtf->rt_func)(ire, rtf->rt_arg);
	}
	return (0);
}

/*
 * Walk the ftable and the ctable entries that match the ill.
 *
 * The forwarding table is walked unless MATCH_IRE_TYPE is set and
 * ire_type contains no IRE_FORWARDTABLE type; likewise for the cache
 * table and IRE_CACHETABLE.  When ipftbl is ip_forwarding_table_v6 the
 * ftbl_sz x htbl_sz bucket array is walked here; for any other ipftbl
 * value the walk goes through ip_ftable->rnh_walktree() with rtfunc().
 * ctbl_sz and ipctbl describe the cache table.
 */
void
ire_walk_ill_tables(uint_t match_flags, uint_t ire_type, pfv_t func,
    void *arg, size_t ftbl_sz, size_t htbl_sz, irb_t **ipftbl,
    size_t ctbl_sz, irb_t *ipctbl, ill_t *ill, zoneid_t zoneid)
{
	irb_t	*irb_ptr;
	irb_t	*irb;
	ire_t	*ire;
	int i, j;
	boolean_t ret;
	struct rtfuncarg rtfarg;

	/* Any ill/wq based match needs an ill; a type match needs a type */
	ASSERT((!(match_flags & (MATCH_IRE_WQ | MATCH_IRE_ILL |
	    MATCH_IRE_ILL_GROUP))) || (ill != NULL));
	ASSERT(!(match_flags & MATCH_IRE_TYPE) || (ire_type != 0));
	/*
	 * Optimize by not looking at the forwarding table if there
	 * is a MATCH_IRE_TYPE specified with no IRE_FORWARDTABLE
	 * specified in ire_type.
	 */
	if (!(match_flags & MATCH_IRE_TYPE) ||
	    ((ire_type & IRE_FORWARDTABLE) != 0)) {
		/* knobs such that routine is called only for v6 case */
		if (ipftbl == ip_forwarding_table_v6) {
			/* Indices are visited from ftbl_sz - 1 down to 0 */
			for (i = (ftbl_sz - 1);  i >= 0; i--) {
				if ((irb_ptr = ipftbl[i]) == NULL)
					continue;
				for (j = 0; j < htbl_sz; j++) {
					irb = &irb_ptr[j];
					if (irb->irb_ire == NULL)
						continue;

					/* Bucket held across the callbacks */
					IRB_REFHOLD(irb);
					for (ire = irb->irb_ire; ire != NULL;
						ire = ire->ire_next) {
						if (match_flags == 0 &&
						    zoneid == ALL_ZONES) {
							ret = B_TRUE;
						} else {
							ret =
							    ire_walk_ill_match(
							    match_flags,
							    ire_type, ire, ill,
							    zoneid);
						}
						if (ret)
							(*func)(ire, arg);
					}
					IRB_REFRELE(irb);
				}
			}
		} else {
			/* Package the walk parameters for rtfunc() */
			(void) memset(&rtfarg, 0, sizeof (rtfarg));
			rtfarg.rt_func = func;
			rtfarg.rt_arg = arg;
			if (match_flags != 0) {
				rtfarg.rt_match_flags = match_flags;
			}
			rtfarg.rt_ire_type = ire_type;
			rtfarg.rt_ill = ill;
			rtfarg.rt_zoneid = zoneid;
			(void) ip_ftable->rnh_walktree(ip_ftable, rtfunc,
			    &rtfarg);
		}
	}

	/*
	 * Optimize by not looking at the cache table if there
	 * is a MATCH_IRE_TYPE specified with no IRE_CACHETABLE
	 * specified in ire_type.
	 */
	if (!(match_flags & MATCH_IRE_TYPE) ||
	    ((ire_type & IRE_CACHETABLE) != 0)) {
		for (i = 0; i < ctbl_sz;  i++) {
			irb = &ipctbl[i];
			if (irb->irb_ire == NULL)
				continue;
			IRB_REFHOLD(irb);
			for (ire = irb->irb_ire; ire != NULL;
			    ire = ire->ire_next) {
				if (match_flags == 0 && zoneid == ALL_ZONES) {
					ret = B_TRUE;
				} else {
					ret = ire_walk_ill_match(
					    match_flags, ire_type,
					    ire, ill, zoneid);
				}
				if (ret)
					(*func)(ire, arg);
			}
			IRB_REFRELE(irb);
		}
	}
}

/*
 * This routine walks through the ill chain to find if there is any
 * ire linked to the ill's interface based forwarding table
 * The arg could be ill or mp. This routine is called when a ill goes
 * down/deleted or the 'ipv4_ire_srcif_status' report is printed.
 *
 * func(ire, arg) is called for every ire found in the ill_srcif_table
 * of each IPv4 ill that has a non-zero ill_srcif_refcnt and passes
 * ILL_CAN_LOOKUP().
 */
void
ire_walk_srcif_table_v4(pfv_t func, void *arg)
{
	irb_t   *irb;
	ire_t   *ire;
	ill_t	*ill, *next_ill;
	int	i;
	int	total_count;
	ill_walk_context_t ctx;

	/*
	 * Take care of ire's in other ill's per-interface forwarding
	 * table. Check if any ire in any of the ill's ill_srcif_table
	 * is pointing to this ill.
	 */
	mutex_enter(&ire_srcif_table_lock);
	/* Nothing to walk when no interface based ires exist at all */
	if (ire_srcif_table_count == 0) {
		mutex_exit(&ire_srcif_table_lock);
		return;
	}
	mutex_exit(&ire_srcif_table_lock);

#ifdef DEBUG
	/* Keep accounting of all interface based table ires */
	total_count = 0;
	rw_enter(&ill_g_lock, RW_READER);
	ill = ILL_START_WALK_V4(&ctx);
	while (ill != NULL) {
		mutex_enter(&ill->ill_lock);
		total_count += ill->ill_srcif_refcnt;
		next_ill = ill_next(&ctx, ill);
		mutex_exit(&ill->ill_lock);
		ill = next_ill;
	}
	rw_exit(&ill_g_lock);

	/* Hold lock here to make sure ire_srcif_table_count is stable */
	mutex_enter(&ire_srcif_table_lock);
	i = ire_srcif_table_count;
	mutex_exit(&ire_srcif_table_lock);
	ip1dbg(("ire_walk_srcif_v4: ire_srcif_table_count %d "
	    "total ill_srcif_refcnt %d\n", i, total_count));
#endif
	rw_enter(&ill_g_lock, RW_READER);
	ill = ILL_START_WALK_V4(&ctx);
	while (ill != NULL) {
		mutex_enter(&ill->ill_lock);
		if ((ill->ill_srcif_refcnt == 0) || !ILL_CAN_LOOKUP(ill)) {
			next_ill = ill_next(&ctx, ill);
			mutex_exit(&ill->ill_lock);
			ill = next_ill;
			continue;
		}
		/*
		 * Hold the ill, then drop both ill_lock and ill_g_lock
		 * before running the callbacks on its table.
		 */
		ill_refhold_locked(ill);
		mutex_exit(&ill->ill_lock);
		rw_exit(&ill_g_lock);
		if (ill->ill_srcif_table != NULL) {
			for (i = 0; i < IP_SRCIF_TABLE_SIZE; i++) {
				irb = &(ill->ill_srcif_table[i]);
				if (irb->irb_ire == NULL)
					continue;
				IRB_REFHOLD(irb);
				for (ire = irb->irb_ire; ire != NULL;
				    ire = ire->ire_next) {
					(*func)(ire, arg);
				}
				IRB_REFRELE(irb);
			}
		}
		/* Re-take ill_g_lock before stepping to the next ill */
		rw_enter(&ill_g_lock, RW_READER);
		next_ill = ill_next(&ctx, ill);
		ill_refrele(ill);
		ill = next_ill;
	}
	rw_exit(&ill_g_lock);
}

/*
 * This function takes a mask and returns
 * number of bits set in the mask. If no
 * bit is set it returns 0.
 * Assumes a contiguous mask.
2952 */ 2953 int 2954 ip_mask_to_plen(ipaddr_t mask) 2955 { 2956 return (mask == 0 ? 0 : IP_ABITS - (ffs(ntohl(mask)) -1)); 2957 } 2958 2959 /* 2960 * Convert length for a mask to the mask. 2961 */ 2962 ipaddr_t 2963 ip_plen_to_mask(uint_t masklen) 2964 { 2965 return (htonl(IP_HOST_MASK << (IP_ABITS - masklen))); 2966 } 2967 2968 void 2969 ire_atomic_end(irb_t *irb_ptr, ire_t *ire) 2970 { 2971 ill_t *ill_list[NUM_ILLS]; 2972 2973 ill_list[0] = ire->ire_stq != NULL ? ire->ire_stq->q_ptr : NULL; 2974 ill_list[1] = ire->ire_ipif != NULL ? ire->ire_ipif->ipif_ill : NULL; 2975 ill_list[2] = ire->ire_in_ill; 2976 ill_unlock_ills(ill_list, NUM_ILLS); 2977 rw_exit(&irb_ptr->irb_lock); 2978 rw_exit(&ill_g_usesrc_lock); 2979 } 2980 2981 /* 2982 * ire_add_v[46] atomically make sure that the ipif or ill associated 2983 * with the new ire being added is stable and not IPIF_CHANGING or ILL_CHANGING 2984 * before adding the ire to the table. This ensures that we don't create 2985 * new IRE_CACHEs with stale values for parameters that are passed to 2986 * ire_create such as ire_max_frag. Note that ire_create() is passed a pointer 2987 * to the ipif_mtu, and not the value. The actual value is derived from the 2988 * parent ire or ipif under the bucket lock. 2989 */ 2990 int 2991 ire_atomic_start(irb_t *irb_ptr, ire_t *ire, queue_t *q, mblk_t *mp, 2992 ipsq_func_t func) 2993 { 2994 ill_t *stq_ill; 2995 ill_t *ipif_ill; 2996 ill_t *in_ill; 2997 ill_t *ill_list[NUM_ILLS]; 2998 int cnt = NUM_ILLS; 2999 int error = 0; 3000 ill_t *ill = NULL; 3001 3002 ill_list[0] = stq_ill = ire->ire_stq != 3003 NULL ? ire->ire_stq->q_ptr : NULL; 3004 ill_list[1] = ipif_ill = ire->ire_ipif != 3005 NULL ? 
ire->ire_ipif->ipif_ill : NULL; 3006 ill_list[2] = in_ill = ire->ire_in_ill; 3007 3008 ASSERT((q != NULL && mp != NULL && func != NULL) || 3009 (q == NULL && mp == NULL && func == NULL)); 3010 rw_enter(&ill_g_usesrc_lock, RW_READER); 3011 GRAB_CONN_LOCK(q); 3012 rw_enter(&irb_ptr->irb_lock, RW_WRITER); 3013 ill_lock_ills(ill_list, cnt); 3014 3015 /* 3016 * While the IRE is in the process of being added, a user may have 3017 * invoked the ifconfig usesrc option on the stq_ill to make it a 3018 * usesrc client ILL. Check for this possibility here, if it is true 3019 * then we fail adding the IRE_CACHE. Another check is to make sure 3020 * that an ipif_ill of an IRE_CACHE being added is not part of a usesrc 3021 * group. The ill_g_usesrc_lock is released in ire_atomic_end 3022 */ 3023 if ((ire->ire_type & IRE_CACHE) && 3024 (ire->ire_marks & IRE_MARK_USESRC_CHECK)) { 3025 if (stq_ill->ill_usesrc_ifindex != 0) { 3026 ASSERT(stq_ill->ill_usesrc_grp_next != NULL); 3027 if ((ipif_ill->ill_phyint->phyint_ifindex != 3028 stq_ill->ill_usesrc_ifindex) || 3029 (ipif_ill->ill_usesrc_grp_next == NULL) || 3030 (ipif_ill->ill_usesrc_ifindex != 0)) { 3031 error = EINVAL; 3032 goto done; 3033 } 3034 } else if (ipif_ill->ill_usesrc_grp_next != NULL) { 3035 error = EINVAL; 3036 goto done; 3037 } 3038 } 3039 3040 /* 3041 * IPMP flag settings happen without taking the exclusive route 3042 * in ip_sioctl_flags. So we need to make an atomic check here 3043 * for FAILED/OFFLINE/INACTIVE flags or if it has hit the 3044 * FAILBACK=no case. 
3045 */ 3046 if ((stq_ill != NULL) && !IAM_WRITER_ILL(stq_ill)) { 3047 if (stq_ill->ill_state_flags & ILL_CHANGING) { 3048 ill = stq_ill; 3049 error = EAGAIN; 3050 } else if ((stq_ill->ill_phyint->phyint_flags & PHYI_OFFLINE) || 3051 (ill_is_probeonly(stq_ill) && 3052 !(ire->ire_marks & IRE_MARK_HIDDEN))) { 3053 error = EINVAL; 3054 } 3055 goto done; 3056 } 3057 3058 /* 3059 * We don't check for OFFLINE/FAILED in this case because 3060 * the source address selection logic (ipif_select_source) 3061 * may still select a source address from such an ill. The 3062 * assumption is that these addresses will be moved by in.mpathd 3063 * soon. (i.e. this is a race). However link local addresses 3064 * will not move and hence ipif_select_source_v6 tries to avoid 3065 * FAILED ills. Please see ipif_select_source_v6 for more info 3066 */ 3067 if ((ipif_ill != NULL) && !IAM_WRITER_ILL(ipif_ill) && 3068 (ipif_ill->ill_state_flags & ILL_CHANGING)) { 3069 ill = ipif_ill; 3070 error = EAGAIN; 3071 goto done; 3072 } 3073 3074 if ((in_ill != NULL) && !IAM_WRITER_ILL(in_ill) && 3075 (in_ill->ill_state_flags & ILL_CHANGING)) { 3076 ill = in_ill; 3077 error = EAGAIN; 3078 goto done; 3079 } 3080 3081 if ((ire->ire_ipif != NULL) && !IAM_WRITER_IPIF(ire->ire_ipif) && 3082 (ire->ire_ipif->ipif_state_flags & IPIF_CHANGING)) { 3083 ill = ire->ire_ipif->ipif_ill; 3084 ASSERT(ill != NULL); 3085 error = EAGAIN; 3086 goto done; 3087 } 3088 3089 done: 3090 if (error == EAGAIN && ILL_CAN_WAIT(ill, q)) { 3091 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; 3092 mutex_enter(&ipsq->ipsq_lock); 3093 ire_atomic_end(irb_ptr, ire); 3094 ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); 3095 mutex_exit(&ipsq->ipsq_lock); 3096 error = EINPROGRESS; 3097 } else if (error != 0) { 3098 ire_atomic_end(irb_ptr, ire); 3099 } 3100 3101 RELEASE_CONN_LOCK(q); 3102 return (error); 3103 } 3104 3105 /* 3106 * Add a fully initialized IRE to an appropriate table based on 3107 * ire_type. 
3108 * 3109 * allow_unresolved == B_FALSE indicates a legacy code-path call 3110 * that has prohibited the addition of incomplete ire's. If this 3111 * parameter is set, and we find an nce that is in a state other 3112 * than ND_REACHABLE, we fail the add. Note that nce_state could be 3113 * something other than ND_REACHABLE if nce_reinit has just 3114 * kicked in and reset the nce. 3115 */ 3116 int 3117 ire_add(ire_t **irep, queue_t *q, mblk_t *mp, ipsq_func_t func, 3118 boolean_t allow_unresolved) 3119 { 3120 ire_t *ire1; 3121 ill_t *stq_ill = NULL; 3122 ill_t *ill; 3123 ipif_t *ipif = NULL; 3124 ill_walk_context_t ctx; 3125 ire_t *ire = *irep; 3126 int error; 3127 boolean_t ire_is_mblk = B_FALSE; 3128 tsol_gcgrp_t *gcgrp = NULL; 3129 tsol_gcgrp_addr_t ga; 3130 3131 ASSERT(ire->ire_type != IRE_MIPRTUN); 3132 3133 /* get ready for the day when original ire is not created as mblk */ 3134 if (ire->ire_mp != NULL) { 3135 ire_is_mblk = B_TRUE; 3136 /* Copy the ire to a kmem_alloc'ed area */ 3137 ire1 = kmem_cache_alloc(ire_cache, KM_NOSLEEP); 3138 if (ire1 == NULL) { 3139 ip1dbg(("ire_add: alloc failed\n")); 3140 ire_delete(ire); 3141 *irep = NULL; 3142 return (ENOMEM); 3143 } 3144 ire->ire_marks &= ~IRE_MARK_UNCACHED; 3145 *ire1 = *ire; 3146 ire1->ire_mp = NULL; 3147 ire1->ire_stq_ifindex = 0; 3148 freeb(ire->ire_mp); 3149 ire = ire1; 3150 } 3151 if (ire->ire_stq != NULL) 3152 stq_ill = (ill_t *)ire->ire_stq->q_ptr; 3153 3154 if (ire->ire_type == IRE_CACHE) { 3155 /* 3156 * If this interface is FAILED, or INACTIVE or has hit 3157 * the FAILBACK=no case, we create IRE_CACHES marked 3158 * HIDDEN for some special cases e.g. bind to 3159 * IPIF_NOFAILOVER address etc. So, if this interface 3160 * is FAILED/INACTIVE/hit FAILBACK=no case, and we are 3161 * not creating hidden ires, we should not allow that. 3162 * This happens because the state of the interface 3163 * changed while we were waiting in ARP. 
If this is the 3164 * daemon sending probes, the next probe will create 3165 * HIDDEN ires and we will create an ire then. This 3166 * cannot happen with NDP currently because IRE is 3167 * never queued in NDP. But it can happen in the 3168 * future when we have external resolvers with IPv6. 3169 * If the interface gets marked with OFFLINE while we 3170 * are waiting in ARP, don't add the ire. 3171 */ 3172 if ((stq_ill->ill_phyint->phyint_flags & PHYI_OFFLINE) || 3173 (ill_is_probeonly(stq_ill) && 3174 !(ire->ire_marks & IRE_MARK_HIDDEN))) { 3175 /* 3176 * We don't know whether it is a valid ipif or not. 3177 * unless we do the check below. So, set it to NULL. 3178 */ 3179 ire->ire_ipif = NULL; 3180 ire_delete(ire); 3181 *irep = NULL; 3182 return (EINVAL); 3183 } 3184 } 3185 3186 if (stq_ill != NULL && ire->ire_type == IRE_CACHE && 3187 stq_ill->ill_net_type == IRE_IF_RESOLVER) { 3188 rw_enter(&ill_g_lock, RW_READER); 3189 ill = ILL_START_WALK_ALL(&ctx); 3190 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 3191 mutex_enter(&ill->ill_lock); 3192 if (ill->ill_state_flags & ILL_CONDEMNED) { 3193 mutex_exit(&ill->ill_lock); 3194 continue; 3195 } 3196 /* 3197 * We need to make sure that the ipif is a valid one 3198 * before adding the IRE_CACHE. This happens only 3199 * with IRE_CACHE when there is an external resolver. 3200 * 3201 * We can unplumb a logical interface while the 3202 * packet is waiting in ARP with the IRE. Then, 3203 * later on when we feed the IRE back, the ipif 3204 * has to be re-checked. This can't happen with 3205 * NDP currently, as we never queue the IRE with 3206 * the packet. We always try to recreate the IRE 3207 * when the resolution is completed. But, we do 3208 * it for IPv6 also here so that in future if 3209 * we have external resolvers, it will work without 3210 * any change. 
3211 */ 3212 ipif = ipif_lookup_seqid(ill, ire->ire_ipif_seqid); 3213 if (ipif != NULL) { 3214 ipif_refhold_locked(ipif); 3215 mutex_exit(&ill->ill_lock); 3216 break; 3217 } 3218 mutex_exit(&ill->ill_lock); 3219 } 3220 rw_exit(&ill_g_lock); 3221 if (ipif == NULL || 3222 (ipif->ipif_isv6 && 3223 !IN6_ARE_ADDR_EQUAL(&ire->ire_src_addr_v6, 3224 &ipif->ipif_v6src_addr)) || 3225 (!ipif->ipif_isv6 && 3226 ire->ire_src_addr != ipif->ipif_src_addr) || 3227 ire->ire_zoneid != ipif->ipif_zoneid) { 3228 3229 if (ipif != NULL) 3230 ipif_refrele(ipif); 3231 ire->ire_ipif = NULL; 3232 ire_delete(ire); 3233 *irep = NULL; 3234 return (EINVAL); 3235 } 3236 3237 3238 ASSERT(ill != NULL); 3239 /* 3240 * If this group was dismantled while this packets was 3241 * queued in ARP, don't add it here. 3242 */ 3243 if (ire->ire_ipif->ipif_ill->ill_group != ill->ill_group) { 3244 /* We don't want ire_inactive bump stats for this */ 3245 ipif_refrele(ipif); 3246 ire->ire_ipif = NULL; 3247 ire_delete(ire); 3248 *irep = NULL; 3249 return (EINVAL); 3250 } 3251 3252 /* 3253 * Since we didn't attach label security attributes to the 3254 * ire for the resolver case, we need to add it now. (only 3255 * for v4 resolver and v6 xresolv case). 3256 */ 3257 if (is_system_labeled() && ire_is_mblk) { 3258 if (ire->ire_ipversion == IPV4_VERSION) { 3259 ga.ga_af = AF_INET; 3260 IN6_IPADDR_TO_V4MAPPED(ire->ire_gateway_addr != 3261 INADDR_ANY ? ire->ire_gateway_addr : 3262 ire->ire_addr, &ga.ga_addr); 3263 } else { 3264 ga.ga_af = AF_INET6; 3265 ga.ga_addr = IN6_IS_ADDR_UNSPECIFIED( 3266 &ire->ire_gateway_addr_v6) ? 
3267 ire->ire_addr_v6 : 3268 ire->ire_gateway_addr_v6; 3269 } 3270 gcgrp = gcgrp_lookup(&ga, B_FALSE); 3271 error = tsol_ire_init_gwattr(ire, ire->ire_ipversion, 3272 NULL, gcgrp); 3273 if (error != 0) { 3274 if (gcgrp != NULL) { 3275 GCGRP_REFRELE(gcgrp); 3276 gcgrp = NULL; 3277 } 3278 ipif_refrele(ipif); 3279 ire->ire_ipif = NULL; 3280 ire_delete(ire); 3281 *irep = NULL; 3282 return (error); 3283 } 3284 } 3285 } 3286 3287 /* 3288 * In case ire was changed 3289 */ 3290 *irep = ire; 3291 if (ire->ire_ipversion == IPV6_VERSION) { 3292 error = ire_add_v6(irep, q, mp, func); 3293 } else { 3294 if (ire->ire_in_ill == NULL) 3295 error = ire_add_v4(irep, q, mp, func, allow_unresolved); 3296 else 3297 error = ire_add_srcif_v4(irep, q, mp, func); 3298 } 3299 if (ipif != NULL) 3300 ipif_refrele(ipif); 3301 return (error); 3302 } 3303 3304 /* 3305 * Add an initialized IRE to an appropriate table based on ire_type. 3306 * 3307 * The forward table contains IRE_PREFIX/IRE_HOST/IRE_HOST_REDIRECT 3308 * IRE_IF_RESOLVER/IRE_IF_NORESOLVER and IRE_DEFAULT. 3309 * 3310 * The cache table contains IRE_BROADCAST/IRE_LOCAL/IRE_LOOPBACK 3311 * and IRE_CACHE. 3312 * 3313 * NOTE : This function is called as writer though not required 3314 * by this function. 
 *
 * On success 0 is returned and *ire_p points at a held ire that the
 * caller must release: either the newly inserted ire or, if an equivalent
 * ire was already present, that existing ire (the new one is deleted).
 * On failure the ire is deleted, *ire_p is set to NULL and an errno is
 * returned (EINVAL, or whatever ire_atomic_start() returned, which
 * includes EINPROGRESS when the request was queued).
 *
 * allow_unresolved only matters for IRE_CACHEs: when it is B_FALSE the
 * add fails if the matching nce is not ND_REACHABLE.
 */
static int
ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func,
    boolean_t allow_unresolved)
{
	ire_t	*ire1;
	irb_t	*irb_ptr;
	ire_t	**irep;
	int	flags;
	ire_t	*pire = NULL;
	ill_t	*stq_ill;
	ire_t	*ire = *ire_p;
	int	error;
	boolean_t need_refrele = B_FALSE;
	nce_t	*nce;

	if (ire->ire_ipif != NULL)
		ASSERT(!MUTEX_HELD(&ire->ire_ipif->ipif_ill->ill_lock));
	if (ire->ire_stq != NULL)
		ASSERT(!MUTEX_HELD(
		    &((ill_t *)(ire->ire_stq->q_ptr))->ill_lock));
	ASSERT(ire->ire_ipversion == IPV4_VERSION);
	ASSERT(ire->ire_mp == NULL); /* Calls should go through ire_add */
	ASSERT(ire->ire_in_ill == NULL); /* No srcif entries */

	/*
	 * Normalize the mask, mask length and source address according to
	 * the ire type.  Unknown types are rejected.
	 */
	switch (ire->ire_type) {
	case IRE_HOST:
		ire->ire_mask = IP_HOST_MASK;
		ire->ire_masklen = IP_ABITS;
		if ((ire->ire_flags & RTF_SETSRC) == 0)
			ire->ire_src_addr = 0;
		break;
	case IRE_HOST_REDIRECT:
		ire->ire_mask = IP_HOST_MASK;
		ire->ire_masklen = IP_ABITS;
		ire->ire_src_addr = 0;
		break;
	case IRE_CACHE:
	case IRE_BROADCAST:
	case IRE_LOCAL:
	case IRE_LOOPBACK:
		ire->ire_mask = IP_HOST_MASK;
		ire->ire_masklen = IP_ABITS;
		break;
	case IRE_PREFIX:
		if ((ire->ire_flags & RTF_SETSRC) == 0)
			ire->ire_src_addr = 0;
		break;
	case IRE_DEFAULT:
		if ((ire->ire_flags & RTF_SETSRC) == 0)
			ire->ire_src_addr = 0;
		break;
	case IRE_IF_RESOLVER:
	case IRE_IF_NORESOLVER:
		break;
	default:
		ip0dbg(("ire_add_v4: ire %p has unrecognized IRE type (%d)\n",
		    (void *)ire, ire->ire_type));
		ire_delete(ire);
		*ire_p = NULL;
		return (EINVAL);
	}

	/* Make sure the address is properly masked. */
	ire->ire_addr &= ire->ire_mask;

	/*
	 * ip_newroute/ip_newroute_multi are unable to prevent the deletion
	 * of the interface route while adding an IRE_CACHE for an on-link
	 * destination in the IRE_IF_RESOLVER case, since the ire has to
	 * go to ARP and return. We can't do a REFHOLD on the
	 * associated interface ire for fear of ARP freeing the message.
	 * Here we look up the interface ire in the forwarding table and
	 * make sure that the interface route has not been deleted.
	 */
	if (ire->ire_type == IRE_CACHE && ire->ire_gateway_addr == 0 &&
	    ((ill_t *)ire->ire_stq->q_ptr)->ill_net_type == IRE_IF_RESOLVER) {

		ASSERT(ire->ire_max_fragp == NULL);
		if (CLASSD(ire->ire_addr) && !(ire->ire_flags & RTF_SETSRC)) {
			/*
			 * The ihandle that we used in ip_newroute_multi
			 * comes from the interface route corresponding
			 * to ire_ipif. Lookup here to see if it exists
			 * still.
			 * If the ire has a source address assigned using
			 * RTF_SETSRC, ire_ipif is the logical interface holding
			 * this source address, so we can't use it to check for
			 * the existence of the interface route. Instead we rely
			 * on the brute force ihandle search in
			 * ire_ihandle_lookup_onlink() below.
			 */
			pire = ipif_to_ire(ire->ire_ipif);
			if (pire == NULL) {
				ire_delete(ire);
				*ire_p = NULL;
				return (EINVAL);
			} else if (pire->ire_ihandle != ire->ire_ihandle) {
				ire_refrele(pire);
				ire_delete(ire);
				*ire_p = NULL;
				return (EINVAL);
			}
		} else {
			pire = ire_ihandle_lookup_onlink(ire);
			if (pire == NULL) {
				ire_delete(ire);
				*ire_p = NULL;
				return (EINVAL);
			}
		}
		/* Prevent pire from getting deleted */
		IRB_REFHOLD(pire->ire_bucket);
		/* Has it been removed already ? */
		if (pire->ire_marks & IRE_MARK_CONDEMNED) {
			IRB_REFRELE(pire->ire_bucket);
			ire_refrele(pire);
			ire_delete(ire);
			*ire_p = NULL;
			return (EINVAL);
		}
	} else {
		ASSERT(ire->ire_max_fragp != NULL);
	}
	/* Match flags used by the duplicate check further down. */
	flags = (MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW);

	if (ire->ire_ipif != NULL) {
		/*
		 * We use MATCH_IRE_IPIF while adding IRE_CACHES only
		 * for historic reasons and to maintain symmetry with
		 * IPv6 code path. Historically this was used by
		 * multicast code to create multiple IRE_CACHES on
		 * a single ill with different ipifs. This was used
		 * so that multicast packets leaving the node had the
		 * right source address. This is no longer needed as
		 * ip_wput initializes the address correctly.
		 */
		flags |= MATCH_IRE_IPIF;
		/*
		 * If we are creating hidden ires, make sure we search on
		 * this ill (MATCH_IRE_ILL) and a hidden ire,
		 * while we are searching for duplicates below. Otherwise we
		 * could potentially find an IRE on some other interface
		 * and it may not be a IRE marked with IRE_MARK_HIDDEN. We
		 * shouldn't do this as this will lead to an infinite loop
		 * (if we get to ip_wput again) eventually we need an hidden
		 * ire for this packet to go out. MATCH_IRE_ILL is explicitly
		 * done below.
		 */
		if (ire->ire_type == IRE_CACHE &&
		    (ire->ire_marks & IRE_MARK_HIDDEN))
			flags |= (MATCH_IRE_MARK_HIDDEN);
	}
	/*
	 * Pick the bucket: types outside IRE_CACHETABLE get their bucket
	 * from ire_get_bucket() (released again via IRB_REFRELE before
	 * returning, see need_refrele); all others hash into ip_cache_table.
	 */
	if ((ire->ire_type & IRE_CACHETABLE) == 0) {
		irb_ptr = ire_get_bucket(ire);
		need_refrele = B_TRUE;
		if (irb_ptr == NULL) {
			/*
			 * This assumes that the ire has not added
			 * a reference to the ipif.
			 */
			ire->ire_ipif = NULL;
			ire_delete(ire);
			if (pire != NULL) {
				IRB_REFRELE(pire->ire_bucket);
				ire_refrele(pire);
			}
			*ire_p = NULL;
			return (EINVAL);
		}
	} else {
		irb_ptr = &(ip_cache_table[IRE_ADDR_HASH(ire->ire_addr,
		    ip_cache_table_size)]);
	}

	/*
	 * Start the atomic add of the ire. Grab the ill locks,
	 * ill_g_usesrc_lock and the bucket lock. Check for condemned
	 *
	 * If ipif or ill is changing ire_atomic_start() may queue the
	 * request and return EINPROGRESS.
	 * To avoid lock order problems, get the ndp4.ndp_g_lock.
	 */
	mutex_enter(&ndp4.ndp_g_lock);
	error = ire_atomic_start(irb_ptr, ire, q, mp, func);
	if (error != 0) {
		mutex_exit(&ndp4.ndp_g_lock);
		/*
		 * We don't know whether it is a valid ipif or not.
		 * So, set it to NULL. This assumes that the ire has not added
		 * a reference to the ipif.
		 */
		ire->ire_ipif = NULL;
		ire_delete(ire);
		if (pire != NULL) {
			IRB_REFRELE(pire->ire_bucket);
			ire_refrele(pire);
		}
		*ire_p = NULL;
		if (need_refrele)
			IRB_REFRELE(irb_ptr);
		return (error);
	}
	/*
	 * To avoid creating ires having stale values for the ire_max_frag
	 * we get the latest value atomically here. For more details
	 * see the block comment in ip_sioctl_mtu and in DL_NOTE_SDU_CHANGE
	 * in ip_rput_dlpi_writer
	 */
	if (ire->ire_max_fragp == NULL) {
		if (CLASSD(ire->ire_addr))
			ire->ire_max_frag = ire->ire_ipif->ipif_mtu;
		else
			ire->ire_max_frag = pire->ire_max_frag;
	} else {
		uint_t  max_frag;

		/* Replace the pointer with the value it currently refers to */
		max_frag = *ire->ire_max_fragp;
		ire->ire_max_fragp = NULL;
		ire->ire_max_frag = max_frag;
	}
	/*
	 * Atomically check for duplicate and insert in the table.
	 */
	for (ire1 = irb_ptr->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
		if (ire1->ire_marks & IRE_MARK_CONDEMNED)
			continue;
		if (ire->ire_ipif != NULL) {
			/*
			 * We do MATCH_IRE_ILL implicitly here for IREs
			 * with a non-null ire_ipif, including IRE_CACHEs.
			 * As ire_ipif and ire_stq could point to two
			 * different ills, we can't pass just ire_ipif to
			 * ire_match_args and get a match on both ills.
			 * This is just needed for duplicate checks here and
			 * so we don't add an extra argument to
			 * ire_match_args for this. Do it locally.
			 *
			 * NOTE : Currently there is no part of the code
			 * that asks for both MATCH_IRE_IPIF and MATCH_IRE_ILL
			 * match for IRE_CACHEs. Thus we don't want to
			 * extend the arguments to ire_match_args.
			 */
			if (ire1->ire_stq != ire->ire_stq)
				continue;
			/*
			 * Multiroute IRE_CACHEs for a given destination can
			 * have the same ire_ipif, typically if their source
			 * address is forced using RTF_SETSRC, and the same
			 * send-to queue. We differentiate them using the parent
			 * handle.
			 */
			if (ire->ire_type == IRE_CACHE &&
			    (ire1->ire_flags & RTF_MULTIRT) &&
			    (ire->ire_flags & RTF_MULTIRT) &&
			    (ire1->ire_phandle != ire->ire_phandle))
				continue;
		}
		if (ire1->ire_zoneid != ire->ire_zoneid)
			continue;
		if (ire_match_args(ire1, ire->ire_addr, ire->ire_mask,
		    ire->ire_gateway_addr, ire->ire_type, ire->ire_ipif,
		    ire->ire_zoneid, 0, NULL, flags)) {
			/*
			 * Return the old ire after doing a REFHOLD.
			 * As most of the callers continue to use the IRE
			 * after adding, we return a held ire. This will
			 * avoid a lookup in the caller again. If the callers
			 * don't want to use it, they need to do a REFRELE.
			 */
			ip1dbg(("found dup ire existing %p new %p",
			    (void *)ire1, (void *)ire));
			IRE_REFHOLD(ire1);
			ire_atomic_end(irb_ptr, ire);
			mutex_exit(&ndp4.ndp_g_lock);
			ire_delete(ire);
			if (pire != NULL) {
				/*
				 * Assert that it is not removed from the
				 * list yet.
				 */
				ASSERT(pire->ire_ptpn != NULL);
				IRB_REFRELE(pire->ire_bucket);
				ire_refrele(pire);
			}
			*ire_p = ire1;
			if (need_refrele)
				IRB_REFRELE(irb_ptr);
			return (0);
		}
	}
	if (ire->ire_type & IRE_CACHE) {
		ASSERT(ire->ire_stq != NULL);
		/* Look up the nce for the next hop (gateway if set) */
		nce = ndp_lookup_v4(ire_to_ill(ire),
		    ((ire->ire_gateway_addr != INADDR_ANY) ?
		    &ire->ire_gateway_addr : &ire->ire_addr),
		    B_TRUE);
		if (nce != NULL)
			mutex_enter(&nce->nce_lock);
		/*
		 * if the nce is NCE_F_CONDEMNED, or if it is not ND_REACHABLE
		 * and the caller has prohibited the addition of incomplete
		 * ire's, we fail the add. Note that nce_state could be
		 * something other than ND_REACHABLE if nce_reinit has just
		 * kicked in and reset the nce.
		 */
		if ((nce == NULL) ||
		    (nce->nce_flags & NCE_F_CONDEMNED) ||
		    (!allow_unresolved &&
		    ((nce->nce_state & ND_REACHABLE) == 0))) {
			if (nce != NULL)
				mutex_exit(&nce->nce_lock);
			ire_atomic_end(irb_ptr, ire);
			mutex_exit(&ndp4.ndp_g_lock);
			if (nce != NULL)
				NCE_REFRELE(nce);
			DTRACE_PROBE1(ire__no__nce, ire_t *, ire);
			ire_delete(ire);
			if (pire != NULL) {
				IRB_REFRELE(pire->ire_bucket);
				ire_refrele(pire);
			}
			*ire_p = NULL;
			if (need_refrele)
				IRB_REFRELE(irb_ptr);
			return (EINVAL);
		} else {
			ire->ire_nce = nce;
			mutex_exit(&nce->nce_lock);
			/*
			 * We are associating this nce to the ire, so
			 * change the nce ref taken in ndp_lookup_v4() from
			 * NCE_REFHOLD to NCE_REFHOLD_NOTR
			 */
			NCE_REFHOLD_TO_REFHOLD_NOTR(ire->ire_nce);
		}
	}
	/*
	 * Make it easy for ip_wput_ire() to hit multiple broadcast ires by
	 * grouping identical addresses together on the hash chain. We also
	 * don't want to send multiple copies out if there are two ills part
	 * of the same group. Thus we group the ires with same addr and same
	 * ill group together so that ip_wput_ire can easily skip all the
	 * ires with same addr and same group after sending the first copy.
	 * We do this only for IRE_BROADCASTs as ip_wput_ire is currently
	 * interested in such groupings only for broadcasts.
	 *
	 * NOTE : If the interfaces are brought up first and then grouped,
	 * illgrp_insert will handle it. We come here when the interfaces
	 * are already in group and we are bringing them UP.
	 *
	 * Find the first entry that matches ire_addr. *irep will be null
	 * if no match.
	 */
	irep = (ire_t **)irb_ptr;
	while ((ire1 = *irep) != NULL && ire->ire_addr != ire1->ire_addr)
		irep = &ire1->ire_next;
	if (ire->ire_type == IRE_BROADCAST && *irep != NULL) {
		/*
		 * We found some ire (i.e *irep) with a matching addr. We
		 * want to group ires with same addr and same ill group
		 * together.
		 *
		 * First get to the entry that matches our address and
		 * ill group i.e stop as soon as we find the first ire
		 * matching the ill group and address. If there is only
		 * an address match, we should walk and look for some
		 * group match. These are some of the possible scenarios :
		 *
		 * 1) There are no groups at all i.e all ire's ill_group
		 *    are NULL. In that case we will essentially group
		 *    all the ires with the same addr together. Same as
		 *    the "else" block of this "if".
		 *
		 * 2) There are some groups and this ire's ill_group is
		 *    NULL. In this case, we will first find the group
		 *    that matches the address and a NULL group. Then
		 *    we will insert the ire at the end of that group.
		 *
		 * 3) There are some groups and this ire's ill_group is
		 *    non-NULL. In this case we will first find the group
		 *    that matches the address and the ill_group. Then
		 *    we will insert the ire at the end of that group.
		 */
		/* LINTED : constant in conditional context */
		while (1) {
			ire1 = *irep;
			if ((ire1->ire_next == NULL) ||
			    (ire1->ire_next->ire_addr != ire->ire_addr) ||
			    (ire1->ire_type != IRE_BROADCAST) ||
			    (ire1->ire_ipif->ipif_ill->ill_group ==
			    ire->ire_ipif->ipif_ill->ill_group))
				break;
			irep = &ire1->ire_next;
		}
		ASSERT(*irep != NULL);
		irep = &((*irep)->ire_next);

		/*
		 * Either we have hit the end of the list or the address
		 * did not match or the group *matched*. If we found
		 * a match on the group, skip to the end of the group.
		 */
		while (*irep != NULL) {
			ire1 = *irep;
			if ((ire1->ire_addr != ire->ire_addr) ||
			    (ire1->ire_type != IRE_BROADCAST) ||
			    (ire1->ire_ipif->ipif_ill->ill_group !=
			    ire->ire_ipif->ipif_ill->ill_group))
				break;
			if (ire1->ire_ipif->ipif_ill->ill_group == NULL &&
			    ire1->ire_ipif == ire->ire_ipif) {
				irep = &ire1->ire_next;
				break;
			}
			irep = &ire1->ire_next;
		}
	} else if (*irep != NULL) {
		/*
		 * Find the last ire which matches ire_addr.
		 * Needed to do tail insertion among entries with the same
		 * ire_addr.
		 */
		while (ire->ire_addr == ire1->ire_addr) {
			irep = &ire1->ire_next;
			ire1 = *irep;
			if (ire1 == NULL)
				break;
		}
	}

	/* Insert at *irep */
	ire1 = *irep;
	if (ire1 != NULL)
		ire1->ire_ptpn = &ire->ire_next;
	ire->ire_next = ire1;
	/* Link the new one in. */
	ire->ire_ptpn = irep;

	/*
	 * ire_walk routines de-reference ire_next without holding
	 * a lock. Before we point to the new ire, we want to make
	 * sure the store that sets the ire_next of the new ire
	 * reaches global visibility, so that ire_walk routines
	 * don't see a truncated list of ires i.e if the ire_next
	 * of the new ire gets set after we do "*irep = ire" due
	 * to re-ordering, the ire_walk thread will see a NULL
	 * once it accesses the ire_next of the new ire.
	 * membar_producer() makes sure that the following store
	 * happens *after* all of the above stores.
	 */
	membar_producer();
	*irep = ire;
	ire->ire_bucket = irb_ptr;
	/*
	 * We return a bumped up IRE above. Keep it symmetrical
	 * so that the callers will always have to release. This
	 * helps the callers of this function because they continue
	 * to use the IRE after adding and hence they don't have to
	 * lookup again after we return the IRE.
	 *
	 * NOTE : We don't have to use atomics as this is appearing
	 * in the list for the first time and no one else can bump
	 * up the reference count on this yet.
	 */
	IRE_REFHOLD_LOCKED(ire);
	BUMP_IRE_STATS(ire_stats_v4, ire_stats_inserted);

	/* Bucket and interface accounting for the newly linked ire. */
	irb_ptr->irb_ire_cnt++;
	if (irb_ptr->irb_marks & IRB_MARK_FTABLE)
		irb_ptr->irb_nire++;

	if (ire->ire_marks & IRE_MARK_TEMPORARY)
		irb_ptr->irb_tmp_ire_cnt++;

	if (ire->ire_ipif != NULL) {
		ire->ire_ipif->ipif_ire_cnt++;
		if (ire->ire_stq != NULL) {
			stq_ill = (ill_t *)ire->ire_stq->q_ptr;
			stq_ill->ill_ire_cnt++;
		}
	} else {
		ASSERT(ire->ire_stq == NULL);
	}

	ire_atomic_end(irb_ptr, ire);
	mutex_exit(&ndp4.ndp_g_lock);

	if (pire != NULL) {
		/* Assert that it is not removed from the list yet */
		ASSERT(pire->ire_ptpn != NULL);
		IRB_REFRELE(pire->ire_bucket);
		ire_refrele(pire);
	}

	if (ire->ire_type != IRE_CACHE) {
		/*
		 * For ire's with host mask see if there is an entry
		 * in the cache. If there is one flush the whole cache as
		 * there might be multiple entries due to RTF_MULTIRT (CGTP).
		 * If no entry is found then there is no need to flush the
		 * cache.
		 */
		if (ire->ire_mask == IP_HOST_MASK) {
			ire_t *lire;
			lire = ire_ctable_lookup(ire->ire_addr, NULL, IRE_CACHE,
			    NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE);
			if (lire != NULL) {
				ire_refrele(lire);
				ire_flush_cache_v4(ire, IRE_FLUSH_ADD);
			}
		} else {
			ire_flush_cache_v4(ire, IRE_FLUSH_ADD);
		}
	}
	/*
	 * We had to delay the fast path probe until the ire is inserted
	 * in the list. Otherwise the fast path ack won't find the ire in
	 * the table.
	 */
	if (ire->ire_type == IRE_CACHE || ire->ire_type == IRE_BROADCAST)
		ire_fastpath(ire);
	if (ire->ire_ipif != NULL)
		ASSERT(!MUTEX_HELD(&ire->ire_ipif->ipif_ill->ill_lock));
	*ire_p = ire;
	if (need_refrele) {
		IRB_REFRELE(irb_ptr);
	}
	return (0);
}

/*
 * IRB_REFRELE is the only caller of the function. It is handed the list
 * of ires that ire_unlink removed from a bucket (chained through
 * ire_next) and does the final cleanup for each of them: the version
 * specific ire_delete_v4/ire_delete_v6 call, the "deleted" statistic,
 * and the release of the ire's reference.
 */
void
ire_cleanup(ire_t *ire)
{
	ire_t *ire_next;

	ASSERT(ire != NULL);

	while (ire != NULL) {
		/* Remember the successor; ire_next is cleared below. */
		ire_next = ire->ire_next;
		if (ire->ire_ipversion == IPV4_VERSION) {
			ire_delete_v4(ire);
			BUMP_IRE_STATS(ire_stats_v4, ire_stats_deleted);
		} else {
			ASSERT(ire->ire_ipversion == IPV6_VERSION);
			ire_delete_v6(ire);
			BUMP_IRE_STATS(ire_stats_v6, ire_stats_deleted);
		}
		/*
		 * Now it's really out of the list. Before doing the
		 * REFRELE, set ire_next to NULL as ire_inactive asserts
		 * so.
		 */
		ire->ire_next = NULL;
		IRE_REFRELE_NOTR(ire);
		ire = ire_next;
	}
}

/*
 * IRB_REFRELE is the only caller of the function. It calls to unlink
 * all the CONDEMNED ires from this bucket.
3882 */ 3883 ire_t * 3884 ire_unlink(irb_t *irb) 3885 { 3886 ire_t *ire; 3887 ire_t *ire1; 3888 ire_t **ptpn; 3889 ire_t *ire_list = NULL; 3890 3891 ASSERT(RW_WRITE_HELD(&irb->irb_lock)); 3892 ASSERT(((irb->irb_marks & IRB_MARK_FTABLE) && irb->irb_refcnt == 1) || 3893 (irb->irb_refcnt == 0)); 3894 ASSERT(irb->irb_marks & IRB_MARK_CONDEMNED); 3895 ASSERT(irb->irb_ire != NULL); 3896 3897 for (ire = irb->irb_ire; ire != NULL; ire = ire1) { 3898 ire1 = ire->ire_next; 3899 if (ire->ire_marks & IRE_MARK_CONDEMNED) { 3900 ptpn = ire->ire_ptpn; 3901 ire1 = ire->ire_next; 3902 if (ire1) 3903 ire1->ire_ptpn = ptpn; 3904 *ptpn = ire1; 3905 ire->ire_ptpn = NULL; 3906 ire->ire_next = NULL; 3907 if (ire->ire_type == IRE_DEFAULT) { 3908 /* 3909 * IRE is out of the list. We need to adjust 3910 * the accounting before the caller drops 3911 * the lock. 3912 */ 3913 if (ire->ire_ipversion == IPV6_VERSION) { 3914 ASSERT(ipv6_ire_default_count != 0); 3915 ipv6_ire_default_count--; 3916 } 3917 } 3918 /* 3919 * We need to call ire_delete_v4 or ire_delete_v6 3920 * to clean up the cache or the redirects pointing at 3921 * the default gateway. We need to drop the lock 3922 * as ire_flush_cache/ire_delete_host_redircts require 3923 * so. But we can't drop the lock, as ire_unlink needs 3924 * to atomically remove the ires from the list. 3925 * So, create a temporary list of CONDEMNED ires 3926 * for doing ire_delete_v4/ire_delete_v6 operations 3927 * later on. 3928 */ 3929 ire->ire_next = ire_list; 3930 ire_list = ire; 3931 } 3932 } 3933 irb->irb_marks &= ~IRB_MARK_CONDEMNED; 3934 return (ire_list); 3935 } 3936 3937 /* 3938 * Delete all the cache entries with this 'addr'. When IP gets a gratuitous 3939 * ARP message on any of its interface queue, it scans the nce table and 3940 * deletes and calls ndp_delete() for the appropriate nce. This action 3941 * also deletes all the neighbor/ire cache entries for that address. 
3942 * This function is called from ip_arp_news in ip.c and also for 3943 * ARP ioctl processing in ip_if.c. ip_ire_clookup_and_delete returns 3944 * true if it finds a nce entry which is used by ip_arp_news to determine if 3945 * it needs to do an ire_walk_v4. The return value is also used for the 3946 * same purpose by ARP IOCTL processing * in ip_if.c when deleting 3947 * ARP entries. For SIOC*IFARP ioctls in addition to the address, 3948 * ip_if->ipif_ill also needs to be matched. 3949 */ 3950 boolean_t 3951 ip_ire_clookup_and_delete(ipaddr_t addr, ipif_t *ipif) 3952 { 3953 ill_t *ill; 3954 nce_t *nce; 3955 3956 ill = (ipif ? ipif->ipif_ill : NULL); 3957 3958 if (ill != NULL) { 3959 /* 3960 * clean up the nce (and any relevant ire's) that matches 3961 * on addr and ill. 3962 */ 3963 nce = ndp_lookup_v4(ill, &addr, B_FALSE); 3964 if (nce != NULL) { 3965 ndp_delete(nce); 3966 return (B_TRUE); 3967 } 3968 } else { 3969 /* 3970 * ill is wildcard. clean up all nce's and 3971 * ire's that match on addr 3972 */ 3973 nce_clookup_t cl; 3974 3975 cl.ncecl_addr = addr; 3976 cl.ncecl_found = B_FALSE; 3977 3978 ndp_walk_common(&ndp4, NULL, 3979 (pfi_t)ip_nce_clookup_and_delete, (uchar_t *)&cl, B_TRUE); 3980 3981 /* 3982 * ncecl_found would be set by ip_nce_clookup_and_delete if 3983 * we found a matching nce. 3984 */ 3985 return (cl.ncecl_found); 3986 } 3987 return (B_FALSE); 3988 3989 } 3990 3991 /* Delete the supplied nce if its nce_addr matches the supplied address */ 3992 static void 3993 ip_nce_clookup_and_delete(nce_t *nce, void *arg) 3994 { 3995 nce_clookup_t *cl = (nce_clookup_t *)arg; 3996 ipaddr_t nce_addr; 3997 3998 IN6_V4MAPPED_TO_IPADDR(&nce->nce_addr, nce_addr); 3999 if (nce_addr == cl->ncecl_addr) { 4000 cl->ncecl_found = B_TRUE; 4001 /* clean up the nce (and any relevant ire's) */ 4002 ndp_delete(nce); 4003 } 4004 } 4005 4006 /* 4007 * Clean up the radix node for this ire. Must be called by IRB_REFRELE 4008 * when there are no ire's left in the bucket. 
Returns TRUE if the bucket 4009 * is deleted and freed. 4010 */ 4011 boolean_t 4012 irb_inactive(irb_t *irb) 4013 { 4014 struct rt_entry *rt; 4015 struct radix_node *rn; 4016 4017 rt = IRB2RT(irb); 4018 rn = (struct radix_node *)rt; 4019 4020 /* first remove it from the radix tree. */ 4021 RADIX_NODE_HEAD_WLOCK(ip_ftable); 4022 rw_enter(&irb->irb_lock, RW_WRITER); 4023 if (irb->irb_refcnt == 1 && irb->irb_nire == 0) { 4024 rn = ip_ftable->rnh_deladdr(rn->rn_key, rn->rn_mask, 4025 ip_ftable); 4026 DTRACE_PROBE1(irb__free, rt_t *, rt); 4027 ASSERT((void *)rn == (void *)rt); 4028 Free(rt, rt_entry_cache); 4029 /* irb_lock is freed */ 4030 RADIX_NODE_HEAD_UNLOCK(ip_ftable); 4031 return (B_TRUE); 4032 } 4033 rw_exit(&irb->irb_lock); 4034 RADIX_NODE_HEAD_UNLOCK(ip_ftable); 4035 return (B_FALSE); 4036 } 4037 4038 /* 4039 * Delete the specified IRE. 4040 */ 4041 void 4042 ire_delete(ire_t *ire) 4043 { 4044 ire_t *ire1; 4045 ire_t **ptpn; 4046 irb_t *irb; 4047 4048 if ((irb = ire->ire_bucket) == NULL) { 4049 /* 4050 * It was never inserted in the list. Should call REFRELE 4051 * to free this IRE. 4052 */ 4053 IRE_REFRELE_NOTR(ire); 4054 return; 4055 } 4056 4057 rw_enter(&irb->irb_lock, RW_WRITER); 4058 4059 if (irb->irb_rr_origin == ire) { 4060 irb->irb_rr_origin = NULL; 4061 } 4062 4063 /* 4064 * In case of V4 we might still be waiting for fastpath ack. 4065 */ 4066 if (ire->ire_ipversion == IPV4_VERSION && ire->ire_stq != NULL) { 4067 ill_t *ill; 4068 4069 ill = ire_to_ill(ire); 4070 if (ill != NULL) 4071 ire_fastpath_list_delete(ill, ire); 4072 } 4073 4074 if (ire->ire_ptpn == NULL) { 4075 /* 4076 * Some other thread has removed us from the list. 4077 * It should have done the REFRELE for us. 4078 */ 4079 rw_exit(&irb->irb_lock); 4080 return; 4081 } 4082 4083 if (irb->irb_refcnt != 0) { 4084 /* 4085 * The last thread to leave this bucket will 4086 * delete this ire. 
4087 */ 4088 if (!(ire->ire_marks & IRE_MARK_CONDEMNED)) { 4089 irb->irb_ire_cnt--; 4090 if (ire->ire_marks & IRE_MARK_TEMPORARY) 4091 irb->irb_tmp_ire_cnt--; 4092 ire->ire_marks |= IRE_MARK_CONDEMNED; 4093 } 4094 irb->irb_marks |= IRB_MARK_CONDEMNED; 4095 rw_exit(&irb->irb_lock); 4096 return; 4097 } 4098 4099 /* 4100 * Normally to delete an ire, we walk the bucket. While we 4101 * walk the bucket, we normally bump up irb_refcnt and hence 4102 * we return from above where we mark CONDEMNED and the ire 4103 * gets deleted from ire_unlink. This case is where somebody 4104 * knows the ire e.g by doing a lookup, and wants to delete the 4105 * IRE. irb_refcnt would be 0 in this case if nobody is walking 4106 * the bucket. 4107 */ 4108 ptpn = ire->ire_ptpn; 4109 ire1 = ire->ire_next; 4110 if (ire1 != NULL) 4111 ire1->ire_ptpn = ptpn; 4112 ASSERT(ptpn != NULL); 4113 *ptpn = ire1; 4114 ire->ire_ptpn = NULL; 4115 ire->ire_next = NULL; 4116 if (ire->ire_ipversion == IPV6_VERSION) { 4117 BUMP_IRE_STATS(ire_stats_v6, ire_stats_deleted); 4118 } else { 4119 BUMP_IRE_STATS(ire_stats_v4, ire_stats_deleted); 4120 } 4121 /* 4122 * ip_wput/ip_wput_v6 checks this flag to see whether 4123 * it should still use the cached ire or not. 4124 */ 4125 ire->ire_marks |= IRE_MARK_CONDEMNED; 4126 if (ire->ire_type == IRE_DEFAULT) { 4127 /* 4128 * IRE is out of the list. We need to adjust the 4129 * accounting before we drop the lock. 4130 */ 4131 if (ire->ire_ipversion == IPV6_VERSION) { 4132 ASSERT(ipv6_ire_default_count != 0); 4133 ipv6_ire_default_count--; 4134 } 4135 } 4136 irb->irb_ire_cnt--; 4137 4138 if (ire->ire_marks & IRE_MARK_TEMPORARY) 4139 irb->irb_tmp_ire_cnt--; 4140 rw_exit(&irb->irb_lock); 4141 4142 if (ire->ire_ipversion == IPV6_VERSION) { 4143 ire_delete_v6(ire); 4144 } else { 4145 ire_delete_v4(ire); 4146 } 4147 /* 4148 * We removed it from the list. Decrement the 4149 * reference count. 4150 */ 4151 IRE_REFRELE_NOTR(ire); 4152 } 4153 4154 /* 4155 * Delete the specified IRE. 
4156 * All calls should use ire_delete(). 4157 * Sometimes called as writer though not required by this function. 4158 * 4159 * NOTE : This function is called only if the ire was added 4160 * in the list. 4161 */ 4162 static void 4163 ire_delete_v4(ire_t *ire) 4164 { 4165 ASSERT(ire->ire_refcnt >= 1); 4166 ASSERT(ire->ire_ipversion == IPV4_VERSION); 4167 4168 if (ire->ire_type != IRE_CACHE) 4169 ire_flush_cache_v4(ire, IRE_FLUSH_DELETE); 4170 if (ire->ire_type == IRE_DEFAULT) { 4171 /* 4172 * when a default gateway is going away 4173 * delete all the host redirects pointing at that 4174 * gateway. 4175 */ 4176 ire_delete_host_redirects(ire->ire_gateway_addr); 4177 } 4178 } 4179 4180 /* 4181 * IRE_REFRELE/ire_refrele are the only caller of the function. It calls 4182 * to free the ire when the reference count goes to zero. 4183 */ 4184 void 4185 ire_inactive(ire_t *ire) 4186 { 4187 nce_t *nce; 4188 ill_t *ill = NULL; 4189 ill_t *stq_ill = NULL; 4190 ill_t *in_ill = NULL; 4191 ipif_t *ipif; 4192 boolean_t need_wakeup = B_FALSE; 4193 irb_t *irb; 4194 4195 ASSERT(ire->ire_refcnt == 0); 4196 ASSERT(ire->ire_ptpn == NULL); 4197 ASSERT(ire->ire_next == NULL); 4198 4199 if (ire->ire_gw_secattr != NULL) { 4200 ire_gw_secattr_free(ire->ire_gw_secattr); 4201 ire->ire_gw_secattr = NULL; 4202 } 4203 4204 if (ire->ire_mp != NULL) { 4205 ASSERT(ire->ire_fastpath == NULL); 4206 ASSERT(ire->ire_bucket == NULL); 4207 mutex_destroy(&ire->ire_lock); 4208 BUMP_IRE_STATS(ire_stats_v4, ire_stats_freed); 4209 if (ire->ire_nce != NULL) 4210 NCE_REFRELE_NOTR(ire->ire_nce); 4211 freeb(ire->ire_mp); 4212 return; 4213 } 4214 4215 if ((nce = ire->ire_nce) != NULL) { 4216 NCE_REFRELE_NOTR(nce); 4217 ire->ire_nce = NULL; 4218 } 4219 4220 if (ire->ire_ipif == NULL) 4221 goto end; 4222 4223 ipif = ire->ire_ipif; 4224 ill = ipif->ipif_ill; 4225 4226 if (ire->ire_bucket == NULL) { 4227 /* The ire was never inserted in the table. 
*/ 4228 goto end; 4229 } 4230 4231 /* 4232 * ipif_ire_cnt on this ipif goes down by 1. If the ire_stq is 4233 * non-null ill_ire_count also goes down by 1. If the in_ill is 4234 * non-null either ill_mrtun_refcnt or ill_srcif_refcnt goes down by 1. 4235 * 4236 * The ipif that is associated with an ire is ire->ire_ipif and 4237 * hence when the ire->ire_ipif->ipif_ire_cnt drops to zero we call 4238 * ipif_ill_refrele_tail. Usually stq_ill is null or the same as 4239 * ire->ire_ipif->ipif_ill. So nothing more needs to be done. Only 4240 * in the case of IRE_CACHES when IPMP is used, stq_ill can be 4241 * different. If this is different from ire->ire_ipif->ipif_ill and 4242 * if the ill_ire_cnt on the stq_ill also has dropped to zero, we call 4243 * ipif_ill_refrele_tail on the stq_ill. If mobile ip is in use 4244 * in_ill could be non-null. If it is a reverse tunnel related ire 4245 * ill_mrtun_refcnt is non-zero. If it is forward tunnel related ire 4246 * ill_srcif_refcnt is non-null. 4247 */ 4248 4249 if (ire->ire_stq != NULL) 4250 stq_ill = (ill_t *)ire->ire_stq->q_ptr; 4251 if (ire->ire_in_ill != NULL) 4252 in_ill = ire->ire_in_ill; 4253 4254 if ((stq_ill == NULL || stq_ill == ill) && (in_ill == NULL)) { 4255 /* Optimize the most common case */ 4256 mutex_enter(&ill->ill_lock); 4257 ASSERT(ipif->ipif_ire_cnt != 0); 4258 ipif->ipif_ire_cnt--; 4259 if (ipif->ipif_ire_cnt == 0) 4260 need_wakeup = B_TRUE; 4261 if (stq_ill != NULL) { 4262 ASSERT(stq_ill->ill_ire_cnt != 0); 4263 stq_ill->ill_ire_cnt--; 4264 if (stq_ill->ill_ire_cnt == 0) 4265 need_wakeup = B_TRUE; 4266 } 4267 if (need_wakeup) { 4268 /* Drops the ill lock */ 4269 ipif_ill_refrele_tail(ill); 4270 } else { 4271 mutex_exit(&ill->ill_lock); 4272 } 4273 } else { 4274 /* 4275 * We can't grab all the ill locks at the same time. 4276 * It can lead to recursive lock enter in the call to 4277 * ipif_ill_refrele_tail and later. Instead do it 1 at 4278 * a time. 
4279 */ 4280 mutex_enter(&ill->ill_lock); 4281 ASSERT(ipif->ipif_ire_cnt != 0); 4282 ipif->ipif_ire_cnt--; 4283 if (ipif->ipif_ire_cnt == 0) { 4284 /* Drops the lock */ 4285 ipif_ill_refrele_tail(ill); 4286 } else { 4287 mutex_exit(&ill->ill_lock); 4288 } 4289 if (stq_ill != NULL) { 4290 mutex_enter(&stq_ill->ill_lock); 4291 ASSERT(stq_ill->ill_ire_cnt != 0); 4292 stq_ill->ill_ire_cnt--; 4293 if (stq_ill->ill_ire_cnt == 0) { 4294 /* Drops the ill lock */ 4295 ipif_ill_refrele_tail(stq_ill); 4296 } else { 4297 mutex_exit(&stq_ill->ill_lock); 4298 } 4299 } 4300 if (in_ill != NULL) { 4301 mutex_enter(&in_ill->ill_lock); 4302 if (ire->ire_type == IRE_MIPRTUN) { 4303 /* 4304 * Mobile IP reverse tunnel ire. 4305 * Decrement table count and the 4306 * ill reference count. This signifies 4307 * mipagent is deleting reverse tunnel 4308 * route for a particular mobile node. 4309 */ 4310 mutex_enter(&ire_mrtun_lock); 4311 ire_mrtun_count--; 4312 mutex_exit(&ire_mrtun_lock); 4313 ASSERT(in_ill->ill_mrtun_refcnt != 0); 4314 in_ill->ill_mrtun_refcnt--; 4315 if (in_ill->ill_mrtun_refcnt == 0) { 4316 /* Drops the ill lock */ 4317 ipif_ill_refrele_tail(in_ill); 4318 } else { 4319 mutex_exit(&in_ill->ill_lock); 4320 } 4321 } else { 4322 mutex_enter(&ire_srcif_table_lock); 4323 ire_srcif_table_count--; 4324 mutex_exit(&ire_srcif_table_lock); 4325 ASSERT(in_ill->ill_srcif_refcnt != 0); 4326 in_ill->ill_srcif_refcnt--; 4327 if (in_ill->ill_srcif_refcnt == 0) { 4328 /* Drops the ill lock */ 4329 ipif_ill_refrele_tail(in_ill); 4330 } else { 4331 mutex_exit(&in_ill->ill_lock); 4332 } 4333 } 4334 } 4335 } 4336 end: 4337 /* This should be true for both V4 and V6 */ 4338 ASSERT(ire->ire_fastpath == NULL); 4339 4340 if ((ire->ire_type & IRE_FORWARDTABLE) && 4341 (ire->ire_ipversion == IPV4_VERSION) && 4342 ((irb = ire->ire_bucket) != NULL)) { 4343 rw_enter(&irb->irb_lock, RW_WRITER); 4344 irb->irb_nire--; 4345 /* 4346 * Instead of examining the conditions for freeing 4347 * the radix node 
here, we do it by calling 4348 * IRB_REFRELE which is a single point in the code 4349 * that embeds that logic. Bump up the refcnt to 4350 * be able to call IRB_REFRELE 4351 */ 4352 IRB_REFHOLD_LOCKED(irb); 4353 rw_exit(&irb->irb_lock); 4354 IRB_REFRELE(irb); 4355 } 4356 ire->ire_ipif = NULL; 4357 4358 if (ire->ire_in_ill != NULL) { 4359 ire->ire_in_ill = NULL; 4360 } 4361 4362 #ifdef IRE_DEBUG 4363 ire_trace_inactive(ire); 4364 #endif 4365 mutex_destroy(&ire->ire_lock); 4366 if (ire->ire_ipversion == IPV6_VERSION) { 4367 BUMP_IRE_STATS(ire_stats_v6, ire_stats_freed); 4368 } else { 4369 BUMP_IRE_STATS(ire_stats_v4, ire_stats_freed); 4370 } 4371 ASSERT(ire->ire_mp == NULL); 4372 /* Has been allocated out of the cache */ 4373 kmem_cache_free(ire_cache, ire); 4374 } 4375 4376 /* 4377 * ire_walk routine to delete all IRE_CACHE/IRE_HOST_REDIRECT entries 4378 * that have a given gateway address. 4379 */ 4380 void 4381 ire_delete_cache_gw(ire_t *ire, char *cp) 4382 { 4383 ipaddr_t gw_addr; 4384 4385 if (!(ire->ire_type & (IRE_CACHE|IRE_HOST_REDIRECT))) 4386 return; 4387 4388 bcopy(cp, &gw_addr, sizeof (gw_addr)); 4389 if (ire->ire_gateway_addr == gw_addr) { 4390 ip1dbg(("ire_delete_cache_gw: deleted 0x%x type %d to 0x%x\n", 4391 (int)ntohl(ire->ire_addr), ire->ire_type, 4392 (int)ntohl(ire->ire_gateway_addr))); 4393 ire_delete(ire); 4394 } 4395 } 4396 4397 /* 4398 * Remove all IRE_CACHE entries that match the ire specified. 4399 * 4400 * The flag argument indicates if the flush request is due to addition 4401 * of new route (IRE_FLUSH_ADD) or deletion of old route (IRE_FLUSH_DELETE). 4402 * 4403 * This routine takes only the IREs from the forwarding table and flushes 4404 * the corresponding entries from the cache table. 4405 * 4406 * When flushing due to the deletion of an old route, it 4407 * just checks the cache handles (ire_phandle and ire_ihandle) and 4408 * deletes the ones that match. 
4409 * 4410 * When flushing due to the creation of a new route, it checks 4411 * if a cache entry's address matches the one in the IRE and 4412 * that the cache entry's parent has a less specific mask than the 4413 * one in IRE. The destination of such a cache entry could be the 4414 * gateway for other cache entries, so we need to flush those as 4415 * well by looking for gateway addresses matching the IRE's address. 4416 */ 4417 void 4418 ire_flush_cache_v4(ire_t *ire, int flag) 4419 { 4420 int i; 4421 ire_t *cire; 4422 irb_t *irb; 4423 4424 if (ire->ire_type & IRE_CACHE) 4425 return; 4426 4427 /* 4428 * If a default is just created, there is no point 4429 * in going through the cache, as there will not be any 4430 * cached ires. 4431 */ 4432 if (ire->ire_type == IRE_DEFAULT && flag == IRE_FLUSH_ADD) 4433 return; 4434 if (flag == IRE_FLUSH_ADD) { 4435 /* 4436 * This selective flush is due to the addition of 4437 * new IRE. 4438 */ 4439 for (i = 0; i < ip_cache_table_size; i++) { 4440 irb = &ip_cache_table[i]; 4441 if ((cire = irb->irb_ire) == NULL) 4442 continue; 4443 IRB_REFHOLD(irb); 4444 for (cire = irb->irb_ire; cire != NULL; 4445 cire = cire->ire_next) { 4446 if (cire->ire_type != IRE_CACHE) 4447 continue; 4448 /* 4449 * If 'cire' belongs to the same subnet 4450 * as the new ire being added, and 'cire' 4451 * is derived from a prefix that is less 4452 * specific than the new ire being added, 4453 * we need to flush 'cire'; for instance, 4454 * when a new interface comes up. 4455 */ 4456 if (((cire->ire_addr & ire->ire_mask) == 4457 (ire->ire_addr & ire->ire_mask)) && 4458 (ip_mask_to_plen(cire->ire_cmask) <= 4459 ire->ire_masklen)) { 4460 ire_delete(cire); 4461 continue; 4462 } 4463 /* 4464 * This is the case when the ire_gateway_addr 4465 * of 'cire' belongs to the same subnet as 4466 * the new ire being added. 
4467 * Flushing such ires is sometimes required to 4468 * avoid misrouting: say we have a machine with 4469 * two interfaces (I1 and I2), a default router 4470 * R on the I1 subnet, and a host route to an 4471 * off-link destination D with a gateway G on 4472 * the I2 subnet. 4473 * Under normal operation, we will have an 4474 * on-link cache entry for G and an off-link 4475 * cache entry for D with G as ire_gateway_addr, 4476 * traffic to D will reach its destination 4477 * through gateway G. 4478 * If the administrator does 'ifconfig I2 down', 4479 * the cache entries for D and G will be 4480 * flushed. However, G will now be resolved as 4481 * an off-link destination using R (the default 4482 * router) as gateway. Then D will also be 4483 * resolved as an off-link destination using G 4484 * as gateway - this behavior is due to 4485 * compatibility reasons, see comment in 4486 * ire_ihandle_lookup_offlink(). Traffic to D 4487 * will go to the router R and probably won't 4488 * reach the destination. 4489 * The administrator then does 'ifconfig I2 up'. 4490 * Since G is on the I2 subnet, this routine 4491 * will flush its cache entry. It must also 4492 * flush the cache entry for D, otherwise 4493 * traffic will stay misrouted until the IRE 4494 * times out. 4495 */ 4496 if ((cire->ire_gateway_addr & ire->ire_mask) == 4497 (ire->ire_addr & ire->ire_mask)) { 4498 ire_delete(cire); 4499 continue; 4500 } 4501 } 4502 IRB_REFRELE(irb); 4503 } 4504 } else { 4505 /* 4506 * delete the cache entries based on 4507 * handle in the IRE as this IRE is 4508 * being deleted/changed. 
4509 */ 4510 for (i = 0; i < ip_cache_table_size; i++) { 4511 irb = &ip_cache_table[i]; 4512 if ((cire = irb->irb_ire) == NULL) 4513 continue; 4514 IRB_REFHOLD(irb); 4515 for (cire = irb->irb_ire; cire != NULL; 4516 cire = cire->ire_next) { 4517 if (cire->ire_type != IRE_CACHE) 4518 continue; 4519 if ((cire->ire_phandle == 0 || 4520 cire->ire_phandle != ire->ire_phandle) && 4521 (cire->ire_ihandle == 0 || 4522 cire->ire_ihandle != ire->ire_ihandle)) 4523 continue; 4524 ire_delete(cire); 4525 } 4526 IRB_REFRELE(irb); 4527 } 4528 } 4529 } 4530 4531 /* 4532 * Matches the arguments passed with the values in the ire. 4533 * 4534 * Note: for match types that match using "ipif" passed in, ipif 4535 * must be checked for non-NULL before calling this routine. 4536 */ 4537 boolean_t 4538 ire_match_args(ire_t *ire, ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway, 4539 int type, const ipif_t *ipif, zoneid_t zoneid, uint32_t ihandle, 4540 const ts_label_t *tsl, int match_flags) 4541 { 4542 ill_t *ire_ill = NULL, *dst_ill; 4543 ill_t *ipif_ill = NULL; 4544 ill_group_t *ire_ill_group = NULL; 4545 ill_group_t *ipif_ill_group = NULL; 4546 4547 ASSERT(ire->ire_ipversion == IPV4_VERSION); 4548 ASSERT((ire->ire_addr & ~ire->ire_mask) == 0); 4549 ASSERT((!(match_flags & (MATCH_IRE_ILL|MATCH_IRE_ILL_GROUP))) || 4550 (ipif != NULL && !ipif->ipif_isv6)); 4551 ASSERT(!(match_flags & MATCH_IRE_WQ)); 4552 4553 /* 4554 * HIDDEN cache entries have to be looked up specifically with 4555 * MATCH_IRE_MARK_HIDDEN. MATCH_IRE_MARK_HIDDEN is usually set 4556 * when the interface is FAILED or INACTIVE. In that case, 4557 * any IRE_CACHES that exists should be marked with 4558 * IRE_MARK_HIDDEN. So, we don't really need to match below 4559 * for IRE_MARK_HIDDEN. But we do so for consistency. 
4560 */ 4561 if (!(match_flags & MATCH_IRE_MARK_HIDDEN) && 4562 (ire->ire_marks & IRE_MARK_HIDDEN)) 4563 return (B_FALSE); 4564 4565 /* 4566 * MATCH_IRE_MARK_PRIVATE_ADDR is set when IP_NEXTHOP option 4567 * is used. In that case the routing table is bypassed and the 4568 * packets are sent directly to the specified nexthop. The 4569 * IRE_CACHE entry representing this route should be marked 4570 * with IRE_MARK_PRIVATE_ADDR. 4571 */ 4572 4573 if (!(match_flags & MATCH_IRE_MARK_PRIVATE_ADDR) && 4574 (ire->ire_marks & IRE_MARK_PRIVATE_ADDR)) 4575 return (B_FALSE); 4576 4577 if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid && 4578 ire->ire_zoneid != ALL_ZONES) { 4579 /* 4580 * If MATCH_IRE_ZONEONLY has been set and the supplied zoneid is 4581 * valid and does not match that of ire_zoneid, a failure to 4582 * match is reported at this point. Otherwise, since some IREs 4583 * that are available in the global zone can be used in local 4584 * zones, additional checks need to be performed: 4585 * 4586 * IRE_BROADCAST, IRE_CACHE and IRE_LOOPBACK 4587 * entries should never be matched in this situation. 4588 * 4589 * IRE entries that have an interface associated with them 4590 * should in general not match unless they are an IRE_LOCAL 4591 * or in the case when MATCH_IRE_DEFAULT has been set in 4592 * the caller. In the case of the former, checking of the 4593 * other fields supplied should take place. 4594 * 4595 * In the case where MATCH_IRE_DEFAULT has been set, 4596 * all of the ipif's associated with the IRE's ill are 4597 * checked to see if there is a matching zoneid. If any 4598 * one ipif has a matching zoneid, this IRE is a 4599 * potential candidate so checking of the other fields 4600 * takes place. 
4601 * 4602 * In the case where the IRE_INTERFACE has a usable source 4603 * address (indicated by ill_usesrc_ifindex) in the 4604 * correct zone then it's permitted to return this IRE 4605 */ 4606 if (match_flags & MATCH_IRE_ZONEONLY) 4607 return (B_FALSE); 4608 if (ire->ire_type & (IRE_BROADCAST | IRE_CACHE | IRE_LOOPBACK)) 4609 return (B_FALSE); 4610 /* 4611 * Note, IRE_INTERFACE can have the stq as NULL. For 4612 * example, if the default multicast route is tied to 4613 * the loopback address. 4614 */ 4615 if ((ire->ire_type & IRE_INTERFACE) && 4616 (ire->ire_stq != NULL)) { 4617 dst_ill = (ill_t *)ire->ire_stq->q_ptr; 4618 /* 4619 * If there is a usable source address in the 4620 * zone, then it's ok to return an 4621 * IRE_INTERFACE 4622 */ 4623 if (ipif_usesrc_avail(dst_ill, zoneid)) { 4624 ip3dbg(("ire_match_args: dst_ill %p match %d\n", 4625 (void *)dst_ill, 4626 (ire->ire_addr == (addr & mask)))); 4627 } else { 4628 ip3dbg(("ire_match_args: src_ipif NULL" 4629 " dst_ill %p\n", (void *)dst_ill)); 4630 return (B_FALSE); 4631 } 4632 } 4633 if (ire->ire_ipif != NULL && ire->ire_type != IRE_LOCAL && 4634 !(ire->ire_type & IRE_INTERFACE)) { 4635 ipif_t *tipif; 4636 4637 if ((match_flags & MATCH_IRE_DEFAULT) == 0) { 4638 return (B_FALSE); 4639 } 4640 mutex_enter(&ire->ire_ipif->ipif_ill->ill_lock); 4641 for (tipif = ire->ire_ipif->ipif_ill->ill_ipif; 4642 tipif != NULL; tipif = tipif->ipif_next) { 4643 if (IPIF_CAN_LOOKUP(tipif) && 4644 (tipif->ipif_flags & IPIF_UP) && 4645 (tipif->ipif_zoneid == zoneid || 4646 tipif->ipif_zoneid == ALL_ZONES)) 4647 break; 4648 } 4649 mutex_exit(&ire->ire_ipif->ipif_ill->ill_lock); 4650 if (tipif == NULL) { 4651 return (B_FALSE); 4652 } 4653 } 4654 } 4655 4656 /* 4657 * For IRE_CACHES, MATCH_IRE_ILL/ILL_GROUP really means that 4658 * somebody wants to send out on a particular interface which 4659 * is given by ire_stq and hence use ire_stq to derive the ill 4660 * value. 
ire_ipif for IRE_CACHES is just the means of getting 4661 * a source address i.e ire_src_addr = ire->ire_ipif->ipif_src_addr. 4662 * ire_to_ill does the right thing for this. 4663 */ 4664 if (match_flags & (MATCH_IRE_ILL|MATCH_IRE_ILL_GROUP)) { 4665 ire_ill = ire_to_ill(ire); 4666 if (ire_ill != NULL) 4667 ire_ill_group = ire_ill->ill_group; 4668 ipif_ill = ipif->ipif_ill; 4669 ipif_ill_group = ipif_ill->ill_group; 4670 } 4671 4672 if ((ire->ire_addr == (addr & mask)) && 4673 ((!(match_flags & MATCH_IRE_GW)) || 4674 (ire->ire_gateway_addr == gateway)) && 4675 ((!(match_flags & MATCH_IRE_TYPE)) || 4676 (ire->ire_type & type)) && 4677 ((!(match_flags & MATCH_IRE_SRC)) || 4678 (ire->ire_src_addr == ipif->ipif_src_addr)) && 4679 ((!(match_flags & MATCH_IRE_IPIF)) || 4680 (ire->ire_ipif == ipif)) && 4681 ((!(match_flags & MATCH_IRE_MARK_HIDDEN)) || 4682 (ire->ire_type != IRE_CACHE || 4683 ire->ire_marks & IRE_MARK_HIDDEN)) && 4684 ((!(match_flags & MATCH_IRE_MARK_PRIVATE_ADDR)) || 4685 (ire->ire_type != IRE_CACHE || 4686 ire->ire_marks & IRE_MARK_PRIVATE_ADDR)) && 4687 ((!(match_flags & MATCH_IRE_ILL)) || 4688 (ire_ill == ipif_ill)) && 4689 ((!(match_flags & MATCH_IRE_IHANDLE)) || 4690 (ire->ire_ihandle == ihandle)) && 4691 ((!(match_flags & MATCH_IRE_MASK)) || 4692 (ire->ire_mask == mask)) && 4693 ((!(match_flags & MATCH_IRE_ILL_GROUP)) || 4694 (ire_ill == ipif_ill) || 4695 (ire_ill_group != NULL && 4696 ire_ill_group == ipif_ill_group)) && 4697 ((!(match_flags & MATCH_IRE_SECATTR)) || 4698 (!is_system_labeled()) || 4699 (tsol_ire_match_gwattr(ire, tsl) == 0))) { 4700 /* We found the matched IRE */ 4701 return (B_TRUE); 4702 } 4703 return (B_FALSE); 4704 } 4705 4706 4707 /* 4708 * Lookup for a route in all the tables 4709 */ 4710 ire_t * 4711 ire_route_lookup(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway, 4712 int type, const ipif_t *ipif, ire_t **pire, zoneid_t zoneid, 4713 const ts_label_t *tsl, int flags) 4714 { 4715 ire_t *ire = NULL; 4716 4717 /* 4718 * 
ire_match_args() will dereference ipif MATCH_IRE_SRC or 4719 * MATCH_IRE_ILL is set. 4720 */ 4721 if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL | MATCH_IRE_ILL_GROUP)) && 4722 (ipif == NULL)) 4723 return (NULL); 4724 4725 /* 4726 * might be asking for a cache lookup, 4727 * This is not best way to lookup cache, 4728 * user should call ire_cache_lookup directly. 4729 * 4730 * If MATCH_IRE_TYPE was set, first lookup in the cache table and then 4731 * in the forwarding table, if the applicable type flags were set. 4732 */ 4733 if ((flags & MATCH_IRE_TYPE) == 0 || (type & IRE_CACHETABLE) != 0) { 4734 ire = ire_ctable_lookup(addr, gateway, type, ipif, zoneid, 4735 tsl, flags); 4736 if (ire != NULL) 4737 return (ire); 4738 } 4739 if ((flags & MATCH_IRE_TYPE) == 0 || (type & IRE_FORWARDTABLE) != 0) { 4740 ire = ire_ftable_lookup(addr, mask, gateway, type, ipif, pire, 4741 zoneid, 0, tsl, flags); 4742 } 4743 return (ire); 4744 } 4745 4746 4747 /* 4748 * Delete the IRE cache for the gateway and all IRE caches whose 4749 * ire_gateway_addr points to this gateway, and allow them to 4750 * be created on demand by ip_newroute. 4751 */ 4752 void 4753 ire_clookup_delete_cache_gw(ipaddr_t addr, zoneid_t zoneid) 4754 { 4755 irb_t *irb; 4756 ire_t *ire; 4757 4758 irb = &ip_cache_table[IRE_ADDR_HASH(addr, ip_cache_table_size)]; 4759 IRB_REFHOLD(irb); 4760 for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { 4761 if (ire->ire_marks & IRE_MARK_CONDEMNED) 4762 continue; 4763 4764 ASSERT(ire->ire_mask == IP_HOST_MASK); 4765 ASSERT(ire->ire_type != IRE_MIPRTUN && ire->ire_in_ill == NULL); 4766 if (ire_match_args(ire, addr, ire->ire_mask, 0, IRE_CACHE, 4767 NULL, zoneid, 0, NULL, MATCH_IRE_TYPE)) { 4768 ire_delete(ire); 4769 } 4770 } 4771 IRB_REFRELE(irb); 4772 4773 ire_walk_v4(ire_delete_cache_gw, &addr, zoneid); 4774 } 4775 4776 /* 4777 * Looks up cache table for a route. 
4778 * specific lookup can be indicated by 4779 * passing the MATCH_* flags and the 4780 * necessary parameters. 4781 */ 4782 ire_t * 4783 ire_ctable_lookup(ipaddr_t addr, ipaddr_t gateway, int type, const ipif_t *ipif, 4784 zoneid_t zoneid, const ts_label_t *tsl, int flags) 4785 { 4786 irb_t *irb_ptr; 4787 ire_t *ire; 4788 4789 /* 4790 * ire_match_args() will dereference ipif MATCH_IRE_SRC or 4791 * MATCH_IRE_ILL is set. 4792 */ 4793 if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL | MATCH_IRE_ILL_GROUP)) && 4794 (ipif == NULL)) 4795 return (NULL); 4796 4797 irb_ptr = &ip_cache_table[IRE_ADDR_HASH(addr, ip_cache_table_size)]; 4798 rw_enter(&irb_ptr->irb_lock, RW_READER); 4799 for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { 4800 if (ire->ire_marks & IRE_MARK_CONDEMNED) 4801 continue; 4802 ASSERT(ire->ire_mask == IP_HOST_MASK); 4803 ASSERT(ire->ire_type != IRE_MIPRTUN && ire->ire_in_ill == NULL); 4804 if (ire_match_args(ire, addr, ire->ire_mask, gateway, type, 4805 ipif, zoneid, 0, tsl, flags)) { 4806 IRE_REFHOLD(ire); 4807 rw_exit(&irb_ptr->irb_lock); 4808 return (ire); 4809 } 4810 } 4811 rw_exit(&irb_ptr->irb_lock); 4812 return (NULL); 4813 } 4814 4815 /* 4816 * Check whether the IRE_LOCAL and the IRE potentially used to transmit 4817 * (could be an IRE_CACHE, IRE_BROADCAST, or IRE_INTERFACE) are part of 4818 * the same ill group. 
4819 */ 4820 boolean_t 4821 ire_local_same_ill_group(ire_t *ire_local, ire_t *xmit_ire) 4822 { 4823 ill_t *recv_ill, *xmit_ill; 4824 ill_group_t *recv_group, *xmit_group; 4825 4826 ASSERT(ire_local->ire_type == IRE_LOCAL); 4827 ASSERT(ire_local->ire_rfq != NULL); 4828 ASSERT(xmit_ire->ire_type & (IRE_CACHE|IRE_BROADCAST|IRE_INTERFACE)); 4829 ASSERT(xmit_ire->ire_stq != NULL); 4830 ASSERT(xmit_ire->ire_ipif != NULL); 4831 4832 recv_ill = ire_local->ire_rfq->q_ptr; 4833 xmit_ill = xmit_ire->ire_stq->q_ptr; 4834 4835 if (recv_ill == xmit_ill) 4836 return (B_TRUE); 4837 4838 recv_group = recv_ill->ill_group; 4839 xmit_group = xmit_ill->ill_group; 4840 4841 if (recv_group != NULL && recv_group == xmit_group) 4842 return (B_TRUE); 4843 4844 return (B_FALSE); 4845 } 4846 4847 /* 4848 * Check if the IRE_LOCAL uses the same ill (group) as another route would use. 4849 */ 4850 boolean_t 4851 ire_local_ok_across_zones(ire_t *ire_local, zoneid_t zoneid, void *addr, 4852 const ts_label_t *tsl) 4853 { 4854 ire_t *alt_ire; 4855 boolean_t rval; 4856 4857 if (ire_local->ire_ipversion == IPV4_VERSION) { 4858 alt_ire = ire_ftable_lookup(*((ipaddr_t *)addr), 0, 0, 0, NULL, 4859 NULL, zoneid, 0, tsl, 4860 MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | 4861 MATCH_IRE_RJ_BHOLE); 4862 } else { 4863 alt_ire = ire_ftable_lookup_v6((in6_addr_t *)addr, NULL, NULL, 4864 0, NULL, NULL, zoneid, 0, tsl, 4865 MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | 4866 MATCH_IRE_RJ_BHOLE); 4867 } 4868 4869 if (alt_ire == NULL) 4870 return (B_FALSE); 4871 4872 rval = ire_local_same_ill_group(ire_local, alt_ire); 4873 4874 ire_refrele(alt_ire); 4875 return (rval); 4876 } 4877 4878 /* 4879 * Lookup cache. Don't return IRE_MARK_HIDDEN entries. Callers 4880 * should use ire_ctable_lookup with MATCH_IRE_MARK_HIDDEN to get 4881 * to the hidden ones. 4882 * 4883 * In general the zoneid has to match (where ALL_ZONES match all of them). 
4884 * But for IRE_LOCAL we also need to handle the case where L2 should 4885 * conceptually loop back the packet. This is necessary since neither 4886 * Ethernet drivers nor Ethernet hardware loops back packets sent to their 4887 * own MAC address. This loopback is needed when the normal 4888 * routes (ignoring IREs with different zoneids) would send out the packet on 4889 * the same ill (or ill group) as the ill with which this IRE_LOCAL is 4890 * associated. 4891 * 4892 * Earlier versions of this code always matched an IRE_LOCAL independently of 4893 * the zoneid. We preserve that earlier behavior when 4894 * ip_restrict_interzone_loopback is turned off. 4895 */ 4896 ire_t * 4897 ire_cache_lookup(ipaddr_t addr, zoneid_t zoneid, const ts_label_t *tsl) 4898 { 4899 irb_t *irb_ptr; 4900 ire_t *ire; 4901 4902 irb_ptr = &ip_cache_table[IRE_ADDR_HASH(addr, ip_cache_table_size)]; 4903 rw_enter(&irb_ptr->irb_lock, RW_READER); 4904 for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { 4905 if (ire->ire_marks & (IRE_MARK_CONDEMNED | 4906 IRE_MARK_HIDDEN | IRE_MARK_PRIVATE_ADDR)) { 4907 continue; 4908 } 4909 if (ire->ire_addr == addr) { 4910 /* 4911 * Finally, check if the security policy has any 4912 * restriction on using this route for the specified 4913 * message. 
4914 */ 4915 if (tsl != NULL && 4916 ire->ire_gw_secattr != NULL && 4917 tsol_ire_match_gwattr(ire, tsl) != 0) { 4918 continue; 4919 } 4920 4921 if (zoneid == ALL_ZONES || ire->ire_zoneid == zoneid || 4922 ire->ire_zoneid == ALL_ZONES) { 4923 IRE_REFHOLD(ire); 4924 rw_exit(&irb_ptr->irb_lock); 4925 return (ire); 4926 } 4927 4928 if (ire->ire_type == IRE_LOCAL) { 4929 if (ip_restrict_interzone_loopback && 4930 !ire_local_ok_across_zones(ire, zoneid, 4931 &addr, tsl)) 4932 continue; 4933 4934 IRE_REFHOLD(ire); 4935 rw_exit(&irb_ptr->irb_lock); 4936 return (ire); 4937 } 4938 } 4939 } 4940 rw_exit(&irb_ptr->irb_lock); 4941 return (NULL); 4942 } 4943 4944 /* 4945 * Locate the interface ire that is tied to the cache ire 'cire' via 4946 * cire->ire_ihandle. 4947 * 4948 * We are trying to create the cache ire for an offlink destn based 4949 * on the cache ire of the gateway in 'cire'. 'pire' is the prefix ire 4950 * as found by ip_newroute(). We are called from ip_newroute() in 4951 * the IRE_CACHE case. 4952 */ 4953 ire_t * 4954 ire_ihandle_lookup_offlink(ire_t *cire, ire_t *pire) 4955 { 4956 ire_t *ire; 4957 int match_flags; 4958 ipaddr_t gw_addr; 4959 ipif_t *gw_ipif; 4960 4961 ASSERT(cire != NULL && pire != NULL); 4962 4963 /* 4964 * We don't need to specify the zoneid to ire_ftable_lookup() below 4965 * because the ihandle refers to an ipif which can be in only one zone. 4966 */ 4967 match_flags = MATCH_IRE_TYPE | MATCH_IRE_IHANDLE | MATCH_IRE_MASK; 4968 /* 4969 * ip_newroute calls ire_ftable_lookup with MATCH_IRE_ILL only 4970 * for on-link hosts. We should never be here for onlink. 4971 * Thus, use MATCH_IRE_ILL_GROUP. 4972 */ 4973 if (pire->ire_ipif != NULL) 4974 match_flags |= MATCH_IRE_ILL_GROUP; 4975 /* 4976 * We know that the mask of the interface ire equals cire->ire_cmask. 
4977 * (When ip_newroute() created 'cire' for the gateway it set its 4978 * cmask from the interface ire's mask) 4979 */ 4980 ire = ire_ftable_lookup(cire->ire_addr, cire->ire_cmask, 0, 4981 IRE_INTERFACE, pire->ire_ipif, NULL, ALL_ZONES, cire->ire_ihandle, 4982 NULL, match_flags); 4983 if (ire != NULL) 4984 return (ire); 4985 /* 4986 * If we didn't find an interface ire above, we can't declare failure. 4987 * For backwards compatibility, we need to support prefix routes 4988 * pointing to next hop gateways that are not on-link. 4989 * 4990 * Assume we are trying to ping some offlink destn, and we have the 4991 * routing table below. 4992 * 4993 * Eg. default - gw1 <--- pire (line 1) 4994 * gw1 - gw2 (line 2) 4995 * gw2 - hme0 (line 3) 4996 * 4997 * If we already have a cache ire for gw1 in 'cire', the 4998 * ire_ftable_lookup above would have failed, since there is no 4999 * interface ire to reach gw1. We will fallthru below. 5000 * 5001 * Here we duplicate the steps that ire_ftable_lookup() did in 5002 * getting 'cire' from 'pire', in the MATCH_IRE_RECURSIVE case. 5003 * The differences are the following 5004 * i. We want the interface ire only, so we call ire_ftable_lookup() 5005 * instead of ire_route_lookup() 5006 * ii. We look for only prefix routes in the 1st call below. 5007 * ii. We want to match on the ihandle in the 2nd call below. 5008 */ 5009 match_flags = MATCH_IRE_TYPE; 5010 if (pire->ire_ipif != NULL) 5011 match_flags |= MATCH_IRE_ILL_GROUP; 5012 ire = ire_ftable_lookup(pire->ire_gateway_addr, 0, 0, IRE_OFFSUBNET, 5013 pire->ire_ipif, NULL, ALL_ZONES, 0, NULL, match_flags); 5014 if (ire == NULL) 5015 return (NULL); 5016 /* 5017 * At this point 'ire' corresponds to the entry shown in line 2. 5018 * gw_addr is 'gw2' in the example above. 
5019 */ 5020 gw_addr = ire->ire_gateway_addr; 5021 gw_ipif = ire->ire_ipif; 5022 ire_refrele(ire); 5023 5024 match_flags |= MATCH_IRE_IHANDLE; 5025 ire = ire_ftable_lookup(gw_addr, 0, 0, IRE_INTERFACE, 5026 gw_ipif, NULL, ALL_ZONES, cire->ire_ihandle, NULL, match_flags); 5027 return (ire); 5028 } 5029 5030 /* 5031 * ire_mrtun_lookup() is called by ip_rput() when packet is to be 5032 * tunneled through reverse tunnel. This is only supported for 5033 * IPv4 packets 5034 */ 5035 5036 ire_t * 5037 ire_mrtun_lookup(ipaddr_t srcaddr, ill_t *ill) 5038 { 5039 irb_t *irb_ptr; 5040 ire_t *ire; 5041 5042 ASSERT(ill != NULL); 5043 ASSERT(!(ill->ill_isv6)); 5044 5045 if (ip_mrtun_table == NULL) 5046 return (NULL); 5047 irb_ptr = &ip_mrtun_table[IRE_ADDR_HASH(srcaddr, IP_MRTUN_TABLE_SIZE)]; 5048 rw_enter(&irb_ptr->irb_lock, RW_READER); 5049 for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { 5050 if (ire->ire_marks & IRE_MARK_CONDEMNED) 5051 continue; 5052 if ((ire->ire_in_src_addr == srcaddr) && 5053 ire->ire_in_ill == ill) { 5054 IRE_REFHOLD(ire); 5055 rw_exit(&irb_ptr->irb_lock); 5056 return (ire); 5057 } 5058 } 5059 rw_exit(&irb_ptr->irb_lock); 5060 return (NULL); 5061 } 5062 5063 /* 5064 * Return the IRE_LOOPBACK, IRE_IF_RESOLVER or IRE_IF_NORESOLVER 5065 * ire associated with the specified ipif. 5066 * 5067 * This might occasionally be called when IPIF_UP is not set since 5068 * the IP_MULTICAST_IF as well as creating interface routes 5069 * allows specifying a down ipif (ipif_lookup* match ipifs that are down). 5070 * 5071 * Note that if IPIF_NOLOCAL, IPIF_NOXMIT, or IPIF_DEPRECATED is set on 5072 * the ipif, this routine might return NULL. 
5073 */ 5074 ire_t * 5075 ipif_to_ire(const ipif_t *ipif) 5076 { 5077 ire_t *ire; 5078 5079 ASSERT(!ipif->ipif_isv6); 5080 if (ipif->ipif_ire_type == IRE_LOOPBACK) { 5081 ire = ire_ctable_lookup(ipif->ipif_lcl_addr, 0, IRE_LOOPBACK, 5082 ipif, ALL_ZONES, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF)); 5083 } else if (ipif->ipif_flags & IPIF_POINTOPOINT) { 5084 /* In this case we need to lookup destination address. */ 5085 ire = ire_ftable_lookup(ipif->ipif_pp_dst_addr, IP_HOST_MASK, 0, 5086 IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, NULL, 5087 (MATCH_IRE_TYPE | MATCH_IRE_IPIF | MATCH_IRE_MASK)); 5088 } else { 5089 ire = ire_ftable_lookup(ipif->ipif_subnet, 5090 ipif->ipif_net_mask, 0, IRE_INTERFACE, ipif, NULL, 5091 ALL_ZONES, 0, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF | 5092 MATCH_IRE_MASK)); 5093 } 5094 return (ire); 5095 } 5096 5097 /* 5098 * ire_walk function. 5099 * Count the number of IRE_CACHE entries in different categories. 5100 */ 5101 void 5102 ire_cache_count(ire_t *ire, char *arg) 5103 { 5104 ire_cache_count_t *icc = (ire_cache_count_t *)arg; 5105 5106 if (ire->ire_type != IRE_CACHE) 5107 return; 5108 5109 icc->icc_total++; 5110 5111 if (ire->ire_ipversion == IPV6_VERSION) { 5112 mutex_enter(&ire->ire_lock); 5113 if (IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6)) { 5114 mutex_exit(&ire->ire_lock); 5115 icc->icc_onlink++; 5116 return; 5117 } 5118 mutex_exit(&ire->ire_lock); 5119 } else { 5120 if (ire->ire_gateway_addr == 0) { 5121 icc->icc_onlink++; 5122 return; 5123 } 5124 } 5125 5126 ASSERT(ire->ire_ipif != NULL); 5127 if (ire->ire_max_frag < ire->ire_ipif->ipif_mtu) 5128 icc->icc_pmtu++; 5129 else if (ire->ire_tire_mark != ire->ire_ob_pkt_count + 5130 ire->ire_ib_pkt_count) 5131 icc->icc_offlink++; 5132 else 5133 icc->icc_unused++; 5134 } 5135 5136 /* 5137 * ire_walk function called by ip_trash_ire_reclaim(). 5138 * Free a fraction of the IRE_CACHE cache entries. The fractions are 5139 * different for different categories of IRE_CACHE entries. 
5140 * A fraction of zero means to not free any in that category. 5141 * Use the hash bucket id plus lbolt as a random number. Thus if the fraction 5142 * is N then every Nth hash bucket chain will be freed. 5143 */ 5144 void 5145 ire_cache_reclaim(ire_t *ire, char *arg) 5146 { 5147 ire_cache_reclaim_t *icr = (ire_cache_reclaim_t *)arg; 5148 uint_t rand; 5149 5150 if (ire->ire_type != IRE_CACHE) 5151 return; 5152 5153 if (ire->ire_ipversion == IPV6_VERSION) { 5154 rand = (uint_t)lbolt + 5155 IRE_ADDR_HASH_V6(ire->ire_addr_v6, ip6_cache_table_size); 5156 mutex_enter(&ire->ire_lock); 5157 if (IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6)) { 5158 mutex_exit(&ire->ire_lock); 5159 if (icr->icr_onlink != 0 && 5160 (rand/icr->icr_onlink)*icr->icr_onlink == rand) { 5161 ire_delete(ire); 5162 return; 5163 } 5164 goto done; 5165 } 5166 mutex_exit(&ire->ire_lock); 5167 } else { 5168 rand = (uint_t)lbolt + 5169 IRE_ADDR_HASH(ire->ire_addr, ip_cache_table_size); 5170 if (ire->ire_gateway_addr == 0) { 5171 if (icr->icr_onlink != 0 && 5172 (rand/icr->icr_onlink)*icr->icr_onlink == rand) { 5173 ire_delete(ire); 5174 return; 5175 } 5176 goto done; 5177 } 5178 } 5179 /* Not onlink IRE */ 5180 ASSERT(ire->ire_ipif != NULL); 5181 if (ire->ire_max_frag < ire->ire_ipif->ipif_mtu) { 5182 /* Use ptmu fraction */ 5183 if (icr->icr_pmtu != 0 && 5184 (rand/icr->icr_pmtu)*icr->icr_pmtu == rand) { 5185 ire_delete(ire); 5186 return; 5187 } 5188 } else if (ire->ire_tire_mark != ire->ire_ob_pkt_count + 5189 ire->ire_ib_pkt_count) { 5190 /* Use offlink fraction */ 5191 if (icr->icr_offlink != 0 && 5192 (rand/icr->icr_offlink)*icr->icr_offlink == rand) { 5193 ire_delete(ire); 5194 return; 5195 } 5196 } else { 5197 /* Use unused fraction */ 5198 if (icr->icr_unused != 0 && 5199 (rand/icr->icr_unused)*icr->icr_unused == rand) { 5200 ire_delete(ire); 5201 return; 5202 } 5203 } 5204 done: 5205 /* 5206 * Update tire_mark so that those that haven't been used since this 5207 * reclaim will be 
considered unused next time we reclaim. 5208 */ 5209 ire->ire_tire_mark = ire->ire_ob_pkt_count + ire->ire_ib_pkt_count; 5210 } 5211 5212 static void 5213 power2_roundup(uint32_t *value) 5214 { 5215 int i; 5216 5217 for (i = 1; i < 31; i++) { 5218 if (*value <= (1 << i)) 5219 break; 5220 } 5221 *value = (1 << i); 5222 } 5223 5224 void 5225 ip_ire_init() 5226 { 5227 int i; 5228 5229 mutex_init(&ire_ft_init_lock, NULL, MUTEX_DEFAULT, 0); 5230 mutex_init(&ire_handle_lock, NULL, MUTEX_DEFAULT, NULL); 5231 mutex_init(&ire_mrtun_lock, NULL, MUTEX_DEFAULT, NULL); 5232 mutex_init(&ire_srcif_table_lock, NULL, MUTEX_DEFAULT, NULL); 5233 mutex_init(&ndp4.ndp_g_lock, NULL, MUTEX_DEFAULT, NULL); 5234 5235 rn_init(); 5236 (void) rn_inithead((void **)&ip_ftable, 32); 5237 /* 5238 * mark kernel ip ftable with RNF_SUNW_FT flag. 5239 */ 5240 ip_ftable->rnh_treetop->rn_flags |= RNF_SUNW_FT; 5241 rt_entry_cache = kmem_cache_create("rt_entry", 5242 sizeof (struct rt_entry), 0, NULL, NULL, NULL, NULL, NULL, 0); 5243 5244 /* Calculate the IPv4 cache table size. */ 5245 ip_cache_table_size = MAX(ip_cache_table_size, 5246 ((kmem_avail() >> ip_ire_mem_ratio) / sizeof (ire_t) / 5247 ip_ire_max_bucket_cnt)); 5248 if (ip_cache_table_size > ip_max_cache_table_size) 5249 ip_cache_table_size = ip_max_cache_table_size; 5250 /* 5251 * Make sure that the table size is always a power of 2. The 5252 * hash macro IRE_ADDR_HASH() depends on that. 5253 */ 5254 power2_roundup(&ip_cache_table_size); 5255 5256 ip_cache_table = (irb_t *)kmem_zalloc(ip_cache_table_size * 5257 sizeof (irb_t), KM_SLEEP); 5258 5259 for (i = 0; i < ip_cache_table_size; i++) { 5260 rw_init(&ip_cache_table[i].irb_lock, NULL, 5261 RW_DEFAULT, NULL); 5262 } 5263 5264 /* Calculate the IPv6 cache table size. 
*/ 5265 ip6_cache_table_size = MAX(ip6_cache_table_size, 5266 ((kmem_avail() >> ip_ire_mem_ratio) / sizeof (ire_t) / 5267 ip6_ire_max_bucket_cnt)); 5268 if (ip6_cache_table_size > ip6_max_cache_table_size) 5269 ip6_cache_table_size = ip6_max_cache_table_size; 5270 /* 5271 * Make sure that the table size is always a power of 2. The 5272 * hash macro IRE_ADDR_HASH_V6() depends on that. 5273 */ 5274 power2_roundup(&ip6_cache_table_size); 5275 5276 ip_cache_table_v6 = (irb_t *)kmem_zalloc(ip6_cache_table_size * 5277 sizeof (irb_t), KM_SLEEP); 5278 5279 for (i = 0; i < ip6_cache_table_size; i++) { 5280 rw_init(&ip_cache_table_v6[i].irb_lock, NULL, 5281 RW_DEFAULT, NULL); 5282 } 5283 /* 5284 * Create ire caches, ire_reclaim() 5285 * will give IRE_CACHE back to system when needed. 5286 * This needs to be done here before anything else, since 5287 * ire_add() expects the cache to be created. 5288 */ 5289 ire_cache = kmem_cache_create("ire_cache", 5290 sizeof (ire_t), 0, ip_ire_constructor, 5291 ip_ire_destructor, ip_trash_ire_reclaim, NULL, NULL, 0); 5292 5293 /* 5294 * Initialize ip_mrtun_table to NULL now, it will be 5295 * populated by ip_rt_add if reverse tunnel is created 5296 */ 5297 ip_mrtun_table = NULL; 5298 5299 /* 5300 * Make sure that the forwarding table size is a power of 2. 5301 * The IRE*_ADDR_HASH() macroes depend on that. 
5302 */ 5303 power2_roundup(&ip6_ftable_hash_size); 5304 } 5305 5306 void 5307 ip_ire_fini() 5308 { 5309 int i; 5310 5311 mutex_destroy(&ire_ft_init_lock); 5312 mutex_destroy(&ire_handle_lock); 5313 mutex_destroy(&ndp4.ndp_g_lock); 5314 5315 rn_fini(); 5316 RADIX_NODE_HEAD_DESTROY(ip_ftable); 5317 kmem_cache_destroy(rt_entry_cache); 5318 5319 for (i = 0; i < ip_cache_table_size; i++) { 5320 rw_destroy(&ip_cache_table[i].irb_lock); 5321 } 5322 kmem_free(ip_cache_table, ip_cache_table_size * sizeof (irb_t)); 5323 5324 for (i = 0; i < ip6_cache_table_size; i++) { 5325 rw_destroy(&ip_cache_table_v6[i].irb_lock); 5326 } 5327 kmem_free(ip_cache_table_v6, ip6_cache_table_size * sizeof (irb_t)); 5328 5329 if (ip_mrtun_table != NULL) { 5330 for (i = 0; i < IP_MRTUN_TABLE_SIZE; i++) { 5331 rw_destroy(&ip_mrtun_table[i].irb_lock); 5332 } 5333 kmem_free(ip_mrtun_table, IP_MRTUN_TABLE_SIZE * sizeof (irb_t)); 5334 } 5335 kmem_cache_destroy(ire_cache); 5336 } 5337 5338 int 5339 ire_add_mrtun(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func) 5340 { 5341 ire_t *ire1; 5342 irb_t *irb_ptr; 5343 ire_t **irep; 5344 ire_t *ire; 5345 int i; 5346 uint_t max_frag; 5347 ill_t *stq_ill; 5348 int error; 5349 5350 ire = *ire_p; 5351 ASSERT(ire->ire_ipversion == IPV4_VERSION); 5352 /* Is ip_mrtun_table empty ? 
*/ 5353 5354 if (ip_mrtun_table == NULL) { 5355 /* create the mrtun table */ 5356 mutex_enter(&ire_mrtun_lock); 5357 if (ip_mrtun_table == NULL) { 5358 ip_mrtun_table = 5359 (irb_t *)kmem_zalloc(IP_MRTUN_TABLE_SIZE * 5360 sizeof (irb_t), KM_NOSLEEP); 5361 5362 if (ip_mrtun_table == NULL) { 5363 ip2dbg(("ire_add_mrtun: allocation failure\n")); 5364 mutex_exit(&ire_mrtun_lock); 5365 ire_refrele(ire); 5366 *ire_p = NULL; 5367 return (ENOMEM); 5368 } 5369 5370 for (i = 0; i < IP_MRTUN_TABLE_SIZE; i++) { 5371 rw_init(&ip_mrtun_table[i].irb_lock, NULL, 5372 RW_DEFAULT, NULL); 5373 } 5374 ip2dbg(("ire_add_mrtun: mrtun table is created\n")); 5375 } 5376 /* some other thread got it and created the table */ 5377 mutex_exit(&ire_mrtun_lock); 5378 } 5379 5380 /* 5381 * Check for duplicate in the bucket and insert in the table 5382 */ 5383 irb_ptr = &(ip_mrtun_table[IRE_ADDR_HASH(ire->ire_in_src_addr, 5384 IP_MRTUN_TABLE_SIZE)]); 5385 5386 /* 5387 * Start the atomic add of the ire. Grab the ill locks, 5388 * ill_g_usesrc_lock and the bucket lock. 5389 * 5390 * If ipif or ill is changing ire_atomic_start() may queue the 5391 * request and return EINPROGRESS. 5392 */ 5393 error = ire_atomic_start(irb_ptr, ire, q, mp, func); 5394 if (error != 0) { 5395 /* 5396 * We don't know whether it is a valid ipif or not. 5397 * So, set it to NULL. This assumes that the ire has not added 5398 * a reference to the ipif. 5399 */ 5400 ire->ire_ipif = NULL; 5401 ire_delete(ire); 5402 ip1dbg(("ire_add_mrtun: ire_atomic_start failed\n")); 5403 *ire_p = NULL; 5404 return (error); 5405 } 5406 for (ire1 = irb_ptr->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) { 5407 if (ire1->ire_marks & IRE_MARK_CONDEMNED) 5408 continue; 5409 /* has anyone inserted the route in the meanwhile ? 
*/ 5410 if (ire1->ire_in_ill == ire->ire_in_ill && 5411 ire1->ire_in_src_addr == ire->ire_in_src_addr) { 5412 ip1dbg(("ire_add_mrtun: Duplicate entry exists\n")); 5413 IRE_REFHOLD(ire1); 5414 ire_atomic_end(irb_ptr, ire); 5415 ire_delete(ire); 5416 /* Return the old ire */ 5417 *ire_p = ire1; 5418 return (0); 5419 } 5420 } 5421 5422 /* Atomically set the ire_max_frag */ 5423 max_frag = *ire->ire_max_fragp; 5424 ire->ire_max_fragp = NULL; 5425 ire->ire_max_frag = MIN(max_frag, IP_MAXPACKET); 5426 ASSERT(ire->ire_type != IRE_CACHE); 5427 irep = (ire_t **)irb_ptr; 5428 if (*irep != NULL) { 5429 /* Find the last ire which matches ire_in_src_addr */ 5430 ire1 = *irep; 5431 while (ire1->ire_in_src_addr == ire->ire_in_src_addr) { 5432 irep = &ire1->ire_next; 5433 ire1 = *irep; 5434 if (ire1 == NULL) 5435 break; 5436 } 5437 } 5438 ire1 = *irep; 5439 if (ire1 != NULL) 5440 ire1->ire_ptpn = &ire->ire_next; 5441 ire->ire_next = ire1; 5442 /* Link the new one in. */ 5443 ire->ire_ptpn = irep; 5444 membar_producer(); 5445 *irep = ire; 5446 ire->ire_bucket = irb_ptr; 5447 IRE_REFHOLD_LOCKED(ire); 5448 5449 ip2dbg(("ire_add_mrtun: created and linked ire %p\n", (void *)*irep)); 5450 5451 /* 5452 * Protect ire_mrtun_count and ill_mrtun_refcnt from 5453 * another thread trying to add ire in the table 5454 */ 5455 mutex_enter(&ire_mrtun_lock); 5456 ire_mrtun_count++; 5457 mutex_exit(&ire_mrtun_lock); 5458 /* 5459 * ill_mrtun_refcnt is protected by the ill_lock held via 5460 * ire_atomic_start 5461 */ 5462 ire->ire_in_ill->ill_mrtun_refcnt++; 5463 5464 if (ire->ire_ipif != NULL) { 5465 ire->ire_ipif->ipif_ire_cnt++; 5466 if (ire->ire_stq != NULL) { 5467 stq_ill = (ill_t *)ire->ire_stq->q_ptr; 5468 stq_ill->ill_ire_cnt++; 5469 } 5470 } else { 5471 ASSERT(ire->ire_stq == NULL); 5472 } 5473 5474 ire_atomic_end(irb_ptr, ire); 5475 ire_fastpath(ire); 5476 *ire_p = ire; 5477 return (0); 5478 } 5479 5480 5481 /* Walks down the mrtun table */ 5482 5483 void 5484 ire_walk_ill_mrtun(uint_t 
match_flags, uint_t ire_type, pfv_t func, void *arg, 5485 ill_t *ill) 5486 { 5487 irb_t *irb; 5488 ire_t *ire; 5489 int i; 5490 int ret; 5491 5492 ASSERT((!(match_flags & (MATCH_IRE_WQ | MATCH_IRE_ILL | 5493 MATCH_IRE_ILL_GROUP))) || (ill != NULL)); 5494 ASSERT(match_flags == 0 || ire_type == IRE_MIPRTUN); 5495 5496 mutex_enter(&ire_mrtun_lock); 5497 if (ire_mrtun_count == 0) { 5498 mutex_exit(&ire_mrtun_lock); 5499 return; 5500 } 5501 mutex_exit(&ire_mrtun_lock); 5502 5503 ip2dbg(("ire_walk_ill_mrtun:walking the reverse tunnel table \n")); 5504 for (i = 0; i < IP_MRTUN_TABLE_SIZE; i++) { 5505 5506 irb = &(ip_mrtun_table[i]); 5507 if (irb->irb_ire == NULL) 5508 continue; 5509 IRB_REFHOLD(irb); 5510 for (ire = irb->irb_ire; ire != NULL; 5511 ire = ire->ire_next) { 5512 ASSERT(ire->ire_ipversion == IPV4_VERSION); 5513 if (match_flags != 0) { 5514 ret = ire_walk_ill_match( 5515 match_flags, ire_type, 5516 ire, ill, ALL_ZONES); 5517 } 5518 if (match_flags == 0 || ret) 5519 (*func)(ire, arg); 5520 } 5521 IRB_REFRELE(irb); 5522 } 5523 } 5524 5525 /* 5526 * Source interface based lookup routine (IPV4 only). 5527 * This routine is called only when RTA_SRCIFP bitflag is set 5528 * by routing socket while adding/deleting the route and it is 5529 * also called from ip_rput() when packets arrive from an interface 5530 * for which ill_srcif_ref_cnt is positive. This function is useful 5531 * when a packet coming from one interface must be forwarded to another 5532 * designated interface to reach the correct node. This function is also 5533 * called from ip_newroute when the link-layer address of an ire is resolved. 5534 * We need to make sure that ip_newroute searches for IRE_IF_RESOLVER type 5535 * ires--thus the ire_type parameter is needed. 
5536 */ 5537 5538 ire_t * 5539 ire_srcif_table_lookup(ipaddr_t dst_addr, int ire_type, ipif_t *ipif, 5540 ill_t *in_ill, int flags) 5541 { 5542 irb_t *irb_ptr; 5543 ire_t *ire; 5544 irb_t *ire_srcif_table; 5545 5546 ASSERT(in_ill != NULL && !in_ill->ill_isv6); 5547 ASSERT(!(flags & (MATCH_IRE_ILL|MATCH_IRE_ILL_GROUP)) || 5548 (ipif != NULL && !ipif->ipif_isv6)); 5549 5550 /* 5551 * No need to lock the ill since it is refheld by the caller of this 5552 * function 5553 */ 5554 if (in_ill->ill_srcif_table == NULL) { 5555 return (NULL); 5556 } 5557 5558 if (!(flags & MATCH_IRE_TYPE)) { 5559 flags |= MATCH_IRE_TYPE; 5560 ire_type = IRE_INTERFACE; 5561 } 5562 ire_srcif_table = in_ill->ill_srcif_table; 5563 irb_ptr = &ire_srcif_table[IRE_ADDR_HASH(dst_addr, 5564 IP_SRCIF_TABLE_SIZE)]; 5565 rw_enter(&irb_ptr->irb_lock, RW_READER); 5566 for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { 5567 if (ire->ire_marks & IRE_MARK_CONDEMNED) 5568 continue; 5569 if (ire_match_args(ire, dst_addr, ire->ire_mask, 0, 5570 ire_type, ipif, ire->ire_zoneid, 0, NULL, flags)) { 5571 IRE_REFHOLD(ire); 5572 rw_exit(&irb_ptr->irb_lock); 5573 return (ire); 5574 } 5575 } 5576 /* Not Found */ 5577 rw_exit(&irb_ptr->irb_lock); 5578 return (NULL); 5579 } 5580 5581 5582 /* 5583 * Adds the ire into the special routing table which is hanging off of 5584 * the src_ipif->ipif_ill. It also increments the refcnt in the ill. 5585 * The forward table contains only IRE_IF_RESOLVER, IRE_IF_NORESOLVER 5586 * i,e. IRE_INTERFACE entries. Originally the dlureq_mp field is NULL 5587 * for IRE_IF_RESOLVER entry because we do not have the dst_addr's 5588 * link-layer address at the time of addition. 5589 * Upon resolving the address from ARP, dlureq_mp field is updated with 5590 * proper information in ire_update_srcif_v4. 
5591 */ 5592 static int 5593 ire_add_srcif_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func) 5594 { 5595 ire_t *ire1; 5596 irb_t *ire_srcifp_table = NULL; 5597 irb_t *irb_ptr = NULL; 5598 ire_t **irep; 5599 ire_t *ire; 5600 int flags; 5601 int i; 5602 ill_t *stq_ill; 5603 uint_t max_frag; 5604 int error = 0; 5605 5606 ire = *ire_p; 5607 ASSERT(ire->ire_in_ill != NULL); 5608 ASSERT(ire->ire_ipversion == IPV4_VERSION); 5609 ASSERT(ire->ire_type == IRE_IF_NORESOLVER || 5610 ire->ire_type == IRE_IF_RESOLVER); 5611 5612 ire->ire_mask = IP_HOST_MASK; 5613 /* 5614 * Update ire_nce->nce_res_mp with NULL value upon creation; 5615 * first free the default res_mp created by ire_nce_init. 5616 */ 5617 freeb(ire->ire_nce->nce_res_mp); 5618 if (ire->ire_type == IRE_IF_RESOLVER) { 5619 /* 5620 * assign NULL now, it will be updated 5621 * with correct value upon returning from 5622 * ARP 5623 */ 5624 ire->ire_nce->nce_res_mp = NULL; 5625 } else { 5626 ire->ire_nce->nce_res_mp = ill_dlur_gen(NULL, 5627 ire->ire_ipif->ipif_ill->ill_phys_addr_length, 5628 ire->ire_ipif->ipif_ill->ill_sap, 5629 ire->ire_ipif->ipif_ill->ill_sap_length); 5630 } 5631 /* Make sure the address is properly masked. 
*/ 5632 ire->ire_addr &= ire->ire_mask; 5633 5634 ASSERT(ire->ire_max_fragp != NULL); 5635 max_frag = *ire->ire_max_fragp; 5636 ire->ire_max_fragp = NULL; 5637 ire->ire_max_frag = MIN(max_frag, IP_MAXPACKET); 5638 5639 mutex_enter(&ire->ire_in_ill->ill_lock); 5640 if (ire->ire_in_ill->ill_srcif_table == NULL) { 5641 /* create the incoming interface based table */ 5642 ire->ire_in_ill->ill_srcif_table = 5643 (irb_t *)kmem_zalloc(IP_SRCIF_TABLE_SIZE * 5644 sizeof (irb_t), KM_NOSLEEP); 5645 if (ire->ire_in_ill->ill_srcif_table == NULL) { 5646 ip1dbg(("ire_add_srcif_v4: Allocation fail\n")); 5647 mutex_exit(&ire->ire_in_ill->ill_lock); 5648 ire_delete(ire); 5649 *ire_p = NULL; 5650 return (ENOMEM); 5651 } 5652 ire_srcifp_table = ire->ire_in_ill->ill_srcif_table; 5653 for (i = 0; i < IP_SRCIF_TABLE_SIZE; i++) { 5654 rw_init(&ire_srcifp_table[i].irb_lock, NULL, 5655 RW_DEFAULT, NULL); 5656 } 5657 ip2dbg(("ire_add_srcif_v4: table created for ill %p\n", 5658 (void *)ire->ire_in_ill)); 5659 } 5660 /* Check for duplicate and insert */ 5661 ASSERT(ire->ire_in_ill->ill_srcif_table != NULL); 5662 irb_ptr = 5663 &(ire->ire_in_ill->ill_srcif_table[IRE_ADDR_HASH(ire->ire_addr, 5664 IP_SRCIF_TABLE_SIZE)]); 5665 mutex_exit(&ire->ire_in_ill->ill_lock); 5666 flags = (MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW); 5667 flags |= MATCH_IRE_IPIF; 5668 5669 /* 5670 * Start the atomic add of the ire. Grab the ill locks, 5671 * ill_g_usesrc_lock and the bucket lock. 5672 * 5673 * If ipif or ill is changing ire_atomic_start() may queue the 5674 * request and return EINPROGRESS. 5675 */ 5676 error = ire_atomic_start(irb_ptr, ire, q, mp, func); 5677 if (error != 0) { 5678 /* 5679 * We don't know whether it is a valid ipif or not. 5680 * So, set it to NULL. This assumes that the ire has not added 5681 * a reference to the ipif. 
5682 */ 5683 ire->ire_ipif = NULL; 5684 ire_delete(ire); 5685 ip1dbg(("ire_add_srcif_v4: ire_atomic_start failed\n")); 5686 *ire_p = NULL; 5687 return (error); 5688 } 5689 for (ire1 = irb_ptr->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) { 5690 if (ire1->ire_marks & IRE_MARK_CONDEMNED) 5691 continue; 5692 if (ire1->ire_zoneid != ire->ire_zoneid) 5693 continue; 5694 /* Has anyone inserted route in the meanwhile ? */ 5695 if (ire_match_args(ire1, ire->ire_addr, ire->ire_mask, 0, 5696 ire->ire_type, ire->ire_ipif, ire->ire_zoneid, 0, NULL, 5697 flags)) { 5698 ip1dbg(("ire_add_srcif_v4 : Duplicate entry exists\n")); 5699 IRE_REFHOLD(ire1); 5700 ire_atomic_end(irb_ptr, ire); 5701 ire_delete(ire); 5702 /* Return old ire as in ire_add_v4 */ 5703 *ire_p = ire1; 5704 return (0); 5705 } 5706 } 5707 irep = (ire_t **)irb_ptr; 5708 if (*irep != NULL) { 5709 /* Find the last ire which matches ire_addr */ 5710 ire1 = *irep; 5711 while (ire1->ire_addr == ire->ire_addr) { 5712 irep = &ire1->ire_next; 5713 ire1 = *irep; 5714 if (ire1 == NULL) 5715 break; 5716 } 5717 } 5718 ire1 = *irep; 5719 if (ire1 != NULL) 5720 ire1->ire_ptpn = &ire->ire_next; 5721 ire->ire_next = ire1; 5722 /* Link the new one in. */ 5723 ire->ire_ptpn = irep; 5724 membar_producer(); 5725 *irep = ire; 5726 ire->ire_bucket = irb_ptr; 5727 IRE_REFHOLD_LOCKED(ire); 5728 5729 /* 5730 * Protect ire_in_ill->ill_srcif_refcnt and table reference count. 5731 * Note, ire_atomic_start already grabs the ire_in_ill->ill_lock 5732 * so ill_srcif_refcnt is already protected. 
5733 */ 5734 ire->ire_in_ill->ill_srcif_refcnt++; 5735 mutex_enter(&ire_srcif_table_lock); 5736 ire_srcif_table_count++; 5737 mutex_exit(&ire_srcif_table_lock); 5738 irb_ptr->irb_ire_cnt++; 5739 if (ire->ire_ipif != NULL) { 5740 ire->ire_ipif->ipif_ire_cnt++; 5741 if (ire->ire_stq != NULL) { 5742 stq_ill = (ill_t *)ire->ire_stq->q_ptr; 5743 stq_ill->ill_ire_cnt++; 5744 } 5745 } else { 5746 ASSERT(ire->ire_stq == NULL); 5747 } 5748 5749 ire_atomic_end(irb_ptr, ire); 5750 *ire_p = ire; 5751 return (0); 5752 } 5753 5754 5755 /* 5756 * This function is called by ire_add_then_send when ARP request comes 5757 * back to ip_wput->ire_add_then_send for resolved ire in the interface 5758 * based routing table. At this point, it only needs to update the resolver 5759 * information for the ire. The passed ire is returned to the caller as it 5760 * is the ire which is created as mblk. 5761 */ 5762 5763 static ire_t * 5764 ire_update_srcif_v4(ire_t *ire) 5765 { 5766 ire_t *ire1; 5767 irb_t *irb; 5768 int error; 5769 5770 ASSERT(ire->ire_type != IRE_MIPRTUN && 5771 ire->ire_ipif->ipif_net_type == IRE_IF_RESOLVER); 5772 ASSERT(ire->ire_ipversion == IPV4_VERSION); 5773 5774 /* 5775 * This ire is from ARP. Update 5776 * ire_nce->nce_res_mp info 5777 */ 5778 ire1 = ire_srcif_table_lookup(ire->ire_addr, 5779 IRE_IF_RESOLVER, ire->ire_ipif, 5780 ire->ire_in_ill, 5781 MATCH_IRE_ILL | MATCH_IRE_TYPE); 5782 if (ire1 == NULL) { 5783 /* Mobile node registration expired ? */ 5784 ire_delete(ire); 5785 return (NULL); 5786 } 5787 irb = ire1->ire_bucket; 5788 ASSERT(irb != NULL); 5789 /* 5790 * Start the atomic add of the ire. Grab the ill locks, 5791 * ill_g_usesrc_lock and the bucket lock. 5792 */ 5793 error = ire_atomic_start(irb, ire1, NULL, NULL, NULL); 5794 if (error != 0) { 5795 /* 5796 * We don't know whether it is a valid ipif or not. 5797 * So, set it to NULL. This assumes that the ire has not added 5798 * a reference to the ipif. 
5799 */ 5800 ire->ire_ipif = NULL; 5801 ire_delete(ire); 5802 ip1dbg(("ire_update_srcif_v4: ire_atomic_start failed\n")); 5803 return (NULL); 5804 } 5805 ASSERT(ire->ire_max_fragp == NULL); 5806 ire->ire_max_frag = ire1->ire_max_frag; 5807 /* 5808 * Update resolver information and 5809 * send-to queue. 5810 */ 5811 ASSERT(ire->ire_nce->nce_res_mp != NULL); 5812 ire1->ire_nce->nce_res_mp = copyb(ire->ire_nce->nce_res_mp); 5813 if (ire1->ire_nce->nce_res_mp == NULL) { 5814 ip0dbg(("ire_update_srcif: copyb failed\n")); 5815 ire_refrele(ire1); 5816 ire_refrele(ire); 5817 ire_atomic_end(irb, ire1); 5818 return (NULL); 5819 } 5820 ire1->ire_stq = ire->ire_stq; 5821 5822 ASSERT(ire->ire_nce->nce_fp_mp == NULL); 5823 5824 ire_atomic_end(irb, ire1); 5825 ire_refrele(ire1); 5826 /* Return the passed ire */ 5827 return (ire); /* Update done */ 5828 } 5829 5830 5831 /* 5832 * Check if another multirt route resolution is needed. 5833 * B_TRUE is returned is there remain a resolvable route, 5834 * or if no route for that dst is resolved yet. 5835 * B_FALSE is returned if all routes for that dst are resolved 5836 * or if the remaining unresolved routes are actually not 5837 * resolvable. 5838 * This only works in the global zone. 5839 */ 5840 boolean_t 5841 ire_multirt_need_resolve(ipaddr_t dst, const ts_label_t *tsl) 5842 { 5843 ire_t *first_fire; 5844 ire_t *first_cire; 5845 ire_t *fire; 5846 ire_t *cire; 5847 irb_t *firb; 5848 irb_t *cirb; 5849 int unres_cnt = 0; 5850 boolean_t resolvable = B_FALSE; 5851 5852 /* Retrieve the first IRE_HOST that matches the destination */ 5853 first_fire = ire_ftable_lookup(dst, IP_HOST_MASK, 0, IRE_HOST, NULL, 5854 NULL, ALL_ZONES, 0, tsl, 5855 MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_SECATTR); 5856 5857 /* No route at all */ 5858 if (first_fire == NULL) { 5859 return (B_TRUE); 5860 } 5861 5862 firb = first_fire->ire_bucket; 5863 ASSERT(firb != NULL); 5864 5865 /* Retrieve the first IRE_CACHE ire for that destination. 
*/ 5866 first_cire = ire_cache_lookup(dst, GLOBAL_ZONEID, tsl); 5867 5868 /* No resolved route. */ 5869 if (first_cire == NULL) { 5870 ire_refrele(first_fire); 5871 return (B_TRUE); 5872 } 5873 5874 /* 5875 * At least one route is resolved. Here we look through the forward 5876 * and cache tables, to compare the number of declared routes 5877 * with the number of resolved routes. The search for a resolvable 5878 * route is performed only if at least one route remains 5879 * unresolved. 5880 */ 5881 cirb = first_cire->ire_bucket; 5882 ASSERT(cirb != NULL); 5883 5884 /* Count the number of routes to that dest that are declared. */ 5885 IRB_REFHOLD(firb); 5886 for (fire = first_fire; fire != NULL; fire = fire->ire_next) { 5887 if (!(fire->ire_flags & RTF_MULTIRT)) 5888 continue; 5889 if (fire->ire_addr != dst) 5890 continue; 5891 unres_cnt++; 5892 } 5893 IRB_REFRELE(firb); 5894 5895 /* Then subtract the number of routes to that dst that are resolved */ 5896 IRB_REFHOLD(cirb); 5897 for (cire = first_cire; cire != NULL; cire = cire->ire_next) { 5898 if (!(cire->ire_flags & RTF_MULTIRT)) 5899 continue; 5900 if (cire->ire_addr != dst) 5901 continue; 5902 if (cire->ire_marks & (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN)) 5903 continue; 5904 unres_cnt--; 5905 } 5906 IRB_REFRELE(cirb); 5907 5908 /* At least one route is unresolved; search for a resolvable route. */ 5909 if (unres_cnt > 0) 5910 resolvable = ire_multirt_lookup(&first_cire, &first_fire, 5911 MULTIRT_USESTAMP | MULTIRT_CACHEGW, tsl); 5912 5913 if (first_fire != NULL) 5914 ire_refrele(first_fire); 5915 5916 if (first_cire != NULL) 5917 ire_refrele(first_cire); 5918 5919 return (resolvable); 5920 } 5921 5922 5923 /* 5924 * Explore a forward_table bucket, starting from fire_arg. 5925 * fire_arg MUST be an IRE_HOST entry. 5926 * 5927 * Return B_TRUE and update *ire_arg and *fire_arg 5928 * if at least one resolvable route is found. *ire_arg 5929 * is the IRE entry for *fire_arg's gateway. 
*
 * Return B_FALSE otherwise (all routes are resolved or
 * the remaining unresolved routes are all unresolvable).
 *
 * The IRE selection relies on a priority mechanism
 * driven by the flags passed in by the caller.
 * The caller, such as ip_newroute_ipif(), can get the most
 * relevant ire at each stage of a multiple route resolution.
 *
 * The rules are:
 *
 * - if MULTIRT_CACHEGW is specified in flags, IRE_CACHETABLE
 *   ires are preferred for the gateway. This gives the highest
 *   priority to routes that can be resolved without using
 *   a resolver.
 *
 * - if MULTIRT_CACHEGW is not specified, or if MULTIRT_CACHEGW
 *   is specified but no IRE_CACHETABLE ire entry for the gateway
 *   is found, the following rules apply.
 *
 * - if MULTIRT_USESTAMP is specified in flags, IRE_INTERFACE
 *   ires for the gateway, that have not been tried for
 *   a configurable amount of time, are preferred.
 *   This applies when a resolver must be invoked for
 *   a missing route, but we don't want to use the resolver
 *   upon each packet emission. If no such resolver is found,
 *   B_FALSE is returned.
 *   The MULTIRT_USESTAMP flag can be combined with
 *   MULTIRT_CACHEGW.
 *
 * - if MULTIRT_USESTAMP is not specified in flags, the first
 *   unresolved but resolvable route is selected.
 *
 * - Otherwise, there is no resolvable route, and
 *   B_FALSE is returned.
 *
 * Finally, MULTIRT_SETSTAMP can be specified in flags to
 * request the timestamp of unresolvable routes to
 * be refreshed. This prevents the useless exploration
 * of those routes for a while, when MULTIRT_USESTAMP is used.
 *
 * This only works in the global zone.
5972 */ 5973 boolean_t 5974 ire_multirt_lookup(ire_t **ire_arg, ire_t **fire_arg, uint32_t flags, 5975 const ts_label_t *tsl) 5976 { 5977 clock_t delta; 5978 ire_t *best_fire = NULL; 5979 ire_t *best_cire = NULL; 5980 ire_t *first_fire; 5981 ire_t *first_cire; 5982 ire_t *fire; 5983 ire_t *cire; 5984 irb_t *firb = NULL; 5985 irb_t *cirb = NULL; 5986 ire_t *gw_ire; 5987 boolean_t already_resolved; 5988 boolean_t res; 5989 ipaddr_t dst; 5990 ipaddr_t gw; 5991 5992 ip2dbg(("ire_multirt_lookup: *ire_arg %p, *fire_arg %p, flags %04x\n", 5993 (void *)*ire_arg, (void *)*fire_arg, flags)); 5994 5995 ASSERT(ire_arg != NULL); 5996 ASSERT(fire_arg != NULL); 5997 5998 /* Not an IRE_HOST ire; give up. */ 5999 if ((*fire_arg == NULL) || ((*fire_arg)->ire_type != IRE_HOST)) { 6000 return (B_FALSE); 6001 } 6002 6003 /* This is the first IRE_HOST ire for that destination. */ 6004 first_fire = *fire_arg; 6005 firb = first_fire->ire_bucket; 6006 ASSERT(firb != NULL); 6007 6008 dst = first_fire->ire_addr; 6009 6010 ip2dbg(("ire_multirt_lookup: dst %08x\n", ntohl(dst))); 6011 6012 /* 6013 * Retrieve the first IRE_CACHE ire for that destination; 6014 * if we don't find one, no route for that dest is 6015 * resolved yet. 6016 */ 6017 first_cire = ire_cache_lookup(dst, GLOBAL_ZONEID, tsl); 6018 if (first_cire != NULL) { 6019 cirb = first_cire->ire_bucket; 6020 } 6021 6022 ip2dbg(("ire_multirt_lookup: first_cire %p\n", (void *)first_cire)); 6023 6024 /* 6025 * Search for a resolvable route, giving the top priority 6026 * to routes that can be resolved without any call to the resolver. 6027 */ 6028 IRB_REFHOLD(firb); 6029 6030 if (!CLASSD(dst)) { 6031 /* 6032 * For all multiroute IRE_HOST ires for that destination, 6033 * check if the route via the IRE_HOST's gateway is 6034 * resolved yet. 
6035 */ 6036 for (fire = first_fire; fire != NULL; fire = fire->ire_next) { 6037 6038 if (!(fire->ire_flags & RTF_MULTIRT)) 6039 continue; 6040 if (fire->ire_addr != dst) 6041 continue; 6042 6043 if (fire->ire_gw_secattr != NULL && 6044 tsol_ire_match_gwattr(fire, tsl) != 0) { 6045 continue; 6046 } 6047 6048 gw = fire->ire_gateway_addr; 6049 6050 ip2dbg(("ire_multirt_lookup: fire %p, " 6051 "ire_addr %08x, ire_gateway_addr %08x\n", 6052 (void *)fire, ntohl(fire->ire_addr), ntohl(gw))); 6053 6054 already_resolved = B_FALSE; 6055 6056 if (first_cire != NULL) { 6057 ASSERT(cirb != NULL); 6058 6059 IRB_REFHOLD(cirb); 6060 /* 6061 * For all IRE_CACHE ires for that 6062 * destination. 6063 */ 6064 for (cire = first_cire; 6065 cire != NULL; 6066 cire = cire->ire_next) { 6067 6068 if (!(cire->ire_flags & RTF_MULTIRT)) 6069 continue; 6070 if (cire->ire_addr != dst) 6071 continue; 6072 if (cire->ire_marks & 6073 (IRE_MARK_CONDEMNED | 6074 IRE_MARK_HIDDEN)) 6075 continue; 6076 6077 if (cire->ire_gw_secattr != NULL && 6078 tsol_ire_match_gwattr(cire, 6079 tsl) != 0) { 6080 continue; 6081 } 6082 6083 /* 6084 * Check if the IRE_CACHE's gateway 6085 * matches the IRE_HOST's gateway. 6086 */ 6087 if (cire->ire_gateway_addr == gw) { 6088 already_resolved = B_TRUE; 6089 break; 6090 } 6091 } 6092 IRB_REFRELE(cirb); 6093 } 6094 6095 /* 6096 * This route is already resolved; 6097 * proceed with next one. 6098 */ 6099 if (already_resolved) { 6100 ip2dbg(("ire_multirt_lookup: found cire %p, " 6101 "already resolved\n", (void *)cire)); 6102 continue; 6103 } 6104 6105 /* 6106 * The route is unresolved; is it actually 6107 * resolvable, i.e. is there a cache or a resolver 6108 * for the gateway? 
6109 */ 6110 gw_ire = ire_route_lookup(gw, 0, 0, 0, NULL, NULL, 6111 ALL_ZONES, tsl, 6112 MATCH_IRE_RECURSIVE | MATCH_IRE_SECATTR); 6113 6114 ip2dbg(("ire_multirt_lookup: looked up gw_ire %p\n", 6115 (void *)gw_ire)); 6116 6117 /* 6118 * If gw_ire is typed IRE_CACHETABLE, 6119 * this route can be resolved without any call to the 6120 * resolver. If the MULTIRT_CACHEGW flag is set, 6121 * give the top priority to this ire and exit the 6122 * loop. 6123 * This is typically the case when an ARP reply 6124 * is processed through ip_wput_nondata(). 6125 */ 6126 if ((flags & MULTIRT_CACHEGW) && 6127 (gw_ire != NULL) && 6128 (gw_ire->ire_type & IRE_CACHETABLE)) { 6129 ASSERT(gw_ire->ire_nce == NULL || 6130 gw_ire->ire_nce->nce_state == ND_REACHABLE); 6131 /* 6132 * Release the resolver associated to the 6133 * previous candidate best ire, if any. 6134 */ 6135 if (best_cire != NULL) { 6136 ire_refrele(best_cire); 6137 ASSERT(best_fire != NULL); 6138 } 6139 6140 best_fire = fire; 6141 best_cire = gw_ire; 6142 6143 ip2dbg(("ire_multirt_lookup: found top prio " 6144 "best_fire %p, best_cire %p\n", 6145 (void *)best_fire, (void *)best_cire)); 6146 break; 6147 } 6148 6149 /* 6150 * Compute the time elapsed since our preceding 6151 * attempt to resolve that route. 6152 * If the MULTIRT_USESTAMP flag is set, we take that 6153 * route into account only if this time interval 6154 * exceeds ip_multirt_resolution_interval; 6155 * this prevents us from attempting to resolve a 6156 * broken route upon each sending of a packet. 
6157 */ 6158 delta = lbolt - fire->ire_last_used_time; 6159 delta = TICK_TO_MSEC(delta); 6160 6161 res = (boolean_t) 6162 ((delta > ip_multirt_resolution_interval) || 6163 (!(flags & MULTIRT_USESTAMP))); 6164 6165 ip2dbg(("ire_multirt_lookup: fire %p, delta %lu, " 6166 "res %d\n", 6167 (void *)fire, delta, res)); 6168 6169 if (res) { 6170 /* 6171 * We are here if MULTIRT_USESTAMP flag is set 6172 * and the resolver for fire's gateway 6173 * has not been tried since 6174 * ip_multirt_resolution_interval, or if 6175 * MULTIRT_USESTAMP is not set but gw_ire did 6176 * not fill the conditions for MULTIRT_CACHEGW, 6177 * or if neither MULTIRT_USESTAMP nor 6178 * MULTIRT_CACHEGW are set. 6179 */ 6180 if (gw_ire != NULL) { 6181 if (best_fire == NULL) { 6182 ASSERT(best_cire == NULL); 6183 6184 best_fire = fire; 6185 best_cire = gw_ire; 6186 6187 ip2dbg(("ire_multirt_lookup:" 6188 "found candidate " 6189 "best_fire %p, " 6190 "best_cire %p\n", 6191 (void *)best_fire, 6192 (void *)best_cire)); 6193 6194 /* 6195 * If MULTIRT_CACHEGW is not 6196 * set, we ignore the top 6197 * priority ires that can 6198 * be resolved without any 6199 * call to the resolver; 6200 * In that case, there is 6201 * actually no need 6202 * to continue the loop. 6203 */ 6204 if (!(flags & 6205 MULTIRT_CACHEGW)) { 6206 break; 6207 } 6208 continue; 6209 } 6210 } else { 6211 /* 6212 * No resolver for the gateway: the 6213 * route is not resolvable. 6214 * If the MULTIRT_SETSTAMP flag is 6215 * set, we stamp the IRE_HOST ire, 6216 * so we will not select it again 6217 * during this resolution interval. 
6218 */ 6219 if (flags & MULTIRT_SETSTAMP) 6220 fire->ire_last_used_time = 6221 lbolt; 6222 } 6223 } 6224 6225 if (gw_ire != NULL) 6226 ire_refrele(gw_ire); 6227 } 6228 } else { /* CLASSD(dst) */ 6229 6230 for (fire = first_fire; 6231 fire != NULL; 6232 fire = fire->ire_next) { 6233 6234 if (!(fire->ire_flags & RTF_MULTIRT)) 6235 continue; 6236 if (fire->ire_addr != dst) 6237 continue; 6238 6239 if (fire->ire_gw_secattr != NULL && 6240 tsol_ire_match_gwattr(fire, tsl) != 0) { 6241 continue; 6242 } 6243 6244 already_resolved = B_FALSE; 6245 6246 gw = fire->ire_gateway_addr; 6247 6248 gw_ire = ire_ftable_lookup(gw, 0, 0, IRE_INTERFACE, 6249 NULL, NULL, ALL_ZONES, 0, tsl, 6250 MATCH_IRE_RECURSIVE | MATCH_IRE_TYPE | 6251 MATCH_IRE_SECATTR); 6252 6253 /* No resolver for the gateway; we skip this ire. */ 6254 if (gw_ire == NULL) { 6255 continue; 6256 } 6257 ASSERT(gw_ire->ire_nce == NULL || 6258 gw_ire->ire_nce->nce_state == ND_REACHABLE); 6259 6260 if (first_cire != NULL) { 6261 6262 IRB_REFHOLD(cirb); 6263 /* 6264 * For all IRE_CACHE ires for that 6265 * destination. 6266 */ 6267 for (cire = first_cire; 6268 cire != NULL; 6269 cire = cire->ire_next) { 6270 6271 if (!(cire->ire_flags & RTF_MULTIRT)) 6272 continue; 6273 if (cire->ire_addr != dst) 6274 continue; 6275 if (cire->ire_marks & 6276 (IRE_MARK_CONDEMNED | 6277 IRE_MARK_HIDDEN)) 6278 continue; 6279 6280 if (cire->ire_gw_secattr != NULL && 6281 tsol_ire_match_gwattr(cire, 6282 tsl) != 0) { 6283 continue; 6284 } 6285 6286 /* 6287 * Cache entries are linked to the 6288 * parent routes using the parent handle 6289 * (ire_phandle). If no cache entry has 6290 * the same handle as fire, fire is 6291 * still unresolved. 6292 */ 6293 ASSERT(cire->ire_phandle != 0); 6294 if (cire->ire_phandle == 6295 fire->ire_phandle) { 6296 already_resolved = B_TRUE; 6297 break; 6298 } 6299 } 6300 IRB_REFRELE(cirb); 6301 } 6302 6303 /* 6304 * This route is already resolved; proceed with 6305 * next one. 
6306 */ 6307 if (already_resolved) { 6308 ire_refrele(gw_ire); 6309 continue; 6310 } 6311 6312 /* 6313 * Compute the time elapsed since our preceding 6314 * attempt to resolve that route. 6315 * If the MULTIRT_USESTAMP flag is set, we take 6316 * that route into account only if this time 6317 * interval exceeds ip_multirt_resolution_interval; 6318 * this prevents us from attempting to resolve a 6319 * broken route upon each sending of a packet. 6320 */ 6321 delta = lbolt - fire->ire_last_used_time; 6322 delta = TICK_TO_MSEC(delta); 6323 6324 res = (boolean_t) 6325 ((delta > ip_multirt_resolution_interval) || 6326 (!(flags & MULTIRT_USESTAMP))); 6327 6328 ip3dbg(("ire_multirt_lookup: fire %p, delta %lx, " 6329 "flags %04x, res %d\n", 6330 (void *)fire, delta, flags, res)); 6331 6332 if (res) { 6333 if (best_cire != NULL) { 6334 /* 6335 * Release the resolver associated 6336 * to the preceding candidate best 6337 * ire, if any. 6338 */ 6339 ire_refrele(best_cire); 6340 ASSERT(best_fire != NULL); 6341 } 6342 best_fire = fire; 6343 best_cire = gw_ire; 6344 continue; 6345 } 6346 6347 ire_refrele(gw_ire); 6348 } 6349 } 6350 6351 if (best_fire != NULL) { 6352 IRE_REFHOLD(best_fire); 6353 } 6354 IRB_REFRELE(firb); 6355 6356 /* Release the first IRE_CACHE we initially looked up, if any. */ 6357 if (first_cire != NULL) 6358 ire_refrele(first_cire); 6359 6360 /* Found a resolvable route. */ 6361 if (best_fire != NULL) { 6362 ASSERT(best_cire != NULL); 6363 6364 if (*fire_arg != NULL) 6365 ire_refrele(*fire_arg); 6366 if (*ire_arg != NULL) 6367 ire_refrele(*ire_arg); 6368 6369 /* 6370 * Update the passed-in arguments with the 6371 * resolvable multirt route we found. 
6372 */ 6373 *fire_arg = best_fire; 6374 *ire_arg = best_cire; 6375 6376 ip2dbg(("ire_multirt_lookup: returning B_TRUE, " 6377 "*fire_arg %p, *ire_arg %p\n", 6378 (void *)best_fire, (void *)best_cire)); 6379 6380 return (B_TRUE); 6381 } 6382 6383 ASSERT(best_cire == NULL); 6384 6385 ip2dbg(("ire_multirt_lookup: returning B_FALSE, *fire_arg %p, " 6386 "*ire_arg %p\n", 6387 (void *)*fire_arg, (void *)*ire_arg)); 6388 6389 /* No resolvable route. */ 6390 return (B_FALSE); 6391 } 6392 6393 /* 6394 * The purpose of the next two functions is to provide some external access to 6395 * routing/l2 lookup functionality while hiding the implementation of routing 6396 * and interface data structures (IRE/ILL). Thus, interfaces are passed/ 6397 * returned by name instead of by ILL reference. These functions are used by 6398 * IP Filter. 6399 * Return a link layer header suitable for an IP packet being sent to the 6400 * dst_addr IP address. The interface associated with the route is put into 6401 * ifname, which must be a buffer of LIFNAMSIZ bytes. The dst_addr is the 6402 * packet's ultimate destination address, not a router address. 6403 * 6404 * This function is used when the caller wants to know the outbound interface 6405 * and MAC header for a packet given only the address. 6406 */ 6407 mblk_t * 6408 ip_nexthop_route(const struct sockaddr *target, char *ifname) 6409 { 6410 struct nce_s *nce; 6411 ire_t *dir; 6412 ill_t *ill; 6413 mblk_t *mp, *tmp_mp; 6414 6415 /* parameter sanity */ 6416 if (ifname == NULL || target == NULL) 6417 return (NULL); 6418 6419 /* Find the route entry, if it exists. 
*/ 6420 switch (target->sa_family) { 6421 case AF_INET: 6422 dir = ire_route_lookup( 6423 ((struct sockaddr_in *)target)->sin_addr.s_addr, 6424 0xffffffff, 6425 0, 0, NULL, NULL, ALL_ZONES, NULL, 6426 MATCH_IRE_DSTONLY|MATCH_IRE_DEFAULT|MATCH_IRE_RECURSIVE); 6427 break; 6428 case AF_INET6: 6429 dir = ire_route_lookup_v6( 6430 &((struct sockaddr_in6 *)target)->sin6_addr, 6431 NULL, 6432 0, 0, NULL, NULL, ALL_ZONES, NULL, 6433 MATCH_IRE_DSTONLY|MATCH_IRE_DEFAULT|MATCH_IRE_RECURSIVE); 6434 if ((dir != NULL) && (dir->ire_nce == NULL)) { 6435 ire_refrele(dir); 6436 dir = NULL; 6437 } 6438 break; 6439 default: 6440 dir = NULL; 6441 break; 6442 } 6443 6444 if (dir == NULL) { 6445 return (NULL); 6446 } 6447 6448 /* Map the IRE to an ILL so we can fill in ifname. */ 6449 ill = ire_to_ill(dir); 6450 if (ill == NULL) { 6451 ire_refrele(dir); 6452 return (NULL); 6453 } 6454 (void) strncpy(ifname, ill->ill_name, LIFNAMSIZ); 6455 6456 if ((dir->ire_type & (IRE_CACHE|IRE_BROADCAST)) == 0) { 6457 mp = copyb(ill->ill_resolver_mp); 6458 ire_refrele(dir); 6459 return (mp); 6460 } 6461 6462 /* Return a copy of the header to the caller. 
*/ 6463 switch (target->sa_family) { 6464 case AF_INET : 6465 if (dir->ire_nce != NULL && 6466 dir->ire_nce->nce_state == ND_REACHABLE) { 6467 if (dir->ire_nce->nce_fp_mp != NULL) 6468 tmp_mp = dir->ire_nce->nce_fp_mp; 6469 else 6470 tmp_mp = dir->ire_nce->nce_fp_mp; 6471 if ((mp = dupb(tmp_mp)) == NULL) 6472 mp = copyb(tmp_mp); 6473 } else { 6474 mp = copyb(ill->ill_resolver_mp); 6475 } 6476 break; 6477 case AF_INET6 : 6478 nce = dir->ire_nce; 6479 if (nce->nce_fp_mp != NULL) { 6480 if ((mp = dupb(nce->nce_fp_mp)) == NULL) 6481 mp = copyb(nce->nce_fp_mp); 6482 } else if (nce->nce_res_mp != NULL) { 6483 if ((mp = dupb(nce->nce_res_mp)) == NULL) 6484 mp = copyb(nce->nce_res_mp); 6485 } else { 6486 mp = NULL; 6487 } 6488 break; 6489 } 6490 6491 ire_refrele(dir); 6492 return (mp); 6493 } 6494 6495 6496 /* 6497 * Return a link layer header suitable for an IP packet being sent to the 6498 * dst_addr IP address on the specified output interface. The dst_addr 6499 * may be the packet's ultimate destination or a predetermined next hop 6500 * router's address. 6501 * ifname must be nul-terminated. 6502 * 6503 * This function is used when the caller knows the outbound interface (usually 6504 * because it was specified by policy) and only needs the MAC header for a 6505 * packet. 6506 */ 6507 mblk_t * 6508 ip_nexthop(const struct sockaddr *target, const char *ifname) 6509 { 6510 struct nce_s *nce; 6511 ill_walk_context_t ctx; 6512 t_uscalar_t sap; 6513 ire_t *dir; 6514 ill_t *ill; 6515 mblk_t *mp; 6516 6517 /* parameter sanity */ 6518 if (ifname == NULL || target == NULL) 6519 return (NULL); 6520 6521 switch (target->sa_family) { 6522 case AF_INET : 6523 sap = IP_DL_SAP; 6524 break; 6525 case AF_INET6 : 6526 sap = IP6_DL_SAP; 6527 break; 6528 default: 6529 return (NULL); 6530 } 6531 6532 /* Lock ill_g_lock before walking through the list */ 6533 rw_enter(&ill_g_lock, RW_READER); 6534 /* 6535 * Can we find the interface name among those currently configured? 
6536 */ 6537 for (ill = ILL_START_WALK_ALL(&ctx); ill != NULL; 6538 ill = ill_next(&ctx, ill)) { 6539 if ((strcmp(ifname, ill->ill_name) == 0) && 6540 (ill->ill_sap == sap)) 6541 break; 6542 } 6543 if (ill == NULL || ill->ill_ipif == NULL) { 6544 rw_exit(&ill_g_lock); 6545 return (NULL); 6546 } 6547 6548 mutex_enter(&ill->ill_lock); 6549 if (!ILL_CAN_LOOKUP(ill)) { 6550 mutex_exit(&ill->ill_lock); 6551 rw_exit(&ill_g_lock); 6552 return (NULL); 6553 } 6554 ill_refhold_locked(ill); 6555 mutex_exit(&ill->ill_lock); 6556 rw_exit(&ill_g_lock); 6557 6558 /* Find the resolver entry, if it exists. */ 6559 switch (target->sa_family) { 6560 case AF_INET: 6561 dir = ire_route_lookup( 6562 ((struct sockaddr_in *)target)->sin_addr.s_addr, 6563 0xffffffff, 6564 0, 0, ill->ill_ipif, NULL, ALL_ZONES, NULL, 6565 MATCH_IRE_DSTONLY|MATCH_IRE_DEFAULT| 6566 MATCH_IRE_RECURSIVE|MATCH_IRE_IPIF); 6567 if ((dir != NULL) && dir->ire_nce != NULL && 6568 dir->ire_nce->nce_state != ND_REACHABLE) { 6569 ire_refrele(dir); 6570 dir = NULL; 6571 } 6572 break; 6573 case AF_INET6: 6574 dir = ire_route_lookup_v6( 6575 &((struct sockaddr_in6 *)target)->sin6_addr, NULL, 6576 0, 0, ill->ill_ipif, NULL, ALL_ZONES, NULL, 6577 MATCH_IRE_DSTONLY|MATCH_IRE_DEFAULT| 6578 MATCH_IRE_RECURSIVE|MATCH_IRE_IPIF); 6579 if ((dir != NULL) && (dir->ire_nce == NULL)) { 6580 ire_refrele(dir); 6581 dir = NULL; 6582 } 6583 break; 6584 default: 6585 dir = NULL; 6586 break; 6587 } 6588 6589 if (dir == NULL) { 6590 return (NULL); 6591 } 6592 6593 if ((dir->ire_type & (IRE_CACHE|IRE_BROADCAST)) == 0) { 6594 mp = copyb(ill->ill_resolver_mp); 6595 ill_refrele(ill); 6596 ire_refrele(dir); 6597 return (mp); 6598 } 6599 6600 /* Return a copy of the header to the caller. 
*/ 6601 switch (target->sa_family) { 6602 case AF_INET : 6603 if (dir->ire_nce->nce_fp_mp != NULL) { 6604 if ((mp = dupb(dir->ire_nce->nce_fp_mp)) == NULL) 6605 mp = copyb(dir->ire_nce->nce_fp_mp); 6606 } else if (dir->ire_nce->nce_res_mp != NULL) { 6607 if ((mp = dupb(dir->ire_nce->nce_res_mp)) == NULL) 6608 mp = copyb(dir->ire_nce->nce_res_mp); 6609 } else { 6610 mp = copyb(ill->ill_resolver_mp); 6611 } 6612 break; 6613 case AF_INET6 : 6614 nce = dir->ire_nce; 6615 if (nce->nce_fp_mp != NULL) { 6616 if ((mp = dupb(nce->nce_fp_mp)) == NULL) 6617 mp = copyb(nce->nce_fp_mp); 6618 } else if (nce->nce_res_mp != NULL) { 6619 if ((mp = dupb(nce->nce_res_mp)) == NULL) 6620 mp = copyb(nce->nce_res_mp); 6621 } else { 6622 mp = NULL; 6623 } 6624 break; 6625 } 6626 6627 ire_refrele(dir); 6628 ill_refrele(ill); 6629 return (mp); 6630 } 6631 6632 /* 6633 * IRE iterator for inbound and loopback broadcast processing. 6634 * Given an IRE_BROADCAST ire, walk the ires with the same destination 6635 * address, but skip over the passed-in ire. Returns the next ire without 6636 * a hold - assumes that the caller holds a reference on the IRE bucket. 6637 */ 6638 ire_t * 6639 ire_get_next_bcast_ire(ire_t *curr, ire_t *ire) 6640 { 6641 ill_t *ill; 6642 6643 if (curr == NULL) { 6644 for (curr = ire->ire_bucket->irb_ire; curr != NULL; 6645 curr = curr->ire_next) { 6646 if (curr->ire_addr == ire->ire_addr) 6647 break; 6648 } 6649 } else { 6650 curr = curr->ire_next; 6651 } 6652 ill = ire_to_ill(ire); 6653 for (; curr != NULL; curr = curr->ire_next) { 6654 if (curr->ire_addr != ire->ire_addr) { 6655 /* 6656 * All the IREs to a given destination are contiguous; 6657 * break out once the address doesn't match. 
6658 */ 6659 break; 6660 } 6661 if (curr == ire) { 6662 /* skip over the passed-in ire */ 6663 continue; 6664 } 6665 if ((curr->ire_stq != NULL && ire->ire_stq == NULL) || 6666 (curr->ire_stq == NULL && ire->ire_stq != NULL)) { 6667 /* 6668 * If the passed-in ire is loopback, skip over 6669 * non-loopback ires and vice versa. 6670 */ 6671 continue; 6672 } 6673 if (ire_to_ill(curr) != ill) { 6674 /* skip over IREs going through a different interface */ 6675 continue; 6676 } 6677 if (curr->ire_marks & IRE_MARK_CONDEMNED) { 6678 /* skip over deleted IREs */ 6679 continue; 6680 } 6681 return (curr); 6682 } 6683 return (NULL); 6684 } 6685 6686 #ifdef IRE_DEBUG 6687 th_trace_t * 6688 th_trace_ire_lookup(ire_t *ire) 6689 { 6690 int bucket_id; 6691 th_trace_t *th_trace; 6692 6693 ASSERT(MUTEX_HELD(&ire->ire_lock)); 6694 6695 bucket_id = IP_TR_HASH(curthread); 6696 ASSERT(bucket_id < IP_TR_HASH_MAX); 6697 6698 for (th_trace = ire->ire_trace[bucket_id]; th_trace != NULL; 6699 th_trace = th_trace->th_next) { 6700 if (th_trace->th_id == curthread) 6701 return (th_trace); 6702 } 6703 return (NULL); 6704 } 6705 6706 void 6707 ire_trace_ref(ire_t *ire) 6708 { 6709 int bucket_id; 6710 th_trace_t *th_trace; 6711 6712 /* 6713 * Attempt to locate the trace buffer for the curthread. 
6714 * If it does not exist, then allocate a new trace buffer 6715 * and link it in list of trace bufs for this ipif, at the head 6716 */ 6717 mutex_enter(&ire->ire_lock); 6718 if (ire->ire_trace_disable == B_TRUE) { 6719 mutex_exit(&ire->ire_lock); 6720 return; 6721 } 6722 th_trace = th_trace_ire_lookup(ire); 6723 if (th_trace == NULL) { 6724 bucket_id = IP_TR_HASH(curthread); 6725 th_trace = (th_trace_t *)kmem_zalloc(sizeof (th_trace_t), 6726 KM_NOSLEEP); 6727 if (th_trace == NULL) { 6728 ire->ire_trace_disable = B_TRUE; 6729 mutex_exit(&ire->ire_lock); 6730 ire_trace_inactive(ire); 6731 return; 6732 } 6733 6734 th_trace->th_id = curthread; 6735 th_trace->th_next = ire->ire_trace[bucket_id]; 6736 th_trace->th_prev = &ire->ire_trace[bucket_id]; 6737 if (th_trace->th_next != NULL) 6738 th_trace->th_next->th_prev = &th_trace->th_next; 6739 ire->ire_trace[bucket_id] = th_trace; 6740 } 6741 ASSERT(th_trace->th_refcnt < TR_BUF_MAX - 1); 6742 th_trace->th_refcnt++; 6743 th_trace_rrecord(th_trace); 6744 mutex_exit(&ire->ire_lock); 6745 } 6746 6747 void 6748 ire_trace_free(th_trace_t *th_trace) 6749 { 6750 /* unlink th_trace and free it */ 6751 *th_trace->th_prev = th_trace->th_next; 6752 if (th_trace->th_next != NULL) 6753 th_trace->th_next->th_prev = th_trace->th_prev; 6754 th_trace->th_next = NULL; 6755 th_trace->th_prev = NULL; 6756 kmem_free(th_trace, sizeof (th_trace_t)); 6757 } 6758 6759 void 6760 ire_untrace_ref(ire_t *ire) 6761 { 6762 th_trace_t *th_trace; 6763 6764 mutex_enter(&ire->ire_lock); 6765 6766 if (ire->ire_trace_disable == B_TRUE) { 6767 mutex_exit(&ire->ire_lock); 6768 return; 6769 } 6770 6771 th_trace = th_trace_ire_lookup(ire); 6772 ASSERT(th_trace != NULL && th_trace->th_refcnt > 0); 6773 th_trace_rrecord(th_trace); 6774 th_trace->th_refcnt--; 6775 6776 if (th_trace->th_refcnt == 0) 6777 ire_trace_free(th_trace); 6778 6779 mutex_exit(&ire->ire_lock); 6780 } 6781 6782 static void 6783 ire_trace_inactive(ire_t *ire) 6784 { 6785 th_trace_t *th_trace; 
6786 int i; 6787 6788 mutex_enter(&ire->ire_lock); 6789 for (i = 0; i < IP_TR_HASH_MAX; i++) { 6790 while (ire->ire_trace[i] != NULL) { 6791 th_trace = ire->ire_trace[i]; 6792 6793 /* unlink th_trace and free it */ 6794 ire->ire_trace[i] = th_trace->th_next; 6795 if (th_trace->th_next != NULL) 6796 th_trace->th_next->th_prev = 6797 &ire->ire_trace[i]; 6798 6799 th_trace->th_next = NULL; 6800 th_trace->th_prev = NULL; 6801 kmem_free(th_trace, sizeof (th_trace_t)); 6802 } 6803 } 6804 6805 mutex_exit(&ire->ire_lock); 6806 } 6807 6808 /* ARGSUSED */ 6809 void 6810 ire_thread_exit(ire_t *ire, caddr_t arg) 6811 { 6812 th_trace_t *th_trace; 6813 6814 mutex_enter(&ire->ire_lock); 6815 th_trace = th_trace_ire_lookup(ire); 6816 if (th_trace == NULL) { 6817 mutex_exit(&ire->ire_lock); 6818 return; 6819 } 6820 ASSERT(th_trace->th_refcnt == 0); 6821 6822 ire_trace_free(th_trace); 6823 mutex_exit(&ire->ire_lock); 6824 } 6825 6826 #endif 6827 6828 /* 6829 * Generate a message chain with an arp request to resolve the in_ire. 6830 * It is assumed that in_ire itself is currently in the ire cache table, 6831 * so we create a fake_ire filled with enough information about ire_addr etc. 6832 * to retrieve in_ire when the DL_UNITDATA response from the resolver 6833 * comes back. The fake_ire itself is created by calling esballoc with 6834 * the fr_rtnp (free routine) set to ire_freemblk. This routine will be 6835 * invoked when the mblk containing fake_ire is freed. 6836 */ 6837 void 6838 ire_arpresolve(ire_t *in_ire, ill_t *dst_ill) 6839 { 6840 areq_t *areq; 6841 ipaddr_t *addrp; 6842 mblk_t *ire_mp, *dlureq_mp; 6843 ire_t *ire, *buf; 6844 size_t bufsize; 6845 frtn_t *frtnp; 6846 ill_t *ill; 6847 6848 /* 6849 * Construct message chain for the resolver 6850 * of the form: 6851 * ARP_REQ_MBLK-->IRE_MBLK 6852 * 6853 * NOTE : If the response does not 6854 * come back, ARP frees the packet. For this reason, 6855 * we can't REFHOLD the bucket of save_ire to prevent 6856 * deletions. 
We may not be able to REFRELE the bucket 6857 * if the response never comes back. Thus, before 6858 * adding the ire, ire_add_v4 will make sure that the 6859 * interface route does not get deleted. This is the 6860 * only case unlike ip_newroute_v6, ip_newroute_ipif_v6 6861 * where we can always prevent deletions because of 6862 * the synchronous nature of adding IRES i.e 6863 * ire_add_then_send is called after creating the IRE. 6864 */ 6865 6866 /* 6867 * We use esballoc to allocate the second part(the ire_t size mblk) 6868 * of the message chain depicted above. THis mblk will be freed 6869 * by arp when there is a timeout, and otherwise passed to IP 6870 * and IP will * free it after processing the ARP response. 6871 */ 6872 6873 bufsize = sizeof (ire_t) + sizeof (frtn_t); 6874 buf = kmem_alloc(bufsize, KM_NOSLEEP); 6875 if (buf == NULL) { 6876 ip1dbg(("ire_arpresolver:alloc buffer failed\n ")); 6877 return; 6878 } 6879 frtnp = (frtn_t *)(buf + 1); 6880 frtnp->free_arg = (caddr_t)buf; 6881 frtnp->free_func = ire_freemblk; 6882 6883 ire_mp = esballoc((unsigned char *)buf, bufsize, BPRI_MED, frtnp); 6884 6885 if (ire_mp == NULL) { 6886 ip1dbg(("ire_arpresolve: esballoc failed\n")); 6887 kmem_free(buf, bufsize); 6888 return; 6889 } 6890 ASSERT(in_ire->ire_nce != NULL); 6891 dlureq_mp = copyb(dst_ill->ill_resolver_mp); 6892 if (dlureq_mp == NULL) { 6893 kmem_free(buf, bufsize); 6894 return; 6895 } 6896 6897 ire_mp->b_datap->db_type = IRE_ARPRESOLVE_TYPE; 6898 ire = (ire_t *)buf; 6899 /* 6900 * keep enough info in the fake ire so that we can pull up 6901 * the incomplete ire (in_ire) after result comes back from 6902 * arp and make it complete. 
6903 */ 6904 *ire = ire_null; 6905 ire->ire_u = in_ire->ire_u; 6906 ire->ire_ipif_seqid = in_ire->ire_ipif_seqid; 6907 ire->ire_ipif = in_ire->ire_ipif; 6908 ire->ire_stq = in_ire->ire_stq; 6909 ill = ire_to_ill(ire); 6910 ire->ire_stq_ifindex = ill->ill_phyint->phyint_ifindex; 6911 ire->ire_zoneid = in_ire->ire_zoneid; 6912 /* 6913 * ire_freemblk will be called when ire_mp is freed, both for 6914 * successful and failed arp resolution. IRE_MARK_UNCACHED will be set 6915 * when the arp resolution failed. 6916 */ 6917 ire->ire_marks |= IRE_MARK_UNCACHED; 6918 ire->ire_mp = ire_mp; 6919 ire_mp->b_wptr = (uchar_t *)&ire[1]; 6920 ire_mp->b_cont = NULL; 6921 ASSERT(dlureq_mp != NULL); 6922 linkb(dlureq_mp, ire_mp); 6923 6924 /* 6925 * Fill in the source and dest addrs for the resolver. 6926 * NOTE: this depends on memory layouts imposed by 6927 * ill_init(). 6928 */ 6929 areq = (areq_t *)dlureq_mp->b_rptr; 6930 addrp = (ipaddr_t *)((char *)areq + areq->areq_sender_addr_offset); 6931 *addrp = ire->ire_src_addr; 6932 6933 addrp = (ipaddr_t *)((char *)areq + areq->areq_target_addr_offset); 6934 if (ire->ire_gateway_addr != INADDR_ANY) { 6935 *addrp = ire->ire_gateway_addr; 6936 } else { 6937 *addrp = ire->ire_addr; 6938 } 6939 6940 /* Up to the resolver. */ 6941 if (canputnext(dst_ill->ill_rq)) { 6942 putnext(dst_ill->ill_rq, dlureq_mp); 6943 } else { 6944 /* Prepare for cleanup */ 6945 freemsg(dlureq_mp); 6946 } 6947 } 6948 6949 /* 6950 * Esballoc free function for AR_ENTRY_QUERY request to clean up any 6951 * unresolved ire_t and/or nce_t structures when ARP resolution fails. 6952 * 6953 * This function can be called by ARP via free routine for ire_mp or 6954 * by IPv4(both host and forwarding path) via ire_delete 6955 * in case ARP resolution fails. 6956 * NOTE: Since IP is MT, ARP can call into IP but not vice versa 6957 * (for IP to talk to ARP, it still has to send AR* messages). 
6958 * 6959 * Note that the ARP/IP merge should replace the functioanlity by providing 6960 * direct function calls to clean up unresolved entries in ire/nce lists. 6961 */ 6962 void 6963 ire_freemblk(ire_t *ire_mp) 6964 { 6965 nce_t *nce = NULL; 6966 ill_t *ill; 6967 6968 ASSERT(ire_mp != NULL); 6969 6970 if ((ire_mp->ire_addr == NULL) && (ire_mp->ire_gateway_addr == NULL)) { 6971 ip1dbg(("ire_freemblk(0x%p) ire_addr is NULL\n", 6972 (void *)ire_mp)); 6973 goto cleanup; 6974 } 6975 if ((ire_mp->ire_marks & IRE_MARK_UNCACHED) == 0) { 6976 goto cleanup; /* everything succeeded. just free and return */ 6977 } 6978 6979 /* 6980 * the arp information corresponding to this ire_mp was not 6981 * transferred to a ire_cache entry. Need 6982 * to clean up incomplete ire's and nce, if necessary. 6983 */ 6984 ASSERT(ire_mp->ire_stq != NULL); 6985 ASSERT(ire_mp->ire_stq_ifindex != 0); 6986 /* 6987 * Get any nce's corresponding to this ire_mp. We first have to 6988 * make sure that the ill is still around. 6989 */ 6990 ill = ill_lookup_on_ifindex(ire_mp->ire_stq_ifindex, B_FALSE, 6991 NULL, NULL, NULL, NULL); 6992 if (ill == NULL || (ire_mp->ire_stq != ill->ill_wq) || 6993 (ill->ill_state_flags & ILL_CONDEMNED)) { 6994 /* 6995 * ill went away. no nce to clean up. 6996 * Note that the ill_state_flags could be set to 6997 * ILL_CONDEMNED after this point, but if we know 6998 * that it is CONDEMNED now, we just bail out quickly. 6999 */ 7000 if (ill != NULL) 7001 ill_refrele(ill); 7002 goto cleanup; 7003 } 7004 nce = ndp_lookup_v4(ill, 7005 ((ire_mp->ire_gateway_addr != INADDR_ANY) ? 7006 &ire_mp->ire_gateway_addr : &ire_mp->ire_addr), 7007 B_FALSE); 7008 ill_refrele(ill); 7009 7010 if ((nce != NULL) && (nce->nce_state != ND_REACHABLE)) { 7011 /* 7012 * some incomplete nce was found. 
7013 */ 7014 DTRACE_PROBE2(ire__freemblk__arp__resolv__fail, 7015 nce_t *, nce, ire_t *, ire_mp); 7016 /* 7017 * Send the icmp_unreachable messages for the queued mblks in 7018 * ire->ire_nce->nce_qd_mp, since ARP resolution failed 7019 * for this ire 7020 */ 7021 arp_resolv_failed(nce); 7022 /* 7023 * Delete the nce and clean up all ire's pointing at this nce 7024 * in the cachetable 7025 */ 7026 ndp_delete(nce); 7027 } 7028 if (nce != NULL) 7029 NCE_REFRELE(nce); /* release the ref taken by ndp_lookup_v4 */ 7030 7031 cleanup: 7032 /* 7033 * Get rid of the ire buffer 7034 * We call kmem_free here(instead of ire_delete()), since 7035 * this is the freeb's callback. 7036 */ 7037 kmem_free(ire_mp, sizeof (ire_t) + sizeof (frtn_t)); 7038 } 7039 7040 7041 /* 7042 * create the neighbor cache entry nce_t for IRE_CACHE and 7043 * non-loopback IRE_BROADCAST ire's. Note that IRE_BROADCAST 7044 * (non-loopback) entries have the nce_res_mp set to the 7045 * template passed in (generated from ill_bcast_mp); IRE_CACHE ire's 7046 * contain the information for the nexthop (ire_gateway_addr) in the 7047 * case of indirect routes, and for the dst itself (ire_addr) in the 7048 * case of direct routes, with the nce_res_mp containing a template 7049 * DL_UNITDATA request. 7050 * 7051 * This function always consumes res_mp and fp_mp. 7052 * 7053 * The actual association of the ire_nce to the nce created here is 7054 * typically done in ire_add_v4 for IRE_CACHE entries. Exceptions 7055 * to this rule are SO_DONTROUTE ire's (IRE_MARK_NO_ADD), for which 7056 * the ire_nce assignment is done in ire_add_then_send, and mobile-ip 7057 * where the assignment is done in ire_add_mrtun(). 
7058 */ 7059 int 7060 ire_nce_init(ire_t *ire, mblk_t *fp_mp, mblk_t *res_mp) 7061 { 7062 in_addr_t addr4, mask4; 7063 int err; 7064 nce_t *arpce = NULL; 7065 ill_t *ire_ill; 7066 uint16_t nce_state, nce_flags; 7067 7068 if (ire->ire_stq == NULL) { 7069 if (res_mp) 7070 freemsg(res_mp); 7071 if (fp_mp) 7072 freemsg(fp_mp); 7073 return (0); /* no need to create nce for local/loopback */ 7074 } 7075 7076 mask4 = IP_HOST_MASK; 7077 switch (ire->ire_type) { 7078 case IRE_CACHE: 7079 if (ire->ire_gateway_addr != INADDR_ANY) 7080 addr4 = ire->ire_gateway_addr; /* 'G' route */ 7081 else 7082 addr4 = ire->ire_addr; /* direct route */ 7083 break; 7084 case IRE_BROADCAST: 7085 addr4 = ire->ire_addr; 7086 break; 7087 default: 7088 if (res_mp) 7089 freemsg(res_mp); 7090 if (fp_mp) 7091 freemsg(fp_mp); 7092 return (0); 7093 } 7094 7095 /* 7096 * ire_ipif is picked based on RTF_SETSRC, usesrc etc. 7097 * rules in ire_forward_src_ipif. We want the dlureq_mp 7098 * for the outgoing interface, which we get from the ire_stq. 7099 */ 7100 ire_ill = ire_to_ill(ire); 7101 7102 /* 7103 * if we are creating an nce for the first time, and this is 7104 * a NORESOLVER interface, atomically create the nce in the 7105 * REACHABLE state; else create it in the ND_INITIAL state. 
7106 */ 7107 if (ire_ill->ill_net_type == IRE_IF_NORESOLVER) { 7108 nce_state = ND_REACHABLE; 7109 nce_flags = NCE_F_PERMANENT; 7110 } else { 7111 if (fp_mp != NULL) 7112 nce_state = ND_REACHABLE; 7113 else 7114 nce_state = ND_INITIAL; 7115 nce_flags = 0; 7116 } 7117 7118 err = ndp_lookup_then_add(ire_ill, NULL, 7119 &addr4, &mask4, NULL, 0, nce_flags, nce_state, &arpce, 7120 fp_mp, res_mp); 7121 7122 ip1dbg(("ire 0x%p addr 0x%lx mask 0x%lx type 0x%x; " 7123 "found nce 0x%p err %d\n", (void *)ire, (ulong_t)addr4, 7124 (ulong_t)mask4, ire->ire_type, (void *)arpce, err)); 7125 7126 switch (err) { 7127 case 0: 7128 break; 7129 case EEXIST: 7130 /* 7131 * return a pointer to an existing nce_t; 7132 * note that the ire-nce mapping is many-one, i.e., 7133 * multiple ire's could point to the same nce_t; 7134 */ 7135 if (fp_mp != NULL) { 7136 freemsg(fp_mp); 7137 } 7138 if (res_mp != NULL) { 7139 freemsg(res_mp); 7140 } 7141 break; 7142 default: 7143 DTRACE_PROBE2(nce__init__fail, ill_t *, ire_ill, int, err); 7144 if (res_mp) 7145 freemsg(res_mp); 7146 if (fp_mp) 7147 freemsg(fp_mp); 7148 return (EINVAL); 7149 } 7150 #if DEBUG 7151 /* 7152 * if an nce_fp_mp was passed in, we should be picking up an 7153 * existing nce_t in the ND_REACHABLE state. 7154 */ 7155 mutex_enter(&arpce->nce_lock); 7156 ASSERT(arpce->nce_fp_mp == NULL || arpce->nce_state == ND_REACHABLE); 7157 mutex_exit(&arpce->nce_lock); 7158 #endif 7159 if (ire->ire_type == IRE_BROADCAST) { 7160 /* 7161 * Two bcast ires are created for each interface; 7162 * 1. loopback copy (which does not have an 7163 * ire_stq, and therefore has no ire_nce), and, 7164 * 2. the non-loopback copy, which has the nce_res_mp 7165 * initialized to a copy of the ill_bcast_mp, and 7166 * is marked as ND_REACHABLE at this point. 7167 * This nce does not undergo any further state changes, 7168 * and exists as long as the interface is plumbed. 
7169 * Note: we do the ire_nce assignment here for IRE_BROADCAST 7170 * because some functions like ill_mark_bcast() inline the 7171 * ire_add functionality; 7172 */ 7173 mutex_enter(&arpce->nce_lock); 7174 arpce->nce_state = ND_REACHABLE; 7175 arpce->nce_flags |= NCE_F_PERMANENT; 7176 arpce->nce_last = TICK_TO_MSEC(lbolt64); 7177 ire->ire_nce = arpce; 7178 mutex_exit(&arpce->nce_lock); 7179 /* 7180 * We are associating this nce to the ire, 7181 * so change the nce ref taken in 7182 * ndp_lookup_then_add_v4() from 7183 * NCE_REFHOLD to NCE_REFHOLD_NOTR 7184 */ 7185 NCE_REFHOLD_TO_REFHOLD_NOTR(ire->ire_nce); 7186 } else { 7187 if (NCE_EXPIRED(arpce)) 7188 arpce = nce_reinit(arpce); 7189 if (arpce != NULL) { 7190 /* 7191 * We are not using this nce_t just yet so release 7192 * the ref taken in ndp_lookup_then_add_v4() 7193 */ 7194 NCE_REFRELE(arpce); 7195 } else { 7196 ip0dbg(("can't reinit arpce for ill 0x%p;\n", 7197 (void *)ire_ill)); 7198 } 7199 } 7200 return (0); 7201 } 7202