/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */
/* Copyright (c) 1990 Mentat Inc. */

/*
 * This file contains routines that manipulate Internet Routing Entries (IREs).
 */

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/stropts.h>
#include <sys/strsun.h>
#include <sys/strsubr.h>
#include <sys/ddi.h>
#include <sys/cmn_err.h>
#include <sys/policy.h>

#include <sys/systm.h>
#include <sys/kmem.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <net/if.h>
#include <net/route.h>
#include <netinet/in.h>
#include <net/if_dl.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>

#include <inet/common.h>
#include <inet/mi.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/ip_ndp.h>
#include <inet/arp.h>
#include <inet/ip_if.h>
#include <inet/ip_ire.h>
#include <inet/ip_ftable.h>
#include <inet/ip_rts.h>
#include <inet/nd.h>

#include <net/pfkeyv2.h>
#include <inet/ipsec_info.h>
#include <inet/sadb.h>
#include <inet/tcp.h>
#include <inet/ipclassifier.h>
#include <sys/zone.h>
#include <sys/cpuvar.h>

#include <sys/tsol/label.h>
#include <sys/tsol/tnet.h>

struct kmem_cache *rt_entry_cache;

/*
 * Synchronization notes:
 *
 * The fields of the ire_t struct are protected in the following way :
 *
 * ire_next/ire_ptpn
 *
 *	- bucket lock of the respective tables (cache or forwarding tables).
 *
 * ire_mp, ire_rfq, ire_stq, ire_u *except* ire_gateway_addr[v6], ire_mask,
 * ire_type, ire_create_time, ire_masklen, ire_ipversion, ire_flags, ire_ipif,
 * ire_ihandle, ire_phandle, ire_nce, ire_bucket, ire_in_ill, ire_in_src_addr
 *
 *	- Set in ire_create_v4/v6 and never changed after that. Thus,
 *	  we don't need a lock whenever these fields are accessed.
 *
 *	- ire_bucket and ire_masklen (also set in ire_create) are set in
 *	  ire_add_v4/ire_add_v6 before inserting in the bucket and never
 *	  change after that. Thus we don't need a lock whenever these
 *	  fields are accessed.
 *
 * ire_gateway_addr_v4[v6]
 *
 *	- ire_gateway_addr_v4[v6] is set during ire_create and later modified
 *	  by rts_setgwr[v6]. As ire_gateway_addr is a uint32_t, updates to
 *	  it are assumed to be atomic and hence the other parts of the code
 *	  do not use any locks. ire_gateway_addr_v6 updates are not atomic
 *	  and hence any access to it uses ire_lock to get/set the right value.
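 *
 *	  For example, a reader of the IPv6 gateway address copies it out
 *	  under ire_lock (a sketch only; gw_v6 is a hypothetical local
 *	  in6_addr_t, not a name used in this file):
 *
 *		mutex_enter(&ire->ire_lock);
 *		gw_v6 = ire->ire_gateway_addr_v6;
 *		mutex_exit(&ire->ire_lock);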
 *
 * ire_ident, ire_refcnt
 *
 *	- Updated atomically using atomic_add_32
 *
 * ire_ssthresh, ire_rtt_sd, ire_rtt, ire_ib_pkt_count, ire_ob_pkt_count
 *
 *	- Assumes that 32 bit writes are atomic. No locks. ire_lock is
 *	  used to serialize updates to ire_ssthresh, ire_rtt_sd, ire_rtt.
 *
 * ire_max_frag, ire_frag_flag
 *
 *	- ire_lock is used to set/read both of them together.
 *
 * ire_tire_mark
 *
 *	- Set in ire_create and updated in ire_expire, which is called
 *	  by only one function, namely ip_trash_timer_expire. Thus only
 *	  one function updates and examines the value.
 *
 * ire_marks
 *	- bucket lock protects this.
 *
 * ire_ipsec_overhead/ire_ll_hdr_length
 *
 *	- Place holder for returning the information to the upper layers
 *	  when IRE_DB_REQ comes down.
 *
 *
 * ipv6_ire_default_count is protected by the bucket lock of
 * ip_forwarding_table_v6[0][0].
 *
 * ipv6_ire_default_index is not protected as it is just a hint
 * at which default gateway to use. There is nothing
 * wrong in using the same gateway for two different connections.
 *
 * As we always hold the bucket locks in all the places while accessing
 * the above values, it is natural to use them for protecting them.
 *
 * We have a separate cache table and forwarding table for IPv4 and IPv6.
 * The cache table (ip_cache_table/ip_cache_table_v6) is a pointer to an
 * array of irb_t structures. The IPv6 forwarding table
 * (ip_forwarding_table_v6) is an array of pointers to arrays of irb_t
 * structures. ip_forwarding_table_v6 is allocated dynamically in
 * ire_add_v6. ire_ft_init_lock is used to serialize multiple threads
 * initializing the same bucket. Once a bucket is initialized, it is never
 * de-allocated. This assumption enables us to access
 * ip_forwarding_table_v6[i] without any locks.
 *
 * The forwarding table for IPv4 is a radix tree whose leaves
 * are rt_entry structures containing the irb_t for the rt_dst. The irb_t
 * for IPv4 is dynamically allocated and freed.
 *
 * Each irb_t - ire bucket structure has a lock to protect
 * a bucket, and the ires residing in the bucket have a back pointer to
 * the bucket structure. It also has a reference count for the number
 * of threads walking the bucket - irb_refcnt, which is bumped up
 * using the IRB_REFHOLD macro. The flags irb_flags can be
 * set to IRE_MARK_CONDEMNED indicating that there are some ires
 * in this bucket that are marked with IRE_MARK_CONDEMNED and the
 * last thread to leave the bucket should delete the ires. Usually
 * this is done by the IRB_REFRELE macro, which is used to decrement
 * the reference count on a bucket. See the comments above the irb_t
 * structure definition in ip.h for further details.
 *
 * The IRE_REFHOLD/IRE_REFRELE macros operate on the ire and increment/
 * decrement the reference count, ire_refcnt, atomically on the ire.
 * ire_refcnt is modified only using these macros. Operations on the IRE
 * could be described as follows :
 *
 * CREATE an ire with reference count initialized to 1.
 *
 * ADDITION of an ire holds the bucket lock, checks for duplicates
 * and then adds the ire. ire_add_v4/ire_add_v6 returns the ire after
 * bumping it up once more, i.e. the reference count is 2. This is to avoid
 * an extra lookup in the functions calling ire_add, which want to
 * work with the ire after adding.
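 *
 * For illustration only, a caller that obtains an ire from one of the
 * lookup functions listed below therefore pairs the lookup with a
 * release (a sketch, not a quote of any one caller in this file):
 *
 *	ire = ire_cache_lookup(dst, zoneid, NULL, ipst);
 *	if (ire != NULL) {
 *		... use the ire ...
 *		ire_refrele(ire);	(drops the ref from the lookup)
 *	}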
 *
 * LOOKUP of an ire bumps up the reference count using the IRE_REFHOLD
 * macro. It is valid to bump up the reference count of the IRE
 * after the lookup has returned an ire. Following are the lookup
 * functions that return a HELD ire :
 *
 * ire_lookup_local[_v6], ire_ctable_lookup[_v6], ire_ftable_lookup[_v6],
 * ire_cache_lookup[_v6], ire_lookup_multi[_v6], ire_route_lookup[_v6],
 * ipif_to_ire[_v6].
 *
 * DELETION of an ire holds the bucket lock, removes it from the list
 * and then decrements the reference count for having removed it from the
 * list by using the IRE_REFRELE macro. If some other thread has looked up
 * the ire, the reference count would have been bumped up and hence
 * this ire will not be freed once deleted. It will be freed once the
 * reference count drops to zero.
 *
 * Add and Delete acquire the bucket lock as RW_WRITER, while all the
 * lookups acquire the bucket lock as RW_READER.
 *
 * NOTE : The only functions that do the IRE_REFRELE when an ire is
 *	  passed as an argument are :
 *
 *	  1) ip_wput_ire : This is because it IRE_REFHOLD/RELEs the
 *			   broadcast ires it looks up internally within
 *			   the function. Currently, for simplicity it does
 *			   not differentiate the one that is passed in and
 *			   the ones it looks up internally. It always
 *			   IRE_REFRELEs.
 *	  2) ire_send
 *	     ire_send_v6 : As ire_send calls ip_wput_ire and other functions
 *			   that take ire as an argument, it has to selectively
 *			   IRE_REFRELE the ire. To maintain symmetry,
 *			   ire_send_v6 does the same.
 *
 * Otherwise, the general rule is to do the IRE_REFRELE in the function
 * that is passing the ire as an argument.
 *
 * In trying to locate ires the following points are to be noted.
 *
 * IRE_MARK_CONDEMNED signifies that the ire has been logically deleted and is
 * to be ignored when walking the ires using ire_next.
 *
 * Zones note:
 *	Walking IREs within a given zone also walks certain ires in other
 *	zones. This is done intentionally. IRE walks with a specified
 *	zoneid are used only when doing informational reports, and
 *	zone users want to see things that they can access. See the block
 *	comment in ire_walk_ill_match().
 */

/*
 * The minimum size of the IRE cache table. It will be recalculated in
 * ip_ire_init().
 * Settable in /etc/system
 */
uint32_t ip_cache_table_size = IP_CACHE_TABLE_SIZE;
uint32_t ip6_cache_table_size = IP6_CACHE_TABLE_SIZE;

/*
 * The size of the forwarding table. We will make sure that it is a
 * power of 2 in ip_ire_init().
 * Settable in /etc/system
 */
uint32_t ip6_ftable_hash_size = IP6_FTABLE_HASH_SIZE;

struct	kmem_cache	*ire_cache;
static ire_t	ire_null;

/*
 * The threshold number of IREs in a bucket at which the IREs are
 * cleaned up. This threshold is calculated later in ip_open()
 * based on the speed of the CPU and available memory. This default
 * value is the maximum.
 *
 * We have two kinds of cached IRE, temporary and
 * non-temporary. Temporary IREs are marked with
 * IRE_MARK_TEMPORARY. They are IREs created for non-TCP
 * traffic and for forwarding purposes. All others
 * are non-temporary IREs. We don't mark IREs created for
 * TCP as temporary because TCP is stateful and there is
 * information stored in the IRE which can be shared by other TCP
 * connections to the same destination.
 * For connected endpoints, we also don't want to mark the IRE used as
 * temporary because the same IRE will be used frequently;
 * otherwise, the app should not do a connect(). We change
 * the marking at ip_bind_connected_*() if necessary.
 *
 * We want to keep the cache IRE hash bucket length reasonably
 * short, otherwise IRE lookup functions will take "forever."
 * We use the "crude" function that the IRE bucket
 * length should be based on the CPU speed, which is 1 entry
 * per x MHz, depending on the shift factor ip_ire_cpu_ratio
 * (n). This means that with a 750MHz CPU, the max bucket
 * length can be (750 >> n) entries; with the default shift
 * factor of 7 below, that works out to 750 >> 7 = 5 entries.
 *
 * Note that this threshold is separate for temp and non-temp
 * IREs. This means that the actual bucket length can be
 * twice that. And while we try to keep the temporary IRE
 * bucket length at most at the threshold value, we do not attempt to
 * make the length for non-temporary IREs fixed, for the
 * reason stated above. Instead, we start trying to find
 * "unused" non-temporary IREs when the bucket length reaches
 * this threshold and clean them up.
 *
 * We also want to limit the amount of memory used by
 * IREs. So if we are allowed to use ~3% of memory (M)
 * for those IREs, each bucket should not have more than
 *
 *	M / num of cache buckets / sizeof (ire_t)
 *
 * Again the above memory uses are separate for temp and
 * non-temp cached IREs.
 *
 * We may also want the limit to be a function of the number
 * of interfaces and number of CPUs. Doing the initialization
 * in ip_open() means that every time an interface is plumbed,
 * the max is re-calculated. Right now, we don't do anything
 * different. In the future, when we have more experience, we
 * may want to change this behavior.
 */
uint32_t ip_ire_max_bucket_cnt = 10;	/* Settable in /etc/system */
uint32_t ip6_ire_max_bucket_cnt = 10;
uint32_t ip_ire_cleanup_cnt = 2;

/*
 * The minimum temporary IRE bucket count. We do not want
 * the length of each bucket to be too short. This may hurt
 * the performance of some apps as the temporary IREs are removed too
 * often.
 */
uint32_t ip_ire_min_bucket_cnt = 3;	/* /etc/system - not used */
uint32_t ip6_ire_min_bucket_cnt = 3;

/*
 * The ratio of memory consumed by temporary IREs to available
 * memory. This is a shift factor, so 6 means the ratio 1 to 64. This
 * value can be changed in /etc/system. 6 is a reasonable number.
 */
uint32_t ip_ire_mem_ratio = 6;	/* /etc/system */
/* The shift factor for CPU speed to calculate the max IRE bucket length. */
uint32_t ip_ire_cpu_ratio = 7;	/* /etc/system */

typedef struct nce_clookup_s {
	ipaddr_t ncecl_addr;
	boolean_t ncecl_found;
} nce_clookup_t;

/*
 * The maximum number of buckets in the IRE cache table. In the future, we
 * may want to make it a dynamic hash table. For the moment, we fix the
 * size and allocate the table in ip_ire_init() when IP is first loaded.
 * We take into account the amount of memory a system has.
 */
#define	IP_MAX_CACHE_TABLE_SIZE	4096

/* Settable in /etc/system */
static uint32_t	ip_max_cache_table_size = IP_MAX_CACHE_TABLE_SIZE;
static uint32_t	ip6_max_cache_table_size = IP_MAX_CACHE_TABLE_SIZE;
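
/*
 * For example, the tunables above could be overridden at boot with
 * /etc/system entries such as the following (illustrative values only,
 * not recommendations):
 *
 *	set ip:ip_ire_max_bucket_cnt = 20
 *	set ip:ip_cache_table_size = 0x1000
 *	set ip:ip_ire_cpu_ratio = 6
 */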
/* Zero iulp_t for initialization. */
const iulp_t	ire_uinfo_null = { 0 };

static int	ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp,
    ipsq_func_t func, boolean_t);
static void	ire_delete_v4(ire_t *ire);
static void	ire_walk_ipvers(pfv_t func, void *arg, uchar_t vers,
    zoneid_t zoneid, ip_stack_t *);
static void	ire_walk_ill_ipvers(uint_t match_flags, uint_t ire_type,
    pfv_t func, void *arg, uchar_t vers, ill_t *ill);
static void	ire_cache_cleanup(irb_t *irb, uint32_t threshold,
    ire_t *ref_ire);
static void	ip_nce_clookup_and_delete(nce_t *nce, void *arg);
static ire_t	*ip4_ctable_lookup_impl(ire_ctable_args_t *margs);
#ifdef DEBUG
static void	ire_trace_cleanup(const ire_t *);
#endif

/*
 * To avoid bloating the code, we call this function instead of
 * using the macro IRE_REFRELE. Use the macro only in performance
 * critical paths.
 *
 * Must not be called while holding any locks. Otherwise if this is
 * the last reference to be released there is a chance of recursive mutex
 * panic due to ire_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying
 * to restart an ioctl. The one exception is when the caller is sure that
 * this is not the last reference to be released. E.g. if the caller is
 * sure that the ire has not been deleted and won't be deleted.
 */
void
ire_refrele(ire_t *ire)
{
	IRE_REFRELE(ire);
}

void
ire_refrele_notr(ire_t *ire)
{
	IRE_REFRELE_NOTR(ire);
}

/*
 * kmem_cache_alloc constructor for IRE in kma space.
 * Note that when ire_mp is set the IRE is stored in that mblk and
 * not in this cache.
 */
/* ARGSUSED */
static int
ip_ire_constructor(void *buf, void *cdrarg, int kmflags)
{
	ire_t	*ire = buf;

	ire->ire_nce = NULL;

	return (0);
}

/* ARGSUSED1 */
static void
ip_ire_destructor(void *buf, void *cdrarg)
{
	ire_t	*ire = buf;

	ASSERT(ire->ire_nce == NULL);
}

/*
 * This function is associated with the IP_IOC_IRE_ADVISE_NO_REPLY
 * IOCTL. It is used by TCP (or other ULPs) to supply revised information
 * for an existing CACHED IRE.
 */
/* ARGSUSED */
int
ip_ire_advise(queue_t *q, mblk_t *mp, cred_t *ioc_cr)
{
	uchar_t	*addr_ucp;
	ipic_t	*ipic;
	ire_t	*ire;
	ipaddr_t	addr;
	in6_addr_t	v6addr;
	irb_t	*irb;
	zoneid_t	zoneid;
	ip_stack_t	*ipst = CONNQ_TO_IPST(q);

	ASSERT(q->q_next == NULL);
	zoneid = Q_TO_CONN(q)->conn_zoneid;

	/*
	 * Check privilege using the ioctl credential; if it is NULL
	 * then this is a kernel message and therefore privileged.
	 */
	if (ioc_cr != NULL && secpolicy_ip_config(ioc_cr, B_FALSE) != 0)
		return (EPERM);

	ipic = (ipic_t *)mp->b_rptr;
	if (!(addr_ucp = mi_offset_param(mp, ipic->ipic_addr_offset,
	    ipic->ipic_addr_length))) {
		return (EINVAL);
	}
	if (!OK_32PTR(addr_ucp))
		return (EINVAL);
	switch (ipic->ipic_addr_length) {
	case IP_ADDR_LEN: {
		/* Extract the destination address. */
		addr = *(ipaddr_t *)addr_ucp;
		/* Find the corresponding IRE. */
		ire = ire_cache_lookup(addr, zoneid, NULL, ipst);
		break;
	}
	case IPV6_ADDR_LEN: {
		/* Extract the destination address. */
		v6addr = *(in6_addr_t *)addr_ucp;
		/* Find the corresponding IRE. */
		ire = ire_cache_lookup_v6(&v6addr, zoneid, NULL, ipst);
		break;
	}
	default:
		return (EINVAL);
	}

	if (ire == NULL)
		return (ENOENT);
	/*
	 * Update the round trip time estimate and/or the max frag size
	 * and/or the slow start threshold.
	 *
	 * We serialize multiple advises using ire_lock.
	 */
	mutex_enter(&ire->ire_lock);
	if (ipic->ipic_rtt) {
		/*
		 * If there are no old cached values, initialize them
		 * conservatively. Set them to be (1.5 * new value).
		 */
		if (ire->ire_uinfo.iulp_rtt != 0) {
			ire->ire_uinfo.iulp_rtt = (ire->ire_uinfo.iulp_rtt +
			    ipic->ipic_rtt) >> 1;
		} else {
			ire->ire_uinfo.iulp_rtt = ipic->ipic_rtt +
			    (ipic->ipic_rtt >> 1);
		}
		if (ire->ire_uinfo.iulp_rtt_sd != 0) {
			ire->ire_uinfo.iulp_rtt_sd =
			    (ire->ire_uinfo.iulp_rtt_sd +
			    ipic->ipic_rtt_sd) >> 1;
		} else {
			ire->ire_uinfo.iulp_rtt_sd = ipic->ipic_rtt_sd +
			    (ipic->ipic_rtt_sd >> 1);
		}
	}
	if (ipic->ipic_max_frag)
		ire->ire_max_frag = MIN(ipic->ipic_max_frag, IP_MAXPACKET);
	if (ipic->ipic_ssthresh != 0) {
		if (ire->ire_uinfo.iulp_ssthresh != 0)
			ire->ire_uinfo.iulp_ssthresh =
			    (ipic->ipic_ssthresh +
			    ire->ire_uinfo.iulp_ssthresh) >> 1;
		else
			ire->ire_uinfo.iulp_ssthresh = ipic->ipic_ssthresh;
	}
	/*
	 * Don't need the ire_lock below this. ire_type does not change
	 * after initialization. ire_marks is protected by irb_lock.
	 */
	mutex_exit(&ire->ire_lock);

	if (ipic->ipic_ire_marks != 0 && ire->ire_type == IRE_CACHE) {
		/*
		 * Only increment the temporary IRE count if the original
		 * IRE is not already marked temporary.
		 */
		irb = ire->ire_bucket;
		rw_enter(&irb->irb_lock, RW_WRITER);
		if ((ipic->ipic_ire_marks & IRE_MARK_TEMPORARY) &&
		    !(ire->ire_marks & IRE_MARK_TEMPORARY)) {
			irb->irb_tmp_ire_cnt++;
		}
		ire->ire_marks |= ipic->ipic_ire_marks;
		rw_exit(&irb->irb_lock);
	}

	ire_refrele(ire);
	return (0);
}
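
/*
 * For illustration, a ULP issuing the advise ioctl above would lay out
 * its mblk roughly as follows (a sketch only; dst, srtt and rttvar are
 * hypothetical caller-side values, while the ipic_t fields are the ones
 * read above):
 *
 *	ipic = (ipic_t *)mp->b_rptr;
 *	ipic->ipic_addr_offset = sizeof (ipic_t);
 *	ipic->ipic_addr_length = IP_ADDR_LEN;
 *	bcopy(&dst, mp->b_rptr + ipic->ipic_addr_offset, IP_ADDR_LEN);
 *	ipic->ipic_rtt = srtt;
 *	ipic->ipic_rtt_sd = rttvar;
 *
 * The code above then folds each non-zero value into the cached IRE by
 * averaging it with the existing estimate ((old + new) >> 1).
 */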

/*
 * This function is associated with the IP_IOC_IRE_DELETE[_NO_REPLY]
 * IOCTL[s]. The NO_REPLY form is used by TCP to delete a route IRE
 * for a host that is not responding. This will force an attempt to
 * establish a new route, if available, and flush out the ARP entry so
 * it will re-resolve. Management processes may want to use the
 * version that generates a reply.
 *
 * This function does not support IPv6 since Neighbor Unreachability
 * Detection means that negative advice like this is useless.
 */
/* ARGSUSED */
int
ip_ire_delete(queue_t *q, mblk_t *mp, cred_t *ioc_cr)
{
	uchar_t		*addr_ucp;
	ipaddr_t	addr;
	ire_t		*ire;
	ipid_t		*ipid;
	boolean_t	routing_sock_info = B_FALSE;	/* Sent info? */
	zoneid_t	zoneid;
	ire_t		*gire = NULL;
	ill_t		*ill;
	mblk_t		*arp_mp;
	ip_stack_t	*ipst;

	ASSERT(q->q_next == NULL);
	zoneid = Q_TO_CONN(q)->conn_zoneid;
	ipst = CONNQ_TO_IPST(q);

	/*
	 * Check privilege using the ioctl credential; if it is NULL
	 * then this is a kernel message and therefore privileged.
	 */
	if (ioc_cr != NULL && secpolicy_ip_config(ioc_cr, B_FALSE) != 0)
		return (EPERM);

	ipid = (ipid_t *)mp->b_rptr;

	/* Only actions on IRE_CACHEs are acceptable at present. */
	if (ipid->ipid_ire_type != IRE_CACHE)
		return (EINVAL);

	addr_ucp = mi_offset_param(mp, ipid->ipid_addr_offset,
	    ipid->ipid_addr_length);
	if (addr_ucp == NULL || !OK_32PTR(addr_ucp))
		return (EINVAL);
	switch (ipid->ipid_addr_length) {
	case IP_ADDR_LEN:
		/* addr_ucp points at IP addr */
		break;
	case sizeof (sin_t): {
		sin_t	*sin;
		/*
		 * Got a complete (sockaddr) address - increment addr_ucp
		 * to point at the ip_addr field.
		 */
		sin = (sin_t *)addr_ucp;
		addr_ucp = (uchar_t *)&sin->sin_addr.s_addr;
		break;
	}
	default:
		return (EINVAL);
	}
	/* Extract the destination address. */
	bcopy(addr_ucp, &addr, IP_ADDR_LEN);

	/* Try to find the CACHED IRE. */
	ire = ire_cache_lookup(addr, zoneid, NULL, ipst);

	/* Nail it. */
	if (ire) {
		/* Allow delete only on CACHE entries */
		if (ire->ire_type != IRE_CACHE) {
			ire_refrele(ire);
			return (EINVAL);
		}

		/*
		 * Verify that the IRE has been around for a while.
		 * This is to protect against transport protocols
		 * that are too eager in sending delete messages.
		 */
		if (gethrestime_sec() <
		    ire->ire_create_time + ipst->ips_ip_ignore_delete_time) {
			ire_refrele(ire);
			return (EINVAL);
		}
		/*
		 * Now we have a potentially dead cache entry. We need
		 * to remove it.
		 * If this cache entry is generated from a
		 * default route (i.e., ire_cmask == 0),
		 * search the default list and mark it dead and some
		 * background process will try to activate it.
		 */
		if ((ire->ire_gateway_addr != 0) && (ire->ire_cmask == 0)) {
			/*
			 * Make sure that we pick a different
			 * IRE_DEFAULT next time.
			 */
			ire_t *gw_ire;
			irb_t *irb = NULL;
			uint_t match_flags;

			match_flags = (MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE);

			gire = ire_ftable_lookup(ire->ire_addr,
			    ire->ire_cmask, 0, 0,
			    ire->ire_ipif, NULL, zoneid, 0, NULL, match_flags,
			    ipst);

			ip3dbg(("ire_ftable_lookup() returned gire %p\n",
			    (void *)gire));

			if (gire != NULL) {
				irb = gire->ire_bucket;

				/*
				 * We grab it as writer just to serialize
				 * multiple threads trying to bump up
				 * irb_rr_origin
				 */
				rw_enter(&irb->irb_lock, RW_WRITER);
				if ((gw_ire = irb->irb_rr_origin) == NULL) {
					rw_exit(&irb->irb_lock);
					goto done;
				}

				DTRACE_PROBE1(ip__ire__del__origin,
				    (ire_t *), gw_ire);

				/* Skip past the potentially bad gateway */
				if (ire->ire_gateway_addr ==
				    gw_ire->ire_gateway_addr) {
					ire_t *next = gw_ire->ire_next;

					DTRACE_PROBE2(ip__ire__del,
					    (ire_t *), gw_ire, (irb_t *), irb);
					IRE_FIND_NEXT_ORIGIN(next);
					irb->irb_rr_origin = next;
				}
				rw_exit(&irb->irb_lock);
			}
		}
done:
		if (gire != NULL)
			IRE_REFRELE(gire);
		/* report the bad route to routing sockets */
		ip_rts_change(RTM_LOSING, ire->ire_addr, ire->ire_gateway_addr,
		    ire->ire_mask, ire->ire_src_addr, 0, 0, 0,
		    (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_IFA), ipst);
		routing_sock_info = B_TRUE;

		/*
		 * TCP is really telling us to start over completely, and it
		 * expects that we'll resend the ARP query. Tell ARP to
		 * discard the entry, if this is a local destination.
		 *
		 * But, if the ARP entry is permanent then it shouldn't be
		 * deleted, so we set ARED_F_PRESERVE_PERM.
		 */
		ill = ire->ire_stq->q_ptr;
		if (ire->ire_gateway_addr == 0 &&
		    (arp_mp = ill_ared_alloc(ill, addr)) != NULL) {
			ared_t	*ared = (ared_t *)arp_mp->b_rptr;

			ASSERT(ared->ared_cmd == AR_ENTRY_DELETE);
			ared->ared_flags |= ARED_F_PRESERVE_PERM;
			putnext(ill->ill_rq, arp_mp);
		}

		ire_delete(ire);
		ire_refrele(ire);
	}
	/*
	 * Also look for an IRE_HOST type redirect ire and
	 * remove it if present.
	 */
	ire = ire_route_lookup(addr, 0, 0, IRE_HOST, NULL, NULL,
	    ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);

	/* Nail it. */
	if (ire != NULL) {
		if (ire->ire_flags & RTF_DYNAMIC) {
			if (!routing_sock_info) {
				ip_rts_change(RTM_LOSING, ire->ire_addr,
				    ire->ire_gateway_addr, ire->ire_mask,
				    ire->ire_src_addr, 0, 0, 0,
				    (RTA_DST | RTA_GATEWAY |
				    RTA_NETMASK | RTA_IFA),
				    ipst);
			}
			ire_delete(ire);
		}
		ire_refrele(ire);
	}
	return (0);
}

/*
 * ip_ire_req is called by ip_wput when an IRE_DB_REQ_TYPE message is handed
 * down from the Upper Level Protocol to request a copy of the IRE (to check
 * its type or to extract information like round-trip time estimates or the
 * MTU.)
 * The address is assumed to be in the ire_addr field. If no IRE is found
 * an IRE is returned with ire_type being zero.
 * Note that the upper level protocol has to check for broadcast
 * (IRE_BROADCAST) and multicast (CLASSD(addr)).
 * If there is a b_cont the resulting IRE_DB_TYPE mblk is placed at the
 * end of the returned message.
 *
 * TCP sends down a message of this type with a connection request packet
 * chained on. UDP and ICMP send it down to verify that a route exists for
 * the destination address when they get connected.
 */
void
ip_ire_req(queue_t *q, mblk_t *mp)
{
	ire_t	*inire;
	ire_t	*ire;
	mblk_t	*mp1;
	ire_t	*sire = NULL;
	zoneid_t zoneid = Q_TO_CONN(q)->conn_zoneid;
	ip_stack_t	*ipst = CONNQ_TO_IPST(q);

	ASSERT(q->q_next == NULL);

	if ((mp->b_wptr - mp->b_rptr) < sizeof (ire_t) ||
	    !OK_32PTR(mp->b_rptr)) {
		freemsg(mp);
		return;
	}
	inire = (ire_t *)mp->b_rptr;
	/*
	 * Got it, now take our best shot at an IRE.
	 */
	if (inire->ire_ipversion == IPV6_VERSION) {
		ire = ire_route_lookup_v6(&inire->ire_addr_v6, 0, 0, 0,
		    NULL, &sire, zoneid, NULL,
		    (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT), ipst);
	} else {
		ASSERT(inire->ire_ipversion == IPV4_VERSION);
		ire = ire_route_lookup(inire->ire_addr, 0, 0, 0,
		    NULL, &sire, zoneid, NULL,
		    (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT), ipst);
	}

	/*
	 * We prevent returning IREs with source address INADDR_ANY
	 * as these were temporarily created for sending packets
	 * from endpoints that have conn_unspec_src set.
	 */
	if (ire == NULL ||
	    (ire->ire_ipversion == IPV4_VERSION &&
	    ire->ire_src_addr == INADDR_ANY) ||
	    (ire->ire_ipversion == IPV6_VERSION &&
	    IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6))) {
		inire->ire_type = 0;
	} else {
		bcopy(ire, inire, sizeof (ire_t));
		/* Copy the route metrics from the parent. */
		if (sire != NULL) {
			bcopy(&(sire->ire_uinfo), &(inire->ire_uinfo),
			    sizeof (iulp_t));
		}

		/*
		 * As we don't lookup global policy here, we may not
		 * pass the right size if per-socket policy is not
		 * present. For these cases, path mtu discovery will
		 * do the right thing.
		 */
		inire->ire_ipsec_overhead = conn_ipsec_length(Q_TO_CONN(q));

		/* Pass the latest setting of the ip_path_mtu_discovery */
		inire->ire_frag_flag |=
		    (ipst->ips_ip_path_mtu_discovery) ? IPH_DF : 0;
	}
	if (ire != NULL)
		ire_refrele(ire);
	if (sire != NULL)
		ire_refrele(sire);
	mp->b_wptr = &mp->b_rptr[sizeof (ire_t)];
	mp->b_datap->db_type = IRE_DB_TYPE;

	/* Put the IRE_DB_TYPE mblk last in the chain */
	mp1 = mp->b_cont;
	if (mp1 != NULL) {
		mp->b_cont = NULL;
		linkb(mp1, mp);
		mp = mp1;
	}
	qreply(q, mp);
}
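
/*
 * A minimal sketch of the request side of the exchange handled above
 * (illustrative only; dst and wq are hypothetical caller-side values):
 *
 *	mp = allocb(sizeof (ire_t), BPRI_HI);
 *	if (mp != NULL) {
 *		mp->b_datap->db_type = IRE_DB_REQ_TYPE;
 *		ire = (ire_t *)mp->b_rptr;
 *		bzero(ire, sizeof (ire_t));
 *		ire->ire_ipversion = IPV4_VERSION;
 *		ire->ire_addr = dst;
 *		mp->b_wptr = mp->b_rptr + sizeof (ire_t);
 *		putnext(wq, mp);
 *	}
 *
 * The reply comes back as an IRE_DB_TYPE mblk, placed last in the chain
 * if a packet was chained on, as described above.
 */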

/*
 * Send a packet using the specified IRE.
 * If ire_src_addr_v6 is all zero then discard the IRE after
 * the packet has been sent.
 */
static void
ire_send(queue_t *q, mblk_t *pkt, ire_t *ire)
{
	mblk_t *ipsec_mp;
	boolean_t is_secure;
	uint_t ifindex;
	ill_t	*ill;
	zoneid_t zoneid = ire->ire_zoneid;
	ip_stack_t	*ipst = ire->ire_ipst;

	ASSERT(ire->ire_ipversion == IPV4_VERSION);
	ASSERT(!(ire->ire_type & IRE_LOCAL)); /* Has different ire_zoneid */
	ipsec_mp = pkt;
	is_secure = (pkt->b_datap->db_type == M_CTL);
	if (is_secure) {
		ipsec_out_t *io;

		pkt = pkt->b_cont;
		io = (ipsec_out_t *)ipsec_mp->b_rptr;
		if (io->ipsec_out_type == IPSEC_OUT)
			zoneid = io->ipsec_out_zoneid;
	}

	/* If the packet originated externally then */
	if (pkt->b_prev) {
		ire_refrele(ire);
		/*
		 * Extract the ifindex from b_prev (set in ip_rput_noire).
		 * Look up the interface to see if it still exists (it could
		 * have been unplumbed by the time the reply came back from
		 * ARP).
		 */
		ifindex = (uint_t)(uintptr_t)pkt->b_prev;
		ill = ill_lookup_on_ifindex(ifindex, B_FALSE,
		    NULL, NULL, NULL, NULL, ipst);
		if (ill == NULL) {
			pkt->b_prev = NULL;
			pkt->b_next = NULL;
			freemsg(ipsec_mp);
			return;
		}
		q = ill->ill_rq;
		pkt->b_prev = NULL;
		/*
		 * This packet has not gone through IPSEC processing
		 * and hence we should not have any IPSEC message
		 * prepended.
		 */
		ASSERT(ipsec_mp == pkt);
		put(q, pkt);
		ill_refrele(ill);
	} else if (pkt->b_next) {
		/* Packets from multicast router */
		pkt->b_next = NULL;
		/*
		 * We never get the IPSEC_OUT while forwarding the
		 * packet for multicast router.
		 */
		ASSERT(ipsec_mp == pkt);
		ip_rput_forward(ire, (ipha_t *)pkt->b_rptr, ipsec_mp, NULL);
		ire_refrele(ire);
	} else {
		/* Locally originated packets */
		boolean_t delete_ire = B_FALSE;
		ipha_t *ipha = (ipha_t *)pkt->b_rptr;

		/*
		 * If this IRE shouldn't be kept in the table (because its
		 * source address is unspecified), hold a reference to it so
		 * we can delete it even after e.g. ip_wput_ire() has dropped
		 * its reference.
		 */
		if (!(ire->ire_marks & IRE_MARK_NOADD) &&
		    ire->ire_src_addr == INADDR_ANY) {
			delete_ire = B_TRUE;
			IRE_REFHOLD(ire);
		}

		/*
		 * If we were resolving a router we can not use the
		 * router's IRE for sending the packet (since it would
		 * violate the uniqueness of the IP idents), thus we
		 * make another pass through ip_wput to create the IRE_CACHE
		 * for the destination.
		 * When IRE_MARK_NOADD is set, ire_add() is not called.
		 * Thus ip_wput() would never find the ire and would result
		 * in an infinite loop. Thus we check whether IRE_MARK_NOADD
		 * is set. This also implies that IRE_MARK_NOADD can only be
		 * used to send packets to directly connected hosts.
		 */
		if (ipha->ipha_dst != ire->ire_addr &&
		    !(ire->ire_marks & IRE_MARK_NOADD)) {
			ire_refrele(ire);	/* Held in ire_add */
			if (CONN_Q(q)) {
				(void) ip_output(Q_TO_CONN(q), ipsec_mp, q,
				    IRE_SEND);
			} else {
				(void) ip_output((void *)(uintptr_t)zoneid,
				    ipsec_mp, q, IRE_SEND);
			}
		} else {
			if (is_secure) {
				ipsec_out_t *oi;
				ipha_t *ipha;

				oi = (ipsec_out_t *)ipsec_mp->b_rptr;
				ipha = (ipha_t *)ipsec_mp->b_cont->b_rptr;
				if (oi->ipsec_out_proc_begin) {
					/*
					 * This is the case where
					 * ip_wput_ipsec_out could not find
					 * the IRE and recreated a new one.
					 * As ip_wput_ipsec_out does ire
					 * lookups, ire_refrele for the extra
					 * bump in ire_add.
					 */
					ire_refrele(ire);
					ip_wput_ipsec_out(q, ipsec_mp, ipha,
					    NULL, NULL);
				} else {
					/*
					 * IRE_REFRELE will be done in
					 * ip_wput_ire.
					 */
					ip_wput_ire(q, ipsec_mp, ire, NULL,
					    IRE_SEND, zoneid);
				}
			} else {
				/*
				 * IRE_REFRELE will be done in ip_wput_ire.
				 */
				ip_wput_ire(q, ipsec_mp, ire, NULL,
				    IRE_SEND, zoneid);
			}
		}
		/*
		 * Special code to support sending a single packet with
		 * conn_unspec_src using an IRE which has no source address.
		 * The IRE is deleted here after sending the packet to avoid
		 * having other code trip on it. But before we delete the
		 * ire, somebody could have looked up this ire.
		 * We prevent returning/using this IRE by the upper layers
		 * by making checks to NULL source address in other places
		 * like e.g. ip_ire_append, ip_ire_req and ip_bind_connected.
		 * Though this does not completely prevent other threads
		 * from using this ire, it should not cause any problems.
		 */
		if (delete_ire) {
			ip1dbg(("ire_send: delete IRE\n"));
			ire_delete(ire);
			ire_refrele(ire);	/* Held above */
		}
	}
}

/*
 * Send a packet using the specified IRE.
 * If ire_src_addr_v6 is all zero then discard the IRE after
 * the packet has been sent.
 */
static void
ire_send_v6(queue_t *q, mblk_t *pkt, ire_t *ire)
{
	mblk_t *ipsec_mp;
	boolean_t secure;
	uint_t ifindex;
	zoneid_t zoneid = ire->ire_zoneid;
	ip_stack_t	*ipst = ire->ire_ipst;

	ASSERT(ire->ire_ipversion == IPV6_VERSION);
	ASSERT(!(ire->ire_type & IRE_LOCAL)); /* Has different ire_zoneid */
	if (pkt->b_datap->db_type == M_CTL) {
		ipsec_out_t *io;

		ipsec_mp = pkt;
		pkt = pkt->b_cont;
		secure = B_TRUE;
		io = (ipsec_out_t *)ipsec_mp->b_rptr;
		if (io->ipsec_out_type == IPSEC_OUT)
			zoneid = io->ipsec_out_zoneid;
	} else {
		ipsec_mp = pkt;
		secure = B_FALSE;
	}

	/* If the packet originated externally then */
	if (pkt->b_prev) {
		ill_t	*ill;
		/*
		 * Extract the ifindex from b_prev (set in ip_rput_data_v6).
		 * Look up the interface to see if it still exists (it could
		 * have been unplumbed by the time the reply came back from
		 * the resolver).
		 */
		ifindex = (uint_t)(uintptr_t)pkt->b_prev;
		ill = ill_lookup_on_ifindex(ifindex, B_TRUE,
		    NULL, NULL, NULL, NULL, ipst);
		if (ill == NULL) {
			pkt->b_prev = NULL;
			pkt->b_next = NULL;
			freemsg(ipsec_mp);
			ire_refrele(ire);	/* Held in ire_add */
			return;
		}
		q = ill->ill_rq;
		pkt->b_prev = NULL;
		/*
		 * This packet has not gone through IPSEC processing
		 * and hence we should not have any IPSEC message
		 * prepended.
		 */
		ASSERT(ipsec_mp == pkt);
		put(q, pkt);
		ill_refrele(ill);
	} else if (pkt->b_next) {
		/* Packets from multicast router */
		pkt->b_next = NULL;
		/*
		 * We never get the IPSEC_OUT while forwarding the
		 * packet for multicast router.
		 */
		ASSERT(ipsec_mp == pkt);
		/*
		 * XXX TODO IPv6.
		 */
		freemsg(pkt);
#ifdef XXX
		ip_rput_forward(ire, (ipha_t *)pkt->b_rptr, pkt, NULL);
#endif
	} else {
		if (secure) {
			ipsec_out_t *oi;
			ip6_t *ip6h;

			oi = (ipsec_out_t *)ipsec_mp->b_rptr;
			ip6h = (ip6_t *)ipsec_mp->b_cont->b_rptr;
			if (oi->ipsec_out_proc_begin) {
				/*
				 * This is the case where
				 * ip_wput_ipsec_out could not find
				 * the IRE and recreated a new one.
				 */
				ip_wput_ipsec_out_v6(q, ipsec_mp, ip6h,
				    NULL, NULL);
			} else {
				if (CONN_Q(q)) {
					(void) ip_output_v6(Q_TO_CONN(q),
					    ipsec_mp, q, IRE_SEND);
				} else {
					(void) ip_output_v6(
					    (void *)(uintptr_t)zoneid,
					    ipsec_mp, q, IRE_SEND);
				}
			}
		} else {
			/*
			 * Send packets through ip_output_v6 so that any
			 * ip6_info header can be processed again.
			 */
			if (CONN_Q(q)) {
				(void) ip_output_v6(Q_TO_CONN(q), ipsec_mp, q,
				    IRE_SEND);
			} else {
				(void) ip_output_v6((void *)(uintptr_t)zoneid,
				    ipsec_mp, q, IRE_SEND);
			}
		}
		/*
		 * Special code to support sending a single packet with
		 * conn_unspec_src using an IRE which has no source address.
		 * The IRE is deleted here after sending the packet to avoid
		 * having other code trip on it. But before we delete the
		 * ire, somebody could have looked up this ire.
		 * We prevent returning/using this IRE by the upper layers
		 * by making checks to NULL source address in other places
		 * like e.g. ip_ire_append_v6, ip_ire_req and
		 * ip_bind_connected_v6. Though this does not completely
		 * prevent other threads from using this ire, it should
		 * not cause any problems.
		 */
		if (IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6)) {
			ip1dbg(("ire_send_v6: delete IRE\n"));
			ire_delete(ire);
		}
	}
	ire_refrele(ire);	/* Held in ire_add */
}

/*
 * Make sure that the IRE bucket does not get too long.
 * This can cause lockup because ire_cache_lookup()
 * may take "forever" to finish.
 *
 * We only remove a maximum of cnt IREs each time. This
 * should keep the bucket length approximately constant,
 * depending on cnt. This should be enough to defend
 * against DoS attacks based on creating temporary IREs
 * (for forwarding and non-TCP traffic).
 *
 * We also pass in the address of the newly created IRE
 * as we do not want to remove it straight after adding
 * it. New IREs are normally added at the tail of the
 * bucket. This means that we are removing the "oldest"
 * temporary IREs added. Only if there are IREs with
 * the same ire_addr do we not add it at the tail. Refer
 * to ire_add_v*(). It should be OK for our purpose.
 *
 * For non-temporary cached IREs, we make sure that they
 * have not been used for some time (defined below), they
 * are non-local destinations, and there is no one using
 * them at the moment (refcnt == 1).
 *
 * The above means that the IRE bucket length may become
 * very long, consisting of mostly non-temporary IREs.
 * This can happen when the hash function does a bad job
 * so that most TCP connections cluster to a specific bucket.
 * This "hopefully" should never happen. It can also
 * happen if most TCP connections have very long lives.
 * Even with the minimal hash table size of 256, there
 * has to be a lot of such connections to make the bucket
 * length unreasonably long. This should probably not
 * happen either. The third case when this can happen is
 * when the machine is under attack, such as SYN flooding.
 * TCP should already have the proper mechanism to protect
 * against that. So we should be safe.
 *
 * This function is called by ire_add_then_send() after
 * a new IRE is added and the packet is sent.
 *
 * The idle cutoff interval is set to 60s. It can be
 * changed using /etc/system.
 */
uint32_t ire_idle_cutoff_interval = 60000;

static void
ire_cache_cleanup(irb_t *irb, uint32_t threshold, ire_t *ref_ire)
{
	ire_t *ire;
	clock_t cut_off = drv_usectohz(ire_idle_cutoff_interval * 1000);
	int cnt = ip_ire_cleanup_cnt;

	/*
	 * Try to remove cnt temporary IREs first.
	 */
	for (ire = irb->irb_ire; cnt > 0 && ire != NULL; ire = ire->ire_next) {
		if (ire == ref_ire)
			continue;
		if (ire->ire_marks & IRE_MARK_CONDEMNED)
			continue;
		if (ire->ire_marks & IRE_MARK_TEMPORARY) {
			ASSERT(ire->ire_type == IRE_CACHE);
			ire_delete(ire);
			cnt--;
		}
	}
	if (cnt == 0)
		return;

	/*
	 * If we didn't satisfy our removal target from temporary IREs
	 * we see how many non-temporary IREs are currently in the bucket.
	 * If this quantity is above the threshold then we see if there are any
	 * candidates for removal. We are still limited to removing a maximum
	 * of cnt IREs.
	 */
	if ((irb->irb_ire_cnt - irb->irb_tmp_ire_cnt) > threshold) {
		for (ire = irb->irb_ire; cnt > 0 && ire != NULL;
		    ire = ire->ire_next) {
			if (ire == ref_ire)
				continue;
			if (ire->ire_type != IRE_CACHE)
				continue;
			if (ire->ire_marks & IRE_MARK_CONDEMNED)
				continue;
			if ((ire->ire_refcnt == 1) &&
			    (lbolt - ire->ire_last_used_time > cut_off)) {
				ire_delete(ire);
				cnt--;
			}
		}
	}
}
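
/*
 * To make the policy above concrete: with the defaults in this file
 * (ip_ire_max_bucket_cnt = 10, ip_ire_cleanup_cnt = 2 and
 * ire_idle_cutoff_interval = 60000), a call on a bucket holding 3
 * temporary and 12 non-temporary IREs deletes at most 2 entries:
 * temporary ones first and, if fewer than 2 of those were found, then
 * (since 12 > 10) non-temporary IRE_CACHE entries that are unreferenced
 * (ire_refcnt == 1) and have been idle for more than 60 seconds.
 */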

/*
 * ire_add_then_send is called when a new IRE has been created in order to
 * route an outgoing packet. Typically, it is called from ip_wput when
 * a response comes back down from a resolver. We add the IRE, and then
 * possibly run the packet through ip_wput or ip_rput, as appropriate.
 * However, we do not add the newly created IRE to the cache when
 * IRE_MARK_NOADD is set in the IRE. IRE_MARK_NOADD is set at
 * ip_newroute_ipif(). The ires with IRE_MARK_NOADD are ire_refrele'd by
 * ip_wput_ire() and get deleted.
 * Multirouting support: the packet is silently discarded when the new IRE
 * holds the RTF_MULTIRT flag, but is not the first IRE to be added with the
 * RTF_MULTIRT flag for the same destination address.
 * In this case, we just want to register this additional ire without
 * sending the packet, as it has already been replicated through
 * existing multirt routes in ip_wput().
 */
void
ire_add_then_send(queue_t *q, ire_t *ire, mblk_t *mp)
{
	irb_t *irb;
	boolean_t drop = B_FALSE;
	boolean_t mctl_present;
	mblk_t *first_mp = NULL;
	mblk_t *data_mp = NULL;
	ire_t *dst_ire;
	ipha_t *ipha;
	ip6_t *ip6h;
	ip_stack_t *ipst = ire->ire_ipst;
	int ire_limit;

	if (mp != NULL) {
		/*
		 * We first have to retrieve the destination address carried
		 * by the packet.
		 * We can't rely on ire as it can be related to a gateway.
		 * The destination address will help in determining if
		 * other RTF_MULTIRT ires are already registered.
		 *
		 * We first need to know where we are going : v4 or v6.
		 * The ire version is enough, as there is no risk that
		 * we resolve an IPv6 address with an IPv4 ire
		 * or vice versa.
		 */
		EXTRACT_PKT_MP(mp, first_mp, mctl_present);
		data_mp = mp;
		mp = first_mp;
		if (ire->ire_ipversion == IPV4_VERSION) {
			ipha = (ipha_t *)data_mp->b_rptr;
			dst_ire = ire_cache_lookup(ipha->ipha_dst,
			    ire->ire_zoneid, msg_getlabel(mp), ipst);
		} else {
			ASSERT(ire->ire_ipversion == IPV6_VERSION);
			ip6h = (ip6_t *)data_mp->b_rptr;
			dst_ire = ire_cache_lookup_v6(&ip6h->ip6_dst,
			    ire->ire_zoneid, msg_getlabel(mp), ipst);
		}
		if (dst_ire != NULL) {
			if (dst_ire->ire_flags & RTF_MULTIRT) {
				/*
				 * At least one resolved multirt route
				 * already exists for the destination;
				 * don't send this packet: either drop it
				 * or complete the pending resolution,
				 * depending on the ire.
				 */
				drop = B_TRUE;
			}
			ip1dbg(("ire_add_then_send: dst_ire %p "
			    "[dst %08x, gw %08x], drop %d\n",
			    (void *)dst_ire,
			    (dst_ire->ire_ipversion == IPV4_VERSION) ? \
			    ntohl(dst_ire->ire_addr) : \
			    ntohl(V4_PART_OF_V6(dst_ire->ire_addr_v6)),
			    (dst_ire->ire_ipversion == IPV4_VERSION) ? \
			    ntohl(dst_ire->ire_gateway_addr) : \
			    ntohl(V4_PART_OF_V6(
			    dst_ire->ire_gateway_addr_v6)),
			    drop));
			ire_refrele(dst_ire);
		}
	}

	if (!(ire->ire_marks & IRE_MARK_NOADD)) {
		/* Regular packets with cache-bound ires are here. */
		(void) ire_add(&ire, NULL, NULL, NULL, B_FALSE);

		if (ire == NULL) {
			mp->b_prev = NULL;
			mp->b_next = NULL;
			MULTIRT_DEBUG_UNTAG(mp);
			freemsg(mp);
			return;
		}
		if (mp == NULL) {
			ire_refrele(ire);	/* Held in ire_add_v4/v6 */
			return;
		}
	}
	if (drop) {
		/*
		 * If we're adding an RTF_MULTIRT ire, the resolution
		 * is over: we just drop the packet.
		 */
		if (ire->ire_flags & RTF_MULTIRT) {
			data_mp->b_prev = NULL;
			data_mp->b_next = NULL;
			MULTIRT_DEBUG_UNTAG(mp);
			freemsg(mp);
		} else {
			/*
			 * Otherwise, we're adding the ire to a gateway
			 * for a multirt route.
			 * Invoke ip_newroute() to complete the resolution
			 * of the route. We will then come back here and
			 * finally drop this packet in the above code.
			 */
			if (ire->ire_ipversion == IPV4_VERSION) {
				/*
				 * TODO: in order for CGTP to work in non-global
				 * zones, ip_newroute() must create the IRE
				 * cache in the zone indicated by
				 * ire->ire_zoneid.
				 */
				ip_newroute(q, mp, ipha->ipha_dst,
				    (CONN_Q(q) ? Q_TO_CONN(q) : NULL),
				    ire->ire_zoneid, ipst);
			} else {
				int minlen = sizeof (ip6i_t) + IPV6_HDR_LEN;

				ASSERT(ire->ire_ipversion == IPV6_VERSION);

				/*
				 * If necessary, skip over the ip6i_t to find
				 * the header with the actual source address.
				 */
				if (ip6h->ip6_nxt == IPPROTO_RAW) {
					if (MBLKL(data_mp) < minlen &&
					    pullupmsg(data_mp, -1) == 0) {
						ip1dbg(("ire_add_then_send: "
						    "cannot pullupmsg ip6i\n"));
						if (mctl_present)
							freeb(first_mp);
						ire_refrele(ire);
						return;
					}
					ASSERT(MBLKL(data_mp) >= IPV6_HDR_LEN);
					ip6h = (ip6_t *)(data_mp->b_rptr +
					    sizeof (ip6i_t));
				}
				ip_newroute_v6(q, mp, &ip6h->ip6_dst,
				    &ip6h->ip6_src, NULL, ire->ire_zoneid,
				    ipst);
			}
		}

		ire_refrele(ire);	/* As done by ire_send(). */
		return;
	}
	/*
	 * Need to remember ire_bucket here as ire_send*() may delete
	 * the ire, so we cannot reference it after that.
	 */
	irb = ire->ire_bucket;
	if (ire->ire_ipversion == IPV4_VERSION) {
		ire_send(q, mp, ire);
		ire_limit = ip_ire_max_bucket_cnt;
	} else {
		ire_send_v6(q, mp, ire);
		ire_limit = ip6_ire_max_bucket_cnt;
	}

	/*
	 * irb is NULL if the IRE was not added to the hash. This happens
	 * when IRE_MARK_NOADD is set and when IREs are returned from
	 * ire_update_srcif_v4().
	 */
	if (irb != NULL) {
		IRB_REFHOLD(irb);
		if (irb->irb_ire_cnt > ire_limit)
			ire_cache_cleanup(irb, ire_limit, ire);
		IRB_REFRELE(irb);
	}
}

/*
 * Initialize the part of the ire that is specific to IPv4, and call
 * ire_init_common to finish it.
 */
ire_t *
ire_init(ire_t *ire, uchar_t *addr, uchar_t *mask, uchar_t *src_addr,
    uchar_t *gateway, uint_t *max_fragp, nce_t *src_nce, queue_t *rfq,
    queue_t *stq, ushort_t type, ipif_t *ipif, ipaddr_t cmask, uint32_t phandle,
    uint32_t ihandle, uint32_t flags, const iulp_t *ulp_info, tsol_gc_t *gc,
    tsol_gcgrp_t *gcgrp, ip_stack_t *ipst)
{
	ASSERT(type != IRE_CACHE || stq != NULL);
	/*
	 * Reject IRE security attribute creation/initialization
	 * if the system is not running in Trusted mode.
	 */
	if ((gc != NULL || gcgrp != NULL) && !is_system_labeled())
		return (NULL);

	BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_alloced);

	if (addr != NULL)
		bcopy(addr, &ire->ire_addr, IP_ADDR_LEN);
	if (src_addr != NULL)
		bcopy(src_addr, &ire->ire_src_addr, IP_ADDR_LEN);
	if (mask != NULL) {
		bcopy(mask, &ire->ire_mask, IP_ADDR_LEN);
		ire->ire_masklen = ip_mask_to_plen(ire->ire_mask);
	}
	if (gateway != NULL) {
		bcopy(gateway, &ire->ire_gateway_addr, IP_ADDR_LEN);
	}

	if (type == IRE_CACHE)
		ire->ire_cmask = cmask;

	/* ire_init_common will free the mblks upon encountering any failure */
	if (!ire_init_common(ire, max_fragp, src_nce, rfq, stq, type, ipif,
	    phandle, ihandle, flags, IPV4_VERSION, ulp_info, gc, gcgrp, ipst))
		return (NULL);

	return (ire);
}

/*
 * Similar to ire_create except that it is called only when
 * we want to allocate the ire as an mblk, e.g. when we have an external
 * resolver ARP.
 */
ire_t *
ire_create_mp(uchar_t *addr, uchar_t *mask, uchar_t *src_addr, uchar_t *gateway,
    uint_t max_frag, nce_t *src_nce, queue_t *rfq, queue_t *stq, ushort_t type,
    ipif_t *ipif, ipaddr_t cmask, uint32_t phandle, uint32_t ihandle,
    uint32_t flags, const iulp_t *ulp_info, tsol_gc_t *gc, tsol_gcgrp_t *gcgrp,
    ip_stack_t *ipst)
{
	ire_t	*ire, *buf;
	ire_t	*ret_ire;
	mblk_t	*mp;
	size_t	bufsize;
	frtn_t	*frtnp;
	ill_t	*ill;

	bufsize = sizeof (ire_t) + sizeof (frtn_t);
	buf = kmem_alloc(bufsize, KM_NOSLEEP);
	if (buf == NULL) {
		ip1dbg(("ire_create_mp: alloc failed\n"));
		return (NULL);
	}
	frtnp = (frtn_t *)(buf + 1);
	frtnp->free_arg = (caddr_t)buf;
	frtnp->free_func = ire_freemblk;

	/*
	 * Allocate the new IRE. The ire created will hold a ref on
	 * an nce_t after ire_nce_init, and this ref must either be
	 * (a) transferred to the ire_cache entry created when ire_add_v4
	 *     is called after successful arp resolution, or,
	 * (b) released, when arp resolution fails
	 * Case (b) is handled in ire_freemblk() which will be called
	 * when mp is freed as a result of failed arp.
	 */
	mp = esballoc((unsigned char *)buf, bufsize, BPRI_MED, frtnp);
	if (mp == NULL) {
		ip1dbg(("ire_create_mp: alloc failed\n"));
		kmem_free(buf, bufsize);
		return (NULL);
	}
	ire = (ire_t *)mp->b_rptr;
	mp->b_wptr = (uchar_t *)&ire[1];

	/* Start clean. */
	*ire = ire_null;
	ire->ire_mp = mp;
	mp->b_datap->db_type = IRE_DB_TYPE;
	ire->ire_marks |= IRE_MARK_UNCACHED;

	ret_ire = ire_init(ire, addr, mask, src_addr, gateway, NULL, src_nce,
	    rfq, stq, type, ipif, cmask, phandle, ihandle, flags, ulp_info, gc,
	    gcgrp, ipst);

	ill = (ill_t *)(stq->q_ptr);
	if (ret_ire == NULL) {
		/* ire_freemblk needs these set */
		ire->ire_stq_ifindex = ill->ill_phyint->phyint_ifindex;
		ire->ire_stackid = ipst->ips_netstack->netstack_stackid;
		ire->ire_ipst = ipst;
		freeb(ire->ire_mp);
		return (NULL);
	}
	ret_ire->ire_stq_ifindex = ill->ill_phyint->phyint_ifindex;
	ret_ire->ire_stackid = ipst->ips_netstack->netstack_stackid;
	ASSERT(ret_ire == ire);
	ASSERT(ret_ire->ire_ipst == ipst);
	/*
	 * ire_max_frag is normally zero here and is atomically set
	 * under the irebucket lock in ire_add_v[46] except for the
	 * case of IRE_MARK_NOADD. In that event the ire_max_frag
	 * is non-zero here.
	 */
	ire->ire_max_frag = max_frag;
	return (ire);
}
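
/*
 * Note that the esballoc() above ties the IRE's lifetime to that of the
 * mblk: when the mblk is freed (e.g. because ARP resolution failed and
 * the queued request is discarded), STREAMS invokes frtnp->free_func,
 * i.e. ire_freemblk(), on the embedded buffer, which releases the nce
 * reference described in case (b) above. A sketch of that failure path:
 *
 *	freeb(ire->ire_mp);	(STREAMS then calls ire_freemblk(buf))
 */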

/*
 * ire_create is called to allocate and initialize a new IRE.
 *
 * NOTE : This is called as writer sometimes though that is not required
 * by this function.
 */
ire_t *
ire_create(uchar_t *addr, uchar_t *mask, uchar_t *src_addr, uchar_t *gateway,
    uint_t *max_fragp, nce_t *src_nce, queue_t *rfq, queue_t *stq,
    ushort_t type, ipif_t *ipif, ipaddr_t cmask, uint32_t phandle,
    uint32_t ihandle, uint32_t flags, const iulp_t *ulp_info, tsol_gc_t *gc,
    tsol_gcgrp_t *gcgrp, ip_stack_t *ipst)
{
	ire_t	*ire;
	ire_t	*ret_ire;

	ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP);
	if (ire == NULL) {
		ip1dbg(("ire_create: alloc failed\n"));
		return (NULL);
	}
	*ire = ire_null;

	ret_ire = ire_init(ire, addr, mask, src_addr, gateway, max_fragp,
	    src_nce, rfq, stq, type, ipif, cmask, phandle, ihandle, flags,
	    ulp_info, gc, gcgrp, ipst);

	if (ret_ire == NULL) {
		kmem_cache_free(ire_cache, ire);
		return (NULL);
	}
	ASSERT(ret_ire == ire);
	return (ire);
}

/*
 * Common to IPv4 and IPv6
 */
boolean_t
ire_init_common(ire_t *ire, uint_t *max_fragp, nce_t *src_nce, queue_t *rfq,
    queue_t *stq, ushort_t type, ipif_t *ipif, uint32_t phandle,
    uint32_t ihandle, uint32_t flags, uchar_t ipversion, const iulp_t *ulp_info,
    tsol_gc_t *gc, tsol_gcgrp_t *gcgrp, ip_stack_t *ipst)
{
	ire->ire_max_fragp = max_fragp;
	ire->ire_frag_flag |= (ipst->ips_ip_path_mtu_discovery) ? IPH_DF : 0;

#ifdef DEBUG
	if (ipif != NULL) {
		if (ipif->ipif_isv6)
			ASSERT(ipversion == IPV6_VERSION);
		else
			ASSERT(ipversion == IPV4_VERSION);
	}
#endif /* DEBUG */

	/*
	 * Create/initialize IRE security attributes only in Trusted mode;
	 * if the passed in gc/gcgrp is non-NULL, we expect that the caller
	 * has held a reference to it and will release it when this routine
	 * returns a failure; otherwise we own the reference. We do this
	 * prior to initializing the rest of the IRE fields.
	 *
	 * Don't allocate ire_gw_secattr for the resolver case to prevent
	 * a memory leak (in case of external resolution failure). We'll
	 * allocate it after a successful external resolution, in ire_add().
	 * Note that ire->ire_mp != NULL here means this ire is headed
	 * to an external resolver.
	 */
	if (is_system_labeled()) {
		if ((type & (IRE_LOCAL | IRE_LOOPBACK | IRE_BROADCAST |
		    IRE_INTERFACE)) != 0) {
			/* release references on behalf of caller */
			if (gc != NULL)
				GC_REFRELE(gc);
			if (gcgrp != NULL)
				GCGRP_REFRELE(gcgrp);
		} else if ((ire->ire_mp == NULL) &&
		    tsol_ire_init_gwattr(ire, ipversion, gc, gcgrp) != 0) {
			return (B_FALSE);
		}
	}

	ire->ire_stq = stq;
	ire->ire_rfq = rfq;
	ire->ire_type = type;
	ire->ire_flags = RTF_UP | flags;
	ire->ire_ident = TICK_TO_MSEC(lbolt);
	bcopy(ulp_info, &ire->ire_uinfo, sizeof (iulp_t));

	ire->ire_tire_mark = ire->ire_ob_pkt_count + ire->ire_ib_pkt_count;
	ire->ire_last_used_time = lbolt;
	ire->ire_create_time = (uint32_t)gethrestime_sec();

	/*
	 * If this IRE is an IRE_CACHE, inherit the handles from the
	 * parent IREs. For others in the forwarding table, assign appropriate
	 * new ones.
	 *
	 * The mutex protecting ire_handle is because ire_create is not always
	 * called as a writer.
	 */
	if (ire->ire_type & IRE_OFFSUBNET) {
		mutex_enter(&ipst->ips_ire_handle_lock);
		ire->ire_phandle = (uint32_t)ipst->ips_ire_handle++;
		mutex_exit(&ipst->ips_ire_handle_lock);
	} else if (ire->ire_type & IRE_INTERFACE) {
		mutex_enter(&ipst->ips_ire_handle_lock);
		ire->ire_ihandle = (uint32_t)ipst->ips_ire_handle++;
		mutex_exit(&ipst->ips_ire_handle_lock);
	} else if (ire->ire_type == IRE_CACHE) {
		ire->ire_phandle = phandle;
		ire->ire_ihandle = ihandle;
	}
	ire->ire_ipif = ipif;
	if (ipif != NULL) {
		ire->ire_ipif_seqid = ipif->ipif_seqid;
		ire->ire_ipif_ifindex =
		    ipif->ipif_ill->ill_phyint->phyint_ifindex;
		ire->ire_zoneid = ipif->ipif_zoneid;
	} else {
		ire->ire_zoneid = GLOBAL_ZONEID;
	}
	ire->ire_ipversion = ipversion;
	mutex_init(&ire->ire_lock, NULL, MUTEX_DEFAULT, NULL);
	if (ipversion == IPV4_VERSION) {
		/*
		 * IPv6 initializes the ire_nce in ire_add_v6, which expects
		 * to find the ire_nce to be null when it is called.
		 */
		if (ire_nce_init(ire, src_nce) != 0) {
			/* some failure occurred. propagate error back */
			return (B_FALSE);
		}
	}
	ire->ire_refcnt = 1;
	ire->ire_ipst = ipst;	/* No netstack_hold */
	ire->ire_trace_disable = B_FALSE;

	return (B_TRUE);
}

/*
 * This routine is called repeatedly by ipif_up to create broadcast IREs.
 * It is passed a pointer to a slot in an IRE pointer array into which to
 * place the pointer to the new IRE, if indeed we create one. If the
 * IRE corresponding to the address passed in would be a duplicate of an
 * existing one, we don't create the new one. irep is incremented before
 * return only if we do create a new IRE. (Always called as writer.)
 *
 * Note that with the "match_flags" parameter, we can match on either
 * a particular logical interface (MATCH_IRE_IPIF) or for all logical
 * interfaces for a given physical interface (MATCH_IRE_ILL). Currently,
 * we only create broadcast IREs on a per physical interface basis. If
 * someone is going to be mucking with logical interfaces, it is important
 * to call "ipif_check_bcast_ires()" to make sure that any change to a
 * logical interface will not cause critical broadcast IREs to be deleted.
 */
ire_t **
ire_check_and_create_bcast(ipif_t *ipif, ipaddr_t addr, ire_t **irep,
    int match_flags)
{
	ire_t *ire;
	uint64_t check_flags = IPIF_DEPRECATED | IPIF_NOLOCAL | IPIF_ANYCAST;
	boolean_t prefer;
	ill_t *ill = ipif->ipif_ill;
	ip_stack_t *ipst = ill->ill_ipst;

	/*
	 * No broadcast IREs for the LOOPBACK interface
	 * or others such as point to point and IPIF_NOXMIT.
	 */
	if (!(ipif->ipif_flags & IPIF_BROADCAST) ||
	    (ipif->ipif_flags & IPIF_NOXMIT))
		return (irep);

	/*
	 * If this new IRE would be a duplicate, only prefer it if one of
	 * the following is true:
	 *
	 * 1. The existing one has IPIF_DEPRECATED|IPIF_LOCAL|IPIF_ANYCAST
	 *    set and the new one has all of those clear.
	 *
	 * 2. The existing one corresponds to an underlying ILL in an IPMP
	 *    group and the new one corresponds to an IPMP group interface.
1703 	 */
1704 	if ((ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ipif,
1705 	    ipif->ipif_zoneid, NULL, match_flags, ipst)) != NULL) {
1706 		prefer = ((ire->ire_ipif->ipif_flags & check_flags) &&
1707 		    !(ipif->ipif_flags & check_flags)) ||
1708 		    (IS_UNDER_IPMP(ire->ire_ipif->ipif_ill) && IS_IPMP(ill));
1709 		if (!prefer) {
1710 			ire_refrele(ire);
1711 			return (irep);
1712 		}
1713 
1714 		/*
1715 		 * Bcast ires exist in pairs; both have to be deleted.
1716 		 * Since we are exclusive (writer), we can make the above
1717 		 * assertion.  The 1st has to be refrele'd since it was
1718 		 * ctable_lookup'd.
1719 		 */
1720 		ASSERT(IAM_WRITER_IPIF(ipif));
1721 		ASSERT(ire->ire_next->ire_addr == ire->ire_addr);
1722 		ire_delete(ire->ire_next);
1723 		ire_delete(ire);
1724 		ire_refrele(ire);
1725 	}
1726 	return (ire_create_bcast(ipif, addr, irep));
1727 }
1728 
1729 uint_t ip_loopback_mtu = IP_LOOPBACK_MTU;
1730 
1731 /*
1732  * This routine is called from ipif_check_bcast_ires and
1733  * ire_check_and_create_bcast.  It leaves all the verifying and deleting
1734  * to those routines, so it always creates 2 bcast ires and chains them
1735  * into the ire array passed in.
1736  */
1737 ire_t **
1738 ire_create_bcast(ipif_t *ipif, ipaddr_t addr, ire_t **irep)
1739 {
1740 	ip_stack_t	*ipst = ipif->ipif_ill->ill_ipst;
1741 	ill_t		*ill = ipif->ipif_ill;
1742 
1743 	ASSERT(IAM_WRITER_IPIF(ipif));
1744 
1745 	if (IS_IPMP(ill)) {
1746 		/*
1747 		 * Broadcast IREs for the IPMP meta-interface use the
1748 		 * nominated broadcast interface to send and receive packets.
1749 		 * If there's no nominated interface, send the packets down to
1750 		 * the IPMP stub driver, which will discard them.  If the
1751 		 * nominated broadcast interface changes, ill_refresh_bcast()
1752 		 * will refresh the broadcast IREs.
1753 		 */
1754 		if ((ill = ipmp_illgrp_cast_ill(ill->ill_grp)) == NULL)
1755 			ill = ipif->ipif_ill;
1756 	}
1757 
1758 	*irep++ = ire_create(
1759 	    (uchar_t *)&addr,			/* dest addr */
1760 	    (uchar_t *)&ip_g_all_ones,		/* mask */
1761 	    (uchar_t *)&ipif->ipif_src_addr,	/* source addr */
1762 	    NULL,				/* no gateway */
1763 	    &ipif->ipif_mtu,			/* max frag */
1764 	    NULL,				/* no src nce */
1765 	    ill->ill_rq,			/* recv-from queue */
1766 	    ill->ill_wq,			/* send-to queue */
1767 	    IRE_BROADCAST,
1768 	    ipif,
1769 	    0,
1770 	    0,
1771 	    0,
1772 	    0,
1773 	    &ire_uinfo_null,
1774 	    NULL,
1775 	    NULL,
1776 	    ipst);
1777 
1778 	*irep++ = ire_create(
1779 	    (uchar_t *)&addr,			/* dest address */
1780 	    (uchar_t *)&ip_g_all_ones,		/* mask */
1781 	    (uchar_t *)&ipif->ipif_src_addr,	/* source address */
1782 	    NULL,				/* no gateway */
1783 	    &ip_loopback_mtu,			/* max frag size */
1784 	    NULL,				/* no src_nce */
1785 	    ill->ill_rq,			/* recv-from queue */
1786 	    NULL,				/* no send-to queue */
1787 	    IRE_BROADCAST,			/* Needed for fanout in wput */
1788 	    ipif,
1789 	    0,
1790 	    0,
1791 	    0,
1792 	    0,
1793 	    &ire_uinfo_null,
1794 	    NULL,
1795 	    NULL,
1796 	    ipst);
1797 
1798 	return (irep);
1799 }
1800 
1801 /*
1802  * ire_walk routine to delete or update any IRE_CACHE that might contain
1803  * stale information.
1804  * The flags state which entries to delete or update.
1805  * Garbage collection is done separately using kmem alloc callbacks to
1806  * ip_trash_ire_reclaim.
 * Used for both IPv4 and IPv6.  However, IPv6 only uses FLUSH_MTU_TIME
 * since other stale information is cleaned up using NUD.
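 *
 * A caller drives this through the generic walker; roughly (a sketch of
 * the calling pattern, using the ire_expire_arg_t fields read below):
 *
 *	ire_expire_arg_t iea;
 *
 *	iea.iea_flush_flag = FLUSH_ARP_TIME | FLUSH_MTU_TIME;
 *	iea.iea_ipst = ipst;
 *	ire_walk(ire_expire, (char *)&iea, ipst);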
1807 */ 1808 void 1809 ire_expire(ire_t *ire, char *arg) 1810 { 1811 ire_expire_arg_t *ieap = (ire_expire_arg_t *)(uintptr_t)arg; 1812 ill_t *stq_ill; 1813 int flush_flags = ieap->iea_flush_flag; 1814 ip_stack_t *ipst = ieap->iea_ipst; 1815 1816 if ((flush_flags & FLUSH_REDIRECT_TIME) && 1817 (ire->ire_flags & RTF_DYNAMIC)) { 1818 /* Make sure we delete the corresponding IRE_CACHE */ 1819 ip1dbg(("ire_expire: all redirects\n")); 1820 ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst); 1821 ire_delete(ire); 1822 atomic_dec_32(&ipst->ips_ip_redirect_cnt); 1823 return; 1824 } 1825 if (ire->ire_type != IRE_CACHE) 1826 return; 1827 1828 if (flush_flags & FLUSH_ARP_TIME) { 1829 /* 1830 * Remove all IRE_CACHE except IPv4 multicast ires. These 1831 * ires will be deleted by ip_trash_ire_reclaim_stack() 1832 * when system runs low in memory. 1833 * Verify that create time is more than ip_ire_arp_interval 1834 * milliseconds ago. 1835 */ 1836 1837 if (!(ire->ire_ipversion == IPV4_VERSION && 1838 CLASSD(ire->ire_addr)) && NCE_EXPIRED(ire->ire_nce, ipst)) { 1839 ire_delete(ire); 1840 return; 1841 } 1842 } 1843 1844 if (ipst->ips_ip_path_mtu_discovery && (flush_flags & FLUSH_MTU_TIME) && 1845 (ire->ire_ipif != NULL)) { 1846 /* Increase pmtu if it is less than the interface mtu */ 1847 mutex_enter(&ire->ire_lock); 1848 /* 1849 * If the ipif is a vni (whose mtu is 0, since it's virtual) 1850 * get the mtu from the sending interfaces' ipif 1851 */ 1852 if (IS_VNI(ire->ire_ipif->ipif_ill)) { 1853 stq_ill = ire->ire_stq->q_ptr; 1854 ire->ire_max_frag = MIN(stq_ill->ill_ipif->ipif_mtu, 1855 IP_MAXPACKET); 1856 } else { 1857 ire->ire_max_frag = MIN(ire->ire_ipif->ipif_mtu, 1858 IP_MAXPACKET); 1859 } 1860 ire->ire_frag_flag |= IPH_DF; 1861 mutex_exit(&ire->ire_lock); 1862 } 1863 } 1864 1865 /* 1866 * Return any local address. We use this to target ourselves 1867 * when the src address was specified as 'default'. 1868 * Preference for IRE_LOCAL entries. 1869 */ 1870 ire_t * 1871 ire_lookup_local(zoneid_t zoneid, ip_stack_t *ipst) 1872 { 1873 ire_t *ire; 1874 irb_t *irb; 1875 ire_t *maybe = NULL; 1876 int i; 1877 1878 for (i = 0; i < ipst->ips_ip_cache_table_size; i++) { 1879 irb = &ipst->ips_ip_cache_table[i]; 1880 if (irb->irb_ire == NULL) 1881 continue; 1882 rw_enter(&irb->irb_lock, RW_READER); 1883 for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { 1884 if ((ire->ire_marks & IRE_MARK_CONDEMNED) || 1885 (ire->ire_zoneid != zoneid && 1886 ire->ire_zoneid != ALL_ZONES)) 1887 continue; 1888 switch (ire->ire_type) { 1889 case IRE_LOOPBACK: 1890 if (maybe == NULL) { 1891 IRE_REFHOLD(ire); 1892 maybe = ire; 1893 } 1894 break; 1895 case IRE_LOCAL: 1896 if (maybe != NULL) { 1897 ire_refrele(maybe); 1898 } 1899 IRE_REFHOLD(ire); 1900 rw_exit(&irb->irb_lock); 1901 return (ire); 1902 } 1903 } 1904 rw_exit(&irb->irb_lock); 1905 } 1906 return (maybe); 1907 } 1908 1909 /* 1910 * If the specified IRE is associated with a particular ILL, return 1911 * that ILL pointer (May be called as writer.). 1912 * 1913 * NOTE : This is not a generic function that can be used always. 1914 * This function always returns the ill of the outgoing packets 1915 * if this ire is used. 1916 */ 1917 ill_t * 1918 ire_to_ill(const ire_t *ire) 1919 { 1920 ill_t *ill = NULL; 1921 1922 /* 1923 * 1) For an IRE_CACHE, ire_ipif is the one where it obtained 1924 * the source address from. ire_stq is the one where the 1925 * packets will be sent out on. We return that here. 
1926 * 1927 * 2) IRE_BROADCAST normally has a loopback and a non-loopback 1928 * copy and they always exist next to each other with loopback 1929 * copy being the first one. If we are called on the non-loopback 1930 * copy, return the one pointed by ire_stq. If it was called on 1931 * a loopback copy, we still return the one pointed by the next 1932 * ire's ire_stq pointer i.e the one pointed by the non-loopback 1933 * copy. We don't want use ire_ipif as it might represent the 1934 * source address (if we borrow source addresses for 1935 * IRE_BROADCASTS in the future). 1936 * However if an interface is currently coming up, the above 1937 * condition may not hold during that period since the ires 1938 * are added one at a time. Thus one of the pair could have been 1939 * added and the other not yet added. 1940 * 3) For many other IREs (e.g., IRE_LOCAL), ire_rfq indicates the ill. 1941 * 4) For all others return the ones pointed by ire_ipif->ipif_ill. 1942 * That handles IRE_LOOPBACK. 1943 */ 1944 1945 if (ire->ire_type == IRE_CACHE) { 1946 ill = (ill_t *)ire->ire_stq->q_ptr; 1947 } else if (ire->ire_type == IRE_BROADCAST) { 1948 if (ire->ire_stq != NULL) { 1949 ill = (ill_t *)ire->ire_stq->q_ptr; 1950 } else { 1951 ire_t *ire_next; 1952 1953 ire_next = ire->ire_next; 1954 if (ire_next != NULL && 1955 ire_next->ire_type == IRE_BROADCAST && 1956 ire_next->ire_addr == ire->ire_addr && 1957 ire_next->ire_ipif == ire->ire_ipif) { 1958 ill = (ill_t *)ire_next->ire_stq->q_ptr; 1959 } 1960 } 1961 } else if (ire->ire_rfq != NULL) { 1962 ill = ire->ire_rfq->q_ptr; 1963 } else if (ire->ire_ipif != NULL) { 1964 ill = ire->ire_ipif->ipif_ill; 1965 } 1966 return (ill); 1967 } 1968 1969 /* Arrange to call the specified function for every IRE in the world. */ 1970 void 1971 ire_walk(pfv_t func, void *arg, ip_stack_t *ipst) 1972 { 1973 ire_walk_ipvers(func, arg, 0, ALL_ZONES, ipst); 1974 } 1975 1976 void 1977 ire_walk_v4(pfv_t func, void *arg, zoneid_t zoneid, ip_stack_t *ipst) 1978 { 1979 ire_walk_ipvers(func, arg, IPV4_VERSION, zoneid, ipst); 1980 } 1981 1982 void 1983 ire_walk_v6(pfv_t func, void *arg, zoneid_t zoneid, ip_stack_t *ipst) 1984 { 1985 ire_walk_ipvers(func, arg, IPV6_VERSION, zoneid, ipst); 1986 } 1987 1988 /* 1989 * Walk a particular version. version == 0 means both v4 and v6. 1990 */ 1991 static void 1992 ire_walk_ipvers(pfv_t func, void *arg, uchar_t vers, zoneid_t zoneid, 1993 ip_stack_t *ipst) 1994 { 1995 if (vers != IPV6_VERSION) { 1996 /* 1997 * ip_forwarding_table variable doesn't matter for IPv4 since 1998 * ire_walk_ill_tables uses ips_ip_ftable for IPv4. 1999 */ 2000 ire_walk_ill_tables(0, 0, func, arg, IP_MASK_TABLE_SIZE, 2001 0, NULL, 2002 ipst->ips_ip_cache_table_size, ipst->ips_ip_cache_table, 2003 NULL, zoneid, ipst); 2004 } 2005 if (vers != IPV4_VERSION) { 2006 ire_walk_ill_tables(0, 0, func, arg, IP6_MASK_TABLE_SIZE, 2007 ipst->ips_ip6_ftable_hash_size, 2008 ipst->ips_ip_forwarding_table_v6, 2009 ipst->ips_ip6_cache_table_size, 2010 ipst->ips_ip_cache_table_v6, NULL, zoneid, ipst); 2011 } 2012 } 2013 2014 /* 2015 * Arrange to call the specified function for every IRE that matches the ill. 2016 */ 2017 void 2018 ire_walk_ill(uint_t match_flags, uint_t ire_type, pfv_t func, void *arg, 2019 ill_t *ill) 2020 { 2021 uchar_t vers = (ill->ill_isv6 ? 
IPV6_VERSION : IPV4_VERSION); 2022 2023 ire_walk_ill_ipvers(match_flags, ire_type, func, arg, vers, ill); 2024 } 2025 2026 void 2027 ire_walk_ill_v4(uint_t match_flags, uint_t ire_type, pfv_t func, void *arg, 2028 ill_t *ill) 2029 { 2030 ire_walk_ill_ipvers(match_flags, ire_type, func, arg, IPV4_VERSION, 2031 ill); 2032 } 2033 2034 void 2035 ire_walk_ill_v6(uint_t match_flags, uint_t ire_type, pfv_t func, void *arg, 2036 ill_t *ill) 2037 { 2038 ire_walk_ill_ipvers(match_flags, ire_type, func, arg, IPV6_VERSION, 2039 ill); 2040 } 2041 2042 /* 2043 * Walk a particular ill and version. 2044 */ 2045 static void 2046 ire_walk_ill_ipvers(uint_t match_flags, uint_t ire_type, pfv_t func, 2047 void *arg, uchar_t vers, ill_t *ill) 2048 { 2049 ip_stack_t *ipst = ill->ill_ipst; 2050 2051 if (vers == IPV4_VERSION) { 2052 ire_walk_ill_tables(match_flags, ire_type, func, arg, 2053 IP_MASK_TABLE_SIZE, 0, 2054 NULL, ipst->ips_ip_cache_table_size, 2055 ipst->ips_ip_cache_table, ill, ALL_ZONES, ipst); 2056 } else if (vers == IPV6_VERSION) { 2057 ire_walk_ill_tables(match_flags, ire_type, func, arg, 2058 IP6_MASK_TABLE_SIZE, ipst->ips_ip6_ftable_hash_size, 2059 ipst->ips_ip_forwarding_table_v6, 2060 ipst->ips_ip6_cache_table_size, 2061 ipst->ips_ip_cache_table_v6, ill, ALL_ZONES, ipst); 2062 } 2063 } 2064 2065 boolean_t 2066 ire_walk_ill_match(uint_t match_flags, uint_t ire_type, ire_t *ire, 2067 ill_t *ill, zoneid_t zoneid, ip_stack_t *ipst) 2068 { 2069 ill_t *ire_stq_ill = NULL; 2070 ill_t *ire_ipif_ill = NULL; 2071 2072 ASSERT(match_flags != 0 || zoneid != ALL_ZONES); 2073 /* 2074 * MATCH_IRE_ILL: We match both on ill pointed by ire_stq and 2075 * ire_ipif. Only in the case of IRE_CACHEs can ire_stq and 2076 * ire_ipif be pointing to different ills. But we want to keep 2077 * this function generic enough for future use. So, we always 2078 * try to match on both. The only caller of this function 2079 * ire_walk_ill_tables, will call "func" after we return from 2080 * this function. We expect "func" to do the right filtering 2081 * of ires in this case. 2082 */ 2083 if (match_flags & MATCH_IRE_ILL) { 2084 if (ire->ire_stq != NULL) 2085 ire_stq_ill = ire->ire_stq->q_ptr; 2086 if (ire->ire_ipif != NULL) 2087 ire_ipif_ill = ire->ire_ipif->ipif_ill; 2088 } 2089 2090 if (zoneid != ALL_ZONES) { 2091 /* 2092 * We're walking the IREs for a specific zone. The only relevant 2093 * IREs are: 2094 * - all IREs with a matching ire_zoneid 2095 * - all IRE_OFFSUBNETs as they're shared across all zones 2096 * - IRE_INTERFACE IREs for interfaces with a usable source addr 2097 * with a matching zone 2098 * - IRE_DEFAULTs with a gateway reachable from the zone 2099 * We should really match on IRE_OFFSUBNETs and IRE_DEFAULTs 2100 * using the same rule; but the above rules are consistent with 2101 * the behavior of ire_ftable_lookup[_v6]() so that all the 2102 * routes that can be matched during lookup are also matched 2103 * here. 2104 */ 2105 if (zoneid != ire->ire_zoneid && ire->ire_zoneid != ALL_ZONES) { 2106 /* 2107 * Note, IRE_INTERFACE can have the stq as NULL. For 2108 * example, if the default multicast route is tied to 2109 * the loopback address. 
2110 			 */
2111 			if ((ire->ire_type & IRE_INTERFACE) &&
2112 			    (ire->ire_stq != NULL)) {
2113 				ire_stq_ill = (ill_t *)ire->ire_stq->q_ptr;
2114 				if (ire->ire_ipversion == IPV4_VERSION) {
2115 					if (!ipif_usesrc_avail(ire_stq_ill,
2116 					    zoneid))
2117 						/* No usable src addr in zone */
2118 						return (B_FALSE);
2119 				} else if (ire_stq_ill->ill_usesrc_ifindex
2120 				    != 0) {
2121 					/*
2122 					 * For IPv6 use ipif_select_source_v6()
2123 					 * so the right scope selection is done
2124 					 */
2125 					ipif_t *src_ipif;
2126 					src_ipif =
2127 					    ipif_select_source_v6(ire_stq_ill,
2128 					    &ire->ire_addr_v6, B_FALSE,
2129 					    IPV6_PREFER_SRC_DEFAULT,
2130 					    zoneid);
2131 					if (src_ipif != NULL) {
2132 						ipif_refrele(src_ipif);
2133 					} else {
2134 						return (B_FALSE);
2135 					}
2136 				} else {
2137 					return (B_FALSE);
2138 				}
2139 
2140 			} else if (!(ire->ire_type & IRE_OFFSUBNET)) {
2141 				return (B_FALSE);
2142 			}
2143 		}
2144 
2145 		/*
2146 		 * Match all default routes from the global zone, irrespective
2147 		 * of reachability. For a non-global zone, only match those
2148 		 * where ire_gateway_addr has an IRE_INTERFACE for the zoneid.
2149 		 */
2150 		if (ire->ire_type == IRE_DEFAULT && zoneid != GLOBAL_ZONEID) {
2151 			int ire_match_flags = 0;
2152 			in6_addr_t gw_addr_v6;
2153 			ire_t *rire;
2154 
2155 			ire_match_flags |= MATCH_IRE_TYPE;
2156 			if (ire->ire_ipif != NULL)
2157 				ire_match_flags |= MATCH_IRE_ILL;
2158 
2159 			if (ire->ire_ipversion == IPV4_VERSION) {
2160 				rire = ire_route_lookup(ire->ire_gateway_addr,
2161 				    0, 0, IRE_INTERFACE, ire->ire_ipif, NULL,
2162 				    zoneid, NULL, ire_match_flags, ipst);
2163 			} else {
2164 				ASSERT(ire->ire_ipversion == IPV6_VERSION);
2165 				mutex_enter(&ire->ire_lock);
2166 				gw_addr_v6 = ire->ire_gateway_addr_v6;
2167 				mutex_exit(&ire->ire_lock);
2168 				rire = ire_route_lookup_v6(&gw_addr_v6,
2169 				    NULL, NULL, IRE_INTERFACE, ire->ire_ipif,
2170 				    NULL, zoneid, NULL, ire_match_flags, ipst);
2171 			}
2172 			if (rire == NULL) {
2173 				return (B_FALSE);
2174 			}
2175 			ire_refrele(rire);
2176 		}
2177 	}
2178 
2179 	if (((!(match_flags & MATCH_IRE_TYPE)) ||
2180 	    (ire->ire_type & ire_type)) &&
2181 	    ((!(match_flags & MATCH_IRE_ILL)) ||
2182 	    (ire_stq_ill == ill || ire_ipif_ill == ill ||
2183 	    (ire_ipif_ill != NULL && IS_IN_SAME_ILLGRP(ire_ipif_ill, ill))))) {
2184 		return (B_TRUE);
2185 	}
2186 	return (B_FALSE);
2187 }
2188 
2189 int
2190 rtfunc(struct radix_node *rn, void *arg)
2191 {
2192 	struct rtfuncarg *rtf = arg;
2193 	struct rt_entry *rt;
2194 	irb_t *irb;
2195 	ire_t *ire;
2196 	boolean_t ret;
2197 
2198 	rt = (struct rt_entry *)rn;
2199 	ASSERT(rt != NULL);
2200 	irb = &rt->rt_irb;
2201 	for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
2202 		if ((rtf->rt_match_flags != 0) ||
2203 		    (rtf->rt_zoneid != ALL_ZONES)) {
2204 			ret = ire_walk_ill_match(rtf->rt_match_flags,
2205 			    rtf->rt_ire_type, ire,
2206 			    rtf->rt_ill, rtf->rt_zoneid, rtf->rt_ipst);
2207 		} else
2208 			ret = B_TRUE;
2209 		if (ret)
2210 			(*rtf->rt_func)(ire, rtf->rt_arg);
2211 	}
2212 	return (0);
2213 }
2214 
2215 /*
2216  * Walk the ftable and the ctable entries that match the ill.
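 *
 * For instance, counting the IRE_CACHE entries that match an ill could
 * be done with the wrappers above (an illustrative sketch; the callback
 * and counter are hypothetical):
 *
 *	static void
 *	ire_cache_count(ire_t *ire, char *arg)
 *	{
 *		uint_t *cnt = (uint_t *)arg;
 *
 *		(*cnt)++;
 *	}
 *
 *	uint_t cnt = 0;
 *
 *	ire_walk_ill(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
 *	    ire_cache_count, &cnt, ill);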
2217 */ 2218 void 2219 ire_walk_ill_tables(uint_t match_flags, uint_t ire_type, pfv_t func, 2220 void *arg, size_t ftbl_sz, size_t htbl_sz, irb_t **ipftbl, 2221 size_t ctbl_sz, irb_t *ipctbl, ill_t *ill, zoneid_t zoneid, 2222 ip_stack_t *ipst) 2223 { 2224 irb_t *irb_ptr; 2225 irb_t *irb; 2226 ire_t *ire; 2227 int i, j; 2228 boolean_t ret; 2229 struct rtfuncarg rtfarg; 2230 2231 ASSERT((!(match_flags & MATCH_IRE_ILL)) || (ill != NULL)); 2232 ASSERT(!(match_flags & MATCH_IRE_TYPE) || (ire_type != 0)); 2233 /* 2234 * Optimize by not looking at the forwarding table if there 2235 * is a MATCH_IRE_TYPE specified with no IRE_FORWARDTABLE 2236 * specified in ire_type. 2237 */ 2238 if (!(match_flags & MATCH_IRE_TYPE) || 2239 ((ire_type & IRE_FORWARDTABLE) != 0)) { 2240 /* knobs such that routine is called only for v6 case */ 2241 if (ipftbl == ipst->ips_ip_forwarding_table_v6) { 2242 for (i = (ftbl_sz - 1); i >= 0; i--) { 2243 if ((irb_ptr = ipftbl[i]) == NULL) 2244 continue; 2245 for (j = 0; j < htbl_sz; j++) { 2246 irb = &irb_ptr[j]; 2247 if (irb->irb_ire == NULL) 2248 continue; 2249 2250 IRB_REFHOLD(irb); 2251 for (ire = irb->irb_ire; ire != NULL; 2252 ire = ire->ire_next) { 2253 if (match_flags == 0 && 2254 zoneid == ALL_ZONES) { 2255 ret = B_TRUE; 2256 } else { 2257 ret = 2258 ire_walk_ill_match( 2259 match_flags, 2260 ire_type, ire, ill, 2261 zoneid, ipst); 2262 } 2263 if (ret) 2264 (*func)(ire, arg); 2265 } 2266 IRB_REFRELE(irb); 2267 } 2268 } 2269 } else { 2270 (void) memset(&rtfarg, 0, sizeof (rtfarg)); 2271 rtfarg.rt_func = func; 2272 rtfarg.rt_arg = arg; 2273 if (match_flags != 0) { 2274 rtfarg.rt_match_flags = match_flags; 2275 } 2276 rtfarg.rt_ire_type = ire_type; 2277 rtfarg.rt_ill = ill; 2278 rtfarg.rt_zoneid = zoneid; 2279 rtfarg.rt_ipst = ipst; /* No netstack_hold */ 2280 (void) ipst->ips_ip_ftable->rnh_walktree_mt( 2281 ipst->ips_ip_ftable, 2282 rtfunc, &rtfarg, irb_refhold_rn, irb_refrele_rn); 2283 } 2284 } 2285 2286 /* 2287 * Optimize by not looking at the cache table if there 2288 * is a MATCH_IRE_TYPE specified with no IRE_CACHETABLE 2289 * specified in ire_type. 2290 */ 2291 if (!(match_flags & MATCH_IRE_TYPE) || 2292 ((ire_type & IRE_CACHETABLE) != 0)) { 2293 for (i = 0; i < ctbl_sz; i++) { 2294 irb = &ipctbl[i]; 2295 if (irb->irb_ire == NULL) 2296 continue; 2297 IRB_REFHOLD(irb); 2298 for (ire = irb->irb_ire; ire != NULL; 2299 ire = ire->ire_next) { 2300 if (match_flags == 0 && zoneid == ALL_ZONES) { 2301 ret = B_TRUE; 2302 } else { 2303 ret = ire_walk_ill_match( 2304 match_flags, ire_type, 2305 ire, ill, zoneid, ipst); 2306 } 2307 if (ret) 2308 (*func)(ire, arg); 2309 } 2310 IRB_REFRELE(irb); 2311 } 2312 } 2313 } 2314 2315 /* 2316 * This function takes a mask and returns 2317 * number of bits set in the mask. If no 2318 * bit is set it returns 0. 2319 * Assumes a contiguous mask. 2320 */ 2321 int 2322 ip_mask_to_plen(ipaddr_t mask) 2323 { 2324 return (mask == 0 ? 0 : IP_ABITS - (ffs(ntohl(mask)) -1)); 2325 } 2326 2327 /* 2328 * Convert length for a mask to the mask. 2329 */ 2330 ipaddr_t 2331 ip_plen_to_mask(uint_t masklen) 2332 { 2333 return (htonl(IP_HOST_MASK << (IP_ABITS - masklen))); 2334 } 2335 2336 void 2337 ire_atomic_end(irb_t *irb_ptr, ire_t *ire) 2338 { 2339 ill_t *stq_ill, *ipif_ill; 2340 ip_stack_t *ipst = ire->ire_ipst; 2341 2342 stq_ill = ire->ire_stq != NULL ? ire->ire_stq->q_ptr : NULL; 2343 ipif_ill = ire->ire_ipif != NULL ? 
ire->ire_ipif->ipif_ill : NULL; 2344 RELEASE_ILL_LOCKS(ipif_ill, stq_ill); 2345 rw_exit(&irb_ptr->irb_lock); 2346 rw_exit(&ipst->ips_ill_g_usesrc_lock); 2347 } 2348 2349 /* 2350 * ire_add_v[46] atomically make sure that the ipif or ill associated 2351 * with the new ire being added is stable and not IPIF_CHANGING or ILL_CHANGING 2352 * before adding the ire to the table. This ensures that we don't create 2353 * new IRE_CACHEs with stale values for parameters that are passed to 2354 * ire_create such as ire_max_frag. Note that ire_create() is passed a pointer 2355 * to the ipif_mtu, and not the value. The actual value is derived from the 2356 * parent ire or ipif under the bucket lock. 2357 */ 2358 int 2359 ire_atomic_start(irb_t *irb_ptr, ire_t *ire, queue_t *q, mblk_t *mp, 2360 ipsq_func_t func) 2361 { 2362 ill_t *stq_ill; 2363 ill_t *ipif_ill; 2364 int error = 0; 2365 ill_t *ill = NULL; 2366 ip_stack_t *ipst = ire->ire_ipst; 2367 2368 stq_ill = ire->ire_stq != NULL ? ire->ire_stq->q_ptr : NULL; 2369 ipif_ill = ire->ire_ipif != NULL ? ire->ire_ipif->ipif_ill : NULL; 2370 2371 ASSERT((q != NULL && mp != NULL && func != NULL) || 2372 (q == NULL && mp == NULL && func == NULL)); 2373 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER); 2374 GRAB_CONN_LOCK(q); 2375 rw_enter(&irb_ptr->irb_lock, RW_WRITER); 2376 GRAB_ILL_LOCKS(ipif_ill, stq_ill); 2377 2378 /* 2379 * While the IRE is in the process of being added, a user may have 2380 * invoked the ifconfig usesrc option on the stq_ill to make it a 2381 * usesrc client ILL. Check for this possibility here, if it is true 2382 * then we fail adding the IRE_CACHE. Another check is to make sure 2383 * that an ipif_ill of an IRE_CACHE being added is not part of a usesrc 2384 * group. The ill_g_usesrc_lock is released in ire_atomic_end 2385 */ 2386 if ((ire->ire_type & IRE_CACHE) && 2387 (ire->ire_marks & IRE_MARK_USESRC_CHECK)) { 2388 if (stq_ill->ill_usesrc_ifindex != 0) { 2389 ASSERT(stq_ill->ill_usesrc_grp_next != NULL); 2390 if ((ipif_ill->ill_phyint->phyint_ifindex != 2391 stq_ill->ill_usesrc_ifindex) || 2392 (ipif_ill->ill_usesrc_grp_next == NULL) || 2393 (ipif_ill->ill_usesrc_ifindex != 0)) { 2394 error = EINVAL; 2395 goto done; 2396 } 2397 } else if (ipif_ill->ill_usesrc_grp_next != NULL) { 2398 error = EINVAL; 2399 goto done; 2400 } 2401 } 2402 2403 /* 2404 * Don't allow IRE's to be created on changing ill's. Also, since 2405 * IPMP flags can be set on an ill without quiescing it, if we're not 2406 * a writer on stq_ill, check that the flags still allow IRE creation. 
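	 *
	 * For reference, the locking order established above is (a
	 * summary sketch of the code in this function):
	 *
	 *	ill_g_usesrc_lock -> conn lock -> irb_lock -> ill locks
	 *
	 * ire_atomic_end() releases the ill, bucket and usesrc locks in
	 * the reverse order; the conn lock is released separately in
	 * this function.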
2407 */ 2408 if ((stq_ill != NULL) && !IAM_WRITER_ILL(stq_ill)) { 2409 if (stq_ill->ill_state_flags & ILL_CHANGING) { 2410 ill = stq_ill; 2411 error = EAGAIN; 2412 } else if (IS_UNDER_IPMP(stq_ill)) { 2413 mutex_enter(&stq_ill->ill_phyint->phyint_lock); 2414 if (!ipmp_ill_is_active(stq_ill) && 2415 !(ire->ire_marks & IRE_MARK_TESTHIDDEN)) { 2416 error = EINVAL; 2417 } 2418 mutex_exit(&stq_ill->ill_phyint->phyint_lock); 2419 } 2420 if (error != 0) 2421 goto done; 2422 } 2423 2424 if ((ipif_ill != NULL) && !IAM_WRITER_ILL(ipif_ill) && 2425 (ipif_ill->ill_state_flags & ILL_CHANGING)) { 2426 ill = ipif_ill; 2427 error = EAGAIN; 2428 goto done; 2429 } 2430 2431 if ((ire->ire_ipif != NULL) && !IAM_WRITER_IPIF(ire->ire_ipif) && 2432 (ire->ire_ipif->ipif_state_flags & IPIF_CHANGING)) { 2433 ill = ire->ire_ipif->ipif_ill; 2434 ASSERT(ill != NULL); 2435 error = EAGAIN; 2436 goto done; 2437 } 2438 2439 done: 2440 if (error == EAGAIN && ILL_CAN_WAIT(ill, q)) { 2441 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; 2442 mutex_enter(&ipsq->ipsq_lock); 2443 mutex_enter(&ipsq->ipsq_xop->ipx_lock); 2444 ire_atomic_end(irb_ptr, ire); 2445 ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); 2446 mutex_exit(&ipsq->ipsq_xop->ipx_lock); 2447 mutex_exit(&ipsq->ipsq_lock); 2448 error = EINPROGRESS; 2449 } else if (error != 0) { 2450 ire_atomic_end(irb_ptr, ire); 2451 } 2452 2453 RELEASE_CONN_LOCK(q); 2454 return (error); 2455 } 2456 2457 /* 2458 * Add a fully initialized IRE to an appropriate table based on 2459 * ire_type. 2460 * 2461 * allow_unresolved == B_FALSE indicates a legacy code-path call 2462 * that has prohibited the addition of incomplete ire's. If this 2463 * parameter is set, and we find an nce that is in a state other 2464 * than ND_REACHABLE, we fail the add. Note that nce_state could be 2465 * something other than ND_REACHABLE if the nce had just expired and 2466 * the ire_create preceding the ire_add added a new ND_INITIAL nce. 2467 */ 2468 int 2469 ire_add(ire_t **irep, queue_t *q, mblk_t *mp, ipsq_func_t func, 2470 boolean_t allow_unresolved) 2471 { 2472 ire_t *ire1; 2473 ill_t *stq_ill = NULL; 2474 ill_t *ill; 2475 ipif_t *ipif = NULL; 2476 ill_walk_context_t ctx; 2477 ire_t *ire = *irep; 2478 int error; 2479 boolean_t ire_is_mblk = B_FALSE; 2480 tsol_gcgrp_t *gcgrp = NULL; 2481 tsol_gcgrp_addr_t ga; 2482 ip_stack_t *ipst = ire->ire_ipst; 2483 2484 /* get ready for the day when original ire is not created as mblk */ 2485 if (ire->ire_mp != NULL) { 2486 ire_is_mblk = B_TRUE; 2487 /* Copy the ire to a kmem_alloc'ed area */ 2488 ire1 = kmem_cache_alloc(ire_cache, KM_NOSLEEP); 2489 if (ire1 == NULL) { 2490 ip1dbg(("ire_add: alloc failed\n")); 2491 ire_delete(ire); 2492 *irep = NULL; 2493 return (ENOMEM); 2494 } 2495 ire->ire_marks &= ~IRE_MARK_UNCACHED; 2496 *ire1 = *ire; 2497 ire1->ire_mp = NULL; 2498 ire1->ire_stq_ifindex = 0; 2499 freeb(ire->ire_mp); 2500 ire = ire1; 2501 } 2502 if (ire->ire_stq != NULL) 2503 stq_ill = ire->ire_stq->q_ptr; 2504 2505 if (stq_ill != NULL && ire->ire_type == IRE_CACHE && 2506 stq_ill->ill_net_type == IRE_IF_RESOLVER) { 2507 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 2508 ill = ILL_START_WALK_ALL(&ctx, ipst); 2509 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 2510 mutex_enter(&ill->ill_lock); 2511 if (ill->ill_state_flags & ILL_CONDEMNED) { 2512 mutex_exit(&ill->ill_lock); 2513 continue; 2514 } 2515 /* 2516 * We need to make sure that the ipif is a valid one 2517 * before adding the IRE_CACHE. This happens only 2518 * with IRE_CACHE when there is an external resolver. 
2519 * 2520 * We can unplumb a logical interface while the 2521 * packet is waiting in ARP with the IRE. Then, 2522 * later on when we feed the IRE back, the ipif 2523 * has to be re-checked. This can't happen with 2524 * NDP currently, as we never queue the IRE with 2525 * the packet. We always try to recreate the IRE 2526 * when the resolution is completed. But, we do 2527 * it for IPv6 also here so that in future if 2528 * we have external resolvers, it will work without 2529 * any change. 2530 */ 2531 ipif = ipif_lookup_seqid(ill, ire->ire_ipif_seqid); 2532 if (ipif != NULL) { 2533 ipif_refhold_locked(ipif); 2534 mutex_exit(&ill->ill_lock); 2535 break; 2536 } 2537 mutex_exit(&ill->ill_lock); 2538 } 2539 rw_exit(&ipst->ips_ill_g_lock); 2540 if (ipif == NULL || 2541 (ipif->ipif_isv6 && 2542 !IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6) && 2543 !IN6_ARE_ADDR_EQUAL(&ire->ire_src_addr_v6, 2544 &ipif->ipif_v6src_addr)) || 2545 (!ipif->ipif_isv6 && 2546 ire->ire_src_addr != ipif->ipif_src_addr) || 2547 ire->ire_zoneid != ipif->ipif_zoneid) { 2548 if (ipif != NULL) 2549 ipif_refrele(ipif); 2550 ire->ire_ipif = NULL; 2551 ire_delete(ire); 2552 *irep = NULL; 2553 return (EINVAL); 2554 } 2555 2556 ASSERT(ill != NULL); 2557 2558 /* 2559 * Since we didn't attach label security attributes to the 2560 * ire for the resolver case, we need to add it now. (only 2561 * for v4 resolver and v6 xresolv case). 2562 */ 2563 if (is_system_labeled() && ire_is_mblk) { 2564 if (ire->ire_ipversion == IPV4_VERSION) { 2565 ga.ga_af = AF_INET; 2566 IN6_IPADDR_TO_V4MAPPED(ire->ire_gateway_addr != 2567 INADDR_ANY ? ire->ire_gateway_addr : 2568 ire->ire_addr, &ga.ga_addr); 2569 } else { 2570 ga.ga_af = AF_INET6; 2571 ga.ga_addr = IN6_IS_ADDR_UNSPECIFIED( 2572 &ire->ire_gateway_addr_v6) ? 2573 ire->ire_addr_v6 : 2574 ire->ire_gateway_addr_v6; 2575 } 2576 gcgrp = gcgrp_lookup(&ga, B_FALSE); 2577 error = tsol_ire_init_gwattr(ire, ire->ire_ipversion, 2578 NULL, gcgrp); 2579 if (error != 0) { 2580 if (gcgrp != NULL) { 2581 GCGRP_REFRELE(gcgrp); 2582 gcgrp = NULL; 2583 } 2584 ipif_refrele(ipif); 2585 ire->ire_ipif = NULL; 2586 ire_delete(ire); 2587 *irep = NULL; 2588 return (error); 2589 } 2590 } 2591 } 2592 2593 /* 2594 * In case ire was changed 2595 */ 2596 *irep = ire; 2597 if (ire->ire_ipversion == IPV6_VERSION) 2598 error = ire_add_v6(irep, q, mp, func); 2599 else 2600 error = ire_add_v4(irep, q, mp, func, allow_unresolved); 2601 if (ipif != NULL) 2602 ipif_refrele(ipif); 2603 return (error); 2604 } 2605 2606 /* 2607 * Add an initialized IRE to an appropriate table based on ire_type. 2608 * 2609 * The forward table contains IRE_PREFIX/IRE_HOST and 2610 * IRE_IF_RESOLVER/IRE_IF_NORESOLVER and IRE_DEFAULT. 2611 * 2612 * The cache table contains IRE_BROADCAST/IRE_LOCAL/IRE_LOOPBACK 2613 * and IRE_CACHE. 2614 * 2615 * NOTE : This function is called as writer though not required 2616 * by this function. 
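 *
 * A typical caller sequence is roughly (a sketch; error handling and
 * the ire_create() arguments are elided):
 *
 *	ire = ire_create(...);
 *	error = ire_add(&ire, NULL, NULL, NULL, B_FALSE);
 *
 * Since ire_add() returns a held ire on success, such a caller must
 * eventually ire_refrele() it when done.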
2617 */ 2618 static int 2619 ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func, 2620 boolean_t allow_unresolved) 2621 { 2622 ire_t *ire1; 2623 irb_t *irb_ptr; 2624 ire_t **irep; 2625 int flags; 2626 ire_t *pire = NULL; 2627 ill_t *stq_ill; 2628 ire_t *ire = *ire_p; 2629 int error; 2630 boolean_t need_refrele = B_FALSE; 2631 nce_t *nce; 2632 ip_stack_t *ipst = ire->ire_ipst; 2633 uint_t marks = 0; 2634 2635 /* 2636 * IREs with source addresses hosted on interfaces that are under IPMP 2637 * should be hidden so that applications don't accidentally end up 2638 * sending packets with test addresses as their source addresses, or 2639 * sending out interfaces that are e.g. IFF_INACTIVE. Hide them here. 2640 */ 2641 if (ire->ire_ipif != NULL && IS_UNDER_IPMP(ire->ire_ipif->ipif_ill)) 2642 marks |= IRE_MARK_TESTHIDDEN; 2643 2644 if (ire->ire_ipif != NULL) 2645 ASSERT(!MUTEX_HELD(&ire->ire_ipif->ipif_ill->ill_lock)); 2646 if (ire->ire_stq != NULL) 2647 ASSERT(!MUTEX_HELD( 2648 &((ill_t *)(ire->ire_stq->q_ptr))->ill_lock)); 2649 ASSERT(ire->ire_ipversion == IPV4_VERSION); 2650 ASSERT(ire->ire_mp == NULL); /* Calls should go through ire_add */ 2651 2652 /* Find the appropriate list head. */ 2653 switch (ire->ire_type) { 2654 case IRE_HOST: 2655 ire->ire_mask = IP_HOST_MASK; 2656 ire->ire_masklen = IP_ABITS; 2657 ire->ire_marks |= marks; 2658 if ((ire->ire_flags & RTF_SETSRC) == 0) 2659 ire->ire_src_addr = 0; 2660 break; 2661 case IRE_CACHE: 2662 ire->ire_mask = IP_HOST_MASK; 2663 ire->ire_masklen = IP_ABITS; 2664 ire->ire_marks |= marks; 2665 break; 2666 case IRE_BROADCAST: 2667 case IRE_LOCAL: 2668 case IRE_LOOPBACK: 2669 ire->ire_mask = IP_HOST_MASK; 2670 ire->ire_masklen = IP_ABITS; 2671 break; 2672 case IRE_PREFIX: 2673 case IRE_DEFAULT: 2674 ire->ire_marks |= marks; 2675 if ((ire->ire_flags & RTF_SETSRC) == 0) 2676 ire->ire_src_addr = 0; 2677 break; 2678 case IRE_IF_RESOLVER: 2679 case IRE_IF_NORESOLVER: 2680 ire->ire_marks |= marks; 2681 break; 2682 default: 2683 ip0dbg(("ire_add_v4: ire %p has unrecognized IRE type (%d)\n", 2684 (void *)ire, ire->ire_type)); 2685 ire_delete(ire); 2686 *ire_p = NULL; 2687 return (EINVAL); 2688 } 2689 2690 /* Make sure the address is properly masked. */ 2691 ire->ire_addr &= ire->ire_mask; 2692 2693 /* 2694 * ip_newroute/ip_newroute_multi are unable to prevent the deletion 2695 * of the interface route while adding an IRE_CACHE for an on-link 2696 * destination in the IRE_IF_RESOLVER case, since the ire has to 2697 * go to ARP and return. We can't do a REFHOLD on the 2698 * associated interface ire for fear of ARP freeing the message. 2699 * Here we look up the interface ire in the forwarding table and 2700 * make sure that the interface route has not been deleted. 2701 */ 2702 if (ire->ire_type == IRE_CACHE && ire->ire_gateway_addr == 0 && 2703 ((ill_t *)ire->ire_stq->q_ptr)->ill_net_type == IRE_IF_RESOLVER) { 2704 2705 ASSERT(ire->ire_max_fragp == NULL); 2706 if (CLASSD(ire->ire_addr) && !(ire->ire_flags & RTF_SETSRC)) { 2707 /* 2708 * The ihandle that we used in ip_newroute_multi 2709 * comes from the interface route corresponding 2710 * to ire_ipif. Lookup here to see if it exists 2711 * still. 2712 * If the ire has a source address assigned using 2713 * RTF_SETSRC, ire_ipif is the logical interface holding 2714 * this source address, so we can't use it to check for 2715 * the existence of the interface route. Instead we rely 2716 * on the brute force ihandle search in 2717 * ire_ihandle_lookup_onlink() below. 
2718 */ 2719 pire = ipif_to_ire(ire->ire_ipif); 2720 if (pire == NULL) { 2721 ire_delete(ire); 2722 *ire_p = NULL; 2723 return (EINVAL); 2724 } else if (pire->ire_ihandle != ire->ire_ihandle) { 2725 ire_refrele(pire); 2726 ire_delete(ire); 2727 *ire_p = NULL; 2728 return (EINVAL); 2729 } 2730 } else { 2731 pire = ire_ihandle_lookup_onlink(ire); 2732 if (pire == NULL) { 2733 ire_delete(ire); 2734 *ire_p = NULL; 2735 return (EINVAL); 2736 } 2737 } 2738 /* Prevent pire from getting deleted */ 2739 IRB_REFHOLD(pire->ire_bucket); 2740 /* Has it been removed already ? */ 2741 if (pire->ire_marks & IRE_MARK_CONDEMNED) { 2742 IRB_REFRELE(pire->ire_bucket); 2743 ire_refrele(pire); 2744 ire_delete(ire); 2745 *ire_p = NULL; 2746 return (EINVAL); 2747 } 2748 } else { 2749 ASSERT(ire->ire_max_fragp != NULL); 2750 } 2751 flags = (MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW); 2752 2753 if (ire->ire_ipif != NULL) { 2754 /* 2755 * We use MATCH_IRE_IPIF while adding IRE_CACHES only 2756 * for historic reasons and to maintain symmetry with 2757 * IPv6 code path. Historically this was used by 2758 * multicast code to create multiple IRE_CACHES on 2759 * a single ill with different ipifs. This was used 2760 * so that multicast packets leaving the node had the 2761 * right source address. This is no longer needed as 2762 * ip_wput initializes the address correctly. 2763 */ 2764 flags |= MATCH_IRE_IPIF; 2765 /* 2766 * If we are creating a hidden IRE, make sure we search for 2767 * hidden IREs when searching for duplicates below. 2768 * Otherwise, we might find an IRE on some other interface 2769 * that's not marked hidden. 2770 */ 2771 if (ire->ire_marks & IRE_MARK_TESTHIDDEN) 2772 flags |= MATCH_IRE_MARK_TESTHIDDEN; 2773 } 2774 if ((ire->ire_type & IRE_CACHETABLE) == 0) { 2775 irb_ptr = ire_get_bucket(ire); 2776 need_refrele = B_TRUE; 2777 if (irb_ptr == NULL) { 2778 /* 2779 * This assumes that the ire has not added 2780 * a reference to the ipif. 2781 */ 2782 ire->ire_ipif = NULL; 2783 ire_delete(ire); 2784 if (pire != NULL) { 2785 IRB_REFRELE(pire->ire_bucket); 2786 ire_refrele(pire); 2787 } 2788 *ire_p = NULL; 2789 return (EINVAL); 2790 } 2791 } else { 2792 irb_ptr = &(ipst->ips_ip_cache_table[IRE_ADDR_HASH( 2793 ire->ire_addr, ipst->ips_ip_cache_table_size)]); 2794 } 2795 2796 /* 2797 * Start the atomic add of the ire. Grab the ill locks, 2798 * ill_g_usesrc_lock and the bucket lock. Check for condemned 2799 * 2800 * If ipif or ill is changing ire_atomic_start() may queue the 2801 * request and return EINPROGRESS. 2802 * To avoid lock order problems, get the ndp4->ndp_g_lock. 2803 */ 2804 mutex_enter(&ipst->ips_ndp4->ndp_g_lock); 2805 error = ire_atomic_start(irb_ptr, ire, q, mp, func); 2806 if (error != 0) { 2807 mutex_exit(&ipst->ips_ndp4->ndp_g_lock); 2808 /* 2809 * We don't know whether it is a valid ipif or not. 2810 * So, set it to NULL. This assumes that the ire has not added 2811 * a reference to the ipif. 2812 */ 2813 ire->ire_ipif = NULL; 2814 ire_delete(ire); 2815 if (pire != NULL) { 2816 IRB_REFRELE(pire->ire_bucket); 2817 ire_refrele(pire); 2818 } 2819 *ire_p = NULL; 2820 if (need_refrele) 2821 IRB_REFRELE(irb_ptr); 2822 return (error); 2823 } 2824 /* 2825 * To avoid creating ires having stale values for the ire_max_frag 2826 * we get the latest value atomically here. 
For more details
2827 	 * see the block comment in ip_sioctl_mtu and in DL_NOTE_SDU_CHANGE
2828 	 * in ip_rput_dlpi_writer.
2829 	 */
2830 	if (ire->ire_max_fragp == NULL) {
2831 		if (CLASSD(ire->ire_addr))
2832 			ire->ire_max_frag = ire->ire_ipif->ipif_mtu;
2833 		else
2834 			ire->ire_max_frag = pire->ire_max_frag;
2835 	} else {
2836 		uint_t max_frag;
2837 
2838 		max_frag = *ire->ire_max_fragp;
2839 		ire->ire_max_fragp = NULL;
2840 		ire->ire_max_frag = max_frag;
2841 	}
2842 	/*
2843 	 * Atomically check for duplicate and insert in the table.
2844 	 */
2845 	for (ire1 = irb_ptr->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
2846 		if (ire1->ire_marks & IRE_MARK_CONDEMNED)
2847 			continue;
2848 		if (ire->ire_ipif != NULL) {
2849 			/*
2850 			 * We do MATCH_IRE_ILL implicitly here for IREs
2851 			 * with a non-null ire_ipif, including IRE_CACHEs.
2852 			 * As ire_ipif and ire_stq could point to two
2853 			 * different ills, we can't pass just ire_ipif to
2854 			 * ire_match_args and get a match on both ills.
2855 			 * This is just needed for duplicate checks here and
2856 			 * so we don't add an extra argument to
2857 			 * ire_match_args for this. Do it locally.
2858 			 *
2859 			 * NOTE: Currently there is no part of the code
2860 			 * that asks for both MATCH_IRE_IPIF and MATCH_IRE_ILL
2861 			 * match for IRE_CACHEs. Thus we don't want to
2862 			 * extend the arguments to ire_match_args.
2863 			 */
2864 			if (ire1->ire_stq != ire->ire_stq)
2865 				continue;
2866 			/*
2867 			 * Multiroute IRE_CACHEs for a given destination can
2868 			 * have the same ire_ipif, typically if their source
2869 			 * address is forced using RTF_SETSRC, and the same
2870 			 * send-to queue. We differentiate them using the parent
2871 			 * handle.
2872 			 */
2873 			if (ire->ire_type == IRE_CACHE &&
2874 			    (ire1->ire_flags & RTF_MULTIRT) &&
2875 			    (ire->ire_flags & RTF_MULTIRT) &&
2876 			    (ire1->ire_phandle != ire->ire_phandle))
2877 				continue;
2878 		}
2879 		if (ire1->ire_zoneid != ire->ire_zoneid)
2880 			continue;
2881 		if (ire_match_args(ire1, ire->ire_addr, ire->ire_mask,
2882 		    ire->ire_gateway_addr, ire->ire_type, ire->ire_ipif,
2883 		    ire->ire_zoneid, 0, NULL, flags, NULL)) {
2884 			/*
2885 			 * Return the old ire after doing a REFHOLD.
2886 			 * As most of the callers continue to use the IRE
2887 			 * after adding, we return a held ire. This will
2888 			 * avoid a lookup in the caller again. If the callers
2889 			 * don't want to use it, they need to do a REFRELE.
2890 			 */
2891 			ip1dbg(("found dup ire existing %p new %p\n",
2892 			    (void *)ire1, (void *)ire));
2893 			IRE_REFHOLD(ire1);
2894 			ire_atomic_end(irb_ptr, ire);
2895 			mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
2896 			ire_delete(ire);
2897 			if (pire != NULL) {
2898 				/*
2899 				 * Assert that it is not removed from the
2900 				 * list yet.
2901 				 */
2902 				ASSERT(pire->ire_ptpn != NULL);
2903 				IRB_REFRELE(pire->ire_bucket);
2904 				ire_refrele(pire);
2905 			}
2906 			*ire_p = ire1;
2907 			if (need_refrele)
2908 				IRB_REFRELE(irb_ptr);
2909 			return (0);
2910 		}
2911 	}
2912 
2913 	if (ire->ire_type & IRE_CACHE) {
2914 		ASSERT(ire->ire_stq != NULL);
2915 		nce = ndp_lookup_v4(ire_to_ill(ire),
2916 		    ((ire->ire_gateway_addr != INADDR_ANY) ?
2917 		    &ire->ire_gateway_addr : &ire->ire_addr),
2918 		    B_TRUE);
2919 		if (nce != NULL)
2920 			mutex_enter(&nce->nce_lock);
2921 		/*
2922 		 * If the nce is NCE_F_CONDEMNED, or if it is not ND_REACHABLE
2923 		 * and the caller has prohibited the addition of incomplete
2924 		 * ire's, we fail the add. Note that nce_state could be
2925 		 * something other than ND_REACHABLE if the nce had
2926 		 * just expired and the ire_create preceding the
2927 		 * ire_add added a new ND_INITIAL nce.
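		 *
		 * Schematically, the test below is:
		 *
		 *	nce == NULL				-> EINVAL
		 *	nce->nce_flags & NCE_F_CONDEMNED	-> EINVAL
		 *	!allow_unresolved &&
		 *	    nce->nce_state != ND_REACHABLE	-> EINVAL
		 *	otherwise				-> attach nce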
2928 */ 2929 if ((nce == NULL) || 2930 (nce->nce_flags & NCE_F_CONDEMNED) || 2931 (!allow_unresolved && 2932 (nce->nce_state != ND_REACHABLE))) { 2933 if (nce != NULL) { 2934 DTRACE_PROBE1(ire__bad__nce, nce_t *, nce); 2935 mutex_exit(&nce->nce_lock); 2936 } 2937 ire_atomic_end(irb_ptr, ire); 2938 mutex_exit(&ipst->ips_ndp4->ndp_g_lock); 2939 if (nce != NULL) 2940 NCE_REFRELE(nce); 2941 DTRACE_PROBE1(ire__no__nce, ire_t *, ire); 2942 ire_delete(ire); 2943 if (pire != NULL) { 2944 IRB_REFRELE(pire->ire_bucket); 2945 ire_refrele(pire); 2946 } 2947 *ire_p = NULL; 2948 if (need_refrele) 2949 IRB_REFRELE(irb_ptr); 2950 return (EINVAL); 2951 } else { 2952 ire->ire_nce = nce; 2953 mutex_exit(&nce->nce_lock); 2954 /* 2955 * We are associating this nce to the ire, so 2956 * change the nce ref taken in ndp_lookup_v4() from 2957 * NCE_REFHOLD to NCE_REFHOLD_NOTR 2958 */ 2959 NCE_REFHOLD_TO_REFHOLD_NOTR(ire->ire_nce); 2960 } 2961 } 2962 /* 2963 * Make it easy for ip_wput_ire() to hit multiple broadcast ires by 2964 * grouping identical addresses together on the hash chain. We do 2965 * this only for IRE_BROADCASTs as ip_wput_ire is currently interested 2966 * in such groupings only for broadcasts. 2967 * 2968 * Find the first entry that matches ire_addr. *irep will be null 2969 * if no match. 2970 * 2971 * Note: the loopback and non-loopback broadcast entries for an 2972 * interface MUST be added before any MULTIRT entries. 2973 */ 2974 irep = (ire_t **)irb_ptr; 2975 while ((ire1 = *irep) != NULL && ire->ire_addr != ire1->ire_addr) 2976 irep = &ire1->ire_next; 2977 if (ire->ire_type == IRE_BROADCAST && *irep != NULL) { 2978 /* 2979 * We found some ire (i.e *irep) with a matching addr. We 2980 * want to group ires with same addr. 2981 */ 2982 for (;;) { 2983 ire1 = *irep; 2984 if ((ire1->ire_next == NULL) || 2985 (ire1->ire_next->ire_addr != ire->ire_addr) || 2986 (ire1->ire_type != IRE_BROADCAST) || 2987 (ire1->ire_flags & RTF_MULTIRT) || 2988 (ire1->ire_ipif->ipif_ill->ill_grp == 2989 ire->ire_ipif->ipif_ill->ill_grp)) 2990 break; 2991 irep = &ire1->ire_next; 2992 } 2993 ASSERT(*irep != NULL); 2994 /* 2995 * The ire will be added before *irep, so 2996 * if irep is a MULTIRT ire, just break to 2997 * ire insertion code. 2998 */ 2999 if (((*irep)->ire_flags & RTF_MULTIRT) != 0) 3000 goto insert_ire; 3001 3002 irep = &((*irep)->ire_next); 3003 3004 /* 3005 * Either we have hit the end of the list or the address 3006 * did not match. 3007 */ 3008 while (*irep != NULL) { 3009 ire1 = *irep; 3010 if ((ire1->ire_addr != ire->ire_addr) || 3011 (ire1->ire_type != IRE_BROADCAST)) 3012 break; 3013 if (ire1->ire_ipif == ire->ire_ipif) { 3014 irep = &ire1->ire_next; 3015 break; 3016 } 3017 irep = &ire1->ire_next; 3018 } 3019 } else if (*irep != NULL) { 3020 /* 3021 * Find the last ire which matches ire_addr. 3022 * Needed to do tail insertion among entries with the same 3023 * ire_addr. 3024 */ 3025 while (ire->ire_addr == ire1->ire_addr) { 3026 irep = &ire1->ire_next; 3027 ire1 = *irep; 3028 if (ire1 == NULL) 3029 break; 3030 } 3031 } 3032 3033 insert_ire: 3034 /* Insert at *irep */ 3035 ire1 = *irep; 3036 if (ire1 != NULL) 3037 ire1->ire_ptpn = &ire->ire_next; 3038 ire->ire_next = ire1; 3039 /* Link the new one in. */ 3040 ire->ire_ptpn = irep; 3041 3042 /* 3043 * ire_walk routines de-reference ire_next without holding 3044 * a lock. 
Before we point to the new ire, we want to make
3045 	 * sure the store that sets the ire_next of the new ire
3046 	 * reaches global visibility, so that ire_walk routines
3047 	 * don't see a truncated list of ires, i.e., if the ire_next
3048 	 * of the new ire gets set after we do "*irep = ire" due
3049 	 * to re-ordering, the ire_walk thread will see a NULL
3050 	 * once it accesses the ire_next of the new ire.
3051 	 * membar_producer() makes sure that the following store
3052 	 * happens *after* all of the above stores.
3053 	 */
3054 	membar_producer();
3055 	*irep = ire;
3056 	ire->ire_bucket = irb_ptr;
3057 	/*
3058 	 * We return a bumped up IRE above. Keep it symmetrical
3059 	 * so that the callers will always have to release. This
3060 	 * helps the callers of this function because they continue
3061 	 * to use the IRE after adding and hence they don't have to
3062 	 * look it up again after we return the IRE.
3063 	 *
3064 	 * NOTE: We don't have to use atomics as this is appearing
3065 	 * in the list for the first time and no one else can bump
3066 	 * up the reference count on this yet.
3067 	 */
3068 	IRE_REFHOLD_LOCKED(ire);
3069 	BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_inserted);
3070 
3071 	irb_ptr->irb_ire_cnt++;
3072 	if (irb_ptr->irb_marks & IRB_MARK_FTABLE)
3073 		irb_ptr->irb_nire++;
3074 
3075 	if (ire->ire_marks & IRE_MARK_TEMPORARY)
3076 		irb_ptr->irb_tmp_ire_cnt++;
3077 
3078 	if (ire->ire_ipif != NULL) {
3079 		DTRACE_PROBE3(ipif__incr__cnt, (ipif_t *), ire->ire_ipif,
3080 		    (char *), "ire", (void *), ire);
3081 		ire->ire_ipif->ipif_ire_cnt++;
3082 		if (ire->ire_stq != NULL) {
3083 			stq_ill = (ill_t *)ire->ire_stq->q_ptr;
3084 			DTRACE_PROBE3(ill__incr__cnt, (ill_t *), stq_ill,
3085 			    (char *), "ire", (void *), ire);
3086 			stq_ill->ill_ire_cnt++;
3087 		}
3088 	} else {
3089 		ASSERT(ire->ire_stq == NULL);
3090 	}
3091 
3092 	ire_atomic_end(irb_ptr, ire);
3093 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3094 
3095 	if (pire != NULL) {
3096 		/* Assert that it is not removed from the list yet */
3097 		ASSERT(pire->ire_ptpn != NULL);
3098 		IRB_REFRELE(pire->ire_bucket);
3099 		ire_refrele(pire);
3100 	}
3101 
3102 	if (ire->ire_type != IRE_CACHE) {
3103 		/*
3104 		 * For IREs with a host mask, see if there is an entry
3105 		 * in the cache. If there is one, flush the whole cache as
3106 		 * there might be multiple entries due to RTF_MULTIRT (CGTP).
3107 		 * If no entry is found, then there is no need to flush the
3108 		 * cache.
3109 		 */
3110 		if (ire->ire_mask == IP_HOST_MASK) {
3111 			ire_t *lire;
3112 			lire = ire_ctable_lookup(ire->ire_addr, NULL, IRE_CACHE,
3113 			    NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
3114 			if (lire != NULL) {
3115 				ire_refrele(lire);
3116 				ire_flush_cache_v4(ire, IRE_FLUSH_ADD);
3117 			}
3118 		} else {
3119 			ire_flush_cache_v4(ire, IRE_FLUSH_ADD);
3120 		}
3121 	}
3122 	/*
3123 	 * We had to delay the fast path probe until the ire is inserted
3124 	 * in the list. Otherwise the fast path ack won't find the ire in
3125 	 * the table.
3126 	 */
3127 	if (ire->ire_type == IRE_CACHE ||
3128 	    (ire->ire_type == IRE_BROADCAST && ire->ire_stq != NULL)) {
3129 		ASSERT(ire->ire_nce != NULL);
3130 		if (ire->ire_nce->nce_state == ND_REACHABLE)
3131 			nce_fastpath(ire->ire_nce);
3132 	}
3133 	if (ire->ire_ipif != NULL)
3134 		ASSERT(!MUTEX_HELD(&ire->ire_ipif->ipif_ill->ill_lock));
3135 	*ire_p = ire;
3136 	if (need_refrele) {
3137 		IRB_REFRELE(irb_ptr);
3138 	}
3139 	return (0);
3140 }
3141 
3142 /*
3143  * IRB_REFRELE is the only caller of this function.  It does the final
3144  * cleanup for the CONDEMNED ires that ire_unlink has removed from a bucket.
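 *
 * The overall flow is roughly (a sketch of logic that lives in the
 * IRB_REFRELE macro, not in this file):
 *
 *	rw_enter(&irb->irb_lock, RW_WRITER);
 *	if (--irb->irb_refcnt == 0 &&
 *	    (irb->irb_marks & IRB_MARK_CONDEMNED)) {
 *		ire_list = ire_unlink(irb);
 *		rw_exit(&irb->irb_lock);
 *		ire_cleanup(ire_list);
 *	} else {
 *		rw_exit(&irb->irb_lock);
 *	}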
3145  */
3146 void
3147 ire_cleanup(ire_t *ire)
3148 {
3149 	ire_t *ire_next;
3150 	ip_stack_t *ipst = ire->ire_ipst;
3151 
3152 	ASSERT(ire != NULL);
3153 
3154 	while (ire != NULL) {
3155 		ire_next = ire->ire_next;
3156 		if (ire->ire_ipversion == IPV4_VERSION) {
3157 			ire_delete_v4(ire);
3158 			BUMP_IRE_STATS(ipst->ips_ire_stats_v4,
3159 			    ire_stats_deleted);
3160 		} else {
3161 			ASSERT(ire->ire_ipversion == IPV6_VERSION);
3162 			ire_delete_v6(ire);
3163 			BUMP_IRE_STATS(ipst->ips_ire_stats_v6,
3164 			    ire_stats_deleted);
3165 		}
3166 		/*
3167 		 * Now it's really out of the list. Before doing the
3168 		 * REFRELE, set ire_next to NULL, as ire_inactive asserts
3169 		 * that it is.
3170 		 */
3171 		ire->ire_next = NULL;
3172 		IRE_REFRELE_NOTR(ire);
3173 		ire = ire_next;
3174 	}
3175 }
3176 
3177 /*
3178  * IRB_REFRELE is the only caller of the function.  It is called to unlink
3179  * all the CONDEMNED ires from this bucket.
3180  */
3181 ire_t *
3182 ire_unlink(irb_t *irb)
3183 {
3184 	ire_t *ire;
3185 	ire_t *ire1;
3186 	ire_t **ptpn;
3187 	ire_t *ire_list = NULL;
3188 
3189 	ASSERT(RW_WRITE_HELD(&irb->irb_lock));
3190 	ASSERT(((irb->irb_marks & IRB_MARK_FTABLE) && irb->irb_refcnt == 1) ||
3191 	    (irb->irb_refcnt == 0));
3192 	ASSERT(irb->irb_marks & IRB_MARK_CONDEMNED);
3193 	ASSERT(irb->irb_ire != NULL);
3194 
3195 	for (ire = irb->irb_ire; ire != NULL; ire = ire1) {
3196 		ip_stack_t *ipst = ire->ire_ipst;
3197 
3198 		ire1 = ire->ire_next;
3199 		if (ire->ire_marks & IRE_MARK_CONDEMNED) {
3200 			ptpn = ire->ire_ptpn;
3201 			ire1 = ire->ire_next;
3202 			if (ire1)
3203 				ire1->ire_ptpn = ptpn;
3204 			*ptpn = ire1;
3205 			ire->ire_ptpn = NULL;
3206 			ire->ire_next = NULL;
3207 			if (ire->ire_type == IRE_DEFAULT) {
3208 				/*
3209 				 * IRE is out of the list. We need to adjust
3210 				 * the accounting before the caller drops
3211 				 * the lock.
3212 				 */
3213 				if (ire->ire_ipversion == IPV6_VERSION) {
3214 					ASSERT(ipst->
3215 					    ips_ipv6_ire_default_count !=
3216 					    0);
3217 					ipst->ips_ipv6_ire_default_count--;
3218 				}
3219 			}
3220 			/*
3221 			 * We need to call ire_delete_v4 or ire_delete_v6
3222 			 * to clean up the cache or the redirects pointing at
3223 			 * the default gateway. We need to drop the lock
3224 			 * as ire_flush_cache/ire_delete_host_redirects
3225 			 * require it. But we can't drop the lock, as ire_unlink
3226 			 * needs to atomically remove the ires from the list.
3227 			 * So, create a temporary list of CONDEMNED ires
3228 			 * for doing ire_delete_v4/ire_delete_v6 operations
3229 			 * later on.
3230 			 */
3231 			ire->ire_next = ire_list;
3232 			ire_list = ire;
3233 		}
3234 	}
3235 	irb->irb_marks &= ~IRB_MARK_CONDEMNED;
3236 	return (ire_list);
3237 }
3238 
3239 /*
3240  * Delete all the cache entries with this 'addr'. When IP gets a gratuitous
3241  * ARP message on any of its interface queues, it scans the nce table and
3242  * calls ndp_delete() for the appropriate nce. This action
3243  * also deletes all the neighbor/ire cache entries for that address.
3244  * This function is called from ip_arp_news in ip.c and also for
3245  * ARP ioctl processing in ip_if.c. ip_ire_clookup_and_delete returns
3246  * true if it finds an nce entry, which is used by ip_arp_news to determine if
3247  * it needs to do an ire_walk_v4. The return value is also used for the
3248  * same purpose by ARP IOCTL processing in ip_if.c when deleting
3249  * ARP entries. For SIOC*IFARP ioctls, in addition to the address,
3250  * ip_if->ipif_ill also needs to be matched.
3251  */
3252 boolean_t
3253 ip_ire_clookup_and_delete(ipaddr_t addr, ipif_t *ipif, ip_stack_t *ipst)
3254 {
3255 	ill_t	*ill;
3256 	nce_t	*nce;
3257 
3258 	ill = (ipif ?
ipif->ipif_ill : NULL); 3259 3260 if (ill != NULL) { 3261 /* 3262 * clean up the nce (and any relevant ire's) that matches 3263 * on addr and ill. 3264 */ 3265 nce = ndp_lookup_v4(ill, &addr, B_FALSE); 3266 if (nce != NULL) { 3267 ndp_delete(nce); 3268 return (B_TRUE); 3269 } 3270 } else { 3271 /* 3272 * ill is wildcard. clean up all nce's and 3273 * ire's that match on addr 3274 */ 3275 nce_clookup_t cl; 3276 3277 cl.ncecl_addr = addr; 3278 cl.ncecl_found = B_FALSE; 3279 3280 ndp_walk_common(ipst->ips_ndp4, NULL, 3281 (pfi_t)ip_nce_clookup_and_delete, (uchar_t *)&cl, B_TRUE); 3282 3283 /* 3284 * ncecl_found would be set by ip_nce_clookup_and_delete if 3285 * we found a matching nce. 3286 */ 3287 return (cl.ncecl_found); 3288 } 3289 return (B_FALSE); 3290 3291 } 3292 3293 /* Delete the supplied nce if its nce_addr matches the supplied address */ 3294 static void 3295 ip_nce_clookup_and_delete(nce_t *nce, void *arg) 3296 { 3297 nce_clookup_t *cl = (nce_clookup_t *)arg; 3298 ipaddr_t nce_addr; 3299 3300 IN6_V4MAPPED_TO_IPADDR(&nce->nce_addr, nce_addr); 3301 if (nce_addr == cl->ncecl_addr) { 3302 cl->ncecl_found = B_TRUE; 3303 /* clean up the nce (and any relevant ire's) */ 3304 ndp_delete(nce); 3305 } 3306 } 3307 3308 /* 3309 * Clean up the radix node for this ire. Must be called by IRB_REFRELE 3310 * when there are no ire's left in the bucket. Returns TRUE if the bucket 3311 * is deleted and freed. 3312 */ 3313 boolean_t 3314 irb_inactive(irb_t *irb) 3315 { 3316 struct rt_entry *rt; 3317 struct radix_node *rn; 3318 ip_stack_t *ipst = irb->irb_ipst; 3319 3320 ASSERT(irb->irb_ipst != NULL); 3321 3322 rt = IRB2RT(irb); 3323 rn = (struct radix_node *)rt; 3324 3325 /* first remove it from the radix tree. */ 3326 RADIX_NODE_HEAD_WLOCK(ipst->ips_ip_ftable); 3327 rw_enter(&irb->irb_lock, RW_WRITER); 3328 if (irb->irb_refcnt == 1 && irb->irb_nire == 0) { 3329 rn = ipst->ips_ip_ftable->rnh_deladdr(rn->rn_key, rn->rn_mask, 3330 ipst->ips_ip_ftable); 3331 DTRACE_PROBE1(irb__free, rt_t *, rt); 3332 ASSERT((void *)rn == (void *)rt); 3333 Free(rt, rt_entry_cache); 3334 /* irb_lock is freed */ 3335 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 3336 return (B_TRUE); 3337 } 3338 rw_exit(&irb->irb_lock); 3339 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 3340 return (B_FALSE); 3341 } 3342 3343 /* 3344 * Delete the specified IRE. 3345 */ 3346 void 3347 ire_delete(ire_t *ire) 3348 { 3349 ire_t *ire1; 3350 ire_t **ptpn; 3351 irb_t *irb; 3352 ip_stack_t *ipst = ire->ire_ipst; 3353 3354 if ((irb = ire->ire_bucket) == NULL) { 3355 /* 3356 * It was never inserted in the list. Should call REFRELE 3357 * to free this IRE. 3358 */ 3359 IRE_REFRELE_NOTR(ire); 3360 return; 3361 } 3362 3363 rw_enter(&irb->irb_lock, RW_WRITER); 3364 3365 if (irb->irb_rr_origin == ire) { 3366 irb->irb_rr_origin = NULL; 3367 } 3368 3369 /* 3370 * In case of V4 we might still be waiting for fastpath ack. 3371 */ 3372 if (ire->ire_ipversion == IPV4_VERSION && 3373 (ire->ire_type == IRE_CACHE || 3374 (ire->ire_type == IRE_BROADCAST && ire->ire_stq != NULL))) { 3375 ASSERT(ire->ire_nce != NULL); 3376 nce_fastpath_list_delete(ire->ire_nce); 3377 } 3378 3379 if (ire->ire_ptpn == NULL) { 3380 /* 3381 * Some other thread has removed us from the list. 3382 * It should have done the REFRELE for us. 
3383 	 */
3384 		rw_exit(&irb->irb_lock);
3385 		return;
3386 	}
3387 
3388 	if (!(ire->ire_marks & IRE_MARK_CONDEMNED)) {
3389 		irb->irb_ire_cnt--;
3390 		ire->ire_marks |= IRE_MARK_CONDEMNED;
3391 		if (ire->ire_marks & IRE_MARK_TEMPORARY) {
3392 			irb->irb_tmp_ire_cnt--;
3393 			ire->ire_marks &= ~IRE_MARK_TEMPORARY;
3394 		}
3395 	}
3396 
3397 	if (irb->irb_refcnt != 0) {
3398 		/*
3399 		 * The last thread to leave this bucket will
3400 		 * delete this ire.
3401 		 */
3402 		irb->irb_marks |= IRB_MARK_CONDEMNED;
3403 		rw_exit(&irb->irb_lock);
3404 		return;
3405 	}
3406 
3407 	/*
3408 	 * Normally to delete an ire, we walk the bucket. While we
3409 	 * walk the bucket, we normally bump up irb_refcnt and hence
3410 	 * we return from above where we mark CONDEMNED and the ire
3411 	 * gets deleted by ire_unlink. This case is where somebody
3412 	 * knows the ire, e.g., by doing a lookup, and wants to delete the
3413 	 * IRE. irb_refcnt would be 0 in this case if nobody is walking
3414 	 * the bucket.
3415 	 */
3416 	ptpn = ire->ire_ptpn;
3417 	ire1 = ire->ire_next;
3418 	if (ire1 != NULL)
3419 		ire1->ire_ptpn = ptpn;
3420 	ASSERT(ptpn != NULL);
3421 	*ptpn = ire1;
3422 	ire->ire_ptpn = NULL;
3423 	ire->ire_next = NULL;
3424 	if (ire->ire_ipversion == IPV6_VERSION) {
3425 		BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_deleted);
3426 	} else {
3427 		BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_deleted);
3428 	}
3429 	/*
3430 	 * ip_wput/ip_wput_v6 check this flag to see whether
3431 	 * they should still use the cached ire or not.
3432 	 */
3433 	if (ire->ire_type == IRE_DEFAULT) {
3434 		/*
3435 		 * IRE is out of the list. We need to adjust the
3436 		 * accounting before we drop the lock.
3437 		 */
3438 		if (ire->ire_ipversion == IPV6_VERSION) {
3439 			ASSERT(ipst->ips_ipv6_ire_default_count != 0);
3440 			ipst->ips_ipv6_ire_default_count--;
3441 		}
3442 	}
3443 	rw_exit(&irb->irb_lock);
3444 
3445 	if (ire->ire_ipversion == IPV6_VERSION) {
3446 		ire_delete_v6(ire);
3447 	} else {
3448 		ire_delete_v4(ire);
3449 	}
3450 	/*
3451 	 * We removed it from the list. Decrement the
3452 	 * reference count.
3453 	 */
3454 	IRE_REFRELE_NOTR(ire);
3455 }
3456 
3457 /*
3458  * Delete the specified IRE.
3459  * All calls should use ire_delete().
3460  * Sometimes called as writer though not required by this function.
3461  *
3462  * NOTE: This function is called only if the ire was added
3463  * to the list.
3464  */
3465 static void
3466 ire_delete_v4(ire_t *ire)
3467 {
3468 	ip_stack_t	*ipst = ire->ire_ipst;
3469 
3470 	ASSERT(ire->ire_refcnt >= 1);
3471 	ASSERT(ire->ire_ipversion == IPV4_VERSION);
3472 
3473 	if (ire->ire_type != IRE_CACHE)
3474 		ire_flush_cache_v4(ire, IRE_FLUSH_DELETE);
3475 	if (ire->ire_type == IRE_DEFAULT) {
3476 		/*
3477 		 * When a default gateway is going away,
3478 		 * delete all the host redirects pointing at that
3479 		 * gateway.
3480 		 */
3481 		ire_delete_host_redirects(ire->ire_gateway_addr, ipst);
3482 	}
3483 }
3484 
3485 /*
3486  * IRE_REFRELE/ire_refrele are the only callers of this function.  They call
3487  * it to free the ire when the reference count goes to zero.
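 *
 * In other words, the release path is roughly (a sketch of the macro
 * logic; the real macros also handle reference tracing):
 *
 *	if (atomic_add_32_nv(&ire->ire_refcnt, -1) == 0)
 *		ire_inactive(ire);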
3488  */
3489 void
3490 ire_inactive(ire_t *ire)
3491 {
3492 	nce_t	*nce;
3493 	ill_t	*ill = NULL;
3494 	ill_t	*stq_ill = NULL;
3495 	ipif_t	*ipif;
3496 	boolean_t	need_wakeup = B_FALSE;
3497 	irb_t	*irb;
3498 	ip_stack_t	*ipst = ire->ire_ipst;
3499 
3500 	ASSERT(ire->ire_refcnt == 0);
3501 	ASSERT(ire->ire_ptpn == NULL);
3502 	ASSERT(ire->ire_next == NULL);
3503 
3504 	if (ire->ire_gw_secattr != NULL) {
3505 		ire_gw_secattr_free(ire->ire_gw_secattr);
3506 		ire->ire_gw_secattr = NULL;
3507 	}
3508 
3509 	if (ire->ire_mp != NULL) {
3510 		ASSERT(ire->ire_bucket == NULL);
3511 		mutex_destroy(&ire->ire_lock);
3512 		BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_freed);
3513 		if (ire->ire_nce != NULL)
3514 			NCE_REFRELE_NOTR(ire->ire_nce);
3515 		freeb(ire->ire_mp);
3516 		return;
3517 	}
3518 
3519 	if ((nce = ire->ire_nce) != NULL) {
3520 		NCE_REFRELE_NOTR(nce);
3521 		ire->ire_nce = NULL;
3522 	}
3523 
3524 	if (ire->ire_ipif == NULL)
3525 		goto end;
3526 
3527 	ipif = ire->ire_ipif;
3528 	ill = ipif->ipif_ill;
3529 
3530 	if (ire->ire_bucket == NULL) {
3531 		/* The ire was never inserted in the table. */
3532 		goto end;
3533 	}
3534 
3535 	/*
3536 	 * ipif_ire_cnt on this ipif goes down by 1. If the ire_stq is
3537 	 * non-null, ill_ire_cnt also goes down by 1.
3538 	 *
3539 	 * The ipif that is associated with an ire is ire->ire_ipif and
3540 	 * hence when the ire->ire_ipif->ipif_ire_cnt drops to zero we call
3541 	 * ipif_ill_refrele_tail. Usually stq_ill is null or the same as
3542 	 * ire->ire_ipif->ipif_ill. So nothing more needs to be done.
3543 	 * However, for VNI or IPMP IRE entries, stq_ill can be different.
3544 	 * If this is different from ire->ire_ipif->ipif_ill and if the
3545 	 * ill_ire_cnt on the stq_ill also has dropped to zero, we call
3546 	 * ipif_ill_refrele_tail on the stq_ill.
3547 	 */
3548 	if (ire->ire_stq != NULL)
3549 		stq_ill = ire->ire_stq->q_ptr;
3550 
3551 	if (stq_ill == NULL || stq_ill == ill) {
3552 		/* Optimize the most common case */
3553 		mutex_enter(&ill->ill_lock);
3554 		ASSERT(ipif->ipif_ire_cnt != 0);
3555 		DTRACE_PROBE3(ipif__decr__cnt, (ipif_t *), ipif,
3556 		    (char *), "ire", (void *), ire);
3557 		ipif->ipif_ire_cnt--;
3558 		if (IPIF_DOWN_OK(ipif))
3559 			need_wakeup = B_TRUE;
3560 		if (stq_ill != NULL) {
3561 			ASSERT(stq_ill->ill_ire_cnt != 0);
3562 			DTRACE_PROBE3(ill__decr__cnt, (ill_t *), stq_ill,
3563 			    (char *), "ire", (void *), ire);
3564 			stq_ill->ill_ire_cnt--;
3565 			if (ILL_DOWN_OK(stq_ill))
3566 				need_wakeup = B_TRUE;
3567 		}
3568 		if (need_wakeup) {
3569 			/* Drops the ill lock */
3570 			ipif_ill_refrele_tail(ill);
3571 		} else {
3572 			mutex_exit(&ill->ill_lock);
3573 		}
3574 	} else {
3575 		/*
3576 		 * We can't grab all the ill locks at the same time.
3577 		 * It can lead to a recursive lock enter in the call to
3578 		 * ipif_ill_refrele_tail and later.  Instead do it one at
3579 		 * a time.
3580 */ 3581 mutex_enter(&ill->ill_lock); 3582 ASSERT(ipif->ipif_ire_cnt != 0); 3583 DTRACE_PROBE3(ipif__decr__cnt, (ipif_t *), ipif, 3584 (char *), "ire", (void *), ire); 3585 ipif->ipif_ire_cnt--; 3586 if (IPIF_DOWN_OK(ipif)) { 3587 /* Drops the lock */ 3588 ipif_ill_refrele_tail(ill); 3589 } else { 3590 mutex_exit(&ill->ill_lock); 3591 } 3592 if (stq_ill != NULL) { 3593 mutex_enter(&stq_ill->ill_lock); 3594 ASSERT(stq_ill->ill_ire_cnt != 0); 3595 DTRACE_PROBE3(ill__decr__cnt, (ill_t *), stq_ill, 3596 (char *), "ire", (void *), ire); 3597 stq_ill->ill_ire_cnt--; 3598 if (ILL_DOWN_OK(stq_ill)) { 3599 /* Drops the ill lock */ 3600 ipif_ill_refrele_tail(stq_ill); 3601 } else { 3602 mutex_exit(&stq_ill->ill_lock); 3603 } 3604 } 3605 } 3606 end: 3607 /* This should be true for both V4 and V6 */ 3608 3609 if ((ire->ire_type & IRE_FORWARDTABLE) && 3610 (ire->ire_ipversion == IPV4_VERSION) && 3611 ((irb = ire->ire_bucket) != NULL)) { 3612 rw_enter(&irb->irb_lock, RW_WRITER); 3613 irb->irb_nire--; 3614 /* 3615 * Instead of examining the conditions for freeing 3616 * the radix node here, we do it by calling 3617 * IRB_REFRELE which is a single point in the code 3618 * that embeds that logic. Bump up the refcnt to 3619 * be able to call IRB_REFRELE. 3620 */ 3621 IRB_REFHOLD_LOCKED(irb); 3622 rw_exit(&irb->irb_lock); 3623 IRB_REFRELE(irb); 3624 } 3625 ire->ire_ipif = NULL; 3626 3627 #ifdef DEBUG 3628 ire_trace_cleanup(ire); 3629 #endif 3630 mutex_destroy(&ire->ire_lock); 3631 if (ire->ire_ipversion == IPV6_VERSION) { 3632 BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_freed); 3633 } else { 3634 BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_freed); 3635 } 3636 ASSERT(ire->ire_mp == NULL); 3637 /* Has been allocated out of the cache */ 3638 kmem_cache_free(ire_cache, ire); 3639 } 3640 3641 /* 3642 * ire_walk routine to delete all IRE_CACHE/IRE_HOST type redirect 3643 * entries that have a given gateway address. 3644 */ 3645 void 3646 ire_delete_cache_gw(ire_t *ire, char *cp) 3647 { 3648 ipaddr_t gw_addr; 3649 3650 if (!(ire->ire_type & IRE_CACHE) && 3651 !(ire->ire_flags & RTF_DYNAMIC)) 3652 return; 3653 3654 bcopy(cp, &gw_addr, sizeof (gw_addr)); 3655 if (ire->ire_gateway_addr == gw_addr) { 3656 ip1dbg(("ire_delete_cache_gw: deleted 0x%x type %d to 0x%x\n", 3657 (int)ntohl(ire->ire_addr), ire->ire_type, 3658 (int)ntohl(ire->ire_gateway_addr))); 3659 ire_delete(ire); 3660 } 3661 } 3662 3663 /* 3664 * Remove all IRE_CACHE entries that match the ire specified. 3665 * 3666 * The flag argument indicates if the flush request is due to addition 3667 * of new route (IRE_FLUSH_ADD) or deletion of old route (IRE_FLUSH_DELETE). 3668 * 3669 * This routine takes only the IREs from the forwarding table and flushes 3670 * the corresponding entries from the cache table. 3671 * 3672 * When flushing due to the deletion of an old route, it 3673 * just checks the cache handles (ire_phandle and ire_ihandle) and 3674 * deletes the ones that match. 3675 * 3676 * When flushing due to the creation of a new route, it checks 3677 * if a cache entry's address matches the one in the IRE and 3678 * that the cache entry's parent has a less specific mask than the 3679 * one in IRE. The destination of such a cache entry could be the 3680 * gateway for other cache entries, so we need to flush those as 3681 * well by looking for gateway addresses matching the IRE's address.
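 *
 * The two IRE_FLUSH_ADD checks described above can be sketched as
 * standalone predicates (illustrative only; the authoritative logic is
 * in the function body below):
 *
 *	#include <stdint.h>
 *	typedef uint32_t ipaddr_t;
 *
 *	/* cache entry in the new prefix, derived from a less specific one */
 *	static int
 *	covered_and_less_specific(ipaddr_t cire_addr, int cire_plen,
 *	    ipaddr_t ire_addr, ipaddr_t ire_mask, int ire_masklen)
 *	{
 *		return ((cire_addr & ire_mask) == (ire_addr & ire_mask) &&
 *		    cire_plen <= ire_masklen);
 *	}
 *
 *	/* cache entry whose gateway falls inside the new prefix */
 *	static int
 *	gateway_covered(ipaddr_t cire_gw, ipaddr_t ire_addr, ipaddr_t ire_mask)
 *	{
 *		return ((cire_gw & ire_mask) == (ire_addr & ire_mask));
 *	}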
3682 */ 3683 void 3684 ire_flush_cache_v4(ire_t *ire, int flag) 3685 { 3686 int i; 3687 ire_t *cire; 3688 irb_t *irb; 3689 ip_stack_t *ipst = ire->ire_ipst; 3690 3691 if (ire->ire_type & IRE_CACHE) 3692 return; 3693 3694 /* 3695 * If a default is just created, there is no point 3696 * in going through the cache, as there will not be any 3697 * cached ires. 3698 */ 3699 if (ire->ire_type == IRE_DEFAULT && flag == IRE_FLUSH_ADD) 3700 return; 3701 if (flag == IRE_FLUSH_ADD) { 3702 /* 3703 * This selective flush is due to the addition of 3704 * new IRE. 3705 */ 3706 for (i = 0; i < ipst->ips_ip_cache_table_size; i++) { 3707 irb = &ipst->ips_ip_cache_table[i]; 3708 if ((cire = irb->irb_ire) == NULL) 3709 continue; 3710 IRB_REFHOLD(irb); 3711 for (cire = irb->irb_ire; cire != NULL; 3712 cire = cire->ire_next) { 3713 if (cire->ire_type != IRE_CACHE) 3714 continue; 3715 /* 3716 * If 'cire' belongs to the same subnet 3717 * as the new ire being added, and 'cire' 3718 * is derived from a prefix that is less 3719 * specific than the new ire being added, 3720 * we need to flush 'cire'; for instance, 3721 * when a new interface comes up. 3722 */ 3723 if (((cire->ire_addr & ire->ire_mask) == 3724 (ire->ire_addr & ire->ire_mask)) && 3725 (ip_mask_to_plen(cire->ire_cmask) <= 3726 ire->ire_masklen)) { 3727 ire_delete(cire); 3728 continue; 3729 } 3730 /* 3731 * This is the case when the ire_gateway_addr 3732 * of 'cire' belongs to the same subnet as 3733 * the new ire being added. 3734 * Flushing such ires is sometimes required to 3735 * avoid misrouting: say we have a machine with 3736 * two interfaces (I1 and I2), a default router 3737 * R on the I1 subnet, and a host route to an 3738 * off-link destination D with a gateway G on 3739 * the I2 subnet. 3740 * Under normal operation, we will have an 3741 * on-link cache entry for G and an off-link 3742 * cache entry for D with G as ire_gateway_addr, 3743 * traffic to D will reach its destination 3744 * through gateway G. 3745 * If the administrator does 'ifconfig I2 down', 3746 * the cache entries for D and G will be 3747 * flushed. However, G will now be resolved as 3748 * an off-link destination using R (the default 3749 * router) as gateway. Then D will also be 3750 * resolved as an off-link destination using G 3751 * as gateway - this behavior is due to 3752 * compatibility reasons, see comment in 3753 * ire_ihandle_lookup_offlink(). Traffic to D 3754 * will go to the router R and probably won't 3755 * reach the destination. 3756 * The administrator then does 'ifconfig I2 up'. 3757 * Since G is on the I2 subnet, this routine 3758 * will flush its cache entry. It must also 3759 * flush the cache entry for D, otherwise 3760 * traffic will stay misrouted until the IRE 3761 * times out. 3762 */ 3763 if ((cire->ire_gateway_addr & ire->ire_mask) == 3764 (ire->ire_addr & ire->ire_mask)) { 3765 ire_delete(cire); 3766 continue; 3767 } 3768 } 3769 IRB_REFRELE(irb); 3770 } 3771 } else { 3772 /* 3773 * delete the cache entries based on 3774 * handle in the IRE as this IRE is 3775 * being deleted/changed. 
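 *
 * The IRE_FLUSH_DELETE leg below keeps a cache entry only when both
 * handles fail to match; equivalently, it deletes when either handle is
 * nonzero and equal. An illustrative predicate (sketch only):
 *
 *	#include <stdint.h>
 *
 *	static int
 *	handle_matches(uint32_t c_phandle, uint32_t c_ihandle,
 *	    uint32_t phandle, uint32_t ihandle)
 *	{
 *		return ((c_phandle != 0 && c_phandle == phandle) ||
 *		    (c_ihandle != 0 && c_ihandle == ihandle));
 *	}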
3776 */ 3777 for (i = 0; i < ipst->ips_ip_cache_table_size; i++) { 3778 irb = &ipst->ips_ip_cache_table[i]; 3779 if ((cire = irb->irb_ire) == NULL) 3780 continue; 3781 IRB_REFHOLD(irb); 3782 for (cire = irb->irb_ire; cire != NULL; 3783 cire = cire->ire_next) { 3784 if (cire->ire_type != IRE_CACHE) 3785 continue; 3786 if ((cire->ire_phandle == 0 || 3787 cire->ire_phandle != ire->ire_phandle) && 3788 (cire->ire_ihandle == 0 || 3789 cire->ire_ihandle != ire->ire_ihandle)) 3790 continue; 3791 ire_delete(cire); 3792 } 3793 IRB_REFRELE(irb); 3794 } 3795 } 3796 } 3797 3798 /* 3799 * Matches the arguments passed with the values in the ire. 3800 * 3801 * Note: for match types that match using "ipif" passed in, ipif 3802 * must be checked for non-NULL before calling this routine. 3803 */ 3804 boolean_t 3805 ire_match_args(ire_t *ire, ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway, 3806 int type, const ipif_t *ipif, zoneid_t zoneid, uint32_t ihandle, 3807 const ts_label_t *tsl, int match_flags, queue_t *wq) 3808 { 3809 ill_t *ire_ill = NULL, *dst_ill; 3810 ill_t *ipif_ill = NULL; 3811 3812 ASSERT(ire->ire_ipversion == IPV4_VERSION); 3813 ASSERT((ire->ire_addr & ~ire->ire_mask) == 0); 3814 ASSERT((!(match_flags & MATCH_IRE_ILL)) || 3815 (ipif != NULL && !ipif->ipif_isv6)); 3816 ASSERT(!(match_flags & MATCH_IRE_WQ) || wq != NULL); 3817 3818 /* 3819 * If MATCH_IRE_MARK_TESTHIDDEN is set, then only return the IRE if it 3820 * is in fact hidden, to ensure the caller gets the right one. One 3821 * exception: if the caller passed MATCH_IRE_IHANDLE, then they 3822 * already know the identity of the given IRE_INTERFACE entry and 3823 * there's no point trying to hide it from them. 3824 */ 3825 if (ire->ire_marks & IRE_MARK_TESTHIDDEN) { 3826 if (match_flags & MATCH_IRE_IHANDLE) 3827 match_flags |= MATCH_IRE_MARK_TESTHIDDEN; 3828 3829 if (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN)) 3830 return (B_FALSE); 3831 } 3832 3833 /* 3834 * MATCH_IRE_MARK_PRIVATE_ADDR is set when IP_NEXTHOP option 3835 * is used. In that case the routing table is bypassed and the 3836 * packets are sent directly to the specified nexthop. The 3837 * IRE_CACHE entry representing this route should be marked 3838 * with IRE_MARK_PRIVATE_ADDR. 3839 */ 3840 3841 if (!(match_flags & MATCH_IRE_MARK_PRIVATE_ADDR) && 3842 (ire->ire_marks & IRE_MARK_PRIVATE_ADDR)) 3843 return (B_FALSE); 3844 3845 if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid && 3846 ire->ire_zoneid != ALL_ZONES) { 3847 /* 3848 * If MATCH_IRE_ZONEONLY has been set and the supplied zoneid is 3849 * valid and does not match that of ire_zoneid, a failure to 3850 * match is reported at this point. Otherwise, since some IREs 3851 * that are available in the global zone can be used in local 3852 * zones, additional checks need to be performed: 3853 * 3854 * IRE_BROADCAST, IRE_CACHE and IRE_LOOPBACK 3855 * entries should never be matched in this situation. 3856 * 3857 * IRE entries that have an interface associated with them 3858 * should in general not match unless they are an IRE_LOCAL 3859 * or in the case when MATCH_IRE_DEFAULT has been set in 3860 * the caller. In the case of the former, checking of the 3861 * other fields supplied should take place. 3862 * 3863 * In the case where MATCH_IRE_DEFAULT has been set, 3864 * all of the ipif's associated with the IRE's ill are 3865 * checked to see if there is a matching zoneid. If any 3866 * one ipif has a matching zoneid, this IRE is a 3867 * potential candidate so checking of the other fields 3868 * takes place. 
3869 * 3870 * In the case where the IRE_INTERFACE has a usable source 3871 * address (indicated by ill_usesrc_ifindex) in the 3872 * correct zone then it's permitted to return this IRE 3873 */ 3874 if (match_flags & MATCH_IRE_ZONEONLY) 3875 return (B_FALSE); 3876 if (ire->ire_type & (IRE_BROADCAST | IRE_CACHE | IRE_LOOPBACK)) 3877 return (B_FALSE); 3878 /* 3879 * Note, IRE_INTERFACE can have the stq as NULL. For 3880 * example, if the default multicast route is tied to 3881 * the loopback address. 3882 */ 3883 if ((ire->ire_type & IRE_INTERFACE) && 3884 (ire->ire_stq != NULL)) { 3885 dst_ill = (ill_t *)ire->ire_stq->q_ptr; 3886 /* 3887 * If there is a usable source address in the 3888 * zone, then it's ok to return an 3889 * IRE_INTERFACE 3890 */ 3891 if (ipif_usesrc_avail(dst_ill, zoneid)) { 3892 ip3dbg(("ire_match_args: dst_ill %p match %d\n", 3893 (void *)dst_ill, 3894 (ire->ire_addr == (addr & mask)))); 3895 } else { 3896 ip3dbg(("ire_match_args: src_ipif NULL" 3897 " dst_ill %p\n", (void *)dst_ill)); 3898 return (B_FALSE); 3899 } 3900 } 3901 if (ire->ire_ipif != NULL && ire->ire_type != IRE_LOCAL && 3902 !(ire->ire_type & IRE_INTERFACE)) { 3903 ipif_t *tipif; 3904 3905 if ((match_flags & MATCH_IRE_DEFAULT) == 0) { 3906 return (B_FALSE); 3907 } 3908 mutex_enter(&ire->ire_ipif->ipif_ill->ill_lock); 3909 for (tipif = ire->ire_ipif->ipif_ill->ill_ipif; 3910 tipif != NULL; tipif = tipif->ipif_next) { 3911 if (IPIF_CAN_LOOKUP(tipif) && 3912 (tipif->ipif_flags & IPIF_UP) && 3913 (tipif->ipif_zoneid == zoneid || 3914 tipif->ipif_zoneid == ALL_ZONES)) 3915 break; 3916 } 3917 mutex_exit(&ire->ire_ipif->ipif_ill->ill_lock); 3918 if (tipif == NULL) { 3919 return (B_FALSE); 3920 } 3921 } 3922 } 3923 3924 /* 3925 * For IRE_CACHE entries, MATCH_IRE_ILL means that somebody wants to 3926 * send out ire_stq (ire_ipif for IRE_CACHE entries is just the means 3927 * of getting a source address -- i.e., ire_src_addr == 3928 * ire->ire_ipif->ipif_src_addr). ire_to_ill() handles this. 3929 * 3930 * NOTE: For IPMP, MATCH_IRE_ILL usually matches any ill in the group. 3931 * However, if MATCH_IRE_MARK_TESTHIDDEN is set (i.e., the IRE is for 3932 * IPMP test traffic), then the ill must match exactly. 
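 *
 * The compound conditional below strings together clauses of the form
 * (!(match_flags & FLAG) || predicate), so a criterion participates
 * only when its flag is set. A compact sketch of the same pattern
 * (illustrative, with made-up flag names):
 *
 *	#define	M_GW	0x1
 *	#define	M_TYPE	0x2
 *
 *	static int
 *	match(int flags, int gw_ok, int type_ok)
 *	{
 *		return ((!(flags & M_GW) || gw_ok) &&
 *		    (!(flags & M_TYPE) || type_ok));
 *	}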
3933 */ 3934 if (match_flags & MATCH_IRE_ILL) { 3935 ire_ill = ire_to_ill(ire); 3936 ipif_ill = ipif->ipif_ill; 3937 } 3938 3939 if ((ire->ire_addr == (addr & mask)) && 3940 ((!(match_flags & MATCH_IRE_GW)) || 3941 (ire->ire_gateway_addr == gateway)) && 3942 ((!(match_flags & MATCH_IRE_TYPE)) || 3943 (ire->ire_type & type)) && 3944 ((!(match_flags & MATCH_IRE_SRC)) || 3945 (ire->ire_src_addr == ipif->ipif_src_addr)) && 3946 ((!(match_flags & MATCH_IRE_IPIF)) || 3947 (ire->ire_ipif == ipif)) && 3948 ((!(match_flags & MATCH_IRE_MARK_TESTHIDDEN)) || 3949 (ire->ire_marks & IRE_MARK_TESTHIDDEN)) && 3950 ((!(match_flags & MATCH_IRE_MARK_PRIVATE_ADDR)) || 3951 (ire->ire_type != IRE_CACHE || 3952 ire->ire_marks & IRE_MARK_PRIVATE_ADDR)) && 3953 ((!(match_flags & MATCH_IRE_WQ)) || 3954 (ire->ire_stq == wq)) && 3955 ((!(match_flags & MATCH_IRE_ILL)) || 3956 (ire_ill == ipif_ill || 3957 (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN) && 3958 ire_ill != NULL && IS_IN_SAME_ILLGRP(ipif_ill, ire_ill)))) && 3959 ((!(match_flags & MATCH_IRE_IHANDLE)) || 3960 (ire->ire_ihandle == ihandle)) && 3961 ((!(match_flags & MATCH_IRE_MASK)) || 3962 (ire->ire_mask == mask)) && 3963 ((!(match_flags & MATCH_IRE_SECATTR)) || 3964 (!is_system_labeled()) || 3965 (tsol_ire_match_gwattr(ire, tsl) == 0))) { 3966 /* We found the matched IRE */ 3967 return (B_TRUE); 3968 } 3969 return (B_FALSE); 3970 } 3971 3972 /* 3973 * Look up a route in all the tables. 3974 */ 3975 ire_t * 3976 ire_route_lookup(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway, 3977 int type, const ipif_t *ipif, ire_t **pire, zoneid_t zoneid, 3978 const ts_label_t *tsl, int flags, ip_stack_t *ipst) 3979 { 3980 ire_t *ire = NULL; 3981 3982 /* 3983 * ire_match_args() will dereference ipif if MATCH_IRE_SRC or 3984 * MATCH_IRE_ILL is set. 3985 */ 3986 if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) && (ipif == NULL)) 3987 return (NULL); 3988 3989 /* 3990 * The caller might be asking for a cache lookup; this is not the 3991 * best way to look up the cache, and such callers should use 3992 * ire_cache_lookup() directly. 3993 * 3994 * If MATCH_IRE_TYPE was set, first lookup in the cache table and then 3995 * in the forwarding table, if the applicable type flags were set. 3996 */ 3997 if ((flags & MATCH_IRE_TYPE) == 0 || (type & IRE_CACHETABLE) != 0) { 3998 ire = ire_ctable_lookup(addr, gateway, type, ipif, zoneid, 3999 tsl, flags, ipst); 4000 if (ire != NULL) 4001 return (ire); 4002 } 4003 if ((flags & MATCH_IRE_TYPE) == 0 || (type & IRE_FORWARDTABLE) != 0) { 4004 ire = ire_ftable_lookup(addr, mask, gateway, type, ipif, pire, 4005 zoneid, 0, tsl, flags, ipst); 4006 } 4007 return (ire); 4008 } 4009 4010 /* 4011 * Delete the IRE cache for the gateway and all IRE caches whose 4012 * ire_gateway_addr points to this gateway, and allow them to 4013 * be created on demand by ip_newroute.
4014 */ 4015 void 4016 ire_clookup_delete_cache_gw(ipaddr_t addr, zoneid_t zoneid, ip_stack_t *ipst) 4017 { 4018 irb_t *irb; 4019 ire_t *ire; 4020 4021 irb = &ipst->ips_ip_cache_table[IRE_ADDR_HASH(addr, 4022 ipst->ips_ip_cache_table_size)]; 4023 IRB_REFHOLD(irb); 4024 for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { 4025 if (ire->ire_marks & IRE_MARK_CONDEMNED) 4026 continue; 4027 4028 ASSERT(ire->ire_mask == IP_HOST_MASK); 4029 if (ire_match_args(ire, addr, ire->ire_mask, 0, IRE_CACHE, 4030 NULL, zoneid, 0, NULL, MATCH_IRE_TYPE, NULL)) { 4031 ire_delete(ire); 4032 } 4033 } 4034 IRB_REFRELE(irb); 4035 4036 ire_walk_v4(ire_delete_cache_gw, &addr, zoneid, ipst); 4037 } 4038 4039 /* 4040 * Looks up cache table for a route. 4041 * specific lookup can be indicated by 4042 * passing the MATCH_* flags and the 4043 * necessary parameters. 4044 */ 4045 ire_t * 4046 ire_ctable_lookup(ipaddr_t addr, ipaddr_t gateway, int type, const ipif_t *ipif, 4047 zoneid_t zoneid, const ts_label_t *tsl, int flags, ip_stack_t *ipst) 4048 { 4049 ire_ctable_args_t margs; 4050 4051 margs.ict_addr = &addr; 4052 margs.ict_gateway = &gateway; 4053 margs.ict_type = type; 4054 margs.ict_ipif = ipif; 4055 margs.ict_zoneid = zoneid; 4056 margs.ict_tsl = tsl; 4057 margs.ict_flags = flags; 4058 margs.ict_ipst = ipst; 4059 margs.ict_wq = NULL; 4060 4061 return (ip4_ctable_lookup_impl(&margs)); 4062 } 4063 4064 /* 4065 * Check whether the IRE_LOCAL and the IRE potentially used to transmit 4066 * (could be an IRE_CACHE, IRE_BROADCAST, or IRE_INTERFACE) are identical 4067 * or part of the same illgrp. (In the IPMP case, usually the two IREs 4068 * will both belong to the IPMP ill, but exceptions are possible -- e.g. 4069 * if IPMP test addresses are on their own subnet.) 4070 */ 4071 boolean_t 4072 ire_local_same_lan(ire_t *ire_local, ire_t *xmit_ire) 4073 { 4074 ill_t *recv_ill, *xmit_ill; 4075 4076 ASSERT(ire_local->ire_type & (IRE_LOCAL|IRE_LOOPBACK)); 4077 ASSERT(xmit_ire->ire_type & (IRE_CACHETABLE|IRE_INTERFACE)); 4078 4079 recv_ill = ire_to_ill(ire_local); 4080 xmit_ill = ire_to_ill(xmit_ire); 4081 4082 ASSERT(recv_ill != NULL); 4083 ASSERT(xmit_ill != NULL); 4084 4085 return (IS_ON_SAME_LAN(recv_ill, xmit_ill)); 4086 } 4087 4088 /* 4089 * Check if the IRE_LOCAL uses the same ill as another route would use. 4090 * If there is no alternate route, or the alternate is a REJECT or BLACKHOLE, 4091 * then we don't allow this IRE_LOCAL to be used. 4092 */ 4093 boolean_t 4094 ire_local_ok_across_zones(ire_t *ire_local, zoneid_t zoneid, void *addr, 4095 const ts_label_t *tsl, ip_stack_t *ipst) 4096 { 4097 ire_t *alt_ire; 4098 boolean_t rval; 4099 int flags; 4100 4101 flags = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE; 4102 4103 if (ire_local->ire_ipversion == IPV4_VERSION) { 4104 alt_ire = ire_ftable_lookup(*((ipaddr_t *)addr), 0, 0, 0, NULL, 4105 NULL, zoneid, 0, tsl, flags, ipst); 4106 } else { 4107 alt_ire = ire_ftable_lookup_v6(addr, NULL, NULL, 0, NULL, 4108 NULL, zoneid, 0, tsl, flags, ipst); 4109 } 4110 4111 if (alt_ire == NULL) 4112 return (B_FALSE); 4113 4114 if (alt_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 4115 ire_refrele(alt_ire); 4116 return (B_FALSE); 4117 } 4118 rval = ire_local_same_lan(ire_local, alt_ire); 4119 4120 ire_refrele(alt_ire); 4121 return (rval); 4122 } 4123 4124 /* 4125 * Lookup cache 4126 * 4127 * In general the zoneid has to match (where ALL_ZONES match all of them). 
4128 * But for IRE_LOCAL we also need to handle the case where L2 should 4129 * conceptually loop back the packet. This is necessary since neither 4130 * Ethernet drivers nor Ethernet hardware loops back packets sent to their 4131 * own MAC address. This loopback is needed when the normal 4132 * routes (ignoring IREs with different zoneids) would send out the packet on 4133 * the same ill as the ill with which this IRE_LOCAL is associated. 4134 * 4135 * Earlier versions of this code always matched an IRE_LOCAL independently of 4136 * the zoneid. We preserve that earlier behavior when 4137 * ip_restrict_interzone_loopback is turned off. 4138 */ 4139 ire_t * 4140 ire_cache_lookup(ipaddr_t addr, zoneid_t zoneid, const ts_label_t *tsl, 4141 ip_stack_t *ipst) 4142 { 4143 irb_t *irb_ptr; 4144 ire_t *ire; 4145 4146 irb_ptr = &ipst->ips_ip_cache_table[IRE_ADDR_HASH(addr, 4147 ipst->ips_ip_cache_table_size)]; 4148 rw_enter(&irb_ptr->irb_lock, RW_READER); 4149 for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { 4150 if (ire->ire_marks & (IRE_MARK_CONDEMNED | 4151 IRE_MARK_TESTHIDDEN | IRE_MARK_PRIVATE_ADDR)) { 4152 continue; 4153 } 4154 if (ire->ire_addr == addr) { 4155 /* 4156 * Finally, check if the security policy has any 4157 * restriction on using this route for the specified 4158 * message. 4159 */ 4160 if (tsl != NULL && 4161 ire->ire_gw_secattr != NULL && 4162 tsol_ire_match_gwattr(ire, tsl) != 0) { 4163 continue; 4164 } 4165 4166 if (zoneid == ALL_ZONES || ire->ire_zoneid == zoneid || 4167 ire->ire_zoneid == ALL_ZONES) { 4168 IRE_REFHOLD(ire); 4169 rw_exit(&irb_ptr->irb_lock); 4170 return (ire); 4171 } 4172 4173 if (ire->ire_type == IRE_LOCAL) { 4174 if (ipst->ips_ip_restrict_interzone_loopback && 4175 !ire_local_ok_across_zones(ire, zoneid, 4176 &addr, tsl, ipst)) 4177 continue; 4178 4179 IRE_REFHOLD(ire); 4180 rw_exit(&irb_ptr->irb_lock); 4181 return (ire); 4182 } 4183 } 4184 } 4185 rw_exit(&irb_ptr->irb_lock); 4186 return (NULL); 4187 } 4188 4189 ire_t * 4190 ire_cache_lookup_simple(ipaddr_t dst, ip_stack_t *ipst) 4191 { 4192 irb_t *irb_ptr; 4193 ire_t *ire; 4194 4195 /* 4196 * Look for an ire in the cachetable whose 4197 * ire_addr matches the destination. 4198 * Since we are being called by forwarding fastpath 4199 * no need to check for Trusted Solaris label. 4200 */ 4201 irb_ptr = &ipst->ips_ip_cache_table[IRE_ADDR_HASH( 4202 dst, ipst->ips_ip_cache_table_size)]; 4203 rw_enter(&irb_ptr->irb_lock, RW_READER); 4204 for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { 4205 if (ire->ire_marks & (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN | 4206 IRE_MARK_PRIVATE_ADDR)) { 4207 continue; 4208 } 4209 if (ire->ire_addr == dst) { 4210 IRE_REFHOLD(ire); 4211 rw_exit(&irb_ptr->irb_lock); 4212 return (ire); 4213 } 4214 } 4215 rw_exit(&irb_ptr->irb_lock); 4216 return (NULL); 4217 } 4218 4219 /* 4220 * Locate the interface ire that is tied to the cache ire 'cire' via 4221 * cire->ire_ihandle. 4222 * 4223 * We are trying to create the cache ire for an offlink destn based 4224 * on the cache ire of the gateway in 'cire'. 'pire' is the prefix ire 4225 * as found by ip_newroute(). We are called from ip_newroute() in 4226 * the IRE_CACHE case. 
4227 */ 4228 ire_t * 4229 ire_ihandle_lookup_offlink(ire_t *cire, ire_t *pire) 4230 { 4231 ire_t *ire; 4232 int match_flags; 4233 ipaddr_t gw_addr; 4234 ipif_t *gw_ipif; 4235 ip_stack_t *ipst = cire->ire_ipst; 4236 4237 ASSERT(cire != NULL && pire != NULL); 4238 4239 /* 4240 * We don't need to specify the zoneid to ire_ftable_lookup() below 4241 * because the ihandle refers to an ipif which can be in only one zone. 4242 */ 4243 match_flags = MATCH_IRE_TYPE | MATCH_IRE_IHANDLE | MATCH_IRE_MASK; 4244 if (pire->ire_ipif != NULL) 4245 match_flags |= MATCH_IRE_ILL; 4246 /* 4247 * We know that the mask of the interface ire equals cire->ire_cmask. 4248 * (When ip_newroute() created 'cire' for the gateway it set its 4249 * cmask from the interface ire's mask) 4250 */ 4251 ire = ire_ftable_lookup(cire->ire_addr, cire->ire_cmask, 0, 4252 IRE_INTERFACE, pire->ire_ipif, NULL, ALL_ZONES, cire->ire_ihandle, 4253 NULL, match_flags, ipst); 4254 if (ire != NULL) 4255 return (ire); 4256 /* 4257 * If we didn't find an interface ire above, we can't declare failure. 4258 * For backwards compatibility, we need to support prefix routes 4259 * pointing to next hop gateways that are not on-link. 4260 * 4261 * Assume we are trying to ping some offlink destn, and we have the 4262 * routing table below. 4263 * 4264 * Eg. default - gw1 <--- pire (line 1) 4265 * gw1 - gw2 (line 2) 4266 * gw2 - hme0 (line 3) 4267 * 4268 * If we already have a cache ire for gw1 in 'cire', the 4269 * ire_ftable_lookup above would have failed, since there is no 4270 * interface ire to reach gw1. We will fallthru below. 4271 * 4272 * Here we duplicate the steps that ire_ftable_lookup() did in 4273 * getting 'cire' from 'pire', in the MATCH_IRE_RECURSIVE case. 4274 * The differences are the following: 4275 * i. We want the interface ire only, so we call ire_ftable_lookup() 4276 * instead of ire_route_lookup() 4277 * ii. We look for only prefix routes in the 1st call below. 4278 * iii. We want to match on the ihandle in the 2nd call below. 4279 */ 4280 match_flags = MATCH_IRE_TYPE; 4281 if (pire->ire_ipif != NULL) 4282 match_flags |= MATCH_IRE_ILL; 4283 ire = ire_ftable_lookup(pire->ire_gateway_addr, 0, 0, IRE_OFFSUBNET, 4284 pire->ire_ipif, NULL, ALL_ZONES, 0, NULL, match_flags, ipst); 4285 if (ire == NULL) 4286 return (NULL); 4287 /* 4288 * At this point 'ire' corresponds to the entry shown in line 2. 4289 * gw_addr is 'gw2' in the example above. 4290 */ 4291 gw_addr = ire->ire_gateway_addr; 4292 gw_ipif = ire->ire_ipif; 4293 ire_refrele(ire); 4294 4295 match_flags |= MATCH_IRE_IHANDLE; 4296 ire = ire_ftable_lookup(gw_addr, 0, 0, IRE_INTERFACE, 4297 gw_ipif, NULL, ALL_ZONES, cire->ire_ihandle, NULL, match_flags, 4298 ipst); 4299 return (ire); 4300 } 4301 4302 /* 4303 * Return the IRE_LOOPBACK, IRE_IF_RESOLVER or IRE_IF_NORESOLVER 4304 * ire associated with the specified ipif. 4305 * 4306 * This might occasionally be called when IPIF_UP is not set since 4307 * the IP_MULTICAST_IF as well as creating interface routes 4308 * allows specifying a down ipif (ipif_lookup* match ipifs that are down). 4309 * 4310 * Note that if IPIF_NOLOCAL, IPIF_NOXMIT, or IPIF_DEPRECATED is set on 4311 * the ipif, this routine might return NULL.
4312 */ 4313 ire_t * 4314 ipif_to_ire(const ipif_t *ipif) 4315 { 4316 ire_t *ire; 4317 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 4318 uint_t match_flags = MATCH_IRE_TYPE | MATCH_IRE_IPIF | MATCH_IRE_MASK; 4319 4320 /* 4321 * IRE_INTERFACE entries for ills under IPMP are IRE_MARK_TESTHIDDEN 4322 * so that they aren't accidentally returned. However, if the 4323 * caller's ipif is on an ill under IPMP, there's no need to hide 'em. 4324 */ 4325 if (IS_UNDER_IPMP(ipif->ipif_ill)) 4326 match_flags |= MATCH_IRE_MARK_TESTHIDDEN; 4327 4328 ASSERT(!ipif->ipif_isv6); 4329 if (ipif->ipif_ire_type == IRE_LOOPBACK) { 4330 ire = ire_ctable_lookup(ipif->ipif_lcl_addr, 0, IRE_LOOPBACK, 4331 ipif, ALL_ZONES, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF), 4332 ipst); 4333 } else if (ipif->ipif_flags & IPIF_POINTOPOINT) { 4334 /* In this case we need to lookup destination address. */ 4335 ire = ire_ftable_lookup(ipif->ipif_pp_dst_addr, IP_HOST_MASK, 0, 4336 IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, NULL, match_flags, 4337 ipst); 4338 } else { 4339 ire = ire_ftable_lookup(ipif->ipif_subnet, 4340 ipif->ipif_net_mask, 0, IRE_INTERFACE, ipif, NULL, 4341 ALL_ZONES, 0, NULL, match_flags, ipst); 4342 } 4343 return (ire); 4344 } 4345 4346 /* 4347 * ire_walk function. 4348 * Count the number of IRE_CACHE entries in different categories. 4349 */ 4350 void 4351 ire_cache_count(ire_t *ire, char *arg) 4352 { 4353 ire_cache_count_t *icc = (ire_cache_count_t *)arg; 4354 4355 if (ire->ire_type != IRE_CACHE) 4356 return; 4357 4358 icc->icc_total++; 4359 4360 if (ire->ire_ipversion == IPV6_VERSION) { 4361 mutex_enter(&ire->ire_lock); 4362 if (IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6)) { 4363 mutex_exit(&ire->ire_lock); 4364 icc->icc_onlink++; 4365 return; 4366 } 4367 mutex_exit(&ire->ire_lock); 4368 } else { 4369 if (ire->ire_gateway_addr == 0) { 4370 icc->icc_onlink++; 4371 return; 4372 } 4373 } 4374 4375 ASSERT(ire->ire_ipif != NULL); 4376 if (ire->ire_max_frag < ire->ire_ipif->ipif_mtu) 4377 icc->icc_pmtu++; 4378 else if (ire->ire_tire_mark != ire->ire_ob_pkt_count + 4379 ire->ire_ib_pkt_count) 4380 icc->icc_offlink++; 4381 else 4382 icc->icc_unused++; 4383 } 4384 4385 /* 4386 * ire_walk function called by ip_trash_ire_reclaim(). 4387 * Free a fraction of the IRE_CACHE cache entries. The fractions are 4388 * different for different categories of IRE_CACHE entries. 4389 * A fraction of zero means to not free any in that category. 4390 * Use the hash bucket id plus lbolt as a random number. Thus if the fraction 4391 * is N then every Nth hash bucket chain will be freed. 
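 *
 * The test used below, (rand/N)*N == rand, relies on truncating integer
 * division and is simply rand % N == 0. Illustrative sketch:
 *
 *	static int
 *	reclaim_this_bucket(unsigned int rand, unsigned int fraction)
 *	{
 *		/* fraction == 0 means "free none in this category" */
 *		return (fraction != 0 && (rand % fraction) == 0);
 *	}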
4392 */ 4393 void 4394 ire_cache_reclaim(ire_t *ire, char *arg) 4395 { 4396 ire_cache_reclaim_t *icr = (ire_cache_reclaim_t *)arg; 4397 uint_t rand; 4398 ip_stack_t *ipst = icr->icr_ipst; 4399 4400 if (ire->ire_type != IRE_CACHE) 4401 return; 4402 4403 if (ire->ire_ipversion == IPV6_VERSION) { 4404 rand = (uint_t)lbolt + 4405 IRE_ADDR_HASH_V6(ire->ire_addr_v6, 4406 ipst->ips_ip6_cache_table_size); 4407 mutex_enter(&ire->ire_lock); 4408 if (IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6)) { 4409 mutex_exit(&ire->ire_lock); 4410 if (icr->icr_onlink != 0 && 4411 (rand/icr->icr_onlink)*icr->icr_onlink == rand) { 4412 ire_delete(ire); 4413 return; 4414 } 4415 goto done; 4416 } 4417 mutex_exit(&ire->ire_lock); 4418 } else { 4419 rand = (uint_t)lbolt + 4420 IRE_ADDR_HASH(ire->ire_addr, ipst->ips_ip_cache_table_size); 4421 if (ire->ire_gateway_addr == 0) { 4422 if (icr->icr_onlink != 0 && 4423 (rand/icr->icr_onlink)*icr->icr_onlink == rand) { 4424 ire_delete(ire); 4425 return; 4426 } 4427 goto done; 4428 } 4429 } 4430 /* Not onlink IRE */ 4431 ASSERT(ire->ire_ipif != NULL); 4432 if (ire->ire_max_frag < ire->ire_ipif->ipif_mtu) { 4433 /* Use pmtu fraction */ 4434 if (icr->icr_pmtu != 0 && 4435 (rand/icr->icr_pmtu)*icr->icr_pmtu == rand) { 4436 ire_delete(ire); 4437 return; 4438 } 4439 } else if (ire->ire_tire_mark != ire->ire_ob_pkt_count + 4440 ire->ire_ib_pkt_count) { 4441 /* Use offlink fraction */ 4442 if (icr->icr_offlink != 0 && 4443 (rand/icr->icr_offlink)*icr->icr_offlink == rand) { 4444 ire_delete(ire); 4445 return; 4446 } 4447 } else { 4448 /* Use unused fraction */ 4449 if (icr->icr_unused != 0 && 4450 (rand/icr->icr_unused)*icr->icr_unused == rand) { 4451 ire_delete(ire); 4452 return; 4453 } 4454 } 4455 done: 4456 /* 4457 * Update tire_mark so that those that haven't been used since this 4458 * reclaim will be considered unused next time we reclaim. 4459 */ 4460 ire->ire_tire_mark = ire->ire_ob_pkt_count + ire->ire_ib_pkt_count; 4461 } 4462 4463 static void 4464 power2_roundup(uint32_t *value) 4465 { 4466 int i; 4467 4468 for (i = 1; i < 31; i++) { 4469 if (*value <= (1 << i)) 4470 break; 4471 } 4472 *value = (1 << i); 4473 } 4474 4475 /* Global init for all zones */ 4476 void 4477 ip_ire_g_init() 4478 { 4479 /* 4480 * Create ire caches, ire_reclaim() 4481 * will give IRE_CACHE back to the system when needed. 4482 * This needs to be done here before anything else, since 4483 * ire_add() expects the cache to be created. 4484 */ 4485 ire_cache = kmem_cache_create("ire_cache", 4486 sizeof (ire_t), 0, ip_ire_constructor, 4487 ip_ire_destructor, ip_trash_ire_reclaim, NULL, NULL, 0); 4488 4489 rt_entry_cache = kmem_cache_create("rt_entry", 4490 sizeof (struct rt_entry), 0, NULL, NULL, NULL, NULL, NULL, 0); 4491 4492 /* 4493 * Have radix code setup kmem caches etc. 4494 */ 4495 rn_init(); 4496 } 4497 4498 void 4499 ip_ire_init(ip_stack_t *ipst) 4500 { 4501 int i; 4502 uint32_t mem_cnt; 4503 uint32_t cpu_cnt; 4504 uint32_t min_cnt; 4505 pgcnt_t mem_avail; 4506 4507 /* 4508 * ip_ire_max_bucket_cnt is sized below based on the memory 4509 * size and the cpu speed of the machine. This is upper 4510 * bounded by the compile time value of ip_ire_max_bucket_cnt 4511 * and is lower bounded by the compile time value of 4512 * ip_ire_min_bucket_cnt. Similar logic applies to 4513 * ip6_ire_max_bucket_cnt. 4514 * 4515 * We calculate this for each IP instance in order to use 4516 * the kmem_avail and ip_ire_{min,max}_bucket_cnt that are 4517 * in effect when the zone is booted.
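 *
 * The effective limit computed below is, conceptually,
 * min(max_bucket_cnt, max(min_bucket_cnt, min(cpu_cnt, mem_cnt))).
 * An illustrative sketch of that clamp (names are stand-ins for the
 * real tunables):
 *
 *	#include <stdint.h>
 *
 *	static uint32_t
 *	bucket_cnt(uint32_t mem_cnt, uint32_t cpu_cnt,
 *	    uint32_t min_bound, uint32_t max_bound)
 *	{
 *		uint32_t cnt = (cpu_cnt < mem_cnt) ? cpu_cnt : mem_cnt;
 *
 *		if (cnt < min_bound)
 *			cnt = min_bound;
 *		if (cnt > max_bound)
 *			cnt = max_bound;
 *		return (cnt);
 *	}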
4518 */ 4519 mem_avail = kmem_avail(); 4520 mem_cnt = (mem_avail >> ip_ire_mem_ratio) / 4521 ip_cache_table_size / sizeof (ire_t); 4522 cpu_cnt = CPU->cpu_type_info.pi_clock >> ip_ire_cpu_ratio; 4523 4524 min_cnt = MIN(cpu_cnt, mem_cnt); 4525 if (min_cnt < ip_ire_min_bucket_cnt) 4526 min_cnt = ip_ire_min_bucket_cnt; 4527 if (ip_ire_max_bucket_cnt > min_cnt) { 4528 ip_ire_max_bucket_cnt = min_cnt; 4529 } 4530 4531 mem_cnt = (mem_avail >> ip_ire_mem_ratio) / 4532 ip6_cache_table_size / sizeof (ire_t); 4533 min_cnt = MIN(cpu_cnt, mem_cnt); 4534 if (min_cnt < ip6_ire_min_bucket_cnt) 4535 min_cnt = ip6_ire_min_bucket_cnt; 4536 if (ip6_ire_max_bucket_cnt > min_cnt) { 4537 ip6_ire_max_bucket_cnt = min_cnt; 4538 } 4539 4540 mutex_init(&ipst->ips_ire_ft_init_lock, NULL, MUTEX_DEFAULT, 0); 4541 mutex_init(&ipst->ips_ire_handle_lock, NULL, MUTEX_DEFAULT, NULL); 4542 4543 (void) rn_inithead((void **)&ipst->ips_ip_ftable, 32); 4544 4545 /* Calculate the IPv4 cache table size. */ 4546 ipst->ips_ip_cache_table_size = MAX(ip_cache_table_size, 4547 ((mem_avail >> ip_ire_mem_ratio) / sizeof (ire_t) / 4548 ip_ire_max_bucket_cnt)); 4549 if (ipst->ips_ip_cache_table_size > ip_max_cache_table_size) 4550 ipst->ips_ip_cache_table_size = ip_max_cache_table_size; 4551 /* 4552 * Make sure that the table size is always a power of 2. The 4553 * hash macro IRE_ADDR_HASH() depends on that. 4554 */ 4555 power2_roundup(&ipst->ips_ip_cache_table_size); 4556 4557 ipst->ips_ip_cache_table = kmem_zalloc(ipst->ips_ip_cache_table_size * 4558 sizeof (irb_t), KM_SLEEP); 4559 4560 for (i = 0; i < ipst->ips_ip_cache_table_size; i++) { 4561 rw_init(&ipst->ips_ip_cache_table[i].irb_lock, NULL, 4562 RW_DEFAULT, NULL); 4563 } 4564 4565 /* Calculate the IPv6 cache table size. */ 4566 ipst->ips_ip6_cache_table_size = MAX(ip6_cache_table_size, 4567 ((mem_avail >> ip_ire_mem_ratio) / sizeof (ire_t) / 4568 ip6_ire_max_bucket_cnt)); 4569 if (ipst->ips_ip6_cache_table_size > ip6_max_cache_table_size) 4570 ipst->ips_ip6_cache_table_size = ip6_max_cache_table_size; 4571 /* 4572 * Make sure that the table size is always a power of 2. The 4573 * hash macro IRE_ADDR_HASH_V6() depends on that. 4574 */ 4575 power2_roundup(&ipst->ips_ip6_cache_table_size); 4576 4577 ipst->ips_ip_cache_table_v6 = kmem_zalloc( 4578 ipst->ips_ip6_cache_table_size * sizeof (irb_t), KM_SLEEP); 4579 4580 for (i = 0; i < ipst->ips_ip6_cache_table_size; i++) { 4581 rw_init(&ipst->ips_ip_cache_table_v6[i].irb_lock, NULL, 4582 RW_DEFAULT, NULL); 4583 } 4584 4585 /* 4586 * Make sure that the forwarding table size is a power of 2. 4587 * The IRE*_ADDR_HASH() macros depend on that. 4588 */ 4589 ipst->ips_ip6_ftable_hash_size = ip6_ftable_hash_size; 4590 power2_roundup(&ipst->ips_ip6_ftable_hash_size); 4591 4592 ipst->ips_ire_handle = 1; 4593 } 4594 4595 void 4596 ip_ire_g_fini(void) 4597 { 4598 kmem_cache_destroy(ire_cache); 4599 kmem_cache_destroy(rt_entry_cache); 4600 4601 rn_fini(); 4602 } 4603 4604 void 4605 ip_ire_fini(ip_stack_t *ipst) 4606 { 4607 int i; 4608 4609 /* 4610 * Delete all IREs - assumes that the ill/ipifs have 4611 * been removed so what remains are just the ftable and IRE_CACHE.
4612 */ 4613 ire_walk(ire_delete, NULL, ipst); 4614 4615 rn_freehead(ipst->ips_ip_ftable); 4616 ipst->ips_ip_ftable = NULL; 4617 4618 mutex_destroy(&ipst->ips_ire_ft_init_lock); 4619 mutex_destroy(&ipst->ips_ire_handle_lock); 4620 4621 for (i = 0; i < ipst->ips_ip_cache_table_size; i++) { 4622 ASSERT(ipst->ips_ip_cache_table[i].irb_ire == NULL); 4623 rw_destroy(&ipst->ips_ip_cache_table[i].irb_lock); 4624 } 4625 kmem_free(ipst->ips_ip_cache_table, 4626 ipst->ips_ip_cache_table_size * sizeof (irb_t)); 4627 ipst->ips_ip_cache_table = NULL; 4628 4629 for (i = 0; i < ipst->ips_ip6_cache_table_size; i++) { 4630 ASSERT(ipst->ips_ip_cache_table_v6[i].irb_ire == NULL); 4631 rw_destroy(&ipst->ips_ip_cache_table_v6[i].irb_lock); 4632 } 4633 kmem_free(ipst->ips_ip_cache_table_v6, 4634 ipst->ips_ip6_cache_table_size * sizeof (irb_t)); 4635 ipst->ips_ip_cache_table_v6 = NULL; 4636 4637 for (i = 0; i < IP6_MASK_TABLE_SIZE; i++) { 4638 irb_t *ptr; 4639 int j; 4640 4641 if ((ptr = ipst->ips_ip_forwarding_table_v6[i]) == NULL) 4642 continue; 4643 4644 for (j = 0; j < ipst->ips_ip6_ftable_hash_size; j++) { 4645 ASSERT(ptr[j].irb_ire == NULL); 4646 rw_destroy(&ptr[j].irb_lock); 4647 } 4648 mi_free(ptr); 4649 ipst->ips_ip_forwarding_table_v6[i] = NULL; 4650 } 4651 } 4652 4653 /* 4654 * Check if another multirt route resolution is needed. 4655 * B_TRUE is returned if there remains a resolvable route, 4656 * or if no route for that dst is resolved yet. 4657 * B_FALSE is returned if all routes for that dst are resolved 4658 * or if the remaining unresolved routes are actually not 4659 * resolvable. 4660 * This only works in the global zone. 4661 */ 4662 boolean_t 4663 ire_multirt_need_resolve(ipaddr_t dst, const ts_label_t *tsl, ip_stack_t *ipst) 4664 { 4665 ire_t *first_fire; 4666 ire_t *first_cire; 4667 ire_t *fire; 4668 ire_t *cire; 4669 irb_t *firb; 4670 irb_t *cirb; 4671 int unres_cnt = 0; 4672 boolean_t resolvable = B_FALSE; 4673 4674 /* Retrieve the first IRE_HOST that matches the destination */ 4675 first_fire = ire_ftable_lookup(dst, IP_HOST_MASK, 0, IRE_HOST, NULL, 4676 NULL, ALL_ZONES, 0, tsl, 4677 MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_SECATTR, ipst); 4678 4679 /* No route at all */ 4680 if (first_fire == NULL) { 4681 return (B_TRUE); 4682 } 4683 4684 firb = first_fire->ire_bucket; 4685 ASSERT(firb != NULL); 4686 4687 /* Retrieve the first IRE_CACHE ire for that destination. */ 4688 first_cire = ire_cache_lookup(dst, GLOBAL_ZONEID, tsl, ipst); 4689 4690 /* No resolved route. */ 4691 if (first_cire == NULL) { 4692 ire_refrele(first_fire); 4693 return (B_TRUE); 4694 } 4695 4696 /* 4697 * At least one route is resolved. Here we look through the forward 4698 * and cache tables, to compare the number of declared routes 4699 * with the number of resolved routes. The search for a resolvable 4700 * route is performed only if at least one route remains 4701 * unresolved. 4702 */ 4703 cirb = first_cire->ire_bucket; 4704 ASSERT(cirb != NULL); 4705 4706 /* Count the number of routes to that dest that are declared.
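 *
 * The accounting in the two loops below amounts to: count the declared
 * multirt routes in the forwarding bucket, subtract the resolved
 * (non-condemned) multirt cache entries, and resolve again only if the
 * difference is positive. A sketch of the counting step (illustrative,
 * hypothetical list type):
 *
 *	#include <stdint.h>
 *
 *	struct rt { struct rt *next; uint32_t addr; int multirt; };
 *
 *	static int
 *	count_multirt(const struct rt *head, uint32_t dst)
 *	{
 *		const struct rt *r;
 *		int n = 0;
 *
 *		for (r = head; r != NULL; r = r->next) {
 *			if (r->multirt && r->addr == dst)
 *				n++;
 *		}
 *		return (n);
 *	}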
*/ 4707 IRB_REFHOLD(firb); 4708 for (fire = first_fire; fire != NULL; fire = fire->ire_next) { 4709 if (!(fire->ire_flags & RTF_MULTIRT)) 4710 continue; 4711 if (fire->ire_addr != dst) 4712 continue; 4713 unres_cnt++; 4714 } 4715 IRB_REFRELE(firb); 4716 4717 /* Then subtract the number of routes to that dst that are resolved */ 4718 IRB_REFHOLD(cirb); 4719 for (cire = first_cire; cire != NULL; cire = cire->ire_next) { 4720 if (!(cire->ire_flags & RTF_MULTIRT)) 4721 continue; 4722 if (cire->ire_addr != dst) 4723 continue; 4724 if (cire->ire_marks & (IRE_MARK_CONDEMNED|IRE_MARK_TESTHIDDEN)) 4725 continue; 4726 unres_cnt--; 4727 } 4728 IRB_REFRELE(cirb); 4729 4730 /* At least one route is unresolved; search for a resolvable route. */ 4731 if (unres_cnt > 0) 4732 resolvable = ire_multirt_lookup(&first_cire, &first_fire, 4733 MULTIRT_USESTAMP | MULTIRT_CACHEGW, tsl, ipst); 4734 4735 if (first_fire != NULL) 4736 ire_refrele(first_fire); 4737 4738 if (first_cire != NULL) 4739 ire_refrele(first_cire); 4740 4741 return (resolvable); 4742 } 4743 4744 /* 4745 * Explore a forward_table bucket, starting from fire_arg. 4746 * fire_arg MUST be an IRE_HOST entry. 4747 * 4748 * Return B_TRUE and update *ire_arg and *fire_arg 4749 * if at least one resolvable route is found. *ire_arg 4750 * is the IRE entry for *fire_arg's gateway. 4751 * 4752 * Return B_FALSE otherwise (all routes are resolved or 4753 * the remaining unresolved routes are all unresolvable). 4754 * 4755 * The IRE selection relies on a priority mechanism 4756 * driven by the flags passed in by the caller. 4757 * The caller, such as ip_newroute_ipif(), can get the most 4758 * relevant ire at each stage of a multiple route resolution. 4759 * 4760 * The rules are: 4761 * 4762 * - if MULTIRT_CACHEGW is specified in flags, IRE_CACHETABLE 4763 * ires are preferred for the gateway. This gives the highest 4764 * priority to routes that can be resolved without using 4765 * a resolver. 4766 * 4767 * - if MULTIRT_CACHEGW is not specified, or if MULTIRT_CACHEGW 4768 * is specified but no IRE_CACHETABLE ire entry for the gateway 4769 * is found, the following rules apply. 4770 * 4771 * - if MULTIRT_USESTAMP is specified in flags, IRE_INTERFACE 4772 * ires for the gateway that have not been tried for 4773 * a configurable amount of time are preferred. 4774 * This applies when a resolver must be invoked for 4775 * a missing route, but we don't want to use the resolver 4776 * upon each packet emission. If no such resolver is found, 4777 * B_FALSE is returned. 4778 * The MULTIRT_USESTAMP flag can be combined with 4779 * MULTIRT_CACHEGW. 4780 * 4781 * - if MULTIRT_USESTAMP is not specified in flags, the first 4782 * unresolved but resolvable route is selected. 4783 * 4784 * - Otherwise, there is no resolvable route, and 4785 * B_FALSE is returned. 4786 * 4787 * Lastly, MULTIRT_SETSTAMP can be specified in flags to 4788 * request the timestamp of unresolvable routes to 4789 * be refreshed. This prevents the useless exploration 4790 * of those routes for a while, when MULTIRT_USESTAMP is used. 4791 * 4792 * This only works in the global zone.
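 *
 * The MULTIRT_USESTAMP age test applied to each candidate below can be
 * sketched as (illustrative; times in milliseconds, the flag value is a
 * stand-in):
 *
 *	#include <stdint.h>
 *
 *	#define	X_USESTAMP	0x1	/* stand-in for MULTIRT_USESTAMP */
 *
 *	static int
 *	route_retry_ok(uint32_t flags, int64_t now_ms, int64_t last_ms,
 *	    int64_t interval_ms)
 *	{
 *		return ((now_ms - last_ms) > interval_ms ||
 *		    !(flags & X_USESTAMP));
 *	}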
4793 */ 4794 boolean_t 4795 ire_multirt_lookup(ire_t **ire_arg, ire_t **fire_arg, uint32_t flags, 4796 const ts_label_t *tsl, ip_stack_t *ipst) 4797 { 4798 clock_t delta; 4799 ire_t *best_fire = NULL; 4800 ire_t *best_cire = NULL; 4801 ire_t *first_fire; 4802 ire_t *first_cire; 4803 ire_t *fire; 4804 ire_t *cire; 4805 irb_t *firb = NULL; 4806 irb_t *cirb = NULL; 4807 ire_t *gw_ire; 4808 boolean_t already_resolved; 4809 boolean_t res; 4810 ipaddr_t dst; 4811 ipaddr_t gw; 4812 4813 ip2dbg(("ire_multirt_lookup: *ire_arg %p, *fire_arg %p, flags %04x\n", 4814 (void *)*ire_arg, (void *)*fire_arg, flags)); 4815 4816 ASSERT(ire_arg != NULL); 4817 ASSERT(fire_arg != NULL); 4818 4819 /* Not an IRE_HOST ire; give up. */ 4820 if ((*fire_arg == NULL) || ((*fire_arg)->ire_type != IRE_HOST)) { 4821 return (B_FALSE); 4822 } 4823 4824 /* This is the first IRE_HOST ire for that destination. */ 4825 first_fire = *fire_arg; 4826 firb = first_fire->ire_bucket; 4827 ASSERT(firb != NULL); 4828 4829 dst = first_fire->ire_addr; 4830 4831 ip2dbg(("ire_multirt_lookup: dst %08x\n", ntohl(dst))); 4832 4833 /* 4834 * Retrieve the first IRE_CACHE ire for that destination; 4835 * if we don't find one, no route for that dest is 4836 * resolved yet. 4837 */ 4838 first_cire = ire_cache_lookup(dst, GLOBAL_ZONEID, tsl, ipst); 4839 if (first_cire != NULL) { 4840 cirb = first_cire->ire_bucket; 4841 } 4842 4843 ip2dbg(("ire_multirt_lookup: first_cire %p\n", (void *)first_cire)); 4844 4845 /* 4846 * Search for a resolvable route, giving the top priority 4847 * to routes that can be resolved without any call to the resolver. 4848 */ 4849 IRB_REFHOLD(firb); 4850 4851 if (!CLASSD(dst)) { 4852 /* 4853 * For all multiroute IRE_HOST ires for that destination, 4854 * check if the route via the IRE_HOST's gateway is 4855 * resolved yet. 4856 */ 4857 for (fire = first_fire; fire != NULL; fire = fire->ire_next) { 4858 4859 if (!(fire->ire_flags & RTF_MULTIRT)) 4860 continue; 4861 if (fire->ire_addr != dst) 4862 continue; 4863 4864 if (fire->ire_gw_secattr != NULL && 4865 tsol_ire_match_gwattr(fire, tsl) != 0) { 4866 continue; 4867 } 4868 4869 gw = fire->ire_gateway_addr; 4870 4871 ip2dbg(("ire_multirt_lookup: fire %p, " 4872 "ire_addr %08x, ire_gateway_addr %08x\n", 4873 (void *)fire, ntohl(fire->ire_addr), ntohl(gw))); 4874 4875 already_resolved = B_FALSE; 4876 4877 if (first_cire != NULL) { 4878 ASSERT(cirb != NULL); 4879 4880 IRB_REFHOLD(cirb); 4881 /* 4882 * For all IRE_CACHE ires for that 4883 * destination. 4884 */ 4885 for (cire = first_cire; 4886 cire != NULL; 4887 cire = cire->ire_next) { 4888 4889 if (!(cire->ire_flags & RTF_MULTIRT)) 4890 continue; 4891 if (cire->ire_addr != dst) 4892 continue; 4893 if (cire->ire_marks & 4894 (IRE_MARK_CONDEMNED | 4895 IRE_MARK_TESTHIDDEN)) 4896 continue; 4897 4898 if (cire->ire_gw_secattr != NULL && 4899 tsol_ire_match_gwattr(cire, 4900 tsl) != 0) { 4901 continue; 4902 } 4903 4904 /* 4905 * Check if the IRE_CACHE's gateway 4906 * matches the IRE_HOST's gateway. 4907 */ 4908 if (cire->ire_gateway_addr == gw) { 4909 already_resolved = B_TRUE; 4910 break; 4911 } 4912 } 4913 IRB_REFRELE(cirb); 4914 } 4915 4916 /* 4917 * This route is already resolved; 4918 * proceed with next one. 4919 */ 4920 if (already_resolved) { 4921 ip2dbg(("ire_multirt_lookup: found cire %p, " 4922 "already resolved\n", (void *)cire)); 4923 continue; 4924 } 4925 4926 /* 4927 * The route is unresolved; is it actually 4928 * resolvable, i.e. is there a cache or a resolver 4929 * for the gateway? 
4930 */ 4931 gw_ire = ire_route_lookup(gw, 0, 0, 0, NULL, NULL, 4932 ALL_ZONES, tsl, 4933 MATCH_IRE_RECURSIVE | MATCH_IRE_SECATTR, ipst); 4934 4935 ip2dbg(("ire_multirt_lookup: looked up gw_ire %p\n", 4936 (void *)gw_ire)); 4937 4938 /* 4939 * If gw_ire is typed IRE_CACHETABLE, 4940 * this route can be resolved without any call to the 4941 * resolver. If the MULTIRT_CACHEGW flag is set, 4942 * give the top priority to this ire and exit the 4943 * loop. 4944 * This is typically the case when an ARP reply 4945 * is processed through ip_wput_nondata(). 4946 */ 4947 if ((flags & MULTIRT_CACHEGW) && 4948 (gw_ire != NULL) && 4949 (gw_ire->ire_type & IRE_CACHETABLE)) { 4950 ASSERT(gw_ire->ire_nce == NULL || 4951 gw_ire->ire_nce->nce_state == ND_REACHABLE); 4952 /* 4953 * Release the resolver associated to the 4954 * previous candidate best ire, if any. 4955 */ 4956 if (best_cire != NULL) { 4957 ire_refrele(best_cire); 4958 ASSERT(best_fire != NULL); 4959 } 4960 4961 best_fire = fire; 4962 best_cire = gw_ire; 4963 4964 ip2dbg(("ire_multirt_lookup: found top prio " 4965 "best_fire %p, best_cire %p\n", 4966 (void *)best_fire, (void *)best_cire)); 4967 break; 4968 } 4969 4970 /* 4971 * Compute the time elapsed since our preceding 4972 * attempt to resolve that route. 4973 * If the MULTIRT_USESTAMP flag is set, we take that 4974 * route into account only if this time interval 4975 * exceeds ip_multirt_resolution_interval; 4976 * this prevents us from attempting to resolve a 4977 * broken route upon each sending of a packet. 4978 */ 4979 delta = lbolt - fire->ire_last_used_time; 4980 delta = TICK_TO_MSEC(delta); 4981 4982 res = (boolean_t)((delta > 4983 ipst->ips_ip_multirt_resolution_interval) || 4984 (!(flags & MULTIRT_USESTAMP))); 4985 4986 ip2dbg(("ire_multirt_lookup: fire %p, delta %lu, " 4987 "res %d\n", 4988 (void *)fire, delta, res)); 4989 4990 if (res) { 4991 /* 4992 * We are here if MULTIRT_USESTAMP flag is set 4993 * and the resolver for fire's gateway 4994 * has not been tried since 4995 * ip_multirt_resolution_interval, or if 4996 * MULTIRT_USESTAMP is not set but gw_ire did 4997 * not fill the conditions for MULTIRT_CACHEGW, 4998 * or if neither MULTIRT_USESTAMP nor 4999 * MULTIRT_CACHEGW are set. 5000 */ 5001 if (gw_ire != NULL) { 5002 if (best_fire == NULL) { 5003 ASSERT(best_cire == NULL); 5004 5005 best_fire = fire; 5006 best_cire = gw_ire; 5007 5008 ip2dbg(("ire_multirt_lookup:" 5009 "found candidate " 5010 "best_fire %p, " 5011 "best_cire %p\n", 5012 (void *)best_fire, 5013 (void *)best_cire)); 5014 5015 /* 5016 * If MULTIRT_CACHEGW is not 5017 * set, we ignore the top 5018 * priority ires that can 5019 * be resolved without any 5020 * call to the resolver; 5021 * In that case, there is 5022 * actually no need 5023 * to continue the loop. 5024 */ 5025 if (!(flags & 5026 MULTIRT_CACHEGW)) { 5027 break; 5028 } 5029 continue; 5030 } 5031 } else { 5032 /* 5033 * No resolver for the gateway: the 5034 * route is not resolvable. 5035 * If the MULTIRT_SETSTAMP flag is 5036 * set, we stamp the IRE_HOST ire, 5037 * so we will not select it again 5038 * during this resolution interval. 
5039 */ 5040 if (flags & MULTIRT_SETSTAMP) 5041 fire->ire_last_used_time = 5042 lbolt; 5043 } 5044 } 5045 5046 if (gw_ire != NULL) 5047 ire_refrele(gw_ire); 5048 } 5049 } else { /* CLASSD(dst) */ 5050 5051 for (fire = first_fire; 5052 fire != NULL; 5053 fire = fire->ire_next) { 5054 5055 if (!(fire->ire_flags & RTF_MULTIRT)) 5056 continue; 5057 if (fire->ire_addr != dst) 5058 continue; 5059 5060 if (fire->ire_gw_secattr != NULL && 5061 tsol_ire_match_gwattr(fire, tsl) != 0) { 5062 continue; 5063 } 5064 5065 already_resolved = B_FALSE; 5066 5067 gw = fire->ire_gateway_addr; 5068 5069 gw_ire = ire_ftable_lookup(gw, 0, 0, IRE_INTERFACE, 5070 NULL, NULL, ALL_ZONES, 0, tsl, 5071 MATCH_IRE_RECURSIVE | MATCH_IRE_TYPE | 5072 MATCH_IRE_SECATTR, ipst); 5073 5074 /* No resolver for the gateway; we skip this ire. */ 5075 if (gw_ire == NULL) { 5076 continue; 5077 } 5078 ASSERT(gw_ire->ire_nce == NULL || 5079 gw_ire->ire_nce->nce_state == ND_REACHABLE); 5080 5081 if (first_cire != NULL) { 5082 5083 IRB_REFHOLD(cirb); 5084 /* 5085 * For all IRE_CACHE ires for that 5086 * destination. 5087 */ 5088 for (cire = first_cire; 5089 cire != NULL; 5090 cire = cire->ire_next) { 5091 5092 if (!(cire->ire_flags & RTF_MULTIRT)) 5093 continue; 5094 if (cire->ire_addr != dst) 5095 continue; 5096 if (cire->ire_marks & 5097 (IRE_MARK_CONDEMNED | 5098 IRE_MARK_TESTHIDDEN)) 5099 continue; 5100 5101 if (cire->ire_gw_secattr != NULL && 5102 tsol_ire_match_gwattr(cire, 5103 tsl) != 0) { 5104 continue; 5105 } 5106 5107 /* 5108 * Cache entries are linked to the 5109 * parent routes using the parent handle 5110 * (ire_phandle). If no cache entry has 5111 * the same handle as fire, fire is 5112 * still unresolved. 5113 */ 5114 ASSERT(cire->ire_phandle != 0); 5115 if (cire->ire_phandle == 5116 fire->ire_phandle) { 5117 already_resolved = B_TRUE; 5118 break; 5119 } 5120 } 5121 IRB_REFRELE(cirb); 5122 } 5123 5124 /* 5125 * This route is already resolved; proceed with 5126 * next one. 5127 */ 5128 if (already_resolved) { 5129 ire_refrele(gw_ire); 5130 continue; 5131 } 5132 5133 /* 5134 * Compute the time elapsed since our preceding 5135 * attempt to resolve that route. 5136 * If the MULTIRT_USESTAMP flag is set, we take 5137 * that route into account only if this time 5138 * interval exceeds ip_multirt_resolution_interval; 5139 * this prevents us from attempting to resolve a 5140 * broken route upon each sending of a packet. 5141 */ 5142 delta = lbolt - fire->ire_last_used_time; 5143 delta = TICK_TO_MSEC(delta); 5144 5145 res = (boolean_t)((delta > 5146 ipst->ips_ip_multirt_resolution_interval) || 5147 (!(flags & MULTIRT_USESTAMP))); 5148 5149 ip3dbg(("ire_multirt_lookup: fire %p, delta %lx, " 5150 "flags %04x, res %d\n", 5151 (void *)fire, delta, flags, res)); 5152 5153 if (res) { 5154 if (best_cire != NULL) { 5155 /* 5156 * Release the resolver associated 5157 * to the preceding candidate best 5158 * ire, if any. 5159 */ 5160 ire_refrele(best_cire); 5161 ASSERT(best_fire != NULL); 5162 } 5163 best_fire = fire; 5164 best_cire = gw_ire; 5165 continue; 5166 } 5167 5168 ire_refrele(gw_ire); 5169 } 5170 } 5171 5172 if (best_fire != NULL) { 5173 IRE_REFHOLD(best_fire); 5174 } 5175 IRB_REFRELE(firb); 5176 5177 /* Release the first IRE_CACHE we initially looked up, if any. */ 5178 if (first_cire != NULL) 5179 ire_refrele(first_cire); 5180 5181 /* Found a resolvable route. 
*/ 5182 if (best_fire != NULL) { 5183 ASSERT(best_cire != NULL); 5184 5185 if (*fire_arg != NULL) 5186 ire_refrele(*fire_arg); 5187 if (*ire_arg != NULL) 5188 ire_refrele(*ire_arg); 5189 5190 /* 5191 * Update the passed-in arguments with the 5192 * resolvable multirt route we found. 5193 */ 5194 *fire_arg = best_fire; 5195 *ire_arg = best_cire; 5196 5197 ip2dbg(("ire_multirt_lookup: returning B_TRUE, " 5198 "*fire_arg %p, *ire_arg %p\n", 5199 (void *)best_fire, (void *)best_cire)); 5200 5201 return (B_TRUE); 5202 } 5203 5204 ASSERT(best_cire == NULL); 5205 5206 ip2dbg(("ire_multirt_lookup: returning B_FALSE, *fire_arg %p, " 5207 "*ire_arg %p\n", 5208 (void *)*fire_arg, (void *)*ire_arg)); 5209 5210 /* No resolvable route. */ 5211 return (B_FALSE); 5212 } 5213 5214 /* 5215 * IRE iterator for inbound and loopback broadcast processing. 5216 * Given an IRE_BROADCAST ire, walk the ires with the same destination 5217 * address, but skip over the passed-in ire. Returns the next ire without 5218 * a hold - assumes that the caller holds a reference on the IRE bucket. 5219 */ 5220 ire_t * 5221 ire_get_next_bcast_ire(ire_t *curr, ire_t *ire) 5222 { 5223 ill_t *ill; 5224 5225 if (curr == NULL) { 5226 for (curr = ire->ire_bucket->irb_ire; curr != NULL; 5227 curr = curr->ire_next) { 5228 if (curr->ire_addr == ire->ire_addr) 5229 break; 5230 } 5231 } else { 5232 curr = curr->ire_next; 5233 } 5234 ill = ire_to_ill(ire); 5235 for (; curr != NULL; curr = curr->ire_next) { 5236 if (curr->ire_addr != ire->ire_addr) { 5237 /* 5238 * All the IREs to a given destination are contiguous; 5239 * break out once the address doesn't match. 5240 */ 5241 break; 5242 } 5243 if (curr == ire) { 5244 /* skip over the passed-in ire */ 5245 continue; 5246 } 5247 if ((curr->ire_stq != NULL && ire->ire_stq == NULL) || 5248 (curr->ire_stq == NULL && ire->ire_stq != NULL)) { 5249 /* 5250 * If the passed-in ire is loopback, skip over 5251 * non-loopback ires and vice versa. 5252 */ 5253 continue; 5254 } 5255 if (ire_to_ill(curr) != ill) { 5256 /* skip over IREs going through a different interface */ 5257 continue; 5258 } 5259 if (curr->ire_marks & IRE_MARK_CONDEMNED) { 5260 /* skip over deleted IREs */ 5261 continue; 5262 } 5263 return (curr); 5264 } 5265 return (NULL); 5266 } 5267 5268 #ifdef DEBUG 5269 void 5270 ire_trace_ref(ire_t *ire) 5271 { 5272 mutex_enter(&ire->ire_lock); 5273 if (ire->ire_trace_disable) { 5274 mutex_exit(&ire->ire_lock); 5275 return; 5276 } 5277 5278 if (th_trace_ref(ire, ire->ire_ipst)) { 5279 mutex_exit(&ire->ire_lock); 5280 } else { 5281 ire->ire_trace_disable = B_TRUE; 5282 mutex_exit(&ire->ire_lock); 5283 ire_trace_cleanup(ire); 5284 } 5285 } 5286 5287 void 5288 ire_untrace_ref(ire_t *ire) 5289 { 5290 mutex_enter(&ire->ire_lock); 5291 if (!ire->ire_trace_disable) 5292 th_trace_unref(ire); 5293 mutex_exit(&ire->ire_lock); 5294 } 5295 5296 static void 5297 ire_trace_cleanup(const ire_t *ire) 5298 { 5299 th_trace_cleanup(ire, ire->ire_trace_disable); 5300 } 5301 #endif /* DEBUG */ 5302 5303 /* 5304 * Generate a message chain with an arp request to resolve the in_ire. 5305 * It is assumed that in_ire itself is currently in the ire cache table, 5306 * so we create a fake_ire filled with enough information about ire_addr etc. 5307 * to retrieve in_ire when the DL_UNITDATA response from the resolver 5308 * comes back. The fake_ire itself is created by calling esballoc with 5309 * the fr_rtnp (free routine) set to ire_freemblk. This routine will be 5310 * invoked when the mblk containing fake_ire is freed. 
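 *
 * The esballoc() arrangement can be modeled in plain C as a buffer
 * paired with a destructor callback that fires exactly once, when the
 * buffer is released (sketch only, hypothetical type; the real frtn_t
 * is the STREAMS free-routine structure):
 *
 *	struct relcb {
 *		void	(*free_func)(void *);	/* e.g. ire_freemblk */
 *		void	*free_arg;		/* the ire buffer */
 *	};
 *
 *	static void
 *	buf_release(struct relcb *cb)		/* models freeb(ire_mp) */
 *	{
 *		cb->free_func(cb->free_arg);
 *	}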
5311 */ 5312 void 5313 ire_arpresolve(ire_t *in_ire) 5314 { 5315 areq_t *areq; 5316 ipaddr_t *addrp; 5317 mblk_t *ire_mp, *areq_mp; 5318 ire_t *ire, *buf; 5319 size_t bufsize; 5320 frtn_t *frtnp; 5321 ill_t *dst_ill; 5322 ip_stack_t *ipst; 5323 5324 ASSERT(in_ire->ire_nce != NULL); 5325 5326 dst_ill = ire_to_ill(in_ire); 5327 ipst = dst_ill->ill_ipst; 5328 5329 /* 5330 * Construct message chain for the resolver 5331 * of the form: 5332 * ARP_REQ_MBLK-->IRE_MBLK 5333 * 5334 * NOTE : If the response does not 5335 * come back, ARP frees the packet. For this reason, 5336 * we can't REFHOLD the bucket of save_ire to prevent 5337 * deletions. We may not be able to REFRELE the bucket 5338 * if the response never comes back. Thus, before 5339 * adding the ire, ire_add_v4 will make sure that the 5340 * interface route does not get deleted. This is the 5341 * only case unlike ip_newroute_v6, ip_newroute_ipif_v6 5342 * where we can always prevent deletions because of 5343 * the synchronous nature of adding IRES i.e 5344 * ire_add_then_send is called after creating the IRE. 5345 */ 5346 5347 /* 5348 * We use esballoc to allocate the second part (IRE_MBLK) 5349 * of the message chain depicted above. This mblk will be freed 5350 * by arp when there is a timeout, and otherwise passed to IP 5351 * and IP will free it after processing the ARP response. 5352 */ 5353 5354 bufsize = sizeof (ire_t) + sizeof (frtn_t); 5355 buf = kmem_alloc(bufsize, KM_NOSLEEP); 5356 if (buf == NULL) { 5357 ip1dbg(("ire_arpresolve: alloc buffer failed\n")); 5358 return; 5359 } 5360 frtnp = (frtn_t *)(buf + 1); 5361 frtnp->free_arg = (caddr_t)buf; 5362 frtnp->free_func = ire_freemblk; 5363 5364 ire_mp = esballoc((unsigned char *)buf, bufsize, BPRI_MED, frtnp); 5365 if (ire_mp == NULL) { 5366 ip1dbg(("ire_arpresolve: esballoc failed\n")); 5367 kmem_free(buf, bufsize); 5368 return; 5369 } 5370 5371 areq_mp = copyb(dst_ill->ill_resolver_mp); 5372 if (areq_mp == NULL) { 5373 freemsg(ire_mp); 5374 return; 5375 } 5376 5377 ire_mp->b_datap->db_type = IRE_ARPRESOLVE_TYPE; 5378 ire = (ire_t *)buf; 5379 /* 5380 * keep enough info in the fake ire so that we can pull up 5381 * the incomplete ire (in_ire) after result comes back from 5382 * arp and make it complete. 5383 */ 5384 *ire = ire_null; 5385 ire->ire_u = in_ire->ire_u; 5386 ire->ire_ipif_seqid = in_ire->ire_ipif_seqid; 5387 ire->ire_ipif_ifindex = in_ire->ire_ipif_ifindex; 5388 ire->ire_ipif = in_ire->ire_ipif; 5389 ire->ire_stq = dst_ill->ill_wq; 5390 ire->ire_stq_ifindex = dst_ill->ill_phyint->phyint_ifindex; 5391 ire->ire_zoneid = in_ire->ire_zoneid; 5392 ire->ire_stackid = ipst->ips_netstack->netstack_stackid; 5393 ire->ire_ipst = ipst; 5394 5395 /* 5396 * ire_freemblk will be called when ire_mp is freed, both for 5397 * successful and failed arp resolution. IRE_MARK_UNCACHED will be set 5398 * when the arp resolution failed. 5399 */ 5400 ire->ire_marks |= IRE_MARK_UNCACHED; 5401 ire->ire_mp = ire_mp; 5402 ire_mp->b_wptr = (uchar_t *)&ire[1]; 5403 ire_mp->b_cont = NULL; 5404 linkb(areq_mp, ire_mp); 5405 5406 /* 5407 * Fill in the source and dest addrs for the resolver. 5408 * NOTE: this depends on memory layouts imposed by 5409 * ill_init(). 
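 *
 * The stores below poke an ipaddr_t at a byte offset that the areq_t
 * header itself carries. The idiom, as a portable illustrative sketch
 * (memcpy sidesteps the alignment assumption the real code is allowed
 * to make):
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static void
 *	poke_addr(void *base, uint32_t off, uint32_t addr)
 *	{
 *		(void) memcpy((char *)base + off, &addr, sizeof (addr));
 *	}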
/*
 * Esballoc free function for AR_ENTRY_QUERY requests, to clean up any
 * unresolved ire_t and/or nce_t structures when ARP resolution fails.
 *
 * This function can be called by ARP via the free routine for ire_mp, or
 * by IPv4 (both the host and forwarding paths) via ire_delete, in case
 * ARP resolution fails.
 * NOTE: Since IP is MT, ARP can call into IP but not vice versa
 * (for IP to talk to ARP, it still has to send AR* messages).
 *
 * Note that the ARP/IP merge should replace this functionality by
 * providing direct function calls to clean up unresolved entries in
 * ire/nce lists.
 */
void
ire_freemblk(ire_t *ire_mp)
{
	nce_t		*nce = NULL;
	ill_t		*ill;
	ip_stack_t	*ipst;
	netstack_t	*ns = NULL;

	ASSERT(ire_mp != NULL);

	if (ire_mp->ire_addr == INADDR_ANY &&
	    ire_mp->ire_gateway_addr == INADDR_ANY) {
		ip1dbg(("ire_freemblk(0x%p) ire_addr is unspecified\n",
		    (void *)ire_mp));
		goto cleanup;
	}
	if ((ire_mp->ire_marks & IRE_MARK_UNCACHED) == 0) {
		goto cleanup;	/* everything succeeded; just free and return */
	}

	/*
	 * The ARP information corresponding to this ire_mp was not
	 * transferred to an ire_cache entry. We need to clean up any
	 * incomplete ires and nces.
	 */
	ASSERT(ire_mp->ire_stq != NULL);
	ASSERT(ire_mp->ire_stq_ifindex != 0);
	ASSERT(ire_mp->ire_ipst != NULL);

	ns = netstack_find_by_stackid(ire_mp->ire_stackid);
	ipst = (ns ? ns->netstack_ip : NULL);
	if (ipst == NULL || ipst != ire_mp->ire_ipst) /* Disappeared on us */
		goto cleanup;

	/*
	 * Get any nce's corresponding to this ire_mp. We first have to
	 * make sure that the ill is still around.
	 */
	ill = ill_lookup_on_ifindex(ire_mp->ire_stq_ifindex,
	    B_FALSE, NULL, NULL, NULL, NULL, ipst);
	if (ill == NULL || (ire_mp->ire_stq != ill->ill_wq) ||
	    (ill->ill_state_flags & ILL_CONDEMNED)) {
		/*
		 * The ill went away, so there is no nce to clean up.
		 * Note that ill_state_flags could become ILL_CONDEMNED
		 * after this point, but if we know that it is CONDEMNED
		 * now, we just bail out quickly.
		 */
		if (ill != NULL)
			ill_refrele(ill);
		goto cleanup;
	}
	nce = ndp_lookup_v4(ill,
	    ((ire_mp->ire_gateway_addr != INADDR_ANY) ?
	    &ire_mp->ire_gateway_addr : &ire_mp->ire_addr),
	    B_FALSE);
	ill_refrele(ill);

	if ((nce != NULL) && (nce->nce_state != ND_REACHABLE)) {
		/*
		 * An incomplete nce was found.
		 */
		DTRACE_PROBE2(ire__freemblk__arp__resolv__fail,
		    nce_t *, nce, ire_t *, ire_mp);
		/*
		 * Send the icmp_unreachable messages for the queued mblks in
		 * ire->ire_nce->nce_qd_mp, since ARP resolution failed
		 * for this ire.
		 */
		arp_resolv_failed(nce);
		/*
		 * Delete the nce and clean up all ires pointing at this
		 * nce in the cache table.
		 */
		ndp_delete(nce);
	}
	if (nce != NULL)
		NCE_REFRELE(nce); /* release the ref taken by ndp_lookup_v4 */

cleanup:
	if (ns != NULL)
		netstack_rele(ns);
	/*
	 * Get rid of the ire buffer. We call kmem_free here (instead of
	 * ire_delete()), since this is freeb's callback.
	 */
	kmem_free(ire_mp, sizeof (ire_t) + sizeof (frtn_t));
}
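/*
 * Because the free routine above can run long after the originating IP
 * instance is gone, ire_freemblk() revalidates the netstack before
 * touching any stack-private state: look the stack up by id, compare the
 * result against the cached ip_stack_t, and keep the netstack hold until
 * cleanup is complete. A compiled-out sketch of that revalidation step
 * follows; the function name is hypothetical.
 */
#ifdef notdef
static netstack_t *
netstack_revalidate_sketch(netstackid_t stackid, ip_stack_t *cached_ipst)
{
	netstack_t	*ns;

	ns = netstack_find_by_stackid(stackid);	/* takes a hold on success */
	if (ns == NULL)
		return (NULL);			/* stack is gone */
	if (ns->netstack_ip != cached_ipst) {
		netstack_rele(ns);		/* replaced behind our back */
		return (NULL);
	}
	/* caller uses cached_ipst, then drops the hold via netstack_rele() */
	return (ns);
}
#endif /* notdef */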
/*
 * Find, or create if needed, a neighbor cache entry nce_t for IRE_CACHE and
 * non-loopback IRE_BROADCAST ires.
 *
 * If a neighbor-cache entry has to be created (i.e., one does not already
 * exist in the nce list) the nce_res_mp and nce_state of the neighbor cache
 * entry are initialized in ndp_add_v4(). These values are picked from
 * the src_nce, if one is passed in. Otherwise (if src_nce == NULL) the
 * ire->ire_type and the outgoing interface (ire_to_ill(ire)) values
 * determine the {nce_state, nce_res_mp} of the nce_t created. All
 * IRE_BROADCAST entries have nce_state = ND_REACHABLE, and the nce_res_mp
 * is set to the ill_bcast_mp of the outgoing interface. For unicast ire
 * entries,
 * - if the outgoing interface is of type IRE_IF_RESOLVER, a newly created
 *   nce_t will have a null nce_res_mp, and will be in the ND_INITIAL state.
 * - if the outgoing interface is an IRE_IF_NORESOLVER interface, no link
 *   layer resolution is necessary, so the nce_t will be in the
 *   ND_REACHABLE state and the nce_res_mp will have a copy of the
 *   ill_resolver_mp of the outgoing interface.
 *
 * The link layer information needed for broadcast addresses, and for
 * packets sent on IRE_IF_NORESOLVER interfaces, is a constant mapping that
 * never needs re-verification for the lifetime of the nce_t. These are
 * therefore marked NCE_F_PERMANENT, and never allowed to expire via
 * NCE_EXPIRED.
 *
 * IRE_CACHE ires contain the information for the nexthop (ire_gateway_addr)
 * in the case of indirect routes, and for the dst itself (ire_addr) in the
 * case of direct routes, with the nce_res_mp containing a template
 * DL_UNITDATA request.
 *
 * The actual association of the ire_nce to the nce created here is
 * typically done in ire_add_v4 for IRE_CACHE entries. Exceptions
 * to this rule are SO_DONTROUTE ires (IRE_MARK_NO_ADD), for which
 * the ire_nce assignment is done in ire_add_then_send.
 */
int
ire_nce_init(ire_t *ire, nce_t *src_nce)
{
	in_addr_t	addr4;
	int		err;
	nce_t		*nce = NULL;
	ill_t		*ire_ill;
	uint16_t	nce_flags = 0;
	ip_stack_t	*ipst;

	if (ire->ire_stq == NULL)
		return (0); /* no need to create nce for local/loopback */

	switch (ire->ire_type) {
	case IRE_CACHE:
		if (ire->ire_gateway_addr != INADDR_ANY)
			addr4 = ire->ire_gateway_addr;	/* 'G' route */
		else
			addr4 = ire->ire_addr;		/* direct route */
		break;
	case IRE_BROADCAST:
		addr4 = ire->ire_addr;
		nce_flags |= (NCE_F_PERMANENT|NCE_F_BCAST);
		break;
	default:
		return (0);
	}

	/*
	 * ire_ipif is picked based on RTF_SETSRC, usesrc etc.
	 * rules in ire_forward_src_ipif. We want the dlureq_mp
	 * for the outgoing interface, which we get from the ire_stq.
	 */
	ire_ill = ire_to_ill(ire);
	ipst = ire_ill->ill_ipst;

	/*
	 * IRE_IF_NORESOLVER entries never need re-verification and
	 * do not expire, so we mark them as NCE_F_PERMANENT.
	 */
	if (ire_ill->ill_net_type == IRE_IF_NORESOLVER)
		nce_flags |= NCE_F_PERMANENT;

retry_nce:
	err = ndp_lookup_then_add_v4(ire_ill, &addr4, nce_flags,
	    &nce, src_nce);

	if (err == EEXIST && NCE_EXPIRED(nce, ipst)) {
		/*
		 * We looked up an expired nce.
		 * Go back and try to create one again.
		 */
		ndp_delete(nce);
		NCE_REFRELE(nce);
		nce = NULL;
		goto retry_nce;
	}

	ip1dbg(("ire 0x%p addr 0x%lx type 0x%x; found nce 0x%p err %d\n",
	    (void *)ire, (ulong_t)addr4, ire->ire_type, (void *)nce, err));

	switch (err) {
	case 0:
	case EEXIST:
		/*
		 * We have a pointer to a newly created or existing nce_t;
		 * note that the ire-nce mapping is many-to-one, i.e.,
		 * multiple ires could point to the same nce_t.
		 */
		break;
	default:
		DTRACE_PROBE2(nce__init__fail, ill_t *, ire_ill, int, err);
		return (EINVAL);
	}
	/*
	 * IRE_BROADCAST ires must be linked to NCE_F_BCAST nces and
	 * vice-versa (IRE_CACHE <-> unicast nce entries). We may have found
	 * an existing unicast (or bcast) nce when trying to add a BROADCAST
	 * (or unicast) ire, e.g., when address/netmask modifications were in
	 * progress, and the ipif_ndp_down() call to quiesce existing state
	 * during the addr/mask modification may have skipped the
	 * ndp_delete() because the ipif being affected was not the last one
	 * on the ill. We recover from the missed ndp_delete() now, by
	 * deleting the old nce and adding a new one with the correct
	 * NCE_F_BCAST state.
	 */
	if (ire->ire_type == IRE_BROADCAST) {
		if ((nce->nce_flags & NCE_F_BCAST) == 0) {
			/* IRE_BROADCAST needs NCE_F_BCAST */
			ndp_delete(nce);
			NCE_REFRELE(nce);
			goto retry_nce;
		}
		/*
		 * Two bcast ires are created for each interface:
		 * 1. the loopback copy (which does not have an
		 *    ire_stq, and therefore has no ire_nce), and
		 * 2. the non-loopback copy, which has the nce_res_mp
		 *    initialized to a copy of the ill_bcast_mp, and
		 *    is marked as ND_REACHABLE at this point.
		 * This nce does not undergo any further state changes,
		 * and exists as long as the interface is plumbed.
		 * Note: the assignment of ire_nce here is a historical
		 * artifact of old code that used to inline ire_add().
		 */
		ire->ire_nce = nce;
		/*
		 * We are associating this nce with the ire, so change
		 * the nce ref taken in ndp_lookup_then_add_v4() from
		 * NCE_REFHOLD to NCE_REFHOLD_NOTR.
		 */
		NCE_REFHOLD_TO_REFHOLD_NOTR(ire->ire_nce);
	} else {
		if ((nce->nce_flags & NCE_F_BCAST) != 0) {
			/* IRE_CACHE needs a unicast nce */
			ndp_delete(nce);
			NCE_REFRELE(nce);
			goto retry_nce;
		}
		/*
		 * We are not using this nce_t just yet, so release
		 * the ref taken in ndp_lookup_then_add_v4().
		 */
		NCE_REFRELE(nce);
	}
	return (0);
}
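/*
 * A compiled-out sketch of the caller-side contract for ire_nce_init():
 * it is invoked on a fully constructed but not-yet-added ire, and on
 * success the unicast ire_nce linkage is still deferred to ire_add_v4()
 * (or ire_add_then_send() for IRE_MARK_NO_ADD ires), as described in the
 * block comment above. The function name and error handling below are
 * hypothetical.
 */
#ifdef notdef
static int
ire_nce_init_caller_sketch(ire_t *cire, nce_t *src_nce)
{
	int	err;

	/* cire: candidate IRE_CACHE ire, not yet in the cache table */
	err = ire_nce_init(cire, src_nce);
	if (err != 0) {
		/* no usable nce; the candidate ire must be discarded */
		return (err);
	}
	/* proceed to ire_add_v4(), which links cire->ire_nce for unicast */
	return (0);
}
#endif /* notdef */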
/*
 * This is the implementation of the IPv4 IRE cache lookup procedure.
 * Separating the interface from the implementation allows additional
 * flexibility when specifying search criteria.
 */
static ire_t *
ip4_ctable_lookup_impl(ire_ctable_args_t *margs)
{
	irb_t		*irb_ptr;
	ire_t		*ire;
	ip_stack_t	*ipst = margs->ict_ipst;

	if ((margs->ict_flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) &&
	    (margs->ict_ipif == NULL)) {
		return (NULL);
	}

	irb_ptr = &ipst->ips_ip_cache_table[IRE_ADDR_HASH(
	    *((ipaddr_t *)margs->ict_addr), ipst->ips_ip_cache_table_size)];
	rw_enter(&irb_ptr->irb_lock, RW_READER);
	for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) {
		if (ire->ire_marks & IRE_MARK_CONDEMNED)
			continue;
		ASSERT(ire->ire_mask == IP_HOST_MASK);
		if (ire_match_args(ire, *((ipaddr_t *)margs->ict_addr),
		    ire->ire_mask, *((ipaddr_t *)margs->ict_gateway),
		    margs->ict_type, margs->ict_ipif, margs->ict_zoneid, 0,
		    margs->ict_tsl, margs->ict_flags, margs->ict_wq)) {
			IRE_REFHOLD(ire);
			rw_exit(&irb_ptr->irb_lock);
			return (ire);
		}
	}

	rw_exit(&irb_ptr->irb_lock);
	return (NULL);
}
/*
 * This function locates IRE_CACHE entries which were added by the
 * ire_forward() path. We can fully specify the IRE we are looking for by
 * providing the ipif (MATCH_IRE_IPIF) *and* the stq (MATCH_IRE_WQ).
 */
ire_t *
ire_arpresolve_lookup(ipaddr_t addr, ipaddr_t gw, ipif_t *ipif,
    zoneid_t zoneid, ip_stack_t *ipst, queue_t *wq)
{
	ire_ctable_args_t	margs;

	margs.ict_addr = &addr;
	margs.ict_gateway = &gw;
	margs.ict_type = IRE_CACHE;
	margs.ict_ipif = ipif;
	margs.ict_zoneid = zoneid;
	margs.ict_tsl = NULL;
	margs.ict_flags = MATCH_IRE_GW | MATCH_IRE_IPIF | MATCH_IRE_ZONEONLY |
	    MATCH_IRE_TYPE | MATCH_IRE_WQ;
	margs.ict_ipst = ipst;
	margs.ict_wq = wq;

	return (ip4_ctable_lookup_impl(&margs));
}
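/*
 * A compiled-out usage sketch for ire_arpresolve_lookup() above: the
 * lookup returns its ire with a reference (IRE_REFHOLD is taken inside
 * ip4_ctable_lookup_impl()), so the caller owns exactly one hold and
 * must ire_refrele() it when done. The function name below is
 * hypothetical.
 */
#ifdef notdef
static void
ire_arpresolve_lookup_sketch(ipaddr_t dst, ipaddr_t gw, ipif_t *ipif,
    zoneid_t zoneid, ip_stack_t *ipst, queue_t *wq)
{
	ire_t	*ire;

	ire = ire_arpresolve_lookup(dst, gw, ipif, zoneid, ipst, wq);
	if (ire == NULL)
		return;		/* no fully specified IRE_CACHE entry */
	/* ... use the entry ... */
	ire_refrele(ire);	/* drop the hold taken by the lookup */
}
#endif /* notdef */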