/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */
/* Copyright (c) 1990 Mentat Inc. */

/*
 * This file contains routines that manipulate Internet Routing Entries (IREs).
 */

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/stropts.h>
#include <sys/strsun.h>
#include <sys/ddi.h>
#include <sys/cmn_err.h>
#include <sys/policy.h>

#include <sys/systm.h>
#include <sys/kmem.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <net/if.h>
#include <net/route.h>
#include <netinet/in.h>
#include <net/if_dl.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>

#include <inet/common.h>
#include <inet/mi.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/ip_ndp.h>
#include <inet/arp.h>
#include <inet/ip_if.h>
#include <inet/ip_ire.h>
#include <inet/ip_ftable.h>
#include <inet/ip_rts.h>
#include <inet/nd.h>

#include <net/pfkeyv2.h>
#include <inet/ipsec_info.h>
#include <inet/sadb.h>
#include <inet/tcp.h>
#include <inet/ipclassifier.h>
#include <sys/zone.h>
#include <sys/cpuvar.h>

#include <sys/tsol/label.h>
#include <sys/tsol/tnet.h>

struct kmem_cache *rt_entry_cache;

/*
 * Synchronization notes:
 *
 * The fields of the ire_t struct are protected in the following way :
 *
 * ire_next/ire_ptpn
 *
 *	- bucket lock of the respective tables (cache or forwarding tables).
 *
 * ire_mp, ire_rfq, ire_stq, ire_u *except* ire_gateway_addr[v6], ire_mask,
 * ire_type, ire_create_time, ire_masklen, ire_ipversion, ire_flags, ire_ipif,
 * ire_ihandle, ire_phandle, ire_nce, ire_bucket, ire_in_ill, ire_in_src_addr
 *
 *	- Set in ire_create_v4/v6 and never changes after that. Thus,
 *	  we don't need a lock whenever these fields are accessed.
 *
 *	- ire_bucket and ire_masklen (also set in ire_create) are set in
 *	  ire_add_v4/ire_add_v6 before inserting in the bucket and never
 *	  change after that. Thus we don't need a lock whenever these
 *	  fields are accessed.
 *
 * ire_gateway_addr_v4[v6]
 *
 *	- ire_gateway_addr_v4[v6] is set during ire_create and later modified
 *	  by rts_setgwr[v6]. As ire_gateway_addr is a uint32_t, updates to
 *	  it are assumed to be atomic and hence the other parts of the code
 *	  do not use any locks. ire_gateway_addr_v6 updates are not atomic
 *	  and hence any access to it uses ire_lock to get/set the right value.
 *
 * ire_ident, ire_refcnt
 *
 *	- Updated atomically using atomic_add_32
 *
 * ire_ssthresh, ire_rtt_sd, ire_rtt, ire_ib_pkt_count, ire_ob_pkt_count
 *
 *	- Assumes that 32 bit writes are atomic. No locks. ire_lock is
 *	  used to serialize updates to ire_ssthresh, ire_rtt_sd, ire_rtt.
 *
 * ire_max_frag, ire_frag_flag
 *
 *	- ire_lock is used to set/read both of them together.
 *
 * ire_tire_mark
 *
 *	- Set in ire_create and updated in ire_expire, which is called
 *	  by only one function, namely ip_trash_timer_expire. Thus only
 *	  one function updates and examines the value.
 *
 * ire_marks
 *	- bucket lock protects this.
 *
 * ire_ipsec_overhead/ire_ll_hdr_length
 *
 *	- Place holder for returning the information to the upper layers
 *	  when IRE_DB_REQ comes down.
 *
 *
 * ipv6_ire_default_count is protected by the bucket lock of
 * ip_forwarding_table_v6[0][0].
 *
 * ipv6_ire_default_index is not protected as it is just a hint
 * at which default gateway to use. There is nothing
 * wrong in using the same gateway for two different connections.
 *
 * Since we always hold the bucket locks in all the places while accessing
 * the above values, it is natural to use them for protecting them.
 *
 * We have a separate cache table and forwarding table for IPv4 and IPv6.
 * Cache table (ip_cache_table/ip_cache_table_v6) is a pointer to an
 * array of irb_t structures. The IPv6 forwarding table
 * (ip_forwarding_table_v6) is an array of pointers to arrays of irb_t
 * structures. ip_forwarding_table_v6 is allocated dynamically in
 * ire_add_v6. ire_ft_init_lock is used to serialize multiple threads
 * initializing the same bucket. Once a bucket is initialized, it is never
 * de-allocated. This assumption enables us to access
 * ip_forwarding_table_v6[i] without any locks.
 *
 * The forwarding table for IPv4 is a radix tree whose leaves
 * are rt_entry structures containing the irb_t for the rt_dst. The irb_t
 * for IPv4 is dynamically allocated and freed.
 *
 * Each irb_t - ire bucket structure has a lock to protect
 * a bucket and the ires residing in the bucket have a back pointer to
 * the bucket structure. It also has a reference count for the number
 * of threads walking the bucket - irb_refcnt, which is bumped up
 * using the IRB_REFHOLD macro. The flags irb_flags can be
 * set to IRE_MARK_CONDEMNED indicating that there are some ires
 * in this bucket that are marked with IRE_MARK_CONDEMNED and the
 * last thread to leave the bucket should delete the ires. Usually
 * this is done by the IRB_REFRELE macro which is used to decrement
 * the reference count on a bucket. See comments above irb_t structure
 * definition in ip.h for further details.
 *
 * The IRE_REFHOLD/IRE_REFRELE macros operate on the ire and increment/
 * decrement the reference count, ire_refcnt, atomically on the ire.
 * ire_refcnt is modified only using these macros. Operations on the IRE
 * can be described as follows :
 *
 * CREATE an ire with reference count initialized to 1.
 *
 * ADDITION of an ire holds the bucket lock, checks for duplicates
 * and then adds the ire. ire_add_v4/ire_add_v6 returns the ire after
 * bumping up the reference count once more, i.e. the reference count is 2.
 * This is to avoid an extra lookup in the functions calling ire_add,
 * which want to work with the ire after adding.
 *
 * LOOKUP of an ire bumps up the reference count using the IRE_REFHOLD
 * macro. It is valid to bump up the reference count of the IRE,
 * after the lookup has returned an ire. Following are the lookup
 * functions that return a HELD ire :
 *
 * ire_lookup_local[_v6], ire_ctable_lookup[_v6], ire_ftable_lookup[_v6],
 * ire_cache_lookup[_v6], ire_lookup_multi[_v6], ire_route_lookup[_v6],
 * ipif_to_ire[_v6].
 *
 * DELETION of an ire holds the bucket lock, removes it from the list
 * and then decrements the reference count for having removed it from the
 * list by using the IRE_REFRELE macro. If some other thread has looked up
 * the ire, the reference count would have been bumped up and hence
 * this ire will not be freed once deleted. It will be freed once the
 * reference count drops to zero.
 *
 * Add and Delete acquire the bucket lock as RW_WRITER, while all the
 * lookups acquire the bucket lock as RW_READER.
 *
 * NOTE : The only functions that do the IRE_REFRELE when an ire is
 *	  passed as an argument are :
 *
 *	  1) ip_wput_ire : This is because it IRE_REFHOLD/RELEs the
 *			   broadcast ires it looks up internally within
 *			   the function. Currently, for simplicity it does
 *			   not differentiate the one that is passed in and
 *			   the ones it looks up internally. It always
 *			   IRE_REFRELEs.
 *	  2) ire_send
 *	     ire_send_v6 : As ire_send calls ip_wput_ire and other functions
 *			   that take ire as an argument, it has to selectively
 *			   IRE_REFRELE the ire. To maintain symmetry,
 *			   ire_send_v6 does the same.
 *
 * Otherwise, the general rule is to do the IRE_REFRELE in the function
 * that is passing the ire as an argument.
 *
 * In trying to locate ires the following points are to be noted.
 *
 * IRE_MARK_CONDEMNED signifies that the ire has been logically deleted and is
 * to be ignored when walking the ires using ire_next.
 *
 * Zones note:
 *	Walking IREs within a given zone also walks certain ires in other
 *	zones. This is done intentionally. IRE walks with a specified
 *	zoneid are used only when doing informational reports, and
 *	zone users want to see things that they can access. See block
 *	comment in ire_walk_ill_match().
 */
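
/*
 * To illustrate the hold/release discipline above, a minimal sketch of a
 * typical consumer (the destination "dst" and "zoneid" are assumed to be
 * supplied by the caller):
 *
 *	ire_t	*ire;
 *
 *	ire = ire_cache_lookup(dst, zoneid, NULL, ipst);
 *	if (ire != NULL) {
 *		... use the ire; it cannot be freed while we hold it ...
 *		ire_refrele(ire);		release the lookup's hold
 *	}
 *
 * ire_refrele() must not be called while holding locks; see the comment
 * above ire_refrele() later in this file.
 */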

/*
 * The minimum size of the IRE cache table. It will be recalculated in
 * ip_ire_init().
 * Settable in /etc/system
 */
uint32_t ip_cache_table_size = IP_CACHE_TABLE_SIZE;
uint32_t ip6_cache_table_size = IP6_CACHE_TABLE_SIZE;

/*
 * The size of the forwarding table. We will make sure that it is a
 * power of 2 in ip_ire_init().
 * Settable in /etc/system
 */
uint32_t ip6_ftable_hash_size = IP6_FTABLE_HASH_SIZE;

struct	kmem_cache	*ire_cache;
static ire_t	ire_null;

/*
 * The threshold number of IREs in a bucket at which the IREs are
 * cleaned up. This threshold is calculated later in ip_open()
 * based on the speed of the CPU and available memory. This default
 * value is the maximum.
 *
 * We have two kinds of cached IRE, temporary and
 * non-temporary. Temporary IREs are marked with
 * IRE_MARK_TEMPORARY. They are IREs created for non-TCP
 * traffic and for forwarding purposes. All others
 * are non-temporary IREs. We don't mark IREs created for
 * TCP as temporary because TCP is stateful and there is
 * information stored in the IRE that can be shared by other TCP
 * connections to the same destination. For a connected
 * endpoint, we also don't want to mark the IRE as
 * temporary because the same IRE will be used frequently;
 * otherwise, the app should not have done a connect(). We change
 * the marking at ip_bind_connected_*() if necessary.
 *
 * We want to keep the cache IRE hash bucket length reasonably
 * short, otherwise IRE lookup functions will take "forever."
 * We use the "crude" function that the IRE bucket
 * length should be based on the CPU speed, which is 1 entry
 * per x MHz, depending on the shift factor ip_ire_cpu_ratio
 * (n). This means that with a 750MHz CPU, the max bucket
 * length can be (750 >> n) entries.
 *
 * Note that this threshold is separate for temp and non-temp
 * IREs. This means that the actual bucket length can be
 * twice that. And while we try to keep temporary IRE
 * length at most at the threshold value, we do not attempt to
 * make the length for non-temporary IREs fixed, for the
 * reason stated above. Instead, we start trying to find
 * "unused" non-temporary IREs when the bucket length reaches
 * this threshold and clean them up.
 *
 * We also want to limit the amount of memory used by
 * IREs. So if we are allowed to use ~3% of memory (M)
 * for those IREs, each bucket should not have more than
 *
 *	M / number of cache buckets / sizeof (ire_t)
 *
 * Again the above memory uses are separate for temp and
 * non-temp cached IREs.
 *
 * We may also want the limit to be a function of the number
 * of interfaces and number of CPUs. Doing the initialization
 * in ip_open() means that every time an interface is plumbed,
 * the max is re-calculated. Right now, we don't do anything
 * different. In the future, when we have more experience, we
 * may want to change this behavior.
 */
uint32_t ip_ire_max_bucket_cnt = 10;	/* Settable in /etc/system */
uint32_t ip6_ire_max_bucket_cnt = 10;
uint32_t ip_ire_cleanup_cnt = 2;

/*
 * The minimum temporary IRE bucket count. We do not want
 * the length of each bucket to be too short. This may hurt
 * the performance of some apps as the temporary IREs would be
 * removed too often.
 */
uint32_t ip_ire_min_bucket_cnt = 3;	/* /etc/system - not used */
uint32_t ip6_ire_min_bucket_cnt = 3;

/*
 * The ratio of memory consumed by temporary IREs to available
 * memory. This is a shift factor, so 6 means the ratio 1 to 64. This
 * value can be changed in /etc/system. 6 is a reasonable number.
 */
uint32_t ip_ire_mem_ratio = 6;	/* /etc/system */
/* The shift factor for CPU speed to calculate the max IRE bucket length. */
uint32_t ip_ire_cpu_ratio = 7;	/* /etc/system */
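
/*
 * Putting the two limits above together, a sketch of the calculation that
 * ip_open() is described as performing ("cpu_mhz" and "mem_bytes" stand in
 * for whatever the platform reports):
 *
 *	uint32_t cpu_limit, mem_limit;
 *
 *	cpu_limit = cpu_mhz >> ip_ire_cpu_ratio;	e.g. 750 >> 7 = 5
 *	mem_limit = (mem_bytes >> ip_ire_mem_ratio) /
 *	    ip_cache_table_size / sizeof (ire_t);
 *	ip_ire_max_bucket_cnt = MIN(cpu_limit, mem_limit);
 *
 * Each limit applies separately to temporary and non-temporary IREs, so a
 * bucket can hold up to twice the resulting threshold.
 */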

typedef struct nce_clookup_s {
	ipaddr_t ncecl_addr;
	boolean_t ncecl_found;
} nce_clookup_t;

/*
 * The maximum number of buckets in the IRE cache table. In the future, we
 * may want to make it a dynamic hash table. For the moment, we fix the
 * size and allocate the table in ip_ire_init() when IP is first loaded.
 * We take into account the amount of memory a system has.
 */
#define	IP_MAX_CACHE_TABLE_SIZE	4096

/* Settable in /etc/system */
static uint32_t	ip_max_cache_table_size = IP_MAX_CACHE_TABLE_SIZE;
static uint32_t	ip6_max_cache_table_size = IP_MAX_CACHE_TABLE_SIZE;

#define	NUM_ILLS	2	/* To build the ILL list to unlock */

/* Zero iulp_t for initialization. */
const iulp_t	ire_uinfo_null = { 0 };

static int	ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp,
    ipsq_func_t func, boolean_t);
static void	ire_delete_v4(ire_t *ire);
static void	ire_walk_ipvers(pfv_t func, void *arg, uchar_t vers,
    zoneid_t zoneid, ip_stack_t *);
static void	ire_walk_ill_ipvers(uint_t match_flags, uint_t ire_type,
    pfv_t func, void *arg, uchar_t vers, ill_t *ill);
static void	ire_cache_cleanup(irb_t *irb, uint32_t threshold,
    ire_t *ref_ire);
static void	ip_nce_clookup_and_delete(nce_t *nce, void *arg);
static ire_t	*ip4_ctable_lookup_impl(ire_ctable_args_t *margs);
#ifdef DEBUG
static void	ire_trace_cleanup(const ire_t *);
#endif

/*
 * To avoid bloating the code, we call this function instead of
 * using the macro IRE_REFRELE. Use the macro only in performance
 * critical paths.
 *
 * Must not be called while holding any locks. Otherwise if this is
 * the last reference to be released there is a chance of recursive mutex
 * panic due to ire_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying
 * to restart an ioctl. The one exception is when the caller is sure that
 * this is not the last reference to be released. E.g. if the caller is
 * sure that the ire has not been deleted and won't be deleted.
 */
void
ire_refrele(ire_t *ire)
{
	IRE_REFRELE(ire);
}

void
ire_refrele_notr(ire_t *ire)
{
	IRE_REFRELE_NOTR(ire);
}

/*
 * kmem_cache_alloc constructor for IRE in kma space.
 * Note that when ire_mp is set the IRE is stored in that mblk and
 * not in this cache.
 */
/* ARGSUSED */
static int
ip_ire_constructor(void *buf, void *cdrarg, int kmflags)
{
	ire_t	*ire = buf;

	ire->ire_nce = NULL;

	return (0);
}

/* ARGSUSED1 */
static void
ip_ire_destructor(void *buf, void *cdrarg)
{
	ire_t	*ire = buf;

	ASSERT(ire->ire_nce == NULL);
}
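
/*
 * For reference, a minimal sketch of how the constructor/destructor pair
 * above would be registered, presumably in ip_ire_init() where the comments
 * above say the IRE tables are set up, with ip_trash_ire_reclaim as the
 * kmem reclaim callback (see the comment above ire_expire() below):
 *
 *	ire_cache = kmem_cache_create("ire_cache", sizeof (ire_t), 0,
 *	    ip_ire_constructor, ip_ire_destructor, ip_trash_ire_reclaim,
 *	    NULL, NULL, 0);
 */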

/*
 * This function is associated with the IP_IOC_IRE_ADVISE_NO_REPLY
 * IOCTL. It is used by TCP (or other ULPs) to supply revised information
 * for an existing CACHED IRE.
 */
/* ARGSUSED */
int
ip_ire_advise(queue_t *q, mblk_t *mp, cred_t *ioc_cr)
{
	uchar_t	*addr_ucp;
	ipic_t	*ipic;
	ire_t	*ire;
	ipaddr_t	addr;
	in6_addr_t	v6addr;
	irb_t	*irb;
	zoneid_t	zoneid;
	ip_stack_t	*ipst = CONNQ_TO_IPST(q);

	ASSERT(q->q_next == NULL);
	zoneid = Q_TO_CONN(q)->conn_zoneid;

	/*
	 * Check privilege using the ioctl credential; if it is NULL
	 * then this is a kernel message and therefore privileged.
	 */
	if (ioc_cr != NULL && secpolicy_ip_config(ioc_cr, B_FALSE) != 0)
		return (EPERM);

	ipic = (ipic_t *)mp->b_rptr;
	if (!(addr_ucp = mi_offset_param(mp, ipic->ipic_addr_offset,
	    ipic->ipic_addr_length))) {
		return (EINVAL);
	}
	if (!OK_32PTR(addr_ucp))
		return (EINVAL);
	switch (ipic->ipic_addr_length) {
	case IP_ADDR_LEN: {
		/* Extract the destination address. */
		addr = *(ipaddr_t *)addr_ucp;
		/* Find the corresponding IRE. */
		ire = ire_cache_lookup(addr, zoneid, NULL, ipst);
		break;
	}
	case IPV6_ADDR_LEN: {
		/* Extract the destination address. */
		v6addr = *(in6_addr_t *)addr_ucp;
		/* Find the corresponding IRE. */
		ire = ire_cache_lookup_v6(&v6addr, zoneid, NULL, ipst);
		break;
	}
	default:
		return (EINVAL);
	}

	if (ire == NULL)
		return (ENOENT);
	/*
	 * Update the round trip time estimate and/or the max frag size
	 * and/or the slow start threshold.
	 *
	 * We serialize multiple advises using ire_lock.
	 */
	mutex_enter(&ire->ire_lock);
	if (ipic->ipic_rtt) {
		/*
		 * If there are no old cached values, initialize them
		 * conservatively. Set them to be (1.5 * new value).
		 */
		if (ire->ire_uinfo.iulp_rtt != 0) {
			ire->ire_uinfo.iulp_rtt = (ire->ire_uinfo.iulp_rtt +
			    ipic->ipic_rtt) >> 1;
		} else {
			ire->ire_uinfo.iulp_rtt = ipic->ipic_rtt +
			    (ipic->ipic_rtt >> 1);
		}
		if (ire->ire_uinfo.iulp_rtt_sd != 0) {
			ire->ire_uinfo.iulp_rtt_sd =
			    (ire->ire_uinfo.iulp_rtt_sd +
			    ipic->ipic_rtt_sd) >> 1;
		} else {
			ire->ire_uinfo.iulp_rtt_sd = ipic->ipic_rtt_sd +
			    (ipic->ipic_rtt_sd >> 1);
		}
	}
	if (ipic->ipic_max_frag)
		ire->ire_max_frag = MIN(ipic->ipic_max_frag, IP_MAXPACKET);
	if (ipic->ipic_ssthresh != 0) {
		if (ire->ire_uinfo.iulp_ssthresh != 0)
			ire->ire_uinfo.iulp_ssthresh =
			    (ipic->ipic_ssthresh +
			    ire->ire_uinfo.iulp_ssthresh) >> 1;
		else
			ire->ire_uinfo.iulp_ssthresh = ipic->ipic_ssthresh;
	}
	/*
	 * Don't need the ire_lock below this. ire_type does not change
	 * after initialization. ire_marks is protected by irb_lock.
	 */
	mutex_exit(&ire->ire_lock);

	if (ipic->ipic_ire_marks != 0 && ire->ire_type == IRE_CACHE) {
		/*
		 * Only increment the temporary IRE count if the original
		 * IRE is not already marked temporary.
		 */
		irb = ire->ire_bucket;
		rw_enter(&irb->irb_lock, RW_WRITER);
		if ((ipic->ipic_ire_marks & IRE_MARK_TEMPORARY) &&
		    !(ire->ire_marks & IRE_MARK_TEMPORARY)) {
			irb->irb_tmp_ire_cnt++;
		}
		ire->ire_marks |= ipic->ipic_ire_marks;
		rw_exit(&irb->irb_lock);
	}

	ire_refrele(ire);
	return (0);
}
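
/*
 * A worked example of the smoothing above: with a cached iulp_rtt of 120
 * and an advised ipic_rtt of 60 (same units as supplied by the ULP), the
 * new estimate is (120 + 60) >> 1 = 90. With no cached value, a first
 * advise of 60 is stored conservatively as 60 + (60 >> 1) = 90, i.e.
 * 1.5 times the sample.
 */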

/*
 * This function is associated with the IP_IOC_IRE_DELETE[_NO_REPLY]
 * IOCTL[s]. The NO_REPLY form is used by TCP to delete a route IRE
 * for a host that is not responding. This will force an attempt to
 * establish a new route, if available, and flush out the ARP entry so
 * it will re-resolve. Management processes may want to use the
 * version that generates a reply.
 *
 * This function does not support IPv6 since Neighbor Unreachability Detection
 * means that negative advice like this is useless.
 */
/* ARGSUSED */
int
ip_ire_delete(queue_t *q, mblk_t *mp, cred_t *ioc_cr)
{
	uchar_t		*addr_ucp;
	ipaddr_t	addr;
	ire_t		*ire;
	ipid_t		*ipid;
	boolean_t	routing_sock_info = B_FALSE;	/* Sent info? */
	zoneid_t	zoneid;
	ire_t		*gire = NULL;
	ill_t		*ill;
	mblk_t		*arp_mp;
	ip_stack_t	*ipst;

	ASSERT(q->q_next == NULL);
	zoneid = Q_TO_CONN(q)->conn_zoneid;
	ipst = CONNQ_TO_IPST(q);

	/*
	 * Check privilege using the ioctl credential; if it is NULL
	 * then this is a kernel message and therefore privileged.
	 */
	if (ioc_cr != NULL && secpolicy_ip_config(ioc_cr, B_FALSE) != 0)
		return (EPERM);

	ipid = (ipid_t *)mp->b_rptr;

	/* Only actions on IRE_CACHEs are acceptable at present. */
	if (ipid->ipid_ire_type != IRE_CACHE)
		return (EINVAL);

	addr_ucp = mi_offset_param(mp, ipid->ipid_addr_offset,
	    ipid->ipid_addr_length);
	if (addr_ucp == NULL || !OK_32PTR(addr_ucp))
		return (EINVAL);
	switch (ipid->ipid_addr_length) {
	case IP_ADDR_LEN:
		/* addr_ucp points at IP addr */
		break;
	case sizeof (sin_t): {
		sin_t	*sin;
		/*
		 * Got a complete (sockaddr) address; increment addr_ucp to
		 * point at the ip_addr field.
		 */
		sin = (sin_t *)addr_ucp;
		addr_ucp = (uchar_t *)&sin->sin_addr.s_addr;
		break;
	}
	default:
		return (EINVAL);
	}
	/* Extract the destination address. */
	bcopy(addr_ucp, &addr, IP_ADDR_LEN);

	/* Try to find the CACHED IRE. */
	ire = ire_cache_lookup(addr, zoneid, NULL, ipst);

	/* Nail it. */
	if (ire) {
		/* Allow delete only on CACHE entries */
		if (ire->ire_type != IRE_CACHE) {
			ire_refrele(ire);
			return (EINVAL);
		}

		/*
		 * Verify that the IRE has been around for a while.
		 * This is to protect against transport protocols
		 * that are too eager in sending delete messages.
		 */
		if (gethrestime_sec() <
		    ire->ire_create_time + ipst->ips_ip_ignore_delete_time) {
			ire_refrele(ire);
			return (EINVAL);
		}
		/*
		 * Now we have a potentially dead cache entry. We need
		 * to remove it.
		 * If this cache entry is generated from a
		 * default route (i.e., ire_cmask == 0),
		 * search the default list and mark it dead and some
		 * background process will try to activate it.
		 */
		if ((ire->ire_gateway_addr != 0) && (ire->ire_cmask == 0)) {
			/*
			 * Make sure that we pick a different
			 * IRE_DEFAULT next time.
			 */
			ire_t *gw_ire;
			irb_t *irb = NULL;
			uint_t match_flags;

			match_flags = (MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE);

			gire = ire_ftable_lookup(ire->ire_addr,
			    ire->ire_cmask, 0, 0,
			    ire->ire_ipif, NULL, zoneid, 0, NULL, match_flags,
			    ipst);

			ip3dbg(("ire_ftable_lookup() returned gire %p\n",
			    (void *)gire));

			if (gire != NULL) {
				irb = gire->ire_bucket;

				/*
				 * We grab it as writer just to serialize
				 * multiple threads trying to bump up
				 * irb_rr_origin
				 */
				rw_enter(&irb->irb_lock, RW_WRITER);
				if ((gw_ire = irb->irb_rr_origin) == NULL) {
					rw_exit(&irb->irb_lock);
					goto done;
				}

				DTRACE_PROBE1(ip__ire__del__origin,
				    (ire_t *), gw_ire);

				/* Skip past the potentially bad gateway */
				if (ire->ire_gateway_addr ==
				    gw_ire->ire_gateway_addr) {
					ire_t *next = gw_ire->ire_next;

					DTRACE_PROBE2(ip__ire__del,
					    (ire_t *), gw_ire, (irb_t *), irb);
					IRE_FIND_NEXT_ORIGIN(next);
					irb->irb_rr_origin = next;
				}
				rw_exit(&irb->irb_lock);
			}
		}
done:
		if (gire != NULL)
			IRE_REFRELE(gire);
		/* report the bad route to routing sockets */
		ip_rts_change(RTM_LOSING, ire->ire_addr, ire->ire_gateway_addr,
		    ire->ire_mask, ire->ire_src_addr, 0, 0, 0,
		    (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_IFA), ipst);
		routing_sock_info = B_TRUE;

		/*
		 * TCP is really telling us to start over completely, and it
		 * expects that we'll resend the ARP query. Tell ARP to
		 * discard the entry, if this is a local destination.
		 *
		 * But, if the ARP entry is permanent then it shouldn't be
		 * deleted, so we set ARED_F_PRESERVE_PERM.
		 */
		ill = ire->ire_stq->q_ptr;
		if (ire->ire_gateway_addr == 0 &&
		    (arp_mp = ill_ared_alloc(ill, addr)) != NULL) {
			ared_t	*ared = (ared_t *)arp_mp->b_rptr;

			ASSERT(ared->ared_cmd == AR_ENTRY_DELETE);
			ared->ared_flags |= ARED_F_PRESERVE_PERM;
			putnext(ill->ill_rq, arp_mp);
		}

		ire_delete(ire);
		ire_refrele(ire);
	}
	/*
	 * Also look for an IRE_HOST type redirect ire and
	 * remove it if present.
	 */
	ire = ire_route_lookup(addr, 0, 0, IRE_HOST, NULL, NULL,
	    ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);

	/* Nail it. */
	if (ire != NULL) {
		if (ire->ire_flags & RTF_DYNAMIC) {
			if (!routing_sock_info) {
				ip_rts_change(RTM_LOSING, ire->ire_addr,
				    ire->ire_gateway_addr, ire->ire_mask,
				    ire->ire_src_addr, 0, 0, 0,
				    (RTA_DST | RTA_GATEWAY |
				    RTA_NETMASK | RTA_IFA),
				    ipst);
			}
			ire_delete(ire);
		}
		ire_refrele(ire);
	}
	return (0);
}
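
/*
 * For illustration, a minimal sketch of the message a ULP would hand to
 * the ioctl above (allocation and ioctl framing omitted; the only layout
 * assumption is that the raw IPv4 address follows the ipid_t in the same
 * mblk):
 *
 *	ipid_t	*ipid = (ipid_t *)mp->b_rptr;
 *
 *	ipid->ipid_ire_type = IRE_CACHE;
 *	ipid->ipid_addr_offset = sizeof (ipid_t);
 *	ipid->ipid_addr_length = IP_ADDR_LEN;
 *	bcopy(&dst, mp->b_rptr + sizeof (ipid_t), IP_ADDR_LEN);
 *
 * TCP uses this when a peer stops responding; see the block comment above
 * ip_ire_delete() for why it is IPv4-only.
 */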

/*
 * ip_ire_req is called by ip_wput when an IRE_DB_REQ_TYPE message is handed
 * down from the Upper Level Protocol to request a copy of the IRE (to check
 * its type or to extract information like round-trip time estimates or the
 * MTU.)
 * The address is assumed to be in the ire_addr field. If no IRE is found
 * an IRE is returned with ire_type being zero.
 * Note that the upper level protocol has to check for broadcast
 * (IRE_BROADCAST) and multicast (CLASSD(addr)).
 * If there is a b_cont the resulting IRE_DB_TYPE mblk is placed at the
 * end of the returned message.
 *
 * TCP sends down a message of this type with a connection request packet
 * chained on. UDP and ICMP send it down to verify that a route exists for
 * the destination address when they get connected.
 */
void
ip_ire_req(queue_t *q, mblk_t *mp)
{
	ire_t	*inire;
	ire_t	*ire;
	mblk_t	*mp1;
	ire_t	*sire = NULL;
	zoneid_t zoneid = Q_TO_CONN(q)->conn_zoneid;
	ip_stack_t	*ipst = CONNQ_TO_IPST(q);

	ASSERT(q->q_next == NULL);

	if ((mp->b_wptr - mp->b_rptr) < sizeof (ire_t) ||
	    !OK_32PTR(mp->b_rptr)) {
		freemsg(mp);
		return;
	}
	inire = (ire_t *)mp->b_rptr;
	/*
	 * Got it, now take our best shot at an IRE.
	 */
	if (inire->ire_ipversion == IPV6_VERSION) {
		ire = ire_route_lookup_v6(&inire->ire_addr_v6, 0, 0, 0,
		    NULL, &sire, zoneid, NULL,
		    (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT), ipst);
	} else {
		ASSERT(inire->ire_ipversion == IPV4_VERSION);
		ire = ire_route_lookup(inire->ire_addr, 0, 0, 0,
		    NULL, &sire, zoneid, NULL,
		    (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT), ipst);
	}

	/*
	 * We prevent returning IREs with source address INADDR_ANY
	 * as these were temporarily created for sending packets
	 * from endpoints that have conn_unspec_src set.
	 */
	if (ire == NULL ||
	    (ire->ire_ipversion == IPV4_VERSION &&
	    ire->ire_src_addr == INADDR_ANY) ||
	    (ire->ire_ipversion == IPV6_VERSION &&
	    IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6))) {
		inire->ire_type = 0;
	} else {
		bcopy(ire, inire, sizeof (ire_t));
		/* Copy the route metrics from the parent. */
		if (sire != NULL) {
			bcopy(&(sire->ire_uinfo), &(inire->ire_uinfo),
			    sizeof (iulp_t));
		}

		/*
		 * As we don't lookup global policy here, we may not
		 * pass the right size if per-socket policy is not
		 * present. For these cases, path mtu discovery will
		 * do the right thing.
		 */
		inire->ire_ipsec_overhead = conn_ipsec_length(Q_TO_CONN(q));

		/* Pass the latest setting of the ip_path_mtu_discovery */
		inire->ire_frag_flag |=
		    (ipst->ips_ip_path_mtu_discovery) ? IPH_DF : 0;
	}
	if (ire != NULL)
		ire_refrele(ire);
	if (sire != NULL)
		ire_refrele(sire);
	mp->b_wptr = &mp->b_rptr[sizeof (ire_t)];
	mp->b_datap->db_type = IRE_DB_TYPE;

	/* Put the IRE_DB_TYPE mblk last in the chain */
	mp1 = mp->b_cont;
	if (mp1 != NULL) {
		mp->b_cont = NULL;
		linkb(mp1, mp);
		mp = mp1;
	}
	qreply(q, mp);
}
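
/*
 * A minimal sketch of the request side described above (the caller fills
 * in a template ire_t and sends it down as an IRE_DB_REQ_TYPE message;
 * "dst" is assumed to be supplied by the caller):
 *
 *	mblk_t	*mp = allocb(sizeof (ire_t), BPRI_HI);
 *	ire_t	*inire;
 *
 *	if (mp == NULL)
 *		return;
 *	mp->b_datap->db_type = IRE_DB_REQ_TYPE;
 *	inire = (ire_t *)mp->b_rptr;
 *	bzero(inire, sizeof (ire_t));
 *	inire->ire_ipversion = IPV4_VERSION;
 *	inire->ire_addr = dst;
 *	mp->b_wptr = mp->b_rptr + sizeof (ire_t);
 *	putnext(q, mp);
 *
 * The reply comes back as an IRE_DB_TYPE mblk, with ire_type zero when no
 * route was found.
 */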

/*
 * Send a packet using the specified IRE.
 * If ire_src_addr_v6 is all zero then discard the IRE after
 * the packet has been sent.
 */
static void
ire_send(queue_t *q, mblk_t *pkt, ire_t *ire)
{
	mblk_t *ipsec_mp;
	boolean_t is_secure;
	uint_t ifindex;
	ill_t	*ill;
	zoneid_t zoneid = ire->ire_zoneid;
	ip_stack_t	*ipst = ire->ire_ipst;

	ASSERT(ire->ire_ipversion == IPV4_VERSION);
	ASSERT(!(ire->ire_type & IRE_LOCAL)); /* Has different ire_zoneid */
	ipsec_mp = pkt;
	is_secure = (pkt->b_datap->db_type == M_CTL);
	if (is_secure) {
		ipsec_out_t *io;

		pkt = pkt->b_cont;
		io = (ipsec_out_t *)ipsec_mp->b_rptr;
		if (io->ipsec_out_type == IPSEC_OUT)
			zoneid = io->ipsec_out_zoneid;
	}

	/* If the packet originated externally then */
	if (pkt->b_prev) {
		ire_refrele(ire);
		/*
		 * Extract the ifindex from b_prev (set in ip_rput_noire).
		 * Look up interface to see if it still exists (it could have
		 * been unplumbed by the time the reply came back from ARP)
		 */
		ifindex = (uint_t)(uintptr_t)pkt->b_prev;
		ill = ill_lookup_on_ifindex(ifindex, B_FALSE,
		    NULL, NULL, NULL, NULL, ipst);
		if (ill == NULL) {
			pkt->b_prev = NULL;
			pkt->b_next = NULL;
			freemsg(ipsec_mp);
			return;
		}
		q = ill->ill_rq;
		pkt->b_prev = NULL;
		/*
		 * This packet has not gone through IPSEC processing
		 * and hence we should not have any IPSEC message
		 * prepended.
		 */
		ASSERT(ipsec_mp == pkt);
		put(q, pkt);
		ill_refrele(ill);
	} else if (pkt->b_next) {
		/* Packets from multicast router */
		pkt->b_next = NULL;
		/*
		 * We never get the IPSEC_OUT while forwarding the
		 * packet for multicast router.
		 */
		ASSERT(ipsec_mp == pkt);
		ip_rput_forward(ire, (ipha_t *)pkt->b_rptr, ipsec_mp, NULL);
		ire_refrele(ire);
	} else {
		/* Locally originated packets */
		boolean_t delete_ire = B_FALSE;
		ipha_t *ipha = (ipha_t *)pkt->b_rptr;

		/*
		 * If this IRE shouldn't be kept in the table (because its
		 * source address is unspecified), hold a reference to it so
		 * we can delete it even after e.g. ip_wput_ire() has dropped
		 * its reference.
		 */
		if (!(ire->ire_marks & IRE_MARK_NOADD) &&
		    ire->ire_src_addr == INADDR_ANY) {
			delete_ire = B_TRUE;
			IRE_REFHOLD(ire);
		}

		/*
		 * If we were resolving a router we can not use the
		 * router's IRE for sending the packet (since it would
		 * violate the uniqueness of the IP idents) thus we
		 * make another pass through ip_wput to create the IRE_CACHE
		 * for the destination.
		 * When IRE_MARK_NOADD is set, ire_add() is not called.
		 * Thus ip_wput() would never find an ire, resulting in an
		 * infinite loop. Thus we check whether IRE_MARK_NOADD is
		 * set. This also implies that IRE_MARK_NOADD can only be
		 * used to send packets to directly connected hosts.
		 */
		if (ipha->ipha_dst != ire->ire_addr &&
		    !(ire->ire_marks & IRE_MARK_NOADD)) {
			ire_refrele(ire);	/* Held in ire_add */
			if (CONN_Q(q)) {
				(void) ip_output(Q_TO_CONN(q), ipsec_mp, q,
				    IRE_SEND);
			} else {
				(void) ip_output((void *)(uintptr_t)zoneid,
				    ipsec_mp, q, IRE_SEND);
			}
		} else {
			if (is_secure) {
				ipsec_out_t *oi;
				ipha_t *ipha;

				oi = (ipsec_out_t *)ipsec_mp->b_rptr;
				ipha = (ipha_t *)ipsec_mp->b_cont->b_rptr;
				if (oi->ipsec_out_proc_begin) {
					/*
					 * This is the case where
					 * ip_wput_ipsec_out could not find
					 * the IRE and recreated a new one.
					 * As ip_wput_ipsec_out does ire
					 * lookups, ire_refrele for the extra
					 * bump in ire_add.
					 */
					ire_refrele(ire);
					ip_wput_ipsec_out(q, ipsec_mp, ipha,
					    NULL, NULL);
				} else {
					/*
					 * IRE_REFRELE will be done in
					 * ip_wput_ire.
					 */
					ip_wput_ire(q, ipsec_mp, ire, NULL,
					    IRE_SEND, zoneid);
				}
			} else {
				/*
				 * IRE_REFRELE will be done in ip_wput_ire.
				 */
				ip_wput_ire(q, ipsec_mp, ire, NULL,
				    IRE_SEND, zoneid);
			}
		}
		/*
		 * Special code to support sending a single packet with
		 * conn_unspec_src using an IRE which has no source address.
		 * The IRE is deleted here after sending the packet to avoid
		 * having other code trip on it. But before we delete the
		 * ire, somebody could have looked up this ire.
		 * We prevent returning/using this IRE by the upper layers
		 * by making checks to NULL source address in other places
		 * like e.g. ip_ire_append, ip_ire_req and ip_bind_connected.
		 * Though this does not completely prevent other threads
		 * from using this ire, this should not cause any problems.
		 */
		if (delete_ire) {
			ip1dbg(("ire_send: delete IRE\n"));
			ire_delete(ire);
			ire_refrele(ire);	/* Held above */
		}
	}
}

/*
 * Send a packet using the specified IRE.
 * If ire_src_addr_v6 is all zero then discard the IRE after
 * the packet has been sent.
 */
static void
ire_send_v6(queue_t *q, mblk_t *pkt, ire_t *ire)
{
	mblk_t *ipsec_mp;
	boolean_t secure;
	uint_t ifindex;
	zoneid_t zoneid = ire->ire_zoneid;
	ip_stack_t	*ipst = ire->ire_ipst;

	ASSERT(ire->ire_ipversion == IPV6_VERSION);
	ASSERT(!(ire->ire_type & IRE_LOCAL)); /* Has different ire_zoneid */
	if (pkt->b_datap->db_type == M_CTL) {
		ipsec_out_t *io;

		ipsec_mp = pkt;
		pkt = pkt->b_cont;
		secure = B_TRUE;
		io = (ipsec_out_t *)ipsec_mp->b_rptr;
		if (io->ipsec_out_type == IPSEC_OUT)
			zoneid = io->ipsec_out_zoneid;
	} else {
		ipsec_mp = pkt;
		secure = B_FALSE;
	}

	/* If the packet originated externally then */
	if (pkt->b_prev) {
		ill_t	*ill;
		/*
		 * Extract the ifindex from b_prev (set in ip_rput_data_v6).
		 * Look up interface to see if it still exists (it could have
		 * been unplumbed by the time the reply came back from the
		 * resolver).
		 */
		ifindex = (uint_t)(uintptr_t)pkt->b_prev;
		ill = ill_lookup_on_ifindex(ifindex, B_TRUE,
		    NULL, NULL, NULL, NULL, ipst);
		if (ill == NULL) {
			pkt->b_prev = NULL;
			pkt->b_next = NULL;
			freemsg(ipsec_mp);
			ire_refrele(ire);	/* Held in ire_add */
			return;
		}
		q = ill->ill_rq;
		pkt->b_prev = NULL;
		/*
		 * This packet has not gone through IPSEC processing
		 * and hence we should not have any IPSEC message
		 * prepended.
		 */
		ASSERT(ipsec_mp == pkt);
		put(q, pkt);
		ill_refrele(ill);
	} else if (pkt->b_next) {
		/* Packets from multicast router */
		pkt->b_next = NULL;
		/*
		 * We never get the IPSEC_OUT while forwarding the
		 * packet for multicast router.
		 */
		ASSERT(ipsec_mp == pkt);
		/*
		 * XXX TODO IPv6.
		 */
		freemsg(pkt);
#ifdef XXX
		ip_rput_forward(ire, (ipha_t *)pkt->b_rptr, pkt, NULL);
#endif
	} else {
		if (secure) {
			ipsec_out_t *oi;
			ip6_t *ip6h;

			oi = (ipsec_out_t *)ipsec_mp->b_rptr;
			ip6h = (ip6_t *)ipsec_mp->b_cont->b_rptr;
			if (oi->ipsec_out_proc_begin) {
				/*
				 * This is the case where
				 * ip_wput_ipsec_out could not find
				 * the IRE and recreated a new one.
				 */
				ip_wput_ipsec_out_v6(q, ipsec_mp, ip6h,
				    NULL, NULL);
			} else {
				if (CONN_Q(q)) {
					(void) ip_output_v6(Q_TO_CONN(q),
					    ipsec_mp, q, IRE_SEND);
				} else {
					(void) ip_output_v6(
					    (void *)(uintptr_t)zoneid,
					    ipsec_mp, q, IRE_SEND);
				}
			}
		} else {
			/*
			 * Send packets through ip_output_v6 so that any
			 * ip6_info header can be processed again.
			 */
			if (CONN_Q(q)) {
				(void) ip_output_v6(Q_TO_CONN(q), ipsec_mp, q,
				    IRE_SEND);
			} else {
				(void) ip_output_v6((void *)(uintptr_t)zoneid,
				    ipsec_mp, q, IRE_SEND);
			}
		}
		/*
		 * Special code to support sending a single packet with
		 * conn_unspec_src using an IRE which has no source address.
		 * The IRE is deleted here after sending the packet to avoid
		 * having other code trip on it. But before we delete the
		 * ire, somebody could have looked up this ire.
		 * We prevent returning/using this IRE by the upper layers
		 * by making checks to NULL source address in other places
		 * like e.g. ip_ire_append_v6, ip_ire_req and
		 * ip_bind_connected_v6. Though this does not completely
		 * prevent other threads from using this ire, this should
		 * not cause any problems.
		 */
		if (IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6)) {
			ip1dbg(("ire_send_v6: delete IRE\n"));
			ire_delete(ire);
		}
	}
	ire_refrele(ire);	/* Held in ire_add */
}

/*
 * Make sure that the IRE bucket does not get too long.
 * Otherwise lookups can lock up because ire_cache_lookup()
 * may take "forever" to finish.
 *
 * We only remove a maximum of cnt IREs each time. This
 * should keep the bucket length approximately constant,
 * depending on cnt. This should be enough to defend
 * against DoS attack based on creating temporary IREs
 * (for forwarding and non-TCP traffic).
 *
 * We also pass in the address of the newly created IRE
 * as we do not want to remove this straight after adding
 * it. New IREs are normally added at the tail of the
 * bucket. This means that we are removing the "oldest"
 * temporary IREs added. Only if there are IREs with
 * the same ire_addr, do we not add it at the tail. Refer
 * to ire_add_v*(). It should be OK for our purpose.
 *
 * For non-temporary cached IREs, we make sure that they
 * have not been used for some time (defined below), they
 * are non-local destinations, and there is no one using
 * them at the moment (refcnt == 1).
 *
 * The above means that the IRE bucket length may become
 * very long, consisting of mostly non-temporary IREs.
 * This can happen when the hash function does a bad job
 * so that most TCP connections cluster to a specific bucket.
 * This "hopefully" should never happen. It can also
 * happen if most TCP connections have very long lives.
 * Even with the minimal hash table size of 256, there
 * has to be a lot of such connections to make the bucket
 * length unreasonably long. This should probably not
 * happen either. The third case when this can happen is
 * when the machine is under attack, such as SYN flooding.
 * TCP should already have the proper mechanism to protect
 * against that. So we should be safe.
 *
 * This function is called by ire_add_then_send() after
 * a new IRE is added and the packet is sent.
 *
 * The idle cutoff interval is set to 60s. It can be
 * changed using /etc/system.
 */
uint32_t ire_idle_cutoff_interval = 60000;

static void
ire_cache_cleanup(irb_t *irb, uint32_t threshold, ire_t *ref_ire)
{
	ire_t *ire;
	clock_t cut_off = drv_usectohz(ire_idle_cutoff_interval * 1000);
	int cnt = ip_ire_cleanup_cnt;

	/*
	 * Try to remove cnt temporary IREs first.
	 */
	for (ire = irb->irb_ire; cnt > 0 && ire != NULL; ire = ire->ire_next) {
		if (ire == ref_ire)
			continue;
		if (ire->ire_marks & IRE_MARK_CONDEMNED)
			continue;
		if (ire->ire_marks & IRE_MARK_TEMPORARY) {
			ASSERT(ire->ire_type == IRE_CACHE);
			ire_delete(ire);
			cnt--;
		}
	}
	if (cnt == 0)
		return;

	/*
	 * If we didn't satisfy our removal target from temporary IREs
	 * we see how many non-temporary IREs are currently in the bucket.
	 * If this quantity is above the threshold then we see if there are any
	 * candidates for removal. We are still limited to removing a maximum
	 * of cnt IREs.
	 */
	if ((irb->irb_ire_cnt - irb->irb_tmp_ire_cnt) > threshold) {
		for (ire = irb->irb_ire; cnt > 0 && ire != NULL;
		    ire = ire->ire_next) {
			if (ire == ref_ire)
				continue;
			if (ire->ire_type != IRE_CACHE)
				continue;
			if (ire->ire_marks & IRE_MARK_CONDEMNED)
				continue;
			if ((ire->ire_refcnt == 1) &&
			    (lbolt - ire->ire_last_used_time > cut_off)) {
				ire_delete(ire);
				cnt--;
			}
		}
	}
}

/*
 * ire_add_then_send is called when a new IRE has been created in order to
 * route an outgoing packet. Typically, it is called from ip_wput when
 * a response comes back down from a resolver. We add the IRE, and then
 * possibly run the packet through ip_wput or ip_rput, as appropriate.
 * However, we do not add the newly created IRE in the cache when
 * IRE_MARK_NOADD is set in the IRE. IRE_MARK_NOADD is set at
 * ip_newroute_ipif(). The ires with IRE_MARK_NOADD are ire_refrele'd by
 * ip_wput_ire() and get deleted.
 * Multirouting support: the packet is silently discarded when the new IRE
 * holds the RTF_MULTIRT flag, but is not the first IRE to be added with the
 * RTF_MULTIRT flag for the same destination address.
 * In this case, we just want to register this additional ire without
 * sending the packet, as it has already been replicated through
 * existing multirt routes in ip_wput().
 */
void
ire_add_then_send(queue_t *q, ire_t *ire, mblk_t *mp)
{
	irb_t *irb;
	boolean_t drop = B_FALSE;
	boolean_t mctl_present;
	mblk_t *first_mp = NULL;
	mblk_t *data_mp = NULL;
	ire_t *dst_ire;
	ipha_t *ipha;
	ip6_t *ip6h;
	ip_stack_t *ipst = ire->ire_ipst;
	int ire_limit;

	if (mp != NULL) {
		/*
		 * We first have to retrieve the destination address carried
		 * by the packet.
		 * We can't rely on ire as it can be related to a gateway.
		 * The destination address will help in determining if
		 * other RTF_MULTIRT ires are already registered.
		 *
		 * We first need to know where we are going : v4 or v6.
		 * The ire version is enough, as there is no risk that
		 * we resolve an IPv6 address with an IPv4 ire
		 * or vice versa.
		 */
		EXTRACT_PKT_MP(mp, first_mp, mctl_present);
		data_mp = mp;
		mp = first_mp;
		if (ire->ire_ipversion == IPV4_VERSION) {
			ipha = (ipha_t *)data_mp->b_rptr;
			dst_ire = ire_cache_lookup(ipha->ipha_dst,
			    ire->ire_zoneid, MBLK_GETLABEL(mp), ipst);
		} else {
			ASSERT(ire->ire_ipversion == IPV6_VERSION);
			ip6h = (ip6_t *)data_mp->b_rptr;
			dst_ire = ire_cache_lookup_v6(&ip6h->ip6_dst,
			    ire->ire_zoneid, MBLK_GETLABEL(mp), ipst);
		}
		if (dst_ire != NULL) {
			if (dst_ire->ire_flags & RTF_MULTIRT) {
				/*
				 * At least one resolved multirt route
				 * already exists for the destination,
				 * don't send this packet: either drop it
				 * or complete the pending resolution,
				 * depending on the ire.
				 */
				drop = B_TRUE;
			}
			ip1dbg(("ire_add_then_send: dst_ire %p "
			    "[dst %08x, gw %08x], drop %d\n",
			    (void *)dst_ire,
			    (dst_ire->ire_ipversion == IPV4_VERSION) ? \
			    ntohl(dst_ire->ire_addr) : \
			    ntohl(V4_PART_OF_V6(dst_ire->ire_addr_v6)),
			    (dst_ire->ire_ipversion == IPV4_VERSION) ? \
			    ntohl(dst_ire->ire_gateway_addr) : \
			    ntohl(V4_PART_OF_V6(
			    dst_ire->ire_gateway_addr_v6)),
			    drop));
			ire_refrele(dst_ire);
		}
	}

	if (!(ire->ire_marks & IRE_MARK_NOADD)) {
		/* Regular packets with cache bound ires are here. */
		(void) ire_add(&ire, NULL, NULL, NULL, B_FALSE);

		if (ire == NULL) {
			mp->b_prev = NULL;
			mp->b_next = NULL;
			MULTIRT_DEBUG_UNTAG(mp);
			freemsg(mp);
			return;
		}
		if (mp == NULL) {
			ire_refrele(ire);	/* Held in ire_add_v4/v6 */
			return;
		}
	}
	if (drop) {
		/*
		 * If we're adding an RTF_MULTIRT ire, the resolution
		 * is over: we just drop the packet.
		 */
		if (ire->ire_flags & RTF_MULTIRT) {
			data_mp->b_prev = NULL;
			data_mp->b_next = NULL;
			MULTIRT_DEBUG_UNTAG(mp);
			freemsg(mp);
		} else {
			/*
			 * Otherwise, we're adding the ire to a gateway
			 * for a multirt route.
			 * Invoke ip_newroute() to complete the resolution
			 * of the route. We will then come back here and
			 * finally drop this packet in the above code.
			 */
			if (ire->ire_ipversion == IPV4_VERSION) {
				/*
				 * TODO: in order for CGTP to work in non-global
				 * zones, ip_newroute() must create the IRE
				 * cache in the zone indicated by
				 * ire->ire_zoneid.
				 */
				ip_newroute(q, mp, ipha->ipha_dst,
				    (CONN_Q(q) ? Q_TO_CONN(q) : NULL),
				    ire->ire_zoneid, ipst);
			} else {
				int minlen = sizeof (ip6i_t) + IPV6_HDR_LEN;

				ASSERT(ire->ire_ipversion == IPV6_VERSION);

				/*
				 * If necessary, skip over the ip6i_t to find
				 * the header with the actual source address.
				 */
				if (ip6h->ip6_nxt == IPPROTO_RAW) {
					if (MBLKL(data_mp) < minlen &&
					    pullupmsg(data_mp, -1) == 0) {
						ip1dbg(("ire_add_then_send: "
						    "cannot pullupmsg ip6i\n"));
						if (mctl_present)
							freeb(first_mp);
						ire_refrele(ire);
						return;
					}
					ASSERT(MBLKL(data_mp) >= IPV6_HDR_LEN);
					ip6h = (ip6_t *)(data_mp->b_rptr +
					    sizeof (ip6i_t));
				}
				ip_newroute_v6(q, mp, &ip6h->ip6_dst,
				    &ip6h->ip6_src, NULL, ire->ire_zoneid,
				    ipst);
			}
		}

		ire_refrele(ire); /* As done by ire_send(). */
		return;
	}
	/*
	 * Need to remember ire_bucket here as ire_send*() may delete
	 * the ire so we cannot reference it after that.
	 */
	irb = ire->ire_bucket;
	if (ire->ire_ipversion == IPV4_VERSION) {
		ire_send(q, mp, ire);
		ire_limit = ip_ire_max_bucket_cnt;
	} else {
		ire_send_v6(q, mp, ire);
		ire_limit = ip6_ire_max_bucket_cnt;
	}

	/*
	 * irb is NULL if the IRE was not added to the hash. This happens
	 * when IRE_MARK_NOADD is set and when IREs are returned from
	 * ire_update_srcif_v4().
	 */
	if (irb != NULL) {
		IRB_REFHOLD(irb);
		if (irb->irb_ire_cnt > ire_limit)
			ire_cache_cleanup(irb, ire_limit, ire);
		IRB_REFRELE(irb);
	}
}

/*
 * Initialize the parts of the ire that are specific to IPv4 and call
 * ire_init_common to finish it.
 */
ire_t *
ire_init(ire_t *ire, uchar_t *addr, uchar_t *mask, uchar_t *src_addr,
    uchar_t *gateway, uint_t *max_fragp, nce_t *src_nce, queue_t *rfq,
    queue_t *stq, ushort_t type, ipif_t *ipif, ipaddr_t cmask, uint32_t phandle,
    uint32_t ihandle, uint32_t flags, const iulp_t *ulp_info, tsol_gc_t *gc,
    tsol_gcgrp_t *gcgrp, ip_stack_t *ipst)
{
	ASSERT(type != IRE_CACHE || stq != NULL);
	/*
	 * Reject IRE security attribute creation/initialization
	 * if system is not running in Trusted mode.
	 */
	if ((gc != NULL || gcgrp != NULL) && !is_system_labeled())
		return (NULL);


	BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_alloced);

	if (addr != NULL)
		bcopy(addr, &ire->ire_addr, IP_ADDR_LEN);
	if (src_addr != NULL)
		bcopy(src_addr, &ire->ire_src_addr, IP_ADDR_LEN);
	if (mask != NULL) {
		bcopy(mask, &ire->ire_mask, IP_ADDR_LEN);
		ire->ire_masklen = ip_mask_to_plen(ire->ire_mask);
	}
	if (gateway != NULL) {
		bcopy(gateway, &ire->ire_gateway_addr, IP_ADDR_LEN);
	}

	if (type == IRE_CACHE)
		ire->ire_cmask = cmask;

	/* ire_init_common will free the mblks upon encountering any failure */
	if (!ire_init_common(ire, max_fragp, src_nce, rfq, stq, type, ipif,
	    phandle, ihandle, flags, IPV4_VERSION, ulp_info, gc, gcgrp, ipst))
		return (NULL);

	return (ire);
}

/*
 * Similar to ire_create except that it is called only when
 * we want to allocate the ire as an mblk, e.g. when we have an external
 * resolver like ARP.
 */
1445 */ 1446 ire_t * 1447 ire_create_mp(uchar_t *addr, uchar_t *mask, uchar_t *src_addr, uchar_t *gateway, 1448 uint_t max_frag, nce_t *src_nce, queue_t *rfq, queue_t *stq, ushort_t type, 1449 ipif_t *ipif, ipaddr_t cmask, uint32_t phandle, uint32_t ihandle, 1450 uint32_t flags, const iulp_t *ulp_info, tsol_gc_t *gc, tsol_gcgrp_t *gcgrp, 1451 ip_stack_t *ipst) 1452 { 1453 ire_t *ire, *buf; 1454 ire_t *ret_ire; 1455 mblk_t *mp; 1456 size_t bufsize; 1457 frtn_t *frtnp; 1458 ill_t *ill; 1459 1460 bufsize = sizeof (ire_t) + sizeof (frtn_t); 1461 buf = kmem_alloc(bufsize, KM_NOSLEEP); 1462 if (buf == NULL) { 1463 ip1dbg(("ire_create_mp: alloc failed\n")); 1464 return (NULL); 1465 } 1466 frtnp = (frtn_t *)(buf + 1); 1467 frtnp->free_arg = (caddr_t)buf; 1468 frtnp->free_func = ire_freemblk; 1469 1470 /* 1471 * Allocate the new IRE. The ire created will hold a ref on 1472 * an nce_t after ire_nce_init, and this ref must either be 1473 * (a) transferred to the ire_cache entry created when ire_add_v4 1474 * is called after successful arp resolution, or, 1475 * (b) released, when arp resolution fails 1476 * Case (b) is handled in ire_freemblk() which will be called 1477 * when mp is freed as a result of failed arp. 1478 */ 1479 mp = esballoc((unsigned char *)buf, bufsize, BPRI_MED, frtnp); 1480 if (mp == NULL) { 1481 ip1dbg(("ire_create_mp: alloc failed\n")); 1482 kmem_free(buf, bufsize); 1483 return (NULL); 1484 } 1485 ire = (ire_t *)mp->b_rptr; 1486 mp->b_wptr = (uchar_t *)&ire[1]; 1487 1488 /* Start clean. */ 1489 *ire = ire_null; 1490 ire->ire_mp = mp; 1491 mp->b_datap->db_type = IRE_DB_TYPE; 1492 ire->ire_marks |= IRE_MARK_UNCACHED; 1493 1494 ret_ire = ire_init(ire, addr, mask, src_addr, gateway, NULL, src_nce, 1495 rfq, stq, type, ipif, cmask, phandle, ihandle, flags, ulp_info, gc, 1496 gcgrp, ipst); 1497 1498 ill = (ill_t *)(stq->q_ptr); 1499 if (ret_ire == NULL) { 1500 /* ire_freemblk needs these set */ 1501 ire->ire_stq_ifindex = ill->ill_phyint->phyint_ifindex; 1502 ire->ire_stackid = ipst->ips_netstack->netstack_stackid; 1503 ire->ire_ipst = ipst; 1504 freeb(ire->ire_mp); 1505 return (NULL); 1506 } 1507 ret_ire->ire_stq_ifindex = ill->ill_phyint->phyint_ifindex; 1508 ret_ire->ire_stackid = ipst->ips_netstack->netstack_stackid; 1509 ASSERT(ret_ire == ire); 1510 ASSERT(ret_ire->ire_ipst == ipst); 1511 /* 1512 * ire_max_frag is normally zero here and is atomically set 1513 * under the irebucket lock in ire_add_v[46] except for the 1514 * case of IRE_MARK_NOADD. In that event the the ire_max_frag 1515 * is non-zero here. 1516 */ 1517 ire->ire_max_frag = max_frag; 1518 return (ire); 1519 } 1520 1521 /* 1522 * ire_create is called to allocate and initialize a new IRE. 1523 * 1524 * NOTE : This is called as writer sometimes though not required 1525 * by this function. 
1526 */ 1527 ire_t * 1528 ire_create(uchar_t *addr, uchar_t *mask, uchar_t *src_addr, uchar_t *gateway, 1529 uint_t *max_fragp, nce_t *src_nce, queue_t *rfq, queue_t *stq, 1530 ushort_t type, ipif_t *ipif, ipaddr_t cmask, uint32_t phandle, 1531 uint32_t ihandle, uint32_t flags, const iulp_t *ulp_info, tsol_gc_t *gc, 1532 tsol_gcgrp_t *gcgrp, ip_stack_t *ipst) 1533 { 1534 ire_t *ire; 1535 ire_t *ret_ire; 1536 1537 ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP); 1538 if (ire == NULL) { 1539 ip1dbg(("ire_create: alloc failed\n")); 1540 return (NULL); 1541 } 1542 *ire = ire_null; 1543 1544 ret_ire = ire_init(ire, addr, mask, src_addr, gateway, max_fragp, 1545 src_nce, rfq, stq, type, ipif, cmask, phandle, ihandle, flags, 1546 ulp_info, gc, gcgrp, ipst); 1547 1548 if (ret_ire == NULL) { 1549 kmem_cache_free(ire_cache, ire); 1550 return (NULL); 1551 } 1552 ASSERT(ret_ire == ire); 1553 return (ire); 1554 } 1555 1556 1557 /* 1558 * Common to IPv4 and IPv6 1559 */ 1560 boolean_t 1561 ire_init_common(ire_t *ire, uint_t *max_fragp, nce_t *src_nce, queue_t *rfq, 1562 queue_t *stq, ushort_t type, ipif_t *ipif, uint32_t phandle, 1563 uint32_t ihandle, uint32_t flags, uchar_t ipversion, const iulp_t *ulp_info, 1564 tsol_gc_t *gc, tsol_gcgrp_t *gcgrp, ip_stack_t *ipst) 1565 { 1566 ire->ire_max_fragp = max_fragp; 1567 ire->ire_frag_flag |= (ipst->ips_ip_path_mtu_discovery) ? IPH_DF : 0; 1568 1569 #ifdef DEBUG 1570 if (ipif != NULL) { 1571 if (ipif->ipif_isv6) 1572 ASSERT(ipversion == IPV6_VERSION); 1573 else 1574 ASSERT(ipversion == IPV4_VERSION); 1575 } 1576 #endif /* DEBUG */ 1577 1578 /* 1579 * Create/initialize IRE security attribute only in Trusted mode; 1580 * if the passed in gc/gcgrp is non-NULL, we expect that the caller 1581 * has held a reference to it and will release it when this routine 1582 * returns a failure, otherwise we own the reference. We do this 1583 * prior to initializing the rest IRE fields. 1584 * 1585 * Don't allocate ire_gw_secattr for the resolver case to prevent 1586 * memory leak (in case of external resolution failure). We'll 1587 * allocate it after a successful external resolution, in ire_add(). 1588 * Note that ire->ire_mp != NULL here means this ire is headed 1589 * to an external resolver. 1590 */ 1591 if (is_system_labeled()) { 1592 if ((type & (IRE_LOCAL | IRE_LOOPBACK | IRE_BROADCAST | 1593 IRE_INTERFACE)) != 0) { 1594 /* release references on behalf of caller */ 1595 if (gc != NULL) 1596 GC_REFRELE(gc); 1597 if (gcgrp != NULL) 1598 GCGRP_REFRELE(gcgrp); 1599 } else if ((ire->ire_mp == NULL) && 1600 tsol_ire_init_gwattr(ire, ipversion, gc, gcgrp) != 0) { 1601 return (B_FALSE); 1602 } 1603 } 1604 1605 ire->ire_stq = stq; 1606 ire->ire_rfq = rfq; 1607 ire->ire_type = type; 1608 ire->ire_flags = RTF_UP | flags; 1609 ire->ire_ident = TICK_TO_MSEC(lbolt); 1610 bcopy(ulp_info, &ire->ire_uinfo, sizeof (iulp_t)); 1611 1612 ire->ire_tire_mark = ire->ire_ob_pkt_count + ire->ire_ib_pkt_count; 1613 ire->ire_last_used_time = lbolt; 1614 ire->ire_create_time = (uint32_t)gethrestime_sec(); 1615 1616 /* 1617 * If this IRE is an IRE_CACHE, inherit the handles from the 1618 * parent IREs. For others in the forwarding table, assign appropriate 1619 * new ones. 1620 * 1621 * The mutex protecting ire_handle is because ire_create is not always 1622 * called as a writer. 
1623 */ 1624 if (ire->ire_type & IRE_OFFSUBNET) { 1625 mutex_enter(&ipst->ips_ire_handle_lock); 1626 ire->ire_phandle = (uint32_t)ipst->ips_ire_handle++; 1627 mutex_exit(&ipst->ips_ire_handle_lock); 1628 } else if (ire->ire_type & IRE_INTERFACE) { 1629 mutex_enter(&ipst->ips_ire_handle_lock); 1630 ire->ire_ihandle = (uint32_t)ipst->ips_ire_handle++; 1631 mutex_exit(&ipst->ips_ire_handle_lock); 1632 } else if (ire->ire_type == IRE_CACHE) { 1633 ire->ire_phandle = phandle; 1634 ire->ire_ihandle = ihandle; 1635 } 1636 ire->ire_ipif = ipif; 1637 if (ipif != NULL) { 1638 ire->ire_ipif_seqid = ipif->ipif_seqid; 1639 ire->ire_ipif_ifindex = 1640 ipif->ipif_ill->ill_phyint->phyint_ifindex; 1641 ire->ire_zoneid = ipif->ipif_zoneid; 1642 } else { 1643 ire->ire_zoneid = GLOBAL_ZONEID; 1644 } 1645 ire->ire_ipversion = ipversion; 1646 mutex_init(&ire->ire_lock, NULL, MUTEX_DEFAULT, NULL); 1647 if (ipversion == IPV4_VERSION) { 1648 /* 1649 * IPv6 initializes the ire_nce in ire_add_v6, which expects 1650 * to find the ire_nce to be null when it is called. 1651 */ 1652 if (ire_nce_init(ire, src_nce) != 0) { 1653 /* some failure occurred. propagate error back */ 1654 return (B_FALSE); 1655 } 1656 } 1657 ire->ire_refcnt = 1; 1658 ire->ire_ipst = ipst; /* No netstack_hold */ 1659 ire->ire_trace_disable = B_FALSE; 1660 1661 return (B_TRUE); 1662 } 1663 1664 /* 1665 * This routine is called repeatedly by ipif_up to create broadcast IREs. 1666 * It is passed a pointer to a slot in an IRE pointer array into which to 1667 * place the pointer to the new IRE, if indeed we create one. If the 1668 * IRE corresponding to the address passed in would be a duplicate of an 1669 * existing one, we don't create the new one. irep is incremented before 1670 * return only if we do create a new IRE. (Always called as writer.) 1671 * 1672 * Note that with the "match_flags" parameter, we can match on either 1673 * a particular logical interface (MATCH_IRE_IPIF) or for all logical 1674 * interfaces for a given physical interface (MATCH_IRE_ILL). Currently, 1675 * we only create broadcast ire's on a per physical interface basis. If 1676 * someone is going to be mucking with logical interfaces, it is important 1677 * to call "ipif_check_bcast_ires()" to make sure that any change to a 1678 * logical interface will not cause critical broadcast IRE's to be deleted. 1679 */ 1680 ire_t ** 1681 ire_check_and_create_bcast(ipif_t *ipif, ipaddr_t addr, ire_t **irep, 1682 int match_flags) 1683 { 1684 ire_t *ire; 1685 uint64_t check_flags = IPIF_DEPRECATED | IPIF_NOLOCAL | IPIF_ANYCAST; 1686 boolean_t prefer; 1687 ill_t *ill = ipif->ipif_ill; 1688 ip_stack_t *ipst = ill->ill_ipst; 1689 1690 /* 1691 * No broadcast IREs for the LOOPBACK interface 1692 * or others such as point to point and IPIF_NOXMIT. 1693 */ 1694 if (!(ipif->ipif_flags & IPIF_BROADCAST) || 1695 (ipif->ipif_flags & IPIF_NOXMIT)) 1696 return (irep); 1697 1698 /* 1699 * If this new IRE would be a duplicate, only prefer it if one of 1700 * the following is true: 1701 * 1702 * 1. The existing one has IPIF_DEPRECATED|IPIF_LOCAL|IPIF_ANYCAST 1703 * set and the new one has all of those clear. 1704 * 1705 * 2. The existing one corresponds to an underlying ILL in an IPMP 1706 * group and the new one corresponds to an IPMP group interface. 
1707  */
1708 	if ((ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ipif,
1709 	    ipif->ipif_zoneid, NULL, match_flags, ipst)) != NULL) {
1710 		prefer = ((ire->ire_ipif->ipif_flags & check_flags) &&
1711 		    !(ipif->ipif_flags & check_flags)) ||
1712 		    (IS_UNDER_IPMP(ire->ire_ipif->ipif_ill) && IS_IPMP(ill));
1713 		if (!prefer) {
1714 			ire_refrele(ire);
1715 			return (irep);
1716 		}
1717 
1718 		/*
1719 		 * Bcast ires exist in pairs, so both have to be deleted.
1720 		 * Since we are running exclusively, we can make the assertion
1721 		 * below. The first ire has to be refrele'd since it was
1722 		 * ctable_lookup'd.
1723 		 */
1723 		ASSERT(IAM_WRITER_IPIF(ipif));
1724 		ASSERT(ire->ire_next->ire_addr == ire->ire_addr);
1725 		ire_delete(ire->ire_next);
1726 		ire_delete(ire);
1727 		ire_refrele(ire);
1728 	}
1729 	return (ire_create_bcast(ipif, addr, irep));
1730 }
1731 
1732 uint_t ip_loopback_mtu = IP_LOOPBACK_MTU;
1733 
1734 /*
1735  * This routine is called from ipif_check_bcast_ires and
1736  * ire_check_and_create_bcast. It leaves all the verifying and deleting to
1737  * those routines, so it always creates two bcast ires and chains them into
1738  * the ire array passed in.
 */
1739 ire_t **
1740 ire_create_bcast(ipif_t *ipif, ipaddr_t addr, ire_t **irep)
1741 {
1742 	ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
1743 	ill_t *ill = ipif->ipif_ill;
1744 
1745 	ASSERT(IAM_WRITER_IPIF(ipif));
1746 
1747 	if (IS_IPMP(ill)) {
1748 		/*
1749 		 * Broadcast IREs for the IPMP meta-interface use the
1750 		 * nominated broadcast interface to send and receive packets.
1751 		 * If there's no nominated interface, send the packets down to
1752 		 * the IPMP stub driver, which will discard them. If the
1753 		 * nominated broadcast interface changes, ill_refresh_bcast()
1754 		 * will refresh the broadcast IREs.
1755 		 */
1756 		if ((ill = ipmp_illgrp_cast_ill(ill->ill_grp)) == NULL)
1757 			ill = ipif->ipif_ill;
1758 	}
1759 
1760 	*irep++ = ire_create(
1761 	    (uchar_t *)&addr,			/* dest addr */
1762 	    (uchar_t *)&ip_g_all_ones,		/* mask */
1763 	    (uchar_t *)&ipif->ipif_src_addr,	/* source addr */
1764 	    NULL,				/* no gateway */
1765 	    &ipif->ipif_mtu,			/* max frag */
1766 	    NULL,				/* no src nce */
1767 	    ill->ill_rq,			/* recv-from queue */
1768 	    ill->ill_wq,			/* send-to queue */
1769 	    IRE_BROADCAST,
1770 	    ipif,
1771 	    0,
1772 	    0,
1773 	    0,
1774 	    0,
1775 	    &ire_uinfo_null,
1776 	    NULL,
1777 	    NULL,
1778 	    ipst);
1779 
1780 	*irep++ = ire_create(
1781 	    (uchar_t *)&addr,			/* dest address */
1782 	    (uchar_t *)&ip_g_all_ones,		/* mask */
1783 	    (uchar_t *)&ipif->ipif_src_addr,	/* source address */
1784 	    NULL,				/* no gateway */
1785 	    &ip_loopback_mtu,			/* max frag size */
1786 	    NULL,				/* no src_nce */
1787 	    ill->ill_rq,			/* recv-from queue */
1788 	    NULL,				/* no send-to queue */
1789 	    IRE_BROADCAST,			/* Needed for fanout in wput */
1790 	    ipif,
1791 	    0,
1792 	    0,
1793 	    0,
1794 	    0,
1795 	    &ire_uinfo_null,
1796 	    NULL,
1797 	    NULL,
1798 	    ipst);
1799 
1800 	return (irep);
1801 }
1802 
1803 /*
1804  * ire_walk routine to delete or update any IRE_CACHE that might contain
1805  * stale information.
1806  * The flags indicate which entries to delete or update.
1807  * Garbage collection is done separately using kmem alloc callbacks to
1808  * ip_trash_ire_reclaim.
1809  * Used for both IPv4 and IPv6. However, IPv6 only uses FLUSH_MTU_TIME
1810  * since other stale information is cleaned up using NUD.
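 *
 * Minimal sketch of a periodic caller (hypothetical; the flag names
 * and the ire_expire_arg_t fields are the ones consumed below):
 *
 *	ire_expire_arg_t iea;
 *
 *	iea.iea_flush_flag = FLUSH_ARP_TIME | FLUSH_MTU_TIME |
 *	    FLUSH_REDIRECT_TIME;
 *	iea.iea_ipst = ipst;
 *	ire_walk(ire_expire, (char *)&iea, ipst);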
1811 */ 1812 void 1813 ire_expire(ire_t *ire, char *arg) 1814 { 1815 ire_expire_arg_t *ieap = (ire_expire_arg_t *)(uintptr_t)arg; 1816 ill_t *stq_ill; 1817 int flush_flags = ieap->iea_flush_flag; 1818 ip_stack_t *ipst = ieap->iea_ipst; 1819 1820 if ((flush_flags & FLUSH_REDIRECT_TIME) && 1821 (ire->ire_flags & RTF_DYNAMIC)) { 1822 /* Make sure we delete the corresponding IRE_CACHE */ 1823 ip1dbg(("ire_expire: all redirects\n")); 1824 ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst); 1825 ire_delete(ire); 1826 atomic_dec_32(&ipst->ips_ip_redirect_cnt); 1827 return; 1828 } 1829 if (ire->ire_type != IRE_CACHE) 1830 return; 1831 1832 if (flush_flags & FLUSH_ARP_TIME) { 1833 /* 1834 * Remove all IRE_CACHE except IPv4 multicast ires. These 1835 * ires will be deleted by ip_trash_ire_reclaim_stack() 1836 * when system runs low in memory. 1837 * Verify that create time is more than ip_ire_arp_interval 1838 * milliseconds ago. 1839 */ 1840 1841 if (!(ire->ire_ipversion == IPV4_VERSION && 1842 CLASSD(ire->ire_addr)) && NCE_EXPIRED(ire->ire_nce, ipst)) { 1843 ire_delete(ire); 1844 return; 1845 } 1846 } 1847 1848 if (ipst->ips_ip_path_mtu_discovery && (flush_flags & FLUSH_MTU_TIME) && 1849 (ire->ire_ipif != NULL)) { 1850 /* Increase pmtu if it is less than the interface mtu */ 1851 mutex_enter(&ire->ire_lock); 1852 /* 1853 * If the ipif is a vni (whose mtu is 0, since it's virtual) 1854 * get the mtu from the sending interfaces' ipif 1855 */ 1856 if (IS_VNI(ire->ire_ipif->ipif_ill)) { 1857 stq_ill = ire->ire_stq->q_ptr; 1858 ire->ire_max_frag = MIN(stq_ill->ill_ipif->ipif_mtu, 1859 IP_MAXPACKET); 1860 } else { 1861 ire->ire_max_frag = MIN(ire->ire_ipif->ipif_mtu, 1862 IP_MAXPACKET); 1863 } 1864 ire->ire_frag_flag |= IPH_DF; 1865 mutex_exit(&ire->ire_lock); 1866 } 1867 } 1868 1869 /* 1870 * Return any local address. We use this to target ourselves 1871 * when the src address was specified as 'default'. 1872 * Preference for IRE_LOCAL entries. 1873 */ 1874 ire_t * 1875 ire_lookup_local(zoneid_t zoneid, ip_stack_t *ipst) 1876 { 1877 ire_t *ire; 1878 irb_t *irb; 1879 ire_t *maybe = NULL; 1880 int i; 1881 1882 for (i = 0; i < ipst->ips_ip_cache_table_size; i++) { 1883 irb = &ipst->ips_ip_cache_table[i]; 1884 if (irb->irb_ire == NULL) 1885 continue; 1886 rw_enter(&irb->irb_lock, RW_READER); 1887 for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { 1888 if ((ire->ire_marks & IRE_MARK_CONDEMNED) || 1889 (ire->ire_zoneid != zoneid && 1890 ire->ire_zoneid != ALL_ZONES)) 1891 continue; 1892 switch (ire->ire_type) { 1893 case IRE_LOOPBACK: 1894 if (maybe == NULL) { 1895 IRE_REFHOLD(ire); 1896 maybe = ire; 1897 } 1898 break; 1899 case IRE_LOCAL: 1900 if (maybe != NULL) { 1901 ire_refrele(maybe); 1902 } 1903 IRE_REFHOLD(ire); 1904 rw_exit(&irb->irb_lock); 1905 return (ire); 1906 } 1907 } 1908 rw_exit(&irb->irb_lock); 1909 } 1910 return (maybe); 1911 } 1912 1913 /* 1914 * If the specified IRE is associated with a particular ILL, return 1915 * that ILL pointer (May be called as writer.). 1916 * 1917 * NOTE : This is not a generic function that can be used always. 1918 * This function always returns the ill of the outgoing packets 1919 * if this ire is used. 1920 */ 1921 ill_t * 1922 ire_to_ill(const ire_t *ire) 1923 { 1924 ill_t *ill = NULL; 1925 1926 /* 1927 * 1) For an IRE_CACHE, ire_ipif is the one where it obtained 1928 * the source address from. ire_stq is the one where the 1929 * packets will be sent out on. We return that here. 
1930  *
1931  * 2) IRE_BROADCAST normally has a loopback and a non-loopback
1932  *    copy, and they always exist next to each other, with the loopback
1933  *    copy being the first one. If we are called on the non-loopback
1934  *    copy, return the ill pointed to by its ire_stq. If we are called
1935  *    on the loopback copy, we still return the ill pointed to by the
1936  *    next ire's ire_stq pointer, i.e. the one pointed to by the
1937  *    non-loopback copy. We don't want to use ire_ipif, as it might
1938  *    represent the source address (if we borrow source addresses for
1939  *    IRE_BROADCASTs in the future).
1940  *    However, if an interface is currently coming up, the above
1941  *    condition may not hold during that period, since the ires
1942  *    are added one at a time. Thus one of the pair could have been
1943  *    added and the other not yet added.
1944  * 3) For many other IREs (e.g., IRE_LOCAL), ire_rfq indicates the ill.
1945  * 4) For all others, return the ill pointed to by ire_ipif->ipif_ill.
1946  *    That handles IRE_LOOPBACK.
1947  */
1948 
1949 	if (ire->ire_type == IRE_CACHE) {
1950 		ill = (ill_t *)ire->ire_stq->q_ptr;
1951 	} else if (ire->ire_type == IRE_BROADCAST) {
1952 		if (ire->ire_stq != NULL) {
1953 			ill = (ill_t *)ire->ire_stq->q_ptr;
1954 		} else {
1955 			ire_t *ire_next;
1956 
1957 			ire_next = ire->ire_next;
1958 			if (ire_next != NULL &&
1959 			    ire_next->ire_type == IRE_BROADCAST &&
1960 			    ire_next->ire_addr == ire->ire_addr &&
1961 			    ire_next->ire_ipif == ire->ire_ipif) {
1962 				ill = (ill_t *)ire_next->ire_stq->q_ptr;
1963 			}
1964 		}
1965 	} else if (ire->ire_rfq != NULL) {
1966 		ill = ire->ire_rfq->q_ptr;
1967 	} else if (ire->ire_ipif != NULL) {
1968 		ill = ire->ire_ipif->ipif_ill;
1969 	}
1970 	return (ill);
1971 }
1972 
1973 /* Arrange to call the specified function for every IRE in the world. */
1974 void
1975 ire_walk(pfv_t func, void *arg, ip_stack_t *ipst)
1976 {
1977 	ire_walk_ipvers(func, arg, 0, ALL_ZONES, ipst);
1978 }
1979 
1980 void
1981 ire_walk_v4(pfv_t func, void *arg, zoneid_t zoneid, ip_stack_t *ipst)
1982 {
1983 	ire_walk_ipvers(func, arg, IPV4_VERSION, zoneid, ipst);
1984 }
1985 
1986 void
1987 ire_walk_v6(pfv_t func, void *arg, zoneid_t zoneid, ip_stack_t *ipst)
1988 {
1989 	ire_walk_ipvers(func, arg, IPV6_VERSION, zoneid, ipst);
1990 }
1991 
1992 /*
1993  * Walk a particular version. version == 0 means both v4 and v6.
1994  */
1995 static void
1996 ire_walk_ipvers(pfv_t func, void *arg, uchar_t vers, zoneid_t zoneid,
1997     ip_stack_t *ipst)
1998 {
1999 	if (vers != IPV6_VERSION) {
2000 		/*
2001 		 * The ip_forwarding_table variable doesn't matter for IPv4
2002 		 * since ire_walk_ill_tables uses ips_ip_ftable for IPv4.
2003 		 */
2004 		ire_walk_ill_tables(0, 0, func, arg, IP_MASK_TABLE_SIZE,
2005 		    0, NULL,
2006 		    ipst->ips_ip_cache_table_size, ipst->ips_ip_cache_table,
2007 		    NULL, zoneid, ipst);
2008 	}
2009 	if (vers != IPV4_VERSION) {
2010 		ire_walk_ill_tables(0, 0, func, arg, IP6_MASK_TABLE_SIZE,
2011 		    ipst->ips_ip6_ftable_hash_size,
2012 		    ipst->ips_ip_forwarding_table_v6,
2013 		    ipst->ips_ip6_cache_table_size,
2014 		    ipst->ips_ip_cache_table_v6, NULL, zoneid, ipst);
2015 	}
2016 }
2017 
2018 /*
2019  * Arrange to call the specified function for every IRE that matches the ill.
2020  */
2021 void
2022 ire_walk_ill(uint_t match_flags, uint_t ire_type, pfv_t func, void *arg,
2023     ill_t *ill)
2024 {
2025 	uchar_t vers = (ill->ill_isv6 ?
IPV6_VERSION : IPV4_VERSION); 2026 2027 ire_walk_ill_ipvers(match_flags, ire_type, func, arg, vers, ill); 2028 } 2029 2030 void 2031 ire_walk_ill_v4(uint_t match_flags, uint_t ire_type, pfv_t func, void *arg, 2032 ill_t *ill) 2033 { 2034 ire_walk_ill_ipvers(match_flags, ire_type, func, arg, IPV4_VERSION, 2035 ill); 2036 } 2037 2038 void 2039 ire_walk_ill_v6(uint_t match_flags, uint_t ire_type, pfv_t func, void *arg, 2040 ill_t *ill) 2041 { 2042 ire_walk_ill_ipvers(match_flags, ire_type, func, arg, IPV6_VERSION, 2043 ill); 2044 } 2045 2046 /* 2047 * Walk a particular ill and version. 2048 */ 2049 static void 2050 ire_walk_ill_ipvers(uint_t match_flags, uint_t ire_type, pfv_t func, 2051 void *arg, uchar_t vers, ill_t *ill) 2052 { 2053 ip_stack_t *ipst = ill->ill_ipst; 2054 2055 if (vers == IPV4_VERSION) { 2056 ire_walk_ill_tables(match_flags, ire_type, func, arg, 2057 IP_MASK_TABLE_SIZE, 0, 2058 NULL, ipst->ips_ip_cache_table_size, 2059 ipst->ips_ip_cache_table, ill, ALL_ZONES, ipst); 2060 } else if (vers == IPV6_VERSION) { 2061 ire_walk_ill_tables(match_flags, ire_type, func, arg, 2062 IP6_MASK_TABLE_SIZE, ipst->ips_ip6_ftable_hash_size, 2063 ipst->ips_ip_forwarding_table_v6, 2064 ipst->ips_ip6_cache_table_size, 2065 ipst->ips_ip_cache_table_v6, ill, ALL_ZONES, ipst); 2066 } 2067 } 2068 2069 boolean_t 2070 ire_walk_ill_match(uint_t match_flags, uint_t ire_type, ire_t *ire, 2071 ill_t *ill, zoneid_t zoneid, ip_stack_t *ipst) 2072 { 2073 ill_t *ire_stq_ill = NULL; 2074 ill_t *ire_ipif_ill = NULL; 2075 2076 ASSERT(match_flags != 0 || zoneid != ALL_ZONES); 2077 /* 2078 * MATCH_IRE_ILL: We match both on ill pointed by ire_stq and 2079 * ire_ipif. Only in the case of IRE_CACHEs can ire_stq and 2080 * ire_ipif be pointing to different ills. But we want to keep 2081 * this function generic enough for future use. So, we always 2082 * try to match on both. The only caller of this function 2083 * ire_walk_ill_tables, will call "func" after we return from 2084 * this function. We expect "func" to do the right filtering 2085 * of ires in this case. 2086 */ 2087 if (match_flags & MATCH_IRE_ILL) { 2088 if (ire->ire_stq != NULL) 2089 ire_stq_ill = ire->ire_stq->q_ptr; 2090 if (ire->ire_ipif != NULL) 2091 ire_ipif_ill = ire->ire_ipif->ipif_ill; 2092 } 2093 2094 if (zoneid != ALL_ZONES) { 2095 /* 2096 * We're walking the IREs for a specific zone. The only relevant 2097 * IREs are: 2098 * - all IREs with a matching ire_zoneid 2099 * - all IRE_OFFSUBNETs as they're shared across all zones 2100 * - IRE_INTERFACE IREs for interfaces with a usable source addr 2101 * with a matching zone 2102 * - IRE_DEFAULTs with a gateway reachable from the zone 2103 * We should really match on IRE_OFFSUBNETs and IRE_DEFAULTs 2104 * using the same rule; but the above rules are consistent with 2105 * the behavior of ire_ftable_lookup[_v6]() so that all the 2106 * routes that can be matched during lookup are also matched 2107 * here. 2108 */ 2109 if (zoneid != ire->ire_zoneid && ire->ire_zoneid != ALL_ZONES) { 2110 /* 2111 * Note, IRE_INTERFACE can have the stq as NULL. For 2112 * example, if the default multicast route is tied to 2113 * the loopback address. 
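 *
 * Illustrative example (hypothetical zone id): when walking zone 3,
 * an IRE_LOCAL with ire_zoneid of 3 or ALL_ZONES matches, while an
 * IRE_DEFAULT only matches if the gateway lookup below finds an
 * IRE_INTERFACE usable from zone 3; that is, the walk only reports
 * default routes that a lookup from that zone could actually have
 * selected.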
2114 */ 2115 if ((ire->ire_type & IRE_INTERFACE) && 2116 (ire->ire_stq != NULL)) { 2117 ire_stq_ill = (ill_t *)ire->ire_stq->q_ptr; 2118 if (ire->ire_ipversion == IPV4_VERSION) { 2119 if (!ipif_usesrc_avail(ire_stq_ill, 2120 zoneid)) 2121 /* No usable src addr in zone */ 2122 return (B_FALSE); 2123 } else if (ire_stq_ill->ill_usesrc_ifindex 2124 != 0) { 2125 /* 2126 * For IPv6 use ipif_select_source_v6() 2127 * so the right scope selection is done 2128 */ 2129 ipif_t *src_ipif; 2130 src_ipif = 2131 ipif_select_source_v6(ire_stq_ill, 2132 &ire->ire_addr_v6, B_FALSE, 2133 IPV6_PREFER_SRC_DEFAULT, 2134 zoneid); 2135 if (src_ipif != NULL) { 2136 ipif_refrele(src_ipif); 2137 } else { 2138 return (B_FALSE); 2139 } 2140 } else { 2141 return (B_FALSE); 2142 } 2143 2144 } else if (!(ire->ire_type & IRE_OFFSUBNET)) { 2145 return (B_FALSE); 2146 } 2147 } 2148 2149 /* 2150 * Match all default routes from the global zone, irrespective 2151 * of reachability. For a non-global zone only match those 2152 * where ire_gateway_addr has a IRE_INTERFACE for the zoneid. 2153 */ 2154 if (ire->ire_type == IRE_DEFAULT && zoneid != GLOBAL_ZONEID) { 2155 int ire_match_flags = 0; 2156 in6_addr_t gw_addr_v6; 2157 ire_t *rire; 2158 2159 ire_match_flags |= MATCH_IRE_TYPE; 2160 if (ire->ire_ipif != NULL) 2161 ire_match_flags |= MATCH_IRE_ILL; 2162 2163 if (ire->ire_ipversion == IPV4_VERSION) { 2164 rire = ire_route_lookup(ire->ire_gateway_addr, 2165 0, 0, IRE_INTERFACE, ire->ire_ipif, NULL, 2166 zoneid, NULL, ire_match_flags, ipst); 2167 } else { 2168 ASSERT(ire->ire_ipversion == IPV6_VERSION); 2169 mutex_enter(&ire->ire_lock); 2170 gw_addr_v6 = ire->ire_gateway_addr_v6; 2171 mutex_exit(&ire->ire_lock); 2172 rire = ire_route_lookup_v6(&gw_addr_v6, 2173 NULL, NULL, IRE_INTERFACE, ire->ire_ipif, 2174 NULL, zoneid, NULL, ire_match_flags, ipst); 2175 } 2176 if (rire == NULL) { 2177 return (B_FALSE); 2178 } 2179 ire_refrele(rire); 2180 } 2181 } 2182 2183 if (((!(match_flags & MATCH_IRE_TYPE)) || 2184 (ire->ire_type & ire_type)) && 2185 ((!(match_flags & MATCH_IRE_ILL)) || 2186 (ire_stq_ill == ill || ire_ipif_ill == ill || 2187 ire_ipif_ill != NULL && IS_IN_SAME_ILLGRP(ire_ipif_ill, ill)))) { 2188 return (B_TRUE); 2189 } 2190 return (B_FALSE); 2191 } 2192 2193 int 2194 rtfunc(struct radix_node *rn, void *arg) 2195 { 2196 struct rtfuncarg *rtf = arg; 2197 struct rt_entry *rt; 2198 irb_t *irb; 2199 ire_t *ire; 2200 boolean_t ret; 2201 2202 rt = (struct rt_entry *)rn; 2203 ASSERT(rt != NULL); 2204 irb = &rt->rt_irb; 2205 for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { 2206 if ((rtf->rt_match_flags != 0) || 2207 (rtf->rt_zoneid != ALL_ZONES)) { 2208 ret = ire_walk_ill_match(rtf->rt_match_flags, 2209 rtf->rt_ire_type, ire, 2210 rtf->rt_ill, rtf->rt_zoneid, rtf->rt_ipst); 2211 } else 2212 ret = B_TRUE; 2213 if (ret) 2214 (*rtf->rt_func)(ire, rtf->rt_arg); 2215 } 2216 return (0); 2217 } 2218 2219 /* 2220 * Walk the ftable and the ctable entries that match the ill. 
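 *
 * Typical invocation sketch (hypothetical callback and argument
 * names): to visit every IRE_CACHE that goes out on 'ill', a caller
 * would provide a walker function and invoke the ill-based variant:
 *
 *	static void
 *	my_cache_func(ire_t *ire, char *arg)
 *	{
 *		... examine or condemn ire ...
 *	}
 *
 *	ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
 *	    my_cache_func, my_arg, ill);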
2221 */ 2222 void 2223 ire_walk_ill_tables(uint_t match_flags, uint_t ire_type, pfv_t func, 2224 void *arg, size_t ftbl_sz, size_t htbl_sz, irb_t **ipftbl, 2225 size_t ctbl_sz, irb_t *ipctbl, ill_t *ill, zoneid_t zoneid, 2226 ip_stack_t *ipst) 2227 { 2228 irb_t *irb_ptr; 2229 irb_t *irb; 2230 ire_t *ire; 2231 int i, j; 2232 boolean_t ret; 2233 struct rtfuncarg rtfarg; 2234 2235 ASSERT((!(match_flags & MATCH_IRE_ILL)) || (ill != NULL)); 2236 ASSERT(!(match_flags & MATCH_IRE_TYPE) || (ire_type != 0)); 2237 /* 2238 * Optimize by not looking at the forwarding table if there 2239 * is a MATCH_IRE_TYPE specified with no IRE_FORWARDTABLE 2240 * specified in ire_type. 2241 */ 2242 if (!(match_flags & MATCH_IRE_TYPE) || 2243 ((ire_type & IRE_FORWARDTABLE) != 0)) { 2244 /* knobs such that routine is called only for v6 case */ 2245 if (ipftbl == ipst->ips_ip_forwarding_table_v6) { 2246 for (i = (ftbl_sz - 1); i >= 0; i--) { 2247 if ((irb_ptr = ipftbl[i]) == NULL) 2248 continue; 2249 for (j = 0; j < htbl_sz; j++) { 2250 irb = &irb_ptr[j]; 2251 if (irb->irb_ire == NULL) 2252 continue; 2253 2254 IRB_REFHOLD(irb); 2255 for (ire = irb->irb_ire; ire != NULL; 2256 ire = ire->ire_next) { 2257 if (match_flags == 0 && 2258 zoneid == ALL_ZONES) { 2259 ret = B_TRUE; 2260 } else { 2261 ret = 2262 ire_walk_ill_match( 2263 match_flags, 2264 ire_type, ire, ill, 2265 zoneid, ipst); 2266 } 2267 if (ret) 2268 (*func)(ire, arg); 2269 } 2270 IRB_REFRELE(irb); 2271 } 2272 } 2273 } else { 2274 (void) memset(&rtfarg, 0, sizeof (rtfarg)); 2275 rtfarg.rt_func = func; 2276 rtfarg.rt_arg = arg; 2277 if (match_flags != 0) { 2278 rtfarg.rt_match_flags = match_flags; 2279 } 2280 rtfarg.rt_ire_type = ire_type; 2281 rtfarg.rt_ill = ill; 2282 rtfarg.rt_zoneid = zoneid; 2283 rtfarg.rt_ipst = ipst; /* No netstack_hold */ 2284 (void) ipst->ips_ip_ftable->rnh_walktree_mt( 2285 ipst->ips_ip_ftable, 2286 rtfunc, &rtfarg, irb_refhold_rn, irb_refrele_rn); 2287 } 2288 } 2289 2290 /* 2291 * Optimize by not looking at the cache table if there 2292 * is a MATCH_IRE_TYPE specified with no IRE_CACHETABLE 2293 * specified in ire_type. 2294 */ 2295 if (!(match_flags & MATCH_IRE_TYPE) || 2296 ((ire_type & IRE_CACHETABLE) != 0)) { 2297 for (i = 0; i < ctbl_sz; i++) { 2298 irb = &ipctbl[i]; 2299 if (irb->irb_ire == NULL) 2300 continue; 2301 IRB_REFHOLD(irb); 2302 for (ire = irb->irb_ire; ire != NULL; 2303 ire = ire->ire_next) { 2304 if (match_flags == 0 && zoneid == ALL_ZONES) { 2305 ret = B_TRUE; 2306 } else { 2307 ret = ire_walk_ill_match( 2308 match_flags, ire_type, 2309 ire, ill, zoneid, ipst); 2310 } 2311 if (ret) 2312 (*func)(ire, arg); 2313 } 2314 IRB_REFRELE(irb); 2315 } 2316 } 2317 } 2318 2319 /* 2320 * This function takes a mask and returns 2321 * number of bits set in the mask. If no 2322 * bit is set it returns 0. 2323 * Assumes a contiguous mask. 2324 */ 2325 int 2326 ip_mask_to_plen(ipaddr_t mask) 2327 { 2328 return (mask == 0 ? 0 : IP_ABITS - (ffs(ntohl(mask)) -1)); 2329 } 2330 2331 /* 2332 * Convert length for a mask to the mask. 2333 */ 2334 ipaddr_t 2335 ip_plen_to_mask(uint_t masklen) 2336 { 2337 return (htonl(IP_HOST_MASK << (IP_ABITS - masklen))); 2338 } 2339 2340 void 2341 ire_atomic_end(irb_t *irb_ptr, ire_t *ire) 2342 { 2343 ill_t *ill_list[NUM_ILLS]; 2344 ip_stack_t *ipst = ire->ire_ipst; 2345 2346 ill_list[0] = ire->ire_stq != NULL ? ire->ire_stq->q_ptr : NULL; 2347 ill_list[1] = ire->ire_ipif != NULL ? 
ire->ire_ipif->ipif_ill : NULL; 2348 ill_unlock_ills(ill_list, NUM_ILLS); 2349 rw_exit(&irb_ptr->irb_lock); 2350 rw_exit(&ipst->ips_ill_g_usesrc_lock); 2351 } 2352 2353 /* 2354 * ire_add_v[46] atomically make sure that the ipif or ill associated 2355 * with the new ire being added is stable and not IPIF_CHANGING or ILL_CHANGING 2356 * before adding the ire to the table. This ensures that we don't create 2357 * new IRE_CACHEs with stale values for parameters that are passed to 2358 * ire_create such as ire_max_frag. Note that ire_create() is passed a pointer 2359 * to the ipif_mtu, and not the value. The actual value is derived from the 2360 * parent ire or ipif under the bucket lock. 2361 */ 2362 int 2363 ire_atomic_start(irb_t *irb_ptr, ire_t *ire, queue_t *q, mblk_t *mp, 2364 ipsq_func_t func) 2365 { 2366 ill_t *stq_ill; 2367 ill_t *ipif_ill; 2368 ill_t *ill_list[NUM_ILLS]; 2369 int cnt = NUM_ILLS; 2370 int error = 0; 2371 ill_t *ill = NULL; 2372 ip_stack_t *ipst = ire->ire_ipst; 2373 2374 ill_list[0] = stq_ill = ire->ire_stq != 2375 NULL ? ire->ire_stq->q_ptr : NULL; 2376 ill_list[1] = ipif_ill = ire->ire_ipif != 2377 NULL ? ire->ire_ipif->ipif_ill : NULL; 2378 2379 ASSERT((q != NULL && mp != NULL && func != NULL) || 2380 (q == NULL && mp == NULL && func == NULL)); 2381 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER); 2382 GRAB_CONN_LOCK(q); 2383 rw_enter(&irb_ptr->irb_lock, RW_WRITER); 2384 ill_lock_ills(ill_list, cnt); 2385 2386 /* 2387 * While the IRE is in the process of being added, a user may have 2388 * invoked the ifconfig usesrc option on the stq_ill to make it a 2389 * usesrc client ILL. Check for this possibility here, if it is true 2390 * then we fail adding the IRE_CACHE. Another check is to make sure 2391 * that an ipif_ill of an IRE_CACHE being added is not part of a usesrc 2392 * group. The ill_g_usesrc_lock is released in ire_atomic_end 2393 */ 2394 if ((ire->ire_type & IRE_CACHE) && 2395 (ire->ire_marks & IRE_MARK_USESRC_CHECK)) { 2396 if (stq_ill->ill_usesrc_ifindex != 0) { 2397 ASSERT(stq_ill->ill_usesrc_grp_next != NULL); 2398 if ((ipif_ill->ill_phyint->phyint_ifindex != 2399 stq_ill->ill_usesrc_ifindex) || 2400 (ipif_ill->ill_usesrc_grp_next == NULL) || 2401 (ipif_ill->ill_usesrc_ifindex != 0)) { 2402 error = EINVAL; 2403 goto done; 2404 } 2405 } else if (ipif_ill->ill_usesrc_grp_next != NULL) { 2406 error = EINVAL; 2407 goto done; 2408 } 2409 } 2410 2411 /* 2412 * Don't allow IRE's to be created on changing ill's. Also, since 2413 * IPMP flags can be set on an ill without quiescing it, if we're not 2414 * a writer on stq_ill, check that the flags still allow IRE creation. 
2415 */ 2416 if ((stq_ill != NULL) && !IAM_WRITER_ILL(stq_ill)) { 2417 if (stq_ill->ill_state_flags & ILL_CHANGING) { 2418 ill = stq_ill; 2419 error = EAGAIN; 2420 } else if (IS_UNDER_IPMP(stq_ill)) { 2421 mutex_enter(&stq_ill->ill_phyint->phyint_lock); 2422 if (!ipmp_ill_is_active(stq_ill) && 2423 !(ire->ire_marks & IRE_MARK_TESTHIDDEN)) { 2424 error = EINVAL; 2425 } 2426 mutex_exit(&stq_ill->ill_phyint->phyint_lock); 2427 } 2428 if (error != 0) 2429 goto done; 2430 } 2431 2432 if ((ipif_ill != NULL) && !IAM_WRITER_ILL(ipif_ill) && 2433 (ipif_ill->ill_state_flags & ILL_CHANGING)) { 2434 ill = ipif_ill; 2435 error = EAGAIN; 2436 goto done; 2437 } 2438 2439 if ((ire->ire_ipif != NULL) && !IAM_WRITER_IPIF(ire->ire_ipif) && 2440 (ire->ire_ipif->ipif_state_flags & IPIF_CHANGING)) { 2441 ill = ire->ire_ipif->ipif_ill; 2442 ASSERT(ill != NULL); 2443 error = EAGAIN; 2444 goto done; 2445 } 2446 2447 done: 2448 if (error == EAGAIN && ILL_CAN_WAIT(ill, q)) { 2449 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; 2450 mutex_enter(&ipsq->ipsq_lock); 2451 mutex_enter(&ipsq->ipsq_xop->ipx_lock); 2452 ire_atomic_end(irb_ptr, ire); 2453 ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); 2454 mutex_exit(&ipsq->ipsq_xop->ipx_lock); 2455 mutex_exit(&ipsq->ipsq_lock); 2456 error = EINPROGRESS; 2457 } else if (error != 0) { 2458 ire_atomic_end(irb_ptr, ire); 2459 } 2460 2461 RELEASE_CONN_LOCK(q); 2462 return (error); 2463 } 2464 2465 /* 2466 * Add a fully initialized IRE to an appropriate table based on 2467 * ire_type. 2468 * 2469 * allow_unresolved == B_FALSE indicates a legacy code-path call 2470 * that has prohibited the addition of incomplete ire's. If this 2471 * parameter is set, and we find an nce that is in a state other 2472 * than ND_REACHABLE, we fail the add. Note that nce_state could be 2473 * something other than ND_REACHABLE if the nce had just expired and 2474 * the ire_create preceding the ire_add added a new ND_INITIAL nce. 2475 */ 2476 int 2477 ire_add(ire_t **irep, queue_t *q, mblk_t *mp, ipsq_func_t func, 2478 boolean_t allow_unresolved) 2479 { 2480 ire_t *ire1; 2481 ill_t *stq_ill = NULL; 2482 ill_t *ill; 2483 ipif_t *ipif = NULL; 2484 ill_walk_context_t ctx; 2485 ire_t *ire = *irep; 2486 int error; 2487 boolean_t ire_is_mblk = B_FALSE; 2488 tsol_gcgrp_t *gcgrp = NULL; 2489 tsol_gcgrp_addr_t ga; 2490 ip_stack_t *ipst = ire->ire_ipst; 2491 2492 /* get ready for the day when original ire is not created as mblk */ 2493 if (ire->ire_mp != NULL) { 2494 ire_is_mblk = B_TRUE; 2495 /* Copy the ire to a kmem_alloc'ed area */ 2496 ire1 = kmem_cache_alloc(ire_cache, KM_NOSLEEP); 2497 if (ire1 == NULL) { 2498 ip1dbg(("ire_add: alloc failed\n")); 2499 ire_delete(ire); 2500 *irep = NULL; 2501 return (ENOMEM); 2502 } 2503 ire->ire_marks &= ~IRE_MARK_UNCACHED; 2504 *ire1 = *ire; 2505 ire1->ire_mp = NULL; 2506 ire1->ire_stq_ifindex = 0; 2507 freeb(ire->ire_mp); 2508 ire = ire1; 2509 } 2510 if (ire->ire_stq != NULL) 2511 stq_ill = ire->ire_stq->q_ptr; 2512 2513 if (stq_ill != NULL && ire->ire_type == IRE_CACHE && 2514 stq_ill->ill_net_type == IRE_IF_RESOLVER) { 2515 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 2516 ill = ILL_START_WALK_ALL(&ctx, ipst); 2517 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 2518 mutex_enter(&ill->ill_lock); 2519 if (ill->ill_state_flags & ILL_CONDEMNED) { 2520 mutex_exit(&ill->ill_lock); 2521 continue; 2522 } 2523 /* 2524 * We need to make sure that the ipif is a valid one 2525 * before adding the IRE_CACHE. This happens only 2526 * with IRE_CACHE when there is an external resolver. 
2527 * 2528 * We can unplumb a logical interface while the 2529 * packet is waiting in ARP with the IRE. Then, 2530 * later on when we feed the IRE back, the ipif 2531 * has to be re-checked. This can't happen with 2532 * NDP currently, as we never queue the IRE with 2533 * the packet. We always try to recreate the IRE 2534 * when the resolution is completed. But, we do 2535 * it for IPv6 also here so that in future if 2536 * we have external resolvers, it will work without 2537 * any change. 2538 */ 2539 ipif = ipif_lookup_seqid(ill, ire->ire_ipif_seqid); 2540 if (ipif != NULL) { 2541 ipif_refhold_locked(ipif); 2542 mutex_exit(&ill->ill_lock); 2543 break; 2544 } 2545 mutex_exit(&ill->ill_lock); 2546 } 2547 rw_exit(&ipst->ips_ill_g_lock); 2548 if (ipif == NULL || 2549 (ipif->ipif_isv6 && 2550 !IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6) && 2551 !IN6_ARE_ADDR_EQUAL(&ire->ire_src_addr_v6, 2552 &ipif->ipif_v6src_addr)) || 2553 (!ipif->ipif_isv6 && 2554 ire->ire_src_addr != ipif->ipif_src_addr) || 2555 ire->ire_zoneid != ipif->ipif_zoneid) { 2556 if (ipif != NULL) 2557 ipif_refrele(ipif); 2558 ire->ire_ipif = NULL; 2559 ire_delete(ire); 2560 *irep = NULL; 2561 return (EINVAL); 2562 } 2563 2564 ASSERT(ill != NULL); 2565 2566 /* 2567 * Since we didn't attach label security attributes to the 2568 * ire for the resolver case, we need to add it now. (only 2569 * for v4 resolver and v6 xresolv case). 2570 */ 2571 if (is_system_labeled() && ire_is_mblk) { 2572 if (ire->ire_ipversion == IPV4_VERSION) { 2573 ga.ga_af = AF_INET; 2574 IN6_IPADDR_TO_V4MAPPED(ire->ire_gateway_addr != 2575 INADDR_ANY ? ire->ire_gateway_addr : 2576 ire->ire_addr, &ga.ga_addr); 2577 } else { 2578 ga.ga_af = AF_INET6; 2579 ga.ga_addr = IN6_IS_ADDR_UNSPECIFIED( 2580 &ire->ire_gateway_addr_v6) ? 2581 ire->ire_addr_v6 : 2582 ire->ire_gateway_addr_v6; 2583 } 2584 gcgrp = gcgrp_lookup(&ga, B_FALSE); 2585 error = tsol_ire_init_gwattr(ire, ire->ire_ipversion, 2586 NULL, gcgrp); 2587 if (error != 0) { 2588 if (gcgrp != NULL) { 2589 GCGRP_REFRELE(gcgrp); 2590 gcgrp = NULL; 2591 } 2592 ipif_refrele(ipif); 2593 ire->ire_ipif = NULL; 2594 ire_delete(ire); 2595 *irep = NULL; 2596 return (error); 2597 } 2598 } 2599 } 2600 2601 /* 2602 * In case ire was changed 2603 */ 2604 *irep = ire; 2605 if (ire->ire_ipversion == IPV6_VERSION) 2606 error = ire_add_v6(irep, q, mp, func); 2607 else 2608 error = ire_add_v4(irep, q, mp, func, allow_unresolved); 2609 if (ipif != NULL) 2610 ipif_refrele(ipif); 2611 return (error); 2612 } 2613 2614 /* 2615 * Add an initialized IRE to an appropriate table based on ire_type. 2616 * 2617 * The forward table contains IRE_PREFIX/IRE_HOST and 2618 * IRE_IF_RESOLVER/IRE_IF_NORESOLVER and IRE_DEFAULT. 2619 * 2620 * The cache table contains IRE_BROADCAST/IRE_LOCAL/IRE_LOOPBACK 2621 * and IRE_CACHE. 2622 * 2623 * NOTE : This function is called as writer though not required 2624 * by this function. 
2625 */ 2626 static int 2627 ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func, 2628 boolean_t allow_unresolved) 2629 { 2630 ire_t *ire1; 2631 irb_t *irb_ptr; 2632 ire_t **irep; 2633 int flags; 2634 ire_t *pire = NULL; 2635 ill_t *stq_ill; 2636 ire_t *ire = *ire_p; 2637 int error; 2638 boolean_t need_refrele = B_FALSE; 2639 nce_t *nce; 2640 ip_stack_t *ipst = ire->ire_ipst; 2641 uint_t marks = 0; 2642 2643 /* 2644 * IREs with source addresses hosted on interfaces that are under IPMP 2645 * should be hidden so that applications don't accidentally end up 2646 * sending packets with test addresses as their source addresses, or 2647 * sending out interfaces that are e.g. IFF_INACTIVE. Hide them here. 2648 */ 2649 if (ire->ire_ipif != NULL && IS_UNDER_IPMP(ire->ire_ipif->ipif_ill)) 2650 marks |= IRE_MARK_TESTHIDDEN; 2651 2652 if (ire->ire_ipif != NULL) 2653 ASSERT(!MUTEX_HELD(&ire->ire_ipif->ipif_ill->ill_lock)); 2654 if (ire->ire_stq != NULL) 2655 ASSERT(!MUTEX_HELD( 2656 &((ill_t *)(ire->ire_stq->q_ptr))->ill_lock)); 2657 ASSERT(ire->ire_ipversion == IPV4_VERSION); 2658 ASSERT(ire->ire_mp == NULL); /* Calls should go through ire_add */ 2659 2660 /* Find the appropriate list head. */ 2661 switch (ire->ire_type) { 2662 case IRE_HOST: 2663 ire->ire_mask = IP_HOST_MASK; 2664 ire->ire_masklen = IP_ABITS; 2665 ire->ire_marks |= marks; 2666 if ((ire->ire_flags & RTF_SETSRC) == 0) 2667 ire->ire_src_addr = 0; 2668 break; 2669 case IRE_CACHE: 2670 ire->ire_mask = IP_HOST_MASK; 2671 ire->ire_masklen = IP_ABITS; 2672 ire->ire_marks |= marks; 2673 break; 2674 case IRE_BROADCAST: 2675 case IRE_LOCAL: 2676 case IRE_LOOPBACK: 2677 ire->ire_mask = IP_HOST_MASK; 2678 ire->ire_masklen = IP_ABITS; 2679 break; 2680 case IRE_PREFIX: 2681 case IRE_DEFAULT: 2682 ire->ire_marks |= marks; 2683 if ((ire->ire_flags & RTF_SETSRC) == 0) 2684 ire->ire_src_addr = 0; 2685 break; 2686 case IRE_IF_RESOLVER: 2687 case IRE_IF_NORESOLVER: 2688 ire->ire_marks |= marks; 2689 break; 2690 default: 2691 ip0dbg(("ire_add_v4: ire %p has unrecognized IRE type (%d)\n", 2692 (void *)ire, ire->ire_type)); 2693 ire_delete(ire); 2694 *ire_p = NULL; 2695 return (EINVAL); 2696 } 2697 2698 /* Make sure the address is properly masked. */ 2699 ire->ire_addr &= ire->ire_mask; 2700 2701 /* 2702 * ip_newroute/ip_newroute_multi are unable to prevent the deletion 2703 * of the interface route while adding an IRE_CACHE for an on-link 2704 * destination in the IRE_IF_RESOLVER case, since the ire has to 2705 * go to ARP and return. We can't do a REFHOLD on the 2706 * associated interface ire for fear of ARP freeing the message. 2707 * Here we look up the interface ire in the forwarding table and 2708 * make sure that the interface route has not been deleted. 2709 */ 2710 if (ire->ire_type == IRE_CACHE && ire->ire_gateway_addr == 0 && 2711 ((ill_t *)ire->ire_stq->q_ptr)->ill_net_type == IRE_IF_RESOLVER) { 2712 2713 ASSERT(ire->ire_max_fragp == NULL); 2714 if (CLASSD(ire->ire_addr) && !(ire->ire_flags & RTF_SETSRC)) { 2715 /* 2716 * The ihandle that we used in ip_newroute_multi 2717 * comes from the interface route corresponding 2718 * to ire_ipif. Lookup here to see if it exists 2719 * still. 2720 * If the ire has a source address assigned using 2721 * RTF_SETSRC, ire_ipif is the logical interface holding 2722 * this source address, so we can't use it to check for 2723 * the existence of the interface route. Instead we rely 2724 * on the brute force ihandle search in 2725 * ire_ihandle_lookup_onlink() below. 
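 *
 * Concrete illustration (hypothetical addresses): when an IRE_CACHE
 * for the on-link destination 10.0.0.5 is added over a resolver
 * interface route for 10.0.0.0/24, the lookup below must still find
 * that interface route with a matching ire_ihandle. If the interface
 * route was deleted while the packet waited in ARP, the add fails
 * with EINVAL rather than inserting a cache entry that references a
 * vanished interface route.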
2726 */ 2727 pire = ipif_to_ire(ire->ire_ipif); 2728 if (pire == NULL) { 2729 ire_delete(ire); 2730 *ire_p = NULL; 2731 return (EINVAL); 2732 } else if (pire->ire_ihandle != ire->ire_ihandle) { 2733 ire_refrele(pire); 2734 ire_delete(ire); 2735 *ire_p = NULL; 2736 return (EINVAL); 2737 } 2738 } else { 2739 pire = ire_ihandle_lookup_onlink(ire); 2740 if (pire == NULL) { 2741 ire_delete(ire); 2742 *ire_p = NULL; 2743 return (EINVAL); 2744 } 2745 } 2746 /* Prevent pire from getting deleted */ 2747 IRB_REFHOLD(pire->ire_bucket); 2748 /* Has it been removed already ? */ 2749 if (pire->ire_marks & IRE_MARK_CONDEMNED) { 2750 IRB_REFRELE(pire->ire_bucket); 2751 ire_refrele(pire); 2752 ire_delete(ire); 2753 *ire_p = NULL; 2754 return (EINVAL); 2755 } 2756 } else { 2757 ASSERT(ire->ire_max_fragp != NULL); 2758 } 2759 flags = (MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW); 2760 2761 if (ire->ire_ipif != NULL) { 2762 /* 2763 * We use MATCH_IRE_IPIF while adding IRE_CACHES only 2764 * for historic reasons and to maintain symmetry with 2765 * IPv6 code path. Historically this was used by 2766 * multicast code to create multiple IRE_CACHES on 2767 * a single ill with different ipifs. This was used 2768 * so that multicast packets leaving the node had the 2769 * right source address. This is no longer needed as 2770 * ip_wput initializes the address correctly. 2771 */ 2772 flags |= MATCH_IRE_IPIF; 2773 /* 2774 * If we are creating a hidden IRE, make sure we search for 2775 * hidden IREs when searching for duplicates below. 2776 * Otherwise, we might find an IRE on some other interface 2777 * that's not marked hidden. 2778 */ 2779 if (ire->ire_marks & IRE_MARK_TESTHIDDEN) 2780 flags |= MATCH_IRE_MARK_TESTHIDDEN; 2781 } 2782 if ((ire->ire_type & IRE_CACHETABLE) == 0) { 2783 irb_ptr = ire_get_bucket(ire); 2784 need_refrele = B_TRUE; 2785 if (irb_ptr == NULL) { 2786 /* 2787 * This assumes that the ire has not added 2788 * a reference to the ipif. 2789 */ 2790 ire->ire_ipif = NULL; 2791 ire_delete(ire); 2792 if (pire != NULL) { 2793 IRB_REFRELE(pire->ire_bucket); 2794 ire_refrele(pire); 2795 } 2796 *ire_p = NULL; 2797 return (EINVAL); 2798 } 2799 } else { 2800 irb_ptr = &(ipst->ips_ip_cache_table[IRE_ADDR_HASH( 2801 ire->ire_addr, ipst->ips_ip_cache_table_size)]); 2802 } 2803 2804 /* 2805 * Start the atomic add of the ire. Grab the ill locks, 2806 * ill_g_usesrc_lock and the bucket lock. Check for condemned 2807 * 2808 * If ipif or ill is changing ire_atomic_start() may queue the 2809 * request and return EINPROGRESS. 2810 * To avoid lock order problems, get the ndp4->ndp_g_lock. 2811 */ 2812 mutex_enter(&ipst->ips_ndp4->ndp_g_lock); 2813 error = ire_atomic_start(irb_ptr, ire, q, mp, func); 2814 if (error != 0) { 2815 mutex_exit(&ipst->ips_ndp4->ndp_g_lock); 2816 /* 2817 * We don't know whether it is a valid ipif or not. 2818 * So, set it to NULL. This assumes that the ire has not added 2819 * a reference to the ipif. 2820 */ 2821 ire->ire_ipif = NULL; 2822 ire_delete(ire); 2823 if (pire != NULL) { 2824 IRB_REFRELE(pire->ire_bucket); 2825 ire_refrele(pire); 2826 } 2827 *ire_p = NULL; 2828 if (need_refrele) 2829 IRB_REFRELE(irb_ptr); 2830 return (error); 2831 } 2832 /* 2833 * To avoid creating ires having stale values for the ire_max_frag 2834 * we get the latest value atomically here. 
For more details,
2835  * see the block comment in ip_sioctl_mtu and in DL_NOTE_SDU_CHANGE
2836  * in ip_rput_dlpi_writer.
2837 	 */
2838 	if (ire->ire_max_fragp == NULL) {
2839 		if (CLASSD(ire->ire_addr))
2840 			ire->ire_max_frag = ire->ire_ipif->ipif_mtu;
2841 		else
2842 			ire->ire_max_frag = pire->ire_max_frag;
2843 	} else {
2844 		uint_t max_frag;
2845 
2846 		max_frag = *ire->ire_max_fragp;
2847 		ire->ire_max_fragp = NULL;
2848 		ire->ire_max_frag = max_frag;
2849 	}
2850 	/*
2851 	 * Atomically check for duplicate and insert in the table.
2852 	 */
2853 	for (ire1 = irb_ptr->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
2854 		if (ire1->ire_marks & IRE_MARK_CONDEMNED)
2855 			continue;
2856 		if (ire->ire_ipif != NULL) {
2857 			/*
2858 			 * We do MATCH_IRE_ILL implicitly here for IREs
2859 			 * with a non-null ire_ipif, including IRE_CACHEs.
2860 			 * As ire_ipif and ire_stq could point to two
2861 			 * different ills, we can't pass just ire_ipif to
2862 			 * ire_match_args and get a match on both ills.
2863 			 * This is just needed for duplicate checks here and
2864 			 * so we don't add an extra argument to
2865 			 * ire_match_args for this. Do it locally.
2866 			 *
2867 			 * NOTE : Currently there is no part of the code
2868 			 * that asks for both MATCH_IRE_IPIF and MATCH_IRE_ILL
2869 			 * match for IRE_CACHEs. Thus we don't want to
2870 			 * extend the arguments to ire_match_args.
2871 			 */
2872 			if (ire1->ire_stq != ire->ire_stq)
2873 				continue;
2874 			/*
2875 			 * Multiroute IRE_CACHEs for a given destination can
2876 			 * have the same ire_ipif, typically if their source
2877 			 * address is forced using RTF_SETSRC, and the same
2878 			 * send-to queue. We differentiate them using the parent
2879 			 * handle.
2880 			 */
2881 			if (ire->ire_type == IRE_CACHE &&
2882 			    (ire1->ire_flags & RTF_MULTIRT) &&
2883 			    (ire->ire_flags & RTF_MULTIRT) &&
2884 			    (ire1->ire_phandle != ire->ire_phandle))
2885 				continue;
2886 		}
2887 		if (ire1->ire_zoneid != ire->ire_zoneid)
2888 			continue;
2889 		if (ire_match_args(ire1, ire->ire_addr, ire->ire_mask,
2890 		    ire->ire_gateway_addr, ire->ire_type, ire->ire_ipif,
2891 		    ire->ire_zoneid, 0, NULL, flags, NULL)) {
2892 			/*
2893 			 * Return the old ire after doing a REFHOLD.
2894 			 * As most of the callers continue to use the IRE
2895 			 * after adding, we return a held ire. This will
2896 			 * avoid an extra lookup in the caller. If the callers
2897 			 * don't want to use it, they need to do a REFRELE.
2898 			 */
2899 			ip1dbg(("found dup ire existing %p new %p\n",
2900 			    (void *)ire1, (void *)ire));
2901 			IRE_REFHOLD(ire1);
2902 			ire_atomic_end(irb_ptr, ire);
2903 			mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
2904 			ire_delete(ire);
2905 			if (pire != NULL) {
2906 				/*
2907 				 * Assert that it is not removed from the
2908 				 * list yet.
2909 				 */
2910 				ASSERT(pire->ire_ptpn != NULL);
2911 				IRB_REFRELE(pire->ire_bucket);
2912 				ire_refrele(pire);
2913 			}
2914 			*ire_p = ire1;
2915 			if (need_refrele)
2916 				IRB_REFRELE(irb_ptr);
2917 			return (0);
2918 		}
2919 	}
2920 
2921 	if (ire->ire_type & IRE_CACHE) {
2922 		ASSERT(ire->ire_stq != NULL);
2923 		nce = ndp_lookup_v4(ire_to_ill(ire),
2924 		    ((ire->ire_gateway_addr != INADDR_ANY) ?
2925 		    &ire->ire_gateway_addr : &ire->ire_addr),
2926 		    B_TRUE);
2927 		if (nce != NULL)
2928 			mutex_enter(&nce->nce_lock);
2929 		/*
2930 		 * If the nce is NCE_F_CONDEMNED, or if it is not ND_REACHABLE
2931 		 * and the caller has prohibited the addition of incomplete
2932 		 * ire's, we fail the add. Note that nce_state could be
2933 		 * something other than ND_REACHABLE if the nce had
2934 		 * just expired and the ire_create preceding the
2935 		 * ire_add added a new ND_INITIAL nce.
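 *
 * Stated as a predicate, the test coded below is:
 *
 *	fail_add = (nce == NULL) ||
 *	    (nce->nce_flags & NCE_F_CONDEMNED) ||
 *	    (!allow_unresolved && nce->nce_state != ND_REACHABLE);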
2936 */ 2937 if ((nce == NULL) || 2938 (nce->nce_flags & NCE_F_CONDEMNED) || 2939 (!allow_unresolved && 2940 (nce->nce_state != ND_REACHABLE))) { 2941 if (nce != NULL) { 2942 DTRACE_PROBE1(ire__bad__nce, nce_t *, nce); 2943 mutex_exit(&nce->nce_lock); 2944 } 2945 ire_atomic_end(irb_ptr, ire); 2946 mutex_exit(&ipst->ips_ndp4->ndp_g_lock); 2947 if (nce != NULL) 2948 NCE_REFRELE(nce); 2949 DTRACE_PROBE1(ire__no__nce, ire_t *, ire); 2950 ire_delete(ire); 2951 if (pire != NULL) { 2952 IRB_REFRELE(pire->ire_bucket); 2953 ire_refrele(pire); 2954 } 2955 *ire_p = NULL; 2956 if (need_refrele) 2957 IRB_REFRELE(irb_ptr); 2958 return (EINVAL); 2959 } else { 2960 ire->ire_nce = nce; 2961 mutex_exit(&nce->nce_lock); 2962 /* 2963 * We are associating this nce to the ire, so 2964 * change the nce ref taken in ndp_lookup_v4() from 2965 * NCE_REFHOLD to NCE_REFHOLD_NOTR 2966 */ 2967 NCE_REFHOLD_TO_REFHOLD_NOTR(ire->ire_nce); 2968 } 2969 } 2970 /* 2971 * Make it easy for ip_wput_ire() to hit multiple broadcast ires by 2972 * grouping identical addresses together on the hash chain. We do 2973 * this only for IRE_BROADCASTs as ip_wput_ire is currently interested 2974 * in such groupings only for broadcasts. 2975 * 2976 * Find the first entry that matches ire_addr. *irep will be null 2977 * if no match. 2978 * 2979 * Note: the loopback and non-loopback broadcast entries for an 2980 * interface MUST be added before any MULTIRT entries. 2981 */ 2982 irep = (ire_t **)irb_ptr; 2983 while ((ire1 = *irep) != NULL && ire->ire_addr != ire1->ire_addr) 2984 irep = &ire1->ire_next; 2985 if (ire->ire_type == IRE_BROADCAST && *irep != NULL) { 2986 /* 2987 * We found some ire (i.e *irep) with a matching addr. We 2988 * want to group ires with same addr. 2989 */ 2990 for (;;) { 2991 ire1 = *irep; 2992 if ((ire1->ire_next == NULL) || 2993 (ire1->ire_next->ire_addr != ire->ire_addr) || 2994 (ire1->ire_type != IRE_BROADCAST) || 2995 (ire1->ire_flags & RTF_MULTIRT) || 2996 (ire1->ire_ipif->ipif_ill->ill_grp == 2997 ire->ire_ipif->ipif_ill->ill_grp)) 2998 break; 2999 irep = &ire1->ire_next; 3000 } 3001 ASSERT(*irep != NULL); 3002 /* 3003 * The ire will be added before *irep, so 3004 * if irep is a MULTIRT ire, just break to 3005 * ire insertion code. 3006 */ 3007 if (((*irep)->ire_flags & RTF_MULTIRT) != 0) 3008 goto insert_ire; 3009 3010 irep = &((*irep)->ire_next); 3011 3012 /* 3013 * Either we have hit the end of the list or the address 3014 * did not match. 3015 */ 3016 while (*irep != NULL) { 3017 ire1 = *irep; 3018 if ((ire1->ire_addr != ire->ire_addr) || 3019 (ire1->ire_type != IRE_BROADCAST)) 3020 break; 3021 if (ire1->ire_ipif == ire->ire_ipif) { 3022 irep = &ire1->ire_next; 3023 break; 3024 } 3025 irep = &ire1->ire_next; 3026 } 3027 } else if (*irep != NULL) { 3028 /* 3029 * Find the last ire which matches ire_addr. 3030 * Needed to do tail insertion among entries with the same 3031 * ire_addr. 3032 */ 3033 while (ire->ire_addr == ire1->ire_addr) { 3034 irep = &ire1->ire_next; 3035 ire1 = *irep; 3036 if (ire1 == NULL) 3037 break; 3038 } 3039 } 3040 3041 insert_ire: 3042 /* Insert at *irep */ 3043 ire1 = *irep; 3044 if (ire1 != NULL) 3045 ire1->ire_ptpn = &ire->ire_next; 3046 ire->ire_next = ire1; 3047 /* Link the new one in. */ 3048 ire->ire_ptpn = irep; 3049 3050 /* 3051 * ire_walk routines de-reference ire_next without holding 3052 * a lock. 
Before we point to the new ire, we want to make
3053  * sure the store that sets the ire_next of the new ire
3054  * reaches global visibility, so that ire_walk routines
3055  * don't see a truncated list of ires, i.e., if the ire_next
3056  * of the new ire gets set after we do "*irep = ire" due
3057  * to re-ordering, the ire_walk thread will see a NULL
3058  * once it accesses the ire_next of the new ire.
3059  * membar_producer() makes sure that the following store
3060  * happens *after* all of the above stores.
3061  */
3062 	membar_producer();
3063 	*irep = ire;
3064 	ire->ire_bucket = irb_ptr;
3065 	/*
3066 	 * We return a bumped-up IRE above. Keep it symmetrical
3067 	 * so that the callers will always have to release. This
3068 	 * helps the callers of this function because they continue
3069 	 * to use the IRE after adding and hence they don't have to
3070 	 * look it up again after we return the IRE.
3071 	 *
3072 	 * NOTE : We don't have to use atomics, as this is appearing
3073 	 * in the list for the first time and no one else can bump
3074 	 * up the reference count on this yet.
3075 	 */
3076 	IRE_REFHOLD_LOCKED(ire);
3077 	BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_inserted);
3078 
3079 	irb_ptr->irb_ire_cnt++;
3080 	if (irb_ptr->irb_marks & IRB_MARK_FTABLE)
3081 		irb_ptr->irb_nire++;
3082 
3083 	if (ire->ire_marks & IRE_MARK_TEMPORARY)
3084 		irb_ptr->irb_tmp_ire_cnt++;
3085 
3086 	if (ire->ire_ipif != NULL) {
3087 		DTRACE_PROBE3(ipif__incr__cnt, (ipif_t *), ire->ire_ipif,
3088 		    (char *), "ire", (void *), ire);
3089 		ire->ire_ipif->ipif_ire_cnt++;
3090 		if (ire->ire_stq != NULL) {
3091 			stq_ill = (ill_t *)ire->ire_stq->q_ptr;
3092 			DTRACE_PROBE3(ill__incr__cnt, (ill_t *), stq_ill,
3093 			    (char *), "ire", (void *), ire);
3094 			stq_ill->ill_ire_cnt++;
3095 		}
3096 	} else {
3097 		ASSERT(ire->ire_stq == NULL);
3098 	}
3099 
3100 	ire_atomic_end(irb_ptr, ire);
3101 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3102 
3103 	if (pire != NULL) {
3104 		/* Assert that it is not removed from the list yet */
3105 		ASSERT(pire->ire_ptpn != NULL);
3106 		IRB_REFRELE(pire->ire_bucket);
3107 		ire_refrele(pire);
3108 	}
3109 
3110 	if (ire->ire_type != IRE_CACHE) {
3111 		/*
3112 		 * For IREs with a host mask, see if there is an entry
3113 		 * in the cache. If there is one, flush the whole cache, as
3114 		 * there might be multiple entries due to RTF_MULTIRT (CGTP).
3115 		 * If no entry is found, then there is no need to flush the
3116 		 * cache.
3117 		 */
3118 		if (ire->ire_mask == IP_HOST_MASK) {
3119 			ire_t *lire;
3120 			lire = ire_ctable_lookup(ire->ire_addr, NULL, IRE_CACHE,
3121 			    NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
3122 			if (lire != NULL) {
3123 				ire_refrele(lire);
3124 				ire_flush_cache_v4(ire, IRE_FLUSH_ADD);
3125 			}
3126 		} else {
3127 			ire_flush_cache_v4(ire, IRE_FLUSH_ADD);
3128 		}
3129 	}
3130 	/*
3131 	 * We had to delay the fast path probe until the ire is inserted
3132 	 * in the list. Otherwise the fast path ack won't find the ire in
3133 	 * the table.
3134 	 */
3135 	if (ire->ire_type == IRE_CACHE ||
3136 	    (ire->ire_type == IRE_BROADCAST && ire->ire_stq != NULL)) {
3137 		ASSERT(ire->ire_nce != NULL);
3138 		if (ire->ire_nce->nce_state == ND_REACHABLE)
3139 			nce_fastpath(ire->ire_nce);
3140 	}
3141 	if (ire->ire_ipif != NULL)
3142 		ASSERT(!MUTEX_HELD(&ire->ire_ipif->ipif_ill->ill_lock));
3143 	*ire_p = ire;
3144 	if (need_refrele) {
3145 		IRB_REFRELE(irb_ptr);
3146 	}
3147 	return (0);
3148 }
3149 
3150 /*
3151  * IRB_REFRELE is the only caller of this function. After ire_unlink has
3152  * removed the condemned ires from the bucket, this does the final cleanup.
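 *
 * Teardown path sketch (simplified; the exact sequencing lives in
 * the IRB_REFRELE macro in ip.h): the last walker to leave a
 * condemned bucket effectively runs
 *
 *	ire_list = ire_unlink(irb);	unlink under irb_lock
 *	rw_exit(&irb->irb_lock);
 *	ire_cleanup(ire_list);		per-ire delete + final REFRELE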
3153  */
3154 void
3155 ire_cleanup(ire_t *ire)
3156 {
3157 	ire_t *ire_next;
3158 	ip_stack_t *ipst = ire->ire_ipst;
3159 
3160 	ASSERT(ire != NULL);
3161 
3162 	while (ire != NULL) {
3163 		ire_next = ire->ire_next;
3164 		if (ire->ire_ipversion == IPV4_VERSION) {
3165 			ire_delete_v4(ire);
3166 			BUMP_IRE_STATS(ipst->ips_ire_stats_v4,
3167 			    ire_stats_deleted);
3168 		} else {
3169 			ASSERT(ire->ire_ipversion == IPV6_VERSION);
3170 			ire_delete_v6(ire);
3171 			BUMP_IRE_STATS(ipst->ips_ire_stats_v6,
3172 			    ire_stats_deleted);
3173 		}
3174 		/*
3175 		 * Now it's really out of the list. Before doing the
3176 		 * REFRELE, set ire_next to NULL, as ire_inactive asserts
3177 		 * that it is.
3178 		 */
3179 		ire->ire_next = NULL;
3180 		IRE_REFRELE_NOTR(ire);
3181 		ire = ire_next;
3182 	}
3183 }
3184 
3185 /*
3186  * IRB_REFRELE is the only caller of this function. It is called to unlink
3187  * all the CONDEMNED ires from this bucket.
3188  */
3189 ire_t *
3190 ire_unlink(irb_t *irb)
3191 {
3192 	ire_t *ire;
3193 	ire_t *ire1;
3194 	ire_t **ptpn;
3195 	ire_t *ire_list = NULL;
3196 
3197 	ASSERT(RW_WRITE_HELD(&irb->irb_lock));
3198 	ASSERT(((irb->irb_marks & IRB_MARK_FTABLE) && irb->irb_refcnt == 1) ||
3199 	    (irb->irb_refcnt == 0));
3200 	ASSERT(irb->irb_marks & IRB_MARK_CONDEMNED);
3201 	ASSERT(irb->irb_ire != NULL);
3202 
3203 	for (ire = irb->irb_ire; ire != NULL; ire = ire1) {
3204 		ip_stack_t *ipst = ire->ire_ipst;
3205 
3206 		ire1 = ire->ire_next;
3207 		if (ire->ire_marks & IRE_MARK_CONDEMNED) {
3208 			ptpn = ire->ire_ptpn;
3209 			ire1 = ire->ire_next;
3210 			if (ire1)
3211 				ire1->ire_ptpn = ptpn;
3212 			*ptpn = ire1;
3213 			ire->ire_ptpn = NULL;
3214 			ire->ire_next = NULL;
3215 			if (ire->ire_type == IRE_DEFAULT) {
3216 				/*
3217 				 * IRE is out of the list. We need to adjust
3218 				 * the accounting before the caller drops
3219 				 * the lock.
3220 				 */
3221 				if (ire->ire_ipversion == IPV6_VERSION) {
3222 					ASSERT(ipst->
3223 					    ips_ipv6_ire_default_count !=
3224 					    0);
3225 					ipst->ips_ipv6_ire_default_count--;
3226 				}
3227 			}
3228 			/*
3229 			 * We need to call ire_delete_v4 or ire_delete_v6
3230 			 * to clean up the cache or the redirects pointing at
3231 			 * the default gateway. We need to drop the lock
3232 			 * as ire_flush_cache/ire_delete_host_redirects require
3233 			 * so. But we can't drop the lock, as ire_unlink needs
3234 			 * to atomically remove the ires from the list.
3235 			 * So, create a temporary list of CONDEMNED ires
3236 			 * for doing ire_delete_v4/ire_delete_v6 operations
3237 			 * later on.
3238 			 */
3239 			ire->ire_next = ire_list;
3240 			ire_list = ire;
3241 		}
3242 	}
3243 	irb->irb_marks &= ~IRB_MARK_CONDEMNED;
3244 	return (ire_list);
3245 }
3246 
3247 /*
3248  * Delete all the cache entries with this 'addr'. When IP gets a gratuitous
3249  * ARP message on any of its interface queues, it scans the nce table and
3250  * calls ndp_delete() on the appropriate nce. This action
3251  * also deletes all the neighbor/ire cache entries for that address.
3252  * This function is called from ip_arp_news in ip.c and also for
3253  * ARP ioctl processing in ip_if.c. ip_ire_clookup_and_delete returns
3254  * true if it finds an nce entry, which is used by ip_arp_news to determine
3255  * if it needs to do an ire_walk_v4. The return value is also used for the
3256  * same purpose by ARP ioctl processing in ip_if.c when deleting
3257  * ARP entries. For SIOC*IFARP ioctls, in addition to the address,
3258  * ipif->ipif_ill also needs to be matched.
3259  */
3260 boolean_t
3261 ip_ire_clookup_and_delete(ipaddr_t addr, ipif_t *ipif, ip_stack_t *ipst)
3262 {
3263 	ill_t *ill;
3264 	nce_t *nce;
3265 
3266 	ill = (ipif ?
ipif->ipif_ill : NULL); 3267 3268 if (ill != NULL) { 3269 /* 3270 * clean up the nce (and any relevant ire's) that matches 3271 * on addr and ill. 3272 */ 3273 nce = ndp_lookup_v4(ill, &addr, B_FALSE); 3274 if (nce != NULL) { 3275 ndp_delete(nce); 3276 return (B_TRUE); 3277 } 3278 } else { 3279 /* 3280 * ill is wildcard. clean up all nce's and 3281 * ire's that match on addr 3282 */ 3283 nce_clookup_t cl; 3284 3285 cl.ncecl_addr = addr; 3286 cl.ncecl_found = B_FALSE; 3287 3288 ndp_walk_common(ipst->ips_ndp4, NULL, 3289 (pfi_t)ip_nce_clookup_and_delete, (uchar_t *)&cl, B_TRUE); 3290 3291 /* 3292 * ncecl_found would be set by ip_nce_clookup_and_delete if 3293 * we found a matching nce. 3294 */ 3295 return (cl.ncecl_found); 3296 } 3297 return (B_FALSE); 3298 3299 } 3300 3301 /* Delete the supplied nce if its nce_addr matches the supplied address */ 3302 static void 3303 ip_nce_clookup_and_delete(nce_t *nce, void *arg) 3304 { 3305 nce_clookup_t *cl = (nce_clookup_t *)arg; 3306 ipaddr_t nce_addr; 3307 3308 IN6_V4MAPPED_TO_IPADDR(&nce->nce_addr, nce_addr); 3309 if (nce_addr == cl->ncecl_addr) { 3310 cl->ncecl_found = B_TRUE; 3311 /* clean up the nce (and any relevant ire's) */ 3312 ndp_delete(nce); 3313 } 3314 } 3315 3316 /* 3317 * Clean up the radix node for this ire. Must be called by IRB_REFRELE 3318 * when there are no ire's left in the bucket. Returns TRUE if the bucket 3319 * is deleted and freed. 3320 */ 3321 boolean_t 3322 irb_inactive(irb_t *irb) 3323 { 3324 struct rt_entry *rt; 3325 struct radix_node *rn; 3326 ip_stack_t *ipst = irb->irb_ipst; 3327 3328 ASSERT(irb->irb_ipst != NULL); 3329 3330 rt = IRB2RT(irb); 3331 rn = (struct radix_node *)rt; 3332 3333 /* first remove it from the radix tree. */ 3334 RADIX_NODE_HEAD_WLOCK(ipst->ips_ip_ftable); 3335 rw_enter(&irb->irb_lock, RW_WRITER); 3336 if (irb->irb_refcnt == 1 && irb->irb_nire == 0) { 3337 rn = ipst->ips_ip_ftable->rnh_deladdr(rn->rn_key, rn->rn_mask, 3338 ipst->ips_ip_ftable); 3339 DTRACE_PROBE1(irb__free, rt_t *, rt); 3340 ASSERT((void *)rn == (void *)rt); 3341 Free(rt, rt_entry_cache); 3342 /* irb_lock is freed */ 3343 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 3344 return (B_TRUE); 3345 } 3346 rw_exit(&irb->irb_lock); 3347 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 3348 return (B_FALSE); 3349 } 3350 3351 /* 3352 * Delete the specified IRE. 3353 */ 3354 void 3355 ire_delete(ire_t *ire) 3356 { 3357 ire_t *ire1; 3358 ire_t **ptpn; 3359 irb_t *irb; 3360 ip_stack_t *ipst = ire->ire_ipst; 3361 3362 if ((irb = ire->ire_bucket) == NULL) { 3363 /* 3364 * It was never inserted in the list. Should call REFRELE 3365 * to free this IRE. 3366 */ 3367 IRE_REFRELE_NOTR(ire); 3368 return; 3369 } 3370 3371 rw_enter(&irb->irb_lock, RW_WRITER); 3372 3373 if (irb->irb_rr_origin == ire) { 3374 irb->irb_rr_origin = NULL; 3375 } 3376 3377 /* 3378 * In case of V4 we might still be waiting for fastpath ack. 3379 */ 3380 if (ire->ire_ipversion == IPV4_VERSION && 3381 (ire->ire_type == IRE_CACHE || 3382 (ire->ire_type == IRE_BROADCAST && ire->ire_stq != NULL))) { 3383 ASSERT(ire->ire_nce != NULL); 3384 nce_fastpath_list_delete(ire->ire_nce); 3385 } 3386 3387 if (ire->ire_ptpn == NULL) { 3388 /* 3389 * Some other thread has removed us from the list. 3390 * It should have done the REFRELE for us. 
3391  */
3392 		rw_exit(&irb->irb_lock);
3393 		return;
3394 	}
3395 
3396 	if (!(ire->ire_marks & IRE_MARK_CONDEMNED)) {
3397 		irb->irb_ire_cnt--;
3398 		ire->ire_marks |= IRE_MARK_CONDEMNED;
3399 		if (ire->ire_marks & IRE_MARK_TEMPORARY) {
3400 			irb->irb_tmp_ire_cnt--;
3401 			ire->ire_marks &= ~IRE_MARK_TEMPORARY;
3402 		}
3403 	}
3404 
3405 	if (irb->irb_refcnt != 0) {
3406 		/*
3407 		 * The last thread to leave this bucket will
3408 		 * delete this ire.
3409 		 */
3410 		irb->irb_marks |= IRB_MARK_CONDEMNED;
3411 		rw_exit(&irb->irb_lock);
3412 		return;
3413 	}
3414 
3415 	/*
3416 	 * Normally, to delete an ire, we walk the bucket. While we
3417 	 * walk the bucket, we normally bump up irb_refcnt and hence
3418 	 * we return from above, where we mark the ire CONDEMNED and it
3419 	 * gets deleted from ire_unlink. This case is where somebody
3420 	 * knows the ire, e.g. by doing a lookup, and wants to delete the
3421 	 * IRE. irb_refcnt would be 0 in this case if nobody is walking
3422 	 * the bucket.
3423 	 */
3424 	ptpn = ire->ire_ptpn;
3425 	ire1 = ire->ire_next;
3426 	if (ire1 != NULL)
3427 		ire1->ire_ptpn = ptpn;
3428 	ASSERT(ptpn != NULL);
3429 	*ptpn = ire1;
3430 	ire->ire_ptpn = NULL;
3431 	ire->ire_next = NULL;
3432 	if (ire->ire_ipversion == IPV6_VERSION) {
3433 		BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_deleted);
3434 	} else {
3435 		BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_deleted);
3436 	}
3437 	/*
3438 	 * ip_wput/ip_wput_v6 checks this flag to see whether
3439 	 * it should still use the cached ire or not.
3440 	 */
3441 	if (ire->ire_type == IRE_DEFAULT) {
3442 		/*
3443 		 * IRE is out of the list. We need to adjust the
3444 		 * accounting before we drop the lock.
3445 		 */
3446 		if (ire->ire_ipversion == IPV6_VERSION) {
3447 			ASSERT(ipst->ips_ipv6_ire_default_count != 0);
3448 			ipst->ips_ipv6_ire_default_count--;
3449 		}
3450 	}
3451 	rw_exit(&irb->irb_lock);
3452 
3453 	if (ire->ire_ipversion == IPV6_VERSION) {
3454 		ire_delete_v6(ire);
3455 	} else {
3456 		ire_delete_v4(ire);
3457 	}
3458 	/*
3459 	 * We removed it from the list. Decrement the
3460 	 * reference count.
3461 	 */
3462 	IRE_REFRELE_NOTR(ire);
3463 }
3464 
3465 /*
3466  * Delete the specified IRE.
3467  * All calls should use ire_delete().
3468  * Sometimes called as writer though not required by this function.
3469  *
3470  * NOTE : This function is called only if the ire was added
3471  * in the list.
3472  */
3473 static void
3474 ire_delete_v4(ire_t *ire)
3475 {
3476 	ip_stack_t *ipst = ire->ire_ipst;
3477 
3478 	ASSERT(ire->ire_refcnt >= 1);
3479 	ASSERT(ire->ire_ipversion == IPV4_VERSION);
3480 
3481 	if (ire->ire_type != IRE_CACHE)
3482 		ire_flush_cache_v4(ire, IRE_FLUSH_DELETE);
3483 	if (ire->ire_type == IRE_DEFAULT) {
3484 		/*
3485 		 * When a default gateway is going away,
3486 		 * delete all the host redirects pointing at that
3487 		 * gateway.
3488 		 */
3489 		ire_delete_host_redirects(ire->ire_gateway_addr, ipst);
3490 	}
3491 }
3492 
3493 /*
3494  * IRE_REFRELE/ire_refrele are the only callers of this function. It is
3495  * called to free the ire when the reference count goes to zero.
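 *
 * Reference-drop sketch (illustrative): a lookup returns a held ire,
 * and the matching release may be the one that takes ire_refcnt to
 * zero and lands here:
 *
 *	ire = ire_ctable_lookup(addr, 0, IRE_CACHE, NULL, zoneid,
 *	    NULL, MATCH_IRE_TYPE, ipst);
 *	if (ire != NULL)
 *		ire_refrele(ire);	last release ends up in ire_inactive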
3496 */ 3497 void 3498 ire_inactive(ire_t *ire) 3499 { 3500 nce_t *nce; 3501 ill_t *ill = NULL; 3502 ill_t *stq_ill = NULL; 3503 ipif_t *ipif; 3504 boolean_t need_wakeup = B_FALSE; 3505 irb_t *irb; 3506 ip_stack_t *ipst = ire->ire_ipst; 3507 3508 ASSERT(ire->ire_refcnt == 0); 3509 ASSERT(ire->ire_ptpn == NULL); 3510 ASSERT(ire->ire_next == NULL); 3511 3512 if (ire->ire_gw_secattr != NULL) { 3513 ire_gw_secattr_free(ire->ire_gw_secattr); 3514 ire->ire_gw_secattr = NULL; 3515 } 3516 3517 if (ire->ire_mp != NULL) { 3518 ASSERT(ire->ire_bucket == NULL); 3519 mutex_destroy(&ire->ire_lock); 3520 BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_freed); 3521 if (ire->ire_nce != NULL) 3522 NCE_REFRELE_NOTR(ire->ire_nce); 3523 freeb(ire->ire_mp); 3524 return; 3525 } 3526 3527 if ((nce = ire->ire_nce) != NULL) { 3528 NCE_REFRELE_NOTR(nce); 3529 ire->ire_nce = NULL; 3530 } 3531 3532 if (ire->ire_ipif == NULL) 3533 goto end; 3534 3535 ipif = ire->ire_ipif; 3536 ill = ipif->ipif_ill; 3537 3538 if (ire->ire_bucket == NULL) { 3539 /* The ire was never inserted in the table. */ 3540 goto end; 3541 } 3542 3543 /* 3544 * ipif_ire_cnt on this ipif goes down by 1. If the ire_stq is 3545 * non-null ill_ire_count also goes down by 1. 3546 * 3547 * The ipif that is associated with an ire is ire->ire_ipif and 3548 * hence when the ire->ire_ipif->ipif_ire_cnt drops to zero we call 3549 * ipif_ill_refrele_tail. Usually stq_ill is null or the same as 3550 * ire->ire_ipif->ipif_ill. So nothing more needs to be done. 3551 * However, for VNI or IPMP IRE entries, stq_ill can be different. 3552 * If this is different from ire->ire_ipif->ipif_ill and if the 3553 * ill_ire_cnt on the stq_ill also has dropped to zero, we call 3554 * ipif_ill_refrele_tail on the stq_ill. 3555 */ 3556 if (ire->ire_stq != NULL) 3557 stq_ill = ire->ire_stq->q_ptr; 3558 3559 if (stq_ill == NULL || stq_ill == ill) { 3560 /* Optimize the most common case */ 3561 mutex_enter(&ill->ill_lock); 3562 ASSERT(ipif->ipif_ire_cnt != 0); 3563 DTRACE_PROBE3(ipif__decr__cnt, (ipif_t *), ipif, 3564 (char *), "ire", (void *), ire); 3565 ipif->ipif_ire_cnt--; 3566 if (IPIF_DOWN_OK(ipif)) 3567 need_wakeup = B_TRUE; 3568 if (stq_ill != NULL) { 3569 ASSERT(stq_ill->ill_ire_cnt != 0); 3570 DTRACE_PROBE3(ill__decr__cnt, (ill_t *), stq_ill, 3571 (char *), "ire", (void *), ire); 3572 stq_ill->ill_ire_cnt--; 3573 if (ILL_DOWN_OK(stq_ill)) 3574 need_wakeup = B_TRUE; 3575 } 3576 if (need_wakeup) { 3577 /* Drops the ill lock */ 3578 ipif_ill_refrele_tail(ill); 3579 } else { 3580 mutex_exit(&ill->ill_lock); 3581 } 3582 } else { 3583 /* 3584 * We can't grab all the ill locks at the same time. 3585 * It can lead to recursive lock enter in the call to 3586 * ipif_ill_refrele_tail and later. Instead do it 1 at 3587 * a time. 
3588 */ 3589 mutex_enter(&ill->ill_lock); 3590 ASSERT(ipif->ipif_ire_cnt != 0); 3591 DTRACE_PROBE3(ipif__decr__cnt, (ipif_t *), ipif, 3592 (char *), "ire", (void *), ire); 3593 ipif->ipif_ire_cnt--; 3594 if (IPIF_DOWN_OK(ipif)) { 3595 /* Drops the lock */ 3596 ipif_ill_refrele_tail(ill); 3597 } else { 3598 mutex_exit(&ill->ill_lock); 3599 } 3600 if (stq_ill != NULL) { 3601 mutex_enter(&stq_ill->ill_lock); 3602 ASSERT(stq_ill->ill_ire_cnt != 0); 3603 DTRACE_PROBE3(ill__decr__cnt, (ill_t *), stq_ill, 3604 (char *), "ire", (void *), ire); 3605 stq_ill->ill_ire_cnt--; 3606 if (ILL_DOWN_OK(stq_ill)) { 3607 /* Drops the ill lock */ 3608 ipif_ill_refrele_tail(stq_ill); 3609 } else { 3610 mutex_exit(&stq_ill->ill_lock); 3611 } 3612 } 3613 } 3614 end: 3615 /* This should be true for both V4 and V6 */ 3616 3617 if ((ire->ire_type & IRE_FORWARDTABLE) && 3618 (ire->ire_ipversion == IPV4_VERSION) && 3619 ((irb = ire->ire_bucket) != NULL)) { 3620 rw_enter(&irb->irb_lock, RW_WRITER); 3621 irb->irb_nire--; 3622 /* 3623 * Instead of examining the conditions for freeing 3624 * the radix node here, we do it by calling 3625 * IRB_REFRELE which is a single point in the code 3626 * that embeds that logic. Bump up the refcnt to 3627 * be able to call IRB_REFRELE 3628 */ 3629 IRB_REFHOLD_LOCKED(irb); 3630 rw_exit(&irb->irb_lock); 3631 IRB_REFRELE(irb); 3632 } 3633 ire->ire_ipif = NULL; 3634 3635 #ifdef DEBUG 3636 ire_trace_cleanup(ire); 3637 #endif 3638 mutex_destroy(&ire->ire_lock); 3639 if (ire->ire_ipversion == IPV6_VERSION) { 3640 BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_freed); 3641 } else { 3642 BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_freed); 3643 } 3644 ASSERT(ire->ire_mp == NULL); 3645 /* Has been allocated out of the cache */ 3646 kmem_cache_free(ire_cache, ire); 3647 } 3648 3649 /* 3650 * ire_walk routine to delete all IRE_CACHE/IRE_HOST types redirect 3651 * entries that have a given gateway address. 3652 */ 3653 void 3654 ire_delete_cache_gw(ire_t *ire, char *cp) 3655 { 3656 ipaddr_t gw_addr; 3657 3658 if (!(ire->ire_type & IRE_CACHE) && 3659 !(ire->ire_flags & RTF_DYNAMIC)) 3660 return; 3661 3662 bcopy(cp, &gw_addr, sizeof (gw_addr)); 3663 if (ire->ire_gateway_addr == gw_addr) { 3664 ip1dbg(("ire_delete_cache_gw: deleted 0x%x type %d to 0x%x\n", 3665 (int)ntohl(ire->ire_addr), ire->ire_type, 3666 (int)ntohl(ire->ire_gateway_addr))); 3667 ire_delete(ire); 3668 } 3669 } 3670 3671 /* 3672 * Remove all IRE_CACHE entries that match the ire specified. 3673 * 3674 * The flag argument indicates if the flush request is due to addition 3675 * of new route (IRE_FLUSH_ADD) or deletion of old route (IRE_FLUSH_DELETE). 3676 * 3677 * This routine takes only the IREs from the forwarding table and flushes 3678 * the corresponding entries from the cache table. 3679 * 3680 * When flushing due to the deletion of an old route, it 3681 * just checks the cache handles (ire_phandle and ire_ihandle) and 3682 * deletes the ones that match. 3683 * 3684 * When flushing due to the creation of a new route, it checks 3685 * if a cache entry's address matches the one in the IRE and 3686 * that the cache entry's parent has a less specific mask than the 3687 * one in IRE. The destination of such a cache entry could be the 3688 * gateway for other cache entries, so we need to flush those as 3689 * well by looking for gateway addresses matching the IRE's address. 
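 *
 * For instance (addresses purely illustrative): when a route for
 * 10.1.1.0/24 is added, a cache entry for 10.1.1.5 that was derived
 * from the broader prefix 10.1.0.0/16 is flushed by the first check
 * below, and any cache entry whose ire_gateway_addr is 10.1.1.5 is
 * flushed by the second.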
3690 */ 3691 void 3692 ire_flush_cache_v4(ire_t *ire, int flag) 3693 { 3694 int i; 3695 ire_t *cire; 3696 irb_t *irb; 3697 ip_stack_t *ipst = ire->ire_ipst; 3698 3699 if (ire->ire_type & IRE_CACHE) 3700 return; 3701 3702 /* 3703 * If a default is just created, there is no point 3704 * in going through the cache, as there will not be any 3705 * cached ires. 3706 */ 3707 if (ire->ire_type == IRE_DEFAULT && flag == IRE_FLUSH_ADD) 3708 return; 3709 if (flag == IRE_FLUSH_ADD) { 3710 /* 3711 * This selective flush is due to the addition of 3712 * new IRE. 3713 */ 3714 for (i = 0; i < ipst->ips_ip_cache_table_size; i++) { 3715 irb = &ipst->ips_ip_cache_table[i]; 3716 if ((cire = irb->irb_ire) == NULL) 3717 continue; 3718 IRB_REFHOLD(irb); 3719 for (cire = irb->irb_ire; cire != NULL; 3720 cire = cire->ire_next) { 3721 if (cire->ire_type != IRE_CACHE) 3722 continue; 3723 /* 3724 * If 'cire' belongs to the same subnet 3725 * as the new ire being added, and 'cire' 3726 * is derived from a prefix that is less 3727 * specific than the new ire being added, 3728 * we need to flush 'cire'; for instance, 3729 * when a new interface comes up. 3730 */ 3731 if (((cire->ire_addr & ire->ire_mask) == 3732 (ire->ire_addr & ire->ire_mask)) && 3733 (ip_mask_to_plen(cire->ire_cmask) <= 3734 ire->ire_masklen)) { 3735 ire_delete(cire); 3736 continue; 3737 } 3738 /* 3739 * This is the case when the ire_gateway_addr 3740 * of 'cire' belongs to the same subnet as 3741 * the new ire being added. 3742 * Flushing such ires is sometimes required to 3743 * avoid misrouting: say we have a machine with 3744 * two interfaces (I1 and I2), a default router 3745 * R on the I1 subnet, and a host route to an 3746 * off-link destination D with a gateway G on 3747 * the I2 subnet. 3748 * Under normal operation, we will have an 3749 * on-link cache entry for G and an off-link 3750 * cache entry for D with G as ire_gateway_addr, 3751 * traffic to D will reach its destination 3752 * through gateway G. 3753 * If the administrator does 'ifconfig I2 down', 3754 * the cache entries for D and G will be 3755 * flushed. However, G will now be resolved as 3756 * an off-link destination using R (the default 3757 * router) as gateway. Then D will also be 3758 * resolved as an off-link destination using G 3759 * as gateway - this behavior is due to 3760 * compatibility reasons, see comment in 3761 * ire_ihandle_lookup_offlink(). Traffic to D 3762 * will go to the router R and probably won't 3763 * reach the destination. 3764 * The administrator then does 'ifconfig I2 up'. 3765 * Since G is on the I2 subnet, this routine 3766 * will flush its cache entry. It must also 3767 * flush the cache entry for D, otherwise 3768 * traffic will stay misrouted until the IRE 3769 * times out. 3770 */ 3771 if ((cire->ire_gateway_addr & ire->ire_mask) == 3772 (ire->ire_addr & ire->ire_mask)) { 3773 ire_delete(cire); 3774 continue; 3775 } 3776 } 3777 IRB_REFRELE(irb); 3778 } 3779 } else { 3780 /* 3781 * delete the cache entries based on 3782 * handle in the IRE as this IRE is 3783 * being deleted/changed. 
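 * (ire_phandle ties a cache entry to the prefix route it was derived
 * from, and ire_ihandle ties it to the corresponding interface
 * route; a nonzero match on either handle below means the cache
 * entry was derived from the route being deleted or changed.)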
3784 */ 3785 for (i = 0; i < ipst->ips_ip_cache_table_size; i++) { 3786 irb = &ipst->ips_ip_cache_table[i]; 3787 if ((cire = irb->irb_ire) == NULL) 3788 continue; 3789 IRB_REFHOLD(irb); 3790 for (cire = irb->irb_ire; cire != NULL; 3791 cire = cire->ire_next) { 3792 if (cire->ire_type != IRE_CACHE) 3793 continue; 3794 if ((cire->ire_phandle == 0 || 3795 cire->ire_phandle != ire->ire_phandle) && 3796 (cire->ire_ihandle == 0 || 3797 cire->ire_ihandle != ire->ire_ihandle)) 3798 continue; 3799 ire_delete(cire); 3800 } 3801 IRB_REFRELE(irb); 3802 } 3803 } 3804 } 3805 3806 /* 3807 * Matches the arguments passed with the values in the ire. 3808 * 3809 * Note: for match types that match using "ipif" passed in, ipif 3810 * must be checked for non-NULL before calling this routine. 3811 */ 3812 boolean_t 3813 ire_match_args(ire_t *ire, ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway, 3814 int type, const ipif_t *ipif, zoneid_t zoneid, uint32_t ihandle, 3815 const ts_label_t *tsl, int match_flags, queue_t *wq) 3816 { 3817 ill_t *ire_ill = NULL, *dst_ill; 3818 ill_t *ipif_ill = NULL; 3819 3820 ASSERT(ire->ire_ipversion == IPV4_VERSION); 3821 ASSERT((ire->ire_addr & ~ire->ire_mask) == 0); 3822 ASSERT((!(match_flags & MATCH_IRE_ILL)) || 3823 (ipif != NULL && !ipif->ipif_isv6)); 3824 ASSERT(!(match_flags & MATCH_IRE_WQ) || wq != NULL); 3825 3826 /* 3827 * If MATCH_IRE_MARK_TESTHIDDEN is set, then only return the IRE if it 3828 * is in fact hidden, to ensure the caller gets the right one. One 3829 * exception: if the caller passed MATCH_IRE_IHANDLE, then they 3830 * already know the identity of the given IRE_INTERFACE entry and 3831 * there's no point trying to hide it from them. 3832 */ 3833 if (ire->ire_marks & IRE_MARK_TESTHIDDEN) { 3834 if (match_flags & MATCH_IRE_IHANDLE) 3835 match_flags |= MATCH_IRE_MARK_TESTHIDDEN; 3836 3837 if (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN)) 3838 return (B_FALSE); 3839 } 3840 3841 /* 3842 * MATCH_IRE_MARK_PRIVATE_ADDR is set when IP_NEXTHOP option 3843 * is used. In that case the routing table is bypassed and the 3844 * packets are sent directly to the specified nexthop. The 3845 * IRE_CACHE entry representing this route should be marked 3846 * with IRE_MARK_PRIVATE_ADDR. 3847 */ 3848 3849 if (!(match_flags & MATCH_IRE_MARK_PRIVATE_ADDR) && 3850 (ire->ire_marks & IRE_MARK_PRIVATE_ADDR)) 3851 return (B_FALSE); 3852 3853 if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid && 3854 ire->ire_zoneid != ALL_ZONES) { 3855 /* 3856 * If MATCH_IRE_ZONEONLY has been set and the supplied zoneid is 3857 * valid and does not match that of ire_zoneid, a failure to 3858 * match is reported at this point. Otherwise, since some IREs 3859 * that are available in the global zone can be used in local 3860 * zones, additional checks need to be performed: 3861 * 3862 * IRE_BROADCAST, IRE_CACHE and IRE_LOOPBACK 3863 * entries should never be matched in this situation. 3864 * 3865 * IRE entries that have an interface associated with them 3866 * should in general not match unless they are an IRE_LOCAL 3867 * or in the case when MATCH_IRE_DEFAULT has been set in 3868 * the caller. In the case of the former, checking of the 3869 * other fields supplied should take place. 3870 * 3871 * In the case where MATCH_IRE_DEFAULT has been set, 3872 * all of the ipif's associated with the IRE's ill are 3873 * checked to see if there is a matching zoneid. If any 3874 * one ipif has a matching zoneid, this IRE is a 3875 * potential candidate so checking of the other fields 3876 * takes place. 
3877 * 3878 * In the case where the IRE_INTERFACE has a usable source 3879 * address (indicated by ill_usesrc_ifindex) in the 3880 * correct zone then it's permitted to return this IRE 3881 */ 3882 if (match_flags & MATCH_IRE_ZONEONLY) 3883 return (B_FALSE); 3884 if (ire->ire_type & (IRE_BROADCAST | IRE_CACHE | IRE_LOOPBACK)) 3885 return (B_FALSE); 3886 /* 3887 * Note, IRE_INTERFACE can have the stq as NULL. For 3888 * example, if the default multicast route is tied to 3889 * the loopback address. 3890 */ 3891 if ((ire->ire_type & IRE_INTERFACE) && 3892 (ire->ire_stq != NULL)) { 3893 dst_ill = (ill_t *)ire->ire_stq->q_ptr; 3894 /* 3895 * If there is a usable source address in the 3896 * zone, then it's ok to return an 3897 * IRE_INTERFACE 3898 */ 3899 if (ipif_usesrc_avail(dst_ill, zoneid)) { 3900 ip3dbg(("ire_match_args: dst_ill %p match %d\n", 3901 (void *)dst_ill, 3902 (ire->ire_addr == (addr & mask)))); 3903 } else { 3904 ip3dbg(("ire_match_args: src_ipif NULL" 3905 " dst_ill %p\n", (void *)dst_ill)); 3906 return (B_FALSE); 3907 } 3908 } 3909 if (ire->ire_ipif != NULL && ire->ire_type != IRE_LOCAL && 3910 !(ire->ire_type & IRE_INTERFACE)) { 3911 ipif_t *tipif; 3912 3913 if ((match_flags & MATCH_IRE_DEFAULT) == 0) { 3914 return (B_FALSE); 3915 } 3916 mutex_enter(&ire->ire_ipif->ipif_ill->ill_lock); 3917 for (tipif = ire->ire_ipif->ipif_ill->ill_ipif; 3918 tipif != NULL; tipif = tipif->ipif_next) { 3919 if (IPIF_CAN_LOOKUP(tipif) && 3920 (tipif->ipif_flags & IPIF_UP) && 3921 (tipif->ipif_zoneid == zoneid || 3922 tipif->ipif_zoneid == ALL_ZONES)) 3923 break; 3924 } 3925 mutex_exit(&ire->ire_ipif->ipif_ill->ill_lock); 3926 if (tipif == NULL) { 3927 return (B_FALSE); 3928 } 3929 } 3930 } 3931 3932 /* 3933 * For IRE_CACHE entries, MATCH_IRE_ILL means that somebody wants to 3934 * send out ire_stq (ire_ipif for IRE_CACHE entries is just the means 3935 * of getting a source address -- i.e., ire_src_addr == 3936 * ire->ire_ipif->ipif_src_addr). ire_to_ill() handles this. 3937 * 3938 * NOTE: For IPMP, MATCH_IRE_ILL usually matches any ill in the group. 3939 * However, if MATCH_IRE_MARK_TESTHIDDEN is set (i.e., the IRE is for 3940 * IPMP test traffic), then the ill must match exactly. 
3941 */ 3942 if (match_flags & MATCH_IRE_ILL) { 3943 ire_ill = ire_to_ill(ire); 3944 ipif_ill = ipif->ipif_ill; 3945 } 3946 3947 if ((ire->ire_addr == (addr & mask)) && 3948 ((!(match_flags & MATCH_IRE_GW)) || 3949 (ire->ire_gateway_addr == gateway)) && 3950 ((!(match_flags & MATCH_IRE_TYPE)) || 3951 (ire->ire_type & type)) && 3952 ((!(match_flags & MATCH_IRE_SRC)) || 3953 (ire->ire_src_addr == ipif->ipif_src_addr)) && 3954 ((!(match_flags & MATCH_IRE_IPIF)) || 3955 (ire->ire_ipif == ipif)) && 3956 ((!(match_flags & MATCH_IRE_MARK_TESTHIDDEN)) || 3957 (ire->ire_marks & IRE_MARK_TESTHIDDEN)) && 3958 ((!(match_flags & MATCH_IRE_MARK_PRIVATE_ADDR)) || 3959 (ire->ire_type != IRE_CACHE || 3960 ire->ire_marks & IRE_MARK_PRIVATE_ADDR)) && 3961 ((!(match_flags & MATCH_IRE_WQ)) || 3962 (ire->ire_stq == wq)) && 3963 ((!(match_flags & MATCH_IRE_ILL)) || 3964 (ire_ill == ipif_ill || 3965 (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN) && 3966 ire_ill != NULL && IS_IN_SAME_ILLGRP(ipif_ill, ire_ill)))) && 3967 ((!(match_flags & MATCH_IRE_IHANDLE)) || 3968 (ire->ire_ihandle == ihandle)) && 3969 ((!(match_flags & MATCH_IRE_MASK)) || 3970 (ire->ire_mask == mask)) && 3971 ((!(match_flags & MATCH_IRE_SECATTR)) || 3972 (!is_system_labeled()) || 3973 (tsol_ire_match_gwattr(ire, tsl) == 0))) { 3974 /* We found a matching IRE */ 3975 return (B_TRUE); 3976 } 3977 return (B_FALSE); 3978 } 3979 3980 3981 /* 3982 * Look up a route in all the tables. 3983 */ 3984 ire_t * 3985 ire_route_lookup(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway, 3986 int type, const ipif_t *ipif, ire_t **pire, zoneid_t zoneid, 3987 const ts_label_t *tsl, int flags, ip_stack_t *ipst) 3988 { 3989 ire_t *ire = NULL; 3990 3991 /* 3992 * ire_match_args() will dereference ipif if MATCH_IRE_SRC or 3993 * MATCH_IRE_ILL is set. 3994 */ 3995 if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) && (ipif == NULL)) 3996 return (NULL); 3997 3998 /* 3999 * The caller might be asking for a cache lookup; this is not the 4000 * best way to look up the cache, and such callers should use 4001 * ire_cache_lookup() directly. 4002 * 4003 * Look in the cache table first and then in the forwarding table, 4004 * skipping either table when MATCH_IRE_TYPE is set and the type 4005 * mask excludes it. */ 4006 if ((flags & MATCH_IRE_TYPE) == 0 || (type & IRE_CACHETABLE) != 0) { 4007 ire = ire_ctable_lookup(addr, gateway, type, ipif, zoneid, 4008 tsl, flags, ipst); 4009 if (ire != NULL) 4010 return (ire); 4011 } 4012 if ((flags & MATCH_IRE_TYPE) == 0 || (type & IRE_FORWARDTABLE) != 0) { 4013 ire = ire_ftable_lookup(addr, mask, gateway, type, ipif, pire, 4014 zoneid, 0, tsl, flags, ipst); 4015 } 4016 return (ire); 4017 } 4018 4019 4020 /* 4021 * Delete the IRE cache for the gateway and all IRE caches whose 4022 * ire_gateway_addr points to this gateway, and allow them to 4023 * be created on demand by ip_newroute.
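 *
 * A minimal sketch of a hypothetical caller reacting to a dead
 * gateway gw (all names illustrative):
 *
 *	ire_clookup_delete_cache_gw(gw, ALL_ZONES, ipst);
 *
 * The next packet towards an affected destination then takes the
 * slow path through ip_newroute(), which recreates the cache
 * entries on demand.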
4024 */ 4025 void 4026 ire_clookup_delete_cache_gw(ipaddr_t addr, zoneid_t zoneid, ip_stack_t *ipst) 4027 { 4028 irb_t *irb; 4029 ire_t *ire; 4030 4031 irb = &ipst->ips_ip_cache_table[IRE_ADDR_HASH(addr, 4032 ipst->ips_ip_cache_table_size)]; 4033 IRB_REFHOLD(irb); 4034 for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { 4035 if (ire->ire_marks & IRE_MARK_CONDEMNED) 4036 continue; 4037 4038 ASSERT(ire->ire_mask == IP_HOST_MASK); 4039 if (ire_match_args(ire, addr, ire->ire_mask, 0, IRE_CACHE, 4040 NULL, zoneid, 0, NULL, MATCH_IRE_TYPE, NULL)) { 4041 ire_delete(ire); 4042 } 4043 } 4044 IRB_REFRELE(irb); 4045 4046 ire_walk_v4(ire_delete_cache_gw, &addr, zoneid, ipst); 4047 } 4048 4049 /* 4050 * Looks up cache table for a route. 4051 * specific lookup can be indicated by 4052 * passing the MATCH_* flags and the 4053 * necessary parameters. 4054 */ 4055 ire_t * 4056 ire_ctable_lookup(ipaddr_t addr, ipaddr_t gateway, int type, const ipif_t *ipif, 4057 zoneid_t zoneid, const ts_label_t *tsl, int flags, ip_stack_t *ipst) 4058 { 4059 ire_ctable_args_t margs; 4060 4061 margs.ict_addr = &addr; 4062 margs.ict_gateway = &gateway; 4063 margs.ict_type = type; 4064 margs.ict_ipif = ipif; 4065 margs.ict_zoneid = zoneid; 4066 margs.ict_tsl = tsl; 4067 margs.ict_flags = flags; 4068 margs.ict_ipst = ipst; 4069 margs.ict_wq = NULL; 4070 4071 return (ip4_ctable_lookup_impl(&margs)); 4072 } 4073 4074 /* 4075 * Check whether the IRE_LOCAL and the IRE potentially used to transmit 4076 * (could be an IRE_CACHE, IRE_BROADCAST, or IRE_INTERFACE) are identical 4077 * or part of the same illgrp. (In the IPMP case, usually the two IREs 4078 * will both belong to the IPMP ill, but exceptions are possible -- e.g. 4079 * if IPMP test addresses are on their own subnet.) 4080 */ 4081 boolean_t 4082 ire_local_same_lan(ire_t *ire_local, ire_t *xmit_ire) 4083 { 4084 ill_t *recv_ill, *xmit_ill; 4085 4086 ASSERT(ire_local->ire_type & (IRE_LOCAL|IRE_LOOPBACK)); 4087 ASSERT(xmit_ire->ire_type & (IRE_CACHETABLE|IRE_INTERFACE)); 4088 4089 recv_ill = ire_to_ill(ire_local); 4090 xmit_ill = ire_to_ill(xmit_ire); 4091 4092 ASSERT(recv_ill != NULL); 4093 ASSERT(xmit_ill != NULL); 4094 4095 return (IS_ON_SAME_LAN(recv_ill, xmit_ill)); 4096 } 4097 4098 /* 4099 * Check if the IRE_LOCAL uses the same ill as another route would use. 4100 * If there is no alternate route, or the alternate is a REJECT or BLACKHOLE, 4101 * then we don't allow this IRE_LOCAL to be used. 4102 */ 4103 boolean_t 4104 ire_local_ok_across_zones(ire_t *ire_local, zoneid_t zoneid, void *addr, 4105 const ts_label_t *tsl, ip_stack_t *ipst) 4106 { 4107 ire_t *alt_ire; 4108 boolean_t rval; 4109 int flags; 4110 4111 flags = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE; 4112 4113 if (ire_local->ire_ipversion == IPV4_VERSION) { 4114 alt_ire = ire_ftable_lookup(*((ipaddr_t *)addr), 0, 0, 0, NULL, 4115 NULL, zoneid, 0, tsl, flags, ipst); 4116 } else { 4117 alt_ire = ire_ftable_lookup_v6(addr, NULL, NULL, 0, NULL, 4118 NULL, zoneid, 0, tsl, flags, ipst); 4119 } 4120 4121 if (alt_ire == NULL) 4122 return (B_FALSE); 4123 4124 if (alt_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 4125 ire_refrele(alt_ire); 4126 return (B_FALSE); 4127 } 4128 rval = ire_local_same_lan(ire_local, alt_ire); 4129 4130 ire_refrele(alt_ire); 4131 return (rval); 4132 } 4133 4134 /* 4135 * Lookup cache 4136 * 4137 * In general the zoneid has to match (where ALL_ZONES match all of them). 
4138 * But for IRE_LOCAL we also need to handle the case where L2 should 4139 * conceptually loop back the packet. This is necessary since neither 4140 * Ethernet drivers nor Ethernet hardware loops back packets sent to their 4141 * own MAC address. This loopback is needed when the normal 4142 * routes (ignoring IREs with different zoneids) would send out the packet on 4143 * the same ill as the ill with which this IRE_LOCAL is associated. 4144 * 4145 * Earlier versions of this code always matched an IRE_LOCAL independently of 4146 * the zoneid. We preserve that earlier behavior when 4147 * ip_restrict_interzone_loopback is turned off. 4148 */ 4149 ire_t * 4150 ire_cache_lookup(ipaddr_t addr, zoneid_t zoneid, const ts_label_t *tsl, 4151 ip_stack_t *ipst) 4152 { 4153 irb_t *irb_ptr; 4154 ire_t *ire; 4155 4156 irb_ptr = &ipst->ips_ip_cache_table[IRE_ADDR_HASH(addr, 4157 ipst->ips_ip_cache_table_size)]; 4158 rw_enter(&irb_ptr->irb_lock, RW_READER); 4159 for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { 4160 if (ire->ire_marks & (IRE_MARK_CONDEMNED | 4161 IRE_MARK_TESTHIDDEN | IRE_MARK_PRIVATE_ADDR)) { 4162 continue; 4163 } 4164 if (ire->ire_addr == addr) { 4165 /* 4166 * Finally, check if the security policy has any 4167 * restriction on using this route for the specified 4168 * message. 4169 */ 4170 if (tsl != NULL && 4171 ire->ire_gw_secattr != NULL && 4172 tsol_ire_match_gwattr(ire, tsl) != 0) { 4173 continue; 4174 } 4175 4176 if (zoneid == ALL_ZONES || ire->ire_zoneid == zoneid || 4177 ire->ire_zoneid == ALL_ZONES) { 4178 IRE_REFHOLD(ire); 4179 rw_exit(&irb_ptr->irb_lock); 4180 return (ire); 4181 } 4182 4183 if (ire->ire_type == IRE_LOCAL) { 4184 if (ipst->ips_ip_restrict_interzone_loopback && 4185 !ire_local_ok_across_zones(ire, zoneid, 4186 &addr, tsl, ipst)) 4187 continue; 4188 4189 IRE_REFHOLD(ire); 4190 rw_exit(&irb_ptr->irb_lock); 4191 return (ire); 4192 } 4193 } 4194 } 4195 rw_exit(&irb_ptr->irb_lock); 4196 return (NULL); 4197 } 4198 4199 ire_t * 4200 ire_cache_lookup_simple(ipaddr_t dst, ip_stack_t *ipst) 4201 { 4202 irb_t *irb_ptr; 4203 ire_t *ire; 4204 4205 /* 4206 * Look for an ire in the cachetable whose 4207 * ire_addr matches the destination. 4208 * Since we are being called by forwarding fastpath 4209 * no need to check for Trusted Solaris label. 4210 */ 4211 irb_ptr = &ipst->ips_ip_cache_table[IRE_ADDR_HASH( 4212 dst, ipst->ips_ip_cache_table_size)]; 4213 rw_enter(&irb_ptr->irb_lock, RW_READER); 4214 for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { 4215 if (ire->ire_marks & (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN | 4216 IRE_MARK_PRIVATE_ADDR)) { 4217 continue; 4218 } 4219 if (ire->ire_addr == dst) { 4220 IRE_REFHOLD(ire); 4221 rw_exit(&irb_ptr->irb_lock); 4222 return (ire); 4223 } 4224 } 4225 rw_exit(&irb_ptr->irb_lock); 4226 return (NULL); 4227 } 4228 4229 /* 4230 * Locate the interface ire that is tied to the cache ire 'cire' via 4231 * cire->ire_ihandle. 4232 * 4233 * We are trying to create the cache ire for an offlink destn based 4234 * on the cache ire of the gateway in 'cire'. 'pire' is the prefix ire 4235 * as found by ip_newroute(). We are called from ip_newroute() in 4236 * the IRE_CACHE case. 
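 * (cire->ire_ihandle was copied from the interface ire that resolved
 * the gateway, so an ire_ftable_lookup() keyed on that ihandle, as
 * done below, recovers the interface ire.)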
4237 */ 4238 ire_t * 4239 ire_ihandle_lookup_offlink(ire_t *cire, ire_t *pire) 4240 { 4241 ire_t *ire; 4242 int match_flags; 4243 ipaddr_t gw_addr; 4244 ipif_t *gw_ipif; 4245 ip_stack_t *ipst = cire->ire_ipst; 4246 4247 ASSERT(cire != NULL && pire != NULL); 4248 4249 /* 4250 * We don't need to specify the zoneid to ire_ftable_lookup() below 4251 * because the ihandle refers to an ipif which can be in only one zone. 4252 */ 4253 match_flags = MATCH_IRE_TYPE | MATCH_IRE_IHANDLE | MATCH_IRE_MASK; 4254 if (pire->ire_ipif != NULL) 4255 match_flags |= MATCH_IRE_ILL; 4256 /* 4257 * We know that the mask of the interface ire equals cire->ire_cmask. 4258 * (When ip_newroute() created 'cire' for the gateway it set its 4259 * cmask from the interface ire's mask) 4260 */ 4261 ire = ire_ftable_lookup(cire->ire_addr, cire->ire_cmask, 0, 4262 IRE_INTERFACE, pire->ire_ipif, NULL, ALL_ZONES, cire->ire_ihandle, 4263 NULL, match_flags, ipst); 4264 if (ire != NULL) 4265 return (ire); 4266 /* 4267 * If we didn't find an interface ire above, we can't declare failure. 4268 * For backwards compatibility, we need to support prefix routes 4269 * pointing to next hop gateways that are not on-link. 4270 * 4271 * Assume we are trying to ping some offlink destn, and we have the 4272 * routing table below. 4273 * 4274 * Eg. default - gw1 <--- pire (line 1) 4275 * gw1 - gw2 (line 2) 4276 * gw2 - hme0 (line 3) 4277 * 4278 * If we already have a cache ire for gw1 in 'cire', the 4279 * ire_ftable_lookup above would have failed, since there is no 4280 * interface ire to reach gw1. We will fallthru below. 4281 * 4282 * Here we duplicate the steps that ire_ftable_lookup() did in 4283 * getting 'cire' from 'pire', in the MATCH_IRE_RECURSIVE case. 4284 * The differences are the following: 4285 * i. We want the interface ire only, so we call ire_ftable_lookup() 4286 * instead of ire_route_lookup() 4287 * ii. We look for only prefix routes in the 1st call below. 4288 * iii. We want to match on the ihandle in the 2nd call below. 4289 */ 4290 match_flags = MATCH_IRE_TYPE; 4291 if (pire->ire_ipif != NULL) 4292 match_flags |= MATCH_IRE_ILL; 4293 ire = ire_ftable_lookup(pire->ire_gateway_addr, 0, 0, IRE_OFFSUBNET, 4294 pire->ire_ipif, NULL, ALL_ZONES, 0, NULL, match_flags, ipst); 4295 if (ire == NULL) 4296 return (NULL); 4297 /* 4298 * At this point 'ire' corresponds to the entry shown in line 2. 4299 * gw_addr is 'gw2' in the example above. 4300 */ 4301 gw_addr = ire->ire_gateway_addr; 4302 gw_ipif = ire->ire_ipif; 4303 ire_refrele(ire); 4304 4305 match_flags |= MATCH_IRE_IHANDLE; 4306 ire = ire_ftable_lookup(gw_addr, 0, 0, IRE_INTERFACE, 4307 gw_ipif, NULL, ALL_ZONES, cire->ire_ihandle, NULL, match_flags, 4308 ipst); 4309 return (ire); 4310 } 4311 4312 /* 4313 * Return the IRE_LOOPBACK, IRE_IF_RESOLVER or IRE_IF_NORESOLVER 4314 * ire associated with the specified ipif. 4315 * 4316 * This might occasionally be called when IPIF_UP is not set, since 4317 * both IP_MULTICAST_IF and the creation of interface routes allow 4318 * specifying a down ipif (the ipif_lookup* functions match down ipifs). 4319 * 4320 * Note that if IPIF_NOLOCAL, IPIF_NOXMIT, or IPIF_DEPRECATED is set on 4321 * the ipif, this routine might return NULL.
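 *
 * A minimal usage sketch (hypothetical caller, error handling
 * elided):
 *
 *	ire = ipif_to_ire(ipif);
 *	if (ire != NULL) {
 *		... use the interface route ...
 *		ire_refrele(ire);
 *	}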
4322 */ 4323 ire_t * 4324 ipif_to_ire(const ipif_t *ipif) 4325 { 4326 ire_t *ire; 4327 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 4328 uint_t match_flags = MATCH_IRE_TYPE | MATCH_IRE_IPIF | MATCH_IRE_MASK; 4329 4330 /* 4331 * IRE_INTERFACE entries for ills under IPMP are IRE_MARK_TESTHIDDEN 4332 * so that they aren't accidentally returned. However, if the 4333 * caller's ipif is on an ill under IPMP, there's no need to hide 'em. 4334 */ 4335 if (IS_UNDER_IPMP(ipif->ipif_ill)) 4336 match_flags |= MATCH_IRE_MARK_TESTHIDDEN; 4337 4338 ASSERT(!ipif->ipif_isv6); 4339 if (ipif->ipif_ire_type == IRE_LOOPBACK) { 4340 ire = ire_ctable_lookup(ipif->ipif_lcl_addr, 0, IRE_LOOPBACK, 4341 ipif, ALL_ZONES, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF), 4342 ipst); 4343 } else if (ipif->ipif_flags & IPIF_POINTOPOINT) { 4344 /* In this case we need to lookup destination address. */ 4345 ire = ire_ftable_lookup(ipif->ipif_pp_dst_addr, IP_HOST_MASK, 0, 4346 IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, NULL, match_flags, 4347 ipst); 4348 } else { 4349 ire = ire_ftable_lookup(ipif->ipif_subnet, 4350 ipif->ipif_net_mask, 0, IRE_INTERFACE, ipif, NULL, 4351 ALL_ZONES, 0, NULL, match_flags, ipst); 4352 } 4353 return (ire); 4354 } 4355 4356 /* 4357 * ire_walk function. 4358 * Count the number of IRE_CACHE entries in different categories. 4359 */ 4360 void 4361 ire_cache_count(ire_t *ire, char *arg) 4362 { 4363 ire_cache_count_t *icc = (ire_cache_count_t *)arg; 4364 4365 if (ire->ire_type != IRE_CACHE) 4366 return; 4367 4368 icc->icc_total++; 4369 4370 if (ire->ire_ipversion == IPV6_VERSION) { 4371 mutex_enter(&ire->ire_lock); 4372 if (IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6)) { 4373 mutex_exit(&ire->ire_lock); 4374 icc->icc_onlink++; 4375 return; 4376 } 4377 mutex_exit(&ire->ire_lock); 4378 } else { 4379 if (ire->ire_gateway_addr == 0) { 4380 icc->icc_onlink++; 4381 return; 4382 } 4383 } 4384 4385 ASSERT(ire->ire_ipif != NULL); 4386 if (ire->ire_max_frag < ire->ire_ipif->ipif_mtu) 4387 icc->icc_pmtu++; 4388 else if (ire->ire_tire_mark != ire->ire_ob_pkt_count + 4389 ire->ire_ib_pkt_count) 4390 icc->icc_offlink++; 4391 else 4392 icc->icc_unused++; 4393 } 4394 4395 /* 4396 * ire_walk function called by ip_trash_ire_reclaim(). 4397 * Free a fraction of the IRE_CACHE cache entries. The fractions are 4398 * different for different categories of IRE_CACHE entries. 4399 * A fraction of zero means to not free any in that category. 4400 * Use the hash bucket id plus lbolt as a random number. Thus if the fraction 4401 * is N then every Nth hash bucket chain will be freed. 
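 *
 * Note that the test used below,
 *
 *	(rand/N)*N == rand
 *
 * is the integer-division idiom for (rand % N) == 0; e.g. with an
 * onlink fraction of 3, roughly every third bucket chain has its
 * onlink entries freed on a given reclaim pass.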
4402 */ 4403 void 4404 ire_cache_reclaim(ire_t *ire, char *arg) 4405 { 4406 ire_cache_reclaim_t *icr = (ire_cache_reclaim_t *)arg; 4407 uint_t rand; 4408 ip_stack_t *ipst = icr->icr_ipst; 4409 4410 if (ire->ire_type != IRE_CACHE) 4411 return; 4412 4413 if (ire->ire_ipversion == IPV6_VERSION) { 4414 rand = (uint_t)lbolt + 4415 IRE_ADDR_HASH_V6(ire->ire_addr_v6, 4416 ipst->ips_ip6_cache_table_size); 4417 mutex_enter(&ire->ire_lock); 4418 if (IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6)) { 4419 mutex_exit(&ire->ire_lock); 4420 if (icr->icr_onlink != 0 && 4421 (rand/icr->icr_onlink)*icr->icr_onlink == rand) { 4422 ire_delete(ire); 4423 return; 4424 } 4425 goto done; 4426 } 4427 mutex_exit(&ire->ire_lock); 4428 } else { 4429 rand = (uint_t)lbolt + 4430 IRE_ADDR_HASH(ire->ire_addr, ipst->ips_ip_cache_table_size); 4431 if (ire->ire_gateway_addr == 0) { 4432 if (icr->icr_onlink != 0 && 4433 (rand/icr->icr_onlink)*icr->icr_onlink == rand) { 4434 ire_delete(ire); 4435 return; 4436 } 4437 goto done; 4438 } 4439 } 4440 /* Not onlink IRE */ 4441 ASSERT(ire->ire_ipif != NULL); 4442 if (ire->ire_max_frag < ire->ire_ipif->ipif_mtu) { 4443 /* Use pmtu fraction */ 4444 if (icr->icr_pmtu != 0 && 4445 (rand/icr->icr_pmtu)*icr->icr_pmtu == rand) { 4446 ire_delete(ire); 4447 return; 4448 } 4449 } else if (ire->ire_tire_mark != ire->ire_ob_pkt_count + 4450 ire->ire_ib_pkt_count) { 4451 /* Use offlink fraction */ 4452 if (icr->icr_offlink != 0 && 4453 (rand/icr->icr_offlink)*icr->icr_offlink == rand) { 4454 ire_delete(ire); 4455 return; 4456 } 4457 } else { 4458 /* Use unused fraction */ 4459 if (icr->icr_unused != 0 && 4460 (rand/icr->icr_unused)*icr->icr_unused == rand) { 4461 ire_delete(ire); 4462 return; 4463 } 4464 } 4465 done: 4466 /* 4467 * Update tire_mark so that those that haven't been used since this 4468 * reclaim will be considered unused next time we reclaim. 4469 */ 4470 ire->ire_tire_mark = ire->ire_ob_pkt_count + ire->ire_ib_pkt_count; 4471 } 4472 4473 static void 4474 power2_roundup(uint32_t *value) 4475 { 4476 int i; 4477 4478 for (i = 1; i < 31; i++) { 4479 if (*value <= (1 << i)) 4480 break; 4481 } 4482 *value = (1 << i); 4483 } 4484 4485 /* Global init for all zones */ 4486 void 4487 ip_ire_g_init() 4488 { 4489 /* 4490 * Create the ire kmem cache; ip_trash_ire_reclaim() 4491 * will give IRE_CACHE entries back to the system when needed. 4492 * This needs to be done here before anything else, since 4493 * ire_add() expects the cache to be created. 4494 */ 4495 ire_cache = kmem_cache_create("ire_cache", 4496 sizeof (ire_t), 0, ip_ire_constructor, 4497 ip_ire_destructor, ip_trash_ire_reclaim, NULL, NULL, 0); 4498 4499 rt_entry_cache = kmem_cache_create("rt_entry", 4500 sizeof (struct rt_entry), 0, NULL, NULL, NULL, NULL, NULL, 0); 4501 4502 /* 4503 * Have the radix code set up its kmem caches etc. 4504 */ 4505 rn_init(); 4506 } 4507 4508 void 4509 ip_ire_init(ip_stack_t *ipst) 4510 { 4511 int i; 4512 uint32_t mem_cnt; 4513 uint32_t cpu_cnt; 4514 uint32_t min_cnt; 4515 pgcnt_t mem_avail; 4516 4517 /* 4518 * ip_ire_max_bucket_cnt is sized below based on the memory 4519 * size and the cpu speed of the machine. This is upper 4520 * bounded by the compile time value of ip_ire_max_bucket_cnt 4521 * and is lower bounded by the compile time value of 4522 * ip_ire_min_bucket_cnt. Similar logic applies to 4523 * ip6_ire_max_bucket_cnt. 4524 * 4525 * We calculate this for each IP instance in order to use 4526 * the kmem_avail and ip_ire_{min,max}_bucket_cnt that are 4527 * in effect when the zone is booted.
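 *
 * For illustration only (all numbers hypothetical): with
 * mem_avail = 512MB, ip_ire_mem_ratio = 6, ip_cache_table_size = 256
 * and sizeof (ire_t) = 256, mem_cnt = (512MB >> 6) / 256 / 256 = 128,
 * so ip_ire_max_bucket_cnt would be clamped to at most 128 (or to the
 * cpu-derived bound, whichever is smaller).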
4528 */ 4529 mem_avail = kmem_avail(); 4530 mem_cnt = (mem_avail >> ip_ire_mem_ratio) / 4531 ip_cache_table_size / sizeof (ire_t); 4532 cpu_cnt = CPU->cpu_type_info.pi_clock >> ip_ire_cpu_ratio; 4533 4534 min_cnt = MIN(cpu_cnt, mem_cnt); 4535 if (min_cnt < ip_ire_min_bucket_cnt) 4536 min_cnt = ip_ire_min_bucket_cnt; 4537 if (ip_ire_max_bucket_cnt > min_cnt) { 4538 ip_ire_max_bucket_cnt = min_cnt; 4539 } 4540 4541 mem_cnt = (mem_avail >> ip_ire_mem_ratio) / 4542 ip6_cache_table_size / sizeof (ire_t); 4543 min_cnt = MIN(cpu_cnt, mem_cnt); 4544 if (min_cnt < ip6_ire_min_bucket_cnt) 4545 min_cnt = ip6_ire_min_bucket_cnt; 4546 if (ip6_ire_max_bucket_cnt > min_cnt) { 4547 ip6_ire_max_bucket_cnt = min_cnt; 4548 } 4549 4550 mutex_init(&ipst->ips_ire_ft_init_lock, NULL, MUTEX_DEFAULT, 0); 4551 mutex_init(&ipst->ips_ire_handle_lock, NULL, MUTEX_DEFAULT, NULL); 4552 4553 (void) rn_inithead((void **)&ipst->ips_ip_ftable, 32); 4554 4555 4556 /* Calculate the IPv4 cache table size. */ 4557 ipst->ips_ip_cache_table_size = MAX(ip_cache_table_size, 4558 ((mem_avail >> ip_ire_mem_ratio) / sizeof (ire_t) / 4559 ip_ire_max_bucket_cnt)); 4560 if (ipst->ips_ip_cache_table_size > ip_max_cache_table_size) 4561 ipst->ips_ip_cache_table_size = ip_max_cache_table_size; 4562 /* 4563 * Make sure that the table size is always a power of 2. The 4564 * hash macro IRE_ADDR_HASH() depends on that. 4565 */ 4566 power2_roundup(&ipst->ips_ip_cache_table_size); 4567 4568 ipst->ips_ip_cache_table = kmem_zalloc(ipst->ips_ip_cache_table_size * 4569 sizeof (irb_t), KM_SLEEP); 4570 4571 for (i = 0; i < ipst->ips_ip_cache_table_size; i++) { 4572 rw_init(&ipst->ips_ip_cache_table[i].irb_lock, NULL, 4573 RW_DEFAULT, NULL); 4574 } 4575 4576 /* Calculate the IPv6 cache table size. */ 4577 ipst->ips_ip6_cache_table_size = MAX(ip6_cache_table_size, 4578 ((mem_avail >> ip_ire_mem_ratio) / sizeof (ire_t) / 4579 ip6_ire_max_bucket_cnt)); 4580 if (ipst->ips_ip6_cache_table_size > ip6_max_cache_table_size) 4581 ipst->ips_ip6_cache_table_size = ip6_max_cache_table_size; 4582 /* 4583 * Make sure that the table size is always a power of 2. The 4584 * hash macro IRE_ADDR_HASH_V6() depends on that. 4585 */ 4586 power2_roundup(&ipst->ips_ip6_cache_table_size); 4587 4588 ipst->ips_ip_cache_table_v6 = kmem_zalloc( 4589 ipst->ips_ip6_cache_table_size * sizeof (irb_t), KM_SLEEP); 4590 4591 for (i = 0; i < ipst->ips_ip6_cache_table_size; i++) { 4592 rw_init(&ipst->ips_ip_cache_table_v6[i].irb_lock, NULL, 4593 RW_DEFAULT, NULL); 4594 } 4595 4596 /* 4597 * Make sure that the forwarding table size is a power of 2. 4598 * The IRE*_ADDR_HASH() macroes depend on that. 4599 */ 4600 ipst->ips_ip6_ftable_hash_size = ip6_ftable_hash_size; 4601 power2_roundup(&ipst->ips_ip6_ftable_hash_size); 4602 4603 ipst->ips_ire_handle = 1; 4604 } 4605 4606 void 4607 ip_ire_g_fini(void) 4608 { 4609 kmem_cache_destroy(ire_cache); 4610 kmem_cache_destroy(rt_entry_cache); 4611 4612 rn_fini(); 4613 } 4614 4615 void 4616 ip_ire_fini(ip_stack_t *ipst) 4617 { 4618 int i; 4619 4620 /* 4621 * Delete all IREs - assumes that the ill/ipifs have 4622 * been removed so what remains are just the ftable and IRE_CACHE. 
4623 */ 4624 ire_walk(ire_delete, NULL, ipst); 4625 4626 rn_freehead(ipst->ips_ip_ftable); 4627 ipst->ips_ip_ftable = NULL; 4628 4629 mutex_destroy(&ipst->ips_ire_ft_init_lock); 4630 mutex_destroy(&ipst->ips_ire_handle_lock); 4631 4632 for (i = 0; i < ipst->ips_ip_cache_table_size; i++) { 4633 ASSERT(ipst->ips_ip_cache_table[i].irb_ire == NULL); 4634 rw_destroy(&ipst->ips_ip_cache_table[i].irb_lock); 4635 } 4636 kmem_free(ipst->ips_ip_cache_table, 4637 ipst->ips_ip_cache_table_size * sizeof (irb_t)); 4638 ipst->ips_ip_cache_table = NULL; 4639 4640 for (i = 0; i < ipst->ips_ip6_cache_table_size; i++) { 4641 ASSERT(ipst->ips_ip_cache_table_v6[i].irb_ire == NULL); 4642 rw_destroy(&ipst->ips_ip_cache_table_v6[i].irb_lock); 4643 } 4644 kmem_free(ipst->ips_ip_cache_table_v6, 4645 ipst->ips_ip6_cache_table_size * sizeof (irb_t)); 4646 ipst->ips_ip_cache_table_v6 = NULL; 4647 4648 for (i = 0; i < IP6_MASK_TABLE_SIZE; i++) { 4649 irb_t *ptr; 4650 int j; 4651 4652 if ((ptr = ipst->ips_ip_forwarding_table_v6[i]) == NULL) 4653 continue; 4654 4655 for (j = 0; j < ipst->ips_ip6_ftable_hash_size; j++) { 4656 ASSERT(ptr[j].irb_ire == NULL); 4657 rw_destroy(&ptr[j].irb_lock); 4658 } 4659 mi_free(ptr); 4660 ipst->ips_ip_forwarding_table_v6[i] = NULL; 4661 } 4662 } 4663 4664 /* 4665 * Check if another multirt route resolution is needed. 4666 * B_TRUE is returned if there remains a resolvable route, 4667 * or if no route for that dst is resolved yet. 4668 * B_FALSE is returned if all routes for that dst are resolved 4669 * or if the remaining unresolved routes are actually not 4670 * resolvable. 4671 * This only works in the global zone. 4672 */ 4673 boolean_t 4674 ire_multirt_need_resolve(ipaddr_t dst, const ts_label_t *tsl, ip_stack_t *ipst) 4675 { 4676 ire_t *first_fire; 4677 ire_t *first_cire; 4678 ire_t *fire; 4679 ire_t *cire; 4680 irb_t *firb; 4681 irb_t *cirb; 4682 int unres_cnt = 0; 4683 boolean_t resolvable = B_FALSE; 4684 4685 /* Retrieve the first IRE_HOST that matches the destination */ 4686 first_fire = ire_ftable_lookup(dst, IP_HOST_MASK, 0, IRE_HOST, NULL, 4687 NULL, ALL_ZONES, 0, tsl, 4688 MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_SECATTR, ipst); 4689 4690 /* No route at all */ 4691 if (first_fire == NULL) { 4692 return (B_TRUE); 4693 } 4694 4695 firb = first_fire->ire_bucket; 4696 ASSERT(firb != NULL); 4697 4698 /* Retrieve the first IRE_CACHE ire for that destination. */ 4699 first_cire = ire_cache_lookup(dst, GLOBAL_ZONEID, tsl, ipst); 4700 4701 /* No resolved route. */ 4702 if (first_cire == NULL) { 4703 ire_refrele(first_fire); 4704 return (B_TRUE); 4705 } 4706 4707 /* 4708 * At least one route is resolved. Here we look through the forward 4709 * and cache tables, to compare the number of declared routes 4710 * with the number of resolved routes. The search for a resolvable 4711 * route is performed only if at least one route remains 4712 * unresolved. 4713 */ 4714 cirb = first_cire->ire_bucket; 4715 ASSERT(cirb != NULL); 4716 4717 /* Count the number of routes to that dest that are declared.
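 * (A route is "declared" if an RTF_MULTIRT IRE_HOST entry for dst
 * sits in the forwarding bucket, and "resolved" if a matching
 * RTF_MULTIRT IRE_CACHE entry exists; unres_cnt below ends up as
 * declared minus resolved.)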
*/ 4718 IRB_REFHOLD(firb); 4719 for (fire = first_fire; fire != NULL; fire = fire->ire_next) { 4720 if (!(fire->ire_flags & RTF_MULTIRT)) 4721 continue; 4722 if (fire->ire_addr != dst) 4723 continue; 4724 unres_cnt++; 4725 } 4726 IRB_REFRELE(firb); 4727 4728 /* Then subtract the number of routes to that dst that are resolved */ 4729 IRB_REFHOLD(cirb); 4730 for (cire = first_cire; cire != NULL; cire = cire->ire_next) { 4731 if (!(cire->ire_flags & RTF_MULTIRT)) 4732 continue; 4733 if (cire->ire_addr != dst) 4734 continue; 4735 if (cire->ire_marks & (IRE_MARK_CONDEMNED|IRE_MARK_TESTHIDDEN)) 4736 continue; 4737 unres_cnt--; 4738 } 4739 IRB_REFRELE(cirb); 4740 4741 /* At least one route is unresolved; search for a resolvable route. */ 4742 if (unres_cnt > 0) 4743 resolvable = ire_multirt_lookup(&first_cire, &first_fire, 4744 MULTIRT_USESTAMP | MULTIRT_CACHEGW, tsl, ipst); 4745 4746 if (first_fire != NULL) 4747 ire_refrele(first_fire); 4748 4749 if (first_cire != NULL) 4750 ire_refrele(first_cire); 4751 4752 return (resolvable); 4753 } 4754 4755 4756 /* 4757 * Explore a forward_table bucket, starting from fire_arg. 4758 * fire_arg MUST be an IRE_HOST entry. 4759 * 4760 * Return B_TRUE and update *ire_arg and *fire_arg 4761 * if at least one resolvable route is found. *ire_arg 4762 * is the IRE entry for *fire_arg's gateway. 4763 * 4764 * Return B_FALSE otherwise (all routes are resolved or 4765 * the remaining unresolved routes are all unresolvable). 4766 * 4767 * The IRE selection relies on a priority mechanism 4768 * driven by the flags passed in by the caller. 4769 * The caller, such as ip_newroute_ipif(), can get the most 4770 * relevant ire at each stage of a multiple route resolution. 4771 * 4772 * The rules are: 4773 * 4774 * - if MULTIRT_CACHEGW is specified in flags, IRE_CACHETABLE 4775 * ires are preferred for the gateway. This gives the highest 4776 * priority to routes that can be resolved without using 4777 * a resolver. 4778 * 4779 * - if MULTIRT_CACHEGW is not specified, or if MULTIRT_CACHEGW 4780 * is specified but no IRE_CACHETABLE ire entry for the gateway 4781 * is found, the following rules apply. 4782 * 4783 * - if MULTIRT_USESTAMP is specified in flags, IRE_INTERFACE 4784 * ires for the gateway that have not been tried for 4785 * a configurable amount of time are preferred. 4786 * This applies when a resolver must be invoked for 4787 * a missing route, but we don't want to use the resolver 4788 * upon each packet emission. If no such resolver is found, 4789 * B_FALSE is returned. 4790 * The MULTIRT_USESTAMP flag can be combined with 4791 * MULTIRT_CACHEGW. 4792 * 4793 * - if MULTIRT_USESTAMP is not specified in flags, the first 4794 * unresolved but resolvable route is selected. 4795 * 4796 * - Otherwise, there is no resolvable route, and 4797 * B_FALSE is returned. 4798 * 4799 * Finally, MULTIRT_SETSTAMP can be specified in flags to 4800 * request that the timestamp of unresolvable routes 4801 * be refreshed. This prevents the useless exploration 4802 * of those routes for a while, when MULTIRT_USESTAMP is used. 4803 * 4804 * This only works in the global zone.
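 *
 * A sketch of the intended call pattern, modeled on the caller in
 * ire_multirt_need_resolve() above (refs and error handling elided):
 *
 *	if (ire_multirt_lookup(&first_cire, &first_fire,
 *	    MULTIRT_USESTAMP | MULTIRT_CACHEGW, tsl, ipst))
 *		resolve via first_fire's gateway, using first_cire;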
4805 */ 4806 boolean_t 4807 ire_multirt_lookup(ire_t **ire_arg, ire_t **fire_arg, uint32_t flags, 4808 const ts_label_t *tsl, ip_stack_t *ipst) 4809 { 4810 clock_t delta; 4811 ire_t *best_fire = NULL; 4812 ire_t *best_cire = NULL; 4813 ire_t *first_fire; 4814 ire_t *first_cire; 4815 ire_t *fire; 4816 ire_t *cire; 4817 irb_t *firb = NULL; 4818 irb_t *cirb = NULL; 4819 ire_t *gw_ire; 4820 boolean_t already_resolved; 4821 boolean_t res; 4822 ipaddr_t dst; 4823 ipaddr_t gw; 4824 4825 ip2dbg(("ire_multirt_lookup: *ire_arg %p, *fire_arg %p, flags %04x\n", 4826 (void *)*ire_arg, (void *)*fire_arg, flags)); 4827 4828 ASSERT(ire_arg != NULL); 4829 ASSERT(fire_arg != NULL); 4830 4831 /* Not an IRE_HOST ire; give up. */ 4832 if ((*fire_arg == NULL) || ((*fire_arg)->ire_type != IRE_HOST)) { 4833 return (B_FALSE); 4834 } 4835 4836 /* This is the first IRE_HOST ire for that destination. */ 4837 first_fire = *fire_arg; 4838 firb = first_fire->ire_bucket; 4839 ASSERT(firb != NULL); 4840 4841 dst = first_fire->ire_addr; 4842 4843 ip2dbg(("ire_multirt_lookup: dst %08x\n", ntohl(dst))); 4844 4845 /* 4846 * Retrieve the first IRE_CACHE ire for that destination; 4847 * if we don't find one, no route for that dest is 4848 * resolved yet. 4849 */ 4850 first_cire = ire_cache_lookup(dst, GLOBAL_ZONEID, tsl, ipst); 4851 if (first_cire != NULL) { 4852 cirb = first_cire->ire_bucket; 4853 } 4854 4855 ip2dbg(("ire_multirt_lookup: first_cire %p\n", (void *)first_cire)); 4856 4857 /* 4858 * Search for a resolvable route, giving the top priority 4859 * to routes that can be resolved without any call to the resolver. 4860 */ 4861 IRB_REFHOLD(firb); 4862 4863 if (!CLASSD(dst)) { 4864 /* 4865 * For all multiroute IRE_HOST ires for that destination, 4866 * check if the route via the IRE_HOST's gateway is 4867 * resolved yet. 4868 */ 4869 for (fire = first_fire; fire != NULL; fire = fire->ire_next) { 4870 4871 if (!(fire->ire_flags & RTF_MULTIRT)) 4872 continue; 4873 if (fire->ire_addr != dst) 4874 continue; 4875 4876 if (fire->ire_gw_secattr != NULL && 4877 tsol_ire_match_gwattr(fire, tsl) != 0) { 4878 continue; 4879 } 4880 4881 gw = fire->ire_gateway_addr; 4882 4883 ip2dbg(("ire_multirt_lookup: fire %p, " 4884 "ire_addr %08x, ire_gateway_addr %08x\n", 4885 (void *)fire, ntohl(fire->ire_addr), ntohl(gw))); 4886 4887 already_resolved = B_FALSE; 4888 4889 if (first_cire != NULL) { 4890 ASSERT(cirb != NULL); 4891 4892 IRB_REFHOLD(cirb); 4893 /* 4894 * For all IRE_CACHE ires for that 4895 * destination. 4896 */ 4897 for (cire = first_cire; 4898 cire != NULL; 4899 cire = cire->ire_next) { 4900 4901 if (!(cire->ire_flags & RTF_MULTIRT)) 4902 continue; 4903 if (cire->ire_addr != dst) 4904 continue; 4905 if (cire->ire_marks & 4906 (IRE_MARK_CONDEMNED | 4907 IRE_MARK_TESTHIDDEN)) 4908 continue; 4909 4910 if (cire->ire_gw_secattr != NULL && 4911 tsol_ire_match_gwattr(cire, 4912 tsl) != 0) { 4913 continue; 4914 } 4915 4916 /* 4917 * Check if the IRE_CACHE's gateway 4918 * matches the IRE_HOST's gateway. 4919 */ 4920 if (cire->ire_gateway_addr == gw) { 4921 already_resolved = B_TRUE; 4922 break; 4923 } 4924 } 4925 IRB_REFRELE(cirb); 4926 } 4927 4928 /* 4929 * This route is already resolved; 4930 * proceed with next one. 4931 */ 4932 if (already_resolved) { 4933 ip2dbg(("ire_multirt_lookup: found cire %p, " 4934 "already resolved\n", (void *)cire)); 4935 continue; 4936 } 4937 4938 /* 4939 * The route is unresolved; is it actually 4940 * resolvable, i.e. is there a cache or a resolver 4941 * for the gateway? 
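 * (The MATCH_IRE_RECURSIVE lookup below chases the gateway through
 * any intermediate prefix routes down to a cache or interface ire.)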
4942 */ 4943 gw_ire = ire_route_lookup(gw, 0, 0, 0, NULL, NULL, 4944 ALL_ZONES, tsl, 4945 MATCH_IRE_RECURSIVE | MATCH_IRE_SECATTR, ipst); 4946 4947 ip2dbg(("ire_multirt_lookup: looked up gw_ire %p\n", 4948 (void *)gw_ire)); 4949 4950 /* 4951 * If gw_ire is typed IRE_CACHETABLE, 4952 * this route can be resolved without any call to the 4953 * resolver. If the MULTIRT_CACHEGW flag is set, 4954 * give the top priority to this ire and exit the 4955 * loop. 4956 * This is typically the case when an ARP reply 4957 * is processed through ip_wput_nondata(). 4958 */ 4959 if ((flags & MULTIRT_CACHEGW) && 4960 (gw_ire != NULL) && 4961 (gw_ire->ire_type & IRE_CACHETABLE)) { 4962 ASSERT(gw_ire->ire_nce == NULL || 4963 gw_ire->ire_nce->nce_state == ND_REACHABLE); 4964 /* 4965 * Release the resolver associated to the 4966 * previous candidate best ire, if any. 4967 */ 4968 if (best_cire != NULL) { 4969 ire_refrele(best_cire); 4970 ASSERT(best_fire != NULL); 4971 } 4972 4973 best_fire = fire; 4974 best_cire = gw_ire; 4975 4976 ip2dbg(("ire_multirt_lookup: found top prio " 4977 "best_fire %p, best_cire %p\n", 4978 (void *)best_fire, (void *)best_cire)); 4979 break; 4980 } 4981 4982 /* 4983 * Compute the time elapsed since our preceding 4984 * attempt to resolve that route. 4985 * If the MULTIRT_USESTAMP flag is set, we take that 4986 * route into account only if this time interval 4987 * exceeds ip_multirt_resolution_interval; 4988 * this prevents us from attempting to resolve a 4989 * broken route upon each sending of a packet. 4990 */ 4991 delta = lbolt - fire->ire_last_used_time; 4992 delta = TICK_TO_MSEC(delta); 4993 4994 res = (boolean_t)((delta > 4995 ipst->ips_ip_multirt_resolution_interval) || 4996 (!(flags & MULTIRT_USESTAMP))); 4997 4998 ip2dbg(("ire_multirt_lookup: fire %p, delta %lu, " 4999 "res %d\n", 5000 (void *)fire, delta, res)); 5001 5002 if (res) { 5003 /* 5004 * We are here if MULTIRT_USESTAMP flag is set 5005 * and the resolver for fire's gateway 5006 * has not been tried since 5007 * ip_multirt_resolution_interval, or if 5008 * MULTIRT_USESTAMP is not set but gw_ire did 5009 * not fill the conditions for MULTIRT_CACHEGW, 5010 * or if neither MULTIRT_USESTAMP nor 5011 * MULTIRT_CACHEGW are set. 5012 */ 5013 if (gw_ire != NULL) { 5014 if (best_fire == NULL) { 5015 ASSERT(best_cire == NULL); 5016 5017 best_fire = fire; 5018 best_cire = gw_ire; 5019 5020 ip2dbg(("ire_multirt_lookup:" 5021 "found candidate " 5022 "best_fire %p, " 5023 "best_cire %p\n", 5024 (void *)best_fire, 5025 (void *)best_cire)); 5026 5027 /* 5028 * If MULTIRT_CACHEGW is not 5029 * set, we ignore the top 5030 * priority ires that can 5031 * be resolved without any 5032 * call to the resolver; 5033 * In that case, there is 5034 * actually no need 5035 * to continue the loop. 5036 */ 5037 if (!(flags & 5038 MULTIRT_CACHEGW)) { 5039 break; 5040 } 5041 continue; 5042 } 5043 } else { 5044 /* 5045 * No resolver for the gateway: the 5046 * route is not resolvable. 5047 * If the MULTIRT_SETSTAMP flag is 5048 * set, we stamp the IRE_HOST ire, 5049 * so we will not select it again 5050 * during this resolution interval. 
5051 */ 5052 if (flags & MULTIRT_SETSTAMP) 5053 fire->ire_last_used_time = 5054 lbolt; 5055 } 5056 } 5057 5058 if (gw_ire != NULL) 5059 ire_refrele(gw_ire); 5060 } 5061 } else { /* CLASSD(dst) */ 5062 5063 for (fire = first_fire; 5064 fire != NULL; 5065 fire = fire->ire_next) { 5066 5067 if (!(fire->ire_flags & RTF_MULTIRT)) 5068 continue; 5069 if (fire->ire_addr != dst) 5070 continue; 5071 5072 if (fire->ire_gw_secattr != NULL && 5073 tsol_ire_match_gwattr(fire, tsl) != 0) { 5074 continue; 5075 } 5076 5077 already_resolved = B_FALSE; 5078 5079 gw = fire->ire_gateway_addr; 5080 5081 gw_ire = ire_ftable_lookup(gw, 0, 0, IRE_INTERFACE, 5082 NULL, NULL, ALL_ZONES, 0, tsl, 5083 MATCH_IRE_RECURSIVE | MATCH_IRE_TYPE | 5084 MATCH_IRE_SECATTR, ipst); 5085 5086 /* No resolver for the gateway; we skip this ire. */ 5087 if (gw_ire == NULL) { 5088 continue; 5089 } 5090 ASSERT(gw_ire->ire_nce == NULL || 5091 gw_ire->ire_nce->nce_state == ND_REACHABLE); 5092 5093 if (first_cire != NULL) { 5094 5095 IRB_REFHOLD(cirb); 5096 /* 5097 * For all IRE_CACHE ires for that 5098 * destination. 5099 */ 5100 for (cire = first_cire; 5101 cire != NULL; 5102 cire = cire->ire_next) { 5103 5104 if (!(cire->ire_flags & RTF_MULTIRT)) 5105 continue; 5106 if (cire->ire_addr != dst) 5107 continue; 5108 if (cire->ire_marks & 5109 (IRE_MARK_CONDEMNED | 5110 IRE_MARK_TESTHIDDEN)) 5111 continue; 5112 5113 if (cire->ire_gw_secattr != NULL && 5114 tsol_ire_match_gwattr(cire, 5115 tsl) != 0) { 5116 continue; 5117 } 5118 5119 /* 5120 * Cache entries are linked to the 5121 * parent routes using the parent handle 5122 * (ire_phandle). If no cache entry has 5123 * the same handle as fire, fire is 5124 * still unresolved. 5125 */ 5126 ASSERT(cire->ire_phandle != 0); 5127 if (cire->ire_phandle == 5128 fire->ire_phandle) { 5129 already_resolved = B_TRUE; 5130 break; 5131 } 5132 } 5133 IRB_REFRELE(cirb); 5134 } 5135 5136 /* 5137 * This route is already resolved; proceed with 5138 * next one. 5139 */ 5140 if (already_resolved) { 5141 ire_refrele(gw_ire); 5142 continue; 5143 } 5144 5145 /* 5146 * Compute the time elapsed since our preceding 5147 * attempt to resolve that route. 5148 * If the MULTIRT_USESTAMP flag is set, we take 5149 * that route into account only if this time 5150 * interval exceeds ip_multirt_resolution_interval; 5151 * this prevents us from attempting to resolve a 5152 * broken route upon each sending of a packet. 5153 */ 5154 delta = lbolt - fire->ire_last_used_time; 5155 delta = TICK_TO_MSEC(delta); 5156 5157 res = (boolean_t)((delta > 5158 ipst->ips_ip_multirt_resolution_interval) || 5159 (!(flags & MULTIRT_USESTAMP))); 5160 5161 ip3dbg(("ire_multirt_lookup: fire %p, delta %lx, " 5162 "flags %04x, res %d\n", 5163 (void *)fire, delta, flags, res)); 5164 5165 if (res) { 5166 if (best_cire != NULL) { 5167 /* 5168 * Release the resolver associated 5169 * to the preceding candidate best 5170 * ire, if any. 5171 */ 5172 ire_refrele(best_cire); 5173 ASSERT(best_fire != NULL); 5174 } 5175 best_fire = fire; 5176 best_cire = gw_ire; 5177 continue; 5178 } 5179 5180 ire_refrele(gw_ire); 5181 } 5182 } 5183 5184 if (best_fire != NULL) { 5185 IRE_REFHOLD(best_fire); 5186 } 5187 IRB_REFRELE(firb); 5188 5189 /* Release the first IRE_CACHE we initially looked up, if any. */ 5190 if (first_cire != NULL) 5191 ire_refrele(first_cire); 5192 5193 /* Found a resolvable route. 
*/ 5194 if (best_fire != NULL) { 5195 ASSERT(best_cire != NULL); 5196 5197 if (*fire_arg != NULL) 5198 ire_refrele(*fire_arg); 5199 if (*ire_arg != NULL) 5200 ire_refrele(*ire_arg); 5201 5202 /* 5203 * Update the passed-in arguments with the 5204 * resolvable multirt route we found. 5205 */ 5206 *fire_arg = best_fire; 5207 *ire_arg = best_cire; 5208 5209 ip2dbg(("ire_multirt_lookup: returning B_TRUE, " 5210 "*fire_arg %p, *ire_arg %p\n", 5211 (void *)best_fire, (void *)best_cire)); 5212 5213 return (B_TRUE); 5214 } 5215 5216 ASSERT(best_cire == NULL); 5217 5218 ip2dbg(("ire_multirt_lookup: returning B_FALSE, *fire_arg %p, " 5219 "*ire_arg %p\n", 5220 (void *)*fire_arg, (void *)*ire_arg)); 5221 5222 /* No resolvable route. */ 5223 return (B_FALSE); 5224 } 5225 5226 /* 5227 * IRE iterator for inbound and loopback broadcast processing. 5228 * Given an IRE_BROADCAST ire, walk the ires with the same destination 5229 * address, but skip over the passed-in ire. Returns the next ire without 5230 * a hold - assumes that the caller holds a reference on the IRE bucket. 5231 */ 5232 ire_t * 5233 ire_get_next_bcast_ire(ire_t *curr, ire_t *ire) 5234 { 5235 ill_t *ill; 5236 5237 if (curr == NULL) { 5238 for (curr = ire->ire_bucket->irb_ire; curr != NULL; 5239 curr = curr->ire_next) { 5240 if (curr->ire_addr == ire->ire_addr) 5241 break; 5242 } 5243 } else { 5244 curr = curr->ire_next; 5245 } 5246 ill = ire_to_ill(ire); 5247 for (; curr != NULL; curr = curr->ire_next) { 5248 if (curr->ire_addr != ire->ire_addr) { 5249 /* 5250 * All the IREs to a given destination are contiguous; 5251 * break out once the address doesn't match. 5252 */ 5253 break; 5254 } 5255 if (curr == ire) { 5256 /* skip over the passed-in ire */ 5257 continue; 5258 } 5259 if ((curr->ire_stq != NULL && ire->ire_stq == NULL) || 5260 (curr->ire_stq == NULL && ire->ire_stq != NULL)) { 5261 /* 5262 * If the passed-in ire is loopback, skip over 5263 * non-loopback ires and vice versa. 5264 */ 5265 continue; 5266 } 5267 if (ire_to_ill(curr) != ill) { 5268 /* skip over IREs going through a different interface */ 5269 continue; 5270 } 5271 if (curr->ire_marks & IRE_MARK_CONDEMNED) { 5272 /* skip over deleted IREs */ 5273 continue; 5274 } 5275 return (curr); 5276 } 5277 return (NULL); 5278 } 5279 5280 #ifdef DEBUG 5281 void 5282 ire_trace_ref(ire_t *ire) 5283 { 5284 mutex_enter(&ire->ire_lock); 5285 if (ire->ire_trace_disable) { 5286 mutex_exit(&ire->ire_lock); 5287 return; 5288 } 5289 5290 if (th_trace_ref(ire, ire->ire_ipst)) { 5291 mutex_exit(&ire->ire_lock); 5292 } else { 5293 ire->ire_trace_disable = B_TRUE; 5294 mutex_exit(&ire->ire_lock); 5295 ire_trace_cleanup(ire); 5296 } 5297 } 5298 5299 void 5300 ire_untrace_ref(ire_t *ire) 5301 { 5302 mutex_enter(&ire->ire_lock); 5303 if (!ire->ire_trace_disable) 5304 th_trace_unref(ire); 5305 mutex_exit(&ire->ire_lock); 5306 } 5307 5308 static void 5309 ire_trace_cleanup(const ire_t *ire) 5310 { 5311 th_trace_cleanup(ire, ire->ire_trace_disable); 5312 } 5313 #endif /* DEBUG */ 5314 5315 /* 5316 * Generate a message chain with an arp request to resolve the in_ire. 5317 * It is assumed that in_ire itself is currently in the ire cache table, 5318 * so we create a fake_ire filled with enough information about ire_addr etc. 5319 * to retrieve in_ire when the DL_UNITDATA response from the resolver 5320 * comes back. The fake_ire itself is created by calling esballoc with 5321 * the fr_rtnp (free routine) set to ire_freemblk. This routine will be 5322 * invoked when the mblk containing fake_ire is freed. 
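 *
 * In outline, the esballoc() pattern used below (see the function
 * body for the real thing):
 *
 *	buf = kmem_alloc(sizeof (ire_t) + sizeof (frtn_t), KM_NOSLEEP);
 *	frtnp->free_func = ire_freemblk;	(runs when ire_mp is freed)
 *	ire_mp = esballoc((unsigned char *)buf, bufsize, BPRI_MED, frtnp);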
5323 */ 5324 void 5325 ire_arpresolve(ire_t *in_ire) 5326 { 5327 areq_t *areq; 5328 ipaddr_t *addrp; 5329 mblk_t *ire_mp, *areq_mp; 5330 ire_t *ire, *buf; 5331 size_t bufsize; 5332 frtn_t *frtnp; 5333 ill_t *dst_ill; 5334 ip_stack_t *ipst; 5335 5336 ASSERT(in_ire->ire_nce != NULL); 5337 5338 dst_ill = ire_to_ill(in_ire); 5339 ipst = dst_ill->ill_ipst; 5340 5341 /* 5342 * Construct message chain for the resolver 5343 * of the form: 5344 * ARP_REQ_MBLK-->IRE_MBLK 5345 * 5346 * NOTE : If the response does not 5347 * come back, ARP frees the packet. For this reason, 5348 * we can't REFHOLD the bucket of save_ire to prevent 5349 * deletions. We may not be able to REFRELE the bucket 5350 * if the response never comes back. Thus, before 5351 * adding the ire, ire_add_v4 will make sure that the 5352 * interface route does not get deleted. This is the 5353 * only case unlike ip_newroute_v6, ip_newroute_ipif_v6 5354 * where we can always prevent deletions because of 5355 * the synchronous nature of adding IRES i.e 5356 * ire_add_then_send is called after creating the IRE. 5357 */ 5358 5359 /* 5360 * We use esballoc to allocate the second part (IRE_MBLK) 5361 * of the message chain depicted above. This mblk will be freed 5362 * by arp when there is a timeout, and otherwise passed to IP 5363 * and IP will free it after processing the ARP response. 5364 */ 5365 5366 bufsize = sizeof (ire_t) + sizeof (frtn_t); 5367 buf = kmem_alloc(bufsize, KM_NOSLEEP); 5368 if (buf == NULL) { 5369 ip1dbg(("ire_arpresolve: alloc buffer failed\n")); 5370 return; 5371 } 5372 frtnp = (frtn_t *)(buf + 1); 5373 frtnp->free_arg = (caddr_t)buf; 5374 frtnp->free_func = ire_freemblk; 5375 5376 ire_mp = esballoc((unsigned char *)buf, bufsize, BPRI_MED, frtnp); 5377 if (ire_mp == NULL) { 5378 ip1dbg(("ire_arpresolve: esballoc failed\n")); 5379 kmem_free(buf, bufsize); 5380 return; 5381 } 5382 5383 areq_mp = copyb(dst_ill->ill_resolver_mp); 5384 if (areq_mp == NULL) { 5385 freemsg(ire_mp); 5386 return; 5387 } 5388 5389 ire_mp->b_datap->db_type = IRE_ARPRESOLVE_TYPE; 5390 ire = (ire_t *)buf; 5391 /* 5392 * keep enough info in the fake ire so that we can pull up 5393 * the incomplete ire (in_ire) after result comes back from 5394 * arp and make it complete. 5395 */ 5396 *ire = ire_null; 5397 ire->ire_u = in_ire->ire_u; 5398 ire->ire_ipif_seqid = in_ire->ire_ipif_seqid; 5399 ire->ire_ipif_ifindex = in_ire->ire_ipif_ifindex; 5400 ire->ire_ipif = in_ire->ire_ipif; 5401 ire->ire_stq = dst_ill->ill_wq; 5402 ire->ire_stq_ifindex = dst_ill->ill_phyint->phyint_ifindex; 5403 ire->ire_zoneid = in_ire->ire_zoneid; 5404 ire->ire_stackid = ipst->ips_netstack->netstack_stackid; 5405 ire->ire_ipst = ipst; 5406 5407 /* 5408 * ire_freemblk will be called when ire_mp is freed, both for 5409 * successful and failed arp resolution. IRE_MARK_UNCACHED will be set 5410 * when the arp resolution failed. 5411 */ 5412 ire->ire_marks |= IRE_MARK_UNCACHED; 5413 ire->ire_mp = ire_mp; 5414 ire_mp->b_wptr = (uchar_t *)&ire[1]; 5415 ire_mp->b_cont = NULL; 5416 linkb(areq_mp, ire_mp); 5417 5418 /* 5419 * Fill in the source and dest addrs for the resolver. 5420 * NOTE: this depends on memory layouts imposed by 5421 * ill_init(). 
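 * (Specifically, areq_sender_addr_offset and areq_target_addr_offset
 * are byte offsets into the areq_t template that ill_init() set up;
 * the stores below write IPv4 addresses at those offsets.)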
/*
 * Esballoc free function for AR_ENTRY_QUERY request to clean up any
 * unresolved ire_t and/or nce_t structures when ARP resolution fails.
 *
 * This function can be called by ARP via the free routine for ire_mp, or
 * by IPv4 (both the host and forwarding paths) via ire_delete
 * in case ARP resolution fails.
 * NOTE: Since IP is MT, ARP can call into IP but not vice versa
 * (for IP to talk to ARP, it still has to send AR* messages).
 *
 * Note that the ARP/IP merge should replace this functionality by providing
 * direct function calls to clean up unresolved entries in ire/nce lists.
 */
void
ire_freemblk(ire_t *ire_mp)
{
	nce_t *nce = NULL;
	ill_t *ill;
	ip_stack_t *ipst;
	netstack_t *ns = NULL;

	ASSERT(ire_mp != NULL);

	if ((ire_mp->ire_addr == NULL) && (ire_mp->ire_gateway_addr == NULL)) {
		ip1dbg(("ire_freemblk(0x%p) ire_addr is NULL\n",
		    (void *)ire_mp));
		goto cleanup;
	}
	if ((ire_mp->ire_marks & IRE_MARK_UNCACHED) == 0) {
		goto cleanup; /* everything succeeded; just free and return */
	}

	/*
	 * The arp information corresponding to this ire_mp was not
	 * transferred to an ire_cache entry. We need to clean up any
	 * incomplete ire's and nce, if necessary.
	 */
	ASSERT(ire_mp->ire_stq != NULL);
	ASSERT(ire_mp->ire_stq_ifindex != 0);
	ASSERT(ire_mp->ire_ipst != NULL);

	ns = netstack_find_by_stackid(ire_mp->ire_stackid);
	ipst = (ns ? ns->netstack_ip : NULL);
	if (ipst == NULL || ipst != ire_mp->ire_ipst) /* Disappeared on us */
		goto cleanup;

	/*
	 * Get any nce's corresponding to this ire_mp. We first have to
	 * make sure that the ill is still around.
	 */
	ill = ill_lookup_on_ifindex(ire_mp->ire_stq_ifindex,
	    B_FALSE, NULL, NULL, NULL, NULL, ipst);
	if (ill == NULL || (ire_mp->ire_stq != ill->ill_wq) ||
	    (ill->ill_state_flags & ILL_CONDEMNED)) {
		/*
		 * The ill went away, so there is no nce to clean up.
		 * Note that the ill_state_flags could be set to
		 * ILL_CONDEMNED after this point, but if we know
		 * that it is CONDEMNED now, we just bail out quickly.
		 */
		if (ill != NULL)
			ill_refrele(ill);
		goto cleanup;
	}
	nce = ndp_lookup_v4(ill,
	    ((ire_mp->ire_gateway_addr != INADDR_ANY) ?
	    &ire_mp->ire_gateway_addr : &ire_mp->ire_addr),
	    B_FALSE);
	ill_refrele(ill);

	if ((nce != NULL) && (nce->nce_state != ND_REACHABLE)) {
		/*
		 * Some incomplete nce was found.
		 */
		DTRACE_PROBE2(ire__freemblk__arp__resolv__fail,
		    nce_t *, nce, ire_t *, ire_mp);
		/*
		 * Send the icmp_unreachable messages for the queued mblks in
		 * ire->ire_nce->nce_qd_mp, since ARP resolution failed
		 * for this ire.
		 */
		arp_resolv_failed(nce);
		/*
		 * Delete the nce and clean up all ire's pointing at this nce
		 * in the cachetable.
		 */
		ndp_delete(nce);
	}
	if (nce != NULL)
		NCE_REFRELE(nce); /* release the ref taken by ndp_lookup_v4 */

cleanup:
	if (ns != NULL)
		netstack_rele(ns);
	/*
	 * Get rid of the ire buffer.
	 * We call kmem_free here (instead of ire_delete()), since
	 * this is the freeb's callback.
	 */
	kmem_free(ire_mp, sizeof (ire_t) + sizeof (frtn_t));
}
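
/*
 * Sketch, not part of the original file: the netstack validation
 * pattern used by ire_freemblk() above. Because the free routine can
 * run long after the stack that created the ire has been torn down,
 * the stackid is re-resolved and checked before the cached ip_stack_t
 * pointer is trusted. The function name is hypothetical.
 */
#ifdef notdef
static boolean_t
example_stack_alive(ire_t *ire_mp, netstack_t **nsp)
{
	netstack_t *ns = netstack_find_by_stackid(ire_mp->ire_stackid);

	*nsp = ns;	/* caller must netstack_rele() if non-NULL */
	return (ns != NULL && ns->netstack_ip == ire_mp->ire_ipst);
}
#endif /* notdef */
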
/*
 * Find, or create if needed, a neighbor cache entry nce_t for IRE_CACHE and
 * non-loopback IRE_BROADCAST ire's.
 *
 * If a neighbor-cache entry has to be created (i.e., one does not already
 * exist in the nce list) the nce_res_mp and nce_state of the neighbor cache
 * entry are initialized in ndp_add_v4(). These values are picked from
 * the src_nce, if one is passed in. Otherwise (if src_nce == NULL) the
 * ire->ire_type and the outgoing interface (ire_to_ill(ire)) values
 * determine the {nce_state, nce_res_mp} of the nce_t created. All
 * IRE_BROADCAST entries have nce_state = ND_REACHABLE, and the nce_res_mp
 * is set to the ill_bcast_mp of the outgoing interface. For unicast ire
 * entries,
 * - if the outgoing interface is of type IRE_IF_RESOLVER, a newly created
 *   nce_t will have a null nce_res_mp, and will be in the ND_INITIAL state.
 * - if the outgoing interface is an IRE_IF_NORESOLVER interface, no link
 *   layer resolution is necessary, so the nce_t will be in the
 *   ND_REACHABLE state and the nce_res_mp will have a copy of the
 *   ill_resolver_mp of the outgoing interface.
 *
 * The link layer information needed for broadcast addresses, and for
 * packets sent on IRE_IF_NORESOLVER interfaces, is a constant mapping that
 * never needs re-verification for the lifetime of the nce_t. These are
 * therefore marked NCE_F_PERMANENT, and never allowed to expire via
 * NCE_EXPIRED.
 *
 * IRE_CACHE ire's contain the information for the nexthop (ire_gateway_addr)
 * in the case of indirect routes, and for the dst itself (ire_addr) in the
 * case of direct routes, with the nce_res_mp containing a template
 * DL_UNITDATA request.
 *
 * The actual association of the ire_nce to the nce created here is
 * typically done in ire_add_v4 for IRE_CACHE entries. Exceptions
 * to this rule are SO_DONTROUTE ire's (IRE_MARK_NO_ADD), for which
 * the ire_nce assignment is done in ire_add_then_send.
 */
int
ire_nce_init(ire_t *ire, nce_t *src_nce)
{
	in_addr_t addr4;
	int err;
	nce_t *nce = NULL;
	ill_t *ire_ill;
	uint16_t nce_flags = 0;
	ip_stack_t *ipst;

	if (ire->ire_stq == NULL)
		return (0); /* no need to create nce for local/loopback */

	switch (ire->ire_type) {
	case IRE_CACHE:
		if (ire->ire_gateway_addr != INADDR_ANY)
			addr4 = ire->ire_gateway_addr; /* 'G' route */
		else
			addr4 = ire->ire_addr; /* direct route */
		break;
	case IRE_BROADCAST:
		addr4 = ire->ire_addr;
		nce_flags |= (NCE_F_PERMANENT|NCE_F_BCAST);
		break;
	default:
		return (0);
	}

	/*
	 * ire_ipif is picked based on RTF_SETSRC, usesrc etc.
	 * rules in ire_forward_src_ipif. We want the dlureq_mp
	 * for the outgoing interface, which we get from the ire_stq.
	 */
	ire_ill = ire_to_ill(ire);
	ipst = ire_ill->ill_ipst;

	/*
	 * IRE_IF_NORESOLVER entries never need re-verification and
	 * do not expire, so we mark them as NCE_F_PERMANENT.
	 */
	if (ire_ill->ill_net_type == IRE_IF_NORESOLVER)
		nce_flags |= NCE_F_PERMANENT;

retry_nce:
	err = ndp_lookup_then_add_v4(ire_ill, &addr4, nce_flags,
	    &nce, src_nce);

	if (err == EEXIST && NCE_EXPIRED(nce, ipst)) {
		/*
		 * We looked up an expired nce.
		 * Go back and try to create one again.
		 */
		ndp_delete(nce);
		NCE_REFRELE(nce);
		nce = NULL;
		goto retry_nce;
	}

	ip1dbg(("ire 0x%p addr 0x%lx type 0x%x; found nce 0x%p err %d\n",
	    (void *)ire, (ulong_t)addr4, ire->ire_type, (void *)nce, err));

	switch (err) {
	case 0:
	case EEXIST:
		/*
		 * return a pointer to a newly created or existing nce_t;
		 * note that the ire-nce mapping is many-one, i.e.,
		 * multiple ire's could point to the same nce_t.
		 */
		break;
	default:
		DTRACE_PROBE2(nce__init__fail, ill_t *, ire_ill, int, err);
		return (EINVAL);
	}
	if (ire->ire_type == IRE_BROADCAST) {
		/*
		 * Two bcast ires are created for each interface:
		 * 1. loopback copy (which does not have an
		 *    ire_stq, and therefore has no ire_nce), and,
		 * 2. the non-loopback copy, which has the nce_res_mp
		 *    initialized to a copy of the ill_bcast_mp, and
		 *    is marked as ND_REACHABLE at this point.
		 *    This nce does not undergo any further state changes,
		 *    and exists as long as the interface is plumbed.
		 * Note: the assignment of ire_nce here is a historical
		 * artifact of old code that used to inline ire_add().
		 */
		ire->ire_nce = nce;
		/*
		 * We are associating this nce to the ire,
		 * so change the nce ref taken in
		 * ndp_lookup_then_add_v4() from
		 * NCE_REFHOLD to NCE_REFHOLD_NOTR.
		 */
		NCE_REFHOLD_TO_REFHOLD_NOTR(ire->ire_nce);
	} else {
		/*
		 * We are not using this nce_t just yet so release
		 * the ref taken in ndp_lookup_then_add_v4().
		 */
		NCE_REFRELE(nce);
	}
	return (0);
}
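
/*
 * Illustrative sketch, not part of the original file: a typical
 * ire-creation path would call ire_nce_init() with a NULL src_nce,
 * letting the ire type and the outgoing interface pick the new nce's
 * {nce_state, nce_res_mp} per the rules described above. The function
 * name is hypothetical.
 */
#ifdef notdef
static int
example_ire_nce_setup(ire_t *ire)
{
	int err;

	/* no source nce to copy from; derive state from the ire itself */
	if ((err = ire_nce_init(ire, NULL)) != 0)
		ip1dbg(("example_ire_nce_setup: err %d\n", err));
	return (err);
}
#endif /* notdef */
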
/*
 * This is the implementation of the IPv4 IRE cache lookup procedure.
 * Separating the interface from the implementation allows additional
 * flexibility when specifying search criteria.
 */
static ire_t *
ip4_ctable_lookup_impl(ire_ctable_args_t *margs)
{
	irb_t *irb_ptr;
	ire_t *ire;
	ip_stack_t *ipst = margs->ict_ipst;

	if ((margs->ict_flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) &&
	    (margs->ict_ipif == NULL)) {
		return (NULL);
	}

	irb_ptr = &ipst->ips_ip_cache_table[IRE_ADDR_HASH(
	    *((ipaddr_t *)margs->ict_addr), ipst->ips_ip_cache_table_size)];
	rw_enter(&irb_ptr->irb_lock, RW_READER);
	for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) {
		if (ire->ire_marks & IRE_MARK_CONDEMNED)
			continue;
		ASSERT(ire->ire_mask == IP_HOST_MASK);
		if (ire_match_args(ire, *((ipaddr_t *)margs->ict_addr),
		    ire->ire_mask, *((ipaddr_t *)margs->ict_gateway),
		    margs->ict_type, margs->ict_ipif, margs->ict_zoneid, 0,
		    margs->ict_tsl, margs->ict_flags, margs->ict_wq)) {
			IRE_REFHOLD(ire);
			rw_exit(&irb_ptr->irb_lock);
			return (ire);
		}
	}

	rw_exit(&irb_ptr->irb_lock);
	return (NULL);
}
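
/*
 * Sketch, not part of the original file: the args structure above
 * makes it cheap to express new search criteria. A hypothetical
 * zone-restricted lookup that matches on type only might be built as
 * follows; compare ire_arpresolve_lookup() below, which layers
 * additional MATCH_IRE_* flags on the same implementation.
 */
#ifdef notdef
static ire_t *
example_ctable_lookup(ipaddr_t addr, zoneid_t zoneid, ip_stack_t *ipst)
{
	ire_ctable_args_t margs;
	ipaddr_t gw = INADDR_ANY;

	margs.ict_addr = &addr;
	margs.ict_gateway = &gw;
	margs.ict_type = IRE_CACHE;
	margs.ict_ipif = NULL;
	margs.ict_zoneid = zoneid;
	margs.ict_tsl = NULL;
	margs.ict_flags = MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY;
	margs.ict_ipst = ipst;
	margs.ict_wq = NULL;

	return (ip4_ctable_lookup_impl(&margs));
}
#endif /* notdef */
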
/*
 * This function locates IRE_CACHE entries which were added by the
 * ire_forward() path. We can fully specify the IRE we are looking for by
 * providing the ipif (MATCH_IRE_IPIF) *and* the stq (MATCH_IRE_WQ).
 */
ire_t *
ire_arpresolve_lookup(ipaddr_t addr, ipaddr_t gw, ipif_t *ipif,
    zoneid_t zoneid, ip_stack_t *ipst, queue_t *wq)
{
	ire_ctable_args_t margs;

	margs.ict_addr = &addr;
	margs.ict_gateway = &gw;
	margs.ict_type = IRE_CACHE;
	margs.ict_ipif = ipif;
	margs.ict_zoneid = zoneid;
	margs.ict_tsl = NULL;
	margs.ict_flags = MATCH_IRE_GW | MATCH_IRE_IPIF | MATCH_IRE_ZONEONLY |
	    MATCH_IRE_TYPE | MATCH_IRE_WQ;
	margs.ict_ipst = ipst;
	margs.ict_wq = wq;

	return (ip4_ctable_lookup_impl(&margs));
}
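
/*
 * Usage sketch, not part of the original file: a lookup hit returns
 * the ire with a reference held by ip4_ctable_lookup_impl(), so the
 * caller is responsible for the matching ire_refrele(). The function
 * name is hypothetical.
 */
#ifdef notdef
static void
example_arpresolve_lookup(ipaddr_t dst, ipaddr_t gw, ipif_t *ipif,
    zoneid_t zoneid, ip_stack_t *ipst, queue_t *wq)
{
	ire_t *ire;

	ire = ire_arpresolve_lookup(dst, gw, ipif, zoneid, ipst, wq);
	if (ire != NULL) {
		/* ... use the cached entry ... */
		ire_refrele(ire);
	}
}
#endif /* notdef */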