1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include <sys/types.h> 29 #include <sys/stream.h> 30 #include <sys/stropts.h> 31 #include <sys/strsun.h> 32 #include <sys/sysmacros.h> 33 #include <sys/errno.h> 34 #include <sys/dlpi.h> 35 #include <sys/socket.h> 36 #include <sys/ddi.h> 37 #include <sys/sunddi.h> 38 #include <sys/cmn_err.h> 39 #include <sys/debug.h> 40 #include <sys/vtrace.h> 41 #include <sys/kmem.h> 42 #include <sys/zone.h> 43 #include <sys/ethernet.h> 44 #include <sys/sdt.h> 45 46 #include <net/if.h> 47 #include <net/if_types.h> 48 #include <net/if_dl.h> 49 #include <net/route.h> 50 #include <netinet/in.h> 51 #include <netinet/ip6.h> 52 #include <netinet/icmp6.h> 53 54 #include <inet/common.h> 55 #include <inet/mi.h> 56 #include <inet/mib2.h> 57 #include <inet/nd.h> 58 #include <inet/ip.h> 59 #include <inet/ip_impl.h> 60 #include <inet/ipclassifier.h> 61 #include <inet/ip_if.h> 62 #include <inet/ip_ire.h> 63 #include <inet/ip_rts.h> 64 #include <inet/ip6.h> 65 #include <inet/ip_ndp.h> 66 #include <inet/ipsec_impl.h> 67 #include <inet/ipsec_info.h> 68 #include <inet/sctp_ip.h> 69 70 /* 71 * Function names with nce_ prefix are static while function 72 * names with ndp_ prefix are used by rest of the IP. 73 * 74 * Lock ordering: 75 * 76 * ndp_g_lock -> ill_lock -> nce_lock 77 * 78 * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and 79 * nce_next. Nce_lock protects the contents of the NCE (particularly 80 * nce_refcnt). 81 */ 82 83 static boolean_t nce_cmp_ll_addr(const nce_t *nce, const uchar_t *new_ll_addr, 84 uint32_t ll_addr_len); 85 static void nce_ire_delete(nce_t *nce); 86 static void nce_ire_delete1(ire_t *ire, char *nce_arg); 87 static void nce_set_ll(nce_t *nce, uchar_t *ll_addr); 88 static nce_t *nce_lookup_addr(ill_t *, const in6_addr_t *, nce_t *); 89 static nce_t *nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr); 90 static void nce_make_mapping(nce_t *nce, uchar_t *addrpos, 91 uchar_t *addr); 92 static int nce_set_multicast(ill_t *ill, const in6_addr_t *addr); 93 static void nce_queue_mp(nce_t *nce, mblk_t *mp); 94 static void nce_report1(nce_t *nce, uchar_t *mp_arg); 95 static mblk_t *nce_udreq_alloc(ill_t *ill); 96 static void nce_update(nce_t *nce, uint16_t new_state, 97 uchar_t *new_ll_addr); 98 static uint32_t nce_solicit(nce_t *nce, mblk_t *mp); 99 static boolean_t nce_xmit(ill_t *ill, uint32_t operation, 100 ill_t *hwaddr_ill, boolean_t use_lla_addr, const in6_addr_t *sender, 101 const in6_addr_t *target, int flag); 102 extern void th_trace_rrecord(th_trace_t *); 103 static int ndp_add_v4(ill_t *, const in_addr_t *, uint16_t, 104 nce_t **, nce_t *); 105 106 /* 107 * We track the time of creation of the nce in the nce_init_time field 108 * of IPv4 nce_t entries. If an nce is stuck in the ND_INITIAL state for 109 * more than NCE_STUCK_TIMEOUT milliseconds, trigger the nce-stuck dtrace 110 * probe to assist in debugging. This probe will be fired from 111 * nce_thread_exit() for debug kernels, and from nce_report1() when 112 * 'ndd -get /dev/ip ip_ndp_cache_report' is invoked on both debug and 113 * non-debug kernels. 114 */ 115 #define NCE_STUCK_TIMEOUT 120000 116 117 #ifdef NCE_DEBUG 118 void nce_trace_inactive(nce_t *); 119 #endif 120 121 #define NCE_HASH_PTR_V4(ipst, addr) \ 122 (&((ipst)->ips_ndp4->nce_hash_tbl[IRE_ADDR_HASH(addr, NCE_TABLE_SIZE)])) 123 124 #define NCE_HASH_PTR_V6(ipst, addr) \ 125 (&((ipst)->ips_ndp6->nce_hash_tbl[NCE_ADDR_HASH_V6(addr, \ 126 NCE_TABLE_SIZE)])) 127 128 /* 129 * Compute default flags to use for an advertisement of this nce's address. 130 */ 131 static int 132 nce_advert_flags(const nce_t *nce) 133 { 134 int flag = 0; 135 136 if (nce->nce_flags & NCE_F_ISROUTER) 137 flag |= NDP_ISROUTER; 138 if (!(nce->nce_flags & NCE_F_PROXY)) 139 flag |= NDP_ORIDE; 140 return (flag); 141 } 142 143 /* Non-tunable probe interval, based on link capabilities */ 144 #define ILL_PROBE_INTERVAL(ill) ((ill)->ill_note_link ? 150 : 1500) 145 146 /* 147 * NDP Cache Entry creation routine. 148 * Mapped entries will never do NUD . 149 * This routine must always be called with ndp6->ndp_g_lock held. 150 * Prior to return, nce_refcnt is incremented. 151 */ 152 int 153 ndp_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr, 154 const in6_addr_t *mask, const in6_addr_t *extract_mask, 155 uint32_t hw_extract_start, uint16_t flags, uint16_t state, 156 nce_t **newnce) 157 { 158 static nce_t nce_nil; 159 nce_t *nce; 160 mblk_t *mp; 161 mblk_t *template; 162 nce_t **ncep; 163 int err; 164 boolean_t dropped = B_FALSE; 165 ip_stack_t *ipst = ill->ill_ipst; 166 167 ASSERT(MUTEX_HELD(&ipst->ips_ndp6->ndp_g_lock)); 168 ASSERT(ill != NULL && ill->ill_isv6); 169 if (IN6_IS_ADDR_UNSPECIFIED(addr)) { 170 ip0dbg(("ndp_add_v6: no addr\n")); 171 return (EINVAL); 172 } 173 if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) { 174 ip0dbg(("ndp_add_v6: flags = %x\n", (int)flags)); 175 return (EINVAL); 176 } 177 if (IN6_IS_ADDR_UNSPECIFIED(extract_mask) && 178 (flags & NCE_F_MAPPING)) { 179 ip0dbg(("ndp_add_v6: extract mask zero for mapping")); 180 return (EINVAL); 181 } 182 /* 183 * Allocate the mblk to hold the nce. 184 * 185 * XXX This can come out of a separate cache - nce_cache. 186 * We don't need the mp anymore as there are no more 187 * "qwriter"s 188 */ 189 mp = allocb(sizeof (nce_t), BPRI_MED); 190 if (mp == NULL) 191 return (ENOMEM); 192 193 nce = (nce_t *)mp->b_rptr; 194 mp->b_wptr = (uchar_t *)&nce[1]; 195 *nce = nce_nil; 196 197 /* 198 * This one holds link layer address 199 */ 200 if (ill->ill_net_type == IRE_IF_RESOLVER) { 201 template = nce_udreq_alloc(ill); 202 } else { 203 if (ill->ill_resolver_mp == NULL) { 204 freeb(mp); 205 return (EINVAL); 206 } 207 ASSERT((ill->ill_net_type == IRE_IF_NORESOLVER)); 208 template = copyb(ill->ill_resolver_mp); 209 } 210 if (template == NULL) { 211 freeb(mp); 212 return (ENOMEM); 213 } 214 nce->nce_ill = ill; 215 nce->nce_ipversion = IPV6_VERSION; 216 nce->nce_flags = flags; 217 nce->nce_state = state; 218 nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT; 219 nce->nce_rcnt = ill->ill_xmit_count; 220 nce->nce_addr = *addr; 221 nce->nce_mask = *mask; 222 nce->nce_extract_mask = *extract_mask; 223 nce->nce_ll_extract_start = hw_extract_start; 224 nce->nce_fp_mp = NULL; 225 nce->nce_res_mp = template; 226 if (state == ND_REACHABLE) 227 nce->nce_last = TICK_TO_MSEC(lbolt64); 228 else 229 nce->nce_last = 0; 230 nce->nce_qd_mp = NULL; 231 nce->nce_mp = mp; 232 if (hw_addr != NULL) 233 nce_set_ll(nce, hw_addr); 234 /* This one is for nce getting created */ 235 nce->nce_refcnt = 1; 236 mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL); 237 if (nce->nce_flags & NCE_F_MAPPING) { 238 ASSERT(IN6_IS_ADDR_MULTICAST(addr)); 239 ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_mask)); 240 ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_extract_mask)); 241 ncep = &ipst->ips_ndp6->nce_mask_entries; 242 } else { 243 ncep = ((nce_t **)NCE_HASH_PTR_V6(ipst, *addr)); 244 } 245 246 #ifdef NCE_DEBUG 247 bzero(nce->nce_trace, sizeof (th_trace_t *) * IP_TR_HASH_MAX); 248 #endif 249 /* 250 * Atomically ensure that the ill is not CONDEMNED, before 251 * adding the NCE. 252 */ 253 mutex_enter(&ill->ill_lock); 254 if (ill->ill_state_flags & ILL_CONDEMNED) { 255 mutex_exit(&ill->ill_lock); 256 freeb(mp); 257 freeb(template); 258 return (EINVAL); 259 } 260 if ((nce->nce_next = *ncep) != NULL) 261 nce->nce_next->nce_ptpn = &nce->nce_next; 262 *ncep = nce; 263 nce->nce_ptpn = ncep; 264 *newnce = nce; 265 /* This one is for nce being used by an active thread */ 266 NCE_REFHOLD(*newnce); 267 268 /* Bump up the number of nce's referencing this ill */ 269 ill->ill_nce_cnt++; 270 mutex_exit(&ill->ill_lock); 271 272 err = 0; 273 if ((flags & NCE_F_PERMANENT) && state == ND_PROBE) { 274 mutex_enter(&nce->nce_lock); 275 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 276 nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT; 277 mutex_exit(&nce->nce_lock); 278 dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL, B_FALSE, 279 &ipv6_all_zeros, addr, NDP_PROBE); 280 if (dropped) { 281 mutex_enter(&nce->nce_lock); 282 nce->nce_pcnt++; 283 mutex_exit(&nce->nce_lock); 284 } 285 NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(ill)); 286 mutex_enter(&ipst->ips_ndp6->ndp_g_lock); 287 err = EINPROGRESS; 288 } else if (flags & NCE_F_UNSOL_ADV) { 289 /* 290 * We account for the transmit below by assigning one 291 * less than the ndd variable. Subsequent decrements 292 * are done in ndp_timer. 293 */ 294 mutex_enter(&nce->nce_lock); 295 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 296 nce->nce_unsolicit_count = ipst->ips_ip_ndp_unsolicit_count - 1; 297 mutex_exit(&nce->nce_lock); 298 dropped = nce_xmit(ill, 299 ND_NEIGHBOR_ADVERT, 300 ill, /* ill to be used for extracting ill_nd_lla */ 301 B_TRUE, /* use ill_nd_lla */ 302 addr, /* Source and target of the advertisement pkt */ 303 &ipv6_all_hosts_mcast, /* Destination of the packet */ 304 nce_advert_flags(nce)); 305 mutex_enter(&nce->nce_lock); 306 if (dropped) 307 nce->nce_unsolicit_count++; 308 if (nce->nce_unsolicit_count != 0) { 309 nce->nce_timeout_id = timeout(ndp_timer, nce, 310 MSEC_TO_TICK(ipst->ips_ip_ndp_unsolicit_interval)); 311 } 312 mutex_exit(&nce->nce_lock); 313 mutex_enter(&ipst->ips_ndp6->ndp_g_lock); 314 } 315 /* 316 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then 317 * we call nce_fastpath as soon as the nce is resolved in ndp_process. 318 * We call nce_fastpath from nce_update if the link layer address of 319 * the peer changes from nce_update 320 */ 321 if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER) 322 nce_fastpath(nce); 323 return (err); 324 } 325 326 int 327 ndp_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr, 328 const in6_addr_t *mask, const in6_addr_t *extract_mask, 329 uint32_t hw_extract_start, uint16_t flags, uint16_t state, 330 nce_t **newnce) 331 { 332 int err = 0; 333 nce_t *nce; 334 ip_stack_t *ipst = ill->ill_ipst; 335 336 ASSERT(ill->ill_isv6); 337 mutex_enter(&ipst->ips_ndp6->ndp_g_lock); 338 339 /* Get head of v6 hash table */ 340 nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr)); 341 nce = nce_lookup_addr(ill, addr, nce); 342 if (nce == NULL) { 343 err = ndp_add_v6(ill, 344 hw_addr, 345 addr, 346 mask, 347 extract_mask, 348 hw_extract_start, 349 flags, 350 state, 351 newnce); 352 } else { 353 *newnce = nce; 354 err = EEXIST; 355 } 356 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 357 return (err); 358 } 359 360 /* 361 * Remove all the CONDEMNED nces from the appropriate hash table. 362 * We create a private list of NCEs, these may have ires pointing 363 * to them, so the list will be passed through to clean up dependent 364 * ires and only then we can do NCE_REFRELE which can make NCE inactive. 365 */ 366 static void 367 nce_remove(ndp_g_t *ndp, nce_t *nce, nce_t **free_nce_list) 368 { 369 nce_t *nce1; 370 nce_t **ptpn; 371 372 ASSERT(MUTEX_HELD(&ndp->ndp_g_lock)); 373 ASSERT(ndp->ndp_g_walker == 0); 374 for (; nce; nce = nce1) { 375 nce1 = nce->nce_next; 376 mutex_enter(&nce->nce_lock); 377 if (nce->nce_flags & NCE_F_CONDEMNED) { 378 ptpn = nce->nce_ptpn; 379 nce1 = nce->nce_next; 380 if (nce1 != NULL) 381 nce1->nce_ptpn = ptpn; 382 *ptpn = nce1; 383 nce->nce_ptpn = NULL; 384 nce->nce_next = NULL; 385 nce->nce_next = *free_nce_list; 386 *free_nce_list = nce; 387 } 388 mutex_exit(&nce->nce_lock); 389 } 390 } 391 392 /* 393 * 1. Mark the nce CONDEMNED. This ensures that no new nce_lookup() 394 * will return this NCE. Also no new IREs will be created that 395 * point to this NCE (See ire_add_v6). Also no new timeouts will 396 * be started (See NDP_RESTART_TIMER). 397 * 2. Cancel any currently running timeouts. 398 * 3. If there is an ndp walker, return. The walker will do the cleanup. 399 * This ensures that walkers see a consistent list of NCEs while walking. 400 * 4. Otherwise remove the NCE from the list of NCEs 401 * 5. Delete all IREs pointing to this NCE. 402 */ 403 void 404 ndp_delete(nce_t *nce) 405 { 406 nce_t **ptpn; 407 nce_t *nce1; 408 int ipversion = nce->nce_ipversion; 409 ndp_g_t *ndp; 410 ip_stack_t *ipst = nce->nce_ill->ill_ipst; 411 412 if (ipversion == IPV4_VERSION) 413 ndp = ipst->ips_ndp4; 414 else 415 ndp = ipst->ips_ndp6; 416 417 /* Serialize deletes */ 418 mutex_enter(&nce->nce_lock); 419 if (nce->nce_flags & NCE_F_CONDEMNED) { 420 /* Some other thread is doing the delete */ 421 mutex_exit(&nce->nce_lock); 422 return; 423 } 424 /* 425 * Caller has a refhold. Also 1 ref for being in the list. Thus 426 * refcnt has to be >= 2 427 */ 428 ASSERT(nce->nce_refcnt >= 2); 429 nce->nce_flags |= NCE_F_CONDEMNED; 430 mutex_exit(&nce->nce_lock); 431 432 nce_fastpath_list_delete(nce); 433 434 /* 435 * Cancel any running timer. Timeout can't be restarted 436 * since CONDEMNED is set. Can't hold nce_lock across untimeout. 437 * Passing invalid timeout id is fine. 438 */ 439 if (nce->nce_timeout_id != 0) { 440 (void) untimeout(nce->nce_timeout_id); 441 nce->nce_timeout_id = 0; 442 } 443 444 mutex_enter(&ndp->ndp_g_lock); 445 if (nce->nce_ptpn == NULL) { 446 /* 447 * The last ndp walker has already removed this nce from 448 * the list after we marked the nce CONDEMNED and before 449 * we grabbed the global lock. 450 */ 451 mutex_exit(&ndp->ndp_g_lock); 452 return; 453 } 454 if (ndp->ndp_g_walker > 0) { 455 /* 456 * Can't unlink. The walker will clean up 457 */ 458 ndp->ndp_g_walker_cleanup = B_TRUE; 459 mutex_exit(&ndp->ndp_g_lock); 460 return; 461 } 462 463 /* 464 * Now remove the nce from the list. NDP_RESTART_TIMER won't restart 465 * the timer since it is marked CONDEMNED. 466 */ 467 ptpn = nce->nce_ptpn; 468 nce1 = nce->nce_next; 469 if (nce1 != NULL) 470 nce1->nce_ptpn = ptpn; 471 *ptpn = nce1; 472 nce->nce_ptpn = NULL; 473 nce->nce_next = NULL; 474 mutex_exit(&ndp->ndp_g_lock); 475 476 nce_ire_delete(nce); 477 } 478 479 void 480 ndp_inactive(nce_t *nce) 481 { 482 mblk_t **mpp; 483 ill_t *ill; 484 485 ASSERT(nce->nce_refcnt == 0); 486 ASSERT(MUTEX_HELD(&nce->nce_lock)); 487 ASSERT(nce->nce_fastpath == NULL); 488 489 /* Free all nce allocated messages */ 490 mpp = &nce->nce_first_mp_to_free; 491 do { 492 while (*mpp != NULL) { 493 mblk_t *mp; 494 495 mp = *mpp; 496 *mpp = mp->b_next; 497 498 inet_freemsg(mp); 499 } 500 } while (mpp++ != &nce->nce_last_mp_to_free); 501 502 #ifdef NCE_DEBUG 503 nce_trace_inactive(nce); 504 #endif 505 506 ill = nce->nce_ill; 507 mutex_enter(&ill->ill_lock); 508 ill->ill_nce_cnt--; 509 /* 510 * If the number of nce's associated with this ill have dropped 511 * to zero, check whether we need to restart any operation that 512 * is waiting for this to happen. 513 */ 514 if (ill->ill_nce_cnt == 0) { 515 /* ipif_ill_refrele_tail drops the ill_lock */ 516 ipif_ill_refrele_tail(ill); 517 } else { 518 mutex_exit(&ill->ill_lock); 519 } 520 mutex_destroy(&nce->nce_lock); 521 if (nce->nce_mp != NULL) 522 inet_freemsg(nce->nce_mp); 523 } 524 525 /* 526 * ndp_walk routine. Delete the nce if it is associated with the ill 527 * that is going away. Always called as a writer. 528 */ 529 void 530 ndp_delete_per_ill(nce_t *nce, uchar_t *arg) 531 { 532 if ((nce != NULL) && nce->nce_ill == (ill_t *)arg) { 533 ndp_delete(nce); 534 } 535 } 536 537 /* 538 * Walk a list of to be inactive NCEs and blow away all the ires. 539 */ 540 static void 541 nce_ire_delete_list(nce_t *nce) 542 { 543 nce_t *nce_next; 544 545 ASSERT(nce != NULL); 546 while (nce != NULL) { 547 nce_next = nce->nce_next; 548 nce->nce_next = NULL; 549 550 /* 551 * It is possible for the last ndp walker (this thread) 552 * to come here after ndp_delete has marked the nce CONDEMNED 553 * and before it has removed the nce from the fastpath list 554 * or called untimeout. So we need to do it here. It is safe 555 * for both ndp_delete and this thread to do it twice or 556 * even simultaneously since each of the threads has a 557 * reference on the nce. 558 */ 559 nce_fastpath_list_delete(nce); 560 /* 561 * Cancel any running timer. Timeout can't be restarted 562 * since CONDEMNED is set. Can't hold nce_lock across untimeout. 563 * Passing invalid timeout id is fine. 564 */ 565 if (nce->nce_timeout_id != 0) { 566 (void) untimeout(nce->nce_timeout_id); 567 nce->nce_timeout_id = 0; 568 } 569 /* 570 * We might hit this func thus in the v4 case: 571 * ipif_down->ipif_ndp_down->ndp_walk 572 */ 573 574 if (nce->nce_ipversion == IPV4_VERSION) { 575 ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, 576 IRE_CACHE, nce_ire_delete1, 577 (char *)nce, nce->nce_ill); 578 } else { 579 ASSERT(nce->nce_ipversion == IPV6_VERSION); 580 ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, 581 IRE_CACHE, nce_ire_delete1, 582 (char *)nce, nce->nce_ill); 583 } 584 NCE_REFRELE_NOTR(nce); 585 nce = nce_next; 586 } 587 } 588 589 /* 590 * Delete an ire when the nce goes away. 591 */ 592 /* ARGSUSED */ 593 static void 594 nce_ire_delete(nce_t *nce) 595 { 596 if (nce->nce_ipversion == IPV6_VERSION) { 597 ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE, 598 nce_ire_delete1, (char *)nce, nce->nce_ill); 599 NCE_REFRELE_NOTR(nce); 600 } else { 601 ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE, 602 nce_ire_delete1, (char *)nce, nce->nce_ill); 603 NCE_REFRELE_NOTR(nce); 604 } 605 } 606 607 /* 608 * ire_walk routine used to delete every IRE that shares this nce 609 */ 610 static void 611 nce_ire_delete1(ire_t *ire, char *nce_arg) 612 { 613 nce_t *nce = (nce_t *)nce_arg; 614 615 ASSERT(ire->ire_type == IRE_CACHE); 616 617 if (ire->ire_nce == nce) { 618 ASSERT(ire->ire_ipversion == nce->nce_ipversion); 619 ire_delete(ire); 620 } 621 } 622 623 /* 624 * Restart DAD on given NCE. Returns B_TRUE if DAD has been restarted. 625 */ 626 boolean_t 627 ndp_restart_dad(nce_t *nce) 628 { 629 boolean_t started; 630 boolean_t dropped; 631 632 if (nce == NULL) 633 return (B_FALSE); 634 mutex_enter(&nce->nce_lock); 635 if (nce->nce_state == ND_PROBE) { 636 mutex_exit(&nce->nce_lock); 637 started = B_TRUE; 638 } else if (nce->nce_state == ND_REACHABLE) { 639 nce->nce_state = ND_PROBE; 640 nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT - 1; 641 mutex_exit(&nce->nce_lock); 642 dropped = nce_xmit(nce->nce_ill, ND_NEIGHBOR_SOLICIT, NULL, 643 B_FALSE, &ipv6_all_zeros, &nce->nce_addr, NDP_PROBE); 644 if (dropped) { 645 mutex_enter(&nce->nce_lock); 646 nce->nce_pcnt++; 647 mutex_exit(&nce->nce_lock); 648 } 649 NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(nce->nce_ill)); 650 started = B_TRUE; 651 } else { 652 mutex_exit(&nce->nce_lock); 653 started = B_FALSE; 654 } 655 return (started); 656 } 657 658 /* 659 * IPv6 Cache entry lookup. Try to find an nce matching the parameters passed. 660 * If one is found, the refcnt on the nce will be incremented. 661 */ 662 nce_t * 663 ndp_lookup_v6(ill_t *ill, const in6_addr_t *addr, boolean_t caller_holds_lock) 664 { 665 nce_t *nce; 666 ip_stack_t *ipst; 667 668 ASSERT(ill != NULL); 669 ipst = ill->ill_ipst; 670 671 ASSERT(ill != NULL && ill->ill_isv6); 672 if (!caller_holds_lock) { 673 mutex_enter(&ipst->ips_ndp6->ndp_g_lock); 674 } 675 676 /* Get head of v6 hash table */ 677 nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr)); 678 nce = nce_lookup_addr(ill, addr, nce); 679 if (nce == NULL) 680 nce = nce_lookup_mapping(ill, addr); 681 if (!caller_holds_lock) 682 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 683 return (nce); 684 } 685 /* 686 * IPv4 Cache entry lookup. Try to find an nce matching the parameters passed. 687 * If one is found, the refcnt on the nce will be incremented. 688 * Since multicast mappings are handled in arp, there are no nce_mcast_entries 689 * so we skip the nce_lookup_mapping call. 690 * XXX TODO: if the nce is found to be ND_STALE, ndp_delete it and return NULL 691 */ 692 nce_t * 693 ndp_lookup_v4(ill_t *ill, const in_addr_t *addr, boolean_t caller_holds_lock) 694 { 695 nce_t *nce; 696 in6_addr_t addr6; 697 ip_stack_t *ipst = ill->ill_ipst; 698 699 if (!caller_holds_lock) { 700 mutex_enter(&ipst->ips_ndp4->ndp_g_lock); 701 } 702 703 /* Get head of v4 hash table */ 704 nce = *((nce_t **)NCE_HASH_PTR_V4(ipst, *addr)); 705 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6); 706 nce = nce_lookup_addr(ill, &addr6, nce); 707 if (!caller_holds_lock) 708 mutex_exit(&ipst->ips_ndp4->ndp_g_lock); 709 return (nce); 710 } 711 712 /* 713 * Cache entry lookup. Try to find an nce matching the parameters passed. 714 * Look only for exact entries (no mappings). If an nce is found, increment 715 * the hold count on that nce. The caller passes in the start of the 716 * appropriate hash table, and must be holding the appropriate global 717 * lock (ndp_g_lock). 718 */ 719 static nce_t * 720 nce_lookup_addr(ill_t *ill, const in6_addr_t *addr, nce_t *nce) 721 { 722 ndp_g_t *ndp; 723 ip_stack_t *ipst = ill->ill_ipst; 724 725 if (ill->ill_isv6) 726 ndp = ipst->ips_ndp6; 727 else 728 ndp = ipst->ips_ndp4; 729 730 ASSERT(ill != NULL); 731 ASSERT(MUTEX_HELD(&ndp->ndp_g_lock)); 732 if (IN6_IS_ADDR_UNSPECIFIED(addr)) 733 return (NULL); 734 for (; nce != NULL; nce = nce->nce_next) { 735 if (nce->nce_ill == ill) { 736 if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr) && 737 IN6_ARE_ADDR_EQUAL(&nce->nce_mask, 738 &ipv6_all_ones)) { 739 mutex_enter(&nce->nce_lock); 740 if (!(nce->nce_flags & NCE_F_CONDEMNED)) { 741 NCE_REFHOLD_LOCKED(nce); 742 mutex_exit(&nce->nce_lock); 743 break; 744 } 745 mutex_exit(&nce->nce_lock); 746 } 747 } 748 } 749 return (nce); 750 } 751 752 /* 753 * Cache entry lookup. Try to find an nce matching the parameters passed. 754 * Look only for mappings. 755 */ 756 static nce_t * 757 nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr) 758 { 759 nce_t *nce; 760 ip_stack_t *ipst = ill->ill_ipst; 761 762 ASSERT(ill != NULL && ill->ill_isv6); 763 ASSERT(MUTEX_HELD(&ipst->ips_ndp6->ndp_g_lock)); 764 if (!IN6_IS_ADDR_MULTICAST(addr)) 765 return (NULL); 766 nce = ipst->ips_ndp6->nce_mask_entries; 767 for (; nce != NULL; nce = nce->nce_next) 768 if (nce->nce_ill == ill && 769 (V6_MASK_EQ(*addr, nce->nce_mask, nce->nce_addr))) { 770 mutex_enter(&nce->nce_lock); 771 if (!(nce->nce_flags & NCE_F_CONDEMNED)) { 772 NCE_REFHOLD_LOCKED(nce); 773 mutex_exit(&nce->nce_lock); 774 break; 775 } 776 mutex_exit(&nce->nce_lock); 777 } 778 return (nce); 779 } 780 781 /* 782 * Process passed in parameters either from an incoming packet or via 783 * user ioctl. 784 */ 785 void 786 ndp_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv) 787 { 788 ill_t *ill = nce->nce_ill; 789 uint32_t hw_addr_len = ill->ill_nd_lla_len; 790 mblk_t *mp; 791 boolean_t ll_updated = B_FALSE; 792 boolean_t ll_changed; 793 ip_stack_t *ipst = ill->ill_ipst; 794 795 ASSERT(nce->nce_ipversion == IPV6_VERSION); 796 /* 797 * No updates of link layer address or the neighbor state is 798 * allowed, when the cache is in NONUD state. This still 799 * allows for responding to reachability solicitation. 800 */ 801 mutex_enter(&nce->nce_lock); 802 if (nce->nce_state == ND_INCOMPLETE) { 803 if (hw_addr == NULL) { 804 mutex_exit(&nce->nce_lock); 805 return; 806 } 807 nce_set_ll(nce, hw_addr); 808 /* 809 * Update nce state and send the queued packets 810 * back to ip this time ire will be added. 811 */ 812 if (flag & ND_NA_FLAG_SOLICITED) { 813 nce_update(nce, ND_REACHABLE, NULL); 814 } else { 815 nce_update(nce, ND_STALE, NULL); 816 } 817 mutex_exit(&nce->nce_lock); 818 nce_fastpath(nce); 819 mutex_enter(&nce->nce_lock); 820 mp = nce->nce_qd_mp; 821 nce->nce_qd_mp = NULL; 822 mutex_exit(&nce->nce_lock); 823 while (mp != NULL) { 824 mblk_t *nxt_mp, *data_mp; 825 826 nxt_mp = mp->b_next; 827 mp->b_next = NULL; 828 829 if (mp->b_datap->db_type == M_CTL) 830 data_mp = mp->b_cont; 831 else 832 data_mp = mp; 833 if (data_mp->b_prev != NULL) { 834 ill_t *inbound_ill; 835 queue_t *fwdq = NULL; 836 uint_t ifindex; 837 838 ifindex = (uint_t)(uintptr_t)data_mp->b_prev; 839 inbound_ill = ill_lookup_on_ifindex(ifindex, 840 B_TRUE, NULL, NULL, NULL, NULL, ipst); 841 if (inbound_ill == NULL) { 842 data_mp->b_prev = NULL; 843 freemsg(mp); 844 return; 845 } else { 846 fwdq = inbound_ill->ill_rq; 847 } 848 data_mp->b_prev = NULL; 849 /* 850 * Send a forwarded packet back into ip_rput_v6 851 * just as in ire_send_v6(). 852 * Extract the queue from b_prev (set in 853 * ip_rput_data_v6). 854 */ 855 if (fwdq != NULL) { 856 /* 857 * Forwarded packets hop count will 858 * get decremented in ip_rput_data_v6 859 */ 860 if (data_mp != mp) 861 freeb(mp); 862 put(fwdq, data_mp); 863 } else { 864 /* 865 * Send locally originated packets back 866 * into * ip_wput_v6. 867 */ 868 put(ill->ill_wq, mp); 869 } 870 ill_refrele(inbound_ill); 871 } else { 872 put(ill->ill_wq, mp); 873 } 874 mp = nxt_mp; 875 } 876 return; 877 } 878 ll_changed = nce_cmp_ll_addr(nce, hw_addr, hw_addr_len); 879 if (!is_adv) { 880 /* If this is a SOLICITATION request only */ 881 if (ll_changed) 882 nce_update(nce, ND_STALE, hw_addr); 883 mutex_exit(&nce->nce_lock); 884 return; 885 } 886 if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) { 887 /* If in any other state than REACHABLE, ignore */ 888 if (nce->nce_state == ND_REACHABLE) { 889 nce_update(nce, ND_STALE, NULL); 890 } 891 mutex_exit(&nce->nce_lock); 892 return; 893 } else { 894 if (ll_changed) { 895 nce_update(nce, ND_UNCHANGED, hw_addr); 896 ll_updated = B_TRUE; 897 } 898 if (flag & ND_NA_FLAG_SOLICITED) { 899 nce_update(nce, ND_REACHABLE, NULL); 900 } else { 901 if (ll_updated) { 902 nce_update(nce, ND_STALE, NULL); 903 } 904 } 905 mutex_exit(&nce->nce_lock); 906 if (!(flag & ND_NA_FLAG_ROUTER) && (nce->nce_flags & 907 NCE_F_ISROUTER)) { 908 ire_t *ire; 909 910 /* 911 * Router turned to host. We need to remove the 912 * entry as well as any default route that may be 913 * using this as a next hop. This is required by 914 * section 7.2.5 of RFC 2461. 915 */ 916 ire = ire_ftable_lookup_v6(&ipv6_all_zeros, 917 &ipv6_all_zeros, &nce->nce_addr, IRE_DEFAULT, 918 nce->nce_ill->ill_ipif, NULL, ALL_ZONES, 0, NULL, 919 MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW | 920 MATCH_IRE_DEFAULT, ipst); 921 if (ire != NULL) { 922 ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst); 923 ire_delete(ire); 924 ire_refrele(ire); 925 } 926 ndp_delete(nce); 927 } 928 } 929 } 930 931 /* 932 * Pass arg1 to the pfi supplied, along with each nce in existence. 933 * ndp_walk() places a REFHOLD on the nce and drops the lock when 934 * walking the hash list. 935 */ 936 void 937 ndp_walk_common(ndp_g_t *ndp, ill_t *ill, pfi_t pfi, void *arg1, 938 boolean_t trace) 939 { 940 941 nce_t *nce; 942 nce_t *nce1; 943 nce_t **ncep; 944 nce_t *free_nce_list = NULL; 945 946 mutex_enter(&ndp->ndp_g_lock); 947 /* Prevent ndp_delete from unlink and free of NCE */ 948 ndp->ndp_g_walker++; 949 mutex_exit(&ndp->ndp_g_lock); 950 for (ncep = ndp->nce_hash_tbl; 951 ncep < A_END(ndp->nce_hash_tbl); ncep++) { 952 for (nce = *ncep; nce != NULL; nce = nce1) { 953 nce1 = nce->nce_next; 954 if (ill == NULL || nce->nce_ill == ill) { 955 if (trace) { 956 NCE_REFHOLD(nce); 957 (*pfi)(nce, arg1); 958 NCE_REFRELE(nce); 959 } else { 960 NCE_REFHOLD_NOTR(nce); 961 (*pfi)(nce, arg1); 962 NCE_REFRELE_NOTR(nce); 963 } 964 } 965 } 966 } 967 for (nce = ndp->nce_mask_entries; nce != NULL; nce = nce1) { 968 nce1 = nce->nce_next; 969 if (ill == NULL || nce->nce_ill == ill) { 970 if (trace) { 971 NCE_REFHOLD(nce); 972 (*pfi)(nce, arg1); 973 NCE_REFRELE(nce); 974 } else { 975 NCE_REFHOLD_NOTR(nce); 976 (*pfi)(nce, arg1); 977 NCE_REFRELE_NOTR(nce); 978 } 979 } 980 } 981 mutex_enter(&ndp->ndp_g_lock); 982 ndp->ndp_g_walker--; 983 /* 984 * While NCE's are removed from global list they are placed 985 * in a private list, to be passed to nce_ire_delete_list(). 986 * The reason is, there may be ires pointing to this nce 987 * which needs to cleaned up. 988 */ 989 if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) { 990 /* Time to delete condemned entries */ 991 for (ncep = ndp->nce_hash_tbl; 992 ncep < A_END(ndp->nce_hash_tbl); ncep++) { 993 nce = *ncep; 994 if (nce != NULL) { 995 nce_remove(ndp, nce, &free_nce_list); 996 } 997 } 998 nce = ndp->nce_mask_entries; 999 if (nce != NULL) { 1000 nce_remove(ndp, nce, &free_nce_list); 1001 } 1002 ndp->ndp_g_walker_cleanup = B_FALSE; 1003 } 1004 1005 mutex_exit(&ndp->ndp_g_lock); 1006 1007 if (free_nce_list != NULL) { 1008 nce_ire_delete_list(free_nce_list); 1009 } 1010 } 1011 1012 /* 1013 * Walk everything. 1014 * Note that ill can be NULL hence can't derive the ipst from it. 1015 */ 1016 void 1017 ndp_walk(ill_t *ill, pfi_t pfi, void *arg1, ip_stack_t *ipst) 1018 { 1019 ndp_walk_common(ipst->ips_ndp4, ill, pfi, arg1, B_TRUE); 1020 ndp_walk_common(ipst->ips_ndp6, ill, pfi, arg1, B_TRUE); 1021 } 1022 1023 /* 1024 * Process resolve requests. Handles both mapped entries 1025 * as well as cases that needs to be send out on the wire. 1026 * Lookup a NCE for a given IRE. Regardless of whether one exists 1027 * or one is created, we defer making ire point to nce until the 1028 * ire is actually added at which point the nce_refcnt on the nce is 1029 * incremented. This is done primarily to have symmetry between ire_add() 1030 * and ire_delete() which decrements the nce_refcnt, when an ire is deleted. 1031 */ 1032 int 1033 ndp_resolver(ill_t *ill, const in6_addr_t *dst, mblk_t *mp, zoneid_t zoneid) 1034 { 1035 nce_t *nce; 1036 int err = 0; 1037 uint32_t ms; 1038 mblk_t *mp_nce = NULL; 1039 ip_stack_t *ipst = ill->ill_ipst; 1040 1041 ASSERT(ill->ill_isv6); 1042 if (IN6_IS_ADDR_MULTICAST(dst)) { 1043 err = nce_set_multicast(ill, dst); 1044 return (err); 1045 } 1046 err = ndp_lookup_then_add_v6(ill, 1047 NULL, /* No hardware address */ 1048 dst, 1049 &ipv6_all_ones, 1050 &ipv6_all_zeros, 1051 0, 1052 (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0, 1053 ND_INCOMPLETE, 1054 &nce); 1055 1056 switch (err) { 1057 case 0: 1058 /* 1059 * New cache entry was created. Make sure that the state 1060 * is not ND_INCOMPLETE. It can be in some other state 1061 * even before we send out the solicitation as we could 1062 * get un-solicited advertisements. 1063 * 1064 * If this is an XRESOLV interface, simply return 0, 1065 * since we don't want to solicit just yet. 1066 */ 1067 if (ill->ill_flags & ILLF_XRESOLV) { 1068 NCE_REFRELE(nce); 1069 return (0); 1070 } 1071 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 1072 mutex_enter(&nce->nce_lock); 1073 if (nce->nce_state != ND_INCOMPLETE) { 1074 mutex_exit(&nce->nce_lock); 1075 rw_exit(&ipst->ips_ill_g_lock); 1076 NCE_REFRELE(nce); 1077 return (0); 1078 } 1079 mp_nce = ip_prepend_zoneid(mp, zoneid, ipst); 1080 if (mp_nce == NULL) { 1081 /* The caller will free mp */ 1082 mutex_exit(&nce->nce_lock); 1083 rw_exit(&ipst->ips_ill_g_lock); 1084 ndp_delete(nce); 1085 NCE_REFRELE(nce); 1086 return (ENOMEM); 1087 } 1088 ms = nce_solicit(nce, mp_nce); 1089 rw_exit(&ipst->ips_ill_g_lock); 1090 if (ms == 0) { 1091 /* The caller will free mp */ 1092 if (mp_nce != mp) 1093 freeb(mp_nce); 1094 mutex_exit(&nce->nce_lock); 1095 ndp_delete(nce); 1096 NCE_REFRELE(nce); 1097 return (EBUSY); 1098 } 1099 mutex_exit(&nce->nce_lock); 1100 NDP_RESTART_TIMER(nce, (clock_t)ms); 1101 NCE_REFRELE(nce); 1102 return (EINPROGRESS); 1103 case EEXIST: 1104 /* Resolution in progress just queue the packet */ 1105 mutex_enter(&nce->nce_lock); 1106 if (nce->nce_state == ND_INCOMPLETE) { 1107 mp_nce = ip_prepend_zoneid(mp, zoneid, ipst); 1108 if (mp_nce == NULL) { 1109 err = ENOMEM; 1110 } else { 1111 nce_queue_mp(nce, mp_nce); 1112 err = EINPROGRESS; 1113 } 1114 } else { 1115 /* 1116 * Any other state implies we have 1117 * a nce but IRE needs to be added ... 1118 * ire_add_v6() will take care of the 1119 * the case when the nce becomes CONDEMNED 1120 * before the ire is added to the table. 1121 */ 1122 err = 0; 1123 } 1124 mutex_exit(&nce->nce_lock); 1125 NCE_REFRELE(nce); 1126 break; 1127 default: 1128 ip1dbg(("ndp_resolver: Can't create NCE %d\n", err)); 1129 break; 1130 } 1131 return (err); 1132 } 1133 1134 /* 1135 * When there is no resolver, the link layer template is passed in 1136 * the IRE. 1137 * Lookup a NCE for a given IRE. Regardless of whether one exists 1138 * or one is created, we defer making ire point to nce until the 1139 * ire is actually added at which point the nce_refcnt on the nce is 1140 * incremented. This is done primarily to have symmetry between ire_add() 1141 * and ire_delete() which decrements the nce_refcnt, when an ire is deleted. 1142 */ 1143 int 1144 ndp_noresolver(ill_t *ill, const in6_addr_t *dst) 1145 { 1146 nce_t *nce; 1147 int err = 0; 1148 1149 ASSERT(ill != NULL); 1150 ASSERT(ill->ill_isv6); 1151 if (IN6_IS_ADDR_MULTICAST(dst)) { 1152 err = nce_set_multicast(ill, dst); 1153 return (err); 1154 } 1155 1156 err = ndp_lookup_then_add_v6(ill, 1157 NULL, /* hardware address */ 1158 dst, 1159 &ipv6_all_ones, 1160 &ipv6_all_zeros, 1161 0, 1162 (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0, 1163 ND_REACHABLE, 1164 &nce); 1165 1166 switch (err) { 1167 case 0: 1168 /* 1169 * Cache entry with a proper resolver cookie was 1170 * created. 1171 */ 1172 NCE_REFRELE(nce); 1173 break; 1174 case EEXIST: 1175 err = 0; 1176 NCE_REFRELE(nce); 1177 break; 1178 default: 1179 ip1dbg(("ndp_noresolver: Can't create NCE %d\n", err)); 1180 break; 1181 } 1182 return (err); 1183 } 1184 1185 /* 1186 * For each interface an entry is added for the unspecified multicast group. 1187 * Here that mapping is used to form the multicast cache entry for a particular 1188 * multicast destination. 1189 */ 1190 static int 1191 nce_set_multicast(ill_t *ill, const in6_addr_t *dst) 1192 { 1193 nce_t *mnce; /* Multicast mapping entry */ 1194 nce_t *nce; 1195 uchar_t *hw_addr = NULL; 1196 int err = 0; 1197 ip_stack_t *ipst = ill->ill_ipst; 1198 1199 ASSERT(ill != NULL); 1200 ASSERT(ill->ill_isv6); 1201 ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst))); 1202 1203 mutex_enter(&ipst->ips_ndp6->ndp_g_lock); 1204 nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *dst)); 1205 nce = nce_lookup_addr(ill, dst, nce); 1206 if (nce != NULL) { 1207 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 1208 NCE_REFRELE(nce); 1209 return (0); 1210 } 1211 /* No entry, now lookup for a mapping this should never fail */ 1212 mnce = nce_lookup_mapping(ill, dst); 1213 if (mnce == NULL) { 1214 /* Something broken for the interface. */ 1215 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 1216 return (ESRCH); 1217 } 1218 ASSERT(mnce->nce_flags & NCE_F_MAPPING); 1219 if (ill->ill_net_type == IRE_IF_RESOLVER) { 1220 /* 1221 * For IRE_IF_RESOLVER a hardware mapping can be 1222 * generated, for IRE_IF_NORESOLVER, resolution cookie 1223 * in the ill is copied in ndp_add_v6(). 1224 */ 1225 hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP); 1226 if (hw_addr == NULL) { 1227 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 1228 NCE_REFRELE(mnce); 1229 return (ENOMEM); 1230 } 1231 nce_make_mapping(mnce, hw_addr, (uchar_t *)dst); 1232 } 1233 NCE_REFRELE(mnce); 1234 /* 1235 * IRE_IF_NORESOLVER type simply copies the resolution 1236 * cookie passed in. So no hw_addr is needed. 1237 */ 1238 err = ndp_add_v6(ill, 1239 hw_addr, 1240 dst, 1241 &ipv6_all_ones, 1242 &ipv6_all_zeros, 1243 0, 1244 NCE_F_NONUD, 1245 ND_REACHABLE, 1246 &nce); 1247 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 1248 if (hw_addr != NULL) 1249 kmem_free(hw_addr, ill->ill_nd_lla_len); 1250 if (err != 0) { 1251 ip1dbg(("nce_set_multicast: create failed" "%d\n", err)); 1252 return (err); 1253 } 1254 NCE_REFRELE(nce); 1255 return (0); 1256 } 1257 1258 /* 1259 * Return the link layer address, and any flags of a nce. 1260 */ 1261 int 1262 ndp_query(ill_t *ill, struct lif_nd_req *lnr) 1263 { 1264 nce_t *nce; 1265 in6_addr_t *addr; 1266 sin6_t *sin6; 1267 dl_unitdata_req_t *dl; 1268 1269 ASSERT(ill != NULL && ill->ill_isv6); 1270 sin6 = (sin6_t *)&lnr->lnr_addr; 1271 addr = &sin6->sin6_addr; 1272 1273 nce = ndp_lookup_v6(ill, addr, B_FALSE); 1274 if (nce == NULL) 1275 return (ESRCH); 1276 /* If in INCOMPLETE state, no link layer address is available yet */ 1277 if (nce->nce_state == ND_INCOMPLETE) 1278 goto done; 1279 dl = (dl_unitdata_req_t *)nce->nce_res_mp->b_rptr; 1280 if (ill->ill_flags & ILLF_XRESOLV) 1281 lnr->lnr_hdw_len = dl->dl_dest_addr_length; 1282 else 1283 lnr->lnr_hdw_len = ill->ill_nd_lla_len; 1284 ASSERT(NCE_LL_ADDR_OFFSET(ill) + lnr->lnr_hdw_len <= 1285 sizeof (lnr->lnr_hdw_addr)); 1286 bcopy(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill), 1287 (uchar_t *)&lnr->lnr_hdw_addr, lnr->lnr_hdw_len); 1288 if (nce->nce_flags & NCE_F_ISROUTER) 1289 lnr->lnr_flags = NDF_ISROUTER_ON; 1290 if (nce->nce_flags & NCE_F_PROXY) 1291 lnr->lnr_flags |= NDF_PROXY_ON; 1292 if (nce->nce_flags & NCE_F_ANYCAST) 1293 lnr->lnr_flags |= NDF_ANYCAST_ON; 1294 done: 1295 NCE_REFRELE(nce); 1296 return (0); 1297 } 1298 1299 /* 1300 * Send Enable/Disable multicast reqs to driver. 1301 */ 1302 int 1303 ndp_mcastreq(ill_t *ill, const in6_addr_t *addr, uint32_t hw_addr_len, 1304 uint32_t hw_addr_offset, mblk_t *mp) 1305 { 1306 nce_t *nce; 1307 uchar_t *hw_addr; 1308 ip_stack_t *ipst = ill->ill_ipst; 1309 1310 ASSERT(ill != NULL && ill->ill_isv6); 1311 ASSERT(ill->ill_net_type == IRE_IF_RESOLVER); 1312 hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len); 1313 if (hw_addr == NULL || !IN6_IS_ADDR_MULTICAST(addr)) { 1314 freemsg(mp); 1315 return (EINVAL); 1316 } 1317 mutex_enter(&ipst->ips_ndp6->ndp_g_lock); 1318 nce = nce_lookup_mapping(ill, addr); 1319 if (nce == NULL) { 1320 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 1321 freemsg(mp); 1322 return (ESRCH); 1323 } 1324 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 1325 /* 1326 * Update dl_addr_length and dl_addr_offset for primitives that 1327 * have physical addresses as opposed to full saps 1328 */ 1329 switch (((union DL_primitives *)mp->b_rptr)->dl_primitive) { 1330 case DL_ENABMULTI_REQ: 1331 /* Track the state if this is the first enabmulti */ 1332 if (ill->ill_dlpi_multicast_state == IDS_UNKNOWN) 1333 ill->ill_dlpi_multicast_state = IDS_INPROGRESS; 1334 ip1dbg(("ndp_mcastreq: ENABMULTI\n")); 1335 break; 1336 case DL_DISABMULTI_REQ: 1337 ip1dbg(("ndp_mcastreq: DISABMULTI\n")); 1338 break; 1339 default: 1340 NCE_REFRELE(nce); 1341 ip1dbg(("ndp_mcastreq: default\n")); 1342 return (EINVAL); 1343 } 1344 nce_make_mapping(nce, hw_addr, (uchar_t *)addr); 1345 NCE_REFRELE(nce); 1346 ill_dlpi_send(ill, mp); 1347 return (0); 1348 } 1349 1350 /* 1351 * Send a neighbor solicitation. 1352 * Returns number of milliseconds after which we should either rexmit or abort. 1353 * Return of zero means we should abort. 1354 * The caller holds the nce_lock to protect nce_qd_mp and nce_rcnt. 1355 * 1356 * NOTE: This routine drops nce_lock (and later reacquires it) when sending 1357 * the packet. 1358 * NOTE: This routine does not consume mp. 1359 */ 1360 uint32_t 1361 nce_solicit(nce_t *nce, mblk_t *mp) 1362 { 1363 ill_t *ill; 1364 ill_t *src_ill; 1365 ip6_t *ip6h; 1366 in6_addr_t src; 1367 in6_addr_t dst; 1368 ipif_t *ipif; 1369 ip6i_t *ip6i; 1370 boolean_t dropped = B_FALSE; 1371 ip_stack_t *ipst = nce->nce_ill->ill_ipst; 1372 1373 ASSERT(RW_READ_HELD(&ipst->ips_ill_g_lock)); 1374 ASSERT(MUTEX_HELD(&nce->nce_lock)); 1375 ill = nce->nce_ill; 1376 ASSERT(ill != NULL); 1377 1378 if (nce->nce_rcnt == 0) { 1379 return (0); 1380 } 1381 1382 if (mp == NULL) { 1383 ASSERT(nce->nce_qd_mp != NULL); 1384 mp = nce->nce_qd_mp; 1385 } else { 1386 nce_queue_mp(nce, mp); 1387 } 1388 1389 /* Handle ip_newroute_v6 giving us IPSEC packets */ 1390 if (mp->b_datap->db_type == M_CTL) 1391 mp = mp->b_cont; 1392 1393 ip6h = (ip6_t *)mp->b_rptr; 1394 if (ip6h->ip6_nxt == IPPROTO_RAW) { 1395 /* 1396 * This message should have been pulled up already in 1397 * ip_wput_v6. We can't do pullups here because the message 1398 * could be from the nce_qd_mp which could have b_next/b_prev 1399 * non-NULL. 1400 */ 1401 ip6i = (ip6i_t *)ip6h; 1402 ASSERT((mp->b_wptr - (uchar_t *)ip6i) >= 1403 sizeof (ip6i_t) + IPV6_HDR_LEN); 1404 ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t)); 1405 } 1406 src = ip6h->ip6_src; 1407 /* 1408 * If the src of outgoing packet is one of the assigned interface 1409 * addresses use it, otherwise we will pick the source address below. 1410 */ 1411 src_ill = ill; 1412 if (!IN6_IS_ADDR_UNSPECIFIED(&src)) { 1413 if (ill->ill_group != NULL) 1414 src_ill = ill->ill_group->illgrp_ill; 1415 for (; src_ill != NULL; src_ill = src_ill->ill_group_next) { 1416 for (ipif = src_ill->ill_ipif; ipif != NULL; 1417 ipif = ipif->ipif_next) { 1418 if (IN6_ARE_ADDR_EQUAL(&src, 1419 &ipif->ipif_v6lcl_addr)) { 1420 break; 1421 } 1422 } 1423 if (ipif != NULL) 1424 break; 1425 } 1426 /* 1427 * If no relevant ipif can be found, then it's not one of our 1428 * addresses. Reset to :: and let nce_xmit. If an ipif can be 1429 * found, but it's not yet done with DAD verification, then 1430 * just postpone this transmission until later. 1431 */ 1432 if (src_ill == NULL) 1433 src = ipv6_all_zeros; 1434 else if (!ipif->ipif_addr_ready) 1435 return (ill->ill_reachable_retrans_time); 1436 } 1437 dst = nce->nce_addr; 1438 /* 1439 * If source address is unspecified, nce_xmit will choose 1440 * one for us and initialize the hardware address also 1441 * appropriately. 1442 */ 1443 if (IN6_IS_ADDR_UNSPECIFIED(&src)) 1444 src_ill = NULL; 1445 nce->nce_rcnt--; 1446 mutex_exit(&nce->nce_lock); 1447 rw_exit(&ipst->ips_ill_g_lock); 1448 dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, src_ill, B_TRUE, &src, 1449 &dst, 0); 1450 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 1451 mutex_enter(&nce->nce_lock); 1452 if (dropped) 1453 nce->nce_rcnt++; 1454 return (ill->ill_reachable_retrans_time); 1455 } 1456 1457 /* 1458 * Attempt to recover an address on an interface that's been marked as a 1459 * duplicate. Because NCEs are destroyed when the interface goes down, there's 1460 * no easy way to just probe the address and have the right thing happen if 1461 * it's no longer in use. Instead, we just bring it up normally and allow the 1462 * regular interface start-up logic to probe for a remaining duplicate and take 1463 * us back down if necessary. 1464 * Neither DHCP nor temporary addresses arrive here; they're excluded by 1465 * ip_ndp_excl. 1466 */ 1467 /* ARGSUSED */ 1468 static void 1469 ip_ndp_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) 1470 { 1471 ill_t *ill = rq->q_ptr; 1472 ipif_t *ipif; 1473 in6_addr_t *addr = (in6_addr_t *)mp->b_rptr; 1474 1475 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 1476 /* 1477 * We do not support recovery of proxy ARP'd interfaces, 1478 * because the system lacks a complete proxy ARP mechanism. 1479 */ 1480 if ((ipif->ipif_flags & IPIF_POINTOPOINT) || 1481 !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, addr)) { 1482 continue; 1483 } 1484 1485 /* 1486 * If we have already recovered or if the interface is going 1487 * away, then ignore. 1488 */ 1489 mutex_enter(&ill->ill_lock); 1490 if (!(ipif->ipif_flags & IPIF_DUPLICATE) || 1491 (ipif->ipif_flags & (IPIF_MOVING | IPIF_CONDEMNED))) { 1492 mutex_exit(&ill->ill_lock); 1493 continue; 1494 } 1495 1496 ipif->ipif_flags &= ~IPIF_DUPLICATE; 1497 ill->ill_ipif_dup_count--; 1498 mutex_exit(&ill->ill_lock); 1499 ipif->ipif_was_dup = B_TRUE; 1500 1501 if (ipif_ndp_up(ipif, addr) != EINPROGRESS) 1502 (void) ipif_up_done_v6(ipif); 1503 } 1504 freeb(mp); 1505 } 1506 1507 /* 1508 * Attempt to recover an IPv6 interface that's been shut down as a duplicate. 1509 * As long as someone else holds the address, the interface will stay down. 1510 * When that conflict goes away, the interface is brought back up. This is 1511 * done so that accidental shutdowns of addresses aren't made permanent. Your 1512 * server will recover from a failure. 1513 * 1514 * For DHCP and temporary addresses, recovery is not done in the kernel. 1515 * Instead, it's handled by user space processes (dhcpagent and in.ndpd). 1516 * 1517 * This function is entered on a timer expiry; the ID is in ipif_recovery_id. 1518 */ 1519 static void 1520 ipif6_dup_recovery(void *arg) 1521 { 1522 ipif_t *ipif = arg; 1523 1524 ipif->ipif_recovery_id = 0; 1525 if (!(ipif->ipif_flags & IPIF_DUPLICATE)) 1526 return; 1527 1528 /* 1529 * No lock, because this is just an optimization. 1530 */ 1531 if (ipif->ipif_state_flags & (IPIF_MOVING | IPIF_CONDEMNED)) 1532 return; 1533 1534 /* If the link is down, we'll retry this later */ 1535 if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING)) 1536 return; 1537 1538 ndp_do_recovery(ipif); 1539 } 1540 1541 /* 1542 * Perform interface recovery by forcing the duplicate interfaces up and 1543 * allowing the system to determine which ones should stay up. 1544 * 1545 * Called both by recovery timer expiry and link-up notification. 1546 */ 1547 void 1548 ndp_do_recovery(ipif_t *ipif) 1549 { 1550 ill_t *ill = ipif->ipif_ill; 1551 mblk_t *mp; 1552 ip_stack_t *ipst = ill->ill_ipst; 1553 1554 mp = allocb(sizeof (ipif->ipif_v6lcl_addr), BPRI_MED); 1555 if (mp == NULL) { 1556 mutex_enter(&ill->ill_lock); 1557 if (ipif->ipif_recovery_id == 0 && 1558 !(ipif->ipif_state_flags & (IPIF_MOVING | 1559 IPIF_CONDEMNED))) { 1560 ipif->ipif_recovery_id = timeout(ipif6_dup_recovery, 1561 ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery)); 1562 } 1563 mutex_exit(&ill->ill_lock); 1564 } else { 1565 bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr, 1566 sizeof (ipif->ipif_v6lcl_addr)); 1567 ill_refhold(ill); 1568 qwriter_ip(ill, ill->ill_rq, mp, ip_ndp_recover, NEW_OP, 1569 B_FALSE); 1570 } 1571 } 1572 1573 /* 1574 * Find the solicitation in the given message, and extract printable details 1575 * (MAC and IP addresses) from it. 1576 */ 1577 static nd_neighbor_solicit_t * 1578 ip_ndp_find_solicitation(mblk_t *mp, mblk_t *dl_mp, ill_t *ill, char *hbuf, 1579 size_t hlen, char *sbuf, size_t slen, uchar_t **haddr) 1580 { 1581 nd_neighbor_solicit_t *ns; 1582 ip6_t *ip6h; 1583 uchar_t *addr; 1584 int alen; 1585 1586 alen = 0; 1587 ip6h = (ip6_t *)mp->b_rptr; 1588 if (dl_mp == NULL) { 1589 nd_opt_hdr_t *opt; 1590 int nslen; 1591 1592 /* 1593 * If it's from the fast-path, then it can't be a probe 1594 * message, and thus must include the source linkaddr option. 1595 * Extract that here. 1596 */ 1597 ns = (nd_neighbor_solicit_t *)((char *)ip6h + IPV6_HDR_LEN); 1598 nslen = mp->b_wptr - (uchar_t *)ns; 1599 if ((nslen -= sizeof (*ns)) > 0) { 1600 opt = ndp_get_option((nd_opt_hdr_t *)(ns + 1), nslen, 1601 ND_OPT_SOURCE_LINKADDR); 1602 if (opt != NULL && 1603 opt->nd_opt_len * 8 - sizeof (*opt) >= 1604 ill->ill_nd_lla_len) { 1605 addr = (uchar_t *)(opt + 1); 1606 alen = ill->ill_nd_lla_len; 1607 } 1608 } 1609 /* 1610 * We cheat a bit here for the sake of printing usable log 1611 * messages in the rare case where the reply we got was unicast 1612 * without a source linkaddr option, and the interface is in 1613 * fastpath mode. (Sigh.) 1614 */ 1615 if (alen == 0 && ill->ill_type == IFT_ETHER && 1616 MBLKHEAD(mp) >= sizeof (struct ether_header)) { 1617 struct ether_header *pether; 1618 1619 pether = (struct ether_header *)((char *)ip6h - 1620 sizeof (*pether)); 1621 addr = pether->ether_shost.ether_addr_octet; 1622 alen = ETHERADDRL; 1623 } 1624 } else { 1625 dl_unitdata_ind_t *dlu; 1626 1627 dlu = (dl_unitdata_ind_t *)dl_mp->b_rptr; 1628 alen = dlu->dl_src_addr_length; 1629 if (alen > 0 && dlu->dl_src_addr_offset >= sizeof (*dlu) && 1630 dlu->dl_src_addr_offset + alen <= MBLKL(dl_mp)) { 1631 addr = dl_mp->b_rptr + dlu->dl_src_addr_offset; 1632 if (ill->ill_sap_length < 0) { 1633 alen += ill->ill_sap_length; 1634 } else { 1635 addr += ill->ill_sap_length; 1636 alen -= ill->ill_sap_length; 1637 } 1638 } 1639 } 1640 if (alen > 0) { 1641 *haddr = addr; 1642 (void) mac_colon_addr(addr, alen, hbuf, hlen); 1643 } else { 1644 *haddr = NULL; 1645 (void) strcpy(hbuf, "?"); 1646 } 1647 ns = (nd_neighbor_solicit_t *)((char *)ip6h + IPV6_HDR_LEN); 1648 (void) inet_ntop(AF_INET6, &ns->nd_ns_target, sbuf, slen); 1649 return (ns); 1650 } 1651 1652 /* 1653 * This is for exclusive changes due to NDP duplicate address detection 1654 * failure. 1655 */ 1656 /* ARGSUSED */ 1657 static void 1658 ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) 1659 { 1660 ill_t *ill = rq->q_ptr; 1661 ipif_t *ipif; 1662 char ibuf[LIFNAMSIZ + 10]; /* 10 digits for logical i/f number */ 1663 char hbuf[MAC_STR_LEN]; 1664 char sbuf[INET6_ADDRSTRLEN]; 1665 nd_neighbor_solicit_t *ns; 1666 mblk_t *dl_mp = NULL; 1667 uchar_t *haddr; 1668 ip_stack_t *ipst = ill->ill_ipst; 1669 1670 if (DB_TYPE(mp) != M_DATA) { 1671 dl_mp = mp; 1672 mp = mp->b_cont; 1673 } 1674 ns = ip_ndp_find_solicitation(mp, dl_mp, ill, hbuf, sizeof (hbuf), sbuf, 1675 sizeof (sbuf), &haddr); 1676 if (haddr != NULL && 1677 bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) { 1678 /* 1679 * Ignore conflicts generated by misbehaving switches that just 1680 * reflect our own messages back to us. 1681 */ 1682 goto ignore_conflict; 1683 } 1684 (void) strlcpy(ibuf, ill->ill_name, sizeof (ibuf)); 1685 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 1686 1687 if ((ipif->ipif_flags & IPIF_POINTOPOINT) || 1688 !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, 1689 &ns->nd_ns_target)) { 1690 continue; 1691 } 1692 1693 /* If it's already marked, then don't do anything. */ 1694 if (ipif->ipif_flags & IPIF_DUPLICATE) 1695 continue; 1696 1697 /* 1698 * If this is a failure during duplicate recovery, then don't 1699 * complain. It may take a long time to recover. 1700 */ 1701 if (!ipif->ipif_was_dup) { 1702 if (ipif->ipif_id != 0) { 1703 (void) snprintf(ibuf + ill->ill_name_length - 1, 1704 sizeof (ibuf) - ill->ill_name_length + 1, 1705 ":%d", ipif->ipif_id); 1706 } 1707 cmn_err(CE_WARN, "%s has duplicate address %s (in " 1708 "use by %s); disabled", ibuf, sbuf, hbuf); 1709 } 1710 mutex_enter(&ill->ill_lock); 1711 ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE)); 1712 ipif->ipif_flags |= IPIF_DUPLICATE; 1713 ill->ill_ipif_dup_count++; 1714 mutex_exit(&ill->ill_lock); 1715 (void) ipif_down(ipif, NULL, NULL); 1716 ipif_down_tail(ipif); 1717 mutex_enter(&ill->ill_lock); 1718 if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) && 1719 ill->ill_net_type == IRE_IF_RESOLVER && 1720 !(ipif->ipif_state_flags & (IPIF_MOVING | 1721 IPIF_CONDEMNED)) && 1722 ipst->ips_ip_dup_recovery > 0) { 1723 ipif->ipif_recovery_id = timeout(ipif6_dup_recovery, 1724 ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery)); 1725 } 1726 mutex_exit(&ill->ill_lock); 1727 } 1728 ignore_conflict: 1729 if (dl_mp != NULL) 1730 freeb(dl_mp); 1731 freemsg(mp); 1732 } 1733 1734 /* 1735 * Handle failure by tearing down the ipifs with the specified address. Note 1736 * that tearing down the ipif also means deleting the nce through ipif_down, so 1737 * it's not possible to do recovery by just restarting the nce timer. Instead, 1738 * we start a timer on the ipif. 1739 */ 1740 static void 1741 ip_ndp_failure(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce) 1742 { 1743 if ((mp = copymsg(mp)) != NULL) { 1744 if (dl_mp == NULL) 1745 dl_mp = mp; 1746 else if ((dl_mp = copyb(dl_mp)) != NULL) 1747 dl_mp->b_cont = mp; 1748 if (dl_mp == NULL) { 1749 freemsg(mp); 1750 } else { 1751 ill_refhold(ill); 1752 qwriter_ip(ill, ill->ill_rq, dl_mp, ip_ndp_excl, NEW_OP, 1753 B_FALSE); 1754 } 1755 } 1756 ndp_delete(nce); 1757 } 1758 1759 /* 1760 * Handle a discovered conflict: some other system is advertising that it owns 1761 * one of our IP addresses. We need to defend ourselves, or just shut down the 1762 * interface. 1763 */ 1764 static void 1765 ip_ndp_conflict(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce) 1766 { 1767 ipif_t *ipif; 1768 uint32_t now; 1769 uint_t maxdefense; 1770 uint_t defs; 1771 ip_stack_t *ipst = ill->ill_ipst; 1772 1773 ipif = ipif_lookup_addr_v6(&nce->nce_addr, ill, ALL_ZONES, NULL, NULL, 1774 NULL, NULL, ipst); 1775 if (ipif == NULL) 1776 return; 1777 /* 1778 * First, figure out if this address is disposable. 1779 */ 1780 if (ipif->ipif_flags & (IPIF_DHCPRUNNING | IPIF_TEMPORARY)) 1781 maxdefense = ipst->ips_ip_max_temp_defend; 1782 else 1783 maxdefense = ipst->ips_ip_max_defend; 1784 1785 /* 1786 * Now figure out how many times we've defended ourselves. Ignore 1787 * defenses that happened long in the past. 1788 */ 1789 now = gethrestime_sec(); 1790 mutex_enter(&nce->nce_lock); 1791 if ((defs = nce->nce_defense_count) > 0 && 1792 now - nce->nce_defense_time > ipst->ips_ip_defend_interval) { 1793 nce->nce_defense_count = defs = 0; 1794 } 1795 nce->nce_defense_count++; 1796 nce->nce_defense_time = now; 1797 mutex_exit(&nce->nce_lock); 1798 ipif_refrele(ipif); 1799 1800 /* 1801 * If we've defended ourselves too many times already, then give up and 1802 * tear down the interface(s) using this address. Otherwise, defend by 1803 * sending out an unsolicited Neighbor Advertisement. 1804 */ 1805 if (defs >= maxdefense) { 1806 ip_ndp_failure(ill, mp, dl_mp, nce); 1807 } else { 1808 char hbuf[MAC_STR_LEN]; 1809 char sbuf[INET6_ADDRSTRLEN]; 1810 uchar_t *haddr; 1811 1812 (void) ip_ndp_find_solicitation(mp, dl_mp, ill, hbuf, 1813 sizeof (hbuf), sbuf, sizeof (sbuf), &haddr); 1814 cmn_err(CE_WARN, "node %s is using our IP address %s on %s", 1815 hbuf, sbuf, ill->ill_name); 1816 (void) nce_xmit(ill, ND_NEIGHBOR_ADVERT, ill, B_FALSE, 1817 &nce->nce_addr, &ipv6_all_hosts_mcast, 1818 nce_advert_flags(nce)); 1819 } 1820 } 1821 1822 static void 1823 ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) 1824 { 1825 nd_neighbor_solicit_t *ns; 1826 uint32_t hlen = ill->ill_nd_lla_len; 1827 uchar_t *haddr = NULL; 1828 icmp6_t *icmp_nd; 1829 ip6_t *ip6h; 1830 nce_t *our_nce = NULL; 1831 in6_addr_t target; 1832 in6_addr_t src; 1833 int len; 1834 int flag = 0; 1835 nd_opt_hdr_t *opt = NULL; 1836 boolean_t bad_solicit = B_FALSE; 1837 mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib; 1838 1839 ip6h = (ip6_t *)mp->b_rptr; 1840 icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN); 1841 len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN; 1842 src = ip6h->ip6_src; 1843 ns = (nd_neighbor_solicit_t *)icmp_nd; 1844 target = ns->nd_ns_target; 1845 if (IN6_IS_ADDR_MULTICAST(&target)) { 1846 if (ip_debug > 2) { 1847 /* ip1dbg */ 1848 pr_addr_dbg("ndp_input_solicit: Target is" 1849 " multicast! %s\n", AF_INET6, &target); 1850 } 1851 bad_solicit = B_TRUE; 1852 goto done; 1853 } 1854 if (len > sizeof (nd_neighbor_solicit_t)) { 1855 /* Options present */ 1856 opt = (nd_opt_hdr_t *)&ns[1]; 1857 len -= sizeof (nd_neighbor_solicit_t); 1858 if (!ndp_verify_optlen(opt, len)) { 1859 ip1dbg(("ndp_input_solicit: Bad opt len\n")); 1860 bad_solicit = B_TRUE; 1861 goto done; 1862 } 1863 } 1864 if (IN6_IS_ADDR_UNSPECIFIED(&src)) { 1865 /* Check to see if this is a valid DAD solicitation */ 1866 if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) { 1867 if (ip_debug > 2) { 1868 /* ip1dbg */ 1869 pr_addr_dbg("ndp_input_solicit: IPv6 " 1870 "Destination is not solicited node " 1871 "multicast %s\n", AF_INET6, 1872 &ip6h->ip6_dst); 1873 } 1874 bad_solicit = B_TRUE; 1875 goto done; 1876 } 1877 } 1878 1879 our_nce = ndp_lookup_v6(ill, &target, B_FALSE); 1880 /* 1881 * If this is a valid Solicitation, a permanent 1882 * entry should exist in the cache 1883 */ 1884 if (our_nce == NULL || 1885 !(our_nce->nce_flags & NCE_F_PERMANENT)) { 1886 ip1dbg(("ndp_input_solicit: Wrong target in NS?!" 1887 "ifname=%s ", ill->ill_name)); 1888 if (ip_debug > 2) { 1889 /* ip1dbg */ 1890 pr_addr_dbg(" dst %s\n", AF_INET6, &target); 1891 } 1892 bad_solicit = B_TRUE; 1893 goto done; 1894 } 1895 1896 /* At this point we should have a verified NS per spec */ 1897 if (opt != NULL) { 1898 opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR); 1899 if (opt != NULL) { 1900 haddr = (uchar_t *)&opt[1]; 1901 if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) || 1902 hlen == 0) { 1903 ip1dbg(("ndp_input_advert: bad SLLA\n")); 1904 bad_solicit = B_TRUE; 1905 goto done; 1906 } 1907 } 1908 } 1909 1910 /* If sending directly to peer, set the unicast flag */ 1911 if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) 1912 flag |= NDP_UNICAST; 1913 1914 /* 1915 * Create/update the entry for the soliciting node. 1916 * or respond to outstanding queries, don't if 1917 * the source is unspecified address. 1918 */ 1919 if (!IN6_IS_ADDR_UNSPECIFIED(&src)) { 1920 int err; 1921 nce_t *nnce; 1922 1923 ASSERT(ill->ill_isv6); 1924 /* 1925 * Regular solicitations *must* include the Source Link-Layer 1926 * Address option. Ignore messages that do not. 1927 */ 1928 if (haddr == NULL && IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) { 1929 ip1dbg(("ndp_input_solicit: source link-layer address " 1930 "option missing with a specified source.\n")); 1931 bad_solicit = B_TRUE; 1932 goto done; 1933 } 1934 1935 /* 1936 * This is a regular solicitation. If we're still in the 1937 * process of verifying the address, then don't respond at all 1938 * and don't keep track of the sender. 1939 */ 1940 if (our_nce->nce_state == ND_PROBE) 1941 goto done; 1942 1943 /* 1944 * If the solicitation doesn't have sender hardware address 1945 * (legal for unicast solicitation), then process without 1946 * installing the return NCE. Either we already know it, or 1947 * we'll be forced to look it up when (and if) we reply to the 1948 * packet. 1949 */ 1950 if (haddr == NULL) 1951 goto no_source; 1952 1953 err = ndp_lookup_then_add_v6(ill, 1954 haddr, 1955 &src, /* Soliciting nodes address */ 1956 &ipv6_all_ones, 1957 &ipv6_all_zeros, 1958 0, 1959 0, 1960 ND_STALE, 1961 &nnce); 1962 switch (err) { 1963 case 0: 1964 /* done with this entry */ 1965 NCE_REFRELE(nnce); 1966 break; 1967 case EEXIST: 1968 /* 1969 * B_FALSE indicates this is not an 1970 * an advertisement. 1971 */ 1972 ndp_process(nnce, haddr, 0, B_FALSE); 1973 NCE_REFRELE(nnce); 1974 break; 1975 default: 1976 ip1dbg(("ndp_input_solicit: Can't create NCE %d\n", 1977 err)); 1978 goto done; 1979 } 1980 no_source: 1981 flag |= NDP_SOLICITED; 1982 } else { 1983 /* 1984 * No source link layer address option should be present in a 1985 * valid DAD request. 1986 */ 1987 if (haddr != NULL) { 1988 ip1dbg(("ndp_input_solicit: source link-layer address " 1989 "option present with an unspecified source.\n")); 1990 bad_solicit = B_TRUE; 1991 goto done; 1992 } 1993 if (our_nce->nce_state == ND_PROBE) { 1994 /* 1995 * Internally looped-back probes won't have DLPI 1996 * attached to them. External ones (which are sent by 1997 * multicast) always will. Just ignore our own 1998 * transmissions. 1999 */ 2000 if (dl_mp != NULL) { 2001 /* 2002 * If someone else is probing our address, then 2003 * we've crossed wires. Declare failure. 2004 */ 2005 ip_ndp_failure(ill, mp, dl_mp, our_nce); 2006 } 2007 goto done; 2008 } 2009 /* 2010 * This is a DAD probe. Multicast the advertisement to the 2011 * all-nodes address. 2012 */ 2013 src = ipv6_all_hosts_mcast; 2014 } 2015 flag |= nce_advert_flags(our_nce); 2016 /* Response to a solicitation */ 2017 (void) nce_xmit(ill, 2018 ND_NEIGHBOR_ADVERT, 2019 ill, /* ill to be used for extracting ill_nd_lla */ 2020 B_TRUE, /* use ill_nd_lla */ 2021 &target, /* Source and target of the advertisement pkt */ 2022 &src, /* IP Destination (source of original pkt) */ 2023 flag); 2024 done: 2025 if (bad_solicit) 2026 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations); 2027 if (our_nce != NULL) 2028 NCE_REFRELE(our_nce); 2029 } 2030 2031 void 2032 ndp_input_advert(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) 2033 { 2034 nd_neighbor_advert_t *na; 2035 uint32_t hlen = ill->ill_nd_lla_len; 2036 uchar_t *haddr = NULL; 2037 icmp6_t *icmp_nd; 2038 ip6_t *ip6h; 2039 nce_t *dst_nce = NULL; 2040 in6_addr_t target; 2041 nd_opt_hdr_t *opt = NULL; 2042 int len; 2043 mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib; 2044 ip_stack_t *ipst = ill->ill_ipst; 2045 2046 ip6h = (ip6_t *)mp->b_rptr; 2047 icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN); 2048 len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN; 2049 na = (nd_neighbor_advert_t *)icmp_nd; 2050 if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) && 2051 (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) { 2052 ip1dbg(("ndp_input_advert: Target is multicast but the " 2053 "solicited flag is not zero\n")); 2054 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements); 2055 return; 2056 } 2057 target = na->nd_na_target; 2058 if (IN6_IS_ADDR_MULTICAST(&target)) { 2059 ip1dbg(("ndp_input_advert: Target is multicast!\n")); 2060 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements); 2061 return; 2062 } 2063 if (len > sizeof (nd_neighbor_advert_t)) { 2064 opt = (nd_opt_hdr_t *)&na[1]; 2065 if (!ndp_verify_optlen(opt, 2066 len - sizeof (nd_neighbor_advert_t))) { 2067 ip1dbg(("ndp_input_advert: cannot verify SLLA\n")); 2068 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements); 2069 return; 2070 } 2071 /* At this point we have a verified NA per spec */ 2072 len -= sizeof (nd_neighbor_advert_t); 2073 opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR); 2074 if (opt != NULL) { 2075 haddr = (uchar_t *)&opt[1]; 2076 if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) || 2077 hlen == 0) { 2078 ip1dbg(("ndp_input_advert: bad SLLA\n")); 2079 BUMP_MIB(mib, 2080 ipv6IfIcmpInBadNeighborAdvertisements); 2081 return; 2082 } 2083 } 2084 } 2085 2086 /* 2087 * If this interface is part of the group look at all the 2088 * ills in the group. 2089 */ 2090 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 2091 if (ill->ill_group != NULL) 2092 ill = ill->ill_group->illgrp_ill; 2093 2094 for (; ill != NULL; ill = ill->ill_group_next) { 2095 mutex_enter(&ill->ill_lock); 2096 if (!ILL_CAN_LOOKUP(ill)) { 2097 mutex_exit(&ill->ill_lock); 2098 continue; 2099 } 2100 ill_refhold_locked(ill); 2101 mutex_exit(&ill->ill_lock); 2102 dst_nce = ndp_lookup_v6(ill, &target, B_FALSE); 2103 /* We have to drop the lock since ndp_process calls put* */ 2104 rw_exit(&ipst->ips_ill_g_lock); 2105 if (dst_nce != NULL) { 2106 if ((dst_nce->nce_flags & NCE_F_PERMANENT) && 2107 dst_nce->nce_state == ND_PROBE) { 2108 /* 2109 * Someone else sent an advertisement for an 2110 * address that we're trying to configure. 2111 * Tear it down. Note that dl_mp might be NULL 2112 * if we're getting a unicast reply. This 2113 * isn't typically done (multicast is the norm 2114 * in response to a probe), but ip_ndp_failure 2115 * will handle the dl_mp == NULL case as well. 2116 */ 2117 ip_ndp_failure(ill, mp, dl_mp, dst_nce); 2118 } else if (dst_nce->nce_flags & NCE_F_PERMANENT) { 2119 /* 2120 * Someone just announced one of our local 2121 * addresses. If it wasn't us, then this is a 2122 * conflict. Defend the address or shut it 2123 * down. 2124 */ 2125 if (dl_mp != NULL && 2126 (haddr == NULL || 2127 nce_cmp_ll_addr(dst_nce, haddr, 2128 ill->ill_nd_lla_len))) { 2129 ip_ndp_conflict(ill, mp, dl_mp, 2130 dst_nce); 2131 } 2132 } else { 2133 if (na->nd_na_flags_reserved & 2134 ND_NA_FLAG_ROUTER) { 2135 dst_nce->nce_flags |= NCE_F_ISROUTER; 2136 } 2137 /* B_TRUE indicates this an advertisement */ 2138 ndp_process(dst_nce, haddr, 2139 na->nd_na_flags_reserved, B_TRUE); 2140 } 2141 NCE_REFRELE(dst_nce); 2142 } 2143 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 2144 ill_refrele(ill); 2145 } 2146 rw_exit(&ipst->ips_ill_g_lock); 2147 } 2148 2149 /* 2150 * Process NDP neighbor solicitation/advertisement messages. 2151 * The checksum has already checked o.k before reaching here. 2152 */ 2153 void 2154 ndp_input(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) 2155 { 2156 icmp6_t *icmp_nd; 2157 ip6_t *ip6h; 2158 int len; 2159 mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib; 2160 2161 2162 if (!pullupmsg(mp, -1)) { 2163 ip1dbg(("ndp_input: pullupmsg failed\n")); 2164 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2165 goto done; 2166 } 2167 ip6h = (ip6_t *)mp->b_rptr; 2168 if (ip6h->ip6_hops != IPV6_MAX_HOPS) { 2169 ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n")); 2170 BUMP_MIB(mib, ipv6IfIcmpBadHoplimit); 2171 goto done; 2172 } 2173 /* 2174 * NDP does not accept any extension headers between the 2175 * IP header and the ICMP header since e.g. a routing 2176 * header could be dangerous. 2177 * This assumes that any AH or ESP headers are removed 2178 * by ip prior to passing the packet to ndp_input. 2179 */ 2180 if (ip6h->ip6_nxt != IPPROTO_ICMPV6) { 2181 ip1dbg(("ndp_input: Wrong next header 0x%x\n", 2182 ip6h->ip6_nxt)); 2183 BUMP_MIB(mib, ipv6IfIcmpInErrors); 2184 goto done; 2185 } 2186 icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN); 2187 ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT || 2188 icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT); 2189 if (icmp_nd->icmp6_code != 0) { 2190 ip1dbg(("ndp_input: icmp6 code != 0 \n")); 2191 BUMP_MIB(mib, ipv6IfIcmpInErrors); 2192 goto done; 2193 } 2194 len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN; 2195 /* 2196 * Make sure packet length is large enough for either 2197 * a NS or a NA icmp packet. 2198 */ 2199 if (len < sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) { 2200 ip1dbg(("ndp_input: packet too short\n")); 2201 BUMP_MIB(mib, ipv6IfIcmpInErrors); 2202 goto done; 2203 } 2204 if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) { 2205 ndp_input_solicit(ill, mp, dl_mp); 2206 } else { 2207 ndp_input_advert(ill, mp, dl_mp); 2208 } 2209 done: 2210 freemsg(mp); 2211 } 2212 2213 /* 2214 * nce_xmit is called to form and transmit a ND solicitation or 2215 * advertisement ICMP packet. 2216 * 2217 * If the source address is unspecified and this isn't a probe (used for 2218 * duplicate address detection), an appropriate source address and link layer 2219 * address will be chosen here. The link layer address option is included if 2220 * the source is specified (i.e., all non-probe packets), and omitted (per the 2221 * specification) otherwise. 2222 * 2223 * It returns B_FALSE only if it does a successful put() to the 2224 * corresponding ill's ill_wq otherwise returns B_TRUE. 2225 */ 2226 static boolean_t 2227 nce_xmit(ill_t *ill, uint32_t operation, ill_t *hwaddr_ill, 2228 boolean_t use_nd_lla, const in6_addr_t *sender, const in6_addr_t *target, 2229 int flag) 2230 { 2231 uint32_t len; 2232 icmp6_t *icmp6; 2233 mblk_t *mp; 2234 ip6_t *ip6h; 2235 nd_opt_hdr_t *opt; 2236 uint_t plen; 2237 ip6i_t *ip6i; 2238 ipif_t *src_ipif = NULL; 2239 uint8_t *hw_addr; 2240 zoneid_t zoneid = GLOBAL_ZONEID; 2241 2242 /* 2243 * If we have a unspecified source(sender) address, select a 2244 * proper source address for the solicitation here itself so 2245 * that we can initialize the h/w address correctly. This is 2246 * needed for interface groups as source address can come from 2247 * the whole group and the h/w address initialized from ill will 2248 * be wrong if the source address comes from a different ill. 2249 * 2250 * If the sender is specified then we use this address in order 2251 * to lookup the zoneid before calling ip_output_v6(). This is to 2252 * enable unicast ND_NEIGHBOR_ADVERT packets to be routed correctly 2253 * by IP (we cannot guarantee that the global zone has an interface 2254 * route to the destination). 2255 * 2256 * Note that the NA never comes here with the unspecified source 2257 * address. The following asserts that whenever the source 2258 * address is specified, the haddr also should be specified. 2259 */ 2260 ASSERT(IN6_IS_ADDR_UNSPECIFIED(sender) || (hwaddr_ill != NULL)); 2261 2262 if (IN6_IS_ADDR_UNSPECIFIED(sender) && !(flag & NDP_PROBE)) { 2263 ASSERT(operation != ND_NEIGHBOR_ADVERT); 2264 /* 2265 * Pick a source address for this solicitation, but 2266 * restrict the selection to addresses assigned to the 2267 * output interface (or interface group). We do this 2268 * because the destination will create a neighbor cache 2269 * entry for the source address of this packet, so the 2270 * source address had better be a valid neighbor. 2271 */ 2272 src_ipif = ipif_select_source_v6(ill, target, RESTRICT_TO_ILL, 2273 IPV6_PREFER_SRC_DEFAULT, ALL_ZONES); 2274 if (src_ipif == NULL) { 2275 char buf[INET6_ADDRSTRLEN]; 2276 2277 ip1dbg(("nce_xmit: No source ipif for dst %s\n", 2278 inet_ntop(AF_INET6, (char *)target, buf, 2279 sizeof (buf)))); 2280 return (B_TRUE); 2281 } 2282 sender = &src_ipif->ipif_v6src_addr; 2283 hwaddr_ill = src_ipif->ipif_ill; 2284 } else if (!(IN6_IS_ADDR_UNSPECIFIED(sender))) { 2285 zoneid = ipif_lookup_addr_zoneid_v6(sender, ill, ill->ill_ipst); 2286 /* 2287 * It's possible for ipif_lookup_addr_zoneid_v6() to return 2288 * ALL_ZONES if it cannot find a matching ipif for the address 2289 * we are trying to use. In this case we err on the side of 2290 * trying to send the packet by defaulting to the GLOBAL_ZONEID. 2291 */ 2292 if (zoneid == ALL_ZONES) 2293 zoneid = GLOBAL_ZONEID; 2294 } 2295 2296 /* 2297 * Always make sure that the NS/NA packets don't get load 2298 * spread. This is needed so that the probe packets sent 2299 * by the in.mpathd daemon can really go out on the desired 2300 * interface. Probe packets are made to go out on a desired 2301 * interface by including a ip6i with ATTACH_IF flag. As these 2302 * packets indirectly end up sending/receiving NS/NA packets 2303 * (neighbor doing NUD), we have to make sure that NA 2304 * also go out on the same interface. 2305 */ 2306 plen = (sizeof (nd_opt_hdr_t) + ill->ill_nd_lla_len + 7) / 8; 2307 len = IPV6_HDR_LEN + sizeof (ip6i_t) + sizeof (nd_neighbor_advert_t) + 2308 plen * 8; 2309 mp = allocb(len, BPRI_LO); 2310 if (mp == NULL) { 2311 if (src_ipif != NULL) 2312 ipif_refrele(src_ipif); 2313 return (B_TRUE); 2314 } 2315 bzero((char *)mp->b_rptr, len); 2316 mp->b_wptr = mp->b_rptr + len; 2317 2318 ip6i = (ip6i_t *)mp->b_rptr; 2319 ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW; 2320 ip6i->ip6i_nxt = IPPROTO_RAW; 2321 ip6i->ip6i_flags = IP6I_ATTACH_IF | IP6I_HOPLIMIT; 2322 if (flag & NDP_PROBE) 2323 ip6i->ip6i_flags |= IP6I_UNSPEC_SRC; 2324 ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex; 2325 2326 ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t)); 2327 ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW; 2328 ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t)); 2329 ip6h->ip6_nxt = IPPROTO_ICMPV6; 2330 ip6h->ip6_hops = IPV6_MAX_HOPS; 2331 ip6h->ip6_dst = *target; 2332 icmp6 = (icmp6_t *)&ip6h[1]; 2333 2334 opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN + 2335 sizeof (nd_neighbor_advert_t)); 2336 2337 if (operation == ND_NEIGHBOR_SOLICIT) { 2338 nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6; 2339 2340 if (!(flag & NDP_PROBE)) 2341 opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR; 2342 ip6h->ip6_src = *sender; 2343 ns->nd_ns_target = *target; 2344 if (!(flag & NDP_UNICAST)) { 2345 /* Form multicast address of the target */ 2346 ip6h->ip6_dst = ipv6_solicited_node_mcast; 2347 ip6h->ip6_dst.s6_addr32[3] |= 2348 ns->nd_ns_target.s6_addr32[3]; 2349 } 2350 } else { 2351 nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6; 2352 2353 ASSERT(!(flag & NDP_PROBE)); 2354 opt->nd_opt_type = ND_OPT_TARGET_LINKADDR; 2355 ip6h->ip6_src = *sender; 2356 na->nd_na_target = *sender; 2357 if (flag & NDP_ISROUTER) 2358 na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER; 2359 if (flag & NDP_SOLICITED) 2360 na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED; 2361 if (flag & NDP_ORIDE) 2362 na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE; 2363 } 2364 2365 hw_addr = NULL; 2366 if (!(flag & NDP_PROBE)) { 2367 hw_addr = use_nd_lla ? hwaddr_ill->ill_nd_lla : 2368 hwaddr_ill->ill_phys_addr; 2369 if (hw_addr != NULL) { 2370 /* Fill in link layer address and option len */ 2371 opt->nd_opt_len = (uint8_t)plen; 2372 bcopy(hw_addr, &opt[1], hwaddr_ill->ill_nd_lla_len); 2373 } 2374 } 2375 if (hw_addr == NULL) { 2376 /* If there's no link layer address option, then strip it. */ 2377 len -= plen * 8; 2378 mp->b_wptr = mp->b_rptr + len; 2379 ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t)); 2380 } 2381 2382 icmp6->icmp6_type = (uint8_t)operation; 2383 icmp6->icmp6_code = 0; 2384 /* 2385 * Prepare for checksum by putting icmp length in the icmp 2386 * checksum field. The checksum is calculated in ip_wput_v6. 2387 */ 2388 icmp6->icmp6_cksum = ip6h->ip6_plen; 2389 2390 if (src_ipif != NULL) 2391 ipif_refrele(src_ipif); 2392 2393 ip_output_v6((void *)(uintptr_t)zoneid, mp, ill->ill_wq, IP_WPUT); 2394 return (B_FALSE); 2395 } 2396 2397 /* 2398 * Make a link layer address (does not include the SAP) from an nce. 2399 * To form the link layer address, use the last four bytes of ipv6 2400 * address passed in and the fixed offset stored in nce. 2401 */ 2402 static void 2403 nce_make_mapping(nce_t *nce, uchar_t *addrpos, uchar_t *addr) 2404 { 2405 uchar_t *mask, *to; 2406 ill_t *ill = nce->nce_ill; 2407 int len; 2408 2409 if (ill->ill_net_type == IRE_IF_NORESOLVER) 2410 return; 2411 ASSERT(nce->nce_res_mp != NULL); 2412 ASSERT(ill->ill_net_type == IRE_IF_RESOLVER); 2413 ASSERT(nce->nce_flags & NCE_F_MAPPING); 2414 ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_extract_mask)); 2415 ASSERT(addr != NULL); 2416 bcopy(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill), 2417 addrpos, ill->ill_nd_lla_len); 2418 len = MIN((int)ill->ill_nd_lla_len - nce->nce_ll_extract_start, 2419 IPV6_ADDR_LEN); 2420 mask = (uchar_t *)&nce->nce_extract_mask; 2421 mask += (IPV6_ADDR_LEN - len); 2422 addr += (IPV6_ADDR_LEN - len); 2423 to = addrpos + nce->nce_ll_extract_start; 2424 while (len-- > 0) 2425 *to++ |= *mask++ & *addr++; 2426 } 2427 2428 /* 2429 * Pass a cache report back out via NDD. 2430 */ 2431 /* ARGSUSED */ 2432 int 2433 ndp_report(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr) 2434 { 2435 ip_stack_t *ipst; 2436 2437 if (CONN_Q(q)) 2438 ipst = CONNQ_TO_IPST(q); 2439 else 2440 ipst = ILLQ_TO_IPST(q); 2441 2442 (void) mi_mpprintf(mp, "ifname hardware addr flags" 2443 " proto addr/mask"); 2444 ndp_walk(NULL, (pfi_t)nce_report1, (uchar_t *)mp, ipst); 2445 return (0); 2446 } 2447 2448 /* 2449 * Add a single line to the NDP Cache Entry Report. 2450 */ 2451 static void 2452 nce_report1(nce_t *nce, uchar_t *mp_arg) 2453 { 2454 ill_t *ill = nce->nce_ill; 2455 char local_buf[INET6_ADDRSTRLEN]; 2456 uchar_t flags_buf[10]; 2457 uint32_t flags = nce->nce_flags; 2458 mblk_t *mp = (mblk_t *)mp_arg; 2459 uchar_t *h; 2460 uchar_t *m = flags_buf; 2461 in6_addr_t v6addr; 2462 uint64_t now; 2463 2464 /* 2465 * Lock the nce to protect nce_res_mp from being changed 2466 * if an external resolver address resolution completes 2467 * while nce_res_mp is being accessed here. 2468 * 2469 * Deal with all address formats, not just Ethernet-specific 2470 * In addition, make sure that the mblk has enough space 2471 * before writing to it. If is doesn't, allocate a new one. 2472 */ 2473 if (nce->nce_ipversion == IPV4_VERSION) { 2474 /* 2475 * Don't include v4 NCEs in NDP cache entry report. 2476 * But sanity check for lingering ND_INITIAL entries 2477 * when we do 'ndd -get /dev/ip ip_ndp_cache_report' 2478 */ 2479 if (nce->nce_state == ND_INITIAL) { 2480 2481 now = TICK_TO_MSEC(lbolt64); 2482 if (now - nce->nce_init_time > NCE_STUCK_TIMEOUT) { 2483 DTRACE_PROBE1(nce__stuck, nce_t *, nce); 2484 } 2485 } 2486 return; 2487 } 2488 2489 ASSERT(ill != NULL); 2490 v6addr = nce->nce_mask; 2491 if (flags & NCE_F_PERMANENT) 2492 *m++ = 'P'; 2493 if (flags & NCE_F_ISROUTER) 2494 *m++ = 'R'; 2495 if (flags & NCE_F_MAPPING) 2496 *m++ = 'M'; 2497 *m = '\0'; 2498 2499 if (ill->ill_net_type == IRE_IF_RESOLVER) { 2500 size_t addrlen; 2501 char *addr_buf; 2502 dl_unitdata_req_t *dl; 2503 2504 mutex_enter(&nce->nce_lock); 2505 h = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill); 2506 dl = (dl_unitdata_req_t *)nce->nce_res_mp->b_rptr; 2507 if (ill->ill_flags & ILLF_XRESOLV) 2508 addrlen = (3 * (dl->dl_dest_addr_length)); 2509 else 2510 addrlen = (3 * (ill->ill_nd_lla_len)); 2511 if (addrlen <= 0) { 2512 mutex_exit(&nce->nce_lock); 2513 (void) mi_mpprintf(mp, 2514 "%8s %9s %5s %s/%d", 2515 ill->ill_name, 2516 "None", 2517 (uchar_t *)&flags_buf, 2518 inet_ntop(AF_INET6, (char *)&nce->nce_addr, 2519 (char *)local_buf, sizeof (local_buf)), 2520 ip_mask_to_plen_v6(&v6addr)); 2521 } else { 2522 /* 2523 * Convert the hardware/lla address to ascii 2524 */ 2525 addr_buf = kmem_zalloc(addrlen, KM_NOSLEEP); 2526 if (addr_buf == NULL) { 2527 mutex_exit(&nce->nce_lock); 2528 return; 2529 } 2530 (void) mac_colon_addr((uint8_t *)h, 2531 (ill->ill_flags & ILLF_XRESOLV) ? 2532 dl->dl_dest_addr_length : ill->ill_nd_lla_len, 2533 addr_buf, addrlen); 2534 mutex_exit(&nce->nce_lock); 2535 (void) mi_mpprintf(mp, "%8s %17s %5s %s/%d", 2536 ill->ill_name, addr_buf, (uchar_t *)&flags_buf, 2537 inet_ntop(AF_INET6, (char *)&nce->nce_addr, 2538 (char *)local_buf, sizeof (local_buf)), 2539 ip_mask_to_plen_v6(&v6addr)); 2540 kmem_free(addr_buf, addrlen); 2541 } 2542 } else { 2543 (void) mi_mpprintf(mp, 2544 "%8s %9s %5s %s/%d", 2545 ill->ill_name, 2546 "None", 2547 (uchar_t *)&flags_buf, 2548 inet_ntop(AF_INET6, (char *)&nce->nce_addr, 2549 (char *)local_buf, sizeof (local_buf)), 2550 ip_mask_to_plen_v6(&v6addr)); 2551 } 2552 } 2553 2554 mblk_t * 2555 nce_udreq_alloc(ill_t *ill) 2556 { 2557 mblk_t *template_mp = NULL; 2558 dl_unitdata_req_t *dlur; 2559 int sap_length; 2560 2561 ASSERT(ill->ill_isv6); 2562 2563 sap_length = ill->ill_sap_length; 2564 template_mp = ip_dlpi_alloc(sizeof (dl_unitdata_req_t) + 2565 ill->ill_nd_lla_len + ABS(sap_length), DL_UNITDATA_REQ); 2566 if (template_mp == NULL) 2567 return (NULL); 2568 2569 dlur = (dl_unitdata_req_t *)template_mp->b_rptr; 2570 dlur->dl_priority.dl_min = 0; 2571 dlur->dl_priority.dl_max = 0; 2572 dlur->dl_dest_addr_length = ABS(sap_length) + ill->ill_nd_lla_len; 2573 dlur->dl_dest_addr_offset = sizeof (dl_unitdata_req_t); 2574 2575 /* Copy in the SAP value. */ 2576 NCE_LL_SAP_COPY(ill, template_mp); 2577 2578 return (template_mp); 2579 } 2580 2581 /* 2582 * NDP retransmit timer. 2583 * This timer goes off when: 2584 * a. It is time to retransmit NS for resolver. 2585 * b. It is time to send reachability probes. 2586 */ 2587 void 2588 ndp_timer(void *arg) 2589 { 2590 nce_t *nce = arg; 2591 ill_t *ill = nce->nce_ill; 2592 uint32_t ms; 2593 char addrbuf[INET6_ADDRSTRLEN]; 2594 mblk_t *mp; 2595 boolean_t dropped = B_FALSE; 2596 ip_stack_t *ipst = ill->ill_ipst; 2597 2598 /* 2599 * The timer has to be cancelled by ndp_delete before doing the final 2600 * refrele. So the NCE is guaranteed to exist when the timer runs 2601 * until it clears the timeout_id. Before clearing the timeout_id 2602 * bump up the refcnt so that we can continue to use the nce 2603 */ 2604 ASSERT(nce != NULL); 2605 2606 /* 2607 * Grab the ill_g_lock now itself to avoid lock order problems. 2608 * nce_solicit needs ill_g_lock to be able to traverse ills 2609 */ 2610 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 2611 mutex_enter(&nce->nce_lock); 2612 NCE_REFHOLD_LOCKED(nce); 2613 nce->nce_timeout_id = 0; 2614 2615 /* 2616 * Check the reachability state first. 2617 */ 2618 switch (nce->nce_state) { 2619 case ND_DELAY: 2620 rw_exit(&ipst->ips_ill_g_lock); 2621 nce->nce_state = ND_PROBE; 2622 mutex_exit(&nce->nce_lock); 2623 (void) nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL, B_FALSE, 2624 &ipv6_all_zeros, &nce->nce_addr, NDP_UNICAST); 2625 if (ip_debug > 3) { 2626 /* ip2dbg */ 2627 pr_addr_dbg("ndp_timer: state for %s changed " 2628 "to PROBE\n", AF_INET6, &nce->nce_addr); 2629 } 2630 NDP_RESTART_TIMER(nce, ill->ill_reachable_retrans_time); 2631 NCE_REFRELE(nce); 2632 return; 2633 case ND_PROBE: 2634 /* must be retransmit timer */ 2635 rw_exit(&ipst->ips_ill_g_lock); 2636 nce->nce_pcnt--; 2637 ASSERT(nce->nce_pcnt < ND_MAX_UNICAST_SOLICIT && 2638 nce->nce_pcnt >= -1); 2639 if (nce->nce_pcnt > 0) { 2640 /* 2641 * As per RFC2461, the nce gets deleted after 2642 * MAX_UNICAST_SOLICIT unsuccessful re-transmissions. 2643 * Note that the first unicast solicitation is sent 2644 * during the DELAY state. 2645 */ 2646 ip2dbg(("ndp_timer: pcount=%x dst %s\n", 2647 nce->nce_pcnt, inet_ntop(AF_INET6, &nce->nce_addr, 2648 addrbuf, sizeof (addrbuf)))); 2649 mutex_exit(&nce->nce_lock); 2650 dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL, 2651 B_FALSE, &ipv6_all_zeros, &nce->nce_addr, 2652 (nce->nce_flags & NCE_F_PERMANENT) ? NDP_PROBE : 2653 NDP_UNICAST); 2654 if (dropped) { 2655 mutex_enter(&nce->nce_lock); 2656 nce->nce_pcnt++; 2657 mutex_exit(&nce->nce_lock); 2658 } 2659 NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(ill)); 2660 } else if (nce->nce_pcnt < 0) { 2661 /* No hope, delete the nce */ 2662 nce->nce_state = ND_UNREACHABLE; 2663 mutex_exit(&nce->nce_lock); 2664 if (ip_debug > 2) { 2665 /* ip1dbg */ 2666 pr_addr_dbg("ndp_timer: Delete IRE for" 2667 " dst %s\n", AF_INET6, &nce->nce_addr); 2668 } 2669 ndp_delete(nce); 2670 } else if (!(nce->nce_flags & NCE_F_PERMANENT)) { 2671 /* Wait RetransTimer, before deleting the entry */ 2672 ip2dbg(("ndp_timer: pcount=%x dst %s\n", 2673 nce->nce_pcnt, inet_ntop(AF_INET6, 2674 &nce->nce_addr, addrbuf, sizeof (addrbuf)))); 2675 mutex_exit(&nce->nce_lock); 2676 /* Wait one interval before killing */ 2677 NDP_RESTART_TIMER(nce, ill->ill_reachable_retrans_time); 2678 } else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) { 2679 ipif_t *ipif; 2680 2681 /* 2682 * We're done probing, and we can now declare this 2683 * address to be usable. Let IP know that it's ok to 2684 * use. 2685 */ 2686 nce->nce_state = ND_REACHABLE; 2687 mutex_exit(&nce->nce_lock); 2688 ipif = ipif_lookup_addr_v6(&nce->nce_addr, ill, 2689 ALL_ZONES, NULL, NULL, NULL, NULL, ipst); 2690 if (ipif != NULL) { 2691 if (ipif->ipif_was_dup) { 2692 char ibuf[LIFNAMSIZ + 10]; 2693 char sbuf[INET6_ADDRSTRLEN]; 2694 2695 ipif->ipif_was_dup = B_FALSE; 2696 (void) strlcpy(ibuf, ill->ill_name, 2697 sizeof (ibuf)); 2698 (void) inet_ntop(AF_INET6, 2699 &ipif->ipif_v6lcl_addr, 2700 sbuf, sizeof (sbuf)); 2701 if (ipif->ipif_id != 0) { 2702 (void) snprintf(ibuf + 2703 ill->ill_name_length - 1, 2704 sizeof (ibuf) - 2705 ill->ill_name_length + 1, 2706 ":%d", ipif->ipif_id); 2707 } 2708 cmn_err(CE_NOTE, "recovered address " 2709 "%s on %s", sbuf, ibuf); 2710 } 2711 if ((ipif->ipif_flags & IPIF_UP) && 2712 !ipif->ipif_addr_ready) { 2713 ip_rts_ifmsg(ipif); 2714 ip_rts_newaddrmsg(RTM_ADD, 0, ipif); 2715 sctp_update_ipif(ipif, SCTP_IPIF_UP); 2716 } 2717 ipif->ipif_addr_ready = 1; 2718 ipif_refrele(ipif); 2719 } 2720 /* Begin defending our new address */ 2721 nce->nce_unsolicit_count = 0; 2722 dropped = nce_xmit(ill, ND_NEIGHBOR_ADVERT, ill, 2723 B_FALSE, &nce->nce_addr, &ipv6_all_hosts_mcast, 2724 nce_advert_flags(nce)); 2725 if (dropped) { 2726 nce->nce_unsolicit_count = 1; 2727 NDP_RESTART_TIMER(nce, 2728 ipst->ips_ip_ndp_unsolicit_interval); 2729 } else if (ipst->ips_ip_ndp_defense_interval != 0) { 2730 NDP_RESTART_TIMER(nce, 2731 ipst->ips_ip_ndp_defense_interval); 2732 } 2733 } else { 2734 /* 2735 * This is an address we're probing to be our own, but 2736 * the ill is down. Wait until it comes back before 2737 * doing anything, but switch to reachable state so 2738 * that the restart will work. 2739 */ 2740 nce->nce_state = ND_REACHABLE; 2741 mutex_exit(&nce->nce_lock); 2742 } 2743 NCE_REFRELE(nce); 2744 return; 2745 case ND_INCOMPLETE: 2746 /* 2747 * Must be resolvers retransmit timer. 2748 */ 2749 for (mp = nce->nce_qd_mp; mp != NULL; mp = mp->b_next) { 2750 ip6i_t *ip6i; 2751 ip6_t *ip6h; 2752 mblk_t *data_mp; 2753 2754 /* 2755 * Walk the list of packets queued, and see if there 2756 * are any multipathing probe packets. Such packets 2757 * are always queued at the head. Since this is a 2758 * retransmit timer firing, mark such packets as 2759 * delayed in ND resolution. This info will be used 2760 * in ip_wput_v6(). Multipathing probe packets will 2761 * always have an ip6i_t. Once we hit a packet without 2762 * it, we can break out of this loop. 2763 */ 2764 if (mp->b_datap->db_type == M_CTL) 2765 data_mp = mp->b_cont; 2766 else 2767 data_mp = mp; 2768 2769 ip6h = (ip6_t *)data_mp->b_rptr; 2770 if (ip6h->ip6_nxt != IPPROTO_RAW) 2771 break; 2772 2773 /* 2774 * This message should have been pulled up already in 2775 * ip_wput_v6. We can't do pullups here because the 2776 * b_next/b_prev is non-NULL. 2777 */ 2778 ip6i = (ip6i_t *)ip6h; 2779 ASSERT((data_mp->b_wptr - (uchar_t *)ip6i) >= 2780 sizeof (ip6i_t) + IPV6_HDR_LEN); 2781 2782 /* Mark this packet as delayed due to ND resolution */ 2783 if (ip6i->ip6i_flags & IP6I_DROP_IFDELAYED) 2784 ip6i->ip6i_flags |= IP6I_ND_DELAYED; 2785 } 2786 if (nce->nce_qd_mp != NULL) { 2787 ms = nce_solicit(nce, NULL); 2788 rw_exit(&ipst->ips_ill_g_lock); 2789 if (ms == 0) { 2790 if (nce->nce_state != ND_REACHABLE) { 2791 mutex_exit(&nce->nce_lock); 2792 nce_resolv_failed(nce); 2793 ndp_delete(nce); 2794 } else { 2795 mutex_exit(&nce->nce_lock); 2796 } 2797 } else { 2798 mutex_exit(&nce->nce_lock); 2799 NDP_RESTART_TIMER(nce, (clock_t)ms); 2800 } 2801 NCE_REFRELE(nce); 2802 return; 2803 } 2804 mutex_exit(&nce->nce_lock); 2805 rw_exit(&ipst->ips_ill_g_lock); 2806 NCE_REFRELE(nce); 2807 break; 2808 case ND_REACHABLE : 2809 rw_exit(&ipst->ips_ill_g_lock); 2810 if (((nce->nce_flags & NCE_F_UNSOL_ADV) && 2811 nce->nce_unsolicit_count != 0) || 2812 ((nce->nce_flags & NCE_F_PERMANENT) && 2813 ipst->ips_ip_ndp_defense_interval != 0)) { 2814 if (nce->nce_unsolicit_count > 0) 2815 nce->nce_unsolicit_count--; 2816 mutex_exit(&nce->nce_lock); 2817 dropped = nce_xmit(ill, 2818 ND_NEIGHBOR_ADVERT, 2819 ill, /* ill to be used for hw addr */ 2820 B_FALSE, /* use ill_phys_addr */ 2821 &nce->nce_addr, 2822 &ipv6_all_hosts_mcast, 2823 nce_advert_flags(nce)); 2824 if (dropped) { 2825 mutex_enter(&nce->nce_lock); 2826 nce->nce_unsolicit_count++; 2827 mutex_exit(&nce->nce_lock); 2828 } 2829 if (nce->nce_unsolicit_count != 0) { 2830 NDP_RESTART_TIMER(nce, 2831 ipst->ips_ip_ndp_unsolicit_interval); 2832 } else { 2833 NDP_RESTART_TIMER(nce, 2834 ipst->ips_ip_ndp_defense_interval); 2835 } 2836 } else { 2837 mutex_exit(&nce->nce_lock); 2838 } 2839 NCE_REFRELE(nce); 2840 break; 2841 default: 2842 rw_exit(&ipst->ips_ill_g_lock); 2843 mutex_exit(&nce->nce_lock); 2844 NCE_REFRELE(nce); 2845 break; 2846 } 2847 } 2848 2849 /* 2850 * Set a link layer address from the ll_addr passed in. 2851 * Copy SAP from ill. 2852 */ 2853 static void 2854 nce_set_ll(nce_t *nce, uchar_t *ll_addr) 2855 { 2856 ill_t *ill = nce->nce_ill; 2857 uchar_t *woffset; 2858 2859 ASSERT(ll_addr != NULL); 2860 /* Always called before fast_path_probe */ 2861 ASSERT(nce->nce_fp_mp == NULL); 2862 if (ill->ill_sap_length != 0) { 2863 /* 2864 * Copy the SAP type specified in the 2865 * request into the xmit template. 2866 */ 2867 NCE_LL_SAP_COPY(ill, nce->nce_res_mp); 2868 } 2869 if (ill->ill_phys_addr_length > 0) { 2870 /* 2871 * The bcopy() below used to be called for the physical address 2872 * length rather than the link layer address length. For 2873 * ethernet and many other media, the phys_addr and lla are 2874 * identical. 2875 * However, with xresolv interfaces being introduced, the 2876 * phys_addr and lla are no longer the same, and the physical 2877 * address may not have any useful meaning, so we use the lla 2878 * for IPv6 address resolution and destination addressing. 2879 * 2880 * For PPP or other interfaces with a zero length 2881 * physical address, don't do anything here. 2882 * The bcopy() with a zero phys_addr length was previously 2883 * a no-op for interfaces with a zero-length physical address. 2884 * Using the lla for them would change the way they operate. 2885 * Doing nothing in such cases preserves expected behavior. 2886 */ 2887 woffset = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill); 2888 bcopy(ll_addr, woffset, ill->ill_nd_lla_len); 2889 } 2890 } 2891 2892 static boolean_t 2893 nce_cmp_ll_addr(const nce_t *nce, const uchar_t *ll_addr, uint32_t ll_addr_len) 2894 { 2895 ill_t *ill = nce->nce_ill; 2896 uchar_t *ll_offset; 2897 2898 ASSERT(nce->nce_res_mp != NULL); 2899 if (ll_addr == NULL) 2900 return (B_FALSE); 2901 ll_offset = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill); 2902 if (bcmp(ll_addr, ll_offset, ll_addr_len) != 0) 2903 return (B_TRUE); 2904 return (B_FALSE); 2905 } 2906 2907 /* 2908 * Updates the link layer address or the reachability state of 2909 * a cache entry. Reset probe counter if needed. 2910 */ 2911 static void 2912 nce_update(nce_t *nce, uint16_t new_state, uchar_t *new_ll_addr) 2913 { 2914 ill_t *ill = nce->nce_ill; 2915 boolean_t need_stop_timer = B_FALSE; 2916 boolean_t need_fastpath_update = B_FALSE; 2917 2918 ASSERT(MUTEX_HELD(&nce->nce_lock)); 2919 ASSERT(nce->nce_ipversion == IPV6_VERSION); 2920 /* 2921 * If this interface does not do NUD, there is no point 2922 * in allowing an update to the cache entry. Although 2923 * we will respond to NS. 2924 * The only time we accept an update for a resolver when 2925 * NUD is turned off is when it has just been created. 2926 * Non-Resolvers will always be created as REACHABLE. 2927 */ 2928 if (new_state != ND_UNCHANGED) { 2929 if ((nce->nce_flags & NCE_F_NONUD) && 2930 (nce->nce_state != ND_INCOMPLETE)) 2931 return; 2932 ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN); 2933 ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX); 2934 need_stop_timer = B_TRUE; 2935 if (new_state == ND_REACHABLE) 2936 nce->nce_last = TICK_TO_MSEC(lbolt64); 2937 else { 2938 /* We force NUD in this case */ 2939 nce->nce_last = 0; 2940 } 2941 nce->nce_state = new_state; 2942 nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT; 2943 } 2944 /* 2945 * In case of fast path we need to free the the fastpath 2946 * M_DATA and do another probe. Otherwise we can just 2947 * overwrite the DL_UNITDATA_REQ data, noting we'll lose 2948 * whatever packets that happens to be transmitting at the time. 2949 */ 2950 if (new_ll_addr != NULL) { 2951 ASSERT(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill) + 2952 ill->ill_nd_lla_len <= nce->nce_res_mp->b_wptr); 2953 bcopy(new_ll_addr, nce->nce_res_mp->b_rptr + 2954 NCE_LL_ADDR_OFFSET(ill), ill->ill_nd_lla_len); 2955 if (nce->nce_fp_mp != NULL) { 2956 freemsg(nce->nce_fp_mp); 2957 nce->nce_fp_mp = NULL; 2958 } 2959 need_fastpath_update = B_TRUE; 2960 } 2961 mutex_exit(&nce->nce_lock); 2962 if (need_stop_timer) { 2963 (void) untimeout(nce->nce_timeout_id); 2964 nce->nce_timeout_id = 0; 2965 } 2966 if (need_fastpath_update) 2967 nce_fastpath(nce); 2968 mutex_enter(&nce->nce_lock); 2969 } 2970 2971 void 2972 nce_queue_mp_common(nce_t *nce, mblk_t *mp, boolean_t head_insert) 2973 { 2974 uint_t count = 0; 2975 mblk_t **mpp; 2976 2977 ASSERT(MUTEX_HELD(&nce->nce_lock)); 2978 2979 for (mpp = &nce->nce_qd_mp; *mpp != NULL; 2980 mpp = &(*mpp)->b_next) { 2981 if (++count > 2982 nce->nce_ill->ill_max_buf) { 2983 mblk_t *tmp = nce->nce_qd_mp->b_next; 2984 2985 nce->nce_qd_mp->b_next = NULL; 2986 nce->nce_qd_mp->b_prev = NULL; 2987 freemsg(nce->nce_qd_mp); 2988 nce->nce_qd_mp = tmp; 2989 } 2990 } 2991 /* put this on the list */ 2992 if (head_insert) { 2993 mp->b_next = nce->nce_qd_mp; 2994 nce->nce_qd_mp = mp; 2995 } else { 2996 *mpp = mp; 2997 } 2998 } 2999 3000 static void 3001 nce_queue_mp(nce_t *nce, mblk_t *mp) 3002 { 3003 boolean_t head_insert = B_FALSE; 3004 ip6_t *ip6h; 3005 ip6i_t *ip6i; 3006 mblk_t *data_mp; 3007 3008 ASSERT(MUTEX_HELD(&nce->nce_lock)); 3009 3010 if (mp->b_datap->db_type == M_CTL) 3011 data_mp = mp->b_cont; 3012 else 3013 data_mp = mp; 3014 ip6h = (ip6_t *)data_mp->b_rptr; 3015 if (ip6h->ip6_nxt == IPPROTO_RAW) { 3016 /* 3017 * This message should have been pulled up already in 3018 * ip_wput_v6. We can't do pullups here because the message 3019 * could be from the nce_qd_mp which could have b_next/b_prev 3020 * non-NULL. 3021 */ 3022 ip6i = (ip6i_t *)ip6h; 3023 ASSERT((data_mp->b_wptr - (uchar_t *)ip6i) >= 3024 sizeof (ip6i_t) + IPV6_HDR_LEN); 3025 /* 3026 * Multipathing probe packets have IP6I_DROP_IFDELAYED set. 3027 * This has 2 aspects mentioned below. 3028 * 1. Perform head insertion in the nce_qd_mp for these packets. 3029 * This ensures that next retransmit of ND solicitation 3030 * will use the interface specified by the probe packet, 3031 * for both NS and NA. This corresponds to the src address 3032 * in the IPv6 packet. If we insert at tail, we will be 3033 * depending on the packet at the head for successful 3034 * ND resolution. This is not reliable, because the interface 3035 * on which the NA arrives could be different from the interface 3036 * on which the NS was sent, and if the receiving interface is 3037 * failed, it will appear that the sending interface is also 3038 * failed, causing in.mpathd to misdiagnose this as link 3039 * failure. 3040 * 2. Drop the original packet, if the ND resolution did not 3041 * succeed in the first attempt. However we will create the 3042 * nce and the ire, as soon as the ND resolution succeeds. 3043 * We don't gain anything by queueing multiple probe packets 3044 * and sending them back-to-back once resolution succeeds. 3045 * It is sufficient to send just 1 packet after ND resolution 3046 * succeeds. Since mpathd is sending down probe packets at a 3047 * constant rate, we don't need to send the queued packet. We 3048 * need to queue it only for NDP resolution. The benefit of 3049 * dropping the probe packets that were delayed in ND 3050 * resolution, is that in.mpathd will not see inflated 3051 * RTT. If the ND resolution does not succeed within 3052 * in.mpathd's failure detection time, mpathd may detect 3053 * a failure, and it does not matter whether the packet 3054 * was queued or dropped. 3055 */ 3056 if (ip6i->ip6i_flags & IP6I_DROP_IFDELAYED) 3057 head_insert = B_TRUE; 3058 } 3059 3060 nce_queue_mp_common(nce, mp, head_insert); 3061 } 3062 3063 /* 3064 * Called when address resolution failed due to a timeout. 3065 * Send an ICMP unreachable in response to all queued packets. 3066 */ 3067 void 3068 nce_resolv_failed(nce_t *nce) 3069 { 3070 mblk_t *mp, *nxt_mp, *first_mp; 3071 char buf[INET6_ADDRSTRLEN]; 3072 ip6_t *ip6h; 3073 zoneid_t zoneid = GLOBAL_ZONEID; 3074 ip_stack_t *ipst = nce->nce_ill->ill_ipst; 3075 3076 ip1dbg(("nce_resolv_failed: dst %s\n", 3077 inet_ntop(AF_INET6, (char *)&nce->nce_addr, buf, sizeof (buf)))); 3078 mutex_enter(&nce->nce_lock); 3079 mp = nce->nce_qd_mp; 3080 nce->nce_qd_mp = NULL; 3081 mutex_exit(&nce->nce_lock); 3082 while (mp != NULL) { 3083 nxt_mp = mp->b_next; 3084 mp->b_next = NULL; 3085 mp->b_prev = NULL; 3086 3087 first_mp = mp; 3088 if (mp->b_datap->db_type == M_CTL) { 3089 ipsec_out_t *io = (ipsec_out_t *)mp->b_rptr; 3090 ASSERT(io->ipsec_out_type == IPSEC_OUT); 3091 zoneid = io->ipsec_out_zoneid; 3092 ASSERT(zoneid != ALL_ZONES); 3093 mp = mp->b_cont; 3094 } 3095 3096 ip6h = (ip6_t *)mp->b_rptr; 3097 if (ip6h->ip6_nxt == IPPROTO_RAW) { 3098 ip6i_t *ip6i; 3099 /* 3100 * This message should have been pulled up already 3101 * in ip_wput_v6. ip_hdr_complete_v6 assumes that 3102 * the header is pulled up. 3103 */ 3104 ip6i = (ip6i_t *)ip6h; 3105 ASSERT((mp->b_wptr - (uchar_t *)ip6i) >= 3106 sizeof (ip6i_t) + IPV6_HDR_LEN); 3107 mp->b_rptr += sizeof (ip6i_t); 3108 } 3109 /* 3110 * Ignore failure since icmp_unreachable_v6 will silently 3111 * drop packets with an unspecified source address. 3112 */ 3113 (void) ip_hdr_complete_v6((ip6_t *)mp->b_rptr, zoneid, ipst); 3114 icmp_unreachable_v6(nce->nce_ill->ill_wq, first_mp, 3115 ICMP6_DST_UNREACH_ADDR, B_FALSE, B_FALSE, zoneid, ipst); 3116 mp = nxt_mp; 3117 } 3118 } 3119 3120 /* 3121 * Called by SIOCSNDP* ioctl to add/change an nce entry 3122 * and the corresponding attributes. 3123 * Disallow states other than ND_REACHABLE or ND_STALE. 3124 */ 3125 int 3126 ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr) 3127 { 3128 sin6_t *sin6; 3129 in6_addr_t *addr; 3130 nce_t *nce; 3131 int err; 3132 uint16_t new_flags = 0; 3133 uint16_t old_flags = 0; 3134 int inflags = lnr->lnr_flags; 3135 ip_stack_t *ipst = ill->ill_ipst; 3136 3137 ASSERT(ill->ill_isv6); 3138 if ((lnr->lnr_state_create != ND_REACHABLE) && 3139 (lnr->lnr_state_create != ND_STALE)) 3140 return (EINVAL); 3141 3142 sin6 = (sin6_t *)&lnr->lnr_addr; 3143 addr = &sin6->sin6_addr; 3144 3145 mutex_enter(&ipst->ips_ndp6->ndp_g_lock); 3146 /* We know it can not be mapping so just look in the hash table */ 3147 nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr)); 3148 nce = nce_lookup_addr(ill, addr, nce); 3149 if (nce != NULL) 3150 new_flags = nce->nce_flags; 3151 3152 switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) { 3153 case NDF_ISROUTER_ON: 3154 new_flags |= NCE_F_ISROUTER; 3155 break; 3156 case NDF_ISROUTER_OFF: 3157 new_flags &= ~NCE_F_ISROUTER; 3158 break; 3159 case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON): 3160 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 3161 if (nce != NULL) 3162 NCE_REFRELE(nce); 3163 return (EINVAL); 3164 } 3165 3166 switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) { 3167 case NDF_ANYCAST_ON: 3168 new_flags |= NCE_F_ANYCAST; 3169 break; 3170 case NDF_ANYCAST_OFF: 3171 new_flags &= ~NCE_F_ANYCAST; 3172 break; 3173 case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON): 3174 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 3175 if (nce != NULL) 3176 NCE_REFRELE(nce); 3177 return (EINVAL); 3178 } 3179 3180 switch (inflags & (NDF_PROXY_ON|NDF_PROXY_OFF)) { 3181 case NDF_PROXY_ON: 3182 new_flags |= NCE_F_PROXY; 3183 break; 3184 case NDF_PROXY_OFF: 3185 new_flags &= ~NCE_F_PROXY; 3186 break; 3187 case (NDF_PROXY_OFF|NDF_PROXY_ON): 3188 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 3189 if (nce != NULL) 3190 NCE_REFRELE(nce); 3191 return (EINVAL); 3192 } 3193 3194 if (nce == NULL) { 3195 err = ndp_add_v6(ill, 3196 (uchar_t *)lnr->lnr_hdw_addr, 3197 addr, 3198 &ipv6_all_ones, 3199 &ipv6_all_zeros, 3200 0, 3201 new_flags, 3202 lnr->lnr_state_create, 3203 &nce); 3204 if (err != 0) { 3205 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 3206 ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err)); 3207 return (err); 3208 } 3209 } 3210 old_flags = nce->nce_flags; 3211 if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) { 3212 /* 3213 * Router turned to host, delete all ires. 3214 * XXX Just delete the entry, but we need to add too. 3215 */ 3216 nce->nce_flags &= ~NCE_F_ISROUTER; 3217 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 3218 ndp_delete(nce); 3219 NCE_REFRELE(nce); 3220 return (0); 3221 } 3222 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 3223 3224 mutex_enter(&nce->nce_lock); 3225 nce->nce_flags = new_flags; 3226 mutex_exit(&nce->nce_lock); 3227 /* 3228 * Note that we ignore the state at this point, which 3229 * should be either STALE or REACHABLE. Instead we let 3230 * the link layer address passed in to determine the state 3231 * much like incoming packets. 3232 */ 3233 ndp_process(nce, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE); 3234 NCE_REFRELE(nce); 3235 return (0); 3236 } 3237 3238 /* 3239 * If the device driver supports it, we make nce_fp_mp to have 3240 * an M_DATA prepend. Otherwise nce_fp_mp will be null. 3241 * The caller ensures there is hold on nce for this function. 3242 * Note that since ill_fastpath_probe() copies the mblk there is 3243 * no need for the hold beyond this function. 3244 */ 3245 void 3246 nce_fastpath(nce_t *nce) 3247 { 3248 ill_t *ill = nce->nce_ill; 3249 int res; 3250 3251 ASSERT(ill != NULL); 3252 ASSERT(nce->nce_state != ND_INITIAL && nce->nce_state != ND_INCOMPLETE); 3253 3254 if (nce->nce_fp_mp != NULL) { 3255 /* Already contains fastpath info */ 3256 return; 3257 } 3258 if (nce->nce_res_mp != NULL) { 3259 nce_fastpath_list_add(nce); 3260 res = ill_fastpath_probe(ill, nce->nce_res_mp); 3261 /* 3262 * EAGAIN is an indication of a transient error 3263 * i.e. allocation failure etc. leave the nce in the list it 3264 * will be updated when another probe happens for another ire 3265 * if not it will be taken out of the list when the ire is 3266 * deleted. 3267 */ 3268 3269 if (res != 0 && res != EAGAIN) 3270 nce_fastpath_list_delete(nce); 3271 } 3272 } 3273 3274 /* 3275 * Drain the list of nce's waiting for fastpath response. 3276 */ 3277 void 3278 nce_fastpath_list_dispatch(ill_t *ill, boolean_t (*func)(nce_t *, void *), 3279 void *arg) 3280 { 3281 3282 nce_t *next_nce; 3283 nce_t *current_nce; 3284 nce_t *first_nce; 3285 nce_t *prev_nce = NULL; 3286 3287 mutex_enter(&ill->ill_lock); 3288 first_nce = current_nce = (nce_t *)ill->ill_fastpath_list; 3289 while (current_nce != (nce_t *)&ill->ill_fastpath_list) { 3290 next_nce = current_nce->nce_fastpath; 3291 /* 3292 * Take it off the list if we're flushing, or if the callback 3293 * routine tells us to do so. Otherwise, leave the nce in the 3294 * fastpath list to handle any pending response from the lower 3295 * layer. We can't drain the list when the callback routine 3296 * comparison failed, because the response is asynchronous in 3297 * nature, and may not arrive in the same order as the list 3298 * insertion. 3299 */ 3300 if (func == NULL || func(current_nce, arg)) { 3301 current_nce->nce_fastpath = NULL; 3302 if (current_nce == first_nce) 3303 ill->ill_fastpath_list = first_nce = next_nce; 3304 else 3305 prev_nce->nce_fastpath = next_nce; 3306 } else { 3307 /* previous element that is still in the list */ 3308 prev_nce = current_nce; 3309 } 3310 current_nce = next_nce; 3311 } 3312 mutex_exit(&ill->ill_lock); 3313 } 3314 3315 /* 3316 * Add nce to the nce fastpath list. 3317 */ 3318 void 3319 nce_fastpath_list_add(nce_t *nce) 3320 { 3321 ill_t *ill; 3322 3323 ill = nce->nce_ill; 3324 3325 mutex_enter(&ill->ill_lock); 3326 mutex_enter(&nce->nce_lock); 3327 3328 /* 3329 * if nce has not been deleted and 3330 * is not already in the list add it. 3331 */ 3332 if (!(nce->nce_flags & NCE_F_CONDEMNED) && 3333 (nce->nce_fastpath == NULL)) { 3334 nce->nce_fastpath = (nce_t *)ill->ill_fastpath_list; 3335 ill->ill_fastpath_list = nce; 3336 } 3337 3338 mutex_exit(&nce->nce_lock); 3339 mutex_exit(&ill->ill_lock); 3340 } 3341 3342 /* 3343 * remove nce from the nce fastpath list. 3344 */ 3345 void 3346 nce_fastpath_list_delete(nce_t *nce) 3347 { 3348 nce_t *nce_ptr; 3349 3350 ill_t *ill; 3351 3352 ill = nce->nce_ill; 3353 ASSERT(ill != NULL); 3354 3355 mutex_enter(&ill->ill_lock); 3356 if (nce->nce_fastpath == NULL) 3357 goto done; 3358 3359 ASSERT(ill->ill_fastpath_list != &ill->ill_fastpath_list); 3360 3361 if (ill->ill_fastpath_list == nce) { 3362 ill->ill_fastpath_list = nce->nce_fastpath; 3363 } else { 3364 nce_ptr = ill->ill_fastpath_list; 3365 while (nce_ptr != (nce_t *)&ill->ill_fastpath_list) { 3366 if (nce_ptr->nce_fastpath == nce) { 3367 nce_ptr->nce_fastpath = nce->nce_fastpath; 3368 break; 3369 } 3370 nce_ptr = nce_ptr->nce_fastpath; 3371 } 3372 } 3373 3374 nce->nce_fastpath = NULL; 3375 done: 3376 mutex_exit(&ill->ill_lock); 3377 } 3378 3379 /* 3380 * Update all NCE's that are not in fastpath mode and 3381 * have an nce_fp_mp that matches mp. mp->b_cont contains 3382 * the fastpath header. 3383 * 3384 * Returns TRUE if entry should be dequeued, or FALSE otherwise. 3385 */ 3386 boolean_t 3387 ndp_fastpath_update(nce_t *nce, void *arg) 3388 { 3389 mblk_t *mp, *fp_mp; 3390 uchar_t *mp_rptr, *ud_mp_rptr; 3391 mblk_t *ud_mp = nce->nce_res_mp; 3392 ptrdiff_t cmplen; 3393 3394 if (nce->nce_flags & NCE_F_MAPPING) 3395 return (B_TRUE); 3396 if ((nce->nce_fp_mp != NULL) || (ud_mp == NULL)) 3397 return (B_TRUE); 3398 3399 ip2dbg(("ndp_fastpath_update: trying\n")); 3400 mp = (mblk_t *)arg; 3401 mp_rptr = mp->b_rptr; 3402 cmplen = mp->b_wptr - mp_rptr; 3403 ASSERT(cmplen >= 0); 3404 ud_mp_rptr = ud_mp->b_rptr; 3405 /* 3406 * The nce is locked here to prevent any other threads 3407 * from accessing and changing nce_res_mp when the IPv6 address 3408 * becomes resolved to an lla while we're in the middle 3409 * of looking at and comparing the hardware address (lla). 3410 * It is also locked to prevent multiple threads in nce_fastpath_update 3411 * from examining nce_res_mp atthe same time. 3412 */ 3413 mutex_enter(&nce->nce_lock); 3414 if (ud_mp->b_wptr - ud_mp_rptr != cmplen || 3415 bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) != 0) { 3416 mutex_exit(&nce->nce_lock); 3417 /* 3418 * Don't take the ire off the fastpath list yet, 3419 * since the response may come later. 3420 */ 3421 return (B_FALSE); 3422 } 3423 /* Matched - install mp as the fastpath mp */ 3424 ip1dbg(("ndp_fastpath_update: match\n")); 3425 fp_mp = dupb(mp->b_cont); 3426 if (fp_mp != NULL) { 3427 nce->nce_fp_mp = fp_mp; 3428 } 3429 mutex_exit(&nce->nce_lock); 3430 return (B_TRUE); 3431 } 3432 3433 /* 3434 * This function handles the DL_NOTE_FASTPATH_FLUSH notification from 3435 * driver. Note that it assumes IP is exclusive... 3436 */ 3437 /* ARGSUSED */ 3438 void 3439 ndp_fastpath_flush(nce_t *nce, char *arg) 3440 { 3441 if (nce->nce_flags & NCE_F_MAPPING) 3442 return; 3443 /* No fastpath info? */ 3444 if (nce->nce_fp_mp == NULL || nce->nce_res_mp == NULL) 3445 return; 3446 3447 if (nce->nce_ipversion == IPV4_VERSION && 3448 nce->nce_flags & NCE_F_BCAST) { 3449 /* 3450 * IPv4 BROADCAST entries: 3451 * We can't delete the nce since it is difficult to 3452 * recreate these without going through the 3453 * ipif down/up dance. 3454 * 3455 * All access to nce->nce_fp_mp in the case of these 3456 * is protected by nce_lock. 3457 */ 3458 mutex_enter(&nce->nce_lock); 3459 if (nce->nce_fp_mp != NULL) { 3460 freeb(nce->nce_fp_mp); 3461 nce->nce_fp_mp = NULL; 3462 mutex_exit(&nce->nce_lock); 3463 nce_fastpath(nce); 3464 } else { 3465 mutex_exit(&nce->nce_lock); 3466 } 3467 } else { 3468 /* Just delete the NCE... */ 3469 ndp_delete(nce); 3470 } 3471 } 3472 3473 /* 3474 * Return a pointer to a given option in the packet. 3475 * Assumes that option part of the packet have already been validated. 3476 */ 3477 nd_opt_hdr_t * 3478 ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type) 3479 { 3480 while (optlen > 0) { 3481 if (opt->nd_opt_type == opt_type) 3482 return (opt); 3483 optlen -= 8 * opt->nd_opt_len; 3484 opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len); 3485 } 3486 return (NULL); 3487 } 3488 3489 /* 3490 * Verify all option lengths present are > 0, also check to see 3491 * if the option lengths and packet length are consistent. 3492 */ 3493 boolean_t 3494 ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen) 3495 { 3496 ASSERT(opt != NULL); 3497 while (optlen > 0) { 3498 if (opt->nd_opt_len == 0) 3499 return (B_FALSE); 3500 optlen -= 8 * opt->nd_opt_len; 3501 if (optlen < 0) 3502 return (B_FALSE); 3503 opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len); 3504 } 3505 return (B_TRUE); 3506 } 3507 3508 /* 3509 * ndp_walk function. 3510 * Free a fraction of the NCE cache entries. 3511 * A fraction of zero means to not free any in that category. 3512 */ 3513 void 3514 ndp_cache_reclaim(nce_t *nce, char *arg) 3515 { 3516 nce_cache_reclaim_t *ncr = (nce_cache_reclaim_t *)arg; 3517 uint_t rand; 3518 3519 if (nce->nce_flags & NCE_F_PERMANENT) 3520 return; 3521 3522 rand = (uint_t)lbolt + 3523 NCE_ADDR_HASH_V6(nce->nce_addr, NCE_TABLE_SIZE); 3524 if (ncr->ncr_host != 0 && 3525 (rand/ncr->ncr_host)*ncr->ncr_host == rand) { 3526 ndp_delete(nce); 3527 return; 3528 } 3529 } 3530 3531 /* 3532 * ndp_walk function. 3533 * Count the number of NCEs that can be deleted. 3534 * These would be hosts but not routers. 3535 */ 3536 void 3537 ndp_cache_count(nce_t *nce, char *arg) 3538 { 3539 ncc_cache_count_t *ncc = (ncc_cache_count_t *)arg; 3540 3541 if (nce->nce_flags & NCE_F_PERMANENT) 3542 return; 3543 3544 ncc->ncc_total++; 3545 if (!(nce->nce_flags & NCE_F_ISROUTER)) 3546 ncc->ncc_host++; 3547 } 3548 3549 #ifdef NCE_DEBUG 3550 th_trace_t * 3551 th_trace_nce_lookup(nce_t *nce) 3552 { 3553 int bucket_id; 3554 th_trace_t *th_trace; 3555 3556 ASSERT(MUTEX_HELD(&nce->nce_lock)); 3557 3558 bucket_id = IP_TR_HASH(curthread); 3559 ASSERT(bucket_id < IP_TR_HASH_MAX); 3560 3561 for (th_trace = nce->nce_trace[bucket_id]; th_trace != NULL; 3562 th_trace = th_trace->th_next) { 3563 if (th_trace->th_id == curthread) 3564 return (th_trace); 3565 } 3566 return (NULL); 3567 } 3568 3569 void 3570 nce_trace_ref(nce_t *nce) 3571 { 3572 int bucket_id; 3573 th_trace_t *th_trace; 3574 3575 /* 3576 * Attempt to locate the trace buffer for the curthread. 3577 * If it does not exist, then allocate a new trace buffer 3578 * and link it in list of trace bufs for this ipif, at the head 3579 */ 3580 ASSERT(MUTEX_HELD(&nce->nce_lock)); 3581 3582 if (nce->nce_trace_disable == B_TRUE) 3583 return; 3584 3585 th_trace = th_trace_nce_lookup(nce); 3586 if (th_trace == NULL) { 3587 bucket_id = IP_TR_HASH(curthread); 3588 th_trace = (th_trace_t *)kmem_zalloc(sizeof (th_trace_t), 3589 KM_NOSLEEP); 3590 if (th_trace == NULL) { 3591 nce->nce_trace_disable = B_TRUE; 3592 nce_trace_inactive(nce); 3593 return; 3594 } 3595 th_trace->th_id = curthread; 3596 th_trace->th_next = nce->nce_trace[bucket_id]; 3597 th_trace->th_prev = &nce->nce_trace[bucket_id]; 3598 if (th_trace->th_next != NULL) 3599 th_trace->th_next->th_prev = &th_trace->th_next; 3600 nce->nce_trace[bucket_id] = th_trace; 3601 } 3602 ASSERT(th_trace->th_refcnt < TR_BUF_MAX - 1); 3603 th_trace->th_refcnt++; 3604 th_trace_rrecord(th_trace); 3605 } 3606 3607 void 3608 nce_untrace_ref(nce_t *nce) 3609 { 3610 th_trace_t *th_trace; 3611 3612 ASSERT(MUTEX_HELD(&nce->nce_lock)); 3613 3614 if (nce->nce_trace_disable == B_TRUE) 3615 return; 3616 3617 th_trace = th_trace_nce_lookup(nce); 3618 ASSERT(th_trace != NULL && th_trace->th_refcnt > 0); 3619 3620 th_trace_rrecord(th_trace); 3621 th_trace->th_refcnt--; 3622 } 3623 3624 void 3625 nce_trace_inactive(nce_t *nce) 3626 { 3627 th_trace_t *th_trace; 3628 int i; 3629 3630 ASSERT(MUTEX_HELD(&nce->nce_lock)); 3631 3632 for (i = 0; i < IP_TR_HASH_MAX; i++) { 3633 while (nce->nce_trace[i] != NULL) { 3634 th_trace = nce->nce_trace[i]; 3635 3636 /* unlink th_trace and free it */ 3637 nce->nce_trace[i] = th_trace->th_next; 3638 if (th_trace->th_next != NULL) 3639 th_trace->th_next->th_prev = 3640 &nce->nce_trace[i]; 3641 3642 th_trace->th_next = NULL; 3643 th_trace->th_prev = NULL; 3644 kmem_free(th_trace, sizeof (th_trace_t)); 3645 } 3646 } 3647 3648 } 3649 3650 /* ARGSUSED */ 3651 int 3652 nce_thread_exit(nce_t *nce, caddr_t arg) 3653 { 3654 th_trace_t *th_trace; 3655 uint64_t now; 3656 3657 mutex_enter(&nce->nce_lock); 3658 if (nce->nce_state == ND_INITIAL) { 3659 3660 now = TICK_TO_MSEC(lbolt64); 3661 if (now - nce->nce_init_time > NCE_STUCK_TIMEOUT) { 3662 DTRACE_PROBE1(nce__stuck, nce_t *, nce); 3663 } 3664 } 3665 th_trace = th_trace_nce_lookup(nce); 3666 3667 if (th_trace == NULL) { 3668 mutex_exit(&nce->nce_lock); 3669 return (0); 3670 } 3671 3672 ASSERT(th_trace->th_refcnt == 0); 3673 3674 /* unlink th_trace and free it */ 3675 *th_trace->th_prev = th_trace->th_next; 3676 if (th_trace->th_next != NULL) 3677 th_trace->th_next->th_prev = th_trace->th_prev; 3678 th_trace->th_next = NULL; 3679 th_trace->th_prev = NULL; 3680 kmem_free(th_trace, sizeof (th_trace_t)); 3681 mutex_exit(&nce->nce_lock); 3682 return (0); 3683 } 3684 #endif 3685 3686 /* 3687 * Called when address resolution fails due to a timeout. 3688 * Send an ICMP unreachable in response to all queued packets. 3689 */ 3690 void 3691 arp_resolv_failed(nce_t *nce) 3692 { 3693 mblk_t *mp, *nxt_mp, *first_mp; 3694 char buf[INET6_ADDRSTRLEN]; 3695 zoneid_t zoneid = GLOBAL_ZONEID; 3696 struct in_addr ipv4addr; 3697 ip_stack_t *ipst = nce->nce_ill->ill_ipst; 3698 3699 IN6_V4MAPPED_TO_INADDR(&nce->nce_addr, &ipv4addr); 3700 ip3dbg(("arp_resolv_failed: dst %s\n", 3701 inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf)))); 3702 mutex_enter(&nce->nce_lock); 3703 mp = nce->nce_qd_mp; 3704 nce->nce_qd_mp = NULL; 3705 mutex_exit(&nce->nce_lock); 3706 3707 while (mp != NULL) { 3708 nxt_mp = mp->b_next; 3709 mp->b_next = NULL; 3710 mp->b_prev = NULL; 3711 3712 first_mp = mp; 3713 /* 3714 * Send icmp unreachable messages 3715 * to the hosts. 3716 */ 3717 (void) ip_hdr_complete((ipha_t *)mp->b_rptr, zoneid, ipst); 3718 ip3dbg(("arp_resolv_failed: Calling icmp_unreachable\n")); 3719 icmp_unreachable(nce->nce_ill->ill_wq, first_mp, 3720 ICMP_HOST_UNREACHABLE, zoneid, ipst); 3721 mp = nxt_mp; 3722 } 3723 } 3724 3725 int 3726 ndp_lookup_then_add_v4(ill_t *ill, const in_addr_t *addr, uint16_t flags, 3727 nce_t **newnce, nce_t *src_nce) 3728 { 3729 int err; 3730 nce_t *nce; 3731 in6_addr_t addr6; 3732 ip_stack_t *ipst = ill->ill_ipst; 3733 3734 mutex_enter(&ipst->ips_ndp4->ndp_g_lock); 3735 nce = *((nce_t **)NCE_HASH_PTR_V4(ipst, *addr)); 3736 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6); 3737 nce = nce_lookup_addr(ill, &addr6, nce); 3738 if (nce == NULL) { 3739 err = ndp_add_v4(ill, addr, flags, newnce, src_nce); 3740 } else { 3741 *newnce = nce; 3742 err = EEXIST; 3743 } 3744 mutex_exit(&ipst->ips_ndp4->ndp_g_lock); 3745 return (err); 3746 } 3747 3748 /* 3749 * NDP Cache Entry creation routine for IPv4. 3750 * Mapped entries are handled in arp. 3751 * This routine must always be called with ndp4->ndp_g_lock held. 3752 * Prior to return, nce_refcnt is incremented. 3753 */ 3754 static int 3755 ndp_add_v4(ill_t *ill, const in_addr_t *addr, uint16_t flags, 3756 nce_t **newnce, nce_t *src_nce) 3757 { 3758 static nce_t nce_nil; 3759 nce_t *nce; 3760 mblk_t *mp; 3761 mblk_t *template = NULL; 3762 nce_t **ncep; 3763 ip_stack_t *ipst = ill->ill_ipst; 3764 uint16_t state = ND_INITIAL; 3765 int err; 3766 3767 ASSERT(MUTEX_HELD(&ipst->ips_ndp4->ndp_g_lock)); 3768 ASSERT(!ill->ill_isv6); 3769 ASSERT((flags & NCE_F_MAPPING) == 0); 3770 3771 if (ill->ill_resolver_mp == NULL) 3772 return (EINVAL); 3773 /* 3774 * Allocate the mblk to hold the nce. 3775 */ 3776 mp = allocb(sizeof (nce_t), BPRI_MED); 3777 if (mp == NULL) 3778 return (ENOMEM); 3779 3780 nce = (nce_t *)mp->b_rptr; 3781 mp->b_wptr = (uchar_t *)&nce[1]; 3782 *nce = nce_nil; 3783 nce->nce_ill = ill; 3784 nce->nce_ipversion = IPV4_VERSION; 3785 nce->nce_flags = flags; 3786 nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT; 3787 nce->nce_rcnt = ill->ill_xmit_count; 3788 IN6_IPADDR_TO_V4MAPPED(*addr, &nce->nce_addr); 3789 nce->nce_mask = ipv6_all_ones; 3790 nce->nce_extract_mask = ipv6_all_zeros; 3791 nce->nce_ll_extract_start = 0; 3792 nce->nce_qd_mp = NULL; 3793 nce->nce_mp = mp; 3794 /* This one is for nce getting created */ 3795 nce->nce_refcnt = 1; 3796 mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL); 3797 ncep = ((nce_t **)NCE_HASH_PTR_V4(ipst, *addr)); 3798 3799 #ifdef NCE_DEBUG 3800 bzero(nce->nce_trace, sizeof (th_trace_t *) * IP_TR_HASH_MAX); 3801 #endif 3802 if (src_nce != NULL) { 3803 /* 3804 * src_nce has been provided by the caller. The only 3805 * caller who provides a non-null, non-broadcast 3806 * src_nce is from ip_newroute() which must pass in 3807 * a ND_REACHABLE src_nce (this condition is verified 3808 * via an ASSERT for the save_ire->ire_nce in ip_newroute()) 3809 */ 3810 mutex_enter(&src_nce->nce_lock); 3811 state = src_nce->nce_state; 3812 if ((src_nce->nce_flags & NCE_F_CONDEMNED) || 3813 (ipst->ips_ndp4->ndp_g_hw_change > 0)) { 3814 /* 3815 * src_nce has been deleted, or 3816 * ip_arp_news is in the middle of 3817 * flushing entries in the the nce. 3818 * Fail the add, since we don't know 3819 * if it is safe to copy the contents of 3820 * src_nce 3821 */ 3822 DTRACE_PROBE2(nce__bad__src__nce, 3823 nce_t *, src_nce, ill_t *, ill); 3824 mutex_exit(&src_nce->nce_lock); 3825 err = EINVAL; 3826 goto err_ret; 3827 } 3828 template = copyb(src_nce->nce_res_mp); 3829 mutex_exit(&src_nce->nce_lock); 3830 if (template == NULL) { 3831 err = ENOMEM; 3832 goto err_ret; 3833 } 3834 } else if (flags & NCE_F_BCAST) { 3835 /* 3836 * broadcast nce. 3837 */ 3838 template = copyb(ill->ill_bcast_mp); 3839 if (template == NULL) { 3840 err = ENOMEM; 3841 goto err_ret; 3842 } 3843 state = ND_REACHABLE; 3844 } else if (ill->ill_net_type == IRE_IF_NORESOLVER) { 3845 /* 3846 * NORESOLVER entries are always created in the REACHABLE 3847 * state. We create a nce_res_mp with the IP nexthop address 3848 * in the destination address in the DLPI hdr if the 3849 * physical length is exactly 4 bytes. 3850 * 3851 * XXX not clear which drivers set ill_phys_addr_length to 3852 * IP_ADDR_LEN. 3853 */ 3854 if (ill->ill_phys_addr_length == IP_ADDR_LEN) { 3855 template = ill_dlur_gen((uchar_t *)addr, 3856 ill->ill_phys_addr_length, 3857 ill->ill_sap, ill->ill_sap_length); 3858 } else { 3859 template = copyb(ill->ill_resolver_mp); 3860 } 3861 if (template == NULL) { 3862 err = ENOMEM; 3863 goto err_ret; 3864 } 3865 state = ND_REACHABLE; 3866 } 3867 nce->nce_fp_mp = NULL; 3868 nce->nce_res_mp = template; 3869 nce->nce_state = state; 3870 if (state == ND_REACHABLE) { 3871 nce->nce_last = TICK_TO_MSEC(lbolt64); 3872 nce->nce_init_time = TICK_TO_MSEC(lbolt64); 3873 } else { 3874 nce->nce_last = 0; 3875 if (state == ND_INITIAL) 3876 nce->nce_init_time = TICK_TO_MSEC(lbolt64); 3877 } 3878 3879 ASSERT((nce->nce_res_mp == NULL && nce->nce_state == ND_INITIAL) || 3880 (nce->nce_res_mp != NULL && nce->nce_state == ND_REACHABLE)); 3881 /* 3882 * Atomically ensure that the ill is not CONDEMNED, before 3883 * adding the NCE. 3884 */ 3885 mutex_enter(&ill->ill_lock); 3886 if (ill->ill_state_flags & ILL_CONDEMNED) { 3887 mutex_exit(&ill->ill_lock); 3888 err = EINVAL; 3889 goto err_ret; 3890 } 3891 if ((nce->nce_next = *ncep) != NULL) 3892 nce->nce_next->nce_ptpn = &nce->nce_next; 3893 *ncep = nce; 3894 nce->nce_ptpn = ncep; 3895 *newnce = nce; 3896 /* This one is for nce being used by an active thread */ 3897 NCE_REFHOLD(*newnce); 3898 3899 /* Bump up the number of nce's referencing this ill */ 3900 ill->ill_nce_cnt++; 3901 mutex_exit(&ill->ill_lock); 3902 DTRACE_PROBE1(ndp__add__v4, nce_t *, nce); 3903 return (0); 3904 err_ret: 3905 freeb(mp); 3906 freemsg(template); 3907 return (err); 3908 } 3909 3910 void 3911 ndp_flush_qd_mp(nce_t *nce) 3912 { 3913 mblk_t *qd_mp, *qd_next; 3914 3915 ASSERT(MUTEX_HELD(&nce->nce_lock)); 3916 qd_mp = nce->nce_qd_mp; 3917 nce->nce_qd_mp = NULL; 3918 while (qd_mp != NULL) { 3919 qd_next = qd_mp->b_next; 3920 qd_mp->b_next = NULL; 3921 qd_mp->b_prev = NULL; 3922 freemsg(qd_mp); 3923 qd_mp = qd_next; 3924 } 3925 } 3926 3927 3928 /* 3929 * ndp_walk routine to delete all entries that have a given destination or 3930 * gateway address and cached link layer (MAC) address. This is used when ARP 3931 * informs us that a network-to-link-layer mapping may have changed. 3932 */ 3933 void 3934 nce_delete_hw_changed(nce_t *nce, void *arg) 3935 { 3936 nce_hw_map_t *hwm = arg; 3937 mblk_t *mp; 3938 dl_unitdata_req_t *dlu; 3939 uchar_t *macaddr; 3940 ill_t *ill; 3941 int saplen; 3942 ipaddr_t nce_addr; 3943 3944 if (nce->nce_state != ND_REACHABLE) 3945 return; 3946 3947 IN6_V4MAPPED_TO_IPADDR(&nce->nce_addr, nce_addr); 3948 if (nce_addr != hwm->hwm_addr) 3949 return; 3950 3951 mutex_enter(&nce->nce_lock); 3952 if ((mp = nce->nce_res_mp) == NULL) { 3953 mutex_exit(&nce->nce_lock); 3954 return; 3955 } 3956 dlu = (dl_unitdata_req_t *)mp->b_rptr; 3957 macaddr = (uchar_t *)(dlu + 1); 3958 ill = nce->nce_ill; 3959 if ((saplen = ill->ill_sap_length) > 0) 3960 macaddr += saplen; 3961 else 3962 saplen = -saplen; 3963 3964 /* 3965 * If the hardware address is unchanged, then leave this one alone. 3966 * Note that saplen == abs(saplen) now. 3967 */ 3968 if (hwm->hwm_hwlen == dlu->dl_dest_addr_length - saplen && 3969 bcmp(hwm->hwm_hwaddr, macaddr, hwm->hwm_hwlen) == 0) { 3970 mutex_exit(&nce->nce_lock); 3971 return; 3972 } 3973 mutex_exit(&nce->nce_lock); 3974 3975 DTRACE_PROBE1(nce__hw__deleted, nce_t *, nce); 3976 ndp_delete(nce); 3977 } 3978 3979 /* 3980 * This function verifies whether a given IPv4 address is potentially known to 3981 * the NCE subsystem. If so, then ARP must not delete the corresponding ace_t, 3982 * so that it can continue to look for hardware changes on that address. 3983 */ 3984 boolean_t 3985 ndp_lookup_ipaddr(in_addr_t addr, netstack_t *ns) 3986 { 3987 nce_t *nce; 3988 struct in_addr nceaddr; 3989 ip_stack_t *ipst = ns->netstack_ip; 3990 3991 if (addr == INADDR_ANY) 3992 return (B_FALSE); 3993 3994 mutex_enter(&ipst->ips_ndp4->ndp_g_lock); 3995 nce = *(nce_t **)NCE_HASH_PTR_V4(ipst, addr); 3996 for (; nce != NULL; nce = nce->nce_next) { 3997 /* Note that only v4 mapped entries are in the table. */ 3998 IN6_V4MAPPED_TO_INADDR(&nce->nce_addr, &nceaddr); 3999 if (addr == nceaddr.s_addr && 4000 IN6_ARE_ADDR_EQUAL(&nce->nce_mask, &ipv6_all_ones)) { 4001 /* Single flag check; no lock needed */ 4002 if (!(nce->nce_flags & NCE_F_CONDEMNED)) 4003 break; 4004 } 4005 } 4006 mutex_exit(&ipst->ips_ndp4->ndp_g_lock); 4007 return (nce != NULL); 4008 } 4009