1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 /* Copyright (c) 1990 Mentat Inc. */ 25 26 /* 27 * Copyright (c) 2018, Joyent, Inc. 28 * Copyright 2024 Oxide Computer Company 29 */ 30 31 /* 32 * Procedures for the kernel part of DVMRP, 33 * a Distance-Vector Multicast Routing Protocol. 34 * (See RFC-1075) 35 * Written by David Waitzman, BBN Labs, August 1988. 36 * Modified by Steve Deering, Stanford, February 1989. 37 * Modified by Mark J. 
Steiglitz, Stanford, May, 1991 38 * Modified by Van Jacobson, LBL, January 1993 39 * Modified by Ajit Thyagarajan, PARC, August 1993 40 * Modified by Bill Fenner, PARC, April 1995 41 * 42 * MROUTING 3.5 43 */ 44 45 /* 46 * TODO 47 * - function pointer field in vif, void *vif_sendit() 48 */ 49 50 #include <sys/types.h> 51 #include <sys/stream.h> 52 #include <sys/stropts.h> 53 #include <sys/strlog.h> 54 #include <sys/systm.h> 55 #include <sys/ddi.h> 56 #include <sys/cmn_err.h> 57 #include <sys/zone.h> 58 59 #include <sys/param.h> 60 #include <sys/socket.h> 61 #include <sys/vtrace.h> 62 #include <sys/debug.h> 63 #include <net/if.h> 64 #include <sys/sockio.h> 65 #include <netinet/in.h> 66 #include <net/if_dl.h> 67 68 #include <inet/ipsec_impl.h> 69 #include <inet/common.h> 70 #include <inet/mi.h> 71 #include <inet/nd.h> 72 #include <inet/tunables.h> 73 #include <inet/mib2.h> 74 #include <netinet/ip6.h> 75 #include <inet/ip.h> 76 #include <inet/snmpcom.h> 77 78 #include <netinet/igmp.h> 79 #include <netinet/igmp_var.h> 80 #include <netinet/udp.h> 81 #include <netinet/ip_mroute.h> 82 #include <inet/ip_multi.h> 83 #include <inet/ip_ire.h> 84 #include <inet/ip_ndp.h> 85 #include <inet/ip_if.h> 86 #include <inet/ipclassifier.h> 87 88 #include <netinet/pim.h> 89 90 91 /* 92 * MT Design: 93 * 94 * There are three main data structures viftable, mfctable and tbftable that 95 * need to be protected against MT races. 96 * 97 * vitable is a fixed length array of vif structs. There is no lock to protect 98 * the whole array, instead each struct is protected by its own indiviual lock. 99 * The value of v_marks in conjuction with the value of v_refcnt determines the 100 * current state of a vif structure. One special state that needs mention 101 * is when the vif is marked VIF_MARK_NOTINUSE but refcnt != 0. This indicates 102 * that vif is being initalized. 103 * Each structure is freed when the refcnt goes down to zero. 
If a delete comes
 * in when the refcnt is > 1, the vif structure is marked VIF_MARK_CONDEMNED
 * which prevents the struct from further use. When the refcnt goes to zero
 * the struct is freed and is marked VIF_MARK_NOTINUSE.
 * vif struct stores a pointer to the ipif in v_ipif, to prevent ipif/ill
 * from going away a refhold is put on the ipif before using it. See
 * lock_good_vif() and unlock_good_vif().
 *
 * VIF_REFHOLD and VIF_REFRELE macros have been provided to manipulate refcnts
 * of the vif struct.
 *
 * tbftable is also a fixed length array of tbf structs and is only accessed
 * via v_tbf. It is protected by its own lock tbf_lock.
 *
 * Lock Ordering is
 * v_lock --> tbf_lock
 * v_lock --> ill_lock
 *
 * mfctable is a fixed size hash table of mfc bucket structs (struct mfcb).
 * Each mfc bucket struct (struct mfcb) maintains a refcnt for each walker,
 * it also maintains a state. These fields are protected by a lock (mfcb_lock).
 * mfc structs only maintain a state and have no refcnt. mfc_mutex is used to
 * protect the struct elements.
 *
 * mfc structs are dynamically allocated and are singly linked
 * at the head of the chain. When an mfc structure is to be deleted
 * it is marked condemned and so is the state in the bucket struct.
 * When the last walker of the hash bucket exits all the mfc structs
 * marked condemned are freed.
 *
 * Locking Hierarchy:
 * The bucket lock should be acquired before the mfc struct lock.
 * MFCB_REFHOLD and MFCB_REFRELE macros are provided for locking
 * operations on the bucket struct.
 *
 * last_encap_lock and numvifs_mutex should be acquired after
 * acquiring vif or mfc locks. These locks protect some global variables.
 *
 * The statistics are not currently protected by a lock
 * causing the stats to be approximate, not exact.
143 */ 144 145 #define NO_VIF MAXVIFS /* from mrouted, no route for src */ 146 147 /* 148 * Timeouts: 149 * Upcall timeouts - BSD uses boolean_t mfc->expire and 150 * nexpire[MFCTBLSIZE], the number of times expire has been called. 151 * SunOS 5.x uses mfc->timeout for each mfc. 152 * Some Unixes are limited in the number of simultaneous timeouts 153 * that can be run, SunOS 5.x does not have this restriction. 154 */ 155 156 /* 157 * In BSD, EXPIRE_TIMEOUT is how often expire_upcalls() is called and 158 * UPCALL_EXPIRE is the nmber of timeouts before a particular upcall 159 * expires. Thus the time till expiration is EXPIRE_TIMEOUT * UPCALL_EXPIRE 160 */ 161 #define EXPIRE_TIMEOUT (hz/4) /* 4x / second */ 162 #define UPCALL_EXPIRE 6 /* number of timeouts */ 163 164 /* 165 * Hash function for a source, group entry 166 */ 167 #define MFCHASH(a, g) MFCHASHMOD(((a) >> 20) ^ ((a) >> 10) ^ (a) ^ \ 168 ((g) >> 20) ^ ((g) >> 10) ^ (g)) 169 170 #define TBF_REPROCESS (hz / 100) /* 100x /second */ 171 172 /* Identify PIM packet that came on a Register interface */ 173 #define PIM_REGISTER_MARKER 0xffffffff 174 175 /* Function declarations */ 176 static int add_mfc(struct mfcctl *, ip_stack_t *); 177 static int add_vif(struct vifctl *, conn_t *, ip_stack_t *); 178 static int del_mfc(struct mfcctl *, ip_stack_t *); 179 static int del_vif(vifi_t *, ip_stack_t *); 180 static void del_vifp(struct vif *); 181 static void encap_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t); 182 static void expire_upcalls(void *); 183 static void fill_route(struct mfc *, struct mfcctl *, ip_stack_t *); 184 static void free_queue(struct mfc *); 185 static int get_assert(uchar_t *, ip_stack_t *); 186 static int get_lsg_cnt(struct sioc_lsg_req *, ip_stack_t *); 187 static int get_sg_cnt(struct sioc_sg_req *, ip_stack_t *); 188 static int get_version(uchar_t *); 189 static int get_vif_cnt(struct sioc_vif_req *, ip_stack_t *); 190 static int ip_mdq(mblk_t *, ipha_t *, ill_t *, 191 ipaddr_t, struct mfc 
*); 192 static int ip_mrouter_init(conn_t *, uchar_t *, int, ip_stack_t *); 193 static void phyint_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t); 194 static int register_mforward(mblk_t *, ip_recv_attr_t *); 195 static void register_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t); 196 static int set_assert(int *, ip_stack_t *); 197 198 /* 199 * Token Bucket Filter functions 200 */ 201 static int priority(struct vif *, ipha_t *); 202 static void tbf_control(struct vif *, mblk_t *, ipha_t *); 203 static int tbf_dq_sel(struct vif *, ipha_t *); 204 static void tbf_process_q(struct vif *); 205 static void tbf_queue(struct vif *, mblk_t *); 206 static void tbf_reprocess_q(void *); 207 static void tbf_send_packet(struct vif *, mblk_t *); 208 static void tbf_update_tokens(struct vif *); 209 static void release_mfc(struct mfcb *); 210 211 static boolean_t is_mrouter_off(ip_stack_t *); 212 /* 213 * Encapsulation packets 214 */ 215 216 #define ENCAP_TTL 64 217 218 /* prototype IP hdr for encapsulated packets */ 219 static ipha_t multicast_encap_iphdr = { 220 IP_SIMPLE_HDR_VERSION, 221 0, /* tos */ 222 sizeof (ipha_t), /* total length */ 223 0, /* id */ 224 0, /* frag offset */ 225 ENCAP_TTL, IPPROTO_ENCAP, 226 0, /* checksum */ 227 }; 228 229 /* 230 * Rate limit for assert notification messages, in nsec. 
231 */ 232 #define ASSERT_MSG_TIME 3000000000 233 234 235 #define VIF_REFHOLD(vifp) { \ 236 mutex_enter(&(vifp)->v_lock); \ 237 (vifp)->v_refcnt++; \ 238 mutex_exit(&(vifp)->v_lock); \ 239 } 240 241 #define VIF_REFRELE_LOCKED(vifp) { \ 242 (vifp)->v_refcnt--; \ 243 if ((vifp)->v_refcnt == 0 && \ 244 ((vifp)->v_marks & VIF_MARK_CONDEMNED)) { \ 245 del_vifp(vifp); \ 246 } else { \ 247 mutex_exit(&(vifp)->v_lock); \ 248 } \ 249 } 250 251 #define VIF_REFRELE(vifp) { \ 252 mutex_enter(&(vifp)->v_lock); \ 253 (vifp)->v_refcnt--; \ 254 if ((vifp)->v_refcnt == 0 && \ 255 ((vifp)->v_marks & VIF_MARK_CONDEMNED)) { \ 256 del_vifp(vifp); \ 257 } else { \ 258 mutex_exit(&(vifp)->v_lock); \ 259 } \ 260 } 261 262 #define MFCB_REFHOLD(mfcb) { \ 263 mutex_enter(&(mfcb)->mfcb_lock); \ 264 (mfcb)->mfcb_refcnt++; \ 265 ASSERT((mfcb)->mfcb_refcnt != 0); \ 266 mutex_exit(&(mfcb)->mfcb_lock); \ 267 } 268 269 #define MFCB_REFRELE(mfcb) { \ 270 mutex_enter(&(mfcb)->mfcb_lock); \ 271 ASSERT((mfcb)->mfcb_refcnt != 0); \ 272 if (--(mfcb)->mfcb_refcnt == 0 && \ 273 ((mfcb)->mfcb_marks & MFCB_MARK_CONDEMNED)) { \ 274 release_mfc(mfcb); \ 275 } \ 276 mutex_exit(&(mfcb)->mfcb_lock); \ 277 } 278 279 /* 280 * MFCFIND: 281 * Find a route for a given origin IP address and multicast group address. 282 * Skip entries with pending upcalls. 283 * Type of service parameter to be added in the future! 284 */ 285 #define MFCFIND(mfcbp, o, g, rt) { \ 286 struct mfc *_mb_rt = NULL; \ 287 rt = NULL; \ 288 _mb_rt = mfcbp->mfcb_mfc; \ 289 while (_mb_rt) { \ 290 if ((_mb_rt->mfc_origin.s_addr == o) && \ 291 (_mb_rt->mfc_mcastgrp.s_addr == g) && \ 292 (_mb_rt->mfc_rte == NULL) && \ 293 (!(_mb_rt->mfc_marks & MFCB_MARK_CONDEMNED))) { \ 294 rt = _mb_rt; \ 295 break; \ 296 } \ 297 _mb_rt = _mb_rt->mfc_next; \ 298 } \ 299 } 300 301 /* 302 * BSD uses timeval with sec and usec. In SunOS 5.x uniqtime() and gethrtime() 303 * are inefficient. 
We use gethrestime() which returns a timespec_t with 304 * sec and nsec, the resolution is machine dependent. 305 * The following 2 macros have been changed to use nsec instead of usec. 306 */ 307 /* 308 * Macros to compute elapsed time efficiently. 309 * Borrowed from Van Jacobson's scheduling code. 310 * Delta should be a hrtime_t. 311 */ 312 #define TV_DELTA(a, b, delta) { \ 313 int xxs; \ 314 \ 315 delta = (a).tv_nsec - (b).tv_nsec; \ 316 if ((xxs = (a).tv_sec - (b).tv_sec) != 0) { \ 317 switch (xxs) { \ 318 case 2: \ 319 delta += 1000000000; \ 320 /*FALLTHROUGH*/ \ 321 case 1: \ 322 delta += 1000000000; \ 323 break; \ 324 default: \ 325 delta += (1000000000 * xxs); \ 326 } \ 327 } \ 328 } 329 330 #define TV_LT(a, b) (((a).tv_nsec < (b).tv_nsec && \ 331 (a).tv_sec <= (b).tv_sec) || (a).tv_sec < (b).tv_sec) 332 333 /* 334 * Handle MRT setsockopt commands to modify the multicast routing tables. 335 */ 336 int 337 ip_mrouter_set(int cmd, conn_t *connp, int checkonly, uchar_t *data, 338 int datalen) 339 { 340 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 341 342 mutex_enter(&ipst->ips_ip_g_mrouter_mutex); 343 if (cmd != MRT_INIT && connp != ipst->ips_ip_g_mrouter) { 344 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 345 return (EACCES); 346 } 347 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 348 349 if (checkonly) { 350 /* 351 * do not do operation, just pretend to - new T_CHECK 352 * Note: Even routines further on can probably fail but 353 * this T_CHECK stuff is only to please XTI so it not 354 * necessary to be perfect. 355 */ 356 switch (cmd) { 357 case MRT_INIT: 358 case MRT_DONE: 359 case MRT_ADD_VIF: 360 case MRT_DEL_VIF: 361 case MRT_ADD_MFC: 362 case MRT_DEL_MFC: 363 case MRT_ASSERT: 364 return (0); 365 default: 366 return (EOPNOTSUPP); 367 } 368 } 369 370 /* 371 * make sure no command is issued after multicast routing has been 372 * turned off. 
373 */ 374 if (cmd != MRT_INIT && cmd != MRT_DONE) { 375 if (is_mrouter_off(ipst)) 376 return (EINVAL); 377 } 378 379 switch (cmd) { 380 case MRT_INIT: return (ip_mrouter_init(connp, data, datalen, ipst)); 381 case MRT_DONE: return (ip_mrouter_done(ipst)); 382 case MRT_ADD_VIF: return (add_vif((struct vifctl *)data, connp, ipst)); 383 case MRT_DEL_VIF: return (del_vif((vifi_t *)data, ipst)); 384 case MRT_ADD_MFC: return (add_mfc((struct mfcctl *)data, ipst)); 385 case MRT_DEL_MFC: return (del_mfc((struct mfcctl *)data, ipst)); 386 case MRT_ASSERT: return (set_assert((int *)data, ipst)); 387 default: return (EOPNOTSUPP); 388 } 389 } 390 391 /* 392 * Handle MRT getsockopt commands 393 */ 394 int 395 ip_mrouter_get(int cmd, conn_t *connp, uchar_t *data) 396 { 397 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 398 399 if (connp != ipst->ips_ip_g_mrouter) 400 return (EACCES); 401 402 switch (cmd) { 403 case MRT_VERSION: return (get_version((uchar_t *)data)); 404 case MRT_ASSERT: return (get_assert((uchar_t *)data, ipst)); 405 default: return (EOPNOTSUPP); 406 } 407 } 408 409 /* 410 * Handle ioctl commands to obtain information from the cache. 411 * Called with shared access to IP. These are read_only ioctls. 
412 */ 413 /* ARGSUSED */ 414 int 415 mrt_ioctl(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 416 ip_ioctl_cmd_t *ipip, void *if_req) 417 { 418 mblk_t *mp1; 419 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 420 conn_t *connp = Q_TO_CONN(q); 421 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 422 423 /* Existence verified in ip_wput_nondata */ 424 mp1 = mp->b_cont->b_cont; 425 426 switch (iocp->ioc_cmd) { 427 case (SIOCGETVIFCNT): 428 return (get_vif_cnt((struct sioc_vif_req *)mp1->b_rptr, ipst)); 429 case (SIOCGETSGCNT): 430 return (get_sg_cnt((struct sioc_sg_req *)mp1->b_rptr, ipst)); 431 case (SIOCGETLSGCNT): 432 return (get_lsg_cnt((struct sioc_lsg_req *)mp1->b_rptr, ipst)); 433 default: 434 return (EINVAL); 435 } 436 } 437 438 /* 439 * Returns the packet, byte, rpf-failure count for the source, group provided. 440 */ 441 static int 442 get_sg_cnt(struct sioc_sg_req *req, ip_stack_t *ipst) 443 { 444 struct mfc *rt; 445 struct mfcb *mfcbp; 446 447 mfcbp = &ipst->ips_mfcs[MFCHASH(req->src.s_addr, req->grp.s_addr)]; 448 MFCB_REFHOLD(mfcbp); 449 MFCFIND(mfcbp, req->src.s_addr, req->grp.s_addr, rt); 450 451 if (rt != NULL) { 452 mutex_enter(&rt->mfc_mutex); 453 req->pktcnt = rt->mfc_pkt_cnt; 454 req->bytecnt = rt->mfc_byte_cnt; 455 req->wrong_if = rt->mfc_wrong_if; 456 mutex_exit(&rt->mfc_mutex); 457 } else 458 req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffffU; 459 460 MFCB_REFRELE(mfcbp); 461 return (0); 462 } 463 464 /* 465 * Returns the packet, byte, rpf-failure count for the source, group provided. 466 * Uses larger counters and IPv6 addresses. 467 */ 468 /* ARGSUSED XXX until implemented */ 469 static int 470 get_lsg_cnt(struct sioc_lsg_req *req, ip_stack_t *ipst) 471 { 472 /* XXX TODO SIOCGETLSGCNT */ 473 return (ENXIO); 474 } 475 476 /* 477 * Returns the input and output packet and byte counts on the vif provided. 
478 */ 479 static int 480 get_vif_cnt(struct sioc_vif_req *req, ip_stack_t *ipst) 481 { 482 vifi_t vifi = req->vifi; 483 484 if (vifi >= ipst->ips_numvifs) 485 return (EINVAL); 486 487 /* 488 * No locks here, an approximation is fine. 489 */ 490 req->icount = ipst->ips_vifs[vifi].v_pkt_in; 491 req->ocount = ipst->ips_vifs[vifi].v_pkt_out; 492 req->ibytes = ipst->ips_vifs[vifi].v_bytes_in; 493 req->obytes = ipst->ips_vifs[vifi].v_bytes_out; 494 495 return (0); 496 } 497 498 static int 499 get_version(uchar_t *data) 500 { 501 int *v = (int *)data; 502 503 *v = 0x0305; /* XXX !!!! */ 504 505 return (0); 506 } 507 508 /* 509 * Set PIM assert processing global. 510 */ 511 static int 512 set_assert(int *i, ip_stack_t *ipst) 513 { 514 if ((*i != 1) && (*i != 0)) 515 return (EINVAL); 516 517 ipst->ips_pim_assert = *i; 518 519 return (0); 520 } 521 522 /* 523 * Get PIM assert processing global. 524 */ 525 static int 526 get_assert(uchar_t *data, ip_stack_t *ipst) 527 { 528 int *i = (int *)data; 529 530 *i = ipst->ips_pim_assert; 531 532 return (0); 533 } 534 535 /* 536 * Enable multicast routing. 537 */ 538 static int 539 ip_mrouter_init(conn_t *connp, uchar_t *data, int datalen, ip_stack_t *ipst) 540 { 541 int *v; 542 543 if (data == NULL || (datalen != sizeof (int))) 544 return (ENOPROTOOPT); 545 546 v = (int *)data; 547 if (*v != 1) 548 return (ENOPROTOOPT); 549 550 mutex_enter(&ipst->ips_ip_g_mrouter_mutex); 551 if (ipst->ips_ip_g_mrouter != NULL) { 552 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 553 return (EADDRINUSE); 554 } 555 556 /* 557 * MRT_INIT should only be allowed for RAW sockets, but we double 558 * check. 
559 */ 560 if (!IPCL_IS_RAWIP(connp)) { 561 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 562 return (EINVAL); 563 } 564 565 ipst->ips_ip_g_mrouter = connp; 566 connp->conn_multi_router = 1; 567 /* In order for tunnels to work we have to turn ip_g_forward on */ 568 if (!WE_ARE_FORWARDING(ipst)) { 569 if (ipst->ips_ip_mrtdebug > 1) { 570 (void) mi_strlog(connp->conn_rq, 1, SL_TRACE, 571 "ip_mrouter_init: turning on forwarding"); 572 } 573 ipst->ips_saved_ip_forwarding = ipst->ips_ip_forwarding; 574 ipst->ips_ip_forwarding = IP_FORWARD_ALWAYS; 575 } 576 577 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 578 return (0); 579 } 580 581 void 582 ip_mrouter_stack_init(ip_stack_t *ipst) 583 { 584 mutex_init(&ipst->ips_ip_g_mrouter_mutex, NULL, MUTEX_DEFAULT, NULL); 585 586 ipst->ips_vifs = kmem_zalloc(sizeof (struct vif) * (MAXVIFS+1), 587 KM_SLEEP); 588 ipst->ips_mrtstat = kmem_zalloc(sizeof (struct mrtstat), KM_SLEEP); 589 /* 590 * mfctable: 591 * Includes all mfcs, including waiting upcalls. 592 * Multiple mfcs per bucket. 593 */ 594 ipst->ips_mfcs = kmem_zalloc(sizeof (struct mfcb) * MFCTBLSIZ, 595 KM_SLEEP); 596 /* 597 * Define the token bucket filter structures. 598 * tbftable -> each vif has one of these for storing info. 599 */ 600 ipst->ips_tbfs = kmem_zalloc(sizeof (struct tbf) * MAXVIFS, KM_SLEEP); 601 602 mutex_init(&ipst->ips_last_encap_lock, NULL, MUTEX_DEFAULT, NULL); 603 604 ipst->ips_mrtstat->mrts_vifctlSize = sizeof (struct vifctl); 605 ipst->ips_mrtstat->mrts_mfcctlSize = sizeof (struct mfcctl); 606 } 607 608 /* 609 * Disable multicast routing. 610 * Didn't use global timeout_val (BSD version), instead check the mfctable. 
611 */ 612 int 613 ip_mrouter_done(ip_stack_t *ipst) 614 { 615 conn_t *mrouter; 616 vifi_t vifi; 617 struct mfc *mfc_rt; 618 int i; 619 620 mutex_enter(&ipst->ips_ip_g_mrouter_mutex); 621 if (ipst->ips_ip_g_mrouter == NULL) { 622 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 623 return (EINVAL); 624 } 625 626 mrouter = ipst->ips_ip_g_mrouter; 627 628 if (ipst->ips_saved_ip_forwarding != -1) { 629 if (ipst->ips_ip_mrtdebug > 1) { 630 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 631 "ip_mrouter_done: turning off forwarding"); 632 } 633 ipst->ips_ip_forwarding = ipst->ips_saved_ip_forwarding; 634 ipst->ips_saved_ip_forwarding = -1; 635 } 636 637 /* 638 * Always clear cache when vifs change. 639 * No need to get ipst->ips_last_encap_lock since we are running as 640 * a writer. 641 */ 642 mutex_enter(&ipst->ips_last_encap_lock); 643 ipst->ips_last_encap_src = 0; 644 ipst->ips_last_encap_vif = NULL; 645 mutex_exit(&ipst->ips_last_encap_lock); 646 mrouter->conn_multi_router = 0; 647 648 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 649 650 /* 651 * For each phyint in use, 652 * disable promiscuous reception of all IP multicasts. 653 */ 654 for (vifi = 0; vifi < MAXVIFS; vifi++) { 655 struct vif *vifp = ipst->ips_vifs + vifi; 656 657 mutex_enter(&vifp->v_lock); 658 /* 659 * if the vif is active mark it condemned. 
660 */ 661 if (vifp->v_marks & VIF_MARK_GOOD) { 662 ASSERT(vifp->v_ipif != NULL); 663 ipif_refhold(vifp->v_ipif); 664 /* Phyint only */ 665 if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) { 666 ipif_t *ipif = vifp->v_ipif; 667 ilm_t *ilm = vifp->v_ilm; 668 669 vifp->v_ilm = NULL; 670 vifp->v_marks &= ~VIF_MARK_GOOD; 671 vifp->v_marks |= VIF_MARK_CONDEMNED; 672 673 mutex_exit(&(vifp)->v_lock); 674 if (ilm != NULL) { 675 ill_t *ill = ipif->ipif_ill; 676 677 (void) ip_delmulti(ilm); 678 ASSERT(ill->ill_mrouter_cnt > 0); 679 atomic_dec_32(&ill->ill_mrouter_cnt); 680 } 681 mutex_enter(&vifp->v_lock); 682 } 683 ipif_refrele(vifp->v_ipif); 684 /* 685 * decreases the refcnt added in add_vif. 686 * and release v_lock. 687 */ 688 VIF_REFRELE_LOCKED(vifp); 689 } else { 690 mutex_exit(&vifp->v_lock); 691 continue; 692 } 693 } 694 695 mutex_enter(&ipst->ips_numvifs_mutex); 696 ipst->ips_numvifs = 0; 697 ipst->ips_pim_assert = 0; 698 ipst->ips_reg_vif_num = ALL_VIFS; 699 mutex_exit(&ipst->ips_numvifs_mutex); 700 701 /* 702 * Free upcall msgs. 703 * Go through mfctable and stop any outstanding upcall 704 * timeouts remaining on mfcs. 705 */ 706 for (i = 0; i < MFCTBLSIZ; i++) { 707 mutex_enter(&ipst->ips_mfcs[i].mfcb_lock); 708 ipst->ips_mfcs[i].mfcb_refcnt++; 709 ipst->ips_mfcs[i].mfcb_marks |= MFCB_MARK_CONDEMNED; 710 mutex_exit(&ipst->ips_mfcs[i].mfcb_lock); 711 mfc_rt = ipst->ips_mfcs[i].mfcb_mfc; 712 while (mfc_rt) { 713 /* Free upcalls */ 714 mutex_enter(&mfc_rt->mfc_mutex); 715 if (mfc_rt->mfc_rte != NULL) { 716 if (mfc_rt->mfc_timeout_id != 0) { 717 /* 718 * OK to drop the lock as we have 719 * a refcnt on the bucket. timeout 720 * can fire but it will see that 721 * mfc_timeout_id == 0 and not do 722 * anything. see expire_upcalls(). 
723 */ 724 mfc_rt->mfc_timeout_id = 0; 725 mutex_exit(&mfc_rt->mfc_mutex); 726 (void) untimeout( 727 mfc_rt->mfc_timeout_id); 728 mfc_rt->mfc_timeout_id = 0; 729 mutex_enter(&mfc_rt->mfc_mutex); 730 731 /* 732 * all queued upcall packets 733 * and mblk will be freed in 734 * release_mfc(). 735 */ 736 } 737 } 738 739 mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED; 740 741 mutex_exit(&mfc_rt->mfc_mutex); 742 mfc_rt = mfc_rt->mfc_next; 743 } 744 MFCB_REFRELE(&ipst->ips_mfcs[i]); 745 } 746 747 mutex_enter(&ipst->ips_ip_g_mrouter_mutex); 748 ipst->ips_ip_g_mrouter = NULL; 749 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 750 return (0); 751 } 752 753 void 754 ip_mrouter_stack_destroy(ip_stack_t *ipst) 755 { 756 struct mfcb *mfcbp; 757 struct mfc *rt; 758 int i; 759 760 for (i = 0; i < MFCTBLSIZ; i++) { 761 mfcbp = &ipst->ips_mfcs[i]; 762 763 while ((rt = mfcbp->mfcb_mfc) != NULL) { 764 (void) printf("ip_mrouter_stack_destroy: free for %d\n", 765 i); 766 767 mfcbp->mfcb_mfc = rt->mfc_next; 768 free_queue(rt); 769 mi_free(rt); 770 } 771 } 772 kmem_free(ipst->ips_vifs, sizeof (struct vif) * (MAXVIFS+1)); 773 ipst->ips_vifs = NULL; 774 kmem_free(ipst->ips_mrtstat, sizeof (struct mrtstat)); 775 ipst->ips_mrtstat = NULL; 776 kmem_free(ipst->ips_mfcs, sizeof (struct mfcb) * MFCTBLSIZ); 777 ipst->ips_mfcs = NULL; 778 kmem_free(ipst->ips_tbfs, sizeof (struct tbf) * MAXVIFS); 779 ipst->ips_tbfs = NULL; 780 781 mutex_destroy(&ipst->ips_last_encap_lock); 782 mutex_destroy(&ipst->ips_ip_g_mrouter_mutex); 783 } 784 785 static boolean_t 786 is_mrouter_off(ip_stack_t *ipst) 787 { 788 conn_t *mrouter; 789 790 mutex_enter(&ipst->ips_ip_g_mrouter_mutex); 791 if (ipst->ips_ip_g_mrouter == NULL) { 792 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 793 return (B_TRUE); 794 } 795 796 mrouter = ipst->ips_ip_g_mrouter; 797 if (mrouter->conn_multi_router == 0) { 798 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 799 return (B_TRUE); 800 } 801 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 802 return (B_FALSE); 803 } 
804 805 static void 806 unlock_good_vif(struct vif *vifp) 807 { 808 ASSERT(vifp->v_ipif != NULL); 809 ipif_refrele(vifp->v_ipif); 810 VIF_REFRELE(vifp); 811 } 812 813 static boolean_t 814 lock_good_vif(struct vif *vifp) 815 { 816 mutex_enter(&vifp->v_lock); 817 if (!(vifp->v_marks & VIF_MARK_GOOD)) { 818 mutex_exit(&vifp->v_lock); 819 return (B_FALSE); 820 } 821 822 ASSERT(vifp->v_ipif != NULL); 823 mutex_enter(&vifp->v_ipif->ipif_ill->ill_lock); 824 if (!IPIF_CAN_LOOKUP(vifp->v_ipif)) { 825 mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock); 826 mutex_exit(&vifp->v_lock); 827 return (B_FALSE); 828 } 829 ipif_refhold_locked(vifp->v_ipif); 830 mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock); 831 vifp->v_refcnt++; 832 mutex_exit(&vifp->v_lock); 833 return (B_TRUE); 834 } 835 836 /* 837 * Add a vif to the vif table. 838 */ 839 static int 840 add_vif(struct vifctl *vifcp, conn_t *connp, ip_stack_t *ipst) 841 { 842 struct vif *vifp = ipst->ips_vifs + vifcp->vifc_vifi; 843 ipif_t *ipif; 844 int error = 0; 845 struct tbf *v_tbf = ipst->ips_tbfs + vifcp->vifc_vifi; 846 conn_t *mrouter = ipst->ips_ip_g_mrouter; 847 ilm_t *ilm; 848 ill_t *ill; 849 850 ASSERT(connp != NULL); 851 852 if (vifcp->vifc_vifi >= MAXVIFS) 853 return (EINVAL); 854 855 if (is_mrouter_off(ipst)) 856 return (EINVAL); 857 858 mutex_enter(&vifp->v_lock); 859 /* 860 * Viftable entry should be 0. 861 * if v_marks == 0 but v_refcnt != 0 means struct is being 862 * initialized. 863 * 864 * Also note that it is very unlikely that we will get a MRT_ADD_VIF 865 * request while the delete is in progress, mrouted only sends add 866 * requests when a new interface is added and the new interface cannot 867 * have the same vifi as an existing interface. We make sure that 868 * ill_delete will block till the vif is deleted by adding a refcnt 869 * to ipif in del_vif(). 
870 */ 871 if (vifp->v_lcl_addr.s_addr != 0 || 872 vifp->v_marks != 0 || 873 vifp->v_refcnt != 0) { 874 mutex_exit(&vifp->v_lock); 875 return (EADDRINUSE); 876 } 877 878 /* Incoming vif should not be 0 */ 879 if (vifcp->vifc_lcl_addr.s_addr == 0) { 880 mutex_exit(&vifp->v_lock); 881 return (EINVAL); 882 } 883 884 vifp->v_refcnt++; 885 mutex_exit(&vifp->v_lock); 886 /* Find the interface with the local address */ 887 ipif = ipif_lookup_addr((ipaddr_t)vifcp->vifc_lcl_addr.s_addr, NULL, 888 IPCL_ZONEID(connp), ipst); 889 if (ipif == NULL) { 890 VIF_REFRELE(vifp); 891 return (EADDRNOTAVAIL); 892 } 893 894 if (ipst->ips_ip_mrtdebug > 1) { 895 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 896 "add_vif: src 0x%x enter", 897 vifcp->vifc_lcl_addr.s_addr); 898 } 899 900 mutex_enter(&vifp->v_lock); 901 /* 902 * Always clear cache when vifs change. 903 * Needed to ensure that src isn't left over from before vif was added. 904 * No need to get last_encap_lock, since we are running as a writer. 905 */ 906 907 mutex_enter(&ipst->ips_last_encap_lock); 908 ipst->ips_last_encap_src = 0; 909 ipst->ips_last_encap_vif = NULL; 910 mutex_exit(&ipst->ips_last_encap_lock); 911 912 if (vifcp->vifc_flags & VIFF_TUNNEL) { 913 if ((vifcp->vifc_flags & VIFF_SRCRT) != 0) { 914 cmn_err(CE_WARN, 915 "add_vif: source route tunnels not supported\n"); 916 VIF_REFRELE_LOCKED(vifp); 917 ipif_refrele(ipif); 918 return (EOPNOTSUPP); 919 } 920 vifp->v_rmt_addr = vifcp->vifc_rmt_addr; 921 922 } else { 923 /* Phyint or Register vif */ 924 if (vifcp->vifc_flags & VIFF_REGISTER) { 925 /* 926 * Note: Since all IPPROTO_IP level options (including 927 * MRT_ADD_VIF) are done exclusively via 928 * ip_optmgmt_writer(), a lock is not necessary to 929 * protect reg_vif_num. 
930 */ 931 mutex_enter(&ipst->ips_numvifs_mutex); 932 if (ipst->ips_reg_vif_num == ALL_VIFS) { 933 ipst->ips_reg_vif_num = vifcp->vifc_vifi; 934 mutex_exit(&ipst->ips_numvifs_mutex); 935 } else { 936 mutex_exit(&ipst->ips_numvifs_mutex); 937 VIF_REFRELE_LOCKED(vifp); 938 ipif_refrele(ipif); 939 return (EADDRINUSE); 940 } 941 } 942 943 /* Make sure the interface supports multicast */ 944 if ((ipif->ipif_ill->ill_flags & ILLF_MULTICAST) == 0) { 945 VIF_REFRELE_LOCKED(vifp); 946 ipif_refrele(ipif); 947 if (vifcp->vifc_flags & VIFF_REGISTER) { 948 mutex_enter(&ipst->ips_numvifs_mutex); 949 ipst->ips_reg_vif_num = ALL_VIFS; 950 mutex_exit(&ipst->ips_numvifs_mutex); 951 } 952 return (EOPNOTSUPP); 953 } 954 /* Enable promiscuous reception of all IP mcasts from the if */ 955 mutex_exit(&vifp->v_lock); 956 957 ill = ipif->ipif_ill; 958 if (IS_UNDER_IPMP(ill)) 959 ill = ipmp_ill_hold_ipmp_ill(ill); 960 961 if (ill == NULL) { 962 ilm = NULL; 963 } else { 964 ilm = ip_addmulti(&ipv6_all_zeros, ill, 965 ipif->ipif_zoneid, &error); 966 if (ilm != NULL) 967 atomic_inc_32(&ill->ill_mrouter_cnt); 968 if (IS_UNDER_IPMP(ipif->ipif_ill)) { 969 ill_refrele(ill); 970 ill = ipif->ipif_ill; 971 } 972 } 973 974 mutex_enter(&vifp->v_lock); 975 /* 976 * since we released the lock lets make sure that 977 * ip_mrouter_done() has not been called. 
978 */ 979 if (ilm == NULL || is_mrouter_off(ipst)) { 980 if (ilm != NULL) { 981 (void) ip_delmulti(ilm); 982 ASSERT(ill->ill_mrouter_cnt > 0); 983 atomic_dec_32(&ill->ill_mrouter_cnt); 984 } 985 if (vifcp->vifc_flags & VIFF_REGISTER) { 986 mutex_enter(&ipst->ips_numvifs_mutex); 987 ipst->ips_reg_vif_num = ALL_VIFS; 988 mutex_exit(&ipst->ips_numvifs_mutex); 989 } 990 VIF_REFRELE_LOCKED(vifp); 991 ipif_refrele(ipif); 992 return (error?error:EINVAL); 993 } 994 vifp->v_ilm = ilm; 995 } 996 /* Define parameters for the tbf structure */ 997 vifp->v_tbf = v_tbf; 998 gethrestime(&vifp->v_tbf->tbf_last_pkt_t); 999 vifp->v_tbf->tbf_n_tok = 0; 1000 vifp->v_tbf->tbf_q_len = 0; 1001 vifp->v_tbf->tbf_max_q_len = MAXQSIZE; 1002 vifp->v_tbf->tbf_q = vifp->v_tbf->tbf_t = NULL; 1003 1004 vifp->v_flags = vifcp->vifc_flags; 1005 vifp->v_threshold = vifcp->vifc_threshold; 1006 vifp->v_lcl_addr = vifcp->vifc_lcl_addr; 1007 vifp->v_ipif = ipif; 1008 ipif_refrele(ipif); 1009 /* Scaling up here, allows division by 1024 in critical code. */ 1010 vifp->v_rate_limit = vifcp->vifc_rate_limit * (1024/1000); 1011 vifp->v_timeout_id = 0; 1012 /* initialize per vif pkt counters */ 1013 vifp->v_pkt_in = 0; 1014 vifp->v_pkt_out = 0; 1015 vifp->v_bytes_in = 0; 1016 vifp->v_bytes_out = 0; 1017 mutex_init(&vifp->v_tbf->tbf_lock, NULL, MUTEX_DEFAULT, NULL); 1018 1019 /* Adjust numvifs up, if the vifi is higher than numvifs */ 1020 mutex_enter(&ipst->ips_numvifs_mutex); 1021 if (ipst->ips_numvifs <= vifcp->vifc_vifi) 1022 ipst->ips_numvifs = vifcp->vifc_vifi + 1; 1023 mutex_exit(&ipst->ips_numvifs_mutex); 1024 1025 if (ipst->ips_ip_mrtdebug > 1) { 1026 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 1027 "add_vif: #%d, lcladdr %x, %s %x, thresh %x, rate %d", 1028 vifcp->vifc_vifi, 1029 ntohl(vifcp->vifc_lcl_addr.s_addr), 1030 (vifcp->vifc_flags & VIFF_TUNNEL) ? 
"rmtaddr" : "mask", 1031 ntohl(vifcp->vifc_rmt_addr.s_addr), 1032 vifcp->vifc_threshold, vifcp->vifc_rate_limit); 1033 } 1034 1035 vifp->v_marks = VIF_MARK_GOOD; 1036 mutex_exit(&vifp->v_lock); 1037 return (0); 1038 } 1039 1040 1041 /* Delete a vif from the vif table. */ 1042 static void 1043 del_vifp(struct vif *vifp) 1044 { 1045 struct tbf *t = vifp->v_tbf; 1046 mblk_t *mp0; 1047 vifi_t vifi; 1048 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 1049 conn_t *mrouter = ipst->ips_ip_g_mrouter; 1050 1051 ASSERT(vifp->v_marks & VIF_MARK_CONDEMNED); 1052 ASSERT(t != NULL); 1053 1054 if (ipst->ips_ip_mrtdebug > 1) { 1055 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 1056 "del_vif: src 0x%x\n", vifp->v_lcl_addr.s_addr); 1057 } 1058 1059 if (vifp->v_timeout_id != 0) { 1060 (void) untimeout(vifp->v_timeout_id); 1061 vifp->v_timeout_id = 0; 1062 } 1063 1064 /* 1065 * Free packets queued at the interface. 1066 * Mrouted takes care of cleaning up mfcs - makes calls to del_mfc. 1067 */ 1068 mutex_enter(&t->tbf_lock); 1069 while (t->tbf_q != NULL) { 1070 mp0 = t->tbf_q; 1071 t->tbf_q = t->tbf_q->b_next; 1072 mp0->b_prev = mp0->b_next = NULL; 1073 freemsg(mp0); 1074 } 1075 mutex_exit(&t->tbf_lock); 1076 1077 /* 1078 * Always clear cache when vifs change. 1079 * No need to get last_encap_lock since we are running as a writer. 
1080 */ 1081 mutex_enter(&ipst->ips_last_encap_lock); 1082 if (vifp == ipst->ips_last_encap_vif) { 1083 ipst->ips_last_encap_vif = NULL; 1084 ipst->ips_last_encap_src = 0; 1085 } 1086 mutex_exit(&ipst->ips_last_encap_lock); 1087 1088 mutex_destroy(&t->tbf_lock); 1089 1090 bzero(vifp->v_tbf, sizeof (*(vifp->v_tbf))); 1091 1092 /* Adjust numvifs down */ 1093 mutex_enter(&ipst->ips_numvifs_mutex); 1094 for (vifi = ipst->ips_numvifs; vifi != 0; vifi--) /* vifi is unsigned */ 1095 if (ipst->ips_vifs[vifi - 1].v_lcl_addr.s_addr != 0) 1096 break; 1097 ipst->ips_numvifs = vifi; 1098 mutex_exit(&ipst->ips_numvifs_mutex); 1099 1100 bzero(vifp, sizeof (*vifp)); 1101 } 1102 1103 static int 1104 del_vif(vifi_t *vifip, ip_stack_t *ipst) 1105 { 1106 struct vif *vifp = ipst->ips_vifs + *vifip; 1107 1108 if (*vifip >= ipst->ips_numvifs) 1109 return (EINVAL); 1110 1111 mutex_enter(&vifp->v_lock); 1112 /* 1113 * Not initialized 1114 * Here we are not looking at the vif that is being initialized 1115 * i.e vifp->v_marks == 0 and refcnt > 0. 1116 */ 1117 if (vifp->v_lcl_addr.s_addr == 0 || 1118 !(vifp->v_marks & VIF_MARK_GOOD)) { 1119 mutex_exit(&vifp->v_lock); 1120 return (EADDRNOTAVAIL); 1121 } 1122 1123 /* Clear VIF_MARK_GOOD and set VIF_MARK_CONDEMNED. */ 1124 vifp->v_marks &= ~VIF_MARK_GOOD; 1125 vifp->v_marks |= VIF_MARK_CONDEMNED; 1126 1127 /* Phyint only */ 1128 if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) { 1129 ipif_t *ipif = vifp->v_ipif; 1130 ilm_t *ilm = vifp->v_ilm; 1131 1132 vifp->v_ilm = NULL; 1133 1134 ASSERT(ipif != NULL); 1135 /* 1136 * should be OK to drop the lock as we 1137 * have marked this as CONDEMNED. 
1138 */ 1139 mutex_exit(&(vifp)->v_lock); 1140 if (ilm != NULL) { 1141 (void) ip_delmulti(ilm); 1142 ASSERT(ipif->ipif_ill->ill_mrouter_cnt > 0); 1143 atomic_dec_32(&ipif->ipif_ill->ill_mrouter_cnt); 1144 } 1145 mutex_enter(&(vifp)->v_lock); 1146 } 1147 1148 if (vifp->v_flags & VIFF_REGISTER) { 1149 mutex_enter(&ipst->ips_numvifs_mutex); 1150 ipst->ips_reg_vif_num = ALL_VIFS; 1151 mutex_exit(&ipst->ips_numvifs_mutex); 1152 } 1153 1154 /* 1155 * decreases the refcnt added in add_vif. 1156 */ 1157 VIF_REFRELE_LOCKED(vifp); 1158 return (0); 1159 } 1160 1161 /* 1162 * Add an mfc entry. 1163 */ 1164 static int 1165 add_mfc(struct mfcctl *mfccp, ip_stack_t *ipst) 1166 { 1167 struct mfc *rt; 1168 struct rtdetq *rte; 1169 ushort_t nstl; 1170 int i; 1171 struct mfcb *mfcbp; 1172 conn_t *mrouter = ipst->ips_ip_g_mrouter; 1173 1174 /* 1175 * The value of vifi is NO_VIF (==MAXVIFS) if Mrouted 1176 * did not have a real route for pkt. 1177 * We want this pkt without rt installed in the mfctable to prevent 1178 * multiiple tries, so go ahead and put it in mfctable, it will 1179 * be discarded later in ip_mdq() because the child is NULL. 1180 */ 1181 1182 /* Error checking, out of bounds? 
*/ 1183 if (mfccp->mfcc_parent > MAXVIFS) { 1184 ip0dbg(("ADD_MFC: mfcc_parent out of range %d", 1185 (int)mfccp->mfcc_parent)); 1186 return (EINVAL); 1187 } 1188 1189 if ((mfccp->mfcc_parent != NO_VIF) && 1190 (ipst->ips_vifs[mfccp->mfcc_parent].v_ipif == NULL)) { 1191 ip0dbg(("ADD_MFC: NULL ipif for parent vif %d\n", 1192 (int)mfccp->mfcc_parent)); 1193 return (EINVAL); 1194 } 1195 1196 if (is_mrouter_off(ipst)) { 1197 return (EINVAL); 1198 } 1199 1200 mfcbp = &ipst->ips_mfcs[MFCHASH(mfccp->mfcc_origin.s_addr, 1201 mfccp->mfcc_mcastgrp.s_addr)]; 1202 MFCB_REFHOLD(mfcbp); 1203 MFCFIND(mfcbp, mfccp->mfcc_origin.s_addr, 1204 mfccp->mfcc_mcastgrp.s_addr, rt); 1205 1206 /* If an entry already exists, just update the fields */ 1207 if (rt) { 1208 if (ipst->ips_ip_mrtdebug > 1) { 1209 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 1210 "add_mfc: update o %x grp %x parent %x", 1211 ntohl(mfccp->mfcc_origin.s_addr), 1212 ntohl(mfccp->mfcc_mcastgrp.s_addr), 1213 mfccp->mfcc_parent); 1214 } 1215 mutex_enter(&rt->mfc_mutex); 1216 rt->mfc_parent = mfccp->mfcc_parent; 1217 1218 mutex_enter(&ipst->ips_numvifs_mutex); 1219 for (i = 0; i < (int)ipst->ips_numvifs; i++) 1220 rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; 1221 mutex_exit(&ipst->ips_numvifs_mutex); 1222 mutex_exit(&rt->mfc_mutex); 1223 1224 MFCB_REFRELE(mfcbp); 1225 return (0); 1226 } 1227 1228 /* 1229 * Find the entry for which the upcall was made and update. 
1230 */ 1231 for (rt = mfcbp->mfcb_mfc, nstl = 0; rt; rt = rt->mfc_next) { 1232 mutex_enter(&rt->mfc_mutex); 1233 if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) && 1234 (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr) && 1235 (rt->mfc_rte != NULL) && 1236 !(rt->mfc_marks & MFCB_MARK_CONDEMNED)) { 1237 if (nstl++ != 0) 1238 cmn_err(CE_WARN, 1239 "add_mfc: %s o %x g %x p %x", 1240 "multiple kernel entries", 1241 ntohl(mfccp->mfcc_origin.s_addr), 1242 ntohl(mfccp->mfcc_mcastgrp.s_addr), 1243 mfccp->mfcc_parent); 1244 1245 if (ipst->ips_ip_mrtdebug > 1) { 1246 (void) mi_strlog(mrouter->conn_rq, 1, 1247 SL_TRACE, 1248 "add_mfc: o %x g %x p %x", 1249 ntohl(mfccp->mfcc_origin.s_addr), 1250 ntohl(mfccp->mfcc_mcastgrp.s_addr), 1251 mfccp->mfcc_parent); 1252 } 1253 fill_route(rt, mfccp, ipst); 1254 1255 /* 1256 * Prevent cleanup of cache entry. 1257 * Timer starts in ip_mforward. 1258 */ 1259 if (rt->mfc_timeout_id != 0) { 1260 timeout_id_t id; 1261 id = rt->mfc_timeout_id; 1262 /* 1263 * setting id to zero will avoid this 1264 * entry from being cleaned up in 1265 * expire_up_calls(). 1266 */ 1267 rt->mfc_timeout_id = 0; 1268 /* 1269 * dropping the lock is fine as we 1270 * have a refhold on the bucket. 1271 * so mfc cannot be freed. 1272 * The timeout can fire but it will see 1273 * that mfc_timeout_id == 0 and not cleanup. 1274 */ 1275 mutex_exit(&rt->mfc_mutex); 1276 (void) untimeout(id); 1277 mutex_enter(&rt->mfc_mutex); 1278 } 1279 1280 /* 1281 * Send all pkts that are queued waiting for the upcall. 1282 * ip_mdq param tun set to 0 - 1283 * the return value of ip_mdq() isn't used here, 1284 * so value we send doesn't matter. 
1285 */ 1286 while (rt->mfc_rte != NULL) { 1287 rte = rt->mfc_rte; 1288 rt->mfc_rte = rte->rte_next; 1289 mutex_exit(&rt->mfc_mutex); 1290 (void) ip_mdq(rte->mp, (ipha_t *) 1291 rte->mp->b_rptr, rte->ill, 0, rt); 1292 freemsg(rte->mp); 1293 mi_free((char *)rte); 1294 mutex_enter(&rt->mfc_mutex); 1295 } 1296 } 1297 mutex_exit(&rt->mfc_mutex); 1298 } 1299 1300 1301 /* 1302 * It is possible that an entry is being inserted without an upcall 1303 */ 1304 if (nstl == 0) { 1305 mutex_enter(&(mfcbp->mfcb_lock)); 1306 if (ipst->ips_ip_mrtdebug > 1) { 1307 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 1308 "add_mfc: no upcall o %x g %x p %x", 1309 ntohl(mfccp->mfcc_origin.s_addr), 1310 ntohl(mfccp->mfcc_mcastgrp.s_addr), 1311 mfccp->mfcc_parent); 1312 } 1313 if (is_mrouter_off(ipst)) { 1314 mutex_exit(&mfcbp->mfcb_lock); 1315 MFCB_REFRELE(mfcbp); 1316 return (EINVAL); 1317 } 1318 1319 for (rt = mfcbp->mfcb_mfc; rt; rt = rt->mfc_next) { 1320 1321 mutex_enter(&rt->mfc_mutex); 1322 if ((rt->mfc_origin.s_addr == 1323 mfccp->mfcc_origin.s_addr) && 1324 (rt->mfc_mcastgrp.s_addr == 1325 mfccp->mfcc_mcastgrp.s_addr) && 1326 (!(rt->mfc_marks & MFCB_MARK_CONDEMNED))) { 1327 fill_route(rt, mfccp, ipst); 1328 mutex_exit(&rt->mfc_mutex); 1329 break; 1330 } 1331 mutex_exit(&rt->mfc_mutex); 1332 } 1333 1334 /* No upcall, so make a new entry into mfctable */ 1335 if (rt == NULL) { 1336 rt = (struct mfc *)mi_zalloc(sizeof (struct mfc)); 1337 if (rt == NULL) { 1338 ip1dbg(("add_mfc: out of memory\n")); 1339 mutex_exit(&mfcbp->mfcb_lock); 1340 MFCB_REFRELE(mfcbp); 1341 return (ENOBUFS); 1342 } 1343 1344 /* Insert new entry at head of hash chain */ 1345 mutex_enter(&rt->mfc_mutex); 1346 fill_route(rt, mfccp, ipst); 1347 1348 /* Link into table */ 1349 rt->mfc_next = mfcbp->mfcb_mfc; 1350 mfcbp->mfcb_mfc = rt; 1351 mutex_exit(&rt->mfc_mutex); 1352 } 1353 mutex_exit(&mfcbp->mfcb_lock); 1354 } 1355 1356 MFCB_REFRELE(mfcbp); 1357 return (0); 1358 } 1359 1360 /* 1361 * Fills in mfc structure from 
mrouted mfcctl. 1362 */ 1363 static void 1364 fill_route(struct mfc *rt, struct mfcctl *mfccp, ip_stack_t *ipst) 1365 { 1366 int i; 1367 1368 rt->mfc_origin = mfccp->mfcc_origin; 1369 rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp; 1370 rt->mfc_parent = mfccp->mfcc_parent; 1371 mutex_enter(&ipst->ips_numvifs_mutex); 1372 for (i = 0; i < (int)ipst->ips_numvifs; i++) { 1373 rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; 1374 } 1375 mutex_exit(&ipst->ips_numvifs_mutex); 1376 /* Initialize pkt counters per src-grp */ 1377 rt->mfc_pkt_cnt = 0; 1378 rt->mfc_byte_cnt = 0; 1379 rt->mfc_wrong_if = 0; 1380 rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_nsec = 0; 1381 1382 } 1383 1384 static void 1385 free_queue(struct mfc *mfcp) 1386 { 1387 struct rtdetq *rte0; 1388 1389 /* 1390 * Drop all queued upcall packets. 1391 * Free the mbuf with the pkt. 1392 */ 1393 while ((rte0 = mfcp->mfc_rte) != NULL) { 1394 mfcp->mfc_rte = rte0->rte_next; 1395 freemsg(rte0->mp); 1396 mi_free((char *)rte0); 1397 } 1398 } 1399 /* 1400 * go thorugh the hash bucket and free all the entries marked condemned. 
1401 */ 1402 void 1403 release_mfc(struct mfcb *mfcbp) 1404 { 1405 struct mfc *current_mfcp; 1406 struct mfc *prev_mfcp; 1407 1408 prev_mfcp = current_mfcp = mfcbp->mfcb_mfc; 1409 1410 while (current_mfcp != NULL) { 1411 if (current_mfcp->mfc_marks & MFCB_MARK_CONDEMNED) { 1412 if (current_mfcp == mfcbp->mfcb_mfc) { 1413 mfcbp->mfcb_mfc = current_mfcp->mfc_next; 1414 free_queue(current_mfcp); 1415 mi_free(current_mfcp); 1416 prev_mfcp = current_mfcp = mfcbp->mfcb_mfc; 1417 continue; 1418 } 1419 ASSERT(prev_mfcp != NULL); 1420 prev_mfcp->mfc_next = current_mfcp->mfc_next; 1421 free_queue(current_mfcp); 1422 mi_free(current_mfcp); 1423 current_mfcp = NULL; 1424 } else { 1425 prev_mfcp = current_mfcp; 1426 } 1427 1428 current_mfcp = prev_mfcp->mfc_next; 1429 1430 } 1431 mfcbp->mfcb_marks &= ~MFCB_MARK_CONDEMNED; 1432 ASSERT(mfcbp->mfcb_mfc != NULL || mfcbp->mfcb_marks == 0); 1433 } 1434 1435 /* 1436 * Delete an mfc entry. 1437 */ 1438 static int 1439 del_mfc(struct mfcctl *mfccp, ip_stack_t *ipst) 1440 { 1441 struct in_addr origin; 1442 struct in_addr mcastgrp; 1443 struct mfc *rt; 1444 uint_t hash; 1445 conn_t *mrouter = ipst->ips_ip_g_mrouter; 1446 1447 origin = mfccp->mfcc_origin; 1448 mcastgrp = mfccp->mfcc_mcastgrp; 1449 hash = MFCHASH(origin.s_addr, mcastgrp.s_addr); 1450 1451 if (ipst->ips_ip_mrtdebug > 1) { 1452 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 1453 "del_mfc: o %x g %x", 1454 ntohl(origin.s_addr), 1455 ntohl(mcastgrp.s_addr)); 1456 } 1457 1458 MFCB_REFHOLD(&ipst->ips_mfcs[hash]); 1459 1460 /* Find mfc in mfctable, finds only entries without upcalls */ 1461 for (rt = ipst->ips_mfcs[hash].mfcb_mfc; rt; rt = rt->mfc_next) { 1462 mutex_enter(&rt->mfc_mutex); 1463 if (origin.s_addr == rt->mfc_origin.s_addr && 1464 mcastgrp.s_addr == rt->mfc_mcastgrp.s_addr && 1465 rt->mfc_rte == NULL && 1466 !(rt->mfc_marks & MFCB_MARK_CONDEMNED)) 1467 break; 1468 mutex_exit(&rt->mfc_mutex); 1469 } 1470 1471 /* 1472 * Return if there was an upcall (mfc_rte != NULL, 
1473 * or rt not in mfctable. 1474 */ 1475 if (rt == NULL) { 1476 MFCB_REFRELE(&ipst->ips_mfcs[hash]); 1477 return (EADDRNOTAVAIL); 1478 } 1479 1480 1481 /* 1482 * no need to hold lock as we have a reference. 1483 */ 1484 ipst->ips_mfcs[hash].mfcb_marks |= MFCB_MARK_CONDEMNED; 1485 /* error checking */ 1486 if (rt->mfc_timeout_id != 0) { 1487 ip0dbg(("del_mfc: TIMEOUT NOT 0, rte not null")); 1488 /* 1489 * Its ok to drop the lock, the struct cannot be freed 1490 * since we have a ref on the hash bucket. 1491 */ 1492 rt->mfc_timeout_id = 0; 1493 mutex_exit(&rt->mfc_mutex); 1494 (void) untimeout(rt->mfc_timeout_id); 1495 mutex_enter(&rt->mfc_mutex); 1496 } 1497 1498 ASSERT(rt->mfc_rte == NULL); 1499 1500 1501 /* 1502 * Delete the entry from the cache 1503 */ 1504 rt->mfc_marks |= MFCB_MARK_CONDEMNED; 1505 mutex_exit(&rt->mfc_mutex); 1506 1507 MFCB_REFRELE(&ipst->ips_mfcs[hash]); 1508 1509 return (0); 1510 } 1511 1512 #define TUNNEL_LEN 12 /* # bytes of IP option for tunnel encapsulation */ 1513 1514 /* 1515 * IP multicast forwarding function. This function assumes that the packet 1516 * pointed to by ipha has arrived on (or is about to be sent to) the interface 1517 * pointed to by "ill", and the packet is to be relayed to other networks 1518 * that have members of the packet's destination IP multicast group. 1519 * 1520 * The packet is returned unscathed to the caller, unless it is 1521 * erroneous, in which case a -1 value tells the caller (IP) 1522 * to discard it. 1523 * 1524 * Unlike BSD, SunOS 5.x needs to return to IP info about 1525 * whether pkt came in thru a tunnel, so it can be discarded, unless 1526 * it's IGMP. In BSD, the ifp is bogus for tunnels, so pkt won't try 1527 * to be delivered. 
 * Return values are 0 - pkt is okay and phyint
 *		    -1 - pkt is malformed and to be tossed
 *		     1 - pkt came in on tunnel
 */
int
ip_mforward(mblk_t *mp, ip_recv_attr_t *ira)
{
	ipha_t		*ipha = (ipha_t *)mp->b_rptr;
	ill_t		*ill = ira->ira_ill;
	struct mfc	*rt;
	ipaddr_t	src, dst, tunnel_src = 0;
	static int	srctun = 0;
	vifi_t		vifi;
	boolean_t	pim_reg_packet = B_FALSE;
	struct mfcb	*mfcbp;
	ip_stack_t	*ipst = ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
	ill_t		*rill = ira->ira_rill;

	ASSERT(ira->ira_pktlen == msgdsize(mp));

	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "ip_mforward: RECV ipha_src %x, ipha_dst %x, ill %s",
		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
		    ill->ill_name);
	}

	/*
	 * Classify the arrival path from the receive attributes: a PIM
	 * register packet, an encapsulating tunnel (tunnel_src != 0), or
	 * neither.
	 */
	dst = ipha->ipha_dst;
	if (ira->ira_flags & IRAF_PIM_REGISTER)
		pim_reg_packet = B_TRUE;
	else if (ira->ira_flags & IRAF_MROUTE_TUNNEL_SET)
		tunnel_src = ira->ira_mroute_tunnel;

	/*
	 * Don't forward a packet with time-to-live of zero or one,
	 * or a packet destined to a local-only group.
	 */
	if (CLASSD(dst) && (ipha->ipha_ttl <= 1 ||
	    (ipaddr_t)ntohl(dst) <= INADDR_MAX_LOCAL_GROUP)) {
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: not forwarded ttl %d,"
			    " dst 0x%x ill %s",
			    ipha->ipha_ttl, ntohl(dst), ill->ill_name);
		}
		if (tunnel_src != 0)
			return (1);
		else
			return (0);
	}

	if ((tunnel_src != 0) || pim_reg_packet) {
		/*
		 * Packet arrived over an encapsulated tunnel or via a PIM
		 * register message.
		 */
		if (ipst->ips_ip_mrtdebug > 1) {
			if (tunnel_src != 0) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: ill %s arrived via ENCAP TUN",
				    ill->ill_name);
			} else if (pim_reg_packet) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: ill %s arrived via"
				    " REGISTER VIF",
				    ill->ill_name);
			}
		}
	} else if ((ipha->ipha_version_and_hdr_length & 0xf) <
	    (uint_t)(IP_SIMPLE_HDR_LENGTH + TUNNEL_LEN) >> 2 ||
	    ((uchar_t *)(ipha + 1))[1] != IPOPT_LSRR) {
		/* Packet arrived via a physical interface. */
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: ill %s arrived via PHYINT",
			    ill->ill_name);
		}

	} else {
		/*
		 * Packet arrived through a SRCRT tunnel.
		 * Source-route tunnels are no longer supported.
		 * Error message printed every 1000 times.
		 */
		if ((srctun++ % 1000) == 0) {
			cmn_err(CE_WARN,
			    "ip_mforward: received source-routed pkt from %x",
			    ntohl(ipha->ipha_src));
		}
		return (-1);
	}

	ipst->ips_mrtstat->mrts_fwd_in++;
	src = ipha->ipha_src;

	/* Find route in cache, return NULL if not there or upcalls q'ed. */

	/*
	 * Lock the mfctable against changes made by ip_mforward.
	 * Note that only add_mfc and del_mfc can remove entries and
	 * they run with exclusive access to IP. So we do not need to
	 * guard against the rt being deleted, so release lock after reading.
	 */

	if (is_mrouter_off(ipst))
		return (-1);

	mfcbp = &ipst->ips_mfcs[MFCHASH(src, dst)];
	MFCB_REFHOLD(mfcbp);
	MFCFIND(mfcbp, src, dst, rt);

	/* Entry exists, so forward if necessary */
	if (rt != NULL) {
		int ret = 0;
		ipst->ips_mrtstat->mrts_mfc_hits++;
		if (pim_reg_packet) {
			/*
			 * Register packets are forwarded as if they had
			 * arrived on the register vif's ill.
			 */
			ASSERT(ipst->ips_reg_vif_num != ALL_VIFS);
			ret = ip_mdq(mp, ipha,
			    ipst->ips_vifs[ipst->ips_reg_vif_num].
			    v_ipif->ipif_ill,
			    0, rt);
		} else {
			ret = ip_mdq(mp, ipha, ill, tunnel_src, rt);
		}

		MFCB_REFRELE(mfcbp);
		return (ret);

		/*
		 * Don't forward if we don't have a cache entry.  Mrouted will
		 * always provide a cache entry in response to an upcall.
		 */
	} else {
		/*
		 * If we don't have a route for packet's origin, make a copy
		 * of the packet and send message to routing daemon.
		 */
		struct mfc	*mfc_rt	 = NULL;
		mblk_t		*mp0	 = NULL;
		mblk_t		*mp_copy = NULL;
		struct rtdetq	*rte	 = NULL;
		struct rtdetq	*rte_m, *rte1, *prev_rte;
		uint_t		hash;
		int		npkts;
		boolean_t	new_mfc = B_FALSE;
		ipst->ips_mrtstat->mrts_mfc_misses++;
		/* BSD uses mrts_no_route++ */
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: no rte ill %s src %x g %x misses %d",
			    ill->ill_name, ntohl(src), ntohl(dst),
			    (int)ipst->ips_mrtstat->mrts_mfc_misses);
		}
		/*
		 * The order of the following code differs from the BSD code.
		 * Pre-mc3.5, the BSD code was incorrect and SunOS 5.x
		 * code works, so SunOS 5.x wasn't changed to conform to the
		 * BSD version.
		 */

		/* Lock mfctable. */
		hash = MFCHASH(src, dst);
		mutex_enter(&(ipst->ips_mfcs[hash].mfcb_lock));

		/*
		 * If we are turning off mrouted return an error
		 */
		if (is_mrouter_off(ipst)) {
			mutex_exit(&mfcbp->mfcb_lock);
			MFCB_REFRELE(mfcbp);
			return (-1);
		}

		/*
		 * Is there an upcall waiting for this packet?
		 * On a match the loop exits with mfc_rt->mfc_mutex held.
		 */
		for (mfc_rt = ipst->ips_mfcs[hash].mfcb_mfc; mfc_rt;
		    mfc_rt = mfc_rt->mfc_next) {
			mutex_enter(&mfc_rt->mfc_mutex);
			if (ipst->ips_ip_mrtdebug > 1) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: MFCTAB hash %d o 0x%x"
				    " g 0x%x\n",
				    hash, ntohl(mfc_rt->mfc_origin.s_addr),
				    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
			}
			/* There is an upcall */
			if ((src == mfc_rt->mfc_origin.s_addr) &&
			    (dst == mfc_rt->mfc_mcastgrp.s_addr) &&
			    (mfc_rt->mfc_rte != NULL) &&
			    !(mfc_rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
				break;
			}
			mutex_exit(&mfc_rt->mfc_mutex);
		}
		/* No upcall, so make a new entry into mfctable */
		if (mfc_rt == NULL) {
			mfc_rt = (struct mfc *)mi_zalloc(sizeof (struct mfc));
			if (mfc_rt == NULL) {
				ipst->ips_mrtstat->mrts_fwd_drop++;
				ip1dbg(("ip_mforward: out of memory "
				    "for mfc, mfc_rt\n"));
				goto error_return;
			} else
				new_mfc = B_TRUE;
			/* Get resources */
			/* TODO could copy header and dup rest */
			mp_copy = copymsg(mp);
			if (mp_copy == NULL) {
				ipst->ips_mrtstat->mrts_fwd_drop++;
				ip1dbg(("ip_mforward: out of memory for "
				    "mblk, mp_copy\n"));
				goto error_return;
			}
			mutex_enter(&mfc_rt->mfc_mutex);
		}
		/* Get resources for rte, whether first rte or not first. */
		/* Add this packet into rtdetq */
		rte = (struct rtdetq *)mi_zalloc(sizeof (struct rtdetq));
		if (rte == NULL) {
			ipst->ips_mrtstat->mrts_fwd_drop++;
			mutex_exit(&mfc_rt->mfc_mutex);
			ip1dbg(("ip_mforward: out of memory for"
			    " rtdetq, rte\n"));
			goto error_return;
		}

		mp0 = copymsg(mp);
		if (mp0 == NULL) {
			ipst->ips_mrtstat->mrts_fwd_drop++;
			ip1dbg(("ip_mforward: out of memory for mblk, mp0\n"));
			mutex_exit(&mfc_rt->mfc_mutex);
			goto error_return;
		}
		rte->mp = mp0;
		if (pim_reg_packet) {
			ASSERT(ipst->ips_reg_vif_num != ALL_VIFS);
			rte->ill =
			    ipst->ips_vifs[ipst->ips_reg_vif_num].
			    v_ipif->ipif_ill;
		} else {
			rte->ill = ill;
		}
		rte->rte_next = NULL;

		/*
		 * Determine if upcall q (rtdetq) has overflowed.
		 * mfc_rt->mfc_rte is null by mi_zalloc
		 * if it is the first message.
		 */
		for (rte_m = mfc_rt->mfc_rte, npkts = 0; rte_m;
		    rte_m = rte_m->rte_next)
			npkts++;
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: upcalls %d\n", npkts);
		}
		if (npkts > MAX_UPQ) {
			ipst->ips_mrtstat->mrts_upq_ovflw++;
			mutex_exit(&mfc_rt->mfc_mutex);
			goto error_return;
		}

		if (npkts == 0) {	/* first upcall */
			int i = 0;
			/*
			 * Now finish installing the new mfc! Now that we have
			 * resources!  Insert new entry at head of hash chain.
			 * Use src and dst which are ipaddr_t's.
			 */
			mfc_rt->mfc_origin.s_addr = src;
			mfc_rt->mfc_mcastgrp.s_addr = dst;

			mutex_enter(&ipst->ips_numvifs_mutex);
			for (i = 0; i < (int)ipst->ips_numvifs; i++)
				mfc_rt->mfc_ttls[i] = 0;
			mutex_exit(&ipst->ips_numvifs_mutex);
			mfc_rt->mfc_parent = ALL_VIFS;

			/* Link into table */
			if (ipst->ips_ip_mrtdebug > 1) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: NEW MFCTAB hash %d o 0x%x "
				    "g 0x%x\n", hash,
				    ntohl(mfc_rt->mfc_origin.s_addr),
				    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
			}
			mfc_rt->mfc_next = ipst->ips_mfcs[hash].mfcb_mfc;
			ipst->ips_mfcs[hash].mfcb_mfc = mfc_rt;
			mfc_rt->mfc_rte = NULL;
		}

		/* Link in the upcall */
		/* First upcall */
		if (mfc_rt->mfc_rte == NULL)
			mfc_rt->mfc_rte = rte;
		else {
			/* not the first upcall: append at the queue tail */
			prev_rte = mfc_rt->mfc_rte;
			for (rte1 = mfc_rt->mfc_rte->rte_next; rte1;
			    prev_rte = rte1, rte1 = rte1->rte_next)
				;
			prev_rte->rte_next = rte;
		}

		/*
		 * No upcalls waiting, this is first one, so send a message to
		 * routing daemon to install a route into kernel table.
		 */
		if (npkts == 0) {
			struct igmpmsg	*im;
			/* ipha_protocol is 0, for upcall */
			ASSERT(mp_copy != NULL);
			im = (struct igmpmsg *)mp_copy->b_rptr;
			im->im_msgtype	= IGMPMSG_NOCACHE;
			im->im_mbz = 0;
			mutex_enter(&ipst->ips_numvifs_mutex);
			if (pim_reg_packet) {
				im->im_vif = (uchar_t)ipst->ips_reg_vif_num;
				mutex_exit(&ipst->ips_numvifs_mutex);
			} else {
				/*
				 * XXX do we need to hold locks here ?
				 * Report the first vif whose ipif sits on
				 * the arrival ill.
				 */
				for (vifi = 0;
				    vifi < ipst->ips_numvifs;
				    vifi++) {
					if (ipst->ips_vifs[vifi].v_ipif == NULL)
						continue;
					if (ipst->ips_vifs[vifi].
					    v_ipif->ipif_ill == ill) {
						im->im_vif = (uchar_t)vifi;
						break;
					}
				}
				mutex_exit(&ipst->ips_numvifs_mutex);
				ASSERT(vifi < ipst->ips_numvifs);
			}

			ipst->ips_mrtstat->mrts_upcalls++;
			/* Timer to discard upcalls if mrouted is too slow */
			mfc_rt->mfc_timeout_id = timeout(expire_upcalls,
			    mfc_rt, EXPIRE_TIMEOUT * UPCALL_EXPIRE);
			mutex_exit(&mfc_rt->mfc_mutex);
			mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
			/*
			 * Pass to RAWIP.  The ill fields of ira are cleared
			 * for the duration of the call and restored after.
			 */
			ira->ira_ill = ira->ira_rill = NULL;
			(mrouter->conn_recv)(mrouter, mp_copy, NULL, ira);
			ira->ira_ill = ill;
			ira->ira_rill = rill;
		} else {
			mutex_exit(&mfc_rt->mfc_mutex);
			mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			ip_drop_input("ip_mforward - upcall already waiting",
			    mp_copy, ill);
			freemsg(mp_copy);
		}

		MFCB_REFRELE(mfcbp);
		if (tunnel_src != 0)
			return (1);
		else
			return (0);
	error_return:
		/*
		 * Common failure exit.  Every jump here has already dropped
		 * mfc_rt->mfc_mutex (or never took it); the bucket lock and
		 * bucket reference are released here.  mfc_rt is freed only
		 * when it was allocated by this call.
		 */
		mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
		MFCB_REFRELE(mfcbp);
		if (mfc_rt != NULL && (new_mfc == B_TRUE))
			mi_free((char *)mfc_rt);
		if (rte != NULL)
			mi_free((char *)rte);
		if (mp_copy != NULL) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			ip_drop_input("ip_mforward error", mp_copy, ill);
			freemsg(mp_copy);
		}
		if (mp0 != NULL)
			freemsg(mp0);
		return (-1);
	}
}

/*
 * Clean up the mfctable cache entry if upcall is not serviced.
 * SunOS 5.x has timeout per mfc, unlike BSD which has one timer.
1916 */ 1917 static void 1918 expire_upcalls(void *arg) 1919 { 1920 struct mfc *mfc_rt = arg; 1921 uint_t hash; 1922 struct mfc *prev_mfc, *mfc0; 1923 ip_stack_t *ipst; 1924 conn_t *mrouter; 1925 1926 if (mfc_rt->mfc_rte == NULL || mfc_rt->mfc_rte->ill != NULL) { 1927 cmn_err(CE_WARN, "expire_upcalls: no ILL\n"); 1928 return; 1929 } 1930 ipst = mfc_rt->mfc_rte->ill->ill_ipst; 1931 mrouter = ipst->ips_ip_g_mrouter; 1932 1933 hash = MFCHASH(mfc_rt->mfc_origin.s_addr, mfc_rt->mfc_mcastgrp.s_addr); 1934 if (ipst->ips_ip_mrtdebug > 1) { 1935 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 1936 "expire_upcalls: hash %d s %x g %x", 1937 hash, ntohl(mfc_rt->mfc_origin.s_addr), 1938 ntohl(mfc_rt->mfc_mcastgrp.s_addr)); 1939 } 1940 MFCB_REFHOLD(&ipst->ips_mfcs[hash]); 1941 mutex_enter(&mfc_rt->mfc_mutex); 1942 /* 1943 * if timeout has been set to zero, than the 1944 * entry has been filled, no need to delete it. 1945 */ 1946 if (mfc_rt->mfc_timeout_id == 0) 1947 goto done; 1948 ipst->ips_mrtstat->mrts_cache_cleanups++; 1949 mfc_rt->mfc_timeout_id = 0; 1950 1951 /* Determine entry to be cleaned up in cache table. */ 1952 for (prev_mfc = mfc0 = ipst->ips_mfcs[hash].mfcb_mfc; mfc0; 1953 prev_mfc = mfc0, mfc0 = mfc0->mfc_next) 1954 if (mfc0 == mfc_rt) 1955 break; 1956 1957 /* del_mfc takes care of gone mfcs */ 1958 ASSERT(prev_mfc != NULL); 1959 ASSERT(mfc0 != NULL); 1960 1961 /* 1962 * Delete the entry from the cache 1963 */ 1964 ipst->ips_mfcs[hash].mfcb_marks |= MFCB_MARK_CONDEMNED; 1965 mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED; 1966 1967 /* 1968 * release_mfc will drop all queued upcall packets. 1969 * and will free the mbuf with the pkt, if, timing info. 1970 */ 1971 done: 1972 mutex_exit(&mfc_rt->mfc_mutex); 1973 MFCB_REFRELE(&ipst->ips_mfcs[hash]); 1974 } 1975 1976 /* 1977 * Packet forwarding routine once entry in the cache is made. 
 *
 * mp/ipha is the packet, ill the interface it is treated as having arrived
 * on, tunnel_src the tunnel source address (0 when not tunnelled) and rt
 * the matching mfc entry.  Returns -1 when the packet is to be dropped
 * (no parent vif, parent vif not usable, illegal vif index, or a failed
 * copy for the wrong-interface upcall), 1 when tunnel_src is non-zero and
 * 0 otherwise.
 */
static int
ip_mdq(mblk_t *mp, ipha_t *ipha, ill_t *ill, ipaddr_t tunnel_src,
    struct mfc *rt)
{
	vifi_t vifi;
	struct vif *vifp;
	ipaddr_t dst = ipha->ipha_dst;
	size_t  plen = msgdsize(mp);
	vifi_t num_of_vifs;
	ip_stack_t	*ipst = ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
	ip_recv_attr_t	iras;

	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "ip_mdq: SEND src %x, ipha_dst %x, ill %s",
		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
		    ill->ill_name);
	}

	/* Macro to send packet on vif */
#define	MC_SEND(ipha, mp, vifp, dst) { \
	if ((vifp)->v_flags & VIFF_TUNNEL) \
		encap_send((ipha), (mp), (vifp), (dst)); \
	else if ((vifp)->v_flags & VIFF_REGISTER) \
		register_send((ipha), (mp), (vifp), (dst)); \
	else \
		phyint_send((ipha), (mp), (vifp), (dst)); \
}

	vifi = rt->mfc_parent;

	/*
	 * The value of vifi is MAXVIFS if the pkt had no parent, i.e.,
	 * Mrouted had no route.
	 * We wanted the route installed in the mfctable to prevent multiple
	 * tries, so it passed add_mfc(), but is discarded here. The v_ipif is
	 * NULL so we don't want to check the ill. Still needed as of Mrouted
	 * 3.6.
	 */
	if (vifi == NO_VIF) {
		ip1dbg(("ip_mdq: no route for origin ill %s, vifi is NO_VIF\n",
		    ill->ill_name));
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mdq: vifi is NO_VIF ill = %s", ill->ill_name);
		}
		return (-1);	/* drop pkt */
	}

	/* Every return below this point must unlock_good_vif() first. */
	if (!lock_good_vif(&ipst->ips_vifs[vifi]))
		return (-1);
	/*
	 * The MFC entries are not cleaned up when an ipif goes
	 * away thus this code has to guard against an MFC referencing
	 * an ipif that has been closed. Note: reset_mrt_vif_ipif
	 * sets the v_ipif to NULL when the ipif disappears.
	 */
	ASSERT(ipst->ips_vifs[vifi].v_ipif != NULL);

	if (vifi >= ipst->ips_numvifs) {
		cmn_err(CE_WARN, "ip_mdq: illegal vifi %d numvifs "
		    "%d ill %s viftable ill %s\n",
		    (int)vifi, (int)ipst->ips_numvifs, ill->ill_name,
		    ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name);
		unlock_good_vif(&ipst->ips_vifs[vifi]);
		return (-1);
	}
	/*
	 * Don't forward if it didn't arrive from the parent vif for its
	 * origin.
	 */
	if ((ipst->ips_vifs[vifi].v_ipif->ipif_ill != ill) ||
	    (ipst->ips_vifs[vifi].v_rmt_addr.s_addr != tunnel_src)) {
		/* Came in the wrong interface */
		ip1dbg(("ip_mdq: arrived wrong if, vifi %d "
		    "numvifs %d ill %s viftable ill %s\n",
		    (int)vifi, (int)ipst->ips_numvifs, ill->ill_name,
		    ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name));
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mdq: arrived wrong if, vifi %d ill "
			    "%s viftable ill %s\n",
			    (int)vifi, ill->ill_name,
			    ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name);
		}
		ipst->ips_mrtstat->mrts_wrong_if++;
		rt->mfc_wrong_if++;

		/*
		 * If we are doing PIM assert processing and we are forwarding
		 * packets on this interface, and it is a broadcast medium
		 * interface (and not a tunnel), send a message to the routing.
		 *
		 * We use the first ipif on the list, since it's all we have.
		 * Chances are the ipif_flags are the same for ipifs on the ill.
		 */
		if (ipst->ips_pim_assert && rt->mfc_ttls[vifi] > 0 &&
		    (ill->ill_ipif->ipif_flags & IPIF_BROADCAST) &&
		    !(ipst->ips_vifs[vifi].v_flags & VIFF_TUNNEL)) {
			mblk_t		*mp_copy;
			struct igmpmsg	*im;

			/* TODO could copy header and dup rest */
			mp_copy = copymsg(mp);
			if (mp_copy == NULL) {
				ipst->ips_mrtstat->mrts_fwd_drop++;
				ip1dbg(("ip_mdq: out of memory "
				    "for mblk, mp_copy\n"));
				unlock_good_vif(&ipst->ips_vifs[vifi]);
				return (-1);
			}

			/* The copy's IP header is reused as the igmpmsg. */
			im = (struct igmpmsg *)mp_copy->b_rptr;
			im->im_msgtype = IGMPMSG_WRONGVIF;
			im->im_mbz = 0;
			im->im_vif = (ushort_t)vifi;
			/* Pass to RAWIP */

			bzero(&iras, sizeof (iras));
			iras.ira_flags = IRAF_IS_IPV4;
			iras.ira_ip_hdr_length =
			    IPH_HDR_LENGTH(mp_copy->b_rptr);
			iras.ira_pktlen = msgdsize(mp_copy);
			iras.ira_ttl = ipha->ipha_ttl;
			(mrouter->conn_recv)(mrouter, mp_copy, NULL, &iras);
			ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
		}
		unlock_good_vif(&ipst->ips_vifs[vifi]);
		if (tunnel_src != 0)
			return (1);
		else
			return (0);
	}
	/*
	 * If I sourced this packet, it counts as output, else it was input.
	 */
	if (ipha->ipha_src == ipst->ips_vifs[vifi].v_lcl_addr.s_addr) {
		ipst->ips_vifs[vifi].v_pkt_out++;
		ipst->ips_vifs[vifi].v_bytes_out += plen;
	} else {
		ipst->ips_vifs[vifi].v_pkt_in++;
		ipst->ips_vifs[vifi].v_bytes_in += plen;
	}
	mutex_enter(&rt->mfc_mutex);
	rt->mfc_pkt_cnt++;
	rt->mfc_byte_cnt += plen;
	mutex_exit(&rt->mfc_mutex);
	unlock_good_vif(&ipst->ips_vifs[vifi]);
	/*
	 * For each vif, decide if a copy of the packet should be forwarded.
	 * Forward if:
	 *		- the vif threshold ttl is non-zero AND
	 *		- the pkt ttl exceeds the vif's threshold
	 * A non-zero mfc_ttl indicates that the vif is part of
	 * the output set for the mfc entry.
	 */
	/* Snapshot the vif count; the loop itself runs without the mutex. */
	mutex_enter(&ipst->ips_numvifs_mutex);
	num_of_vifs = ipst->ips_numvifs;
	mutex_exit(&ipst->ips_numvifs_mutex);
	for (vifp = ipst->ips_vifs, vifi = 0;
	    vifi < num_of_vifs;
	    vifp++, vifi++) {
		if (!lock_good_vif(vifp))
			continue;
		if ((rt->mfc_ttls[vifi] > 0) &&
		    (ipha->ipha_ttl > rt->mfc_ttls[vifi])) {
			/*
			 * lock_good_vif should not have succeeded if
			 * v_ipif is null.
			 */
			ASSERT(vifp->v_ipif != NULL);
			vifp->v_pkt_out++;
			vifp->v_bytes_out += plen;
			MC_SEND(ipha, mp, vifp, dst);
			ipst->ips_mrtstat->mrts_fwd_out++;
		}
		unlock_good_vif(vifp);
	}
	if (tunnel_src != 0)
		return (1);
	else
		return (0);
}

/*
 * Send the packet on physical interface.
 * Caller assumes can continue to use mp on return.
 *
 * A copy of mp is made and handed to tbf_send_packet() when the vif has no
 * positive rate limit, or to tbf_control() otherwise.  If the copy cannot
 * be allocated the drop is counted in mrts_fwd_drop and nothing is sent.
 */
/* ARGSUSED */
static void
phyint_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
{
	mblk_t	*mp_copy;
	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	/* Make a new reference to the packet */
	mp_copy = copymsg(mp);	/* TODO could copy header and dup rest */
	if (mp_copy == NULL) {
		ipst->ips_mrtstat->mrts_fwd_drop++;
		ip1dbg(("phyint_send: out of memory for mblk, mp_copy\n"));
		return;
	}
	if (vifp->v_rate_limit <= 0)
		tbf_send_packet(vifp, mp_copy);
	else  {
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "phyint_send: tbf_contr rate %d "
			    "vifp 0x%p mp 0x%p dst 0x%x",
			    vifp->v_rate_limit, (void *)vifp, (void *)mp, dst);
		}
		tbf_control(vifp, mp_copy, (ipha_t *)mp_copy->b_rptr);
	}
}

/*
 * Send the whole packet for REGISTER encapsulation to PIM daemon
 * Caller assumes it can continue to use mp on return.
2199 */ 2200 /* ARGSUSED */ 2201 static void 2202 register_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst) 2203 { 2204 struct igmpmsg *im; 2205 mblk_t *mp_copy; 2206 ipha_t *ipha_copy; 2207 ill_t *ill = vifp->v_ipif->ipif_ill; 2208 ip_stack_t *ipst = ill->ill_ipst; 2209 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2210 ip_recv_attr_t iras; 2211 2212 if (ipst->ips_ip_mrtdebug > 1) { 2213 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2214 "register_send: src %x, dst %x\n", 2215 ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst)); 2216 } 2217 2218 /* 2219 * Copy the old packet & pullup its IP header into the new mblk_t so we 2220 * can modify it. Try to fill the new mblk_t since if we don't the 2221 * ethernet driver will. 2222 */ 2223 mp_copy = allocb(sizeof (struct igmpmsg) + sizeof (ipha_t), BPRI_MED); 2224 if (mp_copy == NULL) { 2225 ++ipst->ips_mrtstat->mrts_pim_nomemory; 2226 if (ipst->ips_ip_mrtdebug > 3) { 2227 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2228 "register_send: allocb failure."); 2229 } 2230 return; 2231 } 2232 2233 /* 2234 * Bump write pointer to account for igmpmsg being added. 2235 */ 2236 mp_copy->b_wptr = mp_copy->b_rptr + sizeof (struct igmpmsg); 2237 2238 /* 2239 * Chain packet to new mblk_t. 2240 */ 2241 if ((mp_copy->b_cont = copymsg(mp)) == NULL) { 2242 ++ipst->ips_mrtstat->mrts_pim_nomemory; 2243 if (ipst->ips_ip_mrtdebug > 3) { 2244 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2245 "register_send: copymsg failure."); 2246 } 2247 freeb(mp_copy); 2248 return; 2249 } 2250 2251 /* 2252 * icmp_input() asserts that IP version field is set to an 2253 * appropriate version. Hence, the struct igmpmsg that this really 2254 * becomes, needs to have the correct IP version field. 2255 */ 2256 ipha_copy = (ipha_t *)mp_copy->b_rptr; 2257 *ipha_copy = multicast_encap_iphdr; 2258 2259 /* 2260 * The kernel uses the struct igmpmsg header to encode the messages to 2261 * the multicast routing daemon. 
Fill in the fields in the header 2262 * starting with the message type which is IGMPMSG_WHOLEPKT 2263 */ 2264 im = (struct igmpmsg *)mp_copy->b_rptr; 2265 im->im_msgtype = IGMPMSG_WHOLEPKT; 2266 im->im_src.s_addr = ipha->ipha_src; 2267 im->im_dst.s_addr = ipha->ipha_dst; 2268 2269 /* 2270 * Must Be Zero. This is because the struct igmpmsg is really an IP 2271 * header with renamed fields and the multicast routing daemon uses 2272 * an ipha_protocol (aka im_mbz) of 0 to distinguish these messages. 2273 */ 2274 im->im_mbz = 0; 2275 2276 ++ipst->ips_mrtstat->mrts_upcalls; 2277 if (IPCL_IS_NONSTR(mrouter) ? mrouter->conn_flow_cntrld : 2278 !canputnext(mrouter->conn_rq)) { 2279 ++ipst->ips_mrtstat->mrts_pim_regsend_drops; 2280 if (ipst->ips_ip_mrtdebug > 3) { 2281 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2282 "register_send: register upcall failure."); 2283 } 2284 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2285 ip_drop_input("mrts_pim_regsend_drops", mp_copy, ill); 2286 freemsg(mp_copy); 2287 } else { 2288 /* Pass to RAWIP */ 2289 bzero(&iras, sizeof (iras)); 2290 iras.ira_flags = IRAF_IS_IPV4; 2291 iras.ira_ip_hdr_length = sizeof (ipha_t); 2292 iras.ira_pktlen = msgdsize(mp_copy); 2293 iras.ira_ttl = ipha->ipha_ttl; 2294 (mrouter->conn_recv)(mrouter, mp_copy, NULL, &iras); 2295 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); 2296 } 2297 } 2298 2299 /* 2300 * pim_validate_cksum handles verification of the checksum in the 2301 * pim header. For PIM Register packets, the checksum is calculated 2302 * across the PIM header only. For all other packets, the checksum 2303 * is for the PIM header and remainder of the packet. 2304 * 2305 * returns: B_TRUE, if checksum is okay. 2306 * B_FALSE, if checksum is not valid. 
2307 */ 2308 static boolean_t 2309 pim_validate_cksum(mblk_t *mp, ipha_t *ip, struct pim *pimp) 2310 { 2311 mblk_t *mp_dup; 2312 2313 if ((mp_dup = dupmsg(mp)) == NULL) 2314 return (B_FALSE); 2315 2316 mp_dup->b_rptr += IPH_HDR_LENGTH(ip); 2317 if (pimp->pim_type == PIM_REGISTER) 2318 mp_dup->b_wptr = mp_dup->b_rptr + PIM_MINLEN; 2319 if (IP_CSUM(mp_dup, 0, 0)) { 2320 freemsg(mp_dup); 2321 return (B_FALSE); 2322 } 2323 freemsg(mp_dup); 2324 return (B_TRUE); 2325 } 2326 2327 /* 2328 * Process PIM protocol packets i.e. IP Protocol 103. 2329 * Register messages are decapsulated and sent onto multicast forwarding. 2330 * 2331 * Return NULL for a bad packet that is discarded here. 2332 * Return mp if the message is OK and should be handed to "raw" receivers. 2333 * Callers of pim_input() may need to reinitialize variables that were copied 2334 * from the mblk as this calls pullupmsg(). 2335 */ 2336 mblk_t * 2337 pim_input(mblk_t *mp, ip_recv_attr_t *ira) 2338 { 2339 ipha_t *eip, *ip; 2340 int iplen, pimlen, iphlen; 2341 struct pim *pimp; /* pointer to a pim struct */ 2342 uint32_t *reghdr; 2343 ill_t *ill = ira->ira_ill; 2344 ip_stack_t *ipst = ill->ill_ipst; 2345 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2346 2347 /* 2348 * Pullup the msg for PIM protocol processing. 
2349 */ 2350 if (pullupmsg(mp, -1) == 0) { 2351 ++ipst->ips_mrtstat->mrts_pim_nomemory; 2352 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2353 ip_drop_input("mrts_pim_nomemory", mp, ill); 2354 freemsg(mp); 2355 return (NULL); 2356 } 2357 2358 ip = (ipha_t *)mp->b_rptr; 2359 iplen = ip->ipha_length; 2360 iphlen = IPH_HDR_LENGTH(ip); 2361 pimlen = ntohs(iplen) - iphlen; 2362 2363 /* 2364 * Validate lengths 2365 */ 2366 if (pimlen < PIM_MINLEN) { 2367 ++ipst->ips_mrtstat->mrts_pim_malformed; 2368 if (ipst->ips_ip_mrtdebug > 1) { 2369 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2370 "pim_input: length not at least minlen"); 2371 } 2372 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2373 ip_drop_input("mrts_pim_malformed", mp, ill); 2374 freemsg(mp); 2375 return (NULL); 2376 } 2377 2378 /* 2379 * Point to the PIM header. 2380 */ 2381 pimp = (struct pim *)((caddr_t)ip + iphlen); 2382 2383 /* 2384 * Check the version number. 2385 */ 2386 if (pimp->pim_vers != PIM_VERSION) { 2387 ++ipst->ips_mrtstat->mrts_pim_badversion; 2388 if (ipst->ips_ip_mrtdebug > 1) { 2389 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2390 "pim_input: unknown version of PIM"); 2391 } 2392 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2393 ip_drop_input("mrts_pim_badversion", mp, ill); 2394 freemsg(mp); 2395 return (NULL); 2396 } 2397 2398 /* 2399 * Validate the checksum 2400 */ 2401 if (!pim_validate_cksum(mp, ip, pimp)) { 2402 ++ipst->ips_mrtstat->mrts_pim_rcv_badcsum; 2403 if (ipst->ips_ip_mrtdebug > 1) { 2404 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2405 "pim_input: invalid checksum"); 2406 } 2407 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2408 ip_drop_input("pim_rcv_badcsum", mp, ill); 2409 freemsg(mp); 2410 return (NULL); 2411 } 2412 2413 if (pimp->pim_type != PIM_REGISTER) 2414 return (mp); 2415 2416 reghdr = (uint32_t *)(pimp + 1); 2417 eip = (ipha_t *)(reghdr + 1); 2418 2419 /* 2420 * check if the inner packet is destined to mcast group 2421 */ 2422 if 
(!CLASSD(eip->ipha_dst)) { 2423 ++ipst->ips_mrtstat->mrts_pim_badregisters; 2424 if (ipst->ips_ip_mrtdebug > 1) { 2425 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2426 "pim_input: Inner pkt not mcast .. !"); 2427 } 2428 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2429 ip_drop_input("mrts_pim_badregisters", mp, ill); 2430 freemsg(mp); 2431 return (NULL); 2432 } 2433 if (ipst->ips_ip_mrtdebug > 1) { 2434 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2435 "register from %x, to %x, len %d", 2436 ntohl(eip->ipha_src), 2437 ntohl(eip->ipha_dst), 2438 ntohs(eip->ipha_length)); 2439 } 2440 /* 2441 * If the null register bit is not set, decapsulate 2442 * the packet before forwarding it. 2443 * Avoid this in no register vif 2444 */ 2445 if (!(ntohl(*reghdr) & PIM_NULL_REGISTER) && 2446 ipst->ips_reg_vif_num != ALL_VIFS) { 2447 mblk_t *mp_copy; 2448 uint_t saved_pktlen; 2449 2450 /* Copy the message */ 2451 if ((mp_copy = copymsg(mp)) == NULL) { 2452 ++ipst->ips_mrtstat->mrts_pim_nomemory; 2453 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2454 ip_drop_input("mrts_pim_nomemory", mp, ill); 2455 freemsg(mp); 2456 return (NULL); 2457 } 2458 2459 /* 2460 * Decapsulate the packet and give it to 2461 * register_mforward. 2462 */ 2463 mp_copy->b_rptr += iphlen + sizeof (pim_t) + sizeof (*reghdr); 2464 saved_pktlen = ira->ira_pktlen; 2465 ira->ira_pktlen -= iphlen + sizeof (pim_t) + sizeof (*reghdr); 2466 if (register_mforward(mp_copy, ira) != 0) { 2467 /* register_mforward already called ip_drop_input */ 2468 freemsg(mp); 2469 ira->ira_pktlen = saved_pktlen; 2470 return (NULL); 2471 } 2472 ira->ira_pktlen = saved_pktlen; 2473 } 2474 2475 /* 2476 * Pass all valid PIM packets up to any process(es) listening on a raw 2477 * PIM socket. For Solaris it is done right after pim_input() is 2478 * called. 2479 */ 2480 return (mp); 2481 } 2482 2483 /* 2484 * PIM sparse mode hook. Called by pim_input after decapsulating 2485 * the packet. 
Loop back the packet, as if we have received it. 2486 * In pim_input() we have to check if the destination is a multicast address. 2487 */ 2488 static int 2489 register_mforward(mblk_t *mp, ip_recv_attr_t *ira) 2490 { 2491 ire_t *ire; 2492 ipha_t *ipha = (ipha_t *)mp->b_rptr; 2493 ill_t *ill = ira->ira_ill; 2494 ip_stack_t *ipst = ill->ill_ipst; 2495 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2496 2497 ASSERT(ipst->ips_reg_vif_num <= ipst->ips_numvifs); 2498 2499 if (ipst->ips_ip_mrtdebug > 3) { 2500 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2501 "register_mforward: src %x, dst %x\n", 2502 ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst)); 2503 } 2504 /* 2505 * Need to pass in to ip_mforward() the information that the 2506 * packet has arrived on the register_vif. We mark it with 2507 * the IRAF_PIM_REGISTER attribute. 2508 * pim_input verified that the (inner) destination is multicast, 2509 * hence we skip the generic code in ip_input. 2510 */ 2511 ira->ira_flags |= IRAF_PIM_REGISTER; 2512 ++ipst->ips_mrtstat->mrts_pim_regforwards; 2513 2514 if (!CLASSD(ipha->ipha_dst)) { 2515 ire = ire_route_recursive_v4(ipha->ipha_dst, 0, NULL, ALL_ZONES, 2516 ira->ira_tsl, MATCH_IRE_SECATTR, IRR_ALLOCATE, 0, ipst, 2517 NULL, NULL, NULL); 2518 } else { 2519 ire = ire_multicast(ill); 2520 } 2521 ASSERT(ire != NULL); 2522 /* Normally this will return the IRE_MULTICAST */ 2523 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 2524 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2525 ip_drop_input("mrts_pim RTF_REJECT", mp, ill); 2526 freemsg(mp); 2527 ire_refrele(ire); 2528 return (-1); 2529 } 2530 ASSERT(ire->ire_type & IRE_MULTICAST); 2531 (*ire->ire_recvfn)(ire, mp, ipha, ira); 2532 ire_refrele(ire); 2533 2534 return (0); 2535 } 2536 2537 /* 2538 * Send an encapsulated packet. 2539 * Caller assumes can continue to use mp when routine returns. 
2540 */ 2541 /* ARGSUSED */ 2542 static void 2543 encap_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst) 2544 { 2545 mblk_t *mp_copy; 2546 ipha_t *ipha_copy; 2547 size_t len; 2548 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 2549 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2550 2551 if (ipst->ips_ip_mrtdebug > 1) { 2552 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2553 "encap_send: vif %ld enter", 2554 (ptrdiff_t)(vifp - ipst->ips_vifs)); 2555 } 2556 len = ntohs(ipha->ipha_length); 2557 2558 /* 2559 * Copy the old packet & pullup it's IP header into the 2560 * new mbuf so we can modify it. Try to fill the new 2561 * mbuf since if we don't the ethernet driver will. 2562 */ 2563 mp_copy = allocb(32 + sizeof (multicast_encap_iphdr), BPRI_MED); 2564 if (mp_copy == NULL) 2565 return; 2566 mp_copy->b_rptr += 32; 2567 mp_copy->b_wptr = mp_copy->b_rptr + sizeof (multicast_encap_iphdr); 2568 if ((mp_copy->b_cont = copymsg(mp)) == NULL) { 2569 freeb(mp_copy); 2570 return; 2571 } 2572 2573 /* 2574 * Fill in the encapsulating IP header. 2575 * Remote tunnel dst in rmt_addr, from add_vif(). 2576 */ 2577 ipha_copy = (ipha_t *)mp_copy->b_rptr; 2578 *ipha_copy = multicast_encap_iphdr; 2579 ASSERT((len + sizeof (ipha_t)) <= IP_MAXPACKET); 2580 ipha_copy->ipha_length = htons(len + sizeof (ipha_t)); 2581 ipha_copy->ipha_src = vifp->v_lcl_addr.s_addr; 2582 ipha_copy->ipha_dst = vifp->v_rmt_addr.s_addr; 2583 ASSERT(ipha_copy->ipha_ident == 0); 2584 2585 /* Turn the encapsulated IP header back into a valid one. 
*/ 2586 ipha = (ipha_t *)mp_copy->b_cont->b_rptr; 2587 ipha->ipha_ttl--; 2588 ipha->ipha_hdr_checksum = 0; 2589 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 2590 2591 ipha_copy->ipha_ttl = ipha->ipha_ttl; 2592 2593 if (ipst->ips_ip_mrtdebug > 1) { 2594 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2595 "encap_send: group 0x%x", ntohl(ipha->ipha_dst)); 2596 } 2597 if (vifp->v_rate_limit <= 0) 2598 tbf_send_packet(vifp, mp_copy); 2599 else 2600 /* ipha is from the original header */ 2601 tbf_control(vifp, mp_copy, ipha); 2602 } 2603 2604 /* 2605 * De-encapsulate a packet and feed it back through IP input if it 2606 * matches one of our multicast tunnels. 2607 * 2608 * This routine is called whenever IP gets a packet with prototype 2609 * IPPROTO_ENCAP and a local destination address and the packet didn't 2610 * match one of our configured IP-in-IP tunnels. 2611 */ 2612 void 2613 ip_mroute_decap(mblk_t *mp, ip_recv_attr_t *ira) 2614 { 2615 ipha_t *ipha = (ipha_t *)mp->b_rptr; 2616 ipha_t *ipha_encap; 2617 int hlen = IPH_HDR_LENGTH(ipha); 2618 int hlen_encap; 2619 ipaddr_t src; 2620 struct vif *vifp; 2621 ire_t *ire; 2622 ill_t *ill = ira->ira_ill; 2623 ip_stack_t *ipst = ill->ill_ipst; 2624 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2625 2626 /* Make sure we have all of the inner header */ 2627 ipha_encap = (ipha_t *)((char *)ipha + hlen); 2628 if (mp->b_wptr - mp->b_rptr < hlen + IP_SIMPLE_HDR_LENGTH) { 2629 ipha = ip_pullup(mp, hlen + IP_SIMPLE_HDR_LENGTH, ira); 2630 if (ipha == NULL) { 2631 ipst->ips_mrtstat->mrts_bad_tunnel++; 2632 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2633 ip_drop_input("ip_mroute_decap: too short", mp, ill); 2634 freemsg(mp); 2635 return; 2636 } 2637 ipha_encap = (ipha_t *)((char *)ipha + hlen); 2638 } 2639 hlen_encap = IPH_HDR_LENGTH(ipha_encap); 2640 if (mp->b_wptr - mp->b_rptr < hlen + hlen_encap) { 2641 ipha = ip_pullup(mp, hlen + hlen_encap, ira); 2642 if (ipha == NULL) { 2643 ipst->ips_mrtstat->mrts_bad_tunnel++; 2644 
BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2645 ip_drop_input("ip_mroute_decap: too short", mp, ill); 2646 freemsg(mp); 2647 return; 2648 } 2649 ipha_encap = (ipha_t *)((char *)ipha + hlen); 2650 } 2651 2652 /* 2653 * Dump the packet if it's not to a multicast destination or if 2654 * we don't have an encapsulating tunnel with the source. 2655 * Note: This code assumes that the remote site IP address 2656 * uniquely identifies the tunnel (i.e., that this site has 2657 * at most one tunnel with the remote site). 2658 */ 2659 if (!CLASSD(ipha_encap->ipha_dst)) { 2660 ipst->ips_mrtstat->mrts_bad_tunnel++; 2661 ip1dbg(("ip_mroute_decap: bad tunnel\n")); 2662 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2663 ip_drop_input("mrts_bad_tunnel", mp, ill); 2664 freemsg(mp); 2665 return; 2666 } 2667 src = (ipaddr_t)ipha->ipha_src; 2668 mutex_enter(&ipst->ips_last_encap_lock); 2669 if (src != ipst->ips_last_encap_src) { 2670 struct vif *vife; 2671 2672 vifp = ipst->ips_vifs; 2673 vife = vifp + ipst->ips_numvifs; 2674 ipst->ips_last_encap_src = src; 2675 ipst->ips_last_encap_vif = 0; 2676 for (; vifp < vife; ++vifp) { 2677 if (!lock_good_vif(vifp)) 2678 continue; 2679 if (vifp->v_rmt_addr.s_addr == src) { 2680 if (vifp->v_flags & VIFF_TUNNEL) 2681 ipst->ips_last_encap_vif = vifp; 2682 if (ipst->ips_ip_mrtdebug > 1) { 2683 (void) mi_strlog(mrouter->conn_rq, 2684 1, SL_TRACE, 2685 "ip_mroute_decap: good tun " 2686 "vif %ld with %x", 2687 (ptrdiff_t)(vifp - ipst->ips_vifs), 2688 ntohl(src)); 2689 } 2690 unlock_good_vif(vifp); 2691 break; 2692 } 2693 unlock_good_vif(vifp); 2694 } 2695 } 2696 if ((vifp = ipst->ips_last_encap_vif) == 0) { 2697 mutex_exit(&ipst->ips_last_encap_lock); 2698 ipst->ips_mrtstat->mrts_bad_tunnel++; 2699 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2700 ip_drop_input("mrts_bad_tunnel", mp, ill); 2701 freemsg(mp); 2702 ip1dbg(("ip_mroute_decap: vif %ld no tunnel with %x\n", 2703 (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(src))); 2704 return; 2705 } 
2706 mutex_exit(&ipst->ips_last_encap_lock); 2707 2708 /* 2709 * Need to pass in the tunnel source to ip_mforward (so that it can 2710 * verify that the packet arrived over the correct vif.) 2711 */ 2712 ira->ira_flags |= IRAF_MROUTE_TUNNEL_SET; 2713 ira->ira_mroute_tunnel = src; 2714 mp->b_rptr += hlen; 2715 ira->ira_pktlen -= hlen; 2716 ira->ira_ip_hdr_length = hlen_encap; 2717 2718 /* 2719 * We don't redo any of the filtering in ill_input_full_v4 and we 2720 * have checked that all of ipha_encap and any IP options are 2721 * pulled up. Hence we call ire_recv_multicast_v4 directly. 2722 * However, we have to check for RSVP as in ip_input_full_v4 2723 * and if so we pass it to ire_recv_broadcast_v4 for local delivery 2724 * to the rsvpd. 2725 */ 2726 if (ipha_encap->ipha_protocol == IPPROTO_RSVP && 2727 ipst->ips_ipcl_proto_fanout_v4[IPPROTO_RSVP].connf_head != NULL) { 2728 ire = ire_route_recursive_v4(INADDR_BROADCAST, 0, ill, 2729 ALL_ZONES, ira->ira_tsl, MATCH_IRE_ILL|MATCH_IRE_SECATTR, 2730 IRR_ALLOCATE, 0, ipst, NULL, NULL, NULL); 2731 } else { 2732 ire = ire_multicast(ill); 2733 } 2734 ASSERT(ire != NULL); 2735 /* Normally this will return the IRE_MULTICAST or IRE_BROADCAST */ 2736 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 2737 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2738 ip_drop_input("ip_mroute_decap: RTF_REJECT", mp, ill); 2739 freemsg(mp); 2740 ire_refrele(ire); 2741 return; 2742 } 2743 ire->ire_ib_pkt_count++; 2744 ASSERT(ire->ire_type & (IRE_MULTICAST|IRE_BROADCAST)); 2745 (*ire->ire_recvfn)(ire, mp, ipha_encap, ira); 2746 ire_refrele(ire); 2747 } 2748 2749 /* 2750 * Remove all records with v_ipif == ipif. Called when an interface goes away 2751 * (stream closed). Called as writer. 2752 */ 2753 void 2754 reset_mrt_vif_ipif(ipif_t *ipif) 2755 { 2756 vifi_t vifi, tmp_vifi; 2757 vifi_t num_of_vifs; 2758 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 2759 2760 /* Can't check vifi >= 0 since vifi_t is unsigned! 
*/ 2761 2762 mutex_enter(&ipst->ips_numvifs_mutex); 2763 num_of_vifs = ipst->ips_numvifs; 2764 mutex_exit(&ipst->ips_numvifs_mutex); 2765 2766 for (vifi = num_of_vifs; vifi != 0; vifi--) { 2767 tmp_vifi = vifi - 1; 2768 if (ipst->ips_vifs[tmp_vifi].v_ipif == ipif) { 2769 (void) del_vif(&tmp_vifi, ipst); 2770 } 2771 } 2772 } 2773 2774 /* Remove pending upcall msgs when ill goes away. Called by ill_delete. */ 2775 void 2776 reset_mrt_ill(ill_t *ill) 2777 { 2778 struct mfc *rt; 2779 struct rtdetq *rte; 2780 int i; 2781 ip_stack_t *ipst = ill->ill_ipst; 2782 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2783 timeout_id_t id; 2784 2785 for (i = 0; i < MFCTBLSIZ; i++) { 2786 MFCB_REFHOLD(&ipst->ips_mfcs[i]); 2787 if ((rt = ipst->ips_mfcs[i].mfcb_mfc) != NULL) { 2788 if (ipst->ips_ip_mrtdebug > 1) { 2789 (void) mi_strlog(mrouter->conn_rq, 1, 2790 SL_TRACE, 2791 "reset_mrt_ill: mfctable [%d]", i); 2792 } 2793 while (rt != NULL) { 2794 mutex_enter(&rt->mfc_mutex); 2795 while ((rte = rt->mfc_rte) != NULL) { 2796 if (rte->ill == ill && 2797 (id = rt->mfc_timeout_id) != 0) { 2798 /* 2799 * Its ok to drop the lock, the 2800 * struct cannot be freed since 2801 * we have a ref on the hash 2802 * bucket. 2803 */ 2804 mutex_exit(&rt->mfc_mutex); 2805 (void) untimeout(id); 2806 mutex_enter(&rt->mfc_mutex); 2807 } 2808 if (rte->ill == ill) { 2809 if (ipst->ips_ip_mrtdebug > 1) { 2810 (void) mi_strlog( 2811 mrouter->conn_rq, 2812 1, SL_TRACE, 2813 "reset_mrt_ill: " 2814 "ill 0x%p", (void *)ill); 2815 } 2816 rt->mfc_rte = rte->rte_next; 2817 freemsg(rte->mp); 2818 mi_free((char *)rte); 2819 } 2820 } 2821 mutex_exit(&rt->mfc_mutex); 2822 rt = rt->mfc_next; 2823 } 2824 } 2825 MFCB_REFRELE(&ipst->ips_mfcs[i]); 2826 } 2827 } 2828 2829 /* 2830 * Token bucket filter module. 2831 * The ipha is for mcastgrp destination for phyint and encap. 
2832 */ 2833 static void 2834 tbf_control(struct vif *vifp, mblk_t *mp, ipha_t *ipha) 2835 { 2836 size_t p_len = msgdsize(mp); 2837 struct tbf *t = vifp->v_tbf; 2838 timeout_id_t id = 0; 2839 ill_t *ill = vifp->v_ipif->ipif_ill; 2840 ip_stack_t *ipst = ill->ill_ipst; 2841 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2842 2843 /* Drop if packet is too large */ 2844 if (p_len > MAX_BKT_SIZE) { 2845 ipst->ips_mrtstat->mrts_pkt2large++; 2846 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 2847 ip_drop_output("tbf_control - too large", mp, ill); 2848 freemsg(mp); 2849 return; 2850 } 2851 if (ipst->ips_ip_mrtdebug > 1) { 2852 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2853 "tbf_ctrl: SEND vif %ld, qlen %d, ipha_dst 0x%x", 2854 (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_q_len, 2855 ntohl(ipha->ipha_dst)); 2856 } 2857 2858 mutex_enter(&t->tbf_lock); 2859 2860 tbf_update_tokens(vifp); 2861 2862 /* 2863 * If there are enough tokens, 2864 * and the queue is empty, send this packet out. 2865 */ 2866 if (ipst->ips_ip_mrtdebug > 1) { 2867 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2868 "tbf_control: vif %ld, TOKENS %d, pkt len %lu, qlen %d", 2869 (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_n_tok, p_len, 2870 t->tbf_q_len); 2871 } 2872 /* No packets are queued */ 2873 if (t->tbf_q_len == 0) { 2874 /* queue empty, send packet if enough tokens */ 2875 if (p_len <= t->tbf_n_tok) { 2876 t->tbf_n_tok -= p_len; 2877 mutex_exit(&t->tbf_lock); 2878 tbf_send_packet(vifp, mp); 2879 return; 2880 } else { 2881 /* Queue packet and timeout till later */ 2882 tbf_queue(vifp, mp); 2883 ASSERT(vifp->v_timeout_id == 0); 2884 vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp, 2885 TBF_REPROCESS); 2886 } 2887 } else if (t->tbf_q_len < t->tbf_max_q_len) { 2888 /* Finite queue length, so queue pkts and process queue */ 2889 tbf_queue(vifp, mp); 2890 tbf_process_q(vifp); 2891 } else { 2892 /* Check that we have UDP header with IP header */ 2893 size_t hdr_length = IPH_HDR_LENGTH(ipha) + 2894 
sizeof (struct udphdr); 2895 2896 if ((mp->b_wptr - mp->b_rptr) < hdr_length) { 2897 if (!pullupmsg(mp, hdr_length)) { 2898 BUMP_MIB(ill->ill_ip_mib, 2899 ipIfStatsOutDiscards); 2900 ip_drop_output("tbf_control - pullup", mp, ill); 2901 freemsg(mp); 2902 ip1dbg(("tbf_ctl: couldn't pullup udp hdr, " 2903 "vif %ld src 0x%x dst 0x%x\n", 2904 (ptrdiff_t)(vifp - ipst->ips_vifs), 2905 ntohl(ipha->ipha_src), 2906 ntohl(ipha->ipha_dst))); 2907 mutex_exit(&vifp->v_tbf->tbf_lock); 2908 return; 2909 } else 2910 /* Have to reassign ipha after pullupmsg */ 2911 ipha = (ipha_t *)mp->b_rptr; 2912 } 2913 /* 2914 * Queue length too much, 2915 * try to selectively dq, or queue and process 2916 */ 2917 if (!tbf_dq_sel(vifp, ipha)) { 2918 ipst->ips_mrtstat->mrts_q_overflow++; 2919 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 2920 ip_drop_output("mrts_q_overflow", mp, ill); 2921 freemsg(mp); 2922 } else { 2923 tbf_queue(vifp, mp); 2924 tbf_process_q(vifp); 2925 } 2926 } 2927 if (t->tbf_q_len == 0) { 2928 id = vifp->v_timeout_id; 2929 vifp->v_timeout_id = 0; 2930 } 2931 mutex_exit(&vifp->v_tbf->tbf_lock); 2932 if (id != 0) 2933 (void) untimeout(id); 2934 } 2935 2936 /* 2937 * Adds a packet to the tbf queue at the interface. 2938 * The ipha is for mcastgrp destination for phyint and encap. 
2939 */ 2940 static void 2941 tbf_queue(struct vif *vifp, mblk_t *mp) 2942 { 2943 struct tbf *t = vifp->v_tbf; 2944 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 2945 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2946 2947 if (ipst->ips_ip_mrtdebug > 1) { 2948 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2949 "tbf_queue: vif %ld", (ptrdiff_t)(vifp - ipst->ips_vifs)); 2950 } 2951 ASSERT(MUTEX_HELD(&t->tbf_lock)); 2952 2953 if (t->tbf_t == NULL) { 2954 /* Queue was empty */ 2955 t->tbf_q = mp; 2956 } else { 2957 /* Insert at tail */ 2958 t->tbf_t->b_next = mp; 2959 } 2960 /* set new tail pointer */ 2961 t->tbf_t = mp; 2962 2963 mp->b_next = mp->b_prev = NULL; 2964 2965 t->tbf_q_len++; 2966 } 2967 2968 /* 2969 * Process the queue at the vif interface. 2970 * Drops the tbf_lock when sending packets. 2971 * 2972 * NOTE : The caller should quntimeout if the queue length is 0. 2973 */ 2974 static void 2975 tbf_process_q(struct vif *vifp) 2976 { 2977 mblk_t *mp; 2978 struct tbf *t = vifp->v_tbf; 2979 size_t len; 2980 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 2981 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2982 2983 if (ipst->ips_ip_mrtdebug > 1) { 2984 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2985 "tbf_process_q 1: vif %ld qlen = %d", 2986 (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_q_len); 2987 } 2988 2989 /* 2990 * Loop through the queue at the interface and send 2991 * as many packets as possible. 2992 */ 2993 ASSERT(MUTEX_HELD(&t->tbf_lock)); 2994 2995 while (t->tbf_q_len > 0) { 2996 mp = t->tbf_q; 2997 len = (size_t)msgdsize(mp); /* length of ip pkt */ 2998 2999 /* Determine if the packet can be sent */ 3000 if (len <= t->tbf_n_tok) { 3001 /* 3002 * If so, reduce no. of tokens, dequeue the packet, 3003 * send the packet. 
3004 */ 3005 t->tbf_n_tok -= len; 3006 3007 t->tbf_q = mp->b_next; 3008 if (--t->tbf_q_len == 0) { 3009 t->tbf_t = NULL; 3010 } 3011 mp->b_next = NULL; 3012 /* Exit mutex before sending packet, then re-enter */ 3013 mutex_exit(&t->tbf_lock); 3014 tbf_send_packet(vifp, mp); 3015 mutex_enter(&t->tbf_lock); 3016 } else 3017 break; 3018 } 3019 } 3020 3021 /* Called at tbf timeout to update tokens, process q and reset timer. */ 3022 static void 3023 tbf_reprocess_q(void *arg) 3024 { 3025 struct vif *vifp = arg; 3026 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 3027 conn_t *mrouter = ipst->ips_ip_g_mrouter; 3028 3029 mutex_enter(&vifp->v_tbf->tbf_lock); 3030 vifp->v_timeout_id = 0; 3031 tbf_update_tokens(vifp); 3032 3033 tbf_process_q(vifp); 3034 3035 if (vifp->v_tbf->tbf_q_len > 0) { 3036 vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp, 3037 TBF_REPROCESS); 3038 } 3039 mutex_exit(&vifp->v_tbf->tbf_lock); 3040 3041 if (ipst->ips_ip_mrtdebug > 1) { 3042 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 3043 "tbf_reprcess_q: vif %ld timeout id = %p", 3044 (ptrdiff_t)(vifp - ipst->ips_vifs), vifp->v_timeout_id); 3045 } 3046 } 3047 3048 /* 3049 * Function that will selectively discard a member of the tbf queue, 3050 * based on the precedence value and the priority. 3051 * 3052 * NOTE : The caller should quntimeout if the queue length is 0. 
3053 */ 3054 static int 3055 tbf_dq_sel(struct vif *vifp, ipha_t *ipha) 3056 { 3057 uint_t p; 3058 struct tbf *t = vifp->v_tbf; 3059 mblk_t **np; 3060 mblk_t *last, *mp; 3061 ill_t *ill = vifp->v_ipif->ipif_ill; 3062 ip_stack_t *ipst = ill->ill_ipst; 3063 conn_t *mrouter = ipst->ips_ip_g_mrouter; 3064 3065 if (ipst->ips_ip_mrtdebug > 1) { 3066 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 3067 "dq_sel: vif %ld dst 0x%x", 3068 (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(ipha->ipha_dst)); 3069 } 3070 3071 ASSERT(MUTEX_HELD(&t->tbf_lock)); 3072 p = priority(vifp, ipha); 3073 3074 np = &t->tbf_q; 3075 last = NULL; 3076 while ((mp = *np) != NULL) { 3077 if (p > (priority(vifp, (ipha_t *)mp->b_rptr))) { 3078 *np = mp->b_next; 3079 /* If removing the last packet, fix the tail pointer */ 3080 if (mp == t->tbf_t) 3081 t->tbf_t = last; 3082 mp->b_prev = mp->b_next = NULL; 3083 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 3084 ip_drop_output("tbf_dq_send", mp, ill); 3085 freemsg(mp); 3086 /* 3087 * It's impossible for the queue to be empty, but 3088 * we check anyway. 3089 */ 3090 if (--t->tbf_q_len == 0) { 3091 t->tbf_t = NULL; 3092 } 3093 ipst->ips_mrtstat->mrts_drop_sel++; 3094 return (1); 3095 } 3096 np = &mp->b_next; 3097 last = mp; 3098 } 3099 return (0); 3100 } 3101 3102 /* Sends packet, 2 cases - encap tunnel, phyint. 
*/ 3103 static void 3104 tbf_send_packet(struct vif *vifp, mblk_t *mp) 3105 { 3106 ipif_t *ipif = vifp->v_ipif; 3107 ill_t *ill = ipif->ipif_ill; 3108 ip_stack_t *ipst = ill->ill_ipst; 3109 conn_t *mrouter = ipst->ips_ip_g_mrouter; 3110 ipha_t *ipha; 3111 3112 ipha = (ipha_t *)mp->b_rptr; 3113 /* If encap tunnel options */ 3114 if (vifp->v_flags & VIFF_TUNNEL) { 3115 ip_xmit_attr_t ixas; 3116 3117 if (ipst->ips_ip_mrtdebug > 1) { 3118 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 3119 "tbf_send_packet: ENCAP tunnel vif %ld", 3120 (ptrdiff_t)(vifp - ipst->ips_vifs)); 3121 } 3122 bzero(&ixas, sizeof (ixas)); 3123 ixas.ixa_flags = 3124 IXAF_IS_IPV4 | IXAF_NO_TTL_CHANGE | IXAF_VERIFY_SOURCE; 3125 ixas.ixa_ipst = ipst; 3126 ixas.ixa_ifindex = 0; 3127 ixas.ixa_cred = kcred; 3128 ixas.ixa_cpid = NOPID; 3129 ixas.ixa_tsl = NULL; 3130 ixas.ixa_zoneid = GLOBAL_ZONEID; /* Multicast router in GZ */ 3131 ixas.ixa_pktlen = ntohs(ipha->ipha_length); 3132 ixas.ixa_ip_hdr_length = IPH_HDR_LENGTH(ipha); 3133 3134 /* 3135 * Feed into ip_output_simple which will set the ident field 3136 * and checksum the encapsulating header. 3137 * BSD gets the cached route vifp->v_route from ip_output() 3138 * to speed up route table lookups. Not necessary in SunOS 5.x. 3139 * One could make multicast forwarding faster by putting an 3140 * ip_xmit_attr_t in each vif thereby caching the ire/nce. 3141 */ 3142 (void) ip_output_simple(mp, &ixas); 3143 ixa_cleanup(&ixas); 3144 return; 3145 3146 /* phyint */ 3147 } else { 3148 /* Need to loop back to members on the outgoing interface. 
*/ 3149 ipaddr_t dst; 3150 ip_recv_attr_t iras; 3151 nce_t *nce; 3152 3153 bzero(&iras, sizeof (iras)); 3154 iras.ira_flags = IRAF_IS_IPV4; 3155 iras.ira_ill = iras.ira_rill = ill; 3156 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex; 3157 iras.ira_zoneid = GLOBAL_ZONEID; /* Multicast router in GZ */ 3158 iras.ira_pktlen = ntohs(ipha->ipha_length); 3159 iras.ira_ip_hdr_length = IPH_HDR_LENGTH(ipha); 3160 3161 dst = ipha->ipha_dst; 3162 if (ill_hasmembers_v4(ill, dst)) { 3163 iras.ira_flags |= IRAF_LOOPBACK_COPY; 3164 } 3165 if (ipst->ips_ip_mrtdebug > 1) { 3166 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 3167 "tbf_send_pkt: phyint forward vif %ld dst = 0x%x", 3168 (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(dst)); 3169 } 3170 /* 3171 * Find an NCE which matches the nexthop. 3172 * For a pt-pt interface we use the other end of the pt-pt 3173 * link. 3174 */ 3175 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 3176 dst = ipif->ipif_pp_dst_addr; 3177 nce = arp_nce_init(ill, dst, ill->ill_net_type); 3178 } else { 3179 nce = arp_nce_init(ill, dst, IRE_MULTICAST); 3180 } 3181 if (nce == NULL) { 3182 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 3183 ip_drop_output("tbf_send_packet - no nce", mp, ill); 3184 freemsg(mp); 3185 return; 3186 } 3187 3188 /* 3189 * We don't remeber the incoming ill. Thus we 3190 * pretend the packet arrived on the outbound ill. This means 3191 * statistics for input errors will be increased on the wrong 3192 * ill but that isn't a big deal. 3193 */ 3194 ip_forward_xmit_v4(nce, ill, mp, ipha, &iras, ill->ill_mc_mtu, 3195 0); 3196 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); 3197 3198 nce_refrele(nce); 3199 } 3200 } 3201 3202 /* 3203 * Determine the current time and then the elapsed time (between the last time 3204 * and time now). Update the no. of tokens in the bucket. 
3205 */ 3206 static void 3207 tbf_update_tokens(struct vif *vifp) 3208 { 3209 timespec_t tp; 3210 hrtime_t tm; 3211 struct tbf *t = vifp->v_tbf; 3212 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 3213 conn_t *mrouter = ipst->ips_ip_g_mrouter; 3214 3215 ASSERT(MUTEX_HELD(&t->tbf_lock)); 3216 3217 /* Time in secs and nsecs, rate limit in kbits/sec */ 3218 gethrestime(&tp); 3219 3220 /*LINTED*/ 3221 TV_DELTA(tp, t->tbf_last_pkt_t, tm); 3222 3223 /* 3224 * This formula is actually 3225 * "time in seconds" * "bytes/second". Scaled for nsec. 3226 * (tm/1000000000) * (v_rate_limit * 1000 * (1000/1024) /8) 3227 * 3228 * The (1000/1024) was introduced in add_vif to optimize 3229 * this divide into a shift. 3230 */ 3231 t->tbf_n_tok += (tm/1000) * vifp->v_rate_limit / 1024 / 8; 3232 t->tbf_last_pkt_t = tp; 3233 3234 if (t->tbf_n_tok > MAX_BKT_SIZE) 3235 t->tbf_n_tok = MAX_BKT_SIZE; 3236 if (ipst->ips_ip_mrtdebug > 1) { 3237 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 3238 "tbf_update_tok: tm %lld tok %d vif %ld", 3239 tm, t->tbf_n_tok, (ptrdiff_t)(vifp - ipst->ips_vifs)); 3240 } 3241 } 3242 3243 /* 3244 * Priority currently is based on port nos. 3245 * Different forwarding mechanisms have different ways 3246 * of obtaining the port no. Hence, the vif must be 3247 * given along with the packet itself. 
3248 * 3249 */ 3250 static int 3251 priority(struct vif *vifp, ipha_t *ipha) 3252 { 3253 int prio; 3254 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 3255 conn_t *mrouter = ipst->ips_ip_g_mrouter; 3256 3257 /* Temporary hack; may add general packet classifier some day */ 3258 3259 ASSERT(MUTEX_HELD(&vifp->v_tbf->tbf_lock)); 3260 3261 /* 3262 * The UDP port space is divided up into four priority ranges: 3263 * [0, 16384) : unclassified - lowest priority 3264 * [16384, 32768) : audio - highest priority 3265 * [32768, 49152) : whiteboard - medium priority 3266 * [49152, 65536) : video - low priority 3267 */ 3268 3269 if (ipha->ipha_protocol == IPPROTO_UDP) { 3270 struct udphdr *udp = 3271 (struct udphdr *)((char *)ipha + IPH_HDR_LENGTH(ipha)); 3272 switch (ntohs(udp->uh_dport) & 0xc000) { 3273 case 0x4000: 3274 prio = 70; 3275 break; 3276 case 0x8000: 3277 prio = 60; 3278 break; 3279 case 0xc000: 3280 prio = 55; 3281 break; 3282 default: 3283 prio = 50; 3284 break; 3285 } 3286 if (ipst->ips_ip_mrtdebug > 1) { 3287 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 3288 "priority: port %x prio %d\n", 3289 ntohs(udp->uh_dport), prio); 3290 } 3291 } else 3292 prio = 50; /* default priority */ 3293 return (prio); 3294 } 3295 3296 /* 3297 * End of token bucket filter modifications 3298 */ 3299 3300 3301 3302 /* 3303 * Produces data for netstat -M. 3304 */ 3305 int 3306 ip_mroute_stats(mblk_t *mp, ip_stack_t *ipst) 3307 { 3308 ipst->ips_mrtstat->mrts_vifctlSize = sizeof (struct vifctl); 3309 ipst->ips_mrtstat->mrts_mfcctlSize = sizeof (struct mfcctl); 3310 if (!snmp_append_data(mp, (char *)ipst->ips_mrtstat, 3311 sizeof (struct mrtstat))) { 3312 ip0dbg(("ip_mroute_stats: failed %ld bytes\n", 3313 (size_t)sizeof (struct mrtstat))); 3314 return (0); 3315 } 3316 return (1); 3317 } 3318 3319 /* 3320 * Sends info for SNMP's MIB. 
3321 */ 3322 int 3323 ip_mroute_vif(mblk_t *mp, ip_stack_t *ipst) 3324 { 3325 struct vifctl vi; 3326 vifi_t vifi; 3327 3328 mutex_enter(&ipst->ips_numvifs_mutex); 3329 for (vifi = 0; vifi < ipst->ips_numvifs; vifi++) { 3330 if (ipst->ips_vifs[vifi].v_lcl_addr.s_addr == 0) 3331 continue; 3332 /* 3333 * No locks here, an approximation is fine. 3334 */ 3335 vi.vifc_vifi = vifi; 3336 vi.vifc_flags = ipst->ips_vifs[vifi].v_flags; 3337 vi.vifc_threshold = ipst->ips_vifs[vifi].v_threshold; 3338 vi.vifc_rate_limit = ipst->ips_vifs[vifi].v_rate_limit; 3339 vi.vifc_lcl_addr = ipst->ips_vifs[vifi].v_lcl_addr; 3340 vi.vifc_rmt_addr = ipst->ips_vifs[vifi].v_rmt_addr; 3341 vi.vifc_pkt_in = ipst->ips_vifs[vifi].v_pkt_in; 3342 vi.vifc_pkt_out = ipst->ips_vifs[vifi].v_pkt_out; 3343 3344 if (!snmp_append_data(mp, (char *)&vi, sizeof (vi))) { 3345 ip0dbg(("ip_mroute_vif: failed %ld bytes\n", 3346 (size_t)sizeof (vi))); 3347 mutex_exit(&ipst->ips_numvifs_mutex); 3348 return (0); 3349 } 3350 } 3351 mutex_exit(&ipst->ips_numvifs_mutex); 3352 return (1); 3353 } 3354 3355 /* 3356 * Called by ip_snmp_get to send up multicast routing table. 3357 */ 3358 int 3359 ip_mroute_mrt(mblk_t *mp, ip_stack_t *ipst) 3360 { 3361 int i, j; 3362 struct mfc *rt; 3363 struct mfcctl mfcc; 3364 3365 /* 3366 * Make sure multicast has not been turned off. 
3367 */ 3368 if (is_mrouter_off(ipst)) 3369 return (1); 3370 3371 /* Loop over all hash buckets and their chains */ 3372 for (i = 0; i < MFCTBLSIZ; i++) { 3373 MFCB_REFHOLD(&ipst->ips_mfcs[i]); 3374 for (rt = ipst->ips_mfcs[i].mfcb_mfc; rt; rt = rt->mfc_next) { 3375 mutex_enter(&rt->mfc_mutex); 3376 if (rt->mfc_rte != NULL || 3377 (rt->mfc_marks & MFCB_MARK_CONDEMNED)) { 3378 mutex_exit(&rt->mfc_mutex); 3379 continue; 3380 } 3381 mfcc.mfcc_origin = rt->mfc_origin; 3382 mfcc.mfcc_mcastgrp = rt->mfc_mcastgrp; 3383 mfcc.mfcc_parent = rt->mfc_parent; 3384 mfcc.mfcc_pkt_cnt = rt->mfc_pkt_cnt; 3385 mutex_enter(&ipst->ips_numvifs_mutex); 3386 for (j = 0; j < (int)ipst->ips_numvifs; j++) 3387 mfcc.mfcc_ttls[j] = rt->mfc_ttls[j]; 3388 for (j = (int)ipst->ips_numvifs; j < MAXVIFS; j++) 3389 mfcc.mfcc_ttls[j] = 0; 3390 mutex_exit(&ipst->ips_numvifs_mutex); 3391 3392 mutex_exit(&rt->mfc_mutex); 3393 if (!snmp_append_data(mp, (char *)&mfcc, 3394 sizeof (mfcc))) { 3395 MFCB_REFRELE(&ipst->ips_mfcs[i]); 3396 ip0dbg(("ip_mroute_mrt: failed %ld bytes\n", 3397 (size_t)sizeof (mfcc))); 3398 return (0); 3399 } 3400 } 3401 MFCB_REFRELE(&ipst->ips_mfcs[i]); 3402 } 3403 return (1); 3404 } 3405