1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 /* Copyright (c) 1990 Mentat Inc. */ 25 26 /* 27 * Procedures for the kernel part of DVMRP, 28 * a Distance-Vector Multicast Routing Protocol. 29 * (See RFC-1075) 30 * Written by David Waitzman, BBN Labs, August 1988. 31 * Modified by Steve Deering, Stanford, February 1989. 32 * Modified by Mark J. Steiglitz, Stanford, May, 1991 33 * Modified by Van Jacobson, LBL, January 1993 34 * Modified by Ajit Thyagarajan, PARC, August 1993 35 * Modified by Bill Fenner, PARC, April 1995 36 * 37 * MROUTING 3.5 38 */ 39 40 /* 41 * TODO 42 * - function pointer field in vif, void *vif_sendit() 43 */ 44 45 #include <sys/types.h> 46 #include <sys/stream.h> 47 #include <sys/stropts.h> 48 #include <sys/strlog.h> 49 #include <sys/systm.h> 50 #include <sys/ddi.h> 51 #include <sys/cmn_err.h> 52 #include <sys/zone.h> 53 54 #include <sys/param.h> 55 #include <sys/socket.h> 56 #include <sys/vtrace.h> 57 #include <sys/debug.h> 58 #include <net/if.h> 59 #include <sys/sockio.h> 60 #include <netinet/in.h> 61 #include <net/if_dl.h> 62 63 #include <inet/ipsec_impl.h> 64 #include <inet/common.h> 65 #include <inet/mi.h> 66 #include <inet/nd.h> 67 #include <inet/tunables.h> 68 #include <inet/mib2.h> 69 #include <netinet/ip6.h> 70 #include <inet/ip.h> 71 #include <inet/snmpcom.h> 72 73 #include <netinet/igmp.h> 74 #include <netinet/igmp_var.h> 75 #include <netinet/udp.h> 76 #include <netinet/ip_mroute.h> 77 #include <inet/ip_multi.h> 78 #include <inet/ip_ire.h> 79 #include <inet/ip_ndp.h> 80 #include <inet/ip_if.h> 81 #include <inet/ipclassifier.h> 82 83 #include <netinet/pim.h> 84 85 86 /* 87 * MT Design: 88 * 89 * There are three main data structures viftable, mfctable and tbftable that 90 * need to be protected against MT races. 91 * 92 * vitable is a fixed length array of vif structs. There is no lock to protect 93 * the whole array, instead each struct is protected by its own indiviual lock. 94 * The value of v_marks in conjuction with the value of v_refcnt determines the 95 * current state of a vif structure. One special state that needs mention 96 * is when the vif is marked VIF_MARK_NOTINUSE but refcnt != 0. This indicates 97 * that vif is being initalized. 98 * Each structure is freed when the refcnt goes down to zero. If a delete comes 99 * in when the recfnt is > 1, the vif structure is marked VIF_MARK_CONDEMNED 100 * which prevents the struct from further use. When the refcnt goes to zero 101 * the struct is freed and is marked VIF_MARK_NOTINUSE. 102 * vif struct stores a pointer to the ipif in v_ipif, to prevent ipif/ill 103 * from going away a refhold is put on the ipif before using it. see 104 * lock_good_vif() and unlock_good_vif(). 105 * 106 * VIF_REFHOLD and VIF_REFRELE macros have been provided to manipulate refcnts 107 * of the vif struct. 108 * 109 * tbftable is also a fixed length array of tbf structs and is only accessed 110 * via v_tbf. It is protected by its own lock tbf_lock. 111 * 112 * Lock Ordering is 113 * v_lock --> tbf_lock 114 * v_lock --> ill_locK 115 * 116 * mfctable is a fixed size hash table of mfc buckets strcuts (struct mfcb). 117 * Each mfc bucket struct (struct mfcb) maintains a refcnt for each walker, 118 * it also maintains a state. These fields are protected by a lock (mfcb_lock). 119 * mfc structs only maintain a state and have no refcnt. mfc_mutex is used to 120 * protect the struct elements. 121 * 122 * mfc structs are dynamically allocated and are singly linked 123 * at the head of the chain. When an mfc structure is to be deleted 124 * it is marked condemned and so is the state in the bucket struct. 125 * When the last walker of the hash bucket exits all the mfc structs 126 * marked condemed are freed. 127 * 128 * Locking Hierarchy: 129 * The bucket lock should be acquired before the mfc struct lock. 130 * MFCB_REFHOLD and MFCB_REFRELE macros are provided for locking 131 * operations on the bucket struct. 132 * 133 * last_encap_lock and numvifs_mutex should be acquired after 134 * acquring vif or mfc locks. These locks protect some global variables. 135 * 136 * The statistics are not currently protected by a lock 137 * causing the stats be be approximate, not exact. 138 */ 139 140 #define NO_VIF MAXVIFS /* from mrouted, no route for src */ 141 142 /* 143 * Timeouts: 144 * Upcall timeouts - BSD uses boolean_t mfc->expire and 145 * nexpire[MFCTBLSIZE], the number of times expire has been called. 146 * SunOS 5.x uses mfc->timeout for each mfc. 147 * Some Unixes are limited in the number of simultaneous timeouts 148 * that can be run, SunOS 5.x does not have this restriction. 149 */ 150 151 /* 152 * In BSD, EXPIRE_TIMEOUT is how often expire_upcalls() is called and 153 * UPCALL_EXPIRE is the nmber of timeouts before a particular upcall 154 * expires. Thus the time till expiration is EXPIRE_TIMEOUT * UPCALL_EXPIRE 155 */ 156 #define EXPIRE_TIMEOUT (hz/4) /* 4x / second */ 157 #define UPCALL_EXPIRE 6 /* number of timeouts */ 158 159 /* 160 * Hash function for a source, group entry 161 */ 162 #define MFCHASH(a, g) MFCHASHMOD(((a) >> 20) ^ ((a) >> 10) ^ (a) ^ \ 163 ((g) >> 20) ^ ((g) >> 10) ^ (g)) 164 165 #define TBF_REPROCESS (hz / 100) /* 100x /second */ 166 167 /* Identify PIM packet that came on a Register interface */ 168 #define PIM_REGISTER_MARKER 0xffffffff 169 170 /* Function declarations */ 171 static int add_mfc(struct mfcctl *, ip_stack_t *); 172 static int add_vif(struct vifctl *, conn_t *, ip_stack_t *); 173 static int del_mfc(struct mfcctl *, ip_stack_t *); 174 static int del_vif(vifi_t *, ip_stack_t *); 175 static void del_vifp(struct vif *); 176 static void encap_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t); 177 static void expire_upcalls(void *); 178 static void fill_route(struct mfc *, struct mfcctl *, ip_stack_t *); 179 static void free_queue(struct mfc *); 180 static int get_assert(uchar_t *, ip_stack_t *); 181 static int get_lsg_cnt(struct sioc_lsg_req *, ip_stack_t *); 182 static int get_sg_cnt(struct sioc_sg_req *, ip_stack_t *); 183 static int get_version(uchar_t *); 184 static int get_vif_cnt(struct sioc_vif_req *, ip_stack_t *); 185 static int ip_mdq(mblk_t *, ipha_t *, ill_t *, 186 ipaddr_t, struct mfc *); 187 static int ip_mrouter_init(conn_t *, uchar_t *, int, ip_stack_t *); 188 static void phyint_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t); 189 static int register_mforward(mblk_t *, ip_recv_attr_t *); 190 static void register_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t); 191 static int set_assert(int *, ip_stack_t *); 192 193 /* 194 * Token Bucket Filter functions 195 */ 196 static int priority(struct vif *, ipha_t *); 197 static void tbf_control(struct vif *, mblk_t *, ipha_t *); 198 static int tbf_dq_sel(struct vif *, ipha_t *); 199 static void tbf_process_q(struct vif *); 200 static void tbf_queue(struct vif *, mblk_t *); 201 static void tbf_reprocess_q(void *); 202 static void tbf_send_packet(struct vif *, mblk_t *); 203 static void tbf_update_tokens(struct vif *); 204 static void release_mfc(struct mfcb *); 205 206 static boolean_t is_mrouter_off(ip_stack_t *); 207 /* 208 * Encapsulation packets 209 */ 210 211 #define ENCAP_TTL 64 212 213 /* prototype IP hdr for encapsulated packets */ 214 static ipha_t multicast_encap_iphdr = { 215 IP_SIMPLE_HDR_VERSION, 216 0, /* tos */ 217 sizeof (ipha_t), /* total length */ 218 0, /* id */ 219 0, /* frag offset */ 220 ENCAP_TTL, IPPROTO_ENCAP, 221 0, /* checksum */ 222 }; 223 224 /* 225 * Rate limit for assert notification messages, in nsec. 226 */ 227 #define ASSERT_MSG_TIME 3000000000 228 229 230 #define VIF_REFHOLD(vifp) { \ 231 mutex_enter(&(vifp)->v_lock); \ 232 (vifp)->v_refcnt++; \ 233 mutex_exit(&(vifp)->v_lock); \ 234 } 235 236 #define VIF_REFRELE_LOCKED(vifp) { \ 237 (vifp)->v_refcnt--; \ 238 if ((vifp)->v_refcnt == 0 && \ 239 ((vifp)->v_marks & VIF_MARK_CONDEMNED)) { \ 240 del_vifp(vifp); \ 241 } else { \ 242 mutex_exit(&(vifp)->v_lock); \ 243 } \ 244 } 245 246 #define VIF_REFRELE(vifp) { \ 247 mutex_enter(&(vifp)->v_lock); \ 248 (vifp)->v_refcnt--; \ 249 if ((vifp)->v_refcnt == 0 && \ 250 ((vifp)->v_marks & VIF_MARK_CONDEMNED)) { \ 251 del_vifp(vifp); \ 252 } else { \ 253 mutex_exit(&(vifp)->v_lock); \ 254 } \ 255 } 256 257 #define MFCB_REFHOLD(mfcb) { \ 258 mutex_enter(&(mfcb)->mfcb_lock); \ 259 (mfcb)->mfcb_refcnt++; \ 260 ASSERT((mfcb)->mfcb_refcnt != 0); \ 261 mutex_exit(&(mfcb)->mfcb_lock); \ 262 } 263 264 #define MFCB_REFRELE(mfcb) { \ 265 mutex_enter(&(mfcb)->mfcb_lock); \ 266 ASSERT((mfcb)->mfcb_refcnt != 0); \ 267 if (--(mfcb)->mfcb_refcnt == 0 && \ 268 ((mfcb)->mfcb_marks & MFCB_MARK_CONDEMNED)) { \ 269 release_mfc(mfcb); \ 270 } \ 271 mutex_exit(&(mfcb)->mfcb_lock); \ 272 } 273 274 /* 275 * MFCFIND: 276 * Find a route for a given origin IP address and multicast group address. 277 * Skip entries with pending upcalls. 278 * Type of service parameter to be added in the future! 279 */ 280 #define MFCFIND(mfcbp, o, g, rt) { \ 281 struct mfc *_mb_rt = NULL; \ 282 rt = NULL; \ 283 _mb_rt = mfcbp->mfcb_mfc; \ 284 while (_mb_rt) { \ 285 if ((_mb_rt->mfc_origin.s_addr == o) && \ 286 (_mb_rt->mfc_mcastgrp.s_addr == g) && \ 287 (_mb_rt->mfc_rte == NULL) && \ 288 (!(_mb_rt->mfc_marks & MFCB_MARK_CONDEMNED))) { \ 289 rt = _mb_rt; \ 290 break; \ 291 } \ 292 _mb_rt = _mb_rt->mfc_next; \ 293 } \ 294 } 295 296 /* 297 * BSD uses timeval with sec and usec. In SunOS 5.x uniqtime() and gethrtime() 298 * are inefficient. We use gethrestime() which returns a timespec_t with 299 * sec and nsec, the resolution is machine dependent. 300 * The following 2 macros have been changed to use nsec instead of usec. 301 */ 302 /* 303 * Macros to compute elapsed time efficiently. 304 * Borrowed from Van Jacobson's scheduling code. 305 * Delta should be a hrtime_t. 306 */ 307 #define TV_DELTA(a, b, delta) { \ 308 int xxs; \ 309 \ 310 delta = (a).tv_nsec - (b).tv_nsec; \ 311 if ((xxs = (a).tv_sec - (b).tv_sec) != 0) { \ 312 switch (xxs) { \ 313 case 2: \ 314 delta += 1000000000; \ 315 /*FALLTHROUGH*/ \ 316 case 1: \ 317 delta += 1000000000; \ 318 break; \ 319 default: \ 320 delta += (1000000000 * xxs); \ 321 } \ 322 } \ 323 } 324 325 #define TV_LT(a, b) (((a).tv_nsec < (b).tv_nsec && \ 326 (a).tv_sec <= (b).tv_sec) || (a).tv_sec < (b).tv_sec) 327 328 /* 329 * Handle MRT setsockopt commands to modify the multicast routing tables. 330 */ 331 int 332 ip_mrouter_set(int cmd, conn_t *connp, int checkonly, uchar_t *data, 333 int datalen) 334 { 335 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 336 337 mutex_enter(&ipst->ips_ip_g_mrouter_mutex); 338 if (cmd != MRT_INIT && connp != ipst->ips_ip_g_mrouter) { 339 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 340 return (EACCES); 341 } 342 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 343 344 if (checkonly) { 345 /* 346 * do not do operation, just pretend to - new T_CHECK 347 * Note: Even routines further on can probably fail but 348 * this T_CHECK stuff is only to please XTI so it not 349 * necessary to be perfect. 350 */ 351 switch (cmd) { 352 case MRT_INIT: 353 case MRT_DONE: 354 case MRT_ADD_VIF: 355 case MRT_DEL_VIF: 356 case MRT_ADD_MFC: 357 case MRT_DEL_MFC: 358 case MRT_ASSERT: 359 return (0); 360 default: 361 return (EOPNOTSUPP); 362 } 363 } 364 365 /* 366 * make sure no command is issued after multicast routing has been 367 * turned off. 368 */ 369 if (cmd != MRT_INIT && cmd != MRT_DONE) { 370 if (is_mrouter_off(ipst)) 371 return (EINVAL); 372 } 373 374 switch (cmd) { 375 case MRT_INIT: return (ip_mrouter_init(connp, data, datalen, ipst)); 376 case MRT_DONE: return (ip_mrouter_done(ipst)); 377 case MRT_ADD_VIF: return (add_vif((struct vifctl *)data, connp, ipst)); 378 case MRT_DEL_VIF: return (del_vif((vifi_t *)data, ipst)); 379 case MRT_ADD_MFC: return (add_mfc((struct mfcctl *)data, ipst)); 380 case MRT_DEL_MFC: return (del_mfc((struct mfcctl *)data, ipst)); 381 case MRT_ASSERT: return (set_assert((int *)data, ipst)); 382 default: return (EOPNOTSUPP); 383 } 384 } 385 386 /* 387 * Handle MRT getsockopt commands 388 */ 389 int 390 ip_mrouter_get(int cmd, conn_t *connp, uchar_t *data) 391 { 392 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 393 394 if (connp != ipst->ips_ip_g_mrouter) 395 return (EACCES); 396 397 switch (cmd) { 398 case MRT_VERSION: return (get_version((uchar_t *)data)); 399 case MRT_ASSERT: return (get_assert((uchar_t *)data, ipst)); 400 default: return (EOPNOTSUPP); 401 } 402 } 403 404 /* 405 * Handle ioctl commands to obtain information from the cache. 406 * Called with shared access to IP. These are read_only ioctls. 407 */ 408 /* ARGSUSED */ 409 int 410 mrt_ioctl(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 411 ip_ioctl_cmd_t *ipip, void *if_req) 412 { 413 mblk_t *mp1; 414 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 415 conn_t *connp = Q_TO_CONN(q); 416 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 417 418 /* Existence verified in ip_wput_nondata */ 419 mp1 = mp->b_cont->b_cont; 420 421 switch (iocp->ioc_cmd) { 422 case (SIOCGETVIFCNT): 423 return (get_vif_cnt((struct sioc_vif_req *)mp1->b_rptr, ipst)); 424 case (SIOCGETSGCNT): 425 return (get_sg_cnt((struct sioc_sg_req *)mp1->b_rptr, ipst)); 426 case (SIOCGETLSGCNT): 427 return (get_lsg_cnt((struct sioc_lsg_req *)mp1->b_rptr, ipst)); 428 default: 429 return (EINVAL); 430 } 431 } 432 433 /* 434 * Returns the packet, byte, rpf-failure count for the source, group provided. 435 */ 436 static int 437 get_sg_cnt(struct sioc_sg_req *req, ip_stack_t *ipst) 438 { 439 struct mfc *rt; 440 struct mfcb *mfcbp; 441 442 mfcbp = &ipst->ips_mfcs[MFCHASH(req->src.s_addr, req->grp.s_addr)]; 443 MFCB_REFHOLD(mfcbp); 444 MFCFIND(mfcbp, req->src.s_addr, req->grp.s_addr, rt); 445 446 if (rt != NULL) { 447 mutex_enter(&rt->mfc_mutex); 448 req->pktcnt = rt->mfc_pkt_cnt; 449 req->bytecnt = rt->mfc_byte_cnt; 450 req->wrong_if = rt->mfc_wrong_if; 451 mutex_exit(&rt->mfc_mutex); 452 } else 453 req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffffU; 454 455 MFCB_REFRELE(mfcbp); 456 return (0); 457 } 458 459 /* 460 * Returns the packet, byte, rpf-failure count for the source, group provided. 461 * Uses larger counters and IPv6 addresses. 462 */ 463 /* ARGSUSED XXX until implemented */ 464 static int 465 get_lsg_cnt(struct sioc_lsg_req *req, ip_stack_t *ipst) 466 { 467 /* XXX TODO SIOCGETLSGCNT */ 468 return (ENXIO); 469 } 470 471 /* 472 * Returns the input and output packet and byte counts on the vif provided. 473 */ 474 static int 475 get_vif_cnt(struct sioc_vif_req *req, ip_stack_t *ipst) 476 { 477 vifi_t vifi = req->vifi; 478 479 if (vifi >= ipst->ips_numvifs) 480 return (EINVAL); 481 482 /* 483 * No locks here, an approximation is fine. 484 */ 485 req->icount = ipst->ips_vifs[vifi].v_pkt_in; 486 req->ocount = ipst->ips_vifs[vifi].v_pkt_out; 487 req->ibytes = ipst->ips_vifs[vifi].v_bytes_in; 488 req->obytes = ipst->ips_vifs[vifi].v_bytes_out; 489 490 return (0); 491 } 492 493 static int 494 get_version(uchar_t *data) 495 { 496 int *v = (int *)data; 497 498 *v = 0x0305; /* XXX !!!! */ 499 500 return (0); 501 } 502 503 /* 504 * Set PIM assert processing global. 505 */ 506 static int 507 set_assert(int *i, ip_stack_t *ipst) 508 { 509 if ((*i != 1) && (*i != 0)) 510 return (EINVAL); 511 512 ipst->ips_pim_assert = *i; 513 514 return (0); 515 } 516 517 /* 518 * Get PIM assert processing global. 519 */ 520 static int 521 get_assert(uchar_t *data, ip_stack_t *ipst) 522 { 523 int *i = (int *)data; 524 525 *i = ipst->ips_pim_assert; 526 527 return (0); 528 } 529 530 /* 531 * Enable multicast routing. 532 */ 533 static int 534 ip_mrouter_init(conn_t *connp, uchar_t *data, int datalen, ip_stack_t *ipst) 535 { 536 int *v; 537 538 if (data == NULL || (datalen != sizeof (int))) 539 return (ENOPROTOOPT); 540 541 v = (int *)data; 542 if (*v != 1) 543 return (ENOPROTOOPT); 544 545 mutex_enter(&ipst->ips_ip_g_mrouter_mutex); 546 if (ipst->ips_ip_g_mrouter != NULL) { 547 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 548 return (EADDRINUSE); 549 } 550 551 /* 552 * MRT_INIT should only be allowed for RAW sockets, but we double 553 * check. 554 */ 555 if (!IPCL_IS_RAWIP(connp)) { 556 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 557 return (EINVAL); 558 } 559 560 ipst->ips_ip_g_mrouter = connp; 561 connp->conn_multi_router = 1; 562 /* In order for tunnels to work we have to turn ip_g_forward on */ 563 if (!WE_ARE_FORWARDING(ipst)) { 564 if (ipst->ips_ip_mrtdebug > 1) { 565 (void) mi_strlog(connp->conn_rq, 1, SL_TRACE, 566 "ip_mrouter_init: turning on forwarding"); 567 } 568 ipst->ips_saved_ip_forwarding = ipst->ips_ip_forwarding; 569 ipst->ips_ip_forwarding = IP_FORWARD_ALWAYS; 570 } 571 572 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 573 return (0); 574 } 575 576 void 577 ip_mrouter_stack_init(ip_stack_t *ipst) 578 { 579 mutex_init(&ipst->ips_ip_g_mrouter_mutex, NULL, MUTEX_DEFAULT, NULL); 580 581 ipst->ips_vifs = kmem_zalloc(sizeof (struct vif) * (MAXVIFS+1), 582 KM_SLEEP); 583 ipst->ips_mrtstat = kmem_zalloc(sizeof (struct mrtstat), KM_SLEEP); 584 /* 585 * mfctable: 586 * Includes all mfcs, including waiting upcalls. 587 * Multiple mfcs per bucket. 588 */ 589 ipst->ips_mfcs = kmem_zalloc(sizeof (struct mfcb) * MFCTBLSIZ, 590 KM_SLEEP); 591 /* 592 * Define the token bucket filter structures. 593 * tbftable -> each vif has one of these for storing info. 594 */ 595 ipst->ips_tbfs = kmem_zalloc(sizeof (struct tbf) * MAXVIFS, KM_SLEEP); 596 597 mutex_init(&ipst->ips_last_encap_lock, NULL, MUTEX_DEFAULT, NULL); 598 599 ipst->ips_mrtstat->mrts_vifctlSize = sizeof (struct vifctl); 600 ipst->ips_mrtstat->mrts_mfcctlSize = sizeof (struct mfcctl); 601 } 602 603 /* 604 * Disable multicast routing. 605 * Didn't use global timeout_val (BSD version), instead check the mfctable. 606 */ 607 int 608 ip_mrouter_done(ip_stack_t *ipst) 609 { 610 conn_t *mrouter; 611 vifi_t vifi; 612 struct mfc *mfc_rt; 613 int i; 614 615 mutex_enter(&ipst->ips_ip_g_mrouter_mutex); 616 if (ipst->ips_ip_g_mrouter == NULL) { 617 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 618 return (EINVAL); 619 } 620 621 mrouter = ipst->ips_ip_g_mrouter; 622 623 if (ipst->ips_saved_ip_forwarding != -1) { 624 if (ipst->ips_ip_mrtdebug > 1) { 625 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 626 "ip_mrouter_done: turning off forwarding"); 627 } 628 ipst->ips_ip_forwarding = ipst->ips_saved_ip_forwarding; 629 ipst->ips_saved_ip_forwarding = -1; 630 } 631 632 /* 633 * Always clear cache when vifs change. 634 * No need to get ipst->ips_last_encap_lock since we are running as 635 * a writer. 636 */ 637 mutex_enter(&ipst->ips_last_encap_lock); 638 ipst->ips_last_encap_src = 0; 639 ipst->ips_last_encap_vif = NULL; 640 mutex_exit(&ipst->ips_last_encap_lock); 641 mrouter->conn_multi_router = 0; 642 643 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 644 645 /* 646 * For each phyint in use, 647 * disable promiscuous reception of all IP multicasts. 648 */ 649 for (vifi = 0; vifi < MAXVIFS; vifi++) { 650 struct vif *vifp = ipst->ips_vifs + vifi; 651 652 mutex_enter(&vifp->v_lock); 653 /* 654 * if the vif is active mark it condemned. 655 */ 656 if (vifp->v_marks & VIF_MARK_GOOD) { 657 ASSERT(vifp->v_ipif != NULL); 658 ipif_refhold(vifp->v_ipif); 659 /* Phyint only */ 660 if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) { 661 ipif_t *ipif = vifp->v_ipif; 662 ilm_t *ilm = vifp->v_ilm; 663 664 vifp->v_ilm = NULL; 665 vifp->v_marks &= ~VIF_MARK_GOOD; 666 vifp->v_marks |= VIF_MARK_CONDEMNED; 667 668 mutex_exit(&(vifp)->v_lock); 669 if (ilm != NULL) { 670 ill_t *ill = ipif->ipif_ill; 671 672 (void) ip_delmulti(ilm); 673 ASSERT(ill->ill_mrouter_cnt > 0); 674 atomic_dec_32(&ill->ill_mrouter_cnt); 675 } 676 mutex_enter(&vifp->v_lock); 677 } 678 ipif_refrele(vifp->v_ipif); 679 /* 680 * decreases the refcnt added in add_vif. 681 * and release v_lock. 682 */ 683 VIF_REFRELE_LOCKED(vifp); 684 } else { 685 mutex_exit(&vifp->v_lock); 686 continue; 687 } 688 } 689 690 mutex_enter(&ipst->ips_numvifs_mutex); 691 ipst->ips_numvifs = 0; 692 ipst->ips_pim_assert = 0; 693 ipst->ips_reg_vif_num = ALL_VIFS; 694 mutex_exit(&ipst->ips_numvifs_mutex); 695 696 /* 697 * Free upcall msgs. 698 * Go through mfctable and stop any outstanding upcall 699 * timeouts remaining on mfcs. 700 */ 701 for (i = 0; i < MFCTBLSIZ; i++) { 702 mutex_enter(&ipst->ips_mfcs[i].mfcb_lock); 703 ipst->ips_mfcs[i].mfcb_refcnt++; 704 ipst->ips_mfcs[i].mfcb_marks |= MFCB_MARK_CONDEMNED; 705 mutex_exit(&ipst->ips_mfcs[i].mfcb_lock); 706 mfc_rt = ipst->ips_mfcs[i].mfcb_mfc; 707 while (mfc_rt) { 708 /* Free upcalls */ 709 mutex_enter(&mfc_rt->mfc_mutex); 710 if (mfc_rt->mfc_rte != NULL) { 711 if (mfc_rt->mfc_timeout_id != 0) { 712 /* 713 * OK to drop the lock as we have 714 * a refcnt on the bucket. timeout 715 * can fire but it will see that 716 * mfc_timeout_id == 0 and not do 717 * anything. see expire_upcalls(). 718 */ 719 mfc_rt->mfc_timeout_id = 0; 720 mutex_exit(&mfc_rt->mfc_mutex); 721 (void) untimeout( 722 mfc_rt->mfc_timeout_id); 723 mfc_rt->mfc_timeout_id = 0; 724 mutex_enter(&mfc_rt->mfc_mutex); 725 726 /* 727 * all queued upcall packets 728 * and mblk will be freed in 729 * release_mfc(). 730 */ 731 } 732 } 733 734 mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED; 735 736 mutex_exit(&mfc_rt->mfc_mutex); 737 mfc_rt = mfc_rt->mfc_next; 738 } 739 MFCB_REFRELE(&ipst->ips_mfcs[i]); 740 } 741 742 mutex_enter(&ipst->ips_ip_g_mrouter_mutex); 743 ipst->ips_ip_g_mrouter = NULL; 744 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 745 return (0); 746 } 747 748 void 749 ip_mrouter_stack_destroy(ip_stack_t *ipst) 750 { 751 struct mfcb *mfcbp; 752 struct mfc *rt; 753 int i; 754 755 for (i = 0; i < MFCTBLSIZ; i++) { 756 mfcbp = &ipst->ips_mfcs[i]; 757 758 while ((rt = mfcbp->mfcb_mfc) != NULL) { 759 (void) printf("ip_mrouter_stack_destroy: free for %d\n", 760 i); 761 762 mfcbp->mfcb_mfc = rt->mfc_next; 763 free_queue(rt); 764 mi_free(rt); 765 } 766 } 767 kmem_free(ipst->ips_vifs, sizeof (struct vif) * (MAXVIFS+1)); 768 ipst->ips_vifs = NULL; 769 kmem_free(ipst->ips_mrtstat, sizeof (struct mrtstat)); 770 ipst->ips_mrtstat = NULL; 771 kmem_free(ipst->ips_mfcs, sizeof (struct mfcb) * MFCTBLSIZ); 772 ipst->ips_mfcs = NULL; 773 kmem_free(ipst->ips_tbfs, sizeof (struct tbf) * MAXVIFS); 774 ipst->ips_tbfs = NULL; 775 776 mutex_destroy(&ipst->ips_last_encap_lock); 777 mutex_destroy(&ipst->ips_ip_g_mrouter_mutex); 778 } 779 780 static boolean_t 781 is_mrouter_off(ip_stack_t *ipst) 782 { 783 conn_t *mrouter; 784 785 mutex_enter(&ipst->ips_ip_g_mrouter_mutex); 786 if (ipst->ips_ip_g_mrouter == NULL) { 787 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 788 return (B_TRUE); 789 } 790 791 mrouter = ipst->ips_ip_g_mrouter; 792 if (mrouter->conn_multi_router == 0) { 793 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 794 return (B_TRUE); 795 } 796 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 797 return (B_FALSE); 798 } 799 800 static void 801 unlock_good_vif(struct vif *vifp) 802 { 803 ASSERT(vifp->v_ipif != NULL); 804 ipif_refrele(vifp->v_ipif); 805 VIF_REFRELE(vifp); 806 } 807 808 static boolean_t 809 lock_good_vif(struct vif *vifp) 810 { 811 mutex_enter(&vifp->v_lock); 812 if (!(vifp->v_marks & VIF_MARK_GOOD)) { 813 mutex_exit(&vifp->v_lock); 814 return (B_FALSE); 815 } 816 817 ASSERT(vifp->v_ipif != NULL); 818 mutex_enter(&vifp->v_ipif->ipif_ill->ill_lock); 819 if (!IPIF_CAN_LOOKUP(vifp->v_ipif)) { 820 mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock); 821 mutex_exit(&vifp->v_lock); 822 return (B_FALSE); 823 } 824 ipif_refhold_locked(vifp->v_ipif); 825 mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock); 826 vifp->v_refcnt++; 827 mutex_exit(&vifp->v_lock); 828 return (B_TRUE); 829 } 830 831 /* 832 * Add a vif to the vif table. 833 */ 834 static int 835 add_vif(struct vifctl *vifcp, conn_t *connp, ip_stack_t *ipst) 836 { 837 struct vif *vifp = ipst->ips_vifs + vifcp->vifc_vifi; 838 ipif_t *ipif; 839 int error = 0; 840 struct tbf *v_tbf = ipst->ips_tbfs + vifcp->vifc_vifi; 841 conn_t *mrouter = ipst->ips_ip_g_mrouter; 842 ilm_t *ilm; 843 ill_t *ill; 844 845 ASSERT(connp != NULL); 846 847 if (vifcp->vifc_vifi >= MAXVIFS) 848 return (EINVAL); 849 850 if (is_mrouter_off(ipst)) 851 return (EINVAL); 852 853 mutex_enter(&vifp->v_lock); 854 /* 855 * Viftable entry should be 0. 856 * if v_marks == 0 but v_refcnt != 0 means struct is being 857 * initialized. 858 * 859 * Also note that it is very unlikely that we will get a MRT_ADD_VIF 860 * request while the delete is in progress, mrouted only sends add 861 * requests when a new interface is added and the new interface cannot 862 * have the same vifi as an existing interface. We make sure that 863 * ill_delete will block till the vif is deleted by adding a refcnt 864 * to ipif in del_vif(). 865 */ 866 if (vifp->v_lcl_addr.s_addr != 0 || 867 vifp->v_marks != 0 || 868 vifp->v_refcnt != 0) { 869 mutex_exit(&vifp->v_lock); 870 return (EADDRINUSE); 871 } 872 873 /* Incoming vif should not be 0 */ 874 if (vifcp->vifc_lcl_addr.s_addr == 0) { 875 mutex_exit(&vifp->v_lock); 876 return (EINVAL); 877 } 878 879 vifp->v_refcnt++; 880 mutex_exit(&vifp->v_lock); 881 /* Find the interface with the local address */ 882 ipif = ipif_lookup_addr((ipaddr_t)vifcp->vifc_lcl_addr.s_addr, NULL, 883 IPCL_ZONEID(connp), ipst); 884 if (ipif == NULL) { 885 VIF_REFRELE(vifp); 886 return (EADDRNOTAVAIL); 887 } 888 889 if (ipst->ips_ip_mrtdebug > 1) { 890 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 891 "add_vif: src 0x%x enter", 892 vifcp->vifc_lcl_addr.s_addr); 893 } 894 895 mutex_enter(&vifp->v_lock); 896 /* 897 * Always clear cache when vifs change. 898 * Needed to ensure that src isn't left over from before vif was added. 899 * No need to get last_encap_lock, since we are running as a writer. 900 */ 901 902 mutex_enter(&ipst->ips_last_encap_lock); 903 ipst->ips_last_encap_src = 0; 904 ipst->ips_last_encap_vif = NULL; 905 mutex_exit(&ipst->ips_last_encap_lock); 906 907 if (vifcp->vifc_flags & VIFF_TUNNEL) { 908 if ((vifcp->vifc_flags & VIFF_SRCRT) != 0) { 909 cmn_err(CE_WARN, 910 "add_vif: source route tunnels not supported\n"); 911 VIF_REFRELE_LOCKED(vifp); 912 ipif_refrele(ipif); 913 return (EOPNOTSUPP); 914 } 915 vifp->v_rmt_addr = vifcp->vifc_rmt_addr; 916 917 } else { 918 /* Phyint or Register vif */ 919 if (vifcp->vifc_flags & VIFF_REGISTER) { 920 /* 921 * Note: Since all IPPROTO_IP level options (including 922 * MRT_ADD_VIF) are done exclusively via 923 * ip_optmgmt_writer(), a lock is not necessary to 924 * protect reg_vif_num. 925 */ 926 mutex_enter(&ipst->ips_numvifs_mutex); 927 if (ipst->ips_reg_vif_num == ALL_VIFS) { 928 ipst->ips_reg_vif_num = vifcp->vifc_vifi; 929 mutex_exit(&ipst->ips_numvifs_mutex); 930 } else { 931 mutex_exit(&ipst->ips_numvifs_mutex); 932 VIF_REFRELE_LOCKED(vifp); 933 ipif_refrele(ipif); 934 return (EADDRINUSE); 935 } 936 } 937 938 /* Make sure the interface supports multicast */ 939 if ((ipif->ipif_ill->ill_flags & ILLF_MULTICAST) == 0) { 940 VIF_REFRELE_LOCKED(vifp); 941 ipif_refrele(ipif); 942 if (vifcp->vifc_flags & VIFF_REGISTER) { 943 mutex_enter(&ipst->ips_numvifs_mutex); 944 ipst->ips_reg_vif_num = ALL_VIFS; 945 mutex_exit(&ipst->ips_numvifs_mutex); 946 } 947 return (EOPNOTSUPP); 948 } 949 /* Enable promiscuous reception of all IP mcasts from the if */ 950 mutex_exit(&vifp->v_lock); 951 952 ill = ipif->ipif_ill; 953 if (IS_UNDER_IPMP(ill)) 954 ill = ipmp_ill_hold_ipmp_ill(ill); 955 956 if (ill == NULL) { 957 ilm = NULL; 958 } else { 959 ilm = ip_addmulti(&ipv6_all_zeros, ill, 960 ipif->ipif_zoneid, &error); 961 if (ilm != NULL) 962 atomic_inc_32(&ill->ill_mrouter_cnt); 963 if (IS_UNDER_IPMP(ipif->ipif_ill)) { 964 ill_refrele(ill); 965 ill = ipif->ipif_ill; 966 } 967 } 968 969 mutex_enter(&vifp->v_lock); 970 /* 971 * since we released the lock lets make sure that 972 * ip_mrouter_done() has not been called. 973 */ 974 if (ilm == NULL || is_mrouter_off(ipst)) { 975 if (ilm != NULL) { 976 (void) ip_delmulti(ilm); 977 ASSERT(ill->ill_mrouter_cnt > 0); 978 atomic_dec_32(&ill->ill_mrouter_cnt); 979 } 980 if (vifcp->vifc_flags & VIFF_REGISTER) { 981 mutex_enter(&ipst->ips_numvifs_mutex); 982 ipst->ips_reg_vif_num = ALL_VIFS; 983 mutex_exit(&ipst->ips_numvifs_mutex); 984 } 985 VIF_REFRELE_LOCKED(vifp); 986 ipif_refrele(ipif); 987 return (error?error:EINVAL); 988 } 989 vifp->v_ilm = ilm; 990 } 991 /* Define parameters for the tbf structure */ 992 vifp->v_tbf = v_tbf; 993 gethrestime(&vifp->v_tbf->tbf_last_pkt_t); 994 vifp->v_tbf->tbf_n_tok = 0; 995 vifp->v_tbf->tbf_q_len = 0; 996 vifp->v_tbf->tbf_max_q_len = MAXQSIZE; 997 vifp->v_tbf->tbf_q = vifp->v_tbf->tbf_t = NULL; 998 999 vifp->v_flags = vifcp->vifc_flags; 1000 vifp->v_threshold = vifcp->vifc_threshold; 1001 vifp->v_lcl_addr = vifcp->vifc_lcl_addr; 1002 vifp->v_ipif = ipif; 1003 ipif_refrele(ipif); 1004 /* Scaling up here, allows division by 1024 in critical code. */ 1005 vifp->v_rate_limit = vifcp->vifc_rate_limit * (1024/1000); 1006 vifp->v_timeout_id = 0; 1007 /* initialize per vif pkt counters */ 1008 vifp->v_pkt_in = 0; 1009 vifp->v_pkt_out = 0; 1010 vifp->v_bytes_in = 0; 1011 vifp->v_bytes_out = 0; 1012 mutex_init(&vifp->v_tbf->tbf_lock, NULL, MUTEX_DEFAULT, NULL); 1013 1014 /* Adjust numvifs up, if the vifi is higher than numvifs */ 1015 mutex_enter(&ipst->ips_numvifs_mutex); 1016 if (ipst->ips_numvifs <= vifcp->vifc_vifi) 1017 ipst->ips_numvifs = vifcp->vifc_vifi + 1; 1018 mutex_exit(&ipst->ips_numvifs_mutex); 1019 1020 if (ipst->ips_ip_mrtdebug > 1) { 1021 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 1022 "add_vif: #%d, lcladdr %x, %s %x, thresh %x, rate %d", 1023 vifcp->vifc_vifi, 1024 ntohl(vifcp->vifc_lcl_addr.s_addr), 1025 (vifcp->vifc_flags & VIFF_TUNNEL) ? "rmtaddr" : "mask", 1026 ntohl(vifcp->vifc_rmt_addr.s_addr), 1027 vifcp->vifc_threshold, vifcp->vifc_rate_limit); 1028 } 1029 1030 vifp->v_marks = VIF_MARK_GOOD; 1031 mutex_exit(&vifp->v_lock); 1032 return (0); 1033 } 1034 1035 1036 /* Delete a vif from the vif table. */ 1037 static void 1038 del_vifp(struct vif *vifp) 1039 { 1040 struct tbf *t = vifp->v_tbf; 1041 mblk_t *mp0; 1042 vifi_t vifi; 1043 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 1044 conn_t *mrouter = ipst->ips_ip_g_mrouter; 1045 1046 ASSERT(vifp->v_marks & VIF_MARK_CONDEMNED); 1047 ASSERT(t != NULL); 1048 1049 if (ipst->ips_ip_mrtdebug > 1) { 1050 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 1051 "del_vif: src 0x%x\n", vifp->v_lcl_addr.s_addr); 1052 } 1053 1054 if (vifp->v_timeout_id != 0) { 1055 (void) untimeout(vifp->v_timeout_id); 1056 vifp->v_timeout_id = 0; 1057 } 1058 1059 /* 1060 * Free packets queued at the interface. 1061 * Mrouted takes care of cleaning up mfcs - makes calls to del_mfc. 1062 */ 1063 mutex_enter(&t->tbf_lock); 1064 while (t->tbf_q != NULL) { 1065 mp0 = t->tbf_q; 1066 t->tbf_q = t->tbf_q->b_next; 1067 mp0->b_prev = mp0->b_next = NULL; 1068 freemsg(mp0); 1069 } 1070 mutex_exit(&t->tbf_lock); 1071 1072 /* 1073 * Always clear cache when vifs change. 1074 * No need to get last_encap_lock since we are running as a writer. 1075 */ 1076 mutex_enter(&ipst->ips_last_encap_lock); 1077 if (vifp == ipst->ips_last_encap_vif) { 1078 ipst->ips_last_encap_vif = NULL; 1079 ipst->ips_last_encap_src = 0; 1080 } 1081 mutex_exit(&ipst->ips_last_encap_lock); 1082 1083 mutex_destroy(&t->tbf_lock); 1084 1085 bzero(vifp->v_tbf, sizeof (*(vifp->v_tbf))); 1086 1087 /* Adjust numvifs down */ 1088 mutex_enter(&ipst->ips_numvifs_mutex); 1089 for (vifi = ipst->ips_numvifs; vifi != 0; vifi--) /* vifi is unsigned */ 1090 if (ipst->ips_vifs[vifi - 1].v_lcl_addr.s_addr != 0) 1091 break; 1092 ipst->ips_numvifs = vifi; 1093 mutex_exit(&ipst->ips_numvifs_mutex); 1094 1095 bzero(vifp, sizeof (*vifp)); 1096 } 1097 1098 static int 1099 del_vif(vifi_t *vifip, ip_stack_t *ipst) 1100 { 1101 struct vif *vifp = ipst->ips_vifs + *vifip; 1102 1103 if (*vifip >= ipst->ips_numvifs) 1104 return (EINVAL); 1105 1106 mutex_enter(&vifp->v_lock); 1107 /* 1108 * Not initialized 1109 * Here we are not looking at the vif that is being initialized 1110 * i.e vifp->v_marks == 0 and refcnt > 0. 1111 */ 1112 if (vifp->v_lcl_addr.s_addr == 0 || 1113 !(vifp->v_marks & VIF_MARK_GOOD)) { 1114 mutex_exit(&vifp->v_lock); 1115 return (EADDRNOTAVAIL); 1116 } 1117 1118 /* Clear VIF_MARK_GOOD and set VIF_MARK_CONDEMNED. */ 1119 vifp->v_marks &= ~VIF_MARK_GOOD; 1120 vifp->v_marks |= VIF_MARK_CONDEMNED; 1121 1122 /* Phyint only */ 1123 if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) { 1124 ipif_t *ipif = vifp->v_ipif; 1125 ilm_t *ilm = vifp->v_ilm; 1126 1127 vifp->v_ilm = NULL; 1128 1129 ASSERT(ipif != NULL); 1130 /* 1131 * should be OK to drop the lock as we 1132 * have marked this as CONDEMNED. 1133 */ 1134 mutex_exit(&(vifp)->v_lock); 1135 if (ilm != NULL) { 1136 (void) ip_delmulti(ilm); 1137 ASSERT(ipif->ipif_ill->ill_mrouter_cnt > 0); 1138 atomic_dec_32(&ipif->ipif_ill->ill_mrouter_cnt); 1139 } 1140 mutex_enter(&(vifp)->v_lock); 1141 } 1142 1143 if (vifp->v_flags & VIFF_REGISTER) { 1144 mutex_enter(&ipst->ips_numvifs_mutex); 1145 ipst->ips_reg_vif_num = ALL_VIFS; 1146 mutex_exit(&ipst->ips_numvifs_mutex); 1147 } 1148 1149 /* 1150 * decreases the refcnt added in add_vif. 1151 */ 1152 VIF_REFRELE_LOCKED(vifp); 1153 return (0); 1154 } 1155 1156 /* 1157 * Add an mfc entry. 1158 */ 1159 static int 1160 add_mfc(struct mfcctl *mfccp, ip_stack_t *ipst) 1161 { 1162 struct mfc *rt; 1163 struct rtdetq *rte; 1164 ushort_t nstl; 1165 int i; 1166 struct mfcb *mfcbp; 1167 conn_t *mrouter = ipst->ips_ip_g_mrouter; 1168 1169 /* 1170 * The value of vifi is NO_VIF (==MAXVIFS) if Mrouted 1171 * did not have a real route for pkt. 1172 * We want this pkt without rt installed in the mfctable to prevent 1173 * multiiple tries, so go ahead and put it in mfctable, it will 1174 * be discarded later in ip_mdq() because the child is NULL. 1175 */ 1176 1177 /* Error checking, out of bounds? */ 1178 if (mfccp->mfcc_parent > MAXVIFS) { 1179 ip0dbg(("ADD_MFC: mfcc_parent out of range %d", 1180 (int)mfccp->mfcc_parent)); 1181 return (EINVAL); 1182 } 1183 1184 if ((mfccp->mfcc_parent != NO_VIF) && 1185 (ipst->ips_vifs[mfccp->mfcc_parent].v_ipif == NULL)) { 1186 ip0dbg(("ADD_MFC: NULL ipif for parent vif %d\n", 1187 (int)mfccp->mfcc_parent)); 1188 return (EINVAL); 1189 } 1190 1191 if (is_mrouter_off(ipst)) { 1192 return (EINVAL); 1193 } 1194 1195 mfcbp = &ipst->ips_mfcs[MFCHASH(mfccp->mfcc_origin.s_addr, 1196 mfccp->mfcc_mcastgrp.s_addr)]; 1197 MFCB_REFHOLD(mfcbp); 1198 MFCFIND(mfcbp, mfccp->mfcc_origin.s_addr, 1199 mfccp->mfcc_mcastgrp.s_addr, rt); 1200 1201 /* If an entry already exists, just update the fields */ 1202 if (rt) { 1203 if (ipst->ips_ip_mrtdebug > 1) { 1204 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 1205 "add_mfc: update o %x grp %x parent %x", 1206 ntohl(mfccp->mfcc_origin.s_addr), 1207 ntohl(mfccp->mfcc_mcastgrp.s_addr), 1208 mfccp->mfcc_parent); 1209 } 1210 mutex_enter(&rt->mfc_mutex); 1211 rt->mfc_parent = mfccp->mfcc_parent; 1212 1213 mutex_enter(&ipst->ips_numvifs_mutex); 1214 for (i = 0; i < (int)ipst->ips_numvifs; i++) 1215 rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; 1216 mutex_exit(&ipst->ips_numvifs_mutex); 1217 mutex_exit(&rt->mfc_mutex); 1218 1219 MFCB_REFRELE(mfcbp); 1220 return (0); 1221 } 1222 1223 /* 1224 * Find the entry for which the upcall was made and update. 1225 */ 1226 for (rt = mfcbp->mfcb_mfc, nstl = 0; rt; rt = rt->mfc_next) { 1227 mutex_enter(&rt->mfc_mutex); 1228 if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) && 1229 (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr) && 1230 (rt->mfc_rte != NULL) && 1231 !(rt->mfc_marks & MFCB_MARK_CONDEMNED)) { 1232 if (nstl++ != 0) 1233 cmn_err(CE_WARN, 1234 "add_mfc: %s o %x g %x p %x", 1235 "multiple kernel entries", 1236 ntohl(mfccp->mfcc_origin.s_addr), 1237 ntohl(mfccp->mfcc_mcastgrp.s_addr), 1238 mfccp->mfcc_parent); 1239 1240 if (ipst->ips_ip_mrtdebug > 1) { 1241 (void) mi_strlog(mrouter->conn_rq, 1, 1242 SL_TRACE, 1243 "add_mfc: o %x g %x p %x", 1244 ntohl(mfccp->mfcc_origin.s_addr), 1245 ntohl(mfccp->mfcc_mcastgrp.s_addr), 1246 mfccp->mfcc_parent); 1247 } 1248 fill_route(rt, mfccp, ipst); 1249 1250 /* 1251 * Prevent cleanup of cache entry. 1252 * Timer starts in ip_mforward. 1253 */ 1254 if (rt->mfc_timeout_id != 0) { 1255 timeout_id_t id; 1256 id = rt->mfc_timeout_id; 1257 /* 1258 * setting id to zero will avoid this 1259 * entry from being cleaned up in 1260 * expire_up_calls(). 1261 */ 1262 rt->mfc_timeout_id = 0; 1263 /* 1264 * dropping the lock is fine as we 1265 * have a refhold on the bucket. 1266 * so mfc cannot be freed. 1267 * The timeout can fire but it will see 1268 * that mfc_timeout_id == 0 and not cleanup. 1269 */ 1270 mutex_exit(&rt->mfc_mutex); 1271 (void) untimeout(id); 1272 mutex_enter(&rt->mfc_mutex); 1273 } 1274 1275 /* 1276 * Send all pkts that are queued waiting for the upcall. 1277 * ip_mdq param tun set to 0 - 1278 * the return value of ip_mdq() isn't used here, 1279 * so value we send doesn't matter. 1280 */ 1281 while (rt->mfc_rte != NULL) { 1282 rte = rt->mfc_rte; 1283 rt->mfc_rte = rte->rte_next; 1284 mutex_exit(&rt->mfc_mutex); 1285 (void) ip_mdq(rte->mp, (ipha_t *) 1286 rte->mp->b_rptr, rte->ill, 0, rt); 1287 freemsg(rte->mp); 1288 mi_free((char *)rte); 1289 mutex_enter(&rt->mfc_mutex); 1290 } 1291 } 1292 mutex_exit(&rt->mfc_mutex); 1293 } 1294 1295 1296 /* 1297 * It is possible that an entry is being inserted without an upcall 1298 */ 1299 if (nstl == 0) { 1300 mutex_enter(&(mfcbp->mfcb_lock)); 1301 if (ipst->ips_ip_mrtdebug > 1) { 1302 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 1303 "add_mfc: no upcall o %x g %x p %x", 1304 ntohl(mfccp->mfcc_origin.s_addr), 1305 ntohl(mfccp->mfcc_mcastgrp.s_addr), 1306 mfccp->mfcc_parent); 1307 } 1308 if (is_mrouter_off(ipst)) { 1309 mutex_exit(&mfcbp->mfcb_lock); 1310 MFCB_REFRELE(mfcbp); 1311 return (EINVAL); 1312 } 1313 1314 for (rt = mfcbp->mfcb_mfc; rt; rt = rt->mfc_next) { 1315 1316 mutex_enter(&rt->mfc_mutex); 1317 if ((rt->mfc_origin.s_addr == 1318 mfccp->mfcc_origin.s_addr) && 1319 (rt->mfc_mcastgrp.s_addr == 1320 mfccp->mfcc_mcastgrp.s_addr) && 1321 (!(rt->mfc_marks & MFCB_MARK_CONDEMNED))) { 1322 fill_route(rt, mfccp, ipst); 1323 mutex_exit(&rt->mfc_mutex); 1324 break; 1325 } 1326 mutex_exit(&rt->mfc_mutex); 1327 } 1328 1329 /* No upcall, so make a new entry into mfctable */ 1330 if (rt == NULL) { 1331 rt = (struct mfc *)mi_zalloc(sizeof (struct mfc)); 1332 if (rt == NULL) { 1333 ip1dbg(("add_mfc: out of memory\n")); 1334 mutex_exit(&mfcbp->mfcb_lock); 1335 MFCB_REFRELE(mfcbp); 1336 return (ENOBUFS); 1337 } 1338 1339 /* Insert new entry at head of hash chain */ 1340 mutex_enter(&rt->mfc_mutex); 1341 fill_route(rt, mfccp, ipst); 1342 1343 /* Link into table */ 1344 rt->mfc_next = mfcbp->mfcb_mfc; 1345 mfcbp->mfcb_mfc = rt; 1346 mutex_exit(&rt->mfc_mutex); 1347 } 1348 mutex_exit(&mfcbp->mfcb_lock); 1349 } 1350 1351 MFCB_REFRELE(mfcbp); 1352 return (0); 1353 } 1354 1355 /* 1356 * Fills in mfc structure from mrouted mfcctl. 1357 */ 1358 static void 1359 fill_route(struct mfc *rt, struct mfcctl *mfccp, ip_stack_t *ipst) 1360 { 1361 int i; 1362 1363 rt->mfc_origin = mfccp->mfcc_origin; 1364 rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp; 1365 rt->mfc_parent = mfccp->mfcc_parent; 1366 mutex_enter(&ipst->ips_numvifs_mutex); 1367 for (i = 0; i < (int)ipst->ips_numvifs; i++) { 1368 rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; 1369 } 1370 mutex_exit(&ipst->ips_numvifs_mutex); 1371 /* Initialize pkt counters per src-grp */ 1372 rt->mfc_pkt_cnt = 0; 1373 rt->mfc_byte_cnt = 0; 1374 rt->mfc_wrong_if = 0; 1375 rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_nsec = 0; 1376 1377 } 1378 1379 static void 1380 free_queue(struct mfc *mfcp) 1381 { 1382 struct rtdetq *rte0; 1383 1384 /* 1385 * Drop all queued upcall packets. 1386 * Free the mbuf with the pkt. 1387 */ 1388 while ((rte0 = mfcp->mfc_rte) != NULL) { 1389 mfcp->mfc_rte = rte0->rte_next; 1390 freemsg(rte0->mp); 1391 mi_free((char *)rte0); 1392 } 1393 } 1394 /* 1395 * go thorugh the hash bucket and free all the entries marked condemned. 1396 */ 1397 void 1398 release_mfc(struct mfcb *mfcbp) 1399 { 1400 struct mfc *current_mfcp; 1401 struct mfc *prev_mfcp; 1402 1403 prev_mfcp = current_mfcp = mfcbp->mfcb_mfc; 1404 1405 while (current_mfcp != NULL) { 1406 if (current_mfcp->mfc_marks & MFCB_MARK_CONDEMNED) { 1407 if (current_mfcp == mfcbp->mfcb_mfc) { 1408 mfcbp->mfcb_mfc = current_mfcp->mfc_next; 1409 free_queue(current_mfcp); 1410 mi_free(current_mfcp); 1411 prev_mfcp = current_mfcp = mfcbp->mfcb_mfc; 1412 continue; 1413 } 1414 ASSERT(prev_mfcp != NULL); 1415 prev_mfcp->mfc_next = current_mfcp->mfc_next; 1416 free_queue(current_mfcp); 1417 mi_free(current_mfcp); 1418 current_mfcp = NULL; 1419 } else { 1420 prev_mfcp = current_mfcp; 1421 } 1422 1423 current_mfcp = prev_mfcp->mfc_next; 1424 1425 } 1426 mfcbp->mfcb_marks &= ~MFCB_MARK_CONDEMNED; 1427 ASSERT(mfcbp->mfcb_mfc != NULL || mfcbp->mfcb_marks == 0); 1428 } 1429 1430 /* 1431 * Delete an mfc entry. 1432 */ 1433 static int 1434 del_mfc(struct mfcctl *mfccp, ip_stack_t *ipst) 1435 { 1436 struct in_addr origin; 1437 struct in_addr mcastgrp; 1438 struct mfc *rt; 1439 uint_t hash; 1440 conn_t *mrouter = ipst->ips_ip_g_mrouter; 1441 1442 origin = mfccp->mfcc_origin; 1443 mcastgrp = mfccp->mfcc_mcastgrp; 1444 hash = MFCHASH(origin.s_addr, mcastgrp.s_addr); 1445 1446 if (ipst->ips_ip_mrtdebug > 1) { 1447 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 1448 "del_mfc: o %x g %x", 1449 ntohl(origin.s_addr), 1450 ntohl(mcastgrp.s_addr)); 1451 } 1452 1453 MFCB_REFHOLD(&ipst->ips_mfcs[hash]); 1454 1455 /* Find mfc in mfctable, finds only entries without upcalls */ 1456 for (rt = ipst->ips_mfcs[hash].mfcb_mfc; rt; rt = rt->mfc_next) { 1457 mutex_enter(&rt->mfc_mutex); 1458 if (origin.s_addr == rt->mfc_origin.s_addr && 1459 mcastgrp.s_addr == rt->mfc_mcastgrp.s_addr && 1460 rt->mfc_rte == NULL && 1461 !(rt->mfc_marks & MFCB_MARK_CONDEMNED)) 1462 break; 1463 mutex_exit(&rt->mfc_mutex); 1464 } 1465 1466 /* 1467 * Return if there was an upcall (mfc_rte != NULL, 1468 * or rt not in mfctable. 1469 */ 1470 if (rt == NULL) { 1471 MFCB_REFRELE(&ipst->ips_mfcs[hash]); 1472 return (EADDRNOTAVAIL); 1473 } 1474 1475 1476 /* 1477 * no need to hold lock as we have a reference. 1478 */ 1479 ipst->ips_mfcs[hash].mfcb_marks |= MFCB_MARK_CONDEMNED; 1480 /* error checking */ 1481 if (rt->mfc_timeout_id != 0) { 1482 ip0dbg(("del_mfc: TIMEOUT NOT 0, rte not null")); 1483 /* 1484 * Its ok to drop the lock, the struct cannot be freed 1485 * since we have a ref on the hash bucket. 1486 */ 1487 rt->mfc_timeout_id = 0; 1488 mutex_exit(&rt->mfc_mutex); 1489 (void) untimeout(rt->mfc_timeout_id); 1490 mutex_enter(&rt->mfc_mutex); 1491 } 1492 1493 ASSERT(rt->mfc_rte == NULL); 1494 1495 1496 /* 1497 * Delete the entry from the cache 1498 */ 1499 rt->mfc_marks |= MFCB_MARK_CONDEMNED; 1500 mutex_exit(&rt->mfc_mutex); 1501 1502 MFCB_REFRELE(&ipst->ips_mfcs[hash]); 1503 1504 return (0); 1505 } 1506 1507 #define TUNNEL_LEN 12 /* # bytes of IP option for tunnel encapsulation */ 1508 1509 /* 1510 * IP multicast forwarding function. This function assumes that the packet 1511 * pointed to by ipha has arrived on (or is about to be sent to) the interface 1512 * pointed to by "ill", and the packet is to be relayed to other networks 1513 * that have members of the packet's destination IP multicast group. 1514 * 1515 * The packet is returned unscathed to the caller, unless it is 1516 * erroneous, in which case a -1 value tells the caller (IP) 1517 * to discard it. 1518 * 1519 * Unlike BSD, SunOS 5.x needs to return to IP info about 1520 * whether pkt came in thru a tunnel, so it can be discarded, unless 1521 * it's IGMP. In BSD, the ifp is bogus for tunnels, so pkt won't try 1522 * to be delivered. 1523 * Return values are 0 - pkt is okay and phyint 1524 * -1 - pkt is malformed and to be tossed 1525 * 1 - pkt came in on tunnel 1526 */ 1527 int 1528 ip_mforward(mblk_t *mp, ip_recv_attr_t *ira) 1529 { 1530 ipha_t *ipha = (ipha_t *)mp->b_rptr; 1531 ill_t *ill = ira->ira_ill; 1532 struct mfc *rt; 1533 ipaddr_t src, dst, tunnel_src = 0; 1534 static int srctun = 0; 1535 vifi_t vifi; 1536 boolean_t pim_reg_packet = B_FALSE; 1537 struct mfcb *mfcbp; 1538 ip_stack_t *ipst = ill->ill_ipst; 1539 conn_t *mrouter = ipst->ips_ip_g_mrouter; 1540 ill_t *rill = ira->ira_rill; 1541 1542 ASSERT(ira->ira_pktlen == msgdsize(mp)); 1543 1544 if (ipst->ips_ip_mrtdebug > 1) { 1545 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 1546 "ip_mforward: RECV ipha_src %x, ipha_dst %x, ill %s", 1547 ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst), 1548 ill->ill_name); 1549 } 1550 1551 dst = ipha->ipha_dst; 1552 if (ira->ira_flags & IRAF_PIM_REGISTER) 1553 pim_reg_packet = B_TRUE; 1554 else if (ira->ira_flags & IRAF_MROUTE_TUNNEL_SET) 1555 tunnel_src = ira->ira_mroute_tunnel; 1556 1557 /* 1558 * Don't forward a packet with time-to-live of zero or one, 1559 * or a packet destined to a local-only group. 1560 */ 1561 if (CLASSD(dst) && (ipha->ipha_ttl <= 1 || 1562 (ipaddr_t)ntohl(dst) <= INADDR_MAX_LOCAL_GROUP)) { 1563 if (ipst->ips_ip_mrtdebug > 1) { 1564 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 1565 "ip_mforward: not forwarded ttl %d," 1566 " dst 0x%x ill %s", 1567 ipha->ipha_ttl, ntohl(dst), ill->ill_name); 1568 } 1569 if (tunnel_src != 0) 1570 return (1); 1571 else 1572 return (0); 1573 } 1574 1575 if ((tunnel_src != 0) || pim_reg_packet) { 1576 /* 1577 * Packet arrived over an encapsulated tunnel or via a PIM 1578 * register message. 1579 */ 1580 if (ipst->ips_ip_mrtdebug > 1) { 1581 if (tunnel_src != 0) { 1582 (void) mi_strlog(mrouter->conn_rq, 1, 1583 SL_TRACE, 1584 "ip_mforward: ill %s arrived via ENCAP TUN", 1585 ill->ill_name); 1586 } else if (pim_reg_packet) { 1587 (void) mi_strlog(mrouter->conn_rq, 1, 1588 SL_TRACE, 1589 "ip_mforward: ill %s arrived via" 1590 " REGISTER VIF", 1591 ill->ill_name); 1592 } 1593 } 1594 } else if ((ipha->ipha_version_and_hdr_length & 0xf) < 1595 (uint_t)(IP_SIMPLE_HDR_LENGTH + TUNNEL_LEN) >> 2 || 1596 ((uchar_t *)(ipha + 1))[1] != IPOPT_LSRR) { 1597 /* Packet arrived via a physical interface. */ 1598 if (ipst->ips_ip_mrtdebug > 1) { 1599 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 1600 "ip_mforward: ill %s arrived via PHYINT", 1601 ill->ill_name); 1602 } 1603 1604 } else { 1605 /* 1606 * Packet arrived through a SRCRT tunnel. 1607 * Source-route tunnels are no longer supported. 1608 * Error message printed every 1000 times. 1609 */ 1610 if ((srctun++ % 1000) == 0) { 1611 cmn_err(CE_WARN, 1612 "ip_mforward: received source-routed pkt from %x", 1613 ntohl(ipha->ipha_src)); 1614 } 1615 return (-1); 1616 } 1617 1618 ipst->ips_mrtstat->mrts_fwd_in++; 1619 src = ipha->ipha_src; 1620 1621 /* Find route in cache, return NULL if not there or upcalls q'ed. */ 1622 1623 /* 1624 * Lock the mfctable against changes made by ip_mforward. 1625 * Note that only add_mfc and del_mfc can remove entries and 1626 * they run with exclusive access to IP. So we do not need to 1627 * guard against the rt being deleted, so release lock after reading. 1628 */ 1629 1630 if (is_mrouter_off(ipst)) 1631 return (-1); 1632 1633 mfcbp = &ipst->ips_mfcs[MFCHASH(src, dst)]; 1634 MFCB_REFHOLD(mfcbp); 1635 MFCFIND(mfcbp, src, dst, rt); 1636 1637 /* Entry exists, so forward if necessary */ 1638 if (rt != NULL) { 1639 int ret = 0; 1640 ipst->ips_mrtstat->mrts_mfc_hits++; 1641 if (pim_reg_packet) { 1642 ASSERT(ipst->ips_reg_vif_num != ALL_VIFS); 1643 ret = ip_mdq(mp, ipha, 1644 ipst->ips_vifs[ipst->ips_reg_vif_num]. 1645 v_ipif->ipif_ill, 1646 0, rt); 1647 } else { 1648 ret = ip_mdq(mp, ipha, ill, tunnel_src, rt); 1649 } 1650 1651 MFCB_REFRELE(mfcbp); 1652 return (ret); 1653 1654 /* 1655 * Don't forward if we don't have a cache entry. Mrouted will 1656 * always provide a cache entry in response to an upcall. 1657 */ 1658 } else { 1659 /* 1660 * If we don't have a route for packet's origin, make a copy 1661 * of the packet and send message to routing daemon. 1662 */ 1663 struct mfc *mfc_rt = NULL; 1664 mblk_t *mp0 = NULL; 1665 mblk_t *mp_copy = NULL; 1666 struct rtdetq *rte = NULL; 1667 struct rtdetq *rte_m, *rte1, *prev_rte; 1668 uint_t hash; 1669 int npkts; 1670 boolean_t new_mfc = B_FALSE; 1671 ipst->ips_mrtstat->mrts_mfc_misses++; 1672 /* BSD uses mrts_no_route++ */ 1673 if (ipst->ips_ip_mrtdebug > 1) { 1674 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 1675 "ip_mforward: no rte ill %s src %x g %x misses %d", 1676 ill->ill_name, ntohl(src), ntohl(dst), 1677 (int)ipst->ips_mrtstat->mrts_mfc_misses); 1678 } 1679 /* 1680 * The order of the following code differs from the BSD code. 1681 * Pre-mc3.5, the BSD code was incorrect and SunOS 5.x 1682 * code works, so SunOS 5.x wasn't changed to conform to the 1683 * BSD version. 1684 */ 1685 1686 /* Lock mfctable. */ 1687 hash = MFCHASH(src, dst); 1688 mutex_enter(&(ipst->ips_mfcs[hash].mfcb_lock)); 1689 1690 /* 1691 * If we are turning off mrouted return an error 1692 */ 1693 if (is_mrouter_off(ipst)) { 1694 mutex_exit(&mfcbp->mfcb_lock); 1695 MFCB_REFRELE(mfcbp); 1696 return (-1); 1697 } 1698 1699 /* Is there an upcall waiting for this packet? */ 1700 for (mfc_rt = ipst->ips_mfcs[hash].mfcb_mfc; mfc_rt; 1701 mfc_rt = mfc_rt->mfc_next) { 1702 mutex_enter(&mfc_rt->mfc_mutex); 1703 if (ipst->ips_ip_mrtdebug > 1) { 1704 (void) mi_strlog(mrouter->conn_rq, 1, 1705 SL_TRACE, 1706 "ip_mforward: MFCTAB hash %d o 0x%x" 1707 " g 0x%x\n", 1708 hash, ntohl(mfc_rt->mfc_origin.s_addr), 1709 ntohl(mfc_rt->mfc_mcastgrp.s_addr)); 1710 } 1711 /* There is an upcall */ 1712 if ((src == mfc_rt->mfc_origin.s_addr) && 1713 (dst == mfc_rt->mfc_mcastgrp.s_addr) && 1714 (mfc_rt->mfc_rte != NULL) && 1715 !(mfc_rt->mfc_marks & MFCB_MARK_CONDEMNED)) { 1716 break; 1717 } 1718 mutex_exit(&mfc_rt->mfc_mutex); 1719 } 1720 /* No upcall, so make a new entry into mfctable */ 1721 if (mfc_rt == NULL) { 1722 mfc_rt = (struct mfc *)mi_zalloc(sizeof (struct mfc)); 1723 if (mfc_rt == NULL) { 1724 ipst->ips_mrtstat->mrts_fwd_drop++; 1725 ip1dbg(("ip_mforward: out of memory " 1726 "for mfc, mfc_rt\n")); 1727 goto error_return; 1728 } else 1729 new_mfc = B_TRUE; 1730 /* Get resources */ 1731 /* TODO could copy header and dup rest */ 1732 mp_copy = copymsg(mp); 1733 if (mp_copy == NULL) { 1734 ipst->ips_mrtstat->mrts_fwd_drop++; 1735 ip1dbg(("ip_mforward: out of memory for " 1736 "mblk, mp_copy\n")); 1737 goto error_return; 1738 } 1739 mutex_enter(&mfc_rt->mfc_mutex); 1740 } 1741 /* Get resources for rte, whether first rte or not first. */ 1742 /* Add this packet into rtdetq */ 1743 rte = (struct rtdetq *)mi_zalloc(sizeof (struct rtdetq)); 1744 if (rte == NULL) { 1745 ipst->ips_mrtstat->mrts_fwd_drop++; 1746 mutex_exit(&mfc_rt->mfc_mutex); 1747 ip1dbg(("ip_mforward: out of memory for" 1748 " rtdetq, rte\n")); 1749 goto error_return; 1750 } 1751 1752 mp0 = copymsg(mp); 1753 if (mp0 == NULL) { 1754 ipst->ips_mrtstat->mrts_fwd_drop++; 1755 ip1dbg(("ip_mforward: out of memory for mblk, mp0\n")); 1756 mutex_exit(&mfc_rt->mfc_mutex); 1757 goto error_return; 1758 } 1759 rte->mp = mp0; 1760 if (pim_reg_packet) { 1761 ASSERT(ipst->ips_reg_vif_num != ALL_VIFS); 1762 rte->ill = 1763 ipst->ips_vifs[ipst->ips_reg_vif_num]. 1764 v_ipif->ipif_ill; 1765 } else { 1766 rte->ill = ill; 1767 } 1768 rte->rte_next = NULL; 1769 1770 /* 1771 * Determine if upcall q (rtdetq) has overflowed. 1772 * mfc_rt->mfc_rte is null by mi_zalloc 1773 * if it is the first message. 1774 */ 1775 for (rte_m = mfc_rt->mfc_rte, npkts = 0; rte_m; 1776 rte_m = rte_m->rte_next) 1777 npkts++; 1778 if (ipst->ips_ip_mrtdebug > 1) { 1779 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 1780 "ip_mforward: upcalls %d\n", npkts); 1781 } 1782 if (npkts > MAX_UPQ) { 1783 ipst->ips_mrtstat->mrts_upq_ovflw++; 1784 mutex_exit(&mfc_rt->mfc_mutex); 1785 goto error_return; 1786 } 1787 1788 if (npkts == 0) { /* first upcall */ 1789 int i = 0; 1790 /* 1791 * Now finish installing the new mfc! Now that we have 1792 * resources! Insert new entry at head of hash chain. 1793 * Use src and dst which are ipaddr_t's. 1794 */ 1795 mfc_rt->mfc_origin.s_addr = src; 1796 mfc_rt->mfc_mcastgrp.s_addr = dst; 1797 1798 mutex_enter(&ipst->ips_numvifs_mutex); 1799 for (i = 0; i < (int)ipst->ips_numvifs; i++) 1800 mfc_rt->mfc_ttls[i] = 0; 1801 mutex_exit(&ipst->ips_numvifs_mutex); 1802 mfc_rt->mfc_parent = ALL_VIFS; 1803 1804 /* Link into table */ 1805 if (ipst->ips_ip_mrtdebug > 1) { 1806 (void) mi_strlog(mrouter->conn_rq, 1, 1807 SL_TRACE, 1808 "ip_mforward: NEW MFCTAB hash %d o 0x%x " 1809 "g 0x%x\n", hash, 1810 ntohl(mfc_rt->mfc_origin.s_addr), 1811 ntohl(mfc_rt->mfc_mcastgrp.s_addr)); 1812 } 1813 mfc_rt->mfc_next = ipst->ips_mfcs[hash].mfcb_mfc; 1814 ipst->ips_mfcs[hash].mfcb_mfc = mfc_rt; 1815 mfc_rt->mfc_rte = NULL; 1816 } 1817 1818 /* Link in the upcall */ 1819 /* First upcall */ 1820 if (mfc_rt->mfc_rte == NULL) 1821 mfc_rt->mfc_rte = rte; 1822 else { 1823 /* not the first upcall */ 1824 prev_rte = mfc_rt->mfc_rte; 1825 for (rte1 = mfc_rt->mfc_rte->rte_next; rte1; 1826 prev_rte = rte1, rte1 = rte1->rte_next) 1827 ; 1828 prev_rte->rte_next = rte; 1829 } 1830 1831 /* 1832 * No upcalls waiting, this is first one, so send a message to 1833 * routing daemon to install a route into kernel table. 1834 */ 1835 if (npkts == 0) { 1836 struct igmpmsg *im; 1837 /* ipha_protocol is 0, for upcall */ 1838 ASSERT(mp_copy != NULL); 1839 im = (struct igmpmsg *)mp_copy->b_rptr; 1840 im->im_msgtype = IGMPMSG_NOCACHE; 1841 im->im_mbz = 0; 1842 mutex_enter(&ipst->ips_numvifs_mutex); 1843 if (pim_reg_packet) { 1844 im->im_vif = (uchar_t)ipst->ips_reg_vif_num; 1845 mutex_exit(&ipst->ips_numvifs_mutex); 1846 } else { 1847 /* 1848 * XXX do we need to hold locks here ? 1849 */ 1850 for (vifi = 0; 1851 vifi < ipst->ips_numvifs; 1852 vifi++) { 1853 if (ipst->ips_vifs[vifi].v_ipif == NULL) 1854 continue; 1855 if (ipst->ips_vifs[vifi]. 1856 v_ipif->ipif_ill == ill) { 1857 im->im_vif = (uchar_t)vifi; 1858 break; 1859 } 1860 } 1861 mutex_exit(&ipst->ips_numvifs_mutex); 1862 ASSERT(vifi < ipst->ips_numvifs); 1863 } 1864 1865 ipst->ips_mrtstat->mrts_upcalls++; 1866 /* Timer to discard upcalls if mrouted is too slow */ 1867 mfc_rt->mfc_timeout_id = timeout(expire_upcalls, 1868 mfc_rt, EXPIRE_TIMEOUT * UPCALL_EXPIRE); 1869 mutex_exit(&mfc_rt->mfc_mutex); 1870 mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock)); 1871 /* Pass to RAWIP */ 1872 ira->ira_ill = ira->ira_rill = NULL; 1873 (mrouter->conn_recv)(mrouter, mp_copy, NULL, ira); 1874 ira->ira_ill = ill; 1875 ira->ira_rill = rill; 1876 } else { 1877 mutex_exit(&mfc_rt->mfc_mutex); 1878 mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock)); 1879 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 1880 ip_drop_input("ip_mforward - upcall already waiting", 1881 mp_copy, ill); 1882 freemsg(mp_copy); 1883 } 1884 1885 MFCB_REFRELE(mfcbp); 1886 if (tunnel_src != 0) 1887 return (1); 1888 else 1889 return (0); 1890 error_return: 1891 mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock)); 1892 MFCB_REFRELE(mfcbp); 1893 if (mfc_rt != NULL && (new_mfc == B_TRUE)) 1894 mi_free((char *)mfc_rt); 1895 if (rte != NULL) 1896 mi_free((char *)rte); 1897 if (mp_copy != NULL) { 1898 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 1899 ip_drop_input("ip_mforward error", mp_copy, ill); 1900 freemsg(mp_copy); 1901 } 1902 if (mp0 != NULL) 1903 freemsg(mp0); 1904 return (-1); 1905 } 1906 } 1907 1908 /* 1909 * Clean up the mfctable cache entry if upcall is not serviced. 1910 * SunOS 5.x has timeout per mfc, unlike BSD which has one timer. 1911 */ 1912 static void 1913 expire_upcalls(void *arg) 1914 { 1915 struct mfc *mfc_rt = arg; 1916 uint_t hash; 1917 struct mfc *prev_mfc, *mfc0; 1918 ip_stack_t *ipst; 1919 conn_t *mrouter; 1920 1921 if (mfc_rt->mfc_rte == NULL || mfc_rt->mfc_rte->ill != NULL) { 1922 cmn_err(CE_WARN, "expire_upcalls: no ILL\n"); 1923 return; 1924 } 1925 ipst = mfc_rt->mfc_rte->ill->ill_ipst; 1926 mrouter = ipst->ips_ip_g_mrouter; 1927 1928 hash = MFCHASH(mfc_rt->mfc_origin.s_addr, mfc_rt->mfc_mcastgrp.s_addr); 1929 if (ipst->ips_ip_mrtdebug > 1) { 1930 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 1931 "expire_upcalls: hash %d s %x g %x", 1932 hash, ntohl(mfc_rt->mfc_origin.s_addr), 1933 ntohl(mfc_rt->mfc_mcastgrp.s_addr)); 1934 } 1935 MFCB_REFHOLD(&ipst->ips_mfcs[hash]); 1936 mutex_enter(&mfc_rt->mfc_mutex); 1937 /* 1938 * if timeout has been set to zero, than the 1939 * entry has been filled, no need to delete it. 1940 */ 1941 if (mfc_rt->mfc_timeout_id == 0) 1942 goto done; 1943 ipst->ips_mrtstat->mrts_cache_cleanups++; 1944 mfc_rt->mfc_timeout_id = 0; 1945 1946 /* Determine entry to be cleaned up in cache table. */ 1947 for (prev_mfc = mfc0 = ipst->ips_mfcs[hash].mfcb_mfc; mfc0; 1948 prev_mfc = mfc0, mfc0 = mfc0->mfc_next) 1949 if (mfc0 == mfc_rt) 1950 break; 1951 1952 /* del_mfc takes care of gone mfcs */ 1953 ASSERT(prev_mfc != NULL); 1954 ASSERT(mfc0 != NULL); 1955 1956 /* 1957 * Delete the entry from the cache 1958 */ 1959 ipst->ips_mfcs[hash].mfcb_marks |= MFCB_MARK_CONDEMNED; 1960 mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED; 1961 1962 /* 1963 * release_mfc will drop all queued upcall packets. 1964 * and will free the mbuf with the pkt, if, timing info. 1965 */ 1966 done: 1967 mutex_exit(&mfc_rt->mfc_mutex); 1968 MFCB_REFRELE(&ipst->ips_mfcs[hash]); 1969 } 1970 1971 /* 1972 * Packet forwarding routine once entry in the cache is made. 1973 */ 1974 static int 1975 ip_mdq(mblk_t *mp, ipha_t *ipha, ill_t *ill, ipaddr_t tunnel_src, 1976 struct mfc *rt) 1977 { 1978 vifi_t vifi; 1979 struct vif *vifp; 1980 ipaddr_t dst = ipha->ipha_dst; 1981 size_t plen = msgdsize(mp); 1982 vifi_t num_of_vifs; 1983 ip_stack_t *ipst = ill->ill_ipst; 1984 conn_t *mrouter = ipst->ips_ip_g_mrouter; 1985 ip_recv_attr_t iras; 1986 1987 if (ipst->ips_ip_mrtdebug > 1) { 1988 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 1989 "ip_mdq: SEND src %x, ipha_dst %x, ill %s", 1990 ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst), 1991 ill->ill_name); 1992 } 1993 1994 /* Macro to send packet on vif */ 1995 #define MC_SEND(ipha, mp, vifp, dst) { \ 1996 if ((vifp)->v_flags & VIFF_TUNNEL) \ 1997 encap_send((ipha), (mp), (vifp), (dst)); \ 1998 else if ((vifp)->v_flags & VIFF_REGISTER) \ 1999 register_send((ipha), (mp), (vifp), (dst)); \ 2000 else \ 2001 phyint_send((ipha), (mp), (vifp), (dst)); \ 2002 } 2003 2004 vifi = rt->mfc_parent; 2005 2006 /* 2007 * The value of vifi is MAXVIFS if the pkt had no parent, i.e., 2008 * Mrouted had no route. 2009 * We wanted the route installed in the mfctable to prevent multiple 2010 * tries, so it passed add_mfc(), but is discarded here. The v_ipif is 2011 * NULL so we don't want to check the ill. Still needed as of Mrouted 2012 * 3.6. 2013 */ 2014 if (vifi == NO_VIF) { 2015 ip1dbg(("ip_mdq: no route for origin ill %s, vifi is NO_VIF\n", 2016 ill->ill_name)); 2017 if (ipst->ips_ip_mrtdebug > 1) { 2018 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2019 "ip_mdq: vifi is NO_VIF ill = %s", ill->ill_name); 2020 } 2021 return (-1); /* drop pkt */ 2022 } 2023 2024 if (!lock_good_vif(&ipst->ips_vifs[vifi])) 2025 return (-1); 2026 /* 2027 * The MFC entries are not cleaned up when an ipif goes 2028 * away thus this code has to guard against an MFC referencing 2029 * an ipif that has been closed. Note: reset_mrt_vif_ipif 2030 * sets the v_ipif to NULL when the ipif disappears. 2031 */ 2032 ASSERT(ipst->ips_vifs[vifi].v_ipif != NULL); 2033 2034 if (vifi >= ipst->ips_numvifs) { 2035 cmn_err(CE_WARN, "ip_mdq: illegal vifi %d numvifs " 2036 "%d ill %s viftable ill %s\n", 2037 (int)vifi, (int)ipst->ips_numvifs, ill->ill_name, 2038 ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name); 2039 unlock_good_vif(&ipst->ips_vifs[vifi]); 2040 return (-1); 2041 } 2042 /* 2043 * Don't forward if it didn't arrive from the parent vif for its 2044 * origin. 2045 */ 2046 if ((ipst->ips_vifs[vifi].v_ipif->ipif_ill != ill) || 2047 (ipst->ips_vifs[vifi].v_rmt_addr.s_addr != tunnel_src)) { 2048 /* Came in the wrong interface */ 2049 ip1dbg(("ip_mdq: arrived wrong if, vifi %d " 2050 "numvifs %d ill %s viftable ill %s\n", 2051 (int)vifi, (int)ipst->ips_numvifs, ill->ill_name, 2052 ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name)); 2053 if (ipst->ips_ip_mrtdebug > 1) { 2054 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2055 "ip_mdq: arrived wrong if, vifi %d ill " 2056 "%s viftable ill %s\n", 2057 (int)vifi, ill->ill_name, 2058 ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name); 2059 } 2060 ipst->ips_mrtstat->mrts_wrong_if++; 2061 rt->mfc_wrong_if++; 2062 2063 /* 2064 * If we are doing PIM assert processing and we are forwarding 2065 * packets on this interface, and it is a broadcast medium 2066 * interface (and not a tunnel), send a message to the routing. 2067 * 2068 * We use the first ipif on the list, since it's all we have. 2069 * Chances are the ipif_flags are the same for ipifs on the ill. 2070 */ 2071 if (ipst->ips_pim_assert && rt->mfc_ttls[vifi] > 0 && 2072 (ill->ill_ipif->ipif_flags & IPIF_BROADCAST) && 2073 !(ipst->ips_vifs[vifi].v_flags & VIFF_TUNNEL)) { 2074 mblk_t *mp_copy; 2075 struct igmpmsg *im; 2076 2077 /* TODO could copy header and dup rest */ 2078 mp_copy = copymsg(mp); 2079 if (mp_copy == NULL) { 2080 ipst->ips_mrtstat->mrts_fwd_drop++; 2081 ip1dbg(("ip_mdq: out of memory " 2082 "for mblk, mp_copy\n")); 2083 unlock_good_vif(&ipst->ips_vifs[vifi]); 2084 return (-1); 2085 } 2086 2087 im = (struct igmpmsg *)mp_copy->b_rptr; 2088 im->im_msgtype = IGMPMSG_WRONGVIF; 2089 im->im_mbz = 0; 2090 im->im_vif = (ushort_t)vifi; 2091 /* Pass to RAWIP */ 2092 2093 bzero(&iras, sizeof (iras)); 2094 iras.ira_flags = IRAF_IS_IPV4; 2095 iras.ira_ip_hdr_length = 2096 IPH_HDR_LENGTH(mp_copy->b_rptr); 2097 iras.ira_pktlen = msgdsize(mp_copy); 2098 (mrouter->conn_recv)(mrouter, mp_copy, NULL, &iras); 2099 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); 2100 } 2101 unlock_good_vif(&ipst->ips_vifs[vifi]); 2102 if (tunnel_src != 0) 2103 return (1); 2104 else 2105 return (0); 2106 } 2107 /* 2108 * If I sourced this packet, it counts as output, else it was input. 2109 */ 2110 if (ipha->ipha_src == ipst->ips_vifs[vifi].v_lcl_addr.s_addr) { 2111 ipst->ips_vifs[vifi].v_pkt_out++; 2112 ipst->ips_vifs[vifi].v_bytes_out += plen; 2113 } else { 2114 ipst->ips_vifs[vifi].v_pkt_in++; 2115 ipst->ips_vifs[vifi].v_bytes_in += plen; 2116 } 2117 mutex_enter(&rt->mfc_mutex); 2118 rt->mfc_pkt_cnt++; 2119 rt->mfc_byte_cnt += plen; 2120 mutex_exit(&rt->mfc_mutex); 2121 unlock_good_vif(&ipst->ips_vifs[vifi]); 2122 /* 2123 * For each vif, decide if a copy of the packet should be forwarded. 2124 * Forward if: 2125 * - the vif threshold ttl is non-zero AND 2126 * - the pkt ttl exceeds the vif's threshold 2127 * A non-zero mfc_ttl indicates that the vif is part of 2128 * the output set for the mfc entry. 2129 */ 2130 mutex_enter(&ipst->ips_numvifs_mutex); 2131 num_of_vifs = ipst->ips_numvifs; 2132 mutex_exit(&ipst->ips_numvifs_mutex); 2133 for (vifp = ipst->ips_vifs, vifi = 0; 2134 vifi < num_of_vifs; 2135 vifp++, vifi++) { 2136 if (!lock_good_vif(vifp)) 2137 continue; 2138 if ((rt->mfc_ttls[vifi] > 0) && 2139 (ipha->ipha_ttl > rt->mfc_ttls[vifi])) { 2140 /* 2141 * lock_good_vif should not have succedded if 2142 * v_ipif is null. 2143 */ 2144 ASSERT(vifp->v_ipif != NULL); 2145 vifp->v_pkt_out++; 2146 vifp->v_bytes_out += plen; 2147 MC_SEND(ipha, mp, vifp, dst); 2148 ipst->ips_mrtstat->mrts_fwd_out++; 2149 } 2150 unlock_good_vif(vifp); 2151 } 2152 if (tunnel_src != 0) 2153 return (1); 2154 else 2155 return (0); 2156 } 2157 2158 /* 2159 * Send the packet on physical interface. 2160 * Caller assumes can continue to use mp on return. 2161 */ 2162 /* ARGSUSED */ 2163 static void 2164 phyint_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst) 2165 { 2166 mblk_t *mp_copy; 2167 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 2168 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2169 2170 /* Make a new reference to the packet */ 2171 mp_copy = copymsg(mp); /* TODO could copy header and dup rest */ 2172 if (mp_copy == NULL) { 2173 ipst->ips_mrtstat->mrts_fwd_drop++; 2174 ip1dbg(("phyint_send: out of memory for mblk, mp_copy\n")); 2175 return; 2176 } 2177 if (vifp->v_rate_limit <= 0) 2178 tbf_send_packet(vifp, mp_copy); 2179 else { 2180 if (ipst->ips_ip_mrtdebug > 1) { 2181 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2182 "phyint_send: tbf_contr rate %d " 2183 "vifp 0x%p mp 0x%p dst 0x%x", 2184 vifp->v_rate_limit, (void *)vifp, (void *)mp, dst); 2185 } 2186 tbf_control(vifp, mp_copy, (ipha_t *)mp_copy->b_rptr); 2187 } 2188 } 2189 2190 /* 2191 * Send the whole packet for REGISTER encapsulation to PIM daemon 2192 * Caller assumes it can continue to use mp on return. 2193 */ 2194 /* ARGSUSED */ 2195 static void 2196 register_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst) 2197 { 2198 struct igmpmsg *im; 2199 mblk_t *mp_copy; 2200 ipha_t *ipha_copy; 2201 ill_t *ill = vifp->v_ipif->ipif_ill; 2202 ip_stack_t *ipst = ill->ill_ipst; 2203 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2204 ip_recv_attr_t iras; 2205 2206 if (ipst->ips_ip_mrtdebug > 1) { 2207 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2208 "register_send: src %x, dst %x\n", 2209 ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst)); 2210 } 2211 2212 /* 2213 * Copy the old packet & pullup its IP header into the new mblk_t so we 2214 * can modify it. Try to fill the new mblk_t since if we don't the 2215 * ethernet driver will. 2216 */ 2217 mp_copy = allocb(sizeof (struct igmpmsg) + sizeof (ipha_t), BPRI_MED); 2218 if (mp_copy == NULL) { 2219 ++ipst->ips_mrtstat->mrts_pim_nomemory; 2220 if (ipst->ips_ip_mrtdebug > 3) { 2221 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2222 "register_send: allocb failure."); 2223 } 2224 return; 2225 } 2226 2227 /* 2228 * Bump write pointer to account for igmpmsg being added. 2229 */ 2230 mp_copy->b_wptr = mp_copy->b_rptr + sizeof (struct igmpmsg); 2231 2232 /* 2233 * Chain packet to new mblk_t. 2234 */ 2235 if ((mp_copy->b_cont = copymsg(mp)) == NULL) { 2236 ++ipst->ips_mrtstat->mrts_pim_nomemory; 2237 if (ipst->ips_ip_mrtdebug > 3) { 2238 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2239 "register_send: copymsg failure."); 2240 } 2241 freeb(mp_copy); 2242 return; 2243 } 2244 2245 /* 2246 * icmp_input() asserts that IP version field is set to an 2247 * appropriate version. Hence, the struct igmpmsg that this really 2248 * becomes, needs to have the correct IP version field. 2249 */ 2250 ipha_copy = (ipha_t *)mp_copy->b_rptr; 2251 *ipha_copy = multicast_encap_iphdr; 2252 2253 /* 2254 * The kernel uses the struct igmpmsg header to encode the messages to 2255 * the multicast routing daemon. Fill in the fields in the header 2256 * starting with the message type which is IGMPMSG_WHOLEPKT 2257 */ 2258 im = (struct igmpmsg *)mp_copy->b_rptr; 2259 im->im_msgtype = IGMPMSG_WHOLEPKT; 2260 im->im_src.s_addr = ipha->ipha_src; 2261 im->im_dst.s_addr = ipha->ipha_dst; 2262 2263 /* 2264 * Must Be Zero. This is because the struct igmpmsg is really an IP 2265 * header with renamed fields and the multicast routing daemon uses 2266 * an ipha_protocol (aka im_mbz) of 0 to distinguish these messages. 2267 */ 2268 im->im_mbz = 0; 2269 2270 ++ipst->ips_mrtstat->mrts_upcalls; 2271 if (IPCL_IS_NONSTR(mrouter) ? mrouter->conn_flow_cntrld : 2272 !canputnext(mrouter->conn_rq)) { 2273 ++ipst->ips_mrtstat->mrts_pim_regsend_drops; 2274 if (ipst->ips_ip_mrtdebug > 3) { 2275 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2276 "register_send: register upcall failure."); 2277 } 2278 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2279 ip_drop_input("mrts_pim_regsend_drops", mp_copy, ill); 2280 freemsg(mp_copy); 2281 } else { 2282 /* Pass to RAWIP */ 2283 bzero(&iras, sizeof (iras)); 2284 iras.ira_flags = IRAF_IS_IPV4; 2285 iras.ira_ip_hdr_length = sizeof (ipha_t); 2286 iras.ira_pktlen = msgdsize(mp_copy); 2287 (mrouter->conn_recv)(mrouter, mp_copy, NULL, &iras); 2288 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); 2289 } 2290 } 2291 2292 /* 2293 * pim_validate_cksum handles verification of the checksum in the 2294 * pim header. For PIM Register packets, the checksum is calculated 2295 * across the PIM header only. For all other packets, the checksum 2296 * is for the PIM header and remainder of the packet. 2297 * 2298 * returns: B_TRUE, if checksum is okay. 2299 * B_FALSE, if checksum is not valid. 2300 */ 2301 static boolean_t 2302 pim_validate_cksum(mblk_t *mp, ipha_t *ip, struct pim *pimp) 2303 { 2304 mblk_t *mp_dup; 2305 2306 if ((mp_dup = dupmsg(mp)) == NULL) 2307 return (B_FALSE); 2308 2309 mp_dup->b_rptr += IPH_HDR_LENGTH(ip); 2310 if (pimp->pim_type == PIM_REGISTER) 2311 mp_dup->b_wptr = mp_dup->b_rptr + PIM_MINLEN; 2312 if (IP_CSUM(mp_dup, 0, 0)) { 2313 freemsg(mp_dup); 2314 return (B_FALSE); 2315 } 2316 freemsg(mp_dup); 2317 return (B_TRUE); 2318 } 2319 2320 /* 2321 * Process PIM protocol packets i.e. IP Protocol 103. 2322 * Register messages are decapsulated and sent onto multicast forwarding. 2323 * 2324 * Return NULL for a bad packet that is discarded here. 2325 * Return mp if the message is OK and should be handed to "raw" receivers. 2326 * Callers of pim_input() may need to reinitialize variables that were copied 2327 * from the mblk as this calls pullupmsg(). 2328 */ 2329 mblk_t * 2330 pim_input(mblk_t *mp, ip_recv_attr_t *ira) 2331 { 2332 ipha_t *eip, *ip; 2333 int iplen, pimlen, iphlen; 2334 struct pim *pimp; /* pointer to a pim struct */ 2335 uint32_t *reghdr; 2336 ill_t *ill = ira->ira_ill; 2337 ip_stack_t *ipst = ill->ill_ipst; 2338 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2339 2340 /* 2341 * Pullup the msg for PIM protocol processing. 2342 */ 2343 if (pullupmsg(mp, -1) == 0) { 2344 ++ipst->ips_mrtstat->mrts_pim_nomemory; 2345 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2346 ip_drop_input("mrts_pim_nomemory", mp, ill); 2347 freemsg(mp); 2348 return (NULL); 2349 } 2350 2351 ip = (ipha_t *)mp->b_rptr; 2352 iplen = ip->ipha_length; 2353 iphlen = IPH_HDR_LENGTH(ip); 2354 pimlen = ntohs(iplen) - iphlen; 2355 2356 /* 2357 * Validate lengths 2358 */ 2359 if (pimlen < PIM_MINLEN) { 2360 ++ipst->ips_mrtstat->mrts_pim_malformed; 2361 if (ipst->ips_ip_mrtdebug > 1) { 2362 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2363 "pim_input: length not at least minlen"); 2364 } 2365 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2366 ip_drop_input("mrts_pim_malformed", mp, ill); 2367 freemsg(mp); 2368 return (NULL); 2369 } 2370 2371 /* 2372 * Point to the PIM header. 2373 */ 2374 pimp = (struct pim *)((caddr_t)ip + iphlen); 2375 2376 /* 2377 * Check the version number. 2378 */ 2379 if (pimp->pim_vers != PIM_VERSION) { 2380 ++ipst->ips_mrtstat->mrts_pim_badversion; 2381 if (ipst->ips_ip_mrtdebug > 1) { 2382 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2383 "pim_input: unknown version of PIM"); 2384 } 2385 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2386 ip_drop_input("mrts_pim_badversion", mp, ill); 2387 freemsg(mp); 2388 return (NULL); 2389 } 2390 2391 /* 2392 * Validate the checksum 2393 */ 2394 if (!pim_validate_cksum(mp, ip, pimp)) { 2395 ++ipst->ips_mrtstat->mrts_pim_rcv_badcsum; 2396 if (ipst->ips_ip_mrtdebug > 1) { 2397 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2398 "pim_input: invalid checksum"); 2399 } 2400 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2401 ip_drop_input("pim_rcv_badcsum", mp, ill); 2402 freemsg(mp); 2403 return (NULL); 2404 } 2405 2406 if (pimp->pim_type != PIM_REGISTER) 2407 return (mp); 2408 2409 reghdr = (uint32_t *)(pimp + 1); 2410 eip = (ipha_t *)(reghdr + 1); 2411 2412 /* 2413 * check if the inner packet is destined to mcast group 2414 */ 2415 if (!CLASSD(eip->ipha_dst)) { 2416 ++ipst->ips_mrtstat->mrts_pim_badregisters; 2417 if (ipst->ips_ip_mrtdebug > 1) { 2418 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2419 "pim_input: Inner pkt not mcast .. !"); 2420 } 2421 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2422 ip_drop_input("mrts_pim_badregisters", mp, ill); 2423 freemsg(mp); 2424 return (NULL); 2425 } 2426 if (ipst->ips_ip_mrtdebug > 1) { 2427 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2428 "register from %x, to %x, len %d", 2429 ntohl(eip->ipha_src), 2430 ntohl(eip->ipha_dst), 2431 ntohs(eip->ipha_length)); 2432 } 2433 /* 2434 * If the null register bit is not set, decapsulate 2435 * the packet before forwarding it. 2436 * Avoid this in no register vif 2437 */ 2438 if (!(ntohl(*reghdr) & PIM_NULL_REGISTER) && 2439 ipst->ips_reg_vif_num != ALL_VIFS) { 2440 mblk_t *mp_copy; 2441 uint_t saved_pktlen; 2442 2443 /* Copy the message */ 2444 if ((mp_copy = copymsg(mp)) == NULL) { 2445 ++ipst->ips_mrtstat->mrts_pim_nomemory; 2446 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2447 ip_drop_input("mrts_pim_nomemory", mp, ill); 2448 freemsg(mp); 2449 return (NULL); 2450 } 2451 2452 /* 2453 * Decapsulate the packet and give it to 2454 * register_mforward. 2455 */ 2456 mp_copy->b_rptr += iphlen + sizeof (pim_t) + sizeof (*reghdr); 2457 saved_pktlen = ira->ira_pktlen; 2458 ira->ira_pktlen -= iphlen + sizeof (pim_t) + sizeof (*reghdr); 2459 if (register_mforward(mp_copy, ira) != 0) { 2460 /* register_mforward already called ip_drop_input */ 2461 freemsg(mp); 2462 ira->ira_pktlen = saved_pktlen; 2463 return (NULL); 2464 } 2465 ira->ira_pktlen = saved_pktlen; 2466 } 2467 2468 /* 2469 * Pass all valid PIM packets up to any process(es) listening on a raw 2470 * PIM socket. For Solaris it is done right after pim_input() is 2471 * called. 2472 */ 2473 return (mp); 2474 } 2475 2476 /* 2477 * PIM sparse mode hook. Called by pim_input after decapsulating 2478 * the packet. Loop back the packet, as if we have received it. 2479 * In pim_input() we have to check if the destination is a multicast address. 2480 */ 2481 static int 2482 register_mforward(mblk_t *mp, ip_recv_attr_t *ira) 2483 { 2484 ire_t *ire; 2485 ipha_t *ipha = (ipha_t *)mp->b_rptr; 2486 ill_t *ill = ira->ira_ill; 2487 ip_stack_t *ipst = ill->ill_ipst; 2488 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2489 2490 ASSERT(ipst->ips_reg_vif_num <= ipst->ips_numvifs); 2491 2492 if (ipst->ips_ip_mrtdebug > 3) { 2493 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2494 "register_mforward: src %x, dst %x\n", 2495 ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst)); 2496 } 2497 /* 2498 * Need to pass in to ip_mforward() the information that the 2499 * packet has arrived on the register_vif. We mark it with 2500 * the IRAF_PIM_REGISTER attribute. 2501 * pim_input verified that the (inner) destination is multicast, 2502 * hence we skip the generic code in ip_input. 2503 */ 2504 ira->ira_flags |= IRAF_PIM_REGISTER; 2505 ++ipst->ips_mrtstat->mrts_pim_regforwards; 2506 2507 if (!CLASSD(ipha->ipha_dst)) { 2508 ire = ire_route_recursive_v4(ipha->ipha_dst, 0, NULL, ALL_ZONES, 2509 ira->ira_tsl, MATCH_IRE_SECATTR, IRR_ALLOCATE, 0, ipst, 2510 NULL, NULL, NULL); 2511 } else { 2512 ire = ire_multicast(ill); 2513 } 2514 ASSERT(ire != NULL); 2515 /* Normally this will return the IRE_MULTICAST */ 2516 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 2517 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2518 ip_drop_input("mrts_pim RTF_REJECT", mp, ill); 2519 freemsg(mp); 2520 ire_refrele(ire); 2521 return (-1); 2522 } 2523 ASSERT(ire->ire_type & IRE_MULTICAST); 2524 (*ire->ire_recvfn)(ire, mp, ipha, ira); 2525 ire_refrele(ire); 2526 2527 return (0); 2528 } 2529 2530 /* 2531 * Send an encapsulated packet. 2532 * Caller assumes can continue to use mp when routine returns. 2533 */ 2534 /* ARGSUSED */ 2535 static void 2536 encap_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst) 2537 { 2538 mblk_t *mp_copy; 2539 ipha_t *ipha_copy; 2540 size_t len; 2541 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 2542 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2543 2544 if (ipst->ips_ip_mrtdebug > 1) { 2545 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2546 "encap_send: vif %ld enter", 2547 (ptrdiff_t)(vifp - ipst->ips_vifs)); 2548 } 2549 len = ntohs(ipha->ipha_length); 2550 2551 /* 2552 * Copy the old packet & pullup it's IP header into the 2553 * new mbuf so we can modify it. Try to fill the new 2554 * mbuf since if we don't the ethernet driver will. 2555 */ 2556 mp_copy = allocb(32 + sizeof (multicast_encap_iphdr), BPRI_MED); 2557 if (mp_copy == NULL) 2558 return; 2559 mp_copy->b_rptr += 32; 2560 mp_copy->b_wptr = mp_copy->b_rptr + sizeof (multicast_encap_iphdr); 2561 if ((mp_copy->b_cont = copymsg(mp)) == NULL) { 2562 freeb(mp_copy); 2563 return; 2564 } 2565 2566 /* 2567 * Fill in the encapsulating IP header. 2568 * Remote tunnel dst in rmt_addr, from add_vif(). 2569 */ 2570 ipha_copy = (ipha_t *)mp_copy->b_rptr; 2571 *ipha_copy = multicast_encap_iphdr; 2572 ASSERT((len + sizeof (ipha_t)) <= IP_MAXPACKET); 2573 ipha_copy->ipha_length = htons(len + sizeof (ipha_t)); 2574 ipha_copy->ipha_src = vifp->v_lcl_addr.s_addr; 2575 ipha_copy->ipha_dst = vifp->v_rmt_addr.s_addr; 2576 ASSERT(ipha_copy->ipha_ident == 0); 2577 2578 /* Turn the encapsulated IP header back into a valid one. */ 2579 ipha = (ipha_t *)mp_copy->b_cont->b_rptr; 2580 ipha->ipha_ttl--; 2581 ipha->ipha_hdr_checksum = 0; 2582 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 2583 2584 ipha_copy->ipha_ttl = ipha->ipha_ttl; 2585 2586 if (ipst->ips_ip_mrtdebug > 1) { 2587 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2588 "encap_send: group 0x%x", ntohl(ipha->ipha_dst)); 2589 } 2590 if (vifp->v_rate_limit <= 0) 2591 tbf_send_packet(vifp, mp_copy); 2592 else 2593 /* ipha is from the original header */ 2594 tbf_control(vifp, mp_copy, ipha); 2595 } 2596 2597 /* 2598 * De-encapsulate a packet and feed it back through IP input if it 2599 * matches one of our multicast tunnels. 2600 * 2601 * This routine is called whenever IP gets a packet with prototype 2602 * IPPROTO_ENCAP and a local destination address and the packet didn't 2603 * match one of our configured IP-in-IP tunnels. 2604 */ 2605 void 2606 ip_mroute_decap(mblk_t *mp, ip_recv_attr_t *ira) 2607 { 2608 ipha_t *ipha = (ipha_t *)mp->b_rptr; 2609 ipha_t *ipha_encap; 2610 int hlen = IPH_HDR_LENGTH(ipha); 2611 int hlen_encap; 2612 ipaddr_t src; 2613 struct vif *vifp; 2614 ire_t *ire; 2615 ill_t *ill = ira->ira_ill; 2616 ip_stack_t *ipst = ill->ill_ipst; 2617 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2618 2619 /* Make sure we have all of the inner header */ 2620 ipha_encap = (ipha_t *)((char *)ipha + hlen); 2621 if (mp->b_wptr - mp->b_rptr < hlen + IP_SIMPLE_HDR_LENGTH) { 2622 ipha = ip_pullup(mp, hlen + IP_SIMPLE_HDR_LENGTH, ira); 2623 if (ipha == NULL) { 2624 ipst->ips_mrtstat->mrts_bad_tunnel++; 2625 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2626 ip_drop_input("ip_mroute_decap: too short", mp, ill); 2627 freemsg(mp); 2628 return; 2629 } 2630 ipha_encap = (ipha_t *)((char *)ipha + hlen); 2631 } 2632 hlen_encap = IPH_HDR_LENGTH(ipha_encap); 2633 if (mp->b_wptr - mp->b_rptr < hlen + hlen_encap) { 2634 ipha = ip_pullup(mp, hlen + hlen_encap, ira); 2635 if (ipha == NULL) { 2636 ipst->ips_mrtstat->mrts_bad_tunnel++; 2637 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2638 ip_drop_input("ip_mroute_decap: too short", mp, ill); 2639 freemsg(mp); 2640 return; 2641 } 2642 ipha_encap = (ipha_t *)((char *)ipha + hlen); 2643 } 2644 2645 /* 2646 * Dump the packet if it's not to a multicast destination or if 2647 * we don't have an encapsulating tunnel with the source. 2648 * Note: This code assumes that the remote site IP address 2649 * uniquely identifies the tunnel (i.e., that this site has 2650 * at most one tunnel with the remote site). 2651 */ 2652 if (!CLASSD(ipha_encap->ipha_dst)) { 2653 ipst->ips_mrtstat->mrts_bad_tunnel++; 2654 ip1dbg(("ip_mroute_decap: bad tunnel\n")); 2655 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2656 ip_drop_input("mrts_bad_tunnel", mp, ill); 2657 freemsg(mp); 2658 return; 2659 } 2660 src = (ipaddr_t)ipha->ipha_src; 2661 mutex_enter(&ipst->ips_last_encap_lock); 2662 if (src != ipst->ips_last_encap_src) { 2663 struct vif *vife; 2664 2665 vifp = ipst->ips_vifs; 2666 vife = vifp + ipst->ips_numvifs; 2667 ipst->ips_last_encap_src = src; 2668 ipst->ips_last_encap_vif = 0; 2669 for (; vifp < vife; ++vifp) { 2670 if (!lock_good_vif(vifp)) 2671 continue; 2672 if (vifp->v_rmt_addr.s_addr == src) { 2673 if (vifp->v_flags & VIFF_TUNNEL) 2674 ipst->ips_last_encap_vif = vifp; 2675 if (ipst->ips_ip_mrtdebug > 1) { 2676 (void) mi_strlog(mrouter->conn_rq, 2677 1, SL_TRACE, 2678 "ip_mroute_decap: good tun " 2679 "vif %ld with %x", 2680 (ptrdiff_t)(vifp - ipst->ips_vifs), 2681 ntohl(src)); 2682 } 2683 unlock_good_vif(vifp); 2684 break; 2685 } 2686 unlock_good_vif(vifp); 2687 } 2688 } 2689 if ((vifp = ipst->ips_last_encap_vif) == 0) { 2690 mutex_exit(&ipst->ips_last_encap_lock); 2691 ipst->ips_mrtstat->mrts_bad_tunnel++; 2692 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2693 ip_drop_input("mrts_bad_tunnel", mp, ill); 2694 freemsg(mp); 2695 ip1dbg(("ip_mroute_decap: vif %ld no tunnel with %x\n", 2696 (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(src))); 2697 return; 2698 } 2699 mutex_exit(&ipst->ips_last_encap_lock); 2700 2701 /* 2702 * Need to pass in the tunnel source to ip_mforward (so that it can 2703 * verify that the packet arrived over the correct vif.) 2704 */ 2705 ira->ira_flags |= IRAF_MROUTE_TUNNEL_SET; 2706 ira->ira_mroute_tunnel = src; 2707 mp->b_rptr += hlen; 2708 ira->ira_pktlen -= hlen; 2709 ira->ira_ip_hdr_length = hlen_encap; 2710 2711 /* 2712 * We don't redo any of the filtering in ill_input_full_v4 and we 2713 * have checked that all of ipha_encap and any IP options are 2714 * pulled up. Hence we call ire_recv_multicast_v4 directly. 2715 * However, we have to check for RSVP as in ip_input_full_v4 2716 * and if so we pass it to ire_recv_broadcast_v4 for local delivery 2717 * to the rsvpd. 2718 */ 2719 if (ipha_encap->ipha_protocol == IPPROTO_RSVP && 2720 ipst->ips_ipcl_proto_fanout_v4[IPPROTO_RSVP].connf_head != NULL) { 2721 ire = ire_route_recursive_v4(INADDR_BROADCAST, 0, ill, 2722 ALL_ZONES, ira->ira_tsl, MATCH_IRE_ILL|MATCH_IRE_SECATTR, 2723 IRR_ALLOCATE, 0, ipst, NULL, NULL, NULL); 2724 } else { 2725 ire = ire_multicast(ill); 2726 } 2727 ASSERT(ire != NULL); 2728 /* Normally this will return the IRE_MULTICAST or IRE_BROADCAST */ 2729 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 2730 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2731 ip_drop_input("ip_mroute_decap: RTF_REJECT", mp, ill); 2732 freemsg(mp); 2733 ire_refrele(ire); 2734 return; 2735 } 2736 ire->ire_ib_pkt_count++; 2737 ASSERT(ire->ire_type & (IRE_MULTICAST|IRE_BROADCAST)); 2738 (*ire->ire_recvfn)(ire, mp, ipha_encap, ira); 2739 ire_refrele(ire); 2740 } 2741 2742 /* 2743 * Remove all records with v_ipif == ipif. Called when an interface goes away 2744 * (stream closed). Called as writer. 2745 */ 2746 void 2747 reset_mrt_vif_ipif(ipif_t *ipif) 2748 { 2749 vifi_t vifi, tmp_vifi; 2750 vifi_t num_of_vifs; 2751 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 2752 2753 /* Can't check vifi >= 0 since vifi_t is unsigned! */ 2754 2755 mutex_enter(&ipst->ips_numvifs_mutex); 2756 num_of_vifs = ipst->ips_numvifs; 2757 mutex_exit(&ipst->ips_numvifs_mutex); 2758 2759 for (vifi = num_of_vifs; vifi != 0; vifi--) { 2760 tmp_vifi = vifi - 1; 2761 if (ipst->ips_vifs[tmp_vifi].v_ipif == ipif) { 2762 (void) del_vif(&tmp_vifi, ipst); 2763 } 2764 } 2765 } 2766 2767 /* Remove pending upcall msgs when ill goes away. Called by ill_delete. */ 2768 void 2769 reset_mrt_ill(ill_t *ill) 2770 { 2771 struct mfc *rt; 2772 struct rtdetq *rte; 2773 int i; 2774 ip_stack_t *ipst = ill->ill_ipst; 2775 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2776 timeout_id_t id; 2777 2778 for (i = 0; i < MFCTBLSIZ; i++) { 2779 MFCB_REFHOLD(&ipst->ips_mfcs[i]); 2780 if ((rt = ipst->ips_mfcs[i].mfcb_mfc) != NULL) { 2781 if (ipst->ips_ip_mrtdebug > 1) { 2782 (void) mi_strlog(mrouter->conn_rq, 1, 2783 SL_TRACE, 2784 "reset_mrt_ill: mfctable [%d]", i); 2785 } 2786 while (rt != NULL) { 2787 mutex_enter(&rt->mfc_mutex); 2788 while ((rte = rt->mfc_rte) != NULL) { 2789 if (rte->ill == ill && 2790 (id = rt->mfc_timeout_id) != 0) { 2791 /* 2792 * Its ok to drop the lock, the 2793 * struct cannot be freed since 2794 * we have a ref on the hash 2795 * bucket. 2796 */ 2797 mutex_exit(&rt->mfc_mutex); 2798 (void) untimeout(id); 2799 mutex_enter(&rt->mfc_mutex); 2800 } 2801 if (rte->ill == ill) { 2802 if (ipst->ips_ip_mrtdebug > 1) { 2803 (void) mi_strlog( 2804 mrouter->conn_rq, 2805 1, SL_TRACE, 2806 "reset_mrt_ill: " 2807 "ill 0x%p", (void *)ill); 2808 } 2809 rt->mfc_rte = rte->rte_next; 2810 freemsg(rte->mp); 2811 mi_free((char *)rte); 2812 } 2813 } 2814 mutex_exit(&rt->mfc_mutex); 2815 rt = rt->mfc_next; 2816 } 2817 } 2818 MFCB_REFRELE(&ipst->ips_mfcs[i]); 2819 } 2820 } 2821 2822 /* 2823 * Token bucket filter module. 2824 * The ipha is for mcastgrp destination for phyint and encap. 2825 */ 2826 static void 2827 tbf_control(struct vif *vifp, mblk_t *mp, ipha_t *ipha) 2828 { 2829 size_t p_len = msgdsize(mp); 2830 struct tbf *t = vifp->v_tbf; 2831 timeout_id_t id = 0; 2832 ill_t *ill = vifp->v_ipif->ipif_ill; 2833 ip_stack_t *ipst = ill->ill_ipst; 2834 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2835 2836 /* Drop if packet is too large */ 2837 if (p_len > MAX_BKT_SIZE) { 2838 ipst->ips_mrtstat->mrts_pkt2large++; 2839 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 2840 ip_drop_output("tbf_control - too large", mp, ill); 2841 freemsg(mp); 2842 return; 2843 } 2844 if (ipst->ips_ip_mrtdebug > 1) { 2845 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2846 "tbf_ctrl: SEND vif %ld, qlen %d, ipha_dst 0x%x", 2847 (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_q_len, 2848 ntohl(ipha->ipha_dst)); 2849 } 2850 2851 mutex_enter(&t->tbf_lock); 2852 2853 tbf_update_tokens(vifp); 2854 2855 /* 2856 * If there are enough tokens, 2857 * and the queue is empty, send this packet out. 2858 */ 2859 if (ipst->ips_ip_mrtdebug > 1) { 2860 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2861 "tbf_control: vif %ld, TOKENS %d, pkt len %lu, qlen %d", 2862 (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_n_tok, p_len, 2863 t->tbf_q_len); 2864 } 2865 /* No packets are queued */ 2866 if (t->tbf_q_len == 0) { 2867 /* queue empty, send packet if enough tokens */ 2868 if (p_len <= t->tbf_n_tok) { 2869 t->tbf_n_tok -= p_len; 2870 mutex_exit(&t->tbf_lock); 2871 tbf_send_packet(vifp, mp); 2872 return; 2873 } else { 2874 /* Queue packet and timeout till later */ 2875 tbf_queue(vifp, mp); 2876 ASSERT(vifp->v_timeout_id == 0); 2877 vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp, 2878 TBF_REPROCESS); 2879 } 2880 } else if (t->tbf_q_len < t->tbf_max_q_len) { 2881 /* Finite queue length, so queue pkts and process queue */ 2882 tbf_queue(vifp, mp); 2883 tbf_process_q(vifp); 2884 } else { 2885 /* Check that we have UDP header with IP header */ 2886 size_t hdr_length = IPH_HDR_LENGTH(ipha) + 2887 sizeof (struct udphdr); 2888 2889 if ((mp->b_wptr - mp->b_rptr) < hdr_length) { 2890 if (!pullupmsg(mp, hdr_length)) { 2891 BUMP_MIB(ill->ill_ip_mib, 2892 ipIfStatsOutDiscards); 2893 ip_drop_output("tbf_control - pullup", mp, ill); 2894 freemsg(mp); 2895 ip1dbg(("tbf_ctl: couldn't pullup udp hdr, " 2896 "vif %ld src 0x%x dst 0x%x\n", 2897 (ptrdiff_t)(vifp - ipst->ips_vifs), 2898 ntohl(ipha->ipha_src), 2899 ntohl(ipha->ipha_dst))); 2900 mutex_exit(&vifp->v_tbf->tbf_lock); 2901 return; 2902 } else 2903 /* Have to reassign ipha after pullupmsg */ 2904 ipha = (ipha_t *)mp->b_rptr; 2905 } 2906 /* 2907 * Queue length too much, 2908 * try to selectively dq, or queue and process 2909 */ 2910 if (!tbf_dq_sel(vifp, ipha)) { 2911 ipst->ips_mrtstat->mrts_q_overflow++; 2912 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 2913 ip_drop_output("mrts_q_overflow", mp, ill); 2914 freemsg(mp); 2915 } else { 2916 tbf_queue(vifp, mp); 2917 tbf_process_q(vifp); 2918 } 2919 } 2920 if (t->tbf_q_len == 0) { 2921 id = vifp->v_timeout_id; 2922 vifp->v_timeout_id = 0; 2923 } 2924 mutex_exit(&vifp->v_tbf->tbf_lock); 2925 if (id != 0) 2926 (void) untimeout(id); 2927 } 2928 2929 /* 2930 * Adds a packet to the tbf queue at the interface. 2931 * The ipha is for mcastgrp destination for phyint and encap. 2932 */ 2933 static void 2934 tbf_queue(struct vif *vifp, mblk_t *mp) 2935 { 2936 struct tbf *t = vifp->v_tbf; 2937 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 2938 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2939 2940 if (ipst->ips_ip_mrtdebug > 1) { 2941 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2942 "tbf_queue: vif %ld", (ptrdiff_t)(vifp - ipst->ips_vifs)); 2943 } 2944 ASSERT(MUTEX_HELD(&t->tbf_lock)); 2945 2946 if (t->tbf_t == NULL) { 2947 /* Queue was empty */ 2948 t->tbf_q = mp; 2949 } else { 2950 /* Insert at tail */ 2951 t->tbf_t->b_next = mp; 2952 } 2953 /* set new tail pointer */ 2954 t->tbf_t = mp; 2955 2956 mp->b_next = mp->b_prev = NULL; 2957 2958 t->tbf_q_len++; 2959 } 2960 2961 /* 2962 * Process the queue at the vif interface. 2963 * Drops the tbf_lock when sending packets. 2964 * 2965 * NOTE : The caller should quntimeout if the queue length is 0. 2966 */ 2967 static void 2968 tbf_process_q(struct vif *vifp) 2969 { 2970 mblk_t *mp; 2971 struct tbf *t = vifp->v_tbf; 2972 size_t len; 2973 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 2974 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2975 2976 if (ipst->ips_ip_mrtdebug > 1) { 2977 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2978 "tbf_process_q 1: vif %ld qlen = %d", 2979 (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_q_len); 2980 } 2981 2982 /* 2983 * Loop through the queue at the interface and send 2984 * as many packets as possible. 2985 */ 2986 ASSERT(MUTEX_HELD(&t->tbf_lock)); 2987 2988 while (t->tbf_q_len > 0) { 2989 mp = t->tbf_q; 2990 len = (size_t)msgdsize(mp); /* length of ip pkt */ 2991 2992 /* Determine if the packet can be sent */ 2993 if (len <= t->tbf_n_tok) { 2994 /* 2995 * If so, reduce no. of tokens, dequeue the packet, 2996 * send the packet. 2997 */ 2998 t->tbf_n_tok -= len; 2999 3000 t->tbf_q = mp->b_next; 3001 if (--t->tbf_q_len == 0) { 3002 t->tbf_t = NULL; 3003 } 3004 mp->b_next = NULL; 3005 /* Exit mutex before sending packet, then re-enter */ 3006 mutex_exit(&t->tbf_lock); 3007 tbf_send_packet(vifp, mp); 3008 mutex_enter(&t->tbf_lock); 3009 } else 3010 break; 3011 } 3012 } 3013 3014 /* Called at tbf timeout to update tokens, process q and reset timer. */ 3015 static void 3016 tbf_reprocess_q(void *arg) 3017 { 3018 struct vif *vifp = arg; 3019 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 3020 conn_t *mrouter = ipst->ips_ip_g_mrouter; 3021 3022 mutex_enter(&vifp->v_tbf->tbf_lock); 3023 vifp->v_timeout_id = 0; 3024 tbf_update_tokens(vifp); 3025 3026 tbf_process_q(vifp); 3027 3028 if (vifp->v_tbf->tbf_q_len > 0) { 3029 vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp, 3030 TBF_REPROCESS); 3031 } 3032 mutex_exit(&vifp->v_tbf->tbf_lock); 3033 3034 if (ipst->ips_ip_mrtdebug > 1) { 3035 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 3036 "tbf_reprcess_q: vif %ld timeout id = %p", 3037 (ptrdiff_t)(vifp - ipst->ips_vifs), vifp->v_timeout_id); 3038 } 3039 } 3040 3041 /* 3042 * Function that will selectively discard a member of the tbf queue, 3043 * based on the precedence value and the priority. 3044 * 3045 * NOTE : The caller should quntimeout if the queue length is 0. 3046 */ 3047 static int 3048 tbf_dq_sel(struct vif *vifp, ipha_t *ipha) 3049 { 3050 uint_t p; 3051 struct tbf *t = vifp->v_tbf; 3052 mblk_t **np; 3053 mblk_t *last, *mp; 3054 ill_t *ill = vifp->v_ipif->ipif_ill; 3055 ip_stack_t *ipst = ill->ill_ipst; 3056 conn_t *mrouter = ipst->ips_ip_g_mrouter; 3057 3058 if (ipst->ips_ip_mrtdebug > 1) { 3059 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 3060 "dq_sel: vif %ld dst 0x%x", 3061 (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(ipha->ipha_dst)); 3062 } 3063 3064 ASSERT(MUTEX_HELD(&t->tbf_lock)); 3065 p = priority(vifp, ipha); 3066 3067 np = &t->tbf_q; 3068 last = NULL; 3069 while ((mp = *np) != NULL) { 3070 if (p > (priority(vifp, (ipha_t *)mp->b_rptr))) { 3071 *np = mp->b_next; 3072 /* If removing the last packet, fix the tail pointer */ 3073 if (mp == t->tbf_t) 3074 t->tbf_t = last; 3075 mp->b_prev = mp->b_next = NULL; 3076 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 3077 ip_drop_output("tbf_dq_send", mp, ill); 3078 freemsg(mp); 3079 /* 3080 * It's impossible for the queue to be empty, but 3081 * we check anyway. 3082 */ 3083 if (--t->tbf_q_len == 0) { 3084 t->tbf_t = NULL; 3085 } 3086 ipst->ips_mrtstat->mrts_drop_sel++; 3087 return (1); 3088 } 3089 np = &mp->b_next; 3090 last = mp; 3091 } 3092 return (0); 3093 } 3094 3095 /* Sends packet, 2 cases - encap tunnel, phyint. */ 3096 static void 3097 tbf_send_packet(struct vif *vifp, mblk_t *mp) 3098 { 3099 ipif_t *ipif = vifp->v_ipif; 3100 ill_t *ill = ipif->ipif_ill; 3101 ip_stack_t *ipst = ill->ill_ipst; 3102 conn_t *mrouter = ipst->ips_ip_g_mrouter; 3103 ipha_t *ipha; 3104 3105 ipha = (ipha_t *)mp->b_rptr; 3106 /* If encap tunnel options */ 3107 if (vifp->v_flags & VIFF_TUNNEL) { 3108 ip_xmit_attr_t ixas; 3109 3110 if (ipst->ips_ip_mrtdebug > 1) { 3111 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 3112 "tbf_send_packet: ENCAP tunnel vif %ld", 3113 (ptrdiff_t)(vifp - ipst->ips_vifs)); 3114 } 3115 bzero(&ixas, sizeof (ixas)); 3116 ixas.ixa_flags = 3117 IXAF_IS_IPV4 | IXAF_NO_TTL_CHANGE | IXAF_VERIFY_SOURCE; 3118 ixas.ixa_ipst = ipst; 3119 ixas.ixa_ifindex = 0; 3120 ixas.ixa_cred = kcred; 3121 ixas.ixa_cpid = NOPID; 3122 ixas.ixa_tsl = NULL; 3123 ixas.ixa_zoneid = GLOBAL_ZONEID; /* Multicast router in GZ */ 3124 ixas.ixa_pktlen = ntohs(ipha->ipha_length); 3125 ixas.ixa_ip_hdr_length = IPH_HDR_LENGTH(ipha); 3126 3127 /* 3128 * Feed into ip_output_simple which will set the ident field 3129 * and checksum the encapsulating header. 3130 * BSD gets the cached route vifp->v_route from ip_output() 3131 * to speed up route table lookups. Not necessary in SunOS 5.x. 3132 * One could make multicast forwarding faster by putting an 3133 * ip_xmit_attr_t in each vif thereby caching the ire/nce. 3134 */ 3135 (void) ip_output_simple(mp, &ixas); 3136 ixa_cleanup(&ixas); 3137 return; 3138 3139 /* phyint */ 3140 } else { 3141 /* Need to loop back to members on the outgoing interface. */ 3142 ipaddr_t dst; 3143 ip_recv_attr_t iras; 3144 nce_t *nce; 3145 3146 bzero(&iras, sizeof (iras)); 3147 iras.ira_flags = IRAF_IS_IPV4; 3148 iras.ira_ill = iras.ira_rill = ill; 3149 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex; 3150 iras.ira_zoneid = GLOBAL_ZONEID; /* Multicast router in GZ */ 3151 iras.ira_pktlen = ntohs(ipha->ipha_length); 3152 iras.ira_ip_hdr_length = IPH_HDR_LENGTH(ipha); 3153 3154 dst = ipha->ipha_dst; 3155 if (ill_hasmembers_v4(ill, dst)) { 3156 iras.ira_flags |= IRAF_LOOPBACK_COPY; 3157 } 3158 if (ipst->ips_ip_mrtdebug > 1) { 3159 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 3160 "tbf_send_pkt: phyint forward vif %ld dst = 0x%x", 3161 (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(dst)); 3162 } 3163 /* 3164 * Find an NCE which matches the nexthop. 3165 * For a pt-pt interface we use the other end of the pt-pt 3166 * link. 3167 */ 3168 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 3169 dst = ipif->ipif_pp_dst_addr; 3170 nce = arp_nce_init(ill, dst, ill->ill_net_type); 3171 } else { 3172 nce = arp_nce_init(ill, dst, IRE_MULTICAST); 3173 } 3174 if (nce == NULL) { 3175 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 3176 ip_drop_output("tbf_send_packet - no nce", mp, ill); 3177 freemsg(mp); 3178 return; 3179 } 3180 3181 /* 3182 * We don't remeber the incoming ill. Thus we 3183 * pretend the packet arrived on the outbound ill. This means 3184 * statistics for input errors will be increased on the wrong 3185 * ill but that isn't a big deal. 3186 */ 3187 ip_forward_xmit_v4(nce, ill, mp, ipha, &iras, ill->ill_mc_mtu, 3188 0); 3189 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); 3190 3191 nce_refrele(nce); 3192 } 3193 } 3194 3195 /* 3196 * Determine the current time and then the elapsed time (between the last time 3197 * and time now). Update the no. of tokens in the bucket. 3198 */ 3199 static void 3200 tbf_update_tokens(struct vif *vifp) 3201 { 3202 timespec_t tp; 3203 hrtime_t tm; 3204 struct tbf *t = vifp->v_tbf; 3205 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 3206 conn_t *mrouter = ipst->ips_ip_g_mrouter; 3207 3208 ASSERT(MUTEX_HELD(&t->tbf_lock)); 3209 3210 /* Time in secs and nsecs, rate limit in kbits/sec */ 3211 gethrestime(&tp); 3212 3213 /*LINTED*/ 3214 TV_DELTA(tp, t->tbf_last_pkt_t, tm); 3215 3216 /* 3217 * This formula is actually 3218 * "time in seconds" * "bytes/second". Scaled for nsec. 3219 * (tm/1000000000) * (v_rate_limit * 1000 * (1000/1024) /8) 3220 * 3221 * The (1000/1024) was introduced in add_vif to optimize 3222 * this divide into a shift. 3223 */ 3224 t->tbf_n_tok += (tm/1000) * vifp->v_rate_limit / 1024 / 8; 3225 t->tbf_last_pkt_t = tp; 3226 3227 if (t->tbf_n_tok > MAX_BKT_SIZE) 3228 t->tbf_n_tok = MAX_BKT_SIZE; 3229 if (ipst->ips_ip_mrtdebug > 1) { 3230 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 3231 "tbf_update_tok: tm %lld tok %d vif %ld", 3232 tm, t->tbf_n_tok, (ptrdiff_t)(vifp - ipst->ips_vifs)); 3233 } 3234 } 3235 3236 /* 3237 * Priority currently is based on port nos. 3238 * Different forwarding mechanisms have different ways 3239 * of obtaining the port no. Hence, the vif must be 3240 * given along with the packet itself. 3241 * 3242 */ 3243 static int 3244 priority(struct vif *vifp, ipha_t *ipha) 3245 { 3246 int prio; 3247 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 3248 conn_t *mrouter = ipst->ips_ip_g_mrouter; 3249 3250 /* Temporary hack; may add general packet classifier some day */ 3251 3252 ASSERT(MUTEX_HELD(&vifp->v_tbf->tbf_lock)); 3253 3254 /* 3255 * The UDP port space is divided up into four priority ranges: 3256 * [0, 16384) : unclassified - lowest priority 3257 * [16384, 32768) : audio - highest priority 3258 * [32768, 49152) : whiteboard - medium priority 3259 * [49152, 65536) : video - low priority 3260 */ 3261 3262 if (ipha->ipha_protocol == IPPROTO_UDP) { 3263 struct udphdr *udp = 3264 (struct udphdr *)((char *)ipha + IPH_HDR_LENGTH(ipha)); 3265 switch (ntohs(udp->uh_dport) & 0xc000) { 3266 case 0x4000: 3267 prio = 70; 3268 break; 3269 case 0x8000: 3270 prio = 60; 3271 break; 3272 case 0xc000: 3273 prio = 55; 3274 break; 3275 default: 3276 prio = 50; 3277 break; 3278 } 3279 if (ipst->ips_ip_mrtdebug > 1) { 3280 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 3281 "priority: port %x prio %d\n", 3282 ntohs(udp->uh_dport), prio); 3283 } 3284 } else 3285 prio = 50; /* default priority */ 3286 return (prio); 3287 } 3288 3289 /* 3290 * End of token bucket filter modifications 3291 */ 3292 3293 3294 3295 /* 3296 * Produces data for netstat -M. 3297 */ 3298 int 3299 ip_mroute_stats(mblk_t *mp, ip_stack_t *ipst) 3300 { 3301 ipst->ips_mrtstat->mrts_vifctlSize = sizeof (struct vifctl); 3302 ipst->ips_mrtstat->mrts_mfcctlSize = sizeof (struct mfcctl); 3303 if (!snmp_append_data(mp, (char *)ipst->ips_mrtstat, 3304 sizeof (struct mrtstat))) { 3305 ip0dbg(("ip_mroute_stats: failed %ld bytes\n", 3306 (size_t)sizeof (struct mrtstat))); 3307 return (0); 3308 } 3309 return (1); 3310 } 3311 3312 /* 3313 * Sends info for SNMP's MIB. 3314 */ 3315 int 3316 ip_mroute_vif(mblk_t *mp, ip_stack_t *ipst) 3317 { 3318 struct vifctl vi; 3319 vifi_t vifi; 3320 3321 mutex_enter(&ipst->ips_numvifs_mutex); 3322 for (vifi = 0; vifi < ipst->ips_numvifs; vifi++) { 3323 if (ipst->ips_vifs[vifi].v_lcl_addr.s_addr == 0) 3324 continue; 3325 /* 3326 * No locks here, an approximation is fine. 3327 */ 3328 vi.vifc_vifi = vifi; 3329 vi.vifc_flags = ipst->ips_vifs[vifi].v_flags; 3330 vi.vifc_threshold = ipst->ips_vifs[vifi].v_threshold; 3331 vi.vifc_rate_limit = ipst->ips_vifs[vifi].v_rate_limit; 3332 vi.vifc_lcl_addr = ipst->ips_vifs[vifi].v_lcl_addr; 3333 vi.vifc_rmt_addr = ipst->ips_vifs[vifi].v_rmt_addr; 3334 vi.vifc_pkt_in = ipst->ips_vifs[vifi].v_pkt_in; 3335 vi.vifc_pkt_out = ipst->ips_vifs[vifi].v_pkt_out; 3336 3337 if (!snmp_append_data(mp, (char *)&vi, sizeof (vi))) { 3338 ip0dbg(("ip_mroute_vif: failed %ld bytes\n", 3339 (size_t)sizeof (vi))); 3340 mutex_exit(&ipst->ips_numvifs_mutex); 3341 return (0); 3342 } 3343 } 3344 mutex_exit(&ipst->ips_numvifs_mutex); 3345 return (1); 3346 } 3347 3348 /* 3349 * Called by ip_snmp_get to send up multicast routing table. 3350 */ 3351 int 3352 ip_mroute_mrt(mblk_t *mp, ip_stack_t *ipst) 3353 { 3354 int i, j; 3355 struct mfc *rt; 3356 struct mfcctl mfcc; 3357 3358 /* 3359 * Make sure multicast has not been turned off. 3360 */ 3361 if (is_mrouter_off(ipst)) 3362 return (1); 3363 3364 /* Loop over all hash buckets and their chains */ 3365 for (i = 0; i < MFCTBLSIZ; i++) { 3366 MFCB_REFHOLD(&ipst->ips_mfcs[i]); 3367 for (rt = ipst->ips_mfcs[i].mfcb_mfc; rt; rt = rt->mfc_next) { 3368 mutex_enter(&rt->mfc_mutex); 3369 if (rt->mfc_rte != NULL || 3370 (rt->mfc_marks & MFCB_MARK_CONDEMNED)) { 3371 mutex_exit(&rt->mfc_mutex); 3372 continue; 3373 } 3374 mfcc.mfcc_origin = rt->mfc_origin; 3375 mfcc.mfcc_mcastgrp = rt->mfc_mcastgrp; 3376 mfcc.mfcc_parent = rt->mfc_parent; 3377 mfcc.mfcc_pkt_cnt = rt->mfc_pkt_cnt; 3378 mutex_enter(&ipst->ips_numvifs_mutex); 3379 for (j = 0; j < (int)ipst->ips_numvifs; j++) 3380 mfcc.mfcc_ttls[j] = rt->mfc_ttls[j]; 3381 for (j = (int)ipst->ips_numvifs; j < MAXVIFS; j++) 3382 mfcc.mfcc_ttls[j] = 0; 3383 mutex_exit(&ipst->ips_numvifs_mutex); 3384 3385 mutex_exit(&rt->mfc_mutex); 3386 if (!snmp_append_data(mp, (char *)&mfcc, 3387 sizeof (mfcc))) { 3388 MFCB_REFRELE(&ipst->ips_mfcs[i]); 3389 ip0dbg(("ip_mroute_mrt: failed %ld bytes\n", 3390 (size_t)sizeof (mfcc))); 3391 return (0); 3392 } 3393 } 3394 MFCB_REFRELE(&ipst->ips_mfcs[i]); 3395 } 3396 return (1); 3397 } 3398