1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 /* Copyright (c) 1990 Mentat Inc. */ 26 27 /* 28 * Procedures for the kernel part of DVMRP, 29 * a Distance-Vector Multicast Routing Protocol. 30 * (See RFC-1075) 31 * Written by David Waitzman, BBN Labs, August 1988. 32 * Modified by Steve Deering, Stanford, February 1989. 33 * Modified by Mark J. 
Steiglitz, Stanford, May, 1991 34 * Modified by Van Jacobson, LBL, January 1993 35 * Modified by Ajit Thyagarajan, PARC, August 1993 36 * Modified by Bill Fenner, PARC, April 1995 37 * 38 * MROUTING 3.5 39 */ 40 41 /* 42 * TODO 43 * - function pointer field in vif, void *vif_sendit() 44 */ 45 46 #include <sys/types.h> 47 #include <sys/stream.h> 48 #include <sys/stropts.h> 49 #include <sys/strlog.h> 50 #include <sys/systm.h> 51 #include <sys/ddi.h> 52 #include <sys/cmn_err.h> 53 #include <sys/zone.h> 54 55 #include <sys/param.h> 56 #include <sys/socket.h> 57 #include <sys/vtrace.h> 58 #include <sys/debug.h> 59 #include <net/if.h> 60 #include <sys/sockio.h> 61 #include <netinet/in.h> 62 #include <net/if_dl.h> 63 64 #include <inet/ipsec_impl.h> 65 #include <inet/common.h> 66 #include <inet/mi.h> 67 #include <inet/nd.h> 68 #include <inet/tunables.h> 69 #include <inet/mib2.h> 70 #include <netinet/ip6.h> 71 #include <inet/ip.h> 72 #include <inet/snmpcom.h> 73 74 #include <netinet/igmp.h> 75 #include <netinet/igmp_var.h> 76 #include <netinet/udp.h> 77 #include <netinet/ip_mroute.h> 78 #include <inet/ip_multi.h> 79 #include <inet/ip_ire.h> 80 #include <inet/ip_ndp.h> 81 #include <inet/ip_if.h> 82 #include <inet/ipclassifier.h> 83 84 #include <netinet/pim.h> 85 86 87 /* 88 * MT Design: 89 * 90 * There are three main data structures viftable, mfctable and tbftable that 91 * need to be protected against MT races. 92 * 93 * vitable is a fixed length array of vif structs. There is no lock to protect 94 * the whole array, instead each struct is protected by its own indiviual lock. 95 * The value of v_marks in conjuction with the value of v_refcnt determines the 96 * current state of a vif structure. One special state that needs mention 97 * is when the vif is marked VIF_MARK_NOTINUSE but refcnt != 0. This indicates 98 * that vif is being initalized. 99 * Each structure is freed when the refcnt goes down to zero. 
If a delete comes
 * in when the refcnt is > 1, the vif structure is marked VIF_MARK_CONDEMNED
 * which prevents the struct from further use. When the refcnt goes to zero
 * the struct is freed and is marked VIF_MARK_NOTINUSE.
 * vif struct stores a pointer to the ipif in v_ipif, to prevent ipif/ill
 * from going away a refhold is put on the ipif before using it. see
 * lock_good_vif() and unlock_good_vif().
 *
 * VIF_REFHOLD and VIF_REFRELE macros have been provided to manipulate refcnts
 * of the vif struct.
 *
 * tbftable is also a fixed length array of tbf structs and is only accessed
 * via v_tbf. It is protected by its own lock tbf_lock.
 *
 * Lock Ordering is
 * v_lock --> tbf_lock
 * v_lock --> ill_lock
 *
 * mfctable is a fixed size hash table of mfc bucket structs (struct mfcb).
 * Each mfc bucket struct (struct mfcb) maintains a refcnt for each walker,
 * it also maintains a state. These fields are protected by a lock (mfcb_lock).
 * mfc structs only maintain a state and have no refcnt. mfc_mutex is used to
 * protect the struct elements.
 *
 * mfc structs are dynamically allocated and are singly linked
 * at the head of the chain. When an mfc structure is to be deleted
 * it is marked condemned and so is the state in the bucket struct.
 * When the last walker of the hash bucket exits all the mfc structs
 * marked condemned are freed.
 *
 * Locking Hierarchy:
 * The bucket lock should be acquired before the mfc struct lock.
 * MFCB_REFHOLD and MFCB_REFRELE macros are provided for locking
 * operations on the bucket struct.
 *
 * last_encap_lock and numvifs_mutex should be acquired after
 * acquiring vif or mfc locks. These locks protect some global variables.
 *
 * The statistics are not currently protected by a lock
 * causing the stats to be approximate, not exact.
 */

/* Sentinel vif index: one past the last valid vif slot. */
#define	NO_VIF	MAXVIFS		/* from mrouted, no route for src */

/*
 * Timeouts:
 *	Upcall timeouts - BSD uses boolean_t mfc->expire and
 *	nexpire[MFCTBLSIZE], the number of times expire has been called.
 *	SunOS 5.x uses mfc->timeout for each mfc.
 *	Some Unixes are limited in the number of simultaneous timeouts
 *	that can be run, SunOS 5.x does not have this restriction.
 */

/*
 * In BSD, EXPIRE_TIMEOUT is how often expire_upcalls() is called and
 * UPCALL_EXPIRE is the number of timeouts before a particular upcall
 * expires. Thus the time till expiration is EXPIRE_TIMEOUT * UPCALL_EXPIRE
 */
#define		EXPIRE_TIMEOUT	(hz/4)	/* 4x / second	*/
#define		UPCALL_EXPIRE	6	/* number of timeouts	*/

/*
 * Hash function for a source, group entry
 * The XOR-folded value is reduced to a table index by MFCHASHMOD().
 */
#define	MFCHASH(a, g) MFCHASHMOD(((a) >> 20) ^ ((a) >> 10) ^ (a) ^ \
	((g) >> 20) ^ ((g) >> 10) ^ (g))

#define	TBF_REPROCESS	(hz / 100)	/* 100x /second	*/

/* Identify PIM packet that came on a Register interface */
#define	PIM_REGISTER_MARKER	0xffffffff

/* Function declarations */
static int	add_mfc(struct mfcctl *, ip_stack_t *);
static int	add_vif(struct vifctl *, conn_t *, ip_stack_t *);
static int	del_mfc(struct mfcctl *, ip_stack_t *);
static int	del_vif(vifi_t *, ip_stack_t *);
static void	del_vifp(struct vif *);
static void	encap_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
static void	expire_upcalls(void *);
static void	fill_route(struct mfc *, struct mfcctl *, ip_stack_t *);
static void	free_queue(struct mfc *);
static int	get_assert(uchar_t *, ip_stack_t *);
static int	get_lsg_cnt(struct sioc_lsg_req *, ip_stack_t *);
static int	get_sg_cnt(struct sioc_sg_req *, ip_stack_t *);
static int	get_version(uchar_t *);
static int	get_vif_cnt(struct sioc_vif_req *, ip_stack_t *);
static int	ip_mdq(mblk_t *, ipha_t *, ill_t *,
		    ipaddr_t, struct mfc *);
static int	ip_mrouter_init(conn_t *, uchar_t *, int, ip_stack_t *);
static void	phyint_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
static int	register_mforward(mblk_t *, ip_recv_attr_t *);
static void	register_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
static int	set_assert(int *, ip_stack_t *);

/*
 * Token Bucket Filter functions
 */
static int	priority(struct vif *, ipha_t *);
static void	tbf_control(struct vif *, mblk_t *, ipha_t *);
static int	tbf_dq_sel(struct vif *, ipha_t *);
static void	tbf_process_q(struct vif *);
static void	tbf_queue(struct vif *, mblk_t *);
static void	tbf_reprocess_q(void *);
static void	tbf_send_packet(struct vif *, mblk_t *);
static void	tbf_update_tokens(struct vif *);
static void	release_mfc(struct mfcb *);

static boolean_t	is_mrouter_off(ip_stack_t *);
/*
 * Encapsulation packets
 */

#define	ENCAP_TTL	64

/* prototype IP hdr for encapsulated packets */
static ipha_t multicast_encap_iphdr = {
	IP_SIMPLE_HDR_VERSION,
	0,				/* tos */
	sizeof (ipha_t),		/* total length */
	0,				/* id */
	0,				/* frag offset */
	ENCAP_TTL, IPPROTO_ENCAP,
	0,				/* checksum */
};

/*
 * Rate limit for assert notification messages, in nsec.
227 */ 228 #define ASSERT_MSG_TIME 3000000000 229 230 231 #define VIF_REFHOLD(vifp) { \ 232 mutex_enter(&(vifp)->v_lock); \ 233 (vifp)->v_refcnt++; \ 234 mutex_exit(&(vifp)->v_lock); \ 235 } 236 237 #define VIF_REFRELE_LOCKED(vifp) { \ 238 (vifp)->v_refcnt--; \ 239 if ((vifp)->v_refcnt == 0 && \ 240 ((vifp)->v_marks & VIF_MARK_CONDEMNED)) { \ 241 del_vifp(vifp); \ 242 } else { \ 243 mutex_exit(&(vifp)->v_lock); \ 244 } \ 245 } 246 247 #define VIF_REFRELE(vifp) { \ 248 mutex_enter(&(vifp)->v_lock); \ 249 (vifp)->v_refcnt--; \ 250 if ((vifp)->v_refcnt == 0 && \ 251 ((vifp)->v_marks & VIF_MARK_CONDEMNED)) { \ 252 del_vifp(vifp); \ 253 } else { \ 254 mutex_exit(&(vifp)->v_lock); \ 255 } \ 256 } 257 258 #define MFCB_REFHOLD(mfcb) { \ 259 mutex_enter(&(mfcb)->mfcb_lock); \ 260 (mfcb)->mfcb_refcnt++; \ 261 ASSERT((mfcb)->mfcb_refcnt != 0); \ 262 mutex_exit(&(mfcb)->mfcb_lock); \ 263 } 264 265 #define MFCB_REFRELE(mfcb) { \ 266 mutex_enter(&(mfcb)->mfcb_lock); \ 267 ASSERT((mfcb)->mfcb_refcnt != 0); \ 268 if (--(mfcb)->mfcb_refcnt == 0 && \ 269 ((mfcb)->mfcb_marks & MFCB_MARK_CONDEMNED)) { \ 270 release_mfc(mfcb); \ 271 } \ 272 mutex_exit(&(mfcb)->mfcb_lock); \ 273 } 274 275 /* 276 * MFCFIND: 277 * Find a route for a given origin IP address and multicast group address. 278 * Skip entries with pending upcalls. 279 * Type of service parameter to be added in the future! 280 */ 281 #define MFCFIND(mfcbp, o, g, rt) { \ 282 struct mfc *_mb_rt = NULL; \ 283 rt = NULL; \ 284 _mb_rt = mfcbp->mfcb_mfc; \ 285 while (_mb_rt) { \ 286 if ((_mb_rt->mfc_origin.s_addr == o) && \ 287 (_mb_rt->mfc_mcastgrp.s_addr == g) && \ 288 (_mb_rt->mfc_rte == NULL) && \ 289 (!(_mb_rt->mfc_marks & MFCB_MARK_CONDEMNED))) { \ 290 rt = _mb_rt; \ 291 break; \ 292 } \ 293 _mb_rt = _mb_rt->mfc_next; \ 294 } \ 295 } 296 297 /* 298 * BSD uses timeval with sec and usec. In SunOS 5.x uniqtime() and gethrtime() 299 * are inefficient. 
We use gethrestime() which returns a timespec_t with 300 * sec and nsec, the resolution is machine dependent. 301 * The following 2 macros have been changed to use nsec instead of usec. 302 */ 303 /* 304 * Macros to compute elapsed time efficiently. 305 * Borrowed from Van Jacobson's scheduling code. 306 * Delta should be a hrtime_t. 307 */ 308 #define TV_DELTA(a, b, delta) { \ 309 int xxs; \ 310 \ 311 delta = (a).tv_nsec - (b).tv_nsec; \ 312 if ((xxs = (a).tv_sec - (b).tv_sec) != 0) { \ 313 switch (xxs) { \ 314 case 2: \ 315 delta += 1000000000; \ 316 /*FALLTHROUGH*/ \ 317 case 1: \ 318 delta += 1000000000; \ 319 break; \ 320 default: \ 321 delta += (1000000000 * xxs); \ 322 } \ 323 } \ 324 } 325 326 #define TV_LT(a, b) (((a).tv_nsec < (b).tv_nsec && \ 327 (a).tv_sec <= (b).tv_sec) || (a).tv_sec < (b).tv_sec) 328 329 /* 330 * Handle MRT setsockopt commands to modify the multicast routing tables. 331 */ 332 int 333 ip_mrouter_set(int cmd, conn_t *connp, int checkonly, uchar_t *data, 334 int datalen) 335 { 336 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 337 338 mutex_enter(&ipst->ips_ip_g_mrouter_mutex); 339 if (cmd != MRT_INIT && connp != ipst->ips_ip_g_mrouter) { 340 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 341 return (EACCES); 342 } 343 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 344 345 if (checkonly) { 346 /* 347 * do not do operation, just pretend to - new T_CHECK 348 * Note: Even routines further on can probably fail but 349 * this T_CHECK stuff is only to please XTI so it not 350 * necessary to be perfect. 351 */ 352 switch (cmd) { 353 case MRT_INIT: 354 case MRT_DONE: 355 case MRT_ADD_VIF: 356 case MRT_DEL_VIF: 357 case MRT_ADD_MFC: 358 case MRT_DEL_MFC: 359 case MRT_ASSERT: 360 return (0); 361 default: 362 return (EOPNOTSUPP); 363 } 364 } 365 366 /* 367 * make sure no command is issued after multicast routing has been 368 * turned off. 
369 */ 370 if (cmd != MRT_INIT && cmd != MRT_DONE) { 371 if (is_mrouter_off(ipst)) 372 return (EINVAL); 373 } 374 375 switch (cmd) { 376 case MRT_INIT: return (ip_mrouter_init(connp, data, datalen, ipst)); 377 case MRT_DONE: return (ip_mrouter_done(ipst)); 378 case MRT_ADD_VIF: return (add_vif((struct vifctl *)data, connp, ipst)); 379 case MRT_DEL_VIF: return (del_vif((vifi_t *)data, ipst)); 380 case MRT_ADD_MFC: return (add_mfc((struct mfcctl *)data, ipst)); 381 case MRT_DEL_MFC: return (del_mfc((struct mfcctl *)data, ipst)); 382 case MRT_ASSERT: return (set_assert((int *)data, ipst)); 383 default: return (EOPNOTSUPP); 384 } 385 } 386 387 /* 388 * Handle MRT getsockopt commands 389 */ 390 int 391 ip_mrouter_get(int cmd, conn_t *connp, uchar_t *data) 392 { 393 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 394 395 if (connp != ipst->ips_ip_g_mrouter) 396 return (EACCES); 397 398 switch (cmd) { 399 case MRT_VERSION: return (get_version((uchar_t *)data)); 400 case MRT_ASSERT: return (get_assert((uchar_t *)data, ipst)); 401 default: return (EOPNOTSUPP); 402 } 403 } 404 405 /* 406 * Handle ioctl commands to obtain information from the cache. 407 * Called with shared access to IP. These are read_only ioctls. 
408 */ 409 /* ARGSUSED */ 410 int 411 mrt_ioctl(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 412 ip_ioctl_cmd_t *ipip, void *if_req) 413 { 414 mblk_t *mp1; 415 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 416 conn_t *connp = Q_TO_CONN(q); 417 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 418 419 /* Existence verified in ip_wput_nondata */ 420 mp1 = mp->b_cont->b_cont; 421 422 switch (iocp->ioc_cmd) { 423 case (SIOCGETVIFCNT): 424 return (get_vif_cnt((struct sioc_vif_req *)mp1->b_rptr, ipst)); 425 case (SIOCGETSGCNT): 426 return (get_sg_cnt((struct sioc_sg_req *)mp1->b_rptr, ipst)); 427 case (SIOCGETLSGCNT): 428 return (get_lsg_cnt((struct sioc_lsg_req *)mp1->b_rptr, ipst)); 429 default: 430 return (EINVAL); 431 } 432 } 433 434 /* 435 * Returns the packet, byte, rpf-failure count for the source, group provided. 436 */ 437 static int 438 get_sg_cnt(struct sioc_sg_req *req, ip_stack_t *ipst) 439 { 440 struct mfc *rt; 441 struct mfcb *mfcbp; 442 443 mfcbp = &ipst->ips_mfcs[MFCHASH(req->src.s_addr, req->grp.s_addr)]; 444 MFCB_REFHOLD(mfcbp); 445 MFCFIND(mfcbp, req->src.s_addr, req->grp.s_addr, rt); 446 447 if (rt != NULL) { 448 mutex_enter(&rt->mfc_mutex); 449 req->pktcnt = rt->mfc_pkt_cnt; 450 req->bytecnt = rt->mfc_byte_cnt; 451 req->wrong_if = rt->mfc_wrong_if; 452 mutex_exit(&rt->mfc_mutex); 453 } else 454 req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffffU; 455 456 MFCB_REFRELE(mfcbp); 457 return (0); 458 } 459 460 /* 461 * Returns the packet, byte, rpf-failure count for the source, group provided. 462 * Uses larger counters and IPv6 addresses. 463 */ 464 /* ARGSUSED XXX until implemented */ 465 static int 466 get_lsg_cnt(struct sioc_lsg_req *req, ip_stack_t *ipst) 467 { 468 /* XXX TODO SIOCGETLSGCNT */ 469 return (ENXIO); 470 } 471 472 /* 473 * Returns the input and output packet and byte counts on the vif provided. 
474 */ 475 static int 476 get_vif_cnt(struct sioc_vif_req *req, ip_stack_t *ipst) 477 { 478 vifi_t vifi = req->vifi; 479 480 if (vifi >= ipst->ips_numvifs) 481 return (EINVAL); 482 483 /* 484 * No locks here, an approximation is fine. 485 */ 486 req->icount = ipst->ips_vifs[vifi].v_pkt_in; 487 req->ocount = ipst->ips_vifs[vifi].v_pkt_out; 488 req->ibytes = ipst->ips_vifs[vifi].v_bytes_in; 489 req->obytes = ipst->ips_vifs[vifi].v_bytes_out; 490 491 return (0); 492 } 493 494 static int 495 get_version(uchar_t *data) 496 { 497 int *v = (int *)data; 498 499 *v = 0x0305; /* XXX !!!! */ 500 501 return (0); 502 } 503 504 /* 505 * Set PIM assert processing global. 506 */ 507 static int 508 set_assert(int *i, ip_stack_t *ipst) 509 { 510 if ((*i != 1) && (*i != 0)) 511 return (EINVAL); 512 513 ipst->ips_pim_assert = *i; 514 515 return (0); 516 } 517 518 /* 519 * Get PIM assert processing global. 520 */ 521 static int 522 get_assert(uchar_t *data, ip_stack_t *ipst) 523 { 524 int *i = (int *)data; 525 526 *i = ipst->ips_pim_assert; 527 528 return (0); 529 } 530 531 /* 532 * Enable multicast routing. 533 */ 534 static int 535 ip_mrouter_init(conn_t *connp, uchar_t *data, int datalen, ip_stack_t *ipst) 536 { 537 int *v; 538 539 if (data == NULL || (datalen != sizeof (int))) 540 return (ENOPROTOOPT); 541 542 v = (int *)data; 543 if (*v != 1) 544 return (ENOPROTOOPT); 545 546 mutex_enter(&ipst->ips_ip_g_mrouter_mutex); 547 if (ipst->ips_ip_g_mrouter != NULL) { 548 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 549 return (EADDRINUSE); 550 } 551 552 /* 553 * MRT_INIT should only be allowed for RAW sockets, but we double 554 * check. 
555 */ 556 if (!IPCL_IS_RAWIP(connp)) { 557 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 558 return (EINVAL); 559 } 560 561 ipst->ips_ip_g_mrouter = connp; 562 connp->conn_multi_router = 1; 563 /* In order for tunnels to work we have to turn ip_g_forward on */ 564 if (!WE_ARE_FORWARDING(ipst)) { 565 if (ipst->ips_ip_mrtdebug > 1) { 566 (void) mi_strlog(connp->conn_rq, 1, SL_TRACE, 567 "ip_mrouter_init: turning on forwarding"); 568 } 569 ipst->ips_saved_ip_forwarding = ipst->ips_ip_forwarding; 570 ipst->ips_ip_forwarding = IP_FORWARD_ALWAYS; 571 } 572 573 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 574 return (0); 575 } 576 577 void 578 ip_mrouter_stack_init(ip_stack_t *ipst) 579 { 580 mutex_init(&ipst->ips_ip_g_mrouter_mutex, NULL, MUTEX_DEFAULT, NULL); 581 582 ipst->ips_vifs = kmem_zalloc(sizeof (struct vif) * (MAXVIFS+1), 583 KM_SLEEP); 584 ipst->ips_mrtstat = kmem_zalloc(sizeof (struct mrtstat), KM_SLEEP); 585 /* 586 * mfctable: 587 * Includes all mfcs, including waiting upcalls. 588 * Multiple mfcs per bucket. 589 */ 590 ipst->ips_mfcs = kmem_zalloc(sizeof (struct mfcb) * MFCTBLSIZ, 591 KM_SLEEP); 592 /* 593 * Define the token bucket filter structures. 594 * tbftable -> each vif has one of these for storing info. 595 */ 596 ipst->ips_tbfs = kmem_zalloc(sizeof (struct tbf) * MAXVIFS, KM_SLEEP); 597 598 mutex_init(&ipst->ips_last_encap_lock, NULL, MUTEX_DEFAULT, NULL); 599 600 ipst->ips_mrtstat->mrts_vifctlSize = sizeof (struct vifctl); 601 ipst->ips_mrtstat->mrts_mfcctlSize = sizeof (struct mfcctl); 602 } 603 604 /* 605 * Disable multicast routing. 606 * Didn't use global timeout_val (BSD version), instead check the mfctable. 
607 */ 608 int 609 ip_mrouter_done(ip_stack_t *ipst) 610 { 611 conn_t *mrouter; 612 vifi_t vifi; 613 struct mfc *mfc_rt; 614 int i; 615 616 mutex_enter(&ipst->ips_ip_g_mrouter_mutex); 617 if (ipst->ips_ip_g_mrouter == NULL) { 618 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 619 return (EINVAL); 620 } 621 622 mrouter = ipst->ips_ip_g_mrouter; 623 624 if (ipst->ips_saved_ip_forwarding != -1) { 625 if (ipst->ips_ip_mrtdebug > 1) { 626 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 627 "ip_mrouter_done: turning off forwarding"); 628 } 629 ipst->ips_ip_forwarding = ipst->ips_saved_ip_forwarding; 630 ipst->ips_saved_ip_forwarding = -1; 631 } 632 633 /* 634 * Always clear cache when vifs change. 635 * No need to get ipst->ips_last_encap_lock since we are running as 636 * a writer. 637 */ 638 mutex_enter(&ipst->ips_last_encap_lock); 639 ipst->ips_last_encap_src = 0; 640 ipst->ips_last_encap_vif = NULL; 641 mutex_exit(&ipst->ips_last_encap_lock); 642 mrouter->conn_multi_router = 0; 643 644 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 645 646 /* 647 * For each phyint in use, 648 * disable promiscuous reception of all IP multicasts. 649 */ 650 for (vifi = 0; vifi < MAXVIFS; vifi++) { 651 struct vif *vifp = ipst->ips_vifs + vifi; 652 653 mutex_enter(&vifp->v_lock); 654 /* 655 * if the vif is active mark it condemned. 
656 */ 657 if (vifp->v_marks & VIF_MARK_GOOD) { 658 ASSERT(vifp->v_ipif != NULL); 659 ipif_refhold(vifp->v_ipif); 660 /* Phyint only */ 661 if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) { 662 ipif_t *ipif = vifp->v_ipif; 663 ilm_t *ilm = vifp->v_ilm; 664 665 vifp->v_ilm = NULL; 666 vifp->v_marks &= ~VIF_MARK_GOOD; 667 vifp->v_marks |= VIF_MARK_CONDEMNED; 668 669 mutex_exit(&(vifp)->v_lock); 670 if (ilm != NULL) { 671 ill_t *ill = ipif->ipif_ill; 672 673 (void) ip_delmulti(ilm); 674 ASSERT(ill->ill_mrouter_cnt > 0); 675 atomic_dec_32(&ill->ill_mrouter_cnt); 676 } 677 mutex_enter(&vifp->v_lock); 678 } 679 ipif_refrele(vifp->v_ipif); 680 /* 681 * decreases the refcnt added in add_vif. 682 * and release v_lock. 683 */ 684 VIF_REFRELE_LOCKED(vifp); 685 } else { 686 mutex_exit(&vifp->v_lock); 687 continue; 688 } 689 } 690 691 mutex_enter(&ipst->ips_numvifs_mutex); 692 ipst->ips_numvifs = 0; 693 ipst->ips_pim_assert = 0; 694 ipst->ips_reg_vif_num = ALL_VIFS; 695 mutex_exit(&ipst->ips_numvifs_mutex); 696 697 /* 698 * Free upcall msgs. 699 * Go through mfctable and stop any outstanding upcall 700 * timeouts remaining on mfcs. 701 */ 702 for (i = 0; i < MFCTBLSIZ; i++) { 703 mutex_enter(&ipst->ips_mfcs[i].mfcb_lock); 704 ipst->ips_mfcs[i].mfcb_refcnt++; 705 ipst->ips_mfcs[i].mfcb_marks |= MFCB_MARK_CONDEMNED; 706 mutex_exit(&ipst->ips_mfcs[i].mfcb_lock); 707 mfc_rt = ipst->ips_mfcs[i].mfcb_mfc; 708 while (mfc_rt) { 709 /* Free upcalls */ 710 mutex_enter(&mfc_rt->mfc_mutex); 711 if (mfc_rt->mfc_rte != NULL) { 712 if (mfc_rt->mfc_timeout_id != 0) { 713 /* 714 * OK to drop the lock as we have 715 * a refcnt on the bucket. timeout 716 * can fire but it will see that 717 * mfc_timeout_id == 0 and not do 718 * anything. see expire_upcalls(). 
719 */ 720 mfc_rt->mfc_timeout_id = 0; 721 mutex_exit(&mfc_rt->mfc_mutex); 722 (void) untimeout( 723 mfc_rt->mfc_timeout_id); 724 mfc_rt->mfc_timeout_id = 0; 725 mutex_enter(&mfc_rt->mfc_mutex); 726 727 /* 728 * all queued upcall packets 729 * and mblk will be freed in 730 * release_mfc(). 731 */ 732 } 733 } 734 735 mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED; 736 737 mutex_exit(&mfc_rt->mfc_mutex); 738 mfc_rt = mfc_rt->mfc_next; 739 } 740 MFCB_REFRELE(&ipst->ips_mfcs[i]); 741 } 742 743 mutex_enter(&ipst->ips_ip_g_mrouter_mutex); 744 ipst->ips_ip_g_mrouter = NULL; 745 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 746 return (0); 747 } 748 749 void 750 ip_mrouter_stack_destroy(ip_stack_t *ipst) 751 { 752 struct mfcb *mfcbp; 753 struct mfc *rt; 754 int i; 755 756 for (i = 0; i < MFCTBLSIZ; i++) { 757 mfcbp = &ipst->ips_mfcs[i]; 758 759 while ((rt = mfcbp->mfcb_mfc) != NULL) { 760 (void) printf("ip_mrouter_stack_destroy: free for %d\n", 761 i); 762 763 mfcbp->mfcb_mfc = rt->mfc_next; 764 free_queue(rt); 765 mi_free(rt); 766 } 767 } 768 kmem_free(ipst->ips_vifs, sizeof (struct vif) * (MAXVIFS+1)); 769 ipst->ips_vifs = NULL; 770 kmem_free(ipst->ips_mrtstat, sizeof (struct mrtstat)); 771 ipst->ips_mrtstat = NULL; 772 kmem_free(ipst->ips_mfcs, sizeof (struct mfcb) * MFCTBLSIZ); 773 ipst->ips_mfcs = NULL; 774 kmem_free(ipst->ips_tbfs, sizeof (struct tbf) * MAXVIFS); 775 ipst->ips_tbfs = NULL; 776 777 mutex_destroy(&ipst->ips_last_encap_lock); 778 mutex_destroy(&ipst->ips_ip_g_mrouter_mutex); 779 } 780 781 static boolean_t 782 is_mrouter_off(ip_stack_t *ipst) 783 { 784 conn_t *mrouter; 785 786 mutex_enter(&ipst->ips_ip_g_mrouter_mutex); 787 if (ipst->ips_ip_g_mrouter == NULL) { 788 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 789 return (B_TRUE); 790 } 791 792 mrouter = ipst->ips_ip_g_mrouter; 793 if (mrouter->conn_multi_router == 0) { 794 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 795 return (B_TRUE); 796 } 797 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 798 return (B_FALSE); 799 } 
800 801 static void 802 unlock_good_vif(struct vif *vifp) 803 { 804 ASSERT(vifp->v_ipif != NULL); 805 ipif_refrele(vifp->v_ipif); 806 VIF_REFRELE(vifp); 807 } 808 809 static boolean_t 810 lock_good_vif(struct vif *vifp) 811 { 812 mutex_enter(&vifp->v_lock); 813 if (!(vifp->v_marks & VIF_MARK_GOOD)) { 814 mutex_exit(&vifp->v_lock); 815 return (B_FALSE); 816 } 817 818 ASSERT(vifp->v_ipif != NULL); 819 mutex_enter(&vifp->v_ipif->ipif_ill->ill_lock); 820 if (!IPIF_CAN_LOOKUP(vifp->v_ipif)) { 821 mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock); 822 mutex_exit(&vifp->v_lock); 823 return (B_FALSE); 824 } 825 ipif_refhold_locked(vifp->v_ipif); 826 mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock); 827 vifp->v_refcnt++; 828 mutex_exit(&vifp->v_lock); 829 return (B_TRUE); 830 } 831 832 /* 833 * Add a vif to the vif table. 834 */ 835 static int 836 add_vif(struct vifctl *vifcp, conn_t *connp, ip_stack_t *ipst) 837 { 838 struct vif *vifp = ipst->ips_vifs + vifcp->vifc_vifi; 839 ipif_t *ipif; 840 int error = 0; 841 struct tbf *v_tbf = ipst->ips_tbfs + vifcp->vifc_vifi; 842 conn_t *mrouter = ipst->ips_ip_g_mrouter; 843 ilm_t *ilm; 844 ill_t *ill; 845 846 ASSERT(connp != NULL); 847 848 if (vifcp->vifc_vifi >= MAXVIFS) 849 return (EINVAL); 850 851 if (is_mrouter_off(ipst)) 852 return (EINVAL); 853 854 mutex_enter(&vifp->v_lock); 855 /* 856 * Viftable entry should be 0. 857 * if v_marks == 0 but v_refcnt != 0 means struct is being 858 * initialized. 859 * 860 * Also note that it is very unlikely that we will get a MRT_ADD_VIF 861 * request while the delete is in progress, mrouted only sends add 862 * requests when a new interface is added and the new interface cannot 863 * have the same vifi as an existing interface. We make sure that 864 * ill_delete will block till the vif is deleted by adding a refcnt 865 * to ipif in del_vif(). 
866 */ 867 if (vifp->v_lcl_addr.s_addr != 0 || 868 vifp->v_marks != 0 || 869 vifp->v_refcnt != 0) { 870 mutex_exit(&vifp->v_lock); 871 return (EADDRINUSE); 872 } 873 874 /* Incoming vif should not be 0 */ 875 if (vifcp->vifc_lcl_addr.s_addr == 0) { 876 mutex_exit(&vifp->v_lock); 877 return (EINVAL); 878 } 879 880 vifp->v_refcnt++; 881 mutex_exit(&vifp->v_lock); 882 /* Find the interface with the local address */ 883 ipif = ipif_lookup_addr((ipaddr_t)vifcp->vifc_lcl_addr.s_addr, NULL, 884 IPCL_ZONEID(connp), ipst); 885 if (ipif == NULL) { 886 VIF_REFRELE(vifp); 887 return (EADDRNOTAVAIL); 888 } 889 890 if (ipst->ips_ip_mrtdebug > 1) { 891 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 892 "add_vif: src 0x%x enter", 893 vifcp->vifc_lcl_addr.s_addr); 894 } 895 896 mutex_enter(&vifp->v_lock); 897 /* 898 * Always clear cache when vifs change. 899 * Needed to ensure that src isn't left over from before vif was added. 900 * No need to get last_encap_lock, since we are running as a writer. 901 */ 902 903 mutex_enter(&ipst->ips_last_encap_lock); 904 ipst->ips_last_encap_src = 0; 905 ipst->ips_last_encap_vif = NULL; 906 mutex_exit(&ipst->ips_last_encap_lock); 907 908 if (vifcp->vifc_flags & VIFF_TUNNEL) { 909 if ((vifcp->vifc_flags & VIFF_SRCRT) != 0) { 910 cmn_err(CE_WARN, 911 "add_vif: source route tunnels not supported\n"); 912 VIF_REFRELE_LOCKED(vifp); 913 ipif_refrele(ipif); 914 return (EOPNOTSUPP); 915 } 916 vifp->v_rmt_addr = vifcp->vifc_rmt_addr; 917 918 } else { 919 /* Phyint or Register vif */ 920 if (vifcp->vifc_flags & VIFF_REGISTER) { 921 /* 922 * Note: Since all IPPROTO_IP level options (including 923 * MRT_ADD_VIF) are done exclusively via 924 * ip_optmgmt_writer(), a lock is not necessary to 925 * protect reg_vif_num. 
926 */ 927 mutex_enter(&ipst->ips_numvifs_mutex); 928 if (ipst->ips_reg_vif_num == ALL_VIFS) { 929 ipst->ips_reg_vif_num = vifcp->vifc_vifi; 930 mutex_exit(&ipst->ips_numvifs_mutex); 931 } else { 932 mutex_exit(&ipst->ips_numvifs_mutex); 933 VIF_REFRELE_LOCKED(vifp); 934 ipif_refrele(ipif); 935 return (EADDRINUSE); 936 } 937 } 938 939 /* Make sure the interface supports multicast */ 940 if ((ipif->ipif_ill->ill_flags & ILLF_MULTICAST) == 0) { 941 VIF_REFRELE_LOCKED(vifp); 942 ipif_refrele(ipif); 943 if (vifcp->vifc_flags & VIFF_REGISTER) { 944 mutex_enter(&ipst->ips_numvifs_mutex); 945 ipst->ips_reg_vif_num = ALL_VIFS; 946 mutex_exit(&ipst->ips_numvifs_mutex); 947 } 948 return (EOPNOTSUPP); 949 } 950 /* Enable promiscuous reception of all IP mcasts from the if */ 951 mutex_exit(&vifp->v_lock); 952 953 ill = ipif->ipif_ill; 954 if (IS_UNDER_IPMP(ill)) 955 ill = ipmp_ill_hold_ipmp_ill(ill); 956 957 if (ill == NULL) { 958 ilm = NULL; 959 } else { 960 ilm = ip_addmulti(&ipv6_all_zeros, ill, 961 ipif->ipif_zoneid, &error); 962 if (ilm != NULL) 963 atomic_inc_32(&ill->ill_mrouter_cnt); 964 if (IS_UNDER_IPMP(ipif->ipif_ill)) { 965 ill_refrele(ill); 966 ill = ipif->ipif_ill; 967 } 968 } 969 970 mutex_enter(&vifp->v_lock); 971 /* 972 * since we released the lock lets make sure that 973 * ip_mrouter_done() has not been called. 
974 */ 975 if (ilm == NULL || is_mrouter_off(ipst)) { 976 if (ilm != NULL) { 977 (void) ip_delmulti(ilm); 978 ASSERT(ill->ill_mrouter_cnt > 0); 979 atomic_dec_32(&ill->ill_mrouter_cnt); 980 } 981 if (vifcp->vifc_flags & VIFF_REGISTER) { 982 mutex_enter(&ipst->ips_numvifs_mutex); 983 ipst->ips_reg_vif_num = ALL_VIFS; 984 mutex_exit(&ipst->ips_numvifs_mutex); 985 } 986 VIF_REFRELE_LOCKED(vifp); 987 ipif_refrele(ipif); 988 return (error?error:EINVAL); 989 } 990 vifp->v_ilm = ilm; 991 } 992 /* Define parameters for the tbf structure */ 993 vifp->v_tbf = v_tbf; 994 gethrestime(&vifp->v_tbf->tbf_last_pkt_t); 995 vifp->v_tbf->tbf_n_tok = 0; 996 vifp->v_tbf->tbf_q_len = 0; 997 vifp->v_tbf->tbf_max_q_len = MAXQSIZE; 998 vifp->v_tbf->tbf_q = vifp->v_tbf->tbf_t = NULL; 999 1000 vifp->v_flags = vifcp->vifc_flags; 1001 vifp->v_threshold = vifcp->vifc_threshold; 1002 vifp->v_lcl_addr = vifcp->vifc_lcl_addr; 1003 vifp->v_ipif = ipif; 1004 ipif_refrele(ipif); 1005 /* Scaling up here, allows division by 1024 in critical code. */ 1006 vifp->v_rate_limit = vifcp->vifc_rate_limit * (1024/1000); 1007 vifp->v_timeout_id = 0; 1008 /* initialize per vif pkt counters */ 1009 vifp->v_pkt_in = 0; 1010 vifp->v_pkt_out = 0; 1011 vifp->v_bytes_in = 0; 1012 vifp->v_bytes_out = 0; 1013 mutex_init(&vifp->v_tbf->tbf_lock, NULL, MUTEX_DEFAULT, NULL); 1014 1015 /* Adjust numvifs up, if the vifi is higher than numvifs */ 1016 mutex_enter(&ipst->ips_numvifs_mutex); 1017 if (ipst->ips_numvifs <= vifcp->vifc_vifi) 1018 ipst->ips_numvifs = vifcp->vifc_vifi + 1; 1019 mutex_exit(&ipst->ips_numvifs_mutex); 1020 1021 if (ipst->ips_ip_mrtdebug > 1) { 1022 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 1023 "add_vif: #%d, lcladdr %x, %s %x, thresh %x, rate %d", 1024 vifcp->vifc_vifi, 1025 ntohl(vifcp->vifc_lcl_addr.s_addr), 1026 (vifcp->vifc_flags & VIFF_TUNNEL) ? 
"rmtaddr" : "mask",
		    ntohl(vifcp->vifc_rmt_addr.s_addr),
		    vifcp->vifc_threshold, vifcp->vifc_rate_limit);
	}

	vifp->v_marks = VIF_MARK_GOOD;
	mutex_exit(&vifp->v_lock);
	return (0);
}


/*
 * Delete a vif from the vif table.
 *
 * The vif must already be marked VIF_MARK_CONDEMNED and must have a token
 * bucket filter attached (both are asserted).  This routine cancels the
 * vif's pending timeout, frees every packet still queued on the tbf,
 * invalidates the last-encap cache if it points at this vif, destroys the
 * tbf lock, zeroes the tbf, recomputes ips_numvifs and finally zeroes the
 * vif structure itself.
 */
static void
del_vifp(struct vif *vifp)
{
	struct tbf	*t = vifp->v_tbf;
	mblk_t  *mp0;
	vifi_t  vifi;
	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	ASSERT(vifp->v_marks & VIF_MARK_CONDEMNED);
	ASSERT(t != NULL);

	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "del_vif: src 0x%x\n", vifp->v_lcl_addr.s_addr);
	}

	/* Cancel the vif's outstanding timeout, if one is pending. */
	if (vifp->v_timeout_id != 0) {
		(void) untimeout(vifp->v_timeout_id);
		vifp->v_timeout_id = 0;
	}

	/*
	 * Free packets queued at the interface.
	 * Mrouted takes care of cleaning up mfcs - makes calls to del_mfc.
	 */
	mutex_enter(&t->tbf_lock);
	while (t->tbf_q != NULL) {
		mp0 = t->tbf_q;
		t->tbf_q = t->tbf_q->b_next;
		/* Unlink the message completely before freeing it. */
		mp0->b_prev = mp0->b_next = NULL;
		freemsg(mp0);
	}
	mutex_exit(&t->tbf_lock);

	/*
	 * Always clear cache when vifs change.
	 * The cached encap vif/src pair is reset under ips_last_encap_lock
	 * when it refers to the vif being deleted.
	 */
	mutex_enter(&ipst->ips_last_encap_lock);
	if (vifp == ipst->ips_last_encap_vif) {
		ipst->ips_last_encap_vif = NULL;
		ipst->ips_last_encap_src = 0;
	}
	mutex_exit(&ipst->ips_last_encap_lock);

	mutex_destroy(&t->tbf_lock);

	bzero(vifp->v_tbf, sizeof (*(vifp->v_tbf)));

	/*
	 * Adjust numvifs down: scan from the top of the table for the
	 * highest slot that still has a local address.
	 * NOTE(review): this vif is zeroed only after the scan, so its own
	 * v_lcl_addr still counts here - confirm that is intended.
	 */
	mutex_enter(&ipst->ips_numvifs_mutex);
	for (vifi = ipst->ips_numvifs; vifi != 0; vifi--) /* vifi is unsigned */
		if (ipst->ips_vifs[vifi - 1].v_lcl_addr.s_addr != 0)
			break;
	ipst->ips_numvifs = vifi;
	mutex_exit(&ipst->ips_numvifs_mutex);

	bzero(vifp, sizeof (*vifp));
}

/*
 * Handle a request to delete the vif whose index is *vifip.
 *
 * Returns EINVAL when the index is not below ips_numvifs, EADDRNOTAVAIL
 * when the slot has no local address or is not marked VIF_MARK_GOOD, and
 * 0 otherwise.  On success the vif is switched from GOOD to CONDEMNED, a
 * phyint vif leaves its multicast group, a register vif resets
 * ips_reg_vif_num, and the reference taken in add_vif is released.
 */
static int
del_vif(vifi_t *vifip, ip_stack_t *ipst)
{
	struct vif	*vifp = ipst->ips_vifs + *vifip;

	if (*vifip >= ipst->ips_numvifs)
		return (EINVAL);

	mutex_enter(&vifp->v_lock);
	/*
	 * Not initialized
	 * Here we are not looking at the vif that is being initialized
	 * i.e vifp->v_marks == 0 and refcnt > 0.
	 */
	if (vifp->v_lcl_addr.s_addr == 0 ||
	    !(vifp->v_marks & VIF_MARK_GOOD)) {
		mutex_exit(&vifp->v_lock);
		return (EADDRNOTAVAIL);
	}

	/* Clear VIF_MARK_GOOD and set VIF_MARK_CONDEMNED. */
	vifp->v_marks &= ~VIF_MARK_GOOD;
	vifp->v_marks |= VIF_MARK_CONDEMNED;

	/* Phyint only */
	if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
		ipif_t *ipif = vifp->v_ipif;
		ilm_t *ilm = vifp->v_ilm;

		/* Detach the membership from the vif before dropping it. */
		vifp->v_ilm = NULL;

		ASSERT(ipif != NULL);
		/*
		 * should be OK to drop the lock as we
		 * have marked this as CONDEMNED.
		 */
		mutex_exit(&(vifp)->v_lock);
		if (ilm != NULL) {
			(void) ip_delmulti(ilm);
			ASSERT(ipif->ipif_ill->ill_mrouter_cnt > 0);
			atomic_dec_32(&ipif->ipif_ill->ill_mrouter_cnt);
		}
		mutex_enter(&(vifp)->v_lock);
	}

	if (vifp->v_flags & VIFF_REGISTER) {
		mutex_enter(&ipst->ips_numvifs_mutex);
		ipst->ips_reg_vif_num = ALL_VIFS;
		mutex_exit(&ipst->ips_numvifs_mutex);
	}

	/*
	 * decreases the refcnt added in add_vif.
	 * NOTE(review): v_lock is still held here and there is no explicit
	 * mutex_exit; presumably VIF_REFRELE_LOCKED drops it - confirm
	 * against the macro definition.
	 */
	VIF_REFRELE_LOCKED(vifp);
	return (0);
}

/*
 * Add an mfc entry.
 *
 * mfccp carries the (origin, group) pair, the parent vif and the per-vif
 * ttl thresholds supplied by the routing daemon.
 *
 * Three cases are handled, in this order:
 *  - an entry is found by MFCFIND: only its parent and ttls are updated;
 *  - entries with queued upcall packets exist: they are filled in, their
 *    expiry timer is cancelled and the queued packets are sent via ip_mdq;
 *  - otherwise an existing non-condemned entry is refilled or a new entry
 *    is allocated and linked at the head of the hash chain.
 *
 * Returns 0 on success, EINVAL for an out-of-range parent, a parent vif
 * without an ipif, or when the mrouter is off, and ENOBUFS when a new
 * entry cannot be allocated.
 */
static int
add_mfc(struct mfcctl *mfccp, ip_stack_t *ipst)
{
	struct mfc *rt;
	struct rtdetq *rte;
	ushort_t nstl;
	int i;
	struct mfcb *mfcbp;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	/*
	 * The value of vifi is NO_VIF (==MAXVIFS) if Mrouted
	 * did not have a real route for pkt.
	 * We want this pkt without rt installed in the mfctable to prevent
	 * multiple tries, so go ahead and put it in mfctable, it will
	 * be discarded later in ip_mdq() because the child is NULL.
	 */

	/* Error checking, out of bounds? */
	if (mfccp->mfcc_parent > MAXVIFS) {
		ip0dbg(("ADD_MFC: mfcc_parent out of range %d",
		    (int)mfccp->mfcc_parent));
		return (EINVAL);
	}

	if ((mfccp->mfcc_parent != NO_VIF) &&
	    (ipst->ips_vifs[mfccp->mfcc_parent].v_ipif == NULL)) {
		ip0dbg(("ADD_MFC: NULL ipif for parent vif %d\n",
		    (int)mfccp->mfcc_parent));
		return (EINVAL);
	}

	if (is_mrouter_off(ipst)) {
		return (EINVAL);
	}

	/* Hold the hash bucket for the whole operation. */
	mfcbp = &ipst->ips_mfcs[MFCHASH(mfccp->mfcc_origin.s_addr,
	    mfccp->mfcc_mcastgrp.s_addr)];
	MFCB_REFHOLD(mfcbp);
	MFCFIND(mfcbp, mfccp->mfcc_origin.s_addr,
	    mfccp->mfcc_mcastgrp.s_addr, rt);

	/* If an entry already exists, just update the fields */
	if (rt) {
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "add_mfc: update o %x grp %x parent %x",
			    ntohl(mfccp->mfcc_origin.s_addr),
			    ntohl(mfccp->mfcc_mcastgrp.s_addr),
			    mfccp->mfcc_parent);
		}
		mutex_enter(&rt->mfc_mutex);
		rt->mfc_parent = mfccp->mfcc_parent;

		mutex_enter(&ipst->ips_numvifs_mutex);
		for (i = 0; i < (int)ipst->ips_numvifs; i++)
			rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
		mutex_exit(&ipst->ips_numvifs_mutex);
		mutex_exit(&rt->mfc_mutex);

		MFCB_REFRELE(mfcbp);
		return (0);
	}

	/*
	 * Find the entry for which the upcall was made and update.
	 * nstl counts the matching entries that have queued upcalls.
	 */
	for (rt = mfcbp->mfcb_mfc, nstl = 0; rt; rt = rt->mfc_next) {
		mutex_enter(&rt->mfc_mutex);
		if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) &&
		    (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr) &&
		    (rt->mfc_rte != NULL) &&
		    !(rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
			/* More than one match is unexpected; warn about it. */
			if (nstl++ != 0)
				cmn_err(CE_WARN,
				    "add_mfc: %s o %x g %x p %x",
				    "multiple kernel entries",
				    ntohl(mfccp->mfcc_origin.s_addr),
				    ntohl(mfccp->mfcc_mcastgrp.s_addr),
				    mfccp->mfcc_parent);

			if (ipst->ips_ip_mrtdebug > 1) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "add_mfc: o %x g %x p %x",
				    ntohl(mfccp->mfcc_origin.s_addr),
				    ntohl(mfccp->mfcc_mcastgrp.s_addr),
				    mfccp->mfcc_parent);
			}
			fill_route(rt, mfccp, ipst);

			/*
			 * Prevent cleanup of cache entry.
			 * Timer starts in ip_mforward.
			 */
			if (rt->mfc_timeout_id != 0) {
				timeout_id_t id;
				id = rt->mfc_timeout_id;
				/*
				 * setting id to zero will avoid this
				 * entry from being cleaned up in
				 * expire_up_calls().
				 */
				rt->mfc_timeout_id = 0;
				/*
				 * dropping the lock is fine as we
				 * have a refhold on the bucket.
				 * so mfc cannot be freed.
				 * The timeout can fire but it will see
				 * that mfc_timeout_id == 0 and not cleanup.
				 */
				mutex_exit(&rt->mfc_mutex);
				(void) untimeout(id);
				mutex_enter(&rt->mfc_mutex);
			}

			/*
			 * Send all pkts that are queued waiting for the upcall.
			 * ip_mdq param tun set to 0 -
			 * the return value of ip_mdq() isn't used here,
			 * so value we send doesn't matter.
			 */
			while (rt->mfc_rte != NULL) {
				rte = rt->mfc_rte;
				rt->mfc_rte = rte->rte_next;
				/* mfc_mutex is not held across ip_mdq(). */
				mutex_exit(&rt->mfc_mutex);
				(void) ip_mdq(rte->mp, (ipha_t *)
				    rte->mp->b_rptr, rte->ill, 0, rt);
				freemsg(rte->mp);
				mi_free((char *)rte);
				mutex_enter(&rt->mfc_mutex);
			}
		}
		mutex_exit(&rt->mfc_mutex);
	}


	/*
	 * It is possible that an entry is being inserted without an upcall
	 */
	if (nstl == 0) {
		mutex_enter(&(mfcbp->mfcb_lock));
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "add_mfc: no upcall o %x g %x p %x",
			    ntohl(mfccp->mfcc_origin.s_addr),
			    ntohl(mfccp->mfcc_mcastgrp.s_addr),
			    mfccp->mfcc_parent);
		}
		/* Re-check now that the bucket lock is held. */
		if (is_mrouter_off(ipst)) {
			mutex_exit(&mfcbp->mfcb_lock);
			MFCB_REFRELE(mfcbp);
			return (EINVAL);
		}

		/* Refill a matching non-condemned entry if one exists. */
		for (rt = mfcbp->mfcb_mfc; rt; rt = rt->mfc_next) {

			mutex_enter(&rt->mfc_mutex);
			if ((rt->mfc_origin.s_addr ==
			    mfccp->mfcc_origin.s_addr) &&
			    (rt->mfc_mcastgrp.s_addr ==
			    mfccp->mfcc_mcastgrp.s_addr) &&
			    (!(rt->mfc_marks & MFCB_MARK_CONDEMNED))) {
				fill_route(rt, mfccp, ipst);
				mutex_exit(&rt->mfc_mutex);
				break;
			}
			mutex_exit(&rt->mfc_mutex);
		}

		/* No upcall, so make a new entry into mfctable */
		if (rt == NULL) {
			rt = (struct mfc *)mi_zalloc(sizeof (struct mfc));
			if (rt == NULL) {
				ip1dbg(("add_mfc: out of memory\n"));
				mutex_exit(&mfcbp->mfcb_lock);
				MFCB_REFRELE(mfcbp);
				return (ENOBUFS);
			}

			/* Insert new entry at head of hash chain */
			mutex_enter(&rt->mfc_mutex);
			fill_route(rt, mfccp, ipst);

			/* Link into table */
			rt->mfc_next = mfcbp->mfcb_mfc;
			mfcbp->mfcb_mfc = rt;
			mutex_exit(&rt->mfc_mutex);
		}
		mutex_exit(&mfcbp->mfcb_lock);
	}

	MFCB_REFRELE(mfcbp);
	return (0);
}

/*
 * Fills in mfc structure from
mrouted mfcctl. 1358 */ 1359 static void 1360 fill_route(struct mfc *rt, struct mfcctl *mfccp, ip_stack_t *ipst) 1361 { 1362 int i; 1363 1364 rt->mfc_origin = mfccp->mfcc_origin; 1365 rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp; 1366 rt->mfc_parent = mfccp->mfcc_parent; 1367 mutex_enter(&ipst->ips_numvifs_mutex); 1368 for (i = 0; i < (int)ipst->ips_numvifs; i++) { 1369 rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; 1370 } 1371 mutex_exit(&ipst->ips_numvifs_mutex); 1372 /* Initialize pkt counters per src-grp */ 1373 rt->mfc_pkt_cnt = 0; 1374 rt->mfc_byte_cnt = 0; 1375 rt->mfc_wrong_if = 0; 1376 rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_nsec = 0; 1377 1378 } 1379 1380 static void 1381 free_queue(struct mfc *mfcp) 1382 { 1383 struct rtdetq *rte0; 1384 1385 /* 1386 * Drop all queued upcall packets. 1387 * Free the mbuf with the pkt. 1388 */ 1389 while ((rte0 = mfcp->mfc_rte) != NULL) { 1390 mfcp->mfc_rte = rte0->rte_next; 1391 freemsg(rte0->mp); 1392 mi_free((char *)rte0); 1393 } 1394 } 1395 /* 1396 * go thorugh the hash bucket and free all the entries marked condemned. 
1397 */ 1398 void 1399 release_mfc(struct mfcb *mfcbp) 1400 { 1401 struct mfc *current_mfcp; 1402 struct mfc *prev_mfcp; 1403 1404 prev_mfcp = current_mfcp = mfcbp->mfcb_mfc; 1405 1406 while (current_mfcp != NULL) { 1407 if (current_mfcp->mfc_marks & MFCB_MARK_CONDEMNED) { 1408 if (current_mfcp == mfcbp->mfcb_mfc) { 1409 mfcbp->mfcb_mfc = current_mfcp->mfc_next; 1410 free_queue(current_mfcp); 1411 mi_free(current_mfcp); 1412 prev_mfcp = current_mfcp = mfcbp->mfcb_mfc; 1413 continue; 1414 } 1415 ASSERT(prev_mfcp != NULL); 1416 prev_mfcp->mfc_next = current_mfcp->mfc_next; 1417 free_queue(current_mfcp); 1418 mi_free(current_mfcp); 1419 current_mfcp = NULL; 1420 } else { 1421 prev_mfcp = current_mfcp; 1422 } 1423 1424 current_mfcp = prev_mfcp->mfc_next; 1425 1426 } 1427 mfcbp->mfcb_marks &= ~MFCB_MARK_CONDEMNED; 1428 ASSERT(mfcbp->mfcb_mfc != NULL || mfcbp->mfcb_marks == 0); 1429 } 1430 1431 /* 1432 * Delete an mfc entry. 1433 */ 1434 static int 1435 del_mfc(struct mfcctl *mfccp, ip_stack_t *ipst) 1436 { 1437 struct in_addr origin; 1438 struct in_addr mcastgrp; 1439 struct mfc *rt; 1440 uint_t hash; 1441 conn_t *mrouter = ipst->ips_ip_g_mrouter; 1442 1443 origin = mfccp->mfcc_origin; 1444 mcastgrp = mfccp->mfcc_mcastgrp; 1445 hash = MFCHASH(origin.s_addr, mcastgrp.s_addr); 1446 1447 if (ipst->ips_ip_mrtdebug > 1) { 1448 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 1449 "del_mfc: o %x g %x", 1450 ntohl(origin.s_addr), 1451 ntohl(mcastgrp.s_addr)); 1452 } 1453 1454 MFCB_REFHOLD(&ipst->ips_mfcs[hash]); 1455 1456 /* Find mfc in mfctable, finds only entries without upcalls */ 1457 for (rt = ipst->ips_mfcs[hash].mfcb_mfc; rt; rt = rt->mfc_next) { 1458 mutex_enter(&rt->mfc_mutex); 1459 if (origin.s_addr == rt->mfc_origin.s_addr && 1460 mcastgrp.s_addr == rt->mfc_mcastgrp.s_addr && 1461 rt->mfc_rte == NULL && 1462 !(rt->mfc_marks & MFCB_MARK_CONDEMNED)) 1463 break; 1464 mutex_exit(&rt->mfc_mutex); 1465 } 1466 1467 /* 1468 * Return if there was an upcall (mfc_rte != NULL, 
1469 * or rt not in mfctable. 1470 */ 1471 if (rt == NULL) { 1472 MFCB_REFRELE(&ipst->ips_mfcs[hash]); 1473 return (EADDRNOTAVAIL); 1474 } 1475 1476 1477 /* 1478 * no need to hold lock as we have a reference. 1479 */ 1480 ipst->ips_mfcs[hash].mfcb_marks |= MFCB_MARK_CONDEMNED; 1481 /* error checking */ 1482 if (rt->mfc_timeout_id != 0) { 1483 ip0dbg(("del_mfc: TIMEOUT NOT 0, rte not null")); 1484 /* 1485 * Its ok to drop the lock, the struct cannot be freed 1486 * since we have a ref on the hash bucket. 1487 */ 1488 rt->mfc_timeout_id = 0; 1489 mutex_exit(&rt->mfc_mutex); 1490 (void) untimeout(rt->mfc_timeout_id); 1491 mutex_enter(&rt->mfc_mutex); 1492 } 1493 1494 ASSERT(rt->mfc_rte == NULL); 1495 1496 1497 /* 1498 * Delete the entry from the cache 1499 */ 1500 rt->mfc_marks |= MFCB_MARK_CONDEMNED; 1501 mutex_exit(&rt->mfc_mutex); 1502 1503 MFCB_REFRELE(&ipst->ips_mfcs[hash]); 1504 1505 return (0); 1506 } 1507 1508 #define TUNNEL_LEN 12 /* # bytes of IP option for tunnel encapsulation */ 1509 1510 /* 1511 * IP multicast forwarding function. This function assumes that the packet 1512 * pointed to by ipha has arrived on (or is about to be sent to) the interface 1513 * pointed to by "ill", and the packet is to be relayed to other networks 1514 * that have members of the packet's destination IP multicast group. 1515 * 1516 * The packet is returned unscathed to the caller, unless it is 1517 * erroneous, in which case a -1 value tells the caller (IP) 1518 * to discard it. 1519 * 1520 * Unlike BSD, SunOS 5.x needs to return to IP info about 1521 * whether pkt came in thru a tunnel, so it can be discarded, unless 1522 * it's IGMP. In BSD, the ifp is bogus for tunnels, so pkt won't try 1523 * to be delivered. 
* Return values are 0 - pkt is okay and phyint
 *		    -1 - pkt is malformed and to be tossed
 *		     1 - pkt came in on tunnel
 *
 * mp is the packet and ira its receive attributes; ira_flags selects
 * between PIM register, encapsulated tunnel and plain arrival.
 *
 * When a cache entry is found the packet is handed to ip_mdq().  Otherwise
 * a copy of the packet is queued on a (possibly new) mfc entry and, for
 * the first queued packet only, an IGMPMSG_NOCACHE upcall is passed to the
 * multicast routing daemon and an expiry timer is started.
 */
int
ip_mforward(mblk_t *mp, ip_recv_attr_t *ira)
{
	ipha_t		*ipha = (ipha_t *)mp->b_rptr;
	ill_t		*ill = ira->ira_ill;
	struct mfc	*rt;
	ipaddr_t	src, dst, tunnel_src = 0;
	static int	srctun = 0;
	vifi_t		vifi;
	boolean_t	pim_reg_packet = B_FALSE;
	struct mfcb	*mfcbp;
	ip_stack_t	*ipst = ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
	ill_t		*rill = ira->ira_rill;

	ASSERT(ira->ira_pktlen == msgdsize(mp));

	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "ip_mforward: RECV ipha_src %x, ipha_dst %x, ill %s",
		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
		    ill->ill_name);
	}

	/* Classify the arrival: PIM register, encap tunnel, or neither. */
	dst = ipha->ipha_dst;
	if (ira->ira_flags & IRAF_PIM_REGISTER)
		pim_reg_packet = B_TRUE;
	else if (ira->ira_flags & IRAF_MROUTE_TUNNEL_SET)
		tunnel_src = ira->ira_mroute_tunnel;

	/*
	 * Don't forward a packet with time-to-live of zero or one,
	 * or a packet destined to a local-only group.
	 */
	if (CLASSD(dst) && (ipha->ipha_ttl <= 1 ||
	    (ipaddr_t)ntohl(dst) <= INADDR_MAX_LOCAL_GROUP)) {
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: not forwarded ttl %d,"
			    " dst 0x%x ill %s",
			    ipha->ipha_ttl, ntohl(dst), ill->ill_name);
		}
		if (tunnel_src != 0)
			return (1);
		else
			return (0);
	}

	if ((tunnel_src != 0) || pim_reg_packet) {
		/*
		 * Packet arrived over an encapsulated tunnel or via a PIM
		 * register message.
		 */
		if (ipst->ips_ip_mrtdebug > 1) {
			if (tunnel_src != 0) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: ill %s arrived via ENCAP TUN",
				    ill->ill_name);
			} else if (pim_reg_packet) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: ill %s arrived via"
				    " REGISTER VIF",
				    ill->ill_name);
			}
		}
	} else if ((ipha->ipha_version_and_hdr_length & 0xf) <
	    (uint_t)(IP_SIMPLE_HDR_LENGTH + TUNNEL_LEN) >> 2 ||
	    ((uchar_t *)(ipha + 1))[1] != IPOPT_LSRR) {
		/* Packet arrived via a physical interface. */
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: ill %s arrived via PHYINT",
			    ill->ill_name);
		}

	} else {
		/*
		 * Packet arrived through a SRCRT tunnel.
		 * Source-route tunnels are no longer supported.
		 * Error message printed every 1000 times.
		 */
		if ((srctun++ % 1000) == 0) {
			cmn_err(CE_WARN,
			    "ip_mforward: received source-routed pkt from %x",
			    ntohl(ipha->ipha_src));
		}
		return (-1);
	}

	ipst->ips_mrtstat->mrts_fwd_in++;
	src = ipha->ipha_src;

	/* Find route in cache, return NULL if not there or upcalls q'ed. */

	/*
	 * Lock the mfctable against changes made by ip_mforward.
	 * Note that only add_mfc and del_mfc can remove entries and
	 * they run with exclusive access to IP. So we do not need to
	 * guard against the rt being deleted, so release lock after reading.
	 */

	if (is_mrouter_off(ipst))
		return (-1);

	/* The bucket reference is held until every return path below. */
	mfcbp = &ipst->ips_mfcs[MFCHASH(src, dst)];
	MFCB_REFHOLD(mfcbp);
	MFCFIND(mfcbp, src, dst, rt);

	/* Entry exists, so forward if necessary */
	if (rt != NULL) {
		int ret = 0;
		ipst->ips_mrtstat->mrts_mfc_hits++;
		if (pim_reg_packet) {
			/* Register packets are forwarded as if from reg vif. */
			ASSERT(ipst->ips_reg_vif_num != ALL_VIFS);
			ret = ip_mdq(mp, ipha,
			    ipst->ips_vifs[ipst->ips_reg_vif_num].
			    v_ipif->ipif_ill,
			    0, rt);
		} else {
			ret = ip_mdq(mp, ipha, ill, tunnel_src, rt);
		}

		MFCB_REFRELE(mfcbp);
		return (ret);

		/*
		 * Don't forward if we don't have a cache entry.  Mrouted will
		 * always provide a cache entry in response to an upcall.
		 */
	} else {
		/*
		 * If we don't have a route for packet's origin, make a copy
		 * of the packet and send message to routing daemon.
		 */
		struct mfc	*mfc_rt	 = NULL;
		mblk_t		*mp0	 = NULL;
		mblk_t		*mp_copy = NULL;
		struct rtdetq	*rte	 = NULL;
		struct rtdetq	*rte_m, *rte1, *prev_rte;
		uint_t		hash;
		int		npkts;
		boolean_t	new_mfc = B_FALSE;
		ipst->ips_mrtstat->mrts_mfc_misses++;
		/* BSD uses mrts_no_route++ */
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: no rte ill %s src %x g %x misses %d",
			    ill->ill_name, ntohl(src), ntohl(dst),
			    (int)ipst->ips_mrtstat->mrts_mfc_misses);
		}
		/*
		 * The order of the following code differs from the BSD code.
		 * Pre-mc3.5, the BSD code was incorrect and SunOS 5.x
		 * code works, so SunOS 5.x wasn't changed to conform to the
		 * BSD version.
		 */

		/* Lock mfctable. */
		hash = MFCHASH(src, dst);
		mutex_enter(&(ipst->ips_mfcs[hash].mfcb_lock));

		/*
		 * If we are turning off mrouted return an error
		 */
		if (is_mrouter_off(ipst)) {
			mutex_exit(&mfcbp->mfcb_lock);
			MFCB_REFRELE(mfcbp);
			return (-1);
		}

		/*
		 * Is there an upcall waiting for this packet?
		 * On a match the loop exits with mfc_rt->mfc_mutex held.
		 */
		for (mfc_rt = ipst->ips_mfcs[hash].mfcb_mfc; mfc_rt;
		    mfc_rt = mfc_rt->mfc_next) {
			mutex_enter(&mfc_rt->mfc_mutex);
			if (ipst->ips_ip_mrtdebug > 1) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: MFCTAB hash %d o 0x%x"
				    " g 0x%x\n",
				    hash, ntohl(mfc_rt->mfc_origin.s_addr),
				    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
			}
			/* There is an upcall */
			if ((src == mfc_rt->mfc_origin.s_addr) &&
			    (dst == mfc_rt->mfc_mcastgrp.s_addr) &&
			    (mfc_rt->mfc_rte != NULL) &&
			    !(mfc_rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
				break;
			}
			mutex_exit(&mfc_rt->mfc_mutex);
		}
		/* No upcall, so make a new entry into mfctable */
		if (mfc_rt == NULL) {
			mfc_rt = (struct mfc *)mi_zalloc(sizeof (struct mfc));
			if (mfc_rt == NULL) {
				ipst->ips_mrtstat->mrts_fwd_drop++;
				ip1dbg(("ip_mforward: out of memory "
				    "for mfc, mfc_rt\n"));
				goto error_return;
			} else
				new_mfc = B_TRUE;
			/* Get resources */
			/* TODO could copy header and dup rest */
			/* mp_copy becomes the upcall sent to the daemon. */
			mp_copy = copymsg(mp);
			if (mp_copy == NULL) {
				ipst->ips_mrtstat->mrts_fwd_drop++;
				ip1dbg(("ip_mforward: out of memory for "
				    "mblk, mp_copy\n"));
				goto error_return;
			}
			mutex_enter(&mfc_rt->mfc_mutex);
		}
		/* Get resources for rte, whether first rte or not first. */
		/* Add this packet into rtdetq */
		rte = (struct rtdetq *)mi_zalloc(sizeof (struct rtdetq));
		if (rte == NULL) {
			ipst->ips_mrtstat->mrts_fwd_drop++;
			mutex_exit(&mfc_rt->mfc_mutex);
			ip1dbg(("ip_mforward: out of memory for"
			    " rtdetq, rte\n"));
			goto error_return;
		}

		/* mp0 is the copy that waits on the queue for a route. */
		mp0 = copymsg(mp);
		if (mp0 == NULL) {
			ipst->ips_mrtstat->mrts_fwd_drop++;
			ip1dbg(("ip_mforward: out of memory for mblk, mp0\n"));
			mutex_exit(&mfc_rt->mfc_mutex);
			goto error_return;
		}
		rte->mp		= mp0;
		if (pim_reg_packet) {
			ASSERT(ipst->ips_reg_vif_num != ALL_VIFS);
			rte->ill =
			    ipst->ips_vifs[ipst->ips_reg_vif_num].
			    v_ipif->ipif_ill;
		} else {
			rte->ill = ill;
		}
		rte->rte_next	= NULL;

		/*
		 * Determine if upcall q (rtdetq) has overflowed.
		 * mfc_rt->mfc_rte is null by mi_zalloc
		 * if it is the first message.
		 */
		for (rte_m = mfc_rt->mfc_rte, npkts = 0; rte_m;
		    rte_m = rte_m->rte_next)
			npkts++;
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: upcalls %d\n", npkts);
		}
		if (npkts > MAX_UPQ) {
			ipst->ips_mrtstat->mrts_upq_ovflw++;
			mutex_exit(&mfc_rt->mfc_mutex);
			goto error_return;
		}

		if (npkts == 0) {	/* first upcall */
			int i = 0;
			/*
			 * Now finish installing the new mfc! Now that we have
			 * resources!  Insert new entry at head of hash chain.
			 * Use src and dst which are ipaddr_t's.
			 */
			mfc_rt->mfc_origin.s_addr = src;
			mfc_rt->mfc_mcastgrp.s_addr = dst;

			mutex_enter(&ipst->ips_numvifs_mutex);
			for (i = 0; i < (int)ipst->ips_numvifs; i++)
				mfc_rt->mfc_ttls[i] = 0;
			mutex_exit(&ipst->ips_numvifs_mutex);
			mfc_rt->mfc_parent = ALL_VIFS;

			/* Link into table */
			if (ipst->ips_ip_mrtdebug > 1) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: NEW MFCTAB hash %d o 0x%x "
				    "g 0x%x\n", hash,
				    ntohl(mfc_rt->mfc_origin.s_addr),
				    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
			}
			mfc_rt->mfc_next = ipst->ips_mfcs[hash].mfcb_mfc;
			ipst->ips_mfcs[hash].mfcb_mfc = mfc_rt;
			mfc_rt->mfc_rte = NULL;
		}

		/* Link in the upcall */
		/* First upcall */
		if (mfc_rt->mfc_rte == NULL)
			mfc_rt->mfc_rte = rte;
		else {
			/* not the first upcall: append at the queue tail */
			prev_rte = mfc_rt->mfc_rte;
			for (rte1 = mfc_rt->mfc_rte->rte_next; rte1;
			    prev_rte = rte1, rte1 = rte1->rte_next)
				;
			prev_rte->rte_next = rte;
		}

		/*
		 * No upcalls waiting, this is first one, so send a message to
		 * routing daemon to install a route into kernel table.
		 */
		if (npkts == 0) {
			struct igmpmsg	*im;
			/* ipha_protocol is 0, for upcall */
			ASSERT(mp_copy != NULL);
			im = (struct igmpmsg *)mp_copy->b_rptr;
			im->im_msgtype	= IGMPMSG_NOCACHE;
			im->im_mbz = 0;
			mutex_enter(&ipst->ips_numvifs_mutex);
			if (pim_reg_packet) {
				im->im_vif = (uchar_t)ipst->ips_reg_vif_num;
				mutex_exit(&ipst->ips_numvifs_mutex);
			} else {
				/*
				 * XXX do we need to hold locks here ?
				 */
				/* Report the first vif bound to this ill. */
				for (vifi = 0;
				    vifi < ipst->ips_numvifs;
				    vifi++) {
					if (ipst->ips_vifs[vifi].v_ipif == NULL)
						continue;
					if (ipst->ips_vifs[vifi].
					    v_ipif->ipif_ill == ill) {
						im->im_vif = (uchar_t)vifi;
						break;
					}
				}
				mutex_exit(&ipst->ips_numvifs_mutex);
				ASSERT(vifi < ipst->ips_numvifs);
			}

			ipst->ips_mrtstat->mrts_upcalls++;
			/* Timer to discard upcalls if mrouted is too slow */
			mfc_rt->mfc_timeout_id = timeout(expire_upcalls,
			    mfc_rt, EXPIRE_TIMEOUT * UPCALL_EXPIRE);
			mutex_exit(&mfc_rt->mfc_mutex);
			mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
			/*
			 * Pass to RAWIP.  The ill fields of ira are cleared
			 * for the upcall and restored afterwards.
			 */
			ira->ira_ill = ira->ira_rill = NULL;
			(mrouter->conn_recv)(mrouter, mp_copy, NULL, ira);
			ira->ira_ill = ill;
			ira->ira_rill = rill;
		} else {
			/*
			 * NOTE(review): mp_copy is only allocated above when
			 * a new mfc is created, so it looks like it is NULL
			 * on this path - confirm that the drop accounting
			 * here is intended.
			 */
			mutex_exit(&mfc_rt->mfc_mutex);
			mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			ip_drop_input("ip_mforward - upcall already waiting",
			    mp_copy, ill);
			freemsg(mp_copy);
		}

		MFCB_REFRELE(mfcbp);
		if (tunnel_src != 0)
			return (1);
		else
			return (0);
	error_return:
		/*
		 * Common failure exit: callers have already dropped
		 * mfc_mutex if they held it.  Release the bucket and free
		 * whatever was allocated in this call.
		 */
		mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
		MFCB_REFRELE(mfcbp);
		if (mfc_rt != NULL && (new_mfc == B_TRUE))
			mi_free((char *)mfc_rt);
		if (rte != NULL)
			mi_free((char *)rte);
		if (mp_copy != NULL) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			ip_drop_input("ip_mforward error", mp_copy, ill);
			freemsg(mp_copy);
		}
		if (mp0 != NULL)
			freemsg(mp0);
		return (-1);
	}
}

/*
 * Clean up the mfctable cache entry if upcall is not serviced.
 * SunOS 5.x has timeout per mfc, unlike BSD which has one timer.
1912 */ 1913 static void 1914 expire_upcalls(void *arg) 1915 { 1916 struct mfc *mfc_rt = arg; 1917 uint_t hash; 1918 struct mfc *prev_mfc, *mfc0; 1919 ip_stack_t *ipst; 1920 conn_t *mrouter; 1921 1922 if (mfc_rt->mfc_rte == NULL || mfc_rt->mfc_rte->ill != NULL) { 1923 cmn_err(CE_WARN, "expire_upcalls: no ILL\n"); 1924 return; 1925 } 1926 ipst = mfc_rt->mfc_rte->ill->ill_ipst; 1927 mrouter = ipst->ips_ip_g_mrouter; 1928 1929 hash = MFCHASH(mfc_rt->mfc_origin.s_addr, mfc_rt->mfc_mcastgrp.s_addr); 1930 if (ipst->ips_ip_mrtdebug > 1) { 1931 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 1932 "expire_upcalls: hash %d s %x g %x", 1933 hash, ntohl(mfc_rt->mfc_origin.s_addr), 1934 ntohl(mfc_rt->mfc_mcastgrp.s_addr)); 1935 } 1936 MFCB_REFHOLD(&ipst->ips_mfcs[hash]); 1937 mutex_enter(&mfc_rt->mfc_mutex); 1938 /* 1939 * if timeout has been set to zero, than the 1940 * entry has been filled, no need to delete it. 1941 */ 1942 if (mfc_rt->mfc_timeout_id == 0) 1943 goto done; 1944 ipst->ips_mrtstat->mrts_cache_cleanups++; 1945 mfc_rt->mfc_timeout_id = 0; 1946 1947 /* Determine entry to be cleaned up in cache table. */ 1948 for (prev_mfc = mfc0 = ipst->ips_mfcs[hash].mfcb_mfc; mfc0; 1949 prev_mfc = mfc0, mfc0 = mfc0->mfc_next) 1950 if (mfc0 == mfc_rt) 1951 break; 1952 1953 /* del_mfc takes care of gone mfcs */ 1954 ASSERT(prev_mfc != NULL); 1955 ASSERT(mfc0 != NULL); 1956 1957 /* 1958 * Delete the entry from the cache 1959 */ 1960 ipst->ips_mfcs[hash].mfcb_marks |= MFCB_MARK_CONDEMNED; 1961 mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED; 1962 1963 /* 1964 * release_mfc will drop all queued upcall packets. 1965 * and will free the mbuf with the pkt, if, timing info. 1966 */ 1967 done: 1968 mutex_exit(&mfc_rt->mfc_mutex); 1969 MFCB_REFRELE(&ipst->ips_mfcs[hash]); 1970 } 1971 1972 /* 1973 * Packet forwarding routine once entry in the cache is made. 
1974 */ 1975 static int 1976 ip_mdq(mblk_t *mp, ipha_t *ipha, ill_t *ill, ipaddr_t tunnel_src, 1977 struct mfc *rt) 1978 { 1979 vifi_t vifi; 1980 struct vif *vifp; 1981 ipaddr_t dst = ipha->ipha_dst; 1982 size_t plen = msgdsize(mp); 1983 vifi_t num_of_vifs; 1984 ip_stack_t *ipst = ill->ill_ipst; 1985 conn_t *mrouter = ipst->ips_ip_g_mrouter; 1986 ip_recv_attr_t iras; 1987 1988 if (ipst->ips_ip_mrtdebug > 1) { 1989 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 1990 "ip_mdq: SEND src %x, ipha_dst %x, ill %s", 1991 ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst), 1992 ill->ill_name); 1993 } 1994 1995 /* Macro to send packet on vif */ 1996 #define MC_SEND(ipha, mp, vifp, dst) { \ 1997 if ((vifp)->v_flags & VIFF_TUNNEL) \ 1998 encap_send((ipha), (mp), (vifp), (dst)); \ 1999 else if ((vifp)->v_flags & VIFF_REGISTER) \ 2000 register_send((ipha), (mp), (vifp), (dst)); \ 2001 else \ 2002 phyint_send((ipha), (mp), (vifp), (dst)); \ 2003 } 2004 2005 vifi = rt->mfc_parent; 2006 2007 /* 2008 * The value of vifi is MAXVIFS if the pkt had no parent, i.e., 2009 * Mrouted had no route. 2010 * We wanted the route installed in the mfctable to prevent multiple 2011 * tries, so it passed add_mfc(), but is discarded here. The v_ipif is 2012 * NULL so we don't want to check the ill. Still needed as of Mrouted 2013 * 3.6. 2014 */ 2015 if (vifi == NO_VIF) { 2016 ip1dbg(("ip_mdq: no route for origin ill %s, vifi is NO_VIF\n", 2017 ill->ill_name)); 2018 if (ipst->ips_ip_mrtdebug > 1) { 2019 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2020 "ip_mdq: vifi is NO_VIF ill = %s", ill->ill_name); 2021 } 2022 return (-1); /* drop pkt */ 2023 } 2024 2025 if (!lock_good_vif(&ipst->ips_vifs[vifi])) 2026 return (-1); 2027 /* 2028 * The MFC entries are not cleaned up when an ipif goes 2029 * away thus this code has to guard against an MFC referencing 2030 * an ipif that has been closed. Note: reset_mrt_vif_ipif 2031 * sets the v_ipif to NULL when the ipif disappears. 
2032 */ 2033 ASSERT(ipst->ips_vifs[vifi].v_ipif != NULL); 2034 2035 if (vifi >= ipst->ips_numvifs) { 2036 cmn_err(CE_WARN, "ip_mdq: illegal vifi %d numvifs " 2037 "%d ill %s viftable ill %s\n", 2038 (int)vifi, (int)ipst->ips_numvifs, ill->ill_name, 2039 ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name); 2040 unlock_good_vif(&ipst->ips_vifs[vifi]); 2041 return (-1); 2042 } 2043 /* 2044 * Don't forward if it didn't arrive from the parent vif for its 2045 * origin. 2046 */ 2047 if ((ipst->ips_vifs[vifi].v_ipif->ipif_ill != ill) || 2048 (ipst->ips_vifs[vifi].v_rmt_addr.s_addr != tunnel_src)) { 2049 /* Came in the wrong interface */ 2050 ip1dbg(("ip_mdq: arrived wrong if, vifi %d " 2051 "numvifs %d ill %s viftable ill %s\n", 2052 (int)vifi, (int)ipst->ips_numvifs, ill->ill_name, 2053 ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name)); 2054 if (ipst->ips_ip_mrtdebug > 1) { 2055 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2056 "ip_mdq: arrived wrong if, vifi %d ill " 2057 "%s viftable ill %s\n", 2058 (int)vifi, ill->ill_name, 2059 ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name); 2060 } 2061 ipst->ips_mrtstat->mrts_wrong_if++; 2062 rt->mfc_wrong_if++; 2063 2064 /* 2065 * If we are doing PIM assert processing and we are forwarding 2066 * packets on this interface, and it is a broadcast medium 2067 * interface (and not a tunnel), send a message to the routing. 2068 * 2069 * We use the first ipif on the list, since it's all we have. 2070 * Chances are the ipif_flags are the same for ipifs on the ill. 
2071 */ 2072 if (ipst->ips_pim_assert && rt->mfc_ttls[vifi] > 0 && 2073 (ill->ill_ipif->ipif_flags & IPIF_BROADCAST) && 2074 !(ipst->ips_vifs[vifi].v_flags & VIFF_TUNNEL)) { 2075 mblk_t *mp_copy; 2076 struct igmpmsg *im; 2077 2078 /* TODO could copy header and dup rest */ 2079 mp_copy = copymsg(mp); 2080 if (mp_copy == NULL) { 2081 ipst->ips_mrtstat->mrts_fwd_drop++; 2082 ip1dbg(("ip_mdq: out of memory " 2083 "for mblk, mp_copy\n")); 2084 unlock_good_vif(&ipst->ips_vifs[vifi]); 2085 return (-1); 2086 } 2087 2088 im = (struct igmpmsg *)mp_copy->b_rptr; 2089 im->im_msgtype = IGMPMSG_WRONGVIF; 2090 im->im_mbz = 0; 2091 im->im_vif = (ushort_t)vifi; 2092 /* Pass to RAWIP */ 2093 2094 bzero(&iras, sizeof (iras)); 2095 iras.ira_flags = IRAF_IS_IPV4; 2096 iras.ira_ip_hdr_length = 2097 IPH_HDR_LENGTH(mp_copy->b_rptr); 2098 iras.ira_pktlen = msgdsize(mp_copy); 2099 (mrouter->conn_recv)(mrouter, mp_copy, NULL, &iras); 2100 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); 2101 } 2102 unlock_good_vif(&ipst->ips_vifs[vifi]); 2103 if (tunnel_src != 0) 2104 return (1); 2105 else 2106 return (0); 2107 } 2108 /* 2109 * If I sourced this packet, it counts as output, else it was input. 2110 */ 2111 if (ipha->ipha_src == ipst->ips_vifs[vifi].v_lcl_addr.s_addr) { 2112 ipst->ips_vifs[vifi].v_pkt_out++; 2113 ipst->ips_vifs[vifi].v_bytes_out += plen; 2114 } else { 2115 ipst->ips_vifs[vifi].v_pkt_in++; 2116 ipst->ips_vifs[vifi].v_bytes_in += plen; 2117 } 2118 mutex_enter(&rt->mfc_mutex); 2119 rt->mfc_pkt_cnt++; 2120 rt->mfc_byte_cnt += plen; 2121 mutex_exit(&rt->mfc_mutex); 2122 unlock_good_vif(&ipst->ips_vifs[vifi]); 2123 /* 2124 * For each vif, decide if a copy of the packet should be forwarded. 2125 * Forward if: 2126 * - the vif threshold ttl is non-zero AND 2127 * - the pkt ttl exceeds the vif's threshold 2128 * A non-zero mfc_ttl indicates that the vif is part of 2129 * the output set for the mfc entry. 
2130 */ 2131 mutex_enter(&ipst->ips_numvifs_mutex); 2132 num_of_vifs = ipst->ips_numvifs; 2133 mutex_exit(&ipst->ips_numvifs_mutex); 2134 for (vifp = ipst->ips_vifs, vifi = 0; 2135 vifi < num_of_vifs; 2136 vifp++, vifi++) { 2137 if (!lock_good_vif(vifp)) 2138 continue; 2139 if ((rt->mfc_ttls[vifi] > 0) && 2140 (ipha->ipha_ttl > rt->mfc_ttls[vifi])) { 2141 /* 2142 * lock_good_vif should not have succedded if 2143 * v_ipif is null. 2144 */ 2145 ASSERT(vifp->v_ipif != NULL); 2146 vifp->v_pkt_out++; 2147 vifp->v_bytes_out += plen; 2148 MC_SEND(ipha, mp, vifp, dst); 2149 ipst->ips_mrtstat->mrts_fwd_out++; 2150 } 2151 unlock_good_vif(vifp); 2152 } 2153 if (tunnel_src != 0) 2154 return (1); 2155 else 2156 return (0); 2157 } 2158 2159 /* 2160 * Send the packet on physical interface. 2161 * Caller assumes can continue to use mp on return. 2162 */ 2163 /* ARGSUSED */ 2164 static void 2165 phyint_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst) 2166 { 2167 mblk_t *mp_copy; 2168 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 2169 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2170 2171 /* Make a new reference to the packet */ 2172 mp_copy = copymsg(mp); /* TODO could copy header and dup rest */ 2173 if (mp_copy == NULL) { 2174 ipst->ips_mrtstat->mrts_fwd_drop++; 2175 ip1dbg(("phyint_send: out of memory for mblk, mp_copy\n")); 2176 return; 2177 } 2178 if (vifp->v_rate_limit <= 0) 2179 tbf_send_packet(vifp, mp_copy); 2180 else { 2181 if (ipst->ips_ip_mrtdebug > 1) { 2182 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2183 "phyint_send: tbf_contr rate %d " 2184 "vifp 0x%p mp 0x%p dst 0x%x", 2185 vifp->v_rate_limit, (void *)vifp, (void *)mp, dst); 2186 } 2187 tbf_control(vifp, mp_copy, (ipha_t *)mp_copy->b_rptr); 2188 } 2189 } 2190 2191 /* 2192 * Send the whole packet for REGISTER encapsulation to PIM daemon 2193 * Caller assumes it can continue to use mp on return. 
2194 */ 2195 /* ARGSUSED */ 2196 static void 2197 register_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst) 2198 { 2199 struct igmpmsg *im; 2200 mblk_t *mp_copy; 2201 ipha_t *ipha_copy; 2202 ill_t *ill = vifp->v_ipif->ipif_ill; 2203 ip_stack_t *ipst = ill->ill_ipst; 2204 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2205 ip_recv_attr_t iras; 2206 2207 if (ipst->ips_ip_mrtdebug > 1) { 2208 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2209 "register_send: src %x, dst %x\n", 2210 ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst)); 2211 } 2212 2213 /* 2214 * Copy the old packet & pullup its IP header into the new mblk_t so we 2215 * can modify it. Try to fill the new mblk_t since if we don't the 2216 * ethernet driver will. 2217 */ 2218 mp_copy = allocb(sizeof (struct igmpmsg) + sizeof (ipha_t), BPRI_MED); 2219 if (mp_copy == NULL) { 2220 ++ipst->ips_mrtstat->mrts_pim_nomemory; 2221 if (ipst->ips_ip_mrtdebug > 3) { 2222 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2223 "register_send: allocb failure."); 2224 } 2225 return; 2226 } 2227 2228 /* 2229 * Bump write pointer to account for igmpmsg being added. 2230 */ 2231 mp_copy->b_wptr = mp_copy->b_rptr + sizeof (struct igmpmsg); 2232 2233 /* 2234 * Chain packet to new mblk_t. 2235 */ 2236 if ((mp_copy->b_cont = copymsg(mp)) == NULL) { 2237 ++ipst->ips_mrtstat->mrts_pim_nomemory; 2238 if (ipst->ips_ip_mrtdebug > 3) { 2239 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2240 "register_send: copymsg failure."); 2241 } 2242 freeb(mp_copy); 2243 return; 2244 } 2245 2246 /* 2247 * icmp_input() asserts that IP version field is set to an 2248 * appropriate version. Hence, the struct igmpmsg that this really 2249 * becomes, needs to have the correct IP version field. 2250 */ 2251 ipha_copy = (ipha_t *)mp_copy->b_rptr; 2252 *ipha_copy = multicast_encap_iphdr; 2253 2254 /* 2255 * The kernel uses the struct igmpmsg header to encode the messages to 2256 * the multicast routing daemon. 
Fill in the fields in the header 2257 * starting with the message type which is IGMPMSG_WHOLEPKT 2258 */ 2259 im = (struct igmpmsg *)mp_copy->b_rptr; 2260 im->im_msgtype = IGMPMSG_WHOLEPKT; 2261 im->im_src.s_addr = ipha->ipha_src; 2262 im->im_dst.s_addr = ipha->ipha_dst; 2263 2264 /* 2265 * Must Be Zero. This is because the struct igmpmsg is really an IP 2266 * header with renamed fields and the multicast routing daemon uses 2267 * an ipha_protocol (aka im_mbz) of 0 to distinguish these messages. 2268 */ 2269 im->im_mbz = 0; 2270 2271 ++ipst->ips_mrtstat->mrts_upcalls; 2272 if (IPCL_IS_NONSTR(mrouter) ? mrouter->conn_flow_cntrld : 2273 !canputnext(mrouter->conn_rq)) { 2274 ++ipst->ips_mrtstat->mrts_pim_regsend_drops; 2275 if (ipst->ips_ip_mrtdebug > 3) { 2276 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2277 "register_send: register upcall failure."); 2278 } 2279 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2280 ip_drop_input("mrts_pim_regsend_drops", mp_copy, ill); 2281 freemsg(mp_copy); 2282 } else { 2283 /* Pass to RAWIP */ 2284 bzero(&iras, sizeof (iras)); 2285 iras.ira_flags = IRAF_IS_IPV4; 2286 iras.ira_ip_hdr_length = sizeof (ipha_t); 2287 iras.ira_pktlen = msgdsize(mp_copy); 2288 (mrouter->conn_recv)(mrouter, mp_copy, NULL, &iras); 2289 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); 2290 } 2291 } 2292 2293 /* 2294 * pim_validate_cksum handles verification of the checksum in the 2295 * pim header. For PIM Register packets, the checksum is calculated 2296 * across the PIM header only. For all other packets, the checksum 2297 * is for the PIM header and remainder of the packet. 2298 * 2299 * returns: B_TRUE, if checksum is okay. 2300 * B_FALSE, if checksum is not valid. 
2301 */ 2302 static boolean_t 2303 pim_validate_cksum(mblk_t *mp, ipha_t *ip, struct pim *pimp) 2304 { 2305 mblk_t *mp_dup; 2306 2307 if ((mp_dup = dupmsg(mp)) == NULL) 2308 return (B_FALSE); 2309 2310 mp_dup->b_rptr += IPH_HDR_LENGTH(ip); 2311 if (pimp->pim_type == PIM_REGISTER) 2312 mp_dup->b_wptr = mp_dup->b_rptr + PIM_MINLEN; 2313 if (IP_CSUM(mp_dup, 0, 0)) { 2314 freemsg(mp_dup); 2315 return (B_FALSE); 2316 } 2317 freemsg(mp_dup); 2318 return (B_TRUE); 2319 } 2320 2321 /* 2322 * Process PIM protocol packets i.e. IP Protocol 103. 2323 * Register messages are decapsulated and sent onto multicast forwarding. 2324 * 2325 * Return NULL for a bad packet that is discarded here. 2326 * Return mp if the message is OK and should be handed to "raw" receivers. 2327 * Callers of pim_input() may need to reinitialize variables that were copied 2328 * from the mblk as this calls pullupmsg(). 2329 */ 2330 mblk_t * 2331 pim_input(mblk_t *mp, ip_recv_attr_t *ira) 2332 { 2333 ipha_t *eip, *ip; 2334 int iplen, pimlen, iphlen; 2335 struct pim *pimp; /* pointer to a pim struct */ 2336 uint32_t *reghdr; 2337 ill_t *ill = ira->ira_ill; 2338 ip_stack_t *ipst = ill->ill_ipst; 2339 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2340 2341 /* 2342 * Pullup the msg for PIM protocol processing. 
2343 */ 2344 if (pullupmsg(mp, -1) == 0) { 2345 ++ipst->ips_mrtstat->mrts_pim_nomemory; 2346 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2347 ip_drop_input("mrts_pim_nomemory", mp, ill); 2348 freemsg(mp); 2349 return (NULL); 2350 } 2351 2352 ip = (ipha_t *)mp->b_rptr; 2353 iplen = ip->ipha_length; 2354 iphlen = IPH_HDR_LENGTH(ip); 2355 pimlen = ntohs(iplen) - iphlen; 2356 2357 /* 2358 * Validate lengths 2359 */ 2360 if (pimlen < PIM_MINLEN) { 2361 ++ipst->ips_mrtstat->mrts_pim_malformed; 2362 if (ipst->ips_ip_mrtdebug > 1) { 2363 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2364 "pim_input: length not at least minlen"); 2365 } 2366 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2367 ip_drop_input("mrts_pim_malformed", mp, ill); 2368 freemsg(mp); 2369 return (NULL); 2370 } 2371 2372 /* 2373 * Point to the PIM header. 2374 */ 2375 pimp = (struct pim *)((caddr_t)ip + iphlen); 2376 2377 /* 2378 * Check the version number. 2379 */ 2380 if (pimp->pim_vers != PIM_VERSION) { 2381 ++ipst->ips_mrtstat->mrts_pim_badversion; 2382 if (ipst->ips_ip_mrtdebug > 1) { 2383 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2384 "pim_input: unknown version of PIM"); 2385 } 2386 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2387 ip_drop_input("mrts_pim_badversion", mp, ill); 2388 freemsg(mp); 2389 return (NULL); 2390 } 2391 2392 /* 2393 * Validate the checksum 2394 */ 2395 if (!pim_validate_cksum(mp, ip, pimp)) { 2396 ++ipst->ips_mrtstat->mrts_pim_rcv_badcsum; 2397 if (ipst->ips_ip_mrtdebug > 1) { 2398 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2399 "pim_input: invalid checksum"); 2400 } 2401 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2402 ip_drop_input("pim_rcv_badcsum", mp, ill); 2403 freemsg(mp); 2404 return (NULL); 2405 } 2406 2407 if (pimp->pim_type != PIM_REGISTER) 2408 return (mp); 2409 2410 reghdr = (uint32_t *)(pimp + 1); 2411 eip = (ipha_t *)(reghdr + 1); 2412 2413 /* 2414 * check if the inner packet is destined to mcast group 2415 */ 2416 if 
(!CLASSD(eip->ipha_dst)) { 2417 ++ipst->ips_mrtstat->mrts_pim_badregisters; 2418 if (ipst->ips_ip_mrtdebug > 1) { 2419 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2420 "pim_input: Inner pkt not mcast .. !"); 2421 } 2422 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2423 ip_drop_input("mrts_pim_badregisters", mp, ill); 2424 freemsg(mp); 2425 return (NULL); 2426 } 2427 if (ipst->ips_ip_mrtdebug > 1) { 2428 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2429 "register from %x, to %x, len %d", 2430 ntohl(eip->ipha_src), 2431 ntohl(eip->ipha_dst), 2432 ntohs(eip->ipha_length)); 2433 } 2434 /* 2435 * If the null register bit is not set, decapsulate 2436 * the packet before forwarding it. 2437 * Avoid this in no register vif 2438 */ 2439 if (!(ntohl(*reghdr) & PIM_NULL_REGISTER) && 2440 ipst->ips_reg_vif_num != ALL_VIFS) { 2441 mblk_t *mp_copy; 2442 uint_t saved_pktlen; 2443 2444 /* Copy the message */ 2445 if ((mp_copy = copymsg(mp)) == NULL) { 2446 ++ipst->ips_mrtstat->mrts_pim_nomemory; 2447 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2448 ip_drop_input("mrts_pim_nomemory", mp, ill); 2449 freemsg(mp); 2450 return (NULL); 2451 } 2452 2453 /* 2454 * Decapsulate the packet and give it to 2455 * register_mforward. 2456 */ 2457 mp_copy->b_rptr += iphlen + sizeof (pim_t) + sizeof (*reghdr); 2458 saved_pktlen = ira->ira_pktlen; 2459 ira->ira_pktlen -= iphlen + sizeof (pim_t) + sizeof (*reghdr); 2460 if (register_mforward(mp_copy, ira) != 0) { 2461 /* register_mforward already called ip_drop_input */ 2462 freemsg(mp); 2463 ira->ira_pktlen = saved_pktlen; 2464 return (NULL); 2465 } 2466 ira->ira_pktlen = saved_pktlen; 2467 } 2468 2469 /* 2470 * Pass all valid PIM packets up to any process(es) listening on a raw 2471 * PIM socket. For Solaris it is done right after pim_input() is 2472 * called. 2473 */ 2474 return (mp); 2475 } 2476 2477 /* 2478 * PIM sparse mode hook. Called by pim_input after decapsulating 2479 * the packet. 
Loop back the packet, as if we have received it. 2480 * In pim_input() we have to check if the destination is a multicast address. 2481 */ 2482 static int 2483 register_mforward(mblk_t *mp, ip_recv_attr_t *ira) 2484 { 2485 ire_t *ire; 2486 ipha_t *ipha = (ipha_t *)mp->b_rptr; 2487 ill_t *ill = ira->ira_ill; 2488 ip_stack_t *ipst = ill->ill_ipst; 2489 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2490 2491 ASSERT(ipst->ips_reg_vif_num <= ipst->ips_numvifs); 2492 2493 if (ipst->ips_ip_mrtdebug > 3) { 2494 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2495 "register_mforward: src %x, dst %x\n", 2496 ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst)); 2497 } 2498 /* 2499 * Need to pass in to ip_mforward() the information that the 2500 * packet has arrived on the register_vif. We mark it with 2501 * the IRAF_PIM_REGISTER attribute. 2502 * pim_input verified that the (inner) destination is multicast, 2503 * hence we skip the generic code in ip_input. 2504 */ 2505 ira->ira_flags |= IRAF_PIM_REGISTER; 2506 ++ipst->ips_mrtstat->mrts_pim_regforwards; 2507 2508 if (!CLASSD(ipha->ipha_dst)) { 2509 ire = ire_route_recursive_v4(ipha->ipha_dst, 0, NULL, ALL_ZONES, 2510 ira->ira_tsl, MATCH_IRE_SECATTR, IRR_ALLOCATE, 0, ipst, 2511 NULL, NULL, NULL); 2512 } else { 2513 ire = ire_multicast(ill); 2514 } 2515 ASSERT(ire != NULL); 2516 /* Normally this will return the IRE_MULTICAST */ 2517 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 2518 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2519 ip_drop_input("mrts_pim RTF_REJECT", mp, ill); 2520 freemsg(mp); 2521 ire_refrele(ire); 2522 return (-1); 2523 } 2524 ASSERT(ire->ire_type & IRE_MULTICAST); 2525 (*ire->ire_recvfn)(ire, mp, ipha, ira); 2526 ire_refrele(ire); 2527 2528 return (0); 2529 } 2530 2531 /* 2532 * Send an encapsulated packet. 2533 * Caller assumes can continue to use mp when routine returns. 
2534 */ 2535 /* ARGSUSED */ 2536 static void 2537 encap_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst) 2538 { 2539 mblk_t *mp_copy; 2540 ipha_t *ipha_copy; 2541 size_t len; 2542 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 2543 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2544 2545 if (ipst->ips_ip_mrtdebug > 1) { 2546 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2547 "encap_send: vif %ld enter", 2548 (ptrdiff_t)(vifp - ipst->ips_vifs)); 2549 } 2550 len = ntohs(ipha->ipha_length); 2551 2552 /* 2553 * Copy the old packet & pullup it's IP header into the 2554 * new mbuf so we can modify it. Try to fill the new 2555 * mbuf since if we don't the ethernet driver will. 2556 */ 2557 mp_copy = allocb(32 + sizeof (multicast_encap_iphdr), BPRI_MED); 2558 if (mp_copy == NULL) 2559 return; 2560 mp_copy->b_rptr += 32; 2561 mp_copy->b_wptr = mp_copy->b_rptr + sizeof (multicast_encap_iphdr); 2562 if ((mp_copy->b_cont = copymsg(mp)) == NULL) { 2563 freeb(mp_copy); 2564 return; 2565 } 2566 2567 /* 2568 * Fill in the encapsulating IP header. 2569 * Remote tunnel dst in rmt_addr, from add_vif(). 2570 */ 2571 ipha_copy = (ipha_t *)mp_copy->b_rptr; 2572 *ipha_copy = multicast_encap_iphdr; 2573 ASSERT((len + sizeof (ipha_t)) <= IP_MAXPACKET); 2574 ipha_copy->ipha_length = htons(len + sizeof (ipha_t)); 2575 ipha_copy->ipha_src = vifp->v_lcl_addr.s_addr; 2576 ipha_copy->ipha_dst = vifp->v_rmt_addr.s_addr; 2577 ASSERT(ipha_copy->ipha_ident == 0); 2578 2579 /* Turn the encapsulated IP header back into a valid one. 
*/ 2580 ipha = (ipha_t *)mp_copy->b_cont->b_rptr; 2581 ipha->ipha_ttl--; 2582 ipha->ipha_hdr_checksum = 0; 2583 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 2584 2585 ipha_copy->ipha_ttl = ipha->ipha_ttl; 2586 2587 if (ipst->ips_ip_mrtdebug > 1) { 2588 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2589 "encap_send: group 0x%x", ntohl(ipha->ipha_dst)); 2590 } 2591 if (vifp->v_rate_limit <= 0) 2592 tbf_send_packet(vifp, mp_copy); 2593 else 2594 /* ipha is from the original header */ 2595 tbf_control(vifp, mp_copy, ipha); 2596 } 2597 2598 /* 2599 * De-encapsulate a packet and feed it back through IP input if it 2600 * matches one of our multicast tunnels. 2601 * 2602 * This routine is called whenever IP gets a packet with prototype 2603 * IPPROTO_ENCAP and a local destination address and the packet didn't 2604 * match one of our configured IP-in-IP tunnels. 2605 */ 2606 void 2607 ip_mroute_decap(mblk_t *mp, ip_recv_attr_t *ira) 2608 { 2609 ipha_t *ipha = (ipha_t *)mp->b_rptr; 2610 ipha_t *ipha_encap; 2611 int hlen = IPH_HDR_LENGTH(ipha); 2612 int hlen_encap; 2613 ipaddr_t src; 2614 struct vif *vifp; 2615 ire_t *ire; 2616 ill_t *ill = ira->ira_ill; 2617 ip_stack_t *ipst = ill->ill_ipst; 2618 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2619 2620 /* Make sure we have all of the inner header */ 2621 ipha_encap = (ipha_t *)((char *)ipha + hlen); 2622 if (mp->b_wptr - mp->b_rptr < hlen + IP_SIMPLE_HDR_LENGTH) { 2623 ipha = ip_pullup(mp, hlen + IP_SIMPLE_HDR_LENGTH, ira); 2624 if (ipha == NULL) { 2625 ipst->ips_mrtstat->mrts_bad_tunnel++; 2626 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2627 ip_drop_input("ip_mroute_decap: too short", mp, ill); 2628 freemsg(mp); 2629 return; 2630 } 2631 ipha_encap = (ipha_t *)((char *)ipha + hlen); 2632 } 2633 hlen_encap = IPH_HDR_LENGTH(ipha_encap); 2634 if (mp->b_wptr - mp->b_rptr < hlen + hlen_encap) { 2635 ipha = ip_pullup(mp, hlen + hlen_encap, ira); 2636 if (ipha == NULL) { 2637 ipst->ips_mrtstat->mrts_bad_tunnel++; 2638 
BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2639 ip_drop_input("ip_mroute_decap: too short", mp, ill); 2640 freemsg(mp); 2641 return; 2642 } 2643 ipha_encap = (ipha_t *)((char *)ipha + hlen); 2644 } 2645 2646 /* 2647 * Dump the packet if it's not to a multicast destination or if 2648 * we don't have an encapsulating tunnel with the source. 2649 * Note: This code assumes that the remote site IP address 2650 * uniquely identifies the tunnel (i.e., that this site has 2651 * at most one tunnel with the remote site). 2652 */ 2653 if (!CLASSD(ipha_encap->ipha_dst)) { 2654 ipst->ips_mrtstat->mrts_bad_tunnel++; 2655 ip1dbg(("ip_mroute_decap: bad tunnel\n")); 2656 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2657 ip_drop_input("mrts_bad_tunnel", mp, ill); 2658 freemsg(mp); 2659 return; 2660 } 2661 src = (ipaddr_t)ipha->ipha_src; 2662 mutex_enter(&ipst->ips_last_encap_lock); 2663 if (src != ipst->ips_last_encap_src) { 2664 struct vif *vife; 2665 2666 vifp = ipst->ips_vifs; 2667 vife = vifp + ipst->ips_numvifs; 2668 ipst->ips_last_encap_src = src; 2669 ipst->ips_last_encap_vif = 0; 2670 for (; vifp < vife; ++vifp) { 2671 if (!lock_good_vif(vifp)) 2672 continue; 2673 if (vifp->v_rmt_addr.s_addr == src) { 2674 if (vifp->v_flags & VIFF_TUNNEL) 2675 ipst->ips_last_encap_vif = vifp; 2676 if (ipst->ips_ip_mrtdebug > 1) { 2677 (void) mi_strlog(mrouter->conn_rq, 2678 1, SL_TRACE, 2679 "ip_mroute_decap: good tun " 2680 "vif %ld with %x", 2681 (ptrdiff_t)(vifp - ipst->ips_vifs), 2682 ntohl(src)); 2683 } 2684 unlock_good_vif(vifp); 2685 break; 2686 } 2687 unlock_good_vif(vifp); 2688 } 2689 } 2690 if ((vifp = ipst->ips_last_encap_vif) == 0) { 2691 mutex_exit(&ipst->ips_last_encap_lock); 2692 ipst->ips_mrtstat->mrts_bad_tunnel++; 2693 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2694 ip_drop_input("mrts_bad_tunnel", mp, ill); 2695 freemsg(mp); 2696 ip1dbg(("ip_mroute_decap: vif %ld no tunnel with %x\n", 2697 (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(src))); 2698 return; 2699 } 
2700 mutex_exit(&ipst->ips_last_encap_lock); 2701 2702 /* 2703 * Need to pass in the tunnel source to ip_mforward (so that it can 2704 * verify that the packet arrived over the correct vif.) 2705 */ 2706 ira->ira_flags |= IRAF_MROUTE_TUNNEL_SET; 2707 ira->ira_mroute_tunnel = src; 2708 mp->b_rptr += hlen; 2709 ira->ira_pktlen -= hlen; 2710 ira->ira_ip_hdr_length = hlen_encap; 2711 2712 /* 2713 * We don't redo any of the filtering in ill_input_full_v4 and we 2714 * have checked that all of ipha_encap and any IP options are 2715 * pulled up. Hence we call ire_recv_multicast_v4 directly. 2716 * However, we have to check for RSVP as in ip_input_full_v4 2717 * and if so we pass it to ire_recv_broadcast_v4 for local delivery 2718 * to the rsvpd. 2719 */ 2720 if (ipha_encap->ipha_protocol == IPPROTO_RSVP && 2721 ipst->ips_ipcl_proto_fanout_v4[IPPROTO_RSVP].connf_head != NULL) { 2722 ire = ire_route_recursive_v4(INADDR_BROADCAST, 0, ill, 2723 ALL_ZONES, ira->ira_tsl, MATCH_IRE_ILL|MATCH_IRE_SECATTR, 2724 IRR_ALLOCATE, 0, ipst, NULL, NULL, NULL); 2725 } else { 2726 ire = ire_multicast(ill); 2727 } 2728 ASSERT(ire != NULL); 2729 /* Normally this will return the IRE_MULTICAST or IRE_BROADCAST */ 2730 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 2731 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2732 ip_drop_input("ip_mroute_decap: RTF_REJECT", mp, ill); 2733 freemsg(mp); 2734 ire_refrele(ire); 2735 return; 2736 } 2737 ire->ire_ib_pkt_count++; 2738 ASSERT(ire->ire_type & (IRE_MULTICAST|IRE_BROADCAST)); 2739 (*ire->ire_recvfn)(ire, mp, ipha_encap, ira); 2740 ire_refrele(ire); 2741 } 2742 2743 /* 2744 * Remove all records with v_ipif == ipif. Called when an interface goes away 2745 * (stream closed). Called as writer. 2746 */ 2747 void 2748 reset_mrt_vif_ipif(ipif_t *ipif) 2749 { 2750 vifi_t vifi, tmp_vifi; 2751 vifi_t num_of_vifs; 2752 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 2753 2754 /* Can't check vifi >= 0 since vifi_t is unsigned! 
*/ 2755 2756 mutex_enter(&ipst->ips_numvifs_mutex); 2757 num_of_vifs = ipst->ips_numvifs; 2758 mutex_exit(&ipst->ips_numvifs_mutex); 2759 2760 for (vifi = num_of_vifs; vifi != 0; vifi--) { 2761 tmp_vifi = vifi - 1; 2762 if (ipst->ips_vifs[tmp_vifi].v_ipif == ipif) { 2763 (void) del_vif(&tmp_vifi, ipst); 2764 } 2765 } 2766 } 2767 2768 /* Remove pending upcall msgs when ill goes away. Called by ill_delete. */ 2769 void 2770 reset_mrt_ill(ill_t *ill) 2771 { 2772 struct mfc *rt; 2773 struct rtdetq *rte; 2774 int i; 2775 ip_stack_t *ipst = ill->ill_ipst; 2776 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2777 timeout_id_t id; 2778 2779 for (i = 0; i < MFCTBLSIZ; i++) { 2780 MFCB_REFHOLD(&ipst->ips_mfcs[i]); 2781 if ((rt = ipst->ips_mfcs[i].mfcb_mfc) != NULL) { 2782 if (ipst->ips_ip_mrtdebug > 1) { 2783 (void) mi_strlog(mrouter->conn_rq, 1, 2784 SL_TRACE, 2785 "reset_mrt_ill: mfctable [%d]", i); 2786 } 2787 while (rt != NULL) { 2788 mutex_enter(&rt->mfc_mutex); 2789 while ((rte = rt->mfc_rte) != NULL) { 2790 if (rte->ill == ill && 2791 (id = rt->mfc_timeout_id) != 0) { 2792 /* 2793 * Its ok to drop the lock, the 2794 * struct cannot be freed since 2795 * we have a ref on the hash 2796 * bucket. 2797 */ 2798 mutex_exit(&rt->mfc_mutex); 2799 (void) untimeout(id); 2800 mutex_enter(&rt->mfc_mutex); 2801 } 2802 if (rte->ill == ill) { 2803 if (ipst->ips_ip_mrtdebug > 1) { 2804 (void) mi_strlog( 2805 mrouter->conn_rq, 2806 1, SL_TRACE, 2807 "reset_mrt_ill: " 2808 "ill 0x%p", (void *)ill); 2809 } 2810 rt->mfc_rte = rte->rte_next; 2811 freemsg(rte->mp); 2812 mi_free((char *)rte); 2813 } 2814 } 2815 mutex_exit(&rt->mfc_mutex); 2816 rt = rt->mfc_next; 2817 } 2818 } 2819 MFCB_REFRELE(&ipst->ips_mfcs[i]); 2820 } 2821 } 2822 2823 /* 2824 * Token bucket filter module. 2825 * The ipha is for mcastgrp destination for phyint and encap. 
2826 */ 2827 static void 2828 tbf_control(struct vif *vifp, mblk_t *mp, ipha_t *ipha) 2829 { 2830 size_t p_len = msgdsize(mp); 2831 struct tbf *t = vifp->v_tbf; 2832 timeout_id_t id = 0; 2833 ill_t *ill = vifp->v_ipif->ipif_ill; 2834 ip_stack_t *ipst = ill->ill_ipst; 2835 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2836 2837 /* Drop if packet is too large */ 2838 if (p_len > MAX_BKT_SIZE) { 2839 ipst->ips_mrtstat->mrts_pkt2large++; 2840 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 2841 ip_drop_output("tbf_control - too large", mp, ill); 2842 freemsg(mp); 2843 return; 2844 } 2845 if (ipst->ips_ip_mrtdebug > 1) { 2846 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2847 "tbf_ctrl: SEND vif %ld, qlen %d, ipha_dst 0x%x", 2848 (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_q_len, 2849 ntohl(ipha->ipha_dst)); 2850 } 2851 2852 mutex_enter(&t->tbf_lock); 2853 2854 tbf_update_tokens(vifp); 2855 2856 /* 2857 * If there are enough tokens, 2858 * and the queue is empty, send this packet out. 2859 */ 2860 if (ipst->ips_ip_mrtdebug > 1) { 2861 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2862 "tbf_control: vif %ld, TOKENS %d, pkt len %lu, qlen %d", 2863 (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_n_tok, p_len, 2864 t->tbf_q_len); 2865 } 2866 /* No packets are queued */ 2867 if (t->tbf_q_len == 0) { 2868 /* queue empty, send packet if enough tokens */ 2869 if (p_len <= t->tbf_n_tok) { 2870 t->tbf_n_tok -= p_len; 2871 mutex_exit(&t->tbf_lock); 2872 tbf_send_packet(vifp, mp); 2873 return; 2874 } else { 2875 /* Queue packet and timeout till later */ 2876 tbf_queue(vifp, mp); 2877 ASSERT(vifp->v_timeout_id == 0); 2878 vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp, 2879 TBF_REPROCESS); 2880 } 2881 } else if (t->tbf_q_len < t->tbf_max_q_len) { 2882 /* Finite queue length, so queue pkts and process queue */ 2883 tbf_queue(vifp, mp); 2884 tbf_process_q(vifp); 2885 } else { 2886 /* Check that we have UDP header with IP header */ 2887 size_t hdr_length = IPH_HDR_LENGTH(ipha) + 2888 
sizeof (struct udphdr); 2889 2890 if ((mp->b_wptr - mp->b_rptr) < hdr_length) { 2891 if (!pullupmsg(mp, hdr_length)) { 2892 BUMP_MIB(ill->ill_ip_mib, 2893 ipIfStatsOutDiscards); 2894 ip_drop_output("tbf_control - pullup", mp, ill); 2895 freemsg(mp); 2896 ip1dbg(("tbf_ctl: couldn't pullup udp hdr, " 2897 "vif %ld src 0x%x dst 0x%x\n", 2898 (ptrdiff_t)(vifp - ipst->ips_vifs), 2899 ntohl(ipha->ipha_src), 2900 ntohl(ipha->ipha_dst))); 2901 mutex_exit(&vifp->v_tbf->tbf_lock); 2902 return; 2903 } else 2904 /* Have to reassign ipha after pullupmsg */ 2905 ipha = (ipha_t *)mp->b_rptr; 2906 } 2907 /* 2908 * Queue length too much, 2909 * try to selectively dq, or queue and process 2910 */ 2911 if (!tbf_dq_sel(vifp, ipha)) { 2912 ipst->ips_mrtstat->mrts_q_overflow++; 2913 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 2914 ip_drop_output("mrts_q_overflow", mp, ill); 2915 freemsg(mp); 2916 } else { 2917 tbf_queue(vifp, mp); 2918 tbf_process_q(vifp); 2919 } 2920 } 2921 if (t->tbf_q_len == 0) { 2922 id = vifp->v_timeout_id; 2923 vifp->v_timeout_id = 0; 2924 } 2925 mutex_exit(&vifp->v_tbf->tbf_lock); 2926 if (id != 0) 2927 (void) untimeout(id); 2928 } 2929 2930 /* 2931 * Adds a packet to the tbf queue at the interface. 2932 * The ipha is for mcastgrp destination for phyint and encap. 
2933 */ 2934 static void 2935 tbf_queue(struct vif *vifp, mblk_t *mp) 2936 { 2937 struct tbf *t = vifp->v_tbf; 2938 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 2939 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2940 2941 if (ipst->ips_ip_mrtdebug > 1) { 2942 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2943 "tbf_queue: vif %ld", (ptrdiff_t)(vifp - ipst->ips_vifs)); 2944 } 2945 ASSERT(MUTEX_HELD(&t->tbf_lock)); 2946 2947 if (t->tbf_t == NULL) { 2948 /* Queue was empty */ 2949 t->tbf_q = mp; 2950 } else { 2951 /* Insert at tail */ 2952 t->tbf_t->b_next = mp; 2953 } 2954 /* set new tail pointer */ 2955 t->tbf_t = mp; 2956 2957 mp->b_next = mp->b_prev = NULL; 2958 2959 t->tbf_q_len++; 2960 } 2961 2962 /* 2963 * Process the queue at the vif interface. 2964 * Drops the tbf_lock when sending packets. 2965 * 2966 * NOTE : The caller should quntimeout if the queue length is 0. 2967 */ 2968 static void 2969 tbf_process_q(struct vif *vifp) 2970 { 2971 mblk_t *mp; 2972 struct tbf *t = vifp->v_tbf; 2973 size_t len; 2974 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 2975 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2976 2977 if (ipst->ips_ip_mrtdebug > 1) { 2978 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2979 "tbf_process_q 1: vif %ld qlen = %d", 2980 (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_q_len); 2981 } 2982 2983 /* 2984 * Loop through the queue at the interface and send 2985 * as many packets as possible. 2986 */ 2987 ASSERT(MUTEX_HELD(&t->tbf_lock)); 2988 2989 while (t->tbf_q_len > 0) { 2990 mp = t->tbf_q; 2991 len = (size_t)msgdsize(mp); /* length of ip pkt */ 2992 2993 /* Determine if the packet can be sent */ 2994 if (len <= t->tbf_n_tok) { 2995 /* 2996 * If so, reduce no. of tokens, dequeue the packet, 2997 * send the packet. 
2998 */ 2999 t->tbf_n_tok -= len; 3000 3001 t->tbf_q = mp->b_next; 3002 if (--t->tbf_q_len == 0) { 3003 t->tbf_t = NULL; 3004 } 3005 mp->b_next = NULL; 3006 /* Exit mutex before sending packet, then re-enter */ 3007 mutex_exit(&t->tbf_lock); 3008 tbf_send_packet(vifp, mp); 3009 mutex_enter(&t->tbf_lock); 3010 } else 3011 break; 3012 } 3013 } 3014 3015 /* Called at tbf timeout to update tokens, process q and reset timer. */ 3016 static void 3017 tbf_reprocess_q(void *arg) 3018 { 3019 struct vif *vifp = arg; 3020 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 3021 conn_t *mrouter = ipst->ips_ip_g_mrouter; 3022 3023 mutex_enter(&vifp->v_tbf->tbf_lock); 3024 vifp->v_timeout_id = 0; 3025 tbf_update_tokens(vifp); 3026 3027 tbf_process_q(vifp); 3028 3029 if (vifp->v_tbf->tbf_q_len > 0) { 3030 vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp, 3031 TBF_REPROCESS); 3032 } 3033 mutex_exit(&vifp->v_tbf->tbf_lock); 3034 3035 if (ipst->ips_ip_mrtdebug > 1) { 3036 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 3037 "tbf_reprcess_q: vif %ld timeout id = %p", 3038 (ptrdiff_t)(vifp - ipst->ips_vifs), vifp->v_timeout_id); 3039 } 3040 } 3041 3042 /* 3043 * Function that will selectively discard a member of the tbf queue, 3044 * based on the precedence value and the priority. 3045 * 3046 * NOTE : The caller should quntimeout if the queue length is 0. 
3047 */ 3048 static int 3049 tbf_dq_sel(struct vif *vifp, ipha_t *ipha) 3050 { 3051 uint_t p; 3052 struct tbf *t = vifp->v_tbf; 3053 mblk_t **np; 3054 mblk_t *last, *mp; 3055 ill_t *ill = vifp->v_ipif->ipif_ill; 3056 ip_stack_t *ipst = ill->ill_ipst; 3057 conn_t *mrouter = ipst->ips_ip_g_mrouter; 3058 3059 if (ipst->ips_ip_mrtdebug > 1) { 3060 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 3061 "dq_sel: vif %ld dst 0x%x", 3062 (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(ipha->ipha_dst)); 3063 } 3064 3065 ASSERT(MUTEX_HELD(&t->tbf_lock)); 3066 p = priority(vifp, ipha); 3067 3068 np = &t->tbf_q; 3069 last = NULL; 3070 while ((mp = *np) != NULL) { 3071 if (p > (priority(vifp, (ipha_t *)mp->b_rptr))) { 3072 *np = mp->b_next; 3073 /* If removing the last packet, fix the tail pointer */ 3074 if (mp == t->tbf_t) 3075 t->tbf_t = last; 3076 mp->b_prev = mp->b_next = NULL; 3077 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 3078 ip_drop_output("tbf_dq_send", mp, ill); 3079 freemsg(mp); 3080 /* 3081 * It's impossible for the queue to be empty, but 3082 * we check anyway. 3083 */ 3084 if (--t->tbf_q_len == 0) { 3085 t->tbf_t = NULL; 3086 } 3087 ipst->ips_mrtstat->mrts_drop_sel++; 3088 return (1); 3089 } 3090 np = &mp->b_next; 3091 last = mp; 3092 } 3093 return (0); 3094 } 3095 3096 /* Sends packet, 2 cases - encap tunnel, phyint. 
*/ 3097 static void 3098 tbf_send_packet(struct vif *vifp, mblk_t *mp) 3099 { 3100 ipif_t *ipif = vifp->v_ipif; 3101 ill_t *ill = ipif->ipif_ill; 3102 ip_stack_t *ipst = ill->ill_ipst; 3103 conn_t *mrouter = ipst->ips_ip_g_mrouter; 3104 ipha_t *ipha; 3105 3106 ipha = (ipha_t *)mp->b_rptr; 3107 /* If encap tunnel options */ 3108 if (vifp->v_flags & VIFF_TUNNEL) { 3109 ip_xmit_attr_t ixas; 3110 3111 if (ipst->ips_ip_mrtdebug > 1) { 3112 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 3113 "tbf_send_packet: ENCAP tunnel vif %ld", 3114 (ptrdiff_t)(vifp - ipst->ips_vifs)); 3115 } 3116 bzero(&ixas, sizeof (ixas)); 3117 ixas.ixa_flags = 3118 IXAF_IS_IPV4 | IXAF_NO_TTL_CHANGE | IXAF_VERIFY_SOURCE; 3119 ixas.ixa_ipst = ipst; 3120 ixas.ixa_ifindex = 0; 3121 ixas.ixa_cred = kcred; 3122 ixas.ixa_cpid = NOPID; 3123 ixas.ixa_tsl = NULL; 3124 ixas.ixa_zoneid = GLOBAL_ZONEID; /* Multicast router in GZ */ 3125 ixas.ixa_pktlen = ntohs(ipha->ipha_length); 3126 ixas.ixa_ip_hdr_length = IPH_HDR_LENGTH(ipha); 3127 3128 /* 3129 * Feed into ip_output_simple which will set the ident field 3130 * and checksum the encapsulating header. 3131 * BSD gets the cached route vifp->v_route from ip_output() 3132 * to speed up route table lookups. Not necessary in SunOS 5.x. 3133 * One could make multicast forwarding faster by putting an 3134 * ip_xmit_attr_t in each vif thereby caching the ire/nce. 3135 */ 3136 (void) ip_output_simple(mp, &ixas); 3137 ixa_cleanup(&ixas); 3138 return; 3139 3140 /* phyint */ 3141 } else { 3142 /* Need to loop back to members on the outgoing interface. 
*/ 3143 ipaddr_t dst; 3144 ip_recv_attr_t iras; 3145 nce_t *nce; 3146 3147 bzero(&iras, sizeof (iras)); 3148 iras.ira_flags = IRAF_IS_IPV4; 3149 iras.ira_ill = iras.ira_rill = ill; 3150 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex; 3151 iras.ira_zoneid = GLOBAL_ZONEID; /* Multicast router in GZ */ 3152 iras.ira_pktlen = ntohs(ipha->ipha_length); 3153 iras.ira_ip_hdr_length = IPH_HDR_LENGTH(ipha); 3154 3155 dst = ipha->ipha_dst; 3156 if (ill_hasmembers_v4(ill, dst)) { 3157 iras.ira_flags |= IRAF_LOOPBACK_COPY; 3158 } 3159 if (ipst->ips_ip_mrtdebug > 1) { 3160 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 3161 "tbf_send_pkt: phyint forward vif %ld dst = 0x%x", 3162 (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(dst)); 3163 } 3164 /* 3165 * Find an NCE which matches the nexthop. 3166 * For a pt-pt interface we use the other end of the pt-pt 3167 * link. 3168 */ 3169 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 3170 dst = ipif->ipif_pp_dst_addr; 3171 nce = arp_nce_init(ill, dst, ill->ill_net_type); 3172 } else { 3173 nce = arp_nce_init(ill, dst, IRE_MULTICAST); 3174 } 3175 if (nce == NULL) { 3176 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 3177 ip_drop_output("tbf_send_packet - no nce", mp, ill); 3178 freemsg(mp); 3179 return; 3180 } 3181 3182 /* 3183 * We don't remeber the incoming ill. Thus we 3184 * pretend the packet arrived on the outbound ill. This means 3185 * statistics for input errors will be increased on the wrong 3186 * ill but that isn't a big deal. 3187 */ 3188 ip_forward_xmit_v4(nce, ill, mp, ipha, &iras, ill->ill_mtu, 0); 3189 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); 3190 3191 nce_refrele(nce); 3192 } 3193 } 3194 3195 /* 3196 * Determine the current time and then the elapsed time (between the last time 3197 * and time now). Update the no. of tokens in the bucket. 
3198 */ 3199 static void 3200 tbf_update_tokens(struct vif *vifp) 3201 { 3202 timespec_t tp; 3203 hrtime_t tm; 3204 struct tbf *t = vifp->v_tbf; 3205 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 3206 conn_t *mrouter = ipst->ips_ip_g_mrouter; 3207 3208 ASSERT(MUTEX_HELD(&t->tbf_lock)); 3209 3210 /* Time in secs and nsecs, rate limit in kbits/sec */ 3211 gethrestime(&tp); 3212 3213 /*LINTED*/ 3214 TV_DELTA(tp, t->tbf_last_pkt_t, tm); 3215 3216 /* 3217 * This formula is actually 3218 * "time in seconds" * "bytes/second". Scaled for nsec. 3219 * (tm/1000000000) * (v_rate_limit * 1000 * (1000/1024) /8) 3220 * 3221 * The (1000/1024) was introduced in add_vif to optimize 3222 * this divide into a shift. 3223 */ 3224 t->tbf_n_tok += (tm/1000) * vifp->v_rate_limit / 1024 / 8; 3225 t->tbf_last_pkt_t = tp; 3226 3227 if (t->tbf_n_tok > MAX_BKT_SIZE) 3228 t->tbf_n_tok = MAX_BKT_SIZE; 3229 if (ipst->ips_ip_mrtdebug > 1) { 3230 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 3231 "tbf_update_tok: tm %lld tok %d vif %ld", 3232 tm, t->tbf_n_tok, (ptrdiff_t)(vifp - ipst->ips_vifs)); 3233 } 3234 } 3235 3236 /* 3237 * Priority currently is based on port nos. 3238 * Different forwarding mechanisms have different ways 3239 * of obtaining the port no. Hence, the vif must be 3240 * given along with the packet itself. 
3241 * 3242 */ 3243 static int 3244 priority(struct vif *vifp, ipha_t *ipha) 3245 { 3246 int prio; 3247 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 3248 conn_t *mrouter = ipst->ips_ip_g_mrouter; 3249 3250 /* Temporary hack; may add general packet classifier some day */ 3251 3252 ASSERT(MUTEX_HELD(&vifp->v_tbf->tbf_lock)); 3253 3254 /* 3255 * The UDP port space is divided up into four priority ranges: 3256 * [0, 16384) : unclassified - lowest priority 3257 * [16384, 32768) : audio - highest priority 3258 * [32768, 49152) : whiteboard - medium priority 3259 * [49152, 65536) : video - low priority 3260 */ 3261 3262 if (ipha->ipha_protocol == IPPROTO_UDP) { 3263 struct udphdr *udp = 3264 (struct udphdr *)((char *)ipha + IPH_HDR_LENGTH(ipha)); 3265 switch (ntohs(udp->uh_dport) & 0xc000) { 3266 case 0x4000: 3267 prio = 70; 3268 break; 3269 case 0x8000: 3270 prio = 60; 3271 break; 3272 case 0xc000: 3273 prio = 55; 3274 break; 3275 default: 3276 prio = 50; 3277 break; 3278 } 3279 if (ipst->ips_ip_mrtdebug > 1) { 3280 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 3281 "priority: port %x prio %d\n", 3282 ntohs(udp->uh_dport), prio); 3283 } 3284 } else 3285 prio = 50; /* default priority */ 3286 return (prio); 3287 } 3288 3289 /* 3290 * End of token bucket filter modifications 3291 */ 3292 3293 3294 3295 /* 3296 * Produces data for netstat -M. 3297 */ 3298 int 3299 ip_mroute_stats(mblk_t *mp, ip_stack_t *ipst) 3300 { 3301 ipst->ips_mrtstat->mrts_vifctlSize = sizeof (struct vifctl); 3302 ipst->ips_mrtstat->mrts_mfcctlSize = sizeof (struct mfcctl); 3303 if (!snmp_append_data(mp, (char *)ipst->ips_mrtstat, 3304 sizeof (struct mrtstat))) { 3305 ip0dbg(("ip_mroute_stats: failed %ld bytes\n", 3306 (size_t)sizeof (struct mrtstat))); 3307 return (0); 3308 } 3309 return (1); 3310 } 3311 3312 /* 3313 * Sends info for SNMP's MIB. 
3314 */ 3315 int 3316 ip_mroute_vif(mblk_t *mp, ip_stack_t *ipst) 3317 { 3318 struct vifctl vi; 3319 vifi_t vifi; 3320 3321 mutex_enter(&ipst->ips_numvifs_mutex); 3322 for (vifi = 0; vifi < ipst->ips_numvifs; vifi++) { 3323 if (ipst->ips_vifs[vifi].v_lcl_addr.s_addr == 0) 3324 continue; 3325 /* 3326 * No locks here, an approximation is fine. 3327 */ 3328 vi.vifc_vifi = vifi; 3329 vi.vifc_flags = ipst->ips_vifs[vifi].v_flags; 3330 vi.vifc_threshold = ipst->ips_vifs[vifi].v_threshold; 3331 vi.vifc_rate_limit = ipst->ips_vifs[vifi].v_rate_limit; 3332 vi.vifc_lcl_addr = ipst->ips_vifs[vifi].v_lcl_addr; 3333 vi.vifc_rmt_addr = ipst->ips_vifs[vifi].v_rmt_addr; 3334 vi.vifc_pkt_in = ipst->ips_vifs[vifi].v_pkt_in; 3335 vi.vifc_pkt_out = ipst->ips_vifs[vifi].v_pkt_out; 3336 3337 if (!snmp_append_data(mp, (char *)&vi, sizeof (vi))) { 3338 ip0dbg(("ip_mroute_vif: failed %ld bytes\n", 3339 (size_t)sizeof (vi))); 3340 mutex_exit(&ipst->ips_numvifs_mutex); 3341 return (0); 3342 } 3343 } 3344 mutex_exit(&ipst->ips_numvifs_mutex); 3345 return (1); 3346 } 3347 3348 /* 3349 * Called by ip_snmp_get to send up multicast routing table. 3350 */ 3351 int 3352 ip_mroute_mrt(mblk_t *mp, ip_stack_t *ipst) 3353 { 3354 int i, j; 3355 struct mfc *rt; 3356 struct mfcctl mfcc; 3357 3358 /* 3359 * Make sure multicast has not been turned off. 
3360 */ 3361 if (is_mrouter_off(ipst)) 3362 return (1); 3363 3364 /* Loop over all hash buckets and their chains */ 3365 for (i = 0; i < MFCTBLSIZ; i++) { 3366 MFCB_REFHOLD(&ipst->ips_mfcs[i]); 3367 for (rt = ipst->ips_mfcs[i].mfcb_mfc; rt; rt = rt->mfc_next) { 3368 mutex_enter(&rt->mfc_mutex); 3369 if (rt->mfc_rte != NULL || 3370 (rt->mfc_marks & MFCB_MARK_CONDEMNED)) { 3371 mutex_exit(&rt->mfc_mutex); 3372 continue; 3373 } 3374 mfcc.mfcc_origin = rt->mfc_origin; 3375 mfcc.mfcc_mcastgrp = rt->mfc_mcastgrp; 3376 mfcc.mfcc_parent = rt->mfc_parent; 3377 mfcc.mfcc_pkt_cnt = rt->mfc_pkt_cnt; 3378 mutex_enter(&ipst->ips_numvifs_mutex); 3379 for (j = 0; j < (int)ipst->ips_numvifs; j++) 3380 mfcc.mfcc_ttls[j] = rt->mfc_ttls[j]; 3381 for (j = (int)ipst->ips_numvifs; j < MAXVIFS; j++) 3382 mfcc.mfcc_ttls[j] = 0; 3383 mutex_exit(&ipst->ips_numvifs_mutex); 3384 3385 mutex_exit(&rt->mfc_mutex); 3386 if (!snmp_append_data(mp, (char *)&mfcc, 3387 sizeof (mfcc))) { 3388 MFCB_REFRELE(&ipst->ips_mfcs[i]); 3389 ip0dbg(("ip_mroute_mrt: failed %ld bytes\n", 3390 (size_t)sizeof (mfcc))); 3391 return (0); 3392 } 3393 } 3394 MFCB_REFRELE(&ipst->ips_mfcs[i]); 3395 } 3396 return (1); 3397 } 3398