/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
 */
/* Copyright (c) 1990 Mentat Inc. */

/*
 * Copyright (c) 2018, Joyent, Inc.
 */

/*
 * Procedures for the kernel part of DVMRP,
 * a Distance-Vector Multicast Routing Protocol.
 * (See RFC-1075)
 * Written by David Waitzman, BBN Labs, August 1988.
 * Modified by Steve Deering, Stanford, February 1989.
 * Modified by Mark J. Steiglitz, Stanford, May, 1991
 * Modified by Van Jacobson, LBL, January 1993
 * Modified by Ajit Thyagarajan, PARC, August 1993
 * Modified by Bill Fenner, PARC, April 1995
 *
 * MROUTING 3.5
 */

/*
 * TODO
 * - function pointer field in vif, void *vif_sendit()
 */

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/stropts.h>
#include <sys/strlog.h>
#include <sys/systm.h>
#include <sys/ddi.h>
#include <sys/cmn_err.h>
#include <sys/zone.h>

#include <sys/param.h>
#include <sys/socket.h>
#include <sys/vtrace.h>
#include <sys/debug.h>
#include <net/if.h>
#include <sys/sockio.h>
#include <netinet/in.h>
#include <net/if_dl.h>

#include <inet/ipsec_impl.h>
#include <inet/common.h>
#include <inet/mi.h>
#include <inet/nd.h>
#include <inet/tunables.h>
#include <inet/mib2.h>
#include <netinet/ip6.h>
#include <inet/ip.h>
#include <inet/snmpcom.h>

#include <netinet/igmp.h>
#include <netinet/igmp_var.h>
#include <netinet/udp.h>
#include <netinet/ip_mroute.h>
#include <inet/ip_multi.h>
#include <inet/ip_ire.h>
#include <inet/ip_ndp.h>
#include <inet/ip_if.h>
#include <inet/ipclassifier.h>

#include <netinet/pim.h>


/*
 * MT Design:
 *
 * There are three main data structures, viftable, mfctable and tbftable,
 * that need to be protected against MT races.
 *
 * viftable is a fixed length array of vif structs. There is no lock to protect
 * the whole array, instead each struct is protected by its own individual lock.
 * The value of v_marks in conjunction with the value of v_refcnt determines the
 * current state of a vif structure. One special state that needs mention
 * is when the vif is marked VIF_MARK_NOTINUSE but refcnt != 0. This indicates
 * that the vif is being initialized.
 * Each structure is freed when the refcnt goes down to zero. If a delete comes
 * in when the refcnt is > 1, the vif structure is marked VIF_MARK_CONDEMNED
 * which prevents the struct from further use.  When the refcnt goes to zero
 * the struct is freed and is marked VIF_MARK_NOTINUSE.
 * The vif struct stores a pointer to the ipif in v_ipif; to prevent the
 * ipif/ill from going away a refhold is put on the ipif before using it. See
 * lock_good_vif() and unlock_good_vif().
 *
 * VIF_REFHOLD and VIF_REFRELE macros have been provided to manipulate refcnts
 * of the vif struct.
 *
 * tbftable is also a fixed length array of tbf structs and is only accessed
 * via v_tbf.  It is protected by its own lock tbf_lock.
 *
 * Lock Ordering is
 * v_lock --> tbf_lock
 * v_lock --> ill_lock
 *
 * mfctable is a fixed size hash table of mfc bucket structs (struct mfcb).
 * Each mfc bucket struct (struct mfcb) maintains a refcnt for each walker,
 * it also maintains a state. These fields are protected by a lock (mfcb_lock).
 * mfc structs only maintain a state and have no refcnt. mfc_mutex is used to
 * protect the struct elements.
 *
 * mfc structs are dynamically allocated and are singly linked
 * at the head of the chain. When an mfc structure is to be deleted
 * it is marked condemned and so is the state in the bucket struct.
 * When the last walker of the hash bucket exits all the mfc structs
 * marked condemned are freed.
 *
 * Locking Hierarchy:
 * The bucket lock should be acquired before the mfc struct lock.
 * MFCB_REFHOLD and MFCB_REFRELE macros are provided for locking
 * operations on the bucket struct.
 *
 * last_encap_lock and numvifs_mutex should be acquired after
 * acquiring vif or mfc locks. These locks protect some global variables.
 *
 * The statistics are not currently protected by a lock
 * causing the stats to be approximate, not exact.
 */

/* Sentinel parent-vif index meaning "no vif"; equal to MAXVIFS. */
#define	NO_VIF	MAXVIFS		/* from mrouted, no route for src */

/*
 * Timeouts:
 *	Upcall timeouts - BSD uses boolean_t mfc->expire and
 *	nexpire[MFCTBLSIZE], the number of times expire has been called.
 *	SunOS 5.x uses mfc->timeout for each mfc.
 *	Some Unixes are limited in the number of simultaneous timeouts
 *	that can be run, SunOS 5.x does not have this restriction.
 */

/*
 * In BSD, EXPIRE_TIMEOUT is how often expire_upcalls() is called and
 * UPCALL_EXPIRE is the number of timeouts before a particular upcall
 * expires. Thus the time till expiration is EXPIRE_TIMEOUT * UPCALL_EXPIRE
 */
#define		EXPIRE_TIMEOUT	(hz/4)	/* 4x / second	*/
#define		UPCALL_EXPIRE	6	/* number of timeouts	*/

/*
 * Hash function for a source, group entry.
 * XORs the source (a) and group (g) with copies of themselves shifted right
 * by 10 and 20 bits, then reduces the result with MFCHASHMOD.
 */
#define	MFCHASH(a, g) MFCHASHMOD(((a) >> 20) ^ ((a) >> 10) ^ (a) ^ \
	((g) >> 20) ^ ((g) >> 10) ^ (g))

#define	TBF_REPROCESS	(hz / 100)	/* 100x /second	*/

/* Identify PIM packet that came on a Register interface */
#define	PIM_REGISTER_MARKER	0xffffffff

/* Function declarations */
static int	add_mfc(struct mfcctl *, ip_stack_t *);
static int	add_vif(struct vifctl *, conn_t *, ip_stack_t *);
static int	del_mfc(struct mfcctl *, ip_stack_t *);
static int	del_vif(vifi_t *, ip_stack_t *);
static void	del_vifp(struct vif *);
static void	encap_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
static void	expire_upcalls(void *);
static void	fill_route(struct mfc *, struct mfcctl *, ip_stack_t *);
static void	free_queue(struct mfc *);
static int	get_assert(uchar_t *, ip_stack_t *);
static int	get_lsg_cnt(struct sioc_lsg_req *, ip_stack_t *);
static int	get_sg_cnt(struct sioc_sg_req *, ip_stack_t *);
static int	get_version(uchar_t *);
static int	get_vif_cnt(struct sioc_vif_req *, ip_stack_t *);
static int	ip_mdq(mblk_t *, ipha_t *, ill_t *,
		    ipaddr_t, struct mfc *);
static int	ip_mrouter_init(conn_t *, uchar_t *, int, ip_stack_t *);
static void	phyint_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
static int	register_mforward(mblk_t *, ip_recv_attr_t *);
static void	register_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
static int	set_assert(int *, ip_stack_t *);

/*
 * Token Bucket Filter functions
 */
static int  priority(struct vif *, ipha_t *);
static void tbf_control(struct vif *, mblk_t *, ipha_t *);
static int  tbf_dq_sel(struct vif *, ipha_t *);
static void tbf_process_q(struct vif *);
static void tbf_queue(struct vif *, mblk_t *);
static void tbf_reprocess_q(void *);
static void tbf_send_packet(struct vif *, mblk_t *);
static void tbf_update_tokens(struct vif *);
static void release_mfc(struct mfcb *);

static boolean_t is_mrouter_off(ip_stack_t *);
/*
 * Encapsulation packets
 */

/* TTL placed in the prototype encapsulation header below. */
#define	ENCAP_TTL	64

/*
 * Prototype IP hdr for encapsulated packets.
 * Initializers are positional; fields after the checksum are left to
 * default (zero) initialization.
 */
static ipha_t multicast_encap_iphdr = {
	IP_SIMPLE_HDR_VERSION,
	0,				/* tos */
	sizeof (ipha_t),		/* total length */
	0,				/* id */
	0,				/* frag offset */
	ENCAP_TTL, IPPROTO_ENCAP,
	0,				/* checksum */
};

/*
 * Rate limit for assert notification messages, in nsec.
230 */ 231 #define ASSERT_MSG_TIME 3000000000 232 233 234 #define VIF_REFHOLD(vifp) { \ 235 mutex_enter(&(vifp)->v_lock); \ 236 (vifp)->v_refcnt++; \ 237 mutex_exit(&(vifp)->v_lock); \ 238 } 239 240 #define VIF_REFRELE_LOCKED(vifp) { \ 241 (vifp)->v_refcnt--; \ 242 if ((vifp)->v_refcnt == 0 && \ 243 ((vifp)->v_marks & VIF_MARK_CONDEMNED)) { \ 244 del_vifp(vifp); \ 245 } else { \ 246 mutex_exit(&(vifp)->v_lock); \ 247 } \ 248 } 249 250 #define VIF_REFRELE(vifp) { \ 251 mutex_enter(&(vifp)->v_lock); \ 252 (vifp)->v_refcnt--; \ 253 if ((vifp)->v_refcnt == 0 && \ 254 ((vifp)->v_marks & VIF_MARK_CONDEMNED)) { \ 255 del_vifp(vifp); \ 256 } else { \ 257 mutex_exit(&(vifp)->v_lock); \ 258 } \ 259 } 260 261 #define MFCB_REFHOLD(mfcb) { \ 262 mutex_enter(&(mfcb)->mfcb_lock); \ 263 (mfcb)->mfcb_refcnt++; \ 264 ASSERT((mfcb)->mfcb_refcnt != 0); \ 265 mutex_exit(&(mfcb)->mfcb_lock); \ 266 } 267 268 #define MFCB_REFRELE(mfcb) { \ 269 mutex_enter(&(mfcb)->mfcb_lock); \ 270 ASSERT((mfcb)->mfcb_refcnt != 0); \ 271 if (--(mfcb)->mfcb_refcnt == 0 && \ 272 ((mfcb)->mfcb_marks & MFCB_MARK_CONDEMNED)) { \ 273 release_mfc(mfcb); \ 274 } \ 275 mutex_exit(&(mfcb)->mfcb_lock); \ 276 } 277 278 /* 279 * MFCFIND: 280 * Find a route for a given origin IP address and multicast group address. 281 * Skip entries with pending upcalls. 282 * Type of service parameter to be added in the future! 283 */ 284 #define MFCFIND(mfcbp, o, g, rt) { \ 285 struct mfc *_mb_rt = NULL; \ 286 rt = NULL; \ 287 _mb_rt = mfcbp->mfcb_mfc; \ 288 while (_mb_rt) { \ 289 if ((_mb_rt->mfc_origin.s_addr == o) && \ 290 (_mb_rt->mfc_mcastgrp.s_addr == g) && \ 291 (_mb_rt->mfc_rte == NULL) && \ 292 (!(_mb_rt->mfc_marks & MFCB_MARK_CONDEMNED))) { \ 293 rt = _mb_rt; \ 294 break; \ 295 } \ 296 _mb_rt = _mb_rt->mfc_next; \ 297 } \ 298 } 299 300 /* 301 * BSD uses timeval with sec and usec. In SunOS 5.x uniqtime() and gethrtime() 302 * are inefficient. 
We use gethrestime() which returns a timespec_t with 303 * sec and nsec, the resolution is machine dependent. 304 * The following 2 macros have been changed to use nsec instead of usec. 305 */ 306 /* 307 * Macros to compute elapsed time efficiently. 308 * Borrowed from Van Jacobson's scheduling code. 309 * Delta should be a hrtime_t. 310 */ 311 #define TV_DELTA(a, b, delta) { \ 312 int xxs; \ 313 \ 314 delta = (a).tv_nsec - (b).tv_nsec; \ 315 if ((xxs = (a).tv_sec - (b).tv_sec) != 0) { \ 316 switch (xxs) { \ 317 case 2: \ 318 delta += 1000000000; \ 319 /*FALLTHROUGH*/ \ 320 case 1: \ 321 delta += 1000000000; \ 322 break; \ 323 default: \ 324 delta += (1000000000 * xxs); \ 325 } \ 326 } \ 327 } 328 329 #define TV_LT(a, b) (((a).tv_nsec < (b).tv_nsec && \ 330 (a).tv_sec <= (b).tv_sec) || (a).tv_sec < (b).tv_sec) 331 332 /* 333 * Handle MRT setsockopt commands to modify the multicast routing tables. 334 */ 335 int 336 ip_mrouter_set(int cmd, conn_t *connp, int checkonly, uchar_t *data, 337 int datalen) 338 { 339 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 340 341 mutex_enter(&ipst->ips_ip_g_mrouter_mutex); 342 if (cmd != MRT_INIT && connp != ipst->ips_ip_g_mrouter) { 343 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 344 return (EACCES); 345 } 346 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 347 348 if (checkonly) { 349 /* 350 * do not do operation, just pretend to - new T_CHECK 351 * Note: Even routines further on can probably fail but 352 * this T_CHECK stuff is only to please XTI so it not 353 * necessary to be perfect. 354 */ 355 switch (cmd) { 356 case MRT_INIT: 357 case MRT_DONE: 358 case MRT_ADD_VIF: 359 case MRT_DEL_VIF: 360 case MRT_ADD_MFC: 361 case MRT_DEL_MFC: 362 case MRT_ASSERT: 363 return (0); 364 default: 365 return (EOPNOTSUPP); 366 } 367 } 368 369 /* 370 * make sure no command is issued after multicast routing has been 371 * turned off. 
372 */ 373 if (cmd != MRT_INIT && cmd != MRT_DONE) { 374 if (is_mrouter_off(ipst)) 375 return (EINVAL); 376 } 377 378 switch (cmd) { 379 case MRT_INIT: return (ip_mrouter_init(connp, data, datalen, ipst)); 380 case MRT_DONE: return (ip_mrouter_done(ipst)); 381 case MRT_ADD_VIF: return (add_vif((struct vifctl *)data, connp, ipst)); 382 case MRT_DEL_VIF: return (del_vif((vifi_t *)data, ipst)); 383 case MRT_ADD_MFC: return (add_mfc((struct mfcctl *)data, ipst)); 384 case MRT_DEL_MFC: return (del_mfc((struct mfcctl *)data, ipst)); 385 case MRT_ASSERT: return (set_assert((int *)data, ipst)); 386 default: return (EOPNOTSUPP); 387 } 388 } 389 390 /* 391 * Handle MRT getsockopt commands 392 */ 393 int 394 ip_mrouter_get(int cmd, conn_t *connp, uchar_t *data) 395 { 396 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 397 398 if (connp != ipst->ips_ip_g_mrouter) 399 return (EACCES); 400 401 switch (cmd) { 402 case MRT_VERSION: return (get_version((uchar_t *)data)); 403 case MRT_ASSERT: return (get_assert((uchar_t *)data, ipst)); 404 default: return (EOPNOTSUPP); 405 } 406 } 407 408 /* 409 * Handle ioctl commands to obtain information from the cache. 410 * Called with shared access to IP. These are read_only ioctls. 
411 */ 412 /* ARGSUSED */ 413 int 414 mrt_ioctl(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 415 ip_ioctl_cmd_t *ipip, void *if_req) 416 { 417 mblk_t *mp1; 418 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 419 conn_t *connp = Q_TO_CONN(q); 420 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 421 422 /* Existence verified in ip_wput_nondata */ 423 mp1 = mp->b_cont->b_cont; 424 425 switch (iocp->ioc_cmd) { 426 case (SIOCGETVIFCNT): 427 return (get_vif_cnt((struct sioc_vif_req *)mp1->b_rptr, ipst)); 428 case (SIOCGETSGCNT): 429 return (get_sg_cnt((struct sioc_sg_req *)mp1->b_rptr, ipst)); 430 case (SIOCGETLSGCNT): 431 return (get_lsg_cnt((struct sioc_lsg_req *)mp1->b_rptr, ipst)); 432 default: 433 return (EINVAL); 434 } 435 } 436 437 /* 438 * Returns the packet, byte, rpf-failure count for the source, group provided. 439 */ 440 static int 441 get_sg_cnt(struct sioc_sg_req *req, ip_stack_t *ipst) 442 { 443 struct mfc *rt; 444 struct mfcb *mfcbp; 445 446 mfcbp = &ipst->ips_mfcs[MFCHASH(req->src.s_addr, req->grp.s_addr)]; 447 MFCB_REFHOLD(mfcbp); 448 MFCFIND(mfcbp, req->src.s_addr, req->grp.s_addr, rt); 449 450 if (rt != NULL) { 451 mutex_enter(&rt->mfc_mutex); 452 req->pktcnt = rt->mfc_pkt_cnt; 453 req->bytecnt = rt->mfc_byte_cnt; 454 req->wrong_if = rt->mfc_wrong_if; 455 mutex_exit(&rt->mfc_mutex); 456 } else 457 req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffffU; 458 459 MFCB_REFRELE(mfcbp); 460 return (0); 461 } 462 463 /* 464 * Returns the packet, byte, rpf-failure count for the source, group provided. 465 * Uses larger counters and IPv6 addresses. 466 */ 467 /* ARGSUSED XXX until implemented */ 468 static int 469 get_lsg_cnt(struct sioc_lsg_req *req, ip_stack_t *ipst) 470 { 471 /* XXX TODO SIOCGETLSGCNT */ 472 return (ENXIO); 473 } 474 475 /* 476 * Returns the input and output packet and byte counts on the vif provided. 
477 */ 478 static int 479 get_vif_cnt(struct sioc_vif_req *req, ip_stack_t *ipst) 480 { 481 vifi_t vifi = req->vifi; 482 483 if (vifi >= ipst->ips_numvifs) 484 return (EINVAL); 485 486 /* 487 * No locks here, an approximation is fine. 488 */ 489 req->icount = ipst->ips_vifs[vifi].v_pkt_in; 490 req->ocount = ipst->ips_vifs[vifi].v_pkt_out; 491 req->ibytes = ipst->ips_vifs[vifi].v_bytes_in; 492 req->obytes = ipst->ips_vifs[vifi].v_bytes_out; 493 494 return (0); 495 } 496 497 static int 498 get_version(uchar_t *data) 499 { 500 int *v = (int *)data; 501 502 *v = 0x0305; /* XXX !!!! */ 503 504 return (0); 505 } 506 507 /* 508 * Set PIM assert processing global. 509 */ 510 static int 511 set_assert(int *i, ip_stack_t *ipst) 512 { 513 if ((*i != 1) && (*i != 0)) 514 return (EINVAL); 515 516 ipst->ips_pim_assert = *i; 517 518 return (0); 519 } 520 521 /* 522 * Get PIM assert processing global. 523 */ 524 static int 525 get_assert(uchar_t *data, ip_stack_t *ipst) 526 { 527 int *i = (int *)data; 528 529 *i = ipst->ips_pim_assert; 530 531 return (0); 532 } 533 534 /* 535 * Enable multicast routing. 536 */ 537 static int 538 ip_mrouter_init(conn_t *connp, uchar_t *data, int datalen, ip_stack_t *ipst) 539 { 540 int *v; 541 542 if (data == NULL || (datalen != sizeof (int))) 543 return (ENOPROTOOPT); 544 545 v = (int *)data; 546 if (*v != 1) 547 return (ENOPROTOOPT); 548 549 mutex_enter(&ipst->ips_ip_g_mrouter_mutex); 550 if (ipst->ips_ip_g_mrouter != NULL) { 551 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 552 return (EADDRINUSE); 553 } 554 555 /* 556 * MRT_INIT should only be allowed for RAW sockets, but we double 557 * check. 
558 */ 559 if (!IPCL_IS_RAWIP(connp)) { 560 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 561 return (EINVAL); 562 } 563 564 ipst->ips_ip_g_mrouter = connp; 565 connp->conn_multi_router = 1; 566 /* In order for tunnels to work we have to turn ip_g_forward on */ 567 if (!WE_ARE_FORWARDING(ipst)) { 568 if (ipst->ips_ip_mrtdebug > 1) { 569 (void) mi_strlog(connp->conn_rq, 1, SL_TRACE, 570 "ip_mrouter_init: turning on forwarding"); 571 } 572 ipst->ips_saved_ip_forwarding = ipst->ips_ip_forwarding; 573 ipst->ips_ip_forwarding = IP_FORWARD_ALWAYS; 574 } 575 576 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 577 return (0); 578 } 579 580 void 581 ip_mrouter_stack_init(ip_stack_t *ipst) 582 { 583 mutex_init(&ipst->ips_ip_g_mrouter_mutex, NULL, MUTEX_DEFAULT, NULL); 584 585 ipst->ips_vifs = kmem_zalloc(sizeof (struct vif) * (MAXVIFS+1), 586 KM_SLEEP); 587 ipst->ips_mrtstat = kmem_zalloc(sizeof (struct mrtstat), KM_SLEEP); 588 /* 589 * mfctable: 590 * Includes all mfcs, including waiting upcalls. 591 * Multiple mfcs per bucket. 592 */ 593 ipst->ips_mfcs = kmem_zalloc(sizeof (struct mfcb) * MFCTBLSIZ, 594 KM_SLEEP); 595 /* 596 * Define the token bucket filter structures. 597 * tbftable -> each vif has one of these for storing info. 598 */ 599 ipst->ips_tbfs = kmem_zalloc(sizeof (struct tbf) * MAXVIFS, KM_SLEEP); 600 601 mutex_init(&ipst->ips_last_encap_lock, NULL, MUTEX_DEFAULT, NULL); 602 603 ipst->ips_mrtstat->mrts_vifctlSize = sizeof (struct vifctl); 604 ipst->ips_mrtstat->mrts_mfcctlSize = sizeof (struct mfcctl); 605 } 606 607 /* 608 * Disable multicast routing. 609 * Didn't use global timeout_val (BSD version), instead check the mfctable. 
610 */ 611 int 612 ip_mrouter_done(ip_stack_t *ipst) 613 { 614 conn_t *mrouter; 615 vifi_t vifi; 616 struct mfc *mfc_rt; 617 int i; 618 619 mutex_enter(&ipst->ips_ip_g_mrouter_mutex); 620 if (ipst->ips_ip_g_mrouter == NULL) { 621 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 622 return (EINVAL); 623 } 624 625 mrouter = ipst->ips_ip_g_mrouter; 626 627 if (ipst->ips_saved_ip_forwarding != -1) { 628 if (ipst->ips_ip_mrtdebug > 1) { 629 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 630 "ip_mrouter_done: turning off forwarding"); 631 } 632 ipst->ips_ip_forwarding = ipst->ips_saved_ip_forwarding; 633 ipst->ips_saved_ip_forwarding = -1; 634 } 635 636 /* 637 * Always clear cache when vifs change. 638 * No need to get ipst->ips_last_encap_lock since we are running as 639 * a writer. 640 */ 641 mutex_enter(&ipst->ips_last_encap_lock); 642 ipst->ips_last_encap_src = 0; 643 ipst->ips_last_encap_vif = NULL; 644 mutex_exit(&ipst->ips_last_encap_lock); 645 mrouter->conn_multi_router = 0; 646 647 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 648 649 /* 650 * For each phyint in use, 651 * disable promiscuous reception of all IP multicasts. 652 */ 653 for (vifi = 0; vifi < MAXVIFS; vifi++) { 654 struct vif *vifp = ipst->ips_vifs + vifi; 655 656 mutex_enter(&vifp->v_lock); 657 /* 658 * if the vif is active mark it condemned. 
659 */ 660 if (vifp->v_marks & VIF_MARK_GOOD) { 661 ASSERT(vifp->v_ipif != NULL); 662 ipif_refhold(vifp->v_ipif); 663 /* Phyint only */ 664 if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) { 665 ipif_t *ipif = vifp->v_ipif; 666 ilm_t *ilm = vifp->v_ilm; 667 668 vifp->v_ilm = NULL; 669 vifp->v_marks &= ~VIF_MARK_GOOD; 670 vifp->v_marks |= VIF_MARK_CONDEMNED; 671 672 mutex_exit(&(vifp)->v_lock); 673 if (ilm != NULL) { 674 ill_t *ill = ipif->ipif_ill; 675 676 (void) ip_delmulti(ilm); 677 ASSERT(ill->ill_mrouter_cnt > 0); 678 atomic_dec_32(&ill->ill_mrouter_cnt); 679 } 680 mutex_enter(&vifp->v_lock); 681 } 682 ipif_refrele(vifp->v_ipif); 683 /* 684 * decreases the refcnt added in add_vif. 685 * and release v_lock. 686 */ 687 VIF_REFRELE_LOCKED(vifp); 688 } else { 689 mutex_exit(&vifp->v_lock); 690 continue; 691 } 692 } 693 694 mutex_enter(&ipst->ips_numvifs_mutex); 695 ipst->ips_numvifs = 0; 696 ipst->ips_pim_assert = 0; 697 ipst->ips_reg_vif_num = ALL_VIFS; 698 mutex_exit(&ipst->ips_numvifs_mutex); 699 700 /* 701 * Free upcall msgs. 702 * Go through mfctable and stop any outstanding upcall 703 * timeouts remaining on mfcs. 704 */ 705 for (i = 0; i < MFCTBLSIZ; i++) { 706 mutex_enter(&ipst->ips_mfcs[i].mfcb_lock); 707 ipst->ips_mfcs[i].mfcb_refcnt++; 708 ipst->ips_mfcs[i].mfcb_marks |= MFCB_MARK_CONDEMNED; 709 mutex_exit(&ipst->ips_mfcs[i].mfcb_lock); 710 mfc_rt = ipst->ips_mfcs[i].mfcb_mfc; 711 while (mfc_rt) { 712 /* Free upcalls */ 713 mutex_enter(&mfc_rt->mfc_mutex); 714 if (mfc_rt->mfc_rte != NULL) { 715 if (mfc_rt->mfc_timeout_id != 0) { 716 /* 717 * OK to drop the lock as we have 718 * a refcnt on the bucket. timeout 719 * can fire but it will see that 720 * mfc_timeout_id == 0 and not do 721 * anything. see expire_upcalls(). 
722 */ 723 mfc_rt->mfc_timeout_id = 0; 724 mutex_exit(&mfc_rt->mfc_mutex); 725 (void) untimeout( 726 mfc_rt->mfc_timeout_id); 727 mfc_rt->mfc_timeout_id = 0; 728 mutex_enter(&mfc_rt->mfc_mutex); 729 730 /* 731 * all queued upcall packets 732 * and mblk will be freed in 733 * release_mfc(). 734 */ 735 } 736 } 737 738 mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED; 739 740 mutex_exit(&mfc_rt->mfc_mutex); 741 mfc_rt = mfc_rt->mfc_next; 742 } 743 MFCB_REFRELE(&ipst->ips_mfcs[i]); 744 } 745 746 mutex_enter(&ipst->ips_ip_g_mrouter_mutex); 747 ipst->ips_ip_g_mrouter = NULL; 748 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 749 return (0); 750 } 751 752 void 753 ip_mrouter_stack_destroy(ip_stack_t *ipst) 754 { 755 struct mfcb *mfcbp; 756 struct mfc *rt; 757 int i; 758 759 for (i = 0; i < MFCTBLSIZ; i++) { 760 mfcbp = &ipst->ips_mfcs[i]; 761 762 while ((rt = mfcbp->mfcb_mfc) != NULL) { 763 (void) printf("ip_mrouter_stack_destroy: free for %d\n", 764 i); 765 766 mfcbp->mfcb_mfc = rt->mfc_next; 767 free_queue(rt); 768 mi_free(rt); 769 } 770 } 771 kmem_free(ipst->ips_vifs, sizeof (struct vif) * (MAXVIFS+1)); 772 ipst->ips_vifs = NULL; 773 kmem_free(ipst->ips_mrtstat, sizeof (struct mrtstat)); 774 ipst->ips_mrtstat = NULL; 775 kmem_free(ipst->ips_mfcs, sizeof (struct mfcb) * MFCTBLSIZ); 776 ipst->ips_mfcs = NULL; 777 kmem_free(ipst->ips_tbfs, sizeof (struct tbf) * MAXVIFS); 778 ipst->ips_tbfs = NULL; 779 780 mutex_destroy(&ipst->ips_last_encap_lock); 781 mutex_destroy(&ipst->ips_ip_g_mrouter_mutex); 782 } 783 784 static boolean_t 785 is_mrouter_off(ip_stack_t *ipst) 786 { 787 conn_t *mrouter; 788 789 mutex_enter(&ipst->ips_ip_g_mrouter_mutex); 790 if (ipst->ips_ip_g_mrouter == NULL) { 791 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 792 return (B_TRUE); 793 } 794 795 mrouter = ipst->ips_ip_g_mrouter; 796 if (mrouter->conn_multi_router == 0) { 797 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 798 return (B_TRUE); 799 } 800 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 801 return (B_FALSE); 802 } 
803 804 static void 805 unlock_good_vif(struct vif *vifp) 806 { 807 ASSERT(vifp->v_ipif != NULL); 808 ipif_refrele(vifp->v_ipif); 809 VIF_REFRELE(vifp); 810 } 811 812 static boolean_t 813 lock_good_vif(struct vif *vifp) 814 { 815 mutex_enter(&vifp->v_lock); 816 if (!(vifp->v_marks & VIF_MARK_GOOD)) { 817 mutex_exit(&vifp->v_lock); 818 return (B_FALSE); 819 } 820 821 ASSERT(vifp->v_ipif != NULL); 822 mutex_enter(&vifp->v_ipif->ipif_ill->ill_lock); 823 if (!IPIF_CAN_LOOKUP(vifp->v_ipif)) { 824 mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock); 825 mutex_exit(&vifp->v_lock); 826 return (B_FALSE); 827 } 828 ipif_refhold_locked(vifp->v_ipif); 829 mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock); 830 vifp->v_refcnt++; 831 mutex_exit(&vifp->v_lock); 832 return (B_TRUE); 833 } 834 835 /* 836 * Add a vif to the vif table. 837 */ 838 static int 839 add_vif(struct vifctl *vifcp, conn_t *connp, ip_stack_t *ipst) 840 { 841 struct vif *vifp = ipst->ips_vifs + vifcp->vifc_vifi; 842 ipif_t *ipif; 843 int error = 0; 844 struct tbf *v_tbf = ipst->ips_tbfs + vifcp->vifc_vifi; 845 conn_t *mrouter = ipst->ips_ip_g_mrouter; 846 ilm_t *ilm; 847 ill_t *ill; 848 849 ASSERT(connp != NULL); 850 851 if (vifcp->vifc_vifi >= MAXVIFS) 852 return (EINVAL); 853 854 if (is_mrouter_off(ipst)) 855 return (EINVAL); 856 857 mutex_enter(&vifp->v_lock); 858 /* 859 * Viftable entry should be 0. 860 * if v_marks == 0 but v_refcnt != 0 means struct is being 861 * initialized. 862 * 863 * Also note that it is very unlikely that we will get a MRT_ADD_VIF 864 * request while the delete is in progress, mrouted only sends add 865 * requests when a new interface is added and the new interface cannot 866 * have the same vifi as an existing interface. We make sure that 867 * ill_delete will block till the vif is deleted by adding a refcnt 868 * to ipif in del_vif(). 
869 */ 870 if (vifp->v_lcl_addr.s_addr != 0 || 871 vifp->v_marks != 0 || 872 vifp->v_refcnt != 0) { 873 mutex_exit(&vifp->v_lock); 874 return (EADDRINUSE); 875 } 876 877 /* Incoming vif should not be 0 */ 878 if (vifcp->vifc_lcl_addr.s_addr == 0) { 879 mutex_exit(&vifp->v_lock); 880 return (EINVAL); 881 } 882 883 vifp->v_refcnt++; 884 mutex_exit(&vifp->v_lock); 885 /* Find the interface with the local address */ 886 ipif = ipif_lookup_addr((ipaddr_t)vifcp->vifc_lcl_addr.s_addr, NULL, 887 IPCL_ZONEID(connp), ipst); 888 if (ipif == NULL) { 889 VIF_REFRELE(vifp); 890 return (EADDRNOTAVAIL); 891 } 892 893 if (ipst->ips_ip_mrtdebug > 1) { 894 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 895 "add_vif: src 0x%x enter", 896 vifcp->vifc_lcl_addr.s_addr); 897 } 898 899 mutex_enter(&vifp->v_lock); 900 /* 901 * Always clear cache when vifs change. 902 * Needed to ensure that src isn't left over from before vif was added. 903 * No need to get last_encap_lock, since we are running as a writer. 904 */ 905 906 mutex_enter(&ipst->ips_last_encap_lock); 907 ipst->ips_last_encap_src = 0; 908 ipst->ips_last_encap_vif = NULL; 909 mutex_exit(&ipst->ips_last_encap_lock); 910 911 if (vifcp->vifc_flags & VIFF_TUNNEL) { 912 if ((vifcp->vifc_flags & VIFF_SRCRT) != 0) { 913 cmn_err(CE_WARN, 914 "add_vif: source route tunnels not supported\n"); 915 VIF_REFRELE_LOCKED(vifp); 916 ipif_refrele(ipif); 917 return (EOPNOTSUPP); 918 } 919 vifp->v_rmt_addr = vifcp->vifc_rmt_addr; 920 921 } else { 922 /* Phyint or Register vif */ 923 if (vifcp->vifc_flags & VIFF_REGISTER) { 924 /* 925 * Note: Since all IPPROTO_IP level options (including 926 * MRT_ADD_VIF) are done exclusively via 927 * ip_optmgmt_writer(), a lock is not necessary to 928 * protect reg_vif_num. 
929 */ 930 mutex_enter(&ipst->ips_numvifs_mutex); 931 if (ipst->ips_reg_vif_num == ALL_VIFS) { 932 ipst->ips_reg_vif_num = vifcp->vifc_vifi; 933 mutex_exit(&ipst->ips_numvifs_mutex); 934 } else { 935 mutex_exit(&ipst->ips_numvifs_mutex); 936 VIF_REFRELE_LOCKED(vifp); 937 ipif_refrele(ipif); 938 return (EADDRINUSE); 939 } 940 } 941 942 /* Make sure the interface supports multicast */ 943 if ((ipif->ipif_ill->ill_flags & ILLF_MULTICAST) == 0) { 944 VIF_REFRELE_LOCKED(vifp); 945 ipif_refrele(ipif); 946 if (vifcp->vifc_flags & VIFF_REGISTER) { 947 mutex_enter(&ipst->ips_numvifs_mutex); 948 ipst->ips_reg_vif_num = ALL_VIFS; 949 mutex_exit(&ipst->ips_numvifs_mutex); 950 } 951 return (EOPNOTSUPP); 952 } 953 /* Enable promiscuous reception of all IP mcasts from the if */ 954 mutex_exit(&vifp->v_lock); 955 956 ill = ipif->ipif_ill; 957 if (IS_UNDER_IPMP(ill)) 958 ill = ipmp_ill_hold_ipmp_ill(ill); 959 960 if (ill == NULL) { 961 ilm = NULL; 962 } else { 963 ilm = ip_addmulti(&ipv6_all_zeros, ill, 964 ipif->ipif_zoneid, &error); 965 if (ilm != NULL) 966 atomic_inc_32(&ill->ill_mrouter_cnt); 967 if (IS_UNDER_IPMP(ipif->ipif_ill)) { 968 ill_refrele(ill); 969 ill = ipif->ipif_ill; 970 } 971 } 972 973 mutex_enter(&vifp->v_lock); 974 /* 975 * since we released the lock lets make sure that 976 * ip_mrouter_done() has not been called. 
977 */ 978 if (ilm == NULL || is_mrouter_off(ipst)) { 979 if (ilm != NULL) { 980 (void) ip_delmulti(ilm); 981 ASSERT(ill->ill_mrouter_cnt > 0); 982 atomic_dec_32(&ill->ill_mrouter_cnt); 983 } 984 if (vifcp->vifc_flags & VIFF_REGISTER) { 985 mutex_enter(&ipst->ips_numvifs_mutex); 986 ipst->ips_reg_vif_num = ALL_VIFS; 987 mutex_exit(&ipst->ips_numvifs_mutex); 988 } 989 VIF_REFRELE_LOCKED(vifp); 990 ipif_refrele(ipif); 991 return (error?error:EINVAL); 992 } 993 vifp->v_ilm = ilm; 994 } 995 /* Define parameters for the tbf structure */ 996 vifp->v_tbf = v_tbf; 997 gethrestime(&vifp->v_tbf->tbf_last_pkt_t); 998 vifp->v_tbf->tbf_n_tok = 0; 999 vifp->v_tbf->tbf_q_len = 0; 1000 vifp->v_tbf->tbf_max_q_len = MAXQSIZE; 1001 vifp->v_tbf->tbf_q = vifp->v_tbf->tbf_t = NULL; 1002 1003 vifp->v_flags = vifcp->vifc_flags; 1004 vifp->v_threshold = vifcp->vifc_threshold; 1005 vifp->v_lcl_addr = vifcp->vifc_lcl_addr; 1006 vifp->v_ipif = ipif; 1007 ipif_refrele(ipif); 1008 /* Scaling up here, allows division by 1024 in critical code. */ 1009 vifp->v_rate_limit = vifcp->vifc_rate_limit * (1024/1000); 1010 vifp->v_timeout_id = 0; 1011 /* initialize per vif pkt counters */ 1012 vifp->v_pkt_in = 0; 1013 vifp->v_pkt_out = 0; 1014 vifp->v_bytes_in = 0; 1015 vifp->v_bytes_out = 0; 1016 mutex_init(&vifp->v_tbf->tbf_lock, NULL, MUTEX_DEFAULT, NULL); 1017 1018 /* Adjust numvifs up, if the vifi is higher than numvifs */ 1019 mutex_enter(&ipst->ips_numvifs_mutex); 1020 if (ipst->ips_numvifs <= vifcp->vifc_vifi) 1021 ipst->ips_numvifs = vifcp->vifc_vifi + 1; 1022 mutex_exit(&ipst->ips_numvifs_mutex); 1023 1024 if (ipst->ips_ip_mrtdebug > 1) { 1025 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 1026 "add_vif: #%d, lcladdr %x, %s %x, thresh %x, rate %d", 1027 vifcp->vifc_vifi, 1028 ntohl(vifcp->vifc_lcl_addr.s_addr), 1029 (vifcp->vifc_flags & VIFF_TUNNEL) ? 
"rmtaddr" : "mask",
		    ntohl(vifcp->vifc_rmt_addr.s_addr),
		    vifcp->vifc_threshold, vifcp->vifc_rate_limit);
	}

	/* Mark the vif VIF_MARK_GOOD and drop the per-vif lock. */
	vifp->v_marks = VIF_MARK_GOOD;
	mutex_exit(&vifp->v_lock);
	return (0);
}


/*
 * Delete a vif from the vif table.
 *
 * The vif must already be marked VIF_MARK_CONDEMNED and must have a tbf
 * (both are asserted).  This routine cancels any pending v_timeout_id,
 * frees every packet still queued on the tbf, clears the last-encap cache
 * if it refers to this vif, destroys the tbf lock, recomputes ips_numvifs
 * and finally zeroes both the tbf and the vif structure.
 */
static void
del_vifp(struct vif *vifp)
{
	struct tbf	*t = vifp->v_tbf;
	mblk_t		*mp0;
	vifi_t		vifi;
	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	ASSERT(vifp->v_marks & VIF_MARK_CONDEMNED);
	ASSERT(t != NULL);

	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "del_vif: src 0x%x\n", vifp->v_lcl_addr.s_addr);
	}

	/* Cancel the pending per-vif timeout, if one is registered. */
	if (vifp->v_timeout_id != 0) {
		(void) untimeout(vifp->v_timeout_id);
		vifp->v_timeout_id = 0;
	}

	/*
	 * Free packets queued at the interface.
	 * Mrouted takes care of cleaning up mfcs - makes calls to del_mfc.
	 */
	mutex_enter(&t->tbf_lock);
	while (t->tbf_q != NULL) {
		mp0 = t->tbf_q;
		t->tbf_q = t->tbf_q->b_next;
		/* Unlink the mblk from the queue before freeing it. */
		mp0->b_prev = mp0->b_next = NULL;
		freemsg(mp0);
	}
	mutex_exit(&t->tbf_lock);

	/*
	 * Always clear cache when vifs change.
	 * The cached encapsulation vif/source pair is examined and reset
	 * while holding ips_last_encap_lock.
	 */
	mutex_enter(&ipst->ips_last_encap_lock);
	if (vifp == ipst->ips_last_encap_vif) {
		ipst->ips_last_encap_vif = NULL;
		ipst->ips_last_encap_src = 0;
	}
	mutex_exit(&ipst->ips_last_encap_lock);

	mutex_destroy(&t->tbf_lock);

	bzero(vifp->v_tbf, sizeof (*(vifp->v_tbf)));

	/*
	 * Adjust numvifs down to one past the highest slot whose local
	 * address is still non-zero.
	 * NOTE(review): this scan runs before the bzero() of vifp below, so
	 * the slot being deleted still counts if its v_lcl_addr is non-zero
	 * at this point -- confirm that is intended.
	 */
	mutex_enter(&ipst->ips_numvifs_mutex);
	for (vifi = ipst->ips_numvifs; vifi != 0; vifi--) /* vifi is unsigned */
		if (ipst->ips_vifs[vifi - 1].v_lcl_addr.s_addr != 0)
			break;
	ipst->ips_numvifs = vifi;
	mutex_exit(&ipst->ips_numvifs_mutex);

	bzero(vifp, sizeof (*vifp));
}

/*
 * Condemn the vif with index *vifip and drop the reference taken in add_vif.
 *
 * Returns EINVAL if the index is not below ips_numvifs, EADDRNOTAVAIL if
 * the slot has no local address or is not marked VIF_MARK_GOOD, and 0
 * otherwise.  For a physical-interface vif the multicast membership
 * (v_ilm) is released with the vif lock temporarily dropped.
 */
static int
del_vif(vifi_t *vifip, ip_stack_t *ipst)
{
	struct vif	*vifp = ipst->ips_vifs + *vifip;

	if (*vifip >= ipst->ips_numvifs)
		return (EINVAL);

	mutex_enter(&vifp->v_lock);
	/*
	 * Not initialized
	 * Here we are not looking at the vif that is being initialized
	 * i.e vifp->v_marks == 0 and refcnt > 0.
	 */
	if (vifp->v_lcl_addr.s_addr == 0 ||
	    !(vifp->v_marks & VIF_MARK_GOOD)) {
		mutex_exit(&vifp->v_lock);
		return (EADDRNOTAVAIL);
	}

	/* Clear VIF_MARK_GOOD and set VIF_MARK_CONDEMNED. */
	vifp->v_marks &= ~VIF_MARK_GOOD;
	vifp->v_marks |= VIF_MARK_CONDEMNED;

	/* Phyint only */
	if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
		ipif_t *ipif = vifp->v_ipif;
		ilm_t *ilm = vifp->v_ilm;

		/* Detach the ilm from the vif while v_lock is still held. */
		vifp->v_ilm = NULL;

		ASSERT(ipif != NULL);
		/*
		 * should be OK to drop the lock as we
		 * have marked this as CONDEMNED.
		 */
		mutex_exit(&(vifp)->v_lock);
		if (ilm != NULL) {
			(void) ip_delmulti(ilm);
			ASSERT(ipif->ipif_ill->ill_mrouter_cnt > 0);
			atomic_dec_32(&ipif->ipif_ill->ill_mrouter_cnt);
		}
		mutex_enter(&(vifp)->v_lock);
	}

	/* Deleting the register vif: forget its index. */
	if (vifp->v_flags & VIFF_REGISTER) {
		mutex_enter(&ipst->ips_numvifs_mutex);
		ipst->ips_reg_vif_num = ALL_VIFS;
		mutex_exit(&ipst->ips_numvifs_mutex);
	}

	/*
	 * decreases the refcnt added in add_vif.
	 * NOTE(review): v_lock is not released explicitly on this path;
	 * presumably VIF_REFRELE_LOCKED drops it -- confirm against the
	 * macro definition.
	 */
	VIF_REFRELE_LOCKED(vifp);
	return (0);
}

/*
 * Add an mfc entry.
 *
 * mfccp describes the (origin, group) pair, its parent vif and the per-vif
 * ttl thresholds.  Three cases are handled in order:
 *   1. MFCFIND locates an existing entry: only mfc_parent and mfc_ttls are
 *      updated.
 *   2. One or more entries with queued upcall packets (mfc_rte != NULL)
 *      match: each is filled in, its timeout is cancelled and every queued
 *      packet is pushed through ip_mdq() and freed.
 *   3. Nothing matched (nstl == 0): under mfcb_lock a matching
 *      non-condemned entry is refilled, or a new entry is allocated and
 *      linked at the head of the hash chain.
 *
 * Returns 0 on success, EINVAL for an out-of-range parent, a parent vif
 * with a NULL v_ipif, or when is_mrouter_off() reports true, and ENOBUFS
 * when the new entry cannot be allocated.
 */
static int
add_mfc(struct mfcctl *mfccp, ip_stack_t *ipst)
{
	struct mfc *rt;
	struct rtdetq *rte;
	ushort_t nstl;
	int i;
	struct mfcb *mfcbp;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	/*
	 * The value of vifi is NO_VIF (==MAXVIFS) if Mrouted
	 * did not have a real route for pkt.
	 * We want this pkt without rt installed in the mfctable to prevent
	 * multiple tries, so go ahead and put it in mfctable, it will
	 * be discarded later in ip_mdq() because the child is NULL.
	 */

	/* Error checking, out of bounds? */
	if (mfccp->mfcc_parent > MAXVIFS) {
		ip0dbg(("ADD_MFC: mfcc_parent out of range %d",
		    (int)mfccp->mfcc_parent));
		return (EINVAL);
	}

	if ((mfccp->mfcc_parent != NO_VIF) &&
	    (ipst->ips_vifs[mfccp->mfcc_parent].v_ipif == NULL)) {
		ip0dbg(("ADD_MFC: NULL ipif for parent vif %d\n",
		    (int)mfccp->mfcc_parent));
		return (EINVAL);
	}

	if (is_mrouter_off(ipst)) {
		return (EINVAL);
	}

	/* Hold the hash bucket for the whole operation. */
	mfcbp = &ipst->ips_mfcs[MFCHASH(mfccp->mfcc_origin.s_addr,
	    mfccp->mfcc_mcastgrp.s_addr)];
	MFCB_REFHOLD(mfcbp);
	MFCFIND(mfcbp, mfccp->mfcc_origin.s_addr,
	    mfccp->mfcc_mcastgrp.s_addr, rt);

	/* If an entry already exists, just update the fields */
	if (rt) {
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "add_mfc: update o %x grp %x parent %x",
			    ntohl(mfccp->mfcc_origin.s_addr),
			    ntohl(mfccp->mfcc_mcastgrp.s_addr),
			    mfccp->mfcc_parent);
		}
		mutex_enter(&rt->mfc_mutex);
		rt->mfc_parent = mfccp->mfcc_parent;

		mutex_enter(&ipst->ips_numvifs_mutex);
		for (i = 0; i < (int)ipst->ips_numvifs; i++)
			rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
		mutex_exit(&ipst->ips_numvifs_mutex);
		mutex_exit(&rt->mfc_mutex);

		MFCB_REFRELE(mfcbp);
		return (0);
	}

	/*
	 * Find the entry for which the upcall was made and update.
	 * nstl counts how many such entries were found; more than one
	 * triggers a warning.
	 */
	for (rt = mfcbp->mfcb_mfc, nstl = 0; rt; rt = rt->mfc_next) {
		mutex_enter(&rt->mfc_mutex);
		if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) &&
		    (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr) &&
		    (rt->mfc_rte != NULL) &&
		    !(rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
			if (nstl++ != 0)
				cmn_err(CE_WARN,
				    "add_mfc: %s o %x g %x p %x",
				    "multiple kernel entries",
				    ntohl(mfccp->mfcc_origin.s_addr),
				    ntohl(mfccp->mfcc_mcastgrp.s_addr),
				    mfccp->mfcc_parent);

			if (ipst->ips_ip_mrtdebug > 1) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "add_mfc: o %x g %x p %x",
				    ntohl(mfccp->mfcc_origin.s_addr),
				    ntohl(mfccp->mfcc_mcastgrp.s_addr),
				    mfccp->mfcc_parent);
			}
			fill_route(rt, mfccp, ipst);

			/*
			 * Prevent cleanup of cache entry.
			 * Timer starts in ip_mforward.
			 */
			if (rt->mfc_timeout_id != 0) {
				timeout_id_t id;
				id = rt->mfc_timeout_id;
				/*
				 * setting id to zero will avoid this
				 * entry from being cleaned up in
				 * expire_upcalls().
				 */
				rt->mfc_timeout_id = 0;
				/*
				 * dropping the lock is fine as we
				 * have a refhold on the bucket.
				 * so mfc cannot be freed.
				 * The timeout can fire but it will see
				 * that mfc_timeout_id == 0 and not cleanup.
				 */
				mutex_exit(&rt->mfc_mutex);
				(void) untimeout(id);
				mutex_enter(&rt->mfc_mutex);
			}

			/*
			 * Send all pkts that are queued waiting for the upcall.
			 * ip_mdq param tun set to 0 -
			 * the return value of ip_mdq() isn't used here,
			 * so value we send doesn't matter.
			 * mfc_mutex is dropped around each ip_mdq() call and
			 * re-taken before the queue head is examined again.
			 */
			while (rt->mfc_rte != NULL) {
				rte = rt->mfc_rte;
				rt->mfc_rte = rte->rte_next;
				mutex_exit(&rt->mfc_mutex);
				(void) ip_mdq(rte->mp, (ipha_t *)
				    rte->mp->b_rptr, rte->ill, 0, rt);
				freemsg(rte->mp);
				mi_free((char *)rte);
				mutex_enter(&rt->mfc_mutex);
			}
		}
		mutex_exit(&rt->mfc_mutex);
	}


	/*
	 * It is possible that an entry is being inserted without an upcall
	 */
	if (nstl == 0) {
		mutex_enter(&(mfcbp->mfcb_lock));
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "add_mfc: no upcall o %x g %x p %x",
			    ntohl(mfccp->mfcc_origin.s_addr),
			    ntohl(mfccp->mfcc_mcastgrp.s_addr),
			    mfccp->mfcc_parent);
		}
		/* Re-check now that the bucket lock is held. */
		if (is_mrouter_off(ipst)) {
			mutex_exit(&mfcbp->mfcb_lock);
			MFCB_REFRELE(mfcbp);
			return (EINVAL);
		}

		/* Reuse a matching, non-condemned entry if there is one. */
		for (rt = mfcbp->mfcb_mfc; rt; rt = rt->mfc_next) {

			mutex_enter(&rt->mfc_mutex);
			if ((rt->mfc_origin.s_addr ==
			    mfccp->mfcc_origin.s_addr) &&
			    (rt->mfc_mcastgrp.s_addr ==
			    mfccp->mfcc_mcastgrp.s_addr) &&
			    (!(rt->mfc_marks & MFCB_MARK_CONDEMNED))) {
				fill_route(rt, mfccp, ipst);
				mutex_exit(&rt->mfc_mutex);
				break;
			}
			mutex_exit(&rt->mfc_mutex);
		}

		/* No upcall, so make a new entry into mfctable */
		if (rt == NULL) {
			rt = (struct mfc *)mi_zalloc(sizeof (struct mfc));
			if (rt == NULL) {
				ip1dbg(("add_mfc: out of memory\n"));
				mutex_exit(&mfcbp->mfcb_lock);
				MFCB_REFRELE(mfcbp);
				return (ENOBUFS);
			}

			/* Insert new entry at head of hash chain */
			mutex_enter(&rt->mfc_mutex);
			fill_route(rt, mfccp, ipst);

			/* Link into table */
			rt->mfc_next = mfcbp->mfcb_mfc;
			mfcbp->mfcb_mfc = rt;
			mutex_exit(&rt->mfc_mutex);
		}
		mutex_exit(&mfcbp->mfcb_lock);
	}

	MFCB_REFRELE(mfcbp);
	return (0);
}

/*
 * Fills in mfc structure from
mrouted mfcctl. 1361 */ 1362 static void 1363 fill_route(struct mfc *rt, struct mfcctl *mfccp, ip_stack_t *ipst) 1364 { 1365 int i; 1366 1367 rt->mfc_origin = mfccp->mfcc_origin; 1368 rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp; 1369 rt->mfc_parent = mfccp->mfcc_parent; 1370 mutex_enter(&ipst->ips_numvifs_mutex); 1371 for (i = 0; i < (int)ipst->ips_numvifs; i++) { 1372 rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; 1373 } 1374 mutex_exit(&ipst->ips_numvifs_mutex); 1375 /* Initialize pkt counters per src-grp */ 1376 rt->mfc_pkt_cnt = 0; 1377 rt->mfc_byte_cnt = 0; 1378 rt->mfc_wrong_if = 0; 1379 rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_nsec = 0; 1380 1381 } 1382 1383 static void 1384 free_queue(struct mfc *mfcp) 1385 { 1386 struct rtdetq *rte0; 1387 1388 /* 1389 * Drop all queued upcall packets. 1390 * Free the mbuf with the pkt. 1391 */ 1392 while ((rte0 = mfcp->mfc_rte) != NULL) { 1393 mfcp->mfc_rte = rte0->rte_next; 1394 freemsg(rte0->mp); 1395 mi_free((char *)rte0); 1396 } 1397 } 1398 /* 1399 * go thorugh the hash bucket and free all the entries marked condemned. 
1400 */ 1401 void 1402 release_mfc(struct mfcb *mfcbp) 1403 { 1404 struct mfc *current_mfcp; 1405 struct mfc *prev_mfcp; 1406 1407 prev_mfcp = current_mfcp = mfcbp->mfcb_mfc; 1408 1409 while (current_mfcp != NULL) { 1410 if (current_mfcp->mfc_marks & MFCB_MARK_CONDEMNED) { 1411 if (current_mfcp == mfcbp->mfcb_mfc) { 1412 mfcbp->mfcb_mfc = current_mfcp->mfc_next; 1413 free_queue(current_mfcp); 1414 mi_free(current_mfcp); 1415 prev_mfcp = current_mfcp = mfcbp->mfcb_mfc; 1416 continue; 1417 } 1418 ASSERT(prev_mfcp != NULL); 1419 prev_mfcp->mfc_next = current_mfcp->mfc_next; 1420 free_queue(current_mfcp); 1421 mi_free(current_mfcp); 1422 current_mfcp = NULL; 1423 } else { 1424 prev_mfcp = current_mfcp; 1425 } 1426 1427 current_mfcp = prev_mfcp->mfc_next; 1428 1429 } 1430 mfcbp->mfcb_marks &= ~MFCB_MARK_CONDEMNED; 1431 ASSERT(mfcbp->mfcb_mfc != NULL || mfcbp->mfcb_marks == 0); 1432 } 1433 1434 /* 1435 * Delete an mfc entry. 1436 */ 1437 static int 1438 del_mfc(struct mfcctl *mfccp, ip_stack_t *ipst) 1439 { 1440 struct in_addr origin; 1441 struct in_addr mcastgrp; 1442 struct mfc *rt; 1443 uint_t hash; 1444 conn_t *mrouter = ipst->ips_ip_g_mrouter; 1445 1446 origin = mfccp->mfcc_origin; 1447 mcastgrp = mfccp->mfcc_mcastgrp; 1448 hash = MFCHASH(origin.s_addr, mcastgrp.s_addr); 1449 1450 if (ipst->ips_ip_mrtdebug > 1) { 1451 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 1452 "del_mfc: o %x g %x", 1453 ntohl(origin.s_addr), 1454 ntohl(mcastgrp.s_addr)); 1455 } 1456 1457 MFCB_REFHOLD(&ipst->ips_mfcs[hash]); 1458 1459 /* Find mfc in mfctable, finds only entries without upcalls */ 1460 for (rt = ipst->ips_mfcs[hash].mfcb_mfc; rt; rt = rt->mfc_next) { 1461 mutex_enter(&rt->mfc_mutex); 1462 if (origin.s_addr == rt->mfc_origin.s_addr && 1463 mcastgrp.s_addr == rt->mfc_mcastgrp.s_addr && 1464 rt->mfc_rte == NULL && 1465 !(rt->mfc_marks & MFCB_MARK_CONDEMNED)) 1466 break; 1467 mutex_exit(&rt->mfc_mutex); 1468 } 1469 1470 /* 1471 * Return if there was an upcall (mfc_rte != NULL, 
1472 * or rt not in mfctable. 1473 */ 1474 if (rt == NULL) { 1475 MFCB_REFRELE(&ipst->ips_mfcs[hash]); 1476 return (EADDRNOTAVAIL); 1477 } 1478 1479 1480 /* 1481 * no need to hold lock as we have a reference. 1482 */ 1483 ipst->ips_mfcs[hash].mfcb_marks |= MFCB_MARK_CONDEMNED; 1484 /* error checking */ 1485 if (rt->mfc_timeout_id != 0) { 1486 ip0dbg(("del_mfc: TIMEOUT NOT 0, rte not null")); 1487 /* 1488 * Its ok to drop the lock, the struct cannot be freed 1489 * since we have a ref on the hash bucket. 1490 */ 1491 rt->mfc_timeout_id = 0; 1492 mutex_exit(&rt->mfc_mutex); 1493 (void) untimeout(rt->mfc_timeout_id); 1494 mutex_enter(&rt->mfc_mutex); 1495 } 1496 1497 ASSERT(rt->mfc_rte == NULL); 1498 1499 1500 /* 1501 * Delete the entry from the cache 1502 */ 1503 rt->mfc_marks |= MFCB_MARK_CONDEMNED; 1504 mutex_exit(&rt->mfc_mutex); 1505 1506 MFCB_REFRELE(&ipst->ips_mfcs[hash]); 1507 1508 return (0); 1509 } 1510 1511 #define TUNNEL_LEN 12 /* # bytes of IP option for tunnel encapsulation */ 1512 1513 /* 1514 * IP multicast forwarding function. This function assumes that the packet 1515 * pointed to by ipha has arrived on (or is about to be sent to) the interface 1516 * pointed to by "ill", and the packet is to be relayed to other networks 1517 * that have members of the packet's destination IP multicast group. 1518 * 1519 * The packet is returned unscathed to the caller, unless it is 1520 * erroneous, in which case a -1 value tells the caller (IP) 1521 * to discard it. 1522 * 1523 * Unlike BSD, SunOS 5.x needs to return to IP info about 1524 * whether pkt came in thru a tunnel, so it can be discarded, unless 1525 * it's IGMP. In BSD, the ifp is bogus for tunnels, so pkt won't try 1526 * to be delivered. 
* Return values are 0 - pkt is okay and phyint
 *		    -1 - pkt is malformed and to be tossed
 *		     1 - pkt came in on tunnel
 *
 * When no cache entry exists, a copy of the packet is queued on an mfc
 * entry (creating one if needed) and, for the first queued packet only, a
 * second copy is rewritten into an IGMPMSG_NOCACHE upcall and handed to the
 * multicast routing daemon's conn_recv.
 */
int
ip_mforward(mblk_t *mp, ip_recv_attr_t *ira)
{
	ipha_t		*ipha = (ipha_t *)mp->b_rptr;
	ill_t		*ill = ira->ira_ill;
	struct mfc	*rt;
	ipaddr_t	src, dst, tunnel_src = 0;
	static int	srctun = 0;
	vifi_t		vifi;
	boolean_t	pim_reg_packet = B_FALSE;
	struct mfcb	*mfcbp;
	ip_stack_t	*ipst = ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
	ill_t		*rill = ira->ira_rill;

	ASSERT(ira->ira_pktlen == msgdsize(mp));

	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "ip_mforward: RECV ipha_src %x, ipha_dst %x, ill %s",
		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
		    ill->ill_name);
	}

	/* Classify how the packet arrived from the receive attributes. */
	dst = ipha->ipha_dst;
	if (ira->ira_flags & IRAF_PIM_REGISTER)
		pim_reg_packet = B_TRUE;
	else if (ira->ira_flags & IRAF_MROUTE_TUNNEL_SET)
		tunnel_src = ira->ira_mroute_tunnel;

	/*
	 * Don't forward a packet with time-to-live of zero or one,
	 * or a packet destined to a local-only group.
	 */
	if (CLASSD(dst) && (ipha->ipha_ttl <= 1 ||
	    (ipaddr_t)ntohl(dst) <= INADDR_MAX_LOCAL_GROUP)) {
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: not forwarded ttl %d,"
			    " dst 0x%x ill %s",
			    ipha->ipha_ttl, ntohl(dst), ill->ill_name);
		}
		if (tunnel_src != 0)
			return (1);
		else
			return (0);
	}

	if ((tunnel_src != 0) || pim_reg_packet) {
		/*
		 * Packet arrived over an encapsulated tunnel or via a PIM
		 * register message.
		 */
		if (ipst->ips_ip_mrtdebug > 1) {
			if (tunnel_src != 0) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: ill %s arrived via ENCAP TUN",
				    ill->ill_name);
			} else if (pim_reg_packet) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: ill %s arrived via"
				    " REGISTER VIF",
				    ill->ill_name);
			}
		}
	} else if ((ipha->ipha_version_and_hdr_length & 0xf) <
	    (uint_t)(IP_SIMPLE_HDR_LENGTH + TUNNEL_LEN) >> 2 ||
	    ((uchar_t *)(ipha + 1))[1] != IPOPT_LSRR) {
		/*
		 * Packet arrived via a physical interface: either the
		 * header is too short to hold the tunnel option or the
		 * option byte examined is not IPOPT_LSRR.
		 */
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: ill %s arrived via PHYINT",
			    ill->ill_name);
		}

	} else {
		/*
		 * Packet arrived through a SRCRT tunnel.
		 * Source-route tunnels are no longer supported.
		 * Error message printed every 1000 times.
		 */
		if ((srctun++ % 1000) == 0) {
			cmn_err(CE_WARN,
			    "ip_mforward: received source-routed pkt from %x",
			    ntohl(ipha->ipha_src));
		}
		return (-1);
	}

	ipst->ips_mrtstat->mrts_fwd_in++;
	src = ipha->ipha_src;

	/* Find route in cache, return NULL if not there or upcalls q'ed. */

	/*
	 * Lock the mfctable against changes made by ip_mforward.
	 * Note that only add_mfc and del_mfc can remove entries and
	 * they run with exclusive access to IP. So we do not need to
	 * guard against the rt being deleted, so release lock after reading.
	 */

	if (is_mrouter_off(ipst))
		return (-1);

	mfcbp = &ipst->ips_mfcs[MFCHASH(src, dst)];
	MFCB_REFHOLD(mfcbp);
	MFCFIND(mfcbp, src, dst, rt);

	/* Entry exists, so forward if necessary */
	if (rt != NULL) {
		int ret = 0;
		ipst->ips_mrtstat->mrts_mfc_hits++;
		if (pim_reg_packet) {
			/* Register packets are forwarded as if from the */
			/* register vif's ill, with no tunnel source. */
			ASSERT(ipst->ips_reg_vif_num != ALL_VIFS);
			ret = ip_mdq(mp, ipha,
			    ipst->ips_vifs[ipst->ips_reg_vif_num].
			    v_ipif->ipif_ill,
			    0, rt);
		} else {
			ret = ip_mdq(mp, ipha, ill, tunnel_src, rt);
		}

		MFCB_REFRELE(mfcbp);
		return (ret);

		/*
		 * Don't forward if we don't have a cache entry.  Mrouted will
		 * always provide a cache entry in response to an upcall.
		 */
	} else {
		/*
		 * If we don't have a route for packet's origin, make a copy
		 * of the packet and send message to routing daemon.
		 */
		struct mfc	*mfc_rt	 = NULL;
		mblk_t		*mp0	 = NULL;
		mblk_t		*mp_copy = NULL;
		struct rtdetq	*rte	 = NULL;
		struct rtdetq	*rte_m, *rte1, *prev_rte;
		uint_t		hash;
		int		npkts;
		boolean_t	new_mfc = B_FALSE;
		ipst->ips_mrtstat->mrts_mfc_misses++;
		/* BSD uses mrts_no_route++ */
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: no rte ill %s src %x g %x misses %d",
			    ill->ill_name, ntohl(src), ntohl(dst),
			    (int)ipst->ips_mrtstat->mrts_mfc_misses);
		}
		/*
		 * The order of the following code differs from the BSD code.
		 * Pre-mc3.5, the BSD code was incorrect and SunOS 5.x
		 * code works, so SunOS 5.x wasn't changed to conform to the
		 * BSD version.
		 */

		/* Lock mfctable. */
		hash = MFCHASH(src, dst);
		mutex_enter(&(ipst->ips_mfcs[hash].mfcb_lock));

		/*
		 * If we are turning off mrouted return an error
		 * (mfcbp is the same bucket as ips_mfcs[hash]).
		 */
		if (is_mrouter_off(ipst)) {
			mutex_exit(&mfcbp->mfcb_lock);
			MFCB_REFRELE(mfcbp);
			return (-1);
		}

		/*
		 * Is there an upcall waiting for this packet?
		 * On a match the loop exits with mfc_rt->mfc_mutex held.
		 */
		for (mfc_rt = ipst->ips_mfcs[hash].mfcb_mfc; mfc_rt;
		    mfc_rt = mfc_rt->mfc_next) {
			mutex_enter(&mfc_rt->mfc_mutex);
			if (ipst->ips_ip_mrtdebug > 1) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: MFCTAB hash %d o 0x%x"
				    " g 0x%x\n",
				    hash, ntohl(mfc_rt->mfc_origin.s_addr),
				    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
			}
			/* There is an upcall */
			if ((src == mfc_rt->mfc_origin.s_addr) &&
			    (dst == mfc_rt->mfc_mcastgrp.s_addr) &&
			    (mfc_rt->mfc_rte != NULL) &&
			    !(mfc_rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
				break;
			}
			mutex_exit(&mfc_rt->mfc_mutex);
		}
		/* No upcall, so make a new entry into mfctable */
		if (mfc_rt == NULL) {
			mfc_rt = (struct mfc *)mi_zalloc(sizeof (struct mfc));
			if (mfc_rt == NULL) {
				ipst->ips_mrtstat->mrts_fwd_drop++;
				ip1dbg(("ip_mforward: out of memory "
				    "for mfc, mfc_rt\n"));
				goto error_return;
			} else
				new_mfc = B_TRUE;
			/* Get resources */
			/* TODO could copy header and dup rest */
			/* mp_copy later becomes the upcall to the daemon. */
			mp_copy = copymsg(mp);
			if (mp_copy == NULL) {
				ipst->ips_mrtstat->mrts_fwd_drop++;
				ip1dbg(("ip_mforward: out of memory for "
				    "mblk, mp_copy\n"));
				goto error_return;
			}
			mutex_enter(&mfc_rt->mfc_mutex);
		}
		/* Get resources for rte, whether first rte or not first. */
		/* Add this packet into rtdetq */
		rte = (struct rtdetq *)mi_zalloc(sizeof (struct rtdetq));
		if (rte == NULL) {
			ipst->ips_mrtstat->mrts_fwd_drop++;
			mutex_exit(&mfc_rt->mfc_mutex);
			ip1dbg(("ip_mforward: out of memory for"
			    " rtdetq, rte\n"));
			goto error_return;
		}

		/* mp0 is the copy that stays queued until add_mfc runs. */
		mp0 = copymsg(mp);
		if (mp0 == NULL) {
			ipst->ips_mrtstat->mrts_fwd_drop++;
			ip1dbg(("ip_mforward: out of memory for mblk, mp0\n"));
			mutex_exit(&mfc_rt->mfc_mutex);
			goto error_return;
		}
		rte->mp = mp0;
		if (pim_reg_packet) {
			ASSERT(ipst->ips_reg_vif_num != ALL_VIFS);
			rte->ill =
			    ipst->ips_vifs[ipst->ips_reg_vif_num].
			    v_ipif->ipif_ill;
		} else {
			rte->ill = ill;
		}
		rte->rte_next = NULL;

		/*
		 * Determine if upcall q (rtdetq) has overflowed.
		 * mfc_rt->mfc_rte is null by mi_zalloc
		 * if it is the first message.
		 */
		for (rte_m = mfc_rt->mfc_rte, npkts = 0; rte_m;
		    rte_m = rte_m->rte_next)
			npkts++;
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: upcalls %d\n", npkts);
		}
		if (npkts > MAX_UPQ) {
			ipst->ips_mrtstat->mrts_upq_ovflw++;
			mutex_exit(&mfc_rt->mfc_mutex);
			goto error_return;
		}

		if (npkts == 0) {	/* first upcall */
			int i = 0;
			/*
			 * Now finish installing the new mfc! Now that we have
			 * resources!  Insert new entry at head of hash chain.
			 * Use src and dst which are ipaddr_t's.
			 */
			mfc_rt->mfc_origin.s_addr = src;
			mfc_rt->mfc_mcastgrp.s_addr = dst;

			mutex_enter(&ipst->ips_numvifs_mutex);
			for (i = 0; i < (int)ipst->ips_numvifs; i++)
				mfc_rt->mfc_ttls[i] = 0;
			mutex_exit(&ipst->ips_numvifs_mutex);
			mfc_rt->mfc_parent = ALL_VIFS;

			/* Link into table */
			if (ipst->ips_ip_mrtdebug > 1) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: NEW MFCTAB hash %d o 0x%x "
				    "g 0x%x\n", hash,
				    ntohl(mfc_rt->mfc_origin.s_addr),
				    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
			}
			mfc_rt->mfc_next = ipst->ips_mfcs[hash].mfcb_mfc;
			ipst->ips_mfcs[hash].mfcb_mfc = mfc_rt;
			mfc_rt->mfc_rte = NULL;
		}

		/* Link in the upcall */
		/* First upcall */
		if (mfc_rt->mfc_rte == NULL)
			mfc_rt->mfc_rte = rte;
		else {
			/* not the first upcall: append at the tail */
			prev_rte = mfc_rt->mfc_rte;
			for (rte1 = mfc_rt->mfc_rte->rte_next; rte1;
			    prev_rte = rte1, rte1 = rte1->rte_next)
				;
			prev_rte->rte_next = rte;
		}

		/*
		 * No upcalls waiting, this is first one, so send a message to
		 * routing daemon to install a route into kernel table.
		 */
		if (npkts == 0) {
			struct igmpmsg	*im;
			/* ipha_protocol is 0, for upcall */
			ASSERT(mp_copy != NULL);
			im = (struct igmpmsg *)mp_copy->b_rptr;
			im->im_msgtype = IGMPMSG_NOCACHE;
			im->im_mbz = 0;
			mutex_enter(&ipst->ips_numvifs_mutex);
			if (pim_reg_packet) {
				im->im_vif = (uchar_t)ipst->ips_reg_vif_num;
				mutex_exit(&ipst->ips_numvifs_mutex);
			} else {
				/*
				 * XXX do we need to hold locks here ?
				 * Report the first vif whose ill matches
				 * the arrival ill.
				 */
				for (vifi = 0;
				    vifi < ipst->ips_numvifs;
				    vifi++) {
					if (ipst->ips_vifs[vifi].v_ipif == NULL)
						continue;
					if (ipst->ips_vifs[vifi].
					    v_ipif->ipif_ill == ill) {
						im->im_vif = (uchar_t)vifi;
						break;
					}
				}
				mutex_exit(&ipst->ips_numvifs_mutex);
				ASSERT(vifi < ipst->ips_numvifs);
			}

			ipst->ips_mrtstat->mrts_upcalls++;
			/* Timer to discard upcalls if mrouted is too slow */
			mfc_rt->mfc_timeout_id = timeout(expire_upcalls,
			    mfc_rt, EXPIRE_TIMEOUT * UPCALL_EXPIRE);
			mutex_exit(&mfc_rt->mfc_mutex);
			mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
			/*
			 * Pass to RAWIP.  The ill pointers in ira are
			 * cleared for the call and restored afterwards.
			 */
			ira->ira_ill = ira->ira_rill = NULL;
			(mrouter->conn_recv)(mrouter, mp_copy, NULL, ira);
			ira->ira_ill = ill;
			ira->ira_rill = rill;
		} else {
			mutex_exit(&mfc_rt->mfc_mutex);
			mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			/*
			 * NOTE(review): mp_copy is only allocated when a new
			 * mfc was created, and a new mfc always has
			 * npkts == 0, so mp_copy appears to be NULL on this
			 * path -- confirm ip_drop_input() tolerates that.
			 */
			ip_drop_input("ip_mforward - upcall already waiting",
			    mp_copy, ill);
			freemsg(mp_copy);
		}

		MFCB_REFRELE(mfcbp);
		if (tunnel_src != 0)
			return (1);
		else
			return (0);
	error_return:
		/*
		 * Common failure exit.  Callers have already dropped
		 * mfc_rt->mfc_mutex if they held it; release the bucket and
		 * free whatever was allocated so far.  mfc_rt is freed only
		 * if it was allocated here (new_mfc).
		 */
		mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
		MFCB_REFRELE(mfcbp);
		if (mfc_rt != NULL && (new_mfc == B_TRUE))
			mi_free((char *)mfc_rt);
		if (rte != NULL)
			mi_free((char *)rte);
		if (mp_copy != NULL) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			ip_drop_input("ip_mforward error", mp_copy, ill);
			freemsg(mp_copy);
		}
		if (mp0 != NULL)
			freemsg(mp0);
		return (-1);
	}
}

/*
 * Clean up the mfctable cache entry if upcall is not serviced.
 * SunOS 5.x has timeout per mfc, unlike BSD which has one timer.
1915 */ 1916 static void 1917 expire_upcalls(void *arg) 1918 { 1919 struct mfc *mfc_rt = arg; 1920 uint_t hash; 1921 struct mfc *prev_mfc, *mfc0; 1922 ip_stack_t *ipst; 1923 conn_t *mrouter; 1924 1925 if (mfc_rt->mfc_rte == NULL || mfc_rt->mfc_rte->ill != NULL) { 1926 cmn_err(CE_WARN, "expire_upcalls: no ILL\n"); 1927 return; 1928 } 1929 ipst = mfc_rt->mfc_rte->ill->ill_ipst; 1930 mrouter = ipst->ips_ip_g_mrouter; 1931 1932 hash = MFCHASH(mfc_rt->mfc_origin.s_addr, mfc_rt->mfc_mcastgrp.s_addr); 1933 if (ipst->ips_ip_mrtdebug > 1) { 1934 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 1935 "expire_upcalls: hash %d s %x g %x", 1936 hash, ntohl(mfc_rt->mfc_origin.s_addr), 1937 ntohl(mfc_rt->mfc_mcastgrp.s_addr)); 1938 } 1939 MFCB_REFHOLD(&ipst->ips_mfcs[hash]); 1940 mutex_enter(&mfc_rt->mfc_mutex); 1941 /* 1942 * if timeout has been set to zero, than the 1943 * entry has been filled, no need to delete it. 1944 */ 1945 if (mfc_rt->mfc_timeout_id == 0) 1946 goto done; 1947 ipst->ips_mrtstat->mrts_cache_cleanups++; 1948 mfc_rt->mfc_timeout_id = 0; 1949 1950 /* Determine entry to be cleaned up in cache table. */ 1951 for (prev_mfc = mfc0 = ipst->ips_mfcs[hash].mfcb_mfc; mfc0; 1952 prev_mfc = mfc0, mfc0 = mfc0->mfc_next) 1953 if (mfc0 == mfc_rt) 1954 break; 1955 1956 /* del_mfc takes care of gone mfcs */ 1957 ASSERT(prev_mfc != NULL); 1958 ASSERT(mfc0 != NULL); 1959 1960 /* 1961 * Delete the entry from the cache 1962 */ 1963 ipst->ips_mfcs[hash].mfcb_marks |= MFCB_MARK_CONDEMNED; 1964 mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED; 1965 1966 /* 1967 * release_mfc will drop all queued upcall packets. 1968 * and will free the mbuf with the pkt, if, timing info. 1969 */ 1970 done: 1971 mutex_exit(&mfc_rt->mfc_mutex); 1972 MFCB_REFRELE(&ipst->ips_mfcs[hash]); 1973 } 1974 1975 /* 1976 * Packet forwarding routine once entry in the cache is made. 
*/
/*
 * mp/ipha is the packet, ill the interface it is treated as having arrived
 * on, tunnel_src the tunnel source (0 when not from a tunnel) and rt the
 * cache entry to forward by.
 *
 * Returns -1 when the packet is to be dropped (parent is NO_VIF, the parent
 * vif cannot be locked, the parent index is not below ips_numvifs, or
 * copymsg() fails on the wrong-interface path).  Otherwise returns 1 if
 * tunnel_src is non-zero and 0 if it is zero.
 */
static int
ip_mdq(mblk_t *mp, ipha_t *ipha, ill_t *ill, ipaddr_t tunnel_src,
    struct mfc *rt)
{
	vifi_t vifi;
	struct vif *vifp;
	ipaddr_t dst = ipha->ipha_dst;
	size_t  plen = msgdsize(mp);
	vifi_t num_of_vifs;
	ip_stack_t	*ipst = ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
	ip_recv_attr_t	iras;

	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "ip_mdq: SEND src %x, ipha_dst %x, ill %s",
		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
		    ill->ill_name);
	}

	/* Macro to send packet on vif, dispatching on the vif's type flags */
#define	MC_SEND(ipha, mp, vifp, dst) { \
	if ((vifp)->v_flags & VIFF_TUNNEL) \
		encap_send((ipha), (mp), (vifp), (dst)); \
	else if ((vifp)->v_flags & VIFF_REGISTER) \
		register_send((ipha), (mp), (vifp), (dst)); \
	else \
		phyint_send((ipha), (mp), (vifp), (dst)); \
}

	vifi = rt->mfc_parent;

	/*
	 * The value of vifi is MAXVIFS if the pkt had no parent, i.e.,
	 * Mrouted had no route.
	 * We wanted the route installed in the mfctable to prevent multiple
	 * tries, so it passed add_mfc(), but is discarded here. The v_ipif is
	 * NULL so we don't want to check the ill. Still needed as of Mrouted
	 * 3.6.
	 */
	if (vifi == NO_VIF) {
		ip1dbg(("ip_mdq: no route for origin ill %s, vifi is NO_VIF\n",
		    ill->ill_name));
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mdq: vifi is NO_VIF ill = %s", ill->ill_name);
		}
		return (-1);	/* drop pkt */
	}

	/*
	 * From here until unlock_good_vif() the parent vif is held.
	 * NOTE(review): ips_vifs[vifi] is indexed before the
	 * vifi >= ips_numvifs check below -- confirm vifi is always a
	 * valid array index here.
	 */
	if (!lock_good_vif(&ipst->ips_vifs[vifi]))
		return (-1);
	/*
	 * The MFC entries are not cleaned up when an ipif goes
	 * away thus this code has to guard against an MFC referencing
	 * an ipif that has been closed. Note: reset_mrt_vif_ipif
	 * sets the v_ipif to NULL when the ipif disappears.
	 */
	ASSERT(ipst->ips_vifs[vifi].v_ipif != NULL);

	if (vifi >= ipst->ips_numvifs) {
		cmn_err(CE_WARN, "ip_mdq: illegal vifi %d numvifs "
		    "%d ill %s viftable ill %s\n",
		    (int)vifi, (int)ipst->ips_numvifs, ill->ill_name,
		    ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name);
		unlock_good_vif(&ipst->ips_vifs[vifi]);
		return (-1);
	}
	/*
	 * Don't forward if it didn't arrive from the parent vif for its
	 * origin.
	 */
	if ((ipst->ips_vifs[vifi].v_ipif->ipif_ill != ill) ||
	    (ipst->ips_vifs[vifi].v_rmt_addr.s_addr != tunnel_src)) {
		/* Came in the wrong interface */
		ip1dbg(("ip_mdq: arrived wrong if, vifi %d "
		    "numvifs %d ill %s viftable ill %s\n",
		    (int)vifi, (int)ipst->ips_numvifs, ill->ill_name,
		    ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name));
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mdq: arrived wrong if, vifi %d ill "
			    "%s viftable ill %s\n",
			    (int)vifi, ill->ill_name,
			    ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name);
		}
		ipst->ips_mrtstat->mrts_wrong_if++;
		rt->mfc_wrong_if++;

		/*
		 * If we are doing PIM assert processing and we are forwarding
		 * packets on this interface, and it is a broadcast medium
		 * interface (and not a tunnel), send a message to the routing.
		 *
		 * We use the first ipif on the list, since it's all we have.
		 * Chances are the ipif_flags are the same for ipifs on the ill.
		 */
		if (ipst->ips_pim_assert && rt->mfc_ttls[vifi] > 0 &&
		    (ill->ill_ipif->ipif_flags & IPIF_BROADCAST) &&
		    !(ipst->ips_vifs[vifi].v_flags & VIFF_TUNNEL)) {
			mblk_t		*mp_copy;
			struct igmpmsg	*im;

			/* TODO could copy header and dup rest */
			mp_copy = copymsg(mp);
			if (mp_copy == NULL) {
				ipst->ips_mrtstat->mrts_fwd_drop++;
				ip1dbg(("ip_mdq: out of memory "
				    "for mblk, mp_copy\n"));
				unlock_good_vif(&ipst->ips_vifs[vifi]);
				return (-1);
			}

			/* Rewrite the copy's IP header as a WRONGVIF upcall. */
			im = (struct igmpmsg *)mp_copy->b_rptr;
			im->im_msgtype = IGMPMSG_WRONGVIF;
			im->im_mbz = 0;
			im->im_vif = (ushort_t)vifi;
			/* Pass to RAWIP */

			bzero(&iras, sizeof (iras));
			iras.ira_flags = IRAF_IS_IPV4;
			iras.ira_ip_hdr_length =
			    IPH_HDR_LENGTH(mp_copy->b_rptr);
			iras.ira_pktlen = msgdsize(mp_copy);
			(mrouter->conn_recv)(mrouter, mp_copy, NULL, &iras);
			ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
		}
		unlock_good_vif(&ipst->ips_vifs[vifi]);
		if (tunnel_src != 0)
			return (1);
		else
			return (0);
	}
	/*
	 * If I sourced this packet, it counts as output, else it was input.
	 */
	if (ipha->ipha_src == ipst->ips_vifs[vifi].v_lcl_addr.s_addr) {
		ipst->ips_vifs[vifi].v_pkt_out++;
		ipst->ips_vifs[vifi].v_bytes_out += plen;
	} else {
		ipst->ips_vifs[vifi].v_pkt_in++;
		ipst->ips_vifs[vifi].v_bytes_in += plen;
	}
	mutex_enter(&rt->mfc_mutex);
	rt->mfc_pkt_cnt++;
	rt->mfc_byte_cnt += plen;
	mutex_exit(&rt->mfc_mutex);
	unlock_good_vif(&ipst->ips_vifs[vifi]);
	/*
	 * For each vif, decide if a copy of the packet should be forwarded.
	 * Forward if:
	 *		- the vif threshold ttl is non-zero AND
	 *		- the pkt ttl exceeds the vif's threshold
	 * A non-zero mfc_ttl indicates that the vif is part of
	 * the output set for the mfc entry.
	 */
	/* Snapshot the vif count; the loop runs without ips_numvifs_mutex. */
	mutex_enter(&ipst->ips_numvifs_mutex);
	num_of_vifs = ipst->ips_numvifs;
	mutex_exit(&ipst->ips_numvifs_mutex);
	for (vifp = ipst->ips_vifs, vifi = 0;
	    vifi < num_of_vifs;
	    vifp++, vifi++) {
		if (!lock_good_vif(vifp))
			continue;
		if ((rt->mfc_ttls[vifi] > 0) &&
		    (ipha->ipha_ttl > rt->mfc_ttls[vifi])) {
			/*
			 * lock_good_vif should not have succeeded if
			 * v_ipif is null.
			 */
			ASSERT(vifp->v_ipif != NULL);
			vifp->v_pkt_out++;
			vifp->v_bytes_out += plen;
			MC_SEND(ipha, mp, vifp, dst);
			ipst->ips_mrtstat->mrts_fwd_out++;
		}
		unlock_good_vif(vifp);
	}
	if (tunnel_src != 0)
		return (1);
	else
		return (0);
}

/*
 * Send the packet on physical interface.
 * Caller assumes can continue to use mp on return.
 *
 * A copy of mp is made; it is handed to tbf_send_packet() when the vif's
 * v_rate_limit is <= 0 and to tbf_control() otherwise.  If the copy cannot
 * be allocated the drop counter is bumped and nothing is sent.
 */
/* ARGSUSED */
static void
phyint_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
{
	mblk_t	*mp_copy;
	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	/* Make a new reference to the packet */
	mp_copy = copymsg(mp);	/* TODO could copy header and dup rest */
	if (mp_copy == NULL) {
		ipst->ips_mrtstat->mrts_fwd_drop++;
		ip1dbg(("phyint_send: out of memory for mblk, mp_copy\n"));
		return;
	}
	if (vifp->v_rate_limit <= 0)
		tbf_send_packet(vifp, mp_copy);
	else  {
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "phyint_send: tbf_contr rate %d "
			    "vifp 0x%p mp 0x%p dst 0x%x",
			    vifp->v_rate_limit, (void *)vifp, (void *)mp, dst);
		}
		tbf_control(vifp, mp_copy, (ipha_t *)mp_copy->b_rptr);
	}
}

/*
 * Send the whole packet for REGISTER encapsulation to PIM daemon
 * Caller assumes it can continue to use mp on return.
2197 */ 2198 /* ARGSUSED */ 2199 static void 2200 register_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst) 2201 { 2202 struct igmpmsg *im; 2203 mblk_t *mp_copy; 2204 ipha_t *ipha_copy; 2205 ill_t *ill = vifp->v_ipif->ipif_ill; 2206 ip_stack_t *ipst = ill->ill_ipst; 2207 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2208 ip_recv_attr_t iras; 2209 2210 if (ipst->ips_ip_mrtdebug > 1) { 2211 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2212 "register_send: src %x, dst %x\n", 2213 ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst)); 2214 } 2215 2216 /* 2217 * Copy the old packet & pullup its IP header into the new mblk_t so we 2218 * can modify it. Try to fill the new mblk_t since if we don't the 2219 * ethernet driver will. 2220 */ 2221 mp_copy = allocb(sizeof (struct igmpmsg) + sizeof (ipha_t), BPRI_MED); 2222 if (mp_copy == NULL) { 2223 ++ipst->ips_mrtstat->mrts_pim_nomemory; 2224 if (ipst->ips_ip_mrtdebug > 3) { 2225 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2226 "register_send: allocb failure."); 2227 } 2228 return; 2229 } 2230 2231 /* 2232 * Bump write pointer to account for igmpmsg being added. 2233 */ 2234 mp_copy->b_wptr = mp_copy->b_rptr + sizeof (struct igmpmsg); 2235 2236 /* 2237 * Chain packet to new mblk_t. 2238 */ 2239 if ((mp_copy->b_cont = copymsg(mp)) == NULL) { 2240 ++ipst->ips_mrtstat->mrts_pim_nomemory; 2241 if (ipst->ips_ip_mrtdebug > 3) { 2242 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2243 "register_send: copymsg failure."); 2244 } 2245 freeb(mp_copy); 2246 return; 2247 } 2248 2249 /* 2250 * icmp_input() asserts that IP version field is set to an 2251 * appropriate version. Hence, the struct igmpmsg that this really 2252 * becomes, needs to have the correct IP version field. 2253 */ 2254 ipha_copy = (ipha_t *)mp_copy->b_rptr; 2255 *ipha_copy = multicast_encap_iphdr; 2256 2257 /* 2258 * The kernel uses the struct igmpmsg header to encode the messages to 2259 * the multicast routing daemon. 
Fill in the fields in the header 2260 * starting with the message type which is IGMPMSG_WHOLEPKT 2261 */ 2262 im = (struct igmpmsg *)mp_copy->b_rptr; 2263 im->im_msgtype = IGMPMSG_WHOLEPKT; 2264 im->im_src.s_addr = ipha->ipha_src; 2265 im->im_dst.s_addr = ipha->ipha_dst; 2266 2267 /* 2268 * Must Be Zero. This is because the struct igmpmsg is really an IP 2269 * header with renamed fields and the multicast routing daemon uses 2270 * an ipha_protocol (aka im_mbz) of 0 to distinguish these messages. 2271 */ 2272 im->im_mbz = 0; 2273 2274 ++ipst->ips_mrtstat->mrts_upcalls; 2275 if (IPCL_IS_NONSTR(mrouter) ? mrouter->conn_flow_cntrld : 2276 !canputnext(mrouter->conn_rq)) { 2277 ++ipst->ips_mrtstat->mrts_pim_regsend_drops; 2278 if (ipst->ips_ip_mrtdebug > 3) { 2279 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2280 "register_send: register upcall failure."); 2281 } 2282 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2283 ip_drop_input("mrts_pim_regsend_drops", mp_copy, ill); 2284 freemsg(mp_copy); 2285 } else { 2286 /* Pass to RAWIP */ 2287 bzero(&iras, sizeof (iras)); 2288 iras.ira_flags = IRAF_IS_IPV4; 2289 iras.ira_ip_hdr_length = sizeof (ipha_t); 2290 iras.ira_pktlen = msgdsize(mp_copy); 2291 (mrouter->conn_recv)(mrouter, mp_copy, NULL, &iras); 2292 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); 2293 } 2294 } 2295 2296 /* 2297 * pim_validate_cksum handles verification of the checksum in the 2298 * pim header. For PIM Register packets, the checksum is calculated 2299 * across the PIM header only. For all other packets, the checksum 2300 * is for the PIM header and remainder of the packet. 2301 * 2302 * returns: B_TRUE, if checksum is okay. 2303 * B_FALSE, if checksum is not valid. 
2304 */ 2305 static boolean_t 2306 pim_validate_cksum(mblk_t *mp, ipha_t *ip, struct pim *pimp) 2307 { 2308 mblk_t *mp_dup; 2309 2310 if ((mp_dup = dupmsg(mp)) == NULL) 2311 return (B_FALSE); 2312 2313 mp_dup->b_rptr += IPH_HDR_LENGTH(ip); 2314 if (pimp->pim_type == PIM_REGISTER) 2315 mp_dup->b_wptr = mp_dup->b_rptr + PIM_MINLEN; 2316 if (IP_CSUM(mp_dup, 0, 0)) { 2317 freemsg(mp_dup); 2318 return (B_FALSE); 2319 } 2320 freemsg(mp_dup); 2321 return (B_TRUE); 2322 } 2323 2324 /* 2325 * Process PIM protocol packets i.e. IP Protocol 103. 2326 * Register messages are decapsulated and sent onto multicast forwarding. 2327 * 2328 * Return NULL for a bad packet that is discarded here. 2329 * Return mp if the message is OK and should be handed to "raw" receivers. 2330 * Callers of pim_input() may need to reinitialize variables that were copied 2331 * from the mblk as this calls pullupmsg(). 2332 */ 2333 mblk_t * 2334 pim_input(mblk_t *mp, ip_recv_attr_t *ira) 2335 { 2336 ipha_t *eip, *ip; 2337 int iplen, pimlen, iphlen; 2338 struct pim *pimp; /* pointer to a pim struct */ 2339 uint32_t *reghdr; 2340 ill_t *ill = ira->ira_ill; 2341 ip_stack_t *ipst = ill->ill_ipst; 2342 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2343 2344 /* 2345 * Pullup the msg for PIM protocol processing. 
2346 */ 2347 if (pullupmsg(mp, -1) == 0) { 2348 ++ipst->ips_mrtstat->mrts_pim_nomemory; 2349 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2350 ip_drop_input("mrts_pim_nomemory", mp, ill); 2351 freemsg(mp); 2352 return (NULL); 2353 } 2354 2355 ip = (ipha_t *)mp->b_rptr; 2356 iplen = ip->ipha_length; 2357 iphlen = IPH_HDR_LENGTH(ip); 2358 pimlen = ntohs(iplen) - iphlen; 2359 2360 /* 2361 * Validate lengths 2362 */ 2363 if (pimlen < PIM_MINLEN) { 2364 ++ipst->ips_mrtstat->mrts_pim_malformed; 2365 if (ipst->ips_ip_mrtdebug > 1) { 2366 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2367 "pim_input: length not at least minlen"); 2368 } 2369 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2370 ip_drop_input("mrts_pim_malformed", mp, ill); 2371 freemsg(mp); 2372 return (NULL); 2373 } 2374 2375 /* 2376 * Point to the PIM header. 2377 */ 2378 pimp = (struct pim *)((caddr_t)ip + iphlen); 2379 2380 /* 2381 * Check the version number. 2382 */ 2383 if (pimp->pim_vers != PIM_VERSION) { 2384 ++ipst->ips_mrtstat->mrts_pim_badversion; 2385 if (ipst->ips_ip_mrtdebug > 1) { 2386 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2387 "pim_input: unknown version of PIM"); 2388 } 2389 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2390 ip_drop_input("mrts_pim_badversion", mp, ill); 2391 freemsg(mp); 2392 return (NULL); 2393 } 2394 2395 /* 2396 * Validate the checksum 2397 */ 2398 if (!pim_validate_cksum(mp, ip, pimp)) { 2399 ++ipst->ips_mrtstat->mrts_pim_rcv_badcsum; 2400 if (ipst->ips_ip_mrtdebug > 1) { 2401 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2402 "pim_input: invalid checksum"); 2403 } 2404 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2405 ip_drop_input("pim_rcv_badcsum", mp, ill); 2406 freemsg(mp); 2407 return (NULL); 2408 } 2409 2410 if (pimp->pim_type != PIM_REGISTER) 2411 return (mp); 2412 2413 reghdr = (uint32_t *)(pimp + 1); 2414 eip = (ipha_t *)(reghdr + 1); 2415 2416 /* 2417 * check if the inner packet is destined to mcast group 2418 */ 2419 if 
(!CLASSD(eip->ipha_dst)) { 2420 ++ipst->ips_mrtstat->mrts_pim_badregisters; 2421 if (ipst->ips_ip_mrtdebug > 1) { 2422 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2423 "pim_input: Inner pkt not mcast .. !"); 2424 } 2425 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2426 ip_drop_input("mrts_pim_badregisters", mp, ill); 2427 freemsg(mp); 2428 return (NULL); 2429 } 2430 if (ipst->ips_ip_mrtdebug > 1) { 2431 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2432 "register from %x, to %x, len %d", 2433 ntohl(eip->ipha_src), 2434 ntohl(eip->ipha_dst), 2435 ntohs(eip->ipha_length)); 2436 } 2437 /* 2438 * If the null register bit is not set, decapsulate 2439 * the packet before forwarding it. 2440 * Avoid this in no register vif 2441 */ 2442 if (!(ntohl(*reghdr) & PIM_NULL_REGISTER) && 2443 ipst->ips_reg_vif_num != ALL_VIFS) { 2444 mblk_t *mp_copy; 2445 uint_t saved_pktlen; 2446 2447 /* Copy the message */ 2448 if ((mp_copy = copymsg(mp)) == NULL) { 2449 ++ipst->ips_mrtstat->mrts_pim_nomemory; 2450 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2451 ip_drop_input("mrts_pim_nomemory", mp, ill); 2452 freemsg(mp); 2453 return (NULL); 2454 } 2455 2456 /* 2457 * Decapsulate the packet and give it to 2458 * register_mforward. 2459 */ 2460 mp_copy->b_rptr += iphlen + sizeof (pim_t) + sizeof (*reghdr); 2461 saved_pktlen = ira->ira_pktlen; 2462 ira->ira_pktlen -= iphlen + sizeof (pim_t) + sizeof (*reghdr); 2463 if (register_mforward(mp_copy, ira) != 0) { 2464 /* register_mforward already called ip_drop_input */ 2465 freemsg(mp); 2466 ira->ira_pktlen = saved_pktlen; 2467 return (NULL); 2468 } 2469 ira->ira_pktlen = saved_pktlen; 2470 } 2471 2472 /* 2473 * Pass all valid PIM packets up to any process(es) listening on a raw 2474 * PIM socket. For Solaris it is done right after pim_input() is 2475 * called. 2476 */ 2477 return (mp); 2478 } 2479 2480 /* 2481 * PIM sparse mode hook. Called by pim_input after decapsulating 2482 * the packet. 
Loop back the packet, as if we have received it. 2483 * In pim_input() we have to check if the destination is a multicast address. 2484 */ 2485 static int 2486 register_mforward(mblk_t *mp, ip_recv_attr_t *ira) 2487 { 2488 ire_t *ire; 2489 ipha_t *ipha = (ipha_t *)mp->b_rptr; 2490 ill_t *ill = ira->ira_ill; 2491 ip_stack_t *ipst = ill->ill_ipst; 2492 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2493 2494 ASSERT(ipst->ips_reg_vif_num <= ipst->ips_numvifs); 2495 2496 if (ipst->ips_ip_mrtdebug > 3) { 2497 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2498 "register_mforward: src %x, dst %x\n", 2499 ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst)); 2500 } 2501 /* 2502 * Need to pass in to ip_mforward() the information that the 2503 * packet has arrived on the register_vif. We mark it with 2504 * the IRAF_PIM_REGISTER attribute. 2505 * pim_input verified that the (inner) destination is multicast, 2506 * hence we skip the generic code in ip_input. 2507 */ 2508 ira->ira_flags |= IRAF_PIM_REGISTER; 2509 ++ipst->ips_mrtstat->mrts_pim_regforwards; 2510 2511 if (!CLASSD(ipha->ipha_dst)) { 2512 ire = ire_route_recursive_v4(ipha->ipha_dst, 0, NULL, ALL_ZONES, 2513 ira->ira_tsl, MATCH_IRE_SECATTR, IRR_ALLOCATE, 0, ipst, 2514 NULL, NULL, NULL); 2515 } else { 2516 ire = ire_multicast(ill); 2517 } 2518 ASSERT(ire != NULL); 2519 /* Normally this will return the IRE_MULTICAST */ 2520 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 2521 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2522 ip_drop_input("mrts_pim RTF_REJECT", mp, ill); 2523 freemsg(mp); 2524 ire_refrele(ire); 2525 return (-1); 2526 } 2527 ASSERT(ire->ire_type & IRE_MULTICAST); 2528 (*ire->ire_recvfn)(ire, mp, ipha, ira); 2529 ire_refrele(ire); 2530 2531 return (0); 2532 } 2533 2534 /* 2535 * Send an encapsulated packet. 2536 * Caller assumes can continue to use mp when routine returns. 
2537 */ 2538 /* ARGSUSED */ 2539 static void 2540 encap_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst) 2541 { 2542 mblk_t *mp_copy; 2543 ipha_t *ipha_copy; 2544 size_t len; 2545 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 2546 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2547 2548 if (ipst->ips_ip_mrtdebug > 1) { 2549 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2550 "encap_send: vif %ld enter", 2551 (ptrdiff_t)(vifp - ipst->ips_vifs)); 2552 } 2553 len = ntohs(ipha->ipha_length); 2554 2555 /* 2556 * Copy the old packet & pullup it's IP header into the 2557 * new mbuf so we can modify it. Try to fill the new 2558 * mbuf since if we don't the ethernet driver will. 2559 */ 2560 mp_copy = allocb(32 + sizeof (multicast_encap_iphdr), BPRI_MED); 2561 if (mp_copy == NULL) 2562 return; 2563 mp_copy->b_rptr += 32; 2564 mp_copy->b_wptr = mp_copy->b_rptr + sizeof (multicast_encap_iphdr); 2565 if ((mp_copy->b_cont = copymsg(mp)) == NULL) { 2566 freeb(mp_copy); 2567 return; 2568 } 2569 2570 /* 2571 * Fill in the encapsulating IP header. 2572 * Remote tunnel dst in rmt_addr, from add_vif(). 2573 */ 2574 ipha_copy = (ipha_t *)mp_copy->b_rptr; 2575 *ipha_copy = multicast_encap_iphdr; 2576 ASSERT((len + sizeof (ipha_t)) <= IP_MAXPACKET); 2577 ipha_copy->ipha_length = htons(len + sizeof (ipha_t)); 2578 ipha_copy->ipha_src = vifp->v_lcl_addr.s_addr; 2579 ipha_copy->ipha_dst = vifp->v_rmt_addr.s_addr; 2580 ASSERT(ipha_copy->ipha_ident == 0); 2581 2582 /* Turn the encapsulated IP header back into a valid one. 
*/ 2583 ipha = (ipha_t *)mp_copy->b_cont->b_rptr; 2584 ipha->ipha_ttl--; 2585 ipha->ipha_hdr_checksum = 0; 2586 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 2587 2588 ipha_copy->ipha_ttl = ipha->ipha_ttl; 2589 2590 if (ipst->ips_ip_mrtdebug > 1) { 2591 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2592 "encap_send: group 0x%x", ntohl(ipha->ipha_dst)); 2593 } 2594 if (vifp->v_rate_limit <= 0) 2595 tbf_send_packet(vifp, mp_copy); 2596 else 2597 /* ipha is from the original header */ 2598 tbf_control(vifp, mp_copy, ipha); 2599 } 2600 2601 /* 2602 * De-encapsulate a packet and feed it back through IP input if it 2603 * matches one of our multicast tunnels. 2604 * 2605 * This routine is called whenever IP gets a packet with prototype 2606 * IPPROTO_ENCAP and a local destination address and the packet didn't 2607 * match one of our configured IP-in-IP tunnels. 2608 */ 2609 void 2610 ip_mroute_decap(mblk_t *mp, ip_recv_attr_t *ira) 2611 { 2612 ipha_t *ipha = (ipha_t *)mp->b_rptr; 2613 ipha_t *ipha_encap; 2614 int hlen = IPH_HDR_LENGTH(ipha); 2615 int hlen_encap; 2616 ipaddr_t src; 2617 struct vif *vifp; 2618 ire_t *ire; 2619 ill_t *ill = ira->ira_ill; 2620 ip_stack_t *ipst = ill->ill_ipst; 2621 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2622 2623 /* Make sure we have all of the inner header */ 2624 ipha_encap = (ipha_t *)((char *)ipha + hlen); 2625 if (mp->b_wptr - mp->b_rptr < hlen + IP_SIMPLE_HDR_LENGTH) { 2626 ipha = ip_pullup(mp, hlen + IP_SIMPLE_HDR_LENGTH, ira); 2627 if (ipha == NULL) { 2628 ipst->ips_mrtstat->mrts_bad_tunnel++; 2629 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2630 ip_drop_input("ip_mroute_decap: too short", mp, ill); 2631 freemsg(mp); 2632 return; 2633 } 2634 ipha_encap = (ipha_t *)((char *)ipha + hlen); 2635 } 2636 hlen_encap = IPH_HDR_LENGTH(ipha_encap); 2637 if (mp->b_wptr - mp->b_rptr < hlen + hlen_encap) { 2638 ipha = ip_pullup(mp, hlen + hlen_encap, ira); 2639 if (ipha == NULL) { 2640 ipst->ips_mrtstat->mrts_bad_tunnel++; 2641 
BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2642 ip_drop_input("ip_mroute_decap: too short", mp, ill); 2643 freemsg(mp); 2644 return; 2645 } 2646 ipha_encap = (ipha_t *)((char *)ipha + hlen); 2647 } 2648 2649 /* 2650 * Dump the packet if it's not to a multicast destination or if 2651 * we don't have an encapsulating tunnel with the source. 2652 * Note: This code assumes that the remote site IP address 2653 * uniquely identifies the tunnel (i.e., that this site has 2654 * at most one tunnel with the remote site). 2655 */ 2656 if (!CLASSD(ipha_encap->ipha_dst)) { 2657 ipst->ips_mrtstat->mrts_bad_tunnel++; 2658 ip1dbg(("ip_mroute_decap: bad tunnel\n")); 2659 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2660 ip_drop_input("mrts_bad_tunnel", mp, ill); 2661 freemsg(mp); 2662 return; 2663 } 2664 src = (ipaddr_t)ipha->ipha_src; 2665 mutex_enter(&ipst->ips_last_encap_lock); 2666 if (src != ipst->ips_last_encap_src) { 2667 struct vif *vife; 2668 2669 vifp = ipst->ips_vifs; 2670 vife = vifp + ipst->ips_numvifs; 2671 ipst->ips_last_encap_src = src; 2672 ipst->ips_last_encap_vif = 0; 2673 for (; vifp < vife; ++vifp) { 2674 if (!lock_good_vif(vifp)) 2675 continue; 2676 if (vifp->v_rmt_addr.s_addr == src) { 2677 if (vifp->v_flags & VIFF_TUNNEL) 2678 ipst->ips_last_encap_vif = vifp; 2679 if (ipst->ips_ip_mrtdebug > 1) { 2680 (void) mi_strlog(mrouter->conn_rq, 2681 1, SL_TRACE, 2682 "ip_mroute_decap: good tun " 2683 "vif %ld with %x", 2684 (ptrdiff_t)(vifp - ipst->ips_vifs), 2685 ntohl(src)); 2686 } 2687 unlock_good_vif(vifp); 2688 break; 2689 } 2690 unlock_good_vif(vifp); 2691 } 2692 } 2693 if ((vifp = ipst->ips_last_encap_vif) == 0) { 2694 mutex_exit(&ipst->ips_last_encap_lock); 2695 ipst->ips_mrtstat->mrts_bad_tunnel++; 2696 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2697 ip_drop_input("mrts_bad_tunnel", mp, ill); 2698 freemsg(mp); 2699 ip1dbg(("ip_mroute_decap: vif %ld no tunnel with %x\n", 2700 (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(src))); 2701 return; 2702 } 
2703 mutex_exit(&ipst->ips_last_encap_lock); 2704 2705 /* 2706 * Need to pass in the tunnel source to ip_mforward (so that it can 2707 * verify that the packet arrived over the correct vif.) 2708 */ 2709 ira->ira_flags |= IRAF_MROUTE_TUNNEL_SET; 2710 ira->ira_mroute_tunnel = src; 2711 mp->b_rptr += hlen; 2712 ira->ira_pktlen -= hlen; 2713 ira->ira_ip_hdr_length = hlen_encap; 2714 2715 /* 2716 * We don't redo any of the filtering in ill_input_full_v4 and we 2717 * have checked that all of ipha_encap and any IP options are 2718 * pulled up. Hence we call ire_recv_multicast_v4 directly. 2719 * However, we have to check for RSVP as in ip_input_full_v4 2720 * and if so we pass it to ire_recv_broadcast_v4 for local delivery 2721 * to the rsvpd. 2722 */ 2723 if (ipha_encap->ipha_protocol == IPPROTO_RSVP && 2724 ipst->ips_ipcl_proto_fanout_v4[IPPROTO_RSVP].connf_head != NULL) { 2725 ire = ire_route_recursive_v4(INADDR_BROADCAST, 0, ill, 2726 ALL_ZONES, ira->ira_tsl, MATCH_IRE_ILL|MATCH_IRE_SECATTR, 2727 IRR_ALLOCATE, 0, ipst, NULL, NULL, NULL); 2728 } else { 2729 ire = ire_multicast(ill); 2730 } 2731 ASSERT(ire != NULL); 2732 /* Normally this will return the IRE_MULTICAST or IRE_BROADCAST */ 2733 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 2734 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2735 ip_drop_input("ip_mroute_decap: RTF_REJECT", mp, ill); 2736 freemsg(mp); 2737 ire_refrele(ire); 2738 return; 2739 } 2740 ire->ire_ib_pkt_count++; 2741 ASSERT(ire->ire_type & (IRE_MULTICAST|IRE_BROADCAST)); 2742 (*ire->ire_recvfn)(ire, mp, ipha_encap, ira); 2743 ire_refrele(ire); 2744 } 2745 2746 /* 2747 * Remove all records with v_ipif == ipif. Called when an interface goes away 2748 * (stream closed). Called as writer. 2749 */ 2750 void 2751 reset_mrt_vif_ipif(ipif_t *ipif) 2752 { 2753 vifi_t vifi, tmp_vifi; 2754 vifi_t num_of_vifs; 2755 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 2756 2757 /* Can't check vifi >= 0 since vifi_t is unsigned! 
*/ 2758 2759 mutex_enter(&ipst->ips_numvifs_mutex); 2760 num_of_vifs = ipst->ips_numvifs; 2761 mutex_exit(&ipst->ips_numvifs_mutex); 2762 2763 for (vifi = num_of_vifs; vifi != 0; vifi--) { 2764 tmp_vifi = vifi - 1; 2765 if (ipst->ips_vifs[tmp_vifi].v_ipif == ipif) { 2766 (void) del_vif(&tmp_vifi, ipst); 2767 } 2768 } 2769 } 2770 2771 /* Remove pending upcall msgs when ill goes away. Called by ill_delete. */ 2772 void 2773 reset_mrt_ill(ill_t *ill) 2774 { 2775 struct mfc *rt; 2776 struct rtdetq *rte; 2777 int i; 2778 ip_stack_t *ipst = ill->ill_ipst; 2779 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2780 timeout_id_t id; 2781 2782 for (i = 0; i < MFCTBLSIZ; i++) { 2783 MFCB_REFHOLD(&ipst->ips_mfcs[i]); 2784 if ((rt = ipst->ips_mfcs[i].mfcb_mfc) != NULL) { 2785 if (ipst->ips_ip_mrtdebug > 1) { 2786 (void) mi_strlog(mrouter->conn_rq, 1, 2787 SL_TRACE, 2788 "reset_mrt_ill: mfctable [%d]", i); 2789 } 2790 while (rt != NULL) { 2791 mutex_enter(&rt->mfc_mutex); 2792 while ((rte = rt->mfc_rte) != NULL) { 2793 if (rte->ill == ill && 2794 (id = rt->mfc_timeout_id) != 0) { 2795 /* 2796 * Its ok to drop the lock, the 2797 * struct cannot be freed since 2798 * we have a ref on the hash 2799 * bucket. 2800 */ 2801 mutex_exit(&rt->mfc_mutex); 2802 (void) untimeout(id); 2803 mutex_enter(&rt->mfc_mutex); 2804 } 2805 if (rte->ill == ill) { 2806 if (ipst->ips_ip_mrtdebug > 1) { 2807 (void) mi_strlog( 2808 mrouter->conn_rq, 2809 1, SL_TRACE, 2810 "reset_mrt_ill: " 2811 "ill 0x%p", (void *)ill); 2812 } 2813 rt->mfc_rte = rte->rte_next; 2814 freemsg(rte->mp); 2815 mi_free((char *)rte); 2816 } 2817 } 2818 mutex_exit(&rt->mfc_mutex); 2819 rt = rt->mfc_next; 2820 } 2821 } 2822 MFCB_REFRELE(&ipst->ips_mfcs[i]); 2823 } 2824 } 2825 2826 /* 2827 * Token bucket filter module. 2828 * The ipha is for mcastgrp destination for phyint and encap. 
2829 */ 2830 static void 2831 tbf_control(struct vif *vifp, mblk_t *mp, ipha_t *ipha) 2832 { 2833 size_t p_len = msgdsize(mp); 2834 struct tbf *t = vifp->v_tbf; 2835 timeout_id_t id = 0; 2836 ill_t *ill = vifp->v_ipif->ipif_ill; 2837 ip_stack_t *ipst = ill->ill_ipst; 2838 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2839 2840 /* Drop if packet is too large */ 2841 if (p_len > MAX_BKT_SIZE) { 2842 ipst->ips_mrtstat->mrts_pkt2large++; 2843 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 2844 ip_drop_output("tbf_control - too large", mp, ill); 2845 freemsg(mp); 2846 return; 2847 } 2848 if (ipst->ips_ip_mrtdebug > 1) { 2849 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2850 "tbf_ctrl: SEND vif %ld, qlen %d, ipha_dst 0x%x", 2851 (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_q_len, 2852 ntohl(ipha->ipha_dst)); 2853 } 2854 2855 mutex_enter(&t->tbf_lock); 2856 2857 tbf_update_tokens(vifp); 2858 2859 /* 2860 * If there are enough tokens, 2861 * and the queue is empty, send this packet out. 2862 */ 2863 if (ipst->ips_ip_mrtdebug > 1) { 2864 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2865 "tbf_control: vif %ld, TOKENS %d, pkt len %lu, qlen %d", 2866 (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_n_tok, p_len, 2867 t->tbf_q_len); 2868 } 2869 /* No packets are queued */ 2870 if (t->tbf_q_len == 0) { 2871 /* queue empty, send packet if enough tokens */ 2872 if (p_len <= t->tbf_n_tok) { 2873 t->tbf_n_tok -= p_len; 2874 mutex_exit(&t->tbf_lock); 2875 tbf_send_packet(vifp, mp); 2876 return; 2877 } else { 2878 /* Queue packet and timeout till later */ 2879 tbf_queue(vifp, mp); 2880 ASSERT(vifp->v_timeout_id == 0); 2881 vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp, 2882 TBF_REPROCESS); 2883 } 2884 } else if (t->tbf_q_len < t->tbf_max_q_len) { 2885 /* Finite queue length, so queue pkts and process queue */ 2886 tbf_queue(vifp, mp); 2887 tbf_process_q(vifp); 2888 } else { 2889 /* Check that we have UDP header with IP header */ 2890 size_t hdr_length = IPH_HDR_LENGTH(ipha) + 2891 
sizeof (struct udphdr); 2892 2893 if ((mp->b_wptr - mp->b_rptr) < hdr_length) { 2894 if (!pullupmsg(mp, hdr_length)) { 2895 BUMP_MIB(ill->ill_ip_mib, 2896 ipIfStatsOutDiscards); 2897 ip_drop_output("tbf_control - pullup", mp, ill); 2898 freemsg(mp); 2899 ip1dbg(("tbf_ctl: couldn't pullup udp hdr, " 2900 "vif %ld src 0x%x dst 0x%x\n", 2901 (ptrdiff_t)(vifp - ipst->ips_vifs), 2902 ntohl(ipha->ipha_src), 2903 ntohl(ipha->ipha_dst))); 2904 mutex_exit(&vifp->v_tbf->tbf_lock); 2905 return; 2906 } else 2907 /* Have to reassign ipha after pullupmsg */ 2908 ipha = (ipha_t *)mp->b_rptr; 2909 } 2910 /* 2911 * Queue length too much, 2912 * try to selectively dq, or queue and process 2913 */ 2914 if (!tbf_dq_sel(vifp, ipha)) { 2915 ipst->ips_mrtstat->mrts_q_overflow++; 2916 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 2917 ip_drop_output("mrts_q_overflow", mp, ill); 2918 freemsg(mp); 2919 } else { 2920 tbf_queue(vifp, mp); 2921 tbf_process_q(vifp); 2922 } 2923 } 2924 if (t->tbf_q_len == 0) { 2925 id = vifp->v_timeout_id; 2926 vifp->v_timeout_id = 0; 2927 } 2928 mutex_exit(&vifp->v_tbf->tbf_lock); 2929 if (id != 0) 2930 (void) untimeout(id); 2931 } 2932 2933 /* 2934 * Adds a packet to the tbf queue at the interface. 2935 * The ipha is for mcastgrp destination for phyint and encap. 
2936 */ 2937 static void 2938 tbf_queue(struct vif *vifp, mblk_t *mp) 2939 { 2940 struct tbf *t = vifp->v_tbf; 2941 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 2942 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2943 2944 if (ipst->ips_ip_mrtdebug > 1) { 2945 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2946 "tbf_queue: vif %ld", (ptrdiff_t)(vifp - ipst->ips_vifs)); 2947 } 2948 ASSERT(MUTEX_HELD(&t->tbf_lock)); 2949 2950 if (t->tbf_t == NULL) { 2951 /* Queue was empty */ 2952 t->tbf_q = mp; 2953 } else { 2954 /* Insert at tail */ 2955 t->tbf_t->b_next = mp; 2956 } 2957 /* set new tail pointer */ 2958 t->tbf_t = mp; 2959 2960 mp->b_next = mp->b_prev = NULL; 2961 2962 t->tbf_q_len++; 2963 } 2964 2965 /* 2966 * Process the queue at the vif interface. 2967 * Drops the tbf_lock when sending packets. 2968 * 2969 * NOTE : The caller should quntimeout if the queue length is 0. 2970 */ 2971 static void 2972 tbf_process_q(struct vif *vifp) 2973 { 2974 mblk_t *mp; 2975 struct tbf *t = vifp->v_tbf; 2976 size_t len; 2977 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 2978 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2979 2980 if (ipst->ips_ip_mrtdebug > 1) { 2981 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2982 "tbf_process_q 1: vif %ld qlen = %d", 2983 (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_q_len); 2984 } 2985 2986 /* 2987 * Loop through the queue at the interface and send 2988 * as many packets as possible. 2989 */ 2990 ASSERT(MUTEX_HELD(&t->tbf_lock)); 2991 2992 while (t->tbf_q_len > 0) { 2993 mp = t->tbf_q; 2994 len = (size_t)msgdsize(mp); /* length of ip pkt */ 2995 2996 /* Determine if the packet can be sent */ 2997 if (len <= t->tbf_n_tok) { 2998 /* 2999 * If so, reduce no. of tokens, dequeue the packet, 3000 * send the packet. 
3001 */ 3002 t->tbf_n_tok -= len; 3003 3004 t->tbf_q = mp->b_next; 3005 if (--t->tbf_q_len == 0) { 3006 t->tbf_t = NULL; 3007 } 3008 mp->b_next = NULL; 3009 /* Exit mutex before sending packet, then re-enter */ 3010 mutex_exit(&t->tbf_lock); 3011 tbf_send_packet(vifp, mp); 3012 mutex_enter(&t->tbf_lock); 3013 } else 3014 break; 3015 } 3016 } 3017 3018 /* Called at tbf timeout to update tokens, process q and reset timer. */ 3019 static void 3020 tbf_reprocess_q(void *arg) 3021 { 3022 struct vif *vifp = arg; 3023 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 3024 conn_t *mrouter = ipst->ips_ip_g_mrouter; 3025 3026 mutex_enter(&vifp->v_tbf->tbf_lock); 3027 vifp->v_timeout_id = 0; 3028 tbf_update_tokens(vifp); 3029 3030 tbf_process_q(vifp); 3031 3032 if (vifp->v_tbf->tbf_q_len > 0) { 3033 vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp, 3034 TBF_REPROCESS); 3035 } 3036 mutex_exit(&vifp->v_tbf->tbf_lock); 3037 3038 if (ipst->ips_ip_mrtdebug > 1) { 3039 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 3040 "tbf_reprcess_q: vif %ld timeout id = %p", 3041 (ptrdiff_t)(vifp - ipst->ips_vifs), vifp->v_timeout_id); 3042 } 3043 } 3044 3045 /* 3046 * Function that will selectively discard a member of the tbf queue, 3047 * based on the precedence value and the priority. 3048 * 3049 * NOTE : The caller should quntimeout if the queue length is 0. 
3050 */ 3051 static int 3052 tbf_dq_sel(struct vif *vifp, ipha_t *ipha) 3053 { 3054 uint_t p; 3055 struct tbf *t = vifp->v_tbf; 3056 mblk_t **np; 3057 mblk_t *last, *mp; 3058 ill_t *ill = vifp->v_ipif->ipif_ill; 3059 ip_stack_t *ipst = ill->ill_ipst; 3060 conn_t *mrouter = ipst->ips_ip_g_mrouter; 3061 3062 if (ipst->ips_ip_mrtdebug > 1) { 3063 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 3064 "dq_sel: vif %ld dst 0x%x", 3065 (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(ipha->ipha_dst)); 3066 } 3067 3068 ASSERT(MUTEX_HELD(&t->tbf_lock)); 3069 p = priority(vifp, ipha); 3070 3071 np = &t->tbf_q; 3072 last = NULL; 3073 while ((mp = *np) != NULL) { 3074 if (p > (priority(vifp, (ipha_t *)mp->b_rptr))) { 3075 *np = mp->b_next; 3076 /* If removing the last packet, fix the tail pointer */ 3077 if (mp == t->tbf_t) 3078 t->tbf_t = last; 3079 mp->b_prev = mp->b_next = NULL; 3080 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 3081 ip_drop_output("tbf_dq_send", mp, ill); 3082 freemsg(mp); 3083 /* 3084 * It's impossible for the queue to be empty, but 3085 * we check anyway. 3086 */ 3087 if (--t->tbf_q_len == 0) { 3088 t->tbf_t = NULL; 3089 } 3090 ipst->ips_mrtstat->mrts_drop_sel++; 3091 return (1); 3092 } 3093 np = &mp->b_next; 3094 last = mp; 3095 } 3096 return (0); 3097 } 3098 3099 /* Sends packet, 2 cases - encap tunnel, phyint. 
*/ 3100 static void 3101 tbf_send_packet(struct vif *vifp, mblk_t *mp) 3102 { 3103 ipif_t *ipif = vifp->v_ipif; 3104 ill_t *ill = ipif->ipif_ill; 3105 ip_stack_t *ipst = ill->ill_ipst; 3106 conn_t *mrouter = ipst->ips_ip_g_mrouter; 3107 ipha_t *ipha; 3108 3109 ipha = (ipha_t *)mp->b_rptr; 3110 /* If encap tunnel options */ 3111 if (vifp->v_flags & VIFF_TUNNEL) { 3112 ip_xmit_attr_t ixas; 3113 3114 if (ipst->ips_ip_mrtdebug > 1) { 3115 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 3116 "tbf_send_packet: ENCAP tunnel vif %ld", 3117 (ptrdiff_t)(vifp - ipst->ips_vifs)); 3118 } 3119 bzero(&ixas, sizeof (ixas)); 3120 ixas.ixa_flags = 3121 IXAF_IS_IPV4 | IXAF_NO_TTL_CHANGE | IXAF_VERIFY_SOURCE; 3122 ixas.ixa_ipst = ipst; 3123 ixas.ixa_ifindex = 0; 3124 ixas.ixa_cred = kcred; 3125 ixas.ixa_cpid = NOPID; 3126 ixas.ixa_tsl = NULL; 3127 ixas.ixa_zoneid = GLOBAL_ZONEID; /* Multicast router in GZ */ 3128 ixas.ixa_pktlen = ntohs(ipha->ipha_length); 3129 ixas.ixa_ip_hdr_length = IPH_HDR_LENGTH(ipha); 3130 3131 /* 3132 * Feed into ip_output_simple which will set the ident field 3133 * and checksum the encapsulating header. 3134 * BSD gets the cached route vifp->v_route from ip_output() 3135 * to speed up route table lookups. Not necessary in SunOS 5.x. 3136 * One could make multicast forwarding faster by putting an 3137 * ip_xmit_attr_t in each vif thereby caching the ire/nce. 3138 */ 3139 (void) ip_output_simple(mp, &ixas); 3140 ixa_cleanup(&ixas); 3141 return; 3142 3143 /* phyint */ 3144 } else { 3145 /* Need to loop back to members on the outgoing interface. 
*/ 3146 ipaddr_t dst; 3147 ip_recv_attr_t iras; 3148 nce_t *nce; 3149 3150 bzero(&iras, sizeof (iras)); 3151 iras.ira_flags = IRAF_IS_IPV4; 3152 iras.ira_ill = iras.ira_rill = ill; 3153 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex; 3154 iras.ira_zoneid = GLOBAL_ZONEID; /* Multicast router in GZ */ 3155 iras.ira_pktlen = ntohs(ipha->ipha_length); 3156 iras.ira_ip_hdr_length = IPH_HDR_LENGTH(ipha); 3157 3158 dst = ipha->ipha_dst; 3159 if (ill_hasmembers_v4(ill, dst)) { 3160 iras.ira_flags |= IRAF_LOOPBACK_COPY; 3161 } 3162 if (ipst->ips_ip_mrtdebug > 1) { 3163 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 3164 "tbf_send_pkt: phyint forward vif %ld dst = 0x%x", 3165 (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(dst)); 3166 } 3167 /* 3168 * Find an NCE which matches the nexthop. 3169 * For a pt-pt interface we use the other end of the pt-pt 3170 * link. 3171 */ 3172 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 3173 dst = ipif->ipif_pp_dst_addr; 3174 nce = arp_nce_init(ill, dst, ill->ill_net_type); 3175 } else { 3176 nce = arp_nce_init(ill, dst, IRE_MULTICAST); 3177 } 3178 if (nce == NULL) { 3179 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 3180 ip_drop_output("tbf_send_packet - no nce", mp, ill); 3181 freemsg(mp); 3182 return; 3183 } 3184 3185 /* 3186 * We don't remeber the incoming ill. Thus we 3187 * pretend the packet arrived on the outbound ill. This means 3188 * statistics for input errors will be increased on the wrong 3189 * ill but that isn't a big deal. 3190 */ 3191 ip_forward_xmit_v4(nce, ill, mp, ipha, &iras, ill->ill_mc_mtu, 3192 0); 3193 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); 3194 3195 nce_refrele(nce); 3196 } 3197 } 3198 3199 /* 3200 * Determine the current time and then the elapsed time (between the last time 3201 * and time now). Update the no. of tokens in the bucket. 
3202 */ 3203 static void 3204 tbf_update_tokens(struct vif *vifp) 3205 { 3206 timespec_t tp; 3207 hrtime_t tm; 3208 struct tbf *t = vifp->v_tbf; 3209 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 3210 conn_t *mrouter = ipst->ips_ip_g_mrouter; 3211 3212 ASSERT(MUTEX_HELD(&t->tbf_lock)); 3213 3214 /* Time in secs and nsecs, rate limit in kbits/sec */ 3215 gethrestime(&tp); 3216 3217 /*LINTED*/ 3218 TV_DELTA(tp, t->tbf_last_pkt_t, tm); 3219 3220 /* 3221 * This formula is actually 3222 * "time in seconds" * "bytes/second". Scaled for nsec. 3223 * (tm/1000000000) * (v_rate_limit * 1000 * (1000/1024) /8) 3224 * 3225 * The (1000/1024) was introduced in add_vif to optimize 3226 * this divide into a shift. 3227 */ 3228 t->tbf_n_tok += (tm/1000) * vifp->v_rate_limit / 1024 / 8; 3229 t->tbf_last_pkt_t = tp; 3230 3231 if (t->tbf_n_tok > MAX_BKT_SIZE) 3232 t->tbf_n_tok = MAX_BKT_SIZE; 3233 if (ipst->ips_ip_mrtdebug > 1) { 3234 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 3235 "tbf_update_tok: tm %lld tok %d vif %ld", 3236 tm, t->tbf_n_tok, (ptrdiff_t)(vifp - ipst->ips_vifs)); 3237 } 3238 } 3239 3240 /* 3241 * Priority currently is based on port nos. 3242 * Different forwarding mechanisms have different ways 3243 * of obtaining the port no. Hence, the vif must be 3244 * given along with the packet itself. 
3245 * 3246 */ 3247 static int 3248 priority(struct vif *vifp, ipha_t *ipha) 3249 { 3250 int prio; 3251 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 3252 conn_t *mrouter = ipst->ips_ip_g_mrouter; 3253 3254 /* Temporary hack; may add general packet classifier some day */ 3255 3256 ASSERT(MUTEX_HELD(&vifp->v_tbf->tbf_lock)); 3257 3258 /* 3259 * The UDP port space is divided up into four priority ranges: 3260 * [0, 16384) : unclassified - lowest priority 3261 * [16384, 32768) : audio - highest priority 3262 * [32768, 49152) : whiteboard - medium priority 3263 * [49152, 65536) : video - low priority 3264 */ 3265 3266 if (ipha->ipha_protocol == IPPROTO_UDP) { 3267 struct udphdr *udp = 3268 (struct udphdr *)((char *)ipha + IPH_HDR_LENGTH(ipha)); 3269 switch (ntohs(udp->uh_dport) & 0xc000) { 3270 case 0x4000: 3271 prio = 70; 3272 break; 3273 case 0x8000: 3274 prio = 60; 3275 break; 3276 case 0xc000: 3277 prio = 55; 3278 break; 3279 default: 3280 prio = 50; 3281 break; 3282 } 3283 if (ipst->ips_ip_mrtdebug > 1) { 3284 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 3285 "priority: port %x prio %d\n", 3286 ntohs(udp->uh_dport), prio); 3287 } 3288 } else 3289 prio = 50; /* default priority */ 3290 return (prio); 3291 } 3292 3293 /* 3294 * End of token bucket filter modifications 3295 */ 3296 3297 3298 3299 /* 3300 * Produces data for netstat -M. 3301 */ 3302 int 3303 ip_mroute_stats(mblk_t *mp, ip_stack_t *ipst) 3304 { 3305 ipst->ips_mrtstat->mrts_vifctlSize = sizeof (struct vifctl); 3306 ipst->ips_mrtstat->mrts_mfcctlSize = sizeof (struct mfcctl); 3307 if (!snmp_append_data(mp, (char *)ipst->ips_mrtstat, 3308 sizeof (struct mrtstat))) { 3309 ip0dbg(("ip_mroute_stats: failed %ld bytes\n", 3310 (size_t)sizeof (struct mrtstat))); 3311 return (0); 3312 } 3313 return (1); 3314 } 3315 3316 /* 3317 * Sends info for SNMP's MIB. 
3318 */ 3319 int 3320 ip_mroute_vif(mblk_t *mp, ip_stack_t *ipst) 3321 { 3322 struct vifctl vi; 3323 vifi_t vifi; 3324 3325 mutex_enter(&ipst->ips_numvifs_mutex); 3326 for (vifi = 0; vifi < ipst->ips_numvifs; vifi++) { 3327 if (ipst->ips_vifs[vifi].v_lcl_addr.s_addr == 0) 3328 continue; 3329 /* 3330 * No locks here, an approximation is fine. 3331 */ 3332 vi.vifc_vifi = vifi; 3333 vi.vifc_flags = ipst->ips_vifs[vifi].v_flags; 3334 vi.vifc_threshold = ipst->ips_vifs[vifi].v_threshold; 3335 vi.vifc_rate_limit = ipst->ips_vifs[vifi].v_rate_limit; 3336 vi.vifc_lcl_addr = ipst->ips_vifs[vifi].v_lcl_addr; 3337 vi.vifc_rmt_addr = ipst->ips_vifs[vifi].v_rmt_addr; 3338 vi.vifc_pkt_in = ipst->ips_vifs[vifi].v_pkt_in; 3339 vi.vifc_pkt_out = ipst->ips_vifs[vifi].v_pkt_out; 3340 3341 if (!snmp_append_data(mp, (char *)&vi, sizeof (vi))) { 3342 ip0dbg(("ip_mroute_vif: failed %ld bytes\n", 3343 (size_t)sizeof (vi))); 3344 mutex_exit(&ipst->ips_numvifs_mutex); 3345 return (0); 3346 } 3347 } 3348 mutex_exit(&ipst->ips_numvifs_mutex); 3349 return (1); 3350 } 3351 3352 /* 3353 * Called by ip_snmp_get to send up multicast routing table. 3354 */ 3355 int 3356 ip_mroute_mrt(mblk_t *mp, ip_stack_t *ipst) 3357 { 3358 int i, j; 3359 struct mfc *rt; 3360 struct mfcctl mfcc; 3361 3362 /* 3363 * Make sure multicast has not been turned off. 
3364 */ 3365 if (is_mrouter_off(ipst)) 3366 return (1); 3367 3368 /* Loop over all hash buckets and their chains */ 3369 for (i = 0; i < MFCTBLSIZ; i++) { 3370 MFCB_REFHOLD(&ipst->ips_mfcs[i]); 3371 for (rt = ipst->ips_mfcs[i].mfcb_mfc; rt; rt = rt->mfc_next) { 3372 mutex_enter(&rt->mfc_mutex); 3373 if (rt->mfc_rte != NULL || 3374 (rt->mfc_marks & MFCB_MARK_CONDEMNED)) { 3375 mutex_exit(&rt->mfc_mutex); 3376 continue; 3377 } 3378 mfcc.mfcc_origin = rt->mfc_origin; 3379 mfcc.mfcc_mcastgrp = rt->mfc_mcastgrp; 3380 mfcc.mfcc_parent = rt->mfc_parent; 3381 mfcc.mfcc_pkt_cnt = rt->mfc_pkt_cnt; 3382 mutex_enter(&ipst->ips_numvifs_mutex); 3383 for (j = 0; j < (int)ipst->ips_numvifs; j++) 3384 mfcc.mfcc_ttls[j] = rt->mfc_ttls[j]; 3385 for (j = (int)ipst->ips_numvifs; j < MAXVIFS; j++) 3386 mfcc.mfcc_ttls[j] = 0; 3387 mutex_exit(&ipst->ips_numvifs_mutex); 3388 3389 mutex_exit(&rt->mfc_mutex); 3390 if (!snmp_append_data(mp, (char *)&mfcc, 3391 sizeof (mfcc))) { 3392 MFCB_REFRELE(&ipst->ips_mfcs[i]); 3393 ip0dbg(("ip_mroute_mrt: failed %ld bytes\n", 3394 (size_t)sizeof (mfcc))); 3395 return (0); 3396 } 3397 } 3398 MFCB_REFRELE(&ipst->ips_mfcs[i]); 3399 } 3400 return (1); 3401 } 3402