1 /* 2 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 3 * Use is subject to license terms. 4 */ 5 /* 6 * CDDL HEADER START 7 * 8 * The contents of this file are subject to the terms of the 9 * Common Development and Distribution License (the "License"). 10 * You may not use this file except in compliance with the License. 11 * 12 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 13 * or http://www.opensolaris.org/os/licensing. 14 * See the License for the specific language governing permissions 15 * and limitations under the License. 16 * 17 * When distributing Covered Code, include this CDDL HEADER in each 18 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 19 * If applicable, add the following below this CDDL HEADER, with the 20 * fields enclosed by brackets "[]" replaced with your own identifying 21 * information: Portions Copyright [yyyy] [name of copyright owner] 22 * 23 * CDDL HEADER END 24 */ 25 /* 26 * Copyright 2008 Sun Microsystems, Inc. 27 * All rights reserved. Use is subject to license terms. 28 */ 29 /* Copyright (c) 1990 Mentat Inc. */ 30 31 /* 32 * Procedures for the kernel part of DVMRP, 33 * a Distance-Vector Multicast Routing Protocol. 34 * (See RFC-1075) 35 * Written by David Waitzman, BBN Labs, August 1988. 36 * Modified by Steve Deering, Stanford, February 1989. 37 * Modified by Mark J. 
Steiglitz, Stanford, May, 1991 38 * Modified by Van Jacobson, LBL, January 1993 39 * Modified by Ajit Thyagarajan, PARC, August 1993 40 * Modified by Bill Fenner, PARC, April 1995 41 * 42 * MROUTING 3.5 43 */ 44 45 /* 46 * TODO 47 * - function pointer field in vif, void *vif_sendit() 48 */ 49 50 #include <sys/types.h> 51 #include <sys/stream.h> 52 #include <sys/stropts.h> 53 #include <sys/strlog.h> 54 #include <sys/systm.h> 55 #include <sys/ddi.h> 56 #include <sys/cmn_err.h> 57 #include <sys/zone.h> 58 59 #include <sys/param.h> 60 #include <sys/socket.h> 61 #include <sys/vtrace.h> 62 #include <sys/debug.h> 63 #include <net/if.h> 64 #include <sys/sockio.h> 65 #include <netinet/in.h> 66 #include <net/if_dl.h> 67 68 #include <inet/common.h> 69 #include <inet/mi.h> 70 #include <inet/nd.h> 71 #include <inet/mib2.h> 72 #include <netinet/ip6.h> 73 #include <inet/ip.h> 74 #include <inet/snmpcom.h> 75 76 #include <netinet/igmp.h> 77 #include <netinet/igmp_var.h> 78 #include <netinet/udp.h> 79 #include <netinet/ip_mroute.h> 80 #include <inet/ip_multi.h> 81 #include <inet/ip_ire.h> 82 #include <inet/ip_if.h> 83 #include <inet/ipclassifier.h> 84 85 #include <netinet/pim.h> 86 87 88 /* 89 * MT Design: 90 * 91 * There are three main data structures viftable, mfctable and tbftable that 92 * need to be protected against MT races. 93 * 94 * vitable is a fixed length array of vif structs. There is no lock to protect 95 * the whole array, instead each struct is protected by its own indiviual lock. 96 * The value of v_marks in conjuction with the value of v_refcnt determines the 97 * current state of a vif structure. One special state that needs mention 98 * is when the vif is marked VIF_MARK_NOTINUSE but refcnt != 0. This indicates 99 * that vif is being initalized. 100 * Each structure is freed when the refcnt goes down to zero. If a delete comes 101 * in when the the recfnt is > 1, the vif structure is marked VIF_MARK_CONDEMNED 102 * which prevents the struct from further use. 
 * When the refcnt goes to zero
 * the struct is freed and is marked VIF_MARK_NOTINUSE.
 * vif struct stores a pointer to the ipif in v_ipif, to prevent ipif/ill
 * from going away a refhold is put on the ipif before using it. see
 * lock_good_vif() and unlock_good_vif().
 *
 * VIF_REFHOLD and VIF_REFRELE macros have been provided to manipulate refcnts
 * of the vif struct.
 *
 * tbftable is also a fixed length array of tbf structs and is only accessed
 * via v_tbf.  It is protected by its own lock tbf_lock.
 *
 * Lock Ordering is
 * v_lock --> tbf_lock
 * v_lock --> ill_lock
 *
 * mfctable is a fixed size hash table of mfc bucket structs (struct mfcb).
 * Each mfc bucket struct (struct mfcb) maintains a refcnt for each walker,
 * it also maintains a state. These fields are protected by a lock (mfcb_lock).
 * mfc structs only maintain a state and have no refcnt. mfc_mutex is used to
 * protect the struct elements.
 *
 * mfc structs are dynamically allocated and are singly linked
 * at the head of the chain. When an mfc structure is to be deleted
 * it is marked condemned and so is the state in the bucket struct.
 * When the last walker of the hash bucket exits all the mfc structs
 * marked condemned are freed.
 *
 * Locking Hierarchy:
 * The bucket lock should be acquired before the mfc struct lock.
 * MFCB_REFHOLD and MFCB_REFRELE macros are provided for locking
 * operations on the bucket struct.
 *
 * last_encap_lock and numvifs_mutex should be acquired after
 * acquiring vif or mfc locks. These locks protect some global variables.
 *
 * The statistics are not currently protected by a lock
 * causing the stats to be approximate, not exact.
140 */ 141 142 #define NO_VIF MAXVIFS /* from mrouted, no route for src */ 143 144 /* 145 * Timeouts: 146 * Upcall timeouts - BSD uses boolean_t mfc->expire and 147 * nexpire[MFCTBLSIZE], the number of times expire has been called. 148 * SunOS 5.x uses mfc->timeout for each mfc. 149 * Some Unixes are limited in the number of simultaneous timeouts 150 * that can be run, SunOS 5.x does not have this restriction. 151 */ 152 153 /* 154 * In BSD, EXPIRE_TIMEOUT is how often expire_upcalls() is called and 155 * UPCALL_EXPIRE is the nmber of timeouts before a particular upcall 156 * expires. Thus the time till expiration is EXPIRE_TIMEOUT * UPCALL_EXPIRE 157 */ 158 #define EXPIRE_TIMEOUT (hz/4) /* 4x / second */ 159 #define UPCALL_EXPIRE 6 /* number of timeouts */ 160 161 /* 162 * Hash function for a source, group entry 163 */ 164 #define MFCHASH(a, g) MFCHASHMOD(((a) >> 20) ^ ((a) >> 10) ^ (a) ^ \ 165 ((g) >> 20) ^ ((g) >> 10) ^ (g)) 166 167 #define TBF_REPROCESS (hz / 100) /* 100x /second */ 168 169 /* Identify PIM packet that came on a Register interface */ 170 #define PIM_REGISTER_MARKER 0xffffffff 171 172 /* Function declarations */ 173 static int add_mfc(struct mfcctl *, ip_stack_t *); 174 static int add_vif(struct vifctl *, conn_t *, mblk_t *, ip_stack_t *); 175 static int del_mfc(struct mfcctl *, ip_stack_t *); 176 static int del_vif(vifi_t *, conn_t *, mblk_t *, ip_stack_t *); 177 static void del_vifp(struct vif *); 178 static void encap_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t); 179 static void expire_upcalls(void *); 180 static void fill_route(struct mfc *, struct mfcctl *, ip_stack_t *); 181 static void free_queue(struct mfc *); 182 static int get_assert(uchar_t *, ip_stack_t *); 183 static int get_lsg_cnt(struct sioc_lsg_req *, ip_stack_t *); 184 static int get_sg_cnt(struct sioc_sg_req *, ip_stack_t *); 185 static int get_version(uchar_t *); 186 static int get_vif_cnt(struct sioc_vif_req *, ip_stack_t *); 187 static int ip_mdq(mblk_t *, ipha_t *, 
ill_t *, 188 ipaddr_t, struct mfc *); 189 static int ip_mrouter_init(conn_t *, uchar_t *, int, ip_stack_t *); 190 static void phyint_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t); 191 static int register_mforward(queue_t *, mblk_t *, ill_t *); 192 static void register_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t); 193 static int set_assert(int *, ip_stack_t *); 194 195 /* 196 * Token Bucket Filter functions 197 */ 198 static int priority(struct vif *, ipha_t *); 199 static void tbf_control(struct vif *, mblk_t *, ipha_t *); 200 static int tbf_dq_sel(struct vif *, ipha_t *); 201 static void tbf_process_q(struct vif *); 202 static void tbf_queue(struct vif *, mblk_t *); 203 static void tbf_reprocess_q(void *); 204 static void tbf_send_packet(struct vif *, mblk_t *); 205 static void tbf_update_tokens(struct vif *); 206 static void release_mfc(struct mfcb *); 207 208 static boolean_t is_mrouter_off(ip_stack_t *); 209 /* 210 * Encapsulation packets 211 */ 212 213 #define ENCAP_TTL 64 214 215 /* prototype IP hdr for encapsulated packets */ 216 static ipha_t multicast_encap_iphdr = { 217 IP_SIMPLE_HDR_VERSION, 218 0, /* tos */ 219 sizeof (ipha_t), /* total length */ 220 0, /* id */ 221 0, /* frag offset */ 222 ENCAP_TTL, IPPROTO_ENCAP, 223 0, /* checksum */ 224 }; 225 226 /* 227 * Rate limit for assert notification messages, in nsec. 
228 */ 229 #define ASSERT_MSG_TIME 3000000000 230 231 232 #define VIF_REFHOLD(vifp) { \ 233 mutex_enter(&(vifp)->v_lock); \ 234 (vifp)->v_refcnt++; \ 235 mutex_exit(&(vifp)->v_lock); \ 236 } 237 238 #define VIF_REFRELE_LOCKED(vifp) { \ 239 (vifp)->v_refcnt--; \ 240 if ((vifp)->v_refcnt == 0 && \ 241 ((vifp)->v_marks & VIF_MARK_CONDEMNED)) { \ 242 del_vifp(vifp); \ 243 } else { \ 244 mutex_exit(&(vifp)->v_lock); \ 245 } \ 246 } 247 248 #define VIF_REFRELE(vifp) { \ 249 mutex_enter(&(vifp)->v_lock); \ 250 (vifp)->v_refcnt--; \ 251 if ((vifp)->v_refcnt == 0 && \ 252 ((vifp)->v_marks & VIF_MARK_CONDEMNED)) { \ 253 del_vifp(vifp); \ 254 } else { \ 255 mutex_exit(&(vifp)->v_lock); \ 256 } \ 257 } 258 259 #define MFCB_REFHOLD(mfcb) { \ 260 mutex_enter(&(mfcb)->mfcb_lock); \ 261 (mfcb)->mfcb_refcnt++; \ 262 ASSERT((mfcb)->mfcb_refcnt != 0); \ 263 mutex_exit(&(mfcb)->mfcb_lock); \ 264 } 265 266 #define MFCB_REFRELE(mfcb) { \ 267 mutex_enter(&(mfcb)->mfcb_lock); \ 268 ASSERT((mfcb)->mfcb_refcnt != 0); \ 269 if (--(mfcb)->mfcb_refcnt == 0 && \ 270 ((mfcb)->mfcb_marks & MFCB_MARK_CONDEMNED)) { \ 271 release_mfc(mfcb); \ 272 } \ 273 mutex_exit(&(mfcb)->mfcb_lock); \ 274 } 275 276 /* 277 * MFCFIND: 278 * Find a route for a given origin IP address and multicast group address. 279 * Skip entries with pending upcalls. 280 * Type of service parameter to be added in the future! 281 */ 282 #define MFCFIND(mfcbp, o, g, rt) { \ 283 struct mfc *_mb_rt = NULL; \ 284 rt = NULL; \ 285 _mb_rt = mfcbp->mfcb_mfc; \ 286 while (_mb_rt) { \ 287 if ((_mb_rt->mfc_origin.s_addr == o) && \ 288 (_mb_rt->mfc_mcastgrp.s_addr == g) && \ 289 (_mb_rt->mfc_rte == NULL) && \ 290 (!(_mb_rt->mfc_marks & MFCB_MARK_CONDEMNED))) { \ 291 rt = _mb_rt; \ 292 break; \ 293 } \ 294 _mb_rt = _mb_rt->mfc_next; \ 295 } \ 296 } 297 298 /* 299 * BSD uses timeval with sec and usec. In SunOS 5.x uniqtime() and gethrtime() 300 * are inefficient. 
We use gethrestime() which returns a timespec_t with 301 * sec and nsec, the resolution is machine dependent. 302 * The following 2 macros have been changed to use nsec instead of usec. 303 */ 304 /* 305 * Macros to compute elapsed time efficiently. 306 * Borrowed from Van Jacobson's scheduling code. 307 * Delta should be a hrtime_t. 308 */ 309 #define TV_DELTA(a, b, delta) { \ 310 int xxs; \ 311 \ 312 delta = (a).tv_nsec - (b).tv_nsec; \ 313 if ((xxs = (a).tv_sec - (b).tv_sec) != 0) { \ 314 switch (xxs) { \ 315 case 2: \ 316 delta += 1000000000; \ 317 /*FALLTHROUGH*/ \ 318 case 1: \ 319 delta += 1000000000; \ 320 break; \ 321 default: \ 322 delta += (1000000000 * xxs); \ 323 } \ 324 } \ 325 } 326 327 #define TV_LT(a, b) (((a).tv_nsec < (b).tv_nsec && \ 328 (a).tv_sec <= (b).tv_sec) || (a).tv_sec < (b).tv_sec) 329 330 /* 331 * Handle MRT setsockopt commands to modify the multicast routing tables. 332 */ 333 int 334 ip_mrouter_set(int cmd, queue_t *q, int checkonly, uchar_t *data, 335 int datalen, mblk_t *first_mp) 336 { 337 conn_t *connp = Q_TO_CONN(q); 338 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 339 340 mutex_enter(&ipst->ips_ip_g_mrouter_mutex); 341 if (cmd != MRT_INIT && connp != ipst->ips_ip_g_mrouter) { 342 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 343 return (EACCES); 344 } 345 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 346 347 if (checkonly) { 348 /* 349 * do not do operation, just pretend to - new T_CHECK 350 * Note: Even routines further on can probably fail but 351 * this T_CHECK stuff is only to please XTI so it not 352 * necessary to be perfect. 353 */ 354 switch (cmd) { 355 case MRT_INIT: 356 case MRT_DONE: 357 case MRT_ADD_VIF: 358 case MRT_DEL_VIF: 359 case MRT_ADD_MFC: 360 case MRT_DEL_MFC: 361 case MRT_ASSERT: 362 return (0); 363 default: 364 return (EOPNOTSUPP); 365 } 366 } 367 368 /* 369 * make sure no command is issued after multicast routing has been 370 * turned off. 
371 */ 372 if (cmd != MRT_INIT && cmd != MRT_DONE) { 373 if (is_mrouter_off(ipst)) 374 return (EINVAL); 375 } 376 377 switch (cmd) { 378 case MRT_INIT: return (ip_mrouter_init(connp, data, datalen, ipst)); 379 case MRT_DONE: return (ip_mrouter_done(first_mp, ipst)); 380 case MRT_ADD_VIF: return (add_vif((struct vifctl *)data, connp, 381 first_mp, ipst)); 382 case MRT_DEL_VIF: return (del_vif((vifi_t *)data, connp, first_mp, 383 ipst)); 384 case MRT_ADD_MFC: return (add_mfc((struct mfcctl *)data, ipst)); 385 case MRT_DEL_MFC: return (del_mfc((struct mfcctl *)data, ipst)); 386 case MRT_ASSERT: return (set_assert((int *)data, ipst)); 387 default: return (EOPNOTSUPP); 388 } 389 } 390 391 /* 392 * Handle MRT getsockopt commands 393 */ 394 int 395 ip_mrouter_get(int cmd, queue_t *q, uchar_t *data) 396 { 397 conn_t *connp = Q_TO_CONN(q); 398 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 399 400 if (connp != ipst->ips_ip_g_mrouter) 401 return (EACCES); 402 403 switch (cmd) { 404 case MRT_VERSION: return (get_version((uchar_t *)data)); 405 case MRT_ASSERT: return (get_assert((uchar_t *)data, ipst)); 406 default: return (EOPNOTSUPP); 407 } 408 } 409 410 /* 411 * Handle ioctl commands to obtain information from the cache. 412 * Called with shared access to IP. These are read_only ioctls. 
413 */ 414 /* ARGSUSED */ 415 int 416 mrt_ioctl(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 417 ip_ioctl_cmd_t *ipip, void *if_req) 418 { 419 mblk_t *mp1; 420 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 421 conn_t *connp = Q_TO_CONN(q); 422 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 423 424 /* Existence verified in ip_wput_nondata */ 425 mp1 = mp->b_cont->b_cont; 426 427 switch (iocp->ioc_cmd) { 428 case (SIOCGETVIFCNT): 429 return (get_vif_cnt((struct sioc_vif_req *)mp1->b_rptr, ipst)); 430 case (SIOCGETSGCNT): 431 return (get_sg_cnt((struct sioc_sg_req *)mp1->b_rptr, ipst)); 432 case (SIOCGETLSGCNT): 433 return (get_lsg_cnt((struct sioc_lsg_req *)mp1->b_rptr, ipst)); 434 default: 435 return (EINVAL); 436 } 437 } 438 439 /* 440 * Returns the packet, byte, rpf-failure count for the source, group provided. 441 */ 442 static int 443 get_sg_cnt(struct sioc_sg_req *req, ip_stack_t *ipst) 444 { 445 struct mfc *rt; 446 struct mfcb *mfcbp; 447 448 mfcbp = &ipst->ips_mfcs[MFCHASH(req->src.s_addr, req->grp.s_addr)]; 449 MFCB_REFHOLD(mfcbp); 450 MFCFIND(mfcbp, req->src.s_addr, req->grp.s_addr, rt); 451 452 if (rt != NULL) { 453 mutex_enter(&rt->mfc_mutex); 454 req->pktcnt = rt->mfc_pkt_cnt; 455 req->bytecnt = rt->mfc_byte_cnt; 456 req->wrong_if = rt->mfc_wrong_if; 457 mutex_exit(&rt->mfc_mutex); 458 } else 459 req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffffU; 460 461 MFCB_REFRELE(mfcbp); 462 return (0); 463 } 464 465 /* 466 * Returns the packet, byte, rpf-failure count for the source, group provided. 467 * Uses larger counters and IPv6 addresses. 468 */ 469 /* ARGSUSED XXX until implemented */ 470 static int 471 get_lsg_cnt(struct sioc_lsg_req *req, ip_stack_t *ipst) 472 { 473 /* XXX TODO SIOCGETLSGCNT */ 474 return (ENXIO); 475 } 476 477 /* 478 * Returns the input and output packet and byte counts on the vif provided. 
479 */ 480 static int 481 get_vif_cnt(struct sioc_vif_req *req, ip_stack_t *ipst) 482 { 483 vifi_t vifi = req->vifi; 484 485 if (vifi >= ipst->ips_numvifs) 486 return (EINVAL); 487 488 /* 489 * No locks here, an approximation is fine. 490 */ 491 req->icount = ipst->ips_vifs[vifi].v_pkt_in; 492 req->ocount = ipst->ips_vifs[vifi].v_pkt_out; 493 req->ibytes = ipst->ips_vifs[vifi].v_bytes_in; 494 req->obytes = ipst->ips_vifs[vifi].v_bytes_out; 495 496 return (0); 497 } 498 499 static int 500 get_version(uchar_t *data) 501 { 502 int *v = (int *)data; 503 504 *v = 0x0305; /* XXX !!!! */ 505 506 return (0); 507 } 508 509 /* 510 * Set PIM assert processing global. 511 */ 512 static int 513 set_assert(int *i, ip_stack_t *ipst) 514 { 515 if ((*i != 1) && (*i != 0)) 516 return (EINVAL); 517 518 ipst->ips_pim_assert = *i; 519 520 return (0); 521 } 522 523 /* 524 * Get PIM assert processing global. 525 */ 526 static int 527 get_assert(uchar_t *data, ip_stack_t *ipst) 528 { 529 int *i = (int *)data; 530 531 *i = ipst->ips_pim_assert; 532 533 return (0); 534 } 535 536 /* 537 * Enable multicast routing. 538 */ 539 static int 540 ip_mrouter_init(conn_t *connp, uchar_t *data, int datalen, ip_stack_t *ipst) 541 { 542 int *v; 543 544 if (data == NULL || (datalen != sizeof (int))) 545 return (ENOPROTOOPT); 546 547 v = (int *)data; 548 if (*v != 1) 549 return (ENOPROTOOPT); 550 551 mutex_enter(&ipst->ips_ip_g_mrouter_mutex); 552 if (ipst->ips_ip_g_mrouter != NULL) { 553 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 554 return (EADDRINUSE); 555 } 556 557 /* 558 * MRT_INIT should only be allowed for RAW sockets, but we double 559 * check. 
560 */ 561 if (!IPCL_IS_RAWIP(connp)) { 562 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 563 return (EINVAL); 564 } 565 566 ipst->ips_ip_g_mrouter = connp; 567 connp->conn_multi_router = 1; 568 /* In order for tunnels to work we have to turn ip_g_forward on */ 569 if (!WE_ARE_FORWARDING(ipst)) { 570 if (ipst->ips_ip_mrtdebug > 1) { 571 (void) mi_strlog(connp->conn_rq, 1, SL_TRACE, 572 "ip_mrouter_init: turning on forwarding"); 573 } 574 ipst->ips_saved_ip_g_forward = ipst->ips_ip_g_forward; 575 ipst->ips_ip_g_forward = IP_FORWARD_ALWAYS; 576 } 577 578 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 579 return (0); 580 } 581 582 void 583 ip_mrouter_stack_init(ip_stack_t *ipst) 584 { 585 mutex_init(&ipst->ips_ip_g_mrouter_mutex, NULL, MUTEX_DEFAULT, NULL); 586 587 ipst->ips_vifs = kmem_zalloc(sizeof (struct vif) * (MAXVIFS+1), 588 KM_SLEEP); 589 ipst->ips_mrtstat = kmem_zalloc(sizeof (struct mrtstat), KM_SLEEP); 590 /* 591 * mfctable: 592 * Includes all mfcs, including waiting upcalls. 593 * Multiple mfcs per bucket. 594 */ 595 ipst->ips_mfcs = kmem_zalloc(sizeof (struct mfcb) * MFCTBLSIZ, 596 KM_SLEEP); 597 /* 598 * Define the token bucket filter structures. 599 * tbftable -> each vif has one of these for storing info. 600 */ 601 ipst->ips_tbfs = kmem_zalloc(sizeof (struct tbf) * MAXVIFS, KM_SLEEP); 602 603 mutex_init(&ipst->ips_last_encap_lock, NULL, MUTEX_DEFAULT, NULL); 604 605 ipst->ips_mrtstat->mrts_vifctlSize = sizeof (struct vifctl); 606 ipst->ips_mrtstat->mrts_mfcctlSize = sizeof (struct mfcctl); 607 } 608 609 /* 610 * Disable multicast routing. 611 * Didn't use global timeout_val (BSD version), instead check the mfctable. 
612 */ 613 int 614 ip_mrouter_done(mblk_t *mp, ip_stack_t *ipst) 615 { 616 conn_t *mrouter; 617 vifi_t vifi; 618 struct mfc *mfc_rt; 619 int i; 620 621 mutex_enter(&ipst->ips_ip_g_mrouter_mutex); 622 if (ipst->ips_ip_g_mrouter == NULL) { 623 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 624 return (EINVAL); 625 } 626 627 mrouter = ipst->ips_ip_g_mrouter; 628 629 if (ipst->ips_saved_ip_g_forward != -1) { 630 if (ipst->ips_ip_mrtdebug > 1) { 631 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 632 "ip_mrouter_done: turning off forwarding"); 633 } 634 ipst->ips_ip_g_forward = ipst->ips_saved_ip_g_forward; 635 ipst->ips_saved_ip_g_forward = -1; 636 } 637 638 /* 639 * Always clear cache when vifs change. 640 * No need to get ipst->ips_last_encap_lock since we are running as 641 * a writer. 642 */ 643 mutex_enter(&ipst->ips_last_encap_lock); 644 ipst->ips_last_encap_src = 0; 645 ipst->ips_last_encap_vif = NULL; 646 mutex_exit(&ipst->ips_last_encap_lock); 647 mrouter->conn_multi_router = 0; 648 649 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 650 651 /* 652 * For each phyint in use, 653 * disable promiscuous reception of all IP multicasts. 654 */ 655 for (vifi = 0; vifi < MAXVIFS; vifi++) { 656 struct vif *vifp = ipst->ips_vifs + vifi; 657 658 mutex_enter(&vifp->v_lock); 659 /* 660 * if the vif is active mark it condemned. 661 */ 662 if (vifp->v_marks & VIF_MARK_GOOD) { 663 ASSERT(vifp->v_ipif != NULL); 664 ipif_refhold(vifp->v_ipif); 665 /* Phyint only */ 666 if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) { 667 ipif_t *ipif = vifp->v_ipif; 668 ipsq_t *ipsq; 669 boolean_t suc; 670 ill_t *ill; 671 672 ill = ipif->ipif_ill; 673 suc = B_FALSE; 674 if (mp == NULL) { 675 /* 676 * being called from ip_close, 677 * lets do it synchronously. 678 * Clear VIF_MARK_GOOD and 679 * set VIF_MARK_CONDEMNED. 
680 */ 681 vifp->v_marks &= ~VIF_MARK_GOOD; 682 vifp->v_marks |= VIF_MARK_CONDEMNED; 683 mutex_exit(&(vifp)->v_lock); 684 suc = ipsq_enter(ill, B_FALSE, NEW_OP); 685 ipsq = ill->ill_phyint->phyint_ipsq; 686 } else { 687 ipsq = ipsq_try_enter(ipif, NULL, 688 mrouter->conn_wq, mp, 689 ip_restart_optmgmt, NEW_OP, B_TRUE); 690 if (ipsq == NULL) { 691 mutex_exit(&(vifp)->v_lock); 692 ipif_refrele(ipif); 693 return (EINPROGRESS); 694 } 695 /* 696 * Clear VIF_MARK_GOOD and 697 * set VIF_MARK_CONDEMNED. 698 */ 699 vifp->v_marks &= ~VIF_MARK_GOOD; 700 vifp->v_marks |= VIF_MARK_CONDEMNED; 701 mutex_exit(&(vifp)->v_lock); 702 suc = B_TRUE; 703 } 704 705 if (suc) { 706 (void) ip_delmulti(INADDR_ANY, ipif, 707 B_TRUE, B_TRUE); 708 ipsq_exit(ipsq); 709 } 710 mutex_enter(&vifp->v_lock); 711 } 712 ipif_refrele(vifp->v_ipif); 713 /* 714 * decreases the refcnt added in add_vif. 715 * and release v_lock. 716 */ 717 VIF_REFRELE_LOCKED(vifp); 718 } else { 719 mutex_exit(&vifp->v_lock); 720 continue; 721 } 722 } 723 724 mutex_enter(&ipst->ips_numvifs_mutex); 725 ipst->ips_numvifs = 0; 726 ipst->ips_pim_assert = 0; 727 ipst->ips_reg_vif_num = ALL_VIFS; 728 mutex_exit(&ipst->ips_numvifs_mutex); 729 730 /* 731 * Free upcall msgs. 732 * Go through mfctable and stop any outstanding upcall 733 * timeouts remaining on mfcs. 734 */ 735 for (i = 0; i < MFCTBLSIZ; i++) { 736 mutex_enter(&ipst->ips_mfcs[i].mfcb_lock); 737 ipst->ips_mfcs[i].mfcb_refcnt++; 738 ipst->ips_mfcs[i].mfcb_marks |= MFCB_MARK_CONDEMNED; 739 mutex_exit(&ipst->ips_mfcs[i].mfcb_lock); 740 mfc_rt = ipst->ips_mfcs[i].mfcb_mfc; 741 while (mfc_rt) { 742 /* Free upcalls */ 743 mutex_enter(&mfc_rt->mfc_mutex); 744 if (mfc_rt->mfc_rte != NULL) { 745 if (mfc_rt->mfc_timeout_id != 0) { 746 /* 747 * OK to drop the lock as we have 748 * a refcnt on the bucket. timeout 749 * can fire but it will see that 750 * mfc_timeout_id == 0 and not do 751 * anything. see expire_upcalls(). 
752 */ 753 mfc_rt->mfc_timeout_id = 0; 754 mutex_exit(&mfc_rt->mfc_mutex); 755 (void) untimeout( 756 mfc_rt->mfc_timeout_id); 757 mfc_rt->mfc_timeout_id = 0; 758 mutex_enter(&mfc_rt->mfc_mutex); 759 760 /* 761 * all queued upcall packets 762 * and mblk will be freed in 763 * release_mfc(). 764 */ 765 } 766 } 767 768 mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED; 769 770 mutex_exit(&mfc_rt->mfc_mutex); 771 mfc_rt = mfc_rt->mfc_next; 772 } 773 MFCB_REFRELE(&ipst->ips_mfcs[i]); 774 } 775 776 mutex_enter(&ipst->ips_ip_g_mrouter_mutex); 777 ipst->ips_ip_g_mrouter = NULL; 778 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 779 return (0); 780 } 781 782 void 783 ip_mrouter_stack_destroy(ip_stack_t *ipst) 784 { 785 struct mfcb *mfcbp; 786 struct mfc *rt; 787 int i; 788 789 for (i = 0; i < MFCTBLSIZ; i++) { 790 mfcbp = &ipst->ips_mfcs[i]; 791 792 while ((rt = mfcbp->mfcb_mfc) != NULL) { 793 (void) printf("ip_mrouter_stack_destroy: free for %d\n", 794 i); 795 796 mfcbp->mfcb_mfc = rt->mfc_next; 797 free_queue(rt); 798 mi_free(rt); 799 } 800 } 801 kmem_free(ipst->ips_vifs, sizeof (struct vif) * (MAXVIFS+1)); 802 ipst->ips_vifs = NULL; 803 kmem_free(ipst->ips_mrtstat, sizeof (struct mrtstat)); 804 ipst->ips_mrtstat = NULL; 805 kmem_free(ipst->ips_mfcs, sizeof (struct mfcb) * MFCTBLSIZ); 806 ipst->ips_mfcs = NULL; 807 kmem_free(ipst->ips_tbfs, sizeof (struct tbf) * MAXVIFS); 808 ipst->ips_tbfs = NULL; 809 810 mutex_destroy(&ipst->ips_last_encap_lock); 811 mutex_destroy(&ipst->ips_ip_g_mrouter_mutex); 812 } 813 814 static boolean_t 815 is_mrouter_off(ip_stack_t *ipst) 816 { 817 conn_t *mrouter; 818 819 mutex_enter(&ipst->ips_ip_g_mrouter_mutex); 820 if (ipst->ips_ip_g_mrouter == NULL) { 821 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 822 return (B_TRUE); 823 } 824 825 mrouter = ipst->ips_ip_g_mrouter; 826 if (mrouter->conn_multi_router == 0) { 827 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 828 return (B_TRUE); 829 } 830 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 831 return (B_FALSE); 832 } 
833 834 static void 835 unlock_good_vif(struct vif *vifp) 836 { 837 ASSERT(vifp->v_ipif != NULL); 838 ipif_refrele(vifp->v_ipif); 839 VIF_REFRELE(vifp); 840 } 841 842 static boolean_t 843 lock_good_vif(struct vif *vifp) 844 { 845 mutex_enter(&vifp->v_lock); 846 if (!(vifp->v_marks & VIF_MARK_GOOD)) { 847 mutex_exit(&vifp->v_lock); 848 return (B_FALSE); 849 } 850 851 ASSERT(vifp->v_ipif != NULL); 852 mutex_enter(&vifp->v_ipif->ipif_ill->ill_lock); 853 if (!IPIF_CAN_LOOKUP(vifp->v_ipif)) { 854 mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock); 855 mutex_exit(&vifp->v_lock); 856 return (B_FALSE); 857 } 858 ipif_refhold_locked(vifp->v_ipif); 859 mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock); 860 vifp->v_refcnt++; 861 mutex_exit(&vifp->v_lock); 862 return (B_TRUE); 863 } 864 865 /* 866 * Add a vif to the vif table. 867 */ 868 static int 869 add_vif(struct vifctl *vifcp, conn_t *connp, mblk_t *first_mp, ip_stack_t *ipst) 870 { 871 struct vif *vifp = ipst->ips_vifs + vifcp->vifc_vifi; 872 ipif_t *ipif; 873 int error; 874 struct tbf *v_tbf = ipst->ips_tbfs + vifcp->vifc_vifi; 875 ipsq_t *ipsq; 876 conn_t *mrouter = ipst->ips_ip_g_mrouter; 877 878 ASSERT(connp != NULL); 879 880 if (vifcp->vifc_vifi >= MAXVIFS) 881 return (EINVAL); 882 883 if (is_mrouter_off(ipst)) 884 return (EINVAL); 885 886 mutex_enter(&vifp->v_lock); 887 /* 888 * Viftable entry should be 0. 889 * if v_marks == 0 but v_refcnt != 0 means struct is being 890 * initialized. 891 * 892 * Also note that it is very unlikely that we will get a MRT_ADD_VIF 893 * request while the delete is in progress, mrouted only sends add 894 * requests when a new interface is added and the new interface cannot 895 * have the same vifi as an existing interface. We make sure that 896 * ill_delete will block till the vif is deleted by adding a refcnt 897 * to ipif in del_vif(). 
898 */ 899 if (vifp->v_lcl_addr.s_addr != 0 || 900 vifp->v_marks != 0 || 901 vifp->v_refcnt != 0) { 902 mutex_exit(&vifp->v_lock); 903 return (EADDRINUSE); 904 } 905 906 /* Incoming vif should not be 0 */ 907 if (vifcp->vifc_lcl_addr.s_addr == 0) { 908 mutex_exit(&vifp->v_lock); 909 return (EINVAL); 910 } 911 912 vifp->v_refcnt++; 913 mutex_exit(&vifp->v_lock); 914 /* Find the interface with the local address */ 915 ipif = ipif_lookup_addr((ipaddr_t)vifcp->vifc_lcl_addr.s_addr, NULL, 916 connp->conn_zoneid, CONNP_TO_WQ(connp), first_mp, 917 ip_restart_optmgmt, &error, ipst); 918 if (ipif == NULL) { 919 VIF_REFRELE(vifp); 920 if (error == EINPROGRESS) 921 return (error); 922 return (EADDRNOTAVAIL); 923 } 924 925 /* 926 * We have to be exclusive as we have to call ip_addmulti() 927 * This is the best position to try to be exclusive in case 928 * we have to wait. 929 */ 930 ipsq = ipsq_try_enter(ipif, NULL, CONNP_TO_WQ(connp), first_mp, 931 ip_restart_optmgmt, NEW_OP, B_TRUE); 932 if ((ipsq) == NULL) { 933 VIF_REFRELE(vifp); 934 ipif_refrele(ipif); 935 return (EINPROGRESS); 936 } 937 938 if (ipst->ips_ip_mrtdebug > 1) { 939 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 940 "add_vif: src 0x%x enter", 941 vifcp->vifc_lcl_addr.s_addr); 942 } 943 944 mutex_enter(&vifp->v_lock); 945 /* 946 * Always clear cache when vifs change. 947 * Needed to ensure that src isn't left over from before vif was added. 948 * No need to get last_encap_lock, since we are running as a writer. 
949 */ 950 951 mutex_enter(&ipst->ips_last_encap_lock); 952 ipst->ips_last_encap_src = 0; 953 ipst->ips_last_encap_vif = NULL; 954 mutex_exit(&ipst->ips_last_encap_lock); 955 956 if (vifcp->vifc_flags & VIFF_TUNNEL) { 957 if ((vifcp->vifc_flags & VIFF_SRCRT) != 0) { 958 cmn_err(CE_WARN, 959 "add_vif: source route tunnels not supported\n"); 960 VIF_REFRELE_LOCKED(vifp); 961 ipif_refrele(ipif); 962 ipsq_exit(ipsq); 963 return (EOPNOTSUPP); 964 } 965 vifp->v_rmt_addr = vifcp->vifc_rmt_addr; 966 967 } else { 968 /* Phyint or Register vif */ 969 if (vifcp->vifc_flags & VIFF_REGISTER) { 970 /* 971 * Note: Since all IPPROTO_IP level options (including 972 * MRT_ADD_VIF) are done exclusively via 973 * ip_optmgmt_writer(), a lock is not necessary to 974 * protect reg_vif_num. 975 */ 976 mutex_enter(&ipst->ips_numvifs_mutex); 977 if (ipst->ips_reg_vif_num == ALL_VIFS) { 978 ipst->ips_reg_vif_num = vifcp->vifc_vifi; 979 mutex_exit(&ipst->ips_numvifs_mutex); 980 } else { 981 mutex_exit(&ipst->ips_numvifs_mutex); 982 VIF_REFRELE_LOCKED(vifp); 983 ipif_refrele(ipif); 984 ipsq_exit(ipsq); 985 return (EADDRINUSE); 986 } 987 } 988 989 /* Make sure the interface supports multicast */ 990 if ((ipif->ipif_ill->ill_flags & ILLF_MULTICAST) == 0) { 991 VIF_REFRELE_LOCKED(vifp); 992 ipif_refrele(ipif); 993 if (vifcp->vifc_flags & VIFF_REGISTER) { 994 mutex_enter(&ipst->ips_numvifs_mutex); 995 ipst->ips_reg_vif_num = ALL_VIFS; 996 mutex_exit(&ipst->ips_numvifs_mutex); 997 } 998 ipsq_exit(ipsq); 999 return (EOPNOTSUPP); 1000 } 1001 /* Enable promiscuous reception of all IP mcasts from the if */ 1002 mutex_exit(&vifp->v_lock); 1003 error = ip_addmulti(INADDR_ANY, ipif, ILGSTAT_NONE, 1004 MODE_IS_EXCLUDE, NULL); 1005 mutex_enter(&vifp->v_lock); 1006 /* 1007 * since we released the lock lets make sure that 1008 * ip_mrouter_done() has not been called. 
1009 */ 1010 if (error != 0 || is_mrouter_off(ipst)) { 1011 if (error == 0) 1012 (void) ip_delmulti(INADDR_ANY, ipif, B_TRUE, 1013 B_TRUE); 1014 if (vifcp->vifc_flags & VIFF_REGISTER) { 1015 mutex_enter(&ipst->ips_numvifs_mutex); 1016 ipst->ips_reg_vif_num = ALL_VIFS; 1017 mutex_exit(&ipst->ips_numvifs_mutex); 1018 } 1019 VIF_REFRELE_LOCKED(vifp); 1020 ipif_refrele(ipif); 1021 ipsq_exit(ipsq); 1022 return (error?error:EINVAL); 1023 } 1024 } 1025 /* Define parameters for the tbf structure */ 1026 vifp->v_tbf = v_tbf; 1027 gethrestime(&vifp->v_tbf->tbf_last_pkt_t); 1028 vifp->v_tbf->tbf_n_tok = 0; 1029 vifp->v_tbf->tbf_q_len = 0; 1030 vifp->v_tbf->tbf_max_q_len = MAXQSIZE; 1031 vifp->v_tbf->tbf_q = vifp->v_tbf->tbf_t = NULL; 1032 1033 vifp->v_flags = vifcp->vifc_flags; 1034 vifp->v_threshold = vifcp->vifc_threshold; 1035 vifp->v_lcl_addr = vifcp->vifc_lcl_addr; 1036 vifp->v_ipif = ipif; 1037 ipif_refrele(ipif); 1038 /* Scaling up here, allows division by 1024 in critical code. */ 1039 vifp->v_rate_limit = vifcp->vifc_rate_limit * (1024/1000); 1040 vifp->v_timeout_id = 0; 1041 /* initialize per vif pkt counters */ 1042 vifp->v_pkt_in = 0; 1043 vifp->v_pkt_out = 0; 1044 vifp->v_bytes_in = 0; 1045 vifp->v_bytes_out = 0; 1046 mutex_init(&vifp->v_tbf->tbf_lock, NULL, MUTEX_DEFAULT, NULL); 1047 1048 /* Adjust numvifs up, if the vifi is higher than numvifs */ 1049 mutex_enter(&ipst->ips_numvifs_mutex); 1050 if (ipst->ips_numvifs <= vifcp->vifc_vifi) 1051 ipst->ips_numvifs = vifcp->vifc_vifi + 1; 1052 mutex_exit(&ipst->ips_numvifs_mutex); 1053 1054 if (ipst->ips_ip_mrtdebug > 1) { 1055 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 1056 "add_vif: #%d, lcladdr %x, %s %x, thresh %x, rate %d", 1057 vifcp->vifc_vifi, 1058 ntohl(vifcp->vifc_lcl_addr.s_addr), 1059 (vifcp->vifc_flags & VIFF_TUNNEL) ? 
"rmtaddr" : "mask", 1060 ntohl(vifcp->vifc_rmt_addr.s_addr), 1061 vifcp->vifc_threshold, vifcp->vifc_rate_limit); 1062 } 1063 1064 vifp->v_marks = VIF_MARK_GOOD; 1065 mutex_exit(&vifp->v_lock); 1066 ipsq_exit(ipsq); 1067 return (0); 1068 } 1069 1070 1071 /* Delete a vif from the vif table. */ 1072 static void 1073 del_vifp(struct vif *vifp) 1074 { 1075 struct tbf *t = vifp->v_tbf; 1076 mblk_t *mp0; 1077 vifi_t vifi; 1078 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 1079 conn_t *mrouter = ipst->ips_ip_g_mrouter; 1080 1081 ASSERT(vifp->v_marks & VIF_MARK_CONDEMNED); 1082 ASSERT(t != NULL); 1083 1084 if (ipst->ips_ip_mrtdebug > 1) { 1085 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 1086 "del_vif: src 0x%x\n", vifp->v_lcl_addr.s_addr); 1087 } 1088 1089 if (vifp->v_timeout_id != 0) { 1090 (void) untimeout(vifp->v_timeout_id); 1091 vifp->v_timeout_id = 0; 1092 } 1093 1094 /* 1095 * Free packets queued at the interface. 1096 * Mrouted takes care of cleaning up mfcs - makes calls to del_mfc. 1097 */ 1098 mutex_enter(&t->tbf_lock); 1099 while (t->tbf_q != NULL) { 1100 mp0 = t->tbf_q; 1101 t->tbf_q = t->tbf_q->b_next; 1102 mp0->b_prev = mp0->b_next = NULL; 1103 freemsg(mp0); 1104 } 1105 mutex_exit(&t->tbf_lock); 1106 1107 /* 1108 * Always clear cache when vifs change. 1109 * No need to get last_encap_lock since we are running as a writer. 
1110 */ 1111 mutex_enter(&ipst->ips_last_encap_lock); 1112 if (vifp == ipst->ips_last_encap_vif) { 1113 ipst->ips_last_encap_vif = NULL; 1114 ipst->ips_last_encap_src = 0; 1115 } 1116 mutex_exit(&ipst->ips_last_encap_lock); 1117 1118 mutex_destroy(&t->tbf_lock); 1119 1120 bzero(vifp->v_tbf, sizeof (*(vifp->v_tbf))); 1121 1122 /* Adjust numvifs down */ 1123 mutex_enter(&ipst->ips_numvifs_mutex); 1124 for (vifi = ipst->ips_numvifs; vifi != 0; vifi--) /* vifi is unsigned */ 1125 if (ipst->ips_vifs[vifi - 1].v_lcl_addr.s_addr != 0) 1126 break; 1127 ipst->ips_numvifs = vifi; 1128 mutex_exit(&ipst->ips_numvifs_mutex); 1129 1130 bzero(vifp, sizeof (*vifp)); 1131 } 1132 1133 static int 1134 del_vif(vifi_t *vifip, conn_t *connp, mblk_t *first_mp, ip_stack_t *ipst) 1135 { 1136 struct vif *vifp = ipst->ips_vifs + *vifip; 1137 ipsq_t *ipsq; 1138 1139 if (*vifip >= ipst->ips_numvifs) 1140 return (EINVAL); 1141 1142 mutex_enter(&vifp->v_lock); 1143 /* 1144 * Not initialized 1145 * Here we are not looking at the vif that is being initialized 1146 * i.e vifp->v_marks == 0 and refcnt > 0. 1147 */ 1148 if (vifp->v_lcl_addr.s_addr == 0 || 1149 !(vifp->v_marks & VIF_MARK_GOOD)) { 1150 mutex_exit(&vifp->v_lock); 1151 return (EADDRNOTAVAIL); 1152 } 1153 1154 /* 1155 * This is an optimization, if first_mp == NULL 1156 * than we are being called from reset_mrt_vif_ipif() 1157 * so we already have exclusive access to the ipsq. 1158 * the ASSERT below is a check for this condition. 1159 */ 1160 if (first_mp != NULL && 1161 !(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) { 1162 ASSERT(connp != NULL); 1163 /* 1164 * We have to be exclusive as we have to call ip_delmulti() 1165 * This is the best position to try to be exclusive in case 1166 * we have to wait. 
1167 */ 1168 ipsq = ipsq_try_enter(vifp->v_ipif, NULL, CONNP_TO_WQ(connp), 1169 first_mp, ip_restart_optmgmt, NEW_OP, B_TRUE); 1170 if ((ipsq) == NULL) { 1171 mutex_exit(&vifp->v_lock); 1172 return (EINPROGRESS); 1173 } 1174 /* recheck after being exclusive */ 1175 if (vifp->v_lcl_addr.s_addr == 0 || 1176 !vifp->v_marks & VIF_MARK_GOOD) { 1177 /* 1178 * someone beat us. 1179 */ 1180 mutex_exit(&vifp->v_lock); 1181 ipsq_exit(ipsq); 1182 return (EADDRNOTAVAIL); 1183 } 1184 } 1185 1186 1187 ASSERT(IAM_WRITER_IPIF(vifp->v_ipif)); 1188 1189 /* Clear VIF_MARK_GOOD and set VIF_MARK_CONDEMNED. */ 1190 vifp->v_marks &= ~VIF_MARK_GOOD; 1191 vifp->v_marks |= VIF_MARK_CONDEMNED; 1192 1193 /* Phyint only */ 1194 if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) { 1195 ipif_t *ipif = vifp->v_ipif; 1196 ASSERT(ipif != NULL); 1197 /* 1198 * should be OK to drop the lock as we 1199 * have marked this as CONDEMNED. 1200 */ 1201 mutex_exit(&(vifp)->v_lock); 1202 (void) ip_delmulti(INADDR_ANY, ipif, B_TRUE, B_TRUE); 1203 if (first_mp != NULL) 1204 ipsq_exit(ipsq); 1205 mutex_enter(&(vifp)->v_lock); 1206 } 1207 1208 /* 1209 * decreases the refcnt added in add_vif. 1210 */ 1211 VIF_REFRELE_LOCKED(vifp); 1212 return (0); 1213 } 1214 1215 /* 1216 * Add an mfc entry. 1217 */ 1218 static int 1219 add_mfc(struct mfcctl *mfccp, ip_stack_t *ipst) 1220 { 1221 struct mfc *rt; 1222 struct rtdetq *rte; 1223 ushort_t nstl; 1224 int i; 1225 struct mfcb *mfcbp; 1226 conn_t *mrouter = ipst->ips_ip_g_mrouter; 1227 1228 /* 1229 * The value of vifi is NO_VIF (==MAXVIFS) if Mrouted 1230 * did not have a real route for pkt. 1231 * We want this pkt without rt installed in the mfctable to prevent 1232 * multiiple tries, so go ahead and put it in mfctable, it will 1233 * be discarded later in ip_mdq() because the child is NULL. 1234 */ 1235 1236 /* Error checking, out of bounds? 
*/ 1237 if (mfccp->mfcc_parent > MAXVIFS) { 1238 ip0dbg(("ADD_MFC: mfcc_parent out of range %d", 1239 (int)mfccp->mfcc_parent)); 1240 return (EINVAL); 1241 } 1242 1243 if ((mfccp->mfcc_parent != NO_VIF) && 1244 (ipst->ips_vifs[mfccp->mfcc_parent].v_ipif == NULL)) { 1245 ip0dbg(("ADD_MFC: NULL ipif for parent vif %d\n", 1246 (int)mfccp->mfcc_parent)); 1247 return (EINVAL); 1248 } 1249 1250 if (is_mrouter_off(ipst)) { 1251 return (EINVAL); 1252 } 1253 1254 mfcbp = &ipst->ips_mfcs[MFCHASH(mfccp->mfcc_origin.s_addr, 1255 mfccp->mfcc_mcastgrp.s_addr)]; 1256 MFCB_REFHOLD(mfcbp); 1257 MFCFIND(mfcbp, mfccp->mfcc_origin.s_addr, 1258 mfccp->mfcc_mcastgrp.s_addr, rt); 1259 1260 /* If an entry already exists, just update the fields */ 1261 if (rt) { 1262 if (ipst->ips_ip_mrtdebug > 1) { 1263 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 1264 "add_mfc: update o %x grp %x parent %x", 1265 ntohl(mfccp->mfcc_origin.s_addr), 1266 ntohl(mfccp->mfcc_mcastgrp.s_addr), 1267 mfccp->mfcc_parent); 1268 } 1269 mutex_enter(&rt->mfc_mutex); 1270 rt->mfc_parent = mfccp->mfcc_parent; 1271 1272 mutex_enter(&ipst->ips_numvifs_mutex); 1273 for (i = 0; i < (int)ipst->ips_numvifs; i++) 1274 rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; 1275 mutex_exit(&ipst->ips_numvifs_mutex); 1276 mutex_exit(&rt->mfc_mutex); 1277 1278 MFCB_REFRELE(mfcbp); 1279 return (0); 1280 } 1281 1282 /* 1283 * Find the entry for which the upcall was made and update. 
1284 */ 1285 for (rt = mfcbp->mfcb_mfc, nstl = 0; rt; rt = rt->mfc_next) { 1286 mutex_enter(&rt->mfc_mutex); 1287 if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) && 1288 (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr) && 1289 (rt->mfc_rte != NULL) && 1290 !(rt->mfc_marks & MFCB_MARK_CONDEMNED)) { 1291 if (nstl++ != 0) 1292 cmn_err(CE_WARN, 1293 "add_mfc: %s o %x g %x p %x", 1294 "multiple kernel entries", 1295 ntohl(mfccp->mfcc_origin.s_addr), 1296 ntohl(mfccp->mfcc_mcastgrp.s_addr), 1297 mfccp->mfcc_parent); 1298 1299 if (ipst->ips_ip_mrtdebug > 1) { 1300 (void) mi_strlog(mrouter->conn_rq, 1, 1301 SL_TRACE, 1302 "add_mfc: o %x g %x p %x", 1303 ntohl(mfccp->mfcc_origin.s_addr), 1304 ntohl(mfccp->mfcc_mcastgrp.s_addr), 1305 mfccp->mfcc_parent); 1306 } 1307 fill_route(rt, mfccp, ipst); 1308 1309 /* 1310 * Prevent cleanup of cache entry. 1311 * Timer starts in ip_mforward. 1312 */ 1313 if (rt->mfc_timeout_id != 0) { 1314 timeout_id_t id; 1315 id = rt->mfc_timeout_id; 1316 /* 1317 * setting id to zero will avoid this 1318 * entry from being cleaned up in 1319 * expire_up_calls(). 1320 */ 1321 rt->mfc_timeout_id = 0; 1322 /* 1323 * dropping the lock is fine as we 1324 * have a refhold on the bucket. 1325 * so mfc cannot be freed. 1326 * The timeout can fire but it will see 1327 * that mfc_timeout_id == 0 and not cleanup. 1328 */ 1329 mutex_exit(&rt->mfc_mutex); 1330 (void) untimeout(id); 1331 mutex_enter(&rt->mfc_mutex); 1332 } 1333 1334 /* 1335 * Send all pkts that are queued waiting for the upcall. 1336 * ip_mdq param tun set to 0 - 1337 * the return value of ip_mdq() isn't used here, 1338 * so value we send doesn't matter. 
1339 */ 1340 while (rt->mfc_rte != NULL) { 1341 rte = rt->mfc_rte; 1342 rt->mfc_rte = rte->rte_next; 1343 mutex_exit(&rt->mfc_mutex); 1344 (void) ip_mdq(rte->mp, (ipha_t *) 1345 rte->mp->b_rptr, rte->ill, 0, rt); 1346 freemsg(rte->mp); 1347 mi_free((char *)rte); 1348 mutex_enter(&rt->mfc_mutex); 1349 } 1350 } 1351 mutex_exit(&rt->mfc_mutex); 1352 } 1353 1354 1355 /* 1356 * It is possible that an entry is being inserted without an upcall 1357 */ 1358 if (nstl == 0) { 1359 mutex_enter(&(mfcbp->mfcb_lock)); 1360 if (ipst->ips_ip_mrtdebug > 1) { 1361 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 1362 "add_mfc: no upcall o %x g %x p %x", 1363 ntohl(mfccp->mfcc_origin.s_addr), 1364 ntohl(mfccp->mfcc_mcastgrp.s_addr), 1365 mfccp->mfcc_parent); 1366 } 1367 if (is_mrouter_off(ipst)) { 1368 mutex_exit(&mfcbp->mfcb_lock); 1369 MFCB_REFRELE(mfcbp); 1370 return (EINVAL); 1371 } 1372 1373 for (rt = mfcbp->mfcb_mfc; rt; rt = rt->mfc_next) { 1374 1375 mutex_enter(&rt->mfc_mutex); 1376 if ((rt->mfc_origin.s_addr == 1377 mfccp->mfcc_origin.s_addr) && 1378 (rt->mfc_mcastgrp.s_addr == 1379 mfccp->mfcc_mcastgrp.s_addr) && 1380 (!(rt->mfc_marks & MFCB_MARK_CONDEMNED))) { 1381 fill_route(rt, mfccp, ipst); 1382 mutex_exit(&rt->mfc_mutex); 1383 break; 1384 } 1385 mutex_exit(&rt->mfc_mutex); 1386 } 1387 1388 /* No upcall, so make a new entry into mfctable */ 1389 if (rt == NULL) { 1390 rt = (struct mfc *)mi_zalloc(sizeof (struct mfc)); 1391 if (rt == NULL) { 1392 ip1dbg(("add_mfc: out of memory\n")); 1393 mutex_exit(&mfcbp->mfcb_lock); 1394 MFCB_REFRELE(mfcbp); 1395 return (ENOBUFS); 1396 } 1397 1398 /* Insert new entry at head of hash chain */ 1399 mutex_enter(&rt->mfc_mutex); 1400 fill_route(rt, mfccp, ipst); 1401 1402 /* Link into table */ 1403 rt->mfc_next = mfcbp->mfcb_mfc; 1404 mfcbp->mfcb_mfc = rt; 1405 mutex_exit(&rt->mfc_mutex); 1406 } 1407 mutex_exit(&mfcbp->mfcb_lock); 1408 } 1409 1410 MFCB_REFRELE(mfcbp); 1411 return (0); 1412 } 1413 1414 /* 1415 * Fills in mfc structure from 
mrouted mfcctl. 1416 */ 1417 static void 1418 fill_route(struct mfc *rt, struct mfcctl *mfccp, ip_stack_t *ipst) 1419 { 1420 int i; 1421 1422 rt->mfc_origin = mfccp->mfcc_origin; 1423 rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp; 1424 rt->mfc_parent = mfccp->mfcc_parent; 1425 mutex_enter(&ipst->ips_numvifs_mutex); 1426 for (i = 0; i < (int)ipst->ips_numvifs; i++) { 1427 rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; 1428 } 1429 mutex_exit(&ipst->ips_numvifs_mutex); 1430 /* Initialize pkt counters per src-grp */ 1431 rt->mfc_pkt_cnt = 0; 1432 rt->mfc_byte_cnt = 0; 1433 rt->mfc_wrong_if = 0; 1434 rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_nsec = 0; 1435 1436 } 1437 1438 static void 1439 free_queue(struct mfc *mfcp) 1440 { 1441 struct rtdetq *rte0; 1442 1443 /* 1444 * Drop all queued upcall packets. 1445 * Free the mbuf with the pkt. 1446 */ 1447 while ((rte0 = mfcp->mfc_rte) != NULL) { 1448 mfcp->mfc_rte = rte0->rte_next; 1449 freemsg(rte0->mp); 1450 mi_free((char *)rte0); 1451 } 1452 } 1453 /* 1454 * go thorugh the hash bucket and free all the entries marked condemned. 
1455 */ 1456 void 1457 release_mfc(struct mfcb *mfcbp) 1458 { 1459 struct mfc *current_mfcp; 1460 struct mfc *prev_mfcp; 1461 1462 prev_mfcp = current_mfcp = mfcbp->mfcb_mfc; 1463 1464 while (current_mfcp != NULL) { 1465 if (current_mfcp->mfc_marks & MFCB_MARK_CONDEMNED) { 1466 if (current_mfcp == mfcbp->mfcb_mfc) { 1467 mfcbp->mfcb_mfc = current_mfcp->mfc_next; 1468 free_queue(current_mfcp); 1469 mi_free(current_mfcp); 1470 prev_mfcp = current_mfcp = mfcbp->mfcb_mfc; 1471 continue; 1472 } 1473 ASSERT(prev_mfcp != NULL); 1474 prev_mfcp->mfc_next = current_mfcp->mfc_next; 1475 free_queue(current_mfcp); 1476 mi_free(current_mfcp); 1477 current_mfcp = NULL; 1478 } else { 1479 prev_mfcp = current_mfcp; 1480 } 1481 1482 current_mfcp = prev_mfcp->mfc_next; 1483 1484 } 1485 mfcbp->mfcb_marks &= ~MFCB_MARK_CONDEMNED; 1486 ASSERT(mfcbp->mfcb_mfc != NULL || mfcbp->mfcb_marks == 0); 1487 } 1488 1489 /* 1490 * Delete an mfc entry. 1491 */ 1492 static int 1493 del_mfc(struct mfcctl *mfccp, ip_stack_t *ipst) 1494 { 1495 struct in_addr origin; 1496 struct in_addr mcastgrp; 1497 struct mfc *rt; 1498 uint_t hash; 1499 conn_t *mrouter = ipst->ips_ip_g_mrouter; 1500 1501 origin = mfccp->mfcc_origin; 1502 mcastgrp = mfccp->mfcc_mcastgrp; 1503 hash = MFCHASH(origin.s_addr, mcastgrp.s_addr); 1504 1505 if (ipst->ips_ip_mrtdebug > 1) { 1506 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 1507 "del_mfc: o %x g %x", 1508 ntohl(origin.s_addr), 1509 ntohl(mcastgrp.s_addr)); 1510 } 1511 1512 MFCB_REFHOLD(&ipst->ips_mfcs[hash]); 1513 1514 /* Find mfc in mfctable, finds only entries without upcalls */ 1515 for (rt = ipst->ips_mfcs[hash].mfcb_mfc; rt; rt = rt->mfc_next) { 1516 mutex_enter(&rt->mfc_mutex); 1517 if (origin.s_addr == rt->mfc_origin.s_addr && 1518 mcastgrp.s_addr == rt->mfc_mcastgrp.s_addr && 1519 rt->mfc_rte == NULL && 1520 !(rt->mfc_marks & MFCB_MARK_CONDEMNED)) 1521 break; 1522 mutex_exit(&rt->mfc_mutex); 1523 } 1524 1525 /* 1526 * Return if there was an upcall (mfc_rte != NULL, 
1527 * or rt not in mfctable. 1528 */ 1529 if (rt == NULL) { 1530 MFCB_REFRELE(&ipst->ips_mfcs[hash]); 1531 return (EADDRNOTAVAIL); 1532 } 1533 1534 1535 /* 1536 * no need to hold lock as we have a reference. 1537 */ 1538 ipst->ips_mfcs[hash].mfcb_marks |= MFCB_MARK_CONDEMNED; 1539 /* error checking */ 1540 if (rt->mfc_timeout_id != 0) { 1541 ip0dbg(("del_mfc: TIMEOUT NOT 0, rte not null")); 1542 /* 1543 * Its ok to drop the lock, the struct cannot be freed 1544 * since we have a ref on the hash bucket. 1545 */ 1546 rt->mfc_timeout_id = 0; 1547 mutex_exit(&rt->mfc_mutex); 1548 (void) untimeout(rt->mfc_timeout_id); 1549 mutex_enter(&rt->mfc_mutex); 1550 } 1551 1552 ASSERT(rt->mfc_rte == NULL); 1553 1554 1555 /* 1556 * Delete the entry from the cache 1557 */ 1558 rt->mfc_marks |= MFCB_MARK_CONDEMNED; 1559 mutex_exit(&rt->mfc_mutex); 1560 1561 MFCB_REFRELE(&ipst->ips_mfcs[hash]); 1562 1563 return (0); 1564 } 1565 1566 #define TUNNEL_LEN 12 /* # bytes of IP option for tunnel encapsulation */ 1567 1568 /* 1569 * IP multicast forwarding function. This function assumes that the packet 1570 * pointed to by ipha has arrived on (or is about to be sent to) the interface 1571 * pointed to by "ill", and the packet is to be relayed to other networks 1572 * that have members of the packet's destination IP multicast group. 1573 * 1574 * The packet is returned unscathed to the caller, unless it is 1575 * erroneous, in which case a -1 value tells the caller (IP) 1576 * to discard it. 1577 * 1578 * Unlike BSD, SunOS 5.x needs to return to IP info about 1579 * whether pkt came in thru a tunnel, so it can be discarded, unless 1580 * it's IGMP. In BSD, the ifp is bogus for tunnels, so pkt won't try 1581 * to be delivered. 
* Return values are 0 - pkt is okay and phyint
 *		    -1 - pkt is malformed and to be tossed
 *                   1 - pkt came in on tunnel
 *
 * mp->b_prev is read on entry: PIM_REGISTER_MARKER flags a PIM register
 * packet, any other non-zero value is taken as the tunnel source address.
 * It is reset to NULL on the early-return and tunnel/register paths.
 */
int
ip_mforward(ill_t *ill, ipha_t *ipha, mblk_t *mp)
{
	struct mfc 	*rt;
	ipaddr_t	src, dst, tunnel_src = 0;
	static int	srctun = 0;
	vifi_t		vifi;
	boolean_t	pim_reg_packet = B_FALSE;
	struct mfcb *mfcbp;
	ip_stack_t	*ipst = ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "ip_mforward: RECV ipha_src %x, ipha_dst %x, ill %s",
		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
		    ill->ill_name);
	}

	dst = ipha->ipha_dst;
	if ((uint32_t)(uintptr_t)mp->b_prev == PIM_REGISTER_MARKER)
		pim_reg_packet = B_TRUE;
	else
		tunnel_src = (ipaddr_t)(uintptr_t)mp->b_prev;

	/*
	 * Don't forward a packet with time-to-live of zero or one,
	 * or a packet destined to a local-only group.
	 */
	if (CLASSD(dst) && (ipha->ipha_ttl <= 1 ||
	    (ipaddr_t)ntohl(dst) <= INADDR_MAX_LOCAL_GROUP)) {
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: not forwarded ttl %d,"
			    " dst 0x%x ill %s",
			    ipha->ipha_ttl, ntohl(dst), ill->ill_name);
		}
		mp->b_prev = NULL;
		if (tunnel_src != 0)
			return (1);
		else
			return (0);
	}

	if ((tunnel_src != 0) || pim_reg_packet) {
		/*
		 * Packet arrived over an encapsulated tunnel or via a PIM
		 * register message. Both ip_mroute_decap() and pim_input()
		 * encode information in mp->b_prev.
		 */
		mp->b_prev = NULL;
		if (ipst->ips_ip_mrtdebug > 1) {
			if (tunnel_src != 0) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: ill %s arrived via ENCAP TUN",
				    ill->ill_name);
			} else if (pim_reg_packet) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: ill %s arrived via"
				    " REGISTER VIF",
				    ill->ill_name);
			}
		}
	} else if ((ipha->ipha_version_and_hdr_length & 0xf) <
	    (uint_t)(IP_SIMPLE_HDR_LENGTH + TUNNEL_LEN) >> 2 ||
	    ((uchar_t *)(ipha + 1))[1] != IPOPT_LSRR) {
		/* Packet arrived via a physical interface. */
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: ill %s arrived via PHYINT",
			    ill->ill_name);
		}

	} else {
		/*
		 * Packet arrived through a SRCRT tunnel.
		 * Source-route tunnels are no longer supported.
		 * Error message printed every 1000 times.
		 */
		if ((srctun++ % 1000) == 0) {
			cmn_err(CE_WARN,
			    "ip_mforward: received source-routed pkt from %x",
			    ntohl(ipha->ipha_src));
		}
		return (-1);
	}

	ipst->ips_mrtstat->mrts_fwd_in++;
	src = ipha->ipha_src;

	/* Find route in cache, return NULL if not there or upcalls q'ed. */

	/*
	 * Lock the mfctable against changes made by ip_mforward.
	 * Note that only add_mfc and del_mfc can remove entries and
	 * they run with exclusive access to IP. So we do not need to
	 * guard against the rt being deleted, so release lock after reading.
	 */

	if (is_mrouter_off(ipst))
		return (-1);

	mfcbp = &ipst->ips_mfcs[MFCHASH(src, dst)];
	MFCB_REFHOLD(mfcbp);
	MFCFIND(mfcbp, src, dst, rt);

	/* Entry exists, so forward if necessary */
	if (rt != NULL) {
		int ret = 0;
		ipst->ips_mrtstat->mrts_mfc_hits++;
		if (pim_reg_packet) {
			/* Register packets are forwarded as if from the register vif's ill. */
			ASSERT(ipst->ips_reg_vif_num != ALL_VIFS);
			ret = ip_mdq(mp, ipha,
			    ipst->ips_vifs[ipst->ips_reg_vif_num].
			    v_ipif->ipif_ill,
			    0, rt);
		} else {
			ret = ip_mdq(mp, ipha, ill, tunnel_src, rt);
		}

		MFCB_REFRELE(mfcbp);
		return (ret);

		/*
		 * Don't forward if we don't have a cache entry.  Mrouted will
		 * always provide a cache entry in response to an upcall.
		 */
	} else {
		/*
		 * If we don't have a route for packet's origin, make a copy
		 * of the packet and send message to routing daemon.
		 */
		struct mfc	*mfc_rt	 = NULL;
		mblk_t		*mp0	 = NULL;
		mblk_t		*mp_copy = NULL;
		struct rtdetq	*rte	 = NULL;
		struct rtdetq	*rte_m, *rte1, *prev_rte;
		uint_t		hash;
		int		npkts;
		boolean_t	new_mfc = B_FALSE;
		ipst->ips_mrtstat->mrts_mfc_misses++;
		/* BSD uses mrts_no_route++ */
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: no rte ill %s src %x g %x misses %d",
			    ill->ill_name, ntohl(src), ntohl(dst),
			    (int)ipst->ips_mrtstat->mrts_mfc_misses);
		}
		/*
		 * The order of the following code differs from the BSD code.
		 * Pre-mc3.5, the BSD code was incorrect and SunOS 5.x
		 * code works, so SunOS 5.x wasn't changed to conform to the
		 * BSD version.
		 */

		/* Lock mfctable. */
		hash = MFCHASH(src, dst);
		mutex_enter(&(ipst->ips_mfcs[hash].mfcb_lock));

		/*
		 * If we are turning off mrouted return an error
		 */
		if (is_mrouter_off(ipst)) {
			mutex_exit(&mfcbp->mfcb_lock);
			MFCB_REFRELE(mfcbp);
			return (-1);
		}

		/*
		 * Is there an upcall waiting for this packet?
		 * On a match the loop breaks with mfc_rt->mfc_mutex held.
		 */
		for (mfc_rt = ipst->ips_mfcs[hash].mfcb_mfc; mfc_rt;
		    mfc_rt = mfc_rt->mfc_next) {
			mutex_enter(&mfc_rt->mfc_mutex);
			if (ipst->ips_ip_mrtdebug > 1) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: MFCTAB hash %d o 0x%x"
				    " g 0x%x\n",
				    hash, ntohl(mfc_rt->mfc_origin.s_addr),
				    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
			}
			/* There is an upcall */
			if ((src == mfc_rt->mfc_origin.s_addr) &&
			    (dst == mfc_rt->mfc_mcastgrp.s_addr) &&
			    (mfc_rt->mfc_rte != NULL) &&
			    !(mfc_rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
				break;
			}
			mutex_exit(&mfc_rt->mfc_mutex);
		}
		/* No upcall, so make a new entry into mfctable */
		if (mfc_rt == NULL) {
			mfc_rt = (struct mfc *)mi_zalloc(sizeof (struct mfc));
			if (mfc_rt == NULL) {
				ipst->ips_mrtstat->mrts_fwd_drop++;
				ip1dbg(("ip_mforward: out of memory "
				    "for mfc, mfc_rt\n"));
				goto error_return;
			} else
				new_mfc = B_TRUE;
			/* Get resources */
			/* TODO could copy header and dup rest */
			mp_copy = copymsg(mp);
			if (mp_copy == NULL) {
				ipst->ips_mrtstat->mrts_fwd_drop++;
				ip1dbg(("ip_mforward: out of memory for "
				    "mblk, mp_copy\n"));
				goto error_return;
			}
			mutex_enter(&mfc_rt->mfc_mutex);
		}
		/* Get resources for rte, whether first rte or not first. */
		/* Add this packet into rtdetq */
		rte = (struct rtdetq *)mi_zalloc(sizeof (struct rtdetq));
		if (rte == NULL) {
			ipst->ips_mrtstat->mrts_fwd_drop++;
			mutex_exit(&mfc_rt->mfc_mutex);
			ip1dbg(("ip_mforward: out of memory for"
			    " rtdetq, rte\n"));
			goto error_return;
		}

		mp0 = copymsg(mp);
		if (mp0 == NULL) {
			ipst->ips_mrtstat->mrts_fwd_drop++;
			ip1dbg(("ip_mforward: out of memory for mblk, mp0\n"));
			mutex_exit(&mfc_rt->mfc_mutex);
			goto error_return;
		}
		rte->mp		= mp0;
		if (pim_reg_packet) {
			ASSERT(ipst->ips_reg_vif_num != ALL_VIFS);
			rte->ill =
			    ipst->ips_vifs[ipst->ips_reg_vif_num].
			    v_ipif->ipif_ill;
		} else {
			rte->ill = ill;
		}
		rte->rte_next	= NULL;

		/*
		 * Determine if upcall q (rtdetq) has overflowed.
		 * mfc_rt->mfc_rte is null by mi_zalloc
		 * if it is the first message.
		 */
		for (rte_m = mfc_rt->mfc_rte, npkts = 0; rte_m;
		    rte_m = rte_m->rte_next)
			npkts++;
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: upcalls %d\n", npkts);
		}
		if (npkts > MAX_UPQ) {
			ipst->ips_mrtstat->mrts_upq_ovflw++;
			mutex_exit(&mfc_rt->mfc_mutex);
			goto error_return;
		}

		if (npkts == 0) {	/* first upcall */
			int i = 0;
			/*
			 * Now finish installing the new mfc! Now that we have
			 * resources!  Insert new entry at head of hash chain.
			 * Use src and dst which are ipaddr_t's.
			 */
			mfc_rt->mfc_origin.s_addr = src;
			mfc_rt->mfc_mcastgrp.s_addr = dst;

			mutex_enter(&ipst->ips_numvifs_mutex);
			for (i = 0; i < (int)ipst->ips_numvifs; i++)
				mfc_rt->mfc_ttls[i] = 0;
			mutex_exit(&ipst->ips_numvifs_mutex);
			mfc_rt->mfc_parent = ALL_VIFS;

			/* Link into table */
			if (ipst->ips_ip_mrtdebug > 1) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: NEW MFCTAB hash %d o 0x%x "
				    "g 0x%x\n", hash,
				    ntohl(mfc_rt->mfc_origin.s_addr),
				    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
			}
			mfc_rt->mfc_next = ipst->ips_mfcs[hash].mfcb_mfc;
			ipst->ips_mfcs[hash].mfcb_mfc = mfc_rt;
			mfc_rt->mfc_rte = NULL;
		}

		/* Link in the upcall */
		/* First upcall */
		if (mfc_rt->mfc_rte == NULL)
			mfc_rt->mfc_rte = rte;
		else {
			/* not the first upcall: append rte at the tail */
			prev_rte = mfc_rt->mfc_rte;
			for (rte1 = mfc_rt->mfc_rte->rte_next; rte1;
			    prev_rte = rte1, rte1 = rte1->rte_next)
				;
			prev_rte->rte_next = rte;
		}

		/*
		 * No upcalls waiting, this is first one, so send a message to
		 * routing daemon to install a route into kernel table.
		 */
		if (npkts == 0) {
			struct igmpmsg	*im;
			/* ipha_protocol is 0, for upcall */
			ASSERT(mp_copy != NULL);
			im = (struct igmpmsg *)mp_copy->b_rptr;
			im->im_msgtype	= IGMPMSG_NOCACHE;
			im->im_mbz = 0;
			mutex_enter(&ipst->ips_numvifs_mutex);
			if (pim_reg_packet) {
				im->im_vif = (uchar_t)ipst->ips_reg_vif_num;
				mutex_exit(&ipst->ips_numvifs_mutex);
			} else {
				/*
				 * XXX do we need to hold locks here ?
				 */
				/* Report the first vif whose ill is the arrival ill. */
				for (vifi = 0;
				    vifi < ipst->ips_numvifs;
				    vifi++) {
					if (ipst->ips_vifs[vifi].v_ipif == NULL)
						continue;
					if (ipst->ips_vifs[vifi].
					    v_ipif->ipif_ill == ill) {
						im->im_vif = (uchar_t)vifi;
						break;
					}
				}
				mutex_exit(&ipst->ips_numvifs_mutex);
				ASSERT(vifi < ipst->ips_numvifs);
			}

			ipst->ips_mrtstat->mrts_upcalls++;
			/* Timer to discard upcalls if mrouted is too slow */
			mfc_rt->mfc_timeout_id = timeout(expire_upcalls,
			    mfc_rt, EXPIRE_TIMEOUT * UPCALL_EXPIRE);
			mutex_exit(&mfc_rt->mfc_mutex);
			mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
			/* Pass to RAWIP */
			(mrouter->conn_recv)(mrouter, mp_copy, NULL);
		} else {
			/* An upcall is already outstanding; only the queued copy is kept. */
			mutex_exit(&mfc_rt->mfc_mutex);
			mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
			freemsg(mp_copy);
		}

		MFCB_REFRELE(mfcbp);
		if (tunnel_src != 0)
			return (1);
		else
			return (0);
	error_return:
		/*
		 * Common failure exit: callers have already dropped
		 * mfc_rt->mfc_mutex if they held it.  Only an mfc allocated
		 * by this call (new_mfc) is freed.
		 */
		mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
		MFCB_REFRELE(mfcbp);
		if (mfc_rt != NULL && (new_mfc == B_TRUE))
			mi_free((char *)mfc_rt);
		if (rte != NULL)
			mi_free((char *)rte);
		if (mp_copy != NULL)
			freemsg(mp_copy);
		if (mp0 != NULL)
			freemsg(mp0);
		return (-1);
	}
}

/*
 * Clean up the mfctable cache entry if upcall is not serviced.
 * SunOS 5.x has timeout per mfc, unlike BSD which has one timer.
1959 */ 1960 static void 1961 expire_upcalls(void *arg) 1962 { 1963 struct mfc *mfc_rt = arg; 1964 uint_t hash; 1965 struct mfc *prev_mfc, *mfc0; 1966 ip_stack_t *ipst; 1967 conn_t *mrouter; 1968 1969 if (mfc_rt->mfc_rte == NULL || mfc_rt->mfc_rte->ill != NULL) { 1970 cmn_err(CE_WARN, "expire_upcalls: no ILL\n"); 1971 return; 1972 } 1973 ipst = mfc_rt->mfc_rte->ill->ill_ipst; 1974 mrouter = ipst->ips_ip_g_mrouter; 1975 1976 hash = MFCHASH(mfc_rt->mfc_origin.s_addr, mfc_rt->mfc_mcastgrp.s_addr); 1977 if (ipst->ips_ip_mrtdebug > 1) { 1978 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 1979 "expire_upcalls: hash %d s %x g %x", 1980 hash, ntohl(mfc_rt->mfc_origin.s_addr), 1981 ntohl(mfc_rt->mfc_mcastgrp.s_addr)); 1982 } 1983 MFCB_REFHOLD(&ipst->ips_mfcs[hash]); 1984 mutex_enter(&mfc_rt->mfc_mutex); 1985 /* 1986 * if timeout has been set to zero, than the 1987 * entry has been filled, no need to delete it. 1988 */ 1989 if (mfc_rt->mfc_timeout_id == 0) 1990 goto done; 1991 ipst->ips_mrtstat->mrts_cache_cleanups++; 1992 mfc_rt->mfc_timeout_id = 0; 1993 1994 /* Determine entry to be cleaned up in cache table. */ 1995 for (prev_mfc = mfc0 = ipst->ips_mfcs[hash].mfcb_mfc; mfc0; 1996 prev_mfc = mfc0, mfc0 = mfc0->mfc_next) 1997 if (mfc0 == mfc_rt) 1998 break; 1999 2000 /* del_mfc takes care of gone mfcs */ 2001 ASSERT(prev_mfc != NULL); 2002 ASSERT(mfc0 != NULL); 2003 2004 /* 2005 * Delete the entry from the cache 2006 */ 2007 ipst->ips_mfcs[hash].mfcb_marks |= MFCB_MARK_CONDEMNED; 2008 mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED; 2009 2010 /* 2011 * release_mfc will drop all queued upcall packets. 2012 * and will free the mbuf with the pkt, if, timing info. 2013 */ 2014 done: 2015 mutex_exit(&mfc_rt->mfc_mutex); 2016 MFCB_REFRELE(&ipst->ips_mfcs[hash]); 2017 } 2018 2019 /* 2020 * Packet forwarding routine once entry in the cache is made. 
*/
/*
 * Forwards mp according to the cache entry rt.
 *
 * The packet must have arrived on rt's parent vif (same ill or ill group,
 * and matching tunnel source); otherwise it is counted as a wrong-interface
 * arrival and, under the PIM assert conditions below, an IGMPMSG_WRONGVIF
 * copy is passed to the routing daemon.  A copy is then sent on every good
 * vif whose ttl threshold in rt is non-zero and below the packet's ttl.
 *
 * Returns -1 to drop the packet, 1 when tunnel_src is non-zero, else 0.
 */
static int
ip_mdq(mblk_t *mp, ipha_t *ipha, ill_t *ill, ipaddr_t tunnel_src,
    struct mfc *rt)
{
	ill_t *vill;
	vifi_t vifi;
	struct vif *vifp;
	ipaddr_t dst = ipha->ipha_dst;
	size_t  plen = msgdsize(mp);
	vifi_t num_of_vifs;
	ip_stack_t	*ipst = ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "ip_mdq: SEND src %x, ipha_dst %x, ill %s",
		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
		    ill->ill_name);
	}

	/* Macro to send packet on vif */
#define	MC_SEND(ipha, mp, vifp, dst) { \
	if ((vifp)->v_flags & VIFF_TUNNEL) \
		encap_send((ipha), (mp), (vifp), (dst)); \
	else if ((vifp)->v_flags & VIFF_REGISTER) \
		register_send((ipha), (mp), (vifp), (dst)); \
	else \
		phyint_send((ipha), (mp), (vifp), (dst)); \
}

	vifi = rt->mfc_parent;

	/*
	 * The value of vifi is MAXVIFS if the pkt had no parent, i.e.,
	 * Mrouted had no route.
	 * We wanted the route installed in the mfctable to prevent multiple
	 * tries, so it passed add_mfc(), but is discarded here. The v_ipif is
	 * NULL so we don't want to check the ill. Still needed as of Mrouted
	 * 3.6.
	 */
	if (vifi == NO_VIF) {
		ip1dbg(("ip_mdq: no route for origin ill %s, vifi is NO_VIF\n",
		    ill->ill_name));
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mdq: vifi is NO_VIF ill = %s", ill->ill_name);
		}
		return (-1);	/* drop pkt */
	}

	if (!lock_good_vif(&ipst->ips_vifs[vifi]))
		return (-1);
	/*
	 * The MFC entries are not cleaned up when an ipif goes
	 * away thus this code has to guard against an MFC referencing
	 * an ipif that has been closed. Note: reset_mrt_vif_ipif
	 * sets the v_ipif to NULL when the ipif disappears.
	 */
	ASSERT(ipst->ips_vifs[vifi].v_ipif != NULL);

	/*
	 * NOTE(review): this range check runs after ips_vifs[vifi] has
	 * already been locked and read above -- confirm that is intended.
	 */
	if (vifi >= ipst->ips_numvifs) {
		cmn_err(CE_WARN, "ip_mdq: illegal vifi %d numvifs "
		    "%d ill %s viftable ill %s\n",
		    (int)vifi, (int)ipst->ips_numvifs, ill->ill_name,
		    ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name);
		unlock_good_vif(&ipst->ips_vifs[vifi]);
		return (-1);
	}
	/*
	 * Don't forward if it didn't arrive from the parent vif for its
	 * origin.
	 */
	vill = ipst->ips_vifs[vifi].v_ipif->ipif_ill;
	if ((vill != ill && !IS_IN_SAME_ILLGRP(vill, ill)) ||
	    (ipst->ips_vifs[vifi].v_rmt_addr.s_addr != tunnel_src)) {
		/* Came in the wrong interface */
		ip1dbg(("ip_mdq: arrived wrong if, vifi %d "
		    "numvifs %d ill %s viftable ill %s\n",
		    (int)vifi, (int)ipst->ips_numvifs, ill->ill_name,
		    vill->ill_name));
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mdq: arrived wrong if, vifi %d ill "
			    "%s viftable ill %s\n",
			    (int)vifi, ill->ill_name, vill->ill_name);
		}
		ipst->ips_mrtstat->mrts_wrong_if++;
		rt->mfc_wrong_if++;

		/*
		 * If we are doing PIM assert processing and we are forwarding
		 * packets on this interface, and it is a broadcast medium
		 * interface (and not a tunnel), send a message to the routing.
		 *
		 * We use the first ipif on the list, since it's all we have.
		 * Chances are the ipif_flags are the same for ipifs on the ill.
		 */
		if (ipst->ips_pim_assert && rt->mfc_ttls[vifi] > 0 &&
		    (ill->ill_ipif->ipif_flags & IPIF_BROADCAST) &&
		    !(ipst->ips_vifs[vifi].v_flags & VIFF_TUNNEL)) {
			mblk_t		*mp_copy;
			struct igmpmsg	*im;

			/* TODO could copy header and dup rest */
			mp_copy = copymsg(mp);
			if (mp_copy == NULL) {
				ipst->ips_mrtstat->mrts_fwd_drop++;
				ip1dbg(("ip_mdq: out of memory "
				    "for mblk, mp_copy\n"));
				unlock_good_vif(&ipst->ips_vifs[vifi]);
				return (-1);
			}

			im = (struct igmpmsg *)mp_copy->b_rptr;
			im->im_msgtype = IGMPMSG_WRONGVIF;
			im->im_mbz = 0;
			im->im_vif = (ushort_t)vifi;
			/* Pass to RAWIP */
			(mrouter->conn_recv)(mrouter, mp_copy, NULL);
		}
		unlock_good_vif(&ipst->ips_vifs[vifi]);
		if (tunnel_src != 0)
			return (1);
		else
			return (0);
	}
	/*
	 * If I sourced this packet, it counts as output, else it was input.
	 */
	if (ipha->ipha_src == ipst->ips_vifs[vifi].v_lcl_addr.s_addr) {
		ipst->ips_vifs[vifi].v_pkt_out++;
		ipst->ips_vifs[vifi].v_bytes_out += plen;
	} else {
		ipst->ips_vifs[vifi].v_pkt_in++;
		ipst->ips_vifs[vifi].v_bytes_in += plen;
	}
	mutex_enter(&rt->mfc_mutex);
	rt->mfc_pkt_cnt++;
	rt->mfc_byte_cnt += plen;
	mutex_exit(&rt->mfc_mutex);
	unlock_good_vif(&ipst->ips_vifs[vifi]);
	/*
	 * For each vif, decide if a copy of the packet should be forwarded.
	 * Forward if:
	 *		- the vif threshold ttl is non-zero AND
	 *		- the pkt ttl exceeds the vif's threshold
	 * A non-zero mfc_ttl indicates that the vif is part of
	 * the output set for the mfc entry.
	 */
	/* Snapshot numvifs under its mutex; the loop runs without it. */
	mutex_enter(&ipst->ips_numvifs_mutex);
	num_of_vifs = ipst->ips_numvifs;
	mutex_exit(&ipst->ips_numvifs_mutex);
	for (vifp = ipst->ips_vifs, vifi = 0;
	    vifi < num_of_vifs;
	    vifp++, vifi++) {
		if (!lock_good_vif(vifp))
			continue;
		if ((rt->mfc_ttls[vifi] > 0) &&
		    (ipha->ipha_ttl > rt->mfc_ttls[vifi])) {
			/*
			 * lock_good_vif should not have succeeded if
			 * v_ipif is null.
			 */
			ASSERT(vifp->v_ipif != NULL);
			vifp->v_pkt_out++;
			vifp->v_bytes_out += plen;
			MC_SEND(ipha, mp, vifp, dst);
			ipst->ips_mrtstat->mrts_fwd_out++;
		}
		unlock_good_vif(vifp);
	}
	if (tunnel_src != 0)
		return (1);
	else
		return (0);
}

/*
 * Send the packet on physical interface.
 * Caller assumes can continue to use mp on return.
 *
 * A copy of mp is made; it goes straight to tbf_send_packet() when the vif
 * has no rate limit (v_rate_limit <= 0) and through tbf_control() otherwise.
 * If the copy cannot be allocated the packet is counted in mrts_fwd_drop.
 */
/* ARGSUSED */
static void
phyint_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
{
	mblk_t 	*mp_copy;
	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	/* Make a new reference to the packet */
	mp_copy = copymsg(mp);	/* TODO could copy header and dup rest */
	if (mp_copy == NULL) {
		ipst->ips_mrtstat->mrts_fwd_drop++;
		ip1dbg(("phyint_send: out of memory for mblk, mp_copy\n"));
		return;
	}
	if (vifp->v_rate_limit <= 0)
		tbf_send_packet(vifp, mp_copy);
	else  {
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "phyint_send: tbf_contr rate %d "
			    "vifp 0x%p mp 0x%p dst 0x%x",
			    vifp->v_rate_limit, (void *)vifp, (void *)mp, dst);
		}
		tbf_control(vifp, mp_copy, (ipha_t *)mp_copy->b_rptr);
	}
}

/*
 * Send the whole packet for REGISTER encapsulation to PIM daemon
 * Caller assumes it can continue to use mp on return.
2234 */ 2235 /* ARGSUSED */ 2236 static void 2237 register_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst) 2238 { 2239 struct igmpmsg *im; 2240 mblk_t *mp_copy; 2241 ipha_t *ipha_copy; 2242 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 2243 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2244 2245 if (ipst->ips_ip_mrtdebug > 1) { 2246 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2247 "register_send: src %x, dst %x\n", 2248 ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst)); 2249 } 2250 2251 /* 2252 * Copy the old packet & pullup its IP header into the new mblk_t so we 2253 * can modify it. Try to fill the new mblk_t since if we don't the 2254 * ethernet driver will. 2255 */ 2256 mp_copy = allocb(sizeof (struct igmpmsg) + sizeof (ipha_t), BPRI_MED); 2257 if (mp_copy == NULL) { 2258 ++ipst->ips_mrtstat->mrts_pim_nomemory; 2259 if (ipst->ips_ip_mrtdebug > 3) { 2260 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2261 "register_send: allocb failure."); 2262 } 2263 return; 2264 } 2265 2266 /* 2267 * Bump write pointer to account for igmpmsg being added. 2268 */ 2269 mp_copy->b_wptr = mp_copy->b_rptr + sizeof (struct igmpmsg); 2270 2271 /* 2272 * Chain packet to new mblk_t. 2273 */ 2274 if ((mp_copy->b_cont = copymsg(mp)) == NULL) { 2275 ++ipst->ips_mrtstat->mrts_pim_nomemory; 2276 if (ipst->ips_ip_mrtdebug > 3) { 2277 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2278 "register_send: copymsg failure."); 2279 } 2280 freeb(mp_copy); 2281 return; 2282 } 2283 2284 /* 2285 * icmp_input() asserts that IP version field is set to an 2286 * appropriate version. Hence, the struct igmpmsg that this really 2287 * becomes, needs to have the correct IP version field. 2288 */ 2289 ipha_copy = (ipha_t *)mp_copy->b_rptr; 2290 *ipha_copy = multicast_encap_iphdr; 2291 2292 /* 2293 * The kernel uses the struct igmpmsg header to encode the messages to 2294 * the multicast routing daemon. 
Fill in the fields in the header 2295 * starting with the message type which is IGMPMSG_WHOLEPKT 2296 */ 2297 im = (struct igmpmsg *)mp_copy->b_rptr; 2298 im->im_msgtype = IGMPMSG_WHOLEPKT; 2299 im->im_src.s_addr = ipha->ipha_src; 2300 im->im_dst.s_addr = ipha->ipha_dst; 2301 2302 /* 2303 * Must Be Zero. This is because the struct igmpmsg is really an IP 2304 * header with renamed fields and the multicast routing daemon uses 2305 * an ipha_protocol (aka im_mbz) of 0 to distinguish these messages. 2306 */ 2307 im->im_mbz = 0; 2308 2309 ++ipst->ips_mrtstat->mrts_upcalls; 2310 if (!canputnext(mrouter->conn_rq)) { 2311 ++ipst->ips_mrtstat->mrts_pim_regsend_drops; 2312 if (ipst->ips_ip_mrtdebug > 3) { 2313 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2314 "register_send: register upcall failure."); 2315 } 2316 freemsg(mp_copy); 2317 } else { 2318 /* Pass to RAWIP */ 2319 (mrouter->conn_recv)(mrouter, mp_copy, NULL); 2320 } 2321 } 2322 2323 /* 2324 * pim_validate_cksum handles verification of the checksum in the 2325 * pim header. For PIM Register packets, the checksum is calculated 2326 * across the PIM header only. For all other packets, the checksum 2327 * is for the PIM header and remainder of the packet. 2328 * 2329 * returns: B_TRUE, if checksum is okay. 2330 * B_FALSE, if checksum is not valid. 2331 */ 2332 static boolean_t 2333 pim_validate_cksum(mblk_t *mp, ipha_t *ip, struct pim *pimp) 2334 { 2335 mblk_t *mp_dup; 2336 2337 if ((mp_dup = dupmsg(mp)) == NULL) 2338 return (B_FALSE); 2339 2340 mp_dup->b_rptr += IPH_HDR_LENGTH(ip); 2341 if (pimp->pim_type == PIM_REGISTER) 2342 mp_dup->b_wptr = mp_dup->b_rptr + PIM_MINLEN; 2343 if (IP_CSUM(mp_dup, 0, 0)) { 2344 freemsg(mp_dup); 2345 return (B_FALSE); 2346 } 2347 freemsg(mp_dup); 2348 return (B_TRUE); 2349 } 2350 2351 /* 2352 * int 2353 * pim_input(queue_t *, mblk_t *, ill_t *ill) - Process PIM protocol packets. 2354 * IP Protocol 103. Register messages are decapsulated and sent 2355 * onto multicast forwarding. 
2356 */ 2357 int 2358 pim_input(queue_t *q, mblk_t *mp, ill_t *ill) 2359 { 2360 ipha_t *eip, *ip; 2361 int iplen, pimlen, iphlen; 2362 struct pim *pimp; /* pointer to a pim struct */ 2363 uint32_t *reghdr; 2364 ip_stack_t *ipst = ill->ill_ipst; 2365 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2366 2367 /* 2368 * Pullup the msg for PIM protocol processing. 2369 */ 2370 if (pullupmsg(mp, -1) == 0) { 2371 ++ipst->ips_mrtstat->mrts_pim_nomemory; 2372 freemsg(mp); 2373 return (-1); 2374 } 2375 2376 ip = (ipha_t *)mp->b_rptr; 2377 iplen = ip->ipha_length; 2378 iphlen = IPH_HDR_LENGTH(ip); 2379 pimlen = ntohs(iplen) - iphlen; 2380 2381 /* 2382 * Validate lengths 2383 */ 2384 if (pimlen < PIM_MINLEN) { 2385 ++ipst->ips_mrtstat->mrts_pim_malformed; 2386 if (ipst->ips_ip_mrtdebug > 1) { 2387 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2388 "pim_input: length not at least minlen"); 2389 } 2390 freemsg(mp); 2391 return (-1); 2392 } 2393 2394 /* 2395 * Point to the PIM header. 2396 */ 2397 pimp = (struct pim *)((caddr_t)ip + iphlen); 2398 2399 /* 2400 * Check the version number. 
2401 */ 2402 if (pimp->pim_vers != PIM_VERSION) { 2403 ++ipst->ips_mrtstat->mrts_pim_badversion; 2404 if (ipst->ips_ip_mrtdebug > 1) { 2405 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2406 "pim_input: unknown version of PIM"); 2407 } 2408 freemsg(mp); 2409 return (-1); 2410 } 2411 2412 /* 2413 * Validate the checksum 2414 */ 2415 if (!pim_validate_cksum(mp, ip, pimp)) { 2416 ++ipst->ips_mrtstat->mrts_pim_rcv_badcsum; 2417 if (ipst->ips_ip_mrtdebug > 1) { 2418 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2419 "pim_input: invalid checksum"); 2420 } 2421 freemsg(mp); 2422 return (-1); 2423 } 2424 2425 if (pimp->pim_type != PIM_REGISTER) 2426 return (0); 2427 2428 reghdr = (uint32_t *)(pimp + 1); 2429 eip = (ipha_t *)(reghdr + 1); 2430 2431 /* 2432 * check if the inner packet is destined to mcast group 2433 */ 2434 if (!CLASSD(eip->ipha_dst)) { 2435 ++ipst->ips_mrtstat->mrts_pim_badregisters; 2436 if (ipst->ips_ip_mrtdebug > 1) { 2437 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2438 "pim_input: Inner pkt not mcast .. !"); 2439 } 2440 freemsg(mp); 2441 return (-1); 2442 } 2443 if (ipst->ips_ip_mrtdebug > 1) { 2444 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2445 "register from %x, to %x, len %d", 2446 ntohl(eip->ipha_src), 2447 ntohl(eip->ipha_dst), 2448 ntohs(eip->ipha_length)); 2449 } 2450 /* 2451 * If the null register bit is not set, decapsulate 2452 * the packet before forwarding it. 2453 */ 2454 if (!(ntohl(*reghdr) & PIM_NULL_REGISTER)) { 2455 mblk_t *mp_copy; 2456 2457 /* Copy the message */ 2458 if ((mp_copy = copymsg(mp)) == NULL) { 2459 ++ipst->ips_mrtstat->mrts_pim_nomemory; 2460 freemsg(mp); 2461 return (-1); 2462 } 2463 2464 /* 2465 * Decapsulate the packet and give it to 2466 * register_mforward. 
2467 */ 2468 mp_copy->b_rptr += iphlen + sizeof (pim_t) + 2469 sizeof (*reghdr); 2470 if (register_mforward(q, mp_copy, ill) != 0) { 2471 freemsg(mp); 2472 return (-1); 2473 } 2474 } 2475 2476 /* 2477 * Pass all valid PIM packets up to any process(es) listening on a raw 2478 * PIM socket. For Solaris it is done right after pim_input() is 2479 * called. 2480 */ 2481 return (0); 2482 } 2483 2484 /* 2485 * PIM sparse mode hook. Called by pim_input after decapsulating 2486 * the packet. Loop back the packet, as if we have received it. 2487 * In pim_input() we have to check if the destination is a multicast address. 2488 */ 2489 /* ARGSUSED */ 2490 static int 2491 register_mforward(queue_t *q, mblk_t *mp, ill_t *ill) 2492 { 2493 ip_stack_t *ipst = ill->ill_ipst; 2494 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2495 2496 ASSERT(ipst->ips_reg_vif_num <= ipst->ips_numvifs); 2497 2498 if (ipst->ips_ip_mrtdebug > 3) { 2499 ipha_t *ipha; 2500 2501 ipha = (ipha_t *)mp->b_rptr; 2502 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2503 "register_mforward: src %x, dst %x\n", 2504 ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst)); 2505 } 2506 /* 2507 * Need to pass in to ip_mforward() the information that the 2508 * packet has arrived on the register_vif. We use the solution that 2509 * ip_mroute_decap() employs: use mp->b_prev to pass some information 2510 * to ip_mforward(). Nonzero value means the packet has arrived on a 2511 * tunnel (ip_mroute_decap() puts the address of the other side of the 2512 * tunnel there.) This is safe since ip_rput() either frees the packet 2513 * or passes it to ip_mforward(). We use 2514 * PIM_REGISTER_MARKER = 0xffffffff to indicate the has arrived on the 2515 * register vif. If in the future we have more than one register vifs, 2516 * then this will need re-examination. 
2517 */ 2518 mp->b_prev = (mblk_t *)PIM_REGISTER_MARKER; 2519 ++ipst->ips_mrtstat->mrts_pim_regforwards; 2520 ip_rput(q, mp); 2521 return (0); 2522 } 2523 2524 /* 2525 * Send an encapsulated packet. 2526 * Caller assumes can continue to use mp when routine returns. 2527 */ 2528 /* ARGSUSED */ 2529 static void 2530 encap_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst) 2531 { 2532 mblk_t *mp_copy; 2533 ipha_t *ipha_copy; 2534 size_t len; 2535 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 2536 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2537 2538 if (ipst->ips_ip_mrtdebug > 1) { 2539 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2540 "encap_send: vif %ld enter", 2541 (ptrdiff_t)(vifp - ipst->ips_vifs)); 2542 } 2543 len = ntohs(ipha->ipha_length); 2544 2545 /* 2546 * Copy the old packet & pullup it's IP header into the 2547 * new mbuf so we can modify it. Try to fill the new 2548 * mbuf since if we don't the ethernet driver will. 2549 */ 2550 mp_copy = allocb(32 + sizeof (multicast_encap_iphdr), BPRI_MED); 2551 if (mp_copy == NULL) 2552 return; 2553 mp_copy->b_rptr += 32; 2554 mp_copy->b_wptr = mp_copy->b_rptr + sizeof (multicast_encap_iphdr); 2555 if ((mp_copy->b_cont = copymsg(mp)) == NULL) { 2556 freeb(mp_copy); 2557 return; 2558 } 2559 2560 /* 2561 * Fill in the encapsulating IP header. 2562 * Remote tunnel dst in rmt_addr, from add_vif(). 2563 */ 2564 ipha_copy = (ipha_t *)mp_copy->b_rptr; 2565 *ipha_copy = multicast_encap_iphdr; 2566 ASSERT((len + sizeof (ipha_t)) <= IP_MAXPACKET); 2567 ipha_copy->ipha_length = htons(len + sizeof (ipha_t)); 2568 ipha_copy->ipha_src = vifp->v_lcl_addr.s_addr; 2569 ipha_copy->ipha_dst = vifp->v_rmt_addr.s_addr; 2570 ASSERT(ipha_copy->ipha_ident == 0); 2571 2572 /* Turn the encapsulated IP header back into a valid one. 
*/ 2573 ipha = (ipha_t *)mp_copy->b_cont->b_rptr; 2574 ipha->ipha_ttl--; 2575 ipha->ipha_hdr_checksum = 0; 2576 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 2577 2578 if (ipst->ips_ip_mrtdebug > 1) { 2579 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2580 "encap_send: group 0x%x", ntohl(ipha->ipha_dst)); 2581 } 2582 if (vifp->v_rate_limit <= 0) 2583 tbf_send_packet(vifp, mp_copy); 2584 else 2585 /* ipha is from the original header */ 2586 tbf_control(vifp, mp_copy, ipha); 2587 } 2588 2589 /* 2590 * De-encapsulate a packet and feed it back through IP input. 2591 * This routine is called whenever IP gets a packet with prototype 2592 * IPPROTO_ENCAP and a local destination address. 2593 */ 2594 void 2595 ip_mroute_decap(queue_t *q, mblk_t *mp, ill_t *ill) 2596 { 2597 ipha_t *ipha = (ipha_t *)mp->b_rptr; 2598 ipha_t *ipha_encap; 2599 int hlen = IPH_HDR_LENGTH(ipha); 2600 ipaddr_t src; 2601 struct vif *vifp; 2602 ip_stack_t *ipst = ill->ill_ipst; 2603 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2604 2605 /* 2606 * Dump the packet if it's not to a multicast destination or if 2607 * we don't have an encapsulating tunnel with the source. 2608 * Note: This code assumes that the remote site IP address 2609 * uniquely identifies the tunnel (i.e., that this site has 2610 * at most one tunnel with the remote site). 
2611 */ 2612 ipha_encap = (ipha_t *)((char *)ipha + hlen); 2613 if (!CLASSD(ipha_encap->ipha_dst)) { 2614 ipst->ips_mrtstat->mrts_bad_tunnel++; 2615 ip1dbg(("ip_mroute_decap: bad tunnel\n")); 2616 freemsg(mp); 2617 return; 2618 } 2619 src = (ipaddr_t)ipha->ipha_src; 2620 mutex_enter(&ipst->ips_last_encap_lock); 2621 if (src != ipst->ips_last_encap_src) { 2622 struct vif *vife; 2623 2624 vifp = ipst->ips_vifs; 2625 vife = vifp + ipst->ips_numvifs; 2626 ipst->ips_last_encap_src = src; 2627 ipst->ips_last_encap_vif = 0; 2628 for (; vifp < vife; ++vifp) { 2629 if (!lock_good_vif(vifp)) 2630 continue; 2631 if (vifp->v_rmt_addr.s_addr == src) { 2632 if (vifp->v_flags & VIFF_TUNNEL) 2633 ipst->ips_last_encap_vif = vifp; 2634 if (ipst->ips_ip_mrtdebug > 1) { 2635 (void) mi_strlog(mrouter->conn_rq, 2636 1, SL_TRACE, 2637 "ip_mroute_decap: good tun " 2638 "vif %ld with %x", 2639 (ptrdiff_t)(vifp - ipst->ips_vifs), 2640 ntohl(src)); 2641 } 2642 unlock_good_vif(vifp); 2643 break; 2644 } 2645 unlock_good_vif(vifp); 2646 } 2647 } 2648 if ((vifp = ipst->ips_last_encap_vif) == 0) { 2649 mutex_exit(&ipst->ips_last_encap_lock); 2650 ipst->ips_mrtstat->mrts_bad_tunnel++; 2651 freemsg(mp); 2652 ip1dbg(("ip_mroute_decap: vif %ld no tunnel with %x\n", 2653 (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(src))); 2654 return; 2655 } 2656 mutex_exit(&ipst->ips_last_encap_lock); 2657 2658 /* 2659 * Need to pass in the tunnel source to ip_mforward (so that it can 2660 * verify that the packet arrived over the correct vif.) We use b_prev 2661 * to pass this information. This is safe since the ip_rput either 2662 * frees the packet or passes it to ip_mforward. 2663 */ 2664 mp->b_prev = (mblk_t *)(uintptr_t)src; 2665 mp->b_rptr += hlen; 2666 /* Feed back into ip_rput as an M_DATA. */ 2667 ip_rput(q, mp); 2668 } 2669 2670 /* 2671 * Remove all records with v_ipif == ipif. Called when an interface goes away 2672 * (stream closed). Called as writer. 
2673 */ 2674 void 2675 reset_mrt_vif_ipif(ipif_t *ipif) 2676 { 2677 vifi_t vifi, tmp_vifi; 2678 vifi_t num_of_vifs; 2679 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 2680 2681 /* Can't check vifi >= 0 since vifi_t is unsigned! */ 2682 2683 mutex_enter(&ipst->ips_numvifs_mutex); 2684 num_of_vifs = ipst->ips_numvifs; 2685 mutex_exit(&ipst->ips_numvifs_mutex); 2686 2687 for (vifi = num_of_vifs; vifi != 0; vifi--) { 2688 tmp_vifi = vifi - 1; 2689 if (ipst->ips_vifs[tmp_vifi].v_ipif == ipif) { 2690 (void) del_vif(&tmp_vifi, NULL, NULL, ipst); 2691 } 2692 } 2693 } 2694 2695 /* Remove pending upcall msgs when ill goes away. Called by ill_delete. */ 2696 void 2697 reset_mrt_ill(ill_t *ill) 2698 { 2699 struct mfc *rt; 2700 struct rtdetq *rte; 2701 int i; 2702 ip_stack_t *ipst = ill->ill_ipst; 2703 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2704 2705 for (i = 0; i < MFCTBLSIZ; i++) { 2706 MFCB_REFHOLD(&ipst->ips_mfcs[i]); 2707 if ((rt = ipst->ips_mfcs[i].mfcb_mfc) != NULL) { 2708 if (ipst->ips_ip_mrtdebug > 1) { 2709 (void) mi_strlog(mrouter->conn_rq, 1, 2710 SL_TRACE, 2711 "reset_mrt_ill: mfctable [%d]", i); 2712 } 2713 while (rt != NULL) { 2714 mutex_enter(&rt->mfc_mutex); 2715 while ((rte = rt->mfc_rte) != NULL) { 2716 if (rte->ill == ill) { 2717 if (ipst->ips_ip_mrtdebug > 1) { 2718 (void) mi_strlog( 2719 mrouter->conn_rq, 2720 1, SL_TRACE, 2721 "reset_mrt_ill: " 2722 "ill 0x%p", (void *)ill); 2723 } 2724 rt->mfc_rte = rte->rte_next; 2725 freemsg(rte->mp); 2726 mi_free((char *)rte); 2727 } 2728 } 2729 mutex_exit(&rt->mfc_mutex); 2730 rt = rt->mfc_next; 2731 } 2732 } 2733 MFCB_REFRELE(&ipst->ips_mfcs[i]); 2734 } 2735 } 2736 2737 /* 2738 * Token bucket filter module. 2739 * The ipha is for mcastgrp destination for phyint and encap. 
2740 */ 2741 static void 2742 tbf_control(struct vif *vifp, mblk_t *mp, ipha_t *ipha) 2743 { 2744 size_t p_len = msgdsize(mp); 2745 struct tbf *t = vifp->v_tbf; 2746 timeout_id_t id = 0; 2747 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 2748 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2749 2750 /* Drop if packet is too large */ 2751 if (p_len > MAX_BKT_SIZE) { 2752 ipst->ips_mrtstat->mrts_pkt2large++; 2753 freemsg(mp); 2754 return; 2755 } 2756 if (ipst->ips_ip_mrtdebug > 1) { 2757 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2758 "tbf_ctrl: SEND vif %ld, qlen %d, ipha_dst 0x%x", 2759 (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_q_len, 2760 ntohl(ipha->ipha_dst)); 2761 } 2762 2763 mutex_enter(&t->tbf_lock); 2764 2765 tbf_update_tokens(vifp); 2766 2767 /* 2768 * If there are enough tokens, 2769 * and the queue is empty, send this packet out. 2770 */ 2771 if (ipst->ips_ip_mrtdebug > 1) { 2772 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2773 "tbf_control: vif %ld, TOKENS %d, pkt len %lu, qlen %d", 2774 (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_n_tok, p_len, 2775 t->tbf_q_len); 2776 } 2777 /* No packets are queued */ 2778 if (t->tbf_q_len == 0) { 2779 /* queue empty, send packet if enough tokens */ 2780 if (p_len <= t->tbf_n_tok) { 2781 t->tbf_n_tok -= p_len; 2782 mutex_exit(&t->tbf_lock); 2783 tbf_send_packet(vifp, mp); 2784 return; 2785 } else { 2786 /* Queue packet and timeout till later */ 2787 tbf_queue(vifp, mp); 2788 ASSERT(vifp->v_timeout_id == 0); 2789 vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp, 2790 TBF_REPROCESS); 2791 } 2792 } else if (t->tbf_q_len < t->tbf_max_q_len) { 2793 /* Finite queue length, so queue pkts and process queue */ 2794 tbf_queue(vifp, mp); 2795 tbf_process_q(vifp); 2796 } else { 2797 /* Check that we have UDP header with IP header */ 2798 size_t hdr_length = IPH_HDR_LENGTH(ipha) + 2799 sizeof (struct udphdr); 2800 2801 if ((mp->b_wptr - mp->b_rptr) < hdr_length) { 2802 if (!pullupmsg(mp, hdr_length)) { 2803 freemsg(mp); 
2804 ip1dbg(("tbf_ctl: couldn't pullup udp hdr, " 2805 "vif %ld src 0x%x dst 0x%x\n", 2806 (ptrdiff_t)(vifp - ipst->ips_vifs), 2807 ntohl(ipha->ipha_src), 2808 ntohl(ipha->ipha_dst))); 2809 mutex_exit(&vifp->v_tbf->tbf_lock); 2810 return; 2811 } else 2812 /* Have to reassign ipha after pullupmsg */ 2813 ipha = (ipha_t *)mp->b_rptr; 2814 } 2815 /* 2816 * Queue length too much, 2817 * try to selectively dq, or queue and process 2818 */ 2819 if (!tbf_dq_sel(vifp, ipha)) { 2820 ipst->ips_mrtstat->mrts_q_overflow++; 2821 freemsg(mp); 2822 } else { 2823 tbf_queue(vifp, mp); 2824 tbf_process_q(vifp); 2825 } 2826 } 2827 if (t->tbf_q_len == 0) { 2828 id = vifp->v_timeout_id; 2829 vifp->v_timeout_id = 0; 2830 } 2831 mutex_exit(&vifp->v_tbf->tbf_lock); 2832 if (id != 0) 2833 (void) untimeout(id); 2834 } 2835 2836 /* 2837 * Adds a packet to the tbf queue at the interface. 2838 * The ipha is for mcastgrp destination for phyint and encap. 2839 */ 2840 static void 2841 tbf_queue(struct vif *vifp, mblk_t *mp) 2842 { 2843 struct tbf *t = vifp->v_tbf; 2844 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 2845 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2846 2847 if (ipst->ips_ip_mrtdebug > 1) { 2848 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2849 "tbf_queue: vif %ld", (ptrdiff_t)(vifp - ipst->ips_vifs)); 2850 } 2851 ASSERT(MUTEX_HELD(&t->tbf_lock)); 2852 2853 if (t->tbf_t == NULL) { 2854 /* Queue was empty */ 2855 t->tbf_q = mp; 2856 } else { 2857 /* Insert at tail */ 2858 t->tbf_t->b_next = mp; 2859 } 2860 /* set new tail pointer */ 2861 t->tbf_t = mp; 2862 2863 mp->b_next = mp->b_prev = NULL; 2864 2865 t->tbf_q_len++; 2866 } 2867 2868 /* 2869 * Process the queue at the vif interface. 2870 * Drops the tbf_lock when sending packets. 2871 * 2872 * NOTE : The caller should quntimeout if the queue length is 0. 
2873 */ 2874 static void 2875 tbf_process_q(struct vif *vifp) 2876 { 2877 mblk_t *mp; 2878 struct tbf *t = vifp->v_tbf; 2879 size_t len; 2880 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 2881 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2882 2883 if (ipst->ips_ip_mrtdebug > 1) { 2884 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2885 "tbf_process_q 1: vif %ld qlen = %d", 2886 (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_q_len); 2887 } 2888 2889 /* 2890 * Loop through the queue at the interface and send 2891 * as many packets as possible. 2892 */ 2893 ASSERT(MUTEX_HELD(&t->tbf_lock)); 2894 2895 while (t->tbf_q_len > 0) { 2896 mp = t->tbf_q; 2897 len = (size_t)msgdsize(mp); /* length of ip pkt */ 2898 2899 /* Determine if the packet can be sent */ 2900 if (len <= t->tbf_n_tok) { 2901 /* 2902 * If so, reduce no. of tokens, dequeue the packet, 2903 * send the packet. 2904 */ 2905 t->tbf_n_tok -= len; 2906 2907 t->tbf_q = mp->b_next; 2908 if (--t->tbf_q_len == 0) { 2909 t->tbf_t = NULL; 2910 } 2911 mp->b_next = NULL; 2912 /* Exit mutex before sending packet, then re-enter */ 2913 mutex_exit(&t->tbf_lock); 2914 tbf_send_packet(vifp, mp); 2915 mutex_enter(&t->tbf_lock); 2916 } else 2917 break; 2918 } 2919 } 2920 2921 /* Called at tbf timeout to update tokens, process q and reset timer. 
*/ 2922 static void 2923 tbf_reprocess_q(void *arg) 2924 { 2925 struct vif *vifp = arg; 2926 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 2927 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2928 2929 mutex_enter(&vifp->v_tbf->tbf_lock); 2930 vifp->v_timeout_id = 0; 2931 tbf_update_tokens(vifp); 2932 2933 tbf_process_q(vifp); 2934 2935 if (vifp->v_tbf->tbf_q_len > 0) { 2936 vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp, 2937 TBF_REPROCESS); 2938 } 2939 mutex_exit(&vifp->v_tbf->tbf_lock); 2940 2941 if (ipst->ips_ip_mrtdebug > 1) { 2942 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2943 "tbf_reprcess_q: vif %ld timeout id = %p", 2944 (ptrdiff_t)(vifp - ipst->ips_vifs), vifp->v_timeout_id); 2945 } 2946 } 2947 2948 /* 2949 * Function that will selectively discard a member of the tbf queue, 2950 * based on the precedence value and the priority. 2951 * 2952 * NOTE : The caller should quntimeout if the queue length is 0. 2953 */ 2954 static int 2955 tbf_dq_sel(struct vif *vifp, ipha_t *ipha) 2956 { 2957 uint_t p; 2958 struct tbf *t = vifp->v_tbf; 2959 mblk_t **np; 2960 mblk_t *last, *mp; 2961 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 2962 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2963 2964 if (ipst->ips_ip_mrtdebug > 1) { 2965 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2966 "dq_sel: vif %ld dst 0x%x", 2967 (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(ipha->ipha_dst)); 2968 } 2969 2970 ASSERT(MUTEX_HELD(&t->tbf_lock)); 2971 p = priority(vifp, ipha); 2972 2973 np = &t->tbf_q; 2974 last = NULL; 2975 while ((mp = *np) != NULL) { 2976 if (p > (priority(vifp, (ipha_t *)mp->b_rptr))) { 2977 *np = mp->b_next; 2978 /* If removing the last packet, fix the tail pointer */ 2979 if (mp == t->tbf_t) 2980 t->tbf_t = last; 2981 mp->b_prev = mp->b_next = NULL; 2982 freemsg(mp); 2983 /* 2984 * It's impossible for the queue to be empty, but 2985 * we check anyway. 
2986 */ 2987 if (--t->tbf_q_len == 0) { 2988 t->tbf_t = NULL; 2989 } 2990 ipst->ips_mrtstat->mrts_drop_sel++; 2991 return (1); 2992 } 2993 np = &mp->b_next; 2994 last = mp; 2995 } 2996 return (0); 2997 } 2998 2999 /* Sends packet, 2 cases - encap tunnel, phyint. */ 3000 static void 3001 tbf_send_packet(struct vif *vifp, mblk_t *mp) 3002 { 3003 ipif_t *ipif; 3004 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 3005 conn_t *mrouter = ipst->ips_ip_g_mrouter; 3006 3007 /* If encap tunnel options */ 3008 if (vifp->v_flags & VIFF_TUNNEL) { 3009 if (ipst->ips_ip_mrtdebug > 1) { 3010 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 3011 "tbf_send_pkt: ENCAP tunnel vif %ld", 3012 (ptrdiff_t)(vifp - ipst->ips_vifs)); 3013 } 3014 3015 /* 3016 * Feed into ip_wput which will set the ident field and 3017 * checksum the encapsulating header. 3018 * BSD gets the cached route vifp->v_route from ip_output() 3019 * to speed up route table lookups. Not necessary in SunOS 5.x. 3020 */ 3021 put(vifp->v_ipif->ipif_wq, mp); 3022 return; 3023 3024 /* phyint */ 3025 } else { 3026 /* Need to loop back to members on the outgoing interface. */ 3027 ipha_t *ipha; 3028 ipaddr_t dst; 3029 ipha = (ipha_t *)mp->b_rptr; 3030 dst = ipha->ipha_dst; 3031 ipif = vifp->v_ipif; 3032 3033 if (ilm_lookup_ipif(ipif, dst) != NULL) { 3034 /* 3035 * The packet is not yet reassembled, thus we need to 3036 * pass it to ip_rput_local for checksum verification 3037 * and reassembly (and fanout the user stream). 
3038 */ 3039 mblk_t *mp_loop; 3040 ire_t *ire; 3041 3042 if (ipst->ips_ip_mrtdebug > 1) { 3043 (void) mi_strlog(mrouter->conn_rq, 1, 3044 SL_TRACE, 3045 "tbf_send_pkt: loopback vif %ld", 3046 (ptrdiff_t)(vifp - ipst->ips_vifs)); 3047 } 3048 mp_loop = copymsg(mp); 3049 ire = ire_ctable_lookup(~0, 0, IRE_BROADCAST, NULL, 3050 ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 3051 3052 if (mp_loop != NULL && ire != NULL) { 3053 IP_RPUT_LOCAL(ipif->ipif_rq, mp_loop, 3054 ((ipha_t *)mp_loop->b_rptr), 3055 ire, (ill_t *)ipif->ipif_rq->q_ptr); 3056 } else { 3057 /* Either copymsg failed or no ire */ 3058 (void) mi_strlog(mrouter->conn_rq, 1, 3059 SL_TRACE, 3060 "tbf_send_pkt: mp_loop 0x%p, ire 0x%p " 3061 "vif %ld\n", (void *)mp_loop, (void *)ire, 3062 (ptrdiff_t)(vifp - ipst->ips_vifs)); 3063 } 3064 if (ire != NULL) 3065 ire_refrele(ire); 3066 } 3067 if (ipst->ips_ip_mrtdebug > 1) { 3068 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 3069 "tbf_send_pkt: phyint forward vif %ld dst = 0x%x", 3070 (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(dst)); 3071 } 3072 ip_rput_forward_multicast(dst, mp, ipif); 3073 } 3074 } 3075 3076 /* 3077 * Determine the current time and then the elapsed time (between the last time 3078 * and time now). Update the no. of tokens in the bucket. 3079 */ 3080 static void 3081 tbf_update_tokens(struct vif *vifp) 3082 { 3083 timespec_t tp; 3084 hrtime_t tm; 3085 struct tbf *t = vifp->v_tbf; 3086 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 3087 conn_t *mrouter = ipst->ips_ip_g_mrouter; 3088 3089 ASSERT(MUTEX_HELD(&t->tbf_lock)); 3090 3091 /* Time in secs and nsecs, rate limit in kbits/sec */ 3092 gethrestime(&tp); 3093 3094 /*LINTED*/ 3095 TV_DELTA(tp, t->tbf_last_pkt_t, tm); 3096 3097 /* 3098 * This formula is actually 3099 * "time in seconds" * "bytes/second". Scaled for nsec. 3100 * (tm/1000000000) * (v_rate_limit * 1000 * (1000/1024) /8) 3101 * 3102 * The (1000/1024) was introduced in add_vif to optimize 3103 * this divide into a shift. 
3104 */ 3105 t->tbf_n_tok += (tm/1000) * vifp->v_rate_limit / 1024 / 8; 3106 t->tbf_last_pkt_t = tp; 3107 3108 if (t->tbf_n_tok > MAX_BKT_SIZE) 3109 t->tbf_n_tok = MAX_BKT_SIZE; 3110 if (ipst->ips_ip_mrtdebug > 1) { 3111 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 3112 "tbf_update_tok: tm %lld tok %d vif %ld", 3113 tm, t->tbf_n_tok, (ptrdiff_t)(vifp - ipst->ips_vifs)); 3114 } 3115 } 3116 3117 /* 3118 * Priority currently is based on port nos. 3119 * Different forwarding mechanisms have different ways 3120 * of obtaining the port no. Hence, the vif must be 3121 * given along with the packet itself. 3122 * 3123 */ 3124 static int 3125 priority(struct vif *vifp, ipha_t *ipha) 3126 { 3127 int prio; 3128 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 3129 conn_t *mrouter = ipst->ips_ip_g_mrouter; 3130 3131 /* Temporary hack; may add general packet classifier some day */ 3132 3133 ASSERT(MUTEX_HELD(&vifp->v_tbf->tbf_lock)); 3134 3135 /* 3136 * The UDP port space is divided up into four priority ranges: 3137 * [0, 16384) : unclassified - lowest priority 3138 * [16384, 32768) : audio - highest priority 3139 * [32768, 49152) : whiteboard - medium priority 3140 * [49152, 65536) : video - low priority 3141 */ 3142 3143 if (ipha->ipha_protocol == IPPROTO_UDP) { 3144 struct udphdr *udp = 3145 (struct udphdr *)((char *)ipha + IPH_HDR_LENGTH(ipha)); 3146 switch (ntohs(udp->uh_dport) & 0xc000) { 3147 case 0x4000: 3148 prio = 70; 3149 break; 3150 case 0x8000: 3151 prio = 60; 3152 break; 3153 case 0xc000: 3154 prio = 55; 3155 break; 3156 default: 3157 prio = 50; 3158 break; 3159 } 3160 if (ipst->ips_ip_mrtdebug > 1) { 3161 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 3162 "priority: port %x prio %d\n", 3163 ntohs(udp->uh_dport), prio); 3164 } 3165 } else 3166 prio = 50; /* default priority */ 3167 return (prio); 3168 } 3169 3170 /* 3171 * End of token bucket filter modifications 3172 */ 3173 3174 3175 3176 /* 3177 * Produces data for netstat -M. 
3178 */ 3179 int 3180 ip_mroute_stats(mblk_t *mp, ip_stack_t *ipst) 3181 { 3182 ipst->ips_mrtstat->mrts_vifctlSize = sizeof (struct vifctl); 3183 ipst->ips_mrtstat->mrts_mfcctlSize = sizeof (struct mfcctl); 3184 if (!snmp_append_data(mp, (char *)ipst->ips_mrtstat, 3185 sizeof (struct mrtstat))) { 3186 ip0dbg(("ip_mroute_stats: failed %ld bytes\n", 3187 (size_t)sizeof (struct mrtstat))); 3188 return (0); 3189 } 3190 return (1); 3191 } 3192 3193 /* 3194 * Sends info for SNMP's MIB. 3195 */ 3196 int 3197 ip_mroute_vif(mblk_t *mp, ip_stack_t *ipst) 3198 { 3199 struct vifctl vi; 3200 vifi_t vifi; 3201 3202 mutex_enter(&ipst->ips_numvifs_mutex); 3203 for (vifi = 0; vifi < ipst->ips_numvifs; vifi++) { 3204 if (ipst->ips_vifs[vifi].v_lcl_addr.s_addr == 0) 3205 continue; 3206 /* 3207 * No locks here, an approximation is fine. 3208 */ 3209 vi.vifc_vifi = vifi; 3210 vi.vifc_flags = ipst->ips_vifs[vifi].v_flags; 3211 vi.vifc_threshold = ipst->ips_vifs[vifi].v_threshold; 3212 vi.vifc_rate_limit = ipst->ips_vifs[vifi].v_rate_limit; 3213 vi.vifc_lcl_addr = ipst->ips_vifs[vifi].v_lcl_addr; 3214 vi.vifc_rmt_addr = ipst->ips_vifs[vifi].v_rmt_addr; 3215 vi.vifc_pkt_in = ipst->ips_vifs[vifi].v_pkt_in; 3216 vi.vifc_pkt_out = ipst->ips_vifs[vifi].v_pkt_out; 3217 3218 if (!snmp_append_data(mp, (char *)&vi, sizeof (vi))) { 3219 ip0dbg(("ip_mroute_vif: failed %ld bytes\n", 3220 (size_t)sizeof (vi))); 3221 mutex_exit(&ipst->ips_numvifs_mutex); 3222 return (0); 3223 } 3224 } 3225 mutex_exit(&ipst->ips_numvifs_mutex); 3226 return (1); 3227 } 3228 3229 /* 3230 * Called by ip_snmp_get to send up multicast routing table. 3231 */ 3232 int 3233 ip_mroute_mrt(mblk_t *mp, ip_stack_t *ipst) 3234 { 3235 int i, j; 3236 struct mfc *rt; 3237 struct mfcctl mfcc; 3238 3239 /* 3240 * Make sure multicast has not been turned off. 
3241 */ 3242 if (is_mrouter_off(ipst)) 3243 return (1); 3244 3245 /* Loop over all hash buckets and their chains */ 3246 for (i = 0; i < MFCTBLSIZ; i++) { 3247 MFCB_REFHOLD(&ipst->ips_mfcs[i]); 3248 for (rt = ipst->ips_mfcs[i].mfcb_mfc; rt; rt = rt->mfc_next) { 3249 mutex_enter(&rt->mfc_mutex); 3250 if (rt->mfc_rte != NULL || 3251 (rt->mfc_marks & MFCB_MARK_CONDEMNED)) { 3252 mutex_exit(&rt->mfc_mutex); 3253 continue; 3254 } 3255 mfcc.mfcc_origin = rt->mfc_origin; 3256 mfcc.mfcc_mcastgrp = rt->mfc_mcastgrp; 3257 mfcc.mfcc_parent = rt->mfc_parent; 3258 mfcc.mfcc_pkt_cnt = rt->mfc_pkt_cnt; 3259 mutex_enter(&ipst->ips_numvifs_mutex); 3260 for (j = 0; j < (int)ipst->ips_numvifs; j++) 3261 mfcc.mfcc_ttls[j] = rt->mfc_ttls[j]; 3262 for (j = (int)ipst->ips_numvifs; j < MAXVIFS; j++) 3263 mfcc.mfcc_ttls[j] = 0; 3264 mutex_exit(&ipst->ips_numvifs_mutex); 3265 3266 mutex_exit(&rt->mfc_mutex); 3267 if (!snmp_append_data(mp, (char *)&mfcc, 3268 sizeof (mfcc))) { 3269 MFCB_REFRELE(&ipst->ips_mfcs[i]); 3270 ip0dbg(("ip_mroute_mrt: failed %ld bytes\n", 3271 (size_t)sizeof (mfcc))); 3272 return (0); 3273 } 3274 } 3275 MFCB_REFRELE(&ipst->ips_mfcs[i]); 3276 } 3277 return (1); 3278 } 3279