1 /* 2 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 3 * Use is subject to license terms. 4 */ 5 /* 6 * CDDL HEADER START 7 * 8 * The contents of this file are subject to the terms of the 9 * Common Development and Distribution License (the "License"). 10 * You may not use this file except in compliance with the License. 11 * 12 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 13 * or http://www.opensolaris.org/os/licensing. 14 * See the License for the specific language governing permissions 15 * and limitations under the License. 16 * 17 * When distributing Covered Code, include this CDDL HEADER in each 18 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 19 * If applicable, add the following below this CDDL HEADER, with the 20 * fields enclosed by brackets "[]" replaced with your own identifying 21 * information: Portions Copyright [yyyy] [name of copyright owner] 22 * 23 * CDDL HEADER END 24 */ 25 /* 26 * Copyright 2008 Sun Microsystems, Inc. 27 * All rights reserved. Use is subject to license terms. 28 */ 29 /* Copyright (c) 1990 Mentat Inc. */ 30 31 /* 32 * Procedures for the kernel part of DVMRP, 33 * a Distance-Vector Multicast Routing Protocol. 34 * (See RFC-1075) 35 * Written by David Waitzman, BBN Labs, August 1988. 36 * Modified by Steve Deering, Stanford, February 1989. 37 * Modified by Mark J. 
Steiglitz, Stanford, May, 1991 38 * Modified by Van Jacobson, LBL, January 1993 39 * Modified by Ajit Thyagarajan, PARC, August 1993 40 * Modified by Bill Fenner, PARC, April 1995 41 * 42 * MROUTING 3.5 43 */ 44 45 /* 46 * TODO 47 * - function pointer field in vif, void *vif_sendit() 48 */ 49 50 #include <sys/types.h> 51 #include <sys/stream.h> 52 #include <sys/stropts.h> 53 #include <sys/strlog.h> 54 #include <sys/systm.h> 55 #include <sys/ddi.h> 56 #include <sys/cmn_err.h> 57 #include <sys/zone.h> 58 59 #include <sys/param.h> 60 #include <sys/socket.h> 61 #include <sys/vtrace.h> 62 #include <sys/debug.h> 63 #include <net/if.h> 64 #include <sys/sockio.h> 65 #include <netinet/in.h> 66 #include <net/if_dl.h> 67 68 #include <inet/common.h> 69 #include <inet/mi.h> 70 #include <inet/nd.h> 71 #include <inet/mib2.h> 72 #include <netinet/ip6.h> 73 #include <inet/ip.h> 74 #include <inet/snmpcom.h> 75 76 #include <netinet/igmp.h> 77 #include <netinet/igmp_var.h> 78 #include <netinet/udp.h> 79 #include <netinet/ip_mroute.h> 80 #include <inet/ip_multi.h> 81 #include <inet/ip_ire.h> 82 #include <inet/ip_if.h> 83 #include <inet/ipclassifier.h> 84 85 #include <netinet/pim.h> 86 87 88 /* 89 * MT Design: 90 * 91 * There are three main data structures viftable, mfctable and tbftable that 92 * need to be protected against MT races. 93 * 94 * viftable is a fixed length array of vif structs. There is no lock to protect 95 * the whole array, instead each struct is protected by its own individual lock. 96 * The value of v_marks in conjunction with the value of v_refcnt determines the 97 * current state of a vif structure. One special state that needs mention 98 * is when the vif is marked VIF_MARK_NOTINUSE but refcnt != 0. This indicates 99 * that vif is being initialized. 100 * Each structure is freed when the refcnt goes down to zero. If a delete comes 101 * in when the refcnt is > 1, the vif structure is marked VIF_MARK_CONDEMNED 102 * which prevents the struct from further use. 
When the refcnt goes to zero 103 * the struct is freed and is marked VIF_MARK_NOTINUSE. 104 * vif struct stores a pointer to the ipif in v_ipif, to prevent ipif/ill 105 * from going away a refhold is put on the ipif before using it. See 106 * lock_good_vif() and unlock_good_vif(). 107 * 108 * VIF_REFHOLD and VIF_REFRELE macros have been provided to manipulate refcnts 109 * of the vif struct. 110 * 111 * tbftable is also a fixed length array of tbf structs and is only accessed 112 * via v_tbf. It is protected by its own lock tbf_lock. 113 * 114 * Lock Ordering is 115 * v_lock --> tbf_lock 116 * v_lock --> ill_lock 117 * 118 * mfctable is a fixed size hash table of mfc bucket structs (struct mfcb). 119 * Each mfc bucket struct (struct mfcb) maintains a refcnt for each walker, 120 * it also maintains a state. These fields are protected by a lock (mfcb_lock). 121 * mfc structs only maintain a state and have no refcnt. mfc_mutex is used to 122 * protect the struct elements. 123 * 124 * mfc structs are dynamically allocated and are singly linked 125 * at the head of the chain. When an mfc structure is to be deleted 126 * it is marked condemned and so is the state in the bucket struct. 127 * When the last walker of the hash bucket exits all the mfc structs 128 * marked condemned are freed. 129 * 130 * Locking Hierarchy: 131 * The bucket lock should be acquired before the mfc struct lock. 132 * MFCB_REFHOLD and MFCB_REFRELE macros are provided for locking 133 * operations on the bucket struct. 134 * 135 * last_encap_lock and numvifs_mutex should be acquired after 136 * acquiring vif or mfc locks. These locks protect some global variables. 137 * 138 * The statistics are not currently protected by a lock 139 * causing the stats to be approximate, not exact. 
140 */ 141 142 #define NO_VIF MAXVIFS /* from mrouted, no route for src */ 143 144 /* 145 * Timeouts: 146 * Upcall timeouts - BSD uses boolean_t mfc->expire and 147 * nexpire[MFCTBLSIZE], the number of times expire has been called. 148 * SunOS 5.x uses mfc->timeout for each mfc. 149 * Some Unixes are limited in the number of simultaneous timeouts 150 * that can be run, SunOS 5.x does not have this restriction. 151 */ 152 153 /* 154 * In BSD, EXPIRE_TIMEOUT is how often expire_upcalls() is called and 155 * UPCALL_EXPIRE is the nmber of timeouts before a particular upcall 156 * expires. Thus the time till expiration is EXPIRE_TIMEOUT * UPCALL_EXPIRE 157 */ 158 #define EXPIRE_TIMEOUT (hz/4) /* 4x / second */ 159 #define UPCALL_EXPIRE 6 /* number of timeouts */ 160 161 /* 162 * Hash function for a source, group entry 163 */ 164 #define MFCHASH(a, g) MFCHASHMOD(((a) >> 20) ^ ((a) >> 10) ^ (a) ^ \ 165 ((g) >> 20) ^ ((g) >> 10) ^ (g)) 166 167 #define TBF_REPROCESS (hz / 100) /* 100x /second */ 168 169 /* Identify PIM packet that came on a Register interface */ 170 #define PIM_REGISTER_MARKER 0xffffffff 171 172 /* Function declarations */ 173 static int add_mfc(struct mfcctl *, ip_stack_t *); 174 static int add_vif(struct vifctl *, conn_t *, mblk_t *, ip_stack_t *); 175 static int del_mfc(struct mfcctl *, ip_stack_t *); 176 static int del_vif(vifi_t *, conn_t *, mblk_t *, ip_stack_t *); 177 static void del_vifp(struct vif *); 178 static void encap_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t); 179 static void expire_upcalls(void *); 180 static void fill_route(struct mfc *, struct mfcctl *, ip_stack_t *); 181 static void free_queue(struct mfc *); 182 static int get_assert(uchar_t *, ip_stack_t *); 183 static int get_lsg_cnt(struct sioc_lsg_req *, ip_stack_t *); 184 static int get_sg_cnt(struct sioc_sg_req *, ip_stack_t *); 185 static int get_version(uchar_t *); 186 static int get_vif_cnt(struct sioc_vif_req *, ip_stack_t *); 187 static int ip_mdq(mblk_t *, ipha_t *, 
ill_t *, 188 ipaddr_t, struct mfc *); 189 static int ip_mrouter_init(conn_t *, uchar_t *, int, ip_stack_t *); 190 static void phyint_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t); 191 static int register_mforward(queue_t *, mblk_t *, ill_t *); 192 static void register_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t); 193 static int set_assert(int *, ip_stack_t *); 194 195 /* 196 * Token Bucket Filter functions 197 */ 198 static int priority(struct vif *, ipha_t *); 199 static void tbf_control(struct vif *, mblk_t *, ipha_t *); 200 static int tbf_dq_sel(struct vif *, ipha_t *); 201 static void tbf_process_q(struct vif *); 202 static void tbf_queue(struct vif *, mblk_t *); 203 static void tbf_reprocess_q(void *); 204 static void tbf_send_packet(struct vif *, mblk_t *); 205 static void tbf_update_tokens(struct vif *); 206 static void release_mfc(struct mfcb *); 207 208 static boolean_t is_mrouter_off(ip_stack_t *); 209 /* 210 * Encapsulation packets 211 */ 212 213 #define ENCAP_TTL 64 214 215 /* prototype IP hdr for encapsulated packets */ 216 static ipha_t multicast_encap_iphdr = { 217 IP_SIMPLE_HDR_VERSION, 218 0, /* tos */ 219 sizeof (ipha_t), /* total length */ 220 0, /* id */ 221 0, /* frag offset */ 222 ENCAP_TTL, IPPROTO_ENCAP, 223 0, /* checksum */ 224 }; 225 226 /* 227 * Rate limit for assert notification messages, in nsec. 
228 */ 229 #define ASSERT_MSG_TIME 3000000000 230 231 232 #define VIF_REFHOLD(vifp) { \ 233 mutex_enter(&(vifp)->v_lock); \ 234 (vifp)->v_refcnt++; \ 235 mutex_exit(&(vifp)->v_lock); \ 236 } 237 238 #define VIF_REFRELE_LOCKED(vifp) { \ 239 (vifp)->v_refcnt--; \ 240 if ((vifp)->v_refcnt == 0 && \ 241 ((vifp)->v_marks & VIF_MARK_CONDEMNED)) { \ 242 del_vifp(vifp); \ 243 } else { \ 244 mutex_exit(&(vifp)->v_lock); \ 245 } \ 246 } 247 248 #define VIF_REFRELE(vifp) { \ 249 mutex_enter(&(vifp)->v_lock); \ 250 (vifp)->v_refcnt--; \ 251 if ((vifp)->v_refcnt == 0 && \ 252 ((vifp)->v_marks & VIF_MARK_CONDEMNED)) { \ 253 del_vifp(vifp); \ 254 } else { \ 255 mutex_exit(&(vifp)->v_lock); \ 256 } \ 257 } 258 259 #define MFCB_REFHOLD(mfcb) { \ 260 mutex_enter(&(mfcb)->mfcb_lock); \ 261 (mfcb)->mfcb_refcnt++; \ 262 ASSERT((mfcb)->mfcb_refcnt != 0); \ 263 mutex_exit(&(mfcb)->mfcb_lock); \ 264 } 265 266 #define MFCB_REFRELE(mfcb) { \ 267 mutex_enter(&(mfcb)->mfcb_lock); \ 268 ASSERT((mfcb)->mfcb_refcnt != 0); \ 269 if (--(mfcb)->mfcb_refcnt == 0 && \ 270 ((mfcb)->mfcb_marks & MFCB_MARK_CONDEMNED)) { \ 271 release_mfc(mfcb); \ 272 } \ 273 mutex_exit(&(mfcb)->mfcb_lock); \ 274 } 275 276 /* 277 * MFCFIND: 278 * Find a route for a given origin IP address and multicast group address. 279 * Skip entries with pending upcalls. 280 * Type of service parameter to be added in the future! 281 */ 282 #define MFCFIND(mfcbp, o, g, rt) { \ 283 struct mfc *_mb_rt = NULL; \ 284 rt = NULL; \ 285 _mb_rt = mfcbp->mfcb_mfc; \ 286 while (_mb_rt) { \ 287 if ((_mb_rt->mfc_origin.s_addr == o) && \ 288 (_mb_rt->mfc_mcastgrp.s_addr == g) && \ 289 (_mb_rt->mfc_rte == NULL) && \ 290 (!(_mb_rt->mfc_marks & MFCB_MARK_CONDEMNED))) { \ 291 rt = _mb_rt; \ 292 break; \ 293 } \ 294 _mb_rt = _mb_rt->mfc_next; \ 295 } \ 296 } 297 298 /* 299 * BSD uses timeval with sec and usec. In SunOS 5.x uniqtime() and gethrtime() 300 * are inefficient. 
We use gethrestime() which returns a timespec_t with 301 * sec and nsec, the resolution is machine dependent. 302 * The following 2 macros have been changed to use nsec instead of usec. 303 */ 304 /* 305 * Macros to compute elapsed time efficiently. 306 * Borrowed from Van Jacobson's scheduling code. 307 * Delta should be a hrtime_t. 308 */ 309 #define TV_DELTA(a, b, delta) { \ 310 int xxs; \ 311 \ 312 delta = (a).tv_nsec - (b).tv_nsec; \ 313 if ((xxs = (a).tv_sec - (b).tv_sec) != 0) { \ 314 switch (xxs) { \ 315 case 2: \ 316 delta += 1000000000; \ 317 /*FALLTHROUGH*/ \ 318 case 1: \ 319 delta += 1000000000; \ 320 break; \ 321 default: \ 322 delta += (1000000000 * xxs); \ 323 } \ 324 } \ 325 } 326 327 #define TV_LT(a, b) (((a).tv_nsec < (b).tv_nsec && \ 328 (a).tv_sec <= (b).tv_sec) || (a).tv_sec < (b).tv_sec) 329 330 /* 331 * Handle MRT setsockopt commands to modify the multicast routing tables. 332 */ 333 int 334 ip_mrouter_set(int cmd, queue_t *q, int checkonly, uchar_t *data, 335 int datalen, mblk_t *first_mp) 336 { 337 conn_t *connp = Q_TO_CONN(q); 338 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 339 340 mutex_enter(&ipst->ips_ip_g_mrouter_mutex); 341 if (cmd != MRT_INIT && connp != ipst->ips_ip_g_mrouter) { 342 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 343 return (EACCES); 344 } 345 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 346 347 if (checkonly) { 348 /* 349 * do not do operation, just pretend to - new T_CHECK 350 * Note: Even routines further on can probably fail but 351 * this T_CHECK stuff is only to please XTI so it not 352 * necessary to be perfect. 353 */ 354 switch (cmd) { 355 case MRT_INIT: 356 case MRT_DONE: 357 case MRT_ADD_VIF: 358 case MRT_DEL_VIF: 359 case MRT_ADD_MFC: 360 case MRT_DEL_MFC: 361 case MRT_ASSERT: 362 return (0); 363 default: 364 return (EOPNOTSUPP); 365 } 366 } 367 368 /* 369 * make sure no command is issued after multicast routing has been 370 * turned off. 
371 */ 372 if (cmd != MRT_INIT && cmd != MRT_DONE) { 373 if (is_mrouter_off(ipst)) 374 return (EINVAL); 375 } 376 377 switch (cmd) { 378 case MRT_INIT: return (ip_mrouter_init(connp, data, datalen, ipst)); 379 case MRT_DONE: return (ip_mrouter_done(first_mp, ipst)); 380 case MRT_ADD_VIF: return (add_vif((struct vifctl *)data, connp, 381 first_mp, ipst)); 382 case MRT_DEL_VIF: return (del_vif((vifi_t *)data, connp, first_mp, 383 ipst)); 384 case MRT_ADD_MFC: return (add_mfc((struct mfcctl *)data, ipst)); 385 case MRT_DEL_MFC: return (del_mfc((struct mfcctl *)data, ipst)); 386 case MRT_ASSERT: return (set_assert((int *)data, ipst)); 387 default: return (EOPNOTSUPP); 388 } 389 } 390 391 /* 392 * Handle MRT getsockopt commands 393 */ 394 int 395 ip_mrouter_get(int cmd, queue_t *q, uchar_t *data) 396 { 397 conn_t *connp = Q_TO_CONN(q); 398 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 399 400 if (connp != ipst->ips_ip_g_mrouter) 401 return (EACCES); 402 403 switch (cmd) { 404 case MRT_VERSION: return (get_version((uchar_t *)data)); 405 case MRT_ASSERT: return (get_assert((uchar_t *)data, ipst)); 406 default: return (EOPNOTSUPP); 407 } 408 } 409 410 /* 411 * Handle ioctl commands to obtain information from the cache. 412 * Called with shared access to IP. These are read_only ioctls. 
413 */ 414 /* ARGSUSED */ 415 int 416 mrt_ioctl(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 417 ip_ioctl_cmd_t *ipip, void *if_req) 418 { 419 mblk_t *mp1; 420 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 421 conn_t *connp = Q_TO_CONN(q); 422 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 423 424 /* Existence verified in ip_wput_nondata */ 425 mp1 = mp->b_cont->b_cont; 426 427 switch (iocp->ioc_cmd) { 428 case (SIOCGETVIFCNT): 429 return (get_vif_cnt((struct sioc_vif_req *)mp1->b_rptr, ipst)); 430 case (SIOCGETSGCNT): 431 return (get_sg_cnt((struct sioc_sg_req *)mp1->b_rptr, ipst)); 432 case (SIOCGETLSGCNT): 433 return (get_lsg_cnt((struct sioc_lsg_req *)mp1->b_rptr, ipst)); 434 default: 435 return (EINVAL); 436 } 437 } 438 439 /* 440 * Returns the packet, byte, rpf-failure count for the source, group provided. 441 */ 442 static int 443 get_sg_cnt(struct sioc_sg_req *req, ip_stack_t *ipst) 444 { 445 struct mfc *rt; 446 struct mfcb *mfcbp; 447 448 mfcbp = &ipst->ips_mfcs[MFCHASH(req->src.s_addr, req->grp.s_addr)]; 449 MFCB_REFHOLD(mfcbp); 450 MFCFIND(mfcbp, req->src.s_addr, req->grp.s_addr, rt); 451 452 if (rt != NULL) { 453 mutex_enter(&rt->mfc_mutex); 454 req->pktcnt = rt->mfc_pkt_cnt; 455 req->bytecnt = rt->mfc_byte_cnt; 456 req->wrong_if = rt->mfc_wrong_if; 457 mutex_exit(&rt->mfc_mutex); 458 } else 459 req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffffU; 460 461 MFCB_REFRELE(mfcbp); 462 return (0); 463 } 464 465 /* 466 * Returns the packet, byte, rpf-failure count for the source, group provided. 467 * Uses larger counters and IPv6 addresses. 468 */ 469 /* ARGSUSED XXX until implemented */ 470 static int 471 get_lsg_cnt(struct sioc_lsg_req *req, ip_stack_t *ipst) 472 { 473 /* XXX TODO SIOCGETLSGCNT */ 474 return (ENXIO); 475 } 476 477 /* 478 * Returns the input and output packet and byte counts on the vif provided. 
479 */ 480 static int 481 get_vif_cnt(struct sioc_vif_req *req, ip_stack_t *ipst) 482 { 483 vifi_t vifi = req->vifi; 484 485 if (vifi >= ipst->ips_numvifs) 486 return (EINVAL); 487 488 /* 489 * No locks here, an approximation is fine. 490 */ 491 req->icount = ipst->ips_vifs[vifi].v_pkt_in; 492 req->ocount = ipst->ips_vifs[vifi].v_pkt_out; 493 req->ibytes = ipst->ips_vifs[vifi].v_bytes_in; 494 req->obytes = ipst->ips_vifs[vifi].v_bytes_out; 495 496 return (0); 497 } 498 499 static int 500 get_version(uchar_t *data) 501 { 502 int *v = (int *)data; 503 504 *v = 0x0305; /* XXX !!!! */ 505 506 return (0); 507 } 508 509 /* 510 * Set PIM assert processing global. 511 */ 512 static int 513 set_assert(int *i, ip_stack_t *ipst) 514 { 515 if ((*i != 1) && (*i != 0)) 516 return (EINVAL); 517 518 ipst->ips_pim_assert = *i; 519 520 return (0); 521 } 522 523 /* 524 * Get PIM assert processing global. 525 */ 526 static int 527 get_assert(uchar_t *data, ip_stack_t *ipst) 528 { 529 int *i = (int *)data; 530 531 *i = ipst->ips_pim_assert; 532 533 return (0); 534 } 535 536 /* 537 * Enable multicast routing. 538 */ 539 static int 540 ip_mrouter_init(conn_t *connp, uchar_t *data, int datalen, ip_stack_t *ipst) 541 { 542 int *v; 543 544 if (data == NULL || (datalen != sizeof (int))) 545 return (ENOPROTOOPT); 546 547 v = (int *)data; 548 if (*v != 1) 549 return (ENOPROTOOPT); 550 551 mutex_enter(&ipst->ips_ip_g_mrouter_mutex); 552 if (ipst->ips_ip_g_mrouter != NULL) { 553 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 554 return (EADDRINUSE); 555 } 556 557 /* 558 * MRT_INIT should only be allowed for RAW sockets, but we double 559 * check. 
560 */ 561 if (!IPCL_IS_RAWIP(connp)) { 562 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 563 return (EINVAL); 564 } 565 566 ipst->ips_ip_g_mrouter = connp; 567 connp->conn_multi_router = 1; 568 /* In order for tunnels to work we have to turn ip_g_forward on */ 569 if (!WE_ARE_FORWARDING(ipst)) { 570 if (ipst->ips_ip_mrtdebug > 1) { 571 (void) mi_strlog(connp->conn_rq, 1, SL_TRACE, 572 "ip_mrouter_init: turning on forwarding"); 573 } 574 ipst->ips_saved_ip_g_forward = ipst->ips_ip_g_forward; 575 ipst->ips_ip_g_forward = IP_FORWARD_ALWAYS; 576 } 577 578 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 579 return (0); 580 } 581 582 void 583 ip_mrouter_stack_init(ip_stack_t *ipst) 584 { 585 mutex_init(&ipst->ips_ip_g_mrouter_mutex, NULL, MUTEX_DEFAULT, NULL); 586 587 ipst->ips_vifs = kmem_zalloc(sizeof (struct vif) * (MAXVIFS+1), 588 KM_SLEEP); 589 ipst->ips_mrtstat = kmem_zalloc(sizeof (struct mrtstat), KM_SLEEP); 590 /* 591 * mfctable: 592 * Includes all mfcs, including waiting upcalls. 593 * Multiple mfcs per bucket. 594 */ 595 ipst->ips_mfcs = kmem_zalloc(sizeof (struct mfcb) * MFCTBLSIZ, 596 KM_SLEEP); 597 /* 598 * Define the token bucket filter structures. 599 * tbftable -> each vif has one of these for storing info. 600 */ 601 ipst->ips_tbfs = kmem_zalloc(sizeof (struct tbf) * MAXVIFS, KM_SLEEP); 602 603 mutex_init(&ipst->ips_last_encap_lock, NULL, MUTEX_DEFAULT, NULL); 604 605 ipst->ips_mrtstat->mrts_vifctlSize = sizeof (struct vifctl); 606 ipst->ips_mrtstat->mrts_mfcctlSize = sizeof (struct mfcctl); 607 } 608 609 /* 610 * Disable multicast routing. 611 * Didn't use global timeout_val (BSD version), instead check the mfctable. 
612 */ 613 int 614 ip_mrouter_done(mblk_t *mp, ip_stack_t *ipst) 615 { 616 conn_t *mrouter; 617 vifi_t vifi; 618 struct mfc *mfc_rt; 619 int i; 620 621 mutex_enter(&ipst->ips_ip_g_mrouter_mutex); 622 if (ipst->ips_ip_g_mrouter == NULL) { 623 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 624 return (EINVAL); 625 } 626 627 mrouter = ipst->ips_ip_g_mrouter; 628 629 if (ipst->ips_saved_ip_g_forward != -1) { 630 if (ipst->ips_ip_mrtdebug > 1) { 631 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 632 "ip_mrouter_done: turning off forwarding"); 633 } 634 ipst->ips_ip_g_forward = ipst->ips_saved_ip_g_forward; 635 ipst->ips_saved_ip_g_forward = -1; 636 } 637 638 /* 639 * Always clear cache when vifs change. 640 * No need to get ipst->ips_last_encap_lock since we are running as 641 * a writer. 642 */ 643 mutex_enter(&ipst->ips_last_encap_lock); 644 ipst->ips_last_encap_src = 0; 645 ipst->ips_last_encap_vif = NULL; 646 mutex_exit(&ipst->ips_last_encap_lock); 647 mrouter->conn_multi_router = 0; 648 649 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 650 651 /* 652 * For each phyint in use, 653 * disable promiscuous reception of all IP multicasts. 654 */ 655 for (vifi = 0; vifi < MAXVIFS; vifi++) { 656 struct vif *vifp = ipst->ips_vifs + vifi; 657 658 mutex_enter(&vifp->v_lock); 659 /* 660 * if the vif is active mark it condemned. 661 */ 662 if (vifp->v_marks & VIF_MARK_GOOD) { 663 ASSERT(vifp->v_ipif != NULL); 664 ipif_refhold(vifp->v_ipif); 665 /* Phyint only */ 666 if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) { 667 ipif_t *ipif = vifp->v_ipif; 668 ipsq_t *ipsq; 669 boolean_t suc; 670 ill_t *ill; 671 672 ill = ipif->ipif_ill; 673 suc = B_FALSE; 674 if (mp == NULL) { 675 /* 676 * being called from ip_close, 677 * lets do it synchronously. 678 * Clear VIF_MARK_GOOD and 679 * set VIF_MARK_CONDEMNED. 
680 */ 681 vifp->v_marks &= ~VIF_MARK_GOOD; 682 vifp->v_marks |= VIF_MARK_CONDEMNED; 683 mutex_exit(&(vifp)->v_lock); 684 suc = ipsq_enter(ill, B_FALSE, NEW_OP); 685 ipsq = ill->ill_phyint->phyint_ipsq; 686 } else { 687 ipsq = ipsq_try_enter(ipif, NULL, 688 mrouter->conn_wq, mp, 689 ip_restart_optmgmt, NEW_OP, B_TRUE); 690 if (ipsq == NULL) { 691 mutex_exit(&(vifp)->v_lock); 692 ipif_refrele(ipif); 693 return (EINPROGRESS); 694 } 695 /* 696 * Clear VIF_MARK_GOOD and 697 * set VIF_MARK_CONDEMNED. 698 */ 699 vifp->v_marks &= ~VIF_MARK_GOOD; 700 vifp->v_marks |= VIF_MARK_CONDEMNED; 701 mutex_exit(&(vifp)->v_lock); 702 suc = B_TRUE; 703 } 704 705 if (suc) { 706 (void) ip_delmulti(INADDR_ANY, ipif, 707 B_TRUE, B_TRUE); 708 ipsq_exit(ipsq); 709 } 710 mutex_enter(&vifp->v_lock); 711 } 712 /* 713 * decreases the refcnt added in add_vif. 714 * and release v_lock. 715 */ 716 VIF_REFRELE_LOCKED(vifp); 717 } else { 718 mutex_exit(&vifp->v_lock); 719 continue; 720 } 721 } 722 723 mutex_enter(&ipst->ips_numvifs_mutex); 724 ipst->ips_numvifs = 0; 725 ipst->ips_pim_assert = 0; 726 ipst->ips_reg_vif_num = ALL_VIFS; 727 mutex_exit(&ipst->ips_numvifs_mutex); 728 729 /* 730 * Free upcall msgs. 731 * Go through mfctable and stop any outstanding upcall 732 * timeouts remaining on mfcs. 733 */ 734 for (i = 0; i < MFCTBLSIZ; i++) { 735 mutex_enter(&ipst->ips_mfcs[i].mfcb_lock); 736 ipst->ips_mfcs[i].mfcb_refcnt++; 737 ipst->ips_mfcs[i].mfcb_marks |= MFCB_MARK_CONDEMNED; 738 mutex_exit(&ipst->ips_mfcs[i].mfcb_lock); 739 mfc_rt = ipst->ips_mfcs[i].mfcb_mfc; 740 while (mfc_rt) { 741 /* Free upcalls */ 742 mutex_enter(&mfc_rt->mfc_mutex); 743 if (mfc_rt->mfc_rte != NULL) { 744 if (mfc_rt->mfc_timeout_id != 0) { 745 /* 746 * OK to drop the lock as we have 747 * a refcnt on the bucket. timeout 748 * can fire but it will see that 749 * mfc_timeout_id == 0 and not do 750 * anything. see expire_upcalls(). 
751 */ 752 mfc_rt->mfc_timeout_id = 0; 753 mutex_exit(&mfc_rt->mfc_mutex); 754 (void) untimeout( 755 mfc_rt->mfc_timeout_id); 756 mfc_rt->mfc_timeout_id = 0; 757 mutex_enter(&mfc_rt->mfc_mutex); 758 759 /* 760 * all queued upcall packets 761 * and mblk will be freed in 762 * release_mfc(). 763 */ 764 } 765 } 766 767 mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED; 768 769 mutex_exit(&mfc_rt->mfc_mutex); 770 mfc_rt = mfc_rt->mfc_next; 771 } 772 MFCB_REFRELE(&ipst->ips_mfcs[i]); 773 } 774 775 mutex_enter(&ipst->ips_ip_g_mrouter_mutex); 776 ipst->ips_ip_g_mrouter = NULL; 777 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 778 return (0); 779 } 780 781 void 782 ip_mrouter_stack_destroy(ip_stack_t *ipst) 783 { 784 struct mfcb *mfcbp; 785 struct mfc *rt; 786 int i; 787 788 for (i = 0; i < MFCTBLSIZ; i++) { 789 mfcbp = &ipst->ips_mfcs[i]; 790 791 while ((rt = mfcbp->mfcb_mfc) != NULL) { 792 (void) printf("ip_mrouter_stack_destroy: free for %d\n", 793 i); 794 795 mfcbp->mfcb_mfc = rt->mfc_next; 796 free_queue(rt); 797 mi_free(rt); 798 } 799 } 800 kmem_free(ipst->ips_vifs, sizeof (struct vif) * (MAXVIFS+1)); 801 ipst->ips_vifs = NULL; 802 kmem_free(ipst->ips_mrtstat, sizeof (struct mrtstat)); 803 ipst->ips_mrtstat = NULL; 804 kmem_free(ipst->ips_mfcs, sizeof (struct mfcb) * MFCTBLSIZ); 805 ipst->ips_mfcs = NULL; 806 kmem_free(ipst->ips_tbfs, sizeof (struct tbf) * MAXVIFS); 807 ipst->ips_tbfs = NULL; 808 809 mutex_destroy(&ipst->ips_last_encap_lock); 810 mutex_destroy(&ipst->ips_ip_g_mrouter_mutex); 811 } 812 813 static boolean_t 814 is_mrouter_off(ip_stack_t *ipst) 815 { 816 conn_t *mrouter; 817 818 mutex_enter(&ipst->ips_ip_g_mrouter_mutex); 819 if (ipst->ips_ip_g_mrouter == NULL) { 820 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 821 return (B_TRUE); 822 } 823 824 mrouter = ipst->ips_ip_g_mrouter; 825 if (mrouter->conn_multi_router == 0) { 826 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 827 return (B_TRUE); 828 } 829 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 830 return (B_FALSE); 831 } 
832 833 static void 834 unlock_good_vif(struct vif *vifp) 835 { 836 ASSERT(vifp->v_ipif != NULL); 837 ipif_refrele(vifp->v_ipif); 838 VIF_REFRELE(vifp); 839 } 840 841 static boolean_t 842 lock_good_vif(struct vif *vifp) 843 { 844 mutex_enter(&vifp->v_lock); 845 if (!(vifp->v_marks & VIF_MARK_GOOD)) { 846 mutex_exit(&vifp->v_lock); 847 return (B_FALSE); 848 } 849 850 ASSERT(vifp->v_ipif != NULL); 851 mutex_enter(&vifp->v_ipif->ipif_ill->ill_lock); 852 if (!IPIF_CAN_LOOKUP(vifp->v_ipif)) { 853 mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock); 854 mutex_exit(&vifp->v_lock); 855 return (B_FALSE); 856 } 857 ipif_refhold_locked(vifp->v_ipif); 858 mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock); 859 vifp->v_refcnt++; 860 mutex_exit(&vifp->v_lock); 861 return (B_TRUE); 862 } 863 864 /* 865 * Add a vif to the vif table. 866 */ 867 static int 868 add_vif(struct vifctl *vifcp, conn_t *connp, mblk_t *first_mp, ip_stack_t *ipst) 869 { 870 struct vif *vifp = ipst->ips_vifs + vifcp->vifc_vifi; 871 ipif_t *ipif; 872 int error; 873 struct tbf *v_tbf = ipst->ips_tbfs + vifcp->vifc_vifi; 874 ipsq_t *ipsq; 875 conn_t *mrouter = ipst->ips_ip_g_mrouter; 876 877 ASSERT(connp != NULL); 878 879 if (vifcp->vifc_vifi >= MAXVIFS) 880 return (EINVAL); 881 882 if (is_mrouter_off(ipst)) 883 return (EINVAL); 884 885 mutex_enter(&vifp->v_lock); 886 /* 887 * Viftable entry should be 0. 888 * if v_marks == 0 but v_refcnt != 0 means struct is being 889 * initialized. 890 * 891 * Also note that it is very unlikely that we will get a MRT_ADD_VIF 892 * request while the delete is in progress, mrouted only sends add 893 * requests when a new interface is added and the new interface cannot 894 * have the same vifi as an existing interface. We make sure that 895 * ill_delete will block till the vif is deleted by adding a refcnt 896 * to ipif in del_vif(). 
897 */ 898 if (vifp->v_lcl_addr.s_addr != 0 || 899 vifp->v_marks != 0 || 900 vifp->v_refcnt != 0) { 901 mutex_exit(&vifp->v_lock); 902 return (EADDRINUSE); 903 } 904 905 /* Incoming vif should not be 0 */ 906 if (vifcp->vifc_lcl_addr.s_addr == 0) { 907 mutex_exit(&vifp->v_lock); 908 return (EINVAL); 909 } 910 911 vifp->v_refcnt++; 912 mutex_exit(&vifp->v_lock); 913 /* Find the interface with the local address */ 914 ipif = ipif_lookup_addr((ipaddr_t)vifcp->vifc_lcl_addr.s_addr, NULL, 915 connp->conn_zoneid, CONNP_TO_WQ(connp), first_mp, 916 ip_restart_optmgmt, &error, ipst); 917 if (ipif == NULL) { 918 VIF_REFRELE(vifp); 919 if (error == EINPROGRESS) 920 return (error); 921 return (EADDRNOTAVAIL); 922 } 923 924 /* 925 * We have to be exclusive as we have to call ip_addmulti() 926 * This is the best position to try to be exclusive in case 927 * we have to wait. 928 */ 929 ipsq = ipsq_try_enter(ipif, NULL, CONNP_TO_WQ(connp), first_mp, 930 ip_restart_optmgmt, NEW_OP, B_TRUE); 931 if ((ipsq) == NULL) { 932 VIF_REFRELE(vifp); 933 ipif_refrele(ipif); 934 return (EINPROGRESS); 935 } 936 937 if (ipst->ips_ip_mrtdebug > 1) { 938 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 939 "add_vif: src 0x%x enter", 940 vifcp->vifc_lcl_addr.s_addr); 941 } 942 943 mutex_enter(&vifp->v_lock); 944 /* 945 * Always clear cache when vifs change. 946 * Needed to ensure that src isn't left over from before vif was added. 947 * No need to get last_encap_lock, since we are running as a writer. 
948 */ 949 950 mutex_enter(&ipst->ips_last_encap_lock); 951 ipst->ips_last_encap_src = 0; 952 ipst->ips_last_encap_vif = NULL; 953 mutex_exit(&ipst->ips_last_encap_lock); 954 955 if (vifcp->vifc_flags & VIFF_TUNNEL) { 956 if ((vifcp->vifc_flags & VIFF_SRCRT) != 0) { 957 cmn_err(CE_WARN, 958 "add_vif: source route tunnels not supported\n"); 959 VIF_REFRELE_LOCKED(vifp); 960 ipif_refrele(ipif); 961 ipsq_exit(ipsq); 962 return (EOPNOTSUPP); 963 } 964 vifp->v_rmt_addr = vifcp->vifc_rmt_addr; 965 966 } else { 967 /* Phyint or Register vif */ 968 if (vifcp->vifc_flags & VIFF_REGISTER) { 969 /* 970 * Note: Since all IPPROTO_IP level options (including 971 * MRT_ADD_VIF) are done exclusively via 972 * ip_optmgmt_writer(), a lock is not necessary to 973 * protect reg_vif_num. 974 */ 975 mutex_enter(&ipst->ips_numvifs_mutex); 976 if (ipst->ips_reg_vif_num == ALL_VIFS) { 977 ipst->ips_reg_vif_num = vifcp->vifc_vifi; 978 mutex_exit(&ipst->ips_numvifs_mutex); 979 } else { 980 mutex_exit(&ipst->ips_numvifs_mutex); 981 VIF_REFRELE_LOCKED(vifp); 982 ipif_refrele(ipif); 983 ipsq_exit(ipsq); 984 return (EADDRINUSE); 985 } 986 } 987 988 /* Make sure the interface supports multicast */ 989 if ((ipif->ipif_ill->ill_flags & ILLF_MULTICAST) == 0) { 990 VIF_REFRELE_LOCKED(vifp); 991 ipif_refrele(ipif); 992 if (vifcp->vifc_flags & VIFF_REGISTER) { 993 mutex_enter(&ipst->ips_numvifs_mutex); 994 ipst->ips_reg_vif_num = ALL_VIFS; 995 mutex_exit(&ipst->ips_numvifs_mutex); 996 } 997 ipsq_exit(ipsq); 998 return (EOPNOTSUPP); 999 } 1000 /* Enable promiscuous reception of all IP mcasts from the if */ 1001 mutex_exit(&vifp->v_lock); 1002 error = ip_addmulti(INADDR_ANY, ipif, ILGSTAT_NONE, 1003 MODE_IS_EXCLUDE, NULL); 1004 mutex_enter(&vifp->v_lock); 1005 /* 1006 * since we released the lock lets make sure that 1007 * ip_mrouter_done() has not been called. 
1008 */ 1009 if (error != 0 || is_mrouter_off(ipst)) { 1010 if (error == 0) 1011 (void) ip_delmulti(INADDR_ANY, ipif, B_TRUE, 1012 B_TRUE); 1013 if (vifcp->vifc_flags & VIFF_REGISTER) { 1014 mutex_enter(&ipst->ips_numvifs_mutex); 1015 ipst->ips_reg_vif_num = ALL_VIFS; 1016 mutex_exit(&ipst->ips_numvifs_mutex); 1017 } 1018 VIF_REFRELE_LOCKED(vifp); 1019 ipif_refrele(ipif); 1020 ipsq_exit(ipsq); 1021 return (error?error:EINVAL); 1022 } 1023 } 1024 /* Define parameters for the tbf structure */ 1025 vifp->v_tbf = v_tbf; 1026 gethrestime(&vifp->v_tbf->tbf_last_pkt_t); 1027 vifp->v_tbf->tbf_n_tok = 0; 1028 vifp->v_tbf->tbf_q_len = 0; 1029 vifp->v_tbf->tbf_max_q_len = MAXQSIZE; 1030 vifp->v_tbf->tbf_q = vifp->v_tbf->tbf_t = NULL; 1031 1032 vifp->v_flags = vifcp->vifc_flags; 1033 vifp->v_threshold = vifcp->vifc_threshold; 1034 vifp->v_lcl_addr = vifcp->vifc_lcl_addr; 1035 vifp->v_ipif = ipif; 1036 ipif_refrele(ipif); 1037 /* Scaling up here, allows division by 1024 in critical code. */ 1038 vifp->v_rate_limit = vifcp->vifc_rate_limit * (1024/1000); 1039 vifp->v_timeout_id = 0; 1040 /* initialize per vif pkt counters */ 1041 vifp->v_pkt_in = 0; 1042 vifp->v_pkt_out = 0; 1043 vifp->v_bytes_in = 0; 1044 vifp->v_bytes_out = 0; 1045 mutex_init(&vifp->v_tbf->tbf_lock, NULL, MUTEX_DEFAULT, NULL); 1046 1047 /* Adjust numvifs up, if the vifi is higher than numvifs */ 1048 mutex_enter(&ipst->ips_numvifs_mutex); 1049 if (ipst->ips_numvifs <= vifcp->vifc_vifi) 1050 ipst->ips_numvifs = vifcp->vifc_vifi + 1; 1051 mutex_exit(&ipst->ips_numvifs_mutex); 1052 1053 if (ipst->ips_ip_mrtdebug > 1) { 1054 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 1055 "add_vif: #%d, lcladdr %x, %s %x, thresh %x, rate %d", 1056 vifcp->vifc_vifi, 1057 ntohl(vifcp->vifc_lcl_addr.s_addr), 1058 (vifcp->vifc_flags & VIFF_TUNNEL) ? 
"rmtaddr" : "mask", 1059 ntohl(vifcp->vifc_rmt_addr.s_addr), 1060 vifcp->vifc_threshold, vifcp->vifc_rate_limit); 1061 } 1062 1063 vifp->v_marks = VIF_MARK_GOOD; 1064 mutex_exit(&vifp->v_lock); 1065 ipsq_exit(ipsq); 1066 return (0); 1067 } 1068 1069 1070 /* Delete a vif from the vif table. */ 1071 static void 1072 del_vifp(struct vif *vifp) 1073 { 1074 struct tbf *t = vifp->v_tbf; 1075 mblk_t *mp0; 1076 vifi_t vifi; 1077 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 1078 conn_t *mrouter = ipst->ips_ip_g_mrouter; 1079 1080 ASSERT(vifp->v_marks & VIF_MARK_CONDEMNED); 1081 ASSERT(t != NULL); 1082 1083 /* 1084 * release the ref we put in vif_del. 1085 */ 1086 ASSERT(vifp->v_ipif != NULL); 1087 ipif_refrele(vifp->v_ipif); 1088 1089 if (ipst->ips_ip_mrtdebug > 1) { 1090 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 1091 "del_vif: src 0x%x\n", vifp->v_lcl_addr.s_addr); 1092 } 1093 1094 if (vifp->v_timeout_id != 0) { 1095 (void) untimeout(vifp->v_timeout_id); 1096 vifp->v_timeout_id = 0; 1097 } 1098 1099 /* 1100 * Free packets queued at the interface. 1101 * Mrouted takes care of cleaning up mfcs - makes calls to del_mfc. 1102 */ 1103 mutex_enter(&t->tbf_lock); 1104 while (t->tbf_q != NULL) { 1105 mp0 = t->tbf_q; 1106 t->tbf_q = t->tbf_q->b_next; 1107 mp0->b_prev = mp0->b_next = NULL; 1108 freemsg(mp0); 1109 } 1110 mutex_exit(&t->tbf_lock); 1111 1112 /* 1113 * Always clear cache when vifs change. 1114 * No need to get last_encap_lock since we are running as a writer. 
1115 */ 1116 mutex_enter(&ipst->ips_last_encap_lock); 1117 if (vifp == ipst->ips_last_encap_vif) { 1118 ipst->ips_last_encap_vif = NULL; 1119 ipst->ips_last_encap_src = 0; 1120 } 1121 mutex_exit(&ipst->ips_last_encap_lock); 1122 1123 mutex_destroy(&t->tbf_lock); 1124 1125 bzero(vifp->v_tbf, sizeof (*(vifp->v_tbf))); 1126 1127 /* Adjust numvifs down */ 1128 mutex_enter(&ipst->ips_numvifs_mutex); 1129 for (vifi = ipst->ips_numvifs; vifi != 0; vifi--) /* vifi is unsigned */ 1130 if (ipst->ips_vifs[vifi - 1].v_lcl_addr.s_addr != 0) 1131 break; 1132 ipst->ips_numvifs = vifi; 1133 mutex_exit(&ipst->ips_numvifs_mutex); 1134 1135 bzero(vifp, sizeof (*vifp)); 1136 } 1137 1138 static int 1139 del_vif(vifi_t *vifip, conn_t *connp, mblk_t *first_mp, ip_stack_t *ipst) 1140 { 1141 struct vif *vifp = ipst->ips_vifs + *vifip; 1142 ipsq_t *ipsq; 1143 1144 if (*vifip >= ipst->ips_numvifs) 1145 return (EINVAL); 1146 1147 1148 mutex_enter(&vifp->v_lock); 1149 /* 1150 * Not initialized 1151 * Here we are not looking at the vif that is being initialized 1152 * i.e vifp->v_marks == 0 and refcnt > 0. 1153 */ 1154 if (vifp->v_lcl_addr.s_addr == 0 || 1155 !(vifp->v_marks & VIF_MARK_GOOD)) { 1156 mutex_exit(&vifp->v_lock); 1157 return (EADDRNOTAVAIL); 1158 } 1159 1160 /* 1161 * This is an optimization, if first_mp == NULL 1162 * than we are being called from reset_mrt_vif_ipif() 1163 * so we already have exclusive access to the ipsq. 1164 * the ASSERT below is a check for this condition. 1165 */ 1166 if (first_mp != NULL && 1167 !(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) { 1168 ASSERT(connp != NULL); 1169 /* 1170 * We have to be exclusive as we have to call ip_delmulti() 1171 * This is the best position to try to be exclusive in case 1172 * we have to wait. 
1173 */ 1174 ipsq = ipsq_try_enter(vifp->v_ipif, NULL, CONNP_TO_WQ(connp), 1175 first_mp, ip_restart_optmgmt, NEW_OP, B_TRUE); 1176 if ((ipsq) == NULL) { 1177 mutex_exit(&vifp->v_lock); 1178 return (EINPROGRESS); 1179 } 1180 /* recheck after being exclusive */ 1181 if (vifp->v_lcl_addr.s_addr == 0 || 1182 !vifp->v_marks & VIF_MARK_GOOD) { 1183 /* 1184 * someone beat us. 1185 */ 1186 mutex_exit(&vifp->v_lock); 1187 ipsq_exit(ipsq); 1188 return (EADDRNOTAVAIL); 1189 } 1190 } 1191 1192 1193 ASSERT(IAM_WRITER_IPIF(vifp->v_ipif)); 1194 1195 1196 /* 1197 * add a refhold so that ipif does not go away while 1198 * there are still users, this will be released in del_vifp 1199 * when we free the vif. 1200 */ 1201 ipif_refhold(vifp->v_ipif); 1202 1203 /* Clear VIF_MARK_GOOD and set VIF_MARK_CONDEMNED. */ 1204 vifp->v_marks &= ~VIF_MARK_GOOD; 1205 vifp->v_marks |= VIF_MARK_CONDEMNED; 1206 1207 /* Phyint only */ 1208 if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) { 1209 ipif_t *ipif = vifp->v_ipif; 1210 ASSERT(ipif != NULL); 1211 /* 1212 * should be OK to drop the lock as we 1213 * have marked this as CONDEMNED. 1214 */ 1215 mutex_exit(&(vifp)->v_lock); 1216 (void) ip_delmulti(INADDR_ANY, ipif, B_TRUE, B_TRUE); 1217 if (first_mp != NULL) 1218 ipsq_exit(ipsq); 1219 mutex_enter(&(vifp)->v_lock); 1220 } 1221 1222 /* 1223 * decreases the refcnt added in add_vif. 1224 */ 1225 VIF_REFRELE_LOCKED(vifp); 1226 return (0); 1227 } 1228 1229 /* 1230 * Add an mfc entry. 1231 */ 1232 static int 1233 add_mfc(struct mfcctl *mfccp, ip_stack_t *ipst) 1234 { 1235 struct mfc *rt; 1236 struct rtdetq *rte; 1237 ushort_t nstl; 1238 int i; 1239 struct mfcb *mfcbp; 1240 conn_t *mrouter = ipst->ips_ip_g_mrouter; 1241 1242 /* 1243 * The value of vifi is NO_VIF (==MAXVIFS) if Mrouted 1244 * did not have a real route for pkt. 
1245 * We want this pkt without rt installed in the mfctable to prevent 1246 * multiiple tries, so go ahead and put it in mfctable, it will 1247 * be discarded later in ip_mdq() because the child is NULL. 1248 */ 1249 1250 /* Error checking, out of bounds? */ 1251 if (mfccp->mfcc_parent > MAXVIFS) { 1252 ip0dbg(("ADD_MFC: mfcc_parent out of range %d", 1253 (int)mfccp->mfcc_parent)); 1254 return (EINVAL); 1255 } 1256 1257 if ((mfccp->mfcc_parent != NO_VIF) && 1258 (ipst->ips_vifs[mfccp->mfcc_parent].v_ipif == NULL)) { 1259 ip0dbg(("ADD_MFC: NULL ipif for parent vif %d\n", 1260 (int)mfccp->mfcc_parent)); 1261 return (EINVAL); 1262 } 1263 1264 if (is_mrouter_off(ipst)) { 1265 return (EINVAL); 1266 } 1267 1268 mfcbp = &ipst->ips_mfcs[MFCHASH(mfccp->mfcc_origin.s_addr, 1269 mfccp->mfcc_mcastgrp.s_addr)]; 1270 MFCB_REFHOLD(mfcbp); 1271 MFCFIND(mfcbp, mfccp->mfcc_origin.s_addr, 1272 mfccp->mfcc_mcastgrp.s_addr, rt); 1273 1274 /* If an entry already exists, just update the fields */ 1275 if (rt) { 1276 if (ipst->ips_ip_mrtdebug > 1) { 1277 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 1278 "add_mfc: update o %x grp %x parent %x", 1279 ntohl(mfccp->mfcc_origin.s_addr), 1280 ntohl(mfccp->mfcc_mcastgrp.s_addr), 1281 mfccp->mfcc_parent); 1282 } 1283 mutex_enter(&rt->mfc_mutex); 1284 rt->mfc_parent = mfccp->mfcc_parent; 1285 1286 mutex_enter(&ipst->ips_numvifs_mutex); 1287 for (i = 0; i < (int)ipst->ips_numvifs; i++) 1288 rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; 1289 mutex_exit(&ipst->ips_numvifs_mutex); 1290 mutex_exit(&rt->mfc_mutex); 1291 1292 MFCB_REFRELE(mfcbp); 1293 return (0); 1294 } 1295 1296 /* 1297 * Find the entry for which the upcall was made and update. 
1298 */ 1299 for (rt = mfcbp->mfcb_mfc, nstl = 0; rt; rt = rt->mfc_next) { 1300 mutex_enter(&rt->mfc_mutex); 1301 if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) && 1302 (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr) && 1303 (rt->mfc_rte != NULL) && 1304 !(rt->mfc_marks & MFCB_MARK_CONDEMNED)) { 1305 if (nstl++ != 0) 1306 cmn_err(CE_WARN, 1307 "add_mfc: %s o %x g %x p %x", 1308 "multiple kernel entries", 1309 ntohl(mfccp->mfcc_origin.s_addr), 1310 ntohl(mfccp->mfcc_mcastgrp.s_addr), 1311 mfccp->mfcc_parent); 1312 1313 if (ipst->ips_ip_mrtdebug > 1) { 1314 (void) mi_strlog(mrouter->conn_rq, 1, 1315 SL_TRACE, 1316 "add_mfc: o %x g %x p %x", 1317 ntohl(mfccp->mfcc_origin.s_addr), 1318 ntohl(mfccp->mfcc_mcastgrp.s_addr), 1319 mfccp->mfcc_parent); 1320 } 1321 fill_route(rt, mfccp, ipst); 1322 1323 /* 1324 * Prevent cleanup of cache entry. 1325 * Timer starts in ip_mforward. 1326 */ 1327 if (rt->mfc_timeout_id != 0) { 1328 timeout_id_t id; 1329 id = rt->mfc_timeout_id; 1330 /* 1331 * setting id to zero will avoid this 1332 * entry from being cleaned up in 1333 * expire_up_calls(). 1334 */ 1335 rt->mfc_timeout_id = 0; 1336 /* 1337 * dropping the lock is fine as we 1338 * have a refhold on the bucket. 1339 * so mfc cannot be freed. 1340 * The timeout can fire but it will see 1341 * that mfc_timeout_id == 0 and not cleanup. 1342 */ 1343 mutex_exit(&rt->mfc_mutex); 1344 (void) untimeout(id); 1345 mutex_enter(&rt->mfc_mutex); 1346 } 1347 1348 /* 1349 * Send all pkts that are queued waiting for the upcall. 1350 * ip_mdq param tun set to 0 - 1351 * the return value of ip_mdq() isn't used here, 1352 * so value we send doesn't matter. 
1353 */ 1354 while (rt->mfc_rte != NULL) { 1355 rte = rt->mfc_rte; 1356 rt->mfc_rte = rte->rte_next; 1357 mutex_exit(&rt->mfc_mutex); 1358 (void) ip_mdq(rte->mp, (ipha_t *) 1359 rte->mp->b_rptr, rte->ill, 0, rt); 1360 freemsg(rte->mp); 1361 mi_free((char *)rte); 1362 mutex_enter(&rt->mfc_mutex); 1363 } 1364 } 1365 mutex_exit(&rt->mfc_mutex); 1366 } 1367 1368 1369 /* 1370 * It is possible that an entry is being inserted without an upcall 1371 */ 1372 if (nstl == 0) { 1373 mutex_enter(&(mfcbp->mfcb_lock)); 1374 if (ipst->ips_ip_mrtdebug > 1) { 1375 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 1376 "add_mfc: no upcall o %x g %x p %x", 1377 ntohl(mfccp->mfcc_origin.s_addr), 1378 ntohl(mfccp->mfcc_mcastgrp.s_addr), 1379 mfccp->mfcc_parent); 1380 } 1381 if (is_mrouter_off(ipst)) { 1382 mutex_exit(&mfcbp->mfcb_lock); 1383 MFCB_REFRELE(mfcbp); 1384 return (EINVAL); 1385 } 1386 1387 for (rt = mfcbp->mfcb_mfc; rt; rt = rt->mfc_next) { 1388 1389 mutex_enter(&rt->mfc_mutex); 1390 if ((rt->mfc_origin.s_addr == 1391 mfccp->mfcc_origin.s_addr) && 1392 (rt->mfc_mcastgrp.s_addr == 1393 mfccp->mfcc_mcastgrp.s_addr) && 1394 (!(rt->mfc_marks & MFCB_MARK_CONDEMNED))) { 1395 fill_route(rt, mfccp, ipst); 1396 mutex_exit(&rt->mfc_mutex); 1397 break; 1398 } 1399 mutex_exit(&rt->mfc_mutex); 1400 } 1401 1402 /* No upcall, so make a new entry into mfctable */ 1403 if (rt == NULL) { 1404 rt = (struct mfc *)mi_zalloc(sizeof (struct mfc)); 1405 if (rt == NULL) { 1406 ip1dbg(("add_mfc: out of memory\n")); 1407 mutex_exit(&mfcbp->mfcb_lock); 1408 MFCB_REFRELE(mfcbp); 1409 return (ENOBUFS); 1410 } 1411 1412 /* Insert new entry at head of hash chain */ 1413 mutex_enter(&rt->mfc_mutex); 1414 fill_route(rt, mfccp, ipst); 1415 1416 /* Link into table */ 1417 rt->mfc_next = mfcbp->mfcb_mfc; 1418 mfcbp->mfcb_mfc = rt; 1419 mutex_exit(&rt->mfc_mutex); 1420 } 1421 mutex_exit(&mfcbp->mfcb_lock); 1422 } 1423 1424 MFCB_REFRELE(mfcbp); 1425 return (0); 1426 } 1427 1428 /* 1429 * Fills in mfc structure from 
mrouted mfcctl. 1430 */ 1431 static void 1432 fill_route(struct mfc *rt, struct mfcctl *mfccp, ip_stack_t *ipst) 1433 { 1434 int i; 1435 1436 rt->mfc_origin = mfccp->mfcc_origin; 1437 rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp; 1438 rt->mfc_parent = mfccp->mfcc_parent; 1439 mutex_enter(&ipst->ips_numvifs_mutex); 1440 for (i = 0; i < (int)ipst->ips_numvifs; i++) { 1441 rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; 1442 } 1443 mutex_exit(&ipst->ips_numvifs_mutex); 1444 /* Initialize pkt counters per src-grp */ 1445 rt->mfc_pkt_cnt = 0; 1446 rt->mfc_byte_cnt = 0; 1447 rt->mfc_wrong_if = 0; 1448 rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_nsec = 0; 1449 1450 } 1451 1452 static void 1453 free_queue(struct mfc *mfcp) 1454 { 1455 struct rtdetq *rte0; 1456 1457 /* 1458 * Drop all queued upcall packets. 1459 * Free the mbuf with the pkt. 1460 */ 1461 while ((rte0 = mfcp->mfc_rte) != NULL) { 1462 mfcp->mfc_rte = rte0->rte_next; 1463 freemsg(rte0->mp); 1464 mi_free((char *)rte0); 1465 } 1466 } 1467 /* 1468 * go thorugh the hash bucket and free all the entries marked condemned. 
1469 */ 1470 void 1471 release_mfc(struct mfcb *mfcbp) 1472 { 1473 struct mfc *current_mfcp; 1474 struct mfc *prev_mfcp; 1475 1476 prev_mfcp = current_mfcp = mfcbp->mfcb_mfc; 1477 1478 while (current_mfcp != NULL) { 1479 if (current_mfcp->mfc_marks & MFCB_MARK_CONDEMNED) { 1480 if (current_mfcp == mfcbp->mfcb_mfc) { 1481 mfcbp->mfcb_mfc = current_mfcp->mfc_next; 1482 free_queue(current_mfcp); 1483 mi_free(current_mfcp); 1484 prev_mfcp = current_mfcp = mfcbp->mfcb_mfc; 1485 continue; 1486 } 1487 ASSERT(prev_mfcp != NULL); 1488 prev_mfcp->mfc_next = current_mfcp->mfc_next; 1489 free_queue(current_mfcp); 1490 mi_free(current_mfcp); 1491 current_mfcp = NULL; 1492 } else { 1493 prev_mfcp = current_mfcp; 1494 } 1495 1496 current_mfcp = prev_mfcp->mfc_next; 1497 1498 } 1499 mfcbp->mfcb_marks &= ~MFCB_MARK_CONDEMNED; 1500 ASSERT(mfcbp->mfcb_mfc != NULL || mfcbp->mfcb_marks == 0); 1501 } 1502 1503 /* 1504 * Delete an mfc entry. 1505 */ 1506 static int 1507 del_mfc(struct mfcctl *mfccp, ip_stack_t *ipst) 1508 { 1509 struct in_addr origin; 1510 struct in_addr mcastgrp; 1511 struct mfc *rt; 1512 uint_t hash; 1513 conn_t *mrouter = ipst->ips_ip_g_mrouter; 1514 1515 origin = mfccp->mfcc_origin; 1516 mcastgrp = mfccp->mfcc_mcastgrp; 1517 hash = MFCHASH(origin.s_addr, mcastgrp.s_addr); 1518 1519 if (ipst->ips_ip_mrtdebug > 1) { 1520 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 1521 "del_mfc: o %x g %x", 1522 ntohl(origin.s_addr), 1523 ntohl(mcastgrp.s_addr)); 1524 } 1525 1526 MFCB_REFHOLD(&ipst->ips_mfcs[hash]); 1527 1528 /* Find mfc in mfctable, finds only entries without upcalls */ 1529 for (rt = ipst->ips_mfcs[hash].mfcb_mfc; rt; rt = rt->mfc_next) { 1530 mutex_enter(&rt->mfc_mutex); 1531 if (origin.s_addr == rt->mfc_origin.s_addr && 1532 mcastgrp.s_addr == rt->mfc_mcastgrp.s_addr && 1533 rt->mfc_rte == NULL && 1534 !(rt->mfc_marks & MFCB_MARK_CONDEMNED)) 1535 break; 1536 mutex_exit(&rt->mfc_mutex); 1537 } 1538 1539 /* 1540 * Return if there was an upcall (mfc_rte != NULL, 
1541 * or rt not in mfctable. 1542 */ 1543 if (rt == NULL) { 1544 MFCB_REFRELE(&ipst->ips_mfcs[hash]); 1545 return (EADDRNOTAVAIL); 1546 } 1547 1548 1549 /* 1550 * no need to hold lock as we have a reference. 1551 */ 1552 ipst->ips_mfcs[hash].mfcb_marks |= MFCB_MARK_CONDEMNED; 1553 /* error checking */ 1554 if (rt->mfc_timeout_id != 0) { 1555 ip0dbg(("del_mfc: TIMEOUT NOT 0, rte not null")); 1556 /* 1557 * Its ok to drop the lock, the struct cannot be freed 1558 * since we have a ref on the hash bucket. 1559 */ 1560 rt->mfc_timeout_id = 0; 1561 mutex_exit(&rt->mfc_mutex); 1562 (void) untimeout(rt->mfc_timeout_id); 1563 mutex_enter(&rt->mfc_mutex); 1564 } 1565 1566 ASSERT(rt->mfc_rte == NULL); 1567 1568 1569 /* 1570 * Delete the entry from the cache 1571 */ 1572 rt->mfc_marks |= MFCB_MARK_CONDEMNED; 1573 mutex_exit(&rt->mfc_mutex); 1574 1575 MFCB_REFRELE(&ipst->ips_mfcs[hash]); 1576 1577 return (0); 1578 } 1579 1580 #define TUNNEL_LEN 12 /* # bytes of IP option for tunnel encapsulation */ 1581 1582 /* 1583 * IP multicast forwarding function. This function assumes that the packet 1584 * pointed to by ipha has arrived on (or is about to be sent to) the interface 1585 * pointed to by "ill", and the packet is to be relayed to other networks 1586 * that have members of the packet's destination IP multicast group. 1587 * 1588 * The packet is returned unscathed to the caller, unless it is 1589 * erroneous, in which case a -1 value tells the caller (IP) 1590 * to discard it. 1591 * 1592 * Unlike BSD, SunOS 5.x needs to return to IP info about 1593 * whether pkt came in thru a tunnel, so it can be discarded, unless 1594 * it's IGMP. In BSD, the ifp is bogus for tunnels, so pkt won't try 1595 * to be delivered. 
* Return values are 0 - pkt is okay and phyint
 *		  -1 - pkt is malformed and to be tossed
 *		   1 - pkt came in on tunnel
 */
int
ip_mforward(ill_t *ill, ipha_t *ipha, mblk_t *mp)
{
	struct mfc 	*rt;
	ipaddr_t	src, dst, tunnel_src = 0;
	static int	srctun = 0;
	vifi_t		vifi;
	boolean_t	pim_reg_packet = B_FALSE;
	struct mfcb *mfcbp;
	ip_stack_t	*ipst = ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "ip_mforward: RECV ipha_src %x, ipha_dst %x, ill %s",
		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
		    ill->ill_name);
	}

	dst = ipha->ipha_dst;
	/*
	 * mp->b_prev carries either the PIM register marker or the tunnel
	 * source address (0 when the packet did not come through a tunnel).
	 */
	if ((uint32_t)(uintptr_t)mp->b_prev == PIM_REGISTER_MARKER)
		pim_reg_packet = B_TRUE;
	else
		tunnel_src = (ipaddr_t)(uintptr_t)mp->b_prev;

	/*
	 * Don't forward a packet with time-to-live of zero or one,
	 * or a packet destined to a local-only group.
	 */
	if (CLASSD(dst) && (ipha->ipha_ttl <= 1 ||
	    (ipaddr_t)ntohl(dst) <= INADDR_MAX_LOCAL_GROUP)) {
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: not forwarded ttl %d,"
			    " dst 0x%x ill %s",
			    ipha->ipha_ttl, ntohl(dst), ill->ill_name);
		}
		mp->b_prev = NULL;
		if (tunnel_src != 0)
			return (1);
		else
			return (0);
	}

	if ((tunnel_src != 0) || pim_reg_packet) {
		/*
		 * Packet arrived over an encapsulated tunnel or via a PIM
		 * register message. Both ip_mroute_decap() and pim_input()
		 * encode information in mp->b_prev.
		 */
		mp->b_prev = NULL;
		if (ipst->ips_ip_mrtdebug > 1) {
			if (tunnel_src != 0) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: ill %s arrived via ENCAP TUN",
				    ill->ill_name);
			} else if (pim_reg_packet) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: ill %s arrived via"
				    " REGISTER VIF",
				    ill->ill_name);
			}
		}
	} else if ((ipha->ipha_version_and_hdr_length & 0xf) <
	    (uint_t)(IP_SIMPLE_HDR_LENGTH + TUNNEL_LEN) >> 2 ||
	    ((uchar_t *)(ipha + 1))[1] != IPOPT_LSRR) {
		/* Packet arrived via a physical interface. */
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: ill %s arrived via PHYINT",
			    ill->ill_name);
		}

	} else {
		/*
		 * Packet arrived through a SRCRT tunnel.
		 * Source-route tunnels are no longer supported.
		 * Error message printed every 1000 times.
		 */
		if ((srctun++ % 1000) == 0) {
			cmn_err(CE_WARN,
			    "ip_mforward: received source-routed pkt from %x",
			    ntohl(ipha->ipha_src));
		}
		return (-1);
	}

	ipst->ips_mrtstat->mrts_fwd_in++;
	src = ipha->ipha_src;

	/* Find route in cache, return NULL if not there or upcalls q'ed. */

	/*
	 * Lock the mfctable against changes made by ip_mforward.
	 * Note that only add_mfc and del_mfc can remove entries and
	 * they run with exclusive access to IP. So we do not need to
	 * guard against the rt being deleted, so release lock after reading.
	 */

	if (is_mrouter_off(ipst))
		return (-1);

	mfcbp = &ipst->ips_mfcs[MFCHASH(src, dst)];
	MFCB_REFHOLD(mfcbp);
	MFCFIND(mfcbp, src, dst, rt);

	/* Entry exists, so forward if necessary */
	if (rt != NULL) {
		int ret = 0;
		ipst->ips_mrtstat->mrts_mfc_hits++;
		if (pim_reg_packet) {
			/* Register packets are attributed to the register vif. */
			ASSERT(ipst->ips_reg_vif_num != ALL_VIFS);
			ret = ip_mdq(mp, ipha,
			    ipst->ips_vifs[ipst->ips_reg_vif_num].
			    v_ipif->ipif_ill,
			    0, rt);
		} else {
			ret = ip_mdq(mp, ipha, ill, tunnel_src, rt);
		}

		MFCB_REFRELE(mfcbp);
		return (ret);

		/*
		 * Don't forward if we don't have a cache entry.  Mrouted will
		 * always provide a cache entry in response to an upcall.
		 */
	} else {
		/*
		 * If we don't have a route for packet's origin, make a copy
		 * of the packet and send message to routing daemon.
		 */
		struct mfc	*mfc_rt	 = NULL;
		mblk_t		*mp0	 = NULL;
		mblk_t		*mp_copy = NULL;
		struct rtdetq	*rte	 = NULL;
		struct rtdetq	*rte_m, *rte1, *prev_rte;
		uint_t		hash;
		int		npkts;
		boolean_t	new_mfc = B_FALSE;
		ipst->ips_mrtstat->mrts_mfc_misses++;
		/* BSD uses mrts_no_route++ */
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: no rte ill %s src %x g %x misses %d",
			    ill->ill_name, ntohl(src), ntohl(dst),
			    (int)ipst->ips_mrtstat->mrts_mfc_misses);
		}
		/*
		 * The order of the following code differs from the BSD code.
		 * Pre-mc3.5, the BSD code was incorrect and SunOS 5.x
		 * code works, so SunOS 5.x wasn't changed to conform to the
		 * BSD version.
		 */

		/* Lock mfctable. */
		hash = MFCHASH(src, dst);
		mutex_enter(&(ipst->ips_mfcs[hash].mfcb_lock));

		/*
		 * If we are turning off mrouted return an error
		 * (mfcbp and ips_mfcs[hash] name the same bucket: both are
		 * indexed by MFCHASH(src, dst)).
		 */
		if (is_mrouter_off(ipst)) {
			mutex_exit(&mfcbp->mfcb_lock);
			MFCB_REFRELE(mfcbp);
			return (-1);
		}

		/*
		 * Is there an upcall waiting for this packet?
		 * On a match the loop exits with mfc_rt->mfc_mutex held.
		 */
		for (mfc_rt = ipst->ips_mfcs[hash].mfcb_mfc; mfc_rt;
		    mfc_rt = mfc_rt->mfc_next) {
			mutex_enter(&mfc_rt->mfc_mutex);
			if (ipst->ips_ip_mrtdebug > 1) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: MFCTAB hash %d o 0x%x"
				    " g 0x%x\n",
				    hash, ntohl(mfc_rt->mfc_origin.s_addr),
				    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
			}
			/* There is an upcall */
			if ((src == mfc_rt->mfc_origin.s_addr) &&
			    (dst == mfc_rt->mfc_mcastgrp.s_addr) &&
			    (mfc_rt->mfc_rte != NULL) &&
			    !(mfc_rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
				break;
			}
			mutex_exit(&mfc_rt->mfc_mutex);
		}
		/* No upcall, so make a new entry into mfctable */
		if (mfc_rt == NULL) {
			mfc_rt = (struct mfc *)mi_zalloc(sizeof (struct mfc));
			if (mfc_rt == NULL) {
				ipst->ips_mrtstat->mrts_fwd_drop++;
				ip1dbg(("ip_mforward: out of memory "
				    "for mfc, mfc_rt\n"));
				goto error_return;
			} else
				new_mfc = B_TRUE;
			/* Get resources */
			/* TODO could copy header and dup rest */
			mp_copy = copymsg(mp);
			if (mp_copy == NULL) {
				ipst->ips_mrtstat->mrts_fwd_drop++;
				ip1dbg(("ip_mforward: out of memory for "
				    "mblk, mp_copy\n"));
				goto error_return;
			}
			/* Match the lock state of the "found" case above. */
			mutex_enter(&mfc_rt->mfc_mutex);
		}
		/* Get resources for rte, whether first rte or not first. */
		/* Add this packet into rtdetq */
		rte = (struct rtdetq *)mi_zalloc(sizeof (struct rtdetq));
		if (rte == NULL) {
			ipst->ips_mrtstat->mrts_fwd_drop++;
			mutex_exit(&mfc_rt->mfc_mutex);
			ip1dbg(("ip_mforward: out of memory for"
			    " rtdetq, rte\n"));
			goto error_return;
		}

		mp0 = copymsg(mp);
		if (mp0 == NULL) {
			ipst->ips_mrtstat->mrts_fwd_drop++;
			ip1dbg(("ip_mforward: out of memory for mblk, mp0\n"));
			mutex_exit(&mfc_rt->mfc_mutex);
			goto error_return;
		}
		rte->mp		= mp0;
		if (pim_reg_packet) {
			ASSERT(ipst->ips_reg_vif_num != ALL_VIFS);
			rte->ill =
			    ipst->ips_vifs[ipst->ips_reg_vif_num].
			    v_ipif->ipif_ill;
		} else {
			rte->ill = ill;
		}
		rte->rte_next	= NULL;

		/*
		 * Determine if upcall q (rtdetq) has overflowed.
		 * mfc_rt->mfc_rte is null by mi_zalloc
		 * if it is the first message.
		 */
		for (rte_m = mfc_rt->mfc_rte, npkts = 0; rte_m;
		    rte_m = rte_m->rte_next)
			npkts++;
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: upcalls %d\n", npkts);
		}
		if (npkts > MAX_UPQ) {
			ipst->ips_mrtstat->mrts_upq_ovflw++;
			mutex_exit(&mfc_rt->mfc_mutex);
			goto error_return;
		}

		if (npkts == 0) {	/* first upcall */
			int i = 0;
			/*
			 * Now finish installing the new mfc! Now that we have
			 * resources!  Insert new entry at head of hash chain.
			 * Use src and dst which are ipaddr_t's.
			 */
			mfc_rt->mfc_origin.s_addr = src;
			mfc_rt->mfc_mcastgrp.s_addr = dst;

			mutex_enter(&ipst->ips_numvifs_mutex);
			for (i = 0; i < (int)ipst->ips_numvifs; i++)
				mfc_rt->mfc_ttls[i] = 0;
			mutex_exit(&ipst->ips_numvifs_mutex);
			mfc_rt->mfc_parent = ALL_VIFS;

			/* Link into table */
			if (ipst->ips_ip_mrtdebug > 1) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: NEW MFCTAB hash %d o 0x%x "
				    "g 0x%x\n", hash,
				    ntohl(mfc_rt->mfc_origin.s_addr),
				    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
			}
			mfc_rt->mfc_next = ipst->ips_mfcs[hash].mfcb_mfc;
			ipst->ips_mfcs[hash].mfcb_mfc = mfc_rt;
			mfc_rt->mfc_rte = NULL;
		}

		/* Link in the upcall */
		/* First upcall */
		if (mfc_rt->mfc_rte == NULL)
			mfc_rt->mfc_rte = rte;
		else {
			/* not the first upcall: append at the tail */
			prev_rte = mfc_rt->mfc_rte;
			for (rte1 = mfc_rt->mfc_rte->rte_next; rte1;
			    prev_rte = rte1, rte1 = rte1->rte_next)
				;
			prev_rte->rte_next = rte;
		}

		/*
		 * No upcalls waiting, this is first one, so send a message to
		 * routing daemon to install a route into kernel table.
		 */
		if (npkts == 0) {
			struct igmpmsg	*im;
			/* ipha_protocol is 0, for upcall */
			ASSERT(mp_copy != NULL);
			/* The igmpmsg header overlays the copied IP header. */
			im = (struct igmpmsg *)mp_copy->b_rptr;
			im->im_msgtype	= IGMPMSG_NOCACHE;
			im->im_mbz = 0;
			mutex_enter(&ipst->ips_numvifs_mutex);
			if (pim_reg_packet) {
				im->im_vif = (uchar_t)ipst->ips_reg_vif_num;
				mutex_exit(&ipst->ips_numvifs_mutex);
			} else {
				/*
				 * XXX do we need to hold locks here ?
				 */
				for (vifi = 0;
				    vifi < ipst->ips_numvifs;
				    vifi++) {
					if (ipst->ips_vifs[vifi].v_ipif == NULL)
						continue;
					if (ipst->ips_vifs[vifi].
					    v_ipif->ipif_ill == ill) {
						im->im_vif = (uchar_t)vifi;
						break;
					}
				}
				mutex_exit(&ipst->ips_numvifs_mutex);
				ASSERT(vifi < ipst->ips_numvifs);
			}

			ipst->ips_mrtstat->mrts_upcalls++;
			/* Timer to discard upcalls if mrouted is too slow */
			mfc_rt->mfc_timeout_id = timeout(expire_upcalls,
			    mfc_rt, EXPIRE_TIMEOUT * UPCALL_EXPIRE);
			mutex_exit(&mfc_rt->mfc_mutex);
			mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
			/* Pass to RAWIP */
			(mrouter->conn_recv)(mrouter, mp_copy, NULL);
		} else {
			mutex_exit(&mfc_rt->mfc_mutex);
			mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
			freemsg(mp_copy);
		}

		MFCB_REFRELE(mfcbp);
		if (tunnel_src != 0)
			return (1);
		else
			return (0);
	/*
	 * Common failure exit.  Callers have already dropped
	 * mfc_rt->mfc_mutex (if they held it); the bucket lock and the
	 * bucket reference are released here, then whatever was allocated
	 * by this call is freed.
	 */
	error_return:
		mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
		MFCB_REFRELE(mfcbp);
		if (mfc_rt != NULL && (new_mfc == B_TRUE))
			mi_free((char *)mfc_rt);
		if (rte != NULL)
			mi_free((char *)rte);
		if (mp_copy != NULL)
			freemsg(mp_copy);
		if (mp0 != NULL)
			freemsg(mp0);
		return (-1);
	}
}

/*
 * Clean up the mfctable cache entry if upcall is not serviced.
 * SunOS 5.x has timeout per mfc, unlike BSD which has one timer.
1973 */ 1974 static void 1975 expire_upcalls(void *arg) 1976 { 1977 struct mfc *mfc_rt = arg; 1978 uint_t hash; 1979 struct mfc *prev_mfc, *mfc0; 1980 ip_stack_t *ipst; 1981 conn_t *mrouter; 1982 1983 if (mfc_rt->mfc_rte == NULL || mfc_rt->mfc_rte->ill != NULL) { 1984 cmn_err(CE_WARN, "expire_upcalls: no ILL\n"); 1985 return; 1986 } 1987 ipst = mfc_rt->mfc_rte->ill->ill_ipst; 1988 mrouter = ipst->ips_ip_g_mrouter; 1989 1990 hash = MFCHASH(mfc_rt->mfc_origin.s_addr, mfc_rt->mfc_mcastgrp.s_addr); 1991 if (ipst->ips_ip_mrtdebug > 1) { 1992 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 1993 "expire_upcalls: hash %d s %x g %x", 1994 hash, ntohl(mfc_rt->mfc_origin.s_addr), 1995 ntohl(mfc_rt->mfc_mcastgrp.s_addr)); 1996 } 1997 MFCB_REFHOLD(&ipst->ips_mfcs[hash]); 1998 mutex_enter(&mfc_rt->mfc_mutex); 1999 /* 2000 * if timeout has been set to zero, than the 2001 * entry has been filled, no need to delete it. 2002 */ 2003 if (mfc_rt->mfc_timeout_id == 0) 2004 goto done; 2005 ipst->ips_mrtstat->mrts_cache_cleanups++; 2006 mfc_rt->mfc_timeout_id = 0; 2007 2008 /* Determine entry to be cleaned up in cache table. */ 2009 for (prev_mfc = mfc0 = ipst->ips_mfcs[hash].mfcb_mfc; mfc0; 2010 prev_mfc = mfc0, mfc0 = mfc0->mfc_next) 2011 if (mfc0 == mfc_rt) 2012 break; 2013 2014 /* del_mfc takes care of gone mfcs */ 2015 ASSERT(prev_mfc != NULL); 2016 ASSERT(mfc0 != NULL); 2017 2018 /* 2019 * Delete the entry from the cache 2020 */ 2021 ipst->ips_mfcs[hash].mfcb_marks |= MFCB_MARK_CONDEMNED; 2022 mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED; 2023 2024 /* 2025 * release_mfc will drop all queued upcall packets. 2026 * and will free the mbuf with the pkt, if, timing info. 2027 */ 2028 done: 2029 mutex_exit(&mfc_rt->mfc_mutex); 2030 MFCB_REFRELE(&ipst->ips_mfcs[hash]); 2031 } 2032 2033 /* 2034 * Packet forwarding routine once entry in the cache is made. 
*
 * mp/ipha	the packet and its IP header.
 * ill		interface the packet is considered to have arrived on.
 * tunnel_src	tunnel source address, or 0 if the packet did not arrive
 *		through a tunnel.
 * rt		the cache entry for the packet's (source, group).
 *
 * Returns -1 if the packet is to be dropped, 1 if tunnel_src is non-zero
 * and 0 otherwise.
 */
static int
ip_mdq(mblk_t *mp, ipha_t *ipha, ill_t *ill, ipaddr_t tunnel_src,
    struct mfc *rt)
{
	ill_t *vill;
	vifi_t vifi;
	struct vif *vifp;
	ipaddr_t dst = ipha->ipha_dst;
	size_t  plen = msgdsize(mp);
	vifi_t num_of_vifs;
	ip_stack_t	*ipst = ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "ip_mdq: SEND src %x, ipha_dst %x, ill %s",
		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
		    ill->ill_name);
	}

	/* Macro to send packet on vif */
#define	MC_SEND(ipha, mp, vifp, dst) { \
	if ((vifp)->v_flags & VIFF_TUNNEL) \
		encap_send((ipha), (mp), (vifp), (dst)); \
	else if ((vifp)->v_flags & VIFF_REGISTER) \
		register_send((ipha), (mp), (vifp), (dst)); \
	else \
		phyint_send((ipha), (mp), (vifp), (dst)); \
}

	vifi = rt->mfc_parent;

	/*
	 * The value of vifi is MAXVIFS if the pkt had no parent, i.e.,
	 * Mrouted had no route.
	 * We wanted the route installed in the mfctable to prevent multiple
	 * tries, so it passed add_mfc(), but is discarded here. The v_ipif is
	 * NULL so we don't want to check the ill. Still needed as of Mrouted
	 * 3.6.
	 */
	if (vifi == NO_VIF) {
		ip1dbg(("ip_mdq: no route for origin ill %s, vifi is NO_VIF\n",
		    ill->ill_name));
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mdq: vifi is NO_VIF ill = %s", ill->ill_name);
		}
		return (-1);	/* drop pkt */
	}

	/*
	 * From here until the matching unlock_good_vif() every return path
	 * must release the parent vif.
	 */
	if (!lock_good_vif(&ipst->ips_vifs[vifi]))
		return (-1);
	/*
	 * The MFC entries are not cleaned up when an ipif goes
	 * away thus this code has to guard against an MFC referencing
	 * an ipif that has been closed. Note: reset_mrt_vif_ipif
	 * sets the v_ipif to NULL when the ipif disappears.
	 */
	ASSERT(ipst->ips_vifs[vifi].v_ipif != NULL);

	if (vifi >= ipst->ips_numvifs) {
		cmn_err(CE_WARN, "ip_mdq: illegal vifi %d numvifs "
		    "%d ill %s viftable ill %s\n",
		    (int)vifi, (int)ipst->ips_numvifs, ill->ill_name,
		    ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name);
		unlock_good_vif(&ipst->ips_vifs[vifi]);
		return (-1);
	}
	/*
	 * Don't forward if it didn't arrive from the parent vif for its
	 * origin.
	 */
	vill = ipst->ips_vifs[vifi].v_ipif->ipif_ill;
	if ((vill != ill && !IS_IN_SAME_ILLGRP(vill, ill)) ||
	    (ipst->ips_vifs[vifi].v_rmt_addr.s_addr != tunnel_src)) {
		/* Came in the wrong interface */
		ip1dbg(("ip_mdq: arrived wrong if, vifi %d "
		    "numvifs %d ill %s viftable ill %s\n",
		    (int)vifi, (int)ipst->ips_numvifs, ill->ill_name,
		    vill->ill_name));
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mdq: arrived wrong if, vifi %d ill "
			    "%s viftable ill %s\n",
			    (int)vifi, ill->ill_name, vill->ill_name);
		}
		ipst->ips_mrtstat->mrts_wrong_if++;
		rt->mfc_wrong_if++;

		/*
		 * If we are doing PIM assert processing and we are forwarding
		 * packets on this interface, and it is a broadcast medium
		 * interface (and not a tunnel), send a message to the routing.
		 *
		 * We use the first ipif on the list, since it's all we have.
		 * Chances are the ipif_flags are the same for ipifs on the ill.
		 */
		if (ipst->ips_pim_assert && rt->mfc_ttls[vifi] > 0 &&
		    (ill->ill_ipif->ipif_flags & IPIF_BROADCAST) &&
		    !(ipst->ips_vifs[vifi].v_flags & VIFF_TUNNEL)) {
			mblk_t		*mp_copy;
			struct igmpmsg	*im;

			/* TODO could copy header and dup rest */
			mp_copy = copymsg(mp);
			if (mp_copy == NULL) {
				ipst->ips_mrtstat->mrts_fwd_drop++;
				ip1dbg(("ip_mdq: out of memory "
				    "for mblk, mp_copy\n"));
				unlock_good_vif(&ipst->ips_vifs[vifi]);
				return (-1);
			}

			/* The igmpmsg header overlays the copied IP header. */
			im = (struct igmpmsg *)mp_copy->b_rptr;
			im->im_msgtype = IGMPMSG_WRONGVIF;
			im->im_mbz = 0;
			im->im_vif = (ushort_t)vifi;
			/* Pass to RAWIP */
			(mrouter->conn_recv)(mrouter, mp_copy, NULL);
		}
		unlock_good_vif(&ipst->ips_vifs[vifi]);
		if (tunnel_src != 0)
			return (1);
		else
			return (0);
	}
	/*
	 * If I sourced this packet, it counts as output, else it was input.
	 */
	if (ipha->ipha_src == ipst->ips_vifs[vifi].v_lcl_addr.s_addr) {
		ipst->ips_vifs[vifi].v_pkt_out++;
		ipst->ips_vifs[vifi].v_bytes_out += plen;
	} else {
		ipst->ips_vifs[vifi].v_pkt_in++;
		ipst->ips_vifs[vifi].v_bytes_in += plen;
	}
	mutex_enter(&rt->mfc_mutex);
	rt->mfc_pkt_cnt++;
	rt->mfc_byte_cnt += plen;
	mutex_exit(&rt->mfc_mutex);
	unlock_good_vif(&ipst->ips_vifs[vifi]);
	/*
	 * For each vif, decide if a copy of the packet should be forwarded.
	 * Forward if:
	 *		- the vif threshold ttl is non-zero AND
	 *		- the pkt ttl exceeds the vif's threshold
	 * A non-zero mfc_ttl indicates that the vif is part of
	 * the output set for the mfc entry.
	 */
	/* Snapshot the vif count; the loop below runs without the mutex. */
	mutex_enter(&ipst->ips_numvifs_mutex);
	num_of_vifs = ipst->ips_numvifs;
	mutex_exit(&ipst->ips_numvifs_mutex);
	for (vifp = ipst->ips_vifs, vifi = 0;
	    vifi < num_of_vifs;
	    vifp++, vifi++) {
		if (!lock_good_vif(vifp))
			continue;
		if ((rt->mfc_ttls[vifi] > 0) &&
		    (ipha->ipha_ttl > rt->mfc_ttls[vifi])) {
			/*
			 * lock_good_vif should not have succedded if
			 * v_ipif is null.
			 */
			ASSERT(vifp->v_ipif != NULL);
			vifp->v_pkt_out++;
			vifp->v_bytes_out += plen;
			MC_SEND(ipha, mp, vifp, dst);
			ipst->ips_mrtstat->mrts_fwd_out++;
		}
		unlock_good_vif(vifp);
	}
	if (tunnel_src != 0)
		return (1);
	else
		return (0);
}

/*
 * Send the packet on physical interface.
 * Caller assumes can continue to use mp on return.
 *
 * A copy of mp is made; the copy is sent directly with tbf_send_packet()
 * when the vif has no rate limit (v_rate_limit <= 0) and is handed to
 * tbf_control() otherwise.  If the copy cannot be allocated the drop
 * counter is bumped and nothing is sent.
 */
/* ARGSUSED */
static void
phyint_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
{
	mblk_t 	*mp_copy;
	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	/* Make a new reference to the packet */
	mp_copy = copymsg(mp);	/* TODO could copy header and dup rest */
	if (mp_copy == NULL) {
		ipst->ips_mrtstat->mrts_fwd_drop++;
		ip1dbg(("phyint_send: out of memory for mblk, mp_copy\n"));
		return;
	}
	if (vifp->v_rate_limit <= 0)
		tbf_send_packet(vifp, mp_copy);
	else  {
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "phyint_send: tbf_contr rate %d "
			    "vifp 0x%p mp 0x%p dst 0x%x",
			    vifp->v_rate_limit, (void *)vifp, (void *)mp, dst);
		}
		tbf_control(vifp, mp_copy, (ipha_t *)mp_copy->b_rptr);
	}
}

/*
 * Send the whole packet for REGISTER encapsulation to PIM daemon
 * Caller assumes it can continue to use mp on return.
2248 */ 2249 /* ARGSUSED */ 2250 static void 2251 register_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst) 2252 { 2253 struct igmpmsg *im; 2254 mblk_t *mp_copy; 2255 ipha_t *ipha_copy; 2256 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 2257 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2258 2259 if (ipst->ips_ip_mrtdebug > 1) { 2260 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2261 "register_send: src %x, dst %x\n", 2262 ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst)); 2263 } 2264 2265 /* 2266 * Copy the old packet & pullup its IP header into the new mblk_t so we 2267 * can modify it. Try to fill the new mblk_t since if we don't the 2268 * ethernet driver will. 2269 */ 2270 mp_copy = allocb(sizeof (struct igmpmsg) + sizeof (ipha_t), BPRI_MED); 2271 if (mp_copy == NULL) { 2272 ++ipst->ips_mrtstat->mrts_pim_nomemory; 2273 if (ipst->ips_ip_mrtdebug > 3) { 2274 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2275 "register_send: allocb failure."); 2276 } 2277 return; 2278 } 2279 2280 /* 2281 * Bump write pointer to account for igmpmsg being added. 2282 */ 2283 mp_copy->b_wptr = mp_copy->b_rptr + sizeof (struct igmpmsg); 2284 2285 /* 2286 * Chain packet to new mblk_t. 2287 */ 2288 if ((mp_copy->b_cont = copymsg(mp)) == NULL) { 2289 ++ipst->ips_mrtstat->mrts_pim_nomemory; 2290 if (ipst->ips_ip_mrtdebug > 3) { 2291 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2292 "register_send: copymsg failure."); 2293 } 2294 freeb(mp_copy); 2295 return; 2296 } 2297 2298 /* 2299 * icmp_input() asserts that IP version field is set to an 2300 * appropriate version. Hence, the struct igmpmsg that this really 2301 * becomes, needs to have the correct IP version field. 2302 */ 2303 ipha_copy = (ipha_t *)mp_copy->b_rptr; 2304 *ipha_copy = multicast_encap_iphdr; 2305 2306 /* 2307 * The kernel uses the struct igmpmsg header to encode the messages to 2308 * the multicast routing daemon. 
Fill in the fields in the header 2309 * starting with the message type which is IGMPMSG_WHOLEPKT 2310 */ 2311 im = (struct igmpmsg *)mp_copy->b_rptr; 2312 im->im_msgtype = IGMPMSG_WHOLEPKT; 2313 im->im_src.s_addr = ipha->ipha_src; 2314 im->im_dst.s_addr = ipha->ipha_dst; 2315 2316 /* 2317 * Must Be Zero. This is because the struct igmpmsg is really an IP 2318 * header with renamed fields and the multicast routing daemon uses 2319 * an ipha_protocol (aka im_mbz) of 0 to distinguish these messages. 2320 */ 2321 im->im_mbz = 0; 2322 2323 ++ipst->ips_mrtstat->mrts_upcalls; 2324 if (!canputnext(mrouter->conn_rq)) { 2325 ++ipst->ips_mrtstat->mrts_pim_regsend_drops; 2326 if (ipst->ips_ip_mrtdebug > 3) { 2327 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2328 "register_send: register upcall failure."); 2329 } 2330 freemsg(mp_copy); 2331 } else { 2332 /* Pass to RAWIP */ 2333 (mrouter->conn_recv)(mrouter, mp_copy, NULL); 2334 } 2335 } 2336 2337 /* 2338 * pim_validate_cksum handles verification of the checksum in the 2339 * pim header. For PIM Register packets, the checksum is calculated 2340 * across the PIM header only. For all other packets, the checksum 2341 * is for the PIM header and remainder of the packet. 2342 * 2343 * returns: B_TRUE, if checksum is okay. 2344 * B_FALSE, if checksum is not valid. 2345 */ 2346 static boolean_t 2347 pim_validate_cksum(mblk_t *mp, ipha_t *ip, struct pim *pimp) 2348 { 2349 mblk_t *mp_dup; 2350 2351 if ((mp_dup = dupmsg(mp)) == NULL) 2352 return (B_FALSE); 2353 2354 mp_dup->b_rptr += IPH_HDR_LENGTH(ip); 2355 if (pimp->pim_type == PIM_REGISTER) 2356 mp_dup->b_wptr = mp_dup->b_rptr + PIM_MINLEN; 2357 if (IP_CSUM(mp_dup, 0, 0)) { 2358 freemsg(mp_dup); 2359 return (B_FALSE); 2360 } 2361 freemsg(mp_dup); 2362 return (B_TRUE); 2363 } 2364 2365 /* 2366 * int 2367 * pim_input(queue_t *, mblk_t *, ill_t *ill) - Process PIM protocol packets. 2368 * IP Protocol 103. Register messages are decapsulated and sent 2369 * onto multicast forwarding. 
2370 */ 2371 int 2372 pim_input(queue_t *q, mblk_t *mp, ill_t *ill) 2373 { 2374 ipha_t *eip, *ip; 2375 int iplen, pimlen, iphlen; 2376 struct pim *pimp; /* pointer to a pim struct */ 2377 uint32_t *reghdr; 2378 ip_stack_t *ipst = ill->ill_ipst; 2379 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2380 2381 /* 2382 * Pullup the msg for PIM protocol processing. 2383 */ 2384 if (pullupmsg(mp, -1) == 0) { 2385 ++ipst->ips_mrtstat->mrts_pim_nomemory; 2386 freemsg(mp); 2387 return (-1); 2388 } 2389 2390 ip = (ipha_t *)mp->b_rptr; 2391 iplen = ip->ipha_length; 2392 iphlen = IPH_HDR_LENGTH(ip); 2393 pimlen = ntohs(iplen) - iphlen; 2394 2395 /* 2396 * Validate lengths 2397 */ 2398 if (pimlen < PIM_MINLEN) { 2399 ++ipst->ips_mrtstat->mrts_pim_malformed; 2400 if (ipst->ips_ip_mrtdebug > 1) { 2401 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2402 "pim_input: length not at least minlen"); 2403 } 2404 freemsg(mp); 2405 return (-1); 2406 } 2407 2408 /* 2409 * Point to the PIM header. 2410 */ 2411 pimp = (struct pim *)((caddr_t)ip + iphlen); 2412 2413 /* 2414 * Check the version number. 
2415 */ 2416 if (pimp->pim_vers != PIM_VERSION) { 2417 ++ipst->ips_mrtstat->mrts_pim_badversion; 2418 if (ipst->ips_ip_mrtdebug > 1) { 2419 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2420 "pim_input: unknown version of PIM"); 2421 } 2422 freemsg(mp); 2423 return (-1); 2424 } 2425 2426 /* 2427 * Validate the checksum 2428 */ 2429 if (!pim_validate_cksum(mp, ip, pimp)) { 2430 ++ipst->ips_mrtstat->mrts_pim_rcv_badcsum; 2431 if (ipst->ips_ip_mrtdebug > 1) { 2432 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2433 "pim_input: invalid checksum"); 2434 } 2435 freemsg(mp); 2436 return (-1); 2437 } 2438 2439 if (pimp->pim_type != PIM_REGISTER) 2440 return (0); 2441 2442 reghdr = (uint32_t *)(pimp + 1); 2443 eip = (ipha_t *)(reghdr + 1); 2444 2445 /* 2446 * check if the inner packet is destined to mcast group 2447 */ 2448 if (!CLASSD(eip->ipha_dst)) { 2449 ++ipst->ips_mrtstat->mrts_pim_badregisters; 2450 if (ipst->ips_ip_mrtdebug > 1) { 2451 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2452 "pim_input: Inner pkt not mcast .. !"); 2453 } 2454 freemsg(mp); 2455 return (-1); 2456 } 2457 if (ipst->ips_ip_mrtdebug > 1) { 2458 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2459 "register from %x, to %x, len %d", 2460 ntohl(eip->ipha_src), 2461 ntohl(eip->ipha_dst), 2462 ntohs(eip->ipha_length)); 2463 } 2464 /* 2465 * If the null register bit is not set, decapsulate 2466 * the packet before forwarding it. 2467 */ 2468 if (!(ntohl(*reghdr) & PIM_NULL_REGISTER)) { 2469 mblk_t *mp_copy; 2470 2471 /* Copy the message */ 2472 if ((mp_copy = copymsg(mp)) == NULL) { 2473 ++ipst->ips_mrtstat->mrts_pim_nomemory; 2474 freemsg(mp); 2475 return (-1); 2476 } 2477 2478 /* 2479 * Decapsulate the packet and give it to 2480 * register_mforward. 
2481 */ 2482 mp_copy->b_rptr += iphlen + sizeof (pim_t) + 2483 sizeof (*reghdr); 2484 if (register_mforward(q, mp_copy, ill) != 0) { 2485 freemsg(mp); 2486 return (-1); 2487 } 2488 } 2489 2490 /* 2491 * Pass all valid PIM packets up to any process(es) listening on a raw 2492 * PIM socket. For Solaris it is done right after pim_input() is 2493 * called. 2494 */ 2495 return (0); 2496 } 2497 2498 /* 2499 * PIM sparse mode hook. Called by pim_input after decapsulating 2500 * the packet. Loop back the packet, as if we have received it. 2501 * In pim_input() we have to check if the destination is a multicast address. 2502 */ 2503 /* ARGSUSED */ 2504 static int 2505 register_mforward(queue_t *q, mblk_t *mp, ill_t *ill) 2506 { 2507 ip_stack_t *ipst = ill->ill_ipst; 2508 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2509 2510 ASSERT(ipst->ips_reg_vif_num <= ipst->ips_numvifs); 2511 2512 if (ipst->ips_ip_mrtdebug > 3) { 2513 ipha_t *ipha; 2514 2515 ipha = (ipha_t *)mp->b_rptr; 2516 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2517 "register_mforward: src %x, dst %x\n", 2518 ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst)); 2519 } 2520 /* 2521 * Need to pass in to ip_mforward() the information that the 2522 * packet has arrived on the register_vif. We use the solution that 2523 * ip_mroute_decap() employs: use mp->b_prev to pass some information 2524 * to ip_mforward(). Nonzero value means the packet has arrived on a 2525 * tunnel (ip_mroute_decap() puts the address of the other side of the 2526 * tunnel there.) This is safe since ip_rput() either frees the packet 2527 * or passes it to ip_mforward(). We use 2528 * PIM_REGISTER_MARKER = 0xffffffff to indicate the has arrived on the 2529 * register vif. If in the future we have more than one register vifs, 2530 * then this will need re-examination. 
2531 */ 2532 mp->b_prev = (mblk_t *)PIM_REGISTER_MARKER; 2533 ++ipst->ips_mrtstat->mrts_pim_regforwards; 2534 ip_rput(q, mp); 2535 return (0); 2536 } 2537 2538 /* 2539 * Send an encapsulated packet. 2540 * Caller assumes can continue to use mp when routine returns. 2541 */ 2542 /* ARGSUSED */ 2543 static void 2544 encap_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst) 2545 { 2546 mblk_t *mp_copy; 2547 ipha_t *ipha_copy; 2548 size_t len; 2549 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 2550 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2551 2552 if (ipst->ips_ip_mrtdebug > 1) { 2553 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2554 "encap_send: vif %ld enter", 2555 (ptrdiff_t)(vifp - ipst->ips_vifs)); 2556 } 2557 len = ntohs(ipha->ipha_length); 2558 2559 /* 2560 * Copy the old packet & pullup it's IP header into the 2561 * new mbuf so we can modify it. Try to fill the new 2562 * mbuf since if we don't the ethernet driver will. 2563 */ 2564 mp_copy = allocb(32 + sizeof (multicast_encap_iphdr), BPRI_MED); 2565 if (mp_copy == NULL) 2566 return; 2567 mp_copy->b_rptr += 32; 2568 mp_copy->b_wptr = mp_copy->b_rptr + sizeof (multicast_encap_iphdr); 2569 if ((mp_copy->b_cont = copymsg(mp)) == NULL) { 2570 freeb(mp_copy); 2571 return; 2572 } 2573 2574 /* 2575 * Fill in the encapsulating IP header. 2576 * Remote tunnel dst in rmt_addr, from add_vif(). 2577 */ 2578 ipha_copy = (ipha_t *)mp_copy->b_rptr; 2579 *ipha_copy = multicast_encap_iphdr; 2580 ASSERT((len + sizeof (ipha_t)) <= IP_MAXPACKET); 2581 ipha_copy->ipha_length = htons(len + sizeof (ipha_t)); 2582 ipha_copy->ipha_src = vifp->v_lcl_addr.s_addr; 2583 ipha_copy->ipha_dst = vifp->v_rmt_addr.s_addr; 2584 ASSERT(ipha_copy->ipha_ident == 0); 2585 2586 /* Turn the encapsulated IP header back into a valid one. 
*/ 2587 ipha = (ipha_t *)mp_copy->b_cont->b_rptr; 2588 ipha->ipha_ttl--; 2589 ipha->ipha_hdr_checksum = 0; 2590 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 2591 2592 if (ipst->ips_ip_mrtdebug > 1) { 2593 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2594 "encap_send: group 0x%x", ntohl(ipha->ipha_dst)); 2595 } 2596 if (vifp->v_rate_limit <= 0) 2597 tbf_send_packet(vifp, mp_copy); 2598 else 2599 /* ipha is from the original header */ 2600 tbf_control(vifp, mp_copy, ipha); 2601 } 2602 2603 /* 2604 * De-encapsulate a packet and feed it back through IP input. 2605 * This routine is called whenever IP gets a packet with prototype 2606 * IPPROTO_ENCAP and a local destination address. 2607 */ 2608 void 2609 ip_mroute_decap(queue_t *q, mblk_t *mp, ill_t *ill) 2610 { 2611 ipha_t *ipha = (ipha_t *)mp->b_rptr; 2612 ipha_t *ipha_encap; 2613 int hlen = IPH_HDR_LENGTH(ipha); 2614 ipaddr_t src; 2615 struct vif *vifp; 2616 ip_stack_t *ipst = ill->ill_ipst; 2617 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2618 2619 /* 2620 * Dump the packet if it's not to a multicast destination or if 2621 * we don't have an encapsulating tunnel with the source. 2622 * Note: This code assumes that the remote site IP address 2623 * uniquely identifies the tunnel (i.e., that this site has 2624 * at most one tunnel with the remote site). 
2625 */ 2626 ipha_encap = (ipha_t *)((char *)ipha + hlen); 2627 if (!CLASSD(ipha_encap->ipha_dst)) { 2628 ipst->ips_mrtstat->mrts_bad_tunnel++; 2629 ip1dbg(("ip_mroute_decap: bad tunnel\n")); 2630 freemsg(mp); 2631 return; 2632 } 2633 src = (ipaddr_t)ipha->ipha_src; 2634 mutex_enter(&ipst->ips_last_encap_lock); 2635 if (src != ipst->ips_last_encap_src) { 2636 struct vif *vife; 2637 2638 vifp = ipst->ips_vifs; 2639 vife = vifp + ipst->ips_numvifs; 2640 ipst->ips_last_encap_src = src; 2641 ipst->ips_last_encap_vif = 0; 2642 for (; vifp < vife; ++vifp) { 2643 if (!lock_good_vif(vifp)) 2644 continue; 2645 if (vifp->v_rmt_addr.s_addr == src) { 2646 if (vifp->v_flags & VIFF_TUNNEL) 2647 ipst->ips_last_encap_vif = vifp; 2648 if (ipst->ips_ip_mrtdebug > 1) { 2649 (void) mi_strlog(mrouter->conn_rq, 2650 1, SL_TRACE, 2651 "ip_mroute_decap: good tun " 2652 "vif %ld with %x", 2653 (ptrdiff_t)(vifp - ipst->ips_vifs), 2654 ntohl(src)); 2655 } 2656 unlock_good_vif(vifp); 2657 break; 2658 } 2659 unlock_good_vif(vifp); 2660 } 2661 } 2662 if ((vifp = ipst->ips_last_encap_vif) == 0) { 2663 mutex_exit(&ipst->ips_last_encap_lock); 2664 ipst->ips_mrtstat->mrts_bad_tunnel++; 2665 freemsg(mp); 2666 ip1dbg(("ip_mroute_decap: vif %ld no tunnel with %x\n", 2667 (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(src))); 2668 return; 2669 } 2670 mutex_exit(&ipst->ips_last_encap_lock); 2671 2672 /* 2673 * Need to pass in the tunnel source to ip_mforward (so that it can 2674 * verify that the packet arrived over the correct vif.) We use b_prev 2675 * to pass this information. This is safe since the ip_rput either 2676 * frees the packet or passes it to ip_mforward. 2677 */ 2678 mp->b_prev = (mblk_t *)(uintptr_t)src; 2679 mp->b_rptr += hlen; 2680 /* Feed back into ip_rput as an M_DATA. */ 2681 ip_rput(q, mp); 2682 } 2683 2684 /* 2685 * Remove all records with v_ipif == ipif. Called when an interface goes away 2686 * (stream closed). Called as writer. 
2687 */ 2688 void 2689 reset_mrt_vif_ipif(ipif_t *ipif) 2690 { 2691 vifi_t vifi, tmp_vifi; 2692 vifi_t num_of_vifs; 2693 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 2694 2695 /* Can't check vifi >= 0 since vifi_t is unsigned! */ 2696 2697 mutex_enter(&ipst->ips_numvifs_mutex); 2698 num_of_vifs = ipst->ips_numvifs; 2699 mutex_exit(&ipst->ips_numvifs_mutex); 2700 2701 for (vifi = num_of_vifs; vifi != 0; vifi--) { 2702 tmp_vifi = vifi - 1; 2703 if (ipst->ips_vifs[tmp_vifi].v_ipif == ipif) { 2704 (void) del_vif(&tmp_vifi, NULL, NULL, ipst); 2705 } 2706 } 2707 } 2708 2709 /* Remove pending upcall msgs when ill goes away. Called by ill_delete. */ 2710 void 2711 reset_mrt_ill(ill_t *ill) 2712 { 2713 struct mfc *rt; 2714 struct rtdetq *rte; 2715 int i; 2716 ip_stack_t *ipst = ill->ill_ipst; 2717 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2718 2719 for (i = 0; i < MFCTBLSIZ; i++) { 2720 MFCB_REFHOLD(&ipst->ips_mfcs[i]); 2721 if ((rt = ipst->ips_mfcs[i].mfcb_mfc) != NULL) { 2722 if (ipst->ips_ip_mrtdebug > 1) { 2723 (void) mi_strlog(mrouter->conn_rq, 1, 2724 SL_TRACE, 2725 "reset_mrt_ill: mfctable [%d]", i); 2726 } 2727 while (rt != NULL) { 2728 mutex_enter(&rt->mfc_mutex); 2729 while ((rte = rt->mfc_rte) != NULL) { 2730 if (rte->ill == ill) { 2731 if (ipst->ips_ip_mrtdebug > 1) { 2732 (void) mi_strlog( 2733 mrouter->conn_rq, 2734 1, SL_TRACE, 2735 "reset_mrt_ill: " 2736 "ill 0x%p", (void *)ill); 2737 } 2738 rt->mfc_rte = rte->rte_next; 2739 freemsg(rte->mp); 2740 mi_free((char *)rte); 2741 } 2742 } 2743 mutex_exit(&rt->mfc_mutex); 2744 rt = rt->mfc_next; 2745 } 2746 } 2747 MFCB_REFRELE(&ipst->ips_mfcs[i]); 2748 } 2749 } 2750 2751 /* 2752 * Token bucket filter module. 2753 * The ipha is for mcastgrp destination for phyint and encap. 
2754 */ 2755 static void 2756 tbf_control(struct vif *vifp, mblk_t *mp, ipha_t *ipha) 2757 { 2758 size_t p_len = msgdsize(mp); 2759 struct tbf *t = vifp->v_tbf; 2760 timeout_id_t id = 0; 2761 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 2762 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2763 2764 /* Drop if packet is too large */ 2765 if (p_len > MAX_BKT_SIZE) { 2766 ipst->ips_mrtstat->mrts_pkt2large++; 2767 freemsg(mp); 2768 return; 2769 } 2770 if (ipst->ips_ip_mrtdebug > 1) { 2771 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2772 "tbf_ctrl: SEND vif %ld, qlen %d, ipha_dst 0x%x", 2773 (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_q_len, 2774 ntohl(ipha->ipha_dst)); 2775 } 2776 2777 mutex_enter(&t->tbf_lock); 2778 2779 tbf_update_tokens(vifp); 2780 2781 /* 2782 * If there are enough tokens, 2783 * and the queue is empty, send this packet out. 2784 */ 2785 if (ipst->ips_ip_mrtdebug > 1) { 2786 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2787 "tbf_control: vif %ld, TOKENS %d, pkt len %lu, qlen %d", 2788 (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_n_tok, p_len, 2789 t->tbf_q_len); 2790 } 2791 /* No packets are queued */ 2792 if (t->tbf_q_len == 0) { 2793 /* queue empty, send packet if enough tokens */ 2794 if (p_len <= t->tbf_n_tok) { 2795 t->tbf_n_tok -= p_len; 2796 mutex_exit(&t->tbf_lock); 2797 tbf_send_packet(vifp, mp); 2798 return; 2799 } else { 2800 /* Queue packet and timeout till later */ 2801 tbf_queue(vifp, mp); 2802 ASSERT(vifp->v_timeout_id == 0); 2803 vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp, 2804 TBF_REPROCESS); 2805 } 2806 } else if (t->tbf_q_len < t->tbf_max_q_len) { 2807 /* Finite queue length, so queue pkts and process queue */ 2808 tbf_queue(vifp, mp); 2809 tbf_process_q(vifp); 2810 } else { 2811 /* Check that we have UDP header with IP header */ 2812 size_t hdr_length = IPH_HDR_LENGTH(ipha) + 2813 sizeof (struct udphdr); 2814 2815 if ((mp->b_wptr - mp->b_rptr) < hdr_length) { 2816 if (!pullupmsg(mp, hdr_length)) { 2817 freemsg(mp); 
2818 ip1dbg(("tbf_ctl: couldn't pullup udp hdr, " 2819 "vif %ld src 0x%x dst 0x%x\n", 2820 (ptrdiff_t)(vifp - ipst->ips_vifs), 2821 ntohl(ipha->ipha_src), 2822 ntohl(ipha->ipha_dst))); 2823 mutex_exit(&vifp->v_tbf->tbf_lock); 2824 return; 2825 } else 2826 /* Have to reassign ipha after pullupmsg */ 2827 ipha = (ipha_t *)mp->b_rptr; 2828 } 2829 /* 2830 * Queue length too much, 2831 * try to selectively dq, or queue and process 2832 */ 2833 if (!tbf_dq_sel(vifp, ipha)) { 2834 ipst->ips_mrtstat->mrts_q_overflow++; 2835 freemsg(mp); 2836 } else { 2837 tbf_queue(vifp, mp); 2838 tbf_process_q(vifp); 2839 } 2840 } 2841 if (t->tbf_q_len == 0) { 2842 id = vifp->v_timeout_id; 2843 vifp->v_timeout_id = 0; 2844 } 2845 mutex_exit(&vifp->v_tbf->tbf_lock); 2846 if (id != 0) 2847 (void) untimeout(id); 2848 } 2849 2850 /* 2851 * Adds a packet to the tbf queue at the interface. 2852 * The ipha is for mcastgrp destination for phyint and encap. 2853 */ 2854 static void 2855 tbf_queue(struct vif *vifp, mblk_t *mp) 2856 { 2857 struct tbf *t = vifp->v_tbf; 2858 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 2859 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2860 2861 if (ipst->ips_ip_mrtdebug > 1) { 2862 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2863 "tbf_queue: vif %ld", (ptrdiff_t)(vifp - ipst->ips_vifs)); 2864 } 2865 ASSERT(MUTEX_HELD(&t->tbf_lock)); 2866 2867 if (t->tbf_t == NULL) { 2868 /* Queue was empty */ 2869 t->tbf_q = mp; 2870 } else { 2871 /* Insert at tail */ 2872 t->tbf_t->b_next = mp; 2873 } 2874 /* set new tail pointer */ 2875 t->tbf_t = mp; 2876 2877 mp->b_next = mp->b_prev = NULL; 2878 2879 t->tbf_q_len++; 2880 } 2881 2882 /* 2883 * Process the queue at the vif interface. 2884 * Drops the tbf_lock when sending packets. 2885 * 2886 * NOTE : The caller should quntimeout if the queue length is 0. 
2887 */ 2888 static void 2889 tbf_process_q(struct vif *vifp) 2890 { 2891 mblk_t *mp; 2892 struct tbf *t = vifp->v_tbf; 2893 size_t len; 2894 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 2895 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2896 2897 if (ipst->ips_ip_mrtdebug > 1) { 2898 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2899 "tbf_process_q 1: vif %ld qlen = %d", 2900 (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_q_len); 2901 } 2902 2903 /* 2904 * Loop through the queue at the interface and send 2905 * as many packets as possible. 2906 */ 2907 ASSERT(MUTEX_HELD(&t->tbf_lock)); 2908 2909 while (t->tbf_q_len > 0) { 2910 mp = t->tbf_q; 2911 len = (size_t)msgdsize(mp); /* length of ip pkt */ 2912 2913 /* Determine if the packet can be sent */ 2914 if (len <= t->tbf_n_tok) { 2915 /* 2916 * If so, reduce no. of tokens, dequeue the packet, 2917 * send the packet. 2918 */ 2919 t->tbf_n_tok -= len; 2920 2921 t->tbf_q = mp->b_next; 2922 if (--t->tbf_q_len == 0) { 2923 t->tbf_t = NULL; 2924 } 2925 mp->b_next = NULL; 2926 /* Exit mutex before sending packet, then re-enter */ 2927 mutex_exit(&t->tbf_lock); 2928 tbf_send_packet(vifp, mp); 2929 mutex_enter(&t->tbf_lock); 2930 } else 2931 break; 2932 } 2933 } 2934 2935 /* Called at tbf timeout to update tokens, process q and reset timer. 
*/ 2936 static void 2937 tbf_reprocess_q(void *arg) 2938 { 2939 struct vif *vifp = arg; 2940 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 2941 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2942 2943 mutex_enter(&vifp->v_tbf->tbf_lock); 2944 vifp->v_timeout_id = 0; 2945 tbf_update_tokens(vifp); 2946 2947 tbf_process_q(vifp); 2948 2949 if (vifp->v_tbf->tbf_q_len > 0) { 2950 vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp, 2951 TBF_REPROCESS); 2952 } 2953 mutex_exit(&vifp->v_tbf->tbf_lock); 2954 2955 if (ipst->ips_ip_mrtdebug > 1) { 2956 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2957 "tbf_reprcess_q: vif %ld timeout id = %p", 2958 (ptrdiff_t)(vifp - ipst->ips_vifs), vifp->v_timeout_id); 2959 } 2960 } 2961 2962 /* 2963 * Function that will selectively discard a member of the tbf queue, 2964 * based on the precedence value and the priority. 2965 * 2966 * NOTE : The caller should quntimeout if the queue length is 0. 2967 */ 2968 static int 2969 tbf_dq_sel(struct vif *vifp, ipha_t *ipha) 2970 { 2971 uint_t p; 2972 struct tbf *t = vifp->v_tbf; 2973 mblk_t **np; 2974 mblk_t *last, *mp; 2975 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 2976 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2977 2978 if (ipst->ips_ip_mrtdebug > 1) { 2979 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2980 "dq_sel: vif %ld dst 0x%x", 2981 (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(ipha->ipha_dst)); 2982 } 2983 2984 ASSERT(MUTEX_HELD(&t->tbf_lock)); 2985 p = priority(vifp, ipha); 2986 2987 np = &t->tbf_q; 2988 last = NULL; 2989 while ((mp = *np) != NULL) { 2990 if (p > (priority(vifp, (ipha_t *)mp->b_rptr))) { 2991 *np = mp->b_next; 2992 /* If removing the last packet, fix the tail pointer */ 2993 if (mp == t->tbf_t) 2994 t->tbf_t = last; 2995 mp->b_prev = mp->b_next = NULL; 2996 freemsg(mp); 2997 /* 2998 * It's impossible for the queue to be empty, but 2999 * we check anyway. 
3000 */ 3001 if (--t->tbf_q_len == 0) { 3002 t->tbf_t = NULL; 3003 } 3004 ipst->ips_mrtstat->mrts_drop_sel++; 3005 return (1); 3006 } 3007 np = &mp->b_next; 3008 last = mp; 3009 } 3010 return (0); 3011 } 3012 3013 /* Sends packet, 2 cases - encap tunnel, phyint. */ 3014 static void 3015 tbf_send_packet(struct vif *vifp, mblk_t *mp) 3016 { 3017 ipif_t *ipif; 3018 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 3019 conn_t *mrouter = ipst->ips_ip_g_mrouter; 3020 3021 /* If encap tunnel options */ 3022 if (vifp->v_flags & VIFF_TUNNEL) { 3023 if (ipst->ips_ip_mrtdebug > 1) { 3024 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 3025 "tbf_send_pkt: ENCAP tunnel vif %ld", 3026 (ptrdiff_t)(vifp - ipst->ips_vifs)); 3027 } 3028 3029 /* 3030 * Feed into ip_wput which will set the ident field and 3031 * checksum the encapsulating header. 3032 * BSD gets the cached route vifp->v_route from ip_output() 3033 * to speed up route table lookups. Not necessary in SunOS 5.x. 3034 */ 3035 put(vifp->v_ipif->ipif_wq, mp); 3036 return; 3037 3038 /* phyint */ 3039 } else { 3040 /* Need to loop back to members on the outgoing interface. */ 3041 ipha_t *ipha; 3042 ipaddr_t dst; 3043 ipha = (ipha_t *)mp->b_rptr; 3044 dst = ipha->ipha_dst; 3045 ipif = vifp->v_ipif; 3046 3047 if (ilm_lookup_ipif(ipif, dst) != NULL) { 3048 /* 3049 * The packet is not yet reassembled, thus we need to 3050 * pass it to ip_rput_local for checksum verification 3051 * and reassembly (and fanout the user stream). 
3052 */ 3053 mblk_t *mp_loop; 3054 ire_t *ire; 3055 3056 if (ipst->ips_ip_mrtdebug > 1) { 3057 (void) mi_strlog(mrouter->conn_rq, 1, 3058 SL_TRACE, 3059 "tbf_send_pkt: loopback vif %ld", 3060 (ptrdiff_t)(vifp - ipst->ips_vifs)); 3061 } 3062 mp_loop = copymsg(mp); 3063 ire = ire_ctable_lookup(~0, 0, IRE_BROADCAST, NULL, 3064 ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 3065 3066 if (mp_loop != NULL && ire != NULL) { 3067 IP_RPUT_LOCAL(ipif->ipif_rq, mp_loop, 3068 ((ipha_t *)mp_loop->b_rptr), 3069 ire, (ill_t *)ipif->ipif_rq->q_ptr); 3070 } else { 3071 /* Either copymsg failed or no ire */ 3072 (void) mi_strlog(mrouter->conn_rq, 1, 3073 SL_TRACE, 3074 "tbf_send_pkt: mp_loop 0x%p, ire 0x%p " 3075 "vif %ld\n", (void *)mp_loop, (void *)ire, 3076 (ptrdiff_t)(vifp - ipst->ips_vifs)); 3077 } 3078 if (ire != NULL) 3079 ire_refrele(ire); 3080 } 3081 if (ipst->ips_ip_mrtdebug > 1) { 3082 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 3083 "tbf_send_pkt: phyint forward vif %ld dst = 0x%x", 3084 (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(dst)); 3085 } 3086 ip_rput_forward_multicast(dst, mp, ipif); 3087 } 3088 } 3089 3090 /* 3091 * Determine the current time and then the elapsed time (between the last time 3092 * and time now). Update the no. of tokens in the bucket. 3093 */ 3094 static void 3095 tbf_update_tokens(struct vif *vifp) 3096 { 3097 timespec_t tp; 3098 hrtime_t tm; 3099 struct tbf *t = vifp->v_tbf; 3100 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 3101 conn_t *mrouter = ipst->ips_ip_g_mrouter; 3102 3103 ASSERT(MUTEX_HELD(&t->tbf_lock)); 3104 3105 /* Time in secs and nsecs, rate limit in kbits/sec */ 3106 gethrestime(&tp); 3107 3108 /*LINTED*/ 3109 TV_DELTA(tp, t->tbf_last_pkt_t, tm); 3110 3111 /* 3112 * This formula is actually 3113 * "time in seconds" * "bytes/second". Scaled for nsec. 3114 * (tm/1000000000) * (v_rate_limit * 1000 * (1000/1024) /8) 3115 * 3116 * The (1000/1024) was introduced in add_vif to optimize 3117 * this divide into a shift. 
3118 */ 3119 t->tbf_n_tok += (tm/1000) * vifp->v_rate_limit / 1024 / 8; 3120 t->tbf_last_pkt_t = tp; 3121 3122 if (t->tbf_n_tok > MAX_BKT_SIZE) 3123 t->tbf_n_tok = MAX_BKT_SIZE; 3124 if (ipst->ips_ip_mrtdebug > 1) { 3125 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 3126 "tbf_update_tok: tm %lld tok %d vif %ld", 3127 tm, t->tbf_n_tok, (ptrdiff_t)(vifp - ipst->ips_vifs)); 3128 } 3129 } 3130 3131 /* 3132 * Priority currently is based on port nos. 3133 * Different forwarding mechanisms have different ways 3134 * of obtaining the port no. Hence, the vif must be 3135 * given along with the packet itself. 3136 * 3137 */ 3138 static int 3139 priority(struct vif *vifp, ipha_t *ipha) 3140 { 3141 int prio; 3142 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 3143 conn_t *mrouter = ipst->ips_ip_g_mrouter; 3144 3145 /* Temporary hack; may add general packet classifier some day */ 3146 3147 ASSERT(MUTEX_HELD(&vifp->v_tbf->tbf_lock)); 3148 3149 /* 3150 * The UDP port space is divided up into four priority ranges: 3151 * [0, 16384) : unclassified - lowest priority 3152 * [16384, 32768) : audio - highest priority 3153 * [32768, 49152) : whiteboard - medium priority 3154 * [49152, 65536) : video - low priority 3155 */ 3156 3157 if (ipha->ipha_protocol == IPPROTO_UDP) { 3158 struct udphdr *udp = 3159 (struct udphdr *)((char *)ipha + IPH_HDR_LENGTH(ipha)); 3160 switch (ntohs(udp->uh_dport) & 0xc000) { 3161 case 0x4000: 3162 prio = 70; 3163 break; 3164 case 0x8000: 3165 prio = 60; 3166 break; 3167 case 0xc000: 3168 prio = 55; 3169 break; 3170 default: 3171 prio = 50; 3172 break; 3173 } 3174 if (ipst->ips_ip_mrtdebug > 1) { 3175 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 3176 "priority: port %x prio %d\n", 3177 ntohs(udp->uh_dport), prio); 3178 } 3179 } else 3180 prio = 50; /* default priority */ 3181 return (prio); 3182 } 3183 3184 /* 3185 * End of token bucket filter modifications 3186 */ 3187 3188 3189 3190 /* 3191 * Produces data for netstat -M. 
3192 */ 3193 int 3194 ip_mroute_stats(mblk_t *mp, ip_stack_t *ipst) 3195 { 3196 ipst->ips_mrtstat->mrts_vifctlSize = sizeof (struct vifctl); 3197 ipst->ips_mrtstat->mrts_mfcctlSize = sizeof (struct mfcctl); 3198 if (!snmp_append_data(mp, (char *)ipst->ips_mrtstat, 3199 sizeof (struct mrtstat))) { 3200 ip0dbg(("ip_mroute_stats: failed %ld bytes\n", 3201 (size_t)sizeof (struct mrtstat))); 3202 return (0); 3203 } 3204 return (1); 3205 } 3206 3207 /* 3208 * Sends info for SNMP's MIB. 3209 */ 3210 int 3211 ip_mroute_vif(mblk_t *mp, ip_stack_t *ipst) 3212 { 3213 struct vifctl vi; 3214 vifi_t vifi; 3215 3216 mutex_enter(&ipst->ips_numvifs_mutex); 3217 for (vifi = 0; vifi < ipst->ips_numvifs; vifi++) { 3218 if (ipst->ips_vifs[vifi].v_lcl_addr.s_addr == 0) 3219 continue; 3220 /* 3221 * No locks here, an approximation is fine. 3222 */ 3223 vi.vifc_vifi = vifi; 3224 vi.vifc_flags = ipst->ips_vifs[vifi].v_flags; 3225 vi.vifc_threshold = ipst->ips_vifs[vifi].v_threshold; 3226 vi.vifc_rate_limit = ipst->ips_vifs[vifi].v_rate_limit; 3227 vi.vifc_lcl_addr = ipst->ips_vifs[vifi].v_lcl_addr; 3228 vi.vifc_rmt_addr = ipst->ips_vifs[vifi].v_rmt_addr; 3229 vi.vifc_pkt_in = ipst->ips_vifs[vifi].v_pkt_in; 3230 vi.vifc_pkt_out = ipst->ips_vifs[vifi].v_pkt_out; 3231 3232 if (!snmp_append_data(mp, (char *)&vi, sizeof (vi))) { 3233 ip0dbg(("ip_mroute_vif: failed %ld bytes\n", 3234 (size_t)sizeof (vi))); 3235 mutex_exit(&ipst->ips_numvifs_mutex); 3236 return (0); 3237 } 3238 } 3239 mutex_exit(&ipst->ips_numvifs_mutex); 3240 return (1); 3241 } 3242 3243 /* 3244 * Called by ip_snmp_get to send up multicast routing table. 3245 */ 3246 int 3247 ip_mroute_mrt(mblk_t *mp, ip_stack_t *ipst) 3248 { 3249 int i, j; 3250 struct mfc *rt; 3251 struct mfcctl mfcc; 3252 3253 /* 3254 * Make sure multicast has not been turned off. 
3255 */ 3256 if (is_mrouter_off(ipst)) 3257 return (1); 3258 3259 /* Loop over all hash buckets and their chains */ 3260 for (i = 0; i < MFCTBLSIZ; i++) { 3261 MFCB_REFHOLD(&ipst->ips_mfcs[i]); 3262 for (rt = ipst->ips_mfcs[i].mfcb_mfc; rt; rt = rt->mfc_next) { 3263 mutex_enter(&rt->mfc_mutex); 3264 if (rt->mfc_rte != NULL || 3265 (rt->mfc_marks & MFCB_MARK_CONDEMNED)) { 3266 mutex_exit(&rt->mfc_mutex); 3267 continue; 3268 } 3269 mfcc.mfcc_origin = rt->mfc_origin; 3270 mfcc.mfcc_mcastgrp = rt->mfc_mcastgrp; 3271 mfcc.mfcc_parent = rt->mfc_parent; 3272 mfcc.mfcc_pkt_cnt = rt->mfc_pkt_cnt; 3273 mutex_enter(&ipst->ips_numvifs_mutex); 3274 for (j = 0; j < (int)ipst->ips_numvifs; j++) 3275 mfcc.mfcc_ttls[j] = rt->mfc_ttls[j]; 3276 for (j = (int)ipst->ips_numvifs; j < MAXVIFS; j++) 3277 mfcc.mfcc_ttls[j] = 0; 3278 mutex_exit(&ipst->ips_numvifs_mutex); 3279 3280 mutex_exit(&rt->mfc_mutex); 3281 if (!snmp_append_data(mp, (char *)&mfcc, 3282 sizeof (mfcc))) { 3283 MFCB_REFRELE(&ipst->ips_mfcs[i]); 3284 ip0dbg(("ip_mroute_mrt: failed %ld bytes\n", 3285 (size_t)sizeof (mfcc))); 3286 return (0); 3287 } 3288 } 3289 MFCB_REFRELE(&ipst->ips_mfcs[i]); 3290 } 3291 return (1); 3292 } 3293