1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. 23 * All rights reserved. Use is subject to license terms. 24 */ 25 /* Copyright (c) 1990 Mentat Inc. */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* 30 * Procedures for the kernel part of DVMRP, 31 * a Distance-Vector Multicast Routing Protocol. 32 * (See RFC-1075) 33 * Written by David Waitzman, BBN Labs, August 1988. 34 * Modified by Steve Deering, Stanford, February 1989. 35 * Modified by Mark J. 
Steiglitz, Stanford, May, 1991 36 * Modified by Van Jacobson, LBL, January 1993 37 * Modified by Ajit Thyagarajan, PARC, August 1993 38 * Modified by Bill Fenner, PARC, April 1995 39 * 40 * MROUTING 3.5 41 */ 42 43 /* 44 * TODO 45 * - function pointer field in vif, void *vif_sendit() 46 */ 47 48 #include <sys/types.h> 49 #include <sys/stream.h> 50 #include <sys/stropts.h> 51 #include <sys/strlog.h> 52 #include <sys/systm.h> 53 #include <sys/ddi.h> 54 #include <sys/cmn_err.h> 55 #include <sys/zone.h> 56 57 #include <sys/param.h> 58 #include <sys/socket.h> 59 #include <sys/vtrace.h> 60 #include <sys/debug.h> 61 #include <net/if.h> 62 #include <sys/sockio.h> 63 #include <netinet/in.h> 64 #include <net/if_dl.h> 65 66 #include <inet/common.h> 67 #include <inet/mi.h> 68 #include <inet/nd.h> 69 #include <inet/mib2.h> 70 #include <netinet/ip6.h> 71 #include <inet/ip.h> 72 #include <inet/snmpcom.h> 73 74 #include <netinet/igmp.h> 75 #include <netinet/igmp_var.h> 76 #include <netinet/udp.h> 77 #include <netinet/ip_mroute.h> 78 #include <inet/ip_multi.h> 79 #include <inet/ip_ire.h> 80 #include <inet/ip_if.h> 81 #include <inet/ipclassifier.h> 82 83 #include <netinet/pim.h> 84 85 86 /* 87 * MT Design: 88 * 89 * There are three main data structures viftable, mfctable and tbftable that 90 * need to be protected against MT races. 91 * 92 * vitable is a fixed length array of vif structs. There is no lock to protect 93 * the whole array, instead each struct is protected by its own indiviual lock. 94 * The value of v_marks in conjuction with the value of v_refcnt determines the 95 * current state of a vif structure. One special state that needs mention 96 * is when the vif is marked VIF_MARK_NOTINUSE but refcnt != 0. This indicates 97 * that vif is being initalized. 98 * Each structure is freed when the refcnt goes down to zero. If a delete comes 99 * in when the the recfnt is > 1, the vif structure is marked VIF_MARK_CONDEMNED 100 * which prevents the struct from further use. 
When the refcnt goes to zero 101 * the struct is freed and is marked VIF_MARK_NOTINUSE. 102 * vif struct stores a pointer to the ipif in v_ipif, to prevent ipif/ill 103 * from going away a refhold is put on the ipif before using it. see 104 * lock_good_vif() and unlock_good_vif(). 105 * 106 * VIF_REFHOLD and VIF_REFRELE macros have been provided to manipulate refcnts 107 * of the vif struct. 108 * 109 * tbftable is also a fixed length array of tbf structs and is only accessed 110 * via v_tbf. It is protected by its own lock tbf_lock. 111 * 112 * Lock Ordering is 113 * v_lock --> tbf_lock 114 * v_lock --> ill_locK 115 * 116 * mfctable is a fixed size hash table of mfc buckets strcuts (struct mfcb). 117 * Each mfc bucket struct (struct mfcb) maintains a refcnt for each walker, 118 * it also maintains a state. These fields are protected by a lock (mfcb_lock). 119 * mfc structs only maintain a state and have no refcnt. mfc_mutex is used to 120 * protect the struct elements. 121 * 122 * mfc structs are dynamically allocated and are singly linked 123 * at the head of the chain. When an mfc structure is to be deleted 124 * it is marked condemned and so is the state in the bucket struct. 125 * When the last walker of the hash bucket exits all the mfc structs 126 * marked condemed are freed. 127 * 128 * Locking Hierarchy: 129 * The bucket lock should be acquired before the mfc struct lock. 130 * MFCB_REFHOLD and MFCB_REFRELE macros are provided for locking 131 * operations on the bucket struct. 132 * 133 * last_encap_lock and numvifs_mutex should be acquired after 134 * acquring vif or mfc locks. These locks protect some global variables. 135 * 136 * The statistics are not currently protected by a lock 137 * causing the stats be be approximate, not exact. 
138 */ 139 140 #define NO_VIF MAXVIFS /* from mrouted, no route for src */ 141 142 /* 143 * Timeouts: 144 * Upcall timeouts - BSD uses boolean_t mfc->expire and 145 * nexpire[MFCTBLSIZE], the number of times expire has been called. 146 * SunOS 5.x uses mfc->timeout for each mfc. 147 * Some Unixes are limited in the number of simultaneous timeouts 148 * that can be run, SunOS 5.x does not have this restriction. 149 */ 150 151 /* 152 * In BSD, EXPIRE_TIMEOUT is how often expire_upcalls() is called and 153 * UPCALL_EXPIRE is the nmber of timeouts before a particular upcall 154 * expires. Thus the time till expiration is EXPIRE_TIMEOUT * UPCALL_EXPIRE 155 */ 156 #define EXPIRE_TIMEOUT (hz/4) /* 4x / second */ 157 #define UPCALL_EXPIRE 6 /* number of timeouts */ 158 159 /* 160 * Hash function for a source, group entry 161 */ 162 #define MFCHASH(a, g) MFCHASHMOD(((a) >> 20) ^ ((a) >> 10) ^ (a) ^ \ 163 ((g) >> 20) ^ ((g) >> 10) ^ (g)) 164 165 #define TBF_REPROCESS (hz / 100) /* 100x /second */ 166 167 /* Identify PIM packet that came on a Register interface */ 168 #define PIM_REGISTER_MARKER 0xffffffff 169 170 /* Function declarations */ 171 static int add_mfc(struct mfcctl *, ip_stack_t *); 172 static int add_vif(struct vifctl *, conn_t *, mblk_t *, ip_stack_t *); 173 static int del_mfc(struct mfcctl *, ip_stack_t *); 174 static int del_vif(vifi_t *, conn_t *, mblk_t *, ip_stack_t *); 175 static void del_vifp(struct vif *); 176 static void encap_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t); 177 static void expire_upcalls(void *); 178 static void fill_route(struct mfc *, struct mfcctl *, ip_stack_t *); 179 static void free_queue(struct mfc *); 180 static int get_assert(uchar_t *, ip_stack_t *); 181 static int get_lsg_cnt(struct sioc_lsg_req *, ip_stack_t *); 182 static int get_sg_cnt(struct sioc_sg_req *, ip_stack_t *); 183 static int get_version(uchar_t *); 184 static int get_vif_cnt(struct sioc_vif_req *, ip_stack_t *); 185 static int ip_mdq(mblk_t *, ipha_t *, 
ill_t *, 186 ipaddr_t, struct mfc *); 187 static int ip_mrouter_init(conn_t *, uchar_t *, int, ip_stack_t *); 188 static void phyint_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t); 189 static int register_mforward(queue_t *, mblk_t *, ill_t *); 190 static void register_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t); 191 static int set_assert(int *, ip_stack_t *); 192 193 /* 194 * Token Bucket Filter functions 195 */ 196 static int priority(struct vif *, ipha_t *); 197 static void tbf_control(struct vif *, mblk_t *, ipha_t *); 198 static int tbf_dq_sel(struct vif *, ipha_t *); 199 static void tbf_process_q(struct vif *); 200 static void tbf_queue(struct vif *, mblk_t *); 201 static void tbf_reprocess_q(void *); 202 static void tbf_send_packet(struct vif *, mblk_t *); 203 static void tbf_update_tokens(struct vif *); 204 static void release_mfc(struct mfcb *); 205 206 static boolean_t is_mrouter_off(ip_stack_t *); 207 /* 208 * Encapsulation packets 209 */ 210 211 #define ENCAP_TTL 64 212 213 /* prototype IP hdr for encapsulated packets */ 214 static ipha_t multicast_encap_iphdr = { 215 IP_SIMPLE_HDR_VERSION, 216 0, /* tos */ 217 sizeof (ipha_t), /* total length */ 218 0, /* id */ 219 0, /* frag offset */ 220 ENCAP_TTL, IPPROTO_ENCAP, 221 0, /* checksum */ 222 }; 223 224 /* 225 * Rate limit for assert notification messages, in nsec. 
226 */ 227 #define ASSERT_MSG_TIME 3000000000 228 229 230 #define VIF_REFHOLD(vifp) { \ 231 mutex_enter(&(vifp)->v_lock); \ 232 (vifp)->v_refcnt++; \ 233 mutex_exit(&(vifp)->v_lock); \ 234 } 235 236 #define VIF_REFRELE_LOCKED(vifp) { \ 237 (vifp)->v_refcnt--; \ 238 if ((vifp)->v_refcnt == 0 && \ 239 ((vifp)->v_marks & VIF_MARK_CONDEMNED)) { \ 240 del_vifp(vifp); \ 241 } else { \ 242 mutex_exit(&(vifp)->v_lock); \ 243 } \ 244 } 245 246 #define VIF_REFRELE(vifp) { \ 247 mutex_enter(&(vifp)->v_lock); \ 248 (vifp)->v_refcnt--; \ 249 if ((vifp)->v_refcnt == 0 && \ 250 ((vifp)->v_marks & VIF_MARK_CONDEMNED)) { \ 251 del_vifp(vifp); \ 252 } else { \ 253 mutex_exit(&(vifp)->v_lock); \ 254 } \ 255 } 256 257 #define MFCB_REFHOLD(mfcb) { \ 258 mutex_enter(&(mfcb)->mfcb_lock); \ 259 (mfcb)->mfcb_refcnt++; \ 260 ASSERT((mfcb)->mfcb_refcnt != 0); \ 261 mutex_exit(&(mfcb)->mfcb_lock); \ 262 } 263 264 #define MFCB_REFRELE(mfcb) { \ 265 mutex_enter(&(mfcb)->mfcb_lock); \ 266 ASSERT((mfcb)->mfcb_refcnt != 0); \ 267 if (--(mfcb)->mfcb_refcnt == 0 && \ 268 ((mfcb)->mfcb_marks & MFCB_MARK_CONDEMNED)) { \ 269 release_mfc(mfcb); \ 270 } \ 271 mutex_exit(&(mfcb)->mfcb_lock); \ 272 } 273 274 /* 275 * MFCFIND: 276 * Find a route for a given origin IP address and multicast group address. 277 * Skip entries with pending upcalls. 278 * Type of service parameter to be added in the future! 279 */ 280 #define MFCFIND(mfcbp, o, g, rt) { \ 281 struct mfc *_mb_rt = NULL; \ 282 rt = NULL; \ 283 _mb_rt = mfcbp->mfcb_mfc; \ 284 while (_mb_rt) { \ 285 if ((_mb_rt->mfc_origin.s_addr == o) && \ 286 (_mb_rt->mfc_mcastgrp.s_addr == g) && \ 287 (_mb_rt->mfc_rte == NULL) && \ 288 (!(_mb_rt->mfc_marks & MFCB_MARK_CONDEMNED))) { \ 289 rt = _mb_rt; \ 290 break; \ 291 } \ 292 _mb_rt = _mb_rt->mfc_next; \ 293 } \ 294 } 295 296 /* 297 * BSD uses timeval with sec and usec. In SunOS 5.x uniqtime() and gethrtime() 298 * are inefficient. 
 * We use gethrestime() which returns a timespec_t with
 * sec and nsec, the resolution is machine dependent.
 * The following 2 macros have been changed to use nsec instead of usec.
 */
/*
 * Macros to compute elapsed time efficiently.
 * Borrowed from Van Jacobson's scheduling code.
 * Delta should be a hrtime_t.
 *
 * TV_DELTA(a, b, delta): delta = (a - b) in nanoseconds, where a and b
 * are timespec_t values with a >= b.  The switch avoids a 64-bit multiply
 * for the common cases of a 0-, 1- or 2-second difference.
 */
#define	TV_DELTA(a, b, delta) { \
	int xxs; \
 \
	delta = (a).tv_nsec - (b).tv_nsec; \
	if ((xxs = (a).tv_sec - (b).tv_sec) != 0) { \
		switch (xxs) { \
		case 2: \
			delta += 1000000000; \
			/*FALLTHROUGH*/ \
		case 1: \
			delta += 1000000000; \
			break; \
		default: \
			delta += (1000000000 * xxs); \
		} \
	} \
}

/* TV_LT(a, b): B_TRUE iff timespec a is strictly earlier than b. */
#define	TV_LT(a, b) (((a).tv_nsec < (b).tv_nsec && \
	(a).tv_sec <= (b).tv_sec) || (a).tv_sec < (b).tv_sec)

/*
 * Handle MRT setsockopt commands to modify the multicast routing tables.
 *
 * Only the registered multicast router (ips_ip_g_mrouter) may issue
 * anything other than MRT_INIT; all others get EACCES.  When `checkonly'
 * is set (XTI T_CHECK) the command is validated but not executed.
 * Returns 0 or an errno value.
 */
int
ip_mrouter_set(int cmd, queue_t *q, int checkonly, uchar_t *data,
    int datalen, mblk_t *first_mp)
{
	conn_t		*connp = Q_TO_CONN(q);
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
	if (cmd != MRT_INIT && connp != ipst->ips_ip_g_mrouter) {
		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
		return (EACCES);
	}
	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);

	if (checkonly) {
		/*
		 * do not do operation, just pretend to - new T_CHECK
		 * Note: Even routines further on can probably fail but
		 * this T_CHECK stuff is only to please XTI so it not
		 * necessary to be perfect.
		 */
		switch (cmd) {
		case MRT_INIT:
		case MRT_DONE:
		case MRT_ADD_VIF:
		case MRT_DEL_VIF:
		case MRT_ADD_MFC:
		case MRT_DEL_MFC:
		case MRT_ASSERT:
			return (0);
		default:
			return (EOPNOTSUPP);
		}
	}

	/*
	 * make sure no command is issued after multicast routing has been
	 * turned off.
	 */
	if (cmd != MRT_INIT && cmd != MRT_DONE) {
		if (is_mrouter_off(ipst))
			return (EINVAL);
	}

	switch (cmd) {
	case MRT_INIT:	return (ip_mrouter_init(connp, data, datalen, ipst));
	case MRT_DONE:	return (ip_mrouter_done(first_mp, ipst));
	case MRT_ADD_VIF:  return (add_vif((struct vifctl *)data, connp,
	    first_mp, ipst));
	case MRT_DEL_VIF:  return (del_vif((vifi_t *)data, connp, first_mp,
	    ipst));
	case MRT_ADD_MFC:  return (add_mfc((struct mfcctl *)data, ipst));
	case MRT_DEL_MFC:  return (del_mfc((struct mfcctl *)data, ipst));
	case MRT_ASSERT:   return (set_assert((int *)data, ipst));
	default:	   return (EOPNOTSUPP);
	}
}

/*
 * Handle MRT getsockopt commands.
 *
 * Only the registered multicast router may query; returns 0 or an
 * errno value, with the answer written through `data'.
 */
int
ip_mrouter_get(int cmd, queue_t *q, uchar_t *data)
{
	conn_t		*connp = Q_TO_CONN(q);
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	if (connp != ipst->ips_ip_g_mrouter)
		return (EACCES);

	switch (cmd) {
	case MRT_VERSION:	return (get_version((uchar_t *)data));
	case MRT_ASSERT:	return (get_assert((uchar_t *)data, ipst));
	default:		return (EOPNOTSUPP);
	}
}

/*
 * Handle ioctl commands to obtain information from the cache.
 * Called with shared access to IP. These are read_only ioctls.
 */
/* ARGSUSED */
int
mrt_ioctl(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	mblk_t		*mp1;
	struct iocblk	*iocp = (struct iocblk *)mp->b_rptr;
	conn_t		*connp = Q_TO_CONN(q);
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	/* Existence verified in ip_wput_nondata */
	mp1 = mp->b_cont->b_cont;

	switch (iocp->ioc_cmd) {
	case (SIOCGETVIFCNT):
		return (get_vif_cnt((struct sioc_vif_req *)mp1->b_rptr, ipst));
	case (SIOCGETSGCNT):
		return (get_sg_cnt((struct sioc_sg_req *)mp1->b_rptr, ipst));
	case (SIOCGETLSGCNT):
		return (get_lsg_cnt((struct sioc_lsg_req *)mp1->b_rptr, ipst));
	default:
		return (EINVAL);
	}
}

/*
 * Returns the packet, byte, rpf-failure count for the source, group provided.
 * Writes the counters into *req; if no matching route is found each
 * counter is set to 0xffffffff.  Always returns 0.
 */
static int
get_sg_cnt(struct sioc_sg_req *req, ip_stack_t *ipst)
{
	struct mfc *rt;
	struct mfcb *mfcbp;

	/*
	 * Hold the hash bucket so the chain cannot be torn down while
	 * MFCFIND walks it; the per-mfc mutex protects the counters.
	 */
	mfcbp = &ipst->ips_mfcs[MFCHASH(req->src.s_addr, req->grp.s_addr)];
	MFCB_REFHOLD(mfcbp);
	MFCFIND(mfcbp, req->src.s_addr, req->grp.s_addr, rt);

	if (rt != NULL) {
		mutex_enter(&rt->mfc_mutex);
		req->pktcnt = rt->mfc_pkt_cnt;
		req->bytecnt = rt->mfc_byte_cnt;
		req->wrong_if = rt->mfc_wrong_if;
		mutex_exit(&rt->mfc_mutex);
	} else
		req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffffU;

	MFCB_REFRELE(mfcbp);
	return (0);
}

/*
 * Returns the packet, byte, rpf-failure count for the source, group provided.
 * Uses larger counters and IPv6 addresses.
 */
/* ARGSUSED XXX until implemented */
static int
get_lsg_cnt(struct sioc_lsg_req *req, ip_stack_t *ipst)
{
	/* XXX TODO SIOCGETLSGCNT */
	return (ENXIO);
}

/*
 * Returns the input and output packet and byte counts on the vif provided.
 */
static int
get_vif_cnt(struct sioc_vif_req *req, ip_stack_t *ipst)
{
	vifi_t vifi = req->vifi;

	if (vifi >= ipst->ips_numvifs)
		return (EINVAL);

	/*
	 * No locks here, an approximation is fine.
	 */
	req->icount = ipst->ips_vifs[vifi].v_pkt_in;
	req->ocount = ipst->ips_vifs[vifi].v_pkt_out;
	req->ibytes = ipst->ips_vifs[vifi].v_bytes_in;
	req->obytes = ipst->ips_vifs[vifi].v_bytes_out;

	return (0);
}

/*
 * Report the mrouted protocol version (MRT_VERSION getsockopt).
 */
static int
get_version(uchar_t *data)
{
	int *v = (int *)data;

	*v = 0x0305;	/* XXX !!!! */

	return (0);
}

/*
 * Set PIM assert processing global.
 * Only 0 (off) and 1 (on) are accepted.
 */
static int
set_assert(int *i, ip_stack_t *ipst)
{
	if ((*i != 1) && (*i != 0))
		return (EINVAL);

	ipst->ips_pim_assert = *i;

	return (0);
}

/*
 * Get PIM assert processing global.
 */
static int
get_assert(uchar_t *data, ip_stack_t *ipst)
{
	int *i = (int *)data;

	*i = ipst->ips_pim_assert;

	return (0);
}

/*
 * Enable multicast routing (MRT_INIT).
 *
 * Registers `connp' as the single multicast router for this IP stack
 * and forces IP forwarding on (saving the previous setting so
 * ip_mrouter_done() can restore it).  Returns 0, ENOPROTOOPT for a bad
 * argument, EADDRINUSE if a router is already registered, or EINVAL if
 * the socket is not a raw IP socket.
 */
static int
ip_mrouter_init(conn_t *connp, uchar_t *data, int datalen, ip_stack_t *ipst)
{
	int	*v;

	if (data == NULL || (datalen != sizeof (int)))
		return (ENOPROTOOPT);

	v = (int *)data;
	if (*v != 1)
		return (ENOPROTOOPT);

	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
	if (ipst->ips_ip_g_mrouter != NULL) {
		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
		return (EADDRINUSE);
	}

	/*
	 * MRT_INIT should only be allowed for RAW sockets, but we double
	 * check.
	 */
	if (!IPCL_IS_RAWIP(connp)) {
		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
		return (EINVAL);
	}

	ipst->ips_ip_g_mrouter = connp;
	connp->conn_multi_router = 1;
	/* In order for tunnels to work we have to turn ip_g_forward on */
	if (!WE_ARE_FORWARDING(ipst)) {
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(connp->conn_rq, 1, SL_TRACE,
			    "ip_mrouter_init: turning on forwarding");
		}
		ipst->ips_saved_ip_g_forward = ipst->ips_ip_g_forward;
		ipst->ips_ip_g_forward = IP_FORWARD_ALWAYS;
	}

	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
	return (0);
}

/*
 * Per-netstack constructor: allocate the vif, mfc and tbf tables and
 * initialize the multicast-routing locks.  Paired with
 * ip_mrouter_stack_destroy().
 */
void
ip_mrouter_stack_init(ip_stack_t *ipst)
{
	mutex_init(&ipst->ips_ip_g_mrouter_mutex, NULL, MUTEX_DEFAULT, NULL);

	ipst->ips_vifs = kmem_zalloc(sizeof (struct vif) * (MAXVIFS+1),
	    KM_SLEEP);
	ipst->ips_mrtstat = kmem_zalloc(sizeof (struct mrtstat), KM_SLEEP);
	/*
	 * mfctable:
	 * Includes all mfcs, including waiting upcalls.
	 * Multiple mfcs per bucket.
	 */
	ipst->ips_mfcs = kmem_zalloc(sizeof (struct mfcb) * MFCTBLSIZ,
	    KM_SLEEP);
	/*
	 * Define the token bucket filter structures.
	 * tbftable -> each vif has one of these for storing info.
	 */
	ipst->ips_tbfs = kmem_zalloc(sizeof (struct tbf) * MAXVIFS, KM_SLEEP);

	mutex_init(&ipst->ips_last_encap_lock, NULL, MUTEX_DEFAULT, NULL);

	ipst->ips_mrtstat->mrts_vifctlSize = sizeof (struct vifctl);
	ipst->ips_mrtstat->mrts_mfcctlSize = sizeof (struct mfcctl);
}

/*
 * Disable multicast routing.
 * Didn't use global timeout_val (BSD version), instead check the mfctable.
610 */ 611 int 612 ip_mrouter_done(mblk_t *mp, ip_stack_t *ipst) 613 { 614 conn_t *mrouter; 615 vifi_t vifi; 616 struct mfc *mfc_rt; 617 int i; 618 619 mutex_enter(&ipst->ips_ip_g_mrouter_mutex); 620 if (ipst->ips_ip_g_mrouter == NULL) { 621 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 622 return (EINVAL); 623 } 624 625 mrouter = ipst->ips_ip_g_mrouter; 626 627 if (ipst->ips_saved_ip_g_forward != -1) { 628 if (ipst->ips_ip_mrtdebug > 1) { 629 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 630 "ip_mrouter_done: turning off forwarding"); 631 } 632 ipst->ips_ip_g_forward = ipst->ips_saved_ip_g_forward; 633 ipst->ips_saved_ip_g_forward = -1; 634 } 635 636 /* 637 * Always clear cache when vifs change. 638 * No need to get ipst->ips_last_encap_lock since we are running as 639 * a writer. 640 */ 641 mutex_enter(&ipst->ips_last_encap_lock); 642 ipst->ips_last_encap_src = 0; 643 ipst->ips_last_encap_vif = NULL; 644 mutex_exit(&ipst->ips_last_encap_lock); 645 mrouter->conn_multi_router = 0; 646 647 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 648 649 /* 650 * For each phyint in use, 651 * disable promiscuous reception of all IP multicasts. 652 */ 653 for (vifi = 0; vifi < MAXVIFS; vifi++) { 654 struct vif *vifp = ipst->ips_vifs + vifi; 655 656 mutex_enter(&vifp->v_lock); 657 /* 658 * if the vif is active mark it condemned. 659 */ 660 if (vifp->v_marks & VIF_MARK_GOOD) { 661 ASSERT(vifp->v_ipif != NULL); 662 ipif_refhold(vifp->v_ipif); 663 /* Phyint only */ 664 if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) { 665 ipif_t *ipif = vifp->v_ipif; 666 ipsq_t *ipsq; 667 boolean_t suc; 668 ill_t *ill; 669 670 ill = ipif->ipif_ill; 671 suc = B_FALSE; 672 if (mp == NULL) { 673 /* 674 * being called from ip_close, 675 * lets do it synchronously. 676 * Clear VIF_MARK_GOOD and 677 * set VIF_MARK_CONDEMNED. 
678 */ 679 vifp->v_marks &= ~VIF_MARK_GOOD; 680 vifp->v_marks |= VIF_MARK_CONDEMNED; 681 mutex_exit(&(vifp)->v_lock); 682 suc = ipsq_enter(ill, B_FALSE); 683 ipsq = ill->ill_phyint->phyint_ipsq; 684 } else { 685 ipsq = ipsq_try_enter(ipif, NULL, 686 mrouter->conn_wq, mp, 687 ip_restart_optmgmt, NEW_OP, B_TRUE); 688 if (ipsq == NULL) { 689 mutex_exit(&(vifp)->v_lock); 690 ipif_refrele(ipif); 691 return (EINPROGRESS); 692 } 693 /* 694 * Clear VIF_MARK_GOOD and 695 * set VIF_MARK_CONDEMNED. 696 */ 697 vifp->v_marks &= ~VIF_MARK_GOOD; 698 vifp->v_marks |= VIF_MARK_CONDEMNED; 699 mutex_exit(&(vifp)->v_lock); 700 suc = B_TRUE; 701 } 702 703 if (suc) { 704 (void) ip_delmulti(INADDR_ANY, ipif, 705 B_TRUE, B_TRUE); 706 ipsq_exit(ipsq, B_TRUE, B_TRUE); 707 } 708 mutex_enter(&vifp->v_lock); 709 } 710 /* 711 * decreases the refcnt added in add_vif. 712 * and release v_lock. 713 */ 714 VIF_REFRELE_LOCKED(vifp); 715 } else { 716 mutex_exit(&vifp->v_lock); 717 continue; 718 } 719 } 720 721 mutex_enter(&ipst->ips_numvifs_mutex); 722 ipst->ips_numvifs = 0; 723 ipst->ips_pim_assert = 0; 724 ipst->ips_reg_vif_num = ALL_VIFS; 725 mutex_exit(&ipst->ips_numvifs_mutex); 726 727 /* 728 * Free upcall msgs. 729 * Go through mfctable and stop any outstanding upcall 730 * timeouts remaining on mfcs. 731 */ 732 for (i = 0; i < MFCTBLSIZ; i++) { 733 mutex_enter(&ipst->ips_mfcs[i].mfcb_lock); 734 ipst->ips_mfcs[i].mfcb_refcnt++; 735 ipst->ips_mfcs[i].mfcb_marks |= MFCB_MARK_CONDEMNED; 736 mutex_exit(&ipst->ips_mfcs[i].mfcb_lock); 737 mfc_rt = ipst->ips_mfcs[i].mfcb_mfc; 738 while (mfc_rt) { 739 /* Free upcalls */ 740 mutex_enter(&mfc_rt->mfc_mutex); 741 if (mfc_rt->mfc_rte != NULL) { 742 if (mfc_rt->mfc_timeout_id != 0) { 743 /* 744 * OK to drop the lock as we have 745 * a refcnt on the bucket. timeout 746 * can fire but it will see that 747 * mfc_timeout_id == 0 and not do 748 * anything. see expire_upcalls(). 
749 */ 750 mfc_rt->mfc_timeout_id = 0; 751 mutex_exit(&mfc_rt->mfc_mutex); 752 (void) untimeout( 753 mfc_rt->mfc_timeout_id); 754 mfc_rt->mfc_timeout_id = 0; 755 mutex_enter(&mfc_rt->mfc_mutex); 756 757 /* 758 * all queued upcall packets 759 * and mblk will be freed in 760 * release_mfc(). 761 */ 762 } 763 } 764 765 mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED; 766 767 mutex_exit(&mfc_rt->mfc_mutex); 768 mfc_rt = mfc_rt->mfc_next; 769 } 770 MFCB_REFRELE(&ipst->ips_mfcs[i]); 771 } 772 773 mutex_enter(&ipst->ips_ip_g_mrouter_mutex); 774 ipst->ips_ip_g_mrouter = NULL; 775 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 776 return (0); 777 } 778 779 void 780 ip_mrouter_stack_destroy(ip_stack_t *ipst) 781 { 782 struct mfcb *mfcbp; 783 struct mfc *rt; 784 int i; 785 786 for (i = 0; i < MFCTBLSIZ; i++) { 787 mfcbp = &ipst->ips_mfcs[i]; 788 789 while ((rt = mfcbp->mfcb_mfc) != NULL) { 790 (void) printf("ip_mrouter_stack_destroy: free for %d\n", 791 i); 792 793 mfcbp->mfcb_mfc = rt->mfc_next; 794 free_queue(rt); 795 mi_free(rt); 796 } 797 } 798 kmem_free(ipst->ips_vifs, sizeof (struct vif) * (MAXVIFS+1)); 799 ipst->ips_vifs = NULL; 800 kmem_free(ipst->ips_mrtstat, sizeof (struct mrtstat)); 801 ipst->ips_mrtstat = NULL; 802 kmem_free(ipst->ips_mfcs, sizeof (struct mfcb) * MFCTBLSIZ); 803 ipst->ips_mfcs = NULL; 804 kmem_free(ipst->ips_tbfs, sizeof (struct tbf) * MAXVIFS); 805 ipst->ips_tbfs = NULL; 806 807 mutex_destroy(&ipst->ips_last_encap_lock); 808 mutex_destroy(&ipst->ips_ip_g_mrouter_mutex); 809 } 810 811 static boolean_t 812 is_mrouter_off(ip_stack_t *ipst) 813 { 814 conn_t *mrouter; 815 816 mutex_enter(&ipst->ips_ip_g_mrouter_mutex); 817 if (ipst->ips_ip_g_mrouter == NULL) { 818 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 819 return (B_TRUE); 820 } 821 822 mrouter = ipst->ips_ip_g_mrouter; 823 if (mrouter->conn_multi_router == 0) { 824 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 825 return (B_TRUE); 826 } 827 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 828 return (B_FALSE); 829 } 

/*
 * Undo lock_good_vif(): drop the ipif hold and the vif refcnt.
 */
static void
unlock_good_vif(struct vif *vifp)
{
	ASSERT(vifp->v_ipif != NULL);
	ipif_refrele(vifp->v_ipif);
	VIF_REFRELE(vifp);
}

/*
 * Validate and pin a vif for use: succeeds only if the vif is marked
 * VIF_MARK_GOOD and its ipif is still lookup-able.  On success a refhold
 * is placed on both the ipif and the vif (released by unlock_good_vif())
 * and B_TRUE is returned; otherwise B_FALSE.
 */
static boolean_t
lock_good_vif(struct vif *vifp)
{
	mutex_enter(&vifp->v_lock);
	if (!(vifp->v_marks & VIF_MARK_GOOD)) {
		mutex_exit(&vifp->v_lock);
		return (B_FALSE);
	}

	ASSERT(vifp->v_ipif != NULL);
	/* Lock order: v_lock --> ill_lock (see the MT design notes above). */
	mutex_enter(&vifp->v_ipif->ipif_ill->ill_lock);
	if (!IPIF_CAN_LOOKUP(vifp->v_ipif)) {
		mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock);
		mutex_exit(&vifp->v_lock);
		return (B_FALSE);
	}
	ipif_refhold_locked(vifp->v_ipif);
	mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock);
	vifp->v_refcnt++;
	mutex_exit(&vifp->v_lock);
	return (B_TRUE);
}

/*
 * Add a vif to the vif table (MRT_ADD_VIF).
 *
 * Validates the slot, looks up the ipif for the requested local address,
 * becomes exclusive on its ipsq, joins all-multicast for phyint vifs,
 * reserves the register vif slot for VIFF_REGISTER vifs, and initializes
 * the vif's token bucket filter and counters.  Returns 0 or an errno;
 * EINPROGRESS means the operation was queued and will be restarted via
 * ip_restart_optmgmt.
 */
static int
add_vif(struct vifctl *vifcp, conn_t *connp, mblk_t *first_mp, ip_stack_t *ipst)
{
	struct vif	*vifp = ipst->ips_vifs + vifcp->vifc_vifi;
	ipif_t		*ipif;
	int		error;
	struct tbf	*v_tbf = ipst->ips_tbfs + vifcp->vifc_vifi;
	ipsq_t		*ipsq;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	ASSERT(connp != NULL);

	if (vifcp->vifc_vifi >= MAXVIFS)
		return (EINVAL);

	if (is_mrouter_off(ipst))
		return (EINVAL);

	mutex_enter(&vifp->v_lock);
	/*
	 * Viftable entry should be 0.
	 * if v_marks == 0 but v_refcnt != 0 means struct is being
	 * initialized.
	 *
	 * Also note that it is very unlikely that we will get a MRT_ADD_VIF
	 * request while the delete is in progress, mrouted only sends add
	 * requests when a new interface is added and the new interface cannot
	 * have the same vifi as an existing interface. We make sure that
	 * ill_delete will block till the vif is deleted by adding a refcnt
	 * to ipif in del_vif().
	 */
	if (vifp->v_lcl_addr.s_addr != 0 ||
	    vifp->v_marks != 0 ||
	    vifp->v_refcnt != 0) {
		mutex_exit(&vifp->v_lock);
		return (EADDRINUSE);
	}

	/* Incoming vif should not be 0 */
	if (vifcp->vifc_lcl_addr.s_addr == 0) {
		mutex_exit(&vifp->v_lock);
		return (EINVAL);
	}

	/* Hold the slot ("being initialized": refcnt != 0, marks == 0). */
	vifp->v_refcnt++;
	mutex_exit(&vifp->v_lock);
	/* Find the interface with the local address */
	ipif = ipif_lookup_addr((ipaddr_t)vifcp->vifc_lcl_addr.s_addr, NULL,
	    connp->conn_zoneid, CONNP_TO_WQ(connp), first_mp,
	    ip_restart_optmgmt, &error, ipst);
	if (ipif == NULL) {
		VIF_REFRELE(vifp);
		if (error == EINPROGRESS)
			return (error);
		return (EADDRNOTAVAIL);
	}

	/*
	 * We have to be exclusive as we have to call ip_addmulti()
	 * This is the best position to try to be exclusive in case
	 * we have to wait.
	 */
	ipsq = ipsq_try_enter(ipif, NULL, CONNP_TO_WQ(connp), first_mp,
	    ip_restart_optmgmt, NEW_OP, B_TRUE);
	if ((ipsq) == NULL) {
		VIF_REFRELE(vifp);
		ipif_refrele(ipif);
		return (EINPROGRESS);
	}

	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "add_vif: src 0x%x enter",
		    vifcp->vifc_lcl_addr.s_addr);
	}

	mutex_enter(&vifp->v_lock);
	/*
	 * Always clear cache when vifs change.
	 * Needed to ensure that src isn't left over from before vif was added.
	 * No need to get last_encap_lock, since we are running as a writer.
	 */

	mutex_enter(&ipst->ips_last_encap_lock);
	ipst->ips_last_encap_src = 0;
	ipst->ips_last_encap_vif = NULL;
	mutex_exit(&ipst->ips_last_encap_lock);

	if (vifcp->vifc_flags & VIFF_TUNNEL) {
		if ((vifcp->vifc_flags & VIFF_SRCRT) != 0) {
			cmn_err(CE_WARN,
			    "add_vif: source route tunnels not supported\n");
			VIF_REFRELE_LOCKED(vifp);
			ipif_refrele(ipif);
			ipsq_exit(ipsq, B_TRUE, B_TRUE);
			return (EOPNOTSUPP);
		}
		vifp->v_rmt_addr = vifcp->vifc_rmt_addr;

	} else {
		/* Phyint or Register vif */
		if (vifcp->vifc_flags & VIFF_REGISTER) {
			/*
			 * Note: Since all IPPROTO_IP level options (including
			 * MRT_ADD_VIF) are done exclusively via
			 * ip_optmgmt_writer(), a lock is not necessary to
			 * protect reg_vif_num.
			 */
			mutex_enter(&ipst->ips_numvifs_mutex);
			if (ipst->ips_reg_vif_num == ALL_VIFS) {
				ipst->ips_reg_vif_num = vifcp->vifc_vifi;
				mutex_exit(&ipst->ips_numvifs_mutex);
			} else {
				/* Only one register vif is allowed. */
				mutex_exit(&ipst->ips_numvifs_mutex);
				VIF_REFRELE_LOCKED(vifp);
				ipif_refrele(ipif);
				ipsq_exit(ipsq, B_TRUE, B_TRUE);
				return (EADDRINUSE);
			}
		}

		/* Make sure the interface supports multicast */
		if ((ipif->ipif_ill->ill_flags & ILLF_MULTICAST) == 0) {
			VIF_REFRELE_LOCKED(vifp);
			ipif_refrele(ipif);
			if (vifcp->vifc_flags & VIFF_REGISTER) {
				/* Give back the register vif slot. */
				mutex_enter(&ipst->ips_numvifs_mutex);
				ipst->ips_reg_vif_num = ALL_VIFS;
				mutex_exit(&ipst->ips_numvifs_mutex);
			}
			ipsq_exit(ipsq, B_TRUE, B_TRUE);
			return (EOPNOTSUPP);
		}
		/* Enable promiscuous reception of all IP mcasts from the if */
		mutex_exit(&vifp->v_lock);
		error = ip_addmulti(INADDR_ANY, ipif, ILGSTAT_NONE,
		    MODE_IS_EXCLUDE, NULL);
		mutex_enter(&vifp->v_lock);
		/*
		 * since we released the lock lets make sure that
		 * ip_mrouter_done() has not been called.
		 */
		if (error != 0 || is_mrouter_off(ipst)) {
			if (error == 0)
				(void) ip_delmulti(INADDR_ANY, ipif, B_TRUE,
				    B_TRUE);
			if (vifcp->vifc_flags & VIFF_REGISTER) {
				mutex_enter(&ipst->ips_numvifs_mutex);
				ipst->ips_reg_vif_num = ALL_VIFS;
				mutex_exit(&ipst->ips_numvifs_mutex);
			}
			VIF_REFRELE_LOCKED(vifp);
			ipif_refrele(ipif);
			ipsq_exit(ipsq, B_TRUE, B_TRUE);
			return (error?error:EINVAL);
		}
	}
	/* Define parameters for the tbf structure */
	vifp->v_tbf = v_tbf;
	gethrestime(&vifp->v_tbf->tbf_last_pkt_t);
	vifp->v_tbf->tbf_n_tok = 0;
	vifp->v_tbf->tbf_q_len = 0;
	vifp->v_tbf->tbf_max_q_len = MAXQSIZE;
	vifp->v_tbf->tbf_q = vifp->v_tbf->tbf_t = NULL;

	vifp->v_flags = vifcp->vifc_flags;
	vifp->v_threshold = vifcp->vifc_threshold;
	vifp->v_lcl_addr = vifcp->vifc_lcl_addr;
	vifp->v_ipif = ipif;
	ipif_refrele(ipif);
	/* Scaling up here, allows division by 1024 in critical code. */
	vifp->v_rate_limit = vifcp->vifc_rate_limit * (1024/1000);
	vifp->v_timeout_id = 0;
	/* initialize per vif pkt counters */
	vifp->v_pkt_in = 0;
	vifp->v_pkt_out = 0;
	vifp->v_bytes_in = 0;
	vifp->v_bytes_out = 0;
	mutex_init(&vifp->v_tbf->tbf_lock, NULL, MUTEX_DEFAULT, NULL);

	/* Adjust numvifs up, if the vifi is higher than numvifs */
	mutex_enter(&ipst->ips_numvifs_mutex);
	if (ipst->ips_numvifs <= vifcp->vifc_vifi)
		ipst->ips_numvifs = vifcp->vifc_vifi + 1;
	mutex_exit(&ipst->ips_numvifs_mutex);

	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "add_vif: #%d, lcladdr %x, %s %x, thresh %x, rate %d",
		    vifcp->vifc_vifi,
		    ntohl(vifcp->vifc_lcl_addr.s_addr),
		    (vifcp->vifc_flags & VIFF_TUNNEL) ? "rmtaddr" : "mask",
		    ntohl(vifcp->vifc_rmt_addr.s_addr),
		    vifcp->vifc_threshold, vifcp->vifc_rate_limit);
	}

	/* The vif is now usable by the forwarding paths. */
	vifp->v_marks = VIF_MARK_GOOD;
	mutex_exit(&vifp->v_lock);
	ipsq_exit(ipsq, B_TRUE, B_TRUE);
	return (0);
}


/* Delete a vif from the vif table. */
static void
del_vifp(struct vif *vifp)
{
	struct tbf	*t = vifp->v_tbf;
	mblk_t		*mp0;
	vifi_t		vifi;
	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	/* Caller (VIF_REFRELE*) guarantees refcnt == 0 and CONDEMNED. */
	ASSERT(vifp->v_marks & VIF_MARK_CONDEMNED);
	ASSERT(t != NULL);

	/*
	 * release the ref we put in vif_del.
	 */
	ASSERT(vifp->v_ipif != NULL);
	ipif_refrele(vifp->v_ipif);

	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "del_vif: src 0x%x\n", vifp->v_lcl_addr.s_addr);
	}

	if (vifp->v_timeout_id != 0) {
		(void) untimeout(vifp->v_timeout_id);
		vifp->v_timeout_id = 0;
	}

	/*
	 * Free packets queued at the interface.
	 * Mrouted takes care of cleaning up mfcs - makes calls to del_mfc.
	 */
	mutex_enter(&t->tbf_lock);
	while (t->tbf_q != NULL) {
		mp0 = t->tbf_q;
		t->tbf_q = t->tbf_q->b_next;
		mp0->b_prev = mp0->b_next = NULL;
		freemsg(mp0);
	}
	mutex_exit(&t->tbf_lock);

	/*
	 * Always clear cache when vifs change.
	 * No need to get last_encap_lock since we are running as a writer.
1113 */ 1114 mutex_enter(&ipst->ips_last_encap_lock); 1115 if (vifp == ipst->ips_last_encap_vif) { 1116 ipst->ips_last_encap_vif = NULL; 1117 ipst->ips_last_encap_src = 0; 1118 } 1119 mutex_exit(&ipst->ips_last_encap_lock); 1120 1121 mutex_destroy(&t->tbf_lock); 1122 1123 bzero(vifp->v_tbf, sizeof (*(vifp->v_tbf))); 1124 1125 /* Adjust numvifs down */ 1126 mutex_enter(&ipst->ips_numvifs_mutex); 1127 for (vifi = ipst->ips_numvifs; vifi != 0; vifi--) /* vifi is unsigned */ 1128 if (ipst->ips_vifs[vifi - 1].v_lcl_addr.s_addr != 0) 1129 break; 1130 ipst->ips_numvifs = vifi; 1131 mutex_exit(&ipst->ips_numvifs_mutex); 1132 1133 bzero(vifp, sizeof (*vifp)); 1134 } 1135 1136 static int 1137 del_vif(vifi_t *vifip, conn_t *connp, mblk_t *first_mp, ip_stack_t *ipst) 1138 { 1139 struct vif *vifp = ipst->ips_vifs + *vifip; 1140 ipsq_t *ipsq; 1141 1142 if (*vifip >= ipst->ips_numvifs) 1143 return (EINVAL); 1144 1145 1146 mutex_enter(&vifp->v_lock); 1147 /* 1148 * Not initialized 1149 * Here we are not looking at the vif that is being initialized 1150 * i.e vifp->v_marks == 0 and refcnt > 0. 1151 */ 1152 if (vifp->v_lcl_addr.s_addr == 0 || 1153 !(vifp->v_marks & VIF_MARK_GOOD)) { 1154 mutex_exit(&vifp->v_lock); 1155 return (EADDRNOTAVAIL); 1156 } 1157 1158 /* 1159 * This is an optimization, if first_mp == NULL 1160 * than we are being called from reset_mrt_vif_ipif() 1161 * so we already have exclusive access to the ipsq. 1162 * the ASSERT below is a check for this condition. 1163 */ 1164 if (first_mp != NULL && 1165 !(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) { 1166 ASSERT(connp != NULL); 1167 /* 1168 * We have to be exclusive as we have to call ip_delmulti() 1169 * This is the best position to try to be exclusive in case 1170 * we have to wait. 
1171 */ 1172 ipsq = ipsq_try_enter(vifp->v_ipif, NULL, CONNP_TO_WQ(connp), 1173 first_mp, ip_restart_optmgmt, NEW_OP, B_TRUE); 1174 if ((ipsq) == NULL) { 1175 mutex_exit(&vifp->v_lock); 1176 return (EINPROGRESS); 1177 } 1178 /* recheck after being exclusive */ 1179 if (vifp->v_lcl_addr.s_addr == 0 || 1180 !vifp->v_marks & VIF_MARK_GOOD) { 1181 /* 1182 * someone beat us. 1183 */ 1184 mutex_exit(&vifp->v_lock); 1185 ipsq_exit(ipsq, B_TRUE, B_TRUE); 1186 return (EADDRNOTAVAIL); 1187 } 1188 } 1189 1190 1191 ASSERT(IAM_WRITER_IPIF(vifp->v_ipif)); 1192 1193 1194 /* 1195 * add a refhold so that ipif does not go away while 1196 * there are still users, this will be released in del_vifp 1197 * when we free the vif. 1198 */ 1199 ipif_refhold(vifp->v_ipif); 1200 1201 /* Clear VIF_MARK_GOOD and set VIF_MARK_CONDEMNED. */ 1202 vifp->v_marks &= ~VIF_MARK_GOOD; 1203 vifp->v_marks |= VIF_MARK_CONDEMNED; 1204 1205 /* Phyint only */ 1206 if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) { 1207 ipif_t *ipif = vifp->v_ipif; 1208 ASSERT(ipif != NULL); 1209 /* 1210 * should be OK to drop the lock as we 1211 * have marked this as CONDEMNED. 1212 */ 1213 mutex_exit(&(vifp)->v_lock); 1214 (void) ip_delmulti(INADDR_ANY, ipif, B_TRUE, B_TRUE); 1215 if (first_mp != NULL) 1216 ipsq_exit(ipsq, B_TRUE, B_TRUE); 1217 mutex_enter(&(vifp)->v_lock); 1218 } 1219 1220 /* 1221 * decreases the refcnt added in add_vif. 1222 */ 1223 VIF_REFRELE_LOCKED(vifp); 1224 return (0); 1225 } 1226 1227 /* 1228 * Add an mfc entry. 1229 */ 1230 static int 1231 add_mfc(struct mfcctl *mfccp, ip_stack_t *ipst) 1232 { 1233 struct mfc *rt; 1234 struct rtdetq *rte; 1235 ushort_t nstl; 1236 int i; 1237 struct mfcb *mfcbp; 1238 conn_t *mrouter = ipst->ips_ip_g_mrouter; 1239 1240 /* 1241 * The value of vifi is NO_VIF (==MAXVIFS) if Mrouted 1242 * did not have a real route for pkt. 
1243 * We want this pkt without rt installed in the mfctable to prevent 1244 * multiiple tries, so go ahead and put it in mfctable, it will 1245 * be discarded later in ip_mdq() because the child is NULL. 1246 */ 1247 1248 /* Error checking, out of bounds? */ 1249 if (mfccp->mfcc_parent > MAXVIFS) { 1250 ip0dbg(("ADD_MFC: mfcc_parent out of range %d", 1251 (int)mfccp->mfcc_parent)); 1252 return (EINVAL); 1253 } 1254 1255 if ((mfccp->mfcc_parent != NO_VIF) && 1256 (ipst->ips_vifs[mfccp->mfcc_parent].v_ipif == NULL)) { 1257 ip0dbg(("ADD_MFC: NULL ipif for parent vif %d\n", 1258 (int)mfccp->mfcc_parent)); 1259 return (EINVAL); 1260 } 1261 1262 if (is_mrouter_off(ipst)) { 1263 return (EINVAL); 1264 } 1265 1266 mfcbp = &ipst->ips_mfcs[MFCHASH(mfccp->mfcc_origin.s_addr, 1267 mfccp->mfcc_mcastgrp.s_addr)]; 1268 MFCB_REFHOLD(mfcbp); 1269 MFCFIND(mfcbp, mfccp->mfcc_origin.s_addr, 1270 mfccp->mfcc_mcastgrp.s_addr, rt); 1271 1272 /* If an entry already exists, just update the fields */ 1273 if (rt) { 1274 if (ipst->ips_ip_mrtdebug > 1) { 1275 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 1276 "add_mfc: update o %x grp %x parent %x", 1277 ntohl(mfccp->mfcc_origin.s_addr), 1278 ntohl(mfccp->mfcc_mcastgrp.s_addr), 1279 mfccp->mfcc_parent); 1280 } 1281 mutex_enter(&rt->mfc_mutex); 1282 rt->mfc_parent = mfccp->mfcc_parent; 1283 1284 mutex_enter(&ipst->ips_numvifs_mutex); 1285 for (i = 0; i < (int)ipst->ips_numvifs; i++) 1286 rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; 1287 mutex_exit(&ipst->ips_numvifs_mutex); 1288 mutex_exit(&rt->mfc_mutex); 1289 1290 MFCB_REFRELE(mfcbp); 1291 return (0); 1292 } 1293 1294 /* 1295 * Find the entry for which the upcall was made and update. 
1296 */ 1297 for (rt = mfcbp->mfcb_mfc, nstl = 0; rt; rt = rt->mfc_next) { 1298 mutex_enter(&rt->mfc_mutex); 1299 if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) && 1300 (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr) && 1301 (rt->mfc_rte != NULL) && 1302 !(rt->mfc_marks & MFCB_MARK_CONDEMNED)) { 1303 if (nstl++ != 0) 1304 cmn_err(CE_WARN, 1305 "add_mfc: %s o %x g %x p %x", 1306 "multiple kernel entries", 1307 ntohl(mfccp->mfcc_origin.s_addr), 1308 ntohl(mfccp->mfcc_mcastgrp.s_addr), 1309 mfccp->mfcc_parent); 1310 1311 if (ipst->ips_ip_mrtdebug > 1) { 1312 (void) mi_strlog(mrouter->conn_rq, 1, 1313 SL_TRACE, 1314 "add_mfc: o %x g %x p %x", 1315 ntohl(mfccp->mfcc_origin.s_addr), 1316 ntohl(mfccp->mfcc_mcastgrp.s_addr), 1317 mfccp->mfcc_parent); 1318 } 1319 fill_route(rt, mfccp, ipst); 1320 1321 /* 1322 * Prevent cleanup of cache entry. 1323 * Timer starts in ip_mforward. 1324 */ 1325 if (rt->mfc_timeout_id != 0) { 1326 timeout_id_t id; 1327 id = rt->mfc_timeout_id; 1328 /* 1329 * setting id to zero will avoid this 1330 * entry from being cleaned up in 1331 * expire_up_calls(). 1332 */ 1333 rt->mfc_timeout_id = 0; 1334 /* 1335 * dropping the lock is fine as we 1336 * have a refhold on the bucket. 1337 * so mfc cannot be freed. 1338 * The timeout can fire but it will see 1339 * that mfc_timeout_id == 0 and not cleanup. 1340 */ 1341 mutex_exit(&rt->mfc_mutex); 1342 (void) untimeout(id); 1343 mutex_enter(&rt->mfc_mutex); 1344 } 1345 1346 /* 1347 * Send all pkts that are queued waiting for the upcall. 1348 * ip_mdq param tun set to 0 - 1349 * the return value of ip_mdq() isn't used here, 1350 * so value we send doesn't matter. 
1351 */ 1352 while (rt->mfc_rte != NULL) { 1353 rte = rt->mfc_rte; 1354 rt->mfc_rte = rte->rte_next; 1355 mutex_exit(&rt->mfc_mutex); 1356 (void) ip_mdq(rte->mp, (ipha_t *) 1357 rte->mp->b_rptr, rte->ill, 0, rt); 1358 freemsg(rte->mp); 1359 mi_free((char *)rte); 1360 mutex_enter(&rt->mfc_mutex); 1361 } 1362 } 1363 mutex_exit(&rt->mfc_mutex); 1364 } 1365 1366 1367 /* 1368 * It is possible that an entry is being inserted without an upcall 1369 */ 1370 if (nstl == 0) { 1371 mutex_enter(&(mfcbp->mfcb_lock)); 1372 if (ipst->ips_ip_mrtdebug > 1) { 1373 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 1374 "add_mfc: no upcall o %x g %x p %x", 1375 ntohl(mfccp->mfcc_origin.s_addr), 1376 ntohl(mfccp->mfcc_mcastgrp.s_addr), 1377 mfccp->mfcc_parent); 1378 } 1379 if (is_mrouter_off(ipst)) { 1380 mutex_exit(&mfcbp->mfcb_lock); 1381 MFCB_REFRELE(mfcbp); 1382 return (EINVAL); 1383 } 1384 1385 for (rt = mfcbp->mfcb_mfc; rt; rt = rt->mfc_next) { 1386 1387 mutex_enter(&rt->mfc_mutex); 1388 if ((rt->mfc_origin.s_addr == 1389 mfccp->mfcc_origin.s_addr) && 1390 (rt->mfc_mcastgrp.s_addr == 1391 mfccp->mfcc_mcastgrp.s_addr) && 1392 (!(rt->mfc_marks & MFCB_MARK_CONDEMNED))) { 1393 fill_route(rt, mfccp, ipst); 1394 mutex_exit(&rt->mfc_mutex); 1395 break; 1396 } 1397 mutex_exit(&rt->mfc_mutex); 1398 } 1399 1400 /* No upcall, so make a new entry into mfctable */ 1401 if (rt == NULL) { 1402 rt = (struct mfc *)mi_zalloc(sizeof (struct mfc)); 1403 if (rt == NULL) { 1404 ip1dbg(("add_mfc: out of memory\n")); 1405 mutex_exit(&mfcbp->mfcb_lock); 1406 MFCB_REFRELE(mfcbp); 1407 return (ENOBUFS); 1408 } 1409 1410 /* Insert new entry at head of hash chain */ 1411 mutex_enter(&rt->mfc_mutex); 1412 fill_route(rt, mfccp, ipst); 1413 1414 /* Link into table */ 1415 rt->mfc_next = mfcbp->mfcb_mfc; 1416 mfcbp->mfcb_mfc = rt; 1417 mutex_exit(&rt->mfc_mutex); 1418 } 1419 mutex_exit(&mfcbp->mfcb_lock); 1420 } 1421 1422 MFCB_REFRELE(mfcbp); 1423 return (0); 1424 } 1425 1426 /* 1427 * Fills in mfc structure from 
mrouted mfcctl.
 *
 * Copies origin/group/parent and the per-vif ttl vector from the
 * mrouted-supplied mfcctl into the kernel mfc, and resets the per
 * src-grp packet counters.  Caller holds rt->mfc_mutex.
 */
static void
fill_route(struct mfc *rt, struct mfcctl *mfccp, ip_stack_t *ipst)
{
	int i;

	rt->mfc_origin = mfccp->mfcc_origin;
	rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp;
	rt->mfc_parent = mfccp->mfcc_parent;
	/* numvifs_mutex stabilizes ips_numvifs while we copy the ttls. */
	mutex_enter(&ipst->ips_numvifs_mutex);
	for (i = 0; i < (int)ipst->ips_numvifs; i++) {
		rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
	}
	mutex_exit(&ipst->ips_numvifs_mutex);
	/* Initialize pkt counters per src-grp */
	rt->mfc_pkt_cnt = 0;
	rt->mfc_byte_cnt = 0;
	rt->mfc_wrong_if = 0;
	rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_nsec = 0;

}

/*
 * Drop all upcall packets still queued on mfcp and free their rtdetq
 * wrappers.  Caller is responsible for any locking of mfcp.
 */
static void
free_queue(struct mfc *mfcp)
{
	struct rtdetq *rte0;

	/*
	 * Drop all queued upcall packets.
	 * Free the mbuf with the pkt.
	 */
	while ((rte0 = mfcp->mfc_rte) != NULL) {
		mfcp->mfc_rte = rte0->rte_next;
		freemsg(rte0->mp);
		mi_free((char *)rte0);
	}
}
/*
 * go through the hash bucket and free all the entries marked condemned.
1467 */ 1468 void 1469 release_mfc(struct mfcb *mfcbp) 1470 { 1471 struct mfc *current_mfcp; 1472 struct mfc *prev_mfcp; 1473 1474 prev_mfcp = current_mfcp = mfcbp->mfcb_mfc; 1475 1476 while (current_mfcp != NULL) { 1477 if (current_mfcp->mfc_marks & MFCB_MARK_CONDEMNED) { 1478 if (current_mfcp == mfcbp->mfcb_mfc) { 1479 mfcbp->mfcb_mfc = current_mfcp->mfc_next; 1480 free_queue(current_mfcp); 1481 mi_free(current_mfcp); 1482 prev_mfcp = current_mfcp = mfcbp->mfcb_mfc; 1483 continue; 1484 } 1485 ASSERT(prev_mfcp != NULL); 1486 prev_mfcp->mfc_next = current_mfcp->mfc_next; 1487 free_queue(current_mfcp); 1488 mi_free(current_mfcp); 1489 current_mfcp = NULL; 1490 } else { 1491 prev_mfcp = current_mfcp; 1492 } 1493 1494 current_mfcp = prev_mfcp->mfc_next; 1495 1496 } 1497 mfcbp->mfcb_marks &= ~MFCB_MARK_CONDEMNED; 1498 ASSERT(mfcbp->mfcb_mfc != NULL || mfcbp->mfcb_marks == 0); 1499 } 1500 1501 /* 1502 * Delete an mfc entry. 1503 */ 1504 static int 1505 del_mfc(struct mfcctl *mfccp, ip_stack_t *ipst) 1506 { 1507 struct in_addr origin; 1508 struct in_addr mcastgrp; 1509 struct mfc *rt; 1510 uint_t hash; 1511 conn_t *mrouter = ipst->ips_ip_g_mrouter; 1512 1513 origin = mfccp->mfcc_origin; 1514 mcastgrp = mfccp->mfcc_mcastgrp; 1515 hash = MFCHASH(origin.s_addr, mcastgrp.s_addr); 1516 1517 if (ipst->ips_ip_mrtdebug > 1) { 1518 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 1519 "del_mfc: o %x g %x", 1520 ntohl(origin.s_addr), 1521 ntohl(mcastgrp.s_addr)); 1522 } 1523 1524 MFCB_REFHOLD(&ipst->ips_mfcs[hash]); 1525 1526 /* Find mfc in mfctable, finds only entries without upcalls */ 1527 for (rt = ipst->ips_mfcs[hash].mfcb_mfc; rt; rt = rt->mfc_next) { 1528 mutex_enter(&rt->mfc_mutex); 1529 if (origin.s_addr == rt->mfc_origin.s_addr && 1530 mcastgrp.s_addr == rt->mfc_mcastgrp.s_addr && 1531 rt->mfc_rte == NULL && 1532 !(rt->mfc_marks & MFCB_MARK_CONDEMNED)) 1533 break; 1534 mutex_exit(&rt->mfc_mutex); 1535 } 1536 1537 /* 1538 * Return if there was an upcall (mfc_rte != NULL, 
1539 * or rt not in mfctable. 1540 */ 1541 if (rt == NULL) { 1542 MFCB_REFRELE(&ipst->ips_mfcs[hash]); 1543 return (EADDRNOTAVAIL); 1544 } 1545 1546 1547 /* 1548 * no need to hold lock as we have a reference. 1549 */ 1550 ipst->ips_mfcs[hash].mfcb_marks |= MFCB_MARK_CONDEMNED; 1551 /* error checking */ 1552 if (rt->mfc_timeout_id != 0) { 1553 ip0dbg(("del_mfc: TIMEOUT NOT 0, rte not null")); 1554 /* 1555 * Its ok to drop the lock, the struct cannot be freed 1556 * since we have a ref on the hash bucket. 1557 */ 1558 rt->mfc_timeout_id = 0; 1559 mutex_exit(&rt->mfc_mutex); 1560 (void) untimeout(rt->mfc_timeout_id); 1561 mutex_enter(&rt->mfc_mutex); 1562 } 1563 1564 ASSERT(rt->mfc_rte == NULL); 1565 1566 1567 /* 1568 * Delete the entry from the cache 1569 */ 1570 rt->mfc_marks |= MFCB_MARK_CONDEMNED; 1571 mutex_exit(&rt->mfc_mutex); 1572 1573 MFCB_REFRELE(&ipst->ips_mfcs[hash]); 1574 1575 return (0); 1576 } 1577 1578 #define TUNNEL_LEN 12 /* # bytes of IP option for tunnel encapsulation */ 1579 1580 /* 1581 * IP multicast forwarding function. This function assumes that the packet 1582 * pointed to by ipha has arrived on (or is about to be sent to) the interface 1583 * pointed to by "ill", and the packet is to be relayed to other networks 1584 * that have members of the packet's destination IP multicast group. 1585 * 1586 * The packet is returned unscathed to the caller, unless it is 1587 * erroneous, in which case a -1 value tells the caller (IP) 1588 * to discard it. 1589 * 1590 * Unlike BSD, SunOS 5.x needs to return to IP info about 1591 * whether pkt came in thru a tunnel, so it can be discarded, unless 1592 * it's IGMP. In BSD, the ifp is bogus for tunnels, so pkt won't try 1593 * to be delivered. 
 * Return values are 0 - pkt is okay and phyint
 *		-1 - pkt is malformed and to be tossed
 *		 1 - pkt came in on tunnel
 */
int
ip_mforward(ill_t *ill, ipha_t *ipha, mblk_t *mp)
{
	struct mfc	*rt;
	ipaddr_t	src, dst, tunnel_src = 0;
	/*
	 * NOTE(review): srctun is a file-static counter shared across all
	 * stacks/threads without a lock; it only throttles a warning, so
	 * the benign race is presumably acceptable — confirm.
	 */
	static int	srctun = 0;
	vifi_t		vifi;
	boolean_t	pim_reg_packet = B_FALSE;
	struct mfcb	*mfcbp;
	ip_stack_t	*ipst = ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "ip_mforward: RECV ipha_src %x, ipha_dst %x, ill %s",
		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
		    ill->ill_name);
	}

	dst = ipha->ipha_dst;
	/* b_prev carries either the PIM marker or the tunnel source addr. */
	if ((uint32_t)(uintptr_t)mp->b_prev == PIM_REGISTER_MARKER)
		pim_reg_packet = B_TRUE;
	else
		tunnel_src = (ipaddr_t)(uintptr_t)mp->b_prev;

	/*
	 * Don't forward a packet with time-to-live of zero or one,
	 * or a packet destined to a local-only group.
	 */
	if (CLASSD(dst) && (ipha->ipha_ttl <= 1 ||
	    (ipaddr_t)ntohl(dst) <= INADDR_MAX_LOCAL_GROUP)) {
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: not forwarded ttl %d,"
			    " dst 0x%x ill %s",
			    ipha->ipha_ttl, ntohl(dst), ill->ill_name);
		}
		mp->b_prev = NULL;
		if (tunnel_src != 0)
			return (1);
		else
			return (0);
	}

	if ((tunnel_src != 0) || pim_reg_packet) {
		/*
		 * Packet arrived over an encapsulated tunnel or via a PIM
		 * register message. Both ip_mroute_decap() and pim_input()
		 * encode information in mp->b_prev.
		 */
		mp->b_prev = NULL;
		if (ipst->ips_ip_mrtdebug > 1) {
			if (tunnel_src != 0) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: ill %s arrived via ENCAP TUN",
				    ill->ill_name);
			} else if (pim_reg_packet) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: ill %s arrived via"
				    " REGISTER VIF",
				    ill->ill_name);
			}
		}
	} else if ((ipha->ipha_version_and_hdr_length & 0xf) <
	    (uint_t)(IP_SIMPLE_HDR_LENGTH + TUNNEL_LEN) >> 2 ||
	    ((uchar_t *)(ipha + 1))[1] != IPOPT_LSRR) {
		/* Packet arrived via a physical interface. */
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: ill %s arrived via PHYINT",
			    ill->ill_name);
		}

	} else {
		/*
		 * Packet arrived through a SRCRT tunnel.
		 * Source-route tunnels are no longer supported.
		 * Error message printed every 1000 times.
		 */
		if ((srctun++ % 1000) == 0) {
			cmn_err(CE_WARN,
			    "ip_mforward: received source-routed pkt from %x",
			    ntohl(ipha->ipha_src));
		}
		return (-1);
	}

	ipst->ips_mrtstat->mrts_fwd_in++;
	src = ipha->ipha_src;

	/* Find route in cache, return NULL if not there or upcalls q'ed. */

	/*
	 * Lock the mfctable against changes made by ip_mforward.
	 * Note that only add_mfc and del_mfc can remove entries and
	 * they run with exclusive access to IP. So we do not need to
	 * guard against the rt being deleted, so release lock after reading.
	 */

	if (is_mrouter_off(ipst))
		return (-1);

	mfcbp = &ipst->ips_mfcs[MFCHASH(src, dst)];
	MFCB_REFHOLD(mfcbp);
	MFCFIND(mfcbp, src, dst, rt);

	/* Entry exists, so forward if necessary */
	if (rt != NULL) {
		int ret = 0;
		ipst->ips_mrtstat->mrts_mfc_hits++;
		if (pim_reg_packet) {
			ASSERT(ipst->ips_reg_vif_num != ALL_VIFS);
			ret = ip_mdq(mp, ipha,
			    ipst->ips_vifs[ipst->ips_reg_vif_num].
			    v_ipif->ipif_ill,
			    0, rt);
		} else {
			ret = ip_mdq(mp, ipha, ill, tunnel_src, rt);
		}

		MFCB_REFRELE(mfcbp);
		return (ret);

		/*
		 * Don't forward if we don't have a cache entry.  Mrouted will
		 * always provide a cache entry in response to an upcall.
		 */
	} else {
		/*
		 * If we don't have a route for packet's origin, make a copy
		 * of the packet and send message to routing daemon.
		 */
		struct mfc	*mfc_rt = NULL;
		mblk_t		*mp0 = NULL;
		mblk_t		*mp_copy = NULL;
		struct rtdetq	*rte = NULL;
		struct rtdetq	*rte_m, *rte1, *prev_rte;
		uint_t		hash;
		int		npkts;
		boolean_t	new_mfc = B_FALSE;
		ipst->ips_mrtstat->mrts_mfc_misses++;
		/* BSD uses mrts_no_route++ */
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: no rte ill %s src %x g %x misses %d",
			    ill->ill_name, ntohl(src), ntohl(dst),
			    (int)ipst->ips_mrtstat->mrts_mfc_misses);
		}
		/*
		 * The order of the following code differs from the BSD code.
		 * Pre-mc3.5, the BSD code was incorrect and SunOS 5.x
		 * code works, so SunOS 5.x wasn't changed to conform to the
		 * BSD version.
		 */

		/* Lock mfctable.  (hash bucket == mfcbp held above) */
		hash = MFCHASH(src, dst);
		mutex_enter(&(ipst->ips_mfcs[hash].mfcb_lock));

		/*
		 * If we are turning off mrouted return an error
		 */
		if (is_mrouter_off(ipst)) {
			mutex_exit(&mfcbp->mfcb_lock);
			MFCB_REFRELE(mfcbp);
			return (-1);
		}

		/* Is there an upcall waiting for this packet? */
		for (mfc_rt = ipst->ips_mfcs[hash].mfcb_mfc; mfc_rt;
		    mfc_rt = mfc_rt->mfc_next) {
			mutex_enter(&mfc_rt->mfc_mutex);
			if (ipst->ips_ip_mrtdebug > 1) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: MFCTAB hash %d o 0x%x"
				    " g 0x%x\n",
				    hash, ntohl(mfc_rt->mfc_origin.s_addr),
				    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
			}
			/* There is an upcall */
			if ((src == mfc_rt->mfc_origin.s_addr) &&
			    (dst == mfc_rt->mfc_mcastgrp.s_addr) &&
			    (mfc_rt->mfc_rte != NULL) &&
			    !(mfc_rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
				/* NB: mfc_mutex stays held on break. */
				break;
			}
			mutex_exit(&mfc_rt->mfc_mutex);
		}
		/* No upcall, so make a new entry into mfctable */
		if (mfc_rt == NULL) {
			mfc_rt = (struct mfc *)mi_zalloc(sizeof (struct mfc));
			if (mfc_rt == NULL) {
				ipst->ips_mrtstat->mrts_fwd_drop++;
				ip1dbg(("ip_mforward: out of memory "
				    "for mfc, mfc_rt\n"));
				goto error_return;
			} else
				new_mfc = B_TRUE;
			/* Get resources */
			/* TODO could copy header and dup rest */
			mp_copy = copymsg(mp);
			if (mp_copy == NULL) {
				ipst->ips_mrtstat->mrts_fwd_drop++;
				ip1dbg(("ip_mforward: out of memory for "
				    "mblk, mp_copy\n"));
				goto error_return;
			}
			mutex_enter(&mfc_rt->mfc_mutex);
		}
		/* Get resources for rte, whether first rte or not first. */
		/* Add this packet into rtdetq */
		rte = (struct rtdetq *)mi_zalloc(sizeof (struct rtdetq));
		if (rte == NULL) {
			ipst->ips_mrtstat->mrts_fwd_drop++;
			mutex_exit(&mfc_rt->mfc_mutex);
			ip1dbg(("ip_mforward: out of memory for"
			    " rtdetq, rte\n"));
			goto error_return;
		}

		mp0 = copymsg(mp);
		if (mp0 == NULL) {
			ipst->ips_mrtstat->mrts_fwd_drop++;
			ip1dbg(("ip_mforward: out of memory for mblk, mp0\n"));
			mutex_exit(&mfc_rt->mfc_mutex);
			goto error_return;
		}
		rte->mp = mp0;
		if (pim_reg_packet) {
			ASSERT(ipst->ips_reg_vif_num != ALL_VIFS);
			rte->ill =
			    ipst->ips_vifs[ipst->ips_reg_vif_num].
			    v_ipif->ipif_ill;
		} else {
			rte->ill = ill;
		}
		rte->rte_next = NULL;

		/*
		 * Determine if upcall q (rtdetq) has overflowed.
		 * mfc_rt->mfc_rte is null by mi_zalloc
		 * if it is the first message.
		 */
		for (rte_m = mfc_rt->mfc_rte, npkts = 0; rte_m;
		    rte_m = rte_m->rte_next)
			npkts++;
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: upcalls %d\n", npkts);
		}
		if (npkts > MAX_UPQ) {
			ipst->ips_mrtstat->mrts_upq_ovflw++;
			mutex_exit(&mfc_rt->mfc_mutex);
			goto error_return;
		}

		if (npkts == 0) {	/* first upcall */
			int i = 0;
			/*
			 * Now finish installing the new mfc! Now that we have
			 * resources!  Insert new entry at head of hash chain.
			 * Use src and dst which are ipaddr_t's.
			 */
			mfc_rt->mfc_origin.s_addr = src;
			mfc_rt->mfc_mcastgrp.s_addr = dst;

			mutex_enter(&ipst->ips_numvifs_mutex);
			for (i = 0; i < (int)ipst->ips_numvifs; i++)
				mfc_rt->mfc_ttls[i] = 0;
			mutex_exit(&ipst->ips_numvifs_mutex);
			mfc_rt->mfc_parent = ALL_VIFS;

			/* Link into table */
			if (ipst->ips_ip_mrtdebug > 1) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: NEW MFCTAB hash %d o 0x%x "
				    "g 0x%x\n", hash,
				    ntohl(mfc_rt->mfc_origin.s_addr),
				    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
			}
			mfc_rt->mfc_next = ipst->ips_mfcs[hash].mfcb_mfc;
			ipst->ips_mfcs[hash].mfcb_mfc = mfc_rt;
			mfc_rt->mfc_rte = NULL;
		}

		/* Link in the upcall */
		/* First upcall */
		if (mfc_rt->mfc_rte == NULL)
			mfc_rt->mfc_rte = rte;
		else {
			/* not the first upcall */
			prev_rte = mfc_rt->mfc_rte;
			for (rte1 = mfc_rt->mfc_rte->rte_next; rte1;
			    prev_rte = rte1, rte1 = rte1->rte_next)
				;
			prev_rte->rte_next = rte;
		}

		/*
		 * No upcalls waiting, this is first one, so send a message to
		 * routing daemon to install a route into kernel table.
		 */
		if (npkts == 0) {
			struct igmpmsg	*im;
			/* ipha_protocol is 0, for upcall */
			ASSERT(mp_copy != NULL);
			im = (struct igmpmsg *)mp_copy->b_rptr;
			im->im_msgtype = IGMPMSG_NOCACHE;
			im->im_mbz = 0;
			mutex_enter(&ipst->ips_numvifs_mutex);
			if (pim_reg_packet) {
				im->im_vif = (uchar_t)ipst->ips_reg_vif_num;
				mutex_exit(&ipst->ips_numvifs_mutex);
			} else {
				/*
				 * XXX do we need to hold locks here ?
				 */
				for (vifi = 0;
				    vifi < ipst->ips_numvifs;
				    vifi++) {
					if (ipst->ips_vifs[vifi].v_ipif == NULL)
						continue;
					if (ipst->ips_vifs[vifi].
					    v_ipif->ipif_ill == ill) {
						im->im_vif = (uchar_t)vifi;
						break;
					}
				}
				mutex_exit(&ipst->ips_numvifs_mutex);
				ASSERT(vifi < ipst->ips_numvifs);
			}

			ipst->ips_mrtstat->mrts_upcalls++;
			/* Timer to discard upcalls if mrouted is too slow */
			mfc_rt->mfc_timeout_id = timeout(expire_upcalls,
			    mfc_rt, EXPIRE_TIMEOUT * UPCALL_EXPIRE);
			mutex_exit(&mfc_rt->mfc_mutex);
			mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
			/* Pass to RAWIP */
			(mrouter->conn_recv)(mrouter, mp_copy, NULL);
		} else {
			mutex_exit(&mfc_rt->mfc_mutex);
			mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
			freemsg(mp_copy);
		}

		MFCB_REFRELE(mfcbp);
		if (tunnel_src != 0)
			return (1);
		else
			return (0);
	error_return:
		/* Unwind: free whatever was allocated before the failure. */
		mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
		MFCB_REFRELE(mfcbp);
		if (mfc_rt != NULL && (new_mfc == B_TRUE))
			mi_free((char *)mfc_rt);
		if (rte != NULL)
			mi_free((char *)rte);
		if (mp_copy != NULL)
			freemsg(mp_copy);
		if (mp0 != NULL)
			freemsg(mp0);
		return (-1);
	}
}

/*
 * Clean up the mfctable cache entry if upcall is not serviced.
 * SunOS 5.x has timeout per mfc, unlike BSD which has one timer.
1971 */ 1972 static void 1973 expire_upcalls(void *arg) 1974 { 1975 struct mfc *mfc_rt = arg; 1976 uint_t hash; 1977 struct mfc *prev_mfc, *mfc0; 1978 ip_stack_t *ipst; 1979 conn_t *mrouter; 1980 1981 if (mfc_rt->mfc_rte == NULL || mfc_rt->mfc_rte->ill != NULL) { 1982 cmn_err(CE_WARN, "expire_upcalls: no ILL\n"); 1983 return; 1984 } 1985 ipst = mfc_rt->mfc_rte->ill->ill_ipst; 1986 mrouter = ipst->ips_ip_g_mrouter; 1987 1988 hash = MFCHASH(mfc_rt->mfc_origin.s_addr, mfc_rt->mfc_mcastgrp.s_addr); 1989 if (ipst->ips_ip_mrtdebug > 1) { 1990 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 1991 "expire_upcalls: hash %d s %x g %x", 1992 hash, ntohl(mfc_rt->mfc_origin.s_addr), 1993 ntohl(mfc_rt->mfc_mcastgrp.s_addr)); 1994 } 1995 MFCB_REFHOLD(&ipst->ips_mfcs[hash]); 1996 mutex_enter(&mfc_rt->mfc_mutex); 1997 /* 1998 * if timeout has been set to zero, than the 1999 * entry has been filled, no need to delete it. 2000 */ 2001 if (mfc_rt->mfc_timeout_id == 0) 2002 goto done; 2003 ipst->ips_mrtstat->mrts_cache_cleanups++; 2004 mfc_rt->mfc_timeout_id = 0; 2005 2006 /* Determine entry to be cleaned up in cache table. */ 2007 for (prev_mfc = mfc0 = ipst->ips_mfcs[hash].mfcb_mfc; mfc0; 2008 prev_mfc = mfc0, mfc0 = mfc0->mfc_next) 2009 if (mfc0 == mfc_rt) 2010 break; 2011 2012 /* del_mfc takes care of gone mfcs */ 2013 ASSERT(prev_mfc != NULL); 2014 ASSERT(mfc0 != NULL); 2015 2016 /* 2017 * Delete the entry from the cache 2018 */ 2019 ipst->ips_mfcs[hash].mfcb_marks |= MFCB_MARK_CONDEMNED; 2020 mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED; 2021 2022 /* 2023 * release_mfc will drop all queued upcall packets. 2024 * and will free the mbuf with the pkt, if, timing info. 2025 */ 2026 done: 2027 mutex_exit(&mfc_rt->mfc_mutex); 2028 MFCB_REFRELE(&ipst->ips_mfcs[hash]); 2029 } 2030 2031 /* 2032 * Packet forwarding routine once entry in the cache is made. 
 */
static int
ip_mdq(mblk_t *mp, ipha_t *ipha, ill_t *ill, ipaddr_t tunnel_src,
    struct mfc *rt)
{
	vifi_t		vifi;
	struct vif	*vifp;
	ipaddr_t	dst = ipha->ipha_dst;
	size_t		plen = msgdsize(mp);
	vifi_t		num_of_vifs;
	ip_stack_t	*ipst = ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "ip_mdq: SEND src %x, ipha_dst %x, ill %s",
		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
		    ill->ill_name);
	}

	/* Macro to send packet on vif */
#define	MC_SEND(ipha, mp, vifp, dst) { \
	if ((vifp)->v_flags & VIFF_TUNNEL) \
		encap_send((ipha), (mp), (vifp), (dst)); \
	else if ((vifp)->v_flags & VIFF_REGISTER) \
		register_send((ipha), (mp), (vifp), (dst)); \
	else \
		phyint_send((ipha), (mp), (vifp), (dst)); \
}

	vifi = rt->mfc_parent;

	/*
	 * The value of vifi is MAXVIFS if the pkt had no parent, i.e.,
	 * Mrouted had no route.
	 * We wanted the route installed in the mfctable to prevent multiple
	 * tries, so it passed add_mfc(), but is discarded here. The v_ipif is
	 * NULL so we don't want to check the ill. Still needed as of Mrouted
	 * 3.6.
	 */
	if (vifi == NO_VIF) {
		ip1dbg(("ip_mdq: no route for origin ill %s, vifi is NO_VIF\n",
		    ill->ill_name));
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mdq: vifi is NO_VIF ill = %s", ill->ill_name);
		}
		return (-1);	/* drop pkt */
	}

	/* lock_good_vif holds the vif locked+refheld while we use it. */
	if (!lock_good_vif(&ipst->ips_vifs[vifi]))
		return (-1);
	/*
	 * The MFC entries are not cleaned up when an ipif goes
	 * away thus this code has to guard against an MFC referencing
	 * an ipif that has been closed. Note: reset_mrt_vif_ipif
	 * sets the v_ipif to NULL when the ipif disappears.
	 */
	ASSERT(ipst->ips_vifs[vifi].v_ipif != NULL);

	if (vifi >= ipst->ips_numvifs) {
		cmn_err(CE_WARN, "ip_mdq: illegal vifi %d numvifs "
		    "%d ill %s viftable ill %s\n",
		    (int)vifi, (int)ipst->ips_numvifs, ill->ill_name,
		    ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name);
		unlock_good_vif(&ipst->ips_vifs[vifi]);
		return (-1);
	}
	/*
	 * Don't forward if it didn't arrive from the parent vif for its
	 * origin. But do match on the groups as we nominate only one
	 * ill in the group for receiving allmulti packets.
	 */
	if ((ipst->ips_vifs[vifi].v_ipif->ipif_ill != ill &&
	    (ill->ill_group == NULL ||
	    ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_group !=
	    ill->ill_group)) ||
	    (ipst->ips_vifs[vifi].v_rmt_addr.s_addr != tunnel_src)) {
		/* Came in the wrong interface */
		ip1dbg(("ip_mdq: arrived wrong if, vifi %d "
		    "numvifs %d ill %s viftable ill %s\n",
		    (int)vifi, (int)ipst->ips_numvifs, ill->ill_name,
		    ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name));
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mdq: arrived wrong if, vifi %d ill "
			    "%s viftable ill %s\n",
			    (int)vifi, ill->ill_name,
			    ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name);
		}
		ipst->ips_mrtstat->mrts_wrong_if++;
		rt->mfc_wrong_if++;

		/*
		 * If we are doing PIM assert processing and we are forwarding
		 * packets on this interface, and it is a broadcast medium
		 * interface (and not a tunnel), send a message to the routing.
		 *
		 * We use the first ipif on the list, since it's all we have.
		 * Chances are the ipif_flags are the same for ipifs on the ill.
		 */
		if (ipst->ips_pim_assert && rt->mfc_ttls[vifi] > 0 &&
		    (ill->ill_ipif->ipif_flags & IPIF_BROADCAST) &&
		    !(ipst->ips_vifs[vifi].v_flags & VIFF_TUNNEL)) {
			mblk_t *mp_copy;
			struct igmpmsg *im;

			/* TODO could copy header and dup rest */
			mp_copy = copymsg(mp);
			if (mp_copy == NULL) {
				ipst->ips_mrtstat->mrts_fwd_drop++;
				ip1dbg(("ip_mdq: out of memory "
				    "for mblk, mp_copy\n"));
				unlock_good_vif(&ipst->ips_vifs[vifi]);
				return (-1);
			}

			/* WRONGVIF upcall so mrouted can do PIM assert. */
			im = (struct igmpmsg *)mp_copy->b_rptr;
			im->im_msgtype = IGMPMSG_WRONGVIF;
			im->im_mbz = 0;
			im->im_vif = (ushort_t)vifi;
			/* Pass to RAWIP */
			(mrouter->conn_recv)(mrouter, mp_copy, NULL);
		}
		unlock_good_vif(&ipst->ips_vifs[vifi]);
		if (tunnel_src != 0)
			return (1);
		else
			return (0);
	}
	/*
	 * If I sourced this packet, it counts as output, else it was input.
	 */
	if (ipha->ipha_src == ipst->ips_vifs[vifi].v_lcl_addr.s_addr) {
		ipst->ips_vifs[vifi].v_pkt_out++;
		ipst->ips_vifs[vifi].v_bytes_out += plen;
	} else {
		ipst->ips_vifs[vifi].v_pkt_in++;
		ipst->ips_vifs[vifi].v_bytes_in += plen;
	}
	mutex_enter(&rt->mfc_mutex);
	rt->mfc_pkt_cnt++;
	rt->mfc_byte_cnt += plen;
	mutex_exit(&rt->mfc_mutex);
	unlock_good_vif(&ipst->ips_vifs[vifi]);
	/*
	 * For each vif, decide if a copy of the packet should be forwarded.
	 * Forward if:
	 *		- the vif threshold ttl is non-zero AND
	 *		- the pkt ttl exceeds the vif's threshold
	 * A non-zero mfc_ttl indicates that the vif is part of
	 * the output set for the mfc entry.
	 */
	mutex_enter(&ipst->ips_numvifs_mutex);
	num_of_vifs = ipst->ips_numvifs;
	mutex_exit(&ipst->ips_numvifs_mutex);
	for (vifp = ipst->ips_vifs, vifi = 0;
	    vifi < num_of_vifs;
	    vifp++, vifi++) {
		if (!lock_good_vif(vifp))
			continue;
		if ((rt->mfc_ttls[vifi] > 0) &&
		    (ipha->ipha_ttl > rt->mfc_ttls[vifi])) {
			/*
			 * lock_good_vif should not have succedded if
			 * v_ipif is null.
			 */
			ASSERT(vifp->v_ipif != NULL);
			vifp->v_pkt_out++;
			vifp->v_bytes_out += plen;
			/* MC_SEND copies; mp remains owned by the caller. */
			MC_SEND(ipha, mp, vifp, dst);
			ipst->ips_mrtstat->mrts_fwd_out++;
		}
		unlock_good_vif(vifp);
	}
	if (tunnel_src != 0)
		return (1);
	else
		return (0);
}

/*
 * Send the packet on physical interface.
 * Caller assumes can continue to use mp on return.
 */
/* ARGSUSED */
static void
phyint_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
{
	mblk_t	*mp_copy;
	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	/* Make a new reference to the packet */
	mp_copy = copymsg(mp);	/* TODO could copy header and dup rest */
	if (mp_copy == NULL) {
		ipst->ips_mrtstat->mrts_fwd_drop++;
		ip1dbg(("phyint_send: out of memory for mblk, mp_copy\n"));
		return;
	}
	/* Rate limit 0 means no token-bucket shaping: send directly. */
	if (vifp->v_rate_limit <= 0)
		tbf_send_packet(vifp, mp_copy);
	else  {
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "phyint_send: tbf_contr rate %d "
			    "vifp 0x%p mp 0x%p dst 0x%x",
			    vifp->v_rate_limit, (void *)vifp, (void *)mp, dst);
		}
		tbf_control(vifp, mp_copy, (ipha_t *)mp_copy->b_rptr);
	}
}

/*
 * Send the whole packet for REGISTER encapsulation to PIM daemon
 * Caller assumes it can continue to use mp on return.
 */
/* ARGSUSED */
static void
register_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
{
	struct igmpmsg	*im;
	mblk_t		*mp_copy;
	ipha_t		*ipha_copy;
	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "register_send: src %x, dst %x\n",
		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst));
	}

	/*
	 * Copy the old packet & pullup its IP header into the new mblk_t so we
	 * can modify it. Try to fill the new mblk_t since if we don't the
	 * ethernet driver will.
	 */
	mp_copy = allocb(sizeof (struct igmpmsg) + sizeof (ipha_t), BPRI_MED);
	if (mp_copy == NULL) {
		++ipst->ips_mrtstat->mrts_pim_nomemory;
		if (ipst->ips_ip_mrtdebug > 3) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "register_send: allocb failure.");
		}
		return;
	}

	/*
	 * Bump write pointer to account for igmpmsg being added.
	 */
	mp_copy->b_wptr = mp_copy->b_rptr + sizeof (struct igmpmsg);

	/*
	 * Chain packet to new mblk_t.
	 */
	if ((mp_copy->b_cont = copymsg(mp)) == NULL) {
		++ipst->ips_mrtstat->mrts_pim_nomemory;
		if (ipst->ips_ip_mrtdebug > 3) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "register_send: copymsg failure.");
		}
		freeb(mp_copy);
		return;
	}

	/*
	 * icmp_input() asserts that IP version field is set to an
	 * appropriate version. Hence, the struct igmpmsg that this really
	 * becomes, needs to have the correct IP version field.
	 */
	ipha_copy = (ipha_t *)mp_copy->b_rptr;
	*ipha_copy = multicast_encap_iphdr;

	/*
	 * The kernel uses the struct igmpmsg header to encode the messages to
	 * the multicast routing daemon. Fill in the fields in the header
	 * starting with the message type which is IGMPMSG_WHOLEPKT
	 */
	im = (struct igmpmsg *)mp_copy->b_rptr;
	im->im_msgtype = IGMPMSG_WHOLEPKT;
	im->im_src.s_addr = ipha->ipha_src;
	im->im_dst.s_addr = ipha->ipha_dst;

	/*
	 * Must Be Zero. This is because the struct igmpmsg is really an IP
	 * header with renamed fields and the multicast routing daemon uses
	 * an ipha_protocol (aka im_mbz) of 0 to distinguish these messages.
	 */
	im->im_mbz = 0;

	++ipst->ips_mrtstat->mrts_upcalls;
	/* Drop (don't block) if the daemon's stream is flow-controlled. */
	if (!canputnext(mrouter->conn_rq)) {
		++ipst->ips_mrtstat->mrts_pim_regsend_drops;
		if (ipst->ips_ip_mrtdebug > 3) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "register_send: register upcall failure.");
		}
		freemsg(mp_copy);
	} else {
		/* Pass to RAWIP */
		(mrouter->conn_recv)(mrouter, mp_copy, NULL);
	}
}

/*
 * pim_validate_cksum handles verification of the checksum in the
 * pim header. For PIM Register packets, the checksum is calculated
 * across the PIM header only. For all other packets, the checksum
 * is for the PIM header and remainder of the packet.
 *
 * returns: B_TRUE, if checksum is okay.
 *          B_FALSE, if checksum is not valid.
 *
 * Works on a duplicate (dupmsg) so the caller's mp is never modified;
 * also returns B_FALSE if the dup allocation fails.
 */
static boolean_t
pim_validate_cksum(mblk_t *mp, ipha_t *ip, struct pim *pimp)
{
	mblk_t *mp_dup;

	if ((mp_dup = dupmsg(mp)) == NULL)
		return (B_FALSE);

	/* Skip the IP header; for REGISTER, checksum the PIM header only. */
	mp_dup->b_rptr += IPH_HDR_LENGTH(ip);
	if (pimp->pim_type == PIM_REGISTER)
		mp_dup->b_wptr = mp_dup->b_rptr + PIM_MINLEN;
	if (IP_CSUM(mp_dup, 0, 0)) {
		freemsg(mp_dup);
		return (B_FALSE);
	}
	freemsg(mp_dup);
	return (B_TRUE);
}

/*
 * int
 * pim_input(queue_t *, mblk_t *, ill_t *ill) - Process PIM protocol packets.
 *	IP Protocol 103. Register messages are decapsulated and sent
 *	onto multicast forwarding.
 *
 *	Returns -1 on error (mp is freed); 0 otherwise. On the 0 return mp is
 *	NOT consumed here -- the caller passes valid PIM packets up to any
 *	raw PIM listeners after this function returns.
 */
int
pim_input(queue_t *q, mblk_t *mp, ill_t *ill)
{
	ipha_t		*eip, *ip;
	int		iplen, pimlen, iphlen;
	struct pim	*pimp;	/* pointer to a pim struct */
	uint32_t	*reghdr;
	ip_stack_t	*ipst = ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	/*
	 * Pullup the msg for PIM protocol processing.
	 */
	if (pullupmsg(mp, -1) == 0) {
		++ipst->ips_mrtstat->mrts_pim_nomemory;
		freemsg(mp);
		return (-1);
	}

	ip = (ipha_t *)mp->b_rptr;
	iplen = ip->ipha_length;	/* still in network byte order */
	iphlen = IPH_HDR_LENGTH(ip);
	pimlen = ntohs(iplen) - iphlen;

	/*
	 * Validate lengths
	 */
	if (pimlen < PIM_MINLEN) {
		++ipst->ips_mrtstat->mrts_pim_malformed;
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "pim_input: length not at least minlen");
		}
		freemsg(mp);
		return (-1);
	}

	/*
	 * Point to the PIM header.
	 */
	pimp = (struct pim *)((caddr_t)ip + iphlen);

	/*
	 * Check the version number.
	 */
	if (pimp->pim_vers != PIM_VERSION) {
		++ipst->ips_mrtstat->mrts_pim_badversion;
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "pim_input: unknown version of PIM");
		}
		freemsg(mp);
		return (-1);
	}

	/*
	 * Validate the checksum
	 */
	if (!pim_validate_cksum(mp, ip, pimp)) {
		++ipst->ips_mrtstat->mrts_pim_rcv_badcsum;
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "pim_input: invalid checksum");
		}
		freemsg(mp);
		return (-1);
	}

	/* Non-REGISTER messages need no kernel processing; pass up as-is. */
	if (pimp->pim_type != PIM_REGISTER)
		return (0);

	/* Register header follows the PIM header, encapsulated IP after it. */
	reghdr = (uint32_t *)(pimp + 1);
	eip = (ipha_t *)(reghdr + 1);

	/*
	 * check if the inner packet is destined to mcast group
	 */
	if (!CLASSD(eip->ipha_dst)) {
		++ipst->ips_mrtstat->mrts_pim_badregisters;
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "pim_input: Inner pkt not mcast .. !");
		}
		freemsg(mp);
		return (-1);
	}
	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "register from %x, to %x, len %d",
		    ntohl(eip->ipha_src),
		    ntohl(eip->ipha_dst),
		    ntohs(eip->ipha_length));
	}
	/*
	 * If the null register bit is not set, decapsulate
	 * the packet before forwarding it.
	 */
	if (!(ntohl(*reghdr) & PIM_NULL_REGISTER)) {
		mblk_t *mp_copy;

		/* Copy the message */
		if ((mp_copy = copymsg(mp)) == NULL) {
			++ipst->ips_mrtstat->mrts_pim_nomemory;
			freemsg(mp);
			return (-1);
		}

		/*
		 * Decapsulate the packet and give it to
		 * register_mforward.
		 */
		mp_copy->b_rptr += iphlen + sizeof (pim_t) +
		    sizeof (*reghdr);
		if (register_mforward(q, mp_copy, ill) != 0) {
			freemsg(mp);
			return (-1);
		}
	}

	/*
	 * Pass all valid PIM packets up to any process(es) listening on a raw
	 * PIM socket. For Solaris it is done right after pim_input() is
	 * called.
	 */
	return (0);
}

/*
 * PIM sparse mode hook. Called by pim_input after decapsulating
 * the packet. Loop back the packet, as if we have received it.
 * In pim_input() we have to check if the destination is a multicast address.
 * Consumes mp (handed to ip_rput); always returns 0.
 */
/* ARGSUSED */
static int
register_mforward(queue_t *q, mblk_t *mp, ill_t *ill)
{
	ip_stack_t	*ipst = ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	ASSERT(ipst->ips_reg_vif_num <= ipst->ips_numvifs);

	if (ipst->ips_ip_mrtdebug > 3) {
		ipha_t *ipha;

		ipha = (ipha_t *)mp->b_rptr;
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "register_mforward: src %x, dst %x\n",
		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst));
	}
	/*
	 * Need to pass in to ip_mforward() the information that the
	 * packet has arrived on the register_vif. We use the solution that
	 * ip_mroute_decap() employs: use mp->b_prev to pass some information
	 * to ip_mforward(). Nonzero value means the packet has arrived on a
	 * tunnel (ip_mroute_decap() puts the address of the other side of the
	 * tunnel there.) This is safe since ip_rput() either frees the packet
	 * or passes it to ip_mforward(). We use
	 * PIM_REGISTER_MARKER = 0xffffffff to indicate the has arrived on the
	 * register vif. If in the future we have more than one register vifs,
	 * then this will need re-examination.
 */
	mp->b_prev = (mblk_t *)PIM_REGISTER_MARKER;
	++ipst->ips_mrtstat->mrts_pim_regforwards;
	ip_rput(q, mp);
	return (0);
}

/*
 * Send an encapsulated packet.
 * Caller assumes can continue to use mp when routine returns.
 * Prepends a fresh IP-in-IP header (multicast_encap_iphdr) addressed from
 * the vif's local address to its tunnel remote, then hands the result to
 * the token bucket filter (or straight out if no rate limit).
 */
/* ARGSUSED */
static void
encap_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
{
	mblk_t		*mp_copy;
	ipha_t		*ipha_copy;
	size_t		len;
	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "encap_send: vif %ld enter",
		    (ptrdiff_t)(vifp - ipst->ips_vifs));
	}
	len = ntohs(ipha->ipha_length);

	/*
	 * Copy the old packet & pullup it's IP header into the
	 * new mbuf so we can modify it. Try to fill the new
	 * mbuf since if we don't the ethernet driver will.
	 */
	/* 32 bytes of headroom for lower layers to prepend link headers. */
	mp_copy = allocb(32 + sizeof (multicast_encap_iphdr), BPRI_MED);
	if (mp_copy == NULL)
		return;
	mp_copy->b_rptr += 32;
	mp_copy->b_wptr = mp_copy->b_rptr + sizeof (multicast_encap_iphdr);
	if ((mp_copy->b_cont = copymsg(mp)) == NULL) {
		freeb(mp_copy);
		return;
	}

	/*
	 * Fill in the encapsulating IP header.
	 * Remote tunnel dst in rmt_addr, from add_vif().
	 */
	ipha_copy = (ipha_t *)mp_copy->b_rptr;
	*ipha_copy = multicast_encap_iphdr;
	ASSERT((len + sizeof (ipha_t)) <= IP_MAXPACKET);
	ipha_copy->ipha_length = htons(len + sizeof (ipha_t));
	ipha_copy->ipha_src = vifp->v_lcl_addr.s_addr;
	ipha_copy->ipha_dst = vifp->v_rmt_addr.s_addr;
	ASSERT(ipha_copy->ipha_ident == 0);

	/* Turn the encapsulated IP header back into a valid one. */
	ipha = (ipha_t *)mp_copy->b_cont->b_rptr;
	ipha->ipha_ttl--;
	ipha->ipha_hdr_checksum = 0;
	ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);

	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "encap_send: group 0x%x", ntohl(ipha->ipha_dst));
	}
	if (vifp->v_rate_limit <= 0)
		tbf_send_packet(vifp, mp_copy);
	else
		/* ipha is from the original header */
		tbf_control(vifp, mp_copy, ipha);
}

/*
 * De-encapsulate a packet and feed it back through IP input.
 * This routine is called whenever IP gets a packet with prototype
 * IPPROTO_ENCAP and a local destination address.
 * Consumes mp on all paths (freed on error, otherwise re-injected
 * via ip_rput()).
 */
void
ip_mroute_decap(queue_t *q, mblk_t *mp, ill_t *ill)
{
	ipha_t		*ipha = (ipha_t *)mp->b_rptr;
	ipha_t		*ipha_encap;
	int		hlen = IPH_HDR_LENGTH(ipha);
	ipaddr_t	src;
	struct vif	*vifp;
	ip_stack_t	*ipst = ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	/*
	 * Dump the packet if it's not to a multicast destination or if
	 * we don't have an encapsulating tunnel with the source.
	 * Note:  This code assumes that the remote site IP address
	 * uniquely identifies the tunnel (i.e., that this site has
	 * at most one tunnel with the remote site).
	 */
	ipha_encap = (ipha_t *)((char *)ipha + hlen);
	if (!CLASSD(ipha_encap->ipha_dst)) {
		ipst->ips_mrtstat->mrts_bad_tunnel++;
		ip1dbg(("ip_mroute_decap: bad tunnel\n"));
		freemsg(mp);
		return;
	}
	src = (ipaddr_t)ipha->ipha_src;
	mutex_enter(&ipst->ips_last_encap_lock);
	/*
	 * One-entry cache keyed by outer source address; on a miss, scan
	 * the viftable for a tunnel vif whose remote address matches.
	 */
	if (src != ipst->ips_last_encap_src) {
		struct vif *vife;

		vifp = ipst->ips_vifs;
		vife = vifp + ipst->ips_numvifs;
		ipst->ips_last_encap_src = src;
		ipst->ips_last_encap_vif = 0;
		for (; vifp < vife; ++vifp) {
			if (!lock_good_vif(vifp))
				continue;
			if (vifp->v_rmt_addr.s_addr == src) {
				if (vifp->v_flags & VIFF_TUNNEL)
					ipst->ips_last_encap_vif = vifp;
				if (ipst->ips_ip_mrtdebug > 1) {
					(void) mi_strlog(mrouter->conn_rq,
					    1, SL_TRACE,
					    "ip_mroute_decap: good tun "
					    "vif %ld with %x",
					    (ptrdiff_t)(vifp -
					    ipst->ips_vifs),
					    ntohl(src));
				}
				unlock_good_vif(vifp);
				break;
			}
			unlock_good_vif(vifp);
		}
	}
	if ((vifp = ipst->ips_last_encap_vif) == 0) {
		mutex_exit(&ipst->ips_last_encap_lock);
		ipst->ips_mrtstat->mrts_bad_tunnel++;
		freemsg(mp);
		/*
		 * NOTE(review): vifp is NULL here, so the vif index printed
		 * below is meaningless (NULL minus the table base); the
		 * message is debug-only and nothing is dereferenced.
		 */
		ip1dbg(("ip_mroute_decap: vif %ld no tunnel with %x\n",
		    (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(src)));
		return;
	}
	mutex_exit(&ipst->ips_last_encap_lock);

	/*
	 * Need to pass in the tunnel source to ip_mforward (so that it can
	 * verify that the packet arrived over the correct vif.)  We use b_prev
	 * to pass this information. This is safe since the ip_rput either
	 * frees the packet or passes it to ip_mforward.
	 */
	mp->b_prev = (mblk_t *)(uintptr_t)src;
	mp->b_rptr += hlen;
	/* Feed back into ip_rput as an M_DATA. */
	ip_rput(q, mp);
}

/*
 * Remove all records with v_ipif == ipif. Called when an interface goes away
 * (stream closed). Called as writer.
2688 */ 2689 void 2690 reset_mrt_vif_ipif(ipif_t *ipif) 2691 { 2692 vifi_t vifi, tmp_vifi; 2693 vifi_t num_of_vifs; 2694 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 2695 2696 /* Can't check vifi >= 0 since vifi_t is unsigned! */ 2697 2698 mutex_enter(&ipst->ips_numvifs_mutex); 2699 num_of_vifs = ipst->ips_numvifs; 2700 mutex_exit(&ipst->ips_numvifs_mutex); 2701 2702 for (vifi = num_of_vifs; vifi != 0; vifi--) { 2703 tmp_vifi = vifi - 1; 2704 if (ipst->ips_vifs[tmp_vifi].v_ipif == ipif) { 2705 (void) del_vif(&tmp_vifi, NULL, NULL, ipst); 2706 } 2707 } 2708 } 2709 2710 /* Remove pending upcall msgs when ill goes away. Called by ill_delete. */ 2711 void 2712 reset_mrt_ill(ill_t *ill) 2713 { 2714 struct mfc *rt; 2715 struct rtdetq *rte; 2716 int i; 2717 ip_stack_t *ipst = ill->ill_ipst; 2718 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2719 2720 for (i = 0; i < MFCTBLSIZ; i++) { 2721 MFCB_REFHOLD(&ipst->ips_mfcs[i]); 2722 if ((rt = ipst->ips_mfcs[i].mfcb_mfc) != NULL) { 2723 if (ipst->ips_ip_mrtdebug > 1) { 2724 (void) mi_strlog(mrouter->conn_rq, 1, 2725 SL_TRACE, 2726 "reset_mrt_ill: mfctable [%d]", i); 2727 } 2728 while (rt != NULL) { 2729 mutex_enter(&rt->mfc_mutex); 2730 while ((rte = rt->mfc_rte) != NULL) { 2731 if (rte->ill == ill) { 2732 if (ipst->ips_ip_mrtdebug > 1) { 2733 (void) mi_strlog( 2734 mrouter->conn_rq, 2735 1, SL_TRACE, 2736 "reset_mrt_ill: " 2737 "ill 0x%p", ill); 2738 } 2739 rt->mfc_rte = rte->rte_next; 2740 freemsg(rte->mp); 2741 mi_free((char *)rte); 2742 } 2743 } 2744 mutex_exit(&rt->mfc_mutex); 2745 rt = rt->mfc_next; 2746 } 2747 } 2748 MFCB_REFRELE(&ipst->ips_mfcs[i]); 2749 } 2750 } 2751 2752 /* 2753 * Token bucket filter module. 2754 * The ipha is for mcastgrp destination for phyint and encap. 
 */
/*
 * Shape outbound traffic for a rate-limited vif.  Either sends mp
 * immediately (enough tokens, empty queue), queues it, or -- when the
 * queue is full -- selectively drops by priority.  Consumes mp on all
 * paths.  Drops tbf_lock before any actual send.
 */
static void
tbf_control(struct vif *vifp, mblk_t *mp, ipha_t *ipha)
{
	size_t		p_len = msgdsize(mp);
	struct tbf	*t = vifp->v_tbf;
	timeout_id_t	id = 0;
	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	/* Drop if packet is too large */
	if (p_len > MAX_BKT_SIZE) {
		ipst->ips_mrtstat->mrts_pkt2large++;
		freemsg(mp);
		return;
	}
	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "tbf_ctrl: SEND vif %ld, qlen %d, ipha_dst 0x%x",
		    (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_q_len,
		    ntohl(ipha->ipha_dst));
	}

	mutex_enter(&t->tbf_lock);

	tbf_update_tokens(vifp);

	/*
	 * If there are enough tokens,
	 * and the queue is empty, send this packet out.
	 */
	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "tbf_control: vif %ld, TOKENS %d, pkt len %lu, qlen %d",
		    (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_n_tok, p_len,
		    t->tbf_q_len);
	}
	/* No packets are queued */
	if (t->tbf_q_len == 0) {
		/* queue empty, send packet if enough tokens */
		if (p_len <= t->tbf_n_tok) {
			t->tbf_n_tok -= p_len;
			mutex_exit(&t->tbf_lock);
			tbf_send_packet(vifp, mp);
			return;
		} else {
			/* Queue packet and timeout till later */
			tbf_queue(vifp, mp);
			ASSERT(vifp->v_timeout_id == 0);
			vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp,
			    TBF_REPROCESS);
		}
	} else if (t->tbf_q_len < t->tbf_max_q_len) {
		/* Finite queue length, so queue pkts and process queue */
		tbf_queue(vifp, mp);
		tbf_process_q(vifp);
	} else {
		/* Check that we have UDP header with IP header */
		size_t hdr_length = IPH_HDR_LENGTH(ipha) +
		    sizeof (struct udphdr);

		if ((mp->b_wptr - mp->b_rptr) < hdr_length) {
			if (!pullupmsg(mp, hdr_length)) {
				freemsg(mp);
				ip1dbg(("tbf_ctl: couldn't pullup udp hdr, "
				    "vif %ld src 0x%x dst 0x%x\n",
				    (ptrdiff_t)(vifp - ipst->ips_vifs),
				    ntohl(ipha->ipha_src),
				    ntohl(ipha->ipha_dst)));
				mutex_exit(&vifp->v_tbf->tbf_lock);
				return;
			} else
				/* Have to reassign ipha after pullupmsg */
				ipha = (ipha_t *)mp->b_rptr;
		}
		/*
		 * Queue length too much,
		 * try to selectively dq, or queue and process
		 */
		if (!tbf_dq_sel(vifp, ipha)) {
			ipst->ips_mrtstat->mrts_q_overflow++;
			freemsg(mp);
		} else {
			tbf_queue(vifp, mp);
			tbf_process_q(vifp);
		}
	}
	/* Queue drained: cancel the reprocess timeout outside the lock. */
	if (t->tbf_q_len == 0) {
		id = vifp->v_timeout_id;
		vifp->v_timeout_id = 0;
	}
	mutex_exit(&vifp->v_tbf->tbf_lock);
	if (id != 0)
		(void) untimeout(id);
}

/*
 * Adds a packet to the tbf queue at the interface.
 * The ipha is for mcastgrp destination for phyint and encap.
 * Caller must hold tbf_lock; mp is appended at the tail.
 */
static void
tbf_queue(struct vif *vifp, mblk_t *mp)
{
	struct tbf	*t = vifp->v_tbf;
	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "tbf_queue: vif %ld", (ptrdiff_t)(vifp - ipst->ips_vifs));
	}
	ASSERT(MUTEX_HELD(&t->tbf_lock));

	if (t->tbf_t == NULL) {
		/* Queue was empty */
		t->tbf_q = mp;
	} else {
		/* Insert at tail */
		t->tbf_t->b_next = mp;
	}
	/* set new tail pointer */
	t->tbf_t = mp;

	mp->b_next = mp->b_prev = NULL;

	t->tbf_q_len++;
}

/*
 * Process the queue at the vif interface.
 * Drops the tbf_lock when sending packets.
 *
 * NOTE : The caller should quntimeout if the queue length is 0.
 */
static void
tbf_process_q(struct vif *vifp)
{
	mblk_t		*mp;
	struct tbf	*t = vifp->v_tbf;
	size_t		len;
	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "tbf_process_q 1: vif %ld qlen = %d",
		    (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_q_len);
	}

	/*
	 * Loop through the queue at the interface and send
	 * as many packets as possible.
	 */
	ASSERT(MUTEX_HELD(&t->tbf_lock));

	while (t->tbf_q_len > 0) {
		mp = t->tbf_q;
		len = (size_t)msgdsize(mp);	/* length of ip pkt */

		/* Determine if the packet can be sent */
		if (len <= t->tbf_n_tok) {
			/*
			 * If so, reduce no. of tokens, dequeue the packet,
			 * send the packet.
			 */
			t->tbf_n_tok -= len;

			t->tbf_q = mp->b_next;
			if (--t->tbf_q_len == 0) {
				t->tbf_t = NULL;
			}
			mp->b_next = NULL;
			/* Exit mutex before sending packet, then re-enter */
			mutex_exit(&t->tbf_lock);
			tbf_send_packet(vifp, mp);
			mutex_enter(&t->tbf_lock);
		} else
			break;
	}
}

/* Called at tbf timeout to update tokens, process q and reset timer.
 */
static void
tbf_reprocess_q(void *arg)
{
	struct vif	*vifp = arg;
	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	mutex_enter(&vifp->v_tbf->tbf_lock);
	vifp->v_timeout_id = 0;
	tbf_update_tokens(vifp);

	tbf_process_q(vifp);

	/* Still backlogged: rearm ourselves for another TBF_REPROCESS. */
	if (vifp->v_tbf->tbf_q_len > 0) {
		vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp,
		    TBF_REPROCESS);
	}
	mutex_exit(&vifp->v_tbf->tbf_lock);

	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "tbf_reprcess_q: vif %ld timeout id = %p",
		    (ptrdiff_t)(vifp - ipst->ips_vifs), vifp->v_timeout_id);
	}
}

/*
 * Function that will selectively discard a member of the tbf queue,
 * based on the precedence value and the priority.
 * Drops at most one queued packet of strictly lower priority than the
 * incoming one; returns 1 if something was dropped, 0 otherwise.
 *
 * NOTE : The caller should quntimeout if the queue length is 0.
 */
static int
tbf_dq_sel(struct vif *vifp, ipha_t *ipha)
{
	uint_t		p;
	struct tbf	*t = vifp->v_tbf;
	mblk_t		**np;
	mblk_t		*last, *mp;
	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "dq_sel: vif %ld dst 0x%x",
		    (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(ipha->ipha_dst));
	}

	ASSERT(MUTEX_HELD(&t->tbf_lock));
	p = priority(vifp, ipha);

	np = &t->tbf_q;
	last = NULL;
	while ((mp = *np) != NULL) {
		if (p > (priority(vifp, (ipha_t *)mp->b_rptr))) {
			*np = mp->b_next;
			/* If removing the last packet, fix the tail pointer */
			if (mp == t->tbf_t)
				t->tbf_t = last;
			mp->b_prev = mp->b_next = NULL;
			freemsg(mp);
			/*
			 * It's impossible for the queue to be empty, but
			 * we check anyway.
			 */
			if (--t->tbf_q_len == 0) {
				t->tbf_t = NULL;
			}
			ipst->ips_mrtstat->mrts_drop_sel++;
			return (1);
		}
		np = &mp->b_next;
		last = mp;
	}
	return (0);
}

/*
 * Sends packet, 2 cases - encap tunnel, phyint.
 * Consumes mp: tunnels go out the ipif write queue (ip_wput fills in the
 * ident and header checksum); phyints are looped back to local group
 * members when present, then forwarded.
 */
static void
tbf_send_packet(struct vif *vifp, mblk_t *mp)
{
	ipif_t		*ipif;
	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	/* If encap tunnel options */
	if (vifp->v_flags & VIFF_TUNNEL) {
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "tbf_send_pkt: ENCAP tunnel vif %ld",
			    (ptrdiff_t)(vifp - ipst->ips_vifs));
		}

		/*
		 * Feed into ip_wput which will set the ident field and
		 * checksum the encapsulating header.
		 * BSD gets the cached route vifp->v_route from ip_output()
		 * to speed up route table lookups. Not necessary in SunOS 5.x.
		 */
		put(vifp->v_ipif->ipif_wq, mp);
		return;

		/* phyint */
	} else {
		/* Need to loop back to members on the outgoing interface. */
		ipha_t	*ipha;
		ipaddr_t dst;
		ipha = (ipha_t *)mp->b_rptr;
		dst = ipha->ipha_dst;
		ipif = vifp->v_ipif;

		mutex_enter(&ipif->ipif_ill->ill_lock);
		if (ilm_lookup_ipif(ipif, dst) != NULL) {
			/*
			 * The packet is not yet reassembled, thus we need to
			 * pass it to ip_rput_local for checksum verification
			 * and reassembly (and fanout the user stream).
			 */
			mblk_t	*mp_loop;
			ire_t	*ire;

			mutex_exit(&ipif->ipif_ill->ill_lock);
			if (ipst->ips_ip_mrtdebug > 1) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "tbf_send_pkt: loopback vif %ld",
				    (ptrdiff_t)(vifp - ipst->ips_vifs));
			}
			mp_loop = copymsg(mp);
			ire = ire_ctable_lookup(~0, 0, IRE_BROADCAST, NULL,
			    ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);

			if (mp_loop != NULL && ire != NULL) {
				IP_RPUT_LOCAL(ipif->ipif_rq, mp_loop,
				    ((ipha_t *)mp_loop->b_rptr),
				    ire, (ill_t *)ipif->ipif_rq->q_ptr);
			} else {
				/* Either copymsg failed or no ire */
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "tbf_send_pkt: mp_loop 0x%p, ire 0x%p "
				    "vif %ld\n", mp_loop, ire,
				    (ptrdiff_t)(vifp - ipst->ips_vifs));
			}
			if (ire != NULL)
				ire_refrele(ire);
		} else {
			mutex_exit(&ipif->ipif_ill->ill_lock);
		}
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "tbf_send_pkt: phyint forward vif %ld dst = 0x%x",
			    (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(dst));
		}
		ip_rput_forward_multicast(dst, mp, ipif);
	}
}

/*
 * Determine the current time and then the elapsed time (between the last time
 * and time now).  Update the no. of tokens in the bucket.
 * Caller must hold tbf_lock.
 */
static void
tbf_update_tokens(struct vif *vifp)
{
	timespec_t	tp;
	hrtime_t	tm;
	struct tbf	*t = vifp->v_tbf;
	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	ASSERT(MUTEX_HELD(&t->tbf_lock));

	/* Time in secs and nsecs, rate limit in kbits/sec */
	gethrestime(&tp);

	/*LINTED*/
	TV_DELTA(tp, t->tbf_last_pkt_t, tm);

	/*
	 * This formula is actually
	 * "time in seconds" * "bytes/second". Scaled for nsec.
3119 * (tm/1000000000) * (v_rate_limit * 1000 * (1000/1024) /8) 3120 * 3121 * The (1000/1024) was introduced in add_vif to optimize 3122 * this divide into a shift. 3123 */ 3124 t->tbf_n_tok += (tm/1000) * vifp->v_rate_limit / 1024 / 8; 3125 t->tbf_last_pkt_t = tp; 3126 3127 if (t->tbf_n_tok > MAX_BKT_SIZE) 3128 t->tbf_n_tok = MAX_BKT_SIZE; 3129 if (ipst->ips_ip_mrtdebug > 1) { 3130 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 3131 "tbf_update_tok: tm %lld tok %d vif %ld", 3132 tm, t->tbf_n_tok, (ptrdiff_t)(vifp - ipst->ips_vifs)); 3133 } 3134 } 3135 3136 /* 3137 * Priority currently is based on port nos. 3138 * Different forwarding mechanisms have different ways 3139 * of obtaining the port no. Hence, the vif must be 3140 * given along with the packet itself. 3141 * 3142 */ 3143 static int 3144 priority(struct vif *vifp, ipha_t *ipha) 3145 { 3146 int prio; 3147 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 3148 conn_t *mrouter = ipst->ips_ip_g_mrouter; 3149 3150 /* Temporary hack; may add general packet classifier some day */ 3151 3152 ASSERT(MUTEX_HELD(&vifp->v_tbf->tbf_lock)); 3153 3154 /* 3155 * The UDP port space is divided up into four priority ranges: 3156 * [0, 16384) : unclassified - lowest priority 3157 * [16384, 32768) : audio - highest priority 3158 * [32768, 49152) : whiteboard - medium priority 3159 * [49152, 65536) : video - low priority 3160 */ 3161 3162 if (ipha->ipha_protocol == IPPROTO_UDP) { 3163 struct udphdr *udp = 3164 (struct udphdr *)((char *)ipha + IPH_HDR_LENGTH(ipha)); 3165 switch (ntohs(udp->uh_dport) & 0xc000) { 3166 case 0x4000: 3167 prio = 70; 3168 break; 3169 case 0x8000: 3170 prio = 60; 3171 break; 3172 case 0xc000: 3173 prio = 55; 3174 break; 3175 default: 3176 prio = 50; 3177 break; 3178 } 3179 if (ipst->ips_ip_mrtdebug > 1) { 3180 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 3181 "priority: port %x prio %d\n", 3182 ntohs(udp->uh_dport), prio); 3183 } 3184 } else 3185 prio = 50; /* default priority */ 3186 return 
(prio); 3187 } 3188 3189 /* 3190 * End of token bucket filter modifications 3191 */ 3192 3193 3194 3195 /* 3196 * Produces data for netstat -M. 3197 */ 3198 int 3199 ip_mroute_stats(mblk_t *mp, ip_stack_t *ipst) 3200 { 3201 ipst->ips_mrtstat->mrts_vifctlSize = sizeof (struct vifctl); 3202 ipst->ips_mrtstat->mrts_mfcctlSize = sizeof (struct mfcctl); 3203 if (!snmp_append_data(mp, (char *)ipst->ips_mrtstat, 3204 sizeof (struct mrtstat))) { 3205 ip0dbg(("ip_mroute_stats: failed %ld bytes\n", 3206 (size_t)sizeof (struct mrtstat))); 3207 return (0); 3208 } 3209 return (1); 3210 } 3211 3212 /* 3213 * Sends info for SNMP's MIB. 3214 */ 3215 int 3216 ip_mroute_vif(mblk_t *mp, ip_stack_t *ipst) 3217 { 3218 struct vifctl vi; 3219 vifi_t vifi; 3220 3221 mutex_enter(&ipst->ips_numvifs_mutex); 3222 for (vifi = 0; vifi < ipst->ips_numvifs; vifi++) { 3223 if (ipst->ips_vifs[vifi].v_lcl_addr.s_addr == 0) 3224 continue; 3225 /* 3226 * No locks here, an approximation is fine. 3227 */ 3228 vi.vifc_vifi = vifi; 3229 vi.vifc_flags = ipst->ips_vifs[vifi].v_flags; 3230 vi.vifc_threshold = ipst->ips_vifs[vifi].v_threshold; 3231 vi.vifc_rate_limit = ipst->ips_vifs[vifi].v_rate_limit; 3232 vi.vifc_lcl_addr = ipst->ips_vifs[vifi].v_lcl_addr; 3233 vi.vifc_rmt_addr = ipst->ips_vifs[vifi].v_rmt_addr; 3234 vi.vifc_pkt_in = ipst->ips_vifs[vifi].v_pkt_in; 3235 vi.vifc_pkt_out = ipst->ips_vifs[vifi].v_pkt_out; 3236 3237 if (!snmp_append_data(mp, (char *)&vi, sizeof (vi))) { 3238 ip0dbg(("ip_mroute_vif: failed %ld bytes\n", 3239 (size_t)sizeof (vi))); 3240 return (0); 3241 } 3242 } 3243 mutex_exit(&ipst->ips_numvifs_mutex); 3244 return (1); 3245 } 3246 3247 /* 3248 * Called by ip_snmp_get to send up multicast routing table. 3249 */ 3250 int 3251 ip_mroute_mrt(mblk_t *mp, ip_stack_t *ipst) 3252 { 3253 int i, j; 3254 struct mfc *rt; 3255 struct mfcctl mfcc; 3256 3257 /* 3258 * Make sure multicast has not been turned off. 
3259 */ 3260 if (is_mrouter_off(ipst)) 3261 return (1); 3262 3263 /* Loop over all hash buckets and their chains */ 3264 for (i = 0; i < MFCTBLSIZ; i++) { 3265 MFCB_REFHOLD(&ipst->ips_mfcs[i]); 3266 for (rt = ipst->ips_mfcs[i].mfcb_mfc; rt; rt = rt->mfc_next) { 3267 mutex_enter(&rt->mfc_mutex); 3268 if (rt->mfc_rte != NULL || 3269 (rt->mfc_marks & MFCB_MARK_CONDEMNED)) { 3270 mutex_exit(&rt->mfc_mutex); 3271 continue; 3272 } 3273 mfcc.mfcc_origin = rt->mfc_origin; 3274 mfcc.mfcc_mcastgrp = rt->mfc_mcastgrp; 3275 mfcc.mfcc_parent = rt->mfc_parent; 3276 mfcc.mfcc_pkt_cnt = rt->mfc_pkt_cnt; 3277 mutex_enter(&ipst->ips_numvifs_mutex); 3278 for (j = 0; j < (int)ipst->ips_numvifs; j++) 3279 mfcc.mfcc_ttls[j] = rt->mfc_ttls[j]; 3280 for (j = (int)ipst->ips_numvifs; j < MAXVIFS; j++) 3281 mfcc.mfcc_ttls[j] = 0; 3282 mutex_exit(&ipst->ips_numvifs_mutex); 3283 3284 mutex_exit(&rt->mfc_mutex); 3285 if (!snmp_append_data(mp, (char *)&mfcc, 3286 sizeof (mfcc))) { 3287 MFCB_REFRELE(&ipst->ips_mfcs[i]); 3288 ip0dbg(("ip_mroute_mrt: failed %ld bytes\n", 3289 (size_t)sizeof (mfcc))); 3290 return (0); 3291 } 3292 } 3293 MFCB_REFRELE(&ipst->ips_mfcs[i]); 3294 } 3295 return (1); 3296 } 3297