1 /* 2 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 3 * Use is subject to license terms. 4 */ 5 /* 6 * CDDL HEADER START 7 * 8 * The contents of this file are subject to the terms of the 9 * Common Development and Distribution License (the "License"). 10 * You may not use this file except in compliance with the License. 11 * 12 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 13 * or http://www.opensolaris.org/os/licensing. 14 * See the License for the specific language governing permissions 15 * and limitations under the License. 16 * 17 * When distributing Covered Code, include this CDDL HEADER in each 18 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 19 * If applicable, add the following below this CDDL HEADER, with the 20 * fields enclosed by brackets "[]" replaced with your own identifying 21 * information: Portions Copyright [yyyy] [name of copyright owner] 22 * 23 * CDDL HEADER END 24 */ 25 /* 26 * Copyright 2008 Sun Microsystems, Inc. 27 * All rights reserved. Use is subject to license terms. 28 */ 29 /* Copyright (c) 1990 Mentat Inc. */ 30 31 /* 32 * Procedures for the kernel part of DVMRP, 33 * a Distance-Vector Multicast Routing Protocol. 34 * (See RFC-1075) 35 * Written by David Waitzman, BBN Labs, August 1988. 36 * Modified by Steve Deering, Stanford, February 1989. 37 * Modified by Mark J. 
Steiglitz, Stanford, May, 1991 38 * Modified by Van Jacobson, LBL, January 1993 39 * Modified by Ajit Thyagarajan, PARC, August 1993 40 * Modified by Bill Fenner, PARC, April 1995 41 * 42 * MROUTING 3.5 43 */ 44 45 /* 46 * TODO 47 * - function pointer field in vif, void *vif_sendit() 48 */ 49 50 #include <sys/types.h> 51 #include <sys/stream.h> 52 #include <sys/stropts.h> 53 #include <sys/strlog.h> 54 #include <sys/systm.h> 55 #include <sys/ddi.h> 56 #include <sys/cmn_err.h> 57 #include <sys/zone.h> 58 59 #include <sys/param.h> 60 #include <sys/socket.h> 61 #include <sys/vtrace.h> 62 #include <sys/debug.h> 63 #include <net/if.h> 64 #include <sys/sockio.h> 65 #include <netinet/in.h> 66 #include <net/if_dl.h> 67 68 #include <inet/common.h> 69 #include <inet/mi.h> 70 #include <inet/nd.h> 71 #include <inet/mib2.h> 72 #include <netinet/ip6.h> 73 #include <inet/ip.h> 74 #include <inet/snmpcom.h> 75 76 #include <netinet/igmp.h> 77 #include <netinet/igmp_var.h> 78 #include <netinet/udp.h> 79 #include <netinet/ip_mroute.h> 80 #include <inet/ip_multi.h> 81 #include <inet/ip_ire.h> 82 #include <inet/ip_if.h> 83 #include <inet/ipclassifier.h> 84 85 #include <netinet/pim.h> 86 87 88 /* 89 * MT Design: 90 * 91 * There are three main data structures viftable, mfctable and tbftable that 92 * need to be protected against MT races. 93 * 94 * vitable is a fixed length array of vif structs. There is no lock to protect 95 * the whole array, instead each struct is protected by its own indiviual lock. 96 * The value of v_marks in conjuction with the value of v_refcnt determines the 97 * current state of a vif structure. One special state that needs mention 98 * is when the vif is marked VIF_MARK_NOTINUSE but refcnt != 0. This indicates 99 * that vif is being initalized. 100 * Each structure is freed when the refcnt goes down to zero. If a delete comes 101 * in when the the recfnt is > 1, the vif structure is marked VIF_MARK_CONDEMNED 102 * which prevents the struct from further use. 
When the refcnt goes to zero 103 * the struct is freed and is marked VIF_MARK_NOTINUSE. 104 * vif struct stores a pointer to the ipif in v_ipif, to prevent ipif/ill 105 * from going away a refhold is put on the ipif before using it. see 106 * lock_good_vif() and unlock_good_vif(). 107 * 108 * VIF_REFHOLD and VIF_REFRELE macros have been provided to manipulate refcnts 109 * of the vif struct. 110 * 111 * tbftable is also a fixed length array of tbf structs and is only accessed 112 * via v_tbf. It is protected by its own lock tbf_lock. 113 * 114 * Lock Ordering is 115 * v_lock --> tbf_lock 116 * v_lock --> ill_locK 117 * 118 * mfctable is a fixed size hash table of mfc buckets strcuts (struct mfcb). 119 * Each mfc bucket struct (struct mfcb) maintains a refcnt for each walker, 120 * it also maintains a state. These fields are protected by a lock (mfcb_lock). 121 * mfc structs only maintain a state and have no refcnt. mfc_mutex is used to 122 * protect the struct elements. 123 * 124 * mfc structs are dynamically allocated and are singly linked 125 * at the head of the chain. When an mfc structure is to be deleted 126 * it is marked condemned and so is the state in the bucket struct. 127 * When the last walker of the hash bucket exits all the mfc structs 128 * marked condemed are freed. 129 * 130 * Locking Hierarchy: 131 * The bucket lock should be acquired before the mfc struct lock. 132 * MFCB_REFHOLD and MFCB_REFRELE macros are provided for locking 133 * operations on the bucket struct. 134 * 135 * last_encap_lock and numvifs_mutex should be acquired after 136 * acquring vif or mfc locks. These locks protect some global variables. 137 * 138 * The statistics are not currently protected by a lock 139 * causing the stats be be approximate, not exact. 
140 */ 141 142 #define NO_VIF MAXVIFS /* from mrouted, no route for src */ 143 144 /* 145 * Timeouts: 146 * Upcall timeouts - BSD uses boolean_t mfc->expire and 147 * nexpire[MFCTBLSIZE], the number of times expire has been called. 148 * SunOS 5.x uses mfc->timeout for each mfc. 149 * Some Unixes are limited in the number of simultaneous timeouts 150 * that can be run, SunOS 5.x does not have this restriction. 151 */ 152 153 /* 154 * In BSD, EXPIRE_TIMEOUT is how often expire_upcalls() is called and 155 * UPCALL_EXPIRE is the nmber of timeouts before a particular upcall 156 * expires. Thus the time till expiration is EXPIRE_TIMEOUT * UPCALL_EXPIRE 157 */ 158 #define EXPIRE_TIMEOUT (hz/4) /* 4x / second */ 159 #define UPCALL_EXPIRE 6 /* number of timeouts */ 160 161 /* 162 * Hash function for a source, group entry 163 */ 164 #define MFCHASH(a, g) MFCHASHMOD(((a) >> 20) ^ ((a) >> 10) ^ (a) ^ \ 165 ((g) >> 20) ^ ((g) >> 10) ^ (g)) 166 167 #define TBF_REPROCESS (hz / 100) /* 100x /second */ 168 169 /* Identify PIM packet that came on a Register interface */ 170 #define PIM_REGISTER_MARKER 0xffffffff 171 172 /* Function declarations */ 173 static int add_mfc(struct mfcctl *, ip_stack_t *); 174 static int add_vif(struct vifctl *, conn_t *, mblk_t *, ip_stack_t *); 175 static int del_mfc(struct mfcctl *, ip_stack_t *); 176 static int del_vif(vifi_t *, conn_t *, mblk_t *, ip_stack_t *); 177 static void del_vifp(struct vif *); 178 static void encap_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t); 179 static void expire_upcalls(void *); 180 static void fill_route(struct mfc *, struct mfcctl *, ip_stack_t *); 181 static void free_queue(struct mfc *); 182 static int get_assert(uchar_t *, ip_stack_t *); 183 static int get_lsg_cnt(struct sioc_lsg_req *, ip_stack_t *); 184 static int get_sg_cnt(struct sioc_sg_req *, ip_stack_t *); 185 static int get_version(uchar_t *); 186 static int get_vif_cnt(struct sioc_vif_req *, ip_stack_t *); 187 static int ip_mdq(mblk_t *, ipha_t *, 
ill_t *, 188 ipaddr_t, struct mfc *); 189 static int ip_mrouter_init(conn_t *, uchar_t *, int, ip_stack_t *); 190 static void phyint_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t); 191 static int register_mforward(queue_t *, mblk_t *, ill_t *); 192 static void register_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t); 193 static int set_assert(int *, ip_stack_t *); 194 195 /* 196 * Token Bucket Filter functions 197 */ 198 static int priority(struct vif *, ipha_t *); 199 static void tbf_control(struct vif *, mblk_t *, ipha_t *); 200 static int tbf_dq_sel(struct vif *, ipha_t *); 201 static void tbf_process_q(struct vif *); 202 static void tbf_queue(struct vif *, mblk_t *); 203 static void tbf_reprocess_q(void *); 204 static void tbf_send_packet(struct vif *, mblk_t *); 205 static void tbf_update_tokens(struct vif *); 206 static void release_mfc(struct mfcb *); 207 208 static boolean_t is_mrouter_off(ip_stack_t *); 209 /* 210 * Encapsulation packets 211 */ 212 213 #define ENCAP_TTL 64 214 215 /* prototype IP hdr for encapsulated packets */ 216 static ipha_t multicast_encap_iphdr = { 217 IP_SIMPLE_HDR_VERSION, 218 0, /* tos */ 219 sizeof (ipha_t), /* total length */ 220 0, /* id */ 221 0, /* frag offset */ 222 ENCAP_TTL, IPPROTO_ENCAP, 223 0, /* checksum */ 224 }; 225 226 /* 227 * Rate limit for assert notification messages, in nsec. 
228 */ 229 #define ASSERT_MSG_TIME 3000000000 230 231 232 #define VIF_REFHOLD(vifp) { \ 233 mutex_enter(&(vifp)->v_lock); \ 234 (vifp)->v_refcnt++; \ 235 mutex_exit(&(vifp)->v_lock); \ 236 } 237 238 #define VIF_REFRELE_LOCKED(vifp) { \ 239 (vifp)->v_refcnt--; \ 240 if ((vifp)->v_refcnt == 0 && \ 241 ((vifp)->v_marks & VIF_MARK_CONDEMNED)) { \ 242 del_vifp(vifp); \ 243 } else { \ 244 mutex_exit(&(vifp)->v_lock); \ 245 } \ 246 } 247 248 #define VIF_REFRELE(vifp) { \ 249 mutex_enter(&(vifp)->v_lock); \ 250 (vifp)->v_refcnt--; \ 251 if ((vifp)->v_refcnt == 0 && \ 252 ((vifp)->v_marks & VIF_MARK_CONDEMNED)) { \ 253 del_vifp(vifp); \ 254 } else { \ 255 mutex_exit(&(vifp)->v_lock); \ 256 } \ 257 } 258 259 #define MFCB_REFHOLD(mfcb) { \ 260 mutex_enter(&(mfcb)->mfcb_lock); \ 261 (mfcb)->mfcb_refcnt++; \ 262 ASSERT((mfcb)->mfcb_refcnt != 0); \ 263 mutex_exit(&(mfcb)->mfcb_lock); \ 264 } 265 266 #define MFCB_REFRELE(mfcb) { \ 267 mutex_enter(&(mfcb)->mfcb_lock); \ 268 ASSERT((mfcb)->mfcb_refcnt != 0); \ 269 if (--(mfcb)->mfcb_refcnt == 0 && \ 270 ((mfcb)->mfcb_marks & MFCB_MARK_CONDEMNED)) { \ 271 release_mfc(mfcb); \ 272 } \ 273 mutex_exit(&(mfcb)->mfcb_lock); \ 274 } 275 276 /* 277 * MFCFIND: 278 * Find a route for a given origin IP address and multicast group address. 279 * Skip entries with pending upcalls. 280 * Type of service parameter to be added in the future! 281 */ 282 #define MFCFIND(mfcbp, o, g, rt) { \ 283 struct mfc *_mb_rt = NULL; \ 284 rt = NULL; \ 285 _mb_rt = mfcbp->mfcb_mfc; \ 286 while (_mb_rt) { \ 287 if ((_mb_rt->mfc_origin.s_addr == o) && \ 288 (_mb_rt->mfc_mcastgrp.s_addr == g) && \ 289 (_mb_rt->mfc_rte == NULL) && \ 290 (!(_mb_rt->mfc_marks & MFCB_MARK_CONDEMNED))) { \ 291 rt = _mb_rt; \ 292 break; \ 293 } \ 294 _mb_rt = _mb_rt->mfc_next; \ 295 } \ 296 } 297 298 /* 299 * BSD uses timeval with sec and usec. In SunOS 5.x uniqtime() and gethrtime() 300 * are inefficient. 
We use gethrestime() which returns a timespec_t with 301 * sec and nsec, the resolution is machine dependent. 302 * The following 2 macros have been changed to use nsec instead of usec. 303 */ 304 /* 305 * Macros to compute elapsed time efficiently. 306 * Borrowed from Van Jacobson's scheduling code. 307 * Delta should be a hrtime_t. 308 */ 309 #define TV_DELTA(a, b, delta) { \ 310 int xxs; \ 311 \ 312 delta = (a).tv_nsec - (b).tv_nsec; \ 313 if ((xxs = (a).tv_sec - (b).tv_sec) != 0) { \ 314 switch (xxs) { \ 315 case 2: \ 316 delta += 1000000000; \ 317 /*FALLTHROUGH*/ \ 318 case 1: \ 319 delta += 1000000000; \ 320 break; \ 321 default: \ 322 delta += (1000000000 * xxs); \ 323 } \ 324 } \ 325 } 326 327 #define TV_LT(a, b) (((a).tv_nsec < (b).tv_nsec && \ 328 (a).tv_sec <= (b).tv_sec) || (a).tv_sec < (b).tv_sec) 329 330 /* 331 * Handle MRT setsockopt commands to modify the multicast routing tables. 332 */ 333 int 334 ip_mrouter_set(int cmd, queue_t *q, int checkonly, uchar_t *data, 335 int datalen, mblk_t *first_mp) 336 { 337 conn_t *connp = Q_TO_CONN(q); 338 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 339 340 mutex_enter(&ipst->ips_ip_g_mrouter_mutex); 341 if (cmd != MRT_INIT && connp != ipst->ips_ip_g_mrouter) { 342 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 343 return (EACCES); 344 } 345 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 346 347 if (checkonly) { 348 /* 349 * do not do operation, just pretend to - new T_CHECK 350 * Note: Even routines further on can probably fail but 351 * this T_CHECK stuff is only to please XTI so it not 352 * necessary to be perfect. 353 */ 354 switch (cmd) { 355 case MRT_INIT: 356 case MRT_DONE: 357 case MRT_ADD_VIF: 358 case MRT_DEL_VIF: 359 case MRT_ADD_MFC: 360 case MRT_DEL_MFC: 361 case MRT_ASSERT: 362 return (0); 363 default: 364 return (EOPNOTSUPP); 365 } 366 } 367 368 /* 369 * make sure no command is issued after multicast routing has been 370 * turned off. 
371 */ 372 if (cmd != MRT_INIT && cmd != MRT_DONE) { 373 if (is_mrouter_off(ipst)) 374 return (EINVAL); 375 } 376 377 switch (cmd) { 378 case MRT_INIT: return (ip_mrouter_init(connp, data, datalen, ipst)); 379 case MRT_DONE: return (ip_mrouter_done(first_mp, ipst)); 380 case MRT_ADD_VIF: return (add_vif((struct vifctl *)data, connp, 381 first_mp, ipst)); 382 case MRT_DEL_VIF: return (del_vif((vifi_t *)data, connp, first_mp, 383 ipst)); 384 case MRT_ADD_MFC: return (add_mfc((struct mfcctl *)data, ipst)); 385 case MRT_DEL_MFC: return (del_mfc((struct mfcctl *)data, ipst)); 386 case MRT_ASSERT: return (set_assert((int *)data, ipst)); 387 default: return (EOPNOTSUPP); 388 } 389 } 390 391 /* 392 * Handle MRT getsockopt commands 393 */ 394 int 395 ip_mrouter_get(int cmd, queue_t *q, uchar_t *data) 396 { 397 conn_t *connp = Q_TO_CONN(q); 398 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 399 400 if (connp != ipst->ips_ip_g_mrouter) 401 return (EACCES); 402 403 switch (cmd) { 404 case MRT_VERSION: return (get_version((uchar_t *)data)); 405 case MRT_ASSERT: return (get_assert((uchar_t *)data, ipst)); 406 default: return (EOPNOTSUPP); 407 } 408 } 409 410 /* 411 * Handle ioctl commands to obtain information from the cache. 412 * Called with shared access to IP. These are read_only ioctls. 
413 */ 414 /* ARGSUSED */ 415 int 416 mrt_ioctl(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 417 ip_ioctl_cmd_t *ipip, void *if_req) 418 { 419 mblk_t *mp1; 420 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 421 conn_t *connp = Q_TO_CONN(q); 422 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 423 424 /* Existence verified in ip_wput_nondata */ 425 mp1 = mp->b_cont->b_cont; 426 427 switch (iocp->ioc_cmd) { 428 case (SIOCGETVIFCNT): 429 return (get_vif_cnt((struct sioc_vif_req *)mp1->b_rptr, ipst)); 430 case (SIOCGETSGCNT): 431 return (get_sg_cnt((struct sioc_sg_req *)mp1->b_rptr, ipst)); 432 case (SIOCGETLSGCNT): 433 return (get_lsg_cnt((struct sioc_lsg_req *)mp1->b_rptr, ipst)); 434 default: 435 return (EINVAL); 436 } 437 } 438 439 /* 440 * Returns the packet, byte, rpf-failure count for the source, group provided. 441 */ 442 static int 443 get_sg_cnt(struct sioc_sg_req *req, ip_stack_t *ipst) 444 { 445 struct mfc *rt; 446 struct mfcb *mfcbp; 447 448 mfcbp = &ipst->ips_mfcs[MFCHASH(req->src.s_addr, req->grp.s_addr)]; 449 MFCB_REFHOLD(mfcbp); 450 MFCFIND(mfcbp, req->src.s_addr, req->grp.s_addr, rt); 451 452 if (rt != NULL) { 453 mutex_enter(&rt->mfc_mutex); 454 req->pktcnt = rt->mfc_pkt_cnt; 455 req->bytecnt = rt->mfc_byte_cnt; 456 req->wrong_if = rt->mfc_wrong_if; 457 mutex_exit(&rt->mfc_mutex); 458 } else 459 req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffffU; 460 461 MFCB_REFRELE(mfcbp); 462 return (0); 463 } 464 465 /* 466 * Returns the packet, byte, rpf-failure count for the source, group provided. 467 * Uses larger counters and IPv6 addresses. 468 */ 469 /* ARGSUSED XXX until implemented */ 470 static int 471 get_lsg_cnt(struct sioc_lsg_req *req, ip_stack_t *ipst) 472 { 473 /* XXX TODO SIOCGETLSGCNT */ 474 return (ENXIO); 475 } 476 477 /* 478 * Returns the input and output packet and byte counts on the vif provided. 
479 */ 480 static int 481 get_vif_cnt(struct sioc_vif_req *req, ip_stack_t *ipst) 482 { 483 vifi_t vifi = req->vifi; 484 485 if (vifi >= ipst->ips_numvifs) 486 return (EINVAL); 487 488 /* 489 * No locks here, an approximation is fine. 490 */ 491 req->icount = ipst->ips_vifs[vifi].v_pkt_in; 492 req->ocount = ipst->ips_vifs[vifi].v_pkt_out; 493 req->ibytes = ipst->ips_vifs[vifi].v_bytes_in; 494 req->obytes = ipst->ips_vifs[vifi].v_bytes_out; 495 496 return (0); 497 } 498 499 static int 500 get_version(uchar_t *data) 501 { 502 int *v = (int *)data; 503 504 *v = 0x0305; /* XXX !!!! */ 505 506 return (0); 507 } 508 509 /* 510 * Set PIM assert processing global. 511 */ 512 static int 513 set_assert(int *i, ip_stack_t *ipst) 514 { 515 if ((*i != 1) && (*i != 0)) 516 return (EINVAL); 517 518 ipst->ips_pim_assert = *i; 519 520 return (0); 521 } 522 523 /* 524 * Get PIM assert processing global. 525 */ 526 static int 527 get_assert(uchar_t *data, ip_stack_t *ipst) 528 { 529 int *i = (int *)data; 530 531 *i = ipst->ips_pim_assert; 532 533 return (0); 534 } 535 536 /* 537 * Enable multicast routing. 538 */ 539 static int 540 ip_mrouter_init(conn_t *connp, uchar_t *data, int datalen, ip_stack_t *ipst) 541 { 542 int *v; 543 544 if (data == NULL || (datalen != sizeof (int))) 545 return (ENOPROTOOPT); 546 547 v = (int *)data; 548 if (*v != 1) 549 return (ENOPROTOOPT); 550 551 mutex_enter(&ipst->ips_ip_g_mrouter_mutex); 552 if (ipst->ips_ip_g_mrouter != NULL) { 553 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 554 return (EADDRINUSE); 555 } 556 557 /* 558 * MRT_INIT should only be allowed for RAW sockets, but we double 559 * check. 
560 */ 561 if (!IPCL_IS_RAWIP(connp)) { 562 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 563 return (EINVAL); 564 } 565 566 ipst->ips_ip_g_mrouter = connp; 567 connp->conn_multi_router = 1; 568 /* In order for tunnels to work we have to turn ip_g_forward on */ 569 if (!WE_ARE_FORWARDING(ipst)) { 570 if (ipst->ips_ip_mrtdebug > 1) { 571 (void) mi_strlog(connp->conn_rq, 1, SL_TRACE, 572 "ip_mrouter_init: turning on forwarding"); 573 } 574 ipst->ips_saved_ip_g_forward = ipst->ips_ip_g_forward; 575 ipst->ips_ip_g_forward = IP_FORWARD_ALWAYS; 576 } 577 578 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 579 return (0); 580 } 581 582 void 583 ip_mrouter_stack_init(ip_stack_t *ipst) 584 { 585 mutex_init(&ipst->ips_ip_g_mrouter_mutex, NULL, MUTEX_DEFAULT, NULL); 586 587 ipst->ips_vifs = kmem_zalloc(sizeof (struct vif) * (MAXVIFS+1), 588 KM_SLEEP); 589 ipst->ips_mrtstat = kmem_zalloc(sizeof (struct mrtstat), KM_SLEEP); 590 /* 591 * mfctable: 592 * Includes all mfcs, including waiting upcalls. 593 * Multiple mfcs per bucket. 594 */ 595 ipst->ips_mfcs = kmem_zalloc(sizeof (struct mfcb) * MFCTBLSIZ, 596 KM_SLEEP); 597 /* 598 * Define the token bucket filter structures. 599 * tbftable -> each vif has one of these for storing info. 600 */ 601 ipst->ips_tbfs = kmem_zalloc(sizeof (struct tbf) * MAXVIFS, KM_SLEEP); 602 603 mutex_init(&ipst->ips_last_encap_lock, NULL, MUTEX_DEFAULT, NULL); 604 605 ipst->ips_mrtstat->mrts_vifctlSize = sizeof (struct vifctl); 606 ipst->ips_mrtstat->mrts_mfcctlSize = sizeof (struct mfcctl); 607 } 608 609 /* 610 * Disable multicast routing. 611 * Didn't use global timeout_val (BSD version), instead check the mfctable. 
612 */ 613 int 614 ip_mrouter_done(mblk_t *mp, ip_stack_t *ipst) 615 { 616 conn_t *mrouter; 617 vifi_t vifi; 618 struct mfc *mfc_rt; 619 int i; 620 621 mutex_enter(&ipst->ips_ip_g_mrouter_mutex); 622 if (ipst->ips_ip_g_mrouter == NULL) { 623 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 624 return (EINVAL); 625 } 626 627 mrouter = ipst->ips_ip_g_mrouter; 628 629 if (ipst->ips_saved_ip_g_forward != -1) { 630 if (ipst->ips_ip_mrtdebug > 1) { 631 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 632 "ip_mrouter_done: turning off forwarding"); 633 } 634 ipst->ips_ip_g_forward = ipst->ips_saved_ip_g_forward; 635 ipst->ips_saved_ip_g_forward = -1; 636 } 637 638 /* 639 * Always clear cache when vifs change. 640 * No need to get ipst->ips_last_encap_lock since we are running as 641 * a writer. 642 */ 643 mutex_enter(&ipst->ips_last_encap_lock); 644 ipst->ips_last_encap_src = 0; 645 ipst->ips_last_encap_vif = NULL; 646 mutex_exit(&ipst->ips_last_encap_lock); 647 mrouter->conn_multi_router = 0; 648 649 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 650 651 /* 652 * For each phyint in use, 653 * disable promiscuous reception of all IP multicasts. 654 */ 655 for (vifi = 0; vifi < MAXVIFS; vifi++) { 656 struct vif *vifp = ipst->ips_vifs + vifi; 657 658 mutex_enter(&vifp->v_lock); 659 /* 660 * if the vif is active mark it condemned. 661 */ 662 if (vifp->v_marks & VIF_MARK_GOOD) { 663 ASSERT(vifp->v_ipif != NULL); 664 ipif_refhold(vifp->v_ipif); 665 /* Phyint only */ 666 if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) { 667 ipif_t *ipif = vifp->v_ipif; 668 ipsq_t *ipsq; 669 boolean_t suc; 670 ill_t *ill; 671 672 ill = ipif->ipif_ill; 673 suc = B_FALSE; 674 if (mp == NULL) { 675 /* 676 * being called from ip_close, 677 * lets do it synchronously. 678 * Clear VIF_MARK_GOOD and 679 * set VIF_MARK_CONDEMNED. 
680 */ 681 vifp->v_marks &= ~VIF_MARK_GOOD; 682 vifp->v_marks |= VIF_MARK_CONDEMNED; 683 mutex_exit(&(vifp)->v_lock); 684 suc = ipsq_enter(ill, B_FALSE, NEW_OP); 685 ipsq = ill->ill_phyint->phyint_ipsq; 686 } else { 687 ipsq = ipsq_try_enter(ipif, NULL, 688 mrouter->conn_wq, mp, 689 ip_restart_optmgmt, NEW_OP, B_TRUE); 690 if (ipsq == NULL) { 691 mutex_exit(&(vifp)->v_lock); 692 ipif_refrele(ipif); 693 return (EINPROGRESS); 694 } 695 /* 696 * Clear VIF_MARK_GOOD and 697 * set VIF_MARK_CONDEMNED. 698 */ 699 vifp->v_marks &= ~VIF_MARK_GOOD; 700 vifp->v_marks |= VIF_MARK_CONDEMNED; 701 mutex_exit(&(vifp)->v_lock); 702 suc = B_TRUE; 703 } 704 705 if (suc) { 706 (void) ip_delmulti(INADDR_ANY, ipif, 707 B_TRUE, B_TRUE); 708 ipsq_exit(ipsq); 709 } 710 mutex_enter(&vifp->v_lock); 711 } 712 /* 713 * decreases the refcnt added in add_vif. 714 * and release v_lock. 715 */ 716 VIF_REFRELE_LOCKED(vifp); 717 } else { 718 mutex_exit(&vifp->v_lock); 719 continue; 720 } 721 } 722 723 mutex_enter(&ipst->ips_numvifs_mutex); 724 ipst->ips_numvifs = 0; 725 ipst->ips_pim_assert = 0; 726 ipst->ips_reg_vif_num = ALL_VIFS; 727 mutex_exit(&ipst->ips_numvifs_mutex); 728 729 /* 730 * Free upcall msgs. 731 * Go through mfctable and stop any outstanding upcall 732 * timeouts remaining on mfcs. 733 */ 734 for (i = 0; i < MFCTBLSIZ; i++) { 735 mutex_enter(&ipst->ips_mfcs[i].mfcb_lock); 736 ipst->ips_mfcs[i].mfcb_refcnt++; 737 ipst->ips_mfcs[i].mfcb_marks |= MFCB_MARK_CONDEMNED; 738 mutex_exit(&ipst->ips_mfcs[i].mfcb_lock); 739 mfc_rt = ipst->ips_mfcs[i].mfcb_mfc; 740 while (mfc_rt) { 741 /* Free upcalls */ 742 mutex_enter(&mfc_rt->mfc_mutex); 743 if (mfc_rt->mfc_rte != NULL) { 744 if (mfc_rt->mfc_timeout_id != 0) { 745 /* 746 * OK to drop the lock as we have 747 * a refcnt on the bucket. timeout 748 * can fire but it will see that 749 * mfc_timeout_id == 0 and not do 750 * anything. see expire_upcalls(). 
751 */ 752 mfc_rt->mfc_timeout_id = 0; 753 mutex_exit(&mfc_rt->mfc_mutex); 754 (void) untimeout( 755 mfc_rt->mfc_timeout_id); 756 mfc_rt->mfc_timeout_id = 0; 757 mutex_enter(&mfc_rt->mfc_mutex); 758 759 /* 760 * all queued upcall packets 761 * and mblk will be freed in 762 * release_mfc(). 763 */ 764 } 765 } 766 767 mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED; 768 769 mutex_exit(&mfc_rt->mfc_mutex); 770 mfc_rt = mfc_rt->mfc_next; 771 } 772 MFCB_REFRELE(&ipst->ips_mfcs[i]); 773 } 774 775 mutex_enter(&ipst->ips_ip_g_mrouter_mutex); 776 ipst->ips_ip_g_mrouter = NULL; 777 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 778 return (0); 779 } 780 781 void 782 ip_mrouter_stack_destroy(ip_stack_t *ipst) 783 { 784 struct mfcb *mfcbp; 785 struct mfc *rt; 786 int i; 787 788 for (i = 0; i < MFCTBLSIZ; i++) { 789 mfcbp = &ipst->ips_mfcs[i]; 790 791 while ((rt = mfcbp->mfcb_mfc) != NULL) { 792 (void) printf("ip_mrouter_stack_destroy: free for %d\n", 793 i); 794 795 mfcbp->mfcb_mfc = rt->mfc_next; 796 free_queue(rt); 797 mi_free(rt); 798 } 799 } 800 kmem_free(ipst->ips_vifs, sizeof (struct vif) * (MAXVIFS+1)); 801 ipst->ips_vifs = NULL; 802 kmem_free(ipst->ips_mrtstat, sizeof (struct mrtstat)); 803 ipst->ips_mrtstat = NULL; 804 kmem_free(ipst->ips_mfcs, sizeof (struct mfcb) * MFCTBLSIZ); 805 ipst->ips_mfcs = NULL; 806 kmem_free(ipst->ips_tbfs, sizeof (struct tbf) * MAXVIFS); 807 ipst->ips_tbfs = NULL; 808 809 mutex_destroy(&ipst->ips_last_encap_lock); 810 mutex_destroy(&ipst->ips_ip_g_mrouter_mutex); 811 } 812 813 static boolean_t 814 is_mrouter_off(ip_stack_t *ipst) 815 { 816 conn_t *mrouter; 817 818 mutex_enter(&ipst->ips_ip_g_mrouter_mutex); 819 if (ipst->ips_ip_g_mrouter == NULL) { 820 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 821 return (B_TRUE); 822 } 823 824 mrouter = ipst->ips_ip_g_mrouter; 825 if (mrouter->conn_multi_router == 0) { 826 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 827 return (B_TRUE); 828 } 829 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 830 return (B_FALSE); 831 } 
832 833 static void 834 unlock_good_vif(struct vif *vifp) 835 { 836 ASSERT(vifp->v_ipif != NULL); 837 ipif_refrele(vifp->v_ipif); 838 VIF_REFRELE(vifp); 839 } 840 841 static boolean_t 842 lock_good_vif(struct vif *vifp) 843 { 844 mutex_enter(&vifp->v_lock); 845 if (!(vifp->v_marks & VIF_MARK_GOOD)) { 846 mutex_exit(&vifp->v_lock); 847 return (B_FALSE); 848 } 849 850 ASSERT(vifp->v_ipif != NULL); 851 mutex_enter(&vifp->v_ipif->ipif_ill->ill_lock); 852 if (!IPIF_CAN_LOOKUP(vifp->v_ipif)) { 853 mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock); 854 mutex_exit(&vifp->v_lock); 855 return (B_FALSE); 856 } 857 ipif_refhold_locked(vifp->v_ipif); 858 mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock); 859 vifp->v_refcnt++; 860 mutex_exit(&vifp->v_lock); 861 return (B_TRUE); 862 } 863 864 /* 865 * Add a vif to the vif table. 866 */ 867 static int 868 add_vif(struct vifctl *vifcp, conn_t *connp, mblk_t *first_mp, ip_stack_t *ipst) 869 { 870 struct vif *vifp = ipst->ips_vifs + vifcp->vifc_vifi; 871 ipif_t *ipif; 872 int error; 873 struct tbf *v_tbf = ipst->ips_tbfs + vifcp->vifc_vifi; 874 ipsq_t *ipsq; 875 conn_t *mrouter = ipst->ips_ip_g_mrouter; 876 877 ASSERT(connp != NULL); 878 879 if (vifcp->vifc_vifi >= MAXVIFS) 880 return (EINVAL); 881 882 if (is_mrouter_off(ipst)) 883 return (EINVAL); 884 885 mutex_enter(&vifp->v_lock); 886 /* 887 * Viftable entry should be 0. 888 * if v_marks == 0 but v_refcnt != 0 means struct is being 889 * initialized. 890 * 891 * Also note that it is very unlikely that we will get a MRT_ADD_VIF 892 * request while the delete is in progress, mrouted only sends add 893 * requests when a new interface is added and the new interface cannot 894 * have the same vifi as an existing interface. We make sure that 895 * ill_delete will block till the vif is deleted by adding a refcnt 896 * to ipif in del_vif(). 
897 */ 898 if (vifp->v_lcl_addr.s_addr != 0 || 899 vifp->v_marks != 0 || 900 vifp->v_refcnt != 0) { 901 mutex_exit(&vifp->v_lock); 902 return (EADDRINUSE); 903 } 904 905 /* Incoming vif should not be 0 */ 906 if (vifcp->vifc_lcl_addr.s_addr == 0) { 907 mutex_exit(&vifp->v_lock); 908 return (EINVAL); 909 } 910 911 vifp->v_refcnt++; 912 mutex_exit(&vifp->v_lock); 913 /* Find the interface with the local address */ 914 ipif = ipif_lookup_addr((ipaddr_t)vifcp->vifc_lcl_addr.s_addr, NULL, 915 connp->conn_zoneid, CONNP_TO_WQ(connp), first_mp, 916 ip_restart_optmgmt, &error, ipst); 917 if (ipif == NULL) { 918 VIF_REFRELE(vifp); 919 if (error == EINPROGRESS) 920 return (error); 921 return (EADDRNOTAVAIL); 922 } 923 924 /* 925 * We have to be exclusive as we have to call ip_addmulti() 926 * This is the best position to try to be exclusive in case 927 * we have to wait. 928 */ 929 ipsq = ipsq_try_enter(ipif, NULL, CONNP_TO_WQ(connp), first_mp, 930 ip_restart_optmgmt, NEW_OP, B_TRUE); 931 if ((ipsq) == NULL) { 932 VIF_REFRELE(vifp); 933 ipif_refrele(ipif); 934 return (EINPROGRESS); 935 } 936 937 if (ipst->ips_ip_mrtdebug > 1) { 938 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 939 "add_vif: src 0x%x enter", 940 vifcp->vifc_lcl_addr.s_addr); 941 } 942 943 mutex_enter(&vifp->v_lock); 944 /* 945 * Always clear cache when vifs change. 946 * Needed to ensure that src isn't left over from before vif was added. 947 * No need to get last_encap_lock, since we are running as a writer. 
948 */ 949 950 mutex_enter(&ipst->ips_last_encap_lock); 951 ipst->ips_last_encap_src = 0; 952 ipst->ips_last_encap_vif = NULL; 953 mutex_exit(&ipst->ips_last_encap_lock); 954 955 if (vifcp->vifc_flags & VIFF_TUNNEL) { 956 if ((vifcp->vifc_flags & VIFF_SRCRT) != 0) { 957 cmn_err(CE_WARN, 958 "add_vif: source route tunnels not supported\n"); 959 VIF_REFRELE_LOCKED(vifp); 960 ipif_refrele(ipif); 961 ipsq_exit(ipsq); 962 return (EOPNOTSUPP); 963 } 964 vifp->v_rmt_addr = vifcp->vifc_rmt_addr; 965 966 } else { 967 /* Phyint or Register vif */ 968 if (vifcp->vifc_flags & VIFF_REGISTER) { 969 /* 970 * Note: Since all IPPROTO_IP level options (including 971 * MRT_ADD_VIF) are done exclusively via 972 * ip_optmgmt_writer(), a lock is not necessary to 973 * protect reg_vif_num. 974 */ 975 mutex_enter(&ipst->ips_numvifs_mutex); 976 if (ipst->ips_reg_vif_num == ALL_VIFS) { 977 ipst->ips_reg_vif_num = vifcp->vifc_vifi; 978 mutex_exit(&ipst->ips_numvifs_mutex); 979 } else { 980 mutex_exit(&ipst->ips_numvifs_mutex); 981 VIF_REFRELE_LOCKED(vifp); 982 ipif_refrele(ipif); 983 ipsq_exit(ipsq); 984 return (EADDRINUSE); 985 } 986 } 987 988 /* Make sure the interface supports multicast */ 989 if ((ipif->ipif_ill->ill_flags & ILLF_MULTICAST) == 0) { 990 VIF_REFRELE_LOCKED(vifp); 991 ipif_refrele(ipif); 992 if (vifcp->vifc_flags & VIFF_REGISTER) { 993 mutex_enter(&ipst->ips_numvifs_mutex); 994 ipst->ips_reg_vif_num = ALL_VIFS; 995 mutex_exit(&ipst->ips_numvifs_mutex); 996 } 997 ipsq_exit(ipsq); 998 return (EOPNOTSUPP); 999 } 1000 /* Enable promiscuous reception of all IP mcasts from the if */ 1001 mutex_exit(&vifp->v_lock); 1002 error = ip_addmulti(INADDR_ANY, ipif, ILGSTAT_NONE, 1003 MODE_IS_EXCLUDE, NULL); 1004 mutex_enter(&vifp->v_lock); 1005 /* 1006 * since we released the lock lets make sure that 1007 * ip_mrouter_done() has not been called. 
1008 */ 1009 if (error != 0 || is_mrouter_off(ipst)) { 1010 if (error == 0) 1011 (void) ip_delmulti(INADDR_ANY, ipif, B_TRUE, 1012 B_TRUE); 1013 if (vifcp->vifc_flags & VIFF_REGISTER) { 1014 mutex_enter(&ipst->ips_numvifs_mutex); 1015 ipst->ips_reg_vif_num = ALL_VIFS; 1016 mutex_exit(&ipst->ips_numvifs_mutex); 1017 } 1018 VIF_REFRELE_LOCKED(vifp); 1019 ipif_refrele(ipif); 1020 ipsq_exit(ipsq); 1021 return (error?error:EINVAL); 1022 } 1023 } 1024 /* Define parameters for the tbf structure */ 1025 vifp->v_tbf = v_tbf; 1026 gethrestime(&vifp->v_tbf->tbf_last_pkt_t); 1027 vifp->v_tbf->tbf_n_tok = 0; 1028 vifp->v_tbf->tbf_q_len = 0; 1029 vifp->v_tbf->tbf_max_q_len = MAXQSIZE; 1030 vifp->v_tbf->tbf_q = vifp->v_tbf->tbf_t = NULL; 1031 1032 vifp->v_flags = vifcp->vifc_flags; 1033 vifp->v_threshold = vifcp->vifc_threshold; 1034 vifp->v_lcl_addr = vifcp->vifc_lcl_addr; 1035 vifp->v_ipif = ipif; 1036 ipif_refrele(ipif); 1037 /* Scaling up here, allows division by 1024 in critical code. */ 1038 vifp->v_rate_limit = vifcp->vifc_rate_limit * (1024/1000); 1039 vifp->v_timeout_id = 0; 1040 /* initialize per vif pkt counters */ 1041 vifp->v_pkt_in = 0; 1042 vifp->v_pkt_out = 0; 1043 vifp->v_bytes_in = 0; 1044 vifp->v_bytes_out = 0; 1045 mutex_init(&vifp->v_tbf->tbf_lock, NULL, MUTEX_DEFAULT, NULL); 1046 1047 /* Adjust numvifs up, if the vifi is higher than numvifs */ 1048 mutex_enter(&ipst->ips_numvifs_mutex); 1049 if (ipst->ips_numvifs <= vifcp->vifc_vifi) 1050 ipst->ips_numvifs = vifcp->vifc_vifi + 1; 1051 mutex_exit(&ipst->ips_numvifs_mutex); 1052 1053 if (ipst->ips_ip_mrtdebug > 1) { 1054 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 1055 "add_vif: #%d, lcladdr %x, %s %x, thresh %x, rate %d", 1056 vifcp->vifc_vifi, 1057 ntohl(vifcp->vifc_lcl_addr.s_addr), 1058 (vifcp->vifc_flags & VIFF_TUNNEL) ? 
"rmtaddr" : "mask", 1059 ntohl(vifcp->vifc_rmt_addr.s_addr), 1060 vifcp->vifc_threshold, vifcp->vifc_rate_limit); 1061 } 1062 1063 vifp->v_marks = VIF_MARK_GOOD; 1064 mutex_exit(&vifp->v_lock); 1065 ipsq_exit(ipsq); 1066 return (0); 1067 } 1068 1069 1070 /* Delete a vif from the vif table. */ 1071 static void 1072 del_vifp(struct vif *vifp) 1073 { 1074 struct tbf *t = vifp->v_tbf; 1075 mblk_t *mp0; 1076 vifi_t vifi; 1077 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 1078 conn_t *mrouter = ipst->ips_ip_g_mrouter; 1079 1080 ASSERT(vifp->v_marks & VIF_MARK_CONDEMNED); 1081 ASSERT(t != NULL); 1082 1083 /* 1084 * release the ref we put in vif_del. 1085 */ 1086 ASSERT(vifp->v_ipif != NULL); 1087 ipif_refrele(vifp->v_ipif); 1088 1089 if (ipst->ips_ip_mrtdebug > 1) { 1090 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 1091 "del_vif: src 0x%x\n", vifp->v_lcl_addr.s_addr); 1092 } 1093 1094 if (vifp->v_timeout_id != 0) { 1095 (void) untimeout(vifp->v_timeout_id); 1096 vifp->v_timeout_id = 0; 1097 } 1098 1099 /* 1100 * Free packets queued at the interface. 1101 * Mrouted takes care of cleaning up mfcs - makes calls to del_mfc. 1102 */ 1103 mutex_enter(&t->tbf_lock); 1104 while (t->tbf_q != NULL) { 1105 mp0 = t->tbf_q; 1106 t->tbf_q = t->tbf_q->b_next; 1107 mp0->b_prev = mp0->b_next = NULL; 1108 freemsg(mp0); 1109 } 1110 mutex_exit(&t->tbf_lock); 1111 1112 /* 1113 * Always clear cache when vifs change. 1114 * No need to get last_encap_lock since we are running as a writer. 
1115 */ 1116 mutex_enter(&ipst->ips_last_encap_lock); 1117 if (vifp == ipst->ips_last_encap_vif) { 1118 ipst->ips_last_encap_vif = NULL; 1119 ipst->ips_last_encap_src = 0; 1120 } 1121 mutex_exit(&ipst->ips_last_encap_lock); 1122 1123 mutex_destroy(&t->tbf_lock); 1124 1125 bzero(vifp->v_tbf, sizeof (*(vifp->v_tbf))); 1126 1127 /* Adjust numvifs down */ 1128 mutex_enter(&ipst->ips_numvifs_mutex); 1129 for (vifi = ipst->ips_numvifs; vifi != 0; vifi--) /* vifi is unsigned */ 1130 if (ipst->ips_vifs[vifi - 1].v_lcl_addr.s_addr != 0) 1131 break; 1132 ipst->ips_numvifs = vifi; 1133 mutex_exit(&ipst->ips_numvifs_mutex); 1134 1135 bzero(vifp, sizeof (*vifp)); 1136 } 1137 1138 static int 1139 del_vif(vifi_t *vifip, conn_t *connp, mblk_t *first_mp, ip_stack_t *ipst) 1140 { 1141 struct vif *vifp = ipst->ips_vifs + *vifip; 1142 ipsq_t *ipsq; 1143 1144 if (*vifip >= ipst->ips_numvifs) 1145 return (EINVAL); 1146 1147 1148 mutex_enter(&vifp->v_lock); 1149 /* 1150 * Not initialized 1151 * Here we are not looking at the vif that is being initialized 1152 * i.e vifp->v_marks == 0 and refcnt > 0. 1153 */ 1154 if (vifp->v_lcl_addr.s_addr == 0 || 1155 !(vifp->v_marks & VIF_MARK_GOOD)) { 1156 mutex_exit(&vifp->v_lock); 1157 return (EADDRNOTAVAIL); 1158 } 1159 1160 /* 1161 * This is an optimization, if first_mp == NULL 1162 * than we are being called from reset_mrt_vif_ipif() 1163 * so we already have exclusive access to the ipsq. 1164 * the ASSERT below is a check for this condition. 1165 */ 1166 if (first_mp != NULL && 1167 !(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) { 1168 ASSERT(connp != NULL); 1169 /* 1170 * We have to be exclusive as we have to call ip_delmulti() 1171 * This is the best position to try to be exclusive in case 1172 * we have to wait. 
1173 */ 1174 ipsq = ipsq_try_enter(vifp->v_ipif, NULL, CONNP_TO_WQ(connp), 1175 first_mp, ip_restart_optmgmt, NEW_OP, B_TRUE); 1176 if ((ipsq) == NULL) { 1177 mutex_exit(&vifp->v_lock); 1178 return (EINPROGRESS); 1179 } 1180 /* recheck after being exclusive */ 1181 if (vifp->v_lcl_addr.s_addr == 0 || 1182 !vifp->v_marks & VIF_MARK_GOOD) { 1183 /* 1184 * someone beat us. 1185 */ 1186 mutex_exit(&vifp->v_lock); 1187 ipsq_exit(ipsq); 1188 return (EADDRNOTAVAIL); 1189 } 1190 } 1191 1192 1193 ASSERT(IAM_WRITER_IPIF(vifp->v_ipif)); 1194 1195 1196 /* 1197 * add a refhold so that ipif does not go away while 1198 * there are still users, this will be released in del_vifp 1199 * when we free the vif. 1200 */ 1201 ipif_refhold(vifp->v_ipif); 1202 1203 /* Clear VIF_MARK_GOOD and set VIF_MARK_CONDEMNED. */ 1204 vifp->v_marks &= ~VIF_MARK_GOOD; 1205 vifp->v_marks |= VIF_MARK_CONDEMNED; 1206 1207 /* Phyint only */ 1208 if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) { 1209 ipif_t *ipif = vifp->v_ipif; 1210 ASSERT(ipif != NULL); 1211 /* 1212 * should be OK to drop the lock as we 1213 * have marked this as CONDEMNED. 1214 */ 1215 mutex_exit(&(vifp)->v_lock); 1216 (void) ip_delmulti(INADDR_ANY, ipif, B_TRUE, B_TRUE); 1217 if (first_mp != NULL) 1218 ipsq_exit(ipsq); 1219 mutex_enter(&(vifp)->v_lock); 1220 } 1221 1222 /* 1223 * decreases the refcnt added in add_vif. 1224 */ 1225 VIF_REFRELE_LOCKED(vifp); 1226 return (0); 1227 } 1228 1229 /* 1230 * Add an mfc entry. 1231 */ 1232 static int 1233 add_mfc(struct mfcctl *mfccp, ip_stack_t *ipst) 1234 { 1235 struct mfc *rt; 1236 struct rtdetq *rte; 1237 ushort_t nstl; 1238 int i; 1239 struct mfcb *mfcbp; 1240 conn_t *mrouter = ipst->ips_ip_g_mrouter; 1241 1242 /* 1243 * The value of vifi is NO_VIF (==MAXVIFS) if Mrouted 1244 * did not have a real route for pkt. 
	 * We want this pkt without rt installed in the mfctable to prevent
	 * multiple tries, so go ahead and put it in mfctable, it will
	 * be discarded later in ip_mdq() because the child is NULL.
	 */

	/* Error checking, out of bounds? */
	if (mfccp->mfcc_parent > MAXVIFS) {
		ip0dbg(("ADD_MFC: mfcc_parent out of range %d",
		    (int)mfccp->mfcc_parent));
		return (EINVAL);
	}

	/* A real (non-NO_VIF) parent vif must still have an ipif attached. */
	if ((mfccp->mfcc_parent != NO_VIF) &&
	    (ipst->ips_vifs[mfccp->mfcc_parent].v_ipif == NULL)) {
		ip0dbg(("ADD_MFC: NULL ipif for parent vif %d\n",
		    (int)mfccp->mfcc_parent));
		return (EINVAL);
	}

	if (is_mrouter_off(ipst)) {
		return (EINVAL);
	}

	mfcbp = &ipst->ips_mfcs[MFCHASH(mfccp->mfcc_origin.s_addr,
	    mfccp->mfcc_mcastgrp.s_addr)];
	MFCB_REFHOLD(mfcbp);
	MFCFIND(mfcbp, mfccp->mfcc_origin.s_addr,
	    mfccp->mfcc_mcastgrp.s_addr, rt);

	/* If an entry already exists, just update the fields */
	if (rt) {
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "add_mfc: update o %x grp %x parent %x",
			    ntohl(mfccp->mfcc_origin.s_addr),
			    ntohl(mfccp->mfcc_mcastgrp.s_addr),
			    mfccp->mfcc_parent);
		}
		mutex_enter(&rt->mfc_mutex);
		rt->mfc_parent = mfccp->mfcc_parent;

		/* numvifs can change; copy the ttl vector under its lock. */
		mutex_enter(&ipst->ips_numvifs_mutex);
		for (i = 0; i < (int)ipst->ips_numvifs; i++)
			rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
		mutex_exit(&ipst->ips_numvifs_mutex);
		mutex_exit(&rt->mfc_mutex);

		MFCB_REFRELE(mfcbp);
		return (0);
	}

	/*
	 * Find the entry for which the upcall was made and update.
	 */
	for (rt = mfcbp->mfcb_mfc, nstl = 0; rt; rt = rt->mfc_next) {
		mutex_enter(&rt->mfc_mutex);
		if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) &&
		    (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr) &&
		    (rt->mfc_rte != NULL) &&
		    !(rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
			if (nstl++ != 0)
				cmn_err(CE_WARN,
				    "add_mfc: %s o %x g %x p %x",
				    "multiple kernel entries",
				    ntohl(mfccp->mfcc_origin.s_addr),
				    ntohl(mfccp->mfcc_mcastgrp.s_addr),
				    mfccp->mfcc_parent);

			if (ipst->ips_ip_mrtdebug > 1) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "add_mfc: o %x g %x p %x",
				    ntohl(mfccp->mfcc_origin.s_addr),
				    ntohl(mfccp->mfcc_mcastgrp.s_addr),
				    mfccp->mfcc_parent);
			}
			fill_route(rt, mfccp, ipst);

			/*
			 * Prevent cleanup of cache entry.
			 * Timer starts in ip_mforward.
			 */
			if (rt->mfc_timeout_id != 0) {
				timeout_id_t id;
				id = rt->mfc_timeout_id;
				/*
				 * setting id to zero will avoid this
				 * entry from being cleaned up in
				 * expire_up_calls().
				 */
				rt->mfc_timeout_id = 0;
				/*
				 * dropping the lock is fine as we
				 * have a refhold on the bucket.
				 * so mfc cannot be freed.
				 * The timeout can fire but it will see
				 * that mfc_timeout_id == 0 and not cleanup.
				 */
				mutex_exit(&rt->mfc_mutex);
				(void) untimeout(id);
				mutex_enter(&rt->mfc_mutex);
			}

			/*
			 * Send all pkts that are queued waiting for the upcall.
			 * ip_mdq param tun set to 0 -
			 * the return value of ip_mdq() isn't used here,
			 * so value we send doesn't matter.
			 */
			while (rt->mfc_rte != NULL) {
				rte = rt->mfc_rte;
				rt->mfc_rte = rte->rte_next;
				/* drop the mfc lock across ip_mdq() */
				mutex_exit(&rt->mfc_mutex);
				(void) ip_mdq(rte->mp, (ipha_t *)
				    rte->mp->b_rptr, rte->ill, 0, rt);
				freemsg(rte->mp);
				mi_free((char *)rte);
				mutex_enter(&rt->mfc_mutex);
			}
		}
		mutex_exit(&rt->mfc_mutex);
	}


	/*
	 * It is possible that an entry is being inserted without an upcall
	 */
	if (nstl == 0) {
		mutex_enter(&(mfcbp->mfcb_lock));
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "add_mfc: no upcall o %x g %x p %x",
			    ntohl(mfccp->mfcc_origin.s_addr),
			    ntohl(mfccp->mfcc_mcastgrp.s_addr),
			    mfccp->mfcc_parent);
		}
		if (is_mrouter_off(ipst)) {
			mutex_exit(&mfcbp->mfcb_lock);
			MFCB_REFRELE(mfcbp);
			return (EINVAL);
		}

		for (rt = mfcbp->mfcb_mfc; rt; rt = rt->mfc_next) {

			mutex_enter(&rt->mfc_mutex);
			if ((rt->mfc_origin.s_addr ==
			    mfccp->mfcc_origin.s_addr) &&
			    (rt->mfc_mcastgrp.s_addr ==
			    mfccp->mfcc_mcastgrp.s_addr) &&
			    (!(rt->mfc_marks & MFCB_MARK_CONDEMNED))) {
				fill_route(rt, mfccp, ipst);
				mutex_exit(&rt->mfc_mutex);
				break;
			}
			mutex_exit(&rt->mfc_mutex);
		}

		/* No upcall, so make a new entry into mfctable */
		if (rt == NULL) {
			rt = (struct mfc *)mi_zalloc(sizeof (struct mfc));
			if (rt == NULL) {
				ip1dbg(("add_mfc: out of memory\n"));
				mutex_exit(&mfcbp->mfcb_lock);
				MFCB_REFRELE(mfcbp);
				return (ENOBUFS);
			}

			/* Insert new entry at head of hash chain */
			mutex_enter(&rt->mfc_mutex);
			fill_route(rt, mfccp, ipst);

			/* Link into table */
			rt->mfc_next = mfcbp->mfcb_mfc;
			mfcbp->mfcb_mfc = rt;
			mutex_exit(&rt->mfc_mutex);
		}
		mutex_exit(&mfcbp->mfcb_lock);
	}

	MFCB_REFRELE(mfcbp);
	return (0);
}

/*
 * Fills in mfc structure from
 * mrouted mfcctl.
 */
static void
fill_route(struct mfc *rt, struct mfcctl *mfccp, ip_stack_t *ipst)
{
	int i;

	rt->mfc_origin = mfccp->mfcc_origin;
	rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp;
	rt->mfc_parent = mfccp->mfcc_parent;
	/* numvifs can change; copy the ttl vector under its lock. */
	mutex_enter(&ipst->ips_numvifs_mutex);
	for (i = 0; i < (int)ipst->ips_numvifs; i++) {
		rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
	}
	mutex_exit(&ipst->ips_numvifs_mutex);
	/* Initialize pkt counters per src-grp */
	rt->mfc_pkt_cnt = 0;
	rt->mfc_byte_cnt = 0;
	rt->mfc_wrong_if = 0;
	rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_nsec = 0;

}

/*
 * Free every upcall packet still queued on the mfc entry.
 */
static void
free_queue(struct mfc *mfcp)
{
	struct rtdetq *rte0;

	/*
	 * Drop all queued upcall packets.
	 * Free the mbuf with the pkt.
	 */
	while ((rte0 = mfcp->mfc_rte) != NULL) {
		mfcp->mfc_rte = rte0->rte_next;
		freemsg(rte0->mp);
		mi_free((char *)rte0);
	}
}
/*
 * go through the hash bucket and free all the entries marked condemned.
 */
void
release_mfc(struct mfcb *mfcbp)
{
	struct mfc *current_mfcp;
	struct mfc *prev_mfcp;

	prev_mfcp = current_mfcp = mfcbp->mfcb_mfc;

	while (current_mfcp != NULL) {
		if (current_mfcp->mfc_marks & MFCB_MARK_CONDEMNED) {
			if (current_mfcp == mfcbp->mfcb_mfc) {
				/* condemned entry at the head: unlink it */
				mfcbp->mfcb_mfc = current_mfcp->mfc_next;
				free_queue(current_mfcp);
				mi_free(current_mfcp);
				prev_mfcp = current_mfcp = mfcbp->mfcb_mfc;
				continue;
			}
			ASSERT(prev_mfcp != NULL);
			/* unlink a condemned entry from mid-chain */
			prev_mfcp->mfc_next = current_mfcp->mfc_next;
			free_queue(current_mfcp);
			mi_free(current_mfcp);
			current_mfcp = NULL;
		} else {
			prev_mfcp = current_mfcp;
		}

		current_mfcp = prev_mfcp->mfc_next;

	}
	mfcbp->mfcb_marks &= ~MFCB_MARK_CONDEMNED;
	ASSERT(mfcbp->mfcb_mfc != NULL || mfcbp->mfcb_marks == 0);
}

/*
 * Delete an mfc entry.
 */
static int
del_mfc(struct mfcctl *mfccp, ip_stack_t *ipst)
{
	struct in_addr	origin;
	struct in_addr	mcastgrp;
	struct mfc	*rt;
	uint_t		hash;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	origin = mfccp->mfcc_origin;
	mcastgrp = mfccp->mfcc_mcastgrp;
	hash = MFCHASH(origin.s_addr, mcastgrp.s_addr);

	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "del_mfc: o %x g %x",
		    ntohl(origin.s_addr),
		    ntohl(mcastgrp.s_addr));
	}

	MFCB_REFHOLD(&ipst->ips_mfcs[hash]);

	/* Find mfc in mfctable, finds only entries without upcalls */
	for (rt = ipst->ips_mfcs[hash].mfcb_mfc; rt; rt = rt->mfc_next) {
		mutex_enter(&rt->mfc_mutex);
		if (origin.s_addr == rt->mfc_origin.s_addr &&
		    mcastgrp.s_addr == rt->mfc_mcastgrp.s_addr &&
		    rt->mfc_rte == NULL &&
		    !(rt->mfc_marks & MFCB_MARK_CONDEMNED))
			break;
		mutex_exit(&rt->mfc_mutex);
	}

	/*
	 * Return if there was an upcall (mfc_rte != NULL,
1541 * or rt not in mfctable. 1542 */ 1543 if (rt == NULL) { 1544 MFCB_REFRELE(&ipst->ips_mfcs[hash]); 1545 return (EADDRNOTAVAIL); 1546 } 1547 1548 1549 /* 1550 * no need to hold lock as we have a reference. 1551 */ 1552 ipst->ips_mfcs[hash].mfcb_marks |= MFCB_MARK_CONDEMNED; 1553 /* error checking */ 1554 if (rt->mfc_timeout_id != 0) { 1555 ip0dbg(("del_mfc: TIMEOUT NOT 0, rte not null")); 1556 /* 1557 * Its ok to drop the lock, the struct cannot be freed 1558 * since we have a ref on the hash bucket. 1559 */ 1560 rt->mfc_timeout_id = 0; 1561 mutex_exit(&rt->mfc_mutex); 1562 (void) untimeout(rt->mfc_timeout_id); 1563 mutex_enter(&rt->mfc_mutex); 1564 } 1565 1566 ASSERT(rt->mfc_rte == NULL); 1567 1568 1569 /* 1570 * Delete the entry from the cache 1571 */ 1572 rt->mfc_marks |= MFCB_MARK_CONDEMNED; 1573 mutex_exit(&rt->mfc_mutex); 1574 1575 MFCB_REFRELE(&ipst->ips_mfcs[hash]); 1576 1577 return (0); 1578 } 1579 1580 #define TUNNEL_LEN 12 /* # bytes of IP option for tunnel encapsulation */ 1581 1582 /* 1583 * IP multicast forwarding function. This function assumes that the packet 1584 * pointed to by ipha has arrived on (or is about to be sent to) the interface 1585 * pointed to by "ill", and the packet is to be relayed to other networks 1586 * that have members of the packet's destination IP multicast group. 1587 * 1588 * The packet is returned unscathed to the caller, unless it is 1589 * erroneous, in which case a -1 value tells the caller (IP) 1590 * to discard it. 1591 * 1592 * Unlike BSD, SunOS 5.x needs to return to IP info about 1593 * whether pkt came in thru a tunnel, so it can be discarded, unless 1594 * it's IGMP. In BSD, the ifp is bogus for tunnels, so pkt won't try 1595 * to be delivered. 
1596 * Return values are 0 - pkt is okay and phyint 1597 * -1 - pkt is malformed and to be tossed 1598 * 1 - pkt came in on tunnel 1599 */ 1600 int 1601 ip_mforward(ill_t *ill, ipha_t *ipha, mblk_t *mp) 1602 { 1603 struct mfc *rt; 1604 ipaddr_t src, dst, tunnel_src = 0; 1605 static int srctun = 0; 1606 vifi_t vifi; 1607 boolean_t pim_reg_packet = B_FALSE; 1608 struct mfcb *mfcbp; 1609 ip_stack_t *ipst = ill->ill_ipst; 1610 conn_t *mrouter = ipst->ips_ip_g_mrouter; 1611 1612 if (ipst->ips_ip_mrtdebug > 1) { 1613 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 1614 "ip_mforward: RECV ipha_src %x, ipha_dst %x, ill %s", 1615 ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst), 1616 ill->ill_name); 1617 } 1618 1619 dst = ipha->ipha_dst; 1620 if ((uint32_t)(uintptr_t)mp->b_prev == PIM_REGISTER_MARKER) 1621 pim_reg_packet = B_TRUE; 1622 else 1623 tunnel_src = (ipaddr_t)(uintptr_t)mp->b_prev; 1624 1625 /* 1626 * Don't forward a packet with time-to-live of zero or one, 1627 * or a packet destined to a local-only group. 1628 */ 1629 if (CLASSD(dst) && (ipha->ipha_ttl <= 1 || 1630 (ipaddr_t)ntohl(dst) <= INADDR_MAX_LOCAL_GROUP)) { 1631 if (ipst->ips_ip_mrtdebug > 1) { 1632 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 1633 "ip_mforward: not forwarded ttl %d," 1634 " dst 0x%x ill %s", 1635 ipha->ipha_ttl, ntohl(dst), ill->ill_name); 1636 } 1637 mp->b_prev = NULL; 1638 if (tunnel_src != 0) 1639 return (1); 1640 else 1641 return (0); 1642 } 1643 1644 if ((tunnel_src != 0) || pim_reg_packet) { 1645 /* 1646 * Packet arrived over an encapsulated tunnel or via a PIM 1647 * register message. Both ip_mroute_decap() and pim_input() 1648 * encode information in mp->b_prev. 
		 */
		mp->b_prev = NULL;
		if (ipst->ips_ip_mrtdebug > 1) {
			if (tunnel_src != 0) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: ill %s arrived via ENCAP TUN",
				    ill->ill_name);
			} else if (pim_reg_packet) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: ill %s arrived via"
				    " REGISTER VIF",
				    ill->ill_name);
			}
		}
	} else if ((ipha->ipha_version_and_hdr_length & 0xf) <
	    (uint_t)(IP_SIMPLE_HDR_LENGTH + TUNNEL_LEN) >> 2 ||
	    ((uchar_t *)(ipha + 1))[1] != IPOPT_LSRR) {
		/* Packet arrived via a physical interface. */
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: ill %s arrived via PHYINT",
			    ill->ill_name);
		}

	} else {
		/*
		 * Packet arrived through a SRCRT tunnel.
		 * Source-route tunnels are no longer supported.
		 * Error message printed every 1000 times.
		 */
		if ((srctun++ % 1000) == 0) {
			cmn_err(CE_WARN,
			    "ip_mforward: received source-routed pkt from %x",
			    ntohl(ipha->ipha_src));
		}
		return (-1);
	}

	ipst->ips_mrtstat->mrts_fwd_in++;
	src = ipha->ipha_src;

	/* Find route in cache, return NULL if not there or upcalls q'ed. */

	/*
	 * Lock the mfctable against changes made by ip_mforward.
	 * Note that only add_mfc and del_mfc can remove entries and
	 * they run with exclusive access to IP. So we do not need to
	 * guard against the rt being deleted, so release lock after reading.
	 */

	if (is_mrouter_off(ipst))
		return (-1);

	mfcbp = &ipst->ips_mfcs[MFCHASH(src, dst)];
	MFCB_REFHOLD(mfcbp);
	MFCFIND(mfcbp, src, dst, rt);

	/* Entry exists, so forward if necessary */
	if (rt != NULL) {
		int ret = 0;
		ipst->ips_mrtstat->mrts_mfc_hits++;
		if (pim_reg_packet) {
			ASSERT(ipst->ips_reg_vif_num != ALL_VIFS);
			ret = ip_mdq(mp, ipha,
			    ipst->ips_vifs[ipst->ips_reg_vif_num].
			    v_ipif->ipif_ill,
			    0, rt);
		} else {
			ret = ip_mdq(mp, ipha, ill, tunnel_src, rt);
		}

		MFCB_REFRELE(mfcbp);
		return (ret);

		/*
		 * Don't forward if we don't have a cache entry. Mrouted will
		 * always provide a cache entry in response to an upcall.
		 */
	} else {
		/*
		 * If we don't have a route for packet's origin, make a copy
		 * of the packet and send message to routing daemon.
		 */
		struct mfc	*mfc_rt	 = NULL;
		mblk_t		*mp0	 = NULL;
		mblk_t		*mp_copy = NULL;
		struct rtdetq	*rte	 = NULL;
		struct rtdetq	*rte_m, *rte1, *prev_rte;
		uint_t		hash;
		int		npkts;
		boolean_t	new_mfc = B_FALSE;
		ipst->ips_mrtstat->mrts_mfc_misses++;
		/* BSD uses mrts_no_route++ */
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: no rte ill %s src %x g %x misses %d",
			    ill->ill_name, ntohl(src), ntohl(dst),
			    (int)ipst->ips_mrtstat->mrts_mfc_misses);
		}
		/*
		 * The order of the following code differs from the BSD code.
		 * Pre-mc3.5, the BSD code was incorrect and SunOS 5.x
		 * code works, so SunOS 5.x wasn't changed to conform to the
		 * BSD version.
		 */

		/*
		 * Lock mfctable.  Note: hash == MFCHASH(src, dst), so
		 * ips_mfcs[hash] is the same bucket mfcbp points at.
		 */
		hash = MFCHASH(src, dst);
		mutex_enter(&(ipst->ips_mfcs[hash].mfcb_lock));

		/*
		 * If we are turning off mrouted return an error
		 */
		if (is_mrouter_off(ipst)) {
			mutex_exit(&mfcbp->mfcb_lock);
			MFCB_REFRELE(mfcbp);
			return (-1);
		}

		/* Is there an upcall waiting for this packet? */
		for (mfc_rt = ipst->ips_mfcs[hash].mfcb_mfc; mfc_rt;
		    mfc_rt = mfc_rt->mfc_next) {
			mutex_enter(&mfc_rt->mfc_mutex);
			if (ipst->ips_ip_mrtdebug > 1) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: MFCTAB hash %d o 0x%x"
				    " g 0x%x\n",
				    hash, ntohl(mfc_rt->mfc_origin.s_addr),
				    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
			}
			/* There is an upcall */
			if ((src == mfc_rt->mfc_origin.s_addr) &&
			    (dst == mfc_rt->mfc_mcastgrp.s_addr) &&
			    (mfc_rt->mfc_rte != NULL) &&
			    !(mfc_rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
				break;
			}
			mutex_exit(&mfc_rt->mfc_mutex);
		}
		/* No upcall, so make a new entry into mfctable */
		if (mfc_rt == NULL) {
			mfc_rt = (struct mfc *)mi_zalloc(sizeof (struct mfc));
			if (mfc_rt == NULL) {
				ipst->ips_mrtstat->mrts_fwd_drop++;
				ip1dbg(("ip_mforward: out of memory "
				    "for mfc, mfc_rt\n"));
				goto error_return;
			} else
				new_mfc = B_TRUE;
			/* Get resources */
			/* TODO could copy header and dup rest */
			mp_copy = copymsg(mp);
			if (mp_copy == NULL) {
				ipst->ips_mrtstat->mrts_fwd_drop++;
				ip1dbg(("ip_mforward: out of memory for "
				    "mblk, mp_copy\n"));
				goto error_return;
			}
			mutex_enter(&mfc_rt->mfc_mutex);
		}
		/* Get resources for rte, whether first rte or not first. */
		/* Add this packet into rtdetq */
		rte = (struct rtdetq *)mi_zalloc(sizeof (struct rtdetq));
		if (rte == NULL) {
			ipst->ips_mrtstat->mrts_fwd_drop++;
			mutex_exit(&mfc_rt->mfc_mutex);
			ip1dbg(("ip_mforward: out of memory for"
			    " rtdetq, rte\n"));
			goto error_return;
		}

		mp0 = copymsg(mp);
		if (mp0 == NULL) {
			ipst->ips_mrtstat->mrts_fwd_drop++;
			ip1dbg(("ip_mforward: out of memory for mblk, mp0\n"));
			mutex_exit(&mfc_rt->mfc_mutex);
			goto error_return;
		}
		rte->mp		= mp0;
		if (pim_reg_packet) {
			ASSERT(ipst->ips_reg_vif_num != ALL_VIFS);
			rte->ill =
			    ipst->ips_vifs[ipst->ips_reg_vif_num].
			    v_ipif->ipif_ill;
		} else {
			rte->ill = ill;
		}
		rte->rte_next	= NULL;

		/*
		 * Determine if upcall q (rtdetq) has overflowed.
		 * mfc_rt->mfc_rte is null by mi_zalloc
		 * if it is the first message.
		 */
		for (rte_m = mfc_rt->mfc_rte, npkts = 0; rte_m;
		    rte_m = rte_m->rte_next)
			npkts++;
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: upcalls %d\n", npkts);
		}
		if (npkts > MAX_UPQ) {
			ipst->ips_mrtstat->mrts_upq_ovflw++;
			mutex_exit(&mfc_rt->mfc_mutex);
			goto error_return;
		}

		if (npkts == 0) {	/* first upcall */
			int i = 0;
			/*
			 * Now finish installing the new mfc! Now that we have
			 * resources!  Insert new entry at head of hash chain.
			 * Use src and dst which are ipaddr_t's.
			 */
			mfc_rt->mfc_origin.s_addr = src;
			mfc_rt->mfc_mcastgrp.s_addr = dst;

			mutex_enter(&ipst->ips_numvifs_mutex);
			for (i = 0; i < (int)ipst->ips_numvifs; i++)
				mfc_rt->mfc_ttls[i] = 0;
			mutex_exit(&ipst->ips_numvifs_mutex);
			mfc_rt->mfc_parent = ALL_VIFS;

			/* Link into table */
			if (ipst->ips_ip_mrtdebug > 1) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: NEW MFCTAB hash %d o 0x%x "
				    "g 0x%x\n", hash,
				    ntohl(mfc_rt->mfc_origin.s_addr),
				    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
			}
			mfc_rt->mfc_next = ipst->ips_mfcs[hash].mfcb_mfc;
			ipst->ips_mfcs[hash].mfcb_mfc = mfc_rt;
			mfc_rt->mfc_rte = NULL;
		}

		/* Link in the upcall */
		/* First upcall */
		if (mfc_rt->mfc_rte == NULL)
			mfc_rt->mfc_rte = rte;
		else {
			/* not the first upcall */
			prev_rte = mfc_rt->mfc_rte;
			for (rte1 = mfc_rt->mfc_rte->rte_next; rte1;
			    prev_rte = rte1, rte1 = rte1->rte_next)
				;
			prev_rte->rte_next = rte;
		}

		/*
		 * No upcalls waiting, this is first one, so send a message to
		 * routing daemon to install a route into kernel table.
		 */
		if (npkts == 0) {
			struct igmpmsg	*im;
			/* ipha_protocol is 0, for upcall */
			ASSERT(mp_copy != NULL);
			im = (struct igmpmsg *)mp_copy->b_rptr;
			im->im_msgtype	= IGMPMSG_NOCACHE;
			im->im_mbz = 0;
			mutex_enter(&ipst->ips_numvifs_mutex);
			if (pim_reg_packet) {
				im->im_vif = (uchar_t)ipst->ips_reg_vif_num;
				mutex_exit(&ipst->ips_numvifs_mutex);
			} else {
				/*
				 * XXX do we need to hold locks here ?
				 */
				for (vifi = 0;
				    vifi < ipst->ips_numvifs;
				    vifi++) {
					if (ipst->ips_vifs[vifi].v_ipif == NULL)
						continue;
					if (ipst->ips_vifs[vifi].
					    v_ipif->ipif_ill == ill) {
						im->im_vif = (uchar_t)vifi;
						break;
					}
				}
				mutex_exit(&ipst->ips_numvifs_mutex);
				/*
				 * NOTE(review): if no vif matches this ill,
				 * im_vif is left unset and this ASSERT is the
				 * only guard (DEBUG only) — confirm a match is
				 * always present here.
				 */
				ASSERT(vifi < ipst->ips_numvifs);
			}

			ipst->ips_mrtstat->mrts_upcalls++;
			/* Timer to discard upcalls if mrouted is too slow */
			mfc_rt->mfc_timeout_id = timeout(expire_upcalls,
			    mfc_rt, EXPIRE_TIMEOUT * UPCALL_EXPIRE);
			mutex_exit(&mfc_rt->mfc_mutex);
			mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
			/* Pass to RAWIP */
			(mrouter->conn_recv)(mrouter, mp_copy, NULL);
		} else {
			mutex_exit(&mfc_rt->mfc_mutex);
			mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
			freemsg(mp_copy);
		}

		MFCB_REFRELE(mfcbp);
		if (tunnel_src != 0)
			return (1);
		else
			return (0);
	error_return:
		mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
		MFCB_REFRELE(mfcbp);
		if (mfc_rt != NULL && (new_mfc == B_TRUE))
			mi_free((char *)mfc_rt);
		if (rte != NULL)
			mi_free((char *)rte);
		if (mp_copy != NULL)
			freemsg(mp_copy);
		if (mp0 != NULL)
			freemsg(mp0);
		return (-1);
	}
}

/*
 * Clean up the mfctable cache entry if upcall is not serviced.
 * SunOS 5.x has timeout per mfc, unlike BSD which has one timer.
1973 */ 1974 static void 1975 expire_upcalls(void *arg) 1976 { 1977 struct mfc *mfc_rt = arg; 1978 uint_t hash; 1979 struct mfc *prev_mfc, *mfc0; 1980 ip_stack_t *ipst; 1981 conn_t *mrouter; 1982 1983 if (mfc_rt->mfc_rte == NULL || mfc_rt->mfc_rte->ill != NULL) { 1984 cmn_err(CE_WARN, "expire_upcalls: no ILL\n"); 1985 return; 1986 } 1987 ipst = mfc_rt->mfc_rte->ill->ill_ipst; 1988 mrouter = ipst->ips_ip_g_mrouter; 1989 1990 hash = MFCHASH(mfc_rt->mfc_origin.s_addr, mfc_rt->mfc_mcastgrp.s_addr); 1991 if (ipst->ips_ip_mrtdebug > 1) { 1992 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 1993 "expire_upcalls: hash %d s %x g %x", 1994 hash, ntohl(mfc_rt->mfc_origin.s_addr), 1995 ntohl(mfc_rt->mfc_mcastgrp.s_addr)); 1996 } 1997 MFCB_REFHOLD(&ipst->ips_mfcs[hash]); 1998 mutex_enter(&mfc_rt->mfc_mutex); 1999 /* 2000 * if timeout has been set to zero, than the 2001 * entry has been filled, no need to delete it. 2002 */ 2003 if (mfc_rt->mfc_timeout_id == 0) 2004 goto done; 2005 ipst->ips_mrtstat->mrts_cache_cleanups++; 2006 mfc_rt->mfc_timeout_id = 0; 2007 2008 /* Determine entry to be cleaned up in cache table. */ 2009 for (prev_mfc = mfc0 = ipst->ips_mfcs[hash].mfcb_mfc; mfc0; 2010 prev_mfc = mfc0, mfc0 = mfc0->mfc_next) 2011 if (mfc0 == mfc_rt) 2012 break; 2013 2014 /* del_mfc takes care of gone mfcs */ 2015 ASSERT(prev_mfc != NULL); 2016 ASSERT(mfc0 != NULL); 2017 2018 /* 2019 * Delete the entry from the cache 2020 */ 2021 ipst->ips_mfcs[hash].mfcb_marks |= MFCB_MARK_CONDEMNED; 2022 mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED; 2023 2024 /* 2025 * release_mfc will drop all queued upcall packets. 2026 * and will free the mbuf with the pkt, if, timing info. 2027 */ 2028 done: 2029 mutex_exit(&mfc_rt->mfc_mutex); 2030 MFCB_REFRELE(&ipst->ips_mfcs[hash]); 2031 } 2032 2033 /* 2034 * Packet forwarding routine once entry in the cache is made. 
 */
static int
ip_mdq(mblk_t *mp, ipha_t *ipha, ill_t *ill, ipaddr_t tunnel_src,
    struct mfc *rt)
{
	vifi_t	vifi;
	struct vif	*vifp;
	ipaddr_t	dst = ipha->ipha_dst;
	size_t	plen = msgdsize(mp);
	vifi_t	num_of_vifs;
	ip_stack_t	*ipst = ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "ip_mdq: SEND src %x, ipha_dst %x, ill %s",
		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
		    ill->ill_name);
	}

	/* Macro to send packet on vif */
#define	MC_SEND(ipha, mp, vifp, dst) { \
	if ((vifp)->v_flags & VIFF_TUNNEL) \
		encap_send((ipha), (mp), (vifp), (dst)); \
	else if ((vifp)->v_flags & VIFF_REGISTER) \
		register_send((ipha), (mp), (vifp), (dst)); \
	else \
		phyint_send((ipha), (mp), (vifp), (dst)); \
}

	vifi = rt->mfc_parent;

	/*
	 * The value of vifi is MAXVIFS if the pkt had no parent, i.e.,
	 * Mrouted had no route.
	 * We wanted the route installed in the mfctable to prevent multiple
	 * tries, so it passed add_mfc(), but is discarded here. The v_ipif is
	 * NULL so we don't want to check the ill. Still needed as of Mrouted
	 * 3.6.
	 */
	if (vifi == NO_VIF) {
		ip1dbg(("ip_mdq: no route for origin ill %s, vifi is NO_VIF\n",
		    ill->ill_name));
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mdq: vifi is NO_VIF ill = %s", ill->ill_name);
		}
		return (-1);	/* drop pkt */
	}

	if (!lock_good_vif(&ipst->ips_vifs[vifi]))
		return (-1);
	/*
	 * The MFC entries are not cleaned up when an ipif goes
	 * away thus this code has to guard against an MFC referencing
	 * an ipif that has been closed. Note: reset_mrt_vif_ipif
	 * sets the v_ipif to NULL when the ipif disappears.
	 */
	ASSERT(ipst->ips_vifs[vifi].v_ipif != NULL);

	if (vifi >= ipst->ips_numvifs) {
		cmn_err(CE_WARN, "ip_mdq: illegal vifi %d numvifs "
		    "%d ill %s viftable ill %s\n",
		    (int)vifi, (int)ipst->ips_numvifs, ill->ill_name,
		    ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name);
		unlock_good_vif(&ipst->ips_vifs[vifi]);
		return (-1);
	}
	/*
	 * Don't forward if it didn't arrive from the parent vif for its
	 * origin. But do match on the groups as we nominate only one
	 * ill in the group for receiving allmulti packets.
	 */
	if ((ipst->ips_vifs[vifi].v_ipif->ipif_ill != ill &&
	    (ill->ill_group == NULL ||
	    ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_group !=
	    ill->ill_group)) ||
	    (ipst->ips_vifs[vifi].v_rmt_addr.s_addr != tunnel_src)) {
		/* Came in the wrong interface */
		ip1dbg(("ip_mdq: arrived wrong if, vifi %d "
		    "numvifs %d ill %s viftable ill %s\n",
		    (int)vifi, (int)ipst->ips_numvifs, ill->ill_name,
		    ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name));
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mdq: arrived wrong if, vifi %d ill "
			    "%s viftable ill %s\n",
			    (int)vifi, ill->ill_name,
			    ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name);
		}
		ipst->ips_mrtstat->mrts_wrong_if++;
		rt->mfc_wrong_if++;

		/*
		 * If we are doing PIM assert processing and we are forwarding
		 * packets on this interface, and it is a broadcast medium
		 * interface (and not a tunnel), send a message to the routing.
		 *
		 * We use the first ipif on the list, since it's all we have.
		 * Chances are the ipif_flags are the same for ipifs on the ill.
		 */
		if (ipst->ips_pim_assert && rt->mfc_ttls[vifi] > 0 &&
		    (ill->ill_ipif->ipif_flags & IPIF_BROADCAST) &&
		    !(ipst->ips_vifs[vifi].v_flags & VIFF_TUNNEL)) {
			mblk_t *mp_copy;
			struct igmpmsg *im;

			/* TODO could copy header and dup rest */
			mp_copy = copymsg(mp);
			if (mp_copy == NULL) {
				ipst->ips_mrtstat->mrts_fwd_drop++;
				ip1dbg(("ip_mdq: out of memory "
				    "for mblk, mp_copy\n"));
				unlock_good_vif(&ipst->ips_vifs[vifi]);
				return (-1);
			}

			/* Notify mrouted of the wrong-vif arrival */
			im = (struct igmpmsg *)mp_copy->b_rptr;
			im->im_msgtype = IGMPMSG_WRONGVIF;
			im->im_mbz = 0;
			im->im_vif = (ushort_t)vifi;
			/* Pass to RAWIP */
			(mrouter->conn_recv)(mrouter, mp_copy, NULL);
		}
		unlock_good_vif(&ipst->ips_vifs[vifi]);
		if (tunnel_src != 0)
			return (1);
		else
			return (0);
	}
	/*
	 * If I sourced this packet, it counts as output, else it was input.
	 */
	if (ipha->ipha_src == ipst->ips_vifs[vifi].v_lcl_addr.s_addr) {
		ipst->ips_vifs[vifi].v_pkt_out++;
		ipst->ips_vifs[vifi].v_bytes_out += plen;
	} else {
		ipst->ips_vifs[vifi].v_pkt_in++;
		ipst->ips_vifs[vifi].v_bytes_in += plen;
	}
	mutex_enter(&rt->mfc_mutex);
	rt->mfc_pkt_cnt++;
	rt->mfc_byte_cnt += plen;
	mutex_exit(&rt->mfc_mutex);
	unlock_good_vif(&ipst->ips_vifs[vifi]);
	/*
	 * For each vif, decide if a copy of the packet should be forwarded.
	 * Forward if:
	 *	- the vif threshold ttl is non-zero AND
	 *	- the pkt ttl exceeds the vif's threshold
	 * A non-zero mfc_ttl indicates that the vif is part of
	 * the output set for the mfc entry.
	 */
	mutex_enter(&ipst->ips_numvifs_mutex);
	num_of_vifs = ipst->ips_numvifs;
	mutex_exit(&ipst->ips_numvifs_mutex);
	for (vifp = ipst->ips_vifs, vifi = 0;
	    vifi < num_of_vifs;
	    vifp++, vifi++) {
		if (!lock_good_vif(vifp))
			continue;
		if ((rt->mfc_ttls[vifi] > 0) &&
		    (ipha->ipha_ttl > rt->mfc_ttls[vifi])) {
			/*
			 * lock_good_vif should not have succeeded if
			 * v_ipif is null.
			 */
			ASSERT(vifp->v_ipif != NULL);
			vifp->v_pkt_out++;
			vifp->v_bytes_out += plen;
			MC_SEND(ipha, mp, vifp, dst);
			ipst->ips_mrtstat->mrts_fwd_out++;
		}
		unlock_good_vif(vifp);
	}
	if (tunnel_src != 0)
		return (1);
	else
		return (0);
}

/*
 * Send the packet on physical interface.
 * Caller assumes can continue to use mp on return.
 */
/* ARGSUSED */
static void
phyint_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
{
	mblk_t	*mp_copy;
	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	/* Make a new reference to the packet */
	mp_copy = copymsg(mp);	/* TODO could copy header and dup rest */
	if (mp_copy == NULL) {
		ipst->ips_mrtstat->mrts_fwd_drop++;
		ip1dbg(("phyint_send: out of memory for mblk, mp_copy\n"));
		return;
	}
	/* No rate limit configured: send directly, else token-bucket it. */
	if (vifp->v_rate_limit <= 0)
		tbf_send_packet(vifp, mp_copy);
	else  {
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "phyint_send: tbf_contr rate %d "
			    "vifp 0x%p mp 0x%p dst 0x%x",
			    vifp->v_rate_limit, (void *)vifp, (void *)mp, dst);
		}
		tbf_control(vifp, mp_copy, (ipha_t *)mp_copy->b_rptr);
	}
}

/*
 * Send the whole packet for REGISTER encapsulation to PIM daemon
 * Caller assumes it can continue to use mp on return.
 */
/* ARGSUSED */
static void
register_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
{
	struct igmpmsg	*im;
	mblk_t		*mp_copy;
	ipha_t		*ipha_copy;
	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "register_send: src %x, dst %x\n",
		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst));
	}

	/*
	 * Copy the old packet & pullup its IP header into the new mblk_t so we
	 * can modify it. Try to fill the new mblk_t since if we don't the
	 * ethernet driver will.
	 */
	mp_copy = allocb(sizeof (struct igmpmsg) + sizeof (ipha_t), BPRI_MED);
	if (mp_copy == NULL) {
		++ipst->ips_mrtstat->mrts_pim_nomemory;
		if (ipst->ips_ip_mrtdebug > 3) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "register_send: allocb failure.");
		}
		return;
	}

	/*
	 * Bump write pointer to account for igmpmsg being added.
	 */
	mp_copy->b_wptr = mp_copy->b_rptr + sizeof (struct igmpmsg);

	/*
	 * Chain packet to new mblk_t.  The original packet (full IP
	 * datagram) becomes the payload of the upcall message.
	 */
	if ((mp_copy->b_cont = copymsg(mp)) == NULL) {
		++ipst->ips_mrtstat->mrts_pim_nomemory;
		if (ipst->ips_ip_mrtdebug > 3) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "register_send: copymsg failure.");
		}
		freeb(mp_copy);
		return;
	}

	/*
	 * icmp_input() asserts that IP version field is set to an
	 * appropriate version. Hence, the struct igmpmsg that this really
	 * becomes, needs to have the correct IP version field.
	 */
	ipha_copy = (ipha_t *)mp_copy->b_rptr;
	*ipha_copy = multicast_encap_iphdr;

	/*
	 * The kernel uses the struct igmpmsg header to encode the messages to
	 * the multicast routing daemon. Fill in the fields in the header
	 * starting with the message type which is IGMPMSG_WHOLEPKT
	 */
	im = (struct igmpmsg *)mp_copy->b_rptr;
	im->im_msgtype = IGMPMSG_WHOLEPKT;
	im->im_src.s_addr = ipha->ipha_src;
	im->im_dst.s_addr = ipha->ipha_dst;

	/*
	 * Must Be Zero. This is because the struct igmpmsg is really an IP
	 * header with renamed fields and the multicast routing daemon uses
	 * an ipha_protocol (aka im_mbz) of 0 to distinguish these messages.
	 */
	im->im_mbz = 0;

	++ipst->ips_mrtstat->mrts_upcalls;
	/* Drop (don't block) if the daemon's stream is flow-controlled. */
	if (!canputnext(mrouter->conn_rq)) {
		++ipst->ips_mrtstat->mrts_pim_regsend_drops;
		if (ipst->ips_ip_mrtdebug > 3) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "register_send: register upcall failure.");
		}
		freemsg(mp_copy);
	} else {
		/* Pass to RAWIP */
		(mrouter->conn_recv)(mrouter, mp_copy, NULL);
	}
}

/*
 * pim_validate_cksum handles verification of the checksum in the
 * pim header.  For PIM Register packets, the checksum is calculated
 * across the PIM header only.  For all other packets, the checksum
 * is for the PIM header and remainder of the packet.
 *
 * The message is duplicated (dupmsg) so the read/write pointers of the
 * original are never disturbed.
 *
 * returns: B_TRUE, if checksum is okay.
 *          B_FALSE, if checksum is not valid.
 */
static boolean_t
pim_validate_cksum(mblk_t *mp, ipha_t *ip, struct pim *pimp)
{
	mblk_t	*mp_dup;

	if ((mp_dup = dupmsg(mp)) == NULL)
		return (B_FALSE);

	/* Skip the IP header; checksum coverage starts at the PIM header. */
	mp_dup->b_rptr += IPH_HDR_LENGTH(ip);
	if (pimp->pim_type == PIM_REGISTER)
		mp_dup->b_wptr = mp_dup->b_rptr + PIM_MINLEN;
	/* IP_CSUM returns non-zero when the checksum does not verify. */
	if (IP_CSUM(mp_dup, 0, 0)) {
		freemsg(mp_dup);
		return (B_FALSE);
	}
	freemsg(mp_dup);
	return (B_TRUE);
}

/*
 * int
 * pim_input(queue_t *, mblk_t *, ill_t *ill) - Process PIM protocol packets.
 *	IP Protocol 103. Register messages are decapsulated and sent
 *	onto multicast forwarding.
 */
int
pim_input(queue_t *q, mblk_t *mp, ill_t *ill)
{
	ipha_t		*eip, *ip;
	int		iplen, pimlen, iphlen;
	struct pim	*pimp;		/* pointer to a pim struct */
	uint32_t	*reghdr;
	ip_stack_t	*ipst = ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	/*
	 * Pullup the msg for PIM protocol processing.
	 * On any error below the message is freed and -1 returned;
	 * on success 0 is returned and the caller still owns mp (valid
	 * PIM packets are passed up to raw-PIM listeners after return).
	 */
	if (pullupmsg(mp, -1) == 0) {
		++ipst->ips_mrtstat->mrts_pim_nomemory;
		freemsg(mp);
		return (-1);
	}

	ip = (ipha_t *)mp->b_rptr;
	iplen = ip->ipha_length;
	iphlen = IPH_HDR_LENGTH(ip);
	pimlen = ntohs(iplen) - iphlen;

	/*
	 * Validate lengths
	 */
	if (pimlen < PIM_MINLEN) {
		++ipst->ips_mrtstat->mrts_pim_malformed;
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "pim_input: length not at least minlen");
		}
		freemsg(mp);
		return (-1);
	}

	/*
	 * Point to the PIM header.
	 */
	pimp = (struct pim *)((caddr_t)ip + iphlen);

	/*
	 * Check the version number.
	 */
	if (pimp->pim_vers != PIM_VERSION) {
		++ipst->ips_mrtstat->mrts_pim_badversion;
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "pim_input: unknown version of PIM");
		}
		freemsg(mp);
		return (-1);
	}

	/*
	 * Validate the checksum
	 */
	if (!pim_validate_cksum(mp, ip, pimp)) {
		++ipst->ips_mrtstat->mrts_pim_rcv_badcsum;
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "pim_input: invalid checksum");
		}
		freemsg(mp);
		return (-1);
	}

	/* Only Register messages get kernel processing; rest go to daemon. */
	if (pimp->pim_type != PIM_REGISTER)
		return (0);

	/* Register header (flags word) follows the PIM header ... */
	reghdr = (uint32_t *)(pimp + 1);
	/* ... and the encapsulated inner IP header follows that. */
	eip = (ipha_t *)(reghdr + 1);

	/*
	 * check if the inner packet is destined to mcast group
	 */
	if (!CLASSD(eip->ipha_dst)) {
		++ipst->ips_mrtstat->mrts_pim_badregisters;
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "pim_input: Inner pkt not mcast .. !");
		}
		freemsg(mp);
		return (-1);
	}
	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "register from %x, to %x, len %d",
		    ntohl(eip->ipha_src),
		    ntohl(eip->ipha_dst),
		    ntohs(eip->ipha_length));
	}
	/*
	 * If the null register bit is not set, decapsulate
	 * the packet before forwarding it.
	 */
	if (!(ntohl(*reghdr) & PIM_NULL_REGISTER)) {
		mblk_t *mp_copy;

		/* Copy the message; register_mforward consumes the copy. */
		if ((mp_copy = copymsg(mp)) == NULL) {
			++ipst->ips_mrtstat->mrts_pim_nomemory;
			freemsg(mp);
			return (-1);
		}

		/*
		 * Decapsulate the packet and give it to
		 * register_mforward.
		 */
		mp_copy->b_rptr += iphlen + sizeof (pim_t) +
		    sizeof (*reghdr);
		if (register_mforward(q, mp_copy, ill) != 0) {
			freemsg(mp);
			return (-1);
		}
	}

	/*
	 * Pass all valid PIM packets up to any process(es) listening on a raw
	 * PIM socket. For Solaris it is done right after pim_input() is
	 * called.
	 */
	return (0);
}

/*
 * PIM sparse mode hook.  Called by pim_input after decapsulating
 * the packet. Loop back the packet, as if we have received it.
 * In pim_input() we have to check if the destination is a multicast address.
 */
/* ARGSUSED */
static int
register_mforward(queue_t *q, mblk_t *mp, ill_t *ill)
{
	ip_stack_t	*ipst = ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	ASSERT(ipst->ips_reg_vif_num <= ipst->ips_numvifs);

	if (ipst->ips_ip_mrtdebug > 3) {
		ipha_t *ipha;

		ipha = (ipha_t *)mp->b_rptr;
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "register_mforward: src %x, dst %x\n",
		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst));
	}
	/*
	 * Need to pass in to ip_mforward() the information that the
	 * packet has arrived on the register_vif. We use the solution that
	 * ip_mroute_decap() employs: use mp->b_prev to pass some information
	 * to ip_mforward(). Nonzero value means the packet has arrived on a
	 * tunnel (ip_mroute_decap() puts the address of the other side of the
	 * tunnel there.) This is safe since ip_rput() either frees the packet
	 * or passes it to ip_mforward(). We use
	 * PIM_REGISTER_MARKER = 0xffffffff to indicate the packet has arrived
	 * on the register vif. If in the future we have more than one
	 * register vifs, then this will need re-examination.
2534 */ 2535 mp->b_prev = (mblk_t *)PIM_REGISTER_MARKER; 2536 ++ipst->ips_mrtstat->mrts_pim_regforwards; 2537 ip_rput(q, mp); 2538 return (0); 2539 } 2540 2541 /* 2542 * Send an encapsulated packet. 2543 * Caller assumes can continue to use mp when routine returns. 2544 */ 2545 /* ARGSUSED */ 2546 static void 2547 encap_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst) 2548 { 2549 mblk_t *mp_copy; 2550 ipha_t *ipha_copy; 2551 size_t len; 2552 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 2553 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2554 2555 if (ipst->ips_ip_mrtdebug > 1) { 2556 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2557 "encap_send: vif %ld enter", 2558 (ptrdiff_t)(vifp - ipst->ips_vifs)); 2559 } 2560 len = ntohs(ipha->ipha_length); 2561 2562 /* 2563 * Copy the old packet & pullup it's IP header into the 2564 * new mbuf so we can modify it. Try to fill the new 2565 * mbuf since if we don't the ethernet driver will. 2566 */ 2567 mp_copy = allocb(32 + sizeof (multicast_encap_iphdr), BPRI_MED); 2568 if (mp_copy == NULL) 2569 return; 2570 mp_copy->b_rptr += 32; 2571 mp_copy->b_wptr = mp_copy->b_rptr + sizeof (multicast_encap_iphdr); 2572 if ((mp_copy->b_cont = copymsg(mp)) == NULL) { 2573 freeb(mp_copy); 2574 return; 2575 } 2576 2577 /* 2578 * Fill in the encapsulating IP header. 2579 * Remote tunnel dst in rmt_addr, from add_vif(). 2580 */ 2581 ipha_copy = (ipha_t *)mp_copy->b_rptr; 2582 *ipha_copy = multicast_encap_iphdr; 2583 ASSERT((len + sizeof (ipha_t)) <= IP_MAXPACKET); 2584 ipha_copy->ipha_length = htons(len + sizeof (ipha_t)); 2585 ipha_copy->ipha_src = vifp->v_lcl_addr.s_addr; 2586 ipha_copy->ipha_dst = vifp->v_rmt_addr.s_addr; 2587 ASSERT(ipha_copy->ipha_ident == 0); 2588 2589 /* Turn the encapsulated IP header back into a valid one. 
*/ 2590 ipha = (ipha_t *)mp_copy->b_cont->b_rptr; 2591 ipha->ipha_ttl--; 2592 ipha->ipha_hdr_checksum = 0; 2593 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 2594 2595 if (ipst->ips_ip_mrtdebug > 1) { 2596 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2597 "encap_send: group 0x%x", ntohl(ipha->ipha_dst)); 2598 } 2599 if (vifp->v_rate_limit <= 0) 2600 tbf_send_packet(vifp, mp_copy); 2601 else 2602 /* ipha is from the original header */ 2603 tbf_control(vifp, mp_copy, ipha); 2604 } 2605 2606 /* 2607 * De-encapsulate a packet and feed it back through IP input. 2608 * This routine is called whenever IP gets a packet with prototype 2609 * IPPROTO_ENCAP and a local destination address. 2610 */ 2611 void 2612 ip_mroute_decap(queue_t *q, mblk_t *mp, ill_t *ill) 2613 { 2614 ipha_t *ipha = (ipha_t *)mp->b_rptr; 2615 ipha_t *ipha_encap; 2616 int hlen = IPH_HDR_LENGTH(ipha); 2617 ipaddr_t src; 2618 struct vif *vifp; 2619 ip_stack_t *ipst = ill->ill_ipst; 2620 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2621 2622 /* 2623 * Dump the packet if it's not to a multicast destination or if 2624 * we don't have an encapsulating tunnel with the source. 2625 * Note: This code assumes that the remote site IP address 2626 * uniquely identifies the tunnel (i.e., that this site has 2627 * at most one tunnel with the remote site). 
2628 */ 2629 ipha_encap = (ipha_t *)((char *)ipha + hlen); 2630 if (!CLASSD(ipha_encap->ipha_dst)) { 2631 ipst->ips_mrtstat->mrts_bad_tunnel++; 2632 ip1dbg(("ip_mroute_decap: bad tunnel\n")); 2633 freemsg(mp); 2634 return; 2635 } 2636 src = (ipaddr_t)ipha->ipha_src; 2637 mutex_enter(&ipst->ips_last_encap_lock); 2638 if (src != ipst->ips_last_encap_src) { 2639 struct vif *vife; 2640 2641 vifp = ipst->ips_vifs; 2642 vife = vifp + ipst->ips_numvifs; 2643 ipst->ips_last_encap_src = src; 2644 ipst->ips_last_encap_vif = 0; 2645 for (; vifp < vife; ++vifp) { 2646 if (!lock_good_vif(vifp)) 2647 continue; 2648 if (vifp->v_rmt_addr.s_addr == src) { 2649 if (vifp->v_flags & VIFF_TUNNEL) 2650 ipst->ips_last_encap_vif = vifp; 2651 if (ipst->ips_ip_mrtdebug > 1) { 2652 (void) mi_strlog(mrouter->conn_rq, 2653 1, SL_TRACE, 2654 "ip_mroute_decap: good tun " 2655 "vif %ld with %x", 2656 (ptrdiff_t)(vifp - ipst->ips_vifs), 2657 ntohl(src)); 2658 } 2659 unlock_good_vif(vifp); 2660 break; 2661 } 2662 unlock_good_vif(vifp); 2663 } 2664 } 2665 if ((vifp = ipst->ips_last_encap_vif) == 0) { 2666 mutex_exit(&ipst->ips_last_encap_lock); 2667 ipst->ips_mrtstat->mrts_bad_tunnel++; 2668 freemsg(mp); 2669 ip1dbg(("ip_mroute_decap: vif %ld no tunnel with %x\n", 2670 (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(src))); 2671 return; 2672 } 2673 mutex_exit(&ipst->ips_last_encap_lock); 2674 2675 /* 2676 * Need to pass in the tunnel source to ip_mforward (so that it can 2677 * verify that the packet arrived over the correct vif.) We use b_prev 2678 * to pass this information. This is safe since the ip_rput either 2679 * frees the packet or passes it to ip_mforward. 2680 */ 2681 mp->b_prev = (mblk_t *)(uintptr_t)src; 2682 mp->b_rptr += hlen; 2683 /* Feed back into ip_rput as an M_DATA. */ 2684 ip_rput(q, mp); 2685 } 2686 2687 /* 2688 * Remove all records with v_ipif == ipif. Called when an interface goes away 2689 * (stream closed). Called as writer. 
2690 */ 2691 void 2692 reset_mrt_vif_ipif(ipif_t *ipif) 2693 { 2694 vifi_t vifi, tmp_vifi; 2695 vifi_t num_of_vifs; 2696 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 2697 2698 /* Can't check vifi >= 0 since vifi_t is unsigned! */ 2699 2700 mutex_enter(&ipst->ips_numvifs_mutex); 2701 num_of_vifs = ipst->ips_numvifs; 2702 mutex_exit(&ipst->ips_numvifs_mutex); 2703 2704 for (vifi = num_of_vifs; vifi != 0; vifi--) { 2705 tmp_vifi = vifi - 1; 2706 if (ipst->ips_vifs[tmp_vifi].v_ipif == ipif) { 2707 (void) del_vif(&tmp_vifi, NULL, NULL, ipst); 2708 } 2709 } 2710 } 2711 2712 /* Remove pending upcall msgs when ill goes away. Called by ill_delete. */ 2713 void 2714 reset_mrt_ill(ill_t *ill) 2715 { 2716 struct mfc *rt; 2717 struct rtdetq *rte; 2718 int i; 2719 ip_stack_t *ipst = ill->ill_ipst; 2720 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2721 2722 for (i = 0; i < MFCTBLSIZ; i++) { 2723 MFCB_REFHOLD(&ipst->ips_mfcs[i]); 2724 if ((rt = ipst->ips_mfcs[i].mfcb_mfc) != NULL) { 2725 if (ipst->ips_ip_mrtdebug > 1) { 2726 (void) mi_strlog(mrouter->conn_rq, 1, 2727 SL_TRACE, 2728 "reset_mrt_ill: mfctable [%d]", i); 2729 } 2730 while (rt != NULL) { 2731 mutex_enter(&rt->mfc_mutex); 2732 while ((rte = rt->mfc_rte) != NULL) { 2733 if (rte->ill == ill) { 2734 if (ipst->ips_ip_mrtdebug > 1) { 2735 (void) mi_strlog( 2736 mrouter->conn_rq, 2737 1, SL_TRACE, 2738 "reset_mrt_ill: " 2739 "ill 0x%p", (void *)ill); 2740 } 2741 rt->mfc_rte = rte->rte_next; 2742 freemsg(rte->mp); 2743 mi_free((char *)rte); 2744 } 2745 } 2746 mutex_exit(&rt->mfc_mutex); 2747 rt = rt->mfc_next; 2748 } 2749 } 2750 MFCB_REFRELE(&ipst->ips_mfcs[i]); 2751 } 2752 } 2753 2754 /* 2755 * Token bucket filter module. 2756 * The ipha is for mcastgrp destination for phyint and encap. 
2757 */ 2758 static void 2759 tbf_control(struct vif *vifp, mblk_t *mp, ipha_t *ipha) 2760 { 2761 size_t p_len = msgdsize(mp); 2762 struct tbf *t = vifp->v_tbf; 2763 timeout_id_t id = 0; 2764 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 2765 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2766 2767 /* Drop if packet is too large */ 2768 if (p_len > MAX_BKT_SIZE) { 2769 ipst->ips_mrtstat->mrts_pkt2large++; 2770 freemsg(mp); 2771 return; 2772 } 2773 if (ipst->ips_ip_mrtdebug > 1) { 2774 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2775 "tbf_ctrl: SEND vif %ld, qlen %d, ipha_dst 0x%x", 2776 (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_q_len, 2777 ntohl(ipha->ipha_dst)); 2778 } 2779 2780 mutex_enter(&t->tbf_lock); 2781 2782 tbf_update_tokens(vifp); 2783 2784 /* 2785 * If there are enough tokens, 2786 * and the queue is empty, send this packet out. 2787 */ 2788 if (ipst->ips_ip_mrtdebug > 1) { 2789 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2790 "tbf_control: vif %ld, TOKENS %d, pkt len %lu, qlen %d", 2791 (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_n_tok, p_len, 2792 t->tbf_q_len); 2793 } 2794 /* No packets are queued */ 2795 if (t->tbf_q_len == 0) { 2796 /* queue empty, send packet if enough tokens */ 2797 if (p_len <= t->tbf_n_tok) { 2798 t->tbf_n_tok -= p_len; 2799 mutex_exit(&t->tbf_lock); 2800 tbf_send_packet(vifp, mp); 2801 return; 2802 } else { 2803 /* Queue packet and timeout till later */ 2804 tbf_queue(vifp, mp); 2805 ASSERT(vifp->v_timeout_id == 0); 2806 vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp, 2807 TBF_REPROCESS); 2808 } 2809 } else if (t->tbf_q_len < t->tbf_max_q_len) { 2810 /* Finite queue length, so queue pkts and process queue */ 2811 tbf_queue(vifp, mp); 2812 tbf_process_q(vifp); 2813 } else { 2814 /* Check that we have UDP header with IP header */ 2815 size_t hdr_length = IPH_HDR_LENGTH(ipha) + 2816 sizeof (struct udphdr); 2817 2818 if ((mp->b_wptr - mp->b_rptr) < hdr_length) { 2819 if (!pullupmsg(mp, hdr_length)) { 2820 freemsg(mp); 
2821 ip1dbg(("tbf_ctl: couldn't pullup udp hdr, " 2822 "vif %ld src 0x%x dst 0x%x\n", 2823 (ptrdiff_t)(vifp - ipst->ips_vifs), 2824 ntohl(ipha->ipha_src), 2825 ntohl(ipha->ipha_dst))); 2826 mutex_exit(&vifp->v_tbf->tbf_lock); 2827 return; 2828 } else 2829 /* Have to reassign ipha after pullupmsg */ 2830 ipha = (ipha_t *)mp->b_rptr; 2831 } 2832 /* 2833 * Queue length too much, 2834 * try to selectively dq, or queue and process 2835 */ 2836 if (!tbf_dq_sel(vifp, ipha)) { 2837 ipst->ips_mrtstat->mrts_q_overflow++; 2838 freemsg(mp); 2839 } else { 2840 tbf_queue(vifp, mp); 2841 tbf_process_q(vifp); 2842 } 2843 } 2844 if (t->tbf_q_len == 0) { 2845 id = vifp->v_timeout_id; 2846 vifp->v_timeout_id = 0; 2847 } 2848 mutex_exit(&vifp->v_tbf->tbf_lock); 2849 if (id != 0) 2850 (void) untimeout(id); 2851 } 2852 2853 /* 2854 * Adds a packet to the tbf queue at the interface. 2855 * The ipha is for mcastgrp destination for phyint and encap. 2856 */ 2857 static void 2858 tbf_queue(struct vif *vifp, mblk_t *mp) 2859 { 2860 struct tbf *t = vifp->v_tbf; 2861 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 2862 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2863 2864 if (ipst->ips_ip_mrtdebug > 1) { 2865 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2866 "tbf_queue: vif %ld", (ptrdiff_t)(vifp - ipst->ips_vifs)); 2867 } 2868 ASSERT(MUTEX_HELD(&t->tbf_lock)); 2869 2870 if (t->tbf_t == NULL) { 2871 /* Queue was empty */ 2872 t->tbf_q = mp; 2873 } else { 2874 /* Insert at tail */ 2875 t->tbf_t->b_next = mp; 2876 } 2877 /* set new tail pointer */ 2878 t->tbf_t = mp; 2879 2880 mp->b_next = mp->b_prev = NULL; 2881 2882 t->tbf_q_len++; 2883 } 2884 2885 /* 2886 * Process the queue at the vif interface. 2887 * Drops the tbf_lock when sending packets. 2888 * 2889 * NOTE : The caller should quntimeout if the queue length is 0. 
 */
static void
tbf_process_q(struct vif *vifp)
{
	mblk_t		*mp;
	struct tbf	*t = vifp->v_tbf;
	size_t		len;
	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "tbf_process_q 1: vif %ld qlen = %d",
		    (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_q_len);
	}

	/*
	 * Loop through the queue at the interface and send
	 * as many packets as possible.
	 */
	ASSERT(MUTEX_HELD(&t->tbf_lock));

	while (t->tbf_q_len > 0) {
		mp = t->tbf_q;
		len = (size_t)msgdsize(mp);	/* length of ip pkt */

		/* Determine if the packet can be sent */
		if (len <= t->tbf_n_tok) {
			/*
			 * If so, reduce no. of tokens, dequeue the packet,
			 * send the packet.
			 */
			t->tbf_n_tok -= len;

			t->tbf_q = mp->b_next;
			if (--t->tbf_q_len == 0) {
				t->tbf_t = NULL;
			}
			mp->b_next = NULL;
			/* Exit mutex before sending packet, then re-enter */
			mutex_exit(&t->tbf_lock);
			tbf_send_packet(vifp, mp);
			mutex_enter(&t->tbf_lock);
		} else
			/* Head of queue doesn't fit: stop (FIFO order). */
			break;
	}
}

/*
 * Called at tbf timeout to update tokens, process q and reset timer.
 * Runs as a timeout(9F) callback; v_timeout_id is cleared under tbf_lock
 * and re-armed only if packets remain queued.
 */
static void
tbf_reprocess_q(void *arg)
{
	struct vif	*vifp = arg;
	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	mutex_enter(&vifp->v_tbf->tbf_lock);
	vifp->v_timeout_id = 0;
	tbf_update_tokens(vifp);

	tbf_process_q(vifp);

	if (vifp->v_tbf->tbf_q_len > 0) {
		vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp,
		    TBF_REPROCESS);
	}
	mutex_exit(&vifp->v_tbf->tbf_lock);

	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "tbf_reprcess_q: vif %ld timeout id = %p",
		    (ptrdiff_t)(vifp - ipst->ips_vifs), vifp->v_timeout_id);
	}
}

/*
 * Function that will selectively discard a member of the tbf queue,
 * based on the precedence value and the priority.
 * Returns 1 if a lower-priority packet was dropped (making room),
 * 0 if nothing on the queue has lower priority than the new packet.
 *
 * NOTE : The caller should quntimeout if the queue length is 0.
 */
static int
tbf_dq_sel(struct vif *vifp, ipha_t *ipha)
{
	uint_t		p;
	struct tbf	*t = vifp->v_tbf;
	mblk_t		**np;
	mblk_t		*last, *mp;
	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "dq_sel: vif %ld dst 0x%x",
		    (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(ipha->ipha_dst));
	}

	ASSERT(MUTEX_HELD(&t->tbf_lock));
	p = priority(vifp, ipha);

	/* Pointer-to-pointer walk; 'last' trails for tail-pointer fixup. */
	np = &t->tbf_q;
	last = NULL;
	while ((mp = *np) != NULL) {
		if (p > (priority(vifp, (ipha_t *)mp->b_rptr))) {
			*np = mp->b_next;
			/* If removing the last packet, fix the tail pointer */
			if (mp == t->tbf_t)
				t->tbf_t = last;
			mp->b_prev = mp->b_next = NULL;
			freemsg(mp);
			/*
			 * It's impossible for the queue to be empty, but
			 * we check anyway.
			 */
			if (--t->tbf_q_len == 0) {
				t->tbf_t = NULL;
			}
			ipst->ips_mrtstat->mrts_drop_sel++;
			return (1);
		}
		np = &mp->b_next;
		last = mp;
	}
	return (0);
}

/*
 * Sends packet, 2 cases - encap tunnel, phyint.
 * Consumes mp in both cases.
 */
static void
tbf_send_packet(struct vif *vifp, mblk_t *mp)
{
	ipif_t		*ipif;
	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	/* If encap tunnel options */
	if (vifp->v_flags & VIFF_TUNNEL) {
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "tbf_send_pkt: ENCAP tunnel vif %ld",
			    (ptrdiff_t)(vifp - ipst->ips_vifs));
		}

		/*
		 * Feed into ip_wput which will set the ident field and
		 * checksum the encapsulating header.
		 * BSD gets the cached route vifp->v_route from ip_output()
		 * to speed up route table lookups. Not necessary in SunOS 5.x.
		 */
		put(vifp->v_ipif->ipif_wq, mp);
		return;

		/* phyint */
	} else {
		/* Need to loop back to members on the outgoing interface. */
		ipha_t		*ipha;
		ipaddr_t	dst;
		ipha = (ipha_t *)mp->b_rptr;
		dst = ipha->ipha_dst;
		ipif = vifp->v_ipif;

		/* Local members on this interface want a copy too. */
		mutex_enter(&ipif->ipif_ill->ill_lock);
		if (ilm_lookup_ipif(ipif, dst) != NULL) {
			/*
			 * The packet is not yet reassembled, thus we need to
			 * pass it to ip_rput_local for checksum verification
			 * and reassembly (and fanout the user stream).
			 */
			mblk_t	*mp_loop;
			ire_t	*ire;

			mutex_exit(&ipif->ipif_ill->ill_lock);
			if (ipst->ips_ip_mrtdebug > 1) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "tbf_send_pkt: loopback vif %ld",
				    (ptrdiff_t)(vifp - ipst->ips_vifs));
			}
			mp_loop = copymsg(mp);
			ire = ire_ctable_lookup(~0, 0, IRE_BROADCAST, NULL,
			    ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);

			if (mp_loop != NULL && ire != NULL) {
				IP_RPUT_LOCAL(ipif->ipif_rq, mp_loop,
				    ((ipha_t *)mp_loop->b_rptr),
				    ire, (ill_t *)ipif->ipif_rq->q_ptr);
			} else {
				/* Either copymsg failed or no ire */
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "tbf_send_pkt: mp_loop 0x%p, ire 0x%p "
				    "vif %ld\n", (void *)mp_loop, (void *)ire,
				    (ptrdiff_t)(vifp - ipst->ips_vifs));
			}
			if (ire != NULL)
				ire_refrele(ire);
		} else {
			mutex_exit(&ipif->ipif_ill->ill_lock);
		}
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "tbf_send_pkt: phyint forward vif %ld dst = 0x%x",
			    (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(dst));
		}
		ip_rput_forward_multicast(dst, mp, ipif);
	}
}

/*
 * Determine the current time and then the elapsed time (between the last time
 * and time now).  Update the no. of tokens in the bucket.
 */
static void
tbf_update_tokens(struct vif *vifp)
{
	timespec_t	tp;
	hrtime_t	tm;
	struct tbf	*t = vifp->v_tbf;
	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	ASSERT(MUTEX_HELD(&t->tbf_lock));

	/* Time in secs and nsecs, rate limit in kbits/sec */
	gethrestime(&tp);

	/*LINTED*/
	TV_DELTA(tp, t->tbf_last_pkt_t, tm);

	/*
	 * This formula is actually
	 * "time in seconds" * "bytes/second".  Scaled for nsec.
	 * (tm/1000000000) * (v_rate_limit * 1000 * (1000/1024) /8)
	 *
	 * The (1000/1024) was introduced in add_vif to optimize
	 * this divide into a shift.
	 */
	t->tbf_n_tok += (tm/1000) * vifp->v_rate_limit / 1024 / 8;
	t->tbf_last_pkt_t = tp;

	/* Bucket never holds more than MAX_BKT_SIZE bytes worth of tokens. */
	if (t->tbf_n_tok > MAX_BKT_SIZE)
		t->tbf_n_tok = MAX_BKT_SIZE;
	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "tbf_update_tok: tm %lld tok %d vif %ld",
		    tm, t->tbf_n_tok, (ptrdiff_t)(vifp - ipst->ips_vifs));
	}
}

/*
 * Priority currently is based on port nos.
 * Different forwarding mechanisms have different ways
 * of obtaining the port no. Hence, the vif must be
 * given along with the packet itself.
 *
 */
static int
priority(struct vif *vifp, ipha_t *ipha)
{
	int		prio;
	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	/* Temporary hack; may add general packet classifier some day */

	ASSERT(MUTEX_HELD(&vifp->v_tbf->tbf_lock));

	/*
	 * The UDP port space is divided up into four priority ranges:
	 * [0, 16384)     : unclassified - lowest priority
	 * [16384, 32768) : audio - highest priority
	 * [32768, 49152) : whiteboard - medium priority
	 * [49152, 65536) : video - low priority
	 */

	if (ipha->ipha_protocol == IPPROTO_UDP) {
		struct udphdr *udp =
		    (struct udphdr *)((char *)ipha + IPH_HDR_LENGTH(ipha));
		/* Top two bits of the dest port select the range above. */
		switch (ntohs(udp->uh_dport) & 0xc000) {
		case 0x4000:
			prio = 70;
			break;
		case 0x8000:
			prio = 60;
			break;
		case 0xc000:
			prio = 55;
			break;
		default:
			prio = 50;
			break;
		}
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "priority: port %x prio %d\n",
			    ntohs(udp->uh_dport), prio);
		}
	} else
		prio = 50;  /* default priority */
	return
(prio); 3189 } 3190 3191 /* 3192 * End of token bucket filter modifications 3193 */ 3194 3195 3196 3197 /* 3198 * Produces data for netstat -M. 3199 */ 3200 int 3201 ip_mroute_stats(mblk_t *mp, ip_stack_t *ipst) 3202 { 3203 ipst->ips_mrtstat->mrts_vifctlSize = sizeof (struct vifctl); 3204 ipst->ips_mrtstat->mrts_mfcctlSize = sizeof (struct mfcctl); 3205 if (!snmp_append_data(mp, (char *)ipst->ips_mrtstat, 3206 sizeof (struct mrtstat))) { 3207 ip0dbg(("ip_mroute_stats: failed %ld bytes\n", 3208 (size_t)sizeof (struct mrtstat))); 3209 return (0); 3210 } 3211 return (1); 3212 } 3213 3214 /* 3215 * Sends info for SNMP's MIB. 3216 */ 3217 int 3218 ip_mroute_vif(mblk_t *mp, ip_stack_t *ipst) 3219 { 3220 struct vifctl vi; 3221 vifi_t vifi; 3222 3223 mutex_enter(&ipst->ips_numvifs_mutex); 3224 for (vifi = 0; vifi < ipst->ips_numvifs; vifi++) { 3225 if (ipst->ips_vifs[vifi].v_lcl_addr.s_addr == 0) 3226 continue; 3227 /* 3228 * No locks here, an approximation is fine. 3229 */ 3230 vi.vifc_vifi = vifi; 3231 vi.vifc_flags = ipst->ips_vifs[vifi].v_flags; 3232 vi.vifc_threshold = ipst->ips_vifs[vifi].v_threshold; 3233 vi.vifc_rate_limit = ipst->ips_vifs[vifi].v_rate_limit; 3234 vi.vifc_lcl_addr = ipst->ips_vifs[vifi].v_lcl_addr; 3235 vi.vifc_rmt_addr = ipst->ips_vifs[vifi].v_rmt_addr; 3236 vi.vifc_pkt_in = ipst->ips_vifs[vifi].v_pkt_in; 3237 vi.vifc_pkt_out = ipst->ips_vifs[vifi].v_pkt_out; 3238 3239 if (!snmp_append_data(mp, (char *)&vi, sizeof (vi))) { 3240 ip0dbg(("ip_mroute_vif: failed %ld bytes\n", 3241 (size_t)sizeof (vi))); 3242 return (0); 3243 } 3244 } 3245 mutex_exit(&ipst->ips_numvifs_mutex); 3246 return (1); 3247 } 3248 3249 /* 3250 * Called by ip_snmp_get to send up multicast routing table. 3251 */ 3252 int 3253 ip_mroute_mrt(mblk_t *mp, ip_stack_t *ipst) 3254 { 3255 int i, j; 3256 struct mfc *rt; 3257 struct mfcctl mfcc; 3258 3259 /* 3260 * Make sure multicast has not been turned off. 
3261 */ 3262 if (is_mrouter_off(ipst)) 3263 return (1); 3264 3265 /* Loop over all hash buckets and their chains */ 3266 for (i = 0; i < MFCTBLSIZ; i++) { 3267 MFCB_REFHOLD(&ipst->ips_mfcs[i]); 3268 for (rt = ipst->ips_mfcs[i].mfcb_mfc; rt; rt = rt->mfc_next) { 3269 mutex_enter(&rt->mfc_mutex); 3270 if (rt->mfc_rte != NULL || 3271 (rt->mfc_marks & MFCB_MARK_CONDEMNED)) { 3272 mutex_exit(&rt->mfc_mutex); 3273 continue; 3274 } 3275 mfcc.mfcc_origin = rt->mfc_origin; 3276 mfcc.mfcc_mcastgrp = rt->mfc_mcastgrp; 3277 mfcc.mfcc_parent = rt->mfc_parent; 3278 mfcc.mfcc_pkt_cnt = rt->mfc_pkt_cnt; 3279 mutex_enter(&ipst->ips_numvifs_mutex); 3280 for (j = 0; j < (int)ipst->ips_numvifs; j++) 3281 mfcc.mfcc_ttls[j] = rt->mfc_ttls[j]; 3282 for (j = (int)ipst->ips_numvifs; j < MAXVIFS; j++) 3283 mfcc.mfcc_ttls[j] = 0; 3284 mutex_exit(&ipst->ips_numvifs_mutex); 3285 3286 mutex_exit(&rt->mfc_mutex); 3287 if (!snmp_append_data(mp, (char *)&mfcc, 3288 sizeof (mfcc))) { 3289 MFCB_REFRELE(&ipst->ips_mfcs[i]); 3290 ip0dbg(("ip_mroute_mrt: failed %ld bytes\n", 3291 (size_t)sizeof (mfcc))); 3292 return (0); 3293 } 3294 } 3295 MFCB_REFRELE(&ipst->ips_mfcs[i]); 3296 } 3297 return (1); 3298 } 3299