1 /* 2 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 3 * Use is subject to license terms. 4 */ 5 /* 6 * CDDL HEADER START 7 * 8 * The contents of this file are subject to the terms of the 9 * Common Development and Distribution License (the "License"). 10 * You may not use this file except in compliance with the License. 11 * 12 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 13 * or http://www.opensolaris.org/os/licensing. 14 * See the License for the specific language governing permissions 15 * and limitations under the License. 16 * 17 * When distributing Covered Code, include this CDDL HEADER in each 18 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 19 * If applicable, add the following below this CDDL HEADER, with the 20 * fields enclosed by brackets "[]" replaced with your own identifying 21 * information: Portions Copyright [yyyy] [name of copyright owner] 22 * 23 * CDDL HEADER END 24 */ 25 /* 26 * Copyright 2008 Sun Microsystems, Inc. 27 * All rights reserved. Use is subject to license terms. 28 */ 29 /* Copyright (c) 1990 Mentat Inc. */ 30 31 #pragma ident "%Z%%M% %I% %E% SMI" 32 33 /* 34 * Procedures for the kernel part of DVMRP, 35 * a Distance-Vector Multicast Routing Protocol. 36 * (See RFC-1075) 37 * Written by David Waitzman, BBN Labs, August 1988. 38 * Modified by Steve Deering, Stanford, February 1989. 39 * Modified by Mark J. 
Steiglitz, Stanford, May, 1991 40 * Modified by Van Jacobson, LBL, January 1993 41 * Modified by Ajit Thyagarajan, PARC, August 1993 42 * Modified by Bill Fenner, PARC, April 1995 43 * 44 * MROUTING 3.5 45 */ 46 47 /* 48 * TODO 49 * - function pointer field in vif, void *vif_sendit() 50 */ 51 52 #include <sys/types.h> 53 #include <sys/stream.h> 54 #include <sys/stropts.h> 55 #include <sys/strlog.h> 56 #include <sys/systm.h> 57 #include <sys/ddi.h> 58 #include <sys/cmn_err.h> 59 #include <sys/zone.h> 60 61 #include <sys/param.h> 62 #include <sys/socket.h> 63 #include <sys/vtrace.h> 64 #include <sys/debug.h> 65 #include <net/if.h> 66 #include <sys/sockio.h> 67 #include <netinet/in.h> 68 #include <net/if_dl.h> 69 70 #include <inet/common.h> 71 #include <inet/mi.h> 72 #include <inet/nd.h> 73 #include <inet/mib2.h> 74 #include <netinet/ip6.h> 75 #include <inet/ip.h> 76 #include <inet/snmpcom.h> 77 78 #include <netinet/igmp.h> 79 #include <netinet/igmp_var.h> 80 #include <netinet/udp.h> 81 #include <netinet/ip_mroute.h> 82 #include <inet/ip_multi.h> 83 #include <inet/ip_ire.h> 84 #include <inet/ip_if.h> 85 #include <inet/ipclassifier.h> 86 87 #include <netinet/pim.h> 88 89 90 /* 91 * MT Design: 92 * 93 * There are three main data structures viftable, mfctable and tbftable that 94 * need to be protected against MT races. 95 * 96 * vitable is a fixed length array of vif structs. There is no lock to protect 97 * the whole array, instead each struct is protected by its own indiviual lock. 98 * The value of v_marks in conjuction with the value of v_refcnt determines the 99 * current state of a vif structure. One special state that needs mention 100 * is when the vif is marked VIF_MARK_NOTINUSE but refcnt != 0. This indicates 101 * that vif is being initalized. 102 * Each structure is freed when the refcnt goes down to zero. If a delete comes 103 * in when the the recfnt is > 1, the vif structure is marked VIF_MARK_CONDEMNED 104 * which prevents the struct from further use. 
When the refcnt goes to zero
 * the struct is freed and is marked VIF_MARK_NOTINUSE.
 * vif struct stores a pointer to the ipif in v_ipif, to prevent ipif/ill
 * from going away a refhold is put on the ipif before using it. see
 * lock_good_vif() and unlock_good_vif().
 *
 * VIF_REFHOLD and VIF_REFRELE macros have been provided to manipulate refcnts
 * of the vif struct.
 *
 * tbftable is also a fixed length array of tbf structs and is only accessed
 * via v_tbf.  It is protected by its own lock tbf_lock.
 *
 * Lock Ordering is
 *	v_lock --> tbf_lock
 *	v_lock --> ill_lock
 *
 * mfctable is a fixed size hash table of mfc bucket structs (struct mfcb).
 * Each mfc bucket struct (struct mfcb) maintains a refcnt for each walker,
 * it also maintains a state. These fields are protected by a lock (mfcb_lock).
 * mfc structs only maintain a state and have no refcnt. mfc_mutex is used to
 * protect the struct elements.
 *
 * mfc structs are dynamically allocated and are singly linked
 * at the head of the chain. When an mfc structure is to be deleted
 * it is marked condemned and so is the state in the bucket struct.
 * When the last walker of the hash bucket exits all the mfc structs
 * marked condemned are freed.
 *
 * Locking Hierarchy:
 * The bucket lock should be acquired before the mfc struct lock.
 * MFCB_REFHOLD and MFCB_REFRELE macros are provided for locking
 * operations on the bucket struct.
 *
 * last_encap_lock and numvifs_mutex should be acquired after
 * acquiring vif or mfc locks. These locks protect some global variables.
 *
 * The statistics are not currently protected by a lock
 * causing the stats to be approximate, not exact.
142 */ 143 144 #define NO_VIF MAXVIFS /* from mrouted, no route for src */ 145 146 /* 147 * Timeouts: 148 * Upcall timeouts - BSD uses boolean_t mfc->expire and 149 * nexpire[MFCTBLSIZE], the number of times expire has been called. 150 * SunOS 5.x uses mfc->timeout for each mfc. 151 * Some Unixes are limited in the number of simultaneous timeouts 152 * that can be run, SunOS 5.x does not have this restriction. 153 */ 154 155 /* 156 * In BSD, EXPIRE_TIMEOUT is how often expire_upcalls() is called and 157 * UPCALL_EXPIRE is the nmber of timeouts before a particular upcall 158 * expires. Thus the time till expiration is EXPIRE_TIMEOUT * UPCALL_EXPIRE 159 */ 160 #define EXPIRE_TIMEOUT (hz/4) /* 4x / second */ 161 #define UPCALL_EXPIRE 6 /* number of timeouts */ 162 163 /* 164 * Hash function for a source, group entry 165 */ 166 #define MFCHASH(a, g) MFCHASHMOD(((a) >> 20) ^ ((a) >> 10) ^ (a) ^ \ 167 ((g) >> 20) ^ ((g) >> 10) ^ (g)) 168 169 #define TBF_REPROCESS (hz / 100) /* 100x /second */ 170 171 /* Identify PIM packet that came on a Register interface */ 172 #define PIM_REGISTER_MARKER 0xffffffff 173 174 /* Function declarations */ 175 static int add_mfc(struct mfcctl *, ip_stack_t *); 176 static int add_vif(struct vifctl *, conn_t *, mblk_t *, ip_stack_t *); 177 static int del_mfc(struct mfcctl *, ip_stack_t *); 178 static int del_vif(vifi_t *, conn_t *, mblk_t *, ip_stack_t *); 179 static void del_vifp(struct vif *); 180 static void encap_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t); 181 static void expire_upcalls(void *); 182 static void fill_route(struct mfc *, struct mfcctl *, ip_stack_t *); 183 static void free_queue(struct mfc *); 184 static int get_assert(uchar_t *, ip_stack_t *); 185 static int get_lsg_cnt(struct sioc_lsg_req *, ip_stack_t *); 186 static int get_sg_cnt(struct sioc_sg_req *, ip_stack_t *); 187 static int get_version(uchar_t *); 188 static int get_vif_cnt(struct sioc_vif_req *, ip_stack_t *); 189 static int ip_mdq(mblk_t *, ipha_t *, 
ill_t *, 190 ipaddr_t, struct mfc *); 191 static int ip_mrouter_init(conn_t *, uchar_t *, int, ip_stack_t *); 192 static void phyint_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t); 193 static int register_mforward(queue_t *, mblk_t *, ill_t *); 194 static void register_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t); 195 static int set_assert(int *, ip_stack_t *); 196 197 /* 198 * Token Bucket Filter functions 199 */ 200 static int priority(struct vif *, ipha_t *); 201 static void tbf_control(struct vif *, mblk_t *, ipha_t *); 202 static int tbf_dq_sel(struct vif *, ipha_t *); 203 static void tbf_process_q(struct vif *); 204 static void tbf_queue(struct vif *, mblk_t *); 205 static void tbf_reprocess_q(void *); 206 static void tbf_send_packet(struct vif *, mblk_t *); 207 static void tbf_update_tokens(struct vif *); 208 static void release_mfc(struct mfcb *); 209 210 static boolean_t is_mrouter_off(ip_stack_t *); 211 /* 212 * Encapsulation packets 213 */ 214 215 #define ENCAP_TTL 64 216 217 /* prototype IP hdr for encapsulated packets */ 218 static ipha_t multicast_encap_iphdr = { 219 IP_SIMPLE_HDR_VERSION, 220 0, /* tos */ 221 sizeof (ipha_t), /* total length */ 222 0, /* id */ 223 0, /* frag offset */ 224 ENCAP_TTL, IPPROTO_ENCAP, 225 0, /* checksum */ 226 }; 227 228 /* 229 * Rate limit for assert notification messages, in nsec. 
230 */ 231 #define ASSERT_MSG_TIME 3000000000 232 233 234 #define VIF_REFHOLD(vifp) { \ 235 mutex_enter(&(vifp)->v_lock); \ 236 (vifp)->v_refcnt++; \ 237 mutex_exit(&(vifp)->v_lock); \ 238 } 239 240 #define VIF_REFRELE_LOCKED(vifp) { \ 241 (vifp)->v_refcnt--; \ 242 if ((vifp)->v_refcnt == 0 && \ 243 ((vifp)->v_marks & VIF_MARK_CONDEMNED)) { \ 244 del_vifp(vifp); \ 245 } else { \ 246 mutex_exit(&(vifp)->v_lock); \ 247 } \ 248 } 249 250 #define VIF_REFRELE(vifp) { \ 251 mutex_enter(&(vifp)->v_lock); \ 252 (vifp)->v_refcnt--; \ 253 if ((vifp)->v_refcnt == 0 && \ 254 ((vifp)->v_marks & VIF_MARK_CONDEMNED)) { \ 255 del_vifp(vifp); \ 256 } else { \ 257 mutex_exit(&(vifp)->v_lock); \ 258 } \ 259 } 260 261 #define MFCB_REFHOLD(mfcb) { \ 262 mutex_enter(&(mfcb)->mfcb_lock); \ 263 (mfcb)->mfcb_refcnt++; \ 264 ASSERT((mfcb)->mfcb_refcnt != 0); \ 265 mutex_exit(&(mfcb)->mfcb_lock); \ 266 } 267 268 #define MFCB_REFRELE(mfcb) { \ 269 mutex_enter(&(mfcb)->mfcb_lock); \ 270 ASSERT((mfcb)->mfcb_refcnt != 0); \ 271 if (--(mfcb)->mfcb_refcnt == 0 && \ 272 ((mfcb)->mfcb_marks & MFCB_MARK_CONDEMNED)) { \ 273 release_mfc(mfcb); \ 274 } \ 275 mutex_exit(&(mfcb)->mfcb_lock); \ 276 } 277 278 /* 279 * MFCFIND: 280 * Find a route for a given origin IP address and multicast group address. 281 * Skip entries with pending upcalls. 282 * Type of service parameter to be added in the future! 283 */ 284 #define MFCFIND(mfcbp, o, g, rt) { \ 285 struct mfc *_mb_rt = NULL; \ 286 rt = NULL; \ 287 _mb_rt = mfcbp->mfcb_mfc; \ 288 while (_mb_rt) { \ 289 if ((_mb_rt->mfc_origin.s_addr == o) && \ 290 (_mb_rt->mfc_mcastgrp.s_addr == g) && \ 291 (_mb_rt->mfc_rte == NULL) && \ 292 (!(_mb_rt->mfc_marks & MFCB_MARK_CONDEMNED))) { \ 293 rt = _mb_rt; \ 294 break; \ 295 } \ 296 _mb_rt = _mb_rt->mfc_next; \ 297 } \ 298 } 299 300 /* 301 * BSD uses timeval with sec and usec. In SunOS 5.x uniqtime() and gethrtime() 302 * are inefficient. 
We use gethrestime() which returns a timespec_t with 303 * sec and nsec, the resolution is machine dependent. 304 * The following 2 macros have been changed to use nsec instead of usec. 305 */ 306 /* 307 * Macros to compute elapsed time efficiently. 308 * Borrowed from Van Jacobson's scheduling code. 309 * Delta should be a hrtime_t. 310 */ 311 #define TV_DELTA(a, b, delta) { \ 312 int xxs; \ 313 \ 314 delta = (a).tv_nsec - (b).tv_nsec; \ 315 if ((xxs = (a).tv_sec - (b).tv_sec) != 0) { \ 316 switch (xxs) { \ 317 case 2: \ 318 delta += 1000000000; \ 319 /*FALLTHROUGH*/ \ 320 case 1: \ 321 delta += 1000000000; \ 322 break; \ 323 default: \ 324 delta += (1000000000 * xxs); \ 325 } \ 326 } \ 327 } 328 329 #define TV_LT(a, b) (((a).tv_nsec < (b).tv_nsec && \ 330 (a).tv_sec <= (b).tv_sec) || (a).tv_sec < (b).tv_sec) 331 332 /* 333 * Handle MRT setsockopt commands to modify the multicast routing tables. 334 */ 335 int 336 ip_mrouter_set(int cmd, queue_t *q, int checkonly, uchar_t *data, 337 int datalen, mblk_t *first_mp) 338 { 339 conn_t *connp = Q_TO_CONN(q); 340 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 341 342 mutex_enter(&ipst->ips_ip_g_mrouter_mutex); 343 if (cmd != MRT_INIT && connp != ipst->ips_ip_g_mrouter) { 344 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 345 return (EACCES); 346 } 347 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 348 349 if (checkonly) { 350 /* 351 * do not do operation, just pretend to - new T_CHECK 352 * Note: Even routines further on can probably fail but 353 * this T_CHECK stuff is only to please XTI so it not 354 * necessary to be perfect. 355 */ 356 switch (cmd) { 357 case MRT_INIT: 358 case MRT_DONE: 359 case MRT_ADD_VIF: 360 case MRT_DEL_VIF: 361 case MRT_ADD_MFC: 362 case MRT_DEL_MFC: 363 case MRT_ASSERT: 364 return (0); 365 default: 366 return (EOPNOTSUPP); 367 } 368 } 369 370 /* 371 * make sure no command is issued after multicast routing has been 372 * turned off. 
373 */ 374 if (cmd != MRT_INIT && cmd != MRT_DONE) { 375 if (is_mrouter_off(ipst)) 376 return (EINVAL); 377 } 378 379 switch (cmd) { 380 case MRT_INIT: return (ip_mrouter_init(connp, data, datalen, ipst)); 381 case MRT_DONE: return (ip_mrouter_done(first_mp, ipst)); 382 case MRT_ADD_VIF: return (add_vif((struct vifctl *)data, connp, 383 first_mp, ipst)); 384 case MRT_DEL_VIF: return (del_vif((vifi_t *)data, connp, first_mp, 385 ipst)); 386 case MRT_ADD_MFC: return (add_mfc((struct mfcctl *)data, ipst)); 387 case MRT_DEL_MFC: return (del_mfc((struct mfcctl *)data, ipst)); 388 case MRT_ASSERT: return (set_assert((int *)data, ipst)); 389 default: return (EOPNOTSUPP); 390 } 391 } 392 393 /* 394 * Handle MRT getsockopt commands 395 */ 396 int 397 ip_mrouter_get(int cmd, queue_t *q, uchar_t *data) 398 { 399 conn_t *connp = Q_TO_CONN(q); 400 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 401 402 if (connp != ipst->ips_ip_g_mrouter) 403 return (EACCES); 404 405 switch (cmd) { 406 case MRT_VERSION: return (get_version((uchar_t *)data)); 407 case MRT_ASSERT: return (get_assert((uchar_t *)data, ipst)); 408 default: return (EOPNOTSUPP); 409 } 410 } 411 412 /* 413 * Handle ioctl commands to obtain information from the cache. 414 * Called with shared access to IP. These are read_only ioctls. 
415 */ 416 /* ARGSUSED */ 417 int 418 mrt_ioctl(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 419 ip_ioctl_cmd_t *ipip, void *if_req) 420 { 421 mblk_t *mp1; 422 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 423 conn_t *connp = Q_TO_CONN(q); 424 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 425 426 /* Existence verified in ip_wput_nondata */ 427 mp1 = mp->b_cont->b_cont; 428 429 switch (iocp->ioc_cmd) { 430 case (SIOCGETVIFCNT): 431 return (get_vif_cnt((struct sioc_vif_req *)mp1->b_rptr, ipst)); 432 case (SIOCGETSGCNT): 433 return (get_sg_cnt((struct sioc_sg_req *)mp1->b_rptr, ipst)); 434 case (SIOCGETLSGCNT): 435 return (get_lsg_cnt((struct sioc_lsg_req *)mp1->b_rptr, ipst)); 436 default: 437 return (EINVAL); 438 } 439 } 440 441 /* 442 * Returns the packet, byte, rpf-failure count for the source, group provided. 443 */ 444 static int 445 get_sg_cnt(struct sioc_sg_req *req, ip_stack_t *ipst) 446 { 447 struct mfc *rt; 448 struct mfcb *mfcbp; 449 450 mfcbp = &ipst->ips_mfcs[MFCHASH(req->src.s_addr, req->grp.s_addr)]; 451 MFCB_REFHOLD(mfcbp); 452 MFCFIND(mfcbp, req->src.s_addr, req->grp.s_addr, rt); 453 454 if (rt != NULL) { 455 mutex_enter(&rt->mfc_mutex); 456 req->pktcnt = rt->mfc_pkt_cnt; 457 req->bytecnt = rt->mfc_byte_cnt; 458 req->wrong_if = rt->mfc_wrong_if; 459 mutex_exit(&rt->mfc_mutex); 460 } else 461 req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffffU; 462 463 MFCB_REFRELE(mfcbp); 464 return (0); 465 } 466 467 /* 468 * Returns the packet, byte, rpf-failure count for the source, group provided. 469 * Uses larger counters and IPv6 addresses. 470 */ 471 /* ARGSUSED XXX until implemented */ 472 static int 473 get_lsg_cnt(struct sioc_lsg_req *req, ip_stack_t *ipst) 474 { 475 /* XXX TODO SIOCGETLSGCNT */ 476 return (ENXIO); 477 } 478 479 /* 480 * Returns the input and output packet and byte counts on the vif provided. 
481 */ 482 static int 483 get_vif_cnt(struct sioc_vif_req *req, ip_stack_t *ipst) 484 { 485 vifi_t vifi = req->vifi; 486 487 if (vifi >= ipst->ips_numvifs) 488 return (EINVAL); 489 490 /* 491 * No locks here, an approximation is fine. 492 */ 493 req->icount = ipst->ips_vifs[vifi].v_pkt_in; 494 req->ocount = ipst->ips_vifs[vifi].v_pkt_out; 495 req->ibytes = ipst->ips_vifs[vifi].v_bytes_in; 496 req->obytes = ipst->ips_vifs[vifi].v_bytes_out; 497 498 return (0); 499 } 500 501 static int 502 get_version(uchar_t *data) 503 { 504 int *v = (int *)data; 505 506 *v = 0x0305; /* XXX !!!! */ 507 508 return (0); 509 } 510 511 /* 512 * Set PIM assert processing global. 513 */ 514 static int 515 set_assert(int *i, ip_stack_t *ipst) 516 { 517 if ((*i != 1) && (*i != 0)) 518 return (EINVAL); 519 520 ipst->ips_pim_assert = *i; 521 522 return (0); 523 } 524 525 /* 526 * Get PIM assert processing global. 527 */ 528 static int 529 get_assert(uchar_t *data, ip_stack_t *ipst) 530 { 531 int *i = (int *)data; 532 533 *i = ipst->ips_pim_assert; 534 535 return (0); 536 } 537 538 /* 539 * Enable multicast routing. 540 */ 541 static int 542 ip_mrouter_init(conn_t *connp, uchar_t *data, int datalen, ip_stack_t *ipst) 543 { 544 int *v; 545 546 if (data == NULL || (datalen != sizeof (int))) 547 return (ENOPROTOOPT); 548 549 v = (int *)data; 550 if (*v != 1) 551 return (ENOPROTOOPT); 552 553 mutex_enter(&ipst->ips_ip_g_mrouter_mutex); 554 if (ipst->ips_ip_g_mrouter != NULL) { 555 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 556 return (EADDRINUSE); 557 } 558 559 /* 560 * MRT_INIT should only be allowed for RAW sockets, but we double 561 * check. 
562 */ 563 if (!IPCL_IS_RAWIP(connp)) { 564 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 565 return (EINVAL); 566 } 567 568 ipst->ips_ip_g_mrouter = connp; 569 connp->conn_multi_router = 1; 570 /* In order for tunnels to work we have to turn ip_g_forward on */ 571 if (!WE_ARE_FORWARDING(ipst)) { 572 if (ipst->ips_ip_mrtdebug > 1) { 573 (void) mi_strlog(connp->conn_rq, 1, SL_TRACE, 574 "ip_mrouter_init: turning on forwarding"); 575 } 576 ipst->ips_saved_ip_g_forward = ipst->ips_ip_g_forward; 577 ipst->ips_ip_g_forward = IP_FORWARD_ALWAYS; 578 } 579 580 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 581 return (0); 582 } 583 584 void 585 ip_mrouter_stack_init(ip_stack_t *ipst) 586 { 587 mutex_init(&ipst->ips_ip_g_mrouter_mutex, NULL, MUTEX_DEFAULT, NULL); 588 589 ipst->ips_vifs = kmem_zalloc(sizeof (struct vif) * (MAXVIFS+1), 590 KM_SLEEP); 591 ipst->ips_mrtstat = kmem_zalloc(sizeof (struct mrtstat), KM_SLEEP); 592 /* 593 * mfctable: 594 * Includes all mfcs, including waiting upcalls. 595 * Multiple mfcs per bucket. 596 */ 597 ipst->ips_mfcs = kmem_zalloc(sizeof (struct mfcb) * MFCTBLSIZ, 598 KM_SLEEP); 599 /* 600 * Define the token bucket filter structures. 601 * tbftable -> each vif has one of these for storing info. 602 */ 603 ipst->ips_tbfs = kmem_zalloc(sizeof (struct tbf) * MAXVIFS, KM_SLEEP); 604 605 mutex_init(&ipst->ips_last_encap_lock, NULL, MUTEX_DEFAULT, NULL); 606 607 ipst->ips_mrtstat->mrts_vifctlSize = sizeof (struct vifctl); 608 ipst->ips_mrtstat->mrts_mfcctlSize = sizeof (struct mfcctl); 609 } 610 611 /* 612 * Disable multicast routing. 613 * Didn't use global timeout_val (BSD version), instead check the mfctable. 
614 */ 615 int 616 ip_mrouter_done(mblk_t *mp, ip_stack_t *ipst) 617 { 618 conn_t *mrouter; 619 vifi_t vifi; 620 struct mfc *mfc_rt; 621 int i; 622 623 mutex_enter(&ipst->ips_ip_g_mrouter_mutex); 624 if (ipst->ips_ip_g_mrouter == NULL) { 625 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 626 return (EINVAL); 627 } 628 629 mrouter = ipst->ips_ip_g_mrouter; 630 631 if (ipst->ips_saved_ip_g_forward != -1) { 632 if (ipst->ips_ip_mrtdebug > 1) { 633 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 634 "ip_mrouter_done: turning off forwarding"); 635 } 636 ipst->ips_ip_g_forward = ipst->ips_saved_ip_g_forward; 637 ipst->ips_saved_ip_g_forward = -1; 638 } 639 640 /* 641 * Always clear cache when vifs change. 642 * No need to get ipst->ips_last_encap_lock since we are running as 643 * a writer. 644 */ 645 mutex_enter(&ipst->ips_last_encap_lock); 646 ipst->ips_last_encap_src = 0; 647 ipst->ips_last_encap_vif = NULL; 648 mutex_exit(&ipst->ips_last_encap_lock); 649 mrouter->conn_multi_router = 0; 650 651 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 652 653 /* 654 * For each phyint in use, 655 * disable promiscuous reception of all IP multicasts. 656 */ 657 for (vifi = 0; vifi < MAXVIFS; vifi++) { 658 struct vif *vifp = ipst->ips_vifs + vifi; 659 660 mutex_enter(&vifp->v_lock); 661 /* 662 * if the vif is active mark it condemned. 663 */ 664 if (vifp->v_marks & VIF_MARK_GOOD) { 665 ASSERT(vifp->v_ipif != NULL); 666 ipif_refhold(vifp->v_ipif); 667 /* Phyint only */ 668 if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) { 669 ipif_t *ipif = vifp->v_ipif; 670 ipsq_t *ipsq; 671 boolean_t suc; 672 ill_t *ill; 673 674 ill = ipif->ipif_ill; 675 suc = B_FALSE; 676 if (mp == NULL) { 677 /* 678 * being called from ip_close, 679 * lets do it synchronously. 680 * Clear VIF_MARK_GOOD and 681 * set VIF_MARK_CONDEMNED. 
682 */ 683 vifp->v_marks &= ~VIF_MARK_GOOD; 684 vifp->v_marks |= VIF_MARK_CONDEMNED; 685 mutex_exit(&(vifp)->v_lock); 686 suc = ipsq_enter(ill, B_FALSE); 687 ipsq = ill->ill_phyint->phyint_ipsq; 688 } else { 689 ipsq = ipsq_try_enter(ipif, NULL, 690 mrouter->conn_wq, mp, 691 ip_restart_optmgmt, NEW_OP, B_TRUE); 692 if (ipsq == NULL) { 693 mutex_exit(&(vifp)->v_lock); 694 ipif_refrele(ipif); 695 return (EINPROGRESS); 696 } 697 /* 698 * Clear VIF_MARK_GOOD and 699 * set VIF_MARK_CONDEMNED. 700 */ 701 vifp->v_marks &= ~VIF_MARK_GOOD; 702 vifp->v_marks |= VIF_MARK_CONDEMNED; 703 mutex_exit(&(vifp)->v_lock); 704 suc = B_TRUE; 705 } 706 707 if (suc) { 708 (void) ip_delmulti(INADDR_ANY, ipif, 709 B_TRUE, B_TRUE); 710 ipsq_exit(ipsq); 711 } 712 mutex_enter(&vifp->v_lock); 713 } 714 /* 715 * decreases the refcnt added in add_vif. 716 * and release v_lock. 717 */ 718 VIF_REFRELE_LOCKED(vifp); 719 } else { 720 mutex_exit(&vifp->v_lock); 721 continue; 722 } 723 } 724 725 mutex_enter(&ipst->ips_numvifs_mutex); 726 ipst->ips_numvifs = 0; 727 ipst->ips_pim_assert = 0; 728 ipst->ips_reg_vif_num = ALL_VIFS; 729 mutex_exit(&ipst->ips_numvifs_mutex); 730 731 /* 732 * Free upcall msgs. 733 * Go through mfctable and stop any outstanding upcall 734 * timeouts remaining on mfcs. 735 */ 736 for (i = 0; i < MFCTBLSIZ; i++) { 737 mutex_enter(&ipst->ips_mfcs[i].mfcb_lock); 738 ipst->ips_mfcs[i].mfcb_refcnt++; 739 ipst->ips_mfcs[i].mfcb_marks |= MFCB_MARK_CONDEMNED; 740 mutex_exit(&ipst->ips_mfcs[i].mfcb_lock); 741 mfc_rt = ipst->ips_mfcs[i].mfcb_mfc; 742 while (mfc_rt) { 743 /* Free upcalls */ 744 mutex_enter(&mfc_rt->mfc_mutex); 745 if (mfc_rt->mfc_rte != NULL) { 746 if (mfc_rt->mfc_timeout_id != 0) { 747 /* 748 * OK to drop the lock as we have 749 * a refcnt on the bucket. timeout 750 * can fire but it will see that 751 * mfc_timeout_id == 0 and not do 752 * anything. see expire_upcalls(). 
753 */ 754 mfc_rt->mfc_timeout_id = 0; 755 mutex_exit(&mfc_rt->mfc_mutex); 756 (void) untimeout( 757 mfc_rt->mfc_timeout_id); 758 mfc_rt->mfc_timeout_id = 0; 759 mutex_enter(&mfc_rt->mfc_mutex); 760 761 /* 762 * all queued upcall packets 763 * and mblk will be freed in 764 * release_mfc(). 765 */ 766 } 767 } 768 769 mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED; 770 771 mutex_exit(&mfc_rt->mfc_mutex); 772 mfc_rt = mfc_rt->mfc_next; 773 } 774 MFCB_REFRELE(&ipst->ips_mfcs[i]); 775 } 776 777 mutex_enter(&ipst->ips_ip_g_mrouter_mutex); 778 ipst->ips_ip_g_mrouter = NULL; 779 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 780 return (0); 781 } 782 783 void 784 ip_mrouter_stack_destroy(ip_stack_t *ipst) 785 { 786 struct mfcb *mfcbp; 787 struct mfc *rt; 788 int i; 789 790 for (i = 0; i < MFCTBLSIZ; i++) { 791 mfcbp = &ipst->ips_mfcs[i]; 792 793 while ((rt = mfcbp->mfcb_mfc) != NULL) { 794 (void) printf("ip_mrouter_stack_destroy: free for %d\n", 795 i); 796 797 mfcbp->mfcb_mfc = rt->mfc_next; 798 free_queue(rt); 799 mi_free(rt); 800 } 801 } 802 kmem_free(ipst->ips_vifs, sizeof (struct vif) * (MAXVIFS+1)); 803 ipst->ips_vifs = NULL; 804 kmem_free(ipst->ips_mrtstat, sizeof (struct mrtstat)); 805 ipst->ips_mrtstat = NULL; 806 kmem_free(ipst->ips_mfcs, sizeof (struct mfcb) * MFCTBLSIZ); 807 ipst->ips_mfcs = NULL; 808 kmem_free(ipst->ips_tbfs, sizeof (struct tbf) * MAXVIFS); 809 ipst->ips_tbfs = NULL; 810 811 mutex_destroy(&ipst->ips_last_encap_lock); 812 mutex_destroy(&ipst->ips_ip_g_mrouter_mutex); 813 } 814 815 static boolean_t 816 is_mrouter_off(ip_stack_t *ipst) 817 { 818 conn_t *mrouter; 819 820 mutex_enter(&ipst->ips_ip_g_mrouter_mutex); 821 if (ipst->ips_ip_g_mrouter == NULL) { 822 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 823 return (B_TRUE); 824 } 825 826 mrouter = ipst->ips_ip_g_mrouter; 827 if (mrouter->conn_multi_router == 0) { 828 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 829 return (B_TRUE); 830 } 831 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 832 return (B_FALSE); 833 } 
834 835 static void 836 unlock_good_vif(struct vif *vifp) 837 { 838 ASSERT(vifp->v_ipif != NULL); 839 ipif_refrele(vifp->v_ipif); 840 VIF_REFRELE(vifp); 841 } 842 843 static boolean_t 844 lock_good_vif(struct vif *vifp) 845 { 846 mutex_enter(&vifp->v_lock); 847 if (!(vifp->v_marks & VIF_MARK_GOOD)) { 848 mutex_exit(&vifp->v_lock); 849 return (B_FALSE); 850 } 851 852 ASSERT(vifp->v_ipif != NULL); 853 mutex_enter(&vifp->v_ipif->ipif_ill->ill_lock); 854 if (!IPIF_CAN_LOOKUP(vifp->v_ipif)) { 855 mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock); 856 mutex_exit(&vifp->v_lock); 857 return (B_FALSE); 858 } 859 ipif_refhold_locked(vifp->v_ipif); 860 mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock); 861 vifp->v_refcnt++; 862 mutex_exit(&vifp->v_lock); 863 return (B_TRUE); 864 } 865 866 /* 867 * Add a vif to the vif table. 868 */ 869 static int 870 add_vif(struct vifctl *vifcp, conn_t *connp, mblk_t *first_mp, ip_stack_t *ipst) 871 { 872 struct vif *vifp = ipst->ips_vifs + vifcp->vifc_vifi; 873 ipif_t *ipif; 874 int error; 875 struct tbf *v_tbf = ipst->ips_tbfs + vifcp->vifc_vifi; 876 ipsq_t *ipsq; 877 conn_t *mrouter = ipst->ips_ip_g_mrouter; 878 879 ASSERT(connp != NULL); 880 881 if (vifcp->vifc_vifi >= MAXVIFS) 882 return (EINVAL); 883 884 if (is_mrouter_off(ipst)) 885 return (EINVAL); 886 887 mutex_enter(&vifp->v_lock); 888 /* 889 * Viftable entry should be 0. 890 * if v_marks == 0 but v_refcnt != 0 means struct is being 891 * initialized. 892 * 893 * Also note that it is very unlikely that we will get a MRT_ADD_VIF 894 * request while the delete is in progress, mrouted only sends add 895 * requests when a new interface is added and the new interface cannot 896 * have the same vifi as an existing interface. We make sure that 897 * ill_delete will block till the vif is deleted by adding a refcnt 898 * to ipif in del_vif(). 
899 */ 900 if (vifp->v_lcl_addr.s_addr != 0 || 901 vifp->v_marks != 0 || 902 vifp->v_refcnt != 0) { 903 mutex_exit(&vifp->v_lock); 904 return (EADDRINUSE); 905 } 906 907 /* Incoming vif should not be 0 */ 908 if (vifcp->vifc_lcl_addr.s_addr == 0) { 909 mutex_exit(&vifp->v_lock); 910 return (EINVAL); 911 } 912 913 vifp->v_refcnt++; 914 mutex_exit(&vifp->v_lock); 915 /* Find the interface with the local address */ 916 ipif = ipif_lookup_addr((ipaddr_t)vifcp->vifc_lcl_addr.s_addr, NULL, 917 connp->conn_zoneid, CONNP_TO_WQ(connp), first_mp, 918 ip_restart_optmgmt, &error, ipst); 919 if (ipif == NULL) { 920 VIF_REFRELE(vifp); 921 if (error == EINPROGRESS) 922 return (error); 923 return (EADDRNOTAVAIL); 924 } 925 926 /* 927 * We have to be exclusive as we have to call ip_addmulti() 928 * This is the best position to try to be exclusive in case 929 * we have to wait. 930 */ 931 ipsq = ipsq_try_enter(ipif, NULL, CONNP_TO_WQ(connp), first_mp, 932 ip_restart_optmgmt, NEW_OP, B_TRUE); 933 if ((ipsq) == NULL) { 934 VIF_REFRELE(vifp); 935 ipif_refrele(ipif); 936 return (EINPROGRESS); 937 } 938 939 if (ipst->ips_ip_mrtdebug > 1) { 940 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 941 "add_vif: src 0x%x enter", 942 vifcp->vifc_lcl_addr.s_addr); 943 } 944 945 mutex_enter(&vifp->v_lock); 946 /* 947 * Always clear cache when vifs change. 948 * Needed to ensure that src isn't left over from before vif was added. 949 * No need to get last_encap_lock, since we are running as a writer. 
950 */ 951 952 mutex_enter(&ipst->ips_last_encap_lock); 953 ipst->ips_last_encap_src = 0; 954 ipst->ips_last_encap_vif = NULL; 955 mutex_exit(&ipst->ips_last_encap_lock); 956 957 if (vifcp->vifc_flags & VIFF_TUNNEL) { 958 if ((vifcp->vifc_flags & VIFF_SRCRT) != 0) { 959 cmn_err(CE_WARN, 960 "add_vif: source route tunnels not supported\n"); 961 VIF_REFRELE_LOCKED(vifp); 962 ipif_refrele(ipif); 963 ipsq_exit(ipsq); 964 return (EOPNOTSUPP); 965 } 966 vifp->v_rmt_addr = vifcp->vifc_rmt_addr; 967 968 } else { 969 /* Phyint or Register vif */ 970 if (vifcp->vifc_flags & VIFF_REGISTER) { 971 /* 972 * Note: Since all IPPROTO_IP level options (including 973 * MRT_ADD_VIF) are done exclusively via 974 * ip_optmgmt_writer(), a lock is not necessary to 975 * protect reg_vif_num. 976 */ 977 mutex_enter(&ipst->ips_numvifs_mutex); 978 if (ipst->ips_reg_vif_num == ALL_VIFS) { 979 ipst->ips_reg_vif_num = vifcp->vifc_vifi; 980 mutex_exit(&ipst->ips_numvifs_mutex); 981 } else { 982 mutex_exit(&ipst->ips_numvifs_mutex); 983 VIF_REFRELE_LOCKED(vifp); 984 ipif_refrele(ipif); 985 ipsq_exit(ipsq); 986 return (EADDRINUSE); 987 } 988 } 989 990 /* Make sure the interface supports multicast */ 991 if ((ipif->ipif_ill->ill_flags & ILLF_MULTICAST) == 0) { 992 VIF_REFRELE_LOCKED(vifp); 993 ipif_refrele(ipif); 994 if (vifcp->vifc_flags & VIFF_REGISTER) { 995 mutex_enter(&ipst->ips_numvifs_mutex); 996 ipst->ips_reg_vif_num = ALL_VIFS; 997 mutex_exit(&ipst->ips_numvifs_mutex); 998 } 999 ipsq_exit(ipsq); 1000 return (EOPNOTSUPP); 1001 } 1002 /* Enable promiscuous reception of all IP mcasts from the if */ 1003 mutex_exit(&vifp->v_lock); 1004 error = ip_addmulti(INADDR_ANY, ipif, ILGSTAT_NONE, 1005 MODE_IS_EXCLUDE, NULL); 1006 mutex_enter(&vifp->v_lock); 1007 /* 1008 * since we released the lock lets make sure that 1009 * ip_mrouter_done() has not been called. 
1010 */ 1011 if (error != 0 || is_mrouter_off(ipst)) { 1012 if (error == 0) 1013 (void) ip_delmulti(INADDR_ANY, ipif, B_TRUE, 1014 B_TRUE); 1015 if (vifcp->vifc_flags & VIFF_REGISTER) { 1016 mutex_enter(&ipst->ips_numvifs_mutex); 1017 ipst->ips_reg_vif_num = ALL_VIFS; 1018 mutex_exit(&ipst->ips_numvifs_mutex); 1019 } 1020 VIF_REFRELE_LOCKED(vifp); 1021 ipif_refrele(ipif); 1022 ipsq_exit(ipsq); 1023 return (error?error:EINVAL); 1024 } 1025 } 1026 /* Define parameters for the tbf structure */ 1027 vifp->v_tbf = v_tbf; 1028 gethrestime(&vifp->v_tbf->tbf_last_pkt_t); 1029 vifp->v_tbf->tbf_n_tok = 0; 1030 vifp->v_tbf->tbf_q_len = 0; 1031 vifp->v_tbf->tbf_max_q_len = MAXQSIZE; 1032 vifp->v_tbf->tbf_q = vifp->v_tbf->tbf_t = NULL; 1033 1034 vifp->v_flags = vifcp->vifc_flags; 1035 vifp->v_threshold = vifcp->vifc_threshold; 1036 vifp->v_lcl_addr = vifcp->vifc_lcl_addr; 1037 vifp->v_ipif = ipif; 1038 ipif_refrele(ipif); 1039 /* Scaling up here, allows division by 1024 in critical code. */ 1040 vifp->v_rate_limit = vifcp->vifc_rate_limit * (1024/1000); 1041 vifp->v_timeout_id = 0; 1042 /* initialize per vif pkt counters */ 1043 vifp->v_pkt_in = 0; 1044 vifp->v_pkt_out = 0; 1045 vifp->v_bytes_in = 0; 1046 vifp->v_bytes_out = 0; 1047 mutex_init(&vifp->v_tbf->tbf_lock, NULL, MUTEX_DEFAULT, NULL); 1048 1049 /* Adjust numvifs up, if the vifi is higher than numvifs */ 1050 mutex_enter(&ipst->ips_numvifs_mutex); 1051 if (ipst->ips_numvifs <= vifcp->vifc_vifi) 1052 ipst->ips_numvifs = vifcp->vifc_vifi + 1; 1053 mutex_exit(&ipst->ips_numvifs_mutex); 1054 1055 if (ipst->ips_ip_mrtdebug > 1) { 1056 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 1057 "add_vif: #%d, lcladdr %x, %s %x, thresh %x, rate %d", 1058 vifcp->vifc_vifi, 1059 ntohl(vifcp->vifc_lcl_addr.s_addr), 1060 (vifcp->vifc_flags & VIFF_TUNNEL) ? 
"rmtaddr" : "mask", 1061 ntohl(vifcp->vifc_rmt_addr.s_addr), 1062 vifcp->vifc_threshold, vifcp->vifc_rate_limit); 1063 } 1064 1065 vifp->v_marks = VIF_MARK_GOOD; 1066 mutex_exit(&vifp->v_lock); 1067 ipsq_exit(ipsq); 1068 return (0); 1069 } 1070 1071 1072 /* Delete a vif from the vif table. */ 1073 static void 1074 del_vifp(struct vif *vifp) 1075 { 1076 struct tbf *t = vifp->v_tbf; 1077 mblk_t *mp0; 1078 vifi_t vifi; 1079 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 1080 conn_t *mrouter = ipst->ips_ip_g_mrouter; 1081 1082 ASSERT(vifp->v_marks & VIF_MARK_CONDEMNED); 1083 ASSERT(t != NULL); 1084 1085 /* 1086 * release the ref we put in vif_del. 1087 */ 1088 ASSERT(vifp->v_ipif != NULL); 1089 ipif_refrele(vifp->v_ipif); 1090 1091 if (ipst->ips_ip_mrtdebug > 1) { 1092 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 1093 "del_vif: src 0x%x\n", vifp->v_lcl_addr.s_addr); 1094 } 1095 1096 if (vifp->v_timeout_id != 0) { 1097 (void) untimeout(vifp->v_timeout_id); 1098 vifp->v_timeout_id = 0; 1099 } 1100 1101 /* 1102 * Free packets queued at the interface. 1103 * Mrouted takes care of cleaning up mfcs - makes calls to del_mfc. 1104 */ 1105 mutex_enter(&t->tbf_lock); 1106 while (t->tbf_q != NULL) { 1107 mp0 = t->tbf_q; 1108 t->tbf_q = t->tbf_q->b_next; 1109 mp0->b_prev = mp0->b_next = NULL; 1110 freemsg(mp0); 1111 } 1112 mutex_exit(&t->tbf_lock); 1113 1114 /* 1115 * Always clear cache when vifs change. 1116 * No need to get last_encap_lock since we are running as a writer. 
1117 */ 1118 mutex_enter(&ipst->ips_last_encap_lock); 1119 if (vifp == ipst->ips_last_encap_vif) { 1120 ipst->ips_last_encap_vif = NULL; 1121 ipst->ips_last_encap_src = 0; 1122 } 1123 mutex_exit(&ipst->ips_last_encap_lock); 1124 1125 mutex_destroy(&t->tbf_lock); 1126 1127 bzero(vifp->v_tbf, sizeof (*(vifp->v_tbf))); 1128 1129 /* Adjust numvifs down */ 1130 mutex_enter(&ipst->ips_numvifs_mutex); 1131 for (vifi = ipst->ips_numvifs; vifi != 0; vifi--) /* vifi is unsigned */ 1132 if (ipst->ips_vifs[vifi - 1].v_lcl_addr.s_addr != 0) 1133 break; 1134 ipst->ips_numvifs = vifi; 1135 mutex_exit(&ipst->ips_numvifs_mutex); 1136 1137 bzero(vifp, sizeof (*vifp)); 1138 } 1139 1140 static int 1141 del_vif(vifi_t *vifip, conn_t *connp, mblk_t *first_mp, ip_stack_t *ipst) 1142 { 1143 struct vif *vifp = ipst->ips_vifs + *vifip; 1144 ipsq_t *ipsq; 1145 1146 if (*vifip >= ipst->ips_numvifs) 1147 return (EINVAL); 1148 1149 1150 mutex_enter(&vifp->v_lock); 1151 /* 1152 * Not initialized 1153 * Here we are not looking at the vif that is being initialized 1154 * i.e vifp->v_marks == 0 and refcnt > 0. 1155 */ 1156 if (vifp->v_lcl_addr.s_addr == 0 || 1157 !(vifp->v_marks & VIF_MARK_GOOD)) { 1158 mutex_exit(&vifp->v_lock); 1159 return (EADDRNOTAVAIL); 1160 } 1161 1162 /* 1163 * This is an optimization, if first_mp == NULL 1164 * than we are being called from reset_mrt_vif_ipif() 1165 * so we already have exclusive access to the ipsq. 1166 * the ASSERT below is a check for this condition. 1167 */ 1168 if (first_mp != NULL && 1169 !(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) { 1170 ASSERT(connp != NULL); 1171 /* 1172 * We have to be exclusive as we have to call ip_delmulti() 1173 * This is the best position to try to be exclusive in case 1174 * we have to wait. 
1175 */ 1176 ipsq = ipsq_try_enter(vifp->v_ipif, NULL, CONNP_TO_WQ(connp), 1177 first_mp, ip_restart_optmgmt, NEW_OP, B_TRUE); 1178 if ((ipsq) == NULL) { 1179 mutex_exit(&vifp->v_lock); 1180 return (EINPROGRESS); 1181 } 1182 /* recheck after being exclusive */ 1183 if (vifp->v_lcl_addr.s_addr == 0 || 1184 !vifp->v_marks & VIF_MARK_GOOD) { 1185 /* 1186 * someone beat us. 1187 */ 1188 mutex_exit(&vifp->v_lock); 1189 ipsq_exit(ipsq); 1190 return (EADDRNOTAVAIL); 1191 } 1192 } 1193 1194 1195 ASSERT(IAM_WRITER_IPIF(vifp->v_ipif)); 1196 1197 1198 /* 1199 * add a refhold so that ipif does not go away while 1200 * there are still users, this will be released in del_vifp 1201 * when we free the vif. 1202 */ 1203 ipif_refhold(vifp->v_ipif); 1204 1205 /* Clear VIF_MARK_GOOD and set VIF_MARK_CONDEMNED. */ 1206 vifp->v_marks &= ~VIF_MARK_GOOD; 1207 vifp->v_marks |= VIF_MARK_CONDEMNED; 1208 1209 /* Phyint only */ 1210 if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) { 1211 ipif_t *ipif = vifp->v_ipif; 1212 ASSERT(ipif != NULL); 1213 /* 1214 * should be OK to drop the lock as we 1215 * have marked this as CONDEMNED. 1216 */ 1217 mutex_exit(&(vifp)->v_lock); 1218 (void) ip_delmulti(INADDR_ANY, ipif, B_TRUE, B_TRUE); 1219 if (first_mp != NULL) 1220 ipsq_exit(ipsq); 1221 mutex_enter(&(vifp)->v_lock); 1222 } 1223 1224 /* 1225 * decreases the refcnt added in add_vif. 1226 */ 1227 VIF_REFRELE_LOCKED(vifp); 1228 return (0); 1229 } 1230 1231 /* 1232 * Add an mfc entry. 1233 */ 1234 static int 1235 add_mfc(struct mfcctl *mfccp, ip_stack_t *ipst) 1236 { 1237 struct mfc *rt; 1238 struct rtdetq *rte; 1239 ushort_t nstl; 1240 int i; 1241 struct mfcb *mfcbp; 1242 conn_t *mrouter = ipst->ips_ip_g_mrouter; 1243 1244 /* 1245 * The value of vifi is NO_VIF (==MAXVIFS) if Mrouted 1246 * did not have a real route for pkt. 
1247 * We want this pkt without rt installed in the mfctable to prevent 1248 * multiiple tries, so go ahead and put it in mfctable, it will 1249 * be discarded later in ip_mdq() because the child is NULL. 1250 */ 1251 1252 /* Error checking, out of bounds? */ 1253 if (mfccp->mfcc_parent > MAXVIFS) { 1254 ip0dbg(("ADD_MFC: mfcc_parent out of range %d", 1255 (int)mfccp->mfcc_parent)); 1256 return (EINVAL); 1257 } 1258 1259 if ((mfccp->mfcc_parent != NO_VIF) && 1260 (ipst->ips_vifs[mfccp->mfcc_parent].v_ipif == NULL)) { 1261 ip0dbg(("ADD_MFC: NULL ipif for parent vif %d\n", 1262 (int)mfccp->mfcc_parent)); 1263 return (EINVAL); 1264 } 1265 1266 if (is_mrouter_off(ipst)) { 1267 return (EINVAL); 1268 } 1269 1270 mfcbp = &ipst->ips_mfcs[MFCHASH(mfccp->mfcc_origin.s_addr, 1271 mfccp->mfcc_mcastgrp.s_addr)]; 1272 MFCB_REFHOLD(mfcbp); 1273 MFCFIND(mfcbp, mfccp->mfcc_origin.s_addr, 1274 mfccp->mfcc_mcastgrp.s_addr, rt); 1275 1276 /* If an entry already exists, just update the fields */ 1277 if (rt) { 1278 if (ipst->ips_ip_mrtdebug > 1) { 1279 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 1280 "add_mfc: update o %x grp %x parent %x", 1281 ntohl(mfccp->mfcc_origin.s_addr), 1282 ntohl(mfccp->mfcc_mcastgrp.s_addr), 1283 mfccp->mfcc_parent); 1284 } 1285 mutex_enter(&rt->mfc_mutex); 1286 rt->mfc_parent = mfccp->mfcc_parent; 1287 1288 mutex_enter(&ipst->ips_numvifs_mutex); 1289 for (i = 0; i < (int)ipst->ips_numvifs; i++) 1290 rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; 1291 mutex_exit(&ipst->ips_numvifs_mutex); 1292 mutex_exit(&rt->mfc_mutex); 1293 1294 MFCB_REFRELE(mfcbp); 1295 return (0); 1296 } 1297 1298 /* 1299 * Find the entry for which the upcall was made and update. 
1300 */ 1301 for (rt = mfcbp->mfcb_mfc, nstl = 0; rt; rt = rt->mfc_next) { 1302 mutex_enter(&rt->mfc_mutex); 1303 if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) && 1304 (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr) && 1305 (rt->mfc_rte != NULL) && 1306 !(rt->mfc_marks & MFCB_MARK_CONDEMNED)) { 1307 if (nstl++ != 0) 1308 cmn_err(CE_WARN, 1309 "add_mfc: %s o %x g %x p %x", 1310 "multiple kernel entries", 1311 ntohl(mfccp->mfcc_origin.s_addr), 1312 ntohl(mfccp->mfcc_mcastgrp.s_addr), 1313 mfccp->mfcc_parent); 1314 1315 if (ipst->ips_ip_mrtdebug > 1) { 1316 (void) mi_strlog(mrouter->conn_rq, 1, 1317 SL_TRACE, 1318 "add_mfc: o %x g %x p %x", 1319 ntohl(mfccp->mfcc_origin.s_addr), 1320 ntohl(mfccp->mfcc_mcastgrp.s_addr), 1321 mfccp->mfcc_parent); 1322 } 1323 fill_route(rt, mfccp, ipst); 1324 1325 /* 1326 * Prevent cleanup of cache entry. 1327 * Timer starts in ip_mforward. 1328 */ 1329 if (rt->mfc_timeout_id != 0) { 1330 timeout_id_t id; 1331 id = rt->mfc_timeout_id; 1332 /* 1333 * setting id to zero will avoid this 1334 * entry from being cleaned up in 1335 * expire_up_calls(). 1336 */ 1337 rt->mfc_timeout_id = 0; 1338 /* 1339 * dropping the lock is fine as we 1340 * have a refhold on the bucket. 1341 * so mfc cannot be freed. 1342 * The timeout can fire but it will see 1343 * that mfc_timeout_id == 0 and not cleanup. 1344 */ 1345 mutex_exit(&rt->mfc_mutex); 1346 (void) untimeout(id); 1347 mutex_enter(&rt->mfc_mutex); 1348 } 1349 1350 /* 1351 * Send all pkts that are queued waiting for the upcall. 1352 * ip_mdq param tun set to 0 - 1353 * the return value of ip_mdq() isn't used here, 1354 * so value we send doesn't matter. 
1355 */ 1356 while (rt->mfc_rte != NULL) { 1357 rte = rt->mfc_rte; 1358 rt->mfc_rte = rte->rte_next; 1359 mutex_exit(&rt->mfc_mutex); 1360 (void) ip_mdq(rte->mp, (ipha_t *) 1361 rte->mp->b_rptr, rte->ill, 0, rt); 1362 freemsg(rte->mp); 1363 mi_free((char *)rte); 1364 mutex_enter(&rt->mfc_mutex); 1365 } 1366 } 1367 mutex_exit(&rt->mfc_mutex); 1368 } 1369 1370 1371 /* 1372 * It is possible that an entry is being inserted without an upcall 1373 */ 1374 if (nstl == 0) { 1375 mutex_enter(&(mfcbp->mfcb_lock)); 1376 if (ipst->ips_ip_mrtdebug > 1) { 1377 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 1378 "add_mfc: no upcall o %x g %x p %x", 1379 ntohl(mfccp->mfcc_origin.s_addr), 1380 ntohl(mfccp->mfcc_mcastgrp.s_addr), 1381 mfccp->mfcc_parent); 1382 } 1383 if (is_mrouter_off(ipst)) { 1384 mutex_exit(&mfcbp->mfcb_lock); 1385 MFCB_REFRELE(mfcbp); 1386 return (EINVAL); 1387 } 1388 1389 for (rt = mfcbp->mfcb_mfc; rt; rt = rt->mfc_next) { 1390 1391 mutex_enter(&rt->mfc_mutex); 1392 if ((rt->mfc_origin.s_addr == 1393 mfccp->mfcc_origin.s_addr) && 1394 (rt->mfc_mcastgrp.s_addr == 1395 mfccp->mfcc_mcastgrp.s_addr) && 1396 (!(rt->mfc_marks & MFCB_MARK_CONDEMNED))) { 1397 fill_route(rt, mfccp, ipst); 1398 mutex_exit(&rt->mfc_mutex); 1399 break; 1400 } 1401 mutex_exit(&rt->mfc_mutex); 1402 } 1403 1404 /* No upcall, so make a new entry into mfctable */ 1405 if (rt == NULL) { 1406 rt = (struct mfc *)mi_zalloc(sizeof (struct mfc)); 1407 if (rt == NULL) { 1408 ip1dbg(("add_mfc: out of memory\n")); 1409 mutex_exit(&mfcbp->mfcb_lock); 1410 MFCB_REFRELE(mfcbp); 1411 return (ENOBUFS); 1412 } 1413 1414 /* Insert new entry at head of hash chain */ 1415 mutex_enter(&rt->mfc_mutex); 1416 fill_route(rt, mfccp, ipst); 1417 1418 /* Link into table */ 1419 rt->mfc_next = mfcbp->mfcb_mfc; 1420 mfcbp->mfcb_mfc = rt; 1421 mutex_exit(&rt->mfc_mutex); 1422 } 1423 mutex_exit(&mfcbp->mfcb_lock); 1424 } 1425 1426 MFCB_REFRELE(mfcbp); 1427 return (0); 1428 } 1429 1430 /* 1431 * Fills in mfc structure from 
mrouted mfcctl. 1432 */ 1433 static void 1434 fill_route(struct mfc *rt, struct mfcctl *mfccp, ip_stack_t *ipst) 1435 { 1436 int i; 1437 1438 rt->mfc_origin = mfccp->mfcc_origin; 1439 rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp; 1440 rt->mfc_parent = mfccp->mfcc_parent; 1441 mutex_enter(&ipst->ips_numvifs_mutex); 1442 for (i = 0; i < (int)ipst->ips_numvifs; i++) { 1443 rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; 1444 } 1445 mutex_exit(&ipst->ips_numvifs_mutex); 1446 /* Initialize pkt counters per src-grp */ 1447 rt->mfc_pkt_cnt = 0; 1448 rt->mfc_byte_cnt = 0; 1449 rt->mfc_wrong_if = 0; 1450 rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_nsec = 0; 1451 1452 } 1453 1454 static void 1455 free_queue(struct mfc *mfcp) 1456 { 1457 struct rtdetq *rte0; 1458 1459 /* 1460 * Drop all queued upcall packets. 1461 * Free the mbuf with the pkt. 1462 */ 1463 while ((rte0 = mfcp->mfc_rte) != NULL) { 1464 mfcp->mfc_rte = rte0->rte_next; 1465 freemsg(rte0->mp); 1466 mi_free((char *)rte0); 1467 } 1468 } 1469 /* 1470 * go thorugh the hash bucket and free all the entries marked condemned. 
1471 */ 1472 void 1473 release_mfc(struct mfcb *mfcbp) 1474 { 1475 struct mfc *current_mfcp; 1476 struct mfc *prev_mfcp; 1477 1478 prev_mfcp = current_mfcp = mfcbp->mfcb_mfc; 1479 1480 while (current_mfcp != NULL) { 1481 if (current_mfcp->mfc_marks & MFCB_MARK_CONDEMNED) { 1482 if (current_mfcp == mfcbp->mfcb_mfc) { 1483 mfcbp->mfcb_mfc = current_mfcp->mfc_next; 1484 free_queue(current_mfcp); 1485 mi_free(current_mfcp); 1486 prev_mfcp = current_mfcp = mfcbp->mfcb_mfc; 1487 continue; 1488 } 1489 ASSERT(prev_mfcp != NULL); 1490 prev_mfcp->mfc_next = current_mfcp->mfc_next; 1491 free_queue(current_mfcp); 1492 mi_free(current_mfcp); 1493 current_mfcp = NULL; 1494 } else { 1495 prev_mfcp = current_mfcp; 1496 } 1497 1498 current_mfcp = prev_mfcp->mfc_next; 1499 1500 } 1501 mfcbp->mfcb_marks &= ~MFCB_MARK_CONDEMNED; 1502 ASSERT(mfcbp->mfcb_mfc != NULL || mfcbp->mfcb_marks == 0); 1503 } 1504 1505 /* 1506 * Delete an mfc entry. 1507 */ 1508 static int 1509 del_mfc(struct mfcctl *mfccp, ip_stack_t *ipst) 1510 { 1511 struct in_addr origin; 1512 struct in_addr mcastgrp; 1513 struct mfc *rt; 1514 uint_t hash; 1515 conn_t *mrouter = ipst->ips_ip_g_mrouter; 1516 1517 origin = mfccp->mfcc_origin; 1518 mcastgrp = mfccp->mfcc_mcastgrp; 1519 hash = MFCHASH(origin.s_addr, mcastgrp.s_addr); 1520 1521 if (ipst->ips_ip_mrtdebug > 1) { 1522 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 1523 "del_mfc: o %x g %x", 1524 ntohl(origin.s_addr), 1525 ntohl(mcastgrp.s_addr)); 1526 } 1527 1528 MFCB_REFHOLD(&ipst->ips_mfcs[hash]); 1529 1530 /* Find mfc in mfctable, finds only entries without upcalls */ 1531 for (rt = ipst->ips_mfcs[hash].mfcb_mfc; rt; rt = rt->mfc_next) { 1532 mutex_enter(&rt->mfc_mutex); 1533 if (origin.s_addr == rt->mfc_origin.s_addr && 1534 mcastgrp.s_addr == rt->mfc_mcastgrp.s_addr && 1535 rt->mfc_rte == NULL && 1536 !(rt->mfc_marks & MFCB_MARK_CONDEMNED)) 1537 break; 1538 mutex_exit(&rt->mfc_mutex); 1539 } 1540 1541 /* 1542 * Return if there was an upcall (mfc_rte != NULL, 
1543 * or rt not in mfctable. 1544 */ 1545 if (rt == NULL) { 1546 MFCB_REFRELE(&ipst->ips_mfcs[hash]); 1547 return (EADDRNOTAVAIL); 1548 } 1549 1550 1551 /* 1552 * no need to hold lock as we have a reference. 1553 */ 1554 ipst->ips_mfcs[hash].mfcb_marks |= MFCB_MARK_CONDEMNED; 1555 /* error checking */ 1556 if (rt->mfc_timeout_id != 0) { 1557 ip0dbg(("del_mfc: TIMEOUT NOT 0, rte not null")); 1558 /* 1559 * Its ok to drop the lock, the struct cannot be freed 1560 * since we have a ref on the hash bucket. 1561 */ 1562 rt->mfc_timeout_id = 0; 1563 mutex_exit(&rt->mfc_mutex); 1564 (void) untimeout(rt->mfc_timeout_id); 1565 mutex_enter(&rt->mfc_mutex); 1566 } 1567 1568 ASSERT(rt->mfc_rte == NULL); 1569 1570 1571 /* 1572 * Delete the entry from the cache 1573 */ 1574 rt->mfc_marks |= MFCB_MARK_CONDEMNED; 1575 mutex_exit(&rt->mfc_mutex); 1576 1577 MFCB_REFRELE(&ipst->ips_mfcs[hash]); 1578 1579 return (0); 1580 } 1581 1582 #define TUNNEL_LEN 12 /* # bytes of IP option for tunnel encapsulation */ 1583 1584 /* 1585 * IP multicast forwarding function. This function assumes that the packet 1586 * pointed to by ipha has arrived on (or is about to be sent to) the interface 1587 * pointed to by "ill", and the packet is to be relayed to other networks 1588 * that have members of the packet's destination IP multicast group. 1589 * 1590 * The packet is returned unscathed to the caller, unless it is 1591 * erroneous, in which case a -1 value tells the caller (IP) 1592 * to discard it. 1593 * 1594 * Unlike BSD, SunOS 5.x needs to return to IP info about 1595 * whether pkt came in thru a tunnel, so it can be discarded, unless 1596 * it's IGMP. In BSD, the ifp is bogus for tunnels, so pkt won't try 1597 * to be delivered. 
* Return values are:
 *	 0 - pkt is okay and phyint
 *	-1 - pkt is malformed and to be tossed
 *	 1 - pkt came in on tunnel
 */
int
ip_mforward(ill_t *ill, ipha_t *ipha, mblk_t *mp)
{
	struct mfc 	*rt;
	ipaddr_t	src, dst, tunnel_src = 0;
	/* static: counts source-routed packets across calls (see below) */
	static int	srctun = 0;
	vifi_t		vifi;
	boolean_t	pim_reg_packet = B_FALSE;
	struct mfcb *mfcbp;
	ip_stack_t	*ipst = ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "ip_mforward: RECV ipha_src %x, ipha_dst %x, ill %s",
		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
		    ill->ill_name);
	}

	dst = ipha->ipha_dst;
	/*
	 * mp->b_prev carries either the PIM register marker or the
	 * tunnel source address (0 when the packet was not tunneled).
	 */
	if ((uint32_t)(uintptr_t)mp->b_prev == PIM_REGISTER_MARKER)
		pim_reg_packet = B_TRUE;
	else
		tunnel_src = (ipaddr_t)(uintptr_t)mp->b_prev;

	/*
	 * Don't forward a packet with time-to-live of zero or one,
	 * or a packet destined to a local-only group.
	 */
	if (CLASSD(dst) && (ipha->ipha_ttl <= 1 ||
	    (ipaddr_t)ntohl(dst) <= INADDR_MAX_LOCAL_GROUP)) {
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: not forwarded ttl %d,"
			    " dst 0x%x ill %s",
			    ipha->ipha_ttl, ntohl(dst), ill->ill_name);
		}
		mp->b_prev = NULL;
		if (tunnel_src != 0)
			return (1);
		else
			return (0);
	}

	if ((tunnel_src != 0) || pim_reg_packet) {
		/*
		 * Packet arrived over an encapsulated tunnel or via a PIM
		 * register message. Both ip_mroute_decap() and pim_input()
		 * encode information in mp->b_prev.
		 */
		mp->b_prev = NULL;
		if (ipst->ips_ip_mrtdebug > 1) {
			if (tunnel_src != 0) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: ill %s arrived via ENCAP TUN",
				    ill->ill_name);
			} else if (pim_reg_packet) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: ill %s arrived via"
				    " REGISTER VIF",
				    ill->ill_name);
			}
		}
	/*
	 * Header too short to hold a TUNNEL_LEN-byte option, or the byte at
	 * offset 1 past the fixed header is not IPOPT_LSRR: not a
	 * source-route tunnel packet.
	 */
	} else if ((ipha->ipha_version_and_hdr_length & 0xf) <
	    (uint_t)(IP_SIMPLE_HDR_LENGTH + TUNNEL_LEN) >> 2 ||
	    ((uchar_t *)(ipha + 1))[1] != IPOPT_LSRR) {
		/* Packet arrived via a physical interface. */
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: ill %s arrived via PHYINT",
			    ill->ill_name);
		}

	} else {
		/*
		 * Packet arrived through a SRCRT tunnel.
		 * Source-route tunnels are no longer supported.
		 * Error message printed every 1000 times.
		 */
		if ((srctun++ % 1000) == 0) {
			cmn_err(CE_WARN,
			    "ip_mforward: received source-routed pkt from %x",
			    ntohl(ipha->ipha_src));
		}
		return (-1);
	}

	ipst->ips_mrtstat->mrts_fwd_in++;
	src = ipha->ipha_src;

	/* Find route in cache, return NULL if not there or upcalls q'ed. */

	/*
	 * Lock the mfctable against changes made by ip_mforward.
	 * Note that only add_mfc and del_mfc can remove entries and
	 * they run with exclusive access to IP. So we do not need to
	 * guard against the rt being deleted, so release lock after reading.
	 */

	if (is_mrouter_off(ipst))
		return (-1);

	/* Hold the bucket for the rest of the function. */
	mfcbp = &ipst->ips_mfcs[MFCHASH(src, dst)];
	MFCB_REFHOLD(mfcbp);
	MFCFIND(mfcbp, src, dst, rt);

	/* Entry exists, so forward if necessary */
	if (rt != NULL) {
		int ret = 0;
		ipst->ips_mrtstat->mrts_mfc_hits++;
		if (pim_reg_packet) {
			/* Register packets are attributed to the register vif's ill. */
			ASSERT(ipst->ips_reg_vif_num != ALL_VIFS);
			ret = ip_mdq(mp, ipha,
			    ipst->ips_vifs[ipst->ips_reg_vif_num].
			    v_ipif->ipif_ill,
			    0, rt);
		} else {
			ret = ip_mdq(mp, ipha, ill, tunnel_src, rt);
		}

		MFCB_REFRELE(mfcbp);
		return (ret);

		/*
		 * Don't forward if we don't have a cache entry.  Mrouted will
		 * always provide a cache entry in response to an upcall.
		 */
	} else {
		/*
		 * If we don't have a route for packet's origin, make a copy
		 * of the packet and send message to routing daemon.
		 */
		struct mfc	*mfc_rt = NULL;
		mblk_t		*mp0 = NULL;	/* copy queued on the mfc */
		mblk_t		*mp_copy = NULL; /* copy sent as the upcall */
		struct rtdetq	*rte = NULL;
		struct rtdetq	*rte_m, *rte1, *prev_rte;
		uint_t		hash;
		int		npkts;
		boolean_t	new_mfc = B_FALSE;
		ipst->ips_mrtstat->mrts_mfc_misses++;
		/* BSD uses mrts_no_route++ */
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: no rte ill %s src %x g %x misses %d",
			    ill->ill_name, ntohl(src), ntohl(dst),
			    (int)ipst->ips_mrtstat->mrts_mfc_misses);
		}
		/*
		 * The order of the following code differs from the BSD code.
		 * Pre-mc3.5, the BSD code was incorrect and SunOS 5.x
		 * code works, so SunOS 5.x wasn't changed to conform to the
		 * BSD version.
		 */

		/* Lock mfctable. */
		hash = MFCHASH(src, dst);
		mutex_enter(&(ipst->ips_mfcs[hash].mfcb_lock));

		/*
		 * If we are turning off mrouted return an error
		 */
		if (is_mrouter_off(ipst)) {
			mutex_exit(&mfcbp->mfcb_lock);
			MFCB_REFRELE(mfcbp);
			return (-1);
		}

		/*
		 * Is there an upcall waiting for this packet?
		 * On a match the loop exits with mfc_rt->mfc_mutex held.
		 */
		for (mfc_rt = ipst->ips_mfcs[hash].mfcb_mfc; mfc_rt;
		    mfc_rt = mfc_rt->mfc_next) {
			mutex_enter(&mfc_rt->mfc_mutex);
			if (ipst->ips_ip_mrtdebug > 1) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: MFCTAB hash %d o 0x%x"
				    " g 0x%x\n",
				    hash, ntohl(mfc_rt->mfc_origin.s_addr),
				    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
			}
			/* There is an upcall */
			if ((src == mfc_rt->mfc_origin.s_addr) &&
			    (dst == mfc_rt->mfc_mcastgrp.s_addr) &&
			    (mfc_rt->mfc_rte != NULL) &&
			    !(mfc_rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
				break;
			}
			mutex_exit(&mfc_rt->mfc_mutex);
		}
		/* No upcall, so make a new entry into mfctable */
		if (mfc_rt == NULL) {
			mfc_rt = (struct mfc *)mi_zalloc(sizeof (struct mfc));
			if (mfc_rt == NULL) {
				ipst->ips_mrtstat->mrts_fwd_drop++;
				ip1dbg(("ip_mforward: out of memory "
				    "for mfc, mfc_rt\n"));
				goto error_return;
			} else
				new_mfc = B_TRUE;
			/* Get resources */
			/* TODO could copy header and dup rest */
			mp_copy = copymsg(mp);
			if (mp_copy == NULL) {
				ipst->ips_mrtstat->mrts_fwd_drop++;
				ip1dbg(("ip_mforward: out of memory for "
				    "mblk, mp_copy\n"));
				goto error_return;
			}
			/* Match the lock state of the "found" case above. */
			mutex_enter(&mfc_rt->mfc_mutex);
		}
		/* Get resources for rte, whether first rte or not first. */
		/* Add this packet into rtdetq */
		rte = (struct rtdetq *)mi_zalloc(sizeof (struct rtdetq));
		if (rte == NULL) {
			ipst->ips_mrtstat->mrts_fwd_drop++;
			mutex_exit(&mfc_rt->mfc_mutex);
			ip1dbg(("ip_mforward: out of memory for"
			    " rtdetq, rte\n"));
			goto error_return;
		}

		mp0 = copymsg(mp);
		if (mp0 == NULL) {
			ipst->ips_mrtstat->mrts_fwd_drop++;
			ip1dbg(("ip_mforward: out of memory for mblk, mp0\n"));
			mutex_exit(&mfc_rt->mfc_mutex);
			goto error_return;
		}
		rte->mp = mp0;
		if (pim_reg_packet) {
			ASSERT(ipst->ips_reg_vif_num != ALL_VIFS);
			rte->ill =
			    ipst->ips_vifs[ipst->ips_reg_vif_num].
			    v_ipif->ipif_ill;
		} else {
			rte->ill = ill;
		}
		rte->rte_next = NULL;

		/*
		 * Determine if upcall q (rtdetq) has overflowed.
		 * mfc_rt->mfc_rte is null by mi_zalloc
		 * if it is the first message.
		 */
		for (rte_m = mfc_rt->mfc_rte, npkts = 0; rte_m;
		    rte_m = rte_m->rte_next)
			npkts++;
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: upcalls %d\n", npkts);
		}
		if (npkts > MAX_UPQ) {
			ipst->ips_mrtstat->mrts_upq_ovflw++;
			mutex_exit(&mfc_rt->mfc_mutex);
			goto error_return;
		}

		if (npkts == 0) {	/* first upcall */
			int i = 0;
			/*
			 * Now finish installing the new mfc! Now that we have
			 * resources!  Insert new entry at head of hash chain.
			 * Use src and dst which are ipaddr_t's.
			 */
			mfc_rt->mfc_origin.s_addr = src;
			mfc_rt->mfc_mcastgrp.s_addr = dst;

			mutex_enter(&ipst->ips_numvifs_mutex);
			for (i = 0; i < (int)ipst->ips_numvifs; i++)
				mfc_rt->mfc_ttls[i] = 0;
			mutex_exit(&ipst->ips_numvifs_mutex);
			mfc_rt->mfc_parent = ALL_VIFS;

			/* Link into table */
			if (ipst->ips_ip_mrtdebug > 1) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: NEW MFCTAB hash %d o 0x%x "
				    "g 0x%x\n", hash,
				    ntohl(mfc_rt->mfc_origin.s_addr),
				    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
			}
			mfc_rt->mfc_next = ipst->ips_mfcs[hash].mfcb_mfc;
			ipst->ips_mfcs[hash].mfcb_mfc = mfc_rt;
			mfc_rt->mfc_rte = NULL;
		}

		/* Link in the upcall */
		/* First upcall */
		if (mfc_rt->mfc_rte == NULL)
			mfc_rt->mfc_rte = rte;
		else {
			/* not the first upcall: append at the tail */
			prev_rte = mfc_rt->mfc_rte;
			for (rte1 = mfc_rt->mfc_rte->rte_next; rte1;
			    prev_rte = rte1, rte1 = rte1->rte_next)
				;
			prev_rte->rte_next = rte;
		}

		/*
		 * No upcalls waiting, this is first one, so send a message to
		 * routing daemon to install a route into kernel table.
		 */
		if (npkts == 0) {
			struct igmpmsg	*im;
			/* ipha_protocol is 0, for upcall */
			ASSERT(mp_copy != NULL);
			/* The igmpmsg header is written over the copy's data. */
			im = (struct igmpmsg *)mp_copy->b_rptr;
			im->im_msgtype	= IGMPMSG_NOCACHE;
			im->im_mbz = 0;
			mutex_enter(&ipst->ips_numvifs_mutex);
			if (pim_reg_packet) {
				im->im_vif = (uchar_t)ipst->ips_reg_vif_num;
				mutex_exit(&ipst->ips_numvifs_mutex);
			} else {
				/*
				 * XXX do we need to hold locks here ?
				 */
				/* Report the first vif whose ill is the arrival ill. */
				for (vifi = 0;
				    vifi < ipst->ips_numvifs;
				    vifi++) {
					if (ipst->ips_vifs[vifi].v_ipif == NULL)
						continue;
					if (ipst->ips_vifs[vifi].
					    v_ipif->ipif_ill == ill) {
						im->im_vif = (uchar_t)vifi;
						break;
					}
				}
				mutex_exit(&ipst->ips_numvifs_mutex);
				/*
				 * NOTE(review): if no vif matched, im_vif is
				 * left unset on non-DEBUG kernels.
				 */
				ASSERT(vifi < ipst->ips_numvifs);
			}

			ipst->ips_mrtstat->mrts_upcalls++;
			/* Timer to discard upcalls if mrouted is too slow */
			mfc_rt->mfc_timeout_id = timeout(expire_upcalls,
			    mfc_rt, EXPIRE_TIMEOUT * UPCALL_EXPIRE);
			mutex_exit(&mfc_rt->mfc_mutex);
			mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
			/* Pass to RAWIP */
			(mrouter->conn_recv)(mrouter, mp_copy, NULL);
		} else {
			mutex_exit(&mfc_rt->mfc_mutex);
			mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
			/* mp_copy is NULL here: it is only made for a new mfc. */
			freemsg(mp_copy);
		}

		MFCB_REFRELE(mfcbp);
		if (tunnel_src != 0)
			return (1);
		else
			return (0);
	/*
	 * Reached with mfcb_lock held and mfc_mutex already released by
	 * whoever jumped here.  mfc_rt is freed only if it was allocated
	 * by this call.
	 */
	error_return:
		mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
		MFCB_REFRELE(mfcbp);
		if (mfc_rt != NULL && (new_mfc == B_TRUE))
			mi_free((char *)mfc_rt);
		if (rte != NULL)
			mi_free((char *)rte);
		if (mp_copy != NULL)
			freemsg(mp_copy);
		if (mp0 != NULL)
			freemsg(mp0);
		return (-1);
	}
}

/*
 * Clean up the mfctable cache entry if upcall is not serviced.
 * SunOS 5.x has timeout per mfc, unlike BSD which has one timer.
1975 */ 1976 static void 1977 expire_upcalls(void *arg) 1978 { 1979 struct mfc *mfc_rt = arg; 1980 uint_t hash; 1981 struct mfc *prev_mfc, *mfc0; 1982 ip_stack_t *ipst; 1983 conn_t *mrouter; 1984 1985 if (mfc_rt->mfc_rte == NULL || mfc_rt->mfc_rte->ill != NULL) { 1986 cmn_err(CE_WARN, "expire_upcalls: no ILL\n"); 1987 return; 1988 } 1989 ipst = mfc_rt->mfc_rte->ill->ill_ipst; 1990 mrouter = ipst->ips_ip_g_mrouter; 1991 1992 hash = MFCHASH(mfc_rt->mfc_origin.s_addr, mfc_rt->mfc_mcastgrp.s_addr); 1993 if (ipst->ips_ip_mrtdebug > 1) { 1994 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 1995 "expire_upcalls: hash %d s %x g %x", 1996 hash, ntohl(mfc_rt->mfc_origin.s_addr), 1997 ntohl(mfc_rt->mfc_mcastgrp.s_addr)); 1998 } 1999 MFCB_REFHOLD(&ipst->ips_mfcs[hash]); 2000 mutex_enter(&mfc_rt->mfc_mutex); 2001 /* 2002 * if timeout has been set to zero, than the 2003 * entry has been filled, no need to delete it. 2004 */ 2005 if (mfc_rt->mfc_timeout_id == 0) 2006 goto done; 2007 ipst->ips_mrtstat->mrts_cache_cleanups++; 2008 mfc_rt->mfc_timeout_id = 0; 2009 2010 /* Determine entry to be cleaned up in cache table. */ 2011 for (prev_mfc = mfc0 = ipst->ips_mfcs[hash].mfcb_mfc; mfc0; 2012 prev_mfc = mfc0, mfc0 = mfc0->mfc_next) 2013 if (mfc0 == mfc_rt) 2014 break; 2015 2016 /* del_mfc takes care of gone mfcs */ 2017 ASSERT(prev_mfc != NULL); 2018 ASSERT(mfc0 != NULL); 2019 2020 /* 2021 * Delete the entry from the cache 2022 */ 2023 ipst->ips_mfcs[hash].mfcb_marks |= MFCB_MARK_CONDEMNED; 2024 mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED; 2025 2026 /* 2027 * release_mfc will drop all queued upcall packets. 2028 * and will free the mbuf with the pkt, if, timing info. 2029 */ 2030 done: 2031 mutex_exit(&mfc_rt->mfc_mutex); 2032 MFCB_REFRELE(&ipst->ips_mfcs[hash]); 2033 } 2034 2035 /* 2036 * Packet forwarding routine once entry in the cache is made. 
*/
/*
 * ip_mdq() checks that the packet arrived on the parent vif recorded in
 * rt, updates the per-vif and per-route counters, and then hands a copy of
 * the packet to every good vif whose ttl threshold in rt is non-zero and
 * lower than the packet's ttl.
 *
 * Returns -1 when the packet must be dropped (no parent vif, parent vif
 * not usable, illegal vif index, or out of memory while building a
 * wrong-vif notification), otherwise 1 if tunnel_src is non-zero and 0 if
 * it is zero.
 */
static int
ip_mdq(mblk_t *mp, ipha_t *ipha, ill_t *ill, ipaddr_t tunnel_src,
    struct mfc *rt)
{
	vifi_t vifi;
	struct vif *vifp;
	ipaddr_t dst = ipha->ipha_dst;
	size_t  plen = msgdsize(mp);	/* data bytes, used for byte counters */
	vifi_t num_of_vifs;
	ip_stack_t	*ipst = ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "ip_mdq: SEND src %x, ipha_dst %x, ill %s",
		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
		    ill->ill_name);
	}

	/* Macro to send packet on vif */
#define	MC_SEND(ipha, mp, vifp, dst) { \
	if ((vifp)->v_flags & VIFF_TUNNEL) \
		encap_send((ipha), (mp), (vifp), (dst)); \
	else if ((vifp)->v_flags & VIFF_REGISTER) \
		register_send((ipha), (mp), (vifp), (dst)); \
	else \
		phyint_send((ipha), (mp), (vifp), (dst)); \
}

	vifi = rt->mfc_parent;

	/*
	 * The value of vifi is MAXVIFS if the pkt had no parent, i.e.,
	 * Mrouted had no route.
	 * We wanted the route installed in the mfctable to prevent multiple
	 * tries, so it passed add_mfc(), but is discarded here. The v_ipif is
	 * NULL so we don't want to check the ill. Still needed as of Mrouted
	 * 3.6.
	 */
	if (vifi == NO_VIF) {
		ip1dbg(("ip_mdq: no route for origin ill %s, vifi is NO_VIF\n",
		    ill->ill_name));
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mdq: vifi is NO_VIF ill = %s", ill->ill_name);
		}
		return (-1);	/* drop pkt */
	}

	/*
	 * From here until unlock_good_vif() every return path must release
	 * the parent vif.
	 */
	if (!lock_good_vif(&ipst->ips_vifs[vifi]))
		return (-1);
	/*
	 * The MFC entries are not cleaned up when an ipif goes
	 * away thus this code has to guard against an MFC referencing
	 * an ipif that has been closed. Note: reset_mrt_vif_ipif
	 * sets the v_ipif to NULL when the ipif disappears.
	 */
	ASSERT(ipst->ips_vifs[vifi].v_ipif != NULL);

	if (vifi >= ipst->ips_numvifs) {
		cmn_err(CE_WARN, "ip_mdq: illegal vifi %d numvifs "
		    "%d ill %s viftable ill %s\n",
		    (int)vifi, (int)ipst->ips_numvifs, ill->ill_name,
		    ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name);
		unlock_good_vif(&ipst->ips_vifs[vifi]);
		return (-1);
	}
	/*
	 * Don't forward if it didn't arrive from the parent vif for its
	 * origin. But do match on the groups as we nominate only one
	 * ill in the group for receiving allmulti packets.
	 */
	if ((ipst->ips_vifs[vifi].v_ipif->ipif_ill != ill &&
	    (ill->ill_group == NULL ||
	    ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_group !=
	    ill->ill_group)) ||
	    (ipst->ips_vifs[vifi].v_rmt_addr.s_addr != tunnel_src)) {
		/* Came in the wrong interface */
		ip1dbg(("ip_mdq: arrived wrong if, vifi %d "
		    "numvifs %d ill %s viftable ill %s\n",
		    (int)vifi, (int)ipst->ips_numvifs, ill->ill_name,
		    ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name));
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mdq: arrived wrong if, vifi %d ill "
			    "%s viftable ill %s\n",
			    (int)vifi, ill->ill_name,
			    ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name);
		}
		ipst->ips_mrtstat->mrts_wrong_if++;
		rt->mfc_wrong_if++;

		/*
		 * If we are doing PIM assert processing and we are forwarding
		 * packets on this interface, and it is a broadcast medium
		 * interface (and not a tunnel), send a message to the routing.
		 *
		 * We use the first ipif on the list, since it's all we have.
		 * Chances are the ipif_flags are the same for ipifs on the ill.
		 */
		if (ipst->ips_pim_assert && rt->mfc_ttls[vifi] > 0 &&
		    (ill->ill_ipif->ipif_flags & IPIF_BROADCAST) &&
		    !(ipst->ips_vifs[vifi].v_flags & VIFF_TUNNEL)) {
			mblk_t		*mp_copy;
			struct igmpmsg	*im;

			/* TODO could copy header and dup rest */
			mp_copy = copymsg(mp);
			if (mp_copy == NULL) {
				ipst->ips_mrtstat->mrts_fwd_drop++;
				ip1dbg(("ip_mdq: out of memory "
				    "for mblk, mp_copy\n"));
				unlock_good_vif(&ipst->ips_vifs[vifi]);
				return (-1);
			}

			/* The igmpmsg header is written over the copy's data. */
			im = (struct igmpmsg *)mp_copy->b_rptr;
			im->im_msgtype = IGMPMSG_WRONGVIF;
			im->im_mbz = 0;
			im->im_vif = (ushort_t)vifi;
			/* Pass to RAWIP */
			(mrouter->conn_recv)(mrouter, mp_copy, NULL);
		}
		unlock_good_vif(&ipst->ips_vifs[vifi]);
		if (tunnel_src != 0)
			return (1);
		else
			return (0);
	}
	/*
	 * If I sourced this packet, it counts as output, else it was input.
	 */
	if (ipha->ipha_src == ipst->ips_vifs[vifi].v_lcl_addr.s_addr) {
		ipst->ips_vifs[vifi].v_pkt_out++;
		ipst->ips_vifs[vifi].v_bytes_out += plen;
	} else {
		ipst->ips_vifs[vifi].v_pkt_in++;
		ipst->ips_vifs[vifi].v_bytes_in += plen;
	}
	mutex_enter(&rt->mfc_mutex);
	rt->mfc_pkt_cnt++;
	rt->mfc_byte_cnt += plen;
	mutex_exit(&rt->mfc_mutex);
	/* Done with the parent vif; output vifs are locked one at a time. */
	unlock_good_vif(&ipst->ips_vifs[vifi]);
	/*
	 * For each vif, decide if a copy of the packet should be forwarded.
	 * Forward if:
	 *		- the vif threshold ttl is non-zero AND
	 *		- the pkt ttl exceeds the vif's threshold
	 * A non-zero mfc_ttl indicates that the vif is part of
	 * the output set for the mfc entry.
	 */
	/* Snapshot the vif count so the loop bound is stable. */
	mutex_enter(&ipst->ips_numvifs_mutex);
	num_of_vifs = ipst->ips_numvifs;
	mutex_exit(&ipst->ips_numvifs_mutex);
	for (vifp = ipst->ips_vifs, vifi = 0;
	    vifi < num_of_vifs;
	    vifp++, vifi++) {
		if (!lock_good_vif(vifp))
			continue;
		if ((rt->mfc_ttls[vifi] > 0) &&
		    (ipha->ipha_ttl > rt->mfc_ttls[vifi])) {
			/*
			 * lock_good_vif should not have succeeded if
			 * v_ipif is null.
			 */
			ASSERT(vifp->v_ipif != NULL);
			vifp->v_pkt_out++;
			vifp->v_bytes_out += plen;
			MC_SEND(ipha, mp, vifp, dst);
			ipst->ips_mrtstat->mrts_fwd_out++;
		}
		unlock_good_vif(vifp);
	}
	if (tunnel_src != 0)
		return (1);
	else
		return (0);
}

/*
 * Send the packet on physical interface.
 * Caller assumes can continue to use mp on return.
 *
 * A copy of mp is made and sent; mp itself is left untouched.  With a
 * rate limit of zero or less the copy goes straight to
 * tbf_send_packet(), otherwise it is handed to tbf_control().  If the
 * copy cannot be allocated the packet is counted as a forwarding drop.
 */
/* ARGSUSED */
static void
phyint_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
{
	mblk_t 	*mp_copy;
	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	/* Make a new reference to the packet */
	mp_copy = copymsg(mp);	/* TODO could copy header and dup rest */
	if (mp_copy == NULL) {
		ipst->ips_mrtstat->mrts_fwd_drop++;
		ip1dbg(("phyint_send: out of memory for mblk, mp_copy\n"));
		return;
	}
	if (vifp->v_rate_limit <= 0)
		tbf_send_packet(vifp, mp_copy);
	else  {
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "phyint_send: tbf_contr rate %d "
			    "vifp 0x%p mp 0x%p dst 0x%x",
			    vifp->v_rate_limit, (void *)vifp, (void *)mp, dst);
		}
		/* The IP header passed along is the copy's, not the original's. */
		tbf_control(vifp, mp_copy, (ipha_t *)mp_copy->b_rptr);
	}
}

/*
 * Send the whole packet for REGISTER encapsulation to PIM daemon
 * Caller assumes it can continue to use mp on return.
2253 */ 2254 /* ARGSUSED */ 2255 static void 2256 register_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst) 2257 { 2258 struct igmpmsg *im; 2259 mblk_t *mp_copy; 2260 ipha_t *ipha_copy; 2261 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 2262 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2263 2264 if (ipst->ips_ip_mrtdebug > 1) { 2265 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2266 "register_send: src %x, dst %x\n", 2267 ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst)); 2268 } 2269 2270 /* 2271 * Copy the old packet & pullup its IP header into the new mblk_t so we 2272 * can modify it. Try to fill the new mblk_t since if we don't the 2273 * ethernet driver will. 2274 */ 2275 mp_copy = allocb(sizeof (struct igmpmsg) + sizeof (ipha_t), BPRI_MED); 2276 if (mp_copy == NULL) { 2277 ++ipst->ips_mrtstat->mrts_pim_nomemory; 2278 if (ipst->ips_ip_mrtdebug > 3) { 2279 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2280 "register_send: allocb failure."); 2281 } 2282 return; 2283 } 2284 2285 /* 2286 * Bump write pointer to account for igmpmsg being added. 2287 */ 2288 mp_copy->b_wptr = mp_copy->b_rptr + sizeof (struct igmpmsg); 2289 2290 /* 2291 * Chain packet to new mblk_t. 2292 */ 2293 if ((mp_copy->b_cont = copymsg(mp)) == NULL) { 2294 ++ipst->ips_mrtstat->mrts_pim_nomemory; 2295 if (ipst->ips_ip_mrtdebug > 3) { 2296 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2297 "register_send: copymsg failure."); 2298 } 2299 freeb(mp_copy); 2300 return; 2301 } 2302 2303 /* 2304 * icmp_input() asserts that IP version field is set to an 2305 * appropriate version. Hence, the struct igmpmsg that this really 2306 * becomes, needs to have the correct IP version field. 2307 */ 2308 ipha_copy = (ipha_t *)mp_copy->b_rptr; 2309 *ipha_copy = multicast_encap_iphdr; 2310 2311 /* 2312 * The kernel uses the struct igmpmsg header to encode the messages to 2313 * the multicast routing daemon. 
Fill in the fields in the header 2314 * starting with the message type which is IGMPMSG_WHOLEPKT 2315 */ 2316 im = (struct igmpmsg *)mp_copy->b_rptr; 2317 im->im_msgtype = IGMPMSG_WHOLEPKT; 2318 im->im_src.s_addr = ipha->ipha_src; 2319 im->im_dst.s_addr = ipha->ipha_dst; 2320 2321 /* 2322 * Must Be Zero. This is because the struct igmpmsg is really an IP 2323 * header with renamed fields and the multicast routing daemon uses 2324 * an ipha_protocol (aka im_mbz) of 0 to distinguish these messages. 2325 */ 2326 im->im_mbz = 0; 2327 2328 ++ipst->ips_mrtstat->mrts_upcalls; 2329 if (!canputnext(mrouter->conn_rq)) { 2330 ++ipst->ips_mrtstat->mrts_pim_regsend_drops; 2331 if (ipst->ips_ip_mrtdebug > 3) { 2332 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2333 "register_send: register upcall failure."); 2334 } 2335 freemsg(mp_copy); 2336 } else { 2337 /* Pass to RAWIP */ 2338 (mrouter->conn_recv)(mrouter, mp_copy, NULL); 2339 } 2340 } 2341 2342 /* 2343 * pim_validate_cksum handles verification of the checksum in the 2344 * pim header. For PIM Register packets, the checksum is calculated 2345 * across the PIM header only. For all other packets, the checksum 2346 * is for the PIM header and remainder of the packet. 2347 * 2348 * returns: B_TRUE, if checksum is okay. 2349 * B_FALSE, if checksum is not valid. 2350 */ 2351 static boolean_t 2352 pim_validate_cksum(mblk_t *mp, ipha_t *ip, struct pim *pimp) 2353 { 2354 mblk_t *mp_dup; 2355 2356 if ((mp_dup = dupmsg(mp)) == NULL) 2357 return (B_FALSE); 2358 2359 mp_dup->b_rptr += IPH_HDR_LENGTH(ip); 2360 if (pimp->pim_type == PIM_REGISTER) 2361 mp_dup->b_wptr = mp_dup->b_rptr + PIM_MINLEN; 2362 if (IP_CSUM(mp_dup, 0, 0)) { 2363 freemsg(mp_dup); 2364 return (B_FALSE); 2365 } 2366 freemsg(mp_dup); 2367 return (B_TRUE); 2368 } 2369 2370 /* 2371 * int 2372 * pim_input(queue_t *, mblk_t *, ill_t *ill) - Process PIM protocol packets. 2373 * IP Protocol 103. Register messages are decapsulated and sent 2374 * onto multicast forwarding. 
2375 */ 2376 int 2377 pim_input(queue_t *q, mblk_t *mp, ill_t *ill) 2378 { 2379 ipha_t *eip, *ip; 2380 int iplen, pimlen, iphlen; 2381 struct pim *pimp; /* pointer to a pim struct */ 2382 uint32_t *reghdr; 2383 ip_stack_t *ipst = ill->ill_ipst; 2384 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2385 2386 /* 2387 * Pullup the msg for PIM protocol processing. 2388 */ 2389 if (pullupmsg(mp, -1) == 0) { 2390 ++ipst->ips_mrtstat->mrts_pim_nomemory; 2391 freemsg(mp); 2392 return (-1); 2393 } 2394 2395 ip = (ipha_t *)mp->b_rptr; 2396 iplen = ip->ipha_length; 2397 iphlen = IPH_HDR_LENGTH(ip); 2398 pimlen = ntohs(iplen) - iphlen; 2399 2400 /* 2401 * Validate lengths 2402 */ 2403 if (pimlen < PIM_MINLEN) { 2404 ++ipst->ips_mrtstat->mrts_pim_malformed; 2405 if (ipst->ips_ip_mrtdebug > 1) { 2406 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2407 "pim_input: length not at least minlen"); 2408 } 2409 freemsg(mp); 2410 return (-1); 2411 } 2412 2413 /* 2414 * Point to the PIM header. 2415 */ 2416 pimp = (struct pim *)((caddr_t)ip + iphlen); 2417 2418 /* 2419 * Check the version number. 
2420 */ 2421 if (pimp->pim_vers != PIM_VERSION) { 2422 ++ipst->ips_mrtstat->mrts_pim_badversion; 2423 if (ipst->ips_ip_mrtdebug > 1) { 2424 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2425 "pim_input: unknown version of PIM"); 2426 } 2427 freemsg(mp); 2428 return (-1); 2429 } 2430 2431 /* 2432 * Validate the checksum 2433 */ 2434 if (!pim_validate_cksum(mp, ip, pimp)) { 2435 ++ipst->ips_mrtstat->mrts_pim_rcv_badcsum; 2436 if (ipst->ips_ip_mrtdebug > 1) { 2437 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2438 "pim_input: invalid checksum"); 2439 } 2440 freemsg(mp); 2441 return (-1); 2442 } 2443 2444 if (pimp->pim_type != PIM_REGISTER) 2445 return (0); 2446 2447 reghdr = (uint32_t *)(pimp + 1); 2448 eip = (ipha_t *)(reghdr + 1); 2449 2450 /* 2451 * check if the inner packet is destined to mcast group 2452 */ 2453 if (!CLASSD(eip->ipha_dst)) { 2454 ++ipst->ips_mrtstat->mrts_pim_badregisters; 2455 if (ipst->ips_ip_mrtdebug > 1) { 2456 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2457 "pim_input: Inner pkt not mcast .. !"); 2458 } 2459 freemsg(mp); 2460 return (-1); 2461 } 2462 if (ipst->ips_ip_mrtdebug > 1) { 2463 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2464 "register from %x, to %x, len %d", 2465 ntohl(eip->ipha_src), 2466 ntohl(eip->ipha_dst), 2467 ntohs(eip->ipha_length)); 2468 } 2469 /* 2470 * If the null register bit is not set, decapsulate 2471 * the packet before forwarding it. 2472 */ 2473 if (!(ntohl(*reghdr) & PIM_NULL_REGISTER)) { 2474 mblk_t *mp_copy; 2475 2476 /* Copy the message */ 2477 if ((mp_copy = copymsg(mp)) == NULL) { 2478 ++ipst->ips_mrtstat->mrts_pim_nomemory; 2479 freemsg(mp); 2480 return (-1); 2481 } 2482 2483 /* 2484 * Decapsulate the packet and give it to 2485 * register_mforward. 
2486 */ 2487 mp_copy->b_rptr += iphlen + sizeof (pim_t) + 2488 sizeof (*reghdr); 2489 if (register_mforward(q, mp_copy, ill) != 0) { 2490 freemsg(mp); 2491 return (-1); 2492 } 2493 } 2494 2495 /* 2496 * Pass all valid PIM packets up to any process(es) listening on a raw 2497 * PIM socket. For Solaris it is done right after pim_input() is 2498 * called. 2499 */ 2500 return (0); 2501 } 2502 2503 /* 2504 * PIM sparse mode hook. Called by pim_input after decapsulating 2505 * the packet. Loop back the packet, as if we have received it. 2506 * In pim_input() we have to check if the destination is a multicast address. 2507 */ 2508 /* ARGSUSED */ 2509 static int 2510 register_mforward(queue_t *q, mblk_t *mp, ill_t *ill) 2511 { 2512 ip_stack_t *ipst = ill->ill_ipst; 2513 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2514 2515 ASSERT(ipst->ips_reg_vif_num <= ipst->ips_numvifs); 2516 2517 if (ipst->ips_ip_mrtdebug > 3) { 2518 ipha_t *ipha; 2519 2520 ipha = (ipha_t *)mp->b_rptr; 2521 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2522 "register_mforward: src %x, dst %x\n", 2523 ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst)); 2524 } 2525 /* 2526 * Need to pass in to ip_mforward() the information that the 2527 * packet has arrived on the register_vif. We use the solution that 2528 * ip_mroute_decap() employs: use mp->b_prev to pass some information 2529 * to ip_mforward(). Nonzero value means the packet has arrived on a 2530 * tunnel (ip_mroute_decap() puts the address of the other side of the 2531 * tunnel there.) This is safe since ip_rput() either frees the packet 2532 * or passes it to ip_mforward(). We use 2533 * PIM_REGISTER_MARKER = 0xffffffff to indicate the has arrived on the 2534 * register vif. If in the future we have more than one register vifs, 2535 * then this will need re-examination. 
2536 */ 2537 mp->b_prev = (mblk_t *)PIM_REGISTER_MARKER; 2538 ++ipst->ips_mrtstat->mrts_pim_regforwards; 2539 ip_rput(q, mp); 2540 return (0); 2541 } 2542 2543 /* 2544 * Send an encapsulated packet. 2545 * Caller assumes can continue to use mp when routine returns. 2546 */ 2547 /* ARGSUSED */ 2548 static void 2549 encap_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst) 2550 { 2551 mblk_t *mp_copy; 2552 ipha_t *ipha_copy; 2553 size_t len; 2554 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 2555 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2556 2557 if (ipst->ips_ip_mrtdebug > 1) { 2558 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2559 "encap_send: vif %ld enter", 2560 (ptrdiff_t)(vifp - ipst->ips_vifs)); 2561 } 2562 len = ntohs(ipha->ipha_length); 2563 2564 /* 2565 * Copy the old packet & pullup it's IP header into the 2566 * new mbuf so we can modify it. Try to fill the new 2567 * mbuf since if we don't the ethernet driver will. 2568 */ 2569 mp_copy = allocb(32 + sizeof (multicast_encap_iphdr), BPRI_MED); 2570 if (mp_copy == NULL) 2571 return; 2572 mp_copy->b_rptr += 32; 2573 mp_copy->b_wptr = mp_copy->b_rptr + sizeof (multicast_encap_iphdr); 2574 if ((mp_copy->b_cont = copymsg(mp)) == NULL) { 2575 freeb(mp_copy); 2576 return; 2577 } 2578 2579 /* 2580 * Fill in the encapsulating IP header. 2581 * Remote tunnel dst in rmt_addr, from add_vif(). 2582 */ 2583 ipha_copy = (ipha_t *)mp_copy->b_rptr; 2584 *ipha_copy = multicast_encap_iphdr; 2585 ASSERT((len + sizeof (ipha_t)) <= IP_MAXPACKET); 2586 ipha_copy->ipha_length = htons(len + sizeof (ipha_t)); 2587 ipha_copy->ipha_src = vifp->v_lcl_addr.s_addr; 2588 ipha_copy->ipha_dst = vifp->v_rmt_addr.s_addr; 2589 ASSERT(ipha_copy->ipha_ident == 0); 2590 2591 /* Turn the encapsulated IP header back into a valid one. 
*/ 2592 ipha = (ipha_t *)mp_copy->b_cont->b_rptr; 2593 ipha->ipha_ttl--; 2594 ipha->ipha_hdr_checksum = 0; 2595 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 2596 2597 if (ipst->ips_ip_mrtdebug > 1) { 2598 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2599 "encap_send: group 0x%x", ntohl(ipha->ipha_dst)); 2600 } 2601 if (vifp->v_rate_limit <= 0) 2602 tbf_send_packet(vifp, mp_copy); 2603 else 2604 /* ipha is from the original header */ 2605 tbf_control(vifp, mp_copy, ipha); 2606 } 2607 2608 /* 2609 * De-encapsulate a packet and feed it back through IP input. 2610 * This routine is called whenever IP gets a packet with prototype 2611 * IPPROTO_ENCAP and a local destination address. 2612 */ 2613 void 2614 ip_mroute_decap(queue_t *q, mblk_t *mp, ill_t *ill) 2615 { 2616 ipha_t *ipha = (ipha_t *)mp->b_rptr; 2617 ipha_t *ipha_encap; 2618 int hlen = IPH_HDR_LENGTH(ipha); 2619 ipaddr_t src; 2620 struct vif *vifp; 2621 ip_stack_t *ipst = ill->ill_ipst; 2622 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2623 2624 /* 2625 * Dump the packet if it's not to a multicast destination or if 2626 * we don't have an encapsulating tunnel with the source. 2627 * Note: This code assumes that the remote site IP address 2628 * uniquely identifies the tunnel (i.e., that this site has 2629 * at most one tunnel with the remote site). 
2630 */ 2631 ipha_encap = (ipha_t *)((char *)ipha + hlen); 2632 if (!CLASSD(ipha_encap->ipha_dst)) { 2633 ipst->ips_mrtstat->mrts_bad_tunnel++; 2634 ip1dbg(("ip_mroute_decap: bad tunnel\n")); 2635 freemsg(mp); 2636 return; 2637 } 2638 src = (ipaddr_t)ipha->ipha_src; 2639 mutex_enter(&ipst->ips_last_encap_lock); 2640 if (src != ipst->ips_last_encap_src) { 2641 struct vif *vife; 2642 2643 vifp = ipst->ips_vifs; 2644 vife = vifp + ipst->ips_numvifs; 2645 ipst->ips_last_encap_src = src; 2646 ipst->ips_last_encap_vif = 0; 2647 for (; vifp < vife; ++vifp) { 2648 if (!lock_good_vif(vifp)) 2649 continue; 2650 if (vifp->v_rmt_addr.s_addr == src) { 2651 if (vifp->v_flags & VIFF_TUNNEL) 2652 ipst->ips_last_encap_vif = vifp; 2653 if (ipst->ips_ip_mrtdebug > 1) { 2654 (void) mi_strlog(mrouter->conn_rq, 2655 1, SL_TRACE, 2656 "ip_mroute_decap: good tun " 2657 "vif %ld with %x", 2658 (ptrdiff_t)(vifp - ipst->ips_vifs), 2659 ntohl(src)); 2660 } 2661 unlock_good_vif(vifp); 2662 break; 2663 } 2664 unlock_good_vif(vifp); 2665 } 2666 } 2667 if ((vifp = ipst->ips_last_encap_vif) == 0) { 2668 mutex_exit(&ipst->ips_last_encap_lock); 2669 ipst->ips_mrtstat->mrts_bad_tunnel++; 2670 freemsg(mp); 2671 ip1dbg(("ip_mroute_decap: vif %ld no tunnel with %x\n", 2672 (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(src))); 2673 return; 2674 } 2675 mutex_exit(&ipst->ips_last_encap_lock); 2676 2677 /* 2678 * Need to pass in the tunnel source to ip_mforward (so that it can 2679 * verify that the packet arrived over the correct vif.) We use b_prev 2680 * to pass this information. This is safe since the ip_rput either 2681 * frees the packet or passes it to ip_mforward. 2682 */ 2683 mp->b_prev = (mblk_t *)(uintptr_t)src; 2684 mp->b_rptr += hlen; 2685 /* Feed back into ip_rput as an M_DATA. */ 2686 ip_rput(q, mp); 2687 } 2688 2689 /* 2690 * Remove all records with v_ipif == ipif. Called when an interface goes away 2691 * (stream closed). Called as writer. 
2692 */ 2693 void 2694 reset_mrt_vif_ipif(ipif_t *ipif) 2695 { 2696 vifi_t vifi, tmp_vifi; 2697 vifi_t num_of_vifs; 2698 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 2699 2700 /* Can't check vifi >= 0 since vifi_t is unsigned! */ 2701 2702 mutex_enter(&ipst->ips_numvifs_mutex); 2703 num_of_vifs = ipst->ips_numvifs; 2704 mutex_exit(&ipst->ips_numvifs_mutex); 2705 2706 for (vifi = num_of_vifs; vifi != 0; vifi--) { 2707 tmp_vifi = vifi - 1; 2708 if (ipst->ips_vifs[tmp_vifi].v_ipif == ipif) { 2709 (void) del_vif(&tmp_vifi, NULL, NULL, ipst); 2710 } 2711 } 2712 } 2713 2714 /* Remove pending upcall msgs when ill goes away. Called by ill_delete. */ 2715 void 2716 reset_mrt_ill(ill_t *ill) 2717 { 2718 struct mfc *rt; 2719 struct rtdetq *rte; 2720 int i; 2721 ip_stack_t *ipst = ill->ill_ipst; 2722 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2723 2724 for (i = 0; i < MFCTBLSIZ; i++) { 2725 MFCB_REFHOLD(&ipst->ips_mfcs[i]); 2726 if ((rt = ipst->ips_mfcs[i].mfcb_mfc) != NULL) { 2727 if (ipst->ips_ip_mrtdebug > 1) { 2728 (void) mi_strlog(mrouter->conn_rq, 1, 2729 SL_TRACE, 2730 "reset_mrt_ill: mfctable [%d]", i); 2731 } 2732 while (rt != NULL) { 2733 mutex_enter(&rt->mfc_mutex); 2734 while ((rte = rt->mfc_rte) != NULL) { 2735 if (rte->ill == ill) { 2736 if (ipst->ips_ip_mrtdebug > 1) { 2737 (void) mi_strlog( 2738 mrouter->conn_rq, 2739 1, SL_TRACE, 2740 "reset_mrt_ill: " 2741 "ill 0x%p", (void *)ill); 2742 } 2743 rt->mfc_rte = rte->rte_next; 2744 freemsg(rte->mp); 2745 mi_free((char *)rte); 2746 } 2747 } 2748 mutex_exit(&rt->mfc_mutex); 2749 rt = rt->mfc_next; 2750 } 2751 } 2752 MFCB_REFRELE(&ipst->ips_mfcs[i]); 2753 } 2754 } 2755 2756 /* 2757 * Token bucket filter module. 2758 * The ipha is for mcastgrp destination for phyint and encap. 
2759 */ 2760 static void 2761 tbf_control(struct vif *vifp, mblk_t *mp, ipha_t *ipha) 2762 { 2763 size_t p_len = msgdsize(mp); 2764 struct tbf *t = vifp->v_tbf; 2765 timeout_id_t id = 0; 2766 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 2767 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2768 2769 /* Drop if packet is too large */ 2770 if (p_len > MAX_BKT_SIZE) { 2771 ipst->ips_mrtstat->mrts_pkt2large++; 2772 freemsg(mp); 2773 return; 2774 } 2775 if (ipst->ips_ip_mrtdebug > 1) { 2776 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2777 "tbf_ctrl: SEND vif %ld, qlen %d, ipha_dst 0x%x", 2778 (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_q_len, 2779 ntohl(ipha->ipha_dst)); 2780 } 2781 2782 mutex_enter(&t->tbf_lock); 2783 2784 tbf_update_tokens(vifp); 2785 2786 /* 2787 * If there are enough tokens, 2788 * and the queue is empty, send this packet out. 2789 */ 2790 if (ipst->ips_ip_mrtdebug > 1) { 2791 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2792 "tbf_control: vif %ld, TOKENS %d, pkt len %lu, qlen %d", 2793 (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_n_tok, p_len, 2794 t->tbf_q_len); 2795 } 2796 /* No packets are queued */ 2797 if (t->tbf_q_len == 0) { 2798 /* queue empty, send packet if enough tokens */ 2799 if (p_len <= t->tbf_n_tok) { 2800 t->tbf_n_tok -= p_len; 2801 mutex_exit(&t->tbf_lock); 2802 tbf_send_packet(vifp, mp); 2803 return; 2804 } else { 2805 /* Queue packet and timeout till later */ 2806 tbf_queue(vifp, mp); 2807 ASSERT(vifp->v_timeout_id == 0); 2808 vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp, 2809 TBF_REPROCESS); 2810 } 2811 } else if (t->tbf_q_len < t->tbf_max_q_len) { 2812 /* Finite queue length, so queue pkts and process queue */ 2813 tbf_queue(vifp, mp); 2814 tbf_process_q(vifp); 2815 } else { 2816 /* Check that we have UDP header with IP header */ 2817 size_t hdr_length = IPH_HDR_LENGTH(ipha) + 2818 sizeof (struct udphdr); 2819 2820 if ((mp->b_wptr - mp->b_rptr) < hdr_length) { 2821 if (!pullupmsg(mp, hdr_length)) { 2822 freemsg(mp); 
2823 ip1dbg(("tbf_ctl: couldn't pullup udp hdr, " 2824 "vif %ld src 0x%x dst 0x%x\n", 2825 (ptrdiff_t)(vifp - ipst->ips_vifs), 2826 ntohl(ipha->ipha_src), 2827 ntohl(ipha->ipha_dst))); 2828 mutex_exit(&vifp->v_tbf->tbf_lock); 2829 return; 2830 } else 2831 /* Have to reassign ipha after pullupmsg */ 2832 ipha = (ipha_t *)mp->b_rptr; 2833 } 2834 /* 2835 * Queue length too much, 2836 * try to selectively dq, or queue and process 2837 */ 2838 if (!tbf_dq_sel(vifp, ipha)) { 2839 ipst->ips_mrtstat->mrts_q_overflow++; 2840 freemsg(mp); 2841 } else { 2842 tbf_queue(vifp, mp); 2843 tbf_process_q(vifp); 2844 } 2845 } 2846 if (t->tbf_q_len == 0) { 2847 id = vifp->v_timeout_id; 2848 vifp->v_timeout_id = 0; 2849 } 2850 mutex_exit(&vifp->v_tbf->tbf_lock); 2851 if (id != 0) 2852 (void) untimeout(id); 2853 } 2854 2855 /* 2856 * Adds a packet to the tbf queue at the interface. 2857 * The ipha is for mcastgrp destination for phyint and encap. 2858 */ 2859 static void 2860 tbf_queue(struct vif *vifp, mblk_t *mp) 2861 { 2862 struct tbf *t = vifp->v_tbf; 2863 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 2864 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2865 2866 if (ipst->ips_ip_mrtdebug > 1) { 2867 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2868 "tbf_queue: vif %ld", (ptrdiff_t)(vifp - ipst->ips_vifs)); 2869 } 2870 ASSERT(MUTEX_HELD(&t->tbf_lock)); 2871 2872 if (t->tbf_t == NULL) { 2873 /* Queue was empty */ 2874 t->tbf_q = mp; 2875 } else { 2876 /* Insert at tail */ 2877 t->tbf_t->b_next = mp; 2878 } 2879 /* set new tail pointer */ 2880 t->tbf_t = mp; 2881 2882 mp->b_next = mp->b_prev = NULL; 2883 2884 t->tbf_q_len++; 2885 } 2886 2887 /* 2888 * Process the queue at the vif interface. 2889 * Drops the tbf_lock when sending packets. 2890 * 2891 * NOTE : The caller should quntimeout if the queue length is 0. 
2892 */ 2893 static void 2894 tbf_process_q(struct vif *vifp) 2895 { 2896 mblk_t *mp; 2897 struct tbf *t = vifp->v_tbf; 2898 size_t len; 2899 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 2900 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2901 2902 if (ipst->ips_ip_mrtdebug > 1) { 2903 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2904 "tbf_process_q 1: vif %ld qlen = %d", 2905 (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_q_len); 2906 } 2907 2908 /* 2909 * Loop through the queue at the interface and send 2910 * as many packets as possible. 2911 */ 2912 ASSERT(MUTEX_HELD(&t->tbf_lock)); 2913 2914 while (t->tbf_q_len > 0) { 2915 mp = t->tbf_q; 2916 len = (size_t)msgdsize(mp); /* length of ip pkt */ 2917 2918 /* Determine if the packet can be sent */ 2919 if (len <= t->tbf_n_tok) { 2920 /* 2921 * If so, reduce no. of tokens, dequeue the packet, 2922 * send the packet. 2923 */ 2924 t->tbf_n_tok -= len; 2925 2926 t->tbf_q = mp->b_next; 2927 if (--t->tbf_q_len == 0) { 2928 t->tbf_t = NULL; 2929 } 2930 mp->b_next = NULL; 2931 /* Exit mutex before sending packet, then re-enter */ 2932 mutex_exit(&t->tbf_lock); 2933 tbf_send_packet(vifp, mp); 2934 mutex_enter(&t->tbf_lock); 2935 } else 2936 break; 2937 } 2938 } 2939 2940 /* Called at tbf timeout to update tokens, process q and reset timer. 
*/ 2941 static void 2942 tbf_reprocess_q(void *arg) 2943 { 2944 struct vif *vifp = arg; 2945 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 2946 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2947 2948 mutex_enter(&vifp->v_tbf->tbf_lock); 2949 vifp->v_timeout_id = 0; 2950 tbf_update_tokens(vifp); 2951 2952 tbf_process_q(vifp); 2953 2954 if (vifp->v_tbf->tbf_q_len > 0) { 2955 vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp, 2956 TBF_REPROCESS); 2957 } 2958 mutex_exit(&vifp->v_tbf->tbf_lock); 2959 2960 if (ipst->ips_ip_mrtdebug > 1) { 2961 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2962 "tbf_reprcess_q: vif %ld timeout id = %p", 2963 (ptrdiff_t)(vifp - ipst->ips_vifs), vifp->v_timeout_id); 2964 } 2965 } 2966 2967 /* 2968 * Function that will selectively discard a member of the tbf queue, 2969 * based on the precedence value and the priority. 2970 * 2971 * NOTE : The caller should quntimeout if the queue length is 0. 2972 */ 2973 static int 2974 tbf_dq_sel(struct vif *vifp, ipha_t *ipha) 2975 { 2976 uint_t p; 2977 struct tbf *t = vifp->v_tbf; 2978 mblk_t **np; 2979 mblk_t *last, *mp; 2980 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 2981 conn_t *mrouter = ipst->ips_ip_g_mrouter; 2982 2983 if (ipst->ips_ip_mrtdebug > 1) { 2984 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 2985 "dq_sel: vif %ld dst 0x%x", 2986 (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(ipha->ipha_dst)); 2987 } 2988 2989 ASSERT(MUTEX_HELD(&t->tbf_lock)); 2990 p = priority(vifp, ipha); 2991 2992 np = &t->tbf_q; 2993 last = NULL; 2994 while ((mp = *np) != NULL) { 2995 if (p > (priority(vifp, (ipha_t *)mp->b_rptr))) { 2996 *np = mp->b_next; 2997 /* If removing the last packet, fix the tail pointer */ 2998 if (mp == t->tbf_t) 2999 t->tbf_t = last; 3000 mp->b_prev = mp->b_next = NULL; 3001 freemsg(mp); 3002 /* 3003 * It's impossible for the queue to be empty, but 3004 * we check anyway. 
3005 */ 3006 if (--t->tbf_q_len == 0) { 3007 t->tbf_t = NULL; 3008 } 3009 ipst->ips_mrtstat->mrts_drop_sel++; 3010 return (1); 3011 } 3012 np = &mp->b_next; 3013 last = mp; 3014 } 3015 return (0); 3016 } 3017 3018 /* Sends packet, 2 cases - encap tunnel, phyint. */ 3019 static void 3020 tbf_send_packet(struct vif *vifp, mblk_t *mp) 3021 { 3022 ipif_t *ipif; 3023 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 3024 conn_t *mrouter = ipst->ips_ip_g_mrouter; 3025 3026 /* If encap tunnel options */ 3027 if (vifp->v_flags & VIFF_TUNNEL) { 3028 if (ipst->ips_ip_mrtdebug > 1) { 3029 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 3030 "tbf_send_pkt: ENCAP tunnel vif %ld", 3031 (ptrdiff_t)(vifp - ipst->ips_vifs)); 3032 } 3033 3034 /* 3035 * Feed into ip_wput which will set the ident field and 3036 * checksum the encapsulating header. 3037 * BSD gets the cached route vifp->v_route from ip_output() 3038 * to speed up route table lookups. Not necessary in SunOS 5.x. 3039 */ 3040 put(vifp->v_ipif->ipif_wq, mp); 3041 return; 3042 3043 /* phyint */ 3044 } else { 3045 /* Need to loop back to members on the outgoing interface. */ 3046 ipha_t *ipha; 3047 ipaddr_t dst; 3048 ipha = (ipha_t *)mp->b_rptr; 3049 dst = ipha->ipha_dst; 3050 ipif = vifp->v_ipif; 3051 3052 mutex_enter(&ipif->ipif_ill->ill_lock); 3053 if (ilm_lookup_ipif(ipif, dst) != NULL) { 3054 /* 3055 * The packet is not yet reassembled, thus we need to 3056 * pass it to ip_rput_local for checksum verification 3057 * and reassembly (and fanout the user stream). 
3058 */ 3059 mblk_t *mp_loop; 3060 ire_t *ire; 3061 3062 mutex_exit(&ipif->ipif_ill->ill_lock); 3063 if (ipst->ips_ip_mrtdebug > 1) { 3064 (void) mi_strlog(mrouter->conn_rq, 1, 3065 SL_TRACE, 3066 "tbf_send_pkt: loopback vif %ld", 3067 (ptrdiff_t)(vifp - ipst->ips_vifs)); 3068 } 3069 mp_loop = copymsg(mp); 3070 ire = ire_ctable_lookup(~0, 0, IRE_BROADCAST, NULL, 3071 ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 3072 3073 if (mp_loop != NULL && ire != NULL) { 3074 IP_RPUT_LOCAL(ipif->ipif_rq, mp_loop, 3075 ((ipha_t *)mp_loop->b_rptr), 3076 ire, (ill_t *)ipif->ipif_rq->q_ptr); 3077 } else { 3078 /* Either copymsg failed or no ire */ 3079 (void) mi_strlog(mrouter->conn_rq, 1, 3080 SL_TRACE, 3081 "tbf_send_pkt: mp_loop 0x%p, ire 0x%p " 3082 "vif %ld\n", (void *)mp_loop, (void *)ire, 3083 (ptrdiff_t)(vifp - ipst->ips_vifs)); 3084 } 3085 if (ire != NULL) 3086 ire_refrele(ire); 3087 } else { 3088 mutex_exit(&ipif->ipif_ill->ill_lock); 3089 } 3090 if (ipst->ips_ip_mrtdebug > 1) { 3091 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 3092 "tbf_send_pkt: phyint forward vif %ld dst = 0x%x", 3093 (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(dst)); 3094 } 3095 ip_rput_forward_multicast(dst, mp, ipif); 3096 } 3097 } 3098 3099 /* 3100 * Determine the current time and then the elapsed time (between the last time 3101 * and time now). Update the no. of tokens in the bucket. 3102 */ 3103 static void 3104 tbf_update_tokens(struct vif *vifp) 3105 { 3106 timespec_t tp; 3107 hrtime_t tm; 3108 struct tbf *t = vifp->v_tbf; 3109 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 3110 conn_t *mrouter = ipst->ips_ip_g_mrouter; 3111 3112 ASSERT(MUTEX_HELD(&t->tbf_lock)); 3113 3114 /* Time in secs and nsecs, rate limit in kbits/sec */ 3115 gethrestime(&tp); 3116 3117 /*LINTED*/ 3118 TV_DELTA(tp, t->tbf_last_pkt_t, tm); 3119 3120 /* 3121 * This formula is actually 3122 * "time in seconds" * "bytes/second". Scaled for nsec. 
3123 * (tm/1000000000) * (v_rate_limit * 1000 * (1000/1024) /8) 3124 * 3125 * The (1000/1024) was introduced in add_vif to optimize 3126 * this divide into a shift. 3127 */ 3128 t->tbf_n_tok += (tm/1000) * vifp->v_rate_limit / 1024 / 8; 3129 t->tbf_last_pkt_t = tp; 3130 3131 if (t->tbf_n_tok > MAX_BKT_SIZE) 3132 t->tbf_n_tok = MAX_BKT_SIZE; 3133 if (ipst->ips_ip_mrtdebug > 1) { 3134 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 3135 "tbf_update_tok: tm %lld tok %d vif %ld", 3136 tm, t->tbf_n_tok, (ptrdiff_t)(vifp - ipst->ips_vifs)); 3137 } 3138 } 3139 3140 /* 3141 * Priority currently is based on port nos. 3142 * Different forwarding mechanisms have different ways 3143 * of obtaining the port no. Hence, the vif must be 3144 * given along with the packet itself. 3145 * 3146 */ 3147 static int 3148 priority(struct vif *vifp, ipha_t *ipha) 3149 { 3150 int prio; 3151 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 3152 conn_t *mrouter = ipst->ips_ip_g_mrouter; 3153 3154 /* Temporary hack; may add general packet classifier some day */ 3155 3156 ASSERT(MUTEX_HELD(&vifp->v_tbf->tbf_lock)); 3157 3158 /* 3159 * The UDP port space is divided up into four priority ranges: 3160 * [0, 16384) : unclassified - lowest priority 3161 * [16384, 32768) : audio - highest priority 3162 * [32768, 49152) : whiteboard - medium priority 3163 * [49152, 65536) : video - low priority 3164 */ 3165 3166 if (ipha->ipha_protocol == IPPROTO_UDP) { 3167 struct udphdr *udp = 3168 (struct udphdr *)((char *)ipha + IPH_HDR_LENGTH(ipha)); 3169 switch (ntohs(udp->uh_dport) & 0xc000) { 3170 case 0x4000: 3171 prio = 70; 3172 break; 3173 case 0x8000: 3174 prio = 60; 3175 break; 3176 case 0xc000: 3177 prio = 55; 3178 break; 3179 default: 3180 prio = 50; 3181 break; 3182 } 3183 if (ipst->ips_ip_mrtdebug > 1) { 3184 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 3185 "priority: port %x prio %d\n", 3186 ntohs(udp->uh_dport), prio); 3187 } 3188 } else 3189 prio = 50; /* default priority */ 3190 return 
(prio); 3191 } 3192 3193 /* 3194 * End of token bucket filter modifications 3195 */ 3196 3197 3198 3199 /* 3200 * Produces data for netstat -M. 3201 */ 3202 int 3203 ip_mroute_stats(mblk_t *mp, ip_stack_t *ipst) 3204 { 3205 ipst->ips_mrtstat->mrts_vifctlSize = sizeof (struct vifctl); 3206 ipst->ips_mrtstat->mrts_mfcctlSize = sizeof (struct mfcctl); 3207 if (!snmp_append_data(mp, (char *)ipst->ips_mrtstat, 3208 sizeof (struct mrtstat))) { 3209 ip0dbg(("ip_mroute_stats: failed %ld bytes\n", 3210 (size_t)sizeof (struct mrtstat))); 3211 return (0); 3212 } 3213 return (1); 3214 } 3215 3216 /* 3217 * Sends info for SNMP's MIB. 3218 */ 3219 int 3220 ip_mroute_vif(mblk_t *mp, ip_stack_t *ipst) 3221 { 3222 struct vifctl vi; 3223 vifi_t vifi; 3224 3225 mutex_enter(&ipst->ips_numvifs_mutex); 3226 for (vifi = 0; vifi < ipst->ips_numvifs; vifi++) { 3227 if (ipst->ips_vifs[vifi].v_lcl_addr.s_addr == 0) 3228 continue; 3229 /* 3230 * No locks here, an approximation is fine. 3231 */ 3232 vi.vifc_vifi = vifi; 3233 vi.vifc_flags = ipst->ips_vifs[vifi].v_flags; 3234 vi.vifc_threshold = ipst->ips_vifs[vifi].v_threshold; 3235 vi.vifc_rate_limit = ipst->ips_vifs[vifi].v_rate_limit; 3236 vi.vifc_lcl_addr = ipst->ips_vifs[vifi].v_lcl_addr; 3237 vi.vifc_rmt_addr = ipst->ips_vifs[vifi].v_rmt_addr; 3238 vi.vifc_pkt_in = ipst->ips_vifs[vifi].v_pkt_in; 3239 vi.vifc_pkt_out = ipst->ips_vifs[vifi].v_pkt_out; 3240 3241 if (!snmp_append_data(mp, (char *)&vi, sizeof (vi))) { 3242 ip0dbg(("ip_mroute_vif: failed %ld bytes\n", 3243 (size_t)sizeof (vi))); 3244 return (0); 3245 } 3246 } 3247 mutex_exit(&ipst->ips_numvifs_mutex); 3248 return (1); 3249 } 3250 3251 /* 3252 * Called by ip_snmp_get to send up multicast routing table. 3253 */ 3254 int 3255 ip_mroute_mrt(mblk_t *mp, ip_stack_t *ipst) 3256 { 3257 int i, j; 3258 struct mfc *rt; 3259 struct mfcctl mfcc; 3260 3261 /* 3262 * Make sure multicast has not been turned off. 
3263 */ 3264 if (is_mrouter_off(ipst)) 3265 return (1); 3266 3267 /* Loop over all hash buckets and their chains */ 3268 for (i = 0; i < MFCTBLSIZ; i++) { 3269 MFCB_REFHOLD(&ipst->ips_mfcs[i]); 3270 for (rt = ipst->ips_mfcs[i].mfcb_mfc; rt; rt = rt->mfc_next) { 3271 mutex_enter(&rt->mfc_mutex); 3272 if (rt->mfc_rte != NULL || 3273 (rt->mfc_marks & MFCB_MARK_CONDEMNED)) { 3274 mutex_exit(&rt->mfc_mutex); 3275 continue; 3276 } 3277 mfcc.mfcc_origin = rt->mfc_origin; 3278 mfcc.mfcc_mcastgrp = rt->mfc_mcastgrp; 3279 mfcc.mfcc_parent = rt->mfc_parent; 3280 mfcc.mfcc_pkt_cnt = rt->mfc_pkt_cnt; 3281 mutex_enter(&ipst->ips_numvifs_mutex); 3282 for (j = 0; j < (int)ipst->ips_numvifs; j++) 3283 mfcc.mfcc_ttls[j] = rt->mfc_ttls[j]; 3284 for (j = (int)ipst->ips_numvifs; j < MAXVIFS; j++) 3285 mfcc.mfcc_ttls[j] = 0; 3286 mutex_exit(&ipst->ips_numvifs_mutex); 3287 3288 mutex_exit(&rt->mfc_mutex); 3289 if (!snmp_append_data(mp, (char *)&mfcc, 3290 sizeof (mfcc))) { 3291 MFCB_REFRELE(&ipst->ips_mfcs[i]); 3292 ip0dbg(("ip_mroute_mrt: failed %ld bytes\n", 3293 (size_t)sizeof (mfcc))); 3294 return (0); 3295 } 3296 } 3297 MFCB_REFRELE(&ipst->ips_mfcs[i]); 3298 } 3299 return (1); 3300 } 3301