1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. 23 * All rights reserved. Use is subject to license terms. 24 */ 25 /* Copyright (c) 1990 Mentat Inc. */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* 30 * Procedures for the kernel part of DVMRP, 31 * a Distance-Vector Multicast Routing Protocol. 32 * (See RFC-1075) 33 * Written by David Waitzman, BBN Labs, August 1988. 34 * Modified by Steve Deering, Stanford, February 1989. 35 * Modified by Mark J. 
Steiglitz, Stanford, May, 1991 36 * Modified by Van Jacobson, LBL, January 1993 37 * Modified by Ajit Thyagarajan, PARC, August 1993 38 * Modified by Bill Fenner, PARC, April 1995 39 * 40 * MROUTING 3.5 41 */ 42 43 /* 44 * TODO 45 * - function pointer field in vif, void *vif_sendit() 46 */ 47 48 #include <sys/types.h> 49 #include <sys/stream.h> 50 #include <sys/stropts.h> 51 #include <sys/strlog.h> 52 #include <sys/systm.h> 53 #include <sys/ddi.h> 54 #include <sys/cmn_err.h> 55 #include <sys/zone.h> 56 57 #include <sys/param.h> 58 #include <sys/socket.h> 59 #include <sys/vtrace.h> 60 #include <sys/debug.h> 61 #include <net/if.h> 62 #include <sys/sockio.h> 63 #include <netinet/in.h> 64 #include <net/if_dl.h> 65 66 #include <inet/common.h> 67 #include <inet/mi.h> 68 #include <inet/nd.h> 69 #include <inet/mib2.h> 70 #include <netinet/ip6.h> 71 #include <inet/ip.h> 72 #include <inet/snmpcom.h> 73 74 #include <netinet/igmp.h> 75 #include <netinet/igmp_var.h> 76 #include <netinet/udp.h> 77 #include <netinet/ip_mroute.h> 78 #include <inet/ip_multi.h> 79 #include <inet/ip_ire.h> 80 #include <inet/ip_if.h> 81 #include <inet/ipclassifier.h> 82 83 #include <netinet/pim.h> 84 85 86 /* 87 * MT Design: 88 * 89 * There are three main data structures viftable, mfctable and tbftable that 90 * need to be protected against MT races. 91 * 92 * vitable is a fixed length array of vif structs. There is no lock to protect 93 * the whole array, instead each struct is protected by its own indiviual lock. 94 * The value of v_marks in conjuction with the value of v_refcnt determines the 95 * current state of a vif structure. One special state that needs mention 96 * is when the vif is marked VIF_MARK_NOTINUSE but refcnt != 0. This indicates 97 * that vif is being initalized. 98 * Each structure is freed when the refcnt goes down to zero. If a delete comes 99 * in when the the recfnt is > 1, the vif structure is marked VIF_MARK_CONDEMNED 100 * which prevents the struct from further use. 
 * When the refcnt goes to zero
 * the struct is freed and is marked VIF_MARK_NOTINUSE.
 * The vif struct stores a pointer to the ipif in v_ipif; to prevent the
 * ipif/ill from going away, a refhold is put on the ipif before using it.
 * See lock_good_vif() and unlock_good_vif().
 *
 * VIF_REFHOLD and VIF_REFRELE macros have been provided to manipulate refcnts
 * of the vif struct.
 *
 * tbftable is also a fixed length array of tbf structs and is only accessed
 * via v_tbf.  It is protected by its own lock tbf_lock.
 *
 * Lock Ordering is
 * v_lock --> tbf_lock
 * v_lock --> ill_lock
 *
 * mfctable is a fixed size hash table of mfc bucket structs (struct mfcb).
 * Each mfc bucket struct (struct mfcb) maintains a refcnt for each walker;
 * it also maintains a state. These fields are protected by a lock (mfcb_lock).
 * mfc structs only maintain a state and have no refcnt. mfc_mutex is used to
 * protect the struct elements.
 *
 * mfc structs are dynamically allocated and are singly linked
 * at the head of the chain. When an mfc structure is to be deleted
 * it is marked condemned and so is the state in the bucket struct.
 * When the last walker of the hash bucket exits, all the mfc structs
 * marked condemned are freed.
 *
 * Locking Hierarchy:
 * The bucket lock should be acquired before the mfc struct lock.
 * MFCB_REFHOLD and MFCB_REFRELE macros are provided for locking
 * operations on the bucket struct.
 *
 * last_encap_lock and numvifs_mutex should be acquired after
 * acquiring vif or mfc locks. These locks protect some global variables.
 *
 * The statistics are not currently protected by a lock,
 * causing the stats to be approximate, not exact.
 */

/*
 * Vif index meaning "no route for this source"; equal to MAXVIFS, i.e. one
 * past the last usable vif index.
 */
#define	NO_VIF	MAXVIFS		/* from mrouted, no route for src */

/*
 * Timeouts:
 *	Upcall timeouts - BSD uses boolean_t mfc->expire and
 *	nexpire[MFCTBLSIZE], the number of times expire has been called.
 *	SunOS 5.x uses mfc->timeout for each mfc.
 *	Some Unixes are limited in the number of simultaneous timeouts
 *	that can be run, SunOS 5.x does not have this restriction.
 */

/*
 * In BSD, EXPIRE_TIMEOUT is how often expire_upcalls() is called and
 * UPCALL_EXPIRE is the number of timeouts before a particular upcall
 * expires. Thus the time till expiration is EXPIRE_TIMEOUT * UPCALL_EXPIRE
 */
#define	EXPIRE_TIMEOUT	(hz/4)	/* 4x / second	*/
#define	UPCALL_EXPIRE	6	/* number of timeouts	*/

/*
 * Hash function for a source, group entry.
 * Folds the origin (a) and group (g) addresses together and reduces the
 * result with MFCHASHMOD (defined elsewhere).
 */
#define	MFCHASH(a, g) MFCHASHMOD(((a) >> 20) ^ ((a) >> 10) ^ (a) ^ \
	((g) >> 20) ^ ((g) >> 10) ^ (g))

/* Interval, in clock ticks, between token bucket queue reprocessing runs. */
#define	TBF_REPROCESS	(hz / 100)	/* 100x /second	*/

/* Identify PIM packet that came on a Register interface */
#define	PIM_REGISTER_MARKER	0xffffffff

/* Function declarations */
static int	add_mfc(struct mfcctl *, ip_stack_t *);
static int	add_vif(struct vifctl *, queue_t *, mblk_t *, ip_stack_t *);
static int	del_mfc(struct mfcctl *, ip_stack_t *);
static int	del_vif(vifi_t *, queue_t *, mblk_t *, ip_stack_t *);
static void	del_vifp(struct vif *);
static void	encap_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
static void	expire_upcalls(void *);
static void	fill_route(struct mfc *, struct mfcctl *, ip_stack_t *);
static void	free_queue(struct mfc *);
static int	get_assert(uchar_t *, ip_stack_t *);
static int	get_lsg_cnt(struct sioc_lsg_req *, ip_stack_t *);
static int	get_sg_cnt(struct sioc_sg_req *, ip_stack_t *);
static int	get_version(uchar_t *);
static int	get_vif_cnt(struct sioc_vif_req *, ip_stack_t *);
static int	ip_mdq(mblk_t *, ipha_t *, ill_t *,
		    ipaddr_t, struct mfc *);
static int	ip_mrouter_init(queue_t *, uchar_t *, int, ip_stack_t *);
static void	phyint_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
static int	register_mforward(queue_t *, mblk_t *, ill_t *);
static void	register_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
static int	set_assert(int *, ip_stack_t *);

/*
 * Token Bucket Filter functions
 */
static int  priority(struct vif *, ipha_t *);
static void tbf_control(struct vif *, mblk_t *, ipha_t *);
static int  tbf_dq_sel(struct vif *, ipha_t *);
static void tbf_process_q(struct vif *);
static void tbf_queue(struct vif *, mblk_t *);
static void tbf_reprocess_q(void *);
static void tbf_send_packet(struct vif *, mblk_t *);
static void tbf_update_tokens(struct vif *);
/*
 * Not a token bucket routine: invoked by MFCB_REFRELE when the last
 * reference on a condemned mfc bucket is dropped.
 */
static void release_mfc(struct mfcb *);

/* B_TRUE when no multicast routing daemon is currently registered. */
static boolean_t is_mrouter_off(ip_stack_t *);
/*
 * Encapsulation packets
 */

/* TTL placed in the outer IP header of encapsulated packets. */
#define	ENCAP_TTL	64

/*
 * prototype IP hdr for encapsulated packets
 * The initializer is positional, so it relies on the member order of ipha_t.
 */
static ipha_t multicast_encap_iphdr = {
	IP_SIMPLE_HDR_VERSION,
	0,				/* tos */
	sizeof (ipha_t),		/* total length */
	0,				/* id */
	0,				/* frag offset */
	ENCAP_TTL, IPPROTO_ENCAP,
	0,				/* checksum */
};

/*
 * Rate limit for assert notification messages, in nsec.
226 */ 227 #define ASSERT_MSG_TIME 3000000000 228 229 230 #define VIF_REFHOLD(vifp) { \ 231 mutex_enter(&(vifp)->v_lock); \ 232 (vifp)->v_refcnt++; \ 233 mutex_exit(&(vifp)->v_lock); \ 234 } 235 236 #define VIF_REFRELE_LOCKED(vifp) { \ 237 (vifp)->v_refcnt--; \ 238 if ((vifp)->v_refcnt == 0 && \ 239 ((vifp)->v_marks & VIF_MARK_CONDEMNED)) { \ 240 del_vifp(vifp); \ 241 } else { \ 242 mutex_exit(&(vifp)->v_lock); \ 243 } \ 244 } 245 246 #define VIF_REFRELE(vifp) { \ 247 mutex_enter(&(vifp)->v_lock); \ 248 (vifp)->v_refcnt--; \ 249 if ((vifp)->v_refcnt == 0 && \ 250 ((vifp)->v_marks & VIF_MARK_CONDEMNED)) { \ 251 del_vifp(vifp); \ 252 } else { \ 253 mutex_exit(&(vifp)->v_lock); \ 254 } \ 255 } 256 257 #define MFCB_REFHOLD(mfcb) { \ 258 mutex_enter(&(mfcb)->mfcb_lock); \ 259 (mfcb)->mfcb_refcnt++; \ 260 ASSERT((mfcb)->mfcb_refcnt != 0); \ 261 mutex_exit(&(mfcb)->mfcb_lock); \ 262 } 263 264 #define MFCB_REFRELE(mfcb) { \ 265 mutex_enter(&(mfcb)->mfcb_lock); \ 266 ASSERT((mfcb)->mfcb_refcnt != 0); \ 267 if (--(mfcb)->mfcb_refcnt == 0 && \ 268 ((mfcb)->mfcb_marks & MFCB_MARK_CONDEMNED)) { \ 269 release_mfc(mfcb); \ 270 } \ 271 mutex_exit(&(mfcb)->mfcb_lock); \ 272 } 273 274 /* 275 * MFCFIND: 276 * Find a route for a given origin IP address and multicast group address. 277 * Skip entries with pending upcalls. 278 * Type of service parameter to be added in the future! 279 */ 280 #define MFCFIND(mfcbp, o, g, rt) { \ 281 struct mfc *_mb_rt = NULL; \ 282 rt = NULL; \ 283 _mb_rt = mfcbp->mfcb_mfc; \ 284 while (_mb_rt) { \ 285 if ((_mb_rt->mfc_origin.s_addr == o) && \ 286 (_mb_rt->mfc_mcastgrp.s_addr == g) && \ 287 (_mb_rt->mfc_rte == NULL) && \ 288 (!(_mb_rt->mfc_marks & MFCB_MARK_CONDEMNED))) { \ 289 rt = _mb_rt; \ 290 break; \ 291 } \ 292 _mb_rt = _mb_rt->mfc_next; \ 293 } \ 294 } 295 296 /* 297 * BSD uses timeval with sec and usec. In SunOS 5.x uniqtime() and gethrtime() 298 * are inefficient. 
We use gethrestime() which returns a timespec_t with 299 * sec and nsec, the resolution is machine dependent. 300 * The following 2 macros have been changed to use nsec instead of usec. 301 */ 302 /* 303 * Macros to compute elapsed time efficiently. 304 * Borrowed from Van Jacobson's scheduling code. 305 * Delta should be a hrtime_t. 306 */ 307 #define TV_DELTA(a, b, delta) { \ 308 int xxs; \ 309 \ 310 delta = (a).tv_nsec - (b).tv_nsec; \ 311 if ((xxs = (a).tv_sec - (b).tv_sec) != 0) { \ 312 switch (xxs) { \ 313 case 2: \ 314 delta += 1000000000; \ 315 /*FALLTHROUGH*/ \ 316 case 1: \ 317 delta += 1000000000; \ 318 break; \ 319 default: \ 320 delta += (1000000000 * xxs); \ 321 } \ 322 } \ 323 } 324 325 #define TV_LT(a, b) (((a).tv_nsec < (b).tv_nsec && \ 326 (a).tv_sec <= (b).tv_sec) || (a).tv_sec < (b).tv_sec) 327 328 /* 329 * Handle MRT setsockopt commands to modify the multicast routing tables. 330 */ 331 int 332 ip_mrouter_set(int cmd, queue_t *q, int checkonly, uchar_t *data, 333 int datalen, mblk_t *first_mp) 334 { 335 ip_stack_t *ipst = CONNQ_TO_IPST(q); 336 337 mutex_enter(&ipst->ips_ip_g_mrouter_mutex); 338 if (cmd != MRT_INIT && q != ipst->ips_ip_g_mrouter) { 339 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 340 return (EACCES); 341 } 342 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 343 344 if (checkonly) { 345 /* 346 * do not do operation, just pretend to - new T_CHECK 347 * Note: Even routines further on can probably fail but 348 * this T_CHECK stuff is only to please XTI so it not 349 * necessary to be perfect. 350 */ 351 switch (cmd) { 352 case MRT_INIT: 353 case MRT_DONE: 354 case MRT_ADD_VIF: 355 case MRT_DEL_VIF: 356 case MRT_ADD_MFC: 357 case MRT_DEL_MFC: 358 case MRT_ASSERT: 359 return (0); 360 default: 361 return (EOPNOTSUPP); 362 } 363 } 364 365 /* 366 * make sure no command is issued after multicast routing has been 367 * turned off. 
368 */ 369 if (cmd != MRT_INIT && cmd != MRT_DONE) { 370 if (is_mrouter_off(ipst)) 371 return (EINVAL); 372 } 373 374 switch (cmd) { 375 case MRT_INIT: return (ip_mrouter_init(q, data, datalen, ipst)); 376 case MRT_DONE: return (ip_mrouter_done(first_mp, ipst)); 377 case MRT_ADD_VIF: return (add_vif((struct vifctl *)data, q, first_mp, 378 ipst)); 379 case MRT_DEL_VIF: return (del_vif((vifi_t *)data, q, first_mp, ipst)); 380 case MRT_ADD_MFC: return (add_mfc((struct mfcctl *)data, ipst)); 381 case MRT_DEL_MFC: return (del_mfc((struct mfcctl *)data, ipst)); 382 case MRT_ASSERT: return (set_assert((int *)data, ipst)); 383 default: return (EOPNOTSUPP); 384 } 385 } 386 387 /* 388 * Handle MRT getsockopt commands 389 */ 390 int 391 ip_mrouter_get(int cmd, queue_t *q, uchar_t *data) 392 { 393 ip_stack_t *ipst = CONNQ_TO_IPST(q); 394 395 if (q != ipst->ips_ip_g_mrouter) 396 return (EACCES); 397 398 switch (cmd) { 399 case MRT_VERSION: return (get_version((uchar_t *)data)); 400 case MRT_ASSERT: return (get_assert((uchar_t *)data, ipst)); 401 default: return (EOPNOTSUPP); 402 } 403 } 404 405 /* 406 * Handle ioctl commands to obtain information from the cache. 407 * Called with shared access to IP. These are read_only ioctls. 
408 */ 409 /* ARGSUSED */ 410 int 411 mrt_ioctl(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 412 ip_ioctl_cmd_t *ipip, void *if_req) 413 { 414 mblk_t *mp1; 415 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 416 ip_stack_t *ipst = CONNQ_TO_IPST(q); 417 418 /* Existence verified in ip_wput_nondata */ 419 mp1 = mp->b_cont->b_cont; 420 421 switch (iocp->ioc_cmd) { 422 case (SIOCGETVIFCNT): 423 return (get_vif_cnt((struct sioc_vif_req *)mp1->b_rptr, ipst)); 424 case (SIOCGETSGCNT): 425 return (get_sg_cnt((struct sioc_sg_req *)mp1->b_rptr, ipst)); 426 case (SIOCGETLSGCNT): 427 return (get_lsg_cnt((struct sioc_lsg_req *)mp1->b_rptr, ipst)); 428 default: 429 return (EINVAL); 430 } 431 } 432 433 /* 434 * Returns the packet, byte, rpf-failure count for the source, group provided. 435 */ 436 static int 437 get_sg_cnt(struct sioc_sg_req *req, ip_stack_t *ipst) 438 { 439 struct mfc *rt; 440 struct mfcb *mfcbp; 441 442 mfcbp = &ipst->ips_mfcs[MFCHASH(req->src.s_addr, req->grp.s_addr)]; 443 MFCB_REFHOLD(mfcbp); 444 MFCFIND(mfcbp, req->src.s_addr, req->grp.s_addr, rt); 445 446 if (rt != NULL) { 447 mutex_enter(&rt->mfc_mutex); 448 req->pktcnt = rt->mfc_pkt_cnt; 449 req->bytecnt = rt->mfc_byte_cnt; 450 req->wrong_if = rt->mfc_wrong_if; 451 mutex_exit(&rt->mfc_mutex); 452 } else 453 req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffffU; 454 455 MFCB_REFRELE(mfcbp); 456 return (0); 457 } 458 459 /* 460 * Returns the packet, byte, rpf-failure count for the source, group provided. 461 * Uses larger counters and IPv6 addresses. 462 */ 463 /* ARGSUSED XXX until implemented */ 464 static int 465 get_lsg_cnt(struct sioc_lsg_req *req, ip_stack_t *ipst) 466 { 467 /* XXX TODO SIOCGETLSGCNT */ 468 return (ENXIO); 469 } 470 471 /* 472 * Returns the input and output packet and byte counts on the vif provided. 
473 */ 474 static int 475 get_vif_cnt(struct sioc_vif_req *req, ip_stack_t *ipst) 476 { 477 vifi_t vifi = req->vifi; 478 479 if (vifi >= ipst->ips_numvifs) 480 return (EINVAL); 481 482 /* 483 * No locks here, an approximation is fine. 484 */ 485 req->icount = ipst->ips_vifs[vifi].v_pkt_in; 486 req->ocount = ipst->ips_vifs[vifi].v_pkt_out; 487 req->ibytes = ipst->ips_vifs[vifi].v_bytes_in; 488 req->obytes = ipst->ips_vifs[vifi].v_bytes_out; 489 490 return (0); 491 } 492 493 static int 494 get_version(uchar_t *data) 495 { 496 int *v = (int *)data; 497 498 *v = 0x0305; /* XXX !!!! */ 499 500 return (0); 501 } 502 503 /* 504 * Set PIM assert processing global. 505 */ 506 static int 507 set_assert(int *i, ip_stack_t *ipst) 508 { 509 if ((*i != 1) && (*i != 0)) 510 return (EINVAL); 511 512 ipst->ips_pim_assert = *i; 513 514 return (0); 515 } 516 517 /* 518 * Get PIM assert processing global. 519 */ 520 static int 521 get_assert(uchar_t *data, ip_stack_t *ipst) 522 { 523 int *i = (int *)data; 524 525 *i = ipst->ips_pim_assert; 526 527 return (0); 528 } 529 530 /* 531 * Enable multicast routing. 
532 */ 533 static int 534 ip_mrouter_init(queue_t *q, uchar_t *data, int datalen, ip_stack_t *ipst) 535 { 536 conn_t *connp = Q_TO_CONN(q); 537 int *v; 538 539 if (data == NULL || (datalen != sizeof (int))) 540 return (ENOPROTOOPT); 541 542 v = (int *)data; 543 if (*v != 1) 544 return (ENOPROTOOPT); 545 546 mutex_enter(&ipst->ips_ip_g_mrouter_mutex); 547 if (ipst->ips_ip_g_mrouter != NULL) { 548 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 549 return (EADDRINUSE); 550 } 551 552 ipst->ips_ip_g_mrouter = q; 553 connp->conn_multi_router = 1; 554 /* In order for tunnels to work we have to turn ip_g_forward on */ 555 if (!WE_ARE_FORWARDING(ipst)) { 556 if (ipst->ips_ip_mrtdebug > 1) { 557 (void) mi_strlog(q, 1, SL_TRACE, 558 "ip_mrouter_init: turning on forwarding"); 559 } 560 ipst->ips_saved_ip_g_forward = ipst->ips_ip_g_forward; 561 ipst->ips_ip_g_forward = IP_FORWARD_ALWAYS; 562 } 563 564 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 565 return (0); 566 } 567 568 void 569 ip_mrouter_stack_init(ip_stack_t *ipst) 570 { 571 mutex_init(&ipst->ips_ip_g_mrouter_mutex, NULL, MUTEX_DEFAULT, NULL); 572 573 ipst->ips_vifs = kmem_zalloc(sizeof (struct vif) * (MAXVIFS+1), 574 KM_SLEEP); 575 ipst->ips_mrtstat = kmem_zalloc(sizeof (struct mrtstat), KM_SLEEP); 576 /* 577 * mfctable: 578 * Includes all mfcs, including waiting upcalls. 579 * Multiple mfcs per bucket. 580 */ 581 ipst->ips_mfcs = kmem_zalloc(sizeof (struct mfcb) * MFCTBLSIZ, 582 KM_SLEEP); 583 /* 584 * Define the token bucket filter structures. 585 * tbftable -> each vif has one of these for storing info. 586 */ 587 ipst->ips_tbfs = kmem_zalloc(sizeof (struct tbf) * MAXVIFS, KM_SLEEP); 588 589 mutex_init(&ipst->ips_last_encap_lock, NULL, MUTEX_DEFAULT, NULL); 590 591 ipst->ips_mrtstat->mrts_vifctlSize = sizeof (struct vifctl); 592 ipst->ips_mrtstat->mrts_mfcctlSize = sizeof (struct mfcctl); 593 } 594 595 /* 596 * Disable multicast routing. 597 * Didn't use global timeout_val (BSD version), instead check the mfctable. 
598 */ 599 int 600 ip_mrouter_done(mblk_t *mp, ip_stack_t *ipst) 601 { 602 conn_t *connp; 603 vifi_t vifi; 604 struct mfc *mfc_rt; 605 int i; 606 607 mutex_enter(&ipst->ips_ip_g_mrouter_mutex); 608 if (ipst->ips_ip_g_mrouter == NULL) { 609 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 610 return (EINVAL); 611 } 612 613 connp = Q_TO_CONN(ipst->ips_ip_g_mrouter); 614 615 if (ipst->ips_saved_ip_g_forward != -1) { 616 if (ipst->ips_ip_mrtdebug > 1) { 617 (void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE, 618 "ip_mrouter_done: turning off forwarding"); 619 } 620 ipst->ips_ip_g_forward = ipst->ips_saved_ip_g_forward; 621 ipst->ips_saved_ip_g_forward = -1; 622 } 623 624 /* 625 * Always clear cache when vifs change. 626 * No need to get ipst->ips_last_encap_lock since we are running as 627 * a writer. 628 */ 629 mutex_enter(&ipst->ips_last_encap_lock); 630 ipst->ips_last_encap_src = 0; 631 ipst->ips_last_encap_vif = NULL; 632 mutex_exit(&ipst->ips_last_encap_lock); 633 connp->conn_multi_router = 0; 634 635 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 636 637 /* 638 * For each phyint in use, 639 * disable promiscuous reception of all IP multicasts. 640 */ 641 for (vifi = 0; vifi < MAXVIFS; vifi++) { 642 struct vif *vifp = ipst->ips_vifs + vifi; 643 644 mutex_enter(&vifp->v_lock); 645 /* 646 * if the vif is active mark it condemned. 647 */ 648 if (vifp->v_marks & VIF_MARK_GOOD) { 649 ASSERT(vifp->v_ipif != NULL); 650 ipif_refhold(vifp->v_ipif); 651 /* Phyint only */ 652 if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) { 653 ipif_t *ipif = vifp->v_ipif; 654 ipsq_t *ipsq; 655 boolean_t suc; 656 ill_t *ill; 657 658 ill = ipif->ipif_ill; 659 suc = B_FALSE; 660 if (mp == NULL) { 661 /* 662 * being called from ip_close, 663 * lets do it synchronously. 664 * Clear VIF_MARK_GOOD and 665 * set VIF_MARK_CONDEMNED. 
666 */ 667 vifp->v_marks &= ~VIF_MARK_GOOD; 668 vifp->v_marks |= VIF_MARK_CONDEMNED; 669 mutex_exit(&(vifp)->v_lock); 670 suc = ipsq_enter(ill, B_FALSE); 671 ipsq = ill->ill_phyint->phyint_ipsq; 672 } else { 673 ipsq = ipsq_try_enter(ipif, NULL, 674 ipst->ips_ip_g_mrouter, mp, 675 ip_restart_optmgmt, NEW_OP, B_TRUE); 676 if (ipsq == NULL) { 677 mutex_exit(&(vifp)->v_lock); 678 return (EINPROGRESS); 679 } 680 /* 681 * Clear VIF_MARK_GOOD and 682 * set VIF_MARK_CONDEMNED. 683 */ 684 vifp->v_marks &= ~VIF_MARK_GOOD; 685 vifp->v_marks |= VIF_MARK_CONDEMNED; 686 mutex_exit(&(vifp)->v_lock); 687 suc = B_TRUE; 688 } 689 690 if (suc) { 691 (void) ip_delmulti(INADDR_ANY, ipif, 692 B_TRUE, B_TRUE); 693 ipsq_exit(ipsq, B_TRUE, B_TRUE); 694 } 695 mutex_enter(&vifp->v_lock); 696 } 697 /* 698 * decreases the refcnt added in add_vif. 699 * and release v_lock. 700 */ 701 VIF_REFRELE_LOCKED(vifp); 702 } else { 703 mutex_exit(&vifp->v_lock); 704 continue; 705 } 706 } 707 708 mutex_enter(&ipst->ips_numvifs_mutex); 709 ipst->ips_numvifs = 0; 710 ipst->ips_pim_assert = 0; 711 ipst->ips_reg_vif_num = ALL_VIFS; 712 mutex_exit(&ipst->ips_numvifs_mutex); 713 714 /* 715 * Free upcall msgs. 716 * Go through mfctable and stop any outstanding upcall 717 * timeouts remaining on mfcs. 718 */ 719 for (i = 0; i < MFCTBLSIZ; i++) { 720 mutex_enter(&ipst->ips_mfcs[i].mfcb_lock); 721 ipst->ips_mfcs[i].mfcb_refcnt++; 722 ipst->ips_mfcs[i].mfcb_marks |= MFCB_MARK_CONDEMNED; 723 mutex_exit(&ipst->ips_mfcs[i].mfcb_lock); 724 mfc_rt = ipst->ips_mfcs[i].mfcb_mfc; 725 while (mfc_rt) { 726 /* Free upcalls */ 727 mutex_enter(&mfc_rt->mfc_mutex); 728 if (mfc_rt->mfc_rte != NULL) { 729 if (mfc_rt->mfc_timeout_id != 0) { 730 /* 731 * OK to drop the lock as we have 732 * a refcnt on the bucket. timeout 733 * can fire but it will see that 734 * mfc_timeout_id == 0 and not do 735 * anything. see expire_upcalls(). 
736 */ 737 mfc_rt->mfc_timeout_id = 0; 738 mutex_exit(&mfc_rt->mfc_mutex); 739 (void) untimeout( 740 mfc_rt->mfc_timeout_id); 741 mfc_rt->mfc_timeout_id = 0; 742 mutex_enter(&mfc_rt->mfc_mutex); 743 744 /* 745 * all queued upcall packets 746 * and mblk will be freed in 747 * release_mfc(). 748 */ 749 } 750 } 751 752 mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED; 753 754 mutex_exit(&mfc_rt->mfc_mutex); 755 mfc_rt = mfc_rt->mfc_next; 756 } 757 MFCB_REFRELE(&ipst->ips_mfcs[i]); 758 } 759 760 mutex_enter(&ipst->ips_ip_g_mrouter_mutex); 761 ipst->ips_ip_g_mrouter = NULL; 762 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 763 return (0); 764 } 765 766 void 767 ip_mrouter_stack_destroy(ip_stack_t *ipst) 768 { 769 struct mfcb *mfcbp; 770 struct mfc *rt; 771 int i; 772 773 for (i = 0; i < MFCTBLSIZ; i++) { 774 mfcbp = &ipst->ips_mfcs[i]; 775 776 while ((rt = mfcbp->mfcb_mfc) != NULL) { 777 (void) printf("ip_mrouter_stack_destroy: free for %d\n", 778 i); 779 780 mfcbp->mfcb_mfc = rt->mfc_next; 781 free_queue(rt); 782 mi_free(rt); 783 } 784 } 785 kmem_free(ipst->ips_vifs, sizeof (struct vif) * (MAXVIFS+1)); 786 ipst->ips_vifs = NULL; 787 kmem_free(ipst->ips_mrtstat, sizeof (struct mrtstat)); 788 ipst->ips_mrtstat = NULL; 789 kmem_free(ipst->ips_mfcs, sizeof (struct mfcb) * MFCTBLSIZ); 790 ipst->ips_mfcs = NULL; 791 kmem_free(ipst->ips_tbfs, sizeof (struct tbf) * MAXVIFS); 792 ipst->ips_tbfs = NULL; 793 794 mutex_destroy(&ipst->ips_last_encap_lock); 795 mutex_destroy(&ipst->ips_ip_g_mrouter_mutex); 796 } 797 798 static boolean_t 799 is_mrouter_off(ip_stack_t *ipst) 800 { 801 conn_t *connp; 802 803 mutex_enter(&ipst->ips_ip_g_mrouter_mutex); 804 if (ipst->ips_ip_g_mrouter == NULL) { 805 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 806 return (B_TRUE); 807 } 808 809 connp = Q_TO_CONN(ipst->ips_ip_g_mrouter); 810 if (connp->conn_multi_router == 0) { 811 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 812 return (B_TRUE); 813 } 814 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 815 return (B_FALSE); 
816 } 817 818 static void 819 unlock_good_vif(struct vif *vifp) 820 { 821 ASSERT(vifp->v_ipif != NULL); 822 ipif_refrele(vifp->v_ipif); 823 VIF_REFRELE(vifp); 824 } 825 826 static boolean_t 827 lock_good_vif(struct vif *vifp) 828 { 829 mutex_enter(&vifp->v_lock); 830 if (!(vifp->v_marks & VIF_MARK_GOOD)) { 831 mutex_exit(&vifp->v_lock); 832 return (B_FALSE); 833 } 834 835 ASSERT(vifp->v_ipif != NULL); 836 mutex_enter(&vifp->v_ipif->ipif_ill->ill_lock); 837 if (!IPIF_CAN_LOOKUP(vifp->v_ipif)) { 838 mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock); 839 mutex_exit(&vifp->v_lock); 840 return (B_FALSE); 841 } 842 ipif_refhold_locked(vifp->v_ipif); 843 mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock); 844 vifp->v_refcnt++; 845 mutex_exit(&vifp->v_lock); 846 return (B_TRUE); 847 } 848 849 /* 850 * Add a vif to the vif table. 851 */ 852 static int 853 add_vif(struct vifctl *vifcp, queue_t *q, mblk_t *first_mp, ip_stack_t *ipst) 854 { 855 struct vif *vifp = ipst->ips_vifs + vifcp->vifc_vifi; 856 ipif_t *ipif; 857 int error; 858 struct tbf *v_tbf = ipst->ips_tbfs + vifcp->vifc_vifi; 859 conn_t *connp = Q_TO_CONN(q); 860 ipsq_t *ipsq; 861 862 ASSERT(connp != NULL); 863 864 if (vifcp->vifc_vifi >= MAXVIFS) 865 return (EINVAL); 866 867 if (is_mrouter_off(ipst)) 868 return (EINVAL); 869 870 mutex_enter(&vifp->v_lock); 871 /* 872 * Viftable entry should be 0. 873 * if v_marks == 0 but v_refcnt != 0 means struct is being 874 * initialized. 875 * 876 * Also note that it is very unlikely that we will get a MRT_ADD_VIF 877 * request while the delete is in progress, mrouted only sends add 878 * requests when a new interface is added and the new interface cannot 879 * have the same vifi as an existing interface. We make sure that 880 * ill_delete will block till the vif is deleted by adding a refcnt 881 * to ipif in del_vif(). 
882 */ 883 if (vifp->v_lcl_addr.s_addr != 0 || 884 vifp->v_marks != 0 || 885 vifp->v_refcnt != 0) { 886 mutex_exit(&vifp->v_lock); 887 return (EADDRINUSE); 888 } 889 890 /* Incoming vif should not be 0 */ 891 if (vifcp->vifc_lcl_addr.s_addr == 0) { 892 mutex_exit(&vifp->v_lock); 893 return (EINVAL); 894 } 895 896 vifp->v_refcnt++; 897 mutex_exit(&vifp->v_lock); 898 /* Find the interface with the local address */ 899 ipif = ipif_lookup_addr((ipaddr_t)vifcp->vifc_lcl_addr.s_addr, NULL, 900 connp->conn_zoneid, CONNP_TO_WQ(connp), first_mp, 901 ip_restart_optmgmt, &error, ipst); 902 if (ipif == NULL) { 903 VIF_REFRELE(vifp); 904 if (error == EINPROGRESS) 905 return (error); 906 return (EADDRNOTAVAIL); 907 } 908 909 /* 910 * We have to be exclusive as we have to call ip_addmulti() 911 * This is the best position to try to be exclusive in case 912 * we have to wait. 913 */ 914 ipsq = ipsq_try_enter(ipif, NULL, CONNP_TO_WQ(connp), first_mp, 915 ip_restart_optmgmt, NEW_OP, B_TRUE); 916 if ((ipsq) == NULL) { 917 VIF_REFRELE(vifp); 918 ipif_refrele(ipif); 919 return (EINPROGRESS); 920 } 921 922 if (ipst->ips_ip_mrtdebug > 1) { 923 (void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE, 924 "add_vif: src 0x%x enter", 925 vifcp->vifc_lcl_addr.s_addr); 926 } 927 928 mutex_enter(&vifp->v_lock); 929 /* 930 * Always clear cache when vifs change. 931 * Needed to ensure that src isn't left over from before vif was added. 932 * No need to get last_encap_lock, since we are running as a writer. 
933 */ 934 935 mutex_enter(&ipst->ips_last_encap_lock); 936 ipst->ips_last_encap_src = 0; 937 ipst->ips_last_encap_vif = NULL; 938 mutex_exit(&ipst->ips_last_encap_lock); 939 940 if (vifcp->vifc_flags & VIFF_TUNNEL) { 941 if ((vifcp->vifc_flags & VIFF_SRCRT) != 0) { 942 cmn_err(CE_WARN, 943 "add_vif: source route tunnels not supported\n"); 944 VIF_REFRELE_LOCKED(vifp); 945 ipif_refrele(ipif); 946 ipsq_exit(ipsq, B_TRUE, B_TRUE); 947 return (EOPNOTSUPP); 948 } 949 vifp->v_rmt_addr = vifcp->vifc_rmt_addr; 950 951 } else { 952 /* Phyint or Register vif */ 953 if (vifcp->vifc_flags & VIFF_REGISTER) { 954 /* 955 * Note: Since all IPPROTO_IP level options (including 956 * MRT_ADD_VIF) are done exclusively via 957 * ip_optmgmt_writer(), a lock is not necessary to 958 * protect reg_vif_num. 959 */ 960 mutex_enter(&ipst->ips_numvifs_mutex); 961 if (ipst->ips_reg_vif_num == ALL_VIFS) { 962 ipst->ips_reg_vif_num = vifcp->vifc_vifi; 963 mutex_exit(&ipst->ips_numvifs_mutex); 964 } else { 965 mutex_exit(&ipst->ips_numvifs_mutex); 966 VIF_REFRELE_LOCKED(vifp); 967 ipif_refrele(ipif); 968 ipsq_exit(ipsq, B_TRUE, B_TRUE); 969 return (EADDRINUSE); 970 } 971 } 972 973 /* Make sure the interface supports multicast */ 974 if ((ipif->ipif_ill->ill_flags & ILLF_MULTICAST) == 0) { 975 VIF_REFRELE_LOCKED(vifp); 976 ipif_refrele(ipif); 977 if (vifcp->vifc_flags & VIFF_REGISTER) { 978 mutex_enter(&ipst->ips_numvifs_mutex); 979 ipst->ips_reg_vif_num = ALL_VIFS; 980 mutex_exit(&ipst->ips_numvifs_mutex); 981 } 982 ipsq_exit(ipsq, B_TRUE, B_TRUE); 983 return (EOPNOTSUPP); 984 } 985 /* Enable promiscuous reception of all IP mcasts from the if */ 986 mutex_exit(&vifp->v_lock); 987 error = ip_addmulti(INADDR_ANY, ipif, ILGSTAT_NONE, 988 MODE_IS_EXCLUDE, NULL); 989 mutex_enter(&vifp->v_lock); 990 /* 991 * since we released the lock lets make sure that 992 * ip_mrouter_done() has not been called. 
993 */ 994 if (error != 0 || is_mrouter_off(ipst)) { 995 if (error == 0) 996 (void) ip_delmulti(INADDR_ANY, ipif, B_TRUE, 997 B_TRUE); 998 if (vifcp->vifc_flags & VIFF_REGISTER) { 999 mutex_enter(&ipst->ips_numvifs_mutex); 1000 ipst->ips_reg_vif_num = ALL_VIFS; 1001 mutex_exit(&ipst->ips_numvifs_mutex); 1002 } 1003 VIF_REFRELE_LOCKED(vifp); 1004 ipif_refrele(ipif); 1005 ipsq_exit(ipsq, B_TRUE, B_TRUE); 1006 return (error?error:EINVAL); 1007 } 1008 } 1009 /* Define parameters for the tbf structure */ 1010 vifp->v_tbf = v_tbf; 1011 gethrestime(&vifp->v_tbf->tbf_last_pkt_t); 1012 vifp->v_tbf->tbf_n_tok = 0; 1013 vifp->v_tbf->tbf_q_len = 0; 1014 vifp->v_tbf->tbf_max_q_len = MAXQSIZE; 1015 vifp->v_tbf->tbf_q = vifp->v_tbf->tbf_t = NULL; 1016 1017 vifp->v_flags = vifcp->vifc_flags; 1018 vifp->v_threshold = vifcp->vifc_threshold; 1019 vifp->v_lcl_addr = vifcp->vifc_lcl_addr; 1020 vifp->v_ipif = ipif; 1021 ipif_refrele(ipif); 1022 /* Scaling up here, allows division by 1024 in critical code. */ 1023 vifp->v_rate_limit = vifcp->vifc_rate_limit * (1024/1000); 1024 vifp->v_timeout_id = 0; 1025 /* initialize per vif pkt counters */ 1026 vifp->v_pkt_in = 0; 1027 vifp->v_pkt_out = 0; 1028 vifp->v_bytes_in = 0; 1029 vifp->v_bytes_out = 0; 1030 mutex_init(&vifp->v_tbf->tbf_lock, NULL, MUTEX_DEFAULT, NULL); 1031 1032 /* Adjust numvifs up, if the vifi is higher than numvifs */ 1033 mutex_enter(&ipst->ips_numvifs_mutex); 1034 if (ipst->ips_numvifs <= vifcp->vifc_vifi) 1035 ipst->ips_numvifs = vifcp->vifc_vifi + 1; 1036 mutex_exit(&ipst->ips_numvifs_mutex); 1037 1038 if (ipst->ips_ip_mrtdebug > 1) { 1039 (void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE, 1040 "add_vif: #%d, lcladdr %x, %s %x, thresh %x, rate %d", 1041 vifcp->vifc_vifi, 1042 ntohl(vifcp->vifc_lcl_addr.s_addr), 1043 (vifcp->vifc_flags & VIFF_TUNNEL) ? 
"rmtaddr" : "mask", 1044 ntohl(vifcp->vifc_rmt_addr.s_addr), 1045 vifcp->vifc_threshold, vifcp->vifc_rate_limit); 1046 } 1047 1048 vifp->v_marks = VIF_MARK_GOOD; 1049 mutex_exit(&vifp->v_lock); 1050 ipsq_exit(ipsq, B_TRUE, B_TRUE); 1051 return (0); 1052 } 1053 1054 1055 /* Delete a vif from the vif table. */ 1056 static void 1057 del_vifp(struct vif *vifp) 1058 { 1059 struct tbf *t = vifp->v_tbf; 1060 mblk_t *mp0; 1061 vifi_t vifi; 1062 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 1063 1064 ASSERT(vifp->v_marks & VIF_MARK_CONDEMNED); 1065 ASSERT(t != NULL); 1066 1067 /* 1068 * release the ref we put in vif_del. 1069 */ 1070 ASSERT(vifp->v_ipif != NULL); 1071 ipif_refrele(vifp->v_ipif); 1072 1073 if (ipst->ips_ip_mrtdebug > 1) { 1074 (void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE, 1075 "del_vif: src 0x%x\n", vifp->v_lcl_addr.s_addr); 1076 } 1077 1078 if (vifp->v_timeout_id != 0) { 1079 (void) untimeout(vifp->v_timeout_id); 1080 vifp->v_timeout_id = 0; 1081 } 1082 1083 /* 1084 * Free packets queued at the interface. 1085 * Mrouted takes care of cleaning up mfcs - makes calls to del_mfc. 1086 */ 1087 mutex_enter(&t->tbf_lock); 1088 while (t->tbf_q != NULL) { 1089 mp0 = t->tbf_q; 1090 t->tbf_q = t->tbf_q->b_next; 1091 mp0->b_prev = mp0->b_next = NULL; 1092 freemsg(mp0); 1093 } 1094 mutex_exit(&t->tbf_lock); 1095 1096 /* 1097 * Always clear cache when vifs change. 1098 * No need to get last_encap_lock since we are running as a writer. 
1099 */ 1100 mutex_enter(&ipst->ips_last_encap_lock); 1101 if (vifp == ipst->ips_last_encap_vif) { 1102 ipst->ips_last_encap_vif = NULL; 1103 ipst->ips_last_encap_src = 0; 1104 } 1105 mutex_exit(&ipst->ips_last_encap_lock); 1106 1107 mutex_destroy(&t->tbf_lock); 1108 1109 bzero(vifp->v_tbf, sizeof (*(vifp->v_tbf))); 1110 1111 /* Adjust numvifs down */ 1112 mutex_enter(&ipst->ips_numvifs_mutex); 1113 for (vifi = ipst->ips_numvifs; vifi != 0; vifi--) /* vifi is unsigned */ 1114 if (ipst->ips_vifs[vifi - 1].v_lcl_addr.s_addr != 0) 1115 break; 1116 ipst->ips_numvifs = vifi; 1117 mutex_exit(&ipst->ips_numvifs_mutex); 1118 1119 bzero(vifp, sizeof (*vifp)); 1120 } 1121 1122 static int 1123 del_vif(vifi_t *vifip, queue_t *q, mblk_t *first_mp, ip_stack_t *ipst) 1124 { 1125 struct vif *vifp = ipst->ips_vifs + *vifip; 1126 conn_t *connp; 1127 ipsq_t *ipsq; 1128 1129 if (*vifip >= ipst->ips_numvifs) 1130 return (EINVAL); 1131 1132 1133 mutex_enter(&vifp->v_lock); 1134 /* 1135 * Not initialized 1136 * Here we are not looking at the vif that is being initialized 1137 * i.e vifp->v_marks == 0 and refcnt > 0. 1138 */ 1139 if (vifp->v_lcl_addr.s_addr == 0 || 1140 !(vifp->v_marks & VIF_MARK_GOOD)) { 1141 mutex_exit(&vifp->v_lock); 1142 return (EADDRNOTAVAIL); 1143 } 1144 1145 /* 1146 * This is an optimization, if first_mp == NULL 1147 * than we are being called from reset_mrt_vif_ipif() 1148 * so we already have exclusive access to the ipsq. 1149 * the ASSERT below is a check for this condition. 1150 */ 1151 if (first_mp != NULL && 1152 !(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) { 1153 connp = Q_TO_CONN(q); 1154 ASSERT(connp != NULL); 1155 /* 1156 * We have to be exclusive as we have to call ip_delmulti() 1157 * This is the best position to try to be exclusive in case 1158 * we have to wait. 
1159 */ 1160 ipsq = ipsq_try_enter(vifp->v_ipif, NULL, CONNP_TO_WQ(connp), 1161 first_mp, ip_restart_optmgmt, NEW_OP, B_TRUE); 1162 if ((ipsq) == NULL) { 1163 mutex_exit(&vifp->v_lock); 1164 return (EINPROGRESS); 1165 } 1166 /* recheck after being exclusive */ 1167 if (vifp->v_lcl_addr.s_addr == 0 || 1168 !vifp->v_marks & VIF_MARK_GOOD) { 1169 /* 1170 * someone beat us. 1171 */ 1172 mutex_exit(&vifp->v_lock); 1173 ipsq_exit(ipsq, B_TRUE, B_TRUE); 1174 return (EADDRNOTAVAIL); 1175 } 1176 } 1177 1178 1179 ASSERT(IAM_WRITER_IPIF(vifp->v_ipif)); 1180 1181 1182 /* 1183 * add a refhold so that ipif does not go away while 1184 * there are still users, this will be released in del_vifp 1185 * when we free the vif. 1186 */ 1187 ipif_refhold(vifp->v_ipif); 1188 1189 /* Clear VIF_MARK_GOOD and set VIF_MARK_CONDEMNED. */ 1190 vifp->v_marks &= ~VIF_MARK_GOOD; 1191 vifp->v_marks |= VIF_MARK_CONDEMNED; 1192 1193 /* Phyint only */ 1194 if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) { 1195 ipif_t *ipif = vifp->v_ipif; 1196 ASSERT(ipif != NULL); 1197 /* 1198 * should be OK to drop the lock as we 1199 * have marked this as CONDEMNED. 1200 */ 1201 mutex_exit(&(vifp)->v_lock); 1202 (void) ip_delmulti(INADDR_ANY, ipif, B_TRUE, B_TRUE); 1203 if (first_mp != NULL) 1204 ipsq_exit(ipsq, B_TRUE, B_TRUE); 1205 mutex_enter(&(vifp)->v_lock); 1206 } 1207 1208 /* 1209 * decreases the refcnt added in add_vif. 1210 */ 1211 VIF_REFRELE_LOCKED(vifp); 1212 return (0); 1213 } 1214 1215 /* 1216 * Add an mfc entry. 1217 */ 1218 static int 1219 add_mfc(struct mfcctl *mfccp, ip_stack_t *ipst) 1220 { 1221 struct mfc *rt; 1222 struct rtdetq *rte; 1223 ushort_t nstl; 1224 int i; 1225 struct mfcb *mfcbp; 1226 1227 /* 1228 * The value of vifi is NO_VIF (==MAXVIFS) if Mrouted 1229 * did not have a real route for pkt. 
1230 * We want this pkt without rt installed in the mfctable to prevent 1231 * multiiple tries, so go ahead and put it in mfctable, it will 1232 * be discarded later in ip_mdq() because the child is NULL. 1233 */ 1234 1235 /* Error checking, out of bounds? */ 1236 if (mfccp->mfcc_parent > MAXVIFS) { 1237 ip0dbg(("ADD_MFC: mfcc_parent out of range %d", 1238 (int)mfccp->mfcc_parent)); 1239 return (EINVAL); 1240 } 1241 1242 if ((mfccp->mfcc_parent != NO_VIF) && 1243 (ipst->ips_vifs[mfccp->mfcc_parent].v_ipif == NULL)) { 1244 ip0dbg(("ADD_MFC: NULL ipif for parent vif %d\n", 1245 (int)mfccp->mfcc_parent)); 1246 return (EINVAL); 1247 } 1248 1249 if (is_mrouter_off(ipst)) { 1250 return (EINVAL); 1251 } 1252 1253 mfcbp = &ipst->ips_mfcs[MFCHASH(mfccp->mfcc_origin.s_addr, 1254 mfccp->mfcc_mcastgrp.s_addr)]; 1255 MFCB_REFHOLD(mfcbp); 1256 MFCFIND(mfcbp, mfccp->mfcc_origin.s_addr, 1257 mfccp->mfcc_mcastgrp.s_addr, rt); 1258 1259 /* If an entry already exists, just update the fields */ 1260 if (rt) { 1261 if (ipst->ips_ip_mrtdebug > 1) { 1262 (void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE, 1263 "add_mfc: update o %x grp %x parent %x", 1264 ntohl(mfccp->mfcc_origin.s_addr), 1265 ntohl(mfccp->mfcc_mcastgrp.s_addr), 1266 mfccp->mfcc_parent); 1267 } 1268 mutex_enter(&rt->mfc_mutex); 1269 rt->mfc_parent = mfccp->mfcc_parent; 1270 1271 mutex_enter(&ipst->ips_numvifs_mutex); 1272 for (i = 0; i < (int)ipst->ips_numvifs; i++) 1273 rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; 1274 mutex_exit(&ipst->ips_numvifs_mutex); 1275 mutex_exit(&rt->mfc_mutex); 1276 1277 MFCB_REFRELE(mfcbp); 1278 return (0); 1279 } 1280 1281 /* 1282 * Find the entry for which the upcall was made and update. 
1283 */ 1284 for (rt = mfcbp->mfcb_mfc, nstl = 0; rt; rt = rt->mfc_next) { 1285 mutex_enter(&rt->mfc_mutex); 1286 if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) && 1287 (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr) && 1288 (rt->mfc_rte != NULL) && 1289 !(rt->mfc_marks & MFCB_MARK_CONDEMNED)) { 1290 if (nstl++ != 0) 1291 cmn_err(CE_WARN, 1292 "add_mfc: %s o %x g %x p %x", 1293 "multiple kernel entries", 1294 ntohl(mfccp->mfcc_origin.s_addr), 1295 ntohl(mfccp->mfcc_mcastgrp.s_addr), 1296 mfccp->mfcc_parent); 1297 1298 if (ipst->ips_ip_mrtdebug > 1) { 1299 (void) mi_strlog(ipst->ips_ip_g_mrouter, 1, 1300 SL_TRACE, 1301 "add_mfc: o %x g %x p %x", 1302 ntohl(mfccp->mfcc_origin.s_addr), 1303 ntohl(mfccp->mfcc_mcastgrp.s_addr), 1304 mfccp->mfcc_parent); 1305 } 1306 fill_route(rt, mfccp, ipst); 1307 1308 /* 1309 * Prevent cleanup of cache entry. 1310 * Timer starts in ip_mforward. 1311 */ 1312 if (rt->mfc_timeout_id != 0) { 1313 timeout_id_t id; 1314 id = rt->mfc_timeout_id; 1315 /* 1316 * setting id to zero will avoid this 1317 * entry from being cleaned up in 1318 * expire_up_calls(). 1319 */ 1320 rt->mfc_timeout_id = 0; 1321 /* 1322 * dropping the lock is fine as we 1323 * have a refhold on the bucket. 1324 * so mfc cannot be freed. 1325 * The timeout can fire but it will see 1326 * that mfc_timeout_id == 0 and not cleanup. 1327 */ 1328 mutex_exit(&rt->mfc_mutex); 1329 (void) untimeout(id); 1330 mutex_enter(&rt->mfc_mutex); 1331 } 1332 1333 /* 1334 * Send all pkts that are queued waiting for the upcall. 1335 * ip_mdq param tun set to 0 - 1336 * the return value of ip_mdq() isn't used here, 1337 * so value we send doesn't matter. 
1338 */ 1339 while (rt->mfc_rte != NULL) { 1340 rte = rt->mfc_rte; 1341 rt->mfc_rte = rte->rte_next; 1342 mutex_exit(&rt->mfc_mutex); 1343 (void) ip_mdq(rte->mp, (ipha_t *) 1344 rte->mp->b_rptr, rte->ill, 0, rt); 1345 freemsg(rte->mp); 1346 mi_free((char *)rte); 1347 mutex_enter(&rt->mfc_mutex); 1348 } 1349 } 1350 mutex_exit(&rt->mfc_mutex); 1351 } 1352 1353 1354 /* 1355 * It is possible that an entry is being inserted without an upcall 1356 */ 1357 if (nstl == 0) { 1358 mutex_enter(&(mfcbp->mfcb_lock)); 1359 if (ipst->ips_ip_mrtdebug > 1) { 1360 (void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE, 1361 "add_mfc: no upcall o %x g %x p %x", 1362 ntohl(mfccp->mfcc_origin.s_addr), 1363 ntohl(mfccp->mfcc_mcastgrp.s_addr), 1364 mfccp->mfcc_parent); 1365 } 1366 if (is_mrouter_off(ipst)) { 1367 mutex_exit(&mfcbp->mfcb_lock); 1368 MFCB_REFRELE(mfcbp); 1369 return (EINVAL); 1370 } 1371 1372 for (rt = mfcbp->mfcb_mfc; rt; rt = rt->mfc_next) { 1373 1374 mutex_enter(&rt->mfc_mutex); 1375 if ((rt->mfc_origin.s_addr == 1376 mfccp->mfcc_origin.s_addr) && 1377 (rt->mfc_mcastgrp.s_addr == 1378 mfccp->mfcc_mcastgrp.s_addr) && 1379 (!(rt->mfc_marks & MFCB_MARK_CONDEMNED))) { 1380 fill_route(rt, mfccp, ipst); 1381 mutex_exit(&rt->mfc_mutex); 1382 break; 1383 } 1384 mutex_exit(&rt->mfc_mutex); 1385 } 1386 1387 /* No upcall, so make a new entry into mfctable */ 1388 if (rt == NULL) { 1389 rt = (struct mfc *)mi_zalloc(sizeof (struct mfc)); 1390 if (rt == NULL) { 1391 ip1dbg(("add_mfc: out of memory\n")); 1392 mutex_exit(&mfcbp->mfcb_lock); 1393 MFCB_REFRELE(mfcbp); 1394 return (ENOBUFS); 1395 } 1396 1397 /* Insert new entry at head of hash chain */ 1398 mutex_enter(&rt->mfc_mutex); 1399 fill_route(rt, mfccp, ipst); 1400 1401 /* Link into table */ 1402 rt->mfc_next = mfcbp->mfcb_mfc; 1403 mfcbp->mfcb_mfc = rt; 1404 mutex_exit(&rt->mfc_mutex); 1405 } 1406 mutex_exit(&mfcbp->mfcb_lock); 1407 } 1408 1409 MFCB_REFRELE(mfcbp); 1410 return (0); 1411 } 1412 1413 /* 1414 * Fills in mfc structure 
from mrouted mfcctl. 1415 */ 1416 static void 1417 fill_route(struct mfc *rt, struct mfcctl *mfccp, ip_stack_t *ipst) 1418 { 1419 int i; 1420 1421 rt->mfc_origin = mfccp->mfcc_origin; 1422 rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp; 1423 rt->mfc_parent = mfccp->mfcc_parent; 1424 mutex_enter(&ipst->ips_numvifs_mutex); 1425 for (i = 0; i < (int)ipst->ips_numvifs; i++) { 1426 rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; 1427 } 1428 mutex_exit(&ipst->ips_numvifs_mutex); 1429 /* Initialize pkt counters per src-grp */ 1430 rt->mfc_pkt_cnt = 0; 1431 rt->mfc_byte_cnt = 0; 1432 rt->mfc_wrong_if = 0; 1433 rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_nsec = 0; 1434 1435 } 1436 1437 static void 1438 free_queue(struct mfc *mfcp) 1439 { 1440 struct rtdetq *rte0; 1441 1442 /* 1443 * Drop all queued upcall packets. 1444 * Free the mbuf with the pkt. 1445 */ 1446 while ((rte0 = mfcp->mfc_rte) != NULL) { 1447 mfcp->mfc_rte = rte0->rte_next; 1448 freemsg(rte0->mp); 1449 mi_free((char *)rte0); 1450 } 1451 } 1452 /* 1453 * go thorugh the hash bucket and free all the entries marked condemned. 
1454 */ 1455 void 1456 release_mfc(struct mfcb *mfcbp) 1457 { 1458 struct mfc *current_mfcp; 1459 struct mfc *prev_mfcp; 1460 1461 prev_mfcp = current_mfcp = mfcbp->mfcb_mfc; 1462 1463 while (current_mfcp != NULL) { 1464 if (current_mfcp->mfc_marks & MFCB_MARK_CONDEMNED) { 1465 if (current_mfcp == mfcbp->mfcb_mfc) { 1466 mfcbp->mfcb_mfc = current_mfcp->mfc_next; 1467 free_queue(current_mfcp); 1468 mi_free(current_mfcp); 1469 prev_mfcp = current_mfcp = mfcbp->mfcb_mfc; 1470 continue; 1471 } 1472 ASSERT(prev_mfcp != NULL); 1473 prev_mfcp->mfc_next = current_mfcp->mfc_next; 1474 free_queue(current_mfcp); 1475 mi_free(current_mfcp); 1476 current_mfcp = NULL; 1477 } else { 1478 prev_mfcp = current_mfcp; 1479 } 1480 1481 current_mfcp = prev_mfcp->mfc_next; 1482 1483 } 1484 mfcbp->mfcb_marks &= ~MFCB_MARK_CONDEMNED; 1485 ASSERT(mfcbp->mfcb_mfc != NULL || mfcbp->mfcb_marks == 0); 1486 } 1487 1488 /* 1489 * Delete an mfc entry. 1490 */ 1491 static int 1492 del_mfc(struct mfcctl *mfccp, ip_stack_t *ipst) 1493 { 1494 struct in_addr origin; 1495 struct in_addr mcastgrp; 1496 struct mfc *rt; 1497 uint_t hash; 1498 1499 origin = mfccp->mfcc_origin; 1500 mcastgrp = mfccp->mfcc_mcastgrp; 1501 hash = MFCHASH(origin.s_addr, mcastgrp.s_addr); 1502 1503 if (ipst->ips_ip_mrtdebug > 1) { 1504 (void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE, 1505 "del_mfc: o %x g %x", 1506 ntohl(origin.s_addr), 1507 ntohl(mcastgrp.s_addr)); 1508 } 1509 1510 MFCB_REFHOLD(&ipst->ips_mfcs[hash]); 1511 1512 /* Find mfc in mfctable, finds only entries without upcalls */ 1513 for (rt = ipst->ips_mfcs[hash].mfcb_mfc; rt; rt = rt->mfc_next) { 1514 mutex_enter(&rt->mfc_mutex); 1515 if (origin.s_addr == rt->mfc_origin.s_addr && 1516 mcastgrp.s_addr == rt->mfc_mcastgrp.s_addr && 1517 rt->mfc_rte == NULL && 1518 !(rt->mfc_marks & MFCB_MARK_CONDEMNED)) 1519 break; 1520 mutex_exit(&rt->mfc_mutex); 1521 } 1522 1523 /* 1524 * Return if there was an upcall (mfc_rte != NULL, 1525 * or rt not in mfctable. 
1526 */ 1527 if (rt == NULL) { 1528 MFCB_REFRELE(&ipst->ips_mfcs[hash]); 1529 return (EADDRNOTAVAIL); 1530 } 1531 1532 1533 /* 1534 * no need to hold lock as we have a reference. 1535 */ 1536 ipst->ips_mfcs[hash].mfcb_marks |= MFCB_MARK_CONDEMNED; 1537 /* error checking */ 1538 if (rt->mfc_timeout_id != 0) { 1539 ip0dbg(("del_mfc: TIMEOUT NOT 0, rte not null")); 1540 /* 1541 * Its ok to drop the lock, the struct cannot be freed 1542 * since we have a ref on the hash bucket. 1543 */ 1544 rt->mfc_timeout_id = 0; 1545 mutex_exit(&rt->mfc_mutex); 1546 (void) untimeout(rt->mfc_timeout_id); 1547 mutex_enter(&rt->mfc_mutex); 1548 } 1549 1550 ASSERT(rt->mfc_rte == NULL); 1551 1552 1553 /* 1554 * Delete the entry from the cache 1555 */ 1556 rt->mfc_marks |= MFCB_MARK_CONDEMNED; 1557 mutex_exit(&rt->mfc_mutex); 1558 1559 MFCB_REFRELE(&ipst->ips_mfcs[hash]); 1560 1561 return (0); 1562 } 1563 1564 #define TUNNEL_LEN 12 /* # bytes of IP option for tunnel encapsulation */ 1565 1566 /* 1567 * IP multicast forwarding function. This function assumes that the packet 1568 * pointed to by ipha has arrived on (or is about to be sent to) the interface 1569 * pointed to by "ill", and the packet is to be relayed to other networks 1570 * that have members of the packet's destination IP multicast group. 1571 * 1572 * The packet is returned unscathed to the caller, unless it is 1573 * erroneous, in which case a -1 value tells the caller (IP) 1574 * to discard it. 1575 * 1576 * Unlike BSD, SunOS 5.x needs to return to IP info about 1577 * whether pkt came in thru a tunnel, so it can be discarded, unless 1578 * it's IGMP. In BSD, the ifp is bogus for tunnels, so pkt won't try 1579 * to be delivered. 
1580 * Return values are 0 - pkt is okay and phyint 1581 * -1 - pkt is malformed and to be tossed 1582 * 1 - pkt came in on tunnel 1583 */ 1584 int 1585 ip_mforward(ill_t *ill, ipha_t *ipha, mblk_t *mp) 1586 { 1587 struct mfc *rt; 1588 ipaddr_t src, dst, tunnel_src = 0; 1589 static int srctun = 0; 1590 vifi_t vifi; 1591 boolean_t pim_reg_packet = B_FALSE; 1592 struct mfcb *mfcbp; 1593 ip_stack_t *ipst = ill->ill_ipst; 1594 1595 if (ipst->ips_ip_mrtdebug > 1) { 1596 (void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE, 1597 "ip_mforward: RECV ipha_src %x, ipha_dst %x, ill %s", 1598 ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst), 1599 ill->ill_name); 1600 } 1601 1602 dst = ipha->ipha_dst; 1603 if ((uint32_t)(uintptr_t)mp->b_prev == PIM_REGISTER_MARKER) 1604 pim_reg_packet = B_TRUE; 1605 else 1606 tunnel_src = (ipaddr_t)(uintptr_t)mp->b_prev; 1607 1608 /* 1609 * Don't forward a packet with time-to-live of zero or one, 1610 * or a packet destined to a local-only group. 1611 */ 1612 if (CLASSD(dst) && (ipha->ipha_ttl <= 1 || 1613 (ipaddr_t)ntohl(dst) <= INADDR_MAX_LOCAL_GROUP)) { 1614 if (ipst->ips_ip_mrtdebug > 1) { 1615 (void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE, 1616 "ip_mforward: not forwarded ttl %d," 1617 " dst 0x%x ill %s", 1618 ipha->ipha_ttl, ntohl(dst), ill->ill_name); 1619 } 1620 mp->b_prev = NULL; 1621 if (tunnel_src != 0) 1622 return (1); 1623 else 1624 return (0); 1625 } 1626 1627 if ((tunnel_src != 0) || pim_reg_packet) { 1628 /* 1629 * Packet arrived over an encapsulated tunnel or via a PIM 1630 * register message. Both ip_mroute_decap() and pim_input() 1631 * encode information in mp->b_prev. 
1632 */ 1633 mp->b_prev = NULL; 1634 if (ipst->ips_ip_mrtdebug > 1) { 1635 if (tunnel_src != 0) { 1636 (void) mi_strlog(ipst->ips_ip_g_mrouter, 1, 1637 SL_TRACE, 1638 "ip_mforward: ill %s arrived via ENCAP TUN", 1639 ill->ill_name); 1640 } else if (pim_reg_packet) { 1641 (void) mi_strlog(ipst->ips_ip_g_mrouter, 1, 1642 SL_TRACE, 1643 "ip_mforward: ill %s arrived via" 1644 " REGISTER VIF", 1645 ill->ill_name); 1646 } 1647 } 1648 } else if ((ipha->ipha_version_and_hdr_length & 0xf) < 1649 (uint_t)(IP_SIMPLE_HDR_LENGTH + TUNNEL_LEN) >> 2 || 1650 ((uchar_t *)(ipha + 1))[1] != IPOPT_LSRR) { 1651 /* Packet arrived via a physical interface. */ 1652 if (ipst->ips_ip_mrtdebug > 1) { 1653 (void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE, 1654 "ip_mforward: ill %s arrived via PHYINT", 1655 ill->ill_name); 1656 } 1657 1658 } else { 1659 /* 1660 * Packet arrived through a SRCRT tunnel. 1661 * Source-route tunnels are no longer supported. 1662 * Error message printed every 1000 times. 1663 */ 1664 if ((srctun++ % 1000) == 0) { 1665 cmn_err(CE_WARN, 1666 "ip_mforward: received source-routed pkt from %x", 1667 ntohl(ipha->ipha_src)); 1668 } 1669 return (-1); 1670 } 1671 1672 ipst->ips_mrtstat->mrts_fwd_in++; 1673 src = ipha->ipha_src; 1674 1675 /* Find route in cache, return NULL if not there or upcalls q'ed. */ 1676 1677 /* 1678 * Lock the mfctable against changes made by ip_mforward. 1679 * Note that only add_mfc and del_mfc can remove entries and 1680 * they run with exclusive access to IP. So we do not need to 1681 * guard against the rt being deleted, so release lock after reading. 
1682 */ 1683 1684 if (is_mrouter_off(ipst)) 1685 return (-1); 1686 1687 mfcbp = &ipst->ips_mfcs[MFCHASH(src, dst)]; 1688 MFCB_REFHOLD(mfcbp); 1689 MFCFIND(mfcbp, src, dst, rt); 1690 1691 /* Entry exists, so forward if necessary */ 1692 if (rt != NULL) { 1693 int ret = 0; 1694 ipst->ips_mrtstat->mrts_mfc_hits++; 1695 if (pim_reg_packet) { 1696 ASSERT(ipst->ips_reg_vif_num != ALL_VIFS); 1697 ret = ip_mdq(mp, ipha, 1698 ipst->ips_vifs[ipst->ips_reg_vif_num]. 1699 v_ipif->ipif_ill, 1700 0, rt); 1701 } else { 1702 ret = ip_mdq(mp, ipha, ill, tunnel_src, rt); 1703 } 1704 1705 MFCB_REFRELE(mfcbp); 1706 return (ret); 1707 1708 /* 1709 * Don't forward if we don't have a cache entry. Mrouted will 1710 * always provide a cache entry in response to an upcall. 1711 */ 1712 } else { 1713 /* 1714 * If we don't have a route for packet's origin, make a copy 1715 * of the packet and send message to routing daemon. 1716 */ 1717 struct mfc *mfc_rt = NULL; 1718 mblk_t *mp0 = NULL; 1719 mblk_t *mp_copy = NULL; 1720 struct rtdetq *rte = NULL; 1721 struct rtdetq *rte_m, *rte1, *prev_rte; 1722 uint_t hash; 1723 int npkts; 1724 boolean_t new_mfc = B_FALSE; 1725 ipst->ips_mrtstat->mrts_mfc_misses++; 1726 /* BSD uses mrts_no_route++ */ 1727 if (ipst->ips_ip_mrtdebug > 1) { 1728 (void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE, 1729 "ip_mforward: no rte ill %s src %x g %x misses %d", 1730 ill->ill_name, ntohl(src), ntohl(dst), 1731 (int)ipst->ips_mrtstat->mrts_mfc_misses); 1732 } 1733 /* 1734 * The order of the following code differs from the BSD code. 1735 * Pre-mc3.5, the BSD code was incorrect and SunOS 5.x 1736 * code works, so SunOS 5.x wasn't changed to conform to the 1737 * BSD version. 1738 */ 1739 1740 /* Lock mfctable. 
*/ 1741 hash = MFCHASH(src, dst); 1742 mutex_enter(&(ipst->ips_mfcs[hash].mfcb_lock)); 1743 1744 /* 1745 * If we are turning off mrouted return an error 1746 */ 1747 if (is_mrouter_off(ipst)) { 1748 mutex_exit(&mfcbp->mfcb_lock); 1749 MFCB_REFRELE(mfcbp); 1750 return (-1); 1751 } 1752 1753 /* Is there an upcall waiting for this packet? */ 1754 for (mfc_rt = ipst->ips_mfcs[hash].mfcb_mfc; mfc_rt; 1755 mfc_rt = mfc_rt->mfc_next) { 1756 mutex_enter(&mfc_rt->mfc_mutex); 1757 if (ipst->ips_ip_mrtdebug > 1) { 1758 (void) mi_strlog(ipst->ips_ip_g_mrouter, 1, 1759 SL_TRACE, 1760 "ip_mforward: MFCTAB hash %d o 0x%x" 1761 " g 0x%x\n", 1762 hash, ntohl(mfc_rt->mfc_origin.s_addr), 1763 ntohl(mfc_rt->mfc_mcastgrp.s_addr)); 1764 } 1765 /* There is an upcall */ 1766 if ((src == mfc_rt->mfc_origin.s_addr) && 1767 (dst == mfc_rt->mfc_mcastgrp.s_addr) && 1768 (mfc_rt->mfc_rte != NULL) && 1769 !(mfc_rt->mfc_marks & MFCB_MARK_CONDEMNED)) { 1770 break; 1771 } 1772 mutex_exit(&mfc_rt->mfc_mutex); 1773 } 1774 /* No upcall, so make a new entry into mfctable */ 1775 if (mfc_rt == NULL) { 1776 mfc_rt = (struct mfc *)mi_zalloc(sizeof (struct mfc)); 1777 if (mfc_rt == NULL) { 1778 ipst->ips_mrtstat->mrts_fwd_drop++; 1779 ip1dbg(("ip_mforward: out of memory " 1780 "for mfc, mfc_rt\n")); 1781 goto error_return; 1782 } else 1783 new_mfc = B_TRUE; 1784 /* Get resources */ 1785 /* TODO could copy header and dup rest */ 1786 mp_copy = copymsg(mp); 1787 if (mp_copy == NULL) { 1788 ipst->ips_mrtstat->mrts_fwd_drop++; 1789 ip1dbg(("ip_mforward: out of memory for " 1790 "mblk, mp_copy\n")); 1791 goto error_return; 1792 } 1793 mutex_enter(&mfc_rt->mfc_mutex); 1794 } 1795 /* Get resources for rte, whether first rte or not first. 
*/ 1796 /* Add this packet into rtdetq */ 1797 rte = (struct rtdetq *)mi_zalloc(sizeof (struct rtdetq)); 1798 if (rte == NULL) { 1799 ipst->ips_mrtstat->mrts_fwd_drop++; 1800 mutex_exit(&mfc_rt->mfc_mutex); 1801 ip1dbg(("ip_mforward: out of memory for" 1802 " rtdetq, rte\n")); 1803 goto error_return; 1804 } 1805 1806 mp0 = copymsg(mp); 1807 if (mp0 == NULL) { 1808 ipst->ips_mrtstat->mrts_fwd_drop++; 1809 ip1dbg(("ip_mforward: out of memory for mblk, mp0\n")); 1810 mutex_exit(&mfc_rt->mfc_mutex); 1811 goto error_return; 1812 } 1813 rte->mp = mp0; 1814 if (pim_reg_packet) { 1815 ASSERT(ipst->ips_reg_vif_num != ALL_VIFS); 1816 rte->ill = 1817 ipst->ips_vifs[ipst->ips_reg_vif_num]. 1818 v_ipif->ipif_ill; 1819 } else { 1820 rte->ill = ill; 1821 } 1822 rte->rte_next = NULL; 1823 1824 /* 1825 * Determine if upcall q (rtdetq) has overflowed. 1826 * mfc_rt->mfc_rte is null by mi_zalloc 1827 * if it is the first message. 1828 */ 1829 for (rte_m = mfc_rt->mfc_rte, npkts = 0; rte_m; 1830 rte_m = rte_m->rte_next) 1831 npkts++; 1832 if (ipst->ips_ip_mrtdebug > 1) { 1833 (void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE, 1834 "ip_mforward: upcalls %d\n", npkts); 1835 } 1836 if (npkts > MAX_UPQ) { 1837 ipst->ips_mrtstat->mrts_upq_ovflw++; 1838 mutex_exit(&mfc_rt->mfc_mutex); 1839 goto error_return; 1840 } 1841 1842 if (npkts == 0) { /* first upcall */ 1843 int i = 0; 1844 /* 1845 * Now finish installing the new mfc! Now that we have 1846 * resources! Insert new entry at head of hash chain. 1847 * Use src and dst which are ipaddr_t's. 
1848 */ 1849 mfc_rt->mfc_origin.s_addr = src; 1850 mfc_rt->mfc_mcastgrp.s_addr = dst; 1851 1852 mutex_enter(&ipst->ips_numvifs_mutex); 1853 for (i = 0; i < (int)ipst->ips_numvifs; i++) 1854 mfc_rt->mfc_ttls[i] = 0; 1855 mutex_exit(&ipst->ips_numvifs_mutex); 1856 mfc_rt->mfc_parent = ALL_VIFS; 1857 1858 /* Link into table */ 1859 if (ipst->ips_ip_mrtdebug > 1) { 1860 (void) mi_strlog(ipst->ips_ip_g_mrouter, 1, 1861 SL_TRACE, 1862 "ip_mforward: NEW MFCTAB hash %d o 0x%x " 1863 "g 0x%x\n", hash, 1864 ntohl(mfc_rt->mfc_origin.s_addr), 1865 ntohl(mfc_rt->mfc_mcastgrp.s_addr)); 1866 } 1867 mfc_rt->mfc_next = ipst->ips_mfcs[hash].mfcb_mfc; 1868 ipst->ips_mfcs[hash].mfcb_mfc = mfc_rt; 1869 mfc_rt->mfc_rte = NULL; 1870 } 1871 1872 /* Link in the upcall */ 1873 /* First upcall */ 1874 if (mfc_rt->mfc_rte == NULL) 1875 mfc_rt->mfc_rte = rte; 1876 else { 1877 /* not the first upcall */ 1878 prev_rte = mfc_rt->mfc_rte; 1879 for (rte1 = mfc_rt->mfc_rte->rte_next; rte1; 1880 prev_rte = rte1, rte1 = rte1->rte_next); 1881 prev_rte->rte_next = rte; 1882 } 1883 1884 /* 1885 * No upcalls waiting, this is first one, so send a message to 1886 * routing daemon to install a route into kernel table. 1887 */ 1888 if (npkts == 0) { 1889 struct igmpmsg *im; 1890 /* ipha_protocol is 0, for upcall */ 1891 ASSERT(mp_copy != NULL); 1892 im = (struct igmpmsg *)mp_copy->b_rptr; 1893 im->im_msgtype = IGMPMSG_NOCACHE; 1894 im->im_mbz = 0; 1895 mutex_enter(&ipst->ips_numvifs_mutex); 1896 if (pim_reg_packet) { 1897 im->im_vif = (uchar_t)ipst->ips_reg_vif_num; 1898 mutex_exit(&ipst->ips_numvifs_mutex); 1899 } else { 1900 /* 1901 * XXX do we need to hold locks here ? 1902 */ 1903 for (vifi = 0; 1904 vifi < ipst->ips_numvifs; 1905 vifi++) { 1906 if (ipst->ips_vifs[vifi].v_ipif == NULL) 1907 continue; 1908 if (ipst->ips_vifs[vifi]. 
1909 v_ipif->ipif_ill == ill) { 1910 im->im_vif = (uchar_t)vifi; 1911 break; 1912 } 1913 } 1914 mutex_exit(&ipst->ips_numvifs_mutex); 1915 ASSERT(vifi < ipst->ips_numvifs); 1916 } 1917 1918 ipst->ips_mrtstat->mrts_upcalls++; 1919 /* Timer to discard upcalls if mrouted is too slow */ 1920 mfc_rt->mfc_timeout_id = timeout(expire_upcalls, 1921 mfc_rt, EXPIRE_TIMEOUT * UPCALL_EXPIRE); 1922 mutex_exit(&mfc_rt->mfc_mutex); 1923 mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock)); 1924 putnext(RD(ipst->ips_ip_g_mrouter), mp_copy); 1925 1926 } else { 1927 mutex_exit(&mfc_rt->mfc_mutex); 1928 mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock)); 1929 freemsg(mp_copy); 1930 } 1931 1932 MFCB_REFRELE(mfcbp); 1933 if (tunnel_src != 0) 1934 return (1); 1935 else 1936 return (0); 1937 error_return: 1938 mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock)); 1939 MFCB_REFRELE(mfcbp); 1940 if (mfc_rt != NULL && (new_mfc == B_TRUE)) 1941 mi_free((char *)mfc_rt); 1942 if (rte != NULL) 1943 mi_free((char *)rte); 1944 if (mp_copy != NULL) 1945 freemsg(mp_copy); 1946 if (mp0 != NULL) 1947 freemsg(mp0); 1948 return (-1); 1949 } 1950 } 1951 1952 /* 1953 * Clean up the mfctable cache entry if upcall is not serviced. 1954 * SunOS 5.x has timeout per mfc, unlike BSD which has one timer. 
1955 */ 1956 static void 1957 expire_upcalls(void *arg) 1958 { 1959 struct mfc *mfc_rt = arg; 1960 uint_t hash; 1961 struct mfc *prev_mfc, *mfc0; 1962 ip_stack_t *ipst; 1963 1964 if (mfc_rt->mfc_rte == NULL || mfc_rt->mfc_rte->ill != NULL) { 1965 cmn_err(CE_WARN, "expire_upcalls: no ILL\n"); 1966 return; 1967 } 1968 ipst = mfc_rt->mfc_rte->ill->ill_ipst; 1969 1970 hash = MFCHASH(mfc_rt->mfc_origin.s_addr, mfc_rt->mfc_mcastgrp.s_addr); 1971 if (ipst->ips_ip_mrtdebug > 1) { 1972 (void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE, 1973 "expire_upcalls: hash %d s %x g %x", 1974 hash, ntohl(mfc_rt->mfc_origin.s_addr), 1975 ntohl(mfc_rt->mfc_mcastgrp.s_addr)); 1976 } 1977 MFCB_REFHOLD(&ipst->ips_mfcs[hash]); 1978 mutex_enter(&mfc_rt->mfc_mutex); 1979 /* 1980 * if timeout has been set to zero, than the 1981 * entry has been filled, no need to delete it. 1982 */ 1983 if (mfc_rt->mfc_timeout_id == 0) 1984 goto done; 1985 ipst->ips_mrtstat->mrts_cache_cleanups++; 1986 mfc_rt->mfc_timeout_id = 0; 1987 1988 /* Determine entry to be cleaned up in cache table. */ 1989 for (prev_mfc = mfc0 = ipst->ips_mfcs[hash].mfcb_mfc; mfc0; 1990 prev_mfc = mfc0, mfc0 = mfc0->mfc_next) 1991 if (mfc0 == mfc_rt) 1992 break; 1993 1994 /* del_mfc takes care of gone mfcs */ 1995 ASSERT(prev_mfc != NULL); 1996 ASSERT(mfc0 != NULL); 1997 1998 /* 1999 * Delete the entry from the cache 2000 */ 2001 ipst->ips_mfcs[hash].mfcb_marks |= MFCB_MARK_CONDEMNED; 2002 mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED; 2003 2004 /* 2005 * release_mfc will drop all queued upcall packets. 2006 * and will free the mbuf with the pkt, if, timing info. 2007 */ 2008 done: 2009 mutex_exit(&mfc_rt->mfc_mutex); 2010 MFCB_REFRELE(&ipst->ips_mfcs[hash]); 2011 } 2012 2013 /* 2014 * Packet forwarding routine once entry in the cache is made. 
2015 */ 2016 static int 2017 ip_mdq(mblk_t *mp, ipha_t *ipha, ill_t *ill, ipaddr_t tunnel_src, 2018 struct mfc *rt) 2019 { 2020 vifi_t vifi; 2021 struct vif *vifp; 2022 ipaddr_t dst = ipha->ipha_dst; 2023 size_t plen = msgdsize(mp); 2024 vifi_t num_of_vifs; 2025 ip_stack_t *ipst = ill->ill_ipst; 2026 2027 if (ipst->ips_ip_mrtdebug > 1) { 2028 (void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE, 2029 "ip_mdq: SEND src %x, ipha_dst %x, ill %s", 2030 ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst), 2031 ill->ill_name); 2032 } 2033 2034 /* Macro to send packet on vif */ 2035 #define MC_SEND(ipha, mp, vifp, dst) { \ 2036 if ((vifp)->v_flags & VIFF_TUNNEL) \ 2037 encap_send((ipha), (mp), (vifp), (dst)); \ 2038 else if ((vifp)->v_flags & VIFF_REGISTER) \ 2039 register_send((ipha), (mp), (vifp), (dst)); \ 2040 else \ 2041 phyint_send((ipha), (mp), (vifp), (dst)); \ 2042 } 2043 2044 vifi = rt->mfc_parent; 2045 2046 /* 2047 * The value of vifi is MAXVIFS if the pkt had no parent, i.e., 2048 * Mrouted had no route. 2049 * We wanted the route installed in the mfctable to prevent multiple 2050 * tries, so it passed add_mfc(), but is discarded here. The v_ipif is 2051 * NULL so we don't want to check the ill. Still needed as of Mrouted 2052 * 3.6. 2053 */ 2054 if (vifi == NO_VIF) { 2055 ip1dbg(("ip_mdq: no route for origin ill %s, vifi is NO_VIF\n", 2056 ill->ill_name)); 2057 if (ipst->ips_ip_mrtdebug > 1) { 2058 (void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE, 2059 "ip_mdq: vifi is NO_VIF ill = %s", ill->ill_name); 2060 } 2061 return (-1); /* drop pkt */ 2062 } 2063 2064 if (!lock_good_vif(&ipst->ips_vifs[vifi])) 2065 return (-1); 2066 /* 2067 * The MFC entries are not cleaned up when an ipif goes 2068 * away thus this code has to guard against an MFC referencing 2069 * an ipif that has been closed. Note: reset_mrt_vif_ipif 2070 * sets the v_ipif to NULL when the ipif disappears. 
2071 */ 2072 ASSERT(ipst->ips_vifs[vifi].v_ipif != NULL); 2073 2074 if (vifi >= ipst->ips_numvifs) { 2075 cmn_err(CE_WARN, "ip_mdq: illegal vifi %d numvifs " 2076 "%d ill %s viftable ill %s\n", 2077 (int)vifi, (int)ipst->ips_numvifs, ill->ill_name, 2078 ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name); 2079 unlock_good_vif(&ipst->ips_vifs[vifi]); 2080 return (-1); 2081 } 2082 /* 2083 * Don't forward if it didn't arrive from the parent vif for its 2084 * origin. But do match on the groups as we nominate only one 2085 * ill in the group for receiving allmulti packets. 2086 */ 2087 if ((ipst->ips_vifs[vifi].v_ipif->ipif_ill != ill && 2088 (ill->ill_group == NULL || 2089 ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_group != 2090 ill->ill_group)) || 2091 (ipst->ips_vifs[vifi].v_rmt_addr.s_addr != tunnel_src)) { 2092 /* Came in the wrong interface */ 2093 ip1dbg(("ip_mdq: arrived wrong if, vifi %d " 2094 "numvifs %d ill %s viftable ill %s\n", 2095 (int)vifi, (int)ipst->ips_numvifs, ill->ill_name, 2096 ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name)); 2097 if (ipst->ips_ip_mrtdebug > 1) { 2098 (void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE, 2099 "ip_mdq: arrived wrong if, vifi %d ill " 2100 "%s viftable ill %s\n", 2101 (int)vifi, ill->ill_name, 2102 ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name); 2103 } 2104 ipst->ips_mrtstat->mrts_wrong_if++; 2105 rt->mfc_wrong_if++; 2106 2107 /* 2108 * If we are doing PIM assert processing and we are forwarding 2109 * packets on this interface, and it is a broadcast medium 2110 * interface (and not a tunnel), send a message to the routing. 2111 * 2112 * We use the first ipif on the list, since it's all we have. 2113 * Chances are the ipif_flags are the same for ipifs on the ill. 
2114 */ 2115 if (ipst->ips_pim_assert && rt->mfc_ttls[vifi] > 0 && 2116 (ill->ill_ipif->ipif_flags & IPIF_BROADCAST) && 2117 !(ipst->ips_vifs[vifi].v_flags & VIFF_TUNNEL)) { 2118 mblk_t *mp_copy; 2119 struct igmpmsg *im; 2120 2121 /* TODO could copy header and dup rest */ 2122 mp_copy = copymsg(mp); 2123 if (mp_copy == NULL) { 2124 ipst->ips_mrtstat->mrts_fwd_drop++; 2125 ip1dbg(("ip_mdq: out of memory " 2126 "for mblk, mp_copy\n")); 2127 unlock_good_vif(&ipst->ips_vifs[vifi]); 2128 return (-1); 2129 } 2130 2131 im = (struct igmpmsg *)mp_copy->b_rptr; 2132 im->im_msgtype = IGMPMSG_WRONGVIF; 2133 im->im_mbz = 0; 2134 im->im_vif = (ushort_t)vifi; 2135 putnext(RD(ipst->ips_ip_g_mrouter), mp_copy); 2136 } 2137 unlock_good_vif(&ipst->ips_vifs[vifi]); 2138 if (tunnel_src != 0) 2139 return (1); 2140 else 2141 return (0); 2142 } 2143 /* 2144 * If I sourced this packet, it counts as output, else it was input. 2145 */ 2146 if (ipha->ipha_src == ipst->ips_vifs[vifi].v_lcl_addr.s_addr) { 2147 ipst->ips_vifs[vifi].v_pkt_out++; 2148 ipst->ips_vifs[vifi].v_bytes_out += plen; 2149 } else { 2150 ipst->ips_vifs[vifi].v_pkt_in++; 2151 ipst->ips_vifs[vifi].v_bytes_in += plen; 2152 } 2153 mutex_enter(&rt->mfc_mutex); 2154 rt->mfc_pkt_cnt++; 2155 rt->mfc_byte_cnt += plen; 2156 mutex_exit(&rt->mfc_mutex); 2157 unlock_good_vif(&ipst->ips_vifs[vifi]); 2158 /* 2159 * For each vif, decide if a copy of the packet should be forwarded. 2160 * Forward if: 2161 * - the vif threshold ttl is non-zero AND 2162 * - the pkt ttl exceeds the vif's threshold 2163 * A non-zero mfc_ttl indicates that the vif is part of 2164 * the output set for the mfc entry. 
2165 */ 2166 mutex_enter(&ipst->ips_numvifs_mutex); 2167 num_of_vifs = ipst->ips_numvifs; 2168 mutex_exit(&ipst->ips_numvifs_mutex); 2169 for (vifp = ipst->ips_vifs, vifi = 0; 2170 vifi < num_of_vifs; 2171 vifp++, vifi++) { 2172 if (!lock_good_vif(vifp)) 2173 continue; 2174 if ((rt->mfc_ttls[vifi] > 0) && 2175 (ipha->ipha_ttl > rt->mfc_ttls[vifi])) { 2176 /* 2177 * lock_good_vif should not have succedded if 2178 * v_ipif is null. 2179 */ 2180 ASSERT(vifp->v_ipif != NULL); 2181 vifp->v_pkt_out++; 2182 vifp->v_bytes_out += plen; 2183 MC_SEND(ipha, mp, vifp, dst); 2184 ipst->ips_mrtstat->mrts_fwd_out++; 2185 } 2186 unlock_good_vif(vifp); 2187 } 2188 if (tunnel_src != 0) 2189 return (1); 2190 else 2191 return (0); 2192 } 2193 2194 /* 2195 * Send the packet on physical interface. 2196 * Caller assumes can continue to use mp on return. 2197 */ 2198 /* ARGSUSED */ 2199 static void 2200 phyint_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst) 2201 { 2202 mblk_t *mp_copy; 2203 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 2204 2205 /* Make a new reference to the packet */ 2206 mp_copy = copymsg(mp); /* TODO could copy header and dup rest */ 2207 if (mp_copy == NULL) { 2208 ipst->ips_mrtstat->mrts_fwd_drop++; 2209 ip1dbg(("phyint_send: out of memory for mblk, mp_copy\n")); 2210 return; 2211 } 2212 if (vifp->v_rate_limit <= 0) 2213 tbf_send_packet(vifp, mp_copy); 2214 else { 2215 if (ipst->ips_ip_mrtdebug > 1) { 2216 (void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE, 2217 "phyint_send: tbf_contr rate %d " 2218 "vifp 0x%p mp 0x%p dst 0x%x", 2219 vifp->v_rate_limit, (void *)vifp, (void *)mp, dst); 2220 } 2221 tbf_control(vifp, mp_copy, (ipha_t *)mp_copy->b_rptr); 2222 } 2223 } 2224 2225 /* 2226 * Send the whole packet for REGISTER encapsulation to PIM daemon 2227 * Caller assumes it can continue to use mp on return. 
2228 */ 2229 /* ARGSUSED */ 2230 static void 2231 register_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst) 2232 { 2233 struct igmpmsg *im; 2234 mblk_t *mp_copy; 2235 ipha_t *ipha_copy; 2236 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 2237 2238 if (ipst->ips_ip_mrtdebug > 1) { 2239 (void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE, 2240 "register_send: src %x, dst %x\n", 2241 ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst)); 2242 } 2243 2244 /* 2245 * Copy the old packet & pullup its IP header into the new mblk_t so we 2246 * can modify it. Try to fill the new mblk_t since if we don't the 2247 * ethernet driver will. 2248 */ 2249 mp_copy = allocb(sizeof (struct igmpmsg) + sizeof (ipha_t), BPRI_MED); 2250 if (mp_copy == NULL) { 2251 ++ipst->ips_mrtstat->mrts_pim_nomemory; 2252 if (ipst->ips_ip_mrtdebug > 3) { 2253 (void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE, 2254 "register_send: allocb failure."); 2255 } 2256 return; 2257 } 2258 2259 /* 2260 * Bump write pointer to account for igmpmsg being added. 2261 */ 2262 mp_copy->b_wptr = mp_copy->b_rptr + sizeof (struct igmpmsg); 2263 2264 /* 2265 * Chain packet to new mblk_t. 2266 */ 2267 if ((mp_copy->b_cont = copymsg(mp)) == NULL) { 2268 ++ipst->ips_mrtstat->mrts_pim_nomemory; 2269 if (ipst->ips_ip_mrtdebug > 3) { 2270 (void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE, 2271 "register_send: copymsg failure."); 2272 } 2273 freeb(mp_copy); 2274 return; 2275 } 2276 2277 /* 2278 * icmp_rput() asserts that IP version field is set to an 2279 * appropriate version. Hence, the struct igmpmsg that this really 2280 * becomes, needs to have the correct IP version field. 2281 */ 2282 ipha_copy = (ipha_t *)mp_copy->b_rptr; 2283 *ipha_copy = multicast_encap_iphdr; 2284 2285 /* 2286 * The kernel uses the struct igmpmsg header to encode the messages to 2287 * the multicast routing daemon. 
Fill in the fields in the header 2288 * starting with the message type which is IGMPMSG_WHOLEPKT 2289 */ 2290 im = (struct igmpmsg *)mp_copy->b_rptr; 2291 im->im_msgtype = IGMPMSG_WHOLEPKT; 2292 im->im_src.s_addr = ipha->ipha_src; 2293 im->im_dst.s_addr = ipha->ipha_dst; 2294 2295 /* 2296 * Must Be Zero. This is because the struct igmpmsg is really an IP 2297 * header with renamed fields and the multicast routing daemon uses 2298 * an ipha_protocol (aka im_mbz) of 0 to distinguish these messages. 2299 */ 2300 im->im_mbz = 0; 2301 2302 ++ipst->ips_mrtstat->mrts_upcalls; 2303 if (!canputnext(RD(ipst->ips_ip_g_mrouter))) { 2304 ++ipst->ips_mrtstat->mrts_pim_regsend_drops; 2305 if (ipst->ips_ip_mrtdebug > 3) { 2306 (void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE, 2307 "register_send: register upcall failure."); 2308 } 2309 freemsg(mp_copy); 2310 } else { 2311 putnext(RD(ipst->ips_ip_g_mrouter), mp_copy); 2312 } 2313 } 2314 2315 /* 2316 * pim_validate_cksum handles verification of the checksum in the 2317 * pim header. For PIM Register packets, the checksum is calculated 2318 * across the PIM header only. For all other packets, the checksum 2319 * is for the PIM header and remainder of the packet. 2320 * 2321 * returns: B_TRUE, if checksum is okay. 2322 * B_FALSE, if checksum is not valid. 2323 */ 2324 static boolean_t 2325 pim_validate_cksum(mblk_t *mp, ipha_t *ip, struct pim *pimp) 2326 { 2327 mblk_t *mp_dup; 2328 2329 if ((mp_dup = dupmsg(mp)) == NULL) 2330 return (B_FALSE); 2331 2332 mp_dup->b_rptr += IPH_HDR_LENGTH(ip); 2333 if (pimp->pim_type == PIM_REGISTER) 2334 mp_dup->b_wptr = mp_dup->b_rptr + PIM_MINLEN; 2335 if (IP_CSUM(mp_dup, 0, 0)) { 2336 freemsg(mp_dup); 2337 return (B_FALSE); 2338 } 2339 freemsg(mp_dup); 2340 return (B_TRUE); 2341 } 2342 2343 /* 2344 * int 2345 * pim_input(queue_t *, mblk_t *, ill_t *ill) - Process PIM protocol packets. 2346 * IP Protocol 103. Register messages are decapsulated and sent 2347 * onto multicast forwarding. 
2348 */ 2349 int 2350 pim_input(queue_t *q, mblk_t *mp, ill_t *ill) 2351 { 2352 ipha_t *eip, *ip; 2353 int iplen, pimlen, iphlen; 2354 struct pim *pimp; /* pointer to a pim struct */ 2355 uint32_t *reghdr; 2356 ip_stack_t *ipst = ill->ill_ipst; 2357 2358 /* 2359 * Pullup the msg for PIM protocol processing. 2360 */ 2361 if (pullupmsg(mp, -1) == 0) { 2362 ++ipst->ips_mrtstat->mrts_pim_nomemory; 2363 freemsg(mp); 2364 return (-1); 2365 } 2366 2367 ip = (ipha_t *)mp->b_rptr; 2368 iplen = ip->ipha_length; 2369 iphlen = IPH_HDR_LENGTH(ip); 2370 pimlen = ntohs(iplen) - iphlen; 2371 2372 /* 2373 * Validate lengths 2374 */ 2375 if (pimlen < PIM_MINLEN) { 2376 ++ipst->ips_mrtstat->mrts_pim_malformed; 2377 if (ipst->ips_ip_mrtdebug > 1) { 2378 (void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE, 2379 "pim_input: length not at least minlen"); 2380 } 2381 freemsg(mp); 2382 return (-1); 2383 } 2384 2385 /* 2386 * Point to the PIM header. 2387 */ 2388 pimp = (struct pim *)((caddr_t)ip + iphlen); 2389 2390 /* 2391 * Check the version number. 
2392 */ 2393 if (pimp->pim_vers != PIM_VERSION) { 2394 ++ipst->ips_mrtstat->mrts_pim_badversion; 2395 if (ipst->ips_ip_mrtdebug > 1) { 2396 (void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE, 2397 "pim_input: unknown version of PIM"); 2398 } 2399 freemsg(mp); 2400 return (-1); 2401 } 2402 2403 /* 2404 * Validate the checksum 2405 */ 2406 if (!pim_validate_cksum(mp, ip, pimp)) { 2407 ++ipst->ips_mrtstat->mrts_pim_rcv_badcsum; 2408 if (ipst->ips_ip_mrtdebug > 1) { 2409 (void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE, 2410 "pim_input: invalid checksum"); 2411 } 2412 freemsg(mp); 2413 return (-1); 2414 } 2415 2416 if (pimp->pim_type != PIM_REGISTER) 2417 return (0); 2418 2419 reghdr = (uint32_t *)(pimp + 1); 2420 eip = (ipha_t *)(reghdr + 1); 2421 2422 /* 2423 * check if the inner packet is destined to mcast group 2424 */ 2425 if (!CLASSD(eip->ipha_dst)) { 2426 ++ipst->ips_mrtstat->mrts_pim_badregisters; 2427 if (ipst->ips_ip_mrtdebug > 1) { 2428 (void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE, 2429 "pim_input: Inner pkt not mcast .. !"); 2430 } 2431 freemsg(mp); 2432 return (-1); 2433 } 2434 if (ipst->ips_ip_mrtdebug > 1) { 2435 (void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE, 2436 "register from %x, to %x, len %d", 2437 ntohl(eip->ipha_src), 2438 ntohl(eip->ipha_dst), 2439 ntohs(eip->ipha_length)); 2440 } 2441 /* 2442 * If the null register bit is not set, decapsulate 2443 * the packet before forwarding it. 2444 */ 2445 if (!(ntohl(*reghdr) & PIM_NULL_REGISTER)) { 2446 mblk_t *mp_copy; 2447 2448 /* Copy the message */ 2449 if ((mp_copy = copymsg(mp)) == NULL) { 2450 ++ipst->ips_mrtstat->mrts_pim_nomemory; 2451 freemsg(mp); 2452 return (-1); 2453 } 2454 2455 /* 2456 * Decapsulate the packet and give it to 2457 * register_mforward. 
2458 */ 2459 mp_copy->b_rptr += iphlen + sizeof (pim_t) + 2460 sizeof (*reghdr); 2461 if (register_mforward(q, mp_copy, ill) != 0) { 2462 freemsg(mp); 2463 return (-1); 2464 } 2465 } 2466 2467 /* 2468 * Pass all valid PIM packets up to any process(es) listening on a raw 2469 * PIM socket. For Solaris it is done right after pim_input() is 2470 * called. 2471 */ 2472 return (0); 2473 } 2474 2475 /* 2476 * PIM sparse mode hook. Called by pim_input after decapsulating 2477 * the packet. Loop back the packet, as if we have received it. 2478 * In pim_input() we have to check if the destination is a multicast address. 2479 */ 2480 /* ARGSUSED */ 2481 static int 2482 register_mforward(queue_t *q, mblk_t *mp, ill_t *ill) 2483 { 2484 ip_stack_t *ipst = ill->ill_ipst; 2485 2486 ASSERT(ipst->ips_reg_vif_num <= ipst->ips_numvifs); 2487 2488 if (ipst->ips_ip_mrtdebug > 3) { 2489 ipha_t *ipha; 2490 2491 ipha = (ipha_t *)mp->b_rptr; 2492 (void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE, 2493 "register_mforward: src %x, dst %x\n", 2494 ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst)); 2495 } 2496 /* 2497 * Need to pass in to ip_mforward() the information that the 2498 * packet has arrived on the register_vif. We use the solution that 2499 * ip_mroute_decap() employs: use mp->b_prev to pass some information 2500 * to ip_mforward(). Nonzero value means the packet has arrived on a 2501 * tunnel (ip_mroute_decap() puts the address of the other side of the 2502 * tunnel there.) This is safe since ip_rput() either frees the packet 2503 * or passes it to ip_mforward(). We use 2504 * PIM_REGISTER_MARKER = 0xffffffff to indicate the has arrived on the 2505 * register vif. If in the future we have more than one register vifs, 2506 * then this will need re-examination. 2507 */ 2508 mp->b_prev = (mblk_t *)PIM_REGISTER_MARKER; 2509 ++ipst->ips_mrtstat->mrts_pim_regforwards; 2510 ip_rput(q, mp); 2511 return (0); 2512 } 2513 2514 /* 2515 * Send an encapsulated packet. 
2516 * Caller assumes can continue to use mp when routine returns. 2517 */ 2518 /* ARGSUSED */ 2519 static void 2520 encap_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst) 2521 { 2522 mblk_t *mp_copy; 2523 ipha_t *ipha_copy; 2524 size_t len; 2525 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 2526 2527 if (ipst->ips_ip_mrtdebug > 1) { 2528 (void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE, 2529 "encap_send: vif %ld enter", 2530 (ptrdiff_t)(vifp - ipst->ips_vifs)); 2531 } 2532 len = ntohs(ipha->ipha_length); 2533 2534 /* 2535 * Copy the old packet & pullup it's IP header into the 2536 * new mbuf so we can modify it. Try to fill the new 2537 * mbuf since if we don't the ethernet driver will. 2538 */ 2539 mp_copy = allocb(32 + sizeof (multicast_encap_iphdr), BPRI_MED); 2540 if (mp_copy == NULL) 2541 return; 2542 mp_copy->b_rptr += 32; 2543 mp_copy->b_wptr = mp_copy->b_rptr + sizeof (multicast_encap_iphdr); 2544 if ((mp_copy->b_cont = copymsg(mp)) == NULL) { 2545 freeb(mp_copy); 2546 return; 2547 } 2548 2549 /* 2550 * Fill in the encapsulating IP header. 2551 * Remote tunnel dst in rmt_addr, from add_vif(). 2552 */ 2553 ipha_copy = (ipha_t *)mp_copy->b_rptr; 2554 *ipha_copy = multicast_encap_iphdr; 2555 ASSERT((len + sizeof (ipha_t)) <= IP_MAXPACKET); 2556 ipha_copy->ipha_length = htons(len + sizeof (ipha_t)); 2557 ipha_copy->ipha_src = vifp->v_lcl_addr.s_addr; 2558 ipha_copy->ipha_dst = vifp->v_rmt_addr.s_addr; 2559 ASSERT(ipha_copy->ipha_ident == 0); 2560 2561 /* Turn the encapsulated IP header back into a valid one. 
*/ 2562 ipha = (ipha_t *)mp_copy->b_cont->b_rptr; 2563 ipha->ipha_ttl--; 2564 ipha->ipha_hdr_checksum = 0; 2565 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 2566 2567 if (ipst->ips_ip_mrtdebug > 1) { 2568 (void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE, 2569 "encap_send: group 0x%x", ntohl(ipha->ipha_dst)); 2570 } 2571 if (vifp->v_rate_limit <= 0) 2572 tbf_send_packet(vifp, mp_copy); 2573 else 2574 /* ipha is from the original header */ 2575 tbf_control(vifp, mp_copy, ipha); 2576 } 2577 2578 /* 2579 * De-encapsulate a packet and feed it back through IP input. 2580 * This routine is called whenever IP gets a packet with prototype 2581 * IPPROTO_ENCAP and a local destination address. 2582 */ 2583 void 2584 ip_mroute_decap(queue_t *q, mblk_t *mp, ill_t *ill) 2585 { 2586 ipha_t *ipha = (ipha_t *)mp->b_rptr; 2587 ipha_t *ipha_encap; 2588 int hlen = IPH_HDR_LENGTH(ipha); 2589 ipaddr_t src; 2590 struct vif *vifp; 2591 ip_stack_t *ipst = ill->ill_ipst; 2592 2593 /* 2594 * Dump the packet if it's not to a multicast destination or if 2595 * we don't have an encapsulating tunnel with the source. 2596 * Note: This code assumes that the remote site IP address 2597 * uniquely identifies the tunnel (i.e., that this site has 2598 * at most one tunnel with the remote site). 
2599 */ 2600 ipha_encap = (ipha_t *)((char *)ipha + hlen); 2601 if (!CLASSD(ipha_encap->ipha_dst)) { 2602 ipst->ips_mrtstat->mrts_bad_tunnel++; 2603 ip1dbg(("ip_mroute_decap: bad tunnel\n")); 2604 freemsg(mp); 2605 return; 2606 } 2607 src = (ipaddr_t)ipha->ipha_src; 2608 mutex_enter(&ipst->ips_last_encap_lock); 2609 if (src != ipst->ips_last_encap_src) { 2610 struct vif *vife; 2611 2612 vifp = ipst->ips_vifs; 2613 vife = vifp + ipst->ips_numvifs; 2614 ipst->ips_last_encap_src = src; 2615 ipst->ips_last_encap_vif = 0; 2616 for (; vifp < vife; ++vifp) { 2617 if (!lock_good_vif(vifp)) 2618 continue; 2619 if (vifp->v_rmt_addr.s_addr == src) { 2620 if (vifp->v_flags & VIFF_TUNNEL) 2621 ipst->ips_last_encap_vif = vifp; 2622 if (ipst->ips_ip_mrtdebug > 1) { 2623 (void) mi_strlog(ipst->ips_ip_g_mrouter, 2624 1, SL_TRACE, 2625 "ip_mroute_decap: good tun " 2626 "vif %ld with %x", 2627 (ptrdiff_t)(vifp - ipst->ips_vifs), 2628 ntohl(src)); 2629 } 2630 unlock_good_vif(vifp); 2631 break; 2632 } 2633 unlock_good_vif(vifp); 2634 } 2635 } 2636 if ((vifp = ipst->ips_last_encap_vif) == 0) { 2637 mutex_exit(&ipst->ips_last_encap_lock); 2638 ipst->ips_mrtstat->mrts_bad_tunnel++; 2639 freemsg(mp); 2640 ip1dbg(("ip_mroute_decap: vif %ld no tunnel with %x\n", 2641 (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(src))); 2642 return; 2643 } 2644 mutex_exit(&ipst->ips_last_encap_lock); 2645 2646 /* 2647 * Need to pass in the tunnel source to ip_mforward (so that it can 2648 * verify that the packet arrived over the correct vif.) We use b_prev 2649 * to pass this information. This is safe since the ip_rput either 2650 * frees the packet or passes it to ip_mforward. 2651 */ 2652 mp->b_prev = (mblk_t *)(uintptr_t)src; 2653 mp->b_rptr += hlen; 2654 /* Feed back into ip_rput as an M_DATA. */ 2655 ip_rput(q, mp); 2656 } 2657 2658 /* 2659 * Remove all records with v_ipif == ipif. Called when an interface goes away 2660 * (stream closed). Called as writer. 
2661 */ 2662 void 2663 reset_mrt_vif_ipif(ipif_t *ipif) 2664 { 2665 vifi_t vifi, tmp_vifi; 2666 vifi_t num_of_vifs; 2667 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 2668 2669 /* Can't check vifi >= 0 since vifi_t is unsigned! */ 2670 2671 mutex_enter(&ipst->ips_numvifs_mutex); 2672 num_of_vifs = ipst->ips_numvifs; 2673 mutex_exit(&ipst->ips_numvifs_mutex); 2674 2675 for (vifi = num_of_vifs; vifi != 0; vifi--) { 2676 tmp_vifi = vifi - 1; 2677 if (ipst->ips_vifs[tmp_vifi].v_ipif == ipif) { 2678 (void) del_vif(&tmp_vifi, NULL, NULL, ipst); 2679 } 2680 } 2681 } 2682 2683 /* Remove pending upcall msgs when ill goes away. Called by ill_delete. */ 2684 void 2685 reset_mrt_ill(ill_t *ill) 2686 { 2687 struct mfc *rt; 2688 struct rtdetq *rte; 2689 int i; 2690 ip_stack_t *ipst = ill->ill_ipst; 2691 2692 for (i = 0; i < MFCTBLSIZ; i++) { 2693 MFCB_REFHOLD(&ipst->ips_mfcs[i]); 2694 if ((rt = ipst->ips_mfcs[i].mfcb_mfc) != NULL) { 2695 if (ipst->ips_ip_mrtdebug > 1) { 2696 (void) mi_strlog(ipst->ips_ip_g_mrouter, 1, 2697 SL_TRACE, 2698 "reset_mrt_ill: mfctable [%d]", i); 2699 } 2700 while (rt != NULL) { 2701 mutex_enter(&rt->mfc_mutex); 2702 while ((rte = rt->mfc_rte) != NULL) { 2703 if (rte->ill == ill) { 2704 if (ipst->ips_ip_mrtdebug > 1) { 2705 (void) mi_strlog( 2706 ipst->ips_ip_g_mrouter, 2707 1, SL_TRACE, 2708 "reset_mrt_ill: " 2709 "ill 0x%p", ill); 2710 } 2711 rt->mfc_rte = rte->rte_next; 2712 freemsg(rte->mp); 2713 mi_free((char *)rte); 2714 } 2715 } 2716 mutex_exit(&rt->mfc_mutex); 2717 rt = rt->mfc_next; 2718 } 2719 } 2720 MFCB_REFRELE(&ipst->ips_mfcs[i]); 2721 } 2722 } 2723 2724 /* 2725 * Token bucket filter module. 2726 * The ipha is for mcastgrp destination for phyint and encap. 
2727 */ 2728 static void 2729 tbf_control(struct vif *vifp, mblk_t *mp, ipha_t *ipha) 2730 { 2731 size_t p_len = msgdsize(mp); 2732 struct tbf *t = vifp->v_tbf; 2733 timeout_id_t id = 0; 2734 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 2735 2736 /* Drop if packet is too large */ 2737 if (p_len > MAX_BKT_SIZE) { 2738 ipst->ips_mrtstat->mrts_pkt2large++; 2739 freemsg(mp); 2740 return; 2741 } 2742 if (ipst->ips_ip_mrtdebug > 1) { 2743 (void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE, 2744 "tbf_ctrl: SEND vif %ld, qlen %d, ipha_dst 0x%x", 2745 (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_q_len, 2746 ntohl(ipha->ipha_dst)); 2747 } 2748 2749 mutex_enter(&t->tbf_lock); 2750 2751 tbf_update_tokens(vifp); 2752 2753 /* 2754 * If there are enough tokens, 2755 * and the queue is empty, send this packet out. 2756 */ 2757 if (ipst->ips_ip_mrtdebug > 1) { 2758 (void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE, 2759 "tbf_control: vif %ld, TOKENS %d, pkt len %lu, qlen %d", 2760 (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_n_tok, p_len, 2761 t->tbf_q_len); 2762 } 2763 /* No packets are queued */ 2764 if (t->tbf_q_len == 0) { 2765 /* queue empty, send packet if enough tokens */ 2766 if (p_len <= t->tbf_n_tok) { 2767 t->tbf_n_tok -= p_len; 2768 mutex_exit(&t->tbf_lock); 2769 tbf_send_packet(vifp, mp); 2770 return; 2771 } else { 2772 /* Queue packet and timeout till later */ 2773 tbf_queue(vifp, mp); 2774 ASSERT(vifp->v_timeout_id == 0); 2775 vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp, 2776 TBF_REPROCESS); 2777 } 2778 } else if (t->tbf_q_len < t->tbf_max_q_len) { 2779 /* Finite queue length, so queue pkts and process queue */ 2780 tbf_queue(vifp, mp); 2781 tbf_process_q(vifp); 2782 } else { 2783 /* Check that we have UDP header with IP header */ 2784 size_t hdr_length = IPH_HDR_LENGTH(ipha) + 2785 sizeof (struct udphdr); 2786 2787 if ((mp->b_wptr - mp->b_rptr) < hdr_length) { 2788 if (!pullupmsg(mp, hdr_length)) { 2789 freemsg(mp); 2790 ip1dbg(("tbf_ctl: couldn't 
pullup udp hdr, " 2791 "vif %ld src 0x%x dst 0x%x\n", 2792 (ptrdiff_t)(vifp - ipst->ips_vifs), 2793 ntohl(ipha->ipha_src), 2794 ntohl(ipha->ipha_dst))); 2795 mutex_exit(&vifp->v_tbf->tbf_lock); 2796 return; 2797 } else 2798 /* Have to reassign ipha after pullupmsg */ 2799 ipha = (ipha_t *)mp->b_rptr; 2800 } 2801 /* 2802 * Queue length too much, 2803 * try to selectively dq, or queue and process 2804 */ 2805 if (!tbf_dq_sel(vifp, ipha)) { 2806 ipst->ips_mrtstat->mrts_q_overflow++; 2807 freemsg(mp); 2808 } else { 2809 tbf_queue(vifp, mp); 2810 tbf_process_q(vifp); 2811 } 2812 } 2813 if (t->tbf_q_len == 0) { 2814 id = vifp->v_timeout_id; 2815 vifp->v_timeout_id = 0; 2816 } 2817 mutex_exit(&vifp->v_tbf->tbf_lock); 2818 if (id != 0) 2819 (void) untimeout(id); 2820 } 2821 2822 /* 2823 * Adds a packet to the tbf queue at the interface. 2824 * The ipha is for mcastgrp destination for phyint and encap. 2825 */ 2826 static void 2827 tbf_queue(struct vif *vifp, mblk_t *mp) 2828 { 2829 struct tbf *t = vifp->v_tbf; 2830 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 2831 2832 if (ipst->ips_ip_mrtdebug > 1) { 2833 (void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE, 2834 "tbf_queue: vif %ld", (ptrdiff_t)(vifp - ipst->ips_vifs)); 2835 } 2836 ASSERT(MUTEX_HELD(&t->tbf_lock)); 2837 2838 if (t->tbf_t == NULL) { 2839 /* Queue was empty */ 2840 t->tbf_q = mp; 2841 } else { 2842 /* Insert at tail */ 2843 t->tbf_t->b_next = mp; 2844 } 2845 /* set new tail pointer */ 2846 t->tbf_t = mp; 2847 2848 mp->b_next = mp->b_prev = NULL; 2849 2850 t->tbf_q_len++; 2851 } 2852 2853 /* 2854 * Process the queue at the vif interface. 2855 * Drops the tbf_lock when sending packets. 2856 * 2857 * NOTE : The caller should quntimeout if the queue length is 0. 
2858 */ 2859 static void 2860 tbf_process_q(struct vif *vifp) 2861 { 2862 mblk_t *mp; 2863 struct tbf *t = vifp->v_tbf; 2864 size_t len; 2865 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 2866 2867 if (ipst->ips_ip_mrtdebug > 1) { 2868 (void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE, 2869 "tbf_process_q 1: vif %ld qlen = %d", 2870 (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_q_len); 2871 } 2872 2873 /* 2874 * Loop through the queue at the interface and send 2875 * as many packets as possible. 2876 */ 2877 ASSERT(MUTEX_HELD(&t->tbf_lock)); 2878 2879 while (t->tbf_q_len > 0) { 2880 mp = t->tbf_q; 2881 len = (size_t)msgdsize(mp); /* length of ip pkt */ 2882 2883 /* Determine if the packet can be sent */ 2884 if (len <= t->tbf_n_tok) { 2885 /* 2886 * If so, reduce no. of tokens, dequeue the packet, 2887 * send the packet. 2888 */ 2889 t->tbf_n_tok -= len; 2890 2891 t->tbf_q = mp->b_next; 2892 if (--t->tbf_q_len == 0) { 2893 t->tbf_t = NULL; 2894 } 2895 mp->b_next = NULL; 2896 /* Exit mutex before sending packet, then re-enter */ 2897 mutex_exit(&t->tbf_lock); 2898 tbf_send_packet(vifp, mp); 2899 mutex_enter(&t->tbf_lock); 2900 } else 2901 break; 2902 } 2903 } 2904 2905 /* Called at tbf timeout to update tokens, process q and reset timer. 
*/ 2906 static void 2907 tbf_reprocess_q(void *arg) 2908 { 2909 struct vif *vifp = arg; 2910 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 2911 2912 mutex_enter(&vifp->v_tbf->tbf_lock); 2913 vifp->v_timeout_id = 0; 2914 tbf_update_tokens(vifp); 2915 2916 tbf_process_q(vifp); 2917 2918 if (vifp->v_tbf->tbf_q_len > 0) { 2919 vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp, 2920 TBF_REPROCESS); 2921 } 2922 mutex_exit(&vifp->v_tbf->tbf_lock); 2923 2924 if (ipst->ips_ip_mrtdebug > 1) { 2925 (void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE, 2926 "tbf_reprcess_q: vif %ld timeout id = %p", 2927 (ptrdiff_t)(vifp - ipst->ips_vifs), vifp->v_timeout_id); 2928 } 2929 } 2930 2931 /* 2932 * Function that will selectively discard a member of the tbf queue, 2933 * based on the precedence value and the priority. 2934 * 2935 * NOTE : The caller should quntimeout if the queue length is 0. 2936 */ 2937 static int 2938 tbf_dq_sel(struct vif *vifp, ipha_t *ipha) 2939 { 2940 uint_t p; 2941 struct tbf *t = vifp->v_tbf; 2942 mblk_t **np; 2943 mblk_t *last, *mp; 2944 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 2945 2946 if (ipst->ips_ip_mrtdebug > 1) { 2947 (void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE, 2948 "dq_sel: vif %ld dst 0x%x", 2949 (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(ipha->ipha_dst)); 2950 } 2951 2952 ASSERT(MUTEX_HELD(&t->tbf_lock)); 2953 p = priority(vifp, ipha); 2954 2955 np = &t->tbf_q; 2956 last = NULL; 2957 while ((mp = *np) != NULL) { 2958 if (p > (priority(vifp, (ipha_t *)mp->b_rptr))) { 2959 *np = mp->b_next; 2960 /* If removing the last packet, fix the tail pointer */ 2961 if (mp == t->tbf_t) 2962 t->tbf_t = last; 2963 mp->b_prev = mp->b_next = NULL; 2964 freemsg(mp); 2965 /* 2966 * It's impossible for the queue to be empty, but 2967 * we check anyway. 
2968 */ 2969 if (--t->tbf_q_len == 0) { 2970 t->tbf_t = NULL; 2971 } 2972 ipst->ips_mrtstat->mrts_drop_sel++; 2973 return (1); 2974 } 2975 np = &mp->b_next; 2976 last = mp; 2977 } 2978 return (0); 2979 } 2980 2981 /* Sends packet, 2 cases - encap tunnel, phyint. */ 2982 static void 2983 tbf_send_packet(struct vif *vifp, mblk_t *mp) 2984 { 2985 ipif_t *ipif; 2986 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 2987 2988 /* If encap tunnel options */ 2989 if (vifp->v_flags & VIFF_TUNNEL) { 2990 if (ipst->ips_ip_mrtdebug > 1) { 2991 (void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE, 2992 "tbf_send_pkt: ENCAP tunnel vif %ld", 2993 (ptrdiff_t)(vifp - ipst->ips_vifs)); 2994 } 2995 2996 /* 2997 * Feed into ip_wput which will set the ident field and 2998 * checksum the encapsulating header. 2999 * BSD gets the cached route vifp->v_route from ip_output() 3000 * to speed up route table lookups. Not necessary in SunOS 5.x. 3001 */ 3002 put(vifp->v_ipif->ipif_wq, mp); 3003 return; 3004 3005 /* phyint */ 3006 } else { 3007 /* Need to loop back to members on the outgoing interface. */ 3008 ipha_t *ipha; 3009 ipaddr_t dst; 3010 ipha = (ipha_t *)mp->b_rptr; 3011 dst = ipha->ipha_dst; 3012 ipif = vifp->v_ipif; 3013 3014 mutex_enter(&ipif->ipif_ill->ill_lock); 3015 if (ilm_lookup_ipif(ipif, dst) != NULL) { 3016 /* 3017 * The packet is not yet reassembled, thus we need to 3018 * pass it to ip_rput_local for checksum verification 3019 * and reassembly (and fanout the user stream). 
3020 */ 3021 mblk_t *mp_loop; 3022 ire_t *ire; 3023 3024 mutex_exit(&ipif->ipif_ill->ill_lock); 3025 if (ipst->ips_ip_mrtdebug > 1) { 3026 (void) mi_strlog(ipst->ips_ip_g_mrouter, 1, 3027 SL_TRACE, 3028 "tbf_send_pkt: loopback vif %ld", 3029 (ptrdiff_t)(vifp - ipst->ips_vifs)); 3030 } 3031 mp_loop = copymsg(mp); 3032 ire = ire_ctable_lookup(~0, 0, IRE_BROADCAST, NULL, 3033 ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 3034 3035 if (mp_loop != NULL && ire != NULL) { 3036 IP_RPUT_LOCAL(ipif->ipif_rq, mp_loop, 3037 ((ipha_t *)mp_loop->b_rptr), 3038 ire, (ill_t *)ipif->ipif_rq->q_ptr); 3039 } else { 3040 /* Either copymsg failed or no ire */ 3041 (void) mi_strlog(ipst->ips_ip_g_mrouter, 1, 3042 SL_TRACE, 3043 "tbf_send_pkt: mp_loop 0x%p, ire 0x%p " 3044 "vif %ld\n", mp_loop, ire, 3045 (ptrdiff_t)(vifp - ipst->ips_vifs)); 3046 } 3047 if (ire != NULL) 3048 ire_refrele(ire); 3049 } else { 3050 mutex_exit(&ipif->ipif_ill->ill_lock); 3051 } 3052 if (ipst->ips_ip_mrtdebug > 1) { 3053 (void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE, 3054 "tbf_send_pkt: phyint forward vif %ld dst = 0x%x", 3055 (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(dst)); 3056 } 3057 ip_rput_forward_multicast(dst, mp, ipif); 3058 } 3059 } 3060 3061 /* 3062 * Determine the current time and then the elapsed time (between the last time 3063 * and time now). Update the no. of tokens in the bucket. 3064 */ 3065 static void 3066 tbf_update_tokens(struct vif *vifp) 3067 { 3068 timespec_t tp; 3069 hrtime_t tm; 3070 struct tbf *t = vifp->v_tbf; 3071 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 3072 3073 ASSERT(MUTEX_HELD(&t->tbf_lock)); 3074 3075 /* Time in secs and nsecs, rate limit in kbits/sec */ 3076 gethrestime(&tp); 3077 3078 /*LINTED*/ 3079 TV_DELTA(tp, t->tbf_last_pkt_t, tm); 3080 3081 /* 3082 * This formula is actually 3083 * "time in seconds" * "bytes/second". Scaled for nsec. 
3084 * (tm/1000000000) * (v_rate_limit * 1000 * (1000/1024) /8) 3085 * 3086 * The (1000/1024) was introduced in add_vif to optimize 3087 * this divide into a shift. 3088 */ 3089 t->tbf_n_tok += (tm/1000) * vifp->v_rate_limit / 1024 / 8; 3090 t->tbf_last_pkt_t = tp; 3091 3092 if (t->tbf_n_tok > MAX_BKT_SIZE) 3093 t->tbf_n_tok = MAX_BKT_SIZE; 3094 if (ipst->ips_ip_mrtdebug > 1) { 3095 (void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE, 3096 "tbf_update_tok: tm %lld tok %d vif %ld", 3097 tm, t->tbf_n_tok, (ptrdiff_t)(vifp - ipst->ips_vifs)); 3098 } 3099 } 3100 3101 /* 3102 * Priority currently is based on port nos. 3103 * Different forwarding mechanisms have different ways 3104 * of obtaining the port no. Hence, the vif must be 3105 * given along with the packet itself. 3106 * 3107 */ 3108 static int 3109 priority(struct vif *vifp, ipha_t *ipha) 3110 { 3111 int prio; 3112 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; 3113 3114 /* Temporary hack; may add general packet classifier some day */ 3115 3116 ASSERT(MUTEX_HELD(&vifp->v_tbf->tbf_lock)); 3117 3118 /* 3119 * The UDP port space is divided up into four priority ranges: 3120 * [0, 16384) : unclassified - lowest priority 3121 * [16384, 32768) : audio - highest priority 3122 * [32768, 49152) : whiteboard - medium priority 3123 * [49152, 65536) : video - low priority 3124 */ 3125 3126 if (ipha->ipha_protocol == IPPROTO_UDP) { 3127 struct udphdr *udp = 3128 (struct udphdr *)((char *)ipha + IPH_HDR_LENGTH(ipha)); 3129 switch (ntohs(udp->uh_dport) & 0xc000) { 3130 case 0x4000: 3131 prio = 70; 3132 break; 3133 case 0x8000: 3134 prio = 60; 3135 break; 3136 case 0xc000: 3137 prio = 55; 3138 break; 3139 default: 3140 prio = 50; 3141 break; 3142 } 3143 if (ipst->ips_ip_mrtdebug > 1) { 3144 (void) mi_strlog(ipst->ips_ip_g_mrouter, 1, SL_TRACE, 3145 "priority: port %x prio %d\n", 3146 ntohs(udp->uh_dport), prio); 3147 } 3148 } else 3149 prio = 50; /* default priority */ 3150 return (prio); 3151 } 3152 3153 /* 3154 * End 
of token bucket filter modifications 3155 */ 3156 3157 3158 3159 /* 3160 * Produces data for netstat -M. 3161 */ 3162 int 3163 ip_mroute_stats(mblk_t *mp, ip_stack_t *ipst) 3164 { 3165 ipst->ips_mrtstat->mrts_vifctlSize = sizeof (struct vifctl); 3166 ipst->ips_mrtstat->mrts_mfcctlSize = sizeof (struct mfcctl); 3167 if (!snmp_append_data(mp, (char *)ipst->ips_mrtstat, 3168 sizeof (struct mrtstat))) { 3169 ip0dbg(("ip_mroute_stats: failed %ld bytes\n", 3170 (size_t)sizeof (struct mrtstat))); 3171 return (0); 3172 } 3173 return (1); 3174 } 3175 3176 /* 3177 * Sends info for SNMP's MIB. 3178 */ 3179 int 3180 ip_mroute_vif(mblk_t *mp, ip_stack_t *ipst) 3181 { 3182 struct vifctl vi; 3183 vifi_t vifi; 3184 3185 mutex_enter(&ipst->ips_numvifs_mutex); 3186 for (vifi = 0; vifi < ipst->ips_numvifs; vifi++) { 3187 if (ipst->ips_vifs[vifi].v_lcl_addr.s_addr == 0) 3188 continue; 3189 /* 3190 * No locks here, an approximation is fine. 3191 */ 3192 vi.vifc_vifi = vifi; 3193 vi.vifc_flags = ipst->ips_vifs[vifi].v_flags; 3194 vi.vifc_threshold = ipst->ips_vifs[vifi].v_threshold; 3195 vi.vifc_rate_limit = ipst->ips_vifs[vifi].v_rate_limit; 3196 vi.vifc_lcl_addr = ipst->ips_vifs[vifi].v_lcl_addr; 3197 vi.vifc_rmt_addr = ipst->ips_vifs[vifi].v_rmt_addr; 3198 vi.vifc_pkt_in = ipst->ips_vifs[vifi].v_pkt_in; 3199 vi.vifc_pkt_out = ipst->ips_vifs[vifi].v_pkt_out; 3200 3201 if (!snmp_append_data(mp, (char *)&vi, sizeof (vi))) { 3202 ip0dbg(("ip_mroute_vif: failed %ld bytes\n", 3203 (size_t)sizeof (vi))); 3204 return (0); 3205 } 3206 } 3207 mutex_exit(&ipst->ips_numvifs_mutex); 3208 return (1); 3209 } 3210 3211 /* 3212 * Called by ip_snmp_get to send up multicast routing table. 3213 */ 3214 int 3215 ip_mroute_mrt(mblk_t *mp, ip_stack_t *ipst) 3216 { 3217 int i, j; 3218 struct mfc *rt; 3219 struct mfcctl mfcc; 3220 3221 /* 3222 * Make sure multicast has not been turned off. 
3223 */ 3224 if (is_mrouter_off(ipst)) 3225 return (1); 3226 3227 /* Loop over all hash buckets and their chains */ 3228 for (i = 0; i < MFCTBLSIZ; i++) { 3229 MFCB_REFHOLD(&ipst->ips_mfcs[i]); 3230 for (rt = ipst->ips_mfcs[i].mfcb_mfc; rt; rt = rt->mfc_next) { 3231 mutex_enter(&rt->mfc_mutex); 3232 if (rt->mfc_rte != NULL || 3233 (rt->mfc_marks & MFCB_MARK_CONDEMNED)) { 3234 mutex_exit(&rt->mfc_mutex); 3235 continue; 3236 } 3237 mfcc.mfcc_origin = rt->mfc_origin; 3238 mfcc.mfcc_mcastgrp = rt->mfc_mcastgrp; 3239 mfcc.mfcc_parent = rt->mfc_parent; 3240 mfcc.mfcc_pkt_cnt = rt->mfc_pkt_cnt; 3241 mutex_enter(&ipst->ips_numvifs_mutex); 3242 for (j = 0; j < (int)ipst->ips_numvifs; j++) 3243 mfcc.mfcc_ttls[j] = rt->mfc_ttls[j]; 3244 for (j = (int)ipst->ips_numvifs; j < MAXVIFS; j++) 3245 mfcc.mfcc_ttls[j] = 0; 3246 mutex_exit(&ipst->ips_numvifs_mutex); 3247 3248 mutex_exit(&rt->mfc_mutex); 3249 if (!snmp_append_data(mp, (char *)&mfcc, 3250 sizeof (mfcc))) { 3251 MFCB_REFRELE(&ipst->ips_mfcs[i]); 3252 ip0dbg(("ip_mroute_mrt: failed %ld bytes\n", 3253 (size_t)sizeof (mfcc))); 3254 return (0); 3255 } 3256 } 3257 MFCB_REFRELE(&ipst->ips_mfcs[i]); 3258 } 3259 return (1); 3260 } 3261