1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24 /* Copyright (c) 1990 Mentat Inc. */
25
26 /*
27 * Copyright (c) 2018, Joyent, Inc.
28 * Copyright 2024 Oxide Computer Company
29 */
30
31 /*
32 * Procedures for the kernel part of DVMRP,
33 * a Distance-Vector Multicast Routing Protocol.
34 * (See RFC-1075)
35 * Written by David Waitzman, BBN Labs, August 1988.
36 * Modified by Steve Deering, Stanford, February 1989.
37 * Modified by Mark J. Steiglitz, Stanford, May, 1991
38 * Modified by Van Jacobson, LBL, January 1993
39 * Modified by Ajit Thyagarajan, PARC, August 1993
40 * Modified by Bill Fenner, PARC, April 1995
41 *
42 * MROUTING 3.5
43 */
44
45 /*
46 * TODO
47 * - function pointer field in vif, void *vif_sendit()
48 */
49
50 #include <sys/types.h>
51 #include <sys/stream.h>
52 #include <sys/stropts.h>
53 #include <sys/strlog.h>
54 #include <sys/systm.h>
55 #include <sys/ddi.h>
56 #include <sys/cmn_err.h>
57 #include <sys/zone.h>
58
59 #include <sys/param.h>
60 #include <sys/socket.h>
61 #include <sys/vtrace.h>
62 #include <sys/debug.h>
63 #include <net/if.h>
64 #include <sys/sockio.h>
65 #include <netinet/in.h>
66 #include <net/if_dl.h>
67
68 #include <inet/ipsec_impl.h>
69 #include <inet/common.h>
70 #include <inet/mi.h>
71 #include <inet/nd.h>
72 #include <inet/tunables.h>
73 #include <inet/mib2.h>
74 #include <netinet/ip6.h>
75 #include <inet/ip.h>
76 #include <inet/snmpcom.h>
77
78 #include <netinet/igmp.h>
79 #include <netinet/igmp_var.h>
80 #include <netinet/udp.h>
81 #include <netinet/ip_mroute.h>
82 #include <inet/ip_multi.h>
83 #include <inet/ip_ire.h>
84 #include <inet/ip_ndp.h>
85 #include <inet/ip_if.h>
86 #include <inet/ipclassifier.h>
87
88 #include <netinet/pim.h>
89
90
91 /*
92 * MT Design:
93 *
94 * There are three main data structures viftable, mfctable and tbftable that
95 * need to be protected against MT races.
96 *
 * viftable is a fixed length array of vif structs. There is no lock to
 * protect the whole array, instead each struct is protected by its own
 * individual lock. The value of v_marks in conjunction with the value of
 * v_refcnt determines the current state of a vif structure. One special
 * state that needs mention is when the vif is marked VIF_MARK_NOTINUSE but
 * refcnt != 0. This indicates that vif is being initialized.
 * Each structure is freed when the refcnt goes down to zero. If a delete comes
 * in when the refcnt is > 1, the vif structure is marked VIF_MARK_CONDEMNED
 * which prevents the struct from further use. When the refcnt goes to zero
 * the struct is freed and is marked VIF_MARK_NOTINUSE.
107 * vif struct stores a pointer to the ipif in v_ipif, to prevent ipif/ill
108 * from going away a refhold is put on the ipif before using it. see
109 * lock_good_vif() and unlock_good_vif().
110 *
111 * VIF_REFHOLD and VIF_REFRELE macros have been provided to manipulate refcnts
112 * of the vif struct.
113 *
114 * tbftable is also a fixed length array of tbf structs and is only accessed
115 * via v_tbf. It is protected by its own lock tbf_lock.
116 *
 * Lock Ordering is
 *	v_lock --> tbf_lock
 *	v_lock --> ill_lock
120 *
 * mfctable is a fixed size hash table of mfc bucket structs (struct mfcb).
 * Each mfc bucket struct (struct mfcb) maintains a refcnt for each walker,
 * it also maintains a state. These fields are protected by a lock (mfcb_lock).
 * mfc structs only maintain a state and have no refcnt. mfc_mutex is used to
 * protect the struct elements.
126 *
 * mfc structs are dynamically allocated and are singly linked
 * at the head of the chain. When an mfc structure is to be deleted
 * it is marked condemned and so is the state in the bucket struct.
 * When the last walker of the hash bucket exits all the mfc structs
 * marked condemned are freed.
132 *
133 * Locking Hierarchy:
134 * The bucket lock should be acquired before the mfc struct lock.
135 * MFCB_REFHOLD and MFCB_REFRELE macros are provided for locking
136 * operations on the bucket struct.
137 *
 * last_encap_lock and numvifs_mutex should be acquired after
 * acquiring vif or mfc locks. These locks protect some global variables.
 *
 * The statistics are not currently protected by a lock
 * causing the stats to be approximate, not exact.
143 */
144
#define	NO_VIF	MAXVIFS		/* from mrouted, no route for src */

/*
 * Timeouts:
 *	Upcall timeouts - BSD uses boolean_t mfc->expire and
 *	nexpire[MFCTBLSIZE], the number of times expire has been called.
 *	SunOS 5.x uses mfc->timeout for each mfc.
 *	Some Unixes are limited in the number of simultaneous timeouts
 *	that can be run, SunOS 5.x does not have this restriction.
 */

/*
 * In BSD, EXPIRE_TIMEOUT is how often expire_upcalls() is called and
 * UPCALL_EXPIRE is the number of timeouts before a particular upcall
 * expires. Thus the time till expiration is EXPIRE_TIMEOUT * UPCALL_EXPIRE
 */
#define	EXPIRE_TIMEOUT	(hz/4)	/* 4x / second	*/
#define	UPCALL_EXPIRE	6	/* number of timeouts	*/

/*
 * Hash function for a source, group entry.
 * Folds shifted copies of both addresses together with XOR and hands the
 * result to MFCHASHMOD.
 */
#define	MFCHASH(a, g) MFCHASHMOD(((a) >> 20) ^ ((a) >> 10) ^ (a) ^ \
	((g) >> 20) ^ ((g) >> 10) ^ (g))

#define	TBF_REPROCESS	(hz / 100)	/* 100x /second	*/

/* Identify PIM packet that came on a Register interface */
#define	PIM_REGISTER_MARKER	0xffffffff
174
/*
 * Function declarations.
 * Forward declarations for the file-local (static) routines defined below.
 */
static int	add_mfc(struct mfcctl *, ip_stack_t *);
static int	add_vif(struct vifctl *, conn_t *, ip_stack_t *);
static int	del_mfc(struct mfcctl *, ip_stack_t *);
static int	del_vif(vifi_t *, ip_stack_t *);
static void	del_vifp(struct vif *);
static void	encap_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
static void	expire_upcalls(void *);
static void	fill_route(struct mfc *, struct mfcctl *, ip_stack_t *);
static void	free_queue(struct mfc *);
static int	get_assert(uchar_t *, ip_stack_t *);
static int	get_lsg_cnt(struct sioc_lsg_req *, ip_stack_t *);
static int	get_sg_cnt(struct sioc_sg_req *, ip_stack_t *);
static int	get_version(uchar_t *);
static int	get_vif_cnt(struct sioc_vif_req *, ip_stack_t *);
static int	ip_mdq(mblk_t *, ipha_t *, ill_t *,
		    ipaddr_t, struct mfc *);
static int	ip_mrouter_init(conn_t *, uchar_t *, int, ip_stack_t *);
static void	phyint_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
static int	register_mforward(mblk_t *, ip_recv_attr_t *);
static void	register_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
static int	set_assert(int *, ip_stack_t *);

/*
 * Token Bucket Filter functions
 */
static int  priority(struct vif *, ipha_t *);
static void tbf_control(struct vif *, mblk_t *, ipha_t *);
static int  tbf_dq_sel(struct vif *, ipha_t *);
static void tbf_process_q(struct vif *);
static void tbf_queue(struct vif *, mblk_t *);
static void tbf_reprocess_q(void *);
static void tbf_send_packet(struct vif *, mblk_t *);
static void tbf_update_tokens(struct vif *);
/* Called from MFCB_REFRELE when the last walker leaves a condemned bucket */
static void release_mfc(struct mfcb *);

/* B_TRUE when no multicast router is registered or it has been disabled */
static boolean_t is_mrouter_off(ip_stack_t *);
/*
 * Encapsulation packets
 */

/* TTL placed in the prototype outer header below */
#define	ENCAP_TTL	64

/*
 * prototype IP hdr for encapsulated packets.
 * Fields not listed in the initializer are zero-initialized.
 */
static ipha_t multicast_encap_iphdr = {
	IP_SIMPLE_HDR_VERSION,
	0,				/* tos */
	sizeof (ipha_t),		/* total length */
	0,				/* id */
	0,				/* frag offset */
	ENCAP_TTL, IPPROTO_ENCAP,	/* ttl, protocol */
	0,				/* checksum */
};

/*
 * Rate limit for assert notification messages, in nsec.
 * (3000000000 nsec == 3 seconds.)
 */
#define	ASSERT_MSG_TIME		3000000000
233
234
/*
 * vif reference counting.
 *
 * VIF_REFHOLD takes v_lock, bumps v_refcnt and drops the lock.
 *
 * VIF_REFRELE_LOCKED expects v_lock to be held by the caller.  It drops one
 * reference; if that was the last one and the vif is condemned the vif is
 * handed to del_vifp() (which is entered with v_lock still held), otherwise
 * v_lock is simply released.
 *
 * VIF_REFRELE is the same operation for callers that do not hold v_lock.
 */
#define	VIF_REFHOLD(vifp) {				\
	mutex_enter(&(vifp)->v_lock);			\
	++(vifp)->v_refcnt;				\
	mutex_exit(&(vifp)->v_lock);			\
}

#define	VIF_REFRELE_LOCKED(vifp) {				\
	if (--(vifp)->v_refcnt != 0 ||				\
	    !((vifp)->v_marks & VIF_MARK_CONDEMNED)) {		\
		mutex_exit(&(vifp)->v_lock);			\
	} else {						\
		del_vifp(vifp);					\
	}							\
}

#define	VIF_REFRELE(vifp) {				\
	mutex_enter(&(vifp)->v_lock);			\
	VIF_REFRELE_LOCKED(vifp);			\
}
261
/*
 * mfc hash bucket walker reference counting, done under mfcb_lock.
 *
 * MFCB_REFHOLD registers a walker on the bucket.
 * MFCB_REFRELE removes a walker; when the last walker leaves a bucket that
 * is marked MFCB_MARK_CONDEMNED, release_mfc() is called with mfcb_lock held.
 */
#define	MFCB_REFHOLD(mfcb) {				\
	mutex_enter(&(mfcb)->mfcb_lock);		\
	++(mfcb)->mfcb_refcnt;				\
	ASSERT((mfcb)->mfcb_refcnt != 0);		\
	mutex_exit(&(mfcb)->mfcb_lock);			\
}

#define	MFCB_REFRELE(mfcb) {					\
	mutex_enter(&(mfcb)->mfcb_lock);			\
	ASSERT((mfcb)->mfcb_refcnt != 0);			\
	(mfcb)->mfcb_refcnt--;					\
	if ((mfcb)->mfcb_refcnt == 0 &&				\
	    ((mfcb)->mfcb_marks & MFCB_MARK_CONDEMNED)) {	\
		release_mfc(mfcb);				\
	}							\
	mutex_exit(&(mfcb)->mfcb_lock);				\
}
278
/*
 * MFCFIND:
 * Find a route for a given origin IP address and multicast group address.
 * Skip entries with pending upcalls (mfc_rte != NULL) and entries that have
 * been marked condemned.
 * Type of service parameter to be added in the future!
 *
 *	mfcbp	pointer to the hash bucket (struct mfcb) to search
 *	o, g	origin and group addresses, compared against s_addr
 *	rt	lvalue that receives the matching struct mfc *, or NULL
 *
 * Every macro argument is parenthesized so that arbitrary expressions
 * (e.g. "&bucket" or a conditional expression) can be passed safely.
 * The caller is expected to hold a reference on the bucket (MFCB_REFHOLD).
 */
#define	MFCFIND(mfcbp, o, g, rt) {					\
	struct mfc *_mb_rt = NULL;					\
	(rt) = NULL;							\
	_mb_rt = (mfcbp)->mfcb_mfc;					\
	while (_mb_rt) {						\
		if ((_mb_rt->mfc_origin.s_addr == (o)) &&		\
		    (_mb_rt->mfc_mcastgrp.s_addr == (g)) &&		\
		    (_mb_rt->mfc_rte == NULL) &&			\
		    (!(_mb_rt->mfc_marks & MFCB_MARK_CONDEMNED))) {	\
			(rt) = _mb_rt;					\
			break;						\
		}							\
		_mb_rt = _mb_rt->mfc_next;				\
	}								\
}
300
301 /*
302 * BSD uses timeval with sec and usec. In SunOS 5.x uniqtime() and gethrtime()
303 * are inefficient. We use gethrestime() which returns a timespec_t with
304 * sec and nsec, the resolution is machine dependent.
305 * The following 2 macros have been changed to use nsec instead of usec.
306 */
/*
 * Macros to compute elapsed time efficiently.
 * Borrowed from Van Jacobson's scheduling code.
 * Delta should be a hrtime_t.
 *
 * TV_DELTA(a, b, delta) stores (a - b) in nanoseconds into delta, where
 * a and b have tv_sec/tv_nsec members.  The common cases of a one or two
 * second difference are handled by additions only.
 *
 * The seconds difference is kept in a 64-bit variable and the general case
 * multiplies in 64 bits: with a plain int, any difference of three seconds
 * or more overflowed the signed 32-bit product 1000000000 * xxs.
 */
#define	TV_DELTA(a, b, delta) {					\
	long long xxs;						\
								\
	delta = (a).tv_nsec - (b).tv_nsec;			\
	if ((xxs = (a).tv_sec - (b).tv_sec) != 0) {		\
		switch (xxs) {					\
		case 2:						\
			delta += 1000000000;			\
			/*FALLTHROUGH*/				\
		case 1:						\
			delta += 1000000000;			\
			break;					\
		default:					\
			delta += (1000000000LL * xxs);		\
		}						\
	}							\
}
329
/*
 * TV_LT(a, b): true when time a is strictly earlier than time b.
 * Seconds are compared first; nanoseconds only break a tie.
 */
#define	TV_LT(a, b) ((a).tv_sec < (b).tv_sec ||			\
	((a).tv_sec == (b).tv_sec && (a).tv_nsec < (b).tv_nsec))
332
333 /*
334 * Handle MRT setsockopt commands to modify the multicast routing tables.
335 */
336 int
ip_mrouter_set(int cmd,conn_t * connp,int checkonly,uchar_t * data,int datalen)337 ip_mrouter_set(int cmd, conn_t *connp, int checkonly, uchar_t *data,
338 int datalen)
339 {
340 ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
341
342 mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
343 if (cmd != MRT_INIT && connp != ipst->ips_ip_g_mrouter) {
344 mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
345 return (EACCES);
346 }
347 mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
348
349 if (checkonly) {
350 /*
351 * do not do operation, just pretend to - new T_CHECK
352 * Note: Even routines further on can probably fail but
353 * this T_CHECK stuff is only to please XTI so it not
354 * necessary to be perfect.
355 */
356 switch (cmd) {
357 case MRT_INIT:
358 case MRT_DONE:
359 case MRT_ADD_VIF:
360 case MRT_DEL_VIF:
361 case MRT_ADD_MFC:
362 case MRT_DEL_MFC:
363 case MRT_ASSERT:
364 return (0);
365 default:
366 return (EOPNOTSUPP);
367 }
368 }
369
370 /*
371 * make sure no command is issued after multicast routing has been
372 * turned off.
373 */
374 if (cmd != MRT_INIT && cmd != MRT_DONE) {
375 if (is_mrouter_off(ipst))
376 return (EINVAL);
377 }
378
379 switch (cmd) {
380 case MRT_INIT: return (ip_mrouter_init(connp, data, datalen, ipst));
381 case MRT_DONE: return (ip_mrouter_done(ipst));
382 case MRT_ADD_VIF: return (add_vif((struct vifctl *)data, connp, ipst));
383 case MRT_DEL_VIF: return (del_vif((vifi_t *)data, ipst));
384 case MRT_ADD_MFC: return (add_mfc((struct mfcctl *)data, ipst));
385 case MRT_DEL_MFC: return (del_mfc((struct mfcctl *)data, ipst));
386 case MRT_ASSERT: return (set_assert((int *)data, ipst));
387 default: return (EOPNOTSUPP);
388 }
389 }
390
391 /*
392 * Handle MRT getsockopt commands
393 */
394 int
ip_mrouter_get(int cmd,conn_t * connp,uchar_t * data)395 ip_mrouter_get(int cmd, conn_t *connp, uchar_t *data)
396 {
397 ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
398
399 if (connp != ipst->ips_ip_g_mrouter)
400 return (EACCES);
401
402 switch (cmd) {
403 case MRT_VERSION: return (get_version((uchar_t *)data));
404 case MRT_ASSERT: return (get_assert((uchar_t *)data, ipst));
405 default: return (EOPNOTSUPP);
406 }
407 }
408
409 /*
410 * Handle ioctl commands to obtain information from the cache.
411 * Called with shared access to IP. These are read_only ioctls.
412 */
413 /* ARGSUSED */
414 int
mrt_ioctl(ipif_t * ipif,sin_t * sin,queue_t * q,mblk_t * mp,ip_ioctl_cmd_t * ipip,void * if_req)415 mrt_ioctl(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
416 ip_ioctl_cmd_t *ipip, void *if_req)
417 {
418 mblk_t *mp1;
419 struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
420 conn_t *connp = Q_TO_CONN(q);
421 ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
422
423 /* Existence verified in ip_wput_nondata */
424 mp1 = mp->b_cont->b_cont;
425
426 switch (iocp->ioc_cmd) {
427 case (SIOCGETVIFCNT):
428 return (get_vif_cnt((struct sioc_vif_req *)mp1->b_rptr, ipst));
429 case (SIOCGETSGCNT):
430 return (get_sg_cnt((struct sioc_sg_req *)mp1->b_rptr, ipst));
431 case (SIOCGETLSGCNT):
432 return (get_lsg_cnt((struct sioc_lsg_req *)mp1->b_rptr, ipst));
433 default:
434 return (EINVAL);
435 }
436 }
437
438 /*
439 * Returns the packet, byte, rpf-failure count for the source, group provided.
440 */
441 static int
get_sg_cnt(struct sioc_sg_req * req,ip_stack_t * ipst)442 get_sg_cnt(struct sioc_sg_req *req, ip_stack_t *ipst)
443 {
444 struct mfc *rt;
445 struct mfcb *mfcbp;
446
447 mfcbp = &ipst->ips_mfcs[MFCHASH(req->src.s_addr, req->grp.s_addr)];
448 MFCB_REFHOLD(mfcbp);
449 MFCFIND(mfcbp, req->src.s_addr, req->grp.s_addr, rt);
450
451 if (rt != NULL) {
452 mutex_enter(&rt->mfc_mutex);
453 req->pktcnt = rt->mfc_pkt_cnt;
454 req->bytecnt = rt->mfc_byte_cnt;
455 req->wrong_if = rt->mfc_wrong_if;
456 mutex_exit(&rt->mfc_mutex);
457 } else
458 req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffffU;
459
460 MFCB_REFRELE(mfcbp);
461 return (0);
462 }
463
464 /*
465 * Returns the packet, byte, rpf-failure count for the source, group provided.
466 * Uses larger counters and IPv6 addresses.
467 */
468 /* ARGSUSED XXX until implemented */
469 static int
get_lsg_cnt(struct sioc_lsg_req * req,ip_stack_t * ipst)470 get_lsg_cnt(struct sioc_lsg_req *req, ip_stack_t *ipst)
471 {
472 /* XXX TODO SIOCGETLSGCNT */
473 return (ENXIO);
474 }
475
476 /*
477 * Returns the input and output packet and byte counts on the vif provided.
478 */
479 static int
get_vif_cnt(struct sioc_vif_req * req,ip_stack_t * ipst)480 get_vif_cnt(struct sioc_vif_req *req, ip_stack_t *ipst)
481 {
482 vifi_t vifi = req->vifi;
483
484 if (vifi >= ipst->ips_numvifs)
485 return (EINVAL);
486
487 /*
488 * No locks here, an approximation is fine.
489 */
490 req->icount = ipst->ips_vifs[vifi].v_pkt_in;
491 req->ocount = ipst->ips_vifs[vifi].v_pkt_out;
492 req->ibytes = ipst->ips_vifs[vifi].v_bytes_in;
493 req->obytes = ipst->ips_vifs[vifi].v_bytes_out;
494
495 return (0);
496 }
497
498 static int
get_version(uchar_t * data)499 get_version(uchar_t *data)
500 {
501 int *v = (int *)data;
502
503 *v = 0x0305; /* XXX !!!! */
504
505 return (0);
506 }
507
508 /*
509 * Set PIM assert processing global.
510 */
511 static int
set_assert(int * i,ip_stack_t * ipst)512 set_assert(int *i, ip_stack_t *ipst)
513 {
514 if ((*i != 1) && (*i != 0))
515 return (EINVAL);
516
517 ipst->ips_pim_assert = *i;
518
519 return (0);
520 }
521
522 /*
523 * Get PIM assert processing global.
524 */
525 static int
get_assert(uchar_t * data,ip_stack_t * ipst)526 get_assert(uchar_t *data, ip_stack_t *ipst)
527 {
528 int *i = (int *)data;
529
530 *i = ipst->ips_pim_assert;
531
532 return (0);
533 }
534
535 /*
536 * Enable multicast routing.
537 */
538 static int
ip_mrouter_init(conn_t * connp,uchar_t * data,int datalen,ip_stack_t * ipst)539 ip_mrouter_init(conn_t *connp, uchar_t *data, int datalen, ip_stack_t *ipst)
540 {
541 int *v;
542
543 if (data == NULL || (datalen != sizeof (int)))
544 return (ENOPROTOOPT);
545
546 v = (int *)data;
547 if (*v != 1)
548 return (ENOPROTOOPT);
549
550 mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
551 if (ipst->ips_ip_g_mrouter != NULL) {
552 mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
553 return (EADDRINUSE);
554 }
555
556 /*
557 * MRT_INIT should only be allowed for RAW sockets, but we double
558 * check.
559 */
560 if (!IPCL_IS_RAWIP(connp)) {
561 mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
562 return (EINVAL);
563 }
564
565 ipst->ips_ip_g_mrouter = connp;
566 connp->conn_multi_router = 1;
567 /* In order for tunnels to work we have to turn ip_g_forward on */
568 if (!WE_ARE_FORWARDING(ipst)) {
569 if (ipst->ips_ip_mrtdebug > 1) {
570 (void) mi_strlog(connp->conn_rq, 1, SL_TRACE,
571 "ip_mrouter_init: turning on forwarding");
572 }
573 ipst->ips_saved_ip_forwarding = ipst->ips_ip_forwarding;
574 ipst->ips_ip_forwarding = IP_FORWARD_ALWAYS;
575 }
576
577 mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
578 return (0);
579 }
580
581 void
ip_mrouter_stack_init(ip_stack_t * ipst)582 ip_mrouter_stack_init(ip_stack_t *ipst)
583 {
584 mutex_init(&ipst->ips_ip_g_mrouter_mutex, NULL, MUTEX_DEFAULT, NULL);
585
586 ipst->ips_vifs = kmem_zalloc(sizeof (struct vif) * (MAXVIFS+1),
587 KM_SLEEP);
588 ipst->ips_mrtstat = kmem_zalloc(sizeof (struct mrtstat), KM_SLEEP);
589 /*
590 * mfctable:
591 * Includes all mfcs, including waiting upcalls.
592 * Multiple mfcs per bucket.
593 */
594 ipst->ips_mfcs = kmem_zalloc(sizeof (struct mfcb) * MFCTBLSIZ,
595 KM_SLEEP);
596 /*
597 * Define the token bucket filter structures.
598 * tbftable -> each vif has one of these for storing info.
599 */
600 ipst->ips_tbfs = kmem_zalloc(sizeof (struct tbf) * MAXVIFS, KM_SLEEP);
601
602 mutex_init(&ipst->ips_last_encap_lock, NULL, MUTEX_DEFAULT, NULL);
603
604 ipst->ips_mrtstat->mrts_vifctlSize = sizeof (struct vifctl);
605 ipst->ips_mrtstat->mrts_mfcctlSize = sizeof (struct mfcctl);
606 }
607
608 /*
609 * Disable multicast routing.
610 * Didn't use global timeout_val (BSD version), instead check the mfctable.
611 */
612 int
ip_mrouter_done(ip_stack_t * ipst)613 ip_mrouter_done(ip_stack_t *ipst)
614 {
615 conn_t *mrouter;
616 vifi_t vifi;
617 struct mfc *mfc_rt;
618 int i;
619
620 mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
621 if (ipst->ips_ip_g_mrouter == NULL) {
622 mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
623 return (EINVAL);
624 }
625
626 mrouter = ipst->ips_ip_g_mrouter;
627
628 if (ipst->ips_saved_ip_forwarding != -1) {
629 if (ipst->ips_ip_mrtdebug > 1) {
630 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
631 "ip_mrouter_done: turning off forwarding");
632 }
633 ipst->ips_ip_forwarding = ipst->ips_saved_ip_forwarding;
634 ipst->ips_saved_ip_forwarding = -1;
635 }
636
637 /*
638 * Always clear cache when vifs change.
639 * No need to get ipst->ips_last_encap_lock since we are running as
640 * a writer.
641 */
642 mutex_enter(&ipst->ips_last_encap_lock);
643 ipst->ips_last_encap_src = 0;
644 ipst->ips_last_encap_vif = NULL;
645 mutex_exit(&ipst->ips_last_encap_lock);
646 mrouter->conn_multi_router = 0;
647
648 mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
649
650 /*
651 * For each phyint in use,
652 * disable promiscuous reception of all IP multicasts.
653 */
654 for (vifi = 0; vifi < MAXVIFS; vifi++) {
655 struct vif *vifp = ipst->ips_vifs + vifi;
656
657 mutex_enter(&vifp->v_lock);
658 /*
659 * if the vif is active mark it condemned.
660 */
661 if (vifp->v_marks & VIF_MARK_GOOD) {
662 ASSERT(vifp->v_ipif != NULL);
663 ipif_refhold(vifp->v_ipif);
664 /* Phyint only */
665 if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
666 ipif_t *ipif = vifp->v_ipif;
667 ilm_t *ilm = vifp->v_ilm;
668
669 vifp->v_ilm = NULL;
670 vifp->v_marks &= ~VIF_MARK_GOOD;
671 vifp->v_marks |= VIF_MARK_CONDEMNED;
672
673 mutex_exit(&(vifp)->v_lock);
674 if (ilm != NULL) {
675 ill_t *ill = ipif->ipif_ill;
676
677 (void) ip_delmulti(ilm);
678 ASSERT(ill->ill_mrouter_cnt > 0);
679 atomic_dec_32(&ill->ill_mrouter_cnt);
680 }
681 mutex_enter(&vifp->v_lock);
682 }
683 ipif_refrele(vifp->v_ipif);
684 /*
685 * decreases the refcnt added in add_vif.
686 * and release v_lock.
687 */
688 VIF_REFRELE_LOCKED(vifp);
689 } else {
690 mutex_exit(&vifp->v_lock);
691 continue;
692 }
693 }
694
695 mutex_enter(&ipst->ips_numvifs_mutex);
696 ipst->ips_numvifs = 0;
697 ipst->ips_pim_assert = 0;
698 ipst->ips_reg_vif_num = ALL_VIFS;
699 mutex_exit(&ipst->ips_numvifs_mutex);
700
701 /*
702 * Free upcall msgs.
703 * Go through mfctable and stop any outstanding upcall
704 * timeouts remaining on mfcs.
705 */
706 for (i = 0; i < MFCTBLSIZ; i++) {
707 mutex_enter(&ipst->ips_mfcs[i].mfcb_lock);
708 ipst->ips_mfcs[i].mfcb_refcnt++;
709 ipst->ips_mfcs[i].mfcb_marks |= MFCB_MARK_CONDEMNED;
710 mutex_exit(&ipst->ips_mfcs[i].mfcb_lock);
711 mfc_rt = ipst->ips_mfcs[i].mfcb_mfc;
712 while (mfc_rt) {
713 /* Free upcalls */
714 mutex_enter(&mfc_rt->mfc_mutex);
715 if (mfc_rt->mfc_rte != NULL) {
716 if (mfc_rt->mfc_timeout_id != 0) {
717 /*
718 * OK to drop the lock as we have
719 * a refcnt on the bucket. timeout
720 * can fire but it will see that
721 * mfc_timeout_id == 0 and not do
722 * anything. see expire_upcalls().
723 */
724 mfc_rt->mfc_timeout_id = 0;
725 mutex_exit(&mfc_rt->mfc_mutex);
726 (void) untimeout(
727 mfc_rt->mfc_timeout_id);
728 mfc_rt->mfc_timeout_id = 0;
729 mutex_enter(&mfc_rt->mfc_mutex);
730
731 /*
732 * all queued upcall packets
733 * and mblk will be freed in
734 * release_mfc().
735 */
736 }
737 }
738
739 mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED;
740
741 mutex_exit(&mfc_rt->mfc_mutex);
742 mfc_rt = mfc_rt->mfc_next;
743 }
744 MFCB_REFRELE(&ipst->ips_mfcs[i]);
745 }
746
747 mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
748 ipst->ips_ip_g_mrouter = NULL;
749 mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
750 return (0);
751 }
752
753 void
ip_mrouter_stack_destroy(ip_stack_t * ipst)754 ip_mrouter_stack_destroy(ip_stack_t *ipst)
755 {
756 struct mfcb *mfcbp;
757 struct mfc *rt;
758 int i;
759
760 for (i = 0; i < MFCTBLSIZ; i++) {
761 mfcbp = &ipst->ips_mfcs[i];
762
763 while ((rt = mfcbp->mfcb_mfc) != NULL) {
764 (void) printf("ip_mrouter_stack_destroy: free for %d\n",
765 i);
766
767 mfcbp->mfcb_mfc = rt->mfc_next;
768 free_queue(rt);
769 mi_free(rt);
770 }
771 }
772 kmem_free(ipst->ips_vifs, sizeof (struct vif) * (MAXVIFS+1));
773 ipst->ips_vifs = NULL;
774 kmem_free(ipst->ips_mrtstat, sizeof (struct mrtstat));
775 ipst->ips_mrtstat = NULL;
776 kmem_free(ipst->ips_mfcs, sizeof (struct mfcb) * MFCTBLSIZ);
777 ipst->ips_mfcs = NULL;
778 kmem_free(ipst->ips_tbfs, sizeof (struct tbf) * MAXVIFS);
779 ipst->ips_tbfs = NULL;
780
781 mutex_destroy(&ipst->ips_last_encap_lock);
782 mutex_destroy(&ipst->ips_ip_g_mrouter_mutex);
783 }
784
785 static boolean_t
is_mrouter_off(ip_stack_t * ipst)786 is_mrouter_off(ip_stack_t *ipst)
787 {
788 conn_t *mrouter;
789
790 mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
791 if (ipst->ips_ip_g_mrouter == NULL) {
792 mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
793 return (B_TRUE);
794 }
795
796 mrouter = ipst->ips_ip_g_mrouter;
797 if (mrouter->conn_multi_router == 0) {
798 mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
799 return (B_TRUE);
800 }
801 mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
802 return (B_FALSE);
803 }
804
805 static void
unlock_good_vif(struct vif * vifp)806 unlock_good_vif(struct vif *vifp)
807 {
808 ASSERT(vifp->v_ipif != NULL);
809 ipif_refrele(vifp->v_ipif);
810 VIF_REFRELE(vifp);
811 }
812
813 static boolean_t
lock_good_vif(struct vif * vifp)814 lock_good_vif(struct vif *vifp)
815 {
816 mutex_enter(&vifp->v_lock);
817 if (!(vifp->v_marks & VIF_MARK_GOOD)) {
818 mutex_exit(&vifp->v_lock);
819 return (B_FALSE);
820 }
821
822 ASSERT(vifp->v_ipif != NULL);
823 mutex_enter(&vifp->v_ipif->ipif_ill->ill_lock);
824 if (!IPIF_CAN_LOOKUP(vifp->v_ipif)) {
825 mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock);
826 mutex_exit(&vifp->v_lock);
827 return (B_FALSE);
828 }
829 ipif_refhold_locked(vifp->v_ipif);
830 mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock);
831 vifp->v_refcnt++;
832 mutex_exit(&vifp->v_lock);
833 return (B_TRUE);
834 }
835
836 /*
837 * Add a vif to the vif table.
838 */
839 static int
add_vif(struct vifctl * vifcp,conn_t * connp,ip_stack_t * ipst)840 add_vif(struct vifctl *vifcp, conn_t *connp, ip_stack_t *ipst)
841 {
842 struct vif *vifp = ipst->ips_vifs + vifcp->vifc_vifi;
843 ipif_t *ipif;
844 int error = 0;
845 struct tbf *v_tbf = ipst->ips_tbfs + vifcp->vifc_vifi;
846 conn_t *mrouter = ipst->ips_ip_g_mrouter;
847 ilm_t *ilm;
848 ill_t *ill;
849
850 ASSERT(connp != NULL);
851
852 if (vifcp->vifc_vifi >= MAXVIFS)
853 return (EINVAL);
854
855 if (is_mrouter_off(ipst))
856 return (EINVAL);
857
858 mutex_enter(&vifp->v_lock);
859 /*
860 * Viftable entry should be 0.
861 * if v_marks == 0 but v_refcnt != 0 means struct is being
862 * initialized.
863 *
864 * Also note that it is very unlikely that we will get a MRT_ADD_VIF
865 * request while the delete is in progress, mrouted only sends add
866 * requests when a new interface is added and the new interface cannot
867 * have the same vifi as an existing interface. We make sure that
868 * ill_delete will block till the vif is deleted by adding a refcnt
869 * to ipif in del_vif().
870 */
871 if (vifp->v_lcl_addr.s_addr != 0 ||
872 vifp->v_marks != 0 ||
873 vifp->v_refcnt != 0) {
874 mutex_exit(&vifp->v_lock);
875 return (EADDRINUSE);
876 }
877
878 /* Incoming vif should not be 0 */
879 if (vifcp->vifc_lcl_addr.s_addr == 0) {
880 mutex_exit(&vifp->v_lock);
881 return (EINVAL);
882 }
883
884 vifp->v_refcnt++;
885 mutex_exit(&vifp->v_lock);
886 /* Find the interface with the local address */
887 ipif = ipif_lookup_addr((ipaddr_t)vifcp->vifc_lcl_addr.s_addr, NULL,
888 IPCL_ZONEID(connp), ipst);
889 if (ipif == NULL) {
890 VIF_REFRELE(vifp);
891 return (EADDRNOTAVAIL);
892 }
893
894 if (ipst->ips_ip_mrtdebug > 1) {
895 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
896 "add_vif: src 0x%x enter",
897 vifcp->vifc_lcl_addr.s_addr);
898 }
899
900 mutex_enter(&vifp->v_lock);
901 /*
902 * Always clear cache when vifs change.
903 * Needed to ensure that src isn't left over from before vif was added.
904 * No need to get last_encap_lock, since we are running as a writer.
905 */
906
907 mutex_enter(&ipst->ips_last_encap_lock);
908 ipst->ips_last_encap_src = 0;
909 ipst->ips_last_encap_vif = NULL;
910 mutex_exit(&ipst->ips_last_encap_lock);
911
912 if (vifcp->vifc_flags & VIFF_TUNNEL) {
913 if ((vifcp->vifc_flags & VIFF_SRCRT) != 0) {
914 cmn_err(CE_WARN,
915 "add_vif: source route tunnels not supported\n");
916 VIF_REFRELE_LOCKED(vifp);
917 ipif_refrele(ipif);
918 return (EOPNOTSUPP);
919 }
920 vifp->v_rmt_addr = vifcp->vifc_rmt_addr;
921
922 } else {
923 /* Phyint or Register vif */
924 if (vifcp->vifc_flags & VIFF_REGISTER) {
925 /*
926 * Note: Since all IPPROTO_IP level options (including
927 * MRT_ADD_VIF) are done exclusively via
928 * ip_optmgmt_writer(), a lock is not necessary to
929 * protect reg_vif_num.
930 */
931 mutex_enter(&ipst->ips_numvifs_mutex);
932 if (ipst->ips_reg_vif_num == ALL_VIFS) {
933 ipst->ips_reg_vif_num = vifcp->vifc_vifi;
934 mutex_exit(&ipst->ips_numvifs_mutex);
935 } else {
936 mutex_exit(&ipst->ips_numvifs_mutex);
937 VIF_REFRELE_LOCKED(vifp);
938 ipif_refrele(ipif);
939 return (EADDRINUSE);
940 }
941 }
942
943 /* Make sure the interface supports multicast */
944 if ((ipif->ipif_ill->ill_flags & ILLF_MULTICAST) == 0) {
945 VIF_REFRELE_LOCKED(vifp);
946 ipif_refrele(ipif);
947 if (vifcp->vifc_flags & VIFF_REGISTER) {
948 mutex_enter(&ipst->ips_numvifs_mutex);
949 ipst->ips_reg_vif_num = ALL_VIFS;
950 mutex_exit(&ipst->ips_numvifs_mutex);
951 }
952 return (EOPNOTSUPP);
953 }
954 /* Enable promiscuous reception of all IP mcasts from the if */
955 mutex_exit(&vifp->v_lock);
956
957 ill = ipif->ipif_ill;
958 if (IS_UNDER_IPMP(ill))
959 ill = ipmp_ill_hold_ipmp_ill(ill);
960
961 if (ill == NULL) {
962 ilm = NULL;
963 } else {
964 ilm = ip_addmulti(&ipv6_all_zeros, ill,
965 ipif->ipif_zoneid, &error);
966 if (ilm != NULL)
967 atomic_inc_32(&ill->ill_mrouter_cnt);
968 if (IS_UNDER_IPMP(ipif->ipif_ill)) {
969 ill_refrele(ill);
970 ill = ipif->ipif_ill;
971 }
972 }
973
974 mutex_enter(&vifp->v_lock);
975 /*
976 * since we released the lock lets make sure that
977 * ip_mrouter_done() has not been called.
978 */
979 if (ilm == NULL || is_mrouter_off(ipst)) {
980 if (ilm != NULL) {
981 (void) ip_delmulti(ilm);
982 ASSERT(ill->ill_mrouter_cnt > 0);
983 atomic_dec_32(&ill->ill_mrouter_cnt);
984 }
985 if (vifcp->vifc_flags & VIFF_REGISTER) {
986 mutex_enter(&ipst->ips_numvifs_mutex);
987 ipst->ips_reg_vif_num = ALL_VIFS;
988 mutex_exit(&ipst->ips_numvifs_mutex);
989 }
990 VIF_REFRELE_LOCKED(vifp);
991 ipif_refrele(ipif);
992 return (error?error:EINVAL);
993 }
994 vifp->v_ilm = ilm;
995 }
996 /* Define parameters for the tbf structure */
997 vifp->v_tbf = v_tbf;
998 gethrestime(&vifp->v_tbf->tbf_last_pkt_t);
999 vifp->v_tbf->tbf_n_tok = 0;
1000 vifp->v_tbf->tbf_q_len = 0;
1001 vifp->v_tbf->tbf_max_q_len = MAXQSIZE;
1002 vifp->v_tbf->tbf_q = vifp->v_tbf->tbf_t = NULL;
1003
1004 vifp->v_flags = vifcp->vifc_flags;
1005 vifp->v_threshold = vifcp->vifc_threshold;
1006 vifp->v_lcl_addr = vifcp->vifc_lcl_addr;
1007 vifp->v_ipif = ipif;
1008 ipif_refrele(ipif);
1009 /* Scaling up here, allows division by 1024 in critical code. */
1010 vifp->v_rate_limit = vifcp->vifc_rate_limit * (1024/1000);
1011 vifp->v_timeout_id = 0;
1012 /* initialize per vif pkt counters */
1013 vifp->v_pkt_in = 0;
1014 vifp->v_pkt_out = 0;
1015 vifp->v_bytes_in = 0;
1016 vifp->v_bytes_out = 0;
1017 mutex_init(&vifp->v_tbf->tbf_lock, NULL, MUTEX_DEFAULT, NULL);
1018
1019 /* Adjust numvifs up, if the vifi is higher than numvifs */
1020 mutex_enter(&ipst->ips_numvifs_mutex);
1021 if (ipst->ips_numvifs <= vifcp->vifc_vifi)
1022 ipst->ips_numvifs = vifcp->vifc_vifi + 1;
1023 mutex_exit(&ipst->ips_numvifs_mutex);
1024
1025 if (ipst->ips_ip_mrtdebug > 1) {
1026 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1027 "add_vif: #%d, lcladdr %x, %s %x, thresh %x, rate %d",
1028 vifcp->vifc_vifi,
1029 ntohl(vifcp->vifc_lcl_addr.s_addr),
1030 (vifcp->vifc_flags & VIFF_TUNNEL) ? "rmtaddr" : "mask",
1031 ntohl(vifcp->vifc_rmt_addr.s_addr),
1032 vifcp->vifc_threshold, vifcp->vifc_rate_limit);
1033 }
1034
1035 vifp->v_marks = VIF_MARK_GOOD;
1036 mutex_exit(&vifp->v_lock);
1037 return (0);
1038 }
1039
1040
1041 /* Delete a vif from the vif table. */
1042 static void
del_vifp(struct vif * vifp)1043 del_vifp(struct vif *vifp)
1044 {
1045 struct tbf *t = vifp->v_tbf;
1046 mblk_t *mp0;
1047 vifi_t vifi;
1048 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
1049 conn_t *mrouter = ipst->ips_ip_g_mrouter;
1050
1051 ASSERT(vifp->v_marks & VIF_MARK_CONDEMNED);
1052 ASSERT(t != NULL);
1053
1054 if (ipst->ips_ip_mrtdebug > 1) {
1055 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1056 "del_vif: src 0x%x\n", vifp->v_lcl_addr.s_addr);
1057 }
1058
1059 if (vifp->v_timeout_id != 0) {
1060 (void) untimeout(vifp->v_timeout_id);
1061 vifp->v_timeout_id = 0;
1062 }
1063
1064 /*
1065 * Free packets queued at the interface.
1066 * Mrouted takes care of cleaning up mfcs - makes calls to del_mfc.
1067 */
1068 mutex_enter(&t->tbf_lock);
1069 while (t->tbf_q != NULL) {
1070 mp0 = t->tbf_q;
1071 t->tbf_q = t->tbf_q->b_next;
1072 mp0->b_prev = mp0->b_next = NULL;
1073 freemsg(mp0);
1074 }
1075 mutex_exit(&t->tbf_lock);
1076
1077 /*
1078 * Always clear cache when vifs change.
1079 * No need to get last_encap_lock since we are running as a writer.
1080 */
1081 mutex_enter(&ipst->ips_last_encap_lock);
1082 if (vifp == ipst->ips_last_encap_vif) {
1083 ipst->ips_last_encap_vif = NULL;
1084 ipst->ips_last_encap_src = 0;
1085 }
1086 mutex_exit(&ipst->ips_last_encap_lock);
1087
1088 mutex_destroy(&t->tbf_lock);
1089
1090 bzero(vifp->v_tbf, sizeof (*(vifp->v_tbf)));
1091
1092 /* Adjust numvifs down */
1093 mutex_enter(&ipst->ips_numvifs_mutex);
1094 for (vifi = ipst->ips_numvifs; vifi != 0; vifi--) /* vifi is unsigned */
1095 if (ipst->ips_vifs[vifi - 1].v_lcl_addr.s_addr != 0)
1096 break;
1097 ipst->ips_numvifs = vifi;
1098 mutex_exit(&ipst->ips_numvifs_mutex);
1099
1100 bzero(vifp, sizeof (*vifp));
1101 }
1102
1103 static int
del_vif(vifi_t * vifip,ip_stack_t * ipst)1104 del_vif(vifi_t *vifip, ip_stack_t *ipst)
1105 {
1106 struct vif *vifp = ipst->ips_vifs + *vifip;
1107
1108 if (*vifip >= ipst->ips_numvifs)
1109 return (EINVAL);
1110
1111 mutex_enter(&vifp->v_lock);
1112 /*
1113 * Not initialized
1114 * Here we are not looking at the vif that is being initialized
1115 * i.e vifp->v_marks == 0 and refcnt > 0.
1116 */
1117 if (vifp->v_lcl_addr.s_addr == 0 ||
1118 !(vifp->v_marks & VIF_MARK_GOOD)) {
1119 mutex_exit(&vifp->v_lock);
1120 return (EADDRNOTAVAIL);
1121 }
1122
1123 /* Clear VIF_MARK_GOOD and set VIF_MARK_CONDEMNED. */
1124 vifp->v_marks &= ~VIF_MARK_GOOD;
1125 vifp->v_marks |= VIF_MARK_CONDEMNED;
1126
1127 /* Phyint only */
1128 if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
1129 ipif_t *ipif = vifp->v_ipif;
1130 ilm_t *ilm = vifp->v_ilm;
1131
1132 vifp->v_ilm = NULL;
1133
1134 ASSERT(ipif != NULL);
1135 /*
1136 * should be OK to drop the lock as we
1137 * have marked this as CONDEMNED.
1138 */
1139 mutex_exit(&(vifp)->v_lock);
1140 if (ilm != NULL) {
1141 (void) ip_delmulti(ilm);
1142 ASSERT(ipif->ipif_ill->ill_mrouter_cnt > 0);
1143 atomic_dec_32(&ipif->ipif_ill->ill_mrouter_cnt);
1144 }
1145 mutex_enter(&(vifp)->v_lock);
1146 }
1147
1148 if (vifp->v_flags & VIFF_REGISTER) {
1149 mutex_enter(&ipst->ips_numvifs_mutex);
1150 ipst->ips_reg_vif_num = ALL_VIFS;
1151 mutex_exit(&ipst->ips_numvifs_mutex);
1152 }
1153
1154 /*
1155 * decreases the refcnt added in add_vif.
1156 */
1157 VIF_REFRELE_LOCKED(vifp);
1158 return (0);
1159 }
1160
/*
 * Add an mfc entry, or update the one that already matches the
 * (origin, group) pair in mfccp.
 *
 * Three cases are handled in order:
 *  - MFCFIND locates an entry: only its parent and ttls are refreshed.
 *  - One or more entries with queued upcall packets (mfc_rte != NULL)
 *    match: each is filled in, its expiry timeout is cancelled and its
 *    queued packets are pushed through ip_mdq().
 *  - Neither of the above: under the bucket lock, a matching entry that is
 *    not condemned is refilled, or a new entry is allocated and linked at
 *    the head of the hash chain.
 *
 * Returns 0 on success, EINVAL if mfcc_parent is out of range, if the
 * parent vif has no ipif, or if the mrouter is off, and ENOBUFS if a new
 * entry cannot be allocated.
 */
static int
add_mfc(struct mfcctl *mfccp, ip_stack_t *ipst)
{
	struct mfc *rt;
	struct rtdetq *rte;
	ushort_t nstl;
	int i;
	struct mfcb *mfcbp;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	/*
	 * The value of vifi is NO_VIF (==MAXVIFS) if Mrouted
	 * did not have a real route for pkt.
	 * We want this pkt without rt installed in the mfctable to prevent
	 * multiple tries, so go ahead and put it in mfctable, it will
	 * be discarded later in ip_mdq() because the child is NULL.
	 */

	/* Error checking, out of bounds? (MAXVIFS itself is allowed: NO_VIF) */
	if (mfccp->mfcc_parent > MAXVIFS) {
		ip0dbg(("ADD_MFC: mfcc_parent out of range %d",
		    (int)mfccp->mfcc_parent));
		return (EINVAL);
	}

	/* A real parent vif must have an ipif attached. */
	if ((mfccp->mfcc_parent != NO_VIF) &&
	    (ipst->ips_vifs[mfccp->mfcc_parent].v_ipif == NULL)) {
		ip0dbg(("ADD_MFC: NULL ipif for parent vif %d\n",
		    (int)mfccp->mfcc_parent));
		return (EINVAL);
	}

	if (is_mrouter_off(ipst)) {
		return (EINVAL);
	}

	/* Hold the hash bucket for the rest of the function. */
	mfcbp = &ipst->ips_mfcs[MFCHASH(mfccp->mfcc_origin.s_addr,
	    mfccp->mfcc_mcastgrp.s_addr)];
	MFCB_REFHOLD(mfcbp);
	MFCFIND(mfcbp, mfccp->mfcc_origin.s_addr,
	    mfccp->mfcc_mcastgrp.s_addr, rt);

	/* If an entry already exists, just update the fields */
	if (rt) {
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "add_mfc: update o %x grp %x parent %x",
			    ntohl(mfccp->mfcc_origin.s_addr),
			    ntohl(mfccp->mfcc_mcastgrp.s_addr),
			    mfccp->mfcc_parent);
		}
		mutex_enter(&rt->mfc_mutex);
		rt->mfc_parent = mfccp->mfcc_parent;

		mutex_enter(&ipst->ips_numvifs_mutex);
		for (i = 0; i < (int)ipst->ips_numvifs; i++)
			rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
		mutex_exit(&ipst->ips_numvifs_mutex);
		mutex_exit(&rt->mfc_mutex);

		MFCB_REFRELE(mfcbp);
		return (0);
	}

	/*
	 * Find the entry for which the upcall was made and update.
	 * nstl counts the matching entries; more than one is unexpected
	 * and is reported with a warning.
	 */
	for (rt = mfcbp->mfcb_mfc, nstl = 0; rt; rt = rt->mfc_next) {
		mutex_enter(&rt->mfc_mutex);
		if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) &&
		    (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr) &&
		    (rt->mfc_rte != NULL) &&
		    !(rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
			if (nstl++ != 0)
				cmn_err(CE_WARN,
				    "add_mfc: %s o %x g %x p %x",
				    "multiple kernel entries",
				    ntohl(mfccp->mfcc_origin.s_addr),
				    ntohl(mfccp->mfcc_mcastgrp.s_addr),
				    mfccp->mfcc_parent);

			if (ipst->ips_ip_mrtdebug > 1) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "add_mfc: o %x g %x p %x",
				    ntohl(mfccp->mfcc_origin.s_addr),
				    ntohl(mfccp->mfcc_mcastgrp.s_addr),
				    mfccp->mfcc_parent);
			}
			fill_route(rt, mfccp, ipst);

			/*
			 * Prevent cleanup of cache entry.
			 * Timer starts in ip_mforward.
			 */
			if (rt->mfc_timeout_id != 0) {
				timeout_id_t id;
				id = rt->mfc_timeout_id;
				/*
				 * setting id to zero will avoid this
				 * entry from being cleaned up in
				 * expire_up_calls().
				 */
				rt->mfc_timeout_id = 0;
				/*
				 * dropping the lock is fine as we
				 * have a refhold on the bucket.
				 * so mfc cannot be freed.
				 * The timeout can fire but it will see
				 * that mfc_timeout_id == 0 and not cleanup.
				 */
				mutex_exit(&rt->mfc_mutex);
				(void) untimeout(id);
				mutex_enter(&rt->mfc_mutex);
			}

			/*
			 * Send all pkts that are queued waiting for the upcall.
			 * ip_mdq param tun set to 0 -
			 * the return value of ip_mdq() isn't used here,
			 * so value we send doesn't matter.
			 * Each rtdetq is unlinked under mfc_mutex, and the
			 * mutex is dropped around the ip_mdq() call.
			 */
			while (rt->mfc_rte != NULL) {
				rte = rt->mfc_rte;
				rt->mfc_rte = rte->rte_next;
				mutex_exit(&rt->mfc_mutex);
				(void) ip_mdq(rte->mp, (ipha_t *)
				    rte->mp->b_rptr, rte->ill, 0, rt);
				freemsg(rte->mp);
				mi_free((char *)rte);
				mutex_enter(&rt->mfc_mutex);
			}
		}
		mutex_exit(&rt->mfc_mutex);
	}


	/*
	 * It is possible that an entry is being inserted without an upcall
	 */
	if (nstl == 0) {
		mutex_enter(&(mfcbp->mfcb_lock));
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "add_mfc: no upcall o %x g %x p %x",
			    ntohl(mfccp->mfcc_origin.s_addr),
			    ntohl(mfccp->mfcc_mcastgrp.s_addr),
			    mfccp->mfcc_parent);
		}
		/* Re-check now that the bucket lock is held. */
		if (is_mrouter_off(ipst)) {
			mutex_exit(&mfcbp->mfcb_lock);
			MFCB_REFRELE(mfcbp);
			return (EINVAL);
		}

		/* Reuse a matching entry that is not condemned, if any. */
		for (rt = mfcbp->mfcb_mfc; rt; rt = rt->mfc_next) {

			mutex_enter(&rt->mfc_mutex);
			if ((rt->mfc_origin.s_addr ==
			    mfccp->mfcc_origin.s_addr) &&
			    (rt->mfc_mcastgrp.s_addr ==
			    mfccp->mfcc_mcastgrp.s_addr) &&
			    (!(rt->mfc_marks & MFCB_MARK_CONDEMNED))) {
				fill_route(rt, mfccp, ipst);
				mutex_exit(&rt->mfc_mutex);
				break;
			}
			mutex_exit(&rt->mfc_mutex);
		}

		/* No upcall, so make a new entry into mfctable */
		if (rt == NULL) {
			rt = (struct mfc *)mi_zalloc(sizeof (struct mfc));
			if (rt == NULL) {
				ip1dbg(("add_mfc: out of memory\n"));
				mutex_exit(&mfcbp->mfcb_lock);
				MFCB_REFRELE(mfcbp);
				return (ENOBUFS);
			}

			/* Insert new entry at head of hash chain */
			mutex_enter(&rt->mfc_mutex);
			fill_route(rt, mfccp, ipst);

			/* Link into table */
			rt->mfc_next   = mfcbp->mfcb_mfc;
			mfcbp->mfcb_mfc = rt;
			mutex_exit(&rt->mfc_mutex);
		}
		mutex_exit(&mfcbp->mfcb_lock);
	}

	MFCB_REFRELE(mfcbp);
	return (0);
}
1359
1360 /*
1361 * Fills in mfc structure from mrouted mfcctl.
1362 */
1363 static void
fill_route(struct mfc * rt,struct mfcctl * mfccp,ip_stack_t * ipst)1364 fill_route(struct mfc *rt, struct mfcctl *mfccp, ip_stack_t *ipst)
1365 {
1366 int i;
1367
1368 rt->mfc_origin = mfccp->mfcc_origin;
1369 rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp;
1370 rt->mfc_parent = mfccp->mfcc_parent;
1371 mutex_enter(&ipst->ips_numvifs_mutex);
1372 for (i = 0; i < (int)ipst->ips_numvifs; i++) {
1373 rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
1374 }
1375 mutex_exit(&ipst->ips_numvifs_mutex);
1376 /* Initialize pkt counters per src-grp */
1377 rt->mfc_pkt_cnt = 0;
1378 rt->mfc_byte_cnt = 0;
1379 rt->mfc_wrong_if = 0;
1380 rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_nsec = 0;
1381
1382 }
1383
1384 static void
free_queue(struct mfc * mfcp)1385 free_queue(struct mfc *mfcp)
1386 {
1387 struct rtdetq *rte0;
1388
1389 /*
1390 * Drop all queued upcall packets.
1391 * Free the mbuf with the pkt.
1392 */
1393 while ((rte0 = mfcp->mfc_rte) != NULL) {
1394 mfcp->mfc_rte = rte0->rte_next;
1395 freemsg(rte0->mp);
1396 mi_free((char *)rte0);
1397 }
1398 }
1399 /*
1400 * go thorugh the hash bucket and free all the entries marked condemned.
1401 */
1402 void
release_mfc(struct mfcb * mfcbp)1403 release_mfc(struct mfcb *mfcbp)
1404 {
1405 struct mfc *current_mfcp;
1406 struct mfc *prev_mfcp;
1407
1408 prev_mfcp = current_mfcp = mfcbp->mfcb_mfc;
1409
1410 while (current_mfcp != NULL) {
1411 if (current_mfcp->mfc_marks & MFCB_MARK_CONDEMNED) {
1412 if (current_mfcp == mfcbp->mfcb_mfc) {
1413 mfcbp->mfcb_mfc = current_mfcp->mfc_next;
1414 free_queue(current_mfcp);
1415 mi_free(current_mfcp);
1416 prev_mfcp = current_mfcp = mfcbp->mfcb_mfc;
1417 continue;
1418 }
1419 ASSERT(prev_mfcp != NULL);
1420 prev_mfcp->mfc_next = current_mfcp->mfc_next;
1421 free_queue(current_mfcp);
1422 mi_free(current_mfcp);
1423 current_mfcp = NULL;
1424 } else {
1425 prev_mfcp = current_mfcp;
1426 }
1427
1428 current_mfcp = prev_mfcp->mfc_next;
1429
1430 }
1431 mfcbp->mfcb_marks &= ~MFCB_MARK_CONDEMNED;
1432 ASSERT(mfcbp->mfcb_mfc != NULL || mfcbp->mfcb_marks == 0);
1433 }
1434
1435 /*
1436 * Delete an mfc entry.
1437 */
1438 static int
del_mfc(struct mfcctl * mfccp,ip_stack_t * ipst)1439 del_mfc(struct mfcctl *mfccp, ip_stack_t *ipst)
1440 {
1441 struct in_addr origin;
1442 struct in_addr mcastgrp;
1443 struct mfc *rt;
1444 uint_t hash;
1445 conn_t *mrouter = ipst->ips_ip_g_mrouter;
1446
1447 origin = mfccp->mfcc_origin;
1448 mcastgrp = mfccp->mfcc_mcastgrp;
1449 hash = MFCHASH(origin.s_addr, mcastgrp.s_addr);
1450
1451 if (ipst->ips_ip_mrtdebug > 1) {
1452 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1453 "del_mfc: o %x g %x",
1454 ntohl(origin.s_addr),
1455 ntohl(mcastgrp.s_addr));
1456 }
1457
1458 MFCB_REFHOLD(&ipst->ips_mfcs[hash]);
1459
1460 /* Find mfc in mfctable, finds only entries without upcalls */
1461 for (rt = ipst->ips_mfcs[hash].mfcb_mfc; rt; rt = rt->mfc_next) {
1462 mutex_enter(&rt->mfc_mutex);
1463 if (origin.s_addr == rt->mfc_origin.s_addr &&
1464 mcastgrp.s_addr == rt->mfc_mcastgrp.s_addr &&
1465 rt->mfc_rte == NULL &&
1466 !(rt->mfc_marks & MFCB_MARK_CONDEMNED))
1467 break;
1468 mutex_exit(&rt->mfc_mutex);
1469 }
1470
1471 /*
1472 * Return if there was an upcall (mfc_rte != NULL,
1473 * or rt not in mfctable.
1474 */
1475 if (rt == NULL) {
1476 MFCB_REFRELE(&ipst->ips_mfcs[hash]);
1477 return (EADDRNOTAVAIL);
1478 }
1479
1480
1481 /*
1482 * no need to hold lock as we have a reference.
1483 */
1484 ipst->ips_mfcs[hash].mfcb_marks |= MFCB_MARK_CONDEMNED;
1485 /* error checking */
1486 if (rt->mfc_timeout_id != 0) {
1487 ip0dbg(("del_mfc: TIMEOUT NOT 0, rte not null"));
1488 /*
1489 * Its ok to drop the lock, the struct cannot be freed
1490 * since we have a ref on the hash bucket.
1491 */
1492 rt->mfc_timeout_id = 0;
1493 mutex_exit(&rt->mfc_mutex);
1494 (void) untimeout(rt->mfc_timeout_id);
1495 mutex_enter(&rt->mfc_mutex);
1496 }
1497
1498 ASSERT(rt->mfc_rte == NULL);
1499
1500
1501 /*
1502 * Delete the entry from the cache
1503 */
1504 rt->mfc_marks |= MFCB_MARK_CONDEMNED;
1505 mutex_exit(&rt->mfc_mutex);
1506
1507 MFCB_REFRELE(&ipst->ips_mfcs[hash]);
1508
1509 return (0);
1510 }
1511
/*
 * # bytes of IP option for tunnel encapsulation.  ip_mforward() adds this
 * to IP_SIMPLE_HDR_LENGTH when checking whether a header is long enough to
 * carry a source-route tunnel option.
 */
#define	TUNNEL_LEN  12
1513
/*
 * IP multicast forwarding function. This function assumes that the packet
 * pointed to by ipha has arrived on (or is about to be sent to) the interface
 * pointed to by "ill", and the packet is to be relayed to other networks
 * that have members of the packet's destination IP multicast group.
 *
 * The packet is returned unscathed to the caller, unless it is
 * erroneous, in which case a -1 value tells the caller (IP)
 * to discard it.
 *
 * Unlike BSD, SunOS 5.x needs to return to IP info about
 * whether pkt came in thru a tunnel, so it can be discarded, unless
 * it's IGMP. In BSD, the ifp is bogus for tunnels, so pkt won't try
 * to be delivered.
 *
 * If a cache entry exists the packet is handed to ip_mdq() and its return
 * value is passed back.  Otherwise a copy of the packet is queued on an
 * mfc entry and, for the first such packet, an IGMPMSG_NOCACHE upcall is
 * sent to the routing daemon and an expiry timeout is started.
 *
 * Return values are 0 - pkt is okay and phyint
 *		    -1 - pkt is malformed and to be tossed; also returned
 *			 for a source-routed packet, when the mrouter is
 *			 off, when the upcall queue has overflowed, and
 *			 when memory cannot be allocated
 *		     1 - pkt came in on tunnel
 */
int
ip_mforward(mblk_t *mp, ip_recv_attr_t *ira)
{
	ipha_t		*ipha = (ipha_t *)mp->b_rptr;
	ill_t		*ill = ira->ira_ill;
	struct mfc	*rt;
	ipaddr_t	src, dst, tunnel_src = 0;
	static int	srctun = 0;
	vifi_t		vifi;
	boolean_t	pim_reg_packet = B_FALSE;
	struct mfcb	*mfcbp;
	ip_stack_t	*ipst = ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
	ill_t		*rill = ira->ira_rill;

	ASSERT(ira->ira_pktlen == msgdsize(mp));

	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "ip_mforward: RECV ipha_src %x, ipha_dst %x, ill %s",
		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
		    ill->ill_name);
	}

	/* Classify the arrival: PIM register, encapsulated tunnel or neither */
	dst = ipha->ipha_dst;
	if (ira->ira_flags & IRAF_PIM_REGISTER)
		pim_reg_packet = B_TRUE;
	else if (ira->ira_flags & IRAF_MROUTE_TUNNEL_SET)
		tunnel_src = ira->ira_mroute_tunnel;

	/*
	 * Don't forward a packet with time-to-live of zero or one,
	 * or a packet destined to a local-only group.
	 */
	if (CLASSD(dst) && (ipha->ipha_ttl <= 1 ||
	    (ipaddr_t)ntohl(dst) <= INADDR_MAX_LOCAL_GROUP)) {
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: not forwarded ttl %d,"
			    " dst 0x%x ill %s",
			    ipha->ipha_ttl, ntohl(dst), ill->ill_name);
		}
		if (tunnel_src != 0)
			return (1);
		else
			return (0);
	}

	if ((tunnel_src != 0) || pim_reg_packet) {
		/*
		 * Packet arrived over an encapsulated tunnel or via a PIM
		 * register message.
		 */
		if (ipst->ips_ip_mrtdebug > 1) {
			if (tunnel_src != 0) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: ill %s arrived via ENCAP TUN",
				    ill->ill_name);
			} else if (pim_reg_packet) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: ill %s arrived via"
				    " REGISTER VIF",
				    ill->ill_name);
			}
		}
	} else if ((ipha->ipha_version_and_hdr_length & 0xf) <
	    (uint_t)(IP_SIMPLE_HDR_LENGTH + TUNNEL_LEN) >> 2 ||
	    ((uchar_t *)(ipha + 1))[1] != IPOPT_LSRR) {
		/* Packet arrived via a physical interface. */
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: ill %s arrived via PHYINT",
			    ill->ill_name);
		}

	} else {
		/*
		 * Packet arrived through a SRCRT tunnel.
		 * Source-route tunnels are no longer supported.
		 * Error message printed every 1000 times.
		 */
		if ((srctun++ % 1000) == 0) {
			cmn_err(CE_WARN,
			    "ip_mforward: received source-routed pkt from %x",
			    ntohl(ipha->ipha_src));
		}
		return (-1);
	}

	ipst->ips_mrtstat->mrts_fwd_in++;
	src = ipha->ipha_src;

	/* Find route in cache, return NULL if not there or upcalls q'ed. */

	/*
	 * Lock the mfctable against changes made by ip_mforward.
	 * Note that only add_mfc and del_mfc can remove entries and
	 * they run with exclusive access to IP. So we do not need to
	 * guard against the rt being deleted, so release lock after reading.
	 */

	if (is_mrouter_off(ipst))
		return (-1);

	/* The bucket reference is held until every return path below. */
	mfcbp = &ipst->ips_mfcs[MFCHASH(src, dst)];
	MFCB_REFHOLD(mfcbp);
	MFCFIND(mfcbp, src, dst, rt);

	/* Entry exists, so forward if necessary */
	if (rt != NULL) {
		int ret = 0;
		ipst->ips_mrtstat->mrts_mfc_hits++;
		if (pim_reg_packet) {
			/* Register packets are attributed to the register vif's ill */
			ASSERT(ipst->ips_reg_vif_num != ALL_VIFS);
			ret = ip_mdq(mp, ipha,
			    ipst->ips_vifs[ipst->ips_reg_vif_num].
			    v_ipif->ipif_ill,
			    0, rt);
		} else {
			ret = ip_mdq(mp, ipha, ill, tunnel_src, rt);
		}

		MFCB_REFRELE(mfcbp);
		return (ret);

		/*
		 * Don't forward if we don't have a cache entry.  Mrouted will
		 * always provide a cache entry in response to an upcall.
		 */
	} else {
		/*
		 * If we don't have a route for packet's origin, make a copy
		 * of the packet and send message to routing daemon.
		 */
		struct mfc	*mfc_rt	 = NULL;
		mblk_t		*mp0	 = NULL;
		mblk_t		*mp_copy = NULL;
		struct rtdetq	*rte	 = NULL;
		struct rtdetq	*rte_m, *rte1, *prev_rte;
		uint_t		hash;
		int		npkts;
		boolean_t	new_mfc = B_FALSE;
		ipst->ips_mrtstat->mrts_mfc_misses++;
		/* BSD uses mrts_no_route++ */
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: no rte ill %s src %x g %x misses %d",
			    ill->ill_name, ntohl(src), ntohl(dst),
			    (int)ipst->ips_mrtstat->mrts_mfc_misses);
		}
		/*
		 * The order of the following code differs from the BSD code.
		 * Pre-mc3.5, the BSD code was incorrect and SunOS 5.x
		 * code works, so SunOS 5.x wasn't changed to conform to the
		 * BSD version.
		 */

		/* Lock mfctable. (same bucket as mfcbp: same src/dst hash) */
		hash = MFCHASH(src, dst);
		mutex_enter(&(ipst->ips_mfcs[hash].mfcb_lock));

		/*
		 * If we are turning off mrouted return an error
		 */
		if (is_mrouter_off(ipst)) {
			mutex_exit(&mfcbp->mfcb_lock);
			MFCB_REFRELE(mfcbp);
			return (-1);
		}

		/*
		 * Is there an upcall waiting for this packet?
		 * On a match the loop exits with mfc_rt->mfc_mutex held.
		 */
		for (mfc_rt = ipst->ips_mfcs[hash].mfcb_mfc; mfc_rt;
		    mfc_rt = mfc_rt->mfc_next) {
			mutex_enter(&mfc_rt->mfc_mutex);
			if (ipst->ips_ip_mrtdebug > 1) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: MFCTAB hash %d o 0x%x"
				    " g 0x%x\n",
				    hash, ntohl(mfc_rt->mfc_origin.s_addr),
				    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
			}
			/* There is an upcall */
			if ((src == mfc_rt->mfc_origin.s_addr) &&
			    (dst == mfc_rt->mfc_mcastgrp.s_addr) &&
			    (mfc_rt->mfc_rte != NULL) &&
			    !(mfc_rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
				break;
			}
			mutex_exit(&mfc_rt->mfc_mutex);
		}
		/* No upcall, so make a new entry into mfctable */
		if (mfc_rt == NULL) {
			mfc_rt = (struct mfc *)mi_zalloc(sizeof (struct mfc));
			if (mfc_rt == NULL) {
				ipst->ips_mrtstat->mrts_fwd_drop++;
				ip1dbg(("ip_mforward: out of memory "
				    "for mfc, mfc_rt\n"));
				goto error_return;
			} else
				new_mfc = B_TRUE;
			/* Get resources */
			/* TODO could copy header and dup rest */
			mp_copy = copymsg(mp);
			if (mp_copy == NULL) {
				ipst->ips_mrtstat->mrts_fwd_drop++;
				ip1dbg(("ip_mforward: out of memory for "
				    "mblk, mp_copy\n"));
				goto error_return;
			}
			mutex_enter(&mfc_rt->mfc_mutex);
		}
		/*
		 * From here on mfc_rt->mfc_mutex is held, whether the entry
		 * was found above or has just been allocated.
		 */
		/* Get resources for rte, whether first rte or not first. */
		/* Add this packet into rtdetq */
		rte = (struct rtdetq *)mi_zalloc(sizeof (struct rtdetq));
		if (rte == NULL) {
			ipst->ips_mrtstat->mrts_fwd_drop++;
			mutex_exit(&mfc_rt->mfc_mutex);
			ip1dbg(("ip_mforward: out of memory for"
			    " rtdetq, rte\n"));
			goto error_return;
		}

		mp0 = copymsg(mp);
		if (mp0 == NULL) {
			ipst->ips_mrtstat->mrts_fwd_drop++;
			ip1dbg(("ip_mforward: out of memory for mblk, mp0\n"));
			mutex_exit(&mfc_rt->mfc_mutex);
			goto error_return;
		}
		rte->mp		= mp0;
		if (pim_reg_packet) {
			ASSERT(ipst->ips_reg_vif_num != ALL_VIFS);
			rte->ill =
			    ipst->ips_vifs[ipst->ips_reg_vif_num].
			    v_ipif->ipif_ill;
		} else {
			rte->ill = ill;
		}
		rte->rte_next	= NULL;

		/*
		 * Determine if upcall q (rtdetq) has overflowed.
		 * mfc_rt->mfc_rte is null by mi_zalloc
		 * if it is the first message.
		 */
		for (rte_m = mfc_rt->mfc_rte, npkts = 0; rte_m;
		    rte_m = rte_m->rte_next)
			npkts++;
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: upcalls %d\n", npkts);
		}
		if (npkts > MAX_UPQ) {
			ipst->ips_mrtstat->mrts_upq_ovflw++;
			mutex_exit(&mfc_rt->mfc_mutex);
			goto error_return;
		}

		if (npkts == 0) {	/* first upcall */
			int i = 0;
			/*
			 * Now finish installing the new mfc! Now that we have
			 * resources!  Insert new entry at head of hash chain.
			 * Use src and dst which are ipaddr_t's.
			 */
			mfc_rt->mfc_origin.s_addr = src;
			mfc_rt->mfc_mcastgrp.s_addr = dst;

			mutex_enter(&ipst->ips_numvifs_mutex);
			for (i = 0; i < (int)ipst->ips_numvifs; i++)
				mfc_rt->mfc_ttls[i] = 0;
			mutex_exit(&ipst->ips_numvifs_mutex);
			mfc_rt->mfc_parent = ALL_VIFS;

			/* Link into table */
			if (ipst->ips_ip_mrtdebug > 1) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: NEW MFCTAB hash %d o 0x%x "
				    "g 0x%x\n", hash,
				    ntohl(mfc_rt->mfc_origin.s_addr),
				    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
			}
			mfc_rt->mfc_next = ipst->ips_mfcs[hash].mfcb_mfc;
			ipst->ips_mfcs[hash].mfcb_mfc = mfc_rt;
			mfc_rt->mfc_rte = NULL;
		}

		/* Link in the upcall */
		/* First upcall */
		if (mfc_rt->mfc_rte == NULL)
			mfc_rt->mfc_rte = rte;
		else {
			/* not the first upcall: append at the tail */
			prev_rte = mfc_rt->mfc_rte;
			for (rte1 = mfc_rt->mfc_rte->rte_next; rte1;
			    prev_rte = rte1, rte1 = rte1->rte_next)
				;
			prev_rte->rte_next = rte;
		}

		/*
		 * No upcalls waiting, this is first one, so send a message to
		 * routing daemon to install a route into kernel table.
		 */
		if (npkts == 0) {
			struct igmpmsg	*im;
			/* ipha_protocol is 0, for upcall */
			ASSERT(mp_copy != NULL);
			im = (struct igmpmsg *)mp_copy->b_rptr;
			im->im_msgtype	= IGMPMSG_NOCACHE;
			im->im_mbz = 0;
			mutex_enter(&ipst->ips_numvifs_mutex);
			if (pim_reg_packet) {
				im->im_vif = (uchar_t)ipst->ips_reg_vif_num;
				mutex_exit(&ipst->ips_numvifs_mutex);
			} else {
				/*
				 * XXX do we need to hold locks here ?
				 * Report the first vif whose ipif sits on
				 * the arrival ill.
				 */
				for (vifi = 0;
				    vifi < ipst->ips_numvifs;
				    vifi++) {
					if (ipst->ips_vifs[vifi].v_ipif == NULL)
						continue;
					if (ipst->ips_vifs[vifi].
					    v_ipif->ipif_ill == ill) {
						im->im_vif = (uchar_t)vifi;
						break;
					}
				}
				mutex_exit(&ipst->ips_numvifs_mutex);
				ASSERT(vifi < ipst->ips_numvifs);
			}

			ipst->ips_mrtstat->mrts_upcalls++;
			/* Timer to discard upcalls if mrouted is too slow */
			mfc_rt->mfc_timeout_id = timeout(expire_upcalls,
			    mfc_rt, EXPIRE_TIMEOUT * UPCALL_EXPIRE);
			mutex_exit(&mfc_rt->mfc_mutex);
			mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
			/*
			 * Pass to RAWIP.  The ill fields of ira are cleared
			 * for the duration of the call and restored after.
			 */
			ira->ira_ill = ira->ira_rill = NULL;
			(mrouter->conn_recv)(mrouter, mp_copy, NULL, ira);
			ira->ira_ill = ill;
			ira->ira_rill = rill;
		} else {
			/*
			 * An upcall is already outstanding; the packet has
			 * been queued and nothing is sent to the daemon.
			 * NOTE(review): mp_copy is only assigned when a new
			 * mfc is allocated, and that path always has
			 * npkts == 0, so it appears to be NULL here --
			 * confirm.
			 */
			mutex_exit(&mfc_rt->mfc_mutex);
			mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			ip_drop_input("ip_mforward - upcall already waiting",
			    mp_copy, ill);
			freemsg(mp_copy);
		}

		MFCB_REFRELE(mfcbp);
		if (tunnel_src != 0)
			return (1);
		else
			return (0);
	error_return:
		/*
		 * Reached with mfcb_lock held and mfc_mutex not held.
		 * Releases whatever was allocated on this path; an mfc is
		 * freed only if it was allocated here (new_mfc).
		 */
		mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
		MFCB_REFRELE(mfcbp);
		if (mfc_rt != NULL && (new_mfc == B_TRUE))
			mi_free((char *)mfc_rt);
		if (rte != NULL)
			mi_free((char *)rte);
		if (mp_copy != NULL) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			ip_drop_input("ip_mforward error", mp_copy, ill);
			freemsg(mp_copy);
		}
		if (mp0 != NULL)
			freemsg(mp0);
		return (-1);
	}
}
1912
1913 /*
1914 * Clean up the mfctable cache entry if upcall is not serviced.
1915 * SunOS 5.x has timeout per mfc, unlike BSD which has one timer.
1916 */
1917 static void
expire_upcalls(void * arg)1918 expire_upcalls(void *arg)
1919 {
1920 struct mfc *mfc_rt = arg;
1921 uint_t hash;
1922 struct mfc *prev_mfc, *mfc0;
1923 ip_stack_t *ipst;
1924 conn_t *mrouter;
1925
1926 if (mfc_rt->mfc_rte == NULL || mfc_rt->mfc_rte->ill != NULL) {
1927 cmn_err(CE_WARN, "expire_upcalls: no ILL\n");
1928 return;
1929 }
1930 ipst = mfc_rt->mfc_rte->ill->ill_ipst;
1931 mrouter = ipst->ips_ip_g_mrouter;
1932
1933 hash = MFCHASH(mfc_rt->mfc_origin.s_addr, mfc_rt->mfc_mcastgrp.s_addr);
1934 if (ipst->ips_ip_mrtdebug > 1) {
1935 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1936 "expire_upcalls: hash %d s %x g %x",
1937 hash, ntohl(mfc_rt->mfc_origin.s_addr),
1938 ntohl(mfc_rt->mfc_mcastgrp.s_addr));
1939 }
1940 MFCB_REFHOLD(&ipst->ips_mfcs[hash]);
1941 mutex_enter(&mfc_rt->mfc_mutex);
1942 /*
1943 * if timeout has been set to zero, than the
1944 * entry has been filled, no need to delete it.
1945 */
1946 if (mfc_rt->mfc_timeout_id == 0)
1947 goto done;
1948 ipst->ips_mrtstat->mrts_cache_cleanups++;
1949 mfc_rt->mfc_timeout_id = 0;
1950
1951 /* Determine entry to be cleaned up in cache table. */
1952 for (prev_mfc = mfc0 = ipst->ips_mfcs[hash].mfcb_mfc; mfc0;
1953 prev_mfc = mfc0, mfc0 = mfc0->mfc_next)
1954 if (mfc0 == mfc_rt)
1955 break;
1956
1957 /* del_mfc takes care of gone mfcs */
1958 ASSERT(prev_mfc != NULL);
1959 ASSERT(mfc0 != NULL);
1960
1961 /*
1962 * Delete the entry from the cache
1963 */
1964 ipst->ips_mfcs[hash].mfcb_marks |= MFCB_MARK_CONDEMNED;
1965 mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED;
1966
1967 /*
1968 * release_mfc will drop all queued upcall packets.
1969 * and will free the mbuf with the pkt, if, timing info.
1970 */
1971 done:
1972 mutex_exit(&mfc_rt->mfc_mutex);
1973 MFCB_REFRELE(&ipst->ips_mfcs[hash]);
1974 }
1975
/*
 * Packet forwarding routine once entry in the cache is made.
 *
 * mp/ipha	packet to forward and its IP header
 * ill		ill the packet is considered to have arrived on
 * tunnel_src	tunnel source address, or 0 if the packet did not arrive
 *		through a tunnel
 * rt		cache entry that matched the packet
 *
 * The packet is forwarded only if it arrived on the parent vif of rt; a
 * packet from any other interface is counted as "wrong if" and, when PIM
 * assert processing applies, reported to the routing daemon with an
 * IGMPMSG_WRONGVIF message.  Otherwise a copy is sent on every good vif
 * whose ttl in rt is non-zero and lower than the packet's ttl.
 *
 * Returns -1 if the packet is to be dropped (no parent vif, parent vif not
 * usable, vif index beyond ips_numvifs, or no memory for the wrong-vif
 * message), 1 if tunnel_src is non-zero, and 0 otherwise.
 */
static int
ip_mdq(mblk_t *mp, ipha_t *ipha, ill_t *ill, ipaddr_t tunnel_src,
    struct mfc *rt)
{
	vifi_t vifi;
	struct vif *vifp;
	ipaddr_t dst = ipha->ipha_dst;
	size_t  plen = msgdsize(mp);
	vifi_t num_of_vifs;
	ip_stack_t	*ipst = ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
	ip_recv_attr_t	iras;

	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "ip_mdq: SEND src %x, ipha_dst %x, ill %s",
		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
		    ill->ill_name);
	}

	/* Macro to send packet on vif */
#define	MC_SEND(ipha, mp, vifp, dst) { \
	if ((vifp)->v_flags & VIFF_TUNNEL) \
		encap_send((ipha), (mp), (vifp), (dst)); \
	else if ((vifp)->v_flags & VIFF_REGISTER) \
		register_send((ipha), (mp), (vifp), (dst)); \
	else \
		phyint_send((ipha), (mp), (vifp), (dst)); \
}

	vifi = rt->mfc_parent;

	/*
	 * The value of vifi is MAXVIFS if the pkt had no parent, i.e.,
	 * Mrouted had no route.
	 * We wanted the route installed in the mfctable to prevent multiple
	 * tries, so it passed add_mfc(), but is discarded here. The v_ipif is
	 * NULL so we don't want to check the ill. Still needed as of Mrouted
	 * 3.6.
	 */
	if (vifi == NO_VIF) {
		ip1dbg(("ip_mdq: no route for origin ill %s, vifi is NO_VIF\n",
		    ill->ill_name));
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mdq: vifi is NO_VIF ill = %s", ill->ill_name);
		}
		return (-1);	/* drop pkt */
	}

	/*
	 * On success the parent vif stays locked until the matching
	 * unlock_good_vif() on each path below.
	 */
	if (!lock_good_vif(&ipst->ips_vifs[vifi]))
		return (-1);
	/*
	 * The MFC entries are not cleaned up when an ipif goes
	 * away thus this code has to guard against an MFC referencing
	 * an ipif that has been closed. Note: reset_mrt_vif_ipif
	 * sets the v_ipif to NULL when the ipif disappears.
	 */
	ASSERT(ipst->ips_vifs[vifi].v_ipif != NULL);

	if (vifi >= ipst->ips_numvifs) {
		cmn_err(CE_WARN, "ip_mdq: illegal vifi %d numvifs "
		    "%d ill %s viftable ill %s\n",
		    (int)vifi, (int)ipst->ips_numvifs, ill->ill_name,
		    ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name);
		unlock_good_vif(&ipst->ips_vifs[vifi]);
		return (-1);
	}
	/*
	 * Don't forward if it didn't arrive from the parent vif for its
	 * origin.
	 */
	if ((ipst->ips_vifs[vifi].v_ipif->ipif_ill != ill) ||
	    (ipst->ips_vifs[vifi].v_rmt_addr.s_addr != tunnel_src)) {
		/* Came in the wrong interface */
		ip1dbg(("ip_mdq: arrived wrong if, vifi %d "
		    "numvifs %d ill %s viftable ill %s\n",
		    (int)vifi, (int)ipst->ips_numvifs, ill->ill_name,
		    ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name));
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mdq: arrived wrong if, vifi %d ill "
			    "%s viftable ill %s\n",
			    (int)vifi, ill->ill_name,
			    ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name);
		}
		ipst->ips_mrtstat->mrts_wrong_if++;
		rt->mfc_wrong_if++;

		/*
		 * If we are doing PIM assert processing and we are forwarding
		 * packets on this interface, and it is a broadcast medium
		 * interface (and not a tunnel), send a message to the routing.
		 *
		 * We use the first ipif on the list, since it's all we have.
		 * Chances are the ipif_flags are the same for ipifs on the ill.
		 */
		if (ipst->ips_pim_assert && rt->mfc_ttls[vifi] > 0 &&
		    (ill->ill_ipif->ipif_flags & IPIF_BROADCAST) &&
		    !(ipst->ips_vifs[vifi].v_flags & VIFF_TUNNEL)) {
			mblk_t		*mp_copy;
			struct igmpmsg	*im;

			/* TODO could copy header and dup rest */
			mp_copy = copymsg(mp);
			if (mp_copy == NULL) {
				ipst->ips_mrtstat->mrts_fwd_drop++;
				ip1dbg(("ip_mdq: out of memory "
				    "for mblk, mp_copy\n"));
				unlock_good_vif(&ipst->ips_vifs[vifi]);
				return (-1);
			}

			/* The start of the copy is overlaid with an igmpmsg. */
			im = (struct igmpmsg *)mp_copy->b_rptr;
			im->im_msgtype = IGMPMSG_WRONGVIF;
			im->im_mbz = 0;
			im->im_vif = (ushort_t)vifi;
			/* Pass to RAWIP */

			/* Build minimal receive attributes for the upcall. */
			bzero(&iras, sizeof (iras));
			iras.ira_flags = IRAF_IS_IPV4;
			iras.ira_ip_hdr_length =
			    IPH_HDR_LENGTH(mp_copy->b_rptr);
			iras.ira_pktlen = msgdsize(mp_copy);
			iras.ira_ttl = ipha->ipha_ttl;
			(mrouter->conn_recv)(mrouter, mp_copy, NULL, &iras);
			ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
		}
		unlock_good_vif(&ipst->ips_vifs[vifi]);
		if (tunnel_src != 0)
			return (1);
		else
			return (0);
	}
	/*
	 * If I sourced this packet, it counts as output, else it was input.
	 */
	if (ipha->ipha_src == ipst->ips_vifs[vifi].v_lcl_addr.s_addr) {
		ipst->ips_vifs[vifi].v_pkt_out++;
		ipst->ips_vifs[vifi].v_bytes_out += plen;
	} else {
		ipst->ips_vifs[vifi].v_pkt_in++;
		ipst->ips_vifs[vifi].v_bytes_in += plen;
	}
	mutex_enter(&rt->mfc_mutex);
	rt->mfc_pkt_cnt++;
	rt->mfc_byte_cnt += plen;
	mutex_exit(&rt->mfc_mutex);
	unlock_good_vif(&ipst->ips_vifs[vifi]);
	/*
	 * For each vif, decide if a copy of the packet should be forwarded.
	 * Forward if:
	 *		- the vif threshold ttl is non-zero AND
	 *		- the pkt ttl exceeds the vif's threshold
	 * A non-zero mfc_ttl indicates that the vif is part of
	 * the output set for the mfc entry.
	 */
	/* Snapshot numvifs so the loop bound is stable without the mutex. */
	mutex_enter(&ipst->ips_numvifs_mutex);
	num_of_vifs = ipst->ips_numvifs;
	mutex_exit(&ipst->ips_numvifs_mutex);
	for (vifp = ipst->ips_vifs, vifi = 0;
	    vifi < num_of_vifs;
	    vifp++, vifi++) {
		if (!lock_good_vif(vifp))
			continue;
		if ((rt->mfc_ttls[vifi] > 0) &&
		    (ipha->ipha_ttl > rt->mfc_ttls[vifi])) {
			/*
			 * lock_good_vif should not have succeeded if
			 * v_ipif is null.
			 */
			ASSERT(vifp->v_ipif != NULL);
			vifp->v_pkt_out++;
			vifp->v_bytes_out += plen;
			MC_SEND(ipha, mp, vifp, dst);
			ipst->ips_mrtstat->mrts_fwd_out++;
		}
		unlock_good_vif(vifp);
	}
	if (tunnel_src != 0)
		return (1);
	else
		return (0);
}
2163
2164 /*
2165 * Send the packet on physical interface.
2166 * Caller assumes can continue to use mp on return.
2167 */
2168 /* ARGSUSED */
2169 static void
phyint_send(ipha_t * ipha,mblk_t * mp,struct vif * vifp,ipaddr_t dst)2170 phyint_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
2171 {
2172 mblk_t *mp_copy;
2173 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2174 conn_t *mrouter = ipst->ips_ip_g_mrouter;
2175
2176 /* Make a new reference to the packet */
2177 mp_copy = copymsg(mp); /* TODO could copy header and dup rest */
2178 if (mp_copy == NULL) {
2179 ipst->ips_mrtstat->mrts_fwd_drop++;
2180 ip1dbg(("phyint_send: out of memory for mblk, mp_copy\n"));
2181 return;
2182 }
2183 if (vifp->v_rate_limit <= 0)
2184 tbf_send_packet(vifp, mp_copy);
2185 else {
2186 if (ipst->ips_ip_mrtdebug > 1) {
2187 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2188 "phyint_send: tbf_contr rate %d "
2189 "vifp 0x%p mp 0x%p dst 0x%x",
2190 vifp->v_rate_limit, (void *)vifp, (void *)mp, dst);
2191 }
2192 tbf_control(vifp, mp_copy, (ipha_t *)mp_copy->b_rptr);
2193 }
2194 }
2195
2196 /*
2197 * Send the whole packet for REGISTER encapsulation to PIM daemon
2198 * Caller assumes it can continue to use mp on return.
2199 */
2200 /* ARGSUSED */
2201 static void
register_send(ipha_t * ipha,mblk_t * mp,struct vif * vifp,ipaddr_t dst)2202 register_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
2203 {
2204 struct igmpmsg *im;
2205 mblk_t *mp_copy;
2206 ipha_t *ipha_copy;
2207 ill_t *ill = vifp->v_ipif->ipif_ill;
2208 ip_stack_t *ipst = ill->ill_ipst;
2209 conn_t *mrouter = ipst->ips_ip_g_mrouter;
2210 ip_recv_attr_t iras;
2211
2212 if (ipst->ips_ip_mrtdebug > 1) {
2213 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2214 "register_send: src %x, dst %x\n",
2215 ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst));
2216 }
2217
2218 /*
2219 * Copy the old packet & pullup its IP header into the new mblk_t so we
2220 * can modify it. Try to fill the new mblk_t since if we don't the
2221 * ethernet driver will.
2222 */
2223 mp_copy = allocb(sizeof (struct igmpmsg) + sizeof (ipha_t), BPRI_MED);
2224 if (mp_copy == NULL) {
2225 ++ipst->ips_mrtstat->mrts_pim_nomemory;
2226 if (ipst->ips_ip_mrtdebug > 3) {
2227 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2228 "register_send: allocb failure.");
2229 }
2230 return;
2231 }
2232
2233 /*
2234 * Bump write pointer to account for igmpmsg being added.
2235 */
2236 mp_copy->b_wptr = mp_copy->b_rptr + sizeof (struct igmpmsg);
2237
2238 /*
2239 * Chain packet to new mblk_t.
2240 */
2241 if ((mp_copy->b_cont = copymsg(mp)) == NULL) {
2242 ++ipst->ips_mrtstat->mrts_pim_nomemory;
2243 if (ipst->ips_ip_mrtdebug > 3) {
2244 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2245 "register_send: copymsg failure.");
2246 }
2247 freeb(mp_copy);
2248 return;
2249 }
2250
2251 /*
2252 * icmp_input() asserts that IP version field is set to an
2253 * appropriate version. Hence, the struct igmpmsg that this really
2254 * becomes, needs to have the correct IP version field.
2255 */
2256 ipha_copy = (ipha_t *)mp_copy->b_rptr;
2257 *ipha_copy = multicast_encap_iphdr;
2258
2259 /*
2260 * The kernel uses the struct igmpmsg header to encode the messages to
2261 * the multicast routing daemon. Fill in the fields in the header
2262 * starting with the message type which is IGMPMSG_WHOLEPKT
2263 */
2264 im = (struct igmpmsg *)mp_copy->b_rptr;
2265 im->im_msgtype = IGMPMSG_WHOLEPKT;
2266 im->im_src.s_addr = ipha->ipha_src;
2267 im->im_dst.s_addr = ipha->ipha_dst;
2268
2269 /*
2270 * Must Be Zero. This is because the struct igmpmsg is really an IP
2271 * header with renamed fields and the multicast routing daemon uses
2272 * an ipha_protocol (aka im_mbz) of 0 to distinguish these messages.
2273 */
2274 im->im_mbz = 0;
2275
2276 ++ipst->ips_mrtstat->mrts_upcalls;
2277 if (IPCL_IS_NONSTR(mrouter) ? mrouter->conn_flow_cntrld :
2278 !canputnext(mrouter->conn_rq)) {
2279 ++ipst->ips_mrtstat->mrts_pim_regsend_drops;
2280 if (ipst->ips_ip_mrtdebug > 3) {
2281 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2282 "register_send: register upcall failure.");
2283 }
2284 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2285 ip_drop_input("mrts_pim_regsend_drops", mp_copy, ill);
2286 freemsg(mp_copy);
2287 } else {
2288 /* Pass to RAWIP */
2289 bzero(&iras, sizeof (iras));
2290 iras.ira_flags = IRAF_IS_IPV4;
2291 iras.ira_ip_hdr_length = sizeof (ipha_t);
2292 iras.ira_pktlen = msgdsize(mp_copy);
2293 iras.ira_ttl = ipha->ipha_ttl;
2294 (mrouter->conn_recv)(mrouter, mp_copy, NULL, &iras);
2295 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
2296 }
2297 }
2298
2299 /*
2300 * pim_validate_cksum handles verification of the checksum in the
2301 * pim header. For PIM Register packets, the checksum is calculated
2302 * across the PIM header only. For all other packets, the checksum
2303 * is for the PIM header and remainder of the packet.
2304 *
2305 * returns: B_TRUE, if checksum is okay.
2306 * B_FALSE, if checksum is not valid.
2307 */
2308 static boolean_t
pim_validate_cksum(mblk_t * mp,ipha_t * ip,struct pim * pimp)2309 pim_validate_cksum(mblk_t *mp, ipha_t *ip, struct pim *pimp)
2310 {
2311 mblk_t *mp_dup;
2312
2313 if ((mp_dup = dupmsg(mp)) == NULL)
2314 return (B_FALSE);
2315
2316 mp_dup->b_rptr += IPH_HDR_LENGTH(ip);
2317 if (pimp->pim_type == PIM_REGISTER)
2318 mp_dup->b_wptr = mp_dup->b_rptr + PIM_MINLEN;
2319 if (IP_CSUM(mp_dup, 0, 0)) {
2320 freemsg(mp_dup);
2321 return (B_FALSE);
2322 }
2323 freemsg(mp_dup);
2324 return (B_TRUE);
2325 }
2326
2327 /*
2328 * Process PIM protocol packets i.e. IP Protocol 103.
2329 * Register messages are decapsulated and sent onto multicast forwarding.
2330 *
2331 * Return NULL for a bad packet that is discarded here.
2332 * Return mp if the message is OK and should be handed to "raw" receivers.
2333 * Callers of pim_input() may need to reinitialize variables that were copied
2334 * from the mblk as this calls pullupmsg().
2335 */
2336 mblk_t *
pim_input(mblk_t * mp,ip_recv_attr_t * ira)2337 pim_input(mblk_t *mp, ip_recv_attr_t *ira)
2338 {
2339 ipha_t *eip, *ip;
2340 int iplen, pimlen, iphlen;
2341 struct pim *pimp; /* pointer to a pim struct */
2342 uint32_t *reghdr;
2343 ill_t *ill = ira->ira_ill;
2344 ip_stack_t *ipst = ill->ill_ipst;
2345 conn_t *mrouter = ipst->ips_ip_g_mrouter;
2346
2347 /*
2348 * Pullup the msg for PIM protocol processing.
2349 */
2350 if (pullupmsg(mp, -1) == 0) {
2351 ++ipst->ips_mrtstat->mrts_pim_nomemory;
2352 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2353 ip_drop_input("mrts_pim_nomemory", mp, ill);
2354 freemsg(mp);
2355 return (NULL);
2356 }
2357
2358 ip = (ipha_t *)mp->b_rptr;
2359 iplen = ip->ipha_length;
2360 iphlen = IPH_HDR_LENGTH(ip);
2361 pimlen = ntohs(iplen) - iphlen;
2362
2363 /*
2364 * Validate lengths
2365 */
2366 if (pimlen < PIM_MINLEN) {
2367 ++ipst->ips_mrtstat->mrts_pim_malformed;
2368 if (ipst->ips_ip_mrtdebug > 1) {
2369 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2370 "pim_input: length not at least minlen");
2371 }
2372 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2373 ip_drop_input("mrts_pim_malformed", mp, ill);
2374 freemsg(mp);
2375 return (NULL);
2376 }
2377
2378 /*
2379 * Point to the PIM header.
2380 */
2381 pimp = (struct pim *)((caddr_t)ip + iphlen);
2382
2383 /*
2384 * Check the version number.
2385 */
2386 if (pimp->pim_vers != PIM_VERSION) {
2387 ++ipst->ips_mrtstat->mrts_pim_badversion;
2388 if (ipst->ips_ip_mrtdebug > 1) {
2389 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2390 "pim_input: unknown version of PIM");
2391 }
2392 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2393 ip_drop_input("mrts_pim_badversion", mp, ill);
2394 freemsg(mp);
2395 return (NULL);
2396 }
2397
2398 /*
2399 * Validate the checksum
2400 */
2401 if (!pim_validate_cksum(mp, ip, pimp)) {
2402 ++ipst->ips_mrtstat->mrts_pim_rcv_badcsum;
2403 if (ipst->ips_ip_mrtdebug > 1) {
2404 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2405 "pim_input: invalid checksum");
2406 }
2407 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2408 ip_drop_input("pim_rcv_badcsum", mp, ill);
2409 freemsg(mp);
2410 return (NULL);
2411 }
2412
2413 if (pimp->pim_type != PIM_REGISTER)
2414 return (mp);
2415
2416 reghdr = (uint32_t *)(pimp + 1);
2417 eip = (ipha_t *)(reghdr + 1);
2418
2419 /*
2420 * check if the inner packet is destined to mcast group
2421 */
2422 if (!CLASSD(eip->ipha_dst)) {
2423 ++ipst->ips_mrtstat->mrts_pim_badregisters;
2424 if (ipst->ips_ip_mrtdebug > 1) {
2425 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2426 "pim_input: Inner pkt not mcast .. !");
2427 }
2428 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2429 ip_drop_input("mrts_pim_badregisters", mp, ill);
2430 freemsg(mp);
2431 return (NULL);
2432 }
2433 if (ipst->ips_ip_mrtdebug > 1) {
2434 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2435 "register from %x, to %x, len %d",
2436 ntohl(eip->ipha_src),
2437 ntohl(eip->ipha_dst),
2438 ntohs(eip->ipha_length));
2439 }
2440 /*
2441 * If the null register bit is not set, decapsulate
2442 * the packet before forwarding it.
2443 * Avoid this in no register vif
2444 */
2445 if (!(ntohl(*reghdr) & PIM_NULL_REGISTER) &&
2446 ipst->ips_reg_vif_num != ALL_VIFS) {
2447 mblk_t *mp_copy;
2448 uint_t saved_pktlen;
2449
2450 /* Copy the message */
2451 if ((mp_copy = copymsg(mp)) == NULL) {
2452 ++ipst->ips_mrtstat->mrts_pim_nomemory;
2453 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2454 ip_drop_input("mrts_pim_nomemory", mp, ill);
2455 freemsg(mp);
2456 return (NULL);
2457 }
2458
2459 /*
2460 * Decapsulate the packet and give it to
2461 * register_mforward.
2462 */
2463 mp_copy->b_rptr += iphlen + sizeof (pim_t) + sizeof (*reghdr);
2464 saved_pktlen = ira->ira_pktlen;
2465 ira->ira_pktlen -= iphlen + sizeof (pim_t) + sizeof (*reghdr);
2466 if (register_mforward(mp_copy, ira) != 0) {
2467 /* register_mforward already called ip_drop_input */
2468 freemsg(mp);
2469 ira->ira_pktlen = saved_pktlen;
2470 return (NULL);
2471 }
2472 ira->ira_pktlen = saved_pktlen;
2473 }
2474
2475 /*
2476 * Pass all valid PIM packets up to any process(es) listening on a raw
2477 * PIM socket. For Solaris it is done right after pim_input() is
2478 * called.
2479 */
2480 return (mp);
2481 }
2482
2483 /*
2484 * PIM sparse mode hook. Called by pim_input after decapsulating
2485 * the packet. Loop back the packet, as if we have received it.
2486 * In pim_input() we have to check if the destination is a multicast address.
2487 */
2488 static int
register_mforward(mblk_t * mp,ip_recv_attr_t * ira)2489 register_mforward(mblk_t *mp, ip_recv_attr_t *ira)
2490 {
2491 ire_t *ire;
2492 ipha_t *ipha = (ipha_t *)mp->b_rptr;
2493 ill_t *ill = ira->ira_ill;
2494 ip_stack_t *ipst = ill->ill_ipst;
2495 conn_t *mrouter = ipst->ips_ip_g_mrouter;
2496
2497 ASSERT(ipst->ips_reg_vif_num <= ipst->ips_numvifs);
2498
2499 if (ipst->ips_ip_mrtdebug > 3) {
2500 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2501 "register_mforward: src %x, dst %x\n",
2502 ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst));
2503 }
2504 /*
2505 * Need to pass in to ip_mforward() the information that the
2506 * packet has arrived on the register_vif. We mark it with
2507 * the IRAF_PIM_REGISTER attribute.
2508 * pim_input verified that the (inner) destination is multicast,
2509 * hence we skip the generic code in ip_input.
2510 */
2511 ira->ira_flags |= IRAF_PIM_REGISTER;
2512 ++ipst->ips_mrtstat->mrts_pim_regforwards;
2513
2514 if (!CLASSD(ipha->ipha_dst)) {
2515 ire = ire_route_recursive_v4(ipha->ipha_dst, 0, NULL, ALL_ZONES,
2516 ira->ira_tsl, MATCH_IRE_SECATTR, IRR_ALLOCATE, 0, ipst,
2517 NULL, NULL, NULL);
2518 } else {
2519 ire = ire_multicast(ill);
2520 }
2521 ASSERT(ire != NULL);
2522 /* Normally this will return the IRE_MULTICAST */
2523 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
2524 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2525 ip_drop_input("mrts_pim RTF_REJECT", mp, ill);
2526 freemsg(mp);
2527 ire_refrele(ire);
2528 return (-1);
2529 }
2530 ASSERT(ire->ire_type & IRE_MULTICAST);
2531 (*ire->ire_recvfn)(ire, mp, ipha, ira);
2532 ire_refrele(ire);
2533
2534 return (0);
2535 }
2536
2537 /*
2538 * Send an encapsulated packet.
2539 * Caller assumes can continue to use mp when routine returns.
2540 */
2541 /* ARGSUSED */
2542 static void
encap_send(ipha_t * ipha,mblk_t * mp,struct vif * vifp,ipaddr_t dst)2543 encap_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
2544 {
2545 mblk_t *mp_copy;
2546 ipha_t *ipha_copy;
2547 size_t len;
2548 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2549 conn_t *mrouter = ipst->ips_ip_g_mrouter;
2550
2551 if (ipst->ips_ip_mrtdebug > 1) {
2552 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2553 "encap_send: vif %ld enter",
2554 (ptrdiff_t)(vifp - ipst->ips_vifs));
2555 }
2556 len = ntohs(ipha->ipha_length);
2557
2558 /*
2559 * Copy the old packet & pullup it's IP header into the
2560 * new mbuf so we can modify it. Try to fill the new
2561 * mbuf since if we don't the ethernet driver will.
2562 */
2563 mp_copy = allocb(32 + sizeof (multicast_encap_iphdr), BPRI_MED);
2564 if (mp_copy == NULL)
2565 return;
2566 mp_copy->b_rptr += 32;
2567 mp_copy->b_wptr = mp_copy->b_rptr + sizeof (multicast_encap_iphdr);
2568 if ((mp_copy->b_cont = copymsg(mp)) == NULL) {
2569 freeb(mp_copy);
2570 return;
2571 }
2572
2573 /*
2574 * Fill in the encapsulating IP header.
2575 * Remote tunnel dst in rmt_addr, from add_vif().
2576 */
2577 ipha_copy = (ipha_t *)mp_copy->b_rptr;
2578 *ipha_copy = multicast_encap_iphdr;
2579 ASSERT((len + sizeof (ipha_t)) <= IP_MAXPACKET);
2580 ipha_copy->ipha_length = htons(len + sizeof (ipha_t));
2581 ipha_copy->ipha_src = vifp->v_lcl_addr.s_addr;
2582 ipha_copy->ipha_dst = vifp->v_rmt_addr.s_addr;
2583 ASSERT(ipha_copy->ipha_ident == 0);
2584
2585 /* Turn the encapsulated IP header back into a valid one. */
2586 ipha = (ipha_t *)mp_copy->b_cont->b_rptr;
2587 ipha->ipha_ttl--;
2588 ipha->ipha_hdr_checksum = 0;
2589 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
2590
2591 ipha_copy->ipha_ttl = ipha->ipha_ttl;
2592
2593 if (ipst->ips_ip_mrtdebug > 1) {
2594 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2595 "encap_send: group 0x%x", ntohl(ipha->ipha_dst));
2596 }
2597 if (vifp->v_rate_limit <= 0)
2598 tbf_send_packet(vifp, mp_copy);
2599 else
2600 /* ipha is from the original header */
2601 tbf_control(vifp, mp_copy, ipha);
2602 }
2603
2604 /*
2605 * De-encapsulate a packet and feed it back through IP input if it
2606 * matches one of our multicast tunnels.
2607 *
2608 * This routine is called whenever IP gets a packet with prototype
2609 * IPPROTO_ENCAP and a local destination address and the packet didn't
2610 * match one of our configured IP-in-IP tunnels.
2611 */
2612 void
ip_mroute_decap(mblk_t * mp,ip_recv_attr_t * ira)2613 ip_mroute_decap(mblk_t *mp, ip_recv_attr_t *ira)
2614 {
2615 ipha_t *ipha = (ipha_t *)mp->b_rptr;
2616 ipha_t *ipha_encap;
2617 int hlen = IPH_HDR_LENGTH(ipha);
2618 int hlen_encap;
2619 ipaddr_t src;
2620 struct vif *vifp;
2621 ire_t *ire;
2622 ill_t *ill = ira->ira_ill;
2623 ip_stack_t *ipst = ill->ill_ipst;
2624 conn_t *mrouter = ipst->ips_ip_g_mrouter;
2625
2626 /* Make sure we have all of the inner header */
2627 ipha_encap = (ipha_t *)((char *)ipha + hlen);
2628 if (mp->b_wptr - mp->b_rptr < hlen + IP_SIMPLE_HDR_LENGTH) {
2629 ipha = ip_pullup(mp, hlen + IP_SIMPLE_HDR_LENGTH, ira);
2630 if (ipha == NULL) {
2631 ipst->ips_mrtstat->mrts_bad_tunnel++;
2632 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2633 ip_drop_input("ip_mroute_decap: too short", mp, ill);
2634 freemsg(mp);
2635 return;
2636 }
2637 ipha_encap = (ipha_t *)((char *)ipha + hlen);
2638 }
2639 hlen_encap = IPH_HDR_LENGTH(ipha_encap);
2640 if (mp->b_wptr - mp->b_rptr < hlen + hlen_encap) {
2641 ipha = ip_pullup(mp, hlen + hlen_encap, ira);
2642 if (ipha == NULL) {
2643 ipst->ips_mrtstat->mrts_bad_tunnel++;
2644 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2645 ip_drop_input("ip_mroute_decap: too short", mp, ill);
2646 freemsg(mp);
2647 return;
2648 }
2649 ipha_encap = (ipha_t *)((char *)ipha + hlen);
2650 }
2651
2652 /*
2653 * Dump the packet if it's not to a multicast destination or if
2654 * we don't have an encapsulating tunnel with the source.
2655 * Note: This code assumes that the remote site IP address
2656 * uniquely identifies the tunnel (i.e., that this site has
2657 * at most one tunnel with the remote site).
2658 */
2659 if (!CLASSD(ipha_encap->ipha_dst)) {
2660 ipst->ips_mrtstat->mrts_bad_tunnel++;
2661 ip1dbg(("ip_mroute_decap: bad tunnel\n"));
2662 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2663 ip_drop_input("mrts_bad_tunnel", mp, ill);
2664 freemsg(mp);
2665 return;
2666 }
2667 src = (ipaddr_t)ipha->ipha_src;
2668 mutex_enter(&ipst->ips_last_encap_lock);
2669 if (src != ipst->ips_last_encap_src) {
2670 struct vif *vife;
2671
2672 vifp = ipst->ips_vifs;
2673 vife = vifp + ipst->ips_numvifs;
2674 ipst->ips_last_encap_src = src;
2675 ipst->ips_last_encap_vif = 0;
2676 for (; vifp < vife; ++vifp) {
2677 if (!lock_good_vif(vifp))
2678 continue;
2679 if (vifp->v_rmt_addr.s_addr == src) {
2680 if (vifp->v_flags & VIFF_TUNNEL)
2681 ipst->ips_last_encap_vif = vifp;
2682 if (ipst->ips_ip_mrtdebug > 1) {
2683 (void) mi_strlog(mrouter->conn_rq,
2684 1, SL_TRACE,
2685 "ip_mroute_decap: good tun "
2686 "vif %ld with %x",
2687 (ptrdiff_t)(vifp - ipst->ips_vifs),
2688 ntohl(src));
2689 }
2690 unlock_good_vif(vifp);
2691 break;
2692 }
2693 unlock_good_vif(vifp);
2694 }
2695 }
2696 if ((vifp = ipst->ips_last_encap_vif) == 0) {
2697 mutex_exit(&ipst->ips_last_encap_lock);
2698 ipst->ips_mrtstat->mrts_bad_tunnel++;
2699 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2700 ip_drop_input("mrts_bad_tunnel", mp, ill);
2701 freemsg(mp);
2702 ip1dbg(("ip_mroute_decap: vif %ld no tunnel with %x\n",
2703 (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(src)));
2704 return;
2705 }
2706 mutex_exit(&ipst->ips_last_encap_lock);
2707
2708 /*
2709 * Need to pass in the tunnel source to ip_mforward (so that it can
2710 * verify that the packet arrived over the correct vif.)
2711 */
2712 ira->ira_flags |= IRAF_MROUTE_TUNNEL_SET;
2713 ira->ira_mroute_tunnel = src;
2714 mp->b_rptr += hlen;
2715 ira->ira_pktlen -= hlen;
2716 ira->ira_ip_hdr_length = hlen_encap;
2717
2718 /*
2719 * We don't redo any of the filtering in ill_input_full_v4 and we
2720 * have checked that all of ipha_encap and any IP options are
2721 * pulled up. Hence we call ire_recv_multicast_v4 directly.
2722 * However, we have to check for RSVP as in ip_input_full_v4
2723 * and if so we pass it to ire_recv_broadcast_v4 for local delivery
2724 * to the rsvpd.
2725 */
2726 if (ipha_encap->ipha_protocol == IPPROTO_RSVP &&
2727 ipst->ips_ipcl_proto_fanout_v4[IPPROTO_RSVP].connf_head != NULL) {
2728 ire = ire_route_recursive_v4(INADDR_BROADCAST, 0, ill,
2729 ALL_ZONES, ira->ira_tsl, MATCH_IRE_ILL|MATCH_IRE_SECATTR,
2730 IRR_ALLOCATE, 0, ipst, NULL, NULL, NULL);
2731 } else {
2732 ire = ire_multicast(ill);
2733 }
2734 ASSERT(ire != NULL);
2735 /* Normally this will return the IRE_MULTICAST or IRE_BROADCAST */
2736 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
2737 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2738 ip_drop_input("ip_mroute_decap: RTF_REJECT", mp, ill);
2739 freemsg(mp);
2740 ire_refrele(ire);
2741 return;
2742 }
2743 ire->ire_ib_pkt_count++;
2744 ASSERT(ire->ire_type & (IRE_MULTICAST|IRE_BROADCAST));
2745 (*ire->ire_recvfn)(ire, mp, ipha_encap, ira);
2746 ire_refrele(ire);
2747 }
2748
2749 /*
2750 * Remove all records with v_ipif == ipif. Called when an interface goes away
2751 * (stream closed). Called as writer.
2752 */
2753 void
reset_mrt_vif_ipif(ipif_t * ipif)2754 reset_mrt_vif_ipif(ipif_t *ipif)
2755 {
2756 vifi_t vifi, tmp_vifi;
2757 vifi_t num_of_vifs;
2758 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
2759
2760 /* Can't check vifi >= 0 since vifi_t is unsigned! */
2761
2762 mutex_enter(&ipst->ips_numvifs_mutex);
2763 num_of_vifs = ipst->ips_numvifs;
2764 mutex_exit(&ipst->ips_numvifs_mutex);
2765
2766 for (vifi = num_of_vifs; vifi != 0; vifi--) {
2767 tmp_vifi = vifi - 1;
2768 if (ipst->ips_vifs[tmp_vifi].v_ipif == ipif) {
2769 (void) del_vif(&tmp_vifi, ipst);
2770 }
2771 }
2772 }
2773
2774 /* Remove pending upcall msgs when ill goes away. Called by ill_delete. */
2775 void
reset_mrt_ill(ill_t * ill)2776 reset_mrt_ill(ill_t *ill)
2777 {
2778 struct mfc *rt;
2779 struct rtdetq *rte;
2780 int i;
2781 ip_stack_t *ipst = ill->ill_ipst;
2782 conn_t *mrouter = ipst->ips_ip_g_mrouter;
2783 timeout_id_t id;
2784
2785 for (i = 0; i < MFCTBLSIZ; i++) {
2786 MFCB_REFHOLD(&ipst->ips_mfcs[i]);
2787 if ((rt = ipst->ips_mfcs[i].mfcb_mfc) != NULL) {
2788 if (ipst->ips_ip_mrtdebug > 1) {
2789 (void) mi_strlog(mrouter->conn_rq, 1,
2790 SL_TRACE,
2791 "reset_mrt_ill: mfctable [%d]", i);
2792 }
2793 while (rt != NULL) {
2794 mutex_enter(&rt->mfc_mutex);
2795 while ((rte = rt->mfc_rte) != NULL) {
2796 if (rte->ill == ill &&
2797 (id = rt->mfc_timeout_id) != 0) {
2798 /*
2799 * Its ok to drop the lock, the
2800 * struct cannot be freed since
2801 * we have a ref on the hash
2802 * bucket.
2803 */
2804 mutex_exit(&rt->mfc_mutex);
2805 (void) untimeout(id);
2806 mutex_enter(&rt->mfc_mutex);
2807 }
2808 if (rte->ill == ill) {
2809 if (ipst->ips_ip_mrtdebug > 1) {
2810 (void) mi_strlog(
2811 mrouter->conn_rq,
2812 1, SL_TRACE,
2813 "reset_mrt_ill: "
2814 "ill 0x%p", (void *)ill);
2815 }
2816 rt->mfc_rte = rte->rte_next;
2817 freemsg(rte->mp);
2818 mi_free((char *)rte);
2819 }
2820 }
2821 mutex_exit(&rt->mfc_mutex);
2822 rt = rt->mfc_next;
2823 }
2824 }
2825 MFCB_REFRELE(&ipst->ips_mfcs[i]);
2826 }
2827 }
2828
2829 /*
2830 * Token bucket filter module.
2831 * The ipha is for mcastgrp destination for phyint and encap.
2832 */
2833 static void
tbf_control(struct vif * vifp,mblk_t * mp,ipha_t * ipha)2834 tbf_control(struct vif *vifp, mblk_t *mp, ipha_t *ipha)
2835 {
2836 size_t p_len = msgdsize(mp);
2837 struct tbf *t = vifp->v_tbf;
2838 timeout_id_t id = 0;
2839 ill_t *ill = vifp->v_ipif->ipif_ill;
2840 ip_stack_t *ipst = ill->ill_ipst;
2841 conn_t *mrouter = ipst->ips_ip_g_mrouter;
2842
2843 /* Drop if packet is too large */
2844 if (p_len > MAX_BKT_SIZE) {
2845 ipst->ips_mrtstat->mrts_pkt2large++;
2846 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2847 ip_drop_output("tbf_control - too large", mp, ill);
2848 freemsg(mp);
2849 return;
2850 }
2851 if (ipst->ips_ip_mrtdebug > 1) {
2852 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2853 "tbf_ctrl: SEND vif %ld, qlen %d, ipha_dst 0x%x",
2854 (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_q_len,
2855 ntohl(ipha->ipha_dst));
2856 }
2857
2858 mutex_enter(&t->tbf_lock);
2859
2860 tbf_update_tokens(vifp);
2861
2862 /*
2863 * If there are enough tokens,
2864 * and the queue is empty, send this packet out.
2865 */
2866 if (ipst->ips_ip_mrtdebug > 1) {
2867 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2868 "tbf_control: vif %ld, TOKENS %d, pkt len %lu, qlen %d",
2869 (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_n_tok, p_len,
2870 t->tbf_q_len);
2871 }
2872 /* No packets are queued */
2873 if (t->tbf_q_len == 0) {
2874 /* queue empty, send packet if enough tokens */
2875 if (p_len <= t->tbf_n_tok) {
2876 t->tbf_n_tok -= p_len;
2877 mutex_exit(&t->tbf_lock);
2878 tbf_send_packet(vifp, mp);
2879 return;
2880 } else {
2881 /* Queue packet and timeout till later */
2882 tbf_queue(vifp, mp);
2883 ASSERT(vifp->v_timeout_id == 0);
2884 vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp,
2885 TBF_REPROCESS);
2886 }
2887 } else if (t->tbf_q_len < t->tbf_max_q_len) {
2888 /* Finite queue length, so queue pkts and process queue */
2889 tbf_queue(vifp, mp);
2890 tbf_process_q(vifp);
2891 } else {
2892 /* Check that we have UDP header with IP header */
2893 size_t hdr_length = IPH_HDR_LENGTH(ipha) +
2894 sizeof (struct udphdr);
2895
2896 if ((mp->b_wptr - mp->b_rptr) < hdr_length) {
2897 if (!pullupmsg(mp, hdr_length)) {
2898 BUMP_MIB(ill->ill_ip_mib,
2899 ipIfStatsOutDiscards);
2900 ip_drop_output("tbf_control - pullup", mp, ill);
2901 freemsg(mp);
2902 ip1dbg(("tbf_ctl: couldn't pullup udp hdr, "
2903 "vif %ld src 0x%x dst 0x%x\n",
2904 (ptrdiff_t)(vifp - ipst->ips_vifs),
2905 ntohl(ipha->ipha_src),
2906 ntohl(ipha->ipha_dst)));
2907 mutex_exit(&vifp->v_tbf->tbf_lock);
2908 return;
2909 } else
2910 /* Have to reassign ipha after pullupmsg */
2911 ipha = (ipha_t *)mp->b_rptr;
2912 }
2913 /*
2914 * Queue length too much,
2915 * try to selectively dq, or queue and process
2916 */
2917 if (!tbf_dq_sel(vifp, ipha)) {
2918 ipst->ips_mrtstat->mrts_q_overflow++;
2919 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2920 ip_drop_output("mrts_q_overflow", mp, ill);
2921 freemsg(mp);
2922 } else {
2923 tbf_queue(vifp, mp);
2924 tbf_process_q(vifp);
2925 }
2926 }
2927 if (t->tbf_q_len == 0) {
2928 id = vifp->v_timeout_id;
2929 vifp->v_timeout_id = 0;
2930 }
2931 mutex_exit(&vifp->v_tbf->tbf_lock);
2932 if (id != 0)
2933 (void) untimeout(id);
2934 }
2935
2936 /*
2937 * Adds a packet to the tbf queue at the interface.
2938 * The ipha is for mcastgrp destination for phyint and encap.
2939 */
2940 static void
tbf_queue(struct vif * vifp,mblk_t * mp)2941 tbf_queue(struct vif *vifp, mblk_t *mp)
2942 {
2943 struct tbf *t = vifp->v_tbf;
2944 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2945 conn_t *mrouter = ipst->ips_ip_g_mrouter;
2946
2947 if (ipst->ips_ip_mrtdebug > 1) {
2948 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2949 "tbf_queue: vif %ld", (ptrdiff_t)(vifp - ipst->ips_vifs));
2950 }
2951 ASSERT(MUTEX_HELD(&t->tbf_lock));
2952
2953 if (t->tbf_t == NULL) {
2954 /* Queue was empty */
2955 t->tbf_q = mp;
2956 } else {
2957 /* Insert at tail */
2958 t->tbf_t->b_next = mp;
2959 }
2960 /* set new tail pointer */
2961 t->tbf_t = mp;
2962
2963 mp->b_next = mp->b_prev = NULL;
2964
2965 t->tbf_q_len++;
2966 }
2967
2968 /*
2969 * Process the queue at the vif interface.
2970 * Drops the tbf_lock when sending packets.
2971 *
2972 * NOTE : The caller should quntimeout if the queue length is 0.
2973 */
2974 static void
tbf_process_q(struct vif * vifp)2975 tbf_process_q(struct vif *vifp)
2976 {
2977 mblk_t *mp;
2978 struct tbf *t = vifp->v_tbf;
2979 size_t len;
2980 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2981 conn_t *mrouter = ipst->ips_ip_g_mrouter;
2982
2983 if (ipst->ips_ip_mrtdebug > 1) {
2984 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2985 "tbf_process_q 1: vif %ld qlen = %d",
2986 (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_q_len);
2987 }
2988
2989 /*
2990 * Loop through the queue at the interface and send
2991 * as many packets as possible.
2992 */
2993 ASSERT(MUTEX_HELD(&t->tbf_lock));
2994
2995 while (t->tbf_q_len > 0) {
2996 mp = t->tbf_q;
2997 len = (size_t)msgdsize(mp); /* length of ip pkt */
2998
2999 /* Determine if the packet can be sent */
3000 if (len <= t->tbf_n_tok) {
3001 /*
3002 * If so, reduce no. of tokens, dequeue the packet,
3003 * send the packet.
3004 */
3005 t->tbf_n_tok -= len;
3006
3007 t->tbf_q = mp->b_next;
3008 if (--t->tbf_q_len == 0) {
3009 t->tbf_t = NULL;
3010 }
3011 mp->b_next = NULL;
3012 /* Exit mutex before sending packet, then re-enter */
3013 mutex_exit(&t->tbf_lock);
3014 tbf_send_packet(vifp, mp);
3015 mutex_enter(&t->tbf_lock);
3016 } else
3017 break;
3018 }
3019 }
3020
3021 /* Called at tbf timeout to update tokens, process q and reset timer. */
3022 static void
tbf_reprocess_q(void * arg)3023 tbf_reprocess_q(void *arg)
3024 {
3025 struct vif *vifp = arg;
3026 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
3027 conn_t *mrouter = ipst->ips_ip_g_mrouter;
3028
3029 mutex_enter(&vifp->v_tbf->tbf_lock);
3030 vifp->v_timeout_id = 0;
3031 tbf_update_tokens(vifp);
3032
3033 tbf_process_q(vifp);
3034
3035 if (vifp->v_tbf->tbf_q_len > 0) {
3036 vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp,
3037 TBF_REPROCESS);
3038 }
3039 mutex_exit(&vifp->v_tbf->tbf_lock);
3040
3041 if (ipst->ips_ip_mrtdebug > 1) {
3042 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3043 "tbf_reprcess_q: vif %ld timeout id = %p",
3044 (ptrdiff_t)(vifp - ipst->ips_vifs), vifp->v_timeout_id);
3045 }
3046 }
3047
3048 /*
3049 * Function that will selectively discard a member of the tbf queue,
3050 * based on the precedence value and the priority.
3051 *
3052 * NOTE : The caller should quntimeout if the queue length is 0.
3053 */
3054 static int
tbf_dq_sel(struct vif * vifp,ipha_t * ipha)3055 tbf_dq_sel(struct vif *vifp, ipha_t *ipha)
3056 {
3057 uint_t p;
3058 struct tbf *t = vifp->v_tbf;
3059 mblk_t **np;
3060 mblk_t *last, *mp;
3061 ill_t *ill = vifp->v_ipif->ipif_ill;
3062 ip_stack_t *ipst = ill->ill_ipst;
3063 conn_t *mrouter = ipst->ips_ip_g_mrouter;
3064
3065 if (ipst->ips_ip_mrtdebug > 1) {
3066 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3067 "dq_sel: vif %ld dst 0x%x",
3068 (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(ipha->ipha_dst));
3069 }
3070
3071 ASSERT(MUTEX_HELD(&t->tbf_lock));
3072 p = priority(vifp, ipha);
3073
3074 np = &t->tbf_q;
3075 last = NULL;
3076 while ((mp = *np) != NULL) {
3077 if (p > (priority(vifp, (ipha_t *)mp->b_rptr))) {
3078 *np = mp->b_next;
3079 /* If removing the last packet, fix the tail pointer */
3080 if (mp == t->tbf_t)
3081 t->tbf_t = last;
3082 mp->b_prev = mp->b_next = NULL;
3083 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
3084 ip_drop_output("tbf_dq_send", mp, ill);
3085 freemsg(mp);
3086 /*
3087 * It's impossible for the queue to be empty, but
3088 * we check anyway.
3089 */
3090 if (--t->tbf_q_len == 0) {
3091 t->tbf_t = NULL;
3092 }
3093 ipst->ips_mrtstat->mrts_drop_sel++;
3094 return (1);
3095 }
3096 np = &mp->b_next;
3097 last = mp;
3098 }
3099 return (0);
3100 }
3101
3102 /* Sends packet, 2 cases - encap tunnel, phyint. */
3103 static void
tbf_send_packet(struct vif * vifp,mblk_t * mp)3104 tbf_send_packet(struct vif *vifp, mblk_t *mp)
3105 {
3106 ipif_t *ipif = vifp->v_ipif;
3107 ill_t *ill = ipif->ipif_ill;
3108 ip_stack_t *ipst = ill->ill_ipst;
3109 conn_t *mrouter = ipst->ips_ip_g_mrouter;
3110 ipha_t *ipha;
3111
3112 ipha = (ipha_t *)mp->b_rptr;
3113 /* If encap tunnel options */
3114 if (vifp->v_flags & VIFF_TUNNEL) {
3115 ip_xmit_attr_t ixas;
3116
3117 if (ipst->ips_ip_mrtdebug > 1) {
3118 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3119 "tbf_send_packet: ENCAP tunnel vif %ld",
3120 (ptrdiff_t)(vifp - ipst->ips_vifs));
3121 }
3122 bzero(&ixas, sizeof (ixas));
3123 ixas.ixa_flags =
3124 IXAF_IS_IPV4 | IXAF_NO_TTL_CHANGE | IXAF_VERIFY_SOURCE;
3125 ixas.ixa_ipst = ipst;
3126 ixas.ixa_ifindex = 0;
3127 ixas.ixa_cred = kcred;
3128 ixas.ixa_cpid = NOPID;
3129 ixas.ixa_tsl = NULL;
3130 ixas.ixa_zoneid = GLOBAL_ZONEID; /* Multicast router in GZ */
3131 ixas.ixa_pktlen = ntohs(ipha->ipha_length);
3132 ixas.ixa_ip_hdr_length = IPH_HDR_LENGTH(ipha);
3133
3134 /*
3135 * Feed into ip_output_simple which will set the ident field
3136 * and checksum the encapsulating header.
3137 * BSD gets the cached route vifp->v_route from ip_output()
3138 * to speed up route table lookups. Not necessary in SunOS 5.x.
3139 * One could make multicast forwarding faster by putting an
3140 * ip_xmit_attr_t in each vif thereby caching the ire/nce.
3141 */
3142 (void) ip_output_simple(mp, &ixas);
3143 ixa_cleanup(&ixas);
3144 return;
3145
3146 /* phyint */
3147 } else {
3148 /* Need to loop back to members on the outgoing interface. */
3149 ipaddr_t dst;
3150 ip_recv_attr_t iras;
3151 nce_t *nce;
3152
3153 bzero(&iras, sizeof (iras));
3154 iras.ira_flags = IRAF_IS_IPV4;
3155 iras.ira_ill = iras.ira_rill = ill;
3156 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
3157 iras.ira_zoneid = GLOBAL_ZONEID; /* Multicast router in GZ */
3158 iras.ira_pktlen = ntohs(ipha->ipha_length);
3159 iras.ira_ip_hdr_length = IPH_HDR_LENGTH(ipha);
3160
3161 dst = ipha->ipha_dst;
3162 if (ill_hasmembers_v4(ill, dst)) {
3163 iras.ira_flags |= IRAF_LOOPBACK_COPY;
3164 }
3165 if (ipst->ips_ip_mrtdebug > 1) {
3166 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3167 "tbf_send_pkt: phyint forward vif %ld dst = 0x%x",
3168 (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(dst));
3169 }
3170 /*
3171 * Find an NCE which matches the nexthop.
3172 * For a pt-pt interface we use the other end of the pt-pt
3173 * link.
3174 */
3175 if (ipif->ipif_flags & IPIF_POINTOPOINT) {
3176 dst = ipif->ipif_pp_dst_addr;
3177 nce = arp_nce_init(ill, dst, ill->ill_net_type);
3178 } else {
3179 nce = arp_nce_init(ill, dst, IRE_MULTICAST);
3180 }
3181 if (nce == NULL) {
3182 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
3183 ip_drop_output("tbf_send_packet - no nce", mp, ill);
3184 freemsg(mp);
3185 return;
3186 }
3187
3188 /*
3189 * We don't remeber the incoming ill. Thus we
3190 * pretend the packet arrived on the outbound ill. This means
3191 * statistics for input errors will be increased on the wrong
3192 * ill but that isn't a big deal.
3193 */
3194 ip_forward_xmit_v4(nce, ill, mp, ipha, &iras, ill->ill_mc_mtu,
3195 0);
3196 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
3197
3198 nce_refrele(nce);
3199 }
3200 }
3201
3202 /*
3203 * Determine the current time and then the elapsed time (between the last time
3204 * and time now). Update the no. of tokens in the bucket.
3205 */
3206 static void
tbf_update_tokens(struct vif * vifp)3207 tbf_update_tokens(struct vif *vifp)
3208 {
3209 timespec_t tp;
3210 hrtime_t tm;
3211 struct tbf *t = vifp->v_tbf;
3212 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
3213 conn_t *mrouter = ipst->ips_ip_g_mrouter;
3214
3215 ASSERT(MUTEX_HELD(&t->tbf_lock));
3216
3217 /* Time in secs and nsecs, rate limit in kbits/sec */
3218 gethrestime(&tp);
3219
3220 /*LINTED*/
3221 TV_DELTA(tp, t->tbf_last_pkt_t, tm);
3222
3223 /*
3224 * This formula is actually
3225 * "time in seconds" * "bytes/second". Scaled for nsec.
3226 * (tm/1000000000) * (v_rate_limit * 1000 * (1000/1024) /8)
3227 *
3228 * The (1000/1024) was introduced in add_vif to optimize
3229 * this divide into a shift.
3230 */
3231 t->tbf_n_tok += (tm/1000) * vifp->v_rate_limit / 1024 / 8;
3232 t->tbf_last_pkt_t = tp;
3233
3234 if (t->tbf_n_tok > MAX_BKT_SIZE)
3235 t->tbf_n_tok = MAX_BKT_SIZE;
3236 if (ipst->ips_ip_mrtdebug > 1) {
3237 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3238 "tbf_update_tok: tm %lld tok %d vif %ld",
3239 tm, t->tbf_n_tok, (ptrdiff_t)(vifp - ipst->ips_vifs));
3240 }
3241 }
3242
3243 /*
3244 * Priority currently is based on port nos.
3245 * Different forwarding mechanisms have different ways
3246 * of obtaining the port no. Hence, the vif must be
3247 * given along with the packet itself.
3248 *
3249 */
3250 static int
priority(struct vif * vifp,ipha_t * ipha)3251 priority(struct vif *vifp, ipha_t *ipha)
3252 {
3253 int prio;
3254 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
3255 conn_t *mrouter = ipst->ips_ip_g_mrouter;
3256
3257 /* Temporary hack; may add general packet classifier some day */
3258
3259 ASSERT(MUTEX_HELD(&vifp->v_tbf->tbf_lock));
3260
3261 /*
3262 * The UDP port space is divided up into four priority ranges:
3263 * [0, 16384) : unclassified - lowest priority
3264 * [16384, 32768) : audio - highest priority
3265 * [32768, 49152) : whiteboard - medium priority
3266 * [49152, 65536) : video - low priority
3267 */
3268
3269 if (ipha->ipha_protocol == IPPROTO_UDP) {
3270 struct udphdr *udp =
3271 (struct udphdr *)((char *)ipha + IPH_HDR_LENGTH(ipha));
3272 switch (ntohs(udp->uh_dport) & 0xc000) {
3273 case 0x4000:
3274 prio = 70;
3275 break;
3276 case 0x8000:
3277 prio = 60;
3278 break;
3279 case 0xc000:
3280 prio = 55;
3281 break;
3282 default:
3283 prio = 50;
3284 break;
3285 }
3286 if (ipst->ips_ip_mrtdebug > 1) {
3287 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3288 "priority: port %x prio %d\n",
3289 ntohs(udp->uh_dport), prio);
3290 }
3291 } else
3292 prio = 50; /* default priority */
3293 return (prio);
3294 }
3295
3296 /*
3297 * End of token bucket filter modifications
3298 */
3299
3300
3301
3302 /*
3303 * Produces data for netstat -M.
3304 */
3305 int
ip_mroute_stats(mblk_t * mp,ip_stack_t * ipst)3306 ip_mroute_stats(mblk_t *mp, ip_stack_t *ipst)
3307 {
3308 ipst->ips_mrtstat->mrts_vifctlSize = sizeof (struct vifctl);
3309 ipst->ips_mrtstat->mrts_mfcctlSize = sizeof (struct mfcctl);
3310 if (!snmp_append_data(mp, (char *)ipst->ips_mrtstat,
3311 sizeof (struct mrtstat))) {
3312 ip0dbg(("ip_mroute_stats: failed %ld bytes\n",
3313 (size_t)sizeof (struct mrtstat)));
3314 return (0);
3315 }
3316 return (1);
3317 }
3318
3319 /*
3320 * Sends info for SNMP's MIB.
3321 */
3322 int
ip_mroute_vif(mblk_t * mp,ip_stack_t * ipst)3323 ip_mroute_vif(mblk_t *mp, ip_stack_t *ipst)
3324 {
3325 struct vifctl vi;
3326 vifi_t vifi;
3327
3328 mutex_enter(&ipst->ips_numvifs_mutex);
3329 for (vifi = 0; vifi < ipst->ips_numvifs; vifi++) {
3330 if (ipst->ips_vifs[vifi].v_lcl_addr.s_addr == 0)
3331 continue;
3332 /*
3333 * No locks here, an approximation is fine.
3334 */
3335 vi.vifc_vifi = vifi;
3336 vi.vifc_flags = ipst->ips_vifs[vifi].v_flags;
3337 vi.vifc_threshold = ipst->ips_vifs[vifi].v_threshold;
3338 vi.vifc_rate_limit = ipst->ips_vifs[vifi].v_rate_limit;
3339 vi.vifc_lcl_addr = ipst->ips_vifs[vifi].v_lcl_addr;
3340 vi.vifc_rmt_addr = ipst->ips_vifs[vifi].v_rmt_addr;
3341 vi.vifc_pkt_in = ipst->ips_vifs[vifi].v_pkt_in;
3342 vi.vifc_pkt_out = ipst->ips_vifs[vifi].v_pkt_out;
3343
3344 if (!snmp_append_data(mp, (char *)&vi, sizeof (vi))) {
3345 ip0dbg(("ip_mroute_vif: failed %ld bytes\n",
3346 (size_t)sizeof (vi)));
3347 mutex_exit(&ipst->ips_numvifs_mutex);
3348 return (0);
3349 }
3350 }
3351 mutex_exit(&ipst->ips_numvifs_mutex);
3352 return (1);
3353 }
3354
3355 /*
3356 * Called by ip_snmp_get to send up multicast routing table.
3357 */
3358 int
ip_mroute_mrt(mblk_t * mp,ip_stack_t * ipst)3359 ip_mroute_mrt(mblk_t *mp, ip_stack_t *ipst)
3360 {
3361 int i, j;
3362 struct mfc *rt;
3363 struct mfcctl mfcc;
3364
3365 /*
3366 * Make sure multicast has not been turned off.
3367 */
3368 if (is_mrouter_off(ipst))
3369 return (1);
3370
3371 /* Loop over all hash buckets and their chains */
3372 for (i = 0; i < MFCTBLSIZ; i++) {
3373 MFCB_REFHOLD(&ipst->ips_mfcs[i]);
3374 for (rt = ipst->ips_mfcs[i].mfcb_mfc; rt; rt = rt->mfc_next) {
3375 mutex_enter(&rt->mfc_mutex);
3376 if (rt->mfc_rte != NULL ||
3377 (rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
3378 mutex_exit(&rt->mfc_mutex);
3379 continue;
3380 }
3381 mfcc.mfcc_origin = rt->mfc_origin;
3382 mfcc.mfcc_mcastgrp = rt->mfc_mcastgrp;
3383 mfcc.mfcc_parent = rt->mfc_parent;
3384 mfcc.mfcc_pkt_cnt = rt->mfc_pkt_cnt;
3385 mutex_enter(&ipst->ips_numvifs_mutex);
3386 for (j = 0; j < (int)ipst->ips_numvifs; j++)
3387 mfcc.mfcc_ttls[j] = rt->mfc_ttls[j];
3388 for (j = (int)ipst->ips_numvifs; j < MAXVIFS; j++)
3389 mfcc.mfcc_ttls[j] = 0;
3390 mutex_exit(&ipst->ips_numvifs_mutex);
3391
3392 mutex_exit(&rt->mfc_mutex);
3393 if (!snmp_append_data(mp, (char *)&mfcc,
3394 sizeof (mfcc))) {
3395 MFCB_REFRELE(&ipst->ips_mfcs[i]);
3396 ip0dbg(("ip_mroute_mrt: failed %ld bytes\n",
3397 (size_t)sizeof (mfcc)));
3398 return (0);
3399 }
3400 }
3401 MFCB_REFRELE(&ipst->ips_mfcs[i]);
3402 }
3403 return (1);
3404 }
3405