1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24 /* Copyright (c) 1990 Mentat Inc. */
25
26 /*
27 * Procedures for the kernel part of DVMRP,
28 * a Distance-Vector Multicast Routing Protocol.
29 * (See RFC-1075)
30 * Written by David Waitzman, BBN Labs, August 1988.
31 * Modified by Steve Deering, Stanford, February 1989.
32 * Modified by Mark J. Steiglitz, Stanford, May, 1991
33 * Modified by Van Jacobson, LBL, January 1993
34 * Modified by Ajit Thyagarajan, PARC, August 1993
35 * Modified by Bill Fenner, PARC, April 1995
36 *
37 * MROUTING 3.5
38 */
39
40 /*
41 * TODO
42 * - function pointer field in vif, void *vif_sendit()
43 */
44
45 #include <sys/types.h>
46 #include <sys/stream.h>
47 #include <sys/stropts.h>
48 #include <sys/strlog.h>
49 #include <sys/systm.h>
50 #include <sys/ddi.h>
51 #include <sys/cmn_err.h>
52 #include <sys/zone.h>
53
54 #include <sys/param.h>
55 #include <sys/socket.h>
56 #include <sys/vtrace.h>
57 #include <sys/debug.h>
58 #include <net/if.h>
59 #include <sys/sockio.h>
60 #include <netinet/in.h>
61 #include <net/if_dl.h>
62
63 #include <inet/ipsec_impl.h>
64 #include <inet/common.h>
65 #include <inet/mi.h>
66 #include <inet/nd.h>
67 #include <inet/tunables.h>
68 #include <inet/mib2.h>
69 #include <netinet/ip6.h>
70 #include <inet/ip.h>
71 #include <inet/snmpcom.h>
72
73 #include <netinet/igmp.h>
74 #include <netinet/igmp_var.h>
75 #include <netinet/udp.h>
76 #include <netinet/ip_mroute.h>
77 #include <inet/ip_multi.h>
78 #include <inet/ip_ire.h>
79 #include <inet/ip_ndp.h>
80 #include <inet/ip_if.h>
81 #include <inet/ipclassifier.h>
82
83 #include <netinet/pim.h>
84
85
86 /*
87 * MT Design:
88 *
89 * There are three main data structures viftable, mfctable and tbftable that
90 * need to be protected against MT races.
91 *
92 * viftable is a fixed length array of vif structs. There is no lock to protect
93 * the whole array, instead each struct is protected by its own individual lock.
94 * The value of v_marks in conjunction with the value of v_refcnt determines the
95 * current state of a vif structure. One special state that needs mention
96 * is when the vif is marked VIF_MARK_NOTINUSE but refcnt != 0. This indicates
97 * that vif is being initialized.
98 * Each structure is freed when the refcnt goes down to zero. If a delete comes
99 * in when the refcnt is > 1, the vif structure is marked VIF_MARK_CONDEMNED
100 * which prevents the struct from further use. When the refcnt goes to zero
101 * the struct is freed and is marked VIF_MARK_NOTINUSE.
102 * vif struct stores a pointer to the ipif in v_ipif, to prevent ipif/ill
103 * from going away a refhold is put on the ipif before using it. see
104 * lock_good_vif() and unlock_good_vif().
105 *
106 * VIF_REFHOLD and VIF_REFRELE macros have been provided to manipulate refcnts
107 * of the vif struct.
108 *
109 * tbftable is also a fixed length array of tbf structs and is only accessed
110 * via v_tbf. It is protected by its own lock tbf_lock.
111 *
112 * Lock Ordering is
113 * v_lock --> tbf_lock
114 * v_lock --> ill_lock
115 *
116 * mfctable is a fixed size hash table of mfc bucket structs (struct mfcb).
117 * Each mfc bucket struct (struct mfcb) maintains a refcnt for each walker,
118 * it also maintains a state. These fields are protected by a lock (mfcb_lock).
119 * mfc structs only maintain a state and have no refcnt. mfc_mutex is used to
120 * protect the struct elements.
121 *
122 * mfc structs are dynamically allocated and are singly linked
123 * at the head of the chain. When an mfc structure is to be deleted
124 * it is marked condemned and so is the state in the bucket struct.
125 * When the last walker of the hash bucket exits all the mfc structs
126 * marked condemned are freed.
127 *
128 * Locking Hierarchy:
129 * The bucket lock should be acquired before the mfc struct lock.
130 * MFCB_REFHOLD and MFCB_REFRELE macros are provided for locking
131 * operations on the bucket struct.
132 *
133 * last_encap_lock and numvifs_mutex should be acquired after
134 * acquiring vif or mfc locks. These locks protect some global variables.
135 *
136 * The statistics are not currently protected by a lock,
137 * causing the stats to be approximate, not exact.
138 */
139
/*
 * Sentinel vif index.  It equals MAXVIFS, which add_vif() rejects as out of
 * range, so it can never name a usable vif table slot.
 */
140 #define NO_VIF MAXVIFS /* from mrouted, no route for src */
141
142 /*
143 * Timeouts:
144 * Upcall timeouts - BSD uses boolean_t mfc->expire and
145 * nexpire[MFCTBLSIZE], the number of times expire has been called.
146 * SunOS 5.x uses mfc->timeout for each mfc.
147 * Some Unixes are limited in the number of simultaneous timeouts
148 * that can be run, SunOS 5.x does not have this restriction.
149 */
150
151 /*
152 * In BSD, EXPIRE_TIMEOUT is how often expire_upcalls() is called and
153 * UPCALL_EXPIRE is the number of timeouts before a particular upcall
154 * expires. Thus the time till expiration is EXPIRE_TIMEOUT * UPCALL_EXPIRE.
155 */
156 #define EXPIRE_TIMEOUT (hz/4) /* in clock ticks: fires 4x / second */
157 #define UPCALL_EXPIRE 6 /* number of timeouts before an upcall expires */
158
159 /*
160 * Hash function for a source, group entry
161 */
/*
 * XORs each of the two addresses with copies of itself shifted right by 10
 * and by 20 bits, then hands the combined value to MFCHASHMOD().  The result
 * is used as an index into ips_mfcs[] (see get_sg_cnt()).
 */
162 #define MFCHASH(a, g) MFCHASHMOD(((a) >> 20) ^ ((a) >> 10) ^ (a) ^ \
163 ((g) >> 20) ^ ((g) >> 10) ^ (g))
164
165 #define TBF_REPROCESS (hz / 100) /* in clock ticks: 100x / second */
166
167 /* Identify PIM packet that came on a Register interface */
168 #define PIM_REGISTER_MARKER 0xffffffff /* all 32 bits set */
169
170 /* Function declarations */
171 static int add_mfc(struct mfcctl *, ip_stack_t *);
172 static int add_vif(struct vifctl *, conn_t *, ip_stack_t *);
173 static int del_mfc(struct mfcctl *, ip_stack_t *);
174 static int del_vif(vifi_t *, ip_stack_t *);
175 static void del_vifp(struct vif *);
176 static void encap_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
177 static void expire_upcalls(void *);
178 static void fill_route(struct mfc *, struct mfcctl *, ip_stack_t *);
179 static void free_queue(struct mfc *);
180 static int get_assert(uchar_t *, ip_stack_t *);
181 static int get_lsg_cnt(struct sioc_lsg_req *, ip_stack_t *);
182 static int get_sg_cnt(struct sioc_sg_req *, ip_stack_t *);
183 static int get_version(uchar_t *);
184 static int get_vif_cnt(struct sioc_vif_req *, ip_stack_t *);
185 static int ip_mdq(mblk_t *, ipha_t *, ill_t *,
186 ipaddr_t, struct mfc *);
187 static int ip_mrouter_init(conn_t *, uchar_t *, int, ip_stack_t *);
188 static void phyint_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
189 static int register_mforward(mblk_t *, ip_recv_attr_t *);
190 static void register_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
191 static int set_assert(int *, ip_stack_t *);
192
193 /*
194 * Token Bucket Filter functions
195 */
196 static int priority(struct vif *, ipha_t *);
197 static void tbf_control(struct vif *, mblk_t *, ipha_t *);
198 static int tbf_dq_sel(struct vif *, ipha_t *);
199 static void tbf_process_q(struct vif *);
200 static void tbf_queue(struct vif *, mblk_t *);
201 static void tbf_reprocess_q(void *);
202 static void tbf_send_packet(struct vif *, mblk_t *);
203 static void tbf_update_tokens(struct vif *);
204 static void release_mfc(struct mfcb *);
205
206 static boolean_t is_mrouter_off(ip_stack_t *);
207 /*
208 * Encapsulation packets
209 */
210
211 #define ENCAP_TTL 64 /* ttl used in multicast_encap_iphdr */
212
213 /* prototype IP hdr for encapsulated packets */
214 static ipha_t multicast_encap_iphdr = {
215 IP_SIMPLE_HDR_VERSION,
216 0, /* tos */
217 sizeof (ipha_t), /* total length */
218 0, /* id */
219 0, /* frag offset */
220 ENCAP_TTL, IPPROTO_ENCAP,
221 0, /* checksum */
222 };
223
224 /*
225 * Rate limit for assert notification messages, in nsec.
226 */
227 #define ASSERT_MSG_TIME 3000000000 /* 3 seconds; does not fit in 32 bits */
228
229
/*
 * Take a reference on a vif.  The macro acquires and releases v_lock
 * itself around the v_refcnt increment.
 */
#define	VIF_REFHOLD(vifp) {				\
	mutex_enter(&(vifp)->v_lock);			\
	++(vifp)->v_refcnt;				\
	mutex_exit(&(vifp)->v_lock);			\
}
235
/*
 * Drop a reference on a vif whose v_lock is already held by the caller.
 * When the count reaches zero on a vif marked VIF_MARK_CONDEMNED, del_vifp()
 * is called with v_lock still held; in every other case v_lock is released.
 */
#define	VIF_REFRELE_LOCKED(vifp) {				\
	if (--(vifp)->v_refcnt == 0 &&				\
	    ((vifp)->v_marks & VIF_MARK_CONDEMNED) != 0)	\
		del_vifp(vifp);					\
	else							\
		mutex_exit(&(vifp)->v_lock);			\
}
245
/*
 * Drop a reference on a vif.  Acquires v_lock and then performs the locked
 * release: the last reference on a condemned vif triggers del_vifp(),
 * otherwise v_lock is dropped again.
 */
#define	VIF_REFRELE(vifp) {				\
	mutex_enter(&(vifp)->v_lock);			\
	VIF_REFRELE_LOCKED(vifp);			\
}
256
/*
 * Register a walker on an mfc hash bucket: increment mfcb_refcnt under
 * mfcb_lock.  The ASSERT catches the counter wrapping to zero.
 */
#define	MFCB_REFHOLD(mfcb) {				\
	mutex_enter(&(mfcb)->mfcb_lock);		\
	++(mfcb)->mfcb_refcnt;				\
	ASSERT((mfcb)->mfcb_refcnt != 0);		\
	mutex_exit(&(mfcb)->mfcb_lock);			\
}
263
/*
 * Unregister a walker from an mfc hash bucket.  If this was the last walker
 * and the bucket is marked MFCB_MARK_CONDEMNED, release_mfc() is called
 * while mfcb_lock is still held.
 */
#define	MFCB_REFRELE(mfcb) {					\
	mutex_enter(&(mfcb)->mfcb_lock);			\
	ASSERT((mfcb)->mfcb_refcnt != 0);			\
	(mfcb)->mfcb_refcnt--;					\
	if ((mfcb)->mfcb_refcnt == 0 &&				\
	    ((mfcb)->mfcb_marks & MFCB_MARK_CONDEMNED) != 0)	\
		release_mfc(mfcb);				\
	mutex_exit(&(mfcb)->mfcb_lock);				\
}
273
/*
 * MFCFIND:
 * Find a route for a given origin IP address and multicast group address.
 *
 * Walks the chain hanging off bucket `mfcbp' and stores in `rt' the first
 * entry whose mfc_origin.s_addr equals `o' and whose mfc_mcastgrp.s_addr
 * equals `g'.  Entries with pending upcalls (mfc_rte != NULL) and entries
 * marked condemned are skipped.  `rt' is set to NULL when nothing matches.
 *
 * `rt' must be an lvalue of type struct mfc *.  Every argument is
 * parenthesized in the expansion, so expressions such as `&bucket' are
 * safe to pass; note that `o' and `g' are evaluated once per chain entry.
 *
 * Type of service parameter to be added in the future!
 */
#define	MFCFIND(mfcbp, o, g, rt) {					\
	struct mfc *_mb_rt;						\
									\
	(rt) = NULL;							\
	for (_mb_rt = (mfcbp)->mfcb_mfc; _mb_rt != NULL;		\
	    _mb_rt = _mb_rt->mfc_next) {				\
		if ((_mb_rt->mfc_origin.s_addr == (o)) &&		\
		    (_mb_rt->mfc_mcastgrp.s_addr == (g)) &&		\
		    (_mb_rt->mfc_rte == NULL) &&			\
		    (!(_mb_rt->mfc_marks & MFCB_MARK_CONDEMNED))) {	\
			(rt) = _mb_rt;					\
			break;						\
		}							\
	}								\
}
295
296 /*
297 * BSD uses timeval with sec and usec. In SunOS 5.x uniqtime() and gethrtime()
298 * are inefficient. We use gethrestime() which returns a timespec_t with
299 * sec and nsec, the resolution is machine dependent.
300 * The following 2 macros have been changed to use nsec instead of usec.
301 */
/*
 * Macros to compute elapsed time efficiently.
 * Borrowed from Van Jacobson's scheduling code.
 *
 * TV_DELTA(a, b, delta) stores (a - b), in nanoseconds, into `delta'.
 * Delta should be a hrtime_t.  Differences of one or two seconds take the
 * cheap add-only paths; any other difference multiplies in 64-bit
 * arithmetic so that gaps of three seconds or more (or negative gaps)
 * do not overflow a 32-bit int.
 */
#define	TV_DELTA(a, b, delta) {						\
	int xxs;							\
									\
	delta = (a).tv_nsec - (b).tv_nsec;				\
	if ((xxs = (a).tv_sec - (b).tv_sec) != 0) {			\
		switch (xxs) {						\
		case 2:							\
			delta += 1000000000;				\
			/*FALLTHROUGH*/					\
		case 1:							\
			delta += 1000000000;				\
			break;						\
		default:						\
			delta += (1000000000LL * xxs);			\
		}							\
	}								\
}
324
/*
 * TV_LT(a, b): true when a.tv_sec is smaller than b.tv_sec, or when
 * a.tv_sec does not exceed b.tv_sec and a.tv_nsec is smaller than b.tv_nsec.
 */
#define	TV_LT(a, b)	((a).tv_sec < (b).tv_sec ||			\
	((a).tv_sec <= (b).tv_sec && (a).tv_nsec < (b).tv_nsec))
327
328 /*
329 * Handle MRT setsockopt commands to modify the multicast routing tables.
330 */
331 int
ip_mrouter_set(int cmd,conn_t * connp,int checkonly,uchar_t * data,int datalen)332 ip_mrouter_set(int cmd, conn_t *connp, int checkonly, uchar_t *data,
333 int datalen)
334 {
335 ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
336
337 mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
338 if (cmd != MRT_INIT && connp != ipst->ips_ip_g_mrouter) {
339 mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
340 return (EACCES);
341 }
342 mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
343
344 if (checkonly) {
345 /*
346 * do not do operation, just pretend to - new T_CHECK
347 * Note: Even routines further on can probably fail but
348 * this T_CHECK stuff is only to please XTI so it not
349 * necessary to be perfect.
350 */
351 switch (cmd) {
352 case MRT_INIT:
353 case MRT_DONE:
354 case MRT_ADD_VIF:
355 case MRT_DEL_VIF:
356 case MRT_ADD_MFC:
357 case MRT_DEL_MFC:
358 case MRT_ASSERT:
359 return (0);
360 default:
361 return (EOPNOTSUPP);
362 }
363 }
364
365 /*
366 * make sure no command is issued after multicast routing has been
367 * turned off.
368 */
369 if (cmd != MRT_INIT && cmd != MRT_DONE) {
370 if (is_mrouter_off(ipst))
371 return (EINVAL);
372 }
373
374 switch (cmd) {
375 case MRT_INIT: return (ip_mrouter_init(connp, data, datalen, ipst));
376 case MRT_DONE: return (ip_mrouter_done(ipst));
377 case MRT_ADD_VIF: return (add_vif((struct vifctl *)data, connp, ipst));
378 case MRT_DEL_VIF: return (del_vif((vifi_t *)data, ipst));
379 case MRT_ADD_MFC: return (add_mfc((struct mfcctl *)data, ipst));
380 case MRT_DEL_MFC: return (del_mfc((struct mfcctl *)data, ipst));
381 case MRT_ASSERT: return (set_assert((int *)data, ipst));
382 default: return (EOPNOTSUPP);
383 }
384 }
385
386 /*
387 * Handle MRT getsockopt commands
388 */
389 int
ip_mrouter_get(int cmd,conn_t * connp,uchar_t * data)390 ip_mrouter_get(int cmd, conn_t *connp, uchar_t *data)
391 {
392 ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
393
394 if (connp != ipst->ips_ip_g_mrouter)
395 return (EACCES);
396
397 switch (cmd) {
398 case MRT_VERSION: return (get_version((uchar_t *)data));
399 case MRT_ASSERT: return (get_assert((uchar_t *)data, ipst));
400 default: return (EOPNOTSUPP);
401 }
402 }
403
404 /*
405 * Handle ioctl commands to obtain information from the cache.
406 * Called with shared access to IP. These are read_only ioctls.
407 */
408 /* ARGSUSED */
409 int
mrt_ioctl(ipif_t * ipif,sin_t * sin,queue_t * q,mblk_t * mp,ip_ioctl_cmd_t * ipip,void * if_req)410 mrt_ioctl(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
411 ip_ioctl_cmd_t *ipip, void *if_req)
412 {
413 mblk_t *mp1;
414 struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
415 conn_t *connp = Q_TO_CONN(q);
416 ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
417
418 /* Existence verified in ip_wput_nondata */
419 mp1 = mp->b_cont->b_cont;
420
421 switch (iocp->ioc_cmd) {
422 case (SIOCGETVIFCNT):
423 return (get_vif_cnt((struct sioc_vif_req *)mp1->b_rptr, ipst));
424 case (SIOCGETSGCNT):
425 return (get_sg_cnt((struct sioc_sg_req *)mp1->b_rptr, ipst));
426 case (SIOCGETLSGCNT):
427 return (get_lsg_cnt((struct sioc_lsg_req *)mp1->b_rptr, ipst));
428 default:
429 return (EINVAL);
430 }
431 }
432
433 /*
434 * Returns the packet, byte, rpf-failure count for the source, group provided.
435 */
436 static int
get_sg_cnt(struct sioc_sg_req * req,ip_stack_t * ipst)437 get_sg_cnt(struct sioc_sg_req *req, ip_stack_t *ipst)
438 {
439 struct mfc *rt;
440 struct mfcb *mfcbp;
441
442 mfcbp = &ipst->ips_mfcs[MFCHASH(req->src.s_addr, req->grp.s_addr)];
443 MFCB_REFHOLD(mfcbp);
444 MFCFIND(mfcbp, req->src.s_addr, req->grp.s_addr, rt);
445
446 if (rt != NULL) {
447 mutex_enter(&rt->mfc_mutex);
448 req->pktcnt = rt->mfc_pkt_cnt;
449 req->bytecnt = rt->mfc_byte_cnt;
450 req->wrong_if = rt->mfc_wrong_if;
451 mutex_exit(&rt->mfc_mutex);
452 } else
453 req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffffU;
454
455 MFCB_REFRELE(mfcbp);
456 return (0);
457 }
458
459 /*
460 * Returns the packet, byte, rpf-failure count for the source, group provided.
461 * Uses larger counters and IPv6 addresses.
462 */
463 /* ARGSUSED XXX until implemented */
464 static int
get_lsg_cnt(struct sioc_lsg_req * req,ip_stack_t * ipst)465 get_lsg_cnt(struct sioc_lsg_req *req, ip_stack_t *ipst)
466 {
467 /* XXX TODO SIOCGETLSGCNT */
468 return (ENXIO);
469 }
470
471 /*
472 * Returns the input and output packet and byte counts on the vif provided.
473 */
474 static int
get_vif_cnt(struct sioc_vif_req * req,ip_stack_t * ipst)475 get_vif_cnt(struct sioc_vif_req *req, ip_stack_t *ipst)
476 {
477 vifi_t vifi = req->vifi;
478
479 if (vifi >= ipst->ips_numvifs)
480 return (EINVAL);
481
482 /*
483 * No locks here, an approximation is fine.
484 */
485 req->icount = ipst->ips_vifs[vifi].v_pkt_in;
486 req->ocount = ipst->ips_vifs[vifi].v_pkt_out;
487 req->ibytes = ipst->ips_vifs[vifi].v_bytes_in;
488 req->obytes = ipst->ips_vifs[vifi].v_bytes_out;
489
490 return (0);
491 }
492
493 static int
get_version(uchar_t * data)494 get_version(uchar_t *data)
495 {
496 int *v = (int *)data;
497
498 *v = 0x0305; /* XXX !!!! */
499
500 return (0);
501 }
502
503 /*
504 * Set PIM assert processing global.
505 */
506 static int
set_assert(int * i,ip_stack_t * ipst)507 set_assert(int *i, ip_stack_t *ipst)
508 {
509 if ((*i != 1) && (*i != 0))
510 return (EINVAL);
511
512 ipst->ips_pim_assert = *i;
513
514 return (0);
515 }
516
517 /*
518 * Get PIM assert processing global.
519 */
520 static int
get_assert(uchar_t * data,ip_stack_t * ipst)521 get_assert(uchar_t *data, ip_stack_t *ipst)
522 {
523 int *i = (int *)data;
524
525 *i = ipst->ips_pim_assert;
526
527 return (0);
528 }
529
530 /*
531 * Enable multicast routing.
532 */
533 static int
ip_mrouter_init(conn_t * connp,uchar_t * data,int datalen,ip_stack_t * ipst)534 ip_mrouter_init(conn_t *connp, uchar_t *data, int datalen, ip_stack_t *ipst)
535 {
536 int *v;
537
538 if (data == NULL || (datalen != sizeof (int)))
539 return (ENOPROTOOPT);
540
541 v = (int *)data;
542 if (*v != 1)
543 return (ENOPROTOOPT);
544
545 mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
546 if (ipst->ips_ip_g_mrouter != NULL) {
547 mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
548 return (EADDRINUSE);
549 }
550
551 /*
552 * MRT_INIT should only be allowed for RAW sockets, but we double
553 * check.
554 */
555 if (!IPCL_IS_RAWIP(connp)) {
556 mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
557 return (EINVAL);
558 }
559
560 ipst->ips_ip_g_mrouter = connp;
561 connp->conn_multi_router = 1;
562 /* In order for tunnels to work we have to turn ip_g_forward on */
563 if (!WE_ARE_FORWARDING(ipst)) {
564 if (ipst->ips_ip_mrtdebug > 1) {
565 (void) mi_strlog(connp->conn_rq, 1, SL_TRACE,
566 "ip_mrouter_init: turning on forwarding");
567 }
568 ipst->ips_saved_ip_forwarding = ipst->ips_ip_forwarding;
569 ipst->ips_ip_forwarding = IP_FORWARD_ALWAYS;
570 }
571
572 mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
573 return (0);
574 }
575
576 void
ip_mrouter_stack_init(ip_stack_t * ipst)577 ip_mrouter_stack_init(ip_stack_t *ipst)
578 {
579 mutex_init(&ipst->ips_ip_g_mrouter_mutex, NULL, MUTEX_DEFAULT, NULL);
580
581 ipst->ips_vifs = kmem_zalloc(sizeof (struct vif) * (MAXVIFS+1),
582 KM_SLEEP);
583 ipst->ips_mrtstat = kmem_zalloc(sizeof (struct mrtstat), KM_SLEEP);
584 /*
585 * mfctable:
586 * Includes all mfcs, including waiting upcalls.
587 * Multiple mfcs per bucket.
588 */
589 ipst->ips_mfcs = kmem_zalloc(sizeof (struct mfcb) * MFCTBLSIZ,
590 KM_SLEEP);
591 /*
592 * Define the token bucket filter structures.
593 * tbftable -> each vif has one of these for storing info.
594 */
595 ipst->ips_tbfs = kmem_zalloc(sizeof (struct tbf) * MAXVIFS, KM_SLEEP);
596
597 mutex_init(&ipst->ips_last_encap_lock, NULL, MUTEX_DEFAULT, NULL);
598
599 ipst->ips_mrtstat->mrts_vifctlSize = sizeof (struct vifctl);
600 ipst->ips_mrtstat->mrts_mfcctlSize = sizeof (struct mfcctl);
601 }
602
603 /*
604 * Disable multicast routing.
605 * Didn't use global timeout_val (BSD version), instead check the mfctable.
606 */
607 int
ip_mrouter_done(ip_stack_t * ipst)608 ip_mrouter_done(ip_stack_t *ipst)
609 {
610 conn_t *mrouter;
611 vifi_t vifi;
612 struct mfc *mfc_rt;
613 int i;
614
615 mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
616 if (ipst->ips_ip_g_mrouter == NULL) {
617 mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
618 return (EINVAL);
619 }
620
621 mrouter = ipst->ips_ip_g_mrouter;
622
623 if (ipst->ips_saved_ip_forwarding != -1) {
624 if (ipst->ips_ip_mrtdebug > 1) {
625 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
626 "ip_mrouter_done: turning off forwarding");
627 }
628 ipst->ips_ip_forwarding = ipst->ips_saved_ip_forwarding;
629 ipst->ips_saved_ip_forwarding = -1;
630 }
631
632 /*
633 * Always clear cache when vifs change.
634 * No need to get ipst->ips_last_encap_lock since we are running as
635 * a writer.
636 */
637 mutex_enter(&ipst->ips_last_encap_lock);
638 ipst->ips_last_encap_src = 0;
639 ipst->ips_last_encap_vif = NULL;
640 mutex_exit(&ipst->ips_last_encap_lock);
641 mrouter->conn_multi_router = 0;
642
643 mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
644
645 /*
646 * For each phyint in use,
647 * disable promiscuous reception of all IP multicasts.
648 */
649 for (vifi = 0; vifi < MAXVIFS; vifi++) {
650 struct vif *vifp = ipst->ips_vifs + vifi;
651
652 mutex_enter(&vifp->v_lock);
653 /*
654 * if the vif is active mark it condemned.
655 */
656 if (vifp->v_marks & VIF_MARK_GOOD) {
657 ASSERT(vifp->v_ipif != NULL);
658 ipif_refhold(vifp->v_ipif);
659 /* Phyint only */
660 if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
661 ipif_t *ipif = vifp->v_ipif;
662 ilm_t *ilm = vifp->v_ilm;
663
664 vifp->v_ilm = NULL;
665 vifp->v_marks &= ~VIF_MARK_GOOD;
666 vifp->v_marks |= VIF_MARK_CONDEMNED;
667
668 mutex_exit(&(vifp)->v_lock);
669 if (ilm != NULL) {
670 ill_t *ill = ipif->ipif_ill;
671
672 (void) ip_delmulti(ilm);
673 ASSERT(ill->ill_mrouter_cnt > 0);
674 atomic_dec_32(&ill->ill_mrouter_cnt);
675 }
676 mutex_enter(&vifp->v_lock);
677 }
678 ipif_refrele(vifp->v_ipif);
679 /*
680 * decreases the refcnt added in add_vif.
681 * and release v_lock.
682 */
683 VIF_REFRELE_LOCKED(vifp);
684 } else {
685 mutex_exit(&vifp->v_lock);
686 continue;
687 }
688 }
689
690 mutex_enter(&ipst->ips_numvifs_mutex);
691 ipst->ips_numvifs = 0;
692 ipst->ips_pim_assert = 0;
693 ipst->ips_reg_vif_num = ALL_VIFS;
694 mutex_exit(&ipst->ips_numvifs_mutex);
695
696 /*
697 * Free upcall msgs.
698 * Go through mfctable and stop any outstanding upcall
699 * timeouts remaining on mfcs.
700 */
701 for (i = 0; i < MFCTBLSIZ; i++) {
702 mutex_enter(&ipst->ips_mfcs[i].mfcb_lock);
703 ipst->ips_mfcs[i].mfcb_refcnt++;
704 ipst->ips_mfcs[i].mfcb_marks |= MFCB_MARK_CONDEMNED;
705 mutex_exit(&ipst->ips_mfcs[i].mfcb_lock);
706 mfc_rt = ipst->ips_mfcs[i].mfcb_mfc;
707 while (mfc_rt) {
708 /* Free upcalls */
709 mutex_enter(&mfc_rt->mfc_mutex);
710 if (mfc_rt->mfc_rte != NULL) {
711 if (mfc_rt->mfc_timeout_id != 0) {
712 /*
713 * OK to drop the lock as we have
714 * a refcnt on the bucket. timeout
715 * can fire but it will see that
716 * mfc_timeout_id == 0 and not do
717 * anything. see expire_upcalls().
718 */
719 mfc_rt->mfc_timeout_id = 0;
720 mutex_exit(&mfc_rt->mfc_mutex);
721 (void) untimeout(
722 mfc_rt->mfc_timeout_id);
723 mfc_rt->mfc_timeout_id = 0;
724 mutex_enter(&mfc_rt->mfc_mutex);
725
726 /*
727 * all queued upcall packets
728 * and mblk will be freed in
729 * release_mfc().
730 */
731 }
732 }
733
734 mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED;
735
736 mutex_exit(&mfc_rt->mfc_mutex);
737 mfc_rt = mfc_rt->mfc_next;
738 }
739 MFCB_REFRELE(&ipst->ips_mfcs[i]);
740 }
741
742 mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
743 ipst->ips_ip_g_mrouter = NULL;
744 mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
745 return (0);
746 }
747
748 void
ip_mrouter_stack_destroy(ip_stack_t * ipst)749 ip_mrouter_stack_destroy(ip_stack_t *ipst)
750 {
751 struct mfcb *mfcbp;
752 struct mfc *rt;
753 int i;
754
755 for (i = 0; i < MFCTBLSIZ; i++) {
756 mfcbp = &ipst->ips_mfcs[i];
757
758 while ((rt = mfcbp->mfcb_mfc) != NULL) {
759 (void) printf("ip_mrouter_stack_destroy: free for %d\n",
760 i);
761
762 mfcbp->mfcb_mfc = rt->mfc_next;
763 free_queue(rt);
764 mi_free(rt);
765 }
766 }
767 kmem_free(ipst->ips_vifs, sizeof (struct vif) * (MAXVIFS+1));
768 ipst->ips_vifs = NULL;
769 kmem_free(ipst->ips_mrtstat, sizeof (struct mrtstat));
770 ipst->ips_mrtstat = NULL;
771 kmem_free(ipst->ips_mfcs, sizeof (struct mfcb) * MFCTBLSIZ);
772 ipst->ips_mfcs = NULL;
773 kmem_free(ipst->ips_tbfs, sizeof (struct tbf) * MAXVIFS);
774 ipst->ips_tbfs = NULL;
775
776 mutex_destroy(&ipst->ips_last_encap_lock);
777 mutex_destroy(&ipst->ips_ip_g_mrouter_mutex);
778 }
779
780 static boolean_t
is_mrouter_off(ip_stack_t * ipst)781 is_mrouter_off(ip_stack_t *ipst)
782 {
783 conn_t *mrouter;
784
785 mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
786 if (ipst->ips_ip_g_mrouter == NULL) {
787 mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
788 return (B_TRUE);
789 }
790
791 mrouter = ipst->ips_ip_g_mrouter;
792 if (mrouter->conn_multi_router == 0) {
793 mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
794 return (B_TRUE);
795 }
796 mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
797 return (B_FALSE);
798 }
799
800 static void
unlock_good_vif(struct vif * vifp)801 unlock_good_vif(struct vif *vifp)
802 {
803 ASSERT(vifp->v_ipif != NULL);
804 ipif_refrele(vifp->v_ipif);
805 VIF_REFRELE(vifp);
806 }
807
808 static boolean_t
lock_good_vif(struct vif * vifp)809 lock_good_vif(struct vif *vifp)
810 {
811 mutex_enter(&vifp->v_lock);
812 if (!(vifp->v_marks & VIF_MARK_GOOD)) {
813 mutex_exit(&vifp->v_lock);
814 return (B_FALSE);
815 }
816
817 ASSERT(vifp->v_ipif != NULL);
818 mutex_enter(&vifp->v_ipif->ipif_ill->ill_lock);
819 if (!IPIF_CAN_LOOKUP(vifp->v_ipif)) {
820 mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock);
821 mutex_exit(&vifp->v_lock);
822 return (B_FALSE);
823 }
824 ipif_refhold_locked(vifp->v_ipif);
825 mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock);
826 vifp->v_refcnt++;
827 mutex_exit(&vifp->v_lock);
828 return (B_TRUE);
829 }
830
831 /*
832 * Add a vif to the vif table.
833 */
834 static int
add_vif(struct vifctl * vifcp,conn_t * connp,ip_stack_t * ipst)835 add_vif(struct vifctl *vifcp, conn_t *connp, ip_stack_t *ipst)
836 {
837 struct vif *vifp = ipst->ips_vifs + vifcp->vifc_vifi;
838 ipif_t *ipif;
839 int error = 0;
840 struct tbf *v_tbf = ipst->ips_tbfs + vifcp->vifc_vifi;
841 conn_t *mrouter = ipst->ips_ip_g_mrouter;
842 ilm_t *ilm;
843 ill_t *ill;
844
845 ASSERT(connp != NULL);
846
847 if (vifcp->vifc_vifi >= MAXVIFS)
848 return (EINVAL);
849
850 if (is_mrouter_off(ipst))
851 return (EINVAL);
852
853 mutex_enter(&vifp->v_lock);
854 /*
855 * Viftable entry should be 0.
856 * if v_marks == 0 but v_refcnt != 0 means struct is being
857 * initialized.
858 *
859 * Also note that it is very unlikely that we will get a MRT_ADD_VIF
860 * request while the delete is in progress, mrouted only sends add
861 * requests when a new interface is added and the new interface cannot
862 * have the same vifi as an existing interface. We make sure that
863 * ill_delete will block till the vif is deleted by adding a refcnt
864 * to ipif in del_vif().
865 */
866 if (vifp->v_lcl_addr.s_addr != 0 ||
867 vifp->v_marks != 0 ||
868 vifp->v_refcnt != 0) {
869 mutex_exit(&vifp->v_lock);
870 return (EADDRINUSE);
871 }
872
873 /* Incoming vif should not be 0 */
874 if (vifcp->vifc_lcl_addr.s_addr == 0) {
875 mutex_exit(&vifp->v_lock);
876 return (EINVAL);
877 }
878
879 vifp->v_refcnt++;
880 mutex_exit(&vifp->v_lock);
881 /* Find the interface with the local address */
882 ipif = ipif_lookup_addr((ipaddr_t)vifcp->vifc_lcl_addr.s_addr, NULL,
883 IPCL_ZONEID(connp), ipst);
884 if (ipif == NULL) {
885 VIF_REFRELE(vifp);
886 return (EADDRNOTAVAIL);
887 }
888
889 if (ipst->ips_ip_mrtdebug > 1) {
890 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
891 "add_vif: src 0x%x enter",
892 vifcp->vifc_lcl_addr.s_addr);
893 }
894
895 mutex_enter(&vifp->v_lock);
896 /*
897 * Always clear cache when vifs change.
898 * Needed to ensure that src isn't left over from before vif was added.
899 * No need to get last_encap_lock, since we are running as a writer.
900 */
901
902 mutex_enter(&ipst->ips_last_encap_lock);
903 ipst->ips_last_encap_src = 0;
904 ipst->ips_last_encap_vif = NULL;
905 mutex_exit(&ipst->ips_last_encap_lock);
906
907 if (vifcp->vifc_flags & VIFF_TUNNEL) {
908 if ((vifcp->vifc_flags & VIFF_SRCRT) != 0) {
909 cmn_err(CE_WARN,
910 "add_vif: source route tunnels not supported\n");
911 VIF_REFRELE_LOCKED(vifp);
912 ipif_refrele(ipif);
913 return (EOPNOTSUPP);
914 }
915 vifp->v_rmt_addr = vifcp->vifc_rmt_addr;
916
917 } else {
918 /* Phyint or Register vif */
919 if (vifcp->vifc_flags & VIFF_REGISTER) {
920 /*
921 * Note: Since all IPPROTO_IP level options (including
922 * MRT_ADD_VIF) are done exclusively via
923 * ip_optmgmt_writer(), a lock is not necessary to
924 * protect reg_vif_num.
925 */
926 mutex_enter(&ipst->ips_numvifs_mutex);
927 if (ipst->ips_reg_vif_num == ALL_VIFS) {
928 ipst->ips_reg_vif_num = vifcp->vifc_vifi;
929 mutex_exit(&ipst->ips_numvifs_mutex);
930 } else {
931 mutex_exit(&ipst->ips_numvifs_mutex);
932 VIF_REFRELE_LOCKED(vifp);
933 ipif_refrele(ipif);
934 return (EADDRINUSE);
935 }
936 }
937
938 /* Make sure the interface supports multicast */
939 if ((ipif->ipif_ill->ill_flags & ILLF_MULTICAST) == 0) {
940 VIF_REFRELE_LOCKED(vifp);
941 ipif_refrele(ipif);
942 if (vifcp->vifc_flags & VIFF_REGISTER) {
943 mutex_enter(&ipst->ips_numvifs_mutex);
944 ipst->ips_reg_vif_num = ALL_VIFS;
945 mutex_exit(&ipst->ips_numvifs_mutex);
946 }
947 return (EOPNOTSUPP);
948 }
949 /* Enable promiscuous reception of all IP mcasts from the if */
950 mutex_exit(&vifp->v_lock);
951
952 ill = ipif->ipif_ill;
953 if (IS_UNDER_IPMP(ill))
954 ill = ipmp_ill_hold_ipmp_ill(ill);
955
956 if (ill == NULL) {
957 ilm = NULL;
958 } else {
959 ilm = ip_addmulti(&ipv6_all_zeros, ill,
960 ipif->ipif_zoneid, &error);
961 if (ilm != NULL)
962 atomic_inc_32(&ill->ill_mrouter_cnt);
963 if (IS_UNDER_IPMP(ipif->ipif_ill)) {
964 ill_refrele(ill);
965 ill = ipif->ipif_ill;
966 }
967 }
968
969 mutex_enter(&vifp->v_lock);
970 /*
971 * since we released the lock lets make sure that
972 * ip_mrouter_done() has not been called.
973 */
974 if (ilm == NULL || is_mrouter_off(ipst)) {
975 if (ilm != NULL) {
976 (void) ip_delmulti(ilm);
977 ASSERT(ill->ill_mrouter_cnt > 0);
978 atomic_dec_32(&ill->ill_mrouter_cnt);
979 }
980 if (vifcp->vifc_flags & VIFF_REGISTER) {
981 mutex_enter(&ipst->ips_numvifs_mutex);
982 ipst->ips_reg_vif_num = ALL_VIFS;
983 mutex_exit(&ipst->ips_numvifs_mutex);
984 }
985 VIF_REFRELE_LOCKED(vifp);
986 ipif_refrele(ipif);
987 return (error?error:EINVAL);
988 }
989 vifp->v_ilm = ilm;
990 }
991 /* Define parameters for the tbf structure */
992 vifp->v_tbf = v_tbf;
993 gethrestime(&vifp->v_tbf->tbf_last_pkt_t);
994 vifp->v_tbf->tbf_n_tok = 0;
995 vifp->v_tbf->tbf_q_len = 0;
996 vifp->v_tbf->tbf_max_q_len = MAXQSIZE;
997 vifp->v_tbf->tbf_q = vifp->v_tbf->tbf_t = NULL;
998
999 vifp->v_flags = vifcp->vifc_flags;
1000 vifp->v_threshold = vifcp->vifc_threshold;
1001 vifp->v_lcl_addr = vifcp->vifc_lcl_addr;
1002 vifp->v_ipif = ipif;
1003 ipif_refrele(ipif);
1004 /* Scaling up here, allows division by 1024 in critical code. */
1005 vifp->v_rate_limit = vifcp->vifc_rate_limit * (1024/1000);
1006 vifp->v_timeout_id = 0;
1007 /* initialize per vif pkt counters */
1008 vifp->v_pkt_in = 0;
1009 vifp->v_pkt_out = 0;
1010 vifp->v_bytes_in = 0;
1011 vifp->v_bytes_out = 0;
1012 mutex_init(&vifp->v_tbf->tbf_lock, NULL, MUTEX_DEFAULT, NULL);
1013
1014 /* Adjust numvifs up, if the vifi is higher than numvifs */
1015 mutex_enter(&ipst->ips_numvifs_mutex);
1016 if (ipst->ips_numvifs <= vifcp->vifc_vifi)
1017 ipst->ips_numvifs = vifcp->vifc_vifi + 1;
1018 mutex_exit(&ipst->ips_numvifs_mutex);
1019
1020 if (ipst->ips_ip_mrtdebug > 1) {
1021 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1022 "add_vif: #%d, lcladdr %x, %s %x, thresh %x, rate %d",
1023 vifcp->vifc_vifi,
1024 ntohl(vifcp->vifc_lcl_addr.s_addr),
1025 (vifcp->vifc_flags & VIFF_TUNNEL) ? "rmtaddr" : "mask",
1026 ntohl(vifcp->vifc_rmt_addr.s_addr),
1027 vifcp->vifc_threshold, vifcp->vifc_rate_limit);
1028 }
1029
1030 vifp->v_marks = VIF_MARK_GOOD;
1031 mutex_exit(&vifp->v_lock);
1032 return (0);
1033 }
1034
1035
1036 /* Delete a vif from the vif table. */
1037 static void
del_vifp(struct vif * vifp)1038 del_vifp(struct vif *vifp)
1039 {
1040 struct tbf *t = vifp->v_tbf;
1041 mblk_t *mp0;
1042 vifi_t vifi;
1043 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
1044 conn_t *mrouter = ipst->ips_ip_g_mrouter;
1045
1046 ASSERT(vifp->v_marks & VIF_MARK_CONDEMNED);
1047 ASSERT(t != NULL);
1048
1049 if (ipst->ips_ip_mrtdebug > 1) {
1050 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1051 "del_vif: src 0x%x\n", vifp->v_lcl_addr.s_addr);
1052 }
1053
1054 if (vifp->v_timeout_id != 0) {
1055 (void) untimeout(vifp->v_timeout_id);
1056 vifp->v_timeout_id = 0;
1057 }
1058
1059 /*
1060 * Free packets queued at the interface.
1061 * Mrouted takes care of cleaning up mfcs - makes calls to del_mfc.
1062 */
1063 mutex_enter(&t->tbf_lock);
1064 while (t->tbf_q != NULL) {
1065 mp0 = t->tbf_q;
1066 t->tbf_q = t->tbf_q->b_next;
1067 mp0->b_prev = mp0->b_next = NULL;
1068 freemsg(mp0);
1069 }
1070 mutex_exit(&t->tbf_lock);
1071
1072 /*
1073 * Always clear cache when vifs change.
1074 * No need to get last_encap_lock since we are running as a writer.
1075 */
1076 mutex_enter(&ipst->ips_last_encap_lock);
1077 if (vifp == ipst->ips_last_encap_vif) {
1078 ipst->ips_last_encap_vif = NULL;
1079 ipst->ips_last_encap_src = 0;
1080 }
1081 mutex_exit(&ipst->ips_last_encap_lock);
1082
1083 mutex_destroy(&t->tbf_lock);
1084
1085 bzero(vifp->v_tbf, sizeof (*(vifp->v_tbf)));
1086
1087 /* Adjust numvifs down */
1088 mutex_enter(&ipst->ips_numvifs_mutex);
1089 for (vifi = ipst->ips_numvifs; vifi != 0; vifi--) /* vifi is unsigned */
1090 if (ipst->ips_vifs[vifi - 1].v_lcl_addr.s_addr != 0)
1091 break;
1092 ipst->ips_numvifs = vifi;
1093 mutex_exit(&ipst->ips_numvifs_mutex);
1094
1095 bzero(vifp, sizeof (*vifp));
1096 }
1097
1098 static int
del_vif(vifi_t * vifip,ip_stack_t * ipst)1099 del_vif(vifi_t *vifip, ip_stack_t *ipst)
1100 {
1101 struct vif *vifp = ipst->ips_vifs + *vifip;
1102
1103 if (*vifip >= ipst->ips_numvifs)
1104 return (EINVAL);
1105
1106 mutex_enter(&vifp->v_lock);
1107 /*
1108 * Not initialized
1109 * Here we are not looking at the vif that is being initialized
1110 * i.e vifp->v_marks == 0 and refcnt > 0.
1111 */
1112 if (vifp->v_lcl_addr.s_addr == 0 ||
1113 !(vifp->v_marks & VIF_MARK_GOOD)) {
1114 mutex_exit(&vifp->v_lock);
1115 return (EADDRNOTAVAIL);
1116 }
1117
1118 /* Clear VIF_MARK_GOOD and set VIF_MARK_CONDEMNED. */
1119 vifp->v_marks &= ~VIF_MARK_GOOD;
1120 vifp->v_marks |= VIF_MARK_CONDEMNED;
1121
1122 /* Phyint only */
1123 if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
1124 ipif_t *ipif = vifp->v_ipif;
1125 ilm_t *ilm = vifp->v_ilm;
1126
1127 vifp->v_ilm = NULL;
1128
1129 ASSERT(ipif != NULL);
1130 /*
1131 * should be OK to drop the lock as we
1132 * have marked this as CONDEMNED.
1133 */
1134 mutex_exit(&(vifp)->v_lock);
1135 if (ilm != NULL) {
1136 (void) ip_delmulti(ilm);
1137 ASSERT(ipif->ipif_ill->ill_mrouter_cnt > 0);
1138 atomic_dec_32(&ipif->ipif_ill->ill_mrouter_cnt);
1139 }
1140 mutex_enter(&(vifp)->v_lock);
1141 }
1142
1143 if (vifp->v_flags & VIFF_REGISTER) {
1144 mutex_enter(&ipst->ips_numvifs_mutex);
1145 ipst->ips_reg_vif_num = ALL_VIFS;
1146 mutex_exit(&ipst->ips_numvifs_mutex);
1147 }
1148
1149 /*
1150 * decreases the refcnt added in add_vif.
1151 */
1152 VIF_REFRELE_LOCKED(vifp);
1153 return (0);
1154 }
1155
/*
 * Add an mfc entry.
 *
 * Installs or updates the forwarding cache entry for the (origin, group)
 * pair described by *mfccp in the hash bucket selected by MFCHASH.
 * Three cases are handled in turn:
 *   1. MFCFIND locates an entry: only its parent and ttls are refreshed.
 *   2. One or more entries with queued upcall packets (mfc_rte != NULL)
 *      match: each is filled in, its timer is cancelled, and the queued
 *      packets are pushed through ip_mdq().
 *   3. Nothing matched with queued packets: an existing non-condemned
 *      match is filled in, or a new entry is allocated and linked at the
 *      head of the bucket.
 *
 * Returns 0 on success, EINVAL when mfcc_parent is out of range, when
 * the parent vif has no ipif, or when is_mrouter_off() reports true, and
 * ENOBUFS when a new entry cannot be allocated.
 */
static int
add_mfc(struct mfcctl *mfccp, ip_stack_t *ipst)
{
	struct mfc *rt;
	struct rtdetq *rte;
	ushort_t nstl;	/* number of matching entries that had upcalls */
	int i;
	struct mfcb *mfcbp;
	conn_t *mrouter = ipst->ips_ip_g_mrouter;

	/*
	 * The value of vifi is NO_VIF (==MAXVIFS) if Mrouted
	 * did not have a real route for pkt.
	 * We want this pkt without rt installed in the mfctable to prevent
	 * multiple tries, so go ahead and put it in mfctable, it will
	 * be discarded later in ip_mdq() because the child is NULL.
	 */

	/* Error checking, out of bounds? */
	if (mfccp->mfcc_parent > MAXVIFS) {
		ip0dbg(("ADD_MFC: mfcc_parent out of range %d",
		    (int)mfccp->mfcc_parent));
		return (EINVAL);
	}

	/* A real parent vif must still be attached to an ipif. */
	if ((mfccp->mfcc_parent != NO_VIF) &&
	    (ipst->ips_vifs[mfccp->mfcc_parent].v_ipif == NULL)) {
		ip0dbg(("ADD_MFC: NULL ipif for parent vif %d\n",
		    (int)mfccp->mfcc_parent));
		return (EINVAL);
	}

	if (is_mrouter_off(ipst)) {
		return (EINVAL);
	}

	/* Hold the bucket for the rest of the function. */
	mfcbp = &ipst->ips_mfcs[MFCHASH(mfccp->mfcc_origin.s_addr,
	    mfccp->mfcc_mcastgrp.s_addr)];
	MFCB_REFHOLD(mfcbp);
	MFCFIND(mfcbp, mfccp->mfcc_origin.s_addr,
	    mfccp->mfcc_mcastgrp.s_addr, rt);

	/* If an entry already exists, just update the fields */
	if (rt) {
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "add_mfc: update o %x grp %x parent %x",
			    ntohl(mfccp->mfcc_origin.s_addr),
			    ntohl(mfccp->mfcc_mcastgrp.s_addr),
			    mfccp->mfcc_parent);
		}
		mutex_enter(&rt->mfc_mutex);
		rt->mfc_parent = mfccp->mfcc_parent;

		/*
		 * Unlike fill_route(), this path leaves the per-entry
		 * packet counters untouched.
		 */
		mutex_enter(&ipst->ips_numvifs_mutex);
		for (i = 0; i < (int)ipst->ips_numvifs; i++)
			rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
		mutex_exit(&ipst->ips_numvifs_mutex);
		mutex_exit(&rt->mfc_mutex);

		MFCB_REFRELE(mfcbp);
		return (0);
	}

	/*
	 * Find the entry for which the upcall was made and update.
	 */
	for (rt = mfcbp->mfcb_mfc, nstl = 0; rt; rt = rt->mfc_next) {
		mutex_enter(&rt->mfc_mutex);
		if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) &&
		    (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr) &&
		    (rt->mfc_rte != NULL) &&
		    !(rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
			/* More than one match is unexpected; warn. */
			if (nstl++ != 0)
				cmn_err(CE_WARN,
				    "add_mfc: %s o %x g %x p %x",
				    "multiple kernel entries",
				    ntohl(mfccp->mfcc_origin.s_addr),
				    ntohl(mfccp->mfcc_mcastgrp.s_addr),
				    mfccp->mfcc_parent);

			if (ipst->ips_ip_mrtdebug > 1) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "add_mfc: o %x g %x p %x",
				    ntohl(mfccp->mfcc_origin.s_addr),
				    ntohl(mfccp->mfcc_mcastgrp.s_addr),
				    mfccp->mfcc_parent);
			}
			fill_route(rt, mfccp, ipst);

			/*
			 * Prevent cleanup of cache entry.
			 * Timer starts in ip_mforward.
			 */
			if (rt->mfc_timeout_id != 0) {
				timeout_id_t id;
				id = rt->mfc_timeout_id;
				/*
				 * setting id to zero will avoid this
				 * entry from being cleaned up in
				 * expire_up_calls().
				 */
				rt->mfc_timeout_id = 0;
				/*
				 * dropping the lock is fine as we
				 * have a refhold on the bucket.
				 * so mfc cannot be freed.
				 * The timeout can fire but it will see
				 * that mfc_timeout_id == 0 and not cleanup.
				 */
				mutex_exit(&rt->mfc_mutex);
				(void) untimeout(id);
				mutex_enter(&rt->mfc_mutex);
			}

			/*
			 * Send all pkts that are queued waiting for the upcall.
			 * ip_mdq param tun set to 0 -
			 * the return value of ip_mdq() isn't used here,
			 * so value we send doesn't matter.
			 *
			 * Each packet is unlinked under mfc_mutex, then the
			 * mutex is dropped while ip_mdq() runs and the
			 * packet and its rtdetq are freed.
			 */
			while (rt->mfc_rte != NULL) {
				rte = rt->mfc_rte;
				rt->mfc_rte = rte->rte_next;
				mutex_exit(&rt->mfc_mutex);
				(void) ip_mdq(rte->mp, (ipha_t *)
				    rte->mp->b_rptr, rte->ill, 0, rt);
				freemsg(rte->mp);
				mi_free((char *)rte);
				mutex_enter(&rt->mfc_mutex);
			}
		}
		mutex_exit(&rt->mfc_mutex);
	}


	/*
	 * It is possible that an entry is being inserted without an upcall
	 */
	if (nstl == 0) {
		/* The bucket lock is held while the chain is searched/extended. */
		mutex_enter(&(mfcbp->mfcb_lock));
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "add_mfc: no upcall o %x g %x p %x",
			    ntohl(mfccp->mfcc_origin.s_addr),
			    ntohl(mfccp->mfcc_mcastgrp.s_addr),
			    mfccp->mfcc_parent);
		}
		/* Re-check under the bucket lock before modifying the chain. */
		if (is_mrouter_off(ipst)) {
			mutex_exit(&mfcbp->mfcb_lock);
			MFCB_REFRELE(mfcbp);
			return (EINVAL);
		}

		/* Reuse the first matching entry that is not condemned. */
		for (rt = mfcbp->mfcb_mfc; rt; rt = rt->mfc_next) {

			mutex_enter(&rt->mfc_mutex);
			if ((rt->mfc_origin.s_addr ==
			    mfccp->mfcc_origin.s_addr) &&
			    (rt->mfc_mcastgrp.s_addr ==
			    mfccp->mfcc_mcastgrp.s_addr) &&
			    (!(rt->mfc_marks & MFCB_MARK_CONDEMNED))) {
				fill_route(rt, mfccp, ipst);
				mutex_exit(&rt->mfc_mutex);
				break;
			}
			mutex_exit(&rt->mfc_mutex);
		}

		/* No upcall, so make a new entry into mfctable */
		if (rt == NULL) {
			rt = (struct mfc *)mi_zalloc(sizeof (struct mfc));
			if (rt == NULL) {
				ip1dbg(("add_mfc: out of memory\n"));
				mutex_exit(&mfcbp->mfcb_lock);
				MFCB_REFRELE(mfcbp);
				return (ENOBUFS);
			}

			/* Insert new entry at head of hash chain */
			mutex_enter(&rt->mfc_mutex);
			fill_route(rt, mfccp, ipst);

			/* Link into table */
			rt->mfc_next = mfcbp->mfcb_mfc;
			mfcbp->mfcb_mfc = rt;
			mutex_exit(&rt->mfc_mutex);
		}
		mutex_exit(&mfcbp->mfcb_lock);
	}

	MFCB_REFRELE(mfcbp);
	return (0);
}
1354
1355 /*
1356 * Fills in mfc structure from mrouted mfcctl.
1357 */
1358 static void
fill_route(struct mfc * rt,struct mfcctl * mfccp,ip_stack_t * ipst)1359 fill_route(struct mfc *rt, struct mfcctl *mfccp, ip_stack_t *ipst)
1360 {
1361 int i;
1362
1363 rt->mfc_origin = mfccp->mfcc_origin;
1364 rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp;
1365 rt->mfc_parent = mfccp->mfcc_parent;
1366 mutex_enter(&ipst->ips_numvifs_mutex);
1367 for (i = 0; i < (int)ipst->ips_numvifs; i++) {
1368 rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
1369 }
1370 mutex_exit(&ipst->ips_numvifs_mutex);
1371 /* Initialize pkt counters per src-grp */
1372 rt->mfc_pkt_cnt = 0;
1373 rt->mfc_byte_cnt = 0;
1374 rt->mfc_wrong_if = 0;
1375 rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_nsec = 0;
1376
1377 }
1378
1379 static void
free_queue(struct mfc * mfcp)1380 free_queue(struct mfc *mfcp)
1381 {
1382 struct rtdetq *rte0;
1383
1384 /*
1385 * Drop all queued upcall packets.
1386 * Free the mbuf with the pkt.
1387 */
1388 while ((rte0 = mfcp->mfc_rte) != NULL) {
1389 mfcp->mfc_rte = rte0->rte_next;
1390 freemsg(rte0->mp);
1391 mi_free((char *)rte0);
1392 }
1393 }
1394 /*
1395 * go thorugh the hash bucket and free all the entries marked condemned.
1396 */
1397 void
release_mfc(struct mfcb * mfcbp)1398 release_mfc(struct mfcb *mfcbp)
1399 {
1400 struct mfc *current_mfcp;
1401 struct mfc *prev_mfcp;
1402
1403 prev_mfcp = current_mfcp = mfcbp->mfcb_mfc;
1404
1405 while (current_mfcp != NULL) {
1406 if (current_mfcp->mfc_marks & MFCB_MARK_CONDEMNED) {
1407 if (current_mfcp == mfcbp->mfcb_mfc) {
1408 mfcbp->mfcb_mfc = current_mfcp->mfc_next;
1409 free_queue(current_mfcp);
1410 mi_free(current_mfcp);
1411 prev_mfcp = current_mfcp = mfcbp->mfcb_mfc;
1412 continue;
1413 }
1414 ASSERT(prev_mfcp != NULL);
1415 prev_mfcp->mfc_next = current_mfcp->mfc_next;
1416 free_queue(current_mfcp);
1417 mi_free(current_mfcp);
1418 current_mfcp = NULL;
1419 } else {
1420 prev_mfcp = current_mfcp;
1421 }
1422
1423 current_mfcp = prev_mfcp->mfc_next;
1424
1425 }
1426 mfcbp->mfcb_marks &= ~MFCB_MARK_CONDEMNED;
1427 ASSERT(mfcbp->mfcb_mfc != NULL || mfcbp->mfcb_marks == 0);
1428 }
1429
1430 /*
1431 * Delete an mfc entry.
1432 */
1433 static int
del_mfc(struct mfcctl * mfccp,ip_stack_t * ipst)1434 del_mfc(struct mfcctl *mfccp, ip_stack_t *ipst)
1435 {
1436 struct in_addr origin;
1437 struct in_addr mcastgrp;
1438 struct mfc *rt;
1439 uint_t hash;
1440 conn_t *mrouter = ipst->ips_ip_g_mrouter;
1441
1442 origin = mfccp->mfcc_origin;
1443 mcastgrp = mfccp->mfcc_mcastgrp;
1444 hash = MFCHASH(origin.s_addr, mcastgrp.s_addr);
1445
1446 if (ipst->ips_ip_mrtdebug > 1) {
1447 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1448 "del_mfc: o %x g %x",
1449 ntohl(origin.s_addr),
1450 ntohl(mcastgrp.s_addr));
1451 }
1452
1453 MFCB_REFHOLD(&ipst->ips_mfcs[hash]);
1454
1455 /* Find mfc in mfctable, finds only entries without upcalls */
1456 for (rt = ipst->ips_mfcs[hash].mfcb_mfc; rt; rt = rt->mfc_next) {
1457 mutex_enter(&rt->mfc_mutex);
1458 if (origin.s_addr == rt->mfc_origin.s_addr &&
1459 mcastgrp.s_addr == rt->mfc_mcastgrp.s_addr &&
1460 rt->mfc_rte == NULL &&
1461 !(rt->mfc_marks & MFCB_MARK_CONDEMNED))
1462 break;
1463 mutex_exit(&rt->mfc_mutex);
1464 }
1465
1466 /*
1467 * Return if there was an upcall (mfc_rte != NULL,
1468 * or rt not in mfctable.
1469 */
1470 if (rt == NULL) {
1471 MFCB_REFRELE(&ipst->ips_mfcs[hash]);
1472 return (EADDRNOTAVAIL);
1473 }
1474
1475
1476 /*
1477 * no need to hold lock as we have a reference.
1478 */
1479 ipst->ips_mfcs[hash].mfcb_marks |= MFCB_MARK_CONDEMNED;
1480 /* error checking */
1481 if (rt->mfc_timeout_id != 0) {
1482 ip0dbg(("del_mfc: TIMEOUT NOT 0, rte not null"));
1483 /*
1484 * Its ok to drop the lock, the struct cannot be freed
1485 * since we have a ref on the hash bucket.
1486 */
1487 rt->mfc_timeout_id = 0;
1488 mutex_exit(&rt->mfc_mutex);
1489 (void) untimeout(rt->mfc_timeout_id);
1490 mutex_enter(&rt->mfc_mutex);
1491 }
1492
1493 ASSERT(rt->mfc_rte == NULL);
1494
1495
1496 /*
1497 * Delete the entry from the cache
1498 */
1499 rt->mfc_marks |= MFCB_MARK_CONDEMNED;
1500 mutex_exit(&rt->mfc_mutex);
1501
1502 MFCB_REFRELE(&ipst->ips_mfcs[hash]);
1503
1504 return (0);
1505 }
1506
#define	TUNNEL_LEN 12 /* # bytes of IP option for tunnel encapsulation */

/*
 * IP multicast forwarding function. This function assumes that the packet
 * pointed to by ipha has arrived on (or is about to be sent to) the interface
 * pointed to by "ill", and the packet is to be relayed to other networks
 * that have members of the packet's destination IP multicast group.
 *
 * The packet is returned unscathed to the caller, unless it is
 * erroneous, in which case a -1 value tells the caller (IP)
 * to discard it.
 *
 * Unlike BSD, SunOS 5.x needs to return to IP info about
 * whether pkt came in thru a tunnel, so it can be discarded, unless
 * it's IGMP. In BSD, the ifp is bogus for tunnels, so pkt won't try
 * to be delivered.
 * Return values are 0 - pkt is okay and phyint
 *		    -1 - pkt is malformed and to be tossed
 *		     1 - pkt came in on tunnel
 *
 * On a cache hit the packet is handed to ip_mdq().  On a miss a copy of
 * the packet is queued on an mfc (created here if necessary) and, for the
 * first queued packet only, a second copy is sent to the routing daemon
 * as an IGMPMSG_NOCACHE upcall and an expiry timer is armed.
 */
int
ip_mforward(mblk_t *mp, ip_recv_attr_t *ira)
{
	ipha_t *ipha = (ipha_t *)mp->b_rptr;
	ill_t *ill = ira->ira_ill;
	struct mfc *rt;
	ipaddr_t src, dst, tunnel_src = 0;
	static int srctun = 0;	/* count of source-routed pkts seen */
	vifi_t vifi;
	boolean_t pim_reg_packet = B_FALSE;
	struct mfcb *mfcbp;
	ip_stack_t *ipst = ill->ill_ipst;
	conn_t *mrouter = ipst->ips_ip_g_mrouter;
	ill_t *rill = ira->ira_rill;	/* saved to restore after upcall */

	ASSERT(ira->ira_pktlen == msgdsize(mp));

	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "ip_mforward: RECV ipha_src %x, ipha_dst %x, ill %s",
		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
		    ill->ill_name);
	}

	/* Classify the arrival: PIM register, encap tunnel, or neither. */
	dst = ipha->ipha_dst;
	if (ira->ira_flags & IRAF_PIM_REGISTER)
		pim_reg_packet = B_TRUE;
	else if (ira->ira_flags & IRAF_MROUTE_TUNNEL_SET)
		tunnel_src = ira->ira_mroute_tunnel;

	/*
	 * Don't forward a packet with time-to-live of zero or one,
	 * or a packet destined to a local-only group.
	 */
	if (CLASSD(dst) && (ipha->ipha_ttl <= 1 ||
	    (ipaddr_t)ntohl(dst) <= INADDR_MAX_LOCAL_GROUP)) {
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: not forwarded ttl %d,"
			    " dst 0x%x ill %s",
			    ipha->ipha_ttl, ntohl(dst), ill->ill_name);
		}
		if (tunnel_src != 0)
			return (1);
		else
			return (0);
	}

	if ((tunnel_src != 0) || pim_reg_packet) {
		/*
		 * Packet arrived over an encapsulated tunnel or via a PIM
		 * register message.
		 */
		if (ipst->ips_ip_mrtdebug > 1) {
			if (tunnel_src != 0) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: ill %s arrived via ENCAP TUN",
				    ill->ill_name);
			} else if (pim_reg_packet) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: ill %s arrived via"
				    " REGISTER VIF",
				    ill->ill_name);
			}
		}
	} else if ((ipha->ipha_version_and_hdr_length & 0xf) <
	    (uint_t)(IP_SIMPLE_HDR_LENGTH + TUNNEL_LEN) >> 2 ||
	    ((uchar_t *)(ipha + 1))[1] != IPOPT_LSRR) {
		/*
		 * Packet arrived via a physical interface: the header is
		 * too short to carry the tunnel option, or the byte tested
		 * after the base header is not IPOPT_LSRR.
		 */
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: ill %s arrived via PHYINT",
			    ill->ill_name);
		}

	} else {
		/*
		 * Packet arrived through a SRCRT tunnel.
		 * Source-route tunnels are no longer supported.
		 * Error message printed every 1000 times.
		 */
		if ((srctun++ % 1000) == 0) {
			cmn_err(CE_WARN,
			    "ip_mforward: received source-routed pkt from %x",
			    ntohl(ipha->ipha_src));
		}
		return (-1);
	}

	ipst->ips_mrtstat->mrts_fwd_in++;
	src = ipha->ipha_src;

	/* Find route in cache, return NULL if not there or upcalls q'ed. */

	/*
	 * Lock the mfctable against changes made by ip_mforward.
	 * Note that only add_mfc and del_mfc can remove entries and
	 * they run with exclusive access to IP. So we do not need to
	 * guard against the rt being deleted, so release lock after reading.
	 */

	if (is_mrouter_off(ipst))
		return (-1);

	/* The bucket reference is held until every exit path below. */
	mfcbp = &ipst->ips_mfcs[MFCHASH(src, dst)];
	MFCB_REFHOLD(mfcbp);
	MFCFIND(mfcbp, src, dst, rt);

	/* Entry exists, so forward if necessary */
	if (rt != NULL) {
		int ret = 0;
		ipst->ips_mrtstat->mrts_mfc_hits++;
		if (pim_reg_packet) {
			/*
			 * Register packets are forwarded as if they had
			 * arrived on the register vif's ill.
			 */
			ASSERT(ipst->ips_reg_vif_num != ALL_VIFS);
			ret = ip_mdq(mp, ipha,
			    ipst->ips_vifs[ipst->ips_reg_vif_num].
			    v_ipif->ipif_ill,
			    0, rt);
		} else {
			ret = ip_mdq(mp, ipha, ill, tunnel_src, rt);
		}

		MFCB_REFRELE(mfcbp);
		return (ret);

	/*
	 * Don't forward if we don't have a cache entry. Mrouted will
	 * always provide a cache entry in response to an upcall.
	 */
	} else {
		/*
		 * If we don't have a route for packet's origin, make a copy
		 * of the packet and send message to routing daemon.
		 */
		struct mfc *mfc_rt = NULL;
		mblk_t *mp0 = NULL;	/* copy queued on the mfc */
		mblk_t *mp_copy = NULL;	/* copy sent up to the daemon */
		struct rtdetq *rte = NULL;
		struct rtdetq *rte_m, *rte1, *prev_rte;
		uint_t hash;
		int npkts;	/* packets already queued on mfc_rt */
		boolean_t new_mfc = B_FALSE;
		ipst->ips_mrtstat->mrts_mfc_misses++;
		/* BSD uses mrts_no_route++ */
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: no rte ill %s src %x g %x misses %d",
			    ill->ill_name, ntohl(src), ntohl(dst),
			    (int)ipst->ips_mrtstat->mrts_mfc_misses);
		}
		/*
		 * The order of the following code differs from the BSD code.
		 * Pre-mc3.5, the BSD code was incorrect and SunOS 5.x
		 * code works, so SunOS 5.x wasn't changed to conform to the
		 * BSD version.
		 */

		/* Lock mfctable. */
		hash = MFCHASH(src, dst);
		mutex_enter(&(ipst->ips_mfcs[hash].mfcb_lock));

		/*
		 * If we are turning off mrouted return an error
		 */
		if (is_mrouter_off(ipst)) {
			/* mfcbp is &ipst->ips_mfcs[hash]: same lock. */
			mutex_exit(&mfcbp->mfcb_lock);
			MFCB_REFRELE(mfcbp);
			return (-1);
		}

		/*
		 * Is there an upcall waiting for this packet?
		 * On a match the loop breaks with mfc_rt->mfc_mutex held.
		 */
		for (mfc_rt = ipst->ips_mfcs[hash].mfcb_mfc; mfc_rt;
		    mfc_rt = mfc_rt->mfc_next) {
			mutex_enter(&mfc_rt->mfc_mutex);
			if (ipst->ips_ip_mrtdebug > 1) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: MFCTAB hash %d o 0x%x"
				    " g 0x%x\n",
				    hash, ntohl(mfc_rt->mfc_origin.s_addr),
				    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
			}
			/* There is an upcall */
			if ((src == mfc_rt->mfc_origin.s_addr) &&
			    (dst == mfc_rt->mfc_mcastgrp.s_addr) &&
			    (mfc_rt->mfc_rte != NULL) &&
			    !(mfc_rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
				break;
			}
			mutex_exit(&mfc_rt->mfc_mutex);
		}
		/* No upcall, so make a new entry into mfctable */
		if (mfc_rt == NULL) {
			mfc_rt = (struct mfc *)mi_zalloc(sizeof (struct mfc));
			if (mfc_rt == NULL) {
				ipst->ips_mrtstat->mrts_fwd_drop++;
				ip1dbg(("ip_mforward: out of memory "
				    "for mfc, mfc_rt\n"));
				goto error_return;
			} else
				new_mfc = B_TRUE;
			/* Get resources */
			/* TODO could copy header and dup rest */
			mp_copy = copymsg(mp);
			if (mp_copy == NULL) {
				ipst->ips_mrtstat->mrts_fwd_drop++;
				ip1dbg(("ip_mforward: out of memory for "
				    "mblk, mp_copy\n"));
				goto error_return;
			}
			/*
			 * From here both paths (found or newly allocated)
			 * hold mfc_rt->mfc_mutex.
			 */
			mutex_enter(&mfc_rt->mfc_mutex);
		}
		/* Get resources for rte, whether first rte or not first. */
		/* Add this packet into rtdetq */
		rte = (struct rtdetq *)mi_zalloc(sizeof (struct rtdetq));
		if (rte == NULL) {
			ipst->ips_mrtstat->mrts_fwd_drop++;
			mutex_exit(&mfc_rt->mfc_mutex);
			ip1dbg(("ip_mforward: out of memory for"
			    " rtdetq, rte\n"));
			goto error_return;
		}

		mp0 = copymsg(mp);
		if (mp0 == NULL) {
			ipst->ips_mrtstat->mrts_fwd_drop++;
			ip1dbg(("ip_mforward: out of memory for mblk, mp0\n"));
			mutex_exit(&mfc_rt->mfc_mutex);
			goto error_return;
		}
		rte->mp = mp0;
		if (pim_reg_packet) {
			ASSERT(ipst->ips_reg_vif_num != ALL_VIFS);
			rte->ill =
			    ipst->ips_vifs[ipst->ips_reg_vif_num].
			    v_ipif->ipif_ill;
		} else {
			rte->ill = ill;
		}
		rte->rte_next = NULL;

		/*
		 * Determine if upcall q (rtdetq) has overflowed.
		 * mfc_rt->mfc_rte is null by mi_zalloc
		 * if it is the first message.
		 */
		for (rte_m = mfc_rt->mfc_rte, npkts = 0; rte_m;
		    rte_m = rte_m->rte_next)
			npkts++;
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: upcalls %d\n", npkts);
		}
		if (npkts > MAX_UPQ) {
			ipst->ips_mrtstat->mrts_upq_ovflw++;
			mutex_exit(&mfc_rt->mfc_mutex);
			goto error_return;
		}

		if (npkts == 0) {	/* first upcall */
			int i = 0;
			/*
			 * Now finish installing the new mfc! Now that we have
			 * resources! Insert new entry at head of hash chain.
			 * Use src and dst which are ipaddr_t's.
			 */
			mfc_rt->mfc_origin.s_addr = src;
			mfc_rt->mfc_mcastgrp.s_addr = dst;

			mutex_enter(&ipst->ips_numvifs_mutex);
			for (i = 0; i < (int)ipst->ips_numvifs; i++)
				mfc_rt->mfc_ttls[i] = 0;
			mutex_exit(&ipst->ips_numvifs_mutex);
			mfc_rt->mfc_parent = ALL_VIFS;

			/* Link into table */
			if (ipst->ips_ip_mrtdebug > 1) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: NEW MFCTAB hash %d o 0x%x "
				    "g 0x%x\n", hash,
				    ntohl(mfc_rt->mfc_origin.s_addr),
				    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
			}
			mfc_rt->mfc_next = ipst->ips_mfcs[hash].mfcb_mfc;
			ipst->ips_mfcs[hash].mfcb_mfc = mfc_rt;
			mfc_rt->mfc_rte = NULL;
		}

		/* Link in the upcall */
		/* First upcall */
		if (mfc_rt->mfc_rte == NULL)
			mfc_rt->mfc_rte = rte;
		else {
			/* not the first upcall: append at the tail */
			prev_rte = mfc_rt->mfc_rte;
			for (rte1 = mfc_rt->mfc_rte->rte_next; rte1;
			    prev_rte = rte1, rte1 = rte1->rte_next)
				;
			prev_rte->rte_next = rte;
		}

		/*
		 * No upcalls waiting, this is first one, so send a message to
		 * routing daemon to install a route into kernel table.
		 */
		if (npkts == 0) {
			struct igmpmsg *im;
			/* ipha_protocol is 0, for upcall */
			ASSERT(mp_copy != NULL);
			/* The igmpmsg fields are written over mp_copy's header. */
			im = (struct igmpmsg *)mp_copy->b_rptr;
			im->im_msgtype = IGMPMSG_NOCACHE;
			im->im_mbz = 0;
			mutex_enter(&ipst->ips_numvifs_mutex);
			if (pim_reg_packet) {
				im->im_vif = (uchar_t)ipst->ips_reg_vif_num;
				mutex_exit(&ipst->ips_numvifs_mutex);
			} else {
				/*
				 * XXX do we need to hold locks here ?
				 *
				 * Report the first vif whose ipif sits on
				 * the arrival ill.
				 */
				for (vifi = 0;
				    vifi < ipst->ips_numvifs;
				    vifi++) {
					if (ipst->ips_vifs[vifi].v_ipif == NULL)
						continue;
					if (ipst->ips_vifs[vifi].
					    v_ipif->ipif_ill == ill) {
						im->im_vif = (uchar_t)vifi;
						break;
					}
				}
				mutex_exit(&ipst->ips_numvifs_mutex);
				ASSERT(vifi < ipst->ips_numvifs);
			}

			ipst->ips_mrtstat->mrts_upcalls++;
			/* Timer to discard upcalls if mrouted is too slow */
			mfc_rt->mfc_timeout_id = timeout(expire_upcalls,
			    mfc_rt, EXPIRE_TIMEOUT * UPCALL_EXPIRE);
			mutex_exit(&mfc_rt->mfc_mutex);
			mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
			/*
			 * Pass to RAWIP.  ira_ill/ira_rill are cleared for
			 * the duration of the call and restored afterwards.
			 */
			ira->ira_ill = ira->ira_rill = NULL;
			(mrouter->conn_recv)(mrouter, mp_copy, NULL, ira);
			ira->ira_ill = ill;
			ira->ira_rill = rill;
		} else {
			/*
			 * NOTE(review): mp_copy is only allocated when a new
			 * mfc is created above, so it appears to be NULL on
			 * this path while the packet itself has been queued
			 * via rte/mp0 -- confirm that the discard accounting
			 * here is intended.
			 */
			mutex_exit(&mfc_rt->mfc_mutex);
			mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			ip_drop_input("ip_mforward - upcall already waiting",
			    mp_copy, ill);
			freemsg(mp_copy);
		}

		MFCB_REFRELE(mfcbp);
		if (tunnel_src != 0)
			return (1);
		else
			return (0);
	error_return:
		/*
		 * Common failure exit.  Callers have already dropped
		 * mfc_rt->mfc_mutex if they held it; only the bucket lock
		 * and bucket reference remain.  mfc_rt is freed only when it
		 * was allocated in this call.
		 */
		mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
		MFCB_REFRELE(mfcbp);
		if (mfc_rt != NULL && (new_mfc == B_TRUE))
			mi_free((char *)mfc_rt);
		if (rte != NULL)
			mi_free((char *)rte);
		if (mp_copy != NULL) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			ip_drop_input("ip_mforward error", mp_copy, ill);
			freemsg(mp_copy);
		}
		if (mp0 != NULL)
			freemsg(mp0);
		return (-1);
	}
}
1907
1908 /*
1909 * Clean up the mfctable cache entry if upcall is not serviced.
1910 * SunOS 5.x has timeout per mfc, unlike BSD which has one timer.
1911 */
1912 static void
expire_upcalls(void * arg)1913 expire_upcalls(void *arg)
1914 {
1915 struct mfc *mfc_rt = arg;
1916 uint_t hash;
1917 struct mfc *prev_mfc, *mfc0;
1918 ip_stack_t *ipst;
1919 conn_t *mrouter;
1920
1921 if (mfc_rt->mfc_rte == NULL || mfc_rt->mfc_rte->ill != NULL) {
1922 cmn_err(CE_WARN, "expire_upcalls: no ILL\n");
1923 return;
1924 }
1925 ipst = mfc_rt->mfc_rte->ill->ill_ipst;
1926 mrouter = ipst->ips_ip_g_mrouter;
1927
1928 hash = MFCHASH(mfc_rt->mfc_origin.s_addr, mfc_rt->mfc_mcastgrp.s_addr);
1929 if (ipst->ips_ip_mrtdebug > 1) {
1930 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1931 "expire_upcalls: hash %d s %x g %x",
1932 hash, ntohl(mfc_rt->mfc_origin.s_addr),
1933 ntohl(mfc_rt->mfc_mcastgrp.s_addr));
1934 }
1935 MFCB_REFHOLD(&ipst->ips_mfcs[hash]);
1936 mutex_enter(&mfc_rt->mfc_mutex);
1937 /*
1938 * if timeout has been set to zero, than the
1939 * entry has been filled, no need to delete it.
1940 */
1941 if (mfc_rt->mfc_timeout_id == 0)
1942 goto done;
1943 ipst->ips_mrtstat->mrts_cache_cleanups++;
1944 mfc_rt->mfc_timeout_id = 0;
1945
1946 /* Determine entry to be cleaned up in cache table. */
1947 for (prev_mfc = mfc0 = ipst->ips_mfcs[hash].mfcb_mfc; mfc0;
1948 prev_mfc = mfc0, mfc0 = mfc0->mfc_next)
1949 if (mfc0 == mfc_rt)
1950 break;
1951
1952 /* del_mfc takes care of gone mfcs */
1953 ASSERT(prev_mfc != NULL);
1954 ASSERT(mfc0 != NULL);
1955
1956 /*
1957 * Delete the entry from the cache
1958 */
1959 ipst->ips_mfcs[hash].mfcb_marks |= MFCB_MARK_CONDEMNED;
1960 mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED;
1961
1962 /*
1963 * release_mfc will drop all queued upcall packets.
1964 * and will free the mbuf with the pkt, if, timing info.
1965 */
1966 done:
1967 mutex_exit(&mfc_rt->mfc_mutex);
1968 MFCB_REFRELE(&ipst->ips_mfcs[hash]);
1969 }
1970
/*
 * Packet forwarding routine once entry in the cache is made.
 *
 * mp/ipha is the packet, ill the interface it is treated as having
 * arrived on, tunnel_src the tunnel source address (0 if none) and rt
 * the matching cache entry.  The packet is checked against the entry's
 * parent vif; if it arrived on the expected interface it is sent out on
 * every good vif whose mfc ttl is non-zero and below the packet's ttl.
 *
 * Returns -1 when the packet is to be dropped (no parent vif, parent vif
 * not lockable, illegal vifi, or no memory for a WRONGVIF upcall);
 * otherwise 1 when tunnel_src is non-zero and 0 when it is zero.
 */
static int
ip_mdq(mblk_t *mp, ipha_t *ipha, ill_t *ill, ipaddr_t tunnel_src,
    struct mfc *rt)
{
	vifi_t vifi;
	struct vif *vifp;
	ipaddr_t dst = ipha->ipha_dst;
	size_t plen = msgdsize(mp);	/* byte count used for statistics */
	vifi_t num_of_vifs;
	ip_stack_t *ipst = ill->ill_ipst;
	conn_t *mrouter = ipst->ips_ip_g_mrouter;
	ip_recv_attr_t iras;

	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "ip_mdq: SEND src %x, ipha_dst %x, ill %s",
		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
		    ill->ill_name);
	}

	/* Macro to send packet on vif */
#define	MC_SEND(ipha, mp, vifp, dst) { \
	if ((vifp)->v_flags & VIFF_TUNNEL) \
		encap_send((ipha), (mp), (vifp), (dst)); \
	else if ((vifp)->v_flags & VIFF_REGISTER) \
		register_send((ipha), (mp), (vifp), (dst)); \
	else \
		phyint_send((ipha), (mp), (vifp), (dst)); \
}

	vifi = rt->mfc_parent;

	/*
	 * The value of vifi is MAXVIFS if the pkt had no parent, i.e.,
	 * Mrouted had no route.
	 * We wanted the route installed in the mfctable to prevent multiple
	 * tries, so it passed add_mfc(), but is discarded here. The v_ipif is
	 * NULL so we don't want to check the ill. Still needed as of Mrouted
	 * 3.6.
	 */
	if (vifi == NO_VIF) {
		ip1dbg(("ip_mdq: no route for origin ill %s, vifi is NO_VIF\n",
		    ill->ill_name));
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mdq: vifi is NO_VIF ill = %s", ill->ill_name);
		}
		return (-1);	/* drop pkt */
	}

	/*
	 * Every return below this point must pair this with
	 * unlock_good_vif() on the parent vif.
	 */
	if (!lock_good_vif(&ipst->ips_vifs[vifi]))
		return (-1);
	/*
	 * The MFC entries are not cleaned up when an ipif goes
	 * away thus this code has to guard against an MFC referencing
	 * an ipif that has been closed. Note: reset_mrt_vif_ipif
	 * sets the v_ipif to NULL when the ipif disappears.
	 */
	ASSERT(ipst->ips_vifs[vifi].v_ipif != NULL);

	/*
	 * NOTE(review): this range check runs after ips_vifs[vifi] has
	 * already been indexed above -- confirm that mfc_parent can never
	 * exceed the table size.
	 */
	if (vifi >= ipst->ips_numvifs) {
		cmn_err(CE_WARN, "ip_mdq: illegal vifi %d numvifs "
		    "%d ill %s viftable ill %s\n",
		    (int)vifi, (int)ipst->ips_numvifs, ill->ill_name,
		    ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name);
		unlock_good_vif(&ipst->ips_vifs[vifi]);
		return (-1);
	}
	/*
	 * Don't forward if it didn't arrive from the parent vif for its
	 * origin.
	 */
	if ((ipst->ips_vifs[vifi].v_ipif->ipif_ill != ill) ||
	    (ipst->ips_vifs[vifi].v_rmt_addr.s_addr != tunnel_src)) {
		/* Came in the wrong interface */
		ip1dbg(("ip_mdq: arrived wrong if, vifi %d "
		    "numvifs %d ill %s viftable ill %s\n",
		    (int)vifi, (int)ipst->ips_numvifs, ill->ill_name,
		    ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name));
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mdq: arrived wrong if, vifi %d ill "
			    "%s viftable ill %s\n",
			    (int)vifi, ill->ill_name,
			    ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name);
		}
		ipst->ips_mrtstat->mrts_wrong_if++;
		rt->mfc_wrong_if++;

		/*
		 * If we are doing PIM assert processing and we are forwarding
		 * packets on this interface, and it is a broadcast medium
		 * interface (and not a tunnel), send a message to the routing.
		 *
		 * We use the first ipif on the list, since it's all we have.
		 * Chances are the ipif_flags are the same for ipifs on the ill.
		 */
		if (ipst->ips_pim_assert && rt->mfc_ttls[vifi] > 0 &&
		    (ill->ill_ipif->ipif_flags & IPIF_BROADCAST) &&
		    !(ipst->ips_vifs[vifi].v_flags & VIFF_TUNNEL)) {
			mblk_t *mp_copy;
			struct igmpmsg *im;

			/* TODO could copy header and dup rest */
			mp_copy = copymsg(mp);
			if (mp_copy == NULL) {
				ipst->ips_mrtstat->mrts_fwd_drop++;
				ip1dbg(("ip_mdq: out of memory "
				    "for mblk, mp_copy\n"));
				unlock_good_vif(&ipst->ips_vifs[vifi]);
				return (-1);
			}

			/* The igmpmsg fields are written over the copy's header. */
			im = (struct igmpmsg *)mp_copy->b_rptr;
			im->im_msgtype = IGMPMSG_WRONGVIF;
			im->im_mbz = 0;
			im->im_vif = (ushort_t)vifi;
			/* Pass to RAWIP */

			/* Build receive attributes from scratch for the copy. */
			bzero(&iras, sizeof (iras));
			iras.ira_flags = IRAF_IS_IPV4;
			iras.ira_ip_hdr_length =
			    IPH_HDR_LENGTH(mp_copy->b_rptr);
			iras.ira_pktlen = msgdsize(mp_copy);
			(mrouter->conn_recv)(mrouter, mp_copy, NULL, &iras);
			ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
		}
		unlock_good_vif(&ipst->ips_vifs[vifi]);
		if (tunnel_src != 0)
			return (1);
		else
			return (0);
	}
	/*
	 * If I sourced this packet, it counts as output, else it was input.
	 */
	if (ipha->ipha_src == ipst->ips_vifs[vifi].v_lcl_addr.s_addr) {
		ipst->ips_vifs[vifi].v_pkt_out++;
		ipst->ips_vifs[vifi].v_bytes_out += plen;
	} else {
		ipst->ips_vifs[vifi].v_pkt_in++;
		ipst->ips_vifs[vifi].v_bytes_in += plen;
	}
	mutex_enter(&rt->mfc_mutex);
	rt->mfc_pkt_cnt++;
	rt->mfc_byte_cnt += plen;
	mutex_exit(&rt->mfc_mutex);
	unlock_good_vif(&ipst->ips_vifs[vifi]);
	/*
	 * For each vif, decide if a copy of the packet should be forwarded.
	 * Forward if:
	 *		- the vif threshold ttl is non-zero AND
	 *		- the pkt ttl exceeds the vif's threshold
	 * A non-zero mfc_ttl indicates that the vif is part of
	 * the output set for the mfc entry.
	 */
	/* Snapshot numvifs under its mutex; the loop runs without it. */
	mutex_enter(&ipst->ips_numvifs_mutex);
	num_of_vifs = ipst->ips_numvifs;
	mutex_exit(&ipst->ips_numvifs_mutex);
	for (vifp = ipst->ips_vifs, vifi = 0;
	    vifi < num_of_vifs;
	    vifp++, vifi++) {
		if (!lock_good_vif(vifp))
			continue;
		if ((rt->mfc_ttls[vifi] > 0) &&
		    (ipha->ipha_ttl > rt->mfc_ttls[vifi])) {
			/*
			 * lock_good_vif should not have succeeded if
			 * v_ipif is null.
			 */
			ASSERT(vifp->v_ipif != NULL);
			vifp->v_pkt_out++;
			vifp->v_bytes_out += plen;
			MC_SEND(ipha, mp, vifp, dst);
			ipst->ips_mrtstat->mrts_fwd_out++;
		}
		unlock_good_vif(vifp);
	}
	if (tunnel_src != 0)
		return (1);
	else
		return (0);
}
2157
2158 /*
2159 * Send the packet on physical interface.
2160 * Caller assumes can continue to use mp on return.
2161 */
2162 /* ARGSUSED */
2163 static void
phyint_send(ipha_t * ipha,mblk_t * mp,struct vif * vifp,ipaddr_t dst)2164 phyint_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
2165 {
2166 mblk_t *mp_copy;
2167 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2168 conn_t *mrouter = ipst->ips_ip_g_mrouter;
2169
2170 /* Make a new reference to the packet */
2171 mp_copy = copymsg(mp); /* TODO could copy header and dup rest */
2172 if (mp_copy == NULL) {
2173 ipst->ips_mrtstat->mrts_fwd_drop++;
2174 ip1dbg(("phyint_send: out of memory for mblk, mp_copy\n"));
2175 return;
2176 }
2177 if (vifp->v_rate_limit <= 0)
2178 tbf_send_packet(vifp, mp_copy);
2179 else {
2180 if (ipst->ips_ip_mrtdebug > 1) {
2181 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2182 "phyint_send: tbf_contr rate %d "
2183 "vifp 0x%p mp 0x%p dst 0x%x",
2184 vifp->v_rate_limit, (void *)vifp, (void *)mp, dst);
2185 }
2186 tbf_control(vifp, mp_copy, (ipha_t *)mp_copy->b_rptr);
2187 }
2188 }
2189
2190 /*
2191 * Send the whole packet for REGISTER encapsulation to PIM daemon
2192 * Caller assumes it can continue to use mp on return.
2193 */
2194 /* ARGSUSED */
2195 static void
register_send(ipha_t * ipha,mblk_t * mp,struct vif * vifp,ipaddr_t dst)2196 register_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
2197 {
2198 struct igmpmsg *im;
2199 mblk_t *mp_copy;
2200 ipha_t *ipha_copy;
2201 ill_t *ill = vifp->v_ipif->ipif_ill;
2202 ip_stack_t *ipst = ill->ill_ipst;
2203 conn_t *mrouter = ipst->ips_ip_g_mrouter;
2204 ip_recv_attr_t iras;
2205
2206 if (ipst->ips_ip_mrtdebug > 1) {
2207 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2208 "register_send: src %x, dst %x\n",
2209 ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst));
2210 }
2211
2212 /*
2213 * Copy the old packet & pullup its IP header into the new mblk_t so we
2214 * can modify it. Try to fill the new mblk_t since if we don't the
2215 * ethernet driver will.
2216 */
2217 mp_copy = allocb(sizeof (struct igmpmsg) + sizeof (ipha_t), BPRI_MED);
2218 if (mp_copy == NULL) {
2219 ++ipst->ips_mrtstat->mrts_pim_nomemory;
2220 if (ipst->ips_ip_mrtdebug > 3) {
2221 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2222 "register_send: allocb failure.");
2223 }
2224 return;
2225 }
2226
2227 /*
2228 * Bump write pointer to account for igmpmsg being added.
2229 */
2230 mp_copy->b_wptr = mp_copy->b_rptr + sizeof (struct igmpmsg);
2231
2232 /*
2233 * Chain packet to new mblk_t.
2234 */
2235 if ((mp_copy->b_cont = copymsg(mp)) == NULL) {
2236 ++ipst->ips_mrtstat->mrts_pim_nomemory;
2237 if (ipst->ips_ip_mrtdebug > 3) {
2238 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2239 "register_send: copymsg failure.");
2240 }
2241 freeb(mp_copy);
2242 return;
2243 }
2244
2245 /*
2246 * icmp_input() asserts that IP version field is set to an
2247 * appropriate version. Hence, the struct igmpmsg that this really
2248 * becomes, needs to have the correct IP version field.
2249 */
2250 ipha_copy = (ipha_t *)mp_copy->b_rptr;
2251 *ipha_copy = multicast_encap_iphdr;
2252
2253 /*
2254 * The kernel uses the struct igmpmsg header to encode the messages to
2255 * the multicast routing daemon. Fill in the fields in the header
2256 * starting with the message type which is IGMPMSG_WHOLEPKT
2257 */
2258 im = (struct igmpmsg *)mp_copy->b_rptr;
2259 im->im_msgtype = IGMPMSG_WHOLEPKT;
2260 im->im_src.s_addr = ipha->ipha_src;
2261 im->im_dst.s_addr = ipha->ipha_dst;
2262
2263 /*
2264 * Must Be Zero. This is because the struct igmpmsg is really an IP
2265 * header with renamed fields and the multicast routing daemon uses
2266 * an ipha_protocol (aka im_mbz) of 0 to distinguish these messages.
2267 */
2268 im->im_mbz = 0;
2269
2270 ++ipst->ips_mrtstat->mrts_upcalls;
2271 if (IPCL_IS_NONSTR(mrouter) ? mrouter->conn_flow_cntrld :
2272 !canputnext(mrouter->conn_rq)) {
2273 ++ipst->ips_mrtstat->mrts_pim_regsend_drops;
2274 if (ipst->ips_ip_mrtdebug > 3) {
2275 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2276 "register_send: register upcall failure.");
2277 }
2278 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2279 ip_drop_input("mrts_pim_regsend_drops", mp_copy, ill);
2280 freemsg(mp_copy);
2281 } else {
2282 /* Pass to RAWIP */
2283 bzero(&iras, sizeof (iras));
2284 iras.ira_flags = IRAF_IS_IPV4;
2285 iras.ira_ip_hdr_length = sizeof (ipha_t);
2286 iras.ira_pktlen = msgdsize(mp_copy);
2287 (mrouter->conn_recv)(mrouter, mp_copy, NULL, &iras);
2288 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
2289 }
2290 }
2291
2292 /*
2293 * pim_validate_cksum handles verification of the checksum in the
2294 * pim header. For PIM Register packets, the checksum is calculated
2295 * across the PIM header only. For all other packets, the checksum
2296 * is for the PIM header and remainder of the packet.
2297 *
2298 * returns: B_TRUE, if checksum is okay.
2299 * B_FALSE, if checksum is not valid.
2300 */
2301 static boolean_t
pim_validate_cksum(mblk_t * mp,ipha_t * ip,struct pim * pimp)2302 pim_validate_cksum(mblk_t *mp, ipha_t *ip, struct pim *pimp)
2303 {
2304 mblk_t *mp_dup;
2305
2306 if ((mp_dup = dupmsg(mp)) == NULL)
2307 return (B_FALSE);
2308
2309 mp_dup->b_rptr += IPH_HDR_LENGTH(ip);
2310 if (pimp->pim_type == PIM_REGISTER)
2311 mp_dup->b_wptr = mp_dup->b_rptr + PIM_MINLEN;
2312 if (IP_CSUM(mp_dup, 0, 0)) {
2313 freemsg(mp_dup);
2314 return (B_FALSE);
2315 }
2316 freemsg(mp_dup);
2317 return (B_TRUE);
2318 }
2319
2320 /*
2321 * Process PIM protocol packets i.e. IP Protocol 103.
2322 * Register messages are decapsulated and sent onto multicast forwarding.
2323 *
2324 * Return NULL for a bad packet that is discarded here.
2325 * Return mp if the message is OK and should be handed to "raw" receivers.
2326 * Callers of pim_input() may need to reinitialize variables that were copied
2327 * from the mblk as this calls pullupmsg().
2328 */
2329 mblk_t *
pim_input(mblk_t * mp,ip_recv_attr_t * ira)2330 pim_input(mblk_t *mp, ip_recv_attr_t *ira)
2331 {
2332 ipha_t *eip, *ip;
2333 int iplen, pimlen, iphlen;
2334 struct pim *pimp; /* pointer to a pim struct */
2335 uint32_t *reghdr;
2336 ill_t *ill = ira->ira_ill;
2337 ip_stack_t *ipst = ill->ill_ipst;
2338 conn_t *mrouter = ipst->ips_ip_g_mrouter;
2339
2340 /*
2341 * Pullup the msg for PIM protocol processing.
2342 */
2343 if (pullupmsg(mp, -1) == 0) {
2344 ++ipst->ips_mrtstat->mrts_pim_nomemory;
2345 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2346 ip_drop_input("mrts_pim_nomemory", mp, ill);
2347 freemsg(mp);
2348 return (NULL);
2349 }
2350
2351 ip = (ipha_t *)mp->b_rptr;
2352 iplen = ip->ipha_length;
2353 iphlen = IPH_HDR_LENGTH(ip);
2354 pimlen = ntohs(iplen) - iphlen;
2355
2356 /*
2357 * Validate lengths
2358 */
2359 if (pimlen < PIM_MINLEN) {
2360 ++ipst->ips_mrtstat->mrts_pim_malformed;
2361 if (ipst->ips_ip_mrtdebug > 1) {
2362 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2363 "pim_input: length not at least minlen");
2364 }
2365 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2366 ip_drop_input("mrts_pim_malformed", mp, ill);
2367 freemsg(mp);
2368 return (NULL);
2369 }
2370
2371 /*
2372 * Point to the PIM header.
2373 */
2374 pimp = (struct pim *)((caddr_t)ip + iphlen);
2375
2376 /*
2377 * Check the version number.
2378 */
2379 if (pimp->pim_vers != PIM_VERSION) {
2380 ++ipst->ips_mrtstat->mrts_pim_badversion;
2381 if (ipst->ips_ip_mrtdebug > 1) {
2382 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2383 "pim_input: unknown version of PIM");
2384 }
2385 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2386 ip_drop_input("mrts_pim_badversion", mp, ill);
2387 freemsg(mp);
2388 return (NULL);
2389 }
2390
2391 /*
2392 * Validate the checksum
2393 */
2394 if (!pim_validate_cksum(mp, ip, pimp)) {
2395 ++ipst->ips_mrtstat->mrts_pim_rcv_badcsum;
2396 if (ipst->ips_ip_mrtdebug > 1) {
2397 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2398 "pim_input: invalid checksum");
2399 }
2400 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2401 ip_drop_input("pim_rcv_badcsum", mp, ill);
2402 freemsg(mp);
2403 return (NULL);
2404 }
2405
2406 if (pimp->pim_type != PIM_REGISTER)
2407 return (mp);
2408
2409 reghdr = (uint32_t *)(pimp + 1);
2410 eip = (ipha_t *)(reghdr + 1);
2411
2412 /*
2413 * check if the inner packet is destined to mcast group
2414 */
2415 if (!CLASSD(eip->ipha_dst)) {
2416 ++ipst->ips_mrtstat->mrts_pim_badregisters;
2417 if (ipst->ips_ip_mrtdebug > 1) {
2418 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2419 "pim_input: Inner pkt not mcast .. !");
2420 }
2421 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2422 ip_drop_input("mrts_pim_badregisters", mp, ill);
2423 freemsg(mp);
2424 return (NULL);
2425 }
2426 if (ipst->ips_ip_mrtdebug > 1) {
2427 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2428 "register from %x, to %x, len %d",
2429 ntohl(eip->ipha_src),
2430 ntohl(eip->ipha_dst),
2431 ntohs(eip->ipha_length));
2432 }
2433 /*
2434 * If the null register bit is not set, decapsulate
2435 * the packet before forwarding it.
2436 * Avoid this in no register vif
2437 */
2438 if (!(ntohl(*reghdr) & PIM_NULL_REGISTER) &&
2439 ipst->ips_reg_vif_num != ALL_VIFS) {
2440 mblk_t *mp_copy;
2441 uint_t saved_pktlen;
2442
2443 /* Copy the message */
2444 if ((mp_copy = copymsg(mp)) == NULL) {
2445 ++ipst->ips_mrtstat->mrts_pim_nomemory;
2446 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2447 ip_drop_input("mrts_pim_nomemory", mp, ill);
2448 freemsg(mp);
2449 return (NULL);
2450 }
2451
2452 /*
2453 * Decapsulate the packet and give it to
2454 * register_mforward.
2455 */
2456 mp_copy->b_rptr += iphlen + sizeof (pim_t) + sizeof (*reghdr);
2457 saved_pktlen = ira->ira_pktlen;
2458 ira->ira_pktlen -= iphlen + sizeof (pim_t) + sizeof (*reghdr);
2459 if (register_mforward(mp_copy, ira) != 0) {
2460 /* register_mforward already called ip_drop_input */
2461 freemsg(mp);
2462 ira->ira_pktlen = saved_pktlen;
2463 return (NULL);
2464 }
2465 ira->ira_pktlen = saved_pktlen;
2466 }
2467
2468 /*
2469 * Pass all valid PIM packets up to any process(es) listening on a raw
2470 * PIM socket. For Solaris it is done right after pim_input() is
2471 * called.
2472 */
2473 return (mp);
2474 }
2475
2476 /*
2477 * PIM sparse mode hook. Called by pim_input after decapsulating
2478 * the packet. Loop back the packet, as if we have received it.
2479 * In pim_input() we have to check if the destination is a multicast address.
2480 */
2481 static int
register_mforward(mblk_t * mp,ip_recv_attr_t * ira)2482 register_mforward(mblk_t *mp, ip_recv_attr_t *ira)
2483 {
2484 ire_t *ire;
2485 ipha_t *ipha = (ipha_t *)mp->b_rptr;
2486 ill_t *ill = ira->ira_ill;
2487 ip_stack_t *ipst = ill->ill_ipst;
2488 conn_t *mrouter = ipst->ips_ip_g_mrouter;
2489
2490 ASSERT(ipst->ips_reg_vif_num <= ipst->ips_numvifs);
2491
2492 if (ipst->ips_ip_mrtdebug > 3) {
2493 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2494 "register_mforward: src %x, dst %x\n",
2495 ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst));
2496 }
2497 /*
2498 * Need to pass in to ip_mforward() the information that the
2499 * packet has arrived on the register_vif. We mark it with
2500 * the IRAF_PIM_REGISTER attribute.
2501 * pim_input verified that the (inner) destination is multicast,
2502 * hence we skip the generic code in ip_input.
2503 */
2504 ira->ira_flags |= IRAF_PIM_REGISTER;
2505 ++ipst->ips_mrtstat->mrts_pim_regforwards;
2506
2507 if (!CLASSD(ipha->ipha_dst)) {
2508 ire = ire_route_recursive_v4(ipha->ipha_dst, 0, NULL, ALL_ZONES,
2509 ira->ira_tsl, MATCH_IRE_SECATTR, IRR_ALLOCATE, 0, ipst,
2510 NULL, NULL, NULL);
2511 } else {
2512 ire = ire_multicast(ill);
2513 }
2514 ASSERT(ire != NULL);
2515 /* Normally this will return the IRE_MULTICAST */
2516 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
2517 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2518 ip_drop_input("mrts_pim RTF_REJECT", mp, ill);
2519 freemsg(mp);
2520 ire_refrele(ire);
2521 return (-1);
2522 }
2523 ASSERT(ire->ire_type & IRE_MULTICAST);
2524 (*ire->ire_recvfn)(ire, mp, ipha, ira);
2525 ire_refrele(ire);
2526
2527 return (0);
2528 }
2529
2530 /*
2531 * Send an encapsulated packet.
2532 * Caller assumes can continue to use mp when routine returns.
2533 */
2534 /* ARGSUSED */
2535 static void
encap_send(ipha_t * ipha,mblk_t * mp,struct vif * vifp,ipaddr_t dst)2536 encap_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
2537 {
2538 mblk_t *mp_copy;
2539 ipha_t *ipha_copy;
2540 size_t len;
2541 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2542 conn_t *mrouter = ipst->ips_ip_g_mrouter;
2543
2544 if (ipst->ips_ip_mrtdebug > 1) {
2545 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2546 "encap_send: vif %ld enter",
2547 (ptrdiff_t)(vifp - ipst->ips_vifs));
2548 }
2549 len = ntohs(ipha->ipha_length);
2550
2551 /*
2552 * Copy the old packet & pullup it's IP header into the
2553 * new mbuf so we can modify it. Try to fill the new
2554 * mbuf since if we don't the ethernet driver will.
2555 */
2556 mp_copy = allocb(32 + sizeof (multicast_encap_iphdr), BPRI_MED);
2557 if (mp_copy == NULL)
2558 return;
2559 mp_copy->b_rptr += 32;
2560 mp_copy->b_wptr = mp_copy->b_rptr + sizeof (multicast_encap_iphdr);
2561 if ((mp_copy->b_cont = copymsg(mp)) == NULL) {
2562 freeb(mp_copy);
2563 return;
2564 }
2565
2566 /*
2567 * Fill in the encapsulating IP header.
2568 * Remote tunnel dst in rmt_addr, from add_vif().
2569 */
2570 ipha_copy = (ipha_t *)mp_copy->b_rptr;
2571 *ipha_copy = multicast_encap_iphdr;
2572 ASSERT((len + sizeof (ipha_t)) <= IP_MAXPACKET);
2573 ipha_copy->ipha_length = htons(len + sizeof (ipha_t));
2574 ipha_copy->ipha_src = vifp->v_lcl_addr.s_addr;
2575 ipha_copy->ipha_dst = vifp->v_rmt_addr.s_addr;
2576 ASSERT(ipha_copy->ipha_ident == 0);
2577
2578 /* Turn the encapsulated IP header back into a valid one. */
2579 ipha = (ipha_t *)mp_copy->b_cont->b_rptr;
2580 ipha->ipha_ttl--;
2581 ipha->ipha_hdr_checksum = 0;
2582 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
2583
2584 ipha_copy->ipha_ttl = ipha->ipha_ttl;
2585
2586 if (ipst->ips_ip_mrtdebug > 1) {
2587 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2588 "encap_send: group 0x%x", ntohl(ipha->ipha_dst));
2589 }
2590 if (vifp->v_rate_limit <= 0)
2591 tbf_send_packet(vifp, mp_copy);
2592 else
2593 /* ipha is from the original header */
2594 tbf_control(vifp, mp_copy, ipha);
2595 }
2596
2597 /*
2598 * De-encapsulate a packet and feed it back through IP input if it
2599 * matches one of our multicast tunnels.
2600 *
2601 * This routine is called whenever IP gets a packet with prototype
2602 * IPPROTO_ENCAP and a local destination address and the packet didn't
2603 * match one of our configured IP-in-IP tunnels.
2604 */
2605 void
ip_mroute_decap(mblk_t * mp,ip_recv_attr_t * ira)2606 ip_mroute_decap(mblk_t *mp, ip_recv_attr_t *ira)
2607 {
2608 ipha_t *ipha = (ipha_t *)mp->b_rptr;
2609 ipha_t *ipha_encap;
2610 int hlen = IPH_HDR_LENGTH(ipha);
2611 int hlen_encap;
2612 ipaddr_t src;
2613 struct vif *vifp;
2614 ire_t *ire;
2615 ill_t *ill = ira->ira_ill;
2616 ip_stack_t *ipst = ill->ill_ipst;
2617 conn_t *mrouter = ipst->ips_ip_g_mrouter;
2618
2619 /* Make sure we have all of the inner header */
2620 ipha_encap = (ipha_t *)((char *)ipha + hlen);
2621 if (mp->b_wptr - mp->b_rptr < hlen + IP_SIMPLE_HDR_LENGTH) {
2622 ipha = ip_pullup(mp, hlen + IP_SIMPLE_HDR_LENGTH, ira);
2623 if (ipha == NULL) {
2624 ipst->ips_mrtstat->mrts_bad_tunnel++;
2625 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2626 ip_drop_input("ip_mroute_decap: too short", mp, ill);
2627 freemsg(mp);
2628 return;
2629 }
2630 ipha_encap = (ipha_t *)((char *)ipha + hlen);
2631 }
2632 hlen_encap = IPH_HDR_LENGTH(ipha_encap);
2633 if (mp->b_wptr - mp->b_rptr < hlen + hlen_encap) {
2634 ipha = ip_pullup(mp, hlen + hlen_encap, ira);
2635 if (ipha == NULL) {
2636 ipst->ips_mrtstat->mrts_bad_tunnel++;
2637 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2638 ip_drop_input("ip_mroute_decap: too short", mp, ill);
2639 freemsg(mp);
2640 return;
2641 }
2642 ipha_encap = (ipha_t *)((char *)ipha + hlen);
2643 }
2644
2645 /*
2646 * Dump the packet if it's not to a multicast destination or if
2647 * we don't have an encapsulating tunnel with the source.
2648 * Note: This code assumes that the remote site IP address
2649 * uniquely identifies the tunnel (i.e., that this site has
2650 * at most one tunnel with the remote site).
2651 */
2652 if (!CLASSD(ipha_encap->ipha_dst)) {
2653 ipst->ips_mrtstat->mrts_bad_tunnel++;
2654 ip1dbg(("ip_mroute_decap: bad tunnel\n"));
2655 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2656 ip_drop_input("mrts_bad_tunnel", mp, ill);
2657 freemsg(mp);
2658 return;
2659 }
2660 src = (ipaddr_t)ipha->ipha_src;
2661 mutex_enter(&ipst->ips_last_encap_lock);
2662 if (src != ipst->ips_last_encap_src) {
2663 struct vif *vife;
2664
2665 vifp = ipst->ips_vifs;
2666 vife = vifp + ipst->ips_numvifs;
2667 ipst->ips_last_encap_src = src;
2668 ipst->ips_last_encap_vif = 0;
2669 for (; vifp < vife; ++vifp) {
2670 if (!lock_good_vif(vifp))
2671 continue;
2672 if (vifp->v_rmt_addr.s_addr == src) {
2673 if (vifp->v_flags & VIFF_TUNNEL)
2674 ipst->ips_last_encap_vif = vifp;
2675 if (ipst->ips_ip_mrtdebug > 1) {
2676 (void) mi_strlog(mrouter->conn_rq,
2677 1, SL_TRACE,
2678 "ip_mroute_decap: good tun "
2679 "vif %ld with %x",
2680 (ptrdiff_t)(vifp - ipst->ips_vifs),
2681 ntohl(src));
2682 }
2683 unlock_good_vif(vifp);
2684 break;
2685 }
2686 unlock_good_vif(vifp);
2687 }
2688 }
2689 if ((vifp = ipst->ips_last_encap_vif) == 0) {
2690 mutex_exit(&ipst->ips_last_encap_lock);
2691 ipst->ips_mrtstat->mrts_bad_tunnel++;
2692 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2693 ip_drop_input("mrts_bad_tunnel", mp, ill);
2694 freemsg(mp);
2695 ip1dbg(("ip_mroute_decap: vif %ld no tunnel with %x\n",
2696 (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(src)));
2697 return;
2698 }
2699 mutex_exit(&ipst->ips_last_encap_lock);
2700
2701 /*
2702 * Need to pass in the tunnel source to ip_mforward (so that it can
2703 * verify that the packet arrived over the correct vif.)
2704 */
2705 ira->ira_flags |= IRAF_MROUTE_TUNNEL_SET;
2706 ira->ira_mroute_tunnel = src;
2707 mp->b_rptr += hlen;
2708 ira->ira_pktlen -= hlen;
2709 ira->ira_ip_hdr_length = hlen_encap;
2710
2711 /*
2712 * We don't redo any of the filtering in ill_input_full_v4 and we
2713 * have checked that all of ipha_encap and any IP options are
2714 * pulled up. Hence we call ire_recv_multicast_v4 directly.
2715 * However, we have to check for RSVP as in ip_input_full_v4
2716 * and if so we pass it to ire_recv_broadcast_v4 for local delivery
2717 * to the rsvpd.
2718 */
2719 if (ipha_encap->ipha_protocol == IPPROTO_RSVP &&
2720 ipst->ips_ipcl_proto_fanout_v4[IPPROTO_RSVP].connf_head != NULL) {
2721 ire = ire_route_recursive_v4(INADDR_BROADCAST, 0, ill,
2722 ALL_ZONES, ira->ira_tsl, MATCH_IRE_ILL|MATCH_IRE_SECATTR,
2723 IRR_ALLOCATE, 0, ipst, NULL, NULL, NULL);
2724 } else {
2725 ire = ire_multicast(ill);
2726 }
2727 ASSERT(ire != NULL);
2728 /* Normally this will return the IRE_MULTICAST or IRE_BROADCAST */
2729 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
2730 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2731 ip_drop_input("ip_mroute_decap: RTF_REJECT", mp, ill);
2732 freemsg(mp);
2733 ire_refrele(ire);
2734 return;
2735 }
2736 ire->ire_ib_pkt_count++;
2737 ASSERT(ire->ire_type & (IRE_MULTICAST|IRE_BROADCAST));
2738 (*ire->ire_recvfn)(ire, mp, ipha_encap, ira);
2739 ire_refrele(ire);
2740 }
2741
2742 /*
2743 * Remove all records with v_ipif == ipif. Called when an interface goes away
2744 * (stream closed). Called as writer.
2745 */
2746 void
reset_mrt_vif_ipif(ipif_t * ipif)2747 reset_mrt_vif_ipif(ipif_t *ipif)
2748 {
2749 vifi_t vifi, tmp_vifi;
2750 vifi_t num_of_vifs;
2751 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
2752
2753 /* Can't check vifi >= 0 since vifi_t is unsigned! */
2754
2755 mutex_enter(&ipst->ips_numvifs_mutex);
2756 num_of_vifs = ipst->ips_numvifs;
2757 mutex_exit(&ipst->ips_numvifs_mutex);
2758
2759 for (vifi = num_of_vifs; vifi != 0; vifi--) {
2760 tmp_vifi = vifi - 1;
2761 if (ipst->ips_vifs[tmp_vifi].v_ipif == ipif) {
2762 (void) del_vif(&tmp_vifi, ipst);
2763 }
2764 }
2765 }
2766
2767 /* Remove pending upcall msgs when ill goes away. Called by ill_delete. */
2768 void
reset_mrt_ill(ill_t * ill)2769 reset_mrt_ill(ill_t *ill)
2770 {
2771 struct mfc *rt;
2772 struct rtdetq *rte;
2773 int i;
2774 ip_stack_t *ipst = ill->ill_ipst;
2775 conn_t *mrouter = ipst->ips_ip_g_mrouter;
2776 timeout_id_t id;
2777
2778 for (i = 0; i < MFCTBLSIZ; i++) {
2779 MFCB_REFHOLD(&ipst->ips_mfcs[i]);
2780 if ((rt = ipst->ips_mfcs[i].mfcb_mfc) != NULL) {
2781 if (ipst->ips_ip_mrtdebug > 1) {
2782 (void) mi_strlog(mrouter->conn_rq, 1,
2783 SL_TRACE,
2784 "reset_mrt_ill: mfctable [%d]", i);
2785 }
2786 while (rt != NULL) {
2787 mutex_enter(&rt->mfc_mutex);
2788 while ((rte = rt->mfc_rte) != NULL) {
2789 if (rte->ill == ill &&
2790 (id = rt->mfc_timeout_id) != 0) {
2791 /*
2792 * Its ok to drop the lock, the
2793 * struct cannot be freed since
2794 * we have a ref on the hash
2795 * bucket.
2796 */
2797 mutex_exit(&rt->mfc_mutex);
2798 (void) untimeout(id);
2799 mutex_enter(&rt->mfc_mutex);
2800 }
2801 if (rte->ill == ill) {
2802 if (ipst->ips_ip_mrtdebug > 1) {
2803 (void) mi_strlog(
2804 mrouter->conn_rq,
2805 1, SL_TRACE,
2806 "reset_mrt_ill: "
2807 "ill 0x%p", (void *)ill);
2808 }
2809 rt->mfc_rte = rte->rte_next;
2810 freemsg(rte->mp);
2811 mi_free((char *)rte);
2812 }
2813 }
2814 mutex_exit(&rt->mfc_mutex);
2815 rt = rt->mfc_next;
2816 }
2817 }
2818 MFCB_REFRELE(&ipst->ips_mfcs[i]);
2819 }
2820 }
2821
2822 /*
2823 * Token bucket filter module.
2824 * The ipha is for mcastgrp destination for phyint and encap.
2825 */
2826 static void
tbf_control(struct vif * vifp,mblk_t * mp,ipha_t * ipha)2827 tbf_control(struct vif *vifp, mblk_t *mp, ipha_t *ipha)
2828 {
2829 size_t p_len = msgdsize(mp);
2830 struct tbf *t = vifp->v_tbf;
2831 timeout_id_t id = 0;
2832 ill_t *ill = vifp->v_ipif->ipif_ill;
2833 ip_stack_t *ipst = ill->ill_ipst;
2834 conn_t *mrouter = ipst->ips_ip_g_mrouter;
2835
2836 /* Drop if packet is too large */
2837 if (p_len > MAX_BKT_SIZE) {
2838 ipst->ips_mrtstat->mrts_pkt2large++;
2839 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2840 ip_drop_output("tbf_control - too large", mp, ill);
2841 freemsg(mp);
2842 return;
2843 }
2844 if (ipst->ips_ip_mrtdebug > 1) {
2845 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2846 "tbf_ctrl: SEND vif %ld, qlen %d, ipha_dst 0x%x",
2847 (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_q_len,
2848 ntohl(ipha->ipha_dst));
2849 }
2850
2851 mutex_enter(&t->tbf_lock);
2852
2853 tbf_update_tokens(vifp);
2854
2855 /*
2856 * If there are enough tokens,
2857 * and the queue is empty, send this packet out.
2858 */
2859 if (ipst->ips_ip_mrtdebug > 1) {
2860 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2861 "tbf_control: vif %ld, TOKENS %d, pkt len %lu, qlen %d",
2862 (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_n_tok, p_len,
2863 t->tbf_q_len);
2864 }
2865 /* No packets are queued */
2866 if (t->tbf_q_len == 0) {
2867 /* queue empty, send packet if enough tokens */
2868 if (p_len <= t->tbf_n_tok) {
2869 t->tbf_n_tok -= p_len;
2870 mutex_exit(&t->tbf_lock);
2871 tbf_send_packet(vifp, mp);
2872 return;
2873 } else {
2874 /* Queue packet and timeout till later */
2875 tbf_queue(vifp, mp);
2876 ASSERT(vifp->v_timeout_id == 0);
2877 vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp,
2878 TBF_REPROCESS);
2879 }
2880 } else if (t->tbf_q_len < t->tbf_max_q_len) {
2881 /* Finite queue length, so queue pkts and process queue */
2882 tbf_queue(vifp, mp);
2883 tbf_process_q(vifp);
2884 } else {
2885 /* Check that we have UDP header with IP header */
2886 size_t hdr_length = IPH_HDR_LENGTH(ipha) +
2887 sizeof (struct udphdr);
2888
2889 if ((mp->b_wptr - mp->b_rptr) < hdr_length) {
2890 if (!pullupmsg(mp, hdr_length)) {
2891 BUMP_MIB(ill->ill_ip_mib,
2892 ipIfStatsOutDiscards);
2893 ip_drop_output("tbf_control - pullup", mp, ill);
2894 freemsg(mp);
2895 ip1dbg(("tbf_ctl: couldn't pullup udp hdr, "
2896 "vif %ld src 0x%x dst 0x%x\n",
2897 (ptrdiff_t)(vifp - ipst->ips_vifs),
2898 ntohl(ipha->ipha_src),
2899 ntohl(ipha->ipha_dst)));
2900 mutex_exit(&vifp->v_tbf->tbf_lock);
2901 return;
2902 } else
2903 /* Have to reassign ipha after pullupmsg */
2904 ipha = (ipha_t *)mp->b_rptr;
2905 }
2906 /*
2907 * Queue length too much,
2908 * try to selectively dq, or queue and process
2909 */
2910 if (!tbf_dq_sel(vifp, ipha)) {
2911 ipst->ips_mrtstat->mrts_q_overflow++;
2912 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2913 ip_drop_output("mrts_q_overflow", mp, ill);
2914 freemsg(mp);
2915 } else {
2916 tbf_queue(vifp, mp);
2917 tbf_process_q(vifp);
2918 }
2919 }
2920 if (t->tbf_q_len == 0) {
2921 id = vifp->v_timeout_id;
2922 vifp->v_timeout_id = 0;
2923 }
2924 mutex_exit(&vifp->v_tbf->tbf_lock);
2925 if (id != 0)
2926 (void) untimeout(id);
2927 }
2928
2929 /*
2930 * Adds a packet to the tbf queue at the interface.
2931 * The ipha is for mcastgrp destination for phyint and encap.
2932 */
2933 static void
tbf_queue(struct vif * vifp,mblk_t * mp)2934 tbf_queue(struct vif *vifp, mblk_t *mp)
2935 {
2936 struct tbf *t = vifp->v_tbf;
2937 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2938 conn_t *mrouter = ipst->ips_ip_g_mrouter;
2939
2940 if (ipst->ips_ip_mrtdebug > 1) {
2941 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2942 "tbf_queue: vif %ld", (ptrdiff_t)(vifp - ipst->ips_vifs));
2943 }
2944 ASSERT(MUTEX_HELD(&t->tbf_lock));
2945
2946 if (t->tbf_t == NULL) {
2947 /* Queue was empty */
2948 t->tbf_q = mp;
2949 } else {
2950 /* Insert at tail */
2951 t->tbf_t->b_next = mp;
2952 }
2953 /* set new tail pointer */
2954 t->tbf_t = mp;
2955
2956 mp->b_next = mp->b_prev = NULL;
2957
2958 t->tbf_q_len++;
2959 }
2960
2961 /*
2962 * Process the queue at the vif interface.
2963 * Drops the tbf_lock when sending packets.
2964 *
2965 * NOTE : The caller should quntimeout if the queue length is 0.
2966 */
2967 static void
tbf_process_q(struct vif * vifp)2968 tbf_process_q(struct vif *vifp)
2969 {
2970 mblk_t *mp;
2971 struct tbf *t = vifp->v_tbf;
2972 size_t len;
2973 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2974 conn_t *mrouter = ipst->ips_ip_g_mrouter;
2975
2976 if (ipst->ips_ip_mrtdebug > 1) {
2977 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2978 "tbf_process_q 1: vif %ld qlen = %d",
2979 (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_q_len);
2980 }
2981
2982 /*
2983 * Loop through the queue at the interface and send
2984 * as many packets as possible.
2985 */
2986 ASSERT(MUTEX_HELD(&t->tbf_lock));
2987
2988 while (t->tbf_q_len > 0) {
2989 mp = t->tbf_q;
2990 len = (size_t)msgdsize(mp); /* length of ip pkt */
2991
2992 /* Determine if the packet can be sent */
2993 if (len <= t->tbf_n_tok) {
2994 /*
2995 * If so, reduce no. of tokens, dequeue the packet,
2996 * send the packet.
2997 */
2998 t->tbf_n_tok -= len;
2999
3000 t->tbf_q = mp->b_next;
3001 if (--t->tbf_q_len == 0) {
3002 t->tbf_t = NULL;
3003 }
3004 mp->b_next = NULL;
3005 /* Exit mutex before sending packet, then re-enter */
3006 mutex_exit(&t->tbf_lock);
3007 tbf_send_packet(vifp, mp);
3008 mutex_enter(&t->tbf_lock);
3009 } else
3010 break;
3011 }
3012 }
3013
3014 /* Called at tbf timeout to update tokens, process q and reset timer. */
3015 static void
tbf_reprocess_q(void * arg)3016 tbf_reprocess_q(void *arg)
3017 {
3018 struct vif *vifp = arg;
3019 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
3020 conn_t *mrouter = ipst->ips_ip_g_mrouter;
3021
3022 mutex_enter(&vifp->v_tbf->tbf_lock);
3023 vifp->v_timeout_id = 0;
3024 tbf_update_tokens(vifp);
3025
3026 tbf_process_q(vifp);
3027
3028 if (vifp->v_tbf->tbf_q_len > 0) {
3029 vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp,
3030 TBF_REPROCESS);
3031 }
3032 mutex_exit(&vifp->v_tbf->tbf_lock);
3033
3034 if (ipst->ips_ip_mrtdebug > 1) {
3035 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3036 "tbf_reprcess_q: vif %ld timeout id = %p",
3037 (ptrdiff_t)(vifp - ipst->ips_vifs), vifp->v_timeout_id);
3038 }
3039 }
3040
3041 /*
3042 * Function that will selectively discard a member of the tbf queue,
3043 * based on the precedence value and the priority.
3044 *
3045 * NOTE : The caller should quntimeout if the queue length is 0.
3046 */
3047 static int
tbf_dq_sel(struct vif * vifp,ipha_t * ipha)3048 tbf_dq_sel(struct vif *vifp, ipha_t *ipha)
3049 {
3050 uint_t p;
3051 struct tbf *t = vifp->v_tbf;
3052 mblk_t **np;
3053 mblk_t *last, *mp;
3054 ill_t *ill = vifp->v_ipif->ipif_ill;
3055 ip_stack_t *ipst = ill->ill_ipst;
3056 conn_t *mrouter = ipst->ips_ip_g_mrouter;
3057
3058 if (ipst->ips_ip_mrtdebug > 1) {
3059 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3060 "dq_sel: vif %ld dst 0x%x",
3061 (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(ipha->ipha_dst));
3062 }
3063
3064 ASSERT(MUTEX_HELD(&t->tbf_lock));
3065 p = priority(vifp, ipha);
3066
3067 np = &t->tbf_q;
3068 last = NULL;
3069 while ((mp = *np) != NULL) {
3070 if (p > (priority(vifp, (ipha_t *)mp->b_rptr))) {
3071 *np = mp->b_next;
3072 /* If removing the last packet, fix the tail pointer */
3073 if (mp == t->tbf_t)
3074 t->tbf_t = last;
3075 mp->b_prev = mp->b_next = NULL;
3076 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
3077 ip_drop_output("tbf_dq_send", mp, ill);
3078 freemsg(mp);
3079 /*
3080 * It's impossible for the queue to be empty, but
3081 * we check anyway.
3082 */
3083 if (--t->tbf_q_len == 0) {
3084 t->tbf_t = NULL;
3085 }
3086 ipst->ips_mrtstat->mrts_drop_sel++;
3087 return (1);
3088 }
3089 np = &mp->b_next;
3090 last = mp;
3091 }
3092 return (0);
3093 }
3094
3095 /* Sends packet, 2 cases - encap tunnel, phyint. */
3096 static void
tbf_send_packet(struct vif * vifp,mblk_t * mp)3097 tbf_send_packet(struct vif *vifp, mblk_t *mp)
3098 {
3099 ipif_t *ipif = vifp->v_ipif;
3100 ill_t *ill = ipif->ipif_ill;
3101 ip_stack_t *ipst = ill->ill_ipst;
3102 conn_t *mrouter = ipst->ips_ip_g_mrouter;
3103 ipha_t *ipha;
3104
3105 ipha = (ipha_t *)mp->b_rptr;
3106 /* If encap tunnel options */
3107 if (vifp->v_flags & VIFF_TUNNEL) {
3108 ip_xmit_attr_t ixas;
3109
3110 if (ipst->ips_ip_mrtdebug > 1) {
3111 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3112 "tbf_send_packet: ENCAP tunnel vif %ld",
3113 (ptrdiff_t)(vifp - ipst->ips_vifs));
3114 }
3115 bzero(&ixas, sizeof (ixas));
3116 ixas.ixa_flags =
3117 IXAF_IS_IPV4 | IXAF_NO_TTL_CHANGE | IXAF_VERIFY_SOURCE;
3118 ixas.ixa_ipst = ipst;
3119 ixas.ixa_ifindex = 0;
3120 ixas.ixa_cred = kcred;
3121 ixas.ixa_cpid = NOPID;
3122 ixas.ixa_tsl = NULL;
3123 ixas.ixa_zoneid = GLOBAL_ZONEID; /* Multicast router in GZ */
3124 ixas.ixa_pktlen = ntohs(ipha->ipha_length);
3125 ixas.ixa_ip_hdr_length = IPH_HDR_LENGTH(ipha);
3126
3127 /*
3128 * Feed into ip_output_simple which will set the ident field
3129 * and checksum the encapsulating header.
3130 * BSD gets the cached route vifp->v_route from ip_output()
3131 * to speed up route table lookups. Not necessary in SunOS 5.x.
3132 * One could make multicast forwarding faster by putting an
3133 * ip_xmit_attr_t in each vif thereby caching the ire/nce.
3134 */
3135 (void) ip_output_simple(mp, &ixas);
3136 ixa_cleanup(&ixas);
3137 return;
3138
3139 /* phyint */
3140 } else {
3141 /* Need to loop back to members on the outgoing interface. */
3142 ipaddr_t dst;
3143 ip_recv_attr_t iras;
3144 nce_t *nce;
3145
3146 bzero(&iras, sizeof (iras));
3147 iras.ira_flags = IRAF_IS_IPV4;
3148 iras.ira_ill = iras.ira_rill = ill;
3149 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
3150 iras.ira_zoneid = GLOBAL_ZONEID; /* Multicast router in GZ */
3151 iras.ira_pktlen = ntohs(ipha->ipha_length);
3152 iras.ira_ip_hdr_length = IPH_HDR_LENGTH(ipha);
3153
3154 dst = ipha->ipha_dst;
3155 if (ill_hasmembers_v4(ill, dst)) {
3156 iras.ira_flags |= IRAF_LOOPBACK_COPY;
3157 }
3158 if (ipst->ips_ip_mrtdebug > 1) {
3159 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3160 "tbf_send_pkt: phyint forward vif %ld dst = 0x%x",
3161 (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(dst));
3162 }
3163 /*
3164 * Find an NCE which matches the nexthop.
3165 * For a pt-pt interface we use the other end of the pt-pt
3166 * link.
3167 */
3168 if (ipif->ipif_flags & IPIF_POINTOPOINT) {
3169 dst = ipif->ipif_pp_dst_addr;
3170 nce = arp_nce_init(ill, dst, ill->ill_net_type);
3171 } else {
3172 nce = arp_nce_init(ill, dst, IRE_MULTICAST);
3173 }
3174 if (nce == NULL) {
3175 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
3176 ip_drop_output("tbf_send_packet - no nce", mp, ill);
3177 freemsg(mp);
3178 return;
3179 }
3180
3181 /*
3182 * We don't remeber the incoming ill. Thus we
3183 * pretend the packet arrived on the outbound ill. This means
3184 * statistics for input errors will be increased on the wrong
3185 * ill but that isn't a big deal.
3186 */
3187 ip_forward_xmit_v4(nce, ill, mp, ipha, &iras, ill->ill_mc_mtu,
3188 0);
3189 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
3190
3191 nce_refrele(nce);
3192 }
3193 }
3194
3195 /*
3196 * Determine the current time and then the elapsed time (between the last time
3197 * and time now). Update the no. of tokens in the bucket.
3198 */
3199 static void
tbf_update_tokens(struct vif * vifp)3200 tbf_update_tokens(struct vif *vifp)
3201 {
3202 timespec_t tp;
3203 hrtime_t tm;
3204 struct tbf *t = vifp->v_tbf;
3205 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
3206 conn_t *mrouter = ipst->ips_ip_g_mrouter;
3207
3208 ASSERT(MUTEX_HELD(&t->tbf_lock));
3209
3210 /* Time in secs and nsecs, rate limit in kbits/sec */
3211 gethrestime(&tp);
3212
3213 /*LINTED*/
3214 TV_DELTA(tp, t->tbf_last_pkt_t, tm);
3215
3216 /*
3217 * This formula is actually
3218 * "time in seconds" * "bytes/second". Scaled for nsec.
3219 * (tm/1000000000) * (v_rate_limit * 1000 * (1000/1024) /8)
3220 *
3221 * The (1000/1024) was introduced in add_vif to optimize
3222 * this divide into a shift.
3223 */
3224 t->tbf_n_tok += (tm/1000) * vifp->v_rate_limit / 1024 / 8;
3225 t->tbf_last_pkt_t = tp;
3226
3227 if (t->tbf_n_tok > MAX_BKT_SIZE)
3228 t->tbf_n_tok = MAX_BKT_SIZE;
3229 if (ipst->ips_ip_mrtdebug > 1) {
3230 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3231 "tbf_update_tok: tm %lld tok %d vif %ld",
3232 tm, t->tbf_n_tok, (ptrdiff_t)(vifp - ipst->ips_vifs));
3233 }
3234 }
3235
3236 /*
3237 * Priority currently is based on port nos.
3238 * Different forwarding mechanisms have different ways
3239 * of obtaining the port no. Hence, the vif must be
3240 * given along with the packet itself.
3241 *
3242 */
3243 static int
priority(struct vif * vifp,ipha_t * ipha)3244 priority(struct vif *vifp, ipha_t *ipha)
3245 {
3246 int prio;
3247 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
3248 conn_t *mrouter = ipst->ips_ip_g_mrouter;
3249
3250 /* Temporary hack; may add general packet classifier some day */
3251
3252 ASSERT(MUTEX_HELD(&vifp->v_tbf->tbf_lock));
3253
3254 /*
3255 * The UDP port space is divided up into four priority ranges:
3256 * [0, 16384) : unclassified - lowest priority
3257 * [16384, 32768) : audio - highest priority
3258 * [32768, 49152) : whiteboard - medium priority
3259 * [49152, 65536) : video - low priority
3260 */
3261
3262 if (ipha->ipha_protocol == IPPROTO_UDP) {
3263 struct udphdr *udp =
3264 (struct udphdr *)((char *)ipha + IPH_HDR_LENGTH(ipha));
3265 switch (ntohs(udp->uh_dport) & 0xc000) {
3266 case 0x4000:
3267 prio = 70;
3268 break;
3269 case 0x8000:
3270 prio = 60;
3271 break;
3272 case 0xc000:
3273 prio = 55;
3274 break;
3275 default:
3276 prio = 50;
3277 break;
3278 }
3279 if (ipst->ips_ip_mrtdebug > 1) {
3280 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3281 "priority: port %x prio %d\n",
3282 ntohs(udp->uh_dport), prio);
3283 }
3284 } else
3285 prio = 50; /* default priority */
3286 return (prio);
3287 }
3288
3289 /*
3290 * End of token bucket filter modifications
3291 */
3292
3293
3294
3295 /*
3296 * Produces data for netstat -M.
3297 */
3298 int
ip_mroute_stats(mblk_t * mp,ip_stack_t * ipst)3299 ip_mroute_stats(mblk_t *mp, ip_stack_t *ipst)
3300 {
3301 ipst->ips_mrtstat->mrts_vifctlSize = sizeof (struct vifctl);
3302 ipst->ips_mrtstat->mrts_mfcctlSize = sizeof (struct mfcctl);
3303 if (!snmp_append_data(mp, (char *)ipst->ips_mrtstat,
3304 sizeof (struct mrtstat))) {
3305 ip0dbg(("ip_mroute_stats: failed %ld bytes\n",
3306 (size_t)sizeof (struct mrtstat)));
3307 return (0);
3308 }
3309 return (1);
3310 }
3311
3312 /*
3313 * Sends info for SNMP's MIB.
3314 */
3315 int
ip_mroute_vif(mblk_t * mp,ip_stack_t * ipst)3316 ip_mroute_vif(mblk_t *mp, ip_stack_t *ipst)
3317 {
3318 struct vifctl vi;
3319 vifi_t vifi;
3320
3321 mutex_enter(&ipst->ips_numvifs_mutex);
3322 for (vifi = 0; vifi < ipst->ips_numvifs; vifi++) {
3323 if (ipst->ips_vifs[vifi].v_lcl_addr.s_addr == 0)
3324 continue;
3325 /*
3326 * No locks here, an approximation is fine.
3327 */
3328 vi.vifc_vifi = vifi;
3329 vi.vifc_flags = ipst->ips_vifs[vifi].v_flags;
3330 vi.vifc_threshold = ipst->ips_vifs[vifi].v_threshold;
3331 vi.vifc_rate_limit = ipst->ips_vifs[vifi].v_rate_limit;
3332 vi.vifc_lcl_addr = ipst->ips_vifs[vifi].v_lcl_addr;
3333 vi.vifc_rmt_addr = ipst->ips_vifs[vifi].v_rmt_addr;
3334 vi.vifc_pkt_in = ipst->ips_vifs[vifi].v_pkt_in;
3335 vi.vifc_pkt_out = ipst->ips_vifs[vifi].v_pkt_out;
3336
3337 if (!snmp_append_data(mp, (char *)&vi, sizeof (vi))) {
3338 ip0dbg(("ip_mroute_vif: failed %ld bytes\n",
3339 (size_t)sizeof (vi)));
3340 mutex_exit(&ipst->ips_numvifs_mutex);
3341 return (0);
3342 }
3343 }
3344 mutex_exit(&ipst->ips_numvifs_mutex);
3345 return (1);
3346 }
3347
3348 /*
3349 * Called by ip_snmp_get to send up multicast routing table.
3350 */
3351 int
ip_mroute_mrt(mblk_t * mp,ip_stack_t * ipst)3352 ip_mroute_mrt(mblk_t *mp, ip_stack_t *ipst)
3353 {
3354 int i, j;
3355 struct mfc *rt;
3356 struct mfcctl mfcc;
3357
3358 /*
3359 * Make sure multicast has not been turned off.
3360 */
3361 if (is_mrouter_off(ipst))
3362 return (1);
3363
3364 /* Loop over all hash buckets and their chains */
3365 for (i = 0; i < MFCTBLSIZ; i++) {
3366 MFCB_REFHOLD(&ipst->ips_mfcs[i]);
3367 for (rt = ipst->ips_mfcs[i].mfcb_mfc; rt; rt = rt->mfc_next) {
3368 mutex_enter(&rt->mfc_mutex);
3369 if (rt->mfc_rte != NULL ||
3370 (rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
3371 mutex_exit(&rt->mfc_mutex);
3372 continue;
3373 }
3374 mfcc.mfcc_origin = rt->mfc_origin;
3375 mfcc.mfcc_mcastgrp = rt->mfc_mcastgrp;
3376 mfcc.mfcc_parent = rt->mfc_parent;
3377 mfcc.mfcc_pkt_cnt = rt->mfc_pkt_cnt;
3378 mutex_enter(&ipst->ips_numvifs_mutex);
3379 for (j = 0; j < (int)ipst->ips_numvifs; j++)
3380 mfcc.mfcc_ttls[j] = rt->mfc_ttls[j];
3381 for (j = (int)ipst->ips_numvifs; j < MAXVIFS; j++)
3382 mfcc.mfcc_ttls[j] = 0;
3383 mutex_exit(&ipst->ips_numvifs_mutex);
3384
3385 mutex_exit(&rt->mfc_mutex);
3386 if (!snmp_append_data(mp, (char *)&mfcc,
3387 sizeof (mfcc))) {
3388 MFCB_REFRELE(&ipst->ips_mfcs[i]);
3389 ip0dbg(("ip_mroute_mrt: failed %ld bytes\n",
3390 (size_t)sizeof (mfcc)));
3391 return (0);
3392 }
3393 }
3394 MFCB_REFRELE(&ipst->ips_mfcs[i]);
3395 }
3396 return (1);
3397 }
3398