xref: /linux/net/ipv4/ipmr.c (revision f3d9478b2ce468c3115b02ecae7e975990697f15)
1 /*
2  *	IP multicast routing support for mrouted 3.6/3.8
3  *
4  *		(c) 1995 Alan Cox, <alan@redhat.com>
5  *	  Linux Consultancy and Custom Driver Development
6  *
7  *	This program is free software; you can redistribute it and/or
8  *	modify it under the terms of the GNU General Public License
9  *	as published by the Free Software Foundation; either version
10  *	2 of the License, or (at your option) any later version.
11  *
12  *	Version: $Id: ipmr.c,v 1.65 2001/10/31 21:55:54 davem Exp $
13  *
14  *	Fixes:
15  *	Michael Chastain	:	Incorrect size of copying.
16  *	Alan Cox		:	Added the cache manager code
17  *	Alan Cox		:	Fixed the clone/copy bug and device race.
18  *	Mike McLagan		:	Routing by source
19  *	Malcolm Beattie		:	Buffer handling fixes.
20  *	Alexey Kuznetsov	:	Double buffer free and other fixes.
21  *	SVR Anand		:	Fixed several multicast bugs and problems.
22  *	Alexey Kuznetsov	:	Status, optimisations and more.
23  *	Brad Parker		:	Better behaviour on mrouted upcall
24  *					overflow.
25  *	Carlos Picoto		:	PIMv1 Support
26  *	Pavlin Ivanov Radoslavov:	PIMv2 Registers must checksum only PIM header
27  *					Relax this requirement to work with older peers.
28  *
29  */
30 
31 #include <linux/config.h>
32 #include <asm/system.h>
33 #include <asm/uaccess.h>
34 #include <linux/types.h>
35 #include <linux/sched.h>
36 #include <linux/capability.h>
37 #include <linux/errno.h>
38 #include <linux/timer.h>
39 #include <linux/mm.h>
40 #include <linux/kernel.h>
41 #include <linux/fcntl.h>
42 #include <linux/stat.h>
43 #include <linux/socket.h>
44 #include <linux/in.h>
45 #include <linux/inet.h>
46 #include <linux/netdevice.h>
47 #include <linux/inetdevice.h>
48 #include <linux/igmp.h>
49 #include <linux/proc_fs.h>
50 #include <linux/seq_file.h>
51 #include <linux/mroute.h>
52 #include <linux/init.h>
53 #include <linux/if_ether.h>
54 #include <net/ip.h>
55 #include <net/protocol.h>
56 #include <linux/skbuff.h>
57 #include <net/route.h>
58 #include <net/sock.h>
59 #include <net/icmp.h>
60 #include <net/udp.h>
61 #include <net/raw.h>
62 #include <linux/notifier.h>
63 #include <linux/if_arp.h>
64 #include <linux/netfilter_ipv4.h>
65 #include <net/ipip.h>
66 #include <net/checksum.h>
67 
68 #if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
69 #define CONFIG_IP_PIMSM	1
70 #endif
71 
72 static struct sock *mroute_socket;
73 
74 
75 /* Big lock, protecting the vif table, mrt cache and mroute socket state.
76    Note that updates are serialized via rtnl_lock.
77  */
78 
79 static DEFINE_RWLOCK(mrt_lock);
80 
81 /*
82  *	Multicast router control variables
83  */
84 
85 static struct vif_device vif_table[MAXVIFS];		/* Devices 		*/
86 static int maxvif;
87 
88 #define VIF_EXISTS(idx) (vif_table[idx].dev != NULL)
89 
90 static int mroute_do_assert;				/* Set in PIM assert	*/
91 static int mroute_do_pim;
92 
93 static struct mfc_cache *mfc_cache_array[MFC_LINES];	/* Forwarding cache	*/
94 
95 static struct mfc_cache *mfc_unres_queue;		/* Queue of unresolved entries */
96 static atomic_t cache_resolve_queue_len;		/* Size of unresolved	*/
97 
98 /* Special spinlock for queue of unresolved entries */
99 static DEFINE_SPINLOCK(mfc_unres_lock);
100 
101 /* We return to Alan's original scheme. The hash table of resolved
102    entries is changed only in process context and is protected by
103    the weak (reader-writer) lock mrt_lock. The queue of unresolved
104    entries is protected by the strong spinlock mfc_unres_lock.
105 
106    This way the data path is entirely free of exclusive locks.
107  */
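/* For illustration (an editorial sketch, not part of the original file):
   the pairing described above. Control-path updates take mrt_lock as a
   writer with BH disabled, while the forwarding path only ever reads:

	write_lock_bh(&mrt_lock);
	v->dev = dev;				(update vif/mfc tables)
	write_unlock_bh(&mrt_lock);

	read_lock(&mrt_lock);
	c = ipmr_cache_find(saddr, daddr);	(data path, shared)
	read_unlock(&mrt_lock);
 */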
108 
109 static kmem_cache_t *mrt_cachep __read_mostly;
110 
111 static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
112 static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
113 static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm);
114 
115 #ifdef CONFIG_IP_PIMSM_V2
116 static struct net_protocol pim_protocol;
117 #endif
118 
119 static struct timer_list ipmr_expire_timer;
120 
121 /* Service routines for creating virtual interfaces: DVMRP tunnels and PIMREG */
122 
123 static
124 struct net_device *ipmr_new_tunnel(struct vifctl *v)
125 {
126 	struct net_device  *dev;
127 
128 	dev = __dev_get_by_name("tunl0");
129 
130 	if (dev) {
131 		int err;
132 		struct ifreq ifr;
133 		mm_segment_t	oldfs;
134 		struct ip_tunnel_parm p;
135 		struct in_device  *in_dev;
136 
137 		memset(&p, 0, sizeof(p));
138 		p.iph.daddr = v->vifc_rmt_addr.s_addr;
139 		p.iph.saddr = v->vifc_lcl_addr.s_addr;
140 		p.iph.version = 4;
141 		p.iph.ihl = 5;
142 		p.iph.protocol = IPPROTO_IPIP;
143 		sprintf(p.name, "dvmrp%d", v->vifc_vifi);
144 		ifr.ifr_ifru.ifru_data = (void*)&p;
145 
146 		oldfs = get_fs(); set_fs(KERNEL_DS);
147 		err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL);
148 		set_fs(oldfs);
149 
150 		dev = NULL;
151 
152 		if (err == 0 && (dev = __dev_get_by_name(p.name)) != NULL) {
153 			dev->flags |= IFF_MULTICAST;
154 
155 			in_dev = __in_dev_get_rtnl(dev);
156 			if (in_dev == NULL && (in_dev = inetdev_init(dev)) == NULL)
157 				goto failure;
158 			in_dev->cnf.rp_filter = 0;
159 
160 			if (dev_open(dev))
161 				goto failure;
162 		}
163 	}
164 	return dev;
165 
166 failure:
167 	/* allow the register to be completed before unregistering. */
168 	rtnl_unlock();
169 	rtnl_lock();
170 
171 	unregister_netdevice(dev);
172 	return NULL;
173 }
174 
175 #ifdef CONFIG_IP_PIMSM
176 
177 static int reg_vif_num = -1;
178 
179 static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
180 {
181 	read_lock(&mrt_lock);
182 	((struct net_device_stats*)netdev_priv(dev))->tx_bytes += skb->len;
183 	((struct net_device_stats*)netdev_priv(dev))->tx_packets++;
184 	ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT);
185 	read_unlock(&mrt_lock);
186 	kfree_skb(skb);
187 	return 0;
188 }
189 
190 static struct net_device_stats *reg_vif_get_stats(struct net_device *dev)
191 {
192 	return (struct net_device_stats*)netdev_priv(dev);
193 }
194 
195 static void reg_vif_setup(struct net_device *dev)
196 {
197 	dev->type		= ARPHRD_PIMREG;
198 	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 8;
199 	dev->flags		= IFF_NOARP;
200 	dev->hard_start_xmit	= reg_vif_xmit;
201 	dev->get_stats		= reg_vif_get_stats;
202 	dev->destructor		= free_netdev;
203 }
204 
205 static struct net_device *ipmr_reg_vif(void)
206 {
207 	struct net_device *dev;
208 	struct in_device *in_dev;
209 
210 	dev = alloc_netdev(sizeof(struct net_device_stats), "pimreg",
211 			   reg_vif_setup);
212 
213 	if (dev == NULL)
214 		return NULL;
215 
216 	if (register_netdevice(dev)) {
217 		free_netdev(dev);
218 		return NULL;
219 	}
220 	dev->iflink = 0;
221 
222 	if ((in_dev = inetdev_init(dev)) == NULL)
223 		goto failure;
224 
225 	in_dev->cnf.rp_filter = 0;
226 
227 	if (dev_open(dev))
228 		goto failure;
229 
230 	return dev;
231 
232 failure:
233 	/* allow the register to be completed before unregistering. */
234 	rtnl_unlock();
235 	rtnl_lock();
236 
237 	unregister_netdevice(dev);
238 	return NULL;
239 }
240 #endif
241 
242 /*
243  *	Delete a VIF entry
244  */
245 
246 static int vif_delete(int vifi)
247 {
248 	struct vif_device *v;
249 	struct net_device *dev;
250 	struct in_device *in_dev;
251 
252 	if (vifi < 0 || vifi >= maxvif)
253 		return -EADDRNOTAVAIL;
254 
255 	v = &vif_table[vifi];
256 
257 	write_lock_bh(&mrt_lock);
258 	dev = v->dev;
259 	v->dev = NULL;
260 
261 	if (!dev) {
262 		write_unlock_bh(&mrt_lock);
263 		return -EADDRNOTAVAIL;
264 	}
265 
266 #ifdef CONFIG_IP_PIMSM
267 	if (vifi == reg_vif_num)
268 		reg_vif_num = -1;
269 #endif
270 
271 	if (vifi+1 == maxvif) {
272 		int tmp;
273 		for (tmp=vifi-1; tmp>=0; tmp--) {
274 			if (VIF_EXISTS(tmp))
275 				break;
276 		}
277 		maxvif = tmp+1;
278 	}
279 
280 	write_unlock_bh(&mrt_lock);
281 
282 	dev_set_allmulti(dev, -1);
283 
284 	if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
285 		in_dev->cnf.mc_forwarding--;
286 		ip_rt_multicast_event(in_dev);
287 	}
288 
289 	if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
290 		unregister_netdevice(dev);
291 
292 	dev_put(dev);
293 	return 0;
294 }
295 
296 /* Destroy an unresolved cache entry, killing the queued skbs
297    and reporting an error to netlink readers.
298  */
299 
300 static void ipmr_destroy_unres(struct mfc_cache *c)
301 {
302 	struct sk_buff *skb;
303 	struct nlmsgerr *e;
304 
305 	atomic_dec(&cache_resolve_queue_len);
306 
307 	while((skb=skb_dequeue(&c->mfc_un.unres.unresolved))) {
308 		if (skb->nh.iph->version == 0) {
309 			struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
310 			nlh->nlmsg_type = NLMSG_ERROR;
311 			nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
312 			skb_trim(skb, nlh->nlmsg_len);
313 			e = NLMSG_DATA(nlh);
314 			e->error = -ETIMEDOUT;
315 			memset(&e->msg, 0, sizeof(e->msg));
316 			netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
317 		} else
318 			kfree_skb(skb);
319 	}
320 
321 	kmem_cache_free(mrt_cachep, c);
322 }
323 
324 
325 /* A single timer process serves the entire unresolved queue. */
326 
327 static void ipmr_expire_process(unsigned long dummy)
328 {
329 	unsigned long now;
330 	unsigned long expires;
331 	struct mfc_cache *c, **cp;
332 
333 	if (!spin_trylock(&mfc_unres_lock)) {
334 		mod_timer(&ipmr_expire_timer, jiffies+HZ/10);
335 		return;
336 	}
337 
338 	if (atomic_read(&cache_resolve_queue_len) == 0)
339 		goto out;
340 
341 	now = jiffies;
342 	expires = 10*HZ;
343 	cp = &mfc_unres_queue;
344 
345 	while ((c=*cp) != NULL) {
346 		if (time_after(c->mfc_un.unres.expires, now)) {
347 			unsigned long interval = c->mfc_un.unres.expires - now;
348 			if (interval < expires)
349 				expires = interval;
350 			cp = &c->next;
351 			continue;
352 		}
353 
354 		*cp = c->next;
355 
356 		ipmr_destroy_unres(c);
357 	}
358 
359 	if (atomic_read(&cache_resolve_queue_len))
360 		mod_timer(&ipmr_expire_timer, jiffies + expires);
361 
362 out:
363 	spin_unlock(&mfc_unres_lock);
364 }
365 
366 /* Fill the oif list. Called with mrt_lock held for writing. */
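/* Editorial note on the ttls[] semantics used below: each entry is a
   per-vif TTL threshold. A value of 0 or 255 disables forwarding on
   that vif; otherwise a packet is transmitted on vif i only when its
   TTL is greater than ttls[i] (see the loop in ip_mr_forward()). */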
367 
368 static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
369 {
370 	int vifi;
371 
372 	cache->mfc_un.res.minvif = MAXVIFS;
373 	cache->mfc_un.res.maxvif = 0;
374 	memset(cache->mfc_un.res.ttls, 255, MAXVIFS);
375 
376 	for (vifi=0; vifi<maxvif; vifi++) {
377 		if (VIF_EXISTS(vifi) && ttls[vifi] && ttls[vifi] < 255) {
378 			cache->mfc_un.res.ttls[vifi] = ttls[vifi];
379 			if (cache->mfc_un.res.minvif > vifi)
380 				cache->mfc_un.res.minvif = vifi;
381 			if (cache->mfc_un.res.maxvif <= vifi)
382 				cache->mfc_un.res.maxvif = vifi + 1;
383 		}
384 	}
385 }
386 
387 static int vif_add(struct vifctl *vifc, int mrtsock)
388 {
389 	int vifi = vifc->vifc_vifi;
390 	struct vif_device *v = &vif_table[vifi];
391 	struct net_device *dev;
392 	struct in_device *in_dev;
393 
394 	/* Is the vif busy? */
395 	if (VIF_EXISTS(vifi))
396 		return -EADDRINUSE;
397 
398 	switch (vifc->vifc_flags) {
399 #ifdef CONFIG_IP_PIMSM
400 	case VIFF_REGISTER:
401 		/*
402 		 * Special Purpose VIF in PIM
403 		 * All the packets will be sent to the daemon
404 		 */
405 		if (reg_vif_num >= 0)
406 			return -EADDRINUSE;
407 		dev = ipmr_reg_vif();
408 		if (!dev)
409 			return -ENOBUFS;
410 		break;
411 #endif
412 	case VIFF_TUNNEL:
413 		dev = ipmr_new_tunnel(vifc);
414 		if (!dev)
415 			return -ENOBUFS;
416 		break;
417 	case 0:
418 		dev = ip_dev_find(vifc->vifc_lcl_addr.s_addr);
419 		if (!dev)
420 			return -EADDRNOTAVAIL;
421 		dev_put(dev);
422 		break;
423 	default:
424 		return -EINVAL;
425 	}
426 
427 	if ((in_dev = __in_dev_get_rtnl(dev)) == NULL)
428 		return -EADDRNOTAVAIL;
429 	in_dev->cnf.mc_forwarding++;
430 	dev_set_allmulti(dev, +1);
431 	ip_rt_multicast_event(in_dev);
432 
433 	/*
434 	 *	Fill in the VIF structures
435 	 */
436 	v->rate_limit=vifc->vifc_rate_limit;
437 	v->local=vifc->vifc_lcl_addr.s_addr;
438 	v->remote=vifc->vifc_rmt_addr.s_addr;
439 	v->flags=vifc->vifc_flags;
440 	if (!mrtsock)
441 		v->flags |= VIFF_STATIC;
442 	v->threshold=vifc->vifc_threshold;
443 	v->bytes_in = 0;
444 	v->bytes_out = 0;
445 	v->pkt_in = 0;
446 	v->pkt_out = 0;
447 	v->link = dev->ifindex;
448 	if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
449 		v->link = dev->iflink;
450 
451 	/* And finish update writing critical data */
452 	write_lock_bh(&mrt_lock);
453 	dev_hold(dev);
454 	v->dev=dev;
455 #ifdef CONFIG_IP_PIMSM
456 	if (v->flags&VIFF_REGISTER)
457 		reg_vif_num = vifi;
458 #endif
459 	if (vifi+1 > maxvif)
460 		maxvif = vifi+1;
461 	write_unlock_bh(&mrt_lock);
462 	return 0;
463 }
464 
465 static struct mfc_cache *ipmr_cache_find(__u32 origin, __u32 mcastgrp)
466 {
467 	int line=MFC_HASH(mcastgrp,origin);
468 	struct mfc_cache *c;
469 
470 	for (c=mfc_cache_array[line]; c; c = c->next) {
471 		if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp)
472 			break;
473 	}
474 	return c;
475 }
476 
477 /*
478  *	Allocate a multicast cache entry
479  */
480 static struct mfc_cache *ipmr_cache_alloc(void)
481 {
482 	struct mfc_cache *c=kmem_cache_alloc(mrt_cachep, GFP_KERNEL);
483 	if(c==NULL)
484 		return NULL;
485 	memset(c, 0, sizeof(*c));
486 	c->mfc_un.res.minvif = MAXVIFS;
487 	return c;
488 }
489 
490 static struct mfc_cache *ipmr_cache_alloc_unres(void)
491 {
492 	struct mfc_cache *c=kmem_cache_alloc(mrt_cachep, GFP_ATOMIC);
493 	if(c==NULL)
494 		return NULL;
495 	memset(c, 0, sizeof(*c));
496 	skb_queue_head_init(&c->mfc_un.unres.unresolved);
497 	c->mfc_un.unres.expires = jiffies + 10*HZ;
498 	return c;
499 }
500 
501 /*
502  *	A cache entry has gone from the queued to the resolved state
503  */
504 
505 static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
506 {
507 	struct sk_buff *skb;
508 	struct nlmsgerr *e;
509 
510 	/*
511 	 *	Play the pending entries through our router
512 	 */
513 
514 	while((skb=__skb_dequeue(&uc->mfc_un.unres.unresolved))) {
515 		if (skb->nh.iph->version == 0) {
516 			int err;
517 			struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
518 
519 			if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) {
520 				nlh->nlmsg_len = skb->tail - (u8*)nlh;
521 			} else {
522 				nlh->nlmsg_type = NLMSG_ERROR;
523 				nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
524 				skb_trim(skb, nlh->nlmsg_len);
525 				e = NLMSG_DATA(nlh);
526 				e->error = -EMSGSIZE;
527 				memset(&e->msg, 0, sizeof(e->msg));
528 			}
529 			err = netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
530 		} else
531 			ip_mr_forward(skb, c, 0);
532 	}
533 }
534 
535 /*
536  *	Bounce a cache query up to mrouted. We could use netlink for this but mrouted
537  *	expects the following bizarre scheme.
538  *
539  *	Called under mrt_lock.
540  */
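/* For illustration, an editorial sketch of the receiving side of this
   scheme (igmp_sock is assumed to be the daemon's raw IGMP socket;
   error handling omitted). mrouted reads the upcall and overlays
   struct igmpmsg on the data:

	char buf[8192];
	ssize_t n = read(igmp_sock, buf, sizeof(buf));
	struct igmpmsg *m = (struct igmpmsg *)buf;

	if (n >= (ssize_t)sizeof(*m) && m->im_mbz == 0)
		switch (m->im_msgtype) {
		case IGMPMSG_NOCACHE:	resolve (im_src, im_dst), then
					install a route with MRT_ADD_MFC
		case IGMPMSG_WRONGVIF:	handle a PIM assert
		case IGMPMSG_WHOLEPKT:	unwrap and emit a PIM register
		}
 */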
541 
542 static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
543 {
544 	struct sk_buff *skb;
545 	int ihl = pkt->nh.iph->ihl<<2;
546 	struct igmphdr *igmp;
547 	struct igmpmsg *msg;
548 	int ret;
549 
550 #ifdef CONFIG_IP_PIMSM
551 	if (assert == IGMPMSG_WHOLEPKT)
552 		skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
553 	else
554 #endif
555 		skb = alloc_skb(128, GFP_ATOMIC);
556 
557 	if(!skb)
558 		return -ENOBUFS;
559 
560 #ifdef CONFIG_IP_PIMSM
561 	if (assert == IGMPMSG_WHOLEPKT) {
562 		/* Ugly, but we have no choice with this interface.
563 		   Duplicate old header, fix ihl, length etc.
564 		   And all this only to mangle msg->im_msgtype and
565 		   to set msg->im_mbz to "mbz" :-)
566 		 */
567 		msg = (struct igmpmsg*)skb_push(skb, sizeof(struct iphdr));
568 		skb->nh.raw = skb->h.raw = (u8*)msg;
569 		memcpy(msg, pkt->nh.raw, sizeof(struct iphdr));
570 		msg->im_msgtype = IGMPMSG_WHOLEPKT;
571 		msg->im_mbz = 0;
572  		msg->im_vif = reg_vif_num;
573 		skb->nh.iph->ihl = sizeof(struct iphdr) >> 2;
574 		skb->nh.iph->tot_len = htons(ntohs(pkt->nh.iph->tot_len) + sizeof(struct iphdr));
575 	} else
576 #endif
577 	{
578 
579 	/*
580 	 *	Copy the IP header
581 	 */
582 
583 	skb->nh.iph = (struct iphdr *)skb_put(skb, ihl);
584 	memcpy(skb->data,pkt->data,ihl);
585 	skb->nh.iph->protocol = 0;			/* Flag to the kernel this is a route add */
586 	msg = (struct igmpmsg*)skb->nh.iph;
587 	msg->im_vif = vifi;
588 	skb->dst = dst_clone(pkt->dst);
589 
590 	/*
591 	 *	Add our header
592 	 */
593 
594 	igmp=(struct igmphdr *)skb_put(skb,sizeof(struct igmphdr));
595 	igmp->type	=
596 	msg->im_msgtype = assert;
597 	igmp->code 	=	0;
598 	skb->nh.iph->tot_len=htons(skb->len);			/* Fix the length */
599 	skb->h.raw = skb->nh.raw;
600         }
601 
602 	if (mroute_socket == NULL) {
603 		kfree_skb(skb);
604 		return -EINVAL;
605 	}
606 
607 	/*
608 	 *	Deliver to mrouted
609 	 */
610 	if ((ret=sock_queue_rcv_skb(mroute_socket,skb))<0) {
611 		if (net_ratelimit())
612 			printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
613 		kfree_skb(skb);
614 	}
615 
616 	return ret;
617 }
618 
619 /*
620  *	Queue a packet for resolution, creating an unresolved cache entry under mfc_unres_lock if needed.
621  */
622 
623 static int
624 ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb)
625 {
626 	int err;
627 	struct mfc_cache *c;
628 
629 	spin_lock_bh(&mfc_unres_lock);
630 	for (c=mfc_unres_queue; c; c=c->next) {
631 		if (c->mfc_mcastgrp == skb->nh.iph->daddr &&
632 		    c->mfc_origin == skb->nh.iph->saddr)
633 			break;
634 	}
635 
636 	if (c == NULL) {
637 		/*
638 		 *	Create a new entry if allowable
639 		 */
640 
641 		if (atomic_read(&cache_resolve_queue_len)>=10 ||
642 		    (c=ipmr_cache_alloc_unres())==NULL) {
643 			spin_unlock_bh(&mfc_unres_lock);
644 
645 			kfree_skb(skb);
646 			return -ENOBUFS;
647 		}
648 
649 		/*
650 		 *	Fill in the new cache entry
651 		 */
652 		c->mfc_parent=-1;
653 		c->mfc_origin=skb->nh.iph->saddr;
654 		c->mfc_mcastgrp=skb->nh.iph->daddr;
655 
656 		/*
657 		 *	Reflect first query at mrouted.
658 		 */
659 		if ((err = ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE))<0) {
660 			/* If the report failed throw the cache entry
661 			   out - Brad Parker
662 			 */
663 			spin_unlock_bh(&mfc_unres_lock);
664 
665 			kmem_cache_free(mrt_cachep, c);
666 			kfree_skb(skb);
667 			return err;
668 		}
669 
670 		atomic_inc(&cache_resolve_queue_len);
671 		c->next = mfc_unres_queue;
672 		mfc_unres_queue = c;
673 
674 		mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires);
675 	}
676 
677 	/*
678 	 *	See if we can append the packet
679 	 */
680 	if (c->mfc_un.unres.unresolved.qlen>3) {
681 		kfree_skb(skb);
682 		err = -ENOBUFS;
683 	} else {
684 		skb_queue_tail(&c->mfc_un.unres.unresolved,skb);
685 		err = 0;
686 	}
687 
688 	spin_unlock_bh(&mfc_unres_lock);
689 	return err;
690 }
691 
692 /*
693  *	MFC cache manipulation by user space mroute daemon
694  */
695 
696 static int ipmr_mfc_delete(struct mfcctl *mfc)
697 {
698 	int line;
699 	struct mfc_cache *c, **cp;
700 
701 	line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
702 
703 	for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
704 		if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
705 		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
706 			write_lock_bh(&mrt_lock);
707 			*cp = c->next;
708 			write_unlock_bh(&mrt_lock);
709 
710 			kmem_cache_free(mrt_cachep, c);
711 			return 0;
712 		}
713 	}
714 	return -ENOENT;
715 }
716 
717 static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
718 {
719 	int line;
720 	struct mfc_cache *uc, *c, **cp;
721 
722 	line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
723 
724 	for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
725 		if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
726 		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr)
727 			break;
728 	}
729 
730 	if (c != NULL) {
731 		write_lock_bh(&mrt_lock);
732 		c->mfc_parent = mfc->mfcc_parent;
733 		ipmr_update_thresholds(c, mfc->mfcc_ttls);
734 		if (!mrtsock)
735 			c->mfc_flags |= MFC_STATIC;
736 		write_unlock_bh(&mrt_lock);
737 		return 0;
738 	}
739 
740 	if(!MULTICAST(mfc->mfcc_mcastgrp.s_addr))
741 		return -EINVAL;
742 
743 	c=ipmr_cache_alloc();
744 	if (c==NULL)
745 		return -ENOMEM;
746 
747 	c->mfc_origin=mfc->mfcc_origin.s_addr;
748 	c->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr;
749 	c->mfc_parent=mfc->mfcc_parent;
750 	ipmr_update_thresholds(c, mfc->mfcc_ttls);
751 	if (!mrtsock)
752 		c->mfc_flags |= MFC_STATIC;
753 
754 	write_lock_bh(&mrt_lock);
755 	c->next = mfc_cache_array[line];
756 	mfc_cache_array[line] = c;
757 	write_unlock_bh(&mrt_lock);
758 
759 	/*
760 	 *	Check to see if we resolved a queued list. If so we
761 	 *	need to send on the frames and tidy up.
762 	 */
763 	spin_lock_bh(&mfc_unres_lock);
764 	for (cp = &mfc_unres_queue; (uc=*cp) != NULL;
765 	     cp = &uc->next) {
766 		if (uc->mfc_origin == c->mfc_origin &&
767 		    uc->mfc_mcastgrp == c->mfc_mcastgrp) {
768 			*cp = uc->next;
769 			if (atomic_dec_and_test(&cache_resolve_queue_len))
770 				del_timer(&ipmr_expire_timer);
771 			break;
772 		}
773 	}
774 	spin_unlock_bh(&mfc_unres_lock);
775 
776 	if (uc) {
777 		ipmr_cache_resolve(uc, c);
778 		kmem_cache_free(mrt_cachep, uc);
779 	}
780 	return 0;
781 }
782 
783 /*
784  *	Close the multicast socket, and clear the vif tables etc
785  */
786 
787 static void mroute_clean_tables(struct sock *sk)
788 {
789 	int i;
790 
791 	/*
792 	 *	Shut down all active vif entries
793 	 */
794 	for(i=0; i<maxvif; i++) {
795 		if (!(vif_table[i].flags&VIFF_STATIC))
796 			vif_delete(i);
797 	}
798 
799 	/*
800 	 *	Wipe the cache
801 	 */
802 	for (i=0;i<MFC_LINES;i++) {
803 		struct mfc_cache *c, **cp;
804 
805 		cp = &mfc_cache_array[i];
806 		while ((c = *cp) != NULL) {
807 			if (c->mfc_flags&MFC_STATIC) {
808 				cp = &c->next;
809 				continue;
810 			}
811 			write_lock_bh(&mrt_lock);
812 			*cp = c->next;
813 			write_unlock_bh(&mrt_lock);
814 
815 			kmem_cache_free(mrt_cachep, c);
816 		}
817 	}
818 
819 	if (atomic_read(&cache_resolve_queue_len) != 0) {
820 		struct mfc_cache *c;
821 
822 		spin_lock_bh(&mfc_unres_lock);
823 		while (mfc_unres_queue != NULL) {
824 			c = mfc_unres_queue;
825 			mfc_unres_queue = c->next;
826 			spin_unlock_bh(&mfc_unres_lock);
827 
828 			ipmr_destroy_unres(c);
829 
830 			spin_lock_bh(&mfc_unres_lock);
831 		}
832 		spin_unlock_bh(&mfc_unres_lock);
833 	}
834 }
835 
836 static void mrtsock_destruct(struct sock *sk)
837 {
838 	rtnl_lock();
839 	if (sk == mroute_socket) {
840 		ipv4_devconf.mc_forwarding--;
841 
842 		write_lock_bh(&mrt_lock);
843 		mroute_socket=NULL;
844 		write_unlock_bh(&mrt_lock);
845 
846 		mroute_clean_tables(sk);
847 	}
848 	rtnl_unlock();
849 }
850 
851 /*
852  *	Socket options and virtual interface manipulation. The whole
853  *	virtual interface system is a complete heap, but unfortunately
854  *	that's how BSD mrouted happens to think. Maybe one day with a proper
855  *	MOSPF/PIM router setup we can clean this up.
856  */
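/* For illustration, an editorial sketch of the sequence a routing
   daemon drives through this interface (local_ip, src and grp are
   placeholders; error handling omitted):

	int s = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
	int one = 1;

	setsockopt(s, IPPROTO_IP, MRT_INIT, &one, sizeof(one));

	struct vifctl vc = { .vifc_vifi = 0, .vifc_threshold = 1 };
	vc.vifc_lcl_addr.s_addr = local_ip;
	setsockopt(s, IPPROTO_IP, MRT_ADD_VIF, &vc, sizeof(vc));

	struct mfcctl mc = { .mfcc_parent = 0 };
	mc.mfcc_origin.s_addr = src;
	mc.mfcc_mcastgrp.s_addr = grp;
	mc.mfcc_ttls[1] = 1;			(forward on vif 1)
	setsockopt(s, IPPROTO_IP, MRT_ADD_MFC, &mc, sizeof(mc));

	setsockopt(s, IPPROTO_IP, MRT_DONE, NULL, 0);
 */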
857 
858 int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int optlen)
859 {
860 	int ret;
861 	struct vifctl vif;
862 	struct mfcctl mfc;
863 
864 	if(optname!=MRT_INIT)
865 	{
866 		if(sk!=mroute_socket && !capable(CAP_NET_ADMIN))
867 			return -EACCES;
868 	}
869 
870 	switch(optname)
871 	{
872 		case MRT_INIT:
873 			if (sk->sk_type != SOCK_RAW ||
874 			    inet_sk(sk)->num != IPPROTO_IGMP)
875 				return -EOPNOTSUPP;
876 			if(optlen!=sizeof(int))
877 				return -ENOPROTOOPT;
878 
879 			rtnl_lock();
880 			if (mroute_socket) {
881 				rtnl_unlock();
882 				return -EADDRINUSE;
883 			}
884 
885 			ret = ip_ra_control(sk, 1, mrtsock_destruct);
886 			if (ret == 0) {
887 				write_lock_bh(&mrt_lock);
888 				mroute_socket=sk;
889 				write_unlock_bh(&mrt_lock);
890 
891 				ipv4_devconf.mc_forwarding++;
892 			}
893 			rtnl_unlock();
894 			return ret;
895 		case MRT_DONE:
896 			if (sk!=mroute_socket)
897 				return -EACCES;
898 			return ip_ra_control(sk, 0, NULL);
899 		case MRT_ADD_VIF:
900 		case MRT_DEL_VIF:
901 			if(optlen!=sizeof(vif))
902 				return -EINVAL;
903 			if (copy_from_user(&vif,optval,sizeof(vif)))
904 				return -EFAULT;
905 			if(vif.vifc_vifi >= MAXVIFS)
906 				return -ENFILE;
907 			rtnl_lock();
908 			if (optname==MRT_ADD_VIF) {
909 				ret = vif_add(&vif, sk==mroute_socket);
910 			} else {
911 				ret = vif_delete(vif.vifc_vifi);
912 			}
913 			rtnl_unlock();
914 			return ret;
915 
916 		/*
917 		 *	Manipulate the forwarding caches. These live
918 		 *	in a sort of kernel/user symbiosis.
919 		 */
920 		case MRT_ADD_MFC:
921 		case MRT_DEL_MFC:
922 			if(optlen!=sizeof(mfc))
923 				return -EINVAL;
924 			if (copy_from_user(&mfc,optval, sizeof(mfc)))
925 				return -EFAULT;
926 			rtnl_lock();
927 			if (optname==MRT_DEL_MFC)
928 				ret = ipmr_mfc_delete(&mfc);
929 			else
930 				ret = ipmr_mfc_add(&mfc, sk==mroute_socket);
931 			rtnl_unlock();
932 			return ret;
933 		/*
934 		 *	Control PIM assert.
935 		 */
936 		case MRT_ASSERT:
937 		{
938 			int v;
939 			if(get_user(v,(int __user *)optval))
940 				return -EFAULT;
941 			mroute_do_assert=(v)?1:0;
942 			return 0;
943 		}
944 #ifdef CONFIG_IP_PIMSM
945 		case MRT_PIM:
946 		{
947 			int v, ret;
948 			if(get_user(v,(int __user *)optval))
949 				return -EFAULT;
950 			v = (v)?1:0;
951 			rtnl_lock();
952 			ret = 0;
953 			if (v != mroute_do_pim) {
954 				mroute_do_pim = v;
955 				mroute_do_assert = v;
956 #ifdef CONFIG_IP_PIMSM_V2
957 				if (mroute_do_pim)
958 					ret = inet_add_protocol(&pim_protocol,
959 								IPPROTO_PIM);
960 				else
961 					ret = inet_del_protocol(&pim_protocol,
962 								IPPROTO_PIM);
963 				if (ret < 0)
964 					ret = -EAGAIN;
965 #endif
966 			}
967 			rtnl_unlock();
968 			return ret;
969 		}
970 #endif
971 		/*
972 		 *	Spurious command, or MRT_VERSION which you cannot
973 		 *	set.
974 		 */
975 		default:
976 			return -ENOPROTOOPT;
977 	}
978 }
979 
980 /*
981  *	Getsockopt support for the multicast routing system.
982  */
983 
984 int ip_mroute_getsockopt(struct sock *sk,int optname,char __user *optval,int __user *optlen)
985 {
986 	int olr;
987 	int val;
988 
989 	if(optname!=MRT_VERSION &&
990 #ifdef CONFIG_IP_PIMSM
991 	   optname!=MRT_PIM &&
992 #endif
993 	   optname!=MRT_ASSERT)
994 		return -ENOPROTOOPT;
995 
996 	if (get_user(olr, optlen))
997 		return -EFAULT;
998 
999 	olr = min_t(unsigned int, olr, sizeof(int));
1000 	if (olr < 0)
1001 		return -EINVAL;
1002 
1003 	if(put_user(olr,optlen))
1004 		return -EFAULT;
1005 	if(optname==MRT_VERSION)
1006 		val=0x0305;
1007 #ifdef CONFIG_IP_PIMSM
1008 	else if(optname==MRT_PIM)
1009 		val=mroute_do_pim;
1010 #endif
1011 	else
1012 		val=mroute_do_assert;
1013 	if(copy_to_user(optval,&val,olr))
1014 		return -EFAULT;
1015 	return 0;
1016 }
1017 
1018 /*
1019  *	The IP multicast ioctl support routines.
1020  */
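/* For illustration (editorial; mrt_sock is assumed to be the daemon's
   mroute socket): polling the per-vif counters exposed here:

	struct sioc_vif_req vr;
	vr.vifi = 0;
	if (ioctl(mrt_sock, SIOCGETVIFCNT, &vr) == 0)
		printf("vif0: %lu pkts in, %lu pkts out\n",
		       vr.icount, vr.ocount);
 */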
1021 
1022 int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
1023 {
1024 	struct sioc_sg_req sr;
1025 	struct sioc_vif_req vr;
1026 	struct vif_device *vif;
1027 	struct mfc_cache *c;
1028 
1029 	switch(cmd)
1030 	{
1031 		case SIOCGETVIFCNT:
1032 			if (copy_from_user(&vr,arg,sizeof(vr)))
1033 				return -EFAULT;
1034 			if(vr.vifi>=maxvif)
1035 				return -EINVAL;
1036 			read_lock(&mrt_lock);
1037 			vif=&vif_table[vr.vifi];
1038 			if(VIF_EXISTS(vr.vifi))	{
1039 				vr.icount=vif->pkt_in;
1040 				vr.ocount=vif->pkt_out;
1041 				vr.ibytes=vif->bytes_in;
1042 				vr.obytes=vif->bytes_out;
1043 				read_unlock(&mrt_lock);
1044 
1045 				if (copy_to_user(arg,&vr,sizeof(vr)))
1046 					return -EFAULT;
1047 				return 0;
1048 			}
1049 			read_unlock(&mrt_lock);
1050 			return -EADDRNOTAVAIL;
1051 		case SIOCGETSGCNT:
1052 			if (copy_from_user(&sr,arg,sizeof(sr)))
1053 				return -EFAULT;
1054 
1055 			read_lock(&mrt_lock);
1056 			c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr);
1057 			if (c) {
1058 				sr.pktcnt = c->mfc_un.res.pkt;
1059 				sr.bytecnt = c->mfc_un.res.bytes;
1060 				sr.wrong_if = c->mfc_un.res.wrong_if;
1061 				read_unlock(&mrt_lock);
1062 
1063 				if (copy_to_user(arg,&sr,sizeof(sr)))
1064 					return -EFAULT;
1065 				return 0;
1066 			}
1067 			read_unlock(&mrt_lock);
1068 			return -EADDRNOTAVAIL;
1069 		default:
1070 			return -ENOIOCTLCMD;
1071 	}
1072 }
1073 
1074 
1075 static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
1076 {
1077 	struct vif_device *v;
1078 	int ct;
1079 	if (event != NETDEV_UNREGISTER)
1080 		return NOTIFY_DONE;
1081 	v=&vif_table[0];
1082 	for(ct=0;ct<maxvif;ct++,v++) {
1083 		if (v->dev==ptr)
1084 			vif_delete(ct);
1085 	}
1086 	return NOTIFY_DONE;
1087 }
1088 
1089 
1090 static struct notifier_block ip_mr_notifier={
1091 	.notifier_call = ipmr_device_event,
1092 };
1093 
1094 /*
1095  * 	Encapsulate a packet by attaching a valid IPIP header to it.
1096  *	This avoids tunnel drivers and other mess and gives us the speed so
1097  *	important for multicast video.
1098  */
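/* Illustrative result (editorial): the original datagram becomes the
   payload of a new outer header built from the vif addresses:

	[ iphdr: proto IPPROTO_IPIP, saddr = vif->local,
	  daddr = vif->remote ][ original multicast packet ]
 */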
1099 
1100 static void ip_encap(struct sk_buff *skb, u32 saddr, u32 daddr)
1101 {
1102 	struct iphdr *iph = (struct iphdr *)skb_push(skb,sizeof(struct iphdr));
1103 
1104 	iph->version	= 	4;
1105 	iph->tos	=	skb->nh.iph->tos;
1106 	iph->ttl	=	skb->nh.iph->ttl;
1107 	iph->frag_off	=	0;
1108 	iph->daddr	=	daddr;
1109 	iph->saddr	=	saddr;
1110 	iph->protocol	=	IPPROTO_IPIP;
1111 	iph->ihl	=	5;
1112 	iph->tot_len	=	htons(skb->len);
1113 	ip_select_ident(iph, skb->dst, NULL);
1114 	ip_send_check(iph);
1115 
1116 	skb->h.ipiph = skb->nh.iph;
1117 	skb->nh.iph = iph;
1118 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
1119 	nf_reset(skb);
1120 }
1121 
1122 static inline int ipmr_forward_finish(struct sk_buff *skb)
1123 {
1124 	struct ip_options * opt	= &(IPCB(skb)->opt);
1125 
1126 	IP_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
1127 
1128 	if (unlikely(opt->optlen))
1129 		ip_forward_options(skb);
1130 
1131 	return dst_output(skb);
1132 }
1133 
1134 /*
1135  *	Processing handlers for ipmr_forward
1136  */
1137 
1138 static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
1139 {
1140 	struct iphdr *iph = skb->nh.iph;
1141 	struct vif_device *vif = &vif_table[vifi];
1142 	struct net_device *dev;
1143 	struct rtable *rt;
1144 	int    encap = 0;
1145 
1146 	if (vif->dev == NULL)
1147 		goto out_free;
1148 
1149 #ifdef CONFIG_IP_PIMSM
1150 	if (vif->flags & VIFF_REGISTER) {
1151 		vif->pkt_out++;
1152 		vif->bytes_out+=skb->len;
1153 		((struct net_device_stats*)netdev_priv(vif->dev))->tx_bytes += skb->len;
1154 		((struct net_device_stats*)netdev_priv(vif->dev))->tx_packets++;
1155 		ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT);
1156 		kfree_skb(skb);
1157 		return;
1158 	}
1159 #endif
1160 
1161 	if (vif->flags&VIFF_TUNNEL) {
1162 		struct flowi fl = { .oif = vif->link,
1163 				    .nl_u = { .ip4_u =
1164 					      { .daddr = vif->remote,
1165 						.saddr = vif->local,
1166 						.tos = RT_TOS(iph->tos) } },
1167 				    .proto = IPPROTO_IPIP };
1168 		if (ip_route_output_key(&rt, &fl))
1169 			goto out_free;
1170 		encap = sizeof(struct iphdr);
1171 	} else {
1172 		struct flowi fl = { .oif = vif->link,
1173 				    .nl_u = { .ip4_u =
1174 					      { .daddr = iph->daddr,
1175 						.tos = RT_TOS(iph->tos) } },
1176 				    .proto = IPPROTO_IPIP };
1177 		if (ip_route_output_key(&rt, &fl))
1178 			goto out_free;
1179 	}
1180 
1181 	dev = rt->u.dst.dev;
1182 
1183 	if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
1184 		/* Do not fragment multicasts. Alas, IPv4 does not
1185 		   allow us to send ICMP here, so such packets simply
1186 		   disappear into a black hole.
1187 		 */
1188 
1189 		IP_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
1190 		ip_rt_put(rt);
1191 		goto out_free;
1192 	}
1193 
1194 	encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;
1195 
1196 	if (skb_cow(skb, encap)) {
1197  		ip_rt_put(rt);
1198 		goto out_free;
1199 	}
1200 
1201 	vif->pkt_out++;
1202 	vif->bytes_out+=skb->len;
1203 
1204 	dst_release(skb->dst);
1205 	skb->dst = &rt->u.dst;
1206 	iph = skb->nh.iph;
1207 	ip_decrease_ttl(iph);
1208 
1209 	/* FIXME: forward and output firewalls used to be called here.
1210 	 * What do we do with netfilter? -- RR */
1211 	if (vif->flags & VIFF_TUNNEL) {
1212 		ip_encap(skb, vif->local, vif->remote);
1213 		/* FIXME: extra output firewall step used to be here. --RR */
1214 		((struct ip_tunnel *)netdev_priv(vif->dev))->stat.tx_packets++;
1215 		((struct ip_tunnel *)netdev_priv(vif->dev))->stat.tx_bytes+=skb->len;
1216 	}
1217 
1218 	IPCB(skb)->flags |= IPSKB_FORWARDED;
1219 
1220 	/*
1221 	 * RFC 1584 teaches that a DVMRP/PIM router must deliver packets locally
1222 	 * not only before forwarding, but also after forwarding on all output
1223 	 * interfaces. Clearly, if the mrouter runs a multicast
1224 	 * program, that program should receive packets regardless of which
1225 	 * interface it is joined on.
1226 	 * If we did not do this, the program would have to join on all
1227 	 * interfaces. On the other hand, a multihomed host (or router, but
1228 	 * not an mrouter) must not join on more than one interface - that
1229 	 * would result in receiving duplicate packets.
1230 	 */
1231 	NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, dev,
1232 		ipmr_forward_finish);
1233 	return;
1234 
1235 out_free:
1236 	kfree_skb(skb);
1237 	return;
1238 }
1239 
1240 static int ipmr_find_vif(struct net_device *dev)
1241 {
1242 	int ct;
1243 	for (ct=maxvif-1; ct>=0; ct--) {
1244 		if (vif_table[ct].dev == dev)
1245 			break;
1246 	}
1247 	return ct;
1248 }
1249 
1250 /* "local" means that we should preserve one skb (for local delivery) */
1251 
1252 static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local)
1253 {
1254 	int psend = -1;
1255 	int vif, ct;
1256 
1257 	vif = cache->mfc_parent;
1258 	cache->mfc_un.res.pkt++;
1259 	cache->mfc_un.res.bytes += skb->len;
1260 
1261 	/*
1262 	 * Wrong interface: drop packet and (maybe) send PIM assert.
1263 	 */
1264 	if (vif_table[vif].dev != skb->dev) {
1265 		int true_vifi;
1266 
1267 		if (((struct rtable*)skb->dst)->fl.iif == 0) {
1268 			/* It is our own packet, looped back.
1269 			   A very complicated situation...
1270 
1271 			   The best workaround until the routing daemons are
1272 			   fixed is not to redistribute a packet if it was
1273 			   sent through the wrong interface. This means that
1274 			   multicast applications WILL NOT work for
1275 			   (S,G) entries whose default multicast route points
1276 			   to the wrong oif. In any case, it is not a good
1277 			   idea to run multicast applications on a router.
1278 			 */
1279 			goto dont_forward;
1280 		}
1281 
1282 		cache->mfc_un.res.wrong_if++;
1283 		true_vifi = ipmr_find_vif(skb->dev);
1284 
1285 		if (true_vifi >= 0 && mroute_do_assert &&
1286 		    /* PIM-SM uses asserts when switching from the RPT to the SPT,
1287 		       so we cannot check that the packet arrived on an oif.
1288 		       That is bad, but otherwise we would need to move a pretty
1289 		       large chunk of pimd into the kernel. Ough... --ANK
1290 		     */
1291 		    (mroute_do_pim || cache->mfc_un.res.ttls[true_vifi] < 255) &&
1292 		    time_after(jiffies,
1293 			       cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
1294 			cache->mfc_un.res.last_assert = jiffies;
1295 			ipmr_cache_report(skb, true_vifi, IGMPMSG_WRONGVIF);
1296 		}
1297 		goto dont_forward;
1298 	}
1299 
1300 	vif_table[vif].pkt_in++;
1301 	vif_table[vif].bytes_in+=skb->len;
1302 
1303 	/*
1304 	 *	Forward the frame
1305 	 */
1306 	for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
1307 		if (skb->nh.iph->ttl > cache->mfc_un.res.ttls[ct]) {
1308 			if (psend != -1) {
1309 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1310 				if (skb2)
1311 					ipmr_queue_xmit(skb2, cache, psend);
1312 			}
1313 			psend=ct;
1314 		}
1315 	}
1316 	if (psend != -1) {
1317 		if (local) {
1318 			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1319 			if (skb2)
1320 				ipmr_queue_xmit(skb2, cache, psend);
1321 		} else {
1322 			ipmr_queue_xmit(skb, cache, psend);
1323 			return 0;
1324 		}
1325 	}
1326 
1327 dont_forward:
1328 	if (!local)
1329 		kfree_skb(skb);
1330 	return 0;
1331 }
1332 
1333 
1334 /*
1335  *	Multicast packets for forwarding arrive here
1336  */
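/* Editorial orientation note: the routing code points dst->input at
   ip_mr_input() for multicast routes when multicast forwarding is
   enabled on the input device (see ip_route_input_mc() in
   net/ipv4/route.c); that is how packets arrive here. */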
1337 
1338 int ip_mr_input(struct sk_buff *skb)
1339 {
1340 	struct mfc_cache *cache;
1341 	int local = ((struct rtable*)skb->dst)->rt_flags&RTCF_LOCAL;
1342 
1343 	/* The packet was looped back after forwarding; it must not be
1344 	   forwarded a second time, but it can still be delivered locally.
1345 	 */
1346 	if (IPCB(skb)->flags&IPSKB_FORWARDED)
1347 		goto dont_forward;
1348 
1349 	if (!local) {
1350 		    if (IPCB(skb)->opt.router_alert) {
1351 			    if (ip_call_ra_chain(skb))
1352 				    return 0;
1353 		    } else if (skb->nh.iph->protocol == IPPROTO_IGMP){
1354 			    /* IGMPv1 (and broken IGMPv2 implementations such as
1355 			       Cisco IOS <= 11.2(8)) do not put the router alert
1356 			       option in IGMP packets destined to routable
1357 			       groups. That is very bad, because it means
1358 			       that we can forward NO IGMP messages.
1359 			     */
1360 			    read_lock(&mrt_lock);
1361 			    if (mroute_socket) {
1362 				    nf_reset(skb);
1363 				    raw_rcv(mroute_socket, skb);
1364 				    read_unlock(&mrt_lock);
1365 				    return 0;
1366 			    }
1367 			    read_unlock(&mrt_lock);
1368 		    }
1369 	}
1370 
1371 	read_lock(&mrt_lock);
1372 	cache = ipmr_cache_find(skb->nh.iph->saddr, skb->nh.iph->daddr);
1373 
1374 	/*
1375 	 *	No usable cache entry
1376 	 */
1377 	if (cache==NULL) {
1378 		int vif;
1379 
1380 		if (local) {
1381 			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1382 			ip_local_deliver(skb);
1383 			if (skb2 == NULL) {
1384 				read_unlock(&mrt_lock);
1385 				return -ENOBUFS;
1386 			}
1387 			skb = skb2;
1388 		}
1389 
1390 		vif = ipmr_find_vif(skb->dev);
1391 		if (vif >= 0) {
1392 			int err = ipmr_cache_unresolved(vif, skb);
1393 			read_unlock(&mrt_lock);
1394 
1395 			return err;
1396 		}
1397 		read_unlock(&mrt_lock);
1398 		kfree_skb(skb);
1399 		return -ENODEV;
1400 	}
1401 
1402 	ip_mr_forward(skb, cache, local);
1403 
1404 	read_unlock(&mrt_lock);
1405 
1406 	if (local)
1407 		return ip_local_deliver(skb);
1408 
1409 	return 0;
1410 
1411 dont_forward:
1412 	if (local)
1413 		return ip_local_deliver(skb);
1414 	kfree_skb(skb);
1415 	return 0;
1416 }
1417 
1418 #ifdef CONFIG_IP_PIMSM_V1
1419 /*
1420  * Handle the IGMP-encapsulated messages of PIMv1
1421  */
1422 
1423 int pim_rcv_v1(struct sk_buff * skb)
1424 {
1425 	struct igmphdr *pim;
1426 	struct iphdr   *encap;
1427 	struct net_device  *reg_dev = NULL;
1428 
1429 	if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
1430 		goto drop;
1431 
1432 	pim = (struct igmphdr*)skb->h.raw;
1433 
1434         if (!mroute_do_pim ||
1435 	    skb->len < sizeof(*pim) + sizeof(*encap) ||
1436 	    pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
1437 		goto drop;
1438 
1439 	encap = (struct iphdr*)(skb->h.raw + sizeof(struct igmphdr));
1440 	/*
1441 	   Check that:
1442 	   a. packet is really destined to a multicast group
1443 	   b. packet is not a NULL-REGISTER
1444 	   c. packet is not truncated
1445 	 */
1446 	if (!MULTICAST(encap->daddr) ||
1447 	    encap->tot_len == 0 ||
1448 	    ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
1449 		goto drop;
1450 
1451 	read_lock(&mrt_lock);
1452 	if (reg_vif_num >= 0)
1453 		reg_dev = vif_table[reg_vif_num].dev;
1454 	if (reg_dev)
1455 		dev_hold(reg_dev);
1456 	read_unlock(&mrt_lock);
1457 
1458 	if (reg_dev == NULL)
1459 		goto drop;
1460 
1461 	skb->mac.raw = skb->nh.raw;
1462 	skb_pull(skb, (u8*)encap - skb->data);
1463 	skb->nh.iph = (struct iphdr *)skb->data;
1464 	skb->dev = reg_dev;
1465 	memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
1466 	skb->protocol = htons(ETH_P_IP);
1467 	skb->ip_summed = 0;
1468 	skb->pkt_type = PACKET_HOST;
1469 	dst_release(skb->dst);
1470 	skb->dst = NULL;
1471 	((struct net_device_stats*)netdev_priv(reg_dev))->rx_bytes += skb->len;
1472 	((struct net_device_stats*)netdev_priv(reg_dev))->rx_packets++;
1473 	nf_reset(skb);
1474 	netif_rx(skb);
1475 	dev_put(reg_dev);
1476 	return 0;
1477  drop:
1478 	kfree_skb(skb);
1479 	return 0;
1480 }
1481 #endif
1482 
1483 #ifdef CONFIG_IP_PIMSM_V2
1484 static int pim_rcv(struct sk_buff * skb)
1485 {
1486 	struct pimreghdr *pim;
1487 	struct iphdr   *encap;
1488 	struct net_device  *reg_dev = NULL;
1489 
1490 	if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
1491 		goto drop;
1492 
1493 	pim = (struct pimreghdr*)skb->h.raw;
1494         if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
1495 	    (pim->flags&PIM_NULL_REGISTER) ||
1496 	    (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
1497 	     (u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))))
1498 		goto drop;
1499 
1500 	/* check if the inner packet is destined to mcast group */
1501 	encap = (struct iphdr*)(skb->h.raw + sizeof(struct pimreghdr));
1502 	if (!MULTICAST(encap->daddr) ||
1503 	    encap->tot_len == 0 ||
1504 	    ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
1505 		goto drop;
1506 
1507 	read_lock(&mrt_lock);
1508 	if (reg_vif_num >= 0)
1509 		reg_dev = vif_table[reg_vif_num].dev;
1510 	if (reg_dev)
1511 		dev_hold(reg_dev);
1512 	read_unlock(&mrt_lock);
1513 
1514 	if (reg_dev == NULL)
1515 		goto drop;
1516 
1517 	skb->mac.raw = skb->nh.raw;
1518 	skb_pull(skb, (u8*)encap - skb->data);
1519 	skb->nh.iph = (struct iphdr *)skb->data;
1520 	skb->dev = reg_dev;
1521 	memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
1522 	skb->protocol = htons(ETH_P_IP);
1523 	skb->ip_summed = 0;
1524 	skb->pkt_type = PACKET_HOST;
1525 	dst_release(skb->dst);
1526 	((struct net_device_stats*)netdev_priv(reg_dev))->rx_bytes += skb->len;
1527 	((struct net_device_stats*)netdev_priv(reg_dev))->rx_packets++;
1528 	skb->dst = NULL;
1529 	nf_reset(skb);
1530 	netif_rx(skb);
1531 	dev_put(reg_dev);
1532 	return 0;
1533  drop:
1534 	kfree_skb(skb);
1535 	return 0;
1536 }
1537 #endif
1538 
1539 static int
1540 ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
1541 {
1542 	int ct;
1543 	struct rtnexthop *nhp;
1544 	struct net_device *dev = vif_table[c->mfc_parent].dev;
1545 	u8 *b = skb->tail;
1546 	struct rtattr *mp_head;
1547 
1548 	if (dev)
1549 		RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);
1550 
1551 	mp_head = (struct rtattr*)skb_put(skb, RTA_LENGTH(0));
1552 
1553 	for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
1554 		if (c->mfc_un.res.ttls[ct] < 255) {
1555 			if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
1556 				goto rtattr_failure;
1557 			nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
1558 			nhp->rtnh_flags = 0;
1559 			nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
1560 			nhp->rtnh_ifindex = vif_table[ct].dev->ifindex;
1561 			nhp->rtnh_len = sizeof(*nhp);
1562 		}
1563 	}
1564 	mp_head->rta_type = RTA_MULTIPATH;
1565 	mp_head->rta_len = skb->tail - (u8*)mp_head;
1566 	rtm->rtm_type = RTN_MULTICAST;
1567 	return 1;
1568 
1569 rtattr_failure:
1570 	skb_trim(skb, b - skb->data);
1571 	return -EMSGSIZE;
1572 }
1573 
1574 int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
1575 {
1576 	int err;
1577 	struct mfc_cache *cache;
1578 	struct rtable *rt = (struct rtable*)skb->dst;
1579 
1580 	read_lock(&mrt_lock);
1581 	cache = ipmr_cache_find(rt->rt_src, rt->rt_dst);
1582 
1583 	if (cache==NULL) {
1584 		struct net_device *dev;
1585 		int vif;
1586 
1587 		if (nowait) {
1588 			read_unlock(&mrt_lock);
1589 			return -EAGAIN;
1590 		}
1591 
1592 		dev = skb->dev;
1593 		if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) {
1594 			read_unlock(&mrt_lock);
1595 			return -ENODEV;
1596 		}
1597 		skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
1598 		skb->nh.iph->ihl = sizeof(struct iphdr)>>2;
1599 		skb->nh.iph->saddr = rt->rt_src;
1600 		skb->nh.iph->daddr = rt->rt_dst;
1601 		skb->nh.iph->version = 0;
1602 		err = ipmr_cache_unresolved(vif, skb);
1603 		read_unlock(&mrt_lock);
1604 		return err;
1605 	}
1606 
1607 	if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
1608 		cache->mfc_flags |= MFC_NOTIFY;
1609 	err = ipmr_fill_mroute(skb, cache, rtm);
1610 	read_unlock(&mrt_lock);
1611 	return err;
1612 }
1613 
1614 #ifdef CONFIG_PROC_FS
1615 /*
1616  *	The /proc interfaces to multicast routing: /proc/net/ip_mr_cache and /proc/net/ip_mr_vif
1617  */
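/* Illustrative /proc/net/ip_mr_vif sample (editorial; values invented,
   layout per ipmr_vif_seq_show() below):

	Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote
	 0 eth0          123456     789    654321     987 00000 C0A80101 00000000
 */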
1618 struct ipmr_vif_iter {
1619 	int ct;
1620 };
1621 
1622 static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter,
1623 					   loff_t pos)
1624 {
1625 	for (iter->ct = 0; iter->ct < maxvif; ++iter->ct) {
1626 		if(!VIF_EXISTS(iter->ct))
1627 			continue;
1628 		if (pos-- == 0)
1629 			return &vif_table[iter->ct];
1630 	}
1631 	return NULL;
1632 }
1633 
1634 static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
1635 {
1636 	read_lock(&mrt_lock);
1637 	return *pos ? ipmr_vif_seq_idx(seq->private, *pos - 1)
1638 		: SEQ_START_TOKEN;
1639 }
1640 
1641 static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1642 {
1643 	struct ipmr_vif_iter *iter = seq->private;
1644 
1645 	++*pos;
1646 	if (v == SEQ_START_TOKEN)
1647 		return ipmr_vif_seq_idx(iter, 0);
1648 
1649 	while (++iter->ct < maxvif) {
1650 		if(!VIF_EXISTS(iter->ct))
1651 			continue;
1652 		return &vif_table[iter->ct];
1653 	}
1654 	return NULL;
1655 }
1656 
1657 static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
1658 {
1659 	read_unlock(&mrt_lock);
1660 }
1661 
1662 static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
1663 {
1664 	if (v == SEQ_START_TOKEN) {
1665 		seq_puts(seq,
1666 			 "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
1667 	} else {
1668 		const struct vif_device *vif = v;
1669 		const char *name =  vif->dev ? vif->dev->name : "none";
1670 
1671 		seq_printf(seq,
1672 			   "%2Zd %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
1673 			   vif - vif_table,
1674 			   name, vif->bytes_in, vif->pkt_in,
1675 			   vif->bytes_out, vif->pkt_out,
1676 			   vif->flags, vif->local, vif->remote);
1677 	}
1678 	return 0;
1679 }
1680 
1681 static struct seq_operations ipmr_vif_seq_ops = {
1682 	.start = ipmr_vif_seq_start,
1683 	.next  = ipmr_vif_seq_next,
1684 	.stop  = ipmr_vif_seq_stop,
1685 	.show  = ipmr_vif_seq_show,
1686 };
1687 
1688 static int ipmr_vif_open(struct inode *inode, struct file *file)
1689 {
1690 	struct seq_file *seq;
1691 	int rc = -ENOMEM;
1692 	struct ipmr_vif_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
1693 
1694 	if (!s)
1695 		goto out;
1696 
1697 	rc = seq_open(file, &ipmr_vif_seq_ops);
1698 	if (rc)
1699 		goto out_kfree;
1700 
1701 	s->ct = 0;
1702 	seq = file->private_data;
1703 	seq->private = s;
1704 out:
1705 	return rc;
1706 out_kfree:
1707 	kfree(s);
1708 	goto out;
1709 
1710 }
1711 
1712 static struct file_operations ipmr_vif_fops = {
1713 	.owner	 = THIS_MODULE,
1714 	.open    = ipmr_vif_open,
1715 	.read    = seq_read,
1716 	.llseek  = seq_lseek,
1717 	.release = seq_release_private,
1718 };
1719 
1720 struct ipmr_mfc_iter {
1721 	struct mfc_cache **cache;
1722 	int ct;
1723 };
1724 
1725 
1726 static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos)
1727 {
1728 	struct mfc_cache *mfc;
1729 
1730 	it->cache = mfc_cache_array;
1731 	read_lock(&mrt_lock);
1732 	for (it->ct = 0; it->ct < MFC_LINES; it->ct++)
1733 		for(mfc = mfc_cache_array[it->ct]; mfc; mfc = mfc->next)
1734 			if (pos-- == 0)
1735 				return mfc;
1736 	read_unlock(&mrt_lock);
1737 
1738 	it->cache = &mfc_unres_queue;
1739 	spin_lock_bh(&mfc_unres_lock);
1740 	for(mfc = mfc_unres_queue; mfc; mfc = mfc->next)
1741 		if (pos-- == 0)
1742 			return mfc;
1743 	spin_unlock_bh(&mfc_unres_lock);
1744 
1745 	it->cache = NULL;
1746 	return NULL;
1747 }
1748 
1749 
1750 static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
1751 {
1752 	struct ipmr_mfc_iter *it = seq->private;
1753 	it->cache = NULL;
1754 	it->ct = 0;
1755 	return *pos ? ipmr_mfc_seq_idx(seq->private, *pos - 1)
1756 		: SEQ_START_TOKEN;
1757 }
1758 
1759 static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1760 {
1761 	struct mfc_cache *mfc = v;
1762 	struct ipmr_mfc_iter *it = seq->private;
1763 
1764 	++*pos;
1765 
1766 	if (v == SEQ_START_TOKEN)
1767 		return ipmr_mfc_seq_idx(seq->private, 0);
1768 
1769 	if (mfc->next)
1770 		return mfc->next;
1771 
1772 	if (it->cache == &mfc_unres_queue)
1773 		goto end_of_list;
1774 
1775 	BUG_ON(it->cache != mfc_cache_array);
1776 
1777 	while (++it->ct < MFC_LINES) {
1778 		mfc = mfc_cache_array[it->ct];
1779 		if (mfc)
1780 			return mfc;
1781 	}
1782 
1783 	/* exhausted cache_array, show unresolved */
1784 	read_unlock(&mrt_lock);
1785 	it->cache = &mfc_unres_queue;
1786 	it->ct = 0;
1787 
1788 	spin_lock_bh(&mfc_unres_lock);
1789 	mfc = mfc_unres_queue;
1790 	if (mfc)
1791 		return mfc;
1792 
1793  end_of_list:
1794 	spin_unlock_bh(&mfc_unres_lock);
1795 	it->cache = NULL;
1796 
1797 	return NULL;
1798 }
1799 
1800 static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
1801 {
1802 	struct ipmr_mfc_iter *it = seq->private;
1803 
1804 	if (it->cache == &mfc_unres_queue)
1805 		spin_unlock_bh(&mfc_unres_lock);
1806 	else if (it->cache == mfc_cache_array)
1807 		read_unlock(&mrt_lock);
1808 }
1809 
1810 static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
1811 {
1812 	int n;
1813 
1814 	if (v == SEQ_START_TOKEN) {
1815 		seq_puts(seq,
1816 		 "Group    Origin   Iif     Pkts    Bytes    Wrong Oifs\n");
1817 	} else {
1818 		const struct mfc_cache *mfc = v;
1819 		const struct ipmr_mfc_iter *it = seq->private;
1820 
1821 		seq_printf(seq, "%08lX %08lX %-3d %8ld %8ld %8ld",
1822 			   (unsigned long) mfc->mfc_mcastgrp,
1823 			   (unsigned long) mfc->mfc_origin,
1824 			   mfc->mfc_parent,
1825 			   mfc->mfc_un.res.pkt,
1826 			   mfc->mfc_un.res.bytes,
1827 			   mfc->mfc_un.res.wrong_if);
1828 
1829 		if (it->cache != &mfc_unres_queue) {
1830 			for(n = mfc->mfc_un.res.minvif;
1831 			    n < mfc->mfc_un.res.maxvif; n++ ) {
1832 				if(VIF_EXISTS(n)
1833 				   && mfc->mfc_un.res.ttls[n] < 255)
1834 				seq_printf(seq,
1835 					   " %2d:%-3d",
1836 					   n, mfc->mfc_un.res.ttls[n]);
1837 			}
1838 		}
1839 		seq_putc(seq, '\n');
1840 	}
1841 	return 0;
1842 }
1843 
1844 static struct seq_operations ipmr_mfc_seq_ops = {
1845 	.start = ipmr_mfc_seq_start,
1846 	.next  = ipmr_mfc_seq_next,
1847 	.stop  = ipmr_mfc_seq_stop,
1848 	.show  = ipmr_mfc_seq_show,
1849 };
1850 
1851 static int ipmr_mfc_open(struct inode *inode, struct file *file)
1852 {
1853 	struct seq_file *seq;
1854 	int rc = -ENOMEM;
1855 	struct ipmr_mfc_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
1856 
1857 	if (!s)
1858 		goto out;
1859 
1860 	rc = seq_open(file, &ipmr_mfc_seq_ops);
1861 	if (rc)
1862 		goto out_kfree;
1863 
1864 	seq = file->private_data;
1865 	seq->private = s;
1866 out:
1867 	return rc;
1868 out_kfree:
1869 	kfree(s);
1870 	goto out;
1871 
1872 }
1873 
1874 static struct file_operations ipmr_mfc_fops = {
1875 	.owner	 = THIS_MODULE,
1876 	.open    = ipmr_mfc_open,
1877 	.read    = seq_read,
1878 	.llseek  = seq_lseek,
1879 	.release = seq_release_private,
1880 };
1881 #endif
1882 
1883 #ifdef CONFIG_IP_PIMSM_V2
1884 static struct net_protocol pim_protocol = {
1885 	.handler	=	pim_rcv,
1886 };
1887 #endif
1888 
1889 
1890 /*
1891  *	Setup for IP multicast routing
1892  */
1893 
1894 void __init ip_mr_init(void)
1895 {
1896 	mrt_cachep = kmem_cache_create("ip_mrt_cache",
1897 				       sizeof(struct mfc_cache),
1898 				       0, SLAB_HWCACHE_ALIGN,
1899 				       NULL, NULL);
1900 	if (!mrt_cachep)
1901 		panic("cannot allocate ip_mrt_cache");
1902 
1903 	init_timer(&ipmr_expire_timer);
1904 	ipmr_expire_timer.function=ipmr_expire_process;
1905 	register_netdevice_notifier(&ip_mr_notifier);
1906 #ifdef CONFIG_PROC_FS
1907 	proc_net_fops_create("ip_mr_vif", 0, &ipmr_vif_fops);
1908 	proc_net_fops_create("ip_mr_cache", 0, &ipmr_mfc_fops);
1909 #endif
1910 }
1911