/*
 *	IP multicast routing support for mrouted 3.6/3.8
 *
 *		(c) 1995 Alan Cox, <alan@redhat.com>
 *	  Linux Consultancy and Custom Driver Development
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Version: $Id: ipmr.c,v 1.65 2001/10/31 21:55:54 davem Exp $
 *
 *	Fixes:
 *	Michael Chastain	:	Incorrect size of copying.
 *	Alan Cox		:	Added the cache manager code
 *	Alan Cox		:	Fixed the clone/copy bug and device race.
 *	Mike McLagan		:	Routing by source
 *	Malcolm Beattie		:	Buffer handling fixes.
 *	Alexey Kuznetsov	:	Double buffer free and other fixes.
 *	SVR Anand		:	Fixed several multicast bugs and problems.
 *	Alexey Kuznetsov	:	Status, optimisations and more.
 *	Brad Parker		:	Better behaviour on mrouted upcall
 *					overflow.
 *	Carlos Picoto		:	PIMv1 Support
 *	Pavlin Ivanov Radoslavov:	PIMv2 Registers must checksum only PIM header
 *					Relax this requirement to work with older peers.
 *
 */

#include <asm/system.h>
#include <asm/uaccess.h>
#include <linux/types.h>
#include <linux/sched.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/kernel.h>
#include <linux/fcntl.h>
#include <linux/stat.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/if_ether.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/route.h>
#include <net/sock.h>
#include <net/icmp.h>
#include <net/udp.h>
#include <net/raw.h>
#include <linux/notifier.h>
#include <linux/if_arp.h>
#include <linux/netfilter_ipv4.h>
#include <net/ipip.h>
#include <net/checksum.h>

#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
#define CONFIG_IP_PIMSM	1
#endif

static struct sock *mroute_socket;


/* Big lock, protecting the vif table, mrt cache and mroute socket state.
   Note that changes are serialized via rtnl_lock.
 */

static DEFINE_RWLOCK(mrt_lock);

/*
 *	Multicast router control variables
 */

static struct vif_device vif_table[MAXVIFS];		/* Devices 		*/
static int maxvif;

#define VIF_EXISTS(idx) (vif_table[idx].dev != NULL)

static int mroute_do_assert;				/* Set in PIM assert	*/
static int mroute_do_pim;

static struct mfc_cache *mfc_cache_array[MFC_LINES];	/* Forwarding cache	*/

static struct mfc_cache *mfc_unres_queue;		/* Queue of unresolved entries */
static atomic_t cache_resolve_queue_len;		/* Size of unresolved	*/

/* Special spinlock for queue of unresolved entries */
static DEFINE_SPINLOCK(mfc_unres_lock);

/* We return to Alan's original scheme. The hash table of resolved
   entries is changed only in process context and is protected by the
   weak rwlock mrt_lock. The queue of unresolved entries is protected
   by the strong spinlock mfc_unres_lock.

   This way the data path is entirely free of exclusive locks.
 */
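
/*
 * Illustrative sketch (editor's addition, not part of the original
 * source): how the two locks split the work. The data path takes only
 * the read side of mrt_lock, while control-path updates take the write
 * side with bottom halves disabled:
 *
 *	read_lock(&mrt_lock);			data path
 *	c = ipmr_cache_find(saddr, daddr);
 *	...
 *	read_unlock(&mrt_lock);
 *
 *	write_lock_bh(&mrt_lock);		control path, process context
 *	vif_table[vifi].dev = dev;
 *	write_unlock_bh(&mrt_lock);
 */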

static kmem_cache_t *mrt_cachep __read_mostly;

static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm);

#ifdef CONFIG_IP_PIMSM_V2
static struct net_protocol pim_protocol;
#endif

static struct timer_list ipmr_expire_timer;

/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */

static
struct net_device *ipmr_new_tunnel(struct vifctl *v)
{
	struct net_device  *dev;

	dev = __dev_get_by_name("tunl0");

	if (dev) {
		int err;
		struct ifreq ifr;
		mm_segment_t	oldfs;
		struct ip_tunnel_parm p;
		struct in_device  *in_dev;

		memset(&p, 0, sizeof(p));
		p.iph.daddr = v->vifc_rmt_addr.s_addr;
		p.iph.saddr = v->vifc_lcl_addr.s_addr;
		p.iph.version = 4;
		p.iph.ihl = 5;
		p.iph.protocol = IPPROTO_IPIP;
		sprintf(p.name, "dvmrp%d", v->vifc_vifi);
		ifr.ifr_ifru.ifru_data = (void*)&p;

		oldfs = get_fs(); set_fs(KERNEL_DS);
		err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL);
		set_fs(oldfs);

		dev = NULL;

		if (err == 0 && (dev = __dev_get_by_name(p.name)) != NULL) {
			dev->flags |= IFF_MULTICAST;

			in_dev = __in_dev_get_rtnl(dev);
			if (in_dev == NULL && (in_dev = inetdev_init(dev)) == NULL)
				goto failure;
			in_dev->cnf.rp_filter = 0;

			if (dev_open(dev))
				goto failure;
		}
	}
	return dev;

failure:
	/* allow the register to be completed before unregistering. */
	rtnl_unlock();
	rtnl_lock();

	unregister_netdevice(dev);
	return NULL;
}
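
/*
 * Illustrative userspace sketch (editor's addition): a routing daemon
 * reaches the path above by adding a vif with VIFF_TUNNEL set. The vif
 * index and addresses below are hypothetical:
 *
 *	struct vifctl vc;
 *
 *	memset(&vc, 0, sizeof(vc));
 *	vc.vifc_vifi = 1;
 *	vc.vifc_flags = VIFF_TUNNEL;
 *	vc.vifc_threshold = 1;
 *	vc.vifc_rate_limit = 0;
 *	vc.vifc_lcl_addr.s_addr = inet_addr("192.0.2.1");
 *	vc.vifc_rmt_addr.s_addr = inet_addr("198.51.100.1");
 *	if (setsockopt(mrouter_fd, IPPROTO_IP, MRT_ADD_VIF,
 *		       &vc, sizeof(vc)) < 0)
 *		perror("MRT_ADD_VIF");
 */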

#ifdef CONFIG_IP_PIMSM

static int reg_vif_num = -1;

static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
{
	read_lock(&mrt_lock);
	((struct net_device_stats*)netdev_priv(dev))->tx_bytes += skb->len;
	((struct net_device_stats*)netdev_priv(dev))->tx_packets++;
	ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT);
	read_unlock(&mrt_lock);
	kfree_skb(skb);
	return 0;
}

static struct net_device_stats *reg_vif_get_stats(struct net_device *dev)
{
	return (struct net_device_stats*)netdev_priv(dev);
}

static void reg_vif_setup(struct net_device *dev)
{
	dev->type		= ARPHRD_PIMREG;
	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 8;
	dev->flags		= IFF_NOARP;
	dev->hard_start_xmit	= reg_vif_xmit;
	dev->get_stats		= reg_vif_get_stats;
	dev->destructor		= free_netdev;
}

static struct net_device *ipmr_reg_vif(void)
{
	struct net_device *dev;
	struct in_device *in_dev;

	dev = alloc_netdev(sizeof(struct net_device_stats), "pimreg",
			   reg_vif_setup);

	if (dev == NULL)
		return NULL;

	if (register_netdevice(dev)) {
		free_netdev(dev);
		return NULL;
	}
	dev->iflink = 0;

	if ((in_dev = inetdev_init(dev)) == NULL)
		goto failure;

	in_dev->cnf.rp_filter = 0;

	if (dev_open(dev))
		goto failure;

	return dev;

failure:
	/* allow the register to be completed before unregistering. */
	rtnl_unlock();
	rtnl_lock();

	unregister_netdevice(dev);
	return NULL;
}
#endif
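
/*
 * Illustrative userspace sketch (editor's addition): with PIM-SM the
 * daemon enables PIM processing and adds the special register vif,
 * which delivers whole packets to it via IGMPMSG_WHOLEPKT. The vif
 * index is hypothetical:
 *
 *	int on = 1;
 *	struct vifctl vc;
 *
 *	setsockopt(mrouter_fd, IPPROTO_IP, MRT_PIM, &on, sizeof(on));
 *
 *	memset(&vc, 0, sizeof(vc));
 *	vc.vifc_vifi = 0;
 *	vc.vifc_flags = VIFF_REGISTER;
 *	vc.vifc_threshold = 1;
 *	setsockopt(mrouter_fd, IPPROTO_IP, MRT_ADD_VIF, &vc, sizeof(vc));
 */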

/*
 *	Delete a VIF entry
 */

static int vif_delete(int vifi)
{
	struct vif_device *v;
	struct net_device *dev;
	struct in_device *in_dev;

	if (vifi < 0 || vifi >= maxvif)
		return -EADDRNOTAVAIL;

	v = &vif_table[vifi];

	write_lock_bh(&mrt_lock);
	dev = v->dev;
	v->dev = NULL;

	if (!dev) {
		write_unlock_bh(&mrt_lock);
		return -EADDRNOTAVAIL;
	}

#ifdef CONFIG_IP_PIMSM
	if (vifi == reg_vif_num)
		reg_vif_num = -1;
#endif

	if (vifi+1 == maxvif) {
		int tmp;
		for (tmp=vifi-1; tmp>=0; tmp--) {
			if (VIF_EXISTS(tmp))
				break;
		}
		maxvif = tmp+1;
	}

	write_unlock_bh(&mrt_lock);

	dev_set_allmulti(dev, -1);

	if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
		in_dev->cnf.mc_forwarding--;
		ip_rt_multicast_event(in_dev);
	}

	if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
		unregister_netdevice(dev);

	dev_put(dev);
	return 0;
}

/* Destroy an unresolved cache entry, killing queued skbs
   and reporting error to netlink readers.
 */

static void ipmr_destroy_unres(struct mfc_cache *c)
{
	struct sk_buff *skb;
	struct nlmsgerr *e;

	atomic_dec(&cache_resolve_queue_len);

	while((skb=skb_dequeue(&c->mfc_un.unres.unresolved))) {
		if (skb->nh.iph->version == 0) {
			struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
			nlh->nlmsg_type = NLMSG_ERROR;
			nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
			skb_trim(skb, nlh->nlmsg_len);
			e = NLMSG_DATA(nlh);
			e->error = -ETIMEDOUT;
			memset(&e->msg, 0, sizeof(e->msg));
			netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
		} else
			kfree_skb(skb);
	}

	kmem_cache_free(mrt_cachep, c);
}


/* A single timer process handles the entire unresolved queue. */

static void ipmr_expire_process(unsigned long dummy)
{
	unsigned long now;
	unsigned long expires;
	struct mfc_cache *c, **cp;

	if (!spin_trylock(&mfc_unres_lock)) {
		mod_timer(&ipmr_expire_timer, jiffies+HZ/10);
		return;
	}

	if (atomic_read(&cache_resolve_queue_len) == 0)
		goto out;

	now = jiffies;
	expires = 10*HZ;
	cp = &mfc_unres_queue;

	while ((c=*cp) != NULL) {
		if (time_after(c->mfc_un.unres.expires, now)) {
			unsigned long interval = c->mfc_un.unres.expires - now;
			if (interval < expires)
				expires = interval;
			cp = &c->next;
			continue;
		}

		*cp = c->next;

		ipmr_destroy_unres(c);
	}

	if (atomic_read(&cache_resolve_queue_len))
		mod_timer(&ipmr_expire_timer, jiffies + expires);

out:
	spin_unlock(&mfc_unres_lock);
}

/* Fill the oifs list. Called under write-locked mrt_lock. */

static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
{
	int vifi;

	cache->mfc_un.res.minvif = MAXVIFS;
	cache->mfc_un.res.maxvif = 0;
	memset(cache->mfc_un.res.ttls, 255, MAXVIFS);

	for (vifi=0; vifi<maxvif; vifi++) {
		if (VIF_EXISTS(vifi) && ttls[vifi] && ttls[vifi] < 255) {
			cache->mfc_un.res.ttls[vifi] = ttls[vifi];
			if (cache->mfc_un.res.minvif > vifi)
				cache->mfc_un.res.minvif = vifi;
			if (cache->mfc_un.res.maxvif <= vifi)
				cache->mfc_un.res.maxvif = vifi + 1;
		}
	}
}
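
/*
 * Worked example (editor's addition): a ttls[] array of {2, 0, 255, 3}
 * marks vif 0 and vif 3 as output interfaces (minvif = 0, maxvif = 4).
 * A ttl of 0 or 255 means "not an oif"; on the forwarding path a packet
 * is sent out vif N only when its IP TTL is strictly greater than
 * ttls[N] (see ip_mr_forward below), so vif 0 requires ttl > 2 and
 * vif 3 requires ttl > 3.
 */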

static int vif_add(struct vifctl *vifc, int mrtsock)
{
	int vifi = vifc->vifc_vifi;
	struct vif_device *v = &vif_table[vifi];
	struct net_device *dev;
	struct in_device *in_dev;

	/* Is vif busy ? */
	if (VIF_EXISTS(vifi))
		return -EADDRINUSE;

	switch (vifc->vifc_flags) {
#ifdef CONFIG_IP_PIMSM
	case VIFF_REGISTER:
		/*
		 * Special Purpose VIF in PIM
		 * All the packets will be sent to the daemon
		 */
		if (reg_vif_num >= 0)
			return -EADDRINUSE;
		dev = ipmr_reg_vif();
		if (!dev)
			return -ENOBUFS;
		break;
#endif
	case VIFF_TUNNEL:
		dev = ipmr_new_tunnel(vifc);
		if (!dev)
			return -ENOBUFS;
		break;
	case 0:
		dev = ip_dev_find(vifc->vifc_lcl_addr.s_addr);
		if (!dev)
			return -EADDRNOTAVAIL;
		dev_put(dev);
		break;
	default:
		return -EINVAL;
	}

	if ((in_dev = __in_dev_get_rtnl(dev)) == NULL)
		return -EADDRNOTAVAIL;
	in_dev->cnf.mc_forwarding++;
	dev_set_allmulti(dev, +1);
	ip_rt_multicast_event(in_dev);

	/*
	 *	Fill in the VIF structures
	 */
	v->rate_limit=vifc->vifc_rate_limit;
	v->local=vifc->vifc_lcl_addr.s_addr;
	v->remote=vifc->vifc_rmt_addr.s_addr;
	v->flags=vifc->vifc_flags;
	if (!mrtsock)
		v->flags |= VIFF_STATIC;
	v->threshold=vifc->vifc_threshold;
	v->bytes_in = 0;
	v->bytes_out = 0;
	v->pkt_in = 0;
	v->pkt_out = 0;
	v->link = dev->ifindex;
	if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
		v->link = dev->iflink;

	/* And finish the update by writing the critical data */
	write_lock_bh(&mrt_lock);
	dev_hold(dev);
	v->dev=dev;
#ifdef CONFIG_IP_PIMSM
	if (v->flags&VIFF_REGISTER)
		reg_vif_num = vifi;
#endif
	if (vifi+1 > maxvif)
		maxvif = vifi+1;
	write_unlock_bh(&mrt_lock);
	return 0;
}

static struct mfc_cache *ipmr_cache_find(__u32 origin, __u32 mcastgrp)
{
	int line=MFC_HASH(mcastgrp,origin);
	struct mfc_cache *c;

	for (c=mfc_cache_array[line]; c; c = c->next) {
		if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp)
			break;
	}
	return c;
}

/*
 *	Allocate a multicast cache entry
 */
static struct mfc_cache *ipmr_cache_alloc(void)
{
	struct mfc_cache *c=kmem_cache_alloc(mrt_cachep, GFP_KERNEL);
	if(c==NULL)
		return NULL;
	memset(c, 0, sizeof(*c));
	c->mfc_un.res.minvif = MAXVIFS;
	return c;
}

static struct mfc_cache *ipmr_cache_alloc_unres(void)
{
	struct mfc_cache *c=kmem_cache_alloc(mrt_cachep, GFP_ATOMIC);
	if(c==NULL)
		return NULL;
	memset(c, 0, sizeof(*c));
	skb_queue_head_init(&c->mfc_un.unres.unresolved);
	c->mfc_un.unres.expires = jiffies + 10*HZ;
	return c;
}

/*
 *	A cache entry has moved from the unresolved queue to the resolved state
 */

static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
{
	struct sk_buff *skb;
	struct nlmsgerr *e;

	/*
	 *	Play the pending entries through our router
	 */

	while((skb=__skb_dequeue(&uc->mfc_un.unres.unresolved))) {
		if (skb->nh.iph->version == 0) {
			int err;
			struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));

			if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) {
				nlh->nlmsg_len = skb->tail - (u8*)nlh;
			} else {
				nlh->nlmsg_type = NLMSG_ERROR;
				nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
				skb_trim(skb, nlh->nlmsg_len);
				e = NLMSG_DATA(nlh);
				e->error = -EMSGSIZE;
				memset(&e->msg, 0, sizeof(e->msg));
			}
			err = netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
		} else
			ip_mr_forward(skb, c, 0);
	}
}

/*
 *	Bounce a cache query up to mrouted. We could use netlink for this but mrouted
 *	expects the following bizarre scheme.
 *
 *	Called under mrt_lock.
 */

static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
{
	struct sk_buff *skb;
	int ihl = pkt->nh.iph->ihl<<2;
	struct igmphdr *igmp;
	struct igmpmsg *msg;
	int ret;

#ifdef CONFIG_IP_PIMSM
	if (assert == IGMPMSG_WHOLEPKT)
		skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
	else
#endif
		skb = alloc_skb(128, GFP_ATOMIC);

	if(!skb)
		return -ENOBUFS;

#ifdef CONFIG_IP_PIMSM
	if (assert == IGMPMSG_WHOLEPKT) {
		/* Ugly, but we have no choice with this interface.
		   Duplicate the old header, fix the ihl, length etc.
		   And all this only to mangle msg->im_msgtype and
		   to set msg->im_mbz to "mbz" :-)
		 */
		msg = (struct igmpmsg*)skb_push(skb, sizeof(struct iphdr));
		skb->nh.raw = skb->h.raw = (u8*)msg;
		memcpy(msg, pkt->nh.raw, sizeof(struct iphdr));
		msg->im_msgtype = IGMPMSG_WHOLEPKT;
		msg->im_mbz = 0;
		msg->im_vif = reg_vif_num;
		skb->nh.iph->ihl = sizeof(struct iphdr) >> 2;
		skb->nh.iph->tot_len = htons(ntohs(pkt->nh.iph->tot_len) + sizeof(struct iphdr));
	} else
#endif
	{

	/*
	 *	Copy the IP header
	 */

	skb->nh.iph = (struct iphdr *)skb_put(skb, ihl);
	memcpy(skb->data,pkt->data,ihl);
	skb->nh.iph->protocol = 0;			/* Flag to the kernel this is a route add */
	msg = (struct igmpmsg*)skb->nh.iph;
	msg->im_vif = vifi;
	skb->dst = dst_clone(pkt->dst);

	/*
	 *	Add our header
	 */

	igmp=(struct igmphdr *)skb_put(skb,sizeof(struct igmphdr));
	igmp->type	=
	msg->im_msgtype = assert;
	igmp->code 	=	0;
	skb->nh.iph->tot_len=htons(skb->len);			/* Fix the length */
	skb->h.raw = skb->nh.raw;
	}

	if (mroute_socket == NULL) {
		kfree_skb(skb);
		return -EINVAL;
	}

	/*
	 *	Deliver to mrouted
	 */
	if ((ret=sock_queue_rcv_skb(mroute_socket,skb))<0) {
		if (net_ratelimit())
			printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
		kfree_skb(skb);
	}

	return ret;
}
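
/*
 * Illustrative userspace sketch (editor's addition): the daemon reads
 * these reports from its IGMP raw socket. Because im_mbz overlays the
 * IP protocol byte, im_mbz == 0 distinguishes kernel upcalls from real
 * IGMP packets (whose protocol byte is 2):
 *
 *	char buf[1500];
 *	struct igmpmsg *im;
 *	ssize_t n = recv(mrouter_fd, buf, sizeof(buf), 0);
 *
 *	im = (struct igmpmsg *)buf;
 *	if (n >= (ssize_t)sizeof(*im) && im->im_mbz == 0) {
 *		switch (im->im_msgtype) {
 *		case IGMPMSG_NOCACHE:	... install an MFC entry ...
 *		case IGMPMSG_WRONGVIF:	... PIM assert processing ...
 *		case IGMPMSG_WHOLEPKT:	... PIM register encapsulation ...
 *		}
 *	}
 */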

/*
 *	Queue a packet for resolution. It is queued on a locked cache
 *	entry in the unresolved queue!
 */

static int
ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb)
{
	int err;
	struct mfc_cache *c;

	spin_lock_bh(&mfc_unres_lock);
	for (c=mfc_unres_queue; c; c=c->next) {
		if (c->mfc_mcastgrp == skb->nh.iph->daddr &&
		    c->mfc_origin == skb->nh.iph->saddr)
			break;
	}

	if (c == NULL) {
		/*
		 *	Create a new entry if allowable
		 */

		if (atomic_read(&cache_resolve_queue_len)>=10 ||
		    (c=ipmr_cache_alloc_unres())==NULL) {
			spin_unlock_bh(&mfc_unres_lock);

			kfree_skb(skb);
			return -ENOBUFS;
		}

		/*
		 *	Fill in the new cache entry
		 */
		c->mfc_parent=-1;
		c->mfc_origin=skb->nh.iph->saddr;
		c->mfc_mcastgrp=skb->nh.iph->daddr;

		/*
		 *	Reflect first query at mrouted.
		 */
		if ((err = ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE))<0) {
			/* If the report failed throw the cache entry
			   out - Brad Parker
			 */
			spin_unlock_bh(&mfc_unres_lock);

			kmem_cache_free(mrt_cachep, c);
			kfree_skb(skb);
			return err;
		}

		atomic_inc(&cache_resolve_queue_len);
		c->next = mfc_unres_queue;
		mfc_unres_queue = c;

		mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires);
	}

	/*
	 *	See if we can append the packet
	 */
	if (c->mfc_un.unres.unresolved.qlen>3) {
		kfree_skb(skb);
		err = -ENOBUFS;
	} else {
		skb_queue_tail(&c->mfc_un.unres.unresolved,skb);
		err = 0;
	}

	spin_unlock_bh(&mfc_unres_lock);
	return err;
}

/*
 *	MFC cache manipulation by user space mroute daemon
 */

static int ipmr_mfc_delete(struct mfcctl *mfc)
{
	int line;
	struct mfc_cache *c, **cp;

	line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);

	for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
		if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
			write_lock_bh(&mrt_lock);
			*cp = c->next;
			write_unlock_bh(&mrt_lock);

			kmem_cache_free(mrt_cachep, c);
			return 0;
		}
	}
	return -ENOENT;
}

static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
{
	int line;
	struct mfc_cache *uc, *c, **cp;

	line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);

	for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
		if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr)
			break;
	}

	if (c != NULL) {
		write_lock_bh(&mrt_lock);
		c->mfc_parent = mfc->mfcc_parent;
		ipmr_update_thresholds(c, mfc->mfcc_ttls);
		if (!mrtsock)
			c->mfc_flags |= MFC_STATIC;
		write_unlock_bh(&mrt_lock);
		return 0;
	}

	if(!MULTICAST(mfc->mfcc_mcastgrp.s_addr))
		return -EINVAL;

	c=ipmr_cache_alloc();
	if (c==NULL)
		return -ENOMEM;

	c->mfc_origin=mfc->mfcc_origin.s_addr;
	c->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr;
	c->mfc_parent=mfc->mfcc_parent;
	ipmr_update_thresholds(c, mfc->mfcc_ttls);
	if (!mrtsock)
		c->mfc_flags |= MFC_STATIC;

	write_lock_bh(&mrt_lock);
	c->next = mfc_cache_array[line];
	mfc_cache_array[line] = c;
	write_unlock_bh(&mrt_lock);

	/*
	 *	Check whether we resolved a queued entry. If so we
	 *	need to send out the queued frames and tidy up.
	 */
	spin_lock_bh(&mfc_unres_lock);
	for (cp = &mfc_unres_queue; (uc=*cp) != NULL;
	     cp = &uc->next) {
		if (uc->mfc_origin == c->mfc_origin &&
		    uc->mfc_mcastgrp == c->mfc_mcastgrp) {
			*cp = uc->next;
			if (atomic_dec_and_test(&cache_resolve_queue_len))
				del_timer(&ipmr_expire_timer);
			break;
		}
	}
	spin_unlock_bh(&mfc_unres_lock);

	if (uc) {
		ipmr_cache_resolve(uc, c);
		kmem_cache_free(mrt_cachep, uc);
	}
	return 0;
}
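
/*
 * Illustrative userspace sketch (editor's addition): installing an
 * (S,G) route in response to IGMPMSG_NOCACHE. Addresses and vif
 * numbers are hypothetical:
 *
 *	struct mfcctl mc;
 *
 *	memset(&mc, 0, sizeof(mc));
 *	mc.mfcc_origin.s_addr   = inet_addr("192.0.2.10");
 *	mc.mfcc_mcastgrp.s_addr = inet_addr("239.1.2.3");
 *	mc.mfcc_parent = 0;		incoming vif
 *	mc.mfcc_ttls[1] = 1;		forward out vif 1 when ttl > 1
 *	mc.mfcc_ttls[2] = 1;
 *	if (setsockopt(mrouter_fd, IPPROTO_IP, MRT_ADD_MFC,
 *		       &mc, sizeof(mc)) < 0)
 *		perror("MRT_ADD_MFC");
 */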

/*
 *	Close the multicast socket, and clear the vif tables etc
 */

static void mroute_clean_tables(struct sock *sk)
{
	int i;

	/*
	 *	Shut down all active vif entries
	 */
	for(i=0; i<maxvif; i++) {
		if (!(vif_table[i].flags&VIFF_STATIC))
			vif_delete(i);
	}

	/*
	 *	Wipe the cache
	 */
	for (i=0;i<MFC_LINES;i++) {
		struct mfc_cache *c, **cp;

		cp = &mfc_cache_array[i];
		while ((c = *cp) != NULL) {
			if (c->mfc_flags&MFC_STATIC) {
				cp = &c->next;
				continue;
			}
			write_lock_bh(&mrt_lock);
			*cp = c->next;
			write_unlock_bh(&mrt_lock);

			kmem_cache_free(mrt_cachep, c);
		}
	}

	if (atomic_read(&cache_resolve_queue_len) != 0) {
		struct mfc_cache *c;

		spin_lock_bh(&mfc_unres_lock);
		while (mfc_unres_queue != NULL) {
			c = mfc_unres_queue;
			mfc_unres_queue = c->next;
			spin_unlock_bh(&mfc_unres_lock);

			ipmr_destroy_unres(c);

			spin_lock_bh(&mfc_unres_lock);
		}
		spin_unlock_bh(&mfc_unres_lock);
	}
}

static void mrtsock_destruct(struct sock *sk)
{
	rtnl_lock();
	if (sk == mroute_socket) {
		ipv4_devconf.mc_forwarding--;

		write_lock_bh(&mrt_lock);
		mroute_socket=NULL;
		write_unlock_bh(&mrt_lock);

		mroute_clean_tables(sk);
	}
	rtnl_unlock();
}

/*
 *	Socket options and virtual interface manipulation. The whole
 *	virtual interface system is a complete heap, but unfortunately
 *	that's how BSD mrouted happens to think. Maybe one day with a proper
 *	MOSPF/PIM router set up we can clean this up.
 */

int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int optlen)
{
	int ret;
	struct vifctl vif;
	struct mfcctl mfc;

	if(optname!=MRT_INIT)
	{
		if(sk!=mroute_socket && !capable(CAP_NET_ADMIN))
			return -EACCES;
	}

	switch(optname)
	{
		case MRT_INIT:
			if (sk->sk_type != SOCK_RAW ||
			    inet_sk(sk)->num != IPPROTO_IGMP)
				return -EOPNOTSUPP;
			if(optlen!=sizeof(int))
				return -ENOPROTOOPT;

			rtnl_lock();
			if (mroute_socket) {
				rtnl_unlock();
				return -EADDRINUSE;
			}

			ret = ip_ra_control(sk, 1, mrtsock_destruct);
			if (ret == 0) {
				write_lock_bh(&mrt_lock);
				mroute_socket=sk;
				write_unlock_bh(&mrt_lock);

				ipv4_devconf.mc_forwarding++;
			}
			rtnl_unlock();
			return ret;
		case MRT_DONE:
			if (sk!=mroute_socket)
				return -EACCES;
			return ip_ra_control(sk, 0, NULL);
		case MRT_ADD_VIF:
		case MRT_DEL_VIF:
			if(optlen!=sizeof(vif))
				return -EINVAL;
			if (copy_from_user(&vif,optval,sizeof(vif)))
				return -EFAULT;
			if(vif.vifc_vifi >= MAXVIFS)
				return -ENFILE;
			rtnl_lock();
			if (optname==MRT_ADD_VIF) {
				ret = vif_add(&vif, sk==mroute_socket);
			} else {
				ret = vif_delete(vif.vifc_vifi);
			}
			rtnl_unlock();
			return ret;

		/*
		 *	Manipulate the forwarding caches. These live
		 *	in a sort of kernel/user symbiosis.
		 */
		case MRT_ADD_MFC:
		case MRT_DEL_MFC:
			if(optlen!=sizeof(mfc))
				return -EINVAL;
			if (copy_from_user(&mfc,optval, sizeof(mfc)))
				return -EFAULT;
			rtnl_lock();
			if (optname==MRT_DEL_MFC)
				ret = ipmr_mfc_delete(&mfc);
			else
				ret = ipmr_mfc_add(&mfc, sk==mroute_socket);
			rtnl_unlock();
			return ret;
		/*
		 *	Control PIM assert.
		 */
		case MRT_ASSERT:
		{
			int v;
			if(get_user(v,(int __user *)optval))
				return -EFAULT;
			mroute_do_assert=(v)?1:0;
			return 0;
		}
#ifdef CONFIG_IP_PIMSM
		case MRT_PIM:
		{
			int v, ret;
			if(get_user(v,(int __user *)optval))
				return -EFAULT;
			v = (v)?1:0;
			rtnl_lock();
			ret = 0;
			if (v != mroute_do_pim) {
				mroute_do_pim = v;
				mroute_do_assert = v;
#ifdef CONFIG_IP_PIMSM_V2
				if (mroute_do_pim)
					ret = inet_add_protocol(&pim_protocol,
								IPPROTO_PIM);
				else
					ret = inet_del_protocol(&pim_protocol,
								IPPROTO_PIM);
				if (ret < 0)
					ret = -EAGAIN;
#endif
			}
			rtnl_unlock();
			return ret;
		}
#endif
		/*
		 *	Spurious command, or MRT_VERSION which you cannot
		 *	set.
		 */
		default:
			return -ENOPROTOOPT;
	}
}
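
/*
 * Illustrative userspace sketch (editor's addition): the lifecycle seen
 * from the daemon side. Only a raw IGMP socket may become the mroute
 * socket, and MRT_INIT must come first:
 *
 *	int fd = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
 *	int one = 1;
 *
 *	if (setsockopt(fd, IPPROTO_IP, MRT_INIT, &one, sizeof(one)) < 0)
 *		perror("MRT_INIT");	fails with EADDRINUSE if taken
 *
 *	... MRT_ADD_VIF / MRT_ADD_MFC / upcall processing ...
 *
 *	setsockopt(fd, IPPROTO_IP, MRT_DONE, NULL, 0);
 *	close(fd);
 */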

/*
 *	Getsockopt support for the multicast routing system.
 */

int ip_mroute_getsockopt(struct sock *sk,int optname,char __user *optval,int __user *optlen)
{
	int olr;
	int val;

	if(optname!=MRT_VERSION &&
#ifdef CONFIG_IP_PIMSM
	   optname!=MRT_PIM &&
#endif
	   optname!=MRT_ASSERT)
		return -ENOPROTOOPT;

	if (get_user(olr, optlen))
		return -EFAULT;

	olr = min_t(unsigned int, olr, sizeof(int));
	if (olr < 0)
		return -EINVAL;

	if(put_user(olr,optlen))
		return -EFAULT;
	if(optname==MRT_VERSION)
		val=0x0305;
#ifdef CONFIG_IP_PIMSM
	else if(optname==MRT_PIM)
		val=mroute_do_pim;
#endif
	else
		val=mroute_do_assert;
	if(copy_to_user(optval,&val,olr))
		return -EFAULT;
	return 0;
}
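
/*
 * Illustrative userspace sketch (editor's addition): querying the
 * mroute version; 0x0305 corresponds to the mrouted 3.5 interface:
 *
 *	int ver = 0;
 *	socklen_t len = sizeof(ver);
 *
 *	if (getsockopt(fd, IPPROTO_IP, MRT_VERSION, &ver, &len) == 0)
 *		printf("mroute version %#x\n", ver);
 */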

/*
 *	The IP multicast ioctl support routines.
 */

int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
{
	struct sioc_sg_req sr;
	struct sioc_vif_req vr;
	struct vif_device *vif;
	struct mfc_cache *c;

	switch(cmd)
	{
		case SIOCGETVIFCNT:
			if (copy_from_user(&vr,arg,sizeof(vr)))
				return -EFAULT;
			if(vr.vifi>=maxvif)
				return -EINVAL;
			read_lock(&mrt_lock);
			vif=&vif_table[vr.vifi];
			if(VIF_EXISTS(vr.vifi))	{
				vr.icount=vif->pkt_in;
				vr.ocount=vif->pkt_out;
				vr.ibytes=vif->bytes_in;
				vr.obytes=vif->bytes_out;
				read_unlock(&mrt_lock);

				if (copy_to_user(arg,&vr,sizeof(vr)))
					return -EFAULT;
				return 0;
			}
			read_unlock(&mrt_lock);
			return -EADDRNOTAVAIL;
		case SIOCGETSGCNT:
			if (copy_from_user(&sr,arg,sizeof(sr)))
				return -EFAULT;

			read_lock(&mrt_lock);
			c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr);
			if (c) {
				sr.pktcnt = c->mfc_un.res.pkt;
				sr.bytecnt = c->mfc_un.res.bytes;
				sr.wrong_if = c->mfc_un.res.wrong_if;
				read_unlock(&mrt_lock);

				if (copy_to_user(arg,&sr,sizeof(sr)))
					return -EFAULT;
				return 0;
			}
			read_unlock(&mrt_lock);
			return -EADDRNOTAVAIL;
		default:
			return -ENOIOCTLCMD;
	}
}
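
/*
 * Illustrative userspace sketch (editor's addition): reading per-vif
 * counters with SIOCGETVIFCNT. The vif index is hypothetical:
 *
 *	struct sioc_vif_req vreq;
 *
 *	memset(&vreq, 0, sizeof(vreq));
 *	vreq.vifi = 0;
 *	if (ioctl(fd, SIOCGETVIFCNT, &vreq) == 0)
 *		printf("vif 0: %lu pkts in, %lu pkts out\n",
 *		       vreq.icount, vreq.ocount);
 */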


static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
{
	struct vif_device *v;
	int ct;
	if (event != NETDEV_UNREGISTER)
		return NOTIFY_DONE;
	v=&vif_table[0];
	for(ct=0;ct<maxvif;ct++,v++) {
		if (v->dev==ptr)
			vif_delete(ct);
	}
	return NOTIFY_DONE;
}


static struct notifier_block ip_mr_notifier={
	.notifier_call = ipmr_device_event,
};

/*
 * 	Encapsulate a packet by attaching a valid IPIP header to it.
 *	This avoids tunnel drivers and other mess and gives us the speed
 *	that is so important for multicast video.
 */

static void ip_encap(struct sk_buff *skb, u32 saddr, u32 daddr)
{
	struct iphdr *iph = (struct iphdr *)skb_push(skb,sizeof(struct iphdr));

	iph->version	= 	4;
	iph->tos	=	skb->nh.iph->tos;
	iph->ttl	=	skb->nh.iph->ttl;
	iph->frag_off	=	0;
	iph->daddr	=	daddr;
	iph->saddr	=	saddr;
	iph->protocol	=	IPPROTO_IPIP;
	iph->ihl	=	5;
	iph->tot_len	=	htons(skb->len);
	ip_select_ident(iph, skb->dst, NULL);
	ip_send_check(iph);

	skb->h.ipiph = skb->nh.iph;
	skb->nh.iph = iph;
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	nf_reset(skb);
}
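
/*
 * Resulting packet layout (editor's addition, a sketch of what the
 * code above produces):
 *
 *	before:           [ inner IP | payload ]
 *	after ip_encap:   [ outer IP, proto=IPPROTO_IPIP | inner IP | payload ]
 *
 * The outer header copies tos/ttl from the inner header and uses the
 * tunnel endpoints (vif->local, vif->remote) as saddr/daddr.
 */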

static inline int ipmr_forward_finish(struct sk_buff *skb)
{
	struct ip_options * opt	= &(IPCB(skb)->opt);

	IP_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);

	if (unlikely(opt->optlen))
		ip_forward_options(skb);

	return dst_output(skb);
}

/*
 *	Processing handlers for ipmr_forward
 */

static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
{
	struct iphdr *iph = skb->nh.iph;
	struct vif_device *vif = &vif_table[vifi];
	struct net_device *dev;
	struct rtable *rt;
	int    encap = 0;

	if (vif->dev == NULL)
		goto out_free;

#ifdef CONFIG_IP_PIMSM
	if (vif->flags & VIFF_REGISTER) {
		vif->pkt_out++;
		vif->bytes_out+=skb->len;
		((struct net_device_stats*)netdev_priv(vif->dev))->tx_bytes += skb->len;
		((struct net_device_stats*)netdev_priv(vif->dev))->tx_packets++;
		ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT);
		kfree_skb(skb);
		return;
	}
#endif

	if (vif->flags&VIFF_TUNNEL) {
		struct flowi fl = { .oif = vif->link,
				    .nl_u = { .ip4_u =
					      { .daddr = vif->remote,
						.saddr = vif->local,
						.tos = RT_TOS(iph->tos) } },
				    .proto = IPPROTO_IPIP };
		if (ip_route_output_key(&rt, &fl))
			goto out_free;
		encap = sizeof(struct iphdr);
	} else {
		struct flowi fl = { .oif = vif->link,
				    .nl_u = { .ip4_u =
					      { .daddr = iph->daddr,
						.tos = RT_TOS(iph->tos) } },
				    .proto = IPPROTO_IPIP };
		if (ip_route_output_key(&rt, &fl))
			goto out_free;
	}

	dev = rt->u.dst.dev;

	if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
		/* Do not fragment multicasts. Alas, IPv4 does not
		   allow us to send ICMP here, so such packets will
		   silently disappear into a black hole.
		 */

		IP_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
		ip_rt_put(rt);
		goto out_free;
	}

	encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;

	if (skb_cow(skb, encap)) {
		ip_rt_put(rt);
		goto out_free;
	}

	vif->pkt_out++;
	vif->bytes_out+=skb->len;

	dst_release(skb->dst);
	skb->dst = &rt->u.dst;
	iph = skb->nh.iph;
	ip_decrease_ttl(iph);

	/* FIXME: forward and output firewalls used to be called here.
	 * What do we do with netfilter? -- RR */
	if (vif->flags & VIFF_TUNNEL) {
		ip_encap(skb, vif->local, vif->remote);
		/* FIXME: extra output firewall step used to be here. --RR */
		((struct ip_tunnel *)netdev_priv(vif->dev))->stat.tx_packets++;
		((struct ip_tunnel *)netdev_priv(vif->dev))->stat.tx_bytes+=skb->len;
	}

	IPCB(skb)->flags |= IPSKB_FORWARDED;

	/*
	 * RFC 1584 teaches that a DVMRP/PIM router must deliver packets
	 * locally not only before forwarding, but also after forwarding on
	 * all output interfaces. Clearly, if the mrouter runs a multicast
	 * application, it should receive packets regardless of which
	 * interface the application has joined on. If we did not honour
	 * this, the application would have to join on all interfaces. On
	 * the other hand, a multihomed host (or router, but not an
	 * mrouter) cannot join on more than one interface - that would
	 * result in receiving multiple copies of each packet.
	 */
	NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, dev,
		ipmr_forward_finish);
	return;

out_free:
	kfree_skb(skb);
	return;
}

static int ipmr_find_vif(struct net_device *dev)
{
	int ct;
	for (ct=maxvif-1; ct>=0; ct--) {
		if (vif_table[ct].dev == dev)
			break;
	}
	return ct;
}

/* "local" means that we should preserve one skb (for local delivery) */

static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local)
{
	int psend = -1;
	int vif, ct;

	vif = cache->mfc_parent;
	cache->mfc_un.res.pkt++;
	cache->mfc_un.res.bytes += skb->len;

	/*
	 * Wrong interface: drop packet and (maybe) send PIM assert.
	 */
	if (vif_table[vif].dev != skb->dev) {
		int true_vifi;

		if (((struct rtable*)skb->dst)->fl.iif == 0) {
			/* It is our own packet, looped back.
			   This is a very complicated situation...

			   The best workaround, until the routing daemons
			   are fixed, is not to redistribute a packet if it
			   was sent through the wrong interface. It means
			   that multicast applications WILL NOT work for
			   (S,G) entries whose default multicast route
			   points to the wrong oif. In any case, it is not
			   a good idea to run multicast applications on a
			   router.
			 */
			goto dont_forward;
		}

		cache->mfc_un.res.wrong_if++;
		true_vifi = ipmr_find_vif(skb->dev);

		if (true_vifi >= 0 && mroute_do_assert &&
		    /* PIM-SM uses asserts when switching from the RPT to
		       the SPT, so we cannot check that the packet arrived
		       on an oif. That is bad, but the alternative would be
		       to move a pretty large chunk of pimd into the
		       kernel. Ough... --ANK
		     */
		    (mroute_do_pim || cache->mfc_un.res.ttls[true_vifi] < 255) &&
		    time_after(jiffies,
			       cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
			cache->mfc_un.res.last_assert = jiffies;
			ipmr_cache_report(skb, true_vifi, IGMPMSG_WRONGVIF);
		}
		goto dont_forward;
	}

	vif_table[vif].pkt_in++;
	vif_table[vif].bytes_in+=skb->len;

	/*
	 *	Forward the frame
	 */
	for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
		if (skb->nh.iph->ttl > cache->mfc_un.res.ttls[ct]) {
			if (psend != -1) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					ipmr_queue_xmit(skb2, cache, psend);
			}
			psend=ct;
		}
	}
	if (psend != -1) {
		if (local) {
			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
			if (skb2)
				ipmr_queue_xmit(skb2, cache, psend);
		} else {
			ipmr_queue_xmit(skb, cache, psend);
			return 0;
		}
	}

dont_forward:
	if (!local)
		kfree_skb(skb);
	return 0;
}


/*
 *	Multicast packets for forwarding arrive here
 */

int ip_mr_input(struct sk_buff *skb)
{
	struct mfc_cache *cache;
	int local = ((struct rtable*)skb->dst)->rt_flags&RTCF_LOCAL;

	/* The packet was looped back after forwarding; it must not be
	   forwarded a second time, but it can still be delivered locally.
	 */
	if (IPCB(skb)->flags&IPSKB_FORWARDED)
		goto dont_forward;

	if (!local) {
		    if (IPCB(skb)->opt.router_alert) {
			    if (ip_call_ra_chain(skb))
				    return 0;
		    } else if (skb->nh.iph->protocol == IPPROTO_IGMP){
			    /* IGMPv1 (and broken IGMPv2 implementations,
			       such as Cisco IOS <= 11.2(8)) do not put the
			       router alert option into IGMP packets destined
			       to routable groups. It is very bad, because it
			       means that we can forward NO IGMP messages.
			     */
			    read_lock(&mrt_lock);
			    if (mroute_socket) {
				    nf_reset(skb);
				    raw_rcv(mroute_socket, skb);
				    read_unlock(&mrt_lock);
				    return 0;
			    }
			    read_unlock(&mrt_lock);
		    }
	}

	read_lock(&mrt_lock);
	cache = ipmr_cache_find(skb->nh.iph->saddr, skb->nh.iph->daddr);

	/*
	 *	No usable cache entry
	 */
	if (cache==NULL) {
		int vif;

		if (local) {
			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
			ip_local_deliver(skb);
			if (skb2 == NULL) {
				read_unlock(&mrt_lock);
				return -ENOBUFS;
			}
			skb = skb2;
		}

		vif = ipmr_find_vif(skb->dev);
		if (vif >= 0) {
			int err = ipmr_cache_unresolved(vif, skb);
			read_unlock(&mrt_lock);

			return err;
		}
		read_unlock(&mrt_lock);
		kfree_skb(skb);
		return -ENODEV;
	}

	ip_mr_forward(skb, cache, local);

	read_unlock(&mrt_lock);

	if (local)
		return ip_local_deliver(skb);

	return 0;

dont_forward:
	if (local)
		return ip_local_deliver(skb);
	kfree_skb(skb);
	return 0;
}

#ifdef CONFIG_IP_PIMSM_V1
/*
 * Handle IGMP messages of PIMv1
 */

int pim_rcv_v1(struct sk_buff * skb)
{
	struct igmphdr *pim;
	struct iphdr   *encap;
	struct net_device  *reg_dev = NULL;

	if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
		goto drop;

	pim = (struct igmphdr*)skb->h.raw;

	if (!mroute_do_pim ||
	    skb->len < sizeof(*pim) + sizeof(*encap) ||
	    pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
		goto drop;

	encap = (struct iphdr*)(skb->h.raw + sizeof(struct igmphdr));
	/*
	   Check that:
	   a. the packet is really destined to a multicast group
	   b. the packet is not a NULL-REGISTER
	   c. the packet is not truncated
	 */
	if (!MULTICAST(encap->daddr) ||
	    encap->tot_len == 0 ||
	    ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
		goto drop;

	read_lock(&mrt_lock);
	if (reg_vif_num >= 0)
		reg_dev = vif_table[reg_vif_num].dev;
	if (reg_dev)
		dev_hold(reg_dev);
	read_unlock(&mrt_lock);

	if (reg_dev == NULL)
		goto drop;

	skb->mac.raw = skb->nh.raw;
	skb_pull(skb, (u8*)encap - skb->data);
	skb->nh.iph = (struct iphdr *)skb->data;
	skb->dev = reg_dev;
	memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
	skb->protocol = htons(ETH_P_IP);
	skb->ip_summed = 0;
	skb->pkt_type = PACKET_HOST;
	dst_release(skb->dst);
	skb->dst = NULL;
	((struct net_device_stats*)netdev_priv(reg_dev))->rx_bytes += skb->len;
	((struct net_device_stats*)netdev_priv(reg_dev))->rx_packets++;
	nf_reset(skb);
	netif_rx(skb);
	dev_put(reg_dev);
	return 0;
 drop:
	kfree_skb(skb);
	return 0;
}
#endif

#ifdef CONFIG_IP_PIMSM_V2
static int pim_rcv(struct sk_buff * skb)
{
	struct pimreghdr *pim;
	struct iphdr   *encap;
	struct net_device  *reg_dev = NULL;

	if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
		goto drop;

	pim = (struct pimreghdr*)skb->h.raw;
	if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
	    (pim->flags&PIM_NULL_REGISTER) ||
	    (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
	     (u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))))
		goto drop;

	/* check that the inner packet is destined to a multicast group */
	encap = (struct iphdr*)(skb->h.raw + sizeof(struct pimreghdr));
	if (!MULTICAST(encap->daddr) ||
	    encap->tot_len == 0 ||
	    ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
		goto drop;

	read_lock(&mrt_lock);
	if (reg_vif_num >= 0)
		reg_dev = vif_table[reg_vif_num].dev;
	if (reg_dev)
		dev_hold(reg_dev);
	read_unlock(&mrt_lock);

	if (reg_dev == NULL)
		goto drop;

	skb->mac.raw = skb->nh.raw;
	skb_pull(skb, (u8*)encap - skb->data);
	skb->nh.iph = (struct iphdr *)skb->data;
	skb->dev = reg_dev;
	memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
	skb->protocol = htons(ETH_P_IP);
	skb->ip_summed = 0;
	skb->pkt_type = PACKET_HOST;
	dst_release(skb->dst);
	((struct net_device_stats*)netdev_priv(reg_dev))->rx_bytes += skb->len;
	((struct net_device_stats*)netdev_priv(reg_dev))->rx_packets++;
	skb->dst = NULL;
	nf_reset(skb);
	netif_rx(skb);
	dev_put(reg_dev);
	return 0;
 drop:
	kfree_skb(skb);
	return 0;
}
#endif
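
/*
 * PIM register packet layout (editor's addition, a sketch of what the
 * handlers above parse; the outer IP header has already been stripped
 * by the IP layer):
 *
 *	PIMv2:	[ struct pimreghdr (type, flags, checksum) | inner multicast IP packet ]
 *	PIMv1:	[ struct igmphdr (group/code carry the PIM version/REGISTER) | inner multicast IP packet ]
 *
 * After validation, the handler pulls the register header and feeds the
 * inner packet back through netif_rx() as if it had arrived on the
 * pimreg device.
 */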

static int
ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
{
	int ct;
	struct rtnexthop *nhp;
	struct net_device *dev = vif_table[c->mfc_parent].dev;
	u8 *b = skb->tail;
	struct rtattr *mp_head;

	if (dev)
		RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);

	mp_head = (struct rtattr*)skb_put(skb, RTA_LENGTH(0));

	for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
		if (c->mfc_un.res.ttls[ct] < 255) {
			if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
				goto rtattr_failure;
			nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
			nhp->rtnh_flags = 0;
			nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
			nhp->rtnh_ifindex = vif_table[ct].dev->ifindex;
			nhp->rtnh_len = sizeof(*nhp);
		}
	}
	mp_head->rta_type = RTA_MULTIPATH;
	mp_head->rta_len = skb->tail - (u8*)mp_head;
	rtm->rtm_type = RTN_MULTICAST;
	return 1;

rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -EMSGSIZE;
}

int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
{
	int err;
	struct mfc_cache *cache;
	struct rtable *rt = (struct rtable*)skb->dst;

	read_lock(&mrt_lock);
	cache = ipmr_cache_find(rt->rt_src, rt->rt_dst);

	if (cache==NULL) {
		struct net_device *dev;
		int vif;

		if (nowait) {
			read_unlock(&mrt_lock);
			return -EAGAIN;
		}

		dev = skb->dev;
		if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) {
			read_unlock(&mrt_lock);
			return -ENODEV;
		}
		skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
		skb->nh.iph->ihl = sizeof(struct iphdr)>>2;
		skb->nh.iph->saddr = rt->rt_src;
		skb->nh.iph->daddr = rt->rt_dst;
		skb->nh.iph->version = 0;
		err = ipmr_cache_unresolved(vif, skb);
		read_unlock(&mrt_lock);
		return err;
	}

	if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
		cache->mfc_flags |= MFC_NOTIFY;
	err = ipmr_fill_mroute(skb, cache, rtm);
	read_unlock(&mrt_lock);
	return err;
}

#ifdef CONFIG_PROC_FS
/*
 *	The /proc interfaces to multicast routing:
 *	/proc/net/ip_mr_cache and /proc/net/ip_mr_vif
 */
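/*
 * Illustrative sample of /proc/net/ip_mr_vif output (editor's addition;
 * the columns come from ipmr_vif_seq_show below, the values here are
 * hypothetical):
 *
 *	Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote
 *	 0 eth0            1500      10      3000      20 00000 0100A8C0 00000000
 */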
struct ipmr_vif_iter {
	int ct;
};

static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter,
					   loff_t pos)
{
	for (iter->ct = 0; iter->ct < maxvif; ++iter->ct) {
		if(!VIF_EXISTS(iter->ct))
			continue;
		if (pos-- == 0)
			return &vif_table[iter->ct];
	}
	return NULL;
}

static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
{
	read_lock(&mrt_lock);
	return *pos ? ipmr_vif_seq_idx(seq->private, *pos - 1)
		: SEQ_START_TOKEN;
}

static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct ipmr_vif_iter *iter = seq->private;

	++*pos;
	if (v == SEQ_START_TOKEN)
		return ipmr_vif_seq_idx(iter, 0);

	while (++iter->ct < maxvif) {
		if(!VIF_EXISTS(iter->ct))
			continue;
		return &vif_table[iter->ct];
	}
	return NULL;
}

static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
{
	read_unlock(&mrt_lock);
}

static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq,
			 "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
	} else {
		const struct vif_device *vif = v;
		const char *name =  vif->dev ? vif->dev->name : "none";

		seq_printf(seq,
			   "%2Zd %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
			   vif - vif_table,
			   name, vif->bytes_in, vif->pkt_in,
			   vif->bytes_out, vif->pkt_out,
			   vif->flags, vif->local, vif->remote);
	}
	return 0;
}

static struct seq_operations ipmr_vif_seq_ops = {
	.start = ipmr_vif_seq_start,
	.next  = ipmr_vif_seq_next,
	.stop  = ipmr_vif_seq_stop,
	.show  = ipmr_vif_seq_show,
};

static int ipmr_vif_open(struct inode *inode, struct file *file)
{
	struct seq_file *seq;
	int rc = -ENOMEM;
	struct ipmr_vif_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);

	if (!s)
		goto out;

	rc = seq_open(file, &ipmr_vif_seq_ops);
	if (rc)
		goto out_kfree;

	s->ct = 0;
	seq = file->private_data;
	seq->private = s;
out:
	return rc;
out_kfree:
	kfree(s);
	goto out;

}

static struct file_operations ipmr_vif_fops = {
	.owner	 = THIS_MODULE,
	.open    = ipmr_vif_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_private,
};

struct ipmr_mfc_iter {
	struct mfc_cache **cache;
	int ct;
};


static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos)
{
	struct mfc_cache *mfc;

	it->cache = mfc_cache_array;
	read_lock(&mrt_lock);
	for (it->ct = 0; it->ct < MFC_LINES; it->ct++)
		for(mfc = mfc_cache_array[it->ct]; mfc; mfc = mfc->next)
			if (pos-- == 0)
				return mfc;
	read_unlock(&mrt_lock);

	it->cache = &mfc_unres_queue;
	spin_lock_bh(&mfc_unres_lock);
	for(mfc = mfc_unres_queue; mfc; mfc = mfc->next)
		if (pos-- == 0)
			return mfc;
	spin_unlock_bh(&mfc_unres_lock);

	it->cache = NULL;
	return NULL;
}


static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct ipmr_mfc_iter *it = seq->private;
	it->cache = NULL;
	it->ct = 0;
	return *pos ? ipmr_mfc_seq_idx(seq->private, *pos - 1)
		: SEQ_START_TOKEN;
}

static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct mfc_cache *mfc = v;
	struct ipmr_mfc_iter *it = seq->private;

	++*pos;

	if (v == SEQ_START_TOKEN)
		return ipmr_mfc_seq_idx(seq->private, 0);

	if (mfc->next)
		return mfc->next;

	if (it->cache == &mfc_unres_queue)
		goto end_of_list;

	BUG_ON(it->cache != mfc_cache_array);

	while (++it->ct < MFC_LINES) {
		mfc = mfc_cache_array[it->ct];
		if (mfc)
			return mfc;
	}

	/* exhausted cache_array, show unresolved */
	read_unlock(&mrt_lock);
	it->cache = &mfc_unres_queue;
	it->ct = 0;

	spin_lock_bh(&mfc_unres_lock);
	mfc = mfc_unres_queue;
	if (mfc)
		return mfc;

 end_of_list:
	spin_unlock_bh(&mfc_unres_lock);
	it->cache = NULL;

	return NULL;
}

static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
{
	struct ipmr_mfc_iter *it = seq->private;

	if (it->cache == &mfc_unres_queue)
		spin_unlock_bh(&mfc_unres_lock);
	else if (it->cache == mfc_cache_array)
		read_unlock(&mrt_lock);
}

static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
{
	int n;

	if (v == SEQ_START_TOKEN) {
		seq_puts(seq,
		 "Group    Origin   Iif     Pkts    Bytes    Wrong Oifs\n");
	} else {
		const struct mfc_cache *mfc = v;
		const struct ipmr_mfc_iter *it = seq->private;

		seq_printf(seq, "%08lX %08lX %-3d %8ld %8ld %8ld",
			   (unsigned long) mfc->mfc_mcastgrp,
			   (unsigned long) mfc->mfc_origin,
			   mfc->mfc_parent,
			   mfc->mfc_un.res.pkt,
			   mfc->mfc_un.res.bytes,
			   mfc->mfc_un.res.wrong_if);

		if (it->cache != &mfc_unres_queue) {
			for(n = mfc->mfc_un.res.minvif;
			    n < mfc->mfc_un.res.maxvif; n++ ) {
				if(VIF_EXISTS(n)
				   && mfc->mfc_un.res.ttls[n] < 255)
				seq_printf(seq,
					   " %2d:%-3d",
					   n, mfc->mfc_un.res.ttls[n]);
			}
		}
		seq_putc(seq, '\n');
	}
	return 0;
}

static struct seq_operations ipmr_mfc_seq_ops = {
	.start = ipmr_mfc_seq_start,
	.next  = ipmr_mfc_seq_next,
	.stop  = ipmr_mfc_seq_stop,
	.show  = ipmr_mfc_seq_show,
};

static int ipmr_mfc_open(struct inode *inode, struct file *file)
{
	struct seq_file *seq;
	int rc = -ENOMEM;
	struct ipmr_mfc_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);

	if (!s)
		goto out;

	rc = seq_open(file, &ipmr_mfc_seq_ops);
	if (rc)
		goto out_kfree;

	seq = file->private_data;
	seq->private = s;
out:
	return rc;
out_kfree:
	kfree(s);
	goto out;

}

static struct file_operations ipmr_mfc_fops = {
	.owner	 = THIS_MODULE,
	.open    = ipmr_mfc_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_private,
};
#endif

#ifdef CONFIG_IP_PIMSM_V2
static struct net_protocol pim_protocol = {
	.handler	=	pim_rcv,
};
#endif


/*
 *	Setup for IP multicast routing
 */

void __init ip_mr_init(void)
{
	mrt_cachep = kmem_cache_create("ip_mrt_cache",
				       sizeof(struct mfc_cache),
				       0, SLAB_HWCACHE_ALIGN,
				       NULL, NULL);
	if (!mrt_cachep)
		panic("cannot allocate ip_mrt_cache");

	init_timer(&ipmr_expire_timer);
	ipmr_expire_timer.function=ipmr_expire_process;
	register_netdevice_notifier(&ip_mr_notifier);
#ifdef CONFIG_PROC_FS
	proc_net_fops_create("ip_mr_vif", 0, &ipmr_vif_fops);
	proc_net_fops_create("ip_mr_cache", 0, &ipmr_mfc_fops);
#endif
}
1910