xref: /linux/net/ipv4/ipmr.c (revision 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2)
1 /*
2  *	IP multicast routing support for mrouted 3.6/3.8
3  *
4  *		(c) 1995 Alan Cox, <alan@redhat.com>
5  *	  Linux Consultancy and Custom Driver Development
6  *
7  *	This program is free software; you can redistribute it and/or
8  *	modify it under the terms of the GNU General Public License
9  *	as published by the Free Software Foundation; either version
10  *	2 of the License, or (at your option) any later version.
11  *
12  *	Version: $Id: ipmr.c,v 1.65 2001/10/31 21:55:54 davem Exp $
13  *
14  *	Fixes:
15  *	Michael Chastain	:	Incorrect size of copying.
16  *	Alan Cox		:	Added the cache manager code
17  *	Alan Cox		:	Fixed the clone/copy bug and device race.
18  *	Mike McLagan		:	Routing by source
19  *	Malcolm Beattie		:	Buffer handling fixes.
20  *	Alexey Kuznetsov	:	Double buffer free and other fixes.
21  *	SVR Anand		:	Fixed several multicast bugs and problems.
22  *	Alexey Kuznetsov	:	Status, optimisations and more.
23  *	Brad Parker		:	Better behaviour on mrouted upcall
24  *					overflow.
25  *	Carlos Picoto		:	PIMv1 Support
26  *	Pavlin Ivanov Radoslavov:	PIMv2 Registers must checksum only PIM header
27  *					Relax this requirement to work with older peers.
28  *
29  */
30 
31 #include <linux/config.h>
32 #include <asm/system.h>
33 #include <asm/uaccess.h>
34 #include <linux/types.h>
35 #include <linux/sched.h>
36 #include <linux/errno.h>
37 #include <linux/timer.h>
38 #include <linux/mm.h>
39 #include <linux/kernel.h>
40 #include <linux/fcntl.h>
41 #include <linux/stat.h>
42 #include <linux/socket.h>
43 #include <linux/in.h>
44 #include <linux/inet.h>
45 #include <linux/netdevice.h>
46 #include <linux/inetdevice.h>
47 #include <linux/igmp.h>
48 #include <linux/proc_fs.h>
49 #include <linux/seq_file.h>
50 #include <linux/mroute.h>
51 #include <linux/init.h>
52 #include <net/ip.h>
53 #include <net/protocol.h>
54 #include <linux/skbuff.h>
55 #include <net/sock.h>
56 #include <net/icmp.h>
57 #include <net/udp.h>
58 #include <net/raw.h>
59 #include <linux/notifier.h>
60 #include <linux/if_arp.h>
61 #include <linux/netfilter_ipv4.h>
62 #include <net/ipip.h>
63 #include <net/checksum.h>
64 
65 #if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
66 #define CONFIG_IP_PIMSM	1
67 #endif
68 
69 static struct sock *mroute_socket;
70 
71 
72 /* Big lock, protecting vif table, mrt cache and mroute socket state.
73    Note that the changes are semaphored via rtnl_lock.
74  */
75 
76 static DEFINE_RWLOCK(mrt_lock);
77 
78 /*
79  *	Multicast router control variables
80  */
81 
82 static struct vif_device vif_table[MAXVIFS];		/* Devices 		*/
83 static int maxvif;
84 
85 #define VIF_EXISTS(idx) (vif_table[idx].dev != NULL)
86 
87 static int mroute_do_assert;				/* Set in PIM assert	*/
88 static int mroute_do_pim;
89 
90 static struct mfc_cache *mfc_cache_array[MFC_LINES];	/* Forwarding cache	*/
91 
92 static struct mfc_cache *mfc_unres_queue;		/* Queue of unresolved entries */
93 static atomic_t cache_resolve_queue_len;		/* Size of unresolved	*/
94 
95 /* Special spinlock for queue of unresolved entries */
96 static DEFINE_SPINLOCK(mfc_unres_lock);
97 
98 /* We return to Alan's original scheme. The hash table of resolved
99    entries is changed only in process context and protected
100    with the weak lock mrt_lock. The queue of unresolved entries is
101    protected with the strong spinlock mfc_unres_lock.
102 
103    With this split the data path is entirely free of exclusive locks.
104  */
105 
106 static kmem_cache_t *mrt_cachep;
107 
108 static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
109 static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
110 static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm);
111 
112 #ifdef CONFIG_IP_PIMSM_V2
113 static struct net_protocol pim_protocol;
114 #endif
115 
116 static struct timer_list ipmr_expire_timer;
117 
118 /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
119 
120 static
121 struct net_device *ipmr_new_tunnel(struct vifctl *v)
122 {
123 	struct net_device  *dev;
124 
125 	dev = __dev_get_by_name("tunl0");
126 
127 	if (dev) {
128 		int err;
129 		struct ifreq ifr;
130 		mm_segment_t	oldfs;
131 		struct ip_tunnel_parm p;
132 		struct in_device  *in_dev;
133 
134 		memset(&p, 0, sizeof(p));
135 		p.iph.daddr = v->vifc_rmt_addr.s_addr;
136 		p.iph.saddr = v->vifc_lcl_addr.s_addr;
137 		p.iph.version = 4;
138 		p.iph.ihl = 5;
139 		p.iph.protocol = IPPROTO_IPIP;
140 		sprintf(p.name, "dvmrp%d", v->vifc_vifi);
141 		ifr.ifr_ifru.ifru_data = (void*)&p;
142 
143 		oldfs = get_fs(); set_fs(KERNEL_DS);
144 		err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL);
145 		set_fs(oldfs);
146 
147 		dev = NULL;
148 
149 		if (err == 0 && (dev = __dev_get_by_name(p.name)) != NULL) {
150 			dev->flags |= IFF_MULTICAST;
151 
152 			in_dev = __in_dev_get(dev);
153 			if (in_dev == NULL && (in_dev = inetdev_init(dev)) == NULL)
154 				goto failure;
155 			in_dev->cnf.rp_filter = 0;
156 
157 			if (dev_open(dev))
158 				goto failure;
159 		}
160 	}
161 	return dev;
162 
163 failure:
164 	/* allow the register to be completed before unregistering. */
165 	rtnl_unlock();
166 	rtnl_lock();
167 
168 	unregister_netdevice(dev);
169 	return NULL;
170 }
171 
172 #ifdef CONFIG_IP_PIMSM
173 
174 static int reg_vif_num = -1;
175 
176 static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
177 {
178 	read_lock(&mrt_lock);
179 	((struct net_device_stats*)dev->priv)->tx_bytes += skb->len;
180 	((struct net_device_stats*)dev->priv)->tx_packets++;
181 	ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT);
182 	read_unlock(&mrt_lock);
183 	kfree_skb(skb);
184 	return 0;
185 }
186 
187 static struct net_device_stats *reg_vif_get_stats(struct net_device *dev)
188 {
189 	return (struct net_device_stats*)dev->priv;
190 }
191 
192 static void reg_vif_setup(struct net_device *dev)
193 {
194 	dev->type		= ARPHRD_PIMREG;
195 	dev->mtu		= 1500 - sizeof(struct iphdr) - 8;	/* leave room for the outer IP header and the 8-byte PIM register header */
196 	dev->flags		= IFF_NOARP;
197 	dev->hard_start_xmit	= reg_vif_xmit;
198 	dev->get_stats		= reg_vif_get_stats;
199 	dev->destructor		= free_netdev;
200 }
201 
202 static struct net_device *ipmr_reg_vif(void)
203 {
204 	struct net_device *dev;
205 	struct in_device *in_dev;
206 
207 	dev = alloc_netdev(sizeof(struct net_device_stats), "pimreg",
208 			   reg_vif_setup);
209 
210 	if (dev == NULL)
211 		return NULL;
212 
213 	if (register_netdevice(dev)) {
214 		free_netdev(dev);
215 		return NULL;
216 	}
217 	dev->iflink = 0;
218 
219 	if ((in_dev = inetdev_init(dev)) == NULL)
220 		goto failure;
221 
222 	in_dev->cnf.rp_filter = 0;
223 
224 	if (dev_open(dev))
225 		goto failure;
226 
227 	return dev;
228 
229 failure:
230 	/* allow the register to be completed before unregistering. */
231 	rtnl_unlock();
232 	rtnl_lock();
233 
234 	unregister_netdevice(dev);
235 	return NULL;
236 }
237 #endif
238 
239 /*
240  *	Delete a VIF entry
241  */
242 
243 static int vif_delete(int vifi)
244 {
245 	struct vif_device *v;
246 	struct net_device *dev;
247 	struct in_device *in_dev;
248 
249 	if (vifi < 0 || vifi >= maxvif)
250 		return -EADDRNOTAVAIL;
251 
252 	v = &vif_table[vifi];
253 
254 	write_lock_bh(&mrt_lock);
255 	dev = v->dev;
256 	v->dev = NULL;
257 
258 	if (!dev) {
259 		write_unlock_bh(&mrt_lock);
260 		return -EADDRNOTAVAIL;
261 	}
262 
263 #ifdef CONFIG_IP_PIMSM
264 	if (vifi == reg_vif_num)
265 		reg_vif_num = -1;
266 #endif
267 
268 	if (vifi+1 == maxvif) {
269 		int tmp;
270 		for (tmp=vifi-1; tmp>=0; tmp--) {
271 			if (VIF_EXISTS(tmp))
272 				break;
273 		}
274 		maxvif = tmp+1;
275 	}
276 
277 	write_unlock_bh(&mrt_lock);
278 
279 	dev_set_allmulti(dev, -1);
280 
281 	if ((in_dev = __in_dev_get(dev)) != NULL) {
282 		in_dev->cnf.mc_forwarding--;
283 		ip_rt_multicast_event(in_dev);
284 	}
285 
286 	if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
287 		unregister_netdevice(dev);
288 
289 	dev_put(dev);
290 	return 0;
291 }
292 
293 /* Destroy an unresolved cache entry, killing queued skbs
294    and reporting error to netlink readers.
295  */
296 
297 static void ipmr_destroy_unres(struct mfc_cache *c)
298 {
299 	struct sk_buff *skb;
300 
301 	atomic_dec(&cache_resolve_queue_len);
302 
303 	while((skb=skb_dequeue(&c->mfc_un.unres.unresolved))) {
304 		if (skb->nh.iph->version == 0) {
305 			struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
306 			nlh->nlmsg_type = NLMSG_ERROR;
307 			nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
308 			skb_trim(skb, nlh->nlmsg_len);
309 			((struct nlmsgerr*)NLMSG_DATA(nlh))->error = -ETIMEDOUT;
310 			netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
311 		} else
312 			kfree_skb(skb);
313 	}
314 
315 	kmem_cache_free(mrt_cachep, c);
316 }
317 
318 
319 /* A single timer handles the whole unresolved queue. */
320 
321 static void ipmr_expire_process(unsigned long dummy)
322 {
323 	unsigned long now;
324 	unsigned long expires;
325 	struct mfc_cache *c, **cp;
326 
327 	if (!spin_trylock(&mfc_unres_lock)) {
328 		mod_timer(&ipmr_expire_timer, jiffies+HZ/10);
329 		return;
330 	}
331 
332 	if (atomic_read(&cache_resolve_queue_len) == 0)
333 		goto out;
334 
335 	now = jiffies;
336 	expires = 10*HZ;
337 	cp = &mfc_unres_queue;
338 
339 	while ((c=*cp) != NULL) {
340 		if (time_after(c->mfc_un.unres.expires, now)) {
341 			unsigned long interval = c->mfc_un.unres.expires - now;
342 			if (interval < expires)
343 				expires = interval;
344 			cp = &c->next;
345 			continue;
346 		}
347 
348 		*cp = c->next;
349 
350 		ipmr_destroy_unres(c);
351 	}
352 
353 	if (atomic_read(&cache_resolve_queue_len))
354 		mod_timer(&ipmr_expire_timer, jiffies + expires);
355 
356 out:
357 	spin_unlock(&mfc_unres_lock);
358 }
359 
360 /* Fill oifs list. It is called under write locked mrt_lock. */
361 
362 static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
363 {
364 	int vifi;
365 
366 	cache->mfc_un.res.minvif = MAXVIFS;
367 	cache->mfc_un.res.maxvif = 0;
368 	memset(cache->mfc_un.res.ttls, 255, MAXVIFS);
369 
370 	for (vifi=0; vifi<maxvif; vifi++) {
371 		if (VIF_EXISTS(vifi) && ttls[vifi] && ttls[vifi] < 255) {
372 			cache->mfc_un.res.ttls[vifi] = ttls[vifi];
373 			if (cache->mfc_un.res.minvif > vifi)
374 				cache->mfc_un.res.minvif = vifi;
375 			if (cache->mfc_un.res.maxvif <= vifi)
376 				cache->mfc_un.res.maxvif = vifi + 1;
377 		}
378 	}
379 }
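/*
 *	Worked example (hypothetical values): assuming vifs 0 and 3 exist and
 *	mrouted passes ttls = {2, 0, 255, 5, ...}, only vifs 0 and 3 become
 *	output interfaces (0 and 255 both mean "not an oif"), minvif ends up
 *	as 0 and maxvif as 4, and a packet is later forwarded on vif 3 only
 *	if its TTL is greater than 5.
 */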
380 
381 static int vif_add(struct vifctl *vifc, int mrtsock)
382 {
383 	int vifi = vifc->vifc_vifi;
384 	struct vif_device *v = &vif_table[vifi];
385 	struct net_device *dev;
386 	struct in_device *in_dev;
387 
388 	/* Is vif busy ? */
389 	if (VIF_EXISTS(vifi))
390 		return -EADDRINUSE;
391 
392 	switch (vifc->vifc_flags) {
393 #ifdef CONFIG_IP_PIMSM
394 	case VIFF_REGISTER:
395 		/*
396 		 * Special Purpose VIF in PIM
397 		 * All the packets will be sent to the daemon
398 		 */
399 		if (reg_vif_num >= 0)
400 			return -EADDRINUSE;
401 		dev = ipmr_reg_vif();
402 		if (!dev)
403 			return -ENOBUFS;
404 		break;
405 #endif
406 	case VIFF_TUNNEL:
407 		dev = ipmr_new_tunnel(vifc);
408 		if (!dev)
409 			return -ENOBUFS;
410 		break;
411 	case 0:
412 		dev=ip_dev_find(vifc->vifc_lcl_addr.s_addr);
413 		if (!dev)
414 			return -EADDRNOTAVAIL;
415 		__dev_put(dev);
416 		break;
417 	default:
418 		return -EINVAL;
419 	}
420 
421 	if ((in_dev = __in_dev_get(dev)) == NULL)
422 		return -EADDRNOTAVAIL;
423 	in_dev->cnf.mc_forwarding++;
424 	dev_set_allmulti(dev, +1);
425 	ip_rt_multicast_event(in_dev);
426 
427 	/*
428 	 *	Fill in the VIF structures
429 	 */
430 	v->rate_limit=vifc->vifc_rate_limit;
431 	v->local=vifc->vifc_lcl_addr.s_addr;
432 	v->remote=vifc->vifc_rmt_addr.s_addr;
433 	v->flags=vifc->vifc_flags;
434 	if (!mrtsock)
435 		v->flags |= VIFF_STATIC;
436 	v->threshold=vifc->vifc_threshold;
437 	v->bytes_in = 0;
438 	v->bytes_out = 0;
439 	v->pkt_in = 0;
440 	v->pkt_out = 0;
441 	v->link = dev->ifindex;
442 	if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
443 		v->link = dev->iflink;
444 
445 	/* And finish update writing critical data */
446 	write_lock_bh(&mrt_lock);
447 	dev_hold(dev);
448 	v->dev=dev;
449 #ifdef CONFIG_IP_PIMSM
450 	if (v->flags&VIFF_REGISTER)
451 		reg_vif_num = vifi;
452 #endif
453 	if (vifi+1 > maxvif)
454 		maxvif = vifi+1;
455 	write_unlock_bh(&mrt_lock);
456 	return 0;
457 }
458 
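/* Look up a resolved (origin, group) entry; callers hold mrt_lock. */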
459 static struct mfc_cache *ipmr_cache_find(__u32 origin, __u32 mcastgrp)
460 {
461 	int line=MFC_HASH(mcastgrp,origin);
462 	struct mfc_cache *c;
463 
464 	for (c=mfc_cache_array[line]; c; c = c->next) {
465 		if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp)
466 			break;
467 	}
468 	return c;
469 }
470 
471 /*
472  *	Allocate a multicast cache entry
473  */
474 static struct mfc_cache *ipmr_cache_alloc(void)
475 {
476 	struct mfc_cache *c=kmem_cache_alloc(mrt_cachep, GFP_KERNEL);
477 	if(c==NULL)
478 		return NULL;
479 	memset(c, 0, sizeof(*c));
480 	c->mfc_un.res.minvif = MAXVIFS;
481 	return c;
482 }
483 
484 static struct mfc_cache *ipmr_cache_alloc_unres(void)
485 {
486 	struct mfc_cache *c=kmem_cache_alloc(mrt_cachep, GFP_ATOMIC);
487 	if(c==NULL)
488 		return NULL;
489 	memset(c, 0, sizeof(*c));
490 	skb_queue_head_init(&c->mfc_un.unres.unresolved);
491 	c->mfc_un.unres.expires = jiffies + 10*HZ;
492 	return c;
493 }
494 
495 /*
496  *	A cache entry has gone into a resolved state from queued
497  */
498 
499 static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
500 {
501 	struct sk_buff *skb;
502 
503 	/*
504 	 *	Play the pending entries through our router
505 	 */
506 
507 	while((skb=__skb_dequeue(&uc->mfc_un.unres.unresolved))) {
508 		if (skb->nh.iph->version == 0) {
509 			int err;
510 			struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
511 
512 			if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) {
513 				nlh->nlmsg_len = skb->tail - (u8*)nlh;
514 			} else {
515 				nlh->nlmsg_type = NLMSG_ERROR;
516 				nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
517 				skb_trim(skb, nlh->nlmsg_len);
518 				((struct nlmsgerr*)NLMSG_DATA(nlh))->error = -EMSGSIZE;
519 			}
520 			err = netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
521 		} else
522 			ip_mr_forward(skb, c, 0);
523 	}
524 }
525 
526 /*
527  *	Bounce a cache query up to mrouted. We could use netlink for this but mrouted
528  *	expects the following bizarre scheme.
529  *
530  *	Called under mrt_lock.
531  */
532 
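/*
 *	Rough sketch of what mrouted actually reads from the socket (derived
 *	from the code below, not a formal definition): for IGMPMSG_NOCACHE and
 *	IGMPMSG_WRONGVIF the copied IP header doubles as a struct igmpmsg,
 *	with iph->protocol forced to 0 and im_msgtype/im_vif filled in,
 *	followed by a dummy IGMP header carrying the assert type.  For
 *	IGMPMSG_WHOLEPKT an extra IP-header-sized struct igmpmsg is prepended
 *	to the complete original packet.
 */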
533 static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
534 {
535 	struct sk_buff *skb;
536 	int ihl = pkt->nh.iph->ihl<<2;
537 	struct igmphdr *igmp;
538 	struct igmpmsg *msg;
539 	int ret;
540 
541 #ifdef CONFIG_IP_PIMSM
542 	if (assert == IGMPMSG_WHOLEPKT)
543 		skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
544 	else
545 #endif
546 		skb = alloc_skb(128, GFP_ATOMIC);
547 
548 	if(!skb)
549 		return -ENOBUFS;
550 
551 #ifdef CONFIG_IP_PIMSM
552 	if (assert == IGMPMSG_WHOLEPKT) {
553 		/* Ugly, but we have no choice with this interface.
554 		   Duplicate old header, fix ihl, length etc.
555 		   And all this only to mangle msg->im_msgtype and
556 		   to set msg->im_mbz to "mbz" :-)
557 		 */
558 		msg = (struct igmpmsg*)skb_push(skb, sizeof(struct iphdr));
559 		skb->nh.raw = skb->h.raw = (u8*)msg;
560 		memcpy(msg, pkt->nh.raw, sizeof(struct iphdr));
561 		msg->im_msgtype = IGMPMSG_WHOLEPKT;
562 		msg->im_mbz = 0;
563  		msg->im_vif = reg_vif_num;
564 		skb->nh.iph->ihl = sizeof(struct iphdr) >> 2;
565 		skb->nh.iph->tot_len = htons(ntohs(pkt->nh.iph->tot_len) + sizeof(struct iphdr));
566 	} else
567 #endif
568 	{
569 
570 	/*
571 	 *	Copy the IP header
572 	 */
573 
574 	skb->nh.iph = (struct iphdr *)skb_put(skb, ihl);
575 	memcpy(skb->data,pkt->data,ihl);
576 	skb->nh.iph->protocol = 0;			/* Flag to the kernel this is a route add */
577 	msg = (struct igmpmsg*)skb->nh.iph;
578 	msg->im_vif = vifi;
579 	skb->dst = dst_clone(pkt->dst);
580 
581 	/*
582 	 *	Add our header
583 	 */
584 
585 	igmp=(struct igmphdr *)skb_put(skb,sizeof(struct igmphdr));
586 	igmp->type	=
587 	msg->im_msgtype = assert;
588 	igmp->code 	=	0;
589 	skb->nh.iph->tot_len=htons(skb->len);			/* Fix the length */
590 	skb->h.raw = skb->nh.raw;
591         }
592 
593 	if (mroute_socket == NULL) {
594 		kfree_skb(skb);
595 		return -EINVAL;
596 	}
597 
598 	/*
599 	 *	Deliver to mrouted
600 	 */
601 	if ((ret=sock_queue_rcv_skb(mroute_socket,skb))<0) {
602 		if (net_ratelimit())
603 			printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
604 		kfree_skb(skb);
605 	}
606 
607 	return ret;
608 }
609 
610 /*
611  *	Queue a packet for resolution; the unresolved cache entry is created or found with mfc_unres_lock held.
612  */
613 
614 static int
615 ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb)
616 {
617 	int err;
618 	struct mfc_cache *c;
619 
620 	spin_lock_bh(&mfc_unres_lock);
621 	for (c=mfc_unres_queue; c; c=c->next) {
622 		if (c->mfc_mcastgrp == skb->nh.iph->daddr &&
623 		    c->mfc_origin == skb->nh.iph->saddr)
624 			break;
625 	}
626 
627 	if (c == NULL) {
628 		/*
629 		 *	Create a new entry if allowable
630 		 */
631 
632 		if (atomic_read(&cache_resolve_queue_len)>=10 ||
633 		    (c=ipmr_cache_alloc_unres())==NULL) {
634 			spin_unlock_bh(&mfc_unres_lock);
635 
636 			kfree_skb(skb);
637 			return -ENOBUFS;
638 		}
639 
640 		/*
641 		 *	Fill in the new cache entry
642 		 */
643 		c->mfc_parent=-1;
644 		c->mfc_origin=skb->nh.iph->saddr;
645 		c->mfc_mcastgrp=skb->nh.iph->daddr;
646 
647 		/*
648 		 *	Reflect first query at mrouted.
649 		 */
650 		if ((err = ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE))<0) {
651 			/* If the report failed throw the cache entry
652 			   out - Brad Parker
653 			 */
654 			spin_unlock_bh(&mfc_unres_lock);
655 
656 			kmem_cache_free(mrt_cachep, c);
657 			kfree_skb(skb);
658 			return err;
659 		}
660 
661 		atomic_inc(&cache_resolve_queue_len);
662 		c->next = mfc_unres_queue;
663 		mfc_unres_queue = c;
664 
665 		mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires);
666 	}
667 
668 	/*
669 	 *	See if we can append the packet
670 	 */
671 	if (c->mfc_un.unres.unresolved.qlen>3) {
672 		kfree_skb(skb);
673 		err = -ENOBUFS;
674 	} else {
675 		skb_queue_tail(&c->mfc_un.unres.unresolved,skb);
676 		err = 0;
677 	}
678 
679 	spin_unlock_bh(&mfc_unres_lock);
680 	return err;
681 }
682 
683 /*
684  *	MFC cache manipulation by user space mroute daemon
685  */
686 
687 static int ipmr_mfc_delete(struct mfcctl *mfc)
688 {
689 	int line;
690 	struct mfc_cache *c, **cp;
691 
692 	line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
693 
694 	for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
695 		if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
696 		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
697 			write_lock_bh(&mrt_lock);
698 			*cp = c->next;
699 			write_unlock_bh(&mrt_lock);
700 
701 			kmem_cache_free(mrt_cachep, c);
702 			return 0;
703 		}
704 	}
705 	return -ENOENT;
706 }
707 
708 static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
709 {
710 	int line;
711 	struct mfc_cache *uc, *c, **cp;
712 
713 	line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
714 
715 	for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
716 		if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
717 		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr)
718 			break;
719 	}
720 
721 	if (c != NULL) {
722 		write_lock_bh(&mrt_lock);
723 		c->mfc_parent = mfc->mfcc_parent;
724 		ipmr_update_thresholds(c, mfc->mfcc_ttls);
725 		if (!mrtsock)
726 			c->mfc_flags |= MFC_STATIC;
727 		write_unlock_bh(&mrt_lock);
728 		return 0;
729 	}
730 
731 	if(!MULTICAST(mfc->mfcc_mcastgrp.s_addr))
732 		return -EINVAL;
733 
734 	c=ipmr_cache_alloc();
735 	if (c==NULL)
736 		return -ENOMEM;
737 
738 	c->mfc_origin=mfc->mfcc_origin.s_addr;
739 	c->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr;
740 	c->mfc_parent=mfc->mfcc_parent;
741 	ipmr_update_thresholds(c, mfc->mfcc_ttls);
742 	if (!mrtsock)
743 		c->mfc_flags |= MFC_STATIC;
744 
745 	write_lock_bh(&mrt_lock);
746 	c->next = mfc_cache_array[line];
747 	mfc_cache_array[line] = c;
748 	write_unlock_bh(&mrt_lock);
749 
750 	/*
751 	 *	Check to see if this resolves a queued unresolved entry. If so,
752 	 *	we need to send the queued frames on and tidy up.
753 	 */
754 	spin_lock_bh(&mfc_unres_lock);
755 	for (cp = &mfc_unres_queue; (uc=*cp) != NULL;
756 	     cp = &uc->next) {
757 		if (uc->mfc_origin == c->mfc_origin &&
758 		    uc->mfc_mcastgrp == c->mfc_mcastgrp) {
759 			*cp = uc->next;
760 			if (atomic_dec_and_test(&cache_resolve_queue_len))
761 				del_timer(&ipmr_expire_timer);
762 			break;
763 		}
764 	}
765 	spin_unlock_bh(&mfc_unres_lock);
766 
767 	if (uc) {
768 		ipmr_cache_resolve(uc, c);
769 		kmem_cache_free(mrt_cachep, uc);
770 	}
771 	return 0;
772 }
773 
774 /*
775  *	Close the multicast socket, and clear the vif tables etc
776  */
777 
778 static void mroute_clean_tables(struct sock *sk)
779 {
780 	int i;
781 
782 	/*
783 	 *	Shut down all active vif entries
784 	 */
785 	for(i=0; i<maxvif; i++) {
786 		if (!(vif_table[i].flags&VIFF_STATIC))
787 			vif_delete(i);
788 	}
789 
790 	/*
791 	 *	Wipe the cache
792 	 */
793 	for (i=0;i<MFC_LINES;i++) {
794 		struct mfc_cache *c, **cp;
795 
796 		cp = &mfc_cache_array[i];
797 		while ((c = *cp) != NULL) {
798 			if (c->mfc_flags&MFC_STATIC) {
799 				cp = &c->next;
800 				continue;
801 			}
802 			write_lock_bh(&mrt_lock);
803 			*cp = c->next;
804 			write_unlock_bh(&mrt_lock);
805 
806 			kmem_cache_free(mrt_cachep, c);
807 		}
808 	}
809 
810 	if (atomic_read(&cache_resolve_queue_len) != 0) {
811 		struct mfc_cache *c;
812 
813 		spin_lock_bh(&mfc_unres_lock);
814 		while (mfc_unres_queue != NULL) {
815 			c = mfc_unres_queue;
816 			mfc_unres_queue = c->next;
817 			spin_unlock_bh(&mfc_unres_lock);
818 
819 			ipmr_destroy_unres(c);
820 
821 			spin_lock_bh(&mfc_unres_lock);
822 		}
823 		spin_unlock_bh(&mfc_unres_lock);
824 	}
825 }
826 
827 static void mrtsock_destruct(struct sock *sk)
828 {
829 	rtnl_lock();
830 	if (sk == mroute_socket) {
831 		ipv4_devconf.mc_forwarding--;
832 
833 		write_lock_bh(&mrt_lock);
834 		mroute_socket=NULL;
835 		write_unlock_bh(&mrt_lock);
836 
837 		mroute_clean_tables(sk);
838 	}
839 	rtnl_unlock();
840 }
841 
842 /*
843  *	Socket options and virtual interface manipulation. The whole
844  *	virtual interface system is a complete heap, but unfortunately
845  *	that's how BSD mrouted happens to think. Maybe one day with a proper
846  *	MOSPF/PIM router set up we can clean this up.
847  */
848 
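/*
 *	Rough sketch of how a user-space daemon drives this interface
 *	(illustrative only, not lifted from mrouted; error handling and most
 *	field initialisation omitted; local_addr is a placeholder):
 *
 *		int s = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
 *		int on = 1;
 *		struct vifctl vc;
 *		struct mfcctl mc;
 *
 *		setsockopt(s, IPPROTO_IP, MRT_INIT, &on, sizeof(on));
 *
 *		memset(&vc, 0, sizeof(vc));
 *		vc.vifc_vifi = 0;
 *		vc.vifc_threshold = 1;
 *		vc.vifc_lcl_addr.s_addr = local_addr;
 *		setsockopt(s, IPPROTO_IP, MRT_ADD_VIF, &vc, sizeof(vc));
 *
 *		memset(&mc, 0, sizeof(mc));
 *		mc.mfcc_origin.s_addr = ...;
 *		mc.mfcc_mcastgrp.s_addr = ...;
 *		mc.mfcc_parent = 0;
 *		mc.mfcc_ttls[1] = 1;
 *		setsockopt(s, IPPROTO_IP, MRT_ADD_MFC, &mc, sizeof(mc));
 */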
849 int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int optlen)
850 {
851 	int ret;
852 	struct vifctl vif;
853 	struct mfcctl mfc;
854 
855 	if(optname!=MRT_INIT)
856 	{
857 		if(sk!=mroute_socket && !capable(CAP_NET_ADMIN))
858 			return -EACCES;
859 	}
860 
861 	switch(optname)
862 	{
863 		case MRT_INIT:
864 			if (sk->sk_type != SOCK_RAW ||
865 			    inet_sk(sk)->num != IPPROTO_IGMP)
866 				return -EOPNOTSUPP;
867 			if(optlen!=sizeof(int))
868 				return -ENOPROTOOPT;
869 
870 			rtnl_lock();
871 			if (mroute_socket) {
872 				rtnl_unlock();
873 				return -EADDRINUSE;
874 			}
875 
876 			ret = ip_ra_control(sk, 1, mrtsock_destruct);
877 			if (ret == 0) {
878 				write_lock_bh(&mrt_lock);
879 				mroute_socket=sk;
880 				write_unlock_bh(&mrt_lock);
881 
882 				ipv4_devconf.mc_forwarding++;
883 			}
884 			rtnl_unlock();
885 			return ret;
886 		case MRT_DONE:
887 			if (sk!=mroute_socket)
888 				return -EACCES;
889 			return ip_ra_control(sk, 0, NULL);
890 		case MRT_ADD_VIF:
891 		case MRT_DEL_VIF:
892 			if(optlen!=sizeof(vif))
893 				return -EINVAL;
894 			if (copy_from_user(&vif,optval,sizeof(vif)))
895 				return -EFAULT;
896 			if(vif.vifc_vifi >= MAXVIFS)
897 				return -ENFILE;
898 			rtnl_lock();
899 			if (optname==MRT_ADD_VIF) {
900 				ret = vif_add(&vif, sk==mroute_socket);
901 			} else {
902 				ret = vif_delete(vif.vifc_vifi);
903 			}
904 			rtnl_unlock();
905 			return ret;
906 
907 		/*
908 		 *	Manipulate the forwarding caches. These live
909 		 *	in a sort of kernel/user symbiosis.
910 		 */
911 		case MRT_ADD_MFC:
912 		case MRT_DEL_MFC:
913 			if(optlen!=sizeof(mfc))
914 				return -EINVAL;
915 			if (copy_from_user(&mfc,optval, sizeof(mfc)))
916 				return -EFAULT;
917 			rtnl_lock();
918 			if (optname==MRT_DEL_MFC)
919 				ret = ipmr_mfc_delete(&mfc);
920 			else
921 				ret = ipmr_mfc_add(&mfc, sk==mroute_socket);
922 			rtnl_unlock();
923 			return ret;
924 		/*
925 		 *	Control PIM assert.
926 		 */
927 		case MRT_ASSERT:
928 		{
929 			int v;
930 			if(get_user(v,(int __user *)optval))
931 				return -EFAULT;
932 			mroute_do_assert=(v)?1:0;
933 			return 0;
934 		}
935 #ifdef CONFIG_IP_PIMSM
936 		case MRT_PIM:
937 		{
938 			int v, ret;
939 			if(get_user(v,(int __user *)optval))
940 				return -EFAULT;
941 			v = (v)?1:0;
942 			rtnl_lock();
943 			ret = 0;
944 			if (v != mroute_do_pim) {
945 				mroute_do_pim = v;
946 				mroute_do_assert = v;
947 #ifdef CONFIG_IP_PIMSM_V2
948 				if (mroute_do_pim)
949 					ret = inet_add_protocol(&pim_protocol,
950 								IPPROTO_PIM);
951 				else
952 					ret = inet_del_protocol(&pim_protocol,
953 								IPPROTO_PIM);
954 				if (ret < 0)
955 					ret = -EAGAIN;
956 #endif
957 			}
958 			rtnl_unlock();
959 			return ret;
960 		}
961 #endif
962 		/*
963 		 *	Spurious command, or MRT_VERSION which you cannot
964 		 *	set.
965 		 */
966 		default:
967 			return -ENOPROTOOPT;
968 	}
969 }
970 
971 /*
972  *	Getsock opt support for the multicast routing system.
973  */
974 
975 int ip_mroute_getsockopt(struct sock *sk,int optname,char __user *optval,int __user *optlen)
976 {
977 	int olr;
978 	int val;
979 
980 	if(optname!=MRT_VERSION &&
981 #ifdef CONFIG_IP_PIMSM
982 	   optname!=MRT_PIM &&
983 #endif
984 	   optname!=MRT_ASSERT)
985 		return -ENOPROTOOPT;
986 
987 	if (get_user(olr, optlen))
988 		return -EFAULT;
989 
990 	olr = min_t(unsigned int, olr, sizeof(int));
991 	if (olr < 0)
992 		return -EINVAL;
993 
994 	if(put_user(olr,optlen))
995 		return -EFAULT;
996 	if(optname==MRT_VERSION)
997 		val=0x0305;
998 #ifdef CONFIG_IP_PIMSM
999 	else if(optname==MRT_PIM)
1000 		val=mroute_do_pim;
1001 #endif
1002 	else
1003 		val=mroute_do_assert;
1004 	if(copy_to_user(optval,&val,olr))
1005 		return -EFAULT;
1006 	return 0;
1007 }
1008 
1009 /*
1010  *	The IP multicast ioctl support routines.
1011  */
1012 
1013 int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
1014 {
1015 	struct sioc_sg_req sr;
1016 	struct sioc_vif_req vr;
1017 	struct vif_device *vif;
1018 	struct mfc_cache *c;
1019 
1020 	switch(cmd)
1021 	{
1022 		case SIOCGETVIFCNT:
1023 			if (copy_from_user(&vr,arg,sizeof(vr)))
1024 				return -EFAULT;
1025 			if(vr.vifi>=maxvif)
1026 				return -EINVAL;
1027 			read_lock(&mrt_lock);
1028 			vif=&vif_table[vr.vifi];
1029 			if(VIF_EXISTS(vr.vifi))	{
1030 				vr.icount=vif->pkt_in;
1031 				vr.ocount=vif->pkt_out;
1032 				vr.ibytes=vif->bytes_in;
1033 				vr.obytes=vif->bytes_out;
1034 				read_unlock(&mrt_lock);
1035 
1036 				if (copy_to_user(arg,&vr,sizeof(vr)))
1037 					return -EFAULT;
1038 				return 0;
1039 			}
1040 			read_unlock(&mrt_lock);
1041 			return -EADDRNOTAVAIL;
1042 		case SIOCGETSGCNT:
1043 			if (copy_from_user(&sr,arg,sizeof(sr)))
1044 				return -EFAULT;
1045 
1046 			read_lock(&mrt_lock);
1047 			c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr);
1048 			if (c) {
1049 				sr.pktcnt = c->mfc_un.res.pkt;
1050 				sr.bytecnt = c->mfc_un.res.bytes;
1051 				sr.wrong_if = c->mfc_un.res.wrong_if;
1052 				read_unlock(&mrt_lock);
1053 
1054 				if (copy_to_user(arg,&sr,sizeof(sr)))
1055 					return -EFAULT;
1056 				return 0;
1057 			}
1058 			read_unlock(&mrt_lock);
1059 			return -EADDRNOTAVAIL;
1060 		default:
1061 			return -ENOIOCTLCMD;
1062 	}
1063 }
1064 
1065 
1066 static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
1067 {
1068 	struct vif_device *v;
1069 	int ct;
1070 	if (event != NETDEV_UNREGISTER)
1071 		return NOTIFY_DONE;
1072 	v=&vif_table[0];
1073 	for(ct=0;ct<maxvif;ct++,v++) {
1074 		if (v->dev==ptr)
1075 			vif_delete(ct);
1076 	}
1077 	return NOTIFY_DONE;
1078 }
1079 
1080 
1081 static struct notifier_block ip_mr_notifier={
1082 	.notifier_call = ipmr_device_event,
1083 };
1084 
1085 /*
1086  * 	Encapsulate a packet by attaching a valid IPIP header to it.
1087  *	This avoids tunnel drivers and other mess and gives us the speed so
1088  *	important for multicast video.
1089  */
1090 
1091 static void ip_encap(struct sk_buff *skb, u32 saddr, u32 daddr)
1092 {
1093 	struct iphdr *iph = (struct iphdr *)skb_push(skb,sizeof(struct iphdr));
1094 
1095 	iph->version	= 	4;
1096 	iph->tos	=	skb->nh.iph->tos;
1097 	iph->ttl	=	skb->nh.iph->ttl;
1098 	iph->frag_off	=	0;
1099 	iph->daddr	=	daddr;
1100 	iph->saddr	=	saddr;
1101 	iph->protocol	=	IPPROTO_IPIP;
1102 	iph->ihl	=	5;
1103 	iph->tot_len	=	htons(skb->len);
1104 	ip_select_ident(iph, skb->dst, NULL);
1105 	ip_send_check(iph);
1106 
1107 	skb->h.ipiph = skb->nh.iph;
1108 	skb->nh.iph = iph;
1109 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
1110 	nf_reset(skb);
1111 }
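/*
 *	After ip_encap() the frame looks roughly like this (a sketch of the
 *	code above, not an exact wire diagram):
 *
 *	  +--------------------------+-------------------------+---------
 *	  | new outer IPv4 header    | original IPv4 header    | payload
 *	  | protocol = IPPROTO_IPIP, | (left untouched)        |
 *	  | tos/ttl copied from inner|                         |
 *	  +--------------------------+-------------------------+---------
 */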
1112 
1113 static inline int ipmr_forward_finish(struct sk_buff *skb)
1114 {
1115 	struct ip_options * opt	= &(IPCB(skb)->opt);
1116 
1117 	IP_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
1118 
1119 	if (unlikely(opt->optlen))
1120 		ip_forward_options(skb);
1121 
1122 	return dst_output(skb);
1123 }
1124 
1125 /*
1126  *	Processing handlers for ip_mr_forward
1127  */
1128 
1129 static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
1130 {
1131 	struct iphdr *iph = skb->nh.iph;
1132 	struct vif_device *vif = &vif_table[vifi];
1133 	struct net_device *dev;
1134 	struct rtable *rt;
1135 	int    encap = 0;
1136 
1137 	if (vif->dev == NULL)
1138 		goto out_free;
1139 
1140 #ifdef CONFIG_IP_PIMSM
1141 	if (vif->flags & VIFF_REGISTER) {
1142 		vif->pkt_out++;
1143 		vif->bytes_out+=skb->len;
1144 		((struct net_device_stats*)vif->dev->priv)->tx_bytes += skb->len;
1145 		((struct net_device_stats*)vif->dev->priv)->tx_packets++;
1146 		ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT);
1147 		kfree_skb(skb);
1148 		return;
1149 	}
1150 #endif
1151 
1152 	if (vif->flags&VIFF_TUNNEL) {
1153 		struct flowi fl = { .oif = vif->link,
1154 				    .nl_u = { .ip4_u =
1155 					      { .daddr = vif->remote,
1156 						.saddr = vif->local,
1157 						.tos = RT_TOS(iph->tos) } },
1158 				    .proto = IPPROTO_IPIP };
1159 		if (ip_route_output_key(&rt, &fl))
1160 			goto out_free;
1161 		encap = sizeof(struct iphdr);
1162 	} else {
1163 		struct flowi fl = { .oif = vif->link,
1164 				    .nl_u = { .ip4_u =
1165 					      { .daddr = iph->daddr,
1166 						.tos = RT_TOS(iph->tos) } },
1167 				    .proto = IPPROTO_IPIP };
1168 		if (ip_route_output_key(&rt, &fl))
1169 			goto out_free;
1170 	}
1171 
1172 	dev = rt->u.dst.dev;
1173 
1174 	if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
1175 		/* Do not fragment multicasts. Alas, IPv4 gives us no way
1176 		   to send ICMP here, so oversized packets simply disappear
1177 		   into a black hole.
1178 		 */
1179 
1180 		IP_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
1181 		ip_rt_put(rt);
1182 		goto out_free;
1183 	}
1184 
1185 	encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;
1186 
1187 	if (skb_cow(skb, encap)) {
1188  		ip_rt_put(rt);
1189 		goto out_free;
1190 	}
1191 
1192 	vif->pkt_out++;
1193 	vif->bytes_out+=skb->len;
1194 
1195 	dst_release(skb->dst);
1196 	skb->dst = &rt->u.dst;
1197 	iph = skb->nh.iph;
1198 	ip_decrease_ttl(iph);
1199 
1200 	/* FIXME: forward and output firewalls used to be called here.
1201 	 * What do we do with netfilter? -- RR */
1202 	if (vif->flags & VIFF_TUNNEL) {
1203 		ip_encap(skb, vif->local, vif->remote);
1204 		/* FIXME: extra output firewall step used to be here. --RR */
1205 		((struct ip_tunnel *)vif->dev->priv)->stat.tx_packets++;
1206 		((struct ip_tunnel *)vif->dev->priv)->stat.tx_bytes+=skb->len;
1207 	}
1208 
1209 	IPCB(skb)->flags |= IPSKB_FORWARDED;
1210 
1211 	/*
1212 	 * RFC 1584 teaches that a DVMRP/PIM router must deliver packets locally
1213 	 * not only before forwarding, but also after forwarding on all output
1214 	 * interfaces. Clearly, if the mrouter runs a multicast
1215 	 * application, that application should receive the packets regardless of
1216 	 * which interface it joined on.
1217 	 * If we did not do this, the application would have to join on all
1218 	 * interfaces. On the other hand, a multihomed host (or router, but
1219 	 * not an mrouter) cannot join on more than one interface - that would
1220 	 * result in receiving multiple copies of each packet.
1221 	 */
1222 	NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, dev,
1223 		ipmr_forward_finish);
1224 	return;
1225 
1226 out_free:
1227 	kfree_skb(skb);
1228 	return;
1229 }
1230 
1231 static int ipmr_find_vif(struct net_device *dev)
1232 {
1233 	int ct;
1234 	for (ct=maxvif-1; ct>=0; ct--) {
1235 		if (vif_table[ct].dev == dev)
1236 			break;
1237 	}
1238 	return ct;
1239 }
1240 
1241 /* "local" means that we should preserve one skb (for local delivery) */
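/*
 *	The forwarding loop below transmits on every output vif whose TTL
 *	threshold the packet exceeds: each vif except the last gets a clone,
 *	and the last one consumes the original skb (unless "local" delivery
 *	still needs it, in which case the last transmit also uses a clone).
 */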
1242 
1243 static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local)
1244 {
1245 	int psend = -1;
1246 	int vif, ct;
1247 
1248 	vif = cache->mfc_parent;
1249 	cache->mfc_un.res.pkt++;
1250 	cache->mfc_un.res.bytes += skb->len;
1251 
1252 	/*
1253 	 * Wrong interface: drop packet and (maybe) send PIM assert.
1254 	 */
1255 	if (vif_table[vif].dev != skb->dev) {
1256 		int true_vifi;
1257 
1258 		if (((struct rtable*)skb->dst)->fl.iif == 0) {
1259 			/* It is our own packet, looped back.
1260 			   Very complicated situation...
1261 
1262 			   The best workaround, until the routing daemons are
1263 			   fixed, is not to redistribute a packet if it was
1264 			   sent through the wrong interface. This means that
1265 			   multicast applications WILL NOT work for
1266 			   (S,G) entries whose default multicast route points
1267 			   to the wrong oif. In any case, it is not a good
1268 			   idea to run multicast applications on a router.
1269 			 */
1270 			goto dont_forward;
1271 		}
1272 
1273 		cache->mfc_un.res.wrong_if++;
1274 		true_vifi = ipmr_find_vif(skb->dev);
1275 
1276 		if (true_vifi >= 0 && mroute_do_assert &&
1277 		    /* PIM-SM uses asserts when switching from RPT to SPT,
1278 		       so we cannot insist that the packet arrived on an oif.
1279 		       It is bad, but otherwise we would need to move a pretty
1280 		       large chunk of pimd into the kernel. Ough... --ANK
1281 		     */
1282 		    (mroute_do_pim || cache->mfc_un.res.ttls[true_vifi] < 255) &&
1283 		    time_after(jiffies,
1284 			       cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
1285 			cache->mfc_un.res.last_assert = jiffies;
1286 			ipmr_cache_report(skb, true_vifi, IGMPMSG_WRONGVIF);
1287 		}
1288 		goto dont_forward;
1289 	}
1290 
1291 	vif_table[vif].pkt_in++;
1292 	vif_table[vif].bytes_in+=skb->len;
1293 
1294 	/*
1295 	 *	Forward the frame
1296 	 */
1297 	for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
1298 		if (skb->nh.iph->ttl > cache->mfc_un.res.ttls[ct]) {
1299 			if (psend != -1) {
1300 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1301 				if (skb2)
1302 					ipmr_queue_xmit(skb2, cache, psend);
1303 			}
1304 			psend=ct;
1305 		}
1306 	}
1307 	if (psend != -1) {
1308 		if (local) {
1309 			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1310 			if (skb2)
1311 				ipmr_queue_xmit(skb2, cache, psend);
1312 		} else {
1313 			ipmr_queue_xmit(skb, cache, psend);
1314 			return 0;
1315 		}
1316 	}
1317 
1318 dont_forward:
1319 	if (!local)
1320 		kfree_skb(skb);
1321 	return 0;
1322 }
1323 
1324 
1325 /*
1326  *	Multicast packets for forwarding arrive here
1327  */
1328 
1329 int ip_mr_input(struct sk_buff *skb)
1330 {
1331 	struct mfc_cache *cache;
1332 	int local = ((struct rtable*)skb->dst)->rt_flags&RTCF_LOCAL;
1333 
1334 	/* The packet was looped back after forwarding; it must not be
1335 	   forwarded a second time, but it can still be delivered locally.
1336 	 */
1337 	if (IPCB(skb)->flags&IPSKB_FORWARDED)
1338 		goto dont_forward;
1339 
1340 	if (!local) {
1341 		    if (IPCB(skb)->opt.router_alert) {
1342 			    if (ip_call_ra_chain(skb))
1343 				    return 0;
1344 		    } else if (skb->nh.iph->protocol == IPPROTO_IGMP){
1345 			    /* IGMPv1 (and broken IGMPv2 implementations such as
1346 			       Cisco IOS <= 11.2(8)) do not put the router alert
1347 			       option into IGMP packets destined to routable
1348 			       groups. This is very bad, because it means
1349 			       that we can forward NO IGMP messages.
1350 			     */
1351 			    read_lock(&mrt_lock);
1352 			    if (mroute_socket) {
1353 				    raw_rcv(mroute_socket, skb);
1354 				    read_unlock(&mrt_lock);
1355 				    return 0;
1356 			    }
1357 			    read_unlock(&mrt_lock);
1358 		    }
1359 	}
1360 
1361 	read_lock(&mrt_lock);
1362 	cache = ipmr_cache_find(skb->nh.iph->saddr, skb->nh.iph->daddr);
1363 
1364 	/*
1365 	 *	No usable cache entry
1366 	 */
1367 	if (cache==NULL) {
1368 		int vif;
1369 
1370 		if (local) {
1371 			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1372 			ip_local_deliver(skb);
1373 			if (skb2 == NULL) {
1374 				read_unlock(&mrt_lock);
1375 				return -ENOBUFS;
1376 			}
1377 			skb = skb2;
1378 		}
1379 
1380 		vif = ipmr_find_vif(skb->dev);
1381 		if (vif >= 0) {
1382 			int err = ipmr_cache_unresolved(vif, skb);
1383 			read_unlock(&mrt_lock);
1384 
1385 			return err;
1386 		}
1387 		read_unlock(&mrt_lock);
1388 		kfree_skb(skb);
1389 		return -ENODEV;
1390 	}
1391 
1392 	ip_mr_forward(skb, cache, local);
1393 
1394 	read_unlock(&mrt_lock);
1395 
1396 	if (local)
1397 		return ip_local_deliver(skb);
1398 
1399 	return 0;
1400 
1401 dont_forward:
1402 	if (local)
1403 		return ip_local_deliver(skb);
1404 	kfree_skb(skb);
1405 	return 0;
1406 }
1407 
1408 #ifdef CONFIG_IP_PIMSM_V1
1409 /*
1410  * Handle IGMP messages of PIMv1
1411  */
1412 
1413 int pim_rcv_v1(struct sk_buff * skb)
1414 {
1415 	struct igmphdr *pim;
1416 	struct iphdr   *encap;
1417 	struct net_device  *reg_dev = NULL;
1418 
1419 	if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
1420 		goto drop;
1421 
1422 	pim = (struct igmphdr*)skb->h.raw;
1423 
1424         if (!mroute_do_pim ||
1425 	    skb->len < sizeof(*pim) + sizeof(*encap) ||
1426 	    pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
1427 		goto drop;
1428 
1429 	encap = (struct iphdr*)(skb->h.raw + sizeof(struct igmphdr));
1430 	/*
1431 	   Check that:
1432 	   a. packet is really destined to a multicast group
1433 	   b. packet is not a NULL-REGISTER
1434 	   c. packet is not truncated
1435 	 */
1436 	if (!MULTICAST(encap->daddr) ||
1437 	    encap->tot_len == 0 ||
1438 	    ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
1439 		goto drop;
1440 
1441 	read_lock(&mrt_lock);
1442 	if (reg_vif_num >= 0)
1443 		reg_dev = vif_table[reg_vif_num].dev;
1444 	if (reg_dev)
1445 		dev_hold(reg_dev);
1446 	read_unlock(&mrt_lock);
1447 
1448 	if (reg_dev == NULL)
1449 		goto drop;
1450 
1451 	skb->mac.raw = skb->nh.raw;
1452 	skb_pull(skb, (u8*)encap - skb->data);
1453 	skb->nh.iph = (struct iphdr *)skb->data;
1454 	skb->dev = reg_dev;
1455 	memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
1456 	skb->protocol = htons(ETH_P_IP);
1457 	skb->ip_summed = 0;
1458 	skb->pkt_type = PACKET_HOST;
1459 	dst_release(skb->dst);
1460 	skb->dst = NULL;
1461 	((struct net_device_stats*)reg_dev->priv)->rx_bytes += skb->len;
1462 	((struct net_device_stats*)reg_dev->priv)->rx_packets++;
1463 	nf_reset(skb);
1464 	netif_rx(skb);
1465 	dev_put(reg_dev);
1466 	return 0;
1467  drop:
1468 	kfree_skb(skb);
1469 	return 0;
1470 }
1471 #endif
1472 
1473 #ifdef CONFIG_IP_PIMSM_V2
1474 static int pim_rcv(struct sk_buff * skb)
1475 {
1476 	struct pimreghdr *pim;
1477 	struct iphdr   *encap;
1478 	struct net_device  *reg_dev = NULL;
1479 
1480 	if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
1481 		goto drop;
1482 
1483 	pim = (struct pimreghdr*)skb->h.raw;
1484         if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
1485 	    (pim->flags&PIM_NULL_REGISTER) ||
1486 	    (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
1487 	     (u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))))
1488 		goto drop;
1489 
1490 	/* check if the inner packet is destined to mcast group */
1491 	encap = (struct iphdr*)(skb->h.raw + sizeof(struct pimreghdr));
1492 	if (!MULTICAST(encap->daddr) ||
1493 	    encap->tot_len == 0 ||
1494 	    ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
1495 		goto drop;
1496 
1497 	read_lock(&mrt_lock);
1498 	if (reg_vif_num >= 0)
1499 		reg_dev = vif_table[reg_vif_num].dev;
1500 	if (reg_dev)
1501 		dev_hold(reg_dev);
1502 	read_unlock(&mrt_lock);
1503 
1504 	if (reg_dev == NULL)
1505 		goto drop;
1506 
1507 	skb->mac.raw = skb->nh.raw;
1508 	skb_pull(skb, (u8*)encap - skb->data);
1509 	skb->nh.iph = (struct iphdr *)skb->data;
1510 	skb->dev = reg_dev;
1511 	memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
1512 	skb->protocol = htons(ETH_P_IP);
1513 	skb->ip_summed = 0;
1514 	skb->pkt_type = PACKET_HOST;
1515 	dst_release(skb->dst);
1516 	((struct net_device_stats*)reg_dev->priv)->rx_bytes += skb->len;
1517 	((struct net_device_stats*)reg_dev->priv)->rx_packets++;
1518 	skb->dst = NULL;
1519 	nf_reset(skb);
1520 	netif_rx(skb);
1521 	dev_put(reg_dev);
1522 	return 0;
1523  drop:
1524 	kfree_skb(skb);
1525 	return 0;
1526 }
1527 #endif
1528 
1529 static int
1530 ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
1531 {
1532 	int ct;
1533 	struct rtnexthop *nhp;
1534 	struct net_device *dev = vif_table[c->mfc_parent].dev;
1535 	u8 *b = skb->tail;
1536 	struct rtattr *mp_head;
1537 
1538 	if (dev)
1539 		RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);
1540 
1541 	mp_head = (struct rtattr*)skb_put(skb, RTA_LENGTH(0));
1542 
1543 	for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
1544 		if (c->mfc_un.res.ttls[ct] < 255) {
1545 			if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
1546 				goto rtattr_failure;
1547 			nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
1548 			nhp->rtnh_flags = 0;
1549 			nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
1550 			nhp->rtnh_ifindex = vif_table[ct].dev->ifindex;
1551 			nhp->rtnh_len = sizeof(*nhp);
1552 		}
1553 	}
1554 	mp_head->rta_type = RTA_MULTIPATH;
1555 	mp_head->rta_len = skb->tail - (u8*)mp_head;
1556 	rtm->rtm_type = RTN_MULTICAST;
1557 	return 1;
1558 
1559 rtattr_failure:
1560 	skb_trim(skb, b - skb->data);
1561 	return -EMSGSIZE;
1562 }
1563 
1564 int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
1565 {
1566 	int err;
1567 	struct mfc_cache *cache;
1568 	struct rtable *rt = (struct rtable*)skb->dst;
1569 
1570 	read_lock(&mrt_lock);
1571 	cache = ipmr_cache_find(rt->rt_src, rt->rt_dst);
1572 
1573 	if (cache==NULL) {
1574 		struct net_device *dev;
1575 		int vif;
1576 
1577 		if (nowait) {
1578 			read_unlock(&mrt_lock);
1579 			return -EAGAIN;
1580 		}
1581 
1582 		dev = skb->dev;
1583 		if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) {
1584 			read_unlock(&mrt_lock);
1585 			return -ENODEV;
1586 		}
1587 		skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
1588 		skb->nh.iph->ihl = sizeof(struct iphdr)>>2;
1589 		skb->nh.iph->saddr = rt->rt_src;
1590 		skb->nh.iph->daddr = rt->rt_dst;
1591 		skb->nh.iph->version = 0;
1592 		err = ipmr_cache_unresolved(vif, skb);
1593 		read_unlock(&mrt_lock);
1594 		return err;
1595 	}
1596 
1597 	if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
1598 		cache->mfc_flags |= MFC_NOTIFY;
1599 	err = ipmr_fill_mroute(skb, cache, rtm);
1600 	read_unlock(&mrt_lock);
1601 	return err;
1602 }
1603 
1604 #ifdef CONFIG_PROC_FS
1605 /*
1606  *	The /proc interfaces to multicast routing: /proc/net/ip_mr_cache and /proc/net/ip_mr_vif
1607  */
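/*
 *	Example of what /proc/net/ip_mr_vif might contain with a single
 *	ordinary VIF configured (hypothetical values; column widths follow
 *	the format string in ipmr_vif_seq_show below):
 *
 *	Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote
 *	 0 eth0              0       0         0       0 00000 C0A80101 00000000
 */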
1608 struct ipmr_vif_iter {
1609 	int ct;
1610 };
1611 
1612 static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter,
1613 					   loff_t pos)
1614 {
1615 	for (iter->ct = 0; iter->ct < maxvif; ++iter->ct) {
1616 		if(!VIF_EXISTS(iter->ct))
1617 			continue;
1618 		if (pos-- == 0)
1619 			return &vif_table[iter->ct];
1620 	}
1621 	return NULL;
1622 }
1623 
1624 static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
1625 {
1626 	read_lock(&mrt_lock);
1627 	return *pos ? ipmr_vif_seq_idx(seq->private, *pos - 1)
1628 		: SEQ_START_TOKEN;
1629 }
1630 
1631 static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1632 {
1633 	struct ipmr_vif_iter *iter = seq->private;
1634 
1635 	++*pos;
1636 	if (v == SEQ_START_TOKEN)
1637 		return ipmr_vif_seq_idx(iter, 0);
1638 
1639 	while (++iter->ct < maxvif) {
1640 		if(!VIF_EXISTS(iter->ct))
1641 			continue;
1642 		return &vif_table[iter->ct];
1643 	}
1644 	return NULL;
1645 }
1646 
1647 static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
1648 {
1649 	read_unlock(&mrt_lock);
1650 }
1651 
1652 static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
1653 {
1654 	if (v == SEQ_START_TOKEN) {
1655 		seq_puts(seq,
1656 			 "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
1657 	} else {
1658 		const struct vif_device *vif = v;
1659 		const char *name =  vif->dev ? vif->dev->name : "none";
1660 
1661 		seq_printf(seq,
1662 			   "%2Zd %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
1663 			   vif - vif_table,
1664 			   name, vif->bytes_in, vif->pkt_in,
1665 			   vif->bytes_out, vif->pkt_out,
1666 			   vif->flags, vif->local, vif->remote);
1667 	}
1668 	return 0;
1669 }
1670 
1671 static struct seq_operations ipmr_vif_seq_ops = {
1672 	.start = ipmr_vif_seq_start,
1673 	.next  = ipmr_vif_seq_next,
1674 	.stop  = ipmr_vif_seq_stop,
1675 	.show  = ipmr_vif_seq_show,
1676 };
1677 
1678 static int ipmr_vif_open(struct inode *inode, struct file *file)
1679 {
1680 	struct seq_file *seq;
1681 	int rc = -ENOMEM;
1682 	struct ipmr_vif_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
1683 
1684 	if (!s)
1685 		goto out;
1686 
1687 	rc = seq_open(file, &ipmr_vif_seq_ops);
1688 	if (rc)
1689 		goto out_kfree;
1690 
1691 	s->ct = 0;
1692 	seq = file->private_data;
1693 	seq->private = s;
1694 out:
1695 	return rc;
1696 out_kfree:
1697 	kfree(s);
1698 	goto out;
1699 
1700 }
1701 
1702 static struct file_operations ipmr_vif_fops = {
1703 	.owner	 = THIS_MODULE,
1704 	.open    = ipmr_vif_open,
1705 	.read    = seq_read,
1706 	.llseek  = seq_lseek,
1707 	.release = seq_release_private,
1708 };
1709 
1710 struct ipmr_mfc_iter {
1711 	struct mfc_cache **cache;
1712 	int ct;
1713 };
1714 
1715 
1716 static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos)
1717 {
1718 	struct mfc_cache *mfc;
1719 
1720 	it->cache = mfc_cache_array;
1721 	read_lock(&mrt_lock);
1722 	for (it->ct = 0; it->ct < MFC_LINES; it->ct++)
1723 		for(mfc = mfc_cache_array[it->ct]; mfc; mfc = mfc->next)
1724 			if (pos-- == 0)
1725 				return mfc;
1726 	read_unlock(&mrt_lock);
1727 
1728 	it->cache = &mfc_unres_queue;
1729 	spin_lock_bh(&mfc_unres_lock);
1730 	for(mfc = mfc_unres_queue; mfc; mfc = mfc->next)
1731 		if (pos-- == 0)
1732 			return mfc;
1733 	spin_unlock_bh(&mfc_unres_lock);
1734 
1735 	it->cache = NULL;
1736 	return NULL;
1737 }
1738 
1739 
1740 static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
1741 {
1742 	struct ipmr_mfc_iter *it = seq->private;
1743 	it->cache = NULL;
1744 	it->ct = 0;
1745 	return *pos ? ipmr_mfc_seq_idx(seq->private, *pos - 1)
1746 		: SEQ_START_TOKEN;
1747 }
1748 
1749 static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1750 {
1751 	struct mfc_cache *mfc = v;
1752 	struct ipmr_mfc_iter *it = seq->private;
1753 
1754 	++*pos;
1755 
1756 	if (v == SEQ_START_TOKEN)
1757 		return ipmr_mfc_seq_idx(seq->private, 0);
1758 
1759 	if (mfc->next)
1760 		return mfc->next;
1761 
1762 	if (it->cache == &mfc_unres_queue)
1763 		goto end_of_list;
1764 
1765 	BUG_ON(it->cache != mfc_cache_array);
1766 
1767 	while (++it->ct < MFC_LINES) {
1768 		mfc = mfc_cache_array[it->ct];
1769 		if (mfc)
1770 			return mfc;
1771 	}
1772 
1773 	/* exhausted cache_array, show unresolved */
1774 	read_unlock(&mrt_lock);
1775 	it->cache = &mfc_unres_queue;
1776 	it->ct = 0;
1777 
1778 	spin_lock_bh(&mfc_unres_lock);
1779 	mfc = mfc_unres_queue;
1780 	if (mfc)
1781 		return mfc;
1782 
1783  end_of_list:
1784 	spin_unlock_bh(&mfc_unres_lock);
1785 	it->cache = NULL;
1786 
1787 	return NULL;
1788 }
1789 
1790 static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
1791 {
1792 	struct ipmr_mfc_iter *it = seq->private;
1793 
1794 	if (it->cache == &mfc_unres_queue)
1795 		spin_unlock_bh(&mfc_unres_lock);
1796 	else if (it->cache == mfc_cache_array)
1797 		read_unlock(&mrt_lock);
1798 }
1799 
1800 static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
1801 {
1802 	int n;
1803 
1804 	if (v == SEQ_START_TOKEN) {
1805 		seq_puts(seq,
1806 		 "Group    Origin   Iif     Pkts    Bytes    Wrong Oifs\n");
1807 	} else {
1808 		const struct mfc_cache *mfc = v;
1809 		const struct ipmr_mfc_iter *it = seq->private;
1810 
1811 		seq_printf(seq, "%08lX %08lX %-3d %8ld %8ld %8ld",
1812 			   (unsigned long) mfc->mfc_mcastgrp,
1813 			   (unsigned long) mfc->mfc_origin,
1814 			   mfc->mfc_parent,
1815 			   mfc->mfc_un.res.pkt,
1816 			   mfc->mfc_un.res.bytes,
1817 			   mfc->mfc_un.res.wrong_if);
1818 
1819 		if (it->cache != &mfc_unres_queue) {
1820 			for(n = mfc->mfc_un.res.minvif;
1821 			    n < mfc->mfc_un.res.maxvif; n++ ) {
1822 				if(VIF_EXISTS(n)
1823 				   && mfc->mfc_un.res.ttls[n] < 255)
1824 				seq_printf(seq,
1825 					   " %2d:%-3d",
1826 					   n, mfc->mfc_un.res.ttls[n]);
1827 			}
1828 		}
1829 		seq_putc(seq, '\n');
1830 	}
1831 	return 0;
1832 }
1833 
1834 static struct seq_operations ipmr_mfc_seq_ops = {
1835 	.start = ipmr_mfc_seq_start,
1836 	.next  = ipmr_mfc_seq_next,
1837 	.stop  = ipmr_mfc_seq_stop,
1838 	.show  = ipmr_mfc_seq_show,
1839 };
1840 
1841 static int ipmr_mfc_open(struct inode *inode, struct file *file)
1842 {
1843 	struct seq_file *seq;
1844 	int rc = -ENOMEM;
1845 	struct ipmr_mfc_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
1846 
1847 	if (!s)
1848 		goto out;
1849 
1850 	rc = seq_open(file, &ipmr_mfc_seq_ops);
1851 	if (rc)
1852 		goto out_kfree;
1853 
1854 	seq = file->private_data;
1855 	seq->private = s;
1856 out:
1857 	return rc;
1858 out_kfree:
1859 	kfree(s);
1860 	goto out;
1861 
1862 }
1863 
1864 static struct file_operations ipmr_mfc_fops = {
1865 	.owner	 = THIS_MODULE,
1866 	.open    = ipmr_mfc_open,
1867 	.read    = seq_read,
1868 	.llseek  = seq_lseek,
1869 	.release = seq_release_private,
1870 };
1871 #endif
1872 
1873 #ifdef CONFIG_IP_PIMSM_V2
1874 static struct net_protocol pim_protocol = {
1875 	.handler	=	pim_rcv,
1876 };
1877 #endif
1878 
1879 
1880 /*
1881  *	Setup for IP multicast routing
1882  */
1883 
1884 void __init ip_mr_init(void)
1885 {
1886 	mrt_cachep = kmem_cache_create("ip_mrt_cache",
1887 				       sizeof(struct mfc_cache),
1888 				       0, SLAB_HWCACHE_ALIGN,
1889 				       NULL, NULL);
1890 	if (!mrt_cachep)
1891 		panic("cannot allocate ip_mrt_cache");
1892 
1893 	init_timer(&ipmr_expire_timer);
1894 	ipmr_expire_timer.function=ipmr_expire_process;
1895 	register_netdevice_notifier(&ip_mr_notifier);
1896 #ifdef CONFIG_PROC_FS
1897 	proc_net_fops_create("ip_mr_vif", 0, &ipmr_vif_fops);
1898 	proc_net_fops_create("ip_mr_cache", 0, &ipmr_mfc_fops);
1899 #endif
1900 }
1901