xref: /linux/net/ipv4/ipmr.c (revision c537b994505099b7197e7d3125b942ecbcc51eb6)
1 /*
2  *	IP multicast routing support for mrouted 3.6/3.8
3  *
4  *		(c) 1995 Alan Cox, <alan@redhat.com>
5  *	  Linux Consultancy and Custom Driver Development
6  *
7  *	This program is free software; you can redistribute it and/or
8  *	modify it under the terms of the GNU General Public License
9  *	as published by the Free Software Foundation; either version
10  *	2 of the License, or (at your option) any later version.
11  *
12  *	Version: $Id: ipmr.c,v 1.65 2001/10/31 21:55:54 davem Exp $
13  *
14  *	Fixes:
15  *	Michael Chastain	:	Incorrect size of copying.
16  *	Alan Cox		:	Added the cache manager code
17  *	Alan Cox		:	Fixed the clone/copy bug and device race.
18  *	Mike McLagan		:	Routing by source
19  *	Malcolm Beattie		:	Buffer handling fixes.
20  *	Alexey Kuznetsov	:	Double buffer free and other fixes.
21  *	SVR Anand		:	Fixed several multicast bugs and problems.
22  *	Alexey Kuznetsov	:	Status, optimisations and more.
23  *	Brad Parker		:	Better behaviour on mrouted upcall
24  *					overflow.
25  *	Carlos Picoto		:	PIMv1 Support
26  *	Pavlin Ivanov Radoslavov:	PIMv2 Registers must checksum only PIM header
27  *					Relax this requirement to work with older peers.
28  *
29  */
30 
31 #include <asm/system.h>
32 #include <asm/uaccess.h>
33 #include <linux/types.h>
34 #include <linux/capability.h>
35 #include <linux/errno.h>
36 #include <linux/timer.h>
37 #include <linux/mm.h>
38 #include <linux/kernel.h>
39 #include <linux/fcntl.h>
40 #include <linux/stat.h>
41 #include <linux/socket.h>
42 #include <linux/in.h>
43 #include <linux/inet.h>
44 #include <linux/netdevice.h>
45 #include <linux/inetdevice.h>
46 #include <linux/igmp.h>
47 #include <linux/proc_fs.h>
48 #include <linux/seq_file.h>
49 #include <linux/mroute.h>
50 #include <linux/init.h>
51 #include <linux/if_ether.h>
52 #include <net/ip.h>
53 #include <net/protocol.h>
54 #include <linux/skbuff.h>
55 #include <net/route.h>
56 #include <net/sock.h>
57 #include <net/icmp.h>
58 #include <net/udp.h>
59 #include <net/raw.h>
60 #include <linux/notifier.h>
61 #include <linux/if_arp.h>
62 #include <linux/netfilter_ipv4.h>
63 #include <net/ipip.h>
64 #include <net/checksum.h>
65 
66 #if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
67 #define CONFIG_IP_PIMSM	1
68 #endif
69 
70 static struct sock *mroute_socket;
71 
72 
73 /* Big lock, protecting the vif table, the MFC cache and the mroute
74    socket state. Note that changes are serialized via rtnl_lock.
75  */
76 
77 static DEFINE_RWLOCK(mrt_lock);
78 
79 /*
80  *	Multicast router control variables
81  */
82 
83 static struct vif_device vif_table[MAXVIFS];		/* Devices 		*/
84 static int maxvif;
85 
86 #define VIF_EXISTS(idx) (vif_table[idx].dev != NULL)
87 
88 static int mroute_do_assert;				/* Set in PIM assert	*/
89 static int mroute_do_pim;
90 
91 static struct mfc_cache *mfc_cache_array[MFC_LINES];	/* Forwarding cache	*/
92 
93 static struct mfc_cache *mfc_unres_queue;		/* Queue of unresolved entries */
94 static atomic_t cache_resolve_queue_len;		/* Size of unresolved	*/
95 
96 /* Special spinlock for queue of unresolved entries */
97 static DEFINE_SPINLOCK(mfc_unres_lock);
98 
99 /* We return to Alan's original scheme. The hash table of resolved
100    entries is changed only in process context and is protected
101    by the weak read-write lock mrt_lock. The queue of unresolved
102    entries is protected by the strong spinlock mfc_unres_lock.
103 
104    This keeps the data path entirely free of exclusive locks.
105  */
106 
107 static struct kmem_cache *mrt_cachep __read_mostly;
108 
109 static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
110 static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
111 static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm);
112 
113 #ifdef CONFIG_IP_PIMSM_V2
114 static struct net_protocol pim_protocol;
115 #endif
116 
117 static struct timer_list ipmr_expire_timer;
118 
119 /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
120 
121 static
122 struct net_device *ipmr_new_tunnel(struct vifctl *v)
123 {
124 	struct net_device  *dev;
125 
126 	dev = __dev_get_by_name("tunl0");
127 
128 	if (dev) {
129 		int err;
130 		struct ifreq ifr;
131 		mm_segment_t	oldfs;
132 		struct ip_tunnel_parm p;
133 		struct in_device  *in_dev;
134 
135 		memset(&p, 0, sizeof(p));
136 		p.iph.daddr = v->vifc_rmt_addr.s_addr;
137 		p.iph.saddr = v->vifc_lcl_addr.s_addr;
138 		p.iph.version = 4;
139 		p.iph.ihl = 5;
140 		p.iph.protocol = IPPROTO_IPIP;
141 		sprintf(p.name, "dvmrp%d", v->vifc_vifi);
142 		ifr.ifr_ifru.ifru_data = (void*)&p;
143 
144 		oldfs = get_fs(); set_fs(KERNEL_DS);
145 		err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL);
146 		set_fs(oldfs);
147 
148 		dev = NULL;
149 
150 		if (err == 0 && (dev = __dev_get_by_name(p.name)) != NULL) {
151 			dev->flags |= IFF_MULTICAST;
152 
153 			in_dev = __in_dev_get_rtnl(dev);
154 			if (in_dev == NULL && (in_dev = inetdev_init(dev)) == NULL)
155 				goto failure;
156 			in_dev->cnf.rp_filter = 0;
157 
158 			if (dev_open(dev))
159 				goto failure;
160 		}
161 	}
162 	return dev;
163 
164 failure:
165 	/* allow the register to be completed before unregistering. */
166 	rtnl_unlock();
167 	rtnl_lock();
168 
169 	unregister_netdevice(dev);
170 	return NULL;
171 }
172 
173 #ifdef CONFIG_IP_PIMSM
174 
175 static int reg_vif_num = -1;
176 
177 static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
178 {
179 	read_lock(&mrt_lock);
180 	((struct net_device_stats*)netdev_priv(dev))->tx_bytes += skb->len;
181 	((struct net_device_stats*)netdev_priv(dev))->tx_packets++;
182 	ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT);
183 	read_unlock(&mrt_lock);
184 	kfree_skb(skb);
185 	return 0;
186 }
187 
188 static struct net_device_stats *reg_vif_get_stats(struct net_device *dev)
189 {
190 	return (struct net_device_stats*)netdev_priv(dev);
191 }
192 
193 static void reg_vif_setup(struct net_device *dev)
194 {
195 	dev->type		= ARPHRD_PIMREG;
196 	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 8;
197 	dev->flags		= IFF_NOARP;
198 	dev->hard_start_xmit	= reg_vif_xmit;
199 	dev->get_stats		= reg_vif_get_stats;
200 	dev->destructor		= free_netdev;
201 }
202 
203 static struct net_device *ipmr_reg_vif(void)
204 {
205 	struct net_device *dev;
206 	struct in_device *in_dev;
207 
208 	dev = alloc_netdev(sizeof(struct net_device_stats), "pimreg",
209 			   reg_vif_setup);
210 
211 	if (dev == NULL)
212 		return NULL;
213 
214 	if (register_netdevice(dev)) {
215 		free_netdev(dev);
216 		return NULL;
217 	}
218 	dev->iflink = 0;
219 
220 	if ((in_dev = inetdev_init(dev)) == NULL)
221 		goto failure;
222 
223 	in_dev->cnf.rp_filter = 0;
224 
225 	if (dev_open(dev))
226 		goto failure;
227 
228 	return dev;
229 
230 failure:
231 	/* allow the register to be completed before unregistering. */
232 	rtnl_unlock();
233 	rtnl_lock();
234 
235 	unregister_netdevice(dev);
236 	return NULL;
237 }
238 #endif
239 
240 /*
241  *	Delete a VIF entry
242  */
243 
244 static int vif_delete(int vifi)
245 {
246 	struct vif_device *v;
247 	struct net_device *dev;
248 	struct in_device *in_dev;
249 
250 	if (vifi < 0 || vifi >= maxvif)
251 		return -EADDRNOTAVAIL;
252 
253 	v = &vif_table[vifi];
254 
255 	write_lock_bh(&mrt_lock);
256 	dev = v->dev;
257 	v->dev = NULL;
258 
259 	if (!dev) {
260 		write_unlock_bh(&mrt_lock);
261 		return -EADDRNOTAVAIL;
262 	}
263 
264 #ifdef CONFIG_IP_PIMSM
265 	if (vifi == reg_vif_num)
266 		reg_vif_num = -1;
267 #endif
268 
269 	if (vifi+1 == maxvif) {
270 		int tmp;
271 		for (tmp=vifi-1; tmp>=0; tmp--) {
272 			if (VIF_EXISTS(tmp))
273 				break;
274 		}
275 		maxvif = tmp+1;
276 	}
277 
278 	write_unlock_bh(&mrt_lock);
279 
280 	dev_set_allmulti(dev, -1);
281 
282 	if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
283 		in_dev->cnf.mc_forwarding--;
284 		ip_rt_multicast_event(in_dev);
285 	}
286 
287 	if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
288 		unregister_netdevice(dev);
289 
290 	dev_put(dev);
291 	return 0;
292 }
293 
294 /* Destroy an unresolved cache entry, killing queued skbs
295    and reporting an error to netlink readers.
296  */
297 
298 static void ipmr_destroy_unres(struct mfc_cache *c)
299 {
300 	struct sk_buff *skb;
301 	struct nlmsgerr *e;
302 
303 	atomic_dec(&cache_resolve_queue_len);
304 
305 	while((skb=skb_dequeue(&c->mfc_un.unres.unresolved))) {
306 		if (skb->nh.iph->version == 0) {
307 			struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
308 			nlh->nlmsg_type = NLMSG_ERROR;
309 			nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
310 			skb_trim(skb, nlh->nlmsg_len);
311 			e = NLMSG_DATA(nlh);
312 			e->error = -ETIMEDOUT;
313 			memset(&e->msg, 0, sizeof(e->msg));
314 
315 			rtnl_unicast(skb, NETLINK_CB(skb).pid);
316 		} else
317 			kfree_skb(skb);
318 	}
319 
320 	kmem_cache_free(mrt_cachep, c);
321 }
322 
323 
324 /* A single timer process services the whole unresolved queue. */
325 
326 static void ipmr_expire_process(unsigned long dummy)
327 {
328 	unsigned long now;
329 	unsigned long expires;
330 	struct mfc_cache *c, **cp;
331 
332 	if (!spin_trylock(&mfc_unres_lock)) {
333 		mod_timer(&ipmr_expire_timer, jiffies+HZ/10);
334 		return;
335 	}
336 
337 	if (atomic_read(&cache_resolve_queue_len) == 0)
338 		goto out;
339 
340 	now = jiffies;
341 	expires = 10*HZ;
342 	cp = &mfc_unres_queue;
343 
344 	while ((c=*cp) != NULL) {
345 		if (time_after(c->mfc_un.unres.expires, now)) {
346 			unsigned long interval = c->mfc_un.unres.expires - now;
347 			if (interval < expires)
348 				expires = interval;
349 			cp = &c->next;
350 			continue;
351 		}
352 
353 		*cp = c->next;
354 
355 		ipmr_destroy_unres(c);
356 	}
357 
358 	if (atomic_read(&cache_resolve_queue_len))
359 		mod_timer(&ipmr_expire_timer, jiffies + expires);
360 
361 out:
362 	spin_unlock(&mfc_unres_lock);
363 }
364 
365 /* Fill the oifs list. Called with mrt_lock held for writing. */
366 
367 static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
368 {
369 	int vifi;
370 
371 	cache->mfc_un.res.minvif = MAXVIFS;
372 	cache->mfc_un.res.maxvif = 0;
373 	memset(cache->mfc_un.res.ttls, 255, MAXVIFS);
374 
375 	for (vifi=0; vifi<maxvif; vifi++) {
376 		if (VIF_EXISTS(vifi) && ttls[vifi] && ttls[vifi] < 255) {
377 			cache->mfc_un.res.ttls[vifi] = ttls[vifi];
378 			if (cache->mfc_un.res.minvif > vifi)
379 				cache->mfc_un.res.minvif = vifi;
380 			if (cache->mfc_un.res.maxvif <= vifi)
381 				cache->mfc_un.res.maxvif = vifi + 1;
382 		}
383 	}
384 }
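
/* A worked example of the function above (values hypothetical): with
 * maxvif == 4 and an input of ttls[] == {0, 1, 255, 3}, only vifs 1 and 3
 * qualify (nonzero and below 255), so the entry is left with:
 *
 *	cache->mfc_un.res.ttls   == {255, 1, 255, 3, 255, ...}
 *	cache->mfc_un.res.minvif == 1
 *	cache->mfc_un.res.maxvif == 4	(one past the last valid vif)
 *
 * which lets ip_mr_forward() scan only the window [minvif, maxvif).
 */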
385 
386 static int vif_add(struct vifctl *vifc, int mrtsock)
387 {
388 	int vifi = vifc->vifc_vifi;
389 	struct vif_device *v = &vif_table[vifi];
390 	struct net_device *dev;
391 	struct in_device *in_dev;
392 
393 	/* Is vif busy ? */
394 	if (VIF_EXISTS(vifi))
395 		return -EADDRINUSE;
396 
397 	switch (vifc->vifc_flags) {
398 #ifdef CONFIG_IP_PIMSM
399 	case VIFF_REGISTER:
400 		/*
401 		 * Special Purpose VIF in PIM
402 		 * All the packets will be sent to the daemon
403 		 */
404 		if (reg_vif_num >= 0)
405 			return -EADDRINUSE;
406 		dev = ipmr_reg_vif();
407 		if (!dev)
408 			return -ENOBUFS;
409 		break;
410 #endif
411 	case VIFF_TUNNEL:
412 		dev = ipmr_new_tunnel(vifc);
413 		if (!dev)
414 			return -ENOBUFS;
415 		break;
416 	case 0:
417 		dev = ip_dev_find(vifc->vifc_lcl_addr.s_addr);
418 		if (!dev)
419 			return -EADDRNOTAVAIL;
420 		dev_put(dev);
421 		break;
422 	default:
423 		return -EINVAL;
424 	}
425 
426 	if ((in_dev = __in_dev_get_rtnl(dev)) == NULL)
427 		return -EADDRNOTAVAIL;
428 	in_dev->cnf.mc_forwarding++;
429 	dev_set_allmulti(dev, +1);
430 	ip_rt_multicast_event(in_dev);
431 
432 	/*
433 	 *	Fill in the VIF structures
434 	 */
435 	v->rate_limit=vifc->vifc_rate_limit;
436 	v->local=vifc->vifc_lcl_addr.s_addr;
437 	v->remote=vifc->vifc_rmt_addr.s_addr;
438 	v->flags=vifc->vifc_flags;
439 	if (!mrtsock)
440 		v->flags |= VIFF_STATIC;
441 	v->threshold=vifc->vifc_threshold;
442 	v->bytes_in = 0;
443 	v->bytes_out = 0;
444 	v->pkt_in = 0;
445 	v->pkt_out = 0;
446 	v->link = dev->ifindex;
447 	if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
448 		v->link = dev->iflink;
449 
450 	/* And finish update writing critical data */
451 	write_lock_bh(&mrt_lock);
452 	dev_hold(dev);
453 	v->dev=dev;
454 #ifdef CONFIG_IP_PIMSM
455 	if (v->flags&VIFF_REGISTER)
456 		reg_vif_num = vifi;
457 #endif
458 	if (vifi+1 > maxvif)
459 		maxvif = vifi+1;
460 	write_unlock_bh(&mrt_lock);
461 	return 0;
462 }
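
/* A minimal user-space sketch of driving vif_add() (hypothetical values;
 * mrt_fd is assumed to be a raw IGMP socket on which MRT_INIT already
 * succeeded - see ip_mroute_setsockopt() below):
 *
 *	struct vifctl vc;
 *
 *	memset(&vc, 0, sizeof(vc));
 *	vc.vifc_vifi = 1;
 *	vc.vifc_flags = 0;			// plain physical interface
 *	vc.vifc_threshold = 1;			// minimum TTL to forward
 *	vc.vifc_rate_limit = 0;
 *	vc.vifc_lcl_addr.s_addr = inet_addr("192.0.2.1");
 *	if (setsockopt(mrt_fd, IPPROTO_IP, MRT_ADD_VIF, &vc, sizeof(vc)) < 0)
 *		perror("MRT_ADD_VIF");
 */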
463 
464 static struct mfc_cache *ipmr_cache_find(__be32 origin, __be32 mcastgrp)
465 {
466 	int line=MFC_HASH(mcastgrp,origin);
467 	struct mfc_cache *c;
468 
469 	for (c=mfc_cache_array[line]; c; c = c->next) {
470 		if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp)
471 			break;
472 	}
473 	return c;
474 }
475 
476 /*
477  *	Allocate a multicast cache entry
478  */
479 static struct mfc_cache *ipmr_cache_alloc(void)
480 {
481 	struct mfc_cache *c=kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
482 	if(c==NULL)
483 		return NULL;
484 	c->mfc_un.res.minvif = MAXVIFS;
485 	return c;
486 }
487 
488 static struct mfc_cache *ipmr_cache_alloc_unres(void)
489 {
490 	struct mfc_cache *c=kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
491 	if(c==NULL)
492 		return NULL;
493 	skb_queue_head_init(&c->mfc_un.unres.unresolved);
494 	c->mfc_un.unres.expires = jiffies + 10*HZ;
495 	return c;
496 }
497 
498 /*
499  *	A queued cache entry has been resolved; replay its pending packets
500  */
501 
502 static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
503 {
504 	struct sk_buff *skb;
505 	struct nlmsgerr *e;
506 
507 	/*
508 	 *	Play the pending entries through our router
509 	 */
510 
511 	while((skb=__skb_dequeue(&uc->mfc_un.unres.unresolved))) {
512 		if (skb->nh.iph->version == 0) {
513 			struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
514 
515 			if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) {
516 				nlh->nlmsg_len = skb->tail - (u8*)nlh;
517 			} else {
518 				nlh->nlmsg_type = NLMSG_ERROR;
519 				nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
520 				skb_trim(skb, nlh->nlmsg_len);
521 				e = NLMSG_DATA(nlh);
522 				e->error = -EMSGSIZE;
523 				memset(&e->msg, 0, sizeof(e->msg));
524 			}
525 
526 			rtnl_unicast(skb, NETLINK_CB(skb).pid);
527 		} else
528 			ip_mr_forward(skb, c, 0);
529 	}
530 }
531 
532 /*
533  *	Bounce a cache query up to mrouted. We could use netlink for this but mrouted
534  *	expects the following bizarre scheme.
535  *
536  *	Called under mrt_lock.
537  */
538 
539 static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
540 {
541 	struct sk_buff *skb;
542 	int ihl = pkt->nh.iph->ihl<<2;
543 	struct igmphdr *igmp;
544 	struct igmpmsg *msg;
545 	int ret;
546 
547 #ifdef CONFIG_IP_PIMSM
548 	if (assert == IGMPMSG_WHOLEPKT)
549 		skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
550 	else
551 #endif
552 		skb = alloc_skb(128, GFP_ATOMIC);
553 
554 	if(!skb)
555 		return -ENOBUFS;
556 
557 #ifdef CONFIG_IP_PIMSM
558 	if (assert == IGMPMSG_WHOLEPKT) {
559 		/* Ugly, but we have no choice with this interface.
560 		   Duplicate old header, fix ihl, length etc.
561 		   And all this only to mangle msg->im_msgtype and
562 		   to set msg->im_mbz to "mbz" :-)
563 		 */
564 		msg = (struct igmpmsg*)skb_push(skb, sizeof(struct iphdr));
565 		skb->nh.raw = skb->h.raw = (u8*)msg;
566 		memcpy(msg, pkt->nh.raw, sizeof(struct iphdr));
567 		msg->im_msgtype = IGMPMSG_WHOLEPKT;
568 		msg->im_mbz = 0;
569 		msg->im_vif = reg_vif_num;
570 		skb->nh.iph->ihl = sizeof(struct iphdr) >> 2;
571 		skb->nh.iph->tot_len = htons(ntohs(pkt->nh.iph->tot_len) + sizeof(struct iphdr));
572 	} else
573 #endif
574 	{
575 
576 	/*
577 	 *	Copy the IP header
578 	 */
579 
580 	skb->nh.iph = (struct iphdr *)skb_put(skb, ihl);
581 	memcpy(skb->data,pkt->data,ihl);
582 	skb->nh.iph->protocol = 0;			/* Flag to mrouted: an upcall, not a real IGMP packet */
583 	msg = (struct igmpmsg*)skb->nh.iph;
584 	msg->im_vif = vifi;
585 	skb->dst = dst_clone(pkt->dst);
586 
587 	/*
588 	 *	Add our header
589 	 */
590 
591 	igmp=(struct igmphdr *)skb_put(skb,sizeof(struct igmphdr));
592 	igmp->type	=
593 	msg->im_msgtype = assert;
594 	igmp->code 	=	0;
595 	skb->nh.iph->tot_len=htons(skb->len);			/* Fix the length */
596 	skb->h.raw = skb->nh.raw;
597 	}
598 
599 	if (mroute_socket == NULL) {
600 		kfree_skb(skb);
601 		return -EINVAL;
602 	}
603 
604 	/*
605 	 *	Deliver to mrouted
606 	 */
607 	if ((ret=sock_queue_rcv_skb(mroute_socket,skb))<0) {
608 		if (net_ratelimit())
609 			printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
610 		kfree_skb(skb);
611 	}
612 
613 	return ret;
614 }
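
/* A sketch of the user-space side of this upcall (hypothetical daemon
 * code). The daemon reads whole datagrams from its raw IGMP socket;
 * because the kernel zeroed iph->protocol above, the overlaid igmpmsg has
 * im_mbz == 0, which is what distinguishes an upcall from a real IGMP
 * packet:
 *
 *	char buf[8192];
 *	ssize_t n = read(mrt_fd, buf, sizeof(buf));
 *	struct igmpmsg *im = (struct igmpmsg *)buf;
 *
 *	if (n >= (ssize_t)sizeof(*im) && im->im_mbz == 0) {
 *		switch (im->im_msgtype) {
 *		case IGMPMSG_NOCACHE:	// resolve (im->im_src, im->im_dst),
 *					// then install it with MRT_ADD_MFC
 *			break;
 *		case IGMPMSG_WRONGVIF:	// PIM assert processing
 *			break;
 *		case IGMPMSG_WHOLEPKT:	// PIM register: unicast to the RP
 *			break;
 *		}
 *	}
 */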
615 
616 /*
617  *	Queue a packet for resolution; it is attached to a cache entry on the locked unresolved queue.
618  */
619 
620 static int
621 ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb)
622 {
623 	int err;
624 	struct mfc_cache *c;
625 
626 	spin_lock_bh(&mfc_unres_lock);
627 	for (c=mfc_unres_queue; c; c=c->next) {
628 		if (c->mfc_mcastgrp == skb->nh.iph->daddr &&
629 		    c->mfc_origin == skb->nh.iph->saddr)
630 			break;
631 	}
632 
633 	if (c == NULL) {
634 		/*
635 		 *	Create a new entry if allowable
636 		 */
637 
638 		if (atomic_read(&cache_resolve_queue_len)>=10 ||
639 		    (c=ipmr_cache_alloc_unres())==NULL) {
640 			spin_unlock_bh(&mfc_unres_lock);
641 
642 			kfree_skb(skb);
643 			return -ENOBUFS;
644 		}
645 
646 		/*
647 		 *	Fill in the new cache entry
648 		 */
649 		c->mfc_parent=-1;
650 		c->mfc_origin=skb->nh.iph->saddr;
651 		c->mfc_mcastgrp=skb->nh.iph->daddr;
652 
653 		/*
654 		 *	Reflect first query at mrouted.
655 		 */
656 		if ((err = ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE))<0) {
657 			/* If the report failed throw the cache entry
658 			   out - Brad Parker
659 			 */
660 			spin_unlock_bh(&mfc_unres_lock);
661 
662 			kmem_cache_free(mrt_cachep, c);
663 			kfree_skb(skb);
664 			return err;
665 		}
666 
667 		atomic_inc(&cache_resolve_queue_len);
668 		c->next = mfc_unres_queue;
669 		mfc_unres_queue = c;
670 
671 		mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires);
672 	}
673 
674 	/*
675 	 *	See if we can append the packet
676 	 */
677 	if (c->mfc_un.unres.unresolved.qlen>3) {
678 		kfree_skb(skb);
679 		err = -ENOBUFS;
680 	} else {
681 		skb_queue_tail(&c->mfc_un.unres.unresolved,skb);
682 		err = 0;
683 	}
684 
685 	spin_unlock_bh(&mfc_unres_lock);
686 	return err;
687 }
688 
689 /*
690  *	MFC cache manipulation by user space mroute daemon
691  */
692 
693 static int ipmr_mfc_delete(struct mfcctl *mfc)
694 {
695 	int line;
696 	struct mfc_cache *c, **cp;
697 
698 	line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
699 
700 	for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
701 		if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
702 		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
703 			write_lock_bh(&mrt_lock);
704 			*cp = c->next;
705 			write_unlock_bh(&mrt_lock);
706 
707 			kmem_cache_free(mrt_cachep, c);
708 			return 0;
709 		}
710 	}
711 	return -ENOENT;
712 }
713 
714 static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
715 {
716 	int line;
717 	struct mfc_cache *uc, *c, **cp;
718 
719 	line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
720 
721 	for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
722 		if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
723 		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr)
724 			break;
725 	}
726 
727 	if (c != NULL) {
728 		write_lock_bh(&mrt_lock);
729 		c->mfc_parent = mfc->mfcc_parent;
730 		ipmr_update_thresholds(c, mfc->mfcc_ttls);
731 		if (!mrtsock)
732 			c->mfc_flags |= MFC_STATIC;
733 		write_unlock_bh(&mrt_lock);
734 		return 0;
735 	}
736 
737 	if(!MULTICAST(mfc->mfcc_mcastgrp.s_addr))
738 		return -EINVAL;
739 
740 	c=ipmr_cache_alloc();
741 	if (c==NULL)
742 		return -ENOMEM;
743 
744 	c->mfc_origin=mfc->mfcc_origin.s_addr;
745 	c->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr;
746 	c->mfc_parent=mfc->mfcc_parent;
747 	ipmr_update_thresholds(c, mfc->mfcc_ttls);
748 	if (!mrtsock)
749 		c->mfc_flags |= MFC_STATIC;
750 
751 	write_lock_bh(&mrt_lock);
752 	c->next = mfc_cache_array[line];
753 	mfc_cache_array[line] = c;
754 	write_unlock_bh(&mrt_lock);
755 
756 	/*
757 	 *	Check to see if this entry resolves any queued (unresolved)
758 	 *	entries. If so we need to send the pending frames and tidy up.
759 	 */
760 	spin_lock_bh(&mfc_unres_lock);
761 	for (cp = &mfc_unres_queue; (uc=*cp) != NULL;
762 	     cp = &uc->next) {
763 		if (uc->mfc_origin == c->mfc_origin &&
764 		    uc->mfc_mcastgrp == c->mfc_mcastgrp) {
765 			*cp = uc->next;
766 			if (atomic_dec_and_test(&cache_resolve_queue_len))
767 				del_timer(&ipmr_expire_timer);
768 			break;
769 		}
770 	}
771 	spin_unlock_bh(&mfc_unres_lock);
772 
773 	if (uc) {
774 		ipmr_cache_resolve(uc, c);
775 		kmem_cache_free(mrt_cachep, uc);
776 	}
777 	return 0;
778 }
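
/* The matching user-space sketch for ipmr_mfc_add(), typically issued in
 * response to an IGMPMSG_NOCACHE upcall (addresses and vif numbers are
 * hypothetical; mrt_fd as above):
 *
 *	struct mfcctl mc;
 *
 *	memset(&mc, 0, sizeof(mc));
 *	mc.mfcc_origin.s_addr   = inet_addr("192.0.2.2");	// source S
 *	mc.mfcc_mcastgrp.s_addr = inet_addr("233.252.0.1");	// group G
 *	mc.mfcc_parent = 0;		// expected incoming vif
 *	mc.mfcc_ttls[1] = 1;		// forward on vif 1, TTL threshold 1
 *	if (setsockopt(mrt_fd, IPPROTO_IP, MRT_ADD_MFC, &mc, sizeof(mc)) < 0)
 *		perror("MRT_ADD_MFC");
 */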
779 
780 /*
781  *	Close the multicast socket, and clear the vif tables etc
782  */
783 
784 static void mroute_clean_tables(struct sock *sk)
785 {
786 	int i;
787 
788 	/*
789 	 *	Shut down all active vif entries
790 	 */
791 	for(i=0; i<maxvif; i++) {
792 		if (!(vif_table[i].flags&VIFF_STATIC))
793 			vif_delete(i);
794 	}
795 
796 	/*
797 	 *	Wipe the cache
798 	 */
799 	for (i=0;i<MFC_LINES;i++) {
800 		struct mfc_cache *c, **cp;
801 
802 		cp = &mfc_cache_array[i];
803 		while ((c = *cp) != NULL) {
804 			if (c->mfc_flags&MFC_STATIC) {
805 				cp = &c->next;
806 				continue;
807 			}
808 			write_lock_bh(&mrt_lock);
809 			*cp = c->next;
810 			write_unlock_bh(&mrt_lock);
811 
812 			kmem_cache_free(mrt_cachep, c);
813 		}
814 	}
815 
816 	if (atomic_read(&cache_resolve_queue_len) != 0) {
817 		struct mfc_cache *c;
818 
819 		spin_lock_bh(&mfc_unres_lock);
820 		while (mfc_unres_queue != NULL) {
821 			c = mfc_unres_queue;
822 			mfc_unres_queue = c->next;
823 			spin_unlock_bh(&mfc_unres_lock);
824 
825 			ipmr_destroy_unres(c);
826 
827 			spin_lock_bh(&mfc_unres_lock);
828 		}
829 		spin_unlock_bh(&mfc_unres_lock);
830 	}
831 }
832 
833 static void mrtsock_destruct(struct sock *sk)
834 {
835 	rtnl_lock();
836 	if (sk == mroute_socket) {
837 		ipv4_devconf.mc_forwarding--;
838 
839 		write_lock_bh(&mrt_lock);
840 		mroute_socket=NULL;
841 		write_unlock_bh(&mrt_lock);
842 
843 		mroute_clean_tables(sk);
844 	}
845 	rtnl_unlock();
846 }
847 
848 /*
849  *	Socket options and virtual interface manipulation. The whole
850  *	virtual interface system is a complete heap, but unfortunately
851  *	that's how BSD mrouted happens to think. Maybe one day with a proper
852  *	MOSPF/PIM router set up we can clean this up.
853  */
854 
855 int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int optlen)
856 {
857 	int ret;
858 	struct vifctl vif;
859 	struct mfcctl mfc;
860 
861 	if(optname!=MRT_INIT)
862 	{
863 		if(sk!=mroute_socket && !capable(CAP_NET_ADMIN))
864 			return -EACCES;
865 	}
866 
867 	switch(optname)
868 	{
869 		case MRT_INIT:
870 			if (sk->sk_type != SOCK_RAW ||
871 			    inet_sk(sk)->num != IPPROTO_IGMP)
872 				return -EOPNOTSUPP;
873 			if(optlen!=sizeof(int))
874 				return -ENOPROTOOPT;
875 
876 			rtnl_lock();
877 			if (mroute_socket) {
878 				rtnl_unlock();
879 				return -EADDRINUSE;
880 			}
881 
882 			ret = ip_ra_control(sk, 1, mrtsock_destruct);
883 			if (ret == 0) {
884 				write_lock_bh(&mrt_lock);
885 				mroute_socket=sk;
886 				write_unlock_bh(&mrt_lock);
887 
888 				ipv4_devconf.mc_forwarding++;
889 			}
890 			rtnl_unlock();
891 			return ret;
892 		case MRT_DONE:
893 			if (sk!=mroute_socket)
894 				return -EACCES;
895 			return ip_ra_control(sk, 0, NULL);
896 		case MRT_ADD_VIF:
897 		case MRT_DEL_VIF:
898 			if(optlen!=sizeof(vif))
899 				return -EINVAL;
900 			if (copy_from_user(&vif,optval,sizeof(vif)))
901 				return -EFAULT;
902 			if(vif.vifc_vifi >= MAXVIFS)
903 				return -ENFILE;
904 			rtnl_lock();
905 			if (optname==MRT_ADD_VIF) {
906 				ret = vif_add(&vif, sk==mroute_socket);
907 			} else {
908 				ret = vif_delete(vif.vifc_vifi);
909 			}
910 			rtnl_unlock();
911 			return ret;
912 
913 		/*
914 		 *	Manipulate the forwarding caches. These live
915 		 *	in a sort of kernel/user symbiosis.
916 		 */
917 		case MRT_ADD_MFC:
918 		case MRT_DEL_MFC:
919 			if(optlen!=sizeof(mfc))
920 				return -EINVAL;
921 			if (copy_from_user(&mfc,optval, sizeof(mfc)))
922 				return -EFAULT;
923 			rtnl_lock();
924 			if (optname==MRT_DEL_MFC)
925 				ret = ipmr_mfc_delete(&mfc);
926 			else
927 				ret = ipmr_mfc_add(&mfc, sk==mroute_socket);
928 			rtnl_unlock();
929 			return ret;
930 		/*
931 		 *	Control PIM assert.
932 		 */
933 		case MRT_ASSERT:
934 		{
935 			int v;
936 			if(get_user(v,(int __user *)optval))
937 				return -EFAULT;
938 			mroute_do_assert=(v)?1:0;
939 			return 0;
940 		}
941 #ifdef CONFIG_IP_PIMSM
942 		case MRT_PIM:
943 		{
944 			int v, ret;
945 			if(get_user(v,(int __user *)optval))
946 				return -EFAULT;
947 			v = (v)?1:0;
948 			rtnl_lock();
949 			ret = 0;
950 			if (v != mroute_do_pim) {
951 				mroute_do_pim = v;
952 				mroute_do_assert = v;
953 #ifdef CONFIG_IP_PIMSM_V2
954 				if (mroute_do_pim)
955 					ret = inet_add_protocol(&pim_protocol,
956 								IPPROTO_PIM);
957 				else
958 					ret = inet_del_protocol(&pim_protocol,
959 								IPPROTO_PIM);
960 				if (ret < 0)
961 					ret = -EAGAIN;
962 #endif
963 			}
964 			rtnl_unlock();
965 			return ret;
966 		}
967 #endif
968 		/*
969 		 *	Spurious command, or MRT_VERSION which you cannot
970 		 *	set.
971 		 */
972 		default:
973 			return -ENOPROTOOPT;
974 	}
975 }
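
/* A minimal sketch of the socket lifecycle this function expects
 * (hypothetical user-space code): the options only work on a raw IGMP
 * socket, and MRT_INIT must come first:
 *
 *	int one = 1;
 *	int mrt_fd = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
 *
 *	if (setsockopt(mrt_fd, IPPROTO_IP, MRT_INIT, &one, sizeof(one)) < 0)
 *		perror("MRT_INIT");	// EADDRINUSE: a daemon already runs
 *
 *	// ... add vifs and MFC entries, read upcalls ...
 *
 *	setsockopt(mrt_fd, IPPROTO_IP, MRT_DONE, NULL, 0);
 */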
976 
977 /*
978  *	Getsock opt support for the multicast routing system.
979  */
980 
981 int ip_mroute_getsockopt(struct sock *sk,int optname,char __user *optval,int __user *optlen)
982 {
983 	int olr;
984 	int val;
985 
986 	if(optname!=MRT_VERSION &&
987 #ifdef CONFIG_IP_PIMSM
988 	   optname!=MRT_PIM &&
989 #endif
990 	   optname!=MRT_ASSERT)
991 		return -ENOPROTOOPT;
992 
993 	if (get_user(olr, optlen))
994 		return -EFAULT;
995 
996 	if (olr < 0)
997 		return -EINVAL;
998 	olr = min_t(unsigned int, olr, sizeof(int));
999 
1000 	if(put_user(olr,optlen))
1001 		return -EFAULT;
1002 	if(optname==MRT_VERSION)
1003 		val=0x0305;
1004 #ifdef CONFIG_IP_PIMSM
1005 	else if(optname==MRT_PIM)
1006 		val=mroute_do_pim;
1007 #endif
1008 	else
1009 		val=mroute_do_assert;
1010 	if(copy_to_user(optval,&val,olr))
1011 		return -EFAULT;
1012 	return 0;
1013 }
1014 
1015 /*
1016  *	The IP multicast ioctl support routines.
1017  */
1018 
1019 int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
1020 {
1021 	struct sioc_sg_req sr;
1022 	struct sioc_vif_req vr;
1023 	struct vif_device *vif;
1024 	struct mfc_cache *c;
1025 
1026 	switch(cmd)
1027 	{
1028 		case SIOCGETVIFCNT:
1029 			if (copy_from_user(&vr,arg,sizeof(vr)))
1030 				return -EFAULT;
1031 			if(vr.vifi>=maxvif)
1032 				return -EINVAL;
1033 			read_lock(&mrt_lock);
1034 			vif=&vif_table[vr.vifi];
1035 			if(VIF_EXISTS(vr.vifi))	{
1036 				vr.icount=vif->pkt_in;
1037 				vr.ocount=vif->pkt_out;
1038 				vr.ibytes=vif->bytes_in;
1039 				vr.obytes=vif->bytes_out;
1040 				read_unlock(&mrt_lock);
1041 
1042 				if (copy_to_user(arg,&vr,sizeof(vr)))
1043 					return -EFAULT;
1044 				return 0;
1045 			}
1046 			read_unlock(&mrt_lock);
1047 			return -EADDRNOTAVAIL;
1048 		case SIOCGETSGCNT:
1049 			if (copy_from_user(&sr,arg,sizeof(sr)))
1050 				return -EFAULT;
1051 
1052 			read_lock(&mrt_lock);
1053 			c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr);
1054 			if (c) {
1055 				sr.pktcnt = c->mfc_un.res.pkt;
1056 				sr.bytecnt = c->mfc_un.res.bytes;
1057 				sr.wrong_if = c->mfc_un.res.wrong_if;
1058 				read_unlock(&mrt_lock);
1059 
1060 				if (copy_to_user(arg,&sr,sizeof(sr)))
1061 					return -EFAULT;
1062 				return 0;
1063 			}
1064 			read_unlock(&mrt_lock);
1065 			return -EADDRNOTAVAIL;
1066 		default:
1067 			return -ENOIOCTLCMD;
1068 	}
1069 }
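
/* A short sketch of the two counter ioctls above (hypothetical values;
 * mrt_fd as before):
 *
 *	struct sioc_vif_req vr;
 *	struct sioc_sg_req sr;
 *
 *	memset(&vr, 0, sizeof(vr));
 *	vr.vifi = 1;
 *	if (ioctl(mrt_fd, SIOCGETVIFCNT, &vr) == 0)
 *		printf("vif 1: %lu pkts in, %lu pkts out\n",
 *		       vr.icount, vr.ocount);
 *
 *	memset(&sr, 0, sizeof(sr));
 *	sr.src.s_addr = inet_addr("192.0.2.2");
 *	sr.grp.s_addr = inet_addr("233.252.0.1");
 *	if (ioctl(mrt_fd, SIOCGETSGCNT, &sr) == 0)
 *		printf("(S,G): %lu pkts, %lu bytes, %lu wrong-if\n",
 *		       sr.pktcnt, sr.bytecnt, sr.wrong_if);
 */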
1070 
1071 
1072 static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
1073 {
1074 	struct vif_device *v;
1075 	int ct;
1076 	if (event != NETDEV_UNREGISTER)
1077 		return NOTIFY_DONE;
1078 	v=&vif_table[0];
1079 	for(ct=0;ct<maxvif;ct++,v++) {
1080 		if (v->dev==ptr)
1081 			vif_delete(ct);
1082 	}
1083 	return NOTIFY_DONE;
1084 }
1085 
1086 
1087 static struct notifier_block ip_mr_notifier={
1088 	.notifier_call = ipmr_device_event,
1089 };
1090 
1091 /*
1092  * 	Encapsulate a packet by attaching a valid IPIP header to it.
1093  *	This avoids the tunnel drivers and other overhead, and gives us the
1094  *	speed that matters for multicast video.
1095  */
1096 
1097 static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
1098 {
1099 	struct iphdr *iph = (struct iphdr *)skb_push(skb,sizeof(struct iphdr));
1100 
1101 	iph->version	= 	4;
1102 	iph->tos	=	skb->nh.iph->tos;
1103 	iph->ttl	=	skb->nh.iph->ttl;
1104 	iph->frag_off	=	0;
1105 	iph->daddr	=	daddr;
1106 	iph->saddr	=	saddr;
1107 	iph->protocol	=	IPPROTO_IPIP;
1108 	iph->ihl	=	5;
1109 	iph->tot_len	=	htons(skb->len);
1110 	ip_select_ident(iph, skb->dst, NULL);
1111 	ip_send_check(iph);
1112 
1113 	skb->h.ipiph = skb->nh.iph;
1114 	skb->nh.iph = iph;
1115 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
1116 	nf_reset(skb);
1117 }
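
/* A sketch of the layout ip_encap() produces; the frame that goes down a
 * tunnel vif carries a minimal 20-byte outer header:
 *
 *	+-----------------------------+-------------------------------+
 *	| outer iphdr (20 bytes):     | original multicast datagram   |
 *	| protocol = IPPROTO_IPIP,    | (inner iphdr + payload)       |
 *	| saddr/daddr = vif endpoints |                               |
 *	+-----------------------------+-------------------------------+
 *
 * TOS and TTL are copied from the inner header.
 */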
1118 
1119 static inline int ipmr_forward_finish(struct sk_buff *skb)
1120 {
1121 	struct ip_options * opt	= &(IPCB(skb)->opt);
1122 
1123 	IP_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
1124 
1125 	if (unlikely(opt->optlen))
1126 		ip_forward_options(skb);
1127 
1128 	return dst_output(skb);
1129 }
1130 
1131 /*
1132  *	Processing handlers for ipmr_forward
1133  */
1134 
1135 static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
1136 {
1137 	struct iphdr *iph = skb->nh.iph;
1138 	struct vif_device *vif = &vif_table[vifi];
1139 	struct net_device *dev;
1140 	struct rtable *rt;
1141 	int    encap = 0;
1142 
1143 	if (vif->dev == NULL)
1144 		goto out_free;
1145 
1146 #ifdef CONFIG_IP_PIMSM
1147 	if (vif->flags & VIFF_REGISTER) {
1148 		vif->pkt_out++;
1149 		vif->bytes_out+=skb->len;
1150 		((struct net_device_stats*)netdev_priv(vif->dev))->tx_bytes += skb->len;
1151 		((struct net_device_stats*)netdev_priv(vif->dev))->tx_packets++;
1152 		ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT);
1153 		kfree_skb(skb);
1154 		return;
1155 	}
1156 #endif
1157 
1158 	if (vif->flags&VIFF_TUNNEL) {
1159 		struct flowi fl = { .oif = vif->link,
1160 				    .nl_u = { .ip4_u =
1161 					      { .daddr = vif->remote,
1162 						.saddr = vif->local,
1163 						.tos = RT_TOS(iph->tos) } },
1164 				    .proto = IPPROTO_IPIP };
1165 		if (ip_route_output_key(&rt, &fl))
1166 			goto out_free;
1167 		encap = sizeof(struct iphdr);
1168 	} else {
1169 		struct flowi fl = { .oif = vif->link,
1170 				    .nl_u = { .ip4_u =
1171 					      { .daddr = iph->daddr,
1172 						.tos = RT_TOS(iph->tos) } },
1173 				    .proto = IPPROTO_IPIP };
1174 		if (ip_route_output_key(&rt, &fl))
1175 			goto out_free;
1176 	}
1177 
1178 	dev = rt->u.dst.dev;
1179 
1180 	if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
1181 		/* Do not fragment multicasts. Alas, IPv4 does not
1182 		   allow us to send ICMP here, so oversized packets
1183 		   simply disappear into a black hole.
1184 		 */
1185 
1186 		IP_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
1187 		ip_rt_put(rt);
1188 		goto out_free;
1189 	}
1190 
1191 	encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;
1192 
1193 	if (skb_cow(skb, encap)) {
1194 		ip_rt_put(rt);
1195 		goto out_free;
1196 	}
1197 
1198 	vif->pkt_out++;
1199 	vif->bytes_out+=skb->len;
1200 
1201 	dst_release(skb->dst);
1202 	skb->dst = &rt->u.dst;
1203 	iph = skb->nh.iph;
1204 	ip_decrease_ttl(iph);
1205 
1206 	/* FIXME: forward and output firewalls used to be called here.
1207 	 * What do we do with netfilter? -- RR */
1208 	if (vif->flags & VIFF_TUNNEL) {
1209 		ip_encap(skb, vif->local, vif->remote);
1210 		/* FIXME: extra output firewall step used to be here. --RR */
1211 		((struct ip_tunnel *)netdev_priv(vif->dev))->stat.tx_packets++;
1212 		((struct ip_tunnel *)netdev_priv(vif->dev))->stat.tx_bytes+=skb->len;
1213 	}
1214 
1215 	IPCB(skb)->flags |= IPSKB_FORWARDED;
1216 
1217 	/*
1218 	 * RFC 1584 teaches that a DVMRP/PIM router must deliver packets locally
1219 	 * not only before forwarding, but also after forwarding on all output
1220 	 * interfaces. Clearly, if the mrouter runs a multicast
1221 	 * program, that program should receive packets regardless of which
1222 	 * interface it is joined on.
1223 	 * If we did not do this, the program would have to join on all
1224 	 * interfaces. On the other hand, a multihomed host (or router, but
1225 	 * not an mrouter) cannot join on more than one interface - it would
1226 	 * result in receiving multiple copies of each packet.
1227 	 */
1228 	NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, dev,
1229 		ipmr_forward_finish);
1230 	return;
1231 
1232 out_free:
1233 	kfree_skb(skb);
1234 	return;
1235 }
1236 
1237 static int ipmr_find_vif(struct net_device *dev)
1238 {
1239 	int ct;
1240 	for (ct=maxvif-1; ct>=0; ct--) {
1241 		if (vif_table[ct].dev == dev)
1242 			break;
1243 	}
1244 	return ct;
1245 }
1246 
1247 /* "local" means that we should preserve one skb (for local delivery) */
1248 
1249 static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local)
1250 {
1251 	int psend = -1;
1252 	int vif, ct;
1253 
1254 	vif = cache->mfc_parent;
1255 	cache->mfc_un.res.pkt++;
1256 	cache->mfc_un.res.bytes += skb->len;
1257 
1258 	/*
1259 	 * Wrong interface: drop packet and (maybe) send PIM assert.
1260 	 */
1261 	if (vif_table[vif].dev != skb->dev) {
1262 		int true_vifi;
1263 
1264 		if (((struct rtable*)skb->dst)->fl.iif == 0) {
1265 			/* It is our own packet, looped back.
1266 			   A very complicated situation...
1267 
1268 			   The best workaround until routing daemons are
1269 			   fixed is not to redistribute a packet if it was
1270 			   sent through the wrong interface. This means that
1271 			   multicast applications WILL NOT work for (S,G)
1272 			   entries whose default multicast route points to the
1273 			   wrong oif. In any case, it is not a good
1274 			   idea to run multicast applications on a router.
1275 			 */
1276 			goto dont_forward;
1277 		}
1278 
1279 		cache->mfc_un.res.wrong_if++;
1280 		true_vifi = ipmr_find_vif(skb->dev);
1281 
1282 		if (true_vifi >= 0 && mroute_do_assert &&
1283 		    /* PIM-SM uses asserts when switching from the RPT to the SPT,
1284 		       so we cannot check that the packet arrived on an oif.
1285 		       That is bad, but otherwise we would need to move a pretty
1286 		       large chunk of pimd into the kernel. Ough... --ANK
1287 		     */
1288 		    (mroute_do_pim || cache->mfc_un.res.ttls[true_vifi] < 255) &&
1289 		    time_after(jiffies,
1290 			       cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
1291 			cache->mfc_un.res.last_assert = jiffies;
1292 			ipmr_cache_report(skb, true_vifi, IGMPMSG_WRONGVIF);
1293 		}
1294 		goto dont_forward;
1295 	}
1296 
1297 	vif_table[vif].pkt_in++;
1298 	vif_table[vif].bytes_in+=skb->len;
1299 
1300 	/*
1301 	 *	Forward the frame
1302 	 */
1303 	for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
1304 		if (skb->nh.iph->ttl > cache->mfc_un.res.ttls[ct]) {
1305 			if (psend != -1) {
1306 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1307 				if (skb2)
1308 					ipmr_queue_xmit(skb2, cache, psend);
1309 			}
1310 			psend=ct;
1311 		}
1312 	}
1313 	if (psend != -1) {
1314 		if (local) {
1315 			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1316 			if (skb2)
1317 				ipmr_queue_xmit(skb2, cache, psend);
1318 		} else {
1319 			ipmr_queue_xmit(skb, cache, psend);
1320 			return 0;
1321 		}
1322 	}
1323 
1324 dont_forward:
1325 	if (!local)
1326 		kfree_skb(skb);
1327 	return 0;
1328 }
1329 
1330 
1331 /*
1332  *	Multicast packets for forwarding arrive here
1333  */
1334 
1335 int ip_mr_input(struct sk_buff *skb)
1336 {
1337 	struct mfc_cache *cache;
1338 	int local = ((struct rtable*)skb->dst)->rt_flags&RTCF_LOCAL;
1339 
1340 	/* The packet was looped back after forwarding; it must not be
1341 	   forwarded a second time, but it can still be delivered locally.
1342 	 */
1343 	if (IPCB(skb)->flags&IPSKB_FORWARDED)
1344 		goto dont_forward;
1345 
1346 	if (!local) {
1347 		    if (IPCB(skb)->opt.router_alert) {
1348 			    if (ip_call_ra_chain(skb))
1349 				    return 0;
1350 		    } else if (skb->nh.iph->protocol == IPPROTO_IGMP){
1351 			    /* IGMPv1 (and broken IGMPv2 implementations such as
1352 			       Cisco IOS <= 11.2(8)) do not put the router alert
1353 			       option into IGMP packets destined to routable
1354 			       groups. That is very bad, because it means
1355 			       that we can forward NO IGMP messages at all.
1356 			     */
1357 			    read_lock(&mrt_lock);
1358 			    if (mroute_socket) {
1359 				    nf_reset(skb);
1360 				    raw_rcv(mroute_socket, skb);
1361 				    read_unlock(&mrt_lock);
1362 				    return 0;
1363 			    }
1364 			    read_unlock(&mrt_lock);
1365 		    }
1366 	}
1367 
1368 	read_lock(&mrt_lock);
1369 	cache = ipmr_cache_find(skb->nh.iph->saddr, skb->nh.iph->daddr);
1370 
1371 	/*
1372 	 *	No usable cache entry
1373 	 */
1374 	if (cache==NULL) {
1375 		int vif;
1376 
1377 		if (local) {
1378 			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1379 			ip_local_deliver(skb);
1380 			if (skb2 == NULL) {
1381 				read_unlock(&mrt_lock);
1382 				return -ENOBUFS;
1383 			}
1384 			skb = skb2;
1385 		}
1386 
1387 		vif = ipmr_find_vif(skb->dev);
1388 		if (vif >= 0) {
1389 			int err = ipmr_cache_unresolved(vif, skb);
1390 			read_unlock(&mrt_lock);
1391 
1392 			return err;
1393 		}
1394 		read_unlock(&mrt_lock);
1395 		kfree_skb(skb);
1396 		return -ENODEV;
1397 	}
1398 
1399 	ip_mr_forward(skb, cache, local);
1400 
1401 	read_unlock(&mrt_lock);
1402 
1403 	if (local)
1404 		return ip_local_deliver(skb);
1405 
1406 	return 0;
1407 
1408 dont_forward:
1409 	if (local)
1410 		return ip_local_deliver(skb);
1411 	kfree_skb(skb);
1412 	return 0;
1413 }
1414 
1415 #ifdef CONFIG_IP_PIMSM_V1
1416 /*
1417  * Handle IGMP messages of PIMv1
1418  */
1419 
1420 int pim_rcv_v1(struct sk_buff * skb)
1421 {
1422 	struct igmphdr *pim;
1423 	struct iphdr   *encap;
1424 	struct net_device  *reg_dev = NULL;
1425 
1426 	if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
1427 		goto drop;
1428 
1429 	pim = (struct igmphdr*)skb->h.raw;
1430 
1431 	if (!mroute_do_pim ||
1432 	    skb->len < sizeof(*pim) + sizeof(*encap) ||
1433 	    pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
1434 		goto drop;
1435 
1436 	encap = (struct iphdr*)(skb->h.raw + sizeof(struct igmphdr));
1437 	/*
1438 	   Check that:
1439 	   a. packet is really destined to a multicast group
1440 	   b. packet is not a NULL-REGISTER
1441 	   c. packet is not truncated
1442 	 */
1443 	if (!MULTICAST(encap->daddr) ||
1444 	    encap->tot_len == 0 ||
1445 	    ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
1446 		goto drop;
1447 
1448 	read_lock(&mrt_lock);
1449 	if (reg_vif_num >= 0)
1450 		reg_dev = vif_table[reg_vif_num].dev;
1451 	if (reg_dev)
1452 		dev_hold(reg_dev);
1453 	read_unlock(&mrt_lock);
1454 
1455 	if (reg_dev == NULL)
1456 		goto drop;
1457 
1458 	skb->mac.raw = skb->nh.raw;
1459 	skb_pull(skb, (u8*)encap - skb->data);
1460 	skb->nh.iph = (struct iphdr *)skb->data;
1461 	skb->dev = reg_dev;
1462 	skb->protocol = htons(ETH_P_IP);
1463 	skb->ip_summed = 0;
1464 	skb->pkt_type = PACKET_HOST;
1465 	dst_release(skb->dst);
1466 	skb->dst = NULL;
1467 	((struct net_device_stats*)netdev_priv(reg_dev))->rx_bytes += skb->len;
1468 	((struct net_device_stats*)netdev_priv(reg_dev))->rx_packets++;
1469 	nf_reset(skb);
1470 	netif_rx(skb);
1471 	dev_put(reg_dev);
1472 	return 0;
1473  drop:
1474 	kfree_skb(skb);
1475 	return 0;
1476 }
1477 #endif
1478 
1479 #ifdef CONFIG_IP_PIMSM_V2
1480 static int pim_rcv(struct sk_buff * skb)
1481 {
1482 	struct pimreghdr *pim;
1483 	struct iphdr   *encap;
1484 	struct net_device  *reg_dev = NULL;
1485 
1486 	if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
1487 		goto drop;
1488 
1489 	pim = (struct pimreghdr*)skb->h.raw;
1490 	if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
1491 	    (pim->flags&PIM_NULL_REGISTER) ||
1492 	    (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
1493 	     csum_fold(skb_checksum(skb, 0, skb->len, 0))))
1494 		goto drop;
1495 
1496 	/* check if the inner packet is destined to mcast group */
1497 	encap = (struct iphdr*)(skb->h.raw + sizeof(struct pimreghdr));
1498 	if (!MULTICAST(encap->daddr) ||
1499 	    encap->tot_len == 0 ||
1500 	    ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
1501 		goto drop;
1502 
1503 	read_lock(&mrt_lock);
1504 	if (reg_vif_num >= 0)
1505 		reg_dev = vif_table[reg_vif_num].dev;
1506 	if (reg_dev)
1507 		dev_hold(reg_dev);
1508 	read_unlock(&mrt_lock);
1509 
1510 	if (reg_dev == NULL)
1511 		goto drop;
1512 
1513 	skb->mac.raw = skb->nh.raw;
1514 	skb_pull(skb, (u8*)encap - skb->data);
1515 	skb->nh.iph = (struct iphdr *)skb->data;
1516 	skb->dev = reg_dev;
1517 	skb->protocol = htons(ETH_P_IP);
1518 	skb->ip_summed = 0;
1519 	skb->pkt_type = PACKET_HOST;
1520 	dst_release(skb->dst);
1521 	((struct net_device_stats*)netdev_priv(reg_dev))->rx_bytes += skb->len;
1522 	((struct net_device_stats*)netdev_priv(reg_dev))->rx_packets++;
1523 	skb->dst = NULL;
1524 	nf_reset(skb);
1525 	netif_rx(skb);
1526 	dev_put(reg_dev);
1527 	return 0;
1528  drop:
1529 	kfree_skb(skb);
1530 	return 0;
1531 }
1532 #endif
1533 
1534 static int
1535 ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
1536 {
1537 	int ct;
1538 	struct rtnexthop *nhp;
1539 	struct net_device *dev = vif_table[c->mfc_parent].dev;
1540 	u8 *b = skb->tail;
1541 	struct rtattr *mp_head;
1542 
1543 	if (dev)
1544 		RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);
1545 
1546 	mp_head = (struct rtattr*)skb_put(skb, RTA_LENGTH(0));
1547 
1548 	for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
1549 		if (c->mfc_un.res.ttls[ct] < 255) {
1550 			if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
1551 				goto rtattr_failure;
1552 			nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
1553 			nhp->rtnh_flags = 0;
1554 			nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
1555 			nhp->rtnh_ifindex = vif_table[ct].dev->ifindex;
1556 			nhp->rtnh_len = sizeof(*nhp);
1557 		}
1558 	}
1559 	mp_head->rta_type = RTA_MULTIPATH;
1560 	mp_head->rta_len = skb->tail - (u8*)mp_head;
1561 	rtm->rtm_type = RTN_MULTICAST;
1562 	return 1;
1563 
1564 rtattr_failure:
1565 	skb_trim(skb, b - skb->data);
1566 	return -EMSGSIZE;
1567 }
1568 
1569 int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
1570 {
1571 	int err;
1572 	struct mfc_cache *cache;
1573 	struct rtable *rt = (struct rtable*)skb->dst;
1574 
1575 	read_lock(&mrt_lock);
1576 	cache = ipmr_cache_find(rt->rt_src, rt->rt_dst);
1577 
1578 	if (cache==NULL) {
1579 		struct sk_buff *skb2;
1580 		struct net_device *dev;
1581 		int vif;
1582 
1583 		if (nowait) {
1584 			read_unlock(&mrt_lock);
1585 			return -EAGAIN;
1586 		}
1587 
1588 		dev = skb->dev;
1589 		if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) {
1590 			read_unlock(&mrt_lock);
1591 			return -ENODEV;
1592 		}
1593 		skb2 = skb_clone(skb, GFP_ATOMIC);
1594 		if (!skb2) {
1595 			read_unlock(&mrt_lock);
1596 			return -ENOMEM;
1597 		}
1598 
1599 		skb2->nh.raw = skb_push(skb2, sizeof(struct iphdr));
1600 		skb2->nh.iph->ihl = sizeof(struct iphdr)>>2;
1601 		skb2->nh.iph->saddr = rt->rt_src;
1602 		skb2->nh.iph->daddr = rt->rt_dst;
1603 		skb2->nh.iph->version = 0;
1604 		err = ipmr_cache_unresolved(vif, skb2);
1605 		read_unlock(&mrt_lock);
1606 		return err;
1607 	}
1608 
1609 	if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
1610 		cache->mfc_flags |= MFC_NOTIFY;
1611 	err = ipmr_fill_mroute(skb, cache, rtm);
1612 	read_unlock(&mrt_lock);
1613 	return err;
1614 }
1615 
1616 #ifdef CONFIG_PROC_FS
1617 /*
1618  *	The /proc interfaces to multicast routing: /proc/net/ip_mr_cache and /proc/net/ip_mr_vif
1619  */
1620 struct ipmr_vif_iter {
1621 	int ct;
1622 };
1623 
1624 static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter,
1625 					   loff_t pos)
1626 {
1627 	for (iter->ct = 0; iter->ct < maxvif; ++iter->ct) {
1628 		if(!VIF_EXISTS(iter->ct))
1629 			continue;
1630 		if (pos-- == 0)
1631 			return &vif_table[iter->ct];
1632 	}
1633 	return NULL;
1634 }
1635 
1636 static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
1637 {
1638 	read_lock(&mrt_lock);
1639 	return *pos ? ipmr_vif_seq_idx(seq->private, *pos - 1)
1640 		: SEQ_START_TOKEN;
1641 }
1642 
1643 static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1644 {
1645 	struct ipmr_vif_iter *iter = seq->private;
1646 
1647 	++*pos;
1648 	if (v == SEQ_START_TOKEN)
1649 		return ipmr_vif_seq_idx(iter, 0);
1650 
1651 	while (++iter->ct < maxvif) {
1652 		if(!VIF_EXISTS(iter->ct))
1653 			continue;
1654 		return &vif_table[iter->ct];
1655 	}
1656 	return NULL;
1657 }
1658 
1659 static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
1660 {
1661 	read_unlock(&mrt_lock);
1662 }
1663 
1664 static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
1665 {
1666 	if (v == SEQ_START_TOKEN) {
1667 		seq_puts(seq,
1668 			 "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
1669 	} else {
1670 		const struct vif_device *vif = v;
1671 		const char *name =  vif->dev ? vif->dev->name : "none";
1672 
1673 		seq_printf(seq,
1674 			   "%2Zd %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
1675 			   vif - vif_table,
1676 			   name, vif->bytes_in, vif->pkt_in,
1677 			   vif->bytes_out, vif->pkt_out,
1678 			   vif->flags, vif->local, vif->remote);
1679 	}
1680 	return 0;
1681 }
1682 
1683 static struct seq_operations ipmr_vif_seq_ops = {
1684 	.start = ipmr_vif_seq_start,
1685 	.next  = ipmr_vif_seq_next,
1686 	.stop  = ipmr_vif_seq_stop,
1687 	.show  = ipmr_vif_seq_show,
1688 };
1689 
1690 static int ipmr_vif_open(struct inode *inode, struct file *file)
1691 {
1692 	struct seq_file *seq;
1693 	int rc = -ENOMEM;
1694 	struct ipmr_vif_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
1695 
1696 	if (!s)
1697 		goto out;
1698 
1699 	rc = seq_open(file, &ipmr_vif_seq_ops);
1700 	if (rc)
1701 		goto out_kfree;
1702 
1703 	s->ct = 0;
1704 	seq = file->private_data;
1705 	seq->private = s;
1706 out:
1707 	return rc;
1708 out_kfree:
1709 	kfree(s);
1710 	goto out;
1711 
1712 }
1713 
1714 static const struct file_operations ipmr_vif_fops = {
1715 	.owner	 = THIS_MODULE,
1716 	.open    = ipmr_vif_open,
1717 	.read    = seq_read,
1718 	.llseek  = seq_lseek,
1719 	.release = seq_release_private,
1720 };
1721 
1722 struct ipmr_mfc_iter {
1723 	struct mfc_cache **cache;
1724 	int ct;
1725 };
1726 
1727 
1728 static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos)
1729 {
1730 	struct mfc_cache *mfc;
1731 
1732 	it->cache = mfc_cache_array;
1733 	read_lock(&mrt_lock);
1734 	for (it->ct = 0; it->ct < MFC_LINES; it->ct++)
1735 		for(mfc = mfc_cache_array[it->ct]; mfc; mfc = mfc->next)
1736 			if (pos-- == 0)
1737 				return mfc;
1738 	read_unlock(&mrt_lock);
1739 
1740 	it->cache = &mfc_unres_queue;
1741 	spin_lock_bh(&mfc_unres_lock);
1742 	for(mfc = mfc_unres_queue; mfc; mfc = mfc->next)
1743 		if (pos-- == 0)
1744 			return mfc;
1745 	spin_unlock_bh(&mfc_unres_lock);
1746 
1747 	it->cache = NULL;
1748 	return NULL;
1749 }
1750 
1751 
1752 static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
1753 {
1754 	struct ipmr_mfc_iter *it = seq->private;
1755 	it->cache = NULL;
1756 	it->ct = 0;
1757 	return *pos ? ipmr_mfc_seq_idx(seq->private, *pos - 1)
1758 		: SEQ_START_TOKEN;
1759 }
1760 
1761 static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1762 {
1763 	struct mfc_cache *mfc = v;
1764 	struct ipmr_mfc_iter *it = seq->private;
1765 
1766 	++*pos;
1767 
1768 	if (v == SEQ_START_TOKEN)
1769 		return ipmr_mfc_seq_idx(seq->private, 0);
1770 
1771 	if (mfc->next)
1772 		return mfc->next;
1773 
1774 	if (it->cache == &mfc_unres_queue)
1775 		goto end_of_list;
1776 
1777 	BUG_ON(it->cache != mfc_cache_array);
1778 
1779 	while (++it->ct < MFC_LINES) {
1780 		mfc = mfc_cache_array[it->ct];
1781 		if (mfc)
1782 			return mfc;
1783 	}
1784 
1785 	/* exhausted cache_array, show unresolved */
1786 	read_unlock(&mrt_lock);
1787 	it->cache = &mfc_unres_queue;
1788 	it->ct = 0;
1789 
1790 	spin_lock_bh(&mfc_unres_lock);
1791 	mfc = mfc_unres_queue;
1792 	if (mfc)
1793 		return mfc;
1794 
1795  end_of_list:
1796 	spin_unlock_bh(&mfc_unres_lock);
1797 	it->cache = NULL;
1798 
1799 	return NULL;
1800 }
1801 
1802 static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
1803 {
1804 	struct ipmr_mfc_iter *it = seq->private;
1805 
1806 	if (it->cache == &mfc_unres_queue)
1807 		spin_unlock_bh(&mfc_unres_lock);
1808 	else if (it->cache == mfc_cache_array)
1809 		read_unlock(&mrt_lock);
1810 }
1811 
1812 static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
1813 {
1814 	int n;
1815 
1816 	if (v == SEQ_START_TOKEN) {
1817 		seq_puts(seq,
1818 		 "Group    Origin   Iif     Pkts    Bytes    Wrong Oifs\n");
1819 	} else {
1820 		const struct mfc_cache *mfc = v;
1821 		const struct ipmr_mfc_iter *it = seq->private;
1822 
1823 		seq_printf(seq, "%08lX %08lX %-3d %8ld %8ld %8ld",
1824 			   (unsigned long) mfc->mfc_mcastgrp,
1825 			   (unsigned long) mfc->mfc_origin,
1826 			   mfc->mfc_parent,
1827 			   mfc->mfc_un.res.pkt,
1828 			   mfc->mfc_un.res.bytes,
1829 			   mfc->mfc_un.res.wrong_if);
1830 
1831 		if (it->cache != &mfc_unres_queue) {
1832 			for(n = mfc->mfc_un.res.minvif;
1833 			    n < mfc->mfc_un.res.maxvif; n++ ) {
1834 				if(VIF_EXISTS(n)
1835 				   && mfc->mfc_un.res.ttls[n] < 255)
1836 				seq_printf(seq,
1837 					   " %2d:%-3d",
1838 					   n, mfc->mfc_un.res.ttls[n]);
1839 			}
1840 		}
1841 		seq_putc(seq, '\n');
1842 	}
1843 	return 0;
1844 }
1845 
1846 static struct seq_operations ipmr_mfc_seq_ops = {
1847 	.start = ipmr_mfc_seq_start,
1848 	.next  = ipmr_mfc_seq_next,
1849 	.stop  = ipmr_mfc_seq_stop,
1850 	.show  = ipmr_mfc_seq_show,
1851 };
1852 
1853 static int ipmr_mfc_open(struct inode *inode, struct file *file)
1854 {
1855 	struct seq_file *seq;
1856 	int rc = -ENOMEM;
1857 	struct ipmr_mfc_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
1858 
1859 	if (!s)
1860 		goto out;
1861 
1862 	rc = seq_open(file, &ipmr_mfc_seq_ops);
1863 	if (rc)
1864 		goto out_kfree;
1865 
1866 	seq = file->private_data;
1867 	seq->private = s;
1868 out:
1869 	return rc;
1870 out_kfree:
1871 	kfree(s);
1872 	goto out;
1873 
1874 }
1875 
1876 static const struct file_operations ipmr_mfc_fops = {
1877 	.owner	 = THIS_MODULE,
1878 	.open    = ipmr_mfc_open,
1879 	.read    = seq_read,
1880 	.llseek  = seq_lseek,
1881 	.release = seq_release_private,
1882 };
1883 #endif
1884 
1885 #ifdef CONFIG_IP_PIMSM_V2
1886 static struct net_protocol pim_protocol = {
1887 	.handler	=	pim_rcv,
1888 };
1889 #endif
1890 
1891 
1892 /*
1893  *	Setup for IP multicast routing
1894  */
1895 
1896 void __init ip_mr_init(void)
1897 {
1898 	mrt_cachep = kmem_cache_create("ip_mrt_cache",
1899 				       sizeof(struct mfc_cache),
1900 				       0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
1901 				       NULL, NULL);
1902 	init_timer(&ipmr_expire_timer);
1903 	ipmr_expire_timer.function=ipmr_expire_process;
1904 	register_netdevice_notifier(&ip_mr_notifier);
1905 #ifdef CONFIG_PROC_FS
1906 	proc_net_fops_create("ip_mr_vif", 0, &ipmr_vif_fops);
1907 	proc_net_fops_create("ip_mr_cache", 0, &ipmr_mfc_fops);
1908 #endif
1909 }
1910