xref: /linux/net/ipv4/ipmr.c (revision 8fa5723aa7e053d498336b48448b292fc2e0458b)
1 /*
2  *	IP multicast routing support for mrouted 3.6/3.8
3  *
4  *		(c) 1995 Alan Cox, <alan@lxorguk.ukuu.org.uk>
5  *	  Linux Consultancy and Custom Driver Development
6  *
7  *	This program is free software; you can redistribute it and/or
8  *	modify it under the terms of the GNU General Public License
9  *	as published by the Free Software Foundation; either version
10  *	2 of the License, or (at your option) any later version.
11  *
12  *	Fixes:
13  *	Michael Chastain	:	Incorrect size of copying.
14  *	Alan Cox		:	Added the cache manager code
15  *	Alan Cox		:	Fixed the clone/copy bug and device race.
16  *	Mike McLagan		:	Routing by source
17  *	Malcolm Beattie		:	Buffer handling fixes.
18  *	Alexey Kuznetsov	:	Double buffer free and other fixes.
19  *	SVR Anand		:	Fixed several multicast bugs and problems.
20  *	Alexey Kuznetsov	:	Status, optimisations and more.
21  *	Brad Parker		:	Better behaviour on mrouted upcall
22  *					overflow.
23  *      Carlos Picoto           :       PIMv1 Support
24  *	Pavlin Ivanov Radoslavov:	PIMv2 Registers must checksum only PIM header
25  *					Relax this requirement to work with older peers.
26  *
27  */
28 
29 #include <asm/system.h>
30 #include <asm/uaccess.h>
31 #include <linux/types.h>
32 #include <linux/capability.h>
33 #include <linux/errno.h>
34 #include <linux/timer.h>
35 #include <linux/mm.h>
36 #include <linux/kernel.h>
37 #include <linux/fcntl.h>
38 #include <linux/stat.h>
39 #include <linux/socket.h>
40 #include <linux/in.h>
41 #include <linux/inet.h>
42 #include <linux/netdevice.h>
43 #include <linux/inetdevice.h>
44 #include <linux/igmp.h>
45 #include <linux/proc_fs.h>
46 #include <linux/seq_file.h>
47 #include <linux/mroute.h>
48 #include <linux/init.h>
49 #include <linux/if_ether.h>
50 #include <net/net_namespace.h>
51 #include <net/ip.h>
52 #include <net/protocol.h>
53 #include <linux/skbuff.h>
54 #include <net/route.h>
55 #include <net/sock.h>
56 #include <net/icmp.h>
57 #include <net/udp.h>
58 #include <net/raw.h>
59 #include <linux/notifier.h>
60 #include <linux/if_arp.h>
61 #include <linux/netfilter_ipv4.h>
62 #include <net/ipip.h>
63 #include <net/checksum.h>
64 #include <net/netlink.h>
65 
66 #if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
67 #define CONFIG_IP_PIMSM	1
68 #endif
69 
70 static struct sock *mroute_socket;
71 
72 
73 /* Big lock, protecting the vif table, the MFC cache and the mroute
74    socket state. Note that changes are serialized via rtnl_lock.
75  */
76 
77 static DEFINE_RWLOCK(mrt_lock);
78 
79 /*
80  *	Multicast router control variables
81  */
82 
83 static struct vif_device vif_table[MAXVIFS];		/* Devices 		*/
84 static int maxvif;
85 
86 #define VIF_EXISTS(idx) (vif_table[idx].dev != NULL)
87 
88 static int mroute_do_assert;				/* Set in PIM assert	*/
89 static int mroute_do_pim;
90 
91 static struct mfc_cache *mfc_cache_array[MFC_LINES];	/* Forwarding cache	*/
92 
93 static struct mfc_cache *mfc_unres_queue;		/* Queue of unresolved entries */
94 static atomic_t cache_resolve_queue_len;		/* Size of unresolved	*/
95 
96 /* Special spinlock for queue of unresolved entries */
97 static DEFINE_SPINLOCK(mfc_unres_lock);
98 
99 /* We return to Alan's original scheme. The hash table of resolved
100    entries is changed only in process context and protected by the
101    weak rwlock mrt_lock. The queue of unresolved entries is protected
102    by the strong spinlock mfc_unres_lock.
103 
104    As a result, the data path is entirely free of exclusive locks;
105    see the locking sketch below. */
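
/* An illustrative sketch of the resulting locking pattern (names as used
   throughout this file):

	read_lock(&mrt_lock);			fast path, softirq-safe
	c = ipmr_cache_find(saddr, daddr);
	...
	read_unlock(&mrt_lock);

	write_lock_bh(&mrt_lock);		process context only
	...modify vif_table / mfc_cache_array...
	write_unlock_bh(&mrt_lock);
 */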
106 
107 static struct kmem_cache *mrt_cachep __read_mostly;
108 
109 static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
110 static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
111 static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm);
112 
113 #ifdef CONFIG_IP_PIMSM_V2
114 static struct net_protocol pim_protocol;
115 #endif
116 
117 static struct timer_list ipmr_expire_timer;
118 
119 /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
120 
121 static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v)
122 {
123 	dev_close(dev);
124 
125 	dev = __dev_get_by_name(&init_net, "tunl0");
126 	if (dev) {
127 		struct ifreq ifr;
128 		mm_segment_t	oldfs;
129 		struct ip_tunnel_parm p;
130 
131 		memset(&p, 0, sizeof(p));
132 		p.iph.daddr = v->vifc_rmt_addr.s_addr;
133 		p.iph.saddr = v->vifc_lcl_addr.s_addr;
134 		p.iph.version = 4;
135 		p.iph.ihl = 5;
136 		p.iph.protocol = IPPROTO_IPIP;
137 		sprintf(p.name, "dvmrp%d", v->vifc_vifi);
138 		ifr.ifr_ifru.ifru_data = (__force void __user *)&p;
139 
140 		oldfs = get_fs(); set_fs(KERNEL_DS);
141 		dev->do_ioctl(dev, &ifr, SIOCDELTUNNEL);
142 		set_fs(oldfs);
143 	}
144 }
145 
146 static
147 struct net_device *ipmr_new_tunnel(struct vifctl *v)
148 {
149 	struct net_device  *dev;
150 
151 	dev = __dev_get_by_name(&init_net, "tunl0");
152 
153 	if (dev) {
154 		int err;
155 		struct ifreq ifr;
156 		mm_segment_t	oldfs;
157 		struct ip_tunnel_parm p;
158 		struct in_device  *in_dev;
159 
160 		memset(&p, 0, sizeof(p));
161 		p.iph.daddr = v->vifc_rmt_addr.s_addr;
162 		p.iph.saddr = v->vifc_lcl_addr.s_addr;
163 		p.iph.version = 4;
164 		p.iph.ihl = 5;
165 		p.iph.protocol = IPPROTO_IPIP;
166 		sprintf(p.name, "dvmrp%d", v->vifc_vifi);
167 		ifr.ifr_ifru.ifru_data = (__force void __user *)&p;
168 
169 		oldfs = get_fs(); set_fs(KERNEL_DS);
170 		err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL);
171 		set_fs(oldfs);
172 
173 		dev = NULL;
174 
175 		if (err == 0 && (dev = __dev_get_by_name(&init_net, p.name)) != NULL) {
176 			dev->flags |= IFF_MULTICAST;
177 
178 			in_dev = __in_dev_get_rtnl(dev);
179 			if (in_dev == NULL)
180 				goto failure;
181 
182 			ipv4_devconf_setall(in_dev);
183 			IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
184 
185 			if (dev_open(dev))
186 				goto failure;
187 			dev_hold(dev);
188 		}
189 	}
190 	return dev;
191 
192 failure:
193 	/* Allow the device registration to complete before unregistering. */
194 	rtnl_unlock();
195 	rtnl_lock();
196 
197 	unregister_netdevice(dev);
198 	return NULL;
199 }
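
/*
 * Aside: the two helpers above perform the classic userspace ipip-tunnel
 * ioctl dance in-kernel (under KERNEL_DS) so that mrouted never has to.
 * A hedged sketch of the equivalent userspace operation ("fd", "local",
 * "remote" and "vifi" are illustrative):
 *
 *	struct ip_tunnel_parm p = { .iph = { .version = 4, .ihl = 5,
 *					     .protocol = IPPROTO_IPIP } };
 *	struct ifreq ifr;
 *	p.iph.saddr = local;  p.iph.daddr = remote;
 *	sprintf(p.name, "dvmrp%d", vifi);
 *	strcpy(ifr.ifr_name, "tunl0");
 *	ifr.ifr_ifru.ifru_data = (void *)&p;
 *	ioctl(fd, SIOCADDTUNNEL, &ifr);		creates device p.name
 */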
200 
201 #ifdef CONFIG_IP_PIMSM
202 
203 static int reg_vif_num = -1;
204 
205 static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
206 {
207 	read_lock(&mrt_lock);
208 	dev->stats.tx_bytes += skb->len;
209 	dev->stats.tx_packets++;
210 	ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT);
211 	read_unlock(&mrt_lock);
212 	kfree_skb(skb);
213 	return 0;
214 }
215 
216 static void reg_vif_setup(struct net_device *dev)
217 {
218 	dev->type		= ARPHRD_PIMREG;
219 	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 8;
220 	dev->flags		= IFF_NOARP;
221 	dev->hard_start_xmit	= reg_vif_xmit;
222 	dev->destructor		= free_netdev;
223 }
224 
225 static struct net_device *ipmr_reg_vif(void)
226 {
227 	struct net_device *dev;
228 	struct in_device *in_dev;
229 
230 	dev = alloc_netdev(0, "pimreg", reg_vif_setup);
231 
232 	if (dev == NULL)
233 		return NULL;
234 
235 	if (register_netdevice(dev)) {
236 		free_netdev(dev);
237 		return NULL;
238 	}
239 	dev->iflink = 0;
240 
241 	rcu_read_lock();
242 	if ((in_dev = __in_dev_get_rcu(dev)) == NULL) {
243 		rcu_read_unlock();
244 		goto failure;
245 	}
246 
247 	ipv4_devconf_setall(in_dev);
248 	IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
249 	rcu_read_unlock();
250 
251 	if (dev_open(dev))
252 		goto failure;
253 
254 	dev_hold(dev);
255 
256 	return dev;
257 
258 failure:
259 	/* Allow the device registration to complete before unregistering. */
260 	rtnl_unlock();
261 	rtnl_lock();
262 
263 	unregister_netdevice(dev);
264 	return NULL;
265 }
266 #endif
267 
268 /*
269  *	Delete a VIF entry
270  *	@notify: set to 1 if the caller is a notifier_call
271  */
272 
273 static int vif_delete(int vifi, int notify)
274 {
275 	struct vif_device *v;
276 	struct net_device *dev;
277 	struct in_device *in_dev;
278 
279 	if (vifi < 0 || vifi >= maxvif)
280 		return -EADDRNOTAVAIL;
281 
282 	v = &vif_table[vifi];
283 
284 	write_lock_bh(&mrt_lock);
285 	dev = v->dev;
286 	v->dev = NULL;
287 
288 	if (!dev) {
289 		write_unlock_bh(&mrt_lock);
290 		return -EADDRNOTAVAIL;
291 	}
292 
293 #ifdef CONFIG_IP_PIMSM
294 	if (vifi == reg_vif_num)
295 		reg_vif_num = -1;
296 #endif
297 
298 	if (vifi+1 == maxvif) {
299 		int tmp;
300 		for (tmp=vifi-1; tmp>=0; tmp--) {
301 			if (VIF_EXISTS(tmp))
302 				break;
303 		}
304 		maxvif = tmp+1;
305 	}
306 
307 	write_unlock_bh(&mrt_lock);
308 
309 	dev_set_allmulti(dev, -1);
310 
311 	if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
312 		IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
313 		ip_rt_multicast_event(in_dev);
314 	}
315 
316 	if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER) && !notify)
317 		unregister_netdevice(dev);
318 
319 	dev_put(dev);
320 	return 0;
321 }
322 
323 /* Destroy an unresolved cache entry, killing queued skbs
324    and reporting an error to netlink readers.
325  */
326 
327 static void ipmr_destroy_unres(struct mfc_cache *c)
328 {
329 	struct sk_buff *skb;
330 	struct nlmsgerr *e;
331 
332 	atomic_dec(&cache_resolve_queue_len);
333 
334 	while ((skb=skb_dequeue(&c->mfc_un.unres.unresolved))) {
335 		if (ip_hdr(skb)->version == 0) {
336 			struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
337 			nlh->nlmsg_type = NLMSG_ERROR;
338 			nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
339 			skb_trim(skb, nlh->nlmsg_len);
340 			e = NLMSG_DATA(nlh);
341 			e->error = -ETIMEDOUT;
342 			memset(&e->msg, 0, sizeof(e->msg));
343 
344 			rtnl_unicast(skb, &init_net, NETLINK_CB(skb).pid);
345 		} else
346 			kfree_skb(skb);
347 	}
348 
349 	kmem_cache_free(mrt_cachep, c);
350 }
351 
352 
353 /* A single timer handles expiry for the entire unresolved queue. */
354 
355 static void ipmr_expire_process(unsigned long dummy)
356 {
357 	unsigned long now;
358 	unsigned long expires;
359 	struct mfc_cache *c, **cp;
360 
361 	if (!spin_trylock(&mfc_unres_lock)) {
362 		mod_timer(&ipmr_expire_timer, jiffies+HZ/10);
363 		return;
364 	}
365 
366 	if (atomic_read(&cache_resolve_queue_len) == 0)
367 		goto out;
368 
369 	now = jiffies;
370 	expires = 10*HZ;
371 	cp = &mfc_unres_queue;
372 
373 	while ((c=*cp) != NULL) {
374 		if (time_after(c->mfc_un.unres.expires, now)) {
375 			unsigned long interval = c->mfc_un.unres.expires - now;
376 			if (interval < expires)
377 				expires = interval;
378 			cp = &c->next;
379 			continue;
380 		}
381 
382 		*cp = c->next;
383 
384 		ipmr_destroy_unres(c);
385 	}
386 
387 	if (atomic_read(&cache_resolve_queue_len))
388 		mod_timer(&ipmr_expire_timer, jiffies + expires);
389 
390 out:
391 	spin_unlock(&mfc_unres_lock);
392 }
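
/*
 * Design note: one global timer serves the whole unresolved queue. Each
 * pass re-arms it for the soonest remaining expiry (at most 10*HZ away),
 * so an entry queued by ipmr_cache_unresolved() waits roughly ten seconds
 * for the daemon to answer before ipmr_destroy_unres() bounces its queued
 * skbs with -ETIMEDOUT.
 */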
393 
394 /* Fill the oif list. Called with mrt_lock write-locked. */
395 
396 static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
397 {
398 	int vifi;
399 
400 	cache->mfc_un.res.minvif = MAXVIFS;
401 	cache->mfc_un.res.maxvif = 0;
402 	memset(cache->mfc_un.res.ttls, 255, MAXVIFS);
403 
404 	for (vifi=0; vifi<maxvif; vifi++) {
405 		if (VIF_EXISTS(vifi) && ttls[vifi] && ttls[vifi] < 255) {
406 			cache->mfc_un.res.ttls[vifi] = ttls[vifi];
407 			if (cache->mfc_un.res.minvif > vifi)
408 				cache->mfc_un.res.minvif = vifi;
409 			if (cache->mfc_un.res.maxvif <= vifi)
410 				cache->mfc_un.res.maxvif = vifi + 1;
411 		}
412 	}
413 }
414 
415 static int vif_add(struct vifctl *vifc, int mrtsock)
416 {
417 	int vifi = vifc->vifc_vifi;
418 	struct vif_device *v = &vif_table[vifi];
419 	struct net_device *dev;
420 	struct in_device *in_dev;
421 	int err;
422 
423 	/* Is vif busy ? */
424 	if (VIF_EXISTS(vifi))
425 		return -EADDRINUSE;
426 
427 	switch (vifc->vifc_flags) {
428 #ifdef CONFIG_IP_PIMSM
429 	case VIFF_REGISTER:
430 		/*
431 		 * Special-purpose VIF used by PIM:
432 		 * all packets sent on it go to the daemon
433 		 */
434 		if (reg_vif_num >= 0)
435 			return -EADDRINUSE;
436 		dev = ipmr_reg_vif();
437 		if (!dev)
438 			return -ENOBUFS;
439 		err = dev_set_allmulti(dev, 1);
440 		if (err) {
441 			unregister_netdevice(dev);
442 			dev_put(dev);
443 			return err;
444 		}
445 		break;
446 #endif
447 	case VIFF_TUNNEL:
448 		dev = ipmr_new_tunnel(vifc);
449 		if (!dev)
450 			return -ENOBUFS;
451 		err = dev_set_allmulti(dev, 1);
452 		if (err) {
453 			ipmr_del_tunnel(dev, vifc);
454 			dev_put(dev);
455 			return err;
456 		}
457 		break;
458 	case 0:
459 		dev = ip_dev_find(&init_net, vifc->vifc_lcl_addr.s_addr);
460 		if (!dev)
461 			return -EADDRNOTAVAIL;
462 		err = dev_set_allmulti(dev, 1);
463 		if (err) {
464 			dev_put(dev);
465 			return err;
466 		}
467 		break;
468 	default:
469 		return -EINVAL;
470 	}
471 
472 	if ((in_dev = __in_dev_get_rtnl(dev)) == NULL)
473 		return -EADDRNOTAVAIL;
474 	IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
475 	ip_rt_multicast_event(in_dev);
476 
477 	/*
478 	 *	Fill in the VIF structures
479 	 */
480 	v->rate_limit=vifc->vifc_rate_limit;
481 	v->local=vifc->vifc_lcl_addr.s_addr;
482 	v->remote=vifc->vifc_rmt_addr.s_addr;
483 	v->flags=vifc->vifc_flags;
484 	if (!mrtsock)
485 		v->flags |= VIFF_STATIC;
486 	v->threshold=vifc->vifc_threshold;
487 	v->bytes_in = 0;
488 	v->bytes_out = 0;
489 	v->pkt_in = 0;
490 	v->pkt_out = 0;
491 	v->link = dev->ifindex;
492 	if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
493 		v->link = dev->iflink;
494 
495 	/* And finish update writing critical data */
496 	write_lock_bh(&mrt_lock);
497 	v->dev=dev;
498 #ifdef CONFIG_IP_PIMSM
499 	if (v->flags&VIFF_REGISTER)
500 		reg_vif_num = vifi;
501 #endif
502 	if (vifi+1 > maxvif)
503 		maxvif = vifi+1;
504 	write_unlock_bh(&mrt_lock);
505 	return 0;
506 }
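
/*
 * For reference, a hedged sketch of the userspace side of vif_add():
 * the daemon issues a setsockopt() on the raw IGMP socket on which it
 * called MRT_INIT ("igmp_sock" and "local_ip" are illustrative; flags 0
 * selects the physical interface owning vifc_lcl_addr):
 *
 *	struct vifctl vc = { 0 };
 *	vc.vifc_vifi            = 0;
 *	vc.vifc_threshold       = 1;
 *	vc.vifc_lcl_addr.s_addr = local_ip;
 *	setsockopt(igmp_sock, IPPROTO_IP, MRT_ADD_VIF, &vc, sizeof(vc));
 */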
507 
508 static struct mfc_cache *ipmr_cache_find(__be32 origin, __be32 mcastgrp)
509 {
510 	int line=MFC_HASH(mcastgrp,origin);
511 	struct mfc_cache *c;
512 
513 	for (c=mfc_cache_array[line]; c; c = c->next) {
514 		if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp)
515 			break;
516 	}
517 	return c;
518 }
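
/* The resolved cache is a plain chained hash over (group, origin);
 * MFC_HASH() comes from <linux/mroute.h>, and lookups run under
 * read_lock(&mrt_lock) only, keeping the data path cheap.
 */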
519 
520 /*
521  *	Allocate a multicast cache entry
522  */
523 static struct mfc_cache *ipmr_cache_alloc(void)
524 {
525 	struct mfc_cache *c=kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
526 	if (c==NULL)
527 		return NULL;
528 	c->mfc_un.res.minvif = MAXVIFS;
529 	return c;
530 }
531 
532 static struct mfc_cache *ipmr_cache_alloc_unres(void)
533 {
534 	struct mfc_cache *c=kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
535 	if (c==NULL)
536 		return NULL;
537 	skb_queue_head_init(&c->mfc_un.unres.unresolved);
538 	c->mfc_un.unres.expires = jiffies + 10*HZ;
539 	return c;
540 }
541 
542 /*
543  *	A cache entry has moved from the unresolved queue to the resolved state.
544  */
545 
546 static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
547 {
548 	struct sk_buff *skb;
549 	struct nlmsgerr *e;
550 
551 	/*
552 	 *	Play the pending entries through our router
553 	 */
554 
555 	while ((skb=__skb_dequeue(&uc->mfc_un.unres.unresolved))) {
556 		if (ip_hdr(skb)->version == 0) {
557 			struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
558 
559 			if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) {
560 				nlh->nlmsg_len = (skb_tail_pointer(skb) -
561 						  (u8 *)nlh);
562 			} else {
563 				nlh->nlmsg_type = NLMSG_ERROR;
564 				nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
565 				skb_trim(skb, nlh->nlmsg_len);
566 				e = NLMSG_DATA(nlh);
567 				e->error = -EMSGSIZE;
568 				memset(&e->msg, 0, sizeof(e->msg));
569 			}
570 
571 			rtnl_unicast(skb, &init_net, NETLINK_CB(skb).pid);
572 		} else
573 			ip_mr_forward(skb, c, 0);
574 	}
575 }
576 
577 /*
578  *	Bounce a cache query up to mrouted. We could use netlink for this but mrouted
579  *	expects the following bizarre scheme.
580  *
581  *	Called under mrt_lock.
582  */
583 
584 static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
585 {
586 	struct sk_buff *skb;
587 	const int ihl = ip_hdrlen(pkt);
588 	struct igmphdr *igmp;
589 	struct igmpmsg *msg;
590 	int ret;
591 
592 #ifdef CONFIG_IP_PIMSM
593 	if (assert == IGMPMSG_WHOLEPKT)
594 		skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
595 	else
596 #endif
597 		skb = alloc_skb(128, GFP_ATOMIC);
598 
599 	if (!skb)
600 		return -ENOBUFS;
601 
602 #ifdef CONFIG_IP_PIMSM
603 	if (assert == IGMPMSG_WHOLEPKT) {
604 		/* Ugly, but we have no choice with this interface.
605 		   Duplicate old header, fix ihl, length etc.
606 		   And all this only to mangle msg->im_msgtype and
607 		   to set msg->im_mbz to "mbz" :-)
608 		 */
609 		skb_push(skb, sizeof(struct iphdr));
610 		skb_reset_network_header(skb);
611 		skb_reset_transport_header(skb);
612 		msg = (struct igmpmsg *)skb_network_header(skb);
613 		memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
614 		msg->im_msgtype = IGMPMSG_WHOLEPKT;
615 		msg->im_mbz = 0;
616 		msg->im_vif = reg_vif_num;
617 		ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
618 		ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
619 					     sizeof(struct iphdr));
620 	} else
621 #endif
622 	{
623 
624 	/*
625 	 *	Copy the IP header
626 	 */
627 
628 	skb->network_header = skb->tail;
629 	skb_put(skb, ihl);
630 	skb_copy_to_linear_data(skb, pkt->data, ihl);
631 	ip_hdr(skb)->protocol = 0;			/* Flag to the kernel this is a route add */
632 	msg = (struct igmpmsg *)skb_network_header(skb);
633 	msg->im_vif = vifi;
634 	skb->dst = dst_clone(pkt->dst);
635 
636 	/*
637 	 *	Add our header
638 	 */
639 
640 	igmp=(struct igmphdr *)skb_put(skb,sizeof(struct igmphdr));
641 	igmp->type	=
642 	msg->im_msgtype = assert;
643 	igmp->code 	=	0;
644 	ip_hdr(skb)->tot_len = htons(skb->len);			/* Fix the length */
645 	skb->transport_header = skb->network_header;
646 	}
647 
648 	if (mroute_socket == NULL) {
649 		kfree_skb(skb);
650 		return -EINVAL;
651 	}
652 
653 	/*
654 	 *	Deliver to mrouted
655 	 */
656 	if ((ret=sock_queue_rcv_skb(mroute_socket,skb))<0) {
657 		if (net_ratelimit())
658 			printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
659 		kfree_skb(skb);
660 	}
661 
662 	return ret;
663 }
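
/*
 * The daemon end of this upcall is an ordinary receive on the same raw
 * IGMP socket; the buffer overlays struct igmpmsg from <linux/mroute.h>.
 * A hedged sketch ("igmp_sock", the buffer size and "handle" are
 * illustrative):
 *
 *	char buf[1500];
 *	int n = recvfrom(igmp_sock, buf, sizeof(buf), 0, NULL, NULL);
 *	struct igmpmsg *m = (struct igmpmsg *)buf;
 *	if (n > 0 && m->im_mbz == 0)		kernel upcall, not real IGMP
 *		handle(m->im_msgtype, m->im_src, m->im_dst, m->im_vif);
 */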
664 
665 /*
666  *	Queue a packet for resolution; looks up or creates the unresolved
667  *	cache entry under mfc_unres_lock.
667  */
668 
669 static int
670 ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb)
671 {
672 	int err;
673 	struct mfc_cache *c;
674 	const struct iphdr *iph = ip_hdr(skb);
675 
676 	spin_lock_bh(&mfc_unres_lock);
677 	for (c=mfc_unres_queue; c; c=c->next) {
678 		if (c->mfc_mcastgrp == iph->daddr &&
679 		    c->mfc_origin == iph->saddr)
680 			break;
681 	}
682 
683 	if (c == NULL) {
684 		/*
685 		 *	Create a new entry if allowable
686 		 */
687 
688 		if (atomic_read(&cache_resolve_queue_len)>=10 ||
689 		    (c=ipmr_cache_alloc_unres())==NULL) {
690 			spin_unlock_bh(&mfc_unres_lock);
691 
692 			kfree_skb(skb);
693 			return -ENOBUFS;
694 		}
695 
696 		/*
697 		 *	Fill in the new cache entry
698 		 */
699 		c->mfc_parent	= -1;
700 		c->mfc_origin	= iph->saddr;
701 		c->mfc_mcastgrp	= iph->daddr;
702 
703 		/*
704 		 *	Report the first packet to mrouted (an IGMPMSG_NOCACHE upcall).
705 		 */
706 		if ((err = ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE))<0) {
707 			/* If the report failed throw the cache entry
708 			   out - Brad Parker
709 			 */
710 			spin_unlock_bh(&mfc_unres_lock);
711 
712 			kmem_cache_free(mrt_cachep, c);
713 			kfree_skb(skb);
714 			return err;
715 		}
716 
717 		atomic_inc(&cache_resolve_queue_len);
718 		c->next = mfc_unres_queue;
719 		mfc_unres_queue = c;
720 
721 		mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires);
722 	}
723 
724 	/*
725 	 *	See if we can append the packet
726 	 */
727 	if (c->mfc_un.unres.unresolved.qlen>3) {
728 		kfree_skb(skb);
729 		err = -ENOBUFS;
730 	} else {
731 		skb_queue_tail(&c->mfc_un.unres.unresolved,skb);
732 		err = 0;
733 	}
734 
735 	spin_unlock_bh(&mfc_unres_lock);
736 	return err;
737 }
738 
739 /*
740  *	MFC cache manipulation by user space mroute daemon
741  */
742 
743 static int ipmr_mfc_delete(struct mfcctl *mfc)
744 {
745 	int line;
746 	struct mfc_cache *c, **cp;
747 
748 	line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
749 
750 	for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
751 		if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
752 		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
753 			write_lock_bh(&mrt_lock);
754 			*cp = c->next;
755 			write_unlock_bh(&mrt_lock);
756 
757 			kmem_cache_free(mrt_cachep, c);
758 			return 0;
759 		}
760 	}
761 	return -ENOENT;
762 }
763 
764 static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
765 {
766 	int line;
767 	struct mfc_cache *uc, *c, **cp;
768 
769 	line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
770 
771 	for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
772 		if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
773 		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr)
774 			break;
775 	}
776 
777 	if (c != NULL) {
778 		write_lock_bh(&mrt_lock);
779 		c->mfc_parent = mfc->mfcc_parent;
780 		ipmr_update_thresholds(c, mfc->mfcc_ttls);
781 		if (!mrtsock)
782 			c->mfc_flags |= MFC_STATIC;
783 		write_unlock_bh(&mrt_lock);
784 		return 0;
785 	}
786 
787 	if (!ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr))
788 		return -EINVAL;
789 
790 	c=ipmr_cache_alloc();
791 	if (c==NULL)
792 		return -ENOMEM;
793 
794 	c->mfc_origin=mfc->mfcc_origin.s_addr;
795 	c->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr;
796 	c->mfc_parent=mfc->mfcc_parent;
797 	ipmr_update_thresholds(c, mfc->mfcc_ttls);
798 	if (!mrtsock)
799 		c->mfc_flags |= MFC_STATIC;
800 
801 	write_lock_bh(&mrt_lock);
802 	c->next = mfc_cache_array[line];
803 	mfc_cache_array[line] = c;
804 	write_unlock_bh(&mrt_lock);
805 
806 	/*
807 	 *	Check whether this entry resolves a queued unresolved entry.
808 	 *	If so, replay the pending frames and tidy up.
809 	 */
810 	spin_lock_bh(&mfc_unres_lock);
811 	for (cp = &mfc_unres_queue; (uc=*cp) != NULL;
812 	     cp = &uc->next) {
813 		if (uc->mfc_origin == c->mfc_origin &&
814 		    uc->mfc_mcastgrp == c->mfc_mcastgrp) {
815 			*cp = uc->next;
816 			if (atomic_dec_and_test(&cache_resolve_queue_len))
817 				del_timer(&ipmr_expire_timer);
818 			break;
819 		}
820 	}
821 	spin_unlock_bh(&mfc_unres_lock);
822 
823 	if (uc) {
824 		ipmr_cache_resolve(uc, c);
825 		kmem_cache_free(mrt_cachep, uc);
826 	}
827 	return 0;
828 }
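
/*
 * Userspace counterpart of ipmr_mfc_add(), as a hedged sketch: the daemon
 * typically answers an IGMPMSG_NOCACHE upcall by installing the route it
 * computed ("igmp_sock", "src", "grp" and the vif indices are illustrative):
 *
 *	struct mfcctl mc = { 0 };
 *	mc.mfcc_origin.s_addr   = src;
 *	mc.mfcc_mcastgrp.s_addr = grp;
 *	mc.mfcc_parent          = iif_vif;
 *	mc.mfcc_ttls[oif_vif]   = 1;		forward where TTL > threshold
 *	setsockopt(igmp_sock, IPPROTO_IP, MRT_ADD_MFC, &mc, sizeof(mc));
 */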
829 
830 /*
831  *	Close the multicast socket, and clear the vif tables etc
832  */
833 
834 static void mroute_clean_tables(struct sock *sk)
835 {
836 	int i;
837 
838 	/*
839 	 *	Shut down all active vif entries
840 	 */
841 	for (i=0; i<maxvif; i++) {
842 		if (!(vif_table[i].flags&VIFF_STATIC))
843 			vif_delete(i, 0);
844 	}
845 
846 	/*
847 	 *	Wipe the cache
848 	 */
849 	for (i=0;i<MFC_LINES;i++) {
850 		struct mfc_cache *c, **cp;
851 
852 		cp = &mfc_cache_array[i];
853 		while ((c = *cp) != NULL) {
854 			if (c->mfc_flags&MFC_STATIC) {
855 				cp = &c->next;
856 				continue;
857 			}
858 			write_lock_bh(&mrt_lock);
859 			*cp = c->next;
860 			write_unlock_bh(&mrt_lock);
861 
862 			kmem_cache_free(mrt_cachep, c);
863 		}
864 	}
865 
866 	if (atomic_read(&cache_resolve_queue_len) != 0) {
867 		struct mfc_cache *c;
868 
869 		spin_lock_bh(&mfc_unres_lock);
870 		while (mfc_unres_queue != NULL) {
871 			c = mfc_unres_queue;
872 			mfc_unres_queue = c->next;
873 			spin_unlock_bh(&mfc_unres_lock);
874 
875 			ipmr_destroy_unres(c);
876 
877 			spin_lock_bh(&mfc_unres_lock);
878 		}
879 		spin_unlock_bh(&mfc_unres_lock);
880 	}
881 }
882 
883 static void mrtsock_destruct(struct sock *sk)
884 {
885 	rtnl_lock();
886 	if (sk == mroute_socket) {
887 		IPV4_DEVCONF_ALL(sock_net(sk), MC_FORWARDING)--;
888 
889 		write_lock_bh(&mrt_lock);
890 		mroute_socket=NULL;
891 		write_unlock_bh(&mrt_lock);
892 
893 		mroute_clean_tables(sk);
894 	}
895 	rtnl_unlock();
896 }
897 
898 /*
899  *	Socket options and virtual interface manipulation. The whole
900  *	virtual interface system is a complete heap, but unfortunately
901  *	that's how BSD mrouted happens to think. Maybe one day with a proper
902  *	MOSPF/PIM router set up we can clean this up.
903  */
904 
905 int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int optlen)
906 {
907 	int ret;
908 	struct vifctl vif;
909 	struct mfcctl mfc;
910 
911 	if (optname != MRT_INIT) {
912 		if (sk != mroute_socket && !capable(CAP_NET_ADMIN))
913 			return -EACCES;
914 	}
915 
916 	switch (optname) {
917 	case MRT_INIT:
918 		if (sk->sk_type != SOCK_RAW ||
919 		    inet_sk(sk)->num != IPPROTO_IGMP)
920 			return -EOPNOTSUPP;
921 		if (optlen!=sizeof(int))
922 			return -ENOPROTOOPT;
923 
924 		rtnl_lock();
925 		if (mroute_socket) {
926 			rtnl_unlock();
927 			return -EADDRINUSE;
928 		}
929 
930 		ret = ip_ra_control(sk, 1, mrtsock_destruct);
931 		if (ret == 0) {
932 			write_lock_bh(&mrt_lock);
933 			mroute_socket=sk;
934 			write_unlock_bh(&mrt_lock);
935 
936 			IPV4_DEVCONF_ALL(sock_net(sk), MC_FORWARDING)++;
937 		}
938 		rtnl_unlock();
939 		return ret;
940 	case MRT_DONE:
941 		if (sk!=mroute_socket)
942 			return -EACCES;
943 		return ip_ra_control(sk, 0, NULL);
944 	case MRT_ADD_VIF:
945 	case MRT_DEL_VIF:
946 		if (optlen!=sizeof(vif))
947 			return -EINVAL;
948 		if (copy_from_user(&vif,optval,sizeof(vif)))
949 			return -EFAULT;
950 		if (vif.vifc_vifi >= MAXVIFS)
951 			return -ENFILE;
952 		rtnl_lock();
953 		if (optname==MRT_ADD_VIF) {
954 			ret = vif_add(&vif, sk==mroute_socket);
955 		} else {
956 			ret = vif_delete(vif.vifc_vifi, 0);
957 		}
958 		rtnl_unlock();
959 		return ret;
960 
961 		/*
962 		 *	Manipulate the forwarding caches. These live
963 		 *	in a sort of kernel/user symbiosis.
964 		 */
965 	case MRT_ADD_MFC:
966 	case MRT_DEL_MFC:
967 		if (optlen!=sizeof(mfc))
968 			return -EINVAL;
969 		if (copy_from_user(&mfc,optval, sizeof(mfc)))
970 			return -EFAULT;
971 		rtnl_lock();
972 		if (optname==MRT_DEL_MFC)
973 			ret = ipmr_mfc_delete(&mfc);
974 		else
975 			ret = ipmr_mfc_add(&mfc, sk==mroute_socket);
976 		rtnl_unlock();
977 		return ret;
978 		/*
979 		 *	Control PIM assert.
980 		 */
981 	case MRT_ASSERT:
982 	{
983 		int v;
984 		if (get_user(v,(int __user *)optval))
985 			return -EFAULT;
986 		mroute_do_assert=(v)?1:0;
987 		return 0;
988 	}
989 #ifdef CONFIG_IP_PIMSM
990 	case MRT_PIM:
991 	{
992 		int v;
993 
994 		if (get_user(v,(int __user *)optval))
995 			return -EFAULT;
996 		v = (v) ? 1 : 0;
997 
998 		rtnl_lock();
999 		ret = 0;
1000 		if (v != mroute_do_pim) {
1001 			mroute_do_pim = v;
1002 			mroute_do_assert = v;
1003 #ifdef CONFIG_IP_PIMSM_V2
1004 			if (mroute_do_pim)
1005 				ret = inet_add_protocol(&pim_protocol,
1006 							IPPROTO_PIM);
1007 			else
1008 				ret = inet_del_protocol(&pim_protocol,
1009 							IPPROTO_PIM);
1010 			if (ret < 0)
1011 				ret = -EAGAIN;
1012 #endif
1013 		}
1014 		rtnl_unlock();
1015 		return ret;
1016 	}
1017 #endif
1018 	/*
1019 	 *	Spurious command, or MRT_VERSION which you cannot
1020 	 *	set.
1021 	 */
1022 	default:
1023 		return -ENOPROTOOPT;
1024 	}
1025 }
1026 
1027 /*
1028  *	Getsockopt support for the multicast routing system.
1029  */
1030 
1031 int ip_mroute_getsockopt(struct sock *sk,int optname,char __user *optval,int __user *optlen)
1032 {
1033 	int olr;
1034 	int val;
1035 
1036 	if (optname!=MRT_VERSION &&
1037 #ifdef CONFIG_IP_PIMSM
1038 	   optname!=MRT_PIM &&
1039 #endif
1040 	   optname!=MRT_ASSERT)
1041 		return -ENOPROTOOPT;
1042 
1043 	if (get_user(olr, optlen))
1044 		return -EFAULT;
1045 
1046 	olr = min_t(unsigned int, olr, sizeof(int));
1047 	if (olr < 0)
1048 		return -EINVAL;
1049 
1050 	if (put_user(olr,optlen))
1051 		return -EFAULT;
1052 	if (optname==MRT_VERSION)
1053 		val=0x0305;
1054 #ifdef CONFIG_IP_PIMSM
1055 	else if (optname==MRT_PIM)
1056 		val=mroute_do_pim;
1057 #endif
1058 	else
1059 		val=mroute_do_assert;
1060 	if (copy_to_user(optval,&val,olr))
1061 		return -EFAULT;
1062 	return 0;
1063 }
1064 
1065 /*
1066  *	The IP multicast ioctl support routines.
1067  */
1068 
1069 int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
1070 {
1071 	struct sioc_sg_req sr;
1072 	struct sioc_vif_req vr;
1073 	struct vif_device *vif;
1074 	struct mfc_cache *c;
1075 
1076 	switch (cmd) {
1077 	case SIOCGETVIFCNT:
1078 		if (copy_from_user(&vr,arg,sizeof(vr)))
1079 			return -EFAULT;
1080 		if (vr.vifi>=maxvif)
1081 			return -EINVAL;
1082 		read_lock(&mrt_lock);
1083 		vif=&vif_table[vr.vifi];
1084 		if (VIF_EXISTS(vr.vifi))	{
1085 			vr.icount=vif->pkt_in;
1086 			vr.ocount=vif->pkt_out;
1087 			vr.ibytes=vif->bytes_in;
1088 			vr.obytes=vif->bytes_out;
1089 			read_unlock(&mrt_lock);
1090 
1091 			if (copy_to_user(arg,&vr,sizeof(vr)))
1092 				return -EFAULT;
1093 			return 0;
1094 		}
1095 		read_unlock(&mrt_lock);
1096 		return -EADDRNOTAVAIL;
1097 	case SIOCGETSGCNT:
1098 		if (copy_from_user(&sr,arg,sizeof(sr)))
1099 			return -EFAULT;
1100 
1101 		read_lock(&mrt_lock);
1102 		c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr);
1103 		if (c) {
1104 			sr.pktcnt = c->mfc_un.res.pkt;
1105 			sr.bytecnt = c->mfc_un.res.bytes;
1106 			sr.wrong_if = c->mfc_un.res.wrong_if;
1107 			read_unlock(&mrt_lock);
1108 
1109 			if (copy_to_user(arg,&sr,sizeof(sr)))
1110 				return -EFAULT;
1111 			return 0;
1112 		}
1113 		read_unlock(&mrt_lock);
1114 		return -EADDRNOTAVAIL;
1115 	default:
1116 		return -ENOIOCTLCMD;
1117 	}
1118 }
1119 
1120 
1121 static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
1122 {
1123 	struct net_device *dev = ptr;
1124 	struct vif_device *v;
1125 	int ct;
1126 
1127 	if (!net_eq(dev_net(dev), &init_net))
1128 		return NOTIFY_DONE;
1129 
1130 	if (event != NETDEV_UNREGISTER)
1131 		return NOTIFY_DONE;
1132 	v=&vif_table[0];
1133 	for (ct=0;ct<maxvif;ct++,v++) {
1134 		if (v->dev==dev)
1135 			vif_delete(ct, 1);
1136 	}
1137 	return NOTIFY_DONE;
1138 }
1139 
1140 
1141 static struct notifier_block ip_mr_notifier={
1142 	.notifier_call = ipmr_device_event,
1143 };
1144 
1145 /*
1146  * 	Encapsulate a packet by attaching a valid IPIP header to it.
1147  *	This bypasses the tunnel drivers and other overhead, giving us the
1148  *	speed that matters for multicast video.
1149  */
1150 
1151 static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
1152 {
1153 	struct iphdr *iph;
1154 	struct iphdr *old_iph = ip_hdr(skb);
1155 
1156 	skb_push(skb, sizeof(struct iphdr));
1157 	skb->transport_header = skb->network_header;
1158 	skb_reset_network_header(skb);
1159 	iph = ip_hdr(skb);
1160 
1161 	iph->version	= 	4;
1162 	iph->tos	=	old_iph->tos;
1163 	iph->ttl	=	old_iph->ttl;
1164 	iph->frag_off	=	0;
1165 	iph->daddr	=	daddr;
1166 	iph->saddr	=	saddr;
1167 	iph->protocol	=	IPPROTO_IPIP;
1168 	iph->ihl	=	5;
1169 	iph->tot_len	=	htons(skb->len);
1170 	ip_select_ident(iph, skb->dst, NULL);
1171 	ip_send_check(iph);
1172 
1173 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
1174 	nf_reset(skb);
1175 }
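
/* The result is | new outer IPIP header | original IP packet |; the outer
 * TOS and TTL are copied from the inner header, so the tunnelled packet
 * keeps the original packet's scope.
 */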
1176 
1177 static inline int ipmr_forward_finish(struct sk_buff *skb)
1178 {
1179 	struct ip_options * opt	= &(IPCB(skb)->opt);
1180 
1181 	IP_INC_STATS_BH(dev_net(skb->dst->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
1182 
1183 	if (unlikely(opt->optlen))
1184 		ip_forward_options(skb);
1185 
1186 	return dst_output(skb);
1187 }
1188 
1189 /*
1190  *	Processing handlers for ipmr_forward
1191  */
1192 
1193 static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
1194 {
1195 	const struct iphdr *iph = ip_hdr(skb);
1196 	struct vif_device *vif = &vif_table[vifi];
1197 	struct net_device *dev;
1198 	struct rtable *rt;
1199 	int    encap = 0;
1200 
1201 	if (vif->dev == NULL)
1202 		goto out_free;
1203 
1204 #ifdef CONFIG_IP_PIMSM
1205 	if (vif->flags & VIFF_REGISTER) {
1206 		vif->pkt_out++;
1207 		vif->bytes_out+=skb->len;
1208 		vif->dev->stats.tx_bytes += skb->len;
1209 		vif->dev->stats.tx_packets++;
1210 		ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT);
1211 		kfree_skb(skb);
1212 		return;
1213 	}
1214 #endif
1215 
1216 	if (vif->flags&VIFF_TUNNEL) {
1217 		struct flowi fl = { .oif = vif->link,
1218 				    .nl_u = { .ip4_u =
1219 					      { .daddr = vif->remote,
1220 						.saddr = vif->local,
1221 						.tos = RT_TOS(iph->tos) } },
1222 				    .proto = IPPROTO_IPIP };
1223 		if (ip_route_output_key(&init_net, &rt, &fl))
1224 			goto out_free;
1225 		encap = sizeof(struct iphdr);
1226 	} else {
1227 		struct flowi fl = { .oif = vif->link,
1228 				    .nl_u = { .ip4_u =
1229 					      { .daddr = iph->daddr,
1230 						.tos = RT_TOS(iph->tos) } },
1231 				    .proto = IPPROTO_IPIP };
1232 		if (ip_route_output_key(&init_net, &rt, &fl))
1233 			goto out_free;
1234 	}
1235 
1236 	dev = rt->u.dst.dev;
1237 
1238 	if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
1239 		/* Do not fragment multicasts. Alas, IPv4 gives us no way
1240 		   to send ICMP here, so oversized packets simply vanish
1241 		   into a black hole.
1242 		 */
1243 
1244 		IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
1245 		ip_rt_put(rt);
1246 		goto out_free;
1247 	}
1248 
1249 	encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;
1250 
1251 	if (skb_cow(skb, encap)) {
1252 		ip_rt_put(rt);
1253 		goto out_free;
1254 	}
1255 
1256 	vif->pkt_out++;
1257 	vif->bytes_out+=skb->len;
1258 
1259 	dst_release(skb->dst);
1260 	skb->dst = &rt->u.dst;
1261 	ip_decrease_ttl(ip_hdr(skb));
1262 
1263 	/* FIXME: forward and output firewalls used to be called here.
1264 	 * What do we do with netfilter? -- RR */
1265 	if (vif->flags & VIFF_TUNNEL) {
1266 		ip_encap(skb, vif->local, vif->remote);
1267 		/* FIXME: extra output firewall step used to be here. --RR */
1268 		vif->dev->stats.tx_packets++;
1269 		vif->dev->stats.tx_bytes += skb->len;
1270 	}
1271 
1272 	IPCB(skb)->flags |= IPSKB_FORWARDED;
1273 
1274 	/*
1275 	 * RFC 1584 teaches that a DVMRP/PIM router must deliver packets locally
1276 	 * not only before forwarding, but also after forwarding on all output
1277 	 * interfaces. Clearly, if the mrouter runs a multicast application, it
1278 	 * should receive packets regardless of which interface the application
1279 	 * joined on.
1280 	 * If we did not do this, the application would have to join on all
1281 	 * interfaces. On the other hand, a multihomed host (or a router that is
1282 	 * not an mrouter) cannot join on more than one interface - that would
1283 	 * result in receiving duplicate packets.
1284 	 */
1285 	NF_HOOK(PF_INET, NF_INET_FORWARD, skb, skb->dev, dev,
1286 		ipmr_forward_finish);
1287 	return;
1288 
1289 out_free:
1290 	kfree_skb(skb);
1291 	return;
1292 }
1293 
1294 static int ipmr_find_vif(struct net_device *dev)
1295 {
1296 	int ct;
1297 	for (ct=maxvif-1; ct>=0; ct--) {
1298 		if (vif_table[ct].dev == dev)
1299 			break;
1300 	}
1301 	return ct;
1302 }
1303 
1304 /* "local" means that we should preserve one skb (for local delivery) */
1305 
1306 static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local)
1307 {
1308 	int psend = -1;
1309 	int vif, ct;
1310 
1311 	vif = cache->mfc_parent;
1312 	cache->mfc_un.res.pkt++;
1313 	cache->mfc_un.res.bytes += skb->len;
1314 
1315 	/*
1316 	 * Wrong interface: drop packet and (maybe) send PIM assert.
1317 	 */
1318 	if (vif_table[vif].dev != skb->dev) {
1319 		int true_vifi;
1320 
1321 		if (skb->rtable->fl.iif == 0) {
1322 			/* It is our own packet, looped back.
1323 			   A very awkward situation...
1324 
1325 			   The best workaround, until the routing daemons are
1326 			   fixed, is not to redistribute a packet if it was
1327 			   sent through the wrong interface. This means that
1328 			   multicast applications WILL NOT work for (S,G)
1329 			   entries whose default multicast route points to the
1330 			   wrong oif. In any case, running multicast
1331 			   applications on a router is not a good idea.
1332 			 */
1333 			goto dont_forward;
1334 		}
1335 
1336 		cache->mfc_un.res.wrong_if++;
1337 		true_vifi = ipmr_find_vif(skb->dev);
1338 
1339 		if (true_vifi >= 0 && mroute_do_assert &&
1340 		    /* PIM-SM uses asserts when switching from the RPT to the
1341 		       SPT, so we cannot simply check that the packet arrived
1342 		       on an oif. That is bad, but otherwise we would need to
1343 		       move a pretty large chunk of pimd into the kernel. Ough... --ANK
1344 		     */
1345 		    (mroute_do_pim || cache->mfc_un.res.ttls[true_vifi] < 255) &&
1346 		    time_after(jiffies,
1347 			       cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
1348 			cache->mfc_un.res.last_assert = jiffies;
1349 			ipmr_cache_report(skb, true_vifi, IGMPMSG_WRONGVIF);
1350 		}
1351 		goto dont_forward;
1352 	}
1353 
1354 	vif_table[vif].pkt_in++;
1355 	vif_table[vif].bytes_in+=skb->len;
1356 
1357 	/*
1358 	 *	Forward the frame
1359 	 */
1360 	for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
1361 		if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
1362 			if (psend != -1) {
1363 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1364 				if (skb2)
1365 					ipmr_queue_xmit(skb2, cache, psend);
1366 			}
1367 			psend=ct;
1368 		}
1369 	}
1370 	if (psend != -1) {
1371 		if (local) {
1372 			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1373 			if (skb2)
1374 				ipmr_queue_xmit(skb2, cache, psend);
1375 		} else {
1376 			ipmr_queue_xmit(skb, cache, psend);
1377 			return 0;
1378 		}
1379 	}
1380 
1381 dont_forward:
1382 	if (!local)
1383 		kfree_skb(skb);
1384 	return 0;
1385 }
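
/*
 * Note the cloning strategy above: every matching output vif except the
 * last gets an skb_clone(), and the original skb is consumed by the final
 * ipmr_queue_xmit() - unless 'local' asked us to preserve one copy for
 * local delivery, in which case the last vif is cloned as well.
 */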
1386 
1387 
1388 /*
1389  *	Multicast packets for forwarding arrive here
1390  */
1391 
1392 int ip_mr_input(struct sk_buff *skb)
1393 {
1394 	struct mfc_cache *cache;
1395 	int local = skb->rtable->rt_flags&RTCF_LOCAL;
1396 
1397 	/* The packet was looped back after forwarding; it must not be
1398 	   forwarded a second time, but it may still be delivered locally.
1399 	 */
1400 	if (IPCB(skb)->flags&IPSKB_FORWARDED)
1401 		goto dont_forward;
1402 
1403 	if (!local) {
1404 		    if (IPCB(skb)->opt.router_alert) {
1405 			    if (ip_call_ra_chain(skb))
1406 				    return 0;
1407 		    } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP){
1408 			    /* IGMPv1 (and broken IGMPv2 implementations such as
1409 			       Cisco IOS <= 11.2(8)) do not put the Router Alert
1410 			       option in IGMP packets destined to routable
1411 			       groups. This is very bad, because it means that
1412 			       otherwise we could forward NO IGMP messages.
1413 			     */
1414 			    read_lock(&mrt_lock);
1415 			    if (mroute_socket) {
1416 				    nf_reset(skb);
1417 				    raw_rcv(mroute_socket, skb);
1418 				    read_unlock(&mrt_lock);
1419 				    return 0;
1420 			    }
1421 			    read_unlock(&mrt_lock);
1422 		    }
1423 	}
1424 
1425 	read_lock(&mrt_lock);
1426 	cache = ipmr_cache_find(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
1427 
1428 	/*
1429 	 *	No usable cache entry
1430 	 */
1431 	if (cache==NULL) {
1432 		int vif;
1433 
1434 		if (local) {
1435 			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1436 			ip_local_deliver(skb);
1437 			if (skb2 == NULL) {
1438 				read_unlock(&mrt_lock);
1439 				return -ENOBUFS;
1440 			}
1441 			skb = skb2;
1442 		}
1443 
1444 		vif = ipmr_find_vif(skb->dev);
1445 		if (vif >= 0) {
1446 			int err = ipmr_cache_unresolved(vif, skb);
1447 			read_unlock(&mrt_lock);
1448 
1449 			return err;
1450 		}
1451 		read_unlock(&mrt_lock);
1452 		kfree_skb(skb);
1453 		return -ENODEV;
1454 	}
1455 
1456 	ip_mr_forward(skb, cache, local);
1457 
1458 	read_unlock(&mrt_lock);
1459 
1460 	if (local)
1461 		return ip_local_deliver(skb);
1462 
1463 	return 0;
1464 
1465 dont_forward:
1466 	if (local)
1467 		return ip_local_deliver(skb);
1468 	kfree_skb(skb);
1469 	return 0;
1470 }
1471 
1472 #ifdef CONFIG_IP_PIMSM_V1
1473 /*
1474  * Handle IGMP messages of PIMv1
1475  */
1476 
1477 int pim_rcv_v1(struct sk_buff * skb)
1478 {
1479 	struct igmphdr *pim;
1480 	struct iphdr   *encap;
1481 	struct net_device  *reg_dev = NULL;
1482 
1483 	if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
1484 		goto drop;
1485 
1486 	pim = igmp_hdr(skb);
1487 
1488 	if (!mroute_do_pim ||
1489 	    skb->len < sizeof(*pim) + sizeof(*encap) ||
1490 	    pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
1491 		goto drop;
1492 
1493 	encap = (struct iphdr *)(skb_transport_header(skb) +
1494 				 sizeof(struct igmphdr));
1495 	/*
1496 	   Check that:
1497 	   a. packet is really destined to a multicast group
1498 	   b. packet is not a NULL-REGISTER
1499 	   c. packet is not truncated
1500 	 */
1501 	if (!ipv4_is_multicast(encap->daddr) ||
1502 	    encap->tot_len == 0 ||
1503 	    ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
1504 		goto drop;
1505 
1506 	read_lock(&mrt_lock);
1507 	if (reg_vif_num >= 0)
1508 		reg_dev = vif_table[reg_vif_num].dev;
1509 	if (reg_dev)
1510 		dev_hold(reg_dev);
1511 	read_unlock(&mrt_lock);
1512 
1513 	if (reg_dev == NULL)
1514 		goto drop;
1515 
1516 	skb->mac_header = skb->network_header;
1517 	skb_pull(skb, (u8*)encap - skb->data);
1518 	skb_reset_network_header(skb);
1519 	skb->dev = reg_dev;
1520 	skb->protocol = htons(ETH_P_IP);
1521 	skb->ip_summed = 0;
1522 	skb->pkt_type = PACKET_HOST;
1523 	dst_release(skb->dst);
1524 	skb->dst = NULL;
1525 	reg_dev->stats.rx_bytes += skb->len;
1526 	reg_dev->stats.rx_packets++;
1527 	nf_reset(skb);
1528 	netif_rx(skb);
1529 	dev_put(reg_dev);
1530 	return 0;
1531  drop:
1532 	kfree_skb(skb);
1533 	return 0;
1534 }
1535 #endif
1536 
1537 #ifdef CONFIG_IP_PIMSM_V2
1538 static int pim_rcv(struct sk_buff * skb)
1539 {
1540 	struct pimreghdr *pim;
1541 	struct iphdr   *encap;
1542 	struct net_device  *reg_dev = NULL;
1543 
1544 	if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
1545 		goto drop;
1546 
1547 	pim = (struct pimreghdr *)skb_transport_header(skb);
1548 	if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
1549 	    (pim->flags&PIM_NULL_REGISTER) ||
1550 	    (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
1551 	     csum_fold(skb_checksum(skb, 0, skb->len, 0))))
1552 		goto drop;
1553 
1554 	/* check that the inner packet is destined to a multicast group */
1555 	encap = (struct iphdr *)(skb_transport_header(skb) +
1556 				 sizeof(struct pimreghdr));
1557 	if (!ipv4_is_multicast(encap->daddr) ||
1558 	    encap->tot_len == 0 ||
1559 	    ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
1560 		goto drop;
1561 
1562 	read_lock(&mrt_lock);
1563 	if (reg_vif_num >= 0)
1564 		reg_dev = vif_table[reg_vif_num].dev;
1565 	if (reg_dev)
1566 		dev_hold(reg_dev);
1567 	read_unlock(&mrt_lock);
1568 
1569 	if (reg_dev == NULL)
1570 		goto drop;
1571 
1572 	skb->mac_header = skb->network_header;
1573 	skb_pull(skb, (u8*)encap - skb->data);
1574 	skb_reset_network_header(skb);
1575 	skb->dev = reg_dev;
1576 	skb->protocol = htons(ETH_P_IP);
1577 	skb->ip_summed = 0;
1578 	skb->pkt_type = PACKET_HOST;
1579 	dst_release(skb->dst);
1580 	reg_dev->stats.rx_bytes += skb->len;
1581 	reg_dev->stats.rx_packets++;
1582 	skb->dst = NULL;
1583 	nf_reset(skb);
1584 	netif_rx(skb);
1585 	dev_put(reg_dev);
1586 	return 0;
1587  drop:
1588 	kfree_skb(skb);
1589 	return 0;
1590 }
1591 #endif
1592 
1593 static int
1594 ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
1595 {
1596 	int ct;
1597 	struct rtnexthop *nhp;
1598 	struct net_device *dev = vif_table[c->mfc_parent].dev;
1599 	u8 *b = skb_tail_pointer(skb);
1600 	struct rtattr *mp_head;
1601 
1602 	if (dev)
1603 		RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);
1604 
1605 	mp_head = (struct rtattr*)skb_put(skb, RTA_LENGTH(0));
1606 
1607 	for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
1608 		if (c->mfc_un.res.ttls[ct] < 255) {
1609 			if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
1610 				goto rtattr_failure;
1611 			nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
1612 			nhp->rtnh_flags = 0;
1613 			nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
1614 			nhp->rtnh_ifindex = vif_table[ct].dev->ifindex;
1615 			nhp->rtnh_len = sizeof(*nhp);
1616 		}
1617 	}
1618 	mp_head->rta_type = RTA_MULTIPATH;
1619 	mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head;
1620 	rtm->rtm_type = RTN_MULTICAST;
1621 	return 1;
1622 
1623 rtattr_failure:
1624 	nlmsg_trim(skb, b);
1625 	return -EMSGSIZE;
1626 }
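
/* ipmr_fill_mroute() reuses the unicast RTA_MULTIPATH nexthop encoding:
 * each output vif becomes an rtnexthop whose rtnh_hops field carries the
 * TTL threshold rather than a real hop count.
 */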
1627 
1628 int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
1629 {
1630 	int err;
1631 	struct mfc_cache *cache;
1632 	struct rtable *rt = skb->rtable;
1633 
1634 	read_lock(&mrt_lock);
1635 	cache = ipmr_cache_find(rt->rt_src, rt->rt_dst);
1636 
1637 	if (cache==NULL) {
1638 		struct sk_buff *skb2;
1639 		struct iphdr *iph;
1640 		struct net_device *dev;
1641 		int vif;
1642 
1643 		if (nowait) {
1644 			read_unlock(&mrt_lock);
1645 			return -EAGAIN;
1646 		}
1647 
1648 		dev = skb->dev;
1649 		if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) {
1650 			read_unlock(&mrt_lock);
1651 			return -ENODEV;
1652 		}
1653 		skb2 = skb_clone(skb, GFP_ATOMIC);
1654 		if (!skb2) {
1655 			read_unlock(&mrt_lock);
1656 			return -ENOMEM;
1657 		}
1658 
1659 		skb_push(skb2, sizeof(struct iphdr));
1660 		skb_reset_network_header(skb2);
1661 		iph = ip_hdr(skb2);
1662 		iph->ihl = sizeof(struct iphdr) >> 2;
1663 		iph->saddr = rt->rt_src;
1664 		iph->daddr = rt->rt_dst;
1665 		iph->version = 0;
1666 		err = ipmr_cache_unresolved(vif, skb2);
1667 		read_unlock(&mrt_lock);
1668 		return err;
1669 	}
1670 
1671 	if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
1672 		cache->mfc_flags |= MFC_NOTIFY;
1673 	err = ipmr_fill_mroute(skb, cache, rtm);
1674 	read_unlock(&mrt_lock);
1675 	return err;
1676 }
1677 
1678 #ifdef CONFIG_PROC_FS
1679 /*
1680  *	The /proc interfaces to multicast routing: /proc/net/ip_mr_cache and /proc/net/ip_mr_vif
1681  */
1682 struct ipmr_vif_iter {
1683 	int ct;
1684 };
1685 
1686 static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter,
1687 					   loff_t pos)
1688 {
1689 	for (iter->ct = 0; iter->ct < maxvif; ++iter->ct) {
1690 		if (!VIF_EXISTS(iter->ct))
1691 			continue;
1692 		if (pos-- == 0)
1693 			return &vif_table[iter->ct];
1694 	}
1695 	return NULL;
1696 }
1697 
1698 static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
1699 	__acquires(mrt_lock)
1700 {
1701 	read_lock(&mrt_lock);
1702 	return *pos ? ipmr_vif_seq_idx(seq->private, *pos - 1)
1703 		: SEQ_START_TOKEN;
1704 }
1705 
1706 static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1707 {
1708 	struct ipmr_vif_iter *iter = seq->private;
1709 
1710 	++*pos;
1711 	if (v == SEQ_START_TOKEN)
1712 		return ipmr_vif_seq_idx(iter, 0);
1713 
1714 	while (++iter->ct < maxvif) {
1715 		if (!VIF_EXISTS(iter->ct))
1716 			continue;
1717 		return &vif_table[iter->ct];
1718 	}
1719 	return NULL;
1720 }
1721 
1722 static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
1723 	__releases(mrt_lock)
1724 {
1725 	read_unlock(&mrt_lock);
1726 }
1727 
1728 static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
1729 {
1730 	if (v == SEQ_START_TOKEN) {
1731 		seq_puts(seq,
1732 			 "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
1733 	} else {
1734 		const struct vif_device *vif = v;
1735 		const char *name =  vif->dev ? vif->dev->name : "none";
1736 
1737 		seq_printf(seq,
1738 			   "%2Zd %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
1739 			   vif - vif_table,
1740 			   name, vif->bytes_in, vif->pkt_in,
1741 			   vif->bytes_out, vif->pkt_out,
1742 			   vif->flags, vif->local, vif->remote);
1743 	}
1744 	return 0;
1745 }
1746 
1747 static const struct seq_operations ipmr_vif_seq_ops = {
1748 	.start = ipmr_vif_seq_start,
1749 	.next  = ipmr_vif_seq_next,
1750 	.stop  = ipmr_vif_seq_stop,
1751 	.show  = ipmr_vif_seq_show,
1752 };
1753 
1754 static int ipmr_vif_open(struct inode *inode, struct file *file)
1755 {
1756 	return seq_open_private(file, &ipmr_vif_seq_ops,
1757 			sizeof(struct ipmr_vif_iter));
1758 }
1759 
1760 static const struct file_operations ipmr_vif_fops = {
1761 	.owner	 = THIS_MODULE,
1762 	.open    = ipmr_vif_open,
1763 	.read    = seq_read,
1764 	.llseek  = seq_lseek,
1765 	.release = seq_release_private,
1766 };
1767 
1768 struct ipmr_mfc_iter {
1769 	struct mfc_cache **cache;
1770 	int ct;
1771 };
1772 
1773 
1774 static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos)
1775 {
1776 	struct mfc_cache *mfc;
1777 
1778 	it->cache = mfc_cache_array;
1779 	read_lock(&mrt_lock);
1780 	for (it->ct = 0; it->ct < MFC_LINES; it->ct++)
1781 		for (mfc = mfc_cache_array[it->ct]; mfc; mfc = mfc->next)
1782 			if (pos-- == 0)
1783 				return mfc;
1784 	read_unlock(&mrt_lock);
1785 
1786 	it->cache = &mfc_unres_queue;
1787 	spin_lock_bh(&mfc_unres_lock);
1788 	for (mfc = mfc_unres_queue; mfc; mfc = mfc->next)
1789 		if (pos-- == 0)
1790 			return mfc;
1791 	spin_unlock_bh(&mfc_unres_lock);
1792 
1793 	it->cache = NULL;
1794 	return NULL;
1795 }
1796 
1797 
1798 static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
1799 {
1800 	struct ipmr_mfc_iter *it = seq->private;
1801 	it->cache = NULL;
1802 	it->ct = 0;
1803 	return *pos ? ipmr_mfc_seq_idx(seq->private, *pos - 1)
1804 		: SEQ_START_TOKEN;
1805 }
1806 
1807 static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1808 {
1809 	struct mfc_cache *mfc = v;
1810 	struct ipmr_mfc_iter *it = seq->private;
1811 
1812 	++*pos;
1813 
1814 	if (v == SEQ_START_TOKEN)
1815 		return ipmr_mfc_seq_idx(seq->private, 0);
1816 
1817 	if (mfc->next)
1818 		return mfc->next;
1819 
1820 	if (it->cache == &mfc_unres_queue)
1821 		goto end_of_list;
1822 
1823 	BUG_ON(it->cache != mfc_cache_array);
1824 
1825 	while (++it->ct < MFC_LINES) {
1826 		mfc = mfc_cache_array[it->ct];
1827 		if (mfc)
1828 			return mfc;
1829 	}
1830 
1831 	/* exhausted cache_array, show unresolved */
1832 	read_unlock(&mrt_lock);
1833 	it->cache = &mfc_unres_queue;
1834 	it->ct = 0;
1835 
1836 	spin_lock_bh(&mfc_unres_lock);
1837 	mfc = mfc_unres_queue;
1838 	if (mfc)
1839 		return mfc;
1840 
1841  end_of_list:
1842 	spin_unlock_bh(&mfc_unres_lock);
1843 	it->cache = NULL;
1844 
1845 	return NULL;
1846 }
1847 
1848 static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
1849 {
1850 	struct ipmr_mfc_iter *it = seq->private;
1851 
1852 	if (it->cache == &mfc_unres_queue)
1853 		spin_unlock_bh(&mfc_unres_lock);
1854 	else if (it->cache == mfc_cache_array)
1855 		read_unlock(&mrt_lock);
1856 }
1857 
1858 static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
1859 {
1860 	int n;
1861 
1862 	if (v == SEQ_START_TOKEN) {
1863 		seq_puts(seq,
1864 		 "Group    Origin   Iif     Pkts    Bytes    Wrong Oifs\n");
1865 	} else {
1866 		const struct mfc_cache *mfc = v;
1867 		const struct ipmr_mfc_iter *it = seq->private;
1868 
1869 		seq_printf(seq, "%08lX %08lX %-3d %8ld %8ld %8ld",
1870 			   (unsigned long) mfc->mfc_mcastgrp,
1871 			   (unsigned long) mfc->mfc_origin,
1872 			   mfc->mfc_parent,
1873 			   mfc->mfc_un.res.pkt,
1874 			   mfc->mfc_un.res.bytes,
1875 			   mfc->mfc_un.res.wrong_if);
1876 
1877 		if (it->cache != &mfc_unres_queue) {
1878 			for (n = mfc->mfc_un.res.minvif;
1879 			     n < mfc->mfc_un.res.maxvif; n++ ) {
1880 				if (VIF_EXISTS(n)
1881 				   && mfc->mfc_un.res.ttls[n] < 255)
1882 				seq_printf(seq,
1883 					   " %2d:%-3d",
1884 					   n, mfc->mfc_un.res.ttls[n]);
1885 			}
1886 		}
1887 		seq_putc(seq, '\n');
1888 	}
1889 	return 0;
1890 }
1891 
1892 static const struct seq_operations ipmr_mfc_seq_ops = {
1893 	.start = ipmr_mfc_seq_start,
1894 	.next  = ipmr_mfc_seq_next,
1895 	.stop  = ipmr_mfc_seq_stop,
1896 	.show  = ipmr_mfc_seq_show,
1897 };
1898 
1899 static int ipmr_mfc_open(struct inode *inode, struct file *file)
1900 {
1901 	return seq_open_private(file, &ipmr_mfc_seq_ops,
1902 			sizeof(struct ipmr_mfc_iter));
1903 }
1904 
1905 static const struct file_operations ipmr_mfc_fops = {
1906 	.owner	 = THIS_MODULE,
1907 	.open    = ipmr_mfc_open,
1908 	.read    = seq_read,
1909 	.llseek  = seq_lseek,
1910 	.release = seq_release_private,
1911 };
1912 #endif
1913 
1914 #ifdef CONFIG_IP_PIMSM_V2
1915 static struct net_protocol pim_protocol = {
1916 	.handler	=	pim_rcv,
1917 };
1918 #endif
1919 
1920 
1921 /*
1922  *	Setup for IP multicast routing
1923  */
1924 
1925 int __init ip_mr_init(void)
1926 {
1927 	int err;
1928 
1929 	mrt_cachep = kmem_cache_create("ip_mrt_cache",
1930 				       sizeof(struct mfc_cache),
1931 				       0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
1932 				       NULL);
1933 	if (!mrt_cachep)
1934 		return -ENOMEM;
1935 
1936 	setup_timer(&ipmr_expire_timer, ipmr_expire_process, 0);
1937 	err = register_netdevice_notifier(&ip_mr_notifier);
1938 	if (err)
1939 		goto reg_notif_fail;
1940 #ifdef CONFIG_PROC_FS
1941 	err = -ENOMEM;
1942 	if (!proc_net_fops_create(&init_net, "ip_mr_vif", 0, &ipmr_vif_fops))
1943 		goto proc_vif_fail;
1944 	if (!proc_net_fops_create(&init_net, "ip_mr_cache", 0, &ipmr_mfc_fops))
1945 		goto proc_cache_fail;
1946 #endif
1947 	return 0;
1948 #ifdef CONFIG_PROC_FS
1949 proc_cache_fail:
1950 	proc_net_remove(&init_net, "ip_mr_vif");
1951 proc_vif_fail:
1952 	unregister_netdevice_notifier(&ip_mr_notifier);
1953 #endif
1954 reg_notif_fail:
1955 	kmem_cache_destroy(mrt_cachep);
1956 	return err;
1957 }
1958