xref: /linux/net/ipv4/ipmr.c (revision b3b77c8caef1750ebeea1054e39e358550ea9f55)
1 /*
2  *	IP multicast routing support for mrouted 3.6/3.8
3  *
4  *		(c) 1995 Alan Cox, <alan@lxorguk.ukuu.org.uk>
5  *	  Linux Consultancy and Custom Driver Development
6  *
7  *	This program is free software; you can redistribute it and/or
8  *	modify it under the terms of the GNU General Public License
9  *	as published by the Free Software Foundation; either version
10  *	2 of the License, or (at your option) any later version.
11  *
12  *	Fixes:
13  *	Michael Chastain	:	Incorrect size of copying.
14  *	Alan Cox		:	Added the cache manager code
15  *	Alan Cox		:	Fixed the clone/copy bug and device race.
16  *	Mike McLagan		:	Routing by source
17  *	Malcolm Beattie		:	Buffer handling fixes.
18  *	Alexey Kuznetsov	:	Double buffer free and other fixes.
19  *	SVR Anand		:	Fixed several multicast bugs and problems.
20  *	Alexey Kuznetsov	:	Status, optimisations and more.
21  *	Brad Parker		:	Better behaviour on mrouted upcall
22  *					overflow.
23  *	Carlos Picoto		:	PIMv1 Support
24  *	Pavlin Ivanov Radoslavov:	PIMv2 Registers must checksum only PIM header
25  *					Relax this requirement to work with older peers.
26  *
27  */
28 
29 #include <asm/system.h>
30 #include <asm/uaccess.h>
31 #include <linux/types.h>
32 #include <linux/capability.h>
33 #include <linux/errno.h>
34 #include <linux/timer.h>
35 #include <linux/mm.h>
36 #include <linux/kernel.h>
37 #include <linux/fcntl.h>
38 #include <linux/stat.h>
39 #include <linux/socket.h>
40 #include <linux/in.h>
41 #include <linux/inet.h>
42 #include <linux/netdevice.h>
43 #include <linux/inetdevice.h>
44 #include <linux/igmp.h>
45 #include <linux/proc_fs.h>
46 #include <linux/seq_file.h>
47 #include <linux/mroute.h>
48 #include <linux/init.h>
49 #include <linux/if_ether.h>
50 #include <linux/slab.h>
51 #include <net/net_namespace.h>
52 #include <net/ip.h>
53 #include <net/protocol.h>
54 #include <linux/skbuff.h>
55 #include <net/route.h>
56 #include <net/sock.h>
57 #include <net/icmp.h>
58 #include <net/udp.h>
59 #include <net/raw.h>
60 #include <linux/notifier.h>
61 #include <linux/if_arp.h>
62 #include <linux/netfilter_ipv4.h>
63 #include <net/ipip.h>
64 #include <net/checksum.h>
65 #include <net/netlink.h>
66 #include <net/fib_rules.h>
67 
68 #if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
69 #define CONFIG_IP_PIMSM	1
70 #endif
71 
72 struct mr_table {
73 	struct list_head	list;
74 #ifdef CONFIG_NET_NS
75 	struct net		*net;
76 #endif
77 	u32			id;
78 	struct sock		*mroute_sk;
79 	struct timer_list	ipmr_expire_timer;
80 	struct list_head	mfc_unres_queue;
81 	struct list_head	mfc_cache_array[MFC_LINES];
82 	struct vif_device	vif_table[MAXVIFS];
83 	int			maxvif;
84 	atomic_t		cache_resolve_queue_len;
85 	int			mroute_do_assert;
86 	int			mroute_do_pim;
87 #if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
88 	int			mroute_reg_vif_num;
89 #endif
90 };
91 
92 struct ipmr_rule {
93 	struct fib_rule		common;
94 };
95 
96 struct ipmr_result {
97 	struct mr_table		*mrt;
98 };
99 
100 /* Big lock, protecting vif table, mrt cache and mroute socket state.
101    Note that changes are serialized via rtnl_lock.
102  */
103 
104 static DEFINE_RWLOCK(mrt_lock);
105 
106 /*
107  *	Multicast router control variables
108  */
109 
110 #define VIF_EXISTS(_mrt, _idx) ((_mrt)->vif_table[_idx].dev != NULL)
111 
112 /* Special spinlock for queue of unresolved entries */
113 static DEFINE_SPINLOCK(mfc_unres_lock);
114 
115 /* We return to Alan's original scheme. The hash table of resolved
116    entries is changed only in process context and is protected
117    by the weak lock mrt_lock. The queue of unresolved entries is
118    protected by the strong spinlock mfc_unres_lock.
119 
120    This way the data path is entirely free of exclusive locks.
121  */
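/* Concretely: the packet path only ever takes read_lock(&mrt_lock),
 * while the configuration paths (vif_add/vif_delete, MFC add/delete,
 * MRT_INIT/MRT_DONE) take write_lock_bh(&mrt_lock) while already
 * holding the RTNL.
 */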
122 
123 static struct kmem_cache *mrt_cachep __read_mostly;
124 
125 static struct mr_table *ipmr_new_table(struct net *net, u32 id);
126 static int ip_mr_forward(struct net *net, struct mr_table *mrt,
127 			 struct sk_buff *skb, struct mfc_cache *cache,
128 			 int local);
129 static int ipmr_cache_report(struct mr_table *mrt,
130 			     struct sk_buff *pkt, vifi_t vifi, int assert);
131 static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
132 			      struct mfc_cache *c, struct rtmsg *rtm);
133 static void ipmr_expire_process(unsigned long arg);
134 
135 #ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
136 #define ipmr_for_each_table(mrt, net) \
137 	list_for_each_entry_rcu(mrt, &net->ipv4.mr_tables, list)
138 
139 static struct mr_table *ipmr_get_table(struct net *net, u32 id)
140 {
141 	struct mr_table *mrt;
142 
143 	ipmr_for_each_table(mrt, net) {
144 		if (mrt->id == id)
145 			return mrt;
146 	}
147 	return NULL;
148 }
149 
150 static int ipmr_fib_lookup(struct net *net, struct flowi *flp,
151 			   struct mr_table **mrt)
152 {
153 	struct ipmr_result res;
154 	struct fib_lookup_arg arg = { .result = &res, };
155 	int err;
156 
157 	err = fib_rules_lookup(net->ipv4.mr_rules_ops, flp, 0, &arg);
158 	if (err < 0)
159 		return err;
160 	*mrt = res.mrt;
161 	return 0;
162 }
163 
164 static int ipmr_rule_action(struct fib_rule *rule, struct flowi *flp,
165 			    int flags, struct fib_lookup_arg *arg)
166 {
167 	struct ipmr_result *res = arg->result;
168 	struct mr_table *mrt;
169 
170 	switch (rule->action) {
171 	case FR_ACT_TO_TBL:
172 		break;
173 	case FR_ACT_UNREACHABLE:
174 		return -ENETUNREACH;
175 	case FR_ACT_PROHIBIT:
176 		return -EACCES;
177 	case FR_ACT_BLACKHOLE:
178 	default:
179 		return -EINVAL;
180 	}
181 
182 	mrt = ipmr_get_table(rule->fr_net, rule->table);
183 	if (mrt == NULL)
184 		return -EAGAIN;
185 	res->mrt = mrt;
186 	return 0;
187 }
188 
189 static int ipmr_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
190 {
191 	return 1;
192 }
193 
194 static const struct nla_policy ipmr_rule_policy[FRA_MAX + 1] = {
195 	FRA_GENERIC_POLICY,
196 };
197 
198 static int ipmr_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
199 			       struct fib_rule_hdr *frh, struct nlattr **tb)
200 {
201 	return 0;
202 }
203 
204 static int ipmr_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
205 			     struct nlattr **tb)
206 {
207 	return 1;
208 }
209 
210 static int ipmr_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
211 			  struct fib_rule_hdr *frh)
212 {
213 	frh->dst_len = 0;
214 	frh->src_len = 0;
215 	frh->tos     = 0;
216 	return 0;
217 }
218 
219 static const struct fib_rules_ops __net_initdata ipmr_rules_ops_template = {
220 	.family		= RTNL_FAMILY_IPMR,
221 	.rule_size	= sizeof(struct ipmr_rule),
222 	.addr_size	= sizeof(u32),
223 	.action		= ipmr_rule_action,
224 	.match		= ipmr_rule_match,
225 	.configure	= ipmr_rule_configure,
226 	.compare	= ipmr_rule_compare,
227 	.default_pref	= fib_default_rule_pref,
228 	.fill		= ipmr_rule_fill,
229 	.nlgroup	= RTNLGRP_IPV4_RULE,
230 	.policy		= ipmr_rule_policy,
231 	.owner		= THIS_MODULE,
232 };
233 
234 static int __net_init ipmr_rules_init(struct net *net)
235 {
236 	struct fib_rules_ops *ops;
237 	struct mr_table *mrt;
238 	int err;
239 
240 	ops = fib_rules_register(&ipmr_rules_ops_template, net);
241 	if (IS_ERR(ops))
242 		return PTR_ERR(ops);
243 
244 	INIT_LIST_HEAD(&net->ipv4.mr_tables);
245 
246 	mrt = ipmr_new_table(net, RT_TABLE_DEFAULT);
247 	if (mrt == NULL) {
248 		err = -ENOMEM;
249 		goto err1;
250 	}
251 
252 	err = fib_default_rule_add(ops, 0x7fff, RT_TABLE_DEFAULT, 0);
253 	if (err < 0)
254 		goto err2;
255 
256 	net->ipv4.mr_rules_ops = ops;
257 	return 0;
258 
259 err2:
260 	kfree(mrt);
261 err1:
262 	fib_rules_unregister(ops);
263 	return err;
264 }
265 
266 static void __net_exit ipmr_rules_exit(struct net *net)
267 {
268 	struct mr_table *mrt, *next;
269 
270 	list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list)
271 		kfree(mrt);
272 	fib_rules_unregister(net->ipv4.mr_rules_ops);
273 }
274 #else
275 #define ipmr_for_each_table(mrt, net) \
276 	for (mrt = net->ipv4.mrt; mrt; mrt = NULL)
277 
278 static struct mr_table *ipmr_get_table(struct net *net, u32 id)
279 {
280 	return net->ipv4.mrt;
281 }
282 
283 static int ipmr_fib_lookup(struct net *net, struct flowi *flp,
284 			   struct mr_table **mrt)
285 {
286 	*mrt = net->ipv4.mrt;
287 	return 0;
288 }
289 
290 static int __net_init ipmr_rules_init(struct net *net)
291 {
292 	net->ipv4.mrt = ipmr_new_table(net, RT_TABLE_DEFAULT);
293 	return net->ipv4.mrt ? 0 : -ENOMEM;
294 }
295 
296 static void __net_exit ipmr_rules_exit(struct net *net)
297 {
298 	kfree(net->ipv4.mrt);
299 }
300 #endif
301 
302 static struct mr_table *ipmr_new_table(struct net *net, u32 id)
303 {
304 	struct mr_table *mrt;
305 	unsigned int i;
306 
307 	mrt = ipmr_get_table(net, id);
308 	if (mrt != NULL)
309 		return mrt;
310 
311 	mrt = kzalloc(sizeof(*mrt), GFP_KERNEL);
312 	if (mrt == NULL)
313 		return NULL;
314 	write_pnet(&mrt->net, net);
315 	mrt->id = id;
316 
317 	/* Forwarding cache */
318 	for (i = 0; i < MFC_LINES; i++)
319 		INIT_LIST_HEAD(&mrt->mfc_cache_array[i]);
320 
321 	INIT_LIST_HEAD(&mrt->mfc_unres_queue);
322 
323 	setup_timer(&mrt->ipmr_expire_timer, ipmr_expire_process,
324 		    (unsigned long)mrt);
325 
326 #ifdef CONFIG_IP_PIMSM
327 	mrt->mroute_reg_vif_num = -1;
328 #endif
329 #ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
330 	list_add_tail_rcu(&mrt->list, &net->ipv4.mr_tables);
331 #endif
332 	return mrt;
333 }
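/* Note: with CONFIG_IP_MROUTE_MULTIPLE_TABLES a daemon selects a
 * non-default table with the MRT_TABLE socket option before MRT_INIT;
 * an illustrative userspace sketch (table id chosen arbitrarily):
 *
 *	uint32_t table = 42;
 *	setsockopt(sock, IPPROTO_IP, MRT_TABLE, &table, sizeof(table));
 */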
334 
335 /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
336 
337 static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v)
338 {
339 	struct net *net = dev_net(dev);
340 
341 	dev_close(dev);
342 
343 	dev = __dev_get_by_name(net, "tunl0");
344 	if (dev) {
345 		const struct net_device_ops *ops = dev->netdev_ops;
346 		struct ifreq ifr;
347 		struct ip_tunnel_parm p;
348 
349 		memset(&p, 0, sizeof(p));
350 		p.iph.daddr = v->vifc_rmt_addr.s_addr;
351 		p.iph.saddr = v->vifc_lcl_addr.s_addr;
352 		p.iph.version = 4;
353 		p.iph.ihl = 5;
354 		p.iph.protocol = IPPROTO_IPIP;
355 		sprintf(p.name, "dvmrp%d", v->vifc_vifi);
356 		ifr.ifr_ifru.ifru_data = (__force void __user *)&p;
357 
358 		if (ops->ndo_do_ioctl) {
359 			mm_segment_t oldfs = get_fs();
360 
361 			set_fs(KERNEL_DS);
362 			ops->ndo_do_ioctl(dev, &ifr, SIOCDELTUNNEL);
363 			set_fs(oldfs);
364 		}
365 	}
366 }
367 
368 static
369 struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
370 {
371 	struct net_device  *dev;
372 
373 	dev = __dev_get_by_name(net, "tunl0");
374 
375 	if (dev) {
376 		const struct net_device_ops *ops = dev->netdev_ops;
377 		int err;
378 		struct ifreq ifr;
379 		struct ip_tunnel_parm p;
380 		struct in_device  *in_dev;
381 
382 		memset(&p, 0, sizeof(p));
383 		p.iph.daddr = v->vifc_rmt_addr.s_addr;
384 		p.iph.saddr = v->vifc_lcl_addr.s_addr;
385 		p.iph.version = 4;
386 		p.iph.ihl = 5;
387 		p.iph.protocol = IPPROTO_IPIP;
388 		sprintf(p.name, "dvmrp%d", v->vifc_vifi);
389 		ifr.ifr_ifru.ifru_data = (__force void __user *)&p;
390 
391 		if (ops->ndo_do_ioctl) {
392 			mm_segment_t oldfs = get_fs();
393 
394 			set_fs(KERNEL_DS);
395 			err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL);
396 			set_fs(oldfs);
397 		} else
398 			err = -EOPNOTSUPP;
399 
400 		dev = NULL;
401 
402 		if (err == 0 &&
403 		    (dev = __dev_get_by_name(net, p.name)) != NULL) {
404 			dev->flags |= IFF_MULTICAST;
405 
406 			in_dev = __in_dev_get_rtnl(dev);
407 			if (in_dev == NULL)
408 				goto failure;
409 
410 			ipv4_devconf_setall(in_dev);
411 			IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
412 
413 			if (dev_open(dev))
414 				goto failure;
415 			dev_hold(dev);
416 		}
417 	}
418 	return dev;
419 
420 failure:
421 	/* allow the register to be completed before unregistering. */
422 	rtnl_unlock();
423 	rtnl_lock();
424 
425 	unregister_netdevice(dev);
426 	return NULL;
427 }
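/* Both tunnel helpers above drive the "tunl0" driver's SIOCADDTUNNEL /
 * SIOCDELTUNNEL ioctls with a kernel-space ip_tunnel_parm; the
 * set_fs(KERNEL_DS) dance is what lets the driver's copy_from_user()
 * accept that kernel pointer.
 */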
428 
429 #ifdef CONFIG_IP_PIMSM
430 
431 static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
432 {
433 	struct net *net = dev_net(dev);
434 	struct mr_table *mrt;
435 	struct flowi fl = {
436 		.oif		= dev->ifindex,
437 		.iif		= skb->skb_iif,
438 		.mark		= skb->mark,
439 	};
440 	int err;
441 
442 	err = ipmr_fib_lookup(net, &fl, &mrt);
443 	if (err < 0)
444 		return err;
445 
446 	read_lock(&mrt_lock);
447 	dev->stats.tx_bytes += skb->len;
448 	dev->stats.tx_packets++;
449 	ipmr_cache_report(mrt, skb, mrt->mroute_reg_vif_num, IGMPMSG_WHOLEPKT);
450 	read_unlock(&mrt_lock);
451 	kfree_skb(skb);
452 	return NETDEV_TX_OK;
453 }
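/* Packets the stack transmits on the pimreg device are therefore never
 * sent on the wire; they are bounced to the daemon as IGMPMSG_WHOLEPKT
 * upcalls so that it can build the PIM REGISTER message itself.
 */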
454 
455 static const struct net_device_ops reg_vif_netdev_ops = {
456 	.ndo_start_xmit	= reg_vif_xmit,
457 };
458 
459 static void reg_vif_setup(struct net_device *dev)
460 {
461 	dev->type		= ARPHRD_PIMREG;
462 	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 8;
463 	dev->flags		= IFF_NOARP;
464 	dev->netdev_ops		= &reg_vif_netdev_ops,
465 	dev->destructor		= free_netdev;
466 	dev->features		|= NETIF_F_NETNS_LOCAL;
467 }
468 
469 static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
470 {
471 	struct net_device *dev;
472 	struct in_device *in_dev;
473 	char name[IFNAMSIZ];
474 
475 	if (mrt->id == RT_TABLE_DEFAULT)
476 		sprintf(name, "pimreg");
477 	else
478 		sprintf(name, "pimreg%u", mrt->id);
479 
480 	dev = alloc_netdev(0, name, reg_vif_setup);
481 
482 	if (dev == NULL)
483 		return NULL;
484 
485 	dev_net_set(dev, net);
486 
487 	if (register_netdevice(dev)) {
488 		free_netdev(dev);
489 		return NULL;
490 	}
491 	dev->iflink = 0;
492 
493 	rcu_read_lock();
494 	if ((in_dev = __in_dev_get_rcu(dev)) == NULL) {
495 		rcu_read_unlock();
496 		goto failure;
497 	}
498 
499 	ipv4_devconf_setall(in_dev);
500 	IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
501 	rcu_read_unlock();
502 
503 	if (dev_open(dev))
504 		goto failure;
505 
506 	dev_hold(dev);
507 
508 	return dev;
509 
510 failure:
511 	/* allow the register to be completed before unregistering. */
512 	rtnl_unlock();
513 	rtnl_lock();
514 
515 	unregister_netdevice(dev);
516 	return NULL;
517 }
518 #endif
519 
520 /*
521  *	Delete a VIF entry
522  *	@notify: Set to 1 if the caller is a notifier_call
523  */
524 
525 static int vif_delete(struct mr_table *mrt, int vifi, int notify,
526 		      struct list_head *head)
527 {
528 	struct vif_device *v;
529 	struct net_device *dev;
530 	struct in_device *in_dev;
531 
532 	if (vifi < 0 || vifi >= mrt->maxvif)
533 		return -EADDRNOTAVAIL;
534 
535 	v = &mrt->vif_table[vifi];
536 
537 	write_lock_bh(&mrt_lock);
538 	dev = v->dev;
539 	v->dev = NULL;
540 
541 	if (!dev) {
542 		write_unlock_bh(&mrt_lock);
543 		return -EADDRNOTAVAIL;
544 	}
545 
546 #ifdef CONFIG_IP_PIMSM
547 	if (vifi == mrt->mroute_reg_vif_num)
548 		mrt->mroute_reg_vif_num = -1;
549 #endif
550 
551 	if (vifi+1 == mrt->maxvif) {
552 		int tmp;
553 		for (tmp=vifi-1; tmp>=0; tmp--) {
554 			if (VIF_EXISTS(mrt, tmp))
555 				break;
556 		}
557 		mrt->maxvif = tmp+1;
558 	}
559 
560 	write_unlock_bh(&mrt_lock);
561 
562 	dev_set_allmulti(dev, -1);
563 
564 	if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
565 		IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
566 		ip_rt_multicast_event(in_dev);
567 	}
568 
569 	if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER) && !notify)
570 		unregister_netdevice_queue(dev, head);
571 
572 	dev_put(dev);
573 	return 0;
574 }
575 
576 static inline void ipmr_cache_free(struct mfc_cache *c)
577 {
578 	kmem_cache_free(mrt_cachep, c);
579 }
580 
581 /* Destroy an unresolved cache entry, killing queued skbs
582    and reporting error to netlink readers.
583  */
584 
585 static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
586 {
587 	struct net *net = read_pnet(&mrt->net);
588 	struct sk_buff *skb;
589 	struct nlmsgerr *e;
590 
591 	atomic_dec(&mrt->cache_resolve_queue_len);
592 
593 	while ((skb = skb_dequeue(&c->mfc_un.unres.unresolved))) {
594 		if (ip_hdr(skb)->version == 0) {
595 			struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
596 			nlh->nlmsg_type = NLMSG_ERROR;
597 			nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
598 			skb_trim(skb, nlh->nlmsg_len);
599 			e = NLMSG_DATA(nlh);
600 			e->error = -ETIMEDOUT;
601 			memset(&e->msg, 0, sizeof(e->msg));
602 
603 			rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
604 		} else
605 			kfree_skb(skb);
606 	}
607 
608 	ipmr_cache_free(c);
609 }
610 
611 
612 /* Timer process for the unresolved queue. */
613 
614 static void ipmr_expire_process(unsigned long arg)
615 {
616 	struct mr_table *mrt = (struct mr_table *)arg;
617 	unsigned long now;
618 	unsigned long expires;
619 	struct mfc_cache *c, *next;
620 
621 	if (!spin_trylock(&mfc_unres_lock)) {
622 		mod_timer(&mrt->ipmr_expire_timer, jiffies+HZ/10);
623 		return;
624 	}
625 
626 	if (list_empty(&mrt->mfc_unres_queue))
627 		goto out;
628 
629 	now = jiffies;
630 	expires = 10*HZ;
631 
632 	list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {
633 		if (time_after(c->mfc_un.unres.expires, now)) {
634 			unsigned long interval = c->mfc_un.unres.expires - now;
635 			if (interval < expires)
636 				expires = interval;
637 			continue;
638 		}
639 
640 		list_del(&c->list);
641 		ipmr_destroy_unres(mrt, c);
642 	}
643 
644 	if (!list_empty(&mrt->mfc_unres_queue))
645 		mod_timer(&mrt->ipmr_expire_timer, jiffies + expires);
646 
647 out:
648 	spin_unlock(&mfc_unres_lock);
649 }
650 
651 /* Fill the oif list. Called with mrt_lock held for writing. */
652 
653 static void ipmr_update_thresholds(struct mr_table *mrt, struct mfc_cache *cache,
654 				   unsigned char *ttls)
655 {
656 	int vifi;
657 
658 	cache->mfc_un.res.minvif = MAXVIFS;
659 	cache->mfc_un.res.maxvif = 0;
660 	memset(cache->mfc_un.res.ttls, 255, MAXVIFS);
661 
662 	for (vifi = 0; vifi < mrt->maxvif; vifi++) {
663 		if (VIF_EXISTS(mrt, vifi) &&
664 		    ttls[vifi] && ttls[vifi] < 255) {
665 			cache->mfc_un.res.ttls[vifi] = ttls[vifi];
666 			if (cache->mfc_un.res.minvif > vifi)
667 				cache->mfc_un.res.minvif = vifi;
668 			if (cache->mfc_un.res.maxvif <= vifi)
669 				cache->mfc_un.res.maxvif = vifi + 1;
670 		}
671 	}
672 }
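/* The per-vif value stored in res.ttls[] is a TTL threshold: in
 * ip_mr_forward() a packet is only sent on vif "ct" when its IP TTL is
 * strictly greater than res.ttls[ct]; 255 marks a vif that is not an
 * output interface for this (S,G) entry.
 */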
673 
674 static int vif_add(struct net *net, struct mr_table *mrt,
675 		   struct vifctl *vifc, int mrtsock)
676 {
677 	int vifi = vifc->vifc_vifi;
678 	struct vif_device *v = &mrt->vif_table[vifi];
679 	struct net_device *dev;
680 	struct in_device *in_dev;
681 	int err;
682 
683 	/* Is vif busy ? */
684 	if (VIF_EXISTS(mrt, vifi))
685 		return -EADDRINUSE;
686 
687 	switch (vifc->vifc_flags) {
688 #ifdef CONFIG_IP_PIMSM
689 	case VIFF_REGISTER:
690 		/*
691 		 * Special Purpose VIF in PIM
692 		 * All the packets will be sent to the daemon
693 		 */
694 		if (mrt->mroute_reg_vif_num >= 0)
695 			return -EADDRINUSE;
696 		dev = ipmr_reg_vif(net, mrt);
697 		if (!dev)
698 			return -ENOBUFS;
699 		err = dev_set_allmulti(dev, 1);
700 		if (err) {
701 			unregister_netdevice(dev);
702 			dev_put(dev);
703 			return err;
704 		}
705 		break;
706 #endif
707 	case VIFF_TUNNEL:
708 		dev = ipmr_new_tunnel(net, vifc);
709 		if (!dev)
710 			return -ENOBUFS;
711 		err = dev_set_allmulti(dev, 1);
712 		if (err) {
713 			ipmr_del_tunnel(dev, vifc);
714 			dev_put(dev);
715 			return err;
716 		}
717 		break;
718 
719 	case VIFF_USE_IFINDEX:
720 	case 0:
721 		if (vifc->vifc_flags == VIFF_USE_IFINDEX) {
722 			dev = dev_get_by_index(net, vifc->vifc_lcl_ifindex);
723 			if (dev && dev->ip_ptr == NULL) {
724 				dev_put(dev);
725 				return -EADDRNOTAVAIL;
726 			}
727 		} else
728 			dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr);
729 
730 		if (!dev)
731 			return -EADDRNOTAVAIL;
732 		err = dev_set_allmulti(dev, 1);
733 		if (err) {
734 			dev_put(dev);
735 			return err;
736 		}
737 		break;
738 	default:
739 		return -EINVAL;
740 	}
741 
742 	if ((in_dev = __in_dev_get_rtnl(dev)) == NULL) {
743 		dev_put(dev);
744 		return -EADDRNOTAVAIL;
745 	}
746 	IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
747 	ip_rt_multicast_event(in_dev);
748 
749 	/*
750 	 *	Fill in the VIF structures
751 	 */
752 	v->rate_limit = vifc->vifc_rate_limit;
753 	v->local = vifc->vifc_lcl_addr.s_addr;
754 	v->remote = vifc->vifc_rmt_addr.s_addr;
755 	v->flags = vifc->vifc_flags;
756 	if (!mrtsock)
757 		v->flags |= VIFF_STATIC;
758 	v->threshold = vifc->vifc_threshold;
759 	v->bytes_in = 0;
760 	v->bytes_out = 0;
761 	v->pkt_in = 0;
762 	v->pkt_out = 0;
763 	v->link = dev->ifindex;
764 	if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
765 		v->link = dev->iflink;
766 
767 	/* And finish the update by writing the critical data */
768 	write_lock_bh(&mrt_lock);
769 	v->dev = dev;
770 #ifdef CONFIG_IP_PIMSM
771 	if (v->flags&VIFF_REGISTER)
772 		mrt->mroute_reg_vif_num = vifi;
773 #endif
774 	if (vifi+1 > mrt->maxvif)
775 		mrt->maxvif = vifi+1;
776 	write_unlock_bh(&mrt_lock);
777 	return 0;
778 }
779 
780 static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
781 					 __be32 origin,
782 					 __be32 mcastgrp)
783 {
784 	int line = MFC_HASH(mcastgrp, origin);
785 	struct mfc_cache *c;
786 
787 	list_for_each_entry(c, &mrt->mfc_cache_array[line], list) {
788 		if (c->mfc_origin == origin && c->mfc_mcastgrp == mcastgrp)
789 			return c;
790 	}
791 	return NULL;
792 }
793 
794 /*
795  *	Allocate a multicast cache entry
796  */
797 static struct mfc_cache *ipmr_cache_alloc(void)
798 {
799 	struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
800 	if (c == NULL)
801 		return NULL;
802 	c->mfc_un.res.minvif = MAXVIFS;
803 	return c;
804 }
805 
806 static struct mfc_cache *ipmr_cache_alloc_unres(void)
807 {
808 	struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
809 	if (c == NULL)
810 		return NULL;
811 	skb_queue_head_init(&c->mfc_un.unres.unresolved);
812 	c->mfc_un.unres.expires = jiffies + 10*HZ;
813 	return c;
814 }
815 
816 /*
817  *	A cache entry has moved from the unresolved queue to the resolved state
818  */
819 
820 static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
821 			       struct mfc_cache *uc, struct mfc_cache *c)
822 {
823 	struct sk_buff *skb;
824 	struct nlmsgerr *e;
825 
826 	/*
827 	 *	Play the pending entries through our router
828 	 */
829 
830 	while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) {
831 		if (ip_hdr(skb)->version == 0) {
832 			struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
833 
834 			if (__ipmr_fill_mroute(mrt, skb, c, NLMSG_DATA(nlh)) > 0) {
835 				nlh->nlmsg_len = (skb_tail_pointer(skb) -
836 						  (u8 *)nlh);
837 			} else {
838 				nlh->nlmsg_type = NLMSG_ERROR;
839 				nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
840 				skb_trim(skb, nlh->nlmsg_len);
841 				e = NLMSG_DATA(nlh);
842 				e->error = -EMSGSIZE;
843 				memset(&e->msg, 0, sizeof(e->msg));
844 			}
845 
846 			rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
847 		} else
848 			ip_mr_forward(net, mrt, skb, c, 0);
849 	}
850 }
851 
852 /*
853  *	Bounce a cache query up to mrouted. We could use netlink for this but mrouted
854  *	expects the following bizarre scheme.
855  *
856  *	Called under mrt_lock.
857  */
858 
859 static int ipmr_cache_report(struct mr_table *mrt,
860 			     struct sk_buff *pkt, vifi_t vifi, int assert)
861 {
862 	struct sk_buff *skb;
863 	const int ihl = ip_hdrlen(pkt);
864 	struct igmphdr *igmp;
865 	struct igmpmsg *msg;
866 	int ret;
867 
868 #ifdef CONFIG_IP_PIMSM
869 	if (assert == IGMPMSG_WHOLEPKT)
870 		skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
871 	else
872 #endif
873 		skb = alloc_skb(128, GFP_ATOMIC);
874 
875 	if (!skb)
876 		return -ENOBUFS;
877 
878 #ifdef CONFIG_IP_PIMSM
879 	if (assert == IGMPMSG_WHOLEPKT) {
880 		/* Ugly, but we have no choice with this interface.
881 		   Duplicate old header, fix ihl, length etc.
882 		   And all this only to mangle msg->im_msgtype and
883 		   to set msg->im_mbz to "mbz" :-)
884 		 */
885 		skb_push(skb, sizeof(struct iphdr));
886 		skb_reset_network_header(skb);
887 		skb_reset_transport_header(skb);
888 		msg = (struct igmpmsg *)skb_network_header(skb);
889 		memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
890 		msg->im_msgtype = IGMPMSG_WHOLEPKT;
891 		msg->im_mbz = 0;
892 		msg->im_vif = mrt->mroute_reg_vif_num;
893 		ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
894 		ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
895 					     sizeof(struct iphdr));
896 	} else
897 #endif
898 	{
899 
900 	/*
901 	 *	Copy the IP header
902 	 */
903 
904 	skb->network_header = skb->tail;
905 	skb_put(skb, ihl);
906 	skb_copy_to_linear_data(skb, pkt->data, ihl);
907 	ip_hdr(skb)->protocol = 0;			/* Flag to the daemon that this is an upcall */
908 	msg = (struct igmpmsg *)skb_network_header(skb);
909 	msg->im_vif = vifi;
910 	skb_dst_set(skb, dst_clone(skb_dst(pkt)));
911 
912 	/*
913 	 *	Add our header
914 	 */
915 
916 	igmp=(struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
917 	igmp->type	=
918 	msg->im_msgtype = assert;
919 	igmp->code 	=	0;
920 	ip_hdr(skb)->tot_len = htons(skb->len);			/* Fix the length */
921 	skb->transport_header = skb->network_header;
922 	}
923 
924 	if (mrt->mroute_sk == NULL) {
925 		kfree_skb(skb);
926 		return -EINVAL;
927 	}
928 
929 	/*
930 	 *	Deliver to mrouted
931 	 */
932 	ret = sock_queue_rcv_skb(mrt->mroute_sk, skb);
933 	if (ret < 0) {
934 		if (net_ratelimit())
935 			printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
936 		kfree_skb(skb);
937 	}
938 
939 	return ret;
940 }
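/* The daemon receives these upcalls on its IGMP raw socket and tells
 * them apart from real IGMP packets by the zeroed protocol field
 * (im_mbz). A minimal, illustrative userspace reader, where
 * handle_upcall() stands for whatever the daemon does with it:
 *
 *	char buf[8192];
 *	ssize_t n = recv(sock, buf, sizeof(buf), 0);
 *	struct igmpmsg *m = (struct igmpmsg *)buf;
 *	if (n > 0 && m->im_mbz == 0)	// kernel upcall, not an IGMP packet
 *		handle_upcall(m->im_msgtype, m->im_vif, m->im_src, m->im_dst);
 */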
941 
942 /*
943  *	Queue a packet for resolution, creating an unresolved cache entry if needed.
944  */
945 
946 static int
947 ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
948 {
949 	bool found = false;
950 	int err;
951 	struct mfc_cache *c;
952 	const struct iphdr *iph = ip_hdr(skb);
953 
954 	spin_lock_bh(&mfc_unres_lock);
955 	list_for_each_entry(c, &mrt->mfc_unres_queue, list) {
956 		if (c->mfc_mcastgrp == iph->daddr &&
957 		    c->mfc_origin == iph->saddr) {
958 			found = true;
959 			break;
960 		}
961 	}
962 
963 	if (!found) {
964 		/*
965 		 *	Create a new entry if allowable
966 		 */
967 
968 		if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 ||
969 		    (c = ipmr_cache_alloc_unres()) == NULL) {
970 			spin_unlock_bh(&mfc_unres_lock);
971 
972 			kfree_skb(skb);
973 			return -ENOBUFS;
974 		}
975 
976 		/*
977 		 *	Fill in the new cache entry
978 		 */
979 		c->mfc_parent	= -1;
980 		c->mfc_origin	= iph->saddr;
981 		c->mfc_mcastgrp	= iph->daddr;
982 
983 		/*
984 		 *	Reflect first query at mrouted.
985 		 */
986 		err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE);
987 		if (err < 0) {
988 			/* If the report failed throw the cache entry
989 			   out - Brad Parker
990 			 */
991 			spin_unlock_bh(&mfc_unres_lock);
992 
993 			ipmr_cache_free(c);
994 			kfree_skb(skb);
995 			return err;
996 		}
997 
998 		atomic_inc(&mrt->cache_resolve_queue_len);
999 		list_add(&c->list, &mrt->mfc_unres_queue);
1000 
1001 		if (atomic_read(&mrt->cache_resolve_queue_len) == 1)
1002 			mod_timer(&mrt->ipmr_expire_timer, c->mfc_un.unres.expires);
1003 	}
1004 
1005 	/*
1006 	 *	See if we can append the packet
1007 	 */
1008 	if (c->mfc_un.unres.unresolved.qlen>3) {
1009 		kfree_skb(skb);
1010 		err = -ENOBUFS;
1011 	} else {
1012 		skb_queue_tail(&c->mfc_un.unres.unresolved, skb);
1013 		err = 0;
1014 	}
1015 
1016 	spin_unlock_bh(&mfc_unres_lock);
1017 	return err;
1018 }
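/* The hard-coded limits above bound the damage an unresponsive daemon
 * can do: at most 10 unresolved (S,G) entries per table, at most four
 * queued skbs per entry, and each entry expires after 10 seconds
 * (see ipmr_expire_process).
 */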
1019 
1020 /*
1021  *	MFC cache manipulation by user space mroute daemon
1022  */
1023 
1024 static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc)
1025 {
1026 	int line;
1027 	struct mfc_cache *c, *next;
1028 
1029 	line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
1030 
1031 	list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[line], list) {
1032 		if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
1033 		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
1034 			write_lock_bh(&mrt_lock);
1035 			list_del(&c->list);
1036 			write_unlock_bh(&mrt_lock);
1037 
1038 			ipmr_cache_free(c);
1039 			return 0;
1040 		}
1041 	}
1042 	return -ENOENT;
1043 }
1044 
1045 static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
1046 			struct mfcctl *mfc, int mrtsock)
1047 {
1048 	bool found = false;
1049 	int line;
1050 	struct mfc_cache *uc, *c;
1051 
1052 	if (mfc->mfcc_parent >= MAXVIFS)
1053 		return -ENFILE;
1054 
1055 	line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
1056 
1057 	list_for_each_entry(c, &mrt->mfc_cache_array[line], list) {
1058 		if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
1059 		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
1060 			found = true;
1061 			break;
1062 		}
1063 	}
1064 
1065 	if (found) {
1066 		write_lock_bh(&mrt_lock);
1067 		c->mfc_parent = mfc->mfcc_parent;
1068 		ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls);
1069 		if (!mrtsock)
1070 			c->mfc_flags |= MFC_STATIC;
1071 		write_unlock_bh(&mrt_lock);
1072 		return 0;
1073 	}
1074 
1075 	if (!ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr))
1076 		return -EINVAL;
1077 
1078 	c = ipmr_cache_alloc();
1079 	if (c == NULL)
1080 		return -ENOMEM;
1081 
1082 	c->mfc_origin = mfc->mfcc_origin.s_addr;
1083 	c->mfc_mcastgrp = mfc->mfcc_mcastgrp.s_addr;
1084 	c->mfc_parent = mfc->mfcc_parent;
1085 	ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls);
1086 	if (!mrtsock)
1087 		c->mfc_flags |= MFC_STATIC;
1088 
1089 	write_lock_bh(&mrt_lock);
1090 	list_add(&c->list, &mrt->mfc_cache_array[line]);
1091 	write_unlock_bh(&mrt_lock);
1092 
1093 	/*
1094 	 *	Check to see if we resolved a queued unresolved entry. If so,
1095 	 *	we need to send out the queued frames and tidy up.
1096 	 */
1097 	found = false;
1098 	spin_lock_bh(&mfc_unres_lock);
1099 	list_for_each_entry(uc, &mrt->mfc_unres_queue, list) {
1100 		if (uc->mfc_origin == c->mfc_origin &&
1101 		    uc->mfc_mcastgrp == c->mfc_mcastgrp) {
1102 			list_del(&uc->list);
1103 			atomic_dec(&mrt->cache_resolve_queue_len);
1104 			found = true;
1105 			break;
1106 		}
1107 	}
1108 	if (list_empty(&mrt->mfc_unres_queue))
1109 		del_timer(&mrt->ipmr_expire_timer);
1110 	spin_unlock_bh(&mfc_unres_lock);
1111 
1112 	if (found) {
1113 		ipmr_cache_resolve(net, mrt, uc, c);
1114 		ipmr_cache_free(uc);
1115 	}
1116 	return 0;
1117 }
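/* Illustrative userspace counterpart (a daemon that has already done
 * MRT_INIT and added vifs "iif" and "oif"):
 *
 *	struct mfcctl mc = {0};
 *	mc.mfcc_origin.s_addr   = src;		// S
 *	mc.mfcc_mcastgrp.s_addr = grp;		// G
 *	mc.mfcc_parent          = iif;		// expected input vif
 *	mc.mfcc_ttls[oif]       = 1;		// forward when packet TTL > 1
 *	setsockopt(sock, IPPROTO_IP, MRT_ADD_MFC, &mc, sizeof(mc));
 */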
1118 
1119 /*
1120  *	Close the multicast socket, and clear the vif tables etc
1121  */
1122 
1123 static void mroute_clean_tables(struct mr_table *mrt)
1124 {
1125 	int i;
1126 	LIST_HEAD(list);
1127 	struct mfc_cache *c, *next;
1128 
1129 	/*
1130 	 *	Shut down all active vif entries
1131 	 */
1132 	for (i = 0; i < mrt->maxvif; i++) {
1133 		if (!(mrt->vif_table[i].flags&VIFF_STATIC))
1134 			vif_delete(mrt, i, 0, &list);
1135 	}
1136 	unregister_netdevice_many(&list);
1137 
1138 	/*
1139 	 *	Wipe the cache
1140 	 */
1141 	for (i = 0; i < MFC_LINES; i++) {
1142 		list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) {
1143 			if (c->mfc_flags&MFC_STATIC)
1144 				continue;
1145 			write_lock_bh(&mrt_lock);
1146 			list_del(&c->list);
1147 			write_unlock_bh(&mrt_lock);
1148 
1149 			ipmr_cache_free(c);
1150 		}
1151 	}
1152 
1153 	if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
1154 		spin_lock_bh(&mfc_unres_lock);
1155 		list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {
1156 			list_del(&c->list);
1157 			ipmr_destroy_unres(mrt, c);
1158 		}
1159 		spin_unlock_bh(&mfc_unres_lock);
1160 	}
1161 }
1162 
1163 static void mrtsock_destruct(struct sock *sk)
1164 {
1165 	struct net *net = sock_net(sk);
1166 	struct mr_table *mrt;
1167 
1168 	rtnl_lock();
1169 	ipmr_for_each_table(mrt, net) {
1170 		if (sk == mrt->mroute_sk) {
1171 			IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
1172 
1173 			write_lock_bh(&mrt_lock);
1174 			mrt->mroute_sk = NULL;
1175 			write_unlock_bh(&mrt_lock);
1176 
1177 			mroute_clean_tables(mrt);
1178 		}
1179 	}
1180 	rtnl_unlock();
1181 }
1182 
1183 /*
1184  *	Socket options and virtual interface manipulation. The whole
1185  *	virtual interface system is a complete heap, but unfortunately
1186  *	that's how BSD mrouted happens to think. Maybe one day with a proper
1187  *	MOSPF/PIM router set up we can clean this up.
1188  */
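/* The expected call sequence from a daemon such as mrouted or pimd is,
 * roughly (illustrative userspace sketch; local_if_addr is a placeholder
 * for the vif's local address):
 *
 *	int s = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
 *	int one = 1;
 *	setsockopt(s, IPPROTO_IP, MRT_INIT, &one, sizeof(one));
 *
 *	struct vifctl vc = {0};
 *	vc.vifc_vifi = 0;
 *	vc.vifc_threshold = 1;
 *	vc.vifc_lcl_addr.s_addr = local_if_addr;
 *	setsockopt(s, IPPROTO_IP, MRT_ADD_VIF, &vc, sizeof(vc));
 *
 * followed by MRT_ADD_MFC calls as (S,G) state is learned, and
 * MRT_DONE (or simply close()) on shutdown.
 */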
1189 
1190 int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsigned int optlen)
1191 {
1192 	int ret;
1193 	struct vifctl vif;
1194 	struct mfcctl mfc;
1195 	struct net *net = sock_net(sk);
1196 	struct mr_table *mrt;
1197 
1198 	mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
1199 	if (mrt == NULL)
1200 		return -ENOENT;
1201 
1202 	if (optname != MRT_INIT) {
1203 		if (sk != mrt->mroute_sk && !capable(CAP_NET_ADMIN))
1204 			return -EACCES;
1205 	}
1206 
1207 	switch (optname) {
1208 	case MRT_INIT:
1209 		if (sk->sk_type != SOCK_RAW ||
1210 		    inet_sk(sk)->inet_num != IPPROTO_IGMP)
1211 			return -EOPNOTSUPP;
1212 		if (optlen != sizeof(int))
1213 			return -ENOPROTOOPT;
1214 
1215 		rtnl_lock();
1216 		if (mrt->mroute_sk) {
1217 			rtnl_unlock();
1218 			return -EADDRINUSE;
1219 		}
1220 
1221 		ret = ip_ra_control(sk, 1, mrtsock_destruct);
1222 		if (ret == 0) {
1223 			write_lock_bh(&mrt_lock);
1224 			mrt->mroute_sk = sk;
1225 			write_unlock_bh(&mrt_lock);
1226 
1227 			IPV4_DEVCONF_ALL(net, MC_FORWARDING)++;
1228 		}
1229 		rtnl_unlock();
1230 		return ret;
1231 	case MRT_DONE:
1232 		if (sk != mrt->mroute_sk)
1233 			return -EACCES;
1234 		return ip_ra_control(sk, 0, NULL);
1235 	case MRT_ADD_VIF:
1236 	case MRT_DEL_VIF:
1237 		if (optlen != sizeof(vif))
1238 			return -EINVAL;
1239 		if (copy_from_user(&vif, optval, sizeof(vif)))
1240 			return -EFAULT;
1241 		if (vif.vifc_vifi >= MAXVIFS)
1242 			return -ENFILE;
1243 		rtnl_lock();
1244 		if (optname == MRT_ADD_VIF) {
1245 			ret = vif_add(net, mrt, &vif, sk == mrt->mroute_sk);
1246 		} else {
1247 			ret = vif_delete(mrt, vif.vifc_vifi, 0, NULL);
1248 		}
1249 		rtnl_unlock();
1250 		return ret;
1251 
1252 		/*
1253 		 *	Manipulate the forwarding caches. These live
1254 		 *	in a sort of kernel/user symbiosis.
1255 		 */
1256 	case MRT_ADD_MFC:
1257 	case MRT_DEL_MFC:
1258 		if (optlen != sizeof(mfc))
1259 			return -EINVAL;
1260 		if (copy_from_user(&mfc, optval, sizeof(mfc)))
1261 			return -EFAULT;
1262 		rtnl_lock();
1263 		if (optname == MRT_DEL_MFC)
1264 			ret = ipmr_mfc_delete(mrt, &mfc);
1265 		else
1266 			ret = ipmr_mfc_add(net, mrt, &mfc, sk == mrt->mroute_sk);
1267 		rtnl_unlock();
1268 		return ret;
1269 		/*
1270 		 *	Control PIM assert.
1271 		 */
1272 	case MRT_ASSERT:
1273 	{
1274 		int v;
1275 		if (get_user(v,(int __user *)optval))
1276 			return -EFAULT;
1277 		mrt->mroute_do_assert = (v) ? 1 : 0;
1278 		return 0;
1279 	}
1280 #ifdef CONFIG_IP_PIMSM
1281 	case MRT_PIM:
1282 	{
1283 		int v;
1284 
1285 		if (get_user(v,(int __user *)optval))
1286 			return -EFAULT;
1287 		v = (v) ? 1 : 0;
1288 
1289 		rtnl_lock();
1290 		ret = 0;
1291 		if (v != mrt->mroute_do_pim) {
1292 			mrt->mroute_do_pim = v;
1293 			mrt->mroute_do_assert = v;
1294 		}
1295 		rtnl_unlock();
1296 		return ret;
1297 	}
1298 #endif
1299 #ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
1300 	case MRT_TABLE:
1301 	{
1302 		u32 v;
1303 
1304 		if (optlen != sizeof(u32))
1305 			return -EINVAL;
1306 		if (get_user(v, (u32 __user *)optval))
1307 			return -EFAULT;
1308 		if (sk == mrt->mroute_sk)
1309 			return -EBUSY;
1310 
1311 		rtnl_lock();
1312 		ret = 0;
1313 		if (!ipmr_new_table(net, v))
1314 			ret = -ENOMEM;
1315 		raw_sk(sk)->ipmr_table = v;
1316 		rtnl_unlock();
1317 		return ret;
1318 	}
1319 #endif
1320 	/*
1321 	 *	Spurious command, or MRT_VERSION which you cannot
1322 	 *	set.
1323 	 */
1324 	default:
1325 		return -ENOPROTOOPT;
1326 	}
1327 }
1328 
1329 /*
1330  *	Getsockopt support for the multicast routing system.
1331  */
1332 
1333 int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen)
1334 {
1335 	int olr;
1336 	int val;
1337 	struct net *net = sock_net(sk);
1338 	struct mr_table *mrt;
1339 
1340 	mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
1341 	if (mrt == NULL)
1342 		return -ENOENT;
1343 
1344 	if (optname != MRT_VERSION &&
1345 #ifdef CONFIG_IP_PIMSM
1346 	   optname!=MRT_PIM &&
1347 #endif
1348 	   optname!=MRT_ASSERT)
1349 		return -ENOPROTOOPT;
1350 
1351 	if (get_user(olr, optlen))
1352 		return -EFAULT;
1353 
1354 	olr = min_t(unsigned int, olr, sizeof(int));
1355 	if (olr < 0)
1356 		return -EINVAL;
1357 
1358 	if (put_user(olr, optlen))
1359 		return -EFAULT;
1360 	if (optname == MRT_VERSION)
1361 		val = 0x0305;
1362 #ifdef CONFIG_IP_PIMSM
1363 	else if (optname == MRT_PIM)
1364 		val = mrt->mroute_do_pim;
1365 #endif
1366 	else
1367 		val = mrt->mroute_do_assert;
1368 	if (copy_to_user(optval, &val, olr))
1369 		return -EFAULT;
1370 	return 0;
1371 }
1372 
1373 /*
1374  *	The IP multicast ioctl support routines.
1375  */
1376 
1377 int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
1378 {
1379 	struct sioc_sg_req sr;
1380 	struct sioc_vif_req vr;
1381 	struct vif_device *vif;
1382 	struct mfc_cache *c;
1383 	struct net *net = sock_net(sk);
1384 	struct mr_table *mrt;
1385 
1386 	mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
1387 	if (mrt == NULL)
1388 		return -ENOENT;
1389 
1390 	switch (cmd) {
1391 	case SIOCGETVIFCNT:
1392 		if (copy_from_user(&vr, arg, sizeof(vr)))
1393 			return -EFAULT;
1394 		if (vr.vifi >= mrt->maxvif)
1395 			return -EINVAL;
1396 		read_lock(&mrt_lock);
1397 		vif = &mrt->vif_table[vr.vifi];
1398 		if (VIF_EXISTS(mrt, vr.vifi)) {
1399 			vr.icount = vif->pkt_in;
1400 			vr.ocount = vif->pkt_out;
1401 			vr.ibytes = vif->bytes_in;
1402 			vr.obytes = vif->bytes_out;
1403 			read_unlock(&mrt_lock);
1404 
1405 			if (copy_to_user(arg, &vr, sizeof(vr)))
1406 				return -EFAULT;
1407 			return 0;
1408 		}
1409 		read_unlock(&mrt_lock);
1410 		return -EADDRNOTAVAIL;
1411 	case SIOCGETSGCNT:
1412 		if (copy_from_user(&sr, arg, sizeof(sr)))
1413 			return -EFAULT;
1414 
1415 		read_lock(&mrt_lock);
1416 		c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr);
1417 		if (c) {
1418 			sr.pktcnt = c->mfc_un.res.pkt;
1419 			sr.bytecnt = c->mfc_un.res.bytes;
1420 			sr.wrong_if = c->mfc_un.res.wrong_if;
1421 			read_unlock(&mrt_lock);
1422 
1423 			if (copy_to_user(arg, &sr, sizeof(sr)))
1424 				return -EFAULT;
1425 			return 0;
1426 		}
1427 		read_unlock(&mrt_lock);
1428 		return -EADDRNOTAVAIL;
1429 	default:
1430 		return -ENOIOCTLCMD;
1431 	}
1432 }
1433 
1434 
1435 static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
1436 {
1437 	struct net_device *dev = ptr;
1438 	struct net *net = dev_net(dev);
1439 	struct mr_table *mrt;
1440 	struct vif_device *v;
1441 	int ct;
1442 	LIST_HEAD(list);
1443 
1444 	if (event != NETDEV_UNREGISTER)
1445 		return NOTIFY_DONE;
1446 
1447 	ipmr_for_each_table(mrt, net) {
1448 		v = &mrt->vif_table[0];
1449 		for (ct = 0; ct < mrt->maxvif; ct++, v++) {
1450 			if (v->dev == dev)
1451 				vif_delete(mrt, ct, 1, &list);
1452 		}
1453 	}
1454 	unregister_netdevice_many(&list);
1455 	return NOTIFY_DONE;
1456 }
1457 
1458 
1459 static struct notifier_block ip_mr_notifier = {
1460 	.notifier_call = ipmr_device_event,
1461 };
1462 
1463 /*
1464  * 	Encapsulate a packet by attaching a valid IPIP header to it.
1465  *	This avoids tunnel drivers and other mess and gives us the speed so
1466  *	important for multicast video.
1467  */
1468 
1469 static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
1470 {
1471 	struct iphdr *iph;
1472 	struct iphdr *old_iph = ip_hdr(skb);
1473 
1474 	skb_push(skb, sizeof(struct iphdr));
1475 	skb->transport_header = skb->network_header;
1476 	skb_reset_network_header(skb);
1477 	iph = ip_hdr(skb);
1478 
1479 	iph->version	= 	4;
1480 	iph->tos	=	old_iph->tos;
1481 	iph->ttl	=	old_iph->ttl;
1482 	iph->frag_off	=	0;
1483 	iph->daddr	=	daddr;
1484 	iph->saddr	=	saddr;
1485 	iph->protocol	=	IPPROTO_IPIP;
1486 	iph->ihl	=	5;
1487 	iph->tot_len	=	htons(skb->len);
1488 	ip_select_ident(iph, skb_dst(skb), NULL);
1489 	ip_send_check(iph);
1490 
1491 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
1492 	nf_reset(skb);
1493 }
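/* After ip_encap() the skb looks like a locally generated IPIP packet:
 *
 *	[ new outer IPv4 header, proto IPPROTO_IPIP ][ original IP packet ]
 *
 * with the outer TOS/TTL copied from the inner header, so the tunnel
 * driver on the far end can simply strip the outer header.
 */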
1494 
1495 static inline int ipmr_forward_finish(struct sk_buff *skb)
1496 {
1497 	struct ip_options * opt	= &(IPCB(skb)->opt);
1498 
1499 	IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
1500 
1501 	if (unlikely(opt->optlen))
1502 		ip_forward_options(skb);
1503 
1504 	return dst_output(skb);
1505 }
1506 
1507 /*
1508  *	Processing handlers for ipmr_forward
1509  */
1510 
1511 static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
1512 			    struct sk_buff *skb, struct mfc_cache *c, int vifi)
1513 {
1514 	const struct iphdr *iph = ip_hdr(skb);
1515 	struct vif_device *vif = &mrt->vif_table[vifi];
1516 	struct net_device *dev;
1517 	struct rtable *rt;
1518 	int    encap = 0;
1519 
1520 	if (vif->dev == NULL)
1521 		goto out_free;
1522 
1523 #ifdef CONFIG_IP_PIMSM
1524 	if (vif->flags & VIFF_REGISTER) {
1525 		vif->pkt_out++;
1526 		vif->bytes_out += skb->len;
1527 		vif->dev->stats.tx_bytes += skb->len;
1528 		vif->dev->stats.tx_packets++;
1529 		ipmr_cache_report(mrt, skb, vifi, IGMPMSG_WHOLEPKT);
1530 		goto out_free;
1531 	}
1532 #endif
1533 
1534 	if (vif->flags&VIFF_TUNNEL) {
1535 		struct flowi fl = { .oif = vif->link,
1536 				    .nl_u = { .ip4_u =
1537 					      { .daddr = vif->remote,
1538 						.saddr = vif->local,
1539 						.tos = RT_TOS(iph->tos) } },
1540 				    .proto = IPPROTO_IPIP };
1541 		if (ip_route_output_key(net, &rt, &fl))
1542 			goto out_free;
1543 		encap = sizeof(struct iphdr);
1544 	} else {
1545 		struct flowi fl = { .oif = vif->link,
1546 				    .nl_u = { .ip4_u =
1547 					      { .daddr = iph->daddr,
1548 						.tos = RT_TOS(iph->tos) } },
1549 				    .proto = IPPROTO_IPIP };
1550 		if (ip_route_output_key(net, &rt, &fl))
1551 			goto out_free;
1552 	}
1553 
1554 	dev = rt->u.dst.dev;
1555 
1556 	if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
1557 		/* Do not fragment multicasts. Alas, IPv4 does not
1558 		   allow us to send ICMP here, so such packets simply
1559 		   disappear into a black hole.
1560 		 */
1561 
1562 		IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
1563 		ip_rt_put(rt);
1564 		goto out_free;
1565 	}
1566 
1567 	encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;
1568 
1569 	if (skb_cow(skb, encap)) {
1570 		ip_rt_put(rt);
1571 		goto out_free;
1572 	}
1573 
1574 	vif->pkt_out++;
1575 	vif->bytes_out += skb->len;
1576 
1577 	skb_dst_drop(skb);
1578 	skb_dst_set(skb, &rt->u.dst);
1579 	ip_decrease_ttl(ip_hdr(skb));
1580 
1581 	/* FIXME: forward and output firewalls used to be called here.
1582 	 * What do we do with netfilter? -- RR */
1583 	if (vif->flags & VIFF_TUNNEL) {
1584 		ip_encap(skb, vif->local, vif->remote);
1585 		/* FIXME: extra output firewall step used to be here. --RR */
1586 		vif->dev->stats.tx_packets++;
1587 		vif->dev->stats.tx_bytes += skb->len;
1588 	}
1589 
1590 	IPCB(skb)->flags |= IPSKB_FORWARDED;
1591 
1592 	/*
1593 	 * RFC 1584 teaches that a DVMRP/PIM router must deliver packets locally
1594 	 * not only before forwarding, but also after forwarding on all output
1595 	 * interfaces. Clearly, if the mrouter runs a multicasting
1596 	 * program, it should receive packets regardless of which interface
1597 	 * the program has joined on.
1598 	 * If we did not do this, the program would have to join on all
1599 	 * interfaces. On the other hand, a multihomed host (or router, but
1600 	 * not mrouter) cannot join on more than one interface - that would
1601 	 * result in it receiving multiple copies of each packet.
1602 	 */
1603 	NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, skb, skb->dev, dev,
1604 		ipmr_forward_finish);
1605 	return;
1606 
1607 out_free:
1608 	kfree_skb(skb);
1609 }
1610 
1611 static int ipmr_find_vif(struct mr_table *mrt, struct net_device *dev)
1612 {
1613 	int ct;
1614 
1615 	for (ct = mrt->maxvif-1; ct >= 0; ct--) {
1616 		if (mrt->vif_table[ct].dev == dev)
1617 			break;
1618 	}
1619 	return ct;
1620 }
1621 
1622 /* "local" means that we should preserve one skb (for local delivery) */
1623 
1624 static int ip_mr_forward(struct net *net, struct mr_table *mrt,
1625 			 struct sk_buff *skb, struct mfc_cache *cache,
1626 			 int local)
1627 {
1628 	int psend = -1;
1629 	int vif, ct;
1630 
1631 	vif = cache->mfc_parent;
1632 	cache->mfc_un.res.pkt++;
1633 	cache->mfc_un.res.bytes += skb->len;
1634 
1635 	/*
1636 	 * Wrong interface: drop packet and (maybe) send PIM assert.
1637 	 */
1638 	if (mrt->vif_table[vif].dev != skb->dev) {
1639 		int true_vifi;
1640 
1641 		if (skb_rtable(skb)->fl.iif == 0) {
1642 			/* It is our own packet, looped back.
1643 			   Very complicated situation...
1644 
1645 			   The best workaround, until the routing daemons are
1646 			   fixed, is not to redistribute a packet if it was
1647 			   sent through the wrong interface. It means that
1648 			   multicast applications WILL NOT work for
1649 			   (S,G) entries whose default multicast route points
1650 			   to the wrong oif. In any case, it is not a good
1651 			   idea to run multicasting applications on a router.
1652 			 */
1653 			goto dont_forward;
1654 		}
1655 
1656 		cache->mfc_un.res.wrong_if++;
1657 		true_vifi = ipmr_find_vif(mrt, skb->dev);
1658 
1659 		if (true_vifi >= 0 && mrt->mroute_do_assert &&
1660 		    /* PIM-SM uses asserts when switching from the RPT to the SPT,
1661 		       so we cannot check that the packet arrived on an oif.
1662 		       It is bad, but otherwise we would need to move a pretty
1663 		       large chunk of pimd into the kernel. Ough... --ANK
1664 		     */
1665 		    (mrt->mroute_do_pim ||
1666 		     cache->mfc_un.res.ttls[true_vifi] < 255) &&
1667 		    time_after(jiffies,
1668 			       cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
1669 			cache->mfc_un.res.last_assert = jiffies;
1670 			ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRONGVIF);
1671 		}
1672 		goto dont_forward;
1673 	}
1674 
1675 	mrt->vif_table[vif].pkt_in++;
1676 	mrt->vif_table[vif].bytes_in += skb->len;
1677 
1678 	/*
1679 	 *	Forward the frame
1680 	 */
1681 	for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
1682 		if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
1683 			if (psend != -1) {
1684 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1685 				if (skb2)
1686 					ipmr_queue_xmit(net, mrt, skb2, cache,
1687 							psend);
1688 			}
1689 			psend = ct;
1690 		}
1691 	}
1692 	if (psend != -1) {
1693 		if (local) {
1694 			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1695 			if (skb2)
1696 				ipmr_queue_xmit(net, mrt, skb2, cache, psend);
1697 		} else {
1698 			ipmr_queue_xmit(net, mrt, skb, cache, psend);
1699 			return 0;
1700 		}
1701 	}
1702 
1703 dont_forward:
1704 	if (!local)
1705 		kfree_skb(skb);
1706 	return 0;
1707 }
1708 
1709 
1710 /*
1711  *	Multicast packets for forwarding arrive here
1712  */
1713 
1714 int ip_mr_input(struct sk_buff *skb)
1715 {
1716 	struct mfc_cache *cache;
1717 	struct net *net = dev_net(skb->dev);
1718 	int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL;
1719 	struct mr_table *mrt;
1720 	int err;
1721 
1722 	/* The packet was looped back after forwarding; it must not be
1723 	   forwarded a second time, but it can still be delivered locally.
1724 	 */
1725 	if (IPCB(skb)->flags&IPSKB_FORWARDED)
1726 		goto dont_forward;
1727 
1728 	err = ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt);
1729 	if (err < 0)
1730 		return err;
1731 
1732 	if (!local) {
1733 		    if (IPCB(skb)->opt.router_alert) {
1734 			    if (ip_call_ra_chain(skb))
1735 				    return 0;
1736 		    } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP){
1737 			    /* IGMPv1 (and broken IGMPv2 implementations such as
1738 			       Cisco IOS <= 11.2(8)) do not put the router alert
1739 			       option in IGMP packets destined to routable
1740 			       groups. This is very bad, because it means
1741 			       that we could otherwise forward NO IGMP messages.
1742 			     */
1743 			    read_lock(&mrt_lock);
1744 			    if (mrt->mroute_sk) {
1745 				    nf_reset(skb);
1746 				    raw_rcv(mrt->mroute_sk, skb);
1747 				    read_unlock(&mrt_lock);
1748 				    return 0;
1749 			    }
1750 			    read_unlock(&mrt_lock);
1751 		    }
1752 	}
1753 
1754 	read_lock(&mrt_lock);
1755 	cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
1756 
1757 	/*
1758 	 *	No usable cache entry
1759 	 */
1760 	if (cache == NULL) {
1761 		int vif;
1762 
1763 		if (local) {
1764 			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1765 			ip_local_deliver(skb);
1766 			if (skb2 == NULL) {
1767 				read_unlock(&mrt_lock);
1768 				return -ENOBUFS;
1769 			}
1770 			skb = skb2;
1771 		}
1772 
1773 		vif = ipmr_find_vif(mrt, skb->dev);
1774 		if (vif >= 0) {
1775 			int err2 = ipmr_cache_unresolved(mrt, vif, skb);
1776 			read_unlock(&mrt_lock);
1777 
1778 			return err2;
1779 		}
1780 		read_unlock(&mrt_lock);
1781 		kfree_skb(skb);
1782 		return -ENODEV;
1783 	}
1784 
1785 	ip_mr_forward(net, mrt, skb, cache, local);
1786 
1787 	read_unlock(&mrt_lock);
1788 
1789 	if (local)
1790 		return ip_local_deliver(skb);
1791 
1792 	return 0;
1793 
1794 dont_forward:
1795 	if (local)
1796 		return ip_local_deliver(skb);
1797 	kfree_skb(skb);
1798 	return 0;
1799 }
1800 
1801 #ifdef CONFIG_IP_PIMSM
1802 static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
1803 		     unsigned int pimlen)
1804 {
1805 	struct net_device *reg_dev = NULL;
1806 	struct iphdr *encap;
1807 
1808 	encap = (struct iphdr *)(skb_transport_header(skb) + pimlen);
1809 	/*
1810 	   Check that:
1811 	   a. packet is really destined to a multicast group
1812 	   b. packet is not a NULL-REGISTER
1813 	   c. packet is not truncated
1814 	 */
1815 	if (!ipv4_is_multicast(encap->daddr) ||
1816 	    encap->tot_len == 0 ||
1817 	    ntohs(encap->tot_len) + pimlen > skb->len)
1818 		return 1;
1819 
1820 	read_lock(&mrt_lock);
1821 	if (mrt->mroute_reg_vif_num >= 0)
1822 		reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev;
1823 	if (reg_dev)
1824 		dev_hold(reg_dev);
1825 	read_unlock(&mrt_lock);
1826 
1827 	if (reg_dev == NULL)
1828 		return 1;
1829 
1830 	skb->mac_header = skb->network_header;
1831 	skb_pull(skb, (u8*)encap - skb->data);
1832 	skb_reset_network_header(skb);
1833 	skb->protocol = htons(ETH_P_IP);
1834 	skb->ip_summed = 0;
1835 	skb->pkt_type = PACKET_HOST;
1836 
1837 	skb_tunnel_rx(skb, reg_dev);
1838 
1839 	netif_rx(skb);
1840 	dev_put(reg_dev);
1841 
1842 	return 0;
1843 }
1844 #endif
1845 
1846 #ifdef CONFIG_IP_PIMSM_V1
1847 /*
1848  * Handle IGMP messages of PIMv1
1849  */
1850 
1851 int pim_rcv_v1(struct sk_buff * skb)
1852 {
1853 	struct igmphdr *pim;
1854 	struct net *net = dev_net(skb->dev);
1855 	struct mr_table *mrt;
1856 
1857 	if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
1858 		goto drop;
1859 
1860 	pim = igmp_hdr(skb);
1861 
1862 	if (ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt) < 0)
1863 		goto drop;
1864 
1865 	if (!mrt->mroute_do_pim ||
1866 	    pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
1867 		goto drop;
1868 
1869 	if (__pim_rcv(mrt, skb, sizeof(*pim))) {
1870 drop:
1871 		kfree_skb(skb);
1872 	}
1873 	return 0;
1874 }
1875 #endif
1876 
1877 #ifdef CONFIG_IP_PIMSM_V2
1878 static int pim_rcv(struct sk_buff * skb)
1879 {
1880 	struct pimreghdr *pim;
1881 	struct net *net = dev_net(skb->dev);
1882 	struct mr_table *mrt;
1883 
1884 	if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
1885 		goto drop;
1886 
1887 	pim = (struct pimreghdr *)skb_transport_header(skb);
1888 	if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
1889 	    (pim->flags&PIM_NULL_REGISTER) ||
1890 	    (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
1891 	     csum_fold(skb_checksum(skb, 0, skb->len, 0))))
1892 		goto drop;
1893 
1894 	if (ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt) < 0)
1895 		goto drop;
1896 
1897 	if (__pim_rcv(mrt, skb, sizeof(*pim))) {
1898 drop:
1899 		kfree_skb(skb);
1900 	}
1901 	return 0;
1902 }
1903 #endif
1904 
1905 static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
1906 			      struct mfc_cache *c, struct rtmsg *rtm)
1907 {
1908 	int ct;
1909 	struct rtnexthop *nhp;
1910 	u8 *b = skb_tail_pointer(skb);
1911 	struct rtattr *mp_head;
1912 
1913 	/* If cache is unresolved, don't try to parse IIF and OIF */
1914 	if (c->mfc_parent > MAXVIFS)
1915 		return -ENOENT;
1916 
1917 	if (VIF_EXISTS(mrt, c->mfc_parent))
1918 		RTA_PUT(skb, RTA_IIF, 4, &mrt->vif_table[c->mfc_parent].dev->ifindex);
1919 
1920 	mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0));
1921 
1922 	for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
1923 		if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) {
1924 			if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
1925 				goto rtattr_failure;
1926 			nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
1927 			nhp->rtnh_flags = 0;
1928 			nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
1929 			nhp->rtnh_ifindex = mrt->vif_table[ct].dev->ifindex;
1930 			nhp->rtnh_len = sizeof(*nhp);
1931 		}
1932 	}
1933 	mp_head->rta_type = RTA_MULTIPATH;
1934 	mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head;
1935 	rtm->rtm_type = RTN_MULTICAST;
1936 	return 1;
1937 
1938 rtattr_failure:
1939 	nlmsg_trim(skb, b);
1940 	return -EMSGSIZE;
1941 }
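/* The dump above describes an (S,G) entry in ordinary routing-netlink
 * terms: RTA_IIF carries the parent vif's ifindex and RTA_MULTIPATH
 * carries one rtnexthop per output vif, with the TTL threshold reused
 * as the hop count.
 */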
1942 
1943 int ipmr_get_route(struct net *net,
1944 		   struct sk_buff *skb, struct rtmsg *rtm, int nowait)
1945 {
1946 	int err;
1947 	struct mr_table *mrt;
1948 	struct mfc_cache *cache;
1949 	struct rtable *rt = skb_rtable(skb);
1950 
1951 	mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
1952 	if (mrt == NULL)
1953 		return -ENOENT;
1954 
1955 	read_lock(&mrt_lock);
1956 	cache = ipmr_cache_find(mrt, rt->rt_src, rt->rt_dst);
1957 
1958 	if (cache == NULL) {
1959 		struct sk_buff *skb2;
1960 		struct iphdr *iph;
1961 		struct net_device *dev;
1962 		int vif;
1963 
1964 		if (nowait) {
1965 			read_unlock(&mrt_lock);
1966 			return -EAGAIN;
1967 		}
1968 
1969 		dev = skb->dev;
1970 		if (dev == NULL || (vif = ipmr_find_vif(mrt, dev)) < 0) {
1971 			read_unlock(&mrt_lock);
1972 			return -ENODEV;
1973 		}
1974 		skb2 = skb_clone(skb, GFP_ATOMIC);
1975 		if (!skb2) {
1976 			read_unlock(&mrt_lock);
1977 			return -ENOMEM;
1978 		}
1979 
1980 		skb_push(skb2, sizeof(struct iphdr));
1981 		skb_reset_network_header(skb2);
1982 		iph = ip_hdr(skb2);
1983 		iph->ihl = sizeof(struct iphdr) >> 2;
1984 		iph->saddr = rt->rt_src;
1985 		iph->daddr = rt->rt_dst;
1986 		iph->version = 0;
1987 		err = ipmr_cache_unresolved(mrt, vif, skb2);
1988 		read_unlock(&mrt_lock);
1989 		return err;
1990 	}
1991 
1992 	if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
1993 		cache->mfc_flags |= MFC_NOTIFY;
1994 	err = __ipmr_fill_mroute(mrt, skb, cache, rtm);
1995 	read_unlock(&mrt_lock);
1996 	return err;
1997 }
1998 
1999 static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
2000 			    u32 pid, u32 seq, struct mfc_cache *c)
2001 {
2002 	struct nlmsghdr *nlh;
2003 	struct rtmsg *rtm;
2004 
2005 	nlh = nlmsg_put(skb, pid, seq, RTM_NEWROUTE, sizeof(*rtm), NLM_F_MULTI);
2006 	if (nlh == NULL)
2007 		return -EMSGSIZE;
2008 
2009 	rtm = nlmsg_data(nlh);
2010 	rtm->rtm_family   = RTNL_FAMILY_IPMR;
2011 	rtm->rtm_dst_len  = 32;
2012 	rtm->rtm_src_len  = 32;
2013 	rtm->rtm_tos      = 0;
2014 	rtm->rtm_table    = mrt->id;
2015 	NLA_PUT_U32(skb, RTA_TABLE, mrt->id);
2016 	rtm->rtm_type     = RTN_MULTICAST;
2017 	rtm->rtm_scope    = RT_SCOPE_UNIVERSE;
2018 	rtm->rtm_protocol = RTPROT_UNSPEC;
2019 	rtm->rtm_flags    = 0;
2020 
2021 	NLA_PUT_BE32(skb, RTA_SRC, c->mfc_origin);
2022 	NLA_PUT_BE32(skb, RTA_DST, c->mfc_mcastgrp);
2023 
2024 	if (__ipmr_fill_mroute(mrt, skb, c, rtm) < 0)
2025 		goto nla_put_failure;
2026 
2027 	return nlmsg_end(skb, nlh);
2028 
2029 nla_put_failure:
2030 	nlmsg_cancel(skb, nlh);
2031 	return -EMSGSIZE;
2032 }
2033 
2034 static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
2035 {
2036 	struct net *net = sock_net(skb->sk);
2037 	struct mr_table *mrt;
2038 	struct mfc_cache *mfc;
2039 	unsigned int t = 0, s_t;
2040 	unsigned int h = 0, s_h;
2041 	unsigned int e = 0, s_e;
2042 
2043 	s_t = cb->args[0];
2044 	s_h = cb->args[1];
2045 	s_e = cb->args[2];
2046 
2047 	read_lock(&mrt_lock);
2048 	ipmr_for_each_table(mrt, net) {
2049 		if (t < s_t)
2050 			goto next_table;
2051 		if (t > s_t)
2052 			s_h = 0;
2053 		for (h = s_h; h < MFC_LINES; h++) {
2054 			list_for_each_entry(mfc, &mrt->mfc_cache_array[h], list) {
2055 				if (e < s_e)
2056 					goto next_entry;
2057 				if (ipmr_fill_mroute(mrt, skb,
2058 						     NETLINK_CB(cb->skb).pid,
2059 						     cb->nlh->nlmsg_seq,
2060 						     mfc) < 0)
2061 					goto done;
2062 next_entry:
2063 				e++;
2064 			}
2065 			e = s_e = 0;
2066 		}
2067 		s_h = 0;
2068 next_table:
2069 		t++;
2070 	}
2071 done:
2072 	read_unlock(&mrt_lock);
2073 
2074 	cb->args[2] = e;
2075 	cb->args[1] = h;
2076 	cb->args[0] = t;
2077 
2078 	return skb->len;
2079 }
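/*
 * The dump is resumable: cb->args[0] remembers how many tables have been
 * walked, cb->args[1] the current hash bucket and cb->args[2] the entry
 * index inside that bucket, so the next invocation continues where the
 * previous socket buffer filled up.
 *
 * As a rough illustration (not part of this file; everything except
 * RTNL_FAMILY_IPMR and RTM_GETROUTE is an assumption), a userspace dump
 * request over rtnetlink might look like:
 *
 *	struct {
 *		struct nlmsghdr nlh;
 *		struct rtmsg	rtm;
 *	} req = {
 *		.nlh = {
 *			.nlmsg_len   = NLMSG_LENGTH(sizeof(struct rtmsg)),
 *			.nlmsg_type  = RTM_GETROUTE,
 *			.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP,
 *		},
 *		.rtm = { .rtm_family = RTNL_FAMILY_IPMR },
 *	};
 *	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
 *
 *	send(fd, &req, req.nlh.nlmsg_len, 0);
 *
 * followed by recvmsg() in a loop until an NLMSG_DONE message arrives.
 */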
2080 
2081 #ifdef CONFIG_PROC_FS
2082 /*
2083  *	The /proc interfaces to multicast routing: /proc/net/ip_mr_cache and /proc/net/ip_mr_vif
2084  */
2085 struct ipmr_vif_iter {
2086 	struct seq_net_private p;
2087 	struct mr_table *mrt;
2088 	int ct;
2089 };
2090 
2091 static struct vif_device *ipmr_vif_seq_idx(struct net *net,
2092 					   struct ipmr_vif_iter *iter,
2093 					   loff_t pos)
2094 {
2095 	struct mr_table *mrt = iter->mrt;
2096 
2097 	for (iter->ct = 0; iter->ct < mrt->maxvif; ++iter->ct) {
2098 		if (!VIF_EXISTS(mrt, iter->ct))
2099 			continue;
2100 		if (pos-- == 0)
2101 			return &mrt->vif_table[iter->ct];
2102 	}
2103 	return NULL;
2104 }
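/*
 * ipmr_vif_seq_idx() positions the iterator on the pos'th in-use VIF,
 * skipping table slots whose device pointer is NULL (see VIF_EXISTS),
 * and leaves iter->ct at that slot so _next() can continue the scan.
 */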
2105 
2106 static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
2107 	__acquires(mrt_lock)
2108 {
2109 	struct ipmr_vif_iter *iter = seq->private;
2110 	struct net *net = seq_file_net(seq);
2111 	struct mr_table *mrt;
2112 
2113 	mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
2114 	if (mrt == NULL)
2115 		return ERR_PTR(-ENOENT);
2116 
2117 	iter->mrt = mrt;
2118 
2119 	read_lock(&mrt_lock);
2120 	return *pos ? ipmr_vif_seq_idx(net, seq->private, *pos - 1)
2121 		: SEQ_START_TOKEN;
2122 }
2123 
2124 static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2125 {
2126 	struct ipmr_vif_iter *iter = seq->private;
2127 	struct net *net = seq_file_net(seq);
2128 	struct mr_table *mrt = iter->mrt;
2129 
2130 	++*pos;
2131 	if (v == SEQ_START_TOKEN)
2132 		return ipmr_vif_seq_idx(net, iter, 0);
2133 
2134 	while (++iter->ct < mrt->maxvif) {
2135 		if (!VIF_EXISTS(mrt, iter->ct))
2136 			continue;
2137 		return &mrt->vif_table[iter->ct];
2138 	}
2139 	return NULL;
2140 }
2141 
2142 static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
2143 	__releases(mrt_lock)
2144 {
2145 	read_unlock(&mrt_lock);
2146 }
2147 
2148 static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
2149 {
2150 	struct ipmr_vif_iter *iter = seq->private;
2151 	struct mr_table *mrt = iter->mrt;
2152 
2153 	if (v == SEQ_START_TOKEN) {
2154 		seq_puts(seq,
2155 			 "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
2156 	} else {
2157 		const struct vif_device *vif = v;
2158 		const char *name = vif->dev ? vif->dev->name : "none";
2159 
2160 		seq_printf(seq,
2161 			   "%2Zd %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
2162 			   vif - mrt->vif_table,
2163 			   name, vif->bytes_in, vif->pkt_in,
2164 			   vif->bytes_out, vif->pkt_out,
2165 			   vif->flags, vif->local, vif->remote);
2166 	}
2167 	return 0;
2168 }
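/*
 * Purely illustrative example of a /proc/net/ip_mr_vif line produced by the
 * header and format strings above (all values are made up):
 *
 *	Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote
 *	 0 eth0            1500      10      3000      20 00000 C0A80001 00000000
 */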
2169 
2170 static const struct seq_operations ipmr_vif_seq_ops = {
2171 	.start = ipmr_vif_seq_start,
2172 	.next  = ipmr_vif_seq_next,
2173 	.stop  = ipmr_vif_seq_stop,
2174 	.show  = ipmr_vif_seq_show,
2175 };
2176 
2177 static int ipmr_vif_open(struct inode *inode, struct file *file)
2178 {
2179 	return seq_open_net(inode, file, &ipmr_vif_seq_ops,
2180 			    sizeof(struct ipmr_vif_iter));
2181 }
2182 
2183 static const struct file_operations ipmr_vif_fops = {
2184 	.owner	 = THIS_MODULE,
2185 	.open    = ipmr_vif_open,
2186 	.read    = seq_read,
2187 	.llseek  = seq_lseek,
2188 	.release = seq_release_net,
2189 };
2190 
2191 struct ipmr_mfc_iter {
2192 	struct seq_net_private p;
2193 	struct mr_table *mrt;
2194 	struct list_head *cache;
2195 	int ct;
2196 };
2197 
2198 
2199 static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
2200 					  struct ipmr_mfc_iter *it, loff_t pos)
2201 {
2202 	struct mr_table *mrt = it->mrt;
2203 	struct mfc_cache *mfc;
2204 
2205 	read_lock(&mrt_lock);
2206 	for (it->ct = 0; it->ct < MFC_LINES; it->ct++) {
2207 		it->cache = &mrt->mfc_cache_array[it->ct];
2208 		list_for_each_entry(mfc, it->cache, list)
2209 			if (pos-- == 0)
2210 				return mfc;
2211 	}
2212 	read_unlock(&mrt_lock);
2213 
2214 	spin_lock_bh(&mfc_unres_lock);
2215 	it->cache = &mrt->mfc_unres_queue;
2216 	list_for_each_entry(mfc, it->cache, list)
2217 		if (pos-- == 0)
2218 			return mfc;
2219 	spin_unlock_bh(&mfc_unres_lock);
2220 
2221 	it->cache = NULL;
2222 	return NULL;
2223 }
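/*
 * Locking convention for the mfc seq_file: ipmr_mfc_seq_idx() returns with
 * mrt_lock read-held while positioned in the resolved hash array and with
 * mfc_unres_lock held while positioned in the unresolved queue.
 * ipmr_mfc_seq_next() swaps the locks when the walk crosses from the hash
 * array into the unresolved queue, and ipmr_mfc_seq_stop() releases
 * whichever lock is still held, keyed off it->cache.
 */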
2224 
2225 
2226 static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
2227 {
2228 	struct ipmr_mfc_iter *it = seq->private;
2229 	struct net *net = seq_file_net(seq);
2230 	struct mr_table *mrt;
2231 
2232 	mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
2233 	if (mrt == NULL)
2234 		return ERR_PTR(-ENOENT);
2235 
2236 	it->mrt = mrt;
2237 	it->cache = NULL;
2238 	it->ct = 0;
2239 	return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1)
2240 		: SEQ_START_TOKEN;
2241 }
2242 
2243 static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2244 {
2245 	struct mfc_cache *mfc = v;
2246 	struct ipmr_mfc_iter *it = seq->private;
2247 	struct net *net = seq_file_net(seq);
2248 	struct mr_table *mrt = it->mrt;
2249 
2250 	++*pos;
2251 
2252 	if (v == SEQ_START_TOKEN)
2253 		return ipmr_mfc_seq_idx(net, seq->private, 0);
2254 
2255 	if (mfc->list.next != it->cache)
2256 		return list_entry(mfc->list.next, struct mfc_cache, list);
2257 
2258 	if (it->cache == &mrt->mfc_unres_queue)
2259 		goto end_of_list;
2260 
2261 	BUG_ON(it->cache != &mrt->mfc_cache_array[it->ct]);
2262 
2263 	while (++it->ct < MFC_LINES) {
2264 		it->cache = &mrt->mfc_cache_array[it->ct];
2265 		if (list_empty(it->cache))
2266 			continue;
2267 		return list_first_entry(it->cache, struct mfc_cache, list);
2268 	}
2269 
2270 	/* exhausted cache_array, show unresolved */
2271 	read_unlock(&mrt_lock);
2272 	it->cache = &mrt->mfc_unres_queue;
2273 	it->ct = 0;
2274 
2275 	spin_lock_bh(&mfc_unres_lock);
2276 	if (!list_empty(it->cache))
2277 		return list_first_entry(it->cache, struct mfc_cache, list);
2278 
2279  end_of_list:
2280 	spin_unlock_bh(&mfc_unres_lock);
2281 	it->cache = NULL;
2282 
2283 	return NULL;
2284 }
2285 
2286 static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
2287 {
2288 	struct ipmr_mfc_iter *it = seq->private;
2289 	struct mr_table *mrt = it->mrt;
2290 
2291 	if (it->cache == &mrt->mfc_unres_queue)
2292 		spin_unlock_bh(&mfc_unres_lock);
2293 	else if (it->cache == &mrt->mfc_cache_array[it->ct])
2294 		read_unlock(&mrt_lock);
2295 }
2296 
2297 static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
2298 {
2299 	int n;
2300 
2301 	if (v == SEQ_START_TOKEN) {
2302 		seq_puts(seq,
2303 		 "Group    Origin   Iif     Pkts    Bytes    Wrong Oifs\n");
2304 	} else {
2305 		const struct mfc_cache *mfc = v;
2306 		const struct ipmr_mfc_iter *it = seq->private;
2307 		const struct mr_table *mrt = it->mrt;
2308 
2309 		seq_printf(seq, "%08X %08X %-3hd",
2310 			   (__force u32) mfc->mfc_mcastgrp,
2311 			   (__force u32) mfc->mfc_origin,
2312 			   mfc->mfc_parent);
2313 
2314 		if (it->cache != &mrt->mfc_unres_queue) {
2315 			seq_printf(seq, " %8lu %8lu %8lu",
2316 				   mfc->mfc_un.res.pkt,
2317 				   mfc->mfc_un.res.bytes,
2318 				   mfc->mfc_un.res.wrong_if);
2319 			for (n = mfc->mfc_un.res.minvif;
2320 			     n < mfc->mfc_un.res.maxvif; n++) {
2321 				if (VIF_EXISTS(mrt, n) &&
2322 				    mfc->mfc_un.res.ttls[n] < 255)
2323 					seq_printf(seq,
2324 					   " %2d:%-3d",
2325 					   n, mfc->mfc_un.res.ttls[n]);
2326 			}
2327 		} else {
2328 			/* unresolved mfc_caches don't contain
2329 			 * pkt, bytes and wrong_if values
2330 			 */
2331 			seq_printf(seq, " %8lu %8lu %8lu", 0ul, 0ul, 0ul);
2332 		}
2333 		seq_putc(seq, '\n');
2334 	}
2335 	return 0;
2336 }
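/*
 * Purely illustrative example of a /proc/net/ip_mr_cache line for a resolved
 * entry (all values are made up); the trailing pairs are "vif:ttl" for each
 * outgoing interface whose TTL threshold is below 255:
 *
 *	Group    Origin   Iif     Pkts    Bytes    Wrong Oifs
 *	E1000001 C0A80001 0            12     1840        0  1:1    2:64
 */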
2337 
2338 static const struct seq_operations ipmr_mfc_seq_ops = {
2339 	.start = ipmr_mfc_seq_start,
2340 	.next  = ipmr_mfc_seq_next,
2341 	.stop  = ipmr_mfc_seq_stop,
2342 	.show  = ipmr_mfc_seq_show,
2343 };
2344 
2345 static int ipmr_mfc_open(struct inode *inode, struct file *file)
2346 {
2347 	return seq_open_net(inode, file, &ipmr_mfc_seq_ops,
2348 			    sizeof(struct ipmr_mfc_iter));
2349 }
2350 
2351 static const struct file_operations ipmr_mfc_fops = {
2352 	.owner	 = THIS_MODULE,
2353 	.open    = ipmr_mfc_open,
2354 	.read    = seq_read,
2355 	.llseek  = seq_lseek,
2356 	.release = seq_release_net,
2357 };
2358 #endif
2359 
2360 #ifdef CONFIG_IP_PIMSM_V2
2361 static const struct net_protocol pim_protocol = {
2362 	.handler	=	pim_rcv,
2363 	.netns_ok	=	1,
2364 };
2365 #endif
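/*
 * pim_rcv() (defined earlier in this file) handles incoming PIMv2 Register
 * messages; the handler is attached to IPPROTO_PIM via inet_add_protocol()
 * in ip_mr_init() below, and netns_ok marks it as safe to run for packets
 * arriving in non-initial network namespaces.
 */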
2366 
2367 
2368 /*
2369  *	Setup for IP multicast routing
2370  */
2371 static int __net_init ipmr_net_init(struct net *net)
2372 {
2373 	int err;
2374 
2375 	err = ipmr_rules_init(net);
2376 	if (err < 0)
2377 		goto fail;
2378 
2379 #ifdef CONFIG_PROC_FS
2380 	err = -ENOMEM;
2381 	if (!proc_net_fops_create(net, "ip_mr_vif", 0, &ipmr_vif_fops))
2382 		goto proc_vif_fail;
2383 	if (!proc_net_fops_create(net, "ip_mr_cache", 0, &ipmr_mfc_fops))
2384 		goto proc_cache_fail;
2385 #endif
2386 	return 0;
2387 
2388 #ifdef CONFIG_PROC_FS
2389 proc_cache_fail:
2390 	proc_net_remove(net, "ip_mr_vif");
2391 proc_vif_fail:
2392 	ipmr_rules_exit(net);
2393 #endif
2394 fail:
2395 	return err;
2396 }
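/*
 * Per-namespace setup: the multicast routing table(s) are created by
 * ipmr_rules_init() first, then the two proc entries.  The error labels
 * unwind in the opposite order, so a failure leaves the namespace with
 * nothing half-initialised.
 */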
2397 
2398 static void __net_exit ipmr_net_exit(struct net *net)
2399 {
2400 #ifdef CONFIG_PROC_FS
2401 	proc_net_remove(net, "ip_mr_cache");
2402 	proc_net_remove(net, "ip_mr_vif");
2403 #endif
2404 	ipmr_rules_exit(net);
2405 }
2406 
2407 static struct pernet_operations ipmr_net_ops = {
2408 	.init = ipmr_net_init,
2409 	.exit = ipmr_net_exit,
2410 };
2411 
2412 int __init ip_mr_init(void)
2413 {
2414 	int err;
2415 
2416 	mrt_cachep = kmem_cache_create("ip_mrt_cache",
2417 				       sizeof(struct mfc_cache),
2418 				       0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
2419 				       NULL);
2420 	if (!mrt_cachep)
2421 		return -ENOMEM;
2422 
2423 	err = register_pernet_subsys(&ipmr_net_ops);
2424 	if (err)
2425 		goto reg_pernet_fail;
2426 
2427 	err = register_netdevice_notifier(&ip_mr_notifier);
2428 	if (err)
2429 		goto reg_notif_fail;
2430 #ifdef CONFIG_IP_PIMSM_V2
2431 	if (inet_add_protocol(&pim_protocol, IPPROTO_PIM) < 0) {
2432 		printk(KERN_ERR "ip_mr_init: can't add PIM protocol\n");
2433 		err = -EAGAIN;
2434 		goto add_proto_fail;
2435 	}
2436 #endif
2437 	rtnl_register(RTNL_FAMILY_IPMR, RTM_GETROUTE, NULL, ipmr_rtm_dumproute);
2438 	return 0;
2439 
2440 #ifdef CONFIG_IP_PIMSM_V2
2441 add_proto_fail:
2442 	unregister_netdevice_notifier(&ip_mr_notifier);
2443 #endif
2444 reg_notif_fail:
2445 	unregister_pernet_subsys(&ipmr_net_ops);
2446 reg_pernet_fail:
2447 	kmem_cache_destroy(mrt_cachep);
2448 	return err;
2449 }
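/*
 * Initialisation order: the mfc slab cache is created first, then the pernet
 * subsystem is registered (running ipmr_net_init() for every existing and
 * future namespace), then the netdevice notifier, the optional PIMv2
 * protocol handler and finally the rtnetlink dump hook.  Each error label
 * unwinds exactly the steps that had already succeeded, in reverse order.
 */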
2450