xref: /linux/net/ipv4/fib_frontend.c (revision 3252b11fc4790d046b93f300c898df2f7cd7c176)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		IPv4 Forwarding Information Base: FIB frontend.
7  *
8  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
9  *
10  *		This program is free software; you can redistribute it and/or
11  *		modify it under the terms of the GNU General Public License
12  *		as published by the Free Software Foundation; either version
13  *		2 of the License, or (at your option) any later version.
14  */
15 
16 #include <linux/module.h>
17 #include <asm/uaccess.h>
18 #include <asm/system.h>
19 #include <linux/bitops.h>
20 #include <linux/capability.h>
21 #include <linux/types.h>
22 #include <linux/kernel.h>
23 #include <linux/mm.h>
24 #include <linux/string.h>
25 #include <linux/socket.h>
26 #include <linux/sockios.h>
27 #include <linux/errno.h>
28 #include <linux/in.h>
29 #include <linux/inet.h>
30 #include <linux/inetdevice.h>
31 #include <linux/netdevice.h>
32 #include <linux/if_addr.h>
33 #include <linux/if_arp.h>
34 #include <linux/skbuff.h>
35 #include <linux/init.h>
36 #include <linux/list.h>
37 
38 #include <net/ip.h>
39 #include <net/protocol.h>
40 #include <net/route.h>
41 #include <net/tcp.h>
42 #include <net/sock.h>
43 #include <net/arp.h>
44 #include <net/ip_fib.h>
45 #include <net/rtnetlink.h>
46 
47 #ifndef CONFIG_IP_MULTIPLE_TABLES
48 
49 static int __net_init fib4_rules_init(struct net *net)
50 {
51 	struct fib_table *local_table, *main_table;
52 
53 	local_table = fib_hash_table(RT_TABLE_LOCAL);
54 	if (local_table == NULL)
55 		return -ENOMEM;
56 
57 	main_table  = fib_hash_table(RT_TABLE_MAIN);
58 	if (main_table == NULL)
59 		goto fail;
60 
61 	hlist_add_head_rcu(&local_table->tb_hlist,
62 				&net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX]);
63 	hlist_add_head_rcu(&main_table->tb_hlist,
64 				&net->ipv4.fib_table_hash[TABLE_MAIN_INDEX]);
65 	return 0;
66 
67 fail:
68 	kfree(local_table);
69 	return -ENOMEM;
70 }
71 #else
72 
73 struct fib_table *fib_new_table(struct net *net, u32 id)
74 {
75 	struct fib_table *tb;
76 	unsigned int h;
77 
78 	if (id == 0)
79 		id = RT_TABLE_MAIN;
80 	tb = fib_get_table(net, id);
81 	if (tb)
82 		return tb;
83 
84 	tb = fib_hash_table(id);
85 	if (!tb)
86 		return NULL;
87 	h = id & (FIB_TABLE_HASHSZ - 1);
88 	hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]);
89 	return tb;
90 }
91 
92 struct fib_table *fib_get_table(struct net *net, u32 id)
93 {
94 	struct fib_table *tb;
95 	struct hlist_node *node;
96 	struct hlist_head *head;
97 	unsigned int h;
98 
99 	if (id == 0)
100 		id = RT_TABLE_MAIN;
101 	h = id & (FIB_TABLE_HASHSZ - 1);
102 
103 	rcu_read_lock();
104 	head = &net->ipv4.fib_table_hash[h];
105 	hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
106 		if (tb->tb_id == id) {
107 			rcu_read_unlock();
108 			return tb;
109 		}
110 	}
111 	rcu_read_unlock();
112 	return NULL;
113 }
114 #endif /* CONFIG_IP_MULTIPLE_TABLES */
115 
116 void fib_select_default(struct net *net,
117 			const struct flowi *flp, struct fib_result *res)
118 {
119 	struct fib_table *tb;
120 	int table = RT_TABLE_MAIN;
121 #ifdef CONFIG_IP_MULTIPLE_TABLES
122 	if (res->r == NULL || res->r->action != FR_ACT_TO_TBL)
123 		return;
124 	table = res->r->table;
125 #endif
126 	tb = fib_get_table(net, table);
127 	if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
128 		fib_table_select_default(tb, flp, res);
129 }
130 
131 static void fib_flush(struct net *net)
132 {
133 	int flushed = 0;
134 	struct fib_table *tb;
135 	struct hlist_node *node;
136 	struct hlist_head *head;
137 	unsigned int h;
138 
139 	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
140 		head = &net->ipv4.fib_table_hash[h];
141 		hlist_for_each_entry(tb, node, head, tb_hlist)
142 			flushed += fib_table_flush(tb);
143 	}
144 
145 	if (flushed)
146 		rt_cache_flush(net, -1);
147 }
148 
149 /*
150  *	Find the first device with a given source address.
151  */
152 
153 struct net_device * ip_dev_find(struct net *net, __be32 addr)
154 {
155 	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } };
156 	struct fib_result res;
157 	struct net_device *dev = NULL;
158 	struct fib_table *local_table;
159 
160 #ifdef CONFIG_IP_MULTIPLE_TABLES
161 	res.r = NULL;
162 #endif
163 
164 	local_table = fib_get_table(net, RT_TABLE_LOCAL);
165 	if (!local_table || fib_table_lookup(local_table, &fl, &res))
166 		return NULL;
167 	if (res.type != RTN_LOCAL)
168 		goto out;
169 	dev = FIB_RES_DEV(res);
170 
171 	if (dev)
172 		dev_hold(dev);
173 out:
174 	fib_res_put(&res);
175 	return dev;
176 }
177 
178 /*
179  * Find address type as if only "dev" was present in the system. If
180  * on_dev is NULL then all interfaces are taken into consideration.
181  */
182 static inline unsigned __inet_dev_addr_type(struct net *net,
183 					    const struct net_device *dev,
184 					    __be32 addr)
185 {
186 	struct flowi		fl = { .nl_u = { .ip4_u = { .daddr = addr } } };
187 	struct fib_result	res;
188 	unsigned ret = RTN_BROADCAST;
189 	struct fib_table *local_table;
190 
191 	if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr))
192 		return RTN_BROADCAST;
193 	if (ipv4_is_multicast(addr))
194 		return RTN_MULTICAST;
195 
196 #ifdef CONFIG_IP_MULTIPLE_TABLES
197 	res.r = NULL;
198 #endif
199 
200 	local_table = fib_get_table(net, RT_TABLE_LOCAL);
201 	if (local_table) {
202 		ret = RTN_UNICAST;
203 		if (!fib_table_lookup(local_table, &fl, &res)) {
204 			if (!dev || dev == res.fi->fib_dev)
205 				ret = res.type;
206 			fib_res_put(&res);
207 		}
208 	}
209 	return ret;
210 }
211 
212 unsigned int inet_addr_type(struct net *net, __be32 addr)
213 {
214 	return __inet_dev_addr_type(net, NULL, addr);
215 }
216 
217 unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
218 				__be32 addr)
219 {
220        return __inet_dev_addr_type(net, dev, addr);
221 }
222 
/* Given (packet source, input interface) and optional (dst, oif, tos):
   - (main) check that the source is valid, i.e. not broadcast or our local
     address.
   - figure out which "logical" interface this packet arrived on
     and calculate the "specific destination" address.
   - check that the packet arrived from the expected physical interface.

   Returns -EINVAL when the source is rejected.  Otherwise returns 0 or 1,
   where 1 means the nexthop scope of the reverse route is at least
   RT_SCOPE_HOST.  *spec_dst is written on every non-error return; *itag is
   passed to fib_combine_itag() after the first successful lookup and is
   zeroed on the last_resort path.
 */

int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
			struct net_device *dev, __be32 *spec_dst,
			u32 *itag, u32 mark)
{
	struct in_device *in_dev;
	/* Reverse-path key: route back towards the sender (daddr = src,
	 * saddr = dst), with the caller's oif used as input interface.
	 */
	struct flowi fl = { .nl_u = { .ip4_u =
				      { .daddr = src,
					.saddr = dst,
					.tos = tos } },
			    .mark = mark,
			    .iif = oif };

	struct fib_result res;
	int no_addr, rpf, accept_local;
	int ret;
	struct net *net;

	/* Copy the per-device settings under RCU; after the unlock in_dev is
	 * only compared against NULL, never dereferenced.
	 */
	no_addr = rpf = accept_local = 0;
	rcu_read_lock();
	in_dev = __in_dev_get_rcu(dev);
	if (in_dev) {
		no_addr = in_dev->ifa_list == NULL;
		rpf = IN_DEV_RPFILTER(in_dev);
		accept_local = IN_DEV_ACCEPT_LOCAL(in_dev);
	}
	rcu_read_unlock();

	if (in_dev == NULL)
		goto e_inval;

	net = dev_net(dev);
	if (fib_lookup(net, &fl, &res))
		goto last_resort;
	/* A non-unicast reverse route is tolerated only when it is a local
	 * route and accept_local is set on the device.
	 */
	if (res.type != RTN_UNICAST) {
		if (res.type != RTN_LOCAL || !accept_local)
			goto e_inval_res;
	}
	*spec_dst = FIB_RES_PREFSRC(res);
	fib_combine_itag(itag, &res);
	/* Accept if the reverse route leaves through the receiving device;
	 * with multipath, a route with more than one nexthop is accepted too.
	 */
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (FIB_RES_DEV(res) == dev || res.fi->fib_nhs > 1)
#else
	if (FIB_RES_DEV(res) == dev)
#endif
	{
		ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
		fib_res_put(&res);
		return ret;
	}
	fib_res_put(&res);
	/* The reverse route points at a different device. */
	if (no_addr)
		goto last_resort;
	if (rpf == 1)
		goto e_inval;
	/* Retry the lookup constrained to the receiving device. */
	fl.oif = dev->ifindex;

	ret = 0;
	if (fib_lookup(net, &fl, &res) == 0) {
		if (res.type == RTN_UNICAST) {
			*spec_dst = FIB_RES_PREFSRC(res);
			ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
		}
		fib_res_put(&res);
	}
	return ret;

	/* No usable reverse route: reject if any rp_filter value is set,
	 * otherwise let inet_select_addr() choose the specific destination.
	 */
last_resort:
	if (rpf)
		goto e_inval;
	*spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	*itag = 0;
	return 0;

	/* e_inval_res additionally releases the lookup result still held. */
e_inval_res:
	fib_res_put(&res);
e_inval:
	return -EINVAL;
}
309 
310 static inline __be32 sk_extract_addr(struct sockaddr *addr)
311 {
312 	return ((struct sockaddr_in *) addr)->sin_addr.s_addr;
313 }
314 
315 static int put_rtax(struct nlattr *mx, int len, int type, u32 value)
316 {
317 	struct nlattr *nla;
318 
319 	nla = (struct nlattr *) ((char *) mx + len);
320 	nla->nla_type = type;
321 	nla->nla_len = nla_attr_size(4);
322 	*(u32 *) nla_data(nla) = value;
323 
324 	return len + nla_total_size(4);
325 }
326 
/*
 * Translate a legacy ioctl route request (struct rtentry) into a
 * struct fib_config for namespace net.
 *
 * cmd is the ioctl number: for SIOCDELRT the NLM_F_CREATE / RTPROT_BOOT
 * defaults are not applied and processing stops after the gateway has
 * been evaluated (no RTF_GATEWAY check, no metrics).
 *
 * Returns 0 on success or a negative errno (-EAFNOSUPPORT, -EINVAL,
 * -EFAULT, -ENODEV, -ENOMEM).  On success cfg->fc_mx may point to a
 * kzalloc()ed metrics buffer, which the caller has to kfree().
 */
static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
				 struct fib_config *cfg)
{
	__be32 addr;
	int plen;

	memset(cfg, 0, sizeof(*cfg));
	cfg->fc_nlinfo.nl_net = net;

	if (rt->rt_dst.sa_family != AF_INET)
		return -EAFNOSUPPORT;

	/*
	 * Check mask for validity:
	 * a) it must be contiguous.
	 * b) destination must have all host bits clear.
	 * c) if application forgot to set correct family (AF_INET),
	 *    reject request unless it is absolutely clear i.e.
	 *    both family and mask are zero.
	 */
	/* Host routes (RTF_HOST) keep the full /32 and ignore rt_genmask. */
	plen = 32;
	addr = sk_extract_addr(&rt->rt_dst);
	if (!(rt->rt_flags & RTF_HOST)) {
		__be32 mask = sk_extract_addr(&rt->rt_genmask);

		if (rt->rt_genmask.sa_family != AF_INET) {
			if (mask || rt->rt_genmask.sa_family)
				return -EAFNOSUPPORT;
		}

		if (bad_mask(mask, addr))
			return -EINVAL;

		plen = inet_mask_len(mask);
	}

	cfg->fc_dst_len = plen;
	cfg->fc_dst = addr;

	if (cmd != SIOCDELRT) {
		cfg->fc_nlflags = NLM_F_CREATE;
		cfg->fc_protocol = RTPROT_BOOT;
	}

	/* A non-zero ioctl metric is stored as priority = metric - 1. */
	if (rt->rt_metric)
		cfg->fc_priority = rt->rt_metric - 1;

	/* Reject routes need no device, gateway or metrics. */
	if (rt->rt_flags & RTF_REJECT) {
		cfg->fc_scope = RT_SCOPE_HOST;
		cfg->fc_type = RTN_UNREACHABLE;
		return 0;
	}

	/* RT_SCOPE_NOWHERE serves as "not decided yet"; see below. */
	cfg->fc_scope = RT_SCOPE_NOWHERE;
	cfg->fc_type = RTN_UNICAST;

	if (rt->rt_dev) {
		char *colon;
		struct net_device *dev;
		char devname[IFNAMSIZ];

		/* rt_dev is a user-space pointer: copy at most IFNAMSIZ-1
		 * bytes and terminate the name ourselves.
		 */
		if (copy_from_user(devname, rt->rt_dev, IFNAMSIZ-1))
			return -EFAULT;

		devname[IFNAMSIZ-1] = 0;
		/* "dev:label" form: cut at the colon to find the device. */
		colon = strchr(devname, ':');
		if (colon)
			*colon = 0;
		dev = __dev_get_by_name(net, devname);
		if (!dev)
			return -ENODEV;
		cfg->fc_oif = dev->ifindex;
		if (colon) {
			struct in_ifaddr *ifa;
			struct in_device *in_dev = __in_dev_get_rtnl(dev);
			if (!in_dev)
				return -ENODEV;
			/* Restore the full name and find the address whose
			 * label matches it; its local address becomes the
			 * preferred source.
			 */
			*colon = ':';
			for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next)
				if (strcmp(ifa->ifa_label, devname) == 0)
					break;
			if (ifa == NULL)
				return -ENODEV;
			cfg->fc_prefsrc = ifa->ifa_local;
		}
	}

	addr = sk_extract_addr(&rt->rt_gateway);
	if (rt->rt_gateway.sa_family == AF_INET && addr) {
		cfg->fc_gw = addr;
		/* A unicast gateway with RTF_GATEWAY makes the route global. */
		if (rt->rt_flags & RTF_GATEWAY &&
		    inet_addr_type(net, addr) == RTN_UNICAST)
			cfg->fc_scope = RT_SCOPE_UNIVERSE;
	}

	/* Deletion needs nothing beyond this point. */
	if (cmd == SIOCDELRT)
		return 0;

	if (rt->rt_flags & RTF_GATEWAY && !cfg->fc_gw)
		return -EINVAL;

	/* Scope still undecided: default to a link-scope route. */
	if (cfg->fc_scope == RT_SCOPE_NOWHERE)
		cfg->fc_scope = RT_SCOPE_LINK;

	if (rt->rt_flags & (RTF_MTU | RTF_WINDOW | RTF_IRTT)) {
		struct nlattr *mx;
		int len = 0;

		/* Room for up to three u32 attributes, one per flag. */
		mx = kzalloc(3 * nla_total_size(4), GFP_KERNEL);
		if (mx == NULL)
			return -ENOMEM;

		/* Advertised MSS is the MTU minus 40 (presumably the IPv4
		 * plus TCP header size - confirm).
		 */
		if (rt->rt_flags & RTF_MTU)
			len = put_rtax(mx, len, RTAX_ADVMSS, rt->rt_mtu - 40);

		if (rt->rt_flags & RTF_WINDOW)
			len = put_rtax(mx, len, RTAX_WINDOW, rt->rt_window);

		/* The initial RTT is stored multiplied by 8. */
		if (rt->rt_flags & RTF_IRTT)
			len = put_rtax(mx, len, RTAX_RTT, rt->rt_irtt << 3);

		cfg->fc_mx = mx;
		cfg->fc_mx_len = len;
	}

	return 0;
}
454 
455 /*
456  *	Handle IP routing ioctl calls. These are used to manipulate the routing tables
457  */
458 
459 int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
460 {
461 	struct fib_config cfg;
462 	struct rtentry rt;
463 	int err;
464 
465 	switch (cmd) {
466 	case SIOCADDRT:		/* Add a route */
467 	case SIOCDELRT:		/* Delete a route */
468 		if (!capable(CAP_NET_ADMIN))
469 			return -EPERM;
470 
471 		if (copy_from_user(&rt, arg, sizeof(rt)))
472 			return -EFAULT;
473 
474 		rtnl_lock();
475 		err = rtentry_to_fib_config(net, cmd, &rt, &cfg);
476 		if (err == 0) {
477 			struct fib_table *tb;
478 
479 			if (cmd == SIOCDELRT) {
480 				tb = fib_get_table(net, cfg.fc_table);
481 				if (tb)
482 					err = fib_table_delete(tb, &cfg);
483 				else
484 					err = -ESRCH;
485 			} else {
486 				tb = fib_new_table(net, cfg.fc_table);
487 				if (tb)
488 					err = fib_table_insert(tb, &cfg);
489 				else
490 					err = -ENOBUFS;
491 			}
492 
493 			/* allocated by rtentry_to_fib_config() */
494 			kfree(cfg.fc_mx);
495 		}
496 		rtnl_unlock();
497 		return err;
498 	}
499 	return -EINVAL;
500 }
501 
502 const struct nla_policy rtm_ipv4_policy[RTA_MAX+1] = {
503 	[RTA_DST]		= { .type = NLA_U32 },
504 	[RTA_SRC]		= { .type = NLA_U32 },
505 	[RTA_IIF]		= { .type = NLA_U32 },
506 	[RTA_OIF]		= { .type = NLA_U32 },
507 	[RTA_GATEWAY]		= { .type = NLA_U32 },
508 	[RTA_PRIORITY]		= { .type = NLA_U32 },
509 	[RTA_PREFSRC]		= { .type = NLA_U32 },
510 	[RTA_METRICS]		= { .type = NLA_NESTED },
511 	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
512 	[RTA_FLOW]		= { .type = NLA_U32 },
513 };
514 
515 static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
516 			    struct nlmsghdr *nlh, struct fib_config *cfg)
517 {
518 	struct nlattr *attr;
519 	int err, remaining;
520 	struct rtmsg *rtm;
521 
522 	err = nlmsg_validate(nlh, sizeof(*rtm), RTA_MAX, rtm_ipv4_policy);
523 	if (err < 0)
524 		goto errout;
525 
526 	memset(cfg, 0, sizeof(*cfg));
527 
528 	rtm = nlmsg_data(nlh);
529 	cfg->fc_dst_len = rtm->rtm_dst_len;
530 	cfg->fc_tos = rtm->rtm_tos;
531 	cfg->fc_table = rtm->rtm_table;
532 	cfg->fc_protocol = rtm->rtm_protocol;
533 	cfg->fc_scope = rtm->rtm_scope;
534 	cfg->fc_type = rtm->rtm_type;
535 	cfg->fc_flags = rtm->rtm_flags;
536 	cfg->fc_nlflags = nlh->nlmsg_flags;
537 
538 	cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
539 	cfg->fc_nlinfo.nlh = nlh;
540 	cfg->fc_nlinfo.nl_net = net;
541 
542 	if (cfg->fc_type > RTN_MAX) {
543 		err = -EINVAL;
544 		goto errout;
545 	}
546 
547 	nlmsg_for_each_attr(attr, nlh, sizeof(struct rtmsg), remaining) {
548 		switch (nla_type(attr)) {
549 		case RTA_DST:
550 			cfg->fc_dst = nla_get_be32(attr);
551 			break;
552 		case RTA_OIF:
553 			cfg->fc_oif = nla_get_u32(attr);
554 			break;
555 		case RTA_GATEWAY:
556 			cfg->fc_gw = nla_get_be32(attr);
557 			break;
558 		case RTA_PRIORITY:
559 			cfg->fc_priority = nla_get_u32(attr);
560 			break;
561 		case RTA_PREFSRC:
562 			cfg->fc_prefsrc = nla_get_be32(attr);
563 			break;
564 		case RTA_METRICS:
565 			cfg->fc_mx = nla_data(attr);
566 			cfg->fc_mx_len = nla_len(attr);
567 			break;
568 		case RTA_MULTIPATH:
569 			cfg->fc_mp = nla_data(attr);
570 			cfg->fc_mp_len = nla_len(attr);
571 			break;
572 		case RTA_FLOW:
573 			cfg->fc_flow = nla_get_u32(attr);
574 			break;
575 		case RTA_TABLE:
576 			cfg->fc_table = nla_get_u32(attr);
577 			break;
578 		}
579 	}
580 
581 	return 0;
582 errout:
583 	return err;
584 }
585 
586 static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
587 {
588 	struct net *net = sock_net(skb->sk);
589 	struct fib_config cfg;
590 	struct fib_table *tb;
591 	int err;
592 
593 	err = rtm_to_fib_config(net, skb, nlh, &cfg);
594 	if (err < 0)
595 		goto errout;
596 
597 	tb = fib_get_table(net, cfg.fc_table);
598 	if (tb == NULL) {
599 		err = -ESRCH;
600 		goto errout;
601 	}
602 
603 	err = fib_table_delete(tb, &cfg);
604 errout:
605 	return err;
606 }
607 
608 static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
609 {
610 	struct net *net = sock_net(skb->sk);
611 	struct fib_config cfg;
612 	struct fib_table *tb;
613 	int err;
614 
615 	err = rtm_to_fib_config(net, skb, nlh, &cfg);
616 	if (err < 0)
617 		goto errout;
618 
619 	tb = fib_new_table(net, cfg.fc_table);
620 	if (tb == NULL) {
621 		err = -ENOBUFS;
622 		goto errout;
623 	}
624 
625 	err = fib_table_insert(tb, &cfg);
626 errout:
627 	return err;
628 }
629 
/*
 * Netlink dump callback for IPv4 routes.
 *
 * A request whose rtmsg carries RTM_F_CLONED is handed to ip_rt_dump().
 * Otherwise all FIB tables of the namespace are dumped in hash order.
 * The position is kept in cb->args[0] (hash bucket) and cb->args[1]
 * (table index within that bucket) so a later invocation can resume.
 *
 * Returns skb->len, or ip_rt_dump()'s result for the cloned case.
 */
static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	unsigned int h, s_h;
	unsigned int e = 0, s_e;
	struct fib_table *tb;
	struct hlist_node *node;
	struct hlist_head *head;
	int dumped = 0;

	if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) &&
	    ((struct rtmsg *) nlmsg_data(cb->nlh))->rtm_flags & RTM_F_CLONED)
		return ip_rt_dump(skb, cb);

	/* Resume point saved by the previous invocation. */
	s_h = cb->args[0];
	s_e = cb->args[1];

	/* s_e only applies to the first bucket visited; it is reset to 0
	 * for every following bucket.
	 */
	for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) {
		e = 0;
		head = &net->ipv4.fib_table_hash[h];
		hlist_for_each_entry(tb, node, head, tb_hlist) {
			/* Skip tables already completed earlier. */
			if (e < s_e)
				goto next;
			/* A table was already dumped in this call: clear
			 * cb->args[2..] before starting the next one
			 * (presumably fib_table_dump() keeps its own
			 * position there - confirm).
			 */
			if (dumped)
				memset(&cb->args[2], 0, sizeof(cb->args) -
						 2 * sizeof(cb->args[0]));
			/* Negative result: stop and remember where we are. */
			if (fib_table_dump(tb, skb, cb) < 0)
				goto out;
			dumped = 1;
next:
			e++;
		}
	}
out:
	cb->args[1] = e;
	cb->args[0] = h;

	return skb->len;
}
669 
670 /* Prepare and feed intra-kernel routing request.
671    Really, it should be netlink message, but :-( netlink
672    can be not configured, so that we feed it directly
673    to fib engine. It is legal, because all events occur
674    only when netlink is already locked.
675  */
676 
677 static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa)
678 {
679 	struct net *net = dev_net(ifa->ifa_dev->dev);
680 	struct fib_table *tb;
681 	struct fib_config cfg = {
682 		.fc_protocol = RTPROT_KERNEL,
683 		.fc_type = type,
684 		.fc_dst = dst,
685 		.fc_dst_len = dst_len,
686 		.fc_prefsrc = ifa->ifa_local,
687 		.fc_oif = ifa->ifa_dev->dev->ifindex,
688 		.fc_nlflags = NLM_F_CREATE | NLM_F_APPEND,
689 		.fc_nlinfo = {
690 			.nl_net = net,
691 		},
692 	};
693 
694 	if (type == RTN_UNICAST)
695 		tb = fib_new_table(net, RT_TABLE_MAIN);
696 	else
697 		tb = fib_new_table(net, RT_TABLE_LOCAL);
698 
699 	if (tb == NULL)
700 		return;
701 
702 	cfg.fc_table = tb->tb_id;
703 
704 	if (type != RTN_LOCAL)
705 		cfg.fc_scope = RT_SCOPE_LINK;
706 	else
707 		cfg.fc_scope = RT_SCOPE_HOST;
708 
709 	if (cmd == RTM_NEWROUTE)
710 		fib_table_insert(tb, &cfg);
711 	else
712 		fib_table_delete(tb, &cfg);
713 }
714 
715 void fib_add_ifaddr(struct in_ifaddr *ifa)
716 {
717 	struct in_device *in_dev = ifa->ifa_dev;
718 	struct net_device *dev = in_dev->dev;
719 	struct in_ifaddr *prim = ifa;
720 	__be32 mask = ifa->ifa_mask;
721 	__be32 addr = ifa->ifa_local;
722 	__be32 prefix = ifa->ifa_address&mask;
723 
724 	if (ifa->ifa_flags&IFA_F_SECONDARY) {
725 		prim = inet_ifa_byprefix(in_dev, prefix, mask);
726 		if (prim == NULL) {
727 			printk(KERN_WARNING "fib_add_ifaddr: bug: prim == NULL\n");
728 			return;
729 		}
730 	}
731 
732 	fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim);
733 
734 	if (!(dev->flags&IFF_UP))
735 		return;
736 
737 	/* Add broadcast address, if it is explicitly assigned. */
738 	if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF))
739 		fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
740 
741 	if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags&IFA_F_SECONDARY) &&
742 	    (prefix != addr || ifa->ifa_prefixlen < 32)) {
743 		fib_magic(RTM_NEWROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL :
744 			  RTN_UNICAST, prefix, ifa->ifa_prefixlen, prim);
745 
746 		/* Add network specific broadcasts, when it takes a sense */
747 		if (ifa->ifa_prefixlen < 31) {
748 			fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim);
749 			fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix|~mask, 32, prim);
750 		}
751 	}
752 }
753 
/*
 * Remove the routes that were installed for ifa, but keep any route that
 * is still accounted for by an address found on the device's address list.
 */
static void fib_del_ifaddr(struct in_ifaddr *ifa)
{
	struct in_device *in_dev = ifa->ifa_dev;
	struct net_device *dev = in_dev->dev;
	struct in_ifaddr *ifa1;
	struct in_ifaddr *prim = ifa;
	/* brd: all host bits set; any: all host bits clear. */
	__be32 brd = ifa->ifa_address|~ifa->ifa_mask;
	__be32 any = ifa->ifa_address&ifa->ifa_mask;
	/* "Still in use" flags collected while scanning the address list:
	 *   LOCAL_OK - a listed address has the same ifa_local
	 *   BRD_OK   - a listed address has the same ifa_broadcast
	 *   BRD0_OK  - a listed address has "any" as its broadcast
	 *   BRD1_OK  - a listed address has "brd" as its broadcast
	 */
#define LOCAL_OK	1
#define BRD_OK		2
#define BRD0_OK		4
#define BRD1_OK		8
	unsigned ok = 0;

	/* A primary address takes its prefix route with it; a secondary one
	 * only needs its primary, used as preferred source below.
	 */
	if (!(ifa->ifa_flags&IFA_F_SECONDARY))
		fib_magic(RTM_DELROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL :
			  RTN_UNICAST, any, ifa->ifa_prefixlen, prim);
	else {
		prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
		if (prim == NULL) {
			printk(KERN_WARNING "fib_del_ifaddr: bug: prim == NULL\n");
			return;
		}
	}

	/* Deletion is more complicated than addition.
	   We must take care not to delete too much :-)

	   Scan the address list to be sure that the addresses are really gone.
	 */

	for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) {
		if (ifa->ifa_local == ifa1->ifa_local)
			ok |= LOCAL_OK;
		if (ifa->ifa_broadcast == ifa1->ifa_broadcast)
			ok |= BRD_OK;
		if (brd == ifa1->ifa_broadcast)
			ok |= BRD1_OK;
		if (any == ifa1->ifa_broadcast)
			ok |= BRD0_OK;
	}

	/* Delete only the routes nobody on the list accounts for. */
	if (!(ok&BRD_OK))
		fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
	if (!(ok&BRD1_OK))
		fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim);
	if (!(ok&BRD0_OK))
		fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim);
	if (!(ok&LOCAL_OK)) {
		fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim);

		/* Check that this local address has finally disappeared. */
		if (inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) {
			/* And the last, but not the least thing.
			   We must flush stray FIB entries.

			   First of all, we scan the fib_info list searching
			   for stray nexthop entries, then trigger fib_flush.
			*/
			if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local))
				fib_flush(dev_net(dev));
		}
	}
#undef LOCAL_OK
#undef BRD_OK
#undef BRD0_OK
#undef BRD1_OK
}
822 
823 static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb )
824 {
825 
826 	struct fib_result       res;
827 	struct flowi            fl = { .mark = frn->fl_mark,
828 				       .nl_u = { .ip4_u = { .daddr = frn->fl_addr,
829 							    .tos = frn->fl_tos,
830 							    .scope = frn->fl_scope } } };
831 
832 #ifdef CONFIG_IP_MULTIPLE_TABLES
833 	res.r = NULL;
834 #endif
835 
836 	frn->err = -ENOENT;
837 	if (tb) {
838 		local_bh_disable();
839 
840 		frn->tb_id = tb->tb_id;
841 		frn->err = fib_table_lookup(tb, &fl, &res);
842 
843 		if (!frn->err) {
844 			frn->prefixlen = res.prefixlen;
845 			frn->nh_sel = res.nh_sel;
846 			frn->type = res.type;
847 			frn->scope = res.scope;
848 			fib_res_put(&res);
849 		}
850 		local_bh_enable();
851 	}
852 }
853 
854 static void nl_fib_input(struct sk_buff *skb)
855 {
856 	struct net *net;
857 	struct fib_result_nl *frn;
858 	struct nlmsghdr *nlh;
859 	struct fib_table *tb;
860 	u32 pid;
861 
862 	net = sock_net(skb->sk);
863 	nlh = nlmsg_hdr(skb);
864 	if (skb->len < NLMSG_SPACE(0) || skb->len < nlh->nlmsg_len ||
865 	    nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*frn)))
866 		return;
867 
868 	skb = skb_clone(skb, GFP_KERNEL);
869 	if (skb == NULL)
870 		return;
871 	nlh = nlmsg_hdr(skb);
872 
873 	frn = (struct fib_result_nl *) NLMSG_DATA(nlh);
874 	tb = fib_get_table(net, frn->tb_id_in);
875 
876 	nl_fib_lookup(frn, tb);
877 
878 	pid = NETLINK_CB(skb).pid;       /* pid of sending process */
879 	NETLINK_CB(skb).pid = 0;         /* from kernel */
880 	NETLINK_CB(skb).dst_group = 0;  /* unicast */
881 	netlink_unicast(net->ipv4.fibnl, skb, pid, MSG_DONTWAIT);
882 }
883 
884 static int nl_fib_lookup_init(struct net *net)
885 {
886 	struct sock *sk;
887 	sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, 0,
888 				   nl_fib_input, NULL, THIS_MODULE);
889 	if (sk == NULL)
890 		return -EAFNOSUPPORT;
891 	net->ipv4.fibnl = sk;
892 	return 0;
893 }
894 
895 static void nl_fib_lookup_exit(struct net *net)
896 {
897 	netlink_kernel_release(net->ipv4.fibnl);
898 	net->ipv4.fibnl = NULL;
899 }
900 
/*
 * Take IPv4 routing through dev out of service: sync the FIB down for the
 * device (flushing the tables if that reports a change), flush the routing
 * cache with the given delay, and notify ARP that the device went down.
 * force is passed through to fib_sync_down_dev().
 */
static void fib_disable_ip(struct net_device *dev, int force, int delay)
{
	int changed = fib_sync_down_dev(dev, force);

	if (changed)
		fib_flush(dev_net(dev));

	rt_cache_flush(dev_net(dev), delay);
	arp_ifdown(dev);
}
908 
909 static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr)
910 {
911 	struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
912 	struct net_device *dev = ifa->ifa_dev->dev;
913 
914 	switch (event) {
915 	case NETDEV_UP:
916 		fib_add_ifaddr(ifa);
917 #ifdef CONFIG_IP_ROUTE_MULTIPATH
918 		fib_sync_up(dev);
919 #endif
920 		rt_cache_flush(dev_net(dev), -1);
921 		break;
922 	case NETDEV_DOWN:
923 		fib_del_ifaddr(ifa);
924 		if (ifa->ifa_dev->ifa_list == NULL) {
925 			/* Last address was deleted from this interface.
926 			   Disable IP.
927 			 */
928 			fib_disable_ip(dev, 1, 0);
929 		} else {
930 			rt_cache_flush(dev_net(dev), -1);
931 		}
932 		break;
933 	}
934 	return NOTIFY_DONE;
935 }
936 
937 static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
938 {
939 	struct net_device *dev = ptr;
940 	struct in_device *in_dev = __in_dev_get_rtnl(dev);
941 
942 	if (event == NETDEV_UNREGISTER) {
943 		fib_disable_ip(dev, 2, -1);
944 		return NOTIFY_DONE;
945 	}
946 
947 	if (!in_dev)
948 		return NOTIFY_DONE;
949 
950 	switch (event) {
951 	case NETDEV_UP:
952 		for_ifa(in_dev) {
953 			fib_add_ifaddr(ifa);
954 		} endfor_ifa(in_dev);
955 #ifdef CONFIG_IP_ROUTE_MULTIPATH
956 		fib_sync_up(dev);
957 #endif
958 		rt_cache_flush(dev_net(dev), -1);
959 		break;
960 	case NETDEV_DOWN:
961 		fib_disable_ip(dev, 0, 0);
962 		break;
963 	case NETDEV_CHANGEMTU:
964 	case NETDEV_CHANGE:
965 		rt_cache_flush(dev_net(dev), 0);
966 		break;
967 	case NETDEV_UNREGISTER_BATCH:
968 		rt_cache_flush_batch();
969 		break;
970 	}
971 	return NOTIFY_DONE;
972 }
973 
/* Notifier block for IPv4 address events; registered in ip_fib_init(). */
static struct notifier_block fib_inetaddr_notifier = {
	.notifier_call = fib_inetaddr_event,
};
977 
/* Notifier block for network device events; registered in ip_fib_init(). */
static struct notifier_block fib_netdev_notifier = {
	.notifier_call = fib_netdev_event,
};
981 
982 static int __net_init ip_fib_net_init(struct net *net)
983 {
984 	int err;
985 	unsigned int i;
986 
987 	net->ipv4.fib_table_hash = kzalloc(
988 			sizeof(struct hlist_head)*FIB_TABLE_HASHSZ, GFP_KERNEL);
989 	if (net->ipv4.fib_table_hash == NULL)
990 		return -ENOMEM;
991 
992 	for (i = 0; i < FIB_TABLE_HASHSZ; i++)
993 		INIT_HLIST_HEAD(&net->ipv4.fib_table_hash[i]);
994 
995 	err = fib4_rules_init(net);
996 	if (err < 0)
997 		goto fail;
998 	return 0;
999 
1000 fail:
1001 	kfree(net->ipv4.fib_table_hash);
1002 	return err;
1003 }
1004 
1005 static void __net_exit ip_fib_net_exit(struct net *net)
1006 {
1007 	unsigned int i;
1008 
1009 #ifdef CONFIG_IP_MULTIPLE_TABLES
1010 	fib4_rules_exit(net);
1011 #endif
1012 
1013 	for (i = 0; i < FIB_TABLE_HASHSZ; i++) {
1014 		struct fib_table *tb;
1015 		struct hlist_head *head;
1016 		struct hlist_node *node, *tmp;
1017 
1018 		head = &net->ipv4.fib_table_hash[i];
1019 		hlist_for_each_entry_safe(tb, node, tmp, head, tb_hlist) {
1020 			hlist_del(node);
1021 			fib_table_flush(tb);
1022 			kfree(tb);
1023 		}
1024 	}
1025 	kfree(net->ipv4.fib_table_hash);
1026 }
1027 
1028 static int __net_init fib_net_init(struct net *net)
1029 {
1030 	int error;
1031 
1032 	error = ip_fib_net_init(net);
1033 	if (error < 0)
1034 		goto out;
1035 	error = nl_fib_lookup_init(net);
1036 	if (error < 0)
1037 		goto out_nlfl;
1038 	error = fib_proc_init(net);
1039 	if (error < 0)
1040 		goto out_proc;
1041 out:
1042 	return error;
1043 
1044 out_proc:
1045 	nl_fib_lookup_exit(net);
1046 out_nlfl:
1047 	ip_fib_net_exit(net);
1048 	goto out;
1049 }
1050 
/*
 * Per-namespace teardown: undo the three steps of fib_net_init() in
 * reverse order (proc entries, lookup netlink socket, FIB tables).
 */
static void __net_exit fib_net_exit(struct net *net)
{
	fib_proc_exit(net);
	nl_fib_lookup_exit(net);
	ip_fib_net_exit(net);
}
1057 
/* Per-namespace init/exit hooks; registered in ip_fib_init(). */
static struct pernet_operations fib_net_ops = {
	.init = fib_net_init,
	.exit = fib_net_exit,
};
1062 
/*
 * One-time initialisation of the IPv4 FIB frontend: register the
 * rtnetlink route handlers, the per-namespace operations and the device
 * and address notifiers, then call fib_hash_init().
 */
void __init ip_fib_init(void)
{
	/* RTM_GETROUTE only gets a dump handler here (doit is NULL). */
	rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL);
	rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL);
	rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib);

	register_pernet_subsys(&fib_net_ops);
	register_netdevice_notifier(&fib_netdev_notifier);
	register_inetaddr_notifier(&fib_inetaddr_notifier);

	fib_hash_init();
}
1075 
/* Address classification and device lookup helpers exported to modules. */
EXPORT_SYMBOL(inet_addr_type);
EXPORT_SYMBOL(inet_dev_addr_type);
EXPORT_SYMBOL(ip_dev_find);
1079