xref: /linux/net/ipv4/fib_semantics.c (revision a115bc070b1fc57ab23f3972401425927b5b465c)
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IPv4 Forwarding Information Base: semantics.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/skbuff.h>
#include <linux/init.h>

#include <net/arp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/netlink.h>
#include <net/nexthop.h>

#include "fib_lookup.h"

static DEFINE_SPINLOCK(fib_info_lock);
static struct hlist_head *fib_info_hash;
static struct hlist_head *fib_info_laddrhash;
static unsigned int fib_hash_size;
static unsigned int fib_info_cnt;

#define DEVINDEX_HASHBITS 8
#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];

#ifdef CONFIG_IP_ROUTE_MULTIPATH

static DEFINE_SPINLOCK(fib_multipath_lock);

#define for_nexthops(fi) {						\
	int nhsel; const struct fib_nh *nh;				\
	for (nhsel = 0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs;	\
	     nh++, nhsel++)

#define change_nexthops(fi) {						\
	int nhsel; struct fib_nh *nexthop_nh;				\
	for (nhsel = 0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh);	\
	     nhsel < (fi)->fib_nhs; nexthop_nh++, nhsel++)

#else /* CONFIG_IP_ROUTE_MULTIPATH */

/* Hope that gcc will optimize away the dummy loop. */

#define for_nexthops(fi) {						\
	int nhsel; const struct fib_nh *nh = (fi)->fib_nh;		\
	for (nhsel = 0; nhsel < 1; nhsel++)

#define change_nexthops(fi) {						\
	int nhsel;							\
	struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh);	\
	for (nhsel = 0; nhsel < 1; nhsel++)

#endif /* CONFIG_IP_ROUTE_MULTIPATH */

#define endfor_nexthops(fi) }
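
/*
 * Usage sketch (hypothetical helper, assuming only the macros above):
 * for_nexthops() opens a block and a loop, so every use must be closed
 * with endfor_nexthops(), which supplies the matching brace.
 */
#if 0
static int example_count_alive(const struct fib_info *fi)
{
	int alive = 0;

	for_nexthops(fi) {
		if (!(nh->nh_flags & RTNH_F_DEAD))
			alive++;
	} endfor_nexthops(fi);

	return alive;
}
#endif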

static const struct
{
	int	error;
	u8	scope;
} fib_props[RTN_MAX + 1] = {
	[RTN_UNSPEC] = {
		.error	= 0,
		.scope	= RT_SCOPE_NOWHERE,
	},
	[RTN_UNICAST] = {
		.error	= 0,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_LOCAL] = {
		.error	= 0,
		.scope	= RT_SCOPE_HOST,
	},
	[RTN_BROADCAST] = {
		.error	= 0,
		.scope	= RT_SCOPE_LINK,
	},
	[RTN_ANYCAST] = {
		.error	= 0,
		.scope	= RT_SCOPE_LINK,
	},
	[RTN_MULTICAST] = {
		.error	= 0,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_BLACKHOLE] = {
		.error	= -EINVAL,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_UNREACHABLE] = {
		.error	= -EHOSTUNREACH,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_PROHIBIT] = {
		.error	= -EACCES,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_THROW] = {
		.error	= -EAGAIN,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_NAT] = {
		.error	= -EINVAL,
		.scope	= RT_SCOPE_NOWHERE,
	},
	[RTN_XRESOLVE] = {
		.error	= -EINVAL,
		.scope	= RT_SCOPE_NOWHERE,
	},
};
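
/*
 * Lookup sketch (hypothetical helper): the route type is the index into
 * fib_props[], and a non-zero .error marks types that reject traffic
 * instead of forwarding it.
 */
#if 0
static int example_type_error(u8 fa_type)
{
	if (fa_type > RTN_MAX)
		return -EINVAL;
	return fib_props[fa_type].error;	/* e.g. RTN_PROHIBIT -> -EACCES */
}
#endif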

/* Release a nexthop info record. */

void free_fib_info(struct fib_info *fi)
{
	if (fi->fib_dead == 0) {
		printk(KERN_WARNING "Freeing alive fib_info %p\n", fi);
		return;
	}
	change_nexthops(fi) {
		if (nexthop_nh->nh_dev)
			dev_put(nexthop_nh->nh_dev);
		nexthop_nh->nh_dev = NULL;
	} endfor_nexthops(fi);
	fib_info_cnt--;
	release_net(fi->fib_net);
	kfree(fi);
}

void fib_release_info(struct fib_info *fi)
{
	spin_lock_bh(&fib_info_lock);
	if (fi && --fi->fib_treeref == 0) {
		hlist_del(&fi->fib_hash);
		if (fi->fib_prefsrc)
			hlist_del(&fi->fib_lhash);
		change_nexthops(fi) {
			if (!nexthop_nh->nh_dev)
				continue;
			hlist_del(&nexthop_nh->nh_hash);
		} endfor_nexthops(fi)
		fi->fib_dead = 1;
		fib_info_put(fi);
	}
	spin_unlock_bh(&fib_info_lock);
}

/* Compare the nexthop lists of two fib_infos; returns 0 when they are
 * equivalent (the RTNH_F_DEAD flag is ignored).
 */
static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
{
	const struct fib_nh *onh = ofi->fib_nh;

	for_nexthops(fi) {
		if (nh->nh_oif != onh->nh_oif ||
		    nh->nh_gw  != onh->nh_gw ||
		    nh->nh_scope != onh->nh_scope ||
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		    nh->nh_weight != onh->nh_weight ||
#endif
#ifdef CONFIG_NET_CLS_ROUTE
		    nh->nh_tclassid != onh->nh_tclassid ||
#endif
		    ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD))
			return -1;
		onh++;
	} endfor_nexthops(fi);
	return 0;
}

static inline unsigned int fib_devindex_hashfn(unsigned int val)
{
	unsigned int mask = DEVINDEX_HASHSIZE - 1;

	return (val ^
		(val >> DEVINDEX_HASHBITS) ^
		(val >> (DEVINDEX_HASHBITS * 2))) & mask;
}

static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
{
	unsigned int mask = (fib_hash_size - 1);
	unsigned int val = fi->fib_nhs;

	val ^= fi->fib_protocol;
	val ^= (__force u32)fi->fib_prefsrc;
	val ^= fi->fib_priority;
	for_nexthops(fi) {
		val ^= fib_devindex_hashfn(nh->nh_oif);
	} endfor_nexthops(fi)

	return (val ^ (val >> 7) ^ (val >> 12)) & mask;
}
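
/*
 * Note (informal): both hash functions fold the high bits of the key into
 * the low bits and then mask, which only works because the table sizes
 * are powers of two.  E.g. with fib_hash_size == 16, a bucket index is
 * just the low four bits of the folded value:
 *
 *	(val ^ (val >> 7) ^ (val >> 12)) & (16 - 1)
 */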

/* Look up an existing fib_info that is equivalent to @nfi. */
static struct fib_info *fib_find_info(const struct fib_info *nfi)
{
	struct hlist_head *head;
	struct hlist_node *node;
	struct fib_info *fi;
	unsigned int hash;

	hash = fib_info_hashfn(nfi);
	head = &fib_info_hash[hash];

	hlist_for_each_entry(fi, node, head, fib_hash) {
		if (!net_eq(fi->fib_net, nfi->fib_net))
			continue;
		if (fi->fib_nhs != nfi->fib_nhs)
			continue;
		if (nfi->fib_protocol == fi->fib_protocol &&
		    nfi->fib_prefsrc == fi->fib_prefsrc &&
		    nfi->fib_priority == fi->fib_priority &&
		    memcmp(nfi->fib_metrics, fi->fib_metrics,
			   sizeof(fi->fib_metrics)) == 0 &&
		    ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 &&
		    (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
			return fi;
	}

	return NULL;
}

/* Check that the gateway is already configured.
   Used only by the redirect-accept routine.
 */
int ip_fib_check_default(__be32 gw, struct net_device *dev)
{
	struct hlist_head *head;
	struct hlist_node *node;
	struct fib_nh *nh;
	unsigned int hash;

	spin_lock(&fib_info_lock);

	hash = fib_devindex_hashfn(dev->ifindex);
	head = &fib_info_devhash[hash];
	hlist_for_each_entry(nh, node, head, nh_hash) {
		if (nh->nh_dev == dev &&
		    nh->nh_gw == gw &&
		    !(nh->nh_flags & RTNH_F_DEAD)) {
			spin_unlock(&fib_info_lock);
			return 0;
		}
	}

	spin_unlock(&fib_info_lock);

	return -1;
}
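
/*
 * Caller sketch (illustrative, mirroring the ICMP redirect path): a
 * gateway learned from a redirect is accepted only if it is already a
 * configured, alive nexthop on that device:
 *
 *	if (ip_fib_check_default(new_gw, dev))
 *		goto reject_redirect;
 */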

static inline size_t fib_nlmsg_size(struct fib_info *fi)
{
	size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
			 + nla_total_size(4) /* RTA_TABLE */
			 + nla_total_size(4) /* RTA_DST */
			 + nla_total_size(4) /* RTA_PRIORITY */
			 + nla_total_size(4); /* RTA_PREFSRC */

	/* space for nested metrics */
	payload += nla_total_size((RTAX_MAX * nla_total_size(4)));

	if (fi->fib_nhs) {
		/* Also handles the special case fib_nhs == 1 */

		/* each nexthop is packed in an attribute */
		size_t nhsize = nla_total_size(sizeof(struct rtnexthop));

		/* may contain flow and gateway attribute */
		nhsize += 2 * nla_total_size(4);

		/* all nexthops are packed in a nested attribute */
		payload += nla_total_size(fi->fib_nhs * nhsize);
	}

	return payload;
}
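
/*
 * Worked example (hypothetical numbers): for a route with two nexthops,
 * the nested RTA_MULTIPATH contribution is
 *
 *	nla_total_size(2 * (nla_total_size(sizeof(struct rtnexthop))
 *			    + 2 * nla_total_size(4)))
 *
 * This deliberately over-estimates (every nexthop is charged for both a
 * gateway and a flow attribute); nlmsg_new() only needs an upper bound.
 */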

void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
	       int dst_len, u32 tb_id, struct nl_info *info,
	       unsigned int nlm_flags)
{
	struct sk_buff *skb;
	u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
	int err = -ENOBUFS;

	skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL);
	if (skb == NULL)
		goto errout;

	err = fib_dump_info(skb, info->pid, seq, event, tb_id,
			    fa->fa_type, fa->fa_scope, key, dst_len,
			    fa->fa_tos, fa->fa_info, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in fib_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE,
		    info->nlh, GFP_KERNEL);
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
}

/* Return the first fib alias matching TOS with
 * priority less than or equal to PRIO.
 */
struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
{
	if (fah) {
		struct fib_alias *fa;
		list_for_each_entry(fa, fah, fa_list) {
			if (fa->fa_tos > tos)
				continue;
			if (fa->fa_info->fib_priority >= prio ||
			    fa->fa_tos < tos)
				return fa;
		}
	}
	return NULL;
}
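
/*
 * Worked example (hypothetical list): alias lists are kept sorted by
 * falling TOS and then rising priority.  Given entries
 * (tos 0x10, prio 5), (tos 0, prio 1), (tos 0, prio 10), a call with
 * tos == 0 and prio == 4 skips the first entry (TOS too large) and the
 * second (priority below 4), and returns the (tos 0, prio 10) entry,
 * which is the insertion point for a new alias of priority 4.
 */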

int fib_detect_death(struct fib_info *fi, int order,
		     struct fib_info **last_resort, int *last_idx, int dflt)
{
	struct neighbour *n;
	int state = NUD_NONE;

	n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
	if (n) {
		state = n->nud_state;
		neigh_release(n);
	}
	if (state == NUD_REACHABLE)
		return 0;
	if ((state & NUD_VALID) && order != dflt)
		return 0;
	if ((state & NUD_VALID) ||
	    (*last_idx < 0 && order > dflt)) {
		*last_resort = fi;
		*last_idx = order;
	}
	return 1;
}

#ifdef CONFIG_IP_ROUTE_MULTIPATH

static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
{
	int nhs = 0;

	while (rtnh_ok(rtnh, remaining)) {
		nhs++;
		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* leftover implies invalid nexthop configuration, discard it */
	return remaining > 0 ? 0 : nhs;
}
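
/*
 * Wire-format sketch (illustrative): RTA_MULTIPATH carries a sequence of
 * struct rtnexthop headers, each optionally followed by its own nested
 * attributes (RTA_GATEWAY, and RTA_FLOW with CONFIG_NET_CLS_ROUTE):
 *
 *	+-------------+----------------+-------------+----------------+
 *	| rtnexthop 0 | RTA_GATEWAY .. | rtnexthop 1 | RTA_GATEWAY .. |
 *	+-------------+----------------+-------------+----------------+
 *
 * rtnh_next() advances by rtnh->rtnh_len (header plus attributes), and
 * rtnh_ok() checks that a complete header still fits in the remainder.
 */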

static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
		       int remaining, struct fib_config *cfg)
{
	change_nexthops(fi) {
		int attrlen;

		if (!rtnh_ok(rtnh, remaining))
			return -EINVAL;

		nexthop_nh->nh_flags =
			(cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
		nexthop_nh->nh_oif = rtnh->rtnh_ifindex;
		nexthop_nh->nh_weight = rtnh->rtnh_hops + 1;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			nexthop_nh->nh_gw = nla ? nla_get_be32(nla) : 0;
#ifdef CONFIG_NET_CLS_ROUTE
			nla = nla_find(attrs, attrlen, RTA_FLOW);
			nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
#endif
		}

		rtnh = rtnh_next(rtnh, &remaining);
	} endfor_nexthops(fi);

	return 0;
}

#endif

int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	struct rtnexthop *rtnh;
	int remaining;
#endif

	if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
		return 1;

	if (cfg->fc_oif || cfg->fc_gw) {
		if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
		    (!cfg->fc_gw  || cfg->fc_gw == fi->fib_nh->nh_gw))
			return 0;
		return 1;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (cfg->fc_mp == NULL)
		return 0;

	rtnh = cfg->fc_mp;
	remaining = cfg->fc_mp_len;

	for_nexthops(fi) {
		int attrlen;

		if (!rtnh_ok(rtnh, remaining))
			return -EINVAL;

		if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->nh_oif)
			return 1;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla && nla_get_be32(nla) != nh->nh_gw)
				return 1;
#ifdef CONFIG_NET_CLS_ROUTE
			nla = nla_find(attrs, attrlen, RTA_FLOW);
			if (nla && nla_get_u32(nla) != nh->nh_tclassid)
				return 1;
#endif
		}

		rtnh = rtnh_next(rtnh, &remaining);
	} endfor_nexthops(fi);
#endif
	return 0;
}


/*
   Picture
   -------

   The semantics of nexthops are very messy for historical reasons.
   We have to take into account that:
   a) a gateway can actually be a local interface address,
      so that a "gatewayed" route is in fact direct.
   b) a gateway must be an on-link address, possibly
      described not by an ifaddr but by a direct route.
   c) if both a gateway and an interface are specified, they must not
      contradict each other.
   d) with tunnel routes, the gateway may not be on-link.

   Attempting to reconcile all of these (alas, self-contradictory)
   conditions results in pretty ugly and hairy code with obscure logic.

   I chose to generalize it instead, so that the amount of code barely
   grows, but the result becomes much more general.
   Every prefix is assigned a "scope" value: "host" is a local address,
   "link" is a direct route,
   [ ... "site" ... "interior" ... ]
   and "universe" is a true gateway route with global meaning.

   Every prefix refers to a set of "nexthop"s (gw, oif), where each gw
   must have a narrower scope. This recursion stops when the gw has
   LOCAL scope or when the "nexthop" is declared ONLINK, which forces
   the gw to be on-link.

   The code is still hairy, but now it is apparently logically
   consistent and very flexible. E.g. as a by-product it allows
   independent exterior and interior routing processes to coexist
   in peace.

   Normally it looks like this:

   {universe prefix}  -> (gw, oif) [scope link]
			  |
			  |-> {link prefix} -> (gw, oif) [scope local]
						|
						|-> {local prefix} (terminal node)
 */
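
/*
   Concrete example (hypothetical addresses): a default route
   "default via 192.168.1.1 dev eth0" has universe scope; its gateway
   resolves through the link-scope prefix 192.168.1.0/24 on eth0, whose
   nexthop in turn resolves to the local address 192.168.1.10, a
   terminal host-scope node.
 */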

static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
			struct fib_nh *nh)
{
	int err;
	struct net *net;

	net = cfg->fc_nlinfo.nl_net;
	if (nh->nh_gw) {
		struct fib_result res;

		if (nh->nh_flags & RTNH_F_ONLINK) {
			struct net_device *dev;

			if (cfg->fc_scope >= RT_SCOPE_LINK)
				return -EINVAL;
			if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST)
				return -EINVAL;
			if ((dev = __dev_get_by_index(net, nh->nh_oif)) == NULL)
				return -ENODEV;
			if (!(dev->flags & IFF_UP))
				return -ENETDOWN;
			nh->nh_dev = dev;
			dev_hold(dev);
			nh->nh_scope = RT_SCOPE_LINK;
			return 0;
		}
		{
			struct flowi fl = {
				.nl_u = {
					.ip4_u = {
						.daddr = nh->nh_gw,
						.scope = cfg->fc_scope + 1,
					},
				},
				.oif = nh->nh_oif,
			};

			/* It is not necessary, but requires a bit of thinking */
			if (fl.fl4_scope < RT_SCOPE_LINK)
				fl.fl4_scope = RT_SCOPE_LINK;
			if ((err = fib_lookup(net, &fl, &res)) != 0)
				return err;
		}
		err = -EINVAL;
		if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
			goto out;
		nh->nh_scope = res.scope;
		nh->nh_oif = FIB_RES_OIF(res);
		if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
			goto out;
		dev_hold(nh->nh_dev);
		err = -ENETDOWN;
		if (!(nh->nh_dev->flags & IFF_UP))
			goto out;
		err = 0;
out:
		fib_res_put(&res);
		return err;
	} else {
		struct in_device *in_dev;

		if (nh->nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK))
			return -EINVAL;

		in_dev = inetdev_by_index(net, nh->nh_oif);
		if (in_dev == NULL)
			return -ENODEV;
		if (!(in_dev->dev->flags & IFF_UP)) {
			in_dev_put(in_dev);
			return -ENETDOWN;
		}
		nh->nh_dev = in_dev->dev;
		dev_hold(nh->nh_dev);
		nh->nh_scope = RT_SCOPE_HOST;
		in_dev_put(in_dev);
	}
	return 0;
}

static inline unsigned int fib_laddr_hashfn(__be32 val)
{
	unsigned int mask = (fib_hash_size - 1);

	return ((__force u32)val ^
		((__force u32)val >> 7) ^
		((__force u32)val >> 14)) & mask;
}

static struct hlist_head *fib_hash_alloc(int bytes)
{
	if (bytes <= PAGE_SIZE)
		return kzalloc(bytes, GFP_KERNEL);
	else
		return (struct hlist_head *)
			__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(bytes));
}

static void fib_hash_free(struct hlist_head *hash, int bytes)
{
	if (!hash)
		return;

	if (bytes <= PAGE_SIZE)
		kfree(hash);
	else
		free_pages((unsigned long) hash, get_order(bytes));
}

static void fib_hash_move(struct hlist_head *new_info_hash,
			  struct hlist_head *new_laddrhash,
			  unsigned int new_size)
{
	struct hlist_head *old_info_hash, *old_laddrhash;
	unsigned int old_size = fib_hash_size;
	unsigned int i, bytes;

	spin_lock_bh(&fib_info_lock);
	old_info_hash = fib_info_hash;
	old_laddrhash = fib_info_laddrhash;
	fib_hash_size = new_size;

	for (i = 0; i < old_size; i++) {
		struct hlist_head *head = &fib_info_hash[i];
		struct hlist_node *node, *n;
		struct fib_info *fi;

		hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
			struct hlist_head *dest;
			unsigned int new_hash;

			hlist_del(&fi->fib_hash);

			new_hash = fib_info_hashfn(fi);
			dest = &new_info_hash[new_hash];
			hlist_add_head(&fi->fib_hash, dest);
		}
	}
	fib_info_hash = new_info_hash;

	for (i = 0; i < old_size; i++) {
		struct hlist_head *lhead = &fib_info_laddrhash[i];
		struct hlist_node *node, *n;
		struct fib_info *fi;

		hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
			struct hlist_head *ldest;
			unsigned int new_hash;

			hlist_del(&fi->fib_lhash);

			new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
			ldest = &new_laddrhash[new_hash];
			hlist_add_head(&fi->fib_lhash, ldest);
		}
	}
	fib_info_laddrhash = new_laddrhash;

	spin_unlock_bh(&fib_info_lock);

	bytes = old_size * sizeof(struct hlist_head *);
	fib_hash_free(old_info_hash, bytes);
	fib_hash_free(old_laddrhash, bytes);
}

struct fib_info *fib_create_info(struct fib_config *cfg)
{
	int err;
	struct fib_info *fi = NULL;
	struct fib_info *ofi;
	int nhs = 1;
	struct net *net = cfg->fc_nlinfo.nl_net;

	/* Fast check to catch the weirdest cases */
	if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
		goto err_inval;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (cfg->fc_mp) {
		nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len);
		if (nhs == 0)
			goto err_inval;
	}
#endif

	err = -ENOBUFS;
	if (fib_info_cnt >= fib_hash_size) {
		unsigned int new_size = fib_hash_size << 1;
		struct hlist_head *new_info_hash;
		struct hlist_head *new_laddrhash;
		unsigned int bytes;

		if (!new_size)
			new_size = 1;
		bytes = new_size * sizeof(struct hlist_head *);
		new_info_hash = fib_hash_alloc(bytes);
		new_laddrhash = fib_hash_alloc(bytes);
		if (!new_info_hash || !new_laddrhash) {
			fib_hash_free(new_info_hash, bytes);
			fib_hash_free(new_laddrhash, bytes);
		} else
			fib_hash_move(new_info_hash, new_laddrhash, new_size);

		if (!fib_hash_size)
			goto failure;
	}

	fi = kzalloc(sizeof(*fi) + nhs * sizeof(struct fib_nh), GFP_KERNEL);
	if (fi == NULL)
		goto failure;
	fib_info_cnt++;

	fi->fib_net = hold_net(net);
	fi->fib_protocol = cfg->fc_protocol;
	fi->fib_flags = cfg->fc_flags;
	fi->fib_priority = cfg->fc_priority;
	fi->fib_prefsrc = cfg->fc_prefsrc;

	fi->fib_nhs = nhs;
	change_nexthops(fi) {
		nexthop_nh->nh_parent = fi;
	} endfor_nexthops(fi)

	if (cfg->fc_mx) {
		struct nlattr *nla;
		int remaining;

		nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
			int type = nla_type(nla);

			if (type) {
				if (type > RTAX_MAX)
					goto err_inval;
				fi->fib_metrics[type - 1] = nla_get_u32(nla);
			}
		}
	}

	if (cfg->fc_mp) {
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
		if (err != 0)
			goto failure;
		if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif)
			goto err_inval;
		if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
			goto err_inval;
#ifdef CONFIG_NET_CLS_ROUTE
		if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
			goto err_inval;
#endif
#else
		goto err_inval;
#endif
	} else {
		struct fib_nh *nh = fi->fib_nh;

		nh->nh_oif = cfg->fc_oif;
		nh->nh_gw = cfg->fc_gw;
		nh->nh_flags = cfg->fc_flags;
#ifdef CONFIG_NET_CLS_ROUTE
		nh->nh_tclassid = cfg->fc_flow;
#endif
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		nh->nh_weight = 1;
#endif
	}

	if (fib_props[cfg->fc_type].error) {
		if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
			goto err_inval;
		goto link_it;
	}

	if (cfg->fc_scope > RT_SCOPE_HOST)
		goto err_inval;

	if (cfg->fc_scope == RT_SCOPE_HOST) {
		struct fib_nh *nh = fi->fib_nh;

		/* Local address is added. */
		if (nhs != 1 || nh->nh_gw)
			goto err_inval;
		nh->nh_scope = RT_SCOPE_NOWHERE;
		nh->nh_dev = dev_get_by_index(net, fi->fib_nh->nh_oif);
		err = -ENODEV;
		if (nh->nh_dev == NULL)
			goto failure;
	} else {
		change_nexthops(fi) {
			if ((err = fib_check_nh(cfg, fi, nexthop_nh)) != 0)
				goto failure;
		} endfor_nexthops(fi)
	}

	if (fi->fib_prefsrc) {
		if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
		    fi->fib_prefsrc != cfg->fc_dst)
			if (inet_addr_type(net, fi->fib_prefsrc) != RTN_LOCAL)
				goto err_inval;
	}

link_it:
	if ((ofi = fib_find_info(fi)) != NULL) {
		fi->fib_dead = 1;
		free_fib_info(fi);
		ofi->fib_treeref++;
		return ofi;
	}

	fi->fib_treeref++;
	atomic_inc(&fi->fib_clntref);
	spin_lock_bh(&fib_info_lock);
	hlist_add_head(&fi->fib_hash,
		       &fib_info_hash[fib_info_hashfn(fi)]);
	if (fi->fib_prefsrc) {
		struct hlist_head *head;

		head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
		hlist_add_head(&fi->fib_lhash, head);
	}
	change_nexthops(fi) {
		struct hlist_head *head;
		unsigned int hash;

		if (!nexthop_nh->nh_dev)
			continue;
		hash = fib_devindex_hashfn(nexthop_nh->nh_dev->ifindex);
		head = &fib_info_devhash[hash];
		hlist_add_head(&nexthop_nh->nh_hash, head);
	} endfor_nexthops(fi)
	spin_unlock_bh(&fib_info_lock);
	return fi;

err_inval:
	err = -EINVAL;

failure:
	if (fi) {
		fi->fib_dead = 1;
		free_fib_info(fi);
	}

	return ERR_PTR(err);
}
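
/*
 * Construction sketch (hypothetical values): the rtnetlink handler fills
 * a struct fib_config from RTM_NEWROUTE attributes and passes it here.
 * A plain "ip route add 10.0.0.0/8 via 192.168.1.1" arrives roughly as:
 */
#if 0
static struct fib_info *example_create(struct net *net)
{
	struct fib_config cfg = {
		.fc_dst_len	= 8,
		.fc_dst		= htonl(0x0a000000),	/* 10.0.0.0 */
		.fc_gw		= htonl(0xc0a80101),	/* 192.168.1.1 */
		.fc_type	= RTN_UNICAST,
		.fc_scope	= RT_SCOPE_UNIVERSE,
		.fc_protocol	= RTPROT_BOOT,
	};

	cfg.fc_nlinfo.nl_net = net;
	return fib_create_info(&cfg);	/* ERR_PTR() on failure */
}
#endif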

/* Note! fib_semantic_match intentionally uses RCU list functions. */
int fib_semantic_match(struct list_head *head, const struct flowi *flp,
		       struct fib_result *res, int prefixlen)
{
	struct fib_alias *fa;
	int nh_sel = 0;

	list_for_each_entry_rcu(fa, head, fa_list) {
		int err;

		if (fa->fa_tos &&
		    fa->fa_tos != flp->fl4_tos)
			continue;

		if (fa->fa_scope < flp->fl4_scope)
			continue;

		fa->fa_state |= FA_S_ACCESSED;

		err = fib_props[fa->fa_type].error;
		if (err == 0) {
			struct fib_info *fi = fa->fa_info;

			if (fi->fib_flags & RTNH_F_DEAD)
				continue;

			switch (fa->fa_type) {
			case RTN_UNICAST:
			case RTN_LOCAL:
			case RTN_BROADCAST:
			case RTN_ANYCAST:
			case RTN_MULTICAST:
				for_nexthops(fi) {
					if (nh->nh_flags & RTNH_F_DEAD)
						continue;
					if (!flp->oif || flp->oif == nh->nh_oif)
						break;
				}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
				if (nhsel < fi->fib_nhs) {
					nh_sel = nhsel;
					goto out_fill_res;
				}
#else
				if (nhsel < 1)
					goto out_fill_res;
#endif
				endfor_nexthops(fi);
				continue;

			default:
				printk(KERN_WARNING "fib_semantic_match bad type %#x\n",
					fa->fa_type);
				return -EINVAL;
			}
		}
		return err;
	}
	return 1;

out_fill_res:
	res->prefixlen = prefixlen;
	res->nh_sel = nh_sel;
	res->type = fa->fa_type;
	res->scope = fa->fa_scope;
	res->fi = fa->fa_info;
	atomic_inc(&res->fi->fib_clntref);
	return 0;
}

/* Find an appropriate source address for this destination. */

__be32 __fib_res_prefsrc(struct fib_result *res)
{
	return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
}

int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
		  u32 tb_id, u8 type, u8 scope, __be32 dst, int dst_len, u8 tos,
		  struct fib_info *fi, unsigned int flags)
{
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET;
	rtm->rtm_dst_len = dst_len;
	rtm->rtm_src_len = 0;
	rtm->rtm_tos = tos;
	if (tb_id < 256)
		rtm->rtm_table = tb_id;
	else
		rtm->rtm_table = RT_TABLE_COMPAT;
	NLA_PUT_U32(skb, RTA_TABLE, tb_id);
	rtm->rtm_type = type;
	rtm->rtm_flags = fi->fib_flags;
	rtm->rtm_scope = scope;
	rtm->rtm_protocol = fi->fib_protocol;

	if (rtm->rtm_dst_len)
		NLA_PUT_BE32(skb, RTA_DST, dst);

	if (fi->fib_priority)
		NLA_PUT_U32(skb, RTA_PRIORITY, fi->fib_priority);

	if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
		goto nla_put_failure;

	if (fi->fib_prefsrc)
		NLA_PUT_BE32(skb, RTA_PREFSRC, fi->fib_prefsrc);

	if (fi->fib_nhs == 1) {
		if (fi->fib_nh->nh_gw)
			NLA_PUT_BE32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw);

		if (fi->fib_nh->nh_oif)
			NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif);
#ifdef CONFIG_NET_CLS_ROUTE
		if (fi->fib_nh[0].nh_tclassid)
			NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid);
#endif
	}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (fi->fib_nhs > 1) {
		struct rtnexthop *rtnh;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (mp == NULL)
			goto nla_put_failure;

		for_nexthops(fi) {
			rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
			if (rtnh == NULL)
				goto nla_put_failure;

			rtnh->rtnh_flags = nh->nh_flags & 0xFF;
			rtnh->rtnh_hops = nh->nh_weight - 1;
			rtnh->rtnh_ifindex = nh->nh_oif;

			if (nh->nh_gw)
				NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw);
#ifdef CONFIG_NET_CLS_ROUTE
			if (nh->nh_tclassid)
				NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid);
#endif
			/* length of rtnetlink header + attributes */
			rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
		} endfor_nexthops(fi);

		nla_nest_end(skb, mp);
	}
#endif
	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
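
/*
 * Resulting message sketch (illustrative) for a two-nexthop route:
 *
 *	nlmsghdr | rtmsg | RTA_TABLE | RTA_DST | RTA_PRIORITY |
 *	RTA_MULTIPATH { rtnexthop 0 + RTA_GATEWAY, rtnexthop 1 + RTA_GATEWAY }
 *
 * Each rtnh_len is patched after its attributes have been appended,
 * using the current write position from nlmsg_get_pos().
 */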

/*
   Update the FIB if:
   - a local address disappeared -> we must delete all the entries
     referring to it.
   - a device went down -> we must shut down all nexthops going via it.
 */
int fib_sync_down_addr(struct net *net, __be32 local)
{
	int ret = 0;
	unsigned int hash;
	struct hlist_head *head;
	struct hlist_node *node;
	struct fib_info *fi;

	if (fib_info_laddrhash == NULL || local == 0)
		return 0;

	hash = fib_laddr_hashfn(local);
	head = &fib_info_laddrhash[hash];
	hlist_for_each_entry(fi, node, head, fib_lhash) {
		if (!net_eq(fi->fib_net, net))
			continue;
		if (fi->fib_prefsrc == local) {
			fi->fib_flags |= RTNH_F_DEAD;
			ret++;
		}
	}
	return ret;
}

int fib_sync_down_dev(struct net_device *dev, int force)
{
	int ret = 0;
	int scope = RT_SCOPE_NOWHERE;
	struct fib_info *prev_fi = NULL;
	unsigned int hash = fib_devindex_hashfn(dev->ifindex);
	struct hlist_head *head = &fib_info_devhash[hash];
	struct hlist_node *node;
	struct fib_nh *nh;

	if (force)
		scope = -1;

	hlist_for_each_entry(nh, node, head, nh_hash) {
		struct fib_info *fi = nh->nh_parent;
		int dead;

		BUG_ON(!fi->fib_nhs);
		if (nh->nh_dev != dev || fi == prev_fi)
			continue;
		prev_fi = fi;
		dead = 0;
		change_nexthops(fi) {
			if (nexthop_nh->nh_flags & RTNH_F_DEAD)
				dead++;
			else if (nexthop_nh->nh_dev == dev &&
				 nexthop_nh->nh_scope != scope) {
				nexthop_nh->nh_flags |= RTNH_F_DEAD;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
				spin_lock_bh(&fib_multipath_lock);
				fi->fib_power -= nexthop_nh->nh_power;
				nexthop_nh->nh_power = 0;
				spin_unlock_bh(&fib_multipath_lock);
#endif
				dead++;
			}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
			if (force > 1 && nexthop_nh->nh_dev == dev) {
				dead = fi->fib_nhs;
				break;
			}
#endif
		} endfor_nexthops(fi)
		if (dead == fi->fib_nhs) {
			fi->fib_flags |= RTNH_F_DEAD;
			ret++;
		}
	}

	return ret;
}
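
/*
 * Behaviour of the force argument (as implemented above): with
 * force == 0, a nexthop on the device survives only if its scope is
 * RT_SCOPE_NOWHERE; any non-zero force kills nexthops of every scope;
 * and force > 1 (device unregistering) marks the whole fib_info dead as
 * soon as one of its nexthops uses the device.
 */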

#ifdef CONFIG_IP_ROUTE_MULTIPATH

/*
   A dead device goes up. We wake up dead nexthops.
   It makes sense only on multipath routes.
 */
int fib_sync_up(struct net_device *dev)
{
	struct fib_info *prev_fi;
	unsigned int hash;
	struct hlist_head *head;
	struct hlist_node *node;
	struct fib_nh *nh;
	int ret;

	if (!(dev->flags & IFF_UP))
		return 0;

	prev_fi = NULL;
	hash = fib_devindex_hashfn(dev->ifindex);
	head = &fib_info_devhash[hash];
	ret = 0;

	hlist_for_each_entry(nh, node, head, nh_hash) {
		struct fib_info *fi = nh->nh_parent;
		int alive;

		BUG_ON(!fi->fib_nhs);
		if (nh->nh_dev != dev || fi == prev_fi)
			continue;

		prev_fi = fi;
		alive = 0;
		change_nexthops(fi) {
			if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
				alive++;
				continue;
			}
			if (nexthop_nh->nh_dev == NULL ||
			    !(nexthop_nh->nh_dev->flags & IFF_UP))
				continue;
			if (nexthop_nh->nh_dev != dev ||
			    !__in_dev_get_rtnl(dev))
				continue;
			alive++;
			spin_lock_bh(&fib_multipath_lock);
			nexthop_nh->nh_power = 0;
			nexthop_nh->nh_flags &= ~RTNH_F_DEAD;
			spin_unlock_bh(&fib_multipath_lock);
		} endfor_nexthops(fi)

		if (alive > 0) {
			fi->fib_flags &= ~RTNH_F_DEAD;
			ret++;
		}
	}

	return ret;
}

/*
   The algorithm is suboptimal, but it provides really
   fair weighted route distribution.
 */
void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
{
	struct fib_info *fi = res->fi;
	int w;

	spin_lock_bh(&fib_multipath_lock);
	if (fi->fib_power <= 0) {
		int power = 0;
		change_nexthops(fi) {
			if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
				power += nexthop_nh->nh_weight;
				nexthop_nh->nh_power = nexthop_nh->nh_weight;
			}
		} endfor_nexthops(fi);
		fi->fib_power = power;
		if (power <= 0) {
			spin_unlock_bh(&fib_multipath_lock);
			/* Race condition: route has just become dead. */
			res->nh_sel = 0;
			return;
		}
	}

	/* w should be a random number in [0..fi->fib_power-1];
	   jiffies is a pretty bad approximation.
	 */
	w = jiffies % fi->fib_power;

	change_nexthops(fi) {
		if (!(nexthop_nh->nh_flags & RTNH_F_DEAD) &&
		    nexthop_nh->nh_power) {
			if ((w -= nexthop_nh->nh_power) <= 0) {
				nexthop_nh->nh_power--;
				fi->fib_power--;
				res->nh_sel = nhsel;
				spin_unlock_bh(&fib_multipath_lock);
				return;
			}
		}
	} endfor_nexthops(fi);

	/* Race condition: route has just become dead. */
	res->nh_sel = 0;
	spin_unlock_bh(&fib_multipath_lock);
}
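
/*
 * Worked example (hypothetical weights): two alive nexthops with
 * nh_weight 3 and 1 start a round with fib_power == 4 and nh_power
 * {3, 1}.  Each selection decrements the chosen nexthop's nh_power and
 * fib_power, and a nexthop with nh_power == 0 is skipped, so over the
 * four selections of one round the first nexthop is chosen exactly
 * three times and the second exactly once, regardless of the order in
 * which the pseudo-random w values arrive.
 */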
#endif