xref: /linux/net/ipv4/fib_semantics.c (revision 606d099cdd1080bbb50ea50dc52d98252f8f10a1)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		IPv4 Forwarding Information Base: semantics.
7  *
8  * Version:	$Id: fib_semantics.c,v 1.19 2002/01/12 07:54:56 davem Exp $
9  *
10  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
11  *
12  *		This program is free software; you can redistribute it and/or
13  *		modify it under the terms of the GNU General Public License
14  *		as published by the Free Software Foundation; either version
15  *		2 of the License, or (at your option) any later version.
16  */
17 
18 #include <asm/uaccess.h>
19 #include <asm/system.h>
20 #include <linux/bitops.h>
21 #include <linux/types.h>
22 #include <linux/kernel.h>
23 #include <linux/jiffies.h>
24 #include <linux/mm.h>
25 #include <linux/string.h>
26 #include <linux/socket.h>
27 #include <linux/sockios.h>
28 #include <linux/errno.h>
29 #include <linux/in.h>
30 #include <linux/inet.h>
31 #include <linux/inetdevice.h>
32 #include <linux/netdevice.h>
33 #include <linux/if_arp.h>
34 #include <linux/proc_fs.h>
35 #include <linux/skbuff.h>
36 #include <linux/init.h>
37 
38 #include <net/arp.h>
39 #include <net/ip.h>
40 #include <net/protocol.h>
41 #include <net/route.h>
42 #include <net/tcp.h>
43 #include <net/sock.h>
44 #include <net/ip_fib.h>
45 #include <net/ip_mp_alg.h>
46 #include <net/netlink.h>
47 #include <net/nexthop.h>
48 
49 #include "fib_lookup.h"
50 
51 #define FSprintk(a...)
52 
53 static DEFINE_SPINLOCK(fib_info_lock);
54 static struct hlist_head *fib_info_hash;
55 static struct hlist_head *fib_info_laddrhash;
56 static unsigned int fib_hash_size;
57 static unsigned int fib_info_cnt;
58 
59 #define DEVINDEX_HASHBITS 8
60 #define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
61 static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
62 
/*
 * Nexthop iteration helpers.
 *
 * for_nexthops(fi) / change_nexthops(fi) open a block that declares
 * "nhsel" (the nexthop index) and "nh" (pointer to the current nexthop,
 * const for for_nexthops, writable for change_nexthops) and then start a
 * for-loop; the caller supplies the loop body and must close the block
 * with endfor_nexthops(fi).
 */
#ifdef CONFIG_IP_ROUTE_MULTIPATH

/* Held in this file while fib_power / nh_power are updated. */
static DEFINE_SPINLOCK(fib_multipath_lock);

#define for_nexthops(fi) {						\
	int nhsel;							\
	const struct fib_nh *nh;					\
	for (nhsel = 0, nh = (fi)->fib_nh;				\
	     nhsel < (fi)->fib_nhs;					\
	     nh++, nhsel++)

#define change_nexthops(fi) {						\
	int nhsel;							\
	struct fib_nh *nh;						\
	for (nhsel = 0, nh = (struct fib_nh *)((fi)->fib_nh);		\
	     nhsel < (fi)->fib_nhs;					\
	     nh++, nhsel++)

#else /* CONFIG_IP_ROUTE_MULTIPATH */

/*
 * Without multipath only the first nexthop is visited; the loop runs
 * exactly once and is expected to be optimised away by the compiler.
 */

#define for_nexthops(fi) {						\
	int nhsel = 0;							\
	const struct fib_nh *nh = (fi)->fib_nh;				\
	for (nhsel = 0; nhsel < 1; nhsel++)

#define change_nexthops(fi) {						\
	int nhsel = 0;							\
	struct fib_nh *nh = (struct fib_nh *)((fi)->fib_nh);		\
	for (nhsel = 0; nhsel < 1; nhsel++)

#endif /* CONFIG_IP_ROUTE_MULTIPATH */

/* Closes the block opened by for_nexthops() / change_nexthops(). */
#define endfor_nexthops(fi) }
86 
87 
88 static const struct
89 {
90 	int	error;
91 	u8	scope;
92 } fib_props[RTA_MAX + 1] = {
93         {
94 		.error	= 0,
95 		.scope	= RT_SCOPE_NOWHERE,
96 	},	/* RTN_UNSPEC */
97 	{
98 		.error	= 0,
99 		.scope	= RT_SCOPE_UNIVERSE,
100 	},	/* RTN_UNICAST */
101 	{
102 		.error	= 0,
103 		.scope	= RT_SCOPE_HOST,
104 	},	/* RTN_LOCAL */
105 	{
106 		.error	= 0,
107 		.scope	= RT_SCOPE_LINK,
108 	},	/* RTN_BROADCAST */
109 	{
110 		.error	= 0,
111 		.scope	= RT_SCOPE_LINK,
112 	},	/* RTN_ANYCAST */
113 	{
114 		.error	= 0,
115 		.scope	= RT_SCOPE_UNIVERSE,
116 	},	/* RTN_MULTICAST */
117 	{
118 		.error	= -EINVAL,
119 		.scope	= RT_SCOPE_UNIVERSE,
120 	},	/* RTN_BLACKHOLE */
121 	{
122 		.error	= -EHOSTUNREACH,
123 		.scope	= RT_SCOPE_UNIVERSE,
124 	},	/* RTN_UNREACHABLE */
125 	{
126 		.error	= -EACCES,
127 		.scope	= RT_SCOPE_UNIVERSE,
128 	},	/* RTN_PROHIBIT */
129 	{
130 		.error	= -EAGAIN,
131 		.scope	= RT_SCOPE_UNIVERSE,
132 	},	/* RTN_THROW */
133 	{
134 		.error	= -EINVAL,
135 		.scope	= RT_SCOPE_NOWHERE,
136 	},	/* RTN_NAT */
137 	{
138 		.error	= -EINVAL,
139 		.scope	= RT_SCOPE_NOWHERE,
140 	},	/* RTN_XRESOLVE */
141 };
142 
143 
144 /* Release a nexthop info record */
145 
146 void free_fib_info(struct fib_info *fi)
147 {
148 	if (fi->fib_dead == 0) {
149 		printk("Freeing alive fib_info %p\n", fi);
150 		return;
151 	}
152 	change_nexthops(fi) {
153 		if (nh->nh_dev)
154 			dev_put(nh->nh_dev);
155 		nh->nh_dev = NULL;
156 	} endfor_nexthops(fi);
157 	fib_info_cnt--;
158 	kfree(fi);
159 }
160 
161 void fib_release_info(struct fib_info *fi)
162 {
163 	spin_lock_bh(&fib_info_lock);
164 	if (fi && --fi->fib_treeref == 0) {
165 		hlist_del(&fi->fib_hash);
166 		if (fi->fib_prefsrc)
167 			hlist_del(&fi->fib_lhash);
168 		change_nexthops(fi) {
169 			if (!nh->nh_dev)
170 				continue;
171 			hlist_del(&nh->nh_hash);
172 		} endfor_nexthops(fi)
173 		fi->fib_dead = 1;
174 		fib_info_put(fi);
175 	}
176 	spin_unlock_bh(&fib_info_lock);
177 }
178 
179 static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
180 {
181 	const struct fib_nh *onh = ofi->fib_nh;
182 
183 	for_nexthops(fi) {
184 		if (nh->nh_oif != onh->nh_oif ||
185 		    nh->nh_gw  != onh->nh_gw ||
186 		    nh->nh_scope != onh->nh_scope ||
187 #ifdef CONFIG_IP_ROUTE_MULTIPATH
188 		    nh->nh_weight != onh->nh_weight ||
189 #endif
190 #ifdef CONFIG_NET_CLS_ROUTE
191 		    nh->nh_tclassid != onh->nh_tclassid ||
192 #endif
193 		    ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
194 			return -1;
195 		onh++;
196 	} endfor_nexthops(fi);
197 	return 0;
198 }
199 
200 static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
201 {
202 	unsigned int mask = (fib_hash_size - 1);
203 	unsigned int val = fi->fib_nhs;
204 
205 	val ^= fi->fib_protocol;
206 	val ^= (__force u32)fi->fib_prefsrc;
207 	val ^= fi->fib_priority;
208 
209 	return (val ^ (val >> 7) ^ (val >> 12)) & mask;
210 }
211 
212 static struct fib_info *fib_find_info(const struct fib_info *nfi)
213 {
214 	struct hlist_head *head;
215 	struct hlist_node *node;
216 	struct fib_info *fi;
217 	unsigned int hash;
218 
219 	hash = fib_info_hashfn(nfi);
220 	head = &fib_info_hash[hash];
221 
222 	hlist_for_each_entry(fi, node, head, fib_hash) {
223 		if (fi->fib_nhs != nfi->fib_nhs)
224 			continue;
225 		if (nfi->fib_protocol == fi->fib_protocol &&
226 		    nfi->fib_prefsrc == fi->fib_prefsrc &&
227 		    nfi->fib_priority == fi->fib_priority &&
228 		    memcmp(nfi->fib_metrics, fi->fib_metrics,
229 			   sizeof(fi->fib_metrics)) == 0 &&
230 		    ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
231 		    (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
232 			return fi;
233 	}
234 
235 	return NULL;
236 }
237 
238 static inline unsigned int fib_devindex_hashfn(unsigned int val)
239 {
240 	unsigned int mask = DEVINDEX_HASHSIZE - 1;
241 
242 	return (val ^
243 		(val >> DEVINDEX_HASHBITS) ^
244 		(val >> (DEVINDEX_HASHBITS * 2))) & mask;
245 }
246 
247 /* Check, that the gateway is already configured.
248    Used only by redirect accept routine.
249  */
250 
251 int ip_fib_check_default(__be32 gw, struct net_device *dev)
252 {
253 	struct hlist_head *head;
254 	struct hlist_node *node;
255 	struct fib_nh *nh;
256 	unsigned int hash;
257 
258 	spin_lock(&fib_info_lock);
259 
260 	hash = fib_devindex_hashfn(dev->ifindex);
261 	head = &fib_info_devhash[hash];
262 	hlist_for_each_entry(nh, node, head, nh_hash) {
263 		if (nh->nh_dev == dev &&
264 		    nh->nh_gw == gw &&
265 		    !(nh->nh_flags&RTNH_F_DEAD)) {
266 			spin_unlock(&fib_info_lock);
267 			return 0;
268 		}
269 	}
270 
271 	spin_unlock(&fib_info_lock);
272 
273 	return -1;
274 }
275 
276 static inline size_t fib_nlmsg_size(struct fib_info *fi)
277 {
278 	size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
279 			 + nla_total_size(4) /* RTA_TABLE */
280 			 + nla_total_size(4) /* RTA_DST */
281 			 + nla_total_size(4) /* RTA_PRIORITY */
282 			 + nla_total_size(4); /* RTA_PREFSRC */
283 
284 	/* space for nested metrics */
285 	payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
286 
287 	if (fi->fib_nhs) {
288 		/* Also handles the special case fib_nhs == 1 */
289 
290 		/* each nexthop is packed in an attribute */
291 		size_t nhsize = nla_total_size(sizeof(struct rtnexthop));
292 
293 		/* may contain flow and gateway attribute */
294 		nhsize += 2 * nla_total_size(4);
295 
296 		/* all nexthops are packed in a nested attribute */
297 		payload += nla_total_size(fi->fib_nhs * nhsize);
298 	}
299 
300 	return payload;
301 }
302 
303 void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
304 	       int dst_len, u32 tb_id, struct nl_info *info)
305 {
306 	struct sk_buff *skb;
307 	u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
308 	int err = -ENOBUFS;
309 
310 	skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL);
311 	if (skb == NULL)
312 		goto errout;
313 
314 	err = fib_dump_info(skb, info->pid, seq, event, tb_id,
315 			    fa->fa_type, fa->fa_scope, key, dst_len,
316 			    fa->fa_tos, fa->fa_info, 0);
317 	/* failure implies BUG in fib_nlmsg_size() */
318 	BUG_ON(err < 0);
319 
320 	err = rtnl_notify(skb, info->pid, RTNLGRP_IPV4_ROUTE,
321 			  info->nlh, GFP_KERNEL);
322 errout:
323 	if (err < 0)
324 		rtnl_set_sk_err(RTNLGRP_IPV4_ROUTE, err);
325 }
326 
327 /* Return the first fib alias matching TOS with
328  * priority less than or equal to PRIO.
329  */
330 struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
331 {
332 	if (fah) {
333 		struct fib_alias *fa;
334 		list_for_each_entry(fa, fah, fa_list) {
335 			if (fa->fa_tos > tos)
336 				continue;
337 			if (fa->fa_info->fib_priority >= prio ||
338 			    fa->fa_tos < tos)
339 				return fa;
340 		}
341 	}
342 	return NULL;
343 }
344 
345 int fib_detect_death(struct fib_info *fi, int order,
346 		     struct fib_info **last_resort, int *last_idx, int *dflt)
347 {
348 	struct neighbour *n;
349 	int state = NUD_NONE;
350 
351 	n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
352 	if (n) {
353 		state = n->nud_state;
354 		neigh_release(n);
355 	}
356 	if (state==NUD_REACHABLE)
357 		return 0;
358 	if ((state&NUD_VALID) && order != *dflt)
359 		return 0;
360 	if ((state&NUD_VALID) ||
361 	    (*last_idx<0 && order > *dflt)) {
362 		*last_resort = fi;
363 		*last_idx = order;
364 	}
365 	return 1;
366 }
367 
368 #ifdef CONFIG_IP_ROUTE_MULTIPATH
369 
/*
 * Count the rtnexthop records in a buffer of @remaining bytes starting
 * at @rtnh.  Returns 0 if bytes are left over after the last well-formed
 * record, since that indicates an invalid nexthop configuration.
 */
static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
{
	int count;

	for (count = 0; rtnh_ok(rtnh, remaining); count++)
		rtnh = rtnh_next(rtnh, &remaining);

	if (remaining > 0)
		return 0;
	return count;
}
382 
383 static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
384 		       int remaining, struct fib_config *cfg)
385 {
386 	change_nexthops(fi) {
387 		int attrlen;
388 
389 		if (!rtnh_ok(rtnh, remaining))
390 			return -EINVAL;
391 
392 		nh->nh_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
393 		nh->nh_oif = rtnh->rtnh_ifindex;
394 		nh->nh_weight = rtnh->rtnh_hops + 1;
395 
396 		attrlen = rtnh_attrlen(rtnh);
397 		if (attrlen > 0) {
398 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
399 
400 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
401 			nh->nh_gw = nla ? nla_get_be32(nla) : 0;
402 #ifdef CONFIG_NET_CLS_ROUTE
403 			nla = nla_find(attrs, attrlen, RTA_FLOW);
404 			nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
405 #endif
406 		}
407 
408 		rtnh = rtnh_next(rtnh, &remaining);
409 	} endfor_nexthops(fi);
410 
411 	return 0;
412 }
413 
414 #endif
415 
416 int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
417 {
418 #ifdef CONFIG_IP_ROUTE_MULTIPATH
419 	struct rtnexthop *rtnh;
420 	int remaining;
421 #endif
422 
423 	if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
424 		return 1;
425 
426 	if (cfg->fc_oif || cfg->fc_gw) {
427 		if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
428 		    (!cfg->fc_gw  || cfg->fc_gw == fi->fib_nh->nh_gw))
429 			return 0;
430 		return 1;
431 	}
432 
433 #ifdef CONFIG_IP_ROUTE_MULTIPATH
434 	if (cfg->fc_mp == NULL)
435 		return 0;
436 
437 	rtnh = cfg->fc_mp;
438 	remaining = cfg->fc_mp_len;
439 
440 	for_nexthops(fi) {
441 		int attrlen;
442 
443 		if (!rtnh_ok(rtnh, remaining))
444 			return -EINVAL;
445 
446 		if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->nh_oif)
447 			return 1;
448 
449 		attrlen = rtnh_attrlen(rtnh);
450 		if (attrlen < 0) {
451 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
452 
453 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
454 			if (nla && nla_get_be32(nla) != nh->nh_gw)
455 				return 1;
456 #ifdef CONFIG_NET_CLS_ROUTE
457 			nla = nla_find(attrs, attrlen, RTA_FLOW);
458 			if (nla && nla_get_u32(nla) != nh->nh_tclassid)
459 				return 1;
460 #endif
461 		}
462 
463 		rtnh = rtnh_next(rtnh, &remaining);
464 	} endfor_nexthops(fi);
465 #endif
466 	return 0;
467 }
468 
469 
470 /*
471    Picture
472    -------
473 
   Semantics of nexthop is very messy for historical reasons.
475    We have to take into account, that:
476    a) gateway can be actually local interface address,
477       so that gatewayed route is direct.
478    b) gateway must be on-link address, possibly
479       described not by an ifaddr, but also by a direct route.
480    c) If both gateway and interface are specified, they should not
481       contradict.
482    d) If we use tunnel routes, gateway could be not on-link.
483 
484    Attempt to reconcile all of these (alas, self-contradictory) conditions
485    results in pretty ugly and hairy code with obscure logic.
486 
   I chose to generalize it instead, so that the size
   of the code practically does not increase, but it becomes
   much more general.
490    Every prefix is assigned a "scope" value: "host" is local address,
491    "link" is direct route,
492    [ ... "site" ... "interior" ... ]
493    and "universe" is true gateway route with global meaning.
494 
495    Every prefix refers to a set of "nexthop"s (gw, oif),
496    where gw must have narrower scope. This recursion stops
497    when gw has LOCAL scope or if "nexthop" is declared ONLINK,
498    which means that gw is forced to be on link.
499 
500    Code is still hairy, but now it is apparently logically
501    consistent and very flexible. F.e. as by-product it allows
502    to co-exists in peace independent exterior and interior
503    routing processes.
504 
505    Normally it looks as following.
506 
507    {universe prefix}  -> (gw, oif) [scope link]
508                           |
509 			  |-> {link prefix} -> (gw, oif) [scope local]
510 			                        |
511 						|-> {local prefix} (terminal node)
512  */
513 
514 static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
515 			struct fib_nh *nh)
516 {
517 	int err;
518 
519 	if (nh->nh_gw) {
520 		struct fib_result res;
521 
522 #ifdef CONFIG_IP_ROUTE_PERVASIVE
523 		if (nh->nh_flags&RTNH_F_PERVASIVE)
524 			return 0;
525 #endif
526 		if (nh->nh_flags&RTNH_F_ONLINK) {
527 			struct net_device *dev;
528 
529 			if (cfg->fc_scope >= RT_SCOPE_LINK)
530 				return -EINVAL;
531 			if (inet_addr_type(nh->nh_gw) != RTN_UNICAST)
532 				return -EINVAL;
533 			if ((dev = __dev_get_by_index(nh->nh_oif)) == NULL)
534 				return -ENODEV;
535 			if (!(dev->flags&IFF_UP))
536 				return -ENETDOWN;
537 			nh->nh_dev = dev;
538 			dev_hold(dev);
539 			nh->nh_scope = RT_SCOPE_LINK;
540 			return 0;
541 		}
542 		{
543 			struct flowi fl = {
544 				.nl_u = {
545 					.ip4_u = {
546 						.daddr = nh->nh_gw,
547 						.scope = cfg->fc_scope + 1,
548 					},
549 				},
550 				.oif = nh->nh_oif,
551 			};
552 
553 			/* It is not necessary, but requires a bit of thinking */
554 			if (fl.fl4_scope < RT_SCOPE_LINK)
555 				fl.fl4_scope = RT_SCOPE_LINK;
556 			if ((err = fib_lookup(&fl, &res)) != 0)
557 				return err;
558 		}
559 		err = -EINVAL;
560 		if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
561 			goto out;
562 		nh->nh_scope = res.scope;
563 		nh->nh_oif = FIB_RES_OIF(res);
564 		if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
565 			goto out;
566 		dev_hold(nh->nh_dev);
567 		err = -ENETDOWN;
568 		if (!(nh->nh_dev->flags & IFF_UP))
569 			goto out;
570 		err = 0;
571 out:
572 		fib_res_put(&res);
573 		return err;
574 	} else {
575 		struct in_device *in_dev;
576 
577 		if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
578 			return -EINVAL;
579 
580 		in_dev = inetdev_by_index(nh->nh_oif);
581 		if (in_dev == NULL)
582 			return -ENODEV;
583 		if (!(in_dev->dev->flags&IFF_UP)) {
584 			in_dev_put(in_dev);
585 			return -ENETDOWN;
586 		}
587 		nh->nh_dev = in_dev->dev;
588 		dev_hold(nh->nh_dev);
589 		nh->nh_scope = RT_SCOPE_HOST;
590 		in_dev_put(in_dev);
591 	}
592 	return 0;
593 }
594 
595 static inline unsigned int fib_laddr_hashfn(__be32 val)
596 {
597 	unsigned int mask = (fib_hash_size - 1);
598 
599 	return ((__force u32)val ^ ((__force u32)val >> 7) ^ ((__force u32)val >> 14)) & mask;
600 }
601 
602 static struct hlist_head *fib_hash_alloc(int bytes)
603 {
604 	if (bytes <= PAGE_SIZE)
605 		return kmalloc(bytes, GFP_KERNEL);
606 	else
607 		return (struct hlist_head *)
608 			__get_free_pages(GFP_KERNEL, get_order(bytes));
609 }
610 
611 static void fib_hash_free(struct hlist_head *hash, int bytes)
612 {
613 	if (!hash)
614 		return;
615 
616 	if (bytes <= PAGE_SIZE)
617 		kfree(hash);
618 	else
619 		free_pages((unsigned long) hash, get_order(bytes));
620 }
621 
622 static void fib_hash_move(struct hlist_head *new_info_hash,
623 			  struct hlist_head *new_laddrhash,
624 			  unsigned int new_size)
625 {
626 	struct hlist_head *old_info_hash, *old_laddrhash;
627 	unsigned int old_size = fib_hash_size;
628 	unsigned int i, bytes;
629 
630 	spin_lock_bh(&fib_info_lock);
631 	old_info_hash = fib_info_hash;
632 	old_laddrhash = fib_info_laddrhash;
633 	fib_hash_size = new_size;
634 
635 	for (i = 0; i < old_size; i++) {
636 		struct hlist_head *head = &fib_info_hash[i];
637 		struct hlist_node *node, *n;
638 		struct fib_info *fi;
639 
640 		hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
641 			struct hlist_head *dest;
642 			unsigned int new_hash;
643 
644 			hlist_del(&fi->fib_hash);
645 
646 			new_hash = fib_info_hashfn(fi);
647 			dest = &new_info_hash[new_hash];
648 			hlist_add_head(&fi->fib_hash, dest);
649 		}
650 	}
651 	fib_info_hash = new_info_hash;
652 
653 	for (i = 0; i < old_size; i++) {
654 		struct hlist_head *lhead = &fib_info_laddrhash[i];
655 		struct hlist_node *node, *n;
656 		struct fib_info *fi;
657 
658 		hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
659 			struct hlist_head *ldest;
660 			unsigned int new_hash;
661 
662 			hlist_del(&fi->fib_lhash);
663 
664 			new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
665 			ldest = &new_laddrhash[new_hash];
666 			hlist_add_head(&fi->fib_lhash, ldest);
667 		}
668 	}
669 	fib_info_laddrhash = new_laddrhash;
670 
671 	spin_unlock_bh(&fib_info_lock);
672 
673 	bytes = old_size * sizeof(struct hlist_head *);
674 	fib_hash_free(old_info_hash, bytes);
675 	fib_hash_free(old_laddrhash, bytes);
676 }
677 
678 struct fib_info *fib_create_info(struct fib_config *cfg)
679 {
680 	int err;
681 	struct fib_info *fi = NULL;
682 	struct fib_info *ofi;
683 	int nhs = 1;
684 
685 	/* Fast check to catch the most weird cases */
686 	if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
687 		goto err_inval;
688 
689 #ifdef CONFIG_IP_ROUTE_MULTIPATH
690 	if (cfg->fc_mp) {
691 		nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len);
692 		if (nhs == 0)
693 			goto err_inval;
694 	}
695 #endif
696 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
697 	if (cfg->fc_mp_alg) {
698 		if (cfg->fc_mp_alg < IP_MP_ALG_NONE ||
699 		    cfg->fc_mp_alg > IP_MP_ALG_MAX)
700 			goto err_inval;
701 	}
702 #endif
703 
704 	err = -ENOBUFS;
705 	if (fib_info_cnt >= fib_hash_size) {
706 		unsigned int new_size = fib_hash_size << 1;
707 		struct hlist_head *new_info_hash;
708 		struct hlist_head *new_laddrhash;
709 		unsigned int bytes;
710 
711 		if (!new_size)
712 			new_size = 1;
713 		bytes = new_size * sizeof(struct hlist_head *);
714 		new_info_hash = fib_hash_alloc(bytes);
715 		new_laddrhash = fib_hash_alloc(bytes);
716 		if (!new_info_hash || !new_laddrhash) {
717 			fib_hash_free(new_info_hash, bytes);
718 			fib_hash_free(new_laddrhash, bytes);
719 		} else {
720 			memset(new_info_hash, 0, bytes);
721 			memset(new_laddrhash, 0, bytes);
722 
723 			fib_hash_move(new_info_hash, new_laddrhash, new_size);
724 		}
725 
726 		if (!fib_hash_size)
727 			goto failure;
728 	}
729 
730 	fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
731 	if (fi == NULL)
732 		goto failure;
733 	fib_info_cnt++;
734 
735 	fi->fib_protocol = cfg->fc_protocol;
736 	fi->fib_flags = cfg->fc_flags;
737 	fi->fib_priority = cfg->fc_priority;
738 	fi->fib_prefsrc = cfg->fc_prefsrc;
739 
740 	fi->fib_nhs = nhs;
741 	change_nexthops(fi) {
742 		nh->nh_parent = fi;
743 	} endfor_nexthops(fi)
744 
745 	if (cfg->fc_mx) {
746 		struct nlattr *nla;
747 		int remaining;
748 
749 		nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
750 			int type = nla->nla_type;
751 
752 			if (type) {
753 				if (type > RTAX_MAX)
754 					goto err_inval;
755 				fi->fib_metrics[type - 1] = nla_get_u32(nla);
756 			}
757 		}
758 	}
759 
760 	if (cfg->fc_mp) {
761 #ifdef CONFIG_IP_ROUTE_MULTIPATH
762 		err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
763 		if (err != 0)
764 			goto failure;
765 		if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif)
766 			goto err_inval;
767 		if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
768 			goto err_inval;
769 #ifdef CONFIG_NET_CLS_ROUTE
770 		if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
771 			goto err_inval;
772 #endif
773 #else
774 		goto err_inval;
775 #endif
776 	} else {
777 		struct fib_nh *nh = fi->fib_nh;
778 
779 		nh->nh_oif = cfg->fc_oif;
780 		nh->nh_gw = cfg->fc_gw;
781 		nh->nh_flags = cfg->fc_flags;
782 #ifdef CONFIG_NET_CLS_ROUTE
783 		nh->nh_tclassid = cfg->fc_flow;
784 #endif
785 #ifdef CONFIG_IP_ROUTE_MULTIPATH
786 		nh->nh_weight = 1;
787 #endif
788 	}
789 
790 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
791 	fi->fib_mp_alg = cfg->fc_mp_alg;
792 #endif
793 
794 	if (fib_props[cfg->fc_type].error) {
795 		if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
796 			goto err_inval;
797 		goto link_it;
798 	}
799 
800 	if (cfg->fc_scope > RT_SCOPE_HOST)
801 		goto err_inval;
802 
803 	if (cfg->fc_scope == RT_SCOPE_HOST) {
804 		struct fib_nh *nh = fi->fib_nh;
805 
806 		/* Local address is added. */
807 		if (nhs != 1 || nh->nh_gw)
808 			goto err_inval;
809 		nh->nh_scope = RT_SCOPE_NOWHERE;
810 		nh->nh_dev = dev_get_by_index(fi->fib_nh->nh_oif);
811 		err = -ENODEV;
812 		if (nh->nh_dev == NULL)
813 			goto failure;
814 	} else {
815 		change_nexthops(fi) {
816 			if ((err = fib_check_nh(cfg, fi, nh)) != 0)
817 				goto failure;
818 		} endfor_nexthops(fi)
819 	}
820 
821 	if (fi->fib_prefsrc) {
822 		if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
823 		    fi->fib_prefsrc != cfg->fc_dst)
824 			if (inet_addr_type(fi->fib_prefsrc) != RTN_LOCAL)
825 				goto err_inval;
826 	}
827 
828 link_it:
829 	if ((ofi = fib_find_info(fi)) != NULL) {
830 		fi->fib_dead = 1;
831 		free_fib_info(fi);
832 		ofi->fib_treeref++;
833 		return ofi;
834 	}
835 
836 	fi->fib_treeref++;
837 	atomic_inc(&fi->fib_clntref);
838 	spin_lock_bh(&fib_info_lock);
839 	hlist_add_head(&fi->fib_hash,
840 		       &fib_info_hash[fib_info_hashfn(fi)]);
841 	if (fi->fib_prefsrc) {
842 		struct hlist_head *head;
843 
844 		head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
845 		hlist_add_head(&fi->fib_lhash, head);
846 	}
847 	change_nexthops(fi) {
848 		struct hlist_head *head;
849 		unsigned int hash;
850 
851 		if (!nh->nh_dev)
852 			continue;
853 		hash = fib_devindex_hashfn(nh->nh_dev->ifindex);
854 		head = &fib_info_devhash[hash];
855 		hlist_add_head(&nh->nh_hash, head);
856 	} endfor_nexthops(fi)
857 	spin_unlock_bh(&fib_info_lock);
858 	return fi;
859 
860 err_inval:
861 	err = -EINVAL;
862 
863 failure:
864         if (fi) {
865 		fi->fib_dead = 1;
866 		free_fib_info(fi);
867 	}
868 
869 	return ERR_PTR(err);
870 }
871 
872 /* Note! fib_semantic_match intentionally uses  RCU list functions. */
873 int fib_semantic_match(struct list_head *head, const struct flowi *flp,
874 		       struct fib_result *res, __be32 zone, __be32 mask,
875 			int prefixlen)
876 {
877 	struct fib_alias *fa;
878 	int nh_sel = 0;
879 
880 	list_for_each_entry_rcu(fa, head, fa_list) {
881 		int err;
882 
883 		if (fa->fa_tos &&
884 		    fa->fa_tos != flp->fl4_tos)
885 			continue;
886 
887 		if (fa->fa_scope < flp->fl4_scope)
888 			continue;
889 
890 		fa->fa_state |= FA_S_ACCESSED;
891 
892 		err = fib_props[fa->fa_type].error;
893 		if (err == 0) {
894 			struct fib_info *fi = fa->fa_info;
895 
896 			if (fi->fib_flags & RTNH_F_DEAD)
897 				continue;
898 
899 			switch (fa->fa_type) {
900 			case RTN_UNICAST:
901 			case RTN_LOCAL:
902 			case RTN_BROADCAST:
903 			case RTN_ANYCAST:
904 			case RTN_MULTICAST:
905 				for_nexthops(fi) {
906 					if (nh->nh_flags&RTNH_F_DEAD)
907 						continue;
908 					if (!flp->oif || flp->oif == nh->nh_oif)
909 						break;
910 				}
911 #ifdef CONFIG_IP_ROUTE_MULTIPATH
912 				if (nhsel < fi->fib_nhs) {
913 					nh_sel = nhsel;
914 					goto out_fill_res;
915 				}
916 #else
917 				if (nhsel < 1) {
918 					goto out_fill_res;
919 				}
920 #endif
921 				endfor_nexthops(fi);
922 				continue;
923 
924 			default:
925 				printk(KERN_DEBUG "impossible 102\n");
926 				return -EINVAL;
927 			};
928 		}
929 		return err;
930 	}
931 	return 1;
932 
933 out_fill_res:
934 	res->prefixlen = prefixlen;
935 	res->nh_sel = nh_sel;
936 	res->type = fa->fa_type;
937 	res->scope = fa->fa_scope;
938 	res->fi = fa->fa_info;
939 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
940 	res->netmask = mask;
941 	res->network = zone & inet_make_mask(prefixlen);
942 #endif
943 	atomic_inc(&res->fi->fib_clntref);
944 	return 0;
945 }
946 
947 /* Find appropriate source address to this destination */
948 
949 __be32 __fib_res_prefsrc(struct fib_result *res)
950 {
951 	return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
952 }
953 
954 int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
955 		  u32 tb_id, u8 type, u8 scope, __be32 dst, int dst_len, u8 tos,
956 		  struct fib_info *fi, unsigned int flags)
957 {
958 	struct nlmsghdr *nlh;
959 	struct rtmsg *rtm;
960 
961 	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), flags);
962 	if (nlh == NULL)
963 		return -ENOBUFS;
964 
965 	rtm = nlmsg_data(nlh);
966 	rtm->rtm_family = AF_INET;
967 	rtm->rtm_dst_len = dst_len;
968 	rtm->rtm_src_len = 0;
969 	rtm->rtm_tos = tos;
970 	rtm->rtm_table = tb_id;
971 	NLA_PUT_U32(skb, RTA_TABLE, tb_id);
972 	rtm->rtm_type = type;
973 	rtm->rtm_flags = fi->fib_flags;
974 	rtm->rtm_scope = scope;
975 	rtm->rtm_protocol = fi->fib_protocol;
976 
977 	if (rtm->rtm_dst_len)
978 		NLA_PUT_BE32(skb, RTA_DST, dst);
979 
980 	if (fi->fib_priority)
981 		NLA_PUT_U32(skb, RTA_PRIORITY, fi->fib_priority);
982 
983 	if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
984 		goto nla_put_failure;
985 
986 	if (fi->fib_prefsrc)
987 		NLA_PUT_BE32(skb, RTA_PREFSRC, fi->fib_prefsrc);
988 
989 	if (fi->fib_nhs == 1) {
990 		if (fi->fib_nh->nh_gw)
991 			NLA_PUT_BE32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw);
992 
993 		if (fi->fib_nh->nh_oif)
994 			NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif);
995 #ifdef CONFIG_NET_CLS_ROUTE
996 		if (fi->fib_nh[0].nh_tclassid)
997 			NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid);
998 #endif
999 	}
1000 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1001 	if (fi->fib_nhs > 1) {
1002 		struct rtnexthop *rtnh;
1003 		struct nlattr *mp;
1004 
1005 		mp = nla_nest_start(skb, RTA_MULTIPATH);
1006 		if (mp == NULL)
1007 			goto nla_put_failure;
1008 
1009 		for_nexthops(fi) {
1010 			rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
1011 			if (rtnh == NULL)
1012 				goto nla_put_failure;
1013 
1014 			rtnh->rtnh_flags = nh->nh_flags & 0xFF;
1015 			rtnh->rtnh_hops = nh->nh_weight - 1;
1016 			rtnh->rtnh_ifindex = nh->nh_oif;
1017 
1018 			if (nh->nh_gw)
1019 				NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw);
1020 #ifdef CONFIG_NET_CLS_ROUTE
1021 			if (nh->nh_tclassid)
1022 				NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid);
1023 #endif
1024 			/* length of rtnetlink header + attributes */
1025 			rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
1026 		} endfor_nexthops(fi);
1027 
1028 		nla_nest_end(skb, mp);
1029 	}
1030 #endif
1031 	return nlmsg_end(skb, nlh);
1032 
1033 nla_put_failure:
1034 	return nlmsg_cancel(skb, nlh);
1035 }
1036 
1037 /*
1038    Update FIB if:
1039    - local address disappeared -> we must delete all the entries
1040      referring to it.
1041    - device went down -> we must shutdown all nexthops going via it.
1042  */
1043 
1044 int fib_sync_down(__be32 local, struct net_device *dev, int force)
1045 {
1046 	int ret = 0;
1047 	int scope = RT_SCOPE_NOWHERE;
1048 
1049 	if (force)
1050 		scope = -1;
1051 
1052 	if (local && fib_info_laddrhash) {
1053 		unsigned int hash = fib_laddr_hashfn(local);
1054 		struct hlist_head *head = &fib_info_laddrhash[hash];
1055 		struct hlist_node *node;
1056 		struct fib_info *fi;
1057 
1058 		hlist_for_each_entry(fi, node, head, fib_lhash) {
1059 			if (fi->fib_prefsrc == local) {
1060 				fi->fib_flags |= RTNH_F_DEAD;
1061 				ret++;
1062 			}
1063 		}
1064 	}
1065 
1066 	if (dev) {
1067 		struct fib_info *prev_fi = NULL;
1068 		unsigned int hash = fib_devindex_hashfn(dev->ifindex);
1069 		struct hlist_head *head = &fib_info_devhash[hash];
1070 		struct hlist_node *node;
1071 		struct fib_nh *nh;
1072 
1073 		hlist_for_each_entry(nh, node, head, nh_hash) {
1074 			struct fib_info *fi = nh->nh_parent;
1075 			int dead;
1076 
1077 			BUG_ON(!fi->fib_nhs);
1078 			if (nh->nh_dev != dev || fi == prev_fi)
1079 				continue;
1080 			prev_fi = fi;
1081 			dead = 0;
1082 			change_nexthops(fi) {
1083 				if (nh->nh_flags&RTNH_F_DEAD)
1084 					dead++;
1085 				else if (nh->nh_dev == dev &&
1086 					 nh->nh_scope != scope) {
1087 					nh->nh_flags |= RTNH_F_DEAD;
1088 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1089 					spin_lock_bh(&fib_multipath_lock);
1090 					fi->fib_power -= nh->nh_power;
1091 					nh->nh_power = 0;
1092 					spin_unlock_bh(&fib_multipath_lock);
1093 #endif
1094 					dead++;
1095 				}
1096 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1097 				if (force > 1 && nh->nh_dev == dev) {
1098 					dead = fi->fib_nhs;
1099 					break;
1100 				}
1101 #endif
1102 			} endfor_nexthops(fi)
1103 			if (dead == fi->fib_nhs) {
1104 				fi->fib_flags |= RTNH_F_DEAD;
1105 				ret++;
1106 			}
1107 		}
1108 	}
1109 
1110 	return ret;
1111 }
1112 
1113 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1114 
1115 /*
1116    Dead device goes up. We wake up dead nexthops.
1117    It takes sense only on multipath routes.
1118  */
1119 
1120 int fib_sync_up(struct net_device *dev)
1121 {
1122 	struct fib_info *prev_fi;
1123 	unsigned int hash;
1124 	struct hlist_head *head;
1125 	struct hlist_node *node;
1126 	struct fib_nh *nh;
1127 	int ret;
1128 
1129 	if (!(dev->flags&IFF_UP))
1130 		return 0;
1131 
1132 	prev_fi = NULL;
1133 	hash = fib_devindex_hashfn(dev->ifindex);
1134 	head = &fib_info_devhash[hash];
1135 	ret = 0;
1136 
1137 	hlist_for_each_entry(nh, node, head, nh_hash) {
1138 		struct fib_info *fi = nh->nh_parent;
1139 		int alive;
1140 
1141 		BUG_ON(!fi->fib_nhs);
1142 		if (nh->nh_dev != dev || fi == prev_fi)
1143 			continue;
1144 
1145 		prev_fi = fi;
1146 		alive = 0;
1147 		change_nexthops(fi) {
1148 			if (!(nh->nh_flags&RTNH_F_DEAD)) {
1149 				alive++;
1150 				continue;
1151 			}
1152 			if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP))
1153 				continue;
1154 			if (nh->nh_dev != dev || !__in_dev_get_rtnl(dev))
1155 				continue;
1156 			alive++;
1157 			spin_lock_bh(&fib_multipath_lock);
1158 			nh->nh_power = 0;
1159 			nh->nh_flags &= ~RTNH_F_DEAD;
1160 			spin_unlock_bh(&fib_multipath_lock);
1161 		} endfor_nexthops(fi)
1162 
1163 		if (alive > 0) {
1164 			fi->fib_flags &= ~RTNH_F_DEAD;
1165 			ret++;
1166 		}
1167 	}
1168 
1169 	return ret;
1170 }
1171 
1172 /*
1173    The algorithm is suboptimal, but it provides really
1174    fair weighted route distribution.
1175  */
1176 
1177 void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
1178 {
1179 	struct fib_info *fi = res->fi;
1180 	int w;
1181 
1182 	spin_lock_bh(&fib_multipath_lock);
1183 	if (fi->fib_power <= 0) {
1184 		int power = 0;
1185 		change_nexthops(fi) {
1186 			if (!(nh->nh_flags&RTNH_F_DEAD)) {
1187 				power += nh->nh_weight;
1188 				nh->nh_power = nh->nh_weight;
1189 			}
1190 		} endfor_nexthops(fi);
1191 		fi->fib_power = power;
1192 		if (power <= 0) {
1193 			spin_unlock_bh(&fib_multipath_lock);
1194 			/* Race condition: route has just become dead. */
1195 			res->nh_sel = 0;
1196 			return;
1197 		}
1198 	}
1199 
1200 
1201 	/* w should be random number [0..fi->fib_power-1],
1202 	   it is pretty bad approximation.
1203 	 */
1204 
1205 	w = jiffies % fi->fib_power;
1206 
1207 	change_nexthops(fi) {
1208 		if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) {
1209 			if ((w -= nh->nh_power) <= 0) {
1210 				nh->nh_power--;
1211 				fi->fib_power--;
1212 				res->nh_sel = nhsel;
1213 				spin_unlock_bh(&fib_multipath_lock);
1214 				return;
1215 			}
1216 		}
1217 	} endfor_nexthops(fi);
1218 
1219 	/* Race condition: route has just become dead. */
1220 	res->nh_sel = 0;
1221 	spin_unlock_bh(&fib_multipath_lock);
1222 }
1223 #endif
1224