/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IPv4 Forwarding Information Base: semantics.
 *
 * Version:	$Id: fib_semantics.c,v 1.19 2002/01/12 07:54:56 davem Exp $
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/skbuff.h>
#include <linux/init.h>

#include <net/arp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/ip_mp_alg.h>
#include <net/netlink.h>
#include <net/nexthop.h>

#include "fib_lookup.h"

#define FSprintk(a...)

static DEFINE_SPINLOCK(fib_info_lock);
static struct hlist_head *fib_info_hash;
static struct hlist_head *fib_info_laddrhash;
static unsigned int fib_hash_size;
static unsigned int fib_info_cnt;

#define DEVINDEX_HASHBITS 8
#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];

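/*
 * for_nexthops()/change_nexthops() open a block that walks the fib_nh
 * array of a fib_info, exposing "nh" (the current nexthop) and "nhsel"
 * (its index); endfor_nexthops() supplies the closing brace.  A minimal
 * usage sketch (illustrative only, not a real caller):
 *
 *	change_nexthops(fi) {
 *		if (nh->nh_oif == oif)
 *			nh->nh_flags |= RTNH_F_DEAD;
 *	} endfor_nexthops(fi)
 *
 * Without CONFIG_IP_ROUTE_MULTIPATH the "loop" runs exactly once,
 * since fib_nhs is always 1 in that configuration.
 */
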
#ifdef CONFIG_IP_ROUTE_MULTIPATH

static DEFINE_SPINLOCK(fib_multipath_lock);

#define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \
for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)

#define change_nexthops(fi) { int nhsel; struct fib_nh * nh; \
for (nhsel=0, nh = (struct fib_nh*)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++)

#else /* CONFIG_IP_ROUTE_MULTIPATH */

/* Hope that gcc will optimize this to get rid of the dummy loop */

#define for_nexthops(fi) { int nhsel=0; const struct fib_nh * nh = (fi)->fib_nh; \
for (nhsel=0; nhsel < 1; nhsel++)

#define change_nexthops(fi) { int nhsel=0; struct fib_nh * nh = (struct fib_nh*)((fi)->fib_nh); \
for (nhsel=0; nhsel < 1; nhsel++)

#endif /* CONFIG_IP_ROUTE_MULTIPATH */

#define endfor_nexthops(fi) }


static const struct
{
	int	error;
	u8	scope;
} fib_props[RTN_MAX + 1] = {
	{
		.error	= 0,
		.scope	= RT_SCOPE_NOWHERE,
	},	/* RTN_UNSPEC */
	{
		.error	= 0,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_UNICAST */
	{
		.error	= 0,
		.scope	= RT_SCOPE_HOST,
	},	/* RTN_LOCAL */
	{
		.error	= 0,
		.scope	= RT_SCOPE_LINK,
	},	/* RTN_BROADCAST */
	{
		.error	= 0,
		.scope	= RT_SCOPE_LINK,
	},	/* RTN_ANYCAST */
	{
		.error	= 0,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_MULTICAST */
	{
		.error	= -EINVAL,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_BLACKHOLE */
	{
		.error	= -EHOSTUNREACH,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_UNREACHABLE */
	{
		.error	= -EACCES,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_PROHIBIT */
	{
		.error	= -EAGAIN,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_THROW */
	{
		.error	= -EINVAL,
		.scope	= RT_SCOPE_NOWHERE,
	},	/* RTN_NAT */
	{
		.error	= -EINVAL,
		.scope	= RT_SCOPE_NOWHERE,
	},	/* RTN_XRESOLVE */
};
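
/*
 * How the table above is consumed (sketch): fib_semantic_match() reads
 * fib_props[fa->fa_type].error and, for a reject-type route, returns it
 * straight to the caller.  For instance, a route created with
 * "ip route add unreachable 10.0.0.0/8" has fa_type == RTN_UNREACHABLE,
 * so lookups matching it fail with -EHOSTUNREACH.
 */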


/* Release a nexthop info record */

void free_fib_info(struct fib_info *fi)
{
	if (fi->fib_dead == 0) {
		printk(KERN_WARNING "Freeing alive fib_info %p\n", fi);
		return;
	}
	change_nexthops(fi) {
		if (nh->nh_dev)
			dev_put(nh->nh_dev);
		nh->nh_dev = NULL;
	} endfor_nexthops(fi);
	fib_info_cnt--;
	kfree(fi);
}

void fib_release_info(struct fib_info *fi)
{
	spin_lock_bh(&fib_info_lock);
	if (fi && --fi->fib_treeref == 0) {
		hlist_del(&fi->fib_hash);
		if (fi->fib_prefsrc)
			hlist_del(&fi->fib_lhash);
		change_nexthops(fi) {
			if (!nh->nh_dev)
				continue;
			hlist_del(&nh->nh_hash);
		} endfor_nexthops(fi)
		fi->fib_dead = 1;
		fib_info_put(fi);
	}
	spin_unlock_bh(&fib_info_lock);
}

static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
{
	const struct fib_nh *onh = ofi->fib_nh;

	for_nexthops(fi) {
		if (nh->nh_oif != onh->nh_oif ||
		    nh->nh_gw  != onh->nh_gw ||
		    nh->nh_scope != onh->nh_scope ||
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		    nh->nh_weight != onh->nh_weight ||
#endif
#ifdef CONFIG_NET_CLS_ROUTE
		    nh->nh_tclassid != onh->nh_tclassid ||
#endif
		    ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
			return -1;
		onh++;
	} endfor_nexthops(fi);
	return 0;
}

static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
{
	unsigned int mask = (fib_hash_size - 1);
	unsigned int val = fi->fib_nhs;

	val ^= fi->fib_protocol;
	val ^= (__force u32)fi->fib_prefsrc;
	val ^= fi->fib_priority;

	return (val ^ (val >> 7) ^ (val >> 12)) & mask;
}

static struct fib_info *fib_find_info(const struct fib_info *nfi)
{
	struct hlist_head *head;
	struct hlist_node *node;
	struct fib_info *fi;
	unsigned int hash;

	hash = fib_info_hashfn(nfi);
	head = &fib_info_hash[hash];

	hlist_for_each_entry(fi, node, head, fib_hash) {
		if (fi->fib_nhs != nfi->fib_nhs)
			continue;
		if (nfi->fib_protocol == fi->fib_protocol &&
		    nfi->fib_prefsrc == fi->fib_prefsrc &&
		    nfi->fib_priority == fi->fib_priority &&
		    memcmp(nfi->fib_metrics, fi->fib_metrics,
			   sizeof(fi->fib_metrics)) == 0 &&
		    ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
		    (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
			return fi;
	}

	return NULL;
}

static inline unsigned int fib_devindex_hashfn(unsigned int val)
{
	unsigned int mask = DEVINDEX_HASHSIZE - 1;

	return (val ^
		(val >> DEVINDEX_HASHBITS) ^
		(val >> (DEVINDEX_HASHBITS * 2))) & mask;
}

/* Check that the gateway is already configured.
   Used only by the redirect acceptance routine.
 */

int ip_fib_check_default(__be32 gw, struct net_device *dev)
{
	struct hlist_head *head;
	struct hlist_node *node;
	struct fib_nh *nh;
	unsigned int hash;

	spin_lock(&fib_info_lock);

	hash = fib_devindex_hashfn(dev->ifindex);
	head = &fib_info_devhash[hash];
	hlist_for_each_entry(nh, node, head, nh_hash) {
		if (nh->nh_dev == dev &&
		    nh->nh_gw == gw &&
		    !(nh->nh_flags&RTNH_F_DEAD)) {
			spin_unlock(&fib_info_lock);
			return 0;
		}
	}

	spin_unlock(&fib_info_lock);

	return -1;
}

static inline size_t fib_nlmsg_size(struct fib_info *fi)
{
	size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
			 + nla_total_size(4) /* RTA_TABLE */
			 + nla_total_size(4) /* RTA_DST */
			 + nla_total_size(4) /* RTA_PRIORITY */
			 + nla_total_size(4); /* RTA_PREFSRC */

	/* space for nested metrics */
	payload += nla_total_size((RTAX_MAX * nla_total_size(4)));

	if (fi->fib_nhs) {
		/* Also handles the special case fib_nhs == 1 */

		/* each nexthop is packed in an attribute */
		size_t nhsize = nla_total_size(sizeof(struct rtnexthop));

		/* may contain flow and gateway attribute */
		nhsize += 2 * nla_total_size(4);

		/* all nexthops are packed in a nested attribute */
		payload += nla_total_size(fi->fib_nhs * nhsize);
	}

	return payload;
}

void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
	       int dst_len, u32 tb_id, struct nl_info *info)
{
	struct sk_buff *skb;
	u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
	int err = -ENOBUFS;

	skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL);
	if (skb == NULL)
		goto errout;

	err = fib_dump_info(skb, info->pid, seq, event, tb_id,
			    fa->fa_type, fa->fa_scope, key, dst_len,
			    fa->fa_tos, fa->fa_info, 0);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in fib_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	err = rtnl_notify(skb, info->pid, RTNLGRP_IPV4_ROUTE,
			  info->nlh, GFP_KERNEL);
errout:
	if (err < 0)
		rtnl_set_sk_err(RTNLGRP_IPV4_ROUTE, err);
}

/* Return the first fib alias matching TOS with
 * priority less than or equal to PRIO.
 */
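/* Concretely (illustrative values): the alias list is kept sorted by
 * descending fa_tos and, within one TOS, by ascending fib_priority.
 * Given entries (tos 8, metric 10), (tos 0, metric 5), (tos 0, metric 20),
 * fib_find_alias(fah, 0, 10) skips the first two and returns
 * (tos 0, metric 20): the first entry not "better" than the request,
 * which is where the insertion code would place a new alias.
 */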
struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
{
	if (fah) {
		struct fib_alias *fa;
		list_for_each_entry(fa, fah, fa_list) {
			if (fa->fa_tos > tos)
				continue;
			if (fa->fa_info->fib_priority >= prio ||
			    fa->fa_tos < tos)
				return fa;
		}
	}
	return NULL;
}

int fib_detect_death(struct fib_info *fi, int order,
		     struct fib_info **last_resort, int *last_idx, int *dflt)
{
	struct neighbour *n;
	int state = NUD_NONE;

	n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
	if (n) {
		state = n->nud_state;
		neigh_release(n);
	}
	if (state==NUD_REACHABLE)
		return 0;
	if ((state&NUD_VALID) && order != *dflt)
		return 0;
	if ((state&NUD_VALID) ||
	    (*last_idx<0 && order > *dflt)) {
		*last_resort = fi;
		*last_idx = order;
	}
	return 1;
}

#ifdef CONFIG_IP_ROUTE_MULTIPATH

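/* Wire format handled below (sketch): the RTA_MULTIPATH attribute carries
 * a packed array of variable-length entries, each a struct rtnexthop
 * optionally followed by per-nexthop attributes such as RTA_GATEWAY:
 *
 *	+--------------+-------------+-- ... --+--------------+-------------+
 *	| rtnexthop #0 | RTA_GATEWAY |   ...   | rtnexthop #N | RTA_GATEWAY |
 *	+--------------+-------------+-- ... --+--------------+-------------+
 *
 * rtnh_ok()/rtnh_next() walk this array; rtnh_attrlen()/rtnh_attrs()
 * expose the attribute tail of one entry.
 */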
static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
{
	int nhs = 0;

	while (rtnh_ok(rtnh, remaining)) {
		nhs++;
		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* leftover implies invalid nexthop configuration, discard it */
	return remaining > 0 ? 0 : nhs;
}

static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
		       int remaining, struct fib_config *cfg)
{
	change_nexthops(fi) {
		int attrlen;

		if (!rtnh_ok(rtnh, remaining))
			return -EINVAL;

		nh->nh_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
		nh->nh_oif = rtnh->rtnh_ifindex;
		nh->nh_weight = rtnh->rtnh_hops + 1;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			nh->nh_gw = nla ? nla_get_be32(nla) : 0;
#ifdef CONFIG_NET_CLS_ROUTE
			nla = nla_find(attrs, attrlen, RTA_FLOW);
			nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
#endif
		}

		rtnh = rtnh_next(rtnh, &remaining);
	} endfor_nexthops(fi);

	return 0;
}

#endif

int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	struct rtnexthop *rtnh;
	int remaining;
#endif

	if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
		return 1;

	if (cfg->fc_oif || cfg->fc_gw) {
		if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
		    (!cfg->fc_gw  || cfg->fc_gw == fi->fib_nh->nh_gw))
			return 0;
		return 1;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (cfg->fc_mp == NULL)
		return 0;

	rtnh = cfg->fc_mp;
	remaining = cfg->fc_mp_len;

	for_nexthops(fi) {
		int attrlen;

		if (!rtnh_ok(rtnh, remaining))
			return -EINVAL;

		if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->nh_oif)
			return 1;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla && nla_get_be32(nla) != nh->nh_gw)
				return 1;
#ifdef CONFIG_NET_CLS_ROUTE
			nla = nla_find(attrs, attrlen, RTA_FLOW);
			if (nla && nla_get_u32(nla) != nh->nh_tclassid)
				return 1;
#endif
		}

		rtnh = rtnh_next(rtnh, &remaining);
	} endfor_nexthops(fi);
#endif
	return 0;
}


/*
   Picture
   -------

   The semantics of nexthops are very messy for historical reasons.
   We have to take into account that:
   a) the gateway can actually be a local interface address,
      so that a gatewayed route is direct.
   b) the gateway must be an on-link address, possibly
      described not by an ifaddr but by a direct route.
   c) if both a gateway and an interface are specified, they must not
      contradict each other.
   d) with tunnel routes, the gateway could be off-link.

   Attempting to reconcile all of these (alas, self-contradictory)
   conditions results in pretty ugly and hairy code with obscure logic.

   I chose to generalize it instead, so that the size
   of the code practically does not increase, but it becomes
   much more general.
   Every prefix is assigned a "scope" value: "host" is a local address,
   "link" is a direct route,
   [ ... "site" ... "interior" ... ]
   and "universe" is a true gateway route with global meaning.

   Every prefix refers to a set of "nexthop"s (gw, oif),
   where gw must have narrower scope. This recursion stops
   when gw has LOCAL scope or if "nexthop" is declared ONLINK,
   which means that gw is forced to be on link.

   The code is still hairy, but now it is apparently logically
   consistent and very flexible. E.g. as a by-product it allows
   independent exterior and interior routing processes
   to coexist in peace.

   Normally it looks like the following:

   {universe prefix}  -> (gw, oif) [scope link]
			  |
			  |-> {link prefix} -> (gw, oif) [scope local]
						|
						|-> {local prefix} (terminal node)
 */
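
/* A concrete illustration of the scope chain above (example addresses):
 *
 *	10.0.0.0/8 via 192.168.1.1	-> universe prefix,
 *	resolved through the connected route
 *	192.168.1.0/24 dev eth0		-> link prefix,
 *	whose "gateway" is the local ifaddr
 *	local 192.168.1.2 dev eth0	-> local prefix, terminal node
 *
 * fib_check_nh() below performs exactly this downward walk: it resolves
 * the gateway at a scope strictly narrower than the route's own (clamped
 * to at least RT_SCOPE_LINK) and requires the result to be RTN_UNICAST
 * or RTN_LOCAL.
 */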

static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
			struct fib_nh *nh)
{
	int err;

	if (nh->nh_gw) {
		struct fib_result res;

#ifdef CONFIG_IP_ROUTE_PERVASIVE
		if (nh->nh_flags&RTNH_F_PERVASIVE)
			return 0;
#endif
		if (nh->nh_flags&RTNH_F_ONLINK) {
			struct net_device *dev;

			if (cfg->fc_scope >= RT_SCOPE_LINK)
				return -EINVAL;
			if (inet_addr_type(nh->nh_gw) != RTN_UNICAST)
				return -EINVAL;
			if ((dev = __dev_get_by_index(nh->nh_oif)) == NULL)
				return -ENODEV;
			if (!(dev->flags&IFF_UP))
				return -ENETDOWN;
			nh->nh_dev = dev;
			dev_hold(dev);
			nh->nh_scope = RT_SCOPE_LINK;
			return 0;
		}
		{
			struct flowi fl = {
				.nl_u = {
					.ip4_u = {
						.daddr = nh->nh_gw,
						.scope = cfg->fc_scope + 1,
					},
				},
				.oif = nh->nh_oif,
			};

			/* Not strictly necessary, but seeing why requires a bit of thinking */
			if (fl.fl4_scope < RT_SCOPE_LINK)
				fl.fl4_scope = RT_SCOPE_LINK;
			if ((err = fib_lookup(&fl, &res)) != 0)
				return err;
		}
		err = -EINVAL;
		if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
			goto out;
		nh->nh_scope = res.scope;
		nh->nh_oif = FIB_RES_OIF(res);
		if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
			goto out;
		dev_hold(nh->nh_dev);
		err = -ENETDOWN;
		if (!(nh->nh_dev->flags & IFF_UP))
			goto out;
		err = 0;
out:
		fib_res_put(&res);
		return err;
	} else {
		struct in_device *in_dev;

		if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
			return -EINVAL;

		in_dev = inetdev_by_index(nh->nh_oif);
		if (in_dev == NULL)
			return -ENODEV;
		if (!(in_dev->dev->flags&IFF_UP)) {
			in_dev_put(in_dev);
			return -ENETDOWN;
		}
		nh->nh_dev = in_dev->dev;
		dev_hold(nh->nh_dev);
		nh->nh_scope = RT_SCOPE_HOST;
		in_dev_put(in_dev);
	}
	return 0;
}

static inline unsigned int fib_laddr_hashfn(__be32 val)
{
	unsigned int mask = (fib_hash_size - 1);

	return ((__force u32)val ^ ((__force u32)val >> 7) ^ ((__force u32)val >> 14)) & mask;
}

static struct hlist_head *fib_hash_alloc(int bytes)
{
	if (bytes <= PAGE_SIZE)
		return kmalloc(bytes, GFP_KERNEL);
	else
		return (struct hlist_head *)
			__get_free_pages(GFP_KERNEL, get_order(bytes));
}

static void fib_hash_free(struct hlist_head *hash, int bytes)
{
	if (!hash)
		return;

	if (bytes <= PAGE_SIZE)
		kfree(hash);
	else
		free_pages((unsigned long) hash, get_order(bytes));
}

static void fib_hash_move(struct hlist_head *new_info_hash,
			  struct hlist_head *new_laddrhash,
			  unsigned int new_size)
{
	struct hlist_head *old_info_hash, *old_laddrhash;
	unsigned int old_size = fib_hash_size;
	unsigned int i, bytes;

	spin_lock_bh(&fib_info_lock);
	old_info_hash = fib_info_hash;
	old_laddrhash = fib_info_laddrhash;
	fib_hash_size = new_size;

	for (i = 0; i < old_size; i++) {
		struct hlist_head *head = &fib_info_hash[i];
		struct hlist_node *node, *n;
		struct fib_info *fi;

		hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
			struct hlist_head *dest;
			unsigned int new_hash;

			hlist_del(&fi->fib_hash);

			new_hash = fib_info_hashfn(fi);
			dest = &new_info_hash[new_hash];
			hlist_add_head(&fi->fib_hash, dest);
		}
	}
	fib_info_hash = new_info_hash;

	for (i = 0; i < old_size; i++) {
		struct hlist_head *lhead = &fib_info_laddrhash[i];
		struct hlist_node *node, *n;
		struct fib_info *fi;

		hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
			struct hlist_head *ldest;
			unsigned int new_hash;

			hlist_del(&fi->fib_lhash);

			new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
			ldest = &new_laddrhash[new_hash];
			hlist_add_head(&fi->fib_lhash, ldest);
		}
	}
	fib_info_laddrhash = new_laddrhash;

	spin_unlock_bh(&fib_info_lock);

	bytes = old_size * sizeof(struct hlist_head);
	fib_hash_free(old_info_hash, bytes);
	fib_hash_free(old_laddrhash, bytes);
}

struct fib_info *fib_create_info(struct fib_config *cfg)
{
	int err;
	struct fib_info *fi = NULL;
	struct fib_info *ofi;
	int nhs = 1;

	/* Fast check to catch the weirdest cases */
	if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
		goto err_inval;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (cfg->fc_mp) {
		nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len);
		if (nhs == 0)
			goto err_inval;
	}
#endif
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	if (cfg->fc_mp_alg) {
		if (cfg->fc_mp_alg < IP_MP_ALG_NONE ||
		    cfg->fc_mp_alg > IP_MP_ALG_MAX)
			goto err_inval;
	}
#endif

	err = -ENOBUFS;
	if (fib_info_cnt >= fib_hash_size) {
		unsigned int new_size = fib_hash_size << 1;
		struct hlist_head *new_info_hash;
		struct hlist_head *new_laddrhash;
		unsigned int bytes;

		if (!new_size)
			new_size = 1;
		bytes = new_size * sizeof(struct hlist_head);
		new_info_hash = fib_hash_alloc(bytes);
		new_laddrhash = fib_hash_alloc(bytes);
		if (!new_info_hash || !new_laddrhash) {
			fib_hash_free(new_info_hash, bytes);
			fib_hash_free(new_laddrhash, bytes);
		} else {
			memset(new_info_hash, 0, bytes);
			memset(new_laddrhash, 0, bytes);

			fib_hash_move(new_info_hash, new_laddrhash, new_size);
		}

		if (!fib_hash_size)
			goto failure;
	}

	fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
	if (fi == NULL)
		goto failure;
	fib_info_cnt++;

	fi->fib_protocol = cfg->fc_protocol;
	fi->fib_flags = cfg->fc_flags;
	fi->fib_priority = cfg->fc_priority;
	fi->fib_prefsrc = cfg->fc_prefsrc;

	fi->fib_nhs = nhs;
	change_nexthops(fi) {
		nh->nh_parent = fi;
	} endfor_nexthops(fi)

	if (cfg->fc_mx) {
		struct nlattr *nla;
		int remaining;

		nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
			int type = nla->nla_type;

			if (type) {
				if (type > RTAX_MAX)
					goto err_inval;
				fi->fib_metrics[type - 1] = nla_get_u32(nla);
			}
		}
	}

	if (cfg->fc_mp) {
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
		if (err != 0)
			goto failure;
		if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif)
			goto err_inval;
		if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
			goto err_inval;
#ifdef CONFIG_NET_CLS_ROUTE
		if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
			goto err_inval;
#endif
#else
		goto err_inval;
#endif
	} else {
		struct fib_nh *nh = fi->fib_nh;

		nh->nh_oif = cfg->fc_oif;
		nh->nh_gw = cfg->fc_gw;
		nh->nh_flags = cfg->fc_flags;
#ifdef CONFIG_NET_CLS_ROUTE
		nh->nh_tclassid = cfg->fc_flow;
#endif
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		nh->nh_weight = 1;
#endif
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	fi->fib_mp_alg = cfg->fc_mp_alg;
#endif

	if (fib_props[cfg->fc_type].error) {
		if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
			goto err_inval;
		goto link_it;
	}

	if (cfg->fc_scope > RT_SCOPE_HOST)
		goto err_inval;

	if (cfg->fc_scope == RT_SCOPE_HOST) {
		struct fib_nh *nh = fi->fib_nh;

		/* Local address is added. */
		if (nhs != 1 || nh->nh_gw)
			goto err_inval;
		nh->nh_scope = RT_SCOPE_NOWHERE;
		nh->nh_dev = dev_get_by_index(fi->fib_nh->nh_oif);
		err = -ENODEV;
		if (nh->nh_dev == NULL)
			goto failure;
	} else {
		change_nexthops(fi) {
			if ((err = fib_check_nh(cfg, fi, nh)) != 0)
				goto failure;
		} endfor_nexthops(fi)
	}

	if (fi->fib_prefsrc) {
		if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
		    fi->fib_prefsrc != cfg->fc_dst)
			if (inet_addr_type(fi->fib_prefsrc) != RTN_LOCAL)
				goto err_inval;
	}

link_it:
	if ((ofi = fib_find_info(fi)) != NULL) {
		fi->fib_dead = 1;
		free_fib_info(fi);
		ofi->fib_treeref++;
		return ofi;
	}

	fi->fib_treeref++;
	atomic_inc(&fi->fib_clntref);
	spin_lock_bh(&fib_info_lock);
	hlist_add_head(&fi->fib_hash,
		       &fib_info_hash[fib_info_hashfn(fi)]);
	if (fi->fib_prefsrc) {
		struct hlist_head *head;

		head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
		hlist_add_head(&fi->fib_lhash, head);
	}
	change_nexthops(fi) {
		struct hlist_head *head;
		unsigned int hash;

		if (!nh->nh_dev)
			continue;
		hash = fib_devindex_hashfn(nh->nh_dev->ifindex);
		head = &fib_info_devhash[hash];
		hlist_add_head(&nh->nh_hash, head);
	} endfor_nexthops(fi)
	spin_unlock_bh(&fib_info_lock);
	return fi;

err_inval:
	err = -EINVAL;

failure:
	if (fi) {
		fi->fib_dead = 1;
		free_fib_info(fi);
	}

	return ERR_PTR(err);
}
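
/* Refcounting recap (describing the code above, nothing new): fib_treeref
 * counts how many tree entries share this fib_info and is dropped under
 * fib_info_lock in fib_release_info(), while fib_clntref is an atomic
 * reference taken per successful lookup and dropped via fib_info_put().
 * fib_create_info() either returns a deduplicated existing fib_info (via
 * fib_find_info()) with fib_treeref bumped, or links a fresh one into
 * fib_info_hash, fib_info_laddrhash and fib_info_devhash.
 */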

/* Note! fib_semantic_match intentionally uses RCU list functions. */
int fib_semantic_match(struct list_head *head, const struct flowi *flp,
		       struct fib_result *res, __be32 zone, __be32 mask,
		       int prefixlen)
{
	struct fib_alias *fa;
	int nh_sel = 0;

	list_for_each_entry_rcu(fa, head, fa_list) {
		int err;

		if (fa->fa_tos &&
		    fa->fa_tos != flp->fl4_tos)
			continue;

		if (fa->fa_scope < flp->fl4_scope)
			continue;

		fa->fa_state |= FA_S_ACCESSED;

		err = fib_props[fa->fa_type].error;
		if (err == 0) {
			struct fib_info *fi = fa->fa_info;

			if (fi->fib_flags & RTNH_F_DEAD)
				continue;

			switch (fa->fa_type) {
			case RTN_UNICAST:
			case RTN_LOCAL:
			case RTN_BROADCAST:
			case RTN_ANYCAST:
			case RTN_MULTICAST:
				for_nexthops(fi) {
					if (nh->nh_flags&RTNH_F_DEAD)
						continue;
					if (!flp->oif || flp->oif == nh->nh_oif)
						break;
				}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
				if (nhsel < fi->fib_nhs) {
					nh_sel = nhsel;
					goto out_fill_res;
				}
#else
				if (nhsel < 1) {
					goto out_fill_res;
				}
#endif
				endfor_nexthops(fi);
				continue;

			default:
				printk(KERN_DEBUG "impossible 102\n");
				return -EINVAL;
			}
		}
		return err;
	}
	return 1;

out_fill_res:
	res->prefixlen = prefixlen;
	res->nh_sel = nh_sel;
	res->type = fa->fa_type;
	res->scope = fa->fa_scope;
	res->fi = fa->fa_info;
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	res->netmask = mask;
	res->network = zone & inet_make_mask(prefixlen);
#endif
	atomic_inc(&res->fi->fib_clntref);
	return 0;
}

/* Find an appropriate source address for this destination */

__be32 __fib_res_prefsrc(struct fib_result *res)
{
	return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
}

int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
		  u32 tb_id, u8 type, u8 scope, __be32 dst, int dst_len, u8 tos,
		  struct fib_info *fi, unsigned int flags)
{
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET;
	rtm->rtm_dst_len = dst_len;
	rtm->rtm_src_len = 0;
	rtm->rtm_tos = tos;
	rtm->rtm_table = tb_id;
	NLA_PUT_U32(skb, RTA_TABLE, tb_id);
	rtm->rtm_type = type;
	rtm->rtm_flags = fi->fib_flags;
	rtm->rtm_scope = scope;
	rtm->rtm_protocol = fi->fib_protocol;

	if (rtm->rtm_dst_len)
		NLA_PUT_BE32(skb, RTA_DST, dst);

	if (fi->fib_priority)
		NLA_PUT_U32(skb, RTA_PRIORITY, fi->fib_priority);

	if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
		goto nla_put_failure;

	if (fi->fib_prefsrc)
		NLA_PUT_BE32(skb, RTA_PREFSRC, fi->fib_prefsrc);

	if (fi->fib_nhs == 1) {
		if (fi->fib_nh->nh_gw)
			NLA_PUT_BE32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw);

		if (fi->fib_nh->nh_oif)
			NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif);
#ifdef CONFIG_NET_CLS_ROUTE
		if (fi->fib_nh[0].nh_tclassid)
			NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid);
#endif
	}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (fi->fib_nhs > 1) {
		struct rtnexthop *rtnh;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (mp == NULL)
			goto nla_put_failure;

		for_nexthops(fi) {
			rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
			if (rtnh == NULL)
				goto nla_put_failure;

			rtnh->rtnh_flags = nh->nh_flags & 0xFF;
			rtnh->rtnh_hops = nh->nh_weight - 1;
			rtnh->rtnh_ifindex = nh->nh_oif;

			if (nh->nh_gw)
				NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw);
#ifdef CONFIG_NET_CLS_ROUTE
			if (nh->nh_tclassid)
				NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid);
#endif
			/* length of rtnetlink header + attributes */
			rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
		} endfor_nexthops(fi);

		nla_nest_end(skb, mp);
	}
#endif
	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

/*
   Update the FIB if:
   - a local address disappeared -> we must delete all the entries
     referring to it.
   - a device went down -> we must shut down all nexthops going via it.
 */

int fib_sync_down(__be32 local, struct net_device *dev, int force)
{
	int ret = 0;
	int scope = RT_SCOPE_NOWHERE;

	if (force)
		scope = -1;

	if (local && fib_info_laddrhash) {
		unsigned int hash = fib_laddr_hashfn(local);
		struct hlist_head *head = &fib_info_laddrhash[hash];
		struct hlist_node *node;
		struct fib_info *fi;

		hlist_for_each_entry(fi, node, head, fib_lhash) {
			if (fi->fib_prefsrc == local) {
				fi->fib_flags |= RTNH_F_DEAD;
				ret++;
			}
		}
	}

	if (dev) {
		struct fib_info *prev_fi = NULL;
		unsigned int hash = fib_devindex_hashfn(dev->ifindex);
		struct hlist_head *head = &fib_info_devhash[hash];
		struct hlist_node *node;
		struct fib_nh *nh;

		hlist_for_each_entry(nh, node, head, nh_hash) {
			struct fib_info *fi = nh->nh_parent;
			int dead;

			BUG_ON(!fi->fib_nhs);
			if (nh->nh_dev != dev || fi == prev_fi)
				continue;
			prev_fi = fi;
			dead = 0;
			change_nexthops(fi) {
				if (nh->nh_flags&RTNH_F_DEAD)
					dead++;
				else if (nh->nh_dev == dev &&
					 nh->nh_scope != scope) {
					nh->nh_flags |= RTNH_F_DEAD;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
					spin_lock_bh(&fib_multipath_lock);
					fi->fib_power -= nh->nh_power;
					nh->nh_power = 0;
					spin_unlock_bh(&fib_multipath_lock);
#endif
					dead++;
				}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
				if (force > 1 && nh->nh_dev == dev) {
					dead = fi->fib_nhs;
					break;
				}
#endif
			} endfor_nexthops(fi)
			if (dead == fi->fib_nhs) {
				fi->fib_flags |= RTNH_F_DEAD;
				ret++;
			}
		}
	}

	return ret;
}

#ifdef CONFIG_IP_ROUTE_MULTIPATH

/*
   A dead device goes up. We wake up dead nexthops.
   This makes sense only on multipath routes.
 */

int fib_sync_up(struct net_device *dev)
{
	struct fib_info *prev_fi;
	unsigned int hash;
	struct hlist_head *head;
	struct hlist_node *node;
	struct fib_nh *nh;
	int ret;

	if (!(dev->flags&IFF_UP))
		return 0;

	prev_fi = NULL;
	hash = fib_devindex_hashfn(dev->ifindex);
	head = &fib_info_devhash[hash];
	ret = 0;

	hlist_for_each_entry(nh, node, head, nh_hash) {
		struct fib_info *fi = nh->nh_parent;
		int alive;

		BUG_ON(!fi->fib_nhs);
		if (nh->nh_dev != dev || fi == prev_fi)
			continue;

		prev_fi = fi;
		alive = 0;
		change_nexthops(fi) {
			if (!(nh->nh_flags&RTNH_F_DEAD)) {
				alive++;
				continue;
			}
			if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP))
				continue;
			if (nh->nh_dev != dev || !__in_dev_get_rtnl(dev))
				continue;
			alive++;
			spin_lock_bh(&fib_multipath_lock);
			nh->nh_power = 0;
			nh->nh_flags &= ~RTNH_F_DEAD;
			spin_unlock_bh(&fib_multipath_lock);
		} endfor_nexthops(fi)

		if (alive > 0) {
			fi->fib_flags &= ~RTNH_F_DEAD;
			ret++;
		}
	}

	return ret;
}

/*
   The algorithm is suboptimal, but it provides a really
   fair weighted route distribution.
 */
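
/* Worked example (made-up weights): with two live nexthops of weight 3
 * and 1, a refill sets nh_power = {3, 1} and fib_power = 4.  Each call
 * draws w = jiffies % fib_power and walks the nexthops, subtracting
 * nh_power from w until w drops to <= 0; the nexthop that absorbs w is
 * selected and both its nh_power and fib_power are decremented.  Over
 * one refill cycle the two nexthops are therefore chosen exactly 3 times
 * and 1 time respectively, matching their weights.
 */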

void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
{
	struct fib_info *fi = res->fi;
	int w;

	spin_lock_bh(&fib_multipath_lock);
	if (fi->fib_power <= 0) {
		int power = 0;
		change_nexthops(fi) {
			if (!(nh->nh_flags&RTNH_F_DEAD)) {
				power += nh->nh_weight;
				nh->nh_power = nh->nh_weight;
			}
		} endfor_nexthops(fi);
		fi->fib_power = power;
		if (power <= 0) {
			spin_unlock_bh(&fib_multipath_lock);
			/* Race condition: route has just become dead. */
			res->nh_sel = 0;
			return;
		}
	}

	/* w should be a random number in [0..fi->fib_power-1];
	   jiffies is a pretty bad approximation.
	 */

	w = jiffies % fi->fib_power;

	change_nexthops(fi) {
		if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) {
			if ((w -= nh->nh_power) <= 0) {
				nh->nh_power--;
				fi->fib_power--;
				res->nh_sel = nhsel;
				spin_unlock_bh(&fib_multipath_lock);
				return;
			}
		}
	} endfor_nexthops(fi);

	/* Race condition: route has just become dead. */
	res->nh_sel = 0;
	spin_unlock_bh(&fib_multipath_lock);
}
#endif
