/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IPv4 Forwarding Information Base: semantics.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/slab.h>

#include <net/arp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/netlink.h>
#include <net/nexthop.h>

#include "fib_lookup.h"

/* Serializes all insertions/removals in the fib_info hash tables below. */
static DEFINE_SPINLOCK(fib_info_lock);
/* All fib_info records, hashed by fib_info_hashfn() (protocol/scope/
 * prefsrc/priority/oifs). */
static struct hlist_head *fib_info_hash;
/* fib_infos that carry a preferred source address, hashed by that address
 * (see fib_laddr_hashfn()); used by fib_sync_down_addr(). */
static struct hlist_head *fib_info_laddrhash;
/* Current size (buckets) of both hash tables above; grown on demand in
 * fib_create_info(). */
static unsigned int fib_info_hash_size;
static unsigned int fib_info_cnt;

#define DEVINDEX_HASHBITS 8
#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
/* Nexthops hashed by their device ifindex; consumed by
 * fib_sync_down_dev()/fib_sync_up() and ip_fib_check_default(). */
static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];

#ifdef CONFIG_IP_ROUTE_MULTIPATH

/* Protects fib_power/nh_power accounting used by fib_select_multipath(). */
static DEFINE_SPINLOCK(fib_multipath_lock);

/* Iterate read-only over all nexthops of a fib_info.  Exposes "nh"
 * (const struct fib_nh *) and "nhsel" (index) in the loop body; must be
 * closed with endfor_nexthops(), which supplies the closing brace. */
#define for_nexthops(fi) {						\
	int nhsel; const struct fib_nh *nh;				\
	for (nhsel = 0, nh = (fi)->fib_nh;				\
	     nhsel < (fi)->fib_nhs;					\
	     nh++, nhsel++)

/* Like for_nexthops() but exposes a writable "nexthop_nh" pointer. */
#define change_nexthops(fi) {						\
	int nhsel; struct fib_nh *nexthop_nh;				\
	for (nhsel = 0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh);	\
	     nhsel < (fi)->fib_nhs;					\
	     nexthop_nh++, nhsel++)

#else /* CONFIG_IP_ROUTE_MULTIPATH */

/* Hope, that gcc will optimize it to get rid of dummy loop */

#define for_nexthops(fi) {						\
	int nhsel; const struct fib_nh *nh = (fi)->fib_nh;		\
	for (nhsel = 0; nhsel < 1; nhsel++)

#define change_nexthops(fi) {						\
	int nhsel;							\
	struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh);	\
	for (nhsel = 0; nhsel < 1; nhsel++)

#endif /* CONFIG_IP_ROUTE_MULTIPATH */

#define endfor_nexthops(fi) }


/* Per route-type properties: the error to report for lookups that hit the
 * type (0 = deliverable), and the narrowest scope the type permits. */
const struct fib_prop fib_props[RTN_MAX + 1] = {
	[RTN_UNSPEC] = {
		.error	= 0,
		.scope	= RT_SCOPE_NOWHERE,
	},
	[RTN_UNICAST] = {
		.error	= 0,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_LOCAL] = {
		.error	= 0,
		.scope	= RT_SCOPE_HOST,
	},
	[RTN_BROADCAST] = {
		.error	= 0,
		.scope	= RT_SCOPE_LINK,
	},
	[RTN_ANYCAST] = {
		.error	= 0,
		.scope	= RT_SCOPE_LINK,
	},
	[RTN_MULTICAST] = {
		.error	= 0,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_BLACKHOLE] = {
		.error	= -EINVAL,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_UNREACHABLE] = {
		.error	= -EHOSTUNREACH,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_PROHIBIT] = {
		.error	= -EACCES,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_THROW] = {
		.error	= -EAGAIN,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_NAT] = {
		.error	= -EINVAL,
		.scope	= RT_SCOPE_NOWHERE,
	},
	[RTN_XRESOLVE] = {
		.error	= -EINVAL,
		.scope	= RT_SCOPE_NOWHERE,
	},
};


/* Release a nexthop info record */

/* RCU callback: actually free a fib_info once all readers are done.
 * The metrics array is shared with dst_default_metrics when the route
 * carried no metrics of its own — never free it in that case. */
static void free_fib_info_rcu(struct rcu_head *head)
{
	struct fib_info *fi = container_of(head, struct fib_info, rcu);

	if (fi->fib_metrics != (u32 *) dst_default_metrics)
		kfree(fi->fib_metrics);
	kfree(fi);
}

/* Drop the device references held by a dead fib_info and schedule its
 * memory for RCU-deferred freeing.  Callers must have set fib_dead. */
void free_fib_info(struct fib_info *fi)
{
	if (fi->fib_dead == 0) {
		pr_warning("Freeing alive fib_info %p\n", fi);
		return;
	}
	change_nexthops(fi) {
		if (nexthop_nh->nh_dev)
			dev_put(nexthop_nh->nh_dev);
		nexthop_nh->nh_dev = NULL;
	} endfor_nexthops(fi);
	/* NOTE(review): fib_info_cnt is decremented without fib_info_lock
	 * here — presumably callers serialize via RTNL; confirm. */
	fib_info_cnt--;
	release_net(fi->fib_net);
	call_rcu(&fi->rcu, free_fib_info_rcu);
}

/* Drop one tree reference; on the last one, unlink the fib_info from all
 * hash tables under fib_info_lock and free it. */
void fib_release_info(struct fib_info *fi)
{
	spin_lock_bh(&fib_info_lock);
	if (fi && --fi->fib_treeref == 0) {
		hlist_del(&fi->fib_hash);
		if (fi->fib_prefsrc)
			hlist_del(&fi->fib_lhash);
		change_nexthops(fi) {
			if (!nexthop_nh->nh_dev)
				continue;
			hlist_del(&nexthop_nh->nh_hash);
		} endfor_nexthops(fi)
		fi->fib_dead = 1;
		fib_info_put(fi);
	}
	spin_unlock_bh(&fib_info_lock);
}

/* Compare the nexthop arrays of two fib_infos (callers guarantee equal
 * fib_nhs).  Returns 0 when equivalent; RTNH_F_DEAD is ignored since it
 * only reflects transient device state. */
static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
{
	const struct fib_nh *onh = ofi->fib_nh;

	for_nexthops(fi) {
		if (nh->nh_oif != onh->nh_oif ||
		    nh->nh_gw  != onh->nh_gw ||
		    nh->nh_scope != onh->nh_scope ||
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		    nh->nh_weight != onh->nh_weight ||
#endif
#ifdef CONFIG_IP_ROUTE_CLASSID
		    nh->nh_tclassid != onh->nh_tclassid ||
#endif
		    ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD))
			return -1;
		onh++;
	} endfor_nexthops(fi);
	return 0;
}

/* Fold a device ifindex into a fib_info_devhash bucket index. */
static inline unsigned int fib_devindex_hashfn(unsigned int val)
{
	unsigned int mask = DEVINDEX_HASHSIZE - 1;

	return (val ^
		(val >> DEVINDEX_HASHBITS) ^
		(val >> (DEVINDEX_HASHBITS * 2))) & mask;
}

/* Hash a fib_info by the fields that fib_find_info() compares, so that
 * equal-keyed records land in the same bucket. */
static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
{
	unsigned int mask = (fib_info_hash_size - 1);
	unsigned int val = fi->fib_nhs;

	val ^= (fi->fib_protocol << 8) | fi->fib_scope;
	val ^= (__force u32)fi->fib_prefsrc;
	val ^= fi->fib_priority;
	for_nexthops(fi) {
		val ^= fib_devindex_hashfn(nh->nh_oif);
	} endfor_nexthops(fi)

	return (val ^ (val >> 7) ^ (val >> 12)) & mask;
}

/* Look up an existing fib_info equivalent to @nfi so routes can share one
 * record.  Returns the match or NULL; RTNH_F_DEAD differences and dead
 * flags are ignored just as in nh_comp(). */
static struct fib_info *fib_find_info(const struct fib_info *nfi)
{
	struct hlist_head *head;
	struct hlist_node *node;
	struct fib_info *fi;
	unsigned int hash;

	hash = fib_info_hashfn(nfi);
	head = &fib_info_hash[hash];

	hlist_for_each_entry(fi, node, head, fib_hash) {
		if (!net_eq(fi->fib_net, nfi->fib_net))
			continue;
		if (fi->fib_nhs != nfi->fib_nhs)
			continue;
		if (nfi->fib_protocol == fi->fib_protocol &&
		    nfi->fib_scope == fi->fib_scope &&
		    nfi->fib_prefsrc == fi->fib_prefsrc &&
		    nfi->fib_priority == fi->fib_priority &&
		    memcmp(nfi->fib_metrics, fi->fib_metrics,
			   sizeof(u32) * RTAX_MAX) == 0 &&
		    ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 &&
		    (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
			return fi;
	}

	return NULL;
}

/* Check, that the gateway is already configured.
 * Used only by redirect accept routine.
266 */ 267 int ip_fib_check_default(__be32 gw, struct net_device *dev) 268 { 269 struct hlist_head *head; 270 struct hlist_node *node; 271 struct fib_nh *nh; 272 unsigned int hash; 273 274 spin_lock(&fib_info_lock); 275 276 hash = fib_devindex_hashfn(dev->ifindex); 277 head = &fib_info_devhash[hash]; 278 hlist_for_each_entry(nh, node, head, nh_hash) { 279 if (nh->nh_dev == dev && 280 nh->nh_gw == gw && 281 !(nh->nh_flags & RTNH_F_DEAD)) { 282 spin_unlock(&fib_info_lock); 283 return 0; 284 } 285 } 286 287 spin_unlock(&fib_info_lock); 288 289 return -1; 290 } 291 292 static inline size_t fib_nlmsg_size(struct fib_info *fi) 293 { 294 size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg)) 295 + nla_total_size(4) /* RTA_TABLE */ 296 + nla_total_size(4) /* RTA_DST */ 297 + nla_total_size(4) /* RTA_PRIORITY */ 298 + nla_total_size(4); /* RTA_PREFSRC */ 299 300 /* space for nested metrics */ 301 payload += nla_total_size((RTAX_MAX * nla_total_size(4))); 302 303 if (fi->fib_nhs) { 304 /* Also handles the special case fib_nhs == 1 */ 305 306 /* each nexthop is packed in an attribute */ 307 size_t nhsize = nla_total_size(sizeof(struct rtnexthop)); 308 309 /* may contain flow and gateway attribute */ 310 nhsize += 2 * nla_total_size(4); 311 312 /* all nexthops are packed in a nested attribute */ 313 payload += nla_total_size(fi->fib_nhs * nhsize); 314 } 315 316 return payload; 317 } 318 319 void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, 320 int dst_len, u32 tb_id, struct nl_info *info, 321 unsigned int nlm_flags) 322 { 323 struct sk_buff *skb; 324 u32 seq = info->nlh ? 
info->nlh->nlmsg_seq : 0; 325 int err = -ENOBUFS; 326 327 skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL); 328 if (skb == NULL) 329 goto errout; 330 331 err = fib_dump_info(skb, info->pid, seq, event, tb_id, 332 fa->fa_type, key, dst_len, 333 fa->fa_tos, fa->fa_info, nlm_flags); 334 if (err < 0) { 335 /* -EMSGSIZE implies BUG in fib_nlmsg_size() */ 336 WARN_ON(err == -EMSGSIZE); 337 kfree_skb(skb); 338 goto errout; 339 } 340 rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE, 341 info->nlh, GFP_KERNEL); 342 return; 343 errout: 344 if (err < 0) 345 rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err); 346 } 347 348 /* Return the first fib alias matching TOS with 349 * priority less than or equal to PRIO. 350 */ 351 struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio) 352 { 353 if (fah) { 354 struct fib_alias *fa; 355 list_for_each_entry(fa, fah, fa_list) { 356 if (fa->fa_tos > tos) 357 continue; 358 if (fa->fa_info->fib_priority >= prio || 359 fa->fa_tos < tos) 360 return fa; 361 } 362 } 363 return NULL; 364 } 365 366 int fib_detect_death(struct fib_info *fi, int order, 367 struct fib_info **last_resort, int *last_idx, int dflt) 368 { 369 struct neighbour *n; 370 int state = NUD_NONE; 371 372 n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev); 373 if (n) { 374 state = n->nud_state; 375 neigh_release(n); 376 } 377 if (state == NUD_REACHABLE) 378 return 0; 379 if ((state & NUD_VALID) && order != dflt) 380 return 0; 381 if ((state & NUD_VALID) || 382 (*last_idx < 0 && order > dflt)) { 383 *last_resort = fi; 384 *last_idx = order; 385 } 386 return 1; 387 } 388 389 #ifdef CONFIG_IP_ROUTE_MULTIPATH 390 391 static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining) 392 { 393 int nhs = 0; 394 395 while (rtnh_ok(rtnh, remaining)) { 396 nhs++; 397 rtnh = rtnh_next(rtnh, &remaining); 398 } 399 400 /* leftover implies invalid nexthop configuration, discard it */ 401 return remaining > 0 ? 
0 : nhs; 402 } 403 404 static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, 405 int remaining, struct fib_config *cfg) 406 { 407 change_nexthops(fi) { 408 int attrlen; 409 410 if (!rtnh_ok(rtnh, remaining)) 411 return -EINVAL; 412 413 nexthop_nh->nh_flags = 414 (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags; 415 nexthop_nh->nh_oif = rtnh->rtnh_ifindex; 416 nexthop_nh->nh_weight = rtnh->rtnh_hops + 1; 417 418 attrlen = rtnh_attrlen(rtnh); 419 if (attrlen > 0) { 420 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 421 422 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 423 nexthop_nh->nh_gw = nla ? nla_get_be32(nla) : 0; 424 #ifdef CONFIG_IP_ROUTE_CLASSID 425 nla = nla_find(attrs, attrlen, RTA_FLOW); 426 nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0; 427 #endif 428 } 429 430 rtnh = rtnh_next(rtnh, &remaining); 431 } endfor_nexthops(fi); 432 433 return 0; 434 } 435 436 #endif 437 438 int fib_nh_match(struct fib_config *cfg, struct fib_info *fi) 439 { 440 #ifdef CONFIG_IP_ROUTE_MULTIPATH 441 struct rtnexthop *rtnh; 442 int remaining; 443 #endif 444 445 if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority) 446 return 1; 447 448 if (cfg->fc_oif || cfg->fc_gw) { 449 if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) && 450 (!cfg->fc_gw || cfg->fc_gw == fi->fib_nh->nh_gw)) 451 return 0; 452 return 1; 453 } 454 455 #ifdef CONFIG_IP_ROUTE_MULTIPATH 456 if (cfg->fc_mp == NULL) 457 return 0; 458 459 rtnh = cfg->fc_mp; 460 remaining = cfg->fc_mp_len; 461 462 for_nexthops(fi) { 463 int attrlen; 464 465 if (!rtnh_ok(rtnh, remaining)) 466 return -EINVAL; 467 468 if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->nh_oif) 469 return 1; 470 471 attrlen = rtnh_attrlen(rtnh); 472 if (attrlen < 0) { 473 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 474 475 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 476 if (nla && nla_get_be32(nla) != nh->nh_gw) 477 return 1; 478 #ifdef CONFIG_IP_ROUTE_CLASSID 479 nla = nla_find(attrs, attrlen, RTA_FLOW); 480 if (nla && 
nla_get_u32(nla) != nh->nh_tclassid) 481 return 1; 482 #endif 483 } 484 485 rtnh = rtnh_next(rtnh, &remaining); 486 } endfor_nexthops(fi); 487 #endif 488 return 0; 489 } 490 491 492 /* 493 * Picture 494 * ------- 495 * 496 * Semantics of nexthop is very messy by historical reasons. 497 * We have to take into account, that: 498 * a) gateway can be actually local interface address, 499 * so that gatewayed route is direct. 500 * b) gateway must be on-link address, possibly 501 * described not by an ifaddr, but also by a direct route. 502 * c) If both gateway and interface are specified, they should not 503 * contradict. 504 * d) If we use tunnel routes, gateway could be not on-link. 505 * 506 * Attempt to reconcile all of these (alas, self-contradictory) conditions 507 * results in pretty ugly and hairy code with obscure logic. 508 * 509 * I chose to generalized it instead, so that the size 510 * of code does not increase practically, but it becomes 511 * much more general. 512 * Every prefix is assigned a "scope" value: "host" is local address, 513 * "link" is direct route, 514 * [ ... "site" ... "interior" ... ] 515 * and "universe" is true gateway route with global meaning. 516 * 517 * Every prefix refers to a set of "nexthop"s (gw, oif), 518 * where gw must have narrower scope. This recursion stops 519 * when gw has LOCAL scope or if "nexthop" is declared ONLINK, 520 * which means that gw is forced to be on link. 521 * 522 * Code is still hairy, but now it is apparently logically 523 * consistent and very flexible. F.e. as by-product it allows 524 * to co-exists in peace independent exterior and interior 525 * routing processes. 526 * 527 * Normally it looks as following. 
 *
 *	{universe prefix}  -> (gw, oif) [scope link]
 *		  |
 *		  |-> {link prefix} -> (gw, oif) [scope local]
 *					|
 *					|-> {local prefix} (terminal node)
 */
/* Validate and resolve one nexthop of a route being added: find its
 * device, take a reference on it, and derive nh_scope.  Gateway nexthops
 * are resolved via a recursive fib_lookup() at narrower scope, unless
 * ONLINK forces link scope. */
static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
			struct fib_nh *nh)
{
	int err;
	struct net *net;
	struct net_device *dev;

	net = cfg->fc_nlinfo.nl_net;
	if (nh->nh_gw) {
		struct fib_result res;

		if (nh->nh_flags & RTNH_F_ONLINK) {

			/* ONLINK only makes sense for gatewayed (wider than
			 * link scope) routes with a unicast gateway on an
			 * explicitly given, running device. */
			if (cfg->fc_scope >= RT_SCOPE_LINK)
				return -EINVAL;
			if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST)
				return -EINVAL;
			dev = __dev_get_by_index(net, nh->nh_oif);
			if (!dev)
				return -ENODEV;
			if (!(dev->flags & IFF_UP))
				return -ENETDOWN;
			nh->nh_dev = dev;
			dev_hold(dev);
			nh->nh_scope = RT_SCOPE_LINK;
			return 0;
		}
		rcu_read_lock();
		{
			/* Resolve the gateway with a lookup at strictly
			 * narrower scope, so the recursion terminates. */
			struct flowi4 fl4 = {
				.daddr = nh->nh_gw,
				.flowi4_scope = cfg->fc_scope + 1,
				.flowi4_oif = nh->nh_oif,
			};

			/* It is not necessary, but requires a bit of thinking */
			if (fl4.flowi4_scope < RT_SCOPE_LINK)
				fl4.flowi4_scope = RT_SCOPE_LINK;
			err = fib_lookup(net, &fl4, &res);
			if (err) {
				rcu_read_unlock();
				return err;
			}
		}
		err = -EINVAL;
		if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
			goto out;
		nh->nh_scope = res.scope;
		nh->nh_oif = FIB_RES_OIF(res);
		nh->nh_dev = dev = FIB_RES_DEV(res);
		if (!dev)
			goto out;
		dev_hold(dev);
		err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN;
	} else {
		/* Direct (gateway-less) route: just bind to the device. */
		struct in_device *in_dev;

		if (nh->nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK))
			return -EINVAL;

		rcu_read_lock();
		err = -ENODEV;
		in_dev = inetdev_by_index(net, nh->nh_oif);
		if (in_dev == NULL)
			goto out;
		err = -ENETDOWN;
		if (!(in_dev->dev->flags & IFF_UP))
			goto out;
		nh->nh_dev = in_dev->dev;
		dev_hold(nh->nh_dev);
		nh->nh_scope = RT_SCOPE_HOST;
		err = 0;
	}
out:
	rcu_read_unlock();
	return err;
}

/* Fold a preferred-source address into a fib_info_laddrhash bucket. */
static inline unsigned int fib_laddr_hashfn(__be32 val)
{
	unsigned int mask = (fib_info_hash_size - 1);

	return ((__force u32)val ^
		((__force u32)val >> 7) ^
		((__force u32)val >> 14)) & mask;
}

/* Allocate a hash table; small tables come from kmalloc, larger ones
 * straight from the page allocator. */
static struct hlist_head *fib_info_hash_alloc(int bytes)
{
	if (bytes <= PAGE_SIZE)
		return kzalloc(bytes, GFP_KERNEL);
	else
		return (struct hlist_head *)
			__get_free_pages(GFP_KERNEL | __GFP_ZERO,
					 get_order(bytes));
}

/* Free a table from fib_info_hash_alloc(); tolerates NULL. */
static void fib_info_hash_free(struct hlist_head *hash, int bytes)
{
	if (!hash)
		return;

	if (bytes <= PAGE_SIZE)
		kfree(hash);
	else
		free_pages((unsigned long) hash, get_order(bytes));
}

/* Rehash every fib_info into freshly allocated, larger tables and free
 * the old ones.  Runs under fib_info_lock so lookups never observe a
 * half-moved table. */
static void fib_info_hash_move(struct hlist_head *new_info_hash,
			       struct hlist_head *new_laddrhash,
			       unsigned int new_size)
{
	struct hlist_head *old_info_hash, *old_laddrhash;
	unsigned int old_size = fib_info_hash_size;
	unsigned int i, bytes;

	spin_lock_bh(&fib_info_lock);
	old_info_hash = fib_info_hash;
	old_laddrhash = fib_info_laddrhash;
	/* New size must be published before rehashing: fib_info_hashfn()
	 * and fib_laddr_hashfn() read it to compute the new buckets. */
	fib_info_hash_size = new_size;

	for (i = 0; i < old_size; i++) {
		struct hlist_head *head = &fib_info_hash[i];
		struct hlist_node *node, *n;
		struct fib_info *fi;

		hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
			struct hlist_head *dest;
			unsigned int new_hash;

			hlist_del(&fi->fib_hash);

			new_hash = fib_info_hashfn(fi);
			dest = &new_info_hash[new_hash];
			hlist_add_head(&fi->fib_hash, dest);
		}
	}
	fib_info_hash = new_info_hash;

	for (i = 0; i < old_size; i++) {
		struct hlist_head *lhead = &fib_info_laddrhash[i];
		struct hlist_node *node, *n;
		struct fib_info *fi;

		hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
			struct hlist_head *ldest;
			unsigned int new_hash;

			hlist_del(&fi->fib_lhash);

			new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
			ldest = &new_laddrhash[new_hash];
			hlist_add_head(&fi->fib_lhash, ldest);
		}
	}
	fib_info_laddrhash = new_laddrhash;

	spin_unlock_bh(&fib_info_lock);

	bytes = old_size * sizeof(struct hlist_head *);
	fib_info_hash_free(old_info_hash, bytes);
	fib_info_hash_free(old_laddrhash, bytes);
}

/* Recompute and cache the source address a nexthop should use, stamping
 * it with the per-netns address generation counter so stale values can
 * be detected. */
__be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh)
{
	nh->nh_saddr = inet_select_addr(nh->nh_dev,
					nh->nh_gw,
					nh->nh_parent->fib_scope);
	nh->nh_saddr_genid = atomic_read(&net->ipv4.dev_addr_genid);

	return nh->nh_saddr;
}

/* Build a fib_info from a netlink route request: validate type/scope,
 * grow the hash tables if needed, fill metrics and nexthops, resolve the
 * nexthops' devices, then either share an existing equivalent fib_info
 * or link the new one into all hash tables.  Returns the (possibly
 * shared) fib_info or an ERR_PTR. */
struct fib_info *fib_create_info(struct fib_config *cfg)
{
	int err;
	struct fib_info *fi = NULL;
	struct fib_info *ofi;
	int nhs = 1;
	struct net *net = cfg->fc_nlinfo.nl_net;

	if (cfg->fc_type > RTN_MAX)
		goto err_inval;

	/* Fast check to catch the most weird cases */
	if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
		goto err_inval;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (cfg->fc_mp) {
		nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len);
		if (nhs == 0)
			goto err_inval;
	}
#endif

	err = -ENOBUFS;
	/* Keep the load factor at or below 1 by doubling the tables. */
	if (fib_info_cnt >= fib_info_hash_size) {
		unsigned int new_size = fib_info_hash_size << 1;
		struct hlist_head *new_info_hash;
		struct hlist_head *new_laddrhash;
		unsigned int bytes;

		if (!new_size)
			new_size = 1;
		bytes = new_size * sizeof(struct hlist_head *);
		new_info_hash = fib_info_hash_alloc(bytes);
		new_laddrhash = fib_info_hash_alloc(bytes);
		if (!new_info_hash || !new_laddrhash) {
			fib_info_hash_free(new_info_hash, bytes);
			fib_info_hash_free(new_laddrhash, bytes);
		} else
			fib_info_hash_move(new_info_hash, new_laddrhash, new_size);

		/* Still zero => the very first allocation failed. */
		if (!fib_info_hash_size)
			goto failure;
	}

	/* fib_nh array is allocated inline, after the fib_info proper. */
	fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
	if (fi == NULL)
		goto failure;
	if (cfg->fc_mx) {
		fi->fib_metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
		if (!fi->fib_metrics)
			goto failure;
	} else
		/* No private metrics: share the global default array
		 * (never freed, see free_fib_info_rcu()). */
		fi->fib_metrics = (u32 *) dst_default_metrics;
	fib_info_cnt++;

	fi->fib_net = hold_net(net);
	fi->fib_protocol = cfg->fc_protocol;
	fi->fib_scope = cfg->fc_scope;
	fi->fib_flags = cfg->fc_flags;
	fi->fib_priority = cfg->fc_priority;
	fi->fib_prefsrc = cfg->fc_prefsrc;

	fi->fib_nhs = nhs;
	change_nexthops(fi) {
		nexthop_nh->nh_parent = fi;
	} endfor_nexthops(fi)

	if (cfg->fc_mx) {
		struct nlattr *nla;
		int remaining;

		nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
			int type = nla_type(nla);

			if (type) {
				if (type > RTAX_MAX)
					goto err_inval;
				fi->fib_metrics[type - 1] = nla_get_u32(nla);
			}
		}
	}

	if (cfg->fc_mp) {
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
		if (err != 0)
			goto failure;
		/* Top-level oif/gw/flow, if given, must agree with the
		 * first nexthop of the multipath spec. */
		if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif)
			goto err_inval;
		if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
			goto err_inval;
#ifdef CONFIG_IP_ROUTE_CLASSID
		if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
			goto err_inval;
#endif
#else
		goto err_inval;
#endif
	} else {
		struct fib_nh *nh = fi->fib_nh;

		nh->nh_oif = cfg->fc_oif;
		nh->nh_gw = cfg->fc_gw;
		nh->nh_flags = cfg->fc_flags;
#ifdef CONFIG_IP_ROUTE_CLASSID
		nh->nh_tclassid = cfg->fc_flow;
#endif
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		nh->nh_weight = 1;
#endif
	}

	if (fib_props[cfg->fc_type].error) {
		/* Reject-type routes (blackhole, unreachable, ...) carry
		 * no nexthop information at all. */
		if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
			goto err_inval;
		goto link_it;
	} else {
		switch (cfg->fc_type) {
		case RTN_UNICAST:
		case RTN_LOCAL:
		case RTN_BROADCAST:
		case RTN_ANYCAST:
		case RTN_MULTICAST:
			break;
		default:
			goto err_inval;
		}
	}

	if (cfg->fc_scope > RT_SCOPE_HOST)
		goto err_inval;

	if (cfg->fc_scope == RT_SCOPE_HOST) {
		struct fib_nh *nh = fi->fib_nh;

		/* Local address is added. */
		if (nhs != 1 || nh->nh_gw)
			goto err_inval;
		nh->nh_scope = RT_SCOPE_NOWHERE;
		nh->nh_dev = dev_get_by_index(net, fi->fib_nh->nh_oif);
		err = -ENODEV;
		if (nh->nh_dev == NULL)
			goto failure;
	} else {
		change_nexthops(fi) {
			err = fib_check_nh(cfg, fi, nexthop_nh);
			if (err != 0)
				goto failure;
		} endfor_nexthops(fi)
	}

	if (fi->fib_prefsrc) {
		/* A prefsrc must be a local address, except when it is the
		 * destination of the local route being added. */
		if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
		    fi->fib_prefsrc != cfg->fc_dst)
			if (inet_addr_type(net, fi->fib_prefsrc) != RTN_LOCAL)
				goto err_inval;
	}

	change_nexthops(fi) {
		fib_info_update_nh_saddr(net, nexthop_nh);
	} endfor_nexthops(fi)

link_it:
	/* Share an already-existing equivalent fib_info if there is one. */
	ofi = fib_find_info(fi);
	if (ofi) {
		fi->fib_dead = 1;
		free_fib_info(fi);
		ofi->fib_treeref++;
		return ofi;
	}

	fi->fib_treeref++;
	atomic_inc(&fi->fib_clntref);
	spin_lock_bh(&fib_info_lock);
	hlist_add_head(&fi->fib_hash,
		       &fib_info_hash[fib_info_hashfn(fi)]);
	if (fi->fib_prefsrc) {
		struct hlist_head *head;

		head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
		hlist_add_head(&fi->fib_lhash, head);
	}
	change_nexthops(fi) {
		struct hlist_head *head;
		unsigned int hash;

		if (!nexthop_nh->nh_dev)
			continue;
		hash = fib_devindex_hashfn(nexthop_nh->nh_dev->ifindex);
		head = &fib_info_devhash[hash];
		hlist_add_head(&nexthop_nh->nh_hash, head);
	} endfor_nexthops(fi)
	spin_unlock_bh(&fib_info_lock);
	return fi;

err_inval:
	err = -EINVAL;

failure:
	if (fi) {
		fi->fib_dead = 1;
		free_fib_info(fi);
	}

	return ERR_PTR(err);
}

/* Serialize one route (alias + fib_info) into a netlink skb as an rtmsg
 * plus attributes.  Returns the nlmsg_end() result, or -EMSGSIZE if the
 * skb ran out of room (message cancelled). */
int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
		  u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos,
		  struct fib_info *fi, unsigned int flags)
{
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET;
	rtm->rtm_dst_len = dst_len;
	rtm->rtm_src_len = 0;
	rtm->rtm_tos = tos;
	/* rtm_table is 8 bits wide; larger table ids go in RTA_TABLE. */
	if (tb_id < 256)
		rtm->rtm_table = tb_id;
	else
		rtm->rtm_table = RT_TABLE_COMPAT;
	NLA_PUT_U32(skb, RTA_TABLE, tb_id);
	rtm->rtm_type = type;
	rtm->rtm_flags = fi->fib_flags;
	rtm->rtm_scope = fi->fib_scope;
	rtm->rtm_protocol = fi->fib_protocol;

	if (rtm->rtm_dst_len)
		NLA_PUT_BE32(skb, RTA_DST, dst);

	if (fi->fib_priority)
		NLA_PUT_U32(skb, RTA_PRIORITY, fi->fib_priority);

	if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
		goto nla_put_failure;

	if (fi->fib_prefsrc)
		NLA_PUT_BE32(skb, RTA_PREFSRC, fi->fib_prefsrc);

	if (fi->fib_nhs == 1) {
		/* Single nexthop: flat attributes, no RTA_MULTIPATH nest. */
		if (fi->fib_nh->nh_gw)
			NLA_PUT_BE32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw);

		if (fi->fib_nh->nh_oif)
			NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif);
#ifdef CONFIG_IP_ROUTE_CLASSID
		if (fi->fib_nh[0].nh_tclassid)
			NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid);
#endif
	}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (fi->fib_nhs > 1) {
		struct rtnexthop *rtnh;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (mp == NULL)
			goto nla_put_failure;

		for_nexthops(fi) {
			rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
			if (rtnh == NULL)
				goto nla_put_failure;

			rtnh->rtnh_flags = nh->nh_flags & 0xFF;
			rtnh->rtnh_hops = nh->nh_weight - 1;
			rtnh->rtnh_ifindex = nh->nh_oif;

			if (nh->nh_gw)
				NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw);
#ifdef CONFIG_IP_ROUTE_CLASSID
			if (nh->nh_tclassid)
				NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid);
#endif
			/* length of rtnetlink header + attributes */
			rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
		} endfor_nexthops(fi);

		nla_nest_end(skb, mp);
	}
#endif
	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

/*
 * Update FIB if:
 * - local address disappeared -> we must delete all the entries
 *   referring to it.
 * - device went down -> we must shutdown all nexthops going via it.
 */
/* Mark dead every fib_info in @net whose prefsrc is @local; returns the
 * number of entries marked. */
int fib_sync_down_addr(struct net *net, __be32 local)
{
	int ret = 0;
	/* NOTE(review): head is computed from fib_info_laddrhash before
	 * the NULL check below; only pointer arithmetic, never
	 * dereferenced when NULL, but checking first would be cleaner —
	 * confirm before touching. */
	unsigned int hash = fib_laddr_hashfn(local);
	struct hlist_head *head = &fib_info_laddrhash[hash];
	struct hlist_node *node;
	struct fib_info *fi;

	if (fib_info_laddrhash == NULL || local == 0)
		return 0;

	hlist_for_each_entry(fi, node, head, fib_lhash) {
		if (!net_eq(fi->fib_net, net))
			continue;
		if (fi->fib_prefsrc == local) {
			fi->fib_flags |= RTNH_F_DEAD;
			ret++;
		}
	}
	return ret;
}

/* Device went down: mark all nexthops via @dev dead, and fib_infos with
 * no live nexthop left entirely dead.  @force == 0 spares host-scope
 * (local address) nexthops; @force > 1 kills unconditionally (device is
 * being unregistered).  Returns the number of fib_infos killed. */
int fib_sync_down_dev(struct net_device *dev, int force)
{
	int ret = 0;
	int scope = RT_SCOPE_NOWHERE;
	struct fib_info *prev_fi = NULL;
	unsigned int hash = fib_devindex_hashfn(dev->ifindex);
	struct hlist_head *head = &fib_info_devhash[hash];
	struct hlist_node *node;
	struct fib_nh *nh;

	if (force)
		scope = -1;

	hlist_for_each_entry(nh, node, head, nh_hash) {
		struct fib_info *fi = nh->nh_parent;
		int dead;

		BUG_ON(!fi->fib_nhs);
		/* A fib_info can appear once per nexthop in this chain;
		 * prev_fi skips the duplicates (entries are adjacent). */
		if (nh->nh_dev != dev || fi == prev_fi)
			continue;
		prev_fi = fi;
		dead = 0;
		change_nexthops(fi) {
			if (nexthop_nh->nh_flags & RTNH_F_DEAD)
				dead++;
			else if (nexthop_nh->nh_dev == dev &&
				 nexthop_nh->nh_scope != scope) {
				nexthop_nh->nh_flags |= RTNH_F_DEAD;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
				/* Withdraw this nexthop's share from the
				 * multipath distribution budget. */
				spin_lock_bh(&fib_multipath_lock);
				fi->fib_power -= nexthop_nh->nh_power;
				nexthop_nh->nh_power = 0;
				spin_unlock_bh(&fib_multipath_lock);
#endif
				dead++;
			}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
			if (force > 1 && nexthop_nh->nh_dev == dev) {
				dead = fi->fib_nhs;
				break;
			}
#endif
		} endfor_nexthops(fi)
		if (dead == fi->fib_nhs) {
			fi->fib_flags |= RTNH_F_DEAD;
			ret++;
		}
	}

	return ret;
}

/* Must be invoked inside of an RCU protected region.  */
/* Pick the best default route among the aliases of @res (same scope,
 * unicast, link-scope gateway), preferring ones whose gateway's ARP
 * entry looks alive; falls back to a "last resort" candidate and records
 * the choice in tb->tb_default. */
void fib_select_default(struct fib_result *res)
{
	struct fib_info *fi = NULL, *last_resort = NULL;
	struct list_head *fa_head = res->fa_head;
	struct fib_table *tb = res->table;
	int order = -1, last_idx = -1;
	struct fib_alias *fa;

	list_for_each_entry_rcu(fa, fa_head, fa_list) {
		struct fib_info *next_fi = fa->fa_info;

		if (next_fi->fib_scope != res->scope ||
		    fa->fa_type != RTN_UNICAST)
			continue;

		/* Aliases are priority-sorted; worse metrics end the scan. */
		if (next_fi->fib_priority > res->fi->fib_priority)
			break;
		if (!next_fi->fib_nh[0].nh_gw ||
		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
			continue;

		fib_alias_accessed(fa);

		if (fi == NULL) {
			if (next_fi != res->fi)
				break;
		} else if (!fib_detect_death(fi, order, &last_resort,
					     &last_idx, tb->tb_default)) {
			fib_result_assign(res, fi);
			tb->tb_default = order;
			goto out;
		}
		fi = next_fi;
		order++;
	}

	if (order <= 0 || fi == NULL) {
		tb->tb_default = -1;
		goto out;
	}

	if (!fib_detect_death(fi, order, &last_resort, &last_idx,
			      tb->tb_default)) {
		fib_result_assign(res, fi);
		tb->tb_default = order;
		goto out;
	}

	if (last_idx >= 0)
		fib_result_assign(res, last_resort);
	tb->tb_default = last_idx;
out:
	return;
}

#ifdef CONFIG_IP_ROUTE_MULTIPATH

/*
 * Dead device goes up. We wake up dead nexthops.
 * It takes sense only on multipath routes.
 */
/* Returns the number of fib_infos revived. */
int fib_sync_up(struct net_device *dev)
{
	struct fib_info *prev_fi;
	unsigned int hash;
	struct hlist_head *head;
	struct hlist_node *node;
	struct fib_nh *nh;
	int ret;

	if (!(dev->flags & IFF_UP))
		return 0;

	prev_fi = NULL;
	hash = fib_devindex_hashfn(dev->ifindex);
	head = &fib_info_devhash[hash];
	ret = 0;

	hlist_for_each_entry(nh, node, head, nh_hash) {
		struct fib_info *fi = nh->nh_parent;
		int alive;

		BUG_ON(!fi->fib_nhs);
		/* Skip duplicate chain entries of the same fib_info. */
		if (nh->nh_dev != dev || fi == prev_fi)
			continue;

		prev_fi = fi;
		alive = 0;
		change_nexthops(fi) {
			if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
				alive++;
				continue;
			}
			if (nexthop_nh->nh_dev == NULL ||
			    !(nexthop_nh->nh_dev->flags & IFF_UP))
				continue;
			/* Only revive via @dev, and only if it still has
			 * an IPv4 configuration. */
			if (nexthop_nh->nh_dev != dev ||
			    !__in_dev_get_rtnl(dev))
				continue;
			alive++;
			spin_lock_bh(&fib_multipath_lock);
			nexthop_nh->nh_power = 0;
			nexthop_nh->nh_flags &= ~RTNH_F_DEAD;
			spin_unlock_bh(&fib_multipath_lock);
		} endfor_nexthops(fi)

		if (alive > 0) {
			fi->fib_flags &= ~RTNH_F_DEAD;
			ret++;
		}
	}

	return ret;
}

/*
 * The algorithm is suboptimal, but it provides really
 * fair weighted route distribution.
1203 */ 1204 void fib_select_multipath(struct fib_result *res) 1205 { 1206 struct fib_info *fi = res->fi; 1207 int w; 1208 1209 spin_lock_bh(&fib_multipath_lock); 1210 if (fi->fib_power <= 0) { 1211 int power = 0; 1212 change_nexthops(fi) { 1213 if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) { 1214 power += nexthop_nh->nh_weight; 1215 nexthop_nh->nh_power = nexthop_nh->nh_weight; 1216 } 1217 } endfor_nexthops(fi); 1218 fi->fib_power = power; 1219 if (power <= 0) { 1220 spin_unlock_bh(&fib_multipath_lock); 1221 /* Race condition: route has just become dead. */ 1222 res->nh_sel = 0; 1223 return; 1224 } 1225 } 1226 1227 1228 /* w should be random number [0..fi->fib_power-1], 1229 * it is pretty bad approximation. 1230 */ 1231 1232 w = jiffies % fi->fib_power; 1233 1234 change_nexthops(fi) { 1235 if (!(nexthop_nh->nh_flags & RTNH_F_DEAD) && 1236 nexthop_nh->nh_power) { 1237 w -= nexthop_nh->nh_power; 1238 if (w <= 0) { 1239 nexthop_nh->nh_power--; 1240 fi->fib_power--; 1241 res->nh_sel = nhsel; 1242 spin_unlock_bh(&fib_multipath_lock); 1243 return; 1244 } 1245 } 1246 } endfor_nexthops(fi); 1247 1248 /* Race condition: route has just become dead. */ 1249 res->nh_sel = 0; 1250 spin_unlock_bh(&fib_multipath_lock); 1251 } 1252 #endif 1253