1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * INET An implementation of the TCP/IP protocol suite for the LINUX 4 * operating system. INET is implemented using the BSD Socket 5 * interface as the means of communication with the user level. 6 * 7 * IPv4 Forwarding Information Base: semantics. 8 * 9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> 10 */ 11 12 #include <linux/uaccess.h> 13 #include <linux/bitops.h> 14 #include <linux/types.h> 15 #include <linux/kernel.h> 16 #include <linux/jiffies.h> 17 #include <linux/mm.h> 18 #include <linux/string.h> 19 #include <linux/socket.h> 20 #include <linux/sockios.h> 21 #include <linux/errno.h> 22 #include <linux/in.h> 23 #include <linux/inet.h> 24 #include <linux/inetdevice.h> 25 #include <linux/netdevice.h> 26 #include <linux/if_arp.h> 27 #include <linux/proc_fs.h> 28 #include <linux/skbuff.h> 29 #include <linux/init.h> 30 #include <linux/slab.h> 31 #include <linux/netlink.h> 32 #include <linux/hash.h> 33 #include <linux/nospec.h> 34 35 #include <net/arp.h> 36 #include <net/inet_dscp.h> 37 #include <net/ip.h> 38 #include <net/protocol.h> 39 #include <net/route.h> 40 #include <net/tcp.h> 41 #include <net/sock.h> 42 #include <net/ip_fib.h> 43 #include <net/ip6_fib.h> 44 #include <net/nexthop.h> 45 #include <net/netlink.h> 46 #include <net/rtnh.h> 47 #include <net/lwtunnel.h> 48 #include <net/fib_notifier.h> 49 #include <net/addrconf.h> 50 51 #include "fib_lookup.h" 52 53 /* for_nexthops and change_nexthops only used when nexthop object 54 * is not set in a fib_info. The logic within can reference fib_nh. 
*/
#ifdef CONFIG_IP_ROUTE_MULTIPATH

/* Opens a brace scope and a for-loop over the fib_nh array embedded in
 * @fi. The loop body sees a read-only cursor "nh" and its index "nhsel";
 * fib_info_num_path() bounds the walk. Must be closed with
 * endfor_nexthops().
 */
#define for_nexthops(fi) {						\
	int nhsel; const struct fib_nh *nh;				\
	for (nhsel = 0, nh = (fi)->fib_nh;				\
	     nhsel < fib_info_num_path((fi));				\
	     nh++, nhsel++)

/* Same walk as for_nexthops(), but the cursor is named "nexthop_nh" and
 * is not const, so the body may modify the nexthop.
 */
#define change_nexthops(fi) {						\
	int nhsel; struct fib_nh *nexthop_nh;				\
	for (nhsel = 0,	nexthop_nh = (struct fib_nh *)((fi)->fib_nh);	\
	     nhsel < fib_info_num_path((fi));				\
	     nexthop_nh++, nhsel++)

#else /* CONFIG_IP_ROUTE_MULTIPATH */

/* Hope, that gcc will optimize it to get rid of dummy loop */

/* Single-path variant: the loop body runs exactly once, on (fi)->fib_nh. */
#define for_nexthops(fi) {						\
	int nhsel; const struct fib_nh *nh = (fi)->fib_nh;		\
	for (nhsel = 0; nhsel < 1; nhsel++)

/* Single-path variant of the writable walk. */
#define change_nexthops(fi) {						\
	int nhsel;							\
	struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh);	\
	for (nhsel = 0; nhsel < 1; nhsel++)

#endif /* CONFIG_IP_ROUTE_MULTIPATH */

/* Closes the brace scope opened by for_nexthops()/change_nexthops(). */
#define endfor_nexthops(fi) }


/* Per-route-type properties, indexed by RTN_* type: an error code and a
 * scope. fib_create_info() rejects a configuration whose fc_scope is
 * numerically smaller than the scope listed here for its type.
 */
const struct fib_prop fib_props[RTN_MAX + 1] = {
	[RTN_UNSPEC] = {
		.error	= 0,
		.scope	= RT_SCOPE_NOWHERE,
	},
	[RTN_UNICAST] = {
		.error	= 0,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_LOCAL] = {
		.error	= 0,
		.scope	= RT_SCOPE_HOST,
	},
	[RTN_BROADCAST] = {
		.error	= 0,
		.scope	= RT_SCOPE_LINK,
	},
	[RTN_ANYCAST] = {
		.error	= 0,
		.scope	= RT_SCOPE_LINK,
	},
	[RTN_MULTICAST] = {
		.error	= 0,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_BLACKHOLE] = {
		.error	= -EINVAL,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_UNREACHABLE] = {
		.error	= -EHOSTUNREACH,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_PROHIBIT] = {
		.error	= -EACCES,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_THROW] = {
		.error	= -EAGAIN,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_NAT] = {
		.error	= -EINVAL,
		.scope	= RT_SCOPE_NOWHERE,
	},
	[RTN_XRESOLVE] = {
		.error	= -EINVAL,
		.scope	= RT_SCOPE_NOWHERE,
	},
};

/* Release the rtable stored in *rtp, if there is one. The slot itself is
 * deliberately not cleared (see the comment in the body).
 */
static void rt_fibinfo_free(struct rtable __rcu **rtp)
{
	struct rtable *rt = rcu_dereference_protected(*rtp, 1);

	if (!rt)
		return;

	/* Not even needed : RCU_INIT_POINTER(*rtp, NULL);
	 * because we waited an RCU grace period before calling
	 * free_fib_info_rcu()
	 */

	dst_dev_put(&rt->dst);
	dst_release_immediate(&rt->dst);
}

/* Free the nexthop exception table of @nhc: every entry on each of the
 * FNHE_HASH_SIZE chains (together with its input and output rtable),
 * then the bucket array itself. A missing table is a no-op.
 */
static void free_nh_exceptions(struct fib_nh_common *nhc)
{
	struct fnhe_hash_bucket *hash;
	int i;

	hash = rcu_dereference_protected(nhc->nhc_exceptions, 1);
	if (!hash)
		return;
	for (i = 0; i < FNHE_HASH_SIZE; i++) {
		struct fib_nh_exception *fnhe;

		fnhe = rcu_dereference_protected(hash[i].chain, 1);
		while (fnhe) {
			struct fib_nh_exception *next;

			/* fetch the successor before fnhe is freed */
			next = rcu_dereference_protected(fnhe->fnhe_next, 1);

			rt_fibinfo_free(&fnhe->fnhe_rth_input);
			rt_fibinfo_free(&fnhe->fnhe_rth_output);

			kfree(fnhe);

			fnhe = next;
		}
	}
	kfree(hash);
}

/* Release the rtable held in the per-CPU slot of every possible CPU and
 * then free the per-CPU area. A NULL @rtp is a no-op.
 */
static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp)
{
	int cpu;

	if (!rtp)
		return;

	for_each_possible_cpu(cpu) {
		struct rtable *rt;

		rt = rcu_dereference_protected(*per_cpu_ptr(rtp, cpu), 1);
		if (rt) {
			dst_dev_put(&rt->dst);
			dst_release_immediate(&rt->dst);
		}
	}
	free_percpu(rtp);
}

/* Release what a fib_nh_common holds: the device reference, the lwtunnel
 * state, the per-CPU output routes, the input route and the exception
 * table.
 */
void fib_nh_common_release(struct fib_nh_common *nhc)
{
	netdev_put(nhc->nhc_dev, &nhc->nhc_dev_tracker);
	lwtstate_put(nhc->nhc_lwtstate);
	rt_fibinfo_free_cpus(nhc->nhc_pcpu_rth_output);
	rt_fibinfo_free(&nhc->nhc_rth_input);
	free_nh_exceptions(nhc);
}

/* Release an IPv4 nexthop. With CONFIG_IP_ROUTE_CLASSID, a nexthop that
 * carries a class id also drops the per-netns fib_num_tclassid_users
 * count that fib_nh_init() raised.
 */
void fib_nh_release(struct net *net, struct fib_nh *fib_nh)
{
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (fib_nh->nh_tclassid)
		atomic_dec(&net->ipv4.fib_num_tclassid_users);
#endif
	fib_nh_common_release(&fib_nh->nh_common);
}

/* Release a nexthop info record */
/* RCU callback queued by free_fib_info(): drops either the nexthop
 * object reference or every embedded fib_nh, then the metrics, then the
 * fib_info itself.
 */
static void free_fib_info_rcu(struct rcu_head *head)
{
	struct fib_info *fi = container_of(head, struct fib_info, rcu);

	if (fi->nh) {
		nexthop_put(fi->nh);
	} else {
		change_nexthops(fi) {
			fib_nh_release(fi->fib_net, nexthop_nh);
		} endfor_nexthops(fi);
	}

	ip_fib_metrics_put(fi->fib_metrics);

	kfree(fi);
}

/* Queue @fi for freeing via RCU. A fib_info whose fib_dead is still 0 is
 * not freed: a warning is printed and the function returns.
 */
void free_fib_info(struct fib_info *fi)
{
	if (fi->fib_dead == 0) {
		pr_warn("Freeing alive fib_info %p\n", fi);
		return;
	}

	call_rcu_hurry(&fi->rcu, free_fib_info_rcu);
}
EXPORT_SYMBOL_GPL(free_fib_info);

/* Drop one fib_treeref on @fi (NULL is accepted). When the count reaches
 * zero the fib_info is unlinked from the info hash, from the prefsrc
 * hash if it has a prefsrc, and from the nexthop object list or the
 * per-device nexthop lists; it is then marked dead and put.
 * RTNL must be held (ASSERT_RTNL).
 */
void fib_release_info(struct fib_info *fi)
{
	ASSERT_RTNL();
	if (fi && refcount_dec_and_test(&fi->fib_treeref)) {
		hlist_del(&fi->fib_hash);
		fi->fib_net->ipv4.fib_info_cnt--;

		if (fi->fib_prefsrc)
			hlist_del(&fi->fib_lhash);
		if (fi->nh) {
			list_del(&fi->nh_list);
		} else {
			change_nexthops(fi) {
				/* nexthops without a device are skipped */
				if (!nexthop_nh->fib_nh_dev)
					continue;
				hlist_del_rcu(&nexthop_nh->nh_hash);
			} endfor_nexthops(fi)
		}
		/* Paired with READ_ONCE() from fib_table_lookup() */
		WRITE_ONCE(fi->fib_dead, 1);
		fib_info_put(fi);
	}
}

/* Compare the nexthops of @fi and @ofi.
 * Returns 0 when they are considered equal and -1 otherwise.
 * If either side uses a nexthop object, the result comes from
 * nexthop_cmp() alone. If @ofi has no embedded nexthops, 0 is returned
 * without further comparison. Otherwise the nexthops of @fi are compared
 * pairwise, by index, with those of @ofi; the only caller visible here,
 * fib_find_info(), has already checked that fib_nhs is equal.
 */
static inline int nh_comp(struct fib_info *fi, struct fib_info *ofi)
{
	const struct fib_nh *onh;

	if (fi->nh || ofi->nh)
		return nexthop_cmp(fi->nh, ofi->nh) ? 0 : -1;

	if (ofi->fib_nhs == 0)
		return 0;

	for_nexthops(fi) {
		onh = fib_info_nh(ofi, nhsel);

		/* flag bits inside RTNH_COMPARE_MASK are ignored */
		if (nh->fib_nh_oif != onh->fib_nh_oif ||
		    nh->fib_nh_gw_family != onh->fib_nh_gw_family ||
		    nh->fib_nh_scope != onh->fib_nh_scope ||
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		    nh->fib_nh_weight != onh->fib_nh_weight ||
#endif
#ifdef CONFIG_IP_ROUTE_CLASSID
		    nh->nh_tclassid != onh->nh_tclassid ||
#endif
		    lwtunnel_cmp_encap(nh->fib_nh_lws, onh->fib_nh_lws) ||
		    ((nh->fib_nh_flags ^ onh->fib_nh_flags) & ~RTNH_COMPARE_MASK))
			return -1;

		/* families already match here; compare the address itself */
		if (nh->fib_nh_gw_family == AF_INET &&
		    nh->fib_nh_gw4 != onh->fib_nh_gw4)
			return -1;

		if (nh->fib_nh_gw_family == AF_INET6 &&
		    ipv6_addr_cmp(&nh->fib_nh_gw6, &onh->fib_nh_gw6))
			return -1;
	} endfor_nexthops(fi);
	return 0;
}

/* Head of the per-device list that links fib_nh entries via nh_hash. */
static struct hlist_head *fib_nh_head(struct net_device *dev)
{
	return &dev->fib_nh_head;
}

/* First hash stage: XOR-fold protocol, scope, prefsrc and priority into
 * @init_val. Callers in this file seed it with fib_nhs or a nexthop id.
 */
static unsigned int fib_info_hashfn_1(int init_val, u8 protocol, u8 scope,
				      u32 prefsrc, u32 priority)
{
	unsigned int val = init_val;

	val ^= (protocol << 8) | scope;
	val ^= prefsrc;
	val ^= priority;

	return val;
}

/* Final hash stage: mix in the per-netns value and reduce the result to
 * fib_info_hash_bits bits.
 */
static unsigned int fib_info_hashfn_result(const struct net *net,
					   unsigned int val)
{
	return hash_32(val ^ net_hash_mix(net), net->ipv4.fib_info_hash_bits);
}

/* Bucket of the per-netns fib_info hash that @fi belongs to. The value
 * hashed additionally folds in the nexthop object id or, for embedded
 * nexthops, the oif of every nexthop.
 */
static struct hlist_head *fib_info_hash_bucket(struct fib_info *fi)
{
	struct net *net = fi->fib_net;
	unsigned int val;

	val = fib_info_hashfn_1(fi->fib_nhs, fi->fib_protocol,
				fi->fib_scope, (__force u32)fi->fib_prefsrc,
				fi->fib_priority);

	if (fi->nh) {
		val ^= fi->nh->id;
	} else {
		for_nexthops(fi) {
			val ^= nh->fib_nh_oif;
		} endfor_nexthops(fi)
	}

	return &net->ipv4.fib_info_hash[fib_info_hashfn_result(net, val)];
}

/* Bucket for preferred-source address @val. These buckets occupy the
 * second half of the fib_info_hash array, hence the (1 << hash_bits)
 * offset.
 */
static struct hlist_head *fib_info_laddrhash_bucket(const struct net *net,
						    __be32 val)
{
	unsigned int hash_bits = net->ipv4.fib_info_hash_bits;
	u32 slot;

	slot = hash_32(net_hash_mix(net) ^ (__force u32)val, hash_bits);

	return &net->ipv4.fib_info_hash[(1 << hash_bits) + slot];
}

/* Allocate a zeroed table of 2 * (1 << @hash_bits) list heads. */
static struct hlist_head *fib_info_hash_alloc(unsigned int hash_bits)
{
	/* The second half is used for prefsrc */
	return kvzalloc_objs(struct hlist_head, (1 << hash_bits) * 2);
}

/* Free a table obtained from fib_info_hash_alloc(). */
static void fib_info_hash_free(struct hlist_head *head)
{
	kvfree(head);
}

/* Grow the per-netns fib_info hash by one bit once fib_info_cnt has
 * reached the current bucket count. Every entry of both halves of the
 * old table is relinked into the new one before the old table is freed.
 * If the new table cannot be allocated, the old one stays in use.
 */
static void fib_info_hash_grow(struct net *net)
{
	unsigned int old_size = 1 << net->ipv4.fib_info_hash_bits;
	struct hlist_head *new_info_hash, *old_info_hash;
	unsigned int i;

	if (net->ipv4.fib_info_cnt < old_size)
		return;

	new_info_hash = fib_info_hash_alloc(net->ipv4.fib_info_hash_bits + 1);
	if (!new_info_hash)
		return;

	/* Install the new table first: the bucket helpers used below read
	 * fib_info_hash and fib_info_hash_bits from @net.
	 */
	old_info_hash = net->ipv4.fib_info_hash;
	net->ipv4.fib_info_hash = new_info_hash;
	net->ipv4.fib_info_hash_bits += 1;

	/* first half: entries linked through fib_hash */
	for (i = 0; i < old_size; i++) {
		struct hlist_head *head = &old_info_hash[i];
		struct hlist_node *n;
		struct fib_info *fi;

		hlist_for_each_entry_safe(fi, n, head, fib_hash)
			hlist_add_head(&fi->fib_hash, fib_info_hash_bucket(fi));
	}

	/* second half: entries linked through fib_lhash (prefsrc) */
	for (i = 0; i < old_size; i++) {
		struct hlist_head *lhead = &old_info_hash[old_size + i];
		struct hlist_node *n;
		struct fib_info *fi;

		hlist_for_each_entry_safe(fi, n, lhead, fib_lhash)
			hlist_add_head(&fi->fib_lhash,
				       fib_info_laddrhash_bucket(fi->fib_net,
								 fi->fib_prefsrc));
	}

	fib_info_hash_free(old_info_hash);
}

/* no metrics, only nexthop id */
/* Look up an existing fib_info that uses nexthop object cfg->fc_nh_id
 * and matches @cfg in protocol, scope, prefsrc, priority, type, table
 * and flags (bits inside RTNH_COMPARE_MASK ignored).
 * Returns the entry or NULL; no reference is taken here.
 */
static struct fib_info *fib_find_info_nh(struct net *net,
					 const struct fib_config *cfg)
{
	struct hlist_head *head;
	struct fib_info *fi;
	unsigned int hash;

	hash = fib_info_hashfn_1(cfg->fc_nh_id,
				 cfg->fc_protocol, cfg->fc_scope,
				 (__force u32)cfg->fc_prefsrc,
				 cfg->fc_priority);
	hash = fib_info_hashfn_result(net, hash);
	head = &net->ipv4.fib_info_hash[hash];

	hlist_for_each_entry(fi, head, fib_hash) {
		if (!fi->nh || fi->nh->id != cfg->fc_nh_id)
			continue;

		if (cfg->fc_protocol == fi->fib_protocol &&
		    cfg->fc_scope == fi->fib_scope &&
		    cfg->fc_prefsrc == fi->fib_prefsrc &&
		    cfg->fc_priority == fi->fib_priority &&
		    cfg->fc_type == fi->fib_type &&
		    cfg->fc_table == fi->fib_tb_id &&
		    !((cfg->fc_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK))
			return fi;
	}

	return NULL;
}

/* Look up an already hashed fib_info equivalent to @nfi: same nexthop
 * count, protocol, scope, prefsrc, priority, type, table, metrics
 * (compared bytewise), flags (bits inside RTNH_COMPARE_MASK ignored)
 * and nexthops (nh_comp()).
 * Returns the entry or NULL; no reference is taken here.
 */
static struct fib_info *fib_find_info(struct fib_info *nfi)
{
	struct hlist_head *head = fib_info_hash_bucket(nfi);
	struct fib_info *fi;

	hlist_for_each_entry(fi, head, fib_hash) {
		if (fi->fib_nhs != nfi->fib_nhs)
			continue;

		if (nfi->fib_protocol == fi->fib_protocol &&
		    nfi->fib_scope == fi->fib_scope &&
		    nfi->fib_prefsrc == fi->fib_prefsrc &&
		    nfi->fib_priority == fi->fib_priority &&
		    nfi->fib_type == fi->fib_type &&
		    nfi->fib_tb_id == fi->fib_tb_id &&
		    memcmp(nfi->fib_metrics, fi->fib_metrics,
			   sizeof(u32) * RTAX_MAX) == 0 &&
		    !((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK) &&
		    nh_comp(fi, nfi) == 0)
			return fi;
	}

	return NULL;
}

/* Check, that the gateway is already configured.
* Used only by redirect accept routine, under rcu_read_lock();
 * Returns 0 if a nexthop on @dev that is not marked RTNH_F_DEAD has
 * IPv4 gateway @gw, and -1 otherwise.
 */
int ip_fib_check_default(__be32 gw, struct net_device *dev)
{
	struct hlist_head *head;
	struct fib_nh *nh;

	head = fib_nh_head(dev);

	hlist_for_each_entry_rcu(nh, head, nh_hash) {
		DEBUG_NET_WARN_ON_ONCE(nh->fib_nh_dev != dev);
		if (nh->fib_nh_gw4 == gw &&
		    !(nh->fib_nh_flags & RTNH_F_DEAD)) {
			return 0;
		}
	}

	return -1;
}

/* Upper bound, in bytes, of the netlink message needed to describe a
 * route that uses @fi. rtmsg_fib() sizes its skb with this value and
 * warns if fib_dump_info() then reports -EMSGSIZE.
 */
size_t fib_nlmsg_size(struct fib_info *fi)
{
	size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
			 + nla_total_size(4) /* RTA_TABLE */
			 + nla_total_size(4) /* RTA_DST */
			 + nla_total_size(4) /* RTA_PRIORITY */
			 + nla_total_size(4) /* RTA_PREFSRC */
			 + nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */
	unsigned int nhs = fib_info_num_path(fi);

	/* space for nested metrics */
	payload += nla_total_size((RTAX_MAX * nla_total_size(4)));

	if (fi->nh)
		payload += nla_total_size(4); /* RTA_NH_ID */

	if (nhs) {
		size_t nh_encapsize = 0;
		/* Also handles the special case nhs == 1 */

		/* each nexthop is packed in an attribute */
		size_t nhsize = nla_total_size(sizeof(struct rtnexthop));
		unsigned int i;

		/* may contain flow and gateway attribute */
		nhsize += 2 * nla_total_size(4);

		/* grab encap info */
		/* NOTE(review): the two labels below look swapped - the
		 * 2-byte attribute matches the u16 RTA_ENCAP_TYPE that
		 * fib_get_nhs() reads; confirm before relying on them.
		 */
		for (i = 0; i < fib_info_num_path(fi); i++) {
			struct fib_nh_common *nhc = fib_info_nhc(fi, i);

			if (nhc->nhc_lwtstate) {
				/* RTA_ENCAP_TYPE */
				nh_encapsize += lwtunnel_get_encap_size(
						nhc->nhc_lwtstate);
				/* RTA_ENCAP */
				nh_encapsize += nla_total_size(2);
			}
		}

		/* all nexthops are packed in a nested attribute */
		payload += nla_total_size((nhs * nhsize) + nh_encapsize);

	}

	return payload;
}

/* Build a route message for @fa with fib_dump_info() and send it to the
 * RTNLGRP_IPV4_ROUTE group. If the skb cannot be allocated or filled,
 * the error is recorded on the group with rtnl_set_sk_err() instead.
 */
void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
	       int dst_len, u32 tb_id, const struct nl_info *info,
	       unsigned int nlm_flags)
{
	struct fib_rt_info fri;
	struct sk_buff *skb;
	u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
	int err = -ENOBUFS;

	skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL);
	if (!skb)
		goto errout;

	fri.fi = fa->fa_info;
	fri.tb_id = tb_id;
	fri.dst = key;
	fri.dst_len = dst_len;
	fri.dscp = fa->fa_dscp;
	fri.type = fa->fa_type;
	fri.offload = READ_ONCE(fa->offload);
	fri.trap = READ_ONCE(fa->trap);
	fri.offload_failed = READ_ONCE(fa->offload_failed);
	err = fib_dump_info(skb, info->portid, seq, event, &fri, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in fib_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_IPV4_ROUTE,
		    info->nlh, GFP_KERNEL);
	return;
errout:
	rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
}

/* Judge the first nexthop of @fi by the neighbour state of its gateway.
 * Returns 0 when there is no neighbour entry, when the neighbour is
 * NUD_REACHABLE, or when it is NUD_VALID and @order differs from @dflt.
 * Otherwise returns 1 and, under the condition coded below, records @fi
 * and @order in *@last_resort and *@last_idx.
 */
static int fib_detect_death(struct fib_info *fi, int order,
			    struct fib_info **last_resort, int *last_idx,
			    int dflt)
{
	const struct fib_nh_common *nhc = fib_info_nhc(fi, 0);
	struct neighbour *n;
	int state = NUD_NONE;

	if (likely(nhc->nhc_gw_family == AF_INET))
		n = neigh_lookup(&arp_tbl, &nhc->nhc_gw.ipv4, nhc->nhc_dev);
	else if (IS_ENABLED(CONFIG_IPV6) && nhc->nhc_gw_family == AF_INET6)
		n = neigh_lookup(&nd_tbl, &nhc->nhc_gw.ipv6, nhc->nhc_dev);
	else
		n = NULL;

	if (n) {
		/* only the state snapshot is needed; drop the reference */
		state = READ_ONCE(n->nud_state);
		neigh_release(n);
	} else {
		return 0;
	}
	if (state == NUD_REACHABLE)
		return 0;
	if ((state & NUD_VALID) && order != dflt)
		return 0;
	if ((state & NUD_VALID) ||
	    (*last_idx < 0 && order > dflt && state != NUD_INCOMPLETE)) {
		*last_resort = fi;
		*last_idx = order;
	}
	return 1;
}

/* Initialise the family-independent part of a nexthop: allocate the
 * per-CPU output route slots and, when @encap is given, build the
 * lwtunnel state and store a reference to it in @nhc.
 * Returns 0, -ENOMEM, or the error from lwtunnel_build_state(); on that
 * last error the per-CPU area is freed again.
 */
int fib_nh_common_init(struct net *net, struct fib_nh_common *nhc,
		       struct nlattr *encap, u16 encap_type,
		       void *cfg, gfp_t gfp_flags,
		       struct netlink_ext_ack *extack)
{
	int err;

	nhc->nhc_pcpu_rth_output = alloc_percpu_gfp(struct rtable __rcu *,
						    gfp_flags);
	if (!nhc->nhc_pcpu_rth_output)
		return -ENOMEM;

	if (encap) {
		struct lwtunnel_state *lwtstate;

		err = lwtunnel_build_state(net, encap_type, encap,
					   nhc->nhc_family, cfg, &lwtstate,
					   extack);
		if (err)
			goto lwt_failure;

		nhc->nhc_lwtstate = lwtstate_get(lwtstate);
	}

	return 0;

lwt_failure:
	rt_fibinfo_free_cpus(nhc->nhc_pcpu_rth_output);
	nhc->nhc_pcpu_rth_output = NULL;
	return err;
}

/* Initialise IPv4 nexthop @nh from @cfg: common part first, then oif,
 * gateway (IPv4 or IPv6), flags and, depending on the kernel
 * configuration, class id and multipath weight.
 * Returns 0 or the error from fib_nh_common_init().
 */
int fib_nh_init(struct net *net, struct fib_nh *nh,
		struct fib_config *cfg, int nh_weight,
		struct netlink_ext_ack *extack)
{
	int err;

	nh->fib_nh_family = AF_INET;

	err = fib_nh_common_init(net, &nh->nh_common, cfg->fc_encap,
				 cfg->fc_encap_type, cfg, GFP_KERNEL, extack);
	if (err)
		return err;

	nh->fib_nh_oif = cfg->fc_oif;
	nh->fib_nh_gw_family = cfg->fc_gw_family;
	if (cfg->fc_gw_family == AF_INET)
		nh->fib_nh_gw4 = cfg->fc_gw4;
	else if (cfg->fc_gw_family == AF_INET6)
		nh->fib_nh_gw6 = cfg->fc_gw6;

	nh->fib_nh_flags = cfg->fc_flags;

#ifdef CONFIG_IP_ROUTE_CLASSID
	nh->nh_tclassid = cfg->fc_flow;
	/* balanced by the atomic_dec() in fib_nh_release() */
	if (nh->nh_tclassid)
		atomic_inc(&net->ipv4.fib_num_tclassid_users);
#endif
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	nh->fib_nh_weight = nh_weight;
#endif
	return 0;
}

#ifdef CONFIG_IP_ROUTE_MULTIPATH

/* Count the rtnexthop entries in a multipath attribute payload.
 * Returns the count, or 0 (with an extack message) if bytes remain after
 * the last well-formed entry.
 */
static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining,
			      struct netlink_ext_ack *extack)
{
	int nhs = 0;

	while (rtnh_ok(rtnh, remaining)) {
		nhs++;
		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* leftover implies invalid nexthop configuration, discard it */
	if (remaining > 0) {
		NL_SET_ERR_MSG(extack,
			       "Invalid nexthop configuration - extra data after nexthops");
		nhs = 0;
	}

	return nhs;
}

/* Read an IPv4 gateway address from attribute @nla into *@gw.
 * Returns 0, or -EINVAL if the attribute payload is too short.
 */
static int fib_gw_from_attr(__be32 *gw, struct nlattr *nla,
			    struct netlink_ext_ack *extack)
{
	if (nla_len(nla) < sizeof(*gw)) {
		NL_SET_ERR_MSG(extack, "Invalid IPv4 address in RTA_GATEWAY");
		return -EINVAL;
	}

	*gw = nla_get_in_addr(nla);

	return 0;
}

/* only called when fib_nh is integrated into fib_info */
/* Fill every embedded nexthop of @fi from the rtnexthop entries at
 * @rtnh, then check that the first nexthop does not contradict the
 * route-level oif, gateway and (with CONFIG_IP_ROUTE_CLASSID) flow in
 * @cfg.
 * Returns 0 on success or a negative errno.
 */
static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
		       int remaining, struct fib_config *cfg,
		       struct netlink_ext_ack *extack)
{
	struct net *net = fi->fib_net;
	struct fib_config fib_cfg;
	struct fib_nh *nh;
	int ret;

	change_nexthops(fi) {
		int attrlen;

		/* each nexthop is configured through a fresh fib_config */
		memset(&fib_cfg, 0, sizeof(fib_cfg));

		if (!rtnh_ok(rtnh, remaining)) {
			NL_SET_ERR_MSG(extack,
				       "Invalid nexthop configuration - extra data after nexthop");
			return -EINVAL;
		}

		if (rtnh->rtnh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) {
			NL_SET_ERR_MSG(extack,
				       "Invalid flags for nexthop - can not contain DEAD or LINKDOWN");
			return -EINVAL;
		}

		/* low byte from the nexthop entry, remaining bits from
		 * the route-level flags
		 */
		fib_cfg.fc_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
		fib_cfg.fc_oif = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *nlav, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			nlav = nla_find(attrs, attrlen, RTA_VIA);
			if (nla && nlav) {
				NL_SET_ERR_MSG(extack,
					       "Nexthop configuration can not contain both GATEWAY and VIA");
				return -EINVAL;
			}
			if (nla) {
				ret = fib_gw_from_attr(&fib_cfg.fc_gw4, nla,
						       extack);
				if (ret)
					goto errout;

				/* a zero address leaves the nexthop
				 * without a gateway family
				 */
				if (fib_cfg.fc_gw4)
					fib_cfg.fc_gw_family = AF_INET;
			} else if (nlav) {
				ret = fib_gw_from_via(&fib_cfg, nlav, extack);
				if (ret)
					goto errout;
			}

			nla = nla_find(attrs, attrlen, RTA_FLOW);
			if (nla) {
				if (nla_len(nla) < sizeof(u32)) {
					NL_SET_ERR_MSG(extack, "Invalid RTA_FLOW");
					return -EINVAL;
				}
				fib_cfg.fc_flow = nla_get_u32(nla);
			}

			fib_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			/* RTA_ENCAP_TYPE length checked in
			 * lwtunnel_valid_encap_type_attr
			 */
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				fib_cfg.fc_encap_type = nla_get_u16(nla);
		}

		/* the weight handed on is rtnh_hops + 1 */
		ret = fib_nh_init(net, nexthop_nh, &fib_cfg,
				  rtnh->rtnh_hops + 1, extack);
		if (ret)
			goto errout;

		rtnh = rtnh_next(rtnh, &remaining);
	} endfor_nexthops(fi);

	/* route-level attributes are checked against the first nexthop */
	ret = -EINVAL;
	nh = fib_info_nh(fi, 0);
	if (cfg->fc_oif && nh->fib_nh_oif != cfg->fc_oif) {
		NL_SET_ERR_MSG(extack,
			       "Nexthop device index does not match RTA_OIF");
		goto errout;
	}
	if (cfg->fc_gw_family) {
		if (cfg->fc_gw_family != nh->fib_nh_gw_family ||
		    (cfg->fc_gw_family == AF_INET &&
		     nh->fib_nh_gw4 != cfg->fc_gw4) ||
		    (cfg->fc_gw_family == AF_INET6 &&
		     ipv6_addr_cmp(&nh->fib_nh_gw6, &cfg->fc_gw6))) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop gateway does not match RTA_GATEWAY or RTA_VIA");
			goto errout;
		}
	}
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (cfg->fc_flow && nh->nh_tclassid != cfg->fc_flow) {
		NL_SET_ERR_MSG(extack,
			       "Nexthop class id does not match RTA_FLOW");
		goto errout;
	}
#endif
	ret = 0;
errout:
	return ret;
}

/* only called when fib_nh is integrated into fib_info */
/* Recompute fib_nh_upper_bound for every nexthop of a multipath @fi.
 * Usable nexthops get a cumulative, weight-proportional bound scaled to
 * 31 bits; nexthops that are dead, or link-down on a device for which
 * ip_ignore_linkdown() is true, get -1. Does nothing for fewer than two
 * paths.
 */
static void fib_rebalance(struct fib_info *fi)
{
	int total;
	int w;

	if (fib_info_num_path(fi) < 2)
		return;

	/* pass 1: sum the weights of the usable nexthops */
	total = 0;
	for_nexthops(fi) {
		if (nh->fib_nh_flags & RTNH_F_DEAD)
			continue;

		if (ip_ignore_linkdown(nh->fib_nh_dev) &&
		    nh->fib_nh_flags & RTNH_F_LINKDOWN)
			continue;

		total += nh->fib_nh_weight;
	} endfor_nexthops(fi);

	/* pass 2: assign each nexthop its upper bound */
	w = 0;
	change_nexthops(fi) {
		int upper_bound;

		if (nexthop_nh->fib_nh_flags & RTNH_F_DEAD) {
			upper_bound = -1;
		} else if (ip_ignore_linkdown(nexthop_nh->fib_nh_dev) &&
			   nexthop_nh->fib_nh_flags & RTNH_F_LINKDOWN) {
			upper_bound = -1;
		} else {
			w += nexthop_nh->fib_nh_weight;
			upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31,
							    total) - 1;
		}

		atomic_set(&nexthop_nh->fib_nh_upper_bound, upper_bound);
	} endfor_nexthops(fi);
}
#else /* CONFIG_IP_ROUTE_MULTIPATH */

/* Without multipath support a multipath request is always rejected. */
static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
		       int remaining, struct fib_config *cfg,
		       struct netlink_ext_ack *extack)
{
	NL_SET_ERR_MSG(extack, "Multipath support not enabled in kernel");

	return -EINVAL;
}

/* Nothing to rebalance without multipath support. */
#define fib_rebalance(fi) do { } while (0)

#endif /* CONFIG_IP_ROUTE_MULTIPATH */

/* Compare the encapsulation described by @encap_type/@encap with the
 * lwtunnel state of @nh by building a temporary state.
 * Returns the result of lwtunnel_cmp_encap(). Returns 0 when
 * @encap_type is LWTUNNEL_ENCAP_NONE and also when the temporary state
 * cannot be built.
 */
static int fib_encap_match(struct net *net, u16 encap_type,
			   struct nlattr *encap,
			   const struct fib_nh *nh,
			   const struct fib_config *cfg,
			   struct netlink_ext_ack *extack)
{
	struct lwtunnel_state *lwtstate;
	int ret, result = 0;

	if (encap_type == LWTUNNEL_ENCAP_NONE)
		return 0;

	ret = lwtunnel_build_state(net, encap_type, encap, AF_INET,
				   cfg, &lwtstate, extack);
	if (!ret) {
		result = lwtunnel_cmp_encap(lwtstate, nh->fib_nh_lws);
		lwtstate_free(lwtstate);
	}

	return result;
}

/* Check whether the nexthop specification in @cfg matches @fi.
 * Returns 0 on a match, 1 on a mismatch, or a negative errno when the
 * multipath attributes in @cfg are malformed.
 */
int fib_nh_match(struct net *net, struct fib_config *cfg, struct fib_info *fi,
		 struct netlink_ext_ack *extack)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	struct rtnexthop *rtnh;
	int remaining;
#endif

	if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
		return 1;

	/* request names a nexthop object: only the id is compared */
	if (cfg->fc_nh_id) {
		if (fi->nh && cfg->fc_nh_id == fi->nh->id)
			return 0;
		return 1;
	}

	/* route uses a nexthop object but the request describes legacy
	 * nexthop attributes
	 */
	if (fi->nh) {
		if (cfg->fc_oif || cfg->fc_gw_family || cfg->fc_mp)
			return 1;
		return 0;
	}

	/* oif and/or gateway given: compare with the first nexthop only */
	if (cfg->fc_oif || cfg->fc_gw_family) {
		struct fib_nh *nh;

		nh = fib_info_nh(fi, 0);
		if (cfg->fc_encap) {
			if (fib_encap_match(net, cfg->fc_encap_type,
					    cfg->fc_encap, nh, cfg, extack))
				return 1;
		}
#ifdef CONFIG_IP_ROUTE_CLASSID
		if (cfg->fc_flow &&
		    cfg->fc_flow != nh->nh_tclassid)
			return 1;
#endif
		if ((cfg->fc_oif && cfg->fc_oif != nh->fib_nh_oif) ||
		    (cfg->fc_gw_family &&
		     cfg->fc_gw_family != nh->fib_nh_gw_family))
			return 1;

		if (cfg->fc_gw_family == AF_INET &&
		    cfg->fc_gw4 != nh->fib_nh_gw4)
			return 1;

		if (cfg->fc_gw_family == AF_INET6 &&
		    ipv6_addr_cmp(&cfg->fc_gw6, &nh->fib_nh_gw6))
			return 1;

		return 0;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (!cfg->fc_mp)
		return 0;

	rtnh = cfg->fc_mp;
	remaining = cfg->fc_mp_len;

	/* compare the rtnexthop entries with the nexthops of @fi, in order */
	for_nexthops(fi) {
		int attrlen;

		if (!rtnh_ok(rtnh, remaining))
			return -EINVAL;

		if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->fib_nh_oif)
			return 1;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *nlav, *attrs = rtnh_attrs(rtnh);
			int err;

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			nlav = nla_find(attrs, attrlen, RTA_VIA);
			if (nla && nlav) {
				NL_SET_ERR_MSG(extack,
					       "Nexthop configuration can not contain both GATEWAY and VIA");
				return -EINVAL;
			}

			if (nla) {
				__be32 gw;

				err = fib_gw_from_attr(&gw, nla, extack);
				if (err)
					return err;

				if (nh->fib_nh_gw_family != AF_INET ||
				    gw != nh->fib_nh_gw4)
					return 1;
			} else if (nlav) {
				/* cfg2 is filled by fib_gw_from_via() */
				struct fib_config cfg2;

				err = fib_gw_from_via(&cfg2, nlav, extack);
				if (err)
					return err;

				switch (nh->fib_nh_gw_family) {
				case AF_INET:
					if (cfg2.fc_gw_family != AF_INET ||
					    cfg2.fc_gw4 != nh->fib_nh_gw4)
						return 1;
					break;
				case AF_INET6:
					if (cfg2.fc_gw_family != AF_INET6 ||
					    ipv6_addr_cmp(&cfg2.fc_gw6,
							  &nh->fib_nh_gw6))
						return 1;
					break;
				}
			}

#ifdef CONFIG_IP_ROUTE_CLASSID
			nla = nla_find(attrs, attrlen, RTA_FLOW);
			if (nla) {
				if (nla_len(nla) < sizeof(u32)) {
					NL_SET_ERR_MSG(extack, "Invalid RTA_FLOW");
					return -EINVAL;
				}
				if (nla_get_u32(nla) != nh->nh_tclassid)
					return 1;
			}
#endif
		}

		rtnh = rtnh_next(rtnh, &remaining);
	} endfor_nexthops(fi);
#endif
	return 0;
}

/* Check whether every metric attribute supplied in @cfg equals the
 * corresponding metric stored in @fi.
 * Returns true on a full match, or when @cfg carries no metrics; false
 * on the first mismatch or malformed attribute.
 */
bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi)
{
	struct nlattr *nla;
	int remaining;

	if (!cfg->fc_mx)
		return true;

	nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
		int type = nla_type(nla);
		u32 fi_val, val;

		if (!type)
			continue;
		if (type > RTAX_MAX)
			return false;

		/* clamp the index used for the metrics[] access below */
		type = array_index_nospec(type, RTAX_MAX + 1);
		if (type == RTAX_CC_ALGO) {
			char tmp[TCP_CA_NAME_MAX];
			bool ecn_ca = false;

			/* this metric is given by name; compare its key */
			nla_strscpy(tmp, nla, sizeof(tmp));
			val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
		} else {
			if (nla_len(nla) != sizeof(u32))
				return false;
			val = nla_get_u32(nla);
		}

		fi_val = fi->fib_metrics->metrics[type - 1];
		/* DST_FEATURE_ECN_CA is masked out of the stored value
		 * before comparing
		 */
		if (type == RTAX_FEATURES)
			fi_val &= ~DST_FEATURE_ECN_CA;

		if (fi_val != val)
			return false;
	}

	return true;
}

/* Validate an IPv6 gateway for IPv4 nexthop @nh by initialising a
 * temporary fib6_nh. On success the device found is recorded in @nh
 * (with its own reference), fib_nh_oif is set to that device's ifindex,
 * the scope is set to RT_SCOPE_LINK, and the temporary fib6_nh is
 * released.
 * Returns 0 or the error from fib6_nh_init().
 */
static int fib_check_nh_v6_gw(struct net *net, struct fib_nh *nh,
			      u32 table, struct netlink_ext_ack *extack)
{
	struct fib6_config cfg = {
		.fc_table = table,
		.fc_flags = nh->fib_nh_flags | RTF_GATEWAY,
		.fc_ifindex = nh->fib_nh_oif,
		.fc_gateway = nh->fib_nh_gw6,
	};
	struct fib6_nh fib6_nh = {};
	int err;

	err = fib6_nh_init(net, &fib6_nh, &cfg, GFP_KERNEL, extack);
	if (!err) {
		nh->fib_nh_dev = fib6_nh.fib_nh_dev;
		netdev_hold(nh->fib_nh_dev, &nh->fib_nh_dev_tracker,
			    GFP_KERNEL);
		nh->fib_nh_oif = nh->fib_nh_dev->ifindex;
		nh->fib_nh_scope = RT_SCOPE_LINK;

		fib6_nh_release(&fib6_nh);
	}

	return err;
}

/*
 * Picture
 * -------
 *
 *
Semantics of nexthop are very messy for historical reasons.
 * We have to take into account that:
 * a) the gateway can actually be a local interface address,
 *    so that the gatewayed route is direct.
 * b) the gateway must be an on-link address, possibly
 *    described not by an ifaddr, but also by a direct route.
 * c) If both gateway and interface are specified, they should not
 *    contradict each other.
 * d) If we use tunnel routes, the gateway may not be on-link.
 *
 * An attempt to reconcile all of these (alas, self-contradictory) conditions
 * results in pretty ugly and hairy code with obscure logic.
 *
 * I chose to generalize it instead, so that the size
 * of the code practically does not increase, but it becomes
 * much more general.
 * Every prefix is assigned a "scope" value: "host" is a local address,
 * "link" is a direct route,
 * [ ... "site" ... "interior" ... ]
 * and "universe" is a true gateway route with global meaning.
 *
 * Every prefix refers to a set of "nexthop"s (gw, oif),
 * where gw must have narrower scope. This recursion stops
 * when gw has LOCAL scope or if the "nexthop" is declared ONLINK,
 * which means that gw is forced to be on link.
 *
 * The code is still hairy, but now it is apparently logically
 * consistent and very flexible. E.g. as a by-product it allows
 * independent exterior and interior routing processes
 * to co-exist in peace.
 *
 * Normally it looks as follows.
*
 * {universe prefix}  -> (gw, oif) [scope link]
 *		  |
 *		  |-> {link prefix} -> (gw, oif) [scope local]
 *		                        |
 *					|-> {local prefix} (terminal node)
 *
 * fib_check_nh_v4_gw() validates the IPv4 gateway of @nh and binds the
 * nexthop to an egress device, on which it takes a reference. With
 * RTNH_F_ONLINK the device comes from fib_nh_oif and the gateway must be
 * of address type RTN_UNICAST on it; otherwise the gateway is resolved
 * by a FIB lookup.
 * Returns 0 on success or a negative errno.
 */
static int fib_check_nh_v4_gw(struct net *net, struct fib_nh *nh, u32 table,
			      u8 scope, struct netlink_ext_ack *extack)
{
	struct net_device *dev;
	struct fib_result res;
	int err = 0;

	if (nh->fib_nh_flags & RTNH_F_ONLINK) {
		unsigned int addr_type;

		if (scope >= RT_SCOPE_LINK) {
			NL_SET_ERR_MSG(extack, "Nexthop has invalid scope");
			return -EINVAL;
		}
		dev = __dev_get_by_index(net, nh->fib_nh_oif);
		if (!dev) {
			NL_SET_ERR_MSG(extack, "Nexthop device required for onlink");
			return -ENODEV;
		}
		if (!(dev->flags & IFF_UP)) {
			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
			return -ENETDOWN;
		}
		addr_type = inet_addr_type_dev_table(net, dev, nh->fib_nh_gw4);
		if (addr_type != RTN_UNICAST) {
			NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway");
			return -EINVAL;
		}
		if (!netif_carrier_ok(dev))
			nh->fib_nh_flags |= RTNH_F_LINKDOWN;
		nh->fib_nh_dev = dev;
		netdev_hold(dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC);
		nh->fib_nh_scope = RT_SCOPE_LINK;
		return 0;
	}
	rcu_read_lock();
	{
		struct fib_table *tbl = NULL;
		struct flowi4 fl4 = {
			.daddr = nh->fib_nh_gw4,
			.flowi4_scope = scope + 1,
			.flowi4_oif = nh->fib_nh_oif,
			.flowi4_iif = LOOPBACK_IFINDEX,
		};

		/* It is not necessary, but requires a bit of thinking */
		if (fl4.flowi4_scope < RT_SCOPE_LINK)
			fl4.flowi4_scope = RT_SCOPE_LINK;

		if (table && table != RT_TABLE_MAIN)
			tbl = fib_get_table(net, table);

		if (tbl)
			err = fib_table_lookup(tbl, &fl4, &res,
					       FIB_LOOKUP_IGNORE_LINKSTATE |
					       FIB_LOOKUP_NOREF);

		/* on error or if no table given do full lookup. This
		 * is needed for example when nexthops are in the local
		 * table rather than the given table
		 */
		if (!tbl || err) {
			err = fib_lookup(net, &fl4, &res,
					 FIB_LOOKUP_IGNORE_LINKSTATE);
		}

		if (err) {
			NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway");
			goto out;
		}
	}

	err = -EINVAL;
	if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) {
		NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway");
		goto out;
	}
	/* adopt scope, oif and device from the route that was found */
	nh->fib_nh_scope = res.scope;
	nh->fib_nh_oif = FIB_RES_OIF(res);
	nh->fib_nh_dev = dev = FIB_RES_DEV(res);
	if (!dev) {
		NL_SET_ERR_MSG(extack,
			       "No egress device for nexthop gateway");
		goto out;
	}
	netdev_hold(dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC);
	if (!netif_carrier_ok(dev))
		nh->fib_nh_flags |= RTNH_F_LINKDOWN;
	/* the device reference is already held when -ENETDOWN is returned */
	err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN;
out:
	rcu_read_unlock();
	return err;
}

/* Validate a nexthop without a gateway: the PERVASIVE and ONLINK flags
 * are rejected, and the interface named by fib_nh_oif must exist and be
 * up. On success the device is recorded in @nh (with a reference) and
 * the scope is set to RT_SCOPE_HOST.
 * Returns 0, -EINVAL, -ENODEV or -ENETDOWN.
 */
static int fib_check_nh_nongw(struct net *net, struct fib_nh *nh,
			      struct netlink_ext_ack *extack)
{
	struct in_device *in_dev;
	int err;

	if (nh->fib_nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK)) {
		NL_SET_ERR_MSG(extack,
			       "Invalid flags for nexthop - PERVASIVE and ONLINK can not be set");
		return -EINVAL;
	}

	rcu_read_lock();

	err = -ENODEV;
	in_dev = inetdev_by_index(net, nh->fib_nh_oif);
	if (!in_dev)
		goto out;
	err = -ENETDOWN;
	if (!(in_dev->dev->flags & IFF_UP)) {
		NL_SET_ERR_MSG(extack, "Device for nexthop is not up");
		goto out;
	}

	nh->fib_nh_dev = in_dev->dev;
	netdev_hold(nh->fib_nh_dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC);
	nh->fib_nh_scope = RT_SCOPE_HOST;
	if (!netif_carrier_ok(nh->fib_nh_dev))
		nh->fib_nh_flags |= RTNH_F_LINKDOWN;
	err = 0;
out:
	rcu_read_unlock();
	return err;
}

/* Validate nexthop @nh, dispatching on its gateway family: IPv4
 * gateway, IPv6 gateway, or no gateway.
 * Returns 0 on success or the negative errno from the chosen helper.
 */
int fib_check_nh(struct net *net, struct fib_nh *nh, u32 table, u8 scope,
		 struct netlink_ext_ack *extack)
{
	int err;

	if (nh->fib_nh_gw_family == AF_INET)
		err = fib_check_nh_v4_gw(net, nh, table, scope, extack);
	else if (nh->fib_nh_gw_family == AF_INET6)
		err = fib_check_nh_v6_gw(net, nh, table, extack);
	else
		err = fib_check_nh_nongw(net, nh, extack);

	return err;
}

/* Select a source address for nexthop @nhc at the given @scope.
 * For an AF_INET nexthop the address is also cached in nh_saddr,
 * together with the current dev_addr_genid; for any other family it is
 * only returned.
 */
__be32 fib_info_update_nhc_saddr(struct net *net, struct fib_nh_common *nhc,
				 unsigned char scope)
{
	struct fib_nh *nh;
	__be32 saddr;

	if (nhc->nhc_family != AF_INET)
		return inet_select_addr(nhc->nhc_dev, 0, scope);

	nh = container_of(nhc, struct fib_nh, nh_common);
	saddr = inet_select_addr(nh->fib_nh_dev, nh->fib_nh_gw4, scope);

	WRITE_ONCE(nh->nh_saddr, saddr);
	WRITE_ONCE(nh->nh_saddr_genid, atomic_read(&net->ipv4.dev_addr_genid));

	return saddr;
}

/* Preferred source address for lookup result @res: the route's
 * fib_prefsrc if set; else the address cached on an AF_INET nexthop
 * while its genid still equals dev_addr_genid; else a freshly selected
 * address.
 */
__be32 fib_result_prefsrc(struct net *net, struct fib_result *res)
{
	struct fib_nh_common *nhc = res->nhc;

	if (res->fi->fib_prefsrc)
		return res->fi->fib_prefsrc;

	if (nhc->nhc_family == AF_INET) {
		struct fib_nh *nh;

		nh = container_of(nhc, struct fib_nh, nh_common);
		if (READ_ONCE(nh->nh_saddr_genid) ==
		    atomic_read(&net->ipv4.dev_addr_genid))
			return READ_ONCE(nh->nh_saddr);
	}

	return fib_info_update_nhc_saddr(net, nhc, res->fi->fib_scope);
}

/* Check that @fib_prefsrc is acceptable for @cfg. An RTN_LOCAL route
 * whose non-zero destination equals the prefsrc is accepted without a
 * lookup. Otherwise the address must be of type RTN_LOCAL in the route's
 * table (RT_TABLE_LOCAL is used in place of RT_TABLE_MAIN) or, failing
 * that, in RT_TABLE_LOCAL.
 */
static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc)
{
	if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
	    fib_prefsrc != cfg->fc_dst) {
		u32 tb_id = cfg->fc_table;
		int rc;

		if (tb_id == RT_TABLE_MAIN)
			tb_id = RT_TABLE_LOCAL;

		rc = inet_addr_type_table(cfg->fc_nlinfo.nl_net,
					  fib_prefsrc, tb_id);

		if (rc != RTN_LOCAL && tb_id != RT_TABLE_LOCAL) {
			rc = inet_addr_type_table(cfg->fc_nlinfo.nl_net,
						  fib_prefsrc, RT_TABLE_LOCAL);
		}

		if (rc != RTN_LOCAL)
			return false;
	}
	return true;
}

struct fib_info *fib_create_info(struct fib_config *cfg,
				 struct netlink_ext_ack *extack)
{
	int err;
	struct fib_info *fi = NULL;
	struct nexthop *nh = NULL;
	struct fib_info *ofi;
	int nhs = 1;
	struct net *net = cfg->fc_nlinfo.nl_net;

	ASSERT_RTNL();
	if (cfg->fc_type > RTN_MAX)
		goto err_inval;

	/* Fast check to catch the most weird cases */
	if (fib_props[cfg->fc_type].scope > cfg->fc_scope) {
		NL_SET_ERR_MSG(extack, "Invalid scope");
		goto err_inval;
	}

	if (cfg->fc_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) {
		NL_SET_ERR_MSG(extack,
			       "Invalid rtm_flags - can not contain DEAD or LINKDOWN");
		goto err_inval;
	}

	if (cfg->fc_nh_id) {
		if (!cfg->fc_mx) {
			fi = fib_find_info_nh(net, cfg);
			if (fi) {
				refcount_inc(&fi->fib_treeref);
				return fi;
			}
		}

		nh = nexthop_find_by_id(net, cfg->fc_nh_id);
		if (!nh) {
			NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
			goto err_inval;
		}
		nhs = 0;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (cfg->fc_mp) {
		nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len, extack);
		if (nhs == 0)
			goto err_inval;
	}
#endif

	fib_info_hash_grow(net);

	fi = kzalloc_flex(*fi, fib_nh, nhs);
	if (!fi) {
		err = -ENOBUFS;
		goto failure;
	}

	fi->fib_metrics = ip_fib_metrics_init(cfg->fc_mx, cfg->fc_mx_len, extack);
	if (IS_ERR(fi->fib_metrics)) {
		err = PTR_ERR(fi->fib_metrics);
		kfree(fi);
		return ERR_PTR(err);
	}

	fi->fib_net = net;
	fi->fib_protocol = cfg->fc_protocol;
	fi->fib_scope = cfg->fc_scope;
	fi->fib_flags = cfg->fc_flags;
	fi->fib_priority = cfg->fc_priority;
	fi->fib_prefsrc = cfg->fc_prefsrc;
	fi->fib_type = cfg->fc_type;
	fi->fib_tb_id = cfg->fc_table;

1420 fi->fib_nhs = nhs; 1421 if (nh) { 1422 if (!nexthop_get(nh)) { 1423 NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); 1424 err = -EINVAL; 1425 } else { 1426 err = 0; 1427 fi->nh = nh; 1428 } 1429 } else { 1430 change_nexthops(fi) { 1431 nexthop_nh->nh_parent = fi; 1432 } endfor_nexthops(fi) 1433 1434 if (cfg->fc_mp) 1435 err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg, 1436 extack); 1437 else 1438 err = fib_nh_init(net, fi->fib_nh, cfg, 1, extack); 1439 } 1440 1441 if (err != 0) 1442 goto failure; 1443 1444 if (fib_props[cfg->fc_type].error) { 1445 if (cfg->fc_gw_family || cfg->fc_oif || cfg->fc_mp) { 1446 NL_SET_ERR_MSG(extack, 1447 "Gateway, device and multipath can not be specified for this route type"); 1448 goto err_inval; 1449 } 1450 goto link_it; 1451 } else { 1452 switch (cfg->fc_type) { 1453 case RTN_UNICAST: 1454 case RTN_LOCAL: 1455 case RTN_BROADCAST: 1456 case RTN_ANYCAST: 1457 case RTN_MULTICAST: 1458 break; 1459 default: 1460 NL_SET_ERR_MSG(extack, "Invalid route type"); 1461 goto err_inval; 1462 } 1463 } 1464 1465 if (cfg->fc_scope > RT_SCOPE_HOST) { 1466 NL_SET_ERR_MSG(extack, "Invalid scope"); 1467 goto err_inval; 1468 } 1469 1470 if (fi->nh) { 1471 err = fib_check_nexthop(fi->nh, cfg->fc_scope, extack); 1472 if (err) 1473 goto failure; 1474 } else if (cfg->fc_scope == RT_SCOPE_HOST) { 1475 struct fib_nh *nh = fi->fib_nh; 1476 1477 /* Local address is added. 
*/ 1478 if (nhs != 1) { 1479 NL_SET_ERR_MSG(extack, 1480 "Route with host scope can not have multiple nexthops"); 1481 goto err_inval; 1482 } 1483 if (nh->fib_nh_gw_family) { 1484 NL_SET_ERR_MSG(extack, 1485 "Route with host scope can not have a gateway"); 1486 goto err_inval; 1487 } 1488 nh->fib_nh_scope = RT_SCOPE_NOWHERE; 1489 nh->fib_nh_dev = dev_get_by_index(net, nh->fib_nh_oif); 1490 err = -ENODEV; 1491 if (!nh->fib_nh_dev) 1492 goto failure; 1493 netdev_tracker_alloc(nh->fib_nh_dev, &nh->fib_nh_dev_tracker, 1494 GFP_KERNEL); 1495 } else { 1496 int linkdown = 0; 1497 1498 change_nexthops(fi) { 1499 err = fib_check_nh(cfg->fc_nlinfo.nl_net, nexthop_nh, 1500 cfg->fc_table, cfg->fc_scope, 1501 extack); 1502 if (err != 0) 1503 goto failure; 1504 if (nexthop_nh->fib_nh_flags & RTNH_F_LINKDOWN) 1505 linkdown++; 1506 } endfor_nexthops(fi) 1507 if (linkdown == fi->fib_nhs) 1508 fi->fib_flags |= RTNH_F_LINKDOWN; 1509 } 1510 1511 if (fi->fib_prefsrc && !fib_valid_prefsrc(cfg, fi->fib_prefsrc)) { 1512 NL_SET_ERR_MSG(extack, "Invalid prefsrc address"); 1513 goto err_inval; 1514 } 1515 1516 if (!fi->nh) { 1517 change_nexthops(fi) { 1518 fib_info_update_nhc_saddr(net, &nexthop_nh->nh_common, 1519 fi->fib_scope); 1520 if (nexthop_nh->fib_nh_gw_family == AF_INET6) 1521 fi->fib_nh_is_v6 = true; 1522 } endfor_nexthops(fi) 1523 1524 fib_rebalance(fi); 1525 } 1526 1527 link_it: 1528 ofi = fib_find_info(fi); 1529 if (ofi) { 1530 /* fib_table_lookup() should not see @fi yet. 
*/ 1531 fi->fib_dead = 1; 1532 free_fib_info(fi); 1533 refcount_inc(&ofi->fib_treeref); 1534 return ofi; 1535 } 1536 1537 refcount_set(&fi->fib_treeref, 1); 1538 refcount_set(&fi->fib_clntref, 1); 1539 1540 net->ipv4.fib_info_cnt++; 1541 hlist_add_head(&fi->fib_hash, fib_info_hash_bucket(fi)); 1542 1543 if (fi->fib_prefsrc) { 1544 struct hlist_head *head; 1545 1546 head = fib_info_laddrhash_bucket(net, fi->fib_prefsrc); 1547 hlist_add_head(&fi->fib_lhash, head); 1548 } 1549 if (fi->nh) { 1550 list_add(&fi->nh_list, &nh->fi_list); 1551 } else { 1552 change_nexthops(fi) { 1553 struct hlist_head *head; 1554 1555 if (!nexthop_nh->fib_nh_dev) 1556 continue; 1557 head = fib_nh_head(nexthop_nh->fib_nh_dev); 1558 hlist_add_head_rcu(&nexthop_nh->nh_hash, head); 1559 } endfor_nexthops(fi) 1560 } 1561 return fi; 1562 1563 err_inval: 1564 err = -EINVAL; 1565 1566 failure: 1567 if (fi) { 1568 /* fib_table_lookup() should not see @fi yet. */ 1569 fi->fib_dead = 1; 1570 free_fib_info(fi); 1571 } 1572 1573 return ERR_PTR(err); 1574 } 1575 1576 int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc, 1577 u8 rt_family, unsigned char *flags, bool skip_oif) 1578 { 1579 if (nhc->nhc_flags & RTNH_F_DEAD) 1580 *flags |= RTNH_F_DEAD; 1581 1582 if (nhc->nhc_flags & RTNH_F_LINKDOWN) { 1583 *flags |= RTNH_F_LINKDOWN; 1584 1585 rcu_read_lock(); 1586 switch (nhc->nhc_family) { 1587 case AF_INET: 1588 if (ip_ignore_linkdown(nhc->nhc_dev)) 1589 *flags |= RTNH_F_DEAD; 1590 break; 1591 case AF_INET6: 1592 if (ip6_ignore_linkdown(nhc->nhc_dev)) 1593 *flags |= RTNH_F_DEAD; 1594 break; 1595 } 1596 rcu_read_unlock(); 1597 } 1598 1599 switch (nhc->nhc_gw_family) { 1600 case AF_INET: 1601 if (nla_put_in_addr(skb, RTA_GATEWAY, nhc->nhc_gw.ipv4)) 1602 goto nla_put_failure; 1603 break; 1604 case AF_INET6: 1605 /* if gateway family does not match nexthop family 1606 * gateway is encoded as RTA_VIA 1607 */ 1608 if (rt_family != nhc->nhc_gw_family) { 1609 int alen = sizeof(struct in6_addr); 
1610 struct nlattr *nla; 1611 struct rtvia *via; 1612 1613 nla = nla_reserve(skb, RTA_VIA, alen + 2); 1614 if (!nla) 1615 goto nla_put_failure; 1616 1617 via = nla_data(nla); 1618 via->rtvia_family = AF_INET6; 1619 memcpy(via->rtvia_addr, &nhc->nhc_gw.ipv6, alen); 1620 } else if (nla_put_in6_addr(skb, RTA_GATEWAY, 1621 &nhc->nhc_gw.ipv6) < 0) { 1622 goto nla_put_failure; 1623 } 1624 break; 1625 } 1626 1627 *flags |= (nhc->nhc_flags & 1628 (RTNH_F_ONLINK | RTNH_F_OFFLOAD | RTNH_F_TRAP)); 1629 1630 if (!skip_oif && nhc->nhc_dev && 1631 nla_put_u32(skb, RTA_OIF, nhc->nhc_dev->ifindex)) 1632 goto nla_put_failure; 1633 1634 if (lwtunnel_fill_encap(skb, nhc->nhc_lwtstate, 1635 RTA_ENCAP, RTA_ENCAP_TYPE) < 0) 1636 goto nla_put_failure; 1637 1638 return 0; 1639 1640 nla_put_failure: 1641 return -EMSGSIZE; 1642 } 1643 1644 #if IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) || IS_ENABLED(CONFIG_IPV6) 1645 int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nhc, 1646 int nh_weight, u8 rt_family, u32 nh_tclassid) 1647 { 1648 const struct net_device *dev = nhc->nhc_dev; 1649 struct rtnexthop *rtnh; 1650 unsigned char flags = 0; 1651 1652 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh)); 1653 if (!rtnh) 1654 goto nla_put_failure; 1655 1656 rtnh->rtnh_hops = nh_weight - 1; 1657 rtnh->rtnh_ifindex = dev ? 
dev->ifindex : 0; 1658 1659 if (fib_nexthop_info(skb, nhc, rt_family, &flags, true) < 0) 1660 goto nla_put_failure; 1661 1662 rtnh->rtnh_flags = flags; 1663 1664 if (nh_tclassid && nla_put_u32(skb, RTA_FLOW, nh_tclassid)) 1665 goto nla_put_failure; 1666 1667 /* length of rtnetlink header + attributes */ 1668 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh; 1669 1670 return 0; 1671 1672 nla_put_failure: 1673 return -EMSGSIZE; 1674 } 1675 #endif 1676 1677 #ifdef CONFIG_IP_ROUTE_MULTIPATH 1678 static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi) 1679 { 1680 struct nlattr *mp; 1681 1682 mp = nla_nest_start_noflag(skb, RTA_MULTIPATH); 1683 if (!mp) 1684 goto nla_put_failure; 1685 1686 if (unlikely(fi->nh)) { 1687 if (nexthop_mpath_fill_node(skb, fi->nh, AF_INET) < 0) 1688 goto nla_put_failure; 1689 goto mp_end; 1690 } 1691 1692 for_nexthops(fi) { 1693 u32 nh_tclassid = 0; 1694 #ifdef CONFIG_IP_ROUTE_CLASSID 1695 nh_tclassid = nh->nh_tclassid; 1696 #endif 1697 if (fib_add_nexthop(skb, &nh->nh_common, nh->fib_nh_weight, 1698 AF_INET, nh_tclassid) < 0) 1699 goto nla_put_failure; 1700 } endfor_nexthops(fi); 1701 1702 mp_end: 1703 nla_nest_end(skb, mp); 1704 1705 return 0; 1706 1707 nla_put_failure: 1708 return -EMSGSIZE; 1709 } 1710 #else 1711 static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi) 1712 { 1713 return 0; 1714 } 1715 #endif 1716 1717 int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, 1718 const struct fib_rt_info *fri, unsigned int flags) 1719 { 1720 unsigned int nhs = fib_info_num_path(fri->fi); 1721 struct fib_info *fi = fri->fi; 1722 u32 tb_id = fri->tb_id; 1723 struct nlmsghdr *nlh; 1724 struct rtmsg *rtm; 1725 1726 nlh = nlmsg_put(skb, portid, seq, event, sizeof(*rtm), flags); 1727 if (!nlh) 1728 return -EMSGSIZE; 1729 1730 rtm = nlmsg_data(nlh); 1731 rtm->rtm_family = AF_INET; 1732 rtm->rtm_dst_len = fri->dst_len; 1733 rtm->rtm_src_len = 0; 1734 rtm->rtm_tos = inet_dscp_to_dsfield(fri->dscp); 1735 
if (tb_id < 256) 1736 rtm->rtm_table = tb_id; 1737 else 1738 rtm->rtm_table = RT_TABLE_COMPAT; 1739 if (nla_put_u32(skb, RTA_TABLE, tb_id)) 1740 goto nla_put_failure; 1741 rtm->rtm_type = fri->type; 1742 rtm->rtm_flags = fi->fib_flags; 1743 rtm->rtm_scope = fi->fib_scope; 1744 rtm->rtm_protocol = fi->fib_protocol; 1745 1746 if (rtm->rtm_dst_len && 1747 nla_put_in_addr(skb, RTA_DST, fri->dst)) 1748 goto nla_put_failure; 1749 if (fi->fib_priority && 1750 nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority)) 1751 goto nla_put_failure; 1752 if (rtnetlink_put_metrics(skb, fi->fib_metrics->metrics) < 0) 1753 goto nla_put_failure; 1754 1755 if (fi->fib_prefsrc && 1756 nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc)) 1757 goto nla_put_failure; 1758 1759 if (fi->nh) { 1760 if (nla_put_u32(skb, RTA_NH_ID, fi->nh->id)) 1761 goto nla_put_failure; 1762 if (nexthop_is_blackhole(fi->nh)) 1763 rtm->rtm_type = RTN_BLACKHOLE; 1764 if (!READ_ONCE(fi->fib_net->ipv4.sysctl_nexthop_compat_mode)) 1765 goto offload; 1766 } 1767 1768 if (nhs == 1) { 1769 const struct fib_nh_common *nhc = fib_info_nhc(fi, 0); 1770 unsigned char flags = 0; 1771 1772 if (fib_nexthop_info(skb, nhc, AF_INET, &flags, false) < 0) 1773 goto nla_put_failure; 1774 1775 rtm->rtm_flags = flags; 1776 #ifdef CONFIG_IP_ROUTE_CLASSID 1777 if (nhc->nhc_family == AF_INET) { 1778 struct fib_nh *nh; 1779 1780 nh = container_of(nhc, struct fib_nh, nh_common); 1781 if (nh->nh_tclassid && 1782 nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid)) 1783 goto nla_put_failure; 1784 } 1785 #endif 1786 } else { 1787 if (fib_add_multipath(skb, fi) < 0) 1788 goto nla_put_failure; 1789 } 1790 1791 offload: 1792 if (fri->offload) 1793 rtm->rtm_flags |= RTM_F_OFFLOAD; 1794 if (fri->trap) 1795 rtm->rtm_flags |= RTM_F_TRAP; 1796 if (fri->offload_failed) 1797 rtm->rtm_flags |= RTM_F_OFFLOAD_FAILED; 1798 1799 nlmsg_end(skb, nlh); 1800 return 0; 1801 1802 nla_put_failure: 1803 nlmsg_cancel(skb, nlh); 1804 return -EMSGSIZE; 1805 } 1806 1807 /* 1808 * 
Update FIB if: 1809 * - local address disappeared -> we must delete all the entries 1810 * referring to it. 1811 * - device went down -> we must shutdown all nexthops going via it. 1812 */ 1813 int fib_sync_down_addr(struct net_device *dev, __be32 local) 1814 { 1815 int tb_id = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN; 1816 struct net *net = dev_net(dev); 1817 struct hlist_head *head; 1818 struct fib_info *fi; 1819 int ret = 0; 1820 1821 if (!local) 1822 return 0; 1823 1824 head = fib_info_laddrhash_bucket(net, local); 1825 hlist_for_each_entry(fi, head, fib_lhash) { 1826 if (!net_eq(fi->fib_net, net) || 1827 fi->fib_tb_id != tb_id) 1828 continue; 1829 if (fi->fib_prefsrc == local) { 1830 fi->fib_flags |= RTNH_F_DEAD; 1831 fi->pfsrc_removed = true; 1832 ret++; 1833 } 1834 } 1835 return ret; 1836 } 1837 1838 static int call_fib_nh_notifiers(struct fib_nh *nh, 1839 enum fib_event_type event_type) 1840 { 1841 bool ignore_link_down = ip_ignore_linkdown(nh->fib_nh_dev); 1842 struct fib_nh_notifier_info info = { 1843 .fib_nh = nh, 1844 }; 1845 1846 switch (event_type) { 1847 case FIB_EVENT_NH_ADD: 1848 if (nh->fib_nh_flags & RTNH_F_DEAD) 1849 break; 1850 if (ignore_link_down && nh->fib_nh_flags & RTNH_F_LINKDOWN) 1851 break; 1852 return call_fib4_notifiers(dev_net(nh->fib_nh_dev), event_type, 1853 &info.info); 1854 case FIB_EVENT_NH_DEL: 1855 if ((ignore_link_down && nh->fib_nh_flags & RTNH_F_LINKDOWN) || 1856 (nh->fib_nh_flags & RTNH_F_DEAD)) 1857 return call_fib4_notifiers(dev_net(nh->fib_nh_dev), 1858 event_type, &info.info); 1859 break; 1860 default: 1861 break; 1862 } 1863 1864 return NOTIFY_DONE; 1865 } 1866 1867 /* Update the PMTU of exceptions when: 1868 * - the new MTU of the first hop becomes smaller than the PMTU 1869 * - the old MTU was the same as the PMTU, and it limited discovery of 1870 * larger MTUs on the path. 
With that limit raised, we can now 1871 * discover larger MTUs 1872 * A special case is locked exceptions, for which the PMTU is smaller 1873 * than the minimal accepted PMTU: 1874 * - if the new MTU is greater than the PMTU, don't make any change 1875 * - otherwise, unlock and set PMTU 1876 */ 1877 void fib_nhc_update_mtu(struct fib_nh_common *nhc, u32 new, u32 orig) 1878 { 1879 struct fnhe_hash_bucket *bucket; 1880 int i; 1881 1882 bucket = rcu_dereference_protected(nhc->nhc_exceptions, 1); 1883 if (!bucket) 1884 return; 1885 1886 for (i = 0; i < FNHE_HASH_SIZE; i++) { 1887 struct fib_nh_exception *fnhe; 1888 1889 for (fnhe = rcu_dereference_protected(bucket[i].chain, 1); 1890 fnhe; 1891 fnhe = rcu_dereference_protected(fnhe->fnhe_next, 1)) { 1892 if (fnhe->fnhe_mtu_locked) { 1893 if (new <= fnhe->fnhe_pmtu) { 1894 fnhe->fnhe_pmtu = new; 1895 fnhe->fnhe_mtu_locked = false; 1896 } 1897 } else if (new < fnhe->fnhe_pmtu || 1898 orig == fnhe->fnhe_pmtu) { 1899 fnhe->fnhe_pmtu = new; 1900 } 1901 } 1902 } 1903 } 1904 1905 void fib_sync_mtu(struct net_device *dev, u32 orig_mtu) 1906 { 1907 struct hlist_head *head = fib_nh_head(dev); 1908 struct fib_nh *nh; 1909 1910 hlist_for_each_entry(nh, head, nh_hash) { 1911 DEBUG_NET_WARN_ON_ONCE(nh->fib_nh_dev != dev); 1912 fib_nhc_update_mtu(&nh->nh_common, dev->mtu, orig_mtu); 1913 } 1914 } 1915 1916 /* Event force Flags Description 1917 * NETDEV_CHANGE 0 LINKDOWN Carrier OFF, not for scope host 1918 * NETDEV_DOWN 0 LINKDOWN|DEAD Link down, not for scope host 1919 * NETDEV_DOWN 1 LINKDOWN|DEAD Last address removed 1920 * NETDEV_UNREGISTER 1 LINKDOWN|DEAD Device removed 1921 * 1922 * only used when fib_nh is built into fib_info 1923 */ 1924 int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force) 1925 { 1926 struct hlist_head *head = fib_nh_head(dev); 1927 struct fib_info *prev_fi = NULL; 1928 int scope = RT_SCOPE_NOWHERE; 1929 struct fib_nh *nh; 1930 int ret = 0; 1931 1932 if (force) 1933 scope = -1; 1934 
1935 hlist_for_each_entry(nh, head, nh_hash) { 1936 struct fib_info *fi = nh->nh_parent; 1937 int dead; 1938 1939 BUG_ON(!fi->fib_nhs); 1940 DEBUG_NET_WARN_ON_ONCE(nh->fib_nh_dev != dev); 1941 if (fi == prev_fi) 1942 continue; 1943 prev_fi = fi; 1944 dead = 0; 1945 change_nexthops(fi) { 1946 if (nexthop_nh->fib_nh_flags & RTNH_F_DEAD) 1947 dead++; 1948 else if (nexthop_nh->fib_nh_dev == dev && 1949 nexthop_nh->fib_nh_scope != scope) { 1950 switch (event) { 1951 case NETDEV_DOWN: 1952 case NETDEV_UNREGISTER: 1953 nexthop_nh->fib_nh_flags |= RTNH_F_DEAD; 1954 fallthrough; 1955 case NETDEV_CHANGE: 1956 nexthop_nh->fib_nh_flags |= RTNH_F_LINKDOWN; 1957 break; 1958 } 1959 call_fib_nh_notifiers(nexthop_nh, 1960 FIB_EVENT_NH_DEL); 1961 dead++; 1962 } 1963 #ifdef CONFIG_IP_ROUTE_MULTIPATH 1964 if (event == NETDEV_UNREGISTER && 1965 nexthop_nh->fib_nh_dev == dev) { 1966 dead = fi->fib_nhs; 1967 break; 1968 } 1969 #endif 1970 } endfor_nexthops(fi) 1971 if (dead == fi->fib_nhs) { 1972 switch (event) { 1973 case NETDEV_DOWN: 1974 case NETDEV_UNREGISTER: 1975 fi->fib_flags |= RTNH_F_DEAD; 1976 fallthrough; 1977 case NETDEV_CHANGE: 1978 fi->fib_flags |= RTNH_F_LINKDOWN; 1979 break; 1980 } 1981 ret++; 1982 } 1983 1984 fib_rebalance(fi); 1985 } 1986 1987 return ret; 1988 } 1989 1990 /* Must be invoked inside of an RCU protected region. 
*/
/* Choose among several default routes for the lookup result @res.
 *
 * Walks the aliases on res->fa_head and considers only those that match
 * the result's prefix length, DSCP and table, are not dead, have the
 * result's scope, are RTN_UNICAST and whose first nexthop has a gateway
 * with RT_SCOPE_LINK. Candidates are tested with fib_detect_death(); the
 * first one it accepts is assigned to @res and its position is stored in
 * the fa_default field of the alias that matched res->fi. If none is
 * accepted, the last_resort entry reported by fib_detect_death() is used
 * when there is one.
 */
static void fib_select_default(const struct flowi4 *flp, struct fib_result *res)
{
	struct fib_info *fi = NULL, *last_resort = NULL;
	struct hlist_head *fa_head = res->fa_head;
	struct fib_table *tb = res->table;
	u8 slen = 32 - res->prefixlen;
	int order = -1, last_idx = -1;
	struct fib_alias *fa, *fa1 = NULL;
	u32 last_prio = res->fi->fib_priority;
	dscp_t last_dscp = 0;

	hlist_for_each_entry_rcu(fa, fa_head, fa_list) {
		struct fib_info *next_fi = fa->fa_info;
		struct fib_nh_common *nhc;

		if (fa->fa_slen != slen)
			continue;
		if (fa->fa_dscp && !fib_dscp_masked_match(fa->fa_dscp, flp))
			continue;
		if (fa->tb_id != tb->tb_id)
			continue;
		/* A higher priority value than the last accepted alias with
		 * the same DSCP: skip it when that DSCP is non-zero, stop
		 * the walk when it is zero.
		 */
		if (next_fi->fib_priority > last_prio &&
		    fa->fa_dscp == last_dscp) {
			if (last_dscp)
				continue;
			break;
		}
		if (next_fi->fib_flags & RTNH_F_DEAD)
			continue;
		last_dscp = fa->fa_dscp;
		last_prio = next_fi->fib_priority;

		if (next_fi->fib_scope != res->scope ||
		    fa->fa_type != RTN_UNICAST)
			continue;

		nhc = fib_info_nhc(next_fi, 0);
		if (!nhc->nhc_gw_family || nhc->nhc_scope != RT_SCOPE_LINK)
			continue;

		fib_alias_accessed(fa);

		if (!fi) {
			/* The first candidate must be the route the lookup
			 * returned; otherwise give up.
			 */
			if (next_fi != res->fi)
				break;
			fa1 = fa;
		} else if (!fib_detect_death(fi, order, &last_resort,
					     &last_idx, fa1->fa_default)) {
			/* The previous candidate was accepted: use it. */
			fib_result_assign(res, fi);
			fa1->fa_default = order;
			goto out;
		}
		fi = next_fi;
		order++;
	}

	/* Fewer than two candidates were collected: nothing to choose. */
	if (order <= 0 || !fi) {
		if (fa1)
			fa1->fa_default = -1;
		goto out;
	}

	/* The last candidate has not been tested inside the loop. */
	if (!fib_detect_death(fi, order, &last_resort, &last_idx,
			      fa1->fa_default)) {
		fib_result_assign(res, fi);
		fa1->fa_default = order;
		goto out;
	}

	if (last_idx >= 0)
		fib_result_assign(res, last_resort);
	fa1->fa_default = last_idx;
out:
	return;
}

/*
 * Dead device goes up.
We wake up dead nexthops. 2069 * It takes sense only on multipath routes. 2070 * 2071 * only used when fib_nh is built into fib_info 2072 */ 2073 int fib_sync_up(struct net_device *dev, unsigned char nh_flags) 2074 { 2075 struct fib_info *prev_fi; 2076 struct hlist_head *head; 2077 struct fib_nh *nh; 2078 int ret; 2079 2080 if (!(dev->flags & IFF_UP)) 2081 return 0; 2082 2083 if (nh_flags & RTNH_F_DEAD) { 2084 unsigned int flags = netif_get_flags(dev); 2085 2086 if (flags & (IFF_RUNNING | IFF_LOWER_UP)) 2087 nh_flags |= RTNH_F_LINKDOWN; 2088 } 2089 2090 prev_fi = NULL; 2091 head = fib_nh_head(dev); 2092 ret = 0; 2093 2094 hlist_for_each_entry(nh, head, nh_hash) { 2095 struct fib_info *fi = nh->nh_parent; 2096 int alive; 2097 2098 BUG_ON(!fi->fib_nhs); 2099 DEBUG_NET_WARN_ON_ONCE(nh->fib_nh_dev != dev); 2100 if (fi == prev_fi) 2101 continue; 2102 2103 prev_fi = fi; 2104 alive = 0; 2105 change_nexthops(fi) { 2106 if (!(nexthop_nh->fib_nh_flags & nh_flags)) { 2107 alive++; 2108 continue; 2109 } 2110 if (!nexthop_nh->fib_nh_dev || 2111 !(nexthop_nh->fib_nh_dev->flags & IFF_UP)) 2112 continue; 2113 if (nexthop_nh->fib_nh_dev != dev || 2114 !__in_dev_get_rtnl(dev)) 2115 continue; 2116 alive++; 2117 nexthop_nh->fib_nh_flags &= ~nh_flags; 2118 call_fib_nh_notifiers(nexthop_nh, FIB_EVENT_NH_ADD); 2119 } endfor_nexthops(fi) 2120 2121 if (alive > 0) { 2122 fi->fib_flags &= ~nh_flags; 2123 ret++; 2124 } 2125 2126 fib_rebalance(fi); 2127 } 2128 2129 return ret; 2130 } 2131 2132 #ifdef CONFIG_IP_ROUTE_MULTIPATH 2133 static bool fib_good_nh(const struct fib_nh *nh) 2134 { 2135 int state = NUD_REACHABLE; 2136 2137 if (nh->fib_nh_scope == RT_SCOPE_LINK) { 2138 struct neighbour *n; 2139 2140 rcu_read_lock(); 2141 2142 if (likely(nh->fib_nh_gw_family == AF_INET)) 2143 n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev, 2144 (__force u32)nh->fib_nh_gw4); 2145 else if (IS_ENABLED(CONFIG_IPV6) && 2146 nh->fib_nh_gw_family == AF_INET6) 2147 n = __ipv6_neigh_lookup_noref(nh->fib_nh_dev, 2148 
&nh->fib_nh_gw6); 2149 else 2150 n = NULL; 2151 if (n) 2152 state = READ_ONCE(n->nud_state); 2153 2154 rcu_read_unlock(); 2155 } 2156 2157 return !!(state & NUD_VALID); 2158 } 2159 2160 void fib_select_multipath(struct fib_result *res, int hash, 2161 const struct flowi4 *fl4) 2162 { 2163 struct fib_info *fi = res->fi; 2164 struct net *net = fi->fib_net; 2165 bool use_neigh; 2166 int score = -1; 2167 __be32 saddr; 2168 2169 if (unlikely(res->fi->nh)) { 2170 nexthop_path_fib_result(res, hash); 2171 return; 2172 } 2173 2174 use_neigh = READ_ONCE(net->ipv4.sysctl_fib_multipath_use_neigh); 2175 saddr = fl4 ? fl4->saddr : 0; 2176 2177 change_nexthops(fi) { 2178 int nh_upper_bound, nh_score = 0; 2179 2180 /* Nexthops without a carrier are assigned an upper bound of 2181 * minus one when "ignore_routes_with_linkdown" is set. 2182 */ 2183 nh_upper_bound = atomic_read(&nexthop_nh->fib_nh_upper_bound); 2184 if (nh_upper_bound == -1 || 2185 (use_neigh && !fib_good_nh(nexthop_nh))) 2186 continue; 2187 2188 if (saddr && nexthop_nh->nh_saddr == saddr) 2189 nh_score += 2; 2190 if (hash <= nh_upper_bound) 2191 nh_score++; 2192 if (score < nh_score) { 2193 res->nh_sel = nhsel; 2194 res->nhc = &nexthop_nh->nh_common; 2195 if (nh_score == 3 || (!saddr && nh_score == 1)) 2196 return; 2197 score = nh_score; 2198 } 2199 2200 } endfor_nexthops(fi); 2201 } 2202 #endif 2203 2204 void fib_select_path(struct net *net, struct fib_result *res, 2205 struct flowi4 *fl4, const struct sk_buff *skb) 2206 { 2207 if (fl4->flowi4_oif) 2208 goto check_saddr; 2209 2210 #ifdef CONFIG_IP_ROUTE_MULTIPATH 2211 if (fib_info_num_path(res->fi) > 1) { 2212 int h = fib_multipath_hash(net, fl4, skb, NULL); 2213 2214 fib_select_multipath(res, h, fl4); 2215 } 2216 else 2217 #endif 2218 if (!res->prefixlen && 2219 res->table->tb_num_default > 1 && 2220 res->type == RTN_UNICAST) 2221 fib_select_default(fl4, res); 2222 2223 check_saddr: 2224 if (!fl4->saddr) { 2225 struct net_device *l3mdev; 2226 2227 l3mdev = 
dev_get_by_index_rcu(net, fl4->flowi4_l3mdev); 2228 2229 if (!l3mdev || 2230 l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) == l3mdev) 2231 fl4->saddr = fib_result_prefsrc(net, res); 2232 else 2233 fl4->saddr = inet_select_addr(l3mdev, 0, RT_SCOPE_LINK); 2234 } 2235 } 2236 2237 int __net_init fib4_semantics_init(struct net *net) 2238 { 2239 unsigned int hash_bits = 4; 2240 2241 net->ipv4.fib_info_hash = fib_info_hash_alloc(hash_bits); 2242 if (!net->ipv4.fib_info_hash) 2243 return -ENOMEM; 2244 2245 net->ipv4.fib_info_hash_bits = hash_bits; 2246 net->ipv4.fib_info_cnt = 0; 2247 2248 return 0; 2249 } 2250 2251 void __net_exit fib4_semantics_exit(struct net *net) 2252 { 2253 fib_info_hash_free(net->ipv4.fib_info_hash); 2254 } 2255