// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		reachable.  otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <linux/siphash.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/rtnh.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <net/ip.h>
#include <linux/uaccess.h>
#include <linux/btf_ids.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

/* Forward declaration; the definition appears further down in this file. */
static int ip6_rt_type_to_error(u8 fib6_type);

#define CREATE_TRACE_POINTS
#include <trace/events/fib6.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
#undef CREATE_TRACE_POINTS

/*
 * Result codes produced by rt6_check_neigh() and passed up through
 * rt6_score_route() to find_match().  Negative values are failures:
 * find_match() skips the nexthop on RT6_NUD_FAIL_HARD and converts
 * RT6_NUD_FAIL_DO_RR into the lowest valid score (0) while asking the
 * caller to round-robin.
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};

/* Forward declarations of the dst_ops callbacks and local helpers. */
INDIRECT_CALLABLE_SCOPE
struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
INDIRECT_CALLABLE_SCOPE
unsigned int		ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev);
static void		 ip6_dst_gc(struct dst_ops *ops);

static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int		ip6_pkt_prohibit(struct sk_buff *skb);
static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu,
					   bool confirm_neigh);
static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
			   int strict);
static size_t rt6_nlmsg_size(struct fib6_info *f6i);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const
struct in6_addr *gwaddr, 122 struct net_device *dev, 123 unsigned int pref); 124 static struct fib6_info *rt6_get_route_info(struct net *net, 125 const struct in6_addr *prefix, int prefixlen, 126 const struct in6_addr *gwaddr, 127 struct net_device *dev); 128 #endif 129 130 struct uncached_list { 131 spinlock_t lock; 132 struct list_head head; 133 struct list_head quarantine; 134 }; 135 136 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list); 137 138 void rt6_uncached_list_add(struct rt6_info *rt) 139 { 140 struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list); 141 142 rt->dst.rt_uncached_list = ul; 143 144 spin_lock_bh(&ul->lock); 145 list_add_tail(&rt->dst.rt_uncached, &ul->head); 146 spin_unlock_bh(&ul->lock); 147 } 148 149 void rt6_uncached_list_del(struct rt6_info *rt) 150 { 151 if (!list_empty(&rt->dst.rt_uncached)) { 152 struct uncached_list *ul = rt->dst.rt_uncached_list; 153 154 spin_lock_bh(&ul->lock); 155 list_del_init(&rt->dst.rt_uncached); 156 spin_unlock_bh(&ul->lock); 157 } 158 } 159 160 static void rt6_uncached_list_flush_dev(struct net_device *dev) 161 { 162 int cpu; 163 164 for_each_possible_cpu(cpu) { 165 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); 166 struct rt6_info *rt, *safe; 167 168 if (list_empty(&ul->head)) 169 continue; 170 171 spin_lock_bh(&ul->lock); 172 list_for_each_entry_safe(rt, safe, &ul->head, dst.rt_uncached) { 173 struct inet6_dev *rt_idev = rt->rt6i_idev; 174 struct net_device *rt_dev = rt->dst.dev; 175 bool handled = false; 176 177 if (rt_idev->dev == dev) { 178 rt->rt6i_idev = in6_dev_get(blackhole_netdev); 179 in6_dev_put(rt_idev); 180 handled = true; 181 } 182 183 if (rt_dev == dev) { 184 rt->dst.dev = blackhole_netdev; 185 netdev_ref_replace(rt_dev, blackhole_netdev, 186 &rt->dst.dev_tracker, 187 GFP_ATOMIC); 188 handled = true; 189 } 190 if (handled) 191 list_move(&rt->dst.rt_uncached, 192 &ul->quarantine); 193 } 194 spin_unlock_bh(&ul->lock); 195 } 196 } 197 198 static inline 
const void *choose_neigh_daddr(const struct in6_addr *p,
					     struct sk_buff *skb,
					     const void *daddr)
{
	/* Pick the address to resolve, in order of preference: @p when it is
	 * not the unspecified address, else the destination in @skb's IPv6
	 * header, else the caller-supplied @daddr (which may be NULL).
	 */
	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}

/*
 * Find the neighbour entry for the chosen address on @dev, creating one in
 * nd_tbl if the lookup misses.  Returns NULL when creation fails.
 */
struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
{
	struct neighbour *n;

	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	if (n)
		return n;

	n = neigh_create(&nd_tbl, daddr, dev);
	return IS_ERR(n) ? NULL : n;
}

/* dst_ops->neigh_lookup: resolve using the route's nexthop and dst->dev. */
static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = dst_rt6_info(dst);

	return ip6_neigh_lookup(rt6_nexthop(rt, &in6addr_any),
				dst->dev, skb, daddr);
}

/*
 * dst_ops->confirm_neigh: confirm the neighbour for the route's nexthop (or
 * @daddr).  Nothing is done when no address can be chosen, when the device
 * is NOARP/loopback, or when the address is multicast.
 */
static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	const struct rt6_info *rt = dst_rt6_info(dst);
	struct net_device *dev = dst->dev;

	daddr = choose_neigh_daddr(rt6_nexthop(rt, &in6addr_any), NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}

/* Template dst_ops; ip6_dst_alloc() uses the per-netns net->ipv6.ip6_dst_ops. */
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	dst_cow_metrics_generic,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};

static struct
dst_ops ip6_dst_blackhole_ops = {
	.family		= AF_INET6,
	.default_advmss	= ip6_default_advmss,
	.neigh_lookup	= ip6_dst_neigh_lookup,
	.check		= ip6_dst_check,
	.destroy	= ip6_dst_destroy,
	.cow_metrics	= dst_cow_metrics_generic,
	.update_pmtu	= dst_blackhole_update_pmtu,
	.redirect	= dst_blackhole_redirect,
	.mtu		= dst_blackhole_mtu,
};

static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1]      = 0,
};

/* Template for the FIB-level "no route" entry: reject, type unreachable. */
static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol  = RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= REFCOUNT_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};

/* Template dst for "no route": packets are discarded with -ENETUNREACH. */
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__rcuref	= RCUREF_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

/* Template dst for prohibited destinations: error is -EACCES. */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__rcuref	= RCUREF_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

/* Template dst for blackhole routes: uses the generic dst_discard handlers. */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__rcuref	= RCUREF_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif

/* Zero every rt6_info field that follows the embedded dst member. */
static void rt6_info_init(struct rt6_info *rt)
{
	memset_after(rt, 0, dst);
}

/* allocate dst with ip6_dst_ops */
struct rt6_info *ip6_dst_alloc(struct net *net, struct
net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		/* clear the rt6-specific tail and account the allocation */
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);

/*
 * dst_ops->destroy: drop everything the rt6_info holds - metrics, its slot on
 * the uncached list, the inet6_dev reference and the originating fib6_info.
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = dst_rt6_info(dst);
	struct fib6_info *from;
	struct inet6_dev *idev;

	ip_dst_metrics_put(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	/* atomically take ownership of rt->from before releasing it */
	from = xchg((__force struct fib6_info **)&rt->from, NULL);
	fib6_info_release(from);
}

/*
 * dst_ops->ifdown: move the route's inet6_dev reference over to
 * blackhole_netdev's inet6_dev, if one can be obtained.
 */
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev)
{
	struct rt6_info *rt = dst_rt6_info(dst);
	struct inet6_dev *idev = rt->rt6i_idev;

	if (idev && idev->dev != blackhole_netdev) {
		struct inet6_dev *blackhole_idev = in6_dev_get(blackhole_netdev);

		if (blackhole_idev) {
			rt->rt6i_idev = blackhole_idev;
			in6_dev_put(idev);
		}
	}
}

/* True when @rt carries RTF_EXPIRES and its expiry time has passed. */
static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

/*
 * Like __rt6_check_expired(), but a route without RTF_EXPIRES that has a
 * parent fib6_info is also considered expired when the dst is no longer
 * marked DST_OBSOLETE_FORCE_CHK or the parent itself has expired.
 */
static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}

/*
 * Pick one path of a multipath route and store it in @res.
 *
 * Routes without siblings (or with an already matched oif) keep res->f6i.
 * Otherwise fl6->mp_hash is computed if still zero and compared against each
 * candidate's fib_nh_upper_bound.  Routes using a nexthop object delegate
 * the choice to nexthop_path_fib6_result().
 */
void fib6_select_path(const struct net *net, struct fib6_result *res,
		      struct flowi6 *fl6, int oif, bool have_oif_match,
		      const struct sk_buff *skb, int strict)
{
	struct fib6_info *sibling, *next_sibling;
	struct fib6_info *match = res->f6i;

	if (!match->nh && (!match->fib6_nsiblings || have_oif_match))
		goto out;

	if (match->nh && have_oif_match && res->nh)
		return;

	if (skb)
		IP6CB(skb)->flags |= IP6SKB_MULTIPATH;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash &&
	    (!match->nh || nexthop_is_multipath(match->nh)))
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (unlikely(match->nh)) {
		nexthop_path_fib6_result(res, fl6->mp_hash);
		return;
	}

	/* the first route already covers this hash value */
	if (fl6->mp_hash <= atomic_read(&match->fib6_nh->fib_nh_upper_bound))
		goto out;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		const struct fib6_nh *nh = sibling->fib6_nh;
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		/* sibling covers the hash but scores negative: keep match */
		if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

out:
	res->f6i = match;
	res->nh = match->fib6_nh;
}

/*
 *	Route lookup. rcu_read_lock() should be held.
*/

/*
 * Decide whether nexthop @nh is acceptable for a lookup.  A dead nexthop
 * never matches.  With @oif set the nexthop device's ifindex must equal it;
 * without @oif, @saddr is checked against the nexthop device through
 * ipv6_chk_addr().
 */
static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh,
			       const struct in6_addr *saddr, int oif, int flags)
{
	const struct net_device *dev;

	if (nh->fib_nh_flags & RTNH_F_DEAD)
		return false;

	dev = nh->fib_nh_dev;
	if (oif) {
		if (dev->ifindex == oif)
			return true;
	} else {
		if (ipv6_chk_addr(net, saddr, dev,
				  flags & RT6_LOOKUP_F_IFACE))
			return true;
	}

	return false;
}

/* Argument bundle for __rt6_nh_dev_match(); @nh receives the last fib6_nh tested. */
struct fib6_nh_dm_arg {
	struct net		*net;
	const struct in6_addr	*saddr;
	int			oif;
	int			flags;
	struct fib6_nh		*nh;
};

/* nexthop_for_each_fib6_nh() callback wrapping __rt6_device_match(). */
static int __rt6_nh_dev_match(struct fib6_nh *nh, void *_arg)
{
	struct fib6_nh_dm_arg *arg = _arg;

	arg->nh = nh;
	return __rt6_device_match(arg->net, nh, arg->saddr, arg->oif,
				  arg->flags);
}

/* returns fib6_nh from nexthop or NULL */
static struct fib6_nh *rt6_nh_dev_match(struct net *net, struct nexthop *nh,
					struct fib6_result *res,
					const struct in6_addr *saddr,
					int oif, int flags)
{
	struct fib6_nh_dm_arg arg = {
		.net   = net,
		.saddr = saddr,
		.oif   = oif,
		.flags = flags,
	};

	if (nexthop_is_blackhole(nh))
		return NULL;

	if (nexthop_for_each_fib6_nh(nh, __rt6_nh_dev_match, &arg))
		return arg.nh;

	return NULL;
}

/*
 * Starting at res->f6i, walk the fib6_next chain for an entry whose nexthop
 * matches @oif / @saddr, and fill in @res (f6i, nh, fib6_type, fib6_flags).
 */
static void rt6_device_match(struct net *net, struct fib6_result *res,
			     const struct in6_addr *saddr, int oif, int flags)
{
	struct fib6_info *f6i = res->f6i;
	struct fib6_info *spf6i;
	struct fib6_nh *nh;

	/* no constraint given: accept the first entry unless it is dead */
	if (!oif && ipv6_addr_any(saddr)) {
		if (unlikely(f6i->nh)) {
			nh = nexthop_fib6_nh(f6i->nh);
			if (nexthop_is_blackhole(f6i->nh))
				goto out_blackhole;
		} else {
			nh = f6i->fib6_nh;
		}
		if (!(nh->fib_nh_flags & RTNH_F_DEAD))
			goto out;
	}

	for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) {
		bool matched = false;

		if (unlikely(spf6i->nh)) {
nh = rt6_nh_dev_match(net, spf6i->nh, res, saddr, 551 oif, flags); 552 if (nh) 553 matched = true; 554 } else { 555 nh = spf6i->fib6_nh; 556 if (__rt6_device_match(net, nh, saddr, oif, flags)) 557 matched = true; 558 } 559 if (matched) { 560 res->f6i = spf6i; 561 goto out; 562 } 563 } 564 565 if (oif && flags & RT6_LOOKUP_F_IFACE) { 566 res->f6i = net->ipv6.fib6_null_entry; 567 nh = res->f6i->fib6_nh; 568 goto out; 569 } 570 571 if (unlikely(f6i->nh)) { 572 nh = nexthop_fib6_nh(f6i->nh); 573 if (nexthop_is_blackhole(f6i->nh)) 574 goto out_blackhole; 575 } else { 576 nh = f6i->fib6_nh; 577 } 578 579 if (nh->fib_nh_flags & RTNH_F_DEAD) { 580 res->f6i = net->ipv6.fib6_null_entry; 581 nh = res->f6i->fib6_nh; 582 } 583 out: 584 res->nh = nh; 585 res->fib6_type = res->f6i->fib6_type; 586 res->fib6_flags = res->f6i->fib6_flags; 587 return; 588 589 out_blackhole: 590 res->fib6_flags |= RTF_REJECT; 591 res->fib6_type = RTN_BLACKHOLE; 592 res->nh = nh; 593 } 594 595 #ifdef CONFIG_IPV6_ROUTER_PREF 596 struct __rt6_probe_work { 597 struct work_struct work; 598 struct in6_addr target; 599 struct net_device *dev; 600 netdevice_tracker dev_tracker; 601 }; 602 603 static void rt6_probe_deferred(struct work_struct *w) 604 { 605 struct in6_addr mcaddr; 606 struct __rt6_probe_work *work = 607 container_of(w, struct __rt6_probe_work, work); 608 609 addrconf_addr_solict_mult(&work->target, &mcaddr); 610 ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0); 611 netdev_put(work->dev, &work->dev_tracker); 612 kfree(work); 613 } 614 615 static void rt6_probe(struct fib6_nh *fib6_nh) 616 { 617 struct __rt6_probe_work *work = NULL; 618 const struct in6_addr *nh_gw; 619 unsigned long last_probe; 620 struct neighbour *neigh; 621 struct net_device *dev; 622 struct inet6_dev *idev; 623 624 /* 625 * Okay, this does not seem to be appropriate 626 * for now, however, we need to check if it 627 * is really so; aka Router Reachability Probing. 
628 * 629 * Router Reachability Probe MUST be rate-limited 630 * to no more than one per minute. 631 */ 632 if (!fib6_nh->fib_nh_gw_family) 633 return; 634 635 nh_gw = &fib6_nh->fib_nh_gw6; 636 dev = fib6_nh->fib_nh_dev; 637 rcu_read_lock(); 638 last_probe = READ_ONCE(fib6_nh->last_probe); 639 idev = __in6_dev_get(dev); 640 neigh = __ipv6_neigh_lookup_noref(dev, nh_gw); 641 if (neigh) { 642 if (READ_ONCE(neigh->nud_state) & NUD_VALID) 643 goto out; 644 645 write_lock_bh(&neigh->lock); 646 if (!(neigh->nud_state & NUD_VALID) && 647 time_after(jiffies, 648 neigh->updated + 649 READ_ONCE(idev->cnf.rtr_probe_interval))) { 650 work = kmalloc(sizeof(*work), GFP_ATOMIC); 651 if (work) 652 __neigh_set_probe_once(neigh); 653 } 654 write_unlock_bh(&neigh->lock); 655 } else if (time_after(jiffies, last_probe + 656 READ_ONCE(idev->cnf.rtr_probe_interval))) { 657 work = kmalloc(sizeof(*work), GFP_ATOMIC); 658 } 659 660 if (!work || cmpxchg(&fib6_nh->last_probe, 661 last_probe, jiffies) != last_probe) { 662 kfree(work); 663 } else { 664 INIT_WORK(&work->work, rt6_probe_deferred); 665 work->target = *nh_gw; 666 netdev_hold(dev, &work->dev_tracker, GFP_ATOMIC); 667 work->dev = dev; 668 schedule_work(&work->work); 669 } 670 671 out: 672 rcu_read_unlock(); 673 } 674 #else 675 static inline void rt6_probe(struct fib6_nh *fib6_nh) 676 { 677 } 678 #endif 679 680 /* 681 * Default Router Selection (RFC 2461 6.3.6) 682 */ 683 static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh) 684 { 685 enum rt6_nud_state ret = RT6_NUD_FAIL_HARD; 686 struct neighbour *neigh; 687 688 rcu_read_lock(); 689 neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev, 690 &fib6_nh->fib_nh_gw6); 691 if (neigh) { 692 u8 nud_state = READ_ONCE(neigh->nud_state); 693 694 if (nud_state & NUD_VALID) 695 ret = RT6_NUD_SUCCEED; 696 #ifdef CONFIG_IPV6_ROUTER_PREF 697 else if (!(nud_state & NUD_FAILED)) 698 ret = RT6_NUD_SUCCEED; 699 else 700 ret = RT6_NUD_FAIL_PROBE; 701 #endif 702 } else { 703 ret = 
IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock();

	return ret;
}

/*
 * Score nexthop @nh for a lookup.
 *
 * Returns a non-negative score (2 when @oif is unset or matches the nexthop
 * device, with the decoded router preference OR-ed in above bit 1 when
 * CONFIG_IPV6_ROUTER_PREF is set), or a negative enum rt6_nud_state value
 * when the interface is required but does not match, or when the
 * reachability check fails.
 */
static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
			   int strict)
{
	int m = 0;

	if (!oif || nh->fib_nh_dev->ifindex == oif)
		m = 2;

	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2;
#endif
	/* reachability only matters for nexthops that go through a gateway */
	if ((strict & RT6_LOOKUP_F_REACHABLE) &&
	    !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) {
		int n = rt6_check_neigh(nh);
		if (n < 0)
			return n;
	}
	return m;
}

/*
 * Test one nexthop against the best score seen so far.
 *
 * Returns true, and updates *@mpri and *@do_rr, when @nh scores strictly
 * higher than *@mpri.  Dead nexthops, and link-down nexthops when the device
 * ignores link-down routes and the lookup does not override that, are
 * skipped.
 */
static bool find_match(struct fib6_nh *nh, u32 fib6_flags,
		       int oif, int strict, int *mpri, bool *do_rr)
{
	bool match_do_rr = false;
	bool rc = false;
	int m;

	if (nh->fib_nh_flags & RTNH_F_DEAD)
		goto out;

	if (ip6_ignore_linkdown(nh->fib_nh_dev) &&
	    nh->fib_nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	m = rt6_score_route(nh, fib6_flags, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(nh);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		rc = true;
	}
out:
	return rc;
}

/* Argument bundle for rt6_nh_find_match(); @nh receives the last fib6_nh tested. */
struct fib6_nh_frl_arg {
	u32		flags;
	int		oif;
	int		strict;
	int		*mpri;
	bool		*do_rr;
	struct fib6_nh	*nh;
};

/* nexthop_for_each_fib6_nh() callback wrapping find_match(). */
static int rt6_nh_find_match(struct fib6_nh *nh, void *_arg)
{
	struct fib6_nh_frl_arg *arg = _arg;

	arg->nh = nh;
	return find_match(nh, arg->flags, arg->oif, arg->strict,
			  arg->mpri, arg->do_rr);
}

static void __find_rr_leaf(struct fib6_info *f6i_start,
struct fib6_info *nomatch, u32 metric,
			   struct fib6_result *res, struct fib6_info **cont,
			   int oif, int strict, bool *do_rr, int *mpri)
{
	struct fib6_info *f6i;

	/* Walk from f6i_start until the end of the chain or until @nomatch.
	 * When @cont is non-NULL the walk stops at the first entry whose
	 * metric differs from @metric, and that entry is handed back in *cont.
	 */
	for (f6i = f6i_start;
	     f6i && f6i != nomatch;
	     f6i = rcu_dereference(f6i->fib6_next)) {
		bool matched = false;
		struct fib6_nh *nh;

		if (cont && f6i->fib6_metric != metric) {
			*cont = f6i;
			return;
		}

		if (fib6_check_expired(f6i))
			continue;

		if (unlikely(f6i->nh)) {
			struct fib6_nh_frl_arg arg = {
				.flags  = f6i->fib6_flags,
				.oif    = oif,
				.strict = strict,
				.mpri   = mpri,
				.do_rr  = do_rr
			};

			/* a blackhole nexthop ends the search immediately */
			if (nexthop_is_blackhole(f6i->nh)) {
				res->fib6_flags = RTF_REJECT;
				res->fib6_type = RTN_BLACKHOLE;
				res->f6i = f6i;
				res->nh = nexthop_fib6_nh(f6i->nh);
				return;
			}
			if (nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_find_match,
						     &arg)) {
				matched = true;
				nh = arg.nh;
			}
		} else {
			nh = f6i->fib6_nh;
			if (find_match(nh, f6i->fib6_flags, oif, strict,
				       mpri, do_rr))
				matched = true;
		}
		/* a later entry with a strictly better score overwrites this */
		if (matched) {
			res->f6i = f6i;
			res->nh = nh;
			res->fib6_flags = f6i->fib6_flags;
			res->fib6_type = f6i->fib6_type;
		}
	}
}

/*
 * Search the routes of @fn for the best match in three passes: from @rr_head
 * to the end of its metric group, from @leaf up to @rr_head, and - only when
 * nothing matched - over the entries starting at the first one with a
 * different metric.
 */
static void find_rr_leaf(struct fib6_node *fn, struct fib6_info *leaf,
			 struct fib6_info *rr_head, int oif, int strict,
			 bool *do_rr, struct fib6_result *res)
{
	u32 metric = rr_head->fib6_metric;
	struct fib6_info *cont = NULL;
	int mpri = -1;

	__find_rr_leaf(rr_head, NULL, metric, res, &cont,
		       oif, strict, do_rr, &mpri);

	__find_rr_leaf(leaf, rr_head, metric, res, &cont,
		       oif, strict, do_rr, &mpri);

	if (res->f6i || !cont)
		return;

	__find_rr_leaf(cont, NULL, metric, res, NULL,
		       oif, strict, do_rr, &mpri);
}

/*
 * Select the route to use from node @fn and store it in @res; falls back to
 * the fib6_null_entry when nothing is selected.
 */
static void rt6_select(struct net *net, struct fib6_node *fn, int oif,
		       struct fib6_result *res, int strict)
{
	struct fib6_info *leaf =
rcu_dereference(fn->leaf);
	struct fib6_info *rt0;
	bool do_rr = false;
	int key_plen;

	/* make sure this function or its helpers sets f6i */
	res->f6i = NULL;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		goto out;

	/* start from the round-robin pointer when one is set */
	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		goto out;

	find_rr_leaf(fn, leaf, rt0, oif, strict, &do_rr, res);
	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

out:
	if (!res->f6i) {
		res->f6i = net->ipv6.fib6_null_entry;
		res->nh = res->f6i->fib6_nh;
		res->fib6_flags = res->f6i->fib6_flags;
		res->fib6_type = res->f6i->fib6_type;
	}
}

/* True when the result is flagged RTF_NONEXTHOP or its nexthop has a gateway. */
static bool rt6_is_gw_or_nonexthop(const struct fib6_result *res)
{
	return (res->f6i->fib6_flags & RTF_NONEXTHOP) ||
	       res->nh->fib_nh_gw_family;
}

#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a route information option (@opt, @len bytes) received on @dev from
 * gateway @gwaddr: add, refresh or delete the corresponding route.
 * Returns 0 on success or -EINVAL when the option is malformed.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	struct fib6_table *table;
	unsigned int pref;
unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	/* length 3 is used as-is; shorter options are masked to prefix_len
	 * bits into a local buffer first
	 */
	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	/* a zero-length prefix refers to the default router entry */
	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	/* zero lifetime: remove the existing route */
	if (rt && !lifetime) {
		ip6_del_rt(net, rt, false);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		table = rt->fib6_table;
		spin_lock_bh(&table->tb6_lock);

		/* update the expiry state under the table lock */
		if (!addrconf_finite_timeout(lifetime)) {
			fib6_clean_expires(rt);
			fib6_remove_gc_list(rt);
		} else {
			fib6_set_expires(rt, jiffies + HZ * lifetime);
			fib6_add_gc_list(rt);
		}

		spin_unlock_bh(&table->tb6_lock);

		fib6_info_release(rt);
	}
	return 0;
}
#endif

/*
 *	Misc support functions
 */

/* called with rcu_lock held */
static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res)
{
	struct net_device
*dev = res->nh->fib_nh_dev;

	if (res->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&res->f6i->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}

/* dst.error value for each route type (RTN_*); 0 means no error. */
static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};

/* Map a route type to its errno via fib6_prop[]; @fib6_type is not range-checked. */
static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}

/* Translate the dst_nocount / dst_nopolicy bits of @rt into DST_* flags. */
static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
{
	unsigned short flags = 0;

	if (rt->dst_nocount)
		flags |= DST_NOCOUNT;
	if (rt->dst_nopolicy)
		flags |= DST_NOPOLICY;

	return flags;
}

/*
 * Set up the error code and input/output handlers of a reject route
 * according to its type.  Types without a dedicated case use the
 * ip6_pkt_discard handlers.
 */
static void ip6_rt_init_dst_reject(struct rt6_info *rt, u8 fib6_type)
{
	rt->dst.error = ip6_rt_type_to_error(fib6_type);

	switch (fib6_type) {
	case RTN_BLACKHOLE:
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}

static void ip6_rt_init_dst(struct
rt6_info *rt, const struct fib6_result *res)
{
	struct fib6_info *f6i = res->f6i;

	/* reject routes only need error and discard handlers */
	if (res->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, res->fib6_type);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	/* the input handler depends on the route type and destination */
	if (res->fib6_type == RTN_LOCAL || res->fib6_type == RTN_ANYCAST) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&f6i->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	/* carry over the nexthop's lightweight tunnel state, if any */
	if (res->nh->fib_nh_lws) {
		rt->dst.lwtstate = lwtstate_get(res->nh->fib_nh_lws);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}

/* Caller must already hold reference to @from */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	rcu_assign_pointer(rt->from, from);
	ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
}

/* Caller must already hold reference to f6i in result */
static void ip6_rt_copy_init(struct rt6_info *rt, const struct fib6_result *res)
{
	const struct fib6_nh *nh = res->nh;
	const struct net_device *dev = nh->fib_nh_dev;
	struct fib6_info *f6i = res->f6i;

	ip6_rt_init_dst(rt, res);

	rt->rt6i_dst = f6i->fib6_dst;
	/* rt6i_idev is left NULL when the nexthop has no device */
	rt->rt6i_idev = dev ?
in6_dev_get(dev) : NULL;
	rt->rt6i_flags = res->fib6_flags;
	if (nh->fib_nh_gw_family) {
		rt->rt6i_gateway = nh->fib_nh_gw6;
		rt->rt6i_flags |= RTF_GATEWAY;
	}
	rt6_set_from(rt, f6i);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = f6i->fib6_src;
#endif
}

/*
 * Walk up from @fn towards the root until a node carrying route information
 * (RTN_RTINFO) is found.  If a parent has a source subtree other than @fn,
 * the walk continues with a lookup of @saddr in that subtree.  Returns NULL
 * once the top-level root is reached.
 */
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}

/*
 * Try to take a reference on *@prt.  On failure *@prt is replaced by @net's
 * ip6_null_entry (with a reference held), or by NULL when @net is NULL, and
 * false is returned.
 */
static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (net) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}

/* called with rcu_lock held */
static struct rt6_info *ip6_create_rt_rcu(const struct fib6_result *res)
{
	struct net_device *dev = res->nh->fib_nh_dev;
	struct fib6_info *f6i = res->f6i;
	unsigned short flags;
	struct rt6_info *nrt;

	/* the new dst keeps this reference on f6i (see ip6_rt_copy_init()) */
	if (!fib6_info_hold_safe(f6i))
		goto fallback;

	flags = fib6_info_dst_flags(f6i);
	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (!nrt) {
		fib6_info_release(f6i);
		goto fallback;
	}

	ip6_rt_copy_init(nrt, res);
	return nrt;

fallback:
	/* could not build a dst: hand back the null entry with a reference */
	nrt = dev_net(dev)->ipv6.ip6_null_entry;
	dst_hold(&nrt->dst);
	return nrt;
}

/*
 * Policy-routing lookup callback: find the route for @fl6 in @table and
 * return a referenced rt6_info (the null entry when nothing matches).
 */
INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_result res = {};
	struct fib6_node *fn;
1221 struct rt6_info *rt; 1222 1223 rcu_read_lock(); 1224 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 1225 restart: 1226 res.f6i = rcu_dereference(fn->leaf); 1227 if (!res.f6i) 1228 res.f6i = net->ipv6.fib6_null_entry; 1229 else 1230 rt6_device_match(net, &res, &fl6->saddr, fl6->flowi6_oif, 1231 flags); 1232 1233 if (res.f6i == net->ipv6.fib6_null_entry) { 1234 fn = fib6_backtrack(fn, &fl6->saddr); 1235 if (fn) 1236 goto restart; 1237 1238 rt = net->ipv6.ip6_null_entry; 1239 dst_hold(&rt->dst); 1240 goto out; 1241 } else if (res.fib6_flags & RTF_REJECT) { 1242 goto do_create; 1243 } 1244 1245 fib6_select_path(net, &res, fl6, fl6->flowi6_oif, 1246 fl6->flowi6_oif != 0, skb, flags); 1247 1248 /* Search through exception table */ 1249 rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr); 1250 if (rt) { 1251 if (ip6_hold_safe(net, &rt)) 1252 dst_use_noref(&rt->dst, jiffies); 1253 } else { 1254 do_create: 1255 rt = ip6_create_rt_rcu(&res); 1256 } 1257 1258 out: 1259 trace_fib6_table_lookup(net, &res, table, fl6); 1260 1261 rcu_read_unlock(); 1262 1263 return rt; 1264 } 1265 1266 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6, 1267 const struct sk_buff *skb, int flags) 1268 { 1269 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup); 1270 } 1271 EXPORT_SYMBOL_GPL(ip6_route_lookup); 1272 1273 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, 1274 const struct in6_addr *saddr, int oif, 1275 const struct sk_buff *skb, int strict) 1276 { 1277 struct flowi6 fl6 = { 1278 .flowi6_oif = oif, 1279 .daddr = *daddr, 1280 }; 1281 struct dst_entry *dst; 1282 int flags = strict ? 
RT6_LOOKUP_F_IFACE : 0; 1283 1284 if (saddr) { 1285 memcpy(&fl6.saddr, saddr, sizeof(*saddr)); 1286 flags |= RT6_LOOKUP_F_HAS_SADDR; 1287 } 1288 1289 dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup); 1290 if (dst->error == 0) 1291 return dst_rt6_info(dst); 1292 1293 dst_release(dst); 1294 1295 return NULL; 1296 } 1297 EXPORT_SYMBOL(rt6_lookup); 1298 1299 /* ip6_ins_rt is called with FREE table->tb6_lock. 1300 * It takes new route entry, the addition fails by any reason the 1301 * route is released. 1302 * Caller must hold dst before calling it. 1303 */ 1304 1305 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info, 1306 struct netlink_ext_ack *extack) 1307 { 1308 int err; 1309 struct fib6_table *table; 1310 1311 table = rt->fib6_table; 1312 spin_lock_bh(&table->tb6_lock); 1313 err = fib6_add(&table->tb6_root, rt, info, extack); 1314 spin_unlock_bh(&table->tb6_lock); 1315 1316 return err; 1317 } 1318 1319 int ip6_ins_rt(struct net *net, struct fib6_info *rt) 1320 { 1321 struct nl_info info = { .nl_net = net, }; 1322 1323 return __ip6_ins_rt(rt, &info, NULL); 1324 } 1325 1326 static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res, 1327 const struct in6_addr *daddr, 1328 const struct in6_addr *saddr) 1329 { 1330 struct fib6_info *f6i = res->f6i; 1331 struct net_device *dev; 1332 struct rt6_info *rt; 1333 1334 /* 1335 * Clone the route. 
1336 */ 1337 1338 if (!fib6_info_hold_safe(f6i)) 1339 return NULL; 1340 1341 dev = ip6_rt_get_dev_rcu(res); 1342 rt = ip6_dst_alloc(dev_net(dev), dev, 0); 1343 if (!rt) { 1344 fib6_info_release(f6i); 1345 return NULL; 1346 } 1347 1348 ip6_rt_copy_init(rt, res); 1349 rt->rt6i_flags |= RTF_CACHE; 1350 rt->rt6i_dst.addr = *daddr; 1351 rt->rt6i_dst.plen = 128; 1352 1353 if (!rt6_is_gw_or_nonexthop(res)) { 1354 if (f6i->fib6_dst.plen != 128 && 1355 ipv6_addr_equal(&f6i->fib6_dst.addr, daddr)) 1356 rt->rt6i_flags |= RTF_ANYCAST; 1357 #ifdef CONFIG_IPV6_SUBTREES 1358 if (rt->rt6i_src.plen && saddr) { 1359 rt->rt6i_src.addr = *saddr; 1360 rt->rt6i_src.plen = 128; 1361 } 1362 #endif 1363 } 1364 1365 return rt; 1366 } 1367 1368 static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res) 1369 { 1370 struct fib6_info *f6i = res->f6i; 1371 unsigned short flags = fib6_info_dst_flags(f6i); 1372 struct net_device *dev; 1373 struct rt6_info *pcpu_rt; 1374 1375 if (!fib6_info_hold_safe(f6i)) 1376 return NULL; 1377 1378 rcu_read_lock(); 1379 dev = ip6_rt_get_dev_rcu(res); 1380 pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags | DST_NOCOUNT); 1381 rcu_read_unlock(); 1382 if (!pcpu_rt) { 1383 fib6_info_release(f6i); 1384 return NULL; 1385 } 1386 ip6_rt_copy_init(pcpu_rt, res); 1387 pcpu_rt->rt6i_flags |= RTF_PCPU; 1388 1389 if (f6i->nh) 1390 pcpu_rt->sernum = rt_genid_ipv6(dev_net(dev)); 1391 1392 return pcpu_rt; 1393 } 1394 1395 static bool rt6_is_valid(const struct rt6_info *rt6) 1396 { 1397 return rt6->sernum == rt_genid_ipv6(dev_net(rt6->dst.dev)); 1398 } 1399 1400 /* It should be called with rcu_read_lock() acquired */ 1401 static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res) 1402 { 1403 struct rt6_info *pcpu_rt; 1404 1405 pcpu_rt = this_cpu_read(*res->nh->rt6i_pcpu); 1406 1407 if (pcpu_rt && pcpu_rt->sernum && !rt6_is_valid(pcpu_rt)) { 1408 struct rt6_info *prev, **p; 1409 1410 p = this_cpu_ptr(res->nh->rt6i_pcpu); 1411 prev = xchg(p, NULL); 1412 
if (prev) { 1413 dst_dev_put(&prev->dst); 1414 dst_release(&prev->dst); 1415 } 1416 1417 pcpu_rt = NULL; 1418 } 1419 1420 return pcpu_rt; 1421 } 1422 1423 static struct rt6_info *rt6_make_pcpu_route(struct net *net, 1424 const struct fib6_result *res) 1425 { 1426 struct rt6_info *pcpu_rt, *prev, **p; 1427 1428 pcpu_rt = ip6_rt_pcpu_alloc(res); 1429 if (!pcpu_rt) 1430 return NULL; 1431 1432 p = this_cpu_ptr(res->nh->rt6i_pcpu); 1433 prev = cmpxchg(p, NULL, pcpu_rt); 1434 BUG_ON(prev); 1435 1436 if (res->f6i->fib6_destroying) { 1437 struct fib6_info *from; 1438 1439 from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL); 1440 fib6_info_release(from); 1441 } 1442 1443 return pcpu_rt; 1444 } 1445 1446 /* exception hash table implementation 1447 */ 1448 static DEFINE_SPINLOCK(rt6_exception_lock); 1449 1450 /* Remove rt6_ex from hash table and free the memory 1451 * Caller must hold rt6_exception_lock 1452 */ 1453 static void rt6_remove_exception(struct rt6_exception_bucket *bucket, 1454 struct rt6_exception *rt6_ex) 1455 { 1456 struct fib6_info *from; 1457 struct net *net; 1458 1459 if (!bucket || !rt6_ex) 1460 return; 1461 1462 net = dev_net(rt6_ex->rt6i->dst.dev); 1463 net->ipv6.rt6_stats->fib_rt_cache--; 1464 1465 /* purge completely the exception to allow releasing the held resources: 1466 * some [sk] cache may keep the dst around for unlimited time 1467 */ 1468 from = xchg((__force struct fib6_info **)&rt6_ex->rt6i->from, NULL); 1469 fib6_info_release(from); 1470 dst_dev_put(&rt6_ex->rt6i->dst); 1471 1472 hlist_del_rcu(&rt6_ex->hlist); 1473 dst_release(&rt6_ex->rt6i->dst); 1474 kfree_rcu(rt6_ex, rcu); 1475 WARN_ON_ONCE(!bucket->depth); 1476 bucket->depth--; 1477 } 1478 1479 /* Remove oldest rt6_ex in bucket and free the memory 1480 * Caller must hold rt6_exception_lock 1481 */ 1482 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket) 1483 { 1484 struct rt6_exception *rt6_ex, *oldest = NULL; 1485 1486 if (!bucket) 1487 return; 1488 
1489 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { 1490 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp)) 1491 oldest = rt6_ex; 1492 } 1493 rt6_remove_exception(bucket, oldest); 1494 } 1495 1496 static u32 rt6_exception_hash(const struct in6_addr *dst, 1497 const struct in6_addr *src) 1498 { 1499 static siphash_aligned_key_t rt6_exception_key; 1500 struct { 1501 struct in6_addr dst; 1502 struct in6_addr src; 1503 } __aligned(SIPHASH_ALIGNMENT) combined = { 1504 .dst = *dst, 1505 }; 1506 u64 val; 1507 1508 net_get_random_once(&rt6_exception_key, sizeof(rt6_exception_key)); 1509 1510 #ifdef CONFIG_IPV6_SUBTREES 1511 if (src) 1512 combined.src = *src; 1513 #endif 1514 val = siphash(&combined, sizeof(combined), &rt6_exception_key); 1515 1516 return hash_64(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT); 1517 } 1518 1519 /* Helper function to find the cached rt in the hash table 1520 * and update bucket pointer to point to the bucket for this 1521 * (daddr, saddr) pair 1522 * Caller must hold rt6_exception_lock 1523 */ 1524 static struct rt6_exception * 1525 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket, 1526 const struct in6_addr *daddr, 1527 const struct in6_addr *saddr) 1528 { 1529 struct rt6_exception *rt6_ex; 1530 u32 hval; 1531 1532 if (!(*bucket) || !daddr) 1533 return NULL; 1534 1535 hval = rt6_exception_hash(daddr, saddr); 1536 *bucket += hval; 1537 1538 hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) { 1539 struct rt6_info *rt6 = rt6_ex->rt6i; 1540 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr); 1541 1542 #ifdef CONFIG_IPV6_SUBTREES 1543 if (matched && saddr) 1544 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr); 1545 #endif 1546 if (matched) 1547 return rt6_ex; 1548 } 1549 return NULL; 1550 } 1551 1552 /* Helper function to find the cached rt in the hash table 1553 * and update bucket pointer to point to the bucket for this 1554 * (daddr, saddr) pair 1555 * Caller must hold rcu_read_lock() 1556 */ 1557 
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	/* nothing to search: *bucket is left untouched in this case */
	if (!(*bucket) || !daddr)
		return NULL;

	/* advance *bucket to the slot selected by the hash; it stays there
	 * even when no entry matches
	 */
	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

/* Return the MTU to use for the route in @res: the fib6_info's
 * fib6_pmtu when non-zero, otherwise the nexthop device's cnf.mtu6.
 * The value is capped at IP6_MAX_MTU and then reduced by the lwtunnel
 * headroom of the nexthop.
 */
static unsigned int fib6_mtu(const struct fib6_result *res)
{
	const struct fib6_nh *nh = res->nh;
	unsigned int mtu;

	if (res->f6i->fib6_pmtu) {
		mtu = res->f6i->fib6_pmtu;
	} else {
		struct net_device *dev = nh->fib_nh_dev;
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		mtu = READ_ONCE(idev->cnf.mtu6);
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
}

/* Stored in the low bit of the nh->rt6i_exception_bucket pointer value */
#define FIB6_EXCEPTION_BUCKET_FLUSHED  0x1UL

/* used when the flushed bit is not relevant, only access to the bucket
 * (ie., all bucket users except rt6_insert_exception);
 *
 * called under rcu lock; sometimes called with rt6_exception_lock held
 *
 * A non-NULL @lock selects rcu_dereference_protected() with a lockdep
 * check on @lock; a NULL @lock selects plain rcu_dereference().
 * Returns the bucket array pointer with the flushed bit masked off, or
 * NULL when no bucket is set.
 */
static
struct rt6_exception_bucket *fib6_nh_get_excptn_bucket(const struct fib6_nh *nh,
						       spinlock_t *lock)
{
	struct rt6_exception_bucket *bucket;

	if (lock)
		bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
						   lockdep_is_held(lock));
	else
		bucket = rcu_dereference(nh->rt6i_exception_bucket);

	/* remove bucket flushed bit if set */
	if (bucket) {
		unsigned long p = (unsigned long)bucket;

		p &= ~FIB6_EXCEPTION_BUCKET_FLUSHED;
		bucket = (struct rt6_exception_bucket *)p;
	}

	return bucket;
}

/* True when the raw (unmasked) bucket pointer carries the flushed bit */
static bool fib6_nh_excptn_bucket_flushed(struct rt6_exception_bucket *bucket)
{
	unsigned long p = (unsigned long)bucket;

	return !!(p & FIB6_EXCEPTION_BUCKET_FLUSHED);
}

/* Set the flushed bit in nh->rt6i_exception_bucket and publish the
 * tagged pointer.
 *
 * called with rt6_exception_lock held
 */
static void fib6_nh_excptn_bucket_set_flushed(struct fib6_nh *nh,
					      spinlock_t *lock)
{
	struct rt6_exception_bucket *bucket;
	unsigned long p;

	bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
					   lockdep_is_held(lock));

	p = (unsigned long)bucket;
	p |= FIB6_EXCEPTION_BUCKET_FLUSHED;
	bucket = (struct rt6_exception_bucket *)p;
	rcu_assign_pointer(nh->rt6i_exception_bucket, bucket);
}

/* Insert the cached route @nrt into the exception table of the nexthop
 * in @res, allocating the bucket array on first use and replacing any
 * existing entry for the same key.
 *
 * Returns 0 on success, -ENOMEM when the bucket array or the entry
 * cannot be allocated, and -EINVAL when the bucket has been marked
 * flushed or when @nrt's raw MTU metric is not below fib6_mtu(@res).
 * On success the sernum of res->f6i is updated under its table lock
 * and fib6_force_start_gc() is called.
 */
static int rt6_insert_exception(struct rt6_info *nrt,
				const struct fib6_result *res)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct fib6_info *f6i = res->f6i;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_nh *nh = res->nh;
	int max_depth;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	/* read the raw pointer here (not via fib6_nh_get_excptn_bucket())
	 * so that the flushed bit is still visible to the check below
	 */
	bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
					  lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(nh->rt6i_exception_bucket, bucket);
	} else if (fib6_nh_excptn_bucket_flushed(bucket)) {
		err = -EINVAL;
		goto out;
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* fib6_src.plen != 0 indicates f6i is in subtree
	 * and exception table is indexed by a hash of
	 * both fib6_dst and fib6_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only fib6_dst.
	 */
	if (f6i->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif
	/* rt6_mtu_change() might lower mtu on f6i.
	 * Only insert this exception route if its mtu
	 * is less than f6i's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(res)) {
		err = -EINVAL;
		goto out;
	}

	/* the lookup also advances 'bucket' to the slot for this key */
	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	/* Randomize max depth to avoid some side-channel attacks. */
	max_depth = FIB6_MAX_DEPTH + get_random_u32_below(FIB6_MAX_DEPTH);
	while (bucket->depth > max_depth)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&f6i->fib6_table->tb6_lock);
		fib6_update_sernum(net, f6i);
		spin_unlock_bh(&f6i->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}

/* Remove exception entries from @nh under rt6_exception_lock.
 *
 * With @from == NULL every entry is removed and the bucket is marked
 * flushed. With a non-NULL @from only the entries whose cached route
 * originates from @from are removed, and the bucket is not marked.
 */
static void fib6_nh_flush_exceptions(struct fib6_nh *nh, struct fib6_info *from)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);

	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
	if (!bucket)
		goto out;

	/* Prevent rt6_insert_exception() to recreate the bucket list */
	if (!from)
		fib6_nh_excptn_bucket_set_flushed(nh, &rt6_exception_lock);

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) {
			if (!from ||
			    rcu_access_pointer(rt6_ex->rt6i->from) == from)
				rt6_remove_exception(bucket, rt6_ex);
		}
		/* a full flush must leave every slot empty */
		WARN_ON_ONCE(!from && bucket->depth);
		bucket++;
	}
out:
	spin_unlock_bh(&rt6_exception_lock);
}

/* nexthop_for_each_fib6_nh() callback: @arg is the fib6_info whose
 * exceptions are to be flushed. Always returns 0.
 */
static int rt6_nh_flush_exceptions(struct fib6_nh *nh, void *arg)
{
	struct fib6_info *f6i = arg;

	fib6_nh_flush_exceptions(nh, f6i);

	return 0;
}

/* Flush the exceptions that originate from @f6i, on every fib6_nh of
 * its nexthop object when it has one, otherwise on f6i->fib6_nh.
 */
void rt6_flush_exceptions(struct fib6_info *f6i)
{
	if (f6i->nh)
		nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_flush_exceptions,
					 f6i);
	else
		fib6_nh_flush_exceptions(f6i->fib6_nh, f6i);
}

/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 *
 * Returns the cached route for (daddr, saddr) on the nexthop in @res,
 * or NULL when there is none or the entry found has expired.
 */
static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	const struct in6_addr *src_key = NULL;
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct rt6_info *ret = NULL;

#ifdef CONFIG_IPV6_SUBTREES
	/* fib6_src.plen != 0 indicates f6i is in subtree
	 * and exception table is indexed by a hash of
	 * both fib6_dst and fib6_src.
	 * However, the src addr used to create the hash
	 * might not be exactly the passed in saddr which
	 * is a /128 addr from the flow.
	 * So we need to use f6i->fib6_src to redo lookup
	 * if the passed in saddr does not find anything.
	 * (See the logic in ip6_rt_cache_alloc() on how
	 * rt->rt6i_src is updated.)
	 */
	if (res->f6i->fib6_src.plen)
		src_key = saddr;
find_ex:
#endif
	bucket = fib6_nh_get_excptn_bucket(res->nh, NULL);
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		ret = rt6_ex->rt6i;

#ifdef CONFIG_IPV6_SUBTREES
	/* Use fib6_src as src_key and redo lookup */
	if (!ret && src_key && src_key != &res->f6i->fib6_src.addr) {
		src_key = &res->f6i->fib6_src.addr;
		goto find_ex;
	}
#endif

	return ret;
}

/* Remove the passed in cached rt from the hash table that contains it
 *
 * @plen is the source prefix length of the route's origin; a non-zero
 * value makes the source address part of the lookup key (only with
 * CONFIG_IPV6_SUBTREES).
 *
 * Returns 0 when an entry was removed, -ENOENT when @nh has no bucket
 * or no matching entry.
 */
static int fib6_nh_remove_exception(const struct fib6_nh *nh, int plen,
				    const struct rt6_info *rt)
{
	const struct in6_addr *src_key = NULL;
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int err;

	/* unlocked pre-check so that the lock is not taken when no bucket
	 * is set
	 */
	if (!rcu_access_pointer(nh->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}

/* Argument for rt6_nh_remove_exception_rt(): the cached route to remove
 * and the source prefix length of its origin.
 */
struct fib6_nh_excptn_arg {
	struct rt6_info	*rt;
	int		plen;
};

/* nexthop_for_each_fib6_nh() callback: returns 1 when the exception was
 * found and removed on @nh, 0 otherwise.
 */
static int rt6_nh_remove_exception_rt(struct fib6_nh *nh, void *_arg)
{
	struct fib6_nh_excptn_arg *arg = _arg;
	int err;

	err = fib6_nh_remove_exception(nh, arg->plen, arg->rt);
	if (err == 0)
		return 1;

	return 0;
}

/* Remove the cached route @rt from the exception table of its origin.
 *
 * Returns -EINVAL when @rt has no origin or is not an RTF_CACHE route,
 * -ENOENT when no matching exception entry exists, 0 on success.
 * rt->from is read with rcu_dereference() here.
 */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);
	if (!from || !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (from->nh) {
		struct fib6_nh_excptn_arg arg = {
			.rt = rt,
			.plen = from->fib6_src.plen
		};
		int rc;

		/* rc = 1 means an entry was found */
		rc = nexthop_for_each_fib6_nh(from->nh,
					      rt6_nh_remove_exception_rt,
					      &arg);
		return rc ? 0 : -ENOENT;
	}

	return fib6_nh_remove_exception(from->fib6_nh,
					from->fib6_src.plen, rt);
}

/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 *
 * The bucket is read with plain rcu_dereference() and the stamp is
 * written without taking rt6_exception_lock here.
 */
static void fib6_nh_update_exception(const struct fib6_nh *nh, int plen,
				     const struct rt6_info *rt)
{
	const struct in6_addr *src_key = NULL;
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;

	bucket = fib6_nh_get_excptn_bucket(nh, NULL);
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, &rt->rt6i_dst.addr, src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;
}

/* Argument for fib6_nh_find_match(): @dev and @gw are the search key,
 * @match receives the fib6_nh found (it is only written on a match).
 */
struct fib6_nh_match_arg {
	const struct net_device *dev;
	const struct in6_addr	*gw;
	struct fib6_nh		*match;
};

/* determine if fib6_nh has given device and gateway
 *
 * A NULL arg->gw only matches a nexthop without a gateway. Returns 1
 * (which stops the iteration) on a match, 0 otherwise.
 */
static int fib6_nh_find_match(struct fib6_nh *nh, void *_arg)
{
	struct fib6_nh_match_arg *arg = _arg;

	if (arg->dev != nh->fib_nh_dev ||
	    (arg->gw && !nh->fib_nh_gw_family) ||
	    (!arg->gw && nh->fib_nh_gw_family) ||
	    (arg->gw && !ipv6_addr_equal(arg->gw, &nh->fib_nh_gw6)))
		return 0;

	arg->match = nh;

	/* found a match, break the loop */
	return 1;
}

/* Refresh the stamp of the exception entry that holds the cached route
 * @rt. Does nothing when @rt has no origin, is not RTF_CACHE, or (for
 * nexthop objects) when no fib6_nh matches @rt's device and gateway.
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct fib6_info *from;
	struct fib6_nh *fib6_nh;

	rcu_read_lock();

	from = rcu_dereference(rt->from);
	if (!from || !(rt->rt6i_flags & RTF_CACHE))
		goto unlock;

	if (from->nh) {
		struct fib6_nh_match_arg arg = {
			.dev = rt->dst.dev,
			.gw = &rt->rt6i_gateway,
		};

		nexthop_for_each_fib6_nh(from->nh, fib6_nh_find_match, &arg);

		if (!arg.match)
			goto unlock;
		fib6_nh = arg.match;
	} else {
		fib6_nh = from->fib6_nh;
	}
	fib6_nh_update_exception(fib6_nh, from->fib6_src.plen, rt);
unlock:
	rcu_read_unlock();
}

/* Decide whether the PMTU of the cached route @rt may be set to @mtu */
static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
{
	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
	 *
	 * If the new MTU is higher, and the route PMTU is equal to the local
	 * MTU, this means the old MTU is the lowest in the path, so allow
	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
	 * handle this.
	 */

	if (dst_mtu(&rt->dst) >= mtu)
		return true;

	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
		return true;

	return false;
}

/* Set the MTU metric to @mtu on every exception route of @nh that has
 * a non-zero raw MTU metric and for which
 * rt6_mtu_change_route_allowed() agrees.
 *
 * No lock is taken here; the bucket is dereferenced with a lockdep
 * check on rt6_exception_lock, so the caller is expected to hold it.
 */
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       const struct fib6_nh *nh, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}

#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

/* Remove every exception route of @nh that has both RTF_GATEWAY and
 * RTF_CACHE set and whose gateway address equals @gateway.
 */
static void fib6_nh_exceptions_clean_tohost(const struct fib6_nh *nh,
					    const struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* unlocked pre-check: nothing to do when no bucket is set */
	if (!rcu_access_pointer(nh->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}

/* Examine one exception entry during garbage collection: remove it
 * when it has aged out, expired, or points at a gateway that is not a
 * known router; otherwise count it in gc_args->more.
 */
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others have still references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			pr_debug("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		pr_debug("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);

		/* no neighbour entry, or one without NTF_ROUTER */
		if (!(neigh && (neigh->flags & NTF_ROUTER))) {
			pr_debug("purging route %p via non-router but gateway\n",
				 rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	/* entry survives this pass */
	gc_args->more++;
}

/* Run rt6_age_examine_exception() on every exception entry of @nh,
 * holding rcu_read_lock_bh() and rt6_exception_lock.
 */
static void fib6_nh_age_exceptions(const struct fib6_nh *nh,
				   struct fib6_gc_args *gc_args,
				   unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* unlocked pre-check: nothing to do when no bucket is set */
	if (!rcu_access_pointer(nh->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}

/* Argument bundle for rt6_nh_age_exceptions() */
struct fib6_nh_age_excptn_arg {
	struct fib6_gc_args	*gc_args;
	unsigned long		now;
};

/* nexthop_for_each_fib6_nh() callback: age the exceptions of @nh.
 * Always returns 0.
 */
static int rt6_nh_age_exceptions(struct fib6_nh *nh, void *_arg)
{
	struct fib6_nh_age_excptn_arg *arg = _arg;

	fib6_nh_age_exceptions(nh, arg->gc_args, arg->now);
	return 0;
}
/* Age the exception routes attached to @f6i: on every fib6_nh of its
 * nexthop object when it has one, otherwise on f6i->fib6_nh.
 */
void rt6_age_exceptions(struct fib6_info *f6i,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	if (f6i->nh) {
		struct fib6_nh_age_excptn_arg arg = {
			.gc_args = gc_args,
			.now = now
		};

		nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_age_exceptions,
					 &arg);
	} else {
		fib6_nh_age_exceptions(f6i->fib6_nh, gc_args, now);
	}
}

/* Look up @fl6 in @table and store the selected route in @res.
 *
 * While the selection yields the fib6_null_entry the lookup backtracks
 * towards the root. If that is exhausted and @strict contains
 * RT6_LOOKUP_F_REACHABLE, the search restarts once from the initially
 * found node with that flag cleared. Always returns 0; the outcome is
 * reported through @res.
 *
 * must be called with rcu lock held
 */
int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif,
		      struct flowi6 *fl6, struct fib6_result *res, int strict)
{
	struct fib6_node *fn, *saved_fn;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

redo_rt6_select:
	rt6_select(net, fn, oif, res, strict);
	if (res->f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, res, table, fl6);

	return 0;
}

/* Resolve @fl6 in @table to a dst.
 *
 * The result is, in order of preference: a matching exception (cached)
 * route; for FLOWI_FLAG_KNOWN_NH flows over a nexthop without gateway,
 * a new RTF_CACHE clone placed on the uncached list; otherwise this
 * CPU's per-cpu copy, created on demand. ip6_null_entry is returned
 * when nothing is found or an allocation fails.
 *
 * Unless RT6_LOOKUP_F_DST_NOREF is set in @flags a reference is taken
 * on the returned route. The uncached clone is always returned with
 * its reference (see the comment in the body).
 */
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_result res = {};
	struct rt6_info *rt = NULL;
	int strict = 0;

	WARN_ON_ONCE((flags & RT6_LOOKUP_F_DST_NOREF) &&
		     !rcu_read_lock_held());

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	/* when forwarding is disabled for the namespace, ask for a
	 * reachable route first
	 */
	if (READ_ONCE(net->ipv6.devconf_all->forwarding) == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	fib6_table_lookup(net, table, oif, fl6, &res, strict);
	if (res.f6i == net->ipv6.fib6_null_entry)
		goto out;

	fib6_select_path(net, &res, fl6, oif, false, skb, strict);

	/* Search through exception table */
	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
	if (rt) {
		goto out;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !res.nh->fib_nh_gw_family)) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);

		if (rt) {
			/* 1 refcnt is taken during ip6_rt_cache_alloc().
			 * As rt6_uncached_list_add() does not consume refcnt,
			 * this refcnt is always returned to the caller even
			 * if caller sets RT6_LOOKUP_F_DST_NOREF flag.
			 */
			rt6_uncached_list_add(rt);
			rcu_read_unlock();

			return rt;
		}
	} else {
		/* Get a percpu copy */
		local_bh_disable();
		rt = rt6_get_pcpu_route(&res);

		if (!rt)
			rt = rt6_make_pcpu_route(net, &res);

		local_bh_enable();
	}
out:
	if (!rt)
		rt = net->ipv6.ip6_null_entry;
	if (!(flags & RT6_LOOKUP_F_DST_NOREF))
		ip6_hold_safe(net, &rt);
	rcu_read_unlock();

	return rt;
}
EXPORT_SYMBOL_GPL(ip6_pol_route);

/* Per-table lookup callback for the input path: ip6_pol_route() with
 * the flow's input interface (flowi6_iif) passed as oif.
 */
INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
}

/* Input route lookup through fib6_rule_lookup(). RT6_LOOKUP_F_IFACE is
 * added when rt6_need_strict() holds for the destination and @dev is
 * not of type ARPHRD_PIMREG.
 */
struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6,
					 const struct sk_buff *skb,
					 int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);

/* Fill the L3 fields of @keys (addresses, flow label, protocol) for
 * multipath hashing.
 *
 * For an ICMPv6 error message whose embedded IPv6 header can be read,
 * the fields come from that inner header and @flkeys is ignored.
 * In every other case they come from @flkeys when it is non-NULL, and
 * from the outer IPv6 header otherwise.
 */
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	if (!icmpv6_is_err(icmph->icmp6_type))
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	/* hash on the embedded header; pre-dissected keys do not apply */
	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowlabel(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}

/* Hash the outer-header fields of @skb that are selected by the
 * namespace's multipath hash field mask.
 *
 * Returns 0 without dissecting, and without writing *@p_has_inner,
 * when no outer field is selected. Otherwise *@p_has_inner reports
 * whether the dissector (stopping at encapsulation) saw an
 * encapsulation.
 */
static u32 rt6_multipath_custom_hash_outer(const struct net *net,
					   const struct sk_buff *skb,
					   bool *p_has_inner)
{
	u32 hash_fields = ip6_multipath_hash_fields(net);
	struct flow_keys keys, hash_keys;

	if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
		return 0;

	memset(&hash_keys, 0, sizeof(hash_keys));
	skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_ENCAP);

	hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
		hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
		hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
		hash_keys.basic.ip_proto = keys.basic.ip_proto;
	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL)
		hash_keys.tags.flow_label = keys.tags.flow_label;
	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
		hash_keys.ports.src = keys.ports.src;
	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
		hash_keys.ports.dst = keys.ports.dst;

	*p_has_inner = !!(keys.control.flags & FLOW_DIS_ENCAPSULATION);
	return flow_hash_from_keys(&hash_keys);
}

/* Hash the inner-header fields of @skb that are selected by the
 * namespace's multipath hash field mask.
 *
 * Returns 0 when @has_inner is false, when no inner field is selected,
 * or when a full dissection finds no encapsulation. The inner
 * addresses may be IPv4 or IPv6; the flow label is only used for IPv6.
 */
static u32 rt6_multipath_custom_hash_inner(const struct net *net,
					   const struct sk_buff *skb,
					   bool has_inner)
{
	u32 hash_fields = ip6_multipath_hash_fields(net);
	struct flow_keys keys, hash_keys;

	/* We assume the packet carries an encapsulation, but if none was
	 * encountered during dissection of the outer flow, then there is no
	 * point in calling the flow dissector again.
	 */
	if (!has_inner)
		return 0;

	if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_MASK))
		return 0;

	memset(&hash_keys, 0, sizeof(hash_keys));
	skb_flow_dissect_flow_keys(skb, &keys, 0);

	if (!(keys.control.flags & FLOW_DIS_ENCAPSULATION))
		return 0;

	if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
			hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
			hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
	} else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
			hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
			hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL)
			hash_keys.tags.flow_label = keys.tags.flow_label;
	}

	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO)
		hash_keys.basic.ip_proto = keys.basic.ip_proto;
	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT)
		hash_keys.ports.src = keys.ports.src;
	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT)
		hash_keys.ports.dst = keys.ports.dst;

	return flow_hash_from_keys(&hash_keys);
}

/* Combine the outer and inner custom hashes of @skb with
 * jhash_2words().
 */
static u32 rt6_multipath_custom_hash_skb(const struct net *net,
					 const struct sk_buff *skb)
{
	u32 mhash, mhash_inner;
	/* stays true when the outer pass returns early, so the inner pass
	 * still dissects the packet
	 */
	bool has_inner = true;

	mhash = rt6_multipath_custom_hash_outer(net, skb, &has_inner);
	mhash_inner = rt6_multipath_custom_hash_inner(net, skb, has_inner);

	return jhash_2words(mhash, mhash_inner, 0);
}

static u32
/* Custom-field multipath hash computed from a flow description instead of
 * a packet.  Only the outer fields enabled in the per-netns hash field
 * mask are copied from @fl6 into the zeroed key set; returns 0 when no
 * outer field is enabled.
 */
static u32 rt6_multipath_custom_hash_fl6(const struct net *net,
					 const struct flowi6 *fl6)
{
	u32 hash_fields = ip6_multipath_hash_fields(net);
	struct flow_keys hash_keys;

	if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
		return 0;

	memset(&hash_keys, 0, sizeof(hash_keys));
	hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
		hash_keys.addrs.v6addrs.src = fl6->saddr;
	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
		hash_keys.addrs.v6addrs.dst = fl6->daddr;
	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
		hash_keys.basic.ip_proto = fl6->flowi6_proto;
	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL)
		hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
		hash_keys.ports.src = fl6->fl6_sport;
	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
		hash_keys.ports.dst = fl6->fl6_dport;

	return flow_hash_from_keys(&hash_keys);
}

/* Compute the multipath hash for a flow, according to the value returned
 * by ip6_multipath_hash_policy():
 *   0 - addresses, flow label and protocol (L3 keys)
 *   1 - addresses, ports and protocol (5-tuple style keys)
 *   2 - keys taken from a full dissection (v4 or v6 addresses), falling
 *       back to the policy 0 keys
 *   3 - custom field selection, see rt6_multipath_custom_hash_*()
 * Any other policy value leaves the hash at 0.
 *
 * if skb is set it will be used and fl6 can be NULL
 *
 * @flkeys, when non-NULL, supplies already-dissected keys so the packet is
 * not dissected again.  The result is shifted right by one bit before
 * being returned.
 */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash = 0;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		mhash = flow_hash_from_keys(&hash_keys);
		break;
	case 1:
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			/* Dissect only when the caller did not hand in keys. */
			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		mhash = flow_hash_from_keys(&hash_keys);
		break;
	case 2:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			struct flow_keys keys;

			/* Full dissection (no stop-at-encap flag) when the
			 * caller did not hand in keys.
			 */
			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, 0);
				flkeys = &keys;
			}

			/* Inner can be v4 or v6 */
			if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
				hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
				hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
			} else if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
				hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
				hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
				hash_keys.tags.flow_label = flkeys->tags.flow_label;
				hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
			} else {
				/* Same as case 0 */
				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
				ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
			}
		} else {
			/* Same as case 0 */
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		mhash = flow_hash_from_keys(&hash_keys);
		break;
	case 3:
		if (skb)
			mhash = rt6_multipath_custom_hash_skb(net, skb);
		else
			mhash = rt6_multipath_custom_hash_fl6(net, fl6);
		break;
	}

	return mhash >> 1;
}

/* Route a received packet: build a flow from its IPv6 header, ingress
 * device, mark and (for RX tunnel metadata) tunnel id, then attach the
 * looked-up dst to the skb without taking a reference.
 *
 * Called with rcu held
 */
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_DST_NOREF;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	/* Only tunnel metadata that is not flagged as TX contributes a key. */
	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	/* Reuse the early dissection result, if one was produced. */
	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	/* The multipath hash is precomputed here only for ICMPv6 packets. */
	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
	skb_dst_drop(skb);
	skb_dst_set_noref(skb, ip6_route_input_lookup(net, skb->dev,
						      &fl6, skb, flags));
}
/* Per-table lookup callback for the output path: ip6_pol_route() keyed on
 * the flow's output interface.
 */
INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_output(struct net *net,
					       struct fib6_table *table,
					       struct flowi6 *fl6,
					       const struct sk_buff *skb,
					       int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
}

/* Output route lookup that requests a dst without a reference
 * (RT6_LOOKUP_F_DST_NOREF is added to @flags).
 *
 * Multicast and link-local destinations are first offered to
 * l3mdev_link_scope_lookup(); a hit is returned directly.  Otherwise the
 * flow's input interface is set to loopback, lookup flags are derived from
 * the socket binding, the destination and the source address, and the
 * policy-rule lookup is run with ip6_pol_route_output().
 */
static struct dst_entry *ip6_route_output_flags_noref(struct net *net,
						      const struct sock *sk,
						      struct flowi6 *fl6,
						      int flags)
{
	bool any_src;

	if (ipv6_addr_type(&fl6->daddr) &
	    (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
		struct dst_entry *dst;

		/* This function does not take refcnt on the dst */
		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	flags |= RT6_LOOKUP_F_DST_NOREF;
	any_src = ipv6_addr_any(&fl6->saddr);
	/* Interface-strict lookup for bound sockets, for destinations that
	 * rt6_need_strict() flags, and for an oif given without a source.
	 */
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		flags |= rt6_srcprefs2flags(READ_ONCE(inet6_sk(sk)->srcprefs));

	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}

/* Output route lookup returning a dst the caller holds a reference on.
 *
 * Wraps the noref lookup in an RCU read-side section.  If the dst is not
 * on the uncached list and a reference cannot be taken safely, the
 * per-netns null entry is returned (with a reference) instead.
 */
struct dst_entry *ip6_route_output_flags(struct net *net,
					 const struct sock *sk,
					 struct flowi6 *fl6,
					 int flags)
{
	struct dst_entry *dst;
	struct rt6_info *rt6;

	rcu_read_lock();
	dst = ip6_route_output_flags_noref(net, sk, fl6, flags);
	rt6 = dst_rt6_info(dst);
	/* For dst cached in uncached_list, refcnt is already taken. */
	if (list_empty(&rt6->dst.rt_uncached) && !dst_hold_safe(dst)) {
		dst = &net->ipv6.ip6_null_entry->dst;
		dst_hold(dst);
	}
	rcu_read_unlock();

	return dst;
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);

/* Build a discard-everything copy of @dst_orig on the loopback device.
 *
 * The new dst uses dst_discard/dst_discard_out as handlers and inherits
 * metrics, gateway, flags (minus RTF_PCPU) and the destination key (plus
 * the source key with CONFIG_IPV6_SUBTREES) from the original.
 *
 * The reference on @dst_orig is always released.  Returns the new dst, or
 * ERR_PTR(-ENOMEM) if allocation failed.
 */
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = dst_rt6_info(dst_orig);
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev,
		       DST_OBSOLETE_DEAD, 0);
	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

		new = &rt->dst;
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}
/*
 *	Destination cache support functions
 */

/* Return true when @f6i still carries @cookie and has not expired. */
static bool fib6_check(struct fib6_info *f6i, u32 cookie)
{
	u32 rt_cookie = 0;

	if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
		return false;

	if (fib6_check_expired(f6i))
		return false;

	return true;
}

/* Validate @rt against the fib entry it was created from.
 *
 * Returns &rt->dst when @from exists, its cookie equals @cookie and @rt
 * has not expired; NULL otherwise.
 */
static struct dst_entry *rt6_check(struct rt6_info *rt,
				   struct fib6_info *from,
				   u32 cookie)
{
	u32 rt_cookie = 0;

	if (!from || !fib6_get_cookie_safe(from, &rt_cookie) ||
	    rt_cookie != cookie)
		return NULL;

	if (rt6_check_expired(rt))
		return NULL;

	return &rt->dst;
}

/* Validation used by ip6_dst_check() for per-cpu and uncached dsts: @rt
 * itself must not be expired, must still be marked
 * DST_OBSOLETE_FORCE_CHK, and @from must pass fib6_check().
 */
static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
					    struct fib6_info *from,
					    u32 cookie)
{
	if (!__rt6_check_expired(rt) &&
	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
	    fib6_check(from, cookie))
		return &rt->dst;
	else
		return NULL;
}

/* dst_ops check hook: return @dst if it is still usable for @cookie,
 * NULL if the caller must look the route up again.
 *
 * Entries with a non-zero sernum are validated by rt6_is_valid() alone.
 * All others are validated against their originating fib entry under RCU.
 */
INDIRECT_CALLABLE_SCOPE struct dst_entry *ip6_dst_check(struct dst_entry *dst,
							u32 cookie)
{
	struct dst_entry *dst_ret;
	struct fib6_info *from;
	struct rt6_info *rt;

	rt = dst_rt6_info(dst);

	if (rt->sernum)
		return rt6_is_valid(rt) ? dst : NULL;

	rcu_read_lock();

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	from = rcu_dereference(rt->from);

	/* Per-cpu copies and dsts on the uncached list take the stricter
	 * rt6_dst_from_check() path; everything else uses rt6_check().
	 */
	if (from && (rt->rt6i_flags & RTF_PCPU ||
	    unlikely(!list_empty(&rt->dst.rt_uncached))))
		dst_ret = rt6_dst_from_check(rt, from, cookie);
	else
		dst_ret = rt6_check(rt, from, cookie);

	rcu_read_unlock();

	return dst_ret;
}
EXPORT_INDIRECT_CALLABLE(ip6_dst_check);

/* dst_ops negative_advice hook.
 *
 * For an RTF_CACHE entry: if it has expired, it is removed from the
 * exception table and NULL is returned; otherwise @dst is returned as is.
 * For any other entry the reference on @dst is dropped and NULL returned.
 */
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = dst_rt6_info(dst);

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			rcu_read_lock();
			if (rt6_check_expired(rt)) {
				rt6_remove_exception_rt(rt);
				dst = NULL;
			}
			rcu_read_unlock();
		} else {
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}

/* dst_ops link_failure hook: report the failure to the sender with an
 * ICMPv6 destination-unreachable (address unreachable) error, then act on
 * the route attached to @skb.
 *
 * An RTF_CACHE entry is removed from the exception table.  For other
 * entries flagged RTF_DEFAULT, the sernum of the fib node holding the
 * originating fib entry is overwritten with -1.
 * NOTE(review): presumably this forces dependent dsts to be revalidated -
 * confirm against the sernum users.
 */
static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = dst_rt6_info(skb_dst(skb));
	if (rt) {
		rcu_read_lock();
		if (rt->rt6i_flags & RTF_CACHE) {
			rt6_remove_exception_rt(rt);
		} else {
			struct fib6_info *from;
			struct fib6_node *fn;

			from = rcu_dereference(rt->from);
			if (from) {
				fn = rcu_dereference(from->fib6_node);
				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
					WRITE_ONCE(fn->fn_sernum, -1);
			}
		}
		rcu_read_unlock();
	}
}

/* Arm an expiry on @rt0 via dst_set_expires() with @timeout and mark the
 * route RTF_EXPIRES.
 *
 * If the route had no expiry of its own yet, dst.expires is first seeded
 * from the originating fib entry (when one is still attached).
 */
static void rt6_update_expires(struct rt6_info *rt0, int timeout)
{
	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
		struct fib6_info *from;

		rcu_read_lock();
		from = rcu_dereference(rt0->from);
		if (from)
			rt0->dst.expires = from->expires;
		rcu_read_unlock();
	}

	dst_set_expires(&rt0->dst, timeout);
	rt0->rt6i_flags |= RTF_EXPIRES;
}
/* Store @mtu as the route's RTAX_MTU metric, flag the route RTF_MODIFIED
 * and (re)arm its expiry using the per-netns ip6_rt_mtu_expires sysctl.
 */
static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
	struct net *net = dev_net(rt->dst.dev);

	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
	rt->rt6i_flags |= RTF_MODIFIED;
	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}

/* Return true when a PMTU update for @rt should be recorded in a new
 * exception (cache) entry: @rt is not itself an RTF_CACHE entry and is
 * either a per-cpu copy or still has an originating fib entry.
 */
static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
{
	return !(rt->rt6i_flags & RTF_CACHE) &&
		(rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from));
}

/* Apply a path MTU update to @dst.
 *
 * Addresses come from @iph when given, else from @sk, else are NULL.
 * With @confirm_neigh set the neighbour is confirmed first.  The update
 * is ignored when @mtu is below IPV6_MIN_MTU or not smaller than the
 * current dst MTU.
 *
 * When no new cache entry is allowed the MTU is written to @dst directly.
 * Otherwise, provided a destination address is known, a new exception
 * route carrying the MTU is allocated and inserted.
 */
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu,
				 bool confirm_neigh)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = dst_rt6_info(dst);

	/* Note: do *NOT* check dst_metric_locked(dst, RTAX_MTU)
	 * IPv6 pmtu discovery isn't optional, so 'mtu lock' cannot disable it.
	 * [see also comment in rt6_mtu_change_route()]
	 */

	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}

	if (confirm_neigh)
		dst_confirm_neigh(dst, daddr);

	if (mtu < IPV6_MIN_MTU)
		return;
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
		/* update rt6_ex->stamp for cache */
		if (rt6->rt6i_flags & RTF_CACHE)
			rt6_update_exception_stamp_rt(rt6);
	} else if (daddr) {
		struct fib6_result res = {};
		struct rt6_info *nrt6;

		rcu_read_lock();
		res.f6i = rcu_dereference(rt6->from);
		if (!res.f6i)
			goto out_unlock;

		res.fib6_flags = res.f6i->fib6_flags;
		res.fib6_type = res.f6i->fib6_type;

		if (res.f6i->nh) {
			/* Nexthop object: find the fib6_nh that matches this
			 * dst's device and gateway.
			 */
			struct fib6_nh_match_arg arg = {
				.dev = dst->dev,
				.gw = &rt6->rt6i_gateway,
			};

			nexthop_for_each_fib6_nh(res.f6i->nh,
						 fib6_nh_find_match, &arg);

			/* fib6_info uses a nexthop that does not have fib6_nh
			 * using the dst->dev + gw. Should be impossible.
			 */
			if (!arg.match)
				goto out_unlock;

			res.nh = arg.match;
		} else {
			res.nh = res.f6i->fib6_nh;
		}

		nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);
			/* A failed insert leaves us owning the new route. */
			if (rt6_insert_exception(nrt6, &res))
				dst_release_immediate(&nrt6->dst);
		}
out_unlock:
		rcu_read_unlock();
	}
}

/* dst_ops update_pmtu hook: forwards to __ip6_rt_update_pmtu() with the
 * IPv6 header of @skb, or NULL when no skb is supplied.
 */
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu,
			       bool confirm_neigh)
{
	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu,
			     confirm_neigh);
}

/* Update the path MTU for the flow described by the IPv6 header found at
 * skb->data.
 *
 * @mtu is in network byte order.  A zero @mark is replaced by
 * IP6_REPLY_MARK().  The route is looked up with ip6_route_output() and
 * updated only when the lookup did not yield an error dst; the looked-up
 * dst is always released.
 */
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
		     int oif, u32 mark, kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_uid = uid,
	};

	dst = ip6_route_output(net, NULL, &fl6);
	if (!dst->error)
		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu), true);
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);

/* Socket flavour of ip6_update_pmtu().
 *
 * The output interface is the socket's bound device, or else the l3mdev
 * master of skb->dev when the skb has a device.  After the update, if the
 * socket's cached dst is obsolete and fails its check, the datagram dst
 * is refreshed - unless the socket is owned by user context or its
 * destination is a v4-mapped address.
 */
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	int oif = sk->sk_bound_dev_if;
	struct dst_entry *dst;

	if (!oif && skb->dev)
		oif = l3mdev_master_ifindex(skb->dev);

	ip6_update_pmtu(skb, sock_net(sk), mtu, oif, READ_ONCE(sk->sk_mark),
			sk->sk_uid);

	/* Nothing more to do while the cached dst is absent or still valid. */
	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
/* Cache @dst on @sk via ip6_dst_store().
 *
 * The socket's own destination address is passed along only when it
 * equals the flow's destination, otherwise NULL.  The same is done for
 * the source address, but only with CONFIG_IPV6_SUBTREES; without it the
 * source argument is always NULL.
 */
void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
			   const struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_SUBTREES
	struct ipv6_pinfo *np = inet6_sk(sk);
#endif

	ip6_dst_store(sk, dst,
		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
		      &sk->sk_v6_daddr : NULL,
#ifdef CONFIG_IPV6_SUBTREES
		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
		      &np->saddr :
#endif
		      NULL);
}

/* Decide whether the nexthop in @res is the router a redirect came from.
 *
 * The nexthop must be alive, have a gateway and use the flow's output
 * interface.  If its gateway equals @gw this is a match.  Otherwise the
 * exception table is consulted: a cached route whose gateway equals @gw
 * is also a match and is handed back through @ret.
 */
static bool ip6_redirect_nh_match(const struct fib6_result *res,
				  struct flowi6 *fl6,
				  const struct in6_addr *gw,
				  struct rt6_info **ret)
{
	const struct fib6_nh *nh = res->nh;

	if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family ||
	    fl6->flowi6_oif != nh->fib_nh_dev->ifindex)
		return false;

	/* rt_cache's gateway might be different from its 'parent'
	 * in the case of an ip redirect.
	 * So we keep searching in the exception table if the gateway
	 * is different.
	 */
	if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) {
		struct rt6_info *rt_cache;

		rt_cache = rt6_find_cached_rt(res, &fl6->daddr, &fl6->saddr);
		if (rt_cache &&
		    ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) {
			*ret = rt_cache;
			return true;
		}
		return false;
	}
	return true;
}

/* Argument bundle for fib6_nh_redirect_match() when iterating the
 * fib6_nh entries of a nexthop object.
 */
struct fib6_nh_rd_arg {
	struct fib6_result	*res;
	struct flowi6		*fl6;
	const struct in6_addr	*gw;
	struct rt6_info		**ret;
};

/* nexthop_for_each_fib6_nh() callback: record @nh in the result and test
 * it with ip6_redirect_nh_match().  The match result is returned as the
 * callback's int return value.
 */
static int fib6_nh_redirect_match(struct fib6_nh *nh, void *_arg)
{
	struct fib6_nh_rd_arg	*arg = _arg;

	arg->res->nh = nh;
	return ip6_redirect_nh_match(arg->res, arg->fl6, arg->gw, arg->ret);
}

/* Handle redirects */
/* The flow must stay the first member: __ip6_route_redirect() casts the
 * struct flowi6 pointer it receives back to this type to reach @gateway.
 */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;
};

/* Per-table lookup callback used for redirect processing.
 *
 * @fl6 is really the fl6 member of a struct ip6rd_flowi.  Walks the routes
 * for the destination looking for a nexthop that matches the redirecting
 * gateway, backtracking through the tree when a node yields nothing.
 * Returns a held rt6_info: the matching cached route, the null entry for
 * a reject route, or a route freshly created from the selected fib entry.
 */
INDIRECT_CALLABLE_SCOPE struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *ret = NULL;
	struct fib6_result res = {};
	struct fib6_nh_rd_arg arg = {
		.res = &res,
		.fl6 = fl6,
		.gw  = &rdfl->gateway,
		.ret = &ret
	};
	struct fib6_info *rt;
	struct fib6_node *fn;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	for_each_fib6_node_rt_rcu(fn) {
		res.f6i = rt;
		if (fib6_check_expired(rt))
			continue;
		/* A reject route ends the walk; handled after the loop. */
		if (rt->fib6_flags & RTF_REJECT)
			break;
		if (unlikely(rt->nh)) {
			if (nexthop_is_blackhole(rt->nh))
				continue;
			/* on match, res->nh is filled in and potentially ret */
			if (nexthop_for_each_fib6_nh(rt->nh,
						     fib6_nh_redirect_match,
						     &arg))
				goto out;
		} else {
			res.nh = rt->fib6_nh;
			if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway,
						  &ret))
				goto out;
		}
	}

	if (!rt)
		rt = net->ipv6.fib6_null_entry;
	else if (rt->fib6_flags & RTF_REJECT) {
		ret = net->ipv6.ip6_null_entry;
		goto out;
	}

	/* Nothing usable at this node: try a less specific one. */
	if (rt == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	res.f6i = rt;
	res.nh = rt->fib6_nh;
out:
	if (ret) {
		ip6_hold_safe(net, &ret);
	} else {
		res.fib6_flags = res.f6i->fib6_flags;
		res.fib6_type = res.f6i->fib6_type;
		ret = ip6_create_rt_rcu(&res);
	}

	rcu_read_unlock();

	trace_fib6_table_lookup(net, &res, table, fl6);
	return ret;
};
return ret; 3130 }; 3131 3132 static struct dst_entry *ip6_route_redirect(struct net *net, 3133 const struct flowi6 *fl6, 3134 const struct sk_buff *skb, 3135 const struct in6_addr *gateway) 3136 { 3137 int flags = RT6_LOOKUP_F_HAS_SADDR; 3138 struct ip6rd_flowi rdfl; 3139 3140 rdfl.fl6 = *fl6; 3141 rdfl.gateway = *gateway; 3142 3143 return fib6_rule_lookup(net, &rdfl.fl6, skb, 3144 flags, __ip6_route_redirect); 3145 } 3146 3147 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, 3148 kuid_t uid) 3149 { 3150 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 3151 struct dst_entry *dst; 3152 struct flowi6 fl6 = { 3153 .flowi6_iif = LOOPBACK_IFINDEX, 3154 .flowi6_oif = oif, 3155 .flowi6_mark = mark, 3156 .daddr = iph->daddr, 3157 .saddr = iph->saddr, 3158 .flowlabel = ip6_flowinfo(iph), 3159 .flowi6_uid = uid, 3160 }; 3161 3162 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr); 3163 rt6_do_redirect(dst, NULL, skb); 3164 dst_release(dst); 3165 } 3166 EXPORT_SYMBOL_GPL(ip6_redirect); 3167 3168 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif) 3169 { 3170 const struct ipv6hdr *iph = ipv6_hdr(skb); 3171 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb); 3172 struct dst_entry *dst; 3173 struct flowi6 fl6 = { 3174 .flowi6_iif = LOOPBACK_IFINDEX, 3175 .flowi6_oif = oif, 3176 .daddr = msg->dest, 3177 .saddr = iph->daddr, 3178 .flowi6_uid = sock_net_uid(net, NULL), 3179 }; 3180 3181 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr); 3182 rt6_do_redirect(dst, NULL, skb); 3183 dst_release(dst); 3184 } 3185 3186 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) 3187 { 3188 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, 3189 READ_ONCE(sk->sk_mark), sk->sk_uid); 3190 } 3191 EXPORT_SYMBOL_GPL(ip6_sk_redirect); 3192 3193 static unsigned int ip6_default_advmss(const struct dst_entry *dst) 3194 { 3195 struct net_device *dev = dst->dev; 3196 unsigned int mtu = dst_mtu(dst); 3197 struct net 
*net = dev_net(dev); 3198 3199 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); 3200 3201 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) 3202 mtu = net->ipv6.sysctl.ip6_rt_min_advmss; 3203 3204 /* 3205 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 3206 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 3207 * IPV6_MAXPLEN is also valid and means: "any MSS, 3208 * rely only on pmtu discovery" 3209 */ 3210 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr)) 3211 mtu = IPV6_MAXPLEN; 3212 return mtu; 3213 } 3214 3215 INDIRECT_CALLABLE_SCOPE unsigned int ip6_mtu(const struct dst_entry *dst) 3216 { 3217 return ip6_dst_mtu_maybe_forward(dst, false); 3218 } 3219 EXPORT_INDIRECT_CALLABLE(ip6_mtu); 3220 3221 /* MTU selection: 3222 * 1. mtu on route is locked - use it 3223 * 2. mtu from nexthop exception 3224 * 3. mtu from egress device 3225 * 3226 * based on ip6_dst_mtu_forward and exception logic of 3227 * rt6_find_cached_rt; called with rcu_read_lock 3228 */ 3229 u32 ip6_mtu_from_fib6(const struct fib6_result *res, 3230 const struct in6_addr *daddr, 3231 const struct in6_addr *saddr) 3232 { 3233 const struct fib6_nh *nh = res->nh; 3234 struct fib6_info *f6i = res->f6i; 3235 struct inet6_dev *idev; 3236 struct rt6_info *rt; 3237 u32 mtu = 0; 3238 3239 if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) { 3240 mtu = f6i->fib6_pmtu; 3241 if (mtu) 3242 goto out; 3243 } 3244 3245 rt = rt6_find_cached_rt(res, daddr, saddr); 3246 if (unlikely(rt)) { 3247 mtu = dst_metric_raw(&rt->dst, RTAX_MTU); 3248 } else { 3249 struct net_device *dev = nh->fib_nh_dev; 3250 3251 mtu = IPV6_MIN_MTU; 3252 idev = __in6_dev_get(dev); 3253 if (idev) 3254 mtu = max_t(u32, mtu, READ_ONCE(idev->cnf.mtu6)); 3255 } 3256 3257 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 3258 out: 3259 return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu); 3260 } 3261 3262 struct dst_entry *icmp6_dst_alloc(struct net_device *dev, 3263 struct flowi6 *fl6) 3264 { 3265 struct dst_entry *dst; 3266 struct rt6_info *rt; 
3267 struct inet6_dev *idev = in6_dev_get(dev); 3268 struct net *net = dev_net(dev); 3269 3270 if (unlikely(!idev)) 3271 return ERR_PTR(-ENODEV); 3272 3273 rt = ip6_dst_alloc(net, dev, 0); 3274 if (unlikely(!rt)) { 3275 in6_dev_put(idev); 3276 dst = ERR_PTR(-ENOMEM); 3277 goto out; 3278 } 3279 3280 rt->dst.input = ip6_input; 3281 rt->dst.output = ip6_output; 3282 rt->rt6i_gateway = fl6->daddr; 3283 rt->rt6i_dst.addr = fl6->daddr; 3284 rt->rt6i_dst.plen = 128; 3285 rt->rt6i_idev = idev; 3286 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0); 3287 3288 /* Add this dst into uncached_list so that rt6_disable_ip() can 3289 * do proper release of the net_device 3290 */ 3291 rt6_uncached_list_add(rt); 3292 3293 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0); 3294 3295 out: 3296 return dst; 3297 } 3298 3299 static void ip6_dst_gc(struct dst_ops *ops) 3300 { 3301 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); 3302 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; 3303 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; 3304 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; 3305 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; 3306 unsigned int val; 3307 int entries; 3308 3309 if (time_after(rt_last_gc + rt_min_interval, jiffies)) 3310 goto out; 3311 3312 fib6_run_gc(atomic_inc_return(&net->ipv6.ip6_rt_gc_expire), net, true); 3313 entries = dst_entries_get_slow(ops); 3314 if (entries < ops->gc_thresh) 3315 atomic_set(&net->ipv6.ip6_rt_gc_expire, rt_gc_timeout >> 1); 3316 out: 3317 val = atomic_read(&net->ipv6.ip6_rt_gc_expire); 3318 atomic_set(&net->ipv6.ip6_rt_gc_expire, val - (val >> rt_elasticity)); 3319 } 3320 3321 static int ip6_nh_lookup_table(struct net *net, struct fib6_config *cfg, 3322 const struct in6_addr *gw_addr, u32 tbid, 3323 int flags, struct fib6_result *res) 3324 { 3325 struct flowi6 fl6 = { 3326 .flowi6_oif = cfg->fc_ifindex, 3327 .daddr = *gw_addr, 3328 .saddr = cfg->fc_prefsrc, 3329 }; 3330 
	struct fib6_table *table;
	int err;

	/* The table must already exist; it is not created here. */
	table = fib6_get_table(net, tbid);
	if (!table)
		return -EINVAL;

	if (!ipv6_addr_any(&cfg->fc_prefsrc))
		flags |= RT6_LOOKUP_F_HAS_SADDR;

	flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;

	err = fib6_table_lookup(net, table, cfg->fc_ifindex, &fl6, res, flags);
	/* Only pick a path when the lookup matched a real entry. */
	if (!err && res->f6i != net->ipv6.fib6_null_entry)
		fib6_select_path(net, res, &fl6, cfg->fc_ifindex,
				 cfg->fc_ifindex != 0, NULL, flags);

	return err;
}

/* Validate the gateway of an RTNH_F_ONLINK nexthop.
 *
 * The gateway is looked up in the l3mdev table of @dev (RT_TABLE_MAIN when
 * there is none). A successful lookup that hits a non-reject, non-default
 * route is an error (-EINVAL, with an extack message) when that route is
 * not RTN_UNICAST or uses a device other than @dev. Otherwise the lookup
 * result is returned unchanged.
 */
static int ip6_route_check_nh_onlink(struct net *net,
				     struct fib6_config *cfg,
				     const struct net_device *dev,
				     struct netlink_ext_ack *extack)
{
	u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	struct fib6_result res = {};
	int err;

	err = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0, &res);
	if (!err && !(res.fib6_flags & RTF_REJECT) &&
	    /* ignore match if it is the default route */
	    !ipv6_addr_any(&res.f6i->fib6_dst.addr) &&
	    (res.fib6_type != RTN_UNICAST || dev != res.nh->fib_nh_dev)) {
		NL_SET_ERR_MSG(extack,
			       "Nexthop has invalid gateway or device mismatch");
		err = -EINVAL;
	}

	return err;
}

/* Resolve the gateway in @cfg to a directly reachable egress device.
 *
 * The lookup is first done in cfg->fc_table (when set) and, if that does
 * not produce an acceptable result, again via fib6_lookup(). A result is
 * acceptable when it is not a reject route and does not itself need a
 * gateway. If the caller supplied a device in *@_dev it must match the
 * result; otherwise *@_dev is set to the resolved device with a tracked
 * reference held, and *@idev to the result of in6_dev_get() on it.
 *
 * Returns 0 on success, -EHOSTUNREACH (or the fib6_lookup() error path
 * mapped to it) otherwise.
 *
 * NOTE(review): @_dev is NULL-checked when it is read but dereferenced
 * unconditionally when the device is resolved here; the caller visible in
 * this file always passes a valid pointer.
 */
static int ip6_route_check_nh(struct net *net,
			      struct fib6_config *cfg,
			      struct net_device **_dev,
			      netdevice_tracker *dev_tracker,
			      struct inet6_dev **idev)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	struct net_device *dev = _dev ? *_dev : NULL;
	int flags = RT6_LOOKUP_F_IFACE;
	struct fib6_result res = {};
	int err = -EHOSTUNREACH;

	if (cfg->fc_table) {
		err = ip6_nh_lookup_table(net, cfg, gw_addr,
					  cfg->fc_table, flags, &res);
		/* gw_addr can not require a gateway or resolve to a reject
		 * route. If a device is given, it must match the result.
		 */
		if (err || res.fib6_flags & RTF_REJECT ||
		    res.nh->fib_nh_gw_family ||
		    (dev && dev != res.nh->fib_nh_dev))
			err = -EHOSTUNREACH;
	}

	/* No table given, or the table lookup was not acceptable: retry
	 * with the generic lookup.
	 */
	if (err < 0) {
		struct flowi6 fl6 = {
			.flowi6_oif = cfg->fc_ifindex,
			.daddr = *gw_addr,
		};

		err = fib6_lookup(net, cfg->fc_ifindex, &fl6, &res, flags);
		if (err || res.fib6_flags & RTF_REJECT ||
		    res.nh->fib_nh_gw_family)
			err = -EHOSTUNREACH;

		if (err)
			return err;

		fib6_select_path(net, &res, &fl6, cfg->fc_ifindex,
				 cfg->fc_ifindex != 0, NULL, flags);
	}

	err = 0;
	if (dev) {
		if (dev != res.nh->fib_nh_dev)
			err = -EHOSTUNREACH;
	} else {
		/* Hand the resolved device back with references held. */
		*_dev = dev = res.nh->fib_nh_dev;
		netdev_hold(dev, dev_tracker, GFP_ATOMIC);
		*idev = in6_dev_get(dev);
	}

	return err;
}

/* Validate the gateway address of a route being added.
 *
 * Rejects gateways that are local addresses, gateways that are neither
 * unicast nor IPv4-mapped, and routes whose egress device is missing or
 * is a loopback device. For gateways that are not link-local unicast the
 * nexthop is additionally checked under rcu_read_lock() with
 * ip6_route_check_nh_onlink() or ip6_route_check_nh(); the latter may fill
 * in *@_dev and *@idev.
 *
 * Returns 0 when the gateway is acceptable, a negative errno otherwise.
 */
static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
			   struct net_device **_dev,
			   netdevice_tracker *dev_tracker,
			   struct inet6_dev **idev,
			   struct netlink_ext_ack *extack)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	int gwa_type = ipv6_addr_type(gw_addr);
	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
	const struct net_device *dev = *_dev;
	bool need_addr_check = !dev;
	int err = -EINVAL;

	/* if gw_addr is local we will fail to detect this in case
	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
	 * will return already-added prefix route via interface that
	 * prefix route was assigned to, which might be non-loopback.
	 */
	if (dev &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
		/* IPv6 strictly inhibits using not link-local
		 * addresses as nexthop address.
		 * Otherwise, router will not be able to send redirects.
		 * It is very good, but in some (rare!) circumstances
		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
		 * some exceptions. --ANK
		 * We allow IPv4-mapped nexthops to support RFC4798-type
		 * addressing
		 */
		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
			NL_SET_ERR_MSG(extack, "Invalid gateway address");
			goto out;
		}

		rcu_read_lock();

		if (cfg->fc_flags & RTNH_F_ONLINK)
			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
		else
			err = ip6_route_check_nh(net, cfg, _dev, dev_tracker,
						 idev);

		rcu_read_unlock();

		if (err)
			goto out;
	}

	/* reload in case device was changed */
	dev = *_dev;

	err = -EINVAL;
	if (!dev) {
		NL_SET_ERR_MSG(extack, "Egress device not specified");
		goto out;
	} else if (dev->flags & IFF_LOOPBACK) {
		NL_SET_ERR_MSG(extack,
			       "Egress device can not be loopback device for this route");
		goto out;
	}

	/* if we did not check gw_addr above, do so now that the
	 * egress device has been resolved.
	 */
	if (need_addr_check &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	err = 0;
out:
	return err;
}

/* Decide whether a route must be treated as a reject route: either
 * RTF_REJECT is set in @flags, or @dev is a loopback device while the
 * destination is not a loopback address and the route is neither anycast
 * nor local.
 */
static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type)
{
	if ((flags & RTF_REJECT) ||
	    (dev && (dev->flags & IFF_LOOPBACK) &&
	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
	     !(flags & (RTF_ANYCAST | RTF_LOCAL))))
		return true;

	return false;
}

/* Initialise @fib6_nh from the route configuration in @cfg.
 *
 * Resolves and holds the egress device, validates the onlink and gateway
 * settings, redirects reject-type routes to the loopback device and
 * allocates the per-cpu route cache. On success fib_nh_dev keeps the
 * tracked device reference; on failure the device reference and any
 * lwtunnel state are released again.
 *
 * Returns 0 on success or a negative errno, with @extack set for most
 * validation failures.
 */
int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
		 struct fib6_config *cfg, gfp_t gfp_flags,
		 struct netlink_ext_ack *extack)
{
	netdevice_tracker *dev_tracker = &fib6_nh->fib_nh_dev_tracker;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	int addr_type;
	int err;

	fib6_nh->fib_nh_family = AF_INET6;
#ifdef CONFIG_IPV6_ROUTER_PREF
	fib6_nh->last_probe = jiffies;
#endif
	/* FDB nexthops only record the gateway; no device is resolved. */
	if (cfg->fc_is_fdb) {
		fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
		fib6_nh->fib_nh_gw_family = AF_INET6;
		return 0;
	}

	err = -ENODEV;
	if (cfg->fc_ifindex) {
		dev = netdev_get_by_index(net, cfg->fc_ifindex,
					  dev_tracker, gfp_flags);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (cfg->fc_flags & RTNH_F_ONLINK) {
		if (!dev) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop device required for onlink");
			goto out;
		}

		if (!(dev->flags & IFF_UP)) {
			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
			err = -ENETDOWN;
			goto out;
		}

		fib6_nh->fib_nh_flags |= RTNH_F_ONLINK;
	}

	fib6_nh->fib_nh_weight = 1;

	/* We cannot add true routes via loopback here,
	 * they would result in kernel looping; promote them to reject routes
	 */
	addr_type = ipv6_addr_type(&cfg->fc_dst);
	if
	   (fib6_is_reject(cfg->fc_flags, dev, addr_type)) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			/* Swap the references held so far for loopback ones. */
			if (dev) {
				netdev_put(dev, dev_tracker);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			netdev_hold(dev, dev_tracker, gfp_flags);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		/* Reject routes skip gateway, device-state and encap setup. */
		goto pcpu_alloc;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		/* May resolve dev/idev when no ifindex was given. */
		err = ip6_validate_gw(net, cfg, &dev, dev_tracker,
				      &idev, extack);
		if (err)
			goto out;

		fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
		fib6_nh->fib_nh_gw_family = AF_INET6;
	}

	err = -ENODEV;
	if (!dev)
		goto out;

	if (idev->cnf.disable_ipv6) {
		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
		err = -EACCES;
		goto out;
	}

	if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) {
		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
		err = -ENETDOWN;
		goto out;
	}

	/* Local and anycast routes are not marked link-down on carrier loss. */
	if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
	    !netif_carrier_ok(dev))
		fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;

	err = fib_nh_common_init(net, &fib6_nh->nh_common, cfg->fc_encap,
				 cfg->fc_encap_type, cfg, gfp_flags, extack);
	if (err)
		goto out;

pcpu_alloc:
	fib6_nh->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags);
	if (!fib6_nh->rt6i_pcpu) {
		err = -ENOMEM;
		goto out;
	}

	/* Success: the tracked device reference now belongs to the nexthop. */
	fib6_nh->fib_nh_dev = dev;
	fib6_nh->fib_nh_oif = dev->ifindex;
	err = 0;
out:
	/* The idev reference is only needed during setup. */
	if (idev)
		in6_dev_put(idev);

	if (err) {
		lwtstate_put(fib6_nh->fib_nh_lws);
		fib6_nh->fib_nh_lws = NULL;
		netdev_put(dev, dev_tracker);
	}

	return err;
}

/* Release everything owned by @fib6_nh: its exception entries and bucket,
 * the cached per-cpu dsts and their storage, and the common nexthop state.
 */
void fib6_nh_release(struct fib6_nh *fib6_nh)
{
	struct rt6_exception_bucket *bucket;

	rcu_read_lock();

	fib6_nh_flush_exceptions(fib6_nh,
NULL); 3655 bucket = fib6_nh_get_excptn_bucket(fib6_nh, NULL); 3656 if (bucket) { 3657 rcu_assign_pointer(fib6_nh->rt6i_exception_bucket, NULL); 3658 kfree(bucket); 3659 } 3660 3661 rcu_read_unlock(); 3662 3663 fib6_nh_release_dsts(fib6_nh); 3664 free_percpu(fib6_nh->rt6i_pcpu); 3665 3666 fib_nh_common_release(&fib6_nh->nh_common); 3667 } 3668 3669 void fib6_nh_release_dsts(struct fib6_nh *fib6_nh) 3670 { 3671 int cpu; 3672 3673 if (!fib6_nh->rt6i_pcpu) 3674 return; 3675 3676 for_each_possible_cpu(cpu) { 3677 struct rt6_info *pcpu_rt, **ppcpu_rt; 3678 3679 ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu); 3680 pcpu_rt = xchg(ppcpu_rt, NULL); 3681 if (pcpu_rt) { 3682 dst_dev_put(&pcpu_rt->dst); 3683 dst_release(&pcpu_rt->dst); 3684 } 3685 } 3686 } 3687 3688 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, 3689 gfp_t gfp_flags, 3690 struct netlink_ext_ack *extack) 3691 { 3692 struct net *net = cfg->fc_nlinfo.nl_net; 3693 struct fib6_info *rt = NULL; 3694 struct nexthop *nh = NULL; 3695 struct fib6_table *table; 3696 struct fib6_nh *fib6_nh; 3697 int err = -EINVAL; 3698 int addr_type; 3699 3700 /* RTF_PCPU is an internal flag; can not be set by userspace */ 3701 if (cfg->fc_flags & RTF_PCPU) { 3702 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU"); 3703 goto out; 3704 } 3705 3706 /* RTF_CACHE is an internal flag; can not be set by userspace */ 3707 if (cfg->fc_flags & RTF_CACHE) { 3708 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE"); 3709 goto out; 3710 } 3711 3712 if (cfg->fc_type > RTN_MAX) { 3713 NL_SET_ERR_MSG(extack, "Invalid route type"); 3714 goto out; 3715 } 3716 3717 if (cfg->fc_dst_len > 128) { 3718 NL_SET_ERR_MSG(extack, "Invalid prefix length"); 3719 goto out; 3720 } 3721 if (cfg->fc_src_len > 128) { 3722 NL_SET_ERR_MSG(extack, "Invalid source address length"); 3723 goto out; 3724 } 3725 #ifndef CONFIG_IPV6_SUBTREES 3726 if (cfg->fc_src_len) { 3727 NL_SET_ERR_MSG(extack, 3728 "Specifying source address requires 
IPV6_SUBTREES to be enabled"); 3729 goto out; 3730 } 3731 #endif 3732 if (cfg->fc_nh_id) { 3733 nh = nexthop_find_by_id(net, cfg->fc_nh_id); 3734 if (!nh) { 3735 NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); 3736 goto out; 3737 } 3738 err = fib6_check_nexthop(nh, cfg, extack); 3739 if (err) 3740 goto out; 3741 } 3742 3743 err = -ENOBUFS; 3744 if (cfg->fc_nlinfo.nlh && 3745 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) { 3746 table = fib6_get_table(net, cfg->fc_table); 3747 if (!table) { 3748 pr_warn("NLM_F_CREATE should be specified when creating new route\n"); 3749 table = fib6_new_table(net, cfg->fc_table); 3750 } 3751 } else { 3752 table = fib6_new_table(net, cfg->fc_table); 3753 } 3754 3755 if (!table) 3756 goto out; 3757 3758 err = -ENOMEM; 3759 rt = fib6_info_alloc(gfp_flags, !nh); 3760 if (!rt) 3761 goto out; 3762 3763 rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len, 3764 extack); 3765 if (IS_ERR(rt->fib6_metrics)) { 3766 err = PTR_ERR(rt->fib6_metrics); 3767 /* Do not leave garbage there. */ 3768 rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics; 3769 goto out_free; 3770 } 3771 3772 if (cfg->fc_flags & RTF_ADDRCONF) 3773 rt->dst_nocount = true; 3774 3775 if (cfg->fc_flags & RTF_EXPIRES) 3776 fib6_set_expires(rt, jiffies + 3777 clock_t_to_jiffies(cfg->fc_expires)); 3778 3779 if (cfg->fc_protocol == RTPROT_UNSPEC) 3780 cfg->fc_protocol = RTPROT_BOOT; 3781 rt->fib6_protocol = cfg->fc_protocol; 3782 3783 rt->fib6_table = table; 3784 rt->fib6_metric = cfg->fc_metric; 3785 rt->fib6_type = cfg->fc_type ? 
: RTN_UNICAST; 3786 rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY; 3787 3788 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); 3789 rt->fib6_dst.plen = cfg->fc_dst_len; 3790 3791 #ifdef CONFIG_IPV6_SUBTREES 3792 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len); 3793 rt->fib6_src.plen = cfg->fc_src_len; 3794 #endif 3795 if (nh) { 3796 if (rt->fib6_src.plen) { 3797 NL_SET_ERR_MSG(extack, "Nexthops can not be used with source routing"); 3798 goto out_free; 3799 } 3800 if (!nexthop_get(nh)) { 3801 NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); 3802 goto out_free; 3803 } 3804 rt->nh = nh; 3805 fib6_nh = nexthop_fib6_nh(rt->nh); 3806 } else { 3807 err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack); 3808 if (err) 3809 goto out; 3810 3811 fib6_nh = rt->fib6_nh; 3812 3813 /* We cannot add true routes via loopback here, they would 3814 * result in kernel looping; promote them to reject routes 3815 */ 3816 addr_type = ipv6_addr_type(&cfg->fc_dst); 3817 if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh->fib_nh_dev, 3818 addr_type)) 3819 rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP; 3820 } 3821 3822 if (!ipv6_addr_any(&cfg->fc_prefsrc)) { 3823 struct net_device *dev = fib6_nh->fib_nh_dev; 3824 3825 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { 3826 NL_SET_ERR_MSG(extack, "Invalid source address"); 3827 err = -EINVAL; 3828 goto out; 3829 } 3830 rt->fib6_prefsrc.addr = cfg->fc_prefsrc; 3831 rt->fib6_prefsrc.plen = 128; 3832 } else 3833 rt->fib6_prefsrc.plen = 0; 3834 3835 return rt; 3836 out: 3837 fib6_info_release(rt); 3838 return ERR_PTR(err); 3839 out_free: 3840 ip_fib_metrics_put(rt->fib6_metrics); 3841 kfree(rt); 3842 return ERR_PTR(err); 3843 } 3844 3845 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, 3846 struct netlink_ext_ack *extack) 3847 { 3848 struct fib6_info *rt; 3849 int err; 3850 3851 rt = ip6_route_info_create(cfg, gfp_flags, extack); 3852 if (IS_ERR(rt)) 3853 return PTR_ERR(rt); 3854 3855 err = 
	      __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
	/* Drop the creation reference whether or not the insert succeeded. */
	fib6_info_release(rt);

	return err;
}

/* Delete @rt from its table under the table lock.
 *
 * Always drops one reference on @rt, including when @rt is the null entry
 * (in which case -ENOENT is returned and nothing is deleted).
 */
static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
{
	struct net *net = info->nl_net;
	struct fib6_table *table;
	int err;

	if (rt == net->ipv6.fib6_null_entry) {
		err = -ENOENT;
		goto out;
	}

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_del(rt, info);
	spin_unlock_bh(&table->tb6_lock);

out:
	fib6_info_release(rt);
	return err;
}

/* Delete @rt in namespace @net; @skip_notify is passed on through the
 * nl_info used for the deletion. Consumes a reference on @rt.
 */
int ip6_del_rt(struct net *net, struct fib6_info *rt, bool skip_notify)
{
	struct nl_info info = {
		.nl_net = net,
		.skip_notify = skip_notify
	};

	return __ip6_del_rt(rt, &info);
}

/* Delete @rt and, when cfg->fc_delete_all_nh is set and @rt has siblings,
 * all of its sibling routes as well.
 *
 * For the multipath case a single RTM_DELROUTE message covering all hops
 * is preferred: when it can be built, per-route notifications are
 * suppressed and the message is sent after the table lock is dropped.
 * Consumes a reference on @rt. Returns 0 or a negative errno.
 */
static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
{
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;
	int err = -ENOENT;

	if (rt == net->ipv6.fib6_null_entry)
		goto out_put;
	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);

	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
		struct fib6_info *sibling, *next_sibling;
		struct fib6_node *fn;

		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

			if (rt6_fill_node(net, skb, rt, NULL,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				/* Fall back to per-route notifications. */
				kfree_skb(skb);
				skb = NULL;
			} else
				info->skip_notify = 1;
		}

		/* 'rt' points to the first sibling route. If it is not the
		 * leaf, then we do not need to send a notification. Otherwise,
		 * we need to check if the last sibling has a next route or not
		 * and emit a replace or delete notification, respectively.
		 */
		info->skip_notify_kernel = 1;
		fn = rcu_dereference_protected(rt->fib6_node,
				    lockdep_is_held(&table->tb6_lock));
		if (rcu_access_pointer(fn->leaf) == rt) {
			struct fib6_info *last_sibling, *replace_rt;

			last_sibling = list_last_entry(&rt->fib6_siblings,
						       struct fib6_info,
						       fib6_siblings);
			replace_rt = rcu_dereference_protected(
					    last_sibling->fib6_next,
					    lockdep_is_held(&table->tb6_lock));
			if (replace_rt)
				call_fib6_entry_notifiers_replace(net,
								  replace_rt);
			else
				call_fib6_multipath_entry_notifiers(net,
						       FIB_EVENT_ENTRY_DEL,
						       rt, rt->fib6_nsiblings,
						       NULL);
		}
		/* Remove the siblings first; 'rt' itself goes last below. */
		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings,
					 fib6_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out_unlock;
		}
	}

	err = fib6_del(rt, info);
out_unlock:
	spin_unlock_bh(&table->tb6_lock);
out_put:
	fib6_info_release(rt);

	if (skb) {
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
	}
	return err;
}

/* Remove the cached route @rt if it matches the optional device and
 * gateway selectors in @cfg. Returns -ESRCH when it does not match,
 * otherwise the result of rt6_remove_exception_rt().
 */
static int __ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
{
	int rc = -ESRCH;

	if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
		goto out;

	if (cfg->fc_flags & RTF_GATEWAY &&
	    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
		goto out;

	rc = rt6_remove_exception_rt(rt);
out:
	return rc;
}

/* Find the cached route for cfg's destination/source under (@rt, @nh) and
 * try to delete it. Returns 0 when no cached route exists.
 */
static int ip6_del_cached_rt(struct fib6_config *cfg, struct fib6_info *rt,
			     struct fib6_nh *nh)
{
	struct fib6_result res = {
		.f6i = rt,
		.nh = nh,
	};
	struct rt6_info *rt_cache;

	rt_cache = rt6_find_cached_rt(&res, &cfg->fc_dst, &cfg->fc_src);
	if (rt_cache)
		return __ip6_del_cached_rt(rt_cache, cfg);

	return 0;
}

/* Argument bundle for the per-nexthop cached-route deletion callback. */
struct fib6_nh_del_cached_rt_arg {
	struct fib6_config *cfg;
struct fib6_info *f6i; 4006 }; 4007 4008 static int fib6_nh_del_cached_rt(struct fib6_nh *nh, void *_arg) 4009 { 4010 struct fib6_nh_del_cached_rt_arg *arg = _arg; 4011 int rc; 4012 4013 rc = ip6_del_cached_rt(arg->cfg, arg->f6i, nh); 4014 return rc != -ESRCH ? rc : 0; 4015 } 4016 4017 static int ip6_del_cached_rt_nh(struct fib6_config *cfg, struct fib6_info *f6i) 4018 { 4019 struct fib6_nh_del_cached_rt_arg arg = { 4020 .cfg = cfg, 4021 .f6i = f6i 4022 }; 4023 4024 return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_del_cached_rt, &arg); 4025 } 4026 4027 static int ip6_route_del(struct fib6_config *cfg, 4028 struct netlink_ext_ack *extack) 4029 { 4030 struct fib6_table *table; 4031 struct fib6_info *rt; 4032 struct fib6_node *fn; 4033 int err = -ESRCH; 4034 4035 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); 4036 if (!table) { 4037 NL_SET_ERR_MSG(extack, "FIB table does not exist"); 4038 return err; 4039 } 4040 4041 rcu_read_lock(); 4042 4043 fn = fib6_locate(&table->tb6_root, 4044 &cfg->fc_dst, cfg->fc_dst_len, 4045 &cfg->fc_src, cfg->fc_src_len, 4046 !(cfg->fc_flags & RTF_CACHE)); 4047 4048 if (fn) { 4049 for_each_fib6_node_rt_rcu(fn) { 4050 struct fib6_nh *nh; 4051 4052 if (rt->nh && cfg->fc_nh_id && 4053 rt->nh->id != cfg->fc_nh_id) 4054 continue; 4055 4056 if (cfg->fc_flags & RTF_CACHE) { 4057 int rc = 0; 4058 4059 if (rt->nh) { 4060 rc = ip6_del_cached_rt_nh(cfg, rt); 4061 } else if (cfg->fc_nh_id) { 4062 continue; 4063 } else { 4064 nh = rt->fib6_nh; 4065 rc = ip6_del_cached_rt(cfg, rt, nh); 4066 } 4067 if (rc != -ESRCH) { 4068 rcu_read_unlock(); 4069 return rc; 4070 } 4071 continue; 4072 } 4073 4074 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric) 4075 continue; 4076 if (cfg->fc_protocol && 4077 cfg->fc_protocol != rt->fib6_protocol) 4078 continue; 4079 4080 if (rt->nh) { 4081 if (!fib6_info_hold_safe(rt)) 4082 continue; 4083 rcu_read_unlock(); 4084 4085 return __ip6_del_rt(rt, &cfg->fc_nlinfo); 4086 } 4087 if (cfg->fc_nh_id) 4088 
				continue;

			nh = rt->fib6_nh;
			if (cfg->fc_ifindex &&
			    (!nh->fib_nh_dev ||
			     nh->fib_nh_dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6))
				continue;
			/* Skip entries whose refcount can no longer be taken. */
			if (!fib6_info_hold_safe(rt))
				continue;
			rcu_read_unlock();

			/* if gateway was specified only delete the one hop */
			if (cfg->fc_flags & RTF_GATEWAY)
				return __ip6_del_rt(rt, &cfg->fc_nlinfo);

			return __ip6_del_rt_siblings(rt, cfg);
		}
	}
	rcu_read_unlock();

	return err;
}

/* Handle a received ICMPv6 Redirect for the route @dst.
 *
 * Validates the message (length, non-multicast destination, link-local
 * target unless the destination itself is the target, redirects accepted
 * and forwarding disabled on the receiving interface, well-formed ND
 * options), updates the neighbour entry for the target and installs a
 * cached exception route for the redirected destination. A
 * NETEVENT_REDIRECT notification is raised once the exception is inserted.
 */
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct netevent_redirect netevent;
	struct rt6_info *rt, *nrt = NULL;
	struct fib6_result res = {};
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
	struct rd_msg *msg;
	int optlen, on_link;
	u8 *lladdr;

	/* Bytes of ND options following the fixed redirect header. */
	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
	optlen -= sizeof(*msg);

	if (optlen < 0) {
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
		return;
	}

	msg = (struct rd_msg *)icmp6_hdr(skb);

	if (ipv6_addr_is_multicast(&msg->dest)) {
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
		return;
	}

	/* Target equal to destination means the destination is on-link;
	 * otherwise the target must be a link-local unicast address.
	 */
	on_link = 0;
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
		on_link = 1;
	} else if (ipv6_addr_type(&msg->target) !=
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
		return;
	}

	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	if (READ_ONCE(in6_dev->cnf.forwarding) ||
	    !READ_ONCE(in6_dev->cnf.accept_redirects))
		return;

	/* RFC2461 8.1:
	 *	The IP source address of the Redirect MUST be the same as the current
	 *	first-hop router for the specified ICMP Destination Address.
	 */

	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}

	lladdr = NULL;
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}

	rt = dst_rt6_info(dst);
	if (rt->rt6i_flags & RTF_REJECT) {
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
		return;
	}

	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);

	/* Reference on the neighbour is dropped at 'out'. */
	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
	if (!neigh)
		return;

	/*
	 *	We have finally decided to accept it.
	 */

	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER)),
		     NDISC_REDIRECT, &ndopts);

	rcu_read_lock();
	res.f6i = rcu_dereference(rt->from);
	if (!res.f6i)
		goto out;

	if (res.f6i->nh) {
		/* Find the fib6_nh of the nexthop object that this dst uses. */
		struct fib6_nh_match_arg arg = {
			.dev = dst->dev,
			.gw = &rt->rt6i_gateway,
		};

		nexthop_for_each_fib6_nh(res.f6i->nh,
					 fib6_nh_find_match, &arg);

		/* fib6_info uses a nexthop that does not have fib6_nh
		 * using the dst->dev. Should be impossible
		 */
		if (!arg.match)
			goto out;
		res.nh = arg.match;
	} else {
		res.nh = res.f6i->fib6_nh;
	}

	res.fib6_flags = res.f6i->fib6_flags;
	res.fib6_type = res.f6i->fib6_type;
	nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL);
	if (!nrt)
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

	/* rt6_insert_exception() will take care of duplicated exceptions */
	if (rt6_insert_exception(nrt, &res)) {
		dst_release_immediate(&nrt->dst);
		goto out;
	}

	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
	netevent.daddr = &msg->dest;
	netevent.neigh = neigh;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

out:
	rcu_read_unlock();
	neigh_release(neigh);
}

#ifdef CONFIG_IPV6_ROUTE_INFO
/* Find the RTF_ROUTEINFO route for @prefix/@prefixlen via gateway @gwaddr
 * on @dev. Returns the route with a reference held, or NULL.
 */
static struct fib6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ?
		    : RT6_TABLE_INFO;
	int ifindex = dev->ifindex;
	struct fib6_node *fn;
	struct fib6_info *rt = NULL;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
	if (!fn)
		goto out;

	/* The iterator leaves rt NULL when no entry breaks out of the loop. */
	for_each_fib6_node_rt_rcu(fn) {
		/* these routes do not use nexthops */
		if (rt->nh)
			continue;
		if (rt->fib6_nh->fib_nh_dev->ifindex != ifindex)
			continue;
		if (!(rt->fib6_flags & RTF_ROUTEINFO) ||
		    !rt->fib6_nh->fib_nh_gw_family)
			continue;
		if (!ipv6_addr_equal(&rt->fib6_nh->fib_nh_gw6, gwaddr))
			continue;
		if (!fib6_info_hold_safe(rt))
			continue;
		break;
	}
out:
	rcu_read_unlock();
	return rt;
}

/* Add a route-information (RA) route for @prefix/@prefixlen via @gwaddr on
 * @dev with router preference @pref, then look it up again.
 *
 * The result of ip6_route_add() is not checked; the return value is
 * whatever rt6_get_route_info() finds afterwards (NULL if nothing).
 */
static struct fib6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev,
					   unsigned int pref)
{
	struct fib6_config cfg = {
		.fc_metric	= IP6_RT_PRIO_USER,
		.fc_ifindex	= dev->ifindex,
		.fc_dst_len	= prefixlen,
		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
				  RTF_UP | RTF_PREF(pref),
		.fc_protocol = RTPROT_RA,
		.fc_type = RTN_UNICAST,
		.fc_nlinfo.portid = 0,
		.fc_nlinfo.nlh = NULL,
		.fc_nlinfo.nl_net = net,
	};

	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	cfg.fc_dst = *prefix;
	cfg.fc_gateway = *gwaddr;

	/* We should treat it as a default route if prefix length is 0. */
	if (!prefixlen)
		cfg.fc_flags |= RTF_DEFAULT;

	ip6_route_add(&cfg, GFP_ATOMIC, NULL);

	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
}
#endif

/* Find the autoconfigured default route (RTF_ADDRCONF and RTF_DEFAULT both
 * set) via gateway @addr on @dev. Returns it with a reference held, or
 * NULL when there is none or the reference could not be taken.
 */
struct fib6_info *rt6_get_dflt_router(struct net *net,
				     const struct in6_addr *addr,
				     struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
	struct fib6_info *rt;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		struct fib6_nh *nh;

		/* RA routes do not use nexthops */
		if (rt->nh)
			continue;

		nh = rt->fib6_nh;
		if (dev == nh->fib_nh_dev &&
		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&nh->fib_nh_gw6, addr))
			break;
	}
	if (rt && !fib6_info_hold_safe(rt))
		rt = NULL;
	rcu_read_unlock();
	return rt;
}

/* Add an autoconfigured default route via @gwaddr on @dev that expires
 * after @lifetime seconds, then return the result of looking it up.
 */
struct fib6_info *rt6_add_dflt_router(struct net *net,
				     const struct in6_addr *gwaddr,
				     struct net_device *dev,
				     unsigned int pref,
				     u32 defrtr_usr_metric,
				     int lifetime)
{
	struct fib6_config cfg = {
		.fc_table	= l3mdev_fib_table(dev) ?
				  : RT6_TABLE_DFLT,
		.fc_metric	= defrtr_usr_metric,
		.fc_ifindex	= dev->ifindex,
		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
		.fc_protocol = RTPROT_RA,
		.fc_type = RTN_UNICAST,
		.fc_nlinfo.portid = 0,
		.fc_nlinfo.nlh = NULL,
		.fc_nlinfo.nl_net = net,
		.fc_expires = jiffies_to_clock_t(lifetime * HZ),
	};

	cfg.fc_gateway = *gwaddr;

	if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
		struct fib6_table *table;

		/* Flag the table so rt6_purge_dflt_routers() visits it. */
		table = fib6_get_table(dev_net(dev), cfg.fc_table);
		if (table)
			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
	}

	/* Returned even if the add failed, e.g. when the route exists. */
	return rt6_get_dflt_router(net, gwaddr, dev);
}

/* Delete the RTF_DEFAULT / RTF_ADDRCONF routes at the root of @table,
 * except those on devices whose accept_ra setting is 2, and clear the
 * table's RT6_TABLE_HAS_DFLT_ROUTER flag.
 *
 * The scan restarts from the beginning after every deletion because the
 * RCU read lock has to be dropped around ip6_del_rt().
 */
static void __rt6_purge_dflt_routers(struct net *net,
				     struct fib6_table *table)
{
	struct fib6_info *rt;

restart:
	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev = dev ?
__in6_dev_get(dev) : NULL; 4407 4408 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) && 4409 (!idev || idev->cnf.accept_ra != 2) && 4410 fib6_info_hold_safe(rt)) { 4411 rcu_read_unlock(); 4412 ip6_del_rt(net, rt, false); 4413 goto restart; 4414 } 4415 } 4416 rcu_read_unlock(); 4417 4418 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER; 4419 } 4420 4421 void rt6_purge_dflt_routers(struct net *net) 4422 { 4423 struct fib6_table *table; 4424 struct hlist_head *head; 4425 unsigned int h; 4426 4427 rcu_read_lock(); 4428 4429 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { 4430 head = &net->ipv6.fib_table_hash[h]; 4431 hlist_for_each_entry_rcu(table, head, tb6_hlist) { 4432 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER) 4433 __rt6_purge_dflt_routers(net, table); 4434 } 4435 } 4436 4437 rcu_read_unlock(); 4438 } 4439 4440 static void rtmsg_to_fib6_config(struct net *net, 4441 struct in6_rtmsg *rtmsg, 4442 struct fib6_config *cfg) 4443 { 4444 *cfg = (struct fib6_config){ 4445 .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? 
4446 : RT6_TABLE_MAIN, 4447 .fc_ifindex = rtmsg->rtmsg_ifindex, 4448 .fc_metric = rtmsg->rtmsg_metric, 4449 .fc_expires = rtmsg->rtmsg_info, 4450 .fc_dst_len = rtmsg->rtmsg_dst_len, 4451 .fc_src_len = rtmsg->rtmsg_src_len, 4452 .fc_flags = rtmsg->rtmsg_flags, 4453 .fc_type = rtmsg->rtmsg_type, 4454 4455 .fc_nlinfo.nl_net = net, 4456 4457 .fc_dst = rtmsg->rtmsg_dst, 4458 .fc_src = rtmsg->rtmsg_src, 4459 .fc_gateway = rtmsg->rtmsg_gateway, 4460 }; 4461 } 4462 4463 int ipv6_route_ioctl(struct net *net, unsigned int cmd, struct in6_rtmsg *rtmsg) 4464 { 4465 struct fib6_config cfg; 4466 int err; 4467 4468 if (cmd != SIOCADDRT && cmd != SIOCDELRT) 4469 return -EINVAL; 4470 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 4471 return -EPERM; 4472 4473 rtmsg_to_fib6_config(net, rtmsg, &cfg); 4474 4475 rtnl_lock(); 4476 switch (cmd) { 4477 case SIOCADDRT: 4478 /* Only do the default setting of fc_metric in route adding */ 4479 if (cfg.fc_metric == 0) 4480 cfg.fc_metric = IP6_RT_PRIO_USER; 4481 err = ip6_route_add(&cfg, GFP_KERNEL, NULL); 4482 break; 4483 case SIOCDELRT: 4484 err = ip6_route_del(&cfg, NULL); 4485 break; 4486 } 4487 rtnl_unlock(); 4488 return err; 4489 } 4490 4491 /* 4492 * Drop the packet on the floor 4493 */ 4494 4495 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) 4496 { 4497 struct dst_entry *dst = skb_dst(skb); 4498 struct net *net = dev_net(dst->dev); 4499 struct inet6_dev *idev; 4500 SKB_DR(reason); 4501 int type; 4502 4503 if (netif_is_l3_master(skb->dev) || 4504 dst->dev == net->loopback_dev) 4505 idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif)); 4506 else 4507 idev = ip6_dst_idev(dst); 4508 4509 switch (ipstats_mib_noroutes) { 4510 case IPSTATS_MIB_INNOROUTES: 4511 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); 4512 if (type == IPV6_ADDR_ANY) { 4513 SKB_DR_SET(reason, IP_INADDRERRORS); 4514 IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); 4515 break; 4516 } 4517 SKB_DR_SET(reason, IP_INNOROUTES); 
4518 fallthrough; 4519 case IPSTATS_MIB_OUTNOROUTES: 4520 SKB_DR_OR(reason, IP_OUTNOROUTES); 4521 IP6_INC_STATS(net, idev, ipstats_mib_noroutes); 4522 break; 4523 } 4524 4525 /* Start over by dropping the dst for l3mdev case */ 4526 if (netif_is_l3_master(skb->dev)) 4527 skb_dst_drop(skb); 4528 4529 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0); 4530 kfree_skb_reason(skb, reason); 4531 return 0; 4532 } 4533 4534 static int ip6_pkt_discard(struct sk_buff *skb) 4535 { 4536 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES); 4537 } 4538 4539 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) 4540 { 4541 skb->dev = skb_dst(skb)->dev; 4542 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); 4543 } 4544 4545 static int ip6_pkt_prohibit(struct sk_buff *skb) 4546 { 4547 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES); 4548 } 4549 4550 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb) 4551 { 4552 skb->dev = skb_dst(skb)->dev; 4553 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); 4554 } 4555 4556 /* 4557 * Allocate a dst for local (unicast / anycast) address. 4558 */ 4559 4560 struct fib6_info *addrconf_f6i_alloc(struct net *net, 4561 struct inet6_dev *idev, 4562 const struct in6_addr *addr, 4563 bool anycast, gfp_t gfp_flags, 4564 struct netlink_ext_ack *extack) 4565 { 4566 struct fib6_config cfg = { 4567 .fc_table = l3mdev_fib_table(idev->dev) ? 
: RT6_TABLE_LOCAL, 4568 .fc_ifindex = idev->dev->ifindex, 4569 .fc_flags = RTF_UP | RTF_NONEXTHOP, 4570 .fc_dst = *addr, 4571 .fc_dst_len = 128, 4572 .fc_protocol = RTPROT_KERNEL, 4573 .fc_nlinfo.nl_net = net, 4574 .fc_ignore_dev_down = true, 4575 }; 4576 struct fib6_info *f6i; 4577 4578 if (anycast) { 4579 cfg.fc_type = RTN_ANYCAST; 4580 cfg.fc_flags |= RTF_ANYCAST; 4581 } else { 4582 cfg.fc_type = RTN_LOCAL; 4583 cfg.fc_flags |= RTF_LOCAL; 4584 } 4585 4586 f6i = ip6_route_info_create(&cfg, gfp_flags, extack); 4587 if (!IS_ERR(f6i)) { 4588 f6i->dst_nocount = true; 4589 4590 if (!anycast && 4591 (READ_ONCE(net->ipv6.devconf_all->disable_policy) || 4592 READ_ONCE(idev->cnf.disable_policy))) 4593 f6i->dst_nopolicy = true; 4594 } 4595 4596 return f6i; 4597 } 4598 4599 /* remove deleted ip from prefsrc entries */ 4600 struct arg_dev_net_ip { 4601 struct net *net; 4602 struct in6_addr *addr; 4603 }; 4604 4605 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg) 4606 { 4607 struct net *net = ((struct arg_dev_net_ip *)arg)->net; 4608 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; 4609 4610 if (!rt->nh && 4611 rt != net->ipv6.fib6_null_entry && 4612 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr) && 4613 !ipv6_chk_addr(net, addr, rt->fib6_nh->fib_nh_dev, 0)) { 4614 spin_lock_bh(&rt6_exception_lock); 4615 /* remove prefsrc entry */ 4616 rt->fib6_prefsrc.plen = 0; 4617 spin_unlock_bh(&rt6_exception_lock); 4618 } 4619 return 0; 4620 } 4621 4622 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) 4623 { 4624 struct net *net = dev_net(ifp->idev->dev); 4625 struct arg_dev_net_ip adni = { 4626 .net = net, 4627 .addr = &ifp->addr, 4628 }; 4629 fib6_clean_all(net, fib6_remove_prefsrc, &adni); 4630 } 4631 4632 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT) 4633 4634 /* Remove routers and update dst entries when gateway turn into host. 
*/ 4635 static int fib6_clean_tohost(struct fib6_info *rt, void *arg) 4636 { 4637 struct in6_addr *gateway = (struct in6_addr *)arg; 4638 struct fib6_nh *nh; 4639 4640 /* RA routes do not use nexthops */ 4641 if (rt->nh) 4642 return 0; 4643 4644 nh = rt->fib6_nh; 4645 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && 4646 nh->fib_nh_gw_family && ipv6_addr_equal(gateway, &nh->fib_nh_gw6)) 4647 return -1; 4648 4649 /* Further clean up cached routes in exception table. 4650 * This is needed because cached route may have a different 4651 * gateway than its 'parent' in the case of an ip redirect. 4652 */ 4653 fib6_nh_exceptions_clean_tohost(nh, gateway); 4654 4655 return 0; 4656 } 4657 4658 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) 4659 { 4660 fib6_clean_all(net, fib6_clean_tohost, gateway); 4661 } 4662 4663 struct arg_netdev_event { 4664 const struct net_device *dev; 4665 union { 4666 unsigned char nh_flags; 4667 unsigned long event; 4668 }; 4669 }; 4670 4671 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) 4672 { 4673 struct fib6_info *iter; 4674 struct fib6_node *fn; 4675 4676 fn = rcu_dereference_protected(rt->fib6_node, 4677 lockdep_is_held(&rt->fib6_table->tb6_lock)); 4678 iter = rcu_dereference_protected(fn->leaf, 4679 lockdep_is_held(&rt->fib6_table->tb6_lock)); 4680 while (iter) { 4681 if (iter->fib6_metric == rt->fib6_metric && 4682 rt6_qualify_for_ecmp(iter)) 4683 return iter; 4684 iter = rcu_dereference_protected(iter->fib6_next, 4685 lockdep_is_held(&rt->fib6_table->tb6_lock)); 4686 } 4687 4688 return NULL; 4689 } 4690 4691 /* only called for fib entries with builtin fib6_nh */ 4692 static bool rt6_is_dead(const struct fib6_info *rt) 4693 { 4694 if (rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD || 4695 (rt->fib6_nh->fib_nh_flags & RTNH_F_LINKDOWN && 4696 ip6_ignore_linkdown(rt->fib6_nh->fib_nh_dev))) 4697 return true; 4698 4699 return false; 4700 } 4701 4702 static int 
rt6_multipath_total_weight(const struct fib6_info *rt) 4703 { 4704 struct fib6_info *iter; 4705 int total = 0; 4706 4707 if (!rt6_is_dead(rt)) 4708 total += rt->fib6_nh->fib_nh_weight; 4709 4710 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { 4711 if (!rt6_is_dead(iter)) 4712 total += iter->fib6_nh->fib_nh_weight; 4713 } 4714 4715 return total; 4716 } 4717 4718 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total) 4719 { 4720 int upper_bound = -1; 4721 4722 if (!rt6_is_dead(rt)) { 4723 *weight += rt->fib6_nh->fib_nh_weight; 4724 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, 4725 total) - 1; 4726 } 4727 atomic_set(&rt->fib6_nh->fib_nh_upper_bound, upper_bound); 4728 } 4729 4730 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total) 4731 { 4732 struct fib6_info *iter; 4733 int weight = 0; 4734 4735 rt6_upper_bound_set(rt, &weight, total); 4736 4737 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4738 rt6_upper_bound_set(iter, &weight, total); 4739 } 4740 4741 void rt6_multipath_rebalance(struct fib6_info *rt) 4742 { 4743 struct fib6_info *first; 4744 int total; 4745 4746 /* In case the entire multipath route was marked for flushing, 4747 * then there is no need to rebalance upon the removal of every 4748 * sibling route. 4749 */ 4750 if (!rt->fib6_nsiblings || rt->should_flush) 4751 return; 4752 4753 /* During lookup routes are evaluated in order, so we need to 4754 * make sure upper bounds are assigned from the first sibling 4755 * onwards. 
4756 */ 4757 first = rt6_multipath_first_sibling(rt); 4758 if (WARN_ON_ONCE(!first)) 4759 return; 4760 4761 total = rt6_multipath_total_weight(first); 4762 rt6_multipath_upper_bound_set(first, total); 4763 } 4764 4765 static int fib6_ifup(struct fib6_info *rt, void *p_arg) 4766 { 4767 const struct arg_netdev_event *arg = p_arg; 4768 struct net *net = dev_net(arg->dev); 4769 4770 if (rt != net->ipv6.fib6_null_entry && !rt->nh && 4771 rt->fib6_nh->fib_nh_dev == arg->dev) { 4772 rt->fib6_nh->fib_nh_flags &= ~arg->nh_flags; 4773 fib6_update_sernum_upto_root(net, rt); 4774 rt6_multipath_rebalance(rt); 4775 } 4776 4777 return 0; 4778 } 4779 4780 void rt6_sync_up(struct net_device *dev, unsigned char nh_flags) 4781 { 4782 struct arg_netdev_event arg = { 4783 .dev = dev, 4784 { 4785 .nh_flags = nh_flags, 4786 }, 4787 }; 4788 4789 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev)) 4790 arg.nh_flags |= RTNH_F_LINKDOWN; 4791 4792 fib6_clean_all(dev_net(dev), fib6_ifup, &arg); 4793 } 4794 4795 /* only called for fib entries with inline fib6_nh */ 4796 static bool rt6_multipath_uses_dev(const struct fib6_info *rt, 4797 const struct net_device *dev) 4798 { 4799 struct fib6_info *iter; 4800 4801 if (rt->fib6_nh->fib_nh_dev == dev) 4802 return true; 4803 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4804 if (iter->fib6_nh->fib_nh_dev == dev) 4805 return true; 4806 4807 return false; 4808 } 4809 4810 static void rt6_multipath_flush(struct fib6_info *rt) 4811 { 4812 struct fib6_info *iter; 4813 4814 rt->should_flush = 1; 4815 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4816 iter->should_flush = 1; 4817 } 4818 4819 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt, 4820 const struct net_device *down_dev) 4821 { 4822 struct fib6_info *iter; 4823 unsigned int dead = 0; 4824 4825 if (rt->fib6_nh->fib_nh_dev == down_dev || 4826 rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD) 4827 dead++; 4828 list_for_each_entry(iter, &rt->fib6_siblings, 
fib6_siblings) 4829 if (iter->fib6_nh->fib_nh_dev == down_dev || 4830 iter->fib6_nh->fib_nh_flags & RTNH_F_DEAD) 4831 dead++; 4832 4833 return dead; 4834 } 4835 4836 static void rt6_multipath_nh_flags_set(struct fib6_info *rt, 4837 const struct net_device *dev, 4838 unsigned char nh_flags) 4839 { 4840 struct fib6_info *iter; 4841 4842 if (rt->fib6_nh->fib_nh_dev == dev) 4843 rt->fib6_nh->fib_nh_flags |= nh_flags; 4844 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4845 if (iter->fib6_nh->fib_nh_dev == dev) 4846 iter->fib6_nh->fib_nh_flags |= nh_flags; 4847 } 4848 4849 /* called with write lock held for table with rt */ 4850 static int fib6_ifdown(struct fib6_info *rt, void *p_arg) 4851 { 4852 const struct arg_netdev_event *arg = p_arg; 4853 const struct net_device *dev = arg->dev; 4854 struct net *net = dev_net(dev); 4855 4856 if (rt == net->ipv6.fib6_null_entry || rt->nh) 4857 return 0; 4858 4859 switch (arg->event) { 4860 case NETDEV_UNREGISTER: 4861 return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0; 4862 case NETDEV_DOWN: 4863 if (rt->should_flush) 4864 return -1; 4865 if (!rt->fib6_nsiblings) 4866 return rt->fib6_nh->fib_nh_dev == dev ? 
-1 : 0; 4867 if (rt6_multipath_uses_dev(rt, dev)) { 4868 unsigned int count; 4869 4870 count = rt6_multipath_dead_count(rt, dev); 4871 if (rt->fib6_nsiblings + 1 == count) { 4872 rt6_multipath_flush(rt); 4873 return -1; 4874 } 4875 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD | 4876 RTNH_F_LINKDOWN); 4877 fib6_update_sernum(net, rt); 4878 rt6_multipath_rebalance(rt); 4879 } 4880 return -2; 4881 case NETDEV_CHANGE: 4882 if (rt->fib6_nh->fib_nh_dev != dev || 4883 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) 4884 break; 4885 rt->fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN; 4886 rt6_multipath_rebalance(rt); 4887 break; 4888 } 4889 4890 return 0; 4891 } 4892 4893 void rt6_sync_down_dev(struct net_device *dev, unsigned long event) 4894 { 4895 struct arg_netdev_event arg = { 4896 .dev = dev, 4897 { 4898 .event = event, 4899 }, 4900 }; 4901 struct net *net = dev_net(dev); 4902 4903 if (net->ipv6.sysctl.skip_notify_on_dev_down) 4904 fib6_clean_all_skip_notify(net, fib6_ifdown, &arg); 4905 else 4906 fib6_clean_all(net, fib6_ifdown, &arg); 4907 } 4908 4909 void rt6_disable_ip(struct net_device *dev, unsigned long event) 4910 { 4911 rt6_sync_down_dev(dev, event); 4912 rt6_uncached_list_flush_dev(dev); 4913 neigh_ifdown(&nd_tbl, dev); 4914 } 4915 4916 struct rt6_mtu_change_arg { 4917 struct net_device *dev; 4918 unsigned int mtu; 4919 struct fib6_info *f6i; 4920 }; 4921 4922 static int fib6_nh_mtu_change(struct fib6_nh *nh, void *_arg) 4923 { 4924 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *)_arg; 4925 struct fib6_info *f6i = arg->f6i; 4926 4927 /* For administrative MTU increase, there is no way to discover 4928 * IPv6 PMTU increase, so PMTU increase should be updated here. 4929 * Since RFC 1981 doesn't include administrative MTU increase 4930 * update PMTU increase is a MUST. (i.e. 
jumbo frame) 4931 */ 4932 if (nh->fib_nh_dev == arg->dev) { 4933 struct inet6_dev *idev = __in6_dev_get(arg->dev); 4934 u32 mtu = f6i->fib6_pmtu; 4935 4936 if (mtu >= arg->mtu || 4937 (mtu < arg->mtu && mtu == idev->cnf.mtu6)) 4938 fib6_metric_set(f6i, RTAX_MTU, arg->mtu); 4939 4940 spin_lock_bh(&rt6_exception_lock); 4941 rt6_exceptions_update_pmtu(idev, nh, arg->mtu); 4942 spin_unlock_bh(&rt6_exception_lock); 4943 } 4944 4945 return 0; 4946 } 4947 4948 static int rt6_mtu_change_route(struct fib6_info *f6i, void *p_arg) 4949 { 4950 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; 4951 struct inet6_dev *idev; 4952 4953 /* In IPv6 pmtu discovery is not optional, 4954 so that RTAX_MTU lock cannot disable it. 4955 We still use this lock to block changes 4956 caused by addrconf/ndisc. 4957 */ 4958 4959 idev = __in6_dev_get(arg->dev); 4960 if (!idev) 4961 return 0; 4962 4963 if (fib6_metric_locked(f6i, RTAX_MTU)) 4964 return 0; 4965 4966 arg->f6i = f6i; 4967 if (f6i->nh) { 4968 /* fib6_nh_mtu_change only returns 0, so this is safe */ 4969 return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_mtu_change, 4970 arg); 4971 } 4972 4973 return fib6_nh_mtu_change(f6i->fib6_nh, arg); 4974 } 4975 4976 void rt6_mtu_change(struct net_device *dev, unsigned int mtu) 4977 { 4978 struct rt6_mtu_change_arg arg = { 4979 .dev = dev, 4980 .mtu = mtu, 4981 }; 4982 4983 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg); 4984 } 4985 4986 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { 4987 [RTA_UNSPEC] = { .strict_start_type = RTA_DPORT + 1 }, 4988 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) }, 4989 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) }, 4990 [RTA_OIF] = { .type = NLA_U32 }, 4991 [RTA_IIF] = { .type = NLA_U32 }, 4992 [RTA_PRIORITY] = { .type = NLA_U32 }, 4993 [RTA_METRICS] = { .type = NLA_NESTED }, 4994 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, 4995 [RTA_PREF] = { .type = NLA_U8 }, 4996 [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, 
4997 [RTA_ENCAP] = { .type = NLA_NESTED }, 4998 [RTA_EXPIRES] = { .type = NLA_U32 }, 4999 [RTA_UID] = { .type = NLA_U32 }, 5000 [RTA_MARK] = { .type = NLA_U32 }, 5001 [RTA_TABLE] = { .type = NLA_U32 }, 5002 [RTA_IP_PROTO] = { .type = NLA_U8 }, 5003 [RTA_SPORT] = { .type = NLA_U16 }, 5004 [RTA_DPORT] = { .type = NLA_U16 }, 5005 [RTA_NH_ID] = { .type = NLA_U32 }, 5006 }; 5007 5008 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, 5009 struct fib6_config *cfg, 5010 struct netlink_ext_ack *extack) 5011 { 5012 struct rtmsg *rtm; 5013 struct nlattr *tb[RTA_MAX+1]; 5014 unsigned int pref; 5015 int err; 5016 5017 err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, 5018 rtm_ipv6_policy, extack); 5019 if (err < 0) 5020 goto errout; 5021 5022 err = -EINVAL; 5023 rtm = nlmsg_data(nlh); 5024 5025 if (rtm->rtm_tos) { 5026 NL_SET_ERR_MSG(extack, 5027 "Invalid dsfield (tos): option not available for IPv6"); 5028 goto errout; 5029 } 5030 5031 *cfg = (struct fib6_config){ 5032 .fc_table = rtm->rtm_table, 5033 .fc_dst_len = rtm->rtm_dst_len, 5034 .fc_src_len = rtm->rtm_src_len, 5035 .fc_flags = RTF_UP, 5036 .fc_protocol = rtm->rtm_protocol, 5037 .fc_type = rtm->rtm_type, 5038 5039 .fc_nlinfo.portid = NETLINK_CB(skb).portid, 5040 .fc_nlinfo.nlh = nlh, 5041 .fc_nlinfo.nl_net = sock_net(skb->sk), 5042 }; 5043 5044 if (rtm->rtm_type == RTN_UNREACHABLE || 5045 rtm->rtm_type == RTN_BLACKHOLE || 5046 rtm->rtm_type == RTN_PROHIBIT || 5047 rtm->rtm_type == RTN_THROW) 5048 cfg->fc_flags |= RTF_REJECT; 5049 5050 if (rtm->rtm_type == RTN_LOCAL) 5051 cfg->fc_flags |= RTF_LOCAL; 5052 5053 if (rtm->rtm_flags & RTM_F_CLONED) 5054 cfg->fc_flags |= RTF_CACHE; 5055 5056 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK); 5057 5058 if (tb[RTA_NH_ID]) { 5059 if (tb[RTA_GATEWAY] || tb[RTA_OIF] || 5060 tb[RTA_MULTIPATH] || tb[RTA_ENCAP]) { 5061 NL_SET_ERR_MSG(extack, 5062 "Nexthop specification and nexthop id are mutually exclusive"); 5063 goto errout; 5064 } 5065 
cfg->fc_nh_id = nla_get_u32(tb[RTA_NH_ID]); 5066 } 5067 5068 if (tb[RTA_GATEWAY]) { 5069 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); 5070 cfg->fc_flags |= RTF_GATEWAY; 5071 } 5072 if (tb[RTA_VIA]) { 5073 NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute"); 5074 goto errout; 5075 } 5076 5077 if (tb[RTA_DST]) { 5078 int plen = (rtm->rtm_dst_len + 7) >> 3; 5079 5080 if (nla_len(tb[RTA_DST]) < plen) 5081 goto errout; 5082 5083 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen); 5084 } 5085 5086 if (tb[RTA_SRC]) { 5087 int plen = (rtm->rtm_src_len + 7) >> 3; 5088 5089 if (nla_len(tb[RTA_SRC]) < plen) 5090 goto errout; 5091 5092 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen); 5093 } 5094 5095 if (tb[RTA_PREFSRC]) 5096 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]); 5097 5098 if (tb[RTA_OIF]) 5099 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]); 5100 5101 if (tb[RTA_PRIORITY]) 5102 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]); 5103 5104 if (tb[RTA_METRICS]) { 5105 cfg->fc_mx = nla_data(tb[RTA_METRICS]); 5106 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]); 5107 } 5108 5109 if (tb[RTA_TABLE]) 5110 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]); 5111 5112 if (tb[RTA_MULTIPATH]) { 5113 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]); 5114 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); 5115 5116 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp, 5117 cfg->fc_mp_len, extack); 5118 if (err < 0) 5119 goto errout; 5120 } 5121 5122 if (tb[RTA_PREF]) { 5123 pref = nla_get_u8(tb[RTA_PREF]); 5124 if (pref != ICMPV6_ROUTER_PREF_LOW && 5125 pref != ICMPV6_ROUTER_PREF_HIGH) 5126 pref = ICMPV6_ROUTER_PREF_MEDIUM; 5127 cfg->fc_flags |= RTF_PREF(pref); 5128 } 5129 5130 if (tb[RTA_ENCAP]) 5131 cfg->fc_encap = tb[RTA_ENCAP]; 5132 5133 if (tb[RTA_ENCAP_TYPE]) { 5134 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); 5135 5136 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack); 5137 if (err < 0) 5138 goto errout; 5139 } 5140 5141 if (tb[RTA_EXPIRES]) { 5142 unsigned long timeout = 
addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ); 5143 5144 if (addrconf_finite_timeout(timeout)) { 5145 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ); 5146 cfg->fc_flags |= RTF_EXPIRES; 5147 } 5148 } 5149 5150 err = 0; 5151 errout: 5152 return err; 5153 } 5154 5155 struct rt6_nh { 5156 struct fib6_info *fib6_info; 5157 struct fib6_config r_cfg; 5158 struct list_head next; 5159 }; 5160 5161 static int ip6_route_info_append(struct net *net, 5162 struct list_head *rt6_nh_list, 5163 struct fib6_info *rt, 5164 struct fib6_config *r_cfg) 5165 { 5166 struct rt6_nh *nh; 5167 int err = -EEXIST; 5168 5169 list_for_each_entry(nh, rt6_nh_list, next) { 5170 /* check if fib6_info already exists */ 5171 if (rt6_duplicate_nexthop(nh->fib6_info, rt)) 5172 return err; 5173 } 5174 5175 nh = kzalloc(sizeof(*nh), GFP_KERNEL); 5176 if (!nh) 5177 return -ENOMEM; 5178 nh->fib6_info = rt; 5179 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); 5180 list_add_tail(&nh->next, rt6_nh_list); 5181 5182 return 0; 5183 } 5184 5185 static void ip6_route_mpath_notify(struct fib6_info *rt, 5186 struct fib6_info *rt_last, 5187 struct nl_info *info, 5188 __u16 nlflags) 5189 { 5190 /* if this is an APPEND route, then rt points to the first route 5191 * inserted and rt_last points to last route inserted. Userspace 5192 * wants a consistent dump of the route which starts at the first 5193 * nexthop. 
Since sibling routes are always added at the end of 5194 * the list, find the first sibling of the last route appended 5195 */ 5196 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) { 5197 rt = list_first_entry(&rt_last->fib6_siblings, 5198 struct fib6_info, 5199 fib6_siblings); 5200 } 5201 5202 if (rt) 5203 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); 5204 } 5205 5206 static bool ip6_route_mpath_should_notify(const struct fib6_info *rt) 5207 { 5208 bool rt_can_ecmp = rt6_qualify_for_ecmp(rt); 5209 bool should_notify = false; 5210 struct fib6_info *leaf; 5211 struct fib6_node *fn; 5212 5213 rcu_read_lock(); 5214 fn = rcu_dereference(rt->fib6_node); 5215 if (!fn) 5216 goto out; 5217 5218 leaf = rcu_dereference(fn->leaf); 5219 if (!leaf) 5220 goto out; 5221 5222 if (rt == leaf || 5223 (rt_can_ecmp && rt->fib6_metric == leaf->fib6_metric && 5224 rt6_qualify_for_ecmp(leaf))) 5225 should_notify = true; 5226 out: 5227 rcu_read_unlock(); 5228 5229 return should_notify; 5230 } 5231 5232 static int fib6_gw_from_attr(struct in6_addr *gw, struct nlattr *nla, 5233 struct netlink_ext_ack *extack) 5234 { 5235 if (nla_len(nla) < sizeof(*gw)) { 5236 NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_GATEWAY"); 5237 return -EINVAL; 5238 } 5239 5240 *gw = nla_get_in6_addr(nla); 5241 5242 return 0; 5243 } 5244 5245 static int ip6_route_multipath_add(struct fib6_config *cfg, 5246 struct netlink_ext_ack *extack) 5247 { 5248 struct fib6_info *rt_notif = NULL, *rt_last = NULL; 5249 struct nl_info *info = &cfg->fc_nlinfo; 5250 struct fib6_config r_cfg; 5251 struct rtnexthop *rtnh; 5252 struct fib6_info *rt; 5253 struct rt6_nh *err_nh; 5254 struct rt6_nh *nh, *nh_safe; 5255 __u16 nlflags; 5256 int remaining; 5257 int attrlen; 5258 int err = 1; 5259 int nhn = 0; 5260 int replace = (cfg->fc_nlinfo.nlh && 5261 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); 5262 LIST_HEAD(rt6_nh_list); 5263 5264 nlflags = replace ? 
NLM_F_REPLACE : NLM_F_CREATE; 5265 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND) 5266 nlflags |= NLM_F_APPEND; 5267 5268 remaining = cfg->fc_mp_len; 5269 rtnh = (struct rtnexthop *)cfg->fc_mp; 5270 5271 /* Parse a Multipath Entry and build a list (rt6_nh_list) of 5272 * fib6_info structs per nexthop 5273 */ 5274 while (rtnh_ok(rtnh, remaining)) { 5275 memcpy(&r_cfg, cfg, sizeof(*cfg)); 5276 if (rtnh->rtnh_ifindex) 5277 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 5278 5279 attrlen = rtnh_attrlen(rtnh); 5280 if (attrlen > 0) { 5281 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 5282 5283 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 5284 if (nla) { 5285 err = fib6_gw_from_attr(&r_cfg.fc_gateway, nla, 5286 extack); 5287 if (err) 5288 goto cleanup; 5289 5290 r_cfg.fc_flags |= RTF_GATEWAY; 5291 } 5292 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); 5293 5294 /* RTA_ENCAP_TYPE length checked in 5295 * lwtunnel_valid_encap_type_attr 5296 */ 5297 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); 5298 if (nla) 5299 r_cfg.fc_encap_type = nla_get_u16(nla); 5300 } 5301 5302 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK); 5303 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack); 5304 if (IS_ERR(rt)) { 5305 err = PTR_ERR(rt); 5306 rt = NULL; 5307 goto cleanup; 5308 } 5309 if (!rt6_qualify_for_ecmp(rt)) { 5310 err = -EINVAL; 5311 NL_SET_ERR_MSG(extack, 5312 "Device only routes can not be added for IPv6 using the multipath API."); 5313 fib6_info_release(rt); 5314 goto cleanup; 5315 } 5316 5317 rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1; 5318 5319 err = ip6_route_info_append(info->nl_net, &rt6_nh_list, 5320 rt, &r_cfg); 5321 if (err) { 5322 fib6_info_release(rt); 5323 goto cleanup; 5324 } 5325 5326 rtnh = rtnh_next(rtnh, &remaining); 5327 } 5328 5329 if (list_empty(&rt6_nh_list)) { 5330 NL_SET_ERR_MSG(extack, 5331 "Invalid nexthop configuration - no valid nexthops"); 5332 return -EINVAL; 5333 } 5334 5335 /* for add and replace send one notification with all 
nexthops. 5336 * Skip the notification in fib6_add_rt2node and send one with 5337 * the full route when done 5338 */ 5339 info->skip_notify = 1; 5340 5341 /* For add and replace, send one notification with all nexthops. For 5342 * append, send one notification with all appended nexthops. 5343 */ 5344 info->skip_notify_kernel = 1; 5345 5346 err_nh = NULL; 5347 list_for_each_entry(nh, &rt6_nh_list, next) { 5348 err = __ip6_ins_rt(nh->fib6_info, info, extack); 5349 5350 if (err) { 5351 if (replace && nhn) 5352 NL_SET_ERR_MSG_MOD(extack, 5353 "multipath route replace failed (check consistency of installed routes)"); 5354 err_nh = nh; 5355 goto add_errout; 5356 } 5357 /* save reference to last route successfully inserted */ 5358 rt_last = nh->fib6_info; 5359 5360 /* save reference to first route for notification */ 5361 if (!rt_notif) 5362 rt_notif = nh->fib6_info; 5363 5364 /* Because each route is added like a single route we remove 5365 * these flags after the first nexthop: if there is a collision, 5366 * we have already failed to add the first nexthop: 5367 * fib6_add_rt2node() has rejected it; when replacing, old 5368 * nexthops have been replaced by first new, the rest should 5369 * be added to it. 5370 */ 5371 if (cfg->fc_nlinfo.nlh) { 5372 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | 5373 NLM_F_REPLACE); 5374 cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_CREATE; 5375 } 5376 nhn++; 5377 } 5378 5379 /* An in-kernel notification should only be sent in case the new 5380 * multipath route is added as the first route in the node, or if 5381 * it was appended to it. We pass 'rt_notif' since it is the first 5382 * sibling and might allow us to skip some checks in the replace case. 
5383 */ 5384 if (ip6_route_mpath_should_notify(rt_notif)) { 5385 enum fib_event_type fib_event; 5386 5387 if (rt_notif->fib6_nsiblings != nhn - 1) 5388 fib_event = FIB_EVENT_ENTRY_APPEND; 5389 else 5390 fib_event = FIB_EVENT_ENTRY_REPLACE; 5391 5392 err = call_fib6_multipath_entry_notifiers(info->nl_net, 5393 fib_event, rt_notif, 5394 nhn - 1, extack); 5395 if (err) { 5396 /* Delete all the siblings that were just added */ 5397 err_nh = NULL; 5398 goto add_errout; 5399 } 5400 } 5401 5402 /* success ... tell user about new route */ 5403 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 5404 goto cleanup; 5405 5406 add_errout: 5407 /* send notification for routes that were added so that 5408 * the delete notifications sent by ip6_route_del are 5409 * coherent 5410 */ 5411 if (rt_notif) 5412 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 5413 5414 /* Delete routes that were already added */ 5415 list_for_each_entry(nh, &rt6_nh_list, next) { 5416 if (err_nh == nh) 5417 break; 5418 ip6_route_del(&nh->r_cfg, extack); 5419 } 5420 5421 cleanup: 5422 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { 5423 fib6_info_release(nh->fib6_info); 5424 list_del(&nh->next); 5425 kfree(nh); 5426 } 5427 5428 return err; 5429 } 5430 5431 static int ip6_route_multipath_del(struct fib6_config *cfg, 5432 struct netlink_ext_ack *extack) 5433 { 5434 struct fib6_config r_cfg; 5435 struct rtnexthop *rtnh; 5436 int last_err = 0; 5437 int remaining; 5438 int attrlen; 5439 int err; 5440 5441 remaining = cfg->fc_mp_len; 5442 rtnh = (struct rtnexthop *)cfg->fc_mp; 5443 5444 /* Parse a Multipath Entry */ 5445 while (rtnh_ok(rtnh, remaining)) { 5446 memcpy(&r_cfg, cfg, sizeof(*cfg)); 5447 if (rtnh->rtnh_ifindex) 5448 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 5449 5450 attrlen = rtnh_attrlen(rtnh); 5451 if (attrlen > 0) { 5452 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 5453 5454 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 5455 if (nla) { 5456 err = 
fib6_gw_from_attr(&r_cfg.fc_gateway, nla, 5457 extack); 5458 if (err) { 5459 last_err = err; 5460 goto next_rtnh; 5461 } 5462 5463 r_cfg.fc_flags |= RTF_GATEWAY; 5464 } 5465 } 5466 err = ip6_route_del(&r_cfg, extack); 5467 if (err) 5468 last_err = err; 5469 5470 next_rtnh: 5471 rtnh = rtnh_next(rtnh, &remaining); 5472 } 5473 5474 return last_err; 5475 } 5476 5477 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, 5478 struct netlink_ext_ack *extack) 5479 { 5480 struct fib6_config cfg; 5481 int err; 5482 5483 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 5484 if (err < 0) 5485 return err; 5486 5487 if (cfg.fc_nh_id && 5488 !nexthop_find_by_id(sock_net(skb->sk), cfg.fc_nh_id)) { 5489 NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); 5490 return -EINVAL; 5491 } 5492 5493 if (cfg.fc_mp) 5494 return ip6_route_multipath_del(&cfg, extack); 5495 else { 5496 cfg.fc_delete_all_nh = 1; 5497 return ip6_route_del(&cfg, extack); 5498 } 5499 } 5500 5501 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, 5502 struct netlink_ext_ack *extack) 5503 { 5504 struct fib6_config cfg; 5505 int err; 5506 5507 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 5508 if (err < 0) 5509 return err; 5510 5511 if (cfg.fc_metric == 0) 5512 cfg.fc_metric = IP6_RT_PRIO_USER; 5513 5514 if (cfg.fc_mp) 5515 return ip6_route_multipath_add(&cfg, extack); 5516 else 5517 return ip6_route_add(&cfg, GFP_KERNEL, extack); 5518 } 5519 5520 /* add the overhead of this fib6_nh to nexthop_len */ 5521 static int rt6_nh_nlmsg_size(struct fib6_nh *nh, void *arg) 5522 { 5523 int *nexthop_len = arg; 5524 5525 *nexthop_len += nla_total_size(0) /* RTA_MULTIPATH */ 5526 + NLA_ALIGN(sizeof(struct rtnexthop)) 5527 + nla_total_size(16); /* RTA_GATEWAY */ 5528 5529 if (nh->fib_nh_lws) { 5530 /* RTA_ENCAP_TYPE */ 5531 *nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws); 5532 /* RTA_ENCAP */ 5533 *nexthop_len += nla_total_size(2); 5534 } 5535 5536 return 0; 5537 } 5538 5539 
static size_t rt6_nlmsg_size(struct fib6_info *f6i) 5540 { 5541 int nexthop_len; 5542 5543 if (f6i->nh) { 5544 nexthop_len = nla_total_size(4); /* RTA_NH_ID */ 5545 nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_nlmsg_size, 5546 &nexthop_len); 5547 } else { 5548 struct fib6_info *sibling, *next_sibling; 5549 struct fib6_nh *nh = f6i->fib6_nh; 5550 5551 nexthop_len = 0; 5552 if (f6i->fib6_nsiblings) { 5553 rt6_nh_nlmsg_size(nh, &nexthop_len); 5554 5555 list_for_each_entry_safe(sibling, next_sibling, 5556 &f6i->fib6_siblings, fib6_siblings) { 5557 rt6_nh_nlmsg_size(sibling->fib6_nh, &nexthop_len); 5558 } 5559 } 5560 nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws); 5561 } 5562 5563 return NLMSG_ALIGN(sizeof(struct rtmsg)) 5564 + nla_total_size(16) /* RTA_SRC */ 5565 + nla_total_size(16) /* RTA_DST */ 5566 + nla_total_size(16) /* RTA_GATEWAY */ 5567 + nla_total_size(16) /* RTA_PREFSRC */ 5568 + nla_total_size(4) /* RTA_TABLE */ 5569 + nla_total_size(4) /* RTA_IIF */ 5570 + nla_total_size(4) /* RTA_OIF */ 5571 + nla_total_size(4) /* RTA_PRIORITY */ 5572 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */ 5573 + nla_total_size(sizeof(struct rta_cacheinfo)) 5574 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ 5575 + nla_total_size(1) /* RTA_PREF */ 5576 + nexthop_len; 5577 } 5578 5579 static int rt6_fill_node_nexthop(struct sk_buff *skb, struct nexthop *nh, 5580 unsigned char *flags) 5581 { 5582 if (nexthop_is_multipath(nh)) { 5583 struct nlattr *mp; 5584 5585 mp = nla_nest_start_noflag(skb, RTA_MULTIPATH); 5586 if (!mp) 5587 goto nla_put_failure; 5588 5589 if (nexthop_mpath_fill_node(skb, nh, AF_INET6)) 5590 goto nla_put_failure; 5591 5592 nla_nest_end(skb, mp); 5593 } else { 5594 struct fib6_nh *fib6_nh; 5595 5596 fib6_nh = nexthop_fib6_nh(nh); 5597 if (fib_nexthop_info(skb, &fib6_nh->nh_common, AF_INET6, 5598 flags, false) < 0) 5599 goto nla_put_failure; 5600 } 5601 5602 return 0; 5603 5604 nla_put_failure: 5605 return -EMSGSIZE; 5606 } 5607 5608 static int 
rt6_fill_node(struct net *net, struct sk_buff *skb,
	      struct fib6_info *rt, struct dst_entry *dst,
	      struct in6_addr *dest, struct in6_addr *src,
	      int iif, int type, u32 portid, u32 seq,
	      unsigned int flags)
{
	/*
	 * Serialise one route into a netlink message of type @type appended
	 * to @skb.
	 *
	 * @rt always supplies table, type, protocol, priority and prefsrc.
	 * When @dst is given, destination/source keys, flags, metrics,
	 * gateway, output device, encap and expiry are taken from it
	 * instead of from @rt.  @dest / @src, when non-NULL, are reported
	 * as /128 addresses.
	 *
	 * Returns 0 on success.  Returns -EMSGSIZE when the header or an
	 * attribute does not fit; after the header was added the partial
	 * message is cancelled first.
	 */
	struct rt6_info *rt6 = dst_rt6_info(dst);
	struct rt6key *rt6_dst, *rt6_src;
	u32 *pmetrics, table, rt6_flags;
	unsigned char nh_flags = 0;
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;
	long expires = 0;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	/* NOTE(review): rt6 is only non-NULL when a dst was passed in */
	if (rt6) {
		rt6_dst = &rt6->rt6i_dst;
		rt6_src = &rt6->rt6i_src;
		rt6_flags = rt6->rt6i_flags;
	} else {
		rt6_dst = &rt->fib6_dst;
		rt6_src = &rt->fib6_src;
		rt6_flags = rt->fib6_flags;
	}

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt6_dst->plen;
	rtm->rtm_src_len = rt6_src->plen;
	rtm->rtm_tos = 0;
	if (rt->fib6_table)
		table = rt->fib6_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	/* header field only carries ids below 256; RTA_TABLE has the full id */
	rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;

	rtm->rtm_type = rt->fib6_type;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->fib6_protocol;

	if (rt6_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	if (dest) {
		if (nla_put_in6_addr(skb, RTA_DST, dest))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			/* 0 ends the fill here, without nlmsg_end() */
			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dest) {
		struct in6_addr saddr_buf;
		/* report the source address selected for @dest, if any */
		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->fib6_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->fib6_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
		goto nla_put_failure;

	/* Nexthop information, one of four cases: a dst (gateway, oif and
	 * encap come from it), a multipath route with siblings, a route
	 * using a nexthop object, or a plain single-nexthop route.
	 */
	if (rt6) {
		if (rt6_flags & RTF_GATEWAY &&
		    nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
			goto nla_put_failure;

		if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
			goto nla_put_failure;

		if (dst->lwtstate &&
		    lwtunnel_fill_encap(skb, dst->lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
			goto nla_put_failure;
	} else if (rt->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;
		struct nlattr *mp;

		/* For multipath routes, walk the siblings list and add
		 * each as a nexthop within RTA_MULTIPATH.
		 */
		mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (fib_add_nexthop(skb, &rt->fib6_nh->nh_common,
				    rt->fib6_nh->fib_nh_weight, AF_INET6,
				    0) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings, fib6_siblings) {
			if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common,
					    sibling->fib6_nh->fib_nh_weight,
					    AF_INET6, 0) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else if (rt->nh) {
		if (nla_put_u32(skb, RTA_NH_ID, rt->nh->id))
			goto nla_put_failure;

		if (nexthop_is_blackhole(rt->nh))
			rtm->rtm_type = RTN_BLACKHOLE;

		/* legacy nexthop attributes only in compat mode */
		if (READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode) &&
		    rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0)
			goto nla_put_failure;

		rtm->rtm_flags |= nh_flags;
	} else {
		if (fib_nexthop_info(skb, &rt->fib6_nh->nh_common, AF_INET6,
				     &nh_flags, false) < 0)
			goto nla_put_failure;

		rtm->rtm_flags |= nh_flags;
	}

	if (rt6_flags & RTF_EXPIRES) {
		/* reported relative to the current jiffies */
		expires = dst ? dst->expires : rt->expires;
		expires -= jiffies;
	}

	/* hardware offload state is only reported for FIB entries */
	if (!dst) {
		if (READ_ONCE(rt->offload))
			rtm->rtm_flags |= RTM_F_OFFLOAD;
		if (READ_ONCE(rt->trap))
			rtm->rtm_flags |= RTM_F_TRAP;
		if (READ_ONCE(rt->offload_failed))
			rtm->rtm_flags |= RTM_F_OFFLOAD_FAILED;
	}

	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
		goto nla_put_failure;


	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

/*
 * Callback for nexthop_for_each_fib6_nh(): returns 1 when @nh uses the
 * net_device passed in @arg, 0 otherwise.
 */
static int fib6_info_nh_uses_dev(struct fib6_nh *nh, void *arg)
{
	const struct net_device *dev = arg;

	if (nh->fib_nh_dev == dev)
		return 1;

	return 0;
}

/*
 * True when any nexthop of @f6i (nexthop object, the route's own
 * nexthop, or one of its siblings) has @dev as its device.
 */
static bool fib6_info_uses_dev(const struct fib6_info *f6i,
			       const struct net_device *dev)
{
	if (f6i->nh) {
		/* the callback argument is a plain void *, drop the const */
		struct net_device *_dev = (struct net_device *)dev;

		return !!nexthop_for_each_fib6_nh(f6i->nh,
						  fib6_info_nh_uses_dev,
						  _dev);
	}

	if (f6i->fib6_nh->fib_nh_dev == dev)
		return true;

	if (f6i->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;

		list_for_each_entry_safe(sibling, next_sibling,
					 &f6i->fib6_siblings, fib6_siblings) {
			if (sibling->fib6_nh->fib_nh_dev == dev)
				return true;
		}
	}

	return false;
}

/* State shared by rt6_dump_route() and rt6_nh_dump_exceptions() */
struct fib6_nh_exception_dump_walker {
	struct rt6_rtnl_dump_arg *dump;	/* dump context (net, skb, cb) */
	struct fib6_info *rt;		/* route owning the exceptions */
	unsigned int flags;		/* netlink message flags */
	unsigned int skip;		/* entries still to be skipped */
	unsigned int count;		/* entries handled in this pass */
};

/*
 * Dump the exception entries of one nexthop.
 *
 * The first w->skip entries are skipped; every following entry is
 * counted in w->count and, unless expired, written with
 * rt6_fill_node().  Returns 0 when all buckets were walked (or there is
 * no bucket), otherwise the rt6_fill_node() error.
 */
static int rt6_nh_dump_exceptions(struct fib6_nh *nh, void *arg)
{
	struct fib6_nh_exception_dump_walker *w = arg;
	struct rt6_rtnl_dump_arg *dump = w->dump;
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i, err;

	bucket = fib6_nh_get_excptn_bucket(nh, NULL);
	if (!bucket)
		return 0;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			if (w->skip) {
				w->skip--;
				continue;
			}

			/* Expiration of entries doesn't bump sernum, insertion
			 * does. Removal is triggered by insertion, so we can
			 * rely on the fact that if entries change between two
			 * partial dumps, this node is scanned again completely,
			 * see rt6_insert_exception() and fib6_dump_table().
			 *
			 * Count expired entries we go through as handled
			 * entries that we'll skip next time, in case of partial
			 * node dump. Otherwise, if entries expire meanwhile,
			 * we'll skip the wrong amount.
			 */
			if (rt6_check_expired(rt6_ex->rt6i)) {
				w->count++;
				continue;
			}

			err = rt6_fill_node(dump->net, dump->skb, w->rt,
					    &rt6_ex->rt6i->dst, NULL, NULL, 0,
					    RTM_NEWROUTE,
					    NETLINK_CB(dump->cb->skb).portid,
					    dump->cb->nlh->nlmsg_seq, w->flags);
			if (err)
				return err;

			w->count++;
		}
		bucket++;
	}

	return 0;
}

/* Return -1 if done with node, number of handled routes on partial dump
 *
 * @p_arg is the struct rt6_rtnl_dump_arg of the dump; @skip is the
 * number of entries of this node already handled by an earlier pass.
 * Routes rejected by the dump filter also return -1.
 */
int rt6_dump_route(struct fib6_info *rt, void *p_arg, unsigned int skip)
{
	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
	struct fib_dump_filter *filter = &arg->filter;
	unsigned int flags = NLM_F_MULTI;
	struct net *net = arg->net;
	int count = 0;

	if (rt == net->ipv6.fib6_null_entry)
		return -1;

	if ((filter->flags & RTM_F_PREFIX) &&
	    !(rt->fib6_flags & RTF_PREFIX_RT)) {
		/* success since this is not a prefix route */
		return -1;
	}
	if (filter->filter_set &&
	    ((filter->rt_type  && rt->fib6_type != filter->rt_type) ||
	     (filter->dev      && !fib6_info_uses_dev(rt, filter->dev)) ||
	     (filter->protocol && rt->fib6_protocol != filter->protocol))) {
		return -1;
	}

	if (filter->filter_set ||
	    !filter->dump_routes || !filter->dump_exceptions) {
		flags |= NLM_F_DUMP_FILTERED;
	}

	if (filter->dump_routes) {
		if (skip) {
			/* the route itself was written by an earlier pass */
			skip--;
		} else {
			if (rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL,
					  0, RTM_NEWROUTE,
					  NETLINK_CB(arg->cb->skb).portid,
					  arg->cb->nlh->nlmsg_seq, flags)) {
				/* nothing handled for this node yet */
				return 0;
			}
			count++;
		}
	}

	if (filter->dump_exceptions) {
		struct fib6_nh_exception_dump_walker w = { .dump = arg,
							   .rt = rt,
							   .flags = flags,
							   .skip = skip,
							   .count = 0 };
		int err;

		rcu_read_lock();
		if (rt->nh) {
			err = nexthop_for_each_fib6_nh(rt->nh,
						       rt6_nh_dump_exceptions,
						       &w);
		} else {
			err = rt6_nh_dump_exceptions(rt->fib6_nh, &w);
		}
		rcu_read_unlock();

		/* partial dump: report how far this pass got */
		if (err)
			return count + w.count;
	}

	return -1;
}

/*
 * Validate an RTM_GETROUTE request and parse its attributes into @tb.
 *
 * Sockets without strict checking get the deprecated, lenient parse.
 * With strict checking the header may only carry /128 (or zero) prefix
 * lengths and the RTM_F_FIB_MATCH flag, and only the attributes listed
 * in the switch below are accepted.
 */
static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
					const struct nlmsghdr *nlh,
					struct nlattr **tb,
					struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	int i, err;

	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
		NL_SET_ERR_MSG_MOD(extack,
				   "Invalid header for get route request");
		return -EINVAL;
	}

	if (!netlink_strict_get_check(skb))
		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
					      rtm_ipv6_policy, extack);

	rtm = nlmsg_data(nlh);
	if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
	    rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
	    rtm->rtm_type) {
		NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
		return -EINVAL;
	}
	if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
		NL_SET_ERR_MSG_MOD(extack,
				   "Invalid flags for get route request");
		return -EINVAL;
	}

err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
					    rtm_ipv6_policy, extack);
	if (err)
		return err;

	/* an address attribute needs a non-zero prefix length in the header */
	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
		NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
		return -EINVAL;
	}

	/* reject every attribute a get route request does not use */
	for (i = 0; i <= RTA_MAX; i++) {
		if (!tb[i])
			continue;

		switch (i) {
		case RTA_SRC:
		case RTA_DST:
		case RTA_IIF:
		case RTA_OIF:
		case RTA_MARK:
		case RTA_UID:
		case RTA_SPORT:
		case RTA_DPORT:
		case RTA_IP_PROTO:
			break;
		default:
			NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
			return -EINVAL;
		}
	}

	return 0;
}

/*
 * RTM_GETROUTE handler (registered in ip6_route_init()).
 *
 * Builds a flow from the request attributes, performs an input lookup
 * when RTA_IIF is given and an output lookup otherwise, and unicasts
 * the result back to the requester.  With RTM_F_FIB_MATCH the matching
 * FIB entry is reported instead of the resolved dst.
 */
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	int err, iif = 0, oif = 0;
	struct fib6_info *from;
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6 = {};
	bool fibmatch;

	err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
	if (err < 0)
		goto errout;

	/* default error for the attribute length checks below */
	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		/* input lookups carry no uid unless one was requested */
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

	if (tb[RTA_SPORT])
		fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);

	if (tb[RTA_DPORT])
		fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);

	if (tb[RTA_IP_PROTO]) {
		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
						  &fl6.flowi6_proto, AF_INET6,
						  extack);
		if (err)
			goto errout;
	}

	if (iif) {
		struct net_device *dev;
		int flags = 0;

		/* dev is only looked up and used under RCU */
		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			rcu_read_unlock();
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);

		rcu_read_unlock();
	} else {
		fl6.flowi6_oif = oif;

		dst = ip6_route_output(net, NULL, &fl6);
	}


	rt = dst_rt6_info(dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	/* attach the route to the skb; no explicit ip6_rt_put() after this */
	skb_dst_set(skb, &rt->dst);

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	if (from) {
		if (fibmatch)
			err = rt6_fill_node(net, skb, from, NULL, NULL, NULL,
					    iif, RTM_NEWROUTE,
					    NETLINK_CB(in_skb).portid,
					    nlh->nlmsg_seq, 0);
		else
			err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
					    &fl6.saddr, iif, RTM_NEWROUTE,
					    NETLINK_CB(in_skb).portid,
					    nlh->nlmsg_seq, 0);
	} else {
		err = -ENETUNREACH;
	}
rcu_read_unlock();

	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}

/*
 * Send a route notification of type @event for @rt to the
 * RTNLGRP_IPV6_ROUTE group.  If the message cannot be allocated or
 * filled, the error is recorded with rtnl_set_sk_err() instead.
 */
void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
		     unsigned int nlm_flags)
{
	struct sk_buff *skb;
	struct net *net = info->nl_net;
	u32 seq;
	int err;

	err = -ENOBUFS;
	seq = info->nlh ? info->nlh->nlmsg_seq : 0;

	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
	if (!skb)
		goto errout;

	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
			    event, info->portid, seq, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}

/*
 * Notify RTNLGRP_IPV6_ROUTE listeners that @rt changed: an RTM_NEWROUTE
 * message flagged NLM_F_REPLACE.  Same error handling as
 * inet6_rt_notify().
 */
void fib6_rt_update(struct net *net, struct fib6_info *rt,
		    struct nl_info *info)
{
	u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
	struct sk_buff *skb;
	int err = -ENOBUFS;

	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
	if (!skb)
		goto errout;

	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
			    RTM_NEWROUTE, info->portid, seq, NLM_F_REPLACE);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}

/*
 * Update the offload / trap / offload_failed flags of @f6i and, subject
 * to the fib_notify_on_flag_change sysctl, send an RTM_NEWROUTE
 * notification for the route.
 *
 * Nothing is done when none of the three flags changes.  No
 * notification is sent when the route is no longer attached to a FIB
 * node or when the sysctl is 0.
 */
void fib6_info_hw_flags_set(struct net *net, struct fib6_info *f6i,
			    bool offload, bool trap, bool offload_failed)
{
	struct sk_buff *skb;
	int err;

	if (READ_ONCE(f6i->offload) == offload &&
	    READ_ONCE(f6i->trap) == trap &&
	    READ_ONCE(f6i->offload_failed) == offload_failed)
		return;

	WRITE_ONCE(f6i->offload, offload);
	WRITE_ONCE(f6i->trap, trap);

	/* 2 means send notifications only if offload_failed was changed. */
	if (net->ipv6.sysctl.fib_notify_on_flag_change == 2 &&
	    READ_ONCE(f6i->offload_failed) == offload_failed)
		return;

	WRITE_ONCE(f6i->offload_failed, offload_failed);

	if (!rcu_access_pointer(f6i->fib6_node))
		/* The route was removed from the tree, do not send
		 * notification.
		 */
		return;

	if (!net->ipv6.sysctl.fib_notify_on_flag_change)
		return;

	skb = nlmsg_new(rt6_nlmsg_size(f6i), GFP_KERNEL);
	if (!skb) {
		err = -ENOBUFS;
		goto errout;
	}

	err = rt6_fill_node(net, skb, f6i, NULL, NULL, NULL, 0, RTM_NEWROUTE, 0,
			    0, 0);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}

	rtnl_notify(skb, net, 0, RTNLGRP_IPV6_ROUTE, NULL, GFP_KERNEL);
	return;

errout:
	rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}
EXPORT_SYMBOL(fib6_info_hw_flags_set);

/*
 * Netdevice notifier: binds the per-netns special route entries to the
 * loopback device when it registers, and drops their inet6_dev
 * references when it unregisters.  Events for other devices are
 * ignored.
 */
static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		net->ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = dev;
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	 } else if (event == NETDEV_UNREGISTER &&
		    dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER could be fired for multiple times by
		 * netdev_wait_allrefs(). Make sure we only call this once.
		 */
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}

/*
 *	/proc
 */

#ifdef CONFIG_PROC_FS
/* Print the seven routing statistics counters as hex fields on one line */
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;
	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);

	return 0;
}
#endif	/* CONFIG_PROC_FS */

#ifdef CONFIG_SYSCTL

/*
 * Handler of the write-only "flush" sysctl: stores the written value
 * and runs the FIB garbage collector.  Reads fail with -EINVAL.
 *
 * NOTE(review): 'delay' is sampled before proc_dointvec() stores the
 * new value, so the gc call below uses the previously stored
 * flush_delay - confirm this is intended.
 */
static int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
			      void *buffer, size_t *lenp, loff_t *ppos)
{
	struct net *net;
	int delay;
	int ret;
	if (!write)
		return -EINVAL;

	/* extra1 carries the owning netns, see ipv6_route_sysctl_init() */
	net = (struct net *)ctl->extra1;
	delay = net->ipv6.sysctl.flush_delay;
	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
	if (ret)
		return ret;

	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
	return 0;
}

/*
 * Template of the per-netns route sysctls.  The .data pointers refer to
 * init_net and are re-pointed by index in ipv6_route_sysctl_init(), so
 * the entry order here must match the indices used there.
 */
static struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		/* same variable as gc_min_interval, exposed in milliseconds */
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{
		.procname	=	"skip_notify_on_dev_down",
		.data		=	&init_net.ipv6.sysctl.skip_notify_on_dev_down,
		.maxlen		=	sizeof(u8),
		.mode		=	0644,
		.proc_handler	=	proc_dou8vec_minmax,
		.extra1		=	SYSCTL_ZERO,
		.extra2		=	SYSCTL_ONE,
	},
};

/*
 * Duplicate the sysctl template for @net and point every entry at the
 * namespace's own variables.  Returns NULL when the copy cannot be
 * allocated.
 */
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		/* indices follow the order of ipv6_route_table_template */
		table[0].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.flush_delay;
		/* lets ipv6_sysctl_rtcache_flush() find its netns */
		table[2].extra1 = net;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;
	}

	return table;
}

/*
 * Table size to use for @net: the full template for namespaces owned
 * by init_user_ns, 1 for all others.
 */
size_t ipv6_route_sysctl_table_size(struct net *net)
{
	/* Don't export sysctls to unprivileged users */
	if (net->user_ns != &init_user_ns)
		return 1;

	return ARRAY_SIZE(ipv6_route_table_template);
}
#endif

/*
 * Per-netns init: copies the dst_ops template, allocates the special
 * route entries from their templates and sets the sysctl defaults.
 * Returns 0, or -ENOMEM after freeing whatever was already allocated.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.fib6_null_entry = fib6_info_alloc(GFP_KERNEL, true);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;
	memcpy(net->ipv6.fib6_null_entry, &fib6_null_entry_template,
	       sizeof(*net->ipv6.fib6_null_entry));

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);
	INIT_LIST_HEAD(&net->ipv6.ip6_null_entry->dst.rt_uncached);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);
	INIT_LIST_HEAD(&net->ipv6.ip6_prohibit_entry->dst.rt_uncached);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
	INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->dst.rt_uncached);
#ifdef CONFIG_IPV6_SUBTREES
	net->ipv6.fib6_routes_require_src = 0;
#endif
#endif

	/* sysctl defaults; time values are in jiffies */
	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = INT_MAX;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
	net->ipv6.sysctl.skip_notify_on_dev_down = 0;

	atomic_set(&net->ipv6.ip6_rt_gc_expire, 30*HZ);

	ret = 0;
out:
	return ret;

	/* error unwinding, in reverse order of allocation */
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}

/* Per-netns exit: frees what ip6_route_net_init() allocated */
static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.fib6_null_entry);
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}

/*
 * Late per-netns init: creates the "ipv6_route" and "rt6_stats" proc
 * entries (CONFIG_PROC_FS only).  If the second cannot be created the
 * first is removed again and -ENOMEM is returned.
 */
static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	if (!proc_create_net("ipv6_route", 0, net->proc_net,
			     &ipv6_route_seq_ops,
			     sizeof(struct ipv6_route_iter)))
		return -ENOMEM;

	if (!proc_create_net_single("rt6_stats", 0444, net->proc_net,
				    rt6_stats_seq_show, NULL)) {
		remove_proc_entry("ipv6_route", net->proc_net);
		return -ENOMEM;
	}
#endif
	return 0;
}

/* Late per-netns exit: removes the proc entries created above */
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}

static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};

/* Allocate and initialise the inet_peer base of @net */
static int __net_init ipv6_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv6.peers = bp;
	return 0;
}

static void
__net_exit ipv6_inetpeer_exit(struct net *net) 6607 { 6608 struct inet_peer_base *bp = net->ipv6.peers; 6609 6610 net->ipv6.peers = NULL; 6611 inetpeer_invalidate_tree(bp); 6612 kfree(bp); 6613 } 6614 6615 static struct pernet_operations ipv6_inetpeer_ops = { 6616 .init = ipv6_inetpeer_init, 6617 .exit = ipv6_inetpeer_exit, 6618 }; 6619 6620 static struct pernet_operations ip6_route_net_late_ops = { 6621 .init = ip6_route_net_init_late, 6622 .exit = ip6_route_net_exit_late, 6623 }; 6624 6625 static struct notifier_block ip6_route_dev_notifier = { 6626 .notifier_call = ip6_route_dev_notify, 6627 .priority = ADDRCONF_NOTIFY_PRIORITY - 10, 6628 }; 6629 6630 void __init ip6_route_init_special_entries(void) 6631 { 6632 /* Registering of the loopback is done before this portion of code, 6633 * the loopback reference in rt6_info will not be taken, do it 6634 * manually for init_net */ 6635 init_net.ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = init_net.loopback_dev; 6636 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; 6637 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 6638 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 6639 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev; 6640 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 6641 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev; 6642 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 6643 #endif 6644 } 6645 6646 #if IS_BUILTIN(CONFIG_IPV6) 6647 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 6648 DEFINE_BPF_ITER_FUNC(ipv6_route, struct bpf_iter_meta *meta, struct fib6_info *rt) 6649 6650 BTF_ID_LIST(btf_fib6_info_id) 6651 BTF_ID(struct, fib6_info) 6652 6653 static const struct bpf_iter_seq_info ipv6_route_seq_info = { 6654 .seq_ops = &ipv6_route_seq_ops, 6655 .init_seq_private = bpf_iter_init_seq_net, 6656 .fini_seq_private = bpf_iter_fini_seq_net, 6657 .seq_priv_size = 
sizeof(struct ipv6_route_iter), 6658 }; 6659 6660 static struct bpf_iter_reg ipv6_route_reg_info = { 6661 .target = "ipv6_route", 6662 .ctx_arg_info_size = 1, 6663 .ctx_arg_info = { 6664 { offsetof(struct bpf_iter__ipv6_route, rt), 6665 PTR_TO_BTF_ID_OR_NULL }, 6666 }, 6667 .seq_info = &ipv6_route_seq_info, 6668 }; 6669 6670 static int __init bpf_iter_register(void) 6671 { 6672 ipv6_route_reg_info.ctx_arg_info[0].btf_id = *btf_fib6_info_id; 6673 return bpf_iter_reg_target(&ipv6_route_reg_info); 6674 } 6675 6676 static void bpf_iter_unregister(void) 6677 { 6678 bpf_iter_unreg_target(&ipv6_route_reg_info); 6679 } 6680 #endif 6681 #endif 6682 6683 int __init ip6_route_init(void) 6684 { 6685 int ret; 6686 int cpu; 6687 6688 ret = -ENOMEM; 6689 ip6_dst_ops_template.kmem_cachep = 6690 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0, 6691 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); 6692 if (!ip6_dst_ops_template.kmem_cachep) 6693 goto out; 6694 6695 ret = dst_entries_init(&ip6_dst_blackhole_ops); 6696 if (ret) 6697 goto out_kmem_cache; 6698 6699 ret = register_pernet_subsys(&ipv6_inetpeer_ops); 6700 if (ret) 6701 goto out_dst_entries; 6702 6703 ret = register_pernet_subsys(&ip6_route_net_ops); 6704 if (ret) 6705 goto out_register_inetpeer; 6706 6707 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep; 6708 6709 ret = fib6_init(); 6710 if (ret) 6711 goto out_register_subsys; 6712 6713 ret = xfrm6_init(); 6714 if (ret) 6715 goto out_fib6_init; 6716 6717 ret = fib6_rules_init(); 6718 if (ret) 6719 goto xfrm6_init; 6720 6721 ret = register_pernet_subsys(&ip6_route_net_late_ops); 6722 if (ret) 6723 goto fib6_rules_init; 6724 6725 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE, 6726 inet6_rtm_newroute, NULL, 0); 6727 if (ret < 0) 6728 goto out_register_late_subsys; 6729 6730 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE, 6731 inet6_rtm_delroute, NULL, 0); 6732 if (ret < 0) 6733 goto out_register_late_subsys; 6734 
6735 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, 6736 inet6_rtm_getroute, NULL, 6737 RTNL_FLAG_DOIT_UNLOCKED); 6738 if (ret < 0) 6739 goto out_register_late_subsys; 6740 6741 ret = register_netdevice_notifier(&ip6_route_dev_notifier); 6742 if (ret) 6743 goto out_register_late_subsys; 6744 6745 #if IS_BUILTIN(CONFIG_IPV6) 6746 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 6747 ret = bpf_iter_register(); 6748 if (ret) 6749 goto out_register_late_subsys; 6750 #endif 6751 #endif 6752 6753 for_each_possible_cpu(cpu) { 6754 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); 6755 6756 INIT_LIST_HEAD(&ul->head); 6757 INIT_LIST_HEAD(&ul->quarantine); 6758 spin_lock_init(&ul->lock); 6759 } 6760 6761 out: 6762 return ret; 6763 6764 out_register_late_subsys: 6765 rtnl_unregister_all(PF_INET6); 6766 unregister_pernet_subsys(&ip6_route_net_late_ops); 6767 fib6_rules_init: 6768 fib6_rules_cleanup(); 6769 xfrm6_init: 6770 xfrm6_fini(); 6771 out_fib6_init: 6772 fib6_gc_cleanup(); 6773 out_register_subsys: 6774 unregister_pernet_subsys(&ip6_route_net_ops); 6775 out_register_inetpeer: 6776 unregister_pernet_subsys(&ipv6_inetpeer_ops); 6777 out_dst_entries: 6778 dst_entries_destroy(&ip6_dst_blackhole_ops); 6779 out_kmem_cache: 6780 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); 6781 goto out; 6782 } 6783 6784 void ip6_route_cleanup(void) 6785 { 6786 #if IS_BUILTIN(CONFIG_IPV6) 6787 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 6788 bpf_iter_unregister(); 6789 #endif 6790 #endif 6791 unregister_netdevice_notifier(&ip6_route_dev_notifier); 6792 unregister_pernet_subsys(&ip6_route_net_late_ops); 6793 fib6_rules_cleanup(); 6794 xfrm6_fini(); 6795 fib6_gc_cleanup(); 6796 unregister_pernet_subsys(&ipv6_inetpeer_ops); 6797 unregister_pernet_subsys(&ip6_route_net_ops); 6798 dst_entries_destroy(&ip6_dst_blackhole_ops); 6799 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); 6800 } 6801