1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Linux INET6 implementation 4 * FIB front-end. 5 * 6 * Authors: 7 * Pedro Roque <roque@di.fc.ul.pt> 8 */ 9 10 /* Changes: 11 * 12 * YOSHIFUJI Hideaki @USAGI 13 * reworked default router selection. 14 * - respect outgoing interface 15 * - select from (probably) reachable routers (i.e. 16 * routers in REACHABLE, STALE, DELAY or PROBE states). 17 * - always select the same router if it is (probably) 18 * reachable. otherwise, round-robin the list. 19 * Ville Nuorvala 20 * Fixed routing subtrees. 21 */ 22 23 #define pr_fmt(fmt) "IPv6: " fmt 24 25 #include <linux/capability.h> 26 #include <linux/errno.h> 27 #include <linux/export.h> 28 #include <linux/types.h> 29 #include <linux/times.h> 30 #include <linux/socket.h> 31 #include <linux/sockios.h> 32 #include <linux/net.h> 33 #include <linux/route.h> 34 #include <linux/netdevice.h> 35 #include <linux/in6.h> 36 #include <linux/mroute6.h> 37 #include <linux/init.h> 38 #include <linux/if_arp.h> 39 #include <linux/proc_fs.h> 40 #include <linux/seq_file.h> 41 #include <linux/nsproxy.h> 42 #include <linux/slab.h> 43 #include <linux/jhash.h> 44 #include <linux/siphash.h> 45 #include <net/net_namespace.h> 46 #include <net/snmp.h> 47 #include <net/ipv6.h> 48 #include <net/ip6_fib.h> 49 #include <net/ip6_route.h> 50 #include <net/ndisc.h> 51 #include <net/addrconf.h> 52 #include <net/tcp.h> 53 #include <linux/rtnetlink.h> 54 #include <net/dst.h> 55 #include <net/dst_metadata.h> 56 #include <net/xfrm.h> 57 #include <net/netevent.h> 58 #include <net/netlink.h> 59 #include <net/rtnh.h> 60 #include <net/lwtunnel.h> 61 #include <net/ip_tunnels.h> 62 #include <net/l3mdev.h> 63 #include <net/ip.h> 64 #include <linux/uaccess.h> 65 #include <linux/btf_ids.h> 66 67 #ifdef CONFIG_SYSCTL 68 #include <linux/sysctl.h> 69 #endif 70 71 static int ip6_rt_type_to_error(u8 fib6_type); 72 73 #define CREATE_TRACE_POINTS 74 #include <trace/events/fib6.h> 75 
EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup); 76 #undef CREATE_TRACE_POINTS 77 78 enum rt6_nud_state { 79 RT6_NUD_FAIL_HARD = -3, 80 RT6_NUD_FAIL_PROBE = -2, 81 RT6_NUD_FAIL_DO_RR = -1, 82 RT6_NUD_SUCCEED = 1 83 }; 84 85 INDIRECT_CALLABLE_SCOPE 86 struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie); 87 static unsigned int ip6_default_advmss(const struct dst_entry *dst); 88 INDIRECT_CALLABLE_SCOPE 89 unsigned int ip6_mtu(const struct dst_entry *dst); 90 static struct dst_entry *ip6_negative_advice(struct dst_entry *); 91 static void ip6_dst_destroy(struct dst_entry *); 92 static void ip6_dst_ifdown(struct dst_entry *, 93 struct net_device *dev); 94 static void ip6_dst_gc(struct dst_ops *ops); 95 96 static int ip6_pkt_discard(struct sk_buff *skb); 97 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb); 98 static int ip6_pkt_prohibit(struct sk_buff *skb); 99 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb); 100 static void ip6_link_failure(struct sk_buff *skb); 101 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, 102 struct sk_buff *skb, u32 mtu, 103 bool confirm_neigh); 104 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, 105 struct sk_buff *skb); 106 static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif, 107 int strict); 108 static size_t rt6_nlmsg_size(struct fib6_info *f6i); 109 static int rt6_fill_node(struct net *net, struct sk_buff *skb, 110 struct fib6_info *rt, struct dst_entry *dst, 111 struct in6_addr *dest, struct in6_addr *src, 112 int iif, int type, u32 portid, u32 seq, 113 unsigned int flags); 114 static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res, 115 const struct in6_addr *daddr, 116 const struct in6_addr *saddr); 117 118 #ifdef CONFIG_IPV6_ROUTE_INFO 119 static struct fib6_info *rt6_add_route_info(struct net *net, 120 const struct in6_addr *prefix, int prefixlen, 121 const 
struct in6_addr *gwaddr, 122 struct net_device *dev, 123 unsigned int pref); 124 static struct fib6_info *rt6_get_route_info(struct net *net, 125 const struct in6_addr *prefix, int prefixlen, 126 const struct in6_addr *gwaddr, 127 struct net_device *dev); 128 #endif 129 130 struct uncached_list { 131 spinlock_t lock; 132 struct list_head head; 133 struct list_head quarantine; 134 }; 135 136 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list); 137 138 void rt6_uncached_list_add(struct rt6_info *rt) 139 { 140 struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list); 141 142 rt->dst.rt_uncached_list = ul; 143 144 spin_lock_bh(&ul->lock); 145 list_add_tail(&rt->dst.rt_uncached, &ul->head); 146 spin_unlock_bh(&ul->lock); 147 } 148 149 void rt6_uncached_list_del(struct rt6_info *rt) 150 { 151 if (!list_empty(&rt->dst.rt_uncached)) { 152 struct uncached_list *ul = rt->dst.rt_uncached_list; 153 154 spin_lock_bh(&ul->lock); 155 list_del_init(&rt->dst.rt_uncached); 156 spin_unlock_bh(&ul->lock); 157 } 158 } 159 160 static void rt6_uncached_list_flush_dev(struct net_device *dev) 161 { 162 int cpu; 163 164 for_each_possible_cpu(cpu) { 165 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); 166 struct rt6_info *rt, *safe; 167 168 if (list_empty(&ul->head)) 169 continue; 170 171 spin_lock_bh(&ul->lock); 172 list_for_each_entry_safe(rt, safe, &ul->head, dst.rt_uncached) { 173 struct inet6_dev *rt_idev = rt->rt6i_idev; 174 struct net_device *rt_dev = rt->dst.dev; 175 bool handled = false; 176 177 if (rt_idev->dev == dev) { 178 rt->rt6i_idev = in6_dev_get(blackhole_netdev); 179 in6_dev_put(rt_idev); 180 handled = true; 181 } 182 183 if (rt_dev == dev) { 184 rt->dst.dev = blackhole_netdev; 185 netdev_ref_replace(rt_dev, blackhole_netdev, 186 &rt->dst.dev_tracker, 187 GFP_ATOMIC); 188 handled = true; 189 } 190 if (handled) 191 list_move(&rt->dst.rt_uncached, 192 &ul->quarantine); 193 } 194 spin_unlock_bh(&ul->lock); 195 } 196 } 197 198 static inline 
const void *choose_neigh_daddr(const struct in6_addr *p, 199 struct sk_buff *skb, 200 const void *daddr) 201 { 202 if (!ipv6_addr_any(p)) 203 return (const void *) p; 204 else if (skb) 205 return &ipv6_hdr(skb)->daddr; 206 return daddr; 207 } 208 209 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw, 210 struct net_device *dev, 211 struct sk_buff *skb, 212 const void *daddr) 213 { 214 struct neighbour *n; 215 216 daddr = choose_neigh_daddr(gw, skb, daddr); 217 n = __ipv6_neigh_lookup(dev, daddr); 218 if (n) 219 return n; 220 221 n = neigh_create(&nd_tbl, daddr, dev); 222 return IS_ERR(n) ? NULL : n; 223 } 224 225 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst, 226 struct sk_buff *skb, 227 const void *daddr) 228 { 229 const struct rt6_info *rt = container_of(dst, struct rt6_info, dst); 230 231 return ip6_neigh_lookup(rt6_nexthop(rt, &in6addr_any), 232 dst->dev, skb, daddr); 233 } 234 235 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr) 236 { 237 struct net_device *dev = dst->dev; 238 struct rt6_info *rt = (struct rt6_info *)dst; 239 240 daddr = choose_neigh_daddr(rt6_nexthop(rt, &in6addr_any), NULL, daddr); 241 if (!daddr) 242 return; 243 if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) 244 return; 245 if (ipv6_addr_is_multicast((const struct in6_addr *)daddr)) 246 return; 247 __ipv6_confirm_neigh(dev, daddr); 248 } 249 250 static struct dst_ops ip6_dst_ops_template = { 251 .family = AF_INET6, 252 .gc = ip6_dst_gc, 253 .gc_thresh = 1024, 254 .check = ip6_dst_check, 255 .default_advmss = ip6_default_advmss, 256 .mtu = ip6_mtu, 257 .cow_metrics = dst_cow_metrics_generic, 258 .destroy = ip6_dst_destroy, 259 .ifdown = ip6_dst_ifdown, 260 .negative_advice = ip6_negative_advice, 261 .link_failure = ip6_link_failure, 262 .update_pmtu = ip6_rt_update_pmtu, 263 .redirect = rt6_do_redirect, 264 .local_out = __ip6_local_out, 265 .neigh_lookup = ip6_dst_neigh_lookup, 266 .confirm_neigh = ip6_confirm_neigh, 267 }; 
/*
 * dst_ops variant that sends PMTU updates, redirects and MTU queries to the
 * generic dst_blackhole_* helpers while sharing the remaining callbacks with
 * the regular IPv6 ops.
 */
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			= AF_INET6,
	.default_advmss		= ip6_default_advmss,
	.neigh_lookup		= ip6_dst_neigh_lookup,
	.check			= ip6_dst_check,
	.destroy		= ip6_dst_destroy,
	.cow_metrics		= dst_cow_metrics_generic,
	.update_pmtu		= dst_blackhole_update_pmtu,
	.redirect		= dst_blackhole_redirect,
	.mtu			= dst_blackhole_mtu,
};

/* Metrics template: every entry is zero; only RTAX_HOPLIMIT is spelled out. */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

/*
 * Template for the "no route" fib6_info: a reject entry of type
 * RTN_UNREACHABLE carrying the largest possible metric and the default
 * dst metrics.
 */
static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol  = RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= REFCOUNT_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};

/*
 * Template for the "no route" dst: reports -ENETUNREACH and hands packets
 * to the ip6_pkt_discard handlers in both directions.
 */
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__rcuref	= RCUREF_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

/* Template for the "prohibit" dst: reports -EACCES via ip6_pkt_prohibit. */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__rcuref	= RCUREF_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

/* Template for the "blackhole" dst: reports -EINVAL and discards packets. */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__rcuref	= RCUREF_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif

/* Zero every field of @rt that follows the embedded dst member. */
static void rt6_info_init(struct rt6_info *rt)
{
	memset_after(rt, 0, dst);
}

/*
 * Allocate a dst from the namespace's ip6_dst_ops.
 *
 * On success the rt6_info-specific part is zeroed and the namespace's
 * fib_rt_alloc counter is incremented.  Returns NULL when dst_alloc() fails.
 */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);

/*
 * dst_ops->destroy: release everything the rt6_info holds - its metrics,
 * its place on the uncached list, the inet6_dev reference and the reference
 * on the fib6_info it was created from.
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	ip_dst_metrics_put(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		/* clear the field before dropping the reference */
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	/* atomically take over rt->from so it is released exactly once here */
	from = xchg((__force struct fib6_info **)&rt->from, NULL);
	fib6_info_release(from);
}

/*
 * dst_ops->ifdown: swap the route's inet6_dev for the one belonging to
 * blackhole_netdev.  The old reference is only dropped when the replacement
 * could be obtained; otherwise the route is left unchanged.
 */
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;

	if (idev && idev->dev != blackhole_netdev) {
		struct inet6_dev *blackhole_idev = in6_dev_get(blackhole_netdev);

		if (blackhole_idev) {
			rt->rt6i_idev = blackhole_idev;
			in6_dev_put(idev);
		}
	}
}

/*
 * Return true when @rt carries RTF_EXPIRES and its expiry time has passed.
 * Only the route's own timer is examined.
 */
static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

/*
 * Return true when @rt should be treated as expired.
 *
 * A route with RTF_EXPIRES is judged by its own timer.  Otherwise, when the
 * route still points at the fib6_info it came from, it counts as expired if
 * dst.obsolete is no longer DST_OBSOLETE_FORCE_CHK or if that fib6_info has
 * expired.
 *
 * NOTE(review): uses rcu_dereference(), so presumably the caller holds
 * rcu_read_lock() - confirm against the callers.
 */
static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}

/*
 * fib6_select_path - choose one path of a multipath route for a flow
 * @net: namespace, handed to rt6_multipath_hash()
 * @res: lookup result; res->f6i is the entry to select from and, on return,
 *	 res->f6i / res->nh describe the selected path
 * @fl6: flow; fl6->mp_hash is computed here when it is still zero
 * @oif: output interface, handed to rt6_score_route() for the candidate
 * @have_oif_match: the caller already matched on the output interface
 * @skb: optional packet; flagged IP6SKB_MULTIPATH once the early exits
 *	 have been passed
 * @strict: lookup flags, handed to rt6_score_route() for the candidate
 *
 * Routes using a nexthop object delegate the choice to
 * nexthop_path_fib6_result().  For sibling-based multipath the first
 * sibling whose upper bound is not below the hash is the candidate; it is
 * used only if rt6_score_route() does not reject it, otherwise the original
 * entry is kept.
 */
void fib6_select_path(const struct net *net, struct fib6_result *res,
		      struct flowi6 *fl6, int oif, bool have_oif_match,
		      const struct sk_buff *skb, int strict)
{
	struct fib6_info *sibling, *next_sibling;
	struct fib6_info *match = res->f6i;

	/* no nexthop object, and either no siblings or already matched */
	if (!match->nh && (!match->fib6_nsiblings || have_oif_match))
		goto out;

	/* nexthop object whose fib6_nh was already picked by the caller */
	if (match->nh && have_oif_match && res->nh)
		return;

	if (skb)
		IP6CB(skb)->flags |= IP6SKB_MULTIPATH;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash &&
	    (!match->nh || nexthop_is_multipath(match->nh)))
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (unlikely(match->nh)) {
		nexthop_path_fib6_result(res, fl6->mp_hash);
		return;
	}

	/* the hash falls into the first entry's range: keep it */
	if (fl6->mp_hash <= atomic_read(&match->fib6_nh->fib_nh_upper_bound))
		goto out;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		const struct fib6_nh *nh = sibling->fib6_nh;
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		/* candidate found; stop here whether or not it is usable */
		if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

out:
	res->f6i = match;
	res->nh = match->fib6_nh;
}

/*
 *	Route lookup. rcu_read_lock() should be held.
465 */ 466 467 static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh, 468 const struct in6_addr *saddr, int oif, int flags) 469 { 470 const struct net_device *dev; 471 472 if (nh->fib_nh_flags & RTNH_F_DEAD) 473 return false; 474 475 dev = nh->fib_nh_dev; 476 if (oif) { 477 if (dev->ifindex == oif) 478 return true; 479 } else { 480 if (ipv6_chk_addr(net, saddr, dev, 481 flags & RT6_LOOKUP_F_IFACE)) 482 return true; 483 } 484 485 return false; 486 } 487 488 struct fib6_nh_dm_arg { 489 struct net *net; 490 const struct in6_addr *saddr; 491 int oif; 492 int flags; 493 struct fib6_nh *nh; 494 }; 495 496 static int __rt6_nh_dev_match(struct fib6_nh *nh, void *_arg) 497 { 498 struct fib6_nh_dm_arg *arg = _arg; 499 500 arg->nh = nh; 501 return __rt6_device_match(arg->net, nh, arg->saddr, arg->oif, 502 arg->flags); 503 } 504 505 /* returns fib6_nh from nexthop or NULL */ 506 static struct fib6_nh *rt6_nh_dev_match(struct net *net, struct nexthop *nh, 507 struct fib6_result *res, 508 const struct in6_addr *saddr, 509 int oif, int flags) 510 { 511 struct fib6_nh_dm_arg arg = { 512 .net = net, 513 .saddr = saddr, 514 .oif = oif, 515 .flags = flags, 516 }; 517 518 if (nexthop_is_blackhole(nh)) 519 return NULL; 520 521 if (nexthop_for_each_fib6_nh(nh, __rt6_nh_dev_match, &arg)) 522 return arg.nh; 523 524 return NULL; 525 } 526 527 static void rt6_device_match(struct net *net, struct fib6_result *res, 528 const struct in6_addr *saddr, int oif, int flags) 529 { 530 struct fib6_info *f6i = res->f6i; 531 struct fib6_info *spf6i; 532 struct fib6_nh *nh; 533 534 if (!oif && ipv6_addr_any(saddr)) { 535 if (unlikely(f6i->nh)) { 536 nh = nexthop_fib6_nh(f6i->nh); 537 if (nexthop_is_blackhole(f6i->nh)) 538 goto out_blackhole; 539 } else { 540 nh = f6i->fib6_nh; 541 } 542 if (!(nh->fib_nh_flags & RTNH_F_DEAD)) 543 goto out; 544 } 545 546 for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) { 547 bool matched = false; 548 549 if (unlikely(spf6i->nh)) { 550 
nh = rt6_nh_dev_match(net, spf6i->nh, res, saddr, 551 oif, flags); 552 if (nh) 553 matched = true; 554 } else { 555 nh = spf6i->fib6_nh; 556 if (__rt6_device_match(net, nh, saddr, oif, flags)) 557 matched = true; 558 } 559 if (matched) { 560 res->f6i = spf6i; 561 goto out; 562 } 563 } 564 565 if (oif && flags & RT6_LOOKUP_F_IFACE) { 566 res->f6i = net->ipv6.fib6_null_entry; 567 nh = res->f6i->fib6_nh; 568 goto out; 569 } 570 571 if (unlikely(f6i->nh)) { 572 nh = nexthop_fib6_nh(f6i->nh); 573 if (nexthop_is_blackhole(f6i->nh)) 574 goto out_blackhole; 575 } else { 576 nh = f6i->fib6_nh; 577 } 578 579 if (nh->fib_nh_flags & RTNH_F_DEAD) { 580 res->f6i = net->ipv6.fib6_null_entry; 581 nh = res->f6i->fib6_nh; 582 } 583 out: 584 res->nh = nh; 585 res->fib6_type = res->f6i->fib6_type; 586 res->fib6_flags = res->f6i->fib6_flags; 587 return; 588 589 out_blackhole: 590 res->fib6_flags |= RTF_REJECT; 591 res->fib6_type = RTN_BLACKHOLE; 592 res->nh = nh; 593 } 594 595 #ifdef CONFIG_IPV6_ROUTER_PREF 596 struct __rt6_probe_work { 597 struct work_struct work; 598 struct in6_addr target; 599 struct net_device *dev; 600 netdevice_tracker dev_tracker; 601 }; 602 603 static void rt6_probe_deferred(struct work_struct *w) 604 { 605 struct in6_addr mcaddr; 606 struct __rt6_probe_work *work = 607 container_of(w, struct __rt6_probe_work, work); 608 609 addrconf_addr_solict_mult(&work->target, &mcaddr); 610 ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0); 611 netdev_put(work->dev, &work->dev_tracker); 612 kfree(work); 613 } 614 615 static void rt6_probe(struct fib6_nh *fib6_nh) 616 { 617 struct __rt6_probe_work *work = NULL; 618 const struct in6_addr *nh_gw; 619 unsigned long last_probe; 620 struct neighbour *neigh; 621 struct net_device *dev; 622 struct inet6_dev *idev; 623 624 /* 625 * Okay, this does not seem to be appropriate 626 * for now, however, we need to check if it 627 * is really so; aka Router Reachability Probing. 
628 * 629 * Router Reachability Probe MUST be rate-limited 630 * to no more than one per minute. 631 */ 632 if (!fib6_nh->fib_nh_gw_family) 633 return; 634 635 nh_gw = &fib6_nh->fib_nh_gw6; 636 dev = fib6_nh->fib_nh_dev; 637 rcu_read_lock(); 638 last_probe = READ_ONCE(fib6_nh->last_probe); 639 idev = __in6_dev_get(dev); 640 neigh = __ipv6_neigh_lookup_noref(dev, nh_gw); 641 if (neigh) { 642 if (READ_ONCE(neigh->nud_state) & NUD_VALID) 643 goto out; 644 645 write_lock_bh(&neigh->lock); 646 if (!(neigh->nud_state & NUD_VALID) && 647 time_after(jiffies, 648 neigh->updated + 649 READ_ONCE(idev->cnf.rtr_probe_interval))) { 650 work = kmalloc(sizeof(*work), GFP_ATOMIC); 651 if (work) 652 __neigh_set_probe_once(neigh); 653 } 654 write_unlock_bh(&neigh->lock); 655 } else if (time_after(jiffies, last_probe + 656 READ_ONCE(idev->cnf.rtr_probe_interval))) { 657 work = kmalloc(sizeof(*work), GFP_ATOMIC); 658 } 659 660 if (!work || cmpxchg(&fib6_nh->last_probe, 661 last_probe, jiffies) != last_probe) { 662 kfree(work); 663 } else { 664 INIT_WORK(&work->work, rt6_probe_deferred); 665 work->target = *nh_gw; 666 netdev_hold(dev, &work->dev_tracker, GFP_ATOMIC); 667 work->dev = dev; 668 schedule_work(&work->work); 669 } 670 671 out: 672 rcu_read_unlock(); 673 } 674 #else 675 static inline void rt6_probe(struct fib6_nh *fib6_nh) 676 { 677 } 678 #endif 679 680 /* 681 * Default Router Selection (RFC 2461 6.3.6) 682 */ 683 static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh) 684 { 685 enum rt6_nud_state ret = RT6_NUD_FAIL_HARD; 686 struct neighbour *neigh; 687 688 rcu_read_lock(); 689 neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev, 690 &fib6_nh->fib_nh_gw6); 691 if (neigh) { 692 u8 nud_state = READ_ONCE(neigh->nud_state); 693 694 if (nud_state & NUD_VALID) 695 ret = RT6_NUD_SUCCEED; 696 #ifdef CONFIG_IPV6_ROUTER_PREF 697 else if (!(nud_state & NUD_FAILED)) 698 ret = RT6_NUD_SUCCEED; 699 else 700 ret = RT6_NUD_FAIL_PROBE; 701 #endif 702 } else { 703 ret = 
IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ? 704 RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR; 705 } 706 rcu_read_unlock(); 707 708 return ret; 709 } 710 711 static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif, 712 int strict) 713 { 714 int m = 0; 715 716 if (!oif || nh->fib_nh_dev->ifindex == oif) 717 m = 2; 718 719 if (!m && (strict & RT6_LOOKUP_F_IFACE)) 720 return RT6_NUD_FAIL_HARD; 721 #ifdef CONFIG_IPV6_ROUTER_PREF 722 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2; 723 #endif 724 if ((strict & RT6_LOOKUP_F_REACHABLE) && 725 !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) { 726 int n = rt6_check_neigh(nh); 727 if (n < 0) 728 return n; 729 } 730 return m; 731 } 732 733 static bool find_match(struct fib6_nh *nh, u32 fib6_flags, 734 int oif, int strict, int *mpri, bool *do_rr) 735 { 736 bool match_do_rr = false; 737 bool rc = false; 738 int m; 739 740 if (nh->fib_nh_flags & RTNH_F_DEAD) 741 goto out; 742 743 if (ip6_ignore_linkdown(nh->fib_nh_dev) && 744 nh->fib_nh_flags & RTNH_F_LINKDOWN && 745 !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE)) 746 goto out; 747 748 m = rt6_score_route(nh, fib6_flags, oif, strict); 749 if (m == RT6_NUD_FAIL_DO_RR) { 750 match_do_rr = true; 751 m = 0; /* lowest valid score */ 752 } else if (m == RT6_NUD_FAIL_HARD) { 753 goto out; 754 } 755 756 if (strict & RT6_LOOKUP_F_REACHABLE) 757 rt6_probe(nh); 758 759 /* note that m can be RT6_NUD_FAIL_PROBE at this point */ 760 if (m > *mpri) { 761 *do_rr = match_do_rr; 762 *mpri = m; 763 rc = true; 764 } 765 out: 766 return rc; 767 } 768 769 struct fib6_nh_frl_arg { 770 u32 flags; 771 int oif; 772 int strict; 773 int *mpri; 774 bool *do_rr; 775 struct fib6_nh *nh; 776 }; 777 778 static int rt6_nh_find_match(struct fib6_nh *nh, void *_arg) 779 { 780 struct fib6_nh_frl_arg *arg = _arg; 781 782 arg->nh = nh; 783 return find_match(nh, arg->flags, arg->oif, arg->strict, 784 arg->mpri, arg->do_rr); 785 } 786 787 static void __find_rr_leaf(struct fib6_info *f6i_start, 788 
struct fib6_info *nomatch, u32 metric, 789 struct fib6_result *res, struct fib6_info **cont, 790 int oif, int strict, bool *do_rr, int *mpri) 791 { 792 struct fib6_info *f6i; 793 794 for (f6i = f6i_start; 795 f6i && f6i != nomatch; 796 f6i = rcu_dereference(f6i->fib6_next)) { 797 bool matched = false; 798 struct fib6_nh *nh; 799 800 if (cont && f6i->fib6_metric != metric) { 801 *cont = f6i; 802 return; 803 } 804 805 if (fib6_check_expired(f6i)) 806 continue; 807 808 if (unlikely(f6i->nh)) { 809 struct fib6_nh_frl_arg arg = { 810 .flags = f6i->fib6_flags, 811 .oif = oif, 812 .strict = strict, 813 .mpri = mpri, 814 .do_rr = do_rr 815 }; 816 817 if (nexthop_is_blackhole(f6i->nh)) { 818 res->fib6_flags = RTF_REJECT; 819 res->fib6_type = RTN_BLACKHOLE; 820 res->f6i = f6i; 821 res->nh = nexthop_fib6_nh(f6i->nh); 822 return; 823 } 824 if (nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_find_match, 825 &arg)) { 826 matched = true; 827 nh = arg.nh; 828 } 829 } else { 830 nh = f6i->fib6_nh; 831 if (find_match(nh, f6i->fib6_flags, oif, strict, 832 mpri, do_rr)) 833 matched = true; 834 } 835 if (matched) { 836 res->f6i = f6i; 837 res->nh = nh; 838 res->fib6_flags = f6i->fib6_flags; 839 res->fib6_type = f6i->fib6_type; 840 } 841 } 842 } 843 844 static void find_rr_leaf(struct fib6_node *fn, struct fib6_info *leaf, 845 struct fib6_info *rr_head, int oif, int strict, 846 bool *do_rr, struct fib6_result *res) 847 { 848 u32 metric = rr_head->fib6_metric; 849 struct fib6_info *cont = NULL; 850 int mpri = -1; 851 852 __find_rr_leaf(rr_head, NULL, metric, res, &cont, 853 oif, strict, do_rr, &mpri); 854 855 __find_rr_leaf(leaf, rr_head, metric, res, &cont, 856 oif, strict, do_rr, &mpri); 857 858 if (res->f6i || !cont) 859 return; 860 861 __find_rr_leaf(cont, NULL, metric, res, NULL, 862 oif, strict, do_rr, &mpri); 863 } 864 865 static void rt6_select(struct net *net, struct fib6_node *fn, int oif, 866 struct fib6_result *res, int strict) 867 { 868 struct fib6_info *leaf = 
rcu_dereference(fn->leaf); 869 struct fib6_info *rt0; 870 bool do_rr = false; 871 int key_plen; 872 873 /* make sure this function or its helpers sets f6i */ 874 res->f6i = NULL; 875 876 if (!leaf || leaf == net->ipv6.fib6_null_entry) 877 goto out; 878 879 rt0 = rcu_dereference(fn->rr_ptr); 880 if (!rt0) 881 rt0 = leaf; 882 883 /* Double check to make sure fn is not an intermediate node 884 * and fn->leaf does not points to its child's leaf 885 * (This might happen if all routes under fn are deleted from 886 * the tree and fib6_repair_tree() is called on the node.) 887 */ 888 key_plen = rt0->fib6_dst.plen; 889 #ifdef CONFIG_IPV6_SUBTREES 890 if (rt0->fib6_src.plen) 891 key_plen = rt0->fib6_src.plen; 892 #endif 893 if (fn->fn_bit != key_plen) 894 goto out; 895 896 find_rr_leaf(fn, leaf, rt0, oif, strict, &do_rr, res); 897 if (do_rr) { 898 struct fib6_info *next = rcu_dereference(rt0->fib6_next); 899 900 /* no entries matched; do round-robin */ 901 if (!next || next->fib6_metric != rt0->fib6_metric) 902 next = leaf; 903 904 if (next != rt0) { 905 spin_lock_bh(&leaf->fib6_table->tb6_lock); 906 /* make sure next is not being deleted from the tree */ 907 if (next->fib6_node) 908 rcu_assign_pointer(fn->rr_ptr, next); 909 spin_unlock_bh(&leaf->fib6_table->tb6_lock); 910 } 911 } 912 913 out: 914 if (!res->f6i) { 915 res->f6i = net->ipv6.fib6_null_entry; 916 res->nh = res->f6i->fib6_nh; 917 res->fib6_flags = res->f6i->fib6_flags; 918 res->fib6_type = res->f6i->fib6_type; 919 } 920 } 921 922 static bool rt6_is_gw_or_nonexthop(const struct fib6_result *res) 923 { 924 return (res->f6i->fib6_flags & RTF_NONEXTHOP) || 925 res->nh->fib_nh_gw_family; 926 } 927 928 #ifdef CONFIG_IPV6_ROUTE_INFO 929 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, 930 const struct in6_addr *gwaddr) 931 { 932 struct net *net = dev_net(dev); 933 struct route_info *rinfo = (struct route_info *) opt; 934 struct in6_addr prefix_buf, *prefix; 935 struct fib6_table *table; 936 unsigned int pref; 
937 unsigned long lifetime; 938 struct fib6_info *rt; 939 940 if (len < sizeof(struct route_info)) { 941 return -EINVAL; 942 } 943 944 /* Sanity check for prefix_len and length */ 945 if (rinfo->length > 3) { 946 return -EINVAL; 947 } else if (rinfo->prefix_len > 128) { 948 return -EINVAL; 949 } else if (rinfo->prefix_len > 64) { 950 if (rinfo->length < 2) { 951 return -EINVAL; 952 } 953 } else if (rinfo->prefix_len > 0) { 954 if (rinfo->length < 1) { 955 return -EINVAL; 956 } 957 } 958 959 pref = rinfo->route_pref; 960 if (pref == ICMPV6_ROUTER_PREF_INVALID) 961 return -EINVAL; 962 963 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ); 964 965 if (rinfo->length == 3) 966 prefix = (struct in6_addr *)rinfo->prefix; 967 else { 968 /* this function is safe */ 969 ipv6_addr_prefix(&prefix_buf, 970 (struct in6_addr *)rinfo->prefix, 971 rinfo->prefix_len); 972 prefix = &prefix_buf; 973 } 974 975 if (rinfo->prefix_len == 0) 976 rt = rt6_get_dflt_router(net, gwaddr, dev); 977 else 978 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, 979 gwaddr, dev); 980 981 if (rt && !lifetime) { 982 ip6_del_rt(net, rt, false); 983 rt = NULL; 984 } 985 986 if (!rt && lifetime) 987 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, 988 dev, pref); 989 else if (rt) 990 rt->fib6_flags = RTF_ROUTEINFO | 991 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); 992 993 if (rt) { 994 table = rt->fib6_table; 995 spin_lock_bh(&table->tb6_lock); 996 997 if (!addrconf_finite_timeout(lifetime)) { 998 fib6_clean_expires(rt); 999 fib6_remove_gc_list(rt); 1000 } else { 1001 fib6_set_expires(rt, jiffies + HZ * lifetime); 1002 fib6_add_gc_list(rt); 1003 } 1004 1005 spin_unlock_bh(&table->tb6_lock); 1006 1007 fib6_info_release(rt); 1008 } 1009 return 0; 1010 } 1011 #endif 1012 1013 /* 1014 * Misc support functions 1015 */ 1016 1017 /* called with rcu_lock held */ 1018 static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res) 1019 { 1020 struct net_device 
*dev = res->nh->fib_nh_dev; 1021 1022 if (res->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) { 1023 /* for copies of local routes, dst->dev needs to be the 1024 * device if it is a master device, the master device if 1025 * device is enslaved, and the loopback as the default 1026 */ 1027 if (netif_is_l3_slave(dev) && 1028 !rt6_need_strict(&res->f6i->fib6_dst.addr)) 1029 dev = l3mdev_master_dev_rcu(dev); 1030 else if (!netif_is_l3_master(dev)) 1031 dev = dev_net(dev)->loopback_dev; 1032 /* last case is netif_is_l3_master(dev) is true in which 1033 * case we want dev returned to be dev 1034 */ 1035 } 1036 1037 return dev; 1038 } 1039 1040 static const int fib6_prop[RTN_MAX + 1] = { 1041 [RTN_UNSPEC] = 0, 1042 [RTN_UNICAST] = 0, 1043 [RTN_LOCAL] = 0, 1044 [RTN_BROADCAST] = 0, 1045 [RTN_ANYCAST] = 0, 1046 [RTN_MULTICAST] = 0, 1047 [RTN_BLACKHOLE] = -EINVAL, 1048 [RTN_UNREACHABLE] = -EHOSTUNREACH, 1049 [RTN_PROHIBIT] = -EACCES, 1050 [RTN_THROW] = -EAGAIN, 1051 [RTN_NAT] = -EINVAL, 1052 [RTN_XRESOLVE] = -EINVAL, 1053 }; 1054 1055 static int ip6_rt_type_to_error(u8 fib6_type) 1056 { 1057 return fib6_prop[fib6_type]; 1058 } 1059 1060 static unsigned short fib6_info_dst_flags(struct fib6_info *rt) 1061 { 1062 unsigned short flags = 0; 1063 1064 if (rt->dst_nocount) 1065 flags |= DST_NOCOUNT; 1066 if (rt->dst_nopolicy) 1067 flags |= DST_NOPOLICY; 1068 1069 return flags; 1070 } 1071 1072 static void ip6_rt_init_dst_reject(struct rt6_info *rt, u8 fib6_type) 1073 { 1074 rt->dst.error = ip6_rt_type_to_error(fib6_type); 1075 1076 switch (fib6_type) { 1077 case RTN_BLACKHOLE: 1078 rt->dst.output = dst_discard_out; 1079 rt->dst.input = dst_discard; 1080 break; 1081 case RTN_PROHIBIT: 1082 rt->dst.output = ip6_pkt_prohibit_out; 1083 rt->dst.input = ip6_pkt_prohibit; 1084 break; 1085 case RTN_THROW: 1086 case RTN_UNREACHABLE: 1087 default: 1088 rt->dst.output = ip6_pkt_discard_out; 1089 rt->dst.input = ip6_pkt_discard; 1090 break; 1091 } 1092 } 1093 1094 static void ip6_rt_init_dst(struct 
rt6_info *rt, const struct fib6_result *res) 1095 { 1096 struct fib6_info *f6i = res->f6i; 1097 1098 if (res->fib6_flags & RTF_REJECT) { 1099 ip6_rt_init_dst_reject(rt, res->fib6_type); 1100 return; 1101 } 1102 1103 rt->dst.error = 0; 1104 rt->dst.output = ip6_output; 1105 1106 if (res->fib6_type == RTN_LOCAL || res->fib6_type == RTN_ANYCAST) { 1107 rt->dst.input = ip6_input; 1108 } else if (ipv6_addr_type(&f6i->fib6_dst.addr) & IPV6_ADDR_MULTICAST) { 1109 rt->dst.input = ip6_mc_input; 1110 } else { 1111 rt->dst.input = ip6_forward; 1112 } 1113 1114 if (res->nh->fib_nh_lws) { 1115 rt->dst.lwtstate = lwtstate_get(res->nh->fib_nh_lws); 1116 lwtunnel_set_redirect(&rt->dst); 1117 } 1118 1119 rt->dst.lastuse = jiffies; 1120 } 1121 1122 /* Caller must already hold reference to @from */ 1123 static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from) 1124 { 1125 rt->rt6i_flags &= ~RTF_EXPIRES; 1126 rcu_assign_pointer(rt->from, from); 1127 ip_dst_init_metrics(&rt->dst, from->fib6_metrics); 1128 } 1129 1130 /* Caller must already hold reference to f6i in result */ 1131 static void ip6_rt_copy_init(struct rt6_info *rt, const struct fib6_result *res) 1132 { 1133 const struct fib6_nh *nh = res->nh; 1134 const struct net_device *dev = nh->fib_nh_dev; 1135 struct fib6_info *f6i = res->f6i; 1136 1137 ip6_rt_init_dst(rt, res); 1138 1139 rt->rt6i_dst = f6i->fib6_dst; 1140 rt->rt6i_idev = dev ? 
in6_dev_get(dev) : NULL; 1141 rt->rt6i_flags = res->fib6_flags; 1142 if (nh->fib_nh_gw_family) { 1143 rt->rt6i_gateway = nh->fib_nh_gw6; 1144 rt->rt6i_flags |= RTF_GATEWAY; 1145 } 1146 rt6_set_from(rt, f6i); 1147 #ifdef CONFIG_IPV6_SUBTREES 1148 rt->rt6i_src = f6i->fib6_src; 1149 #endif 1150 } 1151 1152 static struct fib6_node* fib6_backtrack(struct fib6_node *fn, 1153 struct in6_addr *saddr) 1154 { 1155 struct fib6_node *pn, *sn; 1156 while (1) { 1157 if (fn->fn_flags & RTN_TL_ROOT) 1158 return NULL; 1159 pn = rcu_dereference(fn->parent); 1160 sn = FIB6_SUBTREE(pn); 1161 if (sn && sn != fn) 1162 fn = fib6_node_lookup(sn, NULL, saddr); 1163 else 1164 fn = pn; 1165 if (fn->fn_flags & RTN_RTINFO) 1166 return fn; 1167 } 1168 } 1169 1170 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt) 1171 { 1172 struct rt6_info *rt = *prt; 1173 1174 if (dst_hold_safe(&rt->dst)) 1175 return true; 1176 if (net) { 1177 rt = net->ipv6.ip6_null_entry; 1178 dst_hold(&rt->dst); 1179 } else { 1180 rt = NULL; 1181 } 1182 *prt = rt; 1183 return false; 1184 } 1185 1186 /* called with rcu_lock held */ 1187 static struct rt6_info *ip6_create_rt_rcu(const struct fib6_result *res) 1188 { 1189 struct net_device *dev = res->nh->fib_nh_dev; 1190 struct fib6_info *f6i = res->f6i; 1191 unsigned short flags; 1192 struct rt6_info *nrt; 1193 1194 if (!fib6_info_hold_safe(f6i)) 1195 goto fallback; 1196 1197 flags = fib6_info_dst_flags(f6i); 1198 nrt = ip6_dst_alloc(dev_net(dev), dev, flags); 1199 if (!nrt) { 1200 fib6_info_release(f6i); 1201 goto fallback; 1202 } 1203 1204 ip6_rt_copy_init(nrt, res); 1205 return nrt; 1206 1207 fallback: 1208 nrt = dev_net(dev)->ipv6.ip6_null_entry; 1209 dst_hold(&nrt->dst); 1210 return nrt; 1211 } 1212 1213 INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_lookup(struct net *net, 1214 struct fib6_table *table, 1215 struct flowi6 *fl6, 1216 const struct sk_buff *skb, 1217 int flags) 1218 { 1219 struct fib6_result res = {}; 1220 struct fib6_node *fn; 
1221 struct rt6_info *rt; 1222 1223 rcu_read_lock(); 1224 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 1225 restart: 1226 res.f6i = rcu_dereference(fn->leaf); 1227 if (!res.f6i) 1228 res.f6i = net->ipv6.fib6_null_entry; 1229 else 1230 rt6_device_match(net, &res, &fl6->saddr, fl6->flowi6_oif, 1231 flags); 1232 1233 if (res.f6i == net->ipv6.fib6_null_entry) { 1234 fn = fib6_backtrack(fn, &fl6->saddr); 1235 if (fn) 1236 goto restart; 1237 1238 rt = net->ipv6.ip6_null_entry; 1239 dst_hold(&rt->dst); 1240 goto out; 1241 } else if (res.fib6_flags & RTF_REJECT) { 1242 goto do_create; 1243 } 1244 1245 fib6_select_path(net, &res, fl6, fl6->flowi6_oif, 1246 fl6->flowi6_oif != 0, skb, flags); 1247 1248 /* Search through exception table */ 1249 rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr); 1250 if (rt) { 1251 if (ip6_hold_safe(net, &rt)) 1252 dst_use_noref(&rt->dst, jiffies); 1253 } else { 1254 do_create: 1255 rt = ip6_create_rt_rcu(&res); 1256 } 1257 1258 out: 1259 trace_fib6_table_lookup(net, &res, table, fl6); 1260 1261 rcu_read_unlock(); 1262 1263 return rt; 1264 } 1265 1266 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6, 1267 const struct sk_buff *skb, int flags) 1268 { 1269 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup); 1270 } 1271 EXPORT_SYMBOL_GPL(ip6_route_lookup); 1272 1273 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, 1274 const struct in6_addr *saddr, int oif, 1275 const struct sk_buff *skb, int strict) 1276 { 1277 struct flowi6 fl6 = { 1278 .flowi6_oif = oif, 1279 .daddr = *daddr, 1280 }; 1281 struct dst_entry *dst; 1282 int flags = strict ? 
RT6_LOOKUP_F_IFACE : 0; 1283 1284 if (saddr) { 1285 memcpy(&fl6.saddr, saddr, sizeof(*saddr)); 1286 flags |= RT6_LOOKUP_F_HAS_SADDR; 1287 } 1288 1289 dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup); 1290 if (dst->error == 0) 1291 return (struct rt6_info *) dst; 1292 1293 dst_release(dst); 1294 1295 return NULL; 1296 } 1297 EXPORT_SYMBOL(rt6_lookup); 1298 1299 /* ip6_ins_rt is called with FREE table->tb6_lock. 1300 * It takes new route entry, the addition fails by any reason the 1301 * route is released. 1302 * Caller must hold dst before calling it. 1303 */ 1304 1305 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info, 1306 struct netlink_ext_ack *extack) 1307 { 1308 int err; 1309 struct fib6_table *table; 1310 1311 table = rt->fib6_table; 1312 spin_lock_bh(&table->tb6_lock); 1313 err = fib6_add(&table->tb6_root, rt, info, extack); 1314 spin_unlock_bh(&table->tb6_lock); 1315 1316 return err; 1317 } 1318 1319 int ip6_ins_rt(struct net *net, struct fib6_info *rt) 1320 { 1321 struct nl_info info = { .nl_net = net, }; 1322 1323 return __ip6_ins_rt(rt, &info, NULL); 1324 } 1325 1326 static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res, 1327 const struct in6_addr *daddr, 1328 const struct in6_addr *saddr) 1329 { 1330 struct fib6_info *f6i = res->f6i; 1331 struct net_device *dev; 1332 struct rt6_info *rt; 1333 1334 /* 1335 * Clone the route. 
1336 */ 1337 1338 if (!fib6_info_hold_safe(f6i)) 1339 return NULL; 1340 1341 dev = ip6_rt_get_dev_rcu(res); 1342 rt = ip6_dst_alloc(dev_net(dev), dev, 0); 1343 if (!rt) { 1344 fib6_info_release(f6i); 1345 return NULL; 1346 } 1347 1348 ip6_rt_copy_init(rt, res); 1349 rt->rt6i_flags |= RTF_CACHE; 1350 rt->rt6i_dst.addr = *daddr; 1351 rt->rt6i_dst.plen = 128; 1352 1353 if (!rt6_is_gw_or_nonexthop(res)) { 1354 if (f6i->fib6_dst.plen != 128 && 1355 ipv6_addr_equal(&f6i->fib6_dst.addr, daddr)) 1356 rt->rt6i_flags |= RTF_ANYCAST; 1357 #ifdef CONFIG_IPV6_SUBTREES 1358 if (rt->rt6i_src.plen && saddr) { 1359 rt->rt6i_src.addr = *saddr; 1360 rt->rt6i_src.plen = 128; 1361 } 1362 #endif 1363 } 1364 1365 return rt; 1366 } 1367 1368 static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res) 1369 { 1370 struct fib6_info *f6i = res->f6i; 1371 unsigned short flags = fib6_info_dst_flags(f6i); 1372 struct net_device *dev; 1373 struct rt6_info *pcpu_rt; 1374 1375 if (!fib6_info_hold_safe(f6i)) 1376 return NULL; 1377 1378 rcu_read_lock(); 1379 dev = ip6_rt_get_dev_rcu(res); 1380 pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags | DST_NOCOUNT); 1381 rcu_read_unlock(); 1382 if (!pcpu_rt) { 1383 fib6_info_release(f6i); 1384 return NULL; 1385 } 1386 ip6_rt_copy_init(pcpu_rt, res); 1387 pcpu_rt->rt6i_flags |= RTF_PCPU; 1388 1389 if (f6i->nh) 1390 pcpu_rt->sernum = rt_genid_ipv6(dev_net(dev)); 1391 1392 return pcpu_rt; 1393 } 1394 1395 static bool rt6_is_valid(const struct rt6_info *rt6) 1396 { 1397 return rt6->sernum == rt_genid_ipv6(dev_net(rt6->dst.dev)); 1398 } 1399 1400 /* It should be called with rcu_read_lock() acquired */ 1401 static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res) 1402 { 1403 struct rt6_info *pcpu_rt; 1404 1405 pcpu_rt = this_cpu_read(*res->nh->rt6i_pcpu); 1406 1407 if (pcpu_rt && pcpu_rt->sernum && !rt6_is_valid(pcpu_rt)) { 1408 struct rt6_info *prev, **p; 1409 1410 p = this_cpu_ptr(res->nh->rt6i_pcpu); 1411 prev = xchg(p, NULL); 1412 
if (prev) { 1413 dst_dev_put(&prev->dst); 1414 dst_release(&prev->dst); 1415 } 1416 1417 pcpu_rt = NULL; 1418 } 1419 1420 return pcpu_rt; 1421 } 1422 1423 static struct rt6_info *rt6_make_pcpu_route(struct net *net, 1424 const struct fib6_result *res) 1425 { 1426 struct rt6_info *pcpu_rt, *prev, **p; 1427 1428 pcpu_rt = ip6_rt_pcpu_alloc(res); 1429 if (!pcpu_rt) 1430 return NULL; 1431 1432 p = this_cpu_ptr(res->nh->rt6i_pcpu); 1433 prev = cmpxchg(p, NULL, pcpu_rt); 1434 BUG_ON(prev); 1435 1436 if (res->f6i->fib6_destroying) { 1437 struct fib6_info *from; 1438 1439 from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL); 1440 fib6_info_release(from); 1441 } 1442 1443 return pcpu_rt; 1444 } 1445 1446 /* exception hash table implementation 1447 */ 1448 static DEFINE_SPINLOCK(rt6_exception_lock); 1449 1450 /* Remove rt6_ex from hash table and free the memory 1451 * Caller must hold rt6_exception_lock 1452 */ 1453 static void rt6_remove_exception(struct rt6_exception_bucket *bucket, 1454 struct rt6_exception *rt6_ex) 1455 { 1456 struct fib6_info *from; 1457 struct net *net; 1458 1459 if (!bucket || !rt6_ex) 1460 return; 1461 1462 net = dev_net(rt6_ex->rt6i->dst.dev); 1463 net->ipv6.rt6_stats->fib_rt_cache--; 1464 1465 /* purge completely the exception to allow releasing the held resources: 1466 * some [sk] cache may keep the dst around for unlimited time 1467 */ 1468 from = xchg((__force struct fib6_info **)&rt6_ex->rt6i->from, NULL); 1469 fib6_info_release(from); 1470 dst_dev_put(&rt6_ex->rt6i->dst); 1471 1472 hlist_del_rcu(&rt6_ex->hlist); 1473 dst_release(&rt6_ex->rt6i->dst); 1474 kfree_rcu(rt6_ex, rcu); 1475 WARN_ON_ONCE(!bucket->depth); 1476 bucket->depth--; 1477 } 1478 1479 /* Remove oldest rt6_ex in bucket and free the memory 1480 * Caller must hold rt6_exception_lock 1481 */ 1482 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket) 1483 { 1484 struct rt6_exception *rt6_ex, *oldest = NULL; 1485 1486 if (!bucket) 1487 return; 1488 
1489 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { 1490 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp)) 1491 oldest = rt6_ex; 1492 } 1493 rt6_remove_exception(bucket, oldest); 1494 } 1495 1496 static u32 rt6_exception_hash(const struct in6_addr *dst, 1497 const struct in6_addr *src) 1498 { 1499 static siphash_aligned_key_t rt6_exception_key; 1500 struct { 1501 struct in6_addr dst; 1502 struct in6_addr src; 1503 } __aligned(SIPHASH_ALIGNMENT) combined = { 1504 .dst = *dst, 1505 }; 1506 u64 val; 1507 1508 net_get_random_once(&rt6_exception_key, sizeof(rt6_exception_key)); 1509 1510 #ifdef CONFIG_IPV6_SUBTREES 1511 if (src) 1512 combined.src = *src; 1513 #endif 1514 val = siphash(&combined, sizeof(combined), &rt6_exception_key); 1515 1516 return hash_64(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT); 1517 } 1518 1519 /* Helper function to find the cached rt in the hash table 1520 * and update bucket pointer to point to the bucket for this 1521 * (daddr, saddr) pair 1522 * Caller must hold rt6_exception_lock 1523 */ 1524 static struct rt6_exception * 1525 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket, 1526 const struct in6_addr *daddr, 1527 const struct in6_addr *saddr) 1528 { 1529 struct rt6_exception *rt6_ex; 1530 u32 hval; 1531 1532 if (!(*bucket) || !daddr) 1533 return NULL; 1534 1535 hval = rt6_exception_hash(daddr, saddr); 1536 *bucket += hval; 1537 1538 hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) { 1539 struct rt6_info *rt6 = rt6_ex->rt6i; 1540 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr); 1541 1542 #ifdef CONFIG_IPV6_SUBTREES 1543 if (matched && saddr) 1544 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr); 1545 #endif 1546 if (matched) 1547 return rt6_ex; 1548 } 1549 return NULL; 1550 } 1551 1552 /* Helper function to find the cached rt in the hash table 1553 * and update bucket pointer to point to the bucket for this 1554 * (daddr, saddr) pair 1555 * Caller must hold rcu_read_lock() 1556 */ 1557 
static struct rt6_exception * 1558 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket, 1559 const struct in6_addr *daddr, 1560 const struct in6_addr *saddr) 1561 { 1562 struct rt6_exception *rt6_ex; 1563 u32 hval; 1564 1565 WARN_ON_ONCE(!rcu_read_lock_held()); 1566 1567 if (!(*bucket) || !daddr) 1568 return NULL; 1569 1570 hval = rt6_exception_hash(daddr, saddr); 1571 *bucket += hval; 1572 1573 hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) { 1574 struct rt6_info *rt6 = rt6_ex->rt6i; 1575 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr); 1576 1577 #ifdef CONFIG_IPV6_SUBTREES 1578 if (matched && saddr) 1579 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr); 1580 #endif 1581 if (matched) 1582 return rt6_ex; 1583 } 1584 return NULL; 1585 } 1586 1587 static unsigned int fib6_mtu(const struct fib6_result *res) 1588 { 1589 const struct fib6_nh *nh = res->nh; 1590 unsigned int mtu; 1591 1592 if (res->f6i->fib6_pmtu) { 1593 mtu = res->f6i->fib6_pmtu; 1594 } else { 1595 struct net_device *dev = nh->fib_nh_dev; 1596 struct inet6_dev *idev; 1597 1598 rcu_read_lock(); 1599 idev = __in6_dev_get(dev); 1600 mtu = READ_ONCE(idev->cnf.mtu6); 1601 rcu_read_unlock(); 1602 } 1603 1604 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 1605 1606 return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu); 1607 } 1608 1609 #define FIB6_EXCEPTION_BUCKET_FLUSHED 0x1UL 1610 1611 /* used when the flushed bit is not relevant, only access to the bucket 1612 * (ie., all bucket users except rt6_insert_exception); 1613 * 1614 * called under rcu lock; sometimes called with rt6_exception_lock held 1615 */ 1616 static 1617 struct rt6_exception_bucket *fib6_nh_get_excptn_bucket(const struct fib6_nh *nh, 1618 spinlock_t *lock) 1619 { 1620 struct rt6_exception_bucket *bucket; 1621 1622 if (lock) 1623 bucket = rcu_dereference_protected(nh->rt6i_exception_bucket, 1624 lockdep_is_held(lock)); 1625 else 1626 bucket = rcu_dereference(nh->rt6i_exception_bucket); 1627 1628 /* remove 
/* Insert the cached route @nrt into the exception table of res->nh.
 *
 * The bucket array is allocated on first use.  An existing entry for the
 * same key is replaced, and the oldest entries of the target bucket are
 * evicted while its depth exceeds a randomized limit.
 *
 * Returns 0 on success, -ENOMEM when an allocation fails, or -EINVAL when
 * the table has been marked flushed or when @nrt's raw MTU metric is not
 * below fib6_mtu(@res).  On success the sernum of res->f6i is updated under
 * its table lock and the fib6 garbage collector is started.
 */
static int rt6_insert_exception(struct rt6_info *nrt,
				const struct fib6_result *res)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct fib6_info *f6i = res->f6i;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_nh *nh = res->nh;
	int max_depth;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	/* Read the raw pointer (not via fib6_nh_get_excptn_bucket()) so that
	 * the flushed bit is still visible for the check below.
	 */
	bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
					  lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(nh->rt6i_exception_bucket, bucket);
	} else if (fib6_nh_excptn_bucket_flushed(bucket)) {
		/* Table was flushed for good: refuse new exceptions */
		err = -EINVAL;
		goto out;
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* fib6_src.plen != 0 indicates f6i is in subtree
	 * and exception table is indexed by a hash of
	 * both fib6_dst and fib6_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only fib6_dst.
	 */
	if (f6i->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif
	/* rt6_mtu_change() might lower mtu on f6i.
	 * Only insert this exception route if its mtu
	 * is less than f6i's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(res)) {
		err = -EINVAL;
		goto out;
	}

	/* The lookup advances bucket to the hash slot for this key, so the
	 * insertion below lands in that slot.  An existing entry for the
	 * same key is dropped first.
	 */
	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	/* Randomize max depth to avoid some side channels attacks. */
	max_depth = FIB6_MAX_DEPTH + get_random_u32_below(FIB6_MAX_DEPTH);
	while (bucket->depth > max_depth)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&f6i->fib6_table->tb6_lock);
		fib6_update_sernum(net, f6i);
		spin_unlock_bh(&f6i->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}
/* Find the cached (exception) route for @daddr/@saddr in the exception
 * table of res->nh.
 *
 * Returns the cached route, or NULL when there is no entry or the entry
 * found has expired.  No reference is taken on the returned route here.
 *
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	const struct in6_addr *src_key = NULL;
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct rt6_info *ret = NULL;

#ifdef CONFIG_IPV6_SUBTREES
	/* fib6_src.plen != 0 indicates f6i is in subtree
	 * and exception table is indexed by a hash of
	 * both fib6_dst and fib6_src.
	 * However, the src addr used to create the hash
	 * might not be exactly the passed in saddr which
	 * is a /128 addr from the flow.
	 * So we need to use f6i->fib6_src to redo lookup
	 * if the passed in saddr does not find anything.
	 * (See the logic in ip6_rt_cache_alloc() on how
	 * rt->rt6i_src is updated.)
	 */
	if (res->f6i->fib6_src.plen)
		src_key = saddr;
find_ex:
#endif
	/* The bucket pointer is re-read on every pass because the lookup
	 * helper advances it to the hashed slot.
	 */
	bucket = fib6_nh_get_excptn_bucket(res->nh, NULL);
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		ret = rt6_ex->rt6i;

#ifdef CONFIG_IPV6_SUBTREES
	/* Use fib6_src as src_key and redo lookup */
	if (!ret && src_key && src_key != &res->f6i->fib6_src.addr) {
		src_key = &res->f6i->fib6_src.addr;
		goto find_ex;
	}
#endif

	return ret;
}
1859 */ 1860 if (plen) 1861 src_key = &rt->rt6i_src.addr; 1862 #endif 1863 rt6_ex = __rt6_find_exception_spinlock(&bucket, 1864 &rt->rt6i_dst.addr, 1865 src_key); 1866 if (rt6_ex) { 1867 rt6_remove_exception(bucket, rt6_ex); 1868 err = 0; 1869 } else { 1870 err = -ENOENT; 1871 } 1872 1873 spin_unlock_bh(&rt6_exception_lock); 1874 return err; 1875 } 1876 1877 struct fib6_nh_excptn_arg { 1878 struct rt6_info *rt; 1879 int plen; 1880 }; 1881 1882 static int rt6_nh_remove_exception_rt(struct fib6_nh *nh, void *_arg) 1883 { 1884 struct fib6_nh_excptn_arg *arg = _arg; 1885 int err; 1886 1887 err = fib6_nh_remove_exception(nh, arg->plen, arg->rt); 1888 if (err == 0) 1889 return 1; 1890 1891 return 0; 1892 } 1893 1894 static int rt6_remove_exception_rt(struct rt6_info *rt) 1895 { 1896 struct fib6_info *from; 1897 1898 from = rcu_dereference(rt->from); 1899 if (!from || !(rt->rt6i_flags & RTF_CACHE)) 1900 return -EINVAL; 1901 1902 if (from->nh) { 1903 struct fib6_nh_excptn_arg arg = { 1904 .rt = rt, 1905 .plen = from->fib6_src.plen 1906 }; 1907 int rc; 1908 1909 /* rc = 1 means an entry was found */ 1910 rc = nexthop_for_each_fib6_nh(from->nh, 1911 rt6_nh_remove_exception_rt, 1912 &arg); 1913 return rc ? 0 : -ENOENT; 1914 } 1915 1916 return fib6_nh_remove_exception(from->fib6_nh, 1917 from->fib6_src.plen, rt); 1918 } 1919 1920 /* Find rt6_ex which contains the passed in rt cache and 1921 * refresh its stamp 1922 */ 1923 static void fib6_nh_update_exception(const struct fib6_nh *nh, int plen, 1924 const struct rt6_info *rt) 1925 { 1926 const struct in6_addr *src_key = NULL; 1927 struct rt6_exception_bucket *bucket; 1928 struct rt6_exception *rt6_ex; 1929 1930 bucket = fib6_nh_get_excptn_bucket(nh, NULL); 1931 #ifdef CONFIG_IPV6_SUBTREES 1932 /* rt6i_src.plen != 0 indicates 'from' is in subtree 1933 * and exception table is indexed by a hash of 1934 * both rt6i_dst and rt6i_src. 1935 * Otherwise, the exception table is indexed by 1936 * a hash of only rt6i_dst. 
1937 */ 1938 if (plen) 1939 src_key = &rt->rt6i_src.addr; 1940 #endif 1941 rt6_ex = __rt6_find_exception_rcu(&bucket, &rt->rt6i_dst.addr, src_key); 1942 if (rt6_ex) 1943 rt6_ex->stamp = jiffies; 1944 } 1945 1946 struct fib6_nh_match_arg { 1947 const struct net_device *dev; 1948 const struct in6_addr *gw; 1949 struct fib6_nh *match; 1950 }; 1951 1952 /* determine if fib6_nh has given device and gateway */ 1953 static int fib6_nh_find_match(struct fib6_nh *nh, void *_arg) 1954 { 1955 struct fib6_nh_match_arg *arg = _arg; 1956 1957 if (arg->dev != nh->fib_nh_dev || 1958 (arg->gw && !nh->fib_nh_gw_family) || 1959 (!arg->gw && nh->fib_nh_gw_family) || 1960 (arg->gw && !ipv6_addr_equal(arg->gw, &nh->fib_nh_gw6))) 1961 return 0; 1962 1963 arg->match = nh; 1964 1965 /* found a match, break the loop */ 1966 return 1; 1967 } 1968 1969 static void rt6_update_exception_stamp_rt(struct rt6_info *rt) 1970 { 1971 struct fib6_info *from; 1972 struct fib6_nh *fib6_nh; 1973 1974 rcu_read_lock(); 1975 1976 from = rcu_dereference(rt->from); 1977 if (!from || !(rt->rt6i_flags & RTF_CACHE)) 1978 goto unlock; 1979 1980 if (from->nh) { 1981 struct fib6_nh_match_arg arg = { 1982 .dev = rt->dst.dev, 1983 .gw = &rt->rt6i_gateway, 1984 }; 1985 1986 nexthop_for_each_fib6_nh(from->nh, fib6_nh_find_match, &arg); 1987 1988 if (!arg.match) 1989 goto unlock; 1990 fib6_nh = arg.match; 1991 } else { 1992 fib6_nh = from->fib6_nh; 1993 } 1994 fib6_nh_update_exception(fib6_nh, from->fib6_src.plen, rt); 1995 unlock: 1996 rcu_read_unlock(); 1997 } 1998 1999 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev, 2000 struct rt6_info *rt, int mtu) 2001 { 2002 /* If the new MTU is lower than the route PMTU, this new MTU will be the 2003 * lowest MTU in the path: always allow updating the route PMTU to 2004 * reflect PMTU decreases. 
2005 * 2006 * If the new MTU is higher, and the route PMTU is equal to the local 2007 * MTU, this means the old MTU is the lowest in the path, so allow 2008 * updating it: if other nodes now have lower MTUs, PMTU discovery will 2009 * handle this. 2010 */ 2011 2012 if (dst_mtu(&rt->dst) >= mtu) 2013 return true; 2014 2015 if (dst_mtu(&rt->dst) == idev->cnf.mtu6) 2016 return true; 2017 2018 return false; 2019 } 2020 2021 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev, 2022 const struct fib6_nh *nh, int mtu) 2023 { 2024 struct rt6_exception_bucket *bucket; 2025 struct rt6_exception *rt6_ex; 2026 int i; 2027 2028 bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); 2029 if (!bucket) 2030 return; 2031 2032 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { 2033 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { 2034 struct rt6_info *entry = rt6_ex->rt6i; 2035 2036 /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected 2037 * route), the metrics of its rt->from have already 2038 * been updated. 
2039 */ 2040 if (dst_metric_raw(&entry->dst, RTAX_MTU) && 2041 rt6_mtu_change_route_allowed(idev, entry, mtu)) 2042 dst_metric_set(&entry->dst, RTAX_MTU, mtu); 2043 } 2044 bucket++; 2045 } 2046 } 2047 2048 #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE) 2049 2050 static void fib6_nh_exceptions_clean_tohost(const struct fib6_nh *nh, 2051 const struct in6_addr *gateway) 2052 { 2053 struct rt6_exception_bucket *bucket; 2054 struct rt6_exception *rt6_ex; 2055 struct hlist_node *tmp; 2056 int i; 2057 2058 if (!rcu_access_pointer(nh->rt6i_exception_bucket)) 2059 return; 2060 2061 spin_lock_bh(&rt6_exception_lock); 2062 bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); 2063 if (bucket) { 2064 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { 2065 hlist_for_each_entry_safe(rt6_ex, tmp, 2066 &bucket->chain, hlist) { 2067 struct rt6_info *entry = rt6_ex->rt6i; 2068 2069 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) == 2070 RTF_CACHE_GATEWAY && 2071 ipv6_addr_equal(gateway, 2072 &entry->rt6i_gateway)) { 2073 rt6_remove_exception(bucket, rt6_ex); 2074 } 2075 } 2076 bucket++; 2077 } 2078 } 2079 2080 spin_unlock_bh(&rt6_exception_lock); 2081 } 2082 2083 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket, 2084 struct rt6_exception *rt6_ex, 2085 struct fib6_gc_args *gc_args, 2086 unsigned long now) 2087 { 2088 struct rt6_info *rt = rt6_ex->rt6i; 2089 2090 /* we are pruning and obsoleting aged-out and non gateway exceptions 2091 * even if others have still references to them, so that on next 2092 * dst_check() such references can be dropped. 2093 * EXPIRES exceptions - e.g. 
pmtu-generated ones are pruned when 2094 * expired, independently from their aging, as per RFC 8201 section 4 2095 */ 2096 if (!(rt->rt6i_flags & RTF_EXPIRES)) { 2097 if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) { 2098 pr_debug("aging clone %p\n", rt); 2099 rt6_remove_exception(bucket, rt6_ex); 2100 return; 2101 } 2102 } else if (time_after(jiffies, rt->dst.expires)) { 2103 pr_debug("purging expired route %p\n", rt); 2104 rt6_remove_exception(bucket, rt6_ex); 2105 return; 2106 } 2107 2108 if (rt->rt6i_flags & RTF_GATEWAY) { 2109 struct neighbour *neigh; 2110 2111 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway); 2112 2113 if (!(neigh && (neigh->flags & NTF_ROUTER))) { 2114 pr_debug("purging route %p via non-router but gateway\n", 2115 rt); 2116 rt6_remove_exception(bucket, rt6_ex); 2117 return; 2118 } 2119 } 2120 2121 gc_args->more++; 2122 } 2123 2124 static void fib6_nh_age_exceptions(const struct fib6_nh *nh, 2125 struct fib6_gc_args *gc_args, 2126 unsigned long now) 2127 { 2128 struct rt6_exception_bucket *bucket; 2129 struct rt6_exception *rt6_ex; 2130 struct hlist_node *tmp; 2131 int i; 2132 2133 if (!rcu_access_pointer(nh->rt6i_exception_bucket)) 2134 return; 2135 2136 rcu_read_lock_bh(); 2137 spin_lock(&rt6_exception_lock); 2138 bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); 2139 if (bucket) { 2140 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { 2141 hlist_for_each_entry_safe(rt6_ex, tmp, 2142 &bucket->chain, hlist) { 2143 rt6_age_examine_exception(bucket, rt6_ex, 2144 gc_args, now); 2145 } 2146 bucket++; 2147 } 2148 } 2149 spin_unlock(&rt6_exception_lock); 2150 rcu_read_unlock_bh(); 2151 } 2152 2153 struct fib6_nh_age_excptn_arg { 2154 struct fib6_gc_args *gc_args; 2155 unsigned long now; 2156 }; 2157 2158 static int rt6_nh_age_exceptions(struct fib6_nh *nh, void *_arg) 2159 { 2160 struct fib6_nh_age_excptn_arg *arg = _arg; 2161 2162 fib6_nh_age_exceptions(nh, arg->gc_args, arg->now); 2163 return 0; 2164 } 
2165 2166 void rt6_age_exceptions(struct fib6_info *f6i, 2167 struct fib6_gc_args *gc_args, 2168 unsigned long now) 2169 { 2170 if (f6i->nh) { 2171 struct fib6_nh_age_excptn_arg arg = { 2172 .gc_args = gc_args, 2173 .now = now 2174 }; 2175 2176 nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_age_exceptions, 2177 &arg); 2178 } else { 2179 fib6_nh_age_exceptions(f6i->fib6_nh, gc_args, now); 2180 } 2181 } 2182 2183 /* must be called with rcu lock held */ 2184 int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif, 2185 struct flowi6 *fl6, struct fib6_result *res, int strict) 2186 { 2187 struct fib6_node *fn, *saved_fn; 2188 2189 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 2190 saved_fn = fn; 2191 2192 redo_rt6_select: 2193 rt6_select(net, fn, oif, res, strict); 2194 if (res->f6i == net->ipv6.fib6_null_entry) { 2195 fn = fib6_backtrack(fn, &fl6->saddr); 2196 if (fn) 2197 goto redo_rt6_select; 2198 else if (strict & RT6_LOOKUP_F_REACHABLE) { 2199 /* also consider unreachable route */ 2200 strict &= ~RT6_LOOKUP_F_REACHABLE; 2201 fn = saved_fn; 2202 goto redo_rt6_select; 2203 } 2204 } 2205 2206 trace_fib6_table_lookup(net, res, table, fl6); 2207 2208 return 0; 2209 } 2210 2211 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, 2212 int oif, struct flowi6 *fl6, 2213 const struct sk_buff *skb, int flags) 2214 { 2215 struct fib6_result res = {}; 2216 struct rt6_info *rt = NULL; 2217 int strict = 0; 2218 2219 WARN_ON_ONCE((flags & RT6_LOOKUP_F_DST_NOREF) && 2220 !rcu_read_lock_held()); 2221 2222 strict |= flags & RT6_LOOKUP_F_IFACE; 2223 strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE; 2224 if (READ_ONCE(net->ipv6.devconf_all->forwarding) == 0) 2225 strict |= RT6_LOOKUP_F_REACHABLE; 2226 2227 rcu_read_lock(); 2228 2229 fib6_table_lookup(net, table, oif, fl6, &res, strict); 2230 if (res.f6i == net->ipv6.fib6_null_entry) 2231 goto out; 2232 2233 fib6_select_path(net, &res, fl6, oif, false, skb, strict); 2234 2235 /*Search 
through exception table */ 2236 rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr); 2237 if (rt) { 2238 goto out; 2239 } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && 2240 !res.nh->fib_nh_gw_family)) { 2241 /* Create a RTF_CACHE clone which will not be 2242 * owned by the fib6 tree. It is for the special case where 2243 * the daddr in the skb during the neighbor look-up is different 2244 * from the fl6->daddr used to look-up route here. 2245 */ 2246 rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL); 2247 2248 if (rt) { 2249 /* 1 refcnt is taken during ip6_rt_cache_alloc(). 2250 * As rt6_uncached_list_add() does not consume refcnt, 2251 * this refcnt is always returned to the caller even 2252 * if caller sets RT6_LOOKUP_F_DST_NOREF flag. 2253 */ 2254 rt6_uncached_list_add(rt); 2255 rcu_read_unlock(); 2256 2257 return rt; 2258 } 2259 } else { 2260 /* Get a percpu copy */ 2261 local_bh_disable(); 2262 rt = rt6_get_pcpu_route(&res); 2263 2264 if (!rt) 2265 rt = rt6_make_pcpu_route(net, &res); 2266 2267 local_bh_enable(); 2268 } 2269 out: 2270 if (!rt) 2271 rt = net->ipv6.ip6_null_entry; 2272 if (!(flags & RT6_LOOKUP_F_DST_NOREF)) 2273 ip6_hold_safe(net, &rt); 2274 rcu_read_unlock(); 2275 2276 return rt; 2277 } 2278 EXPORT_SYMBOL_GPL(ip6_pol_route); 2279 2280 INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_input(struct net *net, 2281 struct fib6_table *table, 2282 struct flowi6 *fl6, 2283 const struct sk_buff *skb, 2284 int flags) 2285 { 2286 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags); 2287 } 2288 2289 struct dst_entry *ip6_route_input_lookup(struct net *net, 2290 struct net_device *dev, 2291 struct flowi6 *fl6, 2292 const struct sk_buff *skb, 2293 int flags) 2294 { 2295 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG) 2296 flags |= RT6_LOOKUP_F_IFACE; 2297 2298 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input); 2299 } 2300 EXPORT_SYMBOL_GPL(ip6_route_input_lookup); 2301 2302 static void 
ip6_multipath_l3_keys(const struct sk_buff *skb, 2303 struct flow_keys *keys, 2304 struct flow_keys *flkeys) 2305 { 2306 const struct ipv6hdr *outer_iph = ipv6_hdr(skb); 2307 const struct ipv6hdr *key_iph = outer_iph; 2308 struct flow_keys *_flkeys = flkeys; 2309 const struct ipv6hdr *inner_iph; 2310 const struct icmp6hdr *icmph; 2311 struct ipv6hdr _inner_iph; 2312 struct icmp6hdr _icmph; 2313 2314 if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6)) 2315 goto out; 2316 2317 icmph = skb_header_pointer(skb, skb_transport_offset(skb), 2318 sizeof(_icmph), &_icmph); 2319 if (!icmph) 2320 goto out; 2321 2322 if (!icmpv6_is_err(icmph->icmp6_type)) 2323 goto out; 2324 2325 inner_iph = skb_header_pointer(skb, 2326 skb_transport_offset(skb) + sizeof(*icmph), 2327 sizeof(_inner_iph), &_inner_iph); 2328 if (!inner_iph) 2329 goto out; 2330 2331 key_iph = inner_iph; 2332 _flkeys = NULL; 2333 out: 2334 if (_flkeys) { 2335 keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src; 2336 keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst; 2337 keys->tags.flow_label = _flkeys->tags.flow_label; 2338 keys->basic.ip_proto = _flkeys->basic.ip_proto; 2339 } else { 2340 keys->addrs.v6addrs.src = key_iph->saddr; 2341 keys->addrs.v6addrs.dst = key_iph->daddr; 2342 keys->tags.flow_label = ip6_flowlabel(key_iph); 2343 keys->basic.ip_proto = key_iph->nexthdr; 2344 } 2345 } 2346 2347 static u32 rt6_multipath_custom_hash_outer(const struct net *net, 2348 const struct sk_buff *skb, 2349 bool *p_has_inner) 2350 { 2351 u32 hash_fields = ip6_multipath_hash_fields(net); 2352 struct flow_keys keys, hash_keys; 2353 2354 if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK)) 2355 return 0; 2356 2357 memset(&hash_keys, 0, sizeof(hash_keys)); 2358 skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_ENCAP); 2359 2360 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2361 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP) 2362 hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; 2363 
if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP) 2364 hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst; 2365 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO) 2366 hash_keys.basic.ip_proto = keys.basic.ip_proto; 2367 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL) 2368 hash_keys.tags.flow_label = keys.tags.flow_label; 2369 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) 2370 hash_keys.ports.src = keys.ports.src; 2371 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) 2372 hash_keys.ports.dst = keys.ports.dst; 2373 2374 *p_has_inner = !!(keys.control.flags & FLOW_DIS_ENCAPSULATION); 2375 return flow_hash_from_keys(&hash_keys); 2376 } 2377 2378 static u32 rt6_multipath_custom_hash_inner(const struct net *net, 2379 const struct sk_buff *skb, 2380 bool has_inner) 2381 { 2382 u32 hash_fields = ip6_multipath_hash_fields(net); 2383 struct flow_keys keys, hash_keys; 2384 2385 /* We assume the packet carries an encapsulation, but if none was 2386 * encountered during dissection of the outer flow, then there is no 2387 * point in calling the flow dissector again. 
2388 */ 2389 if (!has_inner) 2390 return 0; 2391 2392 if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_MASK)) 2393 return 0; 2394 2395 memset(&hash_keys, 0, sizeof(hash_keys)); 2396 skb_flow_dissect_flow_keys(skb, &keys, 0); 2397 2398 if (!(keys.control.flags & FLOW_DIS_ENCAPSULATION)) 2399 return 0; 2400 2401 if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { 2402 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; 2403 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP) 2404 hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; 2405 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP) 2406 hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst; 2407 } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { 2408 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2409 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP) 2410 hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; 2411 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP) 2412 hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst; 2413 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL) 2414 hash_keys.tags.flow_label = keys.tags.flow_label; 2415 } 2416 2417 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO) 2418 hash_keys.basic.ip_proto = keys.basic.ip_proto; 2419 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT) 2420 hash_keys.ports.src = keys.ports.src; 2421 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT) 2422 hash_keys.ports.dst = keys.ports.dst; 2423 2424 return flow_hash_from_keys(&hash_keys); 2425 } 2426 2427 static u32 rt6_multipath_custom_hash_skb(const struct net *net, 2428 const struct sk_buff *skb) 2429 { 2430 u32 mhash, mhash_inner; 2431 bool has_inner = true; 2432 2433 mhash = rt6_multipath_custom_hash_outer(net, skb, &has_inner); 2434 mhash_inner = rt6_multipath_custom_hash_inner(net, skb, has_inner); 2435 2436 return jhash_2words(mhash, mhash_inner, 0); 2437 } 2438 2439 static u32 
rt6_multipath_custom_hash_fl6(const struct net *net, 2440 const struct flowi6 *fl6) 2441 { 2442 u32 hash_fields = ip6_multipath_hash_fields(net); 2443 struct flow_keys hash_keys; 2444 2445 if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK)) 2446 return 0; 2447 2448 memset(&hash_keys, 0, sizeof(hash_keys)); 2449 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2450 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP) 2451 hash_keys.addrs.v6addrs.src = fl6->saddr; 2452 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP) 2453 hash_keys.addrs.v6addrs.dst = fl6->daddr; 2454 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO) 2455 hash_keys.basic.ip_proto = fl6->flowi6_proto; 2456 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL) 2457 hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); 2458 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) 2459 hash_keys.ports.src = fl6->fl6_sport; 2460 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) 2461 hash_keys.ports.dst = fl6->fl6_dport; 2462 2463 return flow_hash_from_keys(&hash_keys); 2464 } 2465 2466 /* if skb is set it will be used and fl6 can be NULL */ 2467 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, 2468 const struct sk_buff *skb, struct flow_keys *flkeys) 2469 { 2470 struct flow_keys hash_keys; 2471 u32 mhash = 0; 2472 2473 switch (ip6_multipath_hash_policy(net)) { 2474 case 0: 2475 memset(&hash_keys, 0, sizeof(hash_keys)); 2476 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2477 if (skb) { 2478 ip6_multipath_l3_keys(skb, &hash_keys, flkeys); 2479 } else { 2480 hash_keys.addrs.v6addrs.src = fl6->saddr; 2481 hash_keys.addrs.v6addrs.dst = fl6->daddr; 2482 hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); 2483 hash_keys.basic.ip_proto = fl6->flowi6_proto; 2484 } 2485 mhash = flow_hash_from_keys(&hash_keys); 2486 break; 2487 case 1: 2488 if (skb) { 2489 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP; 2490 struct flow_keys 
keys; 2491 2492 /* short-circuit if we already have L4 hash present */ 2493 if (skb->l4_hash) 2494 return skb_get_hash_raw(skb) >> 1; 2495 2496 memset(&hash_keys, 0, sizeof(hash_keys)); 2497 2498 if (!flkeys) { 2499 skb_flow_dissect_flow_keys(skb, &keys, flag); 2500 flkeys = &keys; 2501 } 2502 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2503 hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src; 2504 hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst; 2505 hash_keys.ports.src = flkeys->ports.src; 2506 hash_keys.ports.dst = flkeys->ports.dst; 2507 hash_keys.basic.ip_proto = flkeys->basic.ip_proto; 2508 } else { 2509 memset(&hash_keys, 0, sizeof(hash_keys)); 2510 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2511 hash_keys.addrs.v6addrs.src = fl6->saddr; 2512 hash_keys.addrs.v6addrs.dst = fl6->daddr; 2513 hash_keys.ports.src = fl6->fl6_sport; 2514 hash_keys.ports.dst = fl6->fl6_dport; 2515 hash_keys.basic.ip_proto = fl6->flowi6_proto; 2516 } 2517 mhash = flow_hash_from_keys(&hash_keys); 2518 break; 2519 case 2: 2520 memset(&hash_keys, 0, sizeof(hash_keys)); 2521 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2522 if (skb) { 2523 struct flow_keys keys; 2524 2525 if (!flkeys) { 2526 skb_flow_dissect_flow_keys(skb, &keys, 0); 2527 flkeys = &keys; 2528 } 2529 2530 /* Inner can be v4 or v6 */ 2531 if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { 2532 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; 2533 hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src; 2534 hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst; 2535 } else if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { 2536 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2537 hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src; 2538 hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst; 2539 hash_keys.tags.flow_label = flkeys->tags.flow_label; 2540 hash_keys.basic.ip_proto = 
flkeys->basic.ip_proto; 2541 } else { 2542 /* Same as case 0 */ 2543 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2544 ip6_multipath_l3_keys(skb, &hash_keys, flkeys); 2545 } 2546 } else { 2547 /* Same as case 0 */ 2548 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2549 hash_keys.addrs.v6addrs.src = fl6->saddr; 2550 hash_keys.addrs.v6addrs.dst = fl6->daddr; 2551 hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); 2552 hash_keys.basic.ip_proto = fl6->flowi6_proto; 2553 } 2554 mhash = flow_hash_from_keys(&hash_keys); 2555 break; 2556 case 3: 2557 if (skb) 2558 mhash = rt6_multipath_custom_hash_skb(net, skb); 2559 else 2560 mhash = rt6_multipath_custom_hash_fl6(net, fl6); 2561 break; 2562 } 2563 2564 return mhash >> 1; 2565 } 2566 2567 /* Called with rcu held */ 2568 void ip6_route_input(struct sk_buff *skb) 2569 { 2570 const struct ipv6hdr *iph = ipv6_hdr(skb); 2571 struct net *net = dev_net(skb->dev); 2572 int flags = RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_DST_NOREF; 2573 struct ip_tunnel_info *tun_info; 2574 struct flowi6 fl6 = { 2575 .flowi6_iif = skb->dev->ifindex, 2576 .daddr = iph->daddr, 2577 .saddr = iph->saddr, 2578 .flowlabel = ip6_flowinfo(iph), 2579 .flowi6_mark = skb->mark, 2580 .flowi6_proto = iph->nexthdr, 2581 }; 2582 struct flow_keys *flkeys = NULL, _flkeys; 2583 2584 tun_info = skb_tunnel_info(skb); 2585 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX)) 2586 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id; 2587 2588 if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys)) 2589 flkeys = &_flkeys; 2590 2591 if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6)) 2592 fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys); 2593 skb_dst_drop(skb); 2594 skb_dst_set_noref(skb, ip6_route_input_lookup(net, skb->dev, 2595 &fl6, skb, flags)); 2596 } 2597 2598 INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_output(struct net *net, 2599 struct fib6_table *table, 2600 struct flowi6 *fl6, 2601 const struct 
sk_buff *skb, 2602 int flags) 2603 { 2604 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags); 2605 } 2606 2607 static struct dst_entry *ip6_route_output_flags_noref(struct net *net, 2608 const struct sock *sk, 2609 struct flowi6 *fl6, 2610 int flags) 2611 { 2612 bool any_src; 2613 2614 if (ipv6_addr_type(&fl6->daddr) & 2615 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) { 2616 struct dst_entry *dst; 2617 2618 /* This function does not take refcnt on the dst */ 2619 dst = l3mdev_link_scope_lookup(net, fl6); 2620 if (dst) 2621 return dst; 2622 } 2623 2624 fl6->flowi6_iif = LOOPBACK_IFINDEX; 2625 2626 flags |= RT6_LOOKUP_F_DST_NOREF; 2627 any_src = ipv6_addr_any(&fl6->saddr); 2628 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) || 2629 (fl6->flowi6_oif && any_src)) 2630 flags |= RT6_LOOKUP_F_IFACE; 2631 2632 if (!any_src) 2633 flags |= RT6_LOOKUP_F_HAS_SADDR; 2634 else if (sk) 2635 flags |= rt6_srcprefs2flags(READ_ONCE(inet6_sk(sk)->srcprefs)); 2636 2637 return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output); 2638 } 2639 2640 struct dst_entry *ip6_route_output_flags(struct net *net, 2641 const struct sock *sk, 2642 struct flowi6 *fl6, 2643 int flags) 2644 { 2645 struct dst_entry *dst; 2646 struct rt6_info *rt6; 2647 2648 rcu_read_lock(); 2649 dst = ip6_route_output_flags_noref(net, sk, fl6, flags); 2650 rt6 = (struct rt6_info *)dst; 2651 /* For dst cached in uncached_list, refcnt is already taken. 
*/ 2652 if (list_empty(&rt6->dst.rt_uncached) && !dst_hold_safe(dst)) { 2653 dst = &net->ipv6.ip6_null_entry->dst; 2654 dst_hold(dst); 2655 } 2656 rcu_read_unlock(); 2657 2658 return dst; 2659 } 2660 EXPORT_SYMBOL_GPL(ip6_route_output_flags); 2661 2662 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) 2663 { 2664 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig; 2665 struct net_device *loopback_dev = net->loopback_dev; 2666 struct dst_entry *new = NULL; 2667 2668 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 2669 DST_OBSOLETE_DEAD, 0); 2670 if (rt) { 2671 rt6_info_init(rt); 2672 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); 2673 2674 new = &rt->dst; 2675 new->__use = 1; 2676 new->input = dst_discard; 2677 new->output = dst_discard_out; 2678 2679 dst_copy_metrics(new, &ort->dst); 2680 2681 rt->rt6i_idev = in6_dev_get(loopback_dev); 2682 rt->rt6i_gateway = ort->rt6i_gateway; 2683 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU; 2684 2685 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); 2686 #ifdef CONFIG_IPV6_SUBTREES 2687 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); 2688 #endif 2689 } 2690 2691 dst_release(dst_orig); 2692 return new ? 
new : ERR_PTR(-ENOMEM); 2693 } 2694 2695 /* 2696 * Destination cache support functions 2697 */ 2698 2699 static bool fib6_check(struct fib6_info *f6i, u32 cookie) 2700 { 2701 u32 rt_cookie = 0; 2702 2703 if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie) 2704 return false; 2705 2706 if (fib6_check_expired(f6i)) 2707 return false; 2708 2709 return true; 2710 } 2711 2712 static struct dst_entry *rt6_check(struct rt6_info *rt, 2713 struct fib6_info *from, 2714 u32 cookie) 2715 { 2716 u32 rt_cookie = 0; 2717 2718 if (!from || !fib6_get_cookie_safe(from, &rt_cookie) || 2719 rt_cookie != cookie) 2720 return NULL; 2721 2722 if (rt6_check_expired(rt)) 2723 return NULL; 2724 2725 return &rt->dst; 2726 } 2727 2728 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, 2729 struct fib6_info *from, 2730 u32 cookie) 2731 { 2732 if (!__rt6_check_expired(rt) && 2733 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && 2734 fib6_check(from, cookie)) 2735 return &rt->dst; 2736 else 2737 return NULL; 2738 } 2739 2740 INDIRECT_CALLABLE_SCOPE struct dst_entry *ip6_dst_check(struct dst_entry *dst, 2741 u32 cookie) 2742 { 2743 struct dst_entry *dst_ret; 2744 struct fib6_info *from; 2745 struct rt6_info *rt; 2746 2747 rt = container_of(dst, struct rt6_info, dst); 2748 2749 if (rt->sernum) 2750 return rt6_is_valid(rt) ? dst : NULL; 2751 2752 rcu_read_lock(); 2753 2754 /* All IPV6 dsts are created with ->obsolete set to the value 2755 * DST_OBSOLETE_FORCE_CHK which forces validation calls down 2756 * into this function always. 
2757 */ 2758 2759 from = rcu_dereference(rt->from); 2760 2761 if (from && (rt->rt6i_flags & RTF_PCPU || 2762 unlikely(!list_empty(&rt->dst.rt_uncached)))) 2763 dst_ret = rt6_dst_from_check(rt, from, cookie); 2764 else 2765 dst_ret = rt6_check(rt, from, cookie); 2766 2767 rcu_read_unlock(); 2768 2769 return dst_ret; 2770 } 2771 EXPORT_INDIRECT_CALLABLE(ip6_dst_check); 2772 2773 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) 2774 { 2775 struct rt6_info *rt = (struct rt6_info *) dst; 2776 2777 if (rt) { 2778 if (rt->rt6i_flags & RTF_CACHE) { 2779 rcu_read_lock(); 2780 if (rt6_check_expired(rt)) { 2781 rt6_remove_exception_rt(rt); 2782 dst = NULL; 2783 } 2784 rcu_read_unlock(); 2785 } else { 2786 dst_release(dst); 2787 dst = NULL; 2788 } 2789 } 2790 return dst; 2791 } 2792 2793 static void ip6_link_failure(struct sk_buff *skb) 2794 { 2795 struct rt6_info *rt; 2796 2797 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); 2798 2799 rt = (struct rt6_info *) skb_dst(skb); 2800 if (rt) { 2801 rcu_read_lock(); 2802 if (rt->rt6i_flags & RTF_CACHE) { 2803 rt6_remove_exception_rt(rt); 2804 } else { 2805 struct fib6_info *from; 2806 struct fib6_node *fn; 2807 2808 from = rcu_dereference(rt->from); 2809 if (from) { 2810 fn = rcu_dereference(from->fib6_node); 2811 if (fn && (rt->rt6i_flags & RTF_DEFAULT)) 2812 WRITE_ONCE(fn->fn_sernum, -1); 2813 } 2814 } 2815 rcu_read_unlock(); 2816 } 2817 } 2818 2819 static void rt6_update_expires(struct rt6_info *rt0, int timeout) 2820 { 2821 if (!(rt0->rt6i_flags & RTF_EXPIRES)) { 2822 struct fib6_info *from; 2823 2824 rcu_read_lock(); 2825 from = rcu_dereference(rt0->from); 2826 if (from) 2827 rt0->dst.expires = from->expires; 2828 rcu_read_unlock(); 2829 } 2830 2831 dst_set_expires(&rt0->dst, timeout); 2832 rt0->rt6i_flags |= RTF_EXPIRES; 2833 } 2834 2835 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) 2836 { 2837 struct net *net = dev_net(rt->dst.dev); 2838 2839 dst_metric_set(&rt->dst, RTAX_MTU, 
mtu); 2840 rt->rt6i_flags |= RTF_MODIFIED; 2841 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires); 2842 } 2843 2844 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) 2845 { 2846 return !(rt->rt6i_flags & RTF_CACHE) && 2847 (rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from)); 2848 } 2849 2850 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, 2851 const struct ipv6hdr *iph, u32 mtu, 2852 bool confirm_neigh) 2853 { 2854 const struct in6_addr *daddr, *saddr; 2855 struct rt6_info *rt6 = (struct rt6_info *)dst; 2856 2857 /* Note: do *NOT* check dst_metric_locked(dst, RTAX_MTU) 2858 * IPv6 pmtu discovery isn't optional, so 'mtu lock' cannot disable it. 2859 * [see also comment in rt6_mtu_change_route()] 2860 */ 2861 2862 if (iph) { 2863 daddr = &iph->daddr; 2864 saddr = &iph->saddr; 2865 } else if (sk) { 2866 daddr = &sk->sk_v6_daddr; 2867 saddr = &inet6_sk(sk)->saddr; 2868 } else { 2869 daddr = NULL; 2870 saddr = NULL; 2871 } 2872 2873 if (confirm_neigh) 2874 dst_confirm_neigh(dst, daddr); 2875 2876 if (mtu < IPV6_MIN_MTU) 2877 return; 2878 if (mtu >= dst_mtu(dst)) 2879 return; 2880 2881 if (!rt6_cache_allowed_for_pmtu(rt6)) { 2882 rt6_do_update_pmtu(rt6, mtu); 2883 /* update rt6_ex->stamp for cache */ 2884 if (rt6->rt6i_flags & RTF_CACHE) 2885 rt6_update_exception_stamp_rt(rt6); 2886 } else if (daddr) { 2887 struct fib6_result res = {}; 2888 struct rt6_info *nrt6; 2889 2890 rcu_read_lock(); 2891 res.f6i = rcu_dereference(rt6->from); 2892 if (!res.f6i) 2893 goto out_unlock; 2894 2895 res.fib6_flags = res.f6i->fib6_flags; 2896 res.fib6_type = res.f6i->fib6_type; 2897 2898 if (res.f6i->nh) { 2899 struct fib6_nh_match_arg arg = { 2900 .dev = dst->dev, 2901 .gw = &rt6->rt6i_gateway, 2902 }; 2903 2904 nexthop_for_each_fib6_nh(res.f6i->nh, 2905 fib6_nh_find_match, &arg); 2906 2907 /* fib6_info uses a nexthop that does not have fib6_nh 2908 * using the dst->dev + gw. Should be impossible. 
2909 */ 2910 if (!arg.match) 2911 goto out_unlock; 2912 2913 res.nh = arg.match; 2914 } else { 2915 res.nh = res.f6i->fib6_nh; 2916 } 2917 2918 nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr); 2919 if (nrt6) { 2920 rt6_do_update_pmtu(nrt6, mtu); 2921 if (rt6_insert_exception(nrt6, &res)) 2922 dst_release_immediate(&nrt6->dst); 2923 } 2924 out_unlock: 2925 rcu_read_unlock(); 2926 } 2927 } 2928 2929 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, 2930 struct sk_buff *skb, u32 mtu, 2931 bool confirm_neigh) 2932 { 2933 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu, 2934 confirm_neigh); 2935 } 2936 2937 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, 2938 int oif, u32 mark, kuid_t uid) 2939 { 2940 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2941 struct dst_entry *dst; 2942 struct flowi6 fl6 = { 2943 .flowi6_oif = oif, 2944 .flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark), 2945 .daddr = iph->daddr, 2946 .saddr = iph->saddr, 2947 .flowlabel = ip6_flowinfo(iph), 2948 .flowi6_uid = uid, 2949 }; 2950 2951 dst = ip6_route_output(net, NULL, &fl6); 2952 if (!dst->error) 2953 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu), true); 2954 dst_release(dst); 2955 } 2956 EXPORT_SYMBOL_GPL(ip6_update_pmtu); 2957 2958 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) 2959 { 2960 int oif = sk->sk_bound_dev_if; 2961 struct dst_entry *dst; 2962 2963 if (!oif && skb->dev) 2964 oif = l3mdev_master_ifindex(skb->dev); 2965 2966 ip6_update_pmtu(skb, sock_net(sk), mtu, oif, READ_ONCE(sk->sk_mark), 2967 sk->sk_uid); 2968 2969 dst = __sk_dst_get(sk); 2970 if (!dst || !dst->obsolete || 2971 dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) 2972 return; 2973 2974 bh_lock_sock(sk); 2975 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) 2976 ip6_datagram_dst_update(sk, false); 2977 bh_unlock_sock(sk); 2978 } 2979 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu); 2980 2981 void 
ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, 2982 const struct flowi6 *fl6) 2983 { 2984 #ifdef CONFIG_IPV6_SUBTREES 2985 struct ipv6_pinfo *np = inet6_sk(sk); 2986 #endif 2987 2988 ip6_dst_store(sk, dst, 2989 ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ? 2990 &sk->sk_v6_daddr : NULL, 2991 #ifdef CONFIG_IPV6_SUBTREES 2992 ipv6_addr_equal(&fl6->saddr, &np->saddr) ? 2993 &np->saddr : 2994 #endif 2995 NULL); 2996 } 2997 2998 static bool ip6_redirect_nh_match(const struct fib6_result *res, 2999 struct flowi6 *fl6, 3000 const struct in6_addr *gw, 3001 struct rt6_info **ret) 3002 { 3003 const struct fib6_nh *nh = res->nh; 3004 3005 if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family || 3006 fl6->flowi6_oif != nh->fib_nh_dev->ifindex) 3007 return false; 3008 3009 /* rt_cache's gateway might be different from its 'parent' 3010 * in the case of an ip redirect. 3011 * So we keep searching in the exception table if the gateway 3012 * is different. 3013 */ 3014 if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) { 3015 struct rt6_info *rt_cache; 3016 3017 rt_cache = rt6_find_cached_rt(res, &fl6->daddr, &fl6->saddr); 3018 if (rt_cache && 3019 ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) { 3020 *ret = rt_cache; 3021 return true; 3022 } 3023 return false; 3024 } 3025 return true; 3026 } 3027 3028 struct fib6_nh_rd_arg { 3029 struct fib6_result *res; 3030 struct flowi6 *fl6; 3031 const struct in6_addr *gw; 3032 struct rt6_info **ret; 3033 }; 3034 3035 static int fib6_nh_redirect_match(struct fib6_nh *nh, void *_arg) 3036 { 3037 struct fib6_nh_rd_arg *arg = _arg; 3038 3039 arg->res->nh = nh; 3040 return ip6_redirect_nh_match(arg->res, arg->fl6, arg->gw, arg->ret); 3041 } 3042 3043 /* Handle redirects */ 3044 struct ip6rd_flowi { 3045 struct flowi6 fl6; 3046 struct in6_addr gateway; 3047 }; 3048 3049 INDIRECT_CALLABLE_SCOPE struct rt6_info *__ip6_route_redirect(struct net *net, 3050 struct fib6_table *table, 3051 struct flowi6 *fl6, 3052 const struct sk_buff *skb, 
3053 int flags) 3054 { 3055 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; 3056 struct rt6_info *ret = NULL; 3057 struct fib6_result res = {}; 3058 struct fib6_nh_rd_arg arg = { 3059 .res = &res, 3060 .fl6 = fl6, 3061 .gw = &rdfl->gateway, 3062 .ret = &ret 3063 }; 3064 struct fib6_info *rt; 3065 struct fib6_node *fn; 3066 3067 /* Get the "current" route for this destination and 3068 * check if the redirect has come from appropriate router. 3069 * 3070 * RFC 4861 specifies that redirects should only be 3071 * accepted if they come from the nexthop to the target. 3072 * Due to the way the routes are chosen, this notion 3073 * is a bit fuzzy and one might need to check all possible 3074 * routes. 3075 */ 3076 3077 rcu_read_lock(); 3078 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 3079 restart: 3080 for_each_fib6_node_rt_rcu(fn) { 3081 res.f6i = rt; 3082 if (fib6_check_expired(rt)) 3083 continue; 3084 if (rt->fib6_flags & RTF_REJECT) 3085 break; 3086 if (unlikely(rt->nh)) { 3087 if (nexthop_is_blackhole(rt->nh)) 3088 continue; 3089 /* on match, res->nh is filled in and potentially ret */ 3090 if (nexthop_for_each_fib6_nh(rt->nh, 3091 fib6_nh_redirect_match, 3092 &arg)) 3093 goto out; 3094 } else { 3095 res.nh = rt->fib6_nh; 3096 if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway, 3097 &ret)) 3098 goto out; 3099 } 3100 } 3101 3102 if (!rt) 3103 rt = net->ipv6.fib6_null_entry; 3104 else if (rt->fib6_flags & RTF_REJECT) { 3105 ret = net->ipv6.ip6_null_entry; 3106 goto out; 3107 } 3108 3109 if (rt == net->ipv6.fib6_null_entry) { 3110 fn = fib6_backtrack(fn, &fl6->saddr); 3111 if (fn) 3112 goto restart; 3113 } 3114 3115 res.f6i = rt; 3116 res.nh = rt->fib6_nh; 3117 out: 3118 if (ret) { 3119 ip6_hold_safe(net, &ret); 3120 } else { 3121 res.fib6_flags = res.f6i->fib6_flags; 3122 res.fib6_type = res.f6i->fib6_type; 3123 ret = ip6_create_rt_rcu(&res); 3124 } 3125 3126 rcu_read_unlock(); 3127 3128 trace_fib6_table_lookup(net, &res, table, fl6); 3129 
return ret; 3130 }; 3131 3132 static struct dst_entry *ip6_route_redirect(struct net *net, 3133 const struct flowi6 *fl6, 3134 const struct sk_buff *skb, 3135 const struct in6_addr *gateway) 3136 { 3137 int flags = RT6_LOOKUP_F_HAS_SADDR; 3138 struct ip6rd_flowi rdfl; 3139 3140 rdfl.fl6 = *fl6; 3141 rdfl.gateway = *gateway; 3142 3143 return fib6_rule_lookup(net, &rdfl.fl6, skb, 3144 flags, __ip6_route_redirect); 3145 } 3146 3147 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, 3148 kuid_t uid) 3149 { 3150 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 3151 struct dst_entry *dst; 3152 struct flowi6 fl6 = { 3153 .flowi6_iif = LOOPBACK_IFINDEX, 3154 .flowi6_oif = oif, 3155 .flowi6_mark = mark, 3156 .daddr = iph->daddr, 3157 .saddr = iph->saddr, 3158 .flowlabel = ip6_flowinfo(iph), 3159 .flowi6_uid = uid, 3160 }; 3161 3162 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr); 3163 rt6_do_redirect(dst, NULL, skb); 3164 dst_release(dst); 3165 } 3166 EXPORT_SYMBOL_GPL(ip6_redirect); 3167 3168 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif) 3169 { 3170 const struct ipv6hdr *iph = ipv6_hdr(skb); 3171 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb); 3172 struct dst_entry *dst; 3173 struct flowi6 fl6 = { 3174 .flowi6_iif = LOOPBACK_IFINDEX, 3175 .flowi6_oif = oif, 3176 .daddr = msg->dest, 3177 .saddr = iph->daddr, 3178 .flowi6_uid = sock_net_uid(net, NULL), 3179 }; 3180 3181 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr); 3182 rt6_do_redirect(dst, NULL, skb); 3183 dst_release(dst); 3184 } 3185 3186 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) 3187 { 3188 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, 3189 READ_ONCE(sk->sk_mark), sk->sk_uid); 3190 } 3191 EXPORT_SYMBOL_GPL(ip6_sk_redirect); 3192 3193 static unsigned int ip6_default_advmss(const struct dst_entry *dst) 3194 { 3195 struct net_device *dev = dst->dev; 3196 unsigned int mtu = dst_mtu(dst); 3197 struct net 
*net = dev_net(dev); 3198 3199 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); 3200 3201 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) 3202 mtu = net->ipv6.sysctl.ip6_rt_min_advmss; 3203 3204 /* 3205 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 3206 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 3207 * IPV6_MAXPLEN is also valid and means: "any MSS, 3208 * rely only on pmtu discovery" 3209 */ 3210 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr)) 3211 mtu = IPV6_MAXPLEN; 3212 return mtu; 3213 } 3214 3215 INDIRECT_CALLABLE_SCOPE unsigned int ip6_mtu(const struct dst_entry *dst) 3216 { 3217 return ip6_dst_mtu_maybe_forward(dst, false); 3218 } 3219 EXPORT_INDIRECT_CALLABLE(ip6_mtu); 3220 3221 /* MTU selection: 3222 * 1. mtu on route is locked - use it 3223 * 2. mtu from nexthop exception 3224 * 3. mtu from egress device 3225 * 3226 * based on ip6_dst_mtu_forward and exception logic of 3227 * rt6_find_cached_rt; called with rcu_read_lock 3228 */ 3229 u32 ip6_mtu_from_fib6(const struct fib6_result *res, 3230 const struct in6_addr *daddr, 3231 const struct in6_addr *saddr) 3232 { 3233 const struct fib6_nh *nh = res->nh; 3234 struct fib6_info *f6i = res->f6i; 3235 struct inet6_dev *idev; 3236 struct rt6_info *rt; 3237 u32 mtu = 0; 3238 3239 if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) { 3240 mtu = f6i->fib6_pmtu; 3241 if (mtu) 3242 goto out; 3243 } 3244 3245 rt = rt6_find_cached_rt(res, daddr, saddr); 3246 if (unlikely(rt)) { 3247 mtu = dst_metric_raw(&rt->dst, RTAX_MTU); 3248 } else { 3249 struct net_device *dev = nh->fib_nh_dev; 3250 3251 mtu = IPV6_MIN_MTU; 3252 idev = __in6_dev_get(dev); 3253 if (idev) 3254 mtu = max_t(u32, mtu, READ_ONCE(idev->cnf.mtu6)); 3255 } 3256 3257 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 3258 out: 3259 return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu); 3260 } 3261 3262 struct dst_entry *icmp6_dst_alloc(struct net_device *dev, 3263 struct flowi6 *fl6) 3264 { 3265 struct dst_entry *dst; 3266 struct rt6_info *rt; 
3267 struct inet6_dev *idev = in6_dev_get(dev); 3268 struct net *net = dev_net(dev); 3269 3270 if (unlikely(!idev)) 3271 return ERR_PTR(-ENODEV); 3272 3273 rt = ip6_dst_alloc(net, dev, 0); 3274 if (unlikely(!rt)) { 3275 in6_dev_put(idev); 3276 dst = ERR_PTR(-ENOMEM); 3277 goto out; 3278 } 3279 3280 rt->dst.input = ip6_input; 3281 rt->dst.output = ip6_output; 3282 rt->rt6i_gateway = fl6->daddr; 3283 rt->rt6i_dst.addr = fl6->daddr; 3284 rt->rt6i_dst.plen = 128; 3285 rt->rt6i_idev = idev; 3286 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0); 3287 3288 /* Add this dst into uncached_list so that rt6_disable_ip() can 3289 * do proper release of the net_device 3290 */ 3291 rt6_uncached_list_add(rt); 3292 3293 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0); 3294 3295 out: 3296 return dst; 3297 } 3298 3299 static void ip6_dst_gc(struct dst_ops *ops) 3300 { 3301 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); 3302 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; 3303 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; 3304 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; 3305 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; 3306 unsigned int val; 3307 int entries; 3308 3309 if (time_after(rt_last_gc + rt_min_interval, jiffies)) 3310 goto out; 3311 3312 fib6_run_gc(atomic_inc_return(&net->ipv6.ip6_rt_gc_expire), net, true); 3313 entries = dst_entries_get_slow(ops); 3314 if (entries < ops->gc_thresh) 3315 atomic_set(&net->ipv6.ip6_rt_gc_expire, rt_gc_timeout >> 1); 3316 out: 3317 val = atomic_read(&net->ipv6.ip6_rt_gc_expire); 3318 atomic_set(&net->ipv6.ip6_rt_gc_expire, val - (val >> rt_elasticity)); 3319 } 3320 3321 static int ip6_nh_lookup_table(struct net *net, struct fib6_config *cfg, 3322 const struct in6_addr *gw_addr, u32 tbid, 3323 int flags, struct fib6_result *res) 3324 { 3325 struct flowi6 fl6 = { 3326 .flowi6_oif = cfg->fc_ifindex, 3327 .daddr = *gw_addr, 3328 .saddr = cfg->fc_prefsrc, 3329 }; 3330 
struct fib6_table *table; 3331 int err; 3332 3333 table = fib6_get_table(net, tbid); 3334 if (!table) 3335 return -EINVAL; 3336 3337 if (!ipv6_addr_any(&cfg->fc_prefsrc)) 3338 flags |= RT6_LOOKUP_F_HAS_SADDR; 3339 3340 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE; 3341 3342 err = fib6_table_lookup(net, table, cfg->fc_ifindex, &fl6, res, flags); 3343 if (!err && res->f6i != net->ipv6.fib6_null_entry) 3344 fib6_select_path(net, res, &fl6, cfg->fc_ifindex, 3345 cfg->fc_ifindex != 0, NULL, flags); 3346 3347 return err; 3348 } 3349 3350 static int ip6_route_check_nh_onlink(struct net *net, 3351 struct fib6_config *cfg, 3352 const struct net_device *dev, 3353 struct netlink_ext_ack *extack) 3354 { 3355 u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN; 3356 const struct in6_addr *gw_addr = &cfg->fc_gateway; 3357 struct fib6_result res = {}; 3358 int err; 3359 3360 err = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0, &res); 3361 if (!err && !(res.fib6_flags & RTF_REJECT) && 3362 /* ignore match if it is the default route */ 3363 !ipv6_addr_any(&res.f6i->fib6_dst.addr) && 3364 (res.fib6_type != RTN_UNICAST || dev != res.nh->fib_nh_dev)) { 3365 NL_SET_ERR_MSG(extack, 3366 "Nexthop has invalid gateway or device mismatch"); 3367 err = -EINVAL; 3368 } 3369 3370 return err; 3371 } 3372 3373 static int ip6_route_check_nh(struct net *net, 3374 struct fib6_config *cfg, 3375 struct net_device **_dev, 3376 netdevice_tracker *dev_tracker, 3377 struct inet6_dev **idev) 3378 { 3379 const struct in6_addr *gw_addr = &cfg->fc_gateway; 3380 struct net_device *dev = _dev ? *_dev : NULL; 3381 int flags = RT6_LOOKUP_F_IFACE; 3382 struct fib6_result res = {}; 3383 int err = -EHOSTUNREACH; 3384 3385 if (cfg->fc_table) { 3386 err = ip6_nh_lookup_table(net, cfg, gw_addr, 3387 cfg->fc_table, flags, &res); 3388 /* gw_addr can not require a gateway or resolve to a reject 3389 * route. If a device is given, it must match the result. 
3390 */ 3391 if (err || res.fib6_flags & RTF_REJECT || 3392 res.nh->fib_nh_gw_family || 3393 (dev && dev != res.nh->fib_nh_dev)) 3394 err = -EHOSTUNREACH; 3395 } 3396 3397 if (err < 0) { 3398 struct flowi6 fl6 = { 3399 .flowi6_oif = cfg->fc_ifindex, 3400 .daddr = *gw_addr, 3401 }; 3402 3403 err = fib6_lookup(net, cfg->fc_ifindex, &fl6, &res, flags); 3404 if (err || res.fib6_flags & RTF_REJECT || 3405 res.nh->fib_nh_gw_family) 3406 err = -EHOSTUNREACH; 3407 3408 if (err) 3409 return err; 3410 3411 fib6_select_path(net, &res, &fl6, cfg->fc_ifindex, 3412 cfg->fc_ifindex != 0, NULL, flags); 3413 } 3414 3415 err = 0; 3416 if (dev) { 3417 if (dev != res.nh->fib_nh_dev) 3418 err = -EHOSTUNREACH; 3419 } else { 3420 *_dev = dev = res.nh->fib_nh_dev; 3421 netdev_hold(dev, dev_tracker, GFP_ATOMIC); 3422 *idev = in6_dev_get(dev); 3423 } 3424 3425 return err; 3426 } 3427 3428 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg, 3429 struct net_device **_dev, 3430 netdevice_tracker *dev_tracker, 3431 struct inet6_dev **idev, 3432 struct netlink_ext_ack *extack) 3433 { 3434 const struct in6_addr *gw_addr = &cfg->fc_gateway; 3435 int gwa_type = ipv6_addr_type(gw_addr); 3436 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true; 3437 const struct net_device *dev = *_dev; 3438 bool need_addr_check = !dev; 3439 int err = -EINVAL; 3440 3441 /* if gw_addr is local we will fail to detect this in case 3442 * address is still TENTATIVE (DAD in progress). rt6_lookup() 3443 * will return already-added prefix route via interface that 3444 * prefix route was assigned to, which might be non-loopback. 3445 */ 3446 if (dev && 3447 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 3448 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 3449 goto out; 3450 } 3451 3452 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) { 3453 /* IPv6 strictly inhibits using not link-local 3454 * addresses as nexthop address. 
3455 * Otherwise, router will not able to send redirects. 3456 * It is very good, but in some (rare!) circumstances 3457 * (SIT, PtP, NBMA NOARP links) it is handy to allow 3458 * some exceptions. --ANK 3459 * We allow IPv4-mapped nexthops to support RFC4798-type 3460 * addressing 3461 */ 3462 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) { 3463 NL_SET_ERR_MSG(extack, "Invalid gateway address"); 3464 goto out; 3465 } 3466 3467 rcu_read_lock(); 3468 3469 if (cfg->fc_flags & RTNH_F_ONLINK) 3470 err = ip6_route_check_nh_onlink(net, cfg, dev, extack); 3471 else 3472 err = ip6_route_check_nh(net, cfg, _dev, dev_tracker, 3473 idev); 3474 3475 rcu_read_unlock(); 3476 3477 if (err) 3478 goto out; 3479 } 3480 3481 /* reload in case device was changed */ 3482 dev = *_dev; 3483 3484 err = -EINVAL; 3485 if (!dev) { 3486 NL_SET_ERR_MSG(extack, "Egress device not specified"); 3487 goto out; 3488 } else if (dev->flags & IFF_LOOPBACK) { 3489 NL_SET_ERR_MSG(extack, 3490 "Egress device can not be loopback device for this route"); 3491 goto out; 3492 } 3493 3494 /* if we did not check gw_addr above, do so now that the 3495 * egress device has been resolved. 
3496 */ 3497 if (need_addr_check && 3498 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 3499 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 3500 goto out; 3501 } 3502 3503 err = 0; 3504 out: 3505 return err; 3506 } 3507 3508 static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type) 3509 { 3510 if ((flags & RTF_REJECT) || 3511 (dev && (dev->flags & IFF_LOOPBACK) && 3512 !(addr_type & IPV6_ADDR_LOOPBACK) && 3513 !(flags & (RTF_ANYCAST | RTF_LOCAL)))) 3514 return true; 3515 3516 return false; 3517 } 3518 3519 int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, 3520 struct fib6_config *cfg, gfp_t gfp_flags, 3521 struct netlink_ext_ack *extack) 3522 { 3523 netdevice_tracker *dev_tracker = &fib6_nh->fib_nh_dev_tracker; 3524 struct net_device *dev = NULL; 3525 struct inet6_dev *idev = NULL; 3526 int addr_type; 3527 int err; 3528 3529 fib6_nh->fib_nh_family = AF_INET6; 3530 #ifdef CONFIG_IPV6_ROUTER_PREF 3531 fib6_nh->last_probe = jiffies; 3532 #endif 3533 if (cfg->fc_is_fdb) { 3534 fib6_nh->fib_nh_gw6 = cfg->fc_gateway; 3535 fib6_nh->fib_nh_gw_family = AF_INET6; 3536 return 0; 3537 } 3538 3539 err = -ENODEV; 3540 if (cfg->fc_ifindex) { 3541 dev = netdev_get_by_index(net, cfg->fc_ifindex, 3542 dev_tracker, gfp_flags); 3543 if (!dev) 3544 goto out; 3545 idev = in6_dev_get(dev); 3546 if (!idev) 3547 goto out; 3548 } 3549 3550 if (cfg->fc_flags & RTNH_F_ONLINK) { 3551 if (!dev) { 3552 NL_SET_ERR_MSG(extack, 3553 "Nexthop device required for onlink"); 3554 goto out; 3555 } 3556 3557 if (!(dev->flags & IFF_UP)) { 3558 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 3559 err = -ENETDOWN; 3560 goto out; 3561 } 3562 3563 fib6_nh->fib_nh_flags |= RTNH_F_ONLINK; 3564 } 3565 3566 fib6_nh->fib_nh_weight = 1; 3567 3568 /* We cannot add true routes via loopback here, 3569 * they would result in kernel looping; promote them to reject routes 3570 */ 3571 addr_type = ipv6_addr_type(&cfg->fc_dst); 3572 if 
(fib6_is_reject(cfg->fc_flags, dev, addr_type)) { 3573 /* hold loopback dev/idev if we haven't done so. */ 3574 if (dev != net->loopback_dev) { 3575 if (dev) { 3576 netdev_put(dev, dev_tracker); 3577 in6_dev_put(idev); 3578 } 3579 dev = net->loopback_dev; 3580 netdev_hold(dev, dev_tracker, gfp_flags); 3581 idev = in6_dev_get(dev); 3582 if (!idev) { 3583 err = -ENODEV; 3584 goto out; 3585 } 3586 } 3587 goto pcpu_alloc; 3588 } 3589 3590 if (cfg->fc_flags & RTF_GATEWAY) { 3591 err = ip6_validate_gw(net, cfg, &dev, dev_tracker, 3592 &idev, extack); 3593 if (err) 3594 goto out; 3595 3596 fib6_nh->fib_nh_gw6 = cfg->fc_gateway; 3597 fib6_nh->fib_nh_gw_family = AF_INET6; 3598 } 3599 3600 err = -ENODEV; 3601 if (!dev) 3602 goto out; 3603 3604 if (idev->cnf.disable_ipv6) { 3605 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device"); 3606 err = -EACCES; 3607 goto out; 3608 } 3609 3610 if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) { 3611 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 3612 err = -ENETDOWN; 3613 goto out; 3614 } 3615 3616 if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) && 3617 !netif_carrier_ok(dev)) 3618 fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN; 3619 3620 err = fib_nh_common_init(net, &fib6_nh->nh_common, cfg->fc_encap, 3621 cfg->fc_encap_type, cfg, gfp_flags, extack); 3622 if (err) 3623 goto out; 3624 3625 pcpu_alloc: 3626 fib6_nh->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags); 3627 if (!fib6_nh->rt6i_pcpu) { 3628 err = -ENOMEM; 3629 goto out; 3630 } 3631 3632 fib6_nh->fib_nh_dev = dev; 3633 fib6_nh->fib_nh_oif = dev->ifindex; 3634 err = 0; 3635 out: 3636 if (idev) 3637 in6_dev_put(idev); 3638 3639 if (err) { 3640 lwtstate_put(fib6_nh->fib_nh_lws); 3641 fib6_nh->fib_nh_lws = NULL; 3642 netdev_put(dev, dev_tracker); 3643 } 3644 3645 return err; 3646 } 3647 3648 void fib6_nh_release(struct fib6_nh *fib6_nh) 3649 { 3650 struct rt6_exception_bucket *bucket; 3651 3652 rcu_read_lock(); 3653 3654 fib6_nh_flush_exceptions(fib6_nh, 
NULL); 3655 bucket = fib6_nh_get_excptn_bucket(fib6_nh, NULL); 3656 if (bucket) { 3657 rcu_assign_pointer(fib6_nh->rt6i_exception_bucket, NULL); 3658 kfree(bucket); 3659 } 3660 3661 rcu_read_unlock(); 3662 3663 fib6_nh_release_dsts(fib6_nh); 3664 free_percpu(fib6_nh->rt6i_pcpu); 3665 3666 fib_nh_common_release(&fib6_nh->nh_common); 3667 } 3668 3669 void fib6_nh_release_dsts(struct fib6_nh *fib6_nh) 3670 { 3671 int cpu; 3672 3673 if (!fib6_nh->rt6i_pcpu) 3674 return; 3675 3676 for_each_possible_cpu(cpu) { 3677 struct rt6_info *pcpu_rt, **ppcpu_rt; 3678 3679 ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu); 3680 pcpu_rt = xchg(ppcpu_rt, NULL); 3681 if (pcpu_rt) { 3682 dst_dev_put(&pcpu_rt->dst); 3683 dst_release(&pcpu_rt->dst); 3684 } 3685 } 3686 } 3687 3688 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, 3689 gfp_t gfp_flags, 3690 struct netlink_ext_ack *extack) 3691 { 3692 struct net *net = cfg->fc_nlinfo.nl_net; 3693 struct fib6_info *rt = NULL; 3694 struct nexthop *nh = NULL; 3695 struct fib6_table *table; 3696 struct fib6_nh *fib6_nh; 3697 int err = -EINVAL; 3698 int addr_type; 3699 3700 /* RTF_PCPU is an internal flag; can not be set by userspace */ 3701 if (cfg->fc_flags & RTF_PCPU) { 3702 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU"); 3703 goto out; 3704 } 3705 3706 /* RTF_CACHE is an internal flag; can not be set by userspace */ 3707 if (cfg->fc_flags & RTF_CACHE) { 3708 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE"); 3709 goto out; 3710 } 3711 3712 if (cfg->fc_type > RTN_MAX) { 3713 NL_SET_ERR_MSG(extack, "Invalid route type"); 3714 goto out; 3715 } 3716 3717 if (cfg->fc_dst_len > 128) { 3718 NL_SET_ERR_MSG(extack, "Invalid prefix length"); 3719 goto out; 3720 } 3721 if (cfg->fc_src_len > 128) { 3722 NL_SET_ERR_MSG(extack, "Invalid source address length"); 3723 goto out; 3724 } 3725 #ifndef CONFIG_IPV6_SUBTREES 3726 if (cfg->fc_src_len) { 3727 NL_SET_ERR_MSG(extack, 3728 "Specifying source address requires 
IPV6_SUBTREES to be enabled"); 3729 goto out; 3730 } 3731 #endif 3732 if (cfg->fc_nh_id) { 3733 nh = nexthop_find_by_id(net, cfg->fc_nh_id); 3734 if (!nh) { 3735 NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); 3736 goto out; 3737 } 3738 err = fib6_check_nexthop(nh, cfg, extack); 3739 if (err) 3740 goto out; 3741 } 3742 3743 err = -ENOBUFS; 3744 if (cfg->fc_nlinfo.nlh && 3745 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) { 3746 table = fib6_get_table(net, cfg->fc_table); 3747 if (!table) { 3748 pr_warn("NLM_F_CREATE should be specified when creating new route\n"); 3749 table = fib6_new_table(net, cfg->fc_table); 3750 } 3751 } else { 3752 table = fib6_new_table(net, cfg->fc_table); 3753 } 3754 3755 if (!table) 3756 goto out; 3757 3758 err = -ENOMEM; 3759 rt = fib6_info_alloc(gfp_flags, !nh); 3760 if (!rt) 3761 goto out; 3762 3763 rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len, 3764 extack); 3765 if (IS_ERR(rt->fib6_metrics)) { 3766 err = PTR_ERR(rt->fib6_metrics); 3767 /* Do not leave garbage there. */ 3768 rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics; 3769 goto out_free; 3770 } 3771 3772 if (cfg->fc_flags & RTF_ADDRCONF) 3773 rt->dst_nocount = true; 3774 3775 if (cfg->fc_flags & RTF_EXPIRES) 3776 fib6_set_expires(rt, jiffies + 3777 clock_t_to_jiffies(cfg->fc_expires)); 3778 3779 if (cfg->fc_protocol == RTPROT_UNSPEC) 3780 cfg->fc_protocol = RTPROT_BOOT; 3781 rt->fib6_protocol = cfg->fc_protocol; 3782 3783 rt->fib6_table = table; 3784 rt->fib6_metric = cfg->fc_metric; 3785 rt->fib6_type = cfg->fc_type ? 
: RTN_UNICAST; 3786 rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY; 3787 3788 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); 3789 rt->fib6_dst.plen = cfg->fc_dst_len; 3790 3791 #ifdef CONFIG_IPV6_SUBTREES 3792 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len); 3793 rt->fib6_src.plen = cfg->fc_src_len; 3794 #endif 3795 if (nh) { 3796 if (rt->fib6_src.plen) { 3797 NL_SET_ERR_MSG(extack, "Nexthops can not be used with source routing"); 3798 goto out_free; 3799 } 3800 if (!nexthop_get(nh)) { 3801 NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); 3802 goto out_free; 3803 } 3804 rt->nh = nh; 3805 fib6_nh = nexthop_fib6_nh(rt->nh); 3806 } else { 3807 err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack); 3808 if (err) 3809 goto out; 3810 3811 fib6_nh = rt->fib6_nh; 3812 3813 /* We cannot add true routes via loopback here, they would 3814 * result in kernel looping; promote them to reject routes 3815 */ 3816 addr_type = ipv6_addr_type(&cfg->fc_dst); 3817 if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh->fib_nh_dev, 3818 addr_type)) 3819 rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP; 3820 } 3821 3822 if (!ipv6_addr_any(&cfg->fc_prefsrc)) { 3823 struct net_device *dev = fib6_nh->fib_nh_dev; 3824 3825 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { 3826 NL_SET_ERR_MSG(extack, "Invalid source address"); 3827 err = -EINVAL; 3828 goto out; 3829 } 3830 rt->fib6_prefsrc.addr = cfg->fc_prefsrc; 3831 rt->fib6_prefsrc.plen = 128; 3832 } else 3833 rt->fib6_prefsrc.plen = 0; 3834 3835 return rt; 3836 out: 3837 fib6_info_release(rt); 3838 return ERR_PTR(err); 3839 out_free: 3840 ip_fib_metrics_put(rt->fib6_metrics); 3841 kfree(rt); 3842 return ERR_PTR(err); 3843 } 3844 3845 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, 3846 struct netlink_ext_ack *extack) 3847 { 3848 struct fib6_info *rt; 3849 int err; 3850 3851 rt = ip6_route_info_create(cfg, gfp_flags, extack); 3852 if (IS_ERR(rt)) 3853 return PTR_ERR(rt); 3854 3855 err = 
__ip6_ins_rt(rt, &cfg->fc_nlinfo, extack); 3856 fib6_info_release(rt); 3857 3858 return err; 3859 } 3860 3861 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info) 3862 { 3863 struct net *net = info->nl_net; 3864 struct fib6_table *table; 3865 int err; 3866 3867 if (rt == net->ipv6.fib6_null_entry) { 3868 err = -ENOENT; 3869 goto out; 3870 } 3871 3872 table = rt->fib6_table; 3873 spin_lock_bh(&table->tb6_lock); 3874 err = fib6_del(rt, info); 3875 spin_unlock_bh(&table->tb6_lock); 3876 3877 out: 3878 fib6_info_release(rt); 3879 return err; 3880 } 3881 3882 int ip6_del_rt(struct net *net, struct fib6_info *rt, bool skip_notify) 3883 { 3884 struct nl_info info = { 3885 .nl_net = net, 3886 .skip_notify = skip_notify 3887 }; 3888 3889 return __ip6_del_rt(rt, &info); 3890 } 3891 3892 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg) 3893 { 3894 struct nl_info *info = &cfg->fc_nlinfo; 3895 struct net *net = info->nl_net; 3896 struct sk_buff *skb = NULL; 3897 struct fib6_table *table; 3898 int err = -ENOENT; 3899 3900 if (rt == net->ipv6.fib6_null_entry) 3901 goto out_put; 3902 table = rt->fib6_table; 3903 spin_lock_bh(&table->tb6_lock); 3904 3905 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) { 3906 struct fib6_info *sibling, *next_sibling; 3907 struct fib6_node *fn; 3908 3909 /* prefer to send a single notification with all hops */ 3910 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 3911 if (skb) { 3912 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; 3913 3914 if (rt6_fill_node(net, skb, rt, NULL, 3915 NULL, NULL, 0, RTM_DELROUTE, 3916 info->portid, seq, 0) < 0) { 3917 kfree_skb(skb); 3918 skb = NULL; 3919 } else 3920 info->skip_notify = 1; 3921 } 3922 3923 /* 'rt' points to the first sibling route. If it is not the 3924 * leaf, then we do not need to send a notification. Otherwise, 3925 * we need to check if the last sibling has a next route or not 3926 * and emit a replace or delete notification, respectively. 
3927 */ 3928 info->skip_notify_kernel = 1; 3929 fn = rcu_dereference_protected(rt->fib6_node, 3930 lockdep_is_held(&table->tb6_lock)); 3931 if (rcu_access_pointer(fn->leaf) == rt) { 3932 struct fib6_info *last_sibling, *replace_rt; 3933 3934 last_sibling = list_last_entry(&rt->fib6_siblings, 3935 struct fib6_info, 3936 fib6_siblings); 3937 replace_rt = rcu_dereference_protected( 3938 last_sibling->fib6_next, 3939 lockdep_is_held(&table->tb6_lock)); 3940 if (replace_rt) 3941 call_fib6_entry_notifiers_replace(net, 3942 replace_rt); 3943 else 3944 call_fib6_multipath_entry_notifiers(net, 3945 FIB_EVENT_ENTRY_DEL, 3946 rt, rt->fib6_nsiblings, 3947 NULL); 3948 } 3949 list_for_each_entry_safe(sibling, next_sibling, 3950 &rt->fib6_siblings, 3951 fib6_siblings) { 3952 err = fib6_del(sibling, info); 3953 if (err) 3954 goto out_unlock; 3955 } 3956 } 3957 3958 err = fib6_del(rt, info); 3959 out_unlock: 3960 spin_unlock_bh(&table->tb6_lock); 3961 out_put: 3962 fib6_info_release(rt); 3963 3964 if (skb) { 3965 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 3966 info->nlh, gfp_any()); 3967 } 3968 return err; 3969 } 3970 3971 static int __ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) 3972 { 3973 int rc = -ESRCH; 3974 3975 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex) 3976 goto out; 3977 3978 if (cfg->fc_flags & RTF_GATEWAY && 3979 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) 3980 goto out; 3981 3982 rc = rt6_remove_exception_rt(rt); 3983 out: 3984 return rc; 3985 } 3986 3987 static int ip6_del_cached_rt(struct fib6_config *cfg, struct fib6_info *rt, 3988 struct fib6_nh *nh) 3989 { 3990 struct fib6_result res = { 3991 .f6i = rt, 3992 .nh = nh, 3993 }; 3994 struct rt6_info *rt_cache; 3995 3996 rt_cache = rt6_find_cached_rt(&res, &cfg->fc_dst, &cfg->fc_src); 3997 if (rt_cache) 3998 return __ip6_del_cached_rt(rt_cache, cfg); 3999 4000 return 0; 4001 } 4002 4003 struct fib6_nh_del_cached_rt_arg { 4004 struct fib6_config *cfg; 4005 
struct fib6_info *f6i; 4006 }; 4007 4008 static int fib6_nh_del_cached_rt(struct fib6_nh *nh, void *_arg) 4009 { 4010 struct fib6_nh_del_cached_rt_arg *arg = _arg; 4011 int rc; 4012 4013 rc = ip6_del_cached_rt(arg->cfg, arg->f6i, nh); 4014 return rc != -ESRCH ? rc : 0; 4015 } 4016 4017 static int ip6_del_cached_rt_nh(struct fib6_config *cfg, struct fib6_info *f6i) 4018 { 4019 struct fib6_nh_del_cached_rt_arg arg = { 4020 .cfg = cfg, 4021 .f6i = f6i 4022 }; 4023 4024 return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_del_cached_rt, &arg); 4025 } 4026 4027 static int ip6_route_del(struct fib6_config *cfg, 4028 struct netlink_ext_ack *extack) 4029 { 4030 struct fib6_table *table; 4031 struct fib6_info *rt; 4032 struct fib6_node *fn; 4033 int err = -ESRCH; 4034 4035 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); 4036 if (!table) { 4037 NL_SET_ERR_MSG(extack, "FIB table does not exist"); 4038 return err; 4039 } 4040 4041 rcu_read_lock(); 4042 4043 fn = fib6_locate(&table->tb6_root, 4044 &cfg->fc_dst, cfg->fc_dst_len, 4045 &cfg->fc_src, cfg->fc_src_len, 4046 !(cfg->fc_flags & RTF_CACHE)); 4047 4048 if (fn) { 4049 for_each_fib6_node_rt_rcu(fn) { 4050 struct fib6_nh *nh; 4051 4052 if (rt->nh && cfg->fc_nh_id && 4053 rt->nh->id != cfg->fc_nh_id) 4054 continue; 4055 4056 if (cfg->fc_flags & RTF_CACHE) { 4057 int rc = 0; 4058 4059 if (rt->nh) { 4060 rc = ip6_del_cached_rt_nh(cfg, rt); 4061 } else if (cfg->fc_nh_id) { 4062 continue; 4063 } else { 4064 nh = rt->fib6_nh; 4065 rc = ip6_del_cached_rt(cfg, rt, nh); 4066 } 4067 if (rc != -ESRCH) { 4068 rcu_read_unlock(); 4069 return rc; 4070 } 4071 continue; 4072 } 4073 4074 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric) 4075 continue; 4076 if (cfg->fc_protocol && 4077 cfg->fc_protocol != rt->fib6_protocol) 4078 continue; 4079 4080 if (rt->nh) { 4081 if (!fib6_info_hold_safe(rt)) 4082 continue; 4083 rcu_read_unlock(); 4084 4085 return __ip6_del_rt(rt, &cfg->fc_nlinfo); 4086 } 4087 if (cfg->fc_nh_id) 4088 
continue; 4089 4090 nh = rt->fib6_nh; 4091 if (cfg->fc_ifindex && 4092 (!nh->fib_nh_dev || 4093 nh->fib_nh_dev->ifindex != cfg->fc_ifindex)) 4094 continue; 4095 if (cfg->fc_flags & RTF_GATEWAY && 4096 !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6)) 4097 continue; 4098 if (!fib6_info_hold_safe(rt)) 4099 continue; 4100 rcu_read_unlock(); 4101 4102 /* if gateway was specified only delete the one hop */ 4103 if (cfg->fc_flags & RTF_GATEWAY) 4104 return __ip6_del_rt(rt, &cfg->fc_nlinfo); 4105 4106 return __ip6_del_rt_siblings(rt, cfg); 4107 } 4108 } 4109 rcu_read_unlock(); 4110 4111 return err; 4112 } 4113 4114 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) 4115 { 4116 struct netevent_redirect netevent; 4117 struct rt6_info *rt, *nrt = NULL; 4118 struct fib6_result res = {}; 4119 struct ndisc_options ndopts; 4120 struct inet6_dev *in6_dev; 4121 struct neighbour *neigh; 4122 struct rd_msg *msg; 4123 int optlen, on_link; 4124 u8 *lladdr; 4125 4126 optlen = skb_tail_pointer(skb) - skb_transport_header(skb); 4127 optlen -= sizeof(*msg); 4128 4129 if (optlen < 0) { 4130 net_dbg_ratelimited("rt6_do_redirect: packet too short\n"); 4131 return; 4132 } 4133 4134 msg = (struct rd_msg *)icmp6_hdr(skb); 4135 4136 if (ipv6_addr_is_multicast(&msg->dest)) { 4137 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n"); 4138 return; 4139 } 4140 4141 on_link = 0; 4142 if (ipv6_addr_equal(&msg->dest, &msg->target)) { 4143 on_link = 1; 4144 } else if (ipv6_addr_type(&msg->target) != 4145 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { 4146 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n"); 4147 return; 4148 } 4149 4150 in6_dev = __in6_dev_get(skb->dev); 4151 if (!in6_dev) 4152 return; 4153 if (READ_ONCE(in6_dev->cnf.forwarding) || 4154 !READ_ONCE(in6_dev->cnf.accept_redirects)) 4155 return; 4156 4157 /* RFC2461 8.1: 4158 * The IP source address of the Redirect MUST be the same as the current 4159 
* first-hop router for the specified ICMP Destination Address. 4160 */ 4161 4162 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) { 4163 net_dbg_ratelimited("rt6_redirect: invalid ND options\n"); 4164 return; 4165 } 4166 4167 lladdr = NULL; 4168 if (ndopts.nd_opts_tgt_lladdr) { 4169 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, 4170 skb->dev); 4171 if (!lladdr) { 4172 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n"); 4173 return; 4174 } 4175 } 4176 4177 rt = (struct rt6_info *) dst; 4178 if (rt->rt6i_flags & RTF_REJECT) { 4179 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n"); 4180 return; 4181 } 4182 4183 /* Redirect received -> path was valid. 4184 * Look, redirects are sent only in response to data packets, 4185 * so that this nexthop apparently is reachable. --ANK 4186 */ 4187 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr); 4188 4189 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1); 4190 if (!neigh) 4191 return; 4192 4193 /* 4194 * We have finally decided to accept it. 4195 */ 4196 4197 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, 4198 NEIGH_UPDATE_F_WEAK_OVERRIDE| 4199 NEIGH_UPDATE_F_OVERRIDE| 4200 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER| 4201 NEIGH_UPDATE_F_ISROUTER)), 4202 NDISC_REDIRECT, &ndopts); 4203 4204 rcu_read_lock(); 4205 res.f6i = rcu_dereference(rt->from); 4206 if (!res.f6i) 4207 goto out; 4208 4209 if (res.f6i->nh) { 4210 struct fib6_nh_match_arg arg = { 4211 .dev = dst->dev, 4212 .gw = &rt->rt6i_gateway, 4213 }; 4214 4215 nexthop_for_each_fib6_nh(res.f6i->nh, 4216 fib6_nh_find_match, &arg); 4217 4218 /* fib6_info uses a nexthop that does not have fib6_nh 4219 * using the dst->dev. 
Should be impossible 4220 */ 4221 if (!arg.match) 4222 goto out; 4223 res.nh = arg.match; 4224 } else { 4225 res.nh = res.f6i->fib6_nh; 4226 } 4227 4228 res.fib6_flags = res.f6i->fib6_flags; 4229 res.fib6_type = res.f6i->fib6_type; 4230 nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL); 4231 if (!nrt) 4232 goto out; 4233 4234 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE; 4235 if (on_link) 4236 nrt->rt6i_flags &= ~RTF_GATEWAY; 4237 4238 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; 4239 4240 /* rt6_insert_exception() will take care of duplicated exceptions */ 4241 if (rt6_insert_exception(nrt, &res)) { 4242 dst_release_immediate(&nrt->dst); 4243 goto out; 4244 } 4245 4246 netevent.old = &rt->dst; 4247 netevent.new = &nrt->dst; 4248 netevent.daddr = &msg->dest; 4249 netevent.neigh = neigh; 4250 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); 4251 4252 out: 4253 rcu_read_unlock(); 4254 neigh_release(neigh); 4255 } 4256 4257 #ifdef CONFIG_IPV6_ROUTE_INFO 4258 static struct fib6_info *rt6_get_route_info(struct net *net, 4259 const struct in6_addr *prefix, int prefixlen, 4260 const struct in6_addr *gwaddr, 4261 struct net_device *dev) 4262 { 4263 u32 tb_id = l3mdev_fib_table(dev) ? 
: RT6_TABLE_INFO; 4264 int ifindex = dev->ifindex; 4265 struct fib6_node *fn; 4266 struct fib6_info *rt = NULL; 4267 struct fib6_table *table; 4268 4269 table = fib6_get_table(net, tb_id); 4270 if (!table) 4271 return NULL; 4272 4273 rcu_read_lock(); 4274 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true); 4275 if (!fn) 4276 goto out; 4277 4278 for_each_fib6_node_rt_rcu(fn) { 4279 /* these routes do not use nexthops */ 4280 if (rt->nh) 4281 continue; 4282 if (rt->fib6_nh->fib_nh_dev->ifindex != ifindex) 4283 continue; 4284 if (!(rt->fib6_flags & RTF_ROUTEINFO) || 4285 !rt->fib6_nh->fib_nh_gw_family) 4286 continue; 4287 if (!ipv6_addr_equal(&rt->fib6_nh->fib_nh_gw6, gwaddr)) 4288 continue; 4289 if (!fib6_info_hold_safe(rt)) 4290 continue; 4291 break; 4292 } 4293 out: 4294 rcu_read_unlock(); 4295 return rt; 4296 } 4297 4298 static struct fib6_info *rt6_add_route_info(struct net *net, 4299 const struct in6_addr *prefix, int prefixlen, 4300 const struct in6_addr *gwaddr, 4301 struct net_device *dev, 4302 unsigned int pref) 4303 { 4304 struct fib6_config cfg = { 4305 .fc_metric = IP6_RT_PRIO_USER, 4306 .fc_ifindex = dev->ifindex, 4307 .fc_dst_len = prefixlen, 4308 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | 4309 RTF_UP | RTF_PREF(pref), 4310 .fc_protocol = RTPROT_RA, 4311 .fc_type = RTN_UNICAST, 4312 .fc_nlinfo.portid = 0, 4313 .fc_nlinfo.nlh = NULL, 4314 .fc_nlinfo.nl_net = net, 4315 }; 4316 4317 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; 4318 cfg.fc_dst = *prefix; 4319 cfg.fc_gateway = *gwaddr; 4320 4321 /* We should treat it as a default route if prefix length is 0. 
*/ 4322 if (!prefixlen) 4323 cfg.fc_flags |= RTF_DEFAULT; 4324 4325 ip6_route_add(&cfg, GFP_ATOMIC, NULL); 4326 4327 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev); 4328 } 4329 #endif 4330 4331 struct fib6_info *rt6_get_dflt_router(struct net *net, 4332 const struct in6_addr *addr, 4333 struct net_device *dev) 4334 { 4335 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT; 4336 struct fib6_info *rt; 4337 struct fib6_table *table; 4338 4339 table = fib6_get_table(net, tb_id); 4340 if (!table) 4341 return NULL; 4342 4343 rcu_read_lock(); 4344 for_each_fib6_node_rt_rcu(&table->tb6_root) { 4345 struct fib6_nh *nh; 4346 4347 /* RA routes do not use nexthops */ 4348 if (rt->nh) 4349 continue; 4350 4351 nh = rt->fib6_nh; 4352 if (dev == nh->fib_nh_dev && 4353 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && 4354 ipv6_addr_equal(&nh->fib_nh_gw6, addr)) 4355 break; 4356 } 4357 if (rt && !fib6_info_hold_safe(rt)) 4358 rt = NULL; 4359 rcu_read_unlock(); 4360 return rt; 4361 } 4362 4363 struct fib6_info *rt6_add_dflt_router(struct net *net, 4364 const struct in6_addr *gwaddr, 4365 struct net_device *dev, 4366 unsigned int pref, 4367 u32 defrtr_usr_metric, 4368 int lifetime) 4369 { 4370 struct fib6_config cfg = { 4371 .fc_table = l3mdev_fib_table(dev) ? 
: RT6_TABLE_DFLT, 4372 .fc_metric = defrtr_usr_metric, 4373 .fc_ifindex = dev->ifindex, 4374 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | 4375 RTF_UP | RTF_EXPIRES | RTF_PREF(pref), 4376 .fc_protocol = RTPROT_RA, 4377 .fc_type = RTN_UNICAST, 4378 .fc_nlinfo.portid = 0, 4379 .fc_nlinfo.nlh = NULL, 4380 .fc_nlinfo.nl_net = net, 4381 .fc_expires = jiffies_to_clock_t(lifetime * HZ), 4382 }; 4383 4384 cfg.fc_gateway = *gwaddr; 4385 4386 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) { 4387 struct fib6_table *table; 4388 4389 table = fib6_get_table(dev_net(dev), cfg.fc_table); 4390 if (table) 4391 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER; 4392 } 4393 4394 return rt6_get_dflt_router(net, gwaddr, dev); 4395 } 4396 4397 static void __rt6_purge_dflt_routers(struct net *net, 4398 struct fib6_table *table) 4399 { 4400 struct fib6_info *rt; 4401 4402 restart: 4403 rcu_read_lock(); 4404 for_each_fib6_node_rt_rcu(&table->tb6_root) { 4405 struct net_device *dev = fib6_info_nh_dev(rt); 4406 struct inet6_dev *idev = dev ? 
__in6_dev_get(dev) : NULL; 4407 4408 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) && 4409 (!idev || idev->cnf.accept_ra != 2) && 4410 fib6_info_hold_safe(rt)) { 4411 rcu_read_unlock(); 4412 ip6_del_rt(net, rt, false); 4413 goto restart; 4414 } 4415 } 4416 rcu_read_unlock(); 4417 4418 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER; 4419 } 4420 4421 void rt6_purge_dflt_routers(struct net *net) 4422 { 4423 struct fib6_table *table; 4424 struct hlist_head *head; 4425 unsigned int h; 4426 4427 rcu_read_lock(); 4428 4429 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { 4430 head = &net->ipv6.fib_table_hash[h]; 4431 hlist_for_each_entry_rcu(table, head, tb6_hlist) { 4432 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER) 4433 __rt6_purge_dflt_routers(net, table); 4434 } 4435 } 4436 4437 rcu_read_unlock(); 4438 } 4439 4440 static void rtmsg_to_fib6_config(struct net *net, 4441 struct in6_rtmsg *rtmsg, 4442 struct fib6_config *cfg) 4443 { 4444 *cfg = (struct fib6_config){ 4445 .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? 4446 : RT6_TABLE_MAIN, 4447 .fc_ifindex = rtmsg->rtmsg_ifindex, 4448 .fc_metric = rtmsg->rtmsg_metric ? 
: IP6_RT_PRIO_USER, 4449 .fc_expires = rtmsg->rtmsg_info, 4450 .fc_dst_len = rtmsg->rtmsg_dst_len, 4451 .fc_src_len = rtmsg->rtmsg_src_len, 4452 .fc_flags = rtmsg->rtmsg_flags, 4453 .fc_type = rtmsg->rtmsg_type, 4454 4455 .fc_nlinfo.nl_net = net, 4456 4457 .fc_dst = rtmsg->rtmsg_dst, 4458 .fc_src = rtmsg->rtmsg_src, 4459 .fc_gateway = rtmsg->rtmsg_gateway, 4460 }; 4461 } 4462 4463 int ipv6_route_ioctl(struct net *net, unsigned int cmd, struct in6_rtmsg *rtmsg) 4464 { 4465 struct fib6_config cfg; 4466 int err; 4467 4468 if (cmd != SIOCADDRT && cmd != SIOCDELRT) 4469 return -EINVAL; 4470 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 4471 return -EPERM; 4472 4473 rtmsg_to_fib6_config(net, rtmsg, &cfg); 4474 4475 rtnl_lock(); 4476 switch (cmd) { 4477 case SIOCADDRT: 4478 err = ip6_route_add(&cfg, GFP_KERNEL, NULL); 4479 break; 4480 case SIOCDELRT: 4481 err = ip6_route_del(&cfg, NULL); 4482 break; 4483 } 4484 rtnl_unlock(); 4485 return err; 4486 } 4487 4488 /* 4489 * Drop the packet on the floor 4490 */ 4491 4492 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) 4493 { 4494 struct dst_entry *dst = skb_dst(skb); 4495 struct net *net = dev_net(dst->dev); 4496 struct inet6_dev *idev; 4497 SKB_DR(reason); 4498 int type; 4499 4500 if (netif_is_l3_master(skb->dev) || 4501 dst->dev == net->loopback_dev) 4502 idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif)); 4503 else 4504 idev = ip6_dst_idev(dst); 4505 4506 switch (ipstats_mib_noroutes) { 4507 case IPSTATS_MIB_INNOROUTES: 4508 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); 4509 if (type == IPV6_ADDR_ANY) { 4510 SKB_DR_SET(reason, IP_INADDRERRORS); 4511 IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); 4512 break; 4513 } 4514 SKB_DR_SET(reason, IP_INNOROUTES); 4515 fallthrough; 4516 case IPSTATS_MIB_OUTNOROUTES: 4517 SKB_DR_OR(reason, IP_OUTNOROUTES); 4518 IP6_INC_STATS(net, idev, ipstats_mib_noroutes); 4519 break; 4520 } 4521 4522 /* Start over by dropping the dst for 
l3mdev case */ 4523 if (netif_is_l3_master(skb->dev)) 4524 skb_dst_drop(skb); 4525 4526 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0); 4527 kfree_skb_reason(skb, reason); 4528 return 0; 4529 } 4530 4531 static int ip6_pkt_discard(struct sk_buff *skb) 4532 { 4533 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES); 4534 } 4535 4536 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) 4537 { 4538 skb->dev = skb_dst(skb)->dev; 4539 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); 4540 } 4541 4542 static int ip6_pkt_prohibit(struct sk_buff *skb) 4543 { 4544 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES); 4545 } 4546 4547 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb) 4548 { 4549 skb->dev = skb_dst(skb)->dev; 4550 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); 4551 } 4552 4553 /* 4554 * Allocate a dst for local (unicast / anycast) address. 4555 */ 4556 4557 struct fib6_info *addrconf_f6i_alloc(struct net *net, 4558 struct inet6_dev *idev, 4559 const struct in6_addr *addr, 4560 bool anycast, gfp_t gfp_flags, 4561 struct netlink_ext_ack *extack) 4562 { 4563 struct fib6_config cfg = { 4564 .fc_table = l3mdev_fib_table(idev->dev) ? 
: RT6_TABLE_LOCAL, 4565 .fc_ifindex = idev->dev->ifindex, 4566 .fc_flags = RTF_UP | RTF_NONEXTHOP, 4567 .fc_dst = *addr, 4568 .fc_dst_len = 128, 4569 .fc_protocol = RTPROT_KERNEL, 4570 .fc_nlinfo.nl_net = net, 4571 .fc_ignore_dev_down = true, 4572 }; 4573 struct fib6_info *f6i; 4574 4575 if (anycast) { 4576 cfg.fc_type = RTN_ANYCAST; 4577 cfg.fc_flags |= RTF_ANYCAST; 4578 } else { 4579 cfg.fc_type = RTN_LOCAL; 4580 cfg.fc_flags |= RTF_LOCAL; 4581 } 4582 4583 f6i = ip6_route_info_create(&cfg, gfp_flags, extack); 4584 if (!IS_ERR(f6i)) { 4585 f6i->dst_nocount = true; 4586 4587 if (!anycast && 4588 (READ_ONCE(net->ipv6.devconf_all->disable_policy) || 4589 READ_ONCE(idev->cnf.disable_policy))) 4590 f6i->dst_nopolicy = true; 4591 } 4592 4593 return f6i; 4594 } 4595 4596 /* remove deleted ip from prefsrc entries */ 4597 struct arg_dev_net_ip { 4598 struct net *net; 4599 struct in6_addr *addr; 4600 }; 4601 4602 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg) 4603 { 4604 struct net *net = ((struct arg_dev_net_ip *)arg)->net; 4605 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; 4606 4607 if (!rt->nh && 4608 rt != net->ipv6.fib6_null_entry && 4609 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr) && 4610 !ipv6_chk_addr(net, addr, rt->fib6_nh->fib_nh_dev, 0)) { 4611 spin_lock_bh(&rt6_exception_lock); 4612 /* remove prefsrc entry */ 4613 rt->fib6_prefsrc.plen = 0; 4614 spin_unlock_bh(&rt6_exception_lock); 4615 } 4616 return 0; 4617 } 4618 4619 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) 4620 { 4621 struct net *net = dev_net(ifp->idev->dev); 4622 struct arg_dev_net_ip adni = { 4623 .net = net, 4624 .addr = &ifp->addr, 4625 }; 4626 fib6_clean_all(net, fib6_remove_prefsrc, &adni); 4627 } 4628 4629 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT) 4630 4631 /* Remove routers and update dst entries when gateway turn into host. 
*/ 4632 static int fib6_clean_tohost(struct fib6_info *rt, void *arg) 4633 { 4634 struct in6_addr *gateway = (struct in6_addr *)arg; 4635 struct fib6_nh *nh; 4636 4637 /* RA routes do not use nexthops */ 4638 if (rt->nh) 4639 return 0; 4640 4641 nh = rt->fib6_nh; 4642 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && 4643 nh->fib_nh_gw_family && ipv6_addr_equal(gateway, &nh->fib_nh_gw6)) 4644 return -1; 4645 4646 /* Further clean up cached routes in exception table. 4647 * This is needed because cached route may have a different 4648 * gateway than its 'parent' in the case of an ip redirect. 4649 */ 4650 fib6_nh_exceptions_clean_tohost(nh, gateway); 4651 4652 return 0; 4653 } 4654 4655 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) 4656 { 4657 fib6_clean_all(net, fib6_clean_tohost, gateway); 4658 } 4659 4660 struct arg_netdev_event { 4661 const struct net_device *dev; 4662 union { 4663 unsigned char nh_flags; 4664 unsigned long event; 4665 }; 4666 }; 4667 4668 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) 4669 { 4670 struct fib6_info *iter; 4671 struct fib6_node *fn; 4672 4673 fn = rcu_dereference_protected(rt->fib6_node, 4674 lockdep_is_held(&rt->fib6_table->tb6_lock)); 4675 iter = rcu_dereference_protected(fn->leaf, 4676 lockdep_is_held(&rt->fib6_table->tb6_lock)); 4677 while (iter) { 4678 if (iter->fib6_metric == rt->fib6_metric && 4679 rt6_qualify_for_ecmp(iter)) 4680 return iter; 4681 iter = rcu_dereference_protected(iter->fib6_next, 4682 lockdep_is_held(&rt->fib6_table->tb6_lock)); 4683 } 4684 4685 return NULL; 4686 } 4687 4688 /* only called for fib entries with builtin fib6_nh */ 4689 static bool rt6_is_dead(const struct fib6_info *rt) 4690 { 4691 if (rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD || 4692 (rt->fib6_nh->fib_nh_flags & RTNH_F_LINKDOWN && 4693 ip6_ignore_linkdown(rt->fib6_nh->fib_nh_dev))) 4694 return true; 4695 4696 return false; 4697 } 4698 4699 static int 
rt6_multipath_total_weight(const struct fib6_info *rt) 4700 { 4701 struct fib6_info *iter; 4702 int total = 0; 4703 4704 if (!rt6_is_dead(rt)) 4705 total += rt->fib6_nh->fib_nh_weight; 4706 4707 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { 4708 if (!rt6_is_dead(iter)) 4709 total += iter->fib6_nh->fib_nh_weight; 4710 } 4711 4712 return total; 4713 } 4714 4715 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total) 4716 { 4717 int upper_bound = -1; 4718 4719 if (!rt6_is_dead(rt)) { 4720 *weight += rt->fib6_nh->fib_nh_weight; 4721 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, 4722 total) - 1; 4723 } 4724 atomic_set(&rt->fib6_nh->fib_nh_upper_bound, upper_bound); 4725 } 4726 4727 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total) 4728 { 4729 struct fib6_info *iter; 4730 int weight = 0; 4731 4732 rt6_upper_bound_set(rt, &weight, total); 4733 4734 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4735 rt6_upper_bound_set(iter, &weight, total); 4736 } 4737 4738 void rt6_multipath_rebalance(struct fib6_info *rt) 4739 { 4740 struct fib6_info *first; 4741 int total; 4742 4743 /* In case the entire multipath route was marked for flushing, 4744 * then there is no need to rebalance upon the removal of every 4745 * sibling route. 4746 */ 4747 if (!rt->fib6_nsiblings || rt->should_flush) 4748 return; 4749 4750 /* During lookup routes are evaluated in order, so we need to 4751 * make sure upper bounds are assigned from the first sibling 4752 * onwards. 
4753 */ 4754 first = rt6_multipath_first_sibling(rt); 4755 if (WARN_ON_ONCE(!first)) 4756 return; 4757 4758 total = rt6_multipath_total_weight(first); 4759 rt6_multipath_upper_bound_set(first, total); 4760 } 4761 4762 static int fib6_ifup(struct fib6_info *rt, void *p_arg) 4763 { 4764 const struct arg_netdev_event *arg = p_arg; 4765 struct net *net = dev_net(arg->dev); 4766 4767 if (rt != net->ipv6.fib6_null_entry && !rt->nh && 4768 rt->fib6_nh->fib_nh_dev == arg->dev) { 4769 rt->fib6_nh->fib_nh_flags &= ~arg->nh_flags; 4770 fib6_update_sernum_upto_root(net, rt); 4771 rt6_multipath_rebalance(rt); 4772 } 4773 4774 return 0; 4775 } 4776 4777 void rt6_sync_up(struct net_device *dev, unsigned char nh_flags) 4778 { 4779 struct arg_netdev_event arg = { 4780 .dev = dev, 4781 { 4782 .nh_flags = nh_flags, 4783 }, 4784 }; 4785 4786 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev)) 4787 arg.nh_flags |= RTNH_F_LINKDOWN; 4788 4789 fib6_clean_all(dev_net(dev), fib6_ifup, &arg); 4790 } 4791 4792 /* only called for fib entries with inline fib6_nh */ 4793 static bool rt6_multipath_uses_dev(const struct fib6_info *rt, 4794 const struct net_device *dev) 4795 { 4796 struct fib6_info *iter; 4797 4798 if (rt->fib6_nh->fib_nh_dev == dev) 4799 return true; 4800 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4801 if (iter->fib6_nh->fib_nh_dev == dev) 4802 return true; 4803 4804 return false; 4805 } 4806 4807 static void rt6_multipath_flush(struct fib6_info *rt) 4808 { 4809 struct fib6_info *iter; 4810 4811 rt->should_flush = 1; 4812 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4813 iter->should_flush = 1; 4814 } 4815 4816 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt, 4817 const struct net_device *down_dev) 4818 { 4819 struct fib6_info *iter; 4820 unsigned int dead = 0; 4821 4822 if (rt->fib6_nh->fib_nh_dev == down_dev || 4823 rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD) 4824 dead++; 4825 list_for_each_entry(iter, &rt->fib6_siblings, 
fib6_siblings) 4826 if (iter->fib6_nh->fib_nh_dev == down_dev || 4827 iter->fib6_nh->fib_nh_flags & RTNH_F_DEAD) 4828 dead++; 4829 4830 return dead; 4831 } 4832 4833 static void rt6_multipath_nh_flags_set(struct fib6_info *rt, 4834 const struct net_device *dev, 4835 unsigned char nh_flags) 4836 { 4837 struct fib6_info *iter; 4838 4839 if (rt->fib6_nh->fib_nh_dev == dev) 4840 rt->fib6_nh->fib_nh_flags |= nh_flags; 4841 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4842 if (iter->fib6_nh->fib_nh_dev == dev) 4843 iter->fib6_nh->fib_nh_flags |= nh_flags; 4844 } 4845 4846 /* called with write lock held for table with rt */ 4847 static int fib6_ifdown(struct fib6_info *rt, void *p_arg) 4848 { 4849 const struct arg_netdev_event *arg = p_arg; 4850 const struct net_device *dev = arg->dev; 4851 struct net *net = dev_net(dev); 4852 4853 if (rt == net->ipv6.fib6_null_entry || rt->nh) 4854 return 0; 4855 4856 switch (arg->event) { 4857 case NETDEV_UNREGISTER: 4858 return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0; 4859 case NETDEV_DOWN: 4860 if (rt->should_flush) 4861 return -1; 4862 if (!rt->fib6_nsiblings) 4863 return rt->fib6_nh->fib_nh_dev == dev ? 
-1 : 0; 4864 if (rt6_multipath_uses_dev(rt, dev)) { 4865 unsigned int count; 4866 4867 count = rt6_multipath_dead_count(rt, dev); 4868 if (rt->fib6_nsiblings + 1 == count) { 4869 rt6_multipath_flush(rt); 4870 return -1; 4871 } 4872 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD | 4873 RTNH_F_LINKDOWN); 4874 fib6_update_sernum(net, rt); 4875 rt6_multipath_rebalance(rt); 4876 } 4877 return -2; 4878 case NETDEV_CHANGE: 4879 if (rt->fib6_nh->fib_nh_dev != dev || 4880 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) 4881 break; 4882 rt->fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN; 4883 rt6_multipath_rebalance(rt); 4884 break; 4885 } 4886 4887 return 0; 4888 } 4889 4890 void rt6_sync_down_dev(struct net_device *dev, unsigned long event) 4891 { 4892 struct arg_netdev_event arg = { 4893 .dev = dev, 4894 { 4895 .event = event, 4896 }, 4897 }; 4898 struct net *net = dev_net(dev); 4899 4900 if (net->ipv6.sysctl.skip_notify_on_dev_down) 4901 fib6_clean_all_skip_notify(net, fib6_ifdown, &arg); 4902 else 4903 fib6_clean_all(net, fib6_ifdown, &arg); 4904 } 4905 4906 void rt6_disable_ip(struct net_device *dev, unsigned long event) 4907 { 4908 rt6_sync_down_dev(dev, event); 4909 rt6_uncached_list_flush_dev(dev); 4910 neigh_ifdown(&nd_tbl, dev); 4911 } 4912 4913 struct rt6_mtu_change_arg { 4914 struct net_device *dev; 4915 unsigned int mtu; 4916 struct fib6_info *f6i; 4917 }; 4918 4919 static int fib6_nh_mtu_change(struct fib6_nh *nh, void *_arg) 4920 { 4921 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *)_arg; 4922 struct fib6_info *f6i = arg->f6i; 4923 4924 /* For administrative MTU increase, there is no way to discover 4925 * IPv6 PMTU increase, so PMTU increase should be updated here. 4926 * Since RFC 1981 doesn't include administrative MTU increase 4927 * update PMTU increase is a MUST. (i.e. 
jumbo frame) 4928 */ 4929 if (nh->fib_nh_dev == arg->dev) { 4930 struct inet6_dev *idev = __in6_dev_get(arg->dev); 4931 u32 mtu = f6i->fib6_pmtu; 4932 4933 if (mtu >= arg->mtu || 4934 (mtu < arg->mtu && mtu == idev->cnf.mtu6)) 4935 fib6_metric_set(f6i, RTAX_MTU, arg->mtu); 4936 4937 spin_lock_bh(&rt6_exception_lock); 4938 rt6_exceptions_update_pmtu(idev, nh, arg->mtu); 4939 spin_unlock_bh(&rt6_exception_lock); 4940 } 4941 4942 return 0; 4943 } 4944 4945 static int rt6_mtu_change_route(struct fib6_info *f6i, void *p_arg) 4946 { 4947 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; 4948 struct inet6_dev *idev; 4949 4950 /* In IPv6 pmtu discovery is not optional, 4951 so that RTAX_MTU lock cannot disable it. 4952 We still use this lock to block changes 4953 caused by addrconf/ndisc. 4954 */ 4955 4956 idev = __in6_dev_get(arg->dev); 4957 if (!idev) 4958 return 0; 4959 4960 if (fib6_metric_locked(f6i, RTAX_MTU)) 4961 return 0; 4962 4963 arg->f6i = f6i; 4964 if (f6i->nh) { 4965 /* fib6_nh_mtu_change only returns 0, so this is safe */ 4966 return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_mtu_change, 4967 arg); 4968 } 4969 4970 return fib6_nh_mtu_change(f6i->fib6_nh, arg); 4971 } 4972 4973 void rt6_mtu_change(struct net_device *dev, unsigned int mtu) 4974 { 4975 struct rt6_mtu_change_arg arg = { 4976 .dev = dev, 4977 .mtu = mtu, 4978 }; 4979 4980 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg); 4981 } 4982 4983 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { 4984 [RTA_UNSPEC] = { .strict_start_type = RTA_DPORT + 1 }, 4985 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) }, 4986 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) }, 4987 [RTA_OIF] = { .type = NLA_U32 }, 4988 [RTA_IIF] = { .type = NLA_U32 }, 4989 [RTA_PRIORITY] = { .type = NLA_U32 }, 4990 [RTA_METRICS] = { .type = NLA_NESTED }, 4991 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, 4992 [RTA_PREF] = { .type = NLA_U8 }, 4993 [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, 
4994 [RTA_ENCAP] = { .type = NLA_NESTED }, 4995 [RTA_EXPIRES] = { .type = NLA_U32 }, 4996 [RTA_UID] = { .type = NLA_U32 }, 4997 [RTA_MARK] = { .type = NLA_U32 }, 4998 [RTA_TABLE] = { .type = NLA_U32 }, 4999 [RTA_IP_PROTO] = { .type = NLA_U8 }, 5000 [RTA_SPORT] = { .type = NLA_U16 }, 5001 [RTA_DPORT] = { .type = NLA_U16 }, 5002 [RTA_NH_ID] = { .type = NLA_U32 }, 5003 }; 5004 5005 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, 5006 struct fib6_config *cfg, 5007 struct netlink_ext_ack *extack) 5008 { 5009 struct rtmsg *rtm; 5010 struct nlattr *tb[RTA_MAX+1]; 5011 unsigned int pref; 5012 int err; 5013 5014 err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, 5015 rtm_ipv6_policy, extack); 5016 if (err < 0) 5017 goto errout; 5018 5019 err = -EINVAL; 5020 rtm = nlmsg_data(nlh); 5021 5022 if (rtm->rtm_tos) { 5023 NL_SET_ERR_MSG(extack, 5024 "Invalid dsfield (tos): option not available for IPv6"); 5025 goto errout; 5026 } 5027 5028 *cfg = (struct fib6_config){ 5029 .fc_table = rtm->rtm_table, 5030 .fc_dst_len = rtm->rtm_dst_len, 5031 .fc_src_len = rtm->rtm_src_len, 5032 .fc_flags = RTF_UP, 5033 .fc_protocol = rtm->rtm_protocol, 5034 .fc_type = rtm->rtm_type, 5035 5036 .fc_nlinfo.portid = NETLINK_CB(skb).portid, 5037 .fc_nlinfo.nlh = nlh, 5038 .fc_nlinfo.nl_net = sock_net(skb->sk), 5039 }; 5040 5041 if (rtm->rtm_type == RTN_UNREACHABLE || 5042 rtm->rtm_type == RTN_BLACKHOLE || 5043 rtm->rtm_type == RTN_PROHIBIT || 5044 rtm->rtm_type == RTN_THROW) 5045 cfg->fc_flags |= RTF_REJECT; 5046 5047 if (rtm->rtm_type == RTN_LOCAL) 5048 cfg->fc_flags |= RTF_LOCAL; 5049 5050 if (rtm->rtm_flags & RTM_F_CLONED) 5051 cfg->fc_flags |= RTF_CACHE; 5052 5053 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK); 5054 5055 if (tb[RTA_NH_ID]) { 5056 if (tb[RTA_GATEWAY] || tb[RTA_OIF] || 5057 tb[RTA_MULTIPATH] || tb[RTA_ENCAP]) { 5058 NL_SET_ERR_MSG(extack, 5059 "Nexthop specification and nexthop id are mutually exclusive"); 5060 goto errout; 5061 } 5062 
cfg->fc_nh_id = nla_get_u32(tb[RTA_NH_ID]); 5063 } 5064 5065 if (tb[RTA_GATEWAY]) { 5066 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); 5067 cfg->fc_flags |= RTF_GATEWAY; 5068 } 5069 if (tb[RTA_VIA]) { 5070 NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute"); 5071 goto errout; 5072 } 5073 5074 if (tb[RTA_DST]) { 5075 int plen = (rtm->rtm_dst_len + 7) >> 3; 5076 5077 if (nla_len(tb[RTA_DST]) < plen) 5078 goto errout; 5079 5080 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen); 5081 } 5082 5083 if (tb[RTA_SRC]) { 5084 int plen = (rtm->rtm_src_len + 7) >> 3; 5085 5086 if (nla_len(tb[RTA_SRC]) < plen) 5087 goto errout; 5088 5089 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen); 5090 } 5091 5092 if (tb[RTA_PREFSRC]) 5093 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]); 5094 5095 if (tb[RTA_OIF]) 5096 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]); 5097 5098 if (tb[RTA_PRIORITY]) 5099 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]); 5100 5101 if (tb[RTA_METRICS]) { 5102 cfg->fc_mx = nla_data(tb[RTA_METRICS]); 5103 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]); 5104 } 5105 5106 if (tb[RTA_TABLE]) 5107 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]); 5108 5109 if (tb[RTA_MULTIPATH]) { 5110 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]); 5111 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); 5112 5113 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp, 5114 cfg->fc_mp_len, extack); 5115 if (err < 0) 5116 goto errout; 5117 } 5118 5119 if (tb[RTA_PREF]) { 5120 pref = nla_get_u8(tb[RTA_PREF]); 5121 if (pref != ICMPV6_ROUTER_PREF_LOW && 5122 pref != ICMPV6_ROUTER_PREF_HIGH) 5123 pref = ICMPV6_ROUTER_PREF_MEDIUM; 5124 cfg->fc_flags |= RTF_PREF(pref); 5125 } 5126 5127 if (tb[RTA_ENCAP]) 5128 cfg->fc_encap = tb[RTA_ENCAP]; 5129 5130 if (tb[RTA_ENCAP_TYPE]) { 5131 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); 5132 5133 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack); 5134 if (err < 0) 5135 goto errout; 5136 } 5137 5138 if (tb[RTA_EXPIRES]) { 5139 unsigned long timeout = 
addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ); 5140 5141 if (addrconf_finite_timeout(timeout)) { 5142 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ); 5143 cfg->fc_flags |= RTF_EXPIRES; 5144 } 5145 } 5146 5147 err = 0; 5148 errout: 5149 return err; 5150 } 5151 5152 struct rt6_nh { 5153 struct fib6_info *fib6_info; 5154 struct fib6_config r_cfg; 5155 struct list_head next; 5156 }; 5157 5158 static int ip6_route_info_append(struct net *net, 5159 struct list_head *rt6_nh_list, 5160 struct fib6_info *rt, 5161 struct fib6_config *r_cfg) 5162 { 5163 struct rt6_nh *nh; 5164 int err = -EEXIST; 5165 5166 list_for_each_entry(nh, rt6_nh_list, next) { 5167 /* check if fib6_info already exists */ 5168 if (rt6_duplicate_nexthop(nh->fib6_info, rt)) 5169 return err; 5170 } 5171 5172 nh = kzalloc(sizeof(*nh), GFP_KERNEL); 5173 if (!nh) 5174 return -ENOMEM; 5175 nh->fib6_info = rt; 5176 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); 5177 list_add_tail(&nh->next, rt6_nh_list); 5178 5179 return 0; 5180 } 5181 5182 static void ip6_route_mpath_notify(struct fib6_info *rt, 5183 struct fib6_info *rt_last, 5184 struct nl_info *info, 5185 __u16 nlflags) 5186 { 5187 /* if this is an APPEND route, then rt points to the first route 5188 * inserted and rt_last points to last route inserted. Userspace 5189 * wants a consistent dump of the route which starts at the first 5190 * nexthop. 
Since sibling routes are always added at the end of 5191 * the list, find the first sibling of the last route appended 5192 */ 5193 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) { 5194 rt = list_first_entry(&rt_last->fib6_siblings, 5195 struct fib6_info, 5196 fib6_siblings); 5197 } 5198 5199 if (rt) 5200 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); 5201 } 5202 5203 static bool ip6_route_mpath_should_notify(const struct fib6_info *rt) 5204 { 5205 bool rt_can_ecmp = rt6_qualify_for_ecmp(rt); 5206 bool should_notify = false; 5207 struct fib6_info *leaf; 5208 struct fib6_node *fn; 5209 5210 rcu_read_lock(); 5211 fn = rcu_dereference(rt->fib6_node); 5212 if (!fn) 5213 goto out; 5214 5215 leaf = rcu_dereference(fn->leaf); 5216 if (!leaf) 5217 goto out; 5218 5219 if (rt == leaf || 5220 (rt_can_ecmp && rt->fib6_metric == leaf->fib6_metric && 5221 rt6_qualify_for_ecmp(leaf))) 5222 should_notify = true; 5223 out: 5224 rcu_read_unlock(); 5225 5226 return should_notify; 5227 } 5228 5229 static int fib6_gw_from_attr(struct in6_addr *gw, struct nlattr *nla, 5230 struct netlink_ext_ack *extack) 5231 { 5232 if (nla_len(nla) < sizeof(*gw)) { 5233 NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_GATEWAY"); 5234 return -EINVAL; 5235 } 5236 5237 *gw = nla_get_in6_addr(nla); 5238 5239 return 0; 5240 } 5241 5242 static int ip6_route_multipath_add(struct fib6_config *cfg, 5243 struct netlink_ext_ack *extack) 5244 { 5245 struct fib6_info *rt_notif = NULL, *rt_last = NULL; 5246 struct nl_info *info = &cfg->fc_nlinfo; 5247 struct fib6_config r_cfg; 5248 struct rtnexthop *rtnh; 5249 struct fib6_info *rt; 5250 struct rt6_nh *err_nh; 5251 struct rt6_nh *nh, *nh_safe; 5252 __u16 nlflags; 5253 int remaining; 5254 int attrlen; 5255 int err = 1; 5256 int nhn = 0; 5257 int replace = (cfg->fc_nlinfo.nlh && 5258 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); 5259 LIST_HEAD(rt6_nh_list); 5260 5261 nlflags = replace ? 
NLM_F_REPLACE : NLM_F_CREATE; 5262 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND) 5263 nlflags |= NLM_F_APPEND; 5264 5265 remaining = cfg->fc_mp_len; 5266 rtnh = (struct rtnexthop *)cfg->fc_mp; 5267 5268 /* Parse a Multipath Entry and build a list (rt6_nh_list) of 5269 * fib6_info structs per nexthop 5270 */ 5271 while (rtnh_ok(rtnh, remaining)) { 5272 memcpy(&r_cfg, cfg, sizeof(*cfg)); 5273 if (rtnh->rtnh_ifindex) 5274 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 5275 5276 attrlen = rtnh_attrlen(rtnh); 5277 if (attrlen > 0) { 5278 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 5279 5280 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 5281 if (nla) { 5282 err = fib6_gw_from_attr(&r_cfg.fc_gateway, nla, 5283 extack); 5284 if (err) 5285 goto cleanup; 5286 5287 r_cfg.fc_flags |= RTF_GATEWAY; 5288 } 5289 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); 5290 5291 /* RTA_ENCAP_TYPE length checked in 5292 * lwtunnel_valid_encap_type_attr 5293 */ 5294 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); 5295 if (nla) 5296 r_cfg.fc_encap_type = nla_get_u16(nla); 5297 } 5298 5299 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK); 5300 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack); 5301 if (IS_ERR(rt)) { 5302 err = PTR_ERR(rt); 5303 rt = NULL; 5304 goto cleanup; 5305 } 5306 if (!rt6_qualify_for_ecmp(rt)) { 5307 err = -EINVAL; 5308 NL_SET_ERR_MSG(extack, 5309 "Device only routes can not be added for IPv6 using the multipath API."); 5310 fib6_info_release(rt); 5311 goto cleanup; 5312 } 5313 5314 rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1; 5315 5316 err = ip6_route_info_append(info->nl_net, &rt6_nh_list, 5317 rt, &r_cfg); 5318 if (err) { 5319 fib6_info_release(rt); 5320 goto cleanup; 5321 } 5322 5323 rtnh = rtnh_next(rtnh, &remaining); 5324 } 5325 5326 if (list_empty(&rt6_nh_list)) { 5327 NL_SET_ERR_MSG(extack, 5328 "Invalid nexthop configuration - no valid nexthops"); 5329 return -EINVAL; 5330 } 5331 5332 /* for add and replace send one notification with all 
nexthops. 5333 * Skip the notification in fib6_add_rt2node and send one with 5334 * the full route when done 5335 */ 5336 info->skip_notify = 1; 5337 5338 /* For add and replace, send one notification with all nexthops. For 5339 * append, send one notification with all appended nexthops. 5340 */ 5341 info->skip_notify_kernel = 1; 5342 5343 err_nh = NULL; 5344 list_for_each_entry(nh, &rt6_nh_list, next) { 5345 err = __ip6_ins_rt(nh->fib6_info, info, extack); 5346 fib6_info_release(nh->fib6_info); 5347 5348 if (!err) { 5349 /* save reference to last route successfully inserted */ 5350 rt_last = nh->fib6_info; 5351 5352 /* save reference to first route for notification */ 5353 if (!rt_notif) 5354 rt_notif = nh->fib6_info; 5355 } 5356 5357 /* nh->fib6_info is used or freed at this point, reset to NULL*/ 5358 nh->fib6_info = NULL; 5359 if (err) { 5360 if (replace && nhn) 5361 NL_SET_ERR_MSG_MOD(extack, 5362 "multipath route replace failed (check consistency of installed routes)"); 5363 err_nh = nh; 5364 goto add_errout; 5365 } 5366 5367 /* Because each route is added like a single route we remove 5368 * these flags after the first nexthop: if there is a collision, 5369 * we have already failed to add the first nexthop: 5370 * fib6_add_rt2node() has rejected it; when replacing, old 5371 * nexthops have been replaced by first new, the rest should 5372 * be added to it. 5373 */ 5374 if (cfg->fc_nlinfo.nlh) { 5375 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | 5376 NLM_F_REPLACE); 5377 cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_CREATE; 5378 } 5379 nhn++; 5380 } 5381 5382 /* An in-kernel notification should only be sent in case the new 5383 * multipath route is added as the first route in the node, or if 5384 * it was appended to it. We pass 'rt_notif' since it is the first 5385 * sibling and might allow us to skip some checks in the replace case. 
5386 */ 5387 if (ip6_route_mpath_should_notify(rt_notif)) { 5388 enum fib_event_type fib_event; 5389 5390 if (rt_notif->fib6_nsiblings != nhn - 1) 5391 fib_event = FIB_EVENT_ENTRY_APPEND; 5392 else 5393 fib_event = FIB_EVENT_ENTRY_REPLACE; 5394 5395 err = call_fib6_multipath_entry_notifiers(info->nl_net, 5396 fib_event, rt_notif, 5397 nhn - 1, extack); 5398 if (err) { 5399 /* Delete all the siblings that were just added */ 5400 err_nh = NULL; 5401 goto add_errout; 5402 } 5403 } 5404 5405 /* success ... tell user about new route */ 5406 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 5407 goto cleanup; 5408 5409 add_errout: 5410 /* send notification for routes that were added so that 5411 * the delete notifications sent by ip6_route_del are 5412 * coherent 5413 */ 5414 if (rt_notif) 5415 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 5416 5417 /* Delete routes that were already added */ 5418 list_for_each_entry(nh, &rt6_nh_list, next) { 5419 if (err_nh == nh) 5420 break; 5421 ip6_route_del(&nh->r_cfg, extack); 5422 } 5423 5424 cleanup: 5425 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { 5426 if (nh->fib6_info) 5427 fib6_info_release(nh->fib6_info); 5428 list_del(&nh->next); 5429 kfree(nh); 5430 } 5431 5432 return err; 5433 } 5434 5435 static int ip6_route_multipath_del(struct fib6_config *cfg, 5436 struct netlink_ext_ack *extack) 5437 { 5438 struct fib6_config r_cfg; 5439 struct rtnexthop *rtnh; 5440 int last_err = 0; 5441 int remaining; 5442 int attrlen; 5443 int err; 5444 5445 remaining = cfg->fc_mp_len; 5446 rtnh = (struct rtnexthop *)cfg->fc_mp; 5447 5448 /* Parse a Multipath Entry */ 5449 while (rtnh_ok(rtnh, remaining)) { 5450 memcpy(&r_cfg, cfg, sizeof(*cfg)); 5451 if (rtnh->rtnh_ifindex) 5452 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 5453 5454 attrlen = rtnh_attrlen(rtnh); 5455 if (attrlen > 0) { 5456 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 5457 5458 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 5459 if (nla) { 5460 err = 
fib6_gw_from_attr(&r_cfg.fc_gateway, nla, 5461 extack); 5462 if (err) { 5463 last_err = err; 5464 goto next_rtnh; 5465 } 5466 5467 r_cfg.fc_flags |= RTF_GATEWAY; 5468 } 5469 } 5470 err = ip6_route_del(&r_cfg, extack); 5471 if (err) 5472 last_err = err; 5473 5474 next_rtnh: 5475 rtnh = rtnh_next(rtnh, &remaining); 5476 } 5477 5478 return last_err; 5479 } 5480 5481 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, 5482 struct netlink_ext_ack *extack) 5483 { 5484 struct fib6_config cfg; 5485 int err; 5486 5487 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 5488 if (err < 0) 5489 return err; 5490 5491 if (cfg.fc_nh_id && 5492 !nexthop_find_by_id(sock_net(skb->sk), cfg.fc_nh_id)) { 5493 NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); 5494 return -EINVAL; 5495 } 5496 5497 if (cfg.fc_mp) 5498 return ip6_route_multipath_del(&cfg, extack); 5499 else { 5500 cfg.fc_delete_all_nh = 1; 5501 return ip6_route_del(&cfg, extack); 5502 } 5503 } 5504 5505 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, 5506 struct netlink_ext_ack *extack) 5507 { 5508 struct fib6_config cfg; 5509 int err; 5510 5511 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 5512 if (err < 0) 5513 return err; 5514 5515 if (cfg.fc_metric == 0) 5516 cfg.fc_metric = IP6_RT_PRIO_USER; 5517 5518 if (cfg.fc_mp) 5519 return ip6_route_multipath_add(&cfg, extack); 5520 else 5521 return ip6_route_add(&cfg, GFP_KERNEL, extack); 5522 } 5523 5524 /* add the overhead of this fib6_nh to nexthop_len */ 5525 static int rt6_nh_nlmsg_size(struct fib6_nh *nh, void *arg) 5526 { 5527 int *nexthop_len = arg; 5528 5529 *nexthop_len += nla_total_size(0) /* RTA_MULTIPATH */ 5530 + NLA_ALIGN(sizeof(struct rtnexthop)) 5531 + nla_total_size(16); /* RTA_GATEWAY */ 5532 5533 if (nh->fib_nh_lws) { 5534 /* RTA_ENCAP_TYPE */ 5535 *nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws); 5536 /* RTA_ENCAP */ 5537 *nexthop_len += nla_total_size(2); 5538 } 5539 5540 return 0; 5541 } 5542 5543 
static size_t rt6_nlmsg_size(struct fib6_info *f6i) 5544 { 5545 int nexthop_len; 5546 5547 if (f6i->nh) { 5548 nexthop_len = nla_total_size(4); /* RTA_NH_ID */ 5549 nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_nlmsg_size, 5550 &nexthop_len); 5551 } else { 5552 struct fib6_info *sibling, *next_sibling; 5553 struct fib6_nh *nh = f6i->fib6_nh; 5554 5555 nexthop_len = 0; 5556 if (f6i->fib6_nsiblings) { 5557 rt6_nh_nlmsg_size(nh, &nexthop_len); 5558 5559 list_for_each_entry_safe(sibling, next_sibling, 5560 &f6i->fib6_siblings, fib6_siblings) { 5561 rt6_nh_nlmsg_size(sibling->fib6_nh, &nexthop_len); 5562 } 5563 } 5564 nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws); 5565 } 5566 5567 return NLMSG_ALIGN(sizeof(struct rtmsg)) 5568 + nla_total_size(16) /* RTA_SRC */ 5569 + nla_total_size(16) /* RTA_DST */ 5570 + nla_total_size(16) /* RTA_GATEWAY */ 5571 + nla_total_size(16) /* RTA_PREFSRC */ 5572 + nla_total_size(4) /* RTA_TABLE */ 5573 + nla_total_size(4) /* RTA_IIF */ 5574 + nla_total_size(4) /* RTA_OIF */ 5575 + nla_total_size(4) /* RTA_PRIORITY */ 5576 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */ 5577 + nla_total_size(sizeof(struct rta_cacheinfo)) 5578 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ 5579 + nla_total_size(1) /* RTA_PREF */ 5580 + nexthop_len; 5581 } 5582 5583 static int rt6_fill_node_nexthop(struct sk_buff *skb, struct nexthop *nh, 5584 unsigned char *flags) 5585 { 5586 if (nexthop_is_multipath(nh)) { 5587 struct nlattr *mp; 5588 5589 mp = nla_nest_start_noflag(skb, RTA_MULTIPATH); 5590 if (!mp) 5591 goto nla_put_failure; 5592 5593 if (nexthop_mpath_fill_node(skb, nh, AF_INET6)) 5594 goto nla_put_failure; 5595 5596 nla_nest_end(skb, mp); 5597 } else { 5598 struct fib6_nh *fib6_nh; 5599 5600 fib6_nh = nexthop_fib6_nh(nh); 5601 if (fib_nexthop_info(skb, &fib6_nh->nh_common, AF_INET6, 5602 flags, false) < 0) 5603 goto nla_put_failure; 5604 } 5605 5606 return 0; 5607 5608 nla_put_failure: 5609 return -EMSGSIZE; 5610 } 5611 5612 static int 
rt6_fill_node(struct net *net, struct sk_buff *skb, 5613 struct fib6_info *rt, struct dst_entry *dst, 5614 struct in6_addr *dest, struct in6_addr *src, 5615 int iif, int type, u32 portid, u32 seq, 5616 unsigned int flags) 5617 { 5618 struct rt6_info *rt6 = (struct rt6_info *)dst; 5619 struct rt6key *rt6_dst, *rt6_src; 5620 u32 *pmetrics, table, rt6_flags; 5621 unsigned char nh_flags = 0; 5622 struct nlmsghdr *nlh; 5623 struct rtmsg *rtm; 5624 long expires = 0; 5625 5626 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags); 5627 if (!nlh) 5628 return -EMSGSIZE; 5629 5630 if (rt6) { 5631 rt6_dst = &rt6->rt6i_dst; 5632 rt6_src = &rt6->rt6i_src; 5633 rt6_flags = rt6->rt6i_flags; 5634 } else { 5635 rt6_dst = &rt->fib6_dst; 5636 rt6_src = &rt->fib6_src; 5637 rt6_flags = rt->fib6_flags; 5638 } 5639 5640 rtm = nlmsg_data(nlh); 5641 rtm->rtm_family = AF_INET6; 5642 rtm->rtm_dst_len = rt6_dst->plen; 5643 rtm->rtm_src_len = rt6_src->plen; 5644 rtm->rtm_tos = 0; 5645 if (rt->fib6_table) 5646 table = rt->fib6_table->tb6_id; 5647 else 5648 table = RT6_TABLE_UNSPEC; 5649 rtm->rtm_table = table < 256 ? 
table : RT_TABLE_COMPAT; 5650 if (nla_put_u32(skb, RTA_TABLE, table)) 5651 goto nla_put_failure; 5652 5653 rtm->rtm_type = rt->fib6_type; 5654 rtm->rtm_flags = 0; 5655 rtm->rtm_scope = RT_SCOPE_UNIVERSE; 5656 rtm->rtm_protocol = rt->fib6_protocol; 5657 5658 if (rt6_flags & RTF_CACHE) 5659 rtm->rtm_flags |= RTM_F_CLONED; 5660 5661 if (dest) { 5662 if (nla_put_in6_addr(skb, RTA_DST, dest)) 5663 goto nla_put_failure; 5664 rtm->rtm_dst_len = 128; 5665 } else if (rtm->rtm_dst_len) 5666 if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr)) 5667 goto nla_put_failure; 5668 #ifdef CONFIG_IPV6_SUBTREES 5669 if (src) { 5670 if (nla_put_in6_addr(skb, RTA_SRC, src)) 5671 goto nla_put_failure; 5672 rtm->rtm_src_len = 128; 5673 } else if (rtm->rtm_src_len && 5674 nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr)) 5675 goto nla_put_failure; 5676 #endif 5677 if (iif) { 5678 #ifdef CONFIG_IPV6_MROUTE 5679 if (ipv6_addr_is_multicast(&rt6_dst->addr)) { 5680 int err = ip6mr_get_route(net, skb, rtm, portid); 5681 5682 if (err == 0) 5683 return 0; 5684 if (err < 0) 5685 goto nla_put_failure; 5686 } else 5687 #endif 5688 if (nla_put_u32(skb, RTA_IIF, iif)) 5689 goto nla_put_failure; 5690 } else if (dest) { 5691 struct in6_addr saddr_buf; 5692 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 && 5693 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 5694 goto nla_put_failure; 5695 } 5696 5697 if (rt->fib6_prefsrc.plen) { 5698 struct in6_addr saddr_buf; 5699 saddr_buf = rt->fib6_prefsrc.addr; 5700 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 5701 goto nla_put_failure; 5702 } 5703 5704 pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics; 5705 if (rtnetlink_put_metrics(skb, pmetrics) < 0) 5706 goto nla_put_failure; 5707 5708 if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric)) 5709 goto nla_put_failure; 5710 5711 /* For multipath routes, walk the siblings list and add 5712 * each as a nexthop within RTA_MULTIPATH. 
5713 */ 5714 if (rt6) { 5715 if (rt6_flags & RTF_GATEWAY && 5716 nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway)) 5717 goto nla_put_failure; 5718 5719 if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex)) 5720 goto nla_put_failure; 5721 5722 if (dst->lwtstate && 5723 lwtunnel_fill_encap(skb, dst->lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0) 5724 goto nla_put_failure; 5725 } else if (rt->fib6_nsiblings) { 5726 struct fib6_info *sibling, *next_sibling; 5727 struct nlattr *mp; 5728 5729 mp = nla_nest_start_noflag(skb, RTA_MULTIPATH); 5730 if (!mp) 5731 goto nla_put_failure; 5732 5733 if (fib_add_nexthop(skb, &rt->fib6_nh->nh_common, 5734 rt->fib6_nh->fib_nh_weight, AF_INET6, 5735 0) < 0) 5736 goto nla_put_failure; 5737 5738 list_for_each_entry_safe(sibling, next_sibling, 5739 &rt->fib6_siblings, fib6_siblings) { 5740 if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common, 5741 sibling->fib6_nh->fib_nh_weight, 5742 AF_INET6, 0) < 0) 5743 goto nla_put_failure; 5744 } 5745 5746 nla_nest_end(skb, mp); 5747 } else if (rt->nh) { 5748 if (nla_put_u32(skb, RTA_NH_ID, rt->nh->id)) 5749 goto nla_put_failure; 5750 5751 if (nexthop_is_blackhole(rt->nh)) 5752 rtm->rtm_type = RTN_BLACKHOLE; 5753 5754 if (READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode) && 5755 rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0) 5756 goto nla_put_failure; 5757 5758 rtm->rtm_flags |= nh_flags; 5759 } else { 5760 if (fib_nexthop_info(skb, &rt->fib6_nh->nh_common, AF_INET6, 5761 &nh_flags, false) < 0) 5762 goto nla_put_failure; 5763 5764 rtm->rtm_flags |= nh_flags; 5765 } 5766 5767 if (rt6_flags & RTF_EXPIRES) { 5768 expires = dst ? 
dst->expires : rt->expires; 5769 expires -= jiffies; 5770 } 5771 5772 if (!dst) { 5773 if (READ_ONCE(rt->offload)) 5774 rtm->rtm_flags |= RTM_F_OFFLOAD; 5775 if (READ_ONCE(rt->trap)) 5776 rtm->rtm_flags |= RTM_F_TRAP; 5777 if (READ_ONCE(rt->offload_failed)) 5778 rtm->rtm_flags |= RTM_F_OFFLOAD_FAILED; 5779 } 5780 5781 if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0) 5782 goto nla_put_failure; 5783 5784 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags))) 5785 goto nla_put_failure; 5786 5787 5788 nlmsg_end(skb, nlh); 5789 return 0; 5790 5791 nla_put_failure: 5792 nlmsg_cancel(skb, nlh); 5793 return -EMSGSIZE; 5794 } 5795 5796 static int fib6_info_nh_uses_dev(struct fib6_nh *nh, void *arg) 5797 { 5798 const struct net_device *dev = arg; 5799 5800 if (nh->fib_nh_dev == dev) 5801 return 1; 5802 5803 return 0; 5804 } 5805 5806 static bool fib6_info_uses_dev(const struct fib6_info *f6i, 5807 const struct net_device *dev) 5808 { 5809 if (f6i->nh) { 5810 struct net_device *_dev = (struct net_device *)dev; 5811 5812 return !!nexthop_for_each_fib6_nh(f6i->nh, 5813 fib6_info_nh_uses_dev, 5814 _dev); 5815 } 5816 5817 if (f6i->fib6_nh->fib_nh_dev == dev) 5818 return true; 5819 5820 if (f6i->fib6_nsiblings) { 5821 struct fib6_info *sibling, *next_sibling; 5822 5823 list_for_each_entry_safe(sibling, next_sibling, 5824 &f6i->fib6_siblings, fib6_siblings) { 5825 if (sibling->fib6_nh->fib_nh_dev == dev) 5826 return true; 5827 } 5828 } 5829 5830 return false; 5831 } 5832 5833 struct fib6_nh_exception_dump_walker { 5834 struct rt6_rtnl_dump_arg *dump; 5835 struct fib6_info *rt; 5836 unsigned int flags; 5837 unsigned int skip; 5838 unsigned int count; 5839 }; 5840 5841 static int rt6_nh_dump_exceptions(struct fib6_nh *nh, void *arg) 5842 { 5843 struct fib6_nh_exception_dump_walker *w = arg; 5844 struct rt6_rtnl_dump_arg *dump = w->dump; 5845 struct rt6_exception_bucket *bucket; 5846 struct rt6_exception *rt6_ex; 5847 int i, err; 5848 5849 bucket = 
fib6_nh_get_excptn_bucket(nh, NULL); 5850 if (!bucket) 5851 return 0; 5852 5853 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { 5854 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { 5855 if (w->skip) { 5856 w->skip--; 5857 continue; 5858 } 5859 5860 /* Expiration of entries doesn't bump sernum, insertion 5861 * does. Removal is triggered by insertion, so we can 5862 * rely on the fact that if entries change between two 5863 * partial dumps, this node is scanned again completely, 5864 * see rt6_insert_exception() and fib6_dump_table(). 5865 * 5866 * Count expired entries we go through as handled 5867 * entries that we'll skip next time, in case of partial 5868 * node dump. Otherwise, if entries expire meanwhile, 5869 * we'll skip the wrong amount. 5870 */ 5871 if (rt6_check_expired(rt6_ex->rt6i)) { 5872 w->count++; 5873 continue; 5874 } 5875 5876 err = rt6_fill_node(dump->net, dump->skb, w->rt, 5877 &rt6_ex->rt6i->dst, NULL, NULL, 0, 5878 RTM_NEWROUTE, 5879 NETLINK_CB(dump->cb->skb).portid, 5880 dump->cb->nlh->nlmsg_seq, w->flags); 5881 if (err) 5882 return err; 5883 5884 w->count++; 5885 } 5886 bucket++; 5887 } 5888 5889 return 0; 5890 } 5891 5892 /* Return -1 if done with node, number of handled routes on partial dump */ 5893 int rt6_dump_route(struct fib6_info *rt, void *p_arg, unsigned int skip) 5894 { 5895 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; 5896 struct fib_dump_filter *filter = &arg->filter; 5897 unsigned int flags = NLM_F_MULTI; 5898 struct net *net = arg->net; 5899 int count = 0; 5900 5901 if (rt == net->ipv6.fib6_null_entry) 5902 return -1; 5903 5904 if ((filter->flags & RTM_F_PREFIX) && 5905 !(rt->fib6_flags & RTF_PREFIX_RT)) { 5906 /* success since this is not a prefix route */ 5907 return -1; 5908 } 5909 if (filter->filter_set && 5910 ((filter->rt_type && rt->fib6_type != filter->rt_type) || 5911 (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) || 5912 (filter->protocol && rt->fib6_protocol != 
filter->protocol))) { 5913 return -1; 5914 } 5915 5916 if (filter->filter_set || 5917 !filter->dump_routes || !filter->dump_exceptions) { 5918 flags |= NLM_F_DUMP_FILTERED; 5919 } 5920 5921 if (filter->dump_routes) { 5922 if (skip) { 5923 skip--; 5924 } else { 5925 if (rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 5926 0, RTM_NEWROUTE, 5927 NETLINK_CB(arg->cb->skb).portid, 5928 arg->cb->nlh->nlmsg_seq, flags)) { 5929 return 0; 5930 } 5931 count++; 5932 } 5933 } 5934 5935 if (filter->dump_exceptions) { 5936 struct fib6_nh_exception_dump_walker w = { .dump = arg, 5937 .rt = rt, 5938 .flags = flags, 5939 .skip = skip, 5940 .count = 0 }; 5941 int err; 5942 5943 rcu_read_lock(); 5944 if (rt->nh) { 5945 err = nexthop_for_each_fib6_nh(rt->nh, 5946 rt6_nh_dump_exceptions, 5947 &w); 5948 } else { 5949 err = rt6_nh_dump_exceptions(rt->fib6_nh, &w); 5950 } 5951 rcu_read_unlock(); 5952 5953 if (err) 5954 return count + w.count; 5955 } 5956 5957 return -1; 5958 } 5959 5960 static int inet6_rtm_valid_getroute_req(struct sk_buff *skb, 5961 const struct nlmsghdr *nlh, 5962 struct nlattr **tb, 5963 struct netlink_ext_ack *extack) 5964 { 5965 struct rtmsg *rtm; 5966 int i, err; 5967 5968 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { 5969 NL_SET_ERR_MSG_MOD(extack, 5970 "Invalid header for get route request"); 5971 return -EINVAL; 5972 } 5973 5974 if (!netlink_strict_get_check(skb)) 5975 return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, 5976 rtm_ipv6_policy, extack); 5977 5978 rtm = nlmsg_data(nlh); 5979 if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) || 5980 (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) || 5981 rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope || 5982 rtm->rtm_type) { 5983 NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request"); 5984 return -EINVAL; 5985 } 5986 if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) { 5987 NL_SET_ERR_MSG_MOD(extack, 5988 "Invalid flags for get route request"); 5989 return -EINVAL; 5990 } 5991 5992 
err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX, 5993 rtm_ipv6_policy, extack); 5994 if (err) 5995 return err; 5996 5997 if ((tb[RTA_SRC] && !rtm->rtm_src_len) || 5998 (tb[RTA_DST] && !rtm->rtm_dst_len)) { 5999 NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6"); 6000 return -EINVAL; 6001 } 6002 6003 for (i = 0; i <= RTA_MAX; i++) { 6004 if (!tb[i]) 6005 continue; 6006 6007 switch (i) { 6008 case RTA_SRC: 6009 case RTA_DST: 6010 case RTA_IIF: 6011 case RTA_OIF: 6012 case RTA_MARK: 6013 case RTA_UID: 6014 case RTA_SPORT: 6015 case RTA_DPORT: 6016 case RTA_IP_PROTO: 6017 break; 6018 default: 6019 NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request"); 6020 return -EINVAL; 6021 } 6022 } 6023 6024 return 0; 6025 } 6026 6027 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, 6028 struct netlink_ext_ack *extack) 6029 { 6030 struct net *net = sock_net(in_skb->sk); 6031 struct nlattr *tb[RTA_MAX+1]; 6032 int err, iif = 0, oif = 0; 6033 struct fib6_info *from; 6034 struct dst_entry *dst; 6035 struct rt6_info *rt; 6036 struct sk_buff *skb; 6037 struct rtmsg *rtm; 6038 struct flowi6 fl6 = {}; 6039 bool fibmatch; 6040 6041 err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack); 6042 if (err < 0) 6043 goto errout; 6044 6045 err = -EINVAL; 6046 rtm = nlmsg_data(nlh); 6047 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0); 6048 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH); 6049 6050 if (tb[RTA_SRC]) { 6051 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr)) 6052 goto errout; 6053 6054 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]); 6055 } 6056 6057 if (tb[RTA_DST]) { 6058 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr)) 6059 goto errout; 6060 6061 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]); 6062 } 6063 6064 if (tb[RTA_IIF]) 6065 iif = nla_get_u32(tb[RTA_IIF]); 6066 6067 if (tb[RTA_OIF]) 6068 oif = nla_get_u32(tb[RTA_OIF]); 6069 6070 if (tb[RTA_MARK]) 6071 
fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]); 6072 6073 if (tb[RTA_UID]) 6074 fl6.flowi6_uid = make_kuid(current_user_ns(), 6075 nla_get_u32(tb[RTA_UID])); 6076 else 6077 fl6.flowi6_uid = iif ? INVALID_UID : current_uid(); 6078 6079 if (tb[RTA_SPORT]) 6080 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]); 6081 6082 if (tb[RTA_DPORT]) 6083 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]); 6084 6085 if (tb[RTA_IP_PROTO]) { 6086 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO], 6087 &fl6.flowi6_proto, AF_INET6, 6088 extack); 6089 if (err) 6090 goto errout; 6091 } 6092 6093 if (iif) { 6094 struct net_device *dev; 6095 int flags = 0; 6096 6097 rcu_read_lock(); 6098 6099 dev = dev_get_by_index_rcu(net, iif); 6100 if (!dev) { 6101 rcu_read_unlock(); 6102 err = -ENODEV; 6103 goto errout; 6104 } 6105 6106 fl6.flowi6_iif = iif; 6107 6108 if (!ipv6_addr_any(&fl6.saddr)) 6109 flags |= RT6_LOOKUP_F_HAS_SADDR; 6110 6111 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags); 6112 6113 rcu_read_unlock(); 6114 } else { 6115 fl6.flowi6_oif = oif; 6116 6117 dst = ip6_route_output(net, NULL, &fl6); 6118 } 6119 6120 6121 rt = container_of(dst, struct rt6_info, dst); 6122 if (rt->dst.error) { 6123 err = rt->dst.error; 6124 ip6_rt_put(rt); 6125 goto errout; 6126 } 6127 6128 if (rt == net->ipv6.ip6_null_entry) { 6129 err = rt->dst.error; 6130 ip6_rt_put(rt); 6131 goto errout; 6132 } 6133 6134 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 6135 if (!skb) { 6136 ip6_rt_put(rt); 6137 err = -ENOBUFS; 6138 goto errout; 6139 } 6140 6141 skb_dst_set(skb, &rt->dst); 6142 6143 rcu_read_lock(); 6144 from = rcu_dereference(rt->from); 6145 if (from) { 6146 if (fibmatch) 6147 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, 6148 iif, RTM_NEWROUTE, 6149 NETLINK_CB(in_skb).portid, 6150 nlh->nlmsg_seq, 0); 6151 else 6152 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr, 6153 &fl6.saddr, iif, RTM_NEWROUTE, 6154 NETLINK_CB(in_skb).portid, 6155 nlh->nlmsg_seq, 0); 6156 } else { 6157 err = -ENETUNREACH; 
6158 } 6159 rcu_read_unlock(); 6160 6161 if (err < 0) { 6162 kfree_skb(skb); 6163 goto errout; 6164 } 6165 6166 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); 6167 errout: 6168 return err; 6169 } 6170 6171 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, 6172 unsigned int nlm_flags) 6173 { 6174 struct sk_buff *skb; 6175 struct net *net = info->nl_net; 6176 u32 seq; 6177 int err; 6178 6179 err = -ENOBUFS; 6180 seq = info->nlh ? info->nlh->nlmsg_seq : 0; 6181 6182 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 6183 if (!skb) 6184 goto errout; 6185 6186 err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, 6187 event, info->portid, seq, nlm_flags); 6188 if (err < 0) { 6189 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ 6190 WARN_ON(err == -EMSGSIZE); 6191 kfree_skb(skb); 6192 goto errout; 6193 } 6194 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 6195 info->nlh, gfp_any()); 6196 return; 6197 errout: 6198 if (err < 0) 6199 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); 6200 } 6201 6202 void fib6_rt_update(struct net *net, struct fib6_info *rt, 6203 struct nl_info *info) 6204 { 6205 u32 seq = info->nlh ? 
info->nlh->nlmsg_seq : 0; 6206 struct sk_buff *skb; 6207 int err = -ENOBUFS; 6208 6209 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 6210 if (!skb) 6211 goto errout; 6212 6213 err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, 6214 RTM_NEWROUTE, info->portid, seq, NLM_F_REPLACE); 6215 if (err < 0) { 6216 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ 6217 WARN_ON(err == -EMSGSIZE); 6218 kfree_skb(skb); 6219 goto errout; 6220 } 6221 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 6222 info->nlh, gfp_any()); 6223 return; 6224 errout: 6225 if (err < 0) 6226 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); 6227 } 6228 6229 void fib6_info_hw_flags_set(struct net *net, struct fib6_info *f6i, 6230 bool offload, bool trap, bool offload_failed) 6231 { 6232 struct sk_buff *skb; 6233 int err; 6234 6235 if (READ_ONCE(f6i->offload) == offload && 6236 READ_ONCE(f6i->trap) == trap && 6237 READ_ONCE(f6i->offload_failed) == offload_failed) 6238 return; 6239 6240 WRITE_ONCE(f6i->offload, offload); 6241 WRITE_ONCE(f6i->trap, trap); 6242 6243 /* 2 means send notifications only if offload_failed was changed. */ 6244 if (net->ipv6.sysctl.fib_notify_on_flag_change == 2 && 6245 READ_ONCE(f6i->offload_failed) == offload_failed) 6246 return; 6247 6248 WRITE_ONCE(f6i->offload_failed, offload_failed); 6249 6250 if (!rcu_access_pointer(f6i->fib6_node)) 6251 /* The route was removed from the tree, do not send 6252 * notification. 
6253 */ 6254 return; 6255 6256 if (!net->ipv6.sysctl.fib_notify_on_flag_change) 6257 return; 6258 6259 skb = nlmsg_new(rt6_nlmsg_size(f6i), GFP_KERNEL); 6260 if (!skb) { 6261 err = -ENOBUFS; 6262 goto errout; 6263 } 6264 6265 err = rt6_fill_node(net, skb, f6i, NULL, NULL, NULL, 0, RTM_NEWROUTE, 0, 6266 0, 0); 6267 if (err < 0) { 6268 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ 6269 WARN_ON(err == -EMSGSIZE); 6270 kfree_skb(skb); 6271 goto errout; 6272 } 6273 6274 rtnl_notify(skb, net, 0, RTNLGRP_IPV6_ROUTE, NULL, GFP_KERNEL); 6275 return; 6276 6277 errout: 6278 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); 6279 } 6280 EXPORT_SYMBOL(fib6_info_hw_flags_set); 6281 6282 static int ip6_route_dev_notify(struct notifier_block *this, 6283 unsigned long event, void *ptr) 6284 { 6285 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 6286 struct net *net = dev_net(dev); 6287 6288 if (!(dev->flags & IFF_LOOPBACK)) 6289 return NOTIFY_OK; 6290 6291 if (event == NETDEV_REGISTER) { 6292 net->ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = dev; 6293 net->ipv6.ip6_null_entry->dst.dev = dev; 6294 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev); 6295 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 6296 net->ipv6.ip6_prohibit_entry->dst.dev = dev; 6297 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev); 6298 net->ipv6.ip6_blk_hole_entry->dst.dev = dev; 6299 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev); 6300 #endif 6301 } else if (event == NETDEV_UNREGISTER && 6302 dev->reg_state != NETREG_UNREGISTERED) { 6303 /* NETDEV_UNREGISTER could be fired for multiple times by 6304 * netdev_wait_allrefs(). Make sure we only call this once. 
6305 */ 6306 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev); 6307 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 6308 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev); 6309 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev); 6310 #endif 6311 } 6312 6313 return NOTIFY_OK; 6314 } 6315 6316 /* 6317 * /proc 6318 */ 6319 6320 #ifdef CONFIG_PROC_FS 6321 static int rt6_stats_seq_show(struct seq_file *seq, void *v) 6322 { 6323 struct net *net = (struct net *)seq->private; 6324 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n", 6325 net->ipv6.rt6_stats->fib_nodes, 6326 net->ipv6.rt6_stats->fib_route_nodes, 6327 atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc), 6328 net->ipv6.rt6_stats->fib_rt_entries, 6329 net->ipv6.rt6_stats->fib_rt_cache, 6330 dst_entries_get_slow(&net->ipv6.ip6_dst_ops), 6331 net->ipv6.rt6_stats->fib_discarded_routes); 6332 6333 return 0; 6334 } 6335 #endif /* CONFIG_PROC_FS */ 6336 6337 #ifdef CONFIG_SYSCTL 6338 6339 static int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, 6340 void *buffer, size_t *lenp, loff_t *ppos) 6341 { 6342 struct net *net; 6343 int delay; 6344 int ret; 6345 if (!write) 6346 return -EINVAL; 6347 6348 net = (struct net *)ctl->extra1; 6349 delay = net->ipv6.sysctl.flush_delay; 6350 ret = proc_dointvec(ctl, write, buffer, lenp, ppos); 6351 if (ret) 6352 return ret; 6353 6354 fib6_run_gc(delay <= 0 ? 
0 : (unsigned long)delay, net, delay > 0); 6355 return 0; 6356 } 6357 6358 static struct ctl_table ipv6_route_table_template[] = { 6359 { 6360 .procname = "max_size", 6361 .data = &init_net.ipv6.sysctl.ip6_rt_max_size, 6362 .maxlen = sizeof(int), 6363 .mode = 0644, 6364 .proc_handler = proc_dointvec, 6365 }, 6366 { 6367 .procname = "gc_thresh", 6368 .data = &ip6_dst_ops_template.gc_thresh, 6369 .maxlen = sizeof(int), 6370 .mode = 0644, 6371 .proc_handler = proc_dointvec, 6372 }, 6373 { 6374 .procname = "flush", 6375 .data = &init_net.ipv6.sysctl.flush_delay, 6376 .maxlen = sizeof(int), 6377 .mode = 0200, 6378 .proc_handler = ipv6_sysctl_rtcache_flush 6379 }, 6380 { 6381 .procname = "gc_min_interval", 6382 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 6383 .maxlen = sizeof(int), 6384 .mode = 0644, 6385 .proc_handler = proc_dointvec_jiffies, 6386 }, 6387 { 6388 .procname = "gc_timeout", 6389 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout, 6390 .maxlen = sizeof(int), 6391 .mode = 0644, 6392 .proc_handler = proc_dointvec_jiffies, 6393 }, 6394 { 6395 .procname = "gc_interval", 6396 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval, 6397 .maxlen = sizeof(int), 6398 .mode = 0644, 6399 .proc_handler = proc_dointvec_jiffies, 6400 }, 6401 { 6402 .procname = "gc_elasticity", 6403 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity, 6404 .maxlen = sizeof(int), 6405 .mode = 0644, 6406 .proc_handler = proc_dointvec, 6407 }, 6408 { 6409 .procname = "mtu_expires", 6410 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires, 6411 .maxlen = sizeof(int), 6412 .mode = 0644, 6413 .proc_handler = proc_dointvec_jiffies, 6414 }, 6415 { 6416 .procname = "min_adv_mss", 6417 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss, 6418 .maxlen = sizeof(int), 6419 .mode = 0644, 6420 .proc_handler = proc_dointvec, 6421 }, 6422 { 6423 .procname = "gc_min_interval_ms", 6424 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 6425 .maxlen = sizeof(int), 6426 .mode = 0644, 6427 .proc_handler = 
proc_dointvec_ms_jiffies, 6428 }, 6429 { 6430 .procname = "skip_notify_on_dev_down", 6431 .data = &init_net.ipv6.sysctl.skip_notify_on_dev_down, 6432 .maxlen = sizeof(u8), 6433 .mode = 0644, 6434 .proc_handler = proc_dou8vec_minmax, 6435 .extra1 = SYSCTL_ZERO, 6436 .extra2 = SYSCTL_ONE, 6437 }, 6438 { } 6439 }; 6440 6441 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net) 6442 { 6443 struct ctl_table *table; 6444 6445 table = kmemdup(ipv6_route_table_template, 6446 sizeof(ipv6_route_table_template), 6447 GFP_KERNEL); 6448 6449 if (table) { 6450 table[0].data = &net->ipv6.sysctl.ip6_rt_max_size; 6451 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh; 6452 table[2].data = &net->ipv6.sysctl.flush_delay; 6453 table[2].extra1 = net; 6454 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; 6455 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout; 6456 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval; 6457 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity; 6458 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires; 6459 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss; 6460 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; 6461 table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down; 6462 6463 /* Don't export sysctls to unprivileged users */ 6464 if (net->user_ns != &init_user_ns) 6465 table[1].procname = NULL; 6466 } 6467 6468 return table; 6469 } 6470 6471 size_t ipv6_route_sysctl_table_size(struct net *net) 6472 { 6473 /* Don't export sysctls to unprivileged users */ 6474 if (net->user_ns != &init_user_ns) 6475 return 1; 6476 6477 return ARRAY_SIZE(ipv6_route_table_template); 6478 } 6479 #endif 6480 6481 static int __net_init ip6_route_net_init(struct net *net) 6482 { 6483 int ret = -ENOMEM; 6484 6485 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template, 6486 sizeof(net->ipv6.ip6_dst_ops)); 6487 6488 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0) 6489 goto out_ip6_dst_ops; 6490 6491 net->ipv6.fib6_null_entry = 
fib6_info_alloc(GFP_KERNEL, true); 6492 if (!net->ipv6.fib6_null_entry) 6493 goto out_ip6_dst_entries; 6494 memcpy(net->ipv6.fib6_null_entry, &fib6_null_entry_template, 6495 sizeof(*net->ipv6.fib6_null_entry)); 6496 6497 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template, 6498 sizeof(*net->ipv6.ip6_null_entry), 6499 GFP_KERNEL); 6500 if (!net->ipv6.ip6_null_entry) 6501 goto out_fib6_null_entry; 6502 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; 6503 dst_init_metrics(&net->ipv6.ip6_null_entry->dst, 6504 ip6_template_metrics, true); 6505 INIT_LIST_HEAD(&net->ipv6.ip6_null_entry->dst.rt_uncached); 6506 6507 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 6508 net->ipv6.fib6_has_custom_rules = false; 6509 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template, 6510 sizeof(*net->ipv6.ip6_prohibit_entry), 6511 GFP_KERNEL); 6512 if (!net->ipv6.ip6_prohibit_entry) 6513 goto out_ip6_null_entry; 6514 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops; 6515 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst, 6516 ip6_template_metrics, true); 6517 INIT_LIST_HEAD(&net->ipv6.ip6_prohibit_entry->dst.rt_uncached); 6518 6519 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template, 6520 sizeof(*net->ipv6.ip6_blk_hole_entry), 6521 GFP_KERNEL); 6522 if (!net->ipv6.ip6_blk_hole_entry) 6523 goto out_ip6_prohibit_entry; 6524 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops; 6525 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, 6526 ip6_template_metrics, true); 6527 INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->dst.rt_uncached); 6528 #ifdef CONFIG_IPV6_SUBTREES 6529 net->ipv6.fib6_routes_require_src = 0; 6530 #endif 6531 #endif 6532 6533 net->ipv6.sysctl.flush_delay = 0; 6534 net->ipv6.sysctl.ip6_rt_max_size = INT_MAX; 6535 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2; 6536 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ; 6537 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ; 6538 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9; 6539 
net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ; 6540 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; 6541 net->ipv6.sysctl.skip_notify_on_dev_down = 0; 6542 6543 atomic_set(&net->ipv6.ip6_rt_gc_expire, 30*HZ); 6544 6545 ret = 0; 6546 out: 6547 return ret; 6548 6549 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 6550 out_ip6_prohibit_entry: 6551 kfree(net->ipv6.ip6_prohibit_entry); 6552 out_ip6_null_entry: 6553 kfree(net->ipv6.ip6_null_entry); 6554 #endif 6555 out_fib6_null_entry: 6556 kfree(net->ipv6.fib6_null_entry); 6557 out_ip6_dst_entries: 6558 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 6559 out_ip6_dst_ops: 6560 goto out; 6561 } 6562 6563 static void __net_exit ip6_route_net_exit(struct net *net) 6564 { 6565 kfree(net->ipv6.fib6_null_entry); 6566 kfree(net->ipv6.ip6_null_entry); 6567 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 6568 kfree(net->ipv6.ip6_prohibit_entry); 6569 kfree(net->ipv6.ip6_blk_hole_entry); 6570 #endif 6571 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 6572 } 6573 6574 static int __net_init ip6_route_net_init_late(struct net *net) 6575 { 6576 #ifdef CONFIG_PROC_FS 6577 if (!proc_create_net("ipv6_route", 0, net->proc_net, 6578 &ipv6_route_seq_ops, 6579 sizeof(struct ipv6_route_iter))) 6580 return -ENOMEM; 6581 6582 if (!proc_create_net_single("rt6_stats", 0444, net->proc_net, 6583 rt6_stats_seq_show, NULL)) { 6584 remove_proc_entry("ipv6_route", net->proc_net); 6585 return -ENOMEM; 6586 } 6587 #endif 6588 return 0; 6589 } 6590 6591 static void __net_exit ip6_route_net_exit_late(struct net *net) 6592 { 6593 #ifdef CONFIG_PROC_FS 6594 remove_proc_entry("ipv6_route", net->proc_net); 6595 remove_proc_entry("rt6_stats", net->proc_net); 6596 #endif 6597 } 6598 6599 static struct pernet_operations ip6_route_net_ops = { 6600 .init = ip6_route_net_init, 6601 .exit = ip6_route_net_exit, 6602 }; 6603 6604 static int __net_init ipv6_inetpeer_init(struct net *net) 6605 { 6606 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL); 6607 6608 if (!bp) 6609 
return -ENOMEM; 6610 inet_peer_base_init(bp); 6611 net->ipv6.peers = bp; 6612 return 0; 6613 } 6614 6615 static void __net_exit ipv6_inetpeer_exit(struct net *net) 6616 { 6617 struct inet_peer_base *bp = net->ipv6.peers; 6618 6619 net->ipv6.peers = NULL; 6620 inetpeer_invalidate_tree(bp); 6621 kfree(bp); 6622 } 6623 6624 static struct pernet_operations ipv6_inetpeer_ops = { 6625 .init = ipv6_inetpeer_init, 6626 .exit = ipv6_inetpeer_exit, 6627 }; 6628 6629 static struct pernet_operations ip6_route_net_late_ops = { 6630 .init = ip6_route_net_init_late, 6631 .exit = ip6_route_net_exit_late, 6632 }; 6633 6634 static struct notifier_block ip6_route_dev_notifier = { 6635 .notifier_call = ip6_route_dev_notify, 6636 .priority = ADDRCONF_NOTIFY_PRIORITY - 10, 6637 }; 6638 6639 void __init ip6_route_init_special_entries(void) 6640 { 6641 /* Registering of the loopback is done before this portion of code, 6642 * the loopback reference in rt6_info will not be taken, do it 6643 * manually for init_net */ 6644 init_net.ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = init_net.loopback_dev; 6645 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; 6646 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 6647 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 6648 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev; 6649 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 6650 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev; 6651 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 6652 #endif 6653 } 6654 6655 #if IS_BUILTIN(CONFIG_IPV6) 6656 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 6657 DEFINE_BPF_ITER_FUNC(ipv6_route, struct bpf_iter_meta *meta, struct fib6_info *rt) 6658 6659 BTF_ID_LIST(btf_fib6_info_id) 6660 BTF_ID(struct, fib6_info) 6661 6662 static const struct bpf_iter_seq_info ipv6_route_seq_info = { 6663 .seq_ops = &ipv6_route_seq_ops, 6664 
.init_seq_private = bpf_iter_init_seq_net, 6665 .fini_seq_private = bpf_iter_fini_seq_net, 6666 .seq_priv_size = sizeof(struct ipv6_route_iter), 6667 }; 6668 6669 static struct bpf_iter_reg ipv6_route_reg_info = { 6670 .target = "ipv6_route", 6671 .ctx_arg_info_size = 1, 6672 .ctx_arg_info = { 6673 { offsetof(struct bpf_iter__ipv6_route, rt), 6674 PTR_TO_BTF_ID_OR_NULL }, 6675 }, 6676 .seq_info = &ipv6_route_seq_info, 6677 }; 6678 6679 static int __init bpf_iter_register(void) 6680 { 6681 ipv6_route_reg_info.ctx_arg_info[0].btf_id = *btf_fib6_info_id; 6682 return bpf_iter_reg_target(&ipv6_route_reg_info); 6683 } 6684 6685 static void bpf_iter_unregister(void) 6686 { 6687 bpf_iter_unreg_target(&ipv6_route_reg_info); 6688 } 6689 #endif 6690 #endif 6691 6692 int __init ip6_route_init(void) 6693 { 6694 int ret; 6695 int cpu; 6696 6697 ret = -ENOMEM; 6698 ip6_dst_ops_template.kmem_cachep = 6699 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0, 6700 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); 6701 if (!ip6_dst_ops_template.kmem_cachep) 6702 goto out; 6703 6704 ret = dst_entries_init(&ip6_dst_blackhole_ops); 6705 if (ret) 6706 goto out_kmem_cache; 6707 6708 ret = register_pernet_subsys(&ipv6_inetpeer_ops); 6709 if (ret) 6710 goto out_dst_entries; 6711 6712 ret = register_pernet_subsys(&ip6_route_net_ops); 6713 if (ret) 6714 goto out_register_inetpeer; 6715 6716 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep; 6717 6718 ret = fib6_init(); 6719 if (ret) 6720 goto out_register_subsys; 6721 6722 ret = xfrm6_init(); 6723 if (ret) 6724 goto out_fib6_init; 6725 6726 ret = fib6_rules_init(); 6727 if (ret) 6728 goto xfrm6_init; 6729 6730 ret = register_pernet_subsys(&ip6_route_net_late_ops); 6731 if (ret) 6732 goto fib6_rules_init; 6733 6734 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE, 6735 inet6_rtm_newroute, NULL, 0); 6736 if (ret < 0) 6737 goto out_register_late_subsys; 6738 6739 ret = rtnl_register_module(THIS_MODULE, 
PF_INET6, RTM_DELROUTE, 6740 inet6_rtm_delroute, NULL, 0); 6741 if (ret < 0) 6742 goto out_register_late_subsys; 6743 6744 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, 6745 inet6_rtm_getroute, NULL, 6746 RTNL_FLAG_DOIT_UNLOCKED); 6747 if (ret < 0) 6748 goto out_register_late_subsys; 6749 6750 ret = register_netdevice_notifier(&ip6_route_dev_notifier); 6751 if (ret) 6752 goto out_register_late_subsys; 6753 6754 #if IS_BUILTIN(CONFIG_IPV6) 6755 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 6756 ret = bpf_iter_register(); 6757 if (ret) 6758 goto out_register_late_subsys; 6759 #endif 6760 #endif 6761 6762 for_each_possible_cpu(cpu) { 6763 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); 6764 6765 INIT_LIST_HEAD(&ul->head); 6766 INIT_LIST_HEAD(&ul->quarantine); 6767 spin_lock_init(&ul->lock); 6768 } 6769 6770 out: 6771 return ret; 6772 6773 out_register_late_subsys: 6774 rtnl_unregister_all(PF_INET6); 6775 unregister_pernet_subsys(&ip6_route_net_late_ops); 6776 fib6_rules_init: 6777 fib6_rules_cleanup(); 6778 xfrm6_init: 6779 xfrm6_fini(); 6780 out_fib6_init: 6781 fib6_gc_cleanup(); 6782 out_register_subsys: 6783 unregister_pernet_subsys(&ip6_route_net_ops); 6784 out_register_inetpeer: 6785 unregister_pernet_subsys(&ipv6_inetpeer_ops); 6786 out_dst_entries: 6787 dst_entries_destroy(&ip6_dst_blackhole_ops); 6788 out_kmem_cache: 6789 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); 6790 goto out; 6791 } 6792 6793 void ip6_route_cleanup(void) 6794 { 6795 #if IS_BUILTIN(CONFIG_IPV6) 6796 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 6797 bpf_iter_unregister(); 6798 #endif 6799 #endif 6800 unregister_netdevice_notifier(&ip6_route_dev_notifier); 6801 unregister_pernet_subsys(&ip6_route_net_late_ops); 6802 fib6_rules_cleanup(); 6803 xfrm6_fini(); 6804 fib6_gc_cleanup(); 6805 unregister_pernet_subsys(&ipv6_inetpeer_ops); 6806 unregister_pernet_subsys(&ip6_route_net_ops); 6807 
dst_entries_destroy(&ip6_dst_blackhole_ops); 6808 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); 6809 } 6810