1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Linux INET6 implementation 4 * FIB front-end. 5 * 6 * Authors: 7 * Pedro Roque <roque@di.fc.ul.pt> 8 */ 9 10 /* Changes: 11 * 12 * YOSHIFUJI Hideaki @USAGI 13 * reworked default router selection. 14 * - respect outgoing interface 15 * - select from (probably) reachable routers (i.e. 16 * routers in REACHABLE, STALE, DELAY or PROBE states). 17 * - always select the same router if it is (probably) 18 * reachable. otherwise, round-robin the list. 19 * Ville Nuorvala 20 * Fixed routing subtrees. 21 */ 22 23 #define pr_fmt(fmt) "IPv6: " fmt 24 25 #include <linux/capability.h> 26 #include <linux/errno.h> 27 #include <linux/export.h> 28 #include <linux/types.h> 29 #include <linux/times.h> 30 #include <linux/socket.h> 31 #include <linux/sockios.h> 32 #include <linux/net.h> 33 #include <linux/route.h> 34 #include <linux/netdevice.h> 35 #include <linux/in6.h> 36 #include <linux/mroute6.h> 37 #include <linux/init.h> 38 #include <linux/if_arp.h> 39 #include <linux/proc_fs.h> 40 #include <linux/seq_file.h> 41 #include <linux/nsproxy.h> 42 #include <linux/slab.h> 43 #include <linux/jhash.h> 44 #include <linux/siphash.h> 45 #include <net/net_namespace.h> 46 #include <net/snmp.h> 47 #include <net/ipv6.h> 48 #include <net/ip6_fib.h> 49 #include <net/ip6_route.h> 50 #include <net/ndisc.h> 51 #include <net/addrconf.h> 52 #include <net/tcp.h> 53 #include <linux/rtnetlink.h> 54 #include <net/dst.h> 55 #include <net/dst_metadata.h> 56 #include <net/xfrm.h> 57 #include <net/netevent.h> 58 #include <net/netlink.h> 59 #include <net/rtnh.h> 60 #include <net/lwtunnel.h> 61 #include <net/ip_tunnels.h> 62 #include <net/l3mdev.h> 63 #include <net/ip.h> 64 #include <linux/uaccess.h> 65 #include <linux/btf_ids.h> 66 67 #ifdef CONFIG_SYSCTL 68 #include <linux/sysctl.h> 69 #endif 70 71 static int ip6_rt_type_to_error(u8 fib6_type); 72 73 #define CREATE_TRACE_POINTS 74 #include <trace/events/fib6.h> 75 
EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup); 76 #undef CREATE_TRACE_POINTS 77 78 enum rt6_nud_state { 79 RT6_NUD_FAIL_HARD = -3, 80 RT6_NUD_FAIL_PROBE = -2, 81 RT6_NUD_FAIL_DO_RR = -1, 82 RT6_NUD_SUCCEED = 1 83 }; 84 85 INDIRECT_CALLABLE_SCOPE 86 struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie); 87 static unsigned int ip6_default_advmss(const struct dst_entry *dst); 88 INDIRECT_CALLABLE_SCOPE 89 unsigned int ip6_mtu(const struct dst_entry *dst); 90 static struct dst_entry *ip6_negative_advice(struct dst_entry *); 91 static void ip6_dst_destroy(struct dst_entry *); 92 static void ip6_dst_ifdown(struct dst_entry *, 93 struct net_device *dev, int how); 94 static int ip6_dst_gc(struct dst_ops *ops); 95 96 static int ip6_pkt_discard(struct sk_buff *skb); 97 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb); 98 static int ip6_pkt_prohibit(struct sk_buff *skb); 99 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb); 100 static void ip6_link_failure(struct sk_buff *skb); 101 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, 102 struct sk_buff *skb, u32 mtu, 103 bool confirm_neigh); 104 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, 105 struct sk_buff *skb); 106 static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif, 107 int strict); 108 static size_t rt6_nlmsg_size(struct fib6_info *f6i); 109 static int rt6_fill_node(struct net *net, struct sk_buff *skb, 110 struct fib6_info *rt, struct dst_entry *dst, 111 struct in6_addr *dest, struct in6_addr *src, 112 int iif, int type, u32 portid, u32 seq, 113 unsigned int flags); 114 static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res, 115 const struct in6_addr *daddr, 116 const struct in6_addr *saddr); 117 118 #ifdef CONFIG_IPV6_ROUTE_INFO 119 static struct fib6_info *rt6_add_route_info(struct net *net, 120 const struct in6_addr *prefix, int prefixlen, 121 
const struct in6_addr *gwaddr, 122 struct net_device *dev, 123 unsigned int pref); 124 static struct fib6_info *rt6_get_route_info(struct net *net, 125 const struct in6_addr *prefix, int prefixlen, 126 const struct in6_addr *gwaddr, 127 struct net_device *dev); 128 #endif 129 130 struct uncached_list { 131 spinlock_t lock; 132 struct list_head head; 133 }; 134 135 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list); 136 137 void rt6_uncached_list_add(struct rt6_info *rt) 138 { 139 struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list); 140 141 rt->rt6i_uncached_list = ul; 142 143 spin_lock_bh(&ul->lock); 144 list_add_tail(&rt->rt6i_uncached, &ul->head); 145 spin_unlock_bh(&ul->lock); 146 } 147 148 void rt6_uncached_list_del(struct rt6_info *rt) 149 { 150 if (!list_empty(&rt->rt6i_uncached)) { 151 struct uncached_list *ul = rt->rt6i_uncached_list; 152 struct net *net = dev_net(rt->dst.dev); 153 154 spin_lock_bh(&ul->lock); 155 list_del(&rt->rt6i_uncached); 156 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache); 157 spin_unlock_bh(&ul->lock); 158 } 159 } 160 161 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev) 162 { 163 struct net_device *loopback_dev = net->loopback_dev; 164 int cpu; 165 166 if (dev == loopback_dev) 167 return; 168 169 for_each_possible_cpu(cpu) { 170 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); 171 struct rt6_info *rt; 172 173 spin_lock_bh(&ul->lock); 174 list_for_each_entry(rt, &ul->head, rt6i_uncached) { 175 struct inet6_dev *rt_idev = rt->rt6i_idev; 176 struct net_device *rt_dev = rt->dst.dev; 177 178 if (rt_idev->dev == dev) { 179 rt->rt6i_idev = in6_dev_get(loopback_dev); 180 in6_dev_put(rt_idev); 181 } 182 183 if (rt_dev == dev) { 184 rt->dst.dev = blackhole_netdev; 185 dev_hold(rt->dst.dev); 186 dev_put(rt_dev); 187 } 188 } 189 spin_unlock_bh(&ul->lock); 190 } 191 } 192 193 static inline const void *choose_neigh_daddr(const struct in6_addr *p, 194 struct sk_buff *skb, 
195 const void *daddr) 196 { 197 if (!ipv6_addr_any(p)) 198 return (const void *) p; 199 else if (skb) 200 return &ipv6_hdr(skb)->daddr; 201 return daddr; 202 } 203 204 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw, 205 struct net_device *dev, 206 struct sk_buff *skb, 207 const void *daddr) 208 { 209 struct neighbour *n; 210 211 daddr = choose_neigh_daddr(gw, skb, daddr); 212 n = __ipv6_neigh_lookup(dev, daddr); 213 if (n) 214 return n; 215 216 n = neigh_create(&nd_tbl, daddr, dev); 217 return IS_ERR(n) ? NULL : n; 218 } 219 220 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst, 221 struct sk_buff *skb, 222 const void *daddr) 223 { 224 const struct rt6_info *rt = container_of(dst, struct rt6_info, dst); 225 226 return ip6_neigh_lookup(rt6_nexthop(rt, &in6addr_any), 227 dst->dev, skb, daddr); 228 } 229 230 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr) 231 { 232 struct net_device *dev = dst->dev; 233 struct rt6_info *rt = (struct rt6_info *)dst; 234 235 daddr = choose_neigh_daddr(rt6_nexthop(rt, &in6addr_any), NULL, daddr); 236 if (!daddr) 237 return; 238 if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) 239 return; 240 if (ipv6_addr_is_multicast((const struct in6_addr *)daddr)) 241 return; 242 __ipv6_confirm_neigh(dev, daddr); 243 } 244 245 static struct dst_ops ip6_dst_ops_template = { 246 .family = AF_INET6, 247 .gc = ip6_dst_gc, 248 .gc_thresh = 1024, 249 .check = ip6_dst_check, 250 .default_advmss = ip6_default_advmss, 251 .mtu = ip6_mtu, 252 .cow_metrics = dst_cow_metrics_generic, 253 .destroy = ip6_dst_destroy, 254 .ifdown = ip6_dst_ifdown, 255 .negative_advice = ip6_negative_advice, 256 .link_failure = ip6_link_failure, 257 .update_pmtu = ip6_rt_update_pmtu, 258 .redirect = rt6_do_redirect, 259 .local_out = __ip6_local_out, 260 .neigh_lookup = ip6_dst_neigh_lookup, 261 .confirm_neigh = ip6_confirm_neigh, 262 }; 263 264 static struct dst_ops ip6_dst_blackhole_ops = { 265 .family = AF_INET6, 266 
.default_advmss = ip6_default_advmss, 267 .neigh_lookup = ip6_dst_neigh_lookup, 268 .check = ip6_dst_check, 269 .destroy = ip6_dst_destroy, 270 .cow_metrics = dst_cow_metrics_generic, 271 .update_pmtu = dst_blackhole_update_pmtu, 272 .redirect = dst_blackhole_redirect, 273 .mtu = dst_blackhole_mtu, 274 }; 275 276 static const u32 ip6_template_metrics[RTAX_MAX] = { 277 [RTAX_HOPLIMIT - 1] = 0, 278 }; 279 280 static const struct fib6_info fib6_null_entry_template = { 281 .fib6_flags = (RTF_REJECT | RTF_NONEXTHOP), 282 .fib6_protocol = RTPROT_KERNEL, 283 .fib6_metric = ~(u32)0, 284 .fib6_ref = REFCOUNT_INIT(1), 285 .fib6_type = RTN_UNREACHABLE, 286 .fib6_metrics = (struct dst_metrics *)&dst_default_metrics, 287 }; 288 289 static const struct rt6_info ip6_null_entry_template = { 290 .dst = { 291 .__refcnt = ATOMIC_INIT(1), 292 .__use = 1, 293 .obsolete = DST_OBSOLETE_FORCE_CHK, 294 .error = -ENETUNREACH, 295 .input = ip6_pkt_discard, 296 .output = ip6_pkt_discard_out, 297 }, 298 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), 299 }; 300 301 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 302 303 static const struct rt6_info ip6_prohibit_entry_template = { 304 .dst = { 305 .__refcnt = ATOMIC_INIT(1), 306 .__use = 1, 307 .obsolete = DST_OBSOLETE_FORCE_CHK, 308 .error = -EACCES, 309 .input = ip6_pkt_prohibit, 310 .output = ip6_pkt_prohibit_out, 311 }, 312 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), 313 }; 314 315 static const struct rt6_info ip6_blk_hole_entry_template = { 316 .dst = { 317 .__refcnt = ATOMIC_INIT(1), 318 .__use = 1, 319 .obsolete = DST_OBSOLETE_FORCE_CHK, 320 .error = -EINVAL, 321 .input = dst_discard, 322 .output = dst_discard_out, 323 }, 324 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), 325 }; 326 327 #endif 328 329 static void rt6_info_init(struct rt6_info *rt) 330 { 331 memset_after(rt, 0, dst); 332 INIT_LIST_HEAD(&rt->rt6i_uncached); 333 } 334 335 /* allocate dst with ip6_dst_ops */ 336 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, 337 int 
flags) 338 { 339 struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev, 340 1, DST_OBSOLETE_FORCE_CHK, flags); 341 342 if (rt) { 343 rt6_info_init(rt); 344 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); 345 } 346 347 return rt; 348 } 349 EXPORT_SYMBOL(ip6_dst_alloc); 350 351 static void ip6_dst_destroy(struct dst_entry *dst) 352 { 353 struct rt6_info *rt = (struct rt6_info *)dst; 354 struct fib6_info *from; 355 struct inet6_dev *idev; 356 357 ip_dst_metrics_put(dst); 358 rt6_uncached_list_del(rt); 359 360 idev = rt->rt6i_idev; 361 if (idev) { 362 rt->rt6i_idev = NULL; 363 in6_dev_put(idev); 364 } 365 366 from = xchg((__force struct fib6_info **)&rt->from, NULL); 367 fib6_info_release(from); 368 } 369 370 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, 371 int how) 372 { 373 struct rt6_info *rt = (struct rt6_info *)dst; 374 struct inet6_dev *idev = rt->rt6i_idev; 375 struct net_device *loopback_dev = 376 dev_net(dev)->loopback_dev; 377 378 if (idev && idev->dev != loopback_dev) { 379 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev); 380 if (loopback_idev) { 381 rt->rt6i_idev = loopback_idev; 382 in6_dev_put(idev); 383 } 384 } 385 } 386 387 static bool __rt6_check_expired(const struct rt6_info *rt) 388 { 389 if (rt->rt6i_flags & RTF_EXPIRES) 390 return time_after(jiffies, rt->dst.expires); 391 else 392 return false; 393 } 394 395 static bool rt6_check_expired(const struct rt6_info *rt) 396 { 397 struct fib6_info *from; 398 399 from = rcu_dereference(rt->from); 400 401 if (rt->rt6i_flags & RTF_EXPIRES) { 402 if (time_after(jiffies, rt->dst.expires)) 403 return true; 404 } else if (from) { 405 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK || 406 fib6_check_expired(from); 407 } 408 return false; 409 } 410 411 void fib6_select_path(const struct net *net, struct fib6_result *res, 412 struct flowi6 *fl6, int oif, bool have_oif_match, 413 const struct sk_buff *skb, int strict) 414 { 415 struct fib6_info *sibling, 
*next_sibling; 416 struct fib6_info *match = res->f6i; 417 418 if (!match->nh && (!match->fib6_nsiblings || have_oif_match)) 419 goto out; 420 421 if (match->nh && have_oif_match && res->nh) 422 return; 423 424 /* We might have already computed the hash for ICMPv6 errors. In such 425 * case it will always be non-zero. Otherwise now is the time to do it. 426 */ 427 if (!fl6->mp_hash && 428 (!match->nh || nexthop_is_multipath(match->nh))) 429 fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL); 430 431 if (unlikely(match->nh)) { 432 nexthop_path_fib6_result(res, fl6->mp_hash); 433 return; 434 } 435 436 if (fl6->mp_hash <= atomic_read(&match->fib6_nh->fib_nh_upper_bound)) 437 goto out; 438 439 list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings, 440 fib6_siblings) { 441 const struct fib6_nh *nh = sibling->fib6_nh; 442 int nh_upper_bound; 443 444 nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound); 445 if (fl6->mp_hash > nh_upper_bound) 446 continue; 447 if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0) 448 break; 449 match = sibling; 450 break; 451 } 452 453 out: 454 res->f6i = match; 455 res->nh = match->fib6_nh; 456 } 457 458 /* 459 * Route lookup. rcu_read_lock() should be held. 
460 */ 461 462 static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh, 463 const struct in6_addr *saddr, int oif, int flags) 464 { 465 const struct net_device *dev; 466 467 if (nh->fib_nh_flags & RTNH_F_DEAD) 468 return false; 469 470 dev = nh->fib_nh_dev; 471 if (oif) { 472 if (dev->ifindex == oif) 473 return true; 474 } else { 475 if (ipv6_chk_addr(net, saddr, dev, 476 flags & RT6_LOOKUP_F_IFACE)) 477 return true; 478 } 479 480 return false; 481 } 482 483 struct fib6_nh_dm_arg { 484 struct net *net; 485 const struct in6_addr *saddr; 486 int oif; 487 int flags; 488 struct fib6_nh *nh; 489 }; 490 491 static int __rt6_nh_dev_match(struct fib6_nh *nh, void *_arg) 492 { 493 struct fib6_nh_dm_arg *arg = _arg; 494 495 arg->nh = nh; 496 return __rt6_device_match(arg->net, nh, arg->saddr, arg->oif, 497 arg->flags); 498 } 499 500 /* returns fib6_nh from nexthop or NULL */ 501 static struct fib6_nh *rt6_nh_dev_match(struct net *net, struct nexthop *nh, 502 struct fib6_result *res, 503 const struct in6_addr *saddr, 504 int oif, int flags) 505 { 506 struct fib6_nh_dm_arg arg = { 507 .net = net, 508 .saddr = saddr, 509 .oif = oif, 510 .flags = flags, 511 }; 512 513 if (nexthop_is_blackhole(nh)) 514 return NULL; 515 516 if (nexthop_for_each_fib6_nh(nh, __rt6_nh_dev_match, &arg)) 517 return arg.nh; 518 519 return NULL; 520 } 521 522 static void rt6_device_match(struct net *net, struct fib6_result *res, 523 const struct in6_addr *saddr, int oif, int flags) 524 { 525 struct fib6_info *f6i = res->f6i; 526 struct fib6_info *spf6i; 527 struct fib6_nh *nh; 528 529 if (!oif && ipv6_addr_any(saddr)) { 530 if (unlikely(f6i->nh)) { 531 nh = nexthop_fib6_nh(f6i->nh); 532 if (nexthop_is_blackhole(f6i->nh)) 533 goto out_blackhole; 534 } else { 535 nh = f6i->fib6_nh; 536 } 537 if (!(nh->fib_nh_flags & RTNH_F_DEAD)) 538 goto out; 539 } 540 541 for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) { 542 bool matched = false; 543 544 if (unlikely(spf6i->nh)) { 545 
nh = rt6_nh_dev_match(net, spf6i->nh, res, saddr, 546 oif, flags); 547 if (nh) 548 matched = true; 549 } else { 550 nh = spf6i->fib6_nh; 551 if (__rt6_device_match(net, nh, saddr, oif, flags)) 552 matched = true; 553 } 554 if (matched) { 555 res->f6i = spf6i; 556 goto out; 557 } 558 } 559 560 if (oif && flags & RT6_LOOKUP_F_IFACE) { 561 res->f6i = net->ipv6.fib6_null_entry; 562 nh = res->f6i->fib6_nh; 563 goto out; 564 } 565 566 if (unlikely(f6i->nh)) { 567 nh = nexthop_fib6_nh(f6i->nh); 568 if (nexthop_is_blackhole(f6i->nh)) 569 goto out_blackhole; 570 } else { 571 nh = f6i->fib6_nh; 572 } 573 574 if (nh->fib_nh_flags & RTNH_F_DEAD) { 575 res->f6i = net->ipv6.fib6_null_entry; 576 nh = res->f6i->fib6_nh; 577 } 578 out: 579 res->nh = nh; 580 res->fib6_type = res->f6i->fib6_type; 581 res->fib6_flags = res->f6i->fib6_flags; 582 return; 583 584 out_blackhole: 585 res->fib6_flags |= RTF_REJECT; 586 res->fib6_type = RTN_BLACKHOLE; 587 res->nh = nh; 588 } 589 590 #ifdef CONFIG_IPV6_ROUTER_PREF 591 struct __rt6_probe_work { 592 struct work_struct work; 593 struct in6_addr target; 594 struct net_device *dev; 595 }; 596 597 static void rt6_probe_deferred(struct work_struct *w) 598 { 599 struct in6_addr mcaddr; 600 struct __rt6_probe_work *work = 601 container_of(w, struct __rt6_probe_work, work); 602 603 addrconf_addr_solict_mult(&work->target, &mcaddr); 604 ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0); 605 dev_put(work->dev); 606 kfree(work); 607 } 608 609 static void rt6_probe(struct fib6_nh *fib6_nh) 610 { 611 struct __rt6_probe_work *work = NULL; 612 const struct in6_addr *nh_gw; 613 unsigned long last_probe; 614 struct neighbour *neigh; 615 struct net_device *dev; 616 struct inet6_dev *idev; 617 618 /* 619 * Okay, this does not seem to be appropriate 620 * for now, however, we need to check if it 621 * is really so; aka Router Reachability Probing. 622 * 623 * Router Reachability Probe MUST be rate-limited 624 * to no more than one per minute. 
625 */ 626 if (!fib6_nh->fib_nh_gw_family) 627 return; 628 629 nh_gw = &fib6_nh->fib_nh_gw6; 630 dev = fib6_nh->fib_nh_dev; 631 rcu_read_lock_bh(); 632 last_probe = READ_ONCE(fib6_nh->last_probe); 633 idev = __in6_dev_get(dev); 634 neigh = __ipv6_neigh_lookup_noref(dev, nh_gw); 635 if (neigh) { 636 if (neigh->nud_state & NUD_VALID) 637 goto out; 638 639 write_lock(&neigh->lock); 640 if (!(neigh->nud_state & NUD_VALID) && 641 time_after(jiffies, 642 neigh->updated + idev->cnf.rtr_probe_interval)) { 643 work = kmalloc(sizeof(*work), GFP_ATOMIC); 644 if (work) 645 __neigh_set_probe_once(neigh); 646 } 647 write_unlock(&neigh->lock); 648 } else if (time_after(jiffies, last_probe + 649 idev->cnf.rtr_probe_interval)) { 650 work = kmalloc(sizeof(*work), GFP_ATOMIC); 651 } 652 653 if (!work || cmpxchg(&fib6_nh->last_probe, 654 last_probe, jiffies) != last_probe) { 655 kfree(work); 656 } else { 657 INIT_WORK(&work->work, rt6_probe_deferred); 658 work->target = *nh_gw; 659 dev_hold(dev); 660 work->dev = dev; 661 schedule_work(&work->work); 662 } 663 664 out: 665 rcu_read_unlock_bh(); 666 } 667 #else 668 static inline void rt6_probe(struct fib6_nh *fib6_nh) 669 { 670 } 671 #endif 672 673 /* 674 * Default Router Selection (RFC 2461 6.3.6) 675 */ 676 static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh) 677 { 678 enum rt6_nud_state ret = RT6_NUD_FAIL_HARD; 679 struct neighbour *neigh; 680 681 rcu_read_lock_bh(); 682 neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev, 683 &fib6_nh->fib_nh_gw6); 684 if (neigh) { 685 read_lock(&neigh->lock); 686 if (neigh->nud_state & NUD_VALID) 687 ret = RT6_NUD_SUCCEED; 688 #ifdef CONFIG_IPV6_ROUTER_PREF 689 else if (!(neigh->nud_state & NUD_FAILED)) 690 ret = RT6_NUD_SUCCEED; 691 else 692 ret = RT6_NUD_FAIL_PROBE; 693 #endif 694 read_unlock(&neigh->lock); 695 } else { 696 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ? 
697 RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR; 698 } 699 rcu_read_unlock_bh(); 700 701 return ret; 702 } 703 704 static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif, 705 int strict) 706 { 707 int m = 0; 708 709 if (!oif || nh->fib_nh_dev->ifindex == oif) 710 m = 2; 711 712 if (!m && (strict & RT6_LOOKUP_F_IFACE)) 713 return RT6_NUD_FAIL_HARD; 714 #ifdef CONFIG_IPV6_ROUTER_PREF 715 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2; 716 #endif 717 if ((strict & RT6_LOOKUP_F_REACHABLE) && 718 !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) { 719 int n = rt6_check_neigh(nh); 720 if (n < 0) 721 return n; 722 } 723 return m; 724 } 725 726 static bool find_match(struct fib6_nh *nh, u32 fib6_flags, 727 int oif, int strict, int *mpri, bool *do_rr) 728 { 729 bool match_do_rr = false; 730 bool rc = false; 731 int m; 732 733 if (nh->fib_nh_flags & RTNH_F_DEAD) 734 goto out; 735 736 if (ip6_ignore_linkdown(nh->fib_nh_dev) && 737 nh->fib_nh_flags & RTNH_F_LINKDOWN && 738 !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE)) 739 goto out; 740 741 m = rt6_score_route(nh, fib6_flags, oif, strict); 742 if (m == RT6_NUD_FAIL_DO_RR) { 743 match_do_rr = true; 744 m = 0; /* lowest valid score */ 745 } else if (m == RT6_NUD_FAIL_HARD) { 746 goto out; 747 } 748 749 if (strict & RT6_LOOKUP_F_REACHABLE) 750 rt6_probe(nh); 751 752 /* note that m can be RT6_NUD_FAIL_PROBE at this point */ 753 if (m > *mpri) { 754 *do_rr = match_do_rr; 755 *mpri = m; 756 rc = true; 757 } 758 out: 759 return rc; 760 } 761 762 struct fib6_nh_frl_arg { 763 u32 flags; 764 int oif; 765 int strict; 766 int *mpri; 767 bool *do_rr; 768 struct fib6_nh *nh; 769 }; 770 771 static int rt6_nh_find_match(struct fib6_nh *nh, void *_arg) 772 { 773 struct fib6_nh_frl_arg *arg = _arg; 774 775 arg->nh = nh; 776 return find_match(nh, arg->flags, arg->oif, arg->strict, 777 arg->mpri, arg->do_rr); 778 } 779 780 static void __find_rr_leaf(struct fib6_info *f6i_start, 781 struct fib6_info *nomatch, u32 metric, 
782 struct fib6_result *res, struct fib6_info **cont, 783 int oif, int strict, bool *do_rr, int *mpri) 784 { 785 struct fib6_info *f6i; 786 787 for (f6i = f6i_start; 788 f6i && f6i != nomatch; 789 f6i = rcu_dereference(f6i->fib6_next)) { 790 bool matched = false; 791 struct fib6_nh *nh; 792 793 if (cont && f6i->fib6_metric != metric) { 794 *cont = f6i; 795 return; 796 } 797 798 if (fib6_check_expired(f6i)) 799 continue; 800 801 if (unlikely(f6i->nh)) { 802 struct fib6_nh_frl_arg arg = { 803 .flags = f6i->fib6_flags, 804 .oif = oif, 805 .strict = strict, 806 .mpri = mpri, 807 .do_rr = do_rr 808 }; 809 810 if (nexthop_is_blackhole(f6i->nh)) { 811 res->fib6_flags = RTF_REJECT; 812 res->fib6_type = RTN_BLACKHOLE; 813 res->f6i = f6i; 814 res->nh = nexthop_fib6_nh(f6i->nh); 815 return; 816 } 817 if (nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_find_match, 818 &arg)) { 819 matched = true; 820 nh = arg.nh; 821 } 822 } else { 823 nh = f6i->fib6_nh; 824 if (find_match(nh, f6i->fib6_flags, oif, strict, 825 mpri, do_rr)) 826 matched = true; 827 } 828 if (matched) { 829 res->f6i = f6i; 830 res->nh = nh; 831 res->fib6_flags = f6i->fib6_flags; 832 res->fib6_type = f6i->fib6_type; 833 } 834 } 835 } 836 837 static void find_rr_leaf(struct fib6_node *fn, struct fib6_info *leaf, 838 struct fib6_info *rr_head, int oif, int strict, 839 bool *do_rr, struct fib6_result *res) 840 { 841 u32 metric = rr_head->fib6_metric; 842 struct fib6_info *cont = NULL; 843 int mpri = -1; 844 845 __find_rr_leaf(rr_head, NULL, metric, res, &cont, 846 oif, strict, do_rr, &mpri); 847 848 __find_rr_leaf(leaf, rr_head, metric, res, &cont, 849 oif, strict, do_rr, &mpri); 850 851 if (res->f6i || !cont) 852 return; 853 854 __find_rr_leaf(cont, NULL, metric, res, NULL, 855 oif, strict, do_rr, &mpri); 856 } 857 858 static void rt6_select(struct net *net, struct fib6_node *fn, int oif, 859 struct fib6_result *res, int strict) 860 { 861 struct fib6_info *leaf = rcu_dereference(fn->leaf); 862 struct fib6_info *rt0; 863 
bool do_rr = false; 864 int key_plen; 865 866 /* make sure this function or its helpers sets f6i */ 867 res->f6i = NULL; 868 869 if (!leaf || leaf == net->ipv6.fib6_null_entry) 870 goto out; 871 872 rt0 = rcu_dereference(fn->rr_ptr); 873 if (!rt0) 874 rt0 = leaf; 875 876 /* Double check to make sure fn is not an intermediate node 877 * and fn->leaf does not points to its child's leaf 878 * (This might happen if all routes under fn are deleted from 879 * the tree and fib6_repair_tree() is called on the node.) 880 */ 881 key_plen = rt0->fib6_dst.plen; 882 #ifdef CONFIG_IPV6_SUBTREES 883 if (rt0->fib6_src.plen) 884 key_plen = rt0->fib6_src.plen; 885 #endif 886 if (fn->fn_bit != key_plen) 887 goto out; 888 889 find_rr_leaf(fn, leaf, rt0, oif, strict, &do_rr, res); 890 if (do_rr) { 891 struct fib6_info *next = rcu_dereference(rt0->fib6_next); 892 893 /* no entries matched; do round-robin */ 894 if (!next || next->fib6_metric != rt0->fib6_metric) 895 next = leaf; 896 897 if (next != rt0) { 898 spin_lock_bh(&leaf->fib6_table->tb6_lock); 899 /* make sure next is not being deleted from the tree */ 900 if (next->fib6_node) 901 rcu_assign_pointer(fn->rr_ptr, next); 902 spin_unlock_bh(&leaf->fib6_table->tb6_lock); 903 } 904 } 905 906 out: 907 if (!res->f6i) { 908 res->f6i = net->ipv6.fib6_null_entry; 909 res->nh = res->f6i->fib6_nh; 910 res->fib6_flags = res->f6i->fib6_flags; 911 res->fib6_type = res->f6i->fib6_type; 912 } 913 } 914 915 static bool rt6_is_gw_or_nonexthop(const struct fib6_result *res) 916 { 917 return (res->f6i->fib6_flags & RTF_NONEXTHOP) || 918 res->nh->fib_nh_gw_family; 919 } 920 921 #ifdef CONFIG_IPV6_ROUTE_INFO 922 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, 923 const struct in6_addr *gwaddr) 924 { 925 struct net *net = dev_net(dev); 926 struct route_info *rinfo = (struct route_info *) opt; 927 struct in6_addr prefix_buf, *prefix; 928 unsigned int pref; 929 unsigned long lifetime; 930 struct fib6_info *rt; 931 932 if (len < sizeof(struct 
route_info)) { 933 return -EINVAL; 934 } 935 936 /* Sanity check for prefix_len and length */ 937 if (rinfo->length > 3) { 938 return -EINVAL; 939 } else if (rinfo->prefix_len > 128) { 940 return -EINVAL; 941 } else if (rinfo->prefix_len > 64) { 942 if (rinfo->length < 2) { 943 return -EINVAL; 944 } 945 } else if (rinfo->prefix_len > 0) { 946 if (rinfo->length < 1) { 947 return -EINVAL; 948 } 949 } 950 951 pref = rinfo->route_pref; 952 if (pref == ICMPV6_ROUTER_PREF_INVALID) 953 return -EINVAL; 954 955 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ); 956 957 if (rinfo->length == 3) 958 prefix = (struct in6_addr *)rinfo->prefix; 959 else { 960 /* this function is safe */ 961 ipv6_addr_prefix(&prefix_buf, 962 (struct in6_addr *)rinfo->prefix, 963 rinfo->prefix_len); 964 prefix = &prefix_buf; 965 } 966 967 if (rinfo->prefix_len == 0) 968 rt = rt6_get_dflt_router(net, gwaddr, dev); 969 else 970 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, 971 gwaddr, dev); 972 973 if (rt && !lifetime) { 974 ip6_del_rt(net, rt, false); 975 rt = NULL; 976 } 977 978 if (!rt && lifetime) 979 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, 980 dev, pref); 981 else if (rt) 982 rt->fib6_flags = RTF_ROUTEINFO | 983 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); 984 985 if (rt) { 986 if (!addrconf_finite_timeout(lifetime)) 987 fib6_clean_expires(rt); 988 else 989 fib6_set_expires(rt, jiffies + HZ * lifetime); 990 991 fib6_info_release(rt); 992 } 993 return 0; 994 } 995 #endif 996 997 /* 998 * Misc support functions 999 */ 1000 1001 /* called with rcu_lock held */ 1002 static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res) 1003 { 1004 struct net_device *dev = res->nh->fib_nh_dev; 1005 1006 if (res->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) { 1007 /* for copies of local routes, dst->dev needs to be the 1008 * device if it is a master device, the master device if 1009 * device is enslaved, and the loopback as the default 1010 */ 1011 
if (netif_is_l3_slave(dev) && 1012 !rt6_need_strict(&res->f6i->fib6_dst.addr)) 1013 dev = l3mdev_master_dev_rcu(dev); 1014 else if (!netif_is_l3_master(dev)) 1015 dev = dev_net(dev)->loopback_dev; 1016 /* last case is netif_is_l3_master(dev) is true in which 1017 * case we want dev returned to be dev 1018 */ 1019 } 1020 1021 return dev; 1022 } 1023 1024 static const int fib6_prop[RTN_MAX + 1] = { 1025 [RTN_UNSPEC] = 0, 1026 [RTN_UNICAST] = 0, 1027 [RTN_LOCAL] = 0, 1028 [RTN_BROADCAST] = 0, 1029 [RTN_ANYCAST] = 0, 1030 [RTN_MULTICAST] = 0, 1031 [RTN_BLACKHOLE] = -EINVAL, 1032 [RTN_UNREACHABLE] = -EHOSTUNREACH, 1033 [RTN_PROHIBIT] = -EACCES, 1034 [RTN_THROW] = -EAGAIN, 1035 [RTN_NAT] = -EINVAL, 1036 [RTN_XRESOLVE] = -EINVAL, 1037 }; 1038 1039 static int ip6_rt_type_to_error(u8 fib6_type) 1040 { 1041 return fib6_prop[fib6_type]; 1042 } 1043 1044 static unsigned short fib6_info_dst_flags(struct fib6_info *rt) 1045 { 1046 unsigned short flags = 0; 1047 1048 if (rt->dst_nocount) 1049 flags |= DST_NOCOUNT; 1050 if (rt->dst_nopolicy) 1051 flags |= DST_NOPOLICY; 1052 1053 return flags; 1054 } 1055 1056 static void ip6_rt_init_dst_reject(struct rt6_info *rt, u8 fib6_type) 1057 { 1058 rt->dst.error = ip6_rt_type_to_error(fib6_type); 1059 1060 switch (fib6_type) { 1061 case RTN_BLACKHOLE: 1062 rt->dst.output = dst_discard_out; 1063 rt->dst.input = dst_discard; 1064 break; 1065 case RTN_PROHIBIT: 1066 rt->dst.output = ip6_pkt_prohibit_out; 1067 rt->dst.input = ip6_pkt_prohibit; 1068 break; 1069 case RTN_THROW: 1070 case RTN_UNREACHABLE: 1071 default: 1072 rt->dst.output = ip6_pkt_discard_out; 1073 rt->dst.input = ip6_pkt_discard; 1074 break; 1075 } 1076 } 1077 1078 static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res) 1079 { 1080 struct fib6_info *f6i = res->f6i; 1081 1082 if (res->fib6_flags & RTF_REJECT) { 1083 ip6_rt_init_dst_reject(rt, res->fib6_type); 1084 return; 1085 } 1086 1087 rt->dst.error = 0; 1088 rt->dst.output = ip6_output; 1089 1090 if 
(res->fib6_type == RTN_LOCAL || res->fib6_type == RTN_ANYCAST) { 1091 rt->dst.input = ip6_input; 1092 } else if (ipv6_addr_type(&f6i->fib6_dst.addr) & IPV6_ADDR_MULTICAST) { 1093 rt->dst.input = ip6_mc_input; 1094 } else { 1095 rt->dst.input = ip6_forward; 1096 } 1097 1098 if (res->nh->fib_nh_lws) { 1099 rt->dst.lwtstate = lwtstate_get(res->nh->fib_nh_lws); 1100 lwtunnel_set_redirect(&rt->dst); 1101 } 1102 1103 rt->dst.lastuse = jiffies; 1104 } 1105 1106 /* Caller must already hold reference to @from */ 1107 static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from) 1108 { 1109 rt->rt6i_flags &= ~RTF_EXPIRES; 1110 rcu_assign_pointer(rt->from, from); 1111 ip_dst_init_metrics(&rt->dst, from->fib6_metrics); 1112 } 1113 1114 /* Caller must already hold reference to f6i in result */ 1115 static void ip6_rt_copy_init(struct rt6_info *rt, const struct fib6_result *res) 1116 { 1117 const struct fib6_nh *nh = res->nh; 1118 const struct net_device *dev = nh->fib_nh_dev; 1119 struct fib6_info *f6i = res->f6i; 1120 1121 ip6_rt_init_dst(rt, res); 1122 1123 rt->rt6i_dst = f6i->fib6_dst; 1124 rt->rt6i_idev = dev ? 
in6_dev_get(dev) : NULL; 1125 rt->rt6i_flags = res->fib6_flags; 1126 if (nh->fib_nh_gw_family) { 1127 rt->rt6i_gateway = nh->fib_nh_gw6; 1128 rt->rt6i_flags |= RTF_GATEWAY; 1129 } 1130 rt6_set_from(rt, f6i); 1131 #ifdef CONFIG_IPV6_SUBTREES 1132 rt->rt6i_src = f6i->fib6_src; 1133 #endif 1134 } 1135 1136 static struct fib6_node* fib6_backtrack(struct fib6_node *fn, 1137 struct in6_addr *saddr) 1138 { 1139 struct fib6_node *pn, *sn; 1140 while (1) { 1141 if (fn->fn_flags & RTN_TL_ROOT) 1142 return NULL; 1143 pn = rcu_dereference(fn->parent); 1144 sn = FIB6_SUBTREE(pn); 1145 if (sn && sn != fn) 1146 fn = fib6_node_lookup(sn, NULL, saddr); 1147 else 1148 fn = pn; 1149 if (fn->fn_flags & RTN_RTINFO) 1150 return fn; 1151 } 1152 } 1153 1154 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt) 1155 { 1156 struct rt6_info *rt = *prt; 1157 1158 if (dst_hold_safe(&rt->dst)) 1159 return true; 1160 if (net) { 1161 rt = net->ipv6.ip6_null_entry; 1162 dst_hold(&rt->dst); 1163 } else { 1164 rt = NULL; 1165 } 1166 *prt = rt; 1167 return false; 1168 } 1169 1170 /* called with rcu_lock held */ 1171 static struct rt6_info *ip6_create_rt_rcu(const struct fib6_result *res) 1172 { 1173 struct net_device *dev = res->nh->fib_nh_dev; 1174 struct fib6_info *f6i = res->f6i; 1175 unsigned short flags; 1176 struct rt6_info *nrt; 1177 1178 if (!fib6_info_hold_safe(f6i)) 1179 goto fallback; 1180 1181 flags = fib6_info_dst_flags(f6i); 1182 nrt = ip6_dst_alloc(dev_net(dev), dev, flags); 1183 if (!nrt) { 1184 fib6_info_release(f6i); 1185 goto fallback; 1186 } 1187 1188 ip6_rt_copy_init(nrt, res); 1189 return nrt; 1190 1191 fallback: 1192 nrt = dev_net(dev)->ipv6.ip6_null_entry; 1193 dst_hold(&nrt->dst); 1194 return nrt; 1195 } 1196 1197 INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_lookup(struct net *net, 1198 struct fib6_table *table, 1199 struct flowi6 *fl6, 1200 const struct sk_buff *skb, 1201 int flags) 1202 { 1203 struct fib6_result res = {}; 1204 struct fib6_node *fn; 
struct rt6_info *rt;

	/* the flow asked us not to match on the nexthop's output interface */
	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	res.f6i = rcu_dereference(fn->leaf);
	if (!res.f6i)
		res.f6i = net->ipv6.fib6_null_entry;
	else
		rt6_device_match(net, &res, &fl6->saddr, fl6->flowi6_oif,
				 flags);

	if (res.f6i == net->ipv6.fib6_null_entry) {
		/* nothing usable at this node: retry further up the tree */
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;

		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
		goto out;
	} else if (res.fib6_flags & RTF_REJECT) {
		/* reject routes skip path selection and the exception table */
		goto do_create;
	}

	fib6_select_path(net, &res, fl6, fl6->flowi6_oif,
			 fl6->flowi6_oif != 0, skb, flags);

	/* Search through exception table */
	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
	if (rt) {
		/* on failure ip6_hold_safe() swaps in the held null entry */
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);
	} else {
do_create:
		rt = ip6_create_rt_rcu(&res);
	}

out:
	trace_fib6_table_lookup(net, &res, table, fl6);

	rcu_read_unlock();

	return rt;
}

/* Resolve @fl6 through fib6_rule_lookup(), using ip6_pol_route_lookup()
 * as the lookup function, and return whatever dst it yields.
 */
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);

/* Look up a route to @daddr.
 *
 * Builds a flow from @daddr, @oif and (when non-NULL) @saddr and runs it
 * through fib6_rule_lookup() with ip6_pol_route_lookup(). A non-zero
 * @strict requests RT6_LOOKUP_F_IFACE; a non-NULL @saddr adds
 * RT6_LOOKUP_F_HAS_SADDR.
 *
 * Returns the route when the resulting dst carries no error; otherwise
 * the dst is released and NULL is returned.
 */
struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ?
RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	/* error route: drop the reference we were handed and report "none" */
	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);

/* ip6_ins_rt is called with table->tb6_lock FREE (not held).
 * It takes a new route entry; if the addition fails for any reason the
 * route is released.
 * Caller must hold dst before calling it.
 */

/* Insert @rt into its own table (rt->fib6_table) with fib6_add(), taking
 * the table's tb6_lock with bottom halves disabled around the call.
 * Returns the value returned by fib6_add().
 */
static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

/* Insert @rt using an nl_info that carries only @net, and no extack. */
int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = {	.nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}

/* Allocate an RTF_CACHE clone of the route in @res for destination
 * @daddr; the clone's destination is the /128 host address @daddr.
 *
 * For results where rt6_is_gw_or_nonexthop() is false, the clone is
 * additionally flagged RTF_ANYCAST when @daddr equals the (non-/128)
 * prefix address of the fib entry, and with CONFIG_IPV6_SUBTREES its
 * source key is narrowed to @saddr/128 when the entry has a source
 * prefix and @saddr is given.
 *
 * Returns NULL when a reference on res->f6i cannot be taken or the
 * allocation fails.
 */
static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct fib6_info *f6i = res->f6i;
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
*/

	if (!fib6_info_hold_safe(f6i))
		return NULL;

	dev = ip6_rt_get_dev_rcu(res);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt) {
		/* give back the f6i reference taken for the clone */
		fib6_info_release(f6i);
		return NULL;
	}

	ip6_rt_copy_init(rt, res);
	rt->rt6i_flags |= RTF_CACHE;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(res)) {
		if (f6i->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&f6i->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}

/* Allocate a per-cpu (RTF_PCPU) copy of the route in @res.
 *
 * The dst is allocated with DST_NOCOUNT in addition to the flags derived
 * from the fib entry. When the fib entry uses a nexthop object
 * (f6i->nh), the current rt_genid_ipv6() of the device's namespace is
 * recorded in pcpu_rt->sernum.
 *
 * Returns NULL when a reference on res->f6i cannot be taken or the
 * allocation fails.
 */
static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res)
{
	struct fib6_info *f6i = res->f6i;
	unsigned short flags = fib6_info_dst_flags(f6i);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	if (!fib6_info_hold_safe(f6i))
		return NULL;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(res);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags | DST_NOCOUNT);
	rcu_read_unlock();
	if (!pcpu_rt) {
		fib6_info_release(f6i);
		return NULL;
	}
	ip6_rt_copy_init(pcpu_rt, res);
	pcpu_rt->rt6i_flags |= RTF_PCPU;

	if (f6i->nh)
		pcpu_rt->sernum = rt_genid_ipv6(dev_net(dev));

	return pcpu_rt;
}

/* True when the sernum recorded in @rt6 still equals the current
 * rt_genid_ipv6() of its device's namespace.
 */
static bool rt6_is_valid(const struct rt6_info *rt6)
{
	return rt6->sernum == rt_genid_ipv6(dev_net(rt6->dst.dev));
}

/* Return this CPU's cached copy of the route for res->nh, or NULL.
 *
 * A cached copy that carries a non-zero sernum which is no longer valid
 * is removed from the per-cpu slot and released, and NULL is returned.
 *
 * It should be called with rcu_read_lock() acquired
 */
static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res)
{
	struct rt6_info *pcpu_rt;

	pcpu_rt = this_cpu_read(*res->nh->rt6i_pcpu);

	if (pcpu_rt && pcpu_rt->sernum && !rt6_is_valid(pcpu_rt)) {
		struct rt6_info *prev, **p;

		p = this_cpu_ptr(res->nh->rt6i_pcpu);
		prev = xchg(p, NULL);
if (prev) {
			dst_dev_put(&prev->dst);
			dst_release(&prev->dst);
		}

		pcpu_rt = NULL;
	}

	return pcpu_rt;
}

/* Allocate a per-cpu copy of the route in @res and install it in this
 * CPU's slot for res->nh.
 *
 * The slot is expected to be empty; finding it occupied is treated as a
 * fatal bug (BUG_ON). If the fib entry is marked fib6_destroying, the
 * new copy's 'from' pointer is cleared and that reference released.
 * @net is not used by this function's body.
 *
 * Returns the new copy, or NULL when the allocation failed.
 */
static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    const struct fib6_result *res)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(res);
	if (!pcpu_rt)
		return NULL;

	p = this_cpu_ptr(res->nh->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	if (res->f6i->fib6_destroying) {
		struct fib6_info *from;

		from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL);
		fib6_info_release(from);
	}

	return pcpu_rt;
}

/* exception hash table implementation
 */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 *
 * A NULL @bucket or @rt6_ex makes this a no-op. Otherwise the namespace's
 * fib_rt_cache counter and bucket->depth are both decremented, the
 * entry is unlinked with hlist_del_rcu() and freed with kfree_rcu().
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct fib6_info *from;
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	net->ipv6.rt6_stats->fib_rt_cache--;

	/* purge completely the exception to allow releasing the held resources:
	 * some [sk] cache may keep the dst around for unlimited time
	 */
	from = xchg((__force struct fib6_info **)&rt6_ex->rt6i->from, NULL);
	fib6_info_release(from);
	dst_dev_put(&rt6_ex->rt6i->dst);

	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
}

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 *
 * "Oldest" is the entry with the earliest stamp in the bucket's chain.
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	/* oldest is still NULL for an empty chain; the callee ignores NULL */
	rt6_remove_exception(bucket, oldest);
}

/* Hash the (@dst, @src) address pair into an exception-bucket index.
 *
 * The pair is hashed with siphash using a key that is filled with random
 * data once, on first use. @src only contributes when
 * CONFIG_IPV6_SUBTREES is enabled and @src is non-NULL; otherwise the
 * source half of the hashed structure stays zero. The 64-bit result is
 * reduced to FIB6_EXCEPTION_BUCKET_SIZE_SHIFT bits with hash_64().
 */
static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static siphash_aligned_key_t rt6_exception_key;
	struct {
		struct in6_addr dst;
		struct in6_addr src;
	} __aligned(SIPHASH_ALIGNMENT) combined = {
		.dst = *dst,
	};
	u64 val;

	net_get_random_once(&rt6_exception_key, sizeof(rt6_exception_key));

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		combined.src = *src;
#endif
	val = siphash(&combined, sizeof(combined), &rt6_exception_key);

	return hash_64(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 *
 * Returns the matching entry, or NULL when *@bucket or @daddr is NULL or
 * nothing matches. When both are non-NULL, *@bucket is advanced to the
 * hashed bucket even if no entry is found there.
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		/* the source address only takes part when a key was given */
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	/* from here on *bucket points at the hashed bucket, match or not */
	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		/* the source address only takes part when a key was given */
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

/* Return the MTU to use for the route in @res.
 *
 * The base value is f6i->fib6_pmtu when that is non-zero, otherwise the
 * mtu6 setting of the nexthop device's inet6_dev. It is capped at
 * IP6_MAX_MTU and then reduced by the nexthop's lwtunnel headroom.
 */
static unsigned int fib6_mtu(const struct fib6_result *res)
{
	const struct fib6_nh *nh = res->nh;
	unsigned int mtu;

	if (res->f6i->fib6_pmtu) {
		mtu = res->f6i->fib6_pmtu;
	} else {
		struct net_device *dev = nh->fib_nh_dev;
		struct inet6_dev *idev;

		/* NOTE(review): idev is used without a NULL check here -
		 * presumably the device always has an inet6_dev at this
		 * point; confirm against callers.
		 */
		rcu_read_lock();
		idev = __in6_dev_get(dev);
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
}

/* Flag kept in bit 0 of the nh->rt6i_exception_bucket pointer value */
#define FIB6_EXCEPTION_BUCKET_FLUSHED  0x1UL

/* used when the flushed bit is not relevant, only access to the bucket
 * (ie., all bucket users except rt6_insert_exception);
 *
 * called under rcu lock; sometimes called with rt6_exception_lock held
 *
 * A non-NULL @lock selects rcu_dereference_protected() with a lockdep
 * check on that lock; NULL selects plain rcu_dereference().
 */
static
struct rt6_exception_bucket *fib6_nh_get_excptn_bucket(const struct fib6_nh *nh,
						       spinlock_t *lock)
{
	struct rt6_exception_bucket *bucket;

	if (lock)
		bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
						   lockdep_is_held(lock));
	else
		bucket = rcu_dereference(nh->rt6i_exception_bucket);

	/* remove bucket
flushed bit if set */
	if (bucket) {
		unsigned long p = (unsigned long)bucket;

		p &= ~FIB6_EXCEPTION_BUCKET_FLUSHED;
		bucket = (struct rt6_exception_bucket *)p;
	}

	return bucket;
}

/* Test whether the FLUSHED bit is set in the raw bucket pointer value.
 * @bucket must be the pointer as stored in the nexthop, i.e. before the
 * bit has been masked off.
 */
static bool fib6_nh_excptn_bucket_flushed(struct rt6_exception_bucket *bucket)
{
	unsigned long p = (unsigned long)bucket;

	return !!(p & FIB6_EXCEPTION_BUCKET_FLUSHED);
}

/* Set the FLUSHED bit in nh->rt6i_exception_bucket and republish the
 * tagged pointer.
 *
 * called with rt6_exception_lock held
 */
static void fib6_nh_excptn_bucket_set_flushed(struct fib6_nh *nh,
					      spinlock_t *lock)
{
	struct rt6_exception_bucket *bucket;
	unsigned long p;

	bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
					   lockdep_is_held(lock));

	p = (unsigned long)bucket;
	p |= FIB6_EXCEPTION_BUCKET_FLUSHED;
	bucket = (struct rt6_exception_bucket *)p;
	rcu_assign_pointer(nh->rt6i_exception_bucket, bucket);
}

/* Insert the cached route @nrt into the exception table of res->nh.
 *
 * The bucket array is allocated on first use. An existing exception with
 * the same key is removed before the new one is added, and after the
 * insertion the bucket is trimmed, oldest entries first, down to a
 * randomised maximum depth.
 *
 * Returns 0 on success; -ENOMEM when an allocation fails; -EINVAL when
 * the bucket has been marked flushed or when @nrt's raw MTU metric is
 * not below fib6_mtu(@res). On success the fib entry's sernum is updated
 * under the table lock and fib6 garbage collection is started.
 */
static int rt6_insert_exception(struct rt6_info *nrt,
				const struct fib6_result *res)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct fib6_info *f6i = res->f6i;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_nh *nh = res->nh;
	int max_depth;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	/* raw pointer on purpose: the FLUSHED bit must be visible below */
	bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
					  lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(nh->rt6i_exception_bucket, bucket);
	} else if (fib6_nh_excptn_bucket_flushed(bucket)) {
		err = -EINVAL;
		goto out;
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* fib6_src.plen != 0 indicates f6i is in subtree
	 * and exception table is indexed by a hash of
	 * both fib6_dst and fib6_src.
* Otherwise, the exception table is indexed by
	 * a hash of only fib6_dst.
	 */
	if (f6i->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif
	/* rt6_mtu_change() might lower mtu on f6i.
	 * Only insert this exception route if its mtu
	 * is less than f6i's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(res)) {
		err = -EINVAL;
		goto out;
	}

	/* replace, rather than duplicate, an exception with the same key */
	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	/* Randomize max depth to avoid some side-channel attacks. */
	max_depth = FIB6_MAX_DEPTH + prandom_u32_max(FIB6_MAX_DEPTH);
	while (bucket->depth > max_depth)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&f6i->fib6_table->tb6_lock);
		fib6_update_sernum(net, f6i);
		spin_unlock_bh(&f6i->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}

/* Remove exceptions stored on @nh, under rt6_exception_lock.
 *
 * With @from == NULL every exception is removed and the bucket pointer
 * is marked flushed. Otherwise only the exceptions whose route's 'from'
 * pointer equals @from are removed.
 */
static void fib6_nh_flush_exceptions(struct fib6_nh *nh, struct fib6_info *from)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);

	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
	if (!bucket)
		goto out;

	/* Prevent rt6_insert_exception() from recreating the bucket list */
	if (!from)
		fib6_nh_excptn_bucket_set_flushed(nh, &rt6_exception_lock);

	for (i = 0; i <
FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		/* _safe iteration: entries are unlinked while we walk */
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) {
			if (!from ||
			    rcu_access_pointer(rt6_ex->rt6i->from) == from)
				rt6_remove_exception(bucket, rt6_ex);
		}
		/* a full flush must leave every bucket empty */
		WARN_ON_ONCE(!from && bucket->depth);
		bucket++;
	}
out:
	spin_unlock_bh(&rt6_exception_lock);
}

/* nexthop_for_each_fib6_nh() callback: flush the exceptions on @nh that
 * were created from the fib6_info passed in @arg. Always returns 0.
 */
static int rt6_nh_flush_exceptions(struct fib6_nh *nh, void *arg)
{
	struct fib6_info *f6i = arg;

	fib6_nh_flush_exceptions(nh, f6i);

	return 0;
}

/* Flush the exceptions that were created from @f6i: on every fib6_nh of
 * its nexthop object when it has one, otherwise on its own fib6_nh.
 */
void rt6_flush_exceptions(struct fib6_info *f6i)
{
	if (f6i->nh)
		nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_flush_exceptions,
					 f6i);
	else
		fib6_nh_flush_exceptions(f6i->fib6_nh, f6i);
}

/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 *
 * Looks in the exception table of res->nh for (@daddr, @saddr).
 * Returns the cached route, or NULL when there is no entry or the entry
 * found has expired.
 */
static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	const struct in6_addr *src_key = NULL;
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct rt6_info *ret = NULL;

#ifdef CONFIG_IPV6_SUBTREES
	/* fib6_src.plen != 0 indicates f6i is in subtree
	 * and exception table is indexed by a hash of
	 * both fib6_dst and fib6_src.
	 * However, the src addr used to create the hash
	 * might not be exactly the passed in saddr which
	 * is a /128 addr from the flow.
	 * So we need to use f6i->fib6_src to redo lookup
	 * if the passed in saddr does not find anything.
	 * (See the logic in ip6_rt_cache_alloc() on how
	 * rt->rt6i_src is updated.)
*/
	if (res->f6i->fib6_src.plen)
		src_key = saddr;
find_ex:
#endif
	/* the helper advances its bucket argument, so re-fetch every pass */
	bucket = fib6_nh_get_excptn_bucket(res->nh, NULL);
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	/* an expired exception is treated as "not found" */
	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		ret = rt6_ex->rt6i;

#ifdef CONFIG_IPV6_SUBTREES
	/* Use fib6_src as src_key and redo lookup */
	if (!ret && src_key && src_key != &res->f6i->fib6_src.addr) {
		src_key = &res->f6i->fib6_src.addr;
		goto find_ex;
	}
#endif

	return ret;
}

/* Remove the passed in cached rt from the hash table that contains it
 *
 * @plen is the source prefix length of the fib entry @rt came from; with
 * CONFIG_IPV6_SUBTREES a non-zero value makes the source address part of
 * the lookup key.
 *
 * Returns 0 when the exception was found and removed, -ENOENT when @nh
 * has no exception bucket or no matching entry.
 */
static int fib6_nh_remove_exception(const struct fib6_nh *nh, int plen,
				    const struct rt6_info *rt)
{
	const struct in6_addr *src_key = NULL;
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int err;

	/* cheap unlocked check so the common "no exceptions" case skips
	 * the lock
	 */
	if (!rcu_access_pointer(nh->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
*/
	if (plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}

/* Arguments passed to rt6_nh_remove_exception_rt() through the nexthop
 * iterator: the cached route to remove and the source prefix length of
 * the fib entry it came from.
 */
struct fib6_nh_excptn_arg {
	struct rt6_info	*rt;
	int		plen;
};

/* nexthop_for_each_fib6_nh() callback: try to remove arg->rt from @nh's
 * exception table. Returns 1 when the entry was found and removed,
 * 0 otherwise.
 */
static int rt6_nh_remove_exception_rt(struct fib6_nh *nh, void *_arg)
{
	struct fib6_nh_excptn_arg *arg = _arg;
	int err;

	err = fib6_nh_remove_exception(nh, arg->plen, arg->rt);
	if (err == 0)
		return 1;

	return 0;
}

/* Remove the cached route @rt from the exception table it lives in.
 *
 * Returns -EINVAL when @rt has no 'from' entry or is not an RTF_CACHE
 * route, -ENOENT when no matching exception exists, 0 on success.
 *
 * NOTE(review): rt->from is read with rcu_dereference(), which suggests
 * callers are expected to be in an RCU read-side section - confirm at
 * the call sites.
 */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);
	if (!from || !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (from->nh) {
		struct fib6_nh_excptn_arg arg = {
			.rt = rt,
			.plen = from->fib6_src.plen
		};
		int rc;

		/* rc = 1 means an entry was found */
		rc = nexthop_for_each_fib6_nh(from->nh,
					      rt6_nh_remove_exception_rt,
					      &arg);
		return rc ? 0 : -ENOENT;
	}

	return fib6_nh_remove_exception(from->fib6_nh,
					from->fib6_src.plen, rt);
}

/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 *
 * The lookup goes through the RCU variant of the find helper, which
 * warns if rcu_read_lock() is not held. Nothing happens when no
 * matching exception exists.
 */
static void fib6_nh_update_exception(const struct fib6_nh *nh, int plen,
				     const struct rt6_info *rt)
{
	const struct in6_addr *src_key = NULL;
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;

	bucket = fib6_nh_get_excptn_bucket(nh, NULL);
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
*/
	if (plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, &rt->rt6i_dst.addr, src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;
}

/* Search criteria and result slot for fib6_nh_find_match(): @dev and @gw
 * are inputs (@gw may be NULL, meaning "no gateway"), @match is set to
 * the matching fib6_nh.
 */
struct fib6_nh_match_arg {
	const struct net_device *dev;
	const struct in6_addr	*gw;
	struct fib6_nh		*match;
};

/* determine if fib6_nh has given device and gateway
 *
 * A nexthop matches when its device equals arg->dev and either both
 * sides have no gateway, or both have one and the addresses are equal.
 * Returns 1 (and stores @nh in arg->match) on a match, 0 otherwise.
 */
static int fib6_nh_find_match(struct fib6_nh *nh, void *_arg)
{
	struct fib6_nh_match_arg *arg = _arg;

	if (arg->dev != nh->fib_nh_dev ||
	    (arg->gw && !nh->fib_nh_gw_family) ||
	    (!arg->gw && nh->fib_nh_gw_family) ||
	    (arg->gw && !ipv6_addr_equal(arg->gw, &nh->fib_nh_gw6)))
		return 0;

	arg->match = nh;

	/* found a match, break the loop */
	return 1;
}

/* Refresh the stamp of the exception entry that holds the cached route
 * @rt.
 *
 * Does nothing when @rt has no 'from' entry or is not RTF_CACHE. For a
 * fib entry backed by a nexthop object, the fib6_nh whose device and
 * gateway match @rt is looked up first; if none matches, nothing is
 * updated. Runs inside its own rcu_read_lock() section.
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct fib6_info *from;
	struct fib6_nh *fib6_nh;

	rcu_read_lock();

	from = rcu_dereference(rt->from);
	if (!from || !(rt->rt6i_flags & RTF_CACHE))
		goto unlock;

	if (from->nh) {
		struct fib6_nh_match_arg arg = {
			.dev = rt->dst.dev,
			.gw = &rt->rt6i_gateway,
		};

		nexthop_for_each_fib6_nh(from->nh, fib6_nh_find_match, &arg);

		if (!arg.match)
			goto unlock;
		fib6_nh = arg.match;
	} else {
		fib6_nh = from->fib6_nh;
	}
	fib6_nh_update_exception(fib6_nh, from->fib6_src.plen, rt);
unlock:
	rcu_read_unlock();
}

/* Decide whether the PMTU of the cached route @rt may be set to @mtu
 * after the MTU of @idev's device changed. Returns true when the update
 * is allowed.
 */
static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
{
	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
*
	 * If the new MTU is higher, and the route PMTU is equal to the local
	 * MTU, this means the old MTU is the lowest in the path, so allow
	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
	 * handle this.
	 */

	/* first case above: new MTU is not higher than the route PMTU */
	if (dst_mtu(&rt->dst) >= mtu)
		return true;

	/* second case above: route PMTU equals idev's configured mtu6 */
	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
		return true;

	return false;
}

/* Apply the new device MTU @mtu to the exception routes stored on @nh.
 *
 * Only exceptions that carry an explicit (non-zero) raw MTU metric are
 * considered, and each is updated only if
 * rt6_mtu_change_route_allowed() agrees.
 *
 * The bucket is fetched with a lockdep check on rt6_exception_lock, so
 * the caller is expected to hold that lock.
 */
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       const struct fib6_nh *nh, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
*/
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}

/* Both flags must be set for an exception to count as a cached gateway
 * route in fib6_nh_exceptions_clean_tohost().
 */
#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

/* Remove from @nh's exception table every cached gateway route
 * (RTF_GATEWAY and RTF_CACHE both set) whose gateway address equals
 * @gateway. Takes rt6_exception_lock itself.
 */
static void fib6_nh_exceptions_clean_tohost(const struct fib6_nh *nh,
					    const struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* unlocked fast path: nothing to do without an exception table */
	if (!rcu_access_pointer(nh->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}

/* Garbage-collection check for a single exception entry.
 *
 * The entry is removed when it has aged out, when it has expired, or
 * when it is a gateway route whose gateway neighbour is missing or not
 * flagged as a router. An entry that survives increments gc_args->more.
 *
 * The only caller in this file invokes it with rt6_exception_lock held.
 */
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others have still references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g.
pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		/* no expiry set: age on last use against the gc timeout */
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		/* note: compared against jiffies, not the caller's @now */
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);

		if (!(neigh && (neigh->flags & NTF_ROUTER))) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	/* the entry stays: tell the gc there is still work for later */
	gc_args->more++;
}

/* Run rt6_age_examine_exception() on every exception stored on @nh.
 *
 * The walk is done inside rcu_read_lock_bh() with rt6_exception_lock
 * held; it is skipped entirely when @nh has no exception table.
 */
static void fib6_nh_age_exceptions(const struct fib6_nh *nh,
				   struct fib6_gc_args *gc_args,
				   unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(nh->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			/* _safe: the callee may unlink the current entry */
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}

/* Arguments passed to rt6_nh_age_exceptions() through the nexthop
 * iterator.
 */
struct fib6_nh_age_excptn_arg {
	struct fib6_gc_args	*gc_args;
	unsigned long		now;
};

/* nexthop_for_each_fib6_nh() callback: age the exceptions stored on @nh.
 * Always returns 0.
 */
static int rt6_nh_age_exceptions(struct fib6_nh *nh, void *_arg)
{
	struct fib6_nh_age_excptn_arg *arg = _arg;

	fib6_nh_age_exceptions(nh, arg->gc_args, arg->now);
	return 0;
}

/* Age the exceptions hanging off @f6i: on every fib6_nh of its nexthop
 * object when it has one, otherwise on its own fib6_nh. @now is the
 * timestamp the ageing decisions are made against.
 */
void rt6_age_exceptions(struct fib6_info *f6i,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	if (f6i->nh) {
		struct fib6_nh_age_excptn_arg arg = {
			.gc_args = gc_args,
			.now = now
		};

		nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_age_exceptions,
					 &arg);
	} else {
		fib6_nh_age_exceptions(f6i->fib6_nh, gc_args, now);
	}
}

/* Look up @fl6 in @table and leave the selected route in @res.
 *
 * Starting from the best matching node, rt6_select() is retried on the
 * nodes returned by fib6_backtrack() for as long as it yields the null
 * entry. If that search fails and RT6_LOOKUP_F_REACHABLE was requested
 * in @strict, the whole walk is repeated from the original node without
 * that flag. @oif is ignored (treated as 0) for flows flagged
 * FLOWI_FLAG_SKIP_NH_OIF.
 *
 * Always returns 0; callers check res->f6i for the outcome.
 *
 * must be called with rcu lock held
 */
int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif,
		      struct flowi6 *fl6, struct fib6_result *res, int strict)
{
	struct fib6_node *fn, *saved_fn;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	rt6_select(net, fn, oif, res, strict);
	if (res->f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, res, table, fl6);

	return 0;
}

/* Look up the route for @fl6 in @table and return a dst for it.
 *
 * The result is one of: a matching cached exception route; an uncached
 * RTF_CACHE clone (only for FLOWI_FLAG_KNOWN_NH flows whose selected
 * nexthop has no gateway); or this CPU's per-cpu copy of the selected
 * route, created on demand. net->ipv6.ip6_null_entry is returned when
 * nothing usable is found.
 *
 * Unless RT6_LOOKUP_F_DST_NOREF is set in @flags, a reference is taken
 * on the returned route. The uncached clone is always returned with the
 * reference taken when it was allocated. Callers passing
 * RT6_LOOKUP_F_DST_NOREF are expected to hold rcu_read_lock(); this is
 * checked with WARN_ON_ONCE().
 */
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_result res = {};
	struct rt6_info *rt = NULL;
	int strict = 0;

	WARN_ON_ONCE((flags & RT6_LOOKUP_F_DST_NOREF) &&
		     !rcu_read_lock_held());

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	/* with forwarding disabled, ask for reachable routers first */
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	fib6_table_lookup(net, table, oif, fl6, &res, strict);
	if (res.f6i == net->ipv6.fib6_null_entry)
		goto out;

fib6_select_path(net, &res, fl6, oif, false, skb, strict);

	/* Search through exception table */
	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
	if (rt) {
		goto out;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !res.nh->fib_nh_gw_family)) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);

		if (rt) {
			/* 1 refcnt is taken during ip6_rt_cache_alloc().
			 * As rt6_uncached_list_add() does not consume refcnt,
			 * this refcnt is always returned to the caller even
			 * if caller sets RT6_LOOKUP_F_DST_NOREF flag.
			 */
			rt6_uncached_list_add(rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
			rcu_read_unlock();

			return rt;
		}
		/* allocation failed: fall through to the null entry below */
	} else {
		/* Get a percpu copy */
		local_bh_disable();
		rt = rt6_get_pcpu_route(&res);

		if (!rt)
			rt = rt6_make_pcpu_route(net, &res);

		local_bh_enable();
	}
out:
	if (!rt)
		rt = net->ipv6.ip6_null_entry;
	/* on failure ip6_hold_safe() swaps in the held null entry */
	if (!(flags & RT6_LOOKUP_F_DST_NOREF))
		ip6_hold_safe(net, &rt);
	rcu_read_unlock();

	return rt;
}
EXPORT_SYMBOL_GPL(ip6_pol_route);

/* Lookup function for the input path: ip6_pol_route() with the flow's
 * input interface (fl6->flowi6_iif) passed as the interface argument.
 */
INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
}

/* Route an incoming flow through fib6_rule_lookup() using
 * ip6_pol_route_input().
 *
 * RT6_LOOKUP_F_IFACE is added to @flags when rt6_need_strict() says the
 * destination address requires it, except for packets received on an
 * ARPHRD_PIMREG device.
 */
struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6,
					 const struct sk_buff *skb,
					 int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return
fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);

/* Fill the L3 multipath hash keys (source/destination address, flow
 * label and protocol) in @keys for @skb.
 *
 * For an ICMPv6 error message whose embedded IPv6 header can be read,
 * the keys are taken from that embedded (inner) header and @flkeys is
 * ignored. In every other case the pre-dissected @flkeys are used when
 * provided, falling back to the packet's outer IPv6 header when @flkeys
 * is NULL.
 */
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	if (!icmpv6_is_err(icmph->icmp6_type))
		goto out;

	/* the header of the offending packet follows the ICMPv6 header */
	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	/* hash on the inner header; the dissected keys no longer apply */
	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowlabel(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}

/* Compute the outer-header part of the custom multipath hash for @skb.
 *
 * Only the fields selected by ip6_multipath_hash_fields() are copied
 * into the hashed key set. Returns 0, without touching *@p_has_inner,
 * when no outer field is selected. Otherwise *@p_has_inner reports
 * whether the dissector saw an encapsulation (dissection stops at the
 * encapsulation boundary).
 */
static u32 rt6_multipath_custom_hash_outer(const struct net *net,
					   const struct sk_buff *skb,
					   bool *p_has_inner)
{
	u32 hash_fields = ip6_multipath_hash_fields(net);
	struct flow_keys keys, hash_keys;

	if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
		return 0;

	memset(&hash_keys, 0, sizeof(hash_keys));
	skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_ENCAP);

	hash_keys.control.addr_type =
FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2352 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP) 2353 hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; 2354 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP) 2355 hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst; 2356 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO) 2357 hash_keys.basic.ip_proto = keys.basic.ip_proto; 2358 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL) 2359 hash_keys.tags.flow_label = keys.tags.flow_label; 2360 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) 2361 hash_keys.ports.src = keys.ports.src; 2362 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) 2363 hash_keys.ports.dst = keys.ports.dst; 2364 2365 *p_has_inner = !!(keys.control.flags & FLOW_DIS_ENCAPSULATION); 2366 return flow_hash_from_keys(&hash_keys); 2367 } 2368 2369 static u32 rt6_multipath_custom_hash_inner(const struct net *net, 2370 const struct sk_buff *skb, 2371 bool has_inner) 2372 { 2373 u32 hash_fields = ip6_multipath_hash_fields(net); 2374 struct flow_keys keys, hash_keys; 2375 2376 /* We assume the packet carries an encapsulation, but if none was 2377 * encountered during dissection of the outer flow, then there is no 2378 * point in calling the flow dissector again. 
2379 */ 2380 if (!has_inner) 2381 return 0; 2382 2383 if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_MASK)) 2384 return 0; 2385 2386 memset(&hash_keys, 0, sizeof(hash_keys)); 2387 skb_flow_dissect_flow_keys(skb, &keys, 0); 2388 2389 if (!(keys.control.flags & FLOW_DIS_ENCAPSULATION)) 2390 return 0; 2391 2392 if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { 2393 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; 2394 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP) 2395 hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; 2396 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP) 2397 hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst; 2398 } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { 2399 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2400 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP) 2401 hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; 2402 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP) 2403 hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst; 2404 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL) 2405 hash_keys.tags.flow_label = keys.tags.flow_label; 2406 } 2407 2408 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO) 2409 hash_keys.basic.ip_proto = keys.basic.ip_proto; 2410 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT) 2411 hash_keys.ports.src = keys.ports.src; 2412 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT) 2413 hash_keys.ports.dst = keys.ports.dst; 2414 2415 return flow_hash_from_keys(&hash_keys); 2416 } 2417 2418 static u32 rt6_multipath_custom_hash_skb(const struct net *net, 2419 const struct sk_buff *skb) 2420 { 2421 u32 mhash, mhash_inner; 2422 bool has_inner = true; 2423 2424 mhash = rt6_multipath_custom_hash_outer(net, skb, &has_inner); 2425 mhash_inner = rt6_multipath_custom_hash_inner(net, skb, has_inner); 2426 2427 return jhash_2words(mhash, mhash_inner, 0); 2428 } 2429 2430 static u32 
rt6_multipath_custom_hash_fl6(const struct net *net, 2431 const struct flowi6 *fl6) 2432 { 2433 u32 hash_fields = ip6_multipath_hash_fields(net); 2434 struct flow_keys hash_keys; 2435 2436 if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK)) 2437 return 0; 2438 2439 memset(&hash_keys, 0, sizeof(hash_keys)); 2440 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2441 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP) 2442 hash_keys.addrs.v6addrs.src = fl6->saddr; 2443 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP) 2444 hash_keys.addrs.v6addrs.dst = fl6->daddr; 2445 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO) 2446 hash_keys.basic.ip_proto = fl6->flowi6_proto; 2447 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL) 2448 hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); 2449 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) 2450 hash_keys.ports.src = fl6->fl6_sport; 2451 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) 2452 hash_keys.ports.dst = fl6->fl6_dport; 2453 2454 return flow_hash_from_keys(&hash_keys); 2455 } 2456 2457 /* if skb is set it will be used and fl6 can be NULL */ 2458 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, 2459 const struct sk_buff *skb, struct flow_keys *flkeys) 2460 { 2461 struct flow_keys hash_keys; 2462 u32 mhash = 0; 2463 2464 switch (ip6_multipath_hash_policy(net)) { 2465 case 0: 2466 memset(&hash_keys, 0, sizeof(hash_keys)); 2467 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2468 if (skb) { 2469 ip6_multipath_l3_keys(skb, &hash_keys, flkeys); 2470 } else { 2471 hash_keys.addrs.v6addrs.src = fl6->saddr; 2472 hash_keys.addrs.v6addrs.dst = fl6->daddr; 2473 hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); 2474 hash_keys.basic.ip_proto = fl6->flowi6_proto; 2475 } 2476 mhash = flow_hash_from_keys(&hash_keys); 2477 break; 2478 case 1: 2479 if (skb) { 2480 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP; 2481 struct flow_keys 
keys; 2482 2483 /* short-circuit if we already have L4 hash present */ 2484 if (skb->l4_hash) 2485 return skb_get_hash_raw(skb) >> 1; 2486 2487 memset(&hash_keys, 0, sizeof(hash_keys)); 2488 2489 if (!flkeys) { 2490 skb_flow_dissect_flow_keys(skb, &keys, flag); 2491 flkeys = &keys; 2492 } 2493 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2494 hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src; 2495 hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst; 2496 hash_keys.ports.src = flkeys->ports.src; 2497 hash_keys.ports.dst = flkeys->ports.dst; 2498 hash_keys.basic.ip_proto = flkeys->basic.ip_proto; 2499 } else { 2500 memset(&hash_keys, 0, sizeof(hash_keys)); 2501 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2502 hash_keys.addrs.v6addrs.src = fl6->saddr; 2503 hash_keys.addrs.v6addrs.dst = fl6->daddr; 2504 hash_keys.ports.src = fl6->fl6_sport; 2505 hash_keys.ports.dst = fl6->fl6_dport; 2506 hash_keys.basic.ip_proto = fl6->flowi6_proto; 2507 } 2508 mhash = flow_hash_from_keys(&hash_keys); 2509 break; 2510 case 2: 2511 memset(&hash_keys, 0, sizeof(hash_keys)); 2512 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2513 if (skb) { 2514 struct flow_keys keys; 2515 2516 if (!flkeys) { 2517 skb_flow_dissect_flow_keys(skb, &keys, 0); 2518 flkeys = &keys; 2519 } 2520 2521 /* Inner can be v4 or v6 */ 2522 if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { 2523 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; 2524 hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src; 2525 hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst; 2526 } else if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { 2527 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2528 hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src; 2529 hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst; 2530 hash_keys.tags.flow_label = flkeys->tags.flow_label; 2531 hash_keys.basic.ip_proto = 
flkeys->basic.ip_proto; 2532 } else { 2533 /* Same as case 0 */ 2534 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2535 ip6_multipath_l3_keys(skb, &hash_keys, flkeys); 2536 } 2537 } else { 2538 /* Same as case 0 */ 2539 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2540 hash_keys.addrs.v6addrs.src = fl6->saddr; 2541 hash_keys.addrs.v6addrs.dst = fl6->daddr; 2542 hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); 2543 hash_keys.basic.ip_proto = fl6->flowi6_proto; 2544 } 2545 mhash = flow_hash_from_keys(&hash_keys); 2546 break; 2547 case 3: 2548 if (skb) 2549 mhash = rt6_multipath_custom_hash_skb(net, skb); 2550 else 2551 mhash = rt6_multipath_custom_hash_fl6(net, fl6); 2552 break; 2553 } 2554 2555 return mhash >> 1; 2556 } 2557 2558 /* Called with rcu held */ 2559 void ip6_route_input(struct sk_buff *skb) 2560 { 2561 const struct ipv6hdr *iph = ipv6_hdr(skb); 2562 struct net *net = dev_net(skb->dev); 2563 int flags = RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_DST_NOREF; 2564 struct ip_tunnel_info *tun_info; 2565 struct flowi6 fl6 = { 2566 .flowi6_iif = skb->dev->ifindex, 2567 .daddr = iph->daddr, 2568 .saddr = iph->saddr, 2569 .flowlabel = ip6_flowinfo(iph), 2570 .flowi6_mark = skb->mark, 2571 .flowi6_proto = iph->nexthdr, 2572 }; 2573 struct flow_keys *flkeys = NULL, _flkeys; 2574 2575 tun_info = skb_tunnel_info(skb); 2576 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX)) 2577 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id; 2578 2579 if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys)) 2580 flkeys = &_flkeys; 2581 2582 if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6)) 2583 fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys); 2584 skb_dst_drop(skb); 2585 skb_dst_set_noref(skb, ip6_route_input_lookup(net, skb->dev, 2586 &fl6, skb, flags)); 2587 } 2588 2589 INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_output(struct net *net, 2590 struct fib6_table *table, 2591 struct flowi6 *fl6, 2592 const struct 
sk_buff *skb, 2593 int flags) 2594 { 2595 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags); 2596 } 2597 2598 struct dst_entry *ip6_route_output_flags_noref(struct net *net, 2599 const struct sock *sk, 2600 struct flowi6 *fl6, int flags) 2601 { 2602 bool any_src; 2603 2604 if (ipv6_addr_type(&fl6->daddr) & 2605 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) { 2606 struct dst_entry *dst; 2607 2608 /* This function does not take refcnt on the dst */ 2609 dst = l3mdev_link_scope_lookup(net, fl6); 2610 if (dst) 2611 return dst; 2612 } 2613 2614 fl6->flowi6_iif = LOOPBACK_IFINDEX; 2615 2616 flags |= RT6_LOOKUP_F_DST_NOREF; 2617 any_src = ipv6_addr_any(&fl6->saddr); 2618 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) || 2619 (fl6->flowi6_oif && any_src)) 2620 flags |= RT6_LOOKUP_F_IFACE; 2621 2622 if (!any_src) 2623 flags |= RT6_LOOKUP_F_HAS_SADDR; 2624 else if (sk) 2625 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs); 2626 2627 return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output); 2628 } 2629 EXPORT_SYMBOL_GPL(ip6_route_output_flags_noref); 2630 2631 struct dst_entry *ip6_route_output_flags(struct net *net, 2632 const struct sock *sk, 2633 struct flowi6 *fl6, 2634 int flags) 2635 { 2636 struct dst_entry *dst; 2637 struct rt6_info *rt6; 2638 2639 rcu_read_lock(); 2640 dst = ip6_route_output_flags_noref(net, sk, fl6, flags); 2641 rt6 = (struct rt6_info *)dst; 2642 /* For dst cached in uncached_list, refcnt is already taken. 
*/ 2643 if (list_empty(&rt6->rt6i_uncached) && !dst_hold_safe(dst)) { 2644 dst = &net->ipv6.ip6_null_entry->dst; 2645 dst_hold(dst); 2646 } 2647 rcu_read_unlock(); 2648 2649 return dst; 2650 } 2651 EXPORT_SYMBOL_GPL(ip6_route_output_flags); 2652 2653 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) 2654 { 2655 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig; 2656 struct net_device *loopback_dev = net->loopback_dev; 2657 struct dst_entry *new = NULL; 2658 2659 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1, 2660 DST_OBSOLETE_DEAD, 0); 2661 if (rt) { 2662 rt6_info_init(rt); 2663 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); 2664 2665 new = &rt->dst; 2666 new->__use = 1; 2667 new->input = dst_discard; 2668 new->output = dst_discard_out; 2669 2670 dst_copy_metrics(new, &ort->dst); 2671 2672 rt->rt6i_idev = in6_dev_get(loopback_dev); 2673 rt->rt6i_gateway = ort->rt6i_gateway; 2674 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU; 2675 2676 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); 2677 #ifdef CONFIG_IPV6_SUBTREES 2678 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); 2679 #endif 2680 } 2681 2682 dst_release(dst_orig); 2683 return new ? 
new : ERR_PTR(-ENOMEM); 2684 } 2685 2686 /* 2687 * Destination cache support functions 2688 */ 2689 2690 static bool fib6_check(struct fib6_info *f6i, u32 cookie) 2691 { 2692 u32 rt_cookie = 0; 2693 2694 if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie) 2695 return false; 2696 2697 if (fib6_check_expired(f6i)) 2698 return false; 2699 2700 return true; 2701 } 2702 2703 static struct dst_entry *rt6_check(struct rt6_info *rt, 2704 struct fib6_info *from, 2705 u32 cookie) 2706 { 2707 u32 rt_cookie = 0; 2708 2709 if (!from || !fib6_get_cookie_safe(from, &rt_cookie) || 2710 rt_cookie != cookie) 2711 return NULL; 2712 2713 if (rt6_check_expired(rt)) 2714 return NULL; 2715 2716 return &rt->dst; 2717 } 2718 2719 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, 2720 struct fib6_info *from, 2721 u32 cookie) 2722 { 2723 if (!__rt6_check_expired(rt) && 2724 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && 2725 fib6_check(from, cookie)) 2726 return &rt->dst; 2727 else 2728 return NULL; 2729 } 2730 2731 INDIRECT_CALLABLE_SCOPE struct dst_entry *ip6_dst_check(struct dst_entry *dst, 2732 u32 cookie) 2733 { 2734 struct dst_entry *dst_ret; 2735 struct fib6_info *from; 2736 struct rt6_info *rt; 2737 2738 rt = container_of(dst, struct rt6_info, dst); 2739 2740 if (rt->sernum) 2741 return rt6_is_valid(rt) ? dst : NULL; 2742 2743 rcu_read_lock(); 2744 2745 /* All IPV6 dsts are created with ->obsolete set to the value 2746 * DST_OBSOLETE_FORCE_CHK which forces validation calls down 2747 * into this function always. 
2748 */ 2749 2750 from = rcu_dereference(rt->from); 2751 2752 if (from && (rt->rt6i_flags & RTF_PCPU || 2753 unlikely(!list_empty(&rt->rt6i_uncached)))) 2754 dst_ret = rt6_dst_from_check(rt, from, cookie); 2755 else 2756 dst_ret = rt6_check(rt, from, cookie); 2757 2758 rcu_read_unlock(); 2759 2760 return dst_ret; 2761 } 2762 EXPORT_INDIRECT_CALLABLE(ip6_dst_check); 2763 2764 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) 2765 { 2766 struct rt6_info *rt = (struct rt6_info *) dst; 2767 2768 if (rt) { 2769 if (rt->rt6i_flags & RTF_CACHE) { 2770 rcu_read_lock(); 2771 if (rt6_check_expired(rt)) { 2772 rt6_remove_exception_rt(rt); 2773 dst = NULL; 2774 } 2775 rcu_read_unlock(); 2776 } else { 2777 dst_release(dst); 2778 dst = NULL; 2779 } 2780 } 2781 return dst; 2782 } 2783 2784 static void ip6_link_failure(struct sk_buff *skb) 2785 { 2786 struct rt6_info *rt; 2787 2788 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); 2789 2790 rt = (struct rt6_info *) skb_dst(skb); 2791 if (rt) { 2792 rcu_read_lock(); 2793 if (rt->rt6i_flags & RTF_CACHE) { 2794 rt6_remove_exception_rt(rt); 2795 } else { 2796 struct fib6_info *from; 2797 struct fib6_node *fn; 2798 2799 from = rcu_dereference(rt->from); 2800 if (from) { 2801 fn = rcu_dereference(from->fib6_node); 2802 if (fn && (rt->rt6i_flags & RTF_DEFAULT)) 2803 fn->fn_sernum = -1; 2804 } 2805 } 2806 rcu_read_unlock(); 2807 } 2808 } 2809 2810 static void rt6_update_expires(struct rt6_info *rt0, int timeout) 2811 { 2812 if (!(rt0->rt6i_flags & RTF_EXPIRES)) { 2813 struct fib6_info *from; 2814 2815 rcu_read_lock(); 2816 from = rcu_dereference(rt0->from); 2817 if (from) 2818 rt0->dst.expires = from->expires; 2819 rcu_read_unlock(); 2820 } 2821 2822 dst_set_expires(&rt0->dst, timeout); 2823 rt0->rt6i_flags |= RTF_EXPIRES; 2824 } 2825 2826 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) 2827 { 2828 struct net *net = dev_net(rt->dst.dev); 2829 2830 dst_metric_set(&rt->dst, RTAX_MTU, mtu); 2831 
rt->rt6i_flags |= RTF_MODIFIED; 2832 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires); 2833 } 2834 2835 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) 2836 { 2837 return !(rt->rt6i_flags & RTF_CACHE) && 2838 (rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from)); 2839 } 2840 2841 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, 2842 const struct ipv6hdr *iph, u32 mtu, 2843 bool confirm_neigh) 2844 { 2845 const struct in6_addr *daddr, *saddr; 2846 struct rt6_info *rt6 = (struct rt6_info *)dst; 2847 2848 /* Note: do *NOT* check dst_metric_locked(dst, RTAX_MTU) 2849 * IPv6 pmtu discovery isn't optional, so 'mtu lock' cannot disable it. 2850 * [see also comment in rt6_mtu_change_route()] 2851 */ 2852 2853 if (iph) { 2854 daddr = &iph->daddr; 2855 saddr = &iph->saddr; 2856 } else if (sk) { 2857 daddr = &sk->sk_v6_daddr; 2858 saddr = &inet6_sk(sk)->saddr; 2859 } else { 2860 daddr = NULL; 2861 saddr = NULL; 2862 } 2863 2864 if (confirm_neigh) 2865 dst_confirm_neigh(dst, daddr); 2866 2867 if (mtu < IPV6_MIN_MTU) 2868 return; 2869 if (mtu >= dst_mtu(dst)) 2870 return; 2871 2872 if (!rt6_cache_allowed_for_pmtu(rt6)) { 2873 rt6_do_update_pmtu(rt6, mtu); 2874 /* update rt6_ex->stamp for cache */ 2875 if (rt6->rt6i_flags & RTF_CACHE) 2876 rt6_update_exception_stamp_rt(rt6); 2877 } else if (daddr) { 2878 struct fib6_result res = {}; 2879 struct rt6_info *nrt6; 2880 2881 rcu_read_lock(); 2882 res.f6i = rcu_dereference(rt6->from); 2883 if (!res.f6i) 2884 goto out_unlock; 2885 2886 res.fib6_flags = res.f6i->fib6_flags; 2887 res.fib6_type = res.f6i->fib6_type; 2888 2889 if (res.f6i->nh) { 2890 struct fib6_nh_match_arg arg = { 2891 .dev = dst->dev, 2892 .gw = &rt6->rt6i_gateway, 2893 }; 2894 2895 nexthop_for_each_fib6_nh(res.f6i->nh, 2896 fib6_nh_find_match, &arg); 2897 2898 /* fib6_info uses a nexthop that does not have fib6_nh 2899 * using the dst->dev + gw. Should be impossible. 
2900 */ 2901 if (!arg.match) 2902 goto out_unlock; 2903 2904 res.nh = arg.match; 2905 } else { 2906 res.nh = res.f6i->fib6_nh; 2907 } 2908 2909 nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr); 2910 if (nrt6) { 2911 rt6_do_update_pmtu(nrt6, mtu); 2912 if (rt6_insert_exception(nrt6, &res)) 2913 dst_release_immediate(&nrt6->dst); 2914 } 2915 out_unlock: 2916 rcu_read_unlock(); 2917 } 2918 } 2919 2920 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, 2921 struct sk_buff *skb, u32 mtu, 2922 bool confirm_neigh) 2923 { 2924 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu, 2925 confirm_neigh); 2926 } 2927 2928 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, 2929 int oif, u32 mark, kuid_t uid) 2930 { 2931 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2932 struct dst_entry *dst; 2933 struct flowi6 fl6 = { 2934 .flowi6_oif = oif, 2935 .flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark), 2936 .daddr = iph->daddr, 2937 .saddr = iph->saddr, 2938 .flowlabel = ip6_flowinfo(iph), 2939 .flowi6_uid = uid, 2940 }; 2941 2942 dst = ip6_route_output(net, NULL, &fl6); 2943 if (!dst->error) 2944 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu), true); 2945 dst_release(dst); 2946 } 2947 EXPORT_SYMBOL_GPL(ip6_update_pmtu); 2948 2949 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) 2950 { 2951 int oif = sk->sk_bound_dev_if; 2952 struct dst_entry *dst; 2953 2954 if (!oif && skb->dev) 2955 oif = l3mdev_master_ifindex(skb->dev); 2956 2957 ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid); 2958 2959 dst = __sk_dst_get(sk); 2960 if (!dst || !dst->obsolete || 2961 dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) 2962 return; 2963 2964 bh_lock_sock(sk); 2965 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) 2966 ip6_datagram_dst_update(sk, false); 2967 bh_unlock_sock(sk); 2968 } 2969 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu); 2970 2971 void 
ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, 2972 const struct flowi6 *fl6) 2973 { 2974 #ifdef CONFIG_IPV6_SUBTREES 2975 struct ipv6_pinfo *np = inet6_sk(sk); 2976 #endif 2977 2978 ip6_dst_store(sk, dst, 2979 ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ? 2980 &sk->sk_v6_daddr : NULL, 2981 #ifdef CONFIG_IPV6_SUBTREES 2982 ipv6_addr_equal(&fl6->saddr, &np->saddr) ? 2983 &np->saddr : 2984 #endif 2985 NULL); 2986 } 2987 2988 static bool ip6_redirect_nh_match(const struct fib6_result *res, 2989 struct flowi6 *fl6, 2990 const struct in6_addr *gw, 2991 struct rt6_info **ret) 2992 { 2993 const struct fib6_nh *nh = res->nh; 2994 2995 if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family || 2996 fl6->flowi6_oif != nh->fib_nh_dev->ifindex) 2997 return false; 2998 2999 /* rt_cache's gateway might be different from its 'parent' 3000 * in the case of an ip redirect. 3001 * So we keep searching in the exception table if the gateway 3002 * is different. 3003 */ 3004 if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) { 3005 struct rt6_info *rt_cache; 3006 3007 rt_cache = rt6_find_cached_rt(res, &fl6->daddr, &fl6->saddr); 3008 if (rt_cache && 3009 ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) { 3010 *ret = rt_cache; 3011 return true; 3012 } 3013 return false; 3014 } 3015 return true; 3016 } 3017 3018 struct fib6_nh_rd_arg { 3019 struct fib6_result *res; 3020 struct flowi6 *fl6; 3021 const struct in6_addr *gw; 3022 struct rt6_info **ret; 3023 }; 3024 3025 static int fib6_nh_redirect_match(struct fib6_nh *nh, void *_arg) 3026 { 3027 struct fib6_nh_rd_arg *arg = _arg; 3028 3029 arg->res->nh = nh; 3030 return ip6_redirect_nh_match(arg->res, arg->fl6, arg->gw, arg->ret); 3031 } 3032 3033 /* Handle redirects */ 3034 struct ip6rd_flowi { 3035 struct flowi6 fl6; 3036 struct in6_addr gateway; 3037 }; 3038 3039 INDIRECT_CALLABLE_SCOPE struct rt6_info *__ip6_route_redirect(struct net *net, 3040 struct fib6_table *table, 3041 struct flowi6 *fl6, 3042 const struct sk_buff *skb, 
3043 int flags) 3044 { 3045 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; 3046 struct rt6_info *ret = NULL; 3047 struct fib6_result res = {}; 3048 struct fib6_nh_rd_arg arg = { 3049 .res = &res, 3050 .fl6 = fl6, 3051 .gw = &rdfl->gateway, 3052 .ret = &ret 3053 }; 3054 struct fib6_info *rt; 3055 struct fib6_node *fn; 3056 3057 /* l3mdev_update_flow overrides oif if the device is enslaved; in 3058 * this case we must match on the real ingress device, so reset it 3059 */ 3060 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) 3061 fl6->flowi6_oif = skb->dev->ifindex; 3062 3063 /* Get the "current" route for this destination and 3064 * check if the redirect has come from appropriate router. 3065 * 3066 * RFC 4861 specifies that redirects should only be 3067 * accepted if they come from the nexthop to the target. 3068 * Due to the way the routes are chosen, this notion 3069 * is a bit fuzzy and one might need to check all possible 3070 * routes. 3071 */ 3072 3073 rcu_read_lock(); 3074 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 3075 restart: 3076 for_each_fib6_node_rt_rcu(fn) { 3077 res.f6i = rt; 3078 if (fib6_check_expired(rt)) 3079 continue; 3080 if (rt->fib6_flags & RTF_REJECT) 3081 break; 3082 if (unlikely(rt->nh)) { 3083 if (nexthop_is_blackhole(rt->nh)) 3084 continue; 3085 /* on match, res->nh is filled in and potentially ret */ 3086 if (nexthop_for_each_fib6_nh(rt->nh, 3087 fib6_nh_redirect_match, 3088 &arg)) 3089 goto out; 3090 } else { 3091 res.nh = rt->fib6_nh; 3092 if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway, 3093 &ret)) 3094 goto out; 3095 } 3096 } 3097 3098 if (!rt) 3099 rt = net->ipv6.fib6_null_entry; 3100 else if (rt->fib6_flags & RTF_REJECT) { 3101 ret = net->ipv6.ip6_null_entry; 3102 goto out; 3103 } 3104 3105 if (rt == net->ipv6.fib6_null_entry) { 3106 fn = fib6_backtrack(fn, &fl6->saddr); 3107 if (fn) 3108 goto restart; 3109 } 3110 3111 res.f6i = rt; 3112 res.nh = rt->fib6_nh; 3113 out: 3114 if (ret) { 3115 
ip6_hold_safe(net, &ret); 3116 } else { 3117 res.fib6_flags = res.f6i->fib6_flags; 3118 res.fib6_type = res.f6i->fib6_type; 3119 ret = ip6_create_rt_rcu(&res); 3120 } 3121 3122 rcu_read_unlock(); 3123 3124 trace_fib6_table_lookup(net, &res, table, fl6); 3125 return ret; 3126 }; 3127 3128 static struct dst_entry *ip6_route_redirect(struct net *net, 3129 const struct flowi6 *fl6, 3130 const struct sk_buff *skb, 3131 const struct in6_addr *gateway) 3132 { 3133 int flags = RT6_LOOKUP_F_HAS_SADDR; 3134 struct ip6rd_flowi rdfl; 3135 3136 rdfl.fl6 = *fl6; 3137 rdfl.gateway = *gateway; 3138 3139 return fib6_rule_lookup(net, &rdfl.fl6, skb, 3140 flags, __ip6_route_redirect); 3141 } 3142 3143 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, 3144 kuid_t uid) 3145 { 3146 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 3147 struct dst_entry *dst; 3148 struct flowi6 fl6 = { 3149 .flowi6_iif = LOOPBACK_IFINDEX, 3150 .flowi6_oif = oif, 3151 .flowi6_mark = mark, 3152 .daddr = iph->daddr, 3153 .saddr = iph->saddr, 3154 .flowlabel = ip6_flowinfo(iph), 3155 .flowi6_uid = uid, 3156 }; 3157 3158 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr); 3159 rt6_do_redirect(dst, NULL, skb); 3160 dst_release(dst); 3161 } 3162 EXPORT_SYMBOL_GPL(ip6_redirect); 3163 3164 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif) 3165 { 3166 const struct ipv6hdr *iph = ipv6_hdr(skb); 3167 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb); 3168 struct dst_entry *dst; 3169 struct flowi6 fl6 = { 3170 .flowi6_iif = LOOPBACK_IFINDEX, 3171 .flowi6_oif = oif, 3172 .daddr = msg->dest, 3173 .saddr = iph->daddr, 3174 .flowi6_uid = sock_net_uid(net, NULL), 3175 }; 3176 3177 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr); 3178 rt6_do_redirect(dst, NULL, skb); 3179 dst_release(dst); 3180 } 3181 3182 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) 3183 { 3184 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, 
sk->sk_mark, 3185 sk->sk_uid); 3186 } 3187 EXPORT_SYMBOL_GPL(ip6_sk_redirect); 3188 3189 static unsigned int ip6_default_advmss(const struct dst_entry *dst) 3190 { 3191 struct net_device *dev = dst->dev; 3192 unsigned int mtu = dst_mtu(dst); 3193 struct net *net = dev_net(dev); 3194 3195 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); 3196 3197 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) 3198 mtu = net->ipv6.sysctl.ip6_rt_min_advmss; 3199 3200 /* 3201 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 3202 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 3203 * IPV6_MAXPLEN is also valid and means: "any MSS, 3204 * rely only on pmtu discovery" 3205 */ 3206 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr)) 3207 mtu = IPV6_MAXPLEN; 3208 return mtu; 3209 } 3210 3211 INDIRECT_CALLABLE_SCOPE unsigned int ip6_mtu(const struct dst_entry *dst) 3212 { 3213 return ip6_dst_mtu_maybe_forward(dst, false); 3214 } 3215 EXPORT_INDIRECT_CALLABLE(ip6_mtu); 3216 3217 /* MTU selection: 3218 * 1. mtu on route is locked - use it 3219 * 2. mtu from nexthop exception 3220 * 3. 
mtu from egress device 3221 * 3222 * based on ip6_dst_mtu_forward and exception logic of 3223 * rt6_find_cached_rt; called with rcu_read_lock 3224 */ 3225 u32 ip6_mtu_from_fib6(const struct fib6_result *res, 3226 const struct in6_addr *daddr, 3227 const struct in6_addr *saddr) 3228 { 3229 const struct fib6_nh *nh = res->nh; 3230 struct fib6_info *f6i = res->f6i; 3231 struct inet6_dev *idev; 3232 struct rt6_info *rt; 3233 u32 mtu = 0; 3234 3235 if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) { 3236 mtu = f6i->fib6_pmtu; 3237 if (mtu) 3238 goto out; 3239 } 3240 3241 rt = rt6_find_cached_rt(res, daddr, saddr); 3242 if (unlikely(rt)) { 3243 mtu = dst_metric_raw(&rt->dst, RTAX_MTU); 3244 } else { 3245 struct net_device *dev = nh->fib_nh_dev; 3246 3247 mtu = IPV6_MIN_MTU; 3248 idev = __in6_dev_get(dev); 3249 if (idev && idev->cnf.mtu6 > mtu) 3250 mtu = idev->cnf.mtu6; 3251 } 3252 3253 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 3254 out: 3255 return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu); 3256 } 3257 3258 struct dst_entry *icmp6_dst_alloc(struct net_device *dev, 3259 struct flowi6 *fl6) 3260 { 3261 struct dst_entry *dst; 3262 struct rt6_info *rt; 3263 struct inet6_dev *idev = in6_dev_get(dev); 3264 struct net *net = dev_net(dev); 3265 3266 if (unlikely(!idev)) 3267 return ERR_PTR(-ENODEV); 3268 3269 rt = ip6_dst_alloc(net, dev, 0); 3270 if (unlikely(!rt)) { 3271 in6_dev_put(idev); 3272 dst = ERR_PTR(-ENOMEM); 3273 goto out; 3274 } 3275 3276 rt->dst.input = ip6_input; 3277 rt->dst.output = ip6_output; 3278 rt->rt6i_gateway = fl6->daddr; 3279 rt->rt6i_dst.addr = fl6->daddr; 3280 rt->rt6i_dst.plen = 128; 3281 rt->rt6i_idev = idev; 3282 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0); 3283 3284 /* Add this dst into uncached_list so that rt6_disable_ip() can 3285 * do proper release of the net_device 3286 */ 3287 rt6_uncached_list_add(rt); 3288 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); 3289 3290 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0); 3291 
3292 out: 3293 return dst; 3294 } 3295 3296 static int ip6_dst_gc(struct dst_ops *ops) 3297 { 3298 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); 3299 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; 3300 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size; 3301 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; 3302 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; 3303 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; 3304 int entries; 3305 3306 entries = dst_entries_get_fast(ops); 3307 if (entries > rt_max_size) 3308 entries = dst_entries_get_slow(ops); 3309 3310 if (time_after(rt_last_gc + rt_min_interval, jiffies) && 3311 entries <= rt_max_size) 3312 goto out; 3313 3314 net->ipv6.ip6_rt_gc_expire++; 3315 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true); 3316 entries = dst_entries_get_slow(ops); 3317 if (entries < ops->gc_thresh) 3318 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; 3319 out: 3320 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity; 3321 return entries > rt_max_size; 3322 } 3323 3324 static int ip6_nh_lookup_table(struct net *net, struct fib6_config *cfg, 3325 const struct in6_addr *gw_addr, u32 tbid, 3326 int flags, struct fib6_result *res) 3327 { 3328 struct flowi6 fl6 = { 3329 .flowi6_oif = cfg->fc_ifindex, 3330 .daddr = *gw_addr, 3331 .saddr = cfg->fc_prefsrc, 3332 }; 3333 struct fib6_table *table; 3334 int err; 3335 3336 table = fib6_get_table(net, tbid); 3337 if (!table) 3338 return -EINVAL; 3339 3340 if (!ipv6_addr_any(&cfg->fc_prefsrc)) 3341 flags |= RT6_LOOKUP_F_HAS_SADDR; 3342 3343 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE; 3344 3345 err = fib6_table_lookup(net, table, cfg->fc_ifindex, &fl6, res, flags); 3346 if (!err && res->f6i != net->ipv6.fib6_null_entry) 3347 fib6_select_path(net, res, &fl6, cfg->fc_ifindex, 3348 cfg->fc_ifindex != 0, NULL, flags); 3349 3350 return err; 3351 } 3352 3353 static int ip6_route_check_nh_onlink(struct net *net, 3354 struct fib6_config 
*cfg, 3355 const struct net_device *dev, 3356 struct netlink_ext_ack *extack) 3357 { 3358 u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN; 3359 const struct in6_addr *gw_addr = &cfg->fc_gateway; 3360 struct fib6_result res = {}; 3361 int err; 3362 3363 err = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0, &res); 3364 if (!err && !(res.fib6_flags & RTF_REJECT) && 3365 /* ignore match if it is the default route */ 3366 !ipv6_addr_any(&res.f6i->fib6_dst.addr) && 3367 (res.fib6_type != RTN_UNICAST || dev != res.nh->fib_nh_dev)) { 3368 NL_SET_ERR_MSG(extack, 3369 "Nexthop has invalid gateway or device mismatch"); 3370 err = -EINVAL; 3371 } 3372 3373 return err; 3374 } 3375 3376 static int ip6_route_check_nh(struct net *net, 3377 struct fib6_config *cfg, 3378 struct net_device **_dev, 3379 struct inet6_dev **idev) 3380 { 3381 const struct in6_addr *gw_addr = &cfg->fc_gateway; 3382 struct net_device *dev = _dev ? *_dev : NULL; 3383 int flags = RT6_LOOKUP_F_IFACE; 3384 struct fib6_result res = {}; 3385 int err = -EHOSTUNREACH; 3386 3387 if (cfg->fc_table) { 3388 err = ip6_nh_lookup_table(net, cfg, gw_addr, 3389 cfg->fc_table, flags, &res); 3390 /* gw_addr can not require a gateway or resolve to a reject 3391 * route. If a device is given, it must match the result. 
3392 */ 3393 if (err || res.fib6_flags & RTF_REJECT || 3394 res.nh->fib_nh_gw_family || 3395 (dev && dev != res.nh->fib_nh_dev)) 3396 err = -EHOSTUNREACH; 3397 } 3398 3399 if (err < 0) { 3400 struct flowi6 fl6 = { 3401 .flowi6_oif = cfg->fc_ifindex, 3402 .daddr = *gw_addr, 3403 }; 3404 3405 err = fib6_lookup(net, cfg->fc_ifindex, &fl6, &res, flags); 3406 if (err || res.fib6_flags & RTF_REJECT || 3407 res.nh->fib_nh_gw_family) 3408 err = -EHOSTUNREACH; 3409 3410 if (err) 3411 return err; 3412 3413 fib6_select_path(net, &res, &fl6, cfg->fc_ifindex, 3414 cfg->fc_ifindex != 0, NULL, flags); 3415 } 3416 3417 err = 0; 3418 if (dev) { 3419 if (dev != res.nh->fib_nh_dev) 3420 err = -EHOSTUNREACH; 3421 } else { 3422 *_dev = dev = res.nh->fib_nh_dev; 3423 dev_hold(dev); 3424 *idev = in6_dev_get(dev); 3425 } 3426 3427 return err; 3428 } 3429 3430 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg, 3431 struct net_device **_dev, struct inet6_dev **idev, 3432 struct netlink_ext_ack *extack) 3433 { 3434 const struct in6_addr *gw_addr = &cfg->fc_gateway; 3435 int gwa_type = ipv6_addr_type(gw_addr); 3436 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true; 3437 const struct net_device *dev = *_dev; 3438 bool need_addr_check = !dev; 3439 int err = -EINVAL; 3440 3441 /* if gw_addr is local we will fail to detect this in case 3442 * address is still TENTATIVE (DAD in progress). rt6_lookup() 3443 * will return already-added prefix route via interface that 3444 * prefix route was assigned to, which might be non-loopback. 3445 */ 3446 if (dev && 3447 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 3448 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 3449 goto out; 3450 } 3451 3452 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) { 3453 /* IPv6 strictly inhibits using not link-local 3454 * addresses as nexthop address. 3455 * Otherwise, router will not able to send redirects. 3456 * It is very good, but in some (rare!) 
circumstances 3457 * (SIT, PtP, NBMA NOARP links) it is handy to allow 3458 * some exceptions. --ANK 3459 * We allow IPv4-mapped nexthops to support RFC4798-type 3460 * addressing 3461 */ 3462 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) { 3463 NL_SET_ERR_MSG(extack, "Invalid gateway address"); 3464 goto out; 3465 } 3466 3467 rcu_read_lock(); 3468 3469 if (cfg->fc_flags & RTNH_F_ONLINK) 3470 err = ip6_route_check_nh_onlink(net, cfg, dev, extack); 3471 else 3472 err = ip6_route_check_nh(net, cfg, _dev, idev); 3473 3474 rcu_read_unlock(); 3475 3476 if (err) 3477 goto out; 3478 } 3479 3480 /* reload in case device was changed */ 3481 dev = *_dev; 3482 3483 err = -EINVAL; 3484 if (!dev) { 3485 NL_SET_ERR_MSG(extack, "Egress device not specified"); 3486 goto out; 3487 } else if (dev->flags & IFF_LOOPBACK) { 3488 NL_SET_ERR_MSG(extack, 3489 "Egress device can not be loopback device for this route"); 3490 goto out; 3491 } 3492 3493 /* if we did not check gw_addr above, do so now that the 3494 * egress device has been resolved. 
3495 */ 3496 if (need_addr_check && 3497 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 3498 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 3499 goto out; 3500 } 3501 3502 err = 0; 3503 out: 3504 return err; 3505 } 3506 3507 static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type) 3508 { 3509 if ((flags & RTF_REJECT) || 3510 (dev && (dev->flags & IFF_LOOPBACK) && 3511 !(addr_type & IPV6_ADDR_LOOPBACK) && 3512 !(flags & (RTF_ANYCAST | RTF_LOCAL)))) 3513 return true; 3514 3515 return false; 3516 } 3517 3518 int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, 3519 struct fib6_config *cfg, gfp_t gfp_flags, 3520 struct netlink_ext_ack *extack) 3521 { 3522 struct net_device *dev = NULL; 3523 struct inet6_dev *idev = NULL; 3524 int addr_type; 3525 int err; 3526 3527 fib6_nh->fib_nh_family = AF_INET6; 3528 #ifdef CONFIG_IPV6_ROUTER_PREF 3529 fib6_nh->last_probe = jiffies; 3530 #endif 3531 if (cfg->fc_is_fdb) { 3532 fib6_nh->fib_nh_gw6 = cfg->fc_gateway; 3533 fib6_nh->fib_nh_gw_family = AF_INET6; 3534 return 0; 3535 } 3536 3537 err = -ENODEV; 3538 if (cfg->fc_ifindex) { 3539 dev = dev_get_by_index(net, cfg->fc_ifindex); 3540 if (!dev) 3541 goto out; 3542 idev = in6_dev_get(dev); 3543 if (!idev) 3544 goto out; 3545 } 3546 3547 if (cfg->fc_flags & RTNH_F_ONLINK) { 3548 if (!dev) { 3549 NL_SET_ERR_MSG(extack, 3550 "Nexthop device required for onlink"); 3551 goto out; 3552 } 3553 3554 if (!(dev->flags & IFF_UP)) { 3555 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 3556 err = -ENETDOWN; 3557 goto out; 3558 } 3559 3560 fib6_nh->fib_nh_flags |= RTNH_F_ONLINK; 3561 } 3562 3563 fib6_nh->fib_nh_weight = 1; 3564 3565 /* We cannot add true routes via loopback here, 3566 * they would result in kernel looping; promote them to reject routes 3567 */ 3568 addr_type = ipv6_addr_type(&cfg->fc_dst); 3569 if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) { 3570 /* hold loopback dev/idev if we haven't done so. 
*/ 3571 if (dev != net->loopback_dev) { 3572 if (dev) { 3573 dev_put(dev); 3574 in6_dev_put(idev); 3575 } 3576 dev = net->loopback_dev; 3577 dev_hold(dev); 3578 idev = in6_dev_get(dev); 3579 if (!idev) { 3580 err = -ENODEV; 3581 goto out; 3582 } 3583 } 3584 goto pcpu_alloc; 3585 } 3586 3587 if (cfg->fc_flags & RTF_GATEWAY) { 3588 err = ip6_validate_gw(net, cfg, &dev, &idev, extack); 3589 if (err) 3590 goto out; 3591 3592 fib6_nh->fib_nh_gw6 = cfg->fc_gateway; 3593 fib6_nh->fib_nh_gw_family = AF_INET6; 3594 } 3595 3596 err = -ENODEV; 3597 if (!dev) 3598 goto out; 3599 3600 if (idev->cnf.disable_ipv6) { 3601 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device"); 3602 err = -EACCES; 3603 goto out; 3604 } 3605 3606 if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) { 3607 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 3608 err = -ENETDOWN; 3609 goto out; 3610 } 3611 3612 if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) && 3613 !netif_carrier_ok(dev)) 3614 fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN; 3615 3616 err = fib_nh_common_init(net, &fib6_nh->nh_common, cfg->fc_encap, 3617 cfg->fc_encap_type, cfg, gfp_flags, extack); 3618 if (err) 3619 goto out; 3620 3621 pcpu_alloc: 3622 fib6_nh->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags); 3623 if (!fib6_nh->rt6i_pcpu) { 3624 err = -ENOMEM; 3625 goto out; 3626 } 3627 3628 fib6_nh->fib_nh_dev = dev; 3629 fib6_nh->fib_nh_oif = dev->ifindex; 3630 err = 0; 3631 out: 3632 if (idev) 3633 in6_dev_put(idev); 3634 3635 if (err) { 3636 lwtstate_put(fib6_nh->fib_nh_lws); 3637 fib6_nh->fib_nh_lws = NULL; 3638 dev_put(dev); 3639 } 3640 3641 return err; 3642 } 3643 3644 void fib6_nh_release(struct fib6_nh *fib6_nh) 3645 { 3646 struct rt6_exception_bucket *bucket; 3647 3648 rcu_read_lock(); 3649 3650 fib6_nh_flush_exceptions(fib6_nh, NULL); 3651 bucket = fib6_nh_get_excptn_bucket(fib6_nh, NULL); 3652 if (bucket) { 3653 rcu_assign_pointer(fib6_nh->rt6i_exception_bucket, NULL); 3654 kfree(bucket); 3655 } 3656 3657 
rcu_read_unlock(); 3658 3659 if (fib6_nh->rt6i_pcpu) { 3660 int cpu; 3661 3662 for_each_possible_cpu(cpu) { 3663 struct rt6_info **ppcpu_rt; 3664 struct rt6_info *pcpu_rt; 3665 3666 ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu); 3667 pcpu_rt = *ppcpu_rt; 3668 if (pcpu_rt) { 3669 dst_dev_put(&pcpu_rt->dst); 3670 dst_release(&pcpu_rt->dst); 3671 *ppcpu_rt = NULL; 3672 } 3673 } 3674 3675 free_percpu(fib6_nh->rt6i_pcpu); 3676 } 3677 3678 fib_nh_common_release(&fib6_nh->nh_common); 3679 } 3680 3681 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, 3682 gfp_t gfp_flags, 3683 struct netlink_ext_ack *extack) 3684 { 3685 struct net *net = cfg->fc_nlinfo.nl_net; 3686 struct fib6_info *rt = NULL; 3687 struct nexthop *nh = NULL; 3688 struct fib6_table *table; 3689 struct fib6_nh *fib6_nh; 3690 int err = -EINVAL; 3691 int addr_type; 3692 3693 /* RTF_PCPU is an internal flag; can not be set by userspace */ 3694 if (cfg->fc_flags & RTF_PCPU) { 3695 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU"); 3696 goto out; 3697 } 3698 3699 /* RTF_CACHE is an internal flag; can not be set by userspace */ 3700 if (cfg->fc_flags & RTF_CACHE) { 3701 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE"); 3702 goto out; 3703 } 3704 3705 if (cfg->fc_type > RTN_MAX) { 3706 NL_SET_ERR_MSG(extack, "Invalid route type"); 3707 goto out; 3708 } 3709 3710 if (cfg->fc_dst_len > 128) { 3711 NL_SET_ERR_MSG(extack, "Invalid prefix length"); 3712 goto out; 3713 } 3714 if (cfg->fc_src_len > 128) { 3715 NL_SET_ERR_MSG(extack, "Invalid source address length"); 3716 goto out; 3717 } 3718 #ifndef CONFIG_IPV6_SUBTREES 3719 if (cfg->fc_src_len) { 3720 NL_SET_ERR_MSG(extack, 3721 "Specifying source address requires IPV6_SUBTREES to be enabled"); 3722 goto out; 3723 } 3724 #endif 3725 if (cfg->fc_nh_id) { 3726 nh = nexthop_find_by_id(net, cfg->fc_nh_id); 3727 if (!nh) { 3728 NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); 3729 goto out; 3730 } 3731 err = fib6_check_nexthop(nh, 
cfg, extack); 3732 if (err) 3733 goto out; 3734 } 3735 3736 err = -ENOBUFS; 3737 if (cfg->fc_nlinfo.nlh && 3738 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) { 3739 table = fib6_get_table(net, cfg->fc_table); 3740 if (!table) { 3741 pr_warn("NLM_F_CREATE should be specified when creating new route\n"); 3742 table = fib6_new_table(net, cfg->fc_table); 3743 } 3744 } else { 3745 table = fib6_new_table(net, cfg->fc_table); 3746 } 3747 3748 if (!table) 3749 goto out; 3750 3751 err = -ENOMEM; 3752 rt = fib6_info_alloc(gfp_flags, !nh); 3753 if (!rt) 3754 goto out; 3755 3756 rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len, 3757 extack); 3758 if (IS_ERR(rt->fib6_metrics)) { 3759 err = PTR_ERR(rt->fib6_metrics); 3760 /* Do not leave garbage there. */ 3761 rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics; 3762 goto out_free; 3763 } 3764 3765 if (cfg->fc_flags & RTF_ADDRCONF) 3766 rt->dst_nocount = true; 3767 3768 if (cfg->fc_flags & RTF_EXPIRES) 3769 fib6_set_expires(rt, jiffies + 3770 clock_t_to_jiffies(cfg->fc_expires)); 3771 else 3772 fib6_clean_expires(rt); 3773 3774 if (cfg->fc_protocol == RTPROT_UNSPEC) 3775 cfg->fc_protocol = RTPROT_BOOT; 3776 rt->fib6_protocol = cfg->fc_protocol; 3777 3778 rt->fib6_table = table; 3779 rt->fib6_metric = cfg->fc_metric; 3780 rt->fib6_type = cfg->fc_type ? 
: RTN_UNICAST; 3781 rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY; 3782 3783 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); 3784 rt->fib6_dst.plen = cfg->fc_dst_len; 3785 3786 #ifdef CONFIG_IPV6_SUBTREES 3787 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len); 3788 rt->fib6_src.plen = cfg->fc_src_len; 3789 #endif 3790 if (nh) { 3791 if (rt->fib6_src.plen) { 3792 NL_SET_ERR_MSG(extack, "Nexthops can not be used with source routing"); 3793 goto out_free; 3794 } 3795 if (!nexthop_get(nh)) { 3796 NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); 3797 goto out_free; 3798 } 3799 rt->nh = nh; 3800 fib6_nh = nexthop_fib6_nh(rt->nh); 3801 } else { 3802 err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack); 3803 if (err) 3804 goto out; 3805 3806 fib6_nh = rt->fib6_nh; 3807 3808 /* We cannot add true routes via loopback here, they would 3809 * result in kernel looping; promote them to reject routes 3810 */ 3811 addr_type = ipv6_addr_type(&cfg->fc_dst); 3812 if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh->fib_nh_dev, 3813 addr_type)) 3814 rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP; 3815 } 3816 3817 if (!ipv6_addr_any(&cfg->fc_prefsrc)) { 3818 struct net_device *dev = fib6_nh->fib_nh_dev; 3819 3820 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { 3821 NL_SET_ERR_MSG(extack, "Invalid source address"); 3822 err = -EINVAL; 3823 goto out; 3824 } 3825 rt->fib6_prefsrc.addr = cfg->fc_prefsrc; 3826 rt->fib6_prefsrc.plen = 128; 3827 } else 3828 rt->fib6_prefsrc.plen = 0; 3829 3830 return rt; 3831 out: 3832 fib6_info_release(rt); 3833 return ERR_PTR(err); 3834 out_free: 3835 ip_fib_metrics_put(rt->fib6_metrics); 3836 kfree(rt); 3837 return ERR_PTR(err); 3838 } 3839 3840 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, 3841 struct netlink_ext_ack *extack) 3842 { 3843 struct fib6_info *rt; 3844 int err; 3845 3846 rt = ip6_route_info_create(cfg, gfp_flags, extack); 3847 if (IS_ERR(rt)) 3848 return PTR_ERR(rt); 3849 3850 err = 
__ip6_ins_rt(rt, &cfg->fc_nlinfo, extack); 3851 fib6_info_release(rt); 3852 3853 return err; 3854 } 3855 3856 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info) 3857 { 3858 struct net *net = info->nl_net; 3859 struct fib6_table *table; 3860 int err; 3861 3862 if (rt == net->ipv6.fib6_null_entry) { 3863 err = -ENOENT; 3864 goto out; 3865 } 3866 3867 table = rt->fib6_table; 3868 spin_lock_bh(&table->tb6_lock); 3869 err = fib6_del(rt, info); 3870 spin_unlock_bh(&table->tb6_lock); 3871 3872 out: 3873 fib6_info_release(rt); 3874 return err; 3875 } 3876 3877 int ip6_del_rt(struct net *net, struct fib6_info *rt, bool skip_notify) 3878 { 3879 struct nl_info info = { 3880 .nl_net = net, 3881 .skip_notify = skip_notify 3882 }; 3883 3884 return __ip6_del_rt(rt, &info); 3885 } 3886 3887 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg) 3888 { 3889 struct nl_info *info = &cfg->fc_nlinfo; 3890 struct net *net = info->nl_net; 3891 struct sk_buff *skb = NULL; 3892 struct fib6_table *table; 3893 int err = -ENOENT; 3894 3895 if (rt == net->ipv6.fib6_null_entry) 3896 goto out_put; 3897 table = rt->fib6_table; 3898 spin_lock_bh(&table->tb6_lock); 3899 3900 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) { 3901 struct fib6_info *sibling, *next_sibling; 3902 struct fib6_node *fn; 3903 3904 /* prefer to send a single notification with all hops */ 3905 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 3906 if (skb) { 3907 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; 3908 3909 if (rt6_fill_node(net, skb, rt, NULL, 3910 NULL, NULL, 0, RTM_DELROUTE, 3911 info->portid, seq, 0) < 0) { 3912 kfree_skb(skb); 3913 skb = NULL; 3914 } else 3915 info->skip_notify = 1; 3916 } 3917 3918 /* 'rt' points to the first sibling route. If it is not the 3919 * leaf, then we do not need to send a notification. Otherwise, 3920 * we need to check if the last sibling has a next route or not 3921 * and emit a replace or delete notification, respectively. 
3922 */ 3923 info->skip_notify_kernel = 1; 3924 fn = rcu_dereference_protected(rt->fib6_node, 3925 lockdep_is_held(&table->tb6_lock)); 3926 if (rcu_access_pointer(fn->leaf) == rt) { 3927 struct fib6_info *last_sibling, *replace_rt; 3928 3929 last_sibling = list_last_entry(&rt->fib6_siblings, 3930 struct fib6_info, 3931 fib6_siblings); 3932 replace_rt = rcu_dereference_protected( 3933 last_sibling->fib6_next, 3934 lockdep_is_held(&table->tb6_lock)); 3935 if (replace_rt) 3936 call_fib6_entry_notifiers_replace(net, 3937 replace_rt); 3938 else 3939 call_fib6_multipath_entry_notifiers(net, 3940 FIB_EVENT_ENTRY_DEL, 3941 rt, rt->fib6_nsiblings, 3942 NULL); 3943 } 3944 list_for_each_entry_safe(sibling, next_sibling, 3945 &rt->fib6_siblings, 3946 fib6_siblings) { 3947 err = fib6_del(sibling, info); 3948 if (err) 3949 goto out_unlock; 3950 } 3951 } 3952 3953 err = fib6_del(rt, info); 3954 out_unlock: 3955 spin_unlock_bh(&table->tb6_lock); 3956 out_put: 3957 fib6_info_release(rt); 3958 3959 if (skb) { 3960 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 3961 info->nlh, gfp_any()); 3962 } 3963 return err; 3964 } 3965 3966 static int __ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) 3967 { 3968 int rc = -ESRCH; 3969 3970 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex) 3971 goto out; 3972 3973 if (cfg->fc_flags & RTF_GATEWAY && 3974 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) 3975 goto out; 3976 3977 rc = rt6_remove_exception_rt(rt); 3978 out: 3979 return rc; 3980 } 3981 3982 static int ip6_del_cached_rt(struct fib6_config *cfg, struct fib6_info *rt, 3983 struct fib6_nh *nh) 3984 { 3985 struct fib6_result res = { 3986 .f6i = rt, 3987 .nh = nh, 3988 }; 3989 struct rt6_info *rt_cache; 3990 3991 rt_cache = rt6_find_cached_rt(&res, &cfg->fc_dst, &cfg->fc_src); 3992 if (rt_cache) 3993 return __ip6_del_cached_rt(rt_cache, cfg); 3994 3995 return 0; 3996 } 3997 3998 struct fib6_nh_del_cached_rt_arg { 3999 struct fib6_config *cfg; 4000 
struct fib6_info *f6i; 4001 }; 4002 4003 static int fib6_nh_del_cached_rt(struct fib6_nh *nh, void *_arg) 4004 { 4005 struct fib6_nh_del_cached_rt_arg *arg = _arg; 4006 int rc; 4007 4008 rc = ip6_del_cached_rt(arg->cfg, arg->f6i, nh); 4009 return rc != -ESRCH ? rc : 0; 4010 } 4011 4012 static int ip6_del_cached_rt_nh(struct fib6_config *cfg, struct fib6_info *f6i) 4013 { 4014 struct fib6_nh_del_cached_rt_arg arg = { 4015 .cfg = cfg, 4016 .f6i = f6i 4017 }; 4018 4019 return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_del_cached_rt, &arg); 4020 } 4021 4022 static int ip6_route_del(struct fib6_config *cfg, 4023 struct netlink_ext_ack *extack) 4024 { 4025 struct fib6_table *table; 4026 struct fib6_info *rt; 4027 struct fib6_node *fn; 4028 int err = -ESRCH; 4029 4030 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); 4031 if (!table) { 4032 NL_SET_ERR_MSG(extack, "FIB table does not exist"); 4033 return err; 4034 } 4035 4036 rcu_read_lock(); 4037 4038 fn = fib6_locate(&table->tb6_root, 4039 &cfg->fc_dst, cfg->fc_dst_len, 4040 &cfg->fc_src, cfg->fc_src_len, 4041 !(cfg->fc_flags & RTF_CACHE)); 4042 4043 if (fn) { 4044 for_each_fib6_node_rt_rcu(fn) { 4045 struct fib6_nh *nh; 4046 4047 if (rt->nh && cfg->fc_nh_id && 4048 rt->nh->id != cfg->fc_nh_id) 4049 continue; 4050 4051 if (cfg->fc_flags & RTF_CACHE) { 4052 int rc = 0; 4053 4054 if (rt->nh) { 4055 rc = ip6_del_cached_rt_nh(cfg, rt); 4056 } else if (cfg->fc_nh_id) { 4057 continue; 4058 } else { 4059 nh = rt->fib6_nh; 4060 rc = ip6_del_cached_rt(cfg, rt, nh); 4061 } 4062 if (rc != -ESRCH) { 4063 rcu_read_unlock(); 4064 return rc; 4065 } 4066 continue; 4067 } 4068 4069 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric) 4070 continue; 4071 if (cfg->fc_protocol && 4072 cfg->fc_protocol != rt->fib6_protocol) 4073 continue; 4074 4075 if (rt->nh) { 4076 if (!fib6_info_hold_safe(rt)) 4077 continue; 4078 rcu_read_unlock(); 4079 4080 return __ip6_del_rt(rt, &cfg->fc_nlinfo); 4081 } 4082 if (cfg->fc_nh_id) 4083 
continue; 4084 4085 nh = rt->fib6_nh; 4086 if (cfg->fc_ifindex && 4087 (!nh->fib_nh_dev || 4088 nh->fib_nh_dev->ifindex != cfg->fc_ifindex)) 4089 continue; 4090 if (cfg->fc_flags & RTF_GATEWAY && 4091 !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6)) 4092 continue; 4093 if (!fib6_info_hold_safe(rt)) 4094 continue; 4095 rcu_read_unlock(); 4096 4097 /* if gateway was specified only delete the one hop */ 4098 if (cfg->fc_flags & RTF_GATEWAY) 4099 return __ip6_del_rt(rt, &cfg->fc_nlinfo); 4100 4101 return __ip6_del_rt_siblings(rt, cfg); 4102 } 4103 } 4104 rcu_read_unlock(); 4105 4106 return err; 4107 } 4108 4109 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) 4110 { 4111 struct netevent_redirect netevent; 4112 struct rt6_info *rt, *nrt = NULL; 4113 struct fib6_result res = {}; 4114 struct ndisc_options ndopts; 4115 struct inet6_dev *in6_dev; 4116 struct neighbour *neigh; 4117 struct rd_msg *msg; 4118 int optlen, on_link; 4119 u8 *lladdr; 4120 4121 optlen = skb_tail_pointer(skb) - skb_transport_header(skb); 4122 optlen -= sizeof(*msg); 4123 4124 if (optlen < 0) { 4125 net_dbg_ratelimited("rt6_do_redirect: packet too short\n"); 4126 return; 4127 } 4128 4129 msg = (struct rd_msg *)icmp6_hdr(skb); 4130 4131 if (ipv6_addr_is_multicast(&msg->dest)) { 4132 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n"); 4133 return; 4134 } 4135 4136 on_link = 0; 4137 if (ipv6_addr_equal(&msg->dest, &msg->target)) { 4138 on_link = 1; 4139 } else if (ipv6_addr_type(&msg->target) != 4140 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { 4141 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n"); 4142 return; 4143 } 4144 4145 in6_dev = __in6_dev_get(skb->dev); 4146 if (!in6_dev) 4147 return; 4148 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects) 4149 return; 4150 4151 /* RFC2461 8.1: 4152 * The IP source address of the Redirect MUST be the same as the current 4153 * first-hop router for the 
specified ICMP Destination Address. 4154 */ 4155 4156 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) { 4157 net_dbg_ratelimited("rt6_redirect: invalid ND options\n"); 4158 return; 4159 } 4160 4161 lladdr = NULL; 4162 if (ndopts.nd_opts_tgt_lladdr) { 4163 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, 4164 skb->dev); 4165 if (!lladdr) { 4166 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n"); 4167 return; 4168 } 4169 } 4170 4171 rt = (struct rt6_info *) dst; 4172 if (rt->rt6i_flags & RTF_REJECT) { 4173 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n"); 4174 return; 4175 } 4176 4177 /* Redirect received -> path was valid. 4178 * Look, redirects are sent only in response to data packets, 4179 * so that this nexthop apparently is reachable. --ANK 4180 */ 4181 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr); 4182 4183 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1); 4184 if (!neigh) 4185 return; 4186 4187 /* 4188 * We have finally decided to accept it. 4189 */ 4190 4191 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, 4192 NEIGH_UPDATE_F_WEAK_OVERRIDE| 4193 NEIGH_UPDATE_F_OVERRIDE| 4194 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER| 4195 NEIGH_UPDATE_F_ISROUTER)), 4196 NDISC_REDIRECT, &ndopts); 4197 4198 rcu_read_lock(); 4199 res.f6i = rcu_dereference(rt->from); 4200 if (!res.f6i) 4201 goto out; 4202 4203 if (res.f6i->nh) { 4204 struct fib6_nh_match_arg arg = { 4205 .dev = dst->dev, 4206 .gw = &rt->rt6i_gateway, 4207 }; 4208 4209 nexthop_for_each_fib6_nh(res.f6i->nh, 4210 fib6_nh_find_match, &arg); 4211 4212 /* fib6_info uses a nexthop that does not have fib6_nh 4213 * using the dst->dev. 
Should be impossible 4214 */ 4215 if (!arg.match) 4216 goto out; 4217 res.nh = arg.match; 4218 } else { 4219 res.nh = res.f6i->fib6_nh; 4220 } 4221 4222 res.fib6_flags = res.f6i->fib6_flags; 4223 res.fib6_type = res.f6i->fib6_type; 4224 nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL); 4225 if (!nrt) 4226 goto out; 4227 4228 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE; 4229 if (on_link) 4230 nrt->rt6i_flags &= ~RTF_GATEWAY; 4231 4232 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; 4233 4234 /* rt6_insert_exception() will take care of duplicated exceptions */ 4235 if (rt6_insert_exception(nrt, &res)) { 4236 dst_release_immediate(&nrt->dst); 4237 goto out; 4238 } 4239 4240 netevent.old = &rt->dst; 4241 netevent.new = &nrt->dst; 4242 netevent.daddr = &msg->dest; 4243 netevent.neigh = neigh; 4244 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); 4245 4246 out: 4247 rcu_read_unlock(); 4248 neigh_release(neigh); 4249 } 4250 4251 #ifdef CONFIG_IPV6_ROUTE_INFO 4252 static struct fib6_info *rt6_get_route_info(struct net *net, 4253 const struct in6_addr *prefix, int prefixlen, 4254 const struct in6_addr *gwaddr, 4255 struct net_device *dev) 4256 { 4257 u32 tb_id = l3mdev_fib_table(dev) ? 
: RT6_TABLE_INFO; 4258 int ifindex = dev->ifindex; 4259 struct fib6_node *fn; 4260 struct fib6_info *rt = NULL; 4261 struct fib6_table *table; 4262 4263 table = fib6_get_table(net, tb_id); 4264 if (!table) 4265 return NULL; 4266 4267 rcu_read_lock(); 4268 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true); 4269 if (!fn) 4270 goto out; 4271 4272 for_each_fib6_node_rt_rcu(fn) { 4273 /* these routes do not use nexthops */ 4274 if (rt->nh) 4275 continue; 4276 if (rt->fib6_nh->fib_nh_dev->ifindex != ifindex) 4277 continue; 4278 if (!(rt->fib6_flags & RTF_ROUTEINFO) || 4279 !rt->fib6_nh->fib_nh_gw_family) 4280 continue; 4281 if (!ipv6_addr_equal(&rt->fib6_nh->fib_nh_gw6, gwaddr)) 4282 continue; 4283 if (!fib6_info_hold_safe(rt)) 4284 continue; 4285 break; 4286 } 4287 out: 4288 rcu_read_unlock(); 4289 return rt; 4290 } 4291 4292 static struct fib6_info *rt6_add_route_info(struct net *net, 4293 const struct in6_addr *prefix, int prefixlen, 4294 const struct in6_addr *gwaddr, 4295 struct net_device *dev, 4296 unsigned int pref) 4297 { 4298 struct fib6_config cfg = { 4299 .fc_metric = IP6_RT_PRIO_USER, 4300 .fc_ifindex = dev->ifindex, 4301 .fc_dst_len = prefixlen, 4302 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | 4303 RTF_UP | RTF_PREF(pref), 4304 .fc_protocol = RTPROT_RA, 4305 .fc_type = RTN_UNICAST, 4306 .fc_nlinfo.portid = 0, 4307 .fc_nlinfo.nlh = NULL, 4308 .fc_nlinfo.nl_net = net, 4309 }; 4310 4311 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; 4312 cfg.fc_dst = *prefix; 4313 cfg.fc_gateway = *gwaddr; 4314 4315 /* We should treat it as a default route if prefix length is 0. 
*/ 4316 if (!prefixlen) 4317 cfg.fc_flags |= RTF_DEFAULT; 4318 4319 ip6_route_add(&cfg, GFP_ATOMIC, NULL); 4320 4321 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev); 4322 } 4323 #endif 4324 4325 struct fib6_info *rt6_get_dflt_router(struct net *net, 4326 const struct in6_addr *addr, 4327 struct net_device *dev) 4328 { 4329 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT; 4330 struct fib6_info *rt; 4331 struct fib6_table *table; 4332 4333 table = fib6_get_table(net, tb_id); 4334 if (!table) 4335 return NULL; 4336 4337 rcu_read_lock(); 4338 for_each_fib6_node_rt_rcu(&table->tb6_root) { 4339 struct fib6_nh *nh; 4340 4341 /* RA routes do not use nexthops */ 4342 if (rt->nh) 4343 continue; 4344 4345 nh = rt->fib6_nh; 4346 if (dev == nh->fib_nh_dev && 4347 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && 4348 ipv6_addr_equal(&nh->fib_nh_gw6, addr)) 4349 break; 4350 } 4351 if (rt && !fib6_info_hold_safe(rt)) 4352 rt = NULL; 4353 rcu_read_unlock(); 4354 return rt; 4355 } 4356 4357 struct fib6_info *rt6_add_dflt_router(struct net *net, 4358 const struct in6_addr *gwaddr, 4359 struct net_device *dev, 4360 unsigned int pref, 4361 u32 defrtr_usr_metric) 4362 { 4363 struct fib6_config cfg = { 4364 .fc_table = l3mdev_fib_table(dev) ? 
: RT6_TABLE_DFLT, 4365 .fc_metric = defrtr_usr_metric, 4366 .fc_ifindex = dev->ifindex, 4367 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | 4368 RTF_UP | RTF_EXPIRES | RTF_PREF(pref), 4369 .fc_protocol = RTPROT_RA, 4370 .fc_type = RTN_UNICAST, 4371 .fc_nlinfo.portid = 0, 4372 .fc_nlinfo.nlh = NULL, 4373 .fc_nlinfo.nl_net = net, 4374 }; 4375 4376 cfg.fc_gateway = *gwaddr; 4377 4378 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) { 4379 struct fib6_table *table; 4380 4381 table = fib6_get_table(dev_net(dev), cfg.fc_table); 4382 if (table) 4383 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER; 4384 } 4385 4386 return rt6_get_dflt_router(net, gwaddr, dev); 4387 } 4388 4389 static void __rt6_purge_dflt_routers(struct net *net, 4390 struct fib6_table *table) 4391 { 4392 struct fib6_info *rt; 4393 4394 restart: 4395 rcu_read_lock(); 4396 for_each_fib6_node_rt_rcu(&table->tb6_root) { 4397 struct net_device *dev = fib6_info_nh_dev(rt); 4398 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL; 4399 4400 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) && 4401 (!idev || idev->cnf.accept_ra != 2) && 4402 fib6_info_hold_safe(rt)) { 4403 rcu_read_unlock(); 4404 ip6_del_rt(net, rt, false); 4405 goto restart; 4406 } 4407 } 4408 rcu_read_unlock(); 4409 4410 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER; 4411 } 4412 4413 void rt6_purge_dflt_routers(struct net *net) 4414 { 4415 struct fib6_table *table; 4416 struct hlist_head *head; 4417 unsigned int h; 4418 4419 rcu_read_lock(); 4420 4421 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { 4422 head = &net->ipv6.fib_table_hash[h]; 4423 hlist_for_each_entry_rcu(table, head, tb6_hlist) { 4424 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER) 4425 __rt6_purge_dflt_routers(net, table); 4426 } 4427 } 4428 4429 rcu_read_unlock(); 4430 } 4431 4432 static void rtmsg_to_fib6_config(struct net *net, 4433 struct in6_rtmsg *rtmsg, 4434 struct fib6_config *cfg) 4435 { 4436 *cfg = (struct fib6_config){ 4437 .fc_table = l3mdev_fib_table_by_index(net, 
rtmsg->rtmsg_ifindex) ? 4438 : RT6_TABLE_MAIN, 4439 .fc_ifindex = rtmsg->rtmsg_ifindex, 4440 .fc_metric = rtmsg->rtmsg_metric ? : IP6_RT_PRIO_USER, 4441 .fc_expires = rtmsg->rtmsg_info, 4442 .fc_dst_len = rtmsg->rtmsg_dst_len, 4443 .fc_src_len = rtmsg->rtmsg_src_len, 4444 .fc_flags = rtmsg->rtmsg_flags, 4445 .fc_type = rtmsg->rtmsg_type, 4446 4447 .fc_nlinfo.nl_net = net, 4448 4449 .fc_dst = rtmsg->rtmsg_dst, 4450 .fc_src = rtmsg->rtmsg_src, 4451 .fc_gateway = rtmsg->rtmsg_gateway, 4452 }; 4453 } 4454 4455 int ipv6_route_ioctl(struct net *net, unsigned int cmd, struct in6_rtmsg *rtmsg) 4456 { 4457 struct fib6_config cfg; 4458 int err; 4459 4460 if (cmd != SIOCADDRT && cmd != SIOCDELRT) 4461 return -EINVAL; 4462 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 4463 return -EPERM; 4464 4465 rtmsg_to_fib6_config(net, rtmsg, &cfg); 4466 4467 rtnl_lock(); 4468 switch (cmd) { 4469 case SIOCADDRT: 4470 err = ip6_route_add(&cfg, GFP_KERNEL, NULL); 4471 break; 4472 case SIOCDELRT: 4473 err = ip6_route_del(&cfg, NULL); 4474 break; 4475 } 4476 rtnl_unlock(); 4477 return err; 4478 } 4479 4480 /* 4481 * Drop the packet on the floor 4482 */ 4483 4484 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) 4485 { 4486 struct dst_entry *dst = skb_dst(skb); 4487 struct net *net = dev_net(dst->dev); 4488 struct inet6_dev *idev; 4489 int type; 4490 4491 if (netif_is_l3_master(skb->dev) && 4492 dst->dev == net->loopback_dev) 4493 idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif)); 4494 else 4495 idev = ip6_dst_idev(dst); 4496 4497 switch (ipstats_mib_noroutes) { 4498 case IPSTATS_MIB_INNOROUTES: 4499 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); 4500 if (type == IPV6_ADDR_ANY) { 4501 IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); 4502 break; 4503 } 4504 fallthrough; 4505 case IPSTATS_MIB_OUTNOROUTES: 4506 IP6_INC_STATS(net, idev, ipstats_mib_noroutes); 4507 break; 4508 } 4509 4510 /* Start over by dropping the dst for l3mdev case */ 4511 
if (netif_is_l3_master(skb->dev)) 4512 skb_dst_drop(skb); 4513 4514 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0); 4515 kfree_skb(skb); 4516 return 0; 4517 } 4518 4519 static int ip6_pkt_discard(struct sk_buff *skb) 4520 { 4521 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES); 4522 } 4523 4524 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) 4525 { 4526 skb->dev = skb_dst(skb)->dev; 4527 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); 4528 } 4529 4530 static int ip6_pkt_prohibit(struct sk_buff *skb) 4531 { 4532 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES); 4533 } 4534 4535 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb) 4536 { 4537 skb->dev = skb_dst(skb)->dev; 4538 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); 4539 } 4540 4541 /* 4542 * Allocate a dst for local (unicast / anycast) address. 4543 */ 4544 4545 struct fib6_info *addrconf_f6i_alloc(struct net *net, 4546 struct inet6_dev *idev, 4547 const struct in6_addr *addr, 4548 bool anycast, gfp_t gfp_flags) 4549 { 4550 struct fib6_config cfg = { 4551 .fc_table = l3mdev_fib_table(idev->dev) ? 
: RT6_TABLE_LOCAL, 4552 .fc_ifindex = idev->dev->ifindex, 4553 .fc_flags = RTF_UP | RTF_NONEXTHOP, 4554 .fc_dst = *addr, 4555 .fc_dst_len = 128, 4556 .fc_protocol = RTPROT_KERNEL, 4557 .fc_nlinfo.nl_net = net, 4558 .fc_ignore_dev_down = true, 4559 }; 4560 struct fib6_info *f6i; 4561 4562 if (anycast) { 4563 cfg.fc_type = RTN_ANYCAST; 4564 cfg.fc_flags |= RTF_ANYCAST; 4565 } else { 4566 cfg.fc_type = RTN_LOCAL; 4567 cfg.fc_flags |= RTF_LOCAL; 4568 } 4569 4570 f6i = ip6_route_info_create(&cfg, gfp_flags, NULL); 4571 if (!IS_ERR(f6i)) 4572 f6i->dst_nocount = true; 4573 return f6i; 4574 } 4575 4576 /* remove deleted ip from prefsrc entries */ 4577 struct arg_dev_net_ip { 4578 struct net_device *dev; 4579 struct net *net; 4580 struct in6_addr *addr; 4581 }; 4582 4583 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg) 4584 { 4585 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev; 4586 struct net *net = ((struct arg_dev_net_ip *)arg)->net; 4587 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; 4588 4589 if (!rt->nh && 4590 ((void *)rt->fib6_nh->fib_nh_dev == dev || !dev) && 4591 rt != net->ipv6.fib6_null_entry && 4592 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) { 4593 spin_lock_bh(&rt6_exception_lock); 4594 /* remove prefsrc entry */ 4595 rt->fib6_prefsrc.plen = 0; 4596 spin_unlock_bh(&rt6_exception_lock); 4597 } 4598 return 0; 4599 } 4600 4601 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) 4602 { 4603 struct net *net = dev_net(ifp->idev->dev); 4604 struct arg_dev_net_ip adni = { 4605 .dev = ifp->idev->dev, 4606 .net = net, 4607 .addr = &ifp->addr, 4608 }; 4609 fib6_clean_all(net, fib6_remove_prefsrc, &adni); 4610 } 4611 4612 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT) 4613 4614 /* Remove routers and update dst entries when gateway turn into host. 
*/ 4615 static int fib6_clean_tohost(struct fib6_info *rt, void *arg) 4616 { 4617 struct in6_addr *gateway = (struct in6_addr *)arg; 4618 struct fib6_nh *nh; 4619 4620 /* RA routes do not use nexthops */ 4621 if (rt->nh) 4622 return 0; 4623 4624 nh = rt->fib6_nh; 4625 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && 4626 nh->fib_nh_gw_family && ipv6_addr_equal(gateway, &nh->fib_nh_gw6)) 4627 return -1; 4628 4629 /* Further clean up cached routes in exception table. 4630 * This is needed because cached route may have a different 4631 * gateway than its 'parent' in the case of an ip redirect. 4632 */ 4633 fib6_nh_exceptions_clean_tohost(nh, gateway); 4634 4635 return 0; 4636 } 4637 4638 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) 4639 { 4640 fib6_clean_all(net, fib6_clean_tohost, gateway); 4641 } 4642 4643 struct arg_netdev_event { 4644 const struct net_device *dev; 4645 union { 4646 unsigned char nh_flags; 4647 unsigned long event; 4648 }; 4649 }; 4650 4651 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) 4652 { 4653 struct fib6_info *iter; 4654 struct fib6_node *fn; 4655 4656 fn = rcu_dereference_protected(rt->fib6_node, 4657 lockdep_is_held(&rt->fib6_table->tb6_lock)); 4658 iter = rcu_dereference_protected(fn->leaf, 4659 lockdep_is_held(&rt->fib6_table->tb6_lock)); 4660 while (iter) { 4661 if (iter->fib6_metric == rt->fib6_metric && 4662 rt6_qualify_for_ecmp(iter)) 4663 return iter; 4664 iter = rcu_dereference_protected(iter->fib6_next, 4665 lockdep_is_held(&rt->fib6_table->tb6_lock)); 4666 } 4667 4668 return NULL; 4669 } 4670 4671 /* only called for fib entries with builtin fib6_nh */ 4672 static bool rt6_is_dead(const struct fib6_info *rt) 4673 { 4674 if (rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD || 4675 (rt->fib6_nh->fib_nh_flags & RTNH_F_LINKDOWN && 4676 ip6_ignore_linkdown(rt->fib6_nh->fib_nh_dev))) 4677 return true; 4678 4679 return false; 4680 } 4681 4682 static int 
rt6_multipath_total_weight(const struct fib6_info *rt) 4683 { 4684 struct fib6_info *iter; 4685 int total = 0; 4686 4687 if (!rt6_is_dead(rt)) 4688 total += rt->fib6_nh->fib_nh_weight; 4689 4690 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { 4691 if (!rt6_is_dead(iter)) 4692 total += iter->fib6_nh->fib_nh_weight; 4693 } 4694 4695 return total; 4696 } 4697 4698 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total) 4699 { 4700 int upper_bound = -1; 4701 4702 if (!rt6_is_dead(rt)) { 4703 *weight += rt->fib6_nh->fib_nh_weight; 4704 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, 4705 total) - 1; 4706 } 4707 atomic_set(&rt->fib6_nh->fib_nh_upper_bound, upper_bound); 4708 } 4709 4710 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total) 4711 { 4712 struct fib6_info *iter; 4713 int weight = 0; 4714 4715 rt6_upper_bound_set(rt, &weight, total); 4716 4717 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4718 rt6_upper_bound_set(iter, &weight, total); 4719 } 4720 4721 void rt6_multipath_rebalance(struct fib6_info *rt) 4722 { 4723 struct fib6_info *first; 4724 int total; 4725 4726 /* In case the entire multipath route was marked for flushing, 4727 * then there is no need to rebalance upon the removal of every 4728 * sibling route. 4729 */ 4730 if (!rt->fib6_nsiblings || rt->should_flush) 4731 return; 4732 4733 /* During lookup routes are evaluated in order, so we need to 4734 * make sure upper bounds are assigned from the first sibling 4735 * onwards. 
4736 */ 4737 first = rt6_multipath_first_sibling(rt); 4738 if (WARN_ON_ONCE(!first)) 4739 return; 4740 4741 total = rt6_multipath_total_weight(first); 4742 rt6_multipath_upper_bound_set(first, total); 4743 } 4744 4745 static int fib6_ifup(struct fib6_info *rt, void *p_arg) 4746 { 4747 const struct arg_netdev_event *arg = p_arg; 4748 struct net *net = dev_net(arg->dev); 4749 4750 if (rt != net->ipv6.fib6_null_entry && !rt->nh && 4751 rt->fib6_nh->fib_nh_dev == arg->dev) { 4752 rt->fib6_nh->fib_nh_flags &= ~arg->nh_flags; 4753 fib6_update_sernum_upto_root(net, rt); 4754 rt6_multipath_rebalance(rt); 4755 } 4756 4757 return 0; 4758 } 4759 4760 void rt6_sync_up(struct net_device *dev, unsigned char nh_flags) 4761 { 4762 struct arg_netdev_event arg = { 4763 .dev = dev, 4764 { 4765 .nh_flags = nh_flags, 4766 }, 4767 }; 4768 4769 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev)) 4770 arg.nh_flags |= RTNH_F_LINKDOWN; 4771 4772 fib6_clean_all(dev_net(dev), fib6_ifup, &arg); 4773 } 4774 4775 /* only called for fib entries with inline fib6_nh */ 4776 static bool rt6_multipath_uses_dev(const struct fib6_info *rt, 4777 const struct net_device *dev) 4778 { 4779 struct fib6_info *iter; 4780 4781 if (rt->fib6_nh->fib_nh_dev == dev) 4782 return true; 4783 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4784 if (iter->fib6_nh->fib_nh_dev == dev) 4785 return true; 4786 4787 return false; 4788 } 4789 4790 static void rt6_multipath_flush(struct fib6_info *rt) 4791 { 4792 struct fib6_info *iter; 4793 4794 rt->should_flush = 1; 4795 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4796 iter->should_flush = 1; 4797 } 4798 4799 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt, 4800 const struct net_device *down_dev) 4801 { 4802 struct fib6_info *iter; 4803 unsigned int dead = 0; 4804 4805 if (rt->fib6_nh->fib_nh_dev == down_dev || 4806 rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD) 4807 dead++; 4808 list_for_each_entry(iter, &rt->fib6_siblings, 
fib6_siblings) 4809 if (iter->fib6_nh->fib_nh_dev == down_dev || 4810 iter->fib6_nh->fib_nh_flags & RTNH_F_DEAD) 4811 dead++; 4812 4813 return dead; 4814 } 4815 4816 static void rt6_multipath_nh_flags_set(struct fib6_info *rt, 4817 const struct net_device *dev, 4818 unsigned char nh_flags) 4819 { 4820 struct fib6_info *iter; 4821 4822 if (rt->fib6_nh->fib_nh_dev == dev) 4823 rt->fib6_nh->fib_nh_flags |= nh_flags; 4824 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4825 if (iter->fib6_nh->fib_nh_dev == dev) 4826 iter->fib6_nh->fib_nh_flags |= nh_flags; 4827 } 4828 4829 /* called with write lock held for table with rt */ 4830 static int fib6_ifdown(struct fib6_info *rt, void *p_arg) 4831 { 4832 const struct arg_netdev_event *arg = p_arg; 4833 const struct net_device *dev = arg->dev; 4834 struct net *net = dev_net(dev); 4835 4836 if (rt == net->ipv6.fib6_null_entry || rt->nh) 4837 return 0; 4838 4839 switch (arg->event) { 4840 case NETDEV_UNREGISTER: 4841 return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0; 4842 case NETDEV_DOWN: 4843 if (rt->should_flush) 4844 return -1; 4845 if (!rt->fib6_nsiblings) 4846 return rt->fib6_nh->fib_nh_dev == dev ? 
-1 : 0; 4847 if (rt6_multipath_uses_dev(rt, dev)) { 4848 unsigned int count; 4849 4850 count = rt6_multipath_dead_count(rt, dev); 4851 if (rt->fib6_nsiblings + 1 == count) { 4852 rt6_multipath_flush(rt); 4853 return -1; 4854 } 4855 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD | 4856 RTNH_F_LINKDOWN); 4857 fib6_update_sernum(net, rt); 4858 rt6_multipath_rebalance(rt); 4859 } 4860 return -2; 4861 case NETDEV_CHANGE: 4862 if (rt->fib6_nh->fib_nh_dev != dev || 4863 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) 4864 break; 4865 rt->fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN; 4866 rt6_multipath_rebalance(rt); 4867 break; 4868 } 4869 4870 return 0; 4871 } 4872 4873 void rt6_sync_down_dev(struct net_device *dev, unsigned long event) 4874 { 4875 struct arg_netdev_event arg = { 4876 .dev = dev, 4877 { 4878 .event = event, 4879 }, 4880 }; 4881 struct net *net = dev_net(dev); 4882 4883 if (net->ipv6.sysctl.skip_notify_on_dev_down) 4884 fib6_clean_all_skip_notify(net, fib6_ifdown, &arg); 4885 else 4886 fib6_clean_all(net, fib6_ifdown, &arg); 4887 } 4888 4889 void rt6_disable_ip(struct net_device *dev, unsigned long event) 4890 { 4891 rt6_sync_down_dev(dev, event); 4892 rt6_uncached_list_flush_dev(dev_net(dev), dev); 4893 neigh_ifdown(&nd_tbl, dev); 4894 } 4895 4896 struct rt6_mtu_change_arg { 4897 struct net_device *dev; 4898 unsigned int mtu; 4899 struct fib6_info *f6i; 4900 }; 4901 4902 static int fib6_nh_mtu_change(struct fib6_nh *nh, void *_arg) 4903 { 4904 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *)_arg; 4905 struct fib6_info *f6i = arg->f6i; 4906 4907 /* For administrative MTU increase, there is no way to discover 4908 * IPv6 PMTU increase, so PMTU increase should be updated here. 4909 * Since RFC 1981 doesn't include administrative MTU increase 4910 * update PMTU increase is a MUST. (i.e. 
jumbo frame) 4911 */ 4912 if (nh->fib_nh_dev == arg->dev) { 4913 struct inet6_dev *idev = __in6_dev_get(arg->dev); 4914 u32 mtu = f6i->fib6_pmtu; 4915 4916 if (mtu >= arg->mtu || 4917 (mtu < arg->mtu && mtu == idev->cnf.mtu6)) 4918 fib6_metric_set(f6i, RTAX_MTU, arg->mtu); 4919 4920 spin_lock_bh(&rt6_exception_lock); 4921 rt6_exceptions_update_pmtu(idev, nh, arg->mtu); 4922 spin_unlock_bh(&rt6_exception_lock); 4923 } 4924 4925 return 0; 4926 } 4927 4928 static int rt6_mtu_change_route(struct fib6_info *f6i, void *p_arg) 4929 { 4930 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; 4931 struct inet6_dev *idev; 4932 4933 /* In IPv6 pmtu discovery is not optional, 4934 so that RTAX_MTU lock cannot disable it. 4935 We still use this lock to block changes 4936 caused by addrconf/ndisc. 4937 */ 4938 4939 idev = __in6_dev_get(arg->dev); 4940 if (!idev) 4941 return 0; 4942 4943 if (fib6_metric_locked(f6i, RTAX_MTU)) 4944 return 0; 4945 4946 arg->f6i = f6i; 4947 if (f6i->nh) { 4948 /* fib6_nh_mtu_change only returns 0, so this is safe */ 4949 return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_mtu_change, 4950 arg); 4951 } 4952 4953 return fib6_nh_mtu_change(f6i->fib6_nh, arg); 4954 } 4955 4956 void rt6_mtu_change(struct net_device *dev, unsigned int mtu) 4957 { 4958 struct rt6_mtu_change_arg arg = { 4959 .dev = dev, 4960 .mtu = mtu, 4961 }; 4962 4963 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg); 4964 } 4965 4966 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { 4967 [RTA_UNSPEC] = { .strict_start_type = RTA_DPORT + 1 }, 4968 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) }, 4969 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) }, 4970 [RTA_OIF] = { .type = NLA_U32 }, 4971 [RTA_IIF] = { .type = NLA_U32 }, 4972 [RTA_PRIORITY] = { .type = NLA_U32 }, 4973 [RTA_METRICS] = { .type = NLA_NESTED }, 4974 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, 4975 [RTA_PREF] = { .type = NLA_U8 }, 4976 [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, 
4977 [RTA_ENCAP] = { .type = NLA_NESTED }, 4978 [RTA_EXPIRES] = { .type = NLA_U32 }, 4979 [RTA_UID] = { .type = NLA_U32 }, 4980 [RTA_MARK] = { .type = NLA_U32 }, 4981 [RTA_TABLE] = { .type = NLA_U32 }, 4982 [RTA_IP_PROTO] = { .type = NLA_U8 }, 4983 [RTA_SPORT] = { .type = NLA_U16 }, 4984 [RTA_DPORT] = { .type = NLA_U16 }, 4985 [RTA_NH_ID] = { .type = NLA_U32 }, 4986 }; 4987 4988 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, 4989 struct fib6_config *cfg, 4990 struct netlink_ext_ack *extack) 4991 { 4992 struct rtmsg *rtm; 4993 struct nlattr *tb[RTA_MAX+1]; 4994 unsigned int pref; 4995 int err; 4996 4997 err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, 4998 rtm_ipv6_policy, extack); 4999 if (err < 0) 5000 goto errout; 5001 5002 err = -EINVAL; 5003 rtm = nlmsg_data(nlh); 5004 5005 *cfg = (struct fib6_config){ 5006 .fc_table = rtm->rtm_table, 5007 .fc_dst_len = rtm->rtm_dst_len, 5008 .fc_src_len = rtm->rtm_src_len, 5009 .fc_flags = RTF_UP, 5010 .fc_protocol = rtm->rtm_protocol, 5011 .fc_type = rtm->rtm_type, 5012 5013 .fc_nlinfo.portid = NETLINK_CB(skb).portid, 5014 .fc_nlinfo.nlh = nlh, 5015 .fc_nlinfo.nl_net = sock_net(skb->sk), 5016 }; 5017 5018 if (rtm->rtm_type == RTN_UNREACHABLE || 5019 rtm->rtm_type == RTN_BLACKHOLE || 5020 rtm->rtm_type == RTN_PROHIBIT || 5021 rtm->rtm_type == RTN_THROW) 5022 cfg->fc_flags |= RTF_REJECT; 5023 5024 if (rtm->rtm_type == RTN_LOCAL) 5025 cfg->fc_flags |= RTF_LOCAL; 5026 5027 if (rtm->rtm_flags & RTM_F_CLONED) 5028 cfg->fc_flags |= RTF_CACHE; 5029 5030 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK); 5031 5032 if (tb[RTA_NH_ID]) { 5033 if (tb[RTA_GATEWAY] || tb[RTA_OIF] || 5034 tb[RTA_MULTIPATH] || tb[RTA_ENCAP]) { 5035 NL_SET_ERR_MSG(extack, 5036 "Nexthop specification and nexthop id are mutually exclusive"); 5037 goto errout; 5038 } 5039 cfg->fc_nh_id = nla_get_u32(tb[RTA_NH_ID]); 5040 } 5041 5042 if (tb[RTA_GATEWAY]) { 5043 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); 5044 
cfg->fc_flags |= RTF_GATEWAY; 5045 } 5046 if (tb[RTA_VIA]) { 5047 NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute"); 5048 goto errout; 5049 } 5050 5051 if (tb[RTA_DST]) { 5052 int plen = (rtm->rtm_dst_len + 7) >> 3; 5053 5054 if (nla_len(tb[RTA_DST]) < plen) 5055 goto errout; 5056 5057 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen); 5058 } 5059 5060 if (tb[RTA_SRC]) { 5061 int plen = (rtm->rtm_src_len + 7) >> 3; 5062 5063 if (nla_len(tb[RTA_SRC]) < plen) 5064 goto errout; 5065 5066 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen); 5067 } 5068 5069 if (tb[RTA_PREFSRC]) 5070 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]); 5071 5072 if (tb[RTA_OIF]) 5073 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]); 5074 5075 if (tb[RTA_PRIORITY]) 5076 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]); 5077 5078 if (tb[RTA_METRICS]) { 5079 cfg->fc_mx = nla_data(tb[RTA_METRICS]); 5080 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]); 5081 } 5082 5083 if (tb[RTA_TABLE]) 5084 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]); 5085 5086 if (tb[RTA_MULTIPATH]) { 5087 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]); 5088 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); 5089 5090 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp, 5091 cfg->fc_mp_len, extack); 5092 if (err < 0) 5093 goto errout; 5094 } 5095 5096 if (tb[RTA_PREF]) { 5097 pref = nla_get_u8(tb[RTA_PREF]); 5098 if (pref != ICMPV6_ROUTER_PREF_LOW && 5099 pref != ICMPV6_ROUTER_PREF_HIGH) 5100 pref = ICMPV6_ROUTER_PREF_MEDIUM; 5101 cfg->fc_flags |= RTF_PREF(pref); 5102 } 5103 5104 if (tb[RTA_ENCAP]) 5105 cfg->fc_encap = tb[RTA_ENCAP]; 5106 5107 if (tb[RTA_ENCAP_TYPE]) { 5108 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); 5109 5110 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack); 5111 if (err < 0) 5112 goto errout; 5113 } 5114 5115 if (tb[RTA_EXPIRES]) { 5116 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ); 5117 5118 if (addrconf_finite_timeout(timeout)) { 5119 cfg->fc_expires = 
jiffies_to_clock_t(timeout * HZ); 5120 cfg->fc_flags |= RTF_EXPIRES; 5121 } 5122 } 5123 5124 err = 0; 5125 errout: 5126 return err; 5127 } 5128 5129 struct rt6_nh { 5130 struct fib6_info *fib6_info; 5131 struct fib6_config r_cfg; 5132 struct list_head next; 5133 }; 5134 5135 static int ip6_route_info_append(struct net *net, 5136 struct list_head *rt6_nh_list, 5137 struct fib6_info *rt, 5138 struct fib6_config *r_cfg) 5139 { 5140 struct rt6_nh *nh; 5141 int err = -EEXIST; 5142 5143 list_for_each_entry(nh, rt6_nh_list, next) { 5144 /* check if fib6_info already exists */ 5145 if (rt6_duplicate_nexthop(nh->fib6_info, rt)) 5146 return err; 5147 } 5148 5149 nh = kzalloc(sizeof(*nh), GFP_KERNEL); 5150 if (!nh) 5151 return -ENOMEM; 5152 nh->fib6_info = rt; 5153 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); 5154 list_add_tail(&nh->next, rt6_nh_list); 5155 5156 return 0; 5157 } 5158 5159 static void ip6_route_mpath_notify(struct fib6_info *rt, 5160 struct fib6_info *rt_last, 5161 struct nl_info *info, 5162 __u16 nlflags) 5163 { 5164 /* if this is an APPEND route, then rt points to the first route 5165 * inserted and rt_last points to last route inserted. Userspace 5166 * wants a consistent dump of the route which starts at the first 5167 * nexthop. 
Since sibling routes are always added at the end of 5168 * the list, find the first sibling of the last route appended 5169 */ 5170 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) { 5171 rt = list_first_entry(&rt_last->fib6_siblings, 5172 struct fib6_info, 5173 fib6_siblings); 5174 } 5175 5176 if (rt) 5177 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); 5178 } 5179 5180 static bool ip6_route_mpath_should_notify(const struct fib6_info *rt) 5181 { 5182 bool rt_can_ecmp = rt6_qualify_for_ecmp(rt); 5183 bool should_notify = false; 5184 struct fib6_info *leaf; 5185 struct fib6_node *fn; 5186 5187 rcu_read_lock(); 5188 fn = rcu_dereference(rt->fib6_node); 5189 if (!fn) 5190 goto out; 5191 5192 leaf = rcu_dereference(fn->leaf); 5193 if (!leaf) 5194 goto out; 5195 5196 if (rt == leaf || 5197 (rt_can_ecmp && rt->fib6_metric == leaf->fib6_metric && 5198 rt6_qualify_for_ecmp(leaf))) 5199 should_notify = true; 5200 out: 5201 rcu_read_unlock(); 5202 5203 return should_notify; 5204 } 5205 5206 static int ip6_route_multipath_add(struct fib6_config *cfg, 5207 struct netlink_ext_ack *extack) 5208 { 5209 struct fib6_info *rt_notif = NULL, *rt_last = NULL; 5210 struct nl_info *info = &cfg->fc_nlinfo; 5211 struct fib6_config r_cfg; 5212 struct rtnexthop *rtnh; 5213 struct fib6_info *rt; 5214 struct rt6_nh *err_nh; 5215 struct rt6_nh *nh, *nh_safe; 5216 __u16 nlflags; 5217 int remaining; 5218 int attrlen; 5219 int err = 1; 5220 int nhn = 0; 5221 int replace = (cfg->fc_nlinfo.nlh && 5222 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); 5223 LIST_HEAD(rt6_nh_list); 5224 5225 nlflags = replace ? 
NLM_F_REPLACE : NLM_F_CREATE; 5226 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND) 5227 nlflags |= NLM_F_APPEND; 5228 5229 remaining = cfg->fc_mp_len; 5230 rtnh = (struct rtnexthop *)cfg->fc_mp; 5231 5232 /* Parse a Multipath Entry and build a list (rt6_nh_list) of 5233 * fib6_info structs per nexthop 5234 */ 5235 while (rtnh_ok(rtnh, remaining)) { 5236 memcpy(&r_cfg, cfg, sizeof(*cfg)); 5237 if (rtnh->rtnh_ifindex) 5238 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 5239 5240 attrlen = rtnh_attrlen(rtnh); 5241 if (attrlen > 0) { 5242 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 5243 5244 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 5245 if (nla) { 5246 r_cfg.fc_gateway = nla_get_in6_addr(nla); 5247 r_cfg.fc_flags |= RTF_GATEWAY; 5248 } 5249 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); 5250 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); 5251 if (nla) 5252 r_cfg.fc_encap_type = nla_get_u16(nla); 5253 } 5254 5255 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK); 5256 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack); 5257 if (IS_ERR(rt)) { 5258 err = PTR_ERR(rt); 5259 rt = NULL; 5260 goto cleanup; 5261 } 5262 if (!rt6_qualify_for_ecmp(rt)) { 5263 err = -EINVAL; 5264 NL_SET_ERR_MSG(extack, 5265 "Device only routes can not be added for IPv6 using the multipath API."); 5266 fib6_info_release(rt); 5267 goto cleanup; 5268 } 5269 5270 rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1; 5271 5272 err = ip6_route_info_append(info->nl_net, &rt6_nh_list, 5273 rt, &r_cfg); 5274 if (err) { 5275 fib6_info_release(rt); 5276 goto cleanup; 5277 } 5278 5279 rtnh = rtnh_next(rtnh, &remaining); 5280 } 5281 5282 if (list_empty(&rt6_nh_list)) { 5283 NL_SET_ERR_MSG(extack, 5284 "Invalid nexthop configuration - no valid nexthops"); 5285 return -EINVAL; 5286 } 5287 5288 /* for add and replace send one notification with all nexthops. 
5289 * Skip the notification in fib6_add_rt2node and send one with 5290 * the full route when done 5291 */ 5292 info->skip_notify = 1; 5293 5294 /* For add and replace, send one notification with all nexthops. For 5295 * append, send one notification with all appended nexthops. 5296 */ 5297 info->skip_notify_kernel = 1; 5298 5299 err_nh = NULL; 5300 list_for_each_entry(nh, &rt6_nh_list, next) { 5301 err = __ip6_ins_rt(nh->fib6_info, info, extack); 5302 fib6_info_release(nh->fib6_info); 5303 5304 if (!err) { 5305 /* save reference to last route successfully inserted */ 5306 rt_last = nh->fib6_info; 5307 5308 /* save reference to first route for notification */ 5309 if (!rt_notif) 5310 rt_notif = nh->fib6_info; 5311 } 5312 5313 /* nh->fib6_info is used or freed at this point, reset to NULL*/ 5314 nh->fib6_info = NULL; 5315 if (err) { 5316 if (replace && nhn) 5317 NL_SET_ERR_MSG_MOD(extack, 5318 "multipath route replace failed (check consistency of installed routes)"); 5319 err_nh = nh; 5320 goto add_errout; 5321 } 5322 5323 /* Because each route is added like a single route we remove 5324 * these flags after the first nexthop: if there is a collision, 5325 * we have already failed to add the first nexthop: 5326 * fib6_add_rt2node() has rejected it; when replacing, old 5327 * nexthops have been replaced by first new, the rest should 5328 * be added to it. 5329 */ 5330 if (cfg->fc_nlinfo.nlh) { 5331 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | 5332 NLM_F_REPLACE); 5333 cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_CREATE; 5334 } 5335 nhn++; 5336 } 5337 5338 /* An in-kernel notification should only be sent in case the new 5339 * multipath route is added as the first route in the node, or if 5340 * it was appended to it. We pass 'rt_notif' since it is the first 5341 * sibling and might allow us to skip some checks in the replace case. 
5342 */ 5343 if (ip6_route_mpath_should_notify(rt_notif)) { 5344 enum fib_event_type fib_event; 5345 5346 if (rt_notif->fib6_nsiblings != nhn - 1) 5347 fib_event = FIB_EVENT_ENTRY_APPEND; 5348 else 5349 fib_event = FIB_EVENT_ENTRY_REPLACE; 5350 5351 err = call_fib6_multipath_entry_notifiers(info->nl_net, 5352 fib_event, rt_notif, 5353 nhn - 1, extack); 5354 if (err) { 5355 /* Delete all the siblings that were just added */ 5356 err_nh = NULL; 5357 goto add_errout; 5358 } 5359 } 5360 5361 /* success ... tell user about new route */ 5362 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 5363 goto cleanup; 5364 5365 add_errout: 5366 /* send notification for routes that were added so that 5367 * the delete notifications sent by ip6_route_del are 5368 * coherent 5369 */ 5370 if (rt_notif) 5371 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 5372 5373 /* Delete routes that were already added */ 5374 list_for_each_entry(nh, &rt6_nh_list, next) { 5375 if (err_nh == nh) 5376 break; 5377 ip6_route_del(&nh->r_cfg, extack); 5378 } 5379 5380 cleanup: 5381 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { 5382 if (nh->fib6_info) 5383 fib6_info_release(nh->fib6_info); 5384 list_del(&nh->next); 5385 kfree(nh); 5386 } 5387 5388 return err; 5389 } 5390 5391 static int ip6_route_multipath_del(struct fib6_config *cfg, 5392 struct netlink_ext_ack *extack) 5393 { 5394 struct fib6_config r_cfg; 5395 struct rtnexthop *rtnh; 5396 int last_err = 0; 5397 int remaining; 5398 int attrlen; 5399 int err; 5400 5401 remaining = cfg->fc_mp_len; 5402 rtnh = (struct rtnexthop *)cfg->fc_mp; 5403 5404 /* Parse a Multipath Entry */ 5405 while (rtnh_ok(rtnh, remaining)) { 5406 memcpy(&r_cfg, cfg, sizeof(*cfg)); 5407 if (rtnh->rtnh_ifindex) 5408 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 5409 5410 attrlen = rtnh_attrlen(rtnh); 5411 if (attrlen > 0) { 5412 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 5413 5414 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 5415 if (nla) { 5416 
nla_memcpy(&r_cfg.fc_gateway, nla, 16); 5417 r_cfg.fc_flags |= RTF_GATEWAY; 5418 } 5419 } 5420 err = ip6_route_del(&r_cfg, extack); 5421 if (err) 5422 last_err = err; 5423 5424 rtnh = rtnh_next(rtnh, &remaining); 5425 } 5426 5427 return last_err; 5428 } 5429 5430 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, 5431 struct netlink_ext_ack *extack) 5432 { 5433 struct fib6_config cfg; 5434 int err; 5435 5436 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 5437 if (err < 0) 5438 return err; 5439 5440 if (cfg.fc_nh_id && 5441 !nexthop_find_by_id(sock_net(skb->sk), cfg.fc_nh_id)) { 5442 NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); 5443 return -EINVAL; 5444 } 5445 5446 if (cfg.fc_mp) 5447 return ip6_route_multipath_del(&cfg, extack); 5448 else { 5449 cfg.fc_delete_all_nh = 1; 5450 return ip6_route_del(&cfg, extack); 5451 } 5452 } 5453 5454 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, 5455 struct netlink_ext_ack *extack) 5456 { 5457 struct fib6_config cfg; 5458 int err; 5459 5460 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 5461 if (err < 0) 5462 return err; 5463 5464 if (cfg.fc_metric == 0) 5465 cfg.fc_metric = IP6_RT_PRIO_USER; 5466 5467 if (cfg.fc_mp) 5468 return ip6_route_multipath_add(&cfg, extack); 5469 else 5470 return ip6_route_add(&cfg, GFP_KERNEL, extack); 5471 } 5472 5473 /* add the overhead of this fib6_nh to nexthop_len */ 5474 static int rt6_nh_nlmsg_size(struct fib6_nh *nh, void *arg) 5475 { 5476 int *nexthop_len = arg; 5477 5478 *nexthop_len += nla_total_size(0) /* RTA_MULTIPATH */ 5479 + NLA_ALIGN(sizeof(struct rtnexthop)) 5480 + nla_total_size(16); /* RTA_GATEWAY */ 5481 5482 if (nh->fib_nh_lws) { 5483 /* RTA_ENCAP_TYPE */ 5484 *nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws); 5485 /* RTA_ENCAP */ 5486 *nexthop_len += nla_total_size(2); 5487 } 5488 5489 return 0; 5490 } 5491 5492 static size_t rt6_nlmsg_size(struct fib6_info *f6i) 5493 { 5494 int nexthop_len; 5495 5496 if (f6i->nh) 
{ 5497 nexthop_len = nla_total_size(4); /* RTA_NH_ID */ 5498 nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_nlmsg_size, 5499 &nexthop_len); 5500 } else { 5501 struct fib6_nh *nh = f6i->fib6_nh; 5502 5503 nexthop_len = 0; 5504 if (f6i->fib6_nsiblings) { 5505 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ 5506 + NLA_ALIGN(sizeof(struct rtnexthop)) 5507 + nla_total_size(16) /* RTA_GATEWAY */ 5508 + lwtunnel_get_encap_size(nh->fib_nh_lws); 5509 5510 nexthop_len *= f6i->fib6_nsiblings; 5511 } 5512 nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws); 5513 } 5514 5515 return NLMSG_ALIGN(sizeof(struct rtmsg)) 5516 + nla_total_size(16) /* RTA_SRC */ 5517 + nla_total_size(16) /* RTA_DST */ 5518 + nla_total_size(16) /* RTA_GATEWAY */ 5519 + nla_total_size(16) /* RTA_PREFSRC */ 5520 + nla_total_size(4) /* RTA_TABLE */ 5521 + nla_total_size(4) /* RTA_IIF */ 5522 + nla_total_size(4) /* RTA_OIF */ 5523 + nla_total_size(4) /* RTA_PRIORITY */ 5524 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */ 5525 + nla_total_size(sizeof(struct rta_cacheinfo)) 5526 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ 5527 + nla_total_size(1) /* RTA_PREF */ 5528 + nexthop_len; 5529 } 5530 5531 static int rt6_fill_node_nexthop(struct sk_buff *skb, struct nexthop *nh, 5532 unsigned char *flags) 5533 { 5534 if (nexthop_is_multipath(nh)) { 5535 struct nlattr *mp; 5536 5537 mp = nla_nest_start_noflag(skb, RTA_MULTIPATH); 5538 if (!mp) 5539 goto nla_put_failure; 5540 5541 if (nexthop_mpath_fill_node(skb, nh, AF_INET6)) 5542 goto nla_put_failure; 5543 5544 nla_nest_end(skb, mp); 5545 } else { 5546 struct fib6_nh *fib6_nh; 5547 5548 fib6_nh = nexthop_fib6_nh(nh); 5549 if (fib_nexthop_info(skb, &fib6_nh->nh_common, AF_INET6, 5550 flags, false) < 0) 5551 goto nla_put_failure; 5552 } 5553 5554 return 0; 5555 5556 nla_put_failure: 5557 return -EMSGSIZE; 5558 } 5559 5560 static int rt6_fill_node(struct net *net, struct sk_buff *skb, 5561 struct fib6_info *rt, struct dst_entry *dst, 5562 struct in6_addr 
*dest, struct in6_addr *src, 5563 int iif, int type, u32 portid, u32 seq, 5564 unsigned int flags) 5565 { 5566 struct rt6_info *rt6 = (struct rt6_info *)dst; 5567 struct rt6key *rt6_dst, *rt6_src; 5568 u32 *pmetrics, table, rt6_flags; 5569 unsigned char nh_flags = 0; 5570 struct nlmsghdr *nlh; 5571 struct rtmsg *rtm; 5572 long expires = 0; 5573 5574 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags); 5575 if (!nlh) 5576 return -EMSGSIZE; 5577 5578 if (rt6) { 5579 rt6_dst = &rt6->rt6i_dst; 5580 rt6_src = &rt6->rt6i_src; 5581 rt6_flags = rt6->rt6i_flags; 5582 } else { 5583 rt6_dst = &rt->fib6_dst; 5584 rt6_src = &rt->fib6_src; 5585 rt6_flags = rt->fib6_flags; 5586 } 5587 5588 rtm = nlmsg_data(nlh); 5589 rtm->rtm_family = AF_INET6; 5590 rtm->rtm_dst_len = rt6_dst->plen; 5591 rtm->rtm_src_len = rt6_src->plen; 5592 rtm->rtm_tos = 0; 5593 if (rt->fib6_table) 5594 table = rt->fib6_table->tb6_id; 5595 else 5596 table = RT6_TABLE_UNSPEC; 5597 rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT; 5598 if (nla_put_u32(skb, RTA_TABLE, table)) 5599 goto nla_put_failure; 5600 5601 rtm->rtm_type = rt->fib6_type; 5602 rtm->rtm_flags = 0; 5603 rtm->rtm_scope = RT_SCOPE_UNIVERSE; 5604 rtm->rtm_protocol = rt->fib6_protocol; 5605 5606 if (rt6_flags & RTF_CACHE) 5607 rtm->rtm_flags |= RTM_F_CLONED; 5608 5609 if (dest) { 5610 if (nla_put_in6_addr(skb, RTA_DST, dest)) 5611 goto nla_put_failure; 5612 rtm->rtm_dst_len = 128; 5613 } else if (rtm->rtm_dst_len) 5614 if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr)) 5615 goto nla_put_failure; 5616 #ifdef CONFIG_IPV6_SUBTREES 5617 if (src) { 5618 if (nla_put_in6_addr(skb, RTA_SRC, src)) 5619 goto nla_put_failure; 5620 rtm->rtm_src_len = 128; 5621 } else if (rtm->rtm_src_len && 5622 nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr)) 5623 goto nla_put_failure; 5624 #endif 5625 if (iif) { 5626 #ifdef CONFIG_IPV6_MROUTE 5627 if (ipv6_addr_is_multicast(&rt6_dst->addr)) { 5628 int err = ip6mr_get_route(net, skb, rtm, portid); 5629 5630 if 
(err == 0) 5631 return 0; 5632 if (err < 0) 5633 goto nla_put_failure; 5634 } else 5635 #endif 5636 if (nla_put_u32(skb, RTA_IIF, iif)) 5637 goto nla_put_failure; 5638 } else if (dest) { 5639 struct in6_addr saddr_buf; 5640 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 && 5641 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 5642 goto nla_put_failure; 5643 } 5644 5645 if (rt->fib6_prefsrc.plen) { 5646 struct in6_addr saddr_buf; 5647 saddr_buf = rt->fib6_prefsrc.addr; 5648 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 5649 goto nla_put_failure; 5650 } 5651 5652 pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics; 5653 if (rtnetlink_put_metrics(skb, pmetrics) < 0) 5654 goto nla_put_failure; 5655 5656 if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric)) 5657 goto nla_put_failure; 5658 5659 /* For multipath routes, walk the siblings list and add 5660 * each as a nexthop within RTA_MULTIPATH. 5661 */ 5662 if (rt6) { 5663 if (rt6_flags & RTF_GATEWAY && 5664 nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway)) 5665 goto nla_put_failure; 5666 5667 if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex)) 5668 goto nla_put_failure; 5669 5670 if (dst->lwtstate && 5671 lwtunnel_fill_encap(skb, dst->lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0) 5672 goto nla_put_failure; 5673 } else if (rt->fib6_nsiblings) { 5674 struct fib6_info *sibling, *next_sibling; 5675 struct nlattr *mp; 5676 5677 mp = nla_nest_start_noflag(skb, RTA_MULTIPATH); 5678 if (!mp) 5679 goto nla_put_failure; 5680 5681 if (fib_add_nexthop(skb, &rt->fib6_nh->nh_common, 5682 rt->fib6_nh->fib_nh_weight, AF_INET6, 5683 0) < 0) 5684 goto nla_put_failure; 5685 5686 list_for_each_entry_safe(sibling, next_sibling, 5687 &rt->fib6_siblings, fib6_siblings) { 5688 if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common, 5689 sibling->fib6_nh->fib_nh_weight, 5690 AF_INET6, 0) < 0) 5691 goto nla_put_failure; 5692 } 5693 5694 nla_nest_end(skb, mp); 5695 } else if (rt->nh) { 5696 if 
(nla_put_u32(skb, RTA_NH_ID, rt->nh->id)) 5697 goto nla_put_failure; 5698 5699 if (nexthop_is_blackhole(rt->nh)) 5700 rtm->rtm_type = RTN_BLACKHOLE; 5701 5702 if (net->ipv4.sysctl_nexthop_compat_mode && 5703 rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0) 5704 goto nla_put_failure; 5705 5706 rtm->rtm_flags |= nh_flags; 5707 } else { 5708 if (fib_nexthop_info(skb, &rt->fib6_nh->nh_common, AF_INET6, 5709 &nh_flags, false) < 0) 5710 goto nla_put_failure; 5711 5712 rtm->rtm_flags |= nh_flags; 5713 } 5714 5715 if (rt6_flags & RTF_EXPIRES) { 5716 expires = dst ? dst->expires : rt->expires; 5717 expires -= jiffies; 5718 } 5719 5720 if (!dst) { 5721 if (rt->offload) 5722 rtm->rtm_flags |= RTM_F_OFFLOAD; 5723 if (rt->trap) 5724 rtm->rtm_flags |= RTM_F_TRAP; 5725 if (rt->offload_failed) 5726 rtm->rtm_flags |= RTM_F_OFFLOAD_FAILED; 5727 } 5728 5729 if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0) 5730 goto nla_put_failure; 5731 5732 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags))) 5733 goto nla_put_failure; 5734 5735 5736 nlmsg_end(skb, nlh); 5737 return 0; 5738 5739 nla_put_failure: 5740 nlmsg_cancel(skb, nlh); 5741 return -EMSGSIZE; 5742 } 5743 5744 static int fib6_info_nh_uses_dev(struct fib6_nh *nh, void *arg) 5745 { 5746 const struct net_device *dev = arg; 5747 5748 if (nh->fib_nh_dev == dev) 5749 return 1; 5750 5751 return 0; 5752 } 5753 5754 static bool fib6_info_uses_dev(const struct fib6_info *f6i, 5755 const struct net_device *dev) 5756 { 5757 if (f6i->nh) { 5758 struct net_device *_dev = (struct net_device *)dev; 5759 5760 return !!nexthop_for_each_fib6_nh(f6i->nh, 5761 fib6_info_nh_uses_dev, 5762 _dev); 5763 } 5764 5765 if (f6i->fib6_nh->fib_nh_dev == dev) 5766 return true; 5767 5768 if (f6i->fib6_nsiblings) { 5769 struct fib6_info *sibling, *next_sibling; 5770 5771 list_for_each_entry_safe(sibling, next_sibling, 5772 &f6i->fib6_siblings, fib6_siblings) { 5773 if (sibling->fib6_nh->fib_nh_dev == dev) 5774 return true; 5775 } 
5776 } 5777 5778 return false; 5779 } 5780 5781 struct fib6_nh_exception_dump_walker { 5782 struct rt6_rtnl_dump_arg *dump; 5783 struct fib6_info *rt; 5784 unsigned int flags; 5785 unsigned int skip; 5786 unsigned int count; 5787 }; 5788 5789 static int rt6_nh_dump_exceptions(struct fib6_nh *nh, void *arg) 5790 { 5791 struct fib6_nh_exception_dump_walker *w = arg; 5792 struct rt6_rtnl_dump_arg *dump = w->dump; 5793 struct rt6_exception_bucket *bucket; 5794 struct rt6_exception *rt6_ex; 5795 int i, err; 5796 5797 bucket = fib6_nh_get_excptn_bucket(nh, NULL); 5798 if (!bucket) 5799 return 0; 5800 5801 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { 5802 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { 5803 if (w->skip) { 5804 w->skip--; 5805 continue; 5806 } 5807 5808 /* Expiration of entries doesn't bump sernum, insertion 5809 * does. Removal is triggered by insertion, so we can 5810 * rely on the fact that if entries change between two 5811 * partial dumps, this node is scanned again completely, 5812 * see rt6_insert_exception() and fib6_dump_table(). 5813 * 5814 * Count expired entries we go through as handled 5815 * entries that we'll skip next time, in case of partial 5816 * node dump. Otherwise, if entries expire meanwhile, 5817 * we'll skip the wrong amount. 
5818 */ 5819 if (rt6_check_expired(rt6_ex->rt6i)) { 5820 w->count++; 5821 continue; 5822 } 5823 5824 err = rt6_fill_node(dump->net, dump->skb, w->rt, 5825 &rt6_ex->rt6i->dst, NULL, NULL, 0, 5826 RTM_NEWROUTE, 5827 NETLINK_CB(dump->cb->skb).portid, 5828 dump->cb->nlh->nlmsg_seq, w->flags); 5829 if (err) 5830 return err; 5831 5832 w->count++; 5833 } 5834 bucket++; 5835 } 5836 5837 return 0; 5838 } 5839 5840 /* Return -1 if done with node, number of handled routes on partial dump */ 5841 int rt6_dump_route(struct fib6_info *rt, void *p_arg, unsigned int skip) 5842 { 5843 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; 5844 struct fib_dump_filter *filter = &arg->filter; 5845 unsigned int flags = NLM_F_MULTI; 5846 struct net *net = arg->net; 5847 int count = 0; 5848 5849 if (rt == net->ipv6.fib6_null_entry) 5850 return -1; 5851 5852 if ((filter->flags & RTM_F_PREFIX) && 5853 !(rt->fib6_flags & RTF_PREFIX_RT)) { 5854 /* success since this is not a prefix route */ 5855 return -1; 5856 } 5857 if (filter->filter_set && 5858 ((filter->rt_type && rt->fib6_type != filter->rt_type) || 5859 (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) || 5860 (filter->protocol && rt->fib6_protocol != filter->protocol))) { 5861 return -1; 5862 } 5863 5864 if (filter->filter_set || 5865 !filter->dump_routes || !filter->dump_exceptions) { 5866 flags |= NLM_F_DUMP_FILTERED; 5867 } 5868 5869 if (filter->dump_routes) { 5870 if (skip) { 5871 skip--; 5872 } else { 5873 if (rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 5874 0, RTM_NEWROUTE, 5875 NETLINK_CB(arg->cb->skb).portid, 5876 arg->cb->nlh->nlmsg_seq, flags)) { 5877 return 0; 5878 } 5879 count++; 5880 } 5881 } 5882 5883 if (filter->dump_exceptions) { 5884 struct fib6_nh_exception_dump_walker w = { .dump = arg, 5885 .rt = rt, 5886 .flags = flags, 5887 .skip = skip, 5888 .count = 0 }; 5889 int err; 5890 5891 rcu_read_lock(); 5892 if (rt->nh) { 5893 err = nexthop_for_each_fib6_nh(rt->nh, 5894 rt6_nh_dump_exceptions, 
5895 &w); 5896 } else { 5897 err = rt6_nh_dump_exceptions(rt->fib6_nh, &w); 5898 } 5899 rcu_read_unlock(); 5900 5901 if (err) 5902 return count += w.count; 5903 } 5904 5905 return -1; 5906 } 5907 5908 static int inet6_rtm_valid_getroute_req(struct sk_buff *skb, 5909 const struct nlmsghdr *nlh, 5910 struct nlattr **tb, 5911 struct netlink_ext_ack *extack) 5912 { 5913 struct rtmsg *rtm; 5914 int i, err; 5915 5916 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { 5917 NL_SET_ERR_MSG_MOD(extack, 5918 "Invalid header for get route request"); 5919 return -EINVAL; 5920 } 5921 5922 if (!netlink_strict_get_check(skb)) 5923 return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, 5924 rtm_ipv6_policy, extack); 5925 5926 rtm = nlmsg_data(nlh); 5927 if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) || 5928 (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) || 5929 rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope || 5930 rtm->rtm_type) { 5931 NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request"); 5932 return -EINVAL; 5933 } 5934 if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) { 5935 NL_SET_ERR_MSG_MOD(extack, 5936 "Invalid flags for get route request"); 5937 return -EINVAL; 5938 } 5939 5940 err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX, 5941 rtm_ipv6_policy, extack); 5942 if (err) 5943 return err; 5944 5945 if ((tb[RTA_SRC] && !rtm->rtm_src_len) || 5946 (tb[RTA_DST] && !rtm->rtm_dst_len)) { 5947 NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6"); 5948 return -EINVAL; 5949 } 5950 5951 for (i = 0; i <= RTA_MAX; i++) { 5952 if (!tb[i]) 5953 continue; 5954 5955 switch (i) { 5956 case RTA_SRC: 5957 case RTA_DST: 5958 case RTA_IIF: 5959 case RTA_OIF: 5960 case RTA_MARK: 5961 case RTA_UID: 5962 case RTA_SPORT: 5963 case RTA_DPORT: 5964 case RTA_IP_PROTO: 5965 break; 5966 default: 5967 NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request"); 5968 return -EINVAL; 5969 } 5970 } 5971 5972 return 0; 
5973 } 5974 5975 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, 5976 struct netlink_ext_ack *extack) 5977 { 5978 struct net *net = sock_net(in_skb->sk); 5979 struct nlattr *tb[RTA_MAX+1]; 5980 int err, iif = 0, oif = 0; 5981 struct fib6_info *from; 5982 struct dst_entry *dst; 5983 struct rt6_info *rt; 5984 struct sk_buff *skb; 5985 struct rtmsg *rtm; 5986 struct flowi6 fl6 = {}; 5987 bool fibmatch; 5988 5989 err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack); 5990 if (err < 0) 5991 goto errout; 5992 5993 err = -EINVAL; 5994 rtm = nlmsg_data(nlh); 5995 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0); 5996 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH); 5997 5998 if (tb[RTA_SRC]) { 5999 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr)) 6000 goto errout; 6001 6002 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]); 6003 } 6004 6005 if (tb[RTA_DST]) { 6006 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr)) 6007 goto errout; 6008 6009 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]); 6010 } 6011 6012 if (tb[RTA_IIF]) 6013 iif = nla_get_u32(tb[RTA_IIF]); 6014 6015 if (tb[RTA_OIF]) 6016 oif = nla_get_u32(tb[RTA_OIF]); 6017 6018 if (tb[RTA_MARK]) 6019 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]); 6020 6021 if (tb[RTA_UID]) 6022 fl6.flowi6_uid = make_kuid(current_user_ns(), 6023 nla_get_u32(tb[RTA_UID])); 6024 else 6025 fl6.flowi6_uid = iif ? 
INVALID_UID : current_uid(); 6026 6027 if (tb[RTA_SPORT]) 6028 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]); 6029 6030 if (tb[RTA_DPORT]) 6031 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]); 6032 6033 if (tb[RTA_IP_PROTO]) { 6034 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO], 6035 &fl6.flowi6_proto, AF_INET6, 6036 extack); 6037 if (err) 6038 goto errout; 6039 } 6040 6041 if (iif) { 6042 struct net_device *dev; 6043 int flags = 0; 6044 6045 rcu_read_lock(); 6046 6047 dev = dev_get_by_index_rcu(net, iif); 6048 if (!dev) { 6049 rcu_read_unlock(); 6050 err = -ENODEV; 6051 goto errout; 6052 } 6053 6054 fl6.flowi6_iif = iif; 6055 6056 if (!ipv6_addr_any(&fl6.saddr)) 6057 flags |= RT6_LOOKUP_F_HAS_SADDR; 6058 6059 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags); 6060 6061 rcu_read_unlock(); 6062 } else { 6063 fl6.flowi6_oif = oif; 6064 6065 dst = ip6_route_output(net, NULL, &fl6); 6066 } 6067 6068 6069 rt = container_of(dst, struct rt6_info, dst); 6070 if (rt->dst.error) { 6071 err = rt->dst.error; 6072 ip6_rt_put(rt); 6073 goto errout; 6074 } 6075 6076 if (rt == net->ipv6.ip6_null_entry) { 6077 err = rt->dst.error; 6078 ip6_rt_put(rt); 6079 goto errout; 6080 } 6081 6082 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 6083 if (!skb) { 6084 ip6_rt_put(rt); 6085 err = -ENOBUFS; 6086 goto errout; 6087 } 6088 6089 skb_dst_set(skb, &rt->dst); 6090 6091 rcu_read_lock(); 6092 from = rcu_dereference(rt->from); 6093 if (from) { 6094 if (fibmatch) 6095 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, 6096 iif, RTM_NEWROUTE, 6097 NETLINK_CB(in_skb).portid, 6098 nlh->nlmsg_seq, 0); 6099 else 6100 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr, 6101 &fl6.saddr, iif, RTM_NEWROUTE, 6102 NETLINK_CB(in_skb).portid, 6103 nlh->nlmsg_seq, 0); 6104 } else { 6105 err = -ENETUNREACH; 6106 } 6107 rcu_read_unlock(); 6108 6109 if (err < 0) { 6110 kfree_skb(skb); 6111 goto errout; 6112 } 6113 6114 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); 6115 errout: 6116 return 
err; 6117 } 6118 6119 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, 6120 unsigned int nlm_flags) 6121 { 6122 struct sk_buff *skb; 6123 struct net *net = info->nl_net; 6124 u32 seq; 6125 int err; 6126 6127 err = -ENOBUFS; 6128 seq = info->nlh ? info->nlh->nlmsg_seq : 0; 6129 6130 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 6131 if (!skb) 6132 goto errout; 6133 6134 err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, 6135 event, info->portid, seq, nlm_flags); 6136 if (err < 0) { 6137 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ 6138 WARN_ON(err == -EMSGSIZE); 6139 kfree_skb(skb); 6140 goto errout; 6141 } 6142 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 6143 info->nlh, gfp_any()); 6144 return; 6145 errout: 6146 if (err < 0) 6147 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); 6148 } 6149 6150 void fib6_rt_update(struct net *net, struct fib6_info *rt, 6151 struct nl_info *info) 6152 { 6153 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; 6154 struct sk_buff *skb; 6155 int err = -ENOBUFS; 6156 6157 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 6158 if (!skb) 6159 goto errout; 6160 6161 err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, 6162 RTM_NEWROUTE, info->portid, seq, NLM_F_REPLACE); 6163 if (err < 0) { 6164 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ 6165 WARN_ON(err == -EMSGSIZE); 6166 kfree_skb(skb); 6167 goto errout; 6168 } 6169 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 6170 info->nlh, gfp_any()); 6171 return; 6172 errout: 6173 if (err < 0) 6174 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); 6175 } 6176 6177 void fib6_info_hw_flags_set(struct net *net, struct fib6_info *f6i, 6178 bool offload, bool trap, bool offload_failed) 6179 { 6180 struct sk_buff *skb; 6181 int err; 6182 6183 if (f6i->offload == offload && f6i->trap == trap && 6184 f6i->offload_failed == offload_failed) 6185 return; 6186 6187 f6i->offload = offload; 6188 f6i->trap = trap; 6189 6190 /* 2 means send notifications only 
if offload_failed was changed. */ 6191 if (net->ipv6.sysctl.fib_notify_on_flag_change == 2 && 6192 f6i->offload_failed == offload_failed) 6193 return; 6194 6195 f6i->offload_failed = offload_failed; 6196 6197 if (!rcu_access_pointer(f6i->fib6_node)) 6198 /* The route was removed from the tree, do not send 6199 * notification. 6200 */ 6201 return; 6202 6203 if (!net->ipv6.sysctl.fib_notify_on_flag_change) 6204 return; 6205 6206 skb = nlmsg_new(rt6_nlmsg_size(f6i), GFP_KERNEL); 6207 if (!skb) { 6208 err = -ENOBUFS; 6209 goto errout; 6210 } 6211 6212 err = rt6_fill_node(net, skb, f6i, NULL, NULL, NULL, 0, RTM_NEWROUTE, 0, 6213 0, 0); 6214 if (err < 0) { 6215 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ 6216 WARN_ON(err == -EMSGSIZE); 6217 kfree_skb(skb); 6218 goto errout; 6219 } 6220 6221 rtnl_notify(skb, net, 0, RTNLGRP_IPV6_ROUTE, NULL, GFP_KERNEL); 6222 return; 6223 6224 errout: 6225 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); 6226 } 6227 EXPORT_SYMBOL(fib6_info_hw_flags_set); 6228 6229 static int ip6_route_dev_notify(struct notifier_block *this, 6230 unsigned long event, void *ptr) 6231 { 6232 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 6233 struct net *net = dev_net(dev); 6234 6235 if (!(dev->flags & IFF_LOOPBACK)) 6236 return NOTIFY_OK; 6237 6238 if (event == NETDEV_REGISTER) { 6239 net->ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = dev; 6240 net->ipv6.ip6_null_entry->dst.dev = dev; 6241 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev); 6242 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 6243 net->ipv6.ip6_prohibit_entry->dst.dev = dev; 6244 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev); 6245 net->ipv6.ip6_blk_hole_entry->dst.dev = dev; 6246 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev); 6247 #endif 6248 } else if (event == NETDEV_UNREGISTER && 6249 dev->reg_state != NETREG_UNREGISTERED) { 6250 /* NETDEV_UNREGISTER could be fired for multiple times by 6251 * netdev_wait_allrefs(). Make sure we only call this once. 
6252 */ 6253 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev); 6254 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 6255 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev); 6256 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev); 6257 #endif 6258 } 6259 6260 return NOTIFY_OK; 6261 } 6262 6263 /* 6264 * /proc 6265 */ 6266 6267 #ifdef CONFIG_PROC_FS 6268 static int rt6_stats_seq_show(struct seq_file *seq, void *v) 6269 { 6270 struct net *net = (struct net *)seq->private; 6271 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n", 6272 net->ipv6.rt6_stats->fib_nodes, 6273 net->ipv6.rt6_stats->fib_route_nodes, 6274 atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc), 6275 net->ipv6.rt6_stats->fib_rt_entries, 6276 net->ipv6.rt6_stats->fib_rt_cache, 6277 dst_entries_get_slow(&net->ipv6.ip6_dst_ops), 6278 net->ipv6.rt6_stats->fib_discarded_routes); 6279 6280 return 0; 6281 } 6282 #endif /* CONFIG_PROC_FS */ 6283 6284 #ifdef CONFIG_SYSCTL 6285 6286 static int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, 6287 void *buffer, size_t *lenp, loff_t *ppos) 6288 { 6289 struct net *net; 6290 int delay; 6291 int ret; 6292 if (!write) 6293 return -EINVAL; 6294 6295 net = (struct net *)ctl->extra1; 6296 delay = net->ipv6.sysctl.flush_delay; 6297 ret = proc_dointvec(ctl, write, buffer, lenp, ppos); 6298 if (ret) 6299 return ret; 6300 6301 fib6_run_gc(delay <= 0 ? 
0 : (unsigned long)delay, net, delay > 0); 6302 return 0; 6303 } 6304 6305 static struct ctl_table ipv6_route_table_template[] = { 6306 { 6307 .procname = "max_size", 6308 .data = &init_net.ipv6.sysctl.ip6_rt_max_size, 6309 .maxlen = sizeof(int), 6310 .mode = 0644, 6311 .proc_handler = proc_dointvec, 6312 }, 6313 { 6314 .procname = "gc_thresh", 6315 .data = &ip6_dst_ops_template.gc_thresh, 6316 .maxlen = sizeof(int), 6317 .mode = 0644, 6318 .proc_handler = proc_dointvec, 6319 }, 6320 { 6321 .procname = "flush", 6322 .data = &init_net.ipv6.sysctl.flush_delay, 6323 .maxlen = sizeof(int), 6324 .mode = 0200, 6325 .proc_handler = ipv6_sysctl_rtcache_flush 6326 }, 6327 { 6328 .procname = "gc_min_interval", 6329 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 6330 .maxlen = sizeof(int), 6331 .mode = 0644, 6332 .proc_handler = proc_dointvec_jiffies, 6333 }, 6334 { 6335 .procname = "gc_timeout", 6336 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout, 6337 .maxlen = sizeof(int), 6338 .mode = 0644, 6339 .proc_handler = proc_dointvec_jiffies, 6340 }, 6341 { 6342 .procname = "gc_interval", 6343 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval, 6344 .maxlen = sizeof(int), 6345 .mode = 0644, 6346 .proc_handler = proc_dointvec_jiffies, 6347 }, 6348 { 6349 .procname = "gc_elasticity", 6350 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity, 6351 .maxlen = sizeof(int), 6352 .mode = 0644, 6353 .proc_handler = proc_dointvec, 6354 }, 6355 { 6356 .procname = "mtu_expires", 6357 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires, 6358 .maxlen = sizeof(int), 6359 .mode = 0644, 6360 .proc_handler = proc_dointvec_jiffies, 6361 }, 6362 { 6363 .procname = "min_adv_mss", 6364 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss, 6365 .maxlen = sizeof(int), 6366 .mode = 0644, 6367 .proc_handler = proc_dointvec, 6368 }, 6369 { 6370 .procname = "gc_min_interval_ms", 6371 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 6372 .maxlen = sizeof(int), 6373 .mode = 0644, 6374 .proc_handler = 
proc_dointvec_ms_jiffies, 6375 }, 6376 { 6377 .procname = "skip_notify_on_dev_down", 6378 .data = &init_net.ipv6.sysctl.skip_notify_on_dev_down, 6379 .maxlen = sizeof(int), 6380 .mode = 0644, 6381 .proc_handler = proc_dointvec_minmax, 6382 .extra1 = SYSCTL_ZERO, 6383 .extra2 = SYSCTL_ONE, 6384 }, 6385 { } 6386 }; 6387 6388 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net) 6389 { 6390 struct ctl_table *table; 6391 6392 table = kmemdup(ipv6_route_table_template, 6393 sizeof(ipv6_route_table_template), 6394 GFP_KERNEL); 6395 6396 if (table) { 6397 table[0].data = &net->ipv6.sysctl.ip6_rt_max_size; 6398 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh; 6399 table[2].data = &net->ipv6.sysctl.flush_delay; 6400 table[2].extra1 = net; 6401 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; 6402 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout; 6403 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval; 6404 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity; 6405 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires; 6406 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss; 6407 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; 6408 table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down; 6409 6410 /* Don't export sysctls to unprivileged users */ 6411 if (net->user_ns != &init_user_ns) 6412 table[1].procname = NULL; 6413 } 6414 6415 return table; 6416 } 6417 #endif 6418 6419 static int __net_init ip6_route_net_init(struct net *net) 6420 { 6421 int ret = -ENOMEM; 6422 6423 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template, 6424 sizeof(net->ipv6.ip6_dst_ops)); 6425 6426 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0) 6427 goto out_ip6_dst_ops; 6428 6429 net->ipv6.fib6_null_entry = fib6_info_alloc(GFP_KERNEL, true); 6430 if (!net->ipv6.fib6_null_entry) 6431 goto out_ip6_dst_entries; 6432 memcpy(net->ipv6.fib6_null_entry, &fib6_null_entry_template, 6433 sizeof(*net->ipv6.fib6_null_entry)); 6434 6435 net->ipv6.ip6_null_entry = 
kmemdup(&ip6_null_entry_template, 6436 sizeof(*net->ipv6.ip6_null_entry), 6437 GFP_KERNEL); 6438 if (!net->ipv6.ip6_null_entry) 6439 goto out_fib6_null_entry; 6440 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; 6441 dst_init_metrics(&net->ipv6.ip6_null_entry->dst, 6442 ip6_template_metrics, true); 6443 INIT_LIST_HEAD(&net->ipv6.ip6_null_entry->rt6i_uncached); 6444 6445 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 6446 net->ipv6.fib6_has_custom_rules = false; 6447 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template, 6448 sizeof(*net->ipv6.ip6_prohibit_entry), 6449 GFP_KERNEL); 6450 if (!net->ipv6.ip6_prohibit_entry) 6451 goto out_ip6_null_entry; 6452 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops; 6453 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst, 6454 ip6_template_metrics, true); 6455 INIT_LIST_HEAD(&net->ipv6.ip6_prohibit_entry->rt6i_uncached); 6456 6457 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template, 6458 sizeof(*net->ipv6.ip6_blk_hole_entry), 6459 GFP_KERNEL); 6460 if (!net->ipv6.ip6_blk_hole_entry) 6461 goto out_ip6_prohibit_entry; 6462 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops; 6463 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, 6464 ip6_template_metrics, true); 6465 INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->rt6i_uncached); 6466 #ifdef CONFIG_IPV6_SUBTREES 6467 net->ipv6.fib6_routes_require_src = 0; 6468 #endif 6469 #endif 6470 6471 net->ipv6.sysctl.flush_delay = 0; 6472 net->ipv6.sysctl.ip6_rt_max_size = 4096; 6473 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2; 6474 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ; 6475 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ; 6476 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9; 6477 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ; 6478 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; 6479 net->ipv6.sysctl.skip_notify_on_dev_down = 0; 6480 6481 net->ipv6.ip6_rt_gc_expire = 30*HZ; 6482 6483 ret = 0; 6484 out: 6485 return ret; 
6486 6487 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 6488 out_ip6_prohibit_entry: 6489 kfree(net->ipv6.ip6_prohibit_entry); 6490 out_ip6_null_entry: 6491 kfree(net->ipv6.ip6_null_entry); 6492 #endif 6493 out_fib6_null_entry: 6494 kfree(net->ipv6.fib6_null_entry); 6495 out_ip6_dst_entries: 6496 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 6497 out_ip6_dst_ops: 6498 goto out; 6499 } 6500 6501 static void __net_exit ip6_route_net_exit(struct net *net) 6502 { 6503 kfree(net->ipv6.fib6_null_entry); 6504 kfree(net->ipv6.ip6_null_entry); 6505 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 6506 kfree(net->ipv6.ip6_prohibit_entry); 6507 kfree(net->ipv6.ip6_blk_hole_entry); 6508 #endif 6509 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 6510 } 6511 6512 static int __net_init ip6_route_net_init_late(struct net *net) 6513 { 6514 #ifdef CONFIG_PROC_FS 6515 proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops, 6516 sizeof(struct ipv6_route_iter)); 6517 proc_create_net_single("rt6_stats", 0444, net->proc_net, 6518 rt6_stats_seq_show, NULL); 6519 #endif 6520 return 0; 6521 } 6522 6523 static void __net_exit ip6_route_net_exit_late(struct net *net) 6524 { 6525 #ifdef CONFIG_PROC_FS 6526 remove_proc_entry("ipv6_route", net->proc_net); 6527 remove_proc_entry("rt6_stats", net->proc_net); 6528 #endif 6529 } 6530 6531 static struct pernet_operations ip6_route_net_ops = { 6532 .init = ip6_route_net_init, 6533 .exit = ip6_route_net_exit, 6534 }; 6535 6536 static int __net_init ipv6_inetpeer_init(struct net *net) 6537 { 6538 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL); 6539 6540 if (!bp) 6541 return -ENOMEM; 6542 inet_peer_base_init(bp); 6543 net->ipv6.peers = bp; 6544 return 0; 6545 } 6546 6547 static void __net_exit ipv6_inetpeer_exit(struct net *net) 6548 { 6549 struct inet_peer_base *bp = net->ipv6.peers; 6550 6551 net->ipv6.peers = NULL; 6552 inetpeer_invalidate_tree(bp); 6553 kfree(bp); 6554 } 6555 6556 static struct pernet_operations ipv6_inetpeer_ops = { 6557 .init = 
ipv6_inetpeer_init, 6558 .exit = ipv6_inetpeer_exit, 6559 }; 6560 6561 static struct pernet_operations ip6_route_net_late_ops = { 6562 .init = ip6_route_net_init_late, 6563 .exit = ip6_route_net_exit_late, 6564 }; 6565 6566 static struct notifier_block ip6_route_dev_notifier = { 6567 .notifier_call = ip6_route_dev_notify, 6568 .priority = ADDRCONF_NOTIFY_PRIORITY - 10, 6569 }; 6570 6571 void __init ip6_route_init_special_entries(void) 6572 { 6573 /* Registering of the loopback is done before this portion of code, 6574 * the loopback reference in rt6_info will not be taken, do it 6575 * manually for init_net */ 6576 init_net.ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = init_net.loopback_dev; 6577 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; 6578 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 6579 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 6580 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev; 6581 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 6582 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev; 6583 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 6584 #endif 6585 } 6586 6587 #if IS_BUILTIN(CONFIG_IPV6) 6588 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 6589 DEFINE_BPF_ITER_FUNC(ipv6_route, struct bpf_iter_meta *meta, struct fib6_info *rt) 6590 6591 BTF_ID_LIST(btf_fib6_info_id) 6592 BTF_ID(struct, fib6_info) 6593 6594 static const struct bpf_iter_seq_info ipv6_route_seq_info = { 6595 .seq_ops = &ipv6_route_seq_ops, 6596 .init_seq_private = bpf_iter_init_seq_net, 6597 .fini_seq_private = bpf_iter_fini_seq_net, 6598 .seq_priv_size = sizeof(struct ipv6_route_iter), 6599 }; 6600 6601 static struct bpf_iter_reg ipv6_route_reg_info = { 6602 .target = "ipv6_route", 6603 .ctx_arg_info_size = 1, 6604 .ctx_arg_info = { 6605 { offsetof(struct bpf_iter__ipv6_route, rt), 6606 PTR_TO_BTF_ID_OR_NULL }, 6607 }, 6608 .seq_info 
= &ipv6_route_seq_info, 6609 }; 6610 6611 static int __init bpf_iter_register(void) 6612 { 6613 ipv6_route_reg_info.ctx_arg_info[0].btf_id = *btf_fib6_info_id; 6614 return bpf_iter_reg_target(&ipv6_route_reg_info); 6615 } 6616 6617 static void bpf_iter_unregister(void) 6618 { 6619 bpf_iter_unreg_target(&ipv6_route_reg_info); 6620 } 6621 #endif 6622 #endif 6623 6624 int __init ip6_route_init(void) 6625 { 6626 int ret; 6627 int cpu; 6628 6629 ret = -ENOMEM; 6630 ip6_dst_ops_template.kmem_cachep = 6631 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0, 6632 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); 6633 if (!ip6_dst_ops_template.kmem_cachep) 6634 goto out; 6635 6636 ret = dst_entries_init(&ip6_dst_blackhole_ops); 6637 if (ret) 6638 goto out_kmem_cache; 6639 6640 ret = register_pernet_subsys(&ipv6_inetpeer_ops); 6641 if (ret) 6642 goto out_dst_entries; 6643 6644 ret = register_pernet_subsys(&ip6_route_net_ops); 6645 if (ret) 6646 goto out_register_inetpeer; 6647 6648 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep; 6649 6650 ret = fib6_init(); 6651 if (ret) 6652 goto out_register_subsys; 6653 6654 ret = xfrm6_init(); 6655 if (ret) 6656 goto out_fib6_init; 6657 6658 ret = fib6_rules_init(); 6659 if (ret) 6660 goto xfrm6_init; 6661 6662 ret = register_pernet_subsys(&ip6_route_net_late_ops); 6663 if (ret) 6664 goto fib6_rules_init; 6665 6666 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE, 6667 inet6_rtm_newroute, NULL, 0); 6668 if (ret < 0) 6669 goto out_register_late_subsys; 6670 6671 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE, 6672 inet6_rtm_delroute, NULL, 0); 6673 if (ret < 0) 6674 goto out_register_late_subsys; 6675 6676 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, 6677 inet6_rtm_getroute, NULL, 6678 RTNL_FLAG_DOIT_UNLOCKED); 6679 if (ret < 0) 6680 goto out_register_late_subsys; 6681 6682 ret = register_netdevice_notifier(&ip6_route_dev_notifier); 6683 if (ret) 6684 goto 
out_register_late_subsys; 6685 6686 #if IS_BUILTIN(CONFIG_IPV6) 6687 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 6688 ret = bpf_iter_register(); 6689 if (ret) 6690 goto out_register_late_subsys; 6691 #endif 6692 #endif 6693 6694 for_each_possible_cpu(cpu) { 6695 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); 6696 6697 INIT_LIST_HEAD(&ul->head); 6698 spin_lock_init(&ul->lock); 6699 } 6700 6701 out: 6702 return ret; 6703 6704 out_register_late_subsys: 6705 rtnl_unregister_all(PF_INET6); 6706 unregister_pernet_subsys(&ip6_route_net_late_ops); 6707 fib6_rules_init: 6708 fib6_rules_cleanup(); 6709 xfrm6_init: 6710 xfrm6_fini(); 6711 out_fib6_init: 6712 fib6_gc_cleanup(); 6713 out_register_subsys: 6714 unregister_pernet_subsys(&ip6_route_net_ops); 6715 out_register_inetpeer: 6716 unregister_pernet_subsys(&ipv6_inetpeer_ops); 6717 out_dst_entries: 6718 dst_entries_destroy(&ip6_dst_blackhole_ops); 6719 out_kmem_cache: 6720 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); 6721 goto out; 6722 } 6723 6724 void ip6_route_cleanup(void) 6725 { 6726 #if IS_BUILTIN(CONFIG_IPV6) 6727 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 6728 bpf_iter_unregister(); 6729 #endif 6730 #endif 6731 unregister_netdevice_notifier(&ip6_route_dev_notifier); 6732 unregister_pernet_subsys(&ip6_route_net_late_ops); 6733 fib6_rules_cleanup(); 6734 xfrm6_fini(); 6735 fib6_gc_cleanup(); 6736 unregister_pernet_subsys(&ipv6_inetpeer_ops); 6737 unregister_pernet_subsys(&ip6_route_net_ops); 6738 dst_entries_destroy(&ip6_dst_blackhole_ops); 6739 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); 6740 } 6741