// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable. otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/rtnh.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <net/ip.h>
#include <linux/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

static int ip6_rt_type_to_error(u8 fib6_type);

#define CREATE_TRACE_POINTS
#include <trace/events/fib6.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
#undef CREATE_TRACE_POINTS

enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};

static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ip6_default_advmss(const struct dst_entry *dst);
static unsigned int ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void ip6_dst_destroy(struct dst_entry *);
static void ip6_dst_ifdown(struct dst_entry *,
			   struct net_device *dev, int how);
static int ip6_dst_gc(struct dst_ops *ops);

static int ip6_pkt_discard(struct sk_buff *skb);
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void ip6_link_failure(struct sk_buff *skb);
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu);
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
			    struct sk_buff *skb);
static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
			   int strict);
static size_t rt6_nlmsg_size(struct fib6_info *rt);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_add_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev,
					    unsigned int pref);
static struct fib6_info *rt6_get_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev);
#endif

struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);

void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}

static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}

static inline const void *choose_neigh_daddr(const struct in6_addr *p,
					     struct sk_buff *skb,
					     const void *daddr)
{
	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}
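/* Resolve the neighbour entry used by a route: prefer the gateway when
 * one is configured, else fall back to the packet's (or caller's)
 * destination address, creating the neigh entry if it does not exist.
 */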
struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
{
	struct neighbour *n;

	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	if (n)
		return n;

	n = neigh_create(&nd_tbl, daddr, dev);
	return IS_ERR(n) ? NULL : n;
}

static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
}

static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}

static struct dst_ops ip6_dst_ops_template = {
	.family			= AF_INET6,
	.gc			= ip6_dst_gc,
	.gc_thresh		= 1024,
	.check			= ip6_dst_check,
	.default_advmss		= ip6_default_advmss,
	.mtu			= ip6_mtu,
	.cow_metrics		= dst_cow_metrics_generic,
	.destroy		= ip6_dst_destroy,
	.ifdown			= ip6_dst_ifdown,
	.negative_advice	= ip6_negative_advice,
	.link_failure		= ip6_link_failure,
	.update_pmtu		= ip6_rt_update_pmtu,
	.redirect		= rt6_do_redirect,
	.local_out		= __ip6_local_out,
	.neigh_lookup		= ip6_dst_neigh_lookup,
	.confirm_neigh		= ip6_confirm_neigh,
};

static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

static struct dst_ops ip6_dst_blackhole_ops = {
	.family			= AF_INET6,
	.destroy		= ip6_dst_destroy,
	.check			= ip6_dst_check,
	.mtu			= ip6_blackhole_mtu,
	.default_advmss		= ip6_default_advmss,
	.update_pmtu		= ip6_rt_blackhole_update_pmtu,
	.redirect		= ip6_rt_blackhole_redirect,
	.cow_metrics		= dst_cow_metrics_generic,
	.neigh_lookup		= ip6_dst_neigh_lookup,
};

static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol	= RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= REFCOUNT_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};

static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
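/* With policy routing compiled in, fib rules may return prohibit or
 * blackhole verdicts; the templates below back the per-namespace dst
 * entries used for those verdicts.
 */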
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif

static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}

/* allocate dst with ip6_dst_ops */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);

static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	ip_dst_metrics_put(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	from = xchg((__force struct fib6_info **)&rt->from, NULL);
	fib6_info_release(from);
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}

static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}

void fib6_select_path(const struct net *net, struct fib6_result *res,
		      struct flowi6 *fl6, int oif, bool have_oif_match,
		      const struct sk_buff *skb, int strict)
{
	struct fib6_info *sibling, *next_sibling;
	struct fib6_info *match = res->f6i;

	if ((!match->fib6_nsiblings && !match->nh) || have_oif_match)
		goto out;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash &&
	    (!match->nh || nexthop_is_multipath(match->nh)))
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (unlikely(match->nh)) {
		nexthop_path_fib6_result(res, fl6->mp_hash);
		return;
	}

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh->fib_nh_upper_bound))
		goto out;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		const struct fib6_nh *nh = sibling->fib6_nh;
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

out:
	res->f6i = match;
	res->nh = match->fib6_nh;
}

/*
 *	Route lookup. rcu_read_lock() should be held.
 */
static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh,
			       const struct in6_addr *saddr, int oif, int flags)
{
	const struct net_device *dev;

	if (nh->fib_nh_flags & RTNH_F_DEAD)
		return false;

	dev = nh->fib_nh_dev;
	if (oif) {
		if (dev->ifindex == oif)
			return true;
	} else {
		if (ipv6_chk_addr(net, saddr, dev,
				  flags & RT6_LOOKUP_F_IFACE))
			return true;
	}

	return false;
}

static void rt6_device_match(struct net *net, struct fib6_result *res,
			     const struct in6_addr *saddr, int oif, int flags)
{
	struct fib6_info *f6i = res->f6i;
	struct fib6_info *spf6i;
	struct fib6_nh *nh;

	if (!oif && ipv6_addr_any(saddr)) {
		if (unlikely(f6i->nh)) {
			nh = nexthop_fib6_nh(f6i->nh);
			if (nexthop_is_blackhole(f6i->nh))
				goto out_blackhole;
		} else {
			nh = f6i->fib6_nh;
		}
		if (!(nh->fib_nh_flags & RTNH_F_DEAD))
			goto out;
	}

	for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) {
		nh = spf6i->fib6_nh;
		if (__rt6_device_match(net, nh, saddr, oif, flags)) {
			res->f6i = spf6i;
			goto out;
		}
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE) {
		res->f6i = net->ipv6.fib6_null_entry;
		nh = res->f6i->fib6_nh;
		goto out;
	}

	if (unlikely(f6i->nh)) {
		nh = nexthop_fib6_nh(f6i->nh);
		if (nexthop_is_blackhole(f6i->nh))
			goto out_blackhole;
	} else {
		nh = f6i->fib6_nh;
	}

	if (nh->fib_nh_flags & RTNH_F_DEAD) {
		res->f6i = net->ipv6.fib6_null_entry;
		nh = res->f6i->fib6_nh;
	}
out:
	res->nh = nh;
	res->fib6_type = res->f6i->fib6_type;
	res->fib6_flags = res->f6i->fib6_flags;
	return;

out_blackhole:
	res->fib6_flags |= RTF_REJECT;
	res->fib6_type = RTN_BLACKHOLE;
	res->nh = nh;
}

#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}

static void rt6_probe(struct fib6_nh *fib6_nh)
{
	struct __rt6_probe_work *work = NULL;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;
	struct inet6_dev *idev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!fib6_nh->fib_nh_gw_family)
		return;

	nh_gw = &fib6_nh->fib_nh_gw6;
	dev = fib6_nh->fib_nh_dev;
	rcu_read_lock_bh();
	idev = __in6_dev_get(dev);
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else if (time_after(jiffies, fib6_nh->last_probe +
				       idev->cnf.rtr_probe_interval)) {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		fib6_nh->last_probe = jiffies;
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct fib6_nh *fib6_nh)
{
}
#endif
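/* rt6_check_neigh() below maps neighbour (NUD) state onto a scoring
 * verdict for default router selection: a valid entry succeeds; a
 * non-valid one fails hard (or, with CONFIG_IPV6_ROUTER_PREF, stays
 * usable until NUD_FAILED, which merely requests a probe); a missing
 * entry succeeds under router preference and requests round-robin
 * fallback without it.
 */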
/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev,
					  &fib6_nh->fib_nh_gw6);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}

static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
			   int strict)
{
	int m = 0;

	if (!oif || nh->fib_nh_dev->ifindex == oif)
		m = 2;

	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2;
#endif
	if ((strict & RT6_LOOKUP_F_REACHABLE) &&
	    !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) {
		int n = rt6_check_neigh(nh);
		if (n < 0)
			return n;
	}
	return m;
}

static bool find_match(struct fib6_nh *nh, u32 fib6_flags,
		       int oif, int strict, int *mpri, bool *do_rr)
{
	bool match_do_rr = false;
	bool rc = false;
	int m;

	if (nh->fib_nh_flags & RTNH_F_DEAD)
		goto out;

	if (ip6_ignore_linkdown(nh->fib_nh_dev) &&
	    nh->fib_nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	m = rt6_score_route(nh, fib6_flags, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(nh);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		rc = true;
	}
out:
	return rc;
}

static void __find_rr_leaf(struct fib6_info *f6i_start,
			   struct fib6_info *nomatch, u32 metric,
			   struct fib6_result *res, struct fib6_info **cont,
			   int oif, int strict, bool *do_rr, int *mpri)
{
	struct fib6_info *f6i;

	for (f6i = f6i_start;
	     f6i && f6i != nomatch;
	     f6i = rcu_dereference(f6i->fib6_next)) {
		struct fib6_nh *nh;

		if (cont && f6i->fib6_metric != metric) {
			*cont = f6i;
			return;
		}

		if (fib6_check_expired(f6i))
			continue;

		nh = f6i->fib6_nh;
		if (find_match(nh, f6i->fib6_flags, oif, strict, mpri, do_rr)) {
			res->f6i = f6i;
			res->nh = nh;
			res->fib6_flags = f6i->fib6_flags;
			res->fib6_type = f6i->fib6_type;
		}
	}
}

static void find_rr_leaf(struct fib6_node *fn, struct fib6_info *leaf,
			 struct fib6_info *rr_head, int oif, int strict,
			 bool *do_rr, struct fib6_result *res)
{
	u32 metric = rr_head->fib6_metric;
	struct fib6_info *cont = NULL;
	int mpri = -1;

	__find_rr_leaf(rr_head, NULL, metric, res, &cont,
		       oif, strict, do_rr, &mpri);

	__find_rr_leaf(leaf, rr_head, metric, res, &cont,
		       oif, strict, do_rr, &mpri);

	if (res->f6i || !cont)
		return;

	__find_rr_leaf(cont, NULL, metric, res, NULL,
		       oif, strict, do_rr, &mpri);
}
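/* Round-robin selection: fn->rr_ptr remembers where the previous
 * lookup stopped; when the chosen route asked for round-robin
 * (RT6_NUD_FAIL_DO_RR), rr_ptr is advanced so that equal-metric
 * routers take turns.
 */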
static void rt6_select(struct net *net, struct fib6_node *fn, int oif,
		       struct fib6_result *res, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *rt0;
	bool do_rr = false;
	int key_plen;

	/* make sure this function or its helpers sets f6i */
	res->f6i = NULL;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		goto out;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		goto out;

	find_rr_leaf(fn, leaf, rt0, oif, strict, &do_rr, res);
	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

out:
	if (!res->f6i) {
		res->f6i = net->ipv6.fib6_null_entry;
		res->nh = res->f6i->fib6_nh;
		res->fib6_flags = res->f6i->fib6_flags;
		res->fib6_type = res->f6i->fib6_type;
	}
}

static bool rt6_is_gw_or_nonexthop(const struct fib6_result *res)
{
	return (res->f6i->fib6_flags & RTF_NONEXTHOP) ||
	       res->nh->fib_nh_gw_family;
}

#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info))
		return -EINVAL;

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2)
			return -EINVAL;
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1)
			return -EINVAL;
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
#endif

/*
 *	Misc support functions
 */

/* called with rcu_lock held */
static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res)
{
	struct net_device *dev = res->nh->fib_nh_dev;

	if (res->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&res->f6i->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}

static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};

static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}

static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
{
	unsigned short flags = 0;

	if (rt->dst_nocount)
		flags |= DST_NOCOUNT;
	if (rt->dst_nopolicy)
		flags |= DST_NOPOLICY;
	if (rt->dst_host)
		flags |= DST_HOST;

	return flags;
}

static void ip6_rt_init_dst_reject(struct rt6_info *rt, u8 fib6_type)
{
	rt->dst.error = ip6_rt_type_to_error(fib6_type);

	switch (fib6_type) {
	case RTN_BLACKHOLE:
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}

static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res)
{
	struct fib6_info *f6i = res->f6i;

	if (res->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, res->fib6_type);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	if (res->fib6_type == RTN_LOCAL || res->fib6_type == RTN_ANYCAST) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&f6i->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (res->nh->fib_nh_lws) {
		rt->dst.lwtstate = lwtstate_get(res->nh->fib_nh_lws);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}

/* Caller must already hold reference to @from */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	rcu_assign_pointer(rt->from, from);
	ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
}
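/* rt->from links a cached clone back to the fib entry it was derived
 * from and holds a reference on it; ip6_dst_destroy() drops that
 * reference via xchg() + fib6_info_release().
 */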
/* Caller must already hold reference to f6i in result */
static void ip6_rt_copy_init(struct rt6_info *rt, const struct fib6_result *res)
{
	const struct fib6_nh *nh = res->nh;
	const struct net_device *dev = nh->fib_nh_dev;
	struct fib6_info *f6i = res->f6i;

	ip6_rt_init_dst(rt, res);

	rt->rt6i_dst = f6i->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_flags = res->fib6_flags;
	if (nh->fib_nh_gw_family) {
		rt->rt6i_gateway = nh->fib_nh_gw6;
		rt->rt6i_flags |= RTF_GATEWAY;
	}
	rt6_set_from(rt, f6i);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = f6i->fib6_src;
#endif
}

static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}

static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (net) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}

/* called with rcu_lock held */
static struct rt6_info *ip6_create_rt_rcu(const struct fib6_result *res)
{
	struct net_device *dev = res->nh->fib_nh_dev;
	struct fib6_info *f6i = res->f6i;
	unsigned short flags;
	struct rt6_info *nrt;

	if (!fib6_info_hold_safe(f6i))
		goto fallback;

	flags = fib6_info_dst_flags(f6i);
	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (!nrt) {
		fib6_info_release(f6i);
		goto fallback;
	}

	ip6_rt_copy_init(nrt, res);
	return nrt;

fallback:
	nrt = dev_net(dev)->ipv6.ip6_null_entry;
	dst_hold(&nrt->dst);
	return nrt;
}

static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_result res = {};
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	res.f6i = rcu_dereference(fn->leaf);
	if (!res.f6i)
		res.f6i = net->ipv6.fib6_null_entry;
	else
		rt6_device_match(net, &res, &fl6->saddr, fl6->flowi6_oif,
				 flags);

	if (res.f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;

		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
		goto out;
	} else if (res.fib6_flags & RTF_REJECT) {
		goto do_create;
	}

	fib6_select_path(net, &res, fl6, fl6->flowi6_oif,
			 fl6->flowi6_oif != 0, skb, flags);

	/* Search through exception table */
	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);
	} else {
do_create:
		rt = ip6_create_rt_rcu(&res);
	}

out:
	trace_fib6_table_lookup(net, &res, table, fl6);

	rcu_read_unlock();

	return rt;
}

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
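/* Convenience wrapper around fib6_rule_lookup() that hides error dsts
 * from the caller: on any lookup error the reference is dropped and
 * NULL is returned instead of a dst with dst->error set.
 */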
struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);

/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason,
 * the route is released.
 * The caller must hold a reference on the route before calling.
 */
static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = { .nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}

static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct fib6_info *f6i = res->f6i;
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (!fib6_info_hold_safe(f6i))
		return NULL;

	dev = ip6_rt_get_dev_rcu(res);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt) {
		fib6_info_release(f6i);
		return NULL;
	}

	ip6_rt_copy_init(rt, res);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(res)) {
		if (f6i->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&f6i->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}

static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res)
{
	struct fib6_info *f6i = res->f6i;
	unsigned short flags = fib6_info_dst_flags(f6i);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	if (!fib6_info_hold_safe(f6i))
		return NULL;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(res);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt) {
		fib6_info_release(f6i);
		return NULL;
	}
	ip6_rt_copy_init(pcpu_rt, res);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res)
{
	struct rt6_info *pcpu_rt;

	pcpu_rt = this_cpu_read(*res->nh->rt6i_pcpu);

	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt);

	return pcpu_rt;
}
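/* Install a fresh clone in this CPU's cache slot. Callers run with
 * BHs disabled and only after rt6_get_pcpu_route() returned NULL for
 * this CPU, so the slot must still be empty - hence the BUG_ON() on
 * the cmpxchg() result.
 */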
static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    const struct fib6_result *res)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(res);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(res->nh->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	if (res->f6i->fib6_destroying) {
		struct fib6_info *from;

		from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL);
		fib6_info_release(from);
	}

	return pcpu_rt;
}

/* exception hash table implementation
 */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct fib6_info *from;
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	net->ipv6.rt6_stats->fib_rt_cache--;

	/* purge completely the exception to allow releasing the held resources:
	 * some [sk] cache may keep the dst around for unlimited time
	 */
	from = xchg((__force struct fib6_info **)&rt6_ex->rt6i->from, NULL);
	fib6_info_release(from);
	dst_dev_put(&rt6_ex->rt6i->dst);

	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
}

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}

static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

static unsigned int fib6_mtu(const struct fib6_result *res)
{
	const struct fib6_nh *nh = res->nh;
	unsigned int mtu;

	if (res->f6i->fib6_pmtu) {
		mtu = res->f6i->fib6_pmtu;
	} else {
		struct net_device *dev = nh->fib_nh_dev;
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
}

#define FIB6_EXCEPTION_BUCKET_FLUSHED  0x1UL

/* used when the flushed bit is not relevant, only access to the bucket
 * (ie., all bucket users except rt6_insert_exception);
 *
 * called under rcu lock; sometimes called with rt6_exception_lock held
 */
static
struct rt6_exception_bucket *fib6_nh_get_excptn_bucket(const struct fib6_nh *nh,
						       spinlock_t *lock)
{
	struct rt6_exception_bucket *bucket;

	if (lock)
		bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
						   lockdep_is_held(lock));
	else
		bucket = rcu_dereference(nh->rt6i_exception_bucket);

	/* remove bucket flushed bit if set */
	if (bucket) {
		unsigned long p = (unsigned long)bucket;

		p &= ~FIB6_EXCEPTION_BUCKET_FLUSHED;
		bucket = (struct rt6_exception_bucket *)p;
	}

	return bucket;
}

static bool fib6_nh_excptn_bucket_flushed(struct rt6_exception_bucket *bucket)
{
	unsigned long p = (unsigned long)bucket;

	return !!(p & FIB6_EXCEPTION_BUCKET_FLUSHED);
}

/* called with rt6_exception_lock held */
static void fib6_nh_excptn_bucket_set_flushed(struct fib6_nh *nh,
					      spinlock_t *lock)
{
	struct rt6_exception_bucket *bucket;
	unsigned long p;

	bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
					   lockdep_is_held(lock));

	p = (unsigned long)bucket;
	p |= FIB6_EXCEPTION_BUCKET_FLUSHED;
	bucket = (struct rt6_exception_bucket *)p;
	rcu_assign_pointer(nh->rt6i_exception_bucket, bucket);
}
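/* Insert an RTF_CACHE clone (e.g. created on PMTU update or redirect)
 * into the nexthop's exception table, replacing any entry with the
 * same (daddr[, saddr]) key and evicting the oldest entry once a
 * bucket grows past FIB6_MAX_DEPTH.
 */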
static int rt6_insert_exception(struct rt6_info *nrt,
				const struct fib6_result *res)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct fib6_info *f6i = res->f6i;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_nh *nh = res->nh;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(nh->rt6i_exception_bucket, bucket);
	} else if (fib6_nh_excptn_bucket_flushed(bucket)) {
		err = -EINVAL;
		goto out;
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* fib6_src.plen != 0 indicates f6i is in subtree
	 * and exception table is indexed by a hash of
	 * both fib6_dst and fib6_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only fib6_dst.
	 */
	if (f6i->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif
	/* rt6_mtu_change() might lower mtu on f6i.
	 * Only insert this exception route if its mtu
	 * is less than f6i's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(res)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&f6i->fib6_table->tb6_lock);
		fib6_update_sernum(net, f6i);
		spin_unlock_bh(&f6i->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}

static void fib6_nh_flush_exceptions(struct fib6_nh *nh, struct fib6_info *from)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);

	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
	if (!bucket)
		goto out;

	/* Prevent rt6_insert_exception() from recreating the bucket list */
	if (!from)
		fib6_nh_excptn_bucket_set_flushed(nh, &rt6_exception_lock);

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) {
			if (!from ||
			    rcu_access_pointer(rt6_ex->rt6i->from) == from)
				rt6_remove_exception(bucket, rt6_ex);
		}
		WARN_ON_ONCE(!from && bucket->depth);
		bucket++;
	}
out:
	spin_unlock_bh(&rt6_exception_lock);
}

void rt6_flush_exceptions(struct fib6_info *f6i)
{
	fib6_nh_flush_exceptions(f6i->fib6_nh, f6i);
}
/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	const struct in6_addr *src_key = NULL;
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct rt6_info *ret = NULL;

#ifdef CONFIG_IPV6_SUBTREES
	/* fib6_src.plen != 0 indicates f6i is in subtree
	 * and exception table is indexed by a hash of
	 * both fib6_dst and fib6_src.
	 * However, the src addr used to create the hash
	 * might not be exactly the passed in saddr which
	 * is a /128 addr from the flow.
	 * So we need to use f6i->fib6_src to redo lookup
	 * if the passed in saddr does not find anything.
	 * (See the logic in ip6_rt_cache_alloc() on how
	 * rt->rt6i_src is updated.)
	 */
	if (res->f6i->fib6_src.plen)
		src_key = saddr;
find_ex:
#endif
	bucket = fib6_nh_get_excptn_bucket(res->nh, NULL);
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		ret = rt6_ex->rt6i;

#ifdef CONFIG_IPV6_SUBTREES
	/* Use fib6_src as src_key and redo lookup */
	if (!ret && src_key && src_key != &res->f6i->fib6_src.addr) {
		src_key = &res->f6i->fib6_src.addr;
		goto find_ex;
	}
#endif

	return ret;
}

/* Remove the passed in cached rt from the hash table that contains it */
static int fib6_nh_remove_exception(const struct fib6_nh *nh, int plen,
				    const struct rt6_info *rt)
{
	const struct in6_addr *src_key = NULL;
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int err;

	if (!rcu_access_pointer(nh->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}

static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);
	if (!from || !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	return fib6_nh_remove_exception(from->fib6_nh,
					from->fib6_src.plen, rt);
}

/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void fib6_nh_update_exception(const struct fib6_nh *nh, int plen,
				     const struct rt6_info *rt)
{
	const struct in6_addr *src_key = NULL;
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;

	bucket = fib6_nh_get_excptn_bucket(nh, NULL);
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, &rt->rt6i_dst.addr, src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;
}

static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct fib6_info *from;

	rcu_read_lock();

	from = rcu_dereference(rt->from);
	if (!from || !(rt->rt6i_flags & RTF_CACHE))
		goto unlock;

	fib6_nh_update_exception(from->fib6_nh, from->fib6_src.plen, rt);
unlock:
	rcu_read_unlock();
}

static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
{
	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
	 *
	 * If the new MTU is higher, and the route PMTU is equal to the local
	 * MTU, this means the old MTU is the lowest in the path, so allow
	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
	 * handle this.
	 */

	if (dst_mtu(&rt->dst) >= mtu)
		return true;

	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
		return true;

	return false;
}

static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       const struct fib6_nh *nh, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}

#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

static void fib6_nh_exceptions_clean_tohost(const struct fib6_nh *nh,
					    const struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(nh->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}

static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non-gateway exceptions
	 * even if others still hold references to them, so that on the next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. PMTU-generated ones - are pruned when
	 * expired, independently of their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}

static void fib6_nh_age_exceptions(const struct fib6_nh *nh,
				   struct fib6_gc_args *gc_args,
				   unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(nh->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}

void rt6_age_exceptions(struct fib6_info *f6i,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	fib6_nh_age_exceptions(f6i->fib6_nh, gc_args, now);
}

/* must be called with rcu lock held */
int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif,
		      struct flowi6 *fl6, struct fib6_result *res, int strict)
{
	struct fib6_node *fn, *saved_fn;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	rt6_select(net, fn, oif, res, strict);
	if (res->f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, res, table, fl6);

	return 0;
}
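/* Full policy lookup: find the fib entry, pick a multipath nexthop and
 * return a dst for it - a cached exception if one matches, an uncached
 * RTF_CACHE clone for FLOWI_FLAG_KNOWN_NH lookups without a gateway,
 * or this CPU's cached copy otherwise.
 */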
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_result res = {};
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	fib6_table_lookup(net, table, oif, fl6, &res, strict);
	if (res.f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		return rt;
	}

	fib6_select_path(net, &res, fl6, oif, false, skb, strict);

	/* Search through exception table */
	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !res.nh->fib_nh_gw_family)) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		return uncached_rt;
	} else {
		/* Get a percpu copy */
		struct rt6_info *pcpu_rt;

		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(&res);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, &res);

		local_bh_enable();
		rcu_read_unlock();

		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);

static struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
}

struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6,
					 const struct sk_buff *skb,
					 int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
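/* For ICMPv6 errors, hash on the addresses of the offending (inner)
 * packet rather than the error itself, so the error is routed along
 * the same multipath leg as the flow it reports on.
 */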
key_iph->daddr; 2129 keys->tags.flow_label = ip6_flowlabel(key_iph); 2130 keys->basic.ip_proto = key_iph->nexthdr; 2131 } 2132 } 2133 2134 /* if skb is set it will be used and fl6 can be NULL */ 2135 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, 2136 const struct sk_buff *skb, struct flow_keys *flkeys) 2137 { 2138 struct flow_keys hash_keys; 2139 u32 mhash; 2140 2141 switch (ip6_multipath_hash_policy(net)) { 2142 case 0: 2143 memset(&hash_keys, 0, sizeof(hash_keys)); 2144 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2145 if (skb) { 2146 ip6_multipath_l3_keys(skb, &hash_keys, flkeys); 2147 } else { 2148 hash_keys.addrs.v6addrs.src = fl6->saddr; 2149 hash_keys.addrs.v6addrs.dst = fl6->daddr; 2150 hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); 2151 hash_keys.basic.ip_proto = fl6->flowi6_proto; 2152 } 2153 break; 2154 case 1: 2155 if (skb) { 2156 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP; 2157 struct flow_keys keys; 2158 2159 /* short-circuit if we already have L4 hash present */ 2160 if (skb->l4_hash) 2161 return skb_get_hash_raw(skb) >> 1; 2162 2163 memset(&hash_keys, 0, sizeof(hash_keys)); 2164 2165 if (!flkeys) { 2166 skb_flow_dissect_flow_keys(skb, &keys, flag); 2167 flkeys = &keys; 2168 } 2169 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2170 hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src; 2171 hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst; 2172 hash_keys.ports.src = flkeys->ports.src; 2173 hash_keys.ports.dst = flkeys->ports.dst; 2174 hash_keys.basic.ip_proto = flkeys->basic.ip_proto; 2175 } else { 2176 memset(&hash_keys, 0, sizeof(hash_keys)); 2177 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2178 hash_keys.addrs.v6addrs.src = fl6->saddr; 2179 hash_keys.addrs.v6addrs.dst = fl6->daddr; 2180 hash_keys.ports.src = fl6->fl6_sport; 2181 hash_keys.ports.dst = fl6->fl6_dport; 2182 hash_keys.basic.ip_proto = fl6->flowi6_proto; 2183 } 2184 break; 2185 } 2186 mhash = flow_hash_from_keys(&hash_keys); 2187 2188 return mhash >> 1; 2189 } 2190 2191 void ip6_route_input(struct sk_buff *skb) 2192 { 2193 const struct ipv6hdr *iph = ipv6_hdr(skb); 2194 struct net *net = dev_net(skb->dev); 2195 int flags = RT6_LOOKUP_F_HAS_SADDR; 2196 struct ip_tunnel_info *tun_info; 2197 struct flowi6 fl6 = { 2198 .flowi6_iif = skb->dev->ifindex, 2199 .daddr = iph->daddr, 2200 .saddr = iph->saddr, 2201 .flowlabel = ip6_flowinfo(iph), 2202 .flowi6_mark = skb->mark, 2203 .flowi6_proto = iph->nexthdr, 2204 }; 2205 struct flow_keys *flkeys = NULL, _flkeys; 2206 2207 tun_info = skb_tunnel_info(skb); 2208 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX)) 2209 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id; 2210 2211 if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys)) 2212 flkeys = &_flkeys; 2213 2214 if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6)) 2215 fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys); 2216 skb_dst_drop(skb); 2217 skb_dst_set(skb, 2218 ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags)); 2219 } 2220 2221 static struct rt6_info *ip6_pol_route_output(struct net *net, 2222 struct fib6_table *table, 2223 struct flowi6 *fl6, 2224 const struct sk_buff *skb, 2225 int flags) 2226 { 2227 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags); 2228 } 2229 2230 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, 2231 struct flowi6 *fl6, int flags) 2232 { 2233 bool any_src; 2234 2235 if (ipv6_addr_type(&fl6->daddr) & 2236 
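/* Editorial note (illustrative, not part of the original source):
 * the policy switch in rt6_multipath_hash() above is driven by the
 * net.ipv6.fib_multipath_hash_policy sysctl: 0 hashes on L3 fields
 * only (addresses, flow label, next header), 1 adds the L4 ports,
 * e.g.:
 *
 *	# sysctl -w net.ipv6.fib_multipath_hash_policy=1
 *
 * The final "mhash >> 1" keeps the result within the 31-bit range
 * that the per-nexthop upper bounds (see rt6_upper_bound_set()
 * below) are computed against.
 */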
(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) { 2237 struct dst_entry *dst; 2238 2239 dst = l3mdev_link_scope_lookup(net, fl6); 2240 if (dst) 2241 return dst; 2242 } 2243 2244 fl6->flowi6_iif = LOOPBACK_IFINDEX; 2245 2246 any_src = ipv6_addr_any(&fl6->saddr); 2247 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) || 2248 (fl6->flowi6_oif && any_src)) 2249 flags |= RT6_LOOKUP_F_IFACE; 2250 2251 if (!any_src) 2252 flags |= RT6_LOOKUP_F_HAS_SADDR; 2253 else if (sk) 2254 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs); 2255 2256 return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output); 2257 } 2258 EXPORT_SYMBOL_GPL(ip6_route_output_flags); 2259 2260 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) 2261 { 2262 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig; 2263 struct net_device *loopback_dev = net->loopback_dev; 2264 struct dst_entry *new = NULL; 2265 2266 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1, 2267 DST_OBSOLETE_DEAD, 0); 2268 if (rt) { 2269 rt6_info_init(rt); 2270 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); 2271 2272 new = &rt->dst; 2273 new->__use = 1; 2274 new->input = dst_discard; 2275 new->output = dst_discard_out; 2276 2277 dst_copy_metrics(new, &ort->dst); 2278 2279 rt->rt6i_idev = in6_dev_get(loopback_dev); 2280 rt->rt6i_gateway = ort->rt6i_gateway; 2281 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU; 2282 2283 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); 2284 #ifdef CONFIG_IPV6_SUBTREES 2285 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); 2286 #endif 2287 } 2288 2289 dst_release(dst_orig); 2290 return new ? new : ERR_PTR(-ENOMEM); 2291 } 2292 2293 /* 2294 * Destination cache support functions 2295 */ 2296 2297 static bool fib6_check(struct fib6_info *f6i, u32 cookie) 2298 { 2299 u32 rt_cookie = 0; 2300 2301 if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie) 2302 return false; 2303 2304 if (fib6_check_expired(f6i)) 2305 return false; 2306 2307 return true; 2308 } 2309 2310 static struct dst_entry *rt6_check(struct rt6_info *rt, 2311 struct fib6_info *from, 2312 u32 cookie) 2313 { 2314 u32 rt_cookie = 0; 2315 2316 if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) || 2317 rt_cookie != cookie) 2318 return NULL; 2319 2320 if (rt6_check_expired(rt)) 2321 return NULL; 2322 2323 return &rt->dst; 2324 } 2325 2326 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, 2327 struct fib6_info *from, 2328 u32 cookie) 2329 { 2330 if (!__rt6_check_expired(rt) && 2331 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && 2332 fib6_check(from, cookie)) 2333 return &rt->dst; 2334 else 2335 return NULL; 2336 } 2337 2338 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) 2339 { 2340 struct dst_entry *dst_ret; 2341 struct fib6_info *from; 2342 struct rt6_info *rt; 2343 2344 rt = container_of(dst, struct rt6_info, dst); 2345 2346 rcu_read_lock(); 2347 2348 /* All IPV6 dsts are created with ->obsolete set to the value 2349 * DST_OBSOLETE_FORCE_CHK which forces validation calls down 2350 * into this function always. 
2351 */ 2352 2353 from = rcu_dereference(rt->from); 2354 2355 if (from && (rt->rt6i_flags & RTF_PCPU || 2356 unlikely(!list_empty(&rt->rt6i_uncached)))) 2357 dst_ret = rt6_dst_from_check(rt, from, cookie); 2358 else 2359 dst_ret = rt6_check(rt, from, cookie); 2360 2361 rcu_read_unlock(); 2362 2363 return dst_ret; 2364 } 2365 2366 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) 2367 { 2368 struct rt6_info *rt = (struct rt6_info *) dst; 2369 2370 if (rt) { 2371 if (rt->rt6i_flags & RTF_CACHE) { 2372 rcu_read_lock(); 2373 if (rt6_check_expired(rt)) { 2374 rt6_remove_exception_rt(rt); 2375 dst = NULL; 2376 } 2377 rcu_read_unlock(); 2378 } else { 2379 dst_release(dst); 2380 dst = NULL; 2381 } 2382 } 2383 return dst; 2384 } 2385 2386 static void ip6_link_failure(struct sk_buff *skb) 2387 { 2388 struct rt6_info *rt; 2389 2390 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); 2391 2392 rt = (struct rt6_info *) skb_dst(skb); 2393 if (rt) { 2394 rcu_read_lock(); 2395 if (rt->rt6i_flags & RTF_CACHE) { 2396 rt6_remove_exception_rt(rt); 2397 } else { 2398 struct fib6_info *from; 2399 struct fib6_node *fn; 2400 2401 from = rcu_dereference(rt->from); 2402 if (from) { 2403 fn = rcu_dereference(from->fib6_node); 2404 if (fn && (rt->rt6i_flags & RTF_DEFAULT)) 2405 fn->fn_sernum = -1; 2406 } 2407 } 2408 rcu_read_unlock(); 2409 } 2410 } 2411 2412 static void rt6_update_expires(struct rt6_info *rt0, int timeout) 2413 { 2414 if (!(rt0->rt6i_flags & RTF_EXPIRES)) { 2415 struct fib6_info *from; 2416 2417 rcu_read_lock(); 2418 from = rcu_dereference(rt0->from); 2419 if (from) 2420 rt0->dst.expires = from->expires; 2421 rcu_read_unlock(); 2422 } 2423 2424 dst_set_expires(&rt0->dst, timeout); 2425 rt0->rt6i_flags |= RTF_EXPIRES; 2426 } 2427 2428 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) 2429 { 2430 struct net *net = dev_net(rt->dst.dev); 2431 2432 dst_metric_set(&rt->dst, RTAX_MTU, mtu); 2433 rt->rt6i_flags |= RTF_MODIFIED; 2434 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires); 2435 } 2436 2437 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) 2438 { 2439 return !(rt->rt6i_flags & RTF_CACHE) && 2440 (rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from)); 2441 } 2442 2443 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, 2444 const struct ipv6hdr *iph, u32 mtu) 2445 { 2446 const struct in6_addr *daddr, *saddr; 2447 struct rt6_info *rt6 = (struct rt6_info *)dst; 2448 2449 if (dst_metric_locked(dst, RTAX_MTU)) 2450 return; 2451 2452 if (iph) { 2453 daddr = &iph->daddr; 2454 saddr = &iph->saddr; 2455 } else if (sk) { 2456 daddr = &sk->sk_v6_daddr; 2457 saddr = &inet6_sk(sk)->saddr; 2458 } else { 2459 daddr = NULL; 2460 saddr = NULL; 2461 } 2462 dst_confirm_neigh(dst, daddr); 2463 mtu = max_t(u32, mtu, IPV6_MIN_MTU); 2464 if (mtu >= dst_mtu(dst)) 2465 return; 2466 2467 if (!rt6_cache_allowed_for_pmtu(rt6)) { 2468 rt6_do_update_pmtu(rt6, mtu); 2469 /* update rt6_ex->stamp for cache */ 2470 if (rt6->rt6i_flags & RTF_CACHE) 2471 rt6_update_exception_stamp_rt(rt6); 2472 } else if (daddr) { 2473 struct fib6_result res = {}; 2474 struct rt6_info *nrt6; 2475 2476 rcu_read_lock(); 2477 res.f6i = rcu_dereference(rt6->from); 2478 if (!res.f6i) { 2479 rcu_read_unlock(); 2480 return; 2481 } 2482 res.nh = res.f6i->fib6_nh; 2483 res.fib6_flags = res.f6i->fib6_flags; 2484 res.fib6_type = res.f6i->fib6_type; 2485 2486 nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr); 2487 if (nrt6) { 2488 rt6_do_update_pmtu(nrt6, mtu); 2489 if 
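/* Editorial note (illustrative, not part of the original source):
 * the clone allocated just above is a per-destination RTF_CACHE
 * entry; inserting it into the nexthop exception table below is what
 * makes the learned PMTU stick for ip6_rt_mtu_expires seconds. For
 * example, after an ICMPv6 Packet Too Big reporting MTU 1400 for
 * 2001:db8::1 (address illustrative), later lookups resolve via
 * rt6_find_cached_rt() to a dst whose RTAX_MTU metric is 1400 until
 * the exception expires or is aged out.
 */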
(rt6_insert_exception(nrt6, &res)) 2490 dst_release_immediate(&nrt6->dst); 2491 } 2492 rcu_read_unlock(); 2493 } 2494 } 2495 2496 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, 2497 struct sk_buff *skb, u32 mtu) 2498 { 2499 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu); 2500 } 2501 2502 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, 2503 int oif, u32 mark, kuid_t uid) 2504 { 2505 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2506 struct dst_entry *dst; 2507 struct flowi6 fl6 = { 2508 .flowi6_oif = oif, 2509 .flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark), 2510 .daddr = iph->daddr, 2511 .saddr = iph->saddr, 2512 .flowlabel = ip6_flowinfo(iph), 2513 .flowi6_uid = uid, 2514 }; 2515 2516 dst = ip6_route_output(net, NULL, &fl6); 2517 if (!dst->error) 2518 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu)); 2519 dst_release(dst); 2520 } 2521 EXPORT_SYMBOL_GPL(ip6_update_pmtu); 2522 2523 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) 2524 { 2525 int oif = sk->sk_bound_dev_if; 2526 struct dst_entry *dst; 2527 2528 if (!oif && skb->dev) 2529 oif = l3mdev_master_ifindex(skb->dev); 2530 2531 ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid); 2532 2533 dst = __sk_dst_get(sk); 2534 if (!dst || !dst->obsolete || 2535 dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) 2536 return; 2537 2538 bh_lock_sock(sk); 2539 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) 2540 ip6_datagram_dst_update(sk, false); 2541 bh_unlock_sock(sk); 2542 } 2543 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu); 2544 2545 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, 2546 const struct flowi6 *fl6) 2547 { 2548 #ifdef CONFIG_IPV6_SUBTREES 2549 struct ipv6_pinfo *np = inet6_sk(sk); 2550 #endif 2551 2552 ip6_dst_store(sk, dst, 2553 ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ? 2554 &sk->sk_v6_daddr : NULL, 2555 #ifdef CONFIG_IPV6_SUBTREES 2556 ipv6_addr_equal(&fl6->saddr, &np->saddr) ? 2557 &np->saddr : 2558 #endif 2559 NULL); 2560 } 2561 2562 static bool ip6_redirect_nh_match(const struct fib6_result *res, 2563 struct flowi6 *fl6, 2564 const struct in6_addr *gw, 2565 struct rt6_info **ret) 2566 { 2567 const struct fib6_nh *nh = res->nh; 2568 2569 if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family || 2570 fl6->flowi6_oif != nh->fib_nh_dev->ifindex) 2571 return false; 2572 2573 /* rt_cache's gateway might be different from its 'parent' 2574 * in the case of an ip redirect. 2575 * So we keep searching in the exception table if the gateway 2576 * is different. 
2577 */ 2578 if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) { 2579 struct rt6_info *rt_cache; 2580 2581 rt_cache = rt6_find_cached_rt(res, &fl6->daddr, &fl6->saddr); 2582 if (rt_cache && 2583 ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) { 2584 *ret = rt_cache; 2585 return true; 2586 } 2587 return false; 2588 } 2589 return true; 2590 } 2591 2592 /* Handle redirects */ 2593 struct ip6rd_flowi { 2594 struct flowi6 fl6; 2595 struct in6_addr gateway; 2596 }; 2597 2598 static struct rt6_info *__ip6_route_redirect(struct net *net, 2599 struct fib6_table *table, 2600 struct flowi6 *fl6, 2601 const struct sk_buff *skb, 2602 int flags) 2603 { 2604 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; 2605 struct rt6_info *ret = NULL; 2606 struct fib6_result res = {}; 2607 struct fib6_info *rt; 2608 struct fib6_node *fn; 2609 2610 /* l3mdev_update_flow overrides oif if the device is enslaved; in 2611 * this case we must match on the real ingress device, so reset it 2612 */ 2613 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) 2614 fl6->flowi6_oif = skb->dev->ifindex; 2615 2616 /* Get the "current" route for this destination and 2617 * check if the redirect has come from the appropriate router. 2618 * 2619 * RFC 4861 specifies that redirects should only be 2620 * accepted if they come from the nexthop to the target. 2621 * Due to the way the routes are chosen, this notion 2622 * is a bit fuzzy and one might need to check all possible 2623 * routes. 2624 */ 2625 2626 rcu_read_lock(); 2627 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 2628 restart: 2629 for_each_fib6_node_rt_rcu(fn) { 2630 res.f6i = rt; 2631 res.nh = rt->fib6_nh; 2632 2633 if (fib6_check_expired(rt)) 2634 continue; 2635 if (rt->fib6_flags & RTF_REJECT) 2636 break; 2637 if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway, &ret)) 2638 goto out; 2639 } 2640 2641 if (!rt) 2642 rt = net->ipv6.fib6_null_entry; 2643 else if (rt->fib6_flags & RTF_REJECT) { 2644 ret = net->ipv6.ip6_null_entry; 2645 goto out; 2646 } 2647 2648 if (rt == net->ipv6.fib6_null_entry) { 2649 fn = fib6_backtrack(fn, &fl6->saddr); 2650 if (fn) 2651 goto restart; 2652 } 2653 2654 res.f6i = rt; 2655 res.nh = rt->fib6_nh; 2656 out: 2657 if (ret) { 2658 ip6_hold_safe(net, &ret); 2659 } else { 2660 res.fib6_flags = res.f6i->fib6_flags; 2661 res.fib6_type = res.f6i->fib6_type; 2662 ret = ip6_create_rt_rcu(&res); 2663 } 2664 2665 rcu_read_unlock(); 2666 2667 trace_fib6_table_lookup(net, &res, table, fl6); 2668 return ret; 2669 } 2670 2671 static struct dst_entry *ip6_route_redirect(struct net *net, 2672 const struct flowi6 *fl6, 2673 const struct sk_buff *skb, 2674 const struct in6_addr *gateway) 2675 { 2676 int flags = RT6_LOOKUP_F_HAS_SADDR; 2677 struct ip6rd_flowi rdfl; 2678 2679 rdfl.fl6 = *fl6; 2680 rdfl.gateway = *gateway; 2681 2682 return fib6_rule_lookup(net, &rdfl.fl6, skb, 2683 flags, __ip6_route_redirect); 2684 } 2685 2686 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, 2687 kuid_t uid) 2688 { 2689 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2690 struct dst_entry *dst; 2691 struct flowi6 fl6 = { 2692 .flowi6_iif = LOOPBACK_IFINDEX, 2693 .flowi6_oif = oif, 2694 .flowi6_mark = mark, 2695 .daddr = iph->daddr, 2696 .saddr = iph->saddr, 2697 .flowlabel = ip6_flowinfo(iph), 2698 .flowi6_uid = uid, 2699 }; 2700 2701 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr); 2702 rt6_do_redirect(dst, NULL, skb); 2703 dst_release(dst); 2704 } 2705 EXPORT_SYMBOL_GPL(ip6_redirect); 2706 2707 void
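/* Editorial note (illustrative, not part of the original source):
 * the helpers above form the receive side of ICMPv6 redirects:
 * ip6_redirect() rebuilds a flow from the offending packet,
 * __ip6_route_redirect() finds the route whose nexthop matches the
 * advertising router (consulting cached exceptions, since an earlier
 * redirect may already have rewritten the gateway), and
 * rt6_do_redirect() validates the message per RFC 4861 and installs
 * the new gateway as an RTF_CACHE exception instead of rewriting the
 * FIB entry itself.
 */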
ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif) 2708 { 2709 const struct ipv6hdr *iph = ipv6_hdr(skb); 2710 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb); 2711 struct dst_entry *dst; 2712 struct flowi6 fl6 = { 2713 .flowi6_iif = LOOPBACK_IFINDEX, 2714 .flowi6_oif = oif, 2715 .daddr = msg->dest, 2716 .saddr = iph->daddr, 2717 .flowi6_uid = sock_net_uid(net, NULL), 2718 }; 2719 2720 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr); 2721 rt6_do_redirect(dst, NULL, skb); 2722 dst_release(dst); 2723 } 2724 2725 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) 2726 { 2727 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark, 2728 sk->sk_uid); 2729 } 2730 EXPORT_SYMBOL_GPL(ip6_sk_redirect); 2731 2732 static unsigned int ip6_default_advmss(const struct dst_entry *dst) 2733 { 2734 struct net_device *dev = dst->dev; 2735 unsigned int mtu = dst_mtu(dst); 2736 struct net *net = dev_net(dev); 2737 2738 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); 2739 2740 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) 2741 mtu = net->ipv6.sysctl.ip6_rt_min_advmss; 2742 2743 /* 2744 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 2745 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 2746 * IPV6_MAXPLEN is also valid and means: "any MSS, 2747 * rely only on pmtu discovery" 2748 */ 2749 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr)) 2750 mtu = IPV6_MAXPLEN; 2751 return mtu; 2752 } 2753 2754 static unsigned int ip6_mtu(const struct dst_entry *dst) 2755 { 2756 struct inet6_dev *idev; 2757 unsigned int mtu; 2758 2759 mtu = dst_metric_raw(dst, RTAX_MTU); 2760 if (mtu) 2761 goto out; 2762 2763 mtu = IPV6_MIN_MTU; 2764 2765 rcu_read_lock(); 2766 idev = __in6_dev_get(dst->dev); 2767 if (idev) 2768 mtu = idev->cnf.mtu6; 2769 rcu_read_unlock(); 2770 2771 out: 2772 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 2773 2774 return mtu - lwtunnel_headroom(dst->lwtstate, mtu); 2775 } 2776 2777 /* MTU selection: 2778 * 1. mtu on route is locked - use it 2779 * 2. mtu from nexthop exception 2780 * 3. 
mtu from egress device 2781 * 2782 * based on ip6_dst_mtu_forward and exception logic of 2783 * rt6_find_cached_rt; called with rcu_read_lock 2784 */ 2785 u32 ip6_mtu_from_fib6(const struct fib6_result *res, 2786 const struct in6_addr *daddr, 2787 const struct in6_addr *saddr) 2788 { 2789 const struct fib6_nh *nh = res->nh; 2790 struct fib6_info *f6i = res->f6i; 2791 struct inet6_dev *idev; 2792 struct rt6_info *rt; 2793 u32 mtu = 0; 2794 2795 if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) { 2796 mtu = f6i->fib6_pmtu; 2797 if (mtu) 2798 goto out; 2799 } 2800 2801 rt = rt6_find_cached_rt(res, daddr, saddr); 2802 if (unlikely(rt)) { 2803 mtu = dst_metric_raw(&rt->dst, RTAX_MTU); 2804 } else { 2805 struct net_device *dev = nh->fib_nh_dev; 2806 2807 mtu = IPV6_MIN_MTU; 2808 idev = __in6_dev_get(dev); 2809 if (idev && idev->cnf.mtu6 > mtu) 2810 mtu = idev->cnf.mtu6; 2811 } 2812 2813 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 2814 out: 2815 return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu); 2816 } 2817 2818 struct dst_entry *icmp6_dst_alloc(struct net_device *dev, 2819 struct flowi6 *fl6) 2820 { 2821 struct dst_entry *dst; 2822 struct rt6_info *rt; 2823 struct inet6_dev *idev = in6_dev_get(dev); 2824 struct net *net = dev_net(dev); 2825 2826 if (unlikely(!idev)) 2827 return ERR_PTR(-ENODEV); 2828 2829 rt = ip6_dst_alloc(net, dev, 0); 2830 if (unlikely(!rt)) { 2831 in6_dev_put(idev); 2832 dst = ERR_PTR(-ENOMEM); 2833 goto out; 2834 } 2835 2836 rt->dst.flags |= DST_HOST; 2837 rt->dst.input = ip6_input; 2838 rt->dst.output = ip6_output; 2839 rt->rt6i_gateway = fl6->daddr; 2840 rt->rt6i_dst.addr = fl6->daddr; 2841 rt->rt6i_dst.plen = 128; 2842 rt->rt6i_idev = idev; 2843 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0); 2844 2845 /* Add this dst into uncached_list so that rt6_disable_ip() can 2846 * do proper release of the net_device 2847 */ 2848 rt6_uncached_list_add(rt); 2849 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); 2850 2851 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0); 2852 2853 out: 2854 return dst; 2855 } 2856 2857 static int ip6_dst_gc(struct dst_ops *ops) 2858 { 2859 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); 2860 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; 2861 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size; 2862 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; 2863 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; 2864 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; 2865 int entries; 2866 2867 entries = dst_entries_get_fast(ops); 2868 if (time_after(rt_last_gc + rt_min_interval, jiffies) && 2869 entries <= rt_max_size) 2870 goto out; 2871 2872 net->ipv6.ip6_rt_gc_expire++; 2873 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true); 2874 entries = dst_entries_get_slow(ops); 2875 if (entries < ops->gc_thresh) 2876 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; 2877 out: 2878 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity; 2879 return entries > rt_max_size; 2880 } 2881 2882 static struct rt6_info *ip6_nh_lookup_table(struct net *net, 2883 struct fib6_config *cfg, 2884 const struct in6_addr *gw_addr, 2885 u32 tbid, int flags) 2886 { 2887 struct flowi6 fl6 = { 2888 .flowi6_oif = cfg->fc_ifindex, 2889 .daddr = *gw_addr, 2890 .saddr = cfg->fc_prefsrc, 2891 }; 2892 struct fib6_table *table; 2893 struct rt6_info *rt; 2894 2895 table = fib6_get_table(net, tbid); 2896 if (!table) 2897 return NULL; 2898 2899 if (!ipv6_addr_any(&cfg->fc_prefsrc)) 2900 flags |= RT6_LOOKUP_F_HAS_SADDR; 
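/* Editorial note (illustrative, not part of the original source):
 * a quick map of the RT6_LOOKUP_F_* flags used in this file:
 *
 *	RT6_LOOKUP_F_HAS_SADDR		source address is known, so
 *					source-based (subtree) matching
 *					and source prefs can apply
 *	RT6_LOOKUP_F_IFACE		the oif must match the route
 *	RT6_LOOKUP_F_REACHABLE		prefer (probably) reachable
 *					gateways, see rt6_score_route()
 *	RT6_LOOKUP_F_IGNORE_LINKSTATE	do not penalize linkdown
 *					nexthops, as set just below
 *
 * This helper only needs to learn which table and device would
 * resolve the gateway, not to forward, hence the relaxed link state.
 */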
2901 2902 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE; 2903 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags); 2904 2905 /* if table lookup failed, fall back to full lookup */ 2906 if (rt == net->ipv6.ip6_null_entry) { 2907 ip6_rt_put(rt); 2908 rt = NULL; 2909 } 2910 2911 return rt; 2912 } 2913 2914 static int ip6_route_check_nh_onlink(struct net *net, 2915 struct fib6_config *cfg, 2916 const struct net_device *dev, 2917 struct netlink_ext_ack *extack) 2918 { 2919 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN; 2920 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2921 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT; 2922 struct fib6_info *from; 2923 struct rt6_info *grt; 2924 int err; 2925 2926 err = 0; 2927 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0); 2928 if (grt) { 2929 rcu_read_lock(); 2930 from = rcu_dereference(grt->from); 2931 if (!grt->dst.error && 2932 /* ignore match if it is the default route */ 2933 from && !ipv6_addr_any(&from->fib6_dst.addr) && 2934 (grt->rt6i_flags & flags || dev != grt->dst.dev)) { 2935 NL_SET_ERR_MSG(extack, 2936 "Nexthop has invalid gateway or device mismatch"); 2937 err = -EINVAL; 2938 } 2939 rcu_read_unlock(); 2940 2941 ip6_rt_put(grt); 2942 } 2943 2944 return err; 2945 } 2946 2947 static int ip6_route_check_nh(struct net *net, 2948 struct fib6_config *cfg, 2949 struct net_device **_dev, 2950 struct inet6_dev **idev) 2951 { 2952 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2953 struct net_device *dev = _dev ? *_dev : NULL; 2954 struct rt6_info *grt = NULL; 2955 int err = -EHOSTUNREACH; 2956 2957 if (cfg->fc_table) { 2958 int flags = RT6_LOOKUP_F_IFACE; 2959 2960 grt = ip6_nh_lookup_table(net, cfg, gw_addr, 2961 cfg->fc_table, flags); 2962 if (grt) { 2963 if (grt->rt6i_flags & RTF_GATEWAY || 2964 (dev && dev != grt->dst.dev)) { 2965 ip6_rt_put(grt); 2966 grt = NULL; 2967 } 2968 } 2969 } 2970 2971 if (!grt) 2972 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1); 2973 2974 if (!grt) 2975 goto out; 2976 2977 if (dev) { 2978 if (dev != grt->dst.dev) { 2979 ip6_rt_put(grt); 2980 goto out; 2981 } 2982 } else { 2983 *_dev = dev = grt->dst.dev; 2984 *idev = grt->rt6i_idev; 2985 dev_hold(dev); 2986 in6_dev_hold(grt->rt6i_idev); 2987 } 2988 2989 if (!(grt->rt6i_flags & RTF_GATEWAY)) 2990 err = 0; 2991 2992 ip6_rt_put(grt); 2993 2994 out: 2995 return err; 2996 } 2997 2998 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg, 2999 struct net_device **_dev, struct inet6_dev **idev, 3000 struct netlink_ext_ack *extack) 3001 { 3002 const struct in6_addr *gw_addr = &cfg->fc_gateway; 3003 int gwa_type = ipv6_addr_type(gw_addr); 3004 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true; 3005 const struct net_device *dev = *_dev; 3006 bool need_addr_check = !dev; 3007 int err = -EINVAL; 3008 3009 /* if gw_addr is local we will fail to detect this in case 3010 * address is still TENTATIVE (DAD in progress). rt6_lookup() 3011 * will return already-added prefix route via interface that 3012 * prefix route was assigned to, which might be non-loopback. 3013 */ 3014 if (dev && 3015 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 3016 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 3017 goto out; 3018 } 3019 3020 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) { 3021 /* IPv6 strictly inhibits using non-link-local 3022 * addresses as a nexthop address. 3023 * Otherwise, a router will not be able to send redirects. 3024 * It is very good, but in some (rare!)
circumstances 3025 * (SIT, PtP, NBMA NOARP links) it is handy to allow 3026 * some exceptions. --ANK 3027 * We allow IPv4-mapped nexthops to support RFC4798-type 3028 * addressing 3029 */ 3030 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) { 3031 NL_SET_ERR_MSG(extack, "Invalid gateway address"); 3032 goto out; 3033 } 3034 3035 if (cfg->fc_flags & RTNH_F_ONLINK) 3036 err = ip6_route_check_nh_onlink(net, cfg, dev, extack); 3037 else 3038 err = ip6_route_check_nh(net, cfg, _dev, idev); 3039 3040 if (err) 3041 goto out; 3042 } 3043 3044 /* reload in case device was changed */ 3045 dev = *_dev; 3046 3047 err = -EINVAL; 3048 if (!dev) { 3049 NL_SET_ERR_MSG(extack, "Egress device not specified"); 3050 goto out; 3051 } else if (dev->flags & IFF_LOOPBACK) { 3052 NL_SET_ERR_MSG(extack, 3053 "Egress device can not be loopback device for this route"); 3054 goto out; 3055 } 3056 3057 /* if we did not check gw_addr above, do so now that the 3058 * egress device has been resolved. 3059 */ 3060 if (need_addr_check && 3061 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 3062 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 3063 goto out; 3064 } 3065 3066 err = 0; 3067 out: 3068 return err; 3069 } 3070 3071 static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type) 3072 { 3073 if ((flags & RTF_REJECT) || 3074 (dev && (dev->flags & IFF_LOOPBACK) && 3075 !(addr_type & IPV6_ADDR_LOOPBACK) && 3076 !(flags & RTF_LOCAL))) 3077 return true; 3078 3079 return false; 3080 } 3081 3082 int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, 3083 struct fib6_config *cfg, gfp_t gfp_flags, 3084 struct netlink_ext_ack *extack) 3085 { 3086 struct net_device *dev = NULL; 3087 struct inet6_dev *idev = NULL; 3088 int addr_type; 3089 int err; 3090 3091 fib6_nh->fib_nh_family = AF_INET6; 3092 3093 err = -ENODEV; 3094 if (cfg->fc_ifindex) { 3095 dev = dev_get_by_index(net, cfg->fc_ifindex); 3096 if (!dev) 3097 goto out; 3098 idev = in6_dev_get(dev); 3099 if (!idev) 3100 goto out; 3101 } 3102 3103 if (cfg->fc_flags & RTNH_F_ONLINK) { 3104 if (!dev) { 3105 NL_SET_ERR_MSG(extack, 3106 "Nexthop device required for onlink"); 3107 goto out; 3108 } 3109 3110 if (!(dev->flags & IFF_UP)) { 3111 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 3112 err = -ENETDOWN; 3113 goto out; 3114 } 3115 3116 fib6_nh->fib_nh_flags |= RTNH_F_ONLINK; 3117 } 3118 3119 fib6_nh->fib_nh_weight = 1; 3120 3121 /* We cannot add true routes via loopback here, 3122 * they would result in kernel looping; promote them to reject routes 3123 */ 3124 addr_type = ipv6_addr_type(&cfg->fc_dst); 3125 if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) { 3126 /* hold loopback dev/idev if we haven't done so. 
*/ 3127 if (dev != net->loopback_dev) { 3128 if (dev) { 3129 dev_put(dev); 3130 in6_dev_put(idev); 3131 } 3132 dev = net->loopback_dev; 3133 dev_hold(dev); 3134 idev = in6_dev_get(dev); 3135 if (!idev) { 3136 err = -ENODEV; 3137 goto out; 3138 } 3139 } 3140 goto pcpu_alloc; 3141 } 3142 3143 if (cfg->fc_flags & RTF_GATEWAY) { 3144 err = ip6_validate_gw(net, cfg, &dev, &idev, extack); 3145 if (err) 3146 goto out; 3147 3148 fib6_nh->fib_nh_gw6 = cfg->fc_gateway; 3149 fib6_nh->fib_nh_gw_family = AF_INET6; 3150 } 3151 3152 err = -ENODEV; 3153 if (!dev) 3154 goto out; 3155 3156 if (idev->cnf.disable_ipv6) { 3157 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device"); 3158 err = -EACCES; 3159 goto out; 3160 } 3161 3162 if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) { 3163 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 3164 err = -ENETDOWN; 3165 goto out; 3166 } 3167 3168 if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) && 3169 !netif_carrier_ok(dev)) 3170 fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN; 3171 3172 err = fib_nh_common_init(&fib6_nh->nh_common, cfg->fc_encap, 3173 cfg->fc_encap_type, cfg, gfp_flags, extack); 3174 if (err) 3175 goto out; 3176 3177 pcpu_alloc: 3178 fib6_nh->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags); 3179 if (!fib6_nh->rt6i_pcpu) { 3180 err = -ENOMEM; 3181 goto out; 3182 } 3183 3184 fib6_nh->fib_nh_dev = dev; 3185 fib6_nh->fib_nh_oif = dev->ifindex; 3186 err = 0; 3187 out: 3188 if (idev) 3189 in6_dev_put(idev); 3190 3191 if (err) { 3192 lwtstate_put(fib6_nh->fib_nh_lws); 3193 fib6_nh->fib_nh_lws = NULL; 3194 if (dev) 3195 dev_put(dev); 3196 } 3197 3198 return err; 3199 } 3200 3201 void fib6_nh_release(struct fib6_nh *fib6_nh) 3202 { 3203 struct rt6_exception_bucket *bucket; 3204 3205 rcu_read_lock(); 3206 3207 fib6_nh_flush_exceptions(fib6_nh, NULL); 3208 bucket = fib6_nh_get_excptn_bucket(fib6_nh, NULL); 3209 if (bucket) { 3210 rcu_assign_pointer(fib6_nh->rt6i_exception_bucket, NULL); 3211 kfree(bucket); 3212 } 3213 3214 rcu_read_unlock(); 3215 3216 if (fib6_nh->rt6i_pcpu) { 3217 int cpu; 3218 3219 for_each_possible_cpu(cpu) { 3220 struct rt6_info **ppcpu_rt; 3221 struct rt6_info *pcpu_rt; 3222 3223 ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu); 3224 pcpu_rt = *ppcpu_rt; 3225 if (pcpu_rt) { 3226 dst_dev_put(&pcpu_rt->dst); 3227 dst_release(&pcpu_rt->dst); 3228 *ppcpu_rt = NULL; 3229 } 3230 } 3231 3232 free_percpu(fib6_nh->rt6i_pcpu); 3233 } 3234 3235 fib_nh_common_release(&fib6_nh->nh_common); 3236 } 3237 3238 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, 3239 gfp_t gfp_flags, 3240 struct netlink_ext_ack *extack) 3241 { 3242 struct net *net = cfg->fc_nlinfo.nl_net; 3243 struct fib6_info *rt = NULL; 3244 struct nexthop *nh = NULL; 3245 struct fib6_table *table; 3246 struct fib6_nh *fib6_nh; 3247 int err = -EINVAL; 3248 int addr_type; 3249 3250 /* RTF_PCPU is an internal flag; can not be set by userspace */ 3251 if (cfg->fc_flags & RTF_PCPU) { 3252 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU"); 3253 goto out; 3254 } 3255 3256 /* RTF_CACHE is an internal flag; can not be set by userspace */ 3257 if (cfg->fc_flags & RTF_CACHE) { 3258 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE"); 3259 goto out; 3260 } 3261 3262 if (cfg->fc_type > RTN_MAX) { 3263 NL_SET_ERR_MSG(extack, "Invalid route type"); 3264 goto out; 3265 } 3266 3267 if (cfg->fc_dst_len > 128) { 3268 NL_SET_ERR_MSG(extack, "Invalid prefix length"); 3269 goto out; 3270 } 3271 if (cfg->fc_src_len > 128) { 3272 NL_SET_ERR_MSG(extack, 
"Invalid source address length"); 3273 goto out; 3274 } 3275 #ifndef CONFIG_IPV6_SUBTREES 3276 if (cfg->fc_src_len) { 3277 NL_SET_ERR_MSG(extack, 3278 "Specifying source address requires IPV6_SUBTREES to be enabled"); 3279 goto out; 3280 } 3281 #endif 3282 3283 err = -ENOBUFS; 3284 if (cfg->fc_nlinfo.nlh && 3285 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) { 3286 table = fib6_get_table(net, cfg->fc_table); 3287 if (!table) { 3288 pr_warn("NLM_F_CREATE should be specified when creating new route\n"); 3289 table = fib6_new_table(net, cfg->fc_table); 3290 } 3291 } else { 3292 table = fib6_new_table(net, cfg->fc_table); 3293 } 3294 3295 if (!table) 3296 goto out; 3297 3298 err = -ENOMEM; 3299 rt = fib6_info_alloc(gfp_flags, !nh); 3300 if (!rt) 3301 goto out; 3302 3303 rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len, 3304 extack); 3305 if (IS_ERR(rt->fib6_metrics)) { 3306 err = PTR_ERR(rt->fib6_metrics); 3307 /* Do not leave garbage there. */ 3308 rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics; 3309 goto out; 3310 } 3311 3312 if (cfg->fc_flags & RTF_ADDRCONF) 3313 rt->dst_nocount = true; 3314 3315 if (cfg->fc_flags & RTF_EXPIRES) 3316 fib6_set_expires(rt, jiffies + 3317 clock_t_to_jiffies(cfg->fc_expires)); 3318 else 3319 fib6_clean_expires(rt); 3320 3321 if (cfg->fc_protocol == RTPROT_UNSPEC) 3322 cfg->fc_protocol = RTPROT_BOOT; 3323 rt->fib6_protocol = cfg->fc_protocol; 3324 3325 rt->fib6_table = table; 3326 rt->fib6_metric = cfg->fc_metric; 3327 rt->fib6_type = cfg->fc_type; 3328 rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY; 3329 3330 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); 3331 rt->fib6_dst.plen = cfg->fc_dst_len; 3332 if (rt->fib6_dst.plen == 128) 3333 rt->dst_host = true; 3334 3335 #ifdef CONFIG_IPV6_SUBTREES 3336 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len); 3337 rt->fib6_src.plen = cfg->fc_src_len; 3338 #endif 3339 if (nh) { 3340 if (!nexthop_get(nh)) { 3341 NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); 3342 goto out; 3343 } 3344 if (rt->fib6_src.plen) { 3345 NL_SET_ERR_MSG(extack, "Nexthops can not be used with source routing"); 3346 goto out; 3347 } 3348 rt->nh = nh; 3349 fib6_nh = nexthop_fib6_nh(rt->nh); 3350 } else { 3351 err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack); 3352 if (err) 3353 goto out; 3354 3355 fib6_nh = rt->fib6_nh; 3356 3357 /* We cannot add true routes via loopback here, they would 3358 * result in kernel looping; promote them to reject routes 3359 */ 3360 addr_type = ipv6_addr_type(&cfg->fc_dst); 3361 if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh->fib_nh_dev, 3362 addr_type)) 3363 rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP; 3364 } 3365 3366 if (!ipv6_addr_any(&cfg->fc_prefsrc)) { 3367 struct net_device *dev = fib6_nh->fib_nh_dev; 3368 3369 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { 3370 NL_SET_ERR_MSG(extack, "Invalid source address"); 3371 err = -EINVAL; 3372 goto out; 3373 } 3374 rt->fib6_prefsrc.addr = cfg->fc_prefsrc; 3375 rt->fib6_prefsrc.plen = 128; 3376 } else 3377 rt->fib6_prefsrc.plen = 0; 3378 3379 return rt; 3380 out: 3381 fib6_info_release(rt); 3382 return ERR_PTR(err); 3383 } 3384 3385 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, 3386 struct netlink_ext_ack *extack) 3387 { 3388 struct fib6_info *rt; 3389 int err; 3390 3391 rt = ip6_route_info_create(cfg, gfp_flags, extack); 3392 if (IS_ERR(rt)) 3393 return PTR_ERR(rt); 3394 3395 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack); 3396 fib6_info_release(rt); 3397 
3398 return err; 3399 } 3400 3401 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info) 3402 { 3403 struct net *net = info->nl_net; 3404 struct fib6_table *table; 3405 int err; 3406 3407 if (rt == net->ipv6.fib6_null_entry) { 3408 err = -ENOENT; 3409 goto out; 3410 } 3411 3412 table = rt->fib6_table; 3413 spin_lock_bh(&table->tb6_lock); 3414 err = fib6_del(rt, info); 3415 spin_unlock_bh(&table->tb6_lock); 3416 3417 out: 3418 fib6_info_release(rt); 3419 return err; 3420 } 3421 3422 int ip6_del_rt(struct net *net, struct fib6_info *rt) 3423 { 3424 struct nl_info info = { .nl_net = net }; 3425 3426 return __ip6_del_rt(rt, &info); 3427 } 3428 3429 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg) 3430 { 3431 struct nl_info *info = &cfg->fc_nlinfo; 3432 struct net *net = info->nl_net; 3433 struct sk_buff *skb = NULL; 3434 struct fib6_table *table; 3435 int err = -ENOENT; 3436 3437 if (rt == net->ipv6.fib6_null_entry) 3438 goto out_put; 3439 table = rt->fib6_table; 3440 spin_lock_bh(&table->tb6_lock); 3441 3442 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) { 3443 struct fib6_info *sibling, *next_sibling; 3444 3445 /* prefer to send a single notification with all hops */ 3446 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 3447 if (skb) { 3448 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; 3449 3450 if (rt6_fill_node(net, skb, rt, NULL, 3451 NULL, NULL, 0, RTM_DELROUTE, 3452 info->portid, seq, 0) < 0) { 3453 kfree_skb(skb); 3454 skb = NULL; 3455 } else 3456 info->skip_notify = 1; 3457 } 3458 3459 list_for_each_entry_safe(sibling, next_sibling, 3460 &rt->fib6_siblings, 3461 fib6_siblings) { 3462 err = fib6_del(sibling, info); 3463 if (err) 3464 goto out_unlock; 3465 } 3466 } 3467 3468 err = fib6_del(rt, info); 3469 out_unlock: 3470 spin_unlock_bh(&table->tb6_lock); 3471 out_put: 3472 fib6_info_release(rt); 3473 3474 if (skb) { 3475 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 3476 info->nlh, gfp_any()); 3477 } 3478 return err; 3479 } 3480 3481 static int __ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) 3482 { 3483 int rc = -ESRCH; 3484 3485 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex) 3486 goto out; 3487 3488 if (cfg->fc_flags & RTF_GATEWAY && 3489 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) 3490 goto out; 3491 3492 rc = rt6_remove_exception_rt(rt); 3493 out: 3494 return rc; 3495 } 3496 3497 static int ip6_del_cached_rt(struct fib6_config *cfg, struct fib6_info *rt, 3498 struct fib6_nh *nh) 3499 { 3500 struct fib6_result res = { 3501 .f6i = rt, 3502 .nh = nh, 3503 }; 3504 struct rt6_info *rt_cache; 3505 3506 rt_cache = rt6_find_cached_rt(&res, &cfg->fc_dst, &cfg->fc_src); 3507 if (rt_cache) 3508 return __ip6_del_cached_rt(rt_cache, cfg); 3509 3510 return 0; 3511 } 3512 3513 static int ip6_route_del(struct fib6_config *cfg, 3514 struct netlink_ext_ack *extack) 3515 { 3516 struct fib6_table *table; 3517 struct fib6_info *rt; 3518 struct fib6_node *fn; 3519 int err = -ESRCH; 3520 3521 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); 3522 if (!table) { 3523 NL_SET_ERR_MSG(extack, "FIB table does not exist"); 3524 return err; 3525 } 3526 3527 rcu_read_lock(); 3528 3529 fn = fib6_locate(&table->tb6_root, 3530 &cfg->fc_dst, cfg->fc_dst_len, 3531 &cfg->fc_src, cfg->fc_src_len, 3532 !(cfg->fc_flags & RTF_CACHE)); 3533 3534 if (fn) { 3535 for_each_fib6_node_rt_rcu(fn) { 3536 struct fib6_nh *nh; 3537 3538 nh = rt->fib6_nh; 3539 if (cfg->fc_flags & RTF_CACHE) { 3540 int rc; 3541 3542 rc = 
ip6_del_cached_rt(cfg, rt, nh); 3543 if (rc != -ESRCH) { 3544 rcu_read_unlock(); 3545 return rc; 3546 } 3547 continue; 3548 } 3549 3550 if (cfg->fc_ifindex && 3551 (!nh->fib_nh_dev || 3552 nh->fib_nh_dev->ifindex != cfg->fc_ifindex)) 3553 continue; 3554 if (cfg->fc_flags & RTF_GATEWAY && 3555 !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6)) 3556 continue; 3557 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric) 3558 continue; 3559 if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol) 3560 continue; 3561 if (!fib6_info_hold_safe(rt)) 3562 continue; 3563 rcu_read_unlock(); 3564 3565 /* if gateway was specified only delete the one hop */ 3566 if (cfg->fc_flags & RTF_GATEWAY) 3567 return __ip6_del_rt(rt, &cfg->fc_nlinfo); 3568 3569 return __ip6_del_rt_siblings(rt, cfg); 3570 } 3571 } 3572 rcu_read_unlock(); 3573 3574 return err; 3575 } 3576 3577 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) 3578 { 3579 struct netevent_redirect netevent; 3580 struct rt6_info *rt, *nrt = NULL; 3581 struct fib6_result res = {}; 3582 struct ndisc_options ndopts; 3583 struct inet6_dev *in6_dev; 3584 struct neighbour *neigh; 3585 struct rd_msg *msg; 3586 int optlen, on_link; 3587 u8 *lladdr; 3588 3589 optlen = skb_tail_pointer(skb) - skb_transport_header(skb); 3590 optlen -= sizeof(*msg); 3591 3592 if (optlen < 0) { 3593 net_dbg_ratelimited("rt6_do_redirect: packet too short\n"); 3594 return; 3595 } 3596 3597 msg = (struct rd_msg *)icmp6_hdr(skb); 3598 3599 if (ipv6_addr_is_multicast(&msg->dest)) { 3600 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n"); 3601 return; 3602 } 3603 3604 on_link = 0; 3605 if (ipv6_addr_equal(&msg->dest, &msg->target)) { 3606 on_link = 1; 3607 } else if (ipv6_addr_type(&msg->target) != 3608 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { 3609 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n"); 3610 return; 3611 } 3612 3613 in6_dev = __in6_dev_get(skb->dev); 3614 if (!in6_dev) 3615 return; 3616 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects) 3617 return; 3618 3619 /* RFC2461 8.1: 3620 * The IP source address of the Redirect MUST be the same as the current 3621 * first-hop router for the specified ICMP Destination Address. 3622 */ 3623 3624 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) { 3625 net_dbg_ratelimited("rt6_redirect: invalid ND options\n"); 3626 return; 3627 } 3628 3629 lladdr = NULL; 3630 if (ndopts.nd_opts_tgt_lladdr) { 3631 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, 3632 skb->dev); 3633 if (!lladdr) { 3634 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n"); 3635 return; 3636 } 3637 } 3638 3639 rt = (struct rt6_info *) dst; 3640 if (rt->rt6i_flags & RTF_REJECT) { 3641 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n"); 3642 return; 3643 } 3644 3645 /* Redirect received -> path was valid. 3646 * Look, redirects are sent only in response to data packets, 3647 * so that this nexthop apparently is reachable. --ANK 3648 */ 3649 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr); 3650 3651 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1); 3652 if (!neigh) 3653 return; 3654 3655 /* 3656 * We have finally decided to accept it. 3657 */ 3658 3659 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, 3660 NEIGH_UPDATE_F_WEAK_OVERRIDE| 3661 NEIGH_UPDATE_F_OVERRIDE| 3662 (on_link ? 
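/* Editorial note (illustrative, not part of the original source):
 * when the redirect target differs from the destination, the target
 * is a first-hop router, so the ISROUTER bits below are asserted;
 * when the target equals the destination (on_link) the neighbour is
 * a plain host and no router flags are set on the ndisc update.
 */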
0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER| 3663 NEIGH_UPDATE_F_ISROUTER)), 3664 NDISC_REDIRECT, &ndopts); 3665 3666 rcu_read_lock(); 3667 res.f6i = rcu_dereference(rt->from); 3668 if (!res.f6i) 3669 goto out; 3670 3671 res.nh = res.f6i->fib6_nh; 3672 res.fib6_flags = res.f6i->fib6_flags; 3673 res.fib6_type = res.f6i->fib6_type; 3674 nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL); 3675 if (!nrt) 3676 goto out; 3677 3678 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE; 3679 if (on_link) 3680 nrt->rt6i_flags &= ~RTF_GATEWAY; 3681 3682 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; 3683 3684 /* rt6_insert_exception() will take care of duplicated exceptions */ 3685 if (rt6_insert_exception(nrt, &res)) { 3686 dst_release_immediate(&nrt->dst); 3687 goto out; 3688 } 3689 3690 netevent.old = &rt->dst; 3691 netevent.new = &nrt->dst; 3692 netevent.daddr = &msg->dest; 3693 netevent.neigh = neigh; 3694 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); 3695 3696 out: 3697 rcu_read_unlock(); 3698 neigh_release(neigh); 3699 } 3700 3701 #ifdef CONFIG_IPV6_ROUTE_INFO 3702 static struct fib6_info *rt6_get_route_info(struct net *net, 3703 const struct in6_addr *prefix, int prefixlen, 3704 const struct in6_addr *gwaddr, 3705 struct net_device *dev) 3706 { 3707 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; 3708 int ifindex = dev->ifindex; 3709 struct fib6_node *fn; 3710 struct fib6_info *rt = NULL; 3711 struct fib6_table *table; 3712 3713 table = fib6_get_table(net, tb_id); 3714 if (!table) 3715 return NULL; 3716 3717 rcu_read_lock(); 3718 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true); 3719 if (!fn) 3720 goto out; 3721 3722 for_each_fib6_node_rt_rcu(fn) { 3723 /* these routes do not use nexthops */ 3724 if (rt->nh) 3725 continue; 3726 if (rt->fib6_nh->fib_nh_dev->ifindex != ifindex) 3727 continue; 3728 if (!(rt->fib6_flags & RTF_ROUTEINFO) || 3729 !rt->fib6_nh->fib_nh_gw_family) 3730 continue; 3731 if (!ipv6_addr_equal(&rt->fib6_nh->fib_nh_gw6, gwaddr)) 3732 continue; 3733 if (!fib6_info_hold_safe(rt)) 3734 continue; 3735 break; 3736 } 3737 out: 3738 rcu_read_unlock(); 3739 return rt; 3740 } 3741 3742 static struct fib6_info *rt6_add_route_info(struct net *net, 3743 const struct in6_addr *prefix, int prefixlen, 3744 const struct in6_addr *gwaddr, 3745 struct net_device *dev, 3746 unsigned int pref) 3747 { 3748 struct fib6_config cfg = { 3749 .fc_metric = IP6_RT_PRIO_USER, 3750 .fc_ifindex = dev->ifindex, 3751 .fc_dst_len = prefixlen, 3752 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | 3753 RTF_UP | RTF_PREF(pref), 3754 .fc_protocol = RTPROT_RA, 3755 .fc_type = RTN_UNICAST, 3756 .fc_nlinfo.portid = 0, 3757 .fc_nlinfo.nlh = NULL, 3758 .fc_nlinfo.nl_net = net, 3759 }; 3760 3761 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; 3762 cfg.fc_dst = *prefix; 3763 cfg.fc_gateway = *gwaddr; 3764 3765 /* We should treat it as a default route if prefix length is 0. */ 3766 if (!prefixlen) 3767 cfg.fc_flags |= RTF_DEFAULT; 3768 3769 ip6_route_add(&cfg, GFP_ATOMIC, NULL); 3770 3771 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev); 3772 } 3773 #endif 3774 3775 struct fib6_info *rt6_get_dflt_router(struct net *net, 3776 const struct in6_addr *addr, 3777 struct net_device *dev) 3778 { 3779 u32 tb_id = l3mdev_fib_table(dev) ?
: RT6_TABLE_DFLT; 3780 struct fib6_info *rt; 3781 struct fib6_table *table; 3782 3783 table = fib6_get_table(net, tb_id); 3784 if (!table) 3785 return NULL; 3786 3787 rcu_read_lock(); 3788 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3789 struct fib6_nh *nh; 3790 3791 /* RA routes do not use nexthops */ 3792 if (rt->nh) 3793 continue; 3794 3795 nh = rt->fib6_nh; 3796 if (dev == nh->fib_nh_dev && 3797 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && 3798 ipv6_addr_equal(&nh->fib_nh_gw6, addr)) 3799 break; 3800 } 3801 if (rt && !fib6_info_hold_safe(rt)) 3802 rt = NULL; 3803 rcu_read_unlock(); 3804 return rt; 3805 } 3806 3807 struct fib6_info *rt6_add_dflt_router(struct net *net, 3808 const struct in6_addr *gwaddr, 3809 struct net_device *dev, 3810 unsigned int pref) 3811 { 3812 struct fib6_config cfg = { 3813 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT, 3814 .fc_metric = IP6_RT_PRIO_USER, 3815 .fc_ifindex = dev->ifindex, 3816 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | 3817 RTF_UP | RTF_EXPIRES | RTF_PREF(pref), 3818 .fc_protocol = RTPROT_RA, 3819 .fc_type = RTN_UNICAST, 3820 .fc_nlinfo.portid = 0, 3821 .fc_nlinfo.nlh = NULL, 3822 .fc_nlinfo.nl_net = net, 3823 }; 3824 3825 cfg.fc_gateway = *gwaddr; 3826 3827 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) { 3828 struct fib6_table *table; 3829 3830 table = fib6_get_table(dev_net(dev), cfg.fc_table); 3831 if (table) 3832 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER; 3833 } 3834 3835 return rt6_get_dflt_router(net, gwaddr, dev); 3836 } 3837 3838 static void __rt6_purge_dflt_routers(struct net *net, 3839 struct fib6_table *table) 3840 { 3841 struct fib6_info *rt; 3842 3843 restart: 3844 rcu_read_lock(); 3845 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3846 struct net_device *dev = fib6_info_nh_dev(rt); 3847 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL; 3848 3849 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) && 3850 (!idev || idev->cnf.accept_ra != 2) && 3851 fib6_info_hold_safe(rt)) { 3852 rcu_read_unlock(); 3853 ip6_del_rt(net, rt); 3854 goto restart; 3855 } 3856 } 3857 rcu_read_unlock(); 3858 3859 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER; 3860 } 3861 3862 void rt6_purge_dflt_routers(struct net *net) 3863 { 3864 struct fib6_table *table; 3865 struct hlist_head *head; 3866 unsigned int h; 3867 3868 rcu_read_lock(); 3869 3870 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { 3871 head = &net->ipv6.fib_table_hash[h]; 3872 hlist_for_each_entry_rcu(table, head, tb6_hlist) { 3873 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER) 3874 __rt6_purge_dflt_routers(net, table); 3875 } 3876 } 3877 3878 rcu_read_unlock(); 3879 } 3880 3881 static void rtmsg_to_fib6_config(struct net *net, 3882 struct in6_rtmsg *rtmsg, 3883 struct fib6_config *cfg) 3884 { 3885 *cfg = (struct fib6_config){ 3886 .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? 3887 : RT6_TABLE_MAIN, 3888 .fc_ifindex = rtmsg->rtmsg_ifindex, 3889 .fc_metric = rtmsg->rtmsg_metric ? 
: IP6_RT_PRIO_USER, 3890 .fc_expires = rtmsg->rtmsg_info, 3891 .fc_dst_len = rtmsg->rtmsg_dst_len, 3892 .fc_src_len = rtmsg->rtmsg_src_len, 3893 .fc_flags = rtmsg->rtmsg_flags, 3894 .fc_type = rtmsg->rtmsg_type, 3895 3896 .fc_nlinfo.nl_net = net, 3897 3898 .fc_dst = rtmsg->rtmsg_dst, 3899 .fc_src = rtmsg->rtmsg_src, 3900 .fc_gateway = rtmsg->rtmsg_gateway, 3901 }; 3902 } 3903 3904 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) 3905 { 3906 struct fib6_config cfg; 3907 struct in6_rtmsg rtmsg; 3908 int err; 3909 3910 switch (cmd) { 3911 case SIOCADDRT: /* Add a route */ 3912 case SIOCDELRT: /* Delete a route */ 3913 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 3914 return -EPERM; 3915 err = copy_from_user(&rtmsg, arg, 3916 sizeof(struct in6_rtmsg)); 3917 if (err) 3918 return -EFAULT; 3919 3920 rtmsg_to_fib6_config(net, &rtmsg, &cfg); 3921 3922 rtnl_lock(); 3923 switch (cmd) { 3924 case SIOCADDRT: 3925 err = ip6_route_add(&cfg, GFP_KERNEL, NULL); 3926 break; 3927 case SIOCDELRT: 3928 err = ip6_route_del(&cfg, NULL); 3929 break; 3930 default: 3931 err = -EINVAL; 3932 } 3933 rtnl_unlock(); 3934 3935 return err; 3936 } 3937 3938 return -EINVAL; 3939 } 3940 3941 /* 3942 * Drop the packet on the floor 3943 */ 3944 3945 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) 3946 { 3947 struct dst_entry *dst = skb_dst(skb); 3948 struct net *net = dev_net(dst->dev); 3949 struct inet6_dev *idev; 3950 int type; 3951 3952 if (netif_is_l3_master(skb->dev) && 3953 dst->dev == net->loopback_dev) 3954 idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif)); 3955 else 3956 idev = ip6_dst_idev(dst); 3957 3958 switch (ipstats_mib_noroutes) { 3959 case IPSTATS_MIB_INNOROUTES: 3960 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); 3961 if (type == IPV6_ADDR_ANY) { 3962 IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); 3963 break; 3964 } 3965 /* FALLTHROUGH */ 3966 case IPSTATS_MIB_OUTNOROUTES: 3967 IP6_INC_STATS(net, idev, ipstats_mib_noroutes); 3968 break; 3969 } 3970 3971 /* Start over by dropping the dst for l3mdev case */ 3972 if (netif_is_l3_master(skb->dev)) 3973 skb_dst_drop(skb); 3974 3975 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0); 3976 kfree_skb(skb); 3977 return 0; 3978 } 3979 3980 static int ip6_pkt_discard(struct sk_buff *skb) 3981 { 3982 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES); 3983 } 3984 3985 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3986 { 3987 skb->dev = skb_dst(skb)->dev; 3988 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); 3989 } 3990 3991 static int ip6_pkt_prohibit(struct sk_buff *skb) 3992 { 3993 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES); 3994 } 3995 3996 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3997 { 3998 skb->dev = skb_dst(skb)->dev; 3999 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); 4000 } 4001 4002 /* 4003 * Allocate a dst for local (unicast / anycast) address. 4004 */ 4005 4006 struct fib6_info *addrconf_f6i_alloc(struct net *net, 4007 struct inet6_dev *idev, 4008 const struct in6_addr *addr, 4009 bool anycast, gfp_t gfp_flags) 4010 { 4011 struct fib6_config cfg = { 4012 .fc_table = l3mdev_fib_table(idev->dev) ? 
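/* Editorial note (illustrative, not part of the original source):
 * ipv6_route_ioctl() above services the legacy route ioctls. A
 * userspace sketch, error handling omitted and the prefix purely
 * illustrative:
 *
 *	struct in6_rtmsg rt = {};
 *	int fd = socket(AF_INET6, SOCK_DGRAM, 0);
 *
 *	inet_pton(AF_INET6, "2001:db8::", &rt.rtmsg_dst);
 *	rt.rtmsg_dst_len = 64;
 *	rt.rtmsg_metric	 = 1;
 *	rt.rtmsg_flags	 = RTF_UP;
 *	rt.rtmsg_ifindex = if_nametoindex("eth0");
 *	ioctl(fd, SIOCADDRT, &rt);
 *
 * rtnetlink (RTM_NEWROUTE) is the modern interface; this path mainly
 * keeps route(8)-era tooling working.
 */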
: RT6_TABLE_LOCAL, 4013 .fc_ifindex = idev->dev->ifindex, 4014 .fc_flags = RTF_UP | RTF_ADDRCONF | RTF_NONEXTHOP, 4015 .fc_dst = *addr, 4016 .fc_dst_len = 128, 4017 .fc_protocol = RTPROT_KERNEL, 4018 .fc_nlinfo.nl_net = net, 4019 .fc_ignore_dev_down = true, 4020 }; 4021 4022 if (anycast) { 4023 cfg.fc_type = RTN_ANYCAST; 4024 cfg.fc_flags |= RTF_ANYCAST; 4025 } else { 4026 cfg.fc_type = RTN_LOCAL; 4027 cfg.fc_flags |= RTF_LOCAL; 4028 } 4029 4030 return ip6_route_info_create(&cfg, gfp_flags, NULL); 4031 } 4032 4033 /* remove a deleted IP from prefsrc entries */ 4034 struct arg_dev_net_ip { 4035 struct net_device *dev; 4036 struct net *net; 4037 struct in6_addr *addr; 4038 }; 4039 4040 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg) 4041 { 4042 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev; 4043 struct net *net = ((struct arg_dev_net_ip *)arg)->net; 4044 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; 4045 4046 if (!rt->nh && 4047 ((void *)rt->fib6_nh->fib_nh_dev == dev || !dev) && 4048 rt != net->ipv6.fib6_null_entry && 4049 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) { 4050 spin_lock_bh(&rt6_exception_lock); 4051 /* remove prefsrc entry */ 4052 rt->fib6_prefsrc.plen = 0; 4053 spin_unlock_bh(&rt6_exception_lock); 4054 } 4055 return 0; 4056 } 4057 4058 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) 4059 { 4060 struct net *net = dev_net(ifp->idev->dev); 4061 struct arg_dev_net_ip adni = { 4062 .dev = ifp->idev->dev, 4063 .net = net, 4064 .addr = &ifp->addr, 4065 }; 4066 fib6_clean_all(net, fib6_remove_prefsrc, &adni); 4067 } 4068 4069 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT) 4070 4071 /* Remove routers and update dst entries when a gateway turns into a host. */ 4072 static int fib6_clean_tohost(struct fib6_info *rt, void *arg) 4073 { 4074 struct in6_addr *gateway = (struct in6_addr *)arg; 4075 struct fib6_nh *nh; 4076 4077 /* RA routes do not use nexthops */ 4078 if (rt->nh) 4079 return 0; 4080 4081 nh = rt->fib6_nh; 4082 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && 4083 nh->fib_nh_gw_family && ipv6_addr_equal(gateway, &nh->fib_nh_gw6)) 4084 return -1; 4085 4086 /* Further clean up cached routes in exception table. 4087 * This is needed because a cached route may have a different 4088 * gateway than its 'parent' in the case of an ip redirect.
4089 */ 4090 fib6_nh_exceptions_clean_tohost(nh, gateway); 4091 4092 return 0; 4093 } 4094 4095 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) 4096 { 4097 fib6_clean_all(net, fib6_clean_tohost, gateway); 4098 } 4099 4100 struct arg_netdev_event { 4101 const struct net_device *dev; 4102 union { 4103 unsigned char nh_flags; 4104 unsigned long event; 4105 }; 4106 }; 4107 4108 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) 4109 { 4110 struct fib6_info *iter; 4111 struct fib6_node *fn; 4112 4113 fn = rcu_dereference_protected(rt->fib6_node, 4114 lockdep_is_held(&rt->fib6_table->tb6_lock)); 4115 iter = rcu_dereference_protected(fn->leaf, 4116 lockdep_is_held(&rt->fib6_table->tb6_lock)); 4117 while (iter) { 4118 if (iter->fib6_metric == rt->fib6_metric && 4119 rt6_qualify_for_ecmp(iter)) 4120 return iter; 4121 iter = rcu_dereference_protected(iter->fib6_next, 4122 lockdep_is_held(&rt->fib6_table->tb6_lock)); 4123 } 4124 4125 return NULL; 4126 } 4127 4128 /* only called for fib entries with builtin fib6_nh */ 4129 static bool rt6_is_dead(const struct fib6_info *rt) 4130 { 4131 if (rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD || 4132 (rt->fib6_nh->fib_nh_flags & RTNH_F_LINKDOWN && 4133 ip6_ignore_linkdown(rt->fib6_nh->fib_nh_dev))) 4134 return true; 4135 4136 return false; 4137 } 4138 4139 static int rt6_multipath_total_weight(const struct fib6_info *rt) 4140 { 4141 struct fib6_info *iter; 4142 int total = 0; 4143 4144 if (!rt6_is_dead(rt)) 4145 total += rt->fib6_nh->fib_nh_weight; 4146 4147 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { 4148 if (!rt6_is_dead(iter)) 4149 total += iter->fib6_nh->fib_nh_weight; 4150 } 4151 4152 return total; 4153 } 4154 4155 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total) 4156 { 4157 int upper_bound = -1; 4158 4159 if (!rt6_is_dead(rt)) { 4160 *weight += rt->fib6_nh->fib_nh_weight; 4161 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, 4162 total) - 1; 4163 } 4164 atomic_set(&rt->fib6_nh->fib_nh_upper_bound, upper_bound); 4165 } 4166 4167 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total) 4168 { 4169 struct fib6_info *iter; 4170 int weight = 0; 4171 4172 rt6_upper_bound_set(rt, &weight, total); 4173 4174 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4175 rt6_upper_bound_set(iter, &weight, total); 4176 } 4177 4178 void rt6_multipath_rebalance(struct fib6_info *rt) 4179 { 4180 struct fib6_info *first; 4181 int total; 4182 4183 /* In case the entire multipath route was marked for flushing, 4184 * then there is no need to rebalance upon the removal of every 4185 * sibling route. 4186 */ 4187 if (!rt->fib6_nsiblings || rt->should_flush) 4188 return; 4189 4190 /* During lookup routes are evaluated in order, so we need to 4191 * make sure upper bounds are assigned from the first sibling 4192 * onwards. 
void rt6_multipath_rebalance(struct fib6_info *rt)
{
	struct fib6_info *first;
	int total;

	/* In case the entire multipath route was marked for flushing,
	 * then there is no need to rebalance upon the removal of every
	 * sibling route.
	 */
	if (!rt->fib6_nsiblings || rt->should_flush)
		return;

	/* During lookup routes are evaluated in order, so we need to
	 * make sure upper bounds are assigned from the first sibling
	 * onwards.
	 */
	first = rt6_multipath_first_sibling(rt);
	if (WARN_ON_ONCE(!first))
		return;

	total = rt6_multipath_total_weight(first);
	rt6_multipath_upper_bound_set(first, total);
}

static int fib6_ifup(struct fib6_info *rt, void *p_arg)
{
	const struct arg_netdev_event *arg = p_arg;
	struct net *net = dev_net(arg->dev);

	if (rt != net->ipv6.fib6_null_entry && !rt->nh &&
	    rt->fib6_nh->fib_nh_dev == arg->dev) {
		rt->fib6_nh->fib_nh_flags &= ~arg->nh_flags;
		fib6_update_sernum_upto_root(net, rt);
		rt6_multipath_rebalance(rt);
	}

	return 0;
}

void rt6_sync_up(struct net_device *dev, unsigned char nh_flags)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.nh_flags = nh_flags,
		},
	};

	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
		arg.nh_flags |= RTNH_F_LINKDOWN;

	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
}

/* only called for fib entries with inline fib6_nh */
static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
				   const struct net_device *dev)
{
	struct fib6_info *iter;

	if (rt->fib6_nh->fib_nh_dev == dev)
		return true;
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		if (iter->fib6_nh->fib_nh_dev == dev)
			return true;

	return false;
}

static void rt6_multipath_flush(struct fib6_info *rt)
{
	struct fib6_info *iter;

	rt->should_flush = 1;
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		iter->should_flush = 1;
}

static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
					     const struct net_device *down_dev)
{
	struct fib6_info *iter;
	unsigned int dead = 0;

	if (rt->fib6_nh->fib_nh_dev == down_dev ||
	    rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD)
		dead++;
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		if (iter->fib6_nh->fib_nh_dev == down_dev ||
		    iter->fib6_nh->fib_nh_flags & RTNH_F_DEAD)
			dead++;

	return dead;
}

static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
				       const struct net_device *dev,
				       unsigned char nh_flags)
{
	struct fib6_info *iter;

	if (rt->fib6_nh->fib_nh_dev == dev)
		rt->fib6_nh->fib_nh_flags |= nh_flags;
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		if (iter->fib6_nh->fib_nh_dev == dev)
			iter->fib6_nh->fib_nh_flags |= nh_flags;
}
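
/* Return-value convention for the fib6_clean_all() callbacks in this
 * file (fib6_ifup above, fib6_ifdown below), as interpreted by
 * fib6_clean_node() in ip6_fib.c: 0 keeps the entry, -1 asks the walker
 * to delete it, and -2 signals that a whole multipath route was handled
 * in one go, letting the walker skip ahead past the remaining siblings.
 */
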
/* called with write lock held for table with rt */
static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
{
	const struct arg_netdev_event *arg = p_arg;
	const struct net_device *dev = arg->dev;
	struct net *net = dev_net(dev);

	if (rt == net->ipv6.fib6_null_entry || rt->nh)
		return 0;

	switch (arg->event) {
	case NETDEV_UNREGISTER:
		return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0;
	case NETDEV_DOWN:
		if (rt->should_flush)
			return -1;
		if (!rt->fib6_nsiblings)
			return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0;
		if (rt6_multipath_uses_dev(rt, dev)) {
			unsigned int count;

			count = rt6_multipath_dead_count(rt, dev);
			if (rt->fib6_nsiblings + 1 == count) {
				rt6_multipath_flush(rt);
				return -1;
			}
			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
						   RTNH_F_LINKDOWN);
			fib6_update_sernum(net, rt);
			rt6_multipath_rebalance(rt);
		}
		return -2;
	case NETDEV_CHANGE:
		if (rt->fib6_nh->fib_nh_dev != dev ||
		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
			break;
		rt->fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
		rt6_multipath_rebalance(rt);
		break;
	}

	return 0;
}

void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.event = event,
		},
	};
	struct net *net = dev_net(dev);

	if (net->ipv6.sysctl.skip_notify_on_dev_down)
		fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
	else
		fib6_clean_all(net, fib6_ifdown, &arg);
}

void rt6_disable_ip(struct net_device *dev, unsigned long event)
{
	rt6_sync_down_dev(dev, event);
	rt6_uncached_list_flush_dev(dev_net(dev), dev);
	neigh_ifdown(&nd_tbl, dev);
}

struct rt6_mtu_change_arg {
	struct net_device *dev;
	unsigned int mtu;
	struct fib6_info *f6i;
};

static int fib6_nh_mtu_change(struct fib6_nh *nh, void *_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *)_arg;
	struct fib6_info *f6i = arg->f6i;

	/* A PMTU increase cannot be discovered by IPv6 Path MTU discovery
	 * (RFC 1981 has no mechanism for it), so an administrative MTU
	 * increase must be propagated to the routes here (e.g. when a
	 * device is switched to jumbo frames).
	 */
	if (nh->fib_nh_dev == arg->dev) {
		struct inet6_dev *idev = __in6_dev_get(arg->dev);
		u32 mtu = f6i->fib6_pmtu;

		if (mtu >= arg->mtu ||
		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
			fib6_metric_set(f6i, RTAX_MTU, arg->mtu);

		spin_lock_bh(&rt6_exception_lock);
		rt6_exceptions_update_pmtu(idev, nh, arg->mtu);
		spin_unlock_bh(&rt6_exception_lock);
	}

	return 0;
}
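
/* Example (illustrative): after `ip link set dev eth0 mtu 9000`, the
 * NETDEV_CHANGEMTU notifier lands in rt6_mtu_change() below before
 * idev->cnf.mtu6 is updated. A route whose RTAX_MTU still equals the
 * old device MTU (say 1500) is bumped to 9000, while one lowered by
 * PMTU discovery (e.g. 1280) is left alone because it no longer
 * matches the old device MTU.
 */
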
static int rt6_mtu_change_route(struct fib6_info *f6i, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* PMTU discovery is not optional in IPv6, so a locked RTAX_MTU
	 * metric cannot disable it. The lock is still honoured here to
	 * block changes caused by addrconf/ndisc.
	 */

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	if (fib6_metric_locked(f6i, RTAX_MTU))
		return 0;

	arg->f6i = f6i;
	return fib6_nh_mtu_change(f6i->fib6_nh, arg);
}

void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
{
	struct rt6_mtu_change_arg arg = {
		.dev = dev,
		.mtu = mtu,
	};

	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
}

static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_UNSPEC]		= { .strict_start_type = RTA_DPORT + 1 },
	[RTA_GATEWAY]		= { .len = sizeof(struct in6_addr) },
	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
	[RTA_OIF]		= { .type = NLA_U32 },
	[RTA_IIF]		= { .type = NLA_U32 },
	[RTA_PRIORITY]		= { .type = NLA_U32 },
	[RTA_METRICS]		= { .type = NLA_NESTED },
	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]		= { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
	[RTA_ENCAP]		= { .type = NLA_NESTED },
	[RTA_EXPIRES]		= { .type = NLA_U32 },
	[RTA_UID]		= { .type = NLA_U32 },
	[RTA_MARK]		= { .type = NLA_U32 },
	[RTA_TABLE]		= { .type = NLA_U32 },
	[RTA_IP_PROTO]		= { .type = NLA_U8 },
	[RTA_SPORT]		= { .type = NLA_U16 },
	[RTA_DPORT]		= { .type = NLA_U16 },
};
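
/* Note on the policy above: .strict_start_type = RTA_DPORT + 1 keeps
 * the historical lenient parsing for attribute types this kernel knows
 * about, while anything newer or unknown is rejected even via
 * nlmsg_parse_deprecated() instead of being silently skipped. New
 * attributes can thus be added later without old requests breaking,
 * yet a newer userspace talking to this kernel fails loudly.
 */
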
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg,
			      struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
				     rtm_ipv6_policy, extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);

	*cfg = (struct fib6_config){
		.fc_table = rtm->rtm_table,
		.fc_dst_len = rtm->rtm_dst_len,
		.fc_src_len = rtm->rtm_src_len,
		.fc_flags = RTF_UP,
		.fc_protocol = rtm->rtm_protocol,
		.fc_type = rtm->rtm_type,

		.fc_nlinfo.portid = NETLINK_CB(skb).portid,
		.fc_nlinfo.nlh = nlh,
		.fc_nlinfo.nl_net = sock_net(skb->sk),
	};

	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}
	if (tb[RTA_VIA]) {
		NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
		goto errout;
	}

	if (tb[RTA_DST]) {
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
						     cfg->fc_mp_len, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_PREF]) {
		pref = nla_get_u8(tb[RTA_PREF]);
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];

	if (tb[RTA_ENCAP_TYPE]) {
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_EXPIRES]) {
		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

		if (addrconf_finite_timeout(timeout)) {
			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
			cfg->fc_flags |= RTF_EXPIRES;
		}
	}

	err = 0;
errout:
	return err;
}

struct rt6_nh {
	struct fib6_info *fib6_info;
	struct fib6_config r_cfg;
	struct list_head next;
};

static int ip6_route_info_append(struct net *net,
				 struct list_head *rt6_nh_list,
				 struct fib6_info *rt,
				 struct fib6_config *r_cfg)
{
	struct rt6_nh *nh;
	int err = -EEXIST;

	list_for_each_entry(nh, rt6_nh_list, next) {
		/* check if fib6_info already exists */
		if (rt6_duplicate_nexthop(nh->fib6_info, rt))
			return err;
	}

	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
	if (!nh)
		return -ENOMEM;
	nh->fib6_info = rt;
	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
	list_add_tail(&nh->next, rt6_nh_list);

	return 0;
}

static void ip6_route_mpath_notify(struct fib6_info *rt,
				   struct fib6_info *rt_last,
				   struct nl_info *info,
				   __u16 nlflags)
{
	/* if this is an APPEND route, then rt points to the first route
	 * inserted and rt_last points to the last route inserted.
	 * Userspace wants a consistent dump of the route which starts at
	 * the first nexthop. Since sibling routes are always added at the
	 * end of the list, find the first sibling of the last route
	 * appended.
	 */
	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
		rt = list_first_entry(&rt_last->fib6_siblings,
				      struct fib6_info,
				      fib6_siblings);
	}

	if (rt)
		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
}
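
/* Example (illustrative): the RTA_MULTIPATH attribute parsed below is
 * an array of struct rtnexthop entries, each trailed by its own nested
 * attributes, as built by e.g.
 *
 *	ip -6 route add 2001:db8::/64 \
 *		nexthop via fe80::1 dev eth0 weight 1 \
 *		nexthop via fe80::2 dev eth1 weight 2
 *
 * Each entry carries rtnh_ifindex, rtnh_hops (weight - 1, hence the
 * "+ 1" when fib_nh_weight is set below) and an RTA_GATEWAY attribute.
 */
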
static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
	struct nl_info *info = &cfg->fc_nlinfo;
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	struct fib6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	__u16 nlflags;
	int remaining;
	int attrlen;
	int err = 1;
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);

	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * fib6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}

		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}
		if (!rt6_qualify_for_ecmp(rt)) {
			err = -EINVAL;
			NL_SET_ERR_MSG(extack,
				       "Device-only routes cannot be added for IPv6 using the multipath API.");
			fib6_info_release(rt);
			goto cleanup;
		}

		rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1;

		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
					    rt, &r_cfg);
		if (err) {
			fib6_info_release(rt);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* for add and replace, send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done
	 */
	info->skip_notify = 1;

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		err = __ip6_ins_rt(nh->fib6_info, info, extack);
		fib6_info_release(nh->fib6_info);

		if (!err) {
			/* save reference to last route successfully inserted */
			rt_last = nh->fib6_info;

			/* save reference to first route for notification */
			if (!rt_notif)
				rt_notif = nh->fib6_info;
		}

		/* nh->fib6_info is used or freed at this point, reset to NULL */
		nh->fib6_info = NULL;
		if (err) {
			if (replace && nhn)
				NL_SET_ERR_MSG_MOD(extack,
						   "multipath route replace failed (check consistency of installed routes)");
			err_nh = nh;
			goto add_errout;
		}

		/* Because each route is added like a single route, we remove
		 * these flags after the first nexthop: if there is a
		 * collision, we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by the first new one, and the
		 * rest should be added to it.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		nhn++;
	}

	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	goto cleanup;

add_errout:
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg, extack);
	}

cleanup:
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		if (nh->fib6_info)
			fib6_info_release(nh->fib6_info);
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}

static int ip6_route_multipath_del(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	int remaining;
	int attrlen;
	int err = 1, last_err = 0;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry */
	while (rtnh_ok(rtnh, remaining)) {
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
		}
		err = ip6_route_del(&r_cfg, extack);
		if (err)
			last_err = err;

		rtnh = rtnh_next(rtnh, &remaining);
	}

	return last_err;
}

static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct fib6_config cfg;
	int err;

	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
	if (err < 0)
		return err;

	if (cfg.fc_mp)
		return ip6_route_multipath_del(&cfg, extack);

	cfg.fc_delete_all_nh = 1;
	return ip6_route_del(&cfg, extack);
}

static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct fib6_config cfg;
	int err;

	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
	if (err < 0)
		return err;

	if (cfg.fc_metric == 0)
		cfg.fc_metric = IP6_RT_PRIO_USER;

	if (cfg.fc_mp)
		return ip6_route_multipath_add(&cfg, extack);

	return ip6_route_add(&cfg, GFP_KERNEL, extack);
}
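
/* Note (illustrative): a priority of 0 in an RTM_NEWROUTE request is
 * replaced with IP6_RT_PRIO_USER (1024) above, which is why
 * `ip -6 route add 2001:db8::/64 dev eth0` shows up as "metric 1024"
 * in `ip -6 route show` even though no metric was given.
 */
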
static size_t rt6_nlmsg_size(struct fib6_info *rt)
{
	int nexthop_len = 0;

	if (rt->nh)
		nexthop_len += nla_total_size(4); /* RTA_NH_ID */

	if (rt->fib6_nsiblings) {
		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
			    + NLA_ALIGN(sizeof(struct rtnexthop))
			    + nla_total_size(16) /* RTA_GATEWAY */
			    + lwtunnel_get_encap_size(rt->fib6_nh->fib_nh_lws);

		nexthop_len *= rt->fib6_nsiblings;
	}

	return NLMSG_ALIGN(sizeof(struct rtmsg))
	       + nla_total_size(16) /* RTA_SRC */
	       + nla_total_size(16) /* RTA_DST */
	       + nla_total_size(16) /* RTA_GATEWAY */
	       + nla_total_size(16) /* RTA_PREFSRC */
	       + nla_total_size(4) /* RTA_TABLE */
	       + nla_total_size(4) /* RTA_IIF */
	       + nla_total_size(4) /* RTA_OIF */
	       + nla_total_size(4) /* RTA_PRIORITY */
	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
	       + nla_total_size(sizeof(struct rta_cacheinfo))
	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
	       + nla_total_size(1) /* RTA_PREF */
	       + lwtunnel_get_encap_size(rt->fib6_nh->fib_nh_lws)
	       + nexthop_len;
}

static int rt6_fill_node_nexthop(struct sk_buff *skb, struct nexthop *nh,
				 unsigned char *flags)
{
	if (nexthop_is_multipath(nh)) {
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (nexthop_mpath_fill_node(skb, nh))
			goto nla_put_failure;

		nla_nest_end(skb, mp);
	} else {
		struct fib6_nh *fib6_nh;

		fib6_nh = nexthop_fib6_nh(nh);
		if (fib_nexthop_info(skb, &fib6_nh->nh_common,
				     flags, false) < 0)
			goto nla_put_failure;
	}

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
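
/* Example (illustrative): routes referencing a nexthop object are
 * dumped with RTA_NH_ID plus the nexthop details filled in above, e.g.
 * after
 *
 *	ip nexthop add id 7 via fe80::1 dev eth0
 *	ip -6 route add 2001:db8::/64 nhid 7
 *
 * the route dump reports "nhid 7" with the gateway and device resolved
 * from the nexthop object rather than from an inline fib6_nh.
 */
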
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	struct rt6_info *rt6 = (struct rt6_info *)dst;
	struct rt6key *rt6_dst, *rt6_src;
	u32 *pmetrics, table, rt6_flags;
	unsigned char nh_flags = 0;
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;
	long expires = 0;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	if (rt6) {
		rt6_dst = &rt6->rt6i_dst;
		rt6_src = &rt6->rt6i_src;
		rt6_flags = rt6->rt6i_flags;
	} else {
		rt6_dst = &rt->fib6_dst;
		rt6_src = &rt->fib6_src;
		rt6_flags = rt->fib6_flags;
	}

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt6_dst->plen;
	rtm->rtm_src_len = rt6_src->plen;
	rtm->rtm_tos = 0;
	if (rt->fib6_table)
		table = rt->fib6_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	/* rtm_table is a u8, so table IDs >= 256 are reported as
	 * RT_TABLE_COMPAT here, with the real ID in RTA_TABLE below.
	 */
	rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;

	rtm->rtm_type = rt->fib6_type;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->fib6_protocol;

	if (rt6_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	if (dest) {
		if (nla_put_in6_addr(skb, RTA_DST, dest))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dest) {
		struct in6_addr saddr_buf;

		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->fib6_prefsrc.plen) {
		struct in6_addr saddr_buf;

		saddr_buf = rt->fib6_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt6) {
		if (rt6_flags & RTF_GATEWAY &&
		    nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
			goto nla_put_failure;

		if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
			goto nla_put_failure;
	} else if (rt->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (fib_add_nexthop(skb, &rt->fib6_nh->nh_common,
				    rt->fib6_nh->fib_nh_weight) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings, fib6_siblings) {
			if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common,
					    sibling->fib6_nh->fib_nh_weight) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else if (rt->nh) {
		if (nla_put_u32(skb, RTA_NH_ID, rt->nh->id))
			goto nla_put_failure;

		if (nexthop_is_blackhole(rt->nh))
			rtm->rtm_type = RTN_BLACKHOLE;

		if (rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0)
			goto nla_put_failure;

		rtm->rtm_flags |= nh_flags;
	} else {
		if (fib_nexthop_info(skb, &rt->fib6_nh->nh_common,
				     &nh_flags, false) < 0)
			goto nla_put_failure;

		rtm->rtm_flags |= nh_flags;
	}

	if (rt6_flags & RTF_EXPIRES) {
		expires = dst ? dst->expires : rt->expires;
		expires -= jiffies;
	}

	if (rtnl_put_cacheinfo(skb, dst, 0, expires,
			       dst ? dst->error : 0) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

static bool fib6_info_uses_dev(const struct fib6_info *f6i,
			       const struct net_device *dev)
{
	if (f6i->fib6_nh->fib_nh_dev == dev)
		return true;

	if (f6i->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;

		list_for_each_entry_safe(sibling, next_sibling,
					 &f6i->fib6_siblings, fib6_siblings) {
			if (sibling->fib6_nh->fib_nh_dev == dev)
				return true;
		}
	}

	return false;
}

int rt6_dump_route(struct fib6_info *rt, void *p_arg)
{
	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
	struct fib_dump_filter *filter = &arg->filter;
	unsigned int flags = NLM_F_MULTI;
	struct net *net = arg->net;

	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	if ((filter->flags & RTM_F_PREFIX) &&
	    !(rt->fib6_flags & RTF_PREFIX_RT)) {
		/* success since this is not a prefix route */
		return 1;
	}
	if (filter->filter_set) {
		if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
		    (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
		    (filter->protocol && rt->fib6_protocol != filter->protocol)) {
			return 1;
		}
		flags |= NLM_F_DUMP_FILTERED;
	}

	return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
			     RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
			     arg->cb->nlh->nlmsg_seq, flags);
}

static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
					const struct nlmsghdr *nlh,
					struct nlattr **tb,
					struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	int i, err;

	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
		NL_SET_ERR_MSG_MOD(extack,
				   "Invalid header for get route request");
		return -EINVAL;
	}

	if (!netlink_strict_get_check(skb))
		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
					      rtm_ipv6_policy, extack);

	rtm = nlmsg_data(nlh);
	if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
	    rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
	    rtm->rtm_type) {
		NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
		return -EINVAL;
	}
	if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
		NL_SET_ERR_MSG_MOD(extack,
				   "Invalid flags for get route request");
		return -EINVAL;
	}

	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
					    rtm_ipv6_policy, extack);
	if (err)
		return err;

	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
		NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
		return -EINVAL;
	}

	for (i = 0; i <= RTA_MAX; i++) {
		if (!tb[i])
			continue;

		switch (i) {
		case RTA_SRC:
		case RTA_DST:
		case RTA_IIF:
		case RTA_OIF:
		case RTA_MARK:
		case RTA_UID:
		case RTA_SPORT:
		case RTA_DPORT:
		case RTA_IP_PROTO:
			break;
		default:
			NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
			return -EINVAL;
		}
	}

	return 0;
}
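
/* Example (illustrative): under strict checking, a request such as
 *
 *	ip -6 route get 2001:db8::1 fibmatch
 *
 * must carry rtm_dst_len == 128 together with RTA_DST, zeroed header
 * fields, and at most the RTM_F_FIB_MATCH flag; anything else is
 * rejected with one of the extack messages above rather than being
 * silently ignored.
 */
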
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	int err, iif = 0, oif = 0;
	struct fib6_info *from;
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6 = {};
	bool fibmatch;

	err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

	if (tb[RTA_SPORT])
		fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);

	if (tb[RTA_DPORT])
		fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);

	if (tb[RTA_IP_PROTO]) {
		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
						  &fl6.flowi6_proto, AF_INET6,
						  extack);
		if (err)
			goto errout;
	}

	if (iif) {
		struct net_device *dev;
		int flags = 0;

		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			rcu_read_unlock();
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);

		rcu_read_unlock();
	} else {
		fl6.flowi6_oif = oif;

		dst = ip6_route_output(net, NULL, &fl6);
	}

	rt = container_of(dst, struct rt6_info, dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	skb_dst_set(skb, &rt->dst);

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	if (from) {
		if (fibmatch)
			err = rt6_fill_node(net, skb, from, NULL, NULL, NULL,
					    iif, RTM_NEWROUTE,
					    NETLINK_CB(in_skb).portid,
					    nlh->nlmsg_seq, 0);
		else
			err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
					    &fl6.saddr, iif, RTM_NEWROUTE,
					    NETLINK_CB(in_skb).portid,
					    nlh->nlmsg_seq, 0);
	} else {
		err = -ENETUNREACH;
	}
	rcu_read_unlock();

	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}
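
/* Illustration: for a destination covered by 2001:db8::/64, a plain
 * `ip -6 route get 2001:db8::1` reports the fully resolved next hop
 * (the dst/rt6_info view above), while adding `fibmatch` reports the
 * matching FIB entry itself, i.e. the installed /64 prefix route.
 */
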
void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
		     unsigned int nlm_flags)
{
	struct sk_buff *skb;
	struct net *net = info->nl_net;
	u32 seq;
	int err;

	err = -ENOBUFS;
	seq = info->nlh ? info->nlh->nlmsg_seq : 0;

	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
	if (!skb)
		goto errout;

	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
			    event, info->portid, seq, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}

void fib6_rt_update(struct net *net, struct fib6_info *rt,
		    struct nl_info *info)
{
	u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
	struct sk_buff *skb;
	int err = -ENOBUFS;

	/* call_fib6_entry_notifiers will be removed when in-kernel notifier
	 * is implemented and supported for nexthop objects
	 */
	call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, rt, NULL);

	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
	if (!skb)
		goto errout;

	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
			    RTM_NEWROUTE, info->portid, seq, NLM_F_REPLACE);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}

static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		net->ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = dev;
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	} else if (event == NETDEV_UNREGISTER &&
		   dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER can be fired multiple times by
		 * netdev_wait_allrefs(). Make sure we only do this once.
		 */
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}

/*
 *	/proc
 */

#ifdef CONFIG_PROC_FS
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;

	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);

	return 0;
}
#endif	/* CONFIG_PROC_FS */

#ifdef CONFIG_SYSCTL

static int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
				     void __user *buffer, size_t *lenp,
				     loff_t *ppos)
{
	struct net *net;
	int delay;
	int ret;

	if (!write)
		return -EINVAL;

	net = (struct net *)ctl->extra1;
	delay = net->ipv6.sysctl.flush_delay;
	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
	if (ret)
		return ret;

	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
	return 0;
}
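
/* Usage (illustrative): writing to the "flush" sysctl below, e.g.
 *
 *	echo 1 > /proc/sys/net/ipv6/route/flush
 *
 * kicks fib6_run_gc() to reap cached and expired routes right away.
 * The file is write-only (mode 0200); reads return -EINVAL.
 */
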
static int zero;
static int one = 1;

static struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	= "flush",
		.data		= &init_net.ipv6.sysctl.flush_delay,
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv6_sysctl_rtcache_flush
	},
	{
		.procname	= "gc_thresh",
		.data		= &ip6_dst_ops_template.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_min_interval",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "skip_notify_on_dev_down",
		.data		= &init_net.ipv6.sysctl.skip_notify_on_dev_down,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &zero,
		.extra2		= &one,
	},
	{ }
};

struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
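
/* Note: sysctl registration stops at the first empty procname, so
 * clearing table[0].procname above hides the whole net.ipv6.route
 * directory from nets created by unprivileged user namespaces, not
 * just the "flush" entry.
 */
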
#endif

static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.fib6_null_entry = fib6_info_alloc(GFP_KERNEL, true);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;
	memcpy(net->ipv6.fib6_null_entry, &fib6_null_entry_template,
	       sizeof(*net->ipv6.fib6_null_entry));

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
	net->ipv6.sysctl.skip_notify_on_dev_down = 0;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}

static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.fib6_null_entry);
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}

static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
			sizeof(struct ipv6_route_iter));
	proc_create_net_single("rt6_stats", 0444, net->proc_net,
			       rt6_stats_seq_show, NULL);
#endif
	return 0;
}

static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}

static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};

static int __net_init ipv6_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv6.peers = bp;
	return 0;
}

static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv6.peers;

	net->ipv6.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static struct pernet_operations ipv6_inetpeer_ops = {
	.init = ipv6_inetpeer_init,
	.exit = ipv6_inetpeer_exit,
};

static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};

static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};

void __init ip6_route_init_special_entries(void)
{
	/* Registering of the loopback is done before this portion of code,
	 * so the loopback reference in rt6_info will not have been taken;
	 * do it manually for init_net here.
	 */
	init_net.ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#endif
}

int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}

void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}