/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable. otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <net/ip.h>
#include <linux/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

static int ip6_rt_type_to_error(u8 fib6_type);

#define CREATE_TRACE_POINTS
#include <trace/events/fib6.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
#undef CREATE_TRACE_POINTS

enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};

static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ip6_default_advmss(const struct dst_entry *dst);
static unsigned int ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void ip6_dst_destroy(struct dst_entry *);
static void ip6_dst_ifdown(struct dst_entry *,
			   struct net_device *dev, int how);
static int ip6_dst_gc(struct dst_ops *ops);

static int ip6_pkt_discard(struct sk_buff *skb);
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void ip6_link_failure(struct sk_buff *skb);
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu);
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
			    struct sk_buff *skb);
static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct fib6_info *rt);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_add_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev,
					    unsigned int pref);
static struct fib6_info *rt6_get_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev);
#endif
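/* rt6_info dsts that are not attached to a fib6_info in the tree are kept
 * on a per-cpu "uncached" list, so that rt6_uncached_list_flush_dev() can
 * rewire them to the loopback device when their output device goes away.
 */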
struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);

void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}

static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}

static inline const void *choose_neigh_daddr(const struct in6_addr *p,
					     struct sk_buff *skb,
					     const void *daddr)
{
	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}

struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
{
	struct neighbour *n;

	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	if (n)
		return n;
	return neigh_create(&nd_tbl, daddr, dev);
}

static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
}

static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}
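/* dst_ops wiring for normal IPv6 dsts: these callbacks hook the generic
 * dst layer (validation, PMTU updates, redirects, neighbour resolution,
 * garbage collection) up to the IPv6 implementations in this file.
 */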
static struct dst_ops ip6_dst_ops_template = {
	.family			= AF_INET6,
	.gc			= ip6_dst_gc,
	.gc_thresh		= 1024,
	.check			= ip6_dst_check,
	.default_advmss		= ip6_default_advmss,
	.mtu			= ip6_mtu,
	.cow_metrics		= dst_cow_metrics_generic,
	.destroy		= ip6_dst_destroy,
	.ifdown			= ip6_dst_ifdown,
	.negative_advice	= ip6_negative_advice,
	.link_failure		= ip6_link_failure,
	.update_pmtu		= ip6_rt_update_pmtu,
	.redirect		= rt6_do_redirect,
	.local_out		= __ip6_local_out,
	.neigh_lookup		= ip6_dst_neigh_lookup,
	.confirm_neigh		= ip6_confirm_neigh,
};

static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

static struct dst_ops ip6_dst_blackhole_ops = {
	.family			= AF_INET6,
	.destroy		= ip6_dst_destroy,
	.check			= ip6_dst_check,
	.mtu			= ip6_blackhole_mtu,
	.default_advmss		= ip6_default_advmss,
	.update_pmtu		= ip6_rt_blackhole_update_pmtu,
	.redirect		= ip6_rt_blackhole_redirect,
	.cow_metrics		= dst_cow_metrics_generic,
	.neigh_lookup		= ip6_dst_neigh_lookup,
};

static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol	= RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};

static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif

static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}

/* allocate dst with ip6_dst_ops */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);
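/* Illustrative ip6_dst_alloc() usage (a sketch, not code from this file):
 *
 *	rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
 *	if (!rt)
 *		return NULL;
 *	ip6_rt_copy_init(rt, ort);	// caller finishes initialization
 *
 * The cache and pcpu allocators below follow this pattern.
 */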
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	rcu_assign_pointer(rt->from, NULL);
	fib6_info_release(from);
	rcu_read_unlock();
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}

static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}

struct fib6_info *fib6_multipath_select(const struct net *net,
					struct fib6_info *match,
					struct flowi6 *fl6, int oif,
					const struct sk_buff *skb,
					int strict)
{
	struct fib6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}

/*
 *	Route lookup. rcu_read_lock() should be held.
 */

static inline struct fib6_info *rt6_device_match(struct net *net,
						 struct fib6_info *rt,
						 const struct in6_addr *saddr,
						 int oif,
						 int flags)
{
	struct fib6_info *sprt;

	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
		const struct net_device *dev = sprt->fib6_nh.nh_dev;

		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE)
		return net->ipv6.fib6_null_entry;

	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
}
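/* Router Reachability Probing is deferred to a workqueue: rt6_probe() only
 * rate-limits and schedules the work, and rt6_probe_deferred() sends the
 * actual neighbour solicitation from process context.
 */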
#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}

static void rt6_probe(struct fib6_info *rt)
{
	struct __rt6_probe_work *work;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
		return;

	nh_gw = &rt->fib6_nh.nh_gw;
	dev = rt->fib6_nh.nh_dev;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		struct inet6_dev *idev;

		if (neigh->nud_state & NUD_VALID)
			goto out;

		idev = __in6_dev_get(dev);
		work = NULL;
		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct fib6_info *rt)
{
}
#endif

/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
static inline int rt6_check_dev(struct fib6_info *rt, int oif)
{
	const struct net_device *dev = rt->fib6_nh.nh_dev;

	if (!oif || dev->ifindex == oif)
		return 2;
	return 0;
}

static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	if (rt->fib6_flags & RTF_NONEXTHOP ||
	    !(rt->fib6_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
					  &rt->fib6_nh.nh_gw);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
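/* Score layout produced below: a device match contributes 2 (bit 1), and
 * with CONFIG_IPV6_ROUTER_PREF the decoded RA preference is folded in at
 * bits 2 and up. Negative scores are rt6_nud_state failure codes.
 */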
static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}

/* called with rcu_read_lock held */
static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
{
	const struct net_device *dev = fib6_info_nh_dev(f6i);
	bool rc = false;

	if (dev) {
		const struct inet6_dev *idev = __in6_dev_get(dev);

		rc = !!idev->cnf.ignore_routes_with_linkdown;
	}

	return rc;
}

static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
				    int *mpri, struct fib6_info *match,
				    bool *do_rr)
{
	int m;
	bool match_do_rr = false;

	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		goto out;

	if (fib6_ignore_linkdown(rt) &&
	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (fib6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}

static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
				      struct fib6_info *leaf,
				      struct fib6_info *rr_head,
				      u32 metric, int oif, int strict,
				      bool *do_rr)
{
	struct fib6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}

static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
				    int oif, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.fib6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.fib6_null_entry;
}

static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
{
	return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
}

#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
#endif

/*
 *	Misc support functions
 */

/* called with rcu_lock held */
static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
{
	struct net_device *dev = rt->fib6_nh.nh_dev;

	if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* the remaining case is netif_is_l3_master(dev) being
		 * true, in which case we want to return dev itself
		 */
	}

	return dev;
}
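/* Map fib6_type (RTN_*) to the error code reported for reject routes;
 * see ip6_rt_init_dst_reject() below.
 */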
static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};

static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}

static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
{
	unsigned short flags = 0;

	if (rt->dst_nocount)
		flags |= DST_NOCOUNT;
	if (rt->dst_nopolicy)
		flags |= DST_NOPOLICY;
	if (rt->dst_host)
		flags |= DST_HOST;

	return flags;
}

static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);

	switch (ort->fib6_type) {
	case RTN_BLACKHOLE:
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}

static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.flags |= fib6_info_dst_flags(ort);

	if (ort->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, ort);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	if (ort->fib6_type == RTN_LOCAL) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (ort->fib6_nh.nh_lwtstate) {
		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}

static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	fib6_info_hold(from);
	rcu_assign_pointer(rt->from, from);
	dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
	if (from->fib6_metrics != &dst_default_metrics) {
		rt->dst._metrics |= DST_METRICS_REFCOUNTED;
		refcount_inc(&from->fib6_metrics->refcnt);
	}
}

static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
{
	struct net_device *dev = fib6_info_nh_dev(ort);

	ip6_rt_init_dst(rt, ort);

	rt->rt6i_dst = ort->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
	rt->rt6i_flags = ort->fib6_flags;
	rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->fib6_src;
#endif
	rt->rt6i_prefsrc = ort->fib6_prefsrc;
	rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
}
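/* Backtrack after a failed lookup: climb towards the root, descending into
 * a source-routing subtree first when one exists, until a node that carries
 * routes (RTN_RTINFO) or the tree root is reached.
 */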
static struct fib6_node *fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;

	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}

static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
			  bool null_fallback)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (null_fallback) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}

/* called with rcu_lock held */
static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rt6_info *nrt;

	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (nrt)
		ip6_rt_copy_init(nrt, rt);

	return nrt;
}

static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_info *f6i;
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	f6i = rcu_dereference(fn->leaf);
	if (!f6i) {
		f6i = net->ipv6.fib6_null_entry;
	} else {
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
				       fl6->flowi6_oif, flags);
		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
			f6i = fib6_multipath_select(net, f6i, fl6,
						    fl6->flowi6_oif, skb,
						    flags);
	}
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);
	} else if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = ip6_create_rt_rcu(f6i);
		if (!rt) {
			rt = net->ipv6.ip6_null_entry;
			dst_hold(&rt->dst);
		}
	}

	rcu_read_unlock();

	return rt;
}

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);

struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);
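/* Illustrative rt6_lookup() caller (a sketch; error handling elided):
 *
 *	struct rt6_info *rt;
 *
 *	rt = rt6_lookup(net, &daddr, NULL, 0, NULL, 0);
 *	if (rt) {
 *		// use rt->dst ...
 *		ip6_rt_put(rt);
 *	}
 *
 * A NULL return means no usable route; the null entry is filtered out
 * above via dst->error.
 */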
/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason,
 * the route is released.
 * The caller must hold a dst reference before calling it.
 */

static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = { .nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}

static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	dev = ip6_rt_get_dev_rcu(ort);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}

static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt)
		return NULL;
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt, false);

	return pcpu_rt;
}

static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	return pcpu_rt;
}

/* exception hash table implementation
 */
static DEFINE_SPINLOCK(rt6_exception_lock);
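/* Every fib6_info can hang a hash table of RTF_CACHE "exception" routes
 * (PMTU and redirect clones) off rt6i_exception_bucket. Buckets are indexed
 * by a jhash of the destination (and, with subtrees, source) address;
 * readers walk the chains under RCU, writers serialize on
 * rt6_exception_lock.
 */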
/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
	net->ipv6.rt6_stats->fib_rt_cache--;
}

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}

static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

static unsigned int fib6_mtu(const struct fib6_info *rt)
{
	unsigned int mtu;

	if (rt->fib6_pmtu) {
		mtu = rt->fib6_pmtu;
	} else {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
}
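/* Insert nrt as an exception route hanging off ort: a matching older entry
 * is replaced, bucket depth is capped at FIB6_MAX_DEPTH, and on success the
 * node's sernum is bumped so that stale cached dsts fail ip6_dst_check().
 */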
static int rt6_insert_exception(struct rt6_info *nrt,
				struct fib6_info *ort)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
	nrt->rt6i_prefsrc = ort->fib6_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->fib6_table->tb6_lock);
		fib6_update_sernum(net, ort);
		spin_unlock_bh(&ort->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}

void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() from recreating the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}
/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->fib6_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}

/* Remove the passed in cached rt from the hash table that contains it */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	from = rcu_dereference(rt->from);
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}

/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct fib6_info *from = rt->from;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;

	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return;

	rcu_read_lock();
	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

	rcu_read_unlock();
}

static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
				rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
			}
			bucket++;
		}
	}
}

static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
{
	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
	 *
	 * If the new MTU is higher, and the route PMTU is equal to the local
	 * MTU, this means the old MTU is the lowest in the path, so allow
	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
	 * handle this.
	 */

	if (dst_mtu(&rt->dst) >= mtu)
		return true;

	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
		return true;

	return false;
}

static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}

#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}

static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non-gateway exceptions
	 * even if others still hold references to them, so that on the next
	 * dst_check() such references can be dropped.
	 * RTF_EXPIRES exceptions - e.g. pmtu-generated ones - are pruned when
	 * expired, independently of their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}

void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}
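/* Lookup with fallback: if nothing usable is found at the matched node,
 * backtrack towards less specific prefixes; if RT6_LOOKUP_F_REACHABLE was
 * requested and still nothing matched, retry once from the original node
 * with the reachability requirement dropped.
 */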
/* must be called with rcu lock held */
struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
				    int oif, struct flowi6 *fl6, int strict)
{
	struct fib6_node *fn, *saved_fn;
	struct fib6_info *f6i;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	f6i = rt6_select(net, fn, oif, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	return f6i;
}

struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_info *f6i;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	f6i = fib6_table_lookup(net, table, oif, fl6, strict);
	if (f6i->fib6_nsiblings)
		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);

	if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		return rt;
	}

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(f6i->fib6_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree. It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look up the route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		return uncached_rt;
	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(f6i);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, f6i);

		local_bh_enable();
		rcu_read_unlock();

		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);

static struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
}

struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6,
					 const struct sk_buff *skb,
					 int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);

static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowlabel(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}
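/* Multipath hash policy (the net.ipv6.fib_multipath_hash_policy sysctl):
 *   0 - L3 keys: addresses, flow label, next header
 *   1 - L4 keys: addresses, transport ports and protocol
 * For the L3 policy, ICMPv6 errors are hashed on the embedded (inner)
 * packet so that they follow the flow that triggered them.
 */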
/* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}

void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}

static struct rt6_info *ip6_pol_route_output(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
}

struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	if (rt6_need_strict(&fl6->daddr)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
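/* Illustrative output-route lookup (a sketch; error handling elided):
 *
 *	struct flowi6 fl6 = {
 *		.flowi6_oif = oif,
 *		.daddr = daddr,
 *	};
 *	struct dst_entry *dst;
 *
 *	dst = ip6_route_output(net, sk, &fl6);	// inline wrapper around
 *						// ip6_route_output_flags()
 *	if (dst->error) {
 *		dst_release(dst);
 *		return -EHOSTUNREACH;
 *	}
 *
 * A dst is always returned; callers must check dst->error.
 */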
EXPORT_SYMBOL_GPL(ip6_route_output_flags); 2113 2114 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) 2115 { 2116 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig; 2117 struct net_device *loopback_dev = net->loopback_dev; 2118 struct dst_entry *new = NULL; 2119 2120 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1, 2121 DST_OBSOLETE_DEAD, 0); 2122 if (rt) { 2123 rt6_info_init(rt); 2124 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); 2125 2126 new = &rt->dst; 2127 new->__use = 1; 2128 new->input = dst_discard; 2129 new->output = dst_discard_out; 2130 2131 dst_copy_metrics(new, &ort->dst); 2132 2133 rt->rt6i_idev = in6_dev_get(loopback_dev); 2134 rt->rt6i_gateway = ort->rt6i_gateway; 2135 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU; 2136 2137 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); 2138 #ifdef CONFIG_IPV6_SUBTREES 2139 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); 2140 #endif 2141 } 2142 2143 dst_release(dst_orig); 2144 return new ? new : ERR_PTR(-ENOMEM); 2145 } 2146 2147 /* 2148 * Destination cache support functions 2149 */ 2150 2151 static bool fib6_check(struct fib6_info *f6i, u32 cookie) 2152 { 2153 u32 rt_cookie = 0; 2154 2155 if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie) 2156 return false; 2157 2158 if (fib6_check_expired(f6i)) 2159 return false; 2160 2161 return true; 2162 } 2163 2164 static struct dst_entry *rt6_check(struct rt6_info *rt, 2165 struct fib6_info *from, 2166 u32 cookie) 2167 { 2168 u32 rt_cookie = 0; 2169 2170 if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) || 2171 rt_cookie != cookie) 2172 return NULL; 2173 2174 if (rt6_check_expired(rt)) 2175 return NULL; 2176 2177 return &rt->dst; 2178 } 2179 2180 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, 2181 struct fib6_info *from, 2182 u32 cookie) 2183 { 2184 if (!__rt6_check_expired(rt) && 2185 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && 2186 fib6_check(from, cookie)) 2187 return &rt->dst; 2188 else 2189 return NULL; 2190 } 2191 2192 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) 2193 { 2194 struct dst_entry *dst_ret; 2195 struct fib6_info *from; 2196 struct rt6_info *rt; 2197 2198 rt = container_of(dst, struct rt6_info, dst); 2199 2200 rcu_read_lock(); 2201 2202 /* All IPV6 dsts are created with ->obsolete set to the value 2203 * DST_OBSOLETE_FORCE_CHK which forces validation calls down 2204 * into this function always. 
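 *
 * A minimal sketch of how a caller consumes this check (this
 * mirrors ip6_sk_update_pmtu() below; np->dst_cookie was saved
 * when the dst was stored on the socket):
 *
 *	dst = __sk_dst_get(sk);
 *	if (!dst || !dst->obsolete ||
 *	    dst->ops->check(dst, np->dst_cookie))
 *		return;
 *
 * A non-NULL return from ->check() means the dst is still usable;
 * NULL tells the caller to drop it and redo the route lookup.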
2205 */ 2206 2207 from = rcu_dereference(rt->from); 2208 2209 if (from && (rt->rt6i_flags & RTF_PCPU || 2210 unlikely(!list_empty(&rt->rt6i_uncached)))) 2211 dst_ret = rt6_dst_from_check(rt, from, cookie); 2212 else 2213 dst_ret = rt6_check(rt, from, cookie); 2214 2215 rcu_read_unlock(); 2216 2217 return dst_ret; 2218 } 2219 2220 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) 2221 { 2222 struct rt6_info *rt = (struct rt6_info *) dst; 2223 2224 if (rt) { 2225 if (rt->rt6i_flags & RTF_CACHE) { 2226 rcu_read_lock(); 2227 if (rt6_check_expired(rt)) { 2228 rt6_remove_exception_rt(rt); 2229 dst = NULL; 2230 } 2231 rcu_read_unlock(); 2232 } else { 2233 dst_release(dst); 2234 dst = NULL; 2235 } 2236 } 2237 return dst; 2238 } 2239 2240 static void ip6_link_failure(struct sk_buff *skb) 2241 { 2242 struct rt6_info *rt; 2243 2244 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); 2245 2246 rt = (struct rt6_info *) skb_dst(skb); 2247 if (rt) { 2248 rcu_read_lock(); 2249 if (rt->rt6i_flags & RTF_CACHE) { 2250 if (dst_hold_safe(&rt->dst)) 2251 rt6_remove_exception_rt(rt); 2252 } else { 2253 struct fib6_info *from; 2254 struct fib6_node *fn; 2255 2256 from = rcu_dereference(rt->from); 2257 if (from) { 2258 fn = rcu_dereference(from->fib6_node); 2259 if (fn && (rt->rt6i_flags & RTF_DEFAULT)) 2260 fn->fn_sernum = -1; 2261 } 2262 } 2263 rcu_read_unlock(); 2264 } 2265 } 2266 2267 static void rt6_update_expires(struct rt6_info *rt0, int timeout) 2268 { 2269 if (!(rt0->rt6i_flags & RTF_EXPIRES)) { 2270 struct fib6_info *from; 2271 2272 rcu_read_lock(); 2273 from = rcu_dereference(rt0->from); 2274 if (from) 2275 rt0->dst.expires = from->expires; 2276 rcu_read_unlock(); 2277 } 2278 2279 dst_set_expires(&rt0->dst, timeout); 2280 rt0->rt6i_flags |= RTF_EXPIRES; 2281 } 2282 2283 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) 2284 { 2285 struct net *net = dev_net(rt->dst.dev); 2286 2287 dst_metric_set(&rt->dst, RTAX_MTU, mtu); 2288 rt->rt6i_flags |= RTF_MODIFIED; 2289 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires); 2290 } 2291 2292 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) 2293 { 2294 bool from_set; 2295 2296 rcu_read_lock(); 2297 from_set = !!rcu_dereference(rt->from); 2298 rcu_read_unlock(); 2299 2300 return !(rt->rt6i_flags & RTF_CACHE) && 2301 (rt->rt6i_flags & RTF_PCPU || from_set); 2302 } 2303 2304 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, 2305 const struct ipv6hdr *iph, u32 mtu) 2306 { 2307 const struct in6_addr *daddr, *saddr; 2308 struct rt6_info *rt6 = (struct rt6_info *)dst; 2309 2310 if (dst_metric_locked(dst, RTAX_MTU)) 2311 return; 2312 2313 if (iph) { 2314 daddr = &iph->daddr; 2315 saddr = &iph->saddr; 2316 } else if (sk) { 2317 daddr = &sk->sk_v6_daddr; 2318 saddr = &inet6_sk(sk)->saddr; 2319 } else { 2320 daddr = NULL; 2321 saddr = NULL; 2322 } 2323 dst_confirm_neigh(dst, daddr); 2324 mtu = max_t(u32, mtu, IPV6_MIN_MTU); 2325 if (mtu >= dst_mtu(dst)) 2326 return; 2327 2328 if (!rt6_cache_allowed_for_pmtu(rt6)) { 2329 rt6_do_update_pmtu(rt6, mtu); 2330 /* update rt6_ex->stamp for cache */ 2331 if (rt6->rt6i_flags & RTF_CACHE) 2332 rt6_update_exception_stamp_rt(rt6); 2333 } else if (daddr) { 2334 struct fib6_info *from; 2335 struct rt6_info *nrt6; 2336 2337 rcu_read_lock(); 2338 from = rcu_dereference(rt6->from); 2339 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr); 2340 if (nrt6) { 2341 rt6_do_update_pmtu(nrt6, mtu); 2342 if (rt6_insert_exception(nrt6, from)) 2343 
dst_release_immediate(&nrt6->dst); 2344 } 2345 rcu_read_unlock(); 2346 } 2347 } 2348 2349 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, 2350 struct sk_buff *skb, u32 mtu) 2351 { 2352 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu); 2353 } 2354 2355 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, 2356 int oif, u32 mark, kuid_t uid) 2357 { 2358 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2359 struct dst_entry *dst; 2360 struct flowi6 fl6; 2361 2362 memset(&fl6, 0, sizeof(fl6)); 2363 fl6.flowi6_oif = oif; 2364 fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark); 2365 fl6.daddr = iph->daddr; 2366 fl6.saddr = iph->saddr; 2367 fl6.flowlabel = ip6_flowinfo(iph); 2368 fl6.flowi6_uid = uid; 2369 2370 dst = ip6_route_output(net, NULL, &fl6); 2371 if (!dst->error) 2372 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu)); 2373 dst_release(dst); 2374 } 2375 EXPORT_SYMBOL_GPL(ip6_update_pmtu); 2376 2377 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) 2378 { 2379 struct dst_entry *dst; 2380 2381 ip6_update_pmtu(skb, sock_net(sk), mtu, 2382 sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid); 2383 2384 dst = __sk_dst_get(sk); 2385 if (!dst || !dst->obsolete || 2386 dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) 2387 return; 2388 2389 bh_lock_sock(sk); 2390 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) 2391 ip6_datagram_dst_update(sk, false); 2392 bh_unlock_sock(sk); 2393 } 2394 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu); 2395 2396 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, 2397 const struct flowi6 *fl6) 2398 { 2399 #ifdef CONFIG_IPV6_SUBTREES 2400 struct ipv6_pinfo *np = inet6_sk(sk); 2401 #endif 2402 2403 ip6_dst_store(sk, dst, 2404 ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ? 2405 &sk->sk_v6_daddr : NULL, 2406 #ifdef CONFIG_IPV6_SUBTREES 2407 ipv6_addr_equal(&fl6->saddr, &np->saddr) ? 2408 &np->saddr : 2409 #endif 2410 NULL); 2411 } 2412 2413 /* Handle redirects */ 2414 struct ip6rd_flowi { 2415 struct flowi6 fl6; 2416 struct in6_addr gateway; 2417 }; 2418 2419 static struct rt6_info *__ip6_route_redirect(struct net *net, 2420 struct fib6_table *table, 2421 struct flowi6 *fl6, 2422 const struct sk_buff *skb, 2423 int flags) 2424 { 2425 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; 2426 struct rt6_info *ret = NULL, *rt_cache; 2427 struct fib6_info *rt; 2428 struct fib6_node *fn; 2429 2430 /* Get the "current" route for this destination and 2431 * check if the redirect has come from appropriate router. 2432 * 2433 * RFC 4861 specifies that redirects should only be 2434 * accepted if they come from the nexthop to the target. 2435 * Due to the way the routes are chosen, this notion 2436 * is a bit fuzzy and one might need to check all possible 2437 * routes. 2438 */ 2439 2440 rcu_read_lock(); 2441 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 2442 restart: 2443 for_each_fib6_node_rt_rcu(fn) { 2444 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) 2445 continue; 2446 if (fib6_check_expired(rt)) 2447 continue; 2448 if (rt->fib6_flags & RTF_REJECT) 2449 break; 2450 if (!(rt->fib6_flags & RTF_GATEWAY)) 2451 continue; 2452 if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex) 2453 continue; 2454 /* rt_cache's gateway might be different from its 'parent' 2455 * in the case of an ip redirect. 2456 * So we keep searching in the exception table if the gateway 2457 * is different. 
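 *
 * e.g. a prefix route via router A may carry an exception entry
 * whose gateway is router B after B redirected one destination;
 * a redirect for that destination must then be matched against B,
 * the cached gateway, rather than against A.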
2458 */ 2459 if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) { 2460 rt_cache = rt6_find_cached_rt(rt, 2461 &fl6->daddr, 2462 &fl6->saddr); 2463 if (rt_cache && 2464 ipv6_addr_equal(&rdfl->gateway, 2465 &rt_cache->rt6i_gateway)) { 2466 ret = rt_cache; 2467 break; 2468 } 2469 continue; 2470 } 2471 break; 2472 } 2473 2474 if (!rt) 2475 rt = net->ipv6.fib6_null_entry; 2476 else if (rt->fib6_flags & RTF_REJECT) { 2477 ret = net->ipv6.ip6_null_entry; 2478 goto out; 2479 } 2480 2481 if (rt == net->ipv6.fib6_null_entry) { 2482 fn = fib6_backtrack(fn, &fl6->saddr); 2483 if (fn) 2484 goto restart; 2485 } 2486 2487 out: 2488 if (ret) 2489 dst_hold(&ret->dst); 2490 else 2491 ret = ip6_create_rt_rcu(rt); 2492 2493 rcu_read_unlock(); 2494 2495 trace_fib6_table_lookup(net, rt, table, fl6); 2496 return ret; 2497 }; 2498 2499 static struct dst_entry *ip6_route_redirect(struct net *net, 2500 const struct flowi6 *fl6, 2501 const struct sk_buff *skb, 2502 const struct in6_addr *gateway) 2503 { 2504 int flags = RT6_LOOKUP_F_HAS_SADDR; 2505 struct ip6rd_flowi rdfl; 2506 2507 rdfl.fl6 = *fl6; 2508 rdfl.gateway = *gateway; 2509 2510 return fib6_rule_lookup(net, &rdfl.fl6, skb, 2511 flags, __ip6_route_redirect); 2512 } 2513 2514 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, 2515 kuid_t uid) 2516 { 2517 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2518 struct dst_entry *dst; 2519 struct flowi6 fl6; 2520 2521 memset(&fl6, 0, sizeof(fl6)); 2522 fl6.flowi6_iif = LOOPBACK_IFINDEX; 2523 fl6.flowi6_oif = oif; 2524 fl6.flowi6_mark = mark; 2525 fl6.daddr = iph->daddr; 2526 fl6.saddr = iph->saddr; 2527 fl6.flowlabel = ip6_flowinfo(iph); 2528 fl6.flowi6_uid = uid; 2529 2530 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr); 2531 rt6_do_redirect(dst, NULL, skb); 2532 dst_release(dst); 2533 } 2534 EXPORT_SYMBOL_GPL(ip6_redirect); 2535 2536 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif, 2537 u32 mark) 2538 { 2539 const struct ipv6hdr *iph = ipv6_hdr(skb); 2540 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb); 2541 struct dst_entry *dst; 2542 struct flowi6 fl6; 2543 2544 memset(&fl6, 0, sizeof(fl6)); 2545 fl6.flowi6_iif = LOOPBACK_IFINDEX; 2546 fl6.flowi6_oif = oif; 2547 fl6.flowi6_mark = mark; 2548 fl6.daddr = msg->dest; 2549 fl6.saddr = iph->daddr; 2550 fl6.flowi6_uid = sock_net_uid(net, NULL); 2551 2552 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr); 2553 rt6_do_redirect(dst, NULL, skb); 2554 dst_release(dst); 2555 } 2556 2557 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) 2558 { 2559 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark, 2560 sk->sk_uid); 2561 } 2562 EXPORT_SYMBOL_GPL(ip6_sk_redirect); 2563 2564 static unsigned int ip6_default_advmss(const struct dst_entry *dst) 2565 { 2566 struct net_device *dev = dst->dev; 2567 unsigned int mtu = dst_mtu(dst); 2568 struct net *net = dev_net(dev); 2569 2570 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); 2571 2572 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) 2573 mtu = net->ipv6.sysctl.ip6_rt_min_advmss; 2574 2575 /* 2576 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 2577 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 
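 * For a common 1500 byte link MTU the advmss computed above is
 * 1500 - sizeof(struct ipv6hdr) - sizeof(struct tcphdr) = 1440.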
2578 * IPV6_MAXPLEN is also valid and means: "any MSS, 2579 * rely only on pmtu discovery" 2580 */ 2581 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr)) 2582 mtu = IPV6_MAXPLEN; 2583 return mtu; 2584 } 2585 2586 static unsigned int ip6_mtu(const struct dst_entry *dst) 2587 { 2588 struct inet6_dev *idev; 2589 unsigned int mtu; 2590 2591 mtu = dst_metric_raw(dst, RTAX_MTU); 2592 if (mtu) 2593 goto out; 2594 2595 mtu = IPV6_MIN_MTU; 2596 2597 rcu_read_lock(); 2598 idev = __in6_dev_get(dst->dev); 2599 if (idev) 2600 mtu = idev->cnf.mtu6; 2601 rcu_read_unlock(); 2602 2603 out: 2604 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 2605 2606 return mtu - lwtunnel_headroom(dst->lwtstate, mtu); 2607 } 2608 2609 /* MTU selection: 2610 * 1. mtu on route is locked - use it 2611 * 2. mtu from nexthop exception 2612 * 3. mtu from egress device 2613 * 2614 * based on ip6_dst_mtu_forward and exception logic of 2615 * rt6_find_cached_rt; called with rcu_read_lock 2616 */ 2617 u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr, 2618 struct in6_addr *saddr) 2619 { 2620 struct rt6_exception_bucket *bucket; 2621 struct rt6_exception *rt6_ex; 2622 struct in6_addr *src_key; 2623 struct inet6_dev *idev; 2624 u32 mtu = 0; 2625 2626 if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) { 2627 mtu = f6i->fib6_pmtu; 2628 if (mtu) 2629 goto out; 2630 } 2631 2632 src_key = NULL; 2633 #ifdef CONFIG_IPV6_SUBTREES 2634 if (f6i->fib6_src.plen) 2635 src_key = saddr; 2636 #endif 2637 2638 bucket = rcu_dereference(f6i->rt6i_exception_bucket); 2639 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); 2640 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i)) 2641 mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU); 2642 2643 if (likely(!mtu)) { 2644 struct net_device *dev = fib6_info_nh_dev(f6i); 2645 2646 mtu = IPV6_MIN_MTU; 2647 idev = __in6_dev_get(dev); 2648 if (idev && idev->cnf.mtu6 > mtu) 2649 mtu = idev->cnf.mtu6; 2650 } 2651 2652 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 2653 out: 2654 return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu); 2655 } 2656 2657 struct dst_entry *icmp6_dst_alloc(struct net_device *dev, 2658 struct flowi6 *fl6) 2659 { 2660 struct dst_entry *dst; 2661 struct rt6_info *rt; 2662 struct inet6_dev *idev = in6_dev_get(dev); 2663 struct net *net = dev_net(dev); 2664 2665 if (unlikely(!idev)) 2666 return ERR_PTR(-ENODEV); 2667 2668 rt = ip6_dst_alloc(net, dev, 0); 2669 if (unlikely(!rt)) { 2670 in6_dev_put(idev); 2671 dst = ERR_PTR(-ENOMEM); 2672 goto out; 2673 } 2674 2675 rt->dst.flags |= DST_HOST; 2676 rt->dst.input = ip6_input; 2677 rt->dst.output = ip6_output; 2678 rt->rt6i_gateway = fl6->daddr; 2679 rt->rt6i_dst.addr = fl6->daddr; 2680 rt->rt6i_dst.plen = 128; 2681 rt->rt6i_idev = idev; 2682 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0); 2683 2684 /* Add this dst into uncached_list so that rt6_disable_ip() can 2685 * do proper release of the net_device 2686 */ 2687 rt6_uncached_list_add(rt); 2688 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); 2689 2690 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0); 2691 2692 out: 2693 return dst; 2694 } 2695 2696 static int ip6_dst_gc(struct dst_ops *ops) 2697 { 2698 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); 2699 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; 2700 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size; 2701 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; 2702 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; 2703 unsigned long rt_last_gc = 
net->ipv6.ip6_rt_last_gc; 2704 int entries; 2705 2706 entries = dst_entries_get_fast(ops); 2707 if (time_after(rt_last_gc + rt_min_interval, jiffies) && 2708 entries <= rt_max_size) 2709 goto out; 2710 2711 net->ipv6.ip6_rt_gc_expire++; 2712 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true); 2713 entries = dst_entries_get_slow(ops); 2714 if (entries < ops->gc_thresh) 2715 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; 2716 out: 2717 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity; 2718 return entries > rt_max_size; 2719 } 2720 2721 static int ip6_convert_metrics(struct net *net, struct fib6_info *rt, 2722 struct fib6_config *cfg) 2723 { 2724 struct dst_metrics *p; 2725 2726 if (!cfg->fc_mx) 2727 return 0; 2728 2729 p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL); 2730 if (unlikely(!p)) 2731 return -ENOMEM; 2732 2733 refcount_set(&p->refcnt, 1); 2734 rt->fib6_metrics = p; 2735 2736 return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics); 2737 } 2738 2739 static struct rt6_info *ip6_nh_lookup_table(struct net *net, 2740 struct fib6_config *cfg, 2741 const struct in6_addr *gw_addr, 2742 u32 tbid, int flags) 2743 { 2744 struct flowi6 fl6 = { 2745 .flowi6_oif = cfg->fc_ifindex, 2746 .daddr = *gw_addr, 2747 .saddr = cfg->fc_prefsrc, 2748 }; 2749 struct fib6_table *table; 2750 struct rt6_info *rt; 2751 2752 table = fib6_get_table(net, tbid); 2753 if (!table) 2754 return NULL; 2755 2756 if (!ipv6_addr_any(&cfg->fc_prefsrc)) 2757 flags |= RT6_LOOKUP_F_HAS_SADDR; 2758 2759 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE; 2760 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags); 2761 2762 /* if table lookup failed, fall back to full lookup */ 2763 if (rt == net->ipv6.ip6_null_entry) { 2764 ip6_rt_put(rt); 2765 rt = NULL; 2766 } 2767 2768 return rt; 2769 } 2770 2771 static int ip6_route_check_nh_onlink(struct net *net, 2772 struct fib6_config *cfg, 2773 const struct net_device *dev, 2774 struct netlink_ext_ack *extack) 2775 { 2776 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN; 2777 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2778 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT; 2779 struct rt6_info *grt; 2780 int err; 2781 2782 err = 0; 2783 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0); 2784 if (grt) { 2785 if (!grt->dst.error && 2786 (grt->rt6i_flags & flags || dev != grt->dst.dev)) { 2787 NL_SET_ERR_MSG(extack, 2788 "Nexthop has invalid gateway or device mismatch"); 2789 err = -EINVAL; 2790 } 2791 2792 ip6_rt_put(grt); 2793 } 2794 2795 return err; 2796 } 2797 2798 static int ip6_route_check_nh(struct net *net, 2799 struct fib6_config *cfg, 2800 struct net_device **_dev, 2801 struct inet6_dev **idev) 2802 { 2803 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2804 struct net_device *dev = _dev ? 
*_dev : NULL;
2805 	struct rt6_info *grt = NULL;
2806 	int err = -EHOSTUNREACH;
2807 
2808 	if (cfg->fc_table) {
2809 		int flags = RT6_LOOKUP_F_IFACE;
2810 
2811 		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2812 					  cfg->fc_table, flags);
2813 		if (grt) {
2814 			if (grt->rt6i_flags & RTF_GATEWAY ||
2815 			    (dev && dev != grt->dst.dev)) {
2816 				ip6_rt_put(grt);
2817 				grt = NULL;
2818 			}
2819 		}
2820 	}
2821 
2822 	if (!grt)
2823 		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2824 
2825 	if (!grt)
2826 		goto out;
2827 
2828 	if (dev) {
2829 		if (dev != grt->dst.dev) {
2830 			ip6_rt_put(grt);
2831 			goto out;
2832 		}
2833 	} else {
2834 		*_dev = dev = grt->dst.dev;
2835 		*idev = grt->rt6i_idev;
2836 		dev_hold(dev);
2837 		in6_dev_hold(grt->rt6i_idev);
2838 	}
2839 
2840 	if (!(grt->rt6i_flags & RTF_GATEWAY))
2841 		err = 0;
2842 
2843 	ip6_rt_put(grt);
2844 
2845 out:
2846 	return err;
2847 }
2848 
2849 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2850 			   struct net_device **_dev, struct inet6_dev **idev,
2851 			   struct netlink_ext_ack *extack)
2852 {
2853 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2854 	int gwa_type = ipv6_addr_type(gw_addr);
2855 	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2856 	const struct net_device *dev = *_dev;
2857 	bool need_addr_check = !dev;
2858 	int err = -EINVAL;
2859 
2860 	/* if gw_addr is local we will fail to detect this in case the
2861 	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
2862 	 * will return the already-added prefix route via the interface
2863 	 * that the prefix route was assigned to, which might be non-loopback.
2864 	 */
2865 	if (dev &&
2866 	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2867 		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2868 		goto out;
2869 	}
2870 
2871 	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2872 		/* IPv6 strictly inhibits using non-link-local
2873 		 * addresses as nexthop addresses.
2874 		 * Otherwise, the router will not be able to send redirects.
2875 		 * It is very good, but in some (rare!) circumstances
2876 		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
2877 		 * some exceptions. --ANK
2878 		 * We allow IPv4-mapped nexthops to support RFC4798-type
2879 		 * addressing
2880 		 */
2881 		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2882 			NL_SET_ERR_MSG(extack, "Invalid gateway address");
2883 			goto out;
2884 		}
2885 
2886 		if (cfg->fc_flags & RTNH_F_ONLINK)
2887 			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2888 		else
2889 			err = ip6_route_check_nh(net, cfg, _dev, idev);
2890 
2891 		if (err)
2892 			goto out;
2893 	}
2894 
2895 	/* reload in case device was changed */
2896 	dev = *_dev;
2897 
2898 	err = -EINVAL;
2899 	if (!dev) {
2900 		NL_SET_ERR_MSG(extack, "Egress device not specified");
2901 		goto out;
2902 	} else if (dev->flags & IFF_LOOPBACK) {
2903 		NL_SET_ERR_MSG(extack,
2904 			       "Egress device can not be loopback device for this route");
2905 		goto out;
2906 	}
2907 
2908 	/* if we did not check gw_addr above, do so now that the
2909 	 * egress device has been resolved.
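 * (When no device was given, ip6_route_check_nh() may have just
 * resolved *_dev from the nexthop lookup, so the gateway address
 * could not be checked against it earlier.)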
2910 */ 2911 if (need_addr_check && 2912 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 2913 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 2914 goto out; 2915 } 2916 2917 err = 0; 2918 out: 2919 return err; 2920 } 2921 2922 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, 2923 gfp_t gfp_flags, 2924 struct netlink_ext_ack *extack) 2925 { 2926 struct net *net = cfg->fc_nlinfo.nl_net; 2927 struct fib6_info *rt = NULL; 2928 struct net_device *dev = NULL; 2929 struct inet6_dev *idev = NULL; 2930 struct fib6_table *table; 2931 int addr_type; 2932 int err = -EINVAL; 2933 2934 /* RTF_PCPU is an internal flag; can not be set by userspace */ 2935 if (cfg->fc_flags & RTF_PCPU) { 2936 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU"); 2937 goto out; 2938 } 2939 2940 /* RTF_CACHE is an internal flag; can not be set by userspace */ 2941 if (cfg->fc_flags & RTF_CACHE) { 2942 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE"); 2943 goto out; 2944 } 2945 2946 if (cfg->fc_type > RTN_MAX) { 2947 NL_SET_ERR_MSG(extack, "Invalid route type"); 2948 goto out; 2949 } 2950 2951 if (cfg->fc_dst_len > 128) { 2952 NL_SET_ERR_MSG(extack, "Invalid prefix length"); 2953 goto out; 2954 } 2955 if (cfg->fc_src_len > 128) { 2956 NL_SET_ERR_MSG(extack, "Invalid source address length"); 2957 goto out; 2958 } 2959 #ifndef CONFIG_IPV6_SUBTREES 2960 if (cfg->fc_src_len) { 2961 NL_SET_ERR_MSG(extack, 2962 "Specifying source address requires IPV6_SUBTREES to be enabled"); 2963 goto out; 2964 } 2965 #endif 2966 if (cfg->fc_ifindex) { 2967 err = -ENODEV; 2968 dev = dev_get_by_index(net, cfg->fc_ifindex); 2969 if (!dev) 2970 goto out; 2971 idev = in6_dev_get(dev); 2972 if (!idev) 2973 goto out; 2974 } 2975 2976 if (cfg->fc_metric == 0) 2977 cfg->fc_metric = IP6_RT_PRIO_USER; 2978 2979 if (cfg->fc_flags & RTNH_F_ONLINK) { 2980 if (!dev) { 2981 NL_SET_ERR_MSG(extack, 2982 "Nexthop device required for onlink"); 2983 err = -ENODEV; 2984 goto out; 2985 } 2986 2987 if (!(dev->flags & IFF_UP)) { 2988 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 2989 err = -ENETDOWN; 2990 goto out; 2991 } 2992 } 2993 2994 err = -ENOBUFS; 2995 if (cfg->fc_nlinfo.nlh && 2996 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) { 2997 table = fib6_get_table(net, cfg->fc_table); 2998 if (!table) { 2999 pr_warn("NLM_F_CREATE should be specified when creating new route\n"); 3000 table = fib6_new_table(net, cfg->fc_table); 3001 } 3002 } else { 3003 table = fib6_new_table(net, cfg->fc_table); 3004 } 3005 3006 if (!table) 3007 goto out; 3008 3009 err = -ENOMEM; 3010 rt = fib6_info_alloc(gfp_flags); 3011 if (!rt) 3012 goto out; 3013 3014 if (cfg->fc_flags & RTF_ADDRCONF) 3015 rt->dst_nocount = true; 3016 3017 err = ip6_convert_metrics(net, rt, cfg); 3018 if (err < 0) 3019 goto out; 3020 3021 if (cfg->fc_flags & RTF_EXPIRES) 3022 fib6_set_expires(rt, jiffies + 3023 clock_t_to_jiffies(cfg->fc_expires)); 3024 else 3025 fib6_clean_expires(rt); 3026 3027 if (cfg->fc_protocol == RTPROT_UNSPEC) 3028 cfg->fc_protocol = RTPROT_BOOT; 3029 rt->fib6_protocol = cfg->fc_protocol; 3030 3031 addr_type = ipv6_addr_type(&cfg->fc_dst); 3032 3033 if (cfg->fc_encap) { 3034 struct lwtunnel_state *lwtstate; 3035 3036 err = lwtunnel_build_state(cfg->fc_encap_type, 3037 cfg->fc_encap, AF_INET6, cfg, 3038 &lwtstate, extack); 3039 if (err) 3040 goto out; 3041 rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate); 3042 } 3043 3044 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); 3045 rt->fib6_dst.plen = 
cfg->fc_dst_len; 3046 if (rt->fib6_dst.plen == 128) 3047 rt->dst_host = true; 3048 3049 #ifdef CONFIG_IPV6_SUBTREES 3050 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len); 3051 rt->fib6_src.plen = cfg->fc_src_len; 3052 #endif 3053 3054 rt->fib6_metric = cfg->fc_metric; 3055 rt->fib6_nh.nh_weight = 1; 3056 3057 rt->fib6_type = cfg->fc_type; 3058 3059 /* We cannot add true routes via loopback here, 3060 they would result in kernel looping; promote them to reject routes 3061 */ 3062 if ((cfg->fc_flags & RTF_REJECT) || 3063 (dev && (dev->flags & IFF_LOOPBACK) && 3064 !(addr_type & IPV6_ADDR_LOOPBACK) && 3065 !(cfg->fc_flags & RTF_LOCAL))) { 3066 /* hold loopback dev/idev if we haven't done so. */ 3067 if (dev != net->loopback_dev) { 3068 if (dev) { 3069 dev_put(dev); 3070 in6_dev_put(idev); 3071 } 3072 dev = net->loopback_dev; 3073 dev_hold(dev); 3074 idev = in6_dev_get(dev); 3075 if (!idev) { 3076 err = -ENODEV; 3077 goto out; 3078 } 3079 } 3080 rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP; 3081 goto install_route; 3082 } 3083 3084 if (cfg->fc_flags & RTF_GATEWAY) { 3085 err = ip6_validate_gw(net, cfg, &dev, &idev, extack); 3086 if (err) 3087 goto out; 3088 3089 rt->fib6_nh.nh_gw = cfg->fc_gateway; 3090 } 3091 3092 err = -ENODEV; 3093 if (!dev) 3094 goto out; 3095 3096 if (idev->cnf.disable_ipv6) { 3097 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device"); 3098 err = -EACCES; 3099 goto out; 3100 } 3101 3102 if (!(dev->flags & IFF_UP)) { 3103 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 3104 err = -ENETDOWN; 3105 goto out; 3106 } 3107 3108 if (!ipv6_addr_any(&cfg->fc_prefsrc)) { 3109 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { 3110 NL_SET_ERR_MSG(extack, "Invalid source address"); 3111 err = -EINVAL; 3112 goto out; 3113 } 3114 rt->fib6_prefsrc.addr = cfg->fc_prefsrc; 3115 rt->fib6_prefsrc.plen = 128; 3116 } else 3117 rt->fib6_prefsrc.plen = 0; 3118 3119 rt->fib6_flags = cfg->fc_flags; 3120 3121 install_route: 3122 if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) && 3123 !netif_carrier_ok(dev)) 3124 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN; 3125 rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK); 3126 rt->fib6_nh.nh_dev = dev; 3127 rt->fib6_table = table; 3128 3129 cfg->fc_nlinfo.nl_net = dev_net(dev); 3130 3131 if (idev) 3132 in6_dev_put(idev); 3133 3134 return rt; 3135 out: 3136 if (dev) 3137 dev_put(dev); 3138 if (idev) 3139 in6_dev_put(idev); 3140 3141 fib6_info_release(rt); 3142 return ERR_PTR(err); 3143 } 3144 3145 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, 3146 struct netlink_ext_ack *extack) 3147 { 3148 struct fib6_info *rt; 3149 int err; 3150 3151 rt = ip6_route_info_create(cfg, gfp_flags, extack); 3152 if (IS_ERR(rt)) 3153 return PTR_ERR(rt); 3154 3155 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack); 3156 fib6_info_release(rt); 3157 3158 return err; 3159 } 3160 3161 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info) 3162 { 3163 struct net *net = info->nl_net; 3164 struct fib6_table *table; 3165 int err; 3166 3167 if (rt == net->ipv6.fib6_null_entry) { 3168 err = -ENOENT; 3169 goto out; 3170 } 3171 3172 table = rt->fib6_table; 3173 spin_lock_bh(&table->tb6_lock); 3174 err = fib6_del(rt, info); 3175 spin_unlock_bh(&table->tb6_lock); 3176 3177 out: 3178 fib6_info_release(rt); 3179 return err; 3180 } 3181 3182 int ip6_del_rt(struct net *net, struct fib6_info *rt) 3183 { 3184 struct nl_info info = { .nl_net = net }; 3185 3186 return __ip6_del_rt(rt, &info); 3187 } 3188 3189 static int __ip6_del_rt_siblings(struct 
fib6_info *rt, struct fib6_config *cfg) 3190 { 3191 struct nl_info *info = &cfg->fc_nlinfo; 3192 struct net *net = info->nl_net; 3193 struct sk_buff *skb = NULL; 3194 struct fib6_table *table; 3195 int err = -ENOENT; 3196 3197 if (rt == net->ipv6.fib6_null_entry) 3198 goto out_put; 3199 table = rt->fib6_table; 3200 spin_lock_bh(&table->tb6_lock); 3201 3202 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) { 3203 struct fib6_info *sibling, *next_sibling; 3204 3205 /* prefer to send a single notification with all hops */ 3206 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 3207 if (skb) { 3208 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; 3209 3210 if (rt6_fill_node(net, skb, rt, NULL, 3211 NULL, NULL, 0, RTM_DELROUTE, 3212 info->portid, seq, 0) < 0) { 3213 kfree_skb(skb); 3214 skb = NULL; 3215 } else 3216 info->skip_notify = 1; 3217 } 3218 3219 list_for_each_entry_safe(sibling, next_sibling, 3220 &rt->fib6_siblings, 3221 fib6_siblings) { 3222 err = fib6_del(sibling, info); 3223 if (err) 3224 goto out_unlock; 3225 } 3226 } 3227 3228 err = fib6_del(rt, info); 3229 out_unlock: 3230 spin_unlock_bh(&table->tb6_lock); 3231 out_put: 3232 fib6_info_release(rt); 3233 3234 if (skb) { 3235 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 3236 info->nlh, gfp_any()); 3237 } 3238 return err; 3239 } 3240 3241 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) 3242 { 3243 int rc = -ESRCH; 3244 3245 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex) 3246 goto out; 3247 3248 if (cfg->fc_flags & RTF_GATEWAY && 3249 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) 3250 goto out; 3251 if (dst_hold_safe(&rt->dst)) 3252 rc = rt6_remove_exception_rt(rt); 3253 out: 3254 return rc; 3255 } 3256 3257 static int ip6_route_del(struct fib6_config *cfg, 3258 struct netlink_ext_ack *extack) 3259 { 3260 struct rt6_info *rt_cache; 3261 struct fib6_table *table; 3262 struct fib6_info *rt; 3263 struct fib6_node *fn; 3264 int err = -ESRCH; 3265 3266 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); 3267 if (!table) { 3268 NL_SET_ERR_MSG(extack, "FIB table does not exist"); 3269 return err; 3270 } 3271 3272 rcu_read_lock(); 3273 3274 fn = fib6_locate(&table->tb6_root, 3275 &cfg->fc_dst, cfg->fc_dst_len, 3276 &cfg->fc_src, cfg->fc_src_len, 3277 !(cfg->fc_flags & RTF_CACHE)); 3278 3279 if (fn) { 3280 for_each_fib6_node_rt_rcu(fn) { 3281 if (cfg->fc_flags & RTF_CACHE) { 3282 int rc; 3283 3284 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst, 3285 &cfg->fc_src); 3286 if (rt_cache) { 3287 rc = ip6_del_cached_rt(rt_cache, cfg); 3288 if (rc != -ESRCH) { 3289 rcu_read_unlock(); 3290 return rc; 3291 } 3292 } 3293 continue; 3294 } 3295 if (cfg->fc_ifindex && 3296 (!rt->fib6_nh.nh_dev || 3297 rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex)) 3298 continue; 3299 if (cfg->fc_flags & RTF_GATEWAY && 3300 !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw)) 3301 continue; 3302 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric) 3303 continue; 3304 if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol) 3305 continue; 3306 fib6_info_hold(rt); 3307 rcu_read_unlock(); 3308 3309 /* if gateway was specified only delete the one hop */ 3310 if (cfg->fc_flags & RTF_GATEWAY) 3311 return __ip6_del_rt(rt, &cfg->fc_nlinfo); 3312 3313 return __ip6_del_rt_siblings(rt, cfg); 3314 } 3315 } 3316 rcu_read_unlock(); 3317 3318 return err; 3319 } 3320 3321 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) 3322 { 3323 struct netevent_redirect netevent; 
3324 struct rt6_info *rt, *nrt = NULL; 3325 struct ndisc_options ndopts; 3326 struct inet6_dev *in6_dev; 3327 struct neighbour *neigh; 3328 struct fib6_info *from; 3329 struct rd_msg *msg; 3330 int optlen, on_link; 3331 u8 *lladdr; 3332 3333 optlen = skb_tail_pointer(skb) - skb_transport_header(skb); 3334 optlen -= sizeof(*msg); 3335 3336 if (optlen < 0) { 3337 net_dbg_ratelimited("rt6_do_redirect: packet too short\n"); 3338 return; 3339 } 3340 3341 msg = (struct rd_msg *)icmp6_hdr(skb); 3342 3343 if (ipv6_addr_is_multicast(&msg->dest)) { 3344 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n"); 3345 return; 3346 } 3347 3348 on_link = 0; 3349 if (ipv6_addr_equal(&msg->dest, &msg->target)) { 3350 on_link = 1; 3351 } else if (ipv6_addr_type(&msg->target) != 3352 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { 3353 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n"); 3354 return; 3355 } 3356 3357 in6_dev = __in6_dev_get(skb->dev); 3358 if (!in6_dev) 3359 return; 3360 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects) 3361 return; 3362 3363 /* RFC2461 8.1: 3364 * The IP source address of the Redirect MUST be the same as the current 3365 * first-hop router for the specified ICMP Destination Address. 3366 */ 3367 3368 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) { 3369 net_dbg_ratelimited("rt6_redirect: invalid ND options\n"); 3370 return; 3371 } 3372 3373 lladdr = NULL; 3374 if (ndopts.nd_opts_tgt_lladdr) { 3375 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, 3376 skb->dev); 3377 if (!lladdr) { 3378 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n"); 3379 return; 3380 } 3381 } 3382 3383 rt = (struct rt6_info *) dst; 3384 if (rt->rt6i_flags & RTF_REJECT) { 3385 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n"); 3386 return; 3387 } 3388 3389 /* Redirect received -> path was valid. 3390 * Look, redirects are sent only in response to data packets, 3391 * so that this nexthop apparently is reachable. --ANK 3392 */ 3393 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr); 3394 3395 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1); 3396 if (!neigh) 3397 return; 3398 3399 /* 3400 * We have finally decided to accept it. 3401 */ 3402 3403 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, 3404 NEIGH_UPDATE_F_WEAK_OVERRIDE| 3405 NEIGH_UPDATE_F_OVERRIDE| 3406 (on_link ? 
0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3407 		     NEIGH_UPDATE_F_ISROUTER)),
3408 		     NDISC_REDIRECT, &ndopts);
3409 
3410 	rcu_read_lock();
3411 	from = rcu_dereference(rt->from);
3412 	fib6_info_hold(from);
3413 	rcu_read_unlock();
3414 
3415 	nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3416 	if (!nrt)
3417 		goto out;
3418 
3419 	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3420 	if (on_link)
3421 		nrt->rt6i_flags &= ~RTF_GATEWAY;
3422 
3423 	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3424 
3425 	/* No need to remove rt from the exception table if rt is
3426 	 * a cached route because rt6_insert_exception() will
3427 	 * take care of it
3428 	 */
3429 	if (rt6_insert_exception(nrt, from)) {
3430 		dst_release_immediate(&nrt->dst);
3431 		goto out;
3432 	}
3433 
3434 	netevent.old = &rt->dst;
3435 	netevent.new = &nrt->dst;
3436 	netevent.daddr = &msg->dest;
3437 	netevent.neigh = neigh;
3438 	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3439 
3440 out:
3441 	fib6_info_release(from);
3442 	neigh_release(neigh);
3443 }
3444 
3445 #ifdef CONFIG_IPV6_ROUTE_INFO
3446 static struct fib6_info *rt6_get_route_info(struct net *net,
3447 					    const struct in6_addr *prefix, int prefixlen,
3448 					    const struct in6_addr *gwaddr,
3449 					    struct net_device *dev)
3450 {
3451 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3452 	int ifindex = dev->ifindex;
3453 	struct fib6_node *fn;
3454 	struct fib6_info *rt = NULL;
3455 	struct fib6_table *table;
3456 
3457 	table = fib6_get_table(net, tb_id);
3458 	if (!table)
3459 		return NULL;
3460 
3461 	rcu_read_lock();
3462 	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3463 	if (!fn)
3464 		goto out;
3465 
3466 	for_each_fib6_node_rt_rcu(fn) {
3467 		if (rt->fib6_nh.nh_dev->ifindex != ifindex)
3468 			continue;
3469 		if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3470 			continue;
3471 		if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
3472 			continue;
3473 		fib6_info_hold(rt);
3474 		break;
3475 	}
3476 out:
3477 	rcu_read_unlock();
3478 	return rt;
3479 }
3480 
3481 static struct fib6_info *rt6_add_route_info(struct net *net,
3482 					    const struct in6_addr *prefix, int prefixlen,
3483 					    const struct in6_addr *gwaddr,
3484 					    struct net_device *dev,
3485 					    unsigned int pref)
3486 {
3487 	struct fib6_config cfg = {
3488 		.fc_metric	= IP6_RT_PRIO_USER,
3489 		.fc_ifindex	= dev->ifindex,
3490 		.fc_dst_len	= prefixlen,
3491 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3492 				  RTF_UP | RTF_PREF(pref),
3493 		.fc_protocol	= RTPROT_RA,
3494 		.fc_type	= RTN_UNICAST,
3495 		.fc_nlinfo.portid = 0,
3496 		.fc_nlinfo.nlh = NULL,
3497 		.fc_nlinfo.nl_net = net,
3498 	};
3499 
3500 	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3501 	cfg.fc_dst = *prefix;
3502 	cfg.fc_gateway = *gwaddr;
3503 
3504 	/* We should treat it as a default route if prefix length is 0. */
3505 	if (!prefixlen)
3506 		cfg.fc_flags |= RTF_DEFAULT;
3507 
3508 	ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3509 
3510 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3511 }
3512 #endif
3513 
3514 struct fib6_info *rt6_get_dflt_router(struct net *net,
3515 				      const struct in6_addr *addr,
3516 				      struct net_device *dev)
3517 {
3518 	u32 tb_id = l3mdev_fib_table(dev) ?
: RT6_TABLE_DFLT; 3519 struct fib6_info *rt; 3520 struct fib6_table *table; 3521 3522 table = fib6_get_table(net, tb_id); 3523 if (!table) 3524 return NULL; 3525 3526 rcu_read_lock(); 3527 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3528 if (dev == rt->fib6_nh.nh_dev && 3529 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && 3530 ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr)) 3531 break; 3532 } 3533 if (rt) 3534 fib6_info_hold(rt); 3535 rcu_read_unlock(); 3536 return rt; 3537 } 3538 3539 struct fib6_info *rt6_add_dflt_router(struct net *net, 3540 const struct in6_addr *gwaddr, 3541 struct net_device *dev, 3542 unsigned int pref) 3543 { 3544 struct fib6_config cfg = { 3545 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT, 3546 .fc_metric = IP6_RT_PRIO_USER, 3547 .fc_ifindex = dev->ifindex, 3548 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | 3549 RTF_UP | RTF_EXPIRES | RTF_PREF(pref), 3550 .fc_protocol = RTPROT_RA, 3551 .fc_type = RTN_UNICAST, 3552 .fc_nlinfo.portid = 0, 3553 .fc_nlinfo.nlh = NULL, 3554 .fc_nlinfo.nl_net = net, 3555 }; 3556 3557 cfg.fc_gateway = *gwaddr; 3558 3559 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) { 3560 struct fib6_table *table; 3561 3562 table = fib6_get_table(dev_net(dev), cfg.fc_table); 3563 if (table) 3564 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER; 3565 } 3566 3567 return rt6_get_dflt_router(net, gwaddr, dev); 3568 } 3569 3570 static void __rt6_purge_dflt_routers(struct net *net, 3571 struct fib6_table *table) 3572 { 3573 struct fib6_info *rt; 3574 3575 restart: 3576 rcu_read_lock(); 3577 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3578 struct net_device *dev = fib6_info_nh_dev(rt); 3579 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL; 3580 3581 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) && 3582 (!idev || idev->cnf.accept_ra != 2)) { 3583 fib6_info_hold(rt); 3584 rcu_read_unlock(); 3585 ip6_del_rt(net, rt); 3586 goto restart; 3587 } 3588 } 3589 rcu_read_unlock(); 3590 3591 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER; 3592 } 3593 3594 void rt6_purge_dflt_routers(struct net *net) 3595 { 3596 struct fib6_table *table; 3597 struct hlist_head *head; 3598 unsigned int h; 3599 3600 rcu_read_lock(); 3601 3602 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { 3603 head = &net->ipv6.fib_table_hash[h]; 3604 hlist_for_each_entry_rcu(table, head, tb6_hlist) { 3605 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER) 3606 __rt6_purge_dflt_routers(net, table); 3607 } 3608 } 3609 3610 rcu_read_unlock(); 3611 } 3612 3613 static void rtmsg_to_fib6_config(struct net *net, 3614 struct in6_rtmsg *rtmsg, 3615 struct fib6_config *cfg) 3616 { 3617 memset(cfg, 0, sizeof(*cfg)); 3618 3619 cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? 
3620 : RT6_TABLE_MAIN; 3621 cfg->fc_ifindex = rtmsg->rtmsg_ifindex; 3622 cfg->fc_metric = rtmsg->rtmsg_metric; 3623 cfg->fc_expires = rtmsg->rtmsg_info; 3624 cfg->fc_dst_len = rtmsg->rtmsg_dst_len; 3625 cfg->fc_src_len = rtmsg->rtmsg_src_len; 3626 cfg->fc_flags = rtmsg->rtmsg_flags; 3627 cfg->fc_type = rtmsg->rtmsg_type; 3628 3629 cfg->fc_nlinfo.nl_net = net; 3630 3631 cfg->fc_dst = rtmsg->rtmsg_dst; 3632 cfg->fc_src = rtmsg->rtmsg_src; 3633 cfg->fc_gateway = rtmsg->rtmsg_gateway; 3634 } 3635 3636 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) 3637 { 3638 struct fib6_config cfg; 3639 struct in6_rtmsg rtmsg; 3640 int err; 3641 3642 switch (cmd) { 3643 case SIOCADDRT: /* Add a route */ 3644 case SIOCDELRT: /* Delete a route */ 3645 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 3646 return -EPERM; 3647 err = copy_from_user(&rtmsg, arg, 3648 sizeof(struct in6_rtmsg)); 3649 if (err) 3650 return -EFAULT; 3651 3652 rtmsg_to_fib6_config(net, &rtmsg, &cfg); 3653 3654 rtnl_lock(); 3655 switch (cmd) { 3656 case SIOCADDRT: 3657 err = ip6_route_add(&cfg, GFP_KERNEL, NULL); 3658 break; 3659 case SIOCDELRT: 3660 err = ip6_route_del(&cfg, NULL); 3661 break; 3662 default: 3663 err = -EINVAL; 3664 } 3665 rtnl_unlock(); 3666 3667 return err; 3668 } 3669 3670 return -EINVAL; 3671 } 3672 3673 /* 3674 * Drop the packet on the floor 3675 */ 3676 3677 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) 3678 { 3679 int type; 3680 struct dst_entry *dst = skb_dst(skb); 3681 switch (ipstats_mib_noroutes) { 3682 case IPSTATS_MIB_INNOROUTES: 3683 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); 3684 if (type == IPV6_ADDR_ANY) { 3685 IP6_INC_STATS(dev_net(dst->dev), 3686 __in6_dev_get_safely(skb->dev), 3687 IPSTATS_MIB_INADDRERRORS); 3688 break; 3689 } 3690 /* FALLTHROUGH */ 3691 case IPSTATS_MIB_OUTNOROUTES: 3692 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), 3693 ipstats_mib_noroutes); 3694 break; 3695 } 3696 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0); 3697 kfree_skb(skb); 3698 return 0; 3699 } 3700 3701 static int ip6_pkt_discard(struct sk_buff *skb) 3702 { 3703 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES); 3704 } 3705 3706 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3707 { 3708 skb->dev = skb_dst(skb)->dev; 3709 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); 3710 } 3711 3712 static int ip6_pkt_prohibit(struct sk_buff *skb) 3713 { 3714 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES); 3715 } 3716 3717 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3718 { 3719 skb->dev = skb_dst(skb)->dev; 3720 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); 3721 } 3722 3723 /* 3724 * Allocate a dst for local (unicast / anycast) address. 
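 *
 * A sketch of the expected use (hypothetical caller, mirroring
 * addrconf: failure is reported via ERR_PTR() and the entry still
 * has to be inserted like any other route):
 *
 *	f6i = addrconf_f6i_alloc(net, idev, addr, false, GFP_KERNEL);
 *	if (!IS_ERR(f6i))
 *		err = ip6_ins_rt(net, f6i);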
3725  */
3726 
3727 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3728 				     struct inet6_dev *idev,
3729 				     const struct in6_addr *addr,
3730 				     bool anycast, gfp_t gfp_flags)
3731 {
3732 	u32 tb_id;
3733 	struct net_device *dev = idev->dev;
3734 	struct fib6_info *f6i;
3735 
3736 	f6i = fib6_info_alloc(gfp_flags);
3737 	if (!f6i)
3738 		return ERR_PTR(-ENOMEM);
3739 
3740 	f6i->dst_nocount = true;
3741 	f6i->dst_host = true;
3742 	f6i->fib6_protocol = RTPROT_KERNEL;
3743 	f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
3744 	if (anycast) {
3745 		f6i->fib6_type = RTN_ANYCAST;
3746 		f6i->fib6_flags |= RTF_ANYCAST;
3747 	} else {
3748 		f6i->fib6_type = RTN_LOCAL;
3749 		f6i->fib6_flags |= RTF_LOCAL;
3750 	}
3751 
3752 	f6i->fib6_nh.nh_gw = *addr;
3753 	dev_hold(dev);
3754 	f6i->fib6_nh.nh_dev = dev;
3755 	f6i->fib6_dst.addr = *addr;
3756 	f6i->fib6_dst.plen = 128;
3757 	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3758 	f6i->fib6_table = fib6_get_table(net, tb_id);
3759 
3760 	return f6i;
3761 }
3762 
3763 /* remove a deleted IP from prefsrc entries */
3764 struct arg_dev_net_ip {
3765 	struct net_device *dev;
3766 	struct net *net;
3767 	struct in6_addr *addr;
3768 };
3769 
3770 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3771 {
3772 	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3773 	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3774 	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3775 
3776 	if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
3777 	    rt != net->ipv6.fib6_null_entry &&
3778 	    ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3779 		spin_lock_bh(&rt6_exception_lock);
3780 		/* remove prefsrc entry */
3781 		rt->fib6_prefsrc.plen = 0;
3782 		/* need to update cache as well */
3783 		rt6_exceptions_remove_prefsrc(rt);
3784 		spin_unlock_bh(&rt6_exception_lock);
3785 	}
3786 	return 0;
3787 }
3788 
3789 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3790 {
3791 	struct net *net = dev_net(ifp->idev->dev);
3792 	struct arg_dev_net_ip adni = {
3793 		.dev = ifp->idev->dev,
3794 		.net = net,
3795 		.addr = &ifp->addr,
3796 	};
3797 	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3798 }
3799 
3800 #define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3801 
3802 /* Remove routers and update dst entries when a gateway turns into a host. */
3803 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3804 {
3805 	struct in6_addr *gateway = (struct in6_addr *)arg;
3806 
3807 	if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3808 	    ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
3809 		return -1;
3810 	}
3811 
3812 	/* Further clean up cached routes in the exception table.
3813 	 * This is needed because a cached route may have a different
3814 	 * gateway than its 'parent' in the case of an IP redirect.
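 *
 * Note: a fib6_clean_all() callback returning -1 asks
 * fib6_clean_node() to delete the entry (as done above for
 * RA-learned router entries); 0 keeps it, and -2 (used by
 * fib6_ifdown() below) skips the remaining multipath siblings.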
3815 */ 3816 rt6_exceptions_clean_tohost(rt, gateway); 3817 3818 return 0; 3819 } 3820 3821 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) 3822 { 3823 fib6_clean_all(net, fib6_clean_tohost, gateway); 3824 } 3825 3826 struct arg_netdev_event { 3827 const struct net_device *dev; 3828 union { 3829 unsigned int nh_flags; 3830 unsigned long event; 3831 }; 3832 }; 3833 3834 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) 3835 { 3836 struct fib6_info *iter; 3837 struct fib6_node *fn; 3838 3839 fn = rcu_dereference_protected(rt->fib6_node, 3840 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3841 iter = rcu_dereference_protected(fn->leaf, 3842 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3843 while (iter) { 3844 if (iter->fib6_metric == rt->fib6_metric && 3845 iter->fib6_nsiblings) 3846 return iter; 3847 iter = rcu_dereference_protected(iter->fib6_next, 3848 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3849 } 3850 3851 return NULL; 3852 } 3853 3854 static bool rt6_is_dead(const struct fib6_info *rt) 3855 { 3856 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD || 3857 (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN && 3858 fib6_ignore_linkdown(rt))) 3859 return true; 3860 3861 return false; 3862 } 3863 3864 static int rt6_multipath_total_weight(const struct fib6_info *rt) 3865 { 3866 struct fib6_info *iter; 3867 int total = 0; 3868 3869 if (!rt6_is_dead(rt)) 3870 total += rt->fib6_nh.nh_weight; 3871 3872 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { 3873 if (!rt6_is_dead(iter)) 3874 total += iter->fib6_nh.nh_weight; 3875 } 3876 3877 return total; 3878 } 3879 3880 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total) 3881 { 3882 int upper_bound = -1; 3883 3884 if (!rt6_is_dead(rt)) { 3885 *weight += rt->fib6_nh.nh_weight; 3886 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, 3887 total) - 1; 3888 } 3889 atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound); 3890 } 3891 3892 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total) 3893 { 3894 struct fib6_info *iter; 3895 int weight = 0; 3896 3897 rt6_upper_bound_set(rt, &weight, total); 3898 3899 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3900 rt6_upper_bound_set(iter, &weight, total); 3901 } 3902 3903 void rt6_multipath_rebalance(struct fib6_info *rt) 3904 { 3905 struct fib6_info *first; 3906 int total; 3907 3908 /* In case the entire multipath route was marked for flushing, 3909 * then there is no need to rebalance upon the removal of every 3910 * sibling route. 3911 */ 3912 if (!rt->fib6_nsiblings || rt->should_flush) 3913 return; 3914 3915 /* During lookup routes are evaluated in order, so we need to 3916 * make sure upper bounds are assigned from the first sibling 3917 * onwards. 
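 *
 * Worked example: two siblings with weights 1 and 3 give total = 4,
 * so the first nexthop gets upper bound (1 << 31) / 4 - 1 = 2^29 - 1
 * and the second (4 << 31) / 4 - 1 = 2^31 - 1, i.e. roughly a
 * 25%/75% split of the 31-bit multipath hash space.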
3918 */ 3919 first = rt6_multipath_first_sibling(rt); 3920 if (WARN_ON_ONCE(!first)) 3921 return; 3922 3923 total = rt6_multipath_total_weight(first); 3924 rt6_multipath_upper_bound_set(first, total); 3925 } 3926 3927 static int fib6_ifup(struct fib6_info *rt, void *p_arg) 3928 { 3929 const struct arg_netdev_event *arg = p_arg; 3930 struct net *net = dev_net(arg->dev); 3931 3932 if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) { 3933 rt->fib6_nh.nh_flags &= ~arg->nh_flags; 3934 fib6_update_sernum_upto_root(net, rt); 3935 rt6_multipath_rebalance(rt); 3936 } 3937 3938 return 0; 3939 } 3940 3941 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags) 3942 { 3943 struct arg_netdev_event arg = { 3944 .dev = dev, 3945 { 3946 .nh_flags = nh_flags, 3947 }, 3948 }; 3949 3950 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev)) 3951 arg.nh_flags |= RTNH_F_LINKDOWN; 3952 3953 fib6_clean_all(dev_net(dev), fib6_ifup, &arg); 3954 } 3955 3956 static bool rt6_multipath_uses_dev(const struct fib6_info *rt, 3957 const struct net_device *dev) 3958 { 3959 struct fib6_info *iter; 3960 3961 if (rt->fib6_nh.nh_dev == dev) 3962 return true; 3963 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3964 if (iter->fib6_nh.nh_dev == dev) 3965 return true; 3966 3967 return false; 3968 } 3969 3970 static void rt6_multipath_flush(struct fib6_info *rt) 3971 { 3972 struct fib6_info *iter; 3973 3974 rt->should_flush = 1; 3975 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3976 iter->should_flush = 1; 3977 } 3978 3979 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt, 3980 const struct net_device *down_dev) 3981 { 3982 struct fib6_info *iter; 3983 unsigned int dead = 0; 3984 3985 if (rt->fib6_nh.nh_dev == down_dev || 3986 rt->fib6_nh.nh_flags & RTNH_F_DEAD) 3987 dead++; 3988 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3989 if (iter->fib6_nh.nh_dev == down_dev || 3990 iter->fib6_nh.nh_flags & RTNH_F_DEAD) 3991 dead++; 3992 3993 return dead; 3994 } 3995 3996 static void rt6_multipath_nh_flags_set(struct fib6_info *rt, 3997 const struct net_device *dev, 3998 unsigned int nh_flags) 3999 { 4000 struct fib6_info *iter; 4001 4002 if (rt->fib6_nh.nh_dev == dev) 4003 rt->fib6_nh.nh_flags |= nh_flags; 4004 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4005 if (iter->fib6_nh.nh_dev == dev) 4006 iter->fib6_nh.nh_flags |= nh_flags; 4007 } 4008 4009 /* called with write lock held for table with rt */ 4010 static int fib6_ifdown(struct fib6_info *rt, void *p_arg) 4011 { 4012 const struct arg_netdev_event *arg = p_arg; 4013 const struct net_device *dev = arg->dev; 4014 struct net *net = dev_net(dev); 4015 4016 if (rt == net->ipv6.fib6_null_entry) 4017 return 0; 4018 4019 switch (arg->event) { 4020 case NETDEV_UNREGISTER: 4021 return rt->fib6_nh.nh_dev == dev ? -1 : 0; 4022 case NETDEV_DOWN: 4023 if (rt->should_flush) 4024 return -1; 4025 if (!rt->fib6_nsiblings) 4026 return rt->fib6_nh.nh_dev == dev ? 
-1 : 0;
4027 		if (rt6_multipath_uses_dev(rt, dev)) {
4028 			unsigned int count;
4029 
4030 			count = rt6_multipath_dead_count(rt, dev);
4031 			if (rt->fib6_nsiblings + 1 == count) {
4032 				rt6_multipath_flush(rt);
4033 				return -1;
4034 			}
4035 			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4036 						   RTNH_F_LINKDOWN);
4037 			fib6_update_sernum(net, rt);
4038 			rt6_multipath_rebalance(rt);
4039 		}
4040 		return -2;
4041 	case NETDEV_CHANGE:
4042 		if (rt->fib6_nh.nh_dev != dev ||
4043 		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4044 			break;
4045 		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
4046 		rt6_multipath_rebalance(rt);
4047 		break;
4048 	}
4049 
4050 	return 0;
4051 }
4052 
4053 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4054 {
4055 	struct arg_netdev_event arg = {
4056 		.dev = dev,
4057 		{
4058 			.event = event,
4059 		},
4060 	};
4061 
4062 	fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
4063 }
4064 
4065 void rt6_disable_ip(struct net_device *dev, unsigned long event)
4066 {
4067 	rt6_sync_down_dev(dev, event);
4068 	rt6_uncached_list_flush_dev(dev_net(dev), dev);
4069 	neigh_ifdown(&nd_tbl, dev);
4070 }
4071 
4072 struct rt6_mtu_change_arg {
4073 	struct net_device *dev;
4074 	unsigned int mtu;
4075 };
4076 
4077 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
4078 {
4079 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4080 	struct inet6_dev *idev;
4081 
4082 	/* In IPv6, PMTU discovery is not optional,
4083 	   so the RTAX_MTU lock cannot disable it.
4084 	   We still use this lock to block changes
4085 	   caused by addrconf/ndisc.
4086 	*/
4087 
4088 	idev = __in6_dev_get(arg->dev);
4089 	if (!idev)
4090 		return 0;
4091 
4092 	/* For an administrative MTU increase, there is no way to discover
4093 	   an IPv6 PMTU increase, so the PMTU must be updated here.
4094 	   Since RFC 1981 doesn't cover administrative MTU increases,
4095 	   updating the PMTU on increase is a MUST. (i.e.
jumbo frame) 4096 */ 4097 if (rt->fib6_nh.nh_dev == arg->dev && 4098 !fib6_metric_locked(rt, RTAX_MTU)) { 4099 u32 mtu = rt->fib6_pmtu; 4100 4101 if (mtu >= arg->mtu || 4102 (mtu < arg->mtu && mtu == idev->cnf.mtu6)) 4103 fib6_metric_set(rt, RTAX_MTU, arg->mtu); 4104 4105 spin_lock_bh(&rt6_exception_lock); 4106 rt6_exceptions_update_pmtu(idev, rt, arg->mtu); 4107 spin_unlock_bh(&rt6_exception_lock); 4108 } 4109 return 0; 4110 } 4111 4112 void rt6_mtu_change(struct net_device *dev, unsigned int mtu) 4113 { 4114 struct rt6_mtu_change_arg arg = { 4115 .dev = dev, 4116 .mtu = mtu, 4117 }; 4118 4119 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg); 4120 } 4121 4122 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { 4123 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) }, 4124 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) }, 4125 [RTA_OIF] = { .type = NLA_U32 }, 4126 [RTA_IIF] = { .type = NLA_U32 }, 4127 [RTA_PRIORITY] = { .type = NLA_U32 }, 4128 [RTA_METRICS] = { .type = NLA_NESTED }, 4129 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, 4130 [RTA_PREF] = { .type = NLA_U8 }, 4131 [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, 4132 [RTA_ENCAP] = { .type = NLA_NESTED }, 4133 [RTA_EXPIRES] = { .type = NLA_U32 }, 4134 [RTA_UID] = { .type = NLA_U32 }, 4135 [RTA_MARK] = { .type = NLA_U32 }, 4136 [RTA_TABLE] = { .type = NLA_U32 }, 4137 [RTA_IP_PROTO] = { .type = NLA_U8 }, 4138 [RTA_SPORT] = { .type = NLA_U16 }, 4139 [RTA_DPORT] = { .type = NLA_U16 }, 4140 }; 4141 4142 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, 4143 struct fib6_config *cfg, 4144 struct netlink_ext_ack *extack) 4145 { 4146 struct rtmsg *rtm; 4147 struct nlattr *tb[RTA_MAX+1]; 4148 unsigned int pref; 4149 int err; 4150 4151 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, 4152 NULL); 4153 if (err < 0) 4154 goto errout; 4155 4156 err = -EINVAL; 4157 rtm = nlmsg_data(nlh); 4158 memset(cfg, 0, sizeof(*cfg)); 4159 4160 cfg->fc_table = rtm->rtm_table; 4161 cfg->fc_dst_len = rtm->rtm_dst_len; 4162 cfg->fc_src_len = rtm->rtm_src_len; 4163 cfg->fc_flags = RTF_UP; 4164 cfg->fc_protocol = rtm->rtm_protocol; 4165 cfg->fc_type = rtm->rtm_type; 4166 4167 if (rtm->rtm_type == RTN_UNREACHABLE || 4168 rtm->rtm_type == RTN_BLACKHOLE || 4169 rtm->rtm_type == RTN_PROHIBIT || 4170 rtm->rtm_type == RTN_THROW) 4171 cfg->fc_flags |= RTF_REJECT; 4172 4173 if (rtm->rtm_type == RTN_LOCAL) 4174 cfg->fc_flags |= RTF_LOCAL; 4175 4176 if (rtm->rtm_flags & RTM_F_CLONED) 4177 cfg->fc_flags |= RTF_CACHE; 4178 4179 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK); 4180 4181 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid; 4182 cfg->fc_nlinfo.nlh = nlh; 4183 cfg->fc_nlinfo.nl_net = sock_net(skb->sk); 4184 4185 if (tb[RTA_GATEWAY]) { 4186 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); 4187 cfg->fc_flags |= RTF_GATEWAY; 4188 } 4189 4190 if (tb[RTA_DST]) { 4191 int plen = (rtm->rtm_dst_len + 7) >> 3; 4192 4193 if (nla_len(tb[RTA_DST]) < plen) 4194 goto errout; 4195 4196 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen); 4197 } 4198 4199 if (tb[RTA_SRC]) { 4200 int plen = (rtm->rtm_src_len + 7) >> 3; 4201 4202 if (nla_len(tb[RTA_SRC]) < plen) 4203 goto errout; 4204 4205 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen); 4206 } 4207 4208 if (tb[RTA_PREFSRC]) 4209 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]); 4210 4211 if (tb[RTA_OIF]) 4212 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]); 4213 4214 if (tb[RTA_PRIORITY]) 4215 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]); 4216 4217 if (tb[RTA_METRICS]) { 4218 
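		/* Keep a reference to the nested metrics attribute; it is
		 * converted into a dst_metrics block by ip6_convert_metrics()
		 * when the route is actually created.
		 */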
cfg->fc_mx = nla_data(tb[RTA_METRICS]); 4219 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]); 4220 } 4221 4222 if (tb[RTA_TABLE]) 4223 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]); 4224 4225 if (tb[RTA_MULTIPATH]) { 4226 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]); 4227 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); 4228 4229 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp, 4230 cfg->fc_mp_len, extack); 4231 if (err < 0) 4232 goto errout; 4233 } 4234 4235 if (tb[RTA_PREF]) { 4236 pref = nla_get_u8(tb[RTA_PREF]); 4237 if (pref != ICMPV6_ROUTER_PREF_LOW && 4238 pref != ICMPV6_ROUTER_PREF_HIGH) 4239 pref = ICMPV6_ROUTER_PREF_MEDIUM; 4240 cfg->fc_flags |= RTF_PREF(pref); 4241 } 4242 4243 if (tb[RTA_ENCAP]) 4244 cfg->fc_encap = tb[RTA_ENCAP]; 4245 4246 if (tb[RTA_ENCAP_TYPE]) { 4247 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); 4248 4249 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack); 4250 if (err < 0) 4251 goto errout; 4252 } 4253 4254 if (tb[RTA_EXPIRES]) { 4255 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ); 4256 4257 if (addrconf_finite_timeout(timeout)) { 4258 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ); 4259 cfg->fc_flags |= RTF_EXPIRES; 4260 } 4261 } 4262 4263 err = 0; 4264 errout: 4265 return err; 4266 } 4267 4268 struct rt6_nh { 4269 struct fib6_info *fib6_info; 4270 struct fib6_config r_cfg; 4271 struct list_head next; 4272 }; 4273 4274 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list) 4275 { 4276 struct rt6_nh *nh; 4277 4278 list_for_each_entry(nh, rt6_nh_list, next) { 4279 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n", 4280 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway, 4281 nh->r_cfg.fc_ifindex); 4282 } 4283 } 4284 4285 static int ip6_route_info_append(struct net *net, 4286 struct list_head *rt6_nh_list, 4287 struct fib6_info *rt, 4288 struct fib6_config *r_cfg) 4289 { 4290 struct rt6_nh *nh; 4291 int err = -EEXIST; 4292 4293 list_for_each_entry(nh, rt6_nh_list, next) { 4294 /* check if fib6_info already exists */ 4295 if (rt6_duplicate_nexthop(nh->fib6_info, rt)) 4296 return err; 4297 } 4298 4299 nh = kzalloc(sizeof(*nh), GFP_KERNEL); 4300 if (!nh) 4301 return -ENOMEM; 4302 nh->fib6_info = rt; 4303 err = ip6_convert_metrics(net, rt, r_cfg); 4304 if (err) { 4305 kfree(nh); 4306 return err; 4307 } 4308 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); 4309 list_add_tail(&nh->next, rt6_nh_list); 4310 4311 return 0; 4312 } 4313 4314 static void ip6_route_mpath_notify(struct fib6_info *rt, 4315 struct fib6_info *rt_last, 4316 struct nl_info *info, 4317 __u16 nlflags) 4318 { 4319 /* if this is an APPEND route, then rt points to the first route 4320 * inserted and rt_last points to last route inserted. Userspace 4321 * wants a consistent dump of the route which starts at the first 4322 * nexthop. 
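* (For example, appending one nexthop to an existing multipath route
* must notify the whole route, beginning at its original first
* nexthop, not at the entry that was just added.)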
Since sibling routes are always added at the end of 4323 * the list, find the first sibling of the last route appended 4324 */ 4325 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) { 4326 rt = list_first_entry(&rt_last->fib6_siblings, 4327 struct fib6_info, 4328 fib6_siblings); 4329 } 4330 4331 if (rt) 4332 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); 4333 } 4334 4335 static int ip6_route_multipath_add(struct fib6_config *cfg, 4336 struct netlink_ext_ack *extack) 4337 { 4338 struct fib6_info *rt_notif = NULL, *rt_last = NULL; 4339 struct nl_info *info = &cfg->fc_nlinfo; 4340 struct fib6_config r_cfg; 4341 struct rtnexthop *rtnh; 4342 struct fib6_info *rt; 4343 struct rt6_nh *err_nh; 4344 struct rt6_nh *nh, *nh_safe; 4345 __u16 nlflags; 4346 int remaining; 4347 int attrlen; 4348 int err = 1; 4349 int nhn = 0; 4350 int replace = (cfg->fc_nlinfo.nlh && 4351 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); 4352 LIST_HEAD(rt6_nh_list); 4353 4354 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE; 4355 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND) 4356 nlflags |= NLM_F_APPEND; 4357 4358 remaining = cfg->fc_mp_len; 4359 rtnh = (struct rtnexthop *)cfg->fc_mp; 4360 4361 /* Parse a Multipath Entry and build a list (rt6_nh_list) of 4362 * fib6_info structs per nexthop 4363 */ 4364 while (rtnh_ok(rtnh, remaining)) { 4365 memcpy(&r_cfg, cfg, sizeof(*cfg)); 4366 if (rtnh->rtnh_ifindex) 4367 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 4368 4369 attrlen = rtnh_attrlen(rtnh); 4370 if (attrlen > 0) { 4371 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 4372 4373 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 4374 if (nla) { 4375 r_cfg.fc_gateway = nla_get_in6_addr(nla); 4376 r_cfg.fc_flags |= RTF_GATEWAY; 4377 } 4378 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); 4379 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); 4380 if (nla) 4381 r_cfg.fc_encap_type = nla_get_u16(nla); 4382 } 4383 4384 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK); 4385 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack); 4386 if (IS_ERR(rt)) { 4387 err = PTR_ERR(rt); 4388 rt = NULL; 4389 goto cleanup; 4390 } 4391 4392 rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1; 4393 4394 err = ip6_route_info_append(info->nl_net, &rt6_nh_list, 4395 rt, &r_cfg); 4396 if (err) { 4397 fib6_info_release(rt); 4398 goto cleanup; 4399 } 4400 4401 rtnh = rtnh_next(rtnh, &remaining); 4402 } 4403 4404 /* for add and replace send one notification with all nexthops. 
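* Each nexthop is inserted as its own fib6_info by __ip6_ins_rt()
* below, so without skip_notify every nexthop would generate a
* separate RTM_NEWROUTE message.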
4405 * Skip the notification in fib6_add_rt2node and send one with 4406 * the full route when done 4407 */ 4408 info->skip_notify = 1; 4409 4410 err_nh = NULL; 4411 list_for_each_entry(nh, &rt6_nh_list, next) { 4412 err = __ip6_ins_rt(nh->fib6_info, info, extack); 4413 fib6_info_release(nh->fib6_info); 4414 4415 if (!err) { 4416 /* save reference to last route successfully inserted */ 4417 rt_last = nh->fib6_info; 4418 4419 /* save reference to first route for notification */ 4420 if (!rt_notif) 4421 rt_notif = nh->fib6_info; 4422 } 4423 4424 /* nh->fib6_info is used or freed at this point, reset to NULL*/ 4425 nh->fib6_info = NULL; 4426 if (err) { 4427 if (replace && nhn) 4428 ip6_print_replace_route_err(&rt6_nh_list); 4429 err_nh = nh; 4430 goto add_errout; 4431 } 4432 4433 /* Because each route is added like a single route we remove 4434 * these flags after the first nexthop: if there is a collision, 4435 * we have already failed to add the first nexthop: 4436 * fib6_add_rt2node() has rejected it; when replacing, old 4437 * nexthops have been replaced by first new, the rest should 4438 * be added to it. 4439 */ 4440 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | 4441 NLM_F_REPLACE); 4442 cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_APPEND; 4443 nhn++; 4444 } 4445 4446 /* success ... tell user about new route */ 4447 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 4448 goto cleanup; 4449 4450 add_errout: 4451 /* send notification for routes that were added so that 4452 * the delete notifications sent by ip6_route_del are 4453 * coherent 4454 */ 4455 if (rt_notif) 4456 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 4457 4458 /* Delete routes that were already added */ 4459 list_for_each_entry(nh, &rt6_nh_list, next) { 4460 if (err_nh == nh) 4461 break; 4462 ip6_route_del(&nh->r_cfg, extack); 4463 } 4464 4465 cleanup: 4466 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { 4467 if (nh->fib6_info) 4468 fib6_info_release(nh->fib6_info); 4469 list_del(&nh->next); 4470 kfree(nh); 4471 } 4472 4473 return err; 4474 } 4475 4476 static int ip6_route_multipath_del(struct fib6_config *cfg, 4477 struct netlink_ext_ack *extack) 4478 { 4479 struct fib6_config r_cfg; 4480 struct rtnexthop *rtnh; 4481 int remaining; 4482 int attrlen; 4483 int err = 1, last_err = 0; 4484 4485 remaining = cfg->fc_mp_len; 4486 rtnh = (struct rtnexthop *)cfg->fc_mp; 4487 4488 /* Parse a Multipath Entry */ 4489 while (rtnh_ok(rtnh, remaining)) { 4490 memcpy(&r_cfg, cfg, sizeof(*cfg)); 4491 if (rtnh->rtnh_ifindex) 4492 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 4493 4494 attrlen = rtnh_attrlen(rtnh); 4495 if (attrlen > 0) { 4496 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 4497 4498 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 4499 if (nla) { 4500 nla_memcpy(&r_cfg.fc_gateway, nla, 16); 4501 r_cfg.fc_flags |= RTF_GATEWAY; 4502 } 4503 } 4504 err = ip6_route_del(&r_cfg, extack); 4505 if (err) 4506 last_err = err; 4507 4508 rtnh = rtnh_next(rtnh, &remaining); 4509 } 4510 4511 return last_err; 4512 } 4513 4514 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, 4515 struct netlink_ext_ack *extack) 4516 { 4517 struct fib6_config cfg; 4518 int err; 4519 4520 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 4521 if (err < 0) 4522 return err; 4523 4524 if (cfg.fc_mp) 4525 return ip6_route_multipath_del(&cfg, extack); 4526 else { 4527 cfg.fc_delete_all_nh = 1; 4528 return ip6_route_del(&cfg, extack); 4529 } 4530 } 4531 4532 static int inet6_rtm_newroute(struct sk_buff *skb, struct 
nlmsghdr *nlh, 4533 struct netlink_ext_ack *extack) 4534 { 4535 struct fib6_config cfg; 4536 int err; 4537 4538 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 4539 if (err < 0) 4540 return err; 4541 4542 if (cfg.fc_mp) 4543 return ip6_route_multipath_add(&cfg, extack); 4544 else 4545 return ip6_route_add(&cfg, GFP_KERNEL, extack); 4546 } 4547 4548 static size_t rt6_nlmsg_size(struct fib6_info *rt) 4549 { 4550 int nexthop_len = 0; 4551 4552 if (rt->fib6_nsiblings) { 4553 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ 4554 + NLA_ALIGN(sizeof(struct rtnexthop)) 4555 + nla_total_size(16) /* RTA_GATEWAY */ 4556 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate); 4557 4558 nexthop_len *= rt->fib6_nsiblings; 4559 } 4560 4561 return NLMSG_ALIGN(sizeof(struct rtmsg)) 4562 + nla_total_size(16) /* RTA_SRC */ 4563 + nla_total_size(16) /* RTA_DST */ 4564 + nla_total_size(16) /* RTA_GATEWAY */ 4565 + nla_total_size(16) /* RTA_PREFSRC */ 4566 + nla_total_size(4) /* RTA_TABLE */ 4567 + nla_total_size(4) /* RTA_IIF */ 4568 + nla_total_size(4) /* RTA_OIF */ 4569 + nla_total_size(4) /* RTA_PRIORITY */ 4570 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */ 4571 + nla_total_size(sizeof(struct rta_cacheinfo)) 4572 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ 4573 + nla_total_size(1) /* RTA_PREF */ 4574 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate) 4575 + nexthop_len; 4576 } 4577 4578 static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt, 4579 unsigned int *flags, bool skip_oif) 4580 { 4581 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) 4582 *flags |= RTNH_F_DEAD; 4583 4584 if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) { 4585 *flags |= RTNH_F_LINKDOWN; 4586 4587 rcu_read_lock(); 4588 if (fib6_ignore_linkdown(rt)) 4589 *flags |= RTNH_F_DEAD; 4590 rcu_read_unlock(); 4591 } 4592 4593 if (rt->fib6_flags & RTF_GATEWAY) { 4594 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0) 4595 goto nla_put_failure; 4596 } 4597 4598 *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK); 4599 if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD) 4600 *flags |= RTNH_F_OFFLOAD; 4601 4602 /* not needed for multipath encoding b/c it has a rtnexthop struct */ 4603 if (!skip_oif && rt->fib6_nh.nh_dev && 4604 nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex)) 4605 goto nla_put_failure; 4606 4607 if (rt->fib6_nh.nh_lwtstate && 4608 lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0) 4609 goto nla_put_failure; 4610 4611 return 0; 4612 4613 nla_put_failure: 4614 return -EMSGSIZE; 4615 } 4616 4617 /* add multipath next hop */ 4618 static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt) 4619 { 4620 const struct net_device *dev = rt->fib6_nh.nh_dev; 4621 struct rtnexthop *rtnh; 4622 unsigned int flags = 0; 4623 4624 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh)); 4625 if (!rtnh) 4626 goto nla_put_failure; 4627 4628 rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1; 4629 rtnh->rtnh_ifindex = dev ? 
dev->ifindex : 0; 4630 4631 if (rt6_nexthop_info(skb, rt, &flags, true) < 0) 4632 goto nla_put_failure; 4633 4634 rtnh->rtnh_flags = flags; 4635 4636 /* length of rtnetlink header + attributes */ 4637 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh; 4638 4639 return 0; 4640 4641 nla_put_failure: 4642 return -EMSGSIZE; 4643 } 4644 4645 static int rt6_fill_node(struct net *net, struct sk_buff *skb, 4646 struct fib6_info *rt, struct dst_entry *dst, 4647 struct in6_addr *dest, struct in6_addr *src, 4648 int iif, int type, u32 portid, u32 seq, 4649 unsigned int flags) 4650 { 4651 struct rtmsg *rtm; 4652 struct nlmsghdr *nlh; 4653 long expires = 0; 4654 u32 *pmetrics; 4655 u32 table; 4656 4657 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags); 4658 if (!nlh) 4659 return -EMSGSIZE; 4660 4661 rtm = nlmsg_data(nlh); 4662 rtm->rtm_family = AF_INET6; 4663 rtm->rtm_dst_len = rt->fib6_dst.plen; 4664 rtm->rtm_src_len = rt->fib6_src.plen; 4665 rtm->rtm_tos = 0; 4666 if (rt->fib6_table) 4667 table = rt->fib6_table->tb6_id; 4668 else 4669 table = RT6_TABLE_UNSPEC; 4670 rtm->rtm_table = table; 4671 if (nla_put_u32(skb, RTA_TABLE, table)) 4672 goto nla_put_failure; 4673 4674 rtm->rtm_type = rt->fib6_type; 4675 rtm->rtm_flags = 0; 4676 rtm->rtm_scope = RT_SCOPE_UNIVERSE; 4677 rtm->rtm_protocol = rt->fib6_protocol; 4678 4679 if (rt->fib6_flags & RTF_CACHE) 4680 rtm->rtm_flags |= RTM_F_CLONED; 4681 4682 if (dest) { 4683 if (nla_put_in6_addr(skb, RTA_DST, dest)) 4684 goto nla_put_failure; 4685 rtm->rtm_dst_len = 128; 4686 } else if (rtm->rtm_dst_len) 4687 if (nla_put_in6_addr(skb, RTA_DST, &rt->fib6_dst.addr)) 4688 goto nla_put_failure; 4689 #ifdef CONFIG_IPV6_SUBTREES 4690 if (src) { 4691 if (nla_put_in6_addr(skb, RTA_SRC, src)) 4692 goto nla_put_failure; 4693 rtm->rtm_src_len = 128; 4694 } else if (rtm->rtm_src_len && 4695 nla_put_in6_addr(skb, RTA_SRC, &rt->fib6_src.addr)) 4696 goto nla_put_failure; 4697 #endif 4698 if (iif) { 4699 #ifdef CONFIG_IPV6_MROUTE 4700 if (ipv6_addr_is_multicast(&rt->fib6_dst.addr)) { 4701 int err = ip6mr_get_route(net, skb, rtm, portid); 4702 4703 if (err == 0) 4704 return 0; 4705 if (err < 0) 4706 goto nla_put_failure; 4707 } else 4708 #endif 4709 if (nla_put_u32(skb, RTA_IIF, iif)) 4710 goto nla_put_failure; 4711 } else if (dest) { 4712 struct in6_addr saddr_buf; 4713 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 && 4714 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 4715 goto nla_put_failure; 4716 } 4717 4718 if (rt->fib6_prefsrc.plen) { 4719 struct in6_addr saddr_buf; 4720 saddr_buf = rt->fib6_prefsrc.addr; 4721 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 4722 goto nla_put_failure; 4723 } 4724 4725 pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics; 4726 if (rtnetlink_put_metrics(skb, pmetrics) < 0) 4727 goto nla_put_failure; 4728 4729 if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric)) 4730 goto nla_put_failure; 4731 4732 /* For multipath routes, walk the siblings list and add 4733 * each as a nexthop within RTA_MULTIPATH. 
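* The resulting attribute layout is roughly:
*
*   RTA_MULTIPATH
*     struct rtnexthop   (weight, ifindex, flags, len)
*       RTA_GATEWAY / RTA_ENCAP ...   (per-nexthop attributes)
*     struct rtnexthop
*       ...
*
* (illustrative sketch; rt6_add_nexthop() above does the exact
* encoding)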
4734 */ 4735 if (rt->fib6_nsiblings) { 4736 struct fib6_info *sibling, *next_sibling; 4737 struct nlattr *mp; 4738 4739 mp = nla_nest_start(skb, RTA_MULTIPATH); 4740 if (!mp) 4741 goto nla_put_failure; 4742 4743 if (rt6_add_nexthop(skb, rt) < 0) 4744 goto nla_put_failure; 4745 4746 list_for_each_entry_safe(sibling, next_sibling, 4747 &rt->fib6_siblings, fib6_siblings) { 4748 if (rt6_add_nexthop(skb, sibling) < 0) 4749 goto nla_put_failure; 4750 } 4751 4752 nla_nest_end(skb, mp); 4753 } else { 4754 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0) 4755 goto nla_put_failure; 4756 } 4757 4758 if (rt->fib6_flags & RTF_EXPIRES) { 4759 expires = dst ? dst->expires : rt->expires; 4760 expires -= jiffies; 4761 } 4762 4763 if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0) 4764 goto nla_put_failure; 4765 4766 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->fib6_flags))) 4767 goto nla_put_failure; 4768 4769 4770 nlmsg_end(skb, nlh); 4771 return 0; 4772 4773 nla_put_failure: 4774 nlmsg_cancel(skb, nlh); 4775 return -EMSGSIZE; 4776 } 4777 4778 int rt6_dump_route(struct fib6_info *rt, void *p_arg) 4779 { 4780 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; 4781 struct net *net = arg->net; 4782 4783 if (rt == net->ipv6.fib6_null_entry) 4784 return 0; 4785 4786 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) { 4787 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh); 4788 4789 /* user wants prefix routes only */ 4790 if (rtm->rtm_flags & RTM_F_PREFIX && 4791 !(rt->fib6_flags & RTF_PREFIX_RT)) { 4792 /* success since this is not a prefix route */ 4793 return 1; 4794 } 4795 } 4796 4797 return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0, 4798 RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid, 4799 arg->cb->nlh->nlmsg_seq, NLM_F_MULTI); 4800 } 4801 4802 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, 4803 struct netlink_ext_ack *extack) 4804 { 4805 struct net *net = sock_net(in_skb->sk); 4806 struct nlattr *tb[RTA_MAX+1]; 4807 int err, iif = 0, oif = 0; 4808 struct fib6_info *from; 4809 struct dst_entry *dst; 4810 struct rt6_info *rt; 4811 struct sk_buff *skb; 4812 struct rtmsg *rtm; 4813 struct flowi6 fl6; 4814 bool fibmatch; 4815 4816 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, 4817 extack); 4818 if (err < 0) 4819 goto errout; 4820 4821 err = -EINVAL; 4822 memset(&fl6, 0, sizeof(fl6)); 4823 rtm = nlmsg_data(nlh); 4824 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0); 4825 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH); 4826 4827 if (tb[RTA_SRC]) { 4828 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr)) 4829 goto errout; 4830 4831 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]); 4832 } 4833 4834 if (tb[RTA_DST]) { 4835 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr)) 4836 goto errout; 4837 4838 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]); 4839 } 4840 4841 if (tb[RTA_IIF]) 4842 iif = nla_get_u32(tb[RTA_IIF]); 4843 4844 if (tb[RTA_OIF]) 4845 oif = nla_get_u32(tb[RTA_OIF]); 4846 4847 if (tb[RTA_MARK]) 4848 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]); 4849 4850 if (tb[RTA_UID]) 4851 fl6.flowi6_uid = make_kuid(current_user_ns(), 4852 nla_get_u32(tb[RTA_UID])); 4853 else 4854 fl6.flowi6_uid = iif ? 
INVALID_UID : current_uid(); 4855 4856 if (tb[RTA_SPORT]) 4857 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]); 4858 4859 if (tb[RTA_DPORT]) 4860 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]); 4861 4862 if (tb[RTA_IP_PROTO]) { 4863 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO], 4864 &fl6.flowi6_proto, extack); 4865 if (err) 4866 goto errout; 4867 } 4868 4869 if (iif) { 4870 struct net_device *dev; 4871 int flags = 0; 4872 4873 rcu_read_lock(); 4874 4875 dev = dev_get_by_index_rcu(net, iif); 4876 if (!dev) { 4877 rcu_read_unlock(); 4878 err = -ENODEV; 4879 goto errout; 4880 } 4881 4882 fl6.flowi6_iif = iif; 4883 4884 if (!ipv6_addr_any(&fl6.saddr)) 4885 flags |= RT6_LOOKUP_F_HAS_SADDR; 4886 4887 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags); 4888 4889 rcu_read_unlock(); 4890 } else { 4891 fl6.flowi6_oif = oif; 4892 4893 dst = ip6_route_output(net, NULL, &fl6); 4894 } 4895 4896 4897 rt = container_of(dst, struct rt6_info, dst); 4898 if (rt->dst.error) { 4899 err = rt->dst.error; 4900 ip6_rt_put(rt); 4901 goto errout; 4902 } 4903 4904 if (rt == net->ipv6.ip6_null_entry) { 4905 err = rt->dst.error; 4906 ip6_rt_put(rt); 4907 goto errout; 4908 } 4909 4910 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 4911 if (!skb) { 4912 ip6_rt_put(rt); 4913 err = -ENOBUFS; 4914 goto errout; 4915 } 4916 4917 skb_dst_set(skb, &rt->dst); 4918 4919 rcu_read_lock(); 4920 from = rcu_dereference(rt->from); 4921 4922 if (fibmatch) 4923 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif, 4924 RTM_NEWROUTE, NETLINK_CB(in_skb).portid, 4925 nlh->nlmsg_seq, 0); 4926 else 4927 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr, 4928 &fl6.saddr, iif, RTM_NEWROUTE, 4929 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 4930 0); 4931 rcu_read_unlock(); 4932 4933 if (err < 0) { 4934 kfree_skb(skb); 4935 goto errout; 4936 } 4937 4938 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); 4939 errout: 4940 return err; 4941 } 4942 4943 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, 4944 unsigned int nlm_flags) 4945 { 4946 struct sk_buff *skb; 4947 struct net *net = info->nl_net; 4948 u32 seq; 4949 int err; 4950 4951 err = -ENOBUFS; 4952 seq = info->nlh ? 
info->nlh->nlmsg_seq : 0; 4953 4954 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 4955 if (!skb) 4956 goto errout; 4957 4958 err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, 4959 event, info->portid, seq, nlm_flags); 4960 if (err < 0) { 4961 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ 4962 WARN_ON(err == -EMSGSIZE); 4963 kfree_skb(skb); 4964 goto errout; 4965 } 4966 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 4967 info->nlh, gfp_any()); 4968 return; 4969 errout: 4970 if (err < 0) 4971 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); 4972 } 4973 4974 static int ip6_route_dev_notify(struct notifier_block *this, 4975 unsigned long event, void *ptr) 4976 { 4977 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 4978 struct net *net = dev_net(dev); 4979 4980 if (!(dev->flags & IFF_LOOPBACK)) 4981 return NOTIFY_OK; 4982 4983 if (event == NETDEV_REGISTER) { 4984 net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev; 4985 net->ipv6.ip6_null_entry->dst.dev = dev; 4986 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev); 4987 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 4988 net->ipv6.ip6_prohibit_entry->dst.dev = dev; 4989 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev); 4990 net->ipv6.ip6_blk_hole_entry->dst.dev = dev; 4991 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev); 4992 #endif 4993 } else if (event == NETDEV_UNREGISTER && 4994 dev->reg_state != NETREG_UNREGISTERED) { 4995 /* NETDEV_UNREGISTER could be fired for multiple times by 4996 * netdev_wait_allrefs(). Make sure we only call this once. 4997 */ 4998 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev); 4999 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5000 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev); 5001 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev); 5002 #endif 5003 } 5004 5005 return NOTIFY_OK; 5006 } 5007 5008 /* 5009 * /proc 5010 */ 5011 5012 #ifdef CONFIG_PROC_FS 5013 static int rt6_stats_seq_show(struct seq_file *seq, void *v) 5014 { 5015 struct net *net = (struct net *)seq->private; 5016 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n", 5017 net->ipv6.rt6_stats->fib_nodes, 5018 net->ipv6.rt6_stats->fib_route_nodes, 5019 atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc), 5020 net->ipv6.rt6_stats->fib_rt_entries, 5021 net->ipv6.rt6_stats->fib_rt_cache, 5022 dst_entries_get_slow(&net->ipv6.ip6_dst_ops), 5023 net->ipv6.rt6_stats->fib_discarded_routes); 5024 5025 return 0; 5026 } 5027 #endif /* CONFIG_PROC_FS */ 5028 5029 #ifdef CONFIG_SYSCTL 5030 5031 static 5032 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, 5033 void __user *buffer, size_t *lenp, loff_t *ppos) 5034 { 5035 struct net *net; 5036 int delay; 5037 if (!write) 5038 return -EINVAL; 5039 5040 net = (struct net *)ctl->extra1; 5041 delay = net->ipv6.sysctl.flush_delay; 5042 proc_dointvec(ctl, write, buffer, lenp, ppos); 5043 fib6_run_gc(delay <= 0 ? 
0 : (unsigned long)delay, net, delay > 0); 5044 return 0; 5045 } 5046 5047 struct ctl_table ipv6_route_table_template[] = { 5048 { 5049 .procname = "flush", 5050 .data = &init_net.ipv6.sysctl.flush_delay, 5051 .maxlen = sizeof(int), 5052 .mode = 0200, 5053 .proc_handler = ipv6_sysctl_rtcache_flush 5054 }, 5055 { 5056 .procname = "gc_thresh", 5057 .data = &ip6_dst_ops_template.gc_thresh, 5058 .maxlen = sizeof(int), 5059 .mode = 0644, 5060 .proc_handler = proc_dointvec, 5061 }, 5062 { 5063 .procname = "max_size", 5064 .data = &init_net.ipv6.sysctl.ip6_rt_max_size, 5065 .maxlen = sizeof(int), 5066 .mode = 0644, 5067 .proc_handler = proc_dointvec, 5068 }, 5069 { 5070 .procname = "gc_min_interval", 5071 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 5072 .maxlen = sizeof(int), 5073 .mode = 0644, 5074 .proc_handler = proc_dointvec_jiffies, 5075 }, 5076 { 5077 .procname = "gc_timeout", 5078 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout, 5079 .maxlen = sizeof(int), 5080 .mode = 0644, 5081 .proc_handler = proc_dointvec_jiffies, 5082 }, 5083 { 5084 .procname = "gc_interval", 5085 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval, 5086 .maxlen = sizeof(int), 5087 .mode = 0644, 5088 .proc_handler = proc_dointvec_jiffies, 5089 }, 5090 { 5091 .procname = "gc_elasticity", 5092 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity, 5093 .maxlen = sizeof(int), 5094 .mode = 0644, 5095 .proc_handler = proc_dointvec, 5096 }, 5097 { 5098 .procname = "mtu_expires", 5099 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires, 5100 .maxlen = sizeof(int), 5101 .mode = 0644, 5102 .proc_handler = proc_dointvec_jiffies, 5103 }, 5104 { 5105 .procname = "min_adv_mss", 5106 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss, 5107 .maxlen = sizeof(int), 5108 .mode = 0644, 5109 .proc_handler = proc_dointvec, 5110 }, 5111 { 5112 .procname = "gc_min_interval_ms", 5113 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 5114 .maxlen = sizeof(int), 5115 .mode = 0644, 5116 .proc_handler = proc_dointvec_ms_jiffies, 5117 }, 5118 { } 5119 }; 5120 5121 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net) 5122 { 5123 struct ctl_table *table; 5124 5125 table = kmemdup(ipv6_route_table_template, 5126 sizeof(ipv6_route_table_template), 5127 GFP_KERNEL); 5128 5129 if (table) { 5130 table[0].data = &net->ipv6.sysctl.flush_delay; 5131 table[0].extra1 = net; 5132 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh; 5133 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size; 5134 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; 5135 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout; 5136 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval; 5137 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity; 5138 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires; 5139 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss; 5140 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; 5141 5142 /* Don't export sysctls to unprivileged users */ 5143 if (net->user_ns != &init_user_ns) 5144 table[0].procname = NULL; 5145 } 5146 5147 return table; 5148 } 5149 #endif 5150 5151 static int __net_init ip6_route_net_init(struct net *net) 5152 { 5153 int ret = -ENOMEM; 5154 5155 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template, 5156 sizeof(net->ipv6.ip6_dst_ops)); 5157 5158 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0) 5159 goto out_ip6_dst_ops; 5160 5161 net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template, 5162 sizeof(*net->ipv6.fib6_null_entry), 5163 GFP_KERNEL); 5164 if (!net->ipv6.fib6_null_entry) 5165 goto 
out_ip6_dst_entries; 5166 5167 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template, 5168 sizeof(*net->ipv6.ip6_null_entry), 5169 GFP_KERNEL); 5170 if (!net->ipv6.ip6_null_entry) 5171 goto out_fib6_null_entry; 5172 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5173 dst_init_metrics(&net->ipv6.ip6_null_entry->dst, 5174 ip6_template_metrics, true); 5175 5176 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5177 net->ipv6.fib6_has_custom_rules = false; 5178 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template, 5179 sizeof(*net->ipv6.ip6_prohibit_entry), 5180 GFP_KERNEL); 5181 if (!net->ipv6.ip6_prohibit_entry) 5182 goto out_ip6_null_entry; 5183 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5184 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst, 5185 ip6_template_metrics, true); 5186 5187 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template, 5188 sizeof(*net->ipv6.ip6_blk_hole_entry), 5189 GFP_KERNEL); 5190 if (!net->ipv6.ip6_blk_hole_entry) 5191 goto out_ip6_prohibit_entry; 5192 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5193 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, 5194 ip6_template_metrics, true); 5195 #endif 5196 5197 net->ipv6.sysctl.flush_delay = 0; 5198 net->ipv6.sysctl.ip6_rt_max_size = 4096; 5199 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2; 5200 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ; 5201 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ; 5202 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9; 5203 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ; 5204 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; 5205 5206 net->ipv6.ip6_rt_gc_expire = 30*HZ; 5207 5208 ret = 0; 5209 out: 5210 return ret; 5211 5212 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5213 out_ip6_prohibit_entry: 5214 kfree(net->ipv6.ip6_prohibit_entry); 5215 out_ip6_null_entry: 5216 kfree(net->ipv6.ip6_null_entry); 5217 #endif 5218 out_fib6_null_entry: 5219 kfree(net->ipv6.fib6_null_entry); 5220 out_ip6_dst_entries: 5221 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 5222 out_ip6_dst_ops: 5223 goto out; 5224 } 5225 5226 static void __net_exit ip6_route_net_exit(struct net *net) 5227 { 5228 kfree(net->ipv6.fib6_null_entry); 5229 kfree(net->ipv6.ip6_null_entry); 5230 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5231 kfree(net->ipv6.ip6_prohibit_entry); 5232 kfree(net->ipv6.ip6_blk_hole_entry); 5233 #endif 5234 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 5235 } 5236 5237 static int __net_init ip6_route_net_init_late(struct net *net) 5238 { 5239 #ifdef CONFIG_PROC_FS 5240 proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops, 5241 sizeof(struct ipv6_route_iter)); 5242 proc_create_net_single("rt6_stats", 0444, net->proc_net, 5243 rt6_stats_seq_show, NULL); 5244 #endif 5245 return 0; 5246 } 5247 5248 static void __net_exit ip6_route_net_exit_late(struct net *net) 5249 { 5250 #ifdef CONFIG_PROC_FS 5251 remove_proc_entry("ipv6_route", net->proc_net); 5252 remove_proc_entry("rt6_stats", net->proc_net); 5253 #endif 5254 } 5255 5256 static struct pernet_operations ip6_route_net_ops = { 5257 .init = ip6_route_net_init, 5258 .exit = ip6_route_net_exit, 5259 }; 5260 5261 static int __net_init ipv6_inetpeer_init(struct net *net) 5262 { 5263 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL); 5264 5265 if (!bp) 5266 return -ENOMEM; 5267 inet_peer_base_init(bp); 5268 net->ipv6.peers = bp; 5269 return 0; 5270 } 5271 5272 static void __net_exit ipv6_inetpeer_exit(struct net *net) 5273 { 5274 struct inet_peer_base *bp = net->ipv6.peers; 5275 5276 
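/* Detach the base from the namespace before invalidating and freeing
 * it, so that nothing can observe a base that is going away.
 */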
net->ipv6.peers = NULL; 5277 inetpeer_invalidate_tree(bp); 5278 kfree(bp); 5279 } 5280 5281 static struct pernet_operations ipv6_inetpeer_ops = { 5282 .init = ipv6_inetpeer_init, 5283 .exit = ipv6_inetpeer_exit, 5284 }; 5285 5286 static struct pernet_operations ip6_route_net_late_ops = { 5287 .init = ip6_route_net_init_late, 5288 .exit = ip6_route_net_exit_late, 5289 }; 5290 5291 static struct notifier_block ip6_route_dev_notifier = { 5292 .notifier_call = ip6_route_dev_notify, 5293 .priority = ADDRCONF_NOTIFY_PRIORITY - 10, 5294 }; 5295 5296 void __init ip6_route_init_special_entries(void) 5297 { 5298 /* Registering of the loopback is done before this portion of code, 5299 * the loopback reference in rt6_info will not be taken, do it 5300 * manually for init_net */ 5301 init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev; 5302 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; 5303 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5304 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5305 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev; 5306 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5307 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev; 5308 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5309 #endif 5310 } 5311 5312 int __init ip6_route_init(void) 5313 { 5314 int ret; 5315 int cpu; 5316 5317 ret = -ENOMEM; 5318 ip6_dst_ops_template.kmem_cachep = 5319 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0, 5320 SLAB_HWCACHE_ALIGN, NULL); 5321 if (!ip6_dst_ops_template.kmem_cachep) 5322 goto out; 5323 5324 ret = dst_entries_init(&ip6_dst_blackhole_ops); 5325 if (ret) 5326 goto out_kmem_cache; 5327 5328 ret = register_pernet_subsys(&ipv6_inetpeer_ops); 5329 if (ret) 5330 goto out_dst_entries; 5331 5332 ret = register_pernet_subsys(&ip6_route_net_ops); 5333 if (ret) 5334 goto out_register_inetpeer; 5335 5336 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep; 5337 5338 ret = fib6_init(); 5339 if (ret) 5340 goto out_register_subsys; 5341 5342 ret = xfrm6_init(); 5343 if (ret) 5344 goto out_fib6_init; 5345 5346 ret = fib6_rules_init(); 5347 if (ret) 5348 goto xfrm6_init; 5349 5350 ret = register_pernet_subsys(&ip6_route_net_late_ops); 5351 if (ret) 5352 goto fib6_rules_init; 5353 5354 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE, 5355 inet6_rtm_newroute, NULL, 0); 5356 if (ret < 0) 5357 goto out_register_late_subsys; 5358 5359 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE, 5360 inet6_rtm_delroute, NULL, 0); 5361 if (ret < 0) 5362 goto out_register_late_subsys; 5363 5364 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, 5365 inet6_rtm_getroute, NULL, 5366 RTNL_FLAG_DOIT_UNLOCKED); 5367 if (ret < 0) 5368 goto out_register_late_subsys; 5369 5370 ret = register_netdevice_notifier(&ip6_route_dev_notifier); 5371 if (ret) 5372 goto out_register_late_subsys; 5373 5374 for_each_possible_cpu(cpu) { 5375 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); 5376 5377 INIT_LIST_HEAD(&ul->head); 5378 spin_lock_init(&ul->lock); 5379 } 5380 5381 out: 5382 return ret; 5383 5384 out_register_late_subsys: 5385 rtnl_unregister_all(PF_INET6); 5386 unregister_pernet_subsys(&ip6_route_net_late_ops); 5387 fib6_rules_init: 5388 fib6_rules_cleanup(); 5389 xfrm6_init: 5390 xfrm6_fini(); 5391 out_fib6_init: 5392 fib6_gc_cleanup(); 5393 out_register_subsys: 5394 
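/* Error unwind: each label below releases exactly what had been set
 * up before the corresponding failure point, in reverse order of
 * registration.
 */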
unregister_pernet_subsys(&ip6_route_net_ops); 5395 out_register_inetpeer: 5396 unregister_pernet_subsys(&ipv6_inetpeer_ops); 5397 out_dst_entries: 5398 dst_entries_destroy(&ip6_dst_blackhole_ops); 5399 out_kmem_cache: 5400 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); 5401 goto out; 5402 } 5403 5404 void ip6_route_cleanup(void) 5405 { 5406 unregister_netdevice_notifier(&ip6_route_dev_notifier); 5407 unregister_pernet_subsys(&ip6_route_net_late_ops); 5408 fib6_rules_cleanup(); 5409 xfrm6_fini(); 5410 fib6_gc_cleanup(); 5411 unregister_pernet_subsys(&ipv6_inetpeer_ops); 5412 unregister_pernet_subsys(&ip6_route_net_ops); 5413 dst_entries_destroy(&ip6_dst_blackhole_ops); 5414 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); 5415 } 5416
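/* Usage sketch (editorial note, not part of the original source): the
 * rtnl_register_module() calls in ip6_route_init() are what service
 * userspace requests such as the following iproute2 commands
 * (addresses and device names are illustrative):
 *
 *   ip -6 route add 2001:db8::/64 via fe80::1 dev eth0  -> RTM_NEWROUTE -> inet6_rtm_newroute()
 *   ip -6 route del 2001:db8::/64                       -> RTM_DELROUTE -> inet6_rtm_delroute()
 *   ip -6 route get 2001:db8::1                         -> RTM_GETROUTE -> inet6_rtm_getroute()
 *
 * Multipath add/del requests ("... nexthop via fe80::1 nexthop via
 * fe80::2") arrive through the same handlers and are dispatched to
 * ip6_route_multipath_add()/ip6_route_multipath_del() when an
 * RTA_MULTIPATH attribute is present.
 */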