/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable.  otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <net/ip.h>
#include <linux/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

static int ip6_rt_type_to_error(u8 fib6_type);

#define CREATE_TRACE_POINTS
#include <trace/events/fib6.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
#undef CREATE_TRACE_POINTS

enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};

static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
static unsigned int	 ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
static int		 ip6_dst_gc(struct dst_ops *ops);

static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int		ip6_pkt_prohibit(struct sk_buff *skb);
static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct fib6_info *rt);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_add_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev,
					    unsigned int pref);
static struct fib6_info *rt6_get_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev);
#endif
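/* Note on the list below: rt6_info dsts that live outside the fib6 tree
 * (e.g. the RTF_CACHE clones created for FLOWI_FLAG_KNOWN_NH lookups in
 * ip6_pol_route()) are tracked on per-cpu "uncached" lists, so that when
 * a device goes away rt6_uncached_list_flush_dev() can retarget their
 * dev/idev references to the loopback device instead of pinning the
 * vanishing device. Per-cpu list heads keep add/del largely contention-free.
 */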
struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);

void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}

static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}

static inline const void *choose_neigh_daddr(const struct in6_addr *p,
					     struct sk_buff *skb,
					     const void *daddr)
{
	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}

struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
{
	struct neighbour *n;

	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	if (n)
		return n;
	return neigh_create(&nd_tbl, daddr, dev);
}

static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
}

static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}
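/* dst_ops vtable wired into every rt6_info created via ip6_dst_alloc().
 * A second, reduced table (ip6_dst_blackhole_ops further below) backs the
 * dsts produced by ip6_blackhole_route(), which must swallow packets
 * without updating PMTU state or following redirects.
 */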
static struct dst_ops ip6_dst_ops_template = {
	.family			= AF_INET6,
	.gc			= ip6_dst_gc,
	.gc_thresh		= 1024,
	.check			= ip6_dst_check,
	.default_advmss		= ip6_default_advmss,
	.mtu			= ip6_mtu,
	.cow_metrics		= dst_cow_metrics_generic,
	.destroy		= ip6_dst_destroy,
	.ifdown			= ip6_dst_ifdown,
	.negative_advice	= ip6_negative_advice,
	.link_failure		= ip6_link_failure,
	.update_pmtu		= ip6_rt_update_pmtu,
	.redirect		= rt6_do_redirect,
	.local_out		= __ip6_local_out,
	.neigh_lookup		= ip6_dst_neigh_lookup,
	.confirm_neigh		= ip6_confirm_neigh,
};

static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

static struct dst_ops ip6_dst_blackhole_ops = {
	.family			= AF_INET6,
	.destroy		= ip6_dst_destroy,
	.check			= ip6_dst_check,
	.mtu			= ip6_blackhole_mtu,
	.default_advmss		= ip6_default_advmss,
	.update_pmtu		= ip6_rt_blackhole_update_pmtu,
	.redirect		= ip6_rt_blackhole_redirect,
	.cow_metrics		= dst_cow_metrics_generic,
	.neigh_lookup		= ip6_dst_neigh_lookup,
};

static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol	= RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};

static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif

static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}
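/* Every dst below is allocated with an initial reference and with
 * ->obsolete set to DST_OBSOLETE_FORCE_CHK, so dst_check() callers are
 * always routed through ip6_dst_check() (see the comment there).
 */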
/* allocate dst with ip6_dst_ops */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);

static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	rcu_assign_pointer(rt->from, NULL);
	fib6_info_release(from);
	rcu_read_unlock();
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}

static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}
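/* Multipath sibling selection. Each sibling carries a precomputed upper
 * bound (fib6_nh.nh_upper_bound) splitting the 31-bit flow-hash space in
 * proportion to the nexthop weights; a flow goes to the first sibling
 * whose bound is >= its hash, i.e. hash-threshold selection in the spirit
 * of RFC 2992. E.g. with two equal-weight nexthops the bounds land at
 * roughly 0x3fffffff and 0x7fffffff, so each takes about half the flows.
 */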
struct fib6_info *fib6_multipath_select(const struct net *net,
					struct fib6_info *match,
					struct flowi6 *fl6, int oif,
					const struct sk_buff *skb,
					int strict)
{
	struct fib6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * a case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}

/*
 *	Route lookup. rcu_read_lock() should be held.
 */

static inline struct fib6_info *rt6_device_match(struct net *net,
						 struct fib6_info *rt,
						 const struct in6_addr *saddr,
						 int oif,
						 int flags)
{
	struct fib6_info *sprt;

	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
		const struct net_device *dev = sprt->fib6_nh.nh_dev;

		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE)
		return net->ipv6.fib6_null_entry;

	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
}

#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}
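/* rt6_probe() below is called from the lookup fast path, which runs under
 * rcu_read_lock_bh() and may take neigh->lock, so the actual neighbour
 * solicitation is deferred to process context via rt6_probe_deferred()
 * above rather than sent inline. The probe rate is bounded by
 * idev->cnf.rtr_probe_interval, matching the rate-limit note below.
 */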
static void rt6_probe(struct fib6_info *rt)
{
	struct __rt6_probe_work *work;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
		return;

	nh_gw = &rt->fib6_nh.nh_gw;
	dev = rt->fib6_nh.nh_dev;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		struct inet6_dev *idev;

		if (neigh->nud_state & NUD_VALID)
			goto out;

		idev = __in6_dev_get(dev);
		work = NULL;
		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct fib6_info *rt)
{
}
#endif

/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
static inline int rt6_check_dev(struct fib6_info *rt, int oif)
{
	const struct net_device *dev = rt->fib6_nh.nh_dev;

	if (!oif || dev->ifindex == oif)
		return 2;
	return 0;
}

static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	if (rt->fib6_flags & RTF_NONEXTHOP ||
	    !(rt->fib6_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
					  &rt->fib6_nh.nh_gw);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}

static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}

/* called with rcu_read_lock held */
static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
{
	const struct net_device *dev = fib6_info_nh_dev(f6i);
	bool rc = false;

	if (dev) {
		const struct inet6_dev *idev = __in6_dev_get(dev);

		rc = !!idev->cnf.ignore_routes_with_linkdown;
	}

	return rc;
}

static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
				    int *mpri, struct fib6_info *match,
				    bool *do_rr)
{
	int m;
	bool match_do_rr = false;

	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		goto out;

	if (fib6_ignore_linkdown(rt) &&
	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (fib6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}

static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
				      struct fib6_info *leaf,
				      struct fib6_info *rr_head,
				      u32 metric, int oif, int strict,
				      bool *do_rr)
{
	struct fib6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
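/* Round-robin default router selection (see the changelog at the top of
 * this file): fn->rr_ptr remembers where the previous lookup stopped among
 * equal-metric siblings. find_rr_leaf() scores candidates starting there,
 * and when find_match() reports RT6_NUD_FAIL_DO_RR the pointer is advanced
 * so the next lookup starts at the following router, spreading traffic
 * across (probably) reachable routers.
 */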
static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
				    int oif, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.fib6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.fib6_null_entry;
}

static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
{
	return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
}

#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
#endif

/*
 *	Misc support functions
 */

/* called with rcu_lock held */
static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
{
	struct net_device *dev = rt->fib6_nh.nh_dev;

	if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device itself if it is a master device, the master
		 * device if the device is enslaved, and the loopback
		 * device as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* the last case is when netif_is_l3_master(dev) is true,
		 * in which case dev itself is returned
		 */
	}

	return dev;
}
static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};

static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}

static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
{
	unsigned short flags = 0;

	if (rt->dst_nocount)
		flags |= DST_NOCOUNT;
	if (rt->dst_nopolicy)
		flags |= DST_NOPOLICY;
	if (rt->dst_host)
		flags |= DST_HOST;

	return flags;
}

static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);

	switch (ort->fib6_type) {
	case RTN_BLACKHOLE:
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}

static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.flags |= fib6_info_dst_flags(ort);

	if (ort->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, ort);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (ort->fib6_nh.nh_lwtstate) {
		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}

/* Caller must already hold reference to @from */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	rcu_assign_pointer(rt->from, from);
	dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
}
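/* rt6_info dst entries are flat copies of the fib6_info they were created
 * from: ip6_rt_copy_init() below snapshots the fib entry's fields, and
 * rt6_set_from() above records the backpointer (rt->from) that ties the
 * copy to its origin for cookie validation and expiry checks. The
 * fib6_info reference taken by the caller is dropped in ip6_dst_destroy().
 */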
/* Caller must already hold reference to @ort */
static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
{
	struct net_device *dev = fib6_info_nh_dev(ort);

	ip6_rt_init_dst(rt, ort);

	rt->rt6i_dst = ort->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
	rt->rt6i_flags = ort->fib6_flags;
	rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->fib6_src;
#endif
	rt->rt6i_prefsrc = ort->fib6_prefsrc;
}

static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}

static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
			  bool null_fallback)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (null_fallback) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}

/* called with rcu_lock held */
static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rt6_info *nrt;

	if (!fib6_info_hold_safe(rt))
		return NULL;

	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (nrt)
		ip6_rt_copy_init(nrt, rt);
	else
		fib6_info_release(rt);

	return nrt;
}

static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_info *f6i;
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	f6i = rcu_dereference(fn->leaf);
	if (!f6i) {
		f6i = net->ipv6.fib6_null_entry;
	} else {
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
				       fl6->flowi6_oif, flags);
		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
			f6i = fib6_multipath_select(net, f6i, fl6,
						    fl6->flowi6_oif, skb,
						    flags);
	}
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);
	} else if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = ip6_create_rt_rcu(f6i);
		if (!rt) {
			rt = net->ipv6.ip6_null_entry;
			dst_hold(&rt->dst);
		}
	}

	rcu_read_unlock();

	return rt;
}

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);

/* ip6_ins_rt is called with table->tb6_lock NOT held (it takes the lock
 * itself). It takes a new route entry; if the addition fails for any
 * reason, the route is released.
 * The caller must hold a dst reference before calling it.
 */

static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = { .nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}

static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (!fib6_info_hold_safe(ort))
		return NULL;

	dev = ip6_rt_get_dev_rcu(ort);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt) {
		fib6_info_release(ort);
		return NULL;
	}

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}

static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	if (!fib6_info_hold_safe(rt))
		return NULL;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt) {
		fib6_info_release(rt);
		return NULL;
	}
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt, false);

	return pcpu_rt;
}

static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	return pcpu_rt;
}
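/* The exception table below stores RTF_CACHE clones (PMTU exceptions and
 * redirects) keyed by destination - and, with subtrees, source - address.
 * Each fib6_info owns a lazily allocated array of
 * FIB6_EXCEPTION_BUCKET_SIZE hash buckets; readers walk it under RCU,
 * writers serialize on the global rt6_exception_lock spinlock.
 */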
/* exception hash table implementation
 */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
	net->ipv6.rt6_stats->fib_rt_cache--;
}

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}

static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

static unsigned int fib6_mtu(const struct fib6_info *rt)
{
	unsigned int mtu;

	if (rt->fib6_pmtu) {
		mtu = rt->fib6_pmtu;
	} else {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
}
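/* Insert nrt into ort's exception table. An existing exception for the
 * same (daddr, saddr) key is replaced, the bucket depth is capped at
 * FIB6_MAX_DEPTH by evicting the least recently refreshed entry, and on
 * success the fib6 node's sernum is bumped so that dsts cached against
 * the old state fail their next ip6_dst_check().
 */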
static int rt6_insert_exception(struct rt6_info *nrt,
				struct fib6_info *ort)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
	nrt->rt6i_prefsrc = ort->fib6_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->fib6_table->tb6_lock);
		fib6_update_sernum(net, ort);
		spin_unlock_bh(&ort->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}
void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() from recreating the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}

/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->fib6_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}
/* Remove the passed in cached rt from the hash table that contains it */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	from = rcu_dereference(rt->from);
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}

/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct fib6_info *from = rt->from;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;

	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return;

	rcu_read_lock();
	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

	rcu_read_unlock();
}

static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
				rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
			}
			bucket++;
		}
	}
}

static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
{
	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
	 *
	 * If the new MTU is higher, and the route PMTU is equal to the local
	 * MTU, this means the old MTU is the lowest in the path, so allow
	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
	 * handle this.
	 */

	if (dst_mtu(&rt->dst) >= mtu)
		return true;

	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
		return true;

	return false;
}

static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}

#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}
1663 * 1664 * If the new MTU is higher, and the route PMTU is equal to the local 1665 * MTU, this means the old MTU is the lowest in the path, so allow 1666 * updating it: if other nodes now have lower MTUs, PMTU discovery will 1667 * handle this. 1668 */ 1669 1670 if (dst_mtu(&rt->dst) >= mtu) 1671 return true; 1672 1673 if (dst_mtu(&rt->dst) == idev->cnf.mtu6) 1674 return true; 1675 1676 return false; 1677 } 1678 1679 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev, 1680 struct fib6_info *rt, int mtu) 1681 { 1682 struct rt6_exception_bucket *bucket; 1683 struct rt6_exception *rt6_ex; 1684 int i; 1685 1686 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1687 lockdep_is_held(&rt6_exception_lock)); 1688 1689 if (!bucket) 1690 return; 1691 1692 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { 1693 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { 1694 struct rt6_info *entry = rt6_ex->rt6i; 1695 1696 /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected 1697 * route), the metrics of its rt->from have already 1698 * been updated. 1699 */ 1700 if (dst_metric_raw(&entry->dst, RTAX_MTU) && 1701 rt6_mtu_change_route_allowed(idev, entry, mtu)) 1702 dst_metric_set(&entry->dst, RTAX_MTU, mtu); 1703 } 1704 bucket++; 1705 } 1706 } 1707 1708 #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE) 1709 1710 static void rt6_exceptions_clean_tohost(struct fib6_info *rt, 1711 struct in6_addr *gateway) 1712 { 1713 struct rt6_exception_bucket *bucket; 1714 struct rt6_exception *rt6_ex; 1715 struct hlist_node *tmp; 1716 int i; 1717 1718 if (!rcu_access_pointer(rt->rt6i_exception_bucket)) 1719 return; 1720 1721 spin_lock_bh(&rt6_exception_lock); 1722 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1723 lockdep_is_held(&rt6_exception_lock)); 1724 1725 if (bucket) { 1726 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { 1727 hlist_for_each_entry_safe(rt6_ex, tmp, 1728 &bucket->chain, hlist) { 1729 struct rt6_info *entry = rt6_ex->rt6i; 1730 1731 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) == 1732 RTF_CACHE_GATEWAY && 1733 ipv6_addr_equal(gateway, 1734 &entry->rt6i_gateway)) { 1735 rt6_remove_exception(bucket, rt6_ex); 1736 } 1737 } 1738 bucket++; 1739 } 1740 } 1741 1742 spin_unlock_bh(&rt6_exception_lock); 1743 } 1744 1745 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket, 1746 struct rt6_exception *rt6_ex, 1747 struct fib6_gc_args *gc_args, 1748 unsigned long now) 1749 { 1750 struct rt6_info *rt = rt6_ex->rt6i; 1751 1752 /* we are pruning and obsoleting aged-out and non gateway exceptions 1753 * even if others have still references to them, so that on next 1754 * dst_check() such references can be dropped. 1755 * EXPIRES exceptions - e.g. 
void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}

/* must be called with rcu lock held */
struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
				    int oif, struct flowi6 *fl6, int strict)
{
	struct fib6_node *fn, *saved_fn;
	struct fib6_info *f6i;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	f6i = rt6_select(net, fn, oif, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	return f6i;
}
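/* Main policy-routing lookup. After fib6_table_lookup() picks a fib entry,
 * the result dst is produced one of three ways: a matching exception
 * (RTF_CACHE clone) is returned directly; for FLOWI_FLAG_KNOWN_NH without
 * a gateway, an uncached clone is created and tracked on the per-cpu
 * uncached list; otherwise a per-cpu copy is returned, created on first
 * use.
 */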
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_info *f6i;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	f6i = fib6_table_lookup(net, table, oif, fl6, strict);
	if (f6i->fib6_nsiblings)
		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);

	if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		return rt;
	}

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(f6i->fib6_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		return uncached_rt;
	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(f6i);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, f6i);

		local_bh_enable();
		rcu_read_unlock();

		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);

static struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
}

struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6,
					 const struct sk_buff *skb,
					 int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
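/* For multipath hashing, ICMPv6 errors are hashed on the addresses of the
 * embedded (offending) packet rather than the outer one, so that errors
 * travel along the same path as the flow that triggered them and reach
 * the nexthop that carried that flow.
 */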
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowlabel(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}

/* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}

void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}

static struct rt6_info *ip6_pol_route_output(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
}

struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	if (rt6_need_strict(&fl6->daddr)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
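/* Replace dst_orig with a standalone copy that silently discards
 * everything it is asked to transmit (input/output are dst_discard*)
 * while keeping the original's metrics and addressing; callers such as
 * xfrm use this to park packets on a route without sending them.
 */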
dst; 2107 } 2108 2109 fl6->flowi6_iif = LOOPBACK_IFINDEX; 2110 2111 any_src = ipv6_addr_any(&fl6->saddr); 2112 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) || 2113 (fl6->flowi6_oif && any_src)) 2114 flags |= RT6_LOOKUP_F_IFACE; 2115 2116 if (!any_src) 2117 flags |= RT6_LOOKUP_F_HAS_SADDR; 2118 else if (sk) 2119 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs); 2120 2121 return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output); 2122 } 2123 EXPORT_SYMBOL_GPL(ip6_route_output_flags); 2124 2125 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) 2126 { 2127 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig; 2128 struct net_device *loopback_dev = net->loopback_dev; 2129 struct dst_entry *new = NULL; 2130 2131 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1, 2132 DST_OBSOLETE_DEAD, 0); 2133 if (rt) { 2134 rt6_info_init(rt); 2135 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); 2136 2137 new = &rt->dst; 2138 new->__use = 1; 2139 new->input = dst_discard; 2140 new->output = dst_discard_out; 2141 2142 dst_copy_metrics(new, &ort->dst); 2143 2144 rt->rt6i_idev = in6_dev_get(loopback_dev); 2145 rt->rt6i_gateway = ort->rt6i_gateway; 2146 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU; 2147 2148 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); 2149 #ifdef CONFIG_IPV6_SUBTREES 2150 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); 2151 #endif 2152 } 2153 2154 dst_release(dst_orig); 2155 return new ? new : ERR_PTR(-ENOMEM); 2156 } 2157 2158 /* 2159 * Destination cache support functions 2160 */ 2161 2162 static bool fib6_check(struct fib6_info *f6i, u32 cookie) 2163 { 2164 u32 rt_cookie = 0; 2165 2166 if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie) 2167 return false; 2168 2169 if (fib6_check_expired(f6i)) 2170 return false; 2171 2172 return true; 2173 } 2174 2175 static struct dst_entry *rt6_check(struct rt6_info *rt, 2176 struct fib6_info *from, 2177 u32 cookie) 2178 { 2179 u32 rt_cookie = 0; 2180 2181 if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) || 2182 rt_cookie != cookie) 2183 return NULL; 2184 2185 if (rt6_check_expired(rt)) 2186 return NULL; 2187 2188 return &rt->dst; 2189 } 2190 2191 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, 2192 struct fib6_info *from, 2193 u32 cookie) 2194 { 2195 if (!__rt6_check_expired(rt) && 2196 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && 2197 fib6_check(from, cookie)) 2198 return &rt->dst; 2199 else 2200 return NULL; 2201 } 2202 2203 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) 2204 { 2205 struct dst_entry *dst_ret; 2206 struct fib6_info *from; 2207 struct rt6_info *rt; 2208 2209 rt = container_of(dst, struct rt6_info, dst); 2210 2211 rcu_read_lock(); 2212 2213 /* All IPV6 dsts are created with ->obsolete set to the value 2214 * DST_OBSOLETE_FORCE_CHK which forces validation calls down 2215 * into this function always. 
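 *
 * A minimal sketch of how callers consume this (cf. dst_check() in
 * include/net/dst.h):
 *
 *	if (dst->obsolete)
 *		dst = dst->ops->check(dst, cookie);
 *
 * A NULL return from the ->check() hook is what forces a fresh route
 * lookup once the fib6 tree has changed (cookie mismatch) or the
 * entry has expired.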
2216 */ 2217 2218 from = rcu_dereference(rt->from); 2219 2220 if (from && (rt->rt6i_flags & RTF_PCPU || 2221 unlikely(!list_empty(&rt->rt6i_uncached)))) 2222 dst_ret = rt6_dst_from_check(rt, from, cookie); 2223 else 2224 dst_ret = rt6_check(rt, from, cookie); 2225 2226 rcu_read_unlock(); 2227 2228 return dst_ret; 2229 } 2230 2231 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) 2232 { 2233 struct rt6_info *rt = (struct rt6_info *) dst; 2234 2235 if (rt) { 2236 if (rt->rt6i_flags & RTF_CACHE) { 2237 rcu_read_lock(); 2238 if (rt6_check_expired(rt)) { 2239 rt6_remove_exception_rt(rt); 2240 dst = NULL; 2241 } 2242 rcu_read_unlock(); 2243 } else { 2244 dst_release(dst); 2245 dst = NULL; 2246 } 2247 } 2248 return dst; 2249 } 2250 2251 static void ip6_link_failure(struct sk_buff *skb) 2252 { 2253 struct rt6_info *rt; 2254 2255 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); 2256 2257 rt = (struct rt6_info *) skb_dst(skb); 2258 if (rt) { 2259 rcu_read_lock(); 2260 if (rt->rt6i_flags & RTF_CACHE) { 2261 if (dst_hold_safe(&rt->dst)) 2262 rt6_remove_exception_rt(rt); 2263 } else { 2264 struct fib6_info *from; 2265 struct fib6_node *fn; 2266 2267 from = rcu_dereference(rt->from); 2268 if (from) { 2269 fn = rcu_dereference(from->fib6_node); 2270 if (fn && (rt->rt6i_flags & RTF_DEFAULT)) 2271 fn->fn_sernum = -1; 2272 } 2273 } 2274 rcu_read_unlock(); 2275 } 2276 } 2277 2278 static void rt6_update_expires(struct rt6_info *rt0, int timeout) 2279 { 2280 if (!(rt0->rt6i_flags & RTF_EXPIRES)) { 2281 struct fib6_info *from; 2282 2283 rcu_read_lock(); 2284 from = rcu_dereference(rt0->from); 2285 if (from) 2286 rt0->dst.expires = from->expires; 2287 rcu_read_unlock(); 2288 } 2289 2290 dst_set_expires(&rt0->dst, timeout); 2291 rt0->rt6i_flags |= RTF_EXPIRES; 2292 } 2293 2294 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) 2295 { 2296 struct net *net = dev_net(rt->dst.dev); 2297 2298 dst_metric_set(&rt->dst, RTAX_MTU, mtu); 2299 rt->rt6i_flags |= RTF_MODIFIED; 2300 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires); 2301 } 2302 2303 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) 2304 { 2305 bool from_set; 2306 2307 rcu_read_lock(); 2308 from_set = !!rcu_dereference(rt->from); 2309 rcu_read_unlock(); 2310 2311 return !(rt->rt6i_flags & RTF_CACHE) && 2312 (rt->rt6i_flags & RTF_PCPU || from_set); 2313 } 2314 2315 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, 2316 const struct ipv6hdr *iph, u32 mtu) 2317 { 2318 const struct in6_addr *daddr, *saddr; 2319 struct rt6_info *rt6 = (struct rt6_info *)dst; 2320 2321 if (dst_metric_locked(dst, RTAX_MTU)) 2322 return; 2323 2324 if (iph) { 2325 daddr = &iph->daddr; 2326 saddr = &iph->saddr; 2327 } else if (sk) { 2328 daddr = &sk->sk_v6_daddr; 2329 saddr = &inet6_sk(sk)->saddr; 2330 } else { 2331 daddr = NULL; 2332 saddr = NULL; 2333 } 2334 dst_confirm_neigh(dst, daddr); 2335 mtu = max_t(u32, mtu, IPV6_MIN_MTU); 2336 if (mtu >= dst_mtu(dst)) 2337 return; 2338 2339 if (!rt6_cache_allowed_for_pmtu(rt6)) { 2340 rt6_do_update_pmtu(rt6, mtu); 2341 /* update rt6_ex->stamp for cache */ 2342 if (rt6->rt6i_flags & RTF_CACHE) 2343 rt6_update_exception_stamp_rt(rt6); 2344 } else if (daddr) { 2345 struct fib6_info *from; 2346 struct rt6_info *nrt6; 2347 2348 rcu_read_lock(); 2349 from = rcu_dereference(rt6->from); 2350 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr); 2351 if (nrt6) { 2352 rt6_do_update_pmtu(nrt6, mtu); 2353 if (rt6_insert_exception(nrt6, from)) 2354 
dst_release_immediate(&nrt6->dst); 2355 } 2356 rcu_read_unlock(); 2357 } 2358 } 2359 2360 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, 2361 struct sk_buff *skb, u32 mtu) 2362 { 2363 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu); 2364 } 2365 2366 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, 2367 int oif, u32 mark, kuid_t uid) 2368 { 2369 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2370 struct dst_entry *dst; 2371 struct flowi6 fl6; 2372 2373 memset(&fl6, 0, sizeof(fl6)); 2374 fl6.flowi6_oif = oif; 2375 fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark); 2376 fl6.daddr = iph->daddr; 2377 fl6.saddr = iph->saddr; 2378 fl6.flowlabel = ip6_flowinfo(iph); 2379 fl6.flowi6_uid = uid; 2380 2381 dst = ip6_route_output(net, NULL, &fl6); 2382 if (!dst->error) 2383 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu)); 2384 dst_release(dst); 2385 } 2386 EXPORT_SYMBOL_GPL(ip6_update_pmtu); 2387 2388 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) 2389 { 2390 struct dst_entry *dst; 2391 2392 ip6_update_pmtu(skb, sock_net(sk), mtu, 2393 sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid); 2394 2395 dst = __sk_dst_get(sk); 2396 if (!dst || !dst->obsolete || 2397 dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) 2398 return; 2399 2400 bh_lock_sock(sk); 2401 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) 2402 ip6_datagram_dst_update(sk, false); 2403 bh_unlock_sock(sk); 2404 } 2405 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu); 2406 2407 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, 2408 const struct flowi6 *fl6) 2409 { 2410 #ifdef CONFIG_IPV6_SUBTREES 2411 struct ipv6_pinfo *np = inet6_sk(sk); 2412 #endif 2413 2414 ip6_dst_store(sk, dst, 2415 ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ? 2416 &sk->sk_v6_daddr : NULL, 2417 #ifdef CONFIG_IPV6_SUBTREES 2418 ipv6_addr_equal(&fl6->saddr, &np->saddr) ? 2419 &np->saddr : 2420 #endif 2421 NULL); 2422 } 2423 2424 /* Handle redirects */ 2425 struct ip6rd_flowi { 2426 struct flowi6 fl6; 2427 struct in6_addr gateway; 2428 }; 2429 2430 static struct rt6_info *__ip6_route_redirect(struct net *net, 2431 struct fib6_table *table, 2432 struct flowi6 *fl6, 2433 const struct sk_buff *skb, 2434 int flags) 2435 { 2436 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; 2437 struct rt6_info *ret = NULL, *rt_cache; 2438 struct fib6_info *rt; 2439 struct fib6_node *fn; 2440 2441 /* Get the "current" route for this destination and 2442 * check if the redirect has come from appropriate router. 2443 * 2444 * RFC 4861 specifies that redirects should only be 2445 * accepted if they come from the nexthop to the target. 2446 * Due to the way the routes are chosen, this notion 2447 * is a bit fuzzy and one might need to check all possible 2448 * routes. 2449 */ 2450 2451 rcu_read_lock(); 2452 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 2453 restart: 2454 for_each_fib6_node_rt_rcu(fn) { 2455 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) 2456 continue; 2457 if (fib6_check_expired(rt)) 2458 continue; 2459 if (rt->fib6_flags & RTF_REJECT) 2460 break; 2461 if (!(rt->fib6_flags & RTF_GATEWAY)) 2462 continue; 2463 if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex) 2464 continue; 2465 /* rt_cache's gateway might be different from its 'parent' 2466 * in the case of an ip redirect. 2467 * So we keep searching in the exception table if the gateway 2468 * is different. 
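 * E.g. (hypothetical addresses): the fib6 entry may point at gateway
 * fe80::1 while, after an ICMPv6 redirect to fe80::2, the cached
 * clone carries rt6i_gateway == fe80::2, so only the exception entry
 * can match rdfl->gateway here.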
2469 */ 2470 if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) { 2471 rt_cache = rt6_find_cached_rt(rt, 2472 &fl6->daddr, 2473 &fl6->saddr); 2474 if (rt_cache && 2475 ipv6_addr_equal(&rdfl->gateway, 2476 &rt_cache->rt6i_gateway)) { 2477 ret = rt_cache; 2478 break; 2479 } 2480 continue; 2481 } 2482 break; 2483 } 2484 2485 if (!rt) 2486 rt = net->ipv6.fib6_null_entry; 2487 else if (rt->fib6_flags & RTF_REJECT) { 2488 ret = net->ipv6.ip6_null_entry; 2489 goto out; 2490 } 2491 2492 if (rt == net->ipv6.fib6_null_entry) { 2493 fn = fib6_backtrack(fn, &fl6->saddr); 2494 if (fn) 2495 goto restart; 2496 } 2497 2498 out: 2499 if (ret) 2500 ip6_hold_safe(net, &ret, true); 2501 else 2502 ret = ip6_create_rt_rcu(rt); 2503 2504 rcu_read_unlock(); 2505 2506 trace_fib6_table_lookup(net, rt, table, fl6); 2507 return ret; 2508 }; 2509 2510 static struct dst_entry *ip6_route_redirect(struct net *net, 2511 const struct flowi6 *fl6, 2512 const struct sk_buff *skb, 2513 const struct in6_addr *gateway) 2514 { 2515 int flags = RT6_LOOKUP_F_HAS_SADDR; 2516 struct ip6rd_flowi rdfl; 2517 2518 rdfl.fl6 = *fl6; 2519 rdfl.gateway = *gateway; 2520 2521 return fib6_rule_lookup(net, &rdfl.fl6, skb, 2522 flags, __ip6_route_redirect); 2523 } 2524 2525 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, 2526 kuid_t uid) 2527 { 2528 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2529 struct dst_entry *dst; 2530 struct flowi6 fl6; 2531 2532 memset(&fl6, 0, sizeof(fl6)); 2533 fl6.flowi6_iif = LOOPBACK_IFINDEX; 2534 fl6.flowi6_oif = oif; 2535 fl6.flowi6_mark = mark; 2536 fl6.daddr = iph->daddr; 2537 fl6.saddr = iph->saddr; 2538 fl6.flowlabel = ip6_flowinfo(iph); 2539 fl6.flowi6_uid = uid; 2540 2541 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr); 2542 rt6_do_redirect(dst, NULL, skb); 2543 dst_release(dst); 2544 } 2545 EXPORT_SYMBOL_GPL(ip6_redirect); 2546 2547 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif, 2548 u32 mark) 2549 { 2550 const struct ipv6hdr *iph = ipv6_hdr(skb); 2551 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb); 2552 struct dst_entry *dst; 2553 struct flowi6 fl6; 2554 2555 memset(&fl6, 0, sizeof(fl6)); 2556 fl6.flowi6_iif = LOOPBACK_IFINDEX; 2557 fl6.flowi6_oif = oif; 2558 fl6.flowi6_mark = mark; 2559 fl6.daddr = msg->dest; 2560 fl6.saddr = iph->daddr; 2561 fl6.flowi6_uid = sock_net_uid(net, NULL); 2562 2563 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr); 2564 rt6_do_redirect(dst, NULL, skb); 2565 dst_release(dst); 2566 } 2567 2568 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) 2569 { 2570 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark, 2571 sk->sk_uid); 2572 } 2573 EXPORT_SYMBOL_GPL(ip6_sk_redirect); 2574 2575 static unsigned int ip6_default_advmss(const struct dst_entry *dst) 2576 { 2577 struct net_device *dev = dst->dev; 2578 unsigned int mtu = dst_mtu(dst); 2579 struct net *net = dev_net(dev); 2580 2581 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); 2582 2583 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) 2584 mtu = net->ipv6.sysctl.ip6_rt_min_advmss; 2585 2586 /* 2587 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 2588 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 
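 * (with IPV6_MAXPLEN == 65535 and a 20 byte TCP header that is
 * 65535 - 20 = 65515 bytes).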
2589 * IPV6_MAXPLEN is also valid and means: "any MSS, 2590 * rely only on pmtu discovery" 2591 */ 2592 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr)) 2593 mtu = IPV6_MAXPLEN; 2594 return mtu; 2595 } 2596 2597 static unsigned int ip6_mtu(const struct dst_entry *dst) 2598 { 2599 struct inet6_dev *idev; 2600 unsigned int mtu; 2601 2602 mtu = dst_metric_raw(dst, RTAX_MTU); 2603 if (mtu) 2604 goto out; 2605 2606 mtu = IPV6_MIN_MTU; 2607 2608 rcu_read_lock(); 2609 idev = __in6_dev_get(dst->dev); 2610 if (idev) 2611 mtu = idev->cnf.mtu6; 2612 rcu_read_unlock(); 2613 2614 out: 2615 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 2616 2617 return mtu - lwtunnel_headroom(dst->lwtstate, mtu); 2618 } 2619 2620 /* MTU selection: 2621 * 1. mtu on route is locked - use it 2622 * 2. mtu from nexthop exception 2623 * 3. mtu from egress device 2624 * 2625 * based on ip6_dst_mtu_forward and exception logic of 2626 * rt6_find_cached_rt; called with rcu_read_lock 2627 */ 2628 u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr, 2629 struct in6_addr *saddr) 2630 { 2631 struct rt6_exception_bucket *bucket; 2632 struct rt6_exception *rt6_ex; 2633 struct in6_addr *src_key; 2634 struct inet6_dev *idev; 2635 u32 mtu = 0; 2636 2637 if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) { 2638 mtu = f6i->fib6_pmtu; 2639 if (mtu) 2640 goto out; 2641 } 2642 2643 src_key = NULL; 2644 #ifdef CONFIG_IPV6_SUBTREES 2645 if (f6i->fib6_src.plen) 2646 src_key = saddr; 2647 #endif 2648 2649 bucket = rcu_dereference(f6i->rt6i_exception_bucket); 2650 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); 2651 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i)) 2652 mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU); 2653 2654 if (likely(!mtu)) { 2655 struct net_device *dev = fib6_info_nh_dev(f6i); 2656 2657 mtu = IPV6_MIN_MTU; 2658 idev = __in6_dev_get(dev); 2659 if (idev && idev->cnf.mtu6 > mtu) 2660 mtu = idev->cnf.mtu6; 2661 } 2662 2663 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 2664 out: 2665 return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu); 2666 } 2667 2668 struct dst_entry *icmp6_dst_alloc(struct net_device *dev, 2669 struct flowi6 *fl6) 2670 { 2671 struct dst_entry *dst; 2672 struct rt6_info *rt; 2673 struct inet6_dev *idev = in6_dev_get(dev); 2674 struct net *net = dev_net(dev); 2675 2676 if (unlikely(!idev)) 2677 return ERR_PTR(-ENODEV); 2678 2679 rt = ip6_dst_alloc(net, dev, 0); 2680 if (unlikely(!rt)) { 2681 in6_dev_put(idev); 2682 dst = ERR_PTR(-ENOMEM); 2683 goto out; 2684 } 2685 2686 rt->dst.flags |= DST_HOST; 2687 rt->dst.input = ip6_input; 2688 rt->dst.output = ip6_output; 2689 rt->rt6i_gateway = fl6->daddr; 2690 rt->rt6i_dst.addr = fl6->daddr; 2691 rt->rt6i_dst.plen = 128; 2692 rt->rt6i_idev = idev; 2693 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0); 2694 2695 /* Add this dst into uncached_list so that rt6_disable_ip() can 2696 * do proper release of the net_device 2697 */ 2698 rt6_uncached_list_add(rt); 2699 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); 2700 2701 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0); 2702 2703 out: 2704 return dst; 2705 } 2706 2707 static int ip6_dst_gc(struct dst_ops *ops) 2708 { 2709 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); 2710 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; 2711 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size; 2712 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; 2713 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; 2714 unsigned long rt_last_gc = 
net->ipv6.ip6_rt_last_gc; 2715 int entries; 2716 2717 entries = dst_entries_get_fast(ops); 2718 if (time_after(rt_last_gc + rt_min_interval, jiffies) && 2719 entries <= rt_max_size) 2720 goto out; 2721 2722 net->ipv6.ip6_rt_gc_expire++; 2723 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true); 2724 entries = dst_entries_get_slow(ops); 2725 if (entries < ops->gc_thresh) 2726 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; 2727 out: 2728 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity; 2729 return entries > rt_max_size; 2730 } 2731 2732 static int ip6_convert_metrics(struct net *net, struct fib6_info *rt, 2733 struct fib6_config *cfg) 2734 { 2735 struct dst_metrics *p; 2736 2737 if (!cfg->fc_mx) 2738 return 0; 2739 2740 p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL); 2741 if (unlikely(!p)) 2742 return -ENOMEM; 2743 2744 refcount_set(&p->refcnt, 1); 2745 rt->fib6_metrics = p; 2746 2747 return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics); 2748 } 2749 2750 static struct rt6_info *ip6_nh_lookup_table(struct net *net, 2751 struct fib6_config *cfg, 2752 const struct in6_addr *gw_addr, 2753 u32 tbid, int flags) 2754 { 2755 struct flowi6 fl6 = { 2756 .flowi6_oif = cfg->fc_ifindex, 2757 .daddr = *gw_addr, 2758 .saddr = cfg->fc_prefsrc, 2759 }; 2760 struct fib6_table *table; 2761 struct rt6_info *rt; 2762 2763 table = fib6_get_table(net, tbid); 2764 if (!table) 2765 return NULL; 2766 2767 if (!ipv6_addr_any(&cfg->fc_prefsrc)) 2768 flags |= RT6_LOOKUP_F_HAS_SADDR; 2769 2770 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE; 2771 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags); 2772 2773 /* if table lookup failed, fall back to full lookup */ 2774 if (rt == net->ipv6.ip6_null_entry) { 2775 ip6_rt_put(rt); 2776 rt = NULL; 2777 } 2778 2779 return rt; 2780 } 2781 2782 static int ip6_route_check_nh_onlink(struct net *net, 2783 struct fib6_config *cfg, 2784 const struct net_device *dev, 2785 struct netlink_ext_ack *extack) 2786 { 2787 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN; 2788 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2789 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT; 2790 struct rt6_info *grt; 2791 int err; 2792 2793 err = 0; 2794 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0); 2795 if (grt) { 2796 if (!grt->dst.error && 2797 (grt->rt6i_flags & flags || dev != grt->dst.dev)) { 2798 NL_SET_ERR_MSG(extack, 2799 "Nexthop has invalid gateway or device mismatch"); 2800 err = -EINVAL; 2801 } 2802 2803 ip6_rt_put(grt); 2804 } 2805 2806 return err; 2807 } 2808 2809 static int ip6_route_check_nh(struct net *net, 2810 struct fib6_config *cfg, 2811 struct net_device **_dev, 2812 struct inet6_dev **idev) 2813 { 2814 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2815 struct net_device *dev = _dev ? 
*_dev : NULL; 2816 struct rt6_info *grt = NULL; 2817 int err = -EHOSTUNREACH; 2818 2819 if (cfg->fc_table) { 2820 int flags = RT6_LOOKUP_F_IFACE; 2821 2822 grt = ip6_nh_lookup_table(net, cfg, gw_addr, 2823 cfg->fc_table, flags); 2824 if (grt) { 2825 if (grt->rt6i_flags & RTF_GATEWAY || 2826 (dev && dev != grt->dst.dev)) { 2827 ip6_rt_put(grt); 2828 grt = NULL; 2829 } 2830 } 2831 } 2832 2833 if (!grt) 2834 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1); 2835 2836 if (!grt) 2837 goto out; 2838 2839 if (dev) { 2840 if (dev != grt->dst.dev) { 2841 ip6_rt_put(grt); 2842 goto out; 2843 } 2844 } else { 2845 *_dev = dev = grt->dst.dev; 2846 *idev = grt->rt6i_idev; 2847 dev_hold(dev); 2848 in6_dev_hold(grt->rt6i_idev); 2849 } 2850 2851 if (!(grt->rt6i_flags & RTF_GATEWAY)) 2852 err = 0; 2853 2854 ip6_rt_put(grt); 2855 2856 out: 2857 return err; 2858 } 2859 2860 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg, 2861 struct net_device **_dev, struct inet6_dev **idev, 2862 struct netlink_ext_ack *extack) 2863 { 2864 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2865 int gwa_type = ipv6_addr_type(gw_addr); 2866 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true; 2867 const struct net_device *dev = *_dev; 2868 bool need_addr_check = !dev; 2869 int err = -EINVAL; 2870 2871 /* if gw_addr is local we will fail to detect this in case the 2872 * address is still TENTATIVE (DAD in progress). rt6_lookup() 2873 * will return the already-added prefix route via the interface 2874 * that the prefix route was assigned to, which might be non-loopback. 2875 */ 2876 if (dev && 2877 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 2878 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 2879 goto out; 2880 } 2881 2882 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) { 2883 /* IPv6 strictly inhibits using non-link-local 2884 * addresses as the nexthop address. 2885 * Otherwise, the router will not be able to send redirects. 2886 * It is very good, but in some (rare!) circumstances 2887 * (SIT, PtP, NBMA NOARP links) it is handy to allow 2888 * some exceptions. --ANK 2889 * We allow IPv4-mapped nexthops to support RFC4798-type 2890 * addressing. 2891 */ 2892 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) { 2893 NL_SET_ERR_MSG(extack, "Invalid gateway address"); 2894 goto out; 2895 } 2896 2897 if (cfg->fc_flags & RTNH_F_ONLINK) 2898 err = ip6_route_check_nh_onlink(net, cfg, dev, extack); 2899 else 2900 err = ip6_route_check_nh(net, cfg, _dev, idev); 2901 2902 if (err) 2903 goto out; 2904 } 2905 2906 /* reload in case the device was changed */ 2907 dev = *_dev; 2908 2909 err = -EINVAL; 2910 if (!dev) { 2911 NL_SET_ERR_MSG(extack, "Egress device not specified"); 2912 goto out; 2913 } else if (dev->flags & IFF_LOOPBACK) { 2914 NL_SET_ERR_MSG(extack, 2915 "Egress device can not be loopback device for this route"); 2916 goto out; 2917 } 2918 2919 /* if we did not check gw_addr above, do so now that the 2920 * egress device has been resolved.
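 * need_addr_check is only true when no nexthop device was supplied,
 * e.g. for a (hypothetical) "ip -6 route add 2001:db8::/64 via
 * 2001:db8:1::1" without a "dev" argument, where the egress device
 * only becomes known through the gateway lookup above.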
2921 */ 2922 if (need_addr_check && 2923 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 2924 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 2925 goto out; 2926 } 2927 2928 err = 0; 2929 out: 2930 return err; 2931 } 2932 2933 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, 2934 gfp_t gfp_flags, 2935 struct netlink_ext_ack *extack) 2936 { 2937 struct net *net = cfg->fc_nlinfo.nl_net; 2938 struct fib6_info *rt = NULL; 2939 struct net_device *dev = NULL; 2940 struct inet6_dev *idev = NULL; 2941 struct fib6_table *table; 2942 int addr_type; 2943 int err = -EINVAL; 2944 2945 /* RTF_PCPU is an internal flag; can not be set by userspace */ 2946 if (cfg->fc_flags & RTF_PCPU) { 2947 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU"); 2948 goto out; 2949 } 2950 2951 /* RTF_CACHE is an internal flag; can not be set by userspace */ 2952 if (cfg->fc_flags & RTF_CACHE) { 2953 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE"); 2954 goto out; 2955 } 2956 2957 if (cfg->fc_type > RTN_MAX) { 2958 NL_SET_ERR_MSG(extack, "Invalid route type"); 2959 goto out; 2960 } 2961 2962 if (cfg->fc_dst_len > 128) { 2963 NL_SET_ERR_MSG(extack, "Invalid prefix length"); 2964 goto out; 2965 } 2966 if (cfg->fc_src_len > 128) { 2967 NL_SET_ERR_MSG(extack, "Invalid source address length"); 2968 goto out; 2969 } 2970 #ifndef CONFIG_IPV6_SUBTREES 2971 if (cfg->fc_src_len) { 2972 NL_SET_ERR_MSG(extack, 2973 "Specifying source address requires IPV6_SUBTREES to be enabled"); 2974 goto out; 2975 } 2976 #endif 2977 if (cfg->fc_ifindex) { 2978 err = -ENODEV; 2979 dev = dev_get_by_index(net, cfg->fc_ifindex); 2980 if (!dev) 2981 goto out; 2982 idev = in6_dev_get(dev); 2983 if (!idev) 2984 goto out; 2985 } 2986 2987 if (cfg->fc_metric == 0) 2988 cfg->fc_metric = IP6_RT_PRIO_USER; 2989 2990 if (cfg->fc_flags & RTNH_F_ONLINK) { 2991 if (!dev) { 2992 NL_SET_ERR_MSG(extack, 2993 "Nexthop device required for onlink"); 2994 err = -ENODEV; 2995 goto out; 2996 } 2997 2998 if (!(dev->flags & IFF_UP)) { 2999 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 3000 err = -ENETDOWN; 3001 goto out; 3002 } 3003 } 3004 3005 err = -ENOBUFS; 3006 if (cfg->fc_nlinfo.nlh && 3007 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) { 3008 table = fib6_get_table(net, cfg->fc_table); 3009 if (!table) { 3010 pr_warn("NLM_F_CREATE should be specified when creating new route\n"); 3011 table = fib6_new_table(net, cfg->fc_table); 3012 } 3013 } else { 3014 table = fib6_new_table(net, cfg->fc_table); 3015 } 3016 3017 if (!table) 3018 goto out; 3019 3020 err = -ENOMEM; 3021 rt = fib6_info_alloc(gfp_flags); 3022 if (!rt) 3023 goto out; 3024 3025 if (cfg->fc_flags & RTF_ADDRCONF) 3026 rt->dst_nocount = true; 3027 3028 err = ip6_convert_metrics(net, rt, cfg); 3029 if (err < 0) 3030 goto out; 3031 3032 if (cfg->fc_flags & RTF_EXPIRES) 3033 fib6_set_expires(rt, jiffies + 3034 clock_t_to_jiffies(cfg->fc_expires)); 3035 else 3036 fib6_clean_expires(rt); 3037 3038 if (cfg->fc_protocol == RTPROT_UNSPEC) 3039 cfg->fc_protocol = RTPROT_BOOT; 3040 rt->fib6_protocol = cfg->fc_protocol; 3041 3042 addr_type = ipv6_addr_type(&cfg->fc_dst); 3043 3044 if (cfg->fc_encap) { 3045 struct lwtunnel_state *lwtstate; 3046 3047 err = lwtunnel_build_state(cfg->fc_encap_type, 3048 cfg->fc_encap, AF_INET6, cfg, 3049 &lwtstate, extack); 3050 if (err) 3051 goto out; 3052 rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate); 3053 } 3054 3055 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); 3056 rt->fib6_dst.plen = 
cfg->fc_dst_len; 3057 if (rt->fib6_dst.plen == 128) 3058 rt->dst_host = true; 3059 3060 #ifdef CONFIG_IPV6_SUBTREES 3061 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len); 3062 rt->fib6_src.plen = cfg->fc_src_len; 3063 #endif 3064 3065 rt->fib6_metric = cfg->fc_metric; 3066 rt->fib6_nh.nh_weight = 1; 3067 3068 rt->fib6_type = cfg->fc_type; 3069 3070 /* We cannot add true routes via loopback here, 3071 they would result in kernel looping; promote them to reject routes 3072 */ 3073 if ((cfg->fc_flags & RTF_REJECT) || 3074 (dev && (dev->flags & IFF_LOOPBACK) && 3075 !(addr_type & IPV6_ADDR_LOOPBACK) && 3076 !(cfg->fc_flags & RTF_LOCAL))) { 3077 /* hold loopback dev/idev if we haven't done so. */ 3078 if (dev != net->loopback_dev) { 3079 if (dev) { 3080 dev_put(dev); 3081 in6_dev_put(idev); 3082 } 3083 dev = net->loopback_dev; 3084 dev_hold(dev); 3085 idev = in6_dev_get(dev); 3086 if (!idev) { 3087 err = -ENODEV; 3088 goto out; 3089 } 3090 } 3091 rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP; 3092 goto install_route; 3093 } 3094 3095 if (cfg->fc_flags & RTF_GATEWAY) { 3096 err = ip6_validate_gw(net, cfg, &dev, &idev, extack); 3097 if (err) 3098 goto out; 3099 3100 rt->fib6_nh.nh_gw = cfg->fc_gateway; 3101 } 3102 3103 err = -ENODEV; 3104 if (!dev) 3105 goto out; 3106 3107 if (idev->cnf.disable_ipv6) { 3108 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device"); 3109 err = -EACCES; 3110 goto out; 3111 } 3112 3113 if (!(dev->flags & IFF_UP)) { 3114 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 3115 err = -ENETDOWN; 3116 goto out; 3117 } 3118 3119 if (!ipv6_addr_any(&cfg->fc_prefsrc)) { 3120 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { 3121 NL_SET_ERR_MSG(extack, "Invalid source address"); 3122 err = -EINVAL; 3123 goto out; 3124 } 3125 rt->fib6_prefsrc.addr = cfg->fc_prefsrc; 3126 rt->fib6_prefsrc.plen = 128; 3127 } else 3128 rt->fib6_prefsrc.plen = 0; 3129 3130 rt->fib6_flags = cfg->fc_flags; 3131 3132 install_route: 3133 if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) && 3134 !netif_carrier_ok(dev)) 3135 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN; 3136 rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK); 3137 rt->fib6_nh.nh_dev = dev; 3138 rt->fib6_table = table; 3139 3140 if (idev) 3141 in6_dev_put(idev); 3142 3143 return rt; 3144 out: 3145 if (dev) 3146 dev_put(dev); 3147 if (idev) 3148 in6_dev_put(idev); 3149 3150 fib6_info_release(rt); 3151 return ERR_PTR(err); 3152 } 3153 3154 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, 3155 struct netlink_ext_ack *extack) 3156 { 3157 struct fib6_info *rt; 3158 int err; 3159 3160 rt = ip6_route_info_create(cfg, gfp_flags, extack); 3161 if (IS_ERR(rt)) 3162 return PTR_ERR(rt); 3163 3164 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack); 3165 fib6_info_release(rt); 3166 3167 return err; 3168 } 3169 3170 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info) 3171 { 3172 struct net *net = info->nl_net; 3173 struct fib6_table *table; 3174 int err; 3175 3176 if (rt == net->ipv6.fib6_null_entry) { 3177 err = -ENOENT; 3178 goto out; 3179 } 3180 3181 table = rt->fib6_table; 3182 spin_lock_bh(&table->tb6_lock); 3183 err = fib6_del(rt, info); 3184 spin_unlock_bh(&table->tb6_lock); 3185 3186 out: 3187 fib6_info_release(rt); 3188 return err; 3189 } 3190 3191 int ip6_del_rt(struct net *net, struct fib6_info *rt) 3192 { 3193 struct nl_info info = { .nl_net = net }; 3194 3195 return __ip6_del_rt(rt, &info); 3196 } 3197 3198 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg) 3199 { 
3200 struct nl_info *info = &cfg->fc_nlinfo; 3201 struct net *net = info->nl_net; 3202 struct sk_buff *skb = NULL; 3203 struct fib6_table *table; 3204 int err = -ENOENT; 3205 3206 if (rt == net->ipv6.fib6_null_entry) 3207 goto out_put; 3208 table = rt->fib6_table; 3209 spin_lock_bh(&table->tb6_lock); 3210 3211 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) { 3212 struct fib6_info *sibling, *next_sibling; 3213 3214 /* prefer to send a single notification with all hops */ 3215 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 3216 if (skb) { 3217 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; 3218 3219 if (rt6_fill_node(net, skb, rt, NULL, 3220 NULL, NULL, 0, RTM_DELROUTE, 3221 info->portid, seq, 0) < 0) { 3222 kfree_skb(skb); 3223 skb = NULL; 3224 } else 3225 info->skip_notify = 1; 3226 } 3227 3228 list_for_each_entry_safe(sibling, next_sibling, 3229 &rt->fib6_siblings, 3230 fib6_siblings) { 3231 err = fib6_del(sibling, info); 3232 if (err) 3233 goto out_unlock; 3234 } 3235 } 3236 3237 err = fib6_del(rt, info); 3238 out_unlock: 3239 spin_unlock_bh(&table->tb6_lock); 3240 out_put: 3241 fib6_info_release(rt); 3242 3243 if (skb) { 3244 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 3245 info->nlh, gfp_any()); 3246 } 3247 return err; 3248 } 3249 3250 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) 3251 { 3252 int rc = -ESRCH; 3253 3254 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex) 3255 goto out; 3256 3257 if (cfg->fc_flags & RTF_GATEWAY && 3258 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) 3259 goto out; 3260 if (dst_hold_safe(&rt->dst)) 3261 rc = rt6_remove_exception_rt(rt); 3262 out: 3263 return rc; 3264 } 3265 3266 static int ip6_route_del(struct fib6_config *cfg, 3267 struct netlink_ext_ack *extack) 3268 { 3269 struct rt6_info *rt_cache; 3270 struct fib6_table *table; 3271 struct fib6_info *rt; 3272 struct fib6_node *fn; 3273 int err = -ESRCH; 3274 3275 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); 3276 if (!table) { 3277 NL_SET_ERR_MSG(extack, "FIB table does not exist"); 3278 return err; 3279 } 3280 3281 rcu_read_lock(); 3282 3283 fn = fib6_locate(&table->tb6_root, 3284 &cfg->fc_dst, cfg->fc_dst_len, 3285 &cfg->fc_src, cfg->fc_src_len, 3286 !(cfg->fc_flags & RTF_CACHE)); 3287 3288 if (fn) { 3289 for_each_fib6_node_rt_rcu(fn) { 3290 if (cfg->fc_flags & RTF_CACHE) { 3291 int rc; 3292 3293 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst, 3294 &cfg->fc_src); 3295 if (rt_cache) { 3296 rc = ip6_del_cached_rt(rt_cache, cfg); 3297 if (rc != -ESRCH) { 3298 rcu_read_unlock(); 3299 return rc; 3300 } 3301 } 3302 continue; 3303 } 3304 if (cfg->fc_ifindex && 3305 (!rt->fib6_nh.nh_dev || 3306 rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex)) 3307 continue; 3308 if (cfg->fc_flags & RTF_GATEWAY && 3309 !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw)) 3310 continue; 3311 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric) 3312 continue; 3313 if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol) 3314 continue; 3315 if (!fib6_info_hold_safe(rt)) 3316 continue; 3317 rcu_read_unlock(); 3318 3319 /* if gateway was specified only delete the one hop */ 3320 if (cfg->fc_flags & RTF_GATEWAY) 3321 return __ip6_del_rt(rt, &cfg->fc_nlinfo); 3322 3323 return __ip6_del_rt_siblings(rt, cfg); 3324 } 3325 } 3326 rcu_read_unlock(); 3327 3328 return err; 3329 } 3330 3331 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) 3332 { 3333 struct netevent_redirect netevent; 3334 struct rt6_info 
*rt, *nrt = NULL; 3335 struct ndisc_options ndopts; 3336 struct inet6_dev *in6_dev; 3337 struct neighbour *neigh; 3338 struct fib6_info *from; 3339 struct rd_msg *msg; 3340 int optlen, on_link; 3341 u8 *lladdr; 3342 3343 optlen = skb_tail_pointer(skb) - skb_transport_header(skb); 3344 optlen -= sizeof(*msg); 3345 3346 if (optlen < 0) { 3347 net_dbg_ratelimited("rt6_do_redirect: packet too short\n"); 3348 return; 3349 } 3350 3351 msg = (struct rd_msg *)icmp6_hdr(skb); 3352 3353 if (ipv6_addr_is_multicast(&msg->dest)) { 3354 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n"); 3355 return; 3356 } 3357 3358 on_link = 0; 3359 if (ipv6_addr_equal(&msg->dest, &msg->target)) { 3360 on_link = 1; 3361 } else if (ipv6_addr_type(&msg->target) != 3362 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { 3363 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n"); 3364 return; 3365 } 3366 3367 in6_dev = __in6_dev_get(skb->dev); 3368 if (!in6_dev) 3369 return; 3370 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects) 3371 return; 3372 3373 /* RFC2461 8.1: 3374 * The IP source address of the Redirect MUST be the same as the current 3375 * first-hop router for the specified ICMP Destination Address. 3376 */ 3377 3378 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) { 3379 net_dbg_ratelimited("rt6_redirect: invalid ND options\n"); 3380 return; 3381 } 3382 3383 lladdr = NULL; 3384 if (ndopts.nd_opts_tgt_lladdr) { 3385 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, 3386 skb->dev); 3387 if (!lladdr) { 3388 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n"); 3389 return; 3390 } 3391 } 3392 3393 rt = (struct rt6_info *) dst; 3394 if (rt->rt6i_flags & RTF_REJECT) { 3395 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n"); 3396 return; 3397 } 3398 3399 /* Redirect received -> path was valid. 3400 * Look, redirects are sent only in response to data packets, 3401 * so that this nexthop apparently is reachable. --ANK 3402 */ 3403 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr); 3404 3405 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1); 3406 if (!neigh) 3407 return; 3408 3409 /* 3410 * We have finally decided to accept it. 3411 */ 3412 3413 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, 3414 NEIGH_UPDATE_F_WEAK_OVERRIDE| 3415 NEIGH_UPDATE_F_OVERRIDE| 3416 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER| 3417 NEIGH_UPDATE_F_ISROUTER)), 3418 NDISC_REDIRECT, &ndopts); 3419 3420 rcu_read_lock(); 3421 from = rcu_dereference(rt->from); 3422 /* This fib6_info_hold() is safe here because we hold reference to rt 3423 * and rt already holds reference to fib6_info. 
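 * The reference chain here is skb -> dst (rt) -> rt->from, so 'from'
 * stays alive across the rcu_read_unlock() below until the matching
 * fib6_info_release() at the out label.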
3424 */ 3425 fib6_info_hold(from); 3426 rcu_read_unlock(); 3427 3428 nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL); 3429 if (!nrt) 3430 goto out; 3431 3432 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE; 3433 if (on_link) 3434 nrt->rt6i_flags &= ~RTF_GATEWAY; 3435 3436 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; 3437 3438 /* No need to remove rt from the exception table if rt is 3439 * a cached route because rt6_insert_exception() will 3440 * takes care of it 3441 */ 3442 if (rt6_insert_exception(nrt, from)) { 3443 dst_release_immediate(&nrt->dst); 3444 goto out; 3445 } 3446 3447 netevent.old = &rt->dst; 3448 netevent.new = &nrt->dst; 3449 netevent.daddr = &msg->dest; 3450 netevent.neigh = neigh; 3451 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); 3452 3453 out: 3454 fib6_info_release(from); 3455 neigh_release(neigh); 3456 } 3457 3458 #ifdef CONFIG_IPV6_ROUTE_INFO 3459 static struct fib6_info *rt6_get_route_info(struct net *net, 3460 const struct in6_addr *prefix, int prefixlen, 3461 const struct in6_addr *gwaddr, 3462 struct net_device *dev) 3463 { 3464 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; 3465 int ifindex = dev->ifindex; 3466 struct fib6_node *fn; 3467 struct fib6_info *rt = NULL; 3468 struct fib6_table *table; 3469 3470 table = fib6_get_table(net, tb_id); 3471 if (!table) 3472 return NULL; 3473 3474 rcu_read_lock(); 3475 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true); 3476 if (!fn) 3477 goto out; 3478 3479 for_each_fib6_node_rt_rcu(fn) { 3480 if (rt->fib6_nh.nh_dev->ifindex != ifindex) 3481 continue; 3482 if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY)) 3483 continue; 3484 if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr)) 3485 continue; 3486 if (!fib6_info_hold_safe(rt)) 3487 continue; 3488 break; 3489 } 3490 out: 3491 rcu_read_unlock(); 3492 return rt; 3493 } 3494 3495 static struct fib6_info *rt6_add_route_info(struct net *net, 3496 const struct in6_addr *prefix, int prefixlen, 3497 const struct in6_addr *gwaddr, 3498 struct net_device *dev, 3499 unsigned int pref) 3500 { 3501 struct fib6_config cfg = { 3502 .fc_metric = IP6_RT_PRIO_USER, 3503 .fc_ifindex = dev->ifindex, 3504 .fc_dst_len = prefixlen, 3505 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | 3506 RTF_UP | RTF_PREF(pref), 3507 .fc_protocol = RTPROT_RA, 3508 .fc_type = RTN_UNICAST, 3509 .fc_nlinfo.portid = 0, 3510 .fc_nlinfo.nlh = NULL, 3511 .fc_nlinfo.nl_net = net, 3512 }; 3513 3514 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO, 3515 cfg.fc_dst = *prefix; 3516 cfg.fc_gateway = *gwaddr; 3517 3518 /* We should treat it as a default route if prefix length is 0. */ 3519 if (!prefixlen) 3520 cfg.fc_flags |= RTF_DEFAULT; 3521 3522 ip6_route_add(&cfg, GFP_ATOMIC, NULL); 3523 3524 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev); 3525 } 3526 #endif 3527 3528 struct fib6_info *rt6_get_dflt_router(struct net *net, 3529 const struct in6_addr *addr, 3530 struct net_device *dev) 3531 { 3532 u32 tb_id = l3mdev_fib_table(dev) ? 
: RT6_TABLE_DFLT; 3533 struct fib6_info *rt; 3534 struct fib6_table *table; 3535 3536 table = fib6_get_table(net, tb_id); 3537 if (!table) 3538 return NULL; 3539 3540 rcu_read_lock(); 3541 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3542 if (dev == rt->fib6_nh.nh_dev && 3543 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && 3544 ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr)) 3545 break; 3546 } 3547 if (rt && !fib6_info_hold_safe(rt)) 3548 rt = NULL; 3549 rcu_read_unlock(); 3550 return rt; 3551 } 3552 3553 struct fib6_info *rt6_add_dflt_router(struct net *net, 3554 const struct in6_addr *gwaddr, 3555 struct net_device *dev, 3556 unsigned int pref) 3557 { 3558 struct fib6_config cfg = { 3559 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT, 3560 .fc_metric = IP6_RT_PRIO_USER, 3561 .fc_ifindex = dev->ifindex, 3562 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | 3563 RTF_UP | RTF_EXPIRES | RTF_PREF(pref), 3564 .fc_protocol = RTPROT_RA, 3565 .fc_type = RTN_UNICAST, 3566 .fc_nlinfo.portid = 0, 3567 .fc_nlinfo.nlh = NULL, 3568 .fc_nlinfo.nl_net = net, 3569 }; 3570 3571 cfg.fc_gateway = *gwaddr; 3572 3573 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) { 3574 struct fib6_table *table; 3575 3576 table = fib6_get_table(dev_net(dev), cfg.fc_table); 3577 if (table) 3578 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER; 3579 } 3580 3581 return rt6_get_dflt_router(net, gwaddr, dev); 3582 } 3583 3584 static void __rt6_purge_dflt_routers(struct net *net, 3585 struct fib6_table *table) 3586 { 3587 struct fib6_info *rt; 3588 3589 restart: 3590 rcu_read_lock(); 3591 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3592 struct net_device *dev = fib6_info_nh_dev(rt); 3593 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL; 3594 3595 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) && 3596 (!idev || idev->cnf.accept_ra != 2) && 3597 fib6_info_hold_safe(rt)) { 3598 rcu_read_unlock(); 3599 ip6_del_rt(net, rt); 3600 goto restart; 3601 } 3602 } 3603 rcu_read_unlock(); 3604 3605 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER; 3606 } 3607 3608 void rt6_purge_dflt_routers(struct net *net) 3609 { 3610 struct fib6_table *table; 3611 struct hlist_head *head; 3612 unsigned int h; 3613 3614 rcu_read_lock(); 3615 3616 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { 3617 head = &net->ipv6.fib_table_hash[h]; 3618 hlist_for_each_entry_rcu(table, head, tb6_hlist) { 3619 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER) 3620 __rt6_purge_dflt_routers(net, table); 3621 } 3622 } 3623 3624 rcu_read_unlock(); 3625 } 3626 3627 static void rtmsg_to_fib6_config(struct net *net, 3628 struct in6_rtmsg *rtmsg, 3629 struct fib6_config *cfg) 3630 { 3631 memset(cfg, 0, sizeof(*cfg)); 3632 3633 cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? 
3634 : RT6_TABLE_MAIN; 3635 cfg->fc_ifindex = rtmsg->rtmsg_ifindex; 3636 cfg->fc_metric = rtmsg->rtmsg_metric; 3637 cfg->fc_expires = rtmsg->rtmsg_info; 3638 cfg->fc_dst_len = rtmsg->rtmsg_dst_len; 3639 cfg->fc_src_len = rtmsg->rtmsg_src_len; 3640 cfg->fc_flags = rtmsg->rtmsg_flags; 3641 cfg->fc_type = rtmsg->rtmsg_type; 3642 3643 cfg->fc_nlinfo.nl_net = net; 3644 3645 cfg->fc_dst = rtmsg->rtmsg_dst; 3646 cfg->fc_src = rtmsg->rtmsg_src; 3647 cfg->fc_gateway = rtmsg->rtmsg_gateway; 3648 } 3649 3650 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) 3651 { 3652 struct fib6_config cfg; 3653 struct in6_rtmsg rtmsg; 3654 int err; 3655 3656 switch (cmd) { 3657 case SIOCADDRT: /* Add a route */ 3658 case SIOCDELRT: /* Delete a route */ 3659 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 3660 return -EPERM; 3661 err = copy_from_user(&rtmsg, arg, 3662 sizeof(struct in6_rtmsg)); 3663 if (err) 3664 return -EFAULT; 3665 3666 rtmsg_to_fib6_config(net, &rtmsg, &cfg); 3667 3668 rtnl_lock(); 3669 switch (cmd) { 3670 case SIOCADDRT: 3671 err = ip6_route_add(&cfg, GFP_KERNEL, NULL); 3672 break; 3673 case SIOCDELRT: 3674 err = ip6_route_del(&cfg, NULL); 3675 break; 3676 default: 3677 err = -EINVAL; 3678 } 3679 rtnl_unlock(); 3680 3681 return err; 3682 } 3683 3684 return -EINVAL; 3685 } 3686 3687 /* 3688 * Drop the packet on the floor 3689 */ 3690 3691 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) 3692 { 3693 int type; 3694 struct dst_entry *dst = skb_dst(skb); 3695 switch (ipstats_mib_noroutes) { 3696 case IPSTATS_MIB_INNOROUTES: 3697 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); 3698 if (type == IPV6_ADDR_ANY) { 3699 IP6_INC_STATS(dev_net(dst->dev), 3700 __in6_dev_get_safely(skb->dev), 3701 IPSTATS_MIB_INADDRERRORS); 3702 break; 3703 } 3704 /* FALLTHROUGH */ 3705 case IPSTATS_MIB_OUTNOROUTES: 3706 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), 3707 ipstats_mib_noroutes); 3708 break; 3709 } 3710 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0); 3711 kfree_skb(skb); 3712 return 0; 3713 } 3714 3715 static int ip6_pkt_discard(struct sk_buff *skb) 3716 { 3717 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES); 3718 } 3719 3720 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3721 { 3722 skb->dev = skb_dst(skb)->dev; 3723 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); 3724 } 3725 3726 static int ip6_pkt_prohibit(struct sk_buff *skb) 3727 { 3728 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES); 3729 } 3730 3731 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3732 { 3733 skb->dev = skb_dst(skb)->dev; 3734 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); 3735 } 3736 3737 /* 3738 * Allocate a dst for local (unicast / anycast) address. 
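 *
 * A sketch of the intended use (hypothetical caller, not the exact
 * addrconf call site):
 *
 *	f6i = addrconf_f6i_alloc(net, idev, &ifp->addr, false,
 *				 GFP_ATOMIC);
 *	if (!IS_ERR(f6i))
 *		ip6_ins_rt(net, f6i);
 *
 * i.e. the caller allocates the RTN_LOCAL/RTN_ANYCAST /128 entry and
 * is responsible for inserting it into the local table.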
3739 */ 3740 3741 struct fib6_info *addrconf_f6i_alloc(struct net *net, 3742 struct inet6_dev *idev, 3743 const struct in6_addr *addr, 3744 bool anycast, gfp_t gfp_flags) 3745 { 3746 u32 tb_id; 3747 struct net_device *dev = idev->dev; 3748 struct fib6_info *f6i; 3749 3750 f6i = fib6_info_alloc(gfp_flags); 3751 if (!f6i) 3752 return ERR_PTR(-ENOMEM); 3753 3754 f6i->dst_nocount = true; 3755 f6i->dst_host = true; 3756 f6i->fib6_protocol = RTPROT_KERNEL; 3757 f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP; 3758 if (anycast) { 3759 f6i->fib6_type = RTN_ANYCAST; 3760 f6i->fib6_flags |= RTF_ANYCAST; 3761 } else { 3762 f6i->fib6_type = RTN_LOCAL; 3763 f6i->fib6_flags |= RTF_LOCAL; 3764 } 3765 3766 f6i->fib6_nh.nh_gw = *addr; 3767 dev_hold(dev); 3768 f6i->fib6_nh.nh_dev = dev; 3769 f6i->fib6_dst.addr = *addr; 3770 f6i->fib6_dst.plen = 128; 3771 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL; 3772 f6i->fib6_table = fib6_get_table(net, tb_id); 3773 3774 return f6i; 3775 } 3776 3777 /* remove a deleted IP from prefsrc entries */ 3778 struct arg_dev_net_ip { 3779 struct net_device *dev; 3780 struct net *net; 3781 struct in6_addr *addr; 3782 }; 3783 3784 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg) 3785 { 3786 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev; 3787 struct net *net = ((struct arg_dev_net_ip *)arg)->net; 3788 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; 3789 3790 if (((void *)rt->fib6_nh.nh_dev == dev || !dev) && 3791 rt != net->ipv6.fib6_null_entry && 3792 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) { 3793 spin_lock_bh(&rt6_exception_lock); 3794 /* remove prefsrc entry */ 3795 rt->fib6_prefsrc.plen = 0; 3796 /* need to update cache as well */ 3797 rt6_exceptions_remove_prefsrc(rt); 3798 spin_unlock_bh(&rt6_exception_lock); 3799 } 3800 return 0; 3801 } 3802 3803 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) 3804 { 3805 struct net *net = dev_net(ifp->idev->dev); 3806 struct arg_dev_net_ip adni = { 3807 .dev = ifp->idev->dev, 3808 .net = net, 3809 .addr = &ifp->addr, 3810 }; 3811 fib6_clean_all(net, fib6_remove_prefsrc, &adni); 3812 } 3813 3814 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY) 3815 3816 /* Remove routers and update dst entries when a gateway turns into a host. */ 3817 static int fib6_clean_tohost(struct fib6_info *rt, void *arg) 3818 { 3819 struct in6_addr *gateway = (struct in6_addr *)arg; 3820 3821 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && 3822 ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) { 3823 return -1; 3824 } 3825 3826 /* Further clean up cached routes in the exception table. 3827 * This is needed because a cached route may have a different 3828 * gateway than its 'parent' in the case of an ip redirect.
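 * (the same asymmetry __ip6_route_redirect() copes with when matching
 * rdfl->gateway against the exception table).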
3829 */ 3830 rt6_exceptions_clean_tohost(rt, gateway); 3831 3832 return 0; 3833 } 3834 3835 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) 3836 { 3837 fib6_clean_all(net, fib6_clean_tohost, gateway); 3838 } 3839 3840 struct arg_netdev_event { 3841 const struct net_device *dev; 3842 union { 3843 unsigned int nh_flags; 3844 unsigned long event; 3845 }; 3846 }; 3847 3848 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) 3849 { 3850 struct fib6_info *iter; 3851 struct fib6_node *fn; 3852 3853 fn = rcu_dereference_protected(rt->fib6_node, 3854 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3855 iter = rcu_dereference_protected(fn->leaf, 3856 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3857 while (iter) { 3858 if (iter->fib6_metric == rt->fib6_metric && 3859 rt6_qualify_for_ecmp(iter)) 3860 return iter; 3861 iter = rcu_dereference_protected(iter->fib6_next, 3862 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3863 } 3864 3865 return NULL; 3866 } 3867 3868 static bool rt6_is_dead(const struct fib6_info *rt) 3869 { 3870 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD || 3871 (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN && 3872 fib6_ignore_linkdown(rt))) 3873 return true; 3874 3875 return false; 3876 } 3877 3878 static int rt6_multipath_total_weight(const struct fib6_info *rt) 3879 { 3880 struct fib6_info *iter; 3881 int total = 0; 3882 3883 if (!rt6_is_dead(rt)) 3884 total += rt->fib6_nh.nh_weight; 3885 3886 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { 3887 if (!rt6_is_dead(iter)) 3888 total += iter->fib6_nh.nh_weight; 3889 } 3890 3891 return total; 3892 } 3893 3894 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total) 3895 { 3896 int upper_bound = -1; 3897 3898 if (!rt6_is_dead(rt)) { 3899 *weight += rt->fib6_nh.nh_weight; 3900 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, 3901 total) - 1; 3902 } 3903 atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound); 3904 } 3905 3906 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total) 3907 { 3908 struct fib6_info *iter; 3909 int weight = 0; 3910 3911 rt6_upper_bound_set(rt, &weight, total); 3912 3913 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3914 rt6_upper_bound_set(iter, &weight, total); 3915 } 3916 3917 void rt6_multipath_rebalance(struct fib6_info *rt) 3918 { 3919 struct fib6_info *first; 3920 int total; 3921 3922 /* In case the entire multipath route was marked for flushing, 3923 * then there is no need to rebalance upon the removal of every 3924 * sibling route. 3925 */ 3926 if (!rt->fib6_nsiblings || rt->should_flush) 3927 return; 3928 3929 /* During lookup routes are evaluated in order, so we need to 3930 * make sure upper bounds are assigned from the first sibling 3931 * onwards. 
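 *
 * The bounds implement weighted hash-threshold nexthop selection: a
 * nexthop with cumulative weight W out of a total T is assigned
 * roughly ((u64)W << 31) / T - 1 (see rt6_upper_bound_set() above).
 * E.g. with two siblings of weight 1 and 3 the bounds are 0x1fffffff
 * and 0x7fffffff, so the 31-bit flow hash picks the first nexthop
 * for about a quarter of the flows.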
3932 */ 3933 first = rt6_multipath_first_sibling(rt); 3934 if (WARN_ON_ONCE(!first)) 3935 return; 3936 3937 total = rt6_multipath_total_weight(first); 3938 rt6_multipath_upper_bound_set(first, total); 3939 } 3940 3941 static int fib6_ifup(struct fib6_info *rt, void *p_arg) 3942 { 3943 const struct arg_netdev_event *arg = p_arg; 3944 struct net *net = dev_net(arg->dev); 3945 3946 if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) { 3947 rt->fib6_nh.nh_flags &= ~arg->nh_flags; 3948 fib6_update_sernum_upto_root(net, rt); 3949 rt6_multipath_rebalance(rt); 3950 } 3951 3952 return 0; 3953 } 3954 3955 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags) 3956 { 3957 struct arg_netdev_event arg = { 3958 .dev = dev, 3959 { 3960 .nh_flags = nh_flags, 3961 }, 3962 }; 3963 3964 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev)) 3965 arg.nh_flags |= RTNH_F_LINKDOWN; 3966 3967 fib6_clean_all(dev_net(dev), fib6_ifup, &arg); 3968 } 3969 3970 static bool rt6_multipath_uses_dev(const struct fib6_info *rt, 3971 const struct net_device *dev) 3972 { 3973 struct fib6_info *iter; 3974 3975 if (rt->fib6_nh.nh_dev == dev) 3976 return true; 3977 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3978 if (iter->fib6_nh.nh_dev == dev) 3979 return true; 3980 3981 return false; 3982 } 3983 3984 static void rt6_multipath_flush(struct fib6_info *rt) 3985 { 3986 struct fib6_info *iter; 3987 3988 rt->should_flush = 1; 3989 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3990 iter->should_flush = 1; 3991 } 3992 3993 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt, 3994 const struct net_device *down_dev) 3995 { 3996 struct fib6_info *iter; 3997 unsigned int dead = 0; 3998 3999 if (rt->fib6_nh.nh_dev == down_dev || 4000 rt->fib6_nh.nh_flags & RTNH_F_DEAD) 4001 dead++; 4002 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4003 if (iter->fib6_nh.nh_dev == down_dev || 4004 iter->fib6_nh.nh_flags & RTNH_F_DEAD) 4005 dead++; 4006 4007 return dead; 4008 } 4009 4010 static void rt6_multipath_nh_flags_set(struct fib6_info *rt, 4011 const struct net_device *dev, 4012 unsigned int nh_flags) 4013 { 4014 struct fib6_info *iter; 4015 4016 if (rt->fib6_nh.nh_dev == dev) 4017 rt->fib6_nh.nh_flags |= nh_flags; 4018 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4019 if (iter->fib6_nh.nh_dev == dev) 4020 iter->fib6_nh.nh_flags |= nh_flags; 4021 } 4022 4023 /* called with write lock held for table with rt */ 4024 static int fib6_ifdown(struct fib6_info *rt, void *p_arg) 4025 { 4026 const struct arg_netdev_event *arg = p_arg; 4027 const struct net_device *dev = arg->dev; 4028 struct net *net = dev_net(dev); 4029 4030 if (rt == net->ipv6.fib6_null_entry) 4031 return 0; 4032 4033 switch (arg->event) { 4034 case NETDEV_UNREGISTER: 4035 return rt->fib6_nh.nh_dev == dev ? -1 : 0; 4036 case NETDEV_DOWN: 4037 if (rt->should_flush) 4038 return -1; 4039 if (!rt->fib6_nsiblings) 4040 return rt->fib6_nh.nh_dev == dev ? 
-1 : 0; 4041 if (rt6_multipath_uses_dev(rt, dev)) { 4042 unsigned int count; 4043 4044 count = rt6_multipath_dead_count(rt, dev); 4045 if (rt->fib6_nsiblings + 1 == count) { 4046 rt6_multipath_flush(rt); 4047 return -1; 4048 } 4049 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD | 4050 RTNH_F_LINKDOWN); 4051 fib6_update_sernum(net, rt); 4052 rt6_multipath_rebalance(rt); 4053 } 4054 return -2; 4055 case NETDEV_CHANGE: 4056 if (rt->fib6_nh.nh_dev != dev || 4057 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) 4058 break; 4059 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN; 4060 rt6_multipath_rebalance(rt); 4061 break; 4062 } 4063 4064 return 0; 4065 } 4066 4067 void rt6_sync_down_dev(struct net_device *dev, unsigned long event) 4068 { 4069 struct arg_netdev_event arg = { 4070 .dev = dev, 4071 { 4072 .event = event, 4073 }, 4074 }; 4075 4076 fib6_clean_all(dev_net(dev), fib6_ifdown, &arg); 4077 } 4078 4079 void rt6_disable_ip(struct net_device *dev, unsigned long event) 4080 { 4081 rt6_sync_down_dev(dev, event); 4082 rt6_uncached_list_flush_dev(dev_net(dev), dev); 4083 neigh_ifdown(&nd_tbl, dev); 4084 } 4085 4086 struct rt6_mtu_change_arg { 4087 struct net_device *dev; 4088 unsigned int mtu; 4089 }; 4090 4091 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg) 4092 { 4093 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; 4094 struct inet6_dev *idev; 4095 4096 /* In IPv6 PMTU discovery is not optional, 4097 so an RTAX_MTU lock cannot disable it. 4098 We still use this lock to block changes 4099 caused by addrconf/ndisc. 4100 */ 4101 4102 idev = __in6_dev_get(arg->dev); 4103 if (!idev) 4104 return 0; 4105 4106 /* For an administrative MTU increase, there is no way to discover 4107 the IPv6 PMTU increase, so the PMTU has to be updated here. 4108 Since RFC 1981 doesn't cover administrative MTU increases, 4109 updating the PMTU on such an increase is a MUST (e.g. for a
jumbo frame) 4110 */ 4111 if (rt->fib6_nh.nh_dev == arg->dev && 4112 !fib6_metric_locked(rt, RTAX_MTU)) { 4113 u32 mtu = rt->fib6_pmtu; 4114 4115 if (mtu >= arg->mtu || 4116 (mtu < arg->mtu && mtu == idev->cnf.mtu6)) 4117 fib6_metric_set(rt, RTAX_MTU, arg->mtu); 4118 4119 spin_lock_bh(&rt6_exception_lock); 4120 rt6_exceptions_update_pmtu(idev, rt, arg->mtu); 4121 spin_unlock_bh(&rt6_exception_lock); 4122 } 4123 return 0; 4124 } 4125 4126 void rt6_mtu_change(struct net_device *dev, unsigned int mtu) 4127 { 4128 struct rt6_mtu_change_arg arg = { 4129 .dev = dev, 4130 .mtu = mtu, 4131 }; 4132 4133 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg); 4134 } 4135 4136 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { 4137 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) }, 4138 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) }, 4139 [RTA_OIF] = { .type = NLA_U32 }, 4140 [RTA_IIF] = { .type = NLA_U32 }, 4141 [RTA_PRIORITY] = { .type = NLA_U32 }, 4142 [RTA_METRICS] = { .type = NLA_NESTED }, 4143 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, 4144 [RTA_PREF] = { .type = NLA_U8 }, 4145 [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, 4146 [RTA_ENCAP] = { .type = NLA_NESTED }, 4147 [RTA_EXPIRES] = { .type = NLA_U32 }, 4148 [RTA_UID] = { .type = NLA_U32 }, 4149 [RTA_MARK] = { .type = NLA_U32 }, 4150 [RTA_TABLE] = { .type = NLA_U32 }, 4151 [RTA_IP_PROTO] = { .type = NLA_U8 }, 4152 [RTA_SPORT] = { .type = NLA_U16 }, 4153 [RTA_DPORT] = { .type = NLA_U16 }, 4154 }; 4155 4156 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, 4157 struct fib6_config *cfg, 4158 struct netlink_ext_ack *extack) 4159 { 4160 struct rtmsg *rtm; 4161 struct nlattr *tb[RTA_MAX+1]; 4162 unsigned int pref; 4163 int err; 4164 4165 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, 4166 NULL); 4167 if (err < 0) 4168 goto errout; 4169 4170 err = -EINVAL; 4171 rtm = nlmsg_data(nlh); 4172 memset(cfg, 0, sizeof(*cfg)); 4173 4174 cfg->fc_table = rtm->rtm_table; 4175 cfg->fc_dst_len = rtm->rtm_dst_len; 4176 cfg->fc_src_len = rtm->rtm_src_len; 4177 cfg->fc_flags = RTF_UP; 4178 cfg->fc_protocol = rtm->rtm_protocol; 4179 cfg->fc_type = rtm->rtm_type; 4180 4181 if (rtm->rtm_type == RTN_UNREACHABLE || 4182 rtm->rtm_type == RTN_BLACKHOLE || 4183 rtm->rtm_type == RTN_PROHIBIT || 4184 rtm->rtm_type == RTN_THROW) 4185 cfg->fc_flags |= RTF_REJECT; 4186 4187 if (rtm->rtm_type == RTN_LOCAL) 4188 cfg->fc_flags |= RTF_LOCAL; 4189 4190 if (rtm->rtm_flags & RTM_F_CLONED) 4191 cfg->fc_flags |= RTF_CACHE; 4192 4193 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK); 4194 4195 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid; 4196 cfg->fc_nlinfo.nlh = nlh; 4197 cfg->fc_nlinfo.nl_net = sock_net(skb->sk); 4198 4199 if (tb[RTA_GATEWAY]) { 4200 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); 4201 cfg->fc_flags |= RTF_GATEWAY; 4202 } 4203 4204 if (tb[RTA_DST]) { 4205 int plen = (rtm->rtm_dst_len + 7) >> 3; 4206 4207 if (nla_len(tb[RTA_DST]) < plen) 4208 goto errout; 4209 4210 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen); 4211 } 4212 4213 if (tb[RTA_SRC]) { 4214 int plen = (rtm->rtm_src_len + 7) >> 3; 4215 4216 if (nla_len(tb[RTA_SRC]) < plen) 4217 goto errout; 4218 4219 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen); 4220 } 4221 4222 if (tb[RTA_PREFSRC]) 4223 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]); 4224 4225 if (tb[RTA_OIF]) 4226 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]); 4227 4228 if (tb[RTA_PRIORITY]) 4229 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]); 4230 4231 if (tb[RTA_METRICS]) { 4232 
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]		= { .len = sizeof(struct in6_addr) },
	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
	[RTA_OIF]		= { .type = NLA_U32 },
	[RTA_IIF]		= { .type = NLA_U32 },
	[RTA_PRIORITY]		= { .type = NLA_U32 },
	[RTA_METRICS]		= { .type = NLA_NESTED },
	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]		= { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
	[RTA_ENCAP]		= { .type = NLA_NESTED },
	[RTA_EXPIRES]		= { .type = NLA_U32 },
	[RTA_UID]		= { .type = NLA_U32 },
	[RTA_MARK]		= { .type = NLA_U32 },
	[RTA_TABLE]		= { .type = NLA_U32 },
	[RTA_IP_PROTO]		= { .type = NLA_U8 },
	[RTA_SPORT]		= { .type = NLA_U16 },
	[RTA_DPORT]		= { .type = NLA_U16 },
};

static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg,
			      struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  NULL);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	memset(cfg, 0, sizeof(*cfg));

	cfg->fc_table = rtm->rtm_table;
	cfg->fc_dst_len = rtm->rtm_dst_len;
	cfg->fc_src_len = rtm->rtm_src_len;
	cfg->fc_flags = RTF_UP;
	cfg->fc_protocol = rtm->rtm_protocol;
	cfg->fc_type = rtm->rtm_type;

	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);

	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
	cfg->fc_nlinfo.nlh = nlh;
	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}

	if (tb[RTA_DST]) {
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
						     cfg->fc_mp_len, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_PREF]) {
		pref = nla_get_u8(tb[RTA_PREF]);
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];

	if (tb[RTA_ENCAP_TYPE]) {
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_EXPIRES]) {
		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

		if (addrconf_finite_timeout(timeout)) {
			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
			cfg->fc_flags |= RTF_EXPIRES;
		}
	}

	err = 0;
errout:
	return err;
}
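
/* Illustrative example (editor's addition): a typical RTM_NEWROUTE request
 * as parsed by rtm_to_fib6_config() above. The command
 *
 *	ip -6 route add 2001:db8::/64 via fe80::1 dev eth0 metric 1024
 *
 * arrives as a struct rtmsg (rtm_family = AF_INET6, rtm_dst_len = 64,
 * rtm_type = RTN_UNICAST) followed by attributes roughly like:
 *
 *	RTA_DST      2001:db8::	     -> cfg->fc_dst / fc_dst_len
 *	RTA_GATEWAY  fe80::1	     -> cfg->fc_gateway, sets RTF_GATEWAY
 *	RTA_OIF      ifindex(eth0)   -> cfg->fc_ifindex
 *	RTA_PRIORITY 1024	     -> cfg->fc_metric
 */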
struct rt6_nh {
	struct fib6_info *fib6_info;
	struct fib6_config r_cfg;
	struct list_head next;
};

static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
{
	struct rt6_nh *nh;

	list_for_each_entry(nh, rt6_nh_list, next) {
		/* pr_fmt already prepends "IPv6: "; don't repeat the prefix */
		pr_warn("multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
			&nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
			nh->r_cfg.fc_ifindex);
	}
}

static int ip6_route_info_append(struct net *net,
				 struct list_head *rt6_nh_list,
				 struct fib6_info *rt,
				 struct fib6_config *r_cfg)
{
	struct rt6_nh *nh;
	int err = -EEXIST;

	list_for_each_entry(nh, rt6_nh_list, next) {
		/* check if fib6_info already exists */
		if (rt6_duplicate_nexthop(nh->fib6_info, rt))
			return err;
	}

	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
	if (!nh)
		return -ENOMEM;
	nh->fib6_info = rt;
	err = ip6_convert_metrics(net, rt, r_cfg);
	if (err) {
		kfree(nh);
		return err;
	}
	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
	list_add_tail(&nh->next, rt6_nh_list);

	return 0;
}

static void ip6_route_mpath_notify(struct fib6_info *rt,
				   struct fib6_info *rt_last,
				   struct nl_info *info,
				   __u16 nlflags)
{
	/* if this is an APPEND route, then rt points to the first route
	 * inserted and rt_last points to last route inserted. Userspace
	 * wants a consistent dump of the route which starts at the first
	 * nexthop. Since sibling routes are always added at the end of
	 * the list, find the first sibling of the last route appended
	 */
	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
		rt = list_first_entry(&rt_last->fib6_siblings,
				      struct fib6_info,
				      fib6_siblings);
	}

	if (rt)
		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
}
static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
	struct nl_info *info = &cfg->fc_nlinfo;
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	struct fib6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	__u16 nlflags;
	int remaining;
	int attrlen;
	int err = 1;
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);

	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * fib6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}

		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}
		if (!rt6_qualify_for_ecmp(rt)) {
			err = -EINVAL;
			NL_SET_ERR_MSG(extack,
				       "Device only routes can not be added for IPv6 using the multipath API.");
			fib6_info_release(rt);
			goto cleanup;
		}

		rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;

		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
					    rt, &r_cfg);
		if (err) {
			fib6_info_release(rt);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* for add and replace, send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done
	 */
	info->skip_notify = 1;

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		err = __ip6_ins_rt(nh->fib6_info, info, extack);
		fib6_info_release(nh->fib6_info);

		if (!err) {
			/* save reference to last route successfully inserted */
			rt_last = nh->fib6_info;

			/* save reference to first route for notification */
			if (!rt_notif)
				rt_notif = nh->fib6_info;
		}

		/* nh->fib6_info is used or freed at this point, reset to NULL */
		nh->fib6_info = NULL;
		if (err) {
			if (replace && nhn)
				ip6_print_replace_route_err(&rt6_nh_list);
			err_nh = nh;
			goto add_errout;
		}

		/* Because each route is added as if it were a single route,
		 * we remove these flags after the first nexthop: if there is
		 * a collision, we have already failed to add the first
		 * nexthop (fib6_add_rt2node() rejected it), and when
		 * replacing, the old nexthops have been replaced by the
		 * first new one, so the rest should be appended to it.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		nhn++;
	}

	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	goto cleanup;

add_errout:
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg, extack);
	}

cleanup:
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		if (nh->fib6_info)
			fib6_info_release(nh->fib6_info);
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}
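
/* Illustrative example (editor's addition): the function above is what
 * iproute2 exercises with
 *
 *	ip -6 route add 2001:db8::/64 \
 *		nexthop via fe80::1 dev eth0 weight 1 \
 *		nexthop via fe80::2 dev eth1 weight 2
 *
 * Each rtnexthop inside RTA_MULTIPATH becomes one fib6_info on rt6_nh_list
 * (rtnh_hops is weight minus one); the entries are inserted one by one, and
 * on any failure the ones already installed are rolled back with
 * ip6_route_del() under add_errout.
 */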
static int ip6_route_multipath_del(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	int remaining;
	int attrlen;
	int err = 1, last_err = 0;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry */
	while (rtnh_ok(rtnh, remaining)) {
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
		}
		err = ip6_route_del(&r_cfg, extack);
		if (err)
			last_err = err;

		rtnh = rtnh_next(rtnh, &remaining);
	}

	return last_err;
}

static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct fib6_config cfg;
	int err;

	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
	if (err < 0)
		return err;

	if (cfg.fc_mp)
		return ip6_route_multipath_del(&cfg, extack);
	else {
		cfg.fc_delete_all_nh = 1;
		return ip6_route_del(&cfg, extack);
	}
}

static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct fib6_config cfg;
	int err;

	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
	if (err < 0)
		return err;

	if (cfg.fc_mp)
		return ip6_route_multipath_add(&cfg, extack);
	else
		return ip6_route_add(&cfg, GFP_KERNEL, extack);
}
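
/* Illustrative example (editor's addition): deletion mirrors addition.
 *
 *	ip -6 route del 2001:db8::/64 \
 *		nexthop via fe80::1 dev eth0 \
 *		nexthop via fe80::2 dev eth1
 *
 * carries RTA_MULTIPATH, so inet6_rtm_delroute() dispatches to
 * ip6_route_multipath_del(), which removes each nexthop individually and
 * reports the last error seen; a plain "ip -6 route del 2001:db8::/64"
 * takes the fc_delete_all_nh branch instead.
 */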
static size_t rt6_nlmsg_size(struct fib6_info *rt)
{
	int nexthop_len = 0;

	if (rt->fib6_nsiblings) {
		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
			    + NLA_ALIGN(sizeof(struct rtnexthop))
			    + nla_total_size(16) /* RTA_GATEWAY */
			    + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);

		nexthop_len *= rt->fib6_nsiblings;
	}

	return NLMSG_ALIGN(sizeof(struct rtmsg))
	       + nla_total_size(16) /* RTA_SRC */
	       + nla_total_size(16) /* RTA_DST */
	       + nla_total_size(16) /* RTA_GATEWAY */
	       + nla_total_size(16) /* RTA_PREFSRC */
	       + nla_total_size(4) /* RTA_TABLE */
	       + nla_total_size(4) /* RTA_IIF */
	       + nla_total_size(4) /* RTA_OIF */
	       + nla_total_size(4) /* RTA_PRIORITY */
	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
	       + nla_total_size(sizeof(struct rta_cacheinfo))
	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
	       + nla_total_size(1) /* RTA_PREF */
	       + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
	       + nexthop_len;
}

static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
			    unsigned int *flags, bool skip_oif)
{
	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		*flags |= RTNH_F_DEAD;

	if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
		*flags |= RTNH_F_LINKDOWN;

		rcu_read_lock();
		if (fib6_ignore_linkdown(rt))
			*flags |= RTNH_F_DEAD;
		rcu_read_unlock();
	}

	if (rt->fib6_flags & RTF_GATEWAY) {
		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
			goto nla_put_failure;
	}

	*flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
	if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
		*flags |= RTNH_F_OFFLOAD;

	/* not needed for multipath encoding b/c it has a rtnexthop struct */
	if (!skip_oif && rt->fib6_nh.nh_dev &&
	    nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
		goto nla_put_failure;

	if (rt->fib6_nh.nh_lwtstate &&
	    lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

/* add multipath next hop */
static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
{
	const struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rtnexthop *rtnh;
	unsigned int flags = 0;

	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
	if (!rtnh)
		goto nla_put_failure;

	rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
	rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;

	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
		goto nla_put_failure;

	rtnh->rtnh_flags = flags;

	/* length of rtnetlink header + attributes */
	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
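
/* Sizing note (editor's addition): rt6_nlmsg_size() is an upper-bound
 * estimate used by inet6_rt_notify() to size notification skbs. Each
 * sibling of an ECMP route adds an rtnexthop header, an RTA_GATEWAY and
 * any encap attributes on top of the fixed single-route part. If the
 * estimate were ever too small, rt6_fill_node() would return -EMSGSIZE
 * and trip the WARN_ON() in inet6_rt_notify() below.
 */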
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	struct rtmsg *rtm;
	struct nlmsghdr *nlh;
	long expires = 0;
	u32 *pmetrics;
	u32 table;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt->fib6_dst.plen;
	rtm->rtm_src_len = rt->fib6_src.plen;
	rtm->rtm_tos = 0;
	if (rt->fib6_table)
		table = rt->fib6_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	rtm->rtm_table = table;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;

	rtm->rtm_type = rt->fib6_type;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->fib6_protocol;

	if (rt->fib6_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	if (dest) {
		if (nla_put_in6_addr(skb, RTA_DST, dest))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt->fib6_dst.addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt->fib6_src.addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		if (ipv6_addr_is_multicast(&rt->fib6_dst.addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dest) {
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->fib6_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->fib6_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (rt6_add_nexthop(skb, rt) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings, fib6_siblings) {
			if (rt6_add_nexthop(skb, sibling) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
			goto nla_put_failure;
	}

	if (rt->fib6_flags & RTF_EXPIRES) {
		expires = dst ? dst->expires : rt->expires;
		expires -= jiffies;
	}

	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->fib6_flags)))
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

int rt6_dump_route(struct fib6_info *rt, void *p_arg)
{
	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
	struct net *net = arg->net;

	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);

		/* user wants prefix routes only */
		if (rtm->rtm_flags & RTM_F_PREFIX &&
		    !(rt->fib6_flags & RTF_PREFIX_RT)) {
			/* success since this is not a prefix route */
			return 1;
		}
	}

	return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
			     RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
			     arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
}
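
/* Illustrative example (editor's addition): "ip -6 route show" issues an
 * RTM_GETROUTE dump request; the FIB walker calls rt6_dump_route() for each
 * fib6_info, which skips the null entry, honours the RTM_F_PREFIX filter,
 * and emits one RTM_NEWROUTE message per route through rt6_fill_node()
 * with NLM_F_MULTI set, since dump replies are multipart.
 */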
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	int err, iif = 0, oif = 0;
	struct fib6_info *from;
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6;
	bool fibmatch;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	memset(&fl6, 0, sizeof(fl6));
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

	if (tb[RTA_SPORT])
		fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);

	if (tb[RTA_DPORT])
		fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);

	if (tb[RTA_IP_PROTO]) {
		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
						  &fl6.flowi6_proto, extack);
		if (err)
			goto errout;
	}

	if (iif) {
		struct net_device *dev;
		int flags = 0;

		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			rcu_read_unlock();
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);

		rcu_read_unlock();
	} else {
		fl6.flowi6_oif = oif;

		dst = ip6_route_output(net, NULL, &fl6);
	}

	rt = container_of(dst, struct rt6_info, dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	skb_dst_set(skb, &rt->dst);

	rcu_read_lock();
	from = rcu_dereference(rt->from);

	if (fibmatch)
		err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	else
		err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
				    &fl6.saddr, iif, RTM_NEWROUTE,
				    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
				    0);
	rcu_read_unlock();

	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}
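
/* Illustrative example (editor's addition):
 *
 *	ip -6 route get 2001:db8::1
 *
 * lands here with RTA_DST set and performs a full ip6_route_output()
 * lookup, replying with the resolved dst entry. With
 * "ip -6 route get fibmatch 2001:db8::1" the RTM_F_FIB_MATCH flag is set
 * and the reply instead describes the matching FIB entry (rt->from)
 * without the cached-dst specifics.
 */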
void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
		     unsigned int nlm_flags)
{
	struct sk_buff *skb;
	struct net *net = info->nl_net;
	u32 seq;
	int err;

	err = -ENOBUFS;
	seq = info->nlh ? info->nlh->nlmsg_seq : 0;

	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
	if (!skb)
		goto errout;

	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
			    event, info->portid, seq, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}
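
/* Illustrative note (editor's addition): inet6_rt_notify() is the single
 * choke point for route-change notifications; it multicasts the message to
 * the RTNLGRP_IPV6_ROUTE netlink group. Userspace can observe these events
 * with
 *
 *	ip -6 monitor route
 *
 * which is simply a netlink socket subscribed to that group.
 */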
static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	} else if (event == NETDEV_UNREGISTER &&
		   dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER can be fired multiple times by
		 * netdev_wait_allrefs(). Make sure we only call this once.
		 */
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}

/*
 *	/proc
 */

#ifdef CONFIG_PROC_FS
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;
	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);

	return 0;
}
#endif	/* CONFIG_PROC_FS */

#ifdef CONFIG_SYSCTL

static
int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
			      void __user *buffer, size_t *lenp, loff_t *ppos)
{
	struct net *net;
	int delay;
	int ret;

	if (!write)
		return -EINVAL;

	net = (struct net *)ctl->extra1;
	delay = net->ipv6.sysctl.flush_delay;
	/* Don't ignore a parse failure from proc_dointvec() */
	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
	if (ret)
		return ret;
	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
	return 0;
}

struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	= "flush",
		.data		= &init_net.ipv6.sysctl.flush_delay,
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv6_sysctl_rtcache_flush
	},
	{
		.procname	= "gc_thresh",
		.data		= &ip6_dst_ops_template.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_min_interval",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{ }
};

struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
#endif
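
/* Usage note (editor's addition): once ipv6_route_sysctl_init() has wired
 * the template up per network namespace,
 *
 *	echo 1 > /proc/sys/net/ipv6/route/flush
 *
 * invokes ipv6_sysctl_rtcache_flush() above and forces a fib6_run_gc()
 * pass, while reading /proc/net/rt6_stats (rt6_stats_seq_show() above)
 * reports the FIB node, route and cache counters as seven hex fields.
 */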
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}

static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.fib6_null_entry);
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
			sizeof(struct ipv6_route_iter));
	proc_create_net_single("rt6_stats", 0444, net->proc_net,
			       rt6_stats_seq_show, NULL);
#endif
	return 0;
}

static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}

static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};

static int __net_init ipv6_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv6.peers = bp;
	return 0;
}

static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv6.peers;

	net->ipv6.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static struct pernet_operations ipv6_inetpeer_ops = {
	.init = ipv6_inetpeer_init,
	.exit = ipv6_inetpeer_exit,
};

static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};

static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};

void __init ip6_route_init_special_entries(void)
{
	/* The loopback device is registered before this code runs, so the
	 * loopback reference in rt6_info is not taken automatically; take
	 * it manually for init_net.
	 */
	init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#endif
}

int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}