/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable.  otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <net/ip.h>
#include <linux/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

static int ip6_rt_type_to_error(u8 fib6_type);

#define CREATE_TRACE_POINTS
#include <trace/events/fib6.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
#undef CREATE_TRACE_POINTS

enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};

static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ip6_default_advmss(const struct dst_entry *dst);
static unsigned int ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void ip6_dst_destroy(struct dst_entry *);
static void ip6_dst_ifdown(struct dst_entry *,
			   struct net_device *dev, int how);
static int ip6_dst_gc(struct dst_ops *ops);

static int ip6_pkt_discard(struct sk_buff *skb);
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void ip6_link_failure(struct sk_buff *skb);
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu);
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
			    struct sk_buff *skb);
static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct fib6_info *rt);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_add_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev,
					    unsigned int pref);
static struct fib6_info *rt6_get_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev);
#endif

struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);

void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}

static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
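/* Uncached routes are not owned by the fib6 tree, so the per-cpu lists
 * above are the only handle on them when their device goes away:
 * rt6_uncached_list_flush_dev() re-targets such dsts at the loopback
 * device so they can safely outlive the unregistered netdevice.
 */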
static inline const void *choose_neigh_daddr(const struct in6_addr *p,
					     struct sk_buff *skb,
					     const void *daddr)
{
	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}

struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
{
	struct neighbour *n;

	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	if (n)
		return n;
	return neigh_create(&nd_tbl, daddr, dev);
}

static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
}

static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}

static struct dst_ops ip6_dst_ops_template = {
	.family			= AF_INET6,
	.gc			= ip6_dst_gc,
	.gc_thresh		= 1024,
	.check			= ip6_dst_check,
	.default_advmss		= ip6_default_advmss,
	.mtu			= ip6_mtu,
	.cow_metrics		= dst_cow_metrics_generic,
	.destroy		= ip6_dst_destroy,
	.ifdown			= ip6_dst_ifdown,
	.negative_advice	= ip6_negative_advice,
	.link_failure		= ip6_link_failure,
	.update_pmtu		= ip6_rt_update_pmtu,
	.redirect		= rt6_do_redirect,
	.local_out		= __ip6_local_out,
	.neigh_lookup		= ip6_dst_neigh_lookup,
	.confirm_neigh		= ip6_confirm_neigh,
};

static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

static struct dst_ops ip6_dst_blackhole_ops = {
	.family			= AF_INET6,
	.destroy		= ip6_dst_destroy,
	.check			= ip6_dst_check,
	.mtu			= ip6_blackhole_mtu,
	.default_advmss		= ip6_default_advmss,
	.update_pmtu		= ip6_rt_blackhole_update_pmtu,
	.redirect		= ip6_rt_blackhole_redirect,
	.cow_metrics		= dst_cow_metrics_generic,
	.neigh_lookup		= ip6_dst_neigh_lookup,
};

static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol	= RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};

static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif

static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}

/* allocate dst with ip6_dst_ops */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}
	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);

static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	rcu_assign_pointer(rt->from, NULL);
	fib6_info_release(from);
	rcu_read_unlock();
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}

static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}

struct fib6_info *fib6_multipath_select(const struct net *net,
					struct fib6_info *match,
					struct flowi6 *fl6, int oif,
					const struct sk_buff *skb,
					int strict)
{
	struct fib6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}
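/* A sketch of the selection scheme above: multipath here is
 * hash-threshold (RFC 2992 style).  Each sibling nexthop owns a
 * contiguous slice of the 31-bit hash space bounded above by
 * nh_upper_bound, and the flow hash picks the first sibling whose bound
 * it does not exceed; e.g. with two equal-weight nexthops the bounds are
 * roughly 0x3fffffff and 0x7fffffff, splitting flows evenly.
 */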
/*
 *	Route lookup. rcu_read_lock() should be held.
 */

static inline struct fib6_info *rt6_device_match(struct net *net,
						 struct fib6_info *rt,
						 const struct in6_addr *saddr,
						 int oif,
						 int flags)
{
	struct fib6_info *sprt;

	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
		const struct net_device *dev = sprt->fib6_nh.nh_dev;

		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE)
		return net->ipv6.fib6_null_entry;

	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ?
		net->ipv6.fib6_null_entry : rt;
}

#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}

static void rt6_probe(struct fib6_info *rt)
{
	struct __rt6_probe_work *work;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
		return;

	nh_gw = &rt->fib6_nh.nh_gw;
	dev = rt->fib6_nh.nh_dev;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		struct inet6_dev *idev;

		if (neigh->nud_state & NUD_VALID)
			goto out;

		idev = __in6_dev_get(dev);
		work = NULL;
		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct fib6_info *rt)
{
}
#endif
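/* Note that rt6_probe() never transmits from this context: the actual
 * neighbour solicitation is deferred to a workqueue (rt6_probe_deferred)
 * so that ndisc_send_ns() runs outside the rcu_read_lock_bh/neigh lock
 * held above.
 */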
/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
static inline int rt6_check_dev(struct fib6_info *rt, int oif)
{
	const struct net_device *dev = rt->fib6_nh.nh_dev;

	if (!oif || dev->ifindex == oif)
		return 2;
	return 0;
}

static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	if (rt->fib6_flags & RTF_NONEXTHOP ||
	    !(rt->fib6_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
					  &rt->fib6_nh.nh_gw);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}

static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}

/* called with rcu_read_lock held */
static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
{
	const struct net_device *dev = fib6_info_nh_dev(f6i);
	bool rc = false;

	if (dev) {
		const struct inet6_dev *idev = __in6_dev_get(dev);

		rc = !!idev->cnf.ignore_routes_with_linkdown;
	}

	return rc;
}

static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
				    int *mpri, struct fib6_info *match,
				    bool *do_rr)
{
	int m;
	bool match_do_rr = false;

	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		goto out;

	if (fib6_ignore_linkdown(rt) &&
	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (fib6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}

static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
				      struct fib6_info *leaf,
				      struct fib6_info *rr_head,
				      u32 metric, int oif, int strict,
				      bool *do_rr)
{
	struct fib6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
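/* find_rr_leaf() scans the routes of one metric in two passes -- from
 * rr_head to the end of the list, then from the leaf up to rr_head -- so
 * that round-robin can start mid-list without missing any sibling.
 * Routes at a different (worse) metric are only consulted, via 'cont',
 * when nothing at the current metric matched.
 */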
static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
				    int oif, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.fib6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.fib6_null_entry;
}

static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
{
	return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
}

#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
#endif
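/* In the Route Information Option handled above, rinfo->length is in
 * units of 8 octets (RFC 4191): only length 3 carries the full 128-bit
 * prefix, while shorter options hold a truncated prefix that
 * ipv6_addr_prefix() zero-extends into prefix_buf before the lookup.
 */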
/*
 *	Misc support functions
 */

/* called with rcu_lock held */
static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
{
	struct net_device *dev = rt->fib6_nh.nh_dev;

	if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}

static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};

static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}

static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
{
	unsigned short flags = 0;

	if (rt->dst_nocount)
		flags |= DST_NOCOUNT;
	if (rt->dst_nopolicy)
		flags |= DST_NOPOLICY;
	if (rt->dst_host)
		flags |= DST_HOST;

	return flags;
}

static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);

	switch (ort->fib6_type) {
	case RTN_BLACKHOLE:
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}

static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.flags |= fib6_info_dst_flags(ort);

	if (ort->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, ort);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	if (ort->fib6_type == RTN_LOCAL) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (ort->fib6_nh.nh_lwtstate) {
		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}

/* Caller must already hold reference to @from */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	rcu_assign_pointer(rt->from, from);
	dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
}

/* Caller must already hold reference to @ort */
static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
{
	struct net_device *dev = fib6_info_nh_dev(ort);

	ip6_rt_init_dst(rt, ort);

	rt->rt6i_dst = ort->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
	rt->rt6i_flags = ort->fib6_flags;
	rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->fib6_src;
#endif
	rt->rt6i_prefsrc = ort->fib6_prefsrc;
	rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
}

static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}

static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
			  bool null_fallback)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (null_fallback) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}

/* called with rcu_lock held */
static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rt6_info *nrt;

	if (!fib6_info_hold_safe(rt))
		return NULL;

	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (nrt)
		ip6_rt_copy_init(nrt, rt);
	else
		fib6_info_release(rt);

	return nrt;
}

static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_info *f6i;
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	f6i = rcu_dereference(fn->leaf);
	if (!f6i) {
		f6i = net->ipv6.fib6_null_entry;
	} else {
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
				       fl6->flowi6_oif, flags);
		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
			f6i = fib6_multipath_select(net, f6i, fl6,
						    fl6->flowi6_oif, skb,
						    flags);
	}
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);
	} else if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = ip6_create_rt_rcu(f6i);
		if (!rt) {
			rt = net->ipv6.ip6_null_entry;
			dst_hold(&rt->dst);
		}
	}

	rcu_read_unlock();

	return rt;
}

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);

struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);

/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason, the
 * route is released.
 * Caller must hold dst before calling it.
 */

static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = { .nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}

static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (!fib6_info_hold_safe(ort))
		return NULL;

	dev = ip6_rt_get_dev_rcu(ort);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt) {
		fib6_info_release(ort);
		return NULL;
	}

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}

static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	if (!fib6_info_hold_safe(rt))
		return NULL;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt) {
		fib6_info_release(rt);
		return NULL;
	}
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt, false);

	return pcpu_rt;
}
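/* Each fib6_info carries a per-cpu cache (rt6i_pcpu) of rt6_info clones.
 * rt6_get_pcpu_route() only reads the current CPU's slot under RCU;
 * rt6_make_pcpu_route() below fills an empty slot with cmpxchg(), and its
 * BUG_ON(prev) relies on the caller keeping bh disabled (see
 * ip6_pol_route()) so no other context can race on the same slot.
 */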
static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	return pcpu_rt;
}

/* exception hash table implementation
 */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
	net->ipv6.rt6_stats->fib_rt_cache--;
}

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}

static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
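/* A sketch of the keying above: the destination address (and, with
 * CONFIG_IPV6_SUBTREES, the source) is jhash'ed with a boot-time random
 * seed, then folded by hash_32() into one of
 * 2^FIB6_EXCEPTION_BUCKET_SIZE_SHIFT buckets, so e.g. a PMTU exception
 * for 2001:db8::1 always lands in the same bucket of its parent route.
 */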
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

static unsigned int fib6_mtu(const struct fib6_info *rt)
{
	unsigned int mtu;

	if (rt->fib6_pmtu) {
		mtu = rt->fib6_pmtu;
	} else {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
}

static int rt6_insert_exception(struct rt6_info *nrt,
				struct fib6_info *ort)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
	nrt->rt6i_prefsrc = ort->fib6_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->fib6_table->tb6_lock);
		fib6_update_sernum(net, ort);
		spin_unlock_bh(&ort->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}

void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() from recreating the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}
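/* Setting exception_bucket_flushed under rt6_exception_lock is what keeps
 * rt6_insert_exception() from re-allocating the bucket array while the
 * fib6_info is being torn down: once the flush has run, insertion fails
 * with -EINVAL.
 */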
/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->fib6_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}

/* Remove the passed in cached rt from the hash table that contains it */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	from = rcu_dereference(rt->from);
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}

/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct fib6_info *from = rt->from;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;

	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return;

	rcu_read_lock();
	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

	rcu_read_unlock();
}

static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
				rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
			}
			bucket++;
		}
	}
}

static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
{
	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
	 *
	 * If the new MTU is higher, and the route PMTU is equal to the local
	 * MTU, this means the old MTU is the lowest in the path, so allow
	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
	 * handle this.
	 */

	if (dst_mtu(&rt->dst) >= mtu)
		return true;

	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
		return true;

	return false;
}

static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}

#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}
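/* Worked example for rt6_mtu_change_route_allowed(), assuming a cached
 * PMTU of 1400 on a link whose MTU was 1500: lowering the link MTU to
 * 1280 is propagated to the exception (1400 >= 1280), but raising it to
 * 9000 is not, since 1400 is neither >= 9000 nor equal to the local MTU;
 * regular PMTU discovery is left to find the larger path.
 */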
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others still have references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones - are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}

void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}

/* must be called with rcu lock held */
struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
				    int oif, struct flowi6 *fl6, int strict)
{
	struct fib6_node *fn, *saved_fn;
	struct fib6_info *f6i;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	f6i = rt6_select(net, fn, oif, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	return f6i;
}

struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_info *f6i;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	f6i = fib6_table_lookup(net, table, oif, fl6, strict);
	if (f6i->fib6_nsiblings)
		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);

	if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		return rt;
	}

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(f6i->fib6_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		return uncached_rt;
	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(f6i);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, f6i);

		local_bh_enable();
		rcu_read_unlock();

		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);

static struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
}

struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6,
					 const struct sk_buff *skb,
					 int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
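/* ip6_multipath_l3_keys() below hashes ICMPv6 errors on the *inner*
 * (offending) header rather than the outer one, so that e.g. a Packet
 * Too Big follows the same multipath leg as the flow it refers to; for
 * such packets any pre-dissected flkeys are discarded.
 */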
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowlabel(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}

/* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}

void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}

static struct rt6_info *ip6_pol_route_output(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
}

struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	if (rt6_need_strict(&fl6->daddr)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
dst; 2108 } 2109 2110 fl6->flowi6_iif = LOOPBACK_IFINDEX; 2111 2112 any_src = ipv6_addr_any(&fl6->saddr); 2113 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) || 2114 (fl6->flowi6_oif && any_src)) 2115 flags |= RT6_LOOKUP_F_IFACE; 2116 2117 if (!any_src) 2118 flags |= RT6_LOOKUP_F_HAS_SADDR; 2119 else if (sk) 2120 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs); 2121 2122 return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output); 2123 } 2124 EXPORT_SYMBOL_GPL(ip6_route_output_flags); 2125 2126 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) 2127 { 2128 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig; 2129 struct net_device *loopback_dev = net->loopback_dev; 2130 struct dst_entry *new = NULL; 2131 2132 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1, 2133 DST_OBSOLETE_DEAD, 0); 2134 if (rt) { 2135 rt6_info_init(rt); 2136 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); 2137 2138 new = &rt->dst; 2139 new->__use = 1; 2140 new->input = dst_discard; 2141 new->output = dst_discard_out; 2142 2143 dst_copy_metrics(new, &ort->dst); 2144 2145 rt->rt6i_idev = in6_dev_get(loopback_dev); 2146 rt->rt6i_gateway = ort->rt6i_gateway; 2147 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU; 2148 2149 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); 2150 #ifdef CONFIG_IPV6_SUBTREES 2151 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); 2152 #endif 2153 } 2154 2155 dst_release(dst_orig); 2156 return new ? new : ERR_PTR(-ENOMEM); 2157 } 2158 2159 /* 2160 * Destination cache support functions 2161 */ 2162 2163 static bool fib6_check(struct fib6_info *f6i, u32 cookie) 2164 { 2165 u32 rt_cookie = 0; 2166 2167 if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie) 2168 return false; 2169 2170 if (fib6_check_expired(f6i)) 2171 return false; 2172 2173 return true; 2174 } 2175 2176 static struct dst_entry *rt6_check(struct rt6_info *rt, 2177 struct fib6_info *from, 2178 u32 cookie) 2179 { 2180 u32 rt_cookie = 0; 2181 2182 if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) || 2183 rt_cookie != cookie) 2184 return NULL; 2185 2186 if (rt6_check_expired(rt)) 2187 return NULL; 2188 2189 return &rt->dst; 2190 } 2191 2192 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, 2193 struct fib6_info *from, 2194 u32 cookie) 2195 { 2196 if (!__rt6_check_expired(rt) && 2197 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && 2198 fib6_check(from, cookie)) 2199 return &rt->dst; 2200 else 2201 return NULL; 2202 } 2203 2204 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) 2205 { 2206 struct dst_entry *dst_ret; 2207 struct fib6_info *from; 2208 struct rt6_info *rt; 2209 2210 rt = container_of(dst, struct rt6_info, dst); 2211 2212 rcu_read_lock(); 2213 2214 /* All IPV6 dsts are created with ->obsolete set to the value 2215 * DST_OBSOLETE_FORCE_CHK which forces validation calls down 2216 * into this function always. 
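* The cookie a caller hands in (e.g. inet6_sk(sk)->dst_cookie, see
 * ip6_sk_update_pmtu() below) is the fib6 tree sernum sampled when the
 * dst was stored; rt6_check() compares it against the current sernum
 * via fib6_get_cookie_safe(), so any tree change fails validation and
 * forces the caller to redo the route lookup.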
2217 */ 2218 2219 from = rcu_dereference(rt->from); 2220 2221 if (from && (rt->rt6i_flags & RTF_PCPU || 2222 unlikely(!list_empty(&rt->rt6i_uncached)))) 2223 dst_ret = rt6_dst_from_check(rt, from, cookie); 2224 else 2225 dst_ret = rt6_check(rt, from, cookie); 2226 2227 rcu_read_unlock(); 2228 2229 return dst_ret; 2230 } 2231 2232 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) 2233 { 2234 struct rt6_info *rt = (struct rt6_info *) dst; 2235 2236 if (rt) { 2237 if (rt->rt6i_flags & RTF_CACHE) { 2238 rcu_read_lock(); 2239 if (rt6_check_expired(rt)) { 2240 rt6_remove_exception_rt(rt); 2241 dst = NULL; 2242 } 2243 rcu_read_unlock(); 2244 } else { 2245 dst_release(dst); 2246 dst = NULL; 2247 } 2248 } 2249 return dst; 2250 } 2251 2252 static void ip6_link_failure(struct sk_buff *skb) 2253 { 2254 struct rt6_info *rt; 2255 2256 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); 2257 2258 rt = (struct rt6_info *) skb_dst(skb); 2259 if (rt) { 2260 rcu_read_lock(); 2261 if (rt->rt6i_flags & RTF_CACHE) { 2262 if (dst_hold_safe(&rt->dst)) 2263 rt6_remove_exception_rt(rt); 2264 } else { 2265 struct fib6_info *from; 2266 struct fib6_node *fn; 2267 2268 from = rcu_dereference(rt->from); 2269 if (from) { 2270 fn = rcu_dereference(from->fib6_node); 2271 if (fn && (rt->rt6i_flags & RTF_DEFAULT)) 2272 fn->fn_sernum = -1; 2273 } 2274 } 2275 rcu_read_unlock(); 2276 } 2277 } 2278 2279 static void rt6_update_expires(struct rt6_info *rt0, int timeout) 2280 { 2281 if (!(rt0->rt6i_flags & RTF_EXPIRES)) { 2282 struct fib6_info *from; 2283 2284 rcu_read_lock(); 2285 from = rcu_dereference(rt0->from); 2286 if (from) 2287 rt0->dst.expires = from->expires; 2288 rcu_read_unlock(); 2289 } 2290 2291 dst_set_expires(&rt0->dst, timeout); 2292 rt0->rt6i_flags |= RTF_EXPIRES; 2293 } 2294 2295 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) 2296 { 2297 struct net *net = dev_net(rt->dst.dev); 2298 2299 dst_metric_set(&rt->dst, RTAX_MTU, mtu); 2300 rt->rt6i_flags |= RTF_MODIFIED; 2301 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires); 2302 } 2303 2304 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) 2305 { 2306 bool from_set; 2307 2308 rcu_read_lock(); 2309 from_set = !!rcu_dereference(rt->from); 2310 rcu_read_unlock(); 2311 2312 return !(rt->rt6i_flags & RTF_CACHE) && 2313 (rt->rt6i_flags & RTF_PCPU || from_set); 2314 } 2315 2316 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, 2317 const struct ipv6hdr *iph, u32 mtu) 2318 { 2319 const struct in6_addr *daddr, *saddr; 2320 struct rt6_info *rt6 = (struct rt6_info *)dst; 2321 2322 if (dst_metric_locked(dst, RTAX_MTU)) 2323 return; 2324 2325 if (iph) { 2326 daddr = &iph->daddr; 2327 saddr = &iph->saddr; 2328 } else if (sk) { 2329 daddr = &sk->sk_v6_daddr; 2330 saddr = &inet6_sk(sk)->saddr; 2331 } else { 2332 daddr = NULL; 2333 saddr = NULL; 2334 } 2335 dst_confirm_neigh(dst, daddr); 2336 mtu = max_t(u32, mtu, IPV6_MIN_MTU); 2337 if (mtu >= dst_mtu(dst)) 2338 return; 2339 2340 if (!rt6_cache_allowed_for_pmtu(rt6)) { 2341 rt6_do_update_pmtu(rt6, mtu); 2342 /* update rt6_ex->stamp for cache */ 2343 if (rt6->rt6i_flags & RTF_CACHE) 2344 rt6_update_exception_stamp_rt(rt6); 2345 } else if (daddr) { 2346 struct fib6_info *from; 2347 struct rt6_info *nrt6; 2348 2349 rcu_read_lock(); 2350 from = rcu_dereference(rt6->from); 2351 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr); 2352 if (nrt6) { 2353 rt6_do_update_pmtu(nrt6, mtu); 2354 if (rt6_insert_exception(nrt6, from)) 2355 
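/* insertion can fail; the clone has not been published
 * anywhere yet, so it is safe to free it immediately:
 */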
dst_release_immediate(&nrt6->dst); 2356 } 2357 rcu_read_unlock(); 2358 } 2359 } 2360 2361 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, 2362 struct sk_buff *skb, u32 mtu) 2363 { 2364 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu); 2365 } 2366 2367 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, 2368 int oif, u32 mark, kuid_t uid) 2369 { 2370 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2371 struct dst_entry *dst; 2372 struct flowi6 fl6; 2373 2374 memset(&fl6, 0, sizeof(fl6)); 2375 fl6.flowi6_oif = oif; 2376 fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark); 2377 fl6.daddr = iph->daddr; 2378 fl6.saddr = iph->saddr; 2379 fl6.flowlabel = ip6_flowinfo(iph); 2380 fl6.flowi6_uid = uid; 2381 2382 dst = ip6_route_output(net, NULL, &fl6); 2383 if (!dst->error) 2384 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu)); 2385 dst_release(dst); 2386 } 2387 EXPORT_SYMBOL_GPL(ip6_update_pmtu); 2388 2389 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) 2390 { 2391 struct dst_entry *dst; 2392 2393 ip6_update_pmtu(skb, sock_net(sk), mtu, 2394 sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid); 2395 2396 dst = __sk_dst_get(sk); 2397 if (!dst || !dst->obsolete || 2398 dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) 2399 return; 2400 2401 bh_lock_sock(sk); 2402 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) 2403 ip6_datagram_dst_update(sk, false); 2404 bh_unlock_sock(sk); 2405 } 2406 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu); 2407 2408 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, 2409 const struct flowi6 *fl6) 2410 { 2411 #ifdef CONFIG_IPV6_SUBTREES 2412 struct ipv6_pinfo *np = inet6_sk(sk); 2413 #endif 2414 2415 ip6_dst_store(sk, dst, 2416 ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ? 2417 &sk->sk_v6_daddr : NULL, 2418 #ifdef CONFIG_IPV6_SUBTREES 2419 ipv6_addr_equal(&fl6->saddr, &np->saddr) ? 2420 &np->saddr : 2421 #endif 2422 NULL); 2423 } 2424 2425 /* Handle redirects */ 2426 struct ip6rd_flowi { 2427 struct flowi6 fl6; 2428 struct in6_addr gateway; 2429 }; 2430 2431 static struct rt6_info *__ip6_route_redirect(struct net *net, 2432 struct fib6_table *table, 2433 struct flowi6 *fl6, 2434 const struct sk_buff *skb, 2435 int flags) 2436 { 2437 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; 2438 struct rt6_info *ret = NULL, *rt_cache; 2439 struct fib6_info *rt; 2440 struct fib6_node *fn; 2441 2442 /* Get the "current" route for this destination and 2443 * check if the redirect has come from appropriate router. 2444 * 2445 * RFC 4861 specifies that redirects should only be 2446 * accepted if they come from the nexthop to the target. 2447 * Due to the way the routes are chosen, this notion 2448 * is a bit fuzzy and one might need to check all possible 2449 * routes. 2450 */ 2451 2452 rcu_read_lock(); 2453 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 2454 restart: 2455 for_each_fib6_node_rt_rcu(fn) { 2456 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) 2457 continue; 2458 if (fib6_check_expired(rt)) 2459 continue; 2460 if (rt->fib6_flags & RTF_REJECT) 2461 break; 2462 if (!(rt->fib6_flags & RTF_GATEWAY)) 2463 continue; 2464 if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex) 2465 continue; 2466 /* rt_cache's gateway might be different from its 'parent' 2467 * in the case of an ip redirect. 2468 * So we keep searching in the exception table if the gateway 2469 * is different. 
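* (Only a RTF_CACHE clone created by an earlier redirect records the
 * new gateway, which is why rt6_find_cached_rt() is consulted here.)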
2470 */ 2471 if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) { 2472 rt_cache = rt6_find_cached_rt(rt, 2473 &fl6->daddr, 2474 &fl6->saddr); 2475 if (rt_cache && 2476 ipv6_addr_equal(&rdfl->gateway, 2477 &rt_cache->rt6i_gateway)) { 2478 ret = rt_cache; 2479 break; 2480 } 2481 continue; 2482 } 2483 break; 2484 } 2485 2486 if (!rt) 2487 rt = net->ipv6.fib6_null_entry; 2488 else if (rt->fib6_flags & RTF_REJECT) { 2489 ret = net->ipv6.ip6_null_entry; 2490 goto out; 2491 } 2492 2493 if (rt == net->ipv6.fib6_null_entry) { 2494 fn = fib6_backtrack(fn, &fl6->saddr); 2495 if (fn) 2496 goto restart; 2497 } 2498 2499 out: 2500 if (ret) 2501 ip6_hold_safe(net, &ret, true); 2502 else 2503 ret = ip6_create_rt_rcu(rt); 2504 2505 rcu_read_unlock(); 2506 2507 trace_fib6_table_lookup(net, rt, table, fl6); 2508 return ret; 2509 }; 2510 2511 static struct dst_entry *ip6_route_redirect(struct net *net, 2512 const struct flowi6 *fl6, 2513 const struct sk_buff *skb, 2514 const struct in6_addr *gateway) 2515 { 2516 int flags = RT6_LOOKUP_F_HAS_SADDR; 2517 struct ip6rd_flowi rdfl; 2518 2519 rdfl.fl6 = *fl6; 2520 rdfl.gateway = *gateway; 2521 2522 return fib6_rule_lookup(net, &rdfl.fl6, skb, 2523 flags, __ip6_route_redirect); 2524 } 2525 2526 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, 2527 kuid_t uid) 2528 { 2529 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2530 struct dst_entry *dst; 2531 struct flowi6 fl6; 2532 2533 memset(&fl6, 0, sizeof(fl6)); 2534 fl6.flowi6_iif = LOOPBACK_IFINDEX; 2535 fl6.flowi6_oif = oif; 2536 fl6.flowi6_mark = mark; 2537 fl6.daddr = iph->daddr; 2538 fl6.saddr = iph->saddr; 2539 fl6.flowlabel = ip6_flowinfo(iph); 2540 fl6.flowi6_uid = uid; 2541 2542 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr); 2543 rt6_do_redirect(dst, NULL, skb); 2544 dst_release(dst); 2545 } 2546 EXPORT_SYMBOL_GPL(ip6_redirect); 2547 2548 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif, 2549 u32 mark) 2550 { 2551 const struct ipv6hdr *iph = ipv6_hdr(skb); 2552 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb); 2553 struct dst_entry *dst; 2554 struct flowi6 fl6; 2555 2556 memset(&fl6, 0, sizeof(fl6)); 2557 fl6.flowi6_iif = LOOPBACK_IFINDEX; 2558 fl6.flowi6_oif = oif; 2559 fl6.flowi6_mark = mark; 2560 fl6.daddr = msg->dest; 2561 fl6.saddr = iph->daddr; 2562 fl6.flowi6_uid = sock_net_uid(net, NULL); 2563 2564 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr); 2565 rt6_do_redirect(dst, NULL, skb); 2566 dst_release(dst); 2567 } 2568 2569 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) 2570 { 2571 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark, 2572 sk->sk_uid); 2573 } 2574 EXPORT_SYMBOL_GPL(ip6_sk_redirect); 2575 2576 static unsigned int ip6_default_advmss(const struct dst_entry *dst) 2577 { 2578 struct net_device *dev = dst->dev; 2579 unsigned int mtu = dst_mtu(dst); 2580 struct net *net = dev_net(dev); 2581 2582 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); 2583 2584 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) 2585 mtu = net->ipv6.sysctl.ip6_rt_min_advmss; 2586 2587 /* 2588 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 2589 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 
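* (with a bare 20-byte TCP header that is 65535 - 20 = 65515 bytes).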
2590 * IPV6_MAXPLEN is also valid and means: "any MSS, 2591 * rely only on pmtu discovery" 2592 */ 2593 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr)) 2594 mtu = IPV6_MAXPLEN; 2595 return mtu; 2596 } 2597 2598 static unsigned int ip6_mtu(const struct dst_entry *dst) 2599 { 2600 struct inet6_dev *idev; 2601 unsigned int mtu; 2602 2603 mtu = dst_metric_raw(dst, RTAX_MTU); 2604 if (mtu) 2605 goto out; 2606 2607 mtu = IPV6_MIN_MTU; 2608 2609 rcu_read_lock(); 2610 idev = __in6_dev_get(dst->dev); 2611 if (idev) 2612 mtu = idev->cnf.mtu6; 2613 rcu_read_unlock(); 2614 2615 out: 2616 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 2617 2618 return mtu - lwtunnel_headroom(dst->lwtstate, mtu); 2619 } 2620 2621 /* MTU selection: 2622 * 1. mtu on route is locked - use it 2623 * 2. mtu from nexthop exception 2624 * 3. mtu from egress device 2625 * 2626 * based on ip6_dst_mtu_forward and exception logic of 2627 * rt6_find_cached_rt; called with rcu_read_lock 2628 */ 2629 u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr, 2630 struct in6_addr *saddr) 2631 { 2632 struct rt6_exception_bucket *bucket; 2633 struct rt6_exception *rt6_ex; 2634 struct in6_addr *src_key; 2635 struct inet6_dev *idev; 2636 u32 mtu = 0; 2637 2638 if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) { 2639 mtu = f6i->fib6_pmtu; 2640 if (mtu) 2641 goto out; 2642 } 2643 2644 src_key = NULL; 2645 #ifdef CONFIG_IPV6_SUBTREES 2646 if (f6i->fib6_src.plen) 2647 src_key = saddr; 2648 #endif 2649 2650 bucket = rcu_dereference(f6i->rt6i_exception_bucket); 2651 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); 2652 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i)) 2653 mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU); 2654 2655 if (likely(!mtu)) { 2656 struct net_device *dev = fib6_info_nh_dev(f6i); 2657 2658 mtu = IPV6_MIN_MTU; 2659 idev = __in6_dev_get(dev); 2660 if (idev && idev->cnf.mtu6 > mtu) 2661 mtu = idev->cnf.mtu6; 2662 } 2663 2664 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 2665 out: 2666 return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu); 2667 } 2668 2669 struct dst_entry *icmp6_dst_alloc(struct net_device *dev, 2670 struct flowi6 *fl6) 2671 { 2672 struct dst_entry *dst; 2673 struct rt6_info *rt; 2674 struct inet6_dev *idev = in6_dev_get(dev); 2675 struct net *net = dev_net(dev); 2676 2677 if (unlikely(!idev)) 2678 return ERR_PTR(-ENODEV); 2679 2680 rt = ip6_dst_alloc(net, dev, 0); 2681 if (unlikely(!rt)) { 2682 in6_dev_put(idev); 2683 dst = ERR_PTR(-ENOMEM); 2684 goto out; 2685 } 2686 2687 rt->dst.flags |= DST_HOST; 2688 rt->dst.input = ip6_input; 2689 rt->dst.output = ip6_output; 2690 rt->rt6i_gateway = fl6->daddr; 2691 rt->rt6i_dst.addr = fl6->daddr; 2692 rt->rt6i_dst.plen = 128; 2693 rt->rt6i_idev = idev; 2694 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0); 2695 2696 /* Add this dst into uncached_list so that rt6_disable_ip() can 2697 * do proper release of the net_device 2698 */ 2699 rt6_uncached_list_add(rt); 2700 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); 2701 2702 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0); 2703 2704 out: 2705 return dst; 2706 } 2707 2708 static int ip6_dst_gc(struct dst_ops *ops) 2709 { 2710 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); 2711 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; 2712 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size; 2713 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; 2714 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; 2715 unsigned long rt_last_gc = 
net->ipv6.ip6_rt_last_gc; 2716 int entries; 2717 2718 entries = dst_entries_get_fast(ops); 2719 if (time_after(rt_last_gc + rt_min_interval, jiffies) && 2720 entries <= rt_max_size) 2721 goto out; 2722 2723 net->ipv6.ip6_rt_gc_expire++; 2724 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true); 2725 entries = dst_entries_get_slow(ops); 2726 if (entries < ops->gc_thresh) 2727 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; 2728 out: 2729 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity; 2730 return entries > rt_max_size; 2731 } 2732 2733 static int ip6_convert_metrics(struct net *net, struct fib6_info *rt, 2734 struct fib6_config *cfg) 2735 { 2736 struct dst_metrics *p; 2737 2738 if (!cfg->fc_mx) 2739 return 0; 2740 2741 p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL); 2742 if (unlikely(!p)) 2743 return -ENOMEM; 2744 2745 refcount_set(&p->refcnt, 1); 2746 rt->fib6_metrics = p; 2747 2748 return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics); 2749 } 2750 2751 static struct rt6_info *ip6_nh_lookup_table(struct net *net, 2752 struct fib6_config *cfg, 2753 const struct in6_addr *gw_addr, 2754 u32 tbid, int flags) 2755 { 2756 struct flowi6 fl6 = { 2757 .flowi6_oif = cfg->fc_ifindex, 2758 .daddr = *gw_addr, 2759 .saddr = cfg->fc_prefsrc, 2760 }; 2761 struct fib6_table *table; 2762 struct rt6_info *rt; 2763 2764 table = fib6_get_table(net, tbid); 2765 if (!table) 2766 return NULL; 2767 2768 if (!ipv6_addr_any(&cfg->fc_prefsrc)) 2769 flags |= RT6_LOOKUP_F_HAS_SADDR; 2770 2771 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE; 2772 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags); 2773 2774 /* if table lookup failed, fall back to full lookup */ 2775 if (rt == net->ipv6.ip6_null_entry) { 2776 ip6_rt_put(rt); 2777 rt = NULL; 2778 } 2779 2780 return rt; 2781 } 2782 2783 static int ip6_route_check_nh_onlink(struct net *net, 2784 struct fib6_config *cfg, 2785 const struct net_device *dev, 2786 struct netlink_ext_ack *extack) 2787 { 2788 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN; 2789 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2790 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT; 2791 struct rt6_info *grt; 2792 int err; 2793 2794 err = 0; 2795 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0); 2796 if (grt) { 2797 if (!grt->dst.error && 2798 (grt->rt6i_flags & flags || dev != grt->dst.dev)) { 2799 NL_SET_ERR_MSG(extack, 2800 "Nexthop has invalid gateway or device mismatch"); 2801 err = -EINVAL; 2802 } 2803 2804 ip6_rt_put(grt); 2805 } 2806 2807 return err; 2808 } 2809 2810 static int ip6_route_check_nh(struct net *net, 2811 struct fib6_config *cfg, 2812 struct net_device **_dev, 2813 struct inet6_dev **idev) 2814 { 2815 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2816 struct net_device *dev = _dev ? 
*_dev : NULL; 2817 struct rt6_info *grt = NULL; 2818 int err = -EHOSTUNREACH; 2819 2820 if (cfg->fc_table) { 2821 int flags = RT6_LOOKUP_F_IFACE; 2822 2823 grt = ip6_nh_lookup_table(net, cfg, gw_addr, 2824 cfg->fc_table, flags); 2825 if (grt) { 2826 if (grt->rt6i_flags & RTF_GATEWAY || 2827 (dev && dev != grt->dst.dev)) { 2828 ip6_rt_put(grt); 2829 grt = NULL; 2830 } 2831 } 2832 } 2833 2834 if (!grt) 2835 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1); 2836 2837 if (!grt) 2838 goto out; 2839 2840 if (dev) { 2841 if (dev != grt->dst.dev) { 2842 ip6_rt_put(grt); 2843 goto out; 2844 } 2845 } else { 2846 *_dev = dev = grt->dst.dev; 2847 *idev = grt->rt6i_idev; 2848 dev_hold(dev); 2849 in6_dev_hold(grt->rt6i_idev); 2850 } 2851 2852 if (!(grt->rt6i_flags & RTF_GATEWAY)) 2853 err = 0; 2854 2855 ip6_rt_put(grt); 2856 2857 out: 2858 return err; 2859 } 2860 2861 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg, 2862 struct net_device **_dev, struct inet6_dev **idev, 2863 struct netlink_ext_ack *extack) 2864 { 2865 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2866 int gwa_type = ipv6_addr_type(gw_addr); 2867 bool skip_dev = !(gwa_type & IPV6_ADDR_LINKLOCAL); 2868 const struct net_device *dev = *_dev; 2869 bool need_addr_check = !dev; 2870 int err = -EINVAL; 2871 2872 /* if gw_addr is local we will fail to detect this in case the 2873 * address is still TENTATIVE (DAD in progress). rt6_lookup() 2874 * will return the already-added prefix route via the interface 2875 * the prefix route was assigned to, which might be non-loopback. 2876 */ 2877 if (dev && 2878 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 2879 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 2880 goto out; 2881 } 2882 2883 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) { 2884 /* IPv6 strictly forbids using non-link-local 2885 * addresses as nexthop addresses. 2886 * Otherwise, the router would not be able to send redirects. 2887 * That is usually desirable, but in some (rare!) circumstances 2888 * (SIT, PtP, NBMA NOARP links) it is handy to allow 2889 * some exceptions. --ANK 2890 * We allow IPv4-mapped nexthops to support RFC 4798-style 2891 * addressing. 2892 */ 2893 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) { 2894 NL_SET_ERR_MSG(extack, "Invalid gateway address"); 2895 goto out; 2896 } 2897 2898 if (cfg->fc_flags & RTNH_F_ONLINK) 2899 err = ip6_route_check_nh_onlink(net, cfg, dev, extack); 2900 else 2901 err = ip6_route_check_nh(net, cfg, _dev, idev); 2902 2903 if (err) 2904 goto out; 2905 } 2906 2907 /* reload in case the device was changed */ 2908 dev = *_dev; 2909 2910 err = -EINVAL; 2911 if (!dev) { 2912 NL_SET_ERR_MSG(extack, "Egress device not specified"); 2913 goto out; 2914 } else if (dev->flags & IFF_LOOPBACK) { 2915 NL_SET_ERR_MSG(extack, 2916 "Egress device can not be loopback device for this route"); 2917 goto out; 2918 } 2919 2920 /* if we did not check gw_addr above, do so now that the 2921 * egress device has been resolved.
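* (need_addr_check is set when the caller did not pass a device, in
 * which case the first ipv6_chk_addr_and_flags() check above was
 * skipped).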
2922 */ 2923 if (need_addr_check && 2924 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 2925 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 2926 goto out; 2927 } 2928 2929 err = 0; 2930 out: 2931 return err; 2932 } 2933 2934 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, 2935 gfp_t gfp_flags, 2936 struct netlink_ext_ack *extack) 2937 { 2938 struct net *net = cfg->fc_nlinfo.nl_net; 2939 struct fib6_info *rt = NULL; 2940 struct net_device *dev = NULL; 2941 struct inet6_dev *idev = NULL; 2942 struct fib6_table *table; 2943 int addr_type; 2944 int err = -EINVAL; 2945 2946 /* RTF_PCPU is an internal flag; can not be set by userspace */ 2947 if (cfg->fc_flags & RTF_PCPU) { 2948 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU"); 2949 goto out; 2950 } 2951 2952 /* RTF_CACHE is an internal flag; can not be set by userspace */ 2953 if (cfg->fc_flags & RTF_CACHE) { 2954 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE"); 2955 goto out; 2956 } 2957 2958 if (cfg->fc_type > RTN_MAX) { 2959 NL_SET_ERR_MSG(extack, "Invalid route type"); 2960 goto out; 2961 } 2962 2963 if (cfg->fc_dst_len > 128) { 2964 NL_SET_ERR_MSG(extack, "Invalid prefix length"); 2965 goto out; 2966 } 2967 if (cfg->fc_src_len > 128) { 2968 NL_SET_ERR_MSG(extack, "Invalid source address length"); 2969 goto out; 2970 } 2971 #ifndef CONFIG_IPV6_SUBTREES 2972 if (cfg->fc_src_len) { 2973 NL_SET_ERR_MSG(extack, 2974 "Specifying source address requires IPV6_SUBTREES to be enabled"); 2975 goto out; 2976 } 2977 #endif 2978 if (cfg->fc_ifindex) { 2979 err = -ENODEV; 2980 dev = dev_get_by_index(net, cfg->fc_ifindex); 2981 if (!dev) 2982 goto out; 2983 idev = in6_dev_get(dev); 2984 if (!idev) 2985 goto out; 2986 } 2987 2988 if (cfg->fc_metric == 0) 2989 cfg->fc_metric = IP6_RT_PRIO_USER; 2990 2991 if (cfg->fc_flags & RTNH_F_ONLINK) { 2992 if (!dev) { 2993 NL_SET_ERR_MSG(extack, 2994 "Nexthop device required for onlink"); 2995 err = -ENODEV; 2996 goto out; 2997 } 2998 2999 if (!(dev->flags & IFF_UP)) { 3000 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 3001 err = -ENETDOWN; 3002 goto out; 3003 } 3004 } 3005 3006 err = -ENOBUFS; 3007 if (cfg->fc_nlinfo.nlh && 3008 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) { 3009 table = fib6_get_table(net, cfg->fc_table); 3010 if (!table) { 3011 pr_warn("NLM_F_CREATE should be specified when creating new route\n"); 3012 table = fib6_new_table(net, cfg->fc_table); 3013 } 3014 } else { 3015 table = fib6_new_table(net, cfg->fc_table); 3016 } 3017 3018 if (!table) 3019 goto out; 3020 3021 err = -ENOMEM; 3022 rt = fib6_info_alloc(gfp_flags); 3023 if (!rt) 3024 goto out; 3025 3026 if (cfg->fc_flags & RTF_ADDRCONF) 3027 rt->dst_nocount = true; 3028 3029 err = ip6_convert_metrics(net, rt, cfg); 3030 if (err < 0) 3031 goto out; 3032 3033 if (cfg->fc_flags & RTF_EXPIRES) 3034 fib6_set_expires(rt, jiffies + 3035 clock_t_to_jiffies(cfg->fc_expires)); 3036 else 3037 fib6_clean_expires(rt); 3038 3039 if (cfg->fc_protocol == RTPROT_UNSPEC) 3040 cfg->fc_protocol = RTPROT_BOOT; 3041 rt->fib6_protocol = cfg->fc_protocol; 3042 3043 addr_type = ipv6_addr_type(&cfg->fc_dst); 3044 3045 if (cfg->fc_encap) { 3046 struct lwtunnel_state *lwtstate; 3047 3048 err = lwtunnel_build_state(cfg->fc_encap_type, 3049 cfg->fc_encap, AF_INET6, cfg, 3050 &lwtstate, extack); 3051 if (err) 3052 goto out; 3053 rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate); 3054 } 3055 3056 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); 3057 rt->fib6_dst.plen = 
cfg->fc_dst_len; 3058 if (rt->fib6_dst.plen == 128) 3059 rt->dst_host = true; 3060 3061 #ifdef CONFIG_IPV6_SUBTREES 3062 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len); 3063 rt->fib6_src.plen = cfg->fc_src_len; 3064 #endif 3065 3066 rt->fib6_metric = cfg->fc_metric; 3067 rt->fib6_nh.nh_weight = 1; 3068 3069 rt->fib6_type = cfg->fc_type; 3070 3071 /* We cannot add true routes via loopback here, 3072 they would result in kernel looping; promote them to reject routes 3073 */ 3074 if ((cfg->fc_flags & RTF_REJECT) || 3075 (dev && (dev->flags & IFF_LOOPBACK) && 3076 !(addr_type & IPV6_ADDR_LOOPBACK) && 3077 !(cfg->fc_flags & RTF_LOCAL))) { 3078 /* hold loopback dev/idev if we haven't done so. */ 3079 if (dev != net->loopback_dev) { 3080 if (dev) { 3081 dev_put(dev); 3082 in6_dev_put(idev); 3083 } 3084 dev = net->loopback_dev; 3085 dev_hold(dev); 3086 idev = in6_dev_get(dev); 3087 if (!idev) { 3088 err = -ENODEV; 3089 goto out; 3090 } 3091 } 3092 rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP; 3093 goto install_route; 3094 } 3095 3096 if (cfg->fc_flags & RTF_GATEWAY) { 3097 err = ip6_validate_gw(net, cfg, &dev, &idev, extack); 3098 if (err) 3099 goto out; 3100 3101 rt->fib6_nh.nh_gw = cfg->fc_gateway; 3102 } 3103 3104 err = -ENODEV; 3105 if (!dev) 3106 goto out; 3107 3108 if (idev->cnf.disable_ipv6) { 3109 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device"); 3110 err = -EACCES; 3111 goto out; 3112 } 3113 3114 if (!(dev->flags & IFF_UP)) { 3115 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 3116 err = -ENETDOWN; 3117 goto out; 3118 } 3119 3120 if (!ipv6_addr_any(&cfg->fc_prefsrc)) { 3121 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { 3122 NL_SET_ERR_MSG(extack, "Invalid source address"); 3123 err = -EINVAL; 3124 goto out; 3125 } 3126 rt->fib6_prefsrc.addr = cfg->fc_prefsrc; 3127 rt->fib6_prefsrc.plen = 128; 3128 } else 3129 rt->fib6_prefsrc.plen = 0; 3130 3131 rt->fib6_flags = cfg->fc_flags; 3132 3133 install_route: 3134 if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) && 3135 !netif_carrier_ok(dev)) 3136 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN; 3137 rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK); 3138 rt->fib6_nh.nh_dev = dev; 3139 rt->fib6_table = table; 3140 3141 cfg->fc_nlinfo.nl_net = dev_net(dev); 3142 3143 if (idev) 3144 in6_dev_put(idev); 3145 3146 return rt; 3147 out: 3148 if (dev) 3149 dev_put(dev); 3150 if (idev) 3151 in6_dev_put(idev); 3152 3153 fib6_info_release(rt); 3154 return ERR_PTR(err); 3155 } 3156 3157 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, 3158 struct netlink_ext_ack *extack) 3159 { 3160 struct fib6_info *rt; 3161 int err; 3162 3163 rt = ip6_route_info_create(cfg, gfp_flags, extack); 3164 if (IS_ERR(rt)) 3165 return PTR_ERR(rt); 3166 3167 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack); 3168 fib6_info_release(rt); 3169 3170 return err; 3171 } 3172 3173 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info) 3174 { 3175 struct net *net = info->nl_net; 3176 struct fib6_table *table; 3177 int err; 3178 3179 if (rt == net->ipv6.fib6_null_entry) { 3180 err = -ENOENT; 3181 goto out; 3182 } 3183 3184 table = rt->fib6_table; 3185 spin_lock_bh(&table->tb6_lock); 3186 err = fib6_del(rt, info); 3187 spin_unlock_bh(&table->tb6_lock); 3188 3189 out: 3190 fib6_info_release(rt); 3191 return err; 3192 } 3193 3194 int ip6_del_rt(struct net *net, struct fib6_info *rt) 3195 { 3196 struct nl_info info = { .nl_net = net }; 3197 3198 return __ip6_del_rt(rt, &info); 3199 } 3200 3201 static int __ip6_del_rt_siblings(struct 
fib6_info *rt, struct fib6_config *cfg) 3202 { 3203 struct nl_info *info = &cfg->fc_nlinfo; 3204 struct net *net = info->nl_net; 3205 struct sk_buff *skb = NULL; 3206 struct fib6_table *table; 3207 int err = -ENOENT; 3208 3209 if (rt == net->ipv6.fib6_null_entry) 3210 goto out_put; 3211 table = rt->fib6_table; 3212 spin_lock_bh(&table->tb6_lock); 3213 3214 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) { 3215 struct fib6_info *sibling, *next_sibling; 3216 3217 /* prefer to send a single notification with all hops */ 3218 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 3219 if (skb) { 3220 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; 3221 3222 if (rt6_fill_node(net, skb, rt, NULL, 3223 NULL, NULL, 0, RTM_DELROUTE, 3224 info->portid, seq, 0) < 0) { 3225 kfree_skb(skb); 3226 skb = NULL; 3227 } else 3228 info->skip_notify = 1; 3229 } 3230 3231 list_for_each_entry_safe(sibling, next_sibling, 3232 &rt->fib6_siblings, 3233 fib6_siblings) { 3234 err = fib6_del(sibling, info); 3235 if (err) 3236 goto out_unlock; 3237 } 3238 } 3239 3240 err = fib6_del(rt, info); 3241 out_unlock: 3242 spin_unlock_bh(&table->tb6_lock); 3243 out_put: 3244 fib6_info_release(rt); 3245 3246 if (skb) { 3247 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 3248 info->nlh, gfp_any()); 3249 } 3250 return err; 3251 } 3252 3253 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) 3254 { 3255 int rc = -ESRCH; 3256 3257 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex) 3258 goto out; 3259 3260 if (cfg->fc_flags & RTF_GATEWAY && 3261 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) 3262 goto out; 3263 if (dst_hold_safe(&rt->dst)) 3264 rc = rt6_remove_exception_rt(rt); 3265 out: 3266 return rc; 3267 } 3268 3269 static int ip6_route_del(struct fib6_config *cfg, 3270 struct netlink_ext_ack *extack) 3271 { 3272 struct rt6_info *rt_cache; 3273 struct fib6_table *table; 3274 struct fib6_info *rt; 3275 struct fib6_node *fn; 3276 int err = -ESRCH; 3277 3278 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); 3279 if (!table) { 3280 NL_SET_ERR_MSG(extack, "FIB table does not exist"); 3281 return err; 3282 } 3283 3284 rcu_read_lock(); 3285 3286 fn = fib6_locate(&table->tb6_root, 3287 &cfg->fc_dst, cfg->fc_dst_len, 3288 &cfg->fc_src, cfg->fc_src_len, 3289 !(cfg->fc_flags & RTF_CACHE)); 3290 3291 if (fn) { 3292 for_each_fib6_node_rt_rcu(fn) { 3293 if (cfg->fc_flags & RTF_CACHE) { 3294 int rc; 3295 3296 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst, 3297 &cfg->fc_src); 3298 if (rt_cache) { 3299 rc = ip6_del_cached_rt(rt_cache, cfg); 3300 if (rc != -ESRCH) { 3301 rcu_read_unlock(); 3302 return rc; 3303 } 3304 } 3305 continue; 3306 } 3307 if (cfg->fc_ifindex && 3308 (!rt->fib6_nh.nh_dev || 3309 rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex)) 3310 continue; 3311 if (cfg->fc_flags & RTF_GATEWAY && 3312 !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw)) 3313 continue; 3314 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric) 3315 continue; 3316 if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol) 3317 continue; 3318 if (!fib6_info_hold_safe(rt)) 3319 continue; 3320 rcu_read_unlock(); 3321 3322 /* if gateway was specified only delete the one hop */ 3323 if (cfg->fc_flags & RTF_GATEWAY) 3324 return __ip6_del_rt(rt, &cfg->fc_nlinfo); 3325 3326 return __ip6_del_rt_siblings(rt, cfg); 3327 } 3328 } 3329 rcu_read_unlock(); 3330 3331 return err; 3332 } 3333 3334 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) 3335 { 3336 struct 
netevent_redirect netevent; 3337 struct rt6_info *rt, *nrt = NULL; 3338 struct ndisc_options ndopts; 3339 struct inet6_dev *in6_dev; 3340 struct neighbour *neigh; 3341 struct fib6_info *from; 3342 struct rd_msg *msg; 3343 int optlen, on_link; 3344 u8 *lladdr; 3345 3346 optlen = skb_tail_pointer(skb) - skb_transport_header(skb); 3347 optlen -= sizeof(*msg); 3348 3349 if (optlen < 0) { 3350 net_dbg_ratelimited("rt6_do_redirect: packet too short\n"); 3351 return; 3352 } 3353 3354 msg = (struct rd_msg *)icmp6_hdr(skb); 3355 3356 if (ipv6_addr_is_multicast(&msg->dest)) { 3357 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n"); 3358 return; 3359 } 3360 3361 on_link = 0; 3362 if (ipv6_addr_equal(&msg->dest, &msg->target)) { 3363 on_link = 1; 3364 } else if (ipv6_addr_type(&msg->target) != 3365 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { 3366 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n"); 3367 return; 3368 } 3369 3370 in6_dev = __in6_dev_get(skb->dev); 3371 if (!in6_dev) 3372 return; 3373 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects) 3374 return; 3375 3376 /* RFC2461 8.1: 3377 * The IP source address of the Redirect MUST be the same as the current 3378 * first-hop router for the specified ICMP Destination Address. 3379 */ 3380 3381 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) { 3382 net_dbg_ratelimited("rt6_redirect: invalid ND options\n"); 3383 return; 3384 } 3385 3386 lladdr = NULL; 3387 if (ndopts.nd_opts_tgt_lladdr) { 3388 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, 3389 skb->dev); 3390 if (!lladdr) { 3391 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n"); 3392 return; 3393 } 3394 } 3395 3396 rt = (struct rt6_info *) dst; 3397 if (rt->rt6i_flags & RTF_REJECT) { 3398 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n"); 3399 return; 3400 } 3401 3402 /* Redirect received -> path was valid. 3403 * Look, redirects are sent only in response to data packets, 3404 * so that this nexthop apparently is reachable. --ANK 3405 */ 3406 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr); 3407 3408 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1); 3409 if (!neigh) 3410 return; 3411 3412 /* 3413 * We have finally decided to accept it. 3414 */ 3415 3416 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, 3417 NEIGH_UPDATE_F_WEAK_OVERRIDE| 3418 NEIGH_UPDATE_F_OVERRIDE| 3419 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER| 3420 NEIGH_UPDATE_F_ISROUTER)), 3421 NDISC_REDIRECT, &ndopts); 3422 3423 rcu_read_lock(); 3424 from = rcu_dereference(rt->from); 3425 /* This fib6_info_hold() is safe here because we hold reference to rt 3426 * and rt already holds reference to fib6_info. 
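* Taking the hold inside the RCU section pins 'from' across the
 * ip6_rt_cache_alloc()/rt6_insert_exception() calls below; it is
 * dropped via fib6_info_release() at the out label.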
3427 */ 3428 fib6_info_hold(from); 3429 rcu_read_unlock(); 3430 3431 nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL); 3432 if (!nrt) 3433 goto out; 3434 3435 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE; 3436 if (on_link) 3437 nrt->rt6i_flags &= ~RTF_GATEWAY; 3438 3439 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; 3440 3441 /* No need to remove rt from the exception table if rt is 3442 * a cached route because rt6_insert_exception() will 3443 * take care of it 3444 */ 3445 if (rt6_insert_exception(nrt, from)) { 3446 dst_release_immediate(&nrt->dst); 3447 goto out; 3448 } 3449 3450 netevent.old = &rt->dst; 3451 netevent.new = &nrt->dst; 3452 netevent.daddr = &msg->dest; 3453 netevent.neigh = neigh; 3454 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); 3455 3456 out: 3457 fib6_info_release(from); 3458 neigh_release(neigh); 3459 } 3460 3461 #ifdef CONFIG_IPV6_ROUTE_INFO 3462 static struct fib6_info *rt6_get_route_info(struct net *net, 3463 const struct in6_addr *prefix, int prefixlen, 3464 const struct in6_addr *gwaddr, 3465 struct net_device *dev) 3466 { 3467 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; 3468 int ifindex = dev->ifindex; 3469 struct fib6_node *fn; 3470 struct fib6_info *rt = NULL; 3471 struct fib6_table *table; 3472 3473 table = fib6_get_table(net, tb_id); 3474 if (!table) 3475 return NULL; 3476 3477 rcu_read_lock(); 3478 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true); 3479 if (!fn) 3480 goto out; 3481 3482 for_each_fib6_node_rt_rcu(fn) { 3483 if (rt->fib6_nh.nh_dev->ifindex != ifindex) 3484 continue; 3485 if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY)) 3486 continue; 3487 if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr)) 3488 continue; 3489 if (!fib6_info_hold_safe(rt)) 3490 continue; 3491 break; 3492 } 3493 out: 3494 rcu_read_unlock(); 3495 return rt; 3496 } 3497 3498 static struct fib6_info *rt6_add_route_info(struct net *net, 3499 const struct in6_addr *prefix, int prefixlen, 3500 const struct in6_addr *gwaddr, 3501 struct net_device *dev, 3502 unsigned int pref) 3503 { 3504 struct fib6_config cfg = { 3505 .fc_metric = IP6_RT_PRIO_USER, 3506 .fc_ifindex = dev->ifindex, 3507 .fc_dst_len = prefixlen, 3508 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | 3509 RTF_UP | RTF_PREF(pref), 3510 .fc_protocol = RTPROT_RA, 3511 .fc_type = RTN_UNICAST, 3512 .fc_nlinfo.portid = 0, 3513 .fc_nlinfo.nlh = NULL, 3514 .fc_nlinfo.nl_net = net, 3515 }; 3516 3517 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; 3518 cfg.fc_dst = *prefix; 3519 cfg.fc_gateway = *gwaddr; 3520 3521 /* We should treat it as a default route if the prefix length is 0. */ 3522 if (!prefixlen) 3523 cfg.fc_flags |= RTF_DEFAULT; 3524 3525 ip6_route_add(&cfg, GFP_ATOMIC, NULL); 3526 3527 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev); 3528 } 3529 #endif 3530 3531 struct fib6_info *rt6_get_dflt_router(struct net *net, 3532 const struct in6_addr *addr, 3533 struct net_device *dev) 3534 { 3535 u32 tb_id = l3mdev_fib_table(dev) ?
: RT6_TABLE_DFLT; 3536 struct fib6_info *rt; 3537 struct fib6_table *table; 3538 3539 table = fib6_get_table(net, tb_id); 3540 if (!table) 3541 return NULL; 3542 3543 rcu_read_lock(); 3544 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3545 if (dev == rt->fib6_nh.nh_dev && 3546 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && 3547 ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr)) 3548 break; 3549 } 3550 if (rt && !fib6_info_hold_safe(rt)) 3551 rt = NULL; 3552 rcu_read_unlock(); 3553 return rt; 3554 } 3555 3556 struct fib6_info *rt6_add_dflt_router(struct net *net, 3557 const struct in6_addr *gwaddr, 3558 struct net_device *dev, 3559 unsigned int pref) 3560 { 3561 struct fib6_config cfg = { 3562 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT, 3563 .fc_metric = IP6_RT_PRIO_USER, 3564 .fc_ifindex = dev->ifindex, 3565 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | 3566 RTF_UP | RTF_EXPIRES | RTF_PREF(pref), 3567 .fc_protocol = RTPROT_RA, 3568 .fc_type = RTN_UNICAST, 3569 .fc_nlinfo.portid = 0, 3570 .fc_nlinfo.nlh = NULL, 3571 .fc_nlinfo.nl_net = net, 3572 }; 3573 3574 cfg.fc_gateway = *gwaddr; 3575 3576 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) { 3577 struct fib6_table *table; 3578 3579 table = fib6_get_table(dev_net(dev), cfg.fc_table); 3580 if (table) 3581 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER; 3582 } 3583 3584 return rt6_get_dflt_router(net, gwaddr, dev); 3585 } 3586 3587 static void __rt6_purge_dflt_routers(struct net *net, 3588 struct fib6_table *table) 3589 { 3590 struct fib6_info *rt; 3591 3592 restart: 3593 rcu_read_lock(); 3594 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3595 struct net_device *dev = fib6_info_nh_dev(rt); 3596 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL; 3597 3598 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) && 3599 (!idev || idev->cnf.accept_ra != 2) && 3600 fib6_info_hold_safe(rt)) { 3601 rcu_read_unlock(); 3602 ip6_del_rt(net, rt); 3603 goto restart; 3604 } 3605 } 3606 rcu_read_unlock(); 3607 3608 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER; 3609 } 3610 3611 void rt6_purge_dflt_routers(struct net *net) 3612 { 3613 struct fib6_table *table; 3614 struct hlist_head *head; 3615 unsigned int h; 3616 3617 rcu_read_lock(); 3618 3619 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { 3620 head = &net->ipv6.fib_table_hash[h]; 3621 hlist_for_each_entry_rcu(table, head, tb6_hlist) { 3622 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER) 3623 __rt6_purge_dflt_routers(net, table); 3624 } 3625 } 3626 3627 rcu_read_unlock(); 3628 } 3629 3630 static void rtmsg_to_fib6_config(struct net *net, 3631 struct in6_rtmsg *rtmsg, 3632 struct fib6_config *cfg) 3633 { 3634 memset(cfg, 0, sizeof(*cfg)); 3635 3636 cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? 
3637 : RT6_TABLE_MAIN; 3638 cfg->fc_ifindex = rtmsg->rtmsg_ifindex; 3639 cfg->fc_metric = rtmsg->rtmsg_metric; 3640 cfg->fc_expires = rtmsg->rtmsg_info; 3641 cfg->fc_dst_len = rtmsg->rtmsg_dst_len; 3642 cfg->fc_src_len = rtmsg->rtmsg_src_len; 3643 cfg->fc_flags = rtmsg->rtmsg_flags; 3644 cfg->fc_type = rtmsg->rtmsg_type; 3645 3646 cfg->fc_nlinfo.nl_net = net; 3647 3648 cfg->fc_dst = rtmsg->rtmsg_dst; 3649 cfg->fc_src = rtmsg->rtmsg_src; 3650 cfg->fc_gateway = rtmsg->rtmsg_gateway; 3651 } 3652 3653 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) 3654 { 3655 struct fib6_config cfg; 3656 struct in6_rtmsg rtmsg; 3657 int err; 3658 3659 switch (cmd) { 3660 case SIOCADDRT: /* Add a route */ 3661 case SIOCDELRT: /* Delete a route */ 3662 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 3663 return -EPERM; 3664 err = copy_from_user(&rtmsg, arg, 3665 sizeof(struct in6_rtmsg)); 3666 if (err) 3667 return -EFAULT; 3668 3669 rtmsg_to_fib6_config(net, &rtmsg, &cfg); 3670 3671 rtnl_lock(); 3672 switch (cmd) { 3673 case SIOCADDRT: 3674 err = ip6_route_add(&cfg, GFP_KERNEL, NULL); 3675 break; 3676 case SIOCDELRT: 3677 err = ip6_route_del(&cfg, NULL); 3678 break; 3679 default: 3680 err = -EINVAL; 3681 } 3682 rtnl_unlock(); 3683 3684 return err; 3685 } 3686 3687 return -EINVAL; 3688 } 3689 3690 /* 3691 * Drop the packet on the floor 3692 */ 3693 3694 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) 3695 { 3696 int type; 3697 struct dst_entry *dst = skb_dst(skb); 3698 switch (ipstats_mib_noroutes) { 3699 case IPSTATS_MIB_INNOROUTES: 3700 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); 3701 if (type == IPV6_ADDR_ANY) { 3702 IP6_INC_STATS(dev_net(dst->dev), 3703 __in6_dev_get_safely(skb->dev), 3704 IPSTATS_MIB_INADDRERRORS); 3705 break; 3706 } 3707 /* FALLTHROUGH */ 3708 case IPSTATS_MIB_OUTNOROUTES: 3709 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), 3710 ipstats_mib_noroutes); 3711 break; 3712 } 3713 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0); 3714 kfree_skb(skb); 3715 return 0; 3716 } 3717 3718 static int ip6_pkt_discard(struct sk_buff *skb) 3719 { 3720 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES); 3721 } 3722 3723 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3724 { 3725 skb->dev = skb_dst(skb)->dev; 3726 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); 3727 } 3728 3729 static int ip6_pkt_prohibit(struct sk_buff *skb) 3730 { 3731 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES); 3732 } 3733 3734 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3735 { 3736 skb->dev = skb_dst(skb)->dev; 3737 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); 3738 } 3739 3740 /* 3741 * Allocate a dst for local (unicast / anycast) address. 
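* Used by addrconf for host and anycast routes; the entry goes into
 * the local table (RT6_TABLE_LOCAL, or the l3mdev table when the
 * device is enslaved to one).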
3742 */ 3743 3744 struct fib6_info *addrconf_f6i_alloc(struct net *net, 3745 struct inet6_dev *idev, 3746 const struct in6_addr *addr, 3747 bool anycast, gfp_t gfp_flags) 3748 { 3749 u32 tb_id; 3750 struct net_device *dev = idev->dev; 3751 struct fib6_info *f6i; 3752 3753 f6i = fib6_info_alloc(gfp_flags); 3754 if (!f6i) 3755 return ERR_PTR(-ENOMEM); 3756 3757 f6i->dst_nocount = true; 3758 f6i->dst_host = true; 3759 f6i->fib6_protocol = RTPROT_KERNEL; 3760 f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP; 3761 if (anycast) { 3762 f6i->fib6_type = RTN_ANYCAST; 3763 f6i->fib6_flags |= RTF_ANYCAST; 3764 } else { 3765 f6i->fib6_type = RTN_LOCAL; 3766 f6i->fib6_flags |= RTF_LOCAL; 3767 } 3768 3769 f6i->fib6_nh.nh_gw = *addr; 3770 dev_hold(dev); 3771 f6i->fib6_nh.nh_dev = dev; 3772 f6i->fib6_dst.addr = *addr; 3773 f6i->fib6_dst.plen = 128; 3774 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL; 3775 f6i->fib6_table = fib6_get_table(net, tb_id); 3776 3777 return f6i; 3778 } 3779 3780 /* remove a deleted IP from prefsrc entries */ 3781 struct arg_dev_net_ip { 3782 struct net_device *dev; 3783 struct net *net; 3784 struct in6_addr *addr; 3785 }; 3786 3787 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg) 3788 { 3789 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev; 3790 struct net *net = ((struct arg_dev_net_ip *)arg)->net; 3791 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; 3792 3793 if (((void *)rt->fib6_nh.nh_dev == dev || !dev) && 3794 rt != net->ipv6.fib6_null_entry && 3795 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) { 3796 spin_lock_bh(&rt6_exception_lock); 3797 /* remove prefsrc entry */ 3798 rt->fib6_prefsrc.plen = 0; 3799 /* need to update cache as well */ 3800 rt6_exceptions_remove_prefsrc(rt); 3801 spin_unlock_bh(&rt6_exception_lock); 3802 } 3803 return 0; 3804 } 3805 3806 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) 3807 { 3808 struct net *net = dev_net(ifp->idev->dev); 3809 struct arg_dev_net_ip adni = { 3810 .dev = ifp->idev->dev, 3811 .net = net, 3812 .addr = &ifp->addr, 3813 }; 3814 fib6_clean_all(net, fib6_remove_prefsrc, &adni); 3815 } 3816 3817 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY) 3818 3819 /* Remove routers and update dst entries when a gateway turns into a host. */ 3820 static int fib6_clean_tohost(struct fib6_info *rt, void *arg) 3821 { 3822 struct in6_addr *gateway = (struct in6_addr *)arg; 3823 3824 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && 3825 ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) { 3826 return -1; 3827 } 3828 3829 /* Further clean up cached routes in the exception table. 3830 * This is needed because a cached route may have a different 3831 * gateway than its 'parent' in the case of an ip redirect.
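* (Returning -1 above makes fib6_clean_all() delete the matching
 * route itself, mirroring how fib6_ifdown() flushes routes below.)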
3832 */ 3833 rt6_exceptions_clean_tohost(rt, gateway); 3834 3835 return 0; 3836 } 3837 3838 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) 3839 { 3840 fib6_clean_all(net, fib6_clean_tohost, gateway); 3841 } 3842 3843 struct arg_netdev_event { 3844 const struct net_device *dev; 3845 union { 3846 unsigned int nh_flags; 3847 unsigned long event; 3848 }; 3849 }; 3850 3851 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) 3852 { 3853 struct fib6_info *iter; 3854 struct fib6_node *fn; 3855 3856 fn = rcu_dereference_protected(rt->fib6_node, 3857 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3858 iter = rcu_dereference_protected(fn->leaf, 3859 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3860 while (iter) { 3861 if (iter->fib6_metric == rt->fib6_metric && 3862 rt6_qualify_for_ecmp(iter)) 3863 return iter; 3864 iter = rcu_dereference_protected(iter->fib6_next, 3865 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3866 } 3867 3868 return NULL; 3869 } 3870 3871 static bool rt6_is_dead(const struct fib6_info *rt) 3872 { 3873 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD || 3874 (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN && 3875 fib6_ignore_linkdown(rt))) 3876 return true; 3877 3878 return false; 3879 } 3880 3881 static int rt6_multipath_total_weight(const struct fib6_info *rt) 3882 { 3883 struct fib6_info *iter; 3884 int total = 0; 3885 3886 if (!rt6_is_dead(rt)) 3887 total += rt->fib6_nh.nh_weight; 3888 3889 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { 3890 if (!rt6_is_dead(iter)) 3891 total += iter->fib6_nh.nh_weight; 3892 } 3893 3894 return total; 3895 } 3896 3897 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total) 3898 { 3899 int upper_bound = -1; 3900 3901 if (!rt6_is_dead(rt)) { 3902 *weight += rt->fib6_nh.nh_weight; 3903 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, 3904 total) - 1; 3905 } 3906 atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound); 3907 } 3908 3909 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total) 3910 { 3911 struct fib6_info *iter; 3912 int weight = 0; 3913 3914 rt6_upper_bound_set(rt, &weight, total); 3915 3916 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3917 rt6_upper_bound_set(iter, &weight, total); 3918 } 3919 3920 void rt6_multipath_rebalance(struct fib6_info *rt) 3921 { 3922 struct fib6_info *first; 3923 int total; 3924 3925 /* In case the entire multipath route was marked for flushing, 3926 * then there is no need to rebalance upon the removal of every 3927 * sibling route. 3928 */ 3929 if (!rt->fib6_nsiblings || rt->should_flush) 3930 return; 3931 3932 /* During lookup routes are evaluated in order, so we need to 3933 * make sure upper bounds are assigned from the first sibling 3934 * onwards. 
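* For example, with sibling weights 2 and 1 the cumulative bounds
 * come out as roughly (2/3) * 2^31 and 2^31 - 1, so hash values
 * pick the two nexthops in a 2:1 ratio.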
3935 */ 3936 first = rt6_multipath_first_sibling(rt); 3937 if (WARN_ON_ONCE(!first)) 3938 return; 3939 3940 total = rt6_multipath_total_weight(first); 3941 rt6_multipath_upper_bound_set(first, total); 3942 } 3943 3944 static int fib6_ifup(struct fib6_info *rt, void *p_arg) 3945 { 3946 const struct arg_netdev_event *arg = p_arg; 3947 struct net *net = dev_net(arg->dev); 3948 3949 if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) { 3950 rt->fib6_nh.nh_flags &= ~arg->nh_flags; 3951 fib6_update_sernum_upto_root(net, rt); 3952 rt6_multipath_rebalance(rt); 3953 } 3954 3955 return 0; 3956 } 3957 3958 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags) 3959 { 3960 struct arg_netdev_event arg = { 3961 .dev = dev, 3962 { 3963 .nh_flags = nh_flags, 3964 }, 3965 }; 3966 3967 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev)) 3968 arg.nh_flags |= RTNH_F_LINKDOWN; 3969 3970 fib6_clean_all(dev_net(dev), fib6_ifup, &arg); 3971 } 3972 3973 static bool rt6_multipath_uses_dev(const struct fib6_info *rt, 3974 const struct net_device *dev) 3975 { 3976 struct fib6_info *iter; 3977 3978 if (rt->fib6_nh.nh_dev == dev) 3979 return true; 3980 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3981 if (iter->fib6_nh.nh_dev == dev) 3982 return true; 3983 3984 return false; 3985 } 3986 3987 static void rt6_multipath_flush(struct fib6_info *rt) 3988 { 3989 struct fib6_info *iter; 3990 3991 rt->should_flush = 1; 3992 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3993 iter->should_flush = 1; 3994 } 3995 3996 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt, 3997 const struct net_device *down_dev) 3998 { 3999 struct fib6_info *iter; 4000 unsigned int dead = 0; 4001 4002 if (rt->fib6_nh.nh_dev == down_dev || 4003 rt->fib6_nh.nh_flags & RTNH_F_DEAD) 4004 dead++; 4005 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4006 if (iter->fib6_nh.nh_dev == down_dev || 4007 iter->fib6_nh.nh_flags & RTNH_F_DEAD) 4008 dead++; 4009 4010 return dead; 4011 } 4012 4013 static void rt6_multipath_nh_flags_set(struct fib6_info *rt, 4014 const struct net_device *dev, 4015 unsigned int nh_flags) 4016 { 4017 struct fib6_info *iter; 4018 4019 if (rt->fib6_nh.nh_dev == dev) 4020 rt->fib6_nh.nh_flags |= nh_flags; 4021 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4022 if (iter->fib6_nh.nh_dev == dev) 4023 iter->fib6_nh.nh_flags |= nh_flags; 4024 } 4025 4026 /* called with write lock held for table with rt */ 4027 static int fib6_ifdown(struct fib6_info *rt, void *p_arg) 4028 { 4029 const struct arg_netdev_event *arg = p_arg; 4030 const struct net_device *dev = arg->dev; 4031 struct net *net = dev_net(dev); 4032 4033 if (rt == net->ipv6.fib6_null_entry) 4034 return 0; 4035 4036 switch (arg->event) { 4037 case NETDEV_UNREGISTER: 4038 return rt->fib6_nh.nh_dev == dev ? -1 : 0; 4039 case NETDEV_DOWN: 4040 if (rt->should_flush) 4041 return -1; 4042 if (!rt->fib6_nsiblings) 4043 return rt->fib6_nh.nh_dev == dev ? 
-1 : 0; 4044 if (rt6_multipath_uses_dev(rt, dev)) { 4045 unsigned int count; 4046 4047 count = rt6_multipath_dead_count(rt, dev); 4048 if (rt->fib6_nsiblings + 1 == count) { 4049 rt6_multipath_flush(rt); 4050 return -1; 4051 } 4052 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD | 4053 RTNH_F_LINKDOWN); 4054 fib6_update_sernum(net, rt); 4055 rt6_multipath_rebalance(rt); 4056 } 4057 return -2; 4058 case NETDEV_CHANGE: 4059 if (rt->fib6_nh.nh_dev != dev || 4060 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) 4061 break; 4062 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN; 4063 rt6_multipath_rebalance(rt); 4064 break; 4065 } 4066 4067 return 0; 4068 } 4069 4070 void rt6_sync_down_dev(struct net_device *dev, unsigned long event) 4071 { 4072 struct arg_netdev_event arg = { 4073 .dev = dev, 4074 { 4075 .event = event, 4076 }, 4077 }; 4078 4079 fib6_clean_all(dev_net(dev), fib6_ifdown, &arg); 4080 } 4081 4082 void rt6_disable_ip(struct net_device *dev, unsigned long event) 4083 { 4084 rt6_sync_down_dev(dev, event); 4085 rt6_uncached_list_flush_dev(dev_net(dev), dev); 4086 neigh_ifdown(&nd_tbl, dev); 4087 } 4088 4089 struct rt6_mtu_change_arg { 4090 struct net_device *dev; 4091 unsigned int mtu; 4092 }; 4093 4094 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg) { 4096 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; 4097 struct inet6_dev *idev; 4098 4099 /* In IPv6, PMTU discovery is not optional, 4100 so the RTAX_MTU lock cannot disable it. 4101 We still use this lock to block changes 4102 caused by addrconf/ndisc. 4103 */ 4104 4105 idev = __in6_dev_get(arg->dev); 4106 if (!idev) 4107 return 0; 4108 4109 /* For an administrative MTU increase, there is no way to discover 4110 the IPv6 PMTU increase, so the PMTU increase must be applied here. 4111 Since RFC 1981 doesn't cover administrative MTU increases, 4112 updating the PMTU on increase is a MUST (e.g. for a
jumbo frame) 4113 */ 4114 if (rt->fib6_nh.nh_dev == arg->dev && 4115 !fib6_metric_locked(rt, RTAX_MTU)) { 4116 u32 mtu = rt->fib6_pmtu; 4117 4118 if (mtu >= arg->mtu || 4119 (mtu < arg->mtu && mtu == idev->cnf.mtu6)) 4120 fib6_metric_set(rt, RTAX_MTU, arg->mtu); 4121 4122 spin_lock_bh(&rt6_exception_lock); 4123 rt6_exceptions_update_pmtu(idev, rt, arg->mtu); 4124 spin_unlock_bh(&rt6_exception_lock); 4125 } 4126 return 0; 4127 } 4128 4129 void rt6_mtu_change(struct net_device *dev, unsigned int mtu) 4130 { 4131 struct rt6_mtu_change_arg arg = { 4132 .dev = dev, 4133 .mtu = mtu, 4134 }; 4135 4136 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg); 4137 } 4138 4139 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { 4140 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) }, 4141 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) }, 4142 [RTA_OIF] = { .type = NLA_U32 }, 4143 [RTA_IIF] = { .type = NLA_U32 }, 4144 [RTA_PRIORITY] = { .type = NLA_U32 }, 4145 [RTA_METRICS] = { .type = NLA_NESTED }, 4146 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, 4147 [RTA_PREF] = { .type = NLA_U8 }, 4148 [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, 4149 [RTA_ENCAP] = { .type = NLA_NESTED }, 4150 [RTA_EXPIRES] = { .type = NLA_U32 }, 4151 [RTA_UID] = { .type = NLA_U32 }, 4152 [RTA_MARK] = { .type = NLA_U32 }, 4153 [RTA_TABLE] = { .type = NLA_U32 }, 4154 [RTA_IP_PROTO] = { .type = NLA_U8 }, 4155 [RTA_SPORT] = { .type = NLA_U16 }, 4156 [RTA_DPORT] = { .type = NLA_U16 }, 4157 }; 4158 4159 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, 4160 struct fib6_config *cfg, 4161 struct netlink_ext_ack *extack) 4162 { 4163 struct rtmsg *rtm; 4164 struct nlattr *tb[RTA_MAX+1]; 4165 unsigned int pref; 4166 int err; 4167 4168 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, 4169 NULL); 4170 if (err < 0) 4171 goto errout; 4172 4173 err = -EINVAL; 4174 rtm = nlmsg_data(nlh); 4175 memset(cfg, 0, sizeof(*cfg)); 4176 4177 cfg->fc_table = rtm->rtm_table; 4178 cfg->fc_dst_len = rtm->rtm_dst_len; 4179 cfg->fc_src_len = rtm->rtm_src_len; 4180 cfg->fc_flags = RTF_UP; 4181 cfg->fc_protocol = rtm->rtm_protocol; 4182 cfg->fc_type = rtm->rtm_type; 4183 4184 if (rtm->rtm_type == RTN_UNREACHABLE || 4185 rtm->rtm_type == RTN_BLACKHOLE || 4186 rtm->rtm_type == RTN_PROHIBIT || 4187 rtm->rtm_type == RTN_THROW) 4188 cfg->fc_flags |= RTF_REJECT; 4189 4190 if (rtm->rtm_type == RTN_LOCAL) 4191 cfg->fc_flags |= RTF_LOCAL; 4192 4193 if (rtm->rtm_flags & RTM_F_CLONED) 4194 cfg->fc_flags |= RTF_CACHE; 4195 4196 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK); 4197 4198 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid; 4199 cfg->fc_nlinfo.nlh = nlh; 4200 cfg->fc_nlinfo.nl_net = sock_net(skb->sk); 4201 4202 if (tb[RTA_GATEWAY]) { 4203 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); 4204 cfg->fc_flags |= RTF_GATEWAY; 4205 } 4206 4207 if (tb[RTA_DST]) { 4208 int plen = (rtm->rtm_dst_len + 7) >> 3; 4209 4210 if (nla_len(tb[RTA_DST]) < plen) 4211 goto errout; 4212 4213 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen); 4214 } 4215 4216 if (tb[RTA_SRC]) { 4217 int plen = (rtm->rtm_src_len + 7) >> 3; 4218 4219 if (nla_len(tb[RTA_SRC]) < plen) 4220 goto errout; 4221 4222 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen); 4223 } 4224 4225 if (tb[RTA_PREFSRC]) 4226 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]); 4227 4228 if (tb[RTA_OIF]) 4229 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]); 4230 4231 if (tb[RTA_PRIORITY]) 4232 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]); 4233 4234 if (tb[RTA_METRICS]) { 4235 
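/* keep a pointer to the raw nested attribute; it is converted
 * later by ip6_convert_metrics() */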
cfg->fc_mx = nla_data(tb[RTA_METRICS]); 4236 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]); 4237 } 4238 4239 if (tb[RTA_TABLE]) 4240 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]); 4241 4242 if (tb[RTA_MULTIPATH]) { 4243 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]); 4244 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); 4245 4246 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp, 4247 cfg->fc_mp_len, extack); 4248 if (err < 0) 4249 goto errout; 4250 } 4251 4252 if (tb[RTA_PREF]) { 4253 pref = nla_get_u8(tb[RTA_PREF]); 4254 if (pref != ICMPV6_ROUTER_PREF_LOW && 4255 pref != ICMPV6_ROUTER_PREF_HIGH) 4256 pref = ICMPV6_ROUTER_PREF_MEDIUM; 4257 cfg->fc_flags |= RTF_PREF(pref); 4258 } 4259 4260 if (tb[RTA_ENCAP]) 4261 cfg->fc_encap = tb[RTA_ENCAP]; 4262 4263 if (tb[RTA_ENCAP_TYPE]) { 4264 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); 4265 4266 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack); 4267 if (err < 0) 4268 goto errout; 4269 } 4270 4271 if (tb[RTA_EXPIRES]) { 4272 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ); 4273 4274 if (addrconf_finite_timeout(timeout)) { 4275 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ); 4276 cfg->fc_flags |= RTF_EXPIRES; 4277 } 4278 } 4279 4280 err = 0; 4281 errout: 4282 return err; 4283 } 4284 4285 struct rt6_nh { 4286 struct fib6_info *fib6_info; 4287 struct fib6_config r_cfg; 4288 struct list_head next; 4289 }; 4290 4291 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list) 4292 { 4293 struct rt6_nh *nh; 4294 4295 list_for_each_entry(nh, rt6_nh_list, next) { 4296 pr_warn("multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n", 4297 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway, 4298 nh->r_cfg.fc_ifindex); 4299 } 4300 } 4301 4302 static int ip6_route_info_append(struct net *net, 4303 struct list_head *rt6_nh_list, 4304 struct fib6_info *rt, 4305 struct fib6_config *r_cfg) 4306 { 4307 struct rt6_nh *nh; 4308 int err = -EEXIST; 4309 4310 list_for_each_entry(nh, rt6_nh_list, next) { 4311 /* check if fib6_info already exists */ 4312 if (rt6_duplicate_nexthop(nh->fib6_info, rt)) 4313 return err; 4314 } 4315 4316 nh = kzalloc(sizeof(*nh), GFP_KERNEL); 4317 if (!nh) 4318 return -ENOMEM; 4319 nh->fib6_info = rt; 4320 err = ip6_convert_metrics(net, rt, r_cfg); 4321 if (err) { 4322 kfree(nh); 4323 return err; 4324 } 4325 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); 4326 list_add_tail(&nh->next, rt6_nh_list); 4327 4328 return 0; 4329 } 4330 4331 static void ip6_route_mpath_notify(struct fib6_info *rt, 4332 struct fib6_info *rt_last, 4333 struct nl_info *info, 4334 __u16 nlflags) 4335 { 4336 /* if this is an APPEND route, then rt points to the first route 4337 * inserted and rt_last points to the last route inserted. Userspace 4338 * wants a consistent dump of the route which starts at the first 4339 * nexthop.
Since sibling routes are always added at the end of 4340 * the list, find the first sibling of the last route appended 4341 */ 4342 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) { 4343 rt = list_first_entry(&rt_last->fib6_siblings, 4344 struct fib6_info, 4345 fib6_siblings); 4346 } 4347 4348 if (rt) 4349 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); 4350 } 4351 4352 static int ip6_route_multipath_add(struct fib6_config *cfg, 4353 struct netlink_ext_ack *extack) 4354 { 4355 struct fib6_info *rt_notif = NULL, *rt_last = NULL; 4356 struct nl_info *info = &cfg->fc_nlinfo; 4357 struct fib6_config r_cfg; 4358 struct rtnexthop *rtnh; 4359 struct fib6_info *rt; 4360 struct rt6_nh *err_nh; 4361 struct rt6_nh *nh, *nh_safe; 4362 __u16 nlflags; 4363 int remaining; 4364 int attrlen; 4365 int err = 1; 4366 int nhn = 0; 4367 int replace = (cfg->fc_nlinfo.nlh && 4368 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); 4369 LIST_HEAD(rt6_nh_list); 4370 4371 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE; 4372 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND) 4373 nlflags |= NLM_F_APPEND; 4374 4375 remaining = cfg->fc_mp_len; 4376 rtnh = (struct rtnexthop *)cfg->fc_mp; 4377 4378 /* Parse a Multipath Entry and build a list (rt6_nh_list) of 4379 * fib6_info structs per nexthop 4380 */ 4381 while (rtnh_ok(rtnh, remaining)) { 4382 memcpy(&r_cfg, cfg, sizeof(*cfg)); 4383 if (rtnh->rtnh_ifindex) 4384 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 4385 4386 attrlen = rtnh_attrlen(rtnh); 4387 if (attrlen > 0) { 4388 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 4389 4390 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 4391 if (nla) { 4392 r_cfg.fc_gateway = nla_get_in6_addr(nla); 4393 r_cfg.fc_flags |= RTF_GATEWAY; 4394 } 4395 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); 4396 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); 4397 if (nla) 4398 r_cfg.fc_encap_type = nla_get_u16(nla); 4399 } 4400 4401 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK); 4402 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack); 4403 if (IS_ERR(rt)) { 4404 err = PTR_ERR(rt); 4405 rt = NULL; 4406 goto cleanup; 4407 } 4408 if (!rt6_qualify_for_ecmp(rt)) { 4409 err = -EINVAL; 4410 NL_SET_ERR_MSG(extack, 4411 "Device-only routes cannot be added for IPv6 using the multipath API."); 4412 fib6_info_release(rt); 4413 goto cleanup; 4414 } 4415 4416 rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1; 4417 4418 err = ip6_route_info_append(info->nl_net, &rt6_nh_list, 4419 rt, &r_cfg); 4420 if (err) { 4421 fib6_info_release(rt); 4422 goto cleanup; 4423 } 4424 4425 rtnh = rtnh_next(rtnh, &remaining); 4426 } 4427 4428 /* for add and replace, send one notification with all nexthops.
4429 * Skip the notification in fib6_add_rt2node and send one with 4430 * the full route when done 4431 */ 4432 info->skip_notify = 1; 4433 4434 err_nh = NULL; 4435 list_for_each_entry(nh, &rt6_nh_list, next) { 4436 err = __ip6_ins_rt(nh->fib6_info, info, extack); 4437 fib6_info_release(nh->fib6_info); 4438 4439 if (!err) { 4440 /* save reference to last route successfully inserted */ 4441 rt_last = nh->fib6_info; 4442 4443 /* save reference to first route for notification */ 4444 if (!rt_notif) 4445 rt_notif = nh->fib6_info; 4446 } 4447 4448 /* nh->fib6_info is used or freed at this point, reset to NULL */ 4449 nh->fib6_info = NULL; 4450 if (err) { 4451 if (replace && nhn) 4452 ip6_print_replace_route_err(&rt6_nh_list); 4453 err_nh = nh; 4454 goto add_errout; 4455 } 4456 4457 /* Because each route is added like a single route, we remove 4458 * these flags after the first nexthop: if there is a collision, 4459 * we have already failed to add the first nexthop; 4460 * fib6_add_rt2node() has rejected it. When replacing, the old 4461 * nexthops have been replaced by the first new one, and the rest 4462 * should be appended to it. 4463 */ 4464 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | 4465 NLM_F_REPLACE); 4466 nhn++; 4467 } 4468 4469 /* success ... tell user about new route */ 4470 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 4471 goto cleanup; 4472 4473 add_errout: 4474 /* send notification for routes that were added so that 4475 * the delete notifications sent by ip6_route_del are 4476 * coherent 4477 */ 4478 if (rt_notif) 4479 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 4480 4481 /* Delete routes that were already added */ 4482 list_for_each_entry(nh, &rt6_nh_list, next) { 4483 if (err_nh == nh) 4484 break; 4485 ip6_route_del(&nh->r_cfg, extack); 4486 } 4487 4488 cleanup: 4489 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { 4490 if (nh->fib6_info) 4491 fib6_info_release(nh->fib6_info); 4492 list_del(&nh->next); 4493 kfree(nh); 4494 } 4495 4496 return err; 4497 } 4498 4499 static int ip6_route_multipath_del(struct fib6_config *cfg, 4500 struct netlink_ext_ack *extack) 4501 { 4502 struct fib6_config r_cfg; 4503 struct rtnexthop *rtnh; 4504 int remaining; 4505 int attrlen; 4506 int err = 1, last_err = 0; 4507 4508 remaining = cfg->fc_mp_len; 4509 rtnh = (struct rtnexthop *)cfg->fc_mp; 4510 4511 /* Parse a Multipath Entry */ 4512 while (rtnh_ok(rtnh, remaining)) { 4513 memcpy(&r_cfg, cfg, sizeof(*cfg)); 4514 if (rtnh->rtnh_ifindex) 4515 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 4516 4517 attrlen = rtnh_attrlen(rtnh); 4518 if (attrlen > 0) { 4519 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 4520 4521 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 4522 if (nla) { 4523 nla_memcpy(&r_cfg.fc_gateway, nla, 16); 4524 r_cfg.fc_flags |= RTF_GATEWAY; 4525 } 4526 } 4527 err = ip6_route_del(&r_cfg, extack); 4528 if (err) 4529 last_err = err; 4530 4531 rtnh = rtnh_next(rtnh, &remaining); 4532 } 4533 4534 return last_err; 4535 } 4536 4537 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, 4538 struct netlink_ext_ack *extack) 4539 { 4540 struct fib6_config cfg; 4541 int err; 4542 4543 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 4544 if (err < 0) 4545 return err; 4546 4547 if (cfg.fc_mp) 4548 return ip6_route_multipath_del(&cfg, extack); 4549 else { 4550 cfg.fc_delete_all_nh = 1; 4551 return ip6_route_del(&cfg, extack); 4552 } 4553 } 4554 4555 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, 4556 struct netlink_ext_ack *extack) 4557 {
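/* Dispatch sketch (editor's note, illustrative rather than authoritative):
 * a request carrying RTA_MULTIPATH is handed to ip6_route_multipath_add(),
 * which walks an array of rtnexthop entries; anything else becomes a single
 * ip6_route_add(). Roughly, the userspace payload for a two-nexthop
 * RTM_NEWROUTE looks like:
 *
 *	struct nlmsghdr		nlh;	// RTM_NEWROUTE, NLM_F_REQUEST | NLM_F_CREATE
 *	struct rtmsg		rtm;	// rtm_family = AF_INET6
 *	[RTA_DST]			// the in6_addr prefix
 *	[RTA_MULTIPATH]			// nested:
 *		struct rtnexthop nh0;	// rtnh_ifindex; rtnh_hops = weight - 1
 *		[RTA_GATEWAY]		// per-nexthop gateway
 *		struct rtnexthop nh1;
 *		[RTA_GATEWAY]
 *
 * Types and attribute names follow include/uapi/linux/rtnetlink.h; the
 * nesting above is a sketch, not a byte-accurate dump.
 */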
4558 struct fib6_config cfg; 4559 int err; 4560 4561 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 4562 if (err < 0) 4563 return err; 4564 4565 if (cfg.fc_mp) 4566 return ip6_route_multipath_add(&cfg, extack); 4567 else 4568 return ip6_route_add(&cfg, GFP_KERNEL, extack); 4569 } 4570 4571 static size_t rt6_nlmsg_size(struct fib6_info *rt) 4572 { 4573 int nexthop_len = 0; 4574 4575 if (rt->fib6_nsiblings) { 4576 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ 4577 + NLA_ALIGN(sizeof(struct rtnexthop)) 4578 + nla_total_size(16) /* RTA_GATEWAY */ 4579 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate); 4580 4581 nexthop_len *= rt->fib6_nsiblings; 4582 } 4583 4584 return NLMSG_ALIGN(sizeof(struct rtmsg)) 4585 + nla_total_size(16) /* RTA_SRC */ 4586 + nla_total_size(16) /* RTA_DST */ 4587 + nla_total_size(16) /* RTA_GATEWAY */ 4588 + nla_total_size(16) /* RTA_PREFSRC */ 4589 + nla_total_size(4) /* RTA_TABLE */ 4590 + nla_total_size(4) /* RTA_IIF */ 4591 + nla_total_size(4) /* RTA_OIF */ 4592 + nla_total_size(4) /* RTA_PRIORITY */ 4593 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */ 4594 + nla_total_size(sizeof(struct rta_cacheinfo)) 4595 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ 4596 + nla_total_size(1) /* RTA_PREF */ 4597 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate) 4598 + nexthop_len; 4599 } 4600 4601 static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt, 4602 unsigned int *flags, bool skip_oif) 4603 { 4604 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) 4605 *flags |= RTNH_F_DEAD; 4606 4607 if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) { 4608 *flags |= RTNH_F_LINKDOWN; 4609 4610 rcu_read_lock(); 4611 if (fib6_ignore_linkdown(rt)) 4612 *flags |= RTNH_F_DEAD; 4613 rcu_read_unlock(); 4614 } 4615 4616 if (rt->fib6_flags & RTF_GATEWAY) { 4617 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0) 4618 goto nla_put_failure; 4619 } 4620 4621 *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK); 4622 if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD) 4623 *flags |= RTNH_F_OFFLOAD; 4624 4625 /* not needed for multipath encoding b/c it has a rtnexthop struct */ 4626 if (!skip_oif && rt->fib6_nh.nh_dev && 4627 nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex)) 4628 goto nla_put_failure; 4629 4630 if (rt->fib6_nh.nh_lwtstate && 4631 lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0) 4632 goto nla_put_failure; 4633 4634 return 0; 4635 4636 nla_put_failure: 4637 return -EMSGSIZE; 4638 } 4639 4640 /* add multipath next hop */ 4641 static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt) 4642 { 4643 const struct net_device *dev = rt->fib6_nh.nh_dev; 4644 struct rtnexthop *rtnh; 4645 unsigned int flags = 0; 4646 4647 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh)); 4648 if (!rtnh) 4649 goto nla_put_failure; 4650 4651 rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1; 4652 rtnh->rtnh_ifindex = dev ? 
dev->ifindex : 0; 4653 4654 if (rt6_nexthop_info(skb, rt, &flags, true) < 0) 4655 goto nla_put_failure; 4656 4657 rtnh->rtnh_flags = flags; 4658 4659 /* length of rtnetlink header + attributes */ 4660 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh; 4661 4662 return 0; 4663 4664 nla_put_failure: 4665 return -EMSGSIZE; 4666 } 4667 4668 static int rt6_fill_node(struct net *net, struct sk_buff *skb, 4669 struct fib6_info *rt, struct dst_entry *dst, 4670 struct in6_addr *dest, struct in6_addr *src, 4671 int iif, int type, u32 portid, u32 seq, 4672 unsigned int flags) 4673 { 4674 struct rtmsg *rtm; 4675 struct nlmsghdr *nlh; 4676 long expires = 0; 4677 u32 *pmetrics; 4678 u32 table; 4679 4680 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags); 4681 if (!nlh) 4682 return -EMSGSIZE; 4683 4684 rtm = nlmsg_data(nlh); 4685 rtm->rtm_family = AF_INET6; 4686 rtm->rtm_dst_len = rt->fib6_dst.plen; 4687 rtm->rtm_src_len = rt->fib6_src.plen; 4688 rtm->rtm_tos = 0; 4689 if (rt->fib6_table) 4690 table = rt->fib6_table->tb6_id; 4691 else 4692 table = RT6_TABLE_UNSPEC; 4693 rtm->rtm_table = table; 4694 if (nla_put_u32(skb, RTA_TABLE, table)) 4695 goto nla_put_failure; 4696 4697 rtm->rtm_type = rt->fib6_type; 4698 rtm->rtm_flags = 0; 4699 rtm->rtm_scope = RT_SCOPE_UNIVERSE; 4700 rtm->rtm_protocol = rt->fib6_protocol; 4701 4702 if (rt->fib6_flags & RTF_CACHE) 4703 rtm->rtm_flags |= RTM_F_CLONED; 4704 4705 if (dest) { 4706 if (nla_put_in6_addr(skb, RTA_DST, dest)) 4707 goto nla_put_failure; 4708 rtm->rtm_dst_len = 128; 4709 } else if (rtm->rtm_dst_len) 4710 if (nla_put_in6_addr(skb, RTA_DST, &rt->fib6_dst.addr)) 4711 goto nla_put_failure; 4712 #ifdef CONFIG_IPV6_SUBTREES 4713 if (src) { 4714 if (nla_put_in6_addr(skb, RTA_SRC, src)) 4715 goto nla_put_failure; 4716 rtm->rtm_src_len = 128; 4717 } else if (rtm->rtm_src_len && 4718 nla_put_in6_addr(skb, RTA_SRC, &rt->fib6_src.addr)) 4719 goto nla_put_failure; 4720 #endif 4721 if (iif) { 4722 #ifdef CONFIG_IPV6_MROUTE 4723 if (ipv6_addr_is_multicast(&rt->fib6_dst.addr)) { 4724 int err = ip6mr_get_route(net, skb, rtm, portid); 4725 4726 if (err == 0) 4727 return 0; 4728 if (err < 0) 4729 goto nla_put_failure; 4730 } else 4731 #endif 4732 if (nla_put_u32(skb, RTA_IIF, iif)) 4733 goto nla_put_failure; 4734 } else if (dest) { 4735 struct in6_addr saddr_buf; 4736 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 && 4737 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 4738 goto nla_put_failure; 4739 } 4740 4741 if (rt->fib6_prefsrc.plen) { 4742 struct in6_addr saddr_buf; 4743 saddr_buf = rt->fib6_prefsrc.addr; 4744 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 4745 goto nla_put_failure; 4746 } 4747 4748 pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics; 4749 if (rtnetlink_put_metrics(skb, pmetrics) < 0) 4750 goto nla_put_failure; 4751 4752 if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric)) 4753 goto nla_put_failure; 4754 4755 /* For multipath routes, walk the siblings list and add 4756 * each as a nexthop within RTA_MULTIPATH. 
4757 */ 4758 if (rt->fib6_nsiblings) { 4759 struct fib6_info *sibling, *next_sibling; 4760 struct nlattr *mp; 4761 4762 mp = nla_nest_start(skb, RTA_MULTIPATH); 4763 if (!mp) 4764 goto nla_put_failure; 4765 4766 if (rt6_add_nexthop(skb, rt) < 0) 4767 goto nla_put_failure; 4768 4769 list_for_each_entry_safe(sibling, next_sibling, 4770 &rt->fib6_siblings, fib6_siblings) { 4771 if (rt6_add_nexthop(skb, sibling) < 0) 4772 goto nla_put_failure; 4773 } 4774 4775 nla_nest_end(skb, mp); 4776 } else { 4777 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0) 4778 goto nla_put_failure; 4779 } 4780 4781 if (rt->fib6_flags & RTF_EXPIRES) { 4782 expires = dst ? dst->expires : rt->expires; 4783 expires -= jiffies; 4784 } 4785 4786 if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0) 4787 goto nla_put_failure; 4788 4789 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->fib6_flags))) 4790 goto nla_put_failure; 4791 4792 4793 nlmsg_end(skb, nlh); 4794 return 0; 4795 4796 nla_put_failure: 4797 nlmsg_cancel(skb, nlh); 4798 return -EMSGSIZE; 4799 } 4800 4801 int rt6_dump_route(struct fib6_info *rt, void *p_arg) 4802 { 4803 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; 4804 struct net *net = arg->net; 4805 4806 if (rt == net->ipv6.fib6_null_entry) 4807 return 0; 4808 4809 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) { 4810 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh); 4811 4812 /* user wants prefix routes only */ 4813 if (rtm->rtm_flags & RTM_F_PREFIX && 4814 !(rt->fib6_flags & RTF_PREFIX_RT)) { 4815 /* success since this is not a prefix route */ 4816 return 1; 4817 } 4818 } 4819 4820 return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0, 4821 RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid, 4822 arg->cb->nlh->nlmsg_seq, NLM_F_MULTI); 4823 } 4824 4825 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, 4826 struct netlink_ext_ack *extack) 4827 { 4828 struct net *net = sock_net(in_skb->sk); 4829 struct nlattr *tb[RTA_MAX+1]; 4830 int err, iif = 0, oif = 0; 4831 struct fib6_info *from; 4832 struct dst_entry *dst; 4833 struct rt6_info *rt; 4834 struct sk_buff *skb; 4835 struct rtmsg *rtm; 4836 struct flowi6 fl6; 4837 bool fibmatch; 4838 4839 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, 4840 extack); 4841 if (err < 0) 4842 goto errout; 4843 4844 err = -EINVAL; 4845 memset(&fl6, 0, sizeof(fl6)); 4846 rtm = nlmsg_data(nlh); 4847 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0); 4848 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH); 4849 4850 if (tb[RTA_SRC]) { 4851 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr)) 4852 goto errout; 4853 4854 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]); 4855 } 4856 4857 if (tb[RTA_DST]) { 4858 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr)) 4859 goto errout; 4860 4861 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]); 4862 } 4863 4864 if (tb[RTA_IIF]) 4865 iif = nla_get_u32(tb[RTA_IIF]); 4866 4867 if (tb[RTA_OIF]) 4868 oif = nla_get_u32(tb[RTA_OIF]); 4869 4870 if (tb[RTA_MARK]) 4871 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]); 4872 4873 if (tb[RTA_UID]) 4874 fl6.flowi6_uid = make_kuid(current_user_ns(), 4875 nla_get_u32(tb[RTA_UID])); 4876 else 4877 fl6.flowi6_uid = iif ? 
INVALID_UID : current_uid(); 4878 4879 if (tb[RTA_SPORT]) 4880 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]); 4881 4882 if (tb[RTA_DPORT]) 4883 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]); 4884 4885 if (tb[RTA_IP_PROTO]) { 4886 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO], 4887 &fl6.flowi6_proto, extack); 4888 if (err) 4889 goto errout; 4890 } 4891 4892 if (iif) { 4893 struct net_device *dev; 4894 int flags = 0; 4895 4896 rcu_read_lock(); 4897 4898 dev = dev_get_by_index_rcu(net, iif); 4899 if (!dev) { 4900 rcu_read_unlock(); 4901 err = -ENODEV; 4902 goto errout; 4903 } 4904 4905 fl6.flowi6_iif = iif; 4906 4907 if (!ipv6_addr_any(&fl6.saddr)) 4908 flags |= RT6_LOOKUP_F_HAS_SADDR; 4909 4910 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags); 4911 4912 rcu_read_unlock(); 4913 } else { 4914 fl6.flowi6_oif = oif; 4915 4916 dst = ip6_route_output(net, NULL, &fl6); 4917 } 4918 4919 4920 rt = container_of(dst, struct rt6_info, dst); 4921 if (rt->dst.error) { 4922 err = rt->dst.error; 4923 ip6_rt_put(rt); 4924 goto errout; 4925 } 4926 4927 if (rt == net->ipv6.ip6_null_entry) { 4928 err = rt->dst.error; 4929 ip6_rt_put(rt); 4930 goto errout; 4931 } 4932 4933 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 4934 if (!skb) { 4935 ip6_rt_put(rt); 4936 err = -ENOBUFS; 4937 goto errout; 4938 } 4939 4940 skb_dst_set(skb, &rt->dst); 4941 4942 rcu_read_lock(); 4943 from = rcu_dereference(rt->from); 4944 4945 if (fibmatch) 4946 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif, 4947 RTM_NEWROUTE, NETLINK_CB(in_skb).portid, 4948 nlh->nlmsg_seq, 0); 4949 else 4950 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr, 4951 &fl6.saddr, iif, RTM_NEWROUTE, 4952 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 4953 0); 4954 rcu_read_unlock(); 4955 4956 if (err < 0) { 4957 kfree_skb(skb); 4958 goto errout; 4959 } 4960 4961 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); 4962 errout: 4963 return err; 4964 } 4965 4966 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, 4967 unsigned int nlm_flags) 4968 { 4969 struct sk_buff *skb; 4970 struct net *net = info->nl_net; 4971 u32 seq; 4972 int err; 4973 4974 err = -ENOBUFS; 4975 seq = info->nlh ? 
info->nlh->nlmsg_seq : 0; 4976 4977 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 4978 if (!skb) 4979 goto errout; 4980 4981 err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, 4982 event, info->portid, seq, nlm_flags); 4983 if (err < 0) { 4984 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ 4985 WARN_ON(err == -EMSGSIZE); 4986 kfree_skb(skb); 4987 goto errout; 4988 } 4989 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 4990 info->nlh, gfp_any()); 4991 return; 4992 errout: 4993 if (err < 0) 4994 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); 4995 } 4996 4997 static int ip6_route_dev_notify(struct notifier_block *this, 4998 unsigned long event, void *ptr) 4999 { 5000 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 5001 struct net *net = dev_net(dev); 5002 5003 if (!(dev->flags & IFF_LOOPBACK)) 5004 return NOTIFY_OK; 5005 5006 if (event == NETDEV_REGISTER) { 5007 net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev; 5008 net->ipv6.ip6_null_entry->dst.dev = dev; 5009 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev); 5010 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5011 net->ipv6.ip6_prohibit_entry->dst.dev = dev; 5012 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev); 5013 net->ipv6.ip6_blk_hole_entry->dst.dev = dev; 5014 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev); 5015 #endif 5016 } else if (event == NETDEV_UNREGISTER && 5017 dev->reg_state != NETREG_UNREGISTERED) { 5018 /* NETDEV_UNREGISTER can be fired multiple times by 5019 * netdev_wait_allrefs(). Make sure we only call this once. 5020 */ 5021 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev); 5022 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5023 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev); 5024 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev); 5025 #endif 5026 } 5027 5028 return NOTIFY_OK; 5029 } 5030 5031 /* 5032 * /proc 5033 */ 5034 5035 #ifdef CONFIG_PROC_FS 5036 static int rt6_stats_seq_show(struct seq_file *seq, void *v) 5037 { 5038 struct net *net = (struct net *)seq->private; 5039 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n", 5040 net->ipv6.rt6_stats->fib_nodes, 5041 net->ipv6.rt6_stats->fib_route_nodes, 5042 atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc), 5043 net->ipv6.rt6_stats->fib_rt_entries, 5044 net->ipv6.rt6_stats->fib_rt_cache, 5045 dst_entries_get_slow(&net->ipv6.ip6_dst_ops), 5046 net->ipv6.rt6_stats->fib_discarded_routes); 5047 5048 return 0; 5049 } 5050 #endif /* CONFIG_PROC_FS */ 5051 5052 #ifdef CONFIG_SYSCTL 5053 5054 static 5055 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, 5056 void __user *buffer, size_t *lenp, loff_t *ppos) 5057 { 5058 struct net *net; 5059 int delay; 5060 if (!write) 5061 return -EINVAL; 5062 5063 net = (struct net *)ctl->extra1; 5064 delay = net->ipv6.sysctl.flush_delay; 5065 proc_dointvec(ctl, write, buffer, lenp, ppos); 5066 fib6_run_gc(delay <= 0 ?
0 : (unsigned long)delay, net, delay > 0); 5067 return 0; 5068 } 5069 5070 struct ctl_table ipv6_route_table_template[] = { 5071 { 5072 .procname = "flush", 5073 .data = &init_net.ipv6.sysctl.flush_delay, 5074 .maxlen = sizeof(int), 5075 .mode = 0200, 5076 .proc_handler = ipv6_sysctl_rtcache_flush 5077 }, 5078 { 5079 .procname = "gc_thresh", 5080 .data = &ip6_dst_ops_template.gc_thresh, 5081 .maxlen = sizeof(int), 5082 .mode = 0644, 5083 .proc_handler = proc_dointvec, 5084 }, 5085 { 5086 .procname = "max_size", 5087 .data = &init_net.ipv6.sysctl.ip6_rt_max_size, 5088 .maxlen = sizeof(int), 5089 .mode = 0644, 5090 .proc_handler = proc_dointvec, 5091 }, 5092 { 5093 .procname = "gc_min_interval", 5094 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 5095 .maxlen = sizeof(int), 5096 .mode = 0644, 5097 .proc_handler = proc_dointvec_jiffies, 5098 }, 5099 { 5100 .procname = "gc_timeout", 5101 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout, 5102 .maxlen = sizeof(int), 5103 .mode = 0644, 5104 .proc_handler = proc_dointvec_jiffies, 5105 }, 5106 { 5107 .procname = "gc_interval", 5108 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval, 5109 .maxlen = sizeof(int), 5110 .mode = 0644, 5111 .proc_handler = proc_dointvec_jiffies, 5112 }, 5113 { 5114 .procname = "gc_elasticity", 5115 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity, 5116 .maxlen = sizeof(int), 5117 .mode = 0644, 5118 .proc_handler = proc_dointvec, 5119 }, 5120 { 5121 .procname = "mtu_expires", 5122 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires, 5123 .maxlen = sizeof(int), 5124 .mode = 0644, 5125 .proc_handler = proc_dointvec_jiffies, 5126 }, 5127 { 5128 .procname = "min_adv_mss", 5129 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss, 5130 .maxlen = sizeof(int), 5131 .mode = 0644, 5132 .proc_handler = proc_dointvec, 5133 }, 5134 { 5135 .procname = "gc_min_interval_ms", 5136 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 5137 .maxlen = sizeof(int), 5138 .mode = 0644, 5139 .proc_handler = proc_dointvec_ms_jiffies, 5140 }, 5141 { } 5142 }; 5143 5144 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net) 5145 { 5146 struct ctl_table *table; 5147 5148 table = kmemdup(ipv6_route_table_template, 5149 sizeof(ipv6_route_table_template), 5150 GFP_KERNEL); 5151 5152 if (table) { 5153 table[0].data = &net->ipv6.sysctl.flush_delay; 5154 table[0].extra1 = net; 5155 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh; 5156 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size; 5157 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; 5158 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout; 5159 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval; 5160 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity; 5161 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires; 5162 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss; 5163 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; 5164 5165 /* Don't export sysctls to unprivileged users */ 5166 if (net->user_ns != &init_user_ns) 5167 table[0].procname = NULL; 5168 } 5169 5170 return table; 5171 } 5172 #endif 5173 5174 static int __net_init ip6_route_net_init(struct net *net) 5175 { 5176 int ret = -ENOMEM; 5177 5178 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template, 5179 sizeof(net->ipv6.ip6_dst_ops)); 5180 5181 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0) 5182 goto out_ip6_dst_ops; 5183 5184 net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template, 5185 sizeof(*net->ipv6.fib6_null_entry), 5186 GFP_KERNEL); 5187 if (!net->ipv6.fib6_null_entry) 5188 goto 
out_ip6_dst_entries; 5189 5190 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template, 5191 sizeof(*net->ipv6.ip6_null_entry), 5192 GFP_KERNEL); 5193 if (!net->ipv6.ip6_null_entry) 5194 goto out_fib6_null_entry; 5195 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5196 dst_init_metrics(&net->ipv6.ip6_null_entry->dst, 5197 ip6_template_metrics, true); 5198 5199 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5200 net->ipv6.fib6_has_custom_rules = false; 5201 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template, 5202 sizeof(*net->ipv6.ip6_prohibit_entry), 5203 GFP_KERNEL); 5204 if (!net->ipv6.ip6_prohibit_entry) 5205 goto out_ip6_null_entry; 5206 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5207 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst, 5208 ip6_template_metrics, true); 5209 5210 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template, 5211 sizeof(*net->ipv6.ip6_blk_hole_entry), 5212 GFP_KERNEL); 5213 if (!net->ipv6.ip6_blk_hole_entry) 5214 goto out_ip6_prohibit_entry; 5215 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5216 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, 5217 ip6_template_metrics, true); 5218 #endif 5219 5220 net->ipv6.sysctl.flush_delay = 0; 5221 net->ipv6.sysctl.ip6_rt_max_size = 4096; 5222 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2; 5223 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ; 5224 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ; 5225 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9; 5226 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ; 5227 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; 5228 5229 net->ipv6.ip6_rt_gc_expire = 30*HZ; 5230 5231 ret = 0; 5232 out: 5233 return ret; 5234 5235 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5236 out_ip6_prohibit_entry: 5237 kfree(net->ipv6.ip6_prohibit_entry); 5238 out_ip6_null_entry: 5239 kfree(net->ipv6.ip6_null_entry); 5240 #endif 5241 out_fib6_null_entry: 5242 kfree(net->ipv6.fib6_null_entry); 5243 out_ip6_dst_entries: 5244 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 5245 out_ip6_dst_ops: 5246 goto out; 5247 } 5248 5249 static void __net_exit ip6_route_net_exit(struct net *net) 5250 { 5251 kfree(net->ipv6.fib6_null_entry); 5252 kfree(net->ipv6.ip6_null_entry); 5253 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5254 kfree(net->ipv6.ip6_prohibit_entry); 5255 kfree(net->ipv6.ip6_blk_hole_entry); 5256 #endif 5257 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 5258 } 5259 5260 static int __net_init ip6_route_net_init_late(struct net *net) 5261 { 5262 #ifdef CONFIG_PROC_FS 5263 proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops, 5264 sizeof(struct ipv6_route_iter)); 5265 proc_create_net_single("rt6_stats", 0444, net->proc_net, 5266 rt6_stats_seq_show, NULL); 5267 #endif 5268 return 0; 5269 } 5270 5271 static void __net_exit ip6_route_net_exit_late(struct net *net) 5272 { 5273 #ifdef CONFIG_PROC_FS 5274 remove_proc_entry("ipv6_route", net->proc_net); 5275 remove_proc_entry("rt6_stats", net->proc_net); 5276 #endif 5277 } 5278 5279 static struct pernet_operations ip6_route_net_ops = { 5280 .init = ip6_route_net_init, 5281 .exit = ip6_route_net_exit, 5282 }; 5283 5284 static int __net_init ipv6_inetpeer_init(struct net *net) 5285 { 5286 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL); 5287 5288 if (!bp) 5289 return -ENOMEM; 5290 inet_peer_base_init(bp); 5291 net->ipv6.peers = bp; 5292 return 0; 5293 } 5294 5295 static void __net_exit ipv6_inetpeer_exit(struct net *net) 5296 { 5297 struct inet_peer_base *bp = net->ipv6.peers; 5298 5299 
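/* Editor's note: detach the base from the namespace before tearing it
 * down, so nothing can reach it through net->ipv6.peers while the tree
 * is being invalidated and freed.
 */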
net->ipv6.peers = NULL; 5300 inetpeer_invalidate_tree(bp); 5301 kfree(bp); 5302 } 5303 5304 static struct pernet_operations ipv6_inetpeer_ops = { 5305 .init = ipv6_inetpeer_init, 5306 .exit = ipv6_inetpeer_exit, 5307 }; 5308 5309 static struct pernet_operations ip6_route_net_late_ops = { 5310 .init = ip6_route_net_init_late, 5311 .exit = ip6_route_net_exit_late, 5312 }; 5313 5314 static struct notifier_block ip6_route_dev_notifier = { 5315 .notifier_call = ip6_route_dev_notify, 5316 .priority = ADDRCONF_NOTIFY_PRIORITY - 10, 5317 }; 5318 5319 void __init ip6_route_init_special_entries(void) 5320 { 5321 /* Registration of the loopback device happens before this code runs, 5322 * so the loopback reference in rt6_info is not taken; take it 5323 * manually for init_net. */ 5324 init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev; 5325 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; 5326 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5327 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5328 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev; 5329 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5330 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev; 5331 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5332 #endif 5333 } 5334 5335 int __init ip6_route_init(void) 5336 { 5337 int ret; 5338 int cpu; 5339 5340 ret = -ENOMEM; 5341 ip6_dst_ops_template.kmem_cachep = 5342 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0, 5343 SLAB_HWCACHE_ALIGN, NULL); 5344 if (!ip6_dst_ops_template.kmem_cachep) 5345 goto out; 5346 5347 ret = dst_entries_init(&ip6_dst_blackhole_ops); 5348 if (ret) 5349 goto out_kmem_cache; 5350 5351 ret = register_pernet_subsys(&ipv6_inetpeer_ops); 5352 if (ret) 5353 goto out_dst_entries; 5354 5355 ret = register_pernet_subsys(&ip6_route_net_ops); 5356 if (ret) 5357 goto out_register_inetpeer; 5358 5359 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep; 5360 5361 ret = fib6_init(); 5362 if (ret) 5363 goto out_register_subsys; 5364 5365 ret = xfrm6_init(); 5366 if (ret) 5367 goto out_fib6_init; 5368 5369 ret = fib6_rules_init(); 5370 if (ret) 5371 goto xfrm6_init; 5372 5373 ret = register_pernet_subsys(&ip6_route_net_late_ops); 5374 if (ret) 5375 goto fib6_rules_init; 5376 5377 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE, 5378 inet6_rtm_newroute, NULL, 0); 5379 if (ret < 0) 5380 goto out_register_late_subsys; 5381 5382 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE, 5383 inet6_rtm_delroute, NULL, 0); 5384 if (ret < 0) 5385 goto out_register_late_subsys; 5386 5387 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, 5388 inet6_rtm_getroute, NULL, 5389 RTNL_FLAG_DOIT_UNLOCKED); 5390 if (ret < 0) 5391 goto out_register_late_subsys; 5392 5393 ret = register_netdevice_notifier(&ip6_route_dev_notifier); 5394 if (ret) 5395 goto out_register_late_subsys; 5396 5397 for_each_possible_cpu(cpu) { 5398 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); 5399 5400 INIT_LIST_HEAD(&ul->head); 5401 spin_lock_init(&ul->lock); 5402 } 5403 5404 out: 5405 return ret; 5406 5407 out_register_late_subsys: 5408 rtnl_unregister_all(PF_INET6); 5409 unregister_pernet_subsys(&ip6_route_net_late_ops); 5410 fib6_rules_init: 5411 fib6_rules_cleanup(); 5412 xfrm6_init: 5413 xfrm6_fini(); 5414 out_fib6_init: 5415 fib6_gc_cleanup(); 5416 out_register_subsys: 5417
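/* Editor's note: the error unwind cascades from here; each label undoes
 * one init step and falls through to the next, mirroring the setup
 * order of ip6_route_init() in reverse.
 */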
unregister_pernet_subsys(&ip6_route_net_ops); 5418 out_register_inetpeer: 5419 unregister_pernet_subsys(&ipv6_inetpeer_ops); 5420 out_dst_entries: 5421 dst_entries_destroy(&ip6_dst_blackhole_ops); 5422 out_kmem_cache: 5423 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); 5424 goto out; 5425 } 5426 5427 void ip6_route_cleanup(void) 5428 { 5429 unregister_netdevice_notifier(&ip6_route_dev_notifier); 5430 unregister_pernet_subsys(&ip6_route_net_late_ops); 5431 fib6_rules_cleanup(); 5432 xfrm6_fini(); 5433 fib6_gc_cleanup(); 5434 unregister_pernet_subsys(&ipv6_inetpeer_ops); 5435 unregister_pernet_subsys(&ip6_route_net_ops); 5436 dst_entries_destroy(&ip6_dst_blackhole_ops); 5437 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); 5438 } 5439
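/* Usage sketch (editor's addition, userspace code, not part of the kernel):
 * a minimal RTM_GETROUTE query against inet6_rtm_getroute() above, over a
 * raw NETLINK_ROUTE socket. Error handling and reply parsing are elided;
 * the request layout mirrors what the handler parses (struct rtmsg followed
 * by an RTA_DST attribute). Assumes the standard Linux uapi headers.
 *
 *	#include <linux/rtnetlink.h>
 *	#include <netinet/in.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *	#include <sys/socket.h>
 *
 *	static int query_ipv6_route(const struct in6_addr *dst)
 *	{
 *		struct {
 *			struct nlmsghdr nlh;
 *			struct rtmsg rtm;
 *			struct rtattr rta;
 *			struct in6_addr addr;
 *		} req;
 *		char reply[4096];
 *		int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
 *
 *		memset(&req, 0, sizeof(req));
 *		req.nlh.nlmsg_len = sizeof(req);
 *		req.nlh.nlmsg_type = RTM_GETROUTE;
 *		req.nlh.nlmsg_flags = NLM_F_REQUEST;
 *		req.rtm.rtm_family = AF_INET6;
 *		req.rtm.rtm_dst_len = 128;
 *		req.rta.rta_type = RTA_DST;
 *		req.rta.rta_len = RTA_LENGTH(sizeof(*dst));
 *		req.addr = *dst;
 *
 *		send(fd, &req, sizeof(req), 0);
 *		recv(fd, reply, sizeof(reply), 0);	// RTM_NEWROUTE or nlmsgerr
 *		close(fd);
 *		return 0;
 *	}
 */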