1 /* 2 * Linux INET6 implementation 3 * FIB front-end. 4 * 5 * Authors: 6 * Pedro Roque <roque@di.fc.ul.pt> 7 * 8 * This program is free software; you can redistribute it and/or 9 * modify it under the terms of the GNU General Public License 10 * as published by the Free Software Foundation; either version 11 * 2 of the License, or (at your option) any later version. 12 */ 13 14 /* Changes: 15 * 16 * YOSHIFUJI Hideaki @USAGI 17 * reworked default router selection. 18 * - respect outgoing interface 19 * - select from (probably) reachable routers (i.e. 20 * routers in REACHABLE, STALE, DELAY or PROBE states). 21 * - always select the same router if it is (probably) 22 * reachable. otherwise, round-robin the list. 23 * Ville Nuorvala 24 * Fixed routing subtrees. 25 */ 26 27 #define pr_fmt(fmt) "IPv6: " fmt 28 29 #include <linux/capability.h> 30 #include <linux/errno.h> 31 #include <linux/export.h> 32 #include <linux/types.h> 33 #include <linux/times.h> 34 #include <linux/socket.h> 35 #include <linux/sockios.h> 36 #include <linux/net.h> 37 #include <linux/route.h> 38 #include <linux/netdevice.h> 39 #include <linux/in6.h> 40 #include <linux/mroute6.h> 41 #include <linux/init.h> 42 #include <linux/if_arp.h> 43 #include <linux/proc_fs.h> 44 #include <linux/seq_file.h> 45 #include <linux/nsproxy.h> 46 #include <linux/slab.h> 47 #include <linux/jhash.h> 48 #include <net/net_namespace.h> 49 #include <net/snmp.h> 50 #include <net/ipv6.h> 51 #include <net/ip6_fib.h> 52 #include <net/ip6_route.h> 53 #include <net/ndisc.h> 54 #include <net/addrconf.h> 55 #include <net/tcp.h> 56 #include <linux/rtnetlink.h> 57 #include <net/dst.h> 58 #include <net/dst_metadata.h> 59 #include <net/xfrm.h> 60 #include <net/netevent.h> 61 #include <net/netlink.h> 62 #include <net/nexthop.h> 63 #include <net/lwtunnel.h> 64 #include <net/ip_tunnels.h> 65 #include <net/l3mdev.h> 66 #include <net/ip.h> 67 #include <linux/uaccess.h> 68 69 #ifdef CONFIG_SYSCTL 70 #include <linux/sysctl.h> 71 #endif 72 73 static int ip6_rt_type_to_error(u8 fib6_type); 74 75 #define CREATE_TRACE_POINTS 76 #include <trace/events/fib6.h> 77 EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup); 78 #undef CREATE_TRACE_POINTS 79 80 enum rt6_nud_state { 81 RT6_NUD_FAIL_HARD = -3, 82 RT6_NUD_FAIL_PROBE = -2, 83 RT6_NUD_FAIL_DO_RR = -1, 84 RT6_NUD_SUCCEED = 1 85 }; 86 87 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie); 88 static unsigned int ip6_default_advmss(const struct dst_entry *dst); 89 static unsigned int ip6_mtu(const struct dst_entry *dst); 90 static struct dst_entry *ip6_negative_advice(struct dst_entry *); 91 static void ip6_dst_destroy(struct dst_entry *); 92 static void ip6_dst_ifdown(struct dst_entry *, 93 struct net_device *dev, int how); 94 static int ip6_dst_gc(struct dst_ops *ops); 95 96 static int ip6_pkt_discard(struct sk_buff *skb); 97 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb); 98 static int ip6_pkt_prohibit(struct sk_buff *skb); 99 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb); 100 static void ip6_link_failure(struct sk_buff *skb); 101 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, 102 struct sk_buff *skb, u32 mtu); 103 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, 104 struct sk_buff *skb); 105 static int rt6_score_route(struct fib6_info *rt, int oif, int strict); 106 static size_t rt6_nlmsg_size(struct fib6_info *rt); 107 static int rt6_fill_node(struct net *net, struct sk_buff *skb, 
108 struct fib6_info *rt, struct dst_entry *dst, 109 struct in6_addr *dest, struct in6_addr *src, 110 int iif, int type, u32 portid, u32 seq, 111 unsigned int flags); 112 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt, 113 struct in6_addr *daddr, 114 struct in6_addr *saddr); 115 116 #ifdef CONFIG_IPV6_ROUTE_INFO 117 static struct fib6_info *rt6_add_route_info(struct net *net, 118 const struct in6_addr *prefix, int prefixlen, 119 const struct in6_addr *gwaddr, 120 struct net_device *dev, 121 unsigned int pref); 122 static struct fib6_info *rt6_get_route_info(struct net *net, 123 const struct in6_addr *prefix, int prefixlen, 124 const struct in6_addr *gwaddr, 125 struct net_device *dev); 126 #endif 127 128 struct uncached_list { 129 spinlock_t lock; 130 struct list_head head; 131 }; 132 133 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list); 134 135 void rt6_uncached_list_add(struct rt6_info *rt) 136 { 137 struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list); 138 139 rt->rt6i_uncached_list = ul; 140 141 spin_lock_bh(&ul->lock); 142 list_add_tail(&rt->rt6i_uncached, &ul->head); 143 spin_unlock_bh(&ul->lock); 144 } 145 146 void rt6_uncached_list_del(struct rt6_info *rt) 147 { 148 if (!list_empty(&rt->rt6i_uncached)) { 149 struct uncached_list *ul = rt->rt6i_uncached_list; 150 struct net *net = dev_net(rt->dst.dev); 151 152 spin_lock_bh(&ul->lock); 153 list_del(&rt->rt6i_uncached); 154 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache); 155 spin_unlock_bh(&ul->lock); 156 } 157 } 158 159 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev) 160 { 161 struct net_device *loopback_dev = net->loopback_dev; 162 int cpu; 163 164 if (dev == loopback_dev) 165 return; 166 167 for_each_possible_cpu(cpu) { 168 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); 169 struct rt6_info *rt; 170 171 spin_lock_bh(&ul->lock); 172 list_for_each_entry(rt, &ul->head, rt6i_uncached) { 173 struct inet6_dev *rt_idev = rt->rt6i_idev; 174 struct net_device *rt_dev = rt->dst.dev; 175 176 if (rt_idev->dev == dev) { 177 rt->rt6i_idev = in6_dev_get(loopback_dev); 178 in6_dev_put(rt_idev); 179 } 180 181 if (rt_dev == dev) { 182 rt->dst.dev = loopback_dev; 183 dev_hold(rt->dst.dev); 184 dev_put(rt_dev); 185 } 186 } 187 spin_unlock_bh(&ul->lock); 188 } 189 } 190 191 static inline const void *choose_neigh_daddr(const struct in6_addr *p, 192 struct sk_buff *skb, 193 const void *daddr) 194 { 195 if (!ipv6_addr_any(p)) 196 return (const void *) p; 197 else if (skb) 198 return &ipv6_hdr(skb)->daddr; 199 return daddr; 200 } 201 202 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw, 203 struct net_device *dev, 204 struct sk_buff *skb, 205 const void *daddr) 206 { 207 struct neighbour *n; 208 209 daddr = choose_neigh_daddr(gw, skb, daddr); 210 n = __ipv6_neigh_lookup(dev, daddr); 211 if (n) 212 return n; 213 return neigh_create(&nd_tbl, daddr, dev); 214 } 215 216 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst, 217 struct sk_buff *skb, 218 const void *daddr) 219 { 220 const struct rt6_info *rt = container_of(dst, struct rt6_info, dst); 221 222 return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr); 223 } 224 225 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr) 226 { 227 struct net_device *dev = dst->dev; 228 struct rt6_info *rt = (struct rt6_info *)dst; 229 230 daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr); 231 if (!daddr) 232 return; 233 if (dev->flags & (IFF_NOARP | 
IFF_LOOPBACK)) 234 return; 235 if (ipv6_addr_is_multicast((const struct in6_addr *)daddr)) 236 return; 237 __ipv6_confirm_neigh(dev, daddr); 238 } 239 240 static struct dst_ops ip6_dst_ops_template = { 241 .family = AF_INET6, 242 .gc = ip6_dst_gc, 243 .gc_thresh = 1024, 244 .check = ip6_dst_check, 245 .default_advmss = ip6_default_advmss, 246 .mtu = ip6_mtu, 247 .cow_metrics = dst_cow_metrics_generic, 248 .destroy = ip6_dst_destroy, 249 .ifdown = ip6_dst_ifdown, 250 .negative_advice = ip6_negative_advice, 251 .link_failure = ip6_link_failure, 252 .update_pmtu = ip6_rt_update_pmtu, 253 .redirect = rt6_do_redirect, 254 .local_out = __ip6_local_out, 255 .neigh_lookup = ip6_dst_neigh_lookup, 256 .confirm_neigh = ip6_confirm_neigh, 257 }; 258 259 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst) 260 { 261 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); 262 263 return mtu ? : dst->dev->mtu; 264 } 265 266 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, 267 struct sk_buff *skb, u32 mtu) 268 { 269 } 270 271 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk, 272 struct sk_buff *skb) 273 { 274 } 275 276 static struct dst_ops ip6_dst_blackhole_ops = { 277 .family = AF_INET6, 278 .destroy = ip6_dst_destroy, 279 .check = ip6_dst_check, 280 .mtu = ip6_blackhole_mtu, 281 .default_advmss = ip6_default_advmss, 282 .update_pmtu = ip6_rt_blackhole_update_pmtu, 283 .redirect = ip6_rt_blackhole_redirect, 284 .cow_metrics = dst_cow_metrics_generic, 285 .neigh_lookup = ip6_dst_neigh_lookup, 286 }; 287 288 static const u32 ip6_template_metrics[RTAX_MAX] = { 289 [RTAX_HOPLIMIT - 1] = 0, 290 }; 291 292 static const struct fib6_info fib6_null_entry_template = { 293 .fib6_flags = (RTF_REJECT | RTF_NONEXTHOP), 294 .fib6_protocol = RTPROT_KERNEL, 295 .fib6_metric = ~(u32)0, 296 .fib6_ref = ATOMIC_INIT(1), 297 .fib6_type = RTN_UNREACHABLE, 298 .fib6_metrics = (struct dst_metrics *)&dst_default_metrics, 299 }; 300 301 static const struct rt6_info ip6_null_entry_template = { 302 .dst = { 303 .__refcnt = ATOMIC_INIT(1), 304 .__use = 1, 305 .obsolete = DST_OBSOLETE_FORCE_CHK, 306 .error = -ENETUNREACH, 307 .input = ip6_pkt_discard, 308 .output = ip6_pkt_discard_out, 309 }, 310 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), 311 }; 312 313 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 314 315 static const struct rt6_info ip6_prohibit_entry_template = { 316 .dst = { 317 .__refcnt = ATOMIC_INIT(1), 318 .__use = 1, 319 .obsolete = DST_OBSOLETE_FORCE_CHK, 320 .error = -EACCES, 321 .input = ip6_pkt_prohibit, 322 .output = ip6_pkt_prohibit_out, 323 }, 324 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), 325 }; 326 327 static const struct rt6_info ip6_blk_hole_entry_template = { 328 .dst = { 329 .__refcnt = ATOMIC_INIT(1), 330 .__use = 1, 331 .obsolete = DST_OBSOLETE_FORCE_CHK, 332 .error = -EINVAL, 333 .input = dst_discard, 334 .output = dst_discard_out, 335 }, 336 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), 337 }; 338 339 #endif 340 341 static void rt6_info_init(struct rt6_info *rt) 342 { 343 struct dst_entry *dst = &rt->dst; 344 345 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst)); 346 INIT_LIST_HEAD(&rt->rt6i_uncached); 347 } 348 349 /* allocate dst with ip6_dst_ops */ 350 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, 351 int flags) 352 { 353 struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev, 354 1, DST_OBSOLETE_FORCE_CHK, flags); 355 356 if (rt) { 357 rt6_info_init(rt); 358 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); 359 } 360 361 
return rt; 362 } 363 EXPORT_SYMBOL(ip6_dst_alloc); 364 365 static void ip6_dst_destroy(struct dst_entry *dst) 366 { 367 struct rt6_info *rt = (struct rt6_info *)dst; 368 struct fib6_info *from; 369 struct inet6_dev *idev; 370 371 dst_destroy_metrics_generic(dst); 372 rt6_uncached_list_del(rt); 373 374 idev = rt->rt6i_idev; 375 if (idev) { 376 rt->rt6i_idev = NULL; 377 in6_dev_put(idev); 378 } 379 380 rcu_read_lock(); 381 from = rcu_dereference(rt->from); 382 rcu_assign_pointer(rt->from, NULL); 383 fib6_info_release(from); 384 rcu_read_unlock(); 385 } 386 387 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, 388 int how) 389 { 390 struct rt6_info *rt = (struct rt6_info *)dst; 391 struct inet6_dev *idev = rt->rt6i_idev; 392 struct net_device *loopback_dev = 393 dev_net(dev)->loopback_dev; 394 395 if (idev && idev->dev != loopback_dev) { 396 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev); 397 if (loopback_idev) { 398 rt->rt6i_idev = loopback_idev; 399 in6_dev_put(idev); 400 } 401 } 402 } 403 404 static bool __rt6_check_expired(const struct rt6_info *rt) 405 { 406 if (rt->rt6i_flags & RTF_EXPIRES) 407 return time_after(jiffies, rt->dst.expires); 408 else 409 return false; 410 } 411 412 static bool rt6_check_expired(const struct rt6_info *rt) 413 { 414 struct fib6_info *from; 415 416 from = rcu_dereference(rt->from); 417 418 if (rt->rt6i_flags & RTF_EXPIRES) { 419 if (time_after(jiffies, rt->dst.expires)) 420 return true; 421 } else if (from) { 422 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK || 423 fib6_check_expired(from); 424 } 425 return false; 426 } 427 428 struct fib6_info *fib6_multipath_select(const struct net *net, 429 struct fib6_info *match, 430 struct flowi6 *fl6, int oif, 431 const struct sk_buff *skb, 432 int strict) 433 { 434 struct fib6_info *sibling, *next_sibling; 435 436 /* We might have already computed the hash for ICMPv6 errors. In such 437 * case it will always be non-zero. Otherwise now is the time to do it. 438 */ 439 if (!fl6->mp_hash) 440 fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL); 441 442 if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound)) 443 return match; 444 445 list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings, 446 fib6_siblings) { 447 int nh_upper_bound; 448 449 nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound); 450 if (fl6->mp_hash > nh_upper_bound) 451 continue; 452 if (rt6_score_route(sibling, oif, strict) < 0) 453 break; 454 match = sibling; 455 break; 456 } 457 458 return match; 459 } 460 461 /* 462 * Route lookup. rcu_read_lock() should be held. 463 */ 464 465 static inline struct fib6_info *rt6_device_match(struct net *net, 466 struct fib6_info *rt, 467 const struct in6_addr *saddr, 468 int oif, 469 int flags) 470 { 471 struct fib6_info *sprt; 472 473 if (!oif && ipv6_addr_any(saddr) && 474 !(rt->fib6_nh.nh_flags & RTNH_F_DEAD)) 475 return rt; 476 477 for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) { 478 const struct net_device *dev = sprt->fib6_nh.nh_dev; 479 480 if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD) 481 continue; 482 483 if (oif) { 484 if (dev->ifindex == oif) 485 return sprt; 486 } else { 487 if (ipv6_chk_addr(net, saddr, dev, 488 flags & RT6_LOOKUP_F_IFACE)) 489 return sprt; 490 } 491 } 492 493 if (oif && flags & RT6_LOOKUP_F_IFACE) 494 return net->ipv6.fib6_null_entry; 495 496 return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? 
net->ipv6.fib6_null_entry : rt; 497 } 498 499 #ifdef CONFIG_IPV6_ROUTER_PREF 500 struct __rt6_probe_work { 501 struct work_struct work; 502 struct in6_addr target; 503 struct net_device *dev; 504 }; 505 506 static void rt6_probe_deferred(struct work_struct *w) 507 { 508 struct in6_addr mcaddr; 509 struct __rt6_probe_work *work = 510 container_of(w, struct __rt6_probe_work, work); 511 512 addrconf_addr_solict_mult(&work->target, &mcaddr); 513 ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0); 514 dev_put(work->dev); 515 kfree(work); 516 } 517 518 static void rt6_probe(struct fib6_info *rt) 519 { 520 struct __rt6_probe_work *work; 521 const struct in6_addr *nh_gw; 522 struct neighbour *neigh; 523 struct net_device *dev; 524 525 /* 526 * Okay, this does not seem to be appropriate 527 * for now, however, we need to check if it 528 * is really so; aka Router Reachability Probing. 529 * 530 * Router Reachability Probe MUST be rate-limited 531 * to no more than one per minute. 532 */ 533 if (!rt || !(rt->fib6_flags & RTF_GATEWAY)) 534 return; 535 536 nh_gw = &rt->fib6_nh.nh_gw; 537 dev = rt->fib6_nh.nh_dev; 538 rcu_read_lock_bh(); 539 neigh = __ipv6_neigh_lookup_noref(dev, nh_gw); 540 if (neigh) { 541 struct inet6_dev *idev; 542 543 if (neigh->nud_state & NUD_VALID) 544 goto out; 545 546 idev = __in6_dev_get(dev); 547 work = NULL; 548 write_lock(&neigh->lock); 549 if (!(neigh->nud_state & NUD_VALID) && 550 time_after(jiffies, 551 neigh->updated + idev->cnf.rtr_probe_interval)) { 552 work = kmalloc(sizeof(*work), GFP_ATOMIC); 553 if (work) 554 __neigh_set_probe_once(neigh); 555 } 556 write_unlock(&neigh->lock); 557 } else { 558 work = kmalloc(sizeof(*work), GFP_ATOMIC); 559 } 560 561 if (work) { 562 INIT_WORK(&work->work, rt6_probe_deferred); 563 work->target = *nh_gw; 564 dev_hold(dev); 565 work->dev = dev; 566 schedule_work(&work->work); 567 } 568 569 out: 570 rcu_read_unlock_bh(); 571 } 572 #else 573 static inline void rt6_probe(struct fib6_info *rt) 574 { 575 } 576 #endif 577 578 /* 579 * Default Router Selection (RFC 2461 6.3.6) 580 */ 581 static inline int rt6_check_dev(struct fib6_info *rt, int oif) 582 { 583 const struct net_device *dev = rt->fib6_nh.nh_dev; 584 585 if (!oif || dev->ifindex == oif) 586 return 2; 587 return 0; 588 } 589 590 static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt) 591 { 592 enum rt6_nud_state ret = RT6_NUD_FAIL_HARD; 593 struct neighbour *neigh; 594 595 if (rt->fib6_flags & RTF_NONEXTHOP || 596 !(rt->fib6_flags & RTF_GATEWAY)) 597 return RT6_NUD_SUCCEED; 598 599 rcu_read_lock_bh(); 600 neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev, 601 &rt->fib6_nh.nh_gw); 602 if (neigh) { 603 read_lock(&neigh->lock); 604 if (neigh->nud_state & NUD_VALID) 605 ret = RT6_NUD_SUCCEED; 606 #ifdef CONFIG_IPV6_ROUTER_PREF 607 else if (!(neigh->nud_state & NUD_FAILED)) 608 ret = RT6_NUD_SUCCEED; 609 else 610 ret = RT6_NUD_FAIL_PROBE; 611 #endif 612 read_unlock(&neigh->lock); 613 } else { 614 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ? 
615 RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR; 616 } 617 rcu_read_unlock_bh(); 618 619 return ret; 620 } 621 622 static int rt6_score_route(struct fib6_info *rt, int oif, int strict) 623 { 624 int m; 625 626 m = rt6_check_dev(rt, oif); 627 if (!m && (strict & RT6_LOOKUP_F_IFACE)) 628 return RT6_NUD_FAIL_HARD; 629 #ifdef CONFIG_IPV6_ROUTER_PREF 630 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2; 631 #endif 632 if (strict & RT6_LOOKUP_F_REACHABLE) { 633 int n = rt6_check_neigh(rt); 634 if (n < 0) 635 return n; 636 } 637 return m; 638 } 639 640 /* called with rc_read_lock held */ 641 static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i) 642 { 643 const struct net_device *dev = fib6_info_nh_dev(f6i); 644 bool rc = false; 645 646 if (dev) { 647 const struct inet6_dev *idev = __in6_dev_get(dev); 648 649 rc = !!idev->cnf.ignore_routes_with_linkdown; 650 } 651 652 return rc; 653 } 654 655 static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict, 656 int *mpri, struct fib6_info *match, 657 bool *do_rr) 658 { 659 int m; 660 bool match_do_rr = false; 661 662 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) 663 goto out; 664 665 if (fib6_ignore_linkdown(rt) && 666 rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN && 667 !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE)) 668 goto out; 669 670 if (fib6_check_expired(rt)) 671 goto out; 672 673 m = rt6_score_route(rt, oif, strict); 674 if (m == RT6_NUD_FAIL_DO_RR) { 675 match_do_rr = true; 676 m = 0; /* lowest valid score */ 677 } else if (m == RT6_NUD_FAIL_HARD) { 678 goto out; 679 } 680 681 if (strict & RT6_LOOKUP_F_REACHABLE) 682 rt6_probe(rt); 683 684 /* note that m can be RT6_NUD_FAIL_PROBE at this point */ 685 if (m > *mpri) { 686 *do_rr = match_do_rr; 687 *mpri = m; 688 match = rt; 689 } 690 out: 691 return match; 692 } 693 694 static struct fib6_info *find_rr_leaf(struct fib6_node *fn, 695 struct fib6_info *leaf, 696 struct fib6_info *rr_head, 697 u32 metric, int oif, int strict, 698 bool *do_rr) 699 { 700 struct fib6_info *rt, *match, *cont; 701 int mpri = -1; 702 703 match = NULL; 704 cont = NULL; 705 for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) { 706 if (rt->fib6_metric != metric) { 707 cont = rt; 708 break; 709 } 710 711 match = find_match(rt, oif, strict, &mpri, match, do_rr); 712 } 713 714 for (rt = leaf; rt && rt != rr_head; 715 rt = rcu_dereference(rt->fib6_next)) { 716 if (rt->fib6_metric != metric) { 717 cont = rt; 718 break; 719 } 720 721 match = find_match(rt, oif, strict, &mpri, match, do_rr); 722 } 723 724 if (match || !cont) 725 return match; 726 727 for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next)) 728 match = find_match(rt, oif, strict, &mpri, match, do_rr); 729 730 return match; 731 } 732 733 static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn, 734 int oif, int strict) 735 { 736 struct fib6_info *leaf = rcu_dereference(fn->leaf); 737 struct fib6_info *match, *rt0; 738 bool do_rr = false; 739 int key_plen; 740 741 if (!leaf || leaf == net->ipv6.fib6_null_entry) 742 return net->ipv6.fib6_null_entry; 743 744 rt0 = rcu_dereference(fn->rr_ptr); 745 if (!rt0) 746 rt0 = leaf; 747 748 /* Double check to make sure fn is not an intermediate node 749 * and fn->leaf does not points to its child's leaf 750 * (This might happen if all routes under fn are deleted from 751 * the tree and fib6_repair_tree() is called on the node.) 
752 */ 753 key_plen = rt0->fib6_dst.plen; 754 #ifdef CONFIG_IPV6_SUBTREES 755 if (rt0->fib6_src.plen) 756 key_plen = rt0->fib6_src.plen; 757 #endif 758 if (fn->fn_bit != key_plen) 759 return net->ipv6.fib6_null_entry; 760 761 match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict, 762 &do_rr); 763 764 if (do_rr) { 765 struct fib6_info *next = rcu_dereference(rt0->fib6_next); 766 767 /* no entries matched; do round-robin */ 768 if (!next || next->fib6_metric != rt0->fib6_metric) 769 next = leaf; 770 771 if (next != rt0) { 772 spin_lock_bh(&leaf->fib6_table->tb6_lock); 773 /* make sure next is not being deleted from the tree */ 774 if (next->fib6_node) 775 rcu_assign_pointer(fn->rr_ptr, next); 776 spin_unlock_bh(&leaf->fib6_table->tb6_lock); 777 } 778 } 779 780 return match ? match : net->ipv6.fib6_null_entry; 781 } 782 783 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt) 784 { 785 return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY)); 786 } 787 788 #ifdef CONFIG_IPV6_ROUTE_INFO 789 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, 790 const struct in6_addr *gwaddr) 791 { 792 struct net *net = dev_net(dev); 793 struct route_info *rinfo = (struct route_info *) opt; 794 struct in6_addr prefix_buf, *prefix; 795 unsigned int pref; 796 unsigned long lifetime; 797 struct fib6_info *rt; 798 799 if (len < sizeof(struct route_info)) { 800 return -EINVAL; 801 } 802 803 /* Sanity check for prefix_len and length */ 804 if (rinfo->length > 3) { 805 return -EINVAL; 806 } else if (rinfo->prefix_len > 128) { 807 return -EINVAL; 808 } else if (rinfo->prefix_len > 64) { 809 if (rinfo->length < 2) { 810 return -EINVAL; 811 } 812 } else if (rinfo->prefix_len > 0) { 813 if (rinfo->length < 1) { 814 return -EINVAL; 815 } 816 } 817 818 pref = rinfo->route_pref; 819 if (pref == ICMPV6_ROUTER_PREF_INVALID) 820 return -EINVAL; 821 822 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ); 823 824 if (rinfo->length == 3) 825 prefix = (struct in6_addr *)rinfo->prefix; 826 else { 827 /* this function is safe */ 828 ipv6_addr_prefix(&prefix_buf, 829 (struct in6_addr *)rinfo->prefix, 830 rinfo->prefix_len); 831 prefix = &prefix_buf; 832 } 833 834 if (rinfo->prefix_len == 0) 835 rt = rt6_get_dflt_router(net, gwaddr, dev); 836 else 837 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, 838 gwaddr, dev); 839 840 if (rt && !lifetime) { 841 ip6_del_rt(net, rt); 842 rt = NULL; 843 } 844 845 if (!rt && lifetime) 846 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, 847 dev, pref); 848 else if (rt) 849 rt->fib6_flags = RTF_ROUTEINFO | 850 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); 851 852 if (rt) { 853 if (!addrconf_finite_timeout(lifetime)) 854 fib6_clean_expires(rt); 855 else 856 fib6_set_expires(rt, jiffies + HZ * lifetime); 857 858 fib6_info_release(rt); 859 } 860 return 0; 861 } 862 #endif 863 864 /* 865 * Misc support functions 866 */ 867 868 /* called with rcu_lock held */ 869 static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt) 870 { 871 struct net_device *dev = rt->fib6_nh.nh_dev; 872 873 if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) { 874 /* for copies of local routes, dst->dev needs to be the 875 * device if it is a master device, the master device if 876 * device is enslaved, and the loopback as the default 877 */ 878 if (netif_is_l3_slave(dev) && 879 !rt6_need_strict(&rt->fib6_dst.addr)) 880 dev = l3mdev_master_dev_rcu(dev); 881 else if (!netif_is_l3_master(dev)) 882 dev = dev_net(dev)->loopback_dev; 883 /* last case is 
netif_is_l3_master(dev) is true in which 884 * case we want dev returned to be dev 885 */ 886 } 887 888 return dev; 889 } 890 891 static const int fib6_prop[RTN_MAX + 1] = { 892 [RTN_UNSPEC] = 0, 893 [RTN_UNICAST] = 0, 894 [RTN_LOCAL] = 0, 895 [RTN_BROADCAST] = 0, 896 [RTN_ANYCAST] = 0, 897 [RTN_MULTICAST] = 0, 898 [RTN_BLACKHOLE] = -EINVAL, 899 [RTN_UNREACHABLE] = -EHOSTUNREACH, 900 [RTN_PROHIBIT] = -EACCES, 901 [RTN_THROW] = -EAGAIN, 902 [RTN_NAT] = -EINVAL, 903 [RTN_XRESOLVE] = -EINVAL, 904 }; 905 906 static int ip6_rt_type_to_error(u8 fib6_type) 907 { 908 return fib6_prop[fib6_type]; 909 } 910 911 static unsigned short fib6_info_dst_flags(struct fib6_info *rt) 912 { 913 unsigned short flags = 0; 914 915 if (rt->dst_nocount) 916 flags |= DST_NOCOUNT; 917 if (rt->dst_nopolicy) 918 flags |= DST_NOPOLICY; 919 if (rt->dst_host) 920 flags |= DST_HOST; 921 922 return flags; 923 } 924 925 static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort) 926 { 927 rt->dst.error = ip6_rt_type_to_error(ort->fib6_type); 928 929 switch (ort->fib6_type) { 930 case RTN_BLACKHOLE: 931 rt->dst.output = dst_discard_out; 932 rt->dst.input = dst_discard; 933 break; 934 case RTN_PROHIBIT: 935 rt->dst.output = ip6_pkt_prohibit_out; 936 rt->dst.input = ip6_pkt_prohibit; 937 break; 938 case RTN_THROW: 939 case RTN_UNREACHABLE: 940 default: 941 rt->dst.output = ip6_pkt_discard_out; 942 rt->dst.input = ip6_pkt_discard; 943 break; 944 } 945 } 946 947 static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort) 948 { 949 if (ort->fib6_flags & RTF_REJECT) { 950 ip6_rt_init_dst_reject(rt, ort); 951 return; 952 } 953 954 rt->dst.error = 0; 955 rt->dst.output = ip6_output; 956 957 if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) { 958 rt->dst.input = ip6_input; 959 } else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) { 960 rt->dst.input = ip6_mc_input; 961 } else { 962 rt->dst.input = ip6_forward; 963 } 964 965 if (ort->fib6_nh.nh_lwtstate) { 966 rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate); 967 lwtunnel_set_redirect(&rt->dst); 968 } 969 970 rt->dst.lastuse = jiffies; 971 } 972 973 /* Caller must already hold reference to @from */ 974 static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from) 975 { 976 rt->rt6i_flags &= ~RTF_EXPIRES; 977 rcu_assign_pointer(rt->from, from); 978 dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true); 979 } 980 981 /* Caller must already hold reference to @ort */ 982 static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort) 983 { 984 struct net_device *dev = fib6_info_nh_dev(ort); 985 986 ip6_rt_init_dst(rt, ort); 987 988 rt->rt6i_dst = ort->fib6_dst; 989 rt->rt6i_idev = dev ? 
in6_dev_get(dev) : NULL; 990 rt->rt6i_gateway = ort->fib6_nh.nh_gw; 991 rt->rt6i_flags = ort->fib6_flags; 992 rt6_set_from(rt, ort); 993 #ifdef CONFIG_IPV6_SUBTREES 994 rt->rt6i_src = ort->fib6_src; 995 #endif 996 } 997 998 static struct fib6_node* fib6_backtrack(struct fib6_node *fn, 999 struct in6_addr *saddr) 1000 { 1001 struct fib6_node *pn, *sn; 1002 while (1) { 1003 if (fn->fn_flags & RTN_TL_ROOT) 1004 return NULL; 1005 pn = rcu_dereference(fn->parent); 1006 sn = FIB6_SUBTREE(pn); 1007 if (sn && sn != fn) 1008 fn = fib6_node_lookup(sn, NULL, saddr); 1009 else 1010 fn = pn; 1011 if (fn->fn_flags & RTN_RTINFO) 1012 return fn; 1013 } 1014 } 1015 1016 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt, 1017 bool null_fallback) 1018 { 1019 struct rt6_info *rt = *prt; 1020 1021 if (dst_hold_safe(&rt->dst)) 1022 return true; 1023 if (null_fallback) { 1024 rt = net->ipv6.ip6_null_entry; 1025 dst_hold(&rt->dst); 1026 } else { 1027 rt = NULL; 1028 } 1029 *prt = rt; 1030 return false; 1031 } 1032 1033 /* called with rcu_lock held */ 1034 static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt) 1035 { 1036 unsigned short flags = fib6_info_dst_flags(rt); 1037 struct net_device *dev = rt->fib6_nh.nh_dev; 1038 struct rt6_info *nrt; 1039 1040 if (!fib6_info_hold_safe(rt)) 1041 return NULL; 1042 1043 nrt = ip6_dst_alloc(dev_net(dev), dev, flags); 1044 if (nrt) 1045 ip6_rt_copy_init(nrt, rt); 1046 else 1047 fib6_info_release(rt); 1048 1049 return nrt; 1050 } 1051 1052 static struct rt6_info *ip6_pol_route_lookup(struct net *net, 1053 struct fib6_table *table, 1054 struct flowi6 *fl6, 1055 const struct sk_buff *skb, 1056 int flags) 1057 { 1058 struct fib6_info *f6i; 1059 struct fib6_node *fn; 1060 struct rt6_info *rt; 1061 1062 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) 1063 flags &= ~RT6_LOOKUP_F_IFACE; 1064 1065 rcu_read_lock(); 1066 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 1067 restart: 1068 f6i = rcu_dereference(fn->leaf); 1069 if (!f6i) { 1070 f6i = net->ipv6.fib6_null_entry; 1071 } else { 1072 f6i = rt6_device_match(net, f6i, &fl6->saddr, 1073 fl6->flowi6_oif, flags); 1074 if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0) 1075 f6i = fib6_multipath_select(net, f6i, fl6, 1076 fl6->flowi6_oif, skb, 1077 flags); 1078 } 1079 if (f6i == net->ipv6.fib6_null_entry) { 1080 fn = fib6_backtrack(fn, &fl6->saddr); 1081 if (fn) 1082 goto restart; 1083 } 1084 1085 trace_fib6_table_lookup(net, f6i, table, fl6); 1086 1087 /* Search through exception table */ 1088 rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr); 1089 if (rt) { 1090 if (ip6_hold_safe(net, &rt, true)) 1091 dst_use_noref(&rt->dst, jiffies); 1092 } else if (f6i == net->ipv6.fib6_null_entry) { 1093 rt = net->ipv6.ip6_null_entry; 1094 dst_hold(&rt->dst); 1095 } else { 1096 rt = ip6_create_rt_rcu(f6i); 1097 if (!rt) { 1098 rt = net->ipv6.ip6_null_entry; 1099 dst_hold(&rt->dst); 1100 } 1101 } 1102 1103 rcu_read_unlock(); 1104 1105 return rt; 1106 } 1107 1108 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6, 1109 const struct sk_buff *skb, int flags) 1110 { 1111 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup); 1112 } 1113 EXPORT_SYMBOL_GPL(ip6_route_lookup); 1114 1115 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, 1116 const struct in6_addr *saddr, int oif, 1117 const struct sk_buff *skb, int strict) 1118 { 1119 struct flowi6 fl6 = { 1120 .flowi6_oif = oif, 1121 .daddr = *daddr, 1122 }; 1123 struct dst_entry *dst; 1124 int flags = 
strict ? RT6_LOOKUP_F_IFACE : 0; 1125 1126 if (saddr) { 1127 memcpy(&fl6.saddr, saddr, sizeof(*saddr)); 1128 flags |= RT6_LOOKUP_F_HAS_SADDR; 1129 } 1130 1131 dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup); 1132 if (dst->error == 0) 1133 return (struct rt6_info *) dst; 1134 1135 dst_release(dst); 1136 1137 return NULL; 1138 } 1139 EXPORT_SYMBOL(rt6_lookup); 1140 1141 /* ip6_ins_rt is called with FREE table->tb6_lock. 1142 * It takes new route entry, the addition fails by any reason the 1143 * route is released. 1144 * Caller must hold dst before calling it. 1145 */ 1146 1147 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info, 1148 struct netlink_ext_ack *extack) 1149 { 1150 int err; 1151 struct fib6_table *table; 1152 1153 table = rt->fib6_table; 1154 spin_lock_bh(&table->tb6_lock); 1155 err = fib6_add(&table->tb6_root, rt, info, extack); 1156 spin_unlock_bh(&table->tb6_lock); 1157 1158 return err; 1159 } 1160 1161 int ip6_ins_rt(struct net *net, struct fib6_info *rt) 1162 { 1163 struct nl_info info = { .nl_net = net, }; 1164 1165 return __ip6_ins_rt(rt, &info, NULL); 1166 } 1167 1168 static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort, 1169 const struct in6_addr *daddr, 1170 const struct in6_addr *saddr) 1171 { 1172 struct net_device *dev; 1173 struct rt6_info *rt; 1174 1175 /* 1176 * Clone the route. 1177 */ 1178 1179 if (!fib6_info_hold_safe(ort)) 1180 return NULL; 1181 1182 dev = ip6_rt_get_dev_rcu(ort); 1183 rt = ip6_dst_alloc(dev_net(dev), dev, 0); 1184 if (!rt) { 1185 fib6_info_release(ort); 1186 return NULL; 1187 } 1188 1189 ip6_rt_copy_init(rt, ort); 1190 rt->rt6i_flags |= RTF_CACHE; 1191 rt->dst.flags |= DST_HOST; 1192 rt->rt6i_dst.addr = *daddr; 1193 rt->rt6i_dst.plen = 128; 1194 1195 if (!rt6_is_gw_or_nonexthop(ort)) { 1196 if (ort->fib6_dst.plen != 128 && 1197 ipv6_addr_equal(&ort->fib6_dst.addr, daddr)) 1198 rt->rt6i_flags |= RTF_ANYCAST; 1199 #ifdef CONFIG_IPV6_SUBTREES 1200 if (rt->rt6i_src.plen && saddr) { 1201 rt->rt6i_src.addr = *saddr; 1202 rt->rt6i_src.plen = 128; 1203 } 1204 #endif 1205 } 1206 1207 return rt; 1208 } 1209 1210 static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt) 1211 { 1212 unsigned short flags = fib6_info_dst_flags(rt); 1213 struct net_device *dev; 1214 struct rt6_info *pcpu_rt; 1215 1216 if (!fib6_info_hold_safe(rt)) 1217 return NULL; 1218 1219 rcu_read_lock(); 1220 dev = ip6_rt_get_dev_rcu(rt); 1221 pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags); 1222 rcu_read_unlock(); 1223 if (!pcpu_rt) { 1224 fib6_info_release(rt); 1225 return NULL; 1226 } 1227 ip6_rt_copy_init(pcpu_rt, rt); 1228 pcpu_rt->rt6i_flags |= RTF_PCPU; 1229 return pcpu_rt; 1230 } 1231 1232 /* It should be called with rcu_read_lock() acquired */ 1233 static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt) 1234 { 1235 struct rt6_info *pcpu_rt, **p; 1236 1237 p = this_cpu_ptr(rt->rt6i_pcpu); 1238 pcpu_rt = *p; 1239 1240 if (pcpu_rt) 1241 ip6_hold_safe(NULL, &pcpu_rt, false); 1242 1243 return pcpu_rt; 1244 } 1245 1246 static struct rt6_info *rt6_make_pcpu_route(struct net *net, 1247 struct fib6_info *rt) 1248 { 1249 struct rt6_info *pcpu_rt, *prev, **p; 1250 1251 pcpu_rt = ip6_rt_pcpu_alloc(rt); 1252 if (!pcpu_rt) { 1253 dst_hold(&net->ipv6.ip6_null_entry->dst); 1254 return net->ipv6.ip6_null_entry; 1255 } 1256 1257 dst_hold(&pcpu_rt->dst); 1258 p = this_cpu_ptr(rt->rt6i_pcpu); 1259 prev = cmpxchg(p, NULL, pcpu_rt); 1260 BUG_ON(prev); 1261 1262 return pcpu_rt; 1263 } 1264 1265 /* exception hash table implementation 
1266 */ 1267 static DEFINE_SPINLOCK(rt6_exception_lock); 1268 1269 /* Remove rt6_ex from hash table and free the memory 1270 * Caller must hold rt6_exception_lock 1271 */ 1272 static void rt6_remove_exception(struct rt6_exception_bucket *bucket, 1273 struct rt6_exception *rt6_ex) 1274 { 1275 struct net *net; 1276 1277 if (!bucket || !rt6_ex) 1278 return; 1279 1280 net = dev_net(rt6_ex->rt6i->dst.dev); 1281 hlist_del_rcu(&rt6_ex->hlist); 1282 dst_release(&rt6_ex->rt6i->dst); 1283 kfree_rcu(rt6_ex, rcu); 1284 WARN_ON_ONCE(!bucket->depth); 1285 bucket->depth--; 1286 net->ipv6.rt6_stats->fib_rt_cache--; 1287 } 1288 1289 /* Remove oldest rt6_ex in bucket and free the memory 1290 * Caller must hold rt6_exception_lock 1291 */ 1292 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket) 1293 { 1294 struct rt6_exception *rt6_ex, *oldest = NULL; 1295 1296 if (!bucket) 1297 return; 1298 1299 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { 1300 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp)) 1301 oldest = rt6_ex; 1302 } 1303 rt6_remove_exception(bucket, oldest); 1304 } 1305 1306 static u32 rt6_exception_hash(const struct in6_addr *dst, 1307 const struct in6_addr *src) 1308 { 1309 static u32 seed __read_mostly; 1310 u32 val; 1311 1312 net_get_random_once(&seed, sizeof(seed)); 1313 val = jhash(dst, sizeof(*dst), seed); 1314 1315 #ifdef CONFIG_IPV6_SUBTREES 1316 if (src) 1317 val = jhash(src, sizeof(*src), val); 1318 #endif 1319 return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT); 1320 } 1321 1322 /* Helper function to find the cached rt in the hash table 1323 * and update bucket pointer to point to the bucket for this 1324 * (daddr, saddr) pair 1325 * Caller must hold rt6_exception_lock 1326 */ 1327 static struct rt6_exception * 1328 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket, 1329 const struct in6_addr *daddr, 1330 const struct in6_addr *saddr) 1331 { 1332 struct rt6_exception *rt6_ex; 1333 u32 hval; 1334 1335 if (!(*bucket) || !daddr) 1336 return NULL; 1337 1338 hval = rt6_exception_hash(daddr, saddr); 1339 *bucket += hval; 1340 1341 hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) { 1342 struct rt6_info *rt6 = rt6_ex->rt6i; 1343 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr); 1344 1345 #ifdef CONFIG_IPV6_SUBTREES 1346 if (matched && saddr) 1347 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr); 1348 #endif 1349 if (matched) 1350 return rt6_ex; 1351 } 1352 return NULL; 1353 } 1354 1355 /* Helper function to find the cached rt in the hash table 1356 * and update bucket pointer to point to the bucket for this 1357 * (daddr, saddr) pair 1358 * Caller must hold rcu_read_lock() 1359 */ 1360 static struct rt6_exception * 1361 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket, 1362 const struct in6_addr *daddr, 1363 const struct in6_addr *saddr) 1364 { 1365 struct rt6_exception *rt6_ex; 1366 u32 hval; 1367 1368 WARN_ON_ONCE(!rcu_read_lock_held()); 1369 1370 if (!(*bucket) || !daddr) 1371 return NULL; 1372 1373 hval = rt6_exception_hash(daddr, saddr); 1374 *bucket += hval; 1375 1376 hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) { 1377 struct rt6_info *rt6 = rt6_ex->rt6i; 1378 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr); 1379 1380 #ifdef CONFIG_IPV6_SUBTREES 1381 if (matched && saddr) 1382 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr); 1383 #endif 1384 if (matched) 1385 return rt6_ex; 1386 } 1387 return NULL; 1388 } 1389 1390 static unsigned int fib6_mtu(const struct fib6_info 
*rt) 1391 { 1392 unsigned int mtu; 1393 1394 if (rt->fib6_pmtu) { 1395 mtu = rt->fib6_pmtu; 1396 } else { 1397 struct net_device *dev = fib6_info_nh_dev(rt); 1398 struct inet6_dev *idev; 1399 1400 rcu_read_lock(); 1401 idev = __in6_dev_get(dev); 1402 mtu = idev->cnf.mtu6; 1403 rcu_read_unlock(); 1404 } 1405 1406 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 1407 1408 return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu); 1409 } 1410 1411 static int rt6_insert_exception(struct rt6_info *nrt, 1412 struct fib6_info *ort) 1413 { 1414 struct net *net = dev_net(nrt->dst.dev); 1415 struct rt6_exception_bucket *bucket; 1416 struct in6_addr *src_key = NULL; 1417 struct rt6_exception *rt6_ex; 1418 int err = 0; 1419 1420 spin_lock_bh(&rt6_exception_lock); 1421 1422 if (ort->exception_bucket_flushed) { 1423 err = -EINVAL; 1424 goto out; 1425 } 1426 1427 bucket = rcu_dereference_protected(ort->rt6i_exception_bucket, 1428 lockdep_is_held(&rt6_exception_lock)); 1429 if (!bucket) { 1430 bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket), 1431 GFP_ATOMIC); 1432 if (!bucket) { 1433 err = -ENOMEM; 1434 goto out; 1435 } 1436 rcu_assign_pointer(ort->rt6i_exception_bucket, bucket); 1437 } 1438 1439 #ifdef CONFIG_IPV6_SUBTREES 1440 /* rt6i_src.plen != 0 indicates ort is in subtree 1441 * and exception table is indexed by a hash of 1442 * both rt6i_dst and rt6i_src. 1443 * Otherwise, the exception table is indexed by 1444 * a hash of only rt6i_dst. 1445 */ 1446 if (ort->fib6_src.plen) 1447 src_key = &nrt->rt6i_src.addr; 1448 #endif 1449 /* rt6_mtu_change() might lower mtu on ort. 1450 * Only insert this exception route if its mtu 1451 * is less than ort's mtu value. 1452 */ 1453 if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) { 1454 err = -EINVAL; 1455 goto out; 1456 } 1457 1458 rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr, 1459 src_key); 1460 if (rt6_ex) 1461 rt6_remove_exception(bucket, rt6_ex); 1462 1463 rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC); 1464 if (!rt6_ex) { 1465 err = -ENOMEM; 1466 goto out; 1467 } 1468 rt6_ex->rt6i = nrt; 1469 rt6_ex->stamp = jiffies; 1470 hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain); 1471 bucket->depth++; 1472 net->ipv6.rt6_stats->fib_rt_cache++; 1473 1474 if (bucket->depth > FIB6_MAX_DEPTH) 1475 rt6_exception_remove_oldest(bucket); 1476 1477 out: 1478 spin_unlock_bh(&rt6_exception_lock); 1479 1480 /* Update fn->fn_sernum to invalidate all cached dst */ 1481 if (!err) { 1482 spin_lock_bh(&ort->fib6_table->tb6_lock); 1483 fib6_update_sernum(net, ort); 1484 spin_unlock_bh(&ort->fib6_table->tb6_lock); 1485 fib6_force_start_gc(net); 1486 } 1487 1488 return err; 1489 } 1490 1491 void rt6_flush_exceptions(struct fib6_info *rt) 1492 { 1493 struct rt6_exception_bucket *bucket; 1494 struct rt6_exception *rt6_ex; 1495 struct hlist_node *tmp; 1496 int i; 1497 1498 spin_lock_bh(&rt6_exception_lock); 1499 /* Prevent rt6_insert_exception() to recreate the bucket list */ 1500 rt->exception_bucket_flushed = 1; 1501 1502 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1503 lockdep_is_held(&rt6_exception_lock)); 1504 if (!bucket) 1505 goto out; 1506 1507 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { 1508 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) 1509 rt6_remove_exception(bucket, rt6_ex); 1510 WARN_ON_ONCE(bucket->depth); 1511 bucket++; 1512 } 1513 1514 out: 1515 spin_unlock_bh(&rt6_exception_lock); 1516 } 1517 1518 /* Find cached rt in the hash table inside passed in rt 1519 * Caller has to hold 
rcu_read_lock() 1520 */ 1521 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt, 1522 struct in6_addr *daddr, 1523 struct in6_addr *saddr) 1524 { 1525 struct rt6_exception_bucket *bucket; 1526 struct in6_addr *src_key = NULL; 1527 struct rt6_exception *rt6_ex; 1528 struct rt6_info *res = NULL; 1529 1530 bucket = rcu_dereference(rt->rt6i_exception_bucket); 1531 1532 #ifdef CONFIG_IPV6_SUBTREES 1533 /* rt6i_src.plen != 0 indicates rt is in subtree 1534 * and exception table is indexed by a hash of 1535 * both rt6i_dst and rt6i_src. 1536 * Otherwise, the exception table is indexed by 1537 * a hash of only rt6i_dst. 1538 */ 1539 if (rt->fib6_src.plen) 1540 src_key = saddr; 1541 #endif 1542 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); 1543 1544 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i)) 1545 res = rt6_ex->rt6i; 1546 1547 return res; 1548 } 1549 1550 /* Remove the passed in cached rt from the hash table that contains it */ 1551 static int rt6_remove_exception_rt(struct rt6_info *rt) 1552 { 1553 struct rt6_exception_bucket *bucket; 1554 struct in6_addr *src_key = NULL; 1555 struct rt6_exception *rt6_ex; 1556 struct fib6_info *from; 1557 int err; 1558 1559 from = rcu_dereference(rt->from); 1560 if (!from || 1561 !(rt->rt6i_flags & RTF_CACHE)) 1562 return -EINVAL; 1563 1564 if (!rcu_access_pointer(from->rt6i_exception_bucket)) 1565 return -ENOENT; 1566 1567 spin_lock_bh(&rt6_exception_lock); 1568 bucket = rcu_dereference_protected(from->rt6i_exception_bucket, 1569 lockdep_is_held(&rt6_exception_lock)); 1570 #ifdef CONFIG_IPV6_SUBTREES 1571 /* rt6i_src.plen != 0 indicates 'from' is in subtree 1572 * and exception table is indexed by a hash of 1573 * both rt6i_dst and rt6i_src. 1574 * Otherwise, the exception table is indexed by 1575 * a hash of only rt6i_dst. 1576 */ 1577 if (from->fib6_src.plen) 1578 src_key = &rt->rt6i_src.addr; 1579 #endif 1580 rt6_ex = __rt6_find_exception_spinlock(&bucket, 1581 &rt->rt6i_dst.addr, 1582 src_key); 1583 if (rt6_ex) { 1584 rt6_remove_exception(bucket, rt6_ex); 1585 err = 0; 1586 } else { 1587 err = -ENOENT; 1588 } 1589 1590 spin_unlock_bh(&rt6_exception_lock); 1591 return err; 1592 } 1593 1594 /* Find rt6_ex which contains the passed in rt cache and 1595 * refresh its stamp 1596 */ 1597 static void rt6_update_exception_stamp_rt(struct rt6_info *rt) 1598 { 1599 struct rt6_exception_bucket *bucket; 1600 struct fib6_info *from = rt->from; 1601 struct in6_addr *src_key = NULL; 1602 struct rt6_exception *rt6_ex; 1603 1604 if (!from || 1605 !(rt->rt6i_flags & RTF_CACHE)) 1606 return; 1607 1608 rcu_read_lock(); 1609 bucket = rcu_dereference(from->rt6i_exception_bucket); 1610 1611 #ifdef CONFIG_IPV6_SUBTREES 1612 /* rt6i_src.plen != 0 indicates 'from' is in subtree 1613 * and exception table is indexed by a hash of 1614 * both rt6i_dst and rt6i_src. 1615 * Otherwise, the exception table is indexed by 1616 * a hash of only rt6i_dst. 1617 */ 1618 if (from->fib6_src.plen) 1619 src_key = &rt->rt6i_src.addr; 1620 #endif 1621 rt6_ex = __rt6_find_exception_rcu(&bucket, 1622 &rt->rt6i_dst.addr, 1623 src_key); 1624 if (rt6_ex) 1625 rt6_ex->stamp = jiffies; 1626 1627 rcu_read_unlock(); 1628 } 1629 1630 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev, 1631 struct rt6_info *rt, int mtu) 1632 { 1633 /* If the new MTU is lower than the route PMTU, this new MTU will be the 1634 * lowest MTU in the path: always allow updating the route PMTU to 1635 * reflect PMTU decreases. 
1636 * 1637 * If the new MTU is higher, and the route PMTU is equal to the local 1638 * MTU, this means the old MTU is the lowest in the path, so allow 1639 * updating it: if other nodes now have lower MTUs, PMTU discovery will 1640 * handle this. 1641 */ 1642 1643 if (dst_mtu(&rt->dst) >= mtu) 1644 return true; 1645 1646 if (dst_mtu(&rt->dst) == idev->cnf.mtu6) 1647 return true; 1648 1649 return false; 1650 } 1651 1652 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev, 1653 struct fib6_info *rt, int mtu) 1654 { 1655 struct rt6_exception_bucket *bucket; 1656 struct rt6_exception *rt6_ex; 1657 int i; 1658 1659 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1660 lockdep_is_held(&rt6_exception_lock)); 1661 1662 if (!bucket) 1663 return; 1664 1665 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { 1666 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { 1667 struct rt6_info *entry = rt6_ex->rt6i; 1668 1669 /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected 1670 * route), the metrics of its rt->from have already 1671 * been updated. 1672 */ 1673 if (dst_metric_raw(&entry->dst, RTAX_MTU) && 1674 rt6_mtu_change_route_allowed(idev, entry, mtu)) 1675 dst_metric_set(&entry->dst, RTAX_MTU, mtu); 1676 } 1677 bucket++; 1678 } 1679 } 1680 1681 #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE) 1682 1683 static void rt6_exceptions_clean_tohost(struct fib6_info *rt, 1684 struct in6_addr *gateway) 1685 { 1686 struct rt6_exception_bucket *bucket; 1687 struct rt6_exception *rt6_ex; 1688 struct hlist_node *tmp; 1689 int i; 1690 1691 if (!rcu_access_pointer(rt->rt6i_exception_bucket)) 1692 return; 1693 1694 spin_lock_bh(&rt6_exception_lock); 1695 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1696 lockdep_is_held(&rt6_exception_lock)); 1697 1698 if (bucket) { 1699 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { 1700 hlist_for_each_entry_safe(rt6_ex, tmp, 1701 &bucket->chain, hlist) { 1702 struct rt6_info *entry = rt6_ex->rt6i; 1703 1704 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) == 1705 RTF_CACHE_GATEWAY && 1706 ipv6_addr_equal(gateway, 1707 &entry->rt6i_gateway)) { 1708 rt6_remove_exception(bucket, rt6_ex); 1709 } 1710 } 1711 bucket++; 1712 } 1713 } 1714 1715 spin_unlock_bh(&rt6_exception_lock); 1716 } 1717 1718 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket, 1719 struct rt6_exception *rt6_ex, 1720 struct fib6_gc_args *gc_args, 1721 unsigned long now) 1722 { 1723 struct rt6_info *rt = rt6_ex->rt6i; 1724 1725 /* we are pruning and obsoleting aged-out and non gateway exceptions 1726 * even if others have still references to them, so that on next 1727 * dst_check() such references can be dropped. 1728 * EXPIRES exceptions - e.g. 
pmtu-generated ones are pruned when 1729 * expired, independently from their aging, as per RFC 8201 section 4 1730 */ 1731 if (!(rt->rt6i_flags & RTF_EXPIRES)) { 1732 if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) { 1733 RT6_TRACE("aging clone %p\n", rt); 1734 rt6_remove_exception(bucket, rt6_ex); 1735 return; 1736 } 1737 } else if (time_after(jiffies, rt->dst.expires)) { 1738 RT6_TRACE("purging expired route %p\n", rt); 1739 rt6_remove_exception(bucket, rt6_ex); 1740 return; 1741 } 1742 1743 if (rt->rt6i_flags & RTF_GATEWAY) { 1744 struct neighbour *neigh; 1745 __u8 neigh_flags = 0; 1746 1747 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway); 1748 if (neigh) 1749 neigh_flags = neigh->flags; 1750 1751 if (!(neigh_flags & NTF_ROUTER)) { 1752 RT6_TRACE("purging route %p via non-router but gateway\n", 1753 rt); 1754 rt6_remove_exception(bucket, rt6_ex); 1755 return; 1756 } 1757 } 1758 1759 gc_args->more++; 1760 } 1761 1762 void rt6_age_exceptions(struct fib6_info *rt, 1763 struct fib6_gc_args *gc_args, 1764 unsigned long now) 1765 { 1766 struct rt6_exception_bucket *bucket; 1767 struct rt6_exception *rt6_ex; 1768 struct hlist_node *tmp; 1769 int i; 1770 1771 if (!rcu_access_pointer(rt->rt6i_exception_bucket)) 1772 return; 1773 1774 rcu_read_lock_bh(); 1775 spin_lock(&rt6_exception_lock); 1776 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1777 lockdep_is_held(&rt6_exception_lock)); 1778 1779 if (bucket) { 1780 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { 1781 hlist_for_each_entry_safe(rt6_ex, tmp, 1782 &bucket->chain, hlist) { 1783 rt6_age_examine_exception(bucket, rt6_ex, 1784 gc_args, now); 1785 } 1786 bucket++; 1787 } 1788 } 1789 spin_unlock(&rt6_exception_lock); 1790 rcu_read_unlock_bh(); 1791 } 1792 1793 /* must be called with rcu lock held */ 1794 struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table, 1795 int oif, struct flowi6 *fl6, int strict) 1796 { 1797 struct fib6_node *fn, *saved_fn; 1798 struct fib6_info *f6i; 1799 1800 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 1801 saved_fn = fn; 1802 1803 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) 1804 oif = 0; 1805 1806 redo_rt6_select: 1807 f6i = rt6_select(net, fn, oif, strict); 1808 if (f6i == net->ipv6.fib6_null_entry) { 1809 fn = fib6_backtrack(fn, &fl6->saddr); 1810 if (fn) 1811 goto redo_rt6_select; 1812 else if (strict & RT6_LOOKUP_F_REACHABLE) { 1813 /* also consider unreachable route */ 1814 strict &= ~RT6_LOOKUP_F_REACHABLE; 1815 fn = saved_fn; 1816 goto redo_rt6_select; 1817 } 1818 } 1819 1820 trace_fib6_table_lookup(net, f6i, table, fl6); 1821 1822 return f6i; 1823 } 1824 1825 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, 1826 int oif, struct flowi6 *fl6, 1827 const struct sk_buff *skb, int flags) 1828 { 1829 struct fib6_info *f6i; 1830 struct rt6_info *rt; 1831 int strict = 0; 1832 1833 strict |= flags & RT6_LOOKUP_F_IFACE; 1834 strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE; 1835 if (net->ipv6.devconf_all->forwarding == 0) 1836 strict |= RT6_LOOKUP_F_REACHABLE; 1837 1838 rcu_read_lock(); 1839 1840 f6i = fib6_table_lookup(net, table, oif, fl6, strict); 1841 if (f6i->fib6_nsiblings) 1842 f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict); 1843 1844 if (f6i == net->ipv6.fib6_null_entry) { 1845 rt = net->ipv6.ip6_null_entry; 1846 rcu_read_unlock(); 1847 dst_hold(&rt->dst); 1848 return rt; 1849 } 1850 1851 /*Search through exception table */ 1852 rt = rt6_find_cached_rt(f6i, &fl6->daddr, 
&fl6->saddr); 1853 if (rt) { 1854 if (ip6_hold_safe(net, &rt, true)) 1855 dst_use_noref(&rt->dst, jiffies); 1856 1857 rcu_read_unlock(); 1858 return rt; 1859 } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && 1860 !(f6i->fib6_flags & RTF_GATEWAY))) { 1861 /* Create a RTF_CACHE clone which will not be 1862 * owned by the fib6 tree. It is for the special case where 1863 * the daddr in the skb during the neighbor look-up is different 1864 * from the fl6->daddr used to look-up route here. 1865 */ 1866 struct rt6_info *uncached_rt; 1867 1868 uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL); 1869 1870 rcu_read_unlock(); 1871 1872 if (uncached_rt) { 1873 /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc() 1874 * No need for another dst_hold() 1875 */ 1876 rt6_uncached_list_add(uncached_rt); 1877 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); 1878 } else { 1879 uncached_rt = net->ipv6.ip6_null_entry; 1880 dst_hold(&uncached_rt->dst); 1881 } 1882 1883 return uncached_rt; 1884 } else { 1885 /* Get a percpu copy */ 1886 1887 struct rt6_info *pcpu_rt; 1888 1889 local_bh_disable(); 1890 pcpu_rt = rt6_get_pcpu_route(f6i); 1891 1892 if (!pcpu_rt) 1893 pcpu_rt = rt6_make_pcpu_route(net, f6i); 1894 1895 local_bh_enable(); 1896 rcu_read_unlock(); 1897 1898 return pcpu_rt; 1899 } 1900 } 1901 EXPORT_SYMBOL_GPL(ip6_pol_route); 1902 1903 static struct rt6_info *ip6_pol_route_input(struct net *net, 1904 struct fib6_table *table, 1905 struct flowi6 *fl6, 1906 const struct sk_buff *skb, 1907 int flags) 1908 { 1909 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags); 1910 } 1911 1912 struct dst_entry *ip6_route_input_lookup(struct net *net, 1913 struct net_device *dev, 1914 struct flowi6 *fl6, 1915 const struct sk_buff *skb, 1916 int flags) 1917 { 1918 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG) 1919 flags |= RT6_LOOKUP_F_IFACE; 1920 1921 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input); 1922 } 1923 EXPORT_SYMBOL_GPL(ip6_route_input_lookup); 1924 1925 static void ip6_multipath_l3_keys(const struct sk_buff *skb, 1926 struct flow_keys *keys, 1927 struct flow_keys *flkeys) 1928 { 1929 const struct ipv6hdr *outer_iph = ipv6_hdr(skb); 1930 const struct ipv6hdr *key_iph = outer_iph; 1931 struct flow_keys *_flkeys = flkeys; 1932 const struct ipv6hdr *inner_iph; 1933 const struct icmp6hdr *icmph; 1934 struct ipv6hdr _inner_iph; 1935 struct icmp6hdr _icmph; 1936 1937 if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6)) 1938 goto out; 1939 1940 icmph = skb_header_pointer(skb, skb_transport_offset(skb), 1941 sizeof(_icmph), &_icmph); 1942 if (!icmph) 1943 goto out; 1944 1945 if (icmph->icmp6_type != ICMPV6_DEST_UNREACH && 1946 icmph->icmp6_type != ICMPV6_PKT_TOOBIG && 1947 icmph->icmp6_type != ICMPV6_TIME_EXCEED && 1948 icmph->icmp6_type != ICMPV6_PARAMPROB) 1949 goto out; 1950 1951 inner_iph = skb_header_pointer(skb, 1952 skb_transport_offset(skb) + sizeof(*icmph), 1953 sizeof(_inner_iph), &_inner_iph); 1954 if (!inner_iph) 1955 goto out; 1956 1957 key_iph = inner_iph; 1958 _flkeys = NULL; 1959 out: 1960 if (_flkeys) { 1961 keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src; 1962 keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst; 1963 keys->tags.flow_label = _flkeys->tags.flow_label; 1964 keys->basic.ip_proto = _flkeys->basic.ip_proto; 1965 } else { 1966 keys->addrs.v6addrs.src = key_iph->saddr; 1967 keys->addrs.v6addrs.dst = key_iph->daddr; 1968 keys->tags.flow_label = ip6_flowlabel(key_iph); 1969 keys->basic.ip_proto = 
key_iph->nexthdr; 1970 } 1971 } 1972 1973 /* if skb is set it will be used and fl6 can be NULL */ 1974 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, 1975 const struct sk_buff *skb, struct flow_keys *flkeys) 1976 { 1977 struct flow_keys hash_keys; 1978 u32 mhash; 1979 1980 switch (ip6_multipath_hash_policy(net)) { 1981 case 0: 1982 memset(&hash_keys, 0, sizeof(hash_keys)); 1983 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 1984 if (skb) { 1985 ip6_multipath_l3_keys(skb, &hash_keys, flkeys); 1986 } else { 1987 hash_keys.addrs.v6addrs.src = fl6->saddr; 1988 hash_keys.addrs.v6addrs.dst = fl6->daddr; 1989 hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); 1990 hash_keys.basic.ip_proto = fl6->flowi6_proto; 1991 } 1992 break; 1993 case 1: 1994 if (skb) { 1995 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP; 1996 struct flow_keys keys; 1997 1998 /* short-circuit if we already have L4 hash present */ 1999 if (skb->l4_hash) 2000 return skb_get_hash_raw(skb) >> 1; 2001 2002 memset(&hash_keys, 0, sizeof(hash_keys)); 2003 2004 if (!flkeys) { 2005 skb_flow_dissect_flow_keys(skb, &keys, flag); 2006 flkeys = &keys; 2007 } 2008 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2009 hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src; 2010 hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst; 2011 hash_keys.ports.src = flkeys->ports.src; 2012 hash_keys.ports.dst = flkeys->ports.dst; 2013 hash_keys.basic.ip_proto = flkeys->basic.ip_proto; 2014 } else { 2015 memset(&hash_keys, 0, sizeof(hash_keys)); 2016 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2017 hash_keys.addrs.v6addrs.src = fl6->saddr; 2018 hash_keys.addrs.v6addrs.dst = fl6->daddr; 2019 hash_keys.ports.src = fl6->fl6_sport; 2020 hash_keys.ports.dst = fl6->fl6_dport; 2021 hash_keys.basic.ip_proto = fl6->flowi6_proto; 2022 } 2023 break; 2024 } 2025 mhash = flow_hash_from_keys(&hash_keys); 2026 2027 return mhash >> 1; 2028 } 2029 2030 void ip6_route_input(struct sk_buff *skb) 2031 { 2032 const struct ipv6hdr *iph = ipv6_hdr(skb); 2033 struct net *net = dev_net(skb->dev); 2034 int flags = RT6_LOOKUP_F_HAS_SADDR; 2035 struct ip_tunnel_info *tun_info; 2036 struct flowi6 fl6 = { 2037 .flowi6_iif = skb->dev->ifindex, 2038 .daddr = iph->daddr, 2039 .saddr = iph->saddr, 2040 .flowlabel = ip6_flowinfo(iph), 2041 .flowi6_mark = skb->mark, 2042 .flowi6_proto = iph->nexthdr, 2043 }; 2044 struct flow_keys *flkeys = NULL, _flkeys; 2045 2046 tun_info = skb_tunnel_info(skb); 2047 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX)) 2048 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id; 2049 2050 if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys)) 2051 flkeys = &_flkeys; 2052 2053 if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6)) 2054 fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys); 2055 skb_dst_drop(skb); 2056 skb_dst_set(skb, 2057 ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags)); 2058 } 2059 2060 static struct rt6_info *ip6_pol_route_output(struct net *net, 2061 struct fib6_table *table, 2062 struct flowi6 *fl6, 2063 const struct sk_buff *skb, 2064 int flags) 2065 { 2066 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags); 2067 } 2068 2069 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, 2070 struct flowi6 *fl6, int flags) 2071 { 2072 bool any_src; 2073 2074 if (ipv6_addr_type(&fl6->daddr) & 2075 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) { 2076 struct dst_entry *dst; 2077 2078 dst = 
l3mdev_link_scope_lookup(net, fl6); 2079 if (dst) 2080 return dst; 2081 } 2082 2083 fl6->flowi6_iif = LOOPBACK_IFINDEX; 2084 2085 any_src = ipv6_addr_any(&fl6->saddr); 2086 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) || 2087 (fl6->flowi6_oif && any_src)) 2088 flags |= RT6_LOOKUP_F_IFACE; 2089 2090 if (!any_src) 2091 flags |= RT6_LOOKUP_F_HAS_SADDR; 2092 else if (sk) 2093 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs); 2094 2095 return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output); 2096 } 2097 EXPORT_SYMBOL_GPL(ip6_route_output_flags); 2098 2099 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) 2100 { 2101 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig; 2102 struct net_device *loopback_dev = net->loopback_dev; 2103 struct dst_entry *new = NULL; 2104 2105 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1, 2106 DST_OBSOLETE_DEAD, 0); 2107 if (rt) { 2108 rt6_info_init(rt); 2109 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); 2110 2111 new = &rt->dst; 2112 new->__use = 1; 2113 new->input = dst_discard; 2114 new->output = dst_discard_out; 2115 2116 dst_copy_metrics(new, &ort->dst); 2117 2118 rt->rt6i_idev = in6_dev_get(loopback_dev); 2119 rt->rt6i_gateway = ort->rt6i_gateway; 2120 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU; 2121 2122 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); 2123 #ifdef CONFIG_IPV6_SUBTREES 2124 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); 2125 #endif 2126 } 2127 2128 dst_release(dst_orig); 2129 return new ? new : ERR_PTR(-ENOMEM); 2130 } 2131 2132 /* 2133 * Destination cache support functions 2134 */ 2135 2136 static bool fib6_check(struct fib6_info *f6i, u32 cookie) 2137 { 2138 u32 rt_cookie = 0; 2139 2140 if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie) 2141 return false; 2142 2143 if (fib6_check_expired(f6i)) 2144 return false; 2145 2146 return true; 2147 } 2148 2149 static struct dst_entry *rt6_check(struct rt6_info *rt, 2150 struct fib6_info *from, 2151 u32 cookie) 2152 { 2153 u32 rt_cookie = 0; 2154 2155 if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) || 2156 rt_cookie != cookie) 2157 return NULL; 2158 2159 if (rt6_check_expired(rt)) 2160 return NULL; 2161 2162 return &rt->dst; 2163 } 2164 2165 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, 2166 struct fib6_info *from, 2167 u32 cookie) 2168 { 2169 if (!__rt6_check_expired(rt) && 2170 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && 2171 fib6_check(from, cookie)) 2172 return &rt->dst; 2173 else 2174 return NULL; 2175 } 2176 2177 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) 2178 { 2179 struct dst_entry *dst_ret; 2180 struct fib6_info *from; 2181 struct rt6_info *rt; 2182 2183 rt = container_of(dst, struct rt6_info, dst); 2184 2185 rcu_read_lock(); 2186 2187 /* All IPV6 dsts are created with ->obsolete set to the value 2188 * DST_OBSOLETE_FORCE_CHK which forces validation calls down 2189 * into this function always. 
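 *
 * The caller side is the mirror image (a minimal sketch, close to
 * what ip6_sk_update_pmtu() below does with the socket's
 * dst_cookie); a cached dst must be revalidated with dst_check()
 * and dropped for a fresh lookup when the check fails:
 *
 *	dst = __sk_dst_get(sk);
 *	if (dst && !dst_check(dst, inet6_sk(sk)->dst_cookie))
 *		dst = NULL;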
2190 */ 2191 2192 from = rcu_dereference(rt->from); 2193 2194 if (from && (rt->rt6i_flags & RTF_PCPU || 2195 unlikely(!list_empty(&rt->rt6i_uncached)))) 2196 dst_ret = rt6_dst_from_check(rt, from, cookie); 2197 else 2198 dst_ret = rt6_check(rt, from, cookie); 2199 2200 rcu_read_unlock(); 2201 2202 return dst_ret; 2203 } 2204 2205 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) 2206 { 2207 struct rt6_info *rt = (struct rt6_info *) dst; 2208 2209 if (rt) { 2210 if (rt->rt6i_flags & RTF_CACHE) { 2211 rcu_read_lock(); 2212 if (rt6_check_expired(rt)) { 2213 rt6_remove_exception_rt(rt); 2214 dst = NULL; 2215 } 2216 rcu_read_unlock(); 2217 } else { 2218 dst_release(dst); 2219 dst = NULL; 2220 } 2221 } 2222 return dst; 2223 } 2224 2225 static void ip6_link_failure(struct sk_buff *skb) 2226 { 2227 struct rt6_info *rt; 2228 2229 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); 2230 2231 rt = (struct rt6_info *) skb_dst(skb); 2232 if (rt) { 2233 rcu_read_lock(); 2234 if (rt->rt6i_flags & RTF_CACHE) { 2235 if (dst_hold_safe(&rt->dst)) 2236 rt6_remove_exception_rt(rt); 2237 } else { 2238 struct fib6_info *from; 2239 struct fib6_node *fn; 2240 2241 from = rcu_dereference(rt->from); 2242 if (from) { 2243 fn = rcu_dereference(from->fib6_node); 2244 if (fn && (rt->rt6i_flags & RTF_DEFAULT)) 2245 fn->fn_sernum = -1; 2246 } 2247 } 2248 rcu_read_unlock(); 2249 } 2250 } 2251 2252 static void rt6_update_expires(struct rt6_info *rt0, int timeout) 2253 { 2254 if (!(rt0->rt6i_flags & RTF_EXPIRES)) { 2255 struct fib6_info *from; 2256 2257 rcu_read_lock(); 2258 from = rcu_dereference(rt0->from); 2259 if (from) 2260 rt0->dst.expires = from->expires; 2261 rcu_read_unlock(); 2262 } 2263 2264 dst_set_expires(&rt0->dst, timeout); 2265 rt0->rt6i_flags |= RTF_EXPIRES; 2266 } 2267 2268 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) 2269 { 2270 struct net *net = dev_net(rt->dst.dev); 2271 2272 dst_metric_set(&rt->dst, RTAX_MTU, mtu); 2273 rt->rt6i_flags |= RTF_MODIFIED; 2274 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires); 2275 } 2276 2277 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) 2278 { 2279 bool from_set; 2280 2281 rcu_read_lock(); 2282 from_set = !!rcu_dereference(rt->from); 2283 rcu_read_unlock(); 2284 2285 return !(rt->rt6i_flags & RTF_CACHE) && 2286 (rt->rt6i_flags & RTF_PCPU || from_set); 2287 } 2288 2289 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, 2290 const struct ipv6hdr *iph, u32 mtu) 2291 { 2292 const struct in6_addr *daddr, *saddr; 2293 struct rt6_info *rt6 = (struct rt6_info *)dst; 2294 2295 if (dst_metric_locked(dst, RTAX_MTU)) 2296 return; 2297 2298 if (iph) { 2299 daddr = &iph->daddr; 2300 saddr = &iph->saddr; 2301 } else if (sk) { 2302 daddr = &sk->sk_v6_daddr; 2303 saddr = &inet6_sk(sk)->saddr; 2304 } else { 2305 daddr = NULL; 2306 saddr = NULL; 2307 } 2308 dst_confirm_neigh(dst, daddr); 2309 mtu = max_t(u32, mtu, IPV6_MIN_MTU); 2310 if (mtu >= dst_mtu(dst)) 2311 return; 2312 2313 if (!rt6_cache_allowed_for_pmtu(rt6)) { 2314 rt6_do_update_pmtu(rt6, mtu); 2315 /* update rt6_ex->stamp for cache */ 2316 if (rt6->rt6i_flags & RTF_CACHE) 2317 rt6_update_exception_stamp_rt(rt6); 2318 } else if (daddr) { 2319 struct fib6_info *from; 2320 struct rt6_info *nrt6; 2321 2322 rcu_read_lock(); 2323 from = rcu_dereference(rt6->from); 2324 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr); 2325 if (nrt6) { 2326 rt6_do_update_pmtu(nrt6, mtu); 2327 if (rt6_insert_exception(nrt6, from)) 2328 
dst_release_immediate(&nrt6->dst); 2329 } 2330 rcu_read_unlock(); 2331 } 2332 } 2333 2334 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, 2335 struct sk_buff *skb, u32 mtu) 2336 { 2337 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu); 2338 } 2339 2340 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, 2341 int oif, u32 mark, kuid_t uid) 2342 { 2343 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2344 struct dst_entry *dst; 2345 struct flowi6 fl6; 2346 2347 memset(&fl6, 0, sizeof(fl6)); 2348 fl6.flowi6_oif = oif; 2349 fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark); 2350 fl6.daddr = iph->daddr; 2351 fl6.saddr = iph->saddr; 2352 fl6.flowlabel = ip6_flowinfo(iph); 2353 fl6.flowi6_uid = uid; 2354 2355 dst = ip6_route_output(net, NULL, &fl6); 2356 if (!dst->error) 2357 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu)); 2358 dst_release(dst); 2359 } 2360 EXPORT_SYMBOL_GPL(ip6_update_pmtu); 2361 2362 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) 2363 { 2364 struct dst_entry *dst; 2365 2366 ip6_update_pmtu(skb, sock_net(sk), mtu, 2367 sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid); 2368 2369 dst = __sk_dst_get(sk); 2370 if (!dst || !dst->obsolete || 2371 dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) 2372 return; 2373 2374 bh_lock_sock(sk); 2375 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) 2376 ip6_datagram_dst_update(sk, false); 2377 bh_unlock_sock(sk); 2378 } 2379 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu); 2380 2381 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, 2382 const struct flowi6 *fl6) 2383 { 2384 #ifdef CONFIG_IPV6_SUBTREES 2385 struct ipv6_pinfo *np = inet6_sk(sk); 2386 #endif 2387 2388 ip6_dst_store(sk, dst, 2389 ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ? 2390 &sk->sk_v6_daddr : NULL, 2391 #ifdef CONFIG_IPV6_SUBTREES 2392 ipv6_addr_equal(&fl6->saddr, &np->saddr) ? 2393 &np->saddr : 2394 #endif 2395 NULL); 2396 } 2397 2398 /* Handle redirects */ 2399 struct ip6rd_flowi { 2400 struct flowi6 fl6; 2401 struct in6_addr gateway; 2402 }; 2403 2404 static struct rt6_info *__ip6_route_redirect(struct net *net, 2405 struct fib6_table *table, 2406 struct flowi6 *fl6, 2407 const struct sk_buff *skb, 2408 int flags) 2409 { 2410 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; 2411 struct rt6_info *ret = NULL, *rt_cache; 2412 struct fib6_info *rt; 2413 struct fib6_node *fn; 2414 2415 /* Get the "current" route for this destination and 2416 * check if the redirect has come from appropriate router. 2417 * 2418 * RFC 4861 specifies that redirects should only be 2419 * accepted if they come from the nexthop to the target. 2420 * Due to the way the routes are chosen, this notion 2421 * is a bit fuzzy and one might need to check all possible 2422 * routes. 2423 */ 2424 2425 rcu_read_lock(); 2426 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 2427 restart: 2428 for_each_fib6_node_rt_rcu(fn) { 2429 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) 2430 continue; 2431 if (fib6_check_expired(rt)) 2432 continue; 2433 if (rt->fib6_flags & RTF_REJECT) 2434 break; 2435 if (!(rt->fib6_flags & RTF_GATEWAY)) 2436 continue; 2437 if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex) 2438 continue; 2439 /* rt_cache's gateway might be different from its 'parent' 2440 * in the case of an ip redirect. 2441 * So we keep searching in the exception table if the gateway 2442 * is different. 
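 *
 * Illustration (addresses made up): the parent route may point at
 * gateway fe80::1 while an earlier redirect installed an exception
 * entry via fe80::2. A follow-up redirect for that destination
 * then arrives from fe80::2, so matching rdfl->gateway against
 * the parent's nh_gw alone would wrongly skip this route.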
2443 */ 2444 if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) { 2445 rt_cache = rt6_find_cached_rt(rt, 2446 &fl6->daddr, 2447 &fl6->saddr); 2448 if (rt_cache && 2449 ipv6_addr_equal(&rdfl->gateway, 2450 &rt_cache->rt6i_gateway)) { 2451 ret = rt_cache; 2452 break; 2453 } 2454 continue; 2455 } 2456 break; 2457 } 2458 2459 if (!rt) 2460 rt = net->ipv6.fib6_null_entry; 2461 else if (rt->fib6_flags & RTF_REJECT) { 2462 ret = net->ipv6.ip6_null_entry; 2463 goto out; 2464 } 2465 2466 if (rt == net->ipv6.fib6_null_entry) { 2467 fn = fib6_backtrack(fn, &fl6->saddr); 2468 if (fn) 2469 goto restart; 2470 } 2471 2472 out: 2473 if (ret) 2474 ip6_hold_safe(net, &ret, true); 2475 else 2476 ret = ip6_create_rt_rcu(rt); 2477 2478 rcu_read_unlock(); 2479 2480 trace_fib6_table_lookup(net, rt, table, fl6); 2481 return ret; 2482 }; 2483 2484 static struct dst_entry *ip6_route_redirect(struct net *net, 2485 const struct flowi6 *fl6, 2486 const struct sk_buff *skb, 2487 const struct in6_addr *gateway) 2488 { 2489 int flags = RT6_LOOKUP_F_HAS_SADDR; 2490 struct ip6rd_flowi rdfl; 2491 2492 rdfl.fl6 = *fl6; 2493 rdfl.gateway = *gateway; 2494 2495 return fib6_rule_lookup(net, &rdfl.fl6, skb, 2496 flags, __ip6_route_redirect); 2497 } 2498 2499 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, 2500 kuid_t uid) 2501 { 2502 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2503 struct dst_entry *dst; 2504 struct flowi6 fl6; 2505 2506 memset(&fl6, 0, sizeof(fl6)); 2507 fl6.flowi6_iif = LOOPBACK_IFINDEX; 2508 fl6.flowi6_oif = oif; 2509 fl6.flowi6_mark = mark; 2510 fl6.daddr = iph->daddr; 2511 fl6.saddr = iph->saddr; 2512 fl6.flowlabel = ip6_flowinfo(iph); 2513 fl6.flowi6_uid = uid; 2514 2515 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr); 2516 rt6_do_redirect(dst, NULL, skb); 2517 dst_release(dst); 2518 } 2519 EXPORT_SYMBOL_GPL(ip6_redirect); 2520 2521 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif, 2522 u32 mark) 2523 { 2524 const struct ipv6hdr *iph = ipv6_hdr(skb); 2525 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb); 2526 struct dst_entry *dst; 2527 struct flowi6 fl6; 2528 2529 memset(&fl6, 0, sizeof(fl6)); 2530 fl6.flowi6_iif = LOOPBACK_IFINDEX; 2531 fl6.flowi6_oif = oif; 2532 fl6.flowi6_mark = mark; 2533 fl6.daddr = msg->dest; 2534 fl6.saddr = iph->daddr; 2535 fl6.flowi6_uid = sock_net_uid(net, NULL); 2536 2537 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr); 2538 rt6_do_redirect(dst, NULL, skb); 2539 dst_release(dst); 2540 } 2541 2542 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) 2543 { 2544 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark, 2545 sk->sk_uid); 2546 } 2547 EXPORT_SYMBOL_GPL(ip6_sk_redirect); 2548 2549 static unsigned int ip6_default_advmss(const struct dst_entry *dst) 2550 { 2551 struct net_device *dev = dst->dev; 2552 unsigned int mtu = dst_mtu(dst); 2553 struct net *net = dev_net(dev); 2554 2555 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); 2556 2557 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) 2558 mtu = net->ipv6.sysctl.ip6_rt_min_advmss; 2559 2560 /* 2561 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 2562 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 
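 * (For a common 1500-byte link MTU the subtraction above yields an
 * advertised MSS of 1500 - 40 - 20 = 1440 bytes, assuming no
 * extension headers or TCP options.)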
2563 * IPV6_MAXPLEN is also valid and means: "any MSS, 2564 * rely only on pmtu discovery" 2565 */ 2566 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr)) 2567 mtu = IPV6_MAXPLEN; 2568 return mtu; 2569 } 2570 2571 static unsigned int ip6_mtu(const struct dst_entry *dst) 2572 { 2573 struct inet6_dev *idev; 2574 unsigned int mtu; 2575 2576 mtu = dst_metric_raw(dst, RTAX_MTU); 2577 if (mtu) 2578 goto out; 2579 2580 mtu = IPV6_MIN_MTU; 2581 2582 rcu_read_lock(); 2583 idev = __in6_dev_get(dst->dev); 2584 if (idev) 2585 mtu = idev->cnf.mtu6; 2586 rcu_read_unlock(); 2587 2588 out: 2589 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 2590 2591 return mtu - lwtunnel_headroom(dst->lwtstate, mtu); 2592 } 2593 2594 /* MTU selection: 2595 * 1. mtu on route is locked - use it 2596 * 2. mtu from nexthop exception 2597 * 3. mtu from egress device 2598 * 2599 * based on ip6_dst_mtu_forward and exception logic of 2600 * rt6_find_cached_rt; called with rcu_read_lock 2601 */ 2602 u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr, 2603 struct in6_addr *saddr) 2604 { 2605 struct rt6_exception_bucket *bucket; 2606 struct rt6_exception *rt6_ex; 2607 struct in6_addr *src_key; 2608 struct inet6_dev *idev; 2609 u32 mtu = 0; 2610 2611 if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) { 2612 mtu = f6i->fib6_pmtu; 2613 if (mtu) 2614 goto out; 2615 } 2616 2617 src_key = NULL; 2618 #ifdef CONFIG_IPV6_SUBTREES 2619 if (f6i->fib6_src.plen) 2620 src_key = saddr; 2621 #endif 2622 2623 bucket = rcu_dereference(f6i->rt6i_exception_bucket); 2624 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); 2625 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i)) 2626 mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU); 2627 2628 if (likely(!mtu)) { 2629 struct net_device *dev = fib6_info_nh_dev(f6i); 2630 2631 mtu = IPV6_MIN_MTU; 2632 idev = __in6_dev_get(dev); 2633 if (idev && idev->cnf.mtu6 > mtu) 2634 mtu = idev->cnf.mtu6; 2635 } 2636 2637 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 2638 out: 2639 return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu); 2640 } 2641 2642 struct dst_entry *icmp6_dst_alloc(struct net_device *dev, 2643 struct flowi6 *fl6) 2644 { 2645 struct dst_entry *dst; 2646 struct rt6_info *rt; 2647 struct inet6_dev *idev = in6_dev_get(dev); 2648 struct net *net = dev_net(dev); 2649 2650 if (unlikely(!idev)) 2651 return ERR_PTR(-ENODEV); 2652 2653 rt = ip6_dst_alloc(net, dev, 0); 2654 if (unlikely(!rt)) { 2655 in6_dev_put(idev); 2656 dst = ERR_PTR(-ENOMEM); 2657 goto out; 2658 } 2659 2660 rt->dst.flags |= DST_HOST; 2661 rt->dst.input = ip6_input; 2662 rt->dst.output = ip6_output; 2663 rt->rt6i_gateway = fl6->daddr; 2664 rt->rt6i_dst.addr = fl6->daddr; 2665 rt->rt6i_dst.plen = 128; 2666 rt->rt6i_idev = idev; 2667 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0); 2668 2669 /* Add this dst into uncached_list so that rt6_disable_ip() can 2670 * do proper release of the net_device 2671 */ 2672 rt6_uncached_list_add(rt); 2673 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); 2674 2675 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0); 2676 2677 out: 2678 return dst; 2679 } 2680 2681 static int ip6_dst_gc(struct dst_ops *ops) 2682 { 2683 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); 2684 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; 2685 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size; 2686 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; 2687 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; 2688 unsigned long rt_last_gc = 
net->ipv6.ip6_rt_last_gc; 2689 int entries; 2690 2691 entries = dst_entries_get_fast(ops); 2692 if (time_after(rt_last_gc + rt_min_interval, jiffies) && 2693 entries <= rt_max_size) 2694 goto out; 2695 2696 net->ipv6.ip6_rt_gc_expire++; 2697 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true); 2698 entries = dst_entries_get_slow(ops); 2699 if (entries < ops->gc_thresh) 2700 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; 2701 out: 2702 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity; 2703 return entries > rt_max_size; 2704 } 2705 2706 static int ip6_convert_metrics(struct net *net, struct fib6_info *rt, 2707 struct fib6_config *cfg) 2708 { 2709 struct dst_metrics *p; 2710 2711 if (!cfg->fc_mx) 2712 return 0; 2713 2714 p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL); 2715 if (unlikely(!p)) 2716 return -ENOMEM; 2717 2718 refcount_set(&p->refcnt, 1); 2719 rt->fib6_metrics = p; 2720 2721 return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics); 2722 } 2723 2724 static struct rt6_info *ip6_nh_lookup_table(struct net *net, 2725 struct fib6_config *cfg, 2726 const struct in6_addr *gw_addr, 2727 u32 tbid, int flags) 2728 { 2729 struct flowi6 fl6 = { 2730 .flowi6_oif = cfg->fc_ifindex, 2731 .daddr = *gw_addr, 2732 .saddr = cfg->fc_prefsrc, 2733 }; 2734 struct fib6_table *table; 2735 struct rt6_info *rt; 2736 2737 table = fib6_get_table(net, tbid); 2738 if (!table) 2739 return NULL; 2740 2741 if (!ipv6_addr_any(&cfg->fc_prefsrc)) 2742 flags |= RT6_LOOKUP_F_HAS_SADDR; 2743 2744 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE; 2745 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags); 2746 2747 /* if table lookup failed, fall back to full lookup */ 2748 if (rt == net->ipv6.ip6_null_entry) { 2749 ip6_rt_put(rt); 2750 rt = NULL; 2751 } 2752 2753 return rt; 2754 } 2755 2756 static int ip6_route_check_nh_onlink(struct net *net, 2757 struct fib6_config *cfg, 2758 const struct net_device *dev, 2759 struct netlink_ext_ack *extack) 2760 { 2761 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN; 2762 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2763 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT; 2764 struct rt6_info *grt; 2765 int err; 2766 2767 err = 0; 2768 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0); 2769 if (grt) { 2770 if (!grt->dst.error && 2771 (grt->rt6i_flags & flags || dev != grt->dst.dev)) { 2772 NL_SET_ERR_MSG(extack, 2773 "Nexthop has invalid gateway or device mismatch"); 2774 err = -EINVAL; 2775 } 2776 2777 ip6_rt_put(grt); 2778 } 2779 2780 return err; 2781 } 2782 2783 static int ip6_route_check_nh(struct net *net, 2784 struct fib6_config *cfg, 2785 struct net_device **_dev, 2786 struct inet6_dev **idev) 2787 { 2788 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2789 struct net_device *dev = _dev ? 
*_dev : NULL; 2790 struct rt6_info *grt = NULL; 2791 int err = -EHOSTUNREACH; 2792 2793 if (cfg->fc_table) { 2794 int flags = RT6_LOOKUP_F_IFACE; 2795 2796 grt = ip6_nh_lookup_table(net, cfg, gw_addr, 2797 cfg->fc_table, flags); 2798 if (grt) { 2799 if (grt->rt6i_flags & RTF_GATEWAY || 2800 (dev && dev != grt->dst.dev)) { 2801 ip6_rt_put(grt); 2802 grt = NULL; 2803 } 2804 } 2805 } 2806 2807 if (!grt) 2808 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1); 2809 2810 if (!grt) 2811 goto out; 2812 2813 if (dev) { 2814 if (dev != grt->dst.dev) { 2815 ip6_rt_put(grt); 2816 goto out; 2817 } 2818 } else { 2819 *_dev = dev = grt->dst.dev; 2820 *idev = grt->rt6i_idev; 2821 dev_hold(dev); 2822 in6_dev_hold(grt->rt6i_idev); 2823 } 2824 2825 if (!(grt->rt6i_flags & RTF_GATEWAY)) 2826 err = 0; 2827 2828 ip6_rt_put(grt); 2829 2830 out: 2831 return err; 2832 } 2833 2834 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg, 2835 struct net_device **_dev, struct inet6_dev **idev, 2836 struct netlink_ext_ack *extack) 2837 { 2838 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2839 int gwa_type = ipv6_addr_type(gw_addr); 2840 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true; 2841 const struct net_device *dev = *_dev; 2842 bool need_addr_check = !dev; 2843 int err = -EINVAL; 2844 2845 /* if gw_addr is local we will fail to detect this in case the 2846 * address is still TENTATIVE (DAD in progress). rt6_lookup() 2847 * will return the already-added prefix route via the interface 2848 * the prefix route was assigned to, which might be non-loopback. 2849 */ 2850 if (dev && 2851 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 2852 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 2853 goto out; 2854 } 2855 2856 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) { 2857 /* IPv6 strictly inhibits using non-link-local 2858 * addresses as the nexthop address. 2859 * Otherwise, the router will not be able to send redirects. 2860 * It is very good, but in some (rare!) circumstances 2861 * (SIT, PtP, NBMA NOARP links) it is handy to allow 2862 * some exceptions. --ANK 2863 * We allow IPv4-mapped nexthops to support RFC4798-type 2864 * addressing. 2865 */ 2866 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) { 2867 NL_SET_ERR_MSG(extack, "Invalid gateway address"); 2868 goto out; 2869 } 2870 2871 if (cfg->fc_flags & RTNH_F_ONLINK) 2872 err = ip6_route_check_nh_onlink(net, cfg, dev, extack); 2873 else 2874 err = ip6_route_check_nh(net, cfg, _dev, idev); 2875 2876 if (err) 2877 goto out; 2878 } 2879 2880 /* reload in case device was changed */ 2881 dev = *_dev; 2882 2883 err = -EINVAL; 2884 if (!dev) { 2885 NL_SET_ERR_MSG(extack, "Egress device not specified"); 2886 goto out; 2887 } else if (dev->flags & IFF_LOOPBACK) { 2888 NL_SET_ERR_MSG(extack, 2889 "Egress device can not be loopback device for this route"); 2890 goto out; 2891 } 2892 2893 /* if we did not check gw_addr above, do so now that the 2894 * egress device has been resolved.
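 *
 * For example (hypothetical configuration), "ip -6 route add
 * 2001:db8:1::/64 via 2001:db8::1" names no device at all, so
 * need_addr_check is true and the local-address test can only be
 * done here, once ip6_route_check_nh() has picked the device.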
2895 */ 2896 if (need_addr_check && 2897 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 2898 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 2899 goto out; 2900 } 2901 2902 err = 0; 2903 out: 2904 return err; 2905 } 2906 2907 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, 2908 gfp_t gfp_flags, 2909 struct netlink_ext_ack *extack) 2910 { 2911 struct net *net = cfg->fc_nlinfo.nl_net; 2912 struct fib6_info *rt = NULL; 2913 struct net_device *dev = NULL; 2914 struct inet6_dev *idev = NULL; 2915 struct fib6_table *table; 2916 int addr_type; 2917 int err = -EINVAL; 2918 2919 /* RTF_PCPU is an internal flag; can not be set by userspace */ 2920 if (cfg->fc_flags & RTF_PCPU) { 2921 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU"); 2922 goto out; 2923 } 2924 2925 /* RTF_CACHE is an internal flag; can not be set by userspace */ 2926 if (cfg->fc_flags & RTF_CACHE) { 2927 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE"); 2928 goto out; 2929 } 2930 2931 if (cfg->fc_type > RTN_MAX) { 2932 NL_SET_ERR_MSG(extack, "Invalid route type"); 2933 goto out; 2934 } 2935 2936 if (cfg->fc_dst_len > 128) { 2937 NL_SET_ERR_MSG(extack, "Invalid prefix length"); 2938 goto out; 2939 } 2940 if (cfg->fc_src_len > 128) { 2941 NL_SET_ERR_MSG(extack, "Invalid source address length"); 2942 goto out; 2943 } 2944 #ifndef CONFIG_IPV6_SUBTREES 2945 if (cfg->fc_src_len) { 2946 NL_SET_ERR_MSG(extack, 2947 "Specifying source address requires IPV6_SUBTREES to be enabled"); 2948 goto out; 2949 } 2950 #endif 2951 if (cfg->fc_ifindex) { 2952 err = -ENODEV; 2953 dev = dev_get_by_index(net, cfg->fc_ifindex); 2954 if (!dev) 2955 goto out; 2956 idev = in6_dev_get(dev); 2957 if (!idev) 2958 goto out; 2959 } 2960 2961 if (cfg->fc_metric == 0) 2962 cfg->fc_metric = IP6_RT_PRIO_USER; 2963 2964 if (cfg->fc_flags & RTNH_F_ONLINK) { 2965 if (!dev) { 2966 NL_SET_ERR_MSG(extack, 2967 "Nexthop device required for onlink"); 2968 err = -ENODEV; 2969 goto out; 2970 } 2971 2972 if (!(dev->flags & IFF_UP)) { 2973 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 2974 err = -ENETDOWN; 2975 goto out; 2976 } 2977 } 2978 2979 err = -ENOBUFS; 2980 if (cfg->fc_nlinfo.nlh && 2981 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) { 2982 table = fib6_get_table(net, cfg->fc_table); 2983 if (!table) { 2984 pr_warn("NLM_F_CREATE should be specified when creating new route\n"); 2985 table = fib6_new_table(net, cfg->fc_table); 2986 } 2987 } else { 2988 table = fib6_new_table(net, cfg->fc_table); 2989 } 2990 2991 if (!table) 2992 goto out; 2993 2994 err = -ENOMEM; 2995 rt = fib6_info_alloc(gfp_flags); 2996 if (!rt) 2997 goto out; 2998 2999 if (cfg->fc_flags & RTF_ADDRCONF) 3000 rt->dst_nocount = true; 3001 3002 err = ip6_convert_metrics(net, rt, cfg); 3003 if (err < 0) 3004 goto out; 3005 3006 if (cfg->fc_flags & RTF_EXPIRES) 3007 fib6_set_expires(rt, jiffies + 3008 clock_t_to_jiffies(cfg->fc_expires)); 3009 else 3010 fib6_clean_expires(rt); 3011 3012 if (cfg->fc_protocol == RTPROT_UNSPEC) 3013 cfg->fc_protocol = RTPROT_BOOT; 3014 rt->fib6_protocol = cfg->fc_protocol; 3015 3016 addr_type = ipv6_addr_type(&cfg->fc_dst); 3017 3018 if (cfg->fc_encap) { 3019 struct lwtunnel_state *lwtstate; 3020 3021 err = lwtunnel_build_state(cfg->fc_encap_type, 3022 cfg->fc_encap, AF_INET6, cfg, 3023 &lwtstate, extack); 3024 if (err) 3025 goto out; 3026 rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate); 3027 } 3028 3029 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); 3030 rt->fib6_dst.plen = 
cfg->fc_dst_len; 3031 if (rt->fib6_dst.plen == 128) 3032 rt->dst_host = true; 3033 3034 #ifdef CONFIG_IPV6_SUBTREES 3035 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len); 3036 rt->fib6_src.plen = cfg->fc_src_len; 3037 #endif 3038 3039 rt->fib6_metric = cfg->fc_metric; 3040 rt->fib6_nh.nh_weight = 1; 3041 3042 rt->fib6_type = cfg->fc_type; 3043 3044 /* We cannot add true routes via loopback here, 3045 they would result in kernel looping; promote them to reject routes 3046 */ 3047 if ((cfg->fc_flags & RTF_REJECT) || 3048 (dev && (dev->flags & IFF_LOOPBACK) && 3049 !(addr_type & IPV6_ADDR_LOOPBACK) && 3050 !(cfg->fc_flags & RTF_LOCAL))) { 3051 /* hold loopback dev/idev if we haven't done so. */ 3052 if (dev != net->loopback_dev) { 3053 if (dev) { 3054 dev_put(dev); 3055 in6_dev_put(idev); 3056 } 3057 dev = net->loopback_dev; 3058 dev_hold(dev); 3059 idev = in6_dev_get(dev); 3060 if (!idev) { 3061 err = -ENODEV; 3062 goto out; 3063 } 3064 } 3065 rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP; 3066 goto install_route; 3067 } 3068 3069 if (cfg->fc_flags & RTF_GATEWAY) { 3070 err = ip6_validate_gw(net, cfg, &dev, &idev, extack); 3071 if (err) 3072 goto out; 3073 3074 rt->fib6_nh.nh_gw = cfg->fc_gateway; 3075 } 3076 3077 err = -ENODEV; 3078 if (!dev) 3079 goto out; 3080 3081 if (idev->cnf.disable_ipv6) { 3082 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device"); 3083 err = -EACCES; 3084 goto out; 3085 } 3086 3087 if (!(dev->flags & IFF_UP)) { 3088 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 3089 err = -ENETDOWN; 3090 goto out; 3091 } 3092 3093 if (!ipv6_addr_any(&cfg->fc_prefsrc)) { 3094 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { 3095 NL_SET_ERR_MSG(extack, "Invalid source address"); 3096 err = -EINVAL; 3097 goto out; 3098 } 3099 rt->fib6_prefsrc.addr = cfg->fc_prefsrc; 3100 rt->fib6_prefsrc.plen = 128; 3101 } else 3102 rt->fib6_prefsrc.plen = 0; 3103 3104 rt->fib6_flags = cfg->fc_flags; 3105 3106 install_route: 3107 if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) && 3108 !netif_carrier_ok(dev)) 3109 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN; 3110 rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK); 3111 rt->fib6_nh.nh_dev = dev; 3112 rt->fib6_table = table; 3113 3114 if (idev) 3115 in6_dev_put(idev); 3116 3117 return rt; 3118 out: 3119 if (dev) 3120 dev_put(dev); 3121 if (idev) 3122 in6_dev_put(idev); 3123 3124 fib6_info_release(rt); 3125 return ERR_PTR(err); 3126 } 3127 3128 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, 3129 struct netlink_ext_ack *extack) 3130 { 3131 struct fib6_info *rt; 3132 int err; 3133 3134 rt = ip6_route_info_create(cfg, gfp_flags, extack); 3135 if (IS_ERR(rt)) 3136 return PTR_ERR(rt); 3137 3138 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack); 3139 fib6_info_release(rt); 3140 3141 return err; 3142 } 3143 3144 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info) 3145 { 3146 struct net *net = info->nl_net; 3147 struct fib6_table *table; 3148 int err; 3149 3150 if (rt == net->ipv6.fib6_null_entry) { 3151 err = -ENOENT; 3152 goto out; 3153 } 3154 3155 table = rt->fib6_table; 3156 spin_lock_bh(&table->tb6_lock); 3157 err = fib6_del(rt, info); 3158 spin_unlock_bh(&table->tb6_lock); 3159 3160 out: 3161 fib6_info_release(rt); 3162 return err; 3163 } 3164 3165 int ip6_del_rt(struct net *net, struct fib6_info *rt) 3166 { 3167 struct nl_info info = { .nl_net = net }; 3168 3169 return __ip6_del_rt(rt, &info); 3170 } 3171 3172 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg) 3173 { 
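	/* When cfg->fc_delete_all_nh is set this deletes rt together with
	 * all of its ECMP siblings, emitting one RTM_DELROUTE notification
	 * that covers every nexthop (built below, before the hops go away)
	 * instead of one message per sibling; this is what, e.g., a plain
	 * "ip -6 route del <prefix>" on a multipath route ends up doing.
	 */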
3174 struct nl_info *info = &cfg->fc_nlinfo; 3175 struct net *net = info->nl_net; 3176 struct sk_buff *skb = NULL; 3177 struct fib6_table *table; 3178 int err = -ENOENT; 3179 3180 if (rt == net->ipv6.fib6_null_entry) 3181 goto out_put; 3182 table = rt->fib6_table; 3183 spin_lock_bh(&table->tb6_lock); 3184 3185 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) { 3186 struct fib6_info *sibling, *next_sibling; 3187 3188 /* prefer to send a single notification with all hops */ 3189 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 3190 if (skb) { 3191 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; 3192 3193 if (rt6_fill_node(net, skb, rt, NULL, 3194 NULL, NULL, 0, RTM_DELROUTE, 3195 info->portid, seq, 0) < 0) { 3196 kfree_skb(skb); 3197 skb = NULL; 3198 } else 3199 info->skip_notify = 1; 3200 } 3201 3202 list_for_each_entry_safe(sibling, next_sibling, 3203 &rt->fib6_siblings, 3204 fib6_siblings) { 3205 err = fib6_del(sibling, info); 3206 if (err) 3207 goto out_unlock; 3208 } 3209 } 3210 3211 err = fib6_del(rt, info); 3212 out_unlock: 3213 spin_unlock_bh(&table->tb6_lock); 3214 out_put: 3215 fib6_info_release(rt); 3216 3217 if (skb) { 3218 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 3219 info->nlh, gfp_any()); 3220 } 3221 return err; 3222 } 3223 3224 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) 3225 { 3226 int rc = -ESRCH; 3227 3228 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex) 3229 goto out; 3230 3231 if (cfg->fc_flags & RTF_GATEWAY && 3232 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) 3233 goto out; 3234 if (dst_hold_safe(&rt->dst)) 3235 rc = rt6_remove_exception_rt(rt); 3236 out: 3237 return rc; 3238 } 3239 3240 static int ip6_route_del(struct fib6_config *cfg, 3241 struct netlink_ext_ack *extack) 3242 { 3243 struct rt6_info *rt_cache; 3244 struct fib6_table *table; 3245 struct fib6_info *rt; 3246 struct fib6_node *fn; 3247 int err = -ESRCH; 3248 3249 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); 3250 if (!table) { 3251 NL_SET_ERR_MSG(extack, "FIB table does not exist"); 3252 return err; 3253 } 3254 3255 rcu_read_lock(); 3256 3257 fn = fib6_locate(&table->tb6_root, 3258 &cfg->fc_dst, cfg->fc_dst_len, 3259 &cfg->fc_src, cfg->fc_src_len, 3260 !(cfg->fc_flags & RTF_CACHE)); 3261 3262 if (fn) { 3263 for_each_fib6_node_rt_rcu(fn) { 3264 if (cfg->fc_flags & RTF_CACHE) { 3265 int rc; 3266 3267 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst, 3268 &cfg->fc_src); 3269 if (rt_cache) { 3270 rc = ip6_del_cached_rt(rt_cache, cfg); 3271 if (rc != -ESRCH) { 3272 rcu_read_unlock(); 3273 return rc; 3274 } 3275 } 3276 continue; 3277 } 3278 if (cfg->fc_ifindex && 3279 (!rt->fib6_nh.nh_dev || 3280 rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex)) 3281 continue; 3282 if (cfg->fc_flags & RTF_GATEWAY && 3283 !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw)) 3284 continue; 3285 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric) 3286 continue; 3287 if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol) 3288 continue; 3289 if (!fib6_info_hold_safe(rt)) 3290 continue; 3291 rcu_read_unlock(); 3292 3293 /* if gateway was specified only delete the one hop */ 3294 if (cfg->fc_flags & RTF_GATEWAY) 3295 return __ip6_del_rt(rt, &cfg->fc_nlinfo); 3296 3297 return __ip6_del_rt_siblings(rt, cfg); 3298 } 3299 } 3300 rcu_read_unlock(); 3301 3302 return err; 3303 } 3304 3305 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) 3306 { 3307 struct netevent_redirect netevent; 3308 struct rt6_info 
*rt, *nrt = NULL; 3309 struct ndisc_options ndopts; 3310 struct inet6_dev *in6_dev; 3311 struct neighbour *neigh; 3312 struct fib6_info *from; 3313 struct rd_msg *msg; 3314 int optlen, on_link; 3315 u8 *lladdr; 3316 3317 optlen = skb_tail_pointer(skb) - skb_transport_header(skb); 3318 optlen -= sizeof(*msg); 3319 3320 if (optlen < 0) { 3321 net_dbg_ratelimited("rt6_do_redirect: packet too short\n"); 3322 return; 3323 } 3324 3325 msg = (struct rd_msg *)icmp6_hdr(skb); 3326 3327 if (ipv6_addr_is_multicast(&msg->dest)) { 3328 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n"); 3329 return; 3330 } 3331 3332 on_link = 0; 3333 if (ipv6_addr_equal(&msg->dest, &msg->target)) { 3334 on_link = 1; 3335 } else if (ipv6_addr_type(&msg->target) != 3336 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { 3337 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n"); 3338 return; 3339 } 3340 3341 in6_dev = __in6_dev_get(skb->dev); 3342 if (!in6_dev) 3343 return; 3344 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects) 3345 return; 3346 3347 /* RFC2461 8.1: 3348 * The IP source address of the Redirect MUST be the same as the current 3349 * first-hop router for the specified ICMP Destination Address. 3350 */ 3351 3352 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) { 3353 net_dbg_ratelimited("rt6_redirect: invalid ND options\n"); 3354 return; 3355 } 3356 3357 lladdr = NULL; 3358 if (ndopts.nd_opts_tgt_lladdr) { 3359 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, 3360 skb->dev); 3361 if (!lladdr) { 3362 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n"); 3363 return; 3364 } 3365 } 3366 3367 rt = (struct rt6_info *) dst; 3368 if (rt->rt6i_flags & RTF_REJECT) { 3369 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n"); 3370 return; 3371 } 3372 3373 /* Redirect received -> path was valid. 3374 * Look, redirects are sent only in response to data packets, 3375 * so that this nexthop apparently is reachable. --ANK 3376 */ 3377 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr); 3378 3379 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1); 3380 if (!neigh) 3381 return; 3382 3383 /* 3384 * We have finally decided to accept it. 3385 */ 3386 3387 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, 3388 NEIGH_UPDATE_F_WEAK_OVERRIDE| 3389 NEIGH_UPDATE_F_OVERRIDE| 3390 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER| 3391 NEIGH_UPDATE_F_ISROUTER)), 3392 NDISC_REDIRECT, &ndopts); 3393 3394 rcu_read_lock(); 3395 from = rcu_dereference(rt->from); 3396 /* This fib6_info_hold() is safe here because we hold reference to rt 3397 * and rt already holds reference to fib6_info. 
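 * (Spelled out: the caller's dst reference pins rt, and a live rt
 * keeps its rt->from fib6_info refcounted, so 'from' cannot go
 * away between the rcu_dereference() above and the hold below.)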
3398 */ 3399 fib6_info_hold(from); 3400 rcu_read_unlock(); 3401 3402 nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL); 3403 if (!nrt) 3404 goto out; 3405 3406 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE; 3407 if (on_link) 3408 nrt->rt6i_flags &= ~RTF_GATEWAY; 3409 3410 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; 3411 3412 /* No need to remove rt from the exception table if rt is 3413 * a cached route because rt6_insert_exception() will 3414 * take care of it. 3415 */ 3416 if (rt6_insert_exception(nrt, from)) { 3417 dst_release_immediate(&nrt->dst); 3418 goto out; 3419 } 3420 3421 netevent.old = &rt->dst; 3422 netevent.new = &nrt->dst; 3423 netevent.daddr = &msg->dest; 3424 netevent.neigh = neigh; 3425 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); 3426 3427 out: 3428 fib6_info_release(from); 3429 neigh_release(neigh); 3430 } 3431 3432 #ifdef CONFIG_IPV6_ROUTE_INFO 3433 static struct fib6_info *rt6_get_route_info(struct net *net, 3434 const struct in6_addr *prefix, int prefixlen, 3435 const struct in6_addr *gwaddr, 3436 struct net_device *dev) 3437 { 3438 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; 3439 int ifindex = dev->ifindex; 3440 struct fib6_node *fn; 3441 struct fib6_info *rt = NULL; 3442 struct fib6_table *table; 3443 3444 table = fib6_get_table(net, tb_id); 3445 if (!table) 3446 return NULL; 3447 3448 rcu_read_lock(); 3449 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true); 3450 if (!fn) 3451 goto out; 3452 3453 for_each_fib6_node_rt_rcu(fn) { 3454 if (rt->fib6_nh.nh_dev->ifindex != ifindex) 3455 continue; 3456 if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY)) 3457 continue; 3458 if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr)) 3459 continue; 3460 if (!fib6_info_hold_safe(rt)) 3461 continue; 3462 break; 3463 } 3464 out: 3465 rcu_read_unlock(); 3466 return rt; 3467 } 3468 3469 static struct fib6_info *rt6_add_route_info(struct net *net, 3470 const struct in6_addr *prefix, int prefixlen, 3471 const struct in6_addr *gwaddr, 3472 struct net_device *dev, 3473 unsigned int pref) 3474 { 3475 struct fib6_config cfg = { 3476 .fc_metric = IP6_RT_PRIO_USER, 3477 .fc_ifindex = dev->ifindex, 3478 .fc_dst_len = prefixlen, 3479 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | 3480 RTF_UP | RTF_PREF(pref), 3481 .fc_protocol = RTPROT_RA, 3482 .fc_type = RTN_UNICAST, 3483 .fc_nlinfo.portid = 0, 3484 .fc_nlinfo.nlh = NULL, 3485 .fc_nlinfo.nl_net = net, 3486 }; 3487 3488 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; 3489 cfg.fc_dst = *prefix; 3490 cfg.fc_gateway = *gwaddr; 3491 3492 /* We should treat it as a default route if prefix length is 0. */ 3493 if (!prefixlen) 3494 cfg.fc_flags |= RTF_DEFAULT; 3495 3496 ip6_route_add(&cfg, GFP_ATOMIC, NULL); 3497 3498 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev); 3499 } 3500 #endif 3501 3502 struct fib6_info *rt6_get_dflt_router(struct net *net, 3503 const struct in6_addr *addr, 3504 struct net_device *dev) 3505 { 3506 u32 tb_id = l3mdev_fib_table(dev) ?
: RT6_TABLE_DFLT; 3507 struct fib6_info *rt; 3508 struct fib6_table *table; 3509 3510 table = fib6_get_table(net, tb_id); 3511 if (!table) 3512 return NULL; 3513 3514 rcu_read_lock(); 3515 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3516 if (dev == rt->fib6_nh.nh_dev && 3517 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && 3518 ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr)) 3519 break; 3520 } 3521 if (rt && !fib6_info_hold_safe(rt)) 3522 rt = NULL; 3523 rcu_read_unlock(); 3524 return rt; 3525 } 3526 3527 struct fib6_info *rt6_add_dflt_router(struct net *net, 3528 const struct in6_addr *gwaddr, 3529 struct net_device *dev, 3530 unsigned int pref) 3531 { 3532 struct fib6_config cfg = { 3533 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT, 3534 .fc_metric = IP6_RT_PRIO_USER, 3535 .fc_ifindex = dev->ifindex, 3536 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | 3537 RTF_UP | RTF_EXPIRES | RTF_PREF(pref), 3538 .fc_protocol = RTPROT_RA, 3539 .fc_type = RTN_UNICAST, 3540 .fc_nlinfo.portid = 0, 3541 .fc_nlinfo.nlh = NULL, 3542 .fc_nlinfo.nl_net = net, 3543 }; 3544 3545 cfg.fc_gateway = *gwaddr; 3546 3547 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) { 3548 struct fib6_table *table; 3549 3550 table = fib6_get_table(dev_net(dev), cfg.fc_table); 3551 if (table) 3552 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER; 3553 } 3554 3555 return rt6_get_dflt_router(net, gwaddr, dev); 3556 } 3557 3558 static void __rt6_purge_dflt_routers(struct net *net, 3559 struct fib6_table *table) 3560 { 3561 struct fib6_info *rt; 3562 3563 restart: 3564 rcu_read_lock(); 3565 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3566 struct net_device *dev = fib6_info_nh_dev(rt); 3567 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL; 3568 3569 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) && 3570 (!idev || idev->cnf.accept_ra != 2) && 3571 fib6_info_hold_safe(rt)) { 3572 rcu_read_unlock(); 3573 ip6_del_rt(net, rt); 3574 goto restart; 3575 } 3576 } 3577 rcu_read_unlock(); 3578 3579 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER; 3580 } 3581 3582 void rt6_purge_dflt_routers(struct net *net) 3583 { 3584 struct fib6_table *table; 3585 struct hlist_head *head; 3586 unsigned int h; 3587 3588 rcu_read_lock(); 3589 3590 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { 3591 head = &net->ipv6.fib_table_hash[h]; 3592 hlist_for_each_entry_rcu(table, head, tb6_hlist) { 3593 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER) 3594 __rt6_purge_dflt_routers(net, table); 3595 } 3596 } 3597 3598 rcu_read_unlock(); 3599 } 3600 3601 static void rtmsg_to_fib6_config(struct net *net, 3602 struct in6_rtmsg *rtmsg, 3603 struct fib6_config *cfg) 3604 { 3605 memset(cfg, 0, sizeof(*cfg)); 3606 3607 cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? 
3608 : RT6_TABLE_MAIN; 3609 cfg->fc_ifindex = rtmsg->rtmsg_ifindex; 3610 cfg->fc_metric = rtmsg->rtmsg_metric; 3611 cfg->fc_expires = rtmsg->rtmsg_info; 3612 cfg->fc_dst_len = rtmsg->rtmsg_dst_len; 3613 cfg->fc_src_len = rtmsg->rtmsg_src_len; 3614 cfg->fc_flags = rtmsg->rtmsg_flags; 3615 cfg->fc_type = rtmsg->rtmsg_type; 3616 3617 cfg->fc_nlinfo.nl_net = net; 3618 3619 cfg->fc_dst = rtmsg->rtmsg_dst; 3620 cfg->fc_src = rtmsg->rtmsg_src; 3621 cfg->fc_gateway = rtmsg->rtmsg_gateway; 3622 } 3623 3624 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) 3625 { 3626 struct fib6_config cfg; 3627 struct in6_rtmsg rtmsg; 3628 int err; 3629 3630 switch (cmd) { 3631 case SIOCADDRT: /* Add a route */ 3632 case SIOCDELRT: /* Delete a route */ 3633 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 3634 return -EPERM; 3635 err = copy_from_user(&rtmsg, arg, 3636 sizeof(struct in6_rtmsg)); 3637 if (err) 3638 return -EFAULT; 3639 3640 rtmsg_to_fib6_config(net, &rtmsg, &cfg); 3641 3642 rtnl_lock(); 3643 switch (cmd) { 3644 case SIOCADDRT: 3645 err = ip6_route_add(&cfg, GFP_KERNEL, NULL); 3646 break; 3647 case SIOCDELRT: 3648 err = ip6_route_del(&cfg, NULL); 3649 break; 3650 default: 3651 err = -EINVAL; 3652 } 3653 rtnl_unlock(); 3654 3655 return err; 3656 } 3657 3658 return -EINVAL; 3659 } 3660 3661 /* 3662 * Drop the packet on the floor 3663 */ 3664 3665 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) 3666 { 3667 int type; 3668 struct dst_entry *dst = skb_dst(skb); 3669 switch (ipstats_mib_noroutes) { 3670 case IPSTATS_MIB_INNOROUTES: 3671 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); 3672 if (type == IPV6_ADDR_ANY) { 3673 IP6_INC_STATS(dev_net(dst->dev), 3674 __in6_dev_get_safely(skb->dev), 3675 IPSTATS_MIB_INADDRERRORS); 3676 break; 3677 } 3678 /* FALLTHROUGH */ 3679 case IPSTATS_MIB_OUTNOROUTES: 3680 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), 3681 ipstats_mib_noroutes); 3682 break; 3683 } 3684 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0); 3685 kfree_skb(skb); 3686 return 0; 3687 } 3688 3689 static int ip6_pkt_discard(struct sk_buff *skb) 3690 { 3691 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES); 3692 } 3693 3694 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3695 { 3696 skb->dev = skb_dst(skb)->dev; 3697 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); 3698 } 3699 3700 static int ip6_pkt_prohibit(struct sk_buff *skb) 3701 { 3702 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES); 3703 } 3704 3705 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3706 { 3707 skb->dev = skb_dst(skb)->dev; 3708 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); 3709 } 3710 3711 /* 3712 * Allocate a dst for local (unicast / anycast) address. 
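 *
 * For instance (illustrative call, not from this file): when
 * addrconf brings up 2001:db8::1/64 on an interface it ends up
 * doing addrconf_f6i_alloc(net, idev, &addr, false, GFP_KERNEL),
 * giving the address an RTN_LOCAL entry in the local table;
 * anycast callers pass true and get RTN_ANYCAST instead.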
3713 */ 3714 3715 struct fib6_info *addrconf_f6i_alloc(struct net *net, 3716 struct inet6_dev *idev, 3717 const struct in6_addr *addr, 3718 bool anycast, gfp_t gfp_flags) 3719 { 3720 u32 tb_id; 3721 struct net_device *dev = idev->dev; 3722 struct fib6_info *f6i; 3723 3724 f6i = fib6_info_alloc(gfp_flags); 3725 if (!f6i) 3726 return ERR_PTR(-ENOMEM); 3727 3728 f6i->dst_nocount = true; 3729 f6i->dst_host = true; 3730 f6i->fib6_protocol = RTPROT_KERNEL; 3731 f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP; 3732 if (anycast) { 3733 f6i->fib6_type = RTN_ANYCAST; 3734 f6i->fib6_flags |= RTF_ANYCAST; 3735 } else { 3736 f6i->fib6_type = RTN_LOCAL; 3737 f6i->fib6_flags |= RTF_LOCAL; 3738 } 3739 3740 f6i->fib6_nh.nh_gw = *addr; 3741 dev_hold(dev); 3742 f6i->fib6_nh.nh_dev = dev; 3743 f6i->fib6_dst.addr = *addr; 3744 f6i->fib6_dst.plen = 128; 3745 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL; 3746 f6i->fib6_table = fib6_get_table(net, tb_id); 3747 3748 return f6i; 3749 } 3750 3751 /* Remove a deleted IP from prefsrc entries. */ 3752 struct arg_dev_net_ip { 3753 struct net_device *dev; 3754 struct net *net; 3755 struct in6_addr *addr; 3756 }; 3757 3758 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg) 3759 { 3760 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev; 3761 struct net *net = ((struct arg_dev_net_ip *)arg)->net; 3762 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; 3763 3764 if (((void *)rt->fib6_nh.nh_dev == dev || !dev) && 3765 rt != net->ipv6.fib6_null_entry && 3766 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) { 3767 spin_lock_bh(&rt6_exception_lock); 3768 /* remove prefsrc entry */ 3769 rt->fib6_prefsrc.plen = 0; 3770 spin_unlock_bh(&rt6_exception_lock); 3771 } 3772 return 0; 3773 } 3774 3775 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) 3776 { 3777 struct net *net = dev_net(ifp->idev->dev); 3778 struct arg_dev_net_ip adni = { 3779 .dev = ifp->idev->dev, 3780 .net = net, 3781 .addr = &ifp->addr, 3782 }; 3783 fib6_clean_all(net, fib6_remove_prefsrc, &adni); 3784 } 3785 3786 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY) 3787 3788 /* Remove routers and update dst entries when a gateway turns into a host. */ 3789 static int fib6_clean_tohost(struct fib6_info *rt, void *arg) 3790 { 3791 struct in6_addr *gateway = (struct in6_addr *)arg; 3792 3793 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && 3794 ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) { 3795 return -1; 3796 } 3797 3798 /* Further clean up cached routes in the exception table. 3799 * This is needed because a cached route may have a different 3800 * gateway than its 'parent' in the case of an ip redirect.
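 *
 * Example: the parent route may point at gateway G1 while a
 * redirect installed a cached exception via G2. When G2 stops
 * being a router, the RTF_RA_ROUTER test above only sees the
 * parent's nh_gw (G1), so the exception sweep below is what
 * actually catches the G2 entry.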
3801 */ 3802 rt6_exceptions_clean_tohost(rt, gateway); 3803 3804 return 0; 3805 } 3806 3807 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) 3808 { 3809 fib6_clean_all(net, fib6_clean_tohost, gateway); 3810 } 3811 3812 struct arg_netdev_event { 3813 const struct net_device *dev; 3814 union { 3815 unsigned int nh_flags; 3816 unsigned long event; 3817 }; 3818 }; 3819 3820 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) 3821 { 3822 struct fib6_info *iter; 3823 struct fib6_node *fn; 3824 3825 fn = rcu_dereference_protected(rt->fib6_node, 3826 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3827 iter = rcu_dereference_protected(fn->leaf, 3828 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3829 while (iter) { 3830 if (iter->fib6_metric == rt->fib6_metric && 3831 rt6_qualify_for_ecmp(iter)) 3832 return iter; 3833 iter = rcu_dereference_protected(iter->fib6_next, 3834 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3835 } 3836 3837 return NULL; 3838 } 3839 3840 static bool rt6_is_dead(const struct fib6_info *rt) 3841 { 3842 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD || 3843 (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN && 3844 fib6_ignore_linkdown(rt))) 3845 return true; 3846 3847 return false; 3848 } 3849 3850 static int rt6_multipath_total_weight(const struct fib6_info *rt) 3851 { 3852 struct fib6_info *iter; 3853 int total = 0; 3854 3855 if (!rt6_is_dead(rt)) 3856 total += rt->fib6_nh.nh_weight; 3857 3858 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { 3859 if (!rt6_is_dead(iter)) 3860 total += iter->fib6_nh.nh_weight; 3861 } 3862 3863 return total; 3864 } 3865 3866 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total) 3867 { 3868 int upper_bound = -1; 3869 3870 if (!rt6_is_dead(rt)) { 3871 *weight += rt->fib6_nh.nh_weight; 3872 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, 3873 total) - 1; 3874 } 3875 atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound); 3876 } 3877 3878 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total) 3879 { 3880 struct fib6_info *iter; 3881 int weight = 0; 3882 3883 rt6_upper_bound_set(rt, &weight, total); 3884 3885 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3886 rt6_upper_bound_set(iter, &weight, total); 3887 } 3888 3889 void rt6_multipath_rebalance(struct fib6_info *rt) 3890 { 3891 struct fib6_info *first; 3892 int total; 3893 3894 /* In case the entire multipath route was marked for flushing, 3895 * then there is no need to rebalance upon the removal of every 3896 * sibling route. 3897 */ 3898 if (!rt->fib6_nsiblings || rt->should_flush) 3899 return; 3900 3901 /* During lookup routes are evaluated in order, so we need to 3902 * make sure upper bounds are assigned from the first sibling 3903 * onwards. 
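 *
 * Worked example (weights assumed): two nexthops with nh_weight
 * 1 and 2 give total = 3, so rt6_upper_bound_set() above assigns
 * bounds of roughly (1 << 31) / 3 - 1 = 715827882 and
 * (1 << 31) - 1 = 2147483647; a 31-bit flow hash at or below the
 * first bound selects the first nexthop (~1/3 of flows), the
 * remainder select the second.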
3904 */ 3905 first = rt6_multipath_first_sibling(rt); 3906 if (WARN_ON_ONCE(!first)) 3907 return; 3908 3909 total = rt6_multipath_total_weight(first); 3910 rt6_multipath_upper_bound_set(first, total); 3911 } 3912 3913 static int fib6_ifup(struct fib6_info *rt, void *p_arg) 3914 { 3915 const struct arg_netdev_event *arg = p_arg; 3916 struct net *net = dev_net(arg->dev); 3917 3918 if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) { 3919 rt->fib6_nh.nh_flags &= ~arg->nh_flags; 3920 fib6_update_sernum_upto_root(net, rt); 3921 rt6_multipath_rebalance(rt); 3922 } 3923 3924 return 0; 3925 } 3926 3927 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags) 3928 { 3929 struct arg_netdev_event arg = { 3930 .dev = dev, 3931 { 3932 .nh_flags = nh_flags, 3933 }, 3934 }; 3935 3936 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev)) 3937 arg.nh_flags |= RTNH_F_LINKDOWN; 3938 3939 fib6_clean_all(dev_net(dev), fib6_ifup, &arg); 3940 } 3941 3942 static bool rt6_multipath_uses_dev(const struct fib6_info *rt, 3943 const struct net_device *dev) 3944 { 3945 struct fib6_info *iter; 3946 3947 if (rt->fib6_nh.nh_dev == dev) 3948 return true; 3949 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3950 if (iter->fib6_nh.nh_dev == dev) 3951 return true; 3952 3953 return false; 3954 } 3955 3956 static void rt6_multipath_flush(struct fib6_info *rt) 3957 { 3958 struct fib6_info *iter; 3959 3960 rt->should_flush = 1; 3961 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3962 iter->should_flush = 1; 3963 } 3964 3965 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt, 3966 const struct net_device *down_dev) 3967 { 3968 struct fib6_info *iter; 3969 unsigned int dead = 0; 3970 3971 if (rt->fib6_nh.nh_dev == down_dev || 3972 rt->fib6_nh.nh_flags & RTNH_F_DEAD) 3973 dead++; 3974 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3975 if (iter->fib6_nh.nh_dev == down_dev || 3976 iter->fib6_nh.nh_flags & RTNH_F_DEAD) 3977 dead++; 3978 3979 return dead; 3980 } 3981 3982 static void rt6_multipath_nh_flags_set(struct fib6_info *rt, 3983 const struct net_device *dev, 3984 unsigned int nh_flags) 3985 { 3986 struct fib6_info *iter; 3987 3988 if (rt->fib6_nh.nh_dev == dev) 3989 rt->fib6_nh.nh_flags |= nh_flags; 3990 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3991 if (iter->fib6_nh.nh_dev == dev) 3992 iter->fib6_nh.nh_flags |= nh_flags; 3993 } 3994 3995 /* called with write lock held for table with rt */ 3996 static int fib6_ifdown(struct fib6_info *rt, void *p_arg) 3997 { 3998 const struct arg_netdev_event *arg = p_arg; 3999 const struct net_device *dev = arg->dev; 4000 struct net *net = dev_net(dev); 4001 4002 if (rt == net->ipv6.fib6_null_entry) 4003 return 0; 4004 4005 switch (arg->event) { 4006 case NETDEV_UNREGISTER: 4007 return rt->fib6_nh.nh_dev == dev ? -1 : 0; 4008 case NETDEV_DOWN: 4009 if (rt->should_flush) 4010 return -1; 4011 if (!rt->fib6_nsiblings) 4012 return rt->fib6_nh.nh_dev == dev ? 
-1 : 0; 4013 if (rt6_multipath_uses_dev(rt, dev)) { 4014 unsigned int count; 4015 4016 count = rt6_multipath_dead_count(rt, dev); 4017 if (rt->fib6_nsiblings + 1 == count) { 4018 rt6_multipath_flush(rt); 4019 return -1; 4020 } 4021 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD | 4022 RTNH_F_LINKDOWN); 4023 fib6_update_sernum(net, rt); 4024 rt6_multipath_rebalance(rt); 4025 } 4026 return -2; 4027 case NETDEV_CHANGE: 4028 if (rt->fib6_nh.nh_dev != dev || 4029 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) 4030 break; 4031 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN; 4032 rt6_multipath_rebalance(rt); 4033 break; 4034 } 4035 4036 return 0; 4037 } 4038 4039 void rt6_sync_down_dev(struct net_device *dev, unsigned long event) 4040 { 4041 struct arg_netdev_event arg = { 4042 .dev = dev, 4043 { 4044 .event = event, 4045 }, 4046 }; 4047 4048 fib6_clean_all(dev_net(dev), fib6_ifdown, &arg); 4049 } 4050 4051 void rt6_disable_ip(struct net_device *dev, unsigned long event) 4052 { 4053 rt6_sync_down_dev(dev, event); 4054 rt6_uncached_list_flush_dev(dev_net(dev), dev); 4055 neigh_ifdown(&nd_tbl, dev); 4056 } 4057 4058 struct rt6_mtu_change_arg { 4059 struct net_device *dev; 4060 unsigned int mtu; 4061 }; 4062 4063 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg) 4064 { 4065 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; 4066 struct inet6_dev *idev; 4067 4068 /* In IPv6, PMTU discovery is not optional, 4069 so an RTAX_MTU lock cannot disable it. 4070 We still use this lock to block changes 4071 caused by addrconf/ndisc. 4072 */ 4073 4074 idev = __in6_dev_get(arg->dev); 4075 if (!idev) 4076 return 0; 4077 4078 /* For an administrative MTU increase, there is no way to discover 4079 an IPv6 PMTU increase, so the PMTU should be updated here. 4080 Since RFC 1981 doesn't cover administrative MTU increases, 4081 updating the PMTU on increase is a MUST. (i.e.
jumbo frame) 4082 */ 4083 if (rt->fib6_nh.nh_dev == arg->dev && 4084 !fib6_metric_locked(rt, RTAX_MTU)) { 4085 u32 mtu = rt->fib6_pmtu; 4086 4087 if (mtu >= arg->mtu || 4088 (mtu < arg->mtu && mtu == idev->cnf.mtu6)) 4089 fib6_metric_set(rt, RTAX_MTU, arg->mtu); 4090 4091 spin_lock_bh(&rt6_exception_lock); 4092 rt6_exceptions_update_pmtu(idev, rt, arg->mtu); 4093 spin_unlock_bh(&rt6_exception_lock); 4094 } 4095 return 0; 4096 } 4097 4098 void rt6_mtu_change(struct net_device *dev, unsigned int mtu) 4099 { 4100 struct rt6_mtu_change_arg arg = { 4101 .dev = dev, 4102 .mtu = mtu, 4103 }; 4104 4105 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg); 4106 } 4107 4108 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { 4109 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) }, 4110 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) }, 4111 [RTA_OIF] = { .type = NLA_U32 }, 4112 [RTA_IIF] = { .type = NLA_U32 }, 4113 [RTA_PRIORITY] = { .type = NLA_U32 }, 4114 [RTA_METRICS] = { .type = NLA_NESTED }, 4115 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, 4116 [RTA_PREF] = { .type = NLA_U8 }, 4117 [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, 4118 [RTA_ENCAP] = { .type = NLA_NESTED }, 4119 [RTA_EXPIRES] = { .type = NLA_U32 }, 4120 [RTA_UID] = { .type = NLA_U32 }, 4121 [RTA_MARK] = { .type = NLA_U32 }, 4122 [RTA_TABLE] = { .type = NLA_U32 }, 4123 [RTA_IP_PROTO] = { .type = NLA_U8 }, 4124 [RTA_SPORT] = { .type = NLA_U16 }, 4125 [RTA_DPORT] = { .type = NLA_U16 }, 4126 }; 4127 4128 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, 4129 struct fib6_config *cfg, 4130 struct netlink_ext_ack *extack) 4131 { 4132 struct rtmsg *rtm; 4133 struct nlattr *tb[RTA_MAX+1]; 4134 unsigned int pref; 4135 int err; 4136 4137 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, 4138 NULL); 4139 if (err < 0) 4140 goto errout; 4141 4142 err = -EINVAL; 4143 rtm = nlmsg_data(nlh); 4144 memset(cfg, 0, sizeof(*cfg)); 4145 4146 cfg->fc_table = rtm->rtm_table; 4147 cfg->fc_dst_len = rtm->rtm_dst_len; 4148 cfg->fc_src_len = rtm->rtm_src_len; 4149 cfg->fc_flags = RTF_UP; 4150 cfg->fc_protocol = rtm->rtm_protocol; 4151 cfg->fc_type = rtm->rtm_type; 4152 4153 if (rtm->rtm_type == RTN_UNREACHABLE || 4154 rtm->rtm_type == RTN_BLACKHOLE || 4155 rtm->rtm_type == RTN_PROHIBIT || 4156 rtm->rtm_type == RTN_THROW) 4157 cfg->fc_flags |= RTF_REJECT; 4158 4159 if (rtm->rtm_type == RTN_LOCAL) 4160 cfg->fc_flags |= RTF_LOCAL; 4161 4162 if (rtm->rtm_flags & RTM_F_CLONED) 4163 cfg->fc_flags |= RTF_CACHE; 4164 4165 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK); 4166 4167 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid; 4168 cfg->fc_nlinfo.nlh = nlh; 4169 cfg->fc_nlinfo.nl_net = sock_net(skb->sk); 4170 4171 if (tb[RTA_GATEWAY]) { 4172 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); 4173 cfg->fc_flags |= RTF_GATEWAY; 4174 } 4175 4176 if (tb[RTA_DST]) { 4177 int plen = (rtm->rtm_dst_len + 7) >> 3; 4178 4179 if (nla_len(tb[RTA_DST]) < plen) 4180 goto errout; 4181 4182 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen); 4183 } 4184 4185 if (tb[RTA_SRC]) { 4186 int plen = (rtm->rtm_src_len + 7) >> 3; 4187 4188 if (nla_len(tb[RTA_SRC]) < plen) 4189 goto errout; 4190 4191 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen); 4192 } 4193 4194 if (tb[RTA_PREFSRC]) 4195 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]); 4196 4197 if (tb[RTA_OIF]) 4198 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]); 4199 4200 if (tb[RTA_PRIORITY]) 4201 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]); 4202 4203 if (tb[RTA_METRICS]) { 4204 
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg,
			      struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  NULL);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	memset(cfg, 0, sizeof(*cfg));

	cfg->fc_table = rtm->rtm_table;
	cfg->fc_dst_len = rtm->rtm_dst_len;
	cfg->fc_src_len = rtm->rtm_src_len;
	cfg->fc_flags = RTF_UP;
	cfg->fc_protocol = rtm->rtm_protocol;
	cfg->fc_type = rtm->rtm_type;

	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);

	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
	cfg->fc_nlinfo.nlh = nlh;
	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}

	if (tb[RTA_DST]) {
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
						     cfg->fc_mp_len, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_PREF]) {
		pref = nla_get_u8(tb[RTA_PREF]);
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];

	if (tb[RTA_ENCAP_TYPE]) {
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_EXPIRES]) {
		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

		if (addrconf_finite_timeout(timeout)) {
			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
			cfg->fc_flags |= RTF_EXPIRES;
		}
	}

	err = 0;
errout:
	return err;
}

struct rt6_nh {
	struct fib6_info *fib6_info;
	struct fib6_config r_cfg;
	struct list_head next;
};

static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
{
	struct rt6_nh *nh;

	list_for_each_entry(nh, rt6_nh_list, next) {
		pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
			&nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
			nh->r_cfg.fc_ifindex);
	}
}

static int ip6_route_info_append(struct net *net,
				 struct list_head *rt6_nh_list,
				 struct fib6_info *rt,
				 struct fib6_config *r_cfg)
{
	struct rt6_nh *nh;
	int err = -EEXIST;

	list_for_each_entry(nh, rt6_nh_list, next) {
		/* check if fib6_info already exists */
		if (rt6_duplicate_nexthop(nh->fib6_info, rt))
			return err;
	}

	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
	if (!nh)
		return -ENOMEM;
	nh->fib6_info = rt;
	err = ip6_convert_metrics(net, rt, r_cfg);
	if (err) {
		kfree(nh);
		return err;
	}
	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
	list_add_tail(&nh->next, rt6_nh_list);

	return 0;
}
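/* Illustrative sketch (not from the original source): an ECMP route such
 * as
 *
 *	ip -6 route add 2001:db8::/64 nexthop via fe80::1 dev eth0 \
 *				      nexthop via fe80::2 dev eth1
 *
 * is delivered as a single RTM_NEWROUTE message whose RTA_MULTIPATH
 * attribute holds one struct rtnexthop per leg (ifindex, weight in
 * rtnh_hops, plus a nested RTA_GATEWAY). ip6_route_multipath_add()
 * below walks those entries and inserts one fib6_info per nexthop,
 * linked together as siblings.
 */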
static void ip6_route_mpath_notify(struct fib6_info *rt,
				   struct fib6_info *rt_last,
				   struct nl_info *info,
				   __u16 nlflags)
{
	/* if this is an APPEND route, then rt points to the first route
	 * inserted and rt_last points to last route inserted. Userspace
	 * wants a consistent dump of the route which starts at the first
	 * nexthop. Since sibling routes are always added at the end of
	 * the list, find the first sibling of the last route appended
	 */
	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
		rt = list_first_entry(&rt_last->fib6_siblings,
				      struct fib6_info,
				      fib6_siblings);
	}

	if (rt)
		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
}

static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
	struct nl_info *info = &cfg->fc_nlinfo;
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	struct fib6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	__u16 nlflags;
	int remaining;
	int attrlen;
	int err = 1;
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);

	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * fib6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}

		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}
		if (!rt6_qualify_for_ecmp(rt)) {
			err = -EINVAL;
			NL_SET_ERR_MSG(extack,
				       "Device only routes can not be added for IPv6 using the multipath API.");
			fib6_info_release(rt);
			goto cleanup;
		}

		rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;

		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
					    rt, &r_cfg);
		if (err) {
			fib6_info_release(rt);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}
	/* for add and replace send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done
	 */
	info->skip_notify = 1;

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		err = __ip6_ins_rt(nh->fib6_info, info, extack);
		fib6_info_release(nh->fib6_info);

		if (!err) {
			/* save reference to last route successfully inserted */
			rt_last = nh->fib6_info;

			/* save reference to first route for notification */
			if (!rt_notif)
				rt_notif = nh->fib6_info;
		}

		/* nh->fib6_info is used or freed at this point, reset to NULL */
		nh->fib6_info = NULL;
		if (err) {
			if (replace && nhn)
				ip6_print_replace_route_err(&rt6_nh_list);
			err_nh = nh;
			goto add_errout;
		}

		/* Because each route is added like a single route we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 * be added to it.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		nhn++;
	}

	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	goto cleanup;

add_errout:
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg, extack);
	}

cleanup:
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		if (nh->fib6_info)
			fib6_info_release(nh->fib6_info);
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}

static int ip6_route_multipath_del(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	int remaining;
	int attrlen;
	int err = 1, last_err = 0;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry */
	while (rtnh_ok(rtnh, remaining)) {
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
		}
		err = ip6_route_del(&r_cfg, extack);
		if (err)
			last_err = err;

		rtnh = rtnh_next(rtnh, &remaining);
	}

	return last_err;
}

static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct fib6_config cfg;
	int err;

	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
	if (err < 0)
		return err;

	if (cfg.fc_mp)
		return ip6_route_multipath_del(&cfg, extack);
	else {
		cfg.fc_delete_all_nh = 1;
		return ip6_route_del(&cfg, extack);
	}
}
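/* Usage note (illustrative, not from the original source): when a delete
 * request carries no RTA_MULTIPATH attribute, fc_delete_all_nh is set
 * above, so e.g. "ip -6 route del 2001:db8::/64" removes the route
 * together with all of its sibling nexthops. Deleting individual ECMP
 * legs requires spelling them out via RTA_MULTIPATH, which takes the
 * ip6_route_multipath_del() path instead.
 */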
static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct fib6_config cfg;
	int err;

	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
	if (err < 0)
		return err;

	if (cfg.fc_mp)
		return ip6_route_multipath_add(&cfg, extack);
	else
		return ip6_route_add(&cfg, GFP_KERNEL, extack);
}

static size_t rt6_nlmsg_size(struct fib6_info *rt)
{
	int nexthop_len = 0;

	if (rt->fib6_nsiblings) {
		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
			    + NLA_ALIGN(sizeof(struct rtnexthop))
			    + nla_total_size(16) /* RTA_GATEWAY */
			    + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);

		nexthop_len *= rt->fib6_nsiblings;
	}

	return NLMSG_ALIGN(sizeof(struct rtmsg))
	       + nla_total_size(16) /* RTA_SRC */
	       + nla_total_size(16) /* RTA_DST */
	       + nla_total_size(16) /* RTA_GATEWAY */
	       + nla_total_size(16) /* RTA_PREFSRC */
	       + nla_total_size(4) /* RTA_TABLE */
	       + nla_total_size(4) /* RTA_IIF */
	       + nla_total_size(4) /* RTA_OIF */
	       + nla_total_size(4) /* RTA_PRIORITY */
	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
	       + nla_total_size(sizeof(struct rta_cacheinfo))
	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
	       + nla_total_size(1) /* RTA_PREF */
	       + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
	       + nexthop_len;
}
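/* Size sketch (illustrative): each nla_total_size(16) above reserves
 * NLA_ALIGN(NLA_HDRLEN + 16) = 20 bytes for one attribute carrying an
 * IPv6 address, and nexthop_len is multiplied out so a route with N
 * siblings reserves N extra rtnexthop + RTA_GATEWAY slots.
 * Over-estimating here is harmless; under-estimating would trip the
 * -EMSGSIZE warning in inet6_rt_notify() below.
 */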
static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
			    unsigned int *flags, bool skip_oif)
{
	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		*flags |= RTNH_F_DEAD;

	if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
		*flags |= RTNH_F_LINKDOWN;

		rcu_read_lock();
		if (fib6_ignore_linkdown(rt))
			*flags |= RTNH_F_DEAD;
		rcu_read_unlock();
	}

	if (rt->fib6_flags & RTF_GATEWAY) {
		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
			goto nla_put_failure;
	}

	*flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
	if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
		*flags |= RTNH_F_OFFLOAD;

	/* not needed for multipath encoding b/c it has a rtnexthop struct */
	if (!skip_oif && rt->fib6_nh.nh_dev &&
	    nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
		goto nla_put_failure;

	if (rt->fib6_nh.nh_lwtstate &&
	    lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

/* add multipath next hop */
static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
{
	const struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rtnexthop *rtnh;
	unsigned int flags = 0;

	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
	if (!rtnh)
		goto nla_put_failure;

	rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
	rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;

	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
		goto nla_put_failure;

	rtnh->rtnh_flags = flags;

	/* length of rtnetlink header + attributes */
	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
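/* Note (added for clarity): rt6_fill_node() below serves two callers.
 * Route dumps hand it a bare fib6_info (dst == NULL), so it reports the
 * fib entry (fib6_dst/fib6_src/fib6_flags); RTM_GETROUTE replies also
 * pass the dst_entry the lookup produced, in which case the rt6_info's
 * cached keys and flags take precedence.
 */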
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	struct rt6_info *rt6 = (struct rt6_info *)dst;
	struct rt6key *rt6_dst, *rt6_src;
	u32 *pmetrics, table, rt6_flags;
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;
	long expires = 0;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	if (rt6) {
		rt6_dst = &rt6->rt6i_dst;
		rt6_src = &rt6->rt6i_src;
		rt6_flags = rt6->rt6i_flags;
	} else {
		rt6_dst = &rt->fib6_dst;
		rt6_src = &rt->fib6_src;
		rt6_flags = rt->fib6_flags;
	}

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt6_dst->plen;
	rtm->rtm_src_len = rt6_src->plen;
	rtm->rtm_tos = 0;
	if (rt->fib6_table)
		table = rt->fib6_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	rtm->rtm_table = table;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;

	rtm->rtm_type = rt->fib6_type;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->fib6_protocol;

	if (rt6_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	if (dest) {
		if (nla_put_in6_addr(skb, RTA_DST, dest))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dest) {
		struct in6_addr saddr_buf;

		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->fib6_prefsrc.plen) {
		struct in6_addr saddr_buf;

		saddr_buf = rt->fib6_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt6) {
		if (rt6_flags & RTF_GATEWAY &&
		    nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
			goto nla_put_failure;

		if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
			goto nla_put_failure;
	} else if (rt->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (rt6_add_nexthop(skb, rt) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings, fib6_siblings) {
			if (rt6_add_nexthop(skb, sibling) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
			goto nla_put_failure;
	}

	if (rt6_flags & RTF_EXPIRES) {
		expires = dst ? dst->expires : rt->expires;
		expires -= jiffies;
	}

	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

int rt6_dump_route(struct fib6_info *rt, void *p_arg)
{
	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
	struct net *net = arg->net;

	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);

		/* user wants prefix routes only */
		if (rtm->rtm_flags & RTM_F_PREFIX &&
		    !(rt->fib6_flags & RTF_PREFIX_RT)) {
			/* success since this is not a prefix route */
			return 1;
		}
	}

	return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
			     RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
			     arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
}
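/* Usage note (illustrative): "ip -6 route get 2001:db8::1" exercises
 * the handler below and reports the dst that the lookup produced, while
 * "ip -6 route get 2001:db8::1 fibmatch" sets RTM_F_FIB_MATCH so the
 * reply describes the matched fib entry itself (the fibmatch branch
 * passes dst == NULL to rt6_fill_node()).
 */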
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	int err, iif = 0, oif = 0;
	struct fib6_info *from;
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6;
	bool fibmatch;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	memset(&fl6, 0, sizeof(fl6));
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

	if (tb[RTA_SPORT])
		fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);

	if (tb[RTA_DPORT])
		fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);

	if (tb[RTA_IP_PROTO]) {
		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
						  &fl6.flowi6_proto, extack);
		if (err)
			goto errout;
	}

	if (iif) {
		struct net_device *dev;
		int flags = 0;

		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			rcu_read_unlock();
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);

		rcu_read_unlock();
	} else {
		fl6.flowi6_oif = oif;

		dst = ip6_route_output(net, NULL, &fl6);
	}

	rt = container_of(dst, struct rt6_info, dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	skb_dst_set(skb, &rt->dst);

	rcu_read_lock();
	from = rcu_dereference(rt->from);

	if (fibmatch)
		err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	else
		err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
				    &fl6.saddr, iif, RTM_NEWROUTE,
				    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
				    0);
	rcu_read_unlock();

	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}
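/* Note (added for clarity): inet6_rt_notify() below is the single
 * notification point for fib changes; listeners subscribed to
 * RTNLGRP_IPV6_ROUTE (e.g. "ip -6 monitor route") receive the same
 * rt6_fill_node() encoding that route dumps use.
 */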
void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
		     unsigned int nlm_flags)
{
	struct sk_buff *skb;
	struct net *net = info->nl_net;
	u32 seq;
	int err;

	err = -ENOBUFS;
	seq = info->nlh ? info->nlh->nlmsg_seq : 0;

	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
	if (!skb)
		goto errout;

	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
			    event, info->portid, seq, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}

static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	} else if (event == NETDEV_UNREGISTER &&
		   dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER can be fired multiple times by
		 * netdev_wait_allrefs(). Make sure we only call this once.
		 */
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}

/*
 *	/proc
 */

#ifdef CONFIG_PROC_FS
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;

	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);

	return 0;
}
#endif	/* CONFIG_PROC_FS */
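/* Reading note (illustrative): the seven hex fields above surface as
 * /proc/net/rt6_stats (registered in ip6_route_net_init_late() below),
 * in the order fib_nodes, fib_route_nodes, fib_rt_alloc, fib_rt_entries,
 * fib_rt_cache, allocated dst entries and fib_discarded_routes.
 */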
#ifdef CONFIG_SYSCTL

static
int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
			      void __user *buffer, size_t *lenp, loff_t *ppos)
{
	struct net *net;
	int delay;

	if (!write)
		return -EINVAL;

	net = (struct net *)ctl->extra1;
	delay = net->ipv6.sysctl.flush_delay;
	proc_dointvec(ctl, write, buffer, lenp, ppos);
	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
	return 0;
}

struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	= "flush",
		.data		= &init_net.ipv6.sysctl.flush_delay,
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv6_sysctl_rtcache_flush
	},
	{
		.procname	= "gc_thresh",
		.data		= &ip6_dst_ops_template.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_min_interval",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{ }
};

struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
#endif
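/* Usage sketch (illustrative): with the table above registered under
 * net/ipv6/route, writing the write-only flush entry, e.g.
 *
 *	echo 1 > /proc/sys/net/ipv6/route/flush
 *
 * runs ipv6_sysctl_rtcache_flush() and thus fib6_run_gc() with the
 * previously configured delay; the remaining entries (gc_thresh,
 * max_size, the gc_* intervals, mtu_expires, min_adv_mss) are plain
 * integer knobs handled by proc_dointvec{,_jiffies}().
 */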
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}

static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.fib6_null_entry);
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}

static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
			sizeof(struct ipv6_route_iter));
	proc_create_net_single("rt6_stats", 0444, net->proc_net,
			       rt6_stats_seq_show, NULL);
#endif
	return 0;
}

static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}

static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};

static int __net_init ipv6_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv6.peers = bp;
	return 0;
}

static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv6.peers;

	net->ipv6.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static struct pernet_operations ipv6_inetpeer_ops = {
	.init = ipv6_inetpeer_init,
	.exit = ipv6_inetpeer_exit,
};

static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};

static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};

void __init ip6_route_init_special_entries(void)
{
	/* Registration of the loopback device happens before this portion
	 * of code runs, so the loopback reference in rt6_info is not taken
	 * at that point; do it manually for init_net.
	 */
	init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#endif
}
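/* Note (added for clarity): ip6_route_init() below registers its pieces
 * in dependency order (dst kmem cache, blackhole dst counters, the
 * inetpeer and route pernet subsystems, fib6, xfrm6, fib6 rules, the
 * late pernet ops, the three rtnetlink handlers, then the netdevice
 * notifier); on error, the goto labels unwind the same steps in
 * reverse.
 */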
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}

void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}