/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		reachable.  otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <trace/events/fib6.h>

#include <linux/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};

static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
static unsigned int	 ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
static int		 ip6_dst_gc(struct dst_ops *ops);

static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int		ip6_pkt_prohibit(struct sk_buff *skb);
static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct fib6_info *rt);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev,
					   unsigned int pref);
static struct fib6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev);
#endif

struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);

void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}

static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}

static inline const void *choose_neigh_daddr(const struct in6_addr *p,
					     struct sk_buff *skb,
					     const void *daddr)
{
	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}

struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
{
	struct neighbour *n;

	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	if (n)
		return n;
	return neigh_create(&nd_tbl, daddr, dev);
}

static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
}

static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}
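/* Illustrative sketch (not part of the original sources): the neighbour
 * lookup key chosen by choose_neigh_daddr() is, in order of preference,
 * the configured gateway, the destination of the packet in hand, then
 * the caller-supplied address.  A hypothetical caller would see:
 *
 *	struct in6_addr any = IN6ADDR_ANY_INIT;
 *	struct in6_addr gw;			// e.g. fe80::1
 *
 *	choose_neigh_daddr(&gw, skb, daddr);	// -> &gw (gateway route)
 *	choose_neigh_daddr(&any, skb, daddr);	// -> &ipv6_hdr(skb)->daddr
 *	choose_neigh_daddr(&any, NULL, daddr);	// -> daddr (fallback)
 */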
static struct dst_ops ip6_dst_ops_template = {
	.family			= AF_INET6,
	.gc			= ip6_dst_gc,
	.gc_thresh		= 1024,
	.check			= ip6_dst_check,
	.default_advmss		= ip6_default_advmss,
	.mtu			= ip6_mtu,
	.cow_metrics		= dst_cow_metrics_generic,
	.destroy		= ip6_dst_destroy,
	.ifdown			= ip6_dst_ifdown,
	.negative_advice	= ip6_negative_advice,
	.link_failure		= ip6_link_failure,
	.update_pmtu		= ip6_rt_update_pmtu,
	.redirect		= rt6_do_redirect,
	.local_out		= __ip6_local_out,
	.neigh_lookup		= ip6_dst_neigh_lookup,
	.confirm_neigh		= ip6_confirm_neigh,
};

static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

static struct dst_ops ip6_dst_blackhole_ops = {
	.family			= AF_INET6,
	.destroy		= ip6_dst_destroy,
	.check			= ip6_dst_check,
	.mtu			= ip6_blackhole_mtu,
	.default_advmss		= ip6_default_advmss,
	.update_pmtu		= ip6_rt_blackhole_update_pmtu,
	.redirect		= ip6_rt_blackhole_redirect,
	.cow_metrics		= dst_cow_metrics_generic,
	.neigh_lookup		= ip6_dst_neigh_lookup,
};

static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol	= RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};

static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif

static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}

/* allocate dst with ip6_dst_ops */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);

static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	rcu_assign_pointer(rt->from, NULL);
	fib6_info_release(from);
	rcu_read_unlock();
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}

static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}

struct fib6_info *fib6_multipath_select(const struct net *net,
					struct fib6_info *match,
					struct flowi6 *fl6, int oif,
					const struct sk_buff *skb,
					int strict)
{
	struct fib6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}
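/* Worked example (illustrative only): with three siblings carrying
 * weights 1, 1 and 2, the precomputed upper bounds split the 31-bit
 * hash space roughly as [0..H/4], (H/4..H/2] and (H/2..H], where H is
 * the maximum mp_hash value.  A flow whose mp_hash lands in the second
 * interval is forwarded via the second sibling; because mp_hash is
 * derived from the flow keys, all packets of that flow keep selecting
 * the same next hop.
 */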
/*
 *	Route lookup. rcu_read_lock() should be held.
 */

static inline struct fib6_info *rt6_device_match(struct net *net,
						 struct fib6_info *rt,
						 const struct in6_addr *saddr,
						 int oif,
						 int flags)
{
	struct fib6_info *sprt;

	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
		const struct net_device *dev = sprt->fib6_nh.nh_dev;

		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE)
		return net->ipv6.fib6_null_entry;

	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
}

#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}

static void rt6_probe(struct fib6_info *rt)
{
	struct __rt6_probe_work *work;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
		return;

	nh_gw = &rt->fib6_nh.nh_gw;
	dev = rt->fib6_nh.nh_dev;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		struct inet6_dev *idev;

		if (neigh->nud_state & NUD_VALID)
			goto out;

		idev = __in6_dev_get(dev);
		work = NULL;
		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct fib6_info *rt)
{
}
#endif

/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
static inline int rt6_check_dev(struct fib6_info *rt, int oif)
{
	const struct net_device *dev = rt->fib6_nh.nh_dev;

	if (!oif || dev->ifindex == oif)
		return 2;
	return 0;
}

static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	if (rt->fib6_flags & RTF_NONEXTHOP ||
	    !(rt->fib6_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
					  &rt->fib6_nh.nh_gw);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}

static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}
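/* Worked example (illustrative only): rt6_check_dev() contributes 2 when
 * the route's device matches the requested oif (or no oif is given), and
 * with CONFIG_IPV6_ROUTER_PREF the decoded router preference lands in
 * bits 2 and up.  Assuming the usual decoding where low/medium/high map
 * to 1/2/3, a high-preference router on a matching interface scores
 * 2 | (3 << 2) = 14, while a negative NUD result from rt6_check_neigh()
 * overrides the score entirely.
 */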
/* called with rcu_read_lock held */
static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
{
	const struct net_device *dev = fib6_info_nh_dev(f6i);
	bool rc = false;

	if (dev) {
		const struct inet6_dev *idev = __in6_dev_get(dev);

		rc = !!idev->cnf.ignore_routes_with_linkdown;
	}

	return rc;
}

static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
				    int *mpri, struct fib6_info *match,
				    bool *do_rr)
{
	int m;
	bool match_do_rr = false;

	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		goto out;

	if (fib6_ignore_linkdown(rt) &&
	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (fib6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}

static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
				      struct fib6_info *leaf,
				      struct fib6_info *rr_head,
				      u32 metric, int oif, int strict,
				      bool *do_rr)
{
	struct fib6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}

static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
				    int oif, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.fib6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.fib6_null_entry;
}

static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
{
	return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
}

#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
#endif
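/* Background note (RFC 4191, not from the original sources): the Route
 * Information option's length field counts units of 8 octets including
 * the 8-byte header, so length 1 carries no prefix octets, length 2
 * carries 8 and length 3 carries the full 16.  Only a length-3 option
 * holds a complete in6_addr, which is why shorter options are expanded
 * through ipv6_addr_prefix() above before the prefix is used.
 */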
/*
 *	Misc support functions
 */

/* called with rcu_lock held */
static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
{
	struct net_device *dev = rt->fib6_nh.nh_dev;

	if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}

static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};

static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}

static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
{
	unsigned short flags = 0;

	if (rt->dst_nocount)
		flags |= DST_NOCOUNT;
	if (rt->dst_nopolicy)
		flags |= DST_NOPOLICY;
	if (rt->dst_host)
		flags |= DST_HOST;

	return flags;
}

static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);

	switch (ort->fib6_type) {
	case RTN_BLACKHOLE:
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}

static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.flags |= fib6_info_dst_flags(ort);

	if (ort->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, ort);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	if (ort->fib6_type == RTN_LOCAL) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (ort->fib6_nh.nh_lwtstate) {
		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}

static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	fib6_info_hold(from);
	rcu_assign_pointer(rt->from, from);
	dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
	if (from->fib6_metrics != &dst_default_metrics) {
		rt->dst._metrics |= DST_METRICS_REFCOUNTED;
		refcount_inc(&from->fib6_metrics->refcnt);
	}
}

static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
{
	struct net_device *dev = fib6_info_nh_dev(ort);

	ip6_rt_init_dst(rt, ort);

	rt->rt6i_dst = ort->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
	rt->rt6i_flags = ort->fib6_flags;
	rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->fib6_src;
#endif
	rt->rt6i_prefsrc = ort->fib6_prefsrc;
	rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
}

static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}

static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
			  bool null_fallback)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (null_fallback) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}

/* called with rcu_lock held */
static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rt6_info *nrt;

	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (nrt)
		ip6_rt_copy_init(nrt, rt);

	return nrt;
}

static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_info *f6i;
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	f6i = rcu_dereference(fn->leaf);
	if (!f6i) {
		f6i = net->ipv6.fib6_null_entry;
	} else {
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
				       fl6->flowi6_oif, flags);
		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
			f6i = fib6_multipath_select(net, f6i, fl6,
						    fl6->flowi6_oif, skb,
						    flags);
	}
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);
	} else if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = ip6_create_rt_rcu(f6i);
		if (!rt) {
			rt = net->ipv6.ip6_null_entry;
			dst_hold(&rt->dst);
		}
	}

	rcu_read_unlock();

	return rt;
}

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);

struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);

/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason the
 * route is released.
 * Caller must hold dst before calling it.
 */

static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = { .nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}

static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	dev = ip6_rt_get_dev_rcu(ort);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}

static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt)
		return NULL;
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt, false);

	return pcpu_rt;
}

static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	return pcpu_rt;
}
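/* Design note (not from the original sources): rt6_make_pcpu_route()
 * publishes the new clone with cmpxchg() rather than a plain store so
 * that an unexpected concurrent writer to the same per-cpu slot would
 * be detected.  Callers run with bottom halves disabled on this CPU, so
 * the slot cannot have been filled between rt6_get_pcpu_route() and
 * here, which is what the BUG_ON(prev) asserts.
 */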
/* exception hash table implementation
 */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
	net->ipv6.rt6_stats->fib_rt_cache--;
}

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}

static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
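/* Worked example (illustrative only): the destination address, and with
 * CONFIG_IPV6_SUBTREES optionally the source address, are folded through
 * jhash() and then reduced by hash_32() to
 * FIB6_EXCEPTION_BUCKET_SIZE_SHIFT bits, i.e. an index into the array of
 * FIB6_EXCEPTION_BUCKET_SIZE buckets allocated in rt6_insert_exception():
 *
 *	u32 idx = rt6_exception_hash(&daddr, NULL);
 *	struct rt6_exception_bucket *b = &base[idx];
 */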
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

static unsigned int fib6_mtu(const struct fib6_info *rt)
{
	unsigned int mtu;

	if (rt->fib6_pmtu) {
		mtu = rt->fib6_pmtu;
	} else {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
}

static int rt6_insert_exception(struct rt6_info *nrt,
				struct fib6_info *ort)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
	nrt->rt6i_prefsrc = ort->fib6_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->fib6_table->tb6_lock);
		fib6_update_sernum(net, ort);
		spin_unlock_bh(&ort->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}

void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() from recreating the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}

/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->fib6_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}

/* Remove the passed in cached rt from the hash table that contains it */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	from = rcu_dereference(rt->from);
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}

/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct fib6_info *from = rt->from;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;

	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return;

	rcu_read_lock();
	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

	rcu_read_unlock();
}

static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
				rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
			}
			bucket++;
		}
	}
}

static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
{
	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
	 *
	 * If the new MTU is higher, and the route PMTU is equal to the local
	 * MTU, this means the old MTU is the lowest in the path, so allow
	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
	 * handle this.
	 */

	if (dst_mtu(&rt->dst) >= mtu)
		return true;

	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
		return true;

	return false;
}
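/* Worked example (illustrative only): with a device MTU (idev->cnf.mtu6)
 * of 1500 and a cached route PMTU of 1400, an update down to 1280 is
 * allowed (decrease), and an update from 1500 to 9000 is allowed while
 * the route PMTU still equals the device MTU; but raising the 1400 entry
 * to 1500 is refused, because some other hop on the path imposed the
 * lower value.
 */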
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}

#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}

static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others have still references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}

void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}
/* must be called with rcu lock held */
struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
				    int oif, struct flowi6 *fl6, int strict)
{
	struct fib6_node *fn, *saved_fn;
	struct fib6_info *f6i;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	f6i = rt6_select(net, fn, oif, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	return f6i;
}

struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_info *f6i;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	f6i = fib6_table_lookup(net, table, oif, fl6, strict);
	if (f6i->fib6_nsiblings)
		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);

	if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		return rt;
	}

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(f6i->fib6_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		return uncached_rt;
	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(f6i);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, f6i);

		local_bh_enable();
		rcu_read_unlock();

		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);

static struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
}

struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6,
					 const struct sk_buff *skb,
					 int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);

static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowinfo(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}
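/* Design note (not from the original sources): for ICMPv6 errors the
 * flow keys are taken from the quoted inner header, so the error is
 * hashed against the offending flow rather than the ICMP packet itself
 * and follows the same multipath leg; _flkeys is cleared in that case
 * because the pre-dissected keys describe the outer ICMP packet, not
 * the inner one.
 */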
/* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}

void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}

static struct rt6_info *ip6_pol_route_output(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
}

struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	if (rt6_need_strict(&fl6->daddr)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2108 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) 2109 { 2110 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig; 2111 struct net_device *loopback_dev = net->loopback_dev; 2112 struct dst_entry *new = NULL; 2113 2114 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1, 2115 DST_OBSOLETE_DEAD, 0); 2116 if (rt) { 2117 rt6_info_init(rt); 2118 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); 2119 2120 new = &rt->dst; 2121 new->__use = 1; 2122 new->input = dst_discard; 2123 new->output = dst_discard_out; 2124 2125 dst_copy_metrics(new, &ort->dst); 2126 2127 rt->rt6i_idev = in6_dev_get(loopback_dev); 2128 rt->rt6i_gateway = ort->rt6i_gateway; 2129 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU; 2130 2131 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); 2132 #ifdef CONFIG_IPV6_SUBTREES 2133 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); 2134 #endif 2135 } 2136 2137 dst_release(dst_orig); 2138 return new ? new : ERR_PTR(-ENOMEM); 2139 } 2140 2141 /* 2142 * Destination cache support functions 2143 */ 2144 2145 static bool fib6_check(struct fib6_info *f6i, u32 cookie) 2146 { 2147 u32 rt_cookie = 0; 2148 2149 if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie) 2150 return false; 2151 2152 if (fib6_check_expired(f6i)) 2153 return false; 2154 2155 return true; 2156 } 2157 2158 static struct dst_entry *rt6_check(struct rt6_info *rt, 2159 struct fib6_info *from, 2160 u32 cookie) 2161 { 2162 u32 rt_cookie = 0; 2163 2164 if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) || 2165 rt_cookie != cookie) 2166 return NULL; 2167 2168 if (rt6_check_expired(rt)) 2169 return NULL; 2170 2171 return &rt->dst; 2172 } 2173 2174 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, 2175 struct fib6_info *from, 2176 u32 cookie) 2177 { 2178 if (!__rt6_check_expired(rt) && 2179 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && 2180 fib6_check(from, cookie)) 2181 return &rt->dst; 2182 else 2183 return NULL; 2184 } 2185 2186 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) 2187 { 2188 struct dst_entry *dst_ret; 2189 struct fib6_info *from; 2190 struct rt6_info *rt; 2191 2192 rt = container_of(dst, struct rt6_info, dst); 2193 2194 rcu_read_lock(); 2195 2196 /* All IPV6 dsts are created with ->obsolete set to the value 2197 * DST_OBSOLETE_FORCE_CHK which forces validation calls down 2198 * into this function always. 
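 *
 * Callers revalidate a cached dst roughly as follows (sketch; compare
 * ip6_sk_update_pmtu() later in this file - a NULL return from
 * ->check() means the entry is stale and must be looked up again):
 *
 *	dst = __sk_dst_get(sk);
 *	if (dst && dst->obsolete &&
 *	    !dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
 *		ip6_datagram_dst_update(sk, false);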
2199 */ 2200 2201 from = rcu_dereference(rt->from); 2202 2203 if (from && (rt->rt6i_flags & RTF_PCPU || 2204 unlikely(!list_empty(&rt->rt6i_uncached)))) 2205 dst_ret = rt6_dst_from_check(rt, from, cookie); 2206 else 2207 dst_ret = rt6_check(rt, from, cookie); 2208 2209 rcu_read_unlock(); 2210 2211 return dst_ret; 2212 } 2213 2214 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) 2215 { 2216 struct rt6_info *rt = (struct rt6_info *) dst; 2217 2218 if (rt) { 2219 if (rt->rt6i_flags & RTF_CACHE) { 2220 rcu_read_lock(); 2221 if (rt6_check_expired(rt)) { 2222 rt6_remove_exception_rt(rt); 2223 dst = NULL; 2224 } 2225 rcu_read_unlock(); 2226 } else { 2227 dst_release(dst); 2228 dst = NULL; 2229 } 2230 } 2231 return dst; 2232 } 2233 2234 static void ip6_link_failure(struct sk_buff *skb) 2235 { 2236 struct rt6_info *rt; 2237 2238 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); 2239 2240 rt = (struct rt6_info *) skb_dst(skb); 2241 if (rt) { 2242 rcu_read_lock(); 2243 if (rt->rt6i_flags & RTF_CACHE) { 2244 if (dst_hold_safe(&rt->dst)) 2245 rt6_remove_exception_rt(rt); 2246 } else { 2247 struct fib6_info *from; 2248 struct fib6_node *fn; 2249 2250 from = rcu_dereference(rt->from); 2251 if (from) { 2252 fn = rcu_dereference(from->fib6_node); 2253 if (fn && (rt->rt6i_flags & RTF_DEFAULT)) 2254 fn->fn_sernum = -1; 2255 } 2256 } 2257 rcu_read_unlock(); 2258 } 2259 } 2260 2261 static void rt6_update_expires(struct rt6_info *rt0, int timeout) 2262 { 2263 if (!(rt0->rt6i_flags & RTF_EXPIRES)) { 2264 struct fib6_info *from; 2265 2266 rcu_read_lock(); 2267 from = rcu_dereference(rt0->from); 2268 if (from) 2269 rt0->dst.expires = from->expires; 2270 rcu_read_unlock(); 2271 } 2272 2273 dst_set_expires(&rt0->dst, timeout); 2274 rt0->rt6i_flags |= RTF_EXPIRES; 2275 } 2276 2277 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) 2278 { 2279 struct net *net = dev_net(rt->dst.dev); 2280 2281 dst_metric_set(&rt->dst, RTAX_MTU, mtu); 2282 rt->rt6i_flags |= RTF_MODIFIED; 2283 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires); 2284 } 2285 2286 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) 2287 { 2288 bool from_set; 2289 2290 rcu_read_lock(); 2291 from_set = !!rcu_dereference(rt->from); 2292 rcu_read_unlock(); 2293 2294 return !(rt->rt6i_flags & RTF_CACHE) && 2295 (rt->rt6i_flags & RTF_PCPU || from_set); 2296 } 2297 2298 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, 2299 const struct ipv6hdr *iph, u32 mtu) 2300 { 2301 const struct in6_addr *daddr, *saddr; 2302 struct rt6_info *rt6 = (struct rt6_info *)dst; 2303 2304 if (rt6->rt6i_flags & RTF_LOCAL) 2305 return; 2306 2307 if (dst_metric_locked(dst, RTAX_MTU)) 2308 return; 2309 2310 if (iph) { 2311 daddr = &iph->daddr; 2312 saddr = &iph->saddr; 2313 } else if (sk) { 2314 daddr = &sk->sk_v6_daddr; 2315 saddr = &inet6_sk(sk)->saddr; 2316 } else { 2317 daddr = NULL; 2318 saddr = NULL; 2319 } 2320 dst_confirm_neigh(dst, daddr); 2321 mtu = max_t(u32, mtu, IPV6_MIN_MTU); 2322 if (mtu >= dst_mtu(dst)) 2323 return; 2324 2325 if (!rt6_cache_allowed_for_pmtu(rt6)) { 2326 rt6_do_update_pmtu(rt6, mtu); 2327 /* update rt6_ex->stamp for cache */ 2328 if (rt6->rt6i_flags & RTF_CACHE) 2329 rt6_update_exception_stamp_rt(rt6); 2330 } else if (daddr) { 2331 struct fib6_info *from; 2332 struct rt6_info *nrt6; 2333 2334 rcu_read_lock(); 2335 from = rcu_dereference(rt6->from); 2336 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr); 2337 if (nrt6) { 2338 rt6_do_update_pmtu(nrt6, mtu); 2339 if 
(rt6_insert_exception(nrt6, from)) 2340 dst_release_immediate(&nrt6->dst); 2341 } 2342 rcu_read_unlock(); 2343 } 2344 } 2345 2346 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, 2347 struct sk_buff *skb, u32 mtu) 2348 { 2349 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu); 2350 } 2351 2352 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, 2353 int oif, u32 mark, kuid_t uid) 2354 { 2355 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2356 struct dst_entry *dst; 2357 struct flowi6 fl6; 2358 2359 memset(&fl6, 0, sizeof(fl6)); 2360 fl6.flowi6_oif = oif; 2361 fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark); 2362 fl6.daddr = iph->daddr; 2363 fl6.saddr = iph->saddr; 2364 fl6.flowlabel = ip6_flowinfo(iph); 2365 fl6.flowi6_uid = uid; 2366 2367 dst = ip6_route_output(net, NULL, &fl6); 2368 if (!dst->error) 2369 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu)); 2370 dst_release(dst); 2371 } 2372 EXPORT_SYMBOL_GPL(ip6_update_pmtu); 2373 2374 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) 2375 { 2376 struct dst_entry *dst; 2377 2378 ip6_update_pmtu(skb, sock_net(sk), mtu, 2379 sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid); 2380 2381 dst = __sk_dst_get(sk); 2382 if (!dst || !dst->obsolete || 2383 dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) 2384 return; 2385 2386 bh_lock_sock(sk); 2387 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) 2388 ip6_datagram_dst_update(sk, false); 2389 bh_unlock_sock(sk); 2390 } 2391 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu); 2392 2393 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, 2394 const struct flowi6 *fl6) 2395 { 2396 #ifdef CONFIG_IPV6_SUBTREES 2397 struct ipv6_pinfo *np = inet6_sk(sk); 2398 #endif 2399 2400 ip6_dst_store(sk, dst, 2401 ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ? 2402 &sk->sk_v6_daddr : NULL, 2403 #ifdef CONFIG_IPV6_SUBTREES 2404 ipv6_addr_equal(&fl6->saddr, &np->saddr) ? 2405 &np->saddr : 2406 #endif 2407 NULL); 2408 } 2409 2410 /* Handle redirects */ 2411 struct ip6rd_flowi { 2412 struct flowi6 fl6; 2413 struct in6_addr gateway; 2414 }; 2415 2416 static struct rt6_info *__ip6_route_redirect(struct net *net, 2417 struct fib6_table *table, 2418 struct flowi6 *fl6, 2419 const struct sk_buff *skb, 2420 int flags) 2421 { 2422 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; 2423 struct rt6_info *ret = NULL, *rt_cache; 2424 struct fib6_info *rt; 2425 struct fib6_node *fn; 2426 2427 /* Get the "current" route for this destination and 2428 * check if the redirect has come from appropriate router. 2429 * 2430 * RFC 4861 specifies that redirects should only be 2431 * accepted if they come from the nexthop to the target. 2432 * Due to the way the routes are chosen, this notion 2433 * is a bit fuzzy and one might need to check all possible 2434 * routes. 2435 */ 2436 2437 rcu_read_lock(); 2438 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 2439 restart: 2440 for_each_fib6_node_rt_rcu(fn) { 2441 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) 2442 continue; 2443 if (fib6_check_expired(rt)) 2444 continue; 2445 if (rt->fib6_flags & RTF_REJECT) 2446 break; 2447 if (!(rt->fib6_flags & RTF_GATEWAY)) 2448 continue; 2449 if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex) 2450 continue; 2451 /* rt_cache's gateway might be different from its 'parent' 2452 * in the case of an ip redirect. 2453 * So we keep searching in the exception table if the gateway 2454 * is different. 
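 *
 * e.g. (illustrative addresses): a parent route 2000::/3 via fe80::1
 * may carry a /128 exception created by an earlier redirect to
 * fe80::2; a redirect lookup whose rdfl->gateway is fe80::2 must then
 * be matched against the cached entry's rt6i_gateway below, not the
 * parent's fib6_nh.nh_gw.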
2455 */ 2456 if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) { 2457 rt_cache = rt6_find_cached_rt(rt, 2458 &fl6->daddr, 2459 &fl6->saddr); 2460 if (rt_cache && 2461 ipv6_addr_equal(&rdfl->gateway, 2462 &rt_cache->rt6i_gateway)) { 2463 ret = rt_cache; 2464 break; 2465 } 2466 continue; 2467 } 2468 break; 2469 } 2470 2471 if (!rt) 2472 rt = net->ipv6.fib6_null_entry; 2473 else if (rt->fib6_flags & RTF_REJECT) { 2474 ret = net->ipv6.ip6_null_entry; 2475 goto out; 2476 } 2477 2478 if (rt == net->ipv6.fib6_null_entry) { 2479 fn = fib6_backtrack(fn, &fl6->saddr); 2480 if (fn) 2481 goto restart; 2482 } 2483 2484 out: 2485 if (ret) 2486 dst_hold(&ret->dst); 2487 else 2488 ret = ip6_create_rt_rcu(rt); 2489 2490 rcu_read_unlock(); 2491 2492 trace_fib6_table_lookup(net, rt, table, fl6); 2493 return ret; 2494 }; 2495 2496 static struct dst_entry *ip6_route_redirect(struct net *net, 2497 const struct flowi6 *fl6, 2498 const struct sk_buff *skb, 2499 const struct in6_addr *gateway) 2500 { 2501 int flags = RT6_LOOKUP_F_HAS_SADDR; 2502 struct ip6rd_flowi rdfl; 2503 2504 rdfl.fl6 = *fl6; 2505 rdfl.gateway = *gateway; 2506 2507 return fib6_rule_lookup(net, &rdfl.fl6, skb, 2508 flags, __ip6_route_redirect); 2509 } 2510 2511 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, 2512 kuid_t uid) 2513 { 2514 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2515 struct dst_entry *dst; 2516 struct flowi6 fl6; 2517 2518 memset(&fl6, 0, sizeof(fl6)); 2519 fl6.flowi6_iif = LOOPBACK_IFINDEX; 2520 fl6.flowi6_oif = oif; 2521 fl6.flowi6_mark = mark; 2522 fl6.daddr = iph->daddr; 2523 fl6.saddr = iph->saddr; 2524 fl6.flowlabel = ip6_flowinfo(iph); 2525 fl6.flowi6_uid = uid; 2526 2527 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr); 2528 rt6_do_redirect(dst, NULL, skb); 2529 dst_release(dst); 2530 } 2531 EXPORT_SYMBOL_GPL(ip6_redirect); 2532 2533 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif, 2534 u32 mark) 2535 { 2536 const struct ipv6hdr *iph = ipv6_hdr(skb); 2537 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb); 2538 struct dst_entry *dst; 2539 struct flowi6 fl6; 2540 2541 memset(&fl6, 0, sizeof(fl6)); 2542 fl6.flowi6_iif = LOOPBACK_IFINDEX; 2543 fl6.flowi6_oif = oif; 2544 fl6.flowi6_mark = mark; 2545 fl6.daddr = msg->dest; 2546 fl6.saddr = iph->daddr; 2547 fl6.flowi6_uid = sock_net_uid(net, NULL); 2548 2549 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr); 2550 rt6_do_redirect(dst, NULL, skb); 2551 dst_release(dst); 2552 } 2553 2554 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) 2555 { 2556 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark, 2557 sk->sk_uid); 2558 } 2559 EXPORT_SYMBOL_GPL(ip6_sk_redirect); 2560 2561 static unsigned int ip6_default_advmss(const struct dst_entry *dst) 2562 { 2563 struct net_device *dev = dst->dev; 2564 unsigned int mtu = dst_mtu(dst); 2565 struct net *net = dev_net(dev); 2566 2567 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); 2568 2569 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) 2570 mtu = net->ipv6.sysctl.ip6_rt_min_advmss; 2571 2572 /* 2573 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 2574 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 
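 *
 * Worked example: with a standard 1500-byte Ethernet MTU the
 * advertised value is 1500 - sizeof(struct ipv6hdr) -
 * sizeof(struct tcphdr) = 1500 - 40 - 20 = 1440 bytes, subject to
 * the ip6_rt_min_advmss floor applied above.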
2575 * IPV6_MAXPLEN is also valid and means: "any MSS, 2576 * rely only on pmtu discovery" 2577 */ 2578 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr)) 2579 mtu = IPV6_MAXPLEN; 2580 return mtu; 2581 } 2582 2583 static unsigned int ip6_mtu(const struct dst_entry *dst) 2584 { 2585 struct inet6_dev *idev; 2586 unsigned int mtu; 2587 2588 mtu = dst_metric_raw(dst, RTAX_MTU); 2589 if (mtu) 2590 goto out; 2591 2592 mtu = IPV6_MIN_MTU; 2593 2594 rcu_read_lock(); 2595 idev = __in6_dev_get(dst->dev); 2596 if (idev) 2597 mtu = idev->cnf.mtu6; 2598 rcu_read_unlock(); 2599 2600 out: 2601 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 2602 2603 return mtu - lwtunnel_headroom(dst->lwtstate, mtu); 2604 } 2605 2606 /* MTU selection: 2607 * 1. mtu on route is locked - use it 2608 * 2. mtu from nexthop exception 2609 * 3. mtu from egress device 2610 * 2611 * based on ip6_dst_mtu_forward and exception logic of 2612 * rt6_find_cached_rt; called with rcu_read_lock 2613 */ 2614 u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr, 2615 struct in6_addr *saddr) 2616 { 2617 struct rt6_exception_bucket *bucket; 2618 struct rt6_exception *rt6_ex; 2619 struct in6_addr *src_key; 2620 struct inet6_dev *idev; 2621 u32 mtu = 0; 2622 2623 if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) { 2624 mtu = f6i->fib6_pmtu; 2625 if (mtu) 2626 goto out; 2627 } 2628 2629 src_key = NULL; 2630 #ifdef CONFIG_IPV6_SUBTREES 2631 if (f6i->fib6_src.plen) 2632 src_key = saddr; 2633 #endif 2634 2635 bucket = rcu_dereference(f6i->rt6i_exception_bucket); 2636 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); 2637 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i)) 2638 mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU); 2639 2640 if (likely(!mtu)) { 2641 struct net_device *dev = fib6_info_nh_dev(f6i); 2642 2643 mtu = IPV6_MIN_MTU; 2644 idev = __in6_dev_get(dev); 2645 if (idev && idev->cnf.mtu6 > mtu) 2646 mtu = idev->cnf.mtu6; 2647 } 2648 2649 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 2650 out: 2651 return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu); 2652 } 2653 2654 struct dst_entry *icmp6_dst_alloc(struct net_device *dev, 2655 struct flowi6 *fl6) 2656 { 2657 struct dst_entry *dst; 2658 struct rt6_info *rt; 2659 struct inet6_dev *idev = in6_dev_get(dev); 2660 struct net *net = dev_net(dev); 2661 2662 if (unlikely(!idev)) 2663 return ERR_PTR(-ENODEV); 2664 2665 rt = ip6_dst_alloc(net, dev, 0); 2666 if (unlikely(!rt)) { 2667 in6_dev_put(idev); 2668 dst = ERR_PTR(-ENOMEM); 2669 goto out; 2670 } 2671 2672 rt->dst.flags |= DST_HOST; 2673 rt->dst.input = ip6_input; 2674 rt->dst.output = ip6_output; 2675 rt->rt6i_gateway = fl6->daddr; 2676 rt->rt6i_dst.addr = fl6->daddr; 2677 rt->rt6i_dst.plen = 128; 2678 rt->rt6i_idev = idev; 2679 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0); 2680 2681 /* Add this dst into uncached_list so that rt6_disable_ip() can 2682 * do proper release of the net_device 2683 */ 2684 rt6_uncached_list_add(rt); 2685 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); 2686 2687 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0); 2688 2689 out: 2690 return dst; 2691 } 2692 2693 static int ip6_dst_gc(struct dst_ops *ops) 2694 { 2695 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); 2696 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; 2697 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size; 2698 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; 2699 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; 2700 unsigned long rt_last_gc = 
net->ipv6.ip6_rt_last_gc; 2701 int entries; 2702 2703 entries = dst_entries_get_fast(ops); 2704 if (time_after(rt_last_gc + rt_min_interval, jiffies) && 2705 entries <= rt_max_size) 2706 goto out; 2707 2708 net->ipv6.ip6_rt_gc_expire++; 2709 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true); 2710 entries = dst_entries_get_slow(ops); 2711 if (entries < ops->gc_thresh) 2712 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; 2713 out: 2714 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity; 2715 return entries > rt_max_size; 2716 } 2717 2718 static int ip6_convert_metrics(struct net *net, struct fib6_info *rt, 2719 struct fib6_config *cfg) 2720 { 2721 struct dst_metrics *p; 2722 2723 if (!cfg->fc_mx) 2724 return 0; 2725 2726 p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL); 2727 if (unlikely(!p)) 2728 return -ENOMEM; 2729 2730 refcount_set(&p->refcnt, 1); 2731 rt->fib6_metrics = p; 2732 2733 return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics); 2734 } 2735 2736 static struct rt6_info *ip6_nh_lookup_table(struct net *net, 2737 struct fib6_config *cfg, 2738 const struct in6_addr *gw_addr, 2739 u32 tbid, int flags) 2740 { 2741 struct flowi6 fl6 = { 2742 .flowi6_oif = cfg->fc_ifindex, 2743 .daddr = *gw_addr, 2744 .saddr = cfg->fc_prefsrc, 2745 }; 2746 struct fib6_table *table; 2747 struct rt6_info *rt; 2748 2749 table = fib6_get_table(net, tbid); 2750 if (!table) 2751 return NULL; 2752 2753 if (!ipv6_addr_any(&cfg->fc_prefsrc)) 2754 flags |= RT6_LOOKUP_F_HAS_SADDR; 2755 2756 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE; 2757 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags); 2758 2759 /* if table lookup failed, fall back to full lookup */ 2760 if (rt == net->ipv6.ip6_null_entry) { 2761 ip6_rt_put(rt); 2762 rt = NULL; 2763 } 2764 2765 return rt; 2766 } 2767 2768 static int ip6_route_check_nh_onlink(struct net *net, 2769 struct fib6_config *cfg, 2770 const struct net_device *dev, 2771 struct netlink_ext_ack *extack) 2772 { 2773 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN; 2774 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2775 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT; 2776 struct rt6_info *grt; 2777 int err; 2778 2779 err = 0; 2780 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0); 2781 if (grt) { 2782 if (!grt->dst.error && 2783 (grt->rt6i_flags & flags || dev != grt->dst.dev)) { 2784 NL_SET_ERR_MSG(extack, 2785 "Nexthop has invalid gateway or device mismatch"); 2786 err = -EINVAL; 2787 } 2788 2789 ip6_rt_put(grt); 2790 } 2791 2792 return err; 2793 } 2794 2795 static int ip6_route_check_nh(struct net *net, 2796 struct fib6_config *cfg, 2797 struct net_device **_dev, 2798 struct inet6_dev **idev) 2799 { 2800 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2801 struct net_device *dev = _dev ? 
*_dev : NULL; 2802 struct rt6_info *grt = NULL; 2803 int err = -EHOSTUNREACH; 2804 2805 if (cfg->fc_table) { 2806 int flags = RT6_LOOKUP_F_IFACE; 2807 2808 grt = ip6_nh_lookup_table(net, cfg, gw_addr, 2809 cfg->fc_table, flags); 2810 if (grt) { 2811 if (grt->rt6i_flags & RTF_GATEWAY || 2812 (dev && dev != grt->dst.dev)) { 2813 ip6_rt_put(grt); 2814 grt = NULL; 2815 } 2816 } 2817 } 2818 2819 if (!grt) 2820 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1); 2821 2822 if (!grt) 2823 goto out; 2824 2825 if (dev) { 2826 if (dev != grt->dst.dev) { 2827 ip6_rt_put(grt); 2828 goto out; 2829 } 2830 } else { 2831 *_dev = dev = grt->dst.dev; 2832 *idev = grt->rt6i_idev; 2833 dev_hold(dev); 2834 in6_dev_hold(grt->rt6i_idev); 2835 } 2836 2837 if (!(grt->rt6i_flags & RTF_GATEWAY)) 2838 err = 0; 2839 2840 ip6_rt_put(grt); 2841 2842 out: 2843 return err; 2844 } 2845 2846 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg, 2847 struct net_device **_dev, struct inet6_dev **idev, 2848 struct netlink_ext_ack *extack) 2849 { 2850 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2851 int gwa_type = ipv6_addr_type(gw_addr); 2852 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true; 2853 const struct net_device *dev = *_dev; 2854 bool need_addr_check = !dev; 2855 int err = -EINVAL; 2856 2857 /* if gw_addr is local we will fail to detect this in case 2858 * address is still TENTATIVE (DAD in progress). rt6_lookup() 2859 * will return already-added prefix route via interface that 2860 * prefix route was assigned to, which might be non-loopback. 2861 */ 2862 if (dev && 2863 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 2864 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 2865 goto out; 2866 } 2867 2868 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) { 2869 /* IPv6 strictly inhibits using non-link-local 2870 * addresses as a nexthop address. 2871 * Otherwise, the router will not be able to send redirects. 2872 * It is very good, but in some (rare!) circumstances 2873 * (SIT, PtP, NBMA NOARP links) it is handy to allow 2874 * some exceptions. --ANK 2875 * We allow IPv4-mapped nexthops to support RFC4798-type 2876 * addressing 2877 */ 2878 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) { 2879 NL_SET_ERR_MSG(extack, "Invalid gateway address"); 2880 goto out; 2881 } 2882 2883 if (cfg->fc_flags & RTNH_F_ONLINK) 2884 err = ip6_route_check_nh_onlink(net, cfg, dev, extack); 2885 else 2886 err = ip6_route_check_nh(net, cfg, _dev, idev); 2887 2888 if (err) 2889 goto out; 2890 } 2891 2892 /* reload in case device was changed */ 2893 dev = *_dev; 2894 2895 err = -EINVAL; 2896 if (!dev) { 2897 NL_SET_ERR_MSG(extack, "Egress device not specified"); 2898 goto out; 2899 } else if (dev->flags & IFF_LOOPBACK) { 2900 NL_SET_ERR_MSG(extack, 2901 "Egress device can not be loopback device for this route"); 2902 goto out; 2903 } 2904 2905 /* if we did not check gw_addr above, do so now that the 2906 * egress device has been resolved.
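 * (need_addr_check is true only when the caller passed no device,
 * which is exactly when the ipv6_chk_addr_and_flags() call at the
 * top of this function was skipped.)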
2907 */ 2908 if (need_addr_check && 2909 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 2910 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 2911 goto out; 2912 } 2913 2914 err = 0; 2915 out: 2916 return err; 2917 } 2918 2919 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, 2920 gfp_t gfp_flags, 2921 struct netlink_ext_ack *extack) 2922 { 2923 struct net *net = cfg->fc_nlinfo.nl_net; 2924 struct fib6_info *rt = NULL; 2925 struct net_device *dev = NULL; 2926 struct inet6_dev *idev = NULL; 2927 struct fib6_table *table; 2928 int addr_type; 2929 int err = -EINVAL; 2930 2931 /* RTF_PCPU is an internal flag; can not be set by userspace */ 2932 if (cfg->fc_flags & RTF_PCPU) { 2933 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU"); 2934 goto out; 2935 } 2936 2937 /* RTF_CACHE is an internal flag; can not be set by userspace */ 2938 if (cfg->fc_flags & RTF_CACHE) { 2939 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE"); 2940 goto out; 2941 } 2942 2943 if (cfg->fc_type > RTN_MAX) { 2944 NL_SET_ERR_MSG(extack, "Invalid route type"); 2945 goto out; 2946 } 2947 2948 if (cfg->fc_dst_len > 128) { 2949 NL_SET_ERR_MSG(extack, "Invalid prefix length"); 2950 goto out; 2951 } 2952 if (cfg->fc_src_len > 128) { 2953 NL_SET_ERR_MSG(extack, "Invalid source address length"); 2954 goto out; 2955 } 2956 #ifndef CONFIG_IPV6_SUBTREES 2957 if (cfg->fc_src_len) { 2958 NL_SET_ERR_MSG(extack, 2959 "Specifying source address requires IPV6_SUBTREES to be enabled"); 2960 goto out; 2961 } 2962 #endif 2963 if (cfg->fc_ifindex) { 2964 err = -ENODEV; 2965 dev = dev_get_by_index(net, cfg->fc_ifindex); 2966 if (!dev) 2967 goto out; 2968 idev = in6_dev_get(dev); 2969 if (!idev) 2970 goto out; 2971 } 2972 2973 if (cfg->fc_metric == 0) 2974 cfg->fc_metric = IP6_RT_PRIO_USER; 2975 2976 if (cfg->fc_flags & RTNH_F_ONLINK) { 2977 if (!dev) { 2978 NL_SET_ERR_MSG(extack, 2979 "Nexthop device required for onlink"); 2980 err = -ENODEV; 2981 goto out; 2982 } 2983 2984 if (!(dev->flags & IFF_UP)) { 2985 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 2986 err = -ENETDOWN; 2987 goto out; 2988 } 2989 } 2990 2991 err = -ENOBUFS; 2992 if (cfg->fc_nlinfo.nlh && 2993 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) { 2994 table = fib6_get_table(net, cfg->fc_table); 2995 if (!table) { 2996 pr_warn("NLM_F_CREATE should be specified when creating new route\n"); 2997 table = fib6_new_table(net, cfg->fc_table); 2998 } 2999 } else { 3000 table = fib6_new_table(net, cfg->fc_table); 3001 } 3002 3003 if (!table) 3004 goto out; 3005 3006 err = -ENOMEM; 3007 rt = fib6_info_alloc(gfp_flags); 3008 if (!rt) 3009 goto out; 3010 3011 if (cfg->fc_flags & RTF_ADDRCONF) 3012 rt->dst_nocount = true; 3013 3014 err = ip6_convert_metrics(net, rt, cfg); 3015 if (err < 0) 3016 goto out; 3017 3018 if (cfg->fc_flags & RTF_EXPIRES) 3019 fib6_set_expires(rt, jiffies + 3020 clock_t_to_jiffies(cfg->fc_expires)); 3021 else 3022 fib6_clean_expires(rt); 3023 3024 if (cfg->fc_protocol == RTPROT_UNSPEC) 3025 cfg->fc_protocol = RTPROT_BOOT; 3026 rt->fib6_protocol = cfg->fc_protocol; 3027 3028 addr_type = ipv6_addr_type(&cfg->fc_dst); 3029 3030 if (cfg->fc_encap) { 3031 struct lwtunnel_state *lwtstate; 3032 3033 err = lwtunnel_build_state(cfg->fc_encap_type, 3034 cfg->fc_encap, AF_INET6, cfg, 3035 &lwtstate, extack); 3036 if (err) 3037 goto out; 3038 rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate); 3039 } 3040 3041 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); 3042 rt->fib6_dst.plen = 
cfg->fc_dst_len; 3043 if (rt->fib6_dst.plen == 128) 3044 rt->dst_host = true; 3045 3046 #ifdef CONFIG_IPV6_SUBTREES 3047 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len); 3048 rt->fib6_src.plen = cfg->fc_src_len; 3049 #endif 3050 3051 rt->fib6_metric = cfg->fc_metric; 3052 rt->fib6_nh.nh_weight = 1; 3053 3054 rt->fib6_type = cfg->fc_type; 3055 3056 /* We cannot add true routes via loopback here, 3057 they would result in kernel looping; promote them to reject routes 3058 */ 3059 if ((cfg->fc_flags & RTF_REJECT) || 3060 (dev && (dev->flags & IFF_LOOPBACK) && 3061 !(addr_type & IPV6_ADDR_LOOPBACK) && 3062 !(cfg->fc_flags & RTF_LOCAL))) { 3063 /* hold loopback dev/idev if we haven't done so. */ 3064 if (dev != net->loopback_dev) { 3065 if (dev) { 3066 dev_put(dev); 3067 in6_dev_put(idev); 3068 } 3069 dev = net->loopback_dev; 3070 dev_hold(dev); 3071 idev = in6_dev_get(dev); 3072 if (!idev) { 3073 err = -ENODEV; 3074 goto out; 3075 } 3076 } 3077 rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP; 3078 goto install_route; 3079 } 3080 3081 if (cfg->fc_flags & RTF_GATEWAY) { 3082 err = ip6_validate_gw(net, cfg, &dev, &idev, extack); 3083 if (err) 3084 goto out; 3085 3086 rt->fib6_nh.nh_gw = cfg->fc_gateway; 3087 } 3088 3089 err = -ENODEV; 3090 if (!dev) 3091 goto out; 3092 3093 if (idev->cnf.disable_ipv6) { 3094 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device"); 3095 err = -EACCES; 3096 goto out; 3097 } 3098 3099 if (!(dev->flags & IFF_UP)) { 3100 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 3101 err = -ENETDOWN; 3102 goto out; 3103 } 3104 3105 if (!ipv6_addr_any(&cfg->fc_prefsrc)) { 3106 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { 3107 NL_SET_ERR_MSG(extack, "Invalid source address"); 3108 err = -EINVAL; 3109 goto out; 3110 } 3111 rt->fib6_prefsrc.addr = cfg->fc_prefsrc; 3112 rt->fib6_prefsrc.plen = 128; 3113 } else 3114 rt->fib6_prefsrc.plen = 0; 3115 3116 rt->fib6_flags = cfg->fc_flags; 3117 3118 install_route: 3119 if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) && 3120 !netif_carrier_ok(dev)) 3121 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN; 3122 rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK); 3123 rt->fib6_nh.nh_dev = dev; 3124 rt->fib6_table = table; 3125 3126 cfg->fc_nlinfo.nl_net = dev_net(dev); 3127 3128 if (idev) 3129 in6_dev_put(idev); 3130 3131 return rt; 3132 out: 3133 if (dev) 3134 dev_put(dev); 3135 if (idev) 3136 in6_dev_put(idev); 3137 3138 fib6_info_release(rt); 3139 return ERR_PTR(err); 3140 } 3141 3142 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, 3143 struct netlink_ext_ack *extack) 3144 { 3145 struct fib6_info *rt; 3146 int err; 3147 3148 rt = ip6_route_info_create(cfg, gfp_flags, extack); 3149 if (IS_ERR(rt)) 3150 return PTR_ERR(rt); 3151 3152 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack); 3153 fib6_info_release(rt); 3154 3155 return err; 3156 } 3157 3158 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info) 3159 { 3160 struct net *net = info->nl_net; 3161 struct fib6_table *table; 3162 int err; 3163 3164 if (rt == net->ipv6.fib6_null_entry) { 3165 err = -ENOENT; 3166 goto out; 3167 } 3168 3169 table = rt->fib6_table; 3170 spin_lock_bh(&table->tb6_lock); 3171 err = fib6_del(rt, info); 3172 spin_unlock_bh(&table->tb6_lock); 3173 3174 out: 3175 fib6_info_release(rt); 3176 return err; 3177 } 3178 3179 int ip6_del_rt(struct net *net, struct fib6_info *rt) 3180 { 3181 struct nl_info info = { .nl_net = net }; 3182 3183 return __ip6_del_rt(rt, &info); 3184 } 3185 3186 static int __ip6_del_rt_siblings(struct 
fib6_info *rt, struct fib6_config *cfg) 3187 { 3188 struct nl_info *info = &cfg->fc_nlinfo; 3189 struct net *net = info->nl_net; 3190 struct sk_buff *skb = NULL; 3191 struct fib6_table *table; 3192 int err = -ENOENT; 3193 3194 if (rt == net->ipv6.fib6_null_entry) 3195 goto out_put; 3196 table = rt->fib6_table; 3197 spin_lock_bh(&table->tb6_lock); 3198 3199 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) { 3200 struct fib6_info *sibling, *next_sibling; 3201 3202 /* prefer to send a single notification with all hops */ 3203 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 3204 if (skb) { 3205 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; 3206 3207 if (rt6_fill_node(net, skb, rt, NULL, 3208 NULL, NULL, 0, RTM_DELROUTE, 3209 info->portid, seq, 0) < 0) { 3210 kfree_skb(skb); 3211 skb = NULL; 3212 } else 3213 info->skip_notify = 1; 3214 } 3215 3216 list_for_each_entry_safe(sibling, next_sibling, 3217 &rt->fib6_siblings, 3218 fib6_siblings) { 3219 err = fib6_del(sibling, info); 3220 if (err) 3221 goto out_unlock; 3222 } 3223 } 3224 3225 err = fib6_del(rt, info); 3226 out_unlock: 3227 spin_unlock_bh(&table->tb6_lock); 3228 out_put: 3229 fib6_info_release(rt); 3230 3231 if (skb) { 3232 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 3233 info->nlh, gfp_any()); 3234 } 3235 return err; 3236 } 3237 3238 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) 3239 { 3240 int rc = -ESRCH; 3241 3242 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex) 3243 goto out; 3244 3245 if (cfg->fc_flags & RTF_GATEWAY && 3246 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) 3247 goto out; 3248 if (dst_hold_safe(&rt->dst)) 3249 rc = rt6_remove_exception_rt(rt); 3250 out: 3251 return rc; 3252 } 3253 3254 static int ip6_route_del(struct fib6_config *cfg, 3255 struct netlink_ext_ack *extack) 3256 { 3257 struct rt6_info *rt_cache; 3258 struct fib6_table *table; 3259 struct fib6_info *rt; 3260 struct fib6_node *fn; 3261 int err = -ESRCH; 3262 3263 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); 3264 if (!table) { 3265 NL_SET_ERR_MSG(extack, "FIB table does not exist"); 3266 return err; 3267 } 3268 3269 rcu_read_lock(); 3270 3271 fn = fib6_locate(&table->tb6_root, 3272 &cfg->fc_dst, cfg->fc_dst_len, 3273 &cfg->fc_src, cfg->fc_src_len, 3274 !(cfg->fc_flags & RTF_CACHE)); 3275 3276 if (fn) { 3277 for_each_fib6_node_rt_rcu(fn) { 3278 if (cfg->fc_flags & RTF_CACHE) { 3279 int rc; 3280 3281 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst, 3282 &cfg->fc_src); 3283 if (rt_cache) { 3284 rc = ip6_del_cached_rt(rt_cache, cfg); 3285 if (rc != -ESRCH) { 3286 rcu_read_unlock(); 3287 return rc; 3288 } 3289 } 3290 continue; 3291 } 3292 if (cfg->fc_ifindex && 3293 (!rt->fib6_nh.nh_dev || 3294 rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex)) 3295 continue; 3296 if (cfg->fc_flags & RTF_GATEWAY && 3297 !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw)) 3298 continue; 3299 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric) 3300 continue; 3301 if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol) 3302 continue; 3303 fib6_info_hold(rt); 3304 rcu_read_unlock(); 3305 3306 /* if gateway was specified only delete the one hop */ 3307 if (cfg->fc_flags & RTF_GATEWAY) 3308 return __ip6_del_rt(rt, &cfg->fc_nlinfo); 3309 3310 return __ip6_del_rt_siblings(rt, cfg); 3311 } 3312 } 3313 rcu_read_unlock(); 3314 3315 return err; 3316 } 3317 3318 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) 3319 { 3320 struct netevent_redirect netevent; 
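	/* Summary of the steps below: validate the ICMPv6 rd_msg and its
	 * options, apply the RFC 4861 acceptance checks, update the
	 * neighbour cache via ndisc_update(), clone the route with
	 * ip6_rt_cache_alloc() and insert the clone as an exception
	 * entry, then raise a NETEVENT_REDIRECT notification.
	 */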
3321 struct rt6_info *rt, *nrt = NULL; 3322 struct ndisc_options ndopts; 3323 struct inet6_dev *in6_dev; 3324 struct neighbour *neigh; 3325 struct fib6_info *from; 3326 struct rd_msg *msg; 3327 int optlen, on_link; 3328 u8 *lladdr; 3329 3330 optlen = skb_tail_pointer(skb) - skb_transport_header(skb); 3331 optlen -= sizeof(*msg); 3332 3333 if (optlen < 0) { 3334 net_dbg_ratelimited("rt6_do_redirect: packet too short\n"); 3335 return; 3336 } 3337 3338 msg = (struct rd_msg *)icmp6_hdr(skb); 3339 3340 if (ipv6_addr_is_multicast(&msg->dest)) { 3341 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n"); 3342 return; 3343 } 3344 3345 on_link = 0; 3346 if (ipv6_addr_equal(&msg->dest, &msg->target)) { 3347 on_link = 1; 3348 } else if (ipv6_addr_type(&msg->target) != 3349 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { 3350 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n"); 3351 return; 3352 } 3353 3354 in6_dev = __in6_dev_get(skb->dev); 3355 if (!in6_dev) 3356 return; 3357 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects) 3358 return; 3359 3360 /* RFC2461 8.1: 3361 * The IP source address of the Redirect MUST be the same as the current 3362 * first-hop router for the specified ICMP Destination Address. 3363 */ 3364 3365 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) { 3366 net_dbg_ratelimited("rt6_redirect: invalid ND options\n"); 3367 return; 3368 } 3369 3370 lladdr = NULL; 3371 if (ndopts.nd_opts_tgt_lladdr) { 3372 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, 3373 skb->dev); 3374 if (!lladdr) { 3375 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n"); 3376 return; 3377 } 3378 } 3379 3380 rt = (struct rt6_info *) dst; 3381 if (rt->rt6i_flags & RTF_REJECT) { 3382 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n"); 3383 return; 3384 } 3385 3386 /* Redirect received -> path was valid. 3387 * Look, redirects are sent only in response to data packets, 3388 * so that this nexthop apparently is reachable. --ANK 3389 */ 3390 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr); 3391 3392 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1); 3393 if (!neigh) 3394 return; 3395 3396 /* 3397 * We have finally decided to accept it. 3398 */ 3399 3400 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, 3401 NEIGH_UPDATE_F_WEAK_OVERRIDE| 3402 NEIGH_UPDATE_F_OVERRIDE| 3403 (on_link ? 
0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER| 3404 NEIGH_UPDATE_F_ISROUTER)), 3405 NDISC_REDIRECT, &ndopts); 3406 3407 rcu_read_lock(); 3408 from = rcu_dereference(rt->from); 3409 fib6_info_hold(from); 3410 rcu_read_unlock(); 3411 3412 nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL); 3413 if (!nrt) 3414 goto out; 3415 3416 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE; 3417 if (on_link) 3418 nrt->rt6i_flags &= ~RTF_GATEWAY; 3419 3420 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; 3421 3422 /* No need to remove rt from the exception table if rt is 3423 * a cached route because rt6_insert_exception() will 3424 * take care of it 3425 */ 3426 if (rt6_insert_exception(nrt, from)) { 3427 dst_release_immediate(&nrt->dst); 3428 goto out; 3429 } 3430 3431 netevent.old = &rt->dst; 3432 netevent.new = &nrt->dst; 3433 netevent.daddr = &msg->dest; 3434 netevent.neigh = neigh; 3435 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); 3436 3437 out: 3438 fib6_info_release(from); 3439 neigh_release(neigh); 3440 } 3441 3442 #ifdef CONFIG_IPV6_ROUTE_INFO 3443 static struct fib6_info *rt6_get_route_info(struct net *net, 3444 const struct in6_addr *prefix, int prefixlen, 3445 const struct in6_addr *gwaddr, 3446 struct net_device *dev) 3447 { 3448 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; 3449 int ifindex = dev->ifindex; 3450 struct fib6_node *fn; 3451 struct fib6_info *rt = NULL; 3452 struct fib6_table *table; 3453 3454 table = fib6_get_table(net, tb_id); 3455 if (!table) 3456 return NULL; 3457 3458 rcu_read_lock(); 3459 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true); 3460 if (!fn) 3461 goto out; 3462 3463 for_each_fib6_node_rt_rcu(fn) { 3464 if (rt->fib6_nh.nh_dev->ifindex != ifindex) 3465 continue; 3466 if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY)) 3467 continue; 3468 if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr)) 3469 continue; 3470 fib6_info_hold(rt); 3471 break; 3472 } 3473 out: 3474 rcu_read_unlock(); 3475 return rt; 3476 } 3477 3478 static struct fib6_info *rt6_add_route_info(struct net *net, 3479 const struct in6_addr *prefix, int prefixlen, 3480 const struct in6_addr *gwaddr, 3481 struct net_device *dev, 3482 unsigned int pref) 3483 { 3484 struct fib6_config cfg = { 3485 .fc_metric = IP6_RT_PRIO_USER, 3486 .fc_ifindex = dev->ifindex, 3487 .fc_dst_len = prefixlen, 3488 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | 3489 RTF_UP | RTF_PREF(pref), 3490 .fc_protocol = RTPROT_RA, 3491 .fc_type = RTN_UNICAST, 3492 .fc_nlinfo.portid = 0, 3493 .fc_nlinfo.nlh = NULL, 3494 .fc_nlinfo.nl_net = net, 3495 }; 3496 3497 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; 3498 cfg.fc_dst = *prefix; 3499 cfg.fc_gateway = *gwaddr; 3500 3501 /* We should treat it as a default route if prefix length is 0. */ 3502 if (!prefixlen) 3503 cfg.fc_flags |= RTF_DEFAULT; 3504 3505 ip6_route_add(&cfg, GFP_ATOMIC, NULL); 3506 3507 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev); 3508 } 3509 #endif 3510 3511 struct fib6_info *rt6_get_dflt_router(struct net *net, 3512 const struct in6_addr *addr, 3513 struct net_device *dev) 3514 { 3515 u32 tb_id = l3mdev_fib_table(dev) ?
: RT6_TABLE_DFLT; 3516 struct fib6_info *rt; 3517 struct fib6_table *table; 3518 3519 table = fib6_get_table(net, tb_id); 3520 if (!table) 3521 return NULL; 3522 3523 rcu_read_lock(); 3524 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3525 if (dev == rt->fib6_nh.nh_dev && 3526 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && 3527 ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr)) 3528 break; 3529 } 3530 if (rt) 3531 fib6_info_hold(rt); 3532 rcu_read_unlock(); 3533 return rt; 3534 } 3535 3536 struct fib6_info *rt6_add_dflt_router(struct net *net, 3537 const struct in6_addr *gwaddr, 3538 struct net_device *dev, 3539 unsigned int pref) 3540 { 3541 struct fib6_config cfg = { 3542 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT, 3543 .fc_metric = IP6_RT_PRIO_USER, 3544 .fc_ifindex = dev->ifindex, 3545 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | 3546 RTF_UP | RTF_EXPIRES | RTF_PREF(pref), 3547 .fc_protocol = RTPROT_RA, 3548 .fc_type = RTN_UNICAST, 3549 .fc_nlinfo.portid = 0, 3550 .fc_nlinfo.nlh = NULL, 3551 .fc_nlinfo.nl_net = net, 3552 }; 3553 3554 cfg.fc_gateway = *gwaddr; 3555 3556 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) { 3557 struct fib6_table *table; 3558 3559 table = fib6_get_table(dev_net(dev), cfg.fc_table); 3560 if (table) 3561 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER; 3562 } 3563 3564 return rt6_get_dflt_router(net, gwaddr, dev); 3565 } 3566 3567 static void __rt6_purge_dflt_routers(struct net *net, 3568 struct fib6_table *table) 3569 { 3570 struct fib6_info *rt; 3571 3572 restart: 3573 rcu_read_lock(); 3574 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3575 struct net_device *dev = fib6_info_nh_dev(rt); 3576 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL; 3577 3578 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) && 3579 (!idev || idev->cnf.accept_ra != 2)) { 3580 fib6_info_hold(rt); 3581 rcu_read_unlock(); 3582 ip6_del_rt(net, rt); 3583 goto restart; 3584 } 3585 } 3586 rcu_read_unlock(); 3587 3588 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER; 3589 } 3590 3591 void rt6_purge_dflt_routers(struct net *net) 3592 { 3593 struct fib6_table *table; 3594 struct hlist_head *head; 3595 unsigned int h; 3596 3597 rcu_read_lock(); 3598 3599 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { 3600 head = &net->ipv6.fib_table_hash[h]; 3601 hlist_for_each_entry_rcu(table, head, tb6_hlist) { 3602 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER) 3603 __rt6_purge_dflt_routers(net, table); 3604 } 3605 } 3606 3607 rcu_read_unlock(); 3608 } 3609 3610 static void rtmsg_to_fib6_config(struct net *net, 3611 struct in6_rtmsg *rtmsg, 3612 struct fib6_config *cfg) 3613 { 3614 memset(cfg, 0, sizeof(*cfg)); 3615 3616 cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? 
3617 : RT6_TABLE_MAIN; 3618 cfg->fc_ifindex = rtmsg->rtmsg_ifindex; 3619 cfg->fc_metric = rtmsg->rtmsg_metric; 3620 cfg->fc_expires = rtmsg->rtmsg_info; 3621 cfg->fc_dst_len = rtmsg->rtmsg_dst_len; 3622 cfg->fc_src_len = rtmsg->rtmsg_src_len; 3623 cfg->fc_flags = rtmsg->rtmsg_flags; 3624 cfg->fc_type = rtmsg->rtmsg_type; 3625 3626 cfg->fc_nlinfo.nl_net = net; 3627 3628 cfg->fc_dst = rtmsg->rtmsg_dst; 3629 cfg->fc_src = rtmsg->rtmsg_src; 3630 cfg->fc_gateway = rtmsg->rtmsg_gateway; 3631 } 3632 3633 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) 3634 { 3635 struct fib6_config cfg; 3636 struct in6_rtmsg rtmsg; 3637 int err; 3638 3639 switch (cmd) { 3640 case SIOCADDRT: /* Add a route */ 3641 case SIOCDELRT: /* Delete a route */ 3642 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 3643 return -EPERM; 3644 err = copy_from_user(&rtmsg, arg, 3645 sizeof(struct in6_rtmsg)); 3646 if (err) 3647 return -EFAULT; 3648 3649 rtmsg_to_fib6_config(net, &rtmsg, &cfg); 3650 3651 rtnl_lock(); 3652 switch (cmd) { 3653 case SIOCADDRT: 3654 err = ip6_route_add(&cfg, GFP_KERNEL, NULL); 3655 break; 3656 case SIOCDELRT: 3657 err = ip6_route_del(&cfg, NULL); 3658 break; 3659 default: 3660 err = -EINVAL; 3661 } 3662 rtnl_unlock(); 3663 3664 return err; 3665 } 3666 3667 return -EINVAL; 3668 } 3669 3670 /* 3671 * Drop the packet on the floor 3672 */ 3673 3674 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) 3675 { 3676 int type; 3677 struct dst_entry *dst = skb_dst(skb); 3678 switch (ipstats_mib_noroutes) { 3679 case IPSTATS_MIB_INNOROUTES: 3680 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); 3681 if (type == IPV6_ADDR_ANY) { 3682 IP6_INC_STATS(dev_net(dst->dev), 3683 __in6_dev_get_safely(skb->dev), 3684 IPSTATS_MIB_INADDRERRORS); 3685 break; 3686 } 3687 /* FALLTHROUGH */ 3688 case IPSTATS_MIB_OUTNOROUTES: 3689 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), 3690 ipstats_mib_noroutes); 3691 break; 3692 } 3693 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0); 3694 kfree_skb(skb); 3695 return 0; 3696 } 3697 3698 static int ip6_pkt_discard(struct sk_buff *skb) 3699 { 3700 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES); 3701 } 3702 3703 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3704 { 3705 skb->dev = skb_dst(skb)->dev; 3706 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); 3707 } 3708 3709 static int ip6_pkt_prohibit(struct sk_buff *skb) 3710 { 3711 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES); 3712 } 3713 3714 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3715 { 3716 skb->dev = skb_dst(skb)->dev; 3717 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); 3718 } 3719 3720 /* 3721 * Allocate a dst for local (unicast / anycast) address. 
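 *
 * Illustrative caller sketch (addrconf-style; names assumed, not a
 * verbatim call site): installing the host route for a freshly added
 * address would look roughly like
 *
 *	f6i = addrconf_f6i_alloc(net, idev, &ifp->addr, false, GFP_ATOMIC);
 *	if (!IS_ERR(f6i))
 *		ip6_ins_rt(net, f6i);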
3722 */ 3723 3724 struct fib6_info *addrconf_f6i_alloc(struct net *net, 3725 struct inet6_dev *idev, 3726 const struct in6_addr *addr, 3727 bool anycast, gfp_t gfp_flags) 3728 { 3729 u32 tb_id; 3730 struct net_device *dev = idev->dev; 3731 struct fib6_info *f6i; 3732 3733 f6i = fib6_info_alloc(gfp_flags); 3734 if (!f6i) 3735 return ERR_PTR(-ENOMEM); 3736 3737 f6i->dst_nocount = true; 3738 f6i->dst_host = true; 3739 f6i->fib6_protocol = RTPROT_KERNEL; 3740 f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP; 3741 if (anycast) { 3742 f6i->fib6_type = RTN_ANYCAST; 3743 f6i->fib6_flags |= RTF_ANYCAST; 3744 } else { 3745 f6i->fib6_type = RTN_LOCAL; 3746 f6i->fib6_flags |= RTF_LOCAL; 3747 } 3748 3749 f6i->fib6_nh.nh_gw = *addr; 3750 dev_hold(dev); 3751 f6i->fib6_nh.nh_dev = dev; 3752 f6i->fib6_dst.addr = *addr; 3753 f6i->fib6_dst.plen = 128; 3754 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL; 3755 f6i->fib6_table = fib6_get_table(net, tb_id); 3756 3757 return f6i; 3758 } 3759 3760 /* remove deleted IP from prefsrc entries */ 3761 struct arg_dev_net_ip { 3762 struct net_device *dev; 3763 struct net *net; 3764 struct in6_addr *addr; 3765 }; 3766 3767 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg) 3768 { 3769 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev; 3770 struct net *net = ((struct arg_dev_net_ip *)arg)->net; 3771 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; 3772 3773 if (((void *)rt->fib6_nh.nh_dev == dev || !dev) && 3774 rt != net->ipv6.fib6_null_entry && 3775 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) { 3776 spin_lock_bh(&rt6_exception_lock); 3777 /* remove prefsrc entry */ 3778 rt->fib6_prefsrc.plen = 0; 3779 /* need to update cache as well */ 3780 rt6_exceptions_remove_prefsrc(rt); 3781 spin_unlock_bh(&rt6_exception_lock); 3782 } 3783 return 0; 3784 } 3785 3786 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) 3787 { 3788 struct net *net = dev_net(ifp->idev->dev); 3789 struct arg_dev_net_ip adni = { 3790 .dev = ifp->idev->dev, 3791 .net = net, 3792 .addr = &ifp->addr, 3793 }; 3794 fib6_clean_all(net, fib6_remove_prefsrc, &adni); 3795 } 3796 3797 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY) 3798 3799 /* Remove routers and update dst entries when a gateway turns into a host. */ 3800 static int fib6_clean_tohost(struct fib6_info *rt, void *arg) 3801 { 3802 struct in6_addr *gateway = (struct in6_addr *)arg; 3803 3804 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && 3805 ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) { 3806 return -1; 3807 } 3808 3809 /* Further clean up cached routes in exception table. 3810 * This is needed because a cached route may have a different 3811 * gateway than its 'parent' in the case of an ip redirect.
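 *
 * (A -1 return from a fib6_clean_all() callback, as above for
 * matching RA router entries, tells the fib walker to delete that
 * entry; returning 0 keeps it.)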
3812 */ 3813 rt6_exceptions_clean_tohost(rt, gateway); 3814 3815 return 0; 3816 } 3817 3818 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) 3819 { 3820 fib6_clean_all(net, fib6_clean_tohost, gateway); 3821 } 3822 3823 struct arg_netdev_event { 3824 const struct net_device *dev; 3825 union { 3826 unsigned int nh_flags; 3827 unsigned long event; 3828 }; 3829 }; 3830 3831 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) 3832 { 3833 struct fib6_info *iter; 3834 struct fib6_node *fn; 3835 3836 fn = rcu_dereference_protected(rt->fib6_node, 3837 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3838 iter = rcu_dereference_protected(fn->leaf, 3839 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3840 while (iter) { 3841 if (iter->fib6_metric == rt->fib6_metric && 3842 rt6_qualify_for_ecmp(iter)) 3843 return iter; 3844 iter = rcu_dereference_protected(iter->fib6_next, 3845 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3846 } 3847 3848 return NULL; 3849 } 3850 3851 static bool rt6_is_dead(const struct fib6_info *rt) 3852 { 3853 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD || 3854 (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN && 3855 fib6_ignore_linkdown(rt))) 3856 return true; 3857 3858 return false; 3859 } 3860 3861 static int rt6_multipath_total_weight(const struct fib6_info *rt) 3862 { 3863 struct fib6_info *iter; 3864 int total = 0; 3865 3866 if (!rt6_is_dead(rt)) 3867 total += rt->fib6_nh.nh_weight; 3868 3869 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { 3870 if (!rt6_is_dead(iter)) 3871 total += iter->fib6_nh.nh_weight; 3872 } 3873 3874 return total; 3875 } 3876 3877 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total) 3878 { 3879 int upper_bound = -1; 3880 3881 if (!rt6_is_dead(rt)) { 3882 *weight += rt->fib6_nh.nh_weight; 3883 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, 3884 total) - 1; 3885 } 3886 atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound); 3887 } 3888 3889 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total) 3890 { 3891 struct fib6_info *iter; 3892 int weight = 0; 3893 3894 rt6_upper_bound_set(rt, &weight, total); 3895 3896 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3897 rt6_upper_bound_set(iter, &weight, total); 3898 } 3899 3900 void rt6_multipath_rebalance(struct fib6_info *rt) 3901 { 3902 struct fib6_info *first; 3903 int total; 3904 3905 /* In case the entire multipath route was marked for flushing, 3906 * then there is no need to rebalance upon the removal of every 3907 * sibling route. 3908 */ 3909 if (!rt->fib6_nsiblings || rt->should_flush) 3910 return; 3911 3912 /* During lookup routes are evaluated in order, so we need to 3913 * make sure upper bounds are assigned from the first sibling 3914 * onwards. 
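 *
 * Worked example (using the formula in rt6_upper_bound_set() above):
 * two siblings with nh_weight 1 and 3 give total = 4 and cumulative
 * upper bounds of
 *	DIV_ROUND_CLOSEST_ULL((u64)1 << 31, 4) - 1 = 0x1fffffff
 *	DIV_ROUND_CLOSEST_ULL((u64)4 << 31, 4) - 1 = 0x7fffffff
 * so a 31-bit flow hash (see rt6_multipath_hash()) at or below
 * 0x1fffffff selects the first hop, i.e. roughly 25% of flows.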
3915 */ 3916 first = rt6_multipath_first_sibling(rt); 3917 if (WARN_ON_ONCE(!first)) 3918 return; 3919 3920 total = rt6_multipath_total_weight(first); 3921 rt6_multipath_upper_bound_set(first, total); 3922 } 3923 3924 static int fib6_ifup(struct fib6_info *rt, void *p_arg) 3925 { 3926 const struct arg_netdev_event *arg = p_arg; 3927 struct net *net = dev_net(arg->dev); 3928 3929 if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) { 3930 rt->fib6_nh.nh_flags &= ~arg->nh_flags; 3931 fib6_update_sernum_upto_root(net, rt); 3932 rt6_multipath_rebalance(rt); 3933 } 3934 3935 return 0; 3936 } 3937 3938 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags) 3939 { 3940 struct arg_netdev_event arg = { 3941 .dev = dev, 3942 { 3943 .nh_flags = nh_flags, 3944 }, 3945 }; 3946 3947 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev)) 3948 arg.nh_flags |= RTNH_F_LINKDOWN; 3949 3950 fib6_clean_all(dev_net(dev), fib6_ifup, &arg); 3951 } 3952 3953 static bool rt6_multipath_uses_dev(const struct fib6_info *rt, 3954 const struct net_device *dev) 3955 { 3956 struct fib6_info *iter; 3957 3958 if (rt->fib6_nh.nh_dev == dev) 3959 return true; 3960 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3961 if (iter->fib6_nh.nh_dev == dev) 3962 return true; 3963 3964 return false; 3965 } 3966 3967 static void rt6_multipath_flush(struct fib6_info *rt) 3968 { 3969 struct fib6_info *iter; 3970 3971 rt->should_flush = 1; 3972 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3973 iter->should_flush = 1; 3974 } 3975 3976 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt, 3977 const struct net_device *down_dev) 3978 { 3979 struct fib6_info *iter; 3980 unsigned int dead = 0; 3981 3982 if (rt->fib6_nh.nh_dev == down_dev || 3983 rt->fib6_nh.nh_flags & RTNH_F_DEAD) 3984 dead++; 3985 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3986 if (iter->fib6_nh.nh_dev == down_dev || 3987 iter->fib6_nh.nh_flags & RTNH_F_DEAD) 3988 dead++; 3989 3990 return dead; 3991 } 3992 3993 static void rt6_multipath_nh_flags_set(struct fib6_info *rt, 3994 const struct net_device *dev, 3995 unsigned int nh_flags) 3996 { 3997 struct fib6_info *iter; 3998 3999 if (rt->fib6_nh.nh_dev == dev) 4000 rt->fib6_nh.nh_flags |= nh_flags; 4001 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4002 if (iter->fib6_nh.nh_dev == dev) 4003 iter->fib6_nh.nh_flags |= nh_flags; 4004 } 4005 4006 /* called with write lock held for table with rt */ 4007 static int fib6_ifdown(struct fib6_info *rt, void *p_arg) 4008 { 4009 const struct arg_netdev_event *arg = p_arg; 4010 const struct net_device *dev = arg->dev; 4011 struct net *net = dev_net(dev); 4012 4013 if (rt == net->ipv6.fib6_null_entry) 4014 return 0; 4015 4016 switch (arg->event) { 4017 case NETDEV_UNREGISTER: 4018 return rt->fib6_nh.nh_dev == dev ? -1 : 0; 4019 case NETDEV_DOWN: 4020 if (rt->should_flush) 4021 return -1; 4022 if (!rt->fib6_nsiblings) 4023 return rt->fib6_nh.nh_dev == dev ? 
-1 : 0; 4024 if (rt6_multipath_uses_dev(rt, dev)) { 4025 unsigned int count; 4026 4027 count = rt6_multipath_dead_count(rt, dev); 4028 if (rt->fib6_nsiblings + 1 == count) { 4029 rt6_multipath_flush(rt); 4030 return -1; 4031 } 4032 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD | 4033 RTNH_F_LINKDOWN); 4034 fib6_update_sernum(net, rt); 4035 rt6_multipath_rebalance(rt); 4036 } 4037 return -2; 4038 case NETDEV_CHANGE: 4039 if (rt->fib6_nh.nh_dev != dev || 4040 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) 4041 break; 4042 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN; 4043 rt6_multipath_rebalance(rt); 4044 break; 4045 } 4046 4047 return 0; 4048 } 4049 4050 void rt6_sync_down_dev(struct net_device *dev, unsigned long event) 4051 { 4052 struct arg_netdev_event arg = { 4053 .dev = dev, 4054 { 4055 .event = event, 4056 }, 4057 }; 4058 4059 fib6_clean_all(dev_net(dev), fib6_ifdown, &arg); 4060 } 4061 4062 void rt6_disable_ip(struct net_device *dev, unsigned long event) 4063 { 4064 rt6_sync_down_dev(dev, event); 4065 rt6_uncached_list_flush_dev(dev_net(dev), dev); 4066 neigh_ifdown(&nd_tbl, dev); 4067 } 4068 4069 struct rt6_mtu_change_arg { 4070 struct net_device *dev; 4071 unsigned int mtu; 4072 }; 4073 4074 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg) { 4076 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; 4077 struct inet6_dev *idev; 4078 4079 /* In IPv6, pmtu discovery is not optional, 4080 so an RTAX_MTU lock cannot disable it. 4081 We still use this lock to block changes 4082 caused by addrconf/ndisc. 4083 */ 4084 4085 idev = __in6_dev_get(arg->dev); 4086 if (!idev) 4087 return 0; 4088 4089 /* For administrative MTU increase, there is no way to discover 4090 IPv6 PMTU increase, so PMTU increase should be updated here. 4091 Since RFC 1981 doesn't include administrative MTU increase 4092 update, PMTU increase is a MUST. (i.e.
jumbo frame) 4093 */ 4094 if (rt->fib6_nh.nh_dev == arg->dev && 4095 !fib6_metric_locked(rt, RTAX_MTU)) { 4096 u32 mtu = rt->fib6_pmtu; 4097 4098 if (mtu >= arg->mtu || 4099 (mtu < arg->mtu && mtu == idev->cnf.mtu6)) 4100 fib6_metric_set(rt, RTAX_MTU, arg->mtu); 4101 4102 spin_lock_bh(&rt6_exception_lock); 4103 rt6_exceptions_update_pmtu(idev, rt, arg->mtu); 4104 spin_unlock_bh(&rt6_exception_lock); 4105 } 4106 return 0; 4107 } 4108 4109 void rt6_mtu_change(struct net_device *dev, unsigned int mtu) 4110 { 4111 struct rt6_mtu_change_arg arg = { 4112 .dev = dev, 4113 .mtu = mtu, 4114 }; 4115 4116 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg); 4117 } 4118 4119 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { 4120 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) }, 4121 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) }, 4122 [RTA_OIF] = { .type = NLA_U32 }, 4123 [RTA_IIF] = { .type = NLA_U32 }, 4124 [RTA_PRIORITY] = { .type = NLA_U32 }, 4125 [RTA_METRICS] = { .type = NLA_NESTED }, 4126 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, 4127 [RTA_PREF] = { .type = NLA_U8 }, 4128 [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, 4129 [RTA_ENCAP] = { .type = NLA_NESTED }, 4130 [RTA_EXPIRES] = { .type = NLA_U32 }, 4131 [RTA_UID] = { .type = NLA_U32 }, 4132 [RTA_MARK] = { .type = NLA_U32 }, 4133 [RTA_TABLE] = { .type = NLA_U32 }, 4134 }; 4135 4136 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, 4137 struct fib6_config *cfg, 4138 struct netlink_ext_ack *extack) 4139 { 4140 struct rtmsg *rtm; 4141 struct nlattr *tb[RTA_MAX+1]; 4142 unsigned int pref; 4143 int err; 4144 4145 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, 4146 NULL); 4147 if (err < 0) 4148 goto errout; 4149 4150 err = -EINVAL; 4151 rtm = nlmsg_data(nlh); 4152 memset(cfg, 0, sizeof(*cfg)); 4153 4154 cfg->fc_table = rtm->rtm_table; 4155 cfg->fc_dst_len = rtm->rtm_dst_len; 4156 cfg->fc_src_len = rtm->rtm_src_len; 4157 cfg->fc_flags = RTF_UP; 4158 cfg->fc_protocol = rtm->rtm_protocol; 4159 cfg->fc_type = rtm->rtm_type; 4160 4161 if (rtm->rtm_type == RTN_UNREACHABLE || 4162 rtm->rtm_type == RTN_BLACKHOLE || 4163 rtm->rtm_type == RTN_PROHIBIT || 4164 rtm->rtm_type == RTN_THROW) 4165 cfg->fc_flags |= RTF_REJECT; 4166 4167 if (rtm->rtm_type == RTN_LOCAL) 4168 cfg->fc_flags |= RTF_LOCAL; 4169 4170 if (rtm->rtm_flags & RTM_F_CLONED) 4171 cfg->fc_flags |= RTF_CACHE; 4172 4173 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK); 4174 4175 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid; 4176 cfg->fc_nlinfo.nlh = nlh; 4177 cfg->fc_nlinfo.nl_net = sock_net(skb->sk); 4178 4179 if (tb[RTA_GATEWAY]) { 4180 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); 4181 cfg->fc_flags |= RTF_GATEWAY; 4182 } 4183 4184 if (tb[RTA_DST]) { 4185 int plen = (rtm->rtm_dst_len + 7) >> 3; 4186 4187 if (nla_len(tb[RTA_DST]) < plen) 4188 goto errout; 4189 4190 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen); 4191 } 4192 4193 if (tb[RTA_SRC]) { 4194 int plen = (rtm->rtm_src_len + 7) >> 3; 4195 4196 if (nla_len(tb[RTA_SRC]) < plen) 4197 goto errout; 4198 4199 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen); 4200 } 4201 4202 if (tb[RTA_PREFSRC]) 4203 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]); 4204 4205 if (tb[RTA_OIF]) 4206 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]); 4207 4208 if (tb[RTA_PRIORITY]) 4209 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]); 4210 4211 if (tb[RTA_METRICS]) { 4212 cfg->fc_mx = nla_data(tb[RTA_METRICS]); 4213 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]); 4214 } 4215 4216 if (tb[RTA_TABLE])
) 4217 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]); 4218 4219 if (tb[RTA_MULTIPATH]) { 4220 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]); 4221 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); 4222 4223 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp, 4224 cfg->fc_mp_len, extack); 4225 if (err < 0) 4226 goto errout; 4227 } 4228 4229 if (tb[RTA_PREF]) { 4230 pref = nla_get_u8(tb[RTA_PREF]); 4231 if (pref != ICMPV6_ROUTER_PREF_LOW && 4232 pref != ICMPV6_ROUTER_PREF_HIGH) 4233 pref = ICMPV6_ROUTER_PREF_MEDIUM; 4234 cfg->fc_flags |= RTF_PREF(pref); 4235 } 4236 4237 if (tb[RTA_ENCAP]) 4238 cfg->fc_encap = tb[RTA_ENCAP]; 4239 4240 if (tb[RTA_ENCAP_TYPE]) { 4241 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); 4242 4243 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack); 4244 if (err < 0) 4245 goto errout; 4246 } 4247 4248 if (tb[RTA_EXPIRES]) { 4249 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ); 4250 4251 if (addrconf_finite_timeout(timeout)) { 4252 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ); 4253 cfg->fc_flags |= RTF_EXPIRES; 4254 } 4255 } 4256 4257 err = 0; 4258 errout: 4259 return err; 4260 } 4261 4262 struct rt6_nh { 4263 struct fib6_info *fib6_info; 4264 struct fib6_config r_cfg; 4265 struct list_head next; 4266 }; 4267 4268 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list) 4269 { 4270 struct rt6_nh *nh; 4271 4272 list_for_each_entry(nh, rt6_nh_list, next) { 4273 pr_warn("multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n", 4274 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway, 4275 nh->r_cfg.fc_ifindex); 4276 } 4277 } 4278 4279 static int ip6_route_info_append(struct net *net, 4280 struct list_head *rt6_nh_list, 4281 struct fib6_info *rt, 4282 struct fib6_config *r_cfg) 4283 { 4284 struct rt6_nh *nh; 4285 int err = -EEXIST; 4286 4287 list_for_each_entry(nh, rt6_nh_list, next) { 4288 /* check if fib6_info already exists */ 4289 if (rt6_duplicate_nexthop(nh->fib6_info, rt)) 4290 return err; 4291 } 4292 4293 nh = kzalloc(sizeof(*nh), GFP_KERNEL); 4294 if (!nh) 4295 return -ENOMEM; 4296 nh->fib6_info = rt; 4297 err = ip6_convert_metrics(net, rt, r_cfg); 4298 if (err) { 4299 kfree(nh); 4300 return err; 4301 } 4302 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); 4303 list_add_tail(&nh->next, rt6_nh_list); 4304 4305 return 0; 4306 } 4307 4308 static void ip6_route_mpath_notify(struct fib6_info *rt, 4309 struct fib6_info *rt_last, 4310 struct nl_info *info, 4311 __u16 nlflags) 4312 { 4313 /* if this is an APPEND route, then rt points to the first route 4314 * inserted and rt_last points to last route inserted. Userspace 4315 * wants a consistent dump of the route which starts at the first 4316 * nexthop. 
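* For example, appending one nexthop to an existing two-nexthop route must announce the resulting three-nexthop route, not just the new sibling.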
Since sibling routes are always added at the end of 4317 * the list, find the first sibling of the last route appended 4318 */ 4319 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) { 4320 rt = list_first_entry(&rt_last->fib6_siblings, 4321 struct fib6_info, 4322 fib6_siblings); 4323 } 4324 4325 if (rt) 4326 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); 4327 } 4328 4329 static int ip6_route_multipath_add(struct fib6_config *cfg, 4330 struct netlink_ext_ack *extack) 4331 { 4332 struct fib6_info *rt_notif = NULL, *rt_last = NULL; 4333 struct nl_info *info = &cfg->fc_nlinfo; 4334 struct fib6_config r_cfg; 4335 struct rtnexthop *rtnh; 4336 struct fib6_info *rt; 4337 struct rt6_nh *err_nh; 4338 struct rt6_nh *nh, *nh_safe; 4339 __u16 nlflags; 4340 int remaining; 4341 int attrlen; 4342 int err = 1; 4343 int nhn = 0; 4344 int replace = (cfg->fc_nlinfo.nlh && 4345 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); 4346 LIST_HEAD(rt6_nh_list); 4347 4348 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE; 4349 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND) 4350 nlflags |= NLM_F_APPEND; 4351 4352 remaining = cfg->fc_mp_len; 4353 rtnh = (struct rtnexthop *)cfg->fc_mp; 4354 4355 /* Parse a Multipath Entry and build a list (rt6_nh_list) of 4356 * fib6_info structs per nexthop 4357 */ 4358 while (rtnh_ok(rtnh, remaining)) { 4359 memcpy(&r_cfg, cfg, sizeof(*cfg)); 4360 if (rtnh->rtnh_ifindex) 4361 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 4362 4363 attrlen = rtnh_attrlen(rtnh); 4364 if (attrlen > 0) { 4365 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 4366 4367 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 4368 if (nla) { 4369 r_cfg.fc_gateway = nla_get_in6_addr(nla); 4370 r_cfg.fc_flags |= RTF_GATEWAY; 4371 } 4372 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); 4373 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); 4374 if (nla) 4375 r_cfg.fc_encap_type = nla_get_u16(nla); 4376 } 4377 4378 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK); 4379 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack); 4380 if (IS_ERR(rt)) { 4381 err = PTR_ERR(rt); 4382 rt = NULL; 4383 goto cleanup; 4384 } 4385 4386 rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1; 4387 4388 err = ip6_route_info_append(info->nl_net, &rt6_nh_list, 4389 rt, &r_cfg); 4390 if (err) { 4391 fib6_info_release(rt); 4392 goto cleanup; 4393 } 4394 4395 rtnh = rtnh_next(rtnh, &remaining); 4396 } 4397 4398 /* for add and replace send one notification with all nexthops. 4399 * Skip the notification in fib6_add_rt2node and send one with 4400 * the full route when done 4401 */ 4402 info->skip_notify = 1; 4403 4404 err_nh = NULL; 4405 list_for_each_entry(nh, &rt6_nh_list, next) { 4406 rt_last = nh->fib6_info; 4407 err = __ip6_ins_rt(nh->fib6_info, info, extack); 4408 fib6_info_release(nh->fib6_info); 4409 4410 /* save reference to first route for notification */ 4411 if (!rt_notif && !err) 4412 rt_notif = nh->fib6_info; 4413 4414 /* nh->fib6_info is used or freed at this point, reset to NULL*/ 4415 nh->fib6_info = NULL; 4416 if (err) { 4417 if (replace && nhn) 4418 ip6_print_replace_route_err(&rt6_nh_list); 4419 err_nh = nh; 4420 goto add_errout; 4421 } 4422 4423 /* Because each route is added like a single route we remove 4424 * these flags after the first nexthop: if there is a collision, 4425 * we have already failed to add the first nexthop: 4426 * fib6_add_rt2node() has rejected it; when replacing, old 4427 * nexthops have been replaced by first new, the rest should 4428 * be added to it. 
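* In other words, NLM_F_EXCL and NLM_F_REPLACE can only take effect for the first nexthop; every later nexthop must be appended to the route that the first insertion created or replaced.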
4429 */ 4430 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | 4431 NLM_F_REPLACE); 4432 nhn++; 4433 } 4434 4435 /* success ... tell user about new route */ 4436 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 4437 goto cleanup; 4438 4439 add_errout: 4440 /* send notification for routes that were added so that 4441 * the delete notifications sent by ip6_route_del are 4442 * coherent 4443 */ 4444 if (rt_notif) 4445 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 4446 4447 /* Delete routes that were already added */ 4448 list_for_each_entry(nh, &rt6_nh_list, next) { 4449 if (err_nh == nh) 4450 break; 4451 ip6_route_del(&nh->r_cfg, extack); 4452 } 4453 4454 cleanup: 4455 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { 4456 if (nh->fib6_info) 4457 fib6_info_release(nh->fib6_info); 4458 list_del(&nh->next); 4459 kfree(nh); 4460 } 4461 4462 return err; 4463 } 4464 4465 static int ip6_route_multipath_del(struct fib6_config *cfg, 4466 struct netlink_ext_ack *extack) 4467 { 4468 struct fib6_config r_cfg; 4469 struct rtnexthop *rtnh; 4470 int remaining; 4471 int attrlen; 4472 int err = 1, last_err = 0; 4473 4474 remaining = cfg->fc_mp_len; 4475 rtnh = (struct rtnexthop *)cfg->fc_mp; 4476 4477 /* Parse a Multipath Entry */ 4478 while (rtnh_ok(rtnh, remaining)) { 4479 memcpy(&r_cfg, cfg, sizeof(*cfg)); 4480 if (rtnh->rtnh_ifindex) 4481 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 4482 4483 attrlen = rtnh_attrlen(rtnh); 4484 if (attrlen > 0) { 4485 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 4486 4487 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 4488 if (nla) { 4489 nla_memcpy(&r_cfg.fc_gateway, nla, 16); 4490 r_cfg.fc_flags |= RTF_GATEWAY; 4491 } 4492 } 4493 err = ip6_route_del(&r_cfg, extack); 4494 if (err) 4495 last_err = err; 4496 4497 rtnh = rtnh_next(rtnh, &remaining); 4498 } 4499 4500 return last_err; 4501 } 4502 4503 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, 4504 struct netlink_ext_ack *extack) 4505 { 4506 struct fib6_config cfg; 4507 int err; 4508 4509 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 4510 if (err < 0) 4511 return err; 4512 4513 if (cfg.fc_mp) 4514 return ip6_route_multipath_del(&cfg, extack); 4515 else { 4516 cfg.fc_delete_all_nh = 1; 4517 return ip6_route_del(&cfg, extack); 4518 } 4519 } 4520 4521 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, 4522 struct netlink_ext_ack *extack) 4523 { 4524 struct fib6_config cfg; 4525 int err; 4526 4527 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 4528 if (err < 0) 4529 return err; 4530 4531 if (cfg.fc_mp) 4532 return ip6_route_multipath_add(&cfg, extack); 4533 else 4534 return ip6_route_add(&cfg, GFP_KERNEL, extack); 4535 } 4536 4537 static size_t rt6_nlmsg_size(struct fib6_info *rt) 4538 { 4539 int nexthop_len = 0; 4540 4541 if (rt->fib6_nsiblings) { 4542 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ 4543 + NLA_ALIGN(sizeof(struct rtnexthop)) 4544 + nla_total_size(16) /* RTA_GATEWAY */ 4545 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate); 4546 4547 nexthop_len *= rt->fib6_nsiblings; 4548 } 4549 4550 return NLMSG_ALIGN(sizeof(struct rtmsg)) 4551 + nla_total_size(16) /* RTA_SRC */ 4552 + nla_total_size(16) /* RTA_DST */ 4553 + nla_total_size(16) /* RTA_GATEWAY */ 4554 + nla_total_size(16) /* RTA_PREFSRC */ 4555 + nla_total_size(4) /* RTA_TABLE */ 4556 + nla_total_size(4) /* RTA_IIF */ 4557 + nla_total_size(4) /* RTA_OIF */ 4558 + nla_total_size(4) /* RTA_PRIORITY */ 4559 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */ 4560 + 
nla_total_size(sizeof(struct rta_cacheinfo)) 4561 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ 4562 + nla_total_size(1) /* RTA_PREF */ 4563 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate) 4564 + nexthop_len; 4565 } 4566 4567 static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt, 4568 unsigned int *flags, bool skip_oif) 4569 { 4570 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) 4571 *flags |= RTNH_F_DEAD; 4572 4573 if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) { 4574 *flags |= RTNH_F_LINKDOWN; 4575 4576 rcu_read_lock(); 4577 if (fib6_ignore_linkdown(rt)) 4578 *flags |= RTNH_F_DEAD; 4579 rcu_read_unlock(); 4580 } 4581 4582 if (rt->fib6_flags & RTF_GATEWAY) { 4583 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0) 4584 goto nla_put_failure; 4585 } 4586 4587 *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK); 4588 if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD) 4589 *flags |= RTNH_F_OFFLOAD; 4590 4591 /* not needed for multipath encoding b/c it has a rtnexthop struct */ 4592 if (!skip_oif && rt->fib6_nh.nh_dev && 4593 nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex)) 4594 goto nla_put_failure; 4595 4596 if (rt->fib6_nh.nh_lwtstate && 4597 lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0) 4598 goto nla_put_failure; 4599 4600 return 0; 4601 4602 nla_put_failure: 4603 return -EMSGSIZE; 4604 } 4605 4606 /* add multipath next hop */ 4607 static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt) 4608 { 4609 const struct net_device *dev = rt->fib6_nh.nh_dev; 4610 struct rtnexthop *rtnh; 4611 unsigned int flags = 0; 4612 4613 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh)); 4614 if (!rtnh) 4615 goto nla_put_failure; 4616 4617 rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1; 4618 rtnh->rtnh_ifindex = dev ? dev->ifindex : 0; 4619 4620 if (rt6_nexthop_info(skb, rt, &flags, true) < 0) 4621 goto nla_put_failure; 4622 4623 rtnh->rtnh_flags = flags; 4624 4625 /* length of rtnetlink header + attributes */ 4626 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh; 4627 4628 return 0; 4629 4630 nla_put_failure: 4631 return -EMSGSIZE; 4632 } 4633 4634 static int rt6_fill_node(struct net *net, struct sk_buff *skb, 4635 struct fib6_info *rt, struct dst_entry *dst, 4636 struct in6_addr *dest, struct in6_addr *src, 4637 int iif, int type, u32 portid, u32 seq, 4638 unsigned int flags) 4639 { 4640 struct rtmsg *rtm; 4641 struct nlmsghdr *nlh; 4642 long expires = 0; 4643 u32 *pmetrics; 4644 u32 table; 4645 4646 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags); 4647 if (!nlh) 4648 return -EMSGSIZE; 4649 4650 rtm = nlmsg_data(nlh); 4651 rtm->rtm_family = AF_INET6; 4652 rtm->rtm_dst_len = rt->fib6_dst.plen; 4653 rtm->rtm_src_len = rt->fib6_src.plen; 4654 rtm->rtm_tos = 0; 4655 if (rt->fib6_table) 4656 table = rt->fib6_table->tb6_id; 4657 else 4658 table = RT6_TABLE_UNSPEC; 4659 rtm->rtm_table = table; 4660 if (nla_put_u32(skb, RTA_TABLE, table)) 4661 goto nla_put_failure; 4662 4663 rtm->rtm_type = rt->fib6_type; 4664 rtm->rtm_flags = 0; 4665 rtm->rtm_scope = RT_SCOPE_UNIVERSE; 4666 rtm->rtm_protocol = rt->fib6_protocol; 4667 4668 if (rt->fib6_flags & RTF_CACHE) 4669 rtm->rtm_flags |= RTM_F_CLONED; 4670 4671 if (dest) { 4672 if (nla_put_in6_addr(skb, RTA_DST, dest)) 4673 goto nla_put_failure; 4674 rtm->rtm_dst_len = 128; 4675 } else if (rtm->rtm_dst_len) 4676 if (nla_put_in6_addr(skb, RTA_DST, &rt->fib6_dst.addr)) 4677 goto nla_put_failure; 4678 #ifdef CONFIG_IPV6_SUBTREES 4679 if (src) { 4680 if (nla_put_in6_addr(skb, RTA_SRC, src)) 4681 goto nla_put_failure; 4682 
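/* a specific source address was supplied; report it as a full /128, mirroring the RTA_DST handling above */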
rtm->rtm_src_len = 128; 4683 } else if (rtm->rtm_src_len && 4684 nla_put_in6_addr(skb, RTA_SRC, &rt->fib6_src.addr)) 4685 goto nla_put_failure; 4686 #endif 4687 if (iif) { 4688 #ifdef CONFIG_IPV6_MROUTE 4689 if (ipv6_addr_is_multicast(&rt->fib6_dst.addr)) { 4690 int err = ip6mr_get_route(net, skb, rtm, portid); 4691 4692 if (err == 0) 4693 return 0; 4694 if (err < 0) 4695 goto nla_put_failure; 4696 } else 4697 #endif 4698 if (nla_put_u32(skb, RTA_IIF, iif)) 4699 goto nla_put_failure; 4700 } else if (dest) { 4701 struct in6_addr saddr_buf; 4702 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 && 4703 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 4704 goto nla_put_failure; 4705 } 4706 4707 if (rt->fib6_prefsrc.plen) { 4708 struct in6_addr saddr_buf; 4709 saddr_buf = rt->fib6_prefsrc.addr; 4710 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 4711 goto nla_put_failure; 4712 } 4713 4714 pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics; 4715 if (rtnetlink_put_metrics(skb, pmetrics) < 0) 4716 goto nla_put_failure; 4717 4718 if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric)) 4719 goto nla_put_failure; 4720 4721 /* For multipath routes, walk the siblings list and add 4722 * each as a nexthop within RTA_MULTIPATH. 4723 */ 4724 if (rt->fib6_nsiblings) { 4725 struct fib6_info *sibling, *next_sibling; 4726 struct nlattr *mp; 4727 4728 mp = nla_nest_start(skb, RTA_MULTIPATH); 4729 if (!mp) 4730 goto nla_put_failure; 4731 4732 if (rt6_add_nexthop(skb, rt) < 0) 4733 goto nla_put_failure; 4734 4735 list_for_each_entry_safe(sibling, next_sibling, 4736 &rt->fib6_siblings, fib6_siblings) { 4737 if (rt6_add_nexthop(skb, sibling) < 0) 4738 goto nla_put_failure; 4739 } 4740 4741 nla_nest_end(skb, mp); 4742 } else { 4743 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0) 4744 goto nla_put_failure; 4745 } 4746 4747 if (rt->fib6_flags & RTF_EXPIRES) { 4748 expires = dst ? dst->expires : rt->expires; 4749 expires -= jiffies; 4750 } 4751 4752 if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? 
dst->error : 0) < 0) 4753 goto nla_put_failure; 4754 4755 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->fib6_flags))) 4756 goto nla_put_failure; 4757 4758 4759 nlmsg_end(skb, nlh); 4760 return 0; 4761 4762 nla_put_failure: 4763 nlmsg_cancel(skb, nlh); 4764 return -EMSGSIZE; 4765 } 4766 4767 int rt6_dump_route(struct fib6_info *rt, void *p_arg) 4768 { 4769 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; 4770 struct net *net = arg->net; 4771 4772 if (rt == net->ipv6.fib6_null_entry) 4773 return 0; 4774 4775 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) { 4776 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh); 4777 4778 /* user wants prefix routes only */ 4779 if (rtm->rtm_flags & RTM_F_PREFIX && 4780 !(rt->fib6_flags & RTF_PREFIX_RT)) { 4781 /* success since this is not a prefix route */ 4782 return 1; 4783 } 4784 } 4785 4786 return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0, 4787 RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid, 4788 arg->cb->nlh->nlmsg_seq, NLM_F_MULTI); 4789 } 4790 4791 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, 4792 struct netlink_ext_ack *extack) 4793 { 4794 struct net *net = sock_net(in_skb->sk); 4795 struct nlattr *tb[RTA_MAX+1]; 4796 int err, iif = 0, oif = 0; 4797 struct fib6_info *from; 4798 struct dst_entry *dst; 4799 struct rt6_info *rt; 4800 struct sk_buff *skb; 4801 struct rtmsg *rtm; 4802 struct flowi6 fl6; 4803 bool fibmatch; 4804 4805 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, 4806 extack); 4807 if (err < 0) 4808 goto errout; 4809 4810 err = -EINVAL; 4811 memset(&fl6, 0, sizeof(fl6)); 4812 rtm = nlmsg_data(nlh); 4813 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0); 4814 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH); 4815 4816 if (tb[RTA_SRC]) { 4817 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr)) 4818 goto errout; 4819 4820 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]); 4821 } 4822 4823 if (tb[RTA_DST]) { 4824 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr)) 4825 goto errout; 4826 4827 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]); 4828 } 4829 4830 if (tb[RTA_IIF]) 4831 iif = nla_get_u32(tb[RTA_IIF]); 4832 4833 if (tb[RTA_OIF]) 4834 oif = nla_get_u32(tb[RTA_OIF]); 4835 4836 if (tb[RTA_MARK]) 4837 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]); 4838 4839 if (tb[RTA_UID]) 4840 fl6.flowi6_uid = make_kuid(current_user_ns(), 4841 nla_get_u32(tb[RTA_UID])); 4842 else 4843 fl6.flowi6_uid = iif ? 
INVALID_UID : current_uid(); 4844 4845 if (iif) { 4846 struct net_device *dev; 4847 int flags = 0; 4848 4849 rcu_read_lock(); 4850 4851 dev = dev_get_by_index_rcu(net, iif); 4852 if (!dev) { 4853 rcu_read_unlock(); 4854 err = -ENODEV; 4855 goto errout; 4856 } 4857 4858 fl6.flowi6_iif = iif; 4859 4860 if (!ipv6_addr_any(&fl6.saddr)) 4861 flags |= RT6_LOOKUP_F_HAS_SADDR; 4862 4863 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags); 4864 4865 rcu_read_unlock(); 4866 } else { 4867 fl6.flowi6_oif = oif; 4868 4869 dst = ip6_route_output(net, NULL, &fl6); 4870 } 4871 4872 4873 rt = container_of(dst, struct rt6_info, dst); 4874 if (rt->dst.error) { 4875 err = rt->dst.error; 4876 ip6_rt_put(rt); 4877 goto errout; 4878 } 4879 4880 if (rt == net->ipv6.ip6_null_entry) { 4881 err = rt->dst.error; 4882 ip6_rt_put(rt); 4883 goto errout; 4884 } 4885 4886 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 4887 if (!skb) { 4888 ip6_rt_put(rt); 4889 err = -ENOBUFS; 4890 goto errout; 4891 } 4892 4893 skb_dst_set(skb, &rt->dst); 4894 4895 rcu_read_lock(); 4896 from = rcu_dereference(rt->from); 4897 4898 if (fibmatch) 4899 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif, 4900 RTM_NEWROUTE, NETLINK_CB(in_skb).portid, 4901 nlh->nlmsg_seq, 0); 4902 else 4903 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr, 4904 &fl6.saddr, iif, RTM_NEWROUTE, 4905 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 4906 0); 4907 rcu_read_unlock(); 4908 4909 if (err < 0) { 4910 kfree_skb(skb); 4911 goto errout; 4912 } 4913 4914 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); 4915 errout: 4916 return err; 4917 } 4918 4919 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, 4920 unsigned int nlm_flags) 4921 { 4922 struct sk_buff *skb; 4923 struct net *net = info->nl_net; 4924 u32 seq; 4925 int err; 4926 4927 err = -ENOBUFS; 4928 seq = info->nlh ? info->nlh->nlmsg_seq : 0; 4929 4930 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 4931 if (!skb) 4932 goto errout; 4933 4934 err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, 4935 event, info->portid, seq, nlm_flags); 4936 if (err < 0) { 4937 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ 4938 WARN_ON(err == -EMSGSIZE); 4939 kfree_skb(skb); 4940 goto errout; 4941 } 4942 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 4943 info->nlh, gfp_any()); 4944 return; 4945 errout: 4946 if (err < 0) 4947 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); 4948 } 4949 4950 static int ip6_route_dev_notify(struct notifier_block *this, 4951 unsigned long event, void *ptr) 4952 { 4953 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 4954 struct net *net = dev_net(dev); 4955 4956 if (!(dev->flags & IFF_LOOPBACK)) 4957 return NOTIFY_OK; 4958 4959 if (event == NETDEV_REGISTER) { 4960 net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev; 4961 net->ipv6.ip6_null_entry->dst.dev = dev; 4962 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev); 4963 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 4964 net->ipv6.ip6_prohibit_entry->dst.dev = dev; 4965 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev); 4966 net->ipv6.ip6_blk_hole_entry->dst.dev = dev; 4967 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev); 4968 #endif 4969 } else if (event == NETDEV_UNREGISTER && 4970 dev->reg_state != NETREG_UNREGISTERED) { 4971 /* NETDEV_UNREGISTER may be fired multiple times by 4972 * netdev_wait_allrefs(). Make sure we only call this once. 
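* in6_dev_put_clear() puts the reference and resets rt6i_idev to NULL, so a repeated notification becomes a harmless no-op.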
4973 */ 4974 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev); 4975 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 4976 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev); 4977 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev); 4978 #endif 4979 } 4980 4981 return NOTIFY_OK; 4982 } 4983 4984 /* 4985 * /proc 4986 */ 4987 4988 #ifdef CONFIG_PROC_FS 4989 4990 static const struct file_operations ipv6_route_proc_fops = { 4991 .open = ipv6_route_open, 4992 .read = seq_read, 4993 .llseek = seq_lseek, 4994 .release = seq_release_net, 4995 }; 4996 4997 static int rt6_stats_seq_show(struct seq_file *seq, void *v) 4998 { 4999 struct net *net = (struct net *)seq->private; 5000 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n", 5001 net->ipv6.rt6_stats->fib_nodes, 5002 net->ipv6.rt6_stats->fib_route_nodes, 5003 atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc), 5004 net->ipv6.rt6_stats->fib_rt_entries, 5005 net->ipv6.rt6_stats->fib_rt_cache, 5006 dst_entries_get_slow(&net->ipv6.ip6_dst_ops), 5007 net->ipv6.rt6_stats->fib_discarded_routes); 5008 5009 return 0; 5010 } 5011 5012 static int rt6_stats_seq_open(struct inode *inode, struct file *file) 5013 { 5014 return single_open_net(inode, file, rt6_stats_seq_show); 5015 } 5016 5017 static const struct file_operations rt6_stats_seq_fops = { 5018 .open = rt6_stats_seq_open, 5019 .read = seq_read, 5020 .llseek = seq_lseek, 5021 .release = single_release_net, 5022 }; 5023 #endif /* CONFIG_PROC_FS */ 5024 5025 #ifdef CONFIG_SYSCTL 5026 5027 static 5028 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, 5029 void __user *buffer, size_t *lenp, loff_t *ppos) 5030 { 5031 struct net *net; 5032 int delay; int ret; 5033 if (!write) 5034 return -EINVAL; 5035 5036 net = (struct net *)ctl->extra1; 5037 delay = net->ipv6.sysctl.flush_delay; 5038 ret = proc_dointvec(ctl, write, buffer, lenp, ppos); if (ret) return ret; 5039 fib6_run_gc(delay <= 0 ? 
0 : (unsigned long)delay, net, delay > 0); 5040 return 0; 5041 } 5042 5043 struct ctl_table ipv6_route_table_template[] = { 5044 { 5045 .procname = "flush", 5046 .data = &init_net.ipv6.sysctl.flush_delay, 5047 .maxlen = sizeof(int), 5048 .mode = 0200, 5049 .proc_handler = ipv6_sysctl_rtcache_flush 5050 }, 5051 { 5052 .procname = "gc_thresh", 5053 .data = &ip6_dst_ops_template.gc_thresh, 5054 .maxlen = sizeof(int), 5055 .mode = 0644, 5056 .proc_handler = proc_dointvec, 5057 }, 5058 { 5059 .procname = "max_size", 5060 .data = &init_net.ipv6.sysctl.ip6_rt_max_size, 5061 .maxlen = sizeof(int), 5062 .mode = 0644, 5063 .proc_handler = proc_dointvec, 5064 }, 5065 { 5066 .procname = "gc_min_interval", 5067 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 5068 .maxlen = sizeof(int), 5069 .mode = 0644, 5070 .proc_handler = proc_dointvec_jiffies, 5071 }, 5072 { 5073 .procname = "gc_timeout", 5074 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout, 5075 .maxlen = sizeof(int), 5076 .mode = 0644, 5077 .proc_handler = proc_dointvec_jiffies, 5078 }, 5079 { 5080 .procname = "gc_interval", 5081 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval, 5082 .maxlen = sizeof(int), 5083 .mode = 0644, 5084 .proc_handler = proc_dointvec_jiffies, 5085 }, 5086 { 5087 .procname = "gc_elasticity", 5088 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity, 5089 .maxlen = sizeof(int), 5090 .mode = 0644, 5091 .proc_handler = proc_dointvec, 5092 }, 5093 { 5094 .procname = "mtu_expires", 5095 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires, 5096 .maxlen = sizeof(int), 5097 .mode = 0644, 5098 .proc_handler = proc_dointvec_jiffies, 5099 }, 5100 { 5101 .procname = "min_adv_mss", 5102 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss, 5103 .maxlen = sizeof(int), 5104 .mode = 0644, 5105 .proc_handler = proc_dointvec, 5106 }, 5107 { 5108 .procname = "gc_min_interval_ms", 5109 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 5110 .maxlen = sizeof(int), 5111 .mode = 0644, 5112 .proc_handler = proc_dointvec_ms_jiffies, 5113 }, 5114 { } 5115 }; 5116 5117 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net) 5118 { 5119 struct ctl_table *table; 5120 5121 table = kmemdup(ipv6_route_table_template, 5122 sizeof(ipv6_route_table_template), 5123 GFP_KERNEL); 5124 5125 if (table) { 5126 table[0].data = &net->ipv6.sysctl.flush_delay; 5127 table[0].extra1 = net; 5128 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh; 5129 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size; 5130 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; 5131 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout; 5132 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval; 5133 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity; 5134 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires; 5135 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss; 5136 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; 5137 5138 /* Don't export sysctls to unprivileged users */ 5139 if (net->user_ns != &init_user_ns) 5140 table[0].procname = NULL; 5141 } 5142 5143 return table; 5144 } 5145 #endif 5146 5147 static int __net_init ip6_route_net_init(struct net *net) 5148 { 5149 int ret = -ENOMEM; 5150 5151 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template, 5152 sizeof(net->ipv6.ip6_dst_ops)); 5153 5154 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0) 5155 goto out_ip6_dst_ops; 5156 5157 net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template, 5158 sizeof(*net->ipv6.fib6_null_entry), 5159 GFP_KERNEL); 5160 if (!net->ipv6.fib6_null_entry) 5161 goto 
out_ip6_dst_entries; 5162 5163 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template, 5164 sizeof(*net->ipv6.ip6_null_entry), 5165 GFP_KERNEL); 5166 if (!net->ipv6.ip6_null_entry) 5167 goto out_fib6_null_entry; 5168 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5169 dst_init_metrics(&net->ipv6.ip6_null_entry->dst, 5170 ip6_template_metrics, true); 5171 5172 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5173 net->ipv6.fib6_has_custom_rules = false; 5174 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template, 5175 sizeof(*net->ipv6.ip6_prohibit_entry), 5176 GFP_KERNEL); 5177 if (!net->ipv6.ip6_prohibit_entry) 5178 goto out_ip6_null_entry; 5179 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5180 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst, 5181 ip6_template_metrics, true); 5182 5183 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template, 5184 sizeof(*net->ipv6.ip6_blk_hole_entry), 5185 GFP_KERNEL); 5186 if (!net->ipv6.ip6_blk_hole_entry) 5187 goto out_ip6_prohibit_entry; 5188 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5189 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, 5190 ip6_template_metrics, true); 5191 #endif 5192 5193 net->ipv6.sysctl.flush_delay = 0; 5194 net->ipv6.sysctl.ip6_rt_max_size = 4096; 5195 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2; 5196 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ; 5197 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ; 5198 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9; 5199 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ; 5200 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; 5201 5202 net->ipv6.ip6_rt_gc_expire = 30*HZ; 5203 5204 ret = 0; 5205 out: 5206 return ret; 5207 5208 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5209 out_ip6_prohibit_entry: 5210 kfree(net->ipv6.ip6_prohibit_entry); 5211 out_ip6_null_entry: 5212 kfree(net->ipv6.ip6_null_entry); 5213 #endif 5214 out_fib6_null_entry: 5215 kfree(net->ipv6.fib6_null_entry); 5216 out_ip6_dst_entries: 5217 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 5218 out_ip6_dst_ops: 5219 goto out; 5220 } 5221 5222 static void __net_exit ip6_route_net_exit(struct net *net) 5223 { 5224 kfree(net->ipv6.fib6_null_entry); 5225 kfree(net->ipv6.ip6_null_entry); 5226 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5227 kfree(net->ipv6.ip6_prohibit_entry); 5228 kfree(net->ipv6.ip6_blk_hole_entry); 5229 #endif 5230 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 5231 } 5232 5233 static int __net_init ip6_route_net_init_late(struct net *net) 5234 { 5235 #ifdef CONFIG_PROC_FS 5236 proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops); 5237 proc_create("rt6_stats", 0444, net->proc_net, &rt6_stats_seq_fops); 5238 #endif 5239 return 0; 5240 } 5241 5242 static void __net_exit ip6_route_net_exit_late(struct net *net) 5243 { 5244 #ifdef CONFIG_PROC_FS 5245 remove_proc_entry("ipv6_route", net->proc_net); 5246 remove_proc_entry("rt6_stats", net->proc_net); 5247 #endif 5248 } 5249 5250 static struct pernet_operations ip6_route_net_ops = { 5251 .init = ip6_route_net_init, 5252 .exit = ip6_route_net_exit, 5253 }; 5254 5255 static int __net_init ipv6_inetpeer_init(struct net *net) 5256 { 5257 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL); 5258 5259 if (!bp) 5260 return -ENOMEM; 5261 inet_peer_base_init(bp); 5262 net->ipv6.peers = bp; 5263 return 0; 5264 } 5265 5266 static void __net_exit ipv6_inetpeer_exit(struct net *net) 5267 { 5268 struct inet_peer_base *bp = net->ipv6.peers; 5269 5270 net->ipv6.peers = NULL; 5271 inetpeer_invalidate_tree(bp); 
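/* inetpeer_invalidate_tree() has released every peer entry; only now is it safe to free the base itself */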
5272 kfree(bp); 5273 } 5274 5275 static struct pernet_operations ipv6_inetpeer_ops = { 5276 .init = ipv6_inetpeer_init, 5277 .exit = ipv6_inetpeer_exit, 5278 }; 5279 5280 static struct pernet_operations ip6_route_net_late_ops = { 5281 .init = ip6_route_net_init_late, 5282 .exit = ip6_route_net_exit_late, 5283 }; 5284 5285 static struct notifier_block ip6_route_dev_notifier = { 5286 .notifier_call = ip6_route_dev_notify, 5287 .priority = ADDRCONF_NOTIFY_PRIORITY - 10, 5288 }; 5289 5290 void __init ip6_route_init_special_entries(void) 5291 { 5292 /* Registering of the loopback is done before this portion of code, 5293 * the loopback reference in rt6_info will not be taken, do it 5294 * manually for init_net */ 5295 init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev; 5296 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; 5297 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5298 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5299 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev; 5300 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5301 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev; 5302 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5303 #endif 5304 } 5305 5306 int __init ip6_route_init(void) 5307 { 5308 int ret; 5309 int cpu; 5310 5311 ret = -ENOMEM; 5312 ip6_dst_ops_template.kmem_cachep = 5313 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0, 5314 SLAB_HWCACHE_ALIGN, NULL); 5315 if (!ip6_dst_ops_template.kmem_cachep) 5316 goto out; 5317 5318 ret = dst_entries_init(&ip6_dst_blackhole_ops); 5319 if (ret) 5320 goto out_kmem_cache; 5321 5322 ret = register_pernet_subsys(&ipv6_inetpeer_ops); 5323 if (ret) 5324 goto out_dst_entries; 5325 5326 ret = register_pernet_subsys(&ip6_route_net_ops); 5327 if (ret) 5328 goto out_register_inetpeer; 5329 5330 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep; 5331 5332 ret = fib6_init(); 5333 if (ret) 5334 goto out_register_subsys; 5335 5336 ret = xfrm6_init(); 5337 if (ret) 5338 goto out_fib6_init; 5339 5340 ret = fib6_rules_init(); 5341 if (ret) 5342 goto xfrm6_init; 5343 5344 ret = register_pernet_subsys(&ip6_route_net_late_ops); 5345 if (ret) 5346 goto fib6_rules_init; 5347 5348 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE, 5349 inet6_rtm_newroute, NULL, 0); 5350 if (ret < 0) 5351 goto out_register_late_subsys; 5352 5353 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE, 5354 inet6_rtm_delroute, NULL, 0); 5355 if (ret < 0) 5356 goto out_register_late_subsys; 5357 5358 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, 5359 inet6_rtm_getroute, NULL, 5360 RTNL_FLAG_DOIT_UNLOCKED); 5361 if (ret < 0) 5362 goto out_register_late_subsys; 5363 5364 ret = register_netdevice_notifier(&ip6_route_dev_notifier); 5365 if (ret) 5366 goto out_register_late_subsys; 5367 5368 for_each_possible_cpu(cpu) { 5369 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); 5370 5371 INIT_LIST_HEAD(&ul->head); 5372 spin_lock_init(&ul->lock); 5373 } 5374 5375 out: 5376 return ret; 5377 5378 out_register_late_subsys: 5379 rtnl_unregister_all(PF_INET6); 5380 unregister_pernet_subsys(&ip6_route_net_late_ops); 5381 fib6_rules_init: 5382 fib6_rules_cleanup(); 5383 xfrm6_init: 5384 xfrm6_fini(); 5385 out_fib6_init: 5386 fib6_gc_cleanup(); 5387 out_register_subsys: 5388 unregister_pernet_subsys(&ip6_route_net_ops); 5389 out_register_inetpeer: 5390 
unregister_pernet_subsys(&ipv6_inetpeer_ops); 5391 out_dst_entries: 5392 dst_entries_destroy(&ip6_dst_blackhole_ops); 5393 out_kmem_cache: 5394 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); 5395 goto out; 5396 } 5397 5398 void ip6_route_cleanup(void) 5399 { 5400 unregister_netdevice_notifier(&ip6_route_dev_notifier); 5401 unregister_pernet_subsys(&ip6_route_net_late_ops); 5402 fib6_rules_cleanup(); 5403 xfrm6_fini(); 5404 fib6_gc_cleanup(); 5405 unregister_pernet_subsys(&ipv6_inetpeer_ops); 5406 unregister_pernet_subsys(&ip6_route_net_ops); 5407 dst_entries_destroy(&ip6_dst_blackhole_ops); 5408 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); 5409 } 5410
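/* Example (illustrative only; not part of the kernel build): a minimal
 * userspace sketch for reading the /proc/net/rt6_stats file registered
 * above. It assumes only the format string used by rt6_stats_seq_show():
 * seven hex fields, in the order fib_nodes, fib_route_nodes, fib_rt_alloc,
 * fib_rt_entries, fib_rt_cache, dst entries, fib_discarded_routes.
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		unsigned int fib_nodes, route_nodes, rt_alloc, rt_entries;
 *		unsigned int rt_cache, dst_entries, discarded;
 *		FILE *f = fopen("/proc/net/rt6_stats", "r");
 *
 *		if (!f)
 *			return 1;
 *		if (fscanf(f, "%x %x %x %x %x %x %x",
 *			   &fib_nodes, &route_nodes, &rt_alloc, &rt_entries,
 *			   &rt_cache, &dst_entries, &discarded) != 7) {
 *			fclose(f);
 *			return 1;
 *		}
 *		printf("fib nodes: %u, route nodes: %u, rt alloc: %u\n",
 *		       fib_nodes, route_nodes, rt_alloc);
 *		printf("rt entries: %u, cached: %u, dst: %u, discarded: %u\n",
 *		       rt_entries, rt_cache, dst_entries, discarded);
 *		fclose(f);
 *		return 0;
 *	}
 */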