1 /* 2 * Linux INET6 implementation 3 * FIB front-end. 4 * 5 * Authors: 6 * Pedro Roque <roque@di.fc.ul.pt> 7 * 8 * This program is free software; you can redistribute it and/or 9 * modify it under the terms of the GNU General Public License 10 * as published by the Free Software Foundation; either version 11 * 2 of the License, or (at your option) any later version. 12 */ 13 14 /* Changes: 15 * 16 * YOSHIFUJI Hideaki @USAGI 17 * reworked default router selection. 18 * - respect outgoing interface 19 * - select from (probably) reachable routers (i.e. 20 * routers in REACHABLE, STALE, DELAY or PROBE states). 21 * - always select the same router if it is (probably) 22 * reachable. otherwise, round-robin the list. 23 * Ville Nuorvala 24 * Fixed routing subtrees. 25 */ 26 27 #define pr_fmt(fmt) "IPv6: " fmt 28 29 #include <linux/capability.h> 30 #include <linux/errno.h> 31 #include <linux/export.h> 32 #include <linux/types.h> 33 #include <linux/times.h> 34 #include <linux/socket.h> 35 #include <linux/sockios.h> 36 #include <linux/net.h> 37 #include <linux/route.h> 38 #include <linux/netdevice.h> 39 #include <linux/in6.h> 40 #include <linux/mroute6.h> 41 #include <linux/init.h> 42 #include <linux/if_arp.h> 43 #include <linux/proc_fs.h> 44 #include <linux/seq_file.h> 45 #include <linux/nsproxy.h> 46 #include <linux/slab.h> 47 #include <linux/jhash.h> 48 #include <net/net_namespace.h> 49 #include <net/snmp.h> 50 #include <net/ipv6.h> 51 #include <net/ip6_fib.h> 52 #include <net/ip6_route.h> 53 #include <net/ndisc.h> 54 #include <net/addrconf.h> 55 #include <net/tcp.h> 56 #include <linux/rtnetlink.h> 57 #include <net/dst.h> 58 #include <net/dst_metadata.h> 59 #include <net/xfrm.h> 60 #include <net/netevent.h> 61 #include <net/netlink.h> 62 #include <net/nexthop.h> 63 #include <net/lwtunnel.h> 64 #include <net/ip_tunnels.h> 65 #include <net/l3mdev.h> 66 #include <trace/events/fib6.h> 67 68 #include <linux/uaccess.h> 69 70 #ifdef CONFIG_SYSCTL 71 #include <linux/sysctl.h> 72 #endif 73 74 enum rt6_nud_state { 75 RT6_NUD_FAIL_HARD = -3, 76 RT6_NUD_FAIL_PROBE = -2, 77 RT6_NUD_FAIL_DO_RR = -1, 78 RT6_NUD_SUCCEED = 1 79 }; 80 81 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie); 82 static unsigned int ip6_default_advmss(const struct dst_entry *dst); 83 static unsigned int ip6_mtu(const struct dst_entry *dst); 84 static struct dst_entry *ip6_negative_advice(struct dst_entry *); 85 static void ip6_dst_destroy(struct dst_entry *); 86 static void ip6_dst_ifdown(struct dst_entry *, 87 struct net_device *dev, int how); 88 static int ip6_dst_gc(struct dst_ops *ops); 89 90 static int ip6_pkt_discard(struct sk_buff *skb); 91 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb); 92 static int ip6_pkt_prohibit(struct sk_buff *skb); 93 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb); 94 static void ip6_link_failure(struct sk_buff *skb); 95 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, 96 struct sk_buff *skb, u32 mtu); 97 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, 98 struct sk_buff *skb); 99 static int rt6_score_route(struct fib6_info *rt, int oif, int strict); 100 static size_t rt6_nlmsg_size(struct fib6_info *rt); 101 static int rt6_fill_node(struct net *net, struct sk_buff *skb, 102 struct fib6_info *rt, struct dst_entry *dst, 103 struct in6_addr *dest, struct in6_addr *src, 104 int iif, int type, u32 portid, u32 seq, 105 unsigned int flags); 106 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt, 107 struct in6_addr *daddr, 108 struct in6_addr *saddr); 109 110 #ifdef CONFIG_IPV6_ROUTE_INFO 111 static struct fib6_info *rt6_add_route_info(struct net *net, 112 const struct in6_addr *prefix, int prefixlen, 113 const struct in6_addr *gwaddr, 114 struct net_device *dev, 115 unsigned int pref); 116 static struct fib6_info *rt6_get_route_info(struct net *net, 117 const struct in6_addr *prefix, int prefixlen, 118 const struct in6_addr *gwaddr, 119 struct net_device *dev); 120 #endif 121 122 struct uncached_list { 123 spinlock_t lock; 124 struct list_head head; 125 }; 126 127 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list); 128 129 void rt6_uncached_list_add(struct rt6_info *rt) 130 { 131 struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list); 132 133 rt->rt6i_uncached_list = ul; 134 135 spin_lock_bh(&ul->lock); 136 list_add_tail(&rt->rt6i_uncached, &ul->head); 137 spin_unlock_bh(&ul->lock); 138 } 139 140 void rt6_uncached_list_del(struct rt6_info *rt) 141 { 142 if (!list_empty(&rt->rt6i_uncached)) { 143 struct uncached_list *ul = rt->rt6i_uncached_list; 144 struct net *net = dev_net(rt->dst.dev); 145 146 spin_lock_bh(&ul->lock); 147 list_del(&rt->rt6i_uncached); 148 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache); 149 spin_unlock_bh(&ul->lock); 150 } 151 } 152 153 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev) 154 { 155 struct net_device *loopback_dev = net->loopback_dev; 156 int cpu; 157 158 if (dev == loopback_dev) 159 return; 160 161 for_each_possible_cpu(cpu) { 162 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); 163 struct rt6_info *rt; 164 165 spin_lock_bh(&ul->lock); 166 list_for_each_entry(rt, &ul->head, rt6i_uncached) { 167 struct inet6_dev *rt_idev = rt->rt6i_idev; 168 struct net_device *rt_dev = rt->dst.dev; 169 170 if (rt_idev->dev == dev) { 171 rt->rt6i_idev = in6_dev_get(loopback_dev); 172 in6_dev_put(rt_idev); 173 } 174 175 if (rt_dev == dev) { 176 rt->dst.dev = loopback_dev; 177 dev_hold(rt->dst.dev); 178 dev_put(rt_dev); 179 } 180 } 181 spin_unlock_bh(&ul->lock); 182 } 183 } 184 185 static inline const void *choose_neigh_daddr(const struct in6_addr *p, 186 struct sk_buff *skb, 187 const void *daddr) 188 { 189 if (!ipv6_addr_any(p)) 190 return (const void *) p; 191 else if (skb) 192 return &ipv6_hdr(skb)->daddr; 193 return daddr; 194 } 195 196 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw, 197 struct net_device *dev, 198 struct sk_buff *skb, 199 const void *daddr) 200 { 201 struct neighbour *n; 202 203 daddr = choose_neigh_daddr(gw, skb, daddr); 204 n = __ipv6_neigh_lookup(dev, daddr); 205 if (n) 206 return n; 207 return neigh_create(&nd_tbl, daddr, dev); 208 } 209 210 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst, 211 struct sk_buff *skb, 212 const void *daddr) 213 { 214 const struct rt6_info *rt = container_of(dst, struct rt6_info, dst); 215 216 return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr); 217 } 218 219 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr) 220 { 221 struct net_device *dev = dst->dev; 222 struct rt6_info *rt = (struct rt6_info *)dst; 223 224 daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr); 225 if (!daddr) 226 return; 227 if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) 228 return; 229 if (ipv6_addr_is_multicast((const struct in6_addr *)daddr)) 230 return; 231 __ipv6_confirm_neigh(dev, daddr); 232 } 233 234 static struct dst_ops ip6_dst_ops_template = { 235 .family = AF_INET6, 236 .gc = ip6_dst_gc, 237 .gc_thresh = 1024, 238 .check = ip6_dst_check, 239 .default_advmss = ip6_default_advmss, 240 .mtu = ip6_mtu, 241 .cow_metrics = dst_cow_metrics_generic, 242 .destroy = ip6_dst_destroy, 243 .ifdown = ip6_dst_ifdown, 244 .negative_advice = ip6_negative_advice, 245 .link_failure = ip6_link_failure, 246 .update_pmtu = ip6_rt_update_pmtu, 247 .redirect = rt6_do_redirect, 248 .local_out = __ip6_local_out, 249 .neigh_lookup = ip6_dst_neigh_lookup, 250 .confirm_neigh = ip6_confirm_neigh, 251 }; 252 253 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst) 254 { 255 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); 256 257 return mtu ? : dst->dev->mtu; 258 } 259 260 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, 261 struct sk_buff *skb, u32 mtu) 262 { 263 } 264 265 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk, 266 struct sk_buff *skb) 267 { 268 } 269 270 static struct dst_ops ip6_dst_blackhole_ops = { 271 .family = AF_INET6, 272 .destroy = ip6_dst_destroy, 273 .check = ip6_dst_check, 274 .mtu = ip6_blackhole_mtu, 275 .default_advmss = ip6_default_advmss, 276 .update_pmtu = ip6_rt_blackhole_update_pmtu, 277 .redirect = ip6_rt_blackhole_redirect, 278 .cow_metrics = dst_cow_metrics_generic, 279 .neigh_lookup = ip6_dst_neigh_lookup, 280 }; 281 282 static const u32 ip6_template_metrics[RTAX_MAX] = { 283 [RTAX_HOPLIMIT - 1] = 0, 284 }; 285 286 static const struct fib6_info fib6_null_entry_template = { 287 .fib6_flags = (RTF_REJECT | RTF_NONEXTHOP), 288 .fib6_protocol = RTPROT_KERNEL, 289 .fib6_metric = ~(u32)0, 290 .fib6_ref = ATOMIC_INIT(1), 291 .fib6_type = RTN_UNREACHABLE, 292 .fib6_metrics = (struct dst_metrics *)&dst_default_metrics, 293 }; 294 295 static const struct rt6_info ip6_null_entry_template = { 296 .dst = { 297 .__refcnt = ATOMIC_INIT(1), 298 .__use = 1, 299 .obsolete = DST_OBSOLETE_FORCE_CHK, 300 .error = -ENETUNREACH, 301 .input = ip6_pkt_discard, 302 .output = ip6_pkt_discard_out, 303 }, 304 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), 305 }; 306 307 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 308 309 static const struct rt6_info ip6_prohibit_entry_template = { 310 .dst = { 311 .__refcnt = ATOMIC_INIT(1), 312 .__use = 1, 313 .obsolete = DST_OBSOLETE_FORCE_CHK, 314 .error = -EACCES, 315 .input = ip6_pkt_prohibit, 316 .output = ip6_pkt_prohibit_out, 317 }, 318 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), 319 }; 320 321 static const struct rt6_info ip6_blk_hole_entry_template = { 322 .dst = { 323 .__refcnt = ATOMIC_INIT(1), 324 .__use = 1, 325 .obsolete = DST_OBSOLETE_FORCE_CHK, 326 .error = -EINVAL, 327 .input = dst_discard, 328 .output = dst_discard_out, 329 }, 330 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), 331 }; 332 333 #endif 334 335 static void rt6_info_init(struct rt6_info *rt) 336 { 337 struct dst_entry *dst = &rt->dst; 338 339 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst)); 340 INIT_LIST_HEAD(&rt->rt6i_uncached); 341 } 342 343 /* allocate dst with ip6_dst_ops */ 344 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, 345 int flags) 346 { 347 struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev, 348 1, DST_OBSOLETE_FORCE_CHK, flags); 349 350 if (rt) { 351 rt6_info_init(rt); 352 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); 353 } 354 355 return rt; 356 } 357 EXPORT_SYMBOL(ip6_dst_alloc); 358 359 static void ip6_dst_destroy(struct dst_entry *dst) 360 { 361 struct rt6_info *rt = (struct rt6_info *)dst; 362 struct fib6_info *from; 363 struct inet6_dev *idev; 364 365 dst_destroy_metrics_generic(dst); 366 rt6_uncached_list_del(rt); 367 368 idev = rt->rt6i_idev; 369 if (idev) { 370 rt->rt6i_idev = NULL; 371 in6_dev_put(idev); 372 } 373 374 rcu_read_lock(); 375 from = rcu_dereference(rt->from); 376 rcu_assign_pointer(rt->from, NULL); 377 fib6_info_release(from); 378 rcu_read_unlock(); 379 } 380 381 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, 382 int how) 383 { 384 struct rt6_info *rt = (struct rt6_info *)dst; 385 struct inet6_dev *idev = rt->rt6i_idev; 386 struct net_device *loopback_dev = 387 dev_net(dev)->loopback_dev; 388 389 if (idev && idev->dev != loopback_dev) { 390 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev); 391 if (loopback_idev) { 392 rt->rt6i_idev = loopback_idev; 393 in6_dev_put(idev); 394 } 395 } 396 } 397 398 static bool __rt6_check_expired(const struct rt6_info *rt) 399 { 400 if (rt->rt6i_flags & RTF_EXPIRES) 401 return time_after(jiffies, rt->dst.expires); 402 else 403 return false; 404 } 405 406 static bool rt6_check_expired(const struct rt6_info *rt) 407 { 408 struct fib6_info *from; 409 410 from = rcu_dereference(rt->from); 411 412 if (rt->rt6i_flags & RTF_EXPIRES) { 413 if (time_after(jiffies, rt->dst.expires)) 414 return true; 415 } else if (from) { 416 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK || 417 fib6_check_expired(from); 418 } 419 return false; 420 } 421 422 static struct fib6_info *rt6_multipath_select(const struct net *net, 423 struct fib6_info *match, 424 struct flowi6 *fl6, int oif, 425 const struct sk_buff *skb, 426 int strict) 427 { 428 struct fib6_info *sibling, *next_sibling; 429 430 /* We might have already computed the hash for ICMPv6 errors. In such 431 * case it will always be non-zero. Otherwise now is the time to do it. 432 */ 433 if (!fl6->mp_hash) 434 fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL); 435 436 if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound)) 437 return match; 438 439 list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings, 440 fib6_siblings) { 441 int nh_upper_bound; 442 443 nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound); 444 if (fl6->mp_hash > nh_upper_bound) 445 continue; 446 if (rt6_score_route(sibling, oif, strict) < 0) 447 break; 448 match = sibling; 449 break; 450 } 451 452 return match; 453 } 454 455 /* 456 * Route lookup. rcu_read_lock() should be held. 457 */ 458 459 static inline struct fib6_info *rt6_device_match(struct net *net, 460 struct fib6_info *rt, 461 const struct in6_addr *saddr, 462 int oif, 463 int flags) 464 { 465 struct fib6_info *sprt; 466 467 if (!oif && ipv6_addr_any(saddr) && 468 !(rt->fib6_nh.nh_flags & RTNH_F_DEAD)) 469 return rt; 470 471 for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) { 472 const struct net_device *dev = sprt->fib6_nh.nh_dev; 473 474 if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD) 475 continue; 476 477 if (oif) { 478 if (dev->ifindex == oif) 479 return sprt; 480 } else { 481 if (ipv6_chk_addr(net, saddr, dev, 482 flags & RT6_LOOKUP_F_IFACE)) 483 return sprt; 484 } 485 } 486 487 if (oif && flags & RT6_LOOKUP_F_IFACE) 488 return net->ipv6.fib6_null_entry; 489 490 return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt; 491 } 492 493 #ifdef CONFIG_IPV6_ROUTER_PREF 494 struct __rt6_probe_work { 495 struct work_struct work; 496 struct in6_addr target; 497 struct net_device *dev; 498 }; 499 500 static void rt6_probe_deferred(struct work_struct *w) 501 { 502 struct in6_addr mcaddr; 503 struct __rt6_probe_work *work = 504 container_of(w, struct __rt6_probe_work, work); 505 506 addrconf_addr_solict_mult(&work->target, &mcaddr); 507 ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0); 508 dev_put(work->dev); 509 kfree(work); 510 } 511 512 static void rt6_probe(struct fib6_info *rt) 513 { 514 struct __rt6_probe_work *work; 515 const struct in6_addr *nh_gw; 516 struct neighbour *neigh; 517 struct net_device *dev; 518 519 /* 520 * Okay, this does not seem to be appropriate 521 * for now, however, we need to check if it 522 * is really so; aka Router Reachability Probing. 523 * 524 * Router Reachability Probe MUST be rate-limited 525 * to no more than one per minute. 526 */ 527 if (!rt || !(rt->fib6_flags & RTF_GATEWAY)) 528 return; 529 530 nh_gw = &rt->fib6_nh.nh_gw; 531 dev = rt->fib6_nh.nh_dev; 532 rcu_read_lock_bh(); 533 neigh = __ipv6_neigh_lookup_noref(dev, nh_gw); 534 if (neigh) { 535 struct inet6_dev *idev; 536 537 if (neigh->nud_state & NUD_VALID) 538 goto out; 539 540 idev = __in6_dev_get(dev); 541 work = NULL; 542 write_lock(&neigh->lock); 543 if (!(neigh->nud_state & NUD_VALID) && 544 time_after(jiffies, 545 neigh->updated + idev->cnf.rtr_probe_interval)) { 546 work = kmalloc(sizeof(*work), GFP_ATOMIC); 547 if (work) 548 __neigh_set_probe_once(neigh); 549 } 550 write_unlock(&neigh->lock); 551 } else { 552 work = kmalloc(sizeof(*work), GFP_ATOMIC); 553 } 554 555 if (work) { 556 INIT_WORK(&work->work, rt6_probe_deferred); 557 work->target = *nh_gw; 558 dev_hold(dev); 559 work->dev = dev; 560 schedule_work(&work->work); 561 } 562 563 out: 564 rcu_read_unlock_bh(); 565 } 566 #else 567 static inline void rt6_probe(struct fib6_info *rt) 568 { 569 } 570 #endif 571 572 /* 573 * Default Router Selection (RFC 2461 6.3.6) 574 */ 575 static inline int rt6_check_dev(struct fib6_info *rt, int oif) 576 { 577 const struct net_device *dev = rt->fib6_nh.nh_dev; 578 579 if (!oif || dev->ifindex == oif) 580 return 2; 581 return 0; 582 } 583 584 static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt) 585 { 586 enum rt6_nud_state ret = RT6_NUD_FAIL_HARD; 587 struct neighbour *neigh; 588 589 if (rt->fib6_flags & RTF_NONEXTHOP || 590 !(rt->fib6_flags & RTF_GATEWAY)) 591 return RT6_NUD_SUCCEED; 592 593 rcu_read_lock_bh(); 594 neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev, 595 &rt->fib6_nh.nh_gw); 596 if (neigh) { 597 read_lock(&neigh->lock); 598 if (neigh->nud_state & NUD_VALID) 599 ret = RT6_NUD_SUCCEED; 600 #ifdef CONFIG_IPV6_ROUTER_PREF 601 else if (!(neigh->nud_state & NUD_FAILED)) 602 ret = RT6_NUD_SUCCEED; 603 else 604 ret = RT6_NUD_FAIL_PROBE; 605 #endif 606 read_unlock(&neigh->lock); 607 } else { 608 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ? 609 RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR; 610 } 611 rcu_read_unlock_bh(); 612 613 return ret; 614 } 615 616 static int rt6_score_route(struct fib6_info *rt, int oif, int strict) 617 { 618 int m; 619 620 m = rt6_check_dev(rt, oif); 621 if (!m && (strict & RT6_LOOKUP_F_IFACE)) 622 return RT6_NUD_FAIL_HARD; 623 #ifdef CONFIG_IPV6_ROUTER_PREF 624 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2; 625 #endif 626 if (strict & RT6_LOOKUP_F_REACHABLE) { 627 int n = rt6_check_neigh(rt); 628 if (n < 0) 629 return n; 630 } 631 return m; 632 } 633 634 /* called with rc_read_lock held */ 635 static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i) 636 { 637 const struct net_device *dev = fib6_info_nh_dev(f6i); 638 bool rc = false; 639 640 if (dev) { 641 const struct inet6_dev *idev = __in6_dev_get(dev); 642 643 rc = !!idev->cnf.ignore_routes_with_linkdown; 644 } 645 646 return rc; 647 } 648 649 static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict, 650 int *mpri, struct fib6_info *match, 651 bool *do_rr) 652 { 653 int m; 654 bool match_do_rr = false; 655 656 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) 657 goto out; 658 659 if (fib6_ignore_linkdown(rt) && 660 rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN && 661 !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE)) 662 goto out; 663 664 if (fib6_check_expired(rt)) 665 goto out; 666 667 m = rt6_score_route(rt, oif, strict); 668 if (m == RT6_NUD_FAIL_DO_RR) { 669 match_do_rr = true; 670 m = 0; /* lowest valid score */ 671 } else if (m == RT6_NUD_FAIL_HARD) { 672 goto out; 673 } 674 675 if (strict & RT6_LOOKUP_F_REACHABLE) 676 rt6_probe(rt); 677 678 /* note that m can be RT6_NUD_FAIL_PROBE at this point */ 679 if (m > *mpri) { 680 *do_rr = match_do_rr; 681 *mpri = m; 682 match = rt; 683 } 684 out: 685 return match; 686 } 687 688 static struct fib6_info *find_rr_leaf(struct fib6_node *fn, 689 struct fib6_info *leaf, 690 struct fib6_info *rr_head, 691 u32 metric, int oif, int strict, 692 bool *do_rr) 693 { 694 struct fib6_info *rt, *match, *cont; 695 int mpri = -1; 696 697 match = NULL; 698 cont = NULL; 699 for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) { 700 if (rt->fib6_metric != metric) { 701 cont = rt; 702 break; 703 } 704 705 match = find_match(rt, oif, strict, &mpri, match, do_rr); 706 } 707 708 for (rt = leaf; rt && rt != rr_head; 709 rt = rcu_dereference(rt->rt6_next)) { 710 if (rt->fib6_metric != metric) { 711 cont = rt; 712 break; 713 } 714 715 match = find_match(rt, oif, strict, &mpri, match, do_rr); 716 } 717 718 if (match || !cont) 719 return match; 720 721 for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next)) 722 match = find_match(rt, oif, strict, &mpri, match, do_rr); 723 724 return match; 725 } 726 727 static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn, 728 int oif, int strict) 729 { 730 struct fib6_info *leaf = rcu_dereference(fn->leaf); 731 struct fib6_info *match, *rt0; 732 bool do_rr = false; 733 int key_plen; 734 735 if (!leaf || leaf == net->ipv6.fib6_null_entry) 736 return net->ipv6.fib6_null_entry; 737 738 rt0 = rcu_dereference(fn->rr_ptr); 739 if (!rt0) 740 rt0 = leaf; 741 742 /* Double check to make sure fn is not an intermediate node 743 * and fn->leaf does not points to its child's leaf 744 * (This might happen if all routes under fn are deleted from 745 * the tree and fib6_repair_tree() is called on the node.) 746 */ 747 key_plen = rt0->fib6_dst.plen; 748 #ifdef CONFIG_IPV6_SUBTREES 749 if (rt0->fib6_src.plen) 750 key_plen = rt0->fib6_src.plen; 751 #endif 752 if (fn->fn_bit != key_plen) 753 return net->ipv6.fib6_null_entry; 754 755 match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict, 756 &do_rr); 757 758 if (do_rr) { 759 struct fib6_info *next = rcu_dereference(rt0->rt6_next); 760 761 /* no entries matched; do round-robin */ 762 if (!next || next->fib6_metric != rt0->fib6_metric) 763 next = leaf; 764 765 if (next != rt0) { 766 spin_lock_bh(&leaf->fib6_table->tb6_lock); 767 /* make sure next is not being deleted from the tree */ 768 if (next->fib6_node) 769 rcu_assign_pointer(fn->rr_ptr, next); 770 spin_unlock_bh(&leaf->fib6_table->tb6_lock); 771 } 772 } 773 774 return match ? match : net->ipv6.fib6_null_entry; 775 } 776 777 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt) 778 { 779 return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY)); 780 } 781 782 #ifdef CONFIG_IPV6_ROUTE_INFO 783 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, 784 const struct in6_addr *gwaddr) 785 { 786 struct net *net = dev_net(dev); 787 struct route_info *rinfo = (struct route_info *) opt; 788 struct in6_addr prefix_buf, *prefix; 789 unsigned int pref; 790 unsigned long lifetime; 791 struct fib6_info *rt; 792 793 if (len < sizeof(struct route_info)) { 794 return -EINVAL; 795 } 796 797 /* Sanity check for prefix_len and length */ 798 if (rinfo->length > 3) { 799 return -EINVAL; 800 } else if (rinfo->prefix_len > 128) { 801 return -EINVAL; 802 } else if (rinfo->prefix_len > 64) { 803 if (rinfo->length < 2) { 804 return -EINVAL; 805 } 806 } else if (rinfo->prefix_len > 0) { 807 if (rinfo->length < 1) { 808 return -EINVAL; 809 } 810 } 811 812 pref = rinfo->route_pref; 813 if (pref == ICMPV6_ROUTER_PREF_INVALID) 814 return -EINVAL; 815 816 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ); 817 818 if (rinfo->length == 3) 819 prefix = (struct in6_addr *)rinfo->prefix; 820 else { 821 /* this function is safe */ 822 ipv6_addr_prefix(&prefix_buf, 823 (struct in6_addr *)rinfo->prefix, 824 rinfo->prefix_len); 825 prefix = &prefix_buf; 826 } 827 828 if (rinfo->prefix_len == 0) 829 rt = rt6_get_dflt_router(net, gwaddr, dev); 830 else 831 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, 832 gwaddr, dev); 833 834 if (rt && !lifetime) { 835 ip6_del_rt(net, rt); 836 rt = NULL; 837 } 838 839 if (!rt && lifetime) 840 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, 841 dev, pref); 842 else if (rt) 843 rt->fib6_flags = RTF_ROUTEINFO | 844 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); 845 846 if (rt) { 847 if (!addrconf_finite_timeout(lifetime)) 848 fib6_clean_expires(rt); 849 else 850 fib6_set_expires(rt, jiffies + HZ * lifetime); 851 852 fib6_info_release(rt); 853 } 854 return 0; 855 } 856 #endif 857 858 /* 859 * Misc support functions 860 */ 861 862 /* called with rcu_lock held */ 863 static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt) 864 { 865 struct net_device *dev = rt->fib6_nh.nh_dev; 866 867 if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) { 868 /* for copies of local routes, dst->dev needs to be the 869 * device if it is a master device, the master device if 870 * device is enslaved, and the loopback as the default 871 */ 872 if (netif_is_l3_slave(dev) && 873 !rt6_need_strict(&rt->fib6_dst.addr)) 874 dev = l3mdev_master_dev_rcu(dev); 875 else if (!netif_is_l3_master(dev)) 876 dev = dev_net(dev)->loopback_dev; 877 /* last case is netif_is_l3_master(dev) is true in which 878 * case we want dev returned to be dev 879 */ 880 } 881 882 return dev; 883 } 884 885 static const int fib6_prop[RTN_MAX + 1] = { 886 [RTN_UNSPEC] = 0, 887 [RTN_UNICAST] = 0, 888 [RTN_LOCAL] = 0, 889 [RTN_BROADCAST] = 0, 890 [RTN_ANYCAST] = 0, 891 [RTN_MULTICAST] = 0, 892 [RTN_BLACKHOLE] = -EINVAL, 893 [RTN_UNREACHABLE] = -EHOSTUNREACH, 894 [RTN_PROHIBIT] = -EACCES, 895 [RTN_THROW] = -EAGAIN, 896 [RTN_NAT] = -EINVAL, 897 [RTN_XRESOLVE] = -EINVAL, 898 }; 899 900 static int ip6_rt_type_to_error(u8 fib6_type) 901 { 902 return fib6_prop[fib6_type]; 903 } 904 905 static unsigned short fib6_info_dst_flags(struct fib6_info *rt) 906 { 907 unsigned short flags = 0; 908 909 if (rt->dst_nocount) 910 flags |= DST_NOCOUNT; 911 if (rt->dst_nopolicy) 912 flags |= DST_NOPOLICY; 913 if (rt->dst_host) 914 flags |= DST_HOST; 915 916 return flags; 917 } 918 919 static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort) 920 { 921 rt->dst.error = ip6_rt_type_to_error(ort->fib6_type); 922 923 switch (ort->fib6_type) { 924 case RTN_BLACKHOLE: 925 rt->dst.output = dst_discard_out; 926 rt->dst.input = dst_discard; 927 break; 928 case RTN_PROHIBIT: 929 rt->dst.output = ip6_pkt_prohibit_out; 930 rt->dst.input = ip6_pkt_prohibit; 931 break; 932 case RTN_THROW: 933 case RTN_UNREACHABLE: 934 default: 935 rt->dst.output = ip6_pkt_discard_out; 936 rt->dst.input = ip6_pkt_discard; 937 break; 938 } 939 } 940 941 static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort) 942 { 943 rt->dst.flags |= fib6_info_dst_flags(ort); 944 945 if (ort->fib6_flags & RTF_REJECT) { 946 ip6_rt_init_dst_reject(rt, ort); 947 return; 948 } 949 950 rt->dst.error = 0; 951 rt->dst.output = ip6_output; 952 953 if (ort->fib6_type == RTN_LOCAL) { 954 rt->dst.input = ip6_input; 955 } else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) { 956 rt->dst.input = ip6_mc_input; 957 } else { 958 rt->dst.input = ip6_forward; 959 } 960 961 if (ort->fib6_nh.nh_lwtstate) { 962 rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate); 963 lwtunnel_set_redirect(&rt->dst); 964 } 965 966 rt->dst.lastuse = jiffies; 967 } 968 969 static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from) 970 { 971 rt->rt6i_flags &= ~RTF_EXPIRES; 972 fib6_info_hold(from); 973 rcu_assign_pointer(rt->from, from); 974 dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true); 975 if (from->fib6_metrics != &dst_default_metrics) { 976 rt->dst._metrics |= DST_METRICS_REFCOUNTED; 977 refcount_inc(&from->fib6_metrics->refcnt); 978 } 979 } 980 981 static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort) 982 { 983 struct net_device *dev = fib6_info_nh_dev(ort); 984 985 ip6_rt_init_dst(rt, ort); 986 987 rt->rt6i_dst = ort->fib6_dst; 988 rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL; 989 rt->rt6i_gateway = ort->fib6_nh.nh_gw; 990 rt->rt6i_flags = ort->fib6_flags; 991 rt6_set_from(rt, ort); 992 #ifdef CONFIG_IPV6_SUBTREES 993 rt->rt6i_src = ort->fib6_src; 994 #endif 995 rt->rt6i_prefsrc = ort->fib6_prefsrc; 996 rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate); 997 } 998 999 static struct fib6_node* fib6_backtrack(struct fib6_node *fn, 1000 struct in6_addr *saddr) 1001 { 1002 struct fib6_node *pn, *sn; 1003 while (1) { 1004 if (fn->fn_flags & RTN_TL_ROOT) 1005 return NULL; 1006 pn = rcu_dereference(fn->parent); 1007 sn = FIB6_SUBTREE(pn); 1008 if (sn && sn != fn) 1009 fn = fib6_lookup(sn, NULL, saddr); 1010 else 1011 fn = pn; 1012 if (fn->fn_flags & RTN_RTINFO) 1013 return fn; 1014 } 1015 } 1016 1017 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt, 1018 bool null_fallback) 1019 { 1020 struct rt6_info *rt = *prt; 1021 1022 if (dst_hold_safe(&rt->dst)) 1023 return true; 1024 if (null_fallback) { 1025 rt = net->ipv6.ip6_null_entry; 1026 dst_hold(&rt->dst); 1027 } else { 1028 rt = NULL; 1029 } 1030 *prt = rt; 1031 return false; 1032 } 1033 1034 /* called with rcu_lock held */ 1035 static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt) 1036 { 1037 unsigned short flags = fib6_info_dst_flags(rt); 1038 struct net_device *dev = rt->fib6_nh.nh_dev; 1039 struct rt6_info *nrt; 1040 1041 nrt = ip6_dst_alloc(dev_net(dev), dev, flags); 1042 if (nrt) 1043 ip6_rt_copy_init(nrt, rt); 1044 1045 return nrt; 1046 } 1047 1048 static struct rt6_info *ip6_pol_route_lookup(struct net *net, 1049 struct fib6_table *table, 1050 struct flowi6 *fl6, 1051 const struct sk_buff *skb, 1052 int flags) 1053 { 1054 struct fib6_info *f6i; 1055 struct fib6_node *fn; 1056 struct rt6_info *rt; 1057 1058 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) 1059 flags &= ~RT6_LOOKUP_F_IFACE; 1060 1061 rcu_read_lock(); 1062 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 1063 restart: 1064 f6i = rcu_dereference(fn->leaf); 1065 if (!f6i) { 1066 f6i = net->ipv6.fib6_null_entry; 1067 } else { 1068 f6i = rt6_device_match(net, f6i, &fl6->saddr, 1069 fl6->flowi6_oif, flags); 1070 if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0) 1071 f6i = rt6_multipath_select(net, f6i, fl6, 1072 fl6->flowi6_oif, skb, flags); 1073 } 1074 if (f6i == net->ipv6.fib6_null_entry) { 1075 fn = fib6_backtrack(fn, &fl6->saddr); 1076 if (fn) 1077 goto restart; 1078 } 1079 1080 /* Search through exception table */ 1081 rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr); 1082 if (rt) { 1083 if (ip6_hold_safe(net, &rt, true)) 1084 dst_use_noref(&rt->dst, jiffies); 1085 } else if (f6i == net->ipv6.fib6_null_entry) { 1086 rt = net->ipv6.ip6_null_entry; 1087 dst_hold(&rt->dst); 1088 } else { 1089 rt = ip6_create_rt_rcu(f6i); 1090 if (!rt) { 1091 rt = net->ipv6.ip6_null_entry; 1092 dst_hold(&rt->dst); 1093 } 1094 } 1095 1096 rcu_read_unlock(); 1097 1098 trace_fib6_table_lookup(net, rt, table, fl6); 1099 1100 return rt; 1101 } 1102 1103 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6, 1104 const struct sk_buff *skb, int flags) 1105 { 1106 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup); 1107 } 1108 EXPORT_SYMBOL_GPL(ip6_route_lookup); 1109 1110 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, 1111 const struct in6_addr *saddr, int oif, 1112 const struct sk_buff *skb, int strict) 1113 { 1114 struct flowi6 fl6 = { 1115 .flowi6_oif = oif, 1116 .daddr = *daddr, 1117 }; 1118 struct dst_entry *dst; 1119 int flags = strict ? RT6_LOOKUP_F_IFACE : 0; 1120 1121 if (saddr) { 1122 memcpy(&fl6.saddr, saddr, sizeof(*saddr)); 1123 flags |= RT6_LOOKUP_F_HAS_SADDR; 1124 } 1125 1126 dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup); 1127 if (dst->error == 0) 1128 return (struct rt6_info *) dst; 1129 1130 dst_release(dst); 1131 1132 return NULL; 1133 } 1134 EXPORT_SYMBOL(rt6_lookup); 1135 1136 /* ip6_ins_rt is called with FREE table->tb6_lock. 1137 * It takes new route entry, the addition fails by any reason the 1138 * route is released. 1139 * Caller must hold dst before calling it. 1140 */ 1141 1142 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info, 1143 struct netlink_ext_ack *extack) 1144 { 1145 int err; 1146 struct fib6_table *table; 1147 1148 table = rt->fib6_table; 1149 spin_lock_bh(&table->tb6_lock); 1150 err = fib6_add(&table->tb6_root, rt, info, extack); 1151 spin_unlock_bh(&table->tb6_lock); 1152 1153 return err; 1154 } 1155 1156 int ip6_ins_rt(struct net *net, struct fib6_info *rt) 1157 { 1158 struct nl_info info = { .nl_net = net, }; 1159 1160 return __ip6_ins_rt(rt, &info, NULL); 1161 } 1162 1163 static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort, 1164 const struct in6_addr *daddr, 1165 const struct in6_addr *saddr) 1166 { 1167 struct net_device *dev; 1168 struct rt6_info *rt; 1169 1170 /* 1171 * Clone the route. 1172 */ 1173 1174 dev = ip6_rt_get_dev_rcu(ort); 1175 rt = ip6_dst_alloc(dev_net(dev), dev, 0); 1176 if (!rt) 1177 return NULL; 1178 1179 ip6_rt_copy_init(rt, ort); 1180 rt->rt6i_flags |= RTF_CACHE; 1181 rt->dst.flags |= DST_HOST; 1182 rt->rt6i_dst.addr = *daddr; 1183 rt->rt6i_dst.plen = 128; 1184 1185 if (!rt6_is_gw_or_nonexthop(ort)) { 1186 if (ort->fib6_dst.plen != 128 && 1187 ipv6_addr_equal(&ort->fib6_dst.addr, daddr)) 1188 rt->rt6i_flags |= RTF_ANYCAST; 1189 #ifdef CONFIG_IPV6_SUBTREES 1190 if (rt->rt6i_src.plen && saddr) { 1191 rt->rt6i_src.addr = *saddr; 1192 rt->rt6i_src.plen = 128; 1193 } 1194 #endif 1195 } 1196 1197 return rt; 1198 } 1199 1200 static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt) 1201 { 1202 unsigned short flags = fib6_info_dst_flags(rt); 1203 struct net_device *dev; 1204 struct rt6_info *pcpu_rt; 1205 1206 rcu_read_lock(); 1207 dev = ip6_rt_get_dev_rcu(rt); 1208 pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags); 1209 rcu_read_unlock(); 1210 if (!pcpu_rt) 1211 return NULL; 1212 ip6_rt_copy_init(pcpu_rt, rt); 1213 pcpu_rt->rt6i_flags |= RTF_PCPU; 1214 return pcpu_rt; 1215 } 1216 1217 /* It should be called with rcu_read_lock() acquired */ 1218 static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt) 1219 { 1220 struct rt6_info *pcpu_rt, **p; 1221 1222 p = this_cpu_ptr(rt->rt6i_pcpu); 1223 pcpu_rt = *p; 1224 1225 if (pcpu_rt) 1226 ip6_hold_safe(NULL, &pcpu_rt, false); 1227 1228 return pcpu_rt; 1229 } 1230 1231 static struct rt6_info *rt6_make_pcpu_route(struct net *net, 1232 struct fib6_info *rt) 1233 { 1234 struct rt6_info *pcpu_rt, *prev, **p; 1235 1236 pcpu_rt = ip6_rt_pcpu_alloc(rt); 1237 if (!pcpu_rt) { 1238 dst_hold(&net->ipv6.ip6_null_entry->dst); 1239 return net->ipv6.ip6_null_entry; 1240 } 1241 1242 dst_hold(&pcpu_rt->dst); 1243 p = this_cpu_ptr(rt->rt6i_pcpu); 1244 prev = cmpxchg(p, NULL, pcpu_rt); 1245 BUG_ON(prev); 1246 1247 return pcpu_rt; 1248 } 1249 1250 /* exception hash table implementation 1251 */ 1252 static DEFINE_SPINLOCK(rt6_exception_lock); 1253 1254 /* Remove rt6_ex from hash table and free the memory 1255 * Caller must hold rt6_exception_lock 1256 */ 1257 static void rt6_remove_exception(struct rt6_exception_bucket *bucket, 1258 struct rt6_exception *rt6_ex) 1259 { 1260 struct net *net; 1261 1262 if (!bucket || !rt6_ex) 1263 return; 1264 1265 net = dev_net(rt6_ex->rt6i->dst.dev); 1266 hlist_del_rcu(&rt6_ex->hlist); 1267 dst_release(&rt6_ex->rt6i->dst); 1268 kfree_rcu(rt6_ex, rcu); 1269 WARN_ON_ONCE(!bucket->depth); 1270 bucket->depth--; 1271 net->ipv6.rt6_stats->fib_rt_cache--; 1272 } 1273 1274 /* Remove oldest rt6_ex in bucket and free the memory 1275 * Caller must hold rt6_exception_lock 1276 */ 1277 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket) 1278 { 1279 struct rt6_exception *rt6_ex, *oldest = NULL; 1280 1281 if (!bucket) 1282 return; 1283 1284 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { 1285 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp)) 1286 oldest = rt6_ex; 1287 } 1288 rt6_remove_exception(bucket, oldest); 1289 } 1290 1291 static u32 rt6_exception_hash(const struct in6_addr *dst, 1292 const struct in6_addr *src) 1293 { 1294 static u32 seed __read_mostly; 1295 u32 val; 1296 1297 net_get_random_once(&seed, sizeof(seed)); 1298 val = jhash(dst, sizeof(*dst), seed); 1299 1300 #ifdef CONFIG_IPV6_SUBTREES 1301 if (src) 1302 val = jhash(src, sizeof(*src), val); 1303 #endif 1304 return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT); 1305 } 1306 1307 /* Helper function to find the cached rt in the hash table 1308 * and update bucket pointer to point to the bucket for this 1309 * (daddr, saddr) pair 1310 * Caller must hold rt6_exception_lock 1311 */ 1312 static struct rt6_exception * 1313 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket, 1314 const struct in6_addr *daddr, 1315 const struct in6_addr *saddr) 1316 { 1317 struct rt6_exception *rt6_ex; 1318 u32 hval; 1319 1320 if (!(*bucket) || !daddr) 1321 return NULL; 1322 1323 hval = rt6_exception_hash(daddr, saddr); 1324 *bucket += hval; 1325 1326 hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) { 1327 struct rt6_info *rt6 = rt6_ex->rt6i; 1328 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr); 1329 1330 #ifdef CONFIG_IPV6_SUBTREES 1331 if (matched && saddr) 1332 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr); 1333 #endif 1334 if (matched) 1335 return rt6_ex; 1336 } 1337 return NULL; 1338 } 1339 1340 /* Helper function to find the cached rt in the hash table 1341 * and update bucket pointer to point to the bucket for this 1342 * (daddr, saddr) pair 1343 * Caller must hold rcu_read_lock() 1344 */ 1345 static struct rt6_exception * 1346 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket, 1347 const struct in6_addr *daddr, 1348 const struct in6_addr *saddr) 1349 { 1350 struct rt6_exception *rt6_ex; 1351 u32 hval; 1352 1353 WARN_ON_ONCE(!rcu_read_lock_held()); 1354 1355 if (!(*bucket) || !daddr) 1356 return NULL; 1357 1358 hval = rt6_exception_hash(daddr, saddr); 1359 *bucket += hval; 1360 1361 hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) { 1362 struct rt6_info *rt6 = rt6_ex->rt6i; 1363 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr); 1364 1365 #ifdef CONFIG_IPV6_SUBTREES 1366 if (matched && saddr) 1367 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr); 1368 #endif 1369 if (matched) 1370 return rt6_ex; 1371 } 1372 return NULL; 1373 } 1374 1375 static unsigned int fib6_mtu(const struct fib6_info *rt) 1376 { 1377 unsigned int mtu; 1378 1379 if (rt->fib6_pmtu) { 1380 mtu = rt->fib6_pmtu; 1381 } else { 1382 struct net_device *dev = fib6_info_nh_dev(rt); 1383 struct inet6_dev *idev; 1384 1385 rcu_read_lock(); 1386 idev = __in6_dev_get(dev); 1387 mtu = idev->cnf.mtu6; 1388 rcu_read_unlock(); 1389 } 1390 1391 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 1392 1393 return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu); 1394 } 1395 1396 static int rt6_insert_exception(struct rt6_info *nrt, 1397 struct fib6_info *ort) 1398 { 1399 struct net *net = dev_net(nrt->dst.dev); 1400 struct rt6_exception_bucket *bucket; 1401 struct in6_addr *src_key = NULL; 1402 struct rt6_exception *rt6_ex; 1403 int err = 0; 1404 1405 spin_lock_bh(&rt6_exception_lock); 1406 1407 if (ort->exception_bucket_flushed) { 1408 err = -EINVAL; 1409 goto out; 1410 } 1411 1412 bucket = rcu_dereference_protected(ort->rt6i_exception_bucket, 1413 lockdep_is_held(&rt6_exception_lock)); 1414 if (!bucket) { 1415 bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket), 1416 GFP_ATOMIC); 1417 if (!bucket) { 1418 err = -ENOMEM; 1419 goto out; 1420 } 1421 rcu_assign_pointer(ort->rt6i_exception_bucket, bucket); 1422 } 1423 1424 #ifdef CONFIG_IPV6_SUBTREES 1425 /* rt6i_src.plen != 0 indicates ort is in subtree 1426 * and exception table is indexed by a hash of 1427 * both rt6i_dst and rt6i_src. 1428 * Otherwise, the exception table is indexed by 1429 * a hash of only rt6i_dst. 1430 */ 1431 if (ort->fib6_src.plen) 1432 src_key = &nrt->rt6i_src.addr; 1433 #endif 1434 1435 /* Update rt6i_prefsrc as it could be changed 1436 * in rt6_remove_prefsrc() 1437 */ 1438 nrt->rt6i_prefsrc = ort->fib6_prefsrc; 1439 /* rt6_mtu_change() might lower mtu on ort. 1440 * Only insert this exception route if its mtu 1441 * is less than ort's mtu value. 1442 */ 1443 if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) { 1444 err = -EINVAL; 1445 goto out; 1446 } 1447 1448 rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr, 1449 src_key); 1450 if (rt6_ex) 1451 rt6_remove_exception(bucket, rt6_ex); 1452 1453 rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC); 1454 if (!rt6_ex) { 1455 err = -ENOMEM; 1456 goto out; 1457 } 1458 rt6_ex->rt6i = nrt; 1459 rt6_ex->stamp = jiffies; 1460 hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain); 1461 bucket->depth++; 1462 net->ipv6.rt6_stats->fib_rt_cache++; 1463 1464 if (bucket->depth > FIB6_MAX_DEPTH) 1465 rt6_exception_remove_oldest(bucket); 1466 1467 out: 1468 spin_unlock_bh(&rt6_exception_lock); 1469 1470 /* Update fn->fn_sernum to invalidate all cached dst */ 1471 if (!err) { 1472 spin_lock_bh(&ort->fib6_table->tb6_lock); 1473 fib6_update_sernum(net, ort); 1474 spin_unlock_bh(&ort->fib6_table->tb6_lock); 1475 fib6_force_start_gc(net); 1476 } 1477 1478 return err; 1479 } 1480 1481 void rt6_flush_exceptions(struct fib6_info *rt) 1482 { 1483 struct rt6_exception_bucket *bucket; 1484 struct rt6_exception *rt6_ex; 1485 struct hlist_node *tmp; 1486 int i; 1487 1488 spin_lock_bh(&rt6_exception_lock); 1489 /* Prevent rt6_insert_exception() to recreate the bucket list */ 1490 rt->exception_bucket_flushed = 1; 1491 1492 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1493 lockdep_is_held(&rt6_exception_lock)); 1494 if (!bucket) 1495 goto out; 1496 1497 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { 1498 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) 1499 rt6_remove_exception(bucket, rt6_ex); 1500 WARN_ON_ONCE(bucket->depth); 1501 bucket++; 1502 } 1503 1504 out: 1505 spin_unlock_bh(&rt6_exception_lock); 1506 } 1507 1508 /* Find cached rt in the hash table inside passed in rt 1509 * Caller has to hold rcu_read_lock() 1510 */ 1511 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt, 1512 struct in6_addr *daddr, 1513 struct in6_addr *saddr) 1514 { 1515 struct rt6_exception_bucket *bucket; 1516 struct in6_addr *src_key = NULL; 1517 struct rt6_exception *rt6_ex; 1518 struct rt6_info *res = NULL; 1519 1520 bucket = rcu_dereference(rt->rt6i_exception_bucket); 1521 1522 #ifdef CONFIG_IPV6_SUBTREES 1523 /* rt6i_src.plen != 0 indicates rt is in subtree 1524 * and exception table is indexed by a hash of 1525 * both rt6i_dst and rt6i_src. 1526 * Otherwise, the exception table is indexed by 1527 * a hash of only rt6i_dst. 1528 */ 1529 if (rt->fib6_src.plen) 1530 src_key = saddr; 1531 #endif 1532 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); 1533 1534 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i)) 1535 res = rt6_ex->rt6i; 1536 1537 return res; 1538 } 1539 1540 /* Remove the passed in cached rt from the hash table that contains it */ 1541 static int rt6_remove_exception_rt(struct rt6_info *rt) 1542 { 1543 struct rt6_exception_bucket *bucket; 1544 struct in6_addr *src_key = NULL; 1545 struct rt6_exception *rt6_ex; 1546 struct fib6_info *from; 1547 int err; 1548 1549 from = rcu_dereference_protected(rt->from, 1550 lockdep_is_held(&rt6_exception_lock)); 1551 if (!from || 1552 !(rt->rt6i_flags & RTF_CACHE)) 1553 return -EINVAL; 1554 1555 if (!rcu_access_pointer(from->rt6i_exception_bucket)) 1556 return -ENOENT; 1557 1558 spin_lock_bh(&rt6_exception_lock); 1559 bucket = rcu_dereference_protected(from->rt6i_exception_bucket, 1560 lockdep_is_held(&rt6_exception_lock)); 1561 #ifdef CONFIG_IPV6_SUBTREES 1562 /* rt6i_src.plen != 0 indicates 'from' is in subtree 1563 * and exception table is indexed by a hash of 1564 * both rt6i_dst and rt6i_src. 1565 * Otherwise, the exception table is indexed by 1566 * a hash of only rt6i_dst. 1567 */ 1568 if (from->fib6_src.plen) 1569 src_key = &rt->rt6i_src.addr; 1570 #endif 1571 rt6_ex = __rt6_find_exception_spinlock(&bucket, 1572 &rt->rt6i_dst.addr, 1573 src_key); 1574 if (rt6_ex) { 1575 rt6_remove_exception(bucket, rt6_ex); 1576 err = 0; 1577 } else { 1578 err = -ENOENT; 1579 } 1580 1581 spin_unlock_bh(&rt6_exception_lock); 1582 return err; 1583 } 1584 1585 /* Find rt6_ex which contains the passed in rt cache and 1586 * refresh its stamp 1587 */ 1588 static void rt6_update_exception_stamp_rt(struct rt6_info *rt) 1589 { 1590 struct rt6_exception_bucket *bucket; 1591 struct fib6_info *from = rt->from; 1592 struct in6_addr *src_key = NULL; 1593 struct rt6_exception *rt6_ex; 1594 1595 if (!from || 1596 !(rt->rt6i_flags & RTF_CACHE)) 1597 return; 1598 1599 rcu_read_lock(); 1600 bucket = rcu_dereference(from->rt6i_exception_bucket); 1601 1602 #ifdef CONFIG_IPV6_SUBTREES 1603 /* rt6i_src.plen != 0 indicates 'from' is in subtree 1604 * and exception table is indexed by a hash of 1605 * both rt6i_dst and rt6i_src. 1606 * Otherwise, the exception table is indexed by 1607 * a hash of only rt6i_dst. 1608 */ 1609 if (from->fib6_src.plen) 1610 src_key = &rt->rt6i_src.addr; 1611 #endif 1612 rt6_ex = __rt6_find_exception_rcu(&bucket, 1613 &rt->rt6i_dst.addr, 1614 src_key); 1615 if (rt6_ex) 1616 rt6_ex->stamp = jiffies; 1617 1618 rcu_read_unlock(); 1619 } 1620 1621 static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt) 1622 { 1623 struct rt6_exception_bucket *bucket; 1624 struct rt6_exception *rt6_ex; 1625 int i; 1626 1627 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1628 lockdep_is_held(&rt6_exception_lock)); 1629 1630 if (bucket) { 1631 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { 1632 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { 1633 rt6_ex->rt6i->rt6i_prefsrc.plen = 0; 1634 } 1635 bucket++; 1636 } 1637 } 1638 } 1639 1640 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev, 1641 struct rt6_info *rt, int mtu) 1642 { 1643 /* If the new MTU is lower than the route PMTU, this new MTU will be the 1644 * lowest MTU in the path: always allow updating the route PMTU to 1645 * reflect PMTU decreases. 1646 * 1647 * If the new MTU is higher, and the route PMTU is equal to the local 1648 * MTU, this means the old MTU is the lowest in the path, so allow 1649 * updating it: if other nodes now have lower MTUs, PMTU discovery will 1650 * handle this. 1651 */ 1652 1653 if (dst_mtu(&rt->dst) >= mtu) 1654 return true; 1655 1656 if (dst_mtu(&rt->dst) == idev->cnf.mtu6) 1657 return true; 1658 1659 return false; 1660 } 1661 1662 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev, 1663 struct fib6_info *rt, int mtu) 1664 { 1665 struct rt6_exception_bucket *bucket; 1666 struct rt6_exception *rt6_ex; 1667 int i; 1668 1669 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1670 lockdep_is_held(&rt6_exception_lock)); 1671 1672 if (!bucket) 1673 return; 1674 1675 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { 1676 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { 1677 struct rt6_info *entry = rt6_ex->rt6i; 1678 1679 /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected 1680 * route), the metrics of its rt->from have already 1681 * been updated. 1682 */ 1683 if (dst_metric_raw(&entry->dst, RTAX_MTU) && 1684 rt6_mtu_change_route_allowed(idev, entry, mtu)) 1685 dst_metric_set(&entry->dst, RTAX_MTU, mtu); 1686 } 1687 bucket++; 1688 } 1689 } 1690 1691 #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE) 1692 1693 static void rt6_exceptions_clean_tohost(struct fib6_info *rt, 1694 struct in6_addr *gateway) 1695 { 1696 struct rt6_exception_bucket *bucket; 1697 struct rt6_exception *rt6_ex; 1698 struct hlist_node *tmp; 1699 int i; 1700 1701 if (!rcu_access_pointer(rt->rt6i_exception_bucket)) 1702 return; 1703 1704 spin_lock_bh(&rt6_exception_lock); 1705 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1706 lockdep_is_held(&rt6_exception_lock)); 1707 1708 if (bucket) { 1709 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { 1710 hlist_for_each_entry_safe(rt6_ex, tmp, 1711 &bucket->chain, hlist) { 1712 struct rt6_info *entry = rt6_ex->rt6i; 1713 1714 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) == 1715 RTF_CACHE_GATEWAY && 1716 ipv6_addr_equal(gateway, 1717 &entry->rt6i_gateway)) { 1718 rt6_remove_exception(bucket, rt6_ex); 1719 } 1720 } 1721 bucket++; 1722 } 1723 } 1724 1725 spin_unlock_bh(&rt6_exception_lock); 1726 } 1727 1728 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket, 1729 struct rt6_exception *rt6_ex, 1730 struct fib6_gc_args *gc_args, 1731 unsigned long now) 1732 { 1733 struct rt6_info *rt = rt6_ex->rt6i; 1734 1735 /* we are pruning and obsoleting aged-out and non gateway exceptions 1736 * even if others have still references to them, so that on next 1737 * dst_check() such references can be dropped. 1738 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when 1739 * expired, independently from their aging, as per RFC 8201 section 4 1740 */ 1741 if (!(rt->rt6i_flags & RTF_EXPIRES)) { 1742 if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) { 1743 RT6_TRACE("aging clone %p\n", rt); 1744 rt6_remove_exception(bucket, rt6_ex); 1745 return; 1746 } 1747 } else if (time_after(jiffies, rt->dst.expires)) { 1748 RT6_TRACE("purging expired route %p\n", rt); 1749 rt6_remove_exception(bucket, rt6_ex); 1750 return; 1751 } 1752 1753 if (rt->rt6i_flags & RTF_GATEWAY) { 1754 struct neighbour *neigh; 1755 __u8 neigh_flags = 0; 1756 1757 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway); 1758 if (neigh) 1759 neigh_flags = neigh->flags; 1760 1761 if (!(neigh_flags & NTF_ROUTER)) { 1762 RT6_TRACE("purging route %p via non-router but gateway\n", 1763 rt); 1764 rt6_remove_exception(bucket, rt6_ex); 1765 return; 1766 } 1767 } 1768 1769 gc_args->more++; 1770 } 1771 1772 void rt6_age_exceptions(struct fib6_info *rt, 1773 struct fib6_gc_args *gc_args, 1774 unsigned long now) 1775 { 1776 struct rt6_exception_bucket *bucket; 1777 struct rt6_exception *rt6_ex; 1778 struct hlist_node *tmp; 1779 int i; 1780 1781 if (!rcu_access_pointer(rt->rt6i_exception_bucket)) 1782 return; 1783 1784 rcu_read_lock_bh(); 1785 spin_lock(&rt6_exception_lock); 1786 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1787 lockdep_is_held(&rt6_exception_lock)); 1788 1789 if (bucket) { 1790 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { 1791 hlist_for_each_entry_safe(rt6_ex, tmp, 1792 &bucket->chain, hlist) { 1793 rt6_age_examine_exception(bucket, rt6_ex, 1794 gc_args, now); 1795 } 1796 bucket++; 1797 } 1798 } 1799 spin_unlock(&rt6_exception_lock); 1800 rcu_read_unlock_bh(); 1801 } 1802 1803 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, 1804 int oif, struct flowi6 *fl6, 1805 const struct sk_buff *skb, int flags) 1806 { 1807 struct fib6_node *fn, *saved_fn; 1808 struct fib6_info *f6i; 1809 struct rt6_info *rt; 1810 int strict = 0; 1811 1812 strict |= flags & RT6_LOOKUP_F_IFACE; 1813 strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE; 1814 if (net->ipv6.devconf_all->forwarding == 0) 1815 strict |= RT6_LOOKUP_F_REACHABLE; 1816 1817 rcu_read_lock(); 1818 1819 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 1820 saved_fn = fn; 1821 1822 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) 1823 oif = 0; 1824 1825 redo_rt6_select: 1826 f6i = rt6_select(net, fn, oif, strict); 1827 if (f6i->fib6_nsiblings) 1828 f6i = rt6_multipath_select(net, f6i, fl6, oif, skb, strict); 1829 if (f6i == net->ipv6.fib6_null_entry) { 1830 fn = fib6_backtrack(fn, &fl6->saddr); 1831 if (fn) 1832 goto redo_rt6_select; 1833 else if (strict & RT6_LOOKUP_F_REACHABLE) { 1834 /* also consider unreachable route */ 1835 strict &= ~RT6_LOOKUP_F_REACHABLE; 1836 fn = saved_fn; 1837 goto redo_rt6_select; 1838 } 1839 } 1840 1841 if (f6i == net->ipv6.fib6_null_entry) { 1842 rt = net->ipv6.ip6_null_entry; 1843 rcu_read_unlock(); 1844 dst_hold(&rt->dst); 1845 trace_fib6_table_lookup(net, rt, table, fl6); 1846 return rt; 1847 } 1848 1849 /*Search through exception table */ 1850 rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr); 1851 if (rt) { 1852 if (ip6_hold_safe(net, &rt, true)) 1853 dst_use_noref(&rt->dst, jiffies); 1854 1855 rcu_read_unlock(); 1856 trace_fib6_table_lookup(net, rt, table, fl6); 1857 return rt; 1858 } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && 1859 !(f6i->fib6_flags & RTF_GATEWAY))) { 1860 /* Create a RTF_CACHE clone which will not be 1861 * owned by the fib6 tree. It is for the special case where 1862 * the daddr in the skb during the neighbor look-up is different 1863 * from the fl6->daddr used to look-up route here. 1864 */ 1865 struct rt6_info *uncached_rt; 1866 1867 uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL); 1868 1869 rcu_read_unlock(); 1870 1871 if (uncached_rt) { 1872 /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc() 1873 * No need for another dst_hold() 1874 */ 1875 rt6_uncached_list_add(uncached_rt); 1876 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); 1877 } else { 1878 uncached_rt = net->ipv6.ip6_null_entry; 1879 dst_hold(&uncached_rt->dst); 1880 } 1881 1882 trace_fib6_table_lookup(net, uncached_rt, table, fl6); 1883 return uncached_rt; 1884 1885 } else { 1886 /* Get a percpu copy */ 1887 1888 struct rt6_info *pcpu_rt; 1889 1890 local_bh_disable(); 1891 pcpu_rt = rt6_get_pcpu_route(f6i); 1892 1893 if (!pcpu_rt) 1894 pcpu_rt = rt6_make_pcpu_route(net, f6i); 1895 1896 local_bh_enable(); 1897 rcu_read_unlock(); 1898 trace_fib6_table_lookup(net, pcpu_rt, table, fl6); 1899 return pcpu_rt; 1900 } 1901 } 1902 EXPORT_SYMBOL_GPL(ip6_pol_route); 1903 1904 static struct rt6_info *ip6_pol_route_input(struct net *net, 1905 struct fib6_table *table, 1906 struct flowi6 *fl6, 1907 const struct sk_buff *skb, 1908 int flags) 1909 { 1910 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags); 1911 } 1912 1913 struct dst_entry *ip6_route_input_lookup(struct net *net, 1914 struct net_device *dev, 1915 struct flowi6 *fl6, 1916 const struct sk_buff *skb, 1917 int flags) 1918 { 1919 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG) 1920 flags |= RT6_LOOKUP_F_IFACE; 1921 1922 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input); 1923 } 1924 EXPORT_SYMBOL_GPL(ip6_route_input_lookup); 1925 1926 static void ip6_multipath_l3_keys(const struct sk_buff *skb, 1927 struct flow_keys *keys, 1928 struct flow_keys *flkeys) 1929 { 1930 const struct ipv6hdr *outer_iph = ipv6_hdr(skb); 1931 const struct ipv6hdr *key_iph = outer_iph; 1932 struct flow_keys *_flkeys = flkeys; 1933 const struct ipv6hdr *inner_iph; 1934 const struct icmp6hdr *icmph; 1935 struct ipv6hdr _inner_iph; 1936 1937 if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6)) 1938 goto out; 1939 1940 icmph = icmp6_hdr(skb); 1941 if (icmph->icmp6_type != ICMPV6_DEST_UNREACH && 1942 icmph->icmp6_type != ICMPV6_PKT_TOOBIG && 1943 icmph->icmp6_type != ICMPV6_TIME_EXCEED && 1944 icmph->icmp6_type != ICMPV6_PARAMPROB) 1945 goto out; 1946 1947 inner_iph = skb_header_pointer(skb, 1948 skb_transport_offset(skb) + sizeof(*icmph), 1949 sizeof(_inner_iph), &_inner_iph); 1950 if (!inner_iph) 1951 goto out; 1952 1953 key_iph = inner_iph; 1954 _flkeys = NULL; 1955 out: 1956 if (_flkeys) { 1957 keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src; 1958 keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst; 1959 keys->tags.flow_label = _flkeys->tags.flow_label; 1960 keys->basic.ip_proto = _flkeys->basic.ip_proto; 1961 } else { 1962 keys->addrs.v6addrs.src = key_iph->saddr; 1963 keys->addrs.v6addrs.dst = key_iph->daddr; 1964 keys->tags.flow_label = ip6_flowinfo(key_iph); 1965 keys->basic.ip_proto = key_iph->nexthdr; 1966 } 1967 } 1968 1969 /* if skb is set it will be used and fl6 can be NULL */ 1970 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, 1971 const struct sk_buff *skb, struct flow_keys *flkeys) 1972 { 1973 struct flow_keys hash_keys; 1974 u32 mhash; 1975 1976 switch (ip6_multipath_hash_policy(net)) { 1977 case 0: 1978 memset(&hash_keys, 0, sizeof(hash_keys)); 1979 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 1980 if (skb) { 1981 ip6_multipath_l3_keys(skb, &hash_keys, flkeys); 1982 } else { 1983 hash_keys.addrs.v6addrs.src = fl6->saddr; 1984 hash_keys.addrs.v6addrs.dst = fl6->daddr; 1985 hash_keys.tags.flow_label = (__force u32)fl6->flowlabel; 1986 hash_keys.basic.ip_proto = fl6->flowi6_proto; 1987 } 1988 break; 1989 case 1: 1990 if (skb) { 1991 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP; 1992 struct flow_keys keys; 1993 1994 /* short-circuit if we already have L4 hash present */ 1995 if (skb->l4_hash) 1996 return skb_get_hash_raw(skb) >> 1; 1997 1998 memset(&hash_keys, 0, sizeof(hash_keys)); 1999 2000 if (!flkeys) { 2001 skb_flow_dissect_flow_keys(skb, &keys, flag); 2002 flkeys = &keys; 2003 } 2004 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2005 hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src; 2006 hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst; 2007 hash_keys.ports.src = flkeys->ports.src; 2008 hash_keys.ports.dst = flkeys->ports.dst; 2009 hash_keys.basic.ip_proto = flkeys->basic.ip_proto; 2010 } else { 2011 memset(&hash_keys, 0, sizeof(hash_keys)); 2012 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2013 hash_keys.addrs.v6addrs.src = fl6->saddr; 2014 hash_keys.addrs.v6addrs.dst = fl6->daddr; 2015 hash_keys.ports.src = fl6->fl6_sport; 2016 hash_keys.ports.dst = fl6->fl6_dport; 2017 hash_keys.basic.ip_proto = fl6->flowi6_proto; 2018 } 2019 break; 2020 } 2021 mhash = flow_hash_from_keys(&hash_keys); 2022 2023 return mhash >> 1; 2024 } 2025 2026 void ip6_route_input(struct sk_buff *skb) 2027 { 2028 const struct ipv6hdr *iph = ipv6_hdr(skb); 2029 struct net *net = dev_net(skb->dev); 2030 int flags = RT6_LOOKUP_F_HAS_SADDR; 2031 struct ip_tunnel_info *tun_info; 2032 struct flowi6 fl6 = { 2033 .flowi6_iif = skb->dev->ifindex, 2034 .daddr = iph->daddr, 2035 .saddr = iph->saddr, 2036 .flowlabel = ip6_flowinfo(iph), 2037 .flowi6_mark = skb->mark, 2038 .flowi6_proto = iph->nexthdr, 2039 }; 2040 struct flow_keys *flkeys = NULL, _flkeys; 2041 2042 tun_info = skb_tunnel_info(skb); 2043 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX)) 2044 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id; 2045 2046 if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys)) 2047 flkeys = &_flkeys; 2048 2049 if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6)) 2050 fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys); 2051 skb_dst_drop(skb); 2052 skb_dst_set(skb, 2053 ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags)); 2054 } 2055 2056 static struct rt6_info *ip6_pol_route_output(struct net *net, 2057 struct fib6_table *table, 2058 struct flowi6 *fl6, 2059 const struct sk_buff *skb, 2060 int flags) 2061 { 2062 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags); 2063 } 2064 2065 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, 2066 struct flowi6 *fl6, int flags) 2067 { 2068 bool any_src; 2069 2070 if (rt6_need_strict(&fl6->daddr)) { 2071 struct dst_entry *dst; 2072 2073 dst = l3mdev_link_scope_lookup(net, fl6); 2074 if (dst) 2075 return dst; 2076 } 2077 2078 fl6->flowi6_iif = LOOPBACK_IFINDEX; 2079 2080 any_src = ipv6_addr_any(&fl6->saddr); 2081 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) || 2082 (fl6->flowi6_oif && any_src)) 2083 flags |= RT6_LOOKUP_F_IFACE; 2084 2085 if (!any_src) 2086 flags |= RT6_LOOKUP_F_HAS_SADDR; 2087 else if (sk) 2088 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs); 2089 2090 return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output); 2091 } 2092 EXPORT_SYMBOL_GPL(ip6_route_output_flags); 2093 2094 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) 2095 { 2096 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig; 2097 struct net_device *loopback_dev = net->loopback_dev; 2098 struct dst_entry *new = NULL; 2099 2100 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1, 2101 DST_OBSOLETE_DEAD, 0); 2102 if (rt) { 2103 rt6_info_init(rt); 2104 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); 2105 2106 new = &rt->dst; 2107 new->__use = 1; 2108 new->input = dst_discard; 2109 new->output = dst_discard_out; 2110 2111 dst_copy_metrics(new, &ort->dst); 2112 2113 rt->rt6i_idev = in6_dev_get(loopback_dev); 2114 rt->rt6i_gateway = ort->rt6i_gateway; 2115 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU; 2116 2117 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); 2118 #ifdef CONFIG_IPV6_SUBTREES 2119 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); 2120 #endif 2121 } 2122 2123 dst_release(dst_orig); 2124 return new ? new : ERR_PTR(-ENOMEM); 2125 } 2126 2127 /* 2128 * Destination cache support functions 2129 */ 2130 2131 static bool fib6_check(struct fib6_info *f6i, u32 cookie) 2132 { 2133 u32 rt_cookie = 0; 2134 2135 if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie) 2136 return false; 2137 2138 if (fib6_check_expired(f6i)) 2139 return false; 2140 2141 return true; 2142 } 2143 2144 static struct dst_entry *rt6_check(struct rt6_info *rt, 2145 struct fib6_info *from, 2146 u32 cookie) 2147 { 2148 u32 rt_cookie = 0; 2149 2150 if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) || 2151 rt_cookie != cookie) 2152 return NULL; 2153 2154 if (rt6_check_expired(rt)) 2155 return NULL; 2156 2157 return &rt->dst; 2158 } 2159 2160 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, 2161 struct fib6_info *from, 2162 u32 cookie) 2163 { 2164 if (!__rt6_check_expired(rt) && 2165 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && 2166 fib6_check(from, cookie)) 2167 return &rt->dst; 2168 else 2169 return NULL; 2170 } 2171 2172 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) 2173 { 2174 struct dst_entry *dst_ret; 2175 struct fib6_info *from; 2176 struct rt6_info *rt; 2177 2178 rt = container_of(dst, struct rt6_info, dst); 2179 2180 rcu_read_lock(); 2181 2182 /* All IPV6 dsts are created with ->obsolete set to the value 2183 * DST_OBSOLETE_FORCE_CHK which forces validation calls down 2184 * into this function always. 2185 */ 2186 2187 from = rcu_dereference(rt->from); 2188 2189 if (from && (rt->rt6i_flags & RTF_PCPU || 2190 unlikely(!list_empty(&rt->rt6i_uncached)))) 2191 dst_ret = rt6_dst_from_check(rt, from, cookie); 2192 else 2193 dst_ret = rt6_check(rt, from, cookie); 2194 2195 rcu_read_unlock(); 2196 2197 return dst_ret; 2198 } 2199 2200 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) 2201 { 2202 struct rt6_info *rt = (struct rt6_info *) dst; 2203 2204 if (rt) { 2205 if (rt->rt6i_flags & RTF_CACHE) { 2206 rcu_read_lock(); 2207 if (rt6_check_expired(rt)) { 2208 rt6_remove_exception_rt(rt); 2209 dst = NULL; 2210 } 2211 rcu_read_unlock(); 2212 } else { 2213 dst_release(dst); 2214 dst = NULL; 2215 } 2216 } 2217 return dst; 2218 } 2219 2220 static void ip6_link_failure(struct sk_buff *skb) 2221 { 2222 struct rt6_info *rt; 2223 2224 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); 2225 2226 rt = (struct rt6_info *) skb_dst(skb); 2227 if (rt) { 2228 rcu_read_lock(); 2229 if (rt->rt6i_flags & RTF_CACHE) { 2230 if (dst_hold_safe(&rt->dst)) 2231 rt6_remove_exception_rt(rt); 2232 } else { 2233 struct fib6_info *from; 2234 struct fib6_node *fn; 2235 2236 from = rcu_dereference(rt->from); 2237 if (from) { 2238 fn = rcu_dereference(from->fib6_node); 2239 if (fn && (rt->rt6i_flags & RTF_DEFAULT)) 2240 fn->fn_sernum = -1; 2241 } 2242 } 2243 rcu_read_unlock(); 2244 } 2245 } 2246 2247 static void rt6_update_expires(struct rt6_info *rt0, int timeout) 2248 { 2249 if (!(rt0->rt6i_flags & RTF_EXPIRES)) { 2250 struct fib6_info *from; 2251 2252 rcu_read_lock(); 2253 from = rcu_dereference(rt0->from); 2254 if (from) 2255 rt0->dst.expires = from->expires; 2256 rcu_read_unlock(); 2257 } 2258 2259 dst_set_expires(&rt0->dst, timeout); 2260 rt0->rt6i_flags |= RTF_EXPIRES; 2261 } 2262 2263 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) 2264 { 2265 struct net *net = dev_net(rt->dst.dev); 2266 2267 dst_metric_set(&rt->dst, RTAX_MTU, mtu); 2268 rt->rt6i_flags |= RTF_MODIFIED; 2269 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires); 2270 } 2271 2272 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) 2273 { 2274 bool from_set; 2275 2276 rcu_read_lock(); 2277 from_set = !!rcu_dereference(rt->from); 2278 rcu_read_unlock(); 2279 2280 return !(rt->rt6i_flags & RTF_CACHE) && 2281 (rt->rt6i_flags & RTF_PCPU || from_set); 2282 } 2283 2284 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, 2285 const struct ipv6hdr *iph, u32 mtu) 2286 { 2287 const struct in6_addr *daddr, *saddr; 2288 struct rt6_info *rt6 = (struct rt6_info *)dst; 2289 2290 if (rt6->rt6i_flags & RTF_LOCAL) 2291 return; 2292 2293 if (dst_metric_locked(dst, RTAX_MTU)) 2294 return; 2295 2296 if (iph) { 2297 daddr = &iph->daddr; 2298 saddr = &iph->saddr; 2299 } else if (sk) { 2300 daddr = &sk->sk_v6_daddr; 2301 saddr = &inet6_sk(sk)->saddr; 2302 } else { 2303 daddr = NULL; 2304 saddr = NULL; 2305 } 2306 dst_confirm_neigh(dst, daddr); 2307 mtu = max_t(u32, mtu, IPV6_MIN_MTU); 2308 if (mtu >= dst_mtu(dst)) 2309 return; 2310 2311 if (!rt6_cache_allowed_for_pmtu(rt6)) { 2312 rt6_do_update_pmtu(rt6, mtu); 2313 /* update rt6_ex->stamp for cache */ 2314 if (rt6->rt6i_flags & RTF_CACHE) 2315 rt6_update_exception_stamp_rt(rt6); 2316 } else if (daddr) { 2317 struct fib6_info *from; 2318 struct rt6_info *nrt6; 2319 2320 rcu_read_lock(); 2321 from = rcu_dereference(rt6->from); 2322 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr); 2323 if (nrt6) { 2324 rt6_do_update_pmtu(nrt6, mtu); 2325 if (rt6_insert_exception(nrt6, from)) 2326 dst_release_immediate(&nrt6->dst); 2327 } 2328 rcu_read_unlock(); 2329 } 2330 } 2331 2332 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, 2333 struct sk_buff *skb, u32 mtu) 2334 { 2335 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu); 2336 } 2337 2338 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, 2339 int oif, u32 mark, kuid_t uid) 2340 { 2341 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2342 struct dst_entry *dst; 2343 struct flowi6 fl6; 2344 2345 memset(&fl6, 0, sizeof(fl6)); 2346 fl6.flowi6_oif = oif; 2347 fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark); 2348 fl6.daddr = iph->daddr; 2349 fl6.saddr = iph->saddr; 2350 fl6.flowlabel = ip6_flowinfo(iph); 2351 fl6.flowi6_uid = uid; 2352 2353 dst = ip6_route_output(net, NULL, &fl6); 2354 if (!dst->error) 2355 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu)); 2356 dst_release(dst); 2357 } 2358 EXPORT_SYMBOL_GPL(ip6_update_pmtu); 2359 2360 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) 2361 { 2362 struct dst_entry *dst; 2363 2364 ip6_update_pmtu(skb, sock_net(sk), mtu, 2365 sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid); 2366 2367 dst = __sk_dst_get(sk); 2368 if (!dst || !dst->obsolete || 2369 dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) 2370 return; 2371 2372 bh_lock_sock(sk); 2373 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) 2374 ip6_datagram_dst_update(sk, false); 2375 bh_unlock_sock(sk); 2376 } 2377 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu); 2378 2379 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, 2380 const struct flowi6 *fl6) 2381 { 2382 #ifdef CONFIG_IPV6_SUBTREES 2383 struct ipv6_pinfo *np = inet6_sk(sk); 2384 #endif 2385 2386 ip6_dst_store(sk, dst, 2387 ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ? 2388 &sk->sk_v6_daddr : NULL, 2389 #ifdef CONFIG_IPV6_SUBTREES 2390 ipv6_addr_equal(&fl6->saddr, &np->saddr) ? 2391 &np->saddr : 2392 #endif 2393 NULL); 2394 } 2395 2396 /* Handle redirects */ 2397 struct ip6rd_flowi { 2398 struct flowi6 fl6; 2399 struct in6_addr gateway; 2400 }; 2401 2402 static struct rt6_info *__ip6_route_redirect(struct net *net, 2403 struct fib6_table *table, 2404 struct flowi6 *fl6, 2405 const struct sk_buff *skb, 2406 int flags) 2407 { 2408 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; 2409 struct rt6_info *ret = NULL, *rt_cache; 2410 struct fib6_info *rt; 2411 struct fib6_node *fn; 2412 2413 /* Get the "current" route for this destination and 2414 * check if the redirect has come from appropriate router. 2415 * 2416 * RFC 4861 specifies that redirects should only be 2417 * accepted if they come from the nexthop to the target. 2418 * Due to the way the routes are chosen, this notion 2419 * is a bit fuzzy and one might need to check all possible 2420 * routes. 2421 */ 2422 2423 rcu_read_lock(); 2424 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 2425 restart: 2426 for_each_fib6_node_rt_rcu(fn) { 2427 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) 2428 continue; 2429 if (fib6_check_expired(rt)) 2430 continue; 2431 if (rt->fib6_flags & RTF_REJECT) 2432 break; 2433 if (!(rt->fib6_flags & RTF_GATEWAY)) 2434 continue; 2435 if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex) 2436 continue; 2437 /* rt_cache's gateway might be different from its 'parent' 2438 * in the case of an ip redirect. 2439 * So we keep searching in the exception table if the gateway 2440 * is different. 2441 */ 2442 if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) { 2443 rt_cache = rt6_find_cached_rt(rt, 2444 &fl6->daddr, 2445 &fl6->saddr); 2446 if (rt_cache && 2447 ipv6_addr_equal(&rdfl->gateway, 2448 &rt_cache->rt6i_gateway)) { 2449 ret = rt_cache; 2450 break; 2451 } 2452 continue; 2453 } 2454 break; 2455 } 2456 2457 if (!rt) 2458 rt = net->ipv6.fib6_null_entry; 2459 else if (rt->fib6_flags & RTF_REJECT) { 2460 ret = net->ipv6.ip6_null_entry; 2461 goto out; 2462 } 2463 2464 if (rt == net->ipv6.fib6_null_entry) { 2465 fn = fib6_backtrack(fn, &fl6->saddr); 2466 if (fn) 2467 goto restart; 2468 } 2469 2470 out: 2471 if (ret) 2472 dst_hold(&ret->dst); 2473 else 2474 ret = ip6_create_rt_rcu(rt); 2475 2476 rcu_read_unlock(); 2477 2478 trace_fib6_table_lookup(net, ret, table, fl6); 2479 return ret; 2480 }; 2481 2482 static struct dst_entry *ip6_route_redirect(struct net *net, 2483 const struct flowi6 *fl6, 2484 const struct sk_buff *skb, 2485 const struct in6_addr *gateway) 2486 { 2487 int flags = RT6_LOOKUP_F_HAS_SADDR; 2488 struct ip6rd_flowi rdfl; 2489 2490 rdfl.fl6 = *fl6; 2491 rdfl.gateway = *gateway; 2492 2493 return fib6_rule_lookup(net, &rdfl.fl6, skb, 2494 flags, __ip6_route_redirect); 2495 } 2496 2497 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, 2498 kuid_t uid) 2499 { 2500 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2501 struct dst_entry *dst; 2502 struct flowi6 fl6; 2503 2504 memset(&fl6, 0, sizeof(fl6)); 2505 fl6.flowi6_iif = LOOPBACK_IFINDEX; 2506 fl6.flowi6_oif = oif; 2507 fl6.flowi6_mark = mark; 2508 fl6.daddr = iph->daddr; 2509 fl6.saddr = iph->saddr; 2510 fl6.flowlabel = ip6_flowinfo(iph); 2511 fl6.flowi6_uid = uid; 2512 2513 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr); 2514 rt6_do_redirect(dst, NULL, skb); 2515 dst_release(dst); 2516 } 2517 EXPORT_SYMBOL_GPL(ip6_redirect); 2518 2519 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif, 2520 u32 mark) 2521 { 2522 const struct ipv6hdr *iph = ipv6_hdr(skb); 2523 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb); 2524 struct dst_entry *dst; 2525 struct flowi6 fl6; 2526 2527 memset(&fl6, 0, sizeof(fl6)); 2528 fl6.flowi6_iif = LOOPBACK_IFINDEX; 2529 fl6.flowi6_oif = oif; 2530 fl6.flowi6_mark = mark; 2531 fl6.daddr = msg->dest; 2532 fl6.saddr = iph->daddr; 2533 fl6.flowi6_uid = sock_net_uid(net, NULL); 2534 2535 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr); 2536 rt6_do_redirect(dst, NULL, skb); 2537 dst_release(dst); 2538 } 2539 2540 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) 2541 { 2542 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark, 2543 sk->sk_uid); 2544 } 2545 EXPORT_SYMBOL_GPL(ip6_sk_redirect); 2546 2547 static unsigned int ip6_default_advmss(const struct dst_entry *dst) 2548 { 2549 struct net_device *dev = dst->dev; 2550 unsigned int mtu = dst_mtu(dst); 2551 struct net *net = dev_net(dev); 2552 2553 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); 2554 2555 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) 2556 mtu = net->ipv6.sysctl.ip6_rt_min_advmss; 2557 2558 /* 2559 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 2560 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 2561 * IPV6_MAXPLEN is also valid and means: "any MSS, 2562 * rely only on pmtu discovery" 2563 */ 2564 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr)) 2565 mtu = IPV6_MAXPLEN; 2566 return mtu; 2567 } 2568 2569 static unsigned int ip6_mtu(const struct dst_entry *dst) 2570 { 2571 struct inet6_dev *idev; 2572 unsigned int mtu; 2573 2574 mtu = dst_metric_raw(dst, RTAX_MTU); 2575 if (mtu) 2576 goto out; 2577 2578 mtu = IPV6_MIN_MTU; 2579 2580 rcu_read_lock(); 2581 idev = __in6_dev_get(dst->dev); 2582 if (idev) 2583 mtu = idev->cnf.mtu6; 2584 rcu_read_unlock(); 2585 2586 out: 2587 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 2588 2589 return mtu - lwtunnel_headroom(dst->lwtstate, mtu); 2590 } 2591 2592 struct dst_entry *icmp6_dst_alloc(struct net_device *dev, 2593 struct flowi6 *fl6) 2594 { 2595 struct dst_entry *dst; 2596 struct rt6_info *rt; 2597 struct inet6_dev *idev = in6_dev_get(dev); 2598 struct net *net = dev_net(dev); 2599 2600 if (unlikely(!idev)) 2601 return ERR_PTR(-ENODEV); 2602 2603 rt = ip6_dst_alloc(net, dev, 0); 2604 if (unlikely(!rt)) { 2605 in6_dev_put(idev); 2606 dst = ERR_PTR(-ENOMEM); 2607 goto out; 2608 } 2609 2610 rt->dst.flags |= DST_HOST; 2611 rt->dst.input = ip6_input; 2612 rt->dst.output = ip6_output; 2613 rt->rt6i_gateway = fl6->daddr; 2614 rt->rt6i_dst.addr = fl6->daddr; 2615 rt->rt6i_dst.plen = 128; 2616 rt->rt6i_idev = idev; 2617 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0); 2618 2619 /* Add this dst into uncached_list so that rt6_disable_ip() can 2620 * do proper release of the net_device 2621 */ 2622 rt6_uncached_list_add(rt); 2623 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); 2624 2625 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0); 2626 2627 out: 2628 return dst; 2629 } 2630 2631 static int ip6_dst_gc(struct dst_ops *ops) 2632 { 2633 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); 2634 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; 2635 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size; 2636 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; 2637 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; 2638 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; 2639 int entries; 2640 2641 entries = dst_entries_get_fast(ops); 2642 if (time_after(rt_last_gc + rt_min_interval, jiffies) && 2643 entries <= rt_max_size) 2644 goto out; 2645 2646 net->ipv6.ip6_rt_gc_expire++; 2647 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true); 2648 entries = dst_entries_get_slow(ops); 2649 if (entries < ops->gc_thresh) 2650 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; 2651 out: 2652 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity; 2653 return entries > rt_max_size; 2654 } 2655 2656 static int ip6_convert_metrics(struct net *net, struct fib6_info *rt, 2657 struct fib6_config *cfg) 2658 { 2659 struct dst_metrics *p; 2660 2661 if (!cfg->fc_mx) 2662 return 0; 2663 2664 p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL); 2665 if (unlikely(!p)) 2666 return -ENOMEM; 2667 2668 refcount_set(&p->refcnt, 1); 2669 rt->fib6_metrics = p; 2670 2671 return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics); 2672 } 2673 2674 static struct rt6_info *ip6_nh_lookup_table(struct net *net, 2675 struct fib6_config *cfg, 2676 const struct in6_addr *gw_addr, 2677 u32 tbid, int flags) 2678 { 2679 struct flowi6 fl6 = { 2680 .flowi6_oif = cfg->fc_ifindex, 2681 .daddr = *gw_addr, 2682 .saddr = cfg->fc_prefsrc, 2683 }; 2684 struct fib6_table *table; 2685 struct rt6_info *rt; 2686 2687 table = fib6_get_table(net, tbid); 2688 if (!table) 2689 return NULL; 2690 2691 if (!ipv6_addr_any(&cfg->fc_prefsrc)) 2692 flags |= RT6_LOOKUP_F_HAS_SADDR; 2693 2694 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE; 2695 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags); 2696 2697 /* if table lookup failed, fall back to full lookup */ 2698 if (rt == net->ipv6.ip6_null_entry) { 2699 ip6_rt_put(rt); 2700 rt = NULL; 2701 } 2702 2703 return rt; 2704 } 2705 2706 static int ip6_route_check_nh_onlink(struct net *net, 2707 struct fib6_config *cfg, 2708 const struct net_device *dev, 2709 struct netlink_ext_ack *extack) 2710 { 2711 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN; 2712 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2713 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT; 2714 struct rt6_info *grt; 2715 int err; 2716 2717 err = 0; 2718 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0); 2719 if (grt) { 2720 if (!grt->dst.error && 2721 (grt->rt6i_flags & flags || dev != grt->dst.dev)) { 2722 NL_SET_ERR_MSG(extack, 2723 "Nexthop has invalid gateway or device mismatch"); 2724 err = -EINVAL; 2725 } 2726 2727 ip6_rt_put(grt); 2728 } 2729 2730 return err; 2731 } 2732 2733 static int ip6_route_check_nh(struct net *net, 2734 struct fib6_config *cfg, 2735 struct net_device **_dev, 2736 struct inet6_dev **idev) 2737 { 2738 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2739 struct net_device *dev = _dev ? *_dev : NULL; 2740 struct rt6_info *grt = NULL; 2741 int err = -EHOSTUNREACH; 2742 2743 if (cfg->fc_table) { 2744 int flags = RT6_LOOKUP_F_IFACE; 2745 2746 grt = ip6_nh_lookup_table(net, cfg, gw_addr, 2747 cfg->fc_table, flags); 2748 if (grt) { 2749 if (grt->rt6i_flags & RTF_GATEWAY || 2750 (dev && dev != grt->dst.dev)) { 2751 ip6_rt_put(grt); 2752 grt = NULL; 2753 } 2754 } 2755 } 2756 2757 if (!grt) 2758 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1); 2759 2760 if (!grt) 2761 goto out; 2762 2763 if (dev) { 2764 if (dev != grt->dst.dev) { 2765 ip6_rt_put(grt); 2766 goto out; 2767 } 2768 } else { 2769 *_dev = dev = grt->dst.dev; 2770 *idev = grt->rt6i_idev; 2771 dev_hold(dev); 2772 in6_dev_hold(grt->rt6i_idev); 2773 } 2774 2775 if (!(grt->rt6i_flags & RTF_GATEWAY)) 2776 err = 0; 2777 2778 ip6_rt_put(grt); 2779 2780 out: 2781 return err; 2782 } 2783 2784 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg, 2785 struct net_device **_dev, struct inet6_dev **idev, 2786 struct netlink_ext_ack *extack) 2787 { 2788 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2789 int gwa_type = ipv6_addr_type(gw_addr); 2790 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true; 2791 const struct net_device *dev = *_dev; 2792 bool need_addr_check = !dev; 2793 int err = -EINVAL; 2794 2795 /* if gw_addr is local we will fail to detect this in case 2796 * address is still TENTATIVE (DAD in progress). rt6_lookup() 2797 * will return already-added prefix route via interface that 2798 * prefix route was assigned to, which might be non-loopback. 2799 */ 2800 if (dev && 2801 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 2802 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 2803 goto out; 2804 } 2805 2806 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) { 2807 /* IPv6 strictly inhibits using not link-local 2808 * addresses as nexthop address. 2809 * Otherwise, router will not able to send redirects. 2810 * It is very good, but in some (rare!) circumstances 2811 * (SIT, PtP, NBMA NOARP links) it is handy to allow 2812 * some exceptions. --ANK 2813 * We allow IPv4-mapped nexthops to support RFC4798-type 2814 * addressing 2815 */ 2816 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) { 2817 NL_SET_ERR_MSG(extack, "Invalid gateway address"); 2818 goto out; 2819 } 2820 2821 if (cfg->fc_flags & RTNH_F_ONLINK) 2822 err = ip6_route_check_nh_onlink(net, cfg, dev, extack); 2823 else 2824 err = ip6_route_check_nh(net, cfg, _dev, idev); 2825 2826 if (err) 2827 goto out; 2828 } 2829 2830 /* reload in case device was changed */ 2831 dev = *_dev; 2832 2833 err = -EINVAL; 2834 if (!dev) { 2835 NL_SET_ERR_MSG(extack, "Egress device not specified"); 2836 goto out; 2837 } else if (dev->flags & IFF_LOOPBACK) { 2838 NL_SET_ERR_MSG(extack, 2839 "Egress device can not be loopback device for this route"); 2840 goto out; 2841 } 2842 2843 /* if we did not check gw_addr above, do so now that the 2844 * egress device has been resolved. 2845 */ 2846 if (need_addr_check && 2847 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 2848 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 2849 goto out; 2850 } 2851 2852 err = 0; 2853 out: 2854 return err; 2855 } 2856 2857 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, 2858 gfp_t gfp_flags, 2859 struct netlink_ext_ack *extack) 2860 { 2861 struct net *net = cfg->fc_nlinfo.nl_net; 2862 struct fib6_info *rt = NULL; 2863 struct net_device *dev = NULL; 2864 struct inet6_dev *idev = NULL; 2865 struct fib6_table *table; 2866 int addr_type; 2867 int err = -EINVAL; 2868 2869 /* RTF_PCPU is an internal flag; can not be set by userspace */ 2870 if (cfg->fc_flags & RTF_PCPU) { 2871 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU"); 2872 goto out; 2873 } 2874 2875 /* RTF_CACHE is an internal flag; can not be set by userspace */ 2876 if (cfg->fc_flags & RTF_CACHE) { 2877 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE"); 2878 goto out; 2879 } 2880 2881 if (cfg->fc_type > RTN_MAX) { 2882 NL_SET_ERR_MSG(extack, "Invalid route type"); 2883 goto out; 2884 } 2885 2886 if (cfg->fc_dst_len > 128) { 2887 NL_SET_ERR_MSG(extack, "Invalid prefix length"); 2888 goto out; 2889 } 2890 if (cfg->fc_src_len > 128) { 2891 NL_SET_ERR_MSG(extack, "Invalid source address length"); 2892 goto out; 2893 } 2894 #ifndef CONFIG_IPV6_SUBTREES 2895 if (cfg->fc_src_len) { 2896 NL_SET_ERR_MSG(extack, 2897 "Specifying source address requires IPV6_SUBTREES to be enabled"); 2898 goto out; 2899 } 2900 #endif 2901 if (cfg->fc_ifindex) { 2902 err = -ENODEV; 2903 dev = dev_get_by_index(net, cfg->fc_ifindex); 2904 if (!dev) 2905 goto out; 2906 idev = in6_dev_get(dev); 2907 if (!idev) 2908 goto out; 2909 } 2910 2911 if (cfg->fc_metric == 0) 2912 cfg->fc_metric = IP6_RT_PRIO_USER; 2913 2914 if (cfg->fc_flags & RTNH_F_ONLINK) { 2915 if (!dev) { 2916 NL_SET_ERR_MSG(extack, 2917 "Nexthop device required for onlink"); 2918 err = -ENODEV; 2919 goto out; 2920 } 2921 2922 if (!(dev->flags & IFF_UP)) { 2923 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 2924 err = -ENETDOWN; 2925 goto out; 2926 } 2927 } 2928 2929 err = -ENOBUFS; 2930 if (cfg->fc_nlinfo.nlh && 2931 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) { 2932 table = fib6_get_table(net, cfg->fc_table); 2933 if (!table) { 2934 pr_warn("NLM_F_CREATE should be specified when creating new route\n"); 2935 table = fib6_new_table(net, cfg->fc_table); 2936 } 2937 } else { 2938 table = fib6_new_table(net, cfg->fc_table); 2939 } 2940 2941 if (!table) 2942 goto out; 2943 2944 err = -ENOMEM; 2945 rt = fib6_info_alloc(gfp_flags); 2946 if (!rt) 2947 goto out; 2948 2949 if (cfg->fc_flags & RTF_ADDRCONF) 2950 rt->dst_nocount = true; 2951 2952 err = ip6_convert_metrics(net, rt, cfg); 2953 if (err < 0) 2954 goto out; 2955 2956 if (cfg->fc_flags & RTF_EXPIRES) 2957 fib6_set_expires(rt, jiffies + 2958 clock_t_to_jiffies(cfg->fc_expires)); 2959 else 2960 fib6_clean_expires(rt); 2961 2962 if (cfg->fc_protocol == RTPROT_UNSPEC) 2963 cfg->fc_protocol = RTPROT_BOOT; 2964 rt->fib6_protocol = cfg->fc_protocol; 2965 2966 addr_type = ipv6_addr_type(&cfg->fc_dst); 2967 2968 if (cfg->fc_encap) { 2969 struct lwtunnel_state *lwtstate; 2970 2971 err = lwtunnel_build_state(cfg->fc_encap_type, 2972 cfg->fc_encap, AF_INET6, cfg, 2973 &lwtstate, extack); 2974 if (err) 2975 goto out; 2976 rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate); 2977 } 2978 2979 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); 2980 rt->fib6_dst.plen = cfg->fc_dst_len; 2981 if (rt->fib6_dst.plen == 128) 2982 rt->dst_host = true; 2983 2984 #ifdef CONFIG_IPV6_SUBTREES 2985 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len); 2986 rt->fib6_src.plen = cfg->fc_src_len; 2987 #endif 2988 2989 rt->fib6_metric = cfg->fc_metric; 2990 rt->fib6_nh.nh_weight = 1; 2991 2992 rt->fib6_type = cfg->fc_type; 2993 2994 /* We cannot add true routes via loopback here, 2995 they would result in kernel looping; promote them to reject routes 2996 */ 2997 if ((cfg->fc_flags & RTF_REJECT) || 2998 (dev && (dev->flags & IFF_LOOPBACK) && 2999 !(addr_type & IPV6_ADDR_LOOPBACK) && 3000 !(cfg->fc_flags & RTF_LOCAL))) { 3001 /* hold loopback dev/idev if we haven't done so. */ 3002 if (dev != net->loopback_dev) { 3003 if (dev) { 3004 dev_put(dev); 3005 in6_dev_put(idev); 3006 } 3007 dev = net->loopback_dev; 3008 dev_hold(dev); 3009 idev = in6_dev_get(dev); 3010 if (!idev) { 3011 err = -ENODEV; 3012 goto out; 3013 } 3014 } 3015 rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP; 3016 goto install_route; 3017 } 3018 3019 if (cfg->fc_flags & RTF_GATEWAY) { 3020 err = ip6_validate_gw(net, cfg, &dev, &idev, extack); 3021 if (err) 3022 goto out; 3023 3024 rt->fib6_nh.nh_gw = cfg->fc_gateway; 3025 } 3026 3027 err = -ENODEV; 3028 if (!dev) 3029 goto out; 3030 3031 if (idev->cnf.disable_ipv6) { 3032 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device"); 3033 err = -EACCES; 3034 goto out; 3035 } 3036 3037 if (!(dev->flags & IFF_UP)) { 3038 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 3039 err = -ENETDOWN; 3040 goto out; 3041 } 3042 3043 if (!ipv6_addr_any(&cfg->fc_prefsrc)) { 3044 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { 3045 NL_SET_ERR_MSG(extack, "Invalid source address"); 3046 err = -EINVAL; 3047 goto out; 3048 } 3049 rt->fib6_prefsrc.addr = cfg->fc_prefsrc; 3050 rt->fib6_prefsrc.plen = 128; 3051 } else 3052 rt->fib6_prefsrc.plen = 0; 3053 3054 rt->fib6_flags = cfg->fc_flags; 3055 3056 install_route: 3057 if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) && 3058 !netif_carrier_ok(dev)) 3059 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN; 3060 rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK); 3061 rt->fib6_nh.nh_dev = dev; 3062 rt->fib6_table = table; 3063 3064 cfg->fc_nlinfo.nl_net = dev_net(dev); 3065 3066 if (idev) 3067 in6_dev_put(idev); 3068 3069 return rt; 3070 out: 3071 if (dev) 3072 dev_put(dev); 3073 if (idev) 3074 in6_dev_put(idev); 3075 3076 fib6_info_release(rt); 3077 return ERR_PTR(err); 3078 } 3079 3080 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, 3081 struct netlink_ext_ack *extack) 3082 { 3083 struct fib6_info *rt; 3084 int err; 3085 3086 rt = ip6_route_info_create(cfg, gfp_flags, extack); 3087 if (IS_ERR(rt)) 3088 return PTR_ERR(rt); 3089 3090 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack); 3091 fib6_info_release(rt); 3092 3093 return err; 3094 } 3095 3096 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info) 3097 { 3098 struct net *net = info->nl_net; 3099 struct fib6_table *table; 3100 int err; 3101 3102 if (rt == net->ipv6.fib6_null_entry) { 3103 err = -ENOENT; 3104 goto out; 3105 } 3106 3107 table = rt->fib6_table; 3108 spin_lock_bh(&table->tb6_lock); 3109 err = fib6_del(rt, info); 3110 spin_unlock_bh(&table->tb6_lock); 3111 3112 out: 3113 fib6_info_release(rt); 3114 return err; 3115 } 3116 3117 int ip6_del_rt(struct net *net, struct fib6_info *rt) 3118 { 3119 struct nl_info info = { .nl_net = net }; 3120 3121 return __ip6_del_rt(rt, &info); 3122 } 3123 3124 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg) 3125 { 3126 struct nl_info *info = &cfg->fc_nlinfo; 3127 struct net *net = info->nl_net; 3128 struct sk_buff *skb = NULL; 3129 struct fib6_table *table; 3130 int err = -ENOENT; 3131 3132 if (rt == net->ipv6.fib6_null_entry) 3133 goto out_put; 3134 table = rt->fib6_table; 3135 spin_lock_bh(&table->tb6_lock); 3136 3137 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) { 3138 struct fib6_info *sibling, *next_sibling; 3139 3140 /* prefer to send a single notification with all hops */ 3141 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 3142 if (skb) { 3143 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; 3144 3145 if (rt6_fill_node(net, skb, rt, NULL, 3146 NULL, NULL, 0, RTM_DELROUTE, 3147 info->portid, seq, 0) < 0) { 3148 kfree_skb(skb); 3149 skb = NULL; 3150 } else 3151 info->skip_notify = 1; 3152 } 3153 3154 list_for_each_entry_safe(sibling, next_sibling, 3155 &rt->fib6_siblings, 3156 fib6_siblings) { 3157 err = fib6_del(sibling, info); 3158 if (err) 3159 goto out_unlock; 3160 } 3161 } 3162 3163 err = fib6_del(rt, info); 3164 out_unlock: 3165 spin_unlock_bh(&table->tb6_lock); 3166 out_put: 3167 fib6_info_release(rt); 3168 3169 if (skb) { 3170 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 3171 info->nlh, gfp_any()); 3172 } 3173 return err; 3174 } 3175 3176 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) 3177 { 3178 int rc = -ESRCH; 3179 3180 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex) 3181 goto out; 3182 3183 if (cfg->fc_flags & RTF_GATEWAY && 3184 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) 3185 goto out; 3186 if (dst_hold_safe(&rt->dst)) 3187 rc = rt6_remove_exception_rt(rt); 3188 out: 3189 return rc; 3190 } 3191 3192 static int ip6_route_del(struct fib6_config *cfg, 3193 struct netlink_ext_ack *extack) 3194 { 3195 struct rt6_info *rt_cache; 3196 struct fib6_table *table; 3197 struct fib6_info *rt; 3198 struct fib6_node *fn; 3199 int err = -ESRCH; 3200 3201 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); 3202 if (!table) { 3203 NL_SET_ERR_MSG(extack, "FIB table does not exist"); 3204 return err; 3205 } 3206 3207 rcu_read_lock(); 3208 3209 fn = fib6_locate(&table->tb6_root, 3210 &cfg->fc_dst, cfg->fc_dst_len, 3211 &cfg->fc_src, cfg->fc_src_len, 3212 !(cfg->fc_flags & RTF_CACHE)); 3213 3214 if (fn) { 3215 for_each_fib6_node_rt_rcu(fn) { 3216 if (cfg->fc_flags & RTF_CACHE) { 3217 int rc; 3218 3219 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst, 3220 &cfg->fc_src); 3221 if (rt_cache) { 3222 rc = ip6_del_cached_rt(rt_cache, cfg); 3223 if (rc != -ESRCH) 3224 return rc; 3225 } 3226 continue; 3227 } 3228 if (cfg->fc_ifindex && 3229 (!rt->fib6_nh.nh_dev || 3230 rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex)) 3231 continue; 3232 if (cfg->fc_flags & RTF_GATEWAY && 3233 !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw)) 3234 continue; 3235 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric) 3236 continue; 3237 if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol) 3238 continue; 3239 fib6_info_hold(rt); 3240 rcu_read_unlock(); 3241 3242 /* if gateway was specified only delete the one hop */ 3243 if (cfg->fc_flags & RTF_GATEWAY) 3244 return __ip6_del_rt(rt, &cfg->fc_nlinfo); 3245 3246 return __ip6_del_rt_siblings(rt, cfg); 3247 } 3248 } 3249 rcu_read_unlock(); 3250 3251 return err; 3252 } 3253 3254 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) 3255 { 3256 struct netevent_redirect netevent; 3257 struct rt6_info *rt, *nrt = NULL; 3258 struct ndisc_options ndopts; 3259 struct inet6_dev *in6_dev; 3260 struct neighbour *neigh; 3261 struct fib6_info *from; 3262 struct rd_msg *msg; 3263 int optlen, on_link; 3264 u8 *lladdr; 3265 3266 optlen = skb_tail_pointer(skb) - skb_transport_header(skb); 3267 optlen -= sizeof(*msg); 3268 3269 if (optlen < 0) { 3270 net_dbg_ratelimited("rt6_do_redirect: packet too short\n"); 3271 return; 3272 } 3273 3274 msg = (struct rd_msg *)icmp6_hdr(skb); 3275 3276 if (ipv6_addr_is_multicast(&msg->dest)) { 3277 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n"); 3278 return; 3279 } 3280 3281 on_link = 0; 3282 if (ipv6_addr_equal(&msg->dest, &msg->target)) { 3283 on_link = 1; 3284 } else if (ipv6_addr_type(&msg->target) != 3285 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { 3286 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n"); 3287 return; 3288 } 3289 3290 in6_dev = __in6_dev_get(skb->dev); 3291 if (!in6_dev) 3292 return; 3293 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects) 3294 return; 3295 3296 /* RFC2461 8.1: 3297 * The IP source address of the Redirect MUST be the same as the current 3298 * first-hop router for the specified ICMP Destination Address. 3299 */ 3300 3301 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) { 3302 net_dbg_ratelimited("rt6_redirect: invalid ND options\n"); 3303 return; 3304 } 3305 3306 lladdr = NULL; 3307 if (ndopts.nd_opts_tgt_lladdr) { 3308 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, 3309 skb->dev); 3310 if (!lladdr) { 3311 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n"); 3312 return; 3313 } 3314 } 3315 3316 rt = (struct rt6_info *) dst; 3317 if (rt->rt6i_flags & RTF_REJECT) { 3318 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n"); 3319 return; 3320 } 3321 3322 /* Redirect received -> path was valid. 3323 * Look, redirects are sent only in response to data packets, 3324 * so that this nexthop apparently is reachable. --ANK 3325 */ 3326 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr); 3327 3328 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1); 3329 if (!neigh) 3330 return; 3331 3332 /* 3333 * We have finally decided to accept it. 3334 */ 3335 3336 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, 3337 NEIGH_UPDATE_F_WEAK_OVERRIDE| 3338 NEIGH_UPDATE_F_OVERRIDE| 3339 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER| 3340 NEIGH_UPDATE_F_ISROUTER)), 3341 NDISC_REDIRECT, &ndopts); 3342 3343 rcu_read_lock(); 3344 from = rcu_dereference(rt->from); 3345 fib6_info_hold(from); 3346 rcu_read_unlock(); 3347 3348 nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL); 3349 if (!nrt) 3350 goto out; 3351 3352 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE; 3353 if (on_link) 3354 nrt->rt6i_flags &= ~RTF_GATEWAY; 3355 3356 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; 3357 3358 /* No need to remove rt from the exception table if rt is 3359 * a cached route because rt6_insert_exception() will 3360 * takes care of it 3361 */ 3362 if (rt6_insert_exception(nrt, from)) { 3363 dst_release_immediate(&nrt->dst); 3364 goto out; 3365 } 3366 3367 netevent.old = &rt->dst; 3368 netevent.new = &nrt->dst; 3369 netevent.daddr = &msg->dest; 3370 netevent.neigh = neigh; 3371 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); 3372 3373 out: 3374 fib6_info_release(from); 3375 neigh_release(neigh); 3376 } 3377 3378 #ifdef CONFIG_IPV6_ROUTE_INFO 3379 static struct fib6_info *rt6_get_route_info(struct net *net, 3380 const struct in6_addr *prefix, int prefixlen, 3381 const struct in6_addr *gwaddr, 3382 struct net_device *dev) 3383 { 3384 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; 3385 int ifindex = dev->ifindex; 3386 struct fib6_node *fn; 3387 struct fib6_info *rt = NULL; 3388 struct fib6_table *table; 3389 3390 table = fib6_get_table(net, tb_id); 3391 if (!table) 3392 return NULL; 3393 3394 rcu_read_lock(); 3395 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true); 3396 if (!fn) 3397 goto out; 3398 3399 for_each_fib6_node_rt_rcu(fn) { 3400 if (rt->fib6_nh.nh_dev->ifindex != ifindex) 3401 continue; 3402 if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY)) 3403 continue; 3404 if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr)) 3405 continue; 3406 fib6_info_hold(rt); 3407 break; 3408 } 3409 out: 3410 rcu_read_unlock(); 3411 return rt; 3412 } 3413 3414 static struct fib6_info *rt6_add_route_info(struct net *net, 3415 const struct in6_addr *prefix, int prefixlen, 3416 const struct in6_addr *gwaddr, 3417 struct net_device *dev, 3418 unsigned int pref) 3419 { 3420 struct fib6_config cfg = { 3421 .fc_metric = IP6_RT_PRIO_USER, 3422 .fc_ifindex = dev->ifindex, 3423 .fc_dst_len = prefixlen, 3424 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | 3425 RTF_UP | RTF_PREF(pref), 3426 .fc_protocol = RTPROT_RA, 3427 .fc_type = RTN_UNICAST, 3428 .fc_nlinfo.portid = 0, 3429 .fc_nlinfo.nlh = NULL, 3430 .fc_nlinfo.nl_net = net, 3431 }; 3432 3433 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO, 3434 cfg.fc_dst = *prefix; 3435 cfg.fc_gateway = *gwaddr; 3436 3437 /* We should treat it as a default route if prefix length is 0. */ 3438 if (!prefixlen) 3439 cfg.fc_flags |= RTF_DEFAULT; 3440 3441 ip6_route_add(&cfg, GFP_ATOMIC, NULL); 3442 3443 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev); 3444 } 3445 #endif 3446 3447 struct fib6_info *rt6_get_dflt_router(struct net *net, 3448 const struct in6_addr *addr, 3449 struct net_device *dev) 3450 { 3451 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT; 3452 struct fib6_info *rt; 3453 struct fib6_table *table; 3454 3455 table = fib6_get_table(net, tb_id); 3456 if (!table) 3457 return NULL; 3458 3459 rcu_read_lock(); 3460 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3461 if (dev == rt->fib6_nh.nh_dev && 3462 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && 3463 ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr)) 3464 break; 3465 } 3466 if (rt) 3467 fib6_info_hold(rt); 3468 rcu_read_unlock(); 3469 return rt; 3470 } 3471 3472 struct fib6_info *rt6_add_dflt_router(struct net *net, 3473 const struct in6_addr *gwaddr, 3474 struct net_device *dev, 3475 unsigned int pref) 3476 { 3477 struct fib6_config cfg = { 3478 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT, 3479 .fc_metric = IP6_RT_PRIO_USER, 3480 .fc_ifindex = dev->ifindex, 3481 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | 3482 RTF_UP | RTF_EXPIRES | RTF_PREF(pref), 3483 .fc_protocol = RTPROT_RA, 3484 .fc_type = RTN_UNICAST, 3485 .fc_nlinfo.portid = 0, 3486 .fc_nlinfo.nlh = NULL, 3487 .fc_nlinfo.nl_net = net, 3488 }; 3489 3490 cfg.fc_gateway = *gwaddr; 3491 3492 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) { 3493 struct fib6_table *table; 3494 3495 table = fib6_get_table(dev_net(dev), cfg.fc_table); 3496 if (table) 3497 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER; 3498 } 3499 3500 return rt6_get_dflt_router(net, gwaddr, dev); 3501 } 3502 3503 static void __rt6_purge_dflt_routers(struct net *net, 3504 struct fib6_table *table) 3505 { 3506 struct fib6_info *rt; 3507 3508 restart: 3509 rcu_read_lock(); 3510 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3511 struct net_device *dev = fib6_info_nh_dev(rt); 3512 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL; 3513 3514 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) && 3515 (!idev || idev->cnf.accept_ra != 2)) { 3516 fib6_info_hold(rt); 3517 rcu_read_unlock(); 3518 ip6_del_rt(net, rt); 3519 goto restart; 3520 } 3521 } 3522 rcu_read_unlock(); 3523 3524 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER; 3525 } 3526 3527 void rt6_purge_dflt_routers(struct net *net) 3528 { 3529 struct fib6_table *table; 3530 struct hlist_head *head; 3531 unsigned int h; 3532 3533 rcu_read_lock(); 3534 3535 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { 3536 head = &net->ipv6.fib_table_hash[h]; 3537 hlist_for_each_entry_rcu(table, head, tb6_hlist) { 3538 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER) 3539 __rt6_purge_dflt_routers(net, table); 3540 } 3541 } 3542 3543 rcu_read_unlock(); 3544 } 3545 3546 static void rtmsg_to_fib6_config(struct net *net, 3547 struct in6_rtmsg *rtmsg, 3548 struct fib6_config *cfg) 3549 { 3550 memset(cfg, 0, sizeof(*cfg)); 3551 3552 cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? 3553 : RT6_TABLE_MAIN; 3554 cfg->fc_ifindex = rtmsg->rtmsg_ifindex; 3555 cfg->fc_metric = rtmsg->rtmsg_metric; 3556 cfg->fc_expires = rtmsg->rtmsg_info; 3557 cfg->fc_dst_len = rtmsg->rtmsg_dst_len; 3558 cfg->fc_src_len = rtmsg->rtmsg_src_len; 3559 cfg->fc_flags = rtmsg->rtmsg_flags; 3560 cfg->fc_type = rtmsg->rtmsg_type; 3561 3562 cfg->fc_nlinfo.nl_net = net; 3563 3564 cfg->fc_dst = rtmsg->rtmsg_dst; 3565 cfg->fc_src = rtmsg->rtmsg_src; 3566 cfg->fc_gateway = rtmsg->rtmsg_gateway; 3567 } 3568 3569 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) 3570 { 3571 struct fib6_config cfg; 3572 struct in6_rtmsg rtmsg; 3573 int err; 3574 3575 switch (cmd) { 3576 case SIOCADDRT: /* Add a route */ 3577 case SIOCDELRT: /* Delete a route */ 3578 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 3579 return -EPERM; 3580 err = copy_from_user(&rtmsg, arg, 3581 sizeof(struct in6_rtmsg)); 3582 if (err) 3583 return -EFAULT; 3584 3585 rtmsg_to_fib6_config(net, &rtmsg, &cfg); 3586 3587 rtnl_lock(); 3588 switch (cmd) { 3589 case SIOCADDRT: 3590 err = ip6_route_add(&cfg, GFP_KERNEL, NULL); 3591 break; 3592 case SIOCDELRT: 3593 err = ip6_route_del(&cfg, NULL); 3594 break; 3595 default: 3596 err = -EINVAL; 3597 } 3598 rtnl_unlock(); 3599 3600 return err; 3601 } 3602 3603 return -EINVAL; 3604 } 3605 3606 /* 3607 * Drop the packet on the floor 3608 */ 3609 3610 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) 3611 { 3612 int type; 3613 struct dst_entry *dst = skb_dst(skb); 3614 switch (ipstats_mib_noroutes) { 3615 case IPSTATS_MIB_INNOROUTES: 3616 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); 3617 if (type == IPV6_ADDR_ANY) { 3618 IP6_INC_STATS(dev_net(dst->dev), 3619 __in6_dev_get_safely(skb->dev), 3620 IPSTATS_MIB_INADDRERRORS); 3621 break; 3622 } 3623 /* FALLTHROUGH */ 3624 case IPSTATS_MIB_OUTNOROUTES: 3625 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), 3626 ipstats_mib_noroutes); 3627 break; 3628 } 3629 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0); 3630 kfree_skb(skb); 3631 return 0; 3632 } 3633 3634 static int ip6_pkt_discard(struct sk_buff *skb) 3635 { 3636 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES); 3637 } 3638 3639 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3640 { 3641 skb->dev = skb_dst(skb)->dev; 3642 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); 3643 } 3644 3645 static int ip6_pkt_prohibit(struct sk_buff *skb) 3646 { 3647 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES); 3648 } 3649 3650 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3651 { 3652 skb->dev = skb_dst(skb)->dev; 3653 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); 3654 } 3655 3656 /* 3657 * Allocate a dst for local (unicast / anycast) address. 3658 */ 3659 3660 struct fib6_info *addrconf_f6i_alloc(struct net *net, 3661 struct inet6_dev *idev, 3662 const struct in6_addr *addr, 3663 bool anycast, gfp_t gfp_flags) 3664 { 3665 u32 tb_id; 3666 struct net_device *dev = idev->dev; 3667 struct fib6_info *f6i; 3668 3669 f6i = fib6_info_alloc(gfp_flags); 3670 if (!f6i) 3671 return ERR_PTR(-ENOMEM); 3672 3673 f6i->dst_nocount = true; 3674 f6i->dst_host = true; 3675 f6i->fib6_protocol = RTPROT_KERNEL; 3676 f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP; 3677 if (anycast) { 3678 f6i->fib6_type = RTN_ANYCAST; 3679 f6i->fib6_flags |= RTF_ANYCAST; 3680 } else { 3681 f6i->fib6_type = RTN_LOCAL; 3682 f6i->fib6_flags |= RTF_LOCAL; 3683 } 3684 3685 f6i->fib6_nh.nh_gw = *addr; 3686 dev_hold(dev); 3687 f6i->fib6_nh.nh_dev = dev; 3688 f6i->fib6_dst.addr = *addr; 3689 f6i->fib6_dst.plen = 128; 3690 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL; 3691 f6i->fib6_table = fib6_get_table(net, tb_id); 3692 3693 return f6i; 3694 } 3695 3696 /* remove deleted ip from prefsrc entries */ 3697 struct arg_dev_net_ip { 3698 struct net_device *dev; 3699 struct net *net; 3700 struct in6_addr *addr; 3701 }; 3702 3703 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg) 3704 { 3705 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev; 3706 struct net *net = ((struct arg_dev_net_ip *)arg)->net; 3707 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; 3708 3709 if (((void *)rt->fib6_nh.nh_dev == dev || !dev) && 3710 rt != net->ipv6.fib6_null_entry && 3711 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) { 3712 spin_lock_bh(&rt6_exception_lock); 3713 /* remove prefsrc entry */ 3714 rt->fib6_prefsrc.plen = 0; 3715 /* need to update cache as well */ 3716 rt6_exceptions_remove_prefsrc(rt); 3717 spin_unlock_bh(&rt6_exception_lock); 3718 } 3719 return 0; 3720 } 3721 3722 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) 3723 { 3724 struct net *net = dev_net(ifp->idev->dev); 3725 struct arg_dev_net_ip adni = { 3726 .dev = ifp->idev->dev, 3727 .net = net, 3728 .addr = &ifp->addr, 3729 }; 3730 fib6_clean_all(net, fib6_remove_prefsrc, &adni); 3731 } 3732 3733 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY) 3734 3735 /* Remove routers and update dst entries when gateway turn into host. */ 3736 static int fib6_clean_tohost(struct fib6_info *rt, void *arg) 3737 { 3738 struct in6_addr *gateway = (struct in6_addr *)arg; 3739 3740 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && 3741 ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) { 3742 return -1; 3743 } 3744 3745 /* Further clean up cached routes in exception table. 3746 * This is needed because cached route may have a different 3747 * gateway than its 'parent' in the case of an ip redirect. 3748 */ 3749 rt6_exceptions_clean_tohost(rt, gateway); 3750 3751 return 0; 3752 } 3753 3754 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) 3755 { 3756 fib6_clean_all(net, fib6_clean_tohost, gateway); 3757 } 3758 3759 struct arg_netdev_event { 3760 const struct net_device *dev; 3761 union { 3762 unsigned int nh_flags; 3763 unsigned long event; 3764 }; 3765 }; 3766 3767 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) 3768 { 3769 struct fib6_info *iter; 3770 struct fib6_node *fn; 3771 3772 fn = rcu_dereference_protected(rt->fib6_node, 3773 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3774 iter = rcu_dereference_protected(fn->leaf, 3775 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3776 while (iter) { 3777 if (iter->fib6_metric == rt->fib6_metric && 3778 rt6_qualify_for_ecmp(iter)) 3779 return iter; 3780 iter = rcu_dereference_protected(iter->rt6_next, 3781 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3782 } 3783 3784 return NULL; 3785 } 3786 3787 static bool rt6_is_dead(const struct fib6_info *rt) 3788 { 3789 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD || 3790 (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN && 3791 fib6_ignore_linkdown(rt))) 3792 return true; 3793 3794 return false; 3795 } 3796 3797 static int rt6_multipath_total_weight(const struct fib6_info *rt) 3798 { 3799 struct fib6_info *iter; 3800 int total = 0; 3801 3802 if (!rt6_is_dead(rt)) 3803 total += rt->fib6_nh.nh_weight; 3804 3805 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { 3806 if (!rt6_is_dead(iter)) 3807 total += iter->fib6_nh.nh_weight; 3808 } 3809 3810 return total; 3811 } 3812 3813 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total) 3814 { 3815 int upper_bound = -1; 3816 3817 if (!rt6_is_dead(rt)) { 3818 *weight += rt->fib6_nh.nh_weight; 3819 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, 3820 total) - 1; 3821 } 3822 atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound); 3823 } 3824 3825 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total) 3826 { 3827 struct fib6_info *iter; 3828 int weight = 0; 3829 3830 rt6_upper_bound_set(rt, &weight, total); 3831 3832 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3833 rt6_upper_bound_set(iter, &weight, total); 3834 } 3835 3836 void rt6_multipath_rebalance(struct fib6_info *rt) 3837 { 3838 struct fib6_info *first; 3839 int total; 3840 3841 /* In case the entire multipath route was marked for flushing, 3842 * then there is no need to rebalance upon the removal of every 3843 * sibling route. 3844 */ 3845 if (!rt->fib6_nsiblings || rt->should_flush) 3846 return; 3847 3848 /* During lookup routes are evaluated in order, so we need to 3849 * make sure upper bounds are assigned from the first sibling 3850 * onwards. 3851 */ 3852 first = rt6_multipath_first_sibling(rt); 3853 if (WARN_ON_ONCE(!first)) 3854 return; 3855 3856 total = rt6_multipath_total_weight(first); 3857 rt6_multipath_upper_bound_set(first, total); 3858 } 3859 3860 static int fib6_ifup(struct fib6_info *rt, void *p_arg) 3861 { 3862 const struct arg_netdev_event *arg = p_arg; 3863 struct net *net = dev_net(arg->dev); 3864 3865 if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) { 3866 rt->fib6_nh.nh_flags &= ~arg->nh_flags; 3867 fib6_update_sernum_upto_root(net, rt); 3868 rt6_multipath_rebalance(rt); 3869 } 3870 3871 return 0; 3872 } 3873 3874 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags) 3875 { 3876 struct arg_netdev_event arg = { 3877 .dev = dev, 3878 { 3879 .nh_flags = nh_flags, 3880 }, 3881 }; 3882 3883 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev)) 3884 arg.nh_flags |= RTNH_F_LINKDOWN; 3885 3886 fib6_clean_all(dev_net(dev), fib6_ifup, &arg); 3887 } 3888 3889 static bool rt6_multipath_uses_dev(const struct fib6_info *rt, 3890 const struct net_device *dev) 3891 { 3892 struct fib6_info *iter; 3893 3894 if (rt->fib6_nh.nh_dev == dev) 3895 return true; 3896 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3897 if (iter->fib6_nh.nh_dev == dev) 3898 return true; 3899 3900 return false; 3901 } 3902 3903 static void rt6_multipath_flush(struct fib6_info *rt) 3904 { 3905 struct fib6_info *iter; 3906 3907 rt->should_flush = 1; 3908 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3909 iter->should_flush = 1; 3910 } 3911 3912 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt, 3913 const struct net_device *down_dev) 3914 { 3915 struct fib6_info *iter; 3916 unsigned int dead = 0; 3917 3918 if (rt->fib6_nh.nh_dev == down_dev || 3919 rt->fib6_nh.nh_flags & RTNH_F_DEAD) 3920 dead++; 3921 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3922 if (iter->fib6_nh.nh_dev == down_dev || 3923 iter->fib6_nh.nh_flags & RTNH_F_DEAD) 3924 dead++; 3925 3926 return dead; 3927 } 3928 3929 static void rt6_multipath_nh_flags_set(struct fib6_info *rt, 3930 const struct net_device *dev, 3931 unsigned int nh_flags) 3932 { 3933 struct fib6_info *iter; 3934 3935 if (rt->fib6_nh.nh_dev == dev) 3936 rt->fib6_nh.nh_flags |= nh_flags; 3937 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3938 if (iter->fib6_nh.nh_dev == dev) 3939 iter->fib6_nh.nh_flags |= nh_flags; 3940 } 3941 3942 /* called with write lock held for table with rt */ 3943 static int fib6_ifdown(struct fib6_info *rt, void *p_arg) 3944 { 3945 const struct arg_netdev_event *arg = p_arg; 3946 const struct net_device *dev = arg->dev; 3947 struct net *net = dev_net(dev); 3948 3949 if (rt == net->ipv6.fib6_null_entry) 3950 return 0; 3951 3952 switch (arg->event) { 3953 case NETDEV_UNREGISTER: 3954 return rt->fib6_nh.nh_dev == dev ? -1 : 0; 3955 case NETDEV_DOWN: 3956 if (rt->should_flush) 3957 return -1; 3958 if (!rt->fib6_nsiblings) 3959 return rt->fib6_nh.nh_dev == dev ? -1 : 0; 3960 if (rt6_multipath_uses_dev(rt, dev)) { 3961 unsigned int count; 3962 3963 count = rt6_multipath_dead_count(rt, dev); 3964 if (rt->fib6_nsiblings + 1 == count) { 3965 rt6_multipath_flush(rt); 3966 return -1; 3967 } 3968 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD | 3969 RTNH_F_LINKDOWN); 3970 fib6_update_sernum(net, rt); 3971 rt6_multipath_rebalance(rt); 3972 } 3973 return -2; 3974 case NETDEV_CHANGE: 3975 if (rt->fib6_nh.nh_dev != dev || 3976 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) 3977 break; 3978 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN; 3979 rt6_multipath_rebalance(rt); 3980 break; 3981 } 3982 3983 return 0; 3984 } 3985 3986 void rt6_sync_down_dev(struct net_device *dev, unsigned long event) 3987 { 3988 struct arg_netdev_event arg = { 3989 .dev = dev, 3990 { 3991 .event = event, 3992 }, 3993 }; 3994 3995 fib6_clean_all(dev_net(dev), fib6_ifdown, &arg); 3996 } 3997 3998 void rt6_disable_ip(struct net_device *dev, unsigned long event) 3999 { 4000 rt6_sync_down_dev(dev, event); 4001 rt6_uncached_list_flush_dev(dev_net(dev), dev); 4002 neigh_ifdown(&nd_tbl, dev); 4003 } 4004 4005 struct rt6_mtu_change_arg { 4006 struct net_device *dev; 4007 unsigned int mtu; 4008 }; 4009 4010 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg) 4011 { 4012 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; 4013 struct inet6_dev *idev; 4014 4015 /* In IPv6 pmtu discovery is not optional, 4016 so that RTAX_MTU lock cannot disable it. 4017 We still use this lock to block changes 4018 caused by addrconf/ndisc. 4019 */ 4020 4021 idev = __in6_dev_get(arg->dev); 4022 if (!idev) 4023 return 0; 4024 4025 /* For administrative MTU increase, there is no way to discover 4026 IPv6 PMTU increase, so PMTU increase should be updated here. 4027 Since RFC 1981 doesn't include administrative MTU increase 4028 update PMTU increase is a MUST. (i.e. jumbo frame) 4029 */ 4030 if (rt->fib6_nh.nh_dev == arg->dev && 4031 !fib6_metric_locked(rt, RTAX_MTU)) { 4032 u32 mtu = rt->fib6_pmtu; 4033 4034 if (mtu >= arg->mtu || 4035 (mtu < arg->mtu && mtu == idev->cnf.mtu6)) 4036 fib6_metric_set(rt, RTAX_MTU, arg->mtu); 4037 4038 spin_lock_bh(&rt6_exception_lock); 4039 rt6_exceptions_update_pmtu(idev, rt, arg->mtu); 4040 spin_unlock_bh(&rt6_exception_lock); 4041 } 4042 return 0; 4043 } 4044 4045 void rt6_mtu_change(struct net_device *dev, unsigned int mtu) 4046 { 4047 struct rt6_mtu_change_arg arg = { 4048 .dev = dev, 4049 .mtu = mtu, 4050 }; 4051 4052 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg); 4053 } 4054 4055 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { 4056 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) }, 4057 [RTA_OIF] = { .type = NLA_U32 }, 4058 [RTA_IIF] = { .type = NLA_U32 }, 4059 [RTA_PRIORITY] = { .type = NLA_U32 }, 4060 [RTA_METRICS] = { .type = NLA_NESTED }, 4061 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, 4062 [RTA_PREF] = { .type = NLA_U8 }, 4063 [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, 4064 [RTA_ENCAP] = { .type = NLA_NESTED }, 4065 [RTA_EXPIRES] = { .type = NLA_U32 }, 4066 [RTA_UID] = { .type = NLA_U32 }, 4067 [RTA_MARK] = { .type = NLA_U32 }, 4068 }; 4069 4070 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, 4071 struct fib6_config *cfg, 4072 struct netlink_ext_ack *extack) 4073 { 4074 struct rtmsg *rtm; 4075 struct nlattr *tb[RTA_MAX+1]; 4076 unsigned int pref; 4077 int err; 4078 4079 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, 4080 NULL); 4081 if (err < 0) 4082 goto errout; 4083 4084 err = -EINVAL; 4085 rtm = nlmsg_data(nlh); 4086 memset(cfg, 0, sizeof(*cfg)); 4087 4088 cfg->fc_table = rtm->rtm_table; 4089 cfg->fc_dst_len = rtm->rtm_dst_len; 4090 cfg->fc_src_len = rtm->rtm_src_len; 4091 cfg->fc_flags = RTF_UP; 4092 cfg->fc_protocol = rtm->rtm_protocol; 4093 cfg->fc_type = rtm->rtm_type; 4094 4095 if (rtm->rtm_type == RTN_UNREACHABLE || 4096 rtm->rtm_type == RTN_BLACKHOLE || 4097 rtm->rtm_type == RTN_PROHIBIT || 4098 rtm->rtm_type == RTN_THROW) 4099 cfg->fc_flags |= RTF_REJECT; 4100 4101 if (rtm->rtm_type == RTN_LOCAL) 4102 cfg->fc_flags |= RTF_LOCAL; 4103 4104 if (rtm->rtm_flags & RTM_F_CLONED) 4105 cfg->fc_flags |= RTF_CACHE; 4106 4107 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK); 4108 4109 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid; 4110 cfg->fc_nlinfo.nlh = nlh; 4111 cfg->fc_nlinfo.nl_net = sock_net(skb->sk); 4112 4113 if (tb[RTA_GATEWAY]) { 4114 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); 4115 cfg->fc_flags |= RTF_GATEWAY; 4116 } 4117 4118 if (tb[RTA_DST]) { 4119 int plen = (rtm->rtm_dst_len + 7) >> 3; 4120 4121 if (nla_len(tb[RTA_DST]) < plen) 4122 goto errout; 4123 4124 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen); 4125 } 4126 4127 if (tb[RTA_SRC]) { 4128 int plen = (rtm->rtm_src_len + 7) >> 3; 4129 4130 if (nla_len(tb[RTA_SRC]) < plen) 4131 goto errout; 4132 4133 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen); 4134 } 4135 4136 if (tb[RTA_PREFSRC]) 4137 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]); 4138 4139 if (tb[RTA_OIF]) 4140 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]); 4141 4142 if (tb[RTA_PRIORITY]) 4143 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]); 4144 4145 if (tb[RTA_METRICS]) { 4146 cfg->fc_mx = nla_data(tb[RTA_METRICS]); 4147 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]); 4148 } 4149 4150 if (tb[RTA_TABLE]) 4151 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]); 4152 4153 if (tb[RTA_MULTIPATH]) { 4154 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]); 4155 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); 4156 4157 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp, 4158 cfg->fc_mp_len, extack); 4159 if (err < 0) 4160 goto errout; 4161 } 4162 4163 if (tb[RTA_PREF]) { 4164 pref = nla_get_u8(tb[RTA_PREF]); 4165 if (pref != ICMPV6_ROUTER_PREF_LOW && 4166 pref != ICMPV6_ROUTER_PREF_HIGH) 4167 pref = ICMPV6_ROUTER_PREF_MEDIUM; 4168 cfg->fc_flags |= RTF_PREF(pref); 4169 } 4170 4171 if (tb[RTA_ENCAP]) 4172 cfg->fc_encap = tb[RTA_ENCAP]; 4173 4174 if (tb[RTA_ENCAP_TYPE]) { 4175 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); 4176 4177 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack); 4178 if (err < 0) 4179 goto errout; 4180 } 4181 4182 if (tb[RTA_EXPIRES]) { 4183 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ); 4184 4185 if (addrconf_finite_timeout(timeout)) { 4186 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ); 4187 cfg->fc_flags |= RTF_EXPIRES; 4188 } 4189 } 4190 4191 err = 0; 4192 errout: 4193 return err; 4194 } 4195 4196 struct rt6_nh { 4197 struct fib6_info *fib6_info; 4198 struct fib6_config r_cfg; 4199 struct list_head next; 4200 }; 4201 4202 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list) 4203 { 4204 struct rt6_nh *nh; 4205 4206 list_for_each_entry(nh, rt6_nh_list, next) { 4207 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n", 4208 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway, 4209 nh->r_cfg.fc_ifindex); 4210 } 4211 } 4212 4213 static int ip6_route_info_append(struct net *net, 4214 struct list_head *rt6_nh_list, 4215 struct fib6_info *rt, 4216 struct fib6_config *r_cfg) 4217 { 4218 struct rt6_nh *nh; 4219 int err = -EEXIST; 4220 4221 list_for_each_entry(nh, rt6_nh_list, next) { 4222 /* check if fib6_info already exists */ 4223 if (rt6_duplicate_nexthop(nh->fib6_info, rt)) 4224 return err; 4225 } 4226 4227 nh = kzalloc(sizeof(*nh), GFP_KERNEL); 4228 if (!nh) 4229 return -ENOMEM; 4230 nh->fib6_info = rt; 4231 err = ip6_convert_metrics(net, rt, r_cfg); 4232 if (err) { 4233 kfree(nh); 4234 return err; 4235 } 4236 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); 4237 list_add_tail(&nh->next, rt6_nh_list); 4238 4239 return 0; 4240 } 4241 4242 static void ip6_route_mpath_notify(struct fib6_info *rt, 4243 struct fib6_info *rt_last, 4244 struct nl_info *info, 4245 __u16 nlflags) 4246 { 4247 /* if this is an APPEND route, then rt points to the first route 4248 * inserted and rt_last points to last route inserted. Userspace 4249 * wants a consistent dump of the route which starts at the first 4250 * nexthop. Since sibling routes are always added at the end of 4251 * the list, find the first sibling of the last route appended 4252 */ 4253 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) { 4254 rt = list_first_entry(&rt_last->fib6_siblings, 4255 struct fib6_info, 4256 fib6_siblings); 4257 } 4258 4259 if (rt) 4260 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); 4261 } 4262 4263 static int ip6_route_multipath_add(struct fib6_config *cfg, 4264 struct netlink_ext_ack *extack) 4265 { 4266 struct fib6_info *rt_notif = NULL, *rt_last = NULL; 4267 struct nl_info *info = &cfg->fc_nlinfo; 4268 struct fib6_config r_cfg; 4269 struct rtnexthop *rtnh; 4270 struct fib6_info *rt; 4271 struct rt6_nh *err_nh; 4272 struct rt6_nh *nh, *nh_safe; 4273 __u16 nlflags; 4274 int remaining; 4275 int attrlen; 4276 int err = 1; 4277 int nhn = 0; 4278 int replace = (cfg->fc_nlinfo.nlh && 4279 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); 4280 LIST_HEAD(rt6_nh_list); 4281 4282 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE; 4283 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND) 4284 nlflags |= NLM_F_APPEND; 4285 4286 remaining = cfg->fc_mp_len; 4287 rtnh = (struct rtnexthop *)cfg->fc_mp; 4288 4289 /* Parse a Multipath Entry and build a list (rt6_nh_list) of 4290 * fib6_info structs per nexthop 4291 */ 4292 while (rtnh_ok(rtnh, remaining)) { 4293 memcpy(&r_cfg, cfg, sizeof(*cfg)); 4294 if (rtnh->rtnh_ifindex) 4295 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 4296 4297 attrlen = rtnh_attrlen(rtnh); 4298 if (attrlen > 0) { 4299 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 4300 4301 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 4302 if (nla) { 4303 r_cfg.fc_gateway = nla_get_in6_addr(nla); 4304 r_cfg.fc_flags |= RTF_GATEWAY; 4305 } 4306 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); 4307 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); 4308 if (nla) 4309 r_cfg.fc_encap_type = nla_get_u16(nla); 4310 } 4311 4312 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK); 4313 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack); 4314 if (IS_ERR(rt)) { 4315 err = PTR_ERR(rt); 4316 rt = NULL; 4317 goto cleanup; 4318 } 4319 4320 rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1; 4321 4322 err = ip6_route_info_append(info->nl_net, &rt6_nh_list, 4323 rt, &r_cfg); 4324 if (err) { 4325 fib6_info_release(rt); 4326 goto cleanup; 4327 } 4328 4329 rtnh = rtnh_next(rtnh, &remaining); 4330 } 4331 4332 /* for add and replace send one notification with all nexthops. 4333 * Skip the notification in fib6_add_rt2node and send one with 4334 * the full route when done 4335 */ 4336 info->skip_notify = 1; 4337 4338 err_nh = NULL; 4339 list_for_each_entry(nh, &rt6_nh_list, next) { 4340 rt_last = nh->fib6_info; 4341 err = __ip6_ins_rt(nh->fib6_info, info, extack); 4342 fib6_info_release(nh->fib6_info); 4343 4344 /* save reference to first route for notification */ 4345 if (!rt_notif && !err) 4346 rt_notif = nh->fib6_info; 4347 4348 /* nh->fib6_info is used or freed at this point, reset to NULL*/ 4349 nh->fib6_info = NULL; 4350 if (err) { 4351 if (replace && nhn) 4352 ip6_print_replace_route_err(&rt6_nh_list); 4353 err_nh = nh; 4354 goto add_errout; 4355 } 4356 4357 /* Because each route is added like a single route we remove 4358 * these flags after the first nexthop: if there is a collision, 4359 * we have already failed to add the first nexthop: 4360 * fib6_add_rt2node() has rejected it; when replacing, old 4361 * nexthops have been replaced by first new, the rest should 4362 * be added to it. 4363 */ 4364 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | 4365 NLM_F_REPLACE); 4366 nhn++; 4367 } 4368 4369 /* success ... tell user about new route */ 4370 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 4371 goto cleanup; 4372 4373 add_errout: 4374 /* send notification for routes that were added so that 4375 * the delete notifications sent by ip6_route_del are 4376 * coherent 4377 */ 4378 if (rt_notif) 4379 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 4380 4381 /* Delete routes that were already added */ 4382 list_for_each_entry(nh, &rt6_nh_list, next) { 4383 if (err_nh == nh) 4384 break; 4385 ip6_route_del(&nh->r_cfg, extack); 4386 } 4387 4388 cleanup: 4389 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { 4390 if (nh->fib6_info) 4391 fib6_info_release(nh->fib6_info); 4392 list_del(&nh->next); 4393 kfree(nh); 4394 } 4395 4396 return err; 4397 } 4398 4399 static int ip6_route_multipath_del(struct fib6_config *cfg, 4400 struct netlink_ext_ack *extack) 4401 { 4402 struct fib6_config r_cfg; 4403 struct rtnexthop *rtnh; 4404 int remaining; 4405 int attrlen; 4406 int err = 1, last_err = 0; 4407 4408 remaining = cfg->fc_mp_len; 4409 rtnh = (struct rtnexthop *)cfg->fc_mp; 4410 4411 /* Parse a Multipath Entry */ 4412 while (rtnh_ok(rtnh, remaining)) { 4413 memcpy(&r_cfg, cfg, sizeof(*cfg)); 4414 if (rtnh->rtnh_ifindex) 4415 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 4416 4417 attrlen = rtnh_attrlen(rtnh); 4418 if (attrlen > 0) { 4419 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 4420 4421 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 4422 if (nla) { 4423 nla_memcpy(&r_cfg.fc_gateway, nla, 16); 4424 r_cfg.fc_flags |= RTF_GATEWAY; 4425 } 4426 } 4427 err = ip6_route_del(&r_cfg, extack); 4428 if (err) 4429 last_err = err; 4430 4431 rtnh = rtnh_next(rtnh, &remaining); 4432 } 4433 4434 return last_err; 4435 } 4436 4437 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, 4438 struct netlink_ext_ack *extack) 4439 { 4440 struct fib6_config cfg; 4441 int err; 4442 4443 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 4444 if (err < 0) 4445 return err; 4446 4447 if (cfg.fc_mp) 4448 return ip6_route_multipath_del(&cfg, extack); 4449 else { 4450 cfg.fc_delete_all_nh = 1; 4451 return ip6_route_del(&cfg, extack); 4452 } 4453 } 4454 4455 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, 4456 struct netlink_ext_ack *extack) 4457 { 4458 struct fib6_config cfg; 4459 int err; 4460 4461 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 4462 if (err < 0) 4463 return err; 4464 4465 if (cfg.fc_mp) 4466 return ip6_route_multipath_add(&cfg, extack); 4467 else 4468 return ip6_route_add(&cfg, GFP_KERNEL, extack); 4469 } 4470 4471 static size_t rt6_nlmsg_size(struct fib6_info *rt) 4472 { 4473 int nexthop_len = 0; 4474 4475 if (rt->fib6_nsiblings) { 4476 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ 4477 + NLA_ALIGN(sizeof(struct rtnexthop)) 4478 + nla_total_size(16) /* RTA_GATEWAY */ 4479 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate); 4480 4481 nexthop_len *= rt->fib6_nsiblings; 4482 } 4483 4484 return NLMSG_ALIGN(sizeof(struct rtmsg)) 4485 + nla_total_size(16) /* RTA_SRC */ 4486 + nla_total_size(16) /* RTA_DST */ 4487 + nla_total_size(16) /* RTA_GATEWAY */ 4488 + nla_total_size(16) /* RTA_PREFSRC */ 4489 + nla_total_size(4) /* RTA_TABLE */ 4490 + nla_total_size(4) /* RTA_IIF */ 4491 + nla_total_size(4) /* RTA_OIF */ 4492 + nla_total_size(4) /* RTA_PRIORITY */ 4493 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */ 4494 + nla_total_size(sizeof(struct rta_cacheinfo)) 4495 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ 4496 + nla_total_size(1) /* RTA_PREF */ 4497 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate) 4498 + nexthop_len; 4499 } 4500 4501 static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt, 4502 unsigned int *flags, bool skip_oif) 4503 { 4504 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) 4505 *flags |= RTNH_F_DEAD; 4506 4507 if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) { 4508 *flags |= RTNH_F_LINKDOWN; 4509 4510 rcu_read_lock(); 4511 if (fib6_ignore_linkdown(rt)) 4512 *flags |= RTNH_F_DEAD; 4513 rcu_read_unlock(); 4514 } 4515 4516 if (rt->fib6_flags & RTF_GATEWAY) { 4517 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0) 4518 goto nla_put_failure; 4519 } 4520 4521 *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK); 4522 if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD) 4523 *flags |= RTNH_F_OFFLOAD; 4524 4525 /* not needed for multipath encoding b/c it has a rtnexthop struct */ 4526 if (!skip_oif && rt->fib6_nh.nh_dev && 4527 nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex)) 4528 goto nla_put_failure; 4529 4530 if (rt->fib6_nh.nh_lwtstate && 4531 lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0) 4532 goto nla_put_failure; 4533 4534 return 0; 4535 4536 nla_put_failure: 4537 return -EMSGSIZE; 4538 } 4539 4540 /* add multipath next hop */ 4541 static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt) 4542 { 4543 const struct net_device *dev = rt->fib6_nh.nh_dev; 4544 struct rtnexthop *rtnh; 4545 unsigned int flags = 0; 4546 4547 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh)); 4548 if (!rtnh) 4549 goto nla_put_failure; 4550 4551 rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1; 4552 rtnh->rtnh_ifindex = dev ? dev->ifindex : 0; 4553 4554 if (rt6_nexthop_info(skb, rt, &flags, true) < 0) 4555 goto nla_put_failure; 4556 4557 rtnh->rtnh_flags = flags; 4558 4559 /* length of rtnetlink header + attributes */ 4560 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh; 4561 4562 return 0; 4563 4564 nla_put_failure: 4565 return -EMSGSIZE; 4566 } 4567 4568 static int rt6_fill_node(struct net *net, struct sk_buff *skb, 4569 struct fib6_info *rt, struct dst_entry *dst, 4570 struct in6_addr *dest, struct in6_addr *src, 4571 int iif, int type, u32 portid, u32 seq, 4572 unsigned int flags) 4573 { 4574 struct rtmsg *rtm; 4575 struct nlmsghdr *nlh; 4576 long expires = 0; 4577 u32 *pmetrics; 4578 u32 table; 4579 4580 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags); 4581 if (!nlh) 4582 return -EMSGSIZE; 4583 4584 rtm = nlmsg_data(nlh); 4585 rtm->rtm_family = AF_INET6; 4586 rtm->rtm_dst_len = rt->fib6_dst.plen; 4587 rtm->rtm_src_len = rt->fib6_src.plen; 4588 rtm->rtm_tos = 0; 4589 if (rt->fib6_table) 4590 table = rt->fib6_table->tb6_id; 4591 else 4592 table = RT6_TABLE_UNSPEC; 4593 rtm->rtm_table = table; 4594 if (nla_put_u32(skb, RTA_TABLE, table)) 4595 goto nla_put_failure; 4596 4597 rtm->rtm_type = rt->fib6_type; 4598 rtm->rtm_flags = 0; 4599 rtm->rtm_scope = RT_SCOPE_UNIVERSE; 4600 rtm->rtm_protocol = rt->fib6_protocol; 4601 4602 if (rt->fib6_flags & RTF_CACHE) 4603 rtm->rtm_flags |= RTM_F_CLONED; 4604 4605 if (dest) { 4606 if (nla_put_in6_addr(skb, RTA_DST, dest)) 4607 goto nla_put_failure; 4608 rtm->rtm_dst_len = 128; 4609 } else if (rtm->rtm_dst_len) 4610 if (nla_put_in6_addr(skb, RTA_DST, &rt->fib6_dst.addr)) 4611 goto nla_put_failure; 4612 #ifdef CONFIG_IPV6_SUBTREES 4613 if (src) { 4614 if (nla_put_in6_addr(skb, RTA_SRC, src)) 4615 goto nla_put_failure; 4616 rtm->rtm_src_len = 128; 4617 } else if (rtm->rtm_src_len && 4618 nla_put_in6_addr(skb, RTA_SRC, &rt->fib6_src.addr)) 4619 goto nla_put_failure; 4620 #endif 4621 if (iif) { 4622 #ifdef CONFIG_IPV6_MROUTE 4623 if (ipv6_addr_is_multicast(&rt->fib6_dst.addr)) { 4624 int err = ip6mr_get_route(net, skb, rtm, portid); 4625 4626 if (err == 0) 4627 return 0; 4628 if (err < 0) 4629 goto nla_put_failure; 4630 } else 4631 #endif 4632 if (nla_put_u32(skb, RTA_IIF, iif)) 4633 goto nla_put_failure; 4634 } else if (dest) { 4635 struct in6_addr saddr_buf; 4636 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 && 4637 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 4638 goto nla_put_failure; 4639 } 4640 4641 if (rt->fib6_prefsrc.plen) { 4642 struct in6_addr saddr_buf; 4643 saddr_buf = rt->fib6_prefsrc.addr; 4644 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 4645 goto nla_put_failure; 4646 } 4647 4648 pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics; 4649 if (rtnetlink_put_metrics(skb, pmetrics) < 0) 4650 goto nla_put_failure; 4651 4652 if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric)) 4653 goto nla_put_failure; 4654 4655 /* For multipath routes, walk the siblings list and add 4656 * each as a nexthop within RTA_MULTIPATH. 4657 */ 4658 if (rt->fib6_nsiblings) { 4659 struct fib6_info *sibling, *next_sibling; 4660 struct nlattr *mp; 4661 4662 mp = nla_nest_start(skb, RTA_MULTIPATH); 4663 if (!mp) 4664 goto nla_put_failure; 4665 4666 if (rt6_add_nexthop(skb, rt) < 0) 4667 goto nla_put_failure; 4668 4669 list_for_each_entry_safe(sibling, next_sibling, 4670 &rt->fib6_siblings, fib6_siblings) { 4671 if (rt6_add_nexthop(skb, sibling) < 0) 4672 goto nla_put_failure; 4673 } 4674 4675 nla_nest_end(skb, mp); 4676 } else { 4677 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0) 4678 goto nla_put_failure; 4679 } 4680 4681 if (rt->fib6_flags & RTF_EXPIRES) { 4682 expires = dst ? dst->expires : rt->expires; 4683 expires -= jiffies; 4684 } 4685 4686 if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0) 4687 goto nla_put_failure; 4688 4689 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->fib6_flags))) 4690 goto nla_put_failure; 4691 4692 4693 nlmsg_end(skb, nlh); 4694 return 0; 4695 4696 nla_put_failure: 4697 nlmsg_cancel(skb, nlh); 4698 return -EMSGSIZE; 4699 } 4700 4701 int rt6_dump_route(struct fib6_info *rt, void *p_arg) 4702 { 4703 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; 4704 struct net *net = arg->net; 4705 4706 if (rt == net->ipv6.fib6_null_entry) 4707 return 0; 4708 4709 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) { 4710 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh); 4711 4712 /* user wants prefix routes only */ 4713 if (rtm->rtm_flags & RTM_F_PREFIX && 4714 !(rt->fib6_flags & RTF_PREFIX_RT)) { 4715 /* success since this is not a prefix route */ 4716 return 1; 4717 } 4718 } 4719 4720 return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0, 4721 RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid, 4722 arg->cb->nlh->nlmsg_seq, NLM_F_MULTI); 4723 } 4724 4725 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, 4726 struct netlink_ext_ack *extack) 4727 { 4728 struct net *net = sock_net(in_skb->sk); 4729 struct nlattr *tb[RTA_MAX+1]; 4730 int err, iif = 0, oif = 0; 4731 struct fib6_info *from; 4732 struct dst_entry *dst; 4733 struct rt6_info *rt; 4734 struct sk_buff *skb; 4735 struct rtmsg *rtm; 4736 struct flowi6 fl6; 4737 bool fibmatch; 4738 4739 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, 4740 extack); 4741 if (err < 0) 4742 goto errout; 4743 4744 err = -EINVAL; 4745 memset(&fl6, 0, sizeof(fl6)); 4746 rtm = nlmsg_data(nlh); 4747 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0); 4748 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH); 4749 4750 if (tb[RTA_SRC]) { 4751 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr)) 4752 goto errout; 4753 4754 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]); 4755 } 4756 4757 if (tb[RTA_DST]) { 4758 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr)) 4759 goto errout; 4760 4761 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]); 4762 } 4763 4764 if (tb[RTA_IIF]) 4765 iif = nla_get_u32(tb[RTA_IIF]); 4766 4767 if (tb[RTA_OIF]) 4768 oif = nla_get_u32(tb[RTA_OIF]); 4769 4770 if (tb[RTA_MARK]) 4771 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]); 4772 4773 if (tb[RTA_UID]) 4774 fl6.flowi6_uid = make_kuid(current_user_ns(), 4775 nla_get_u32(tb[RTA_UID])); 4776 else 4777 fl6.flowi6_uid = iif ? INVALID_UID : current_uid(); 4778 4779 if (iif) { 4780 struct net_device *dev; 4781 int flags = 0; 4782 4783 rcu_read_lock(); 4784 4785 dev = dev_get_by_index_rcu(net, iif); 4786 if (!dev) { 4787 rcu_read_unlock(); 4788 err = -ENODEV; 4789 goto errout; 4790 } 4791 4792 fl6.flowi6_iif = iif; 4793 4794 if (!ipv6_addr_any(&fl6.saddr)) 4795 flags |= RT6_LOOKUP_F_HAS_SADDR; 4796 4797 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags); 4798 4799 rcu_read_unlock(); 4800 } else { 4801 fl6.flowi6_oif = oif; 4802 4803 dst = ip6_route_output(net, NULL, &fl6); 4804 } 4805 4806 4807 rt = container_of(dst, struct rt6_info, dst); 4808 if (rt->dst.error) { 4809 err = rt->dst.error; 4810 ip6_rt_put(rt); 4811 goto errout; 4812 } 4813 4814 if (rt == net->ipv6.ip6_null_entry) { 4815 err = rt->dst.error; 4816 ip6_rt_put(rt); 4817 goto errout; 4818 } 4819 4820 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 4821 if (!skb) { 4822 ip6_rt_put(rt); 4823 err = -ENOBUFS; 4824 goto errout; 4825 } 4826 4827 skb_dst_set(skb, &rt->dst); 4828 4829 rcu_read_lock(); 4830 from = rcu_dereference(rt->from); 4831 4832 if (fibmatch) 4833 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif, 4834 RTM_NEWROUTE, NETLINK_CB(in_skb).portid, 4835 nlh->nlmsg_seq, 0); 4836 else 4837 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr, 4838 &fl6.saddr, iif, RTM_NEWROUTE, 4839 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 4840 0); 4841 rcu_read_unlock(); 4842 4843 if (err < 0) { 4844 kfree_skb(skb); 4845 goto errout; 4846 } 4847 4848 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); 4849 errout: 4850 return err; 4851 } 4852 4853 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, 4854 unsigned int nlm_flags) 4855 { 4856 struct sk_buff *skb; 4857 struct net *net = info->nl_net; 4858 u32 seq; 4859 int err; 4860 4861 err = -ENOBUFS; 4862 seq = info->nlh ? info->nlh->nlmsg_seq : 0; 4863 4864 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 4865 if (!skb) 4866 goto errout; 4867 4868 err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, 4869 event, info->portid, seq, nlm_flags); 4870 if (err < 0) { 4871 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ 4872 WARN_ON(err == -EMSGSIZE); 4873 kfree_skb(skb); 4874 goto errout; 4875 } 4876 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 4877 info->nlh, gfp_any()); 4878 return; 4879 errout: 4880 if (err < 0) 4881 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); 4882 } 4883 4884 static int ip6_route_dev_notify(struct notifier_block *this, 4885 unsigned long event, void *ptr) 4886 { 4887 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 4888 struct net *net = dev_net(dev); 4889 4890 if (!(dev->flags & IFF_LOOPBACK)) 4891 return NOTIFY_OK; 4892 4893 if (event == NETDEV_REGISTER) { 4894 net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev; 4895 net->ipv6.ip6_null_entry->dst.dev = dev; 4896 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev); 4897 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 4898 net->ipv6.ip6_prohibit_entry->dst.dev = dev; 4899 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev); 4900 net->ipv6.ip6_blk_hole_entry->dst.dev = dev; 4901 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev); 4902 #endif 4903 } else if (event == NETDEV_UNREGISTER && 4904 dev->reg_state != NETREG_UNREGISTERED) { 4905 /* NETDEV_UNREGISTER could be fired for multiple times by 4906 * netdev_wait_allrefs(). Make sure we only call this once. 4907 */ 4908 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev); 4909 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 4910 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev); 4911 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev); 4912 #endif 4913 } 4914 4915 return NOTIFY_OK; 4916 } 4917 4918 /* 4919 * /proc 4920 */ 4921 4922 #ifdef CONFIG_PROC_FS 4923 4924 static const struct file_operations ipv6_route_proc_fops = { 4925 .open = ipv6_route_open, 4926 .read = seq_read, 4927 .llseek = seq_lseek, 4928 .release = seq_release_net, 4929 }; 4930 4931 static int rt6_stats_seq_show(struct seq_file *seq, void *v) 4932 { 4933 struct net *net = (struct net *)seq->private; 4934 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n", 4935 net->ipv6.rt6_stats->fib_nodes, 4936 net->ipv6.rt6_stats->fib_route_nodes, 4937 atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc), 4938 net->ipv6.rt6_stats->fib_rt_entries, 4939 net->ipv6.rt6_stats->fib_rt_cache, 4940 dst_entries_get_slow(&net->ipv6.ip6_dst_ops), 4941 net->ipv6.rt6_stats->fib_discarded_routes); 4942 4943 return 0; 4944 } 4945 4946 static int rt6_stats_seq_open(struct inode *inode, struct file *file) 4947 { 4948 return single_open_net(inode, file, rt6_stats_seq_show); 4949 } 4950 4951 static const struct file_operations rt6_stats_seq_fops = { 4952 .open = rt6_stats_seq_open, 4953 .read = seq_read, 4954 .llseek = seq_lseek, 4955 .release = single_release_net, 4956 }; 4957 #endif /* CONFIG_PROC_FS */ 4958 4959 #ifdef CONFIG_SYSCTL 4960 4961 static 4962 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, 4963 void __user *buffer, size_t *lenp, loff_t *ppos) 4964 { 4965 struct net *net; 4966 int delay; 4967 if (!write) 4968 return -EINVAL; 4969 4970 net = (struct net *)ctl->extra1; 4971 delay = net->ipv6.sysctl.flush_delay; 4972 proc_dointvec(ctl, write, buffer, lenp, ppos); 4973 fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0); 4974 return 0; 4975 } 4976 4977 struct ctl_table ipv6_route_table_template[] = { 4978 { 4979 .procname = "flush", 4980 .data = &init_net.ipv6.sysctl.flush_delay, 4981 .maxlen = sizeof(int), 4982 .mode = 0200, 4983 .proc_handler = ipv6_sysctl_rtcache_flush 4984 }, 4985 { 4986 .procname = "gc_thresh", 4987 .data = &ip6_dst_ops_template.gc_thresh, 4988 .maxlen = sizeof(int), 4989 .mode = 0644, 4990 .proc_handler = proc_dointvec, 4991 }, 4992 { 4993 .procname = "max_size", 4994 .data = &init_net.ipv6.sysctl.ip6_rt_max_size, 4995 .maxlen = sizeof(int), 4996 .mode = 0644, 4997 .proc_handler = proc_dointvec, 4998 }, 4999 { 5000 .procname = "gc_min_interval", 5001 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 5002 .maxlen = sizeof(int), 5003 .mode = 0644, 5004 .proc_handler = proc_dointvec_jiffies, 5005 }, 5006 { 5007 .procname = "gc_timeout", 5008 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout, 5009 .maxlen = sizeof(int), 5010 .mode = 0644, 5011 .proc_handler = proc_dointvec_jiffies, 5012 }, 5013 { 5014 .procname = "gc_interval", 5015 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval, 5016 .maxlen = sizeof(int), 5017 .mode = 0644, 5018 .proc_handler = proc_dointvec_jiffies, 5019 }, 5020 { 5021 .procname = "gc_elasticity", 5022 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity, 5023 .maxlen = sizeof(int), 5024 .mode = 0644, 5025 .proc_handler = proc_dointvec, 5026 }, 5027 { 5028 .procname = "mtu_expires", 5029 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires, 5030 .maxlen = sizeof(int), 5031 .mode = 0644, 5032 .proc_handler = proc_dointvec_jiffies, 5033 }, 5034 { 5035 .procname = "min_adv_mss", 5036 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss, 5037 .maxlen = sizeof(int), 5038 .mode = 0644, 5039 .proc_handler = proc_dointvec, 5040 }, 5041 { 5042 .procname = "gc_min_interval_ms", 5043 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 5044 .maxlen = sizeof(int), 5045 .mode = 0644, 5046 .proc_handler = proc_dointvec_ms_jiffies, 5047 }, 5048 { } 5049 }; 5050 5051 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net) 5052 { 5053 struct ctl_table *table; 5054 5055 table = kmemdup(ipv6_route_table_template, 5056 sizeof(ipv6_route_table_template), 5057 GFP_KERNEL); 5058 5059 if (table) { 5060 table[0].data = &net->ipv6.sysctl.flush_delay; 5061 table[0].extra1 = net; 5062 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh; 5063 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size; 5064 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; 5065 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout; 5066 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval; 5067 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity; 5068 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires; 5069 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss; 5070 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; 5071 5072 /* Don't export sysctls to unprivileged users */ 5073 if (net->user_ns != &init_user_ns) 5074 table[0].procname = NULL; 5075 } 5076 5077 return table; 5078 } 5079 #endif 5080 5081 static int __net_init ip6_route_net_init(struct net *net) 5082 { 5083 int ret = -ENOMEM; 5084 5085 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template, 5086 sizeof(net->ipv6.ip6_dst_ops)); 5087 5088 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0) 5089 goto out_ip6_dst_ops; 5090 5091 net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template, 5092 sizeof(*net->ipv6.fib6_null_entry), 5093 GFP_KERNEL); 5094 if (!net->ipv6.fib6_null_entry) 5095 goto out_ip6_dst_entries; 5096 5097 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template, 5098 sizeof(*net->ipv6.ip6_null_entry), 5099 GFP_KERNEL); 5100 if (!net->ipv6.ip6_null_entry) 5101 goto out_fib6_null_entry; 5102 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5103 dst_init_metrics(&net->ipv6.ip6_null_entry->dst, 5104 ip6_template_metrics, true); 5105 5106 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5107 net->ipv6.fib6_has_custom_rules = false; 5108 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template, 5109 sizeof(*net->ipv6.ip6_prohibit_entry), 5110 GFP_KERNEL); 5111 if (!net->ipv6.ip6_prohibit_entry) 5112 goto out_ip6_null_entry; 5113 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5114 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst, 5115 ip6_template_metrics, true); 5116 5117 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template, 5118 sizeof(*net->ipv6.ip6_blk_hole_entry), 5119 GFP_KERNEL); 5120 if (!net->ipv6.ip6_blk_hole_entry) 5121 goto out_ip6_prohibit_entry; 5122 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5123 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, 5124 ip6_template_metrics, true); 5125 #endif 5126 5127 net->ipv6.sysctl.flush_delay = 0; 5128 net->ipv6.sysctl.ip6_rt_max_size = 4096; 5129 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2; 5130 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ; 5131 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ; 5132 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9; 5133 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ; 5134 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; 5135 5136 net->ipv6.ip6_rt_gc_expire = 30*HZ; 5137 5138 ret = 0; 5139 out: 5140 return ret; 5141 5142 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5143 out_ip6_prohibit_entry: 5144 kfree(net->ipv6.ip6_prohibit_entry); 5145 out_ip6_null_entry: 5146 kfree(net->ipv6.ip6_null_entry); 5147 #endif 5148 out_fib6_null_entry: 5149 kfree(net->ipv6.fib6_null_entry); 5150 out_ip6_dst_entries: 5151 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 5152 out_ip6_dst_ops: 5153 goto out; 5154 } 5155 5156 static void __net_exit ip6_route_net_exit(struct net *net) 5157 { 5158 kfree(net->ipv6.fib6_null_entry); 5159 kfree(net->ipv6.ip6_null_entry); 5160 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5161 kfree(net->ipv6.ip6_prohibit_entry); 5162 kfree(net->ipv6.ip6_blk_hole_entry); 5163 #endif 5164 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 5165 } 5166 5167 static int __net_init ip6_route_net_init_late(struct net *net) 5168 { 5169 #ifdef CONFIG_PROC_FS 5170 proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops); 5171 proc_create("rt6_stats", 0444, net->proc_net, &rt6_stats_seq_fops); 5172 #endif 5173 return 0; 5174 } 5175 5176 static void __net_exit ip6_route_net_exit_late(struct net *net) 5177 { 5178 #ifdef CONFIG_PROC_FS 5179 remove_proc_entry("ipv6_route", net->proc_net); 5180 remove_proc_entry("rt6_stats", net->proc_net); 5181 #endif 5182 } 5183 5184 static struct pernet_operations ip6_route_net_ops = { 5185 .init = ip6_route_net_init, 5186 .exit = ip6_route_net_exit, 5187 }; 5188 5189 static int __net_init ipv6_inetpeer_init(struct net *net) 5190 { 5191 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL); 5192 5193 if (!bp) 5194 return -ENOMEM; 5195 inet_peer_base_init(bp); 5196 net->ipv6.peers = bp; 5197 return 0; 5198 } 5199 5200 static void __net_exit ipv6_inetpeer_exit(struct net *net) 5201 { 5202 struct inet_peer_base *bp = net->ipv6.peers; 5203 5204 net->ipv6.peers = NULL; 5205 inetpeer_invalidate_tree(bp); 5206 kfree(bp); 5207 } 5208 5209 static struct pernet_operations ipv6_inetpeer_ops = { 5210 .init = ipv6_inetpeer_init, 5211 .exit = ipv6_inetpeer_exit, 5212 }; 5213 5214 static struct pernet_operations ip6_route_net_late_ops = { 5215 .init = ip6_route_net_init_late, 5216 .exit = ip6_route_net_exit_late, 5217 }; 5218 5219 static struct notifier_block ip6_route_dev_notifier = { 5220 .notifier_call = ip6_route_dev_notify, 5221 .priority = ADDRCONF_NOTIFY_PRIORITY - 10, 5222 }; 5223 5224 void __init ip6_route_init_special_entries(void) 5225 { 5226 /* Registering of the loopback is done before this portion of code, 5227 * the loopback reference in rt6_info will not be taken, do it 5228 * manually for init_net */ 5229 init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev; 5230 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; 5231 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5232 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5233 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev; 5234 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5235 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev; 5236 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5237 #endif 5238 } 5239 5240 int __init ip6_route_init(void) 5241 { 5242 int ret; 5243 int cpu; 5244 5245 ret = -ENOMEM; 5246 ip6_dst_ops_template.kmem_cachep = 5247 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0, 5248 SLAB_HWCACHE_ALIGN, NULL); 5249 if (!ip6_dst_ops_template.kmem_cachep) 5250 goto out; 5251 5252 ret = dst_entries_init(&ip6_dst_blackhole_ops); 5253 if (ret) 5254 goto out_kmem_cache; 5255 5256 ret = register_pernet_subsys(&ipv6_inetpeer_ops); 5257 if (ret) 5258 goto out_dst_entries; 5259 5260 ret = register_pernet_subsys(&ip6_route_net_ops); 5261 if (ret) 5262 goto out_register_inetpeer; 5263 5264 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep; 5265 5266 ret = fib6_init(); 5267 if (ret) 5268 goto out_register_subsys; 5269 5270 ret = xfrm6_init(); 5271 if (ret) 5272 goto out_fib6_init; 5273 5274 ret = fib6_rules_init(); 5275 if (ret) 5276 goto xfrm6_init; 5277 5278 ret = register_pernet_subsys(&ip6_route_net_late_ops); 5279 if (ret) 5280 goto fib6_rules_init; 5281 5282 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE, 5283 inet6_rtm_newroute, NULL, 0); 5284 if (ret < 0) 5285 goto out_register_late_subsys; 5286 5287 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE, 5288 inet6_rtm_delroute, NULL, 0); 5289 if (ret < 0) 5290 goto out_register_late_subsys; 5291 5292 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, 5293 inet6_rtm_getroute, NULL, 5294 RTNL_FLAG_DOIT_UNLOCKED); 5295 if (ret < 0) 5296 goto out_register_late_subsys; 5297 5298 ret = register_netdevice_notifier(&ip6_route_dev_notifier); 5299 if (ret) 5300 goto out_register_late_subsys; 5301 5302 for_each_possible_cpu(cpu) { 5303 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); 5304 5305 INIT_LIST_HEAD(&ul->head); 5306 spin_lock_init(&ul->lock); 5307 } 5308 5309 out: 5310 return ret; 5311 5312 out_register_late_subsys: 5313 rtnl_unregister_all(PF_INET6); 5314 unregister_pernet_subsys(&ip6_route_net_late_ops); 5315 fib6_rules_init: 5316 fib6_rules_cleanup(); 5317 xfrm6_init: 5318 xfrm6_fini(); 5319 out_fib6_init: 5320 fib6_gc_cleanup(); 5321 out_register_subsys: 5322 unregister_pernet_subsys(&ip6_route_net_ops); 5323 out_register_inetpeer: 5324 unregister_pernet_subsys(&ipv6_inetpeer_ops); 5325 out_dst_entries: 5326 dst_entries_destroy(&ip6_dst_blackhole_ops); 5327 out_kmem_cache: 5328 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); 5329 goto out; 5330 } 5331 5332 void ip6_route_cleanup(void) 5333 { 5334 unregister_netdevice_notifier(&ip6_route_dev_notifier); 5335 unregister_pernet_subsys(&ip6_route_net_late_ops); 5336 fib6_rules_cleanup(); 5337 xfrm6_fini(); 5338 fib6_gc_cleanup(); 5339 unregister_pernet_subsys(&ipv6_inetpeer_ops); 5340 unregister_pernet_subsys(&ip6_route_net_ops); 5341 dst_entries_destroy(&ip6_dst_blackhole_ops); 5342 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); 5343 } 5344