/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable.  otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <trace/events/fib6.h>

#include <linux/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};

static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ip6_default_advmss(const struct dst_entry *dst);
static unsigned int ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void ip6_dst_destroy(struct dst_entry *);
static void ip6_dst_ifdown(struct dst_entry *,
			   struct net_device *dev, int how);
static int ip6_dst_gc(struct dst_ops *ops);

static int ip6_pkt_discard(struct sk_buff *skb);
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void ip6_link_failure(struct sk_buff *skb);
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu);
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
			    struct sk_buff *skb);
static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct fib6_info *rt);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_add_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev,
					    unsigned int pref);
static struct fib6_info *rt6_get_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev);
#endif

struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);

void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}

static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}

static inline const void *choose_neigh_daddr(const struct in6_addr *p,
					     struct sk_buff *skb,
					     const void *daddr)
{
	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}

struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
{
	struct neighbour *n;

	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	if (n)
		return n;
	return neigh_create(&nd_tbl, daddr, dev);
}

static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
}

static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}
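/* Summary of the dst_ops glue below (a rough sketch; the authoritative
 * behaviour lives in the callbacks themselves): .check is hit on every
 * reuse of a cached entry because all IPv6 dsts are born with
 * DST_OBSOLETE_FORCE_CHK, .gc is invoked from dst_alloc() once the
 * number of entries exceeds gc_thresh, and .ifdown/.destroy unwind the
 * device references when an interface disappears or the last dst
 * reference is dropped.
 */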
static struct dst_ops ip6_dst_ops_template = {
	.family			= AF_INET6,
	.gc			= ip6_dst_gc,
	.gc_thresh		= 1024,
	.check			= ip6_dst_check,
	.default_advmss		= ip6_default_advmss,
	.mtu			= ip6_mtu,
	.cow_metrics		= dst_cow_metrics_generic,
	.destroy		= ip6_dst_destroy,
	.ifdown			= ip6_dst_ifdown,
	.negative_advice	= ip6_negative_advice,
	.link_failure		= ip6_link_failure,
	.update_pmtu		= ip6_rt_update_pmtu,
	.redirect		= rt6_do_redirect,
	.local_out		= __ip6_local_out,
	.neigh_lookup		= ip6_dst_neigh_lookup,
	.confirm_neigh		= ip6_confirm_neigh,
};

static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

static struct dst_ops ip6_dst_blackhole_ops = {
	.family			= AF_INET6,
	.destroy		= ip6_dst_destroy,
	.check			= ip6_dst_check,
	.mtu			= ip6_blackhole_mtu,
	.default_advmss		= ip6_default_advmss,
	.update_pmtu		= ip6_rt_blackhole_update_pmtu,
	.redirect		= ip6_rt_blackhole_redirect,
	.cow_metrics		= dst_cow_metrics_generic,
	.neigh_lookup		= ip6_dst_neigh_lookup,
};

static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol	= RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};

static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif

static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}

/* allocate dst with ip6_dst_ops */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);
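/* Note on the allocator above: every rt6_info starts with dst.obsolete
 * set to DST_OBSOLETE_FORCE_CHK, so holders of a cached dst (sockets,
 * tunnels) are forced through ip6_dst_check() on each use, where the
 * entry is revalidated against the fib6 tree cookie (see further down).
 */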
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	rcu_assign_pointer(rt->from, NULL);
	fib6_info_release(from);
	rcu_read_unlock();
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);

		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}

static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}

static struct fib6_info *rt6_multipath_select(const struct net *net,
					      struct fib6_info *match,
					      struct flowi6 *fl6, int oif,
					      const struct sk_buff *skb,
					      int strict)
{
	struct fib6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In
	 * that case it will always be non-zero. Otherwise now is the time
	 * to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}
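/* Illustrative example of the hash-threshold selection above (the bound
 * values are assumptions, not taken from this file): with two
 * equal-weight siblings, the precomputed nh_upper_bound values split
 * the 31-bit mp_hash space roughly at 0x3fffffff and 0x7fffffff, so a
 * flow hashing to 0x20000000 stays on 'match' while one hashing to
 * 0x60000000 moves to the sibling, provided its score is non-negative.
 */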
/*
 *	Route lookup. rcu_read_lock() should be held.
 */

static inline struct fib6_info *rt6_device_match(struct net *net,
						 struct fib6_info *rt,
						 const struct in6_addr *saddr,
						 int oif,
						 int flags)
{
	struct fib6_info *sprt;

	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
		const struct net_device *dev = sprt->fib6_nh.nh_dev;

		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE)
		return net->ipv6.fib6_null_entry;

	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
}

#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}

static void rt6_probe(struct fib6_info *rt)
{
	struct __rt6_probe_work *work;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
		return;

	nh_gw = &rt->fib6_nh.nh_gw;
	dev = rt->fib6_nh.nh_dev;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		struct inet6_dev *idev;

		if (neigh->nud_state & NUD_VALID)
			goto out;

		idev = __in6_dev_get(dev);
		work = NULL;
		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct fib6_info *rt)
{
}
#endif
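/* The rate limit quoted in rt6_probe() above is enforced by only
 * scheduling a probe once neigh->updated is older than the per-device
 * rtr_probe_interval, and by __neigh_set_probe_once() marking the
 * neighbour so that concurrent lookups do not queue duplicate work
 * items for the same gateway.
 */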
/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
static inline int rt6_check_dev(struct fib6_info *rt, int oif)
{
	const struct net_device *dev = rt->fib6_nh.nh_dev;

	if (!oif || dev->ifindex == oif)
		return 2;
	return 0;
}

static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	if (rt->fib6_flags & RTF_NONEXTHOP ||
	    !(rt->fib6_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
					  &rt->fib6_nh.nh_gw);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}

static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);

		if (n < 0)
			return n;
	}
	return m;
}

/* called with rcu_read_lock held */
static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
{
	const struct net_device *dev = fib6_info_nh_dev(f6i);
	bool rc = false;

	if (dev) {
		const struct inet6_dev *idev = __in6_dev_get(dev);

		rc = !!idev->cnf.ignore_routes_with_linkdown;
	}

	return rc;
}

static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
				    int *mpri, struct fib6_info *match,
				    bool *do_rr)
{
	int m;
	bool match_do_rr = false;

	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		goto out;

	if (fib6_ignore_linkdown(rt) &&
	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (fib6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}

static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
				      struct fib6_info *leaf,
				      struct fib6_info *rr_head,
				      u32 metric, int oif, int strict,
				      bool *do_rr)
{
	struct fib6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
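/* Scoring sketch for the helpers above: rt6_check_dev() contributes 2
 * when the route's device matches oif (or no oif was given), and with
 * CONFIG_IPV6_ROUTER_PREF the RA-derived preference is folded in from
 * bit 2 upward. Negative RT6_NUD_* values are verdicts rather than
 * scores; RT6_NUD_FAIL_DO_RR in particular asks rt6_select() below to
 * rotate fn->rr_ptr so the next lookup starts at a different sibling.
 */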
static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
				    int oif, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.fib6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.fib6_null_entry;
}

static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
{
	return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
}

#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
#endif
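/* Layout note for rt6_route_rcv() above: the RFC 4191 route information
 * option counts its length in units of 8 octets, so only a length-3
 * option carries the full 128-bit prefix and can be used verbatim;
 * shorter options are widened to 128 bits via ipv6_addr_prefix().
 */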
/*
 *	Misc support functions
 */

/* called with rcu_lock held */
static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
{
	struct net_device *dev = rt->fib6_nh.nh_dev;

	if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}

static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};

static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}

static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
{
	unsigned short flags = 0;

	if (rt->dst_nocount)
		flags |= DST_NOCOUNT;
	if (rt->dst_nopolicy)
		flags |= DST_NOPOLICY;
	if (rt->dst_host)
		flags |= DST_HOST;

	return flags;
}

static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);

	switch (ort->fib6_type) {
	case RTN_BLACKHOLE:
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}

static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.flags |= fib6_info_dst_flags(ort);

	if (ort->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, ort);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	if (ort->fib6_type == RTN_LOCAL) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (ort->fib6_nh.nh_lwtstate) {
		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}

static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	fib6_info_hold(from);
	rcu_assign_pointer(rt->from, from);
	dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
	if (from->fib6_metrics != &dst_default_metrics) {
		rt->dst._metrics |= DST_METRICS_REFCOUNTED;
		refcount_inc(&from->fib6_metrics->refcnt);
	}
}
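/* rt6_set_from() deliberately shares the parent's metrics array instead
 * of copying it; DST_METRICS_REFCOUNTED together with the refcount_inc()
 * keeps that array alive even if the fib6_info is freed first, except
 * for the global dst_default_metrics, which is never freed.
 */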
static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
{
	struct net_device *dev = fib6_info_nh_dev(ort);

	ip6_rt_init_dst(rt, ort);

	rt->rt6i_dst = ort->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
	rt->rt6i_flags = ort->fib6_flags;
	rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->fib6_src;
#endif
	rt->rt6i_prefsrc = ort->fib6_prefsrc;
	rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
}

static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;

	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}

static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
			  bool null_fallback)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (null_fallback) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}

/* called with rcu_lock held */
static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rt6_info *nrt;

	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (nrt)
		ip6_rt_copy_init(nrt, rt);

	return nrt;
}

static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_info *f6i;
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	f6i = rcu_dereference(fn->leaf);
	if (!f6i) {
		f6i = net->ipv6.fib6_null_entry;
	} else {
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
				       fl6->flowi6_oif, flags);
		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
			f6i = rt6_multipath_select(net, f6i, fl6,
						   fl6->flowi6_oif, skb, flags);
	}
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);
	} else if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = ip6_create_rt_rcu(f6i);
		if (!rt) {
			rt = net->ipv6.ip6_null_entry;
			dst_hold(&rt->dst);
		}
	}

	rcu_read_unlock();

	trace_fib6_table_lookup(net, rt, table, fl6);

	return rt;
}

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);

/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason,
 * the route is released.
 * Caller must hold dst before calling it.
 */

static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = { .nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}

static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	dev = ip6_rt_get_dev_rcu(ort);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}

static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt)
		return NULL;
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt, false);

	return pcpu_rt;
}

static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	return pcpu_rt;
}

/* exception hash table implementation
 */
static DEFINE_SPINLOCK(rt6_exception_lock);
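/* Exception routes (RTF_CACHE clones created by PMTU and redirect
 * events) hang off their parent fib6_info in an array of hash buckets
 * keyed by destination (and, with subtrees, source) address. All
 * writers serialize on the single rt6_exception_lock above; readers
 * walk the hlist chains under RCU.
 */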
/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
	net->ipv6.rt6_stats->fib_rt_cache--;
}

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}

static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
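/* The two finders above are intentionally parallel: the _spinlock
 * variant is for writers already holding rt6_exception_lock, the _rcu
 * variant for lockless readers. Both advance *bucket to the hashed
 * slot so the caller can insert or remove at the position it probed.
 */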
static unsigned int fib6_mtu(const struct fib6_info *rt)
{
	unsigned int mtu;

	if (rt->fib6_pmtu) {
		mtu = rt->fib6_pmtu;
	} else {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
}

static int rt6_insert_exception(struct rt6_info *nrt,
				struct fib6_info *ort)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
	nrt->rt6i_prefsrc = ort->fib6_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->fib6_table->tb6_lock);
		fib6_update_sernum(net, ort);
		spin_unlock_bh(&ort->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}

void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() from recreating the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}
/* Find cached rt in the hash table inside the passed-in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->fib6_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}

/* Remove the passed-in cached rt from the hash table that contains it */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	from = rcu_dereference(rt->from);
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}
/* Find rt6_ex which contains the passed-in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct fib6_info *from = rt->from;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;

	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return;

	rcu_read_lock();
	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

	rcu_read_unlock();
}

static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
				rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
			}
			bucket++;
		}
	}
}

static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
{
	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
	 *
	 * If the new MTU is higher, and the route PMTU is equal to the local
	 * MTU, this means the old MTU is the lowest in the path, so allow
	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
	 * handle this.
	 */

	if (dst_mtu(&rt->dst) >= mtu)
		return true;

	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
		return true;

	return false;
}
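/* Worked example for the rule above (numbers are illustrative): with a
 * link MTU (idev->cnf.mtu6) of 1500 and a cached route PMTU of 1400, a
 * new MTU of 1280 is accepted as a decrease, while 1500 is refused
 * because 1400 is neither >= 1500 nor equal to the local MTU. Had the
 * cached PMTU still been 1500, raising it would be allowed and any
 * lower MTU on the path rediscovered by PMTU discovery.
 */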
1681 */ 1682 if (dst_metric_raw(&entry->dst, RTAX_MTU) && 1683 rt6_mtu_change_route_allowed(idev, entry, mtu)) 1684 dst_metric_set(&entry->dst, RTAX_MTU, mtu); 1685 } 1686 bucket++; 1687 } 1688 } 1689 1690 #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE) 1691 1692 static void rt6_exceptions_clean_tohost(struct fib6_info *rt, 1693 struct in6_addr *gateway) 1694 { 1695 struct rt6_exception_bucket *bucket; 1696 struct rt6_exception *rt6_ex; 1697 struct hlist_node *tmp; 1698 int i; 1699 1700 if (!rcu_access_pointer(rt->rt6i_exception_bucket)) 1701 return; 1702 1703 spin_lock_bh(&rt6_exception_lock); 1704 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1705 lockdep_is_held(&rt6_exception_lock)); 1706 1707 if (bucket) { 1708 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { 1709 hlist_for_each_entry_safe(rt6_ex, tmp, 1710 &bucket->chain, hlist) { 1711 struct rt6_info *entry = rt6_ex->rt6i; 1712 1713 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) == 1714 RTF_CACHE_GATEWAY && 1715 ipv6_addr_equal(gateway, 1716 &entry->rt6i_gateway)) { 1717 rt6_remove_exception(bucket, rt6_ex); 1718 } 1719 } 1720 bucket++; 1721 } 1722 } 1723 1724 spin_unlock_bh(&rt6_exception_lock); 1725 } 1726 1727 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket, 1728 struct rt6_exception *rt6_ex, 1729 struct fib6_gc_args *gc_args, 1730 unsigned long now) 1731 { 1732 struct rt6_info *rt = rt6_ex->rt6i; 1733 1734 /* we are pruning and obsoleting aged-out and non gateway exceptions 1735 * even if others have still references to them, so that on next 1736 * dst_check() such references can be dropped. 1737 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when 1738 * expired, independently from their aging, as per RFC 8201 section 4 1739 */ 1740 if (!(rt->rt6i_flags & RTF_EXPIRES)) { 1741 if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) { 1742 RT6_TRACE("aging clone %p\n", rt); 1743 rt6_remove_exception(bucket, rt6_ex); 1744 return; 1745 } 1746 } else if (time_after(jiffies, rt->dst.expires)) { 1747 RT6_TRACE("purging expired route %p\n", rt); 1748 rt6_remove_exception(bucket, rt6_ex); 1749 return; 1750 } 1751 1752 if (rt->rt6i_flags & RTF_GATEWAY) { 1753 struct neighbour *neigh; 1754 __u8 neigh_flags = 0; 1755 1756 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway); 1757 if (neigh) 1758 neigh_flags = neigh->flags; 1759 1760 if (!(neigh_flags & NTF_ROUTER)) { 1761 RT6_TRACE("purging route %p via non-router but gateway\n", 1762 rt); 1763 rt6_remove_exception(bucket, rt6_ex); 1764 return; 1765 } 1766 } 1767 1768 gc_args->more++; 1769 } 1770 1771 void rt6_age_exceptions(struct fib6_info *rt, 1772 struct fib6_gc_args *gc_args, 1773 unsigned long now) 1774 { 1775 struct rt6_exception_bucket *bucket; 1776 struct rt6_exception *rt6_ex; 1777 struct hlist_node *tmp; 1778 int i; 1779 1780 if (!rcu_access_pointer(rt->rt6i_exception_bucket)) 1781 return; 1782 1783 rcu_read_lock_bh(); 1784 spin_lock(&rt6_exception_lock); 1785 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1786 lockdep_is_held(&rt6_exception_lock)); 1787 1788 if (bucket) { 1789 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { 1790 hlist_for_each_entry_safe(rt6_ex, tmp, 1791 &bucket->chain, hlist) { 1792 rt6_age_examine_exception(bucket, rt6_ex, 1793 gc_args, now); 1794 } 1795 bucket++; 1796 } 1797 } 1798 spin_unlock(&rt6_exception_lock); 1799 rcu_read_unlock_bh(); 1800 } 1801 1802 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, 1803 int oif, struct flowi6 *fl6, 
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_node *fn, *saved_fn;
	struct fib6_info *f6i;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	f6i = rt6_select(net, fn, oif, strict);
	if (f6i->fib6_nsiblings)
		f6i = rt6_multipath_select(net, f6i, fl6, oif, skb, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		trace_fib6_table_lookup(net, rt, table, fl6);
		return rt;
	}

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		trace_fib6_table_lookup(net, rt, table, fl6);
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(f6i->fib6_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree. It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look up the route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		trace_fib6_table_lookup(net, uncached_rt, table, fl6);
		return uncached_rt;

	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(f6i);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, f6i);

		local_bh_enable();
		rcu_read_unlock();
		trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
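/* ip6_pol_route() therefore hands back one of three kinds of dst: a
 * cached exception (RTF_CACHE) when one matches, an uncached RTF_CACHE
 * clone tracked only on rt6_uncached_list for the FLOWI_FLAG_KNOWN_NH
 * case, or the per-cpu copy of the fib6 entry for everything else.
 */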
1863 */ 1864 struct rt6_info *uncached_rt; 1865 1866 uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL); 1867 1868 rcu_read_unlock(); 1869 1870 if (uncached_rt) { 1871 /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc() 1872 * No need for another dst_hold() 1873 */ 1874 rt6_uncached_list_add(uncached_rt); 1875 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); 1876 } else { 1877 uncached_rt = net->ipv6.ip6_null_entry; 1878 dst_hold(&uncached_rt->dst); 1879 } 1880 1881 trace_fib6_table_lookup(net, uncached_rt, table, fl6); 1882 return uncached_rt; 1883 1884 } else { 1885 /* Get a percpu copy */ 1886 1887 struct rt6_info *pcpu_rt; 1888 1889 local_bh_disable(); 1890 pcpu_rt = rt6_get_pcpu_route(f6i); 1891 1892 if (!pcpu_rt) 1893 pcpu_rt = rt6_make_pcpu_route(net, f6i); 1894 1895 local_bh_enable(); 1896 rcu_read_unlock(); 1897 trace_fib6_table_lookup(net, pcpu_rt, table, fl6); 1898 return pcpu_rt; 1899 } 1900 } 1901 EXPORT_SYMBOL_GPL(ip6_pol_route); 1902 1903 static struct rt6_info *ip6_pol_route_input(struct net *net, 1904 struct fib6_table *table, 1905 struct flowi6 *fl6, 1906 const struct sk_buff *skb, 1907 int flags) 1908 { 1909 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags); 1910 } 1911 1912 struct dst_entry *ip6_route_input_lookup(struct net *net, 1913 struct net_device *dev, 1914 struct flowi6 *fl6, 1915 const struct sk_buff *skb, 1916 int flags) 1917 { 1918 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG) 1919 flags |= RT6_LOOKUP_F_IFACE; 1920 1921 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input); 1922 } 1923 EXPORT_SYMBOL_GPL(ip6_route_input_lookup); 1924 1925 static void ip6_multipath_l3_keys(const struct sk_buff *skb, 1926 struct flow_keys *keys, 1927 struct flow_keys *flkeys) 1928 { 1929 const struct ipv6hdr *outer_iph = ipv6_hdr(skb); 1930 const struct ipv6hdr *key_iph = outer_iph; 1931 struct flow_keys *_flkeys = flkeys; 1932 const struct ipv6hdr *inner_iph; 1933 const struct icmp6hdr *icmph; 1934 struct ipv6hdr _inner_iph; 1935 struct icmp6hdr _icmph; 1936 1937 if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6)) 1938 goto out; 1939 1940 icmph = skb_header_pointer(skb, skb_transport_offset(skb), 1941 sizeof(_icmph), &_icmph); 1942 if (!icmph) 1943 goto out; 1944 1945 if (icmph->icmp6_type != ICMPV6_DEST_UNREACH && 1946 icmph->icmp6_type != ICMPV6_PKT_TOOBIG && 1947 icmph->icmp6_type != ICMPV6_TIME_EXCEED && 1948 icmph->icmp6_type != ICMPV6_PARAMPROB) 1949 goto out; 1950 1951 inner_iph = skb_header_pointer(skb, 1952 skb_transport_offset(skb) + sizeof(*icmph), 1953 sizeof(_inner_iph), &_inner_iph); 1954 if (!inner_iph) 1955 goto out; 1956 1957 key_iph = inner_iph; 1958 _flkeys = NULL; 1959 out: 1960 if (_flkeys) { 1961 keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src; 1962 keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst; 1963 keys->tags.flow_label = _flkeys->tags.flow_label; 1964 keys->basic.ip_proto = _flkeys->basic.ip_proto; 1965 } else { 1966 keys->addrs.v6addrs.src = key_iph->saddr; 1967 keys->addrs.v6addrs.dst = key_iph->daddr; 1968 keys->tags.flow_label = ip6_flowinfo(key_iph); 1969 keys->basic.ip_proto = key_iph->nexthdr; 1970 } 1971 } 1972 1973 /* if skb is set it will be used and fl6 can be NULL */ 1974 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, 1975 const struct sk_buff *skb, struct flow_keys *flkeys) 1976 { 1977 struct flow_keys hash_keys; 1978 u32 mhash; 1979 1980 switch (ip6_multipath_hash_policy(net)) { 1981 case 0: 1982 memset(&hash_keys, 0, sizeof(hash_keys)); 1983 
/* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}
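/* Hash policy 0 above uses L3 data only (addresses, flow label,
 * protocol); policy 1 uses addresses, ports and protocol, short-
 * circuiting to the skb's own hash when an L4 hash is already present.
 * The final "mhash >> 1" keeps the result within 31 bits so that it is
 * directly comparable with the nexthop upper bounds consumed by
 * rt6_multipath_select().
 */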
fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output); 2095 } 2096 EXPORT_SYMBOL_GPL(ip6_route_output_flags); 2097 2098 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) 2099 { 2100 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig; 2101 struct net_device *loopback_dev = net->loopback_dev; 2102 struct dst_entry *new = NULL; 2103 2104 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1, 2105 DST_OBSOLETE_DEAD, 0); 2106 if (rt) { 2107 rt6_info_init(rt); 2108 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); 2109 2110 new = &rt->dst; 2111 new->__use = 1; 2112 new->input = dst_discard; 2113 new->output = dst_discard_out; 2114 2115 dst_copy_metrics(new, &ort->dst); 2116 2117 rt->rt6i_idev = in6_dev_get(loopback_dev); 2118 rt->rt6i_gateway = ort->rt6i_gateway; 2119 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU; 2120 2121 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); 2122 #ifdef CONFIG_IPV6_SUBTREES 2123 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); 2124 #endif 2125 } 2126 2127 dst_release(dst_orig); 2128 return new ? new : ERR_PTR(-ENOMEM); 2129 } 2130 2131 /* 2132 * Destination cache support functions 2133 */ 2134 2135 static bool fib6_check(struct fib6_info *f6i, u32 cookie) 2136 { 2137 u32 rt_cookie = 0; 2138 2139 if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie) 2140 return false; 2141 2142 if (fib6_check_expired(f6i)) 2143 return false; 2144 2145 return true; 2146 } 2147 2148 static struct dst_entry *rt6_check(struct rt6_info *rt, 2149 struct fib6_info *from, 2150 u32 cookie) 2151 { 2152 u32 rt_cookie = 0; 2153 2154 if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) || 2155 rt_cookie != cookie) 2156 return NULL; 2157 2158 if (rt6_check_expired(rt)) 2159 return NULL; 2160 2161 return &rt->dst; 2162 } 2163 2164 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, 2165 struct fib6_info *from, 2166 u32 cookie) 2167 { 2168 if (!__rt6_check_expired(rt) && 2169 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && 2170 fib6_check(from, cookie)) 2171 return &rt->dst; 2172 else 2173 return NULL; 2174 } 2175 2176 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) 2177 { 2178 struct dst_entry *dst_ret; 2179 struct fib6_info *from; 2180 struct rt6_info *rt; 2181 2182 rt = container_of(dst, struct rt6_info, dst); 2183 2184 rcu_read_lock(); 2185 2186 /* All IPV6 dsts are created with ->obsolete set to the value 2187 * DST_OBSOLETE_FORCE_CHK which forces validation calls down 2188 * into this function always. 
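* The cookie is the fib6 tree sernum the caller sampled when it
* cached this dst: rt6_check()/fib6_check() above compare it against
* the current sernum via fib6_get_cookie_safe(), so any later change
* to that part of the tree invalidates the dst and forces a fresh
* fib6 lookup.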
2189 */ 2190 2191 from = rcu_dereference(rt->from); 2192 2193 if (from && (rt->rt6i_flags & RTF_PCPU || 2194 unlikely(!list_empty(&rt->rt6i_uncached)))) 2195 dst_ret = rt6_dst_from_check(rt, from, cookie); 2196 else 2197 dst_ret = rt6_check(rt, from, cookie); 2198 2199 rcu_read_unlock(); 2200 2201 return dst_ret; 2202 } 2203 2204 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) 2205 { 2206 struct rt6_info *rt = (struct rt6_info *) dst; 2207 2208 if (rt) { 2209 if (rt->rt6i_flags & RTF_CACHE) { 2210 rcu_read_lock(); 2211 if (rt6_check_expired(rt)) { 2212 rt6_remove_exception_rt(rt); 2213 dst = NULL; 2214 } 2215 rcu_read_unlock(); 2216 } else { 2217 dst_release(dst); 2218 dst = NULL; 2219 } 2220 } 2221 return dst; 2222 } 2223 2224 static void ip6_link_failure(struct sk_buff *skb) 2225 { 2226 struct rt6_info *rt; 2227 2228 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); 2229 2230 rt = (struct rt6_info *) skb_dst(skb); 2231 if (rt) { 2232 rcu_read_lock(); 2233 if (rt->rt6i_flags & RTF_CACHE) { 2234 if (dst_hold_safe(&rt->dst)) 2235 rt6_remove_exception_rt(rt); 2236 } else { 2237 struct fib6_info *from; 2238 struct fib6_node *fn; 2239 2240 from = rcu_dereference(rt->from); 2241 if (from) { 2242 fn = rcu_dereference(from->fib6_node); 2243 if (fn && (rt->rt6i_flags & RTF_DEFAULT)) 2244 fn->fn_sernum = -1; 2245 } 2246 } 2247 rcu_read_unlock(); 2248 } 2249 } 2250 2251 static void rt6_update_expires(struct rt6_info *rt0, int timeout) 2252 { 2253 if (!(rt0->rt6i_flags & RTF_EXPIRES)) { 2254 struct fib6_info *from; 2255 2256 rcu_read_lock(); 2257 from = rcu_dereference(rt0->from); 2258 if (from) 2259 rt0->dst.expires = from->expires; 2260 rcu_read_unlock(); 2261 } 2262 2263 dst_set_expires(&rt0->dst, timeout); 2264 rt0->rt6i_flags |= RTF_EXPIRES; 2265 } 2266 2267 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) 2268 { 2269 struct net *net = dev_net(rt->dst.dev); 2270 2271 dst_metric_set(&rt->dst, RTAX_MTU, mtu); 2272 rt->rt6i_flags |= RTF_MODIFIED; 2273 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires); 2274 } 2275 2276 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) 2277 { 2278 bool from_set; 2279 2280 rcu_read_lock(); 2281 from_set = !!rcu_dereference(rt->from); 2282 rcu_read_unlock(); 2283 2284 return !(rt->rt6i_flags & RTF_CACHE) && 2285 (rt->rt6i_flags & RTF_PCPU || from_set); 2286 } 2287 2288 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, 2289 const struct ipv6hdr *iph, u32 mtu) 2290 { 2291 const struct in6_addr *daddr, *saddr; 2292 struct rt6_info *rt6 = (struct rt6_info *)dst; 2293 2294 if (rt6->rt6i_flags & RTF_LOCAL) 2295 return; 2296 2297 if (dst_metric_locked(dst, RTAX_MTU)) 2298 return; 2299 2300 if (iph) { 2301 daddr = &iph->daddr; 2302 saddr = &iph->saddr; 2303 } else if (sk) { 2304 daddr = &sk->sk_v6_daddr; 2305 saddr = &inet6_sk(sk)->saddr; 2306 } else { 2307 daddr = NULL; 2308 saddr = NULL; 2309 } 2310 dst_confirm_neigh(dst, daddr); 2311 mtu = max_t(u32, mtu, IPV6_MIN_MTU); 2312 if (mtu >= dst_mtu(dst)) 2313 return; 2314 2315 if (!rt6_cache_allowed_for_pmtu(rt6)) { 2316 rt6_do_update_pmtu(rt6, mtu); 2317 /* update rt6_ex->stamp for cache */ 2318 if (rt6->rt6i_flags & RTF_CACHE) 2319 rt6_update_exception_stamp_rt(rt6); 2320 } else if (daddr) { 2321 struct fib6_info *from; 2322 struct rt6_info *nrt6; 2323 2324 rcu_read_lock(); 2325 from = rcu_dereference(rt6->from); 2326 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr); 2327 if (nrt6) { 2328 rt6_do_update_pmtu(nrt6, mtu); 2329 if 
(rt6_insert_exception(nrt6, from)) 2330 dst_release_immediate(&nrt6->dst); 2331 } 2332 rcu_read_unlock(); 2333 } 2334 } 2335 2336 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, 2337 struct sk_buff *skb, u32 mtu) 2338 { 2339 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu); 2340 } 2341 2342 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, 2343 int oif, u32 mark, kuid_t uid) 2344 { 2345 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2346 struct dst_entry *dst; 2347 struct flowi6 fl6; 2348 2349 memset(&fl6, 0, sizeof(fl6)); 2350 fl6.flowi6_oif = oif; 2351 fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark); 2352 fl6.daddr = iph->daddr; 2353 fl6.saddr = iph->saddr; 2354 fl6.flowlabel = ip6_flowinfo(iph); 2355 fl6.flowi6_uid = uid; 2356 2357 dst = ip6_route_output(net, NULL, &fl6); 2358 if (!dst->error) 2359 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu)); 2360 dst_release(dst); 2361 } 2362 EXPORT_SYMBOL_GPL(ip6_update_pmtu); 2363 2364 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) 2365 { 2366 struct dst_entry *dst; 2367 2368 ip6_update_pmtu(skb, sock_net(sk), mtu, 2369 sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid); 2370 2371 dst = __sk_dst_get(sk); 2372 if (!dst || !dst->obsolete || 2373 dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) 2374 return; 2375 2376 bh_lock_sock(sk); 2377 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) 2378 ip6_datagram_dst_update(sk, false); 2379 bh_unlock_sock(sk); 2380 } 2381 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu); 2382 2383 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, 2384 const struct flowi6 *fl6) 2385 { 2386 #ifdef CONFIG_IPV6_SUBTREES 2387 struct ipv6_pinfo *np = inet6_sk(sk); 2388 #endif 2389 2390 ip6_dst_store(sk, dst, 2391 ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ? 2392 &sk->sk_v6_daddr : NULL, 2393 #ifdef CONFIG_IPV6_SUBTREES 2394 ipv6_addr_equal(&fl6->saddr, &np->saddr) ? 2395 &np->saddr : 2396 #endif 2397 NULL); 2398 } 2399 2400 /* Handle redirects */ 2401 struct ip6rd_flowi { 2402 struct flowi6 fl6; 2403 struct in6_addr gateway; 2404 }; 2405 2406 static struct rt6_info *__ip6_route_redirect(struct net *net, 2407 struct fib6_table *table, 2408 struct flowi6 *fl6, 2409 const struct sk_buff *skb, 2410 int flags) 2411 { 2412 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; 2413 struct rt6_info *ret = NULL, *rt_cache; 2414 struct fib6_info *rt; 2415 struct fib6_node *fn; 2416 2417 /* Get the "current" route for this destination and 2418 * check if the redirect has come from appropriate router. 2419 * 2420 * RFC 4861 specifies that redirects should only be 2421 * accepted if they come from the nexthop to the target. 2422 * Due to the way the routes are chosen, this notion 2423 * is a bit fuzzy and one might need to check all possible 2424 * routes. 2425 */ 2426 2427 rcu_read_lock(); 2428 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 2429 restart: 2430 for_each_fib6_node_rt_rcu(fn) { 2431 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) 2432 continue; 2433 if (fib6_check_expired(rt)) 2434 continue; 2435 if (rt->fib6_flags & RTF_REJECT) 2436 break; 2437 if (!(rt->fib6_flags & RTF_GATEWAY)) 2438 continue; 2439 if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex) 2440 continue; 2441 /* rt_cache's gateway might be different from its 'parent' 2442 * in the case of an ip redirect. 2443 * So we keep searching in the exception table if the gateway 2444 * is different. 
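* (rt6_find_cached_rt() below performs that exception-table lookup;
* a hit is accepted only if its rt6i_gateway matches the gateway
* named in the redirect.)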
2445 */ 2446 if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) { 2447 rt_cache = rt6_find_cached_rt(rt, 2448 &fl6->daddr, 2449 &fl6->saddr); 2450 if (rt_cache && 2451 ipv6_addr_equal(&rdfl->gateway, 2452 &rt_cache->rt6i_gateway)) { 2453 ret = rt_cache; 2454 break; 2455 } 2456 continue; 2457 } 2458 break; 2459 } 2460 2461 if (!rt) 2462 rt = net->ipv6.fib6_null_entry; 2463 else if (rt->fib6_flags & RTF_REJECT) { 2464 ret = net->ipv6.ip6_null_entry; 2465 goto out; 2466 } 2467 2468 if (rt == net->ipv6.fib6_null_entry) { 2469 fn = fib6_backtrack(fn, &fl6->saddr); 2470 if (fn) 2471 goto restart; 2472 } 2473 2474 out: 2475 if (ret) 2476 dst_hold(&ret->dst); 2477 else 2478 ret = ip6_create_rt_rcu(rt); 2479 2480 rcu_read_unlock(); 2481 2482 trace_fib6_table_lookup(net, ret, table, fl6); 2483 return ret; 2484 }; 2485 2486 static struct dst_entry *ip6_route_redirect(struct net *net, 2487 const struct flowi6 *fl6, 2488 const struct sk_buff *skb, 2489 const struct in6_addr *gateway) 2490 { 2491 int flags = RT6_LOOKUP_F_HAS_SADDR; 2492 struct ip6rd_flowi rdfl; 2493 2494 rdfl.fl6 = *fl6; 2495 rdfl.gateway = *gateway; 2496 2497 return fib6_rule_lookup(net, &rdfl.fl6, skb, 2498 flags, __ip6_route_redirect); 2499 } 2500 2501 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, 2502 kuid_t uid) 2503 { 2504 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2505 struct dst_entry *dst; 2506 struct flowi6 fl6; 2507 2508 memset(&fl6, 0, sizeof(fl6)); 2509 fl6.flowi6_iif = LOOPBACK_IFINDEX; 2510 fl6.flowi6_oif = oif; 2511 fl6.flowi6_mark = mark; 2512 fl6.daddr = iph->daddr; 2513 fl6.saddr = iph->saddr; 2514 fl6.flowlabel = ip6_flowinfo(iph); 2515 fl6.flowi6_uid = uid; 2516 2517 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr); 2518 rt6_do_redirect(dst, NULL, skb); 2519 dst_release(dst); 2520 } 2521 EXPORT_SYMBOL_GPL(ip6_redirect); 2522 2523 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif, 2524 u32 mark) 2525 { 2526 const struct ipv6hdr *iph = ipv6_hdr(skb); 2527 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb); 2528 struct dst_entry *dst; 2529 struct flowi6 fl6; 2530 2531 memset(&fl6, 0, sizeof(fl6)); 2532 fl6.flowi6_iif = LOOPBACK_IFINDEX; 2533 fl6.flowi6_oif = oif; 2534 fl6.flowi6_mark = mark; 2535 fl6.daddr = msg->dest; 2536 fl6.saddr = iph->daddr; 2537 fl6.flowi6_uid = sock_net_uid(net, NULL); 2538 2539 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr); 2540 rt6_do_redirect(dst, NULL, skb); 2541 dst_release(dst); 2542 } 2543 2544 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) 2545 { 2546 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark, 2547 sk->sk_uid); 2548 } 2549 EXPORT_SYMBOL_GPL(ip6_sk_redirect); 2550 2551 static unsigned int ip6_default_advmss(const struct dst_entry *dst) 2552 { 2553 struct net_device *dev = dst->dev; 2554 unsigned int mtu = dst_mtu(dst); 2555 struct net *net = dev_net(dev); 2556 2557 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); 2558 2559 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) 2560 mtu = net->ipv6.sysctl.ip6_rt_min_advmss; 2561 2562 /* 2563 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 2564 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 
2565 * IPV6_MAXPLEN is also valid and means: "any MSS, 2566 * rely only on pmtu discovery" 2567 */ 2568 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr)) 2569 mtu = IPV6_MAXPLEN; 2570 return mtu; 2571 } 2572 2573 static unsigned int ip6_mtu(const struct dst_entry *dst) 2574 { 2575 struct inet6_dev *idev; 2576 unsigned int mtu; 2577 2578 mtu = dst_metric_raw(dst, RTAX_MTU); 2579 if (mtu) 2580 goto out; 2581 2582 mtu = IPV6_MIN_MTU; 2583 2584 rcu_read_lock(); 2585 idev = __in6_dev_get(dst->dev); 2586 if (idev) 2587 mtu = idev->cnf.mtu6; 2588 rcu_read_unlock(); 2589 2590 out: 2591 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 2592 2593 return mtu - lwtunnel_headroom(dst->lwtstate, mtu); 2594 } 2595 2596 struct dst_entry *icmp6_dst_alloc(struct net_device *dev, 2597 struct flowi6 *fl6) 2598 { 2599 struct dst_entry *dst; 2600 struct rt6_info *rt; 2601 struct inet6_dev *idev = in6_dev_get(dev); 2602 struct net *net = dev_net(dev); 2603 2604 if (unlikely(!idev)) 2605 return ERR_PTR(-ENODEV); 2606 2607 rt = ip6_dst_alloc(net, dev, 0); 2608 if (unlikely(!rt)) { 2609 in6_dev_put(idev); 2610 dst = ERR_PTR(-ENOMEM); 2611 goto out; 2612 } 2613 2614 rt->dst.flags |= DST_HOST; 2615 rt->dst.input = ip6_input; 2616 rt->dst.output = ip6_output; 2617 rt->rt6i_gateway = fl6->daddr; 2618 rt->rt6i_dst.addr = fl6->daddr; 2619 rt->rt6i_dst.plen = 128; 2620 rt->rt6i_idev = idev; 2621 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0); 2622 2623 /* Add this dst into uncached_list so that rt6_disable_ip() can 2624 * do proper release of the net_device 2625 */ 2626 rt6_uncached_list_add(rt); 2627 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); 2628 2629 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0); 2630 2631 out: 2632 return dst; 2633 } 2634 2635 static int ip6_dst_gc(struct dst_ops *ops) 2636 { 2637 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); 2638 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; 2639 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size; 2640 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; 2641 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; 2642 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; 2643 int entries; 2644 2645 entries = dst_entries_get_fast(ops); 2646 if (time_after(rt_last_gc + rt_min_interval, jiffies) && 2647 entries <= rt_max_size) 2648 goto out; 2649 2650 net->ipv6.ip6_rt_gc_expire++; 2651 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true); 2652 entries = dst_entries_get_slow(ops); 2653 if (entries < ops->gc_thresh) 2654 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; 2655 out: 2656 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity; 2657 return entries > rt_max_size; 2658 } 2659 2660 static int ip6_convert_metrics(struct net *net, struct fib6_info *rt, 2661 struct fib6_config *cfg) 2662 { 2663 struct dst_metrics *p; 2664 2665 if (!cfg->fc_mx) 2666 return 0; 2667 2668 p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL); 2669 if (unlikely(!p)) 2670 return -ENOMEM; 2671 2672 refcount_set(&p->refcnt, 1); 2673 rt->fib6_metrics = p; 2674 2675 return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics); 2676 } 2677 2678 static struct rt6_info *ip6_nh_lookup_table(struct net *net, 2679 struct fib6_config *cfg, 2680 const struct in6_addr *gw_addr, 2681 u32 tbid, int flags) 2682 { 2683 struct flowi6 fl6 = { 2684 .flowi6_oif = cfg->fc_ifindex, 2685 .daddr = *gw_addr, 2686 .saddr = cfg->fc_prefsrc, 2687 }; 2688 struct fib6_table *table; 2689 struct rt6_info *rt; 2690 2691 table = 
fib6_get_table(net, tbid); 2692 if (!table) 2693 return NULL; 2694 2695 if (!ipv6_addr_any(&cfg->fc_prefsrc)) 2696 flags |= RT6_LOOKUP_F_HAS_SADDR; 2697 2698 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE; 2699 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags); 2700 2701 /* if table lookup failed, fall back to full lookup */ 2702 if (rt == net->ipv6.ip6_null_entry) { 2703 ip6_rt_put(rt); 2704 rt = NULL; 2705 } 2706 2707 return rt; 2708 } 2709 2710 static int ip6_route_check_nh_onlink(struct net *net, 2711 struct fib6_config *cfg, 2712 const struct net_device *dev, 2713 struct netlink_ext_ack *extack) 2714 { 2715 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN; 2716 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2717 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT; 2718 struct rt6_info *grt; 2719 int err; 2720 2721 err = 0; 2722 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0); 2723 if (grt) { 2724 if (!grt->dst.error && 2725 (grt->rt6i_flags & flags || dev != grt->dst.dev)) { 2726 NL_SET_ERR_MSG(extack, 2727 "Nexthop has invalid gateway or device mismatch"); 2728 err = -EINVAL; 2729 } 2730 2731 ip6_rt_put(grt); 2732 } 2733 2734 return err; 2735 } 2736 2737 static int ip6_route_check_nh(struct net *net, 2738 struct fib6_config *cfg, 2739 struct net_device **_dev, 2740 struct inet6_dev **idev) 2741 { 2742 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2743 struct net_device *dev = _dev ? *_dev : NULL; 2744 struct rt6_info *grt = NULL; 2745 int err = -EHOSTUNREACH; 2746 2747 if (cfg->fc_table) { 2748 int flags = RT6_LOOKUP_F_IFACE; 2749 2750 grt = ip6_nh_lookup_table(net, cfg, gw_addr, 2751 cfg->fc_table, flags); 2752 if (grt) { 2753 if (grt->rt6i_flags & RTF_GATEWAY || 2754 (dev && dev != grt->dst.dev)) { 2755 ip6_rt_put(grt); 2756 grt = NULL; 2757 } 2758 } 2759 } 2760 2761 if (!grt) 2762 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1); 2763 2764 if (!grt) 2765 goto out; 2766 2767 if (dev) { 2768 if (dev != grt->dst.dev) { 2769 ip6_rt_put(grt); 2770 goto out; 2771 } 2772 } else { 2773 *_dev = dev = grt->dst.dev; 2774 *idev = grt->rt6i_idev; 2775 dev_hold(dev); 2776 in6_dev_hold(grt->rt6i_idev); 2777 } 2778 2779 if (!(grt->rt6i_flags & RTF_GATEWAY)) 2780 err = 0; 2781 2782 ip6_rt_put(grt); 2783 2784 out: 2785 return err; 2786 } 2787 2788 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg, 2789 struct net_device **_dev, struct inet6_dev **idev, 2790 struct netlink_ext_ack *extack) 2791 { 2792 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2793 int gwa_type = ipv6_addr_type(gw_addr); 2794 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true; 2795 const struct net_device *dev = *_dev; 2796 bool need_addr_check = !dev; 2797 int err = -EINVAL; 2798 2799 /* if gw_addr is local we will fail to detect this in case 2800 * address is still TENTATIVE (DAD in progress). rt6_lookup() 2801 * will return already-added prefix route via interface that 2802 * prefix route was assigned to, which might be non-loopback. 2803 */ 2804 if (dev && 2805 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 2806 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 2807 goto out; 2808 } 2809 2810 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) { 2811 /* IPv6 strictly inhibits using not link-local 2812 * addresses as nexthop address. 2813 * Otherwise, router will not able to send redirects. 2814 * It is very good, but in some (rare!) 
circumstances 2815 * (SIT, PtP, NBMA NOARP links) it is handy to allow 2816 * some exceptions. --ANK 2817 * We allow IPv4-mapped nexthops to support RFC4798-type 2818 * addressing 2819 */ 2820 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) { 2821 NL_SET_ERR_MSG(extack, "Invalid gateway address"); 2822 goto out; 2823 } 2824 2825 if (cfg->fc_flags & RTNH_F_ONLINK) 2826 err = ip6_route_check_nh_onlink(net, cfg, dev, extack); 2827 else 2828 err = ip6_route_check_nh(net, cfg, _dev, idev); 2829 2830 if (err) 2831 goto out; 2832 } 2833 2834 /* reload in case device was changed */ 2835 dev = *_dev; 2836 2837 err = -EINVAL; 2838 if (!dev) { 2839 NL_SET_ERR_MSG(extack, "Egress device not specified"); 2840 goto out; 2841 } else if (dev->flags & IFF_LOOPBACK) { 2842 NL_SET_ERR_MSG(extack, 2843 "Egress device can not be loopback device for this route"); 2844 goto out; 2845 } 2846 2847 /* if we did not check gw_addr above, do so now that the 2848 * egress device has been resolved. 2849 */ 2850 if (need_addr_check && 2851 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 2852 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 2853 goto out; 2854 } 2855 2856 err = 0; 2857 out: 2858 return err; 2859 } 2860 2861 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, 2862 gfp_t gfp_flags, 2863 struct netlink_ext_ack *extack) 2864 { 2865 struct net *net = cfg->fc_nlinfo.nl_net; 2866 struct fib6_info *rt = NULL; 2867 struct net_device *dev = NULL; 2868 struct inet6_dev *idev = NULL; 2869 struct fib6_table *table; 2870 int addr_type; 2871 int err = -EINVAL; 2872 2873 /* RTF_PCPU is an internal flag; can not be set by userspace */ 2874 if (cfg->fc_flags & RTF_PCPU) { 2875 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU"); 2876 goto out; 2877 } 2878 2879 /* RTF_CACHE is an internal flag; can not be set by userspace */ 2880 if (cfg->fc_flags & RTF_CACHE) { 2881 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE"); 2882 goto out; 2883 } 2884 2885 if (cfg->fc_type > RTN_MAX) { 2886 NL_SET_ERR_MSG(extack, "Invalid route type"); 2887 goto out; 2888 } 2889 2890 if (cfg->fc_dst_len > 128) { 2891 NL_SET_ERR_MSG(extack, "Invalid prefix length"); 2892 goto out; 2893 } 2894 if (cfg->fc_src_len > 128) { 2895 NL_SET_ERR_MSG(extack, "Invalid source address length"); 2896 goto out; 2897 } 2898 #ifndef CONFIG_IPV6_SUBTREES 2899 if (cfg->fc_src_len) { 2900 NL_SET_ERR_MSG(extack, 2901 "Specifying source address requires IPV6_SUBTREES to be enabled"); 2902 goto out; 2903 } 2904 #endif 2905 if (cfg->fc_ifindex) { 2906 err = -ENODEV; 2907 dev = dev_get_by_index(net, cfg->fc_ifindex); 2908 if (!dev) 2909 goto out; 2910 idev = in6_dev_get(dev); 2911 if (!idev) 2912 goto out; 2913 } 2914 2915 if (cfg->fc_metric == 0) 2916 cfg->fc_metric = IP6_RT_PRIO_USER; 2917 2918 if (cfg->fc_flags & RTNH_F_ONLINK) { 2919 if (!dev) { 2920 NL_SET_ERR_MSG(extack, 2921 "Nexthop device required for onlink"); 2922 err = -ENODEV; 2923 goto out; 2924 } 2925 2926 if (!(dev->flags & IFF_UP)) { 2927 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 2928 err = -ENETDOWN; 2929 goto out; 2930 } 2931 } 2932 2933 err = -ENOBUFS; 2934 if (cfg->fc_nlinfo.nlh && 2935 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) { 2936 table = fib6_get_table(net, cfg->fc_table); 2937 if (!table) { 2938 pr_warn("NLM_F_CREATE should be specified when creating new route\n"); 2939 table = fib6_new_table(net, cfg->fc_table); 2940 } 2941 } else { 2942 table = fib6_new_table(net, cfg->fc_table); 2943 } 2944 2945 if (!table) 
2946 goto out; 2947 2948 err = -ENOMEM; 2949 rt = fib6_info_alloc(gfp_flags); 2950 if (!rt) 2951 goto out; 2952 2953 if (cfg->fc_flags & RTF_ADDRCONF) 2954 rt->dst_nocount = true; 2955 2956 err = ip6_convert_metrics(net, rt, cfg); 2957 if (err < 0) 2958 goto out; 2959 2960 if (cfg->fc_flags & RTF_EXPIRES) 2961 fib6_set_expires(rt, jiffies + 2962 clock_t_to_jiffies(cfg->fc_expires)); 2963 else 2964 fib6_clean_expires(rt); 2965 2966 if (cfg->fc_protocol == RTPROT_UNSPEC) 2967 cfg->fc_protocol = RTPROT_BOOT; 2968 rt->fib6_protocol = cfg->fc_protocol; 2969 2970 addr_type = ipv6_addr_type(&cfg->fc_dst); 2971 2972 if (cfg->fc_encap) { 2973 struct lwtunnel_state *lwtstate; 2974 2975 err = lwtunnel_build_state(cfg->fc_encap_type, 2976 cfg->fc_encap, AF_INET6, cfg, 2977 &lwtstate, extack); 2978 if (err) 2979 goto out; 2980 rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate); 2981 } 2982 2983 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); 2984 rt->fib6_dst.plen = cfg->fc_dst_len; 2985 if (rt->fib6_dst.plen == 128) 2986 rt->dst_host = true; 2987 2988 #ifdef CONFIG_IPV6_SUBTREES 2989 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len); 2990 rt->fib6_src.plen = cfg->fc_src_len; 2991 #endif 2992 2993 rt->fib6_metric = cfg->fc_metric; 2994 rt->fib6_nh.nh_weight = 1; 2995 2996 rt->fib6_type = cfg->fc_type; 2997 2998 /* We cannot add true routes via loopback here, 2999 they would result in kernel looping; promote them to reject routes 3000 */ 3001 if ((cfg->fc_flags & RTF_REJECT) || 3002 (dev && (dev->flags & IFF_LOOPBACK) && 3003 !(addr_type & IPV6_ADDR_LOOPBACK) && 3004 !(cfg->fc_flags & RTF_LOCAL))) { 3005 /* hold loopback dev/idev if we haven't done so. */ 3006 if (dev != net->loopback_dev) { 3007 if (dev) { 3008 dev_put(dev); 3009 in6_dev_put(idev); 3010 } 3011 dev = net->loopback_dev; 3012 dev_hold(dev); 3013 idev = in6_dev_get(dev); 3014 if (!idev) { 3015 err = -ENODEV; 3016 goto out; 3017 } 3018 } 3019 rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP; 3020 goto install_route; 3021 } 3022 3023 if (cfg->fc_flags & RTF_GATEWAY) { 3024 err = ip6_validate_gw(net, cfg, &dev, &idev, extack); 3025 if (err) 3026 goto out; 3027 3028 rt->fib6_nh.nh_gw = cfg->fc_gateway; 3029 } 3030 3031 err = -ENODEV; 3032 if (!dev) 3033 goto out; 3034 3035 if (idev->cnf.disable_ipv6) { 3036 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device"); 3037 err = -EACCES; 3038 goto out; 3039 } 3040 3041 if (!(dev->flags & IFF_UP)) { 3042 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 3043 err = -ENETDOWN; 3044 goto out; 3045 } 3046 3047 if (!ipv6_addr_any(&cfg->fc_prefsrc)) { 3048 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { 3049 NL_SET_ERR_MSG(extack, "Invalid source address"); 3050 err = -EINVAL; 3051 goto out; 3052 } 3053 rt->fib6_prefsrc.addr = cfg->fc_prefsrc; 3054 rt->fib6_prefsrc.plen = 128; 3055 } else 3056 rt->fib6_prefsrc.plen = 0; 3057 3058 rt->fib6_flags = cfg->fc_flags; 3059 3060 install_route: 3061 if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) && 3062 !netif_carrier_ok(dev)) 3063 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN; 3064 rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK); 3065 rt->fib6_nh.nh_dev = dev; 3066 rt->fib6_table = table; 3067 3068 cfg->fc_nlinfo.nl_net = dev_net(dev); 3069 3070 if (idev) 3071 in6_dev_put(idev); 3072 3073 return rt; 3074 out: 3075 if (dev) 3076 dev_put(dev); 3077 if (idev) 3078 in6_dev_put(idev); 3079 3080 fib6_info_release(rt); 3081 return ERR_PTR(err); 3082 } 3083 3084 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, 
3085 struct netlink_ext_ack *extack) 3086 { 3087 struct fib6_info *rt; 3088 int err; 3089 3090 rt = ip6_route_info_create(cfg, gfp_flags, extack); 3091 if (IS_ERR(rt)) 3092 return PTR_ERR(rt); 3093 3094 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack); 3095 fib6_info_release(rt); 3096 3097 return err; 3098 } 3099 3100 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info) 3101 { 3102 struct net *net = info->nl_net; 3103 struct fib6_table *table; 3104 int err; 3105 3106 if (rt == net->ipv6.fib6_null_entry) { 3107 err = -ENOENT; 3108 goto out; 3109 } 3110 3111 table = rt->fib6_table; 3112 spin_lock_bh(&table->tb6_lock); 3113 err = fib6_del(rt, info); 3114 spin_unlock_bh(&table->tb6_lock); 3115 3116 out: 3117 fib6_info_release(rt); 3118 return err; 3119 } 3120 3121 int ip6_del_rt(struct net *net, struct fib6_info *rt) 3122 { 3123 struct nl_info info = { .nl_net = net }; 3124 3125 return __ip6_del_rt(rt, &info); 3126 } 3127 3128 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg) 3129 { 3130 struct nl_info *info = &cfg->fc_nlinfo; 3131 struct net *net = info->nl_net; 3132 struct sk_buff *skb = NULL; 3133 struct fib6_table *table; 3134 int err = -ENOENT; 3135 3136 if (rt == net->ipv6.fib6_null_entry) 3137 goto out_put; 3138 table = rt->fib6_table; 3139 spin_lock_bh(&table->tb6_lock); 3140 3141 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) { 3142 struct fib6_info *sibling, *next_sibling; 3143 3144 /* prefer to send a single notification with all hops */ 3145 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 3146 if (skb) { 3147 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; 3148 3149 if (rt6_fill_node(net, skb, rt, NULL, 3150 NULL, NULL, 0, RTM_DELROUTE, 3151 info->portid, seq, 0) < 0) { 3152 kfree_skb(skb); 3153 skb = NULL; 3154 } else 3155 info->skip_notify = 1; 3156 } 3157 3158 list_for_each_entry_safe(sibling, next_sibling, 3159 &rt->fib6_siblings, 3160 fib6_siblings) { 3161 err = fib6_del(sibling, info); 3162 if (err) 3163 goto out_unlock; 3164 } 3165 } 3166 3167 err = fib6_del(rt, info); 3168 out_unlock: 3169 spin_unlock_bh(&table->tb6_lock); 3170 out_put: 3171 fib6_info_release(rt); 3172 3173 if (skb) { 3174 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 3175 info->nlh, gfp_any()); 3176 } 3177 return err; 3178 } 3179 3180 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) 3181 { 3182 int rc = -ESRCH; 3183 3184 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex) 3185 goto out; 3186 3187 if (cfg->fc_flags & RTF_GATEWAY && 3188 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) 3189 goto out; 3190 if (dst_hold_safe(&rt->dst)) 3191 rc = rt6_remove_exception_rt(rt); 3192 out: 3193 return rc; 3194 } 3195 3196 static int ip6_route_del(struct fib6_config *cfg, 3197 struct netlink_ext_ack *extack) 3198 { 3199 struct rt6_info *rt_cache; 3200 struct fib6_table *table; 3201 struct fib6_info *rt; 3202 struct fib6_node *fn; 3203 int err = -ESRCH; 3204 3205 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); 3206 if (!table) { 3207 NL_SET_ERR_MSG(extack, "FIB table does not exist"); 3208 return err; 3209 } 3210 3211 rcu_read_lock(); 3212 3213 fn = fib6_locate(&table->tb6_root, 3214 &cfg->fc_dst, cfg->fc_dst_len, 3215 &cfg->fc_src, cfg->fc_src_len, 3216 !(cfg->fc_flags & RTF_CACHE)); 3217 3218 if (fn) { 3219 for_each_fib6_node_rt_rcu(fn) { 3220 if (cfg->fc_flags & RTF_CACHE) { 3221 int rc; 3222 3223 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst, 3224 &cfg->fc_src); 3225 if (rt_cache) { 3226 rc = 
ip6_del_cached_rt(rt_cache, cfg); 3227 if (rc != -ESRCH) { 3228 rcu_read_unlock(); return rc; } 3229 } 3230 continue; 3231 } 3232 if (cfg->fc_ifindex && 3233 (!rt->fib6_nh.nh_dev || 3234 rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex)) 3235 continue; 3236 if (cfg->fc_flags & RTF_GATEWAY && 3237 !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw)) 3238 continue; 3239 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric) 3240 continue; 3241 if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol) 3242 continue; 3243 fib6_info_hold(rt); 3244 rcu_read_unlock(); 3245 3246 /* if a gateway was specified, only delete that one hop */ 3247 if (cfg->fc_flags & RTF_GATEWAY) 3248 return __ip6_del_rt(rt, &cfg->fc_nlinfo); 3249 3250 return __ip6_del_rt_siblings(rt, cfg); 3251 } 3252 } 3253 rcu_read_unlock(); 3254 3255 return err; 3256 } 3257 3258 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) 3259 { 3260 struct netevent_redirect netevent; 3261 struct rt6_info *rt, *nrt = NULL; 3262 struct ndisc_options ndopts; 3263 struct inet6_dev *in6_dev; 3264 struct neighbour *neigh; 3265 struct fib6_info *from; 3266 struct rd_msg *msg; 3267 int optlen, on_link; 3268 u8 *lladdr; 3269 3270 optlen = skb_tail_pointer(skb) - skb_transport_header(skb); 3271 optlen -= sizeof(*msg); 3272 3273 if (optlen < 0) { 3274 net_dbg_ratelimited("rt6_do_redirect: packet too short\n"); 3275 return; 3276 } 3277 3278 msg = (struct rd_msg *)icmp6_hdr(skb); 3279 3280 if (ipv6_addr_is_multicast(&msg->dest)) { 3281 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n"); 3282 return; 3283 } 3284 3285 on_link = 0; 3286 if (ipv6_addr_equal(&msg->dest, &msg->target)) { 3287 on_link = 1; 3288 } else if (ipv6_addr_type(&msg->target) != 3289 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { 3290 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n"); 3291 return; 3292 } 3293 3294 in6_dev = __in6_dev_get(skb->dev); 3295 if (!in6_dev) 3296 return; 3297 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects) 3298 return; 3299 3300 /* RFC2461 8.1: 3301 * The IP source address of the Redirect MUST be the same as the current 3302 * first-hop router for the specified ICMP Destination Address. 3303 */ 3304 3305 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) { 3306 net_dbg_ratelimited("rt6_redirect: invalid ND options\n"); 3307 return; 3308 } 3309 3310 lladdr = NULL; 3311 if (ndopts.nd_opts_tgt_lladdr) { 3312 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, 3313 skb->dev); 3314 if (!lladdr) { 3315 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n"); 3316 return; 3317 } 3318 } 3319 3320 rt = (struct rt6_info *) dst; 3321 if (rt->rt6i_flags & RTF_REJECT) { 3322 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n"); 3323 return; 3324 } 3325 3326 /* Redirect received -> path was valid. 3327 * Look, redirects are sent only in response to data packets, 3328 * so this nexthop apparently is reachable. --ANK 3329 */ 3330 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr); 3331 3332 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1); 3333 if (!neigh) 3334 return; 3335 3336 /* 3337 * We have finally decided to accept it. 3338 */ 3339 3340 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, 3341 NEIGH_UPDATE_F_WEAK_OVERRIDE| 3342 NEIGH_UPDATE_F_OVERRIDE| 3343 (on_link ?
0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER| 3344 NEIGH_UPDATE_F_ISROUTER)), 3345 NDISC_REDIRECT, &ndopts); 3346 3347 rcu_read_lock(); 3348 from = rcu_dereference(rt->from); /* rt->from can already be unset if the fib entry was deleted; bail out instead of dereferencing a NULL fib6_info */ if (!from) { rcu_read_unlock(); neigh_release(neigh); return; } 3349 fib6_info_hold(from); 3350 rcu_read_unlock(); 3351 3352 nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL); 3353 if (!nrt) 3354 goto out; 3355 3356 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE; 3357 if (on_link) 3358 nrt->rt6i_flags &= ~RTF_GATEWAY; 3359 3360 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; 3361 3362 /* No need to remove rt from the exception table if rt is 3363 * a cached route because rt6_insert_exception() will 3364 * take care of it 3365 */ 3366 if (rt6_insert_exception(nrt, from)) { 3367 dst_release_immediate(&nrt->dst); 3368 goto out; 3369 } 3370 3371 netevent.old = &rt->dst; 3372 netevent.new = &nrt->dst; 3373 netevent.daddr = &msg->dest; 3374 netevent.neigh = neigh; 3375 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); 3376 3377 out: 3378 fib6_info_release(from); 3379 neigh_release(neigh); 3380 } 3381 3382 #ifdef CONFIG_IPV6_ROUTE_INFO 3383 static struct fib6_info *rt6_get_route_info(struct net *net, 3384 const struct in6_addr *prefix, int prefixlen, 3385 const struct in6_addr *gwaddr, 3386 struct net_device *dev) 3387 { 3388 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; 3389 int ifindex = dev->ifindex; 3390 struct fib6_node *fn; 3391 struct fib6_info *rt = NULL; 3392 struct fib6_table *table; 3393 3394 table = fib6_get_table(net, tb_id); 3395 if (!table) 3396 return NULL; 3397 3398 rcu_read_lock(); 3399 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true); 3400 if (!fn) 3401 goto out; 3402 3403 for_each_fib6_node_rt_rcu(fn) { 3404 if (rt->fib6_nh.nh_dev->ifindex != ifindex) 3405 continue; 3406 if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY)) 3407 continue; 3408 if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr)) 3409 continue; 3410 fib6_info_hold(rt); 3411 break; 3412 } 3413 out: 3414 rcu_read_unlock(); 3415 return rt; 3416 } 3417 3418 static struct fib6_info *rt6_add_route_info(struct net *net, 3419 const struct in6_addr *prefix, int prefixlen, 3420 const struct in6_addr *gwaddr, 3421 struct net_device *dev, 3422 unsigned int pref) 3423 { 3424 struct fib6_config cfg = { 3425 .fc_metric = IP6_RT_PRIO_USER, 3426 .fc_ifindex = dev->ifindex, 3427 .fc_dst_len = prefixlen, 3428 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | 3429 RTF_UP | RTF_PREF(pref), 3430 .fc_protocol = RTPROT_RA, 3431 .fc_type = RTN_UNICAST, 3432 .fc_nlinfo.portid = 0, 3433 .fc_nlinfo.nlh = NULL, 3434 .fc_nlinfo.nl_net = net, 3435 }; 3436 3437 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; 3438 cfg.fc_dst = *prefix; 3439 cfg.fc_gateway = *gwaddr; 3440 3441 /* We should treat it as a default route if prefix length is 0. */ 3442 if (!prefixlen) 3443 cfg.fc_flags |= RTF_DEFAULT; 3444 3445 ip6_route_add(&cfg, GFP_ATOMIC, NULL); 3446 3447 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev); 3448 } 3449 #endif 3450 3451 struct fib6_info *rt6_get_dflt_router(struct net *net, 3452 const struct in6_addr *addr, 3453 struct net_device *dev) 3454 { 3455 u32 tb_id = l3mdev_fib_table(dev) ?
: RT6_TABLE_DFLT; 3456 struct fib6_info *rt; 3457 struct fib6_table *table; 3458 3459 table = fib6_get_table(net, tb_id); 3460 if (!table) 3461 return NULL; 3462 3463 rcu_read_lock(); 3464 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3465 if (dev == rt->fib6_nh.nh_dev && 3466 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && 3467 ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr)) 3468 break; 3469 } 3470 if (rt) 3471 fib6_info_hold(rt); 3472 rcu_read_unlock(); 3473 return rt; 3474 } 3475 3476 struct fib6_info *rt6_add_dflt_router(struct net *net, 3477 const struct in6_addr *gwaddr, 3478 struct net_device *dev, 3479 unsigned int pref) 3480 { 3481 struct fib6_config cfg = { 3482 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT, 3483 .fc_metric = IP6_RT_PRIO_USER, 3484 .fc_ifindex = dev->ifindex, 3485 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | 3486 RTF_UP | RTF_EXPIRES | RTF_PREF(pref), 3487 .fc_protocol = RTPROT_RA, 3488 .fc_type = RTN_UNICAST, 3489 .fc_nlinfo.portid = 0, 3490 .fc_nlinfo.nlh = NULL, 3491 .fc_nlinfo.nl_net = net, 3492 }; 3493 3494 cfg.fc_gateway = *gwaddr; 3495 3496 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) { 3497 struct fib6_table *table; 3498 3499 table = fib6_get_table(dev_net(dev), cfg.fc_table); 3500 if (table) 3501 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER; 3502 } 3503 3504 return rt6_get_dflt_router(net, gwaddr, dev); 3505 } 3506 3507 static void __rt6_purge_dflt_routers(struct net *net, 3508 struct fib6_table *table) 3509 { 3510 struct fib6_info *rt; 3511 3512 restart: 3513 rcu_read_lock(); 3514 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3515 struct net_device *dev = fib6_info_nh_dev(rt); 3516 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL; 3517 3518 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) && 3519 (!idev || idev->cnf.accept_ra != 2)) { 3520 fib6_info_hold(rt); 3521 rcu_read_unlock(); 3522 ip6_del_rt(net, rt); 3523 goto restart; 3524 } 3525 } 3526 rcu_read_unlock(); 3527 3528 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER; 3529 } 3530 3531 void rt6_purge_dflt_routers(struct net *net) 3532 { 3533 struct fib6_table *table; 3534 struct hlist_head *head; 3535 unsigned int h; 3536 3537 rcu_read_lock(); 3538 3539 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { 3540 head = &net->ipv6.fib_table_hash[h]; 3541 hlist_for_each_entry_rcu(table, head, tb6_hlist) { 3542 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER) 3543 __rt6_purge_dflt_routers(net, table); 3544 } 3545 } 3546 3547 rcu_read_unlock(); 3548 } 3549 3550 static void rtmsg_to_fib6_config(struct net *net, 3551 struct in6_rtmsg *rtmsg, 3552 struct fib6_config *cfg) 3553 { 3554 memset(cfg, 0, sizeof(*cfg)); 3555 3556 cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? 
3557 : RT6_TABLE_MAIN; 3558 cfg->fc_ifindex = rtmsg->rtmsg_ifindex; 3559 cfg->fc_metric = rtmsg->rtmsg_metric; 3560 cfg->fc_expires = rtmsg->rtmsg_info; 3561 cfg->fc_dst_len = rtmsg->rtmsg_dst_len; 3562 cfg->fc_src_len = rtmsg->rtmsg_src_len; 3563 cfg->fc_flags = rtmsg->rtmsg_flags; 3564 cfg->fc_type = rtmsg->rtmsg_type; 3565 3566 cfg->fc_nlinfo.nl_net = net; 3567 3568 cfg->fc_dst = rtmsg->rtmsg_dst; 3569 cfg->fc_src = rtmsg->rtmsg_src; 3570 cfg->fc_gateway = rtmsg->rtmsg_gateway; 3571 } 3572 3573 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) 3574 { 3575 struct fib6_config cfg; 3576 struct in6_rtmsg rtmsg; 3577 int err; 3578 3579 switch (cmd) { 3580 case SIOCADDRT: /* Add a route */ 3581 case SIOCDELRT: /* Delete a route */ 3582 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 3583 return -EPERM; 3584 err = copy_from_user(&rtmsg, arg, 3585 sizeof(struct in6_rtmsg)); 3586 if (err) 3587 return -EFAULT; 3588 3589 rtmsg_to_fib6_config(net, &rtmsg, &cfg); 3590 3591 rtnl_lock(); 3592 switch (cmd) { 3593 case SIOCADDRT: 3594 err = ip6_route_add(&cfg, GFP_KERNEL, NULL); 3595 break; 3596 case SIOCDELRT: 3597 err = ip6_route_del(&cfg, NULL); 3598 break; 3599 default: 3600 err = -EINVAL; 3601 } 3602 rtnl_unlock(); 3603 3604 return err; 3605 } 3606 3607 return -EINVAL; 3608 } 3609 3610 /* 3611 * Drop the packet on the floor 3612 */ 3613 3614 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) 3615 { 3616 int type; 3617 struct dst_entry *dst = skb_dst(skb); 3618 switch (ipstats_mib_noroutes) { 3619 case IPSTATS_MIB_INNOROUTES: 3620 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); 3621 if (type == IPV6_ADDR_ANY) { 3622 IP6_INC_STATS(dev_net(dst->dev), 3623 __in6_dev_get_safely(skb->dev), 3624 IPSTATS_MIB_INADDRERRORS); 3625 break; 3626 } 3627 /* FALLTHROUGH */ 3628 case IPSTATS_MIB_OUTNOROUTES: 3629 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), 3630 ipstats_mib_noroutes); 3631 break; 3632 } 3633 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0); 3634 kfree_skb(skb); 3635 return 0; 3636 } 3637 3638 static int ip6_pkt_discard(struct sk_buff *skb) 3639 { 3640 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES); 3641 } 3642 3643 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3644 { 3645 skb->dev = skb_dst(skb)->dev; 3646 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); 3647 } 3648 3649 static int ip6_pkt_prohibit(struct sk_buff *skb) 3650 { 3651 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES); 3652 } 3653 3654 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3655 { 3656 skb->dev = skb_dst(skb)->dev; 3657 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); 3658 } 3659 3660 /* 3661 * Allocate a dst for local (unicast / anycast) address. 
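* (Despite the wording above, this now allocates a fib6_info rather
* than a dst: addrconf uses it to install the /128 host route for
* each address it brings up, in the local table or, for enslaved
* devices, the l3mdev table.)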
3662 */ 3663 3664 struct fib6_info *addrconf_f6i_alloc(struct net *net, 3665 struct inet6_dev *idev, 3666 const struct in6_addr *addr, 3667 bool anycast, gfp_t gfp_flags) 3668 { 3669 u32 tb_id; 3670 struct net_device *dev = idev->dev; 3671 struct fib6_info *f6i; 3672 3673 f6i = fib6_info_alloc(gfp_flags); 3674 if (!f6i) 3675 return ERR_PTR(-ENOMEM); 3676 3677 f6i->dst_nocount = true; 3678 f6i->dst_host = true; 3679 f6i->fib6_protocol = RTPROT_KERNEL; 3680 f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP; 3681 if (anycast) { 3682 f6i->fib6_type = RTN_ANYCAST; 3683 f6i->fib6_flags |= RTF_ANYCAST; 3684 } else { 3685 f6i->fib6_type = RTN_LOCAL; 3686 f6i->fib6_flags |= RTF_LOCAL; 3687 } 3688 3689 f6i->fib6_nh.nh_gw = *addr; 3690 dev_hold(dev); 3691 f6i->fib6_nh.nh_dev = dev; 3692 f6i->fib6_dst.addr = *addr; 3693 f6i->fib6_dst.plen = 128; 3694 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL; 3695 f6i->fib6_table = fib6_get_table(net, tb_id); 3696 3697 return f6i; 3698 } 3699 3700 /* remove a deleted IP from prefsrc entries */ 3701 struct arg_dev_net_ip { 3702 struct net_device *dev; 3703 struct net *net; 3704 struct in6_addr *addr; 3705 }; 3706 3707 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg) 3708 { 3709 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev; 3710 struct net *net = ((struct arg_dev_net_ip *)arg)->net; 3711 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; 3712 3713 if (((void *)rt->fib6_nh.nh_dev == dev || !dev) && 3714 rt != net->ipv6.fib6_null_entry && 3715 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) { 3716 spin_lock_bh(&rt6_exception_lock); 3717 /* remove prefsrc entry */ 3718 rt->fib6_prefsrc.plen = 0; 3719 /* need to update cache as well */ 3720 rt6_exceptions_remove_prefsrc(rt); 3721 spin_unlock_bh(&rt6_exception_lock); 3722 } 3723 return 0; 3724 } 3725 3726 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) 3727 { 3728 struct net *net = dev_net(ifp->idev->dev); 3729 struct arg_dev_net_ip adni = { 3730 .dev = ifp->idev->dev, 3731 .net = net, 3732 .addr = &ifp->addr, 3733 }; 3734 fib6_clean_all(net, fib6_remove_prefsrc, &adni); 3735 } 3736 3737 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY) 3738 3739 /* Remove routers and update dst entries when a gateway turns into a host. */ 3740 static int fib6_clean_tohost(struct fib6_info *rt, void *arg) 3741 { 3742 struct in6_addr *gateway = (struct in6_addr *)arg; 3743 3744 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && 3745 ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) { 3746 return -1; 3747 } 3748 3749 /* Further clean up cached routes in exception table. 3750 * This is needed because a cached route may have a different 3751 * gateway than its 'parent' in the case of an ip redirect.
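* rt6_exceptions_clean_tohost() drops only those cached entries
* whose gateway matches the address that has stopped being a router.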
3752 */ 3753 rt6_exceptions_clean_tohost(rt, gateway); 3754 3755 return 0; 3756 } 3757 3758 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) 3759 { 3760 fib6_clean_all(net, fib6_clean_tohost, gateway); 3761 } 3762 3763 struct arg_netdev_event { 3764 const struct net_device *dev; 3765 union { 3766 unsigned int nh_flags; 3767 unsigned long event; 3768 }; 3769 }; 3770 3771 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) 3772 { 3773 struct fib6_info *iter; 3774 struct fib6_node *fn; 3775 3776 fn = rcu_dereference_protected(rt->fib6_node, 3777 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3778 iter = rcu_dereference_protected(fn->leaf, 3779 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3780 while (iter) { 3781 if (iter->fib6_metric == rt->fib6_metric && 3782 rt6_qualify_for_ecmp(iter)) 3783 return iter; 3784 iter = rcu_dereference_protected(iter->fib6_next, 3785 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3786 } 3787 3788 return NULL; 3789 } 3790 3791 static bool rt6_is_dead(const struct fib6_info *rt) 3792 { 3793 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD || 3794 (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN && 3795 fib6_ignore_linkdown(rt))) 3796 return true; 3797 3798 return false; 3799 } 3800 3801 static int rt6_multipath_total_weight(const struct fib6_info *rt) 3802 { 3803 struct fib6_info *iter; 3804 int total = 0; 3805 3806 if (!rt6_is_dead(rt)) 3807 total += rt->fib6_nh.nh_weight; 3808 3809 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { 3810 if (!rt6_is_dead(iter)) 3811 total += iter->fib6_nh.nh_weight; 3812 } 3813 3814 return total; 3815 } 3816 3817 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total) 3818 { 3819 int upper_bound = -1; 3820 3821 if (!rt6_is_dead(rt)) { 3822 *weight += rt->fib6_nh.nh_weight; 3823 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, 3824 total) - 1; 3825 } 3826 atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound); 3827 } 3828 3829 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total) 3830 { 3831 struct fib6_info *iter; 3832 int weight = 0; 3833 3834 rt6_upper_bound_set(rt, &weight, total); 3835 3836 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3837 rt6_upper_bound_set(iter, &weight, total); 3838 } 3839 3840 void rt6_multipath_rebalance(struct fib6_info *rt) 3841 { 3842 struct fib6_info *first; 3843 int total; 3844 3845 /* In case the entire multipath route was marked for flushing, 3846 * then there is no need to rebalance upon the removal of every 3847 * sibling route. 3848 */ 3849 if (!rt->fib6_nsiblings || rt->should_flush) 3850 return; 3851 3852 /* During lookup routes are evaluated in order, so we need to 3853 * make sure upper bounds are assigned from the first sibling 3854 * onwards. 
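* Worked example: sibling weights 1 and 2 give total 3, so
* rt6_upper_bound_set() assigns upper bounds of (1 << 31) / 3 - 1
* and (3 << 31) / 3 - 1 = 2^31 - 1. A flow's 31-bit multipath hash
* is compared against these bounds, picking the two nexthops in a
* 1:2 ratio (hash-threshold selection).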
3855 */ 3856 first = rt6_multipath_first_sibling(rt); 3857 if (WARN_ON_ONCE(!first)) 3858 return; 3859 3860 total = rt6_multipath_total_weight(first); 3861 rt6_multipath_upper_bound_set(first, total); 3862 } 3863 3864 static int fib6_ifup(struct fib6_info *rt, void *p_arg) 3865 { 3866 const struct arg_netdev_event *arg = p_arg; 3867 struct net *net = dev_net(arg->dev); 3868 3869 if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) { 3870 rt->fib6_nh.nh_flags &= ~arg->nh_flags; 3871 fib6_update_sernum_upto_root(net, rt); 3872 rt6_multipath_rebalance(rt); 3873 } 3874 3875 return 0; 3876 } 3877 3878 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags) 3879 { 3880 struct arg_netdev_event arg = { 3881 .dev = dev, 3882 { 3883 .nh_flags = nh_flags, 3884 }, 3885 }; 3886 3887 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev)) 3888 arg.nh_flags |= RTNH_F_LINKDOWN; 3889 3890 fib6_clean_all(dev_net(dev), fib6_ifup, &arg); 3891 } 3892 3893 static bool rt6_multipath_uses_dev(const struct fib6_info *rt, 3894 const struct net_device *dev) 3895 { 3896 struct fib6_info *iter; 3897 3898 if (rt->fib6_nh.nh_dev == dev) 3899 return true; 3900 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3901 if (iter->fib6_nh.nh_dev == dev) 3902 return true; 3903 3904 return false; 3905 } 3906 3907 static void rt6_multipath_flush(struct fib6_info *rt) 3908 { 3909 struct fib6_info *iter; 3910 3911 rt->should_flush = 1; 3912 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3913 iter->should_flush = 1; 3914 } 3915 3916 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt, 3917 const struct net_device *down_dev) 3918 { 3919 struct fib6_info *iter; 3920 unsigned int dead = 0; 3921 3922 if (rt->fib6_nh.nh_dev == down_dev || 3923 rt->fib6_nh.nh_flags & RTNH_F_DEAD) 3924 dead++; 3925 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3926 if (iter->fib6_nh.nh_dev == down_dev || 3927 iter->fib6_nh.nh_flags & RTNH_F_DEAD) 3928 dead++; 3929 3930 return dead; 3931 } 3932 3933 static void rt6_multipath_nh_flags_set(struct fib6_info *rt, 3934 const struct net_device *dev, 3935 unsigned int nh_flags) 3936 { 3937 struct fib6_info *iter; 3938 3939 if (rt->fib6_nh.nh_dev == dev) 3940 rt->fib6_nh.nh_flags |= nh_flags; 3941 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3942 if (iter->fib6_nh.nh_dev == dev) 3943 iter->fib6_nh.nh_flags |= nh_flags; 3944 } 3945 3946 /* called with write lock held for table with rt */ 3947 static int fib6_ifdown(struct fib6_info *rt, void *p_arg) 3948 { 3949 const struct arg_netdev_event *arg = p_arg; 3950 const struct net_device *dev = arg->dev; 3951 struct net *net = dev_net(dev); 3952 3953 if (rt == net->ipv6.fib6_null_entry) 3954 return 0; 3955 3956 switch (arg->event) { 3957 case NETDEV_UNREGISTER: 3958 return rt->fib6_nh.nh_dev == dev ? -1 : 0; 3959 case NETDEV_DOWN: 3960 if (rt->should_flush) 3961 return -1; 3962 if (!rt->fib6_nsiblings) 3963 return rt->fib6_nh.nh_dev == dev ? 
-1 : 0; 3964 if (rt6_multipath_uses_dev(rt, dev)) { 3965 unsigned int count; 3966 3967 count = rt6_multipath_dead_count(rt, dev); 3968 if (rt->fib6_nsiblings + 1 == count) { 3969 rt6_multipath_flush(rt); 3970 return -1; 3971 } 3972 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD | 3973 RTNH_F_LINKDOWN); 3974 fib6_update_sernum(net, rt); 3975 rt6_multipath_rebalance(rt); 3976 } 3977 return -2; 3978 case NETDEV_CHANGE: 3979 if (rt->fib6_nh.nh_dev != dev || 3980 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) 3981 break; 3982 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN; 3983 rt6_multipath_rebalance(rt); 3984 break; 3985 } 3986 3987 return 0; 3988 } 3989 3990 void rt6_sync_down_dev(struct net_device *dev, unsigned long event) 3991 { 3992 struct arg_netdev_event arg = { 3993 .dev = dev, 3994 { 3995 .event = event, 3996 }, 3997 }; 3998 3999 fib6_clean_all(dev_net(dev), fib6_ifdown, &arg); 4000 } 4001 4002 void rt6_disable_ip(struct net_device *dev, unsigned long event) 4003 { 4004 rt6_sync_down_dev(dev, event); 4005 rt6_uncached_list_flush_dev(dev_net(dev), dev); 4006 neigh_ifdown(&nd_tbl, dev); 4007 } 4008 4009 struct rt6_mtu_change_arg { 4010 struct net_device *dev; 4011 unsigned int mtu; 4012 }; 4013 4014 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg) 4015 { 4016 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; 4017 struct inet6_dev *idev; 4018 4019 /* In IPv6 pmtu discovery is not optional, 4020 so that RTAX_MTU lock cannot disable it. 4021 We still use this lock to block changes 4022 caused by addrconf/ndisc. 4023 */ 4024 4025 idev = __in6_dev_get(arg->dev); 4026 if (!idev) 4027 return 0; 4028 4029 /* For administrative MTU increase, there is no way to discover 4030 IPv6 PMTU increase, so PMTU increase should be updated here. 4031 Since RFC 1981 doesn't include administrative MTU increase 4032 update PMTU increase is a MUST. (i.e. 
jumbo frame) 4033 */ 4034 if (rt->fib6_nh.nh_dev == arg->dev && 4035 !fib6_metric_locked(rt, RTAX_MTU)) { 4036 u32 mtu = rt->fib6_pmtu; 4037 4038 if (mtu >= arg->mtu || 4039 (mtu < arg->mtu && mtu == idev->cnf.mtu6)) 4040 fib6_metric_set(rt, RTAX_MTU, arg->mtu); 4041 4042 spin_lock_bh(&rt6_exception_lock); 4043 rt6_exceptions_update_pmtu(idev, rt, arg->mtu); 4044 spin_unlock_bh(&rt6_exception_lock); 4045 } 4046 return 0; 4047 } 4048 4049 void rt6_mtu_change(struct net_device *dev, unsigned int mtu) 4050 { 4051 struct rt6_mtu_change_arg arg = { 4052 .dev = dev, 4053 .mtu = mtu, 4054 }; 4055 4056 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg); 4057 } 4058 4059 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { 4060 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) }, 4061 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) }, 4062 [RTA_OIF] = { .type = NLA_U32 }, 4063 [RTA_IIF] = { .type = NLA_U32 }, 4064 [RTA_PRIORITY] = { .type = NLA_U32 }, 4065 [RTA_METRICS] = { .type = NLA_NESTED }, 4066 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, 4067 [RTA_PREF] = { .type = NLA_U8 }, 4068 [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, 4069 [RTA_ENCAP] = { .type = NLA_NESTED }, 4070 [RTA_EXPIRES] = { .type = NLA_U32 }, 4071 [RTA_UID] = { .type = NLA_U32 }, 4072 [RTA_MARK] = { .type = NLA_U32 }, 4073 [RTA_TABLE] = { .type = NLA_U32 }, 4074 }; 4075 4076 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, 4077 struct fib6_config *cfg, 4078 struct netlink_ext_ack *extack) 4079 { 4080 struct rtmsg *rtm; 4081 struct nlattr *tb[RTA_MAX+1]; 4082 unsigned int pref; 4083 int err; 4084 4085 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, 4086 NULL); 4087 if (err < 0) 4088 goto errout; 4089 4090 err = -EINVAL; 4091 rtm = nlmsg_data(nlh); 4092 memset(cfg, 0, sizeof(*cfg)); 4093 4094 cfg->fc_table = rtm->rtm_table; 4095 cfg->fc_dst_len = rtm->rtm_dst_len; 4096 cfg->fc_src_len = rtm->rtm_src_len; 4097 cfg->fc_flags = RTF_UP; 4098 cfg->fc_protocol = rtm->rtm_protocol; 4099 cfg->fc_type = rtm->rtm_type; 4100 4101 if (rtm->rtm_type == RTN_UNREACHABLE || 4102 rtm->rtm_type == RTN_BLACKHOLE || 4103 rtm->rtm_type == RTN_PROHIBIT || 4104 rtm->rtm_type == RTN_THROW) 4105 cfg->fc_flags |= RTF_REJECT; 4106 4107 if (rtm->rtm_type == RTN_LOCAL) 4108 cfg->fc_flags |= RTF_LOCAL; 4109 4110 if (rtm->rtm_flags & RTM_F_CLONED) 4111 cfg->fc_flags |= RTF_CACHE; 4112 4113 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK); 4114 4115 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid; 4116 cfg->fc_nlinfo.nlh = nlh; 4117 cfg->fc_nlinfo.nl_net = sock_net(skb->sk); 4118 4119 if (tb[RTA_GATEWAY]) { 4120 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); 4121 cfg->fc_flags |= RTF_GATEWAY; 4122 } 4123 4124 if (tb[RTA_DST]) { 4125 int plen = (rtm->rtm_dst_len + 7) >> 3; 4126 4127 if (nla_len(tb[RTA_DST]) < plen) 4128 goto errout; 4129 4130 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen); 4131 } 4132 4133 if (tb[RTA_SRC]) { 4134 int plen = (rtm->rtm_src_len + 7) >> 3; 4135 4136 if (nla_len(tb[RTA_SRC]) < plen) 4137 goto errout; 4138 4139 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen); 4140 } 4141 4142 if (tb[RTA_PREFSRC]) 4143 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]); 4144 4145 if (tb[RTA_OIF]) 4146 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]); 4147 4148 if (tb[RTA_PRIORITY]) 4149 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]); 4150 4151 if (tb[RTA_METRICS]) { 4152 cfg->fc_mx = nla_data(tb[RTA_METRICS]); 4153 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]); 4154 } 4155 4156 if (tb[RTA_TABLE]) 
4157 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]); 4158 4159 if (tb[RTA_MULTIPATH]) { 4160 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]); 4161 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); 4162 4163 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp, 4164 cfg->fc_mp_len, extack); 4165 if (err < 0) 4166 goto errout; 4167 } 4168 4169 if (tb[RTA_PREF]) { 4170 pref = nla_get_u8(tb[RTA_PREF]); 4171 if (pref != ICMPV6_ROUTER_PREF_LOW && 4172 pref != ICMPV6_ROUTER_PREF_HIGH) 4173 pref = ICMPV6_ROUTER_PREF_MEDIUM; 4174 cfg->fc_flags |= RTF_PREF(pref); 4175 } 4176 4177 if (tb[RTA_ENCAP]) 4178 cfg->fc_encap = tb[RTA_ENCAP]; 4179 4180 if (tb[RTA_ENCAP_TYPE]) { 4181 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); 4182 4183 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack); 4184 if (err < 0) 4185 goto errout; 4186 } 4187 4188 if (tb[RTA_EXPIRES]) { 4189 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ); 4190 4191 if (addrconf_finite_timeout(timeout)) { 4192 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ); 4193 cfg->fc_flags |= RTF_EXPIRES; 4194 } 4195 } 4196 4197 err = 0; 4198 errout: 4199 return err; 4200 } 4201 4202 struct rt6_nh { 4203 struct fib6_info *fib6_info; 4204 struct fib6_config r_cfg; 4205 struct list_head next; 4206 }; 4207 4208 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list) 4209 { 4210 struct rt6_nh *nh; 4211 4212 list_for_each_entry(nh, rt6_nh_list, next) { 4213 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n", 4214 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway, 4215 nh->r_cfg.fc_ifindex); 4216 } 4217 } 4218 4219 static int ip6_route_info_append(struct net *net, 4220 struct list_head *rt6_nh_list, 4221 struct fib6_info *rt, 4222 struct fib6_config *r_cfg) 4223 { 4224 struct rt6_nh *nh; 4225 int err = -EEXIST; 4226 4227 list_for_each_entry(nh, rt6_nh_list, next) { 4228 /* check if fib6_info already exists */ 4229 if (rt6_duplicate_nexthop(nh->fib6_info, rt)) 4230 return err; 4231 } 4232 4233 nh = kzalloc(sizeof(*nh), GFP_KERNEL); 4234 if (!nh) 4235 return -ENOMEM; 4236 nh->fib6_info = rt; 4237 err = ip6_convert_metrics(net, rt, r_cfg); 4238 if (err) { 4239 kfree(nh); 4240 return err; 4241 } 4242 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); 4243 list_add_tail(&nh->next, rt6_nh_list); 4244 4245 return 0; 4246 } 4247 4248 static void ip6_route_mpath_notify(struct fib6_info *rt, 4249 struct fib6_info *rt_last, 4250 struct nl_info *info, 4251 __u16 nlflags) 4252 { 4253 /* if this is an APPEND route, then rt points to the first route 4254 * inserted and rt_last points to last route inserted. Userspace 4255 * wants a consistent dump of the route which starts at the first 4256 * nexthop. 
Since sibling routes are always added at the end of
4257 * the list, find the first sibling of the last route appended
4258 */
4259 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4260 rt = list_first_entry(&rt_last->fib6_siblings,
4261 struct fib6_info,
4262 fib6_siblings);
4263 }
4264
4265 if (rt)
4266 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4267 }
4268
4269 static int ip6_route_multipath_add(struct fib6_config *cfg,
4270 struct netlink_ext_ack *extack)
4271 {
4272 struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4273 struct nl_info *info = &cfg->fc_nlinfo;
4274 struct fib6_config r_cfg;
4275 struct rtnexthop *rtnh;
4276 struct fib6_info *rt;
4277 struct rt6_nh *err_nh;
4278 struct rt6_nh *nh, *nh_safe;
4279 __u16 nlflags;
4280 int remaining;
4281 int attrlen;
4282 int err = 1;
4283 int nhn = 0;
4284 int replace = (cfg->fc_nlinfo.nlh &&
4285 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4286 LIST_HEAD(rt6_nh_list);
4287
4288 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4289 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4290 nlflags |= NLM_F_APPEND;
4291
4292 remaining = cfg->fc_mp_len;
4293 rtnh = (struct rtnexthop *)cfg->fc_mp;
4294
4295 /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4296 * fib6_info structs, one per nexthop
4297 */
4298 while (rtnh_ok(rtnh, remaining)) {
4299 memcpy(&r_cfg, cfg, sizeof(*cfg));
4300 if (rtnh->rtnh_ifindex)
4301 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4302
4303 attrlen = rtnh_attrlen(rtnh);
4304 if (attrlen > 0) {
4305 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4306
4307 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4308 if (nla) {
4309 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4310 r_cfg.fc_flags |= RTF_GATEWAY;
4311 }
4312 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4313 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4314 if (nla)
4315 r_cfg.fc_encap_type = nla_get_u16(nla);
4316 }
4317
4318 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4319 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4320 if (IS_ERR(rt)) {
4321 err = PTR_ERR(rt);
4322 rt = NULL;
4323 goto cleanup;
4324 }
4325
4326 rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;
4327
4328 err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4329 rt, &r_cfg);
4330 if (err) {
4331 fib6_info_release(rt);
4332 goto cleanup;
4333 }
4334
4335 rtnh = rtnh_next(rtnh, &remaining);
4336 }
4337
4338 /* for add and replace, send one notification with all nexthops.
4339 * Skip the notification in fib6_add_rt2node and send one with
4340 * the full route when done
4341 */
4342 info->skip_notify = 1;
4343
4344 err_nh = NULL;
4345 list_for_each_entry(nh, &rt6_nh_list, next) {
4346 rt_last = nh->fib6_info;
4347 err = __ip6_ins_rt(nh->fib6_info, info, extack);
4348 fib6_info_release(nh->fib6_info);
4349
4350 /* save reference to first route for notification */
4351 if (!rt_notif && !err)
4352 rt_notif = nh->fib6_info;
4353
4354 /* nh->fib6_info is used or freed at this point; reset it to NULL */
4355 nh->fib6_info = NULL;
4356 if (err) {
4357 if (replace && nhn)
4358 ip6_print_replace_route_err(&rt6_nh_list);
4359 err_nh = nh;
4360 goto add_errout;
4361 }
4362
4363 /* Because each route is added as if it were a single route, we
4364 * remove these flags after the first nexthop: if there is a
4365 * collision, we have already failed to add the first nexthop:
4366 * fib6_add_rt2node() has rejected it; when replacing, the old
4367 * nexthops have been replaced by the first new one, and the rest
4368 * should be appended to it.
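 * E.g. for a replace with three new nexthops, the first
 * __ip6_ins_rt() call swaps out the old route; with NLM_F_EXCL and
 * NLM_F_REPLACE cleared, the remaining two are then appended as
 * siblings of the first.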
4369 */ 4370 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | 4371 NLM_F_REPLACE); 4372 nhn++; 4373 } 4374 4375 /* success ... tell user about new route */ 4376 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 4377 goto cleanup; 4378 4379 add_errout: 4380 /* send notification for routes that were added so that 4381 * the delete notifications sent by ip6_route_del are 4382 * coherent 4383 */ 4384 if (rt_notif) 4385 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 4386 4387 /* Delete routes that were already added */ 4388 list_for_each_entry(nh, &rt6_nh_list, next) { 4389 if (err_nh == nh) 4390 break; 4391 ip6_route_del(&nh->r_cfg, extack); 4392 } 4393 4394 cleanup: 4395 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { 4396 if (nh->fib6_info) 4397 fib6_info_release(nh->fib6_info); 4398 list_del(&nh->next); 4399 kfree(nh); 4400 } 4401 4402 return err; 4403 } 4404 4405 static int ip6_route_multipath_del(struct fib6_config *cfg, 4406 struct netlink_ext_ack *extack) 4407 { 4408 struct fib6_config r_cfg; 4409 struct rtnexthop *rtnh; 4410 int remaining; 4411 int attrlen; 4412 int err = 1, last_err = 0; 4413 4414 remaining = cfg->fc_mp_len; 4415 rtnh = (struct rtnexthop *)cfg->fc_mp; 4416 4417 /* Parse a Multipath Entry */ 4418 while (rtnh_ok(rtnh, remaining)) { 4419 memcpy(&r_cfg, cfg, sizeof(*cfg)); 4420 if (rtnh->rtnh_ifindex) 4421 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 4422 4423 attrlen = rtnh_attrlen(rtnh); 4424 if (attrlen > 0) { 4425 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 4426 4427 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 4428 if (nla) { 4429 nla_memcpy(&r_cfg.fc_gateway, nla, 16); 4430 r_cfg.fc_flags |= RTF_GATEWAY; 4431 } 4432 } 4433 err = ip6_route_del(&r_cfg, extack); 4434 if (err) 4435 last_err = err; 4436 4437 rtnh = rtnh_next(rtnh, &remaining); 4438 } 4439 4440 return last_err; 4441 } 4442 4443 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, 4444 struct netlink_ext_ack *extack) 4445 { 4446 struct fib6_config cfg; 4447 int err; 4448 4449 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 4450 if (err < 0) 4451 return err; 4452 4453 if (cfg.fc_mp) 4454 return ip6_route_multipath_del(&cfg, extack); 4455 else { 4456 cfg.fc_delete_all_nh = 1; 4457 return ip6_route_del(&cfg, extack); 4458 } 4459 } 4460 4461 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, 4462 struct netlink_ext_ack *extack) 4463 { 4464 struct fib6_config cfg; 4465 int err; 4466 4467 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 4468 if (err < 0) 4469 return err; 4470 4471 if (cfg.fc_mp) 4472 return ip6_route_multipath_add(&cfg, extack); 4473 else 4474 return ip6_route_add(&cfg, GFP_KERNEL, extack); 4475 } 4476 4477 static size_t rt6_nlmsg_size(struct fib6_info *rt) 4478 { 4479 int nexthop_len = 0; 4480 4481 if (rt->fib6_nsiblings) { 4482 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ 4483 + NLA_ALIGN(sizeof(struct rtnexthop)) 4484 + nla_total_size(16) /* RTA_GATEWAY */ 4485 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate); 4486 4487 nexthop_len *= rt->fib6_nsiblings; 4488 } 4489 4490 return NLMSG_ALIGN(sizeof(struct rtmsg)) 4491 + nla_total_size(16) /* RTA_SRC */ 4492 + nla_total_size(16) /* RTA_DST */ 4493 + nla_total_size(16) /* RTA_GATEWAY */ 4494 + nla_total_size(16) /* RTA_PREFSRC */ 4495 + nla_total_size(4) /* RTA_TABLE */ 4496 + nla_total_size(4) /* RTA_IIF */ 4497 + nla_total_size(4) /* RTA_OIF */ 4498 + nla_total_size(4) /* RTA_PRIORITY */ 4499 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */ 4500 + 
nla_total_size(sizeof(struct rta_cacheinfo)) 4501 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ 4502 + nla_total_size(1) /* RTA_PREF */ 4503 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate) 4504 + nexthop_len; 4505 } 4506 4507 static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt, 4508 unsigned int *flags, bool skip_oif) 4509 { 4510 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) 4511 *flags |= RTNH_F_DEAD; 4512 4513 if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) { 4514 *flags |= RTNH_F_LINKDOWN; 4515 4516 rcu_read_lock(); 4517 if (fib6_ignore_linkdown(rt)) 4518 *flags |= RTNH_F_DEAD; 4519 rcu_read_unlock(); 4520 } 4521 4522 if (rt->fib6_flags & RTF_GATEWAY) { 4523 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0) 4524 goto nla_put_failure; 4525 } 4526 4527 *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK); 4528 if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD) 4529 *flags |= RTNH_F_OFFLOAD; 4530 4531 /* not needed for multipath encoding b/c it has a rtnexthop struct */ 4532 if (!skip_oif && rt->fib6_nh.nh_dev && 4533 nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex)) 4534 goto nla_put_failure; 4535 4536 if (rt->fib6_nh.nh_lwtstate && 4537 lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0) 4538 goto nla_put_failure; 4539 4540 return 0; 4541 4542 nla_put_failure: 4543 return -EMSGSIZE; 4544 } 4545 4546 /* add multipath next hop */ 4547 static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt) 4548 { 4549 const struct net_device *dev = rt->fib6_nh.nh_dev; 4550 struct rtnexthop *rtnh; 4551 unsigned int flags = 0; 4552 4553 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh)); 4554 if (!rtnh) 4555 goto nla_put_failure; 4556 4557 rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1; 4558 rtnh->rtnh_ifindex = dev ? dev->ifindex : 0; 4559 4560 if (rt6_nexthop_info(skb, rt, &flags, true) < 0) 4561 goto nla_put_failure; 4562 4563 rtnh->rtnh_flags = flags; 4564 4565 /* length of rtnetlink header + attributes */ 4566 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh; 4567 4568 return 0; 4569 4570 nla_put_failure: 4571 return -EMSGSIZE; 4572 } 4573 4574 static int rt6_fill_node(struct net *net, struct sk_buff *skb, 4575 struct fib6_info *rt, struct dst_entry *dst, 4576 struct in6_addr *dest, struct in6_addr *src, 4577 int iif, int type, u32 portid, u32 seq, 4578 unsigned int flags) 4579 { 4580 struct rtmsg *rtm; 4581 struct nlmsghdr *nlh; 4582 long expires = 0; 4583 u32 *pmetrics; 4584 u32 table; 4585 4586 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags); 4587 if (!nlh) 4588 return -EMSGSIZE; 4589 4590 rtm = nlmsg_data(nlh); 4591 rtm->rtm_family = AF_INET6; 4592 rtm->rtm_dst_len = rt->fib6_dst.plen; 4593 rtm->rtm_src_len = rt->fib6_src.plen; 4594 rtm->rtm_tos = 0; 4595 if (rt->fib6_table) 4596 table = rt->fib6_table->tb6_id; 4597 else 4598 table = RT6_TABLE_UNSPEC; 4599 rtm->rtm_table = table; 4600 if (nla_put_u32(skb, RTA_TABLE, table)) 4601 goto nla_put_failure; 4602 4603 rtm->rtm_type = rt->fib6_type; 4604 rtm->rtm_flags = 0; 4605 rtm->rtm_scope = RT_SCOPE_UNIVERSE; 4606 rtm->rtm_protocol = rt->fib6_protocol; 4607 4608 if (rt->fib6_flags & RTF_CACHE) 4609 rtm->rtm_flags |= RTM_F_CLONED; 4610 4611 if (dest) { 4612 if (nla_put_in6_addr(skb, RTA_DST, dest)) 4613 goto nla_put_failure; 4614 rtm->rtm_dst_len = 128; 4615 } else if (rtm->rtm_dst_len) 4616 if (nla_put_in6_addr(skb, RTA_DST, &rt->fib6_dst.addr)) 4617 goto nla_put_failure; 4618 #ifdef CONFIG_IPV6_SUBTREES 4619 if (src) { 4620 if (nla_put_in6_addr(skb, RTA_SRC, src)) 4621 goto nla_put_failure; 4622 
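/* an exact source address was supplied, so advertise a full /128 source prefix */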
rtm->rtm_src_len = 128; 4623 } else if (rtm->rtm_src_len && 4624 nla_put_in6_addr(skb, RTA_SRC, &rt->fib6_src.addr)) 4625 goto nla_put_failure; 4626 #endif 4627 if (iif) { 4628 #ifdef CONFIG_IPV6_MROUTE 4629 if (ipv6_addr_is_multicast(&rt->fib6_dst.addr)) { 4630 int err = ip6mr_get_route(net, skb, rtm, portid); 4631 4632 if (err == 0) 4633 return 0; 4634 if (err < 0) 4635 goto nla_put_failure; 4636 } else 4637 #endif 4638 if (nla_put_u32(skb, RTA_IIF, iif)) 4639 goto nla_put_failure; 4640 } else if (dest) { 4641 struct in6_addr saddr_buf; 4642 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 && 4643 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 4644 goto nla_put_failure; 4645 } 4646 4647 if (rt->fib6_prefsrc.plen) { 4648 struct in6_addr saddr_buf; 4649 saddr_buf = rt->fib6_prefsrc.addr; 4650 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 4651 goto nla_put_failure; 4652 } 4653 4654 pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics; 4655 if (rtnetlink_put_metrics(skb, pmetrics) < 0) 4656 goto nla_put_failure; 4657 4658 if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric)) 4659 goto nla_put_failure; 4660 4661 /* For multipath routes, walk the siblings list and add 4662 * each as a nexthop within RTA_MULTIPATH. 4663 */ 4664 if (rt->fib6_nsiblings) { 4665 struct fib6_info *sibling, *next_sibling; 4666 struct nlattr *mp; 4667 4668 mp = nla_nest_start(skb, RTA_MULTIPATH); 4669 if (!mp) 4670 goto nla_put_failure; 4671 4672 if (rt6_add_nexthop(skb, rt) < 0) 4673 goto nla_put_failure; 4674 4675 list_for_each_entry_safe(sibling, next_sibling, 4676 &rt->fib6_siblings, fib6_siblings) { 4677 if (rt6_add_nexthop(skb, sibling) < 0) 4678 goto nla_put_failure; 4679 } 4680 4681 nla_nest_end(skb, mp); 4682 } else { 4683 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0) 4684 goto nla_put_failure; 4685 } 4686 4687 if (rt->fib6_flags & RTF_EXPIRES) { 4688 expires = dst ? dst->expires : rt->expires; 4689 expires -= jiffies; 4690 } 4691 4692 if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? 
dst->error : 0) < 0) 4693 goto nla_put_failure; 4694 4695 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->fib6_flags))) 4696 goto nla_put_failure; 4697 4698 4699 nlmsg_end(skb, nlh); 4700 return 0; 4701 4702 nla_put_failure: 4703 nlmsg_cancel(skb, nlh); 4704 return -EMSGSIZE; 4705 } 4706 4707 int rt6_dump_route(struct fib6_info *rt, void *p_arg) 4708 { 4709 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; 4710 struct net *net = arg->net; 4711 4712 if (rt == net->ipv6.fib6_null_entry) 4713 return 0; 4714 4715 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) { 4716 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh); 4717 4718 /* user wants prefix routes only */ 4719 if (rtm->rtm_flags & RTM_F_PREFIX && 4720 !(rt->fib6_flags & RTF_PREFIX_RT)) { 4721 /* success since this is not a prefix route */ 4722 return 1; 4723 } 4724 } 4725 4726 return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0, 4727 RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid, 4728 arg->cb->nlh->nlmsg_seq, NLM_F_MULTI); 4729 } 4730 4731 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, 4732 struct netlink_ext_ack *extack) 4733 { 4734 struct net *net = sock_net(in_skb->sk); 4735 struct nlattr *tb[RTA_MAX+1]; 4736 int err, iif = 0, oif = 0; 4737 struct fib6_info *from; 4738 struct dst_entry *dst; 4739 struct rt6_info *rt; 4740 struct sk_buff *skb; 4741 struct rtmsg *rtm; 4742 struct flowi6 fl6; 4743 bool fibmatch; 4744 4745 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, 4746 extack); 4747 if (err < 0) 4748 goto errout; 4749 4750 err = -EINVAL; 4751 memset(&fl6, 0, sizeof(fl6)); 4752 rtm = nlmsg_data(nlh); 4753 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0); 4754 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH); 4755 4756 if (tb[RTA_SRC]) { 4757 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr)) 4758 goto errout; 4759 4760 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]); 4761 } 4762 4763 if (tb[RTA_DST]) { 4764 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr)) 4765 goto errout; 4766 4767 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]); 4768 } 4769 4770 if (tb[RTA_IIF]) 4771 iif = nla_get_u32(tb[RTA_IIF]); 4772 4773 if (tb[RTA_OIF]) 4774 oif = nla_get_u32(tb[RTA_OIF]); 4775 4776 if (tb[RTA_MARK]) 4777 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]); 4778 4779 if (tb[RTA_UID]) 4780 fl6.flowi6_uid = make_kuid(current_user_ns(), 4781 nla_get_u32(tb[RTA_UID])); 4782 else 4783 fl6.flowi6_uid = iif ? 
INVALID_UID : current_uid();
4784
4785 if (iif) {
4786 struct net_device *dev;
4787 int flags = 0;
4788
4789 rcu_read_lock();
4790
4791 dev = dev_get_by_index_rcu(net, iif);
4792 if (!dev) {
4793 rcu_read_unlock();
4794 err = -ENODEV;
4795 goto errout;
4796 }
4797
4798 fl6.flowi6_iif = iif;
4799
4800 if (!ipv6_addr_any(&fl6.saddr))
4801 flags |= RT6_LOOKUP_F_HAS_SADDR;
4802
4803 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
4804
4805 rcu_read_unlock();
4806 } else {
4807 fl6.flowi6_oif = oif;
4808
4809 dst = ip6_route_output(net, NULL, &fl6);
4810 }
4811
4812
4813 rt = container_of(dst, struct rt6_info, dst);
4814 if (rt->dst.error) {
4815 err = rt->dst.error;
4816 ip6_rt_put(rt);
4817 goto errout;
4818 }
4819
4820 if (rt == net->ipv6.ip6_null_entry) {
4821 err = rt->dst.error;
4822 ip6_rt_put(rt);
4823 goto errout;
4824 }
4825
4826 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4827 if (!skb) {
4828 ip6_rt_put(rt);
4829 err = -ENOBUFS;
4830 goto errout;
4831 }
4832
4833 skb_dst_set(skb, &rt->dst);
4834
4835 rcu_read_lock();
4836 from = rcu_dereference(rt->from);
4837
4838 if (fibmatch)
4839 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
4840 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4841 nlh->nlmsg_seq, 0);
4842 else
4843 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
4844 &fl6.saddr, iif, RTM_NEWROUTE,
4845 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
4846 0);
4847 rcu_read_unlock();
4848
4849 if (err < 0) {
4850 kfree_skb(skb);
4851 goto errout;
4852 }
4853
4854 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
4855 errout:
4856 return err;
4857 }
4858
4859 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
4860 unsigned int nlm_flags)
4861 {
4862 struct sk_buff *skb;
4863 struct net *net = info->nl_net;
4864 u32 seq;
4865 int err;
4866
4867 err = -ENOBUFS;
4868 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4869
4870 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4871 if (!skb)
4872 goto errout;
4873
4874 err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
4875 event, info->portid, seq, nlm_flags);
4876 if (err < 0) {
4877 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4878 WARN_ON(err == -EMSGSIZE);
4879 kfree_skb(skb);
4880 goto errout;
4881 }
4882 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4883 info->nlh, gfp_any());
4884 return;
4885 errout:
4886 if (err < 0)
4887 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4888 }
4889
4890 static int ip6_route_dev_notify(struct notifier_block *this,
4891 unsigned long event, void *ptr)
4892 {
4893 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4894 struct net *net = dev_net(dev);
4895
4896 if (!(dev->flags & IFF_LOOPBACK))
4897 return NOTIFY_OK;
4898
4899 if (event == NETDEV_REGISTER) {
4900 net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
4901 net->ipv6.ip6_null_entry->dst.dev = dev;
4902 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
4903 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4904 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
4905 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
4906 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
4907 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
4908 #endif
4909 } else if (event == NETDEV_UNREGISTER &&
4910 dev->reg_state != NETREG_UNREGISTERED) {
4911 /* NETDEV_UNREGISTER can be fired multiple times by
4912 * netdev_wait_allrefs(). Make sure we only call this once.
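 * Once the first pass has executed, dev->reg_state is
 * NETREG_UNREGISTERED, so the repeated notifications are filtered
 * out by the reg_state check above.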
4913 */ 4914 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev); 4915 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 4916 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev); 4917 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev); 4918 #endif 4919 } 4920 4921 return NOTIFY_OK; 4922 } 4923 4924 /* 4925 * /proc 4926 */ 4927 4928 #ifdef CONFIG_PROC_FS 4929 4930 static const struct file_operations ipv6_route_proc_fops = { 4931 .open = ipv6_route_open, 4932 .read = seq_read, 4933 .llseek = seq_lseek, 4934 .release = seq_release_net, 4935 }; 4936 4937 static int rt6_stats_seq_show(struct seq_file *seq, void *v) 4938 { 4939 struct net *net = (struct net *)seq->private; 4940 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n", 4941 net->ipv6.rt6_stats->fib_nodes, 4942 net->ipv6.rt6_stats->fib_route_nodes, 4943 atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc), 4944 net->ipv6.rt6_stats->fib_rt_entries, 4945 net->ipv6.rt6_stats->fib_rt_cache, 4946 dst_entries_get_slow(&net->ipv6.ip6_dst_ops), 4947 net->ipv6.rt6_stats->fib_discarded_routes); 4948 4949 return 0; 4950 } 4951 4952 static int rt6_stats_seq_open(struct inode *inode, struct file *file) 4953 { 4954 return single_open_net(inode, file, rt6_stats_seq_show); 4955 } 4956 4957 static const struct file_operations rt6_stats_seq_fops = { 4958 .open = rt6_stats_seq_open, 4959 .read = seq_read, 4960 .llseek = seq_lseek, 4961 .release = single_release_net, 4962 }; 4963 #endif /* CONFIG_PROC_FS */ 4964 4965 #ifdef CONFIG_SYSCTL 4966 4967 static 4968 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, 4969 void __user *buffer, size_t *lenp, loff_t *ppos) 4970 { 4971 struct net *net; 4972 int delay; 4973 if (!write) 4974 return -EINVAL; 4975 4976 net = (struct net *)ctl->extra1; 4977 delay = net->ipv6.sysctl.flush_delay; 4978 proc_dointvec(ctl, write, buffer, lenp, ppos); 4979 fib6_run_gc(delay <= 0 ? 
0 : (unsigned long)delay, net, delay > 0); 4980 return 0; 4981 } 4982 4983 struct ctl_table ipv6_route_table_template[] = { 4984 { 4985 .procname = "flush", 4986 .data = &init_net.ipv6.sysctl.flush_delay, 4987 .maxlen = sizeof(int), 4988 .mode = 0200, 4989 .proc_handler = ipv6_sysctl_rtcache_flush 4990 }, 4991 { 4992 .procname = "gc_thresh", 4993 .data = &ip6_dst_ops_template.gc_thresh, 4994 .maxlen = sizeof(int), 4995 .mode = 0644, 4996 .proc_handler = proc_dointvec, 4997 }, 4998 { 4999 .procname = "max_size", 5000 .data = &init_net.ipv6.sysctl.ip6_rt_max_size, 5001 .maxlen = sizeof(int), 5002 .mode = 0644, 5003 .proc_handler = proc_dointvec, 5004 }, 5005 { 5006 .procname = "gc_min_interval", 5007 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 5008 .maxlen = sizeof(int), 5009 .mode = 0644, 5010 .proc_handler = proc_dointvec_jiffies, 5011 }, 5012 { 5013 .procname = "gc_timeout", 5014 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout, 5015 .maxlen = sizeof(int), 5016 .mode = 0644, 5017 .proc_handler = proc_dointvec_jiffies, 5018 }, 5019 { 5020 .procname = "gc_interval", 5021 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval, 5022 .maxlen = sizeof(int), 5023 .mode = 0644, 5024 .proc_handler = proc_dointvec_jiffies, 5025 }, 5026 { 5027 .procname = "gc_elasticity", 5028 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity, 5029 .maxlen = sizeof(int), 5030 .mode = 0644, 5031 .proc_handler = proc_dointvec, 5032 }, 5033 { 5034 .procname = "mtu_expires", 5035 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires, 5036 .maxlen = sizeof(int), 5037 .mode = 0644, 5038 .proc_handler = proc_dointvec_jiffies, 5039 }, 5040 { 5041 .procname = "min_adv_mss", 5042 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss, 5043 .maxlen = sizeof(int), 5044 .mode = 0644, 5045 .proc_handler = proc_dointvec, 5046 }, 5047 { 5048 .procname = "gc_min_interval_ms", 5049 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 5050 .maxlen = sizeof(int), 5051 .mode = 0644, 5052 .proc_handler = proc_dointvec_ms_jiffies, 5053 }, 5054 { } 5055 }; 5056 5057 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net) 5058 { 5059 struct ctl_table *table; 5060 5061 table = kmemdup(ipv6_route_table_template, 5062 sizeof(ipv6_route_table_template), 5063 GFP_KERNEL); 5064 5065 if (table) { 5066 table[0].data = &net->ipv6.sysctl.flush_delay; 5067 table[0].extra1 = net; 5068 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh; 5069 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size; 5070 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; 5071 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout; 5072 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval; 5073 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity; 5074 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires; 5075 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss; 5076 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; 5077 5078 /* Don't export sysctls to unprivileged users */ 5079 if (net->user_ns != &init_user_ns) 5080 table[0].procname = NULL; 5081 } 5082 5083 return table; 5084 } 5085 #endif 5086 5087 static int __net_init ip6_route_net_init(struct net *net) 5088 { 5089 int ret = -ENOMEM; 5090 5091 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template, 5092 sizeof(net->ipv6.ip6_dst_ops)); 5093 5094 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0) 5095 goto out_ip6_dst_ops; 5096 5097 net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template, 5098 sizeof(*net->ipv6.fib6_null_entry), 5099 GFP_KERNEL); 5100 if (!net->ipv6.fib6_null_entry) 5101 goto 
out_ip6_dst_entries; 5102 5103 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template, 5104 sizeof(*net->ipv6.ip6_null_entry), 5105 GFP_KERNEL); 5106 if (!net->ipv6.ip6_null_entry) 5107 goto out_fib6_null_entry; 5108 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5109 dst_init_metrics(&net->ipv6.ip6_null_entry->dst, 5110 ip6_template_metrics, true); 5111 5112 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5113 net->ipv6.fib6_has_custom_rules = false; 5114 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template, 5115 sizeof(*net->ipv6.ip6_prohibit_entry), 5116 GFP_KERNEL); 5117 if (!net->ipv6.ip6_prohibit_entry) 5118 goto out_ip6_null_entry; 5119 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5120 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst, 5121 ip6_template_metrics, true); 5122 5123 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template, 5124 sizeof(*net->ipv6.ip6_blk_hole_entry), 5125 GFP_KERNEL); 5126 if (!net->ipv6.ip6_blk_hole_entry) 5127 goto out_ip6_prohibit_entry; 5128 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5129 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, 5130 ip6_template_metrics, true); 5131 #endif 5132 5133 net->ipv6.sysctl.flush_delay = 0; 5134 net->ipv6.sysctl.ip6_rt_max_size = 4096; 5135 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2; 5136 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ; 5137 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ; 5138 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9; 5139 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ; 5140 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; 5141 5142 net->ipv6.ip6_rt_gc_expire = 30*HZ; 5143 5144 ret = 0; 5145 out: 5146 return ret; 5147 5148 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5149 out_ip6_prohibit_entry: 5150 kfree(net->ipv6.ip6_prohibit_entry); 5151 out_ip6_null_entry: 5152 kfree(net->ipv6.ip6_null_entry); 5153 #endif 5154 out_fib6_null_entry: 5155 kfree(net->ipv6.fib6_null_entry); 5156 out_ip6_dst_entries: 5157 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 5158 out_ip6_dst_ops: 5159 goto out; 5160 } 5161 5162 static void __net_exit ip6_route_net_exit(struct net *net) 5163 { 5164 kfree(net->ipv6.fib6_null_entry); 5165 kfree(net->ipv6.ip6_null_entry); 5166 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5167 kfree(net->ipv6.ip6_prohibit_entry); 5168 kfree(net->ipv6.ip6_blk_hole_entry); 5169 #endif 5170 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 5171 } 5172 5173 static int __net_init ip6_route_net_init_late(struct net *net) 5174 { 5175 #ifdef CONFIG_PROC_FS 5176 proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops); 5177 proc_create("rt6_stats", 0444, net->proc_net, &rt6_stats_seq_fops); 5178 #endif 5179 return 0; 5180 } 5181 5182 static void __net_exit ip6_route_net_exit_late(struct net *net) 5183 { 5184 #ifdef CONFIG_PROC_FS 5185 remove_proc_entry("ipv6_route", net->proc_net); 5186 remove_proc_entry("rt6_stats", net->proc_net); 5187 #endif 5188 } 5189 5190 static struct pernet_operations ip6_route_net_ops = { 5191 .init = ip6_route_net_init, 5192 .exit = ip6_route_net_exit, 5193 }; 5194 5195 static int __net_init ipv6_inetpeer_init(struct net *net) 5196 { 5197 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL); 5198 5199 if (!bp) 5200 return -ENOMEM; 5201 inet_peer_base_init(bp); 5202 net->ipv6.peers = bp; 5203 return 0; 5204 } 5205 5206 static void __net_exit ipv6_inetpeer_exit(struct net *net) 5207 { 5208 struct inet_peer_base *bp = net->ipv6.peers; 5209 5210 net->ipv6.peers = NULL; 5211 inetpeer_invalidate_tree(bp); 
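/* all cached peer entries are gone; now the base itself can be freed */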
5212 kfree(bp);
5213 }
5214
5215 static struct pernet_operations ipv6_inetpeer_ops = {
5216 .init = ipv6_inetpeer_init,
5217 .exit = ipv6_inetpeer_exit,
5218 };
5219
5220 static struct pernet_operations ip6_route_net_late_ops = {
5221 .init = ip6_route_net_init_late,
5222 .exit = ip6_route_net_exit_late,
5223 };
5224
5225 static struct notifier_block ip6_route_dev_notifier = {
5226 .notifier_call = ip6_route_dev_notify,
5227 .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
5228 };
5229
5230 void __init ip6_route_init_special_entries(void)
5231 {
5232 /* Registration of the loopback device happens before this portion of
5233 * the code runs, so the loopback reference in rt6_info has not been
5234 * taken; take it manually for init_net */
5235 init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
5236 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5237 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5238 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5239 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5240 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5241 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5242 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5243 #endif
5244 }
5245
5246 int __init ip6_route_init(void)
5247 {
5248 int ret;
5249 int cpu;
5250
5251 ret = -ENOMEM;
5252 ip6_dst_ops_template.kmem_cachep =
5253 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
5254 SLAB_HWCACHE_ALIGN, NULL);
5255 if (!ip6_dst_ops_template.kmem_cachep)
5256 goto out;
5257
5258 ret = dst_entries_init(&ip6_dst_blackhole_ops);
5259 if (ret)
5260 goto out_kmem_cache;
5261
5262 ret = register_pernet_subsys(&ipv6_inetpeer_ops);
5263 if (ret)
5264 goto out_dst_entries;
5265
5266 ret = register_pernet_subsys(&ip6_route_net_ops);
5267 if (ret)
5268 goto out_register_inetpeer;
5269
5270 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
5271
5272 ret = fib6_init();
5273 if (ret)
5274 goto out_register_subsys;
5275
5276 ret = xfrm6_init();
5277 if (ret)
5278 goto out_fib6_init;
5279
5280 ret = fib6_rules_init();
5281 if (ret)
5282 goto xfrm6_init;
5283
5284 ret = register_pernet_subsys(&ip6_route_net_late_ops);
5285 if (ret)
5286 goto fib6_rules_init;
5287
5288 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5289 inet6_rtm_newroute, NULL, 0);
5290 if (ret < 0)
5291 goto out_register_late_subsys;
5292
5293 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5294 inet6_rtm_delroute, NULL, 0);
5295 if (ret < 0)
5296 goto out_register_late_subsys;
5297
5298 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5299 inet6_rtm_getroute, NULL,
5300 RTNL_FLAG_DOIT_UNLOCKED);
5301 if (ret < 0)
5302 goto out_register_late_subsys;
5303
5304 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
5305 if (ret)
5306 goto out_register_late_subsys;
5307
5308 for_each_possible_cpu(cpu) {
5309 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
5310
5311 INIT_LIST_HEAD(&ul->head);
5312 spin_lock_init(&ul->lock);
5313 }
5314
5315 out:
5316 return ret;
5317
5318 out_register_late_subsys:
5319 rtnl_unregister_all(PF_INET6);
5320 unregister_pernet_subsys(&ip6_route_net_late_ops);
5321 fib6_rules_init:
5322 fib6_rules_cleanup();
5323 xfrm6_init:
5324 xfrm6_fini();
5325 out_fib6_init:
5326 fib6_gc_cleanup();
5327 out_register_subsys:
5328 unregister_pernet_subsys(&ip6_route_net_ops);
5329 out_register_inetpeer:
5330
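/* error unwind: tear everything down in the reverse order of the registrations above */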
unregister_pernet_subsys(&ipv6_inetpeer_ops); 5331 out_dst_entries: 5332 dst_entries_destroy(&ip6_dst_blackhole_ops); 5333 out_kmem_cache: 5334 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); 5335 goto out; 5336 } 5337 5338 void ip6_route_cleanup(void) 5339 { 5340 unregister_netdevice_notifier(&ip6_route_dev_notifier); 5341 unregister_pernet_subsys(&ip6_route_net_late_ops); 5342 fib6_rules_cleanup(); 5343 xfrm6_fini(); 5344 fib6_gc_cleanup(); 5345 unregister_pernet_subsys(&ipv6_inetpeer_ops); 5346 unregister_pernet_subsys(&ip6_route_net_ops); 5347 dst_entries_destroy(&ip6_dst_blackhole_ops); 5348 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); 5349 } 5350
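/* Illustrative userspace sketch (not part of the kernel build; a minimal
 * example, with all error handling elided, of the interface implemented
 * above): send an RTM_GETROUTE request, which inet6_rtm_getroute()
 * services, and read back the RTM_NEWROUTE reply that rt6_fill_node()
 * builds. The RTA_DST attribute below is validated against
 * rtm_ipv6_policy. "ip -6 route get 2001:db8::1" performs the same
 * exchange.
 *
 *	#include <string.h>
 *	#include <unistd.h>
 *	#include <arpa/inet.h>
 *	#include <sys/socket.h>
 *	#include <linux/netlink.h>
 *	#include <linux/rtnetlink.h>
 *
 *	int main(void)
 *	{
 *		struct {
 *			struct nlmsghdr nlh;
 *			struct rtmsg rtm;
 *			struct rtattr rta;
 *			struct in6_addr dst;
 *		} req;
 *		char reply[4096];
 *		int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
 *
 *		memset(&req, 0, sizeof(req));
 *		req.nlh.nlmsg_len = sizeof(req);
 *		req.nlh.nlmsg_type = RTM_GETROUTE;
 *		req.nlh.nlmsg_flags = NLM_F_REQUEST;
 *		req.rtm.rtm_family = AF_INET6;
 *		req.rta.rta_type = RTA_DST;
 *		req.rta.rta_len = RTA_LENGTH(sizeof(struct in6_addr));
 *		inet_pton(AF_INET6, "2001:db8::1", &req.dst);
 *
 *		send(fd, &req, sizeof(req), 0);
 *		recv(fd, reply, sizeof(reply), 0);
 *		close(fd);
 *		return 0;
 *	}
 */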