1 /* 2 * Linux INET6 implementation 3 * FIB front-end. 4 * 5 * Authors: 6 * Pedro Roque <roque@di.fc.ul.pt> 7 * 8 * This program is free software; you can redistribute it and/or 9 * modify it under the terms of the GNU General Public License 10 * as published by the Free Software Foundation; either version 11 * 2 of the License, or (at your option) any later version. 12 */ 13 14 /* Changes: 15 * 16 * YOSHIFUJI Hideaki @USAGI 17 * reworked default router selection. 18 * - respect outgoing interface 19 * - select from (probably) reachable routers (i.e. 20 * routers in REACHABLE, STALE, DELAY or PROBE states). 21 * - always select the same router if it is (probably) 22 * reachable. otherwise, round-robin the list. 23 * Ville Nuorvala 24 * Fixed routing subtrees. 25 */ 26 27 #define pr_fmt(fmt) "IPv6: " fmt 28 29 #include <linux/capability.h> 30 #include <linux/errno.h> 31 #include <linux/export.h> 32 #include <linux/types.h> 33 #include <linux/times.h> 34 #include <linux/socket.h> 35 #include <linux/sockios.h> 36 #include <linux/net.h> 37 #include <linux/route.h> 38 #include <linux/netdevice.h> 39 #include <linux/in6.h> 40 #include <linux/mroute6.h> 41 #include <linux/init.h> 42 #include <linux/if_arp.h> 43 #include <linux/proc_fs.h> 44 #include <linux/seq_file.h> 45 #include <linux/nsproxy.h> 46 #include <linux/slab.h> 47 #include <linux/jhash.h> 48 #include <net/net_namespace.h> 49 #include <net/snmp.h> 50 #include <net/ipv6.h> 51 #include <net/ip6_fib.h> 52 #include <net/ip6_route.h> 53 #include <net/ndisc.h> 54 #include <net/addrconf.h> 55 #include <net/tcp.h> 56 #include <linux/rtnetlink.h> 57 #include <net/dst.h> 58 #include <net/dst_metadata.h> 59 #include <net/xfrm.h> 60 #include <net/netevent.h> 61 #include <net/netlink.h> 62 #include <net/nexthop.h> 63 #include <net/lwtunnel.h> 64 #include <net/ip_tunnels.h> 65 #include <net/l3mdev.h> 66 #include <net/ip.h> 67 #include <trace/events/fib6.h> 68 69 #include <linux/uaccess.h> 70 71 #ifdef CONFIG_SYSCTL 72 
#include <linux/sysctl.h>
#endif

/* Neighbour (NUD) reachability verdict for a candidate next hop,
 * consumed by rt6_score_route()/find_match().  Negative values are
 * failures of increasing recoverability: FAIL_HARD excludes the route,
 * FAIL_PROBE keeps it at lowest priority, FAIL_DO_RR triggers
 * round-robin rotation of the default-router list.
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};

/* Forward declarations for the dst_ops callbacks and netlink helpers
 * defined later in this file.
 */
static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
static unsigned int	 ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
static int		 ip6_dst_gc(struct dst_ops *ops);

static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int		ip6_pkt_prohibit(struct sk_buff *skb);
static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct fib6_info *rt);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev,
					   unsigned int pref);
static struct fib6_info *rt6_get_route_info(struct net *net,
					   const
					    struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev);
#endif

/* Per-cpu list of "uncached" rt6_info (clones not attached to the FIB
 * tree), so that device teardown can find and re-home them.
 */
struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);

/* Link @rt onto the current cpu's uncached list.  The owning list is
 * remembered in rt6i_uncached_list so deletion can happen from any cpu.
 */
void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

/* Unlink @rt from its uncached list, if it is on one, and drop the
 * per-netns uncache counter.  Safe to call on routes never added.
 */
void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}

/* On unregistration of @dev, walk every cpu's uncached list and re-point
 * any route still referencing @dev (device and/or inet6_dev) at the
 * netns loopback device, moving the held references accordingly.
 */
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}

/* Pick the neighbour-table key for a lookup: the route's gateway if set,
 * else the packet's destination address, else the caller-supplied @daddr.
 */
static inline const void *choose_neigh_daddr(const struct in6_addr *p,
					     struct sk_buff *skb,
					     const void *daddr)
{
	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}

/* Resolve (or create) the neighbour entry for @gw/@daddr on @dev.
 * The lookup key is chosen by choose_neigh_daddr().
 */
struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
{
	struct neighbour *n;

	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	if (n)
		return n;
	return neigh_create(&nd_tbl, daddr, dev);
}

/* dst_ops->neigh_lookup: delegate to ip6_neigh_lookup() using the
 * route's gateway as the preferred key.
 */
static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
}

/* dst_ops->confirm_neigh: mark the next hop's neighbour entry as
 * confirmed, skipping devices/addresses that have no NUD state.
 */
static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}

static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	dst_cow_metrics_generic,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};

/* MTU for blackhole dsts: the RTAX_MTU metric if set, else the
 * device MTU (GNU "?:" elvis operator).
 */
static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

/* Blackhole routes deliberately ignore PMTU updates ... */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

/* ... and redirects. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	dst_cow_metrics_generic,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
};

static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

/* Template for the per-netns "null" fib entry: rejects everything. */
static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol  = RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};

static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif

/* Zero the rt6_info fields that follow the embedded dst_entry and
 * initialise the uncached-list linkage.
 */
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}

/* allocate dst with ip6_dst_ops */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);

/* dst_ops->destroy: drop metrics, uncached-list membership, the
 * inet6_dev reference and the backing fib6_info reference.
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	rcu_assign_pointer(rt->from, NULL);
	fib6_info_release(from);
	rcu_read_unlock();
}

/* dst_ops->ifdown: when @dev goes away, re-point the route's inet6_dev
 * reference at the netns loopback device.
 */
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}

/* True when the clone itself carries RTF_EXPIRES and has expired. */
static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

/* True when the clone, or the fib6_info it was derived from, has
 * expired or been obsoleted.  Caller holds rcu_read_lock (rt->from is
 * RCU-protected).
 */
static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}

/* Choose among ECMP siblings of @match using the flow's L4 hash:
 * the first sibling whose nh_upper_bound covers the hash wins, unless
 * it scores negatively (unreachable next hop), in which case @match
 * is kept.
 */
struct fib6_info *fib6_multipath_select(const struct net *net,
					struct fib6_info *match,
					struct flowi6 *fl6, int oif,
					const struct sk_buff *skb,
					int strict)
{
	struct fib6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}

/*
 *	Route lookup. rcu_read_lock() should be held.
 */

/* Walk the fib6_info chain starting at @rt and pick the first live
 * (non-RTNH_F_DEAD) entry matching the requested output interface or
 * source address.  Falls back to @rt itself unless strict interface
 * matching (RT6_LOOKUP_F_IFACE) was requested or @rt is dead, in which
 * case the netns null entry is returned.
 */
static inline struct fib6_info *rt6_device_match(struct net *net,
						 struct fib6_info *rt,
						    const struct in6_addr *saddr,
						    int oif,
						    int flags)
{
	struct fib6_info *sprt;

	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
		const struct net_device *dev = sprt->fib6_nh.nh_dev;

		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE)
		return net->ipv6.fib6_null_entry;

	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
}

#ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred-work context for router reachability probing. */
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

/* Workqueue callback: send a Neighbor Solicitation to the probe target
 * via its solicited-node multicast address, then drop the device ref.
 */
static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}

/* Schedule a reachability probe of @rt's gateway if its neighbour entry
 * is missing or has been non-NUD_VALID longer than rtr_probe_interval.
 * The actual NS transmission happens from process context (workqueue).
 */
static void rt6_probe(struct fib6_info *rt)
{
	struct __rt6_probe_work *work;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
		return;

	nh_gw = &rt->fib6_nh.nh_gw;
	dev = rt->fib6_nh.nh_dev;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		struct inet6_dev *idev;

		if (neigh->nud_state & NUD_VALID)
			goto out;

		idev = __in6_dev_get(dev);
		work = NULL;
		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct fib6_info *rt)
{
}
#endif

/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
/* Interface score: 2 when @rt's device matches @oif (or no oif given),
 * 0 otherwise.
 */
static inline int rt6_check_dev(struct fib6_info *rt, int oif)
{
	const struct net_device *dev = rt->fib6_nh.nh_dev;

	if (!oif || dev->ifindex == oif)
		return 2;
	return 0;
}

/* Classify the NUD state of @rt's gateway neighbour.  Routes without a
 * gateway (or with RTF_NONEXTHOP) trivially succeed.  Without router
 * preference support a missing neighbour requests round-robin.
 */
static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	if (rt->fib6_flags & RTF_NONEXTHOP ||
	    !(rt->fib6_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
					  &rt->fib6_nh.nh_gw);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}

/* Combined score for route selection: interface match in the low bits,
 * decoded router preference shifted above them.  Returns a negative
 * rt6_nud_state on reachability failure when RT6_LOOKUP_F_REACHABLE.
 */
static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}

/* called with rcu_read_lock held */
static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
{
	const struct net_device *dev = fib6_info_nh_dev(f6i);
	bool rc = false;

	if (dev) {
		const struct inet6_dev *idev = __in6_dev_get(dev);

		rc = !!idev->cnf.ignore_routes_with_linkdown;
	}

	return rc;
}

/* Score @rt and update the running best (@match/@mpri).  Dead, expired
 * or (optionally) link-down routes are skipped; a FAIL_DO_RR score is
 * remembered through *do_rr so the caller can rotate the rr pointer.
 */
static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
				    int *mpri, struct fib6_info *match,
				    bool *do_rr)
{
	int m;
	bool match_do_rr = false;

	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		goto out;

	if (fib6_ignore_linkdown(rt) &&
	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (fib6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}

/* Find the best route among entries of @metric, scanning from @rr_head
 * to the end of that metric group and then wrapping from @leaf back to
 * @rr_head (round-robin).  If nothing matched, continue into the next
 * metric group starting at @cont.
 */
static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
				      struct fib6_info *leaf,
				      struct fib6_info *rr_head,
				      u32 metric, int oif, int strict,
				      bool *do_rr)
{
	struct fib6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}

/* Select the route to use from fib node @fn, honouring the per-node
 * round-robin pointer (fn->rr_ptr) and advancing it under tb6_lock when
 * find_rr_leaf() requested rotation.  Caller holds rcu_read_lock.
 */
static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
				    int oif, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not points to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.fib6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.fib6_null_entry;
}

/* True for routes with a gateway or the RTF_NONEXTHOP flag. */
static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
{
	return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
}

#ifdef CONFIG_IPV6_ROUTE_INFO
/* Process a Route Information option (RFC 4191) received from router
 * @gwaddr on @dev: validate lengths, then add/update/delete the
 * corresponding RTF_ROUTEINFO route per the advertised lifetime and
 * preference.  Returns 0 on success or -EINVAL on malformed input.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct
 in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
#endif

/*
 *	Misc support functions
 */

/* called with rcu_lock held */
static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
{
	struct net_device *dev = rt->fib6_nh.nh_dev;

	if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}

/* Map RTN_* route types to the dst->error value reported to callers. */
static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};

static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}

/* Translate the fib entry's allocation hints into DST_* dst flags. */
static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
{
	unsigned short flags = 0;

	if (rt->dst_nocount)
		flags |= DST_NOCOUNT;
	if (rt->dst_nopolicy)
		flags |= DST_NOPOLICY;
	if (rt->dst_host)
		flags |= DST_HOST;

	return flags;
}

/* Set input/output handlers and dst->error for RTF_REJECT routes
 * according to the fib6 type (blackhole/prohibit/unreachable/throw).
 */
static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);

	switch (ort->fib6_type) {
	case RTN_BLACKHOLE:
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}

/* Initialise @rt's dst handlers/flags from fib entry @ort: local input,
 * multicast input, or forwarding, plus optional lwtunnel redirection.
 */
static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.flags |= fib6_info_dst_flags(ort);

	if (ort->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, ort);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	if (ort->fib6_type == RTN_LOCAL) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (ort->fib6_nh.nh_lwtstate) {
		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}

/* Attach clone @rt to its origin fib entry @from: take a reference on
 * @from, publish rt->from, and share @from's metrics (refcounted when
 * they are not the global defaults).
 */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	fib6_info_hold(from);
	rcu_assign_pointer(rt->from, from);
	dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
	if (from->fib6_metrics != &dst_default_metrics) {
		rt->dst._metrics |= DST_METRICS_REFCOUNTED;
		refcount_inc(&from->fib6_metrics->refcnt);
	}
}

/* Copy everything a dst clone needs (addresses, gateway, flags, idev,
 * lwtunnel state) from fib entry @ort into @rt.
 */
static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
{
	struct net_device *dev = fib6_info_nh_dev(ort);

	ip6_rt_init_dst(rt, ort);

	rt->rt6i_dst = ort->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
	rt->rt6i_flags = ort->fib6_flags;
	rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->fib6_src;
#endif
	rt->rt6i_prefsrc = ort->fib6_prefsrc;
	rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
}

/* Walk up from @fn looking for the next node carrying route info,
 * descending into source-address subtrees on the way.  Returns NULL at
 * the tree root.  Caller holds rcu_read_lock.
 */
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}

/* Try to take a reference on *prt; on failure (refcount already zero)
 * substitute the netns null entry when @null_fallback, else NULL.
 * Returns true when the original dst was successfully held.
 */
static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
			  bool null_fallback)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (null_fallback) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}

/* called with rcu_lock held */
static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rt6_info *nrt;

	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (nrt)
		ip6_rt_copy_init(nrt, rt);

	return nrt;
}

/* Policy-rule lookup backend for simple (non-input/output) lookups:
 * find the best fib entry for the flow, backtracking on miss, then
 * return a held rt6_info (cached exception, fresh clone, or the null
 * entry on failure).
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_info *f6i;
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	f6i = rcu_dereference(fn->leaf);
	if (!f6i) {
		f6i = net->ipv6.fib6_null_entry;
	} else {
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
				      fl6->flowi6_oif, flags);
		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
			f6i = fib6_multipath_select(net, f6i, fl6,
						    fl6->flowi6_oif, skb,
						    flags);
	}
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);
	} else if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = ip6_create_rt_rcu(f6i);
		if (!rt) {
			rt = net->ipv6.ip6_null_entry;
			dst_hold(&rt->dst);
		}
	}

	rcu_read_unlock();

	return rt;
}

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);

/* Convenience lookup by (daddr, saddr, oif).  Returns a held rt6_info
 * on success or NULL when the resulting dst carries an error.
 */
struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);

/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes new route entry, the addition fails by any reason the
 * route is released.
 * Caller must hold dst before calling it.
 */

static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = {	.nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}

/* Create an RTF_CACHE clone of @ort keyed to the exact (daddr, saddr)
 * pair (host /128 entries), marking destination-is-anycast clones of
 * non-host on-link prefixes.
 */
static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	dev = ip6_rt_get_dev_rcu(ort);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}

/* Allocate a per-cpu (RTF_PCPU) clone of fib entry @rt. */
static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt)
		return NULL;
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt, false);

	return pcpu_rt;
}

/* Allocate and publish this cpu's pcpu clone of @rt (slot must be
 * empty: the cmpxchg asserts no concurrent publisher).  Falls back to a
 * held null entry on allocation failure.
 */
static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	return pcpu_rt;
}

/* exception hash table implementation
 */

/* Serializes all writers of every fib6_info's rt6i_exception_bucket
 * hash table; readers use RCU.
 */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	hlist_del_rcu(&rt6_ex->hlist);
	/* Drop the reference the exception entry held on the cached dst;
	 * the entry itself is freed after a grace period (kfree_rcu) so
	 * concurrent RCU readers can still walk the chain safely.
	 */
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
	net->ipv6.rt6_stats->fib_rt_cache--;
}

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	/* "oldest" = entry with the earliest last-use stamp (jiffies). */
	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}

/* Hash (dst[, src]) into a bucket index; the random seed is initialized
 * once lazily to make the hash unpredictable to remote peers.
 */
static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) ||
	    !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 *
 * RCU-reader twin of __rt6_find_exception_spinlock(): identical lookup
 * logic, but walks the chain with hlist_for_each_entry_rcu().
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

/* Effective MTU of @rt: the stored PMTU if set, otherwise the nexthop
 * device's IPv6 MTU; clamped to IP6_MAX_MTU and reduced by any lwtunnel
 * encapsulation headroom.
 * NOTE(review): __in6_dev_get() result is dereferenced unchecked —
 * assumes the nexthop device always has an inet6_dev here; confirm.
 */
static unsigned int fib6_mtu(const struct fib6_info *rt)
{
	unsigned int mtu;

	if (rt->fib6_pmtu) {
		mtu = rt->fib6_pmtu;
	} else {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
}

static int
/* Insert cache route @nrt into @ort's exception table, creating the
 * bucket array on first use.  Replaces any existing entry for the same
 * (daddr[, saddr]) key and evicts the oldest entry if the bucket grows
 * past FIB6_MAX_DEPTH.  On success the fib6 node's sernum is bumped so
 * stale cached dsts fail their next dst_check().
 * Returns 0 on success or a negative errno.
 */
rt6_insert_exception(struct rt6_info *nrt,
		     struct fib6_info *ort)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	/* rt6_flush_exceptions() set this flag: don't recreate buckets. */
	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
	nrt->rt6i_prefsrc = ort->fib6_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->fib6_table->tb6_lock);
		fib6_update_sernum(net, ort);
		spin_unlock_bh(&ort->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}

/* Remove every exception entry attached to @rt and mark the route so
 * rt6_insert_exception() can never repopulate it (used when the route
 * itself is going away).
 */
void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() to recreate the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}

/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct
					   in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->fib6_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	/* Expired exceptions are treated as a miss; they are reaped
	 * later by rt6_age_exceptions().
	 */
	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}

/* Remove the passed in cached rt from the hash table that contains it */
/* Returns 0 on success, -EINVAL if @rt is not a cache route with a
 * parent, -ENOENT if no matching exception entry exists.
 * NOTE(review): rt->from is read with rcu_dereference(); callers appear
 * to hold rcu_read_lock() (ip6_negative_advice/ip6_link_failure do) —
 * confirm for any new caller.
 */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	from = rcu_dereference(rt->from);
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}

/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct fib6_info *from = rt->from;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;

	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return;

	rcu_read_lock();
	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
1609 */ 1610 if (from->fib6_src.plen) 1611 src_key = &rt->rt6i_src.addr; 1612 #endif 1613 rt6_ex = __rt6_find_exception_rcu(&bucket, 1614 &rt->rt6i_dst.addr, 1615 src_key); 1616 if (rt6_ex) 1617 rt6_ex->stamp = jiffies; 1618 1619 rcu_read_unlock(); 1620 } 1621 1622 static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt) 1623 { 1624 struct rt6_exception_bucket *bucket; 1625 struct rt6_exception *rt6_ex; 1626 int i; 1627 1628 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1629 lockdep_is_held(&rt6_exception_lock)); 1630 1631 if (bucket) { 1632 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { 1633 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { 1634 rt6_ex->rt6i->rt6i_prefsrc.plen = 0; 1635 } 1636 bucket++; 1637 } 1638 } 1639 } 1640 1641 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev, 1642 struct rt6_info *rt, int mtu) 1643 { 1644 /* If the new MTU is lower than the route PMTU, this new MTU will be the 1645 * lowest MTU in the path: always allow updating the route PMTU to 1646 * reflect PMTU decreases. 1647 * 1648 * If the new MTU is higher, and the route PMTU is equal to the local 1649 * MTU, this means the old MTU is the lowest in the path, so allow 1650 * updating it: if other nodes now have lower MTUs, PMTU discovery will 1651 * handle this. 
1652 */ 1653 1654 if (dst_mtu(&rt->dst) >= mtu) 1655 return true; 1656 1657 if (dst_mtu(&rt->dst) == idev->cnf.mtu6) 1658 return true; 1659 1660 return false; 1661 } 1662 1663 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev, 1664 struct fib6_info *rt, int mtu) 1665 { 1666 struct rt6_exception_bucket *bucket; 1667 struct rt6_exception *rt6_ex; 1668 int i; 1669 1670 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1671 lockdep_is_held(&rt6_exception_lock)); 1672 1673 if (!bucket) 1674 return; 1675 1676 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { 1677 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { 1678 struct rt6_info *entry = rt6_ex->rt6i; 1679 1680 /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected 1681 * route), the metrics of its rt->from have already 1682 * been updated. 1683 */ 1684 if (dst_metric_raw(&entry->dst, RTAX_MTU) && 1685 rt6_mtu_change_route_allowed(idev, entry, mtu)) 1686 dst_metric_set(&entry->dst, RTAX_MTU, mtu); 1687 } 1688 bucket++; 1689 } 1690 } 1691 1692 #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE) 1693 1694 static void rt6_exceptions_clean_tohost(struct fib6_info *rt, 1695 struct in6_addr *gateway) 1696 { 1697 struct rt6_exception_bucket *bucket; 1698 struct rt6_exception *rt6_ex; 1699 struct hlist_node *tmp; 1700 int i; 1701 1702 if (!rcu_access_pointer(rt->rt6i_exception_bucket)) 1703 return; 1704 1705 spin_lock_bh(&rt6_exception_lock); 1706 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1707 lockdep_is_held(&rt6_exception_lock)); 1708 1709 if (bucket) { 1710 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { 1711 hlist_for_each_entry_safe(rt6_ex, tmp, 1712 &bucket->chain, hlist) { 1713 struct rt6_info *entry = rt6_ex->rt6i; 1714 1715 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) == 1716 RTF_CACHE_GATEWAY && 1717 ipv6_addr_equal(gateway, 1718 &entry->rt6i_gateway)) { 1719 rt6_remove_exception(bucket, rt6_ex); 1720 } 1721 } 1722 bucket++; 1723 } 1724 } 1725 1726 
	spin_unlock_bh(&rt6_exception_lock);
}

/* Garbage-collection policy for a single exception entry: remove it if
 * it is aged out, expired, or points via a gateway that is no longer
 * advertising itself as a router; otherwise count it as still live in
 * gc_args->more.  Caller holds rt6_exception_lock (and BH-safe RCU, for
 * the noref neighbour lookup).
 */
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others have still references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}

/* Walk all exception buckets of @rt and apply
 * rt6_age_examine_exception() to each entry.  Takes rcu_read_lock_bh()
 * for the neighbour lookup and rt6_exception_lock for the removals.
 */
void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}

/* must be called with rcu lock held */
/* Core table lookup: find the fib6 node for (daddr, saddr), select the
 * best route, and backtrack up the tree on a null-entry result; if even
 * that fails with RT6_LOOKUP_F_REACHABLE set, retry the whole selection
 * without the reachability requirement.  Never returns NULL (falls back
 * to fib6_null_entry).
 */
struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
				    int oif, struct flowi6 *fl6, int strict)
{
	struct fib6_node *fn, *saved_fn;
	struct fib6_info *f6i;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	f6i = rt6_select(net, fn, oif, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	return f6i;
}

/* Resolve a flow to a dst-level rt6_info: table lookup (+ multipath
 * selection), then either a cached exception route, an uncached
 * RTF_CACHE clone (KNOWN_NH case), or a per-cpu copy.  Always returns a
 * held dst (possibly ip6_null_entry).
 */
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_info *f6i;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	f6i = fib6_table_lookup(net, table, oif, fl6, strict);
	if (f6i->fib6_nsiblings)
		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);

	if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		return rt;
	}

	/*Search through
exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(f6i->fib6_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		return uncached_rt;
	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		/* BHs disabled: required so this CPU's pcpu slot cannot
		 * be raced from softirq context (see rt6_make_pcpu_route).
		 */
		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(f6i);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, f6i);

		local_bh_enable();
		rcu_read_unlock();

		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);

/* ip6_pol_route() adapter for the input path: oif comes from flowi6_iif. */
static struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
}

/* Input-path route lookup through the policy-rule engine; adds strict
 * interface matching for link-local/multicast destinations (except on
 * PIM register devices).
 */
struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6,
					 const struct sk_buff *skb,
					 int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return
fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);

/* Extract L3 multipath hash keys from @skb.  For ICMPv6 errors the keys
 * are taken from the *inner* (offending) header so the error follows
 * the same path as the flow it refers to; precomputed @flkeys are only
 * usable for the non-error case.
 */
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	/* Only ICMPv6 *error* messages carry an embedded packet. */
	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowinfo(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}

/* if skb is set it will be used and fl6 can be NULL */
/* Compute the multipath hash for a flow according to the per-netns
 * policy: 0 = L3 (addrs + flow label + proto), 1 = L4 five-tuple.
 * The result is shifted right by one so 0 can act as "no hash".
 */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		memset(&hash_keys, 0,
sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}

/* Entry point for routing a received packet: build a flowi6 from the
 * IPv6 header (plus tunnel id and, for ICMPv6, a multipath hash) and
 * attach the looked-up dst to the skb.
 */
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark =
skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	/* ICMPv6 errors must hash like the flow they refer to so they
	 * take the same multipath leg (see ip6_multipath_l3_keys()).
	 */
	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}

/* ip6_pol_route() adapter for the output path: oif comes from flowi6_oif. */
static struct rt6_info *ip6_pol_route_output(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
}

/* Output-path route lookup: handle l3mdev link-scope addresses first,
 * then derive the strict-interface and source-address lookup flags
 * before going through the policy-rule engine.
 */
struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	if (rt6_need_strict(&fl6->daddr)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);

/* Replace @dst_orig with a blackhole dst that silently discards all
 * traffic while preserving the original's metrics and addressing; used
 * e.g. by xfrm.  Releases @dst_orig; returns ERR_PTR(-ENOMEM) on
 * allocation failure.
 */
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev,
1,
		       DST_OBSOLETE_DEAD, 0);
	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

		new = &rt->dst;
		new->__use = 1;
		/* Blackhole semantics: drop in both directions. */
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}

/*
 *	Destination cache support functions
 */

/* A fib6_info is still valid iff its tree cookie matches @cookie and it
 * has not expired.
 */
static bool fib6_check(struct fib6_info *f6i, u32 cookie)
{
	u32 rt_cookie = 0;

	if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
		return false;

	if (fib6_check_expired(f6i))
		return false;

	return true;
}

/* Validity check for an ordinary cached rt6_info against its parent's
 * cookie and its own expiry.  With no parent, rt_cookie stays 0 so the
 * dst only remains valid for a zero cookie.
 */
static struct dst_entry *rt6_check(struct rt6_info *rt,
				   struct fib6_info *from,
				   u32 cookie)
{
	u32 rt_cookie = 0;

	if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
	    rt_cookie != cookie)
		return NULL;

	if (rt6_check_expired(rt))
		return NULL;

	return &rt->dst;
}

/* Validity check for pcpu/uncached dsts, which track their parent
 * fib6_info ("from") rather than carrying their own tree state.
 */
static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
					    struct fib6_info *from,
					    u32 cookie)
{
	if (!__rt6_check_expired(rt) &&
	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
	    fib6_check(from, cookie))
		return &rt->dst;
	else
		return NULL;
}

/* dst_ops->check hook: return @dst if still usable, NULL otherwise. */
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct dst_entry *dst_ret;
	struct fib6_info *from;
	struct rt6_info *rt;

	rt = container_of(dst, struct rt6_info, dst);

	rcu_read_lock();

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	from = rcu_dereference(rt->from);

	if (from && (rt->rt6i_flags & RTF_PCPU ||
		     unlikely(!list_empty(&rt->rt6i_uncached))))
		dst_ret = rt6_dst_from_check(rt, from, cookie);
	else
		dst_ret = rt6_check(rt, from, cookie);

	rcu_read_unlock();

	return dst_ret;
}

/* dst_ops->negative_advice hook: drop an expired cache route from its
 * exception table, or release a non-cache dst outright; returning NULL
 * tells the caller to forget the dst.
 */
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			rcu_read_lock();
			if (rt6_check_expired(rt)) {
				rt6_remove_exception_rt(rt);
				dst = NULL;
			}
			rcu_read_unlock();
		} else {
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}

/* dst_ops->link_failure hook: report unreachability to the sender, then
 * purge the offending cache route, or force relookup of a default route
 * by poisoning its node's sernum.
 */
static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		rcu_read_lock();
		if (rt->rt6i_flags & RTF_CACHE) {
			if (dst_hold_safe(&rt->dst))
				rt6_remove_exception_rt(rt);
		} else {
			struct fib6_info *from;
			struct fib6_node *fn;

			from = rcu_dereference(rt->from);
			if (from) {
				fn = rcu_dereference(from->fib6_node);
				/* sernum = -1 invalidates every cookie
				 * derived from this node.
				 */
				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
					fn->fn_sernum = -1;
			}
		}
		rcu_read_unlock();
	}
}

/* Arm (or re-arm) the expiry timer on @rt0.  If the route was not yet
 * expiring, seed dst.expires from the parent first so dst_set_expires()
 * only ever brings the deadline closer.
 */
static void rt6_update_expires(struct rt6_info *rt0, int timeout)
{
	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
		struct fib6_info *from;

		rcu_read_lock();
		from = rcu_dereference(rt0->from);
		if (from)
			rt0->dst.expires = from->expires;
		rcu_read_unlock();
	}

	dst_set_expires(&rt0->dst, timeout);
	rt0->rt6i_flags |= RTF_EXPIRES;
}

static void rt6_do_update_pmtu(struct rt6_info *rt, u32
mtu)
{
	struct net *net = dev_net(rt->dst.dev);

	/* Record the learned PMTU and schedule it to expire after the
	 * sysctl'd interval, per PMTU discovery aging.
	 */
	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
	rt->rt6i_flags |= RTF_MODIFIED;
	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}

/* True if a PMTU update for @rt must be stored in a separate RTF_CACHE
 * exception clone rather than written into @rt itself.
 */
static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
{
	bool from_set;

	rcu_read_lock();
	from_set = !!rcu_dereference(rt->from);
	rcu_read_unlock();

	return !(rt->rt6i_flags & RTF_CACHE) &&
	       (rt->rt6i_flags & RTF_PCPU || from_set);
}

/* Apply a Packet-Too-Big PMTU update to @dst for the flow identified by
 * @iph (preferred) or @sk.  Either updates the dst in place or creates
 * and inserts an RTF_CACHE exception clone carrying the new MTU.
 */
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	if (rt6->rt6i_flags & RTF_LOCAL)
		return;

	/* A locked MTU metric means the admin pinned it; never override. */
	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}
	dst_confirm_neigh(dst, daddr);
	/* Never go below the IPv6 minimum MTU; only accept decreases. */
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
		/* update rt6_ex->stamp for cache */
		if (rt6->rt6i_flags & RTF_CACHE)
			rt6_update_exception_stamp_rt(rt6);
	} else if (daddr) {
		struct fib6_info *from;
		struct rt6_info *nrt6;

		rcu_read_lock();
		from = rcu_dereference(rt6->from);
		nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);
			if (rt6_insert_exception(nrt6, from))
				dst_release_immediate(&nrt6->dst);
		}
		rcu_read_unlock();
	}
}

/* dst_ops->update_pmtu hook: thin wrapper around __ip6_rt_update_pmtu(). */
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu)
{
	__ip6_rt_update_pmtu(dst, sk, skb ?
ipv6_hdr(skb) : NULL, mtu);
}

/* Look up the route for the packet embedded in an ICMPv6 Packet-Too-Big
 * (@skb->data points at the inner IPv6 header) and record @mtu
 * (network byte order) against it.
 */
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
		     int oif, u32 mark, kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
	fl6.daddr = iph->daddr;
	fl6.saddr = iph->saddr;
	fl6.flowlabel = ip6_flowinfo(iph);
	fl6.flowi6_uid = uid;

	dst = ip6_route_output(net, NULL, &fl6);
	if (!dst->error)
		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);

/* Socket-context variant of ip6_update_pmtu(): also refresh the
 * socket's cached dst if the update invalidated it.
 */
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	struct dst_entry *dst;

	ip6_update_pmtu(skb, sock_net(sk), mtu,
			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);

	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);

/* Cache @dst on @sk, remembering which flow addresses it was resolved
 * for: the daddr (and, with subtrees, saddr) pointers are only passed
 * when they match the socket's own addresses.
 */
void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
			   const struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_SUBTREES
	struct ipv6_pinfo *np = inet6_sk(sk);
#endif

	ip6_dst_store(sk, dst,
		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
		      &sk->sk_v6_daddr : NULL,
#ifdef CONFIG_IPV6_SUBTREES
		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2406 &np->saddr : 2407 #endif 2408 NULL); 2409 } 2410 2411 /* Handle redirects */ 2412 struct ip6rd_flowi { 2413 struct flowi6 fl6; 2414 struct in6_addr gateway; 2415 }; 2416 2417 static struct rt6_info *__ip6_route_redirect(struct net *net, 2418 struct fib6_table *table, 2419 struct flowi6 *fl6, 2420 const struct sk_buff *skb, 2421 int flags) 2422 { 2423 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; 2424 struct rt6_info *ret = NULL, *rt_cache; 2425 struct fib6_info *rt; 2426 struct fib6_node *fn; 2427 2428 /* Get the "current" route for this destination and 2429 * check if the redirect has come from appropriate router. 2430 * 2431 * RFC 4861 specifies that redirects should only be 2432 * accepted if they come from the nexthop to the target. 2433 * Due to the way the routes are chosen, this notion 2434 * is a bit fuzzy and one might need to check all possible 2435 * routes. 2436 */ 2437 2438 rcu_read_lock(); 2439 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 2440 restart: 2441 for_each_fib6_node_rt_rcu(fn) { 2442 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) 2443 continue; 2444 if (fib6_check_expired(rt)) 2445 continue; 2446 if (rt->fib6_flags & RTF_REJECT) 2447 break; 2448 if (!(rt->fib6_flags & RTF_GATEWAY)) 2449 continue; 2450 if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex) 2451 continue; 2452 /* rt_cache's gateway might be different from its 'parent' 2453 * in the case of an ip redirect. 2454 * So we keep searching in the exception table if the gateway 2455 * is different. 
2456 */ 2457 if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) { 2458 rt_cache = rt6_find_cached_rt(rt, 2459 &fl6->daddr, 2460 &fl6->saddr); 2461 if (rt_cache && 2462 ipv6_addr_equal(&rdfl->gateway, 2463 &rt_cache->rt6i_gateway)) { 2464 ret = rt_cache; 2465 break; 2466 } 2467 continue; 2468 } 2469 break; 2470 } 2471 2472 if (!rt) 2473 rt = net->ipv6.fib6_null_entry; 2474 else if (rt->fib6_flags & RTF_REJECT) { 2475 ret = net->ipv6.ip6_null_entry; 2476 goto out; 2477 } 2478 2479 if (rt == net->ipv6.fib6_null_entry) { 2480 fn = fib6_backtrack(fn, &fl6->saddr); 2481 if (fn) 2482 goto restart; 2483 } 2484 2485 out: 2486 if (ret) 2487 dst_hold(&ret->dst); 2488 else 2489 ret = ip6_create_rt_rcu(rt); 2490 2491 rcu_read_unlock(); 2492 2493 trace_fib6_table_lookup(net, rt, table, fl6); 2494 return ret; 2495 }; 2496 2497 static struct dst_entry *ip6_route_redirect(struct net *net, 2498 const struct flowi6 *fl6, 2499 const struct sk_buff *skb, 2500 const struct in6_addr *gateway) 2501 { 2502 int flags = RT6_LOOKUP_F_HAS_SADDR; 2503 struct ip6rd_flowi rdfl; 2504 2505 rdfl.fl6 = *fl6; 2506 rdfl.gateway = *gateway; 2507 2508 return fib6_rule_lookup(net, &rdfl.fl6, skb, 2509 flags, __ip6_route_redirect); 2510 } 2511 2512 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, 2513 kuid_t uid) 2514 { 2515 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2516 struct dst_entry *dst; 2517 struct flowi6 fl6; 2518 2519 memset(&fl6, 0, sizeof(fl6)); 2520 fl6.flowi6_iif = LOOPBACK_IFINDEX; 2521 fl6.flowi6_oif = oif; 2522 fl6.flowi6_mark = mark; 2523 fl6.daddr = iph->daddr; 2524 fl6.saddr = iph->saddr; 2525 fl6.flowlabel = ip6_flowinfo(iph); 2526 fl6.flowi6_uid = uid; 2527 2528 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr); 2529 rt6_do_redirect(dst, NULL, skb); 2530 dst_release(dst); 2531 } 2532 EXPORT_SYMBOL_GPL(ip6_redirect); 2533 2534 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif, 2535 u32 mark) 2536 { 
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
	struct dst_entry *dst;
	struct flowi6 fl6;

	/* Build the flow from the redirect message itself. */
	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_iif = LOOPBACK_IFINDEX;
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark;
	fl6.daddr = msg->dest;
	fl6.saddr = iph->daddr;
	fl6.flowi6_uid = sock_net_uid(net, NULL);

	dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}

/* Socket convenience wrapper around ip6_redirect(). */
void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
		     sk->sk_uid);
}
EXPORT_SYMBOL_GPL(ip6_sk_redirect);

/* dst_ops->default_advmss: derive the advertised MSS from the dst MTU,
 * clamped between the ip6_rt_min_advmss sysctl and IPV6_MAXPLEN.
 */
static unsigned int ip6_default_advmss(const struct dst_entry *dst)
{
	struct net_device *dev = dst->dev;
	unsigned int mtu = dst_mtu(dst);
	struct net *net = dev_net(dev);

	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);

	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;

	/*
	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
	 * IPV6_MAXPLEN is also valid and means: "any MSS,
	 * rely only on pmtu discovery"
	 */
	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
		mtu = IPV6_MAXPLEN;
	return mtu;
}

/* dst_ops->mtu: raw metric if set, otherwise the device mtu6 (at least
 * IPV6_MIN_MTU), minus any lwtunnel encapsulation headroom.
 */
static unsigned int ip6_mtu(const struct dst_entry *dst)
{
	struct inet6_dev *idev;
	unsigned int mtu;

	mtu = dst_metric_raw(dst, RTAX_MTU);
	if (mtu)
		goto out;

	mtu = IPV6_MIN_MTU;

	rcu_read_lock();
	idev = __in6_dev_get(dst->dev);
	if (idev)
		mtu = idev->cnf.mtu6;
	rcu_read_unlock();

out:
	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}

/* Allocate an uncached host dst for sending ICMPv6 messages; the result
 * is passed through xfrm_lookup() before being returned.
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		in6_dev_put(idev);
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output = ip6_output;
	rt->rt6i_gateway = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	rt->rt6i_idev = idev;
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Add this dst into uncached_list so that rt6_disable_ip() can
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);
	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}

/* dst_ops->gc: run fib6 garbage collection when the entry count exceeds
 * ip6_rt_max_size or the minimum GC interval has elapsed.
 */
static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size =
		net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	/* expire grows each GC round, making collection more aggressive
	 * under sustained pressure; it decays again below.
	 */
	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
	return entries > rt_max_size;
}

/* Allocate and populate rt->fib6_metrics from the netlink-supplied
 * metrics in @cfg.  Returns 0 when no metrics were given.
 */
static int ip6_convert_metrics(struct net *net, struct fib6_info *rt,
			       struct fib6_config *cfg)
{
	struct dst_metrics *p;

	if (!cfg->fc_mx)
		return 0;

	p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL);
	if (unlikely(!p))
		return -ENOMEM;

	refcount_set(&p->refcnt, 1);
	rt->fib6_metrics = p;

	return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics);
}

/* Resolve a nexthop gateway via a specific table.  Returns NULL when the
 * table does not exist or only the null entry matched.
 */
static struct rt6_info *ip6_nh_lookup_table(struct net *net,
					    struct fib6_config *cfg,
					    const struct in6_addr *gw_addr,
					    u32 tbid, int flags)
{
	struct flowi6 fl6 = {
		.flowi6_oif = cfg->fc_ifindex,
		.daddr = *gw_addr,
		.saddr = cfg->fc_prefsrc,
	};
	struct fib6_table *table;
	struct rt6_info *rt;

	table = fib6_get_table(net, tbid);
	if (!table)
		return NULL;

	if (!ipv6_addr_any(&cfg->fc_prefsrc))
		flags |= RT6_LOOKUP_F_HAS_SADDR;

	flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);

	/* if table lookup failed, fall back to full lookup */
	if (rt == net->ipv6.ip6_null_entry) {
		ip6_rt_put(rt);
		rt = NULL;
	}

	return rt;
}

/* Validate an RTNH_F_ONLINK nexthop: the gateway must resolve through
 * @dev and must not be local/anycast/reject.
 */
static int ip6_route_check_nh_onlink(struct net *net,
				     struct fib6_config *cfg,
				     const struct net_device *dev,
				     struct netlink_ext_ack *extack)
{
	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
	struct rt6_info *grt;
	int err;

	err = 0;
	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
	if (grt) {
		if (!grt->dst.error &&
		    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop has invalid gateway or device mismatch");
			err = -EINVAL;
		}

		ip6_rt_put(grt);
	}

	return err;
}

/* Validate a (non-onlink) gateway nexthop; may resolve and return the
 * egress device/idev through @_dev/@idev when none was specified.
 */
static int ip6_route_check_nh(struct net *net,
			      struct fib6_config *cfg,
			      struct net_device **_dev,
			      struct inet6_dev **idev)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	struct net_device *dev = _dev ?
				 *_dev : NULL;
	struct rt6_info *grt = NULL;
	int err = -EHOSTUNREACH;

	/* Try the user-specified table first. */
	if (cfg->fc_table) {
		int flags = RT6_LOOKUP_F_IFACE;

		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
					  cfg->fc_table, flags);
		if (grt) {
			if (grt->rt6i_flags & RTF_GATEWAY ||
			    (dev && dev != grt->dst.dev)) {
				ip6_rt_put(grt);
				grt = NULL;
			}
		}
	}

	if (!grt)
		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);

	if (!grt)
		goto out;

	if (dev) {
		/* caller-supplied device must match the lookup result */
		if (dev != grt->dst.dev) {
			ip6_rt_put(grt);
			goto out;
		}
	} else {
		/* adopt the resolved device, taking references for caller */
		*_dev = dev = grt->dst.dev;
		*idev = grt->rt6i_idev;
		dev_hold(dev);
		in6_dev_hold(grt->rt6i_idev);
	}

	/* the gateway itself must be directly reachable (not via another
	 * gateway route)
	 */
	if (!(grt->rt6i_flags & RTF_GATEWAY))
		err = 0;

	ip6_rt_put(grt);

out:
	return err;
}

/* Validate cfg->fc_gateway for a new route and resolve the egress
 * device/idev when not already given.  Returns 0 on success, negative
 * errno with extack set otherwise.
 */
static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
			   struct net_device **_dev, struct inet6_dev **idev,
			   struct netlink_ext_ack *extack)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	int gwa_type = ipv6_addr_type(gw_addr);
	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
	const struct net_device *dev = *_dev;
	bool need_addr_check = !dev;
	int err = -EINVAL;

	/* if gw_addr is local we will fail to detect this in case
	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
	 * will return already-added prefix route via interface that
	 * prefix route was assigned to, which might be non-loopback.
	 */
	if (dev &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
		/* IPv6 strictly inhibits using not link-local
		 * addresses as nexthop address.
		 * Otherwise, router will not able to send redirects.
		 * It is very good, but in some (rare!) circumstances
		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
		 * some exceptions. --ANK
		 * We allow IPv4-mapped nexthops to support RFC4798-type
		 * addressing
		 */
		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
			NL_SET_ERR_MSG(extack, "Invalid gateway address");
			goto out;
		}

		if (cfg->fc_flags & RTNH_F_ONLINK)
			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
		else
			err = ip6_route_check_nh(net, cfg, _dev, idev);

		if (err)
			goto out;
	}

	/* reload in case device was changed */
	dev = *_dev;

	err = -EINVAL;
	if (!dev) {
		NL_SET_ERR_MSG(extack, "Egress device not specified");
		goto out;
	} else if (dev->flags & IFF_LOOPBACK) {
		NL_SET_ERR_MSG(extack,
			       "Egress device can not be loopback device for this route");
		goto out;
	}

	/* if we did not check gw_addr above, do so now that the
	 * egress device has been resolved.
	 */
	if (need_addr_check &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	err = 0;
out:
	return err;
}

/* Build (but do not insert) a fib6_info from a fib6_config.  On success
 * the returned entry holds references on its device; on failure an
 * ERR_PTR is returned and all acquired references are dropped.
 */
static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
					       gfp_t gfp_flags,
					       struct netlink_ext_ack *extack)
{
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct fib6_info *rt = NULL;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	struct fib6_table *table;
	int addr_type;
	int err = -EINVAL;

	/* RTF_PCPU is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_PCPU) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
		goto out;
	}

	/* RTF_CACHE is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_CACHE) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
		goto out;
	}

	if (cfg->fc_type > RTN_MAX) {
		NL_SET_ERR_MSG(extack, "Invalid route type");
		goto out;
	}

	if (cfg->fc_dst_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		goto out;
	}
	if (cfg->fc_src_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid source address length");
		goto out;
	}
#ifndef CONFIG_IPV6_SUBTREES
	if (cfg->fc_src_len) {
		NL_SET_ERR_MSG(extack,
			       "Specifying source address requires IPV6_SUBTREES to be enabled");
		goto out;
	}
#endif
	if (cfg->fc_ifindex) {
		err = -ENODEV;
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (cfg->fc_metric == 0)
		cfg->fc_metric = IP6_RT_PRIO_USER;

	/* onlink nexthops require an up device */
	if (cfg->fc_flags & RTNH_F_ONLINK) {
		if (!dev) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop device required for onlink");
			err = -ENODEV;
			goto out;
		}

		if (!(dev->flags & IFF_UP)) {
			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
			err = -ENETDOWN;
			goto out;
		}
	}

	err = -ENOBUFS;
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		table = fib6_get_table(net, cfg->fc_table);
		if (!table) {
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}

	if (!table)
		goto out;

	err = -ENOMEM;
	rt = fib6_info_alloc(gfp_flags);
	if (!rt)
		goto out;

	if (cfg->fc_flags & RTF_ADDRCONF)
		rt->dst_nocount = true;

	err = ip6_convert_metrics(net, rt, cfg);
	if (err < 0)
		goto out;

	if (cfg->fc_flags & RTF_EXPIRES)
		fib6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
	else
		fib6_clean_expires(rt);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->fib6_protocol = cfg->fc_protocol;

	addr_type = ipv6_addr_type(&cfg->fc_dst);

	if (cfg->fc_encap) {
		struct lwtunnel_state *lwtstate;

		err = lwtunnel_build_state(cfg->fc_encap_type,
					   cfg->fc_encap, AF_INET6, cfg,
					   &lwtstate, extack);
		if (err)
			goto out;
		rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
	}

	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->fib6_dst.plen = cfg->fc_dst_len;
	if (rt->fib6_dst.plen == 128)
		rt->dst_host = true;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->fib6_src.plen = cfg->fc_src_len;
#endif

	rt->fib6_metric = cfg->fc_metric;
	rt->fib6_nh.nh_weight = 1;

	rt->fib6_type = cfg->fc_type;

	/* We cannot add true routes via loopback here,
	 * they would result in kernel looping; promote them to reject routes
	 */
	if ((cfg->fc_flags & RTF_REJECT) ||
	    (dev && (dev->flags & IFF_LOOPBACK) &&
	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
	     !(cfg->fc_flags & RTF_LOCAL))) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
		goto install_route;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
		if (err)
			goto out;

		rt->fib6_nh.nh_gw = cfg->fc_gateway;
	}

	err = -ENODEV;
	if (!dev)
		goto out;

	if (idev->cnf.disable_ipv6) {
		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
		err = -EACCES;
		goto out;
	}

	if (!(dev->flags & IFF_UP)) {
		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
		err = -ENETDOWN;
		goto out;
	}

	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid source address");
			err = -EINVAL;
			goto out;
		}
		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
		rt->fib6_prefsrc.plen = 128;
	} else
		rt->fib6_prefsrc.plen = 0;

	rt->fib6_flags = cfg->fc_flags;

install_route:
	if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
	    !netif_carrier_ok(dev))
		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
	rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
	rt->fib6_nh.nh_dev = dev;
	rt->fib6_table = table;

	cfg->fc_nlinfo.nl_net = dev_net(dev);

	/* the dev reference is kept by rt; idev is no longer needed */
	if (idev)
		in6_dev_put(idev);

	return rt;
out:
	if (dev)
		dev_put(dev);
	if (idev)
		in6_dev_put(idev);

	/* NOTE(review): fib6_info_release() is presumably NULL-safe here,
	 * since rt may still be NULL on early error paths — confirm against
	 * its definition in ip6_fib.h.
	 */
	fib6_info_release(rt);
	return ERR_PTR(err);
}

/* Create a route from @cfg and insert it into the FIB. */
int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
		  struct netlink_ext_ack *extack)
{
	struct fib6_info *rt;
	int err;

	rt = ip6_route_info_create(cfg, gfp_flags, extack);
	if (IS_ERR(rt))
		return PTR_ERR(rt);

	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
	fib6_info_release(rt);

	return err;
}

/* Delete @rt from its table; consumes the caller's reference on @rt. */
static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
{
	struct net *net = info->nl_net;
	struct fib6_table *table;
	int err;

	if (rt == net->ipv6.fib6_null_entry) {
		err = -ENOENT;
		goto out;
	}

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_del(rt, info);
	spin_unlock_bh(&table->tb6_lock);

out:
	fib6_info_release(rt);
	return err;
}

int ip6_del_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = { .nl_net = net };

	return __ip6_del_rt(rt, &info);
}

/* Delete @rt and, for fc_delete_all_nh, all its ECMP siblings, sending a
 * single notification covering every hop when possible.  Consumes the
 * caller's reference on @rt.
 */
static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
{
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;
	int err = -ENOENT;

	if (rt == net->ipv6.fib6_null_entry)
		goto out_put;
	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);

	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
		struct fib6_info *sibling, *next_sibling;

		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ?
				  info->nlh->nlmsg_seq : 0;

			if (rt6_fill_node(net, skb, rt, NULL,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				kfree_skb(skb);
				skb = NULL;
			} else
				info->skip_notify = 1;
		}

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings,
					 fib6_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out_unlock;
		}
	}

	err = fib6_del(rt, info);
out_unlock:
	spin_unlock_bh(&table->tb6_lock);
out_put:
	fib6_info_release(rt);

	/* send the combined notification built above, if any */
	if (skb) {
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
	}
	return err;
}

/* Remove a cached (exception) route if it matches the device/gateway
 * constraints in @cfg.  Returns -ESRCH when it does not match.
 */
static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
{
	int rc = -ESRCH;

	if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
		goto out;

	if (cfg->fc_flags & RTF_GATEWAY &&
	    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
		goto out;
	if (dst_hold_safe(&rt->dst))
		rc = rt6_remove_exception_rt(rt);
out:
	return rc;
}

/* Delete the FIB route (or cached exception, with RTF_CACHE) matching
 * @cfg.  Returns -ESRCH when nothing matched.
 */
static int ip6_route_del(struct fib6_config *cfg,
			 struct netlink_ext_ack *extack)
{
	struct rt6_info *rt_cache;
	struct fib6_table *table;
	struct fib6_info *rt;
	struct fib6_node *fn;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (!table) {
		NL_SET_ERR_MSG(extack, "FIB table does not exist");
		return err;
	}

	rcu_read_lock();

	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len,
			 !(cfg->fc_flags & RTF_CACHE));

	if (fn) {
		for_each_fib6_node_rt_rcu(fn) {
			if (cfg->fc_flags & RTF_CACHE) {
				int rc;

				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
							      &cfg->fc_src);
				if (rt_cache) {
					rc = ip6_del_cached_rt(rt_cache, cfg);
					if (rc != -ESRCH) {
						rcu_read_unlock();
						return rc;
					}
				}
				continue;
			}
			if (cfg->fc_ifindex &&
			    (!rt->fib6_nh.nh_dev ||
			     rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
				continue;
			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
				continue;
			if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
				continue;
			/* hold rt across the unlock; the __ip6_del_rt*
			 * helpers consume this reference.
			 */
			fib6_info_hold(rt);
			rcu_read_unlock();

			/* if gateway was specified only delete the one hop */
			if (cfg->fc_flags & RTF_GATEWAY)
				return __ip6_del_rt(rt, &cfg->fc_nlinfo);

			return __ip6_del_rt_siblings(rt, cfg);
		}
	}
	rcu_read_unlock();

	return err;
}

/* Validate and apply an ICMPv6 redirect: update the neighbour cache and
 * install a cached route towards the new gateway.
 */
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct netevent_redirect netevent;
	struct rt6_info *rt, *nrt = NULL;
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
	struct fib6_info *from;
	struct rd_msg *msg;
	int optlen, on_link;
	u8 *lladdr;

	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
	optlen -= sizeof(*msg);

	if (optlen < 0) {
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
		return;
	}

	msg = (struct rd_msg *)icmp6_hdr(skb);

	if (ipv6_addr_is_multicast(&msg->dest)) {
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
		return;
	}

	on_link = 0;
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
		on_link = 1;
	} else if (ipv6_addr_type(&msg->target) !=
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
		return;
	}

	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	if (in6_dev->cnf.forwarding ||
!in6_dev->cnf.accept_redirects) 3311 return; 3312 3313 /* RFC2461 8.1: 3314 * The IP source address of the Redirect MUST be the same as the current 3315 * first-hop router for the specified ICMP Destination Address. 3316 */ 3317 3318 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) { 3319 net_dbg_ratelimited("rt6_redirect: invalid ND options\n"); 3320 return; 3321 } 3322 3323 lladdr = NULL; 3324 if (ndopts.nd_opts_tgt_lladdr) { 3325 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, 3326 skb->dev); 3327 if (!lladdr) { 3328 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n"); 3329 return; 3330 } 3331 } 3332 3333 rt = (struct rt6_info *) dst; 3334 if (rt->rt6i_flags & RTF_REJECT) { 3335 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n"); 3336 return; 3337 } 3338 3339 /* Redirect received -> path was valid. 3340 * Look, redirects are sent only in response to data packets, 3341 * so that this nexthop apparently is reachable. --ANK 3342 */ 3343 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr); 3344 3345 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1); 3346 if (!neigh) 3347 return; 3348 3349 /* 3350 * We have finally decided to accept it. 3351 */ 3352 3353 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, 3354 NEIGH_UPDATE_F_WEAK_OVERRIDE| 3355 NEIGH_UPDATE_F_OVERRIDE| 3356 (on_link ? 
0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER| 3357 NEIGH_UPDATE_F_ISROUTER)), 3358 NDISC_REDIRECT, &ndopts); 3359 3360 rcu_read_lock(); 3361 from = rcu_dereference(rt->from); 3362 fib6_info_hold(from); 3363 rcu_read_unlock(); 3364 3365 nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL); 3366 if (!nrt) 3367 goto out; 3368 3369 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE; 3370 if (on_link) 3371 nrt->rt6i_flags &= ~RTF_GATEWAY; 3372 3373 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; 3374 3375 /* No need to remove rt from the exception table if rt is 3376 * a cached route because rt6_insert_exception() will 3377 * takes care of it 3378 */ 3379 if (rt6_insert_exception(nrt, from)) { 3380 dst_release_immediate(&nrt->dst); 3381 goto out; 3382 } 3383 3384 netevent.old = &rt->dst; 3385 netevent.new = &nrt->dst; 3386 netevent.daddr = &msg->dest; 3387 netevent.neigh = neigh; 3388 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); 3389 3390 out: 3391 fib6_info_release(from); 3392 neigh_release(neigh); 3393 } 3394 3395 #ifdef CONFIG_IPV6_ROUTE_INFO 3396 static struct fib6_info *rt6_get_route_info(struct net *net, 3397 const struct in6_addr *prefix, int prefixlen, 3398 const struct in6_addr *gwaddr, 3399 struct net_device *dev) 3400 { 3401 u32 tb_id = l3mdev_fib_table(dev) ? 
: RT6_TABLE_INFO; 3402 int ifindex = dev->ifindex; 3403 struct fib6_node *fn; 3404 struct fib6_info *rt = NULL; 3405 struct fib6_table *table; 3406 3407 table = fib6_get_table(net, tb_id); 3408 if (!table) 3409 return NULL; 3410 3411 rcu_read_lock(); 3412 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true); 3413 if (!fn) 3414 goto out; 3415 3416 for_each_fib6_node_rt_rcu(fn) { 3417 if (rt->fib6_nh.nh_dev->ifindex != ifindex) 3418 continue; 3419 if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY)) 3420 continue; 3421 if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr)) 3422 continue; 3423 fib6_info_hold(rt); 3424 break; 3425 } 3426 out: 3427 rcu_read_unlock(); 3428 return rt; 3429 } 3430 3431 static struct fib6_info *rt6_add_route_info(struct net *net, 3432 const struct in6_addr *prefix, int prefixlen, 3433 const struct in6_addr *gwaddr, 3434 struct net_device *dev, 3435 unsigned int pref) 3436 { 3437 struct fib6_config cfg = { 3438 .fc_metric = IP6_RT_PRIO_USER, 3439 .fc_ifindex = dev->ifindex, 3440 .fc_dst_len = prefixlen, 3441 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | 3442 RTF_UP | RTF_PREF(pref), 3443 .fc_protocol = RTPROT_RA, 3444 .fc_type = RTN_UNICAST, 3445 .fc_nlinfo.portid = 0, 3446 .fc_nlinfo.nlh = NULL, 3447 .fc_nlinfo.nl_net = net, 3448 }; 3449 3450 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO, 3451 cfg.fc_dst = *prefix; 3452 cfg.fc_gateway = *gwaddr; 3453 3454 /* We should treat it as a default route if prefix length is 0. */ 3455 if (!prefixlen) 3456 cfg.fc_flags |= RTF_DEFAULT; 3457 3458 ip6_route_add(&cfg, GFP_ATOMIC, NULL); 3459 3460 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev); 3461 } 3462 #endif 3463 3464 struct fib6_info *rt6_get_dflt_router(struct net *net, 3465 const struct in6_addr *addr, 3466 struct net_device *dev) 3467 { 3468 u32 tb_id = l3mdev_fib_table(dev) ? 
: RT6_TABLE_DFLT; 3469 struct fib6_info *rt; 3470 struct fib6_table *table; 3471 3472 table = fib6_get_table(net, tb_id); 3473 if (!table) 3474 return NULL; 3475 3476 rcu_read_lock(); 3477 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3478 if (dev == rt->fib6_nh.nh_dev && 3479 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && 3480 ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr)) 3481 break; 3482 } 3483 if (rt) 3484 fib6_info_hold(rt); 3485 rcu_read_unlock(); 3486 return rt; 3487 } 3488 3489 struct fib6_info *rt6_add_dflt_router(struct net *net, 3490 const struct in6_addr *gwaddr, 3491 struct net_device *dev, 3492 unsigned int pref) 3493 { 3494 struct fib6_config cfg = { 3495 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT, 3496 .fc_metric = IP6_RT_PRIO_USER, 3497 .fc_ifindex = dev->ifindex, 3498 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | 3499 RTF_UP | RTF_EXPIRES | RTF_PREF(pref), 3500 .fc_protocol = RTPROT_RA, 3501 .fc_type = RTN_UNICAST, 3502 .fc_nlinfo.portid = 0, 3503 .fc_nlinfo.nlh = NULL, 3504 .fc_nlinfo.nl_net = net, 3505 }; 3506 3507 cfg.fc_gateway = *gwaddr; 3508 3509 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) { 3510 struct fib6_table *table; 3511 3512 table = fib6_get_table(dev_net(dev), cfg.fc_table); 3513 if (table) 3514 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER; 3515 } 3516 3517 return rt6_get_dflt_router(net, gwaddr, dev); 3518 } 3519 3520 static void __rt6_purge_dflt_routers(struct net *net, 3521 struct fib6_table *table) 3522 { 3523 struct fib6_info *rt; 3524 3525 restart: 3526 rcu_read_lock(); 3527 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3528 struct net_device *dev = fib6_info_nh_dev(rt); 3529 struct inet6_dev *idev = dev ? 
					 __in6_dev_get(dev) : NULL;

		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!idev || idev->cnf.accept_ra != 2)) {
			/* hold rt so it survives the unlock, then restart
			 * the walk from the top
			 */
			fib6_info_hold(rt);
			rcu_read_unlock();
			ip6_del_rt(net, rt);
			goto restart;
		}
	}
	rcu_read_unlock();

	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}

/* Purge RA-learned default routers from every table that has them. */
void rt6_purge_dflt_routers(struct net *net)
{
	struct fib6_table *table;
	struct hlist_head *head;
	unsigned int h;

	rcu_read_lock();

	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
		head = &net->ipv6.fib_table_hash[h];
		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
			if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
				__rt6_purge_dflt_routers(net, table);
		}
	}

	rcu_read_unlock();
}

/* Translate the legacy ioctl in6_rtmsg into a fib6_config. */
static void rtmsg_to_fib6_config(struct net *net,
				 struct in6_rtmsg *rtmsg,
				 struct fib6_config *cfg)
{
	memset(cfg, 0, sizeof(*cfg));

	cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
			 : RT6_TABLE_MAIN;
	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
	cfg->fc_metric = rtmsg->rtmsg_metric;
	cfg->fc_expires = rtmsg->rtmsg_info;
	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
	cfg->fc_src_len = rtmsg->rtmsg_src_len;
	cfg->fc_flags = rtmsg->rtmsg_flags;
	cfg->fc_type = rtmsg->rtmsg_type;

	cfg->fc_nlinfo.nl_net = net;

	cfg->fc_dst = rtmsg->rtmsg_dst;
	cfg->fc_src = rtmsg->rtmsg_src;
	cfg->fc_gateway = rtmsg->rtmsg_gateway;
}

/* Legacy SIOCADDRT/SIOCDELRT ioctl entry point; requires CAP_NET_ADMIN. */
int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
{
	struct fib6_config cfg;
	struct in6_rtmsg rtmsg;
	int err;

	switch (cmd) {
	case SIOCADDRT:		/* Add a route */
	case SIOCDELRT:		/* Delete a route */
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			return -EPERM;
		err = copy_from_user(&rtmsg, arg,
				     sizeof(struct in6_rtmsg));
		if (err)
			return -EFAULT;

		rtmsg_to_fib6_config(net, &rtmsg, &cfg);

		rtnl_lock();
		switch (cmd) {
		case SIOCADDRT:
			err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
			break;
		case SIOCDELRT:
			err = ip6_route_del(&cfg, NULL);
			break;
		default:
			err = -EINVAL;
		}
		rtnl_unlock();

		return err;
	}

	return -EINVAL;
}

/*
 *	Drop the packet on the floor
 */

/* Common drop path: bump the matching SNMP counter and send an ICMPv6
 * destination-unreachable with @code.
 */
static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
{
	int type;
	struct dst_entry *dst = skb_dst(skb);
	switch (ipstats_mib_noroutes) {
	case IPSTATS_MIB_INNOROUTES:
		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
		if (type == IPV6_ADDR_ANY) {
			IP6_INC_STATS(dev_net(dst->dev),
				      __in6_dev_get_safely(skb->dev),
				      IPSTATS_MIB_INADDRERRORS);
			break;
		}
		/* FALLTHROUGH */
	case IPSTATS_MIB_OUTNOROUTES:
		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
			      ipstats_mib_noroutes);
		break;
	}
	icmpv6_send(skb,
ICMPV6_DEST_UNREACH, code, 0);
	kfree_skb(skb);
	return 0;
}

/* dst input/output handlers for the null entry: count and drop. */
static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}

static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}

/* Same as above but for prohibit routes: administratively prohibited. */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}

static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}

/*
 *	Allocate a dst for local (unicast / anycast) address.
 */

/* Build a host (plen 128) RTN_LOCAL/RTN_ANYCAST fib6_info for @addr in
 * the local table (or the l3mdev table of @idev->dev when slaved).
 * The nexthop "gateway" is the address itself with RTF_NONEXTHOP set,
 * and a reference on @idev->dev is taken for the nexthop.
 * Returns ERR_PTR(-ENOMEM) on allocation failure.
 */
struct fib6_info *addrconf_f6i_alloc(struct net *net,
				     struct inet6_dev *idev,
				     const struct in6_addr *addr,
				     bool anycast, gfp_t gfp_flags)
{
	u32 tb_id;
	struct net_device *dev = idev->dev;
	struct fib6_info *f6i;

	f6i = fib6_info_alloc(gfp_flags);
	if (!f6i)
		return ERR_PTR(-ENOMEM);

	f6i->dst_nocount = true;
	f6i->dst_host = true;
	f6i->fib6_protocol = RTPROT_KERNEL;
	f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
	if (anycast) {
		f6i->fib6_type = RTN_ANYCAST;
		f6i->fib6_flags |= RTF_ANYCAST;
	} else {
		f6i->fib6_type = RTN_LOCAL;
		f6i->fib6_flags |= RTF_LOCAL;
	}

	f6i->fib6_nh.nh_gw = *addr;
	dev_hold(dev);
	f6i->fib6_nh.nh_dev = dev;
	f6i->fib6_dst.addr = *addr;
	f6i->fib6_dst.plen = 128;
	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
	f6i->fib6_table = fib6_get_table(net, tb_id);

	return f6i;
}

/* remove deleted ip from prefsrc entries */
struct arg_dev_net_ip {
	struct net_device *dev;	/* NULL means "any device" */
	struct net *net;
	struct in6_addr *addr;	/* address being removed */
};

/* fib6_clean_all() callback: clear the preferred-source address of any
 * route that references the deleted address (optionally restricted to one
 * device), and purge it from the route's exception cache as well.
 * Always returns 0 so the walk never deletes the route itself.
 */
static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
{
	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;

	if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
	    rt != net->ipv6.fib6_null_entry &&
	    ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
		spin_lock_bh(&rt6_exception_lock);
		/* remove prefsrc entry */
		rt->fib6_prefsrc.plen = 0;
		/* need to update cache as well */
		rt6_exceptions_remove_prefsrc(rt);
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}

/* Called when address @ifp is deleted: drop it as prefsrc everywhere. */
void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
{
	struct net *net = dev_net(ifp->idev->dev);
	struct arg_dev_net_ip adni = {
		.dev = ifp->idev->dev,
		.net = net,
		.addr = &ifp->addr,
	};
	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
}

#define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)

/* Remove routers and update dst entries when gateway turn into host. */
static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
{
	struct in6_addr *gateway = (struct in6_addr *)arg;

	/* returning -1 asks the tree walker to delete this route */
	if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
	    ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
		return -1;
	}

	/* Further clean up cached routes in exception table.
	 * This is needed because cached route may have a different
	 * gateway than its 'parent' in the case of an ip redirect.
	 */
	rt6_exceptions_clean_tohost(rt, gateway);

	return 0;
}

void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}

/* Argument for the netdev event walkers below; the union member used
 * depends on which callback (fib6_ifup vs. fib6_ifdown) consumes it.
 */
struct arg_netdev_event {
	const struct net_device *dev;
	union {
		unsigned int nh_flags;
		unsigned long event;
	};
};

/* Return the first route in @rt's fib6_node leaf list that has the same
 * metric and has siblings, i.e. the head of @rt's multipath group.
 * Caller must hold tb6_lock (enforced via rcu_dereference_protected +
 * lockdep_is_held).
 */
static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
{
	struct fib6_info *iter;
	struct fib6_node *fn;

	fn = rcu_dereference_protected(rt->fib6_node,
			lockdep_is_held(&rt->fib6_table->tb6_lock));
	iter = rcu_dereference_protected(fn->leaf,
			lockdep_is_held(&rt->fib6_table->tb6_lock));
	while (iter) {
		if (iter->fib6_metric == rt->fib6_metric &&
		    iter->fib6_nsiblings)
			return iter;
		iter = rcu_dereference_protected(iter->fib6_next,
				lockdep_is_held(&rt->fib6_table->tb6_lock));
	}

	return NULL;
}

/* A nexthop is "dead" when flagged RTNH_F_DEAD, or when it is link-down
 * and the route's config says link-down nexthops must be ignored.
 */
static bool rt6_is_dead(const struct fib6_info *rt)
{
	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
	    (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	     fib6_ignore_linkdown(rt)))
		return true;

	return false;
}

/* Sum of nexthop weights over @rt and its siblings, counting only
 * live (non-dead) nexthops.
 */
static int rt6_multipath_total_weight(const struct fib6_info *rt)
{
	struct fib6_info *iter;
	int total = 0;

	if (!rt6_is_dead(rt))
		total += rt->fib6_nh.nh_weight;

	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
		if (!rt6_is_dead(iter))
			total += iter->fib6_nh.nh_weight;
	}

	return total;
}

/* Assign this nexthop's hash upper bound: the cumulative weight so far
 * (*weight is a running total across the group) scaled into [0, 2^31),
 * minus one.  Dead nexthops get -1 so hash selection skips them.
 */
static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
{
	int upper_bound = -1;

	if (!rt6_is_dead(rt)) {
		*weight += rt->fib6_nh.nh_weight;
		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
						    total) - 1;
	}
	atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
}

/* Recompute upper bounds for the whole group starting at @rt. */
static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
{
	struct fib6_info *iter;
	int weight = 0;

	rt6_upper_bound_set(rt, &weight, total);

	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		rt6_upper_bound_set(iter, &weight, total);
}

/* Rebalance hash bounds for @rt's multipath group after a nexthop
 * changed state.  No-op for non-multipath routes or groups that are
 * about to be flushed entirely.
 */
void rt6_multipath_rebalance(struct fib6_info *rt)
{
	struct fib6_info *first;
	int total;

	/* In case the entire multipath route was marked for flushing,
	 * then there is no need to rebalance upon the removal of every
	 * sibling route.
	 */
	if (!rt->fib6_nsiblings || rt->should_flush)
		return;

	/* During lookup routes are evaluated in order, so we need to
	 * make sure upper bounds are assigned from the first sibling
	 * onwards.
	 */
	first = rt6_multipath_first_sibling(rt);
	if (WARN_ON_ONCE(!first))
		return;

	total = rt6_multipath_total_weight(first);
	rt6_multipath_upper_bound_set(first, total);
}

/* fib6_clean_all() callback for rt6_sync_up(): clear the given nexthop
 * flags on routes using the device, bump the tree sernum and rebalance.
 */
static int fib6_ifup(struct fib6_info *rt, void *p_arg)
{
	const struct arg_netdev_event *arg = p_arg;
	struct net *net = dev_net(arg->dev);

	if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
		rt->fib6_nh.nh_flags &= ~arg->nh_flags;
		fib6_update_sernum_upto_root(net, rt);
		rt6_multipath_rebalance(rt);
	}

	return 0;
}

/* Device came (partially) back up: clear @nh_flags on its routes.  When
 * clearing DEAD on a device whose carrier is up, also clear LINKDOWN.
 */
void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.nh_flags = nh_flags,
		},
	};

	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
		arg.nh_flags |= RTNH_F_LINKDOWN;

	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
}

/* Does any nexthop of @rt's multipath group egress via @dev? */
static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
				   const struct net_device *dev)
{
	struct fib6_info *iter;

	if (rt->fib6_nh.nh_dev == dev)
		return true;
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		if (iter->fib6_nh.nh_dev == dev)
			return true;

	return false;
}

/* Mark the whole group for deletion by the cleaning walk. */
static void rt6_multipath_flush(struct fib6_info *rt)
{
	struct fib6_info *iter;

	rt->should_flush = 1;
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		iter->should_flush = 1;
}

/* Count group members that are on @down_dev or already flagged DEAD. */
static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
					     const struct net_device *down_dev)
{
	struct fib6_info *iter;
	unsigned int dead = 0;

	if (rt->fib6_nh.nh_dev == down_dev ||
	    rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		dead++;
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		if (iter->fib6_nh.nh_dev == down_dev ||
		    iter->fib6_nh.nh_flags & RTNH_F_DEAD)
			dead++;

	return dead;
}

/* Set @nh_flags on every group member whose nexthop device is @dev. */
static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
				       const struct net_device *dev,
				       unsigned int nh_flags)
{
	struct fib6_info *iter;

	if (rt->fib6_nh.nh_dev == dev)
		rt->fib6_nh.nh_flags |= nh_flags;
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		if (iter->fib6_nh.nh_dev == dev)
			iter->fib6_nh.nh_flags |= nh_flags;
}

/* called with write lock held for table with rt */
/* fib6_clean_all() callback for device down/unregister events.
 * Return -1 to have the walker delete the route, 0 to keep it.
 * NOTE(review): the -2 return on the multipath NETDEV_DOWN path is
 * presumably a distinct "handled, keep but skip" code interpreted by
 * fib6_clean_node() — confirm against that walker.
 */
static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
{
	const struct arg_netdev_event *arg = p_arg;
	const struct net_device *dev = arg->dev;
	struct net *net = dev_net(dev);

	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	switch (arg->event) {
	case NETDEV_UNREGISTER:
		return rt->fib6_nh.nh_dev == dev ? -1 : 0;
	case NETDEV_DOWN:
		if (rt->should_flush)
			return -1;
		if (!rt->fib6_nsiblings)
			return rt->fib6_nh.nh_dev == dev ? -1 : 0;
		if (rt6_multipath_uses_dev(rt, dev)) {
			unsigned int count;

			/* flush the whole group once every member is dead */
			count = rt6_multipath_dead_count(rt, dev);
			if (rt->fib6_nsiblings + 1 == count) {
				rt6_multipath_flush(rt);
				return -1;
			}
			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
						   RTNH_F_LINKDOWN);
			fib6_update_sernum(net, rt);
			rt6_multipath_rebalance(rt);
		}
		return -2;
	case NETDEV_CHANGE:
		/* carrier change: mark link-down, except host routes */
		if (rt->fib6_nh.nh_dev != dev ||
		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
			break;
		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
		rt6_multipath_rebalance(rt);
		break;
	}

	return 0;
}

void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.event = event,
		},
	};

	fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
}

/* Device is losing IPv6: sync the FIB, drop uncached dsts on the device
 * and tear down its neighbour entries.
 */
void rt6_disable_ip(struct net_device *dev, unsigned long event)
{
	rt6_sync_down_dev(dev, event);
	rt6_uncached_list_flush_dev(dev_net(dev), dev);
	neigh_ifdown(&nd_tbl, dev);
}

struct rt6_mtu_change_arg {
	struct net_device *dev;
	unsigned int mtu;	/* new device MTU */
};

/* fib6_clean_all() callback for rt6_mtu_change(): propagate a device MTU
 * change into route metrics and cached PMTU exceptions.
 */
static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e.
jumbo frame) 4046 */ 4047 if (rt->fib6_nh.nh_dev == arg->dev && 4048 !fib6_metric_locked(rt, RTAX_MTU)) { 4049 u32 mtu = rt->fib6_pmtu; 4050 4051 if (mtu >= arg->mtu || 4052 (mtu < arg->mtu && mtu == idev->cnf.mtu6)) 4053 fib6_metric_set(rt, RTAX_MTU, arg->mtu); 4054 4055 spin_lock_bh(&rt6_exception_lock); 4056 rt6_exceptions_update_pmtu(idev, rt, arg->mtu); 4057 spin_unlock_bh(&rt6_exception_lock); 4058 } 4059 return 0; 4060 } 4061 4062 void rt6_mtu_change(struct net_device *dev, unsigned int mtu) 4063 { 4064 struct rt6_mtu_change_arg arg = { 4065 .dev = dev, 4066 .mtu = mtu, 4067 }; 4068 4069 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg); 4070 } 4071 4072 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { 4073 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) }, 4074 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) }, 4075 [RTA_OIF] = { .type = NLA_U32 }, 4076 [RTA_IIF] = { .type = NLA_U32 }, 4077 [RTA_PRIORITY] = { .type = NLA_U32 }, 4078 [RTA_METRICS] = { .type = NLA_NESTED }, 4079 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, 4080 [RTA_PREF] = { .type = NLA_U8 }, 4081 [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, 4082 [RTA_ENCAP] = { .type = NLA_NESTED }, 4083 [RTA_EXPIRES] = { .type = NLA_U32 }, 4084 [RTA_UID] = { .type = NLA_U32 }, 4085 [RTA_MARK] = { .type = NLA_U32 }, 4086 [RTA_TABLE] = { .type = NLA_U32 }, 4087 [RTA_IP_PROTO] = { .type = NLA_U8 }, 4088 [RTA_SPORT] = { .type = NLA_U16 }, 4089 [RTA_DPORT] = { .type = NLA_U16 }, 4090 }; 4091 4092 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, 4093 struct fib6_config *cfg, 4094 struct netlink_ext_ack *extack) 4095 { 4096 struct rtmsg *rtm; 4097 struct nlattr *tb[RTA_MAX+1]; 4098 unsigned int pref; 4099 int err; 4100 4101 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, 4102 NULL); 4103 if (err < 0) 4104 goto errout; 4105 4106 err = -EINVAL; 4107 rtm = nlmsg_data(nlh); 4108 memset(cfg, 0, sizeof(*cfg)); 4109 4110 cfg->fc_table = 
rtm->rtm_table; 4111 cfg->fc_dst_len = rtm->rtm_dst_len; 4112 cfg->fc_src_len = rtm->rtm_src_len; 4113 cfg->fc_flags = RTF_UP; 4114 cfg->fc_protocol = rtm->rtm_protocol; 4115 cfg->fc_type = rtm->rtm_type; 4116 4117 if (rtm->rtm_type == RTN_UNREACHABLE || 4118 rtm->rtm_type == RTN_BLACKHOLE || 4119 rtm->rtm_type == RTN_PROHIBIT || 4120 rtm->rtm_type == RTN_THROW) 4121 cfg->fc_flags |= RTF_REJECT; 4122 4123 if (rtm->rtm_type == RTN_LOCAL) 4124 cfg->fc_flags |= RTF_LOCAL; 4125 4126 if (rtm->rtm_flags & RTM_F_CLONED) 4127 cfg->fc_flags |= RTF_CACHE; 4128 4129 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK); 4130 4131 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid; 4132 cfg->fc_nlinfo.nlh = nlh; 4133 cfg->fc_nlinfo.nl_net = sock_net(skb->sk); 4134 4135 if (tb[RTA_GATEWAY]) { 4136 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); 4137 cfg->fc_flags |= RTF_GATEWAY; 4138 } 4139 4140 if (tb[RTA_DST]) { 4141 int plen = (rtm->rtm_dst_len + 7) >> 3; 4142 4143 if (nla_len(tb[RTA_DST]) < plen) 4144 goto errout; 4145 4146 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen); 4147 } 4148 4149 if (tb[RTA_SRC]) { 4150 int plen = (rtm->rtm_src_len + 7) >> 3; 4151 4152 if (nla_len(tb[RTA_SRC]) < plen) 4153 goto errout; 4154 4155 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen); 4156 } 4157 4158 if (tb[RTA_PREFSRC]) 4159 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]); 4160 4161 if (tb[RTA_OIF]) 4162 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]); 4163 4164 if (tb[RTA_PRIORITY]) 4165 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]); 4166 4167 if (tb[RTA_METRICS]) { 4168 cfg->fc_mx = nla_data(tb[RTA_METRICS]); 4169 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]); 4170 } 4171 4172 if (tb[RTA_TABLE]) 4173 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]); 4174 4175 if (tb[RTA_MULTIPATH]) { 4176 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]); 4177 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); 4178 4179 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp, 4180 cfg->fc_mp_len, extack); 4181 if (err < 0) 4182 goto errout; 4183 
} 4184 4185 if (tb[RTA_PREF]) { 4186 pref = nla_get_u8(tb[RTA_PREF]); 4187 if (pref != ICMPV6_ROUTER_PREF_LOW && 4188 pref != ICMPV6_ROUTER_PREF_HIGH) 4189 pref = ICMPV6_ROUTER_PREF_MEDIUM; 4190 cfg->fc_flags |= RTF_PREF(pref); 4191 } 4192 4193 if (tb[RTA_ENCAP]) 4194 cfg->fc_encap = tb[RTA_ENCAP]; 4195 4196 if (tb[RTA_ENCAP_TYPE]) { 4197 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); 4198 4199 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack); 4200 if (err < 0) 4201 goto errout; 4202 } 4203 4204 if (tb[RTA_EXPIRES]) { 4205 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ); 4206 4207 if (addrconf_finite_timeout(timeout)) { 4208 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ); 4209 cfg->fc_flags |= RTF_EXPIRES; 4210 } 4211 } 4212 4213 err = 0; 4214 errout: 4215 return err; 4216 } 4217 4218 struct rt6_nh { 4219 struct fib6_info *fib6_info; 4220 struct fib6_config r_cfg; 4221 struct list_head next; 4222 }; 4223 4224 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list) 4225 { 4226 struct rt6_nh *nh; 4227 4228 list_for_each_entry(nh, rt6_nh_list, next) { 4229 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n", 4230 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway, 4231 nh->r_cfg.fc_ifindex); 4232 } 4233 } 4234 4235 static int ip6_route_info_append(struct net *net, 4236 struct list_head *rt6_nh_list, 4237 struct fib6_info *rt, 4238 struct fib6_config *r_cfg) 4239 { 4240 struct rt6_nh *nh; 4241 int err = -EEXIST; 4242 4243 list_for_each_entry(nh, rt6_nh_list, next) { 4244 /* check if fib6_info already exists */ 4245 if (rt6_duplicate_nexthop(nh->fib6_info, rt)) 4246 return err; 4247 } 4248 4249 nh = kzalloc(sizeof(*nh), GFP_KERNEL); 4250 if (!nh) 4251 return -ENOMEM; 4252 nh->fib6_info = rt; 4253 err = ip6_convert_metrics(net, rt, r_cfg); 4254 if (err) { 4255 kfree(nh); 4256 return err; 4257 } 4258 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); 4259 
	list_add_tail(&nh->next, rt6_nh_list);

	return 0;
}

/* Send the single RTM_NEWROUTE notification for a multipath add/replace,
 * anchored at the first route of the inserted group.
 */
static void ip6_route_mpath_notify(struct fib6_info *rt,
				   struct fib6_info *rt_last,
				   struct nl_info *info,
				   __u16 nlflags)
{
	/* if this is an APPEND route, then rt points to the first route
	 * inserted and rt_last points to last route inserted. Userspace
	 * wants a consistent dump of the route which starts at the first
	 * nexthop. Since sibling routes are always added at the end of
	 * the list, find the first sibling of the last route appended
	 */
	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
		rt = list_first_entry(&rt_last->fib6_siblings,
				      struct fib6_info,
				      fib6_siblings);
	}

	if (rt)
		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
}

/* Install a multipath route: build one fib6_info per RTA_MULTIPATH
 * nexthop, insert them one by one, and send a single combined
 * notification.  On a partial failure, already-inserted nexthops are
 * deleted again so userspace sees a coherent add-then-delete sequence.
 */
static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
	struct nl_info *info = &cfg->fc_nlinfo;
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	struct fib6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	__u16 nlflags;
	int remaining;
	int attrlen;
	int err = 1;
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);

	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * fib6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		/* each nexthop starts from the shared config ... */
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		/* ... overridden by per-nexthop attributes */
		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}

		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}

		rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;

		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
					    rt, &r_cfg);
		if (err) {
			fib6_info_release(rt);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* for add and replace send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done
	 */
	info->skip_notify = 1;

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		rt_last = nh->fib6_info;
		err = __ip6_ins_rt(nh->fib6_info, info, extack);
		fib6_info_release(nh->fib6_info);

		/* save reference to first route for notification */
		if (!rt_notif && !err)
			rt_notif = nh->fib6_info;

		/* nh->fib6_info is used or freed at this point, reset to NULL*/
		/* NOTE(review): rt_notif/rt_last still point at this entry
		 * after our reference is released; on success the FIB table
		 * presumably keeps it alive until the notify below — confirm.
		 */
		nh->fib6_info = NULL;
		if (err) {
			if (replace && nhn)
				ip6_print_replace_route_err(&rt6_nh_list);
			err_nh = nh;
			goto add_errout;
		}

		/* Because each route is added like a single route we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 * be added to it.
		 */
		/* NOTE(review): nlh is dereferenced here without the NULL
		 * check applied when computing 'replace' above — confirm
		 * every caller reaches this path with a netlink header.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_APPEND;
		nhn++;
	}

	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	goto cleanup;

add_errout:
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg, extack);
	}

cleanup:
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		if (nh->fib6_info)
			fib6_info_release(nh->fib6_info);
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}

/* Delete each nexthop listed in RTA_MULTIPATH as an individual route.
 * Deletion continues past failures; the last error (if any) is returned.
 */
static int ip6_route_multipath_del(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	int remaining;
	int attrlen;
	int err = 1, last_err = 0;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry */
	while (rtnh_ok(rtnh, remaining)) {
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
		}
		err = ip6_route_del(&r_cfg, extack);
		if (err)
			last_err = err;

		rtnh = rtnh_next(rtnh, &remaining);
	}

	return last_err;
}

/* rtnetlink RTM_DELROUTE handler. */
static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct fib6_config cfg;
	int err;

	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
	if (err < 0)
		return err;
	if (cfg.fc_mp)
		return ip6_route_multipath_del(&cfg, extack);
	else {
		cfg.fc_delete_all_nh = 1;
		return ip6_route_del(&cfg, extack);
	}
}

/* rtnetlink RTM_NEWROUTE handler. */
static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct fib6_config cfg;
	int err;

	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
	if (err < 0)
		return err;

	if (cfg.fc_mp)
		return ip6_route_multipath_add(&cfg, extack);
	else
		return ip6_route_add(&cfg, GFP_KERNEL, extack);
}

/* Upper bound on the netlink message size rt6_fill_node() can emit for
 * @rt; used to size the skb in inet6_rt_notify().
 * NOTE(review): nexthop_len is multiplied by fib6_nsiblings only, while
 * rt6_fill_node() emits nsiblings + 1 rtnexthop entries; the first hop's
 * gateway/encap are in the base size but its rtnexthop header does not
 * appear to be counted — confirm against rt6_fill_node()'s output.
 */
static size_t rt6_nlmsg_size(struct fib6_info *rt)
{
	int nexthop_len = 0;

	if (rt->fib6_nsiblings) {
		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
			    + NLA_ALIGN(sizeof(struct rtnexthop))
			    + nla_total_size(16) /* RTA_GATEWAY */
			    + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);

		nexthop_len *= rt->fib6_nsiblings;
	}

	return NLMSG_ALIGN(sizeof(struct rtmsg))
	       + nla_total_size(16) /* RTA_SRC */
	       + nla_total_size(16) /* RTA_DST */
	       + nla_total_size(16) /* RTA_GATEWAY */
	       + nla_total_size(16) /* RTA_PREFSRC */
	       + nla_total_size(4) /* RTA_TABLE */
	       + nla_total_size(4) /* RTA_IIF */
	       + nla_total_size(4) /* RTA_OIF */
	       + nla_total_size(4) /* RTA_PRIORITY */
	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
	       + nla_total_size(sizeof(struct rta_cacheinfo))
	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
	       + nla_total_size(1) /* RTA_PREF */
	       + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
	       + nexthop_len;
}

/* Emit nexthop attributes (gateway, oif, lwtunnel encap) for @rt and
 * accumulate the RTNH_F_* status bits into *flags.  @skip_oif suppresses
 * RTA_OIF for multipath encoding, where the rtnexthop header carries the
 * ifindex.  Returns 0 or -EMSGSIZE when the skb runs out of room.
 */
static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
			    unsigned int *flags, bool skip_oif)
{
	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		*flags |= RTNH_F_DEAD;

	if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
		*flags |= RTNH_F_LINKDOWN;

		rcu_read_lock();
		if (fib6_ignore_linkdown(rt))
			*flags |= RTNH_F_DEAD;
		rcu_read_unlock();
	}

	if (rt->fib6_flags & RTF_GATEWAY) {
		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
			goto nla_put_failure;
	}

	*flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
	if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
		*flags |= RTNH_F_OFFLOAD;

	/* not needed for multipath encoding b/c it has a rtnexthop struct */
	if (!skip_oif && rt->fib6_nh.nh_dev &&
	    nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
		goto nla_put_failure;

	if (rt->fib6_nh.nh_lwtstate &&
	    lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

/* add multipath next hop */
static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
{
	const struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rtnexthop *rtnh;
	unsigned int flags = 0;

	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
	if (!rtnh)
		goto nla_put_failure;

	/* rtnh_hops is stored off by one relative to nh_weight */
	rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
	rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;

	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
		goto nla_put_failure;

	rtnh->rtnh_flags = flags;

	/* length of rtnetlink header + attributes */
	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

/* Serialize route @rt into an rtnetlink message on @skb.
 * @dst/@dest/@src are optional: when set (RTM_GETROUTE answers) they
 * override the prefix information with the resolved destination/source.
 * Returns 0, or -EMSGSIZE after cancelling the partial message.
 */
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	struct rtmsg *rtm;
	struct nlmsghdr *nlh;
	long expires = 0;
	u32 *pmetrics;
	u32 table;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt->fib6_dst.plen;
	rtm->rtm_src_len = rt->fib6_src.plen;
	rtm->rtm_tos = 0;
	if (rt->fib6_table)
		table = rt->fib6_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	rtm->rtm_table = table;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;

	rtm->rtm_type = rt->fib6_type;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->fib6_protocol;

	if (rt->fib6_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	if (dest) {
		if (nla_put_in6_addr(skb, RTA_DST, dest))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt->fib6_dst.addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt->fib6_src.addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		/* multicast destinations are answered from the mroute
		 * cache; 0 means ip6mr queued a full reply already
		 */
		if (ipv6_addr_is_multicast(&rt->fib6_dst.addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dest) {
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->fib6_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->fib6_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (rt6_add_nexthop(skb, rt) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings, fib6_siblings) {
			if (rt6_add_nexthop(skb, sibling) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
			goto nla_put_failure;
	}

	if (rt->fib6_flags & RTF_EXPIRES) {
		expires = dst ? dst->expires : rt->expires;
		expires -= jiffies;
	}

	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ?
			       dst->error : 0) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->fib6_flags)))
		goto nla_put_failure;


	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

/* Dump-walk callback: emit one RTM_NEWROUTE per visited route.
 * Returns 1 to skip non-prefix routes when the dump requested
 * RTM_F_PREFIX only, 0 for the null entry.
 */
int rt6_dump_route(struct fib6_info *rt, void *p_arg)
{
	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
	struct net *net = arg->net;

	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);

		/* user wants prefix routes only */
		if (rtm->rtm_flags & RTM_F_PREFIX &&
		    !(rt->fib6_flags & RTF_PREFIX_RT)) {
			/* success since this is not a prefix route */
			return 1;
		}
	}

	return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
			     RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
			     arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
}

/* RTM_GETROUTE handler: build a flow from the request attributes,
 * resolve it (input path when RTA_IIF is given, output path otherwise)
 * and answer with either the matched FIB entry (RTM_F_FIB_MATCH) or the
 * resolved dst.
 */
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	int err, iif = 0, oif = 0;
	struct fib6_info *from;
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6;
	bool fibmatch;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	memset(&fl6, 0, sizeof(fl6));
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

	if (tb[RTA_SPORT])
		fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);

	if (tb[RTA_DPORT])
		fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);

	if (tb[RTA_IP_PROTO]) {
		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
						  &fl6.flowi6_proto, extack);
		if (err)
			goto errout;
	}

	if (iif) {
		struct net_device *dev;
		int flags = 0;

		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			rcu_read_unlock();
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);

		rcu_read_unlock();
	} else {
		fl6.flowi6_oif = oif;

		dst = ip6_route_output(net, NULL, &fl6);
	}


	rt = container_of(dst, struct rt6_info, dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	/* skb now owns the dst reference */
	skb_dst_set(skb, &rt->dst);

	rcu_read_lock();
	from = rcu_dereference(rt->from);

	/* NOTE(review): 'from' is not checked for NULL before being passed
	 * to rt6_fill_node(), which dereferences it unconditionally —
	 * confirm rt->from cannot be cleared for the routes reaching here.
	 */
	if (fibmatch)
		err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	else
		err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
				    &fl6.saddr, iif, RTM_NEWROUTE,
				    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
				    0);
	rcu_read_unlock();

	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}

/* Notify route change @event for @rt to RTNLGRP_IPV6_ROUTE listeners.
 * On failure the error is recorded on the group so userspace dumps can
 * detect the lost notification.
 */
void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
		     unsigned int nlm_flags)
{
	struct sk_buff *skb;
	struct net *net = info->nl_net;
	u32 seq;
	int err;

	err = -ENOBUFS;
	seq = info->nlh ? info->nlh->nlmsg_seq : 0;

	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
	if (!skb)
		goto errout;

	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
			    event, info->portid, seq, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}

/* Netdev notifier: when the loopback device registers in a netns, point
 * the special null/prohibit/blackhole entries at it.
 */
static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev =
in6_dev_get(dev); 4938 #endif 4939 } else if (event == NETDEV_UNREGISTER && 4940 dev->reg_state != NETREG_UNREGISTERED) { 4941 /* NETDEV_UNREGISTER could be fired for multiple times by 4942 * netdev_wait_allrefs(). Make sure we only call this once. 4943 */ 4944 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev); 4945 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 4946 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev); 4947 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev); 4948 #endif 4949 } 4950 4951 return NOTIFY_OK; 4952 } 4953 4954 /* 4955 * /proc 4956 */ 4957 4958 #ifdef CONFIG_PROC_FS 4959 4960 static const struct file_operations ipv6_route_proc_fops = { 4961 .open = ipv6_route_open, 4962 .read = seq_read, 4963 .llseek = seq_lseek, 4964 .release = seq_release_net, 4965 }; 4966 4967 static int rt6_stats_seq_show(struct seq_file *seq, void *v) 4968 { 4969 struct net *net = (struct net *)seq->private; 4970 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n", 4971 net->ipv6.rt6_stats->fib_nodes, 4972 net->ipv6.rt6_stats->fib_route_nodes, 4973 atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc), 4974 net->ipv6.rt6_stats->fib_rt_entries, 4975 net->ipv6.rt6_stats->fib_rt_cache, 4976 dst_entries_get_slow(&net->ipv6.ip6_dst_ops), 4977 net->ipv6.rt6_stats->fib_discarded_routes); 4978 4979 return 0; 4980 } 4981 4982 static int rt6_stats_seq_open(struct inode *inode, struct file *file) 4983 { 4984 return single_open_net(inode, file, rt6_stats_seq_show); 4985 } 4986 4987 static const struct file_operations rt6_stats_seq_fops = { 4988 .open = rt6_stats_seq_open, 4989 .read = seq_read, 4990 .llseek = seq_lseek, 4991 .release = single_release_net, 4992 }; 4993 #endif /* CONFIG_PROC_FS */ 4994 4995 #ifdef CONFIG_SYSCTL 4996 4997 static 4998 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, 4999 void __user *buffer, size_t *lenp, loff_t *ppos) 5000 { 5001 struct net *net; 5002 int delay; 5003 if (!write) 5004 return -EINVAL; 5005 5006 net = (struct 
net *)ctl->extra1; 5007 delay = net->ipv6.sysctl.flush_delay; 5008 proc_dointvec(ctl, write, buffer, lenp, ppos); 5009 fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0); 5010 return 0; 5011 } 5012 5013 struct ctl_table ipv6_route_table_template[] = { 5014 { 5015 .procname = "flush", 5016 .data = &init_net.ipv6.sysctl.flush_delay, 5017 .maxlen = sizeof(int), 5018 .mode = 0200, 5019 .proc_handler = ipv6_sysctl_rtcache_flush 5020 }, 5021 { 5022 .procname = "gc_thresh", 5023 .data = &ip6_dst_ops_template.gc_thresh, 5024 .maxlen = sizeof(int), 5025 .mode = 0644, 5026 .proc_handler = proc_dointvec, 5027 }, 5028 { 5029 .procname = "max_size", 5030 .data = &init_net.ipv6.sysctl.ip6_rt_max_size, 5031 .maxlen = sizeof(int), 5032 .mode = 0644, 5033 .proc_handler = proc_dointvec, 5034 }, 5035 { 5036 .procname = "gc_min_interval", 5037 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 5038 .maxlen = sizeof(int), 5039 .mode = 0644, 5040 .proc_handler = proc_dointvec_jiffies, 5041 }, 5042 { 5043 .procname = "gc_timeout", 5044 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout, 5045 .maxlen = sizeof(int), 5046 .mode = 0644, 5047 .proc_handler = proc_dointvec_jiffies, 5048 }, 5049 { 5050 .procname = "gc_interval", 5051 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval, 5052 .maxlen = sizeof(int), 5053 .mode = 0644, 5054 .proc_handler = proc_dointvec_jiffies, 5055 }, 5056 { 5057 .procname = "gc_elasticity", 5058 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity, 5059 .maxlen = sizeof(int), 5060 .mode = 0644, 5061 .proc_handler = proc_dointvec, 5062 }, 5063 { 5064 .procname = "mtu_expires", 5065 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires, 5066 .maxlen = sizeof(int), 5067 .mode = 0644, 5068 .proc_handler = proc_dointvec_jiffies, 5069 }, 5070 { 5071 .procname = "min_adv_mss", 5072 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss, 5073 .maxlen = sizeof(int), 5074 .mode = 0644, 5075 .proc_handler = proc_dointvec, 5076 }, 5077 { 5078 .procname = 
"gc_min_interval_ms",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{ }
};

/* Clone the sysctl template for one netns and point each entry's .data
 * at that namespace's own fields.  Returns NULL on allocation failure.
 */
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
#endif

/* Per-netns setup: clone the template null/prohibit/blackhole route
 * entries and install the default GC tunables.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

	/* Error unwinding, in reverse allocation order. */
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}

/* Per-netns teardown, mirror of ip6_route_net_init(). */
static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.fib6_null_entry);
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}

/* Late per-netns init: the /proc entries expose state created by the
 * earlier pernet init hooks.
 */
static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
	proc_create("rt6_stats", 0444, net->proc_net, &rt6_stats_seq_fops);
#endif
	return 0;
}

static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}

static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_ops_init_fn_placeholder_comment_free,
	.exit = ip6_route_net_exit,
};

/* Allocate the per-netns inetpeer base used for IPv6 peer info. */
static int __net_init ipv6_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv6.peers = bp;
	return 0;
}

static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv6.peers;

	net->ipv6.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static struct pernet_operations ipv6_inetpeer_ops = {
	.init = ipv6_inetpeer_init,
	.exit = ipv6_inetpeer_exit,
};

static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};

static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	/* Lower priority than addrconf's notifier, so it runs after it. */
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};

void __init ip6_route_init_special_entries(void)
{
	/* Registering of the loopback is done before this portion of code,
	 * the loopback reference in rt6_info will not be taken, do it
	 * manually for
init_net */
	init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#endif
}

/* Subsystem entry point: set up the dst cache, pernet state, FIB,
 * policy rules and rtnetlink handlers.  On failure everything
 * registered so far is torn down in reverse order via the goto ladder.
 */
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	/* Blackhole dsts are carved from the regular rt6_info cache. */
	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}

/* Module teardown, mirroring ip6_route_init() in reverse order. */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}