/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable.  otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <trace/events/fib6.h>

#include <linux/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};

static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
static unsigned int	 ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
static int		 ip6_dst_gc(struct dst_ops *ops);

static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int		ip6_pkt_prohibit(struct sk_buff *skb);
static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct fib6_info *rt);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_add_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev,
					    unsigned int pref);
static struct fib6_info *rt6_get_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev);
#endif

struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);

void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}

static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}

static inline const void *choose_neigh_daddr(const struct in6_addr *p,
					     struct sk_buff *skb,
					     const void *daddr)
{
	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}

struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
{
	struct neighbour *n;

	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	if (n)
		return n;
	return neigh_create(&nd_tbl, daddr, dev);
}

static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
}

static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}
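/*
 * Illustrative note (an addition, not part of the original source):
 * choose_neigh_daddr() picks the neighbour key with a simple fallback
 * chain. Sketch of the three cases, with a hypothetical gateway:
 *
 *	gw = 2001:db8::1, any skb/daddr	-> key = gw (non-any gateway wins)
 *	gw = ::, skb != NULL		-> key = ipv6_hdr(skb)->daddr
 *	gw = ::, skb == NULL		-> key = daddr (may itself be NULL)
 *
 * ip6_confirm_neigh() passes skb == NULL, which is why it must bail out
 * when the resolved key is still NULL.
 */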
static struct dst_ops ip6_dst_ops_template = {
	.family			= AF_INET6,
	.gc			= ip6_dst_gc,
	.gc_thresh		= 1024,
	.check			= ip6_dst_check,
	.default_advmss		= ip6_default_advmss,
	.mtu			= ip6_mtu,
	.cow_metrics		= dst_cow_metrics_generic,
	.destroy		= ip6_dst_destroy,
	.ifdown			= ip6_dst_ifdown,
	.negative_advice	= ip6_negative_advice,
	.link_failure		= ip6_link_failure,
	.update_pmtu		= ip6_rt_update_pmtu,
	.redirect		= rt6_do_redirect,
	.local_out		= __ip6_local_out,
	.neigh_lookup		= ip6_dst_neigh_lookup,
	.confirm_neigh		= ip6_confirm_neigh,
};

static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

static struct dst_ops ip6_dst_blackhole_ops = {
	.family			= AF_INET6,
	.destroy		= ip6_dst_destroy,
	.check			= ip6_dst_check,
	.mtu			= ip6_blackhole_mtu,
	.default_advmss		= ip6_default_advmss,
	.update_pmtu		= ip6_rt_blackhole_update_pmtu,
	.redirect		= ip6_rt_blackhole_redirect,
	.cow_metrics		= dst_cow_metrics_generic,
	.neigh_lookup		= ip6_dst_neigh_lookup,
};

static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol	= RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};

static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif

static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}

/* allocate dst with ip6_dst_ops */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	rcu_assign_pointer(rt->from, NULL);
	fib6_info_release(from);
	rcu_read_unlock();
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}

static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}

static struct fib6_info *rt6_multipath_select(const struct net *net,
					      struct fib6_info *match,
					      struct flowi6 *fl6, int oif,
					      const struct sk_buff *skb,
					      int strict)
{
	struct fib6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}

/*
 *	Route lookup. rcu_read_lock() should be held.
 */

static inline struct fib6_info *rt6_device_match(struct net *net,
						 struct fib6_info *rt,
						 const struct in6_addr *saddr,
						 int oif,
						 int flags)
{
	struct fib6_info *sprt;

	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) {
		const struct net_device *dev = sprt->fib6_nh.nh_dev;

		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE)
		return net->ipv6.fib6_null_entry;

	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ?
	       net->ipv6.fib6_null_entry : rt;
}
#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}

static void rt6_probe(struct fib6_info *rt)
{
	struct __rt6_probe_work *work;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
		return;

	nh_gw = &rt->fib6_nh.nh_gw;
	dev = rt->fib6_nh.nh_dev;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		struct inet6_dev *idev;

		if (neigh->nud_state & NUD_VALID)
			goto out;

		idev = __in6_dev_get(dev);
		work = NULL;
		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct fib6_info *rt)
{
}
#endif
/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
static inline int rt6_check_dev(struct fib6_info *rt, int oif)
{
	const struct net_device *dev = rt->fib6_nh.nh_dev;

	if (!oif || dev->ifindex == oif)
		return 2;
	return 0;
}

static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	if (rt->fib6_flags & RTF_NONEXTHOP ||
	    !(rt->fib6_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
					  &rt->fib6_nh.nh_gw);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}

static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}
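/*
 * Illustrative note (an addition, not part of the original source):
 * rt6_score_route() packs its verdict into a small integer, e.g. for a
 * route whose device matches oif (rt6_check_dev() returns 2) and whose
 * decoded router preference is p (CONFIG_IPV6_ROUTER_PREF only):
 *
 *	m = 2 | (p << 2);
 *
 * Negative return values are not scores but rt6_nud_state errors from
 * rt6_check_neigh(), propagated when RT6_LOOKUP_F_REACHABLE is set;
 * find_match() treats RT6_NUD_FAIL_DO_RR as score 0 plus a round-robin
 * request.
 */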
/* called with rcu_read_lock held */
static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
{
	const struct net_device *dev = fib6_info_nh_dev(f6i);
	bool rc = false;

	if (dev) {
		const struct inet6_dev *idev = __in6_dev_get(dev);

		rc = !!idev->cnf.ignore_routes_with_linkdown;
	}

	return rc;
}

static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
				    int *mpri, struct fib6_info *match,
				    bool *do_rr)
{
	int m;
	bool match_do_rr = false;

	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		goto out;

	if (fib6_ignore_linkdown(rt) &&
	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (fib6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}

static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
				      struct fib6_info *leaf,
				      struct fib6_info *rr_head,
				      u32 metric, int oif, int strict,
				      bool *do_rr)
{
	struct fib6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->rt6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}

static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
				    int oif, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.fib6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->rt6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.fib6_null_entry;
}

static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
{
	return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
}

#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
#endif
/*
 *	Misc support functions
 */

/* called with rcu_lock held */
static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
{
	struct net_device *dev = rt->fib6_nh.nh_dev;

	if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}

static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};

static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}

static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
{
	unsigned short flags = 0;

	if (rt->dst_nocount)
		flags |= DST_NOCOUNT;
	if (rt->dst_nopolicy)
		flags |= DST_NOPOLICY;
	if (rt->dst_host)
		flags |= DST_HOST;

	return flags;
}

static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);

	switch (ort->fib6_type) {
	case RTN_BLACKHOLE:
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}

static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.flags |= fib6_info_dst_flags(ort);

	if (ort->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, ort);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	if (ort->fib6_type == RTN_LOCAL) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (ort->fib6_nh.nh_lwtstate) {
		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}

static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	fib6_info_hold(from);
	rcu_assign_pointer(rt->from, from);
	dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
	if (from->fib6_metrics != &dst_default_metrics) {
		rt->dst._metrics |= DST_METRICS_REFCOUNTED;
		refcount_inc(&from->fib6_metrics->refcnt);
	}
}

static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
{
	struct net_device *dev = fib6_info_nh_dev(ort);

	ip6_rt_init_dst(rt, ort);

	rt->rt6i_dst = ort->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
	rt->rt6i_flags = ort->fib6_flags;
	rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->fib6_src;
#endif
	rt->rt6i_prefsrc = ort->fib6_prefsrc;
	rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
}

static struct fib6_node *fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}

static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
			  bool null_fallback)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (null_fallback) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}

/* called with rcu_lock held */
static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rt6_info *nrt;

	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (nrt)
		ip6_rt_copy_init(nrt, rt);

	return nrt;
}

static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_info *f6i;
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	f6i = rcu_dereference(fn->leaf);
	if (!f6i) {
		f6i = net->ipv6.fib6_null_entry;
	} else {
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
				       fl6->flowi6_oif, flags);
		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
			f6i = rt6_multipath_select(net, f6i, fl6,
						   fl6->flowi6_oif, skb, flags);
	}
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);
	} else if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = ip6_create_rt_rcu(f6i);
		if (!rt) {
			rt = net->ipv6.ip6_null_entry;
			dst_hold(&rt->dst);
		}
	}

	rcu_read_unlock();

	trace_fib6_table_lookup(net, rt, table, fl6);

	return rt;
}

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);

/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason,
 * the route is released.
 * Caller must hold dst before calling it.
 */

static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = { .nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}

static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	dev = ip6_rt_get_dev_rcu(ort);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}

static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt)
		return NULL;
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt, false);

	return pcpu_rt;
}

static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	return pcpu_rt;
}
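/*
 * Illustrative note (an addition, not part of the original source): the
 * per-cpu dst install above needs no lock because callers run under
 * rcu_read_lock() with BHs disabled, so only one context can fill a
 * given CPU's slot:
 *
 *	p = this_cpu_ptr(rt->rt6i_pcpu);
 *	prev = cmpxchg(p, NULL, pcpu_rt);	// only NULL -> pcpu_rt
 *	BUG_ON(prev);				// a racing writer is a bug
 */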
/* exception hash table implementation
 */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
	net->ipv6.rt6_stats->fib_rt_cache--;
}

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}

static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
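/*
 * Illustrative note (an addition, not part of the original source): the
 * exception table is a flat array of 2^FIB6_EXCEPTION_BUCKET_SIZE_SHIFT
 * buckets hanging off a fib6_info, indexed as:
 *
 *	val = jhash(daddr, sizeof(struct in6_addr), seed);
 *	val = jhash(saddr, sizeof(struct in6_addr), val);  // subtrees only
 *	idx = hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
 *
 * The __rt6_find_exception_*() helpers below advance the caller's bucket
 * pointer by that index, which is why they take a
 * struct rt6_exception_bucket **.
 */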
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

static unsigned int fib6_mtu(const struct fib6_info *rt)
{
	unsigned int mtu;

	if (rt->fib6_pmtu) {
		mtu = rt->fib6_pmtu;
	} else {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
}
static int rt6_insert_exception(struct rt6_info *nrt,
				struct fib6_info *ort)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
	nrt->rt6i_prefsrc = ort->fib6_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->fib6_table->tb6_lock);
		fib6_update_sernum(net, ort);
		spin_unlock_bh(&ort->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}

void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() from recreating the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}
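/*
 * Illustrative note (an addition, not part of the original source): each
 * bucket chain is bounded. When an insert pushes a chain past
 * FIB6_MAX_DEPTH, rt6_exception_remove_oldest() evicts the entry with the
 * oldest ->stamp, so one hot hash slot degrades to LRU-style replacement
 * rather than growing without bound.
 */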
/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->fib6_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}

/* Remove the passed in cached rt from the hash table that contains it */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	from = rcu_dereference(rt->from);
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}

/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct fib6_info *from = rt->from;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;

	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return;

	rcu_read_lock();
	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

	rcu_read_unlock();
}

static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
				rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
			}
			bucket++;
		}
	}
}

static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
{
	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
	 *
	 * If the new MTU is higher, and the route PMTU is equal to the local
	 * MTU, this means the old MTU is the lowest in the path, so allow
	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
	 * handle this.
	 */

	if (dst_mtu(&rt->dst) >= mtu)
		return true;

	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
		return true;

	return false;
}
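/*
 * Illustrative note (an addition, not part of the original source): two
 * worked cases for rt6_mtu_change_route_allowed(), assuming a link MTU
 * (idev->cnf.mtu6) of 1500:
 *
 *	route PMTU 1500, new MTU 1280 -> allowed  (PMTU only shrinks here)
 *	route PMTU 1280, new MTU 1500 -> rejected (1280 was learned from a
 *		narrower hop elsewhere; raising it locally would overshoot)
 *	route PMTU 1500, new MTU 9000 -> allowed  (the local link was the
 *		path minimum; PMTU discovery re-probes the rest)
 */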
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}

#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}

static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non-gateway exceptions
	 * even if others still have references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones - are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}

void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}
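/*
 * Illustrative note (an addition, not part of the original source): the
 * aging rules above reduce to three prune conditions per exception:
 *
 *	!RTF_EXPIRES && now >= lastuse + gc timeout	-> aged-out clone
 *	RTF_EXPIRES && now > dst.expires		-> expired (RFC 8201)
 *	RTF_GATEWAY && neighbour lost NTF_ROUTER	-> stale gateway
 *
 * Anything that survives bumps gc_args->more so the GC timer re-arms.
 */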
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_node *fn, *saved_fn;
	struct fib6_info *f6i;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	f6i = rt6_select(net, fn, oif, strict);
	if (f6i->fib6_nsiblings)
		f6i = rt6_multipath_select(net, f6i, fl6, oif, skb, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		trace_fib6_table_lookup(net, rt, table, fl6);
		return rt;
	}

	/* Search through the exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		trace_fib6_table_lookup(net, rt, table, fl6);
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(f6i->fib6_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		trace_fib6_table_lookup(net, uncached_rt, table, fl6);
		return uncached_rt;

	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(f6i);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, f6i);

		local_bh_enable();
		rcu_read_unlock();
		trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
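/*
 * Illustrative note (an addition, not part of the original source):
 * ip6_pol_route() hands back one of three kinds of dst for a match:
 *
 *	1. a cached exception (RTF_CACHE) from rt6_find_cached_rt();
 *	2. an uncached RTF_CACHE clone, only for FLOWI_FLAG_KNOWN_NH on a
 *	   gatewayless route, tracked on rt6_uncached_list;
 *	3. the per-cpu copy from rt6_get_pcpu_route()/rt6_make_pcpu_route().
 *
 * All three paths return with a reference held, so callers drop it with
 * dst_release().
 */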
static struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
}

struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6,
					 const struct sk_buff *skb,
					 int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);

static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = icmp6_hdr(skb);
	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowinfo(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}

/* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}
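/*
 * Illustrative note (an addition, not part of the original source): the
 * two hash policies above (net.ipv6.fib_multipath_hash_policy) differ
 * only in the fields fed to flow_hash_from_keys():
 *
 *	policy 0 (L3): saddr, daddr, flow label, next header
 *	policy 1 (L4): saddr, daddr, source/destination port, protocol
 *
 * For ICMPv6 errors, ip6_multipath_l3_keys() hashes the offending
 * packet's inner header instead, so the error is steered onto the same
 * path as the flow that triggered it.
 */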
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}

static struct rt6_info *ip6_pol_route_output(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
}

struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	if (rt6_need_strict(&fl6->daddr)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
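/*
 * Illustrative usage sketch (an addition, not part of the original
 * source; values are hypothetical). A typical caller builds a flowi6,
 * checks ->error and releases the dst when done:
 *
 *	struct flowi6 fl6 = {
 *		.flowi6_oif = sk->sk_bound_dev_if,
 *		.daddr = *daddr,
 *		.saddr = *saddr,
 *	};
 *	struct dst_entry *dst;
 *
 *	dst = ip6_route_output_flags(net, sk, &fl6, 0);
 *	if (dst->error) {
 *		int err = dst->error;
 *
 *		dst_release(dst);
 *		return err;
 *	}
 *	// ... use dst, then dst_release(dst)
 */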
*ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) 2094 { 2095 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig; 2096 struct net_device *loopback_dev = net->loopback_dev; 2097 struct dst_entry *new = NULL; 2098 2099 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1, 2100 DST_OBSOLETE_DEAD, 0); 2101 if (rt) { 2102 rt6_info_init(rt); 2103 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); 2104 2105 new = &rt->dst; 2106 new->__use = 1; 2107 new->input = dst_discard; 2108 new->output = dst_discard_out; 2109 2110 dst_copy_metrics(new, &ort->dst); 2111 2112 rt->rt6i_idev = in6_dev_get(loopback_dev); 2113 rt->rt6i_gateway = ort->rt6i_gateway; 2114 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU; 2115 2116 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); 2117 #ifdef CONFIG_IPV6_SUBTREES 2118 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); 2119 #endif 2120 } 2121 2122 dst_release(dst_orig); 2123 return new ? new : ERR_PTR(-ENOMEM); 2124 } 2125 2126 /* 2127 * Destination cache support functions 2128 */ 2129 2130 static bool fib6_check(struct fib6_info *f6i, u32 cookie) 2131 { 2132 u32 rt_cookie = 0; 2133 2134 if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie) 2135 return false; 2136 2137 if (fib6_check_expired(f6i)) 2138 return false; 2139 2140 return true; 2141 } 2142 2143 static struct dst_entry *rt6_check(struct rt6_info *rt, 2144 struct fib6_info *from, 2145 u32 cookie) 2146 { 2147 u32 rt_cookie = 0; 2148 2149 if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) || 2150 rt_cookie != cookie) 2151 return NULL; 2152 2153 if (rt6_check_expired(rt)) 2154 return NULL; 2155 2156 return &rt->dst; 2157 } 2158 2159 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, 2160 struct fib6_info *from, 2161 u32 cookie) 2162 { 2163 if (!__rt6_check_expired(rt) && 2164 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && 2165 fib6_check(from, cookie)) 2166 return &rt->dst; 2167 else 2168 return NULL; 2169 } 2170 2171 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) 2172 { 2173 struct dst_entry *dst_ret; 2174 struct fib6_info *from; 2175 struct rt6_info *rt; 2176 2177 rt = container_of(dst, struct rt6_info, dst); 2178 2179 rcu_read_lock(); 2180 2181 /* All IPV6 dsts are created with ->obsolete set to the value 2182 * DST_OBSOLETE_FORCE_CHK which forces validation calls down 2183 * into this function always. 
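* Returning NULL from here tells the caller (e.g. a socket that cached this dst) that the entry is stale and a fresh route lookup is required.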
2184 */ 2185 2186 from = rcu_dereference(rt->from); 2187 2188 if (from && (rt->rt6i_flags & RTF_PCPU || 2189 unlikely(!list_empty(&rt->rt6i_uncached)))) 2190 dst_ret = rt6_dst_from_check(rt, from, cookie); 2191 else 2192 dst_ret = rt6_check(rt, from, cookie); 2193 2194 rcu_read_unlock(); 2195 2196 return dst_ret; 2197 } 2198 2199 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) 2200 { 2201 struct rt6_info *rt = (struct rt6_info *) dst; 2202 2203 if (rt) { 2204 if (rt->rt6i_flags & RTF_CACHE) { 2205 rcu_read_lock(); 2206 if (rt6_check_expired(rt)) { 2207 rt6_remove_exception_rt(rt); 2208 dst = NULL; 2209 } 2210 rcu_read_unlock(); 2211 } else { 2212 dst_release(dst); 2213 dst = NULL; 2214 } 2215 } 2216 return dst; 2217 } 2218 2219 static void ip6_link_failure(struct sk_buff *skb) 2220 { 2221 struct rt6_info *rt; 2222 2223 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); 2224 2225 rt = (struct rt6_info *) skb_dst(skb); 2226 if (rt) { 2227 rcu_read_lock(); 2228 if (rt->rt6i_flags & RTF_CACHE) { 2229 if (dst_hold_safe(&rt->dst)) 2230 rt6_remove_exception_rt(rt); 2231 } else { 2232 struct fib6_info *from; 2233 struct fib6_node *fn; 2234 2235 from = rcu_dereference(rt->from); 2236 if (from) { 2237 fn = rcu_dereference(from->fib6_node); 2238 if (fn && (rt->rt6i_flags & RTF_DEFAULT)) 2239 fn->fn_sernum = -1; 2240 } 2241 } 2242 rcu_read_unlock(); 2243 } 2244 } 2245 2246 static void rt6_update_expires(struct rt6_info *rt0, int timeout) 2247 { 2248 if (!(rt0->rt6i_flags & RTF_EXPIRES)) { 2249 struct fib6_info *from; 2250 2251 rcu_read_lock(); 2252 from = rcu_dereference(rt0->from); 2253 if (from) 2254 rt0->dst.expires = from->expires; 2255 rcu_read_unlock(); 2256 } 2257 2258 dst_set_expires(&rt0->dst, timeout); 2259 rt0->rt6i_flags |= RTF_EXPIRES; 2260 } 2261 2262 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) 2263 { 2264 struct net *net = dev_net(rt->dst.dev); 2265 2266 dst_metric_set(&rt->dst, RTAX_MTU, mtu); 2267 rt->rt6i_flags |= RTF_MODIFIED; 2268 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires); 2269 } 2270 2271 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) 2272 { 2273 bool from_set; 2274 2275 rcu_read_lock(); 2276 from_set = !!rcu_dereference(rt->from); 2277 rcu_read_unlock(); 2278 2279 return !(rt->rt6i_flags & RTF_CACHE) && 2280 (rt->rt6i_flags & RTF_PCPU || from_set); 2281 } 2282 2283 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, 2284 const struct ipv6hdr *iph, u32 mtu) 2285 { 2286 const struct in6_addr *daddr, *saddr; 2287 struct rt6_info *rt6 = (struct rt6_info *)dst; 2288 2289 if (rt6->rt6i_flags & RTF_LOCAL) 2290 return; 2291 2292 if (dst_metric_locked(dst, RTAX_MTU)) 2293 return; 2294 2295 if (iph) { 2296 daddr = &iph->daddr; 2297 saddr = &iph->saddr; 2298 } else if (sk) { 2299 daddr = &sk->sk_v6_daddr; 2300 saddr = &inet6_sk(sk)->saddr; 2301 } else { 2302 daddr = NULL; 2303 saddr = NULL; 2304 } 2305 dst_confirm_neigh(dst, daddr); 2306 mtu = max_t(u32, mtu, IPV6_MIN_MTU); 2307 if (mtu >= dst_mtu(dst)) 2308 return; 2309 2310 if (!rt6_cache_allowed_for_pmtu(rt6)) { 2311 rt6_do_update_pmtu(rt6, mtu); 2312 /* update rt6_ex->stamp for cache */ 2313 if (rt6->rt6i_flags & RTF_CACHE) 2314 rt6_update_exception_stamp_rt(rt6); 2315 } else if (daddr) { 2316 struct fib6_info *from; 2317 struct rt6_info *nrt6; 2318 2319 rcu_read_lock(); 2320 from = rcu_dereference(rt6->from); 2321 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr); 2322 if (nrt6) { 2323 rt6_do_update_pmtu(nrt6, mtu); 2324 if 
(rt6_insert_exception(nrt6, from)) 2325 dst_release_immediate(&nrt6->dst); 2326 } 2327 rcu_read_unlock(); 2328 } 2329 } 2330 2331 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, 2332 struct sk_buff *skb, u32 mtu) 2333 { 2334 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu); 2335 } 2336 2337 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, 2338 int oif, u32 mark, kuid_t uid) 2339 { 2340 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2341 struct dst_entry *dst; 2342 struct flowi6 fl6; 2343 2344 memset(&fl6, 0, sizeof(fl6)); 2345 fl6.flowi6_oif = oif; 2346 fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark); 2347 fl6.daddr = iph->daddr; 2348 fl6.saddr = iph->saddr; 2349 fl6.flowlabel = ip6_flowinfo(iph); 2350 fl6.flowi6_uid = uid; 2351 2352 dst = ip6_route_output(net, NULL, &fl6); 2353 if (!dst->error) 2354 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu)); 2355 dst_release(dst); 2356 } 2357 EXPORT_SYMBOL_GPL(ip6_update_pmtu); 2358 2359 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) 2360 { 2361 struct dst_entry *dst; 2362 2363 ip6_update_pmtu(skb, sock_net(sk), mtu, 2364 sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid); 2365 2366 dst = __sk_dst_get(sk); 2367 if (!dst || !dst->obsolete || 2368 dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) 2369 return; 2370 2371 bh_lock_sock(sk); 2372 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) 2373 ip6_datagram_dst_update(sk, false); 2374 bh_unlock_sock(sk); 2375 } 2376 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu); 2377 2378 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, 2379 const struct flowi6 *fl6) 2380 { 2381 #ifdef CONFIG_IPV6_SUBTREES 2382 struct ipv6_pinfo *np = inet6_sk(sk); 2383 #endif 2384 2385 ip6_dst_store(sk, dst, 2386 ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ? 2387 &sk->sk_v6_daddr : NULL, 2388 #ifdef CONFIG_IPV6_SUBTREES 2389 ipv6_addr_equal(&fl6->saddr, &np->saddr) ? 2390 &np->saddr : 2391 #endif 2392 NULL); 2393 } 2394 2395 /* Handle redirects */ 2396 struct ip6rd_flowi { 2397 struct flowi6 fl6; 2398 struct in6_addr gateway; 2399 }; 2400 2401 static struct rt6_info *__ip6_route_redirect(struct net *net, 2402 struct fib6_table *table, 2403 struct flowi6 *fl6, 2404 const struct sk_buff *skb, 2405 int flags) 2406 { 2407 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; 2408 struct rt6_info *ret = NULL, *rt_cache; 2409 struct fib6_info *rt; 2410 struct fib6_node *fn; 2411 2412 /* Get the "current" route for this destination and 2413 * check if the redirect has come from appropriate router. 2414 * 2415 * RFC 4861 specifies that redirects should only be 2416 * accepted if they come from the nexthop to the target. 2417 * Due to the way the routes are chosen, this notion 2418 * is a bit fuzzy and one might need to check all possible 2419 * routes. 2420 */ 2421 2422 rcu_read_lock(); 2423 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 2424 restart: 2425 for_each_fib6_node_rt_rcu(fn) { 2426 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) 2427 continue; 2428 if (fib6_check_expired(rt)) 2429 continue; 2430 if (rt->fib6_flags & RTF_REJECT) 2431 break; 2432 if (!(rt->fib6_flags & RTF_GATEWAY)) 2433 continue; 2434 if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex) 2435 continue; 2436 /* rt_cache's gateway might be different from its 'parent' 2437 * in the case of an ip redirect. 2438 * So we keep searching in the exception table if the gateway 2439 * is different. 
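* (The exception entry created by an earlier redirect stores the redirected gateway, while its 'parent' keeps the original one.)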
2440 */ 2441 if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) { 2442 rt_cache = rt6_find_cached_rt(rt, 2443 &fl6->daddr, 2444 &fl6->saddr); 2445 if (rt_cache && 2446 ipv6_addr_equal(&rdfl->gateway, 2447 &rt_cache->rt6i_gateway)) { 2448 ret = rt_cache; 2449 break; 2450 } 2451 continue; 2452 } 2453 break; 2454 } 2455 2456 if (!rt) 2457 rt = net->ipv6.fib6_null_entry; 2458 else if (rt->fib6_flags & RTF_REJECT) { 2459 ret = net->ipv6.ip6_null_entry; 2460 goto out; 2461 } 2462 2463 if (rt == net->ipv6.fib6_null_entry) { 2464 fn = fib6_backtrack(fn, &fl6->saddr); 2465 if (fn) 2466 goto restart; 2467 } 2468 2469 out: 2470 if (ret) 2471 dst_hold(&ret->dst); 2472 else 2473 ret = ip6_create_rt_rcu(rt); 2474 2475 rcu_read_unlock(); 2476 2477 trace_fib6_table_lookup(net, ret, table, fl6); 2478 return ret; 2479 }; 2480 2481 static struct dst_entry *ip6_route_redirect(struct net *net, 2482 const struct flowi6 *fl6, 2483 const struct sk_buff *skb, 2484 const struct in6_addr *gateway) 2485 { 2486 int flags = RT6_LOOKUP_F_HAS_SADDR; 2487 struct ip6rd_flowi rdfl; 2488 2489 rdfl.fl6 = *fl6; 2490 rdfl.gateway = *gateway; 2491 2492 return fib6_rule_lookup(net, &rdfl.fl6, skb, 2493 flags, __ip6_route_redirect); 2494 } 2495 2496 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, 2497 kuid_t uid) 2498 { 2499 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2500 struct dst_entry *dst; 2501 struct flowi6 fl6; 2502 2503 memset(&fl6, 0, sizeof(fl6)); 2504 fl6.flowi6_iif = LOOPBACK_IFINDEX; 2505 fl6.flowi6_oif = oif; 2506 fl6.flowi6_mark = mark; 2507 fl6.daddr = iph->daddr; 2508 fl6.saddr = iph->saddr; 2509 fl6.flowlabel = ip6_flowinfo(iph); 2510 fl6.flowi6_uid = uid; 2511 2512 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr); 2513 rt6_do_redirect(dst, NULL, skb); 2514 dst_release(dst); 2515 } 2516 EXPORT_SYMBOL_GPL(ip6_redirect); 2517 2518 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif, 2519 u32 mark) 2520 { 2521 const struct ipv6hdr *iph = ipv6_hdr(skb); 2522 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb); 2523 struct dst_entry *dst; 2524 struct flowi6 fl6; 2525 2526 memset(&fl6, 0, sizeof(fl6)); 2527 fl6.flowi6_iif = LOOPBACK_IFINDEX; 2528 fl6.flowi6_oif = oif; 2529 fl6.flowi6_mark = mark; 2530 fl6.daddr = msg->dest; 2531 fl6.saddr = iph->daddr; 2532 fl6.flowi6_uid = sock_net_uid(net, NULL); 2533 2534 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr); 2535 rt6_do_redirect(dst, NULL, skb); 2536 dst_release(dst); 2537 } 2538 2539 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) 2540 { 2541 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark, 2542 sk->sk_uid); 2543 } 2544 EXPORT_SYMBOL_GPL(ip6_sk_redirect); 2545 2546 static unsigned int ip6_default_advmss(const struct dst_entry *dst) 2547 { 2548 struct net_device *dev = dst->dev; 2549 unsigned int mtu = dst_mtu(dst); 2550 struct net *net = dev_net(dev); 2551 2552 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); 2553 2554 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) 2555 mtu = net->ipv6.sysctl.ip6_rt_min_advmss; 2556 2557 /* 2558 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 2559 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 
2560 * IPV6_MAXPLEN is also valid and means: "any MSS, 2561 * rely only on pmtu discovery" 2562 */ 2563 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr)) 2564 mtu = IPV6_MAXPLEN; 2565 return mtu; 2566 } 2567 2568 static unsigned int ip6_mtu(const struct dst_entry *dst) 2569 { 2570 struct inet6_dev *idev; 2571 unsigned int mtu; 2572 2573 mtu = dst_metric_raw(dst, RTAX_MTU); 2574 if (mtu) 2575 goto out; 2576 2577 mtu = IPV6_MIN_MTU; 2578 2579 rcu_read_lock(); 2580 idev = __in6_dev_get(dst->dev); 2581 if (idev) 2582 mtu = idev->cnf.mtu6; 2583 rcu_read_unlock(); 2584 2585 out: 2586 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 2587 2588 return mtu - lwtunnel_headroom(dst->lwtstate, mtu); 2589 } 2590 2591 struct dst_entry *icmp6_dst_alloc(struct net_device *dev, 2592 struct flowi6 *fl6) 2593 { 2594 struct dst_entry *dst; 2595 struct rt6_info *rt; 2596 struct inet6_dev *idev = in6_dev_get(dev); 2597 struct net *net = dev_net(dev); 2598 2599 if (unlikely(!idev)) 2600 return ERR_PTR(-ENODEV); 2601 2602 rt = ip6_dst_alloc(net, dev, 0); 2603 if (unlikely(!rt)) { 2604 in6_dev_put(idev); 2605 dst = ERR_PTR(-ENOMEM); 2606 goto out; 2607 } 2608 2609 rt->dst.flags |= DST_HOST; 2610 rt->dst.input = ip6_input; 2611 rt->dst.output = ip6_output; 2612 rt->rt6i_gateway = fl6->daddr; 2613 rt->rt6i_dst.addr = fl6->daddr; 2614 rt->rt6i_dst.plen = 128; 2615 rt->rt6i_idev = idev; 2616 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0); 2617 2618 /* Add this dst into uncached_list so that rt6_disable_ip() can 2619 * do proper release of the net_device 2620 */ 2621 rt6_uncached_list_add(rt); 2622 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); 2623 2624 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0); 2625 2626 out: 2627 return dst; 2628 } 2629 2630 static int ip6_dst_gc(struct dst_ops *ops) 2631 { 2632 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); 2633 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; 2634 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size; 2635 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; 2636 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; 2637 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; 2638 int entries; 2639 2640 entries = dst_entries_get_fast(ops); 2641 if (time_after(rt_last_gc + rt_min_interval, jiffies) && 2642 entries <= rt_max_size) 2643 goto out; 2644 2645 net->ipv6.ip6_rt_gc_expire++; 2646 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true); 2647 entries = dst_entries_get_slow(ops); 2648 if (entries < ops->gc_thresh) 2649 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; 2650 out: 2651 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity; 2652 return entries > rt_max_size; 2653 } 2654 2655 static int ip6_convert_metrics(struct net *net, struct fib6_info *rt, 2656 struct fib6_config *cfg) 2657 { 2658 struct dst_metrics *p; 2659 2660 if (!cfg->fc_mx) 2661 return 0; 2662 2663 p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL); 2664 if (unlikely(!p)) 2665 return -ENOMEM; 2666 2667 refcount_set(&p->refcnt, 1); 2668 rt->fib6_metrics = p; 2669 2670 return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics); 2671 } 2672 2673 static struct rt6_info *ip6_nh_lookup_table(struct net *net, 2674 struct fib6_config *cfg, 2675 const struct in6_addr *gw_addr, 2676 u32 tbid, int flags) 2677 { 2678 struct flowi6 fl6 = { 2679 .flowi6_oif = cfg->fc_ifindex, 2680 .daddr = *gw_addr, 2681 .saddr = cfg->fc_prefsrc, 2682 }; 2683 struct fib6_table *table; 2684 struct rt6_info *rt; 2685 2686 table = 
fib6_get_table(net, tbid); 2687 if (!table) 2688 return NULL; 2689 2690 if (!ipv6_addr_any(&cfg->fc_prefsrc)) 2691 flags |= RT6_LOOKUP_F_HAS_SADDR; 2692 2693 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE; 2694 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags); 2695 2696 /* if table lookup failed, fall back to full lookup */ 2697 if (rt == net->ipv6.ip6_null_entry) { 2698 ip6_rt_put(rt); 2699 rt = NULL; 2700 } 2701 2702 return rt; 2703 } 2704 2705 static int ip6_route_check_nh_onlink(struct net *net, 2706 struct fib6_config *cfg, 2707 const struct net_device *dev, 2708 struct netlink_ext_ack *extack) 2709 { 2710 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN; 2711 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2712 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT; 2713 struct rt6_info *grt; 2714 int err; 2715 2716 err = 0; 2717 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0); 2718 if (grt) { 2719 if (!grt->dst.error && 2720 (grt->rt6i_flags & flags || dev != grt->dst.dev)) { 2721 NL_SET_ERR_MSG(extack, 2722 "Nexthop has invalid gateway or device mismatch"); 2723 err = -EINVAL; 2724 } 2725 2726 ip6_rt_put(grt); 2727 } 2728 2729 return err; 2730 } 2731 2732 static int ip6_route_check_nh(struct net *net, 2733 struct fib6_config *cfg, 2734 struct net_device **_dev, 2735 struct inet6_dev **idev) 2736 { 2737 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2738 struct net_device *dev = _dev ? *_dev : NULL; 2739 struct rt6_info *grt = NULL; 2740 int err = -EHOSTUNREACH; 2741 2742 if (cfg->fc_table) { 2743 int flags = RT6_LOOKUP_F_IFACE; 2744 2745 grt = ip6_nh_lookup_table(net, cfg, gw_addr, 2746 cfg->fc_table, flags); 2747 if (grt) { 2748 if (grt->rt6i_flags & RTF_GATEWAY || 2749 (dev && dev != grt->dst.dev)) { 2750 ip6_rt_put(grt); 2751 grt = NULL; 2752 } 2753 } 2754 } 2755 2756 if (!grt) 2757 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1); 2758 2759 if (!grt) 2760 goto out; 2761 2762 if (dev) { 2763 if (dev != grt->dst.dev) { 2764 ip6_rt_put(grt); 2765 goto out; 2766 } 2767 } else { 2768 *_dev = dev = grt->dst.dev; 2769 *idev = grt->rt6i_idev; 2770 dev_hold(dev); 2771 in6_dev_hold(grt->rt6i_idev); 2772 } 2773 2774 if (!(grt->rt6i_flags & RTF_GATEWAY)) 2775 err = 0; 2776 2777 ip6_rt_put(grt); 2778 2779 out: 2780 return err; 2781 } 2782 2783 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg, 2784 struct net_device **_dev, struct inet6_dev **idev, 2785 struct netlink_ext_ack *extack) 2786 { 2787 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2788 int gwa_type = ipv6_addr_type(gw_addr); 2789 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true; 2790 const struct net_device *dev = *_dev; 2791 bool need_addr_check = !dev; 2792 int err = -EINVAL; 2793 2794 /* if gw_addr is local we will fail to detect this in case 2795 * address is still TENTATIVE (DAD in progress). rt6_lookup() 2796 * will return already-added prefix route via interface that 2797 * prefix route was assigned to, which might be non-loopback. 2798 */ 2799 if (dev && 2800 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 2801 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 2802 goto out; 2803 } 2804 2805 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) { 2806 /* IPv6 strictly inhibits using non-link-local 2807 * addresses as a nexthop address. 2808 * Otherwise, a router will not be able to send redirects. 2809 * It is very good, but in some (rare!)
circumstances 2810 * (SIT, PtP, NBMA NOARP links) it is handy to allow 2811 * some exceptions. --ANK 2812 * We allow IPv4-mapped nexthops to support RFC4798-type 2813 * addressing 2814 */ 2815 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) { 2816 NL_SET_ERR_MSG(extack, "Invalid gateway address"); 2817 goto out; 2818 } 2819 2820 if (cfg->fc_flags & RTNH_F_ONLINK) 2821 err = ip6_route_check_nh_onlink(net, cfg, dev, extack); 2822 else 2823 err = ip6_route_check_nh(net, cfg, _dev, idev); 2824 2825 if (err) 2826 goto out; 2827 } 2828 2829 /* reload in case device was changed */ 2830 dev = *_dev; 2831 2832 err = -EINVAL; 2833 if (!dev) { 2834 NL_SET_ERR_MSG(extack, "Egress device not specified"); 2835 goto out; 2836 } else if (dev->flags & IFF_LOOPBACK) { 2837 NL_SET_ERR_MSG(extack, 2838 "Egress device can not be loopback device for this route"); 2839 goto out; 2840 } 2841 2842 /* if we did not check gw_addr above, do so now that the 2843 * egress device has been resolved. 2844 */ 2845 if (need_addr_check && 2846 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 2847 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 2848 goto out; 2849 } 2850 2851 err = 0; 2852 out: 2853 return err; 2854 } 2855 2856 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, 2857 gfp_t gfp_flags, 2858 struct netlink_ext_ack *extack) 2859 { 2860 struct net *net = cfg->fc_nlinfo.nl_net; 2861 struct fib6_info *rt = NULL; 2862 struct net_device *dev = NULL; 2863 struct inet6_dev *idev = NULL; 2864 struct fib6_table *table; 2865 int addr_type; 2866 int err = -EINVAL; 2867 2868 /* RTF_PCPU is an internal flag; can not be set by userspace */ 2869 if (cfg->fc_flags & RTF_PCPU) { 2870 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU"); 2871 goto out; 2872 } 2873 2874 /* RTF_CACHE is an internal flag; can not be set by userspace */ 2875 if (cfg->fc_flags & RTF_CACHE) { 2876 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE"); 2877 goto out; 2878 } 2879 2880 if (cfg->fc_type > RTN_MAX) { 2881 NL_SET_ERR_MSG(extack, "Invalid route type"); 2882 goto out; 2883 } 2884 2885 if (cfg->fc_dst_len > 128) { 2886 NL_SET_ERR_MSG(extack, "Invalid prefix length"); 2887 goto out; 2888 } 2889 if (cfg->fc_src_len > 128) { 2890 NL_SET_ERR_MSG(extack, "Invalid source address length"); 2891 goto out; 2892 } 2893 #ifndef CONFIG_IPV6_SUBTREES 2894 if (cfg->fc_src_len) { 2895 NL_SET_ERR_MSG(extack, 2896 "Specifying source address requires IPV6_SUBTREES to be enabled"); 2897 goto out; 2898 } 2899 #endif 2900 if (cfg->fc_ifindex) { 2901 err = -ENODEV; 2902 dev = dev_get_by_index(net, cfg->fc_ifindex); 2903 if (!dev) 2904 goto out; 2905 idev = in6_dev_get(dev); 2906 if (!idev) 2907 goto out; 2908 } 2909 2910 if (cfg->fc_metric == 0) 2911 cfg->fc_metric = IP6_RT_PRIO_USER; 2912 2913 if (cfg->fc_flags & RTNH_F_ONLINK) { 2914 if (!dev) { 2915 NL_SET_ERR_MSG(extack, 2916 "Nexthop device required for onlink"); 2917 err = -ENODEV; 2918 goto out; 2919 } 2920 2921 if (!(dev->flags & IFF_UP)) { 2922 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 2923 err = -ENETDOWN; 2924 goto out; 2925 } 2926 } 2927 2928 err = -ENOBUFS; 2929 if (cfg->fc_nlinfo.nlh && 2930 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) { 2931 table = fib6_get_table(net, cfg->fc_table); 2932 if (!table) { 2933 pr_warn("NLM_F_CREATE should be specified when creating new route\n"); 2934 table = fib6_new_table(net, cfg->fc_table); 2935 } 2936 } else { 2937 table = fib6_new_table(net, cfg->fc_table); 2938 } 2939 2940 if (!table) 
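/* err is still -ENOBUFS from above */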
2941 goto out; 2942 2943 err = -ENOMEM; 2944 rt = fib6_info_alloc(gfp_flags); 2945 if (!rt) 2946 goto out; 2947 2948 if (cfg->fc_flags & RTF_ADDRCONF) 2949 rt->dst_nocount = true; 2950 2951 err = ip6_convert_metrics(net, rt, cfg); 2952 if (err < 0) 2953 goto out; 2954 2955 if (cfg->fc_flags & RTF_EXPIRES) 2956 fib6_set_expires(rt, jiffies + 2957 clock_t_to_jiffies(cfg->fc_expires)); 2958 else 2959 fib6_clean_expires(rt); 2960 2961 if (cfg->fc_protocol == RTPROT_UNSPEC) 2962 cfg->fc_protocol = RTPROT_BOOT; 2963 rt->fib6_protocol = cfg->fc_protocol; 2964 2965 addr_type = ipv6_addr_type(&cfg->fc_dst); 2966 2967 if (cfg->fc_encap) { 2968 struct lwtunnel_state *lwtstate; 2969 2970 err = lwtunnel_build_state(cfg->fc_encap_type, 2971 cfg->fc_encap, AF_INET6, cfg, 2972 &lwtstate, extack); 2973 if (err) 2974 goto out; 2975 rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate); 2976 } 2977 2978 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); 2979 rt->fib6_dst.plen = cfg->fc_dst_len; 2980 if (rt->fib6_dst.plen == 128) 2981 rt->dst_host = true; 2982 2983 #ifdef CONFIG_IPV6_SUBTREES 2984 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len); 2985 rt->fib6_src.plen = cfg->fc_src_len; 2986 #endif 2987 2988 rt->fib6_metric = cfg->fc_metric; 2989 rt->fib6_nh.nh_weight = 1; 2990 2991 rt->fib6_type = cfg->fc_type; 2992 2993 /* We cannot add true routes via loopback here, 2994 they would result in kernel looping; promote them to reject routes 2995 */ 2996 if ((cfg->fc_flags & RTF_REJECT) || 2997 (dev && (dev->flags & IFF_LOOPBACK) && 2998 !(addr_type & IPV6_ADDR_LOOPBACK) && 2999 !(cfg->fc_flags & RTF_LOCAL))) { 3000 /* hold loopback dev/idev if we haven't done so. */ 3001 if (dev != net->loopback_dev) { 3002 if (dev) { 3003 dev_put(dev); 3004 in6_dev_put(idev); 3005 } 3006 dev = net->loopback_dev; 3007 dev_hold(dev); 3008 idev = in6_dev_get(dev); 3009 if (!idev) { 3010 err = -ENODEV; 3011 goto out; 3012 } 3013 } 3014 rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP; 3015 goto install_route; 3016 } 3017 3018 if (cfg->fc_flags & RTF_GATEWAY) { 3019 err = ip6_validate_gw(net, cfg, &dev, &idev, extack); 3020 if (err) 3021 goto out; 3022 3023 rt->fib6_nh.nh_gw = cfg->fc_gateway; 3024 } 3025 3026 err = -ENODEV; 3027 if (!dev) 3028 goto out; 3029 3030 if (idev->cnf.disable_ipv6) { 3031 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device"); 3032 err = -EACCES; 3033 goto out; 3034 } 3035 3036 if (!(dev->flags & IFF_UP)) { 3037 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 3038 err = -ENETDOWN; 3039 goto out; 3040 } 3041 3042 if (!ipv6_addr_any(&cfg->fc_prefsrc)) { 3043 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { 3044 NL_SET_ERR_MSG(extack, "Invalid source address"); 3045 err = -EINVAL; 3046 goto out; 3047 } 3048 rt->fib6_prefsrc.addr = cfg->fc_prefsrc; 3049 rt->fib6_prefsrc.plen = 128; 3050 } else 3051 rt->fib6_prefsrc.plen = 0; 3052 3053 rt->fib6_flags = cfg->fc_flags; 3054 3055 install_route: 3056 if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) && 3057 !netif_carrier_ok(dev)) 3058 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN; 3059 rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK); 3060 rt->fib6_nh.nh_dev = dev; 3061 rt->fib6_table = table; 3062 3063 cfg->fc_nlinfo.nl_net = dev_net(dev); 3064 3065 if (idev) 3066 in6_dev_put(idev); 3067 3068 return rt; 3069 out: 3070 if (dev) 3071 dev_put(dev); 3072 if (idev) 3073 in6_dev_put(idev); 3074 3075 fib6_info_release(rt); 3076 return ERR_PTR(err); 3077 } 3078 3079 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, 
3080 struct netlink_ext_ack *extack) 3081 { 3082 struct fib6_info *rt; 3083 int err; 3084 3085 rt = ip6_route_info_create(cfg, gfp_flags, extack); 3086 if (IS_ERR(rt)) 3087 return PTR_ERR(rt); 3088 3089 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack); 3090 fib6_info_release(rt); 3091 3092 return err; 3093 } 3094 3095 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info) 3096 { 3097 struct net *net = info->nl_net; 3098 struct fib6_table *table; 3099 int err; 3100 3101 if (rt == net->ipv6.fib6_null_entry) { 3102 err = -ENOENT; 3103 goto out; 3104 } 3105 3106 table = rt->fib6_table; 3107 spin_lock_bh(&table->tb6_lock); 3108 err = fib6_del(rt, info); 3109 spin_unlock_bh(&table->tb6_lock); 3110 3111 out: 3112 fib6_info_release(rt); 3113 return err; 3114 } 3115 3116 int ip6_del_rt(struct net *net, struct fib6_info *rt) 3117 { 3118 struct nl_info info = { .nl_net = net }; 3119 3120 return __ip6_del_rt(rt, &info); 3121 } 3122 3123 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg) 3124 { 3125 struct nl_info *info = &cfg->fc_nlinfo; 3126 struct net *net = info->nl_net; 3127 struct sk_buff *skb = NULL; 3128 struct fib6_table *table; 3129 int err = -ENOENT; 3130 3131 if (rt == net->ipv6.fib6_null_entry) 3132 goto out_put; 3133 table = rt->fib6_table; 3134 spin_lock_bh(&table->tb6_lock); 3135 3136 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) { 3137 struct fib6_info *sibling, *next_sibling; 3138 3139 /* prefer to send a single notification with all hops */ 3140 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 3141 if (skb) { 3142 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; 3143 3144 if (rt6_fill_node(net, skb, rt, NULL, 3145 NULL, NULL, 0, RTM_DELROUTE, 3146 info->portid, seq, 0) < 0) { 3147 kfree_skb(skb); 3148 skb = NULL; 3149 } else 3150 info->skip_notify = 1; 3151 } 3152 3153 list_for_each_entry_safe(sibling, next_sibling, 3154 &rt->fib6_siblings, 3155 fib6_siblings) { 3156 err = fib6_del(sibling, info); 3157 if (err) 3158 goto out_unlock; 3159 } 3160 } 3161 3162 err = fib6_del(rt, info); 3163 out_unlock: 3164 spin_unlock_bh(&table->tb6_lock); 3165 out_put: 3166 fib6_info_release(rt); 3167 3168 if (skb) { 3169 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 3170 info->nlh, gfp_any()); 3171 } 3172 return err; 3173 } 3174 3175 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) 3176 { 3177 int rc = -ESRCH; 3178 3179 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex) 3180 goto out; 3181 3182 if (cfg->fc_flags & RTF_GATEWAY && 3183 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) 3184 goto out; 3185 if (dst_hold_safe(&rt->dst)) 3186 rc = rt6_remove_exception_rt(rt); 3187 out: 3188 return rc; 3189 } 3190 3191 static int ip6_route_del(struct fib6_config *cfg, 3192 struct netlink_ext_ack *extack) 3193 { 3194 struct rt6_info *rt_cache; 3195 struct fib6_table *table; 3196 struct fib6_info *rt; 3197 struct fib6_node *fn; 3198 int err = -ESRCH; 3199 3200 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); 3201 if (!table) { 3202 NL_SET_ERR_MSG(extack, "FIB table does not exist"); 3203 return err; 3204 } 3205 3206 rcu_read_lock(); 3207 3208 fn = fib6_locate(&table->tb6_root, 3209 &cfg->fc_dst, cfg->fc_dst_len, 3210 &cfg->fc_src, cfg->fc_src_len, 3211 !(cfg->fc_flags & RTF_CACHE)); 3212 3213 if (fn) { 3214 for_each_fib6_node_rt_rcu(fn) { 3215 if (cfg->fc_flags & RTF_CACHE) { 3216 int rc; 3217 3218 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst, 3219 &cfg->fc_src); 3220 if (rt_cache) { 3221 rc = 
ip6_del_cached_rt(rt_cache, cfg); 3222 if (rc != -ESRCH) 3223 return rc; 3224 } 3225 continue; 3226 } 3227 if (cfg->fc_ifindex && 3228 (!rt->fib6_nh.nh_dev || 3229 rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex)) 3230 continue; 3231 if (cfg->fc_flags & RTF_GATEWAY && 3232 !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw)) 3233 continue; 3234 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric) 3235 continue; 3236 if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol) 3237 continue; 3238 fib6_info_hold(rt); 3239 rcu_read_unlock(); 3240 3241 /* if gateway was specified only delete the one hop */ 3242 if (cfg->fc_flags & RTF_GATEWAY) 3243 return __ip6_del_rt(rt, &cfg->fc_nlinfo); 3244 3245 return __ip6_del_rt_siblings(rt, cfg); 3246 } 3247 } 3248 rcu_read_unlock(); 3249 3250 return err; 3251 } 3252 3253 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) 3254 { 3255 struct netevent_redirect netevent; 3256 struct rt6_info *rt, *nrt = NULL; 3257 struct ndisc_options ndopts; 3258 struct inet6_dev *in6_dev; 3259 struct neighbour *neigh; 3260 struct fib6_info *from; 3261 struct rd_msg *msg; 3262 int optlen, on_link; 3263 u8 *lladdr; 3264 3265 optlen = skb_tail_pointer(skb) - skb_transport_header(skb); 3266 optlen -= sizeof(*msg); 3267 3268 if (optlen < 0) { 3269 net_dbg_ratelimited("rt6_do_redirect: packet too short\n"); 3270 return; 3271 } 3272 3273 msg = (struct rd_msg *)icmp6_hdr(skb); 3274 3275 if (ipv6_addr_is_multicast(&msg->dest)) { 3276 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n"); 3277 return; 3278 } 3279 3280 on_link = 0; 3281 if (ipv6_addr_equal(&msg->dest, &msg->target)) { 3282 on_link = 1; 3283 } else if (ipv6_addr_type(&msg->target) != 3284 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { 3285 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n"); 3286 return; 3287 } 3288 3289 in6_dev = __in6_dev_get(skb->dev); 3290 if (!in6_dev) 3291 return; 3292 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects) 3293 return; 3294 3295 /* RFC2461 8.1: 3296 * The IP source address of the Redirect MUST be the same as the current 3297 * first-hop router for the specified ICMP Destination Address. 3298 */ 3299 3300 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) { 3301 net_dbg_ratelimited("rt6_redirect: invalid ND options\n"); 3302 return; 3303 } 3304 3305 lladdr = NULL; 3306 if (ndopts.nd_opts_tgt_lladdr) { 3307 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, 3308 skb->dev); 3309 if (!lladdr) { 3310 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n"); 3311 return; 3312 } 3313 } 3314 3315 rt = (struct rt6_info *) dst; 3316 if (rt->rt6i_flags & RTF_REJECT) { 3317 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n"); 3318 return; 3319 } 3320 3321 /* Redirect received -> path was valid. 3322 * Look, redirects are sent only in response to data packets, 3323 * so that this nexthop apparently is reachable. --ANK 3324 */ 3325 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr); 3326 3327 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1); 3328 if (!neigh) 3329 return; 3330 3331 /* 3332 * We have finally decided to accept it. 3333 */ 3334 3335 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, 3336 NEIGH_UPDATE_F_WEAK_OVERRIDE| 3337 NEIGH_UPDATE_F_OVERRIDE| 3338 (on_link ? 
0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER| 3339 NEIGH_UPDATE_F_ISROUTER)), 3340 NDISC_REDIRECT, &ndopts); 3341 3342 rcu_read_lock(); 3343 from = rcu_dereference(rt->from); if (!from) { /* rt was unlinked from its origin; nothing to attach the exception to */ rcu_read_unlock(); goto out; } 3344 fib6_info_hold(from); 3345 rcu_read_unlock(); 3346 3347 nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL); 3348 if (!nrt) 3349 goto out; 3350 3351 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE; 3352 if (on_link) 3353 nrt->rt6i_flags &= ~RTF_GATEWAY; 3354 3355 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; 3356 3357 /* No need to remove rt from the exception table if rt is 3358 * a cached route because rt6_insert_exception() will 3359 * take care of it 3360 */ 3361 if (rt6_insert_exception(nrt, from)) { 3362 dst_release_immediate(&nrt->dst); 3363 goto out; 3364 } 3365 3366 netevent.old = &rt->dst; 3367 netevent.new = &nrt->dst; 3368 netevent.daddr = &msg->dest; 3369 netevent.neigh = neigh; 3370 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); 3371 3372 out: 3373 fib6_info_release(from); 3374 neigh_release(neigh); 3375 } 3376 3377 #ifdef CONFIG_IPV6_ROUTE_INFO 3378 static struct fib6_info *rt6_get_route_info(struct net *net, 3379 const struct in6_addr *prefix, int prefixlen, 3380 const struct in6_addr *gwaddr, 3381 struct net_device *dev) 3382 { 3383 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; 3384 int ifindex = dev->ifindex; 3385 struct fib6_node *fn; 3386 struct fib6_info *rt = NULL; 3387 struct fib6_table *table; 3388 3389 table = fib6_get_table(net, tb_id); 3390 if (!table) 3391 return NULL; 3392 3393 rcu_read_lock(); 3394 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true); 3395 if (!fn) 3396 goto out; 3397 3398 for_each_fib6_node_rt_rcu(fn) { 3399 if (rt->fib6_nh.nh_dev->ifindex != ifindex) 3400 continue; 3401 if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY)) 3402 continue; 3403 if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr)) 3404 continue; 3405 fib6_info_hold(rt); 3406 break; 3407 } 3408 out: 3409 rcu_read_unlock(); 3410 return rt; 3411 } 3412 3413 static struct fib6_info *rt6_add_route_info(struct net *net, 3414 const struct in6_addr *prefix, int prefixlen, 3415 const struct in6_addr *gwaddr, 3416 struct net_device *dev, 3417 unsigned int pref) 3418 { 3419 struct fib6_config cfg = { 3420 .fc_metric = IP6_RT_PRIO_USER, 3421 .fc_ifindex = dev->ifindex, 3422 .fc_dst_len = prefixlen, 3423 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | 3424 RTF_UP | RTF_PREF(pref), 3425 .fc_protocol = RTPROT_RA, 3426 .fc_type = RTN_UNICAST, 3427 .fc_nlinfo.portid = 0, 3428 .fc_nlinfo.nlh = NULL, 3429 .fc_nlinfo.nl_net = net, 3430 }; 3431 3432 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; 3433 cfg.fc_dst = *prefix; 3434 cfg.fc_gateway = *gwaddr; 3435 3436 /* We should treat it as a default route if prefix length is 0. */ 3437 if (!prefixlen) 3438 cfg.fc_flags |= RTF_DEFAULT; 3439 3440 ip6_route_add(&cfg, GFP_ATOMIC, NULL); 3441 3442 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev); 3443 } 3444 #endif 3445 3446 struct fib6_info *rt6_get_dflt_router(struct net *net, 3447 const struct in6_addr *addr, 3448 struct net_device *dev) 3449 { 3450 u32 tb_id = l3mdev_fib_table(dev) ?
: RT6_TABLE_DFLT; 3451 struct fib6_info *rt; 3452 struct fib6_table *table; 3453 3454 table = fib6_get_table(net, tb_id); 3455 if (!table) 3456 return NULL; 3457 3458 rcu_read_lock(); 3459 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3460 if (dev == rt->fib6_nh.nh_dev && 3461 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && 3462 ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr)) 3463 break; 3464 } 3465 if (rt) 3466 fib6_info_hold(rt); 3467 rcu_read_unlock(); 3468 return rt; 3469 } 3470 3471 struct fib6_info *rt6_add_dflt_router(struct net *net, 3472 const struct in6_addr *gwaddr, 3473 struct net_device *dev, 3474 unsigned int pref) 3475 { 3476 struct fib6_config cfg = { 3477 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT, 3478 .fc_metric = IP6_RT_PRIO_USER, 3479 .fc_ifindex = dev->ifindex, 3480 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | 3481 RTF_UP | RTF_EXPIRES | RTF_PREF(pref), 3482 .fc_protocol = RTPROT_RA, 3483 .fc_type = RTN_UNICAST, 3484 .fc_nlinfo.portid = 0, 3485 .fc_nlinfo.nlh = NULL, 3486 .fc_nlinfo.nl_net = net, 3487 }; 3488 3489 cfg.fc_gateway = *gwaddr; 3490 3491 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) { 3492 struct fib6_table *table; 3493 3494 table = fib6_get_table(dev_net(dev), cfg.fc_table); 3495 if (table) 3496 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER; 3497 } 3498 3499 return rt6_get_dflt_router(net, gwaddr, dev); 3500 } 3501 3502 static void __rt6_purge_dflt_routers(struct net *net, 3503 struct fib6_table *table) 3504 { 3505 struct fib6_info *rt; 3506 3507 restart: 3508 rcu_read_lock(); 3509 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3510 struct net_device *dev = fib6_info_nh_dev(rt); 3511 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL; 3512 3513 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) && 3514 (!idev || idev->cnf.accept_ra != 2)) { 3515 fib6_info_hold(rt); 3516 rcu_read_unlock(); 3517 ip6_del_rt(net, rt); 3518 goto restart; 3519 } 3520 } 3521 rcu_read_unlock(); 3522 3523 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER; 3524 } 3525 3526 void rt6_purge_dflt_routers(struct net *net) 3527 { 3528 struct fib6_table *table; 3529 struct hlist_head *head; 3530 unsigned int h; 3531 3532 rcu_read_lock(); 3533 3534 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { 3535 head = &net->ipv6.fib_table_hash[h]; 3536 hlist_for_each_entry_rcu(table, head, tb6_hlist) { 3537 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER) 3538 __rt6_purge_dflt_routers(net, table); 3539 } 3540 } 3541 3542 rcu_read_unlock(); 3543 } 3544 3545 static void rtmsg_to_fib6_config(struct net *net, 3546 struct in6_rtmsg *rtmsg, 3547 struct fib6_config *cfg) 3548 { 3549 memset(cfg, 0, sizeof(*cfg)); 3550 3551 cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? 
3552 : RT6_TABLE_MAIN; 3553 cfg->fc_ifindex = rtmsg->rtmsg_ifindex; 3554 cfg->fc_metric = rtmsg->rtmsg_metric; 3555 cfg->fc_expires = rtmsg->rtmsg_info; 3556 cfg->fc_dst_len = rtmsg->rtmsg_dst_len; 3557 cfg->fc_src_len = rtmsg->rtmsg_src_len; 3558 cfg->fc_flags = rtmsg->rtmsg_flags; 3559 cfg->fc_type = rtmsg->rtmsg_type; 3560 3561 cfg->fc_nlinfo.nl_net = net; 3562 3563 cfg->fc_dst = rtmsg->rtmsg_dst; 3564 cfg->fc_src = rtmsg->rtmsg_src; 3565 cfg->fc_gateway = rtmsg->rtmsg_gateway; 3566 } 3567 3568 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) 3569 { 3570 struct fib6_config cfg; 3571 struct in6_rtmsg rtmsg; 3572 int err; 3573 3574 switch (cmd) { 3575 case SIOCADDRT: /* Add a route */ 3576 case SIOCDELRT: /* Delete a route */ 3577 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 3578 return -EPERM; 3579 err = copy_from_user(&rtmsg, arg, 3580 sizeof(struct in6_rtmsg)); 3581 if (err) 3582 return -EFAULT; 3583 3584 rtmsg_to_fib6_config(net, &rtmsg, &cfg); 3585 3586 rtnl_lock(); 3587 switch (cmd) { 3588 case SIOCADDRT: 3589 err = ip6_route_add(&cfg, GFP_KERNEL, NULL); 3590 break; 3591 case SIOCDELRT: 3592 err = ip6_route_del(&cfg, NULL); 3593 break; 3594 default: 3595 err = -EINVAL; 3596 } 3597 rtnl_unlock(); 3598 3599 return err; 3600 } 3601 3602 return -EINVAL; 3603 } 3604 3605 /* 3606 * Drop the packet on the floor 3607 */ 3608 3609 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) 3610 { 3611 int type; 3612 struct dst_entry *dst = skb_dst(skb); 3613 switch (ipstats_mib_noroutes) { 3614 case IPSTATS_MIB_INNOROUTES: 3615 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); 3616 if (type == IPV6_ADDR_ANY) { 3617 IP6_INC_STATS(dev_net(dst->dev), 3618 __in6_dev_get_safely(skb->dev), 3619 IPSTATS_MIB_INADDRERRORS); 3620 break; 3621 } 3622 /* FALLTHROUGH */ 3623 case IPSTATS_MIB_OUTNOROUTES: 3624 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), 3625 ipstats_mib_noroutes); 3626 break; 3627 } 3628 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0); 3629 kfree_skb(skb); 3630 return 0; 3631 } 3632 3633 static int ip6_pkt_discard(struct sk_buff *skb) 3634 { 3635 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES); 3636 } 3637 3638 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3639 { 3640 skb->dev = skb_dst(skb)->dev; 3641 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); 3642 } 3643 3644 static int ip6_pkt_prohibit(struct sk_buff *skb) 3645 { 3646 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES); 3647 } 3648 3649 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3650 { 3651 skb->dev = skb_dst(skb)->dev; 3652 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); 3653 } 3654 3655 /* 3656 * Allocate a dst for local (unicast / anycast) address. 
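* These are the host entries (plen 128) that back addresses assigned to an interface: they make destinations owned by this node resolve locally instead of being forwarded.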
3657 */ 3658 3659 struct fib6_info *addrconf_f6i_alloc(struct net *net, 3660 struct inet6_dev *idev, 3661 const struct in6_addr *addr, 3662 bool anycast, gfp_t gfp_flags) 3663 { 3664 u32 tb_id; 3665 struct net_device *dev = idev->dev; 3666 struct fib6_info *f6i; 3667 3668 f6i = fib6_info_alloc(gfp_flags); 3669 if (!f6i) 3670 return ERR_PTR(-ENOMEM); 3671 3672 f6i->dst_nocount = true; 3673 f6i->dst_host = true; 3674 f6i->fib6_protocol = RTPROT_KERNEL; 3675 f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP; 3676 if (anycast) { 3677 f6i->fib6_type = RTN_ANYCAST; 3678 f6i->fib6_flags |= RTF_ANYCAST; 3679 } else { 3680 f6i->fib6_type = RTN_LOCAL; 3681 f6i->fib6_flags |= RTF_LOCAL; 3682 } 3683 3684 f6i->fib6_nh.nh_gw = *addr; 3685 dev_hold(dev); 3686 f6i->fib6_nh.nh_dev = dev; 3687 f6i->fib6_dst.addr = *addr; 3688 f6i->fib6_dst.plen = 128; 3689 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL; 3690 f6i->fib6_table = fib6_get_table(net, tb_id); 3691 3692 return f6i; 3693 } 3694 3695 /* remove deleted ip from prefsrc entries */ 3696 struct arg_dev_net_ip { 3697 struct net_device *dev; 3698 struct net *net; 3699 struct in6_addr *addr; 3700 }; 3701 3702 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg) 3703 { 3704 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev; 3705 struct net *net = ((struct arg_dev_net_ip *)arg)->net; 3706 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; 3707 3708 if (((void *)rt->fib6_nh.nh_dev == dev || !dev) && 3709 rt != net->ipv6.fib6_null_entry && 3710 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) { 3711 spin_lock_bh(&rt6_exception_lock); 3712 /* remove prefsrc entry */ 3713 rt->fib6_prefsrc.plen = 0; 3714 /* need to update cache as well */ 3715 rt6_exceptions_remove_prefsrc(rt); 3716 spin_unlock_bh(&rt6_exception_lock); 3717 } 3718 return 0; 3719 } 3720 3721 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) 3722 { 3723 struct net *net = dev_net(ifp->idev->dev); 3724 struct arg_dev_net_ip adni = { 3725 .dev = ifp->idev->dev, 3726 .net = net, 3727 .addr = &ifp->addr, 3728 }; 3729 fib6_clean_all(net, fib6_remove_prefsrc, &adni); 3730 } 3731 3732 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY) 3733 3734 /* Remove routers and update dst entries when gateway turn into host. */ 3735 static int fib6_clean_tohost(struct fib6_info *rt, void *arg) 3736 { 3737 struct in6_addr *gateway = (struct in6_addr *)arg; 3738 3739 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && 3740 ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) { 3741 return -1; 3742 } 3743 3744 /* Further clean up cached routes in exception table. 3745 * This is needed because cached route may have a different 3746 * gateway than its 'parent' in the case of an ip redirect. 
3747 */ 3748 rt6_exceptions_clean_tohost(rt, gateway); 3749 3750 return 0; 3751 } 3752 3753 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) 3754 { 3755 fib6_clean_all(net, fib6_clean_tohost, gateway); 3756 } 3757 3758 struct arg_netdev_event { 3759 const struct net_device *dev; 3760 union { 3761 unsigned int nh_flags; 3762 unsigned long event; 3763 }; 3764 }; 3765 3766 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) 3767 { 3768 struct fib6_info *iter; 3769 struct fib6_node *fn; 3770 3771 fn = rcu_dereference_protected(rt->fib6_node, 3772 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3773 iter = rcu_dereference_protected(fn->leaf, 3774 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3775 while (iter) { 3776 if (iter->fib6_metric == rt->fib6_metric && 3777 rt6_qualify_for_ecmp(iter)) 3778 return iter; 3779 iter = rcu_dereference_protected(iter->rt6_next, 3780 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3781 } 3782 3783 return NULL; 3784 } 3785 3786 static bool rt6_is_dead(const struct fib6_info *rt) 3787 { 3788 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD || 3789 (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN && 3790 fib6_ignore_linkdown(rt))) 3791 return true; 3792 3793 return false; 3794 } 3795 3796 static int rt6_multipath_total_weight(const struct fib6_info *rt) 3797 { 3798 struct fib6_info *iter; 3799 int total = 0; 3800 3801 if (!rt6_is_dead(rt)) 3802 total += rt->fib6_nh.nh_weight; 3803 3804 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { 3805 if (!rt6_is_dead(iter)) 3806 total += iter->fib6_nh.nh_weight; 3807 } 3808 3809 return total; 3810 } 3811 3812 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total) 3813 { 3814 int upper_bound = -1; 3815 3816 if (!rt6_is_dead(rt)) { 3817 *weight += rt->fib6_nh.nh_weight; 3818 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, 3819 total) - 1; 3820 } 3821 atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound); 3822 } 3823 3824 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total) 3825 { 3826 struct fib6_info *iter; 3827 int weight = 0; 3828 3829 rt6_upper_bound_set(rt, &weight, total); 3830 3831 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3832 rt6_upper_bound_set(iter, &weight, total); 3833 } 3834 3835 void rt6_multipath_rebalance(struct fib6_info *rt) 3836 { 3837 struct fib6_info *first; 3838 int total; 3839 3840 /* In case the entire multipath route was marked for flushing, 3841 * then there is no need to rebalance upon the removal of every 3842 * sibling route. 3843 */ 3844 if (!rt->fib6_nsiblings || rt->should_flush) 3845 return; 3846 3847 /* During lookup routes are evaluated in order, so we need to 3848 * make sure upper bounds are assigned from the first sibling 3849 * onwards. 
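* Each live nexthop gets an upper bound in [0, 2^31 - 1] proportional to its cumulative weight; a flow takes the first nexthop whose bound is >= its 31-bit multipath hash (hash-threshold, cf. RFC 2992).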
3850 */ 3851 first = rt6_multipath_first_sibling(rt); 3852 if (WARN_ON_ONCE(!first)) 3853 return; 3854 3855 total = rt6_multipath_total_weight(first); 3856 rt6_multipath_upper_bound_set(first, total); 3857 } 3858 3859 static int fib6_ifup(struct fib6_info *rt, void *p_arg) 3860 { 3861 const struct arg_netdev_event *arg = p_arg; 3862 struct net *net = dev_net(arg->dev); 3863 3864 if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) { 3865 rt->fib6_nh.nh_flags &= ~arg->nh_flags; 3866 fib6_update_sernum_upto_root(net, rt); 3867 rt6_multipath_rebalance(rt); 3868 } 3869 3870 return 0; 3871 } 3872 3873 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags) 3874 { 3875 struct arg_netdev_event arg = { 3876 .dev = dev, 3877 { 3878 .nh_flags = nh_flags, 3879 }, 3880 }; 3881 3882 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev)) 3883 arg.nh_flags |= RTNH_F_LINKDOWN; 3884 3885 fib6_clean_all(dev_net(dev), fib6_ifup, &arg); 3886 } 3887 3888 static bool rt6_multipath_uses_dev(const struct fib6_info *rt, 3889 const struct net_device *dev) 3890 { 3891 struct fib6_info *iter; 3892 3893 if (rt->fib6_nh.nh_dev == dev) 3894 return true; 3895 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3896 if (iter->fib6_nh.nh_dev == dev) 3897 return true; 3898 3899 return false; 3900 } 3901 3902 static void rt6_multipath_flush(struct fib6_info *rt) 3903 { 3904 struct fib6_info *iter; 3905 3906 rt->should_flush = 1; 3907 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3908 iter->should_flush = 1; 3909 } 3910 3911 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt, 3912 const struct net_device *down_dev) 3913 { 3914 struct fib6_info *iter; 3915 unsigned int dead = 0; 3916 3917 if (rt->fib6_nh.nh_dev == down_dev || 3918 rt->fib6_nh.nh_flags & RTNH_F_DEAD) 3919 dead++; 3920 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3921 if (iter->fib6_nh.nh_dev == down_dev || 3922 iter->fib6_nh.nh_flags & RTNH_F_DEAD) 3923 dead++; 3924 3925 return dead; 3926 } 3927 3928 static void rt6_multipath_nh_flags_set(struct fib6_info *rt, 3929 const struct net_device *dev, 3930 unsigned int nh_flags) 3931 { 3932 struct fib6_info *iter; 3933 3934 if (rt->fib6_nh.nh_dev == dev) 3935 rt->fib6_nh.nh_flags |= nh_flags; 3936 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3937 if (iter->fib6_nh.nh_dev == dev) 3938 iter->fib6_nh.nh_flags |= nh_flags; 3939 } 3940 3941 /* called with write lock held for table with rt */ 3942 static int fib6_ifdown(struct fib6_info *rt, void *p_arg) 3943 { 3944 const struct arg_netdev_event *arg = p_arg; 3945 const struct net_device *dev = arg->dev; 3946 struct net *net = dev_net(dev); 3947 3948 if (rt == net->ipv6.fib6_null_entry) 3949 return 0; 3950 3951 switch (arg->event) { 3952 case NETDEV_UNREGISTER: 3953 return rt->fib6_nh.nh_dev == dev ? -1 : 0; 3954 case NETDEV_DOWN: 3955 if (rt->should_flush) 3956 return -1; 3957 if (!rt->fib6_nsiblings) 3958 return rt->fib6_nh.nh_dev == dev ? 
-1 : 0; 3959 if (rt6_multipath_uses_dev(rt, dev)) { 3960 unsigned int count; 3961 3962 count = rt6_multipath_dead_count(rt, dev); 3963 if (rt->fib6_nsiblings + 1 == count) { 3964 rt6_multipath_flush(rt); 3965 return -1; 3966 } 3967 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD | 3968 RTNH_F_LINKDOWN); 3969 fib6_update_sernum(net, rt); 3970 rt6_multipath_rebalance(rt); 3971 } 3972 return -2; 3973 case NETDEV_CHANGE: 3974 if (rt->fib6_nh.nh_dev != dev || 3975 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) 3976 break; 3977 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN; 3978 rt6_multipath_rebalance(rt); 3979 break; 3980 } 3981 3982 return 0; 3983 } 3984 3985 void rt6_sync_down_dev(struct net_device *dev, unsigned long event) 3986 { 3987 struct arg_netdev_event arg = { 3988 .dev = dev, 3989 { 3990 .event = event, 3991 }, 3992 }; 3993 3994 fib6_clean_all(dev_net(dev), fib6_ifdown, &arg); 3995 } 3996 3997 void rt6_disable_ip(struct net_device *dev, unsigned long event) 3998 { 3999 rt6_sync_down_dev(dev, event); 4000 rt6_uncached_list_flush_dev(dev_net(dev), dev); 4001 neigh_ifdown(&nd_tbl, dev); 4002 } 4003 4004 struct rt6_mtu_change_arg { 4005 struct net_device *dev; 4006 unsigned int mtu; 4007 }; 4008 4009 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg) 4010 { 4011 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; 4012 struct inet6_dev *idev; 4013 4014 /* In IPv6, PMTU discovery is not optional, 4015 so the RTAX_MTU lock cannot disable it. 4016 We still use this lock to block changes 4017 caused by addrconf/ndisc. 4018 */ 4019 4020 idev = __in6_dev_get(arg->dev); 4021 if (!idev) 4022 return 0; 4023 4024 /* For an administrative MTU increase, there is no way to discover 4025 an IPv6 PMTU increase, so the PMTU should be updated here. 4026 Since RFC 1981 doesn't cover administrative MTU increases, 4027 updating the PMTU on such an increase is a MUST. (i.e.
jumbo frame) 4028 */ 4029 if (rt->fib6_nh.nh_dev == arg->dev && 4030 !fib6_metric_locked(rt, RTAX_MTU)) { 4031 u32 mtu = rt->fib6_pmtu; 4032 4033 if (mtu >= arg->mtu || 4034 (mtu < arg->mtu && mtu == idev->cnf.mtu6)) 4035 fib6_metric_set(rt, RTAX_MTU, arg->mtu); 4036 4037 spin_lock_bh(&rt6_exception_lock); 4038 rt6_exceptions_update_pmtu(idev, rt, arg->mtu); 4039 spin_unlock_bh(&rt6_exception_lock); 4040 } 4041 return 0; 4042 } 4043 4044 void rt6_mtu_change(struct net_device *dev, unsigned int mtu) 4045 { 4046 struct rt6_mtu_change_arg arg = { 4047 .dev = dev, 4048 .mtu = mtu, 4049 }; 4050 4051 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg); 4052 } 4053 4054 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { 4055 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) }, 4056 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) }, 4057 [RTA_OIF] = { .type = NLA_U32 }, 4058 [RTA_IIF] = { .type = NLA_U32 }, 4059 [RTA_PRIORITY] = { .type = NLA_U32 }, 4060 [RTA_METRICS] = { .type = NLA_NESTED }, 4061 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, 4062 [RTA_PREF] = { .type = NLA_U8 }, 4063 [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, 4064 [RTA_ENCAP] = { .type = NLA_NESTED }, 4065 [RTA_EXPIRES] = { .type = NLA_U32 }, 4066 [RTA_UID] = { .type = NLA_U32 }, 4067 [RTA_MARK] = { .type = NLA_U32 }, 4068 [RTA_TABLE] = { .type = NLA_U32 }, 4069 }; 4070 4071 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, 4072 struct fib6_config *cfg, 4073 struct netlink_ext_ack *extack) 4074 { 4075 struct rtmsg *rtm; 4076 struct nlattr *tb[RTA_MAX+1]; 4077 unsigned int pref; 4078 int err; 4079 4080 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, 4081 NULL); 4082 if (err < 0) 4083 goto errout; 4084 4085 err = -EINVAL; 4086 rtm = nlmsg_data(nlh); 4087 memset(cfg, 0, sizeof(*cfg)); 4088 4089 cfg->fc_table = rtm->rtm_table; 4090 cfg->fc_dst_len = rtm->rtm_dst_len; 4091 cfg->fc_src_len = rtm->rtm_src_len; 4092 cfg->fc_flags = RTF_UP; 4093 cfg->fc_protocol = rtm->rtm_protocol; 4094 cfg->fc_type = rtm->rtm_type; 4095 4096 if (rtm->rtm_type == RTN_UNREACHABLE || 4097 rtm->rtm_type == RTN_BLACKHOLE || 4098 rtm->rtm_type == RTN_PROHIBIT || 4099 rtm->rtm_type == RTN_THROW) 4100 cfg->fc_flags |= RTF_REJECT; 4101 4102 if (rtm->rtm_type == RTN_LOCAL) 4103 cfg->fc_flags |= RTF_LOCAL; 4104 4105 if (rtm->rtm_flags & RTM_F_CLONED) 4106 cfg->fc_flags |= RTF_CACHE; 4107 4108 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK); 4109 4110 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid; 4111 cfg->fc_nlinfo.nlh = nlh; 4112 cfg->fc_nlinfo.nl_net = sock_net(skb->sk); 4113 4114 if (tb[RTA_GATEWAY]) { 4115 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); 4116 cfg->fc_flags |= RTF_GATEWAY; 4117 } 4118 4119 if (tb[RTA_DST]) { 4120 int plen = (rtm->rtm_dst_len + 7) >> 3; 4121 4122 if (nla_len(tb[RTA_DST]) < plen) 4123 goto errout; 4124 4125 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen); 4126 } 4127 4128 if (tb[RTA_SRC]) { 4129 int plen = (rtm->rtm_src_len + 7) >> 3; 4130 4131 if (nla_len(tb[RTA_SRC]) < plen) 4132 goto errout; 4133 4134 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen); 4135 } 4136 4137 if (tb[RTA_PREFSRC]) 4138 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]); 4139 4140 if (tb[RTA_OIF]) 4141 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]); 4142 4143 if (tb[RTA_PRIORITY]) 4144 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]); 4145 4146 if (tb[RTA_METRICS]) { 4147 cfg->fc_mx = nla_data(tb[RTA_METRICS]); 4148 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]); 4149 } 4150 4151 if (tb[RTA_TABLE]) 
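/* RTA_TABLE is a u32 and overrides the one-byte rtm_table, allowing table IDs above 255 */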
4152 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]); 4153 4154 if (tb[RTA_MULTIPATH]) { 4155 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]); 4156 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); 4157 4158 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp, 4159 cfg->fc_mp_len, extack); 4160 if (err < 0) 4161 goto errout; 4162 } 4163 4164 if (tb[RTA_PREF]) { 4165 pref = nla_get_u8(tb[RTA_PREF]); 4166 if (pref != ICMPV6_ROUTER_PREF_LOW && 4167 pref != ICMPV6_ROUTER_PREF_HIGH) 4168 pref = ICMPV6_ROUTER_PREF_MEDIUM; 4169 cfg->fc_flags |= RTF_PREF(pref); 4170 } 4171 4172 if (tb[RTA_ENCAP]) 4173 cfg->fc_encap = tb[RTA_ENCAP]; 4174 4175 if (tb[RTA_ENCAP_TYPE]) { 4176 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); 4177 4178 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack); 4179 if (err < 0) 4180 goto errout; 4181 } 4182 4183 if (tb[RTA_EXPIRES]) { 4184 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ); 4185 4186 if (addrconf_finite_timeout(timeout)) { 4187 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ); 4188 cfg->fc_flags |= RTF_EXPIRES; 4189 } 4190 } 4191 4192 err = 0; 4193 errout: 4194 return err; 4195 } 4196 4197 struct rt6_nh { 4198 struct fib6_info *fib6_info; 4199 struct fib6_config r_cfg; 4200 struct list_head next; 4201 }; 4202 4203 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list) 4204 { 4205 struct rt6_nh *nh; 4206 4207 list_for_each_entry(nh, rt6_nh_list, next) { 4208 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n", 4209 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway, 4210 nh->r_cfg.fc_ifindex); 4211 } 4212 } 4213 4214 static int ip6_route_info_append(struct net *net, 4215 struct list_head *rt6_nh_list, 4216 struct fib6_info *rt, 4217 struct fib6_config *r_cfg) 4218 { 4219 struct rt6_nh *nh; 4220 int err = -EEXIST; 4221 4222 list_for_each_entry(nh, rt6_nh_list, next) { 4223 /* check if fib6_info already exists */ 4224 if (rt6_duplicate_nexthop(nh->fib6_info, rt)) 4225 return err; 4226 } 4227 4228 nh = kzalloc(sizeof(*nh), GFP_KERNEL); 4229 if (!nh) 4230 return -ENOMEM; 4231 nh->fib6_info = rt; 4232 err = ip6_convert_metrics(net, rt, r_cfg); 4233 if (err) { 4234 kfree(nh); 4235 return err; 4236 } 4237 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); 4238 list_add_tail(&nh->next, rt6_nh_list); 4239 4240 return 0; 4241 } 4242 4243 static void ip6_route_mpath_notify(struct fib6_info *rt, 4244 struct fib6_info *rt_last, 4245 struct nl_info *info, 4246 __u16 nlflags) 4247 { 4248 /* if this is an APPEND route, then rt points to the first route 4249 * inserted and rt_last points to last route inserted. Userspace 4250 * wants a consistent dump of the route which starts at the first 4251 * nexthop. 
static void ip6_route_mpath_notify(struct fib6_info *rt,
				   struct fib6_info *rt_last,
				   struct nl_info *info,
				   __u16 nlflags)
{
	/* if this is an APPEND route, then rt points to the first route
	 * inserted and rt_last points to the last route inserted.
	 * Userspace wants a consistent dump of the route which starts at
	 * the first nexthop.  Since sibling routes are always added at the
	 * end of the list, find the first sibling of the last route
	 * appended.
	 */
	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
		rt = list_first_entry(&rt_last->fib6_siblings,
				      struct fib6_info,
				      fib6_siblings);
	}

	if (rt)
		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
}
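
/* Usage sketch (annotation, not from the original source): an append such as
 *	ip -6 route append 2001:db8::/64 nexthop via fe80::2 dev eth0
 * sets NLM_F_APPEND, so the notification built here describes the whole
 * resulting ECMP set starting from its first sibling, not merely the
 * nexthop that happened to be added last.
 */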
static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
	struct nl_info *info = &cfg->fc_nlinfo;
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	struct fib6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	__u16 nlflags;
	int remaining;
	int attrlen;
	int err = 1;
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);

	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * fib6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}

		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}

		rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;

		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
					    rt, &r_cfg);
		if (err) {
			fib6_info_release(rt);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* for add and replace, send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done
	 */
	info->skip_notify = 1;

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		rt_last = nh->fib6_info;
		err = __ip6_ins_rt(nh->fib6_info, info, extack);
		fib6_info_release(nh->fib6_info);

		/* save reference to first route for notification */
		if (!rt_notif && !err)
			rt_notif = nh->fib6_info;

		/* nh->fib6_info is used or freed at this point, reset to NULL */
		nh->fib6_info = NULL;
		if (err) {
			if (replace && nhn)
				ip6_print_replace_route_err(&rt6_nh_list);
			err_nh = nh;
			goto add_errout;
		}

		/* Because each route is added like a single route, we remove
		 * these flags after the first nexthop: if there is a
		 * collision, we have already failed to add the first nexthop
		 * (fib6_add_rt2node() has rejected it); when replacing, the
		 * old nexthops have been replaced by the first new one, and
		 * the rest should be added to it.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		nhn++;
	}

	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	goto cleanup;

add_errout:
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg, extack);
	}

cleanup:
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		if (nh->fib6_info)
			fib6_info_release(nh->fib6_info);
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}

static int ip6_route_multipath_del(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	int remaining;
	int attrlen;
	int err = 1, last_err = 0;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry */
	while (rtnh_ok(rtnh, remaining)) {
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
		}
		err = ip6_route_del(&r_cfg, extack);
		if (err)
			last_err = err;

		rtnh = rtnh_next(rtnh, &remaining);
	}

	return last_err;
}

static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct fib6_config cfg;
	int err;

	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
	if (err < 0)
		return err;

	if (cfg.fc_mp)
		return ip6_route_multipath_del(&cfg, extack);
	else {
		cfg.fc_delete_all_nh = 1;
		return ip6_route_del(&cfg, extack);
	}
}

static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct fib6_config cfg;
	int err;

	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
	if (err < 0)
		return err;

	if (cfg.fc_mp)
		return ip6_route_multipath_add(&cfg, extack);
	else
		return ip6_route_add(&cfg, GFP_KERNEL, extack);
}
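
/* Illustrative request (annotation, not from the original source):
 *	ip -6 route add 2001:db8::/64 \
 *		nexthop via fe80::1 dev eth0 weight 1 \
 *		nexthop via fe80::2 dev eth0 weight 2
 * Each "nexthop" clause is encoded as a struct rtnexthop inside
 * RTA_MULTIPATH; ip6_route_multipath_add() above creates one fib6_info per
 * clause (note the rtnh_hops = weight - 1 encoding) and inserts them as
 * siblings, sending a single notification for the whole set.
 */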
static size_t rt6_nlmsg_size(struct fib6_info *rt)
{
	int nexthop_len = 0;

	if (rt->fib6_nsiblings) {
		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
			    + NLA_ALIGN(sizeof(struct rtnexthop))
			    + nla_total_size(16) /* RTA_GATEWAY */
			    + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);

		nexthop_len *= rt->fib6_nsiblings;
	}

	return NLMSG_ALIGN(sizeof(struct rtmsg))
	       + nla_total_size(16) /* RTA_SRC */
	       + nla_total_size(16) /* RTA_DST */
	       + nla_total_size(16) /* RTA_GATEWAY */
	       + nla_total_size(16) /* RTA_PREFSRC */
	       + nla_total_size(4) /* RTA_TABLE */
	       + nla_total_size(4) /* RTA_IIF */
	       + nla_total_size(4) /* RTA_OIF */
	       + nla_total_size(4) /* RTA_PRIORITY */
	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
	       + nla_total_size(sizeof(struct rta_cacheinfo))
	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
	       + nla_total_size(1) /* RTA_PREF */
	       + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
	       + nexthop_len;
}

static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
			    unsigned int *flags, bool skip_oif)
{
	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		*flags |= RTNH_F_DEAD;

	if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
		*flags |= RTNH_F_LINKDOWN;

		rcu_read_lock();
		if (fib6_ignore_linkdown(rt))
			*flags |= RTNH_F_DEAD;
		rcu_read_unlock();
	}

	if (rt->fib6_flags & RTF_GATEWAY) {
		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
			goto nla_put_failure;
	}

	*flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
	if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
		*flags |= RTNH_F_OFFLOAD;

	/* not needed for multipath encoding b/c it has a rtnexthop struct */
	if (!skip_oif && rt->fib6_nh.nh_dev &&
	    nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
		goto nla_put_failure;

	if (rt->fib6_nh.nh_lwtstate &&
	    lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

/* add multipath next hop */
static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
{
	const struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rtnexthop *rtnh;
	unsigned int flags = 0;

	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
	if (!rtnh)
		goto nla_put_failure;

	rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
	rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;

	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
		goto nla_put_failure;

	rtnh->rtnh_flags = flags;

	/* length of rtnetlink header + attributes */
	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
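
/* Worked example (annotation): for a two-nexthop ECMP route without encap,
 * rt6_nlmsg_size() reserves the fixed part (rtmsg plus the address, table
 * and metric attributes) plus two copies of the RTA_MULTIPATH header +
 * rtnexthop + RTA_GATEWAY reservation.  The estimate deliberately errs on
 * the large side; if rt6_fill_node() nevertheless runs out of room, the
 * -EMSGSIZE is treated as a bug (see the WARN_ON in inet6_rt_notify()
 * below).
 */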
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	struct rtmsg *rtm;
	struct nlmsghdr *nlh;
	long expires = 0;
	u32 *pmetrics;
	u32 table;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt->fib6_dst.plen;
	rtm->rtm_src_len = rt->fib6_src.plen;
	rtm->rtm_tos = 0;
	if (rt->fib6_table)
		table = rt->fib6_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	rtm->rtm_table = table;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;

	rtm->rtm_type = rt->fib6_type;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->fib6_protocol;

	if (rt->fib6_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	if (dest) {
		if (nla_put_in6_addr(skb, RTA_DST, dest))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt->fib6_dst.addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt->fib6_src.addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		if (ipv6_addr_is_multicast(&rt->fib6_dst.addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dest) {
		struct in6_addr saddr_buf;

		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->fib6_prefsrc.plen) {
		struct in6_addr saddr_buf;

		saddr_buf = rt->fib6_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (rt6_add_nexthop(skb, rt) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings, fib6_siblings) {
			if (rt6_add_nexthop(skb, sibling) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
			goto nla_put_failure;
	}

	if (rt->fib6_flags & RTF_EXPIRES) {
		expires = dst ? dst->expires : rt->expires;
		expires -= jiffies;
	}

	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->fib6_flags)))
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
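
/* Annotation (editor's note): rt6_fill_node() serves three callers with one
 * encoder: table dumps (dst == NULL, addresses taken from the fib6_info),
 * RTM_GETROUTE replies (dst/dest/src from the lookup result, with
 * rtm_dst_len forced to 128 for the resolved destination) and netlink
 * notifications.  Keeping a single encoder is what allows rt6_nlmsg_size()
 * above to bound all of them.
 */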
int rt6_dump_route(struct fib6_info *rt, void *p_arg)
{
	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
	struct net *net = arg->net;

	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);

		/* user wants prefix routes only */
		if (rtm->rtm_flags & RTM_F_PREFIX &&
		    !(rt->fib6_flags & RTF_PREFIX_RT)) {
			/* success since this is not a prefix route */
			return 1;
		}
	}

	return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
			     RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
			     arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
}
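
/* Annotation: a non-negative return lets the fib6 walker continue with the
 * next entry, so the "return 1" above means "filtered out, but successful";
 * only a negative return (the skb ran out of space in rt6_fill_node())
 * suspends the dump so it can resume on the next recvmsg() round.  A dump
 * request that sets RTM_F_PREFIX therefore only sees RTF_PREFIX_RT entries.
 */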
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	int err, iif = 0, oif = 0;
	struct fib6_info *from;
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6;
	bool fibmatch;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	memset(&fl6, 0, sizeof(fl6));
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

	if (iif) {
		struct net_device *dev;
		int flags = 0;

		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			rcu_read_unlock();
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);

		rcu_read_unlock();
	} else {
		fl6.flowi6_oif = oif;

		dst = ip6_route_output(net, NULL, &fl6);
	}

	rt = container_of(dst, struct rt6_info, dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	skb_dst_set(skb, &rt->dst);

	rcu_read_lock();
	from = rcu_dereference(rt->from);

	if (fibmatch)
		err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	else
		err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
				    &fl6.saddr, iif, RTM_NEWROUTE,
				    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
				    0);
	rcu_read_unlock();

	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}

void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
		     unsigned int nlm_flags)
{
	struct sk_buff *skb;
	struct net *net = info->nl_net;
	u32 seq;
	int err;

	err = -ENOBUFS;
	seq = info->nlh ? info->nlh->nlmsg_seq : 0;

	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
	if (!skb)
		goto errout;

	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
			    event, info->portid, seq, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}
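
/* Usage sketch (annotation): a process subscribed to RTNLGRP_IPV6_ROUTE,
 * e.g. "ip -6 monitor route", receives the RTM_NEWROUTE/RTM_DELROUTE
 * messages built here.  On allocation failure, rtnl_set_sk_err() marks the
 * listeners' sockets with ENOBUFS so they can detect the lost event and
 * re-dump the table.
 */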
static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	} else if (event == NETDEV_UNREGISTER &&
		   dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER can be fired multiple times by
		 * netdev_wait_allrefs(). Make sure we only call this once.
		 */
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}

/*
 *	/proc
 */

#ifdef CONFIG_PROC_FS

static const struct file_operations ipv6_route_proc_fops = {
	.open		= ipv6_route_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};

static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;

	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);

	return 0;
}

static int rt6_stats_seq_open(struct inode *inode, struct file *file)
{
	return single_open_net(inode, file, rt6_stats_seq_show);
}

static const struct file_operations rt6_stats_seq_fops = {
	.open		= rt6_stats_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release_net,
};
#endif	/* CONFIG_PROC_FS */
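
/* Annotation (editor's note): /proc/net/rt6_stats prints the seven counters
 * above as space-separated hex words, in order: fib nodes, route nodes,
 * route allocations, route entries, cached routes, current dst entries and
 * discarded routes.  A hypothetical "0032 001c 0001 0004 0000 0003 0000"
 * would therefore mean 0x32 fib nodes and 0x1c route nodes; the sixth field
 * is a point-in-time count, not a cumulative total.
 */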
#ifdef CONFIG_SYSCTL

static
int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
			      void __user *buffer, size_t *lenp, loff_t *ppos)
{
	struct net *net;
	int delay;
	int ret;

	if (!write)
		return -EINVAL;

	net = (struct net *)ctl->extra1;
	delay = net->ipv6.sysctl.flush_delay;
	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
	if (ret)
		return ret;

	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
	return 0;
}

struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	= "flush",
		.data		= &init_net.ipv6.sysctl.flush_delay,
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv6_sysctl_rtcache_flush
	},
	{
		.procname	= "gc_thresh",
		.data		= &ip6_dst_ops_template.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_min_interval",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{ }
};

struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
#endif
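
/* Usage sketch (annotation, not from the original source): with the table
 * above,
 *	echo 1 > /proc/sys/net/ipv6/route/flush
 * invokes ipv6_sysctl_rtcache_flush(), which kicks fib6_run_gc() using the
 * flush_delay value that was configured before this write (the new value
 * only affects subsequent flushes).  The file is write-only (mode 0200),
 * and ipv6_route_sysctl_init() hides the whole table from non-init user
 * namespaces by truncating it at entry 0.
 */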
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}

static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.fib6_null_entry);
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}

static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
	proc_create("rt6_stats", 0444, net->proc_net, &rt6_stats_seq_fops);
#endif
	return 0;
}

static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}

static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};

static int __net_init ipv6_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv6.peers = bp;
	return 0;
}

static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv6.peers;

	net->ipv6.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}
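
/* Annotation (editor's note): ip6_route_net_init() above unwinds allocation
 * failures with the usual goto ladder: each label releases what was
 * allocated before the failing step, in reverse order, and the single
 * "goto out" funnels every path through one return.  ip6_route_net_exit()
 * must stay the mirror image of the successful path.
 */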
static struct pernet_operations ipv6_inetpeer_ops = {
	.init = ipv6_inetpeer_init,
	.exit = ipv6_inetpeer_exit,
};

static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};

static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};

void __init ip6_route_init_special_entries(void)
{
	/* Registration of the loopback device happens before this portion
	 * of code, so the loopback reference in rt6_info has not been
	 * taken yet; do it manually for init_net.
	 */
	init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#endif
}
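
/* Annotation (editor's note): the ADDRCONF_NOTIFY_PRIORITY - 10 used for
 * ip6_route_dev_notifier above makes it run after addrconf's own notifier
 * for the same NETDEV event, so by the time ip6_route_dev_notify() calls
 * in6_dev_get() for a registering loopback device, the inet6_dev it needs
 * should already exist.
 */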
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}

void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}
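
/* Annotation (editor's note): ip6_route_cleanup() tears everything down in
 * the exact reverse of the registration order in ip6_route_init(); when
 * adding a new registration, keep the two functions and the error ladder
 * in ip6_route_init() in sync.
 */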