1 /* 2 * Linux INET6 implementation 3 * FIB front-end. 4 * 5 * Authors: 6 * Pedro Roque <roque@di.fc.ul.pt> 7 * 8 * This program is free software; you can redistribute it and/or 9 * modify it under the terms of the GNU General Public License 10 * as published by the Free Software Foundation; either version 11 * 2 of the License, or (at your option) any later version. 12 */ 13 14 /* Changes: 15 * 16 * YOSHIFUJI Hideaki @USAGI 17 * reworked default router selection. 18 * - respect outgoing interface 19 * - select from (probably) reachable routers (i.e. 20 * routers in REACHABLE, STALE, DELAY or PROBE states). 21 * - always select the same router if it is (probably) 22 * reachable. otherwise, round-robin the list. 23 * Ville Nuorvala 24 * Fixed routing subtrees. 25 */ 26 27 #define pr_fmt(fmt) "IPv6: " fmt 28 29 #include <linux/capability.h> 30 #include <linux/errno.h> 31 #include <linux/export.h> 32 #include <linux/types.h> 33 #include <linux/times.h> 34 #include <linux/socket.h> 35 #include <linux/sockios.h> 36 #include <linux/net.h> 37 #include <linux/route.h> 38 #include <linux/netdevice.h> 39 #include <linux/in6.h> 40 #include <linux/mroute6.h> 41 #include <linux/init.h> 42 #include <linux/if_arp.h> 43 #include <linux/proc_fs.h> 44 #include <linux/seq_file.h> 45 #include <linux/nsproxy.h> 46 #include <linux/slab.h> 47 #include <linux/jhash.h> 48 #include <net/net_namespace.h> 49 #include <net/snmp.h> 50 #include <net/ipv6.h> 51 #include <net/ip6_fib.h> 52 #include <net/ip6_route.h> 53 #include <net/ndisc.h> 54 #include <net/addrconf.h> 55 #include <net/tcp.h> 56 #include <linux/rtnetlink.h> 57 #include <net/dst.h> 58 #include <net/dst_metadata.h> 59 #include <net/xfrm.h> 60 #include <net/netevent.h> 61 #include <net/netlink.h> 62 #include <net/nexthop.h> 63 #include <net/lwtunnel.h> 64 #include <net/ip_tunnels.h> 65 #include <net/l3mdev.h> 66 #include <net/ip.h> 67 #include <linux/uaccess.h> 68 69 #ifdef CONFIG_SYSCTL 70 #include <linux/sysctl.h> 71 #endif 72 73 
static int ip6_rt_type_to_error(u8 fib6_type); 74 75 #define CREATE_TRACE_POINTS 76 #include <trace/events/fib6.h> 77 EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup); 78 #undef CREATE_TRACE_POINTS 79 80 enum rt6_nud_state { 81 RT6_NUD_FAIL_HARD = -3, 82 RT6_NUD_FAIL_PROBE = -2, 83 RT6_NUD_FAIL_DO_RR = -1, 84 RT6_NUD_SUCCEED = 1 85 }; 86 87 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie); 88 static unsigned int ip6_default_advmss(const struct dst_entry *dst); 89 static unsigned int ip6_mtu(const struct dst_entry *dst); 90 static struct dst_entry *ip6_negative_advice(struct dst_entry *); 91 static void ip6_dst_destroy(struct dst_entry *); 92 static void ip6_dst_ifdown(struct dst_entry *, 93 struct net_device *dev, int how); 94 static int ip6_dst_gc(struct dst_ops *ops); 95 96 static int ip6_pkt_discard(struct sk_buff *skb); 97 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb); 98 static int ip6_pkt_prohibit(struct sk_buff *skb); 99 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb); 100 static void ip6_link_failure(struct sk_buff *skb); 101 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, 102 struct sk_buff *skb, u32 mtu); 103 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, 104 struct sk_buff *skb); 105 static int rt6_score_route(struct fib6_info *rt, int oif, int strict); 106 static size_t rt6_nlmsg_size(struct fib6_info *rt); 107 static int rt6_fill_node(struct net *net, struct sk_buff *skb, 108 struct fib6_info *rt, struct dst_entry *dst, 109 struct in6_addr *dest, struct in6_addr *src, 110 int iif, int type, u32 portid, u32 seq, 111 unsigned int flags); 112 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt, 113 struct in6_addr *daddr, 114 struct in6_addr *saddr); 115 116 #ifdef CONFIG_IPV6_ROUTE_INFO 117 static struct fib6_info *rt6_add_route_info(struct net *net, 118 const struct in6_addr *prefix, int 
prefixlen, 119 const struct in6_addr *gwaddr, 120 struct net_device *dev, 121 unsigned int pref); 122 static struct fib6_info *rt6_get_route_info(struct net *net, 123 const struct in6_addr *prefix, int prefixlen, 124 const struct in6_addr *gwaddr, 125 struct net_device *dev); 126 #endif 127 128 struct uncached_list { 129 spinlock_t lock; 130 struct list_head head; 131 }; 132 133 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list); 134 135 void rt6_uncached_list_add(struct rt6_info *rt) 136 { 137 struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list); 138 139 rt->rt6i_uncached_list = ul; 140 141 spin_lock_bh(&ul->lock); 142 list_add_tail(&rt->rt6i_uncached, &ul->head); 143 spin_unlock_bh(&ul->lock); 144 } 145 146 void rt6_uncached_list_del(struct rt6_info *rt) 147 { 148 if (!list_empty(&rt->rt6i_uncached)) { 149 struct uncached_list *ul = rt->rt6i_uncached_list; 150 struct net *net = dev_net(rt->dst.dev); 151 152 spin_lock_bh(&ul->lock); 153 list_del(&rt->rt6i_uncached); 154 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache); 155 spin_unlock_bh(&ul->lock); 156 } 157 } 158 159 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev) 160 { 161 struct net_device *loopback_dev = net->loopback_dev; 162 int cpu; 163 164 if (dev == loopback_dev) 165 return; 166 167 for_each_possible_cpu(cpu) { 168 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); 169 struct rt6_info *rt; 170 171 spin_lock_bh(&ul->lock); 172 list_for_each_entry(rt, &ul->head, rt6i_uncached) { 173 struct inet6_dev *rt_idev = rt->rt6i_idev; 174 struct net_device *rt_dev = rt->dst.dev; 175 176 if (rt_idev->dev == dev) { 177 rt->rt6i_idev = in6_dev_get(loopback_dev); 178 in6_dev_put(rt_idev); 179 } 180 181 if (rt_dev == dev) { 182 rt->dst.dev = loopback_dev; 183 dev_hold(rt->dst.dev); 184 dev_put(rt_dev); 185 } 186 } 187 spin_unlock_bh(&ul->lock); 188 } 189 } 190 191 static inline const void *choose_neigh_daddr(const struct in6_addr *p, 192 struct 
sk_buff *skb, 193 const void *daddr) 194 { 195 if (!ipv6_addr_any(p)) 196 return (const void *) p; 197 else if (skb) 198 return &ipv6_hdr(skb)->daddr; 199 return daddr; 200 } 201 202 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw, 203 struct net_device *dev, 204 struct sk_buff *skb, 205 const void *daddr) 206 { 207 struct neighbour *n; 208 209 daddr = choose_neigh_daddr(gw, skb, daddr); 210 n = __ipv6_neigh_lookup(dev, daddr); 211 if (n) 212 return n; 213 return neigh_create(&nd_tbl, daddr, dev); 214 } 215 216 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst, 217 struct sk_buff *skb, 218 const void *daddr) 219 { 220 const struct rt6_info *rt = container_of(dst, struct rt6_info, dst); 221 222 return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr); 223 } 224 225 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr) 226 { 227 struct net_device *dev = dst->dev; 228 struct rt6_info *rt = (struct rt6_info *)dst; 229 230 daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr); 231 if (!daddr) 232 return; 233 if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) 234 return; 235 if (ipv6_addr_is_multicast((const struct in6_addr *)daddr)) 236 return; 237 __ipv6_confirm_neigh(dev, daddr); 238 } 239 240 static struct dst_ops ip6_dst_ops_template = { 241 .family = AF_INET6, 242 .gc = ip6_dst_gc, 243 .gc_thresh = 1024, 244 .check = ip6_dst_check, 245 .default_advmss = ip6_default_advmss, 246 .mtu = ip6_mtu, 247 .cow_metrics = dst_cow_metrics_generic, 248 .destroy = ip6_dst_destroy, 249 .ifdown = ip6_dst_ifdown, 250 .negative_advice = ip6_negative_advice, 251 .link_failure = ip6_link_failure, 252 .update_pmtu = ip6_rt_update_pmtu, 253 .redirect = rt6_do_redirect, 254 .local_out = __ip6_local_out, 255 .neigh_lookup = ip6_dst_neigh_lookup, 256 .confirm_neigh = ip6_confirm_neigh, 257 }; 258 259 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst) 260 { 261 unsigned int mtu = dst_metric_raw(dst, 
RTAX_MTU); 262 263 return mtu ? : dst->dev->mtu; 264 } 265 266 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, 267 struct sk_buff *skb, u32 mtu) 268 { 269 } 270 271 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk, 272 struct sk_buff *skb) 273 { 274 } 275 276 static struct dst_ops ip6_dst_blackhole_ops = { 277 .family = AF_INET6, 278 .destroy = ip6_dst_destroy, 279 .check = ip6_dst_check, 280 .mtu = ip6_blackhole_mtu, 281 .default_advmss = ip6_default_advmss, 282 .update_pmtu = ip6_rt_blackhole_update_pmtu, 283 .redirect = ip6_rt_blackhole_redirect, 284 .cow_metrics = dst_cow_metrics_generic, 285 .neigh_lookup = ip6_dst_neigh_lookup, 286 }; 287 288 static const u32 ip6_template_metrics[RTAX_MAX] = { 289 [RTAX_HOPLIMIT - 1] = 0, 290 }; 291 292 static const struct fib6_info fib6_null_entry_template = { 293 .fib6_flags = (RTF_REJECT | RTF_NONEXTHOP), 294 .fib6_protocol = RTPROT_KERNEL, 295 .fib6_metric = ~(u32)0, 296 .fib6_ref = ATOMIC_INIT(1), 297 .fib6_type = RTN_UNREACHABLE, 298 .fib6_metrics = (struct dst_metrics *)&dst_default_metrics, 299 }; 300 301 static const struct rt6_info ip6_null_entry_template = { 302 .dst = { 303 .__refcnt = ATOMIC_INIT(1), 304 .__use = 1, 305 .obsolete = DST_OBSOLETE_FORCE_CHK, 306 .error = -ENETUNREACH, 307 .input = ip6_pkt_discard, 308 .output = ip6_pkt_discard_out, 309 }, 310 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), 311 }; 312 313 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 314 315 static const struct rt6_info ip6_prohibit_entry_template = { 316 .dst = { 317 .__refcnt = ATOMIC_INIT(1), 318 .__use = 1, 319 .obsolete = DST_OBSOLETE_FORCE_CHK, 320 .error = -EACCES, 321 .input = ip6_pkt_prohibit, 322 .output = ip6_pkt_prohibit_out, 323 }, 324 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), 325 }; 326 327 static const struct rt6_info ip6_blk_hole_entry_template = { 328 .dst = { 329 .__refcnt = ATOMIC_INIT(1), 330 .__use = 1, 331 .obsolete = DST_OBSOLETE_FORCE_CHK, 332 .error = -EINVAL, 
333 .input = dst_discard, 334 .output = dst_discard_out, 335 }, 336 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), 337 }; 338 339 #endif 340 341 static void rt6_info_init(struct rt6_info *rt) 342 { 343 struct dst_entry *dst = &rt->dst; 344 345 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst)); 346 INIT_LIST_HEAD(&rt->rt6i_uncached); 347 } 348 349 /* allocate dst with ip6_dst_ops */ 350 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, 351 int flags) 352 { 353 struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev, 354 1, DST_OBSOLETE_FORCE_CHK, flags); 355 356 if (rt) { 357 rt6_info_init(rt); 358 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); 359 } 360 361 return rt; 362 } 363 EXPORT_SYMBOL(ip6_dst_alloc); 364 365 static void ip6_dst_destroy(struct dst_entry *dst) 366 { 367 struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst); 368 struct rt6_info *rt = (struct rt6_info *)dst; 369 struct fib6_info *from; 370 struct inet6_dev *idev; 371 372 if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt)) 373 kfree(p); 374 375 rt6_uncached_list_del(rt); 376 377 idev = rt->rt6i_idev; 378 if (idev) { 379 rt->rt6i_idev = NULL; 380 in6_dev_put(idev); 381 } 382 383 rcu_read_lock(); 384 from = rcu_dereference(rt->from); 385 rcu_assign_pointer(rt->from, NULL); 386 fib6_info_release(from); 387 rcu_read_unlock(); 388 } 389 390 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, 391 int how) 392 { 393 struct rt6_info *rt = (struct rt6_info *)dst; 394 struct inet6_dev *idev = rt->rt6i_idev; 395 struct net_device *loopback_dev = 396 dev_net(dev)->loopback_dev; 397 398 if (idev && idev->dev != loopback_dev) { 399 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev); 400 if (loopback_idev) { 401 rt->rt6i_idev = loopback_idev; 402 in6_dev_put(idev); 403 } 404 } 405 } 406 407 static bool __rt6_check_expired(const struct rt6_info *rt) 408 { 409 if (rt->rt6i_flags & RTF_EXPIRES) 410 return time_after(jiffies, 
rt->dst.expires); 411 else 412 return false; 413 } 414 415 static bool rt6_check_expired(const struct rt6_info *rt) 416 { 417 struct fib6_info *from; 418 419 from = rcu_dereference(rt->from); 420 421 if (rt->rt6i_flags & RTF_EXPIRES) { 422 if (time_after(jiffies, rt->dst.expires)) 423 return true; 424 } else if (from) { 425 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK || 426 fib6_check_expired(from); 427 } 428 return false; 429 } 430 431 struct fib6_info *fib6_multipath_select(const struct net *net, 432 struct fib6_info *match, 433 struct flowi6 *fl6, int oif, 434 const struct sk_buff *skb, 435 int strict) 436 { 437 struct fib6_info *sibling, *next_sibling; 438 439 /* We might have already computed the hash for ICMPv6 errors. In such 440 * case it will always be non-zero. Otherwise now is the time to do it. 441 */ 442 if (!fl6->mp_hash) 443 fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL); 444 445 if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound)) 446 return match; 447 448 list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings, 449 fib6_siblings) { 450 int nh_upper_bound; 451 452 nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound); 453 if (fl6->mp_hash > nh_upper_bound) 454 continue; 455 if (rt6_score_route(sibling, oif, strict) < 0) 456 break; 457 match = sibling; 458 break; 459 } 460 461 return match; 462 } 463 464 /* 465 * Route lookup. rcu_read_lock() should be held. 
466 */ 467 468 static inline struct fib6_info *rt6_device_match(struct net *net, 469 struct fib6_info *rt, 470 const struct in6_addr *saddr, 471 int oif, 472 int flags) 473 { 474 struct fib6_info *sprt; 475 476 if (!oif && ipv6_addr_any(saddr) && 477 !(rt->fib6_nh.nh_flags & RTNH_F_DEAD)) 478 return rt; 479 480 for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) { 481 const struct net_device *dev = sprt->fib6_nh.nh_dev; 482 483 if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD) 484 continue; 485 486 if (oif) { 487 if (dev->ifindex == oif) 488 return sprt; 489 } else { 490 if (ipv6_chk_addr(net, saddr, dev, 491 flags & RT6_LOOKUP_F_IFACE)) 492 return sprt; 493 } 494 } 495 496 if (oif && flags & RT6_LOOKUP_F_IFACE) 497 return net->ipv6.fib6_null_entry; 498 499 return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt; 500 } 501 502 #ifdef CONFIG_IPV6_ROUTER_PREF 503 struct __rt6_probe_work { 504 struct work_struct work; 505 struct in6_addr target; 506 struct net_device *dev; 507 }; 508 509 static void rt6_probe_deferred(struct work_struct *w) 510 { 511 struct in6_addr mcaddr; 512 struct __rt6_probe_work *work = 513 container_of(w, struct __rt6_probe_work, work); 514 515 addrconf_addr_solict_mult(&work->target, &mcaddr); 516 ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0); 517 dev_put(work->dev); 518 kfree(work); 519 } 520 521 static void rt6_probe(struct fib6_info *rt) 522 { 523 struct __rt6_probe_work *work; 524 const struct in6_addr *nh_gw; 525 struct neighbour *neigh; 526 struct net_device *dev; 527 528 /* 529 * Okay, this does not seem to be appropriate 530 * for now, however, we need to check if it 531 * is really so; aka Router Reachability Probing. 532 * 533 * Router Reachability Probe MUST be rate-limited 534 * to no more than one per minute. 
535 */ 536 if (!rt || !(rt->fib6_flags & RTF_GATEWAY)) 537 return; 538 539 nh_gw = &rt->fib6_nh.nh_gw; 540 dev = rt->fib6_nh.nh_dev; 541 rcu_read_lock_bh(); 542 neigh = __ipv6_neigh_lookup_noref(dev, nh_gw); 543 if (neigh) { 544 struct inet6_dev *idev; 545 546 if (neigh->nud_state & NUD_VALID) 547 goto out; 548 549 idev = __in6_dev_get(dev); 550 work = NULL; 551 write_lock(&neigh->lock); 552 if (!(neigh->nud_state & NUD_VALID) && 553 time_after(jiffies, 554 neigh->updated + idev->cnf.rtr_probe_interval)) { 555 work = kmalloc(sizeof(*work), GFP_ATOMIC); 556 if (work) 557 __neigh_set_probe_once(neigh); 558 } 559 write_unlock(&neigh->lock); 560 } else { 561 work = kmalloc(sizeof(*work), GFP_ATOMIC); 562 } 563 564 if (work) { 565 INIT_WORK(&work->work, rt6_probe_deferred); 566 work->target = *nh_gw; 567 dev_hold(dev); 568 work->dev = dev; 569 schedule_work(&work->work); 570 } 571 572 out: 573 rcu_read_unlock_bh(); 574 } 575 #else 576 static inline void rt6_probe(struct fib6_info *rt) 577 { 578 } 579 #endif 580 581 /* 582 * Default Router Selection (RFC 2461 6.3.6) 583 */ 584 static inline int rt6_check_dev(struct fib6_info *rt, int oif) 585 { 586 const struct net_device *dev = rt->fib6_nh.nh_dev; 587 588 if (!oif || dev->ifindex == oif) 589 return 2; 590 return 0; 591 } 592 593 static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt) 594 { 595 enum rt6_nud_state ret = RT6_NUD_FAIL_HARD; 596 struct neighbour *neigh; 597 598 if (rt->fib6_flags & RTF_NONEXTHOP || 599 !(rt->fib6_flags & RTF_GATEWAY)) 600 return RT6_NUD_SUCCEED; 601 602 rcu_read_lock_bh(); 603 neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev, 604 &rt->fib6_nh.nh_gw); 605 if (neigh) { 606 read_lock(&neigh->lock); 607 if (neigh->nud_state & NUD_VALID) 608 ret = RT6_NUD_SUCCEED; 609 #ifdef CONFIG_IPV6_ROUTER_PREF 610 else if (!(neigh->nud_state & NUD_FAILED)) 611 ret = RT6_NUD_SUCCEED; 612 else 613 ret = RT6_NUD_FAIL_PROBE; 614 #endif 615 read_unlock(&neigh->lock); 616 } else { 617 ret = 
IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ? 618 RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR; 619 } 620 rcu_read_unlock_bh(); 621 622 return ret; 623 } 624 625 static int rt6_score_route(struct fib6_info *rt, int oif, int strict) 626 { 627 int m; 628 629 m = rt6_check_dev(rt, oif); 630 if (!m && (strict & RT6_LOOKUP_F_IFACE)) 631 return RT6_NUD_FAIL_HARD; 632 #ifdef CONFIG_IPV6_ROUTER_PREF 633 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2; 634 #endif 635 if (strict & RT6_LOOKUP_F_REACHABLE) { 636 int n = rt6_check_neigh(rt); 637 if (n < 0) 638 return n; 639 } 640 return m; 641 } 642 643 /* called with rc_read_lock held */ 644 static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i) 645 { 646 const struct net_device *dev = fib6_info_nh_dev(f6i); 647 bool rc = false; 648 649 if (dev) { 650 const struct inet6_dev *idev = __in6_dev_get(dev); 651 652 rc = !!idev->cnf.ignore_routes_with_linkdown; 653 } 654 655 return rc; 656 } 657 658 static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict, 659 int *mpri, struct fib6_info *match, 660 bool *do_rr) 661 { 662 int m; 663 bool match_do_rr = false; 664 665 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) 666 goto out; 667 668 if (fib6_ignore_linkdown(rt) && 669 rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN && 670 !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE)) 671 goto out; 672 673 if (fib6_check_expired(rt)) 674 goto out; 675 676 m = rt6_score_route(rt, oif, strict); 677 if (m == RT6_NUD_FAIL_DO_RR) { 678 match_do_rr = true; 679 m = 0; /* lowest valid score */ 680 } else if (m == RT6_NUD_FAIL_HARD) { 681 goto out; 682 } 683 684 if (strict & RT6_LOOKUP_F_REACHABLE) 685 rt6_probe(rt); 686 687 /* note that m can be RT6_NUD_FAIL_PROBE at this point */ 688 if (m > *mpri) { 689 *do_rr = match_do_rr; 690 *mpri = m; 691 match = rt; 692 } 693 out: 694 return match; 695 } 696 697 static struct fib6_info *find_rr_leaf(struct fib6_node *fn, 698 struct fib6_info *leaf, 699 struct fib6_info *rr_head, 700 u32 metric, int oif, 
int strict, 701 bool *do_rr) 702 { 703 struct fib6_info *rt, *match, *cont; 704 int mpri = -1; 705 706 match = NULL; 707 cont = NULL; 708 for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) { 709 if (rt->fib6_metric != metric) { 710 cont = rt; 711 break; 712 } 713 714 match = find_match(rt, oif, strict, &mpri, match, do_rr); 715 } 716 717 for (rt = leaf; rt && rt != rr_head; 718 rt = rcu_dereference(rt->fib6_next)) { 719 if (rt->fib6_metric != metric) { 720 cont = rt; 721 break; 722 } 723 724 match = find_match(rt, oif, strict, &mpri, match, do_rr); 725 } 726 727 if (match || !cont) 728 return match; 729 730 for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next)) 731 match = find_match(rt, oif, strict, &mpri, match, do_rr); 732 733 return match; 734 } 735 736 static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn, 737 int oif, int strict) 738 { 739 struct fib6_info *leaf = rcu_dereference(fn->leaf); 740 struct fib6_info *match, *rt0; 741 bool do_rr = false; 742 int key_plen; 743 744 if (!leaf || leaf == net->ipv6.fib6_null_entry) 745 return net->ipv6.fib6_null_entry; 746 747 rt0 = rcu_dereference(fn->rr_ptr); 748 if (!rt0) 749 rt0 = leaf; 750 751 /* Double check to make sure fn is not an intermediate node 752 * and fn->leaf does not points to its child's leaf 753 * (This might happen if all routes under fn are deleted from 754 * the tree and fib6_repair_tree() is called on the node.) 
755 */ 756 key_plen = rt0->fib6_dst.plen; 757 #ifdef CONFIG_IPV6_SUBTREES 758 if (rt0->fib6_src.plen) 759 key_plen = rt0->fib6_src.plen; 760 #endif 761 if (fn->fn_bit != key_plen) 762 return net->ipv6.fib6_null_entry; 763 764 match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict, 765 &do_rr); 766 767 if (do_rr) { 768 struct fib6_info *next = rcu_dereference(rt0->fib6_next); 769 770 /* no entries matched; do round-robin */ 771 if (!next || next->fib6_metric != rt0->fib6_metric) 772 next = leaf; 773 774 if (next != rt0) { 775 spin_lock_bh(&leaf->fib6_table->tb6_lock); 776 /* make sure next is not being deleted from the tree */ 777 if (next->fib6_node) 778 rcu_assign_pointer(fn->rr_ptr, next); 779 spin_unlock_bh(&leaf->fib6_table->tb6_lock); 780 } 781 } 782 783 return match ? match : net->ipv6.fib6_null_entry; 784 } 785 786 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt) 787 { 788 return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY)); 789 } 790 791 #ifdef CONFIG_IPV6_ROUTE_INFO 792 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, 793 const struct in6_addr *gwaddr) 794 { 795 struct net *net = dev_net(dev); 796 struct route_info *rinfo = (struct route_info *) opt; 797 struct in6_addr prefix_buf, *prefix; 798 unsigned int pref; 799 unsigned long lifetime; 800 struct fib6_info *rt; 801 802 if (len < sizeof(struct route_info)) { 803 return -EINVAL; 804 } 805 806 /* Sanity check for prefix_len and length */ 807 if (rinfo->length > 3) { 808 return -EINVAL; 809 } else if (rinfo->prefix_len > 128) { 810 return -EINVAL; 811 } else if (rinfo->prefix_len > 64) { 812 if (rinfo->length < 2) { 813 return -EINVAL; 814 } 815 } else if (rinfo->prefix_len > 0) { 816 if (rinfo->length < 1) { 817 return -EINVAL; 818 } 819 } 820 821 pref = rinfo->route_pref; 822 if (pref == ICMPV6_ROUTER_PREF_INVALID) 823 return -EINVAL; 824 825 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ); 826 827 if (rinfo->length == 3) 828 prefix = (struct 
in6_addr *)rinfo->prefix; 829 else { 830 /* this function is safe */ 831 ipv6_addr_prefix(&prefix_buf, 832 (struct in6_addr *)rinfo->prefix, 833 rinfo->prefix_len); 834 prefix = &prefix_buf; 835 } 836 837 if (rinfo->prefix_len == 0) 838 rt = rt6_get_dflt_router(net, gwaddr, dev); 839 else 840 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, 841 gwaddr, dev); 842 843 if (rt && !lifetime) { 844 ip6_del_rt(net, rt); 845 rt = NULL; 846 } 847 848 if (!rt && lifetime) 849 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, 850 dev, pref); 851 else if (rt) 852 rt->fib6_flags = RTF_ROUTEINFO | 853 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); 854 855 if (rt) { 856 if (!addrconf_finite_timeout(lifetime)) 857 fib6_clean_expires(rt); 858 else 859 fib6_set_expires(rt, jiffies + HZ * lifetime); 860 861 fib6_info_release(rt); 862 } 863 return 0; 864 } 865 #endif 866 867 /* 868 * Misc support functions 869 */ 870 871 /* called with rcu_lock held */ 872 static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt) 873 { 874 struct net_device *dev = rt->fib6_nh.nh_dev; 875 876 if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) { 877 /* for copies of local routes, dst->dev needs to be the 878 * device if it is a master device, the master device if 879 * device is enslaved, and the loopback as the default 880 */ 881 if (netif_is_l3_slave(dev) && 882 !rt6_need_strict(&rt->fib6_dst.addr)) 883 dev = l3mdev_master_dev_rcu(dev); 884 else if (!netif_is_l3_master(dev)) 885 dev = dev_net(dev)->loopback_dev; 886 /* last case is netif_is_l3_master(dev) is true in which 887 * case we want dev returned to be dev 888 */ 889 } 890 891 return dev; 892 } 893 894 static const int fib6_prop[RTN_MAX + 1] = { 895 [RTN_UNSPEC] = 0, 896 [RTN_UNICAST] = 0, 897 [RTN_LOCAL] = 0, 898 [RTN_BROADCAST] = 0, 899 [RTN_ANYCAST] = 0, 900 [RTN_MULTICAST] = 0, 901 [RTN_BLACKHOLE] = -EINVAL, 902 [RTN_UNREACHABLE] = -EHOSTUNREACH, 903 [RTN_PROHIBIT] = -EACCES, 904 [RTN_THROW] = -EAGAIN, 905 
[RTN_NAT] = -EINVAL, 906 [RTN_XRESOLVE] = -EINVAL, 907 }; 908 909 static int ip6_rt_type_to_error(u8 fib6_type) 910 { 911 return fib6_prop[fib6_type]; 912 } 913 914 static unsigned short fib6_info_dst_flags(struct fib6_info *rt) 915 { 916 unsigned short flags = 0; 917 918 if (rt->dst_nocount) 919 flags |= DST_NOCOUNT; 920 if (rt->dst_nopolicy) 921 flags |= DST_NOPOLICY; 922 if (rt->dst_host) 923 flags |= DST_HOST; 924 925 return flags; 926 } 927 928 static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort) 929 { 930 rt->dst.error = ip6_rt_type_to_error(ort->fib6_type); 931 932 switch (ort->fib6_type) { 933 case RTN_BLACKHOLE: 934 rt->dst.output = dst_discard_out; 935 rt->dst.input = dst_discard; 936 break; 937 case RTN_PROHIBIT: 938 rt->dst.output = ip6_pkt_prohibit_out; 939 rt->dst.input = ip6_pkt_prohibit; 940 break; 941 case RTN_THROW: 942 case RTN_UNREACHABLE: 943 default: 944 rt->dst.output = ip6_pkt_discard_out; 945 rt->dst.input = ip6_pkt_discard; 946 break; 947 } 948 } 949 950 static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort) 951 { 952 if (ort->fib6_flags & RTF_REJECT) { 953 ip6_rt_init_dst_reject(rt, ort); 954 return; 955 } 956 957 rt->dst.error = 0; 958 rt->dst.output = ip6_output; 959 960 if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) { 961 rt->dst.input = ip6_input; 962 } else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) { 963 rt->dst.input = ip6_mc_input; 964 } else { 965 rt->dst.input = ip6_forward; 966 } 967 968 if (ort->fib6_nh.nh_lwtstate) { 969 rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate); 970 lwtunnel_set_redirect(&rt->dst); 971 } 972 973 rt->dst.lastuse = jiffies; 974 } 975 976 /* Caller must already hold reference to @from */ 977 static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from) 978 { 979 rt->rt6i_flags &= ~RTF_EXPIRES; 980 rcu_assign_pointer(rt->from, from); 981 dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true); 982 if 
(from->fib6_metrics != &dst_default_metrics) { 983 rt->dst._metrics |= DST_METRICS_REFCOUNTED; 984 refcount_inc(&from->fib6_metrics->refcnt); 985 } 986 } 987 988 /* Caller must already hold reference to @ort */ 989 static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort) 990 { 991 struct net_device *dev = fib6_info_nh_dev(ort); 992 993 ip6_rt_init_dst(rt, ort); 994 995 rt->rt6i_dst = ort->fib6_dst; 996 rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL; 997 rt->rt6i_gateway = ort->fib6_nh.nh_gw; 998 rt->rt6i_flags = ort->fib6_flags; 999 rt6_set_from(rt, ort); 1000 #ifdef CONFIG_IPV6_SUBTREES 1001 rt->rt6i_src = ort->fib6_src; 1002 #endif 1003 } 1004 1005 static struct fib6_node* fib6_backtrack(struct fib6_node *fn, 1006 struct in6_addr *saddr) 1007 { 1008 struct fib6_node *pn, *sn; 1009 while (1) { 1010 if (fn->fn_flags & RTN_TL_ROOT) 1011 return NULL; 1012 pn = rcu_dereference(fn->parent); 1013 sn = FIB6_SUBTREE(pn); 1014 if (sn && sn != fn) 1015 fn = fib6_node_lookup(sn, NULL, saddr); 1016 else 1017 fn = pn; 1018 if (fn->fn_flags & RTN_RTINFO) 1019 return fn; 1020 } 1021 } 1022 1023 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt, 1024 bool null_fallback) 1025 { 1026 struct rt6_info *rt = *prt; 1027 1028 if (dst_hold_safe(&rt->dst)) 1029 return true; 1030 if (null_fallback) { 1031 rt = net->ipv6.ip6_null_entry; 1032 dst_hold(&rt->dst); 1033 } else { 1034 rt = NULL; 1035 } 1036 *prt = rt; 1037 return false; 1038 } 1039 1040 /* called with rcu_lock held */ 1041 static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt) 1042 { 1043 unsigned short flags = fib6_info_dst_flags(rt); 1044 struct net_device *dev = rt->fib6_nh.nh_dev; 1045 struct rt6_info *nrt; 1046 1047 if (!fib6_info_hold_safe(rt)) 1048 return NULL; 1049 1050 nrt = ip6_dst_alloc(dev_net(dev), dev, flags); 1051 if (nrt) 1052 ip6_rt_copy_init(nrt, rt); 1053 else 1054 fib6_info_release(rt); 1055 1056 return nrt; 1057 } 1058 1059 static struct rt6_info 
*ip6_pol_route_lookup(struct net *net, 1060 struct fib6_table *table, 1061 struct flowi6 *fl6, 1062 const struct sk_buff *skb, 1063 int flags) 1064 { 1065 struct fib6_info *f6i; 1066 struct fib6_node *fn; 1067 struct rt6_info *rt; 1068 1069 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) 1070 flags &= ~RT6_LOOKUP_F_IFACE; 1071 1072 rcu_read_lock(); 1073 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 1074 restart: 1075 f6i = rcu_dereference(fn->leaf); 1076 if (!f6i) { 1077 f6i = net->ipv6.fib6_null_entry; 1078 } else { 1079 f6i = rt6_device_match(net, f6i, &fl6->saddr, 1080 fl6->flowi6_oif, flags); 1081 if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0) 1082 f6i = fib6_multipath_select(net, f6i, fl6, 1083 fl6->flowi6_oif, skb, 1084 flags); 1085 } 1086 if (f6i == net->ipv6.fib6_null_entry) { 1087 fn = fib6_backtrack(fn, &fl6->saddr); 1088 if (fn) 1089 goto restart; 1090 } 1091 1092 trace_fib6_table_lookup(net, f6i, table, fl6); 1093 1094 /* Search through exception table */ 1095 rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr); 1096 if (rt) { 1097 if (ip6_hold_safe(net, &rt, true)) 1098 dst_use_noref(&rt->dst, jiffies); 1099 } else if (f6i == net->ipv6.fib6_null_entry) { 1100 rt = net->ipv6.ip6_null_entry; 1101 dst_hold(&rt->dst); 1102 } else { 1103 rt = ip6_create_rt_rcu(f6i); 1104 if (!rt) { 1105 rt = net->ipv6.ip6_null_entry; 1106 dst_hold(&rt->dst); 1107 } 1108 } 1109 1110 rcu_read_unlock(); 1111 1112 return rt; 1113 } 1114 1115 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6, 1116 const struct sk_buff *skb, int flags) 1117 { 1118 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup); 1119 } 1120 EXPORT_SYMBOL_GPL(ip6_route_lookup); 1121 1122 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, 1123 const struct in6_addr *saddr, int oif, 1124 const struct sk_buff *skb, int strict) 1125 { 1126 struct flowi6 fl6 = { 1127 .flowi6_oif = oif, 1128 .daddr = *daddr, 1129 }; 1130 struct 
dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);

/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes new route entry, the addition fails by any reason the
 * route is released.
 * Caller must hold dst before calling it.
 */

/* Insert @rt into its table under tb6_lock; returns fib6_add() result. */
static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = { .nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}

/* Clone @ort into a host-route (plen 128) RTF_CACHE dst for (daddr, saddr).
 * Takes a reference on @ort (released on allocation failure); the returned
 * rt6_info carries the reference from ip6_dst_alloc().
 */
static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (!fib6_info_hold_safe(ort))
		return NULL;

	dev = ip6_rt_get_dev_rcu(ort);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt) {
		fib6_info_release(ort);
		return NULL;
	}

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}

/* Clone @rt into a per-cpu RTF_PCPU dst. Takes a reference on @rt
 * (released on allocation failure).
 */
static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	if (!fib6_info_hold_safe(rt))
		return NULL;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt) {
		fib6_info_release(rt);
		return NULL;
	}
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	/* Grab a reference only if the entry is still live. */
	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt, false);

	return pcpu_rt;
}

/* Allocate and publish the per-cpu dst for @rt; returns a held dst
 * (ip6_null_entry on allocation failure).
 */
static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	/* Publish; only one writer can install the per-cpu entry. */
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	return pcpu_rt;
}

/* exception hash table implementation
 */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	/* Free after a grace period; RCU readers may still walk the chain. */
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
	net->ipv6.rt6_stats->fib_rt_cache--;
}

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	/* Oldest == smallest stamp (jiffies of last use/insert). */
	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}

/* Hash (dst[, src]) into a bucket index; seeded once per boot. */
static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

/* Effective MTU for @rt: explicit fib6_pmtu if set, else the device MTU,
 * capped at IP6_MAX_MTU and reduced by any lwtunnel encap headroom.
 */
static unsigned int fib6_mtu(const struct fib6_info *rt)
{
	unsigned int mtu;

	if (rt->fib6_pmtu) {
		mtu = rt->fib6_pmtu;
	} else {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev;

		rcu_read_lock();
		idev =
__in6_dev_get(dev);
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
}

/* Insert cache route @nrt into the exception table of its parent @ort.
 * Returns 0 on success; -EINVAL if the table was flushed or nrt's mtu is
 * not lower than ort's; -ENOMEM on allocation failure. On success the
 * parent's fn_sernum is bumped so stale cached dsts get revalidated.
 */
static int rt6_insert_exception(struct rt6_info *nrt,
				struct fib6_info *ort)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	/* rt6_flush_exceptions() set this; do not recreate the buckets. */
	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
		err = -EINVAL;
		goto out;
	}

	/* Replace any existing exception for the same (daddr[, saddr]). */
	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->fib6_table->tb6_lock);
		fib6_update_sernum(net, ort);
		spin_unlock_bh(&ort->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}

/* Drop every exception route attached to @rt and forbid re-creation. */
void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() to recreate the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}

/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct
in6_addr *saddr) 1531 { 1532 struct rt6_exception_bucket *bucket; 1533 struct in6_addr *src_key = NULL; 1534 struct rt6_exception *rt6_ex; 1535 struct rt6_info *res = NULL; 1536 1537 bucket = rcu_dereference(rt->rt6i_exception_bucket); 1538 1539 #ifdef CONFIG_IPV6_SUBTREES 1540 /* rt6i_src.plen != 0 indicates rt is in subtree 1541 * and exception table is indexed by a hash of 1542 * both rt6i_dst and rt6i_src. 1543 * Otherwise, the exception table is indexed by 1544 * a hash of only rt6i_dst. 1545 */ 1546 if (rt->fib6_src.plen) 1547 src_key = saddr; 1548 #endif 1549 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); 1550 1551 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i)) 1552 res = rt6_ex->rt6i; 1553 1554 return res; 1555 } 1556 1557 /* Remove the passed in cached rt from the hash table that contains it */ 1558 static int rt6_remove_exception_rt(struct rt6_info *rt) 1559 { 1560 struct rt6_exception_bucket *bucket; 1561 struct in6_addr *src_key = NULL; 1562 struct rt6_exception *rt6_ex; 1563 struct fib6_info *from; 1564 int err; 1565 1566 from = rcu_dereference(rt->from); 1567 if (!from || 1568 !(rt->rt6i_flags & RTF_CACHE)) 1569 return -EINVAL; 1570 1571 if (!rcu_access_pointer(from->rt6i_exception_bucket)) 1572 return -ENOENT; 1573 1574 spin_lock_bh(&rt6_exception_lock); 1575 bucket = rcu_dereference_protected(from->rt6i_exception_bucket, 1576 lockdep_is_held(&rt6_exception_lock)); 1577 #ifdef CONFIG_IPV6_SUBTREES 1578 /* rt6i_src.plen != 0 indicates 'from' is in subtree 1579 * and exception table is indexed by a hash of 1580 * both rt6i_dst and rt6i_src. 1581 * Otherwise, the exception table is indexed by 1582 * a hash of only rt6i_dst. 
1583 */ 1584 if (from->fib6_src.plen) 1585 src_key = &rt->rt6i_src.addr; 1586 #endif 1587 rt6_ex = __rt6_find_exception_spinlock(&bucket, 1588 &rt->rt6i_dst.addr, 1589 src_key); 1590 if (rt6_ex) { 1591 rt6_remove_exception(bucket, rt6_ex); 1592 err = 0; 1593 } else { 1594 err = -ENOENT; 1595 } 1596 1597 spin_unlock_bh(&rt6_exception_lock); 1598 return err; 1599 } 1600 1601 /* Find rt6_ex which contains the passed in rt cache and 1602 * refresh its stamp 1603 */ 1604 static void rt6_update_exception_stamp_rt(struct rt6_info *rt) 1605 { 1606 struct rt6_exception_bucket *bucket; 1607 struct fib6_info *from = rt->from; 1608 struct in6_addr *src_key = NULL; 1609 struct rt6_exception *rt6_ex; 1610 1611 if (!from || 1612 !(rt->rt6i_flags & RTF_CACHE)) 1613 return; 1614 1615 rcu_read_lock(); 1616 bucket = rcu_dereference(from->rt6i_exception_bucket); 1617 1618 #ifdef CONFIG_IPV6_SUBTREES 1619 /* rt6i_src.plen != 0 indicates 'from' is in subtree 1620 * and exception table is indexed by a hash of 1621 * both rt6i_dst and rt6i_src. 1622 * Otherwise, the exception table is indexed by 1623 * a hash of only rt6i_dst. 1624 */ 1625 if (from->fib6_src.plen) 1626 src_key = &rt->rt6i_src.addr; 1627 #endif 1628 rt6_ex = __rt6_find_exception_rcu(&bucket, 1629 &rt->rt6i_dst.addr, 1630 src_key); 1631 if (rt6_ex) 1632 rt6_ex->stamp = jiffies; 1633 1634 rcu_read_unlock(); 1635 } 1636 1637 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev, 1638 struct rt6_info *rt, int mtu) 1639 { 1640 /* If the new MTU is lower than the route PMTU, this new MTU will be the 1641 * lowest MTU in the path: always allow updating the route PMTU to 1642 * reflect PMTU decreases. 1643 * 1644 * If the new MTU is higher, and the route PMTU is equal to the local 1645 * MTU, this means the old MTU is the lowest in the path, so allow 1646 * updating it: if other nodes now have lower MTUs, PMTU discovery will 1647 * handle this. 
	 */

	if (dst_mtu(&rt->dst) >= mtu)
		return true;

	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
		return true;

	return false;
}

/* Propagate an MTU change to all exception routes hanging off @rt.
 * Caller must hold rt6_exception_lock (see rcu_dereference_protected).
 */
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}

#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

/* Remove exception routes of @rt whose gateway equals @gateway. */
static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}

/* Decide the fate of one exception during GC: prune if aged out,
 * expired, or its gateway neighbour is no longer a router; otherwise
 * count it in gc_args->more so GC runs again.
 */
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others have still references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}

/* Garbage-collect the exception table of @rt. rcu_read_lock_bh protects
 * the noref neighbour lookup done by rt6_age_examine_exception().
 */
void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i <
FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}

/* must be called with rcu lock held */
struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
				    int oif, struct flowi6 *fl6, int strict)
{
	struct fib6_node *fn, *saved_fn;
	struct fib6_info *f6i;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	f6i = rt6_select(net, fn, oif, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	return f6i;
}

/* Core policy-route resolver: find the fib entry, then return (in order
 * of preference) a cached exception dst, an uncached RTF_CACHE clone
 * (KNOWN_NH case), or the per-cpu dst. Always returns a held dst.
 */
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_info *f6i;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	f6i = fib6_table_lookup(net, table, oif, fl6, strict);
	if (f6i->fib6_nsiblings)
		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);

	if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		return rt;
	}

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(f6i->fib6_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		return uncached_rt;
	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		/* local_bh_disable: the pcpu slot must not be preempted
		 * between lookup and install.
		 */
		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(f6i);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, f6i);

		local_bh_enable();
		rcu_read_unlock();

		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);

static struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
}

/* Input-path lookup entry; forces strict oif matching for link-local and
 * multicast destinations (except on PIM register devices).
 */
struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6,
					 const struct sk_buff *skb,
					 int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return
fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);

/* Extract the L3 addresses used for multipath hashing. For ICMPv6 error
 * messages the hash must be computed on the embedded (inner) header so
 * errors follow the same path as the flow that triggered them; in that
 * case precomputed flow keys (@flkeys) are discarded.
 */
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	/* Only ICMPv6 errors embed the offending packet's header. */
	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowlabel(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}

/* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		/* Policy 0: L3 hash (addresses, flow label, proto). */
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		/* Policy 1: L4 hash (addresses, ports, proto). */
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	/* >> 1: keep the value in the range expected by fib6 selection. */
	return mhash >> 1;
}

/* Route an incoming skb and attach the resulting dst to it. */
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	/* ICMPv6 errors must hash like the flow they refer to. */
	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}

static struct rt6_info *ip6_pol_route_output(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
}

/* Output-path lookup entry: sets strictness flags from the socket and
 * flow, and short-circuits link-scope destinations through l3mdev.
 */
struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	if (ipv6_addr_type(&fl6->daddr) &
	    (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);

/* Replace @dst_orig with a blackhole dst (discards all traffic) that
 * copies its metrics and addresses; releases @dst_orig.
 */
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_DEAD, 0);
	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

		new = &rt->dst;
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}

/*
 *	Destination cache support functions
 */

/* A fib entry is valid if its tree cookie matches and it is not expired. */
static bool fib6_check(struct fib6_info *f6i, u32 cookie)
{
	u32 rt_cookie = 0;

	if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
		return false;

	if (fib6_check_expired(f6i))
		return false;

	return true;
}

static struct dst_entry *rt6_check(struct rt6_info *rt,
				   struct fib6_info *from,
				   u32 cookie)
{
	u32 rt_cookie = 0;

	if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
	    rt_cookie != cookie)
		return NULL;

	if (rt6_check_expired(rt))
		return NULL;

	return &rt->dst;
}

static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
					    struct fib6_info *from,
					    u32 cookie)
{
	if (!__rt6_check_expired(rt) &&
	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
	    fib6_check(from, cookie))
		return &rt->dst;
	else
		return NULL;
}

/* dst_ops->check: revalidate a cached dst against the fib tree. */
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct dst_entry *dst_ret;
	struct fib6_info *from;
	struct rt6_info *rt;

	rt = container_of(dst, struct rt6_info, dst);
2191 2192 rcu_read_lock(); 2193 2194 /* All IPV6 dsts are created with ->obsolete set to the value 2195 * DST_OBSOLETE_FORCE_CHK which forces validation calls down 2196 * into this function always. 2197 */ 2198 2199 from = rcu_dereference(rt->from); 2200 2201 if (from && (rt->rt6i_flags & RTF_PCPU || 2202 unlikely(!list_empty(&rt->rt6i_uncached)))) 2203 dst_ret = rt6_dst_from_check(rt, from, cookie); 2204 else 2205 dst_ret = rt6_check(rt, from, cookie); 2206 2207 rcu_read_unlock(); 2208 2209 return dst_ret; 2210 } 2211 2212 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) 2213 { 2214 struct rt6_info *rt = (struct rt6_info *) dst; 2215 2216 if (rt) { 2217 if (rt->rt6i_flags & RTF_CACHE) { 2218 rcu_read_lock(); 2219 if (rt6_check_expired(rt)) { 2220 rt6_remove_exception_rt(rt); 2221 dst = NULL; 2222 } 2223 rcu_read_unlock(); 2224 } else { 2225 dst_release(dst); 2226 dst = NULL; 2227 } 2228 } 2229 return dst; 2230 } 2231 2232 static void ip6_link_failure(struct sk_buff *skb) 2233 { 2234 struct rt6_info *rt; 2235 2236 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); 2237 2238 rt = (struct rt6_info *) skb_dst(skb); 2239 if (rt) { 2240 rcu_read_lock(); 2241 if (rt->rt6i_flags & RTF_CACHE) { 2242 if (dst_hold_safe(&rt->dst)) 2243 rt6_remove_exception_rt(rt); 2244 } else { 2245 struct fib6_info *from; 2246 struct fib6_node *fn; 2247 2248 from = rcu_dereference(rt->from); 2249 if (from) { 2250 fn = rcu_dereference(from->fib6_node); 2251 if (fn && (rt->rt6i_flags & RTF_DEFAULT)) 2252 fn->fn_sernum = -1; 2253 } 2254 } 2255 rcu_read_unlock(); 2256 } 2257 } 2258 2259 static void rt6_update_expires(struct rt6_info *rt0, int timeout) 2260 { 2261 if (!(rt0->rt6i_flags & RTF_EXPIRES)) { 2262 struct fib6_info *from; 2263 2264 rcu_read_lock(); 2265 from = rcu_dereference(rt0->from); 2266 if (from) 2267 rt0->dst.expires = from->expires; 2268 rcu_read_unlock(); 2269 } 2270 2271 dst_set_expires(&rt0->dst, timeout); 2272 rt0->rt6i_flags |= RTF_EXPIRES; 
2273 } 2274 2275 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) 2276 { 2277 struct net *net = dev_net(rt->dst.dev); 2278 2279 dst_metric_set(&rt->dst, RTAX_MTU, mtu); 2280 rt->rt6i_flags |= RTF_MODIFIED; 2281 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires); 2282 } 2283 2284 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) 2285 { 2286 bool from_set; 2287 2288 rcu_read_lock(); 2289 from_set = !!rcu_dereference(rt->from); 2290 rcu_read_unlock(); 2291 2292 return !(rt->rt6i_flags & RTF_CACHE) && 2293 (rt->rt6i_flags & RTF_PCPU || from_set); 2294 } 2295 2296 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, 2297 const struct ipv6hdr *iph, u32 mtu) 2298 { 2299 const struct in6_addr *daddr, *saddr; 2300 struct rt6_info *rt6 = (struct rt6_info *)dst; 2301 2302 if (dst_metric_locked(dst, RTAX_MTU)) 2303 return; 2304 2305 if (iph) { 2306 daddr = &iph->daddr; 2307 saddr = &iph->saddr; 2308 } else if (sk) { 2309 daddr = &sk->sk_v6_daddr; 2310 saddr = &inet6_sk(sk)->saddr; 2311 } else { 2312 daddr = NULL; 2313 saddr = NULL; 2314 } 2315 dst_confirm_neigh(dst, daddr); 2316 mtu = max_t(u32, mtu, IPV6_MIN_MTU); 2317 if (mtu >= dst_mtu(dst)) 2318 return; 2319 2320 if (!rt6_cache_allowed_for_pmtu(rt6)) { 2321 rt6_do_update_pmtu(rt6, mtu); 2322 /* update rt6_ex->stamp for cache */ 2323 if (rt6->rt6i_flags & RTF_CACHE) 2324 rt6_update_exception_stamp_rt(rt6); 2325 } else if (daddr) { 2326 struct fib6_info *from; 2327 struct rt6_info *nrt6; 2328 2329 rcu_read_lock(); 2330 from = rcu_dereference(rt6->from); 2331 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr); 2332 if (nrt6) { 2333 rt6_do_update_pmtu(nrt6, mtu); 2334 if (rt6_insert_exception(nrt6, from)) 2335 dst_release_immediate(&nrt6->dst); 2336 } 2337 rcu_read_unlock(); 2338 } 2339 } 2340 2341 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, 2342 struct sk_buff *skb, u32 mtu) 2343 { 2344 __ip6_rt_update_pmtu(dst, sk, skb ? 
ipv6_hdr(skb) : NULL, mtu); 2345 } 2346 2347 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, 2348 int oif, u32 mark, kuid_t uid) 2349 { 2350 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2351 struct dst_entry *dst; 2352 struct flowi6 fl6; 2353 2354 memset(&fl6, 0, sizeof(fl6)); 2355 fl6.flowi6_oif = oif; 2356 fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark); 2357 fl6.daddr = iph->daddr; 2358 fl6.saddr = iph->saddr; 2359 fl6.flowlabel = ip6_flowinfo(iph); 2360 fl6.flowi6_uid = uid; 2361 2362 dst = ip6_route_output(net, NULL, &fl6); 2363 if (!dst->error) 2364 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu)); 2365 dst_release(dst); 2366 } 2367 EXPORT_SYMBOL_GPL(ip6_update_pmtu); 2368 2369 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) 2370 { 2371 struct dst_entry *dst; 2372 2373 ip6_update_pmtu(skb, sock_net(sk), mtu, 2374 sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid); 2375 2376 dst = __sk_dst_get(sk); 2377 if (!dst || !dst->obsolete || 2378 dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) 2379 return; 2380 2381 bh_lock_sock(sk); 2382 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) 2383 ip6_datagram_dst_update(sk, false); 2384 bh_unlock_sock(sk); 2385 } 2386 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu); 2387 2388 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, 2389 const struct flowi6 *fl6) 2390 { 2391 #ifdef CONFIG_IPV6_SUBTREES 2392 struct ipv6_pinfo *np = inet6_sk(sk); 2393 #endif 2394 2395 ip6_dst_store(sk, dst, 2396 ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ? 2397 &sk->sk_v6_daddr : NULL, 2398 #ifdef CONFIG_IPV6_SUBTREES 2399 ipv6_addr_equal(&fl6->saddr, &np->saddr) ? 
2400 &np->saddr : 2401 #endif 2402 NULL); 2403 } 2404 2405 /* Handle redirects */ 2406 struct ip6rd_flowi { 2407 struct flowi6 fl6; 2408 struct in6_addr gateway; 2409 }; 2410 2411 static struct rt6_info *__ip6_route_redirect(struct net *net, 2412 struct fib6_table *table, 2413 struct flowi6 *fl6, 2414 const struct sk_buff *skb, 2415 int flags) 2416 { 2417 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; 2418 struct rt6_info *ret = NULL, *rt_cache; 2419 struct fib6_info *rt; 2420 struct fib6_node *fn; 2421 2422 /* Get the "current" route for this destination and 2423 * check if the redirect has come from appropriate router. 2424 * 2425 * RFC 4861 specifies that redirects should only be 2426 * accepted if they come from the nexthop to the target. 2427 * Due to the way the routes are chosen, this notion 2428 * is a bit fuzzy and one might need to check all possible 2429 * routes. 2430 */ 2431 2432 rcu_read_lock(); 2433 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 2434 restart: 2435 for_each_fib6_node_rt_rcu(fn) { 2436 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) 2437 continue; 2438 if (fib6_check_expired(rt)) 2439 continue; 2440 if (rt->fib6_flags & RTF_REJECT) 2441 break; 2442 if (!(rt->fib6_flags & RTF_GATEWAY)) 2443 continue; 2444 if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex) 2445 continue; 2446 /* rt_cache's gateway might be different from its 'parent' 2447 * in the case of an ip redirect. 2448 * So we keep searching in the exception table if the gateway 2449 * is different. 
2450 */ 2451 if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) { 2452 rt_cache = rt6_find_cached_rt(rt, 2453 &fl6->daddr, 2454 &fl6->saddr); 2455 if (rt_cache && 2456 ipv6_addr_equal(&rdfl->gateway, 2457 &rt_cache->rt6i_gateway)) { 2458 ret = rt_cache; 2459 break; 2460 } 2461 continue; 2462 } 2463 break; 2464 } 2465 2466 if (!rt) 2467 rt = net->ipv6.fib6_null_entry; 2468 else if (rt->fib6_flags & RTF_REJECT) { 2469 ret = net->ipv6.ip6_null_entry; 2470 goto out; 2471 } 2472 2473 if (rt == net->ipv6.fib6_null_entry) { 2474 fn = fib6_backtrack(fn, &fl6->saddr); 2475 if (fn) 2476 goto restart; 2477 } 2478 2479 out: 2480 if (ret) 2481 ip6_hold_safe(net, &ret, true); 2482 else 2483 ret = ip6_create_rt_rcu(rt); 2484 2485 rcu_read_unlock(); 2486 2487 trace_fib6_table_lookup(net, rt, table, fl6); 2488 return ret; 2489 }; 2490 2491 static struct dst_entry *ip6_route_redirect(struct net *net, 2492 const struct flowi6 *fl6, 2493 const struct sk_buff *skb, 2494 const struct in6_addr *gateway) 2495 { 2496 int flags = RT6_LOOKUP_F_HAS_SADDR; 2497 struct ip6rd_flowi rdfl; 2498 2499 rdfl.fl6 = *fl6; 2500 rdfl.gateway = *gateway; 2501 2502 return fib6_rule_lookup(net, &rdfl.fl6, skb, 2503 flags, __ip6_route_redirect); 2504 } 2505 2506 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, 2507 kuid_t uid) 2508 { 2509 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2510 struct dst_entry *dst; 2511 struct flowi6 fl6; 2512 2513 memset(&fl6, 0, sizeof(fl6)); 2514 fl6.flowi6_iif = LOOPBACK_IFINDEX; 2515 fl6.flowi6_oif = oif; 2516 fl6.flowi6_mark = mark; 2517 fl6.daddr = iph->daddr; 2518 fl6.saddr = iph->saddr; 2519 fl6.flowlabel = ip6_flowinfo(iph); 2520 fl6.flowi6_uid = uid; 2521 2522 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr); 2523 rt6_do_redirect(dst, NULL, skb); 2524 dst_release(dst); 2525 } 2526 EXPORT_SYMBOL_GPL(ip6_redirect); 2527 2528 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif, 2529 u32 
mark) 2530 { 2531 const struct ipv6hdr *iph = ipv6_hdr(skb); 2532 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb); 2533 struct dst_entry *dst; 2534 struct flowi6 fl6; 2535 2536 memset(&fl6, 0, sizeof(fl6)); 2537 fl6.flowi6_iif = LOOPBACK_IFINDEX; 2538 fl6.flowi6_oif = oif; 2539 fl6.flowi6_mark = mark; 2540 fl6.daddr = msg->dest; 2541 fl6.saddr = iph->daddr; 2542 fl6.flowi6_uid = sock_net_uid(net, NULL); 2543 2544 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr); 2545 rt6_do_redirect(dst, NULL, skb); 2546 dst_release(dst); 2547 } 2548 2549 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) 2550 { 2551 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark, 2552 sk->sk_uid); 2553 } 2554 EXPORT_SYMBOL_GPL(ip6_sk_redirect); 2555 2556 static unsigned int ip6_default_advmss(const struct dst_entry *dst) 2557 { 2558 struct net_device *dev = dst->dev; 2559 unsigned int mtu = dst_mtu(dst); 2560 struct net *net = dev_net(dev); 2561 2562 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); 2563 2564 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) 2565 mtu = net->ipv6.sysctl.ip6_rt_min_advmss; 2566 2567 /* 2568 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 2569 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 2570 * IPV6_MAXPLEN is also valid and means: "any MSS, 2571 * rely only on pmtu discovery" 2572 */ 2573 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr)) 2574 mtu = IPV6_MAXPLEN; 2575 return mtu; 2576 } 2577 2578 static unsigned int ip6_mtu(const struct dst_entry *dst) 2579 { 2580 struct inet6_dev *idev; 2581 unsigned int mtu; 2582 2583 mtu = dst_metric_raw(dst, RTAX_MTU); 2584 if (mtu) 2585 goto out; 2586 2587 mtu = IPV6_MIN_MTU; 2588 2589 rcu_read_lock(); 2590 idev = __in6_dev_get(dst->dev); 2591 if (idev) 2592 mtu = idev->cnf.mtu6; 2593 rcu_read_unlock(); 2594 2595 out: 2596 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 2597 2598 return mtu - lwtunnel_headroom(dst->lwtstate, mtu); 2599 } 2600 2601 /* MTU selection: 2602 * 1. 
mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device
 *
 * based on ip6_dst_mtu_forward and exception logic of
 * rt6_find_cached_rt; called with rcu_read_lock
 */
u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
		      struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct in6_addr *src_key;
	struct inet6_dev *idev;
	u32 mtu = 0;

	if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
		mtu = f6i->fib6_pmtu;
		if (mtu)
			goto out;
	}

	src_key = NULL;
#ifdef CONFIG_IPV6_SUBTREES
	/* exceptions are keyed by source as well when subtrees are in use */
	if (f6i->fib6_src.plen)
		src_key = saddr;
#endif

	bucket = rcu_dereference(f6i->rt6i_exception_bucket);
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);

	if (likely(!mtu)) {
		/* no route/exception MTU: fall back to the egress device */
		struct net_device *dev = fib6_info_nh_dev(f6i);

		mtu = IPV6_MIN_MTU;
		idev = __in6_dev_get(dev);
		if (idev && idev->cnf.mtu6 > mtu)
			mtu = idev->cnf.mtu6;
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
out:
	return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
}

/* Allocate an uncached host route used as the dst for outgoing ICMPv6;
 * the route is placed on uncached_list so device unregister can reap it.
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		in6_dev_put(idev);
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output  = ip6_output;
	rt->rt6i_gateway  = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	rt->rt6i_idev     = idev;
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Add this dst into uncached_list so that rt6_disable_ip() can
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);
	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}

/* dst_ops garbage collector: run fib6 GC when past the rate limit or
 * over rt_max_size; returns nonzero while still over the limit.
 */
static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
	/* exponential decay of the GC aggressiveness */
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
	return entries > rt_max_size;
}

/* Convert netlink RTAX metrics from @cfg into a refcounted dst_metrics
 * block attached to @rt.  Returns 0 when no metrics were supplied.
 */
static int ip6_convert_metrics(struct net *net, struct fib6_info *rt,
			       struct fib6_config *cfg)
{
	struct dst_metrics *p;

	if (!cfg->fc_mx)
		return 0;

	p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL);
	if (unlikely(!p))
		return -ENOMEM;

	refcount_set(&p->refcnt, 1);
	rt->fib6_metrics = p;

	return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics);
}

/* Resolve a nexthop gateway in a specific table; NULL when the lookup
 * only hit the null entry (caller then falls back to a full lookup).
 */
static struct rt6_info *ip6_nh_lookup_table(struct net *net,
					    struct fib6_config *cfg,
					    const struct in6_addr *gw_addr,
					    u32 tbid, int flags)
{
	struct flowi6 fl6 = {
		.flowi6_oif = cfg->fc_ifindex,
		.daddr = *gw_addr,
		.saddr = cfg->fc_prefsrc,
	};
	struct fib6_table *table;
	struct rt6_info *rt;

	table = fib6_get_table(net, tbid);
	if (!table)
		return NULL;

	if (!ipv6_addr_any(&cfg->fc_prefsrc))
		flags |= RT6_LOOKUP_F_HAS_SADDR;

	flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);

	/* if table lookup failed, fall back to full lookup */
	if (rt == net->ipv6.ip6_null_entry) {
		ip6_rt_put(rt);
		rt = NULL;
	}

	return rt;
}

/* Validate an RTNH_F_ONLINK nexthop: the gateway must resolve through
 * @dev and must not be local/anycast/reject.
 */
static int ip6_route_check_nh_onlink(struct net *net,
				     struct fib6_config *cfg,
				     const struct net_device *dev,
				     struct netlink_ext_ack *extack)
{
	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
	struct rt6_info *grt;
	int err;

	err = 0;
	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
	if (grt) {
		if (!grt->dst.error &&
		    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop has invalid gateway or device mismatch");
			err = -EINVAL;
		}

		ip6_rt_put(grt);
	}

	return err;
}

/* Validate a (non-onlink) gateway nexthop, resolving the egress device
 * and idev into *_dev/*idev when the caller did not pin one.
 */
static int ip6_route_check_nh(struct net *net,
			      struct fib6_config *cfg,
			      struct net_device **_dev,
			      struct inet6_dev **idev)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	struct net_device *dev = _dev ?
*_dev : NULL;
	struct rt6_info *grt = NULL;
	int err = -EHOSTUNREACH;

	if (cfg->fc_table) {
		int flags = RT6_LOOKUP_F_IFACE;

		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
					  cfg->fc_table, flags);
		if (grt) {
			/* the table route must be direct (not via another
			 * gateway) and on the requested device, if any
			 */
			if (grt->rt6i_flags & RTF_GATEWAY ||
			    (dev && dev != grt->dst.dev)) {
				ip6_rt_put(grt);
				grt = NULL;
			}
		}
	}

	if (!grt)
		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);

	if (!grt)
		goto out;

	if (dev) {
		if (dev != grt->dst.dev) {
			ip6_rt_put(grt);
			goto out;
		}
	} else {
		/* caller did not pin a device: adopt the one the gateway
		 * resolved through, taking dev/idev references for it
		 */
		*_dev = dev = grt->dst.dev;
		*idev = grt->rt6i_idev;
		dev_hold(dev);
		in6_dev_hold(grt->rt6i_idev);
	}

	if (!(grt->rt6i_flags & RTF_GATEWAY))
		err = 0;

	ip6_rt_put(grt);

out:
	return err;
}

/* Validate the gateway of a new route (RTF_GATEWAY), resolving the
 * egress device/idev into *_dev/*idev as a side effect.  Returns 0 on
 * success or a negative errno with an extack message.
 */
static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
			   struct net_device **_dev, struct inet6_dev **idev,
			   struct netlink_ext_ack *extack)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	int gwa_type = ipv6_addr_type(gw_addr);
	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
	const struct net_device *dev = *_dev;
	bool need_addr_check = !dev;
	int err = -EINVAL;

	/* if gw_addr is local we will fail to detect this in case
	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
	 * will return already-added prefix route via interface that
	 * prefix route was assigned to, which might be non-loopback.
	 */
	if (dev &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
		/* IPv6 strictly inhibits using not link-local
		 * addresses as nexthop address.
		 * Otherwise, router will not able to send redirects.
		 * It is very good, but in some (rare!) circumstances
		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
		 * some exceptions. --ANK
		 * We allow IPv4-mapped nexthops to support RFC4798-type
		 * addressing
		 */
		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
			NL_SET_ERR_MSG(extack, "Invalid gateway address");
			goto out;
		}

		if (cfg->fc_flags & RTNH_F_ONLINK)
			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
		else
			err = ip6_route_check_nh(net, cfg, _dev, idev);

		if (err)
			goto out;
	}

	/* reload in case device was changed */
	dev = *_dev;

	err = -EINVAL;
	if (!dev) {
		NL_SET_ERR_MSG(extack, "Egress device not specified");
		goto out;
	} else if (dev->flags & IFF_LOOPBACK) {
		NL_SET_ERR_MSG(extack,
			       "Egress device can not be loopback device for this route");
		goto out;
	}

	/* if we did not check gw_addr above, do so now that the
	 * egress device has been resolved.
	 */
	if (need_addr_check &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	err = 0;
out:
	return err;
}

/* Build (but do not insert) a fib6_info from a fib6_config.  On success
 * the returned fib6_info holds the device reference; on failure an
 * ERR_PTR is returned and all acquired references are dropped.
 */
static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
					       gfp_t gfp_flags,
					       struct netlink_ext_ack *extack)
{
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct fib6_info *rt = NULL;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	struct fib6_table *table;
	int addr_type;
	int err = -EINVAL;

	/* RTF_PCPU is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_PCPU) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
		goto out;
	}

	/* RTF_CACHE is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_CACHE) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
		goto out;
	}

	if (cfg->fc_type > RTN_MAX) {
		NL_SET_ERR_MSG(extack, "Invalid route type");
		goto out;
	}

	if (cfg->fc_dst_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		goto out;
	}
	if (cfg->fc_src_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid source address length");
		goto out;
	}
#ifndef CONFIG_IPV6_SUBTREES
	if (cfg->fc_src_len) {
		NL_SET_ERR_MSG(extack,
			       "Specifying source address requires IPV6_SUBTREES to be enabled");
		goto out;
	}
#endif
	if (cfg->fc_ifindex) {
		err = -ENODEV;
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (cfg->fc_metric == 0)
		cfg->fc_metric = IP6_RT_PRIO_USER;

	if (cfg->fc_flags & RTNH_F_ONLINK) {
		if (!dev) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop device required for onlink");
			err = -ENODEV;
			goto out;
		}

		if (!(dev->flags & IFF_UP)) {
			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
			err = -ENETDOWN;
			goto out;
		}
	}

	err = -ENOBUFS;
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		table = fib6_get_table(net, cfg->fc_table);
		if (!table) {
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}

	if (!table)
		goto out;

	err = -ENOMEM;
	rt = fib6_info_alloc(gfp_flags);
	if (!rt)
		goto out;

	if (cfg->fc_flags & RTF_ADDRCONF)
		rt->dst_nocount = true;

	err = ip6_convert_metrics(net, rt, cfg);
	if (err < 0)
		goto out;

	if (cfg->fc_flags & RTF_EXPIRES)
		fib6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
	else
		fib6_clean_expires(rt);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->fib6_protocol = cfg->fc_protocol;

	addr_type = ipv6_addr_type(&cfg->fc_dst);

	if (cfg->fc_encap) {
		struct lwtunnel_state *lwtstate;

		err = lwtunnel_build_state(cfg->fc_encap_type,
					   cfg->fc_encap, AF_INET6, cfg,
					   &lwtstate, extack);
		if (err)
			goto out;
		rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
	}

	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->fib6_dst.plen = cfg->fc_dst_len;
	if (rt->fib6_dst.plen == 128)
		rt->dst_host = true;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->fib6_src.plen = cfg->fc_src_len;
#endif

	rt->fib6_metric = cfg->fc_metric;
	rt->fib6_nh.nh_weight = 1;

	rt->fib6_type = cfg->fc_type;

	/* We cannot add true routes via loopback here,
	   they would result in kernel looping; promote them to reject routes
	 */
	if ((cfg->fc_flags & RTF_REJECT) ||
	    (dev && (dev->flags & IFF_LOOPBACK) &&
	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
	     !(cfg->fc_flags & RTF_LOCAL))) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
		goto install_route;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
		if (err)
			goto out;

		rt->fib6_nh.nh_gw = cfg->fc_gateway;
	}

	err = -ENODEV;
	if (!dev)
		goto out;

	if (idev->cnf.disable_ipv6) {
		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
		err = -EACCES;
		goto out;
	}

	if (!(dev->flags & IFF_UP)) {
		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
		err = -ENETDOWN;
		goto out;
	}

	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid source address");
			err = -EINVAL;
			goto out;
		}
		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
		rt->fib6_prefsrc.plen = 128;
	} else
		rt->fib6_prefsrc.plen = 0;

	rt->fib6_flags = cfg->fc_flags;

install_route:
	if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
	    !netif_carrier_ok(dev))
		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
	rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
	rt->fib6_nh.nh_dev = dev;
	rt->fib6_table = table;

	if (idev)
		in6_dev_put(idev);

	return rt;
out:
	if (dev)
		dev_put(dev);
	if (idev)
		in6_dev_put(idev);

	fib6_info_release(rt);
	return ERR_PTR(err);
}
3135 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, 3136 struct netlink_ext_ack *extack) 3137 { 3138 struct fib6_info *rt; 3139 int err; 3140 3141 rt = ip6_route_info_create(cfg, gfp_flags, extack); 3142 if (IS_ERR(rt)) 3143 return PTR_ERR(rt); 3144 3145 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack); 3146 fib6_info_release(rt); 3147 3148 return err; 3149 } 3150 3151 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info) 3152 { 3153 struct net *net = info->nl_net; 3154 struct fib6_table *table; 3155 int err; 3156 3157 if (rt == net->ipv6.fib6_null_entry) { 3158 err = -ENOENT; 3159 goto out; 3160 } 3161 3162 table = rt->fib6_table; 3163 spin_lock_bh(&table->tb6_lock); 3164 err = fib6_del(rt, info); 3165 spin_unlock_bh(&table->tb6_lock); 3166 3167 out: 3168 fib6_info_release(rt); 3169 return err; 3170 } 3171 3172 int ip6_del_rt(struct net *net, struct fib6_info *rt) 3173 { 3174 struct nl_info info = { .nl_net = net }; 3175 3176 return __ip6_del_rt(rt, &info); 3177 } 3178 3179 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg) 3180 { 3181 struct nl_info *info = &cfg->fc_nlinfo; 3182 struct net *net = info->nl_net; 3183 struct sk_buff *skb = NULL; 3184 struct fib6_table *table; 3185 int err = -ENOENT; 3186 3187 if (rt == net->ipv6.fib6_null_entry) 3188 goto out_put; 3189 table = rt->fib6_table; 3190 spin_lock_bh(&table->tb6_lock); 3191 3192 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) { 3193 struct fib6_info *sibling, *next_sibling; 3194 3195 /* prefer to send a single notification with all hops */ 3196 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 3197 if (skb) { 3198 u32 seq = info->nlh ? 
info->nlh->nlmsg_seq : 0; 3199 3200 if (rt6_fill_node(net, skb, rt, NULL, 3201 NULL, NULL, 0, RTM_DELROUTE, 3202 info->portid, seq, 0) < 0) { 3203 kfree_skb(skb); 3204 skb = NULL; 3205 } else 3206 info->skip_notify = 1; 3207 } 3208 3209 list_for_each_entry_safe(sibling, next_sibling, 3210 &rt->fib6_siblings, 3211 fib6_siblings) { 3212 err = fib6_del(sibling, info); 3213 if (err) 3214 goto out_unlock; 3215 } 3216 } 3217 3218 err = fib6_del(rt, info); 3219 out_unlock: 3220 spin_unlock_bh(&table->tb6_lock); 3221 out_put: 3222 fib6_info_release(rt); 3223 3224 if (skb) { 3225 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 3226 info->nlh, gfp_any()); 3227 } 3228 return err; 3229 } 3230 3231 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) 3232 { 3233 int rc = -ESRCH; 3234 3235 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex) 3236 goto out; 3237 3238 if (cfg->fc_flags & RTF_GATEWAY && 3239 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) 3240 goto out; 3241 if (dst_hold_safe(&rt->dst)) 3242 rc = rt6_remove_exception_rt(rt); 3243 out: 3244 return rc; 3245 } 3246 3247 static int ip6_route_del(struct fib6_config *cfg, 3248 struct netlink_ext_ack *extack) 3249 { 3250 struct rt6_info *rt_cache; 3251 struct fib6_table *table; 3252 struct fib6_info *rt; 3253 struct fib6_node *fn; 3254 int err = -ESRCH; 3255 3256 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); 3257 if (!table) { 3258 NL_SET_ERR_MSG(extack, "FIB table does not exist"); 3259 return err; 3260 } 3261 3262 rcu_read_lock(); 3263 3264 fn = fib6_locate(&table->tb6_root, 3265 &cfg->fc_dst, cfg->fc_dst_len, 3266 &cfg->fc_src, cfg->fc_src_len, 3267 !(cfg->fc_flags & RTF_CACHE)); 3268 3269 if (fn) { 3270 for_each_fib6_node_rt_rcu(fn) { 3271 if (cfg->fc_flags & RTF_CACHE) { 3272 int rc; 3273 3274 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst, 3275 &cfg->fc_src); 3276 if (rt_cache) { 3277 rc = ip6_del_cached_rt(rt_cache, cfg); 3278 if (rc != -ESRCH) { 3279 
rcu_read_unlock(); 3280 return rc; 3281 } 3282 } 3283 continue; 3284 } 3285 if (cfg->fc_ifindex && 3286 (!rt->fib6_nh.nh_dev || 3287 rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex)) 3288 continue; 3289 if (cfg->fc_flags & RTF_GATEWAY && 3290 !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw)) 3291 continue; 3292 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric) 3293 continue; 3294 if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol) 3295 continue; 3296 if (!fib6_info_hold_safe(rt)) 3297 continue; 3298 rcu_read_unlock(); 3299 3300 /* if gateway was specified only delete the one hop */ 3301 if (cfg->fc_flags & RTF_GATEWAY) 3302 return __ip6_del_rt(rt, &cfg->fc_nlinfo); 3303 3304 return __ip6_del_rt_siblings(rt, cfg); 3305 } 3306 } 3307 rcu_read_unlock(); 3308 3309 return err; 3310 } 3311 3312 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) 3313 { 3314 struct netevent_redirect netevent; 3315 struct rt6_info *rt, *nrt = NULL; 3316 struct ndisc_options ndopts; 3317 struct inet6_dev *in6_dev; 3318 struct neighbour *neigh; 3319 struct fib6_info *from; 3320 struct rd_msg *msg; 3321 int optlen, on_link; 3322 u8 *lladdr; 3323 3324 optlen = skb_tail_pointer(skb) - skb_transport_header(skb); 3325 optlen -= sizeof(*msg); 3326 3327 if (optlen < 0) { 3328 net_dbg_ratelimited("rt6_do_redirect: packet too short\n"); 3329 return; 3330 } 3331 3332 msg = (struct rd_msg *)icmp6_hdr(skb); 3333 3334 if (ipv6_addr_is_multicast(&msg->dest)) { 3335 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n"); 3336 return; 3337 } 3338 3339 on_link = 0; 3340 if (ipv6_addr_equal(&msg->dest, &msg->target)) { 3341 on_link = 1; 3342 } else if (ipv6_addr_type(&msg->target) != 3343 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { 3344 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n"); 3345 return; 3346 } 3347 3348 in6_dev = __in6_dev_get(skb->dev); 3349 if (!in6_dev) 3350 return; 3351 if 
(in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects) 3352 return; 3353 3354 /* RFC2461 8.1: 3355 * The IP source address of the Redirect MUST be the same as the current 3356 * first-hop router for the specified ICMP Destination Address. 3357 */ 3358 3359 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) { 3360 net_dbg_ratelimited("rt6_redirect: invalid ND options\n"); 3361 return; 3362 } 3363 3364 lladdr = NULL; 3365 if (ndopts.nd_opts_tgt_lladdr) { 3366 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, 3367 skb->dev); 3368 if (!lladdr) { 3369 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n"); 3370 return; 3371 } 3372 } 3373 3374 rt = (struct rt6_info *) dst; 3375 if (rt->rt6i_flags & RTF_REJECT) { 3376 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n"); 3377 return; 3378 } 3379 3380 /* Redirect received -> path was valid. 3381 * Look, redirects are sent only in response to data packets, 3382 * so that this nexthop apparently is reachable. --ANK 3383 */ 3384 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr); 3385 3386 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1); 3387 if (!neigh) 3388 return; 3389 3390 /* 3391 * We have finally decided to accept it. 3392 */ 3393 3394 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, 3395 NEIGH_UPDATE_F_WEAK_OVERRIDE| 3396 NEIGH_UPDATE_F_OVERRIDE| 3397 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER| 3398 NEIGH_UPDATE_F_ISROUTER)), 3399 NDISC_REDIRECT, &ndopts); 3400 3401 rcu_read_lock(); 3402 from = rcu_dereference(rt->from); 3403 /* This fib6_info_hold() is safe here because we hold reference to rt 3404 * and rt already holds reference to fib6_info. 
3405 */ 3406 fib6_info_hold(from); 3407 rcu_read_unlock(); 3408 3409 nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL); 3410 if (!nrt) 3411 goto out; 3412 3413 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE; 3414 if (on_link) 3415 nrt->rt6i_flags &= ~RTF_GATEWAY; 3416 3417 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; 3418 3419 /* No need to remove rt from the exception table if rt is 3420 * a cached route because rt6_insert_exception() will 3421 * takes care of it 3422 */ 3423 if (rt6_insert_exception(nrt, from)) { 3424 dst_release_immediate(&nrt->dst); 3425 goto out; 3426 } 3427 3428 netevent.old = &rt->dst; 3429 netevent.new = &nrt->dst; 3430 netevent.daddr = &msg->dest; 3431 netevent.neigh = neigh; 3432 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); 3433 3434 out: 3435 fib6_info_release(from); 3436 neigh_release(neigh); 3437 } 3438 3439 #ifdef CONFIG_IPV6_ROUTE_INFO 3440 static struct fib6_info *rt6_get_route_info(struct net *net, 3441 const struct in6_addr *prefix, int prefixlen, 3442 const struct in6_addr *gwaddr, 3443 struct net_device *dev) 3444 { 3445 u32 tb_id = l3mdev_fib_table(dev) ? 
: RT6_TABLE_INFO; 3446 int ifindex = dev->ifindex; 3447 struct fib6_node *fn; 3448 struct fib6_info *rt = NULL; 3449 struct fib6_table *table; 3450 3451 table = fib6_get_table(net, tb_id); 3452 if (!table) 3453 return NULL; 3454 3455 rcu_read_lock(); 3456 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true); 3457 if (!fn) 3458 goto out; 3459 3460 for_each_fib6_node_rt_rcu(fn) { 3461 if (rt->fib6_nh.nh_dev->ifindex != ifindex) 3462 continue; 3463 if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY)) 3464 continue; 3465 if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr)) 3466 continue; 3467 if (!fib6_info_hold_safe(rt)) 3468 continue; 3469 break; 3470 } 3471 out: 3472 rcu_read_unlock(); 3473 return rt; 3474 } 3475 3476 static struct fib6_info *rt6_add_route_info(struct net *net, 3477 const struct in6_addr *prefix, int prefixlen, 3478 const struct in6_addr *gwaddr, 3479 struct net_device *dev, 3480 unsigned int pref) 3481 { 3482 struct fib6_config cfg = { 3483 .fc_metric = IP6_RT_PRIO_USER, 3484 .fc_ifindex = dev->ifindex, 3485 .fc_dst_len = prefixlen, 3486 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | 3487 RTF_UP | RTF_PREF(pref), 3488 .fc_protocol = RTPROT_RA, 3489 .fc_type = RTN_UNICAST, 3490 .fc_nlinfo.portid = 0, 3491 .fc_nlinfo.nlh = NULL, 3492 .fc_nlinfo.nl_net = net, 3493 }; 3494 3495 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO, 3496 cfg.fc_dst = *prefix; 3497 cfg.fc_gateway = *gwaddr; 3498 3499 /* We should treat it as a default route if prefix length is 0. */ 3500 if (!prefixlen) 3501 cfg.fc_flags |= RTF_DEFAULT; 3502 3503 ip6_route_add(&cfg, GFP_ATOMIC, NULL); 3504 3505 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev); 3506 } 3507 #endif 3508 3509 struct fib6_info *rt6_get_dflt_router(struct net *net, 3510 const struct in6_addr *addr, 3511 struct net_device *dev) 3512 { 3513 u32 tb_id = l3mdev_fib_table(dev) ? 
: RT6_TABLE_DFLT; 3514 struct fib6_info *rt; 3515 struct fib6_table *table; 3516 3517 table = fib6_get_table(net, tb_id); 3518 if (!table) 3519 return NULL; 3520 3521 rcu_read_lock(); 3522 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3523 if (dev == rt->fib6_nh.nh_dev && 3524 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && 3525 ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr)) 3526 break; 3527 } 3528 if (rt && !fib6_info_hold_safe(rt)) 3529 rt = NULL; 3530 rcu_read_unlock(); 3531 return rt; 3532 } 3533 3534 struct fib6_info *rt6_add_dflt_router(struct net *net, 3535 const struct in6_addr *gwaddr, 3536 struct net_device *dev, 3537 unsigned int pref) 3538 { 3539 struct fib6_config cfg = { 3540 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT, 3541 .fc_metric = IP6_RT_PRIO_USER, 3542 .fc_ifindex = dev->ifindex, 3543 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | 3544 RTF_UP | RTF_EXPIRES | RTF_PREF(pref), 3545 .fc_protocol = RTPROT_RA, 3546 .fc_type = RTN_UNICAST, 3547 .fc_nlinfo.portid = 0, 3548 .fc_nlinfo.nlh = NULL, 3549 .fc_nlinfo.nl_net = net, 3550 }; 3551 3552 cfg.fc_gateway = *gwaddr; 3553 3554 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) { 3555 struct fib6_table *table; 3556 3557 table = fib6_get_table(dev_net(dev), cfg.fc_table); 3558 if (table) 3559 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER; 3560 } 3561 3562 return rt6_get_dflt_router(net, gwaddr, dev); 3563 } 3564 3565 static void __rt6_purge_dflt_routers(struct net *net, 3566 struct fib6_table *table) 3567 { 3568 struct fib6_info *rt; 3569 3570 restart: 3571 rcu_read_lock(); 3572 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3573 struct net_device *dev = fib6_info_nh_dev(rt); 3574 struct inet6_dev *idev = dev ? 
__in6_dev_get(dev) : NULL; 3575 3576 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) && 3577 (!idev || idev->cnf.accept_ra != 2) && 3578 fib6_info_hold_safe(rt)) { 3579 rcu_read_unlock(); 3580 ip6_del_rt(net, rt); 3581 goto restart; 3582 } 3583 } 3584 rcu_read_unlock(); 3585 3586 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER; 3587 } 3588 3589 void rt6_purge_dflt_routers(struct net *net) 3590 { 3591 struct fib6_table *table; 3592 struct hlist_head *head; 3593 unsigned int h; 3594 3595 rcu_read_lock(); 3596 3597 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { 3598 head = &net->ipv6.fib_table_hash[h]; 3599 hlist_for_each_entry_rcu(table, head, tb6_hlist) { 3600 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER) 3601 __rt6_purge_dflt_routers(net, table); 3602 } 3603 } 3604 3605 rcu_read_unlock(); 3606 } 3607 3608 static void rtmsg_to_fib6_config(struct net *net, 3609 struct in6_rtmsg *rtmsg, 3610 struct fib6_config *cfg) 3611 { 3612 memset(cfg, 0, sizeof(*cfg)); 3613 3614 cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? 
3615 : RT6_TABLE_MAIN; 3616 cfg->fc_ifindex = rtmsg->rtmsg_ifindex; 3617 cfg->fc_metric = rtmsg->rtmsg_metric; 3618 cfg->fc_expires = rtmsg->rtmsg_info; 3619 cfg->fc_dst_len = rtmsg->rtmsg_dst_len; 3620 cfg->fc_src_len = rtmsg->rtmsg_src_len; 3621 cfg->fc_flags = rtmsg->rtmsg_flags; 3622 cfg->fc_type = rtmsg->rtmsg_type; 3623 3624 cfg->fc_nlinfo.nl_net = net; 3625 3626 cfg->fc_dst = rtmsg->rtmsg_dst; 3627 cfg->fc_src = rtmsg->rtmsg_src; 3628 cfg->fc_gateway = rtmsg->rtmsg_gateway; 3629 } 3630 3631 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) 3632 { 3633 struct fib6_config cfg; 3634 struct in6_rtmsg rtmsg; 3635 int err; 3636 3637 switch (cmd) { 3638 case SIOCADDRT: /* Add a route */ 3639 case SIOCDELRT: /* Delete a route */ 3640 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 3641 return -EPERM; 3642 err = copy_from_user(&rtmsg, arg, 3643 sizeof(struct in6_rtmsg)); 3644 if (err) 3645 return -EFAULT; 3646 3647 rtmsg_to_fib6_config(net, &rtmsg, &cfg); 3648 3649 rtnl_lock(); 3650 switch (cmd) { 3651 case SIOCADDRT: 3652 err = ip6_route_add(&cfg, GFP_KERNEL, NULL); 3653 break; 3654 case SIOCDELRT: 3655 err = ip6_route_del(&cfg, NULL); 3656 break; 3657 default: 3658 err = -EINVAL; 3659 } 3660 rtnl_unlock(); 3661 3662 return err; 3663 } 3664 3665 return -EINVAL; 3666 } 3667 3668 /* 3669 * Drop the packet on the floor 3670 */ 3671 3672 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) 3673 { 3674 int type; 3675 struct dst_entry *dst = skb_dst(skb); 3676 switch (ipstats_mib_noroutes) { 3677 case IPSTATS_MIB_INNOROUTES: 3678 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); 3679 if (type == IPV6_ADDR_ANY) { 3680 IP6_INC_STATS(dev_net(dst->dev), 3681 __in6_dev_get_safely(skb->dev), 3682 IPSTATS_MIB_INADDRERRORS); 3683 break; 3684 } 3685 /* FALLTHROUGH */ 3686 case IPSTATS_MIB_OUTNOROUTES: 3687 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), 3688 ipstats_mib_noroutes); 3689 break; 3690 } 3691 icmpv6_send(skb, 
ICMPV6_DEST_UNREACH, code, 0); 3692 kfree_skb(skb); 3693 return 0; 3694 } 3695 3696 static int ip6_pkt_discard(struct sk_buff *skb) 3697 { 3698 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES); 3699 } 3700 3701 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3702 { 3703 skb->dev = skb_dst(skb)->dev; 3704 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); 3705 } 3706 3707 static int ip6_pkt_prohibit(struct sk_buff *skb) 3708 { 3709 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES); 3710 } 3711 3712 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3713 { 3714 skb->dev = skb_dst(skb)->dev; 3715 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); 3716 } 3717 3718 /* 3719 * Allocate a dst for local (unicast / anycast) address. 3720 */ 3721 3722 struct fib6_info *addrconf_f6i_alloc(struct net *net, 3723 struct inet6_dev *idev, 3724 const struct in6_addr *addr, 3725 bool anycast, gfp_t gfp_flags) 3726 { 3727 u32 tb_id; 3728 struct net_device *dev = idev->dev; 3729 struct fib6_info *f6i; 3730 3731 f6i = fib6_info_alloc(gfp_flags); 3732 if (!f6i) 3733 return ERR_PTR(-ENOMEM); 3734 3735 f6i->dst_nocount = true; 3736 f6i->dst_host = true; 3737 f6i->fib6_protocol = RTPROT_KERNEL; 3738 f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP; 3739 if (anycast) { 3740 f6i->fib6_type = RTN_ANYCAST; 3741 f6i->fib6_flags |= RTF_ANYCAST; 3742 } else { 3743 f6i->fib6_type = RTN_LOCAL; 3744 f6i->fib6_flags |= RTF_LOCAL; 3745 } 3746 3747 f6i->fib6_nh.nh_gw = *addr; 3748 dev_hold(dev); 3749 f6i->fib6_nh.nh_dev = dev; 3750 f6i->fib6_dst.addr = *addr; 3751 f6i->fib6_dst.plen = 128; 3752 tb_id = l3mdev_fib_table(idev->dev) ? 
: RT6_TABLE_LOCAL; 3753 f6i->fib6_table = fib6_get_table(net, tb_id); 3754 3755 return f6i; 3756 } 3757 3758 /* remove deleted ip from prefsrc entries */ 3759 struct arg_dev_net_ip { 3760 struct net_device *dev; 3761 struct net *net; 3762 struct in6_addr *addr; 3763 }; 3764 3765 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg) 3766 { 3767 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev; 3768 struct net *net = ((struct arg_dev_net_ip *)arg)->net; 3769 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; 3770 3771 if (((void *)rt->fib6_nh.nh_dev == dev || !dev) && 3772 rt != net->ipv6.fib6_null_entry && 3773 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) { 3774 spin_lock_bh(&rt6_exception_lock); 3775 /* remove prefsrc entry */ 3776 rt->fib6_prefsrc.plen = 0; 3777 spin_unlock_bh(&rt6_exception_lock); 3778 } 3779 return 0; 3780 } 3781 3782 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) 3783 { 3784 struct net *net = dev_net(ifp->idev->dev); 3785 struct arg_dev_net_ip adni = { 3786 .dev = ifp->idev->dev, 3787 .net = net, 3788 .addr = &ifp->addr, 3789 }; 3790 fib6_clean_all(net, fib6_remove_prefsrc, &adni); 3791 } 3792 3793 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY) 3794 3795 /* Remove routers and update dst entries when gateway turn into host. */ 3796 static int fib6_clean_tohost(struct fib6_info *rt, void *arg) 3797 { 3798 struct in6_addr *gateway = (struct in6_addr *)arg; 3799 3800 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && 3801 ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) { 3802 return -1; 3803 } 3804 3805 /* Further clean up cached routes in exception table. 3806 * This is needed because cached route may have a different 3807 * gateway than its 'parent' in the case of an ip redirect. 
3808 */ 3809 rt6_exceptions_clean_tohost(rt, gateway); 3810 3811 return 0; 3812 } 3813 3814 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) 3815 { 3816 fib6_clean_all(net, fib6_clean_tohost, gateway); 3817 } 3818 3819 struct arg_netdev_event { 3820 const struct net_device *dev; 3821 union { 3822 unsigned int nh_flags; 3823 unsigned long event; 3824 }; 3825 }; 3826 3827 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) 3828 { 3829 struct fib6_info *iter; 3830 struct fib6_node *fn; 3831 3832 fn = rcu_dereference_protected(rt->fib6_node, 3833 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3834 iter = rcu_dereference_protected(fn->leaf, 3835 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3836 while (iter) { 3837 if (iter->fib6_metric == rt->fib6_metric && 3838 rt6_qualify_for_ecmp(iter)) 3839 return iter; 3840 iter = rcu_dereference_protected(iter->fib6_next, 3841 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3842 } 3843 3844 return NULL; 3845 } 3846 3847 static bool rt6_is_dead(const struct fib6_info *rt) 3848 { 3849 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD || 3850 (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN && 3851 fib6_ignore_linkdown(rt))) 3852 return true; 3853 3854 return false; 3855 } 3856 3857 static int rt6_multipath_total_weight(const struct fib6_info *rt) 3858 { 3859 struct fib6_info *iter; 3860 int total = 0; 3861 3862 if (!rt6_is_dead(rt)) 3863 total += rt->fib6_nh.nh_weight; 3864 3865 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { 3866 if (!rt6_is_dead(iter)) 3867 total += iter->fib6_nh.nh_weight; 3868 } 3869 3870 return total; 3871 } 3872 3873 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total) 3874 { 3875 int upper_bound = -1; 3876 3877 if (!rt6_is_dead(rt)) { 3878 *weight += rt->fib6_nh.nh_weight; 3879 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, 3880 total) - 1; 3881 } 3882 atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound); 3883 } 3884 3885 static void 
rt6_multipath_upper_bound_set(struct fib6_info *rt, int total) 3886 { 3887 struct fib6_info *iter; 3888 int weight = 0; 3889 3890 rt6_upper_bound_set(rt, &weight, total); 3891 3892 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3893 rt6_upper_bound_set(iter, &weight, total); 3894 } 3895 3896 void rt6_multipath_rebalance(struct fib6_info *rt) 3897 { 3898 struct fib6_info *first; 3899 int total; 3900 3901 /* In case the entire multipath route was marked for flushing, 3902 * then there is no need to rebalance upon the removal of every 3903 * sibling route. 3904 */ 3905 if (!rt->fib6_nsiblings || rt->should_flush) 3906 return; 3907 3908 /* During lookup routes are evaluated in order, so we need to 3909 * make sure upper bounds are assigned from the first sibling 3910 * onwards. 3911 */ 3912 first = rt6_multipath_first_sibling(rt); 3913 if (WARN_ON_ONCE(!first)) 3914 return; 3915 3916 total = rt6_multipath_total_weight(first); 3917 rt6_multipath_upper_bound_set(first, total); 3918 } 3919 3920 static int fib6_ifup(struct fib6_info *rt, void *p_arg) 3921 { 3922 const struct arg_netdev_event *arg = p_arg; 3923 struct net *net = dev_net(arg->dev); 3924 3925 if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) { 3926 rt->fib6_nh.nh_flags &= ~arg->nh_flags; 3927 fib6_update_sernum_upto_root(net, rt); 3928 rt6_multipath_rebalance(rt); 3929 } 3930 3931 return 0; 3932 } 3933 3934 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags) 3935 { 3936 struct arg_netdev_event arg = { 3937 .dev = dev, 3938 { 3939 .nh_flags = nh_flags, 3940 }, 3941 }; 3942 3943 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev)) 3944 arg.nh_flags |= RTNH_F_LINKDOWN; 3945 3946 fib6_clean_all(dev_net(dev), fib6_ifup, &arg); 3947 } 3948 3949 static bool rt6_multipath_uses_dev(const struct fib6_info *rt, 3950 const struct net_device *dev) 3951 { 3952 struct fib6_info *iter; 3953 3954 if (rt->fib6_nh.nh_dev == dev) 3955 return true; 3956 list_for_each_entry(iter, 
&rt->fib6_siblings, fib6_siblings) 3957 if (iter->fib6_nh.nh_dev == dev) 3958 return true; 3959 3960 return false; 3961 } 3962 3963 static void rt6_multipath_flush(struct fib6_info *rt) 3964 { 3965 struct fib6_info *iter; 3966 3967 rt->should_flush = 1; 3968 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3969 iter->should_flush = 1; 3970 } 3971 3972 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt, 3973 const struct net_device *down_dev) 3974 { 3975 struct fib6_info *iter; 3976 unsigned int dead = 0; 3977 3978 if (rt->fib6_nh.nh_dev == down_dev || 3979 rt->fib6_nh.nh_flags & RTNH_F_DEAD) 3980 dead++; 3981 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3982 if (iter->fib6_nh.nh_dev == down_dev || 3983 iter->fib6_nh.nh_flags & RTNH_F_DEAD) 3984 dead++; 3985 3986 return dead; 3987 } 3988 3989 static void rt6_multipath_nh_flags_set(struct fib6_info *rt, 3990 const struct net_device *dev, 3991 unsigned int nh_flags) 3992 { 3993 struct fib6_info *iter; 3994 3995 if (rt->fib6_nh.nh_dev == dev) 3996 rt->fib6_nh.nh_flags |= nh_flags; 3997 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 3998 if (iter->fib6_nh.nh_dev == dev) 3999 iter->fib6_nh.nh_flags |= nh_flags; 4000 } 4001 4002 /* called with write lock held for table with rt */ 4003 static int fib6_ifdown(struct fib6_info *rt, void *p_arg) 4004 { 4005 const struct arg_netdev_event *arg = p_arg; 4006 const struct net_device *dev = arg->dev; 4007 struct net *net = dev_net(dev); 4008 4009 if (rt == net->ipv6.fib6_null_entry) 4010 return 0; 4011 4012 switch (arg->event) { 4013 case NETDEV_UNREGISTER: 4014 return rt->fib6_nh.nh_dev == dev ? -1 : 0; 4015 case NETDEV_DOWN: 4016 if (rt->should_flush) 4017 return -1; 4018 if (!rt->fib6_nsiblings) 4019 return rt->fib6_nh.nh_dev == dev ? 
-1 : 0; 4020 if (rt6_multipath_uses_dev(rt, dev)) { 4021 unsigned int count; 4022 4023 count = rt6_multipath_dead_count(rt, dev); 4024 if (rt->fib6_nsiblings + 1 == count) { 4025 rt6_multipath_flush(rt); 4026 return -1; 4027 } 4028 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD | 4029 RTNH_F_LINKDOWN); 4030 fib6_update_sernum(net, rt); 4031 rt6_multipath_rebalance(rt); 4032 } 4033 return -2; 4034 case NETDEV_CHANGE: 4035 if (rt->fib6_nh.nh_dev != dev || 4036 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) 4037 break; 4038 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN; 4039 rt6_multipath_rebalance(rt); 4040 break; 4041 } 4042 4043 return 0; 4044 } 4045 4046 void rt6_sync_down_dev(struct net_device *dev, unsigned long event) 4047 { 4048 struct arg_netdev_event arg = { 4049 .dev = dev, 4050 { 4051 .event = event, 4052 }, 4053 }; 4054 4055 fib6_clean_all(dev_net(dev), fib6_ifdown, &arg); 4056 } 4057 4058 void rt6_disable_ip(struct net_device *dev, unsigned long event) 4059 { 4060 rt6_sync_down_dev(dev, event); 4061 rt6_uncached_list_flush_dev(dev_net(dev), dev); 4062 neigh_ifdown(&nd_tbl, dev); 4063 } 4064 4065 struct rt6_mtu_change_arg { 4066 struct net_device *dev; 4067 unsigned int mtu; 4068 }; 4069 4070 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg) 4071 { 4072 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; 4073 struct inet6_dev *idev; 4074 4075 /* In IPv6 pmtu discovery is not optional, 4076 so that RTAX_MTU lock cannot disable it. 4077 We still use this lock to block changes 4078 caused by addrconf/ndisc. 4079 */ 4080 4081 idev = __in6_dev_get(arg->dev); 4082 if (!idev) 4083 return 0; 4084 4085 /* For administrative MTU increase, there is no way to discover 4086 IPv6 PMTU increase, so PMTU increase should be updated here. 4087 Since RFC 1981 doesn't include administrative MTU increase 4088 update PMTU increase is a MUST. (i.e. 
jumbo frame) 4089 */ 4090 if (rt->fib6_nh.nh_dev == arg->dev && 4091 !fib6_metric_locked(rt, RTAX_MTU)) { 4092 u32 mtu = rt->fib6_pmtu; 4093 4094 if (mtu >= arg->mtu || 4095 (mtu < arg->mtu && mtu == idev->cnf.mtu6)) 4096 fib6_metric_set(rt, RTAX_MTU, arg->mtu); 4097 4098 spin_lock_bh(&rt6_exception_lock); 4099 rt6_exceptions_update_pmtu(idev, rt, arg->mtu); 4100 spin_unlock_bh(&rt6_exception_lock); 4101 } 4102 return 0; 4103 } 4104 4105 void rt6_mtu_change(struct net_device *dev, unsigned int mtu) 4106 { 4107 struct rt6_mtu_change_arg arg = { 4108 .dev = dev, 4109 .mtu = mtu, 4110 }; 4111 4112 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg); 4113 } 4114 4115 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { 4116 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) }, 4117 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) }, 4118 [RTA_OIF] = { .type = NLA_U32 }, 4119 [RTA_IIF] = { .type = NLA_U32 }, 4120 [RTA_PRIORITY] = { .type = NLA_U32 }, 4121 [RTA_METRICS] = { .type = NLA_NESTED }, 4122 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, 4123 [RTA_PREF] = { .type = NLA_U8 }, 4124 [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, 4125 [RTA_ENCAP] = { .type = NLA_NESTED }, 4126 [RTA_EXPIRES] = { .type = NLA_U32 }, 4127 [RTA_UID] = { .type = NLA_U32 }, 4128 [RTA_MARK] = { .type = NLA_U32 }, 4129 [RTA_TABLE] = { .type = NLA_U32 }, 4130 [RTA_IP_PROTO] = { .type = NLA_U8 }, 4131 [RTA_SPORT] = { .type = NLA_U16 }, 4132 [RTA_DPORT] = { .type = NLA_U16 }, 4133 }; 4134 4135 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, 4136 struct fib6_config *cfg, 4137 struct netlink_ext_ack *extack) 4138 { 4139 struct rtmsg *rtm; 4140 struct nlattr *tb[RTA_MAX+1]; 4141 unsigned int pref; 4142 int err; 4143 4144 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, 4145 NULL); 4146 if (err < 0) 4147 goto errout; 4148 4149 err = -EINVAL; 4150 rtm = nlmsg_data(nlh); 4151 memset(cfg, 0, sizeof(*cfg)); 4152 4153 cfg->fc_table = 
rtm->rtm_table; 4154 cfg->fc_dst_len = rtm->rtm_dst_len; 4155 cfg->fc_src_len = rtm->rtm_src_len; 4156 cfg->fc_flags = RTF_UP; 4157 cfg->fc_protocol = rtm->rtm_protocol; 4158 cfg->fc_type = rtm->rtm_type; 4159 4160 if (rtm->rtm_type == RTN_UNREACHABLE || 4161 rtm->rtm_type == RTN_BLACKHOLE || 4162 rtm->rtm_type == RTN_PROHIBIT || 4163 rtm->rtm_type == RTN_THROW) 4164 cfg->fc_flags |= RTF_REJECT; 4165 4166 if (rtm->rtm_type == RTN_LOCAL) 4167 cfg->fc_flags |= RTF_LOCAL; 4168 4169 if (rtm->rtm_flags & RTM_F_CLONED) 4170 cfg->fc_flags |= RTF_CACHE; 4171 4172 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK); 4173 4174 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid; 4175 cfg->fc_nlinfo.nlh = nlh; 4176 cfg->fc_nlinfo.nl_net = sock_net(skb->sk); 4177 4178 if (tb[RTA_GATEWAY]) { 4179 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); 4180 cfg->fc_flags |= RTF_GATEWAY; 4181 } 4182 4183 if (tb[RTA_DST]) { 4184 int plen = (rtm->rtm_dst_len + 7) >> 3; 4185 4186 if (nla_len(tb[RTA_DST]) < plen) 4187 goto errout; 4188 4189 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen); 4190 } 4191 4192 if (tb[RTA_SRC]) { 4193 int plen = (rtm->rtm_src_len + 7) >> 3; 4194 4195 if (nla_len(tb[RTA_SRC]) < plen) 4196 goto errout; 4197 4198 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen); 4199 } 4200 4201 if (tb[RTA_PREFSRC]) 4202 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]); 4203 4204 if (tb[RTA_OIF]) 4205 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]); 4206 4207 if (tb[RTA_PRIORITY]) 4208 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]); 4209 4210 if (tb[RTA_METRICS]) { 4211 cfg->fc_mx = nla_data(tb[RTA_METRICS]); 4212 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]); 4213 } 4214 4215 if (tb[RTA_TABLE]) 4216 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]); 4217 4218 if (tb[RTA_MULTIPATH]) { 4219 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]); 4220 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); 4221 4222 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp, 4223 cfg->fc_mp_len, extack); 4224 if (err < 0) 4225 goto errout; 4226 
} 4227 4228 if (tb[RTA_PREF]) { 4229 pref = nla_get_u8(tb[RTA_PREF]); 4230 if (pref != ICMPV6_ROUTER_PREF_LOW && 4231 pref != ICMPV6_ROUTER_PREF_HIGH) 4232 pref = ICMPV6_ROUTER_PREF_MEDIUM; 4233 cfg->fc_flags |= RTF_PREF(pref); 4234 } 4235 4236 if (tb[RTA_ENCAP]) 4237 cfg->fc_encap = tb[RTA_ENCAP]; 4238 4239 if (tb[RTA_ENCAP_TYPE]) { 4240 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); 4241 4242 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack); 4243 if (err < 0) 4244 goto errout; 4245 } 4246 4247 if (tb[RTA_EXPIRES]) { 4248 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ); 4249 4250 if (addrconf_finite_timeout(timeout)) { 4251 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ); 4252 cfg->fc_flags |= RTF_EXPIRES; 4253 } 4254 } 4255 4256 err = 0; 4257 errout: 4258 return err; 4259 } 4260 4261 struct rt6_nh { 4262 struct fib6_info *fib6_info; 4263 struct fib6_config r_cfg; 4264 struct list_head next; 4265 }; 4266 4267 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list) 4268 { 4269 struct rt6_nh *nh; 4270 4271 list_for_each_entry(nh, rt6_nh_list, next) { 4272 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n", 4273 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway, 4274 nh->r_cfg.fc_ifindex); 4275 } 4276 } 4277 4278 static int ip6_route_info_append(struct net *net, 4279 struct list_head *rt6_nh_list, 4280 struct fib6_info *rt, 4281 struct fib6_config *r_cfg) 4282 { 4283 struct rt6_nh *nh; 4284 int err = -EEXIST; 4285 4286 list_for_each_entry(nh, rt6_nh_list, next) { 4287 /* check if fib6_info already exists */ 4288 if (rt6_duplicate_nexthop(nh->fib6_info, rt)) 4289 return err; 4290 } 4291 4292 nh = kzalloc(sizeof(*nh), GFP_KERNEL); 4293 if (!nh) 4294 return -ENOMEM; 4295 nh->fib6_info = rt; 4296 err = ip6_convert_metrics(net, rt, r_cfg); 4297 if (err) { 4298 kfree(nh); 4299 return err; 4300 } 4301 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); 4302 
list_add_tail(&nh->next, rt6_nh_list); 4303 4304 return 0; 4305 } 4306 4307 static void ip6_route_mpath_notify(struct fib6_info *rt, 4308 struct fib6_info *rt_last, 4309 struct nl_info *info, 4310 __u16 nlflags) 4311 { 4312 /* if this is an APPEND route, then rt points to the first route 4313 * inserted and rt_last points to last route inserted. Userspace 4314 * wants a consistent dump of the route which starts at the first 4315 * nexthop. Since sibling routes are always added at the end of 4316 * the list, find the first sibling of the last route appended 4317 */ 4318 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) { 4319 rt = list_first_entry(&rt_last->fib6_siblings, 4320 struct fib6_info, 4321 fib6_siblings); 4322 } 4323 4324 if (rt) 4325 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); 4326 } 4327 4328 static int ip6_route_multipath_add(struct fib6_config *cfg, 4329 struct netlink_ext_ack *extack) 4330 { 4331 struct fib6_info *rt_notif = NULL, *rt_last = NULL; 4332 struct nl_info *info = &cfg->fc_nlinfo; 4333 struct fib6_config r_cfg; 4334 struct rtnexthop *rtnh; 4335 struct fib6_info *rt; 4336 struct rt6_nh *err_nh; 4337 struct rt6_nh *nh, *nh_safe; 4338 __u16 nlflags; 4339 int remaining; 4340 int attrlen; 4341 int err = 1; 4342 int nhn = 0; 4343 int replace = (cfg->fc_nlinfo.nlh && 4344 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); 4345 LIST_HEAD(rt6_nh_list); 4346 4347 nlflags = replace ? 
NLM_F_REPLACE : NLM_F_CREATE; 4348 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND) 4349 nlflags |= NLM_F_APPEND; 4350 4351 remaining = cfg->fc_mp_len; 4352 rtnh = (struct rtnexthop *)cfg->fc_mp; 4353 4354 /* Parse a Multipath Entry and build a list (rt6_nh_list) of 4355 * fib6_info structs per nexthop 4356 */ 4357 while (rtnh_ok(rtnh, remaining)) { 4358 memcpy(&r_cfg, cfg, sizeof(*cfg)); 4359 if (rtnh->rtnh_ifindex) 4360 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 4361 4362 attrlen = rtnh_attrlen(rtnh); 4363 if (attrlen > 0) { 4364 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 4365 4366 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 4367 if (nla) { 4368 r_cfg.fc_gateway = nla_get_in6_addr(nla); 4369 r_cfg.fc_flags |= RTF_GATEWAY; 4370 } 4371 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); 4372 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); 4373 if (nla) 4374 r_cfg.fc_encap_type = nla_get_u16(nla); 4375 } 4376 4377 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK); 4378 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack); 4379 if (IS_ERR(rt)) { 4380 err = PTR_ERR(rt); 4381 rt = NULL; 4382 goto cleanup; 4383 } 4384 if (!rt6_qualify_for_ecmp(rt)) { 4385 err = -EINVAL; 4386 NL_SET_ERR_MSG(extack, 4387 "Device only routes can not be added for IPv6 using the multipath API."); 4388 fib6_info_release(rt); 4389 goto cleanup; 4390 } 4391 4392 rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1; 4393 4394 err = ip6_route_info_append(info->nl_net, &rt6_nh_list, 4395 rt, &r_cfg); 4396 if (err) { 4397 fib6_info_release(rt); 4398 goto cleanup; 4399 } 4400 4401 rtnh = rtnh_next(rtnh, &remaining); 4402 } 4403 4404 /* for add and replace send one notification with all nexthops. 
4405 * Skip the notification in fib6_add_rt2node and send one with 4406 * the full route when done 4407 */ 4408 info->skip_notify = 1; 4409 4410 err_nh = NULL; 4411 list_for_each_entry(nh, &rt6_nh_list, next) { 4412 err = __ip6_ins_rt(nh->fib6_info, info, extack); 4413 fib6_info_release(nh->fib6_info); 4414 4415 if (!err) { 4416 /* save reference to last route successfully inserted */ 4417 rt_last = nh->fib6_info; 4418 4419 /* save reference to first route for notification */ 4420 if (!rt_notif) 4421 rt_notif = nh->fib6_info; 4422 } 4423 4424 /* nh->fib6_info is used or freed at this point, reset to NULL*/ 4425 nh->fib6_info = NULL; 4426 if (err) { 4427 if (replace && nhn) 4428 ip6_print_replace_route_err(&rt6_nh_list); 4429 err_nh = nh; 4430 goto add_errout; 4431 } 4432 4433 /* Because each route is added like a single route we remove 4434 * these flags after the first nexthop: if there is a collision, 4435 * we have already failed to add the first nexthop: 4436 * fib6_add_rt2node() has rejected it; when replacing, old 4437 * nexthops have been replaced by first new, the rest should 4438 * be added to it. 4439 */ 4440 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | 4441 NLM_F_REPLACE); 4442 nhn++; 4443 } 4444 4445 /* success ... 
tell user about new route */ 4446 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 4447 goto cleanup; 4448 4449 add_errout: 4450 /* send notification for routes that were added so that 4451 * the delete notifications sent by ip6_route_del are 4452 * coherent 4453 */ 4454 if (rt_notif) 4455 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 4456 4457 /* Delete routes that were already added */ 4458 list_for_each_entry(nh, &rt6_nh_list, next) { 4459 if (err_nh == nh) 4460 break; 4461 ip6_route_del(&nh->r_cfg, extack); 4462 } 4463 4464 cleanup: 4465 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { 4466 if (nh->fib6_info) 4467 fib6_info_release(nh->fib6_info); 4468 list_del(&nh->next); 4469 kfree(nh); 4470 } 4471 4472 return err; 4473 } 4474 4475 static int ip6_route_multipath_del(struct fib6_config *cfg, 4476 struct netlink_ext_ack *extack) 4477 { 4478 struct fib6_config r_cfg; 4479 struct rtnexthop *rtnh; 4480 int remaining; 4481 int attrlen; 4482 int err = 1, last_err = 0; 4483 4484 remaining = cfg->fc_mp_len; 4485 rtnh = (struct rtnexthop *)cfg->fc_mp; 4486 4487 /* Parse a Multipath Entry */ 4488 while (rtnh_ok(rtnh, remaining)) { 4489 memcpy(&r_cfg, cfg, sizeof(*cfg)); 4490 if (rtnh->rtnh_ifindex) 4491 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 4492 4493 attrlen = rtnh_attrlen(rtnh); 4494 if (attrlen > 0) { 4495 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 4496 4497 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 4498 if (nla) { 4499 nla_memcpy(&r_cfg.fc_gateway, nla, 16); 4500 r_cfg.fc_flags |= RTF_GATEWAY; 4501 } 4502 } 4503 err = ip6_route_del(&r_cfg, extack); 4504 if (err) 4505 last_err = err; 4506 4507 rtnh = rtnh_next(rtnh, &remaining); 4508 } 4509 4510 return last_err; 4511 } 4512 4513 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, 4514 struct netlink_ext_ack *extack) 4515 { 4516 struct fib6_config cfg; 4517 int err; 4518 4519 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 4520 if (err < 0) 4521 return err; 
4522 4523 if (cfg.fc_mp) 4524 return ip6_route_multipath_del(&cfg, extack); 4525 else { 4526 cfg.fc_delete_all_nh = 1; 4527 return ip6_route_del(&cfg, extack); 4528 } 4529 } 4530 4531 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, 4532 struct netlink_ext_ack *extack) 4533 { 4534 struct fib6_config cfg; 4535 int err; 4536 4537 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 4538 if (err < 0) 4539 return err; 4540 4541 if (cfg.fc_mp) 4542 return ip6_route_multipath_add(&cfg, extack); 4543 else 4544 return ip6_route_add(&cfg, GFP_KERNEL, extack); 4545 } 4546 4547 static size_t rt6_nlmsg_size(struct fib6_info *rt) 4548 { 4549 int nexthop_len = 0; 4550 4551 if (rt->fib6_nsiblings) { 4552 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ 4553 + NLA_ALIGN(sizeof(struct rtnexthop)) 4554 + nla_total_size(16) /* RTA_GATEWAY */ 4555 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate); 4556 4557 nexthop_len *= rt->fib6_nsiblings; 4558 } 4559 4560 return NLMSG_ALIGN(sizeof(struct rtmsg)) 4561 + nla_total_size(16) /* RTA_SRC */ 4562 + nla_total_size(16) /* RTA_DST */ 4563 + nla_total_size(16) /* RTA_GATEWAY */ 4564 + nla_total_size(16) /* RTA_PREFSRC */ 4565 + nla_total_size(4) /* RTA_TABLE */ 4566 + nla_total_size(4) /* RTA_IIF */ 4567 + nla_total_size(4) /* RTA_OIF */ 4568 + nla_total_size(4) /* RTA_PRIORITY */ 4569 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */ 4570 + nla_total_size(sizeof(struct rta_cacheinfo)) 4571 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ 4572 + nla_total_size(1) /* RTA_PREF */ 4573 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate) 4574 + nexthop_len; 4575 } 4576 4577 static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt, 4578 unsigned int *flags, bool skip_oif) 4579 { 4580 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) 4581 *flags |= RTNH_F_DEAD; 4582 4583 if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) { 4584 *flags |= RTNH_F_LINKDOWN; 4585 4586 rcu_read_lock(); 4587 if (fib6_ignore_linkdown(rt)) 4588 
*flags |= RTNH_F_DEAD; 4589 rcu_read_unlock(); 4590 } 4591 4592 if (rt->fib6_flags & RTF_GATEWAY) { 4593 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0) 4594 goto nla_put_failure; 4595 } 4596 4597 *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK); 4598 if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD) 4599 *flags |= RTNH_F_OFFLOAD; 4600 4601 /* not needed for multipath encoding b/c it has a rtnexthop struct */ 4602 if (!skip_oif && rt->fib6_nh.nh_dev && 4603 nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex)) 4604 goto nla_put_failure; 4605 4606 if (rt->fib6_nh.nh_lwtstate && 4607 lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0) 4608 goto nla_put_failure; 4609 4610 return 0; 4611 4612 nla_put_failure: 4613 return -EMSGSIZE; 4614 } 4615 4616 /* add multipath next hop */ 4617 static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt) 4618 { 4619 const struct net_device *dev = rt->fib6_nh.nh_dev; 4620 struct rtnexthop *rtnh; 4621 unsigned int flags = 0; 4622 4623 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh)); 4624 if (!rtnh) 4625 goto nla_put_failure; 4626 4627 rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1; 4628 rtnh->rtnh_ifindex = dev ? 
dev->ifindex : 0; 4629 4630 if (rt6_nexthop_info(skb, rt, &flags, true) < 0) 4631 goto nla_put_failure; 4632 4633 rtnh->rtnh_flags = flags; 4634 4635 /* length of rtnetlink header + attributes */ 4636 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh; 4637 4638 return 0; 4639 4640 nla_put_failure: 4641 return -EMSGSIZE; 4642 } 4643 4644 static int rt6_fill_node(struct net *net, struct sk_buff *skb, 4645 struct fib6_info *rt, struct dst_entry *dst, 4646 struct in6_addr *dest, struct in6_addr *src, 4647 int iif, int type, u32 portid, u32 seq, 4648 unsigned int flags) 4649 { 4650 struct rt6_info *rt6 = (struct rt6_info *)dst; 4651 struct rt6key *rt6_dst, *rt6_src; 4652 u32 *pmetrics, table, rt6_flags; 4653 struct nlmsghdr *nlh; 4654 struct rtmsg *rtm; 4655 long expires = 0; 4656 4657 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags); 4658 if (!nlh) 4659 return -EMSGSIZE; 4660 4661 if (rt6) { 4662 rt6_dst = &rt6->rt6i_dst; 4663 rt6_src = &rt6->rt6i_src; 4664 rt6_flags = rt6->rt6i_flags; 4665 } else { 4666 rt6_dst = &rt->fib6_dst; 4667 rt6_src = &rt->fib6_src; 4668 rt6_flags = rt->fib6_flags; 4669 } 4670 4671 rtm = nlmsg_data(nlh); 4672 rtm->rtm_family = AF_INET6; 4673 rtm->rtm_dst_len = rt6_dst->plen; 4674 rtm->rtm_src_len = rt6_src->plen; 4675 rtm->rtm_tos = 0; 4676 if (rt->fib6_table) 4677 table = rt->fib6_table->tb6_id; 4678 else 4679 table = RT6_TABLE_UNSPEC; 4680 rtm->rtm_table = table; 4681 if (nla_put_u32(skb, RTA_TABLE, table)) 4682 goto nla_put_failure; 4683 4684 rtm->rtm_type = rt->fib6_type; 4685 rtm->rtm_flags = 0; 4686 rtm->rtm_scope = RT_SCOPE_UNIVERSE; 4687 rtm->rtm_protocol = rt->fib6_protocol; 4688 4689 if (rt6_flags & RTF_CACHE) 4690 rtm->rtm_flags |= RTM_F_CLONED; 4691 4692 if (dest) { 4693 if (nla_put_in6_addr(skb, RTA_DST, dest)) 4694 goto nla_put_failure; 4695 rtm->rtm_dst_len = 128; 4696 } else if (rtm->rtm_dst_len) 4697 if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr)) 4698 goto nla_put_failure; 4699 #ifdef CONFIG_IPV6_SUBTREES 
	/* Source-routing subtree key, mirroring the RTA_DST handling. */
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		/* Multicast destinations are answered by the mroute code;
		 * err == 0 means it completed the message itself.
		 */
		if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dest) {
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	/* Route-configured preferred source address, if any. */
	if (rt->fib6_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->fib6_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt6) {
		if (rt6_flags & RTF_GATEWAY &&
		    nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
			goto nla_put_failure;

		if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
			goto nla_put_failure;
	} else if (rt->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		/* the fib entry itself is the first nexthop ... */
		if (rt6_add_nexthop(skb, rt) < 0)
			goto nla_put_failure;

		/* ... followed by every sibling of the multipath set */
		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings, fib6_siblings) {
			if (rt6_add_nexthop(skb, sibling) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
			goto nla_put_failure;
	}

	/* Convert the absolute expiry (in jiffies) to remaining time. */
	if (rt6_flags & RTF_EXPIRES) {
		expires = dst ? dst->expires : rt->expires;
		expires -= jiffies;
	}

	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
		goto nla_put_failure;

	/* Router preference bits extracted from the route flags. */
	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
		goto nla_put_failure;


	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

/* Per-route callback for RTM_GETROUTE dumps.
 * Returns 0 to continue the dump, 1 to skip this entry (non-prefix
 * route filtered out by RTM_F_PREFIX), or the rt6_fill_node() error.
 */
int rt6_dump_route(struct fib6_info *rt, void *p_arg)
{
	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
	struct net *net = arg->net;

	/* never report the per-netns null entry */
	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);

		/* user wants prefix routes only */
		if (rtm->rtm_flags & RTM_F_PREFIX &&
		    !(rt->fib6_flags & RTF_PREFIX_RT)) {
			/* success since this is not a prefix route */
			return 1;
		}
	}

	return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
			     RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
			     arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
}

/* RTM_GETROUTE doit handler: parse the request attributes into a
 * flowi6, perform an input- or output-side route lookup, and unicast
 * an RTM_NEWROUTE reply describing the result back to the requester.
 */
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	int err, iif = 0, oif = 0;
	struct fib6_info *from;
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6;
	bool fibmatch;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	memset(&fl6, 0, sizeof(fl6));
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
	/* RTM_F_FIB_MATCH: report the matching fib entry, not the
	 * resolved dst
	 */
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if
		    (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

	if (tb[RTA_SPORT])
		fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);

	if (tb[RTA_DPORT])
		fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);

	if (tb[RTA_IP_PROTO]) {
		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
						  &fl6.flowi6_proto, extack);
		if (err)
			goto errout;
	}

	/* An input interface means simulate packet reception on that
	 * device; otherwise do an output-side lookup keyed on @oif.
	 */
	if (iif) {
		struct net_device *dev;
		int flags = 0;

		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			rcu_read_unlock();
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);

		rcu_read_unlock();
	} else {
		fl6.flowi6_oif = oif;

		dst = ip6_route_output(net, NULL, &fl6);
	}


	rt = container_of(dst, struct rt6_info, dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	/* skb now owns the dst reference; freed with the skb */
	skb_dst_set(skb, &rt->dst);

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	/* NOTE(review): rcu_dereference() may yield NULL here, yet
	 * rt6_fill_node() dereferences its @rt argument unconditionally
	 * (fib6_table, fib6_metrics, ...) — verify 'from' cannot be
	 * NULL on these paths.
	 */

	if (fibmatch)
		err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	else
		err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
				    &fl6.saddr, iif, RTM_NEWROUTE,
				    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
				    0);
	rcu_read_unlock();

	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}

/* Broadcast an RTM_NEWROUTE/RTM_DELROUTE (@event) notification for
 * @rt to RTNLGRP_IPV6_ROUTE listeners; on failure record the error on
 * the group via rtnl_set_sk_err().
 */
void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
		     unsigned int nlm_flags)
{
	struct sk_buff *skb;
	struct net *net = info->nl_net;
	u32 seq;
	int err;

	err = -ENOBUFS;
	seq = info->nlh ? info->nlh->nlmsg_seq : 0;

	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
	if (!skb)
		goto errout;

	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
			    event, info->portid, seq, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}

/* Netdevice notifier: wire the per-netns special route entries
 * (null / prohibit / blackhole) to the loopback device when it
 * registers, and drop their idev references when it unregisters.
 * Non-loopback devices are ignored.
 */
static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev =
			in6_dev_get(dev);
#endif
	} else if (event == NETDEV_UNREGISTER &&
		   dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER could be fired for multiple times by
		 * netdev_wait_allrefs(). Make sure we only call this once.
		 */
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}

/*
 *	/proc
 */

#ifdef CONFIG_PROC_FS
/* seq_file show handler for /proc/net/rt6_stats: one line of seven
 * hex-formatted fib/route-cache counters for this netns.
 */
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;
	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);

	return 0;
}
#endif	/* CONFIG_PROC_FS */

#ifdef CONFIG_SYSCTL

/* Write-only "flush" sysctl handler: writing a delay value triggers a
 * fib6 garbage-collection run. The delay in effect is the one read
 * *before* proc_dointvec() stores the new value (ctl->extra1 carries
 * the owning netns).
 */
static
int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
			      void __user *buffer, size_t *lenp, loff_t *ppos)
{
	struct net *net;
	int delay;
	if (!write)
		return -EINVAL;

	net = (struct net *)ctl->extra1;
	delay = net->ipv6.sysctl.flush_delay;
	proc_dointvec(ctl, write, buffer, lenp, ppos);
	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
	return 0;
}

/* Template for the per-netns net.ipv6.route.* sysctl table; the .data
 * pointers reference init_net and are re-pointed per netns in
 * ipv6_route_sysctl_init(), which relies on these entry indices.
 */
struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,	/* write-only trigger */
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		/* same variable as gc_min_interval, in milliseconds */
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{ }
};

/* Duplicate the sysctl template for @net and re-point every .data at
 * the netns-local variables. The table[i] indices must stay in sync
 * with ipv6_route_table_template above. Returns NULL on allocation
 * failure; caller owns (and later frees) the returned table.
 */
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
#endif

/* Per-netns setup: clone the dst ops template, allocate the special
 * route entries (null, and with multiple tables also prohibit and
 * blackhole) and install the default GC/sysctl tunables. Unwinds all
 * allocations via the goto chain on failure.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	/* default routing-cache / GC tunables for this netns */
	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

	/* error unwind: free in reverse order of allocation */
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}

/* Per-netns teardown: release everything ip6_route_net_init()
 * allocated.
 */
static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.fib6_null_entry);
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}

/* Late per-netns setup: create the /proc/net/ipv6_route and
 * /proc/net/rt6_stats entries.
 */
static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
			sizeof(struct ipv6_route_iter));
	proc_create_net_single("rt6_stats", 0444, net->proc_net,
			rt6_stats_seq_show, NULL);
#endif
	return 0;
}

/* Late per-netns teardown: remove the proc entries created above. */
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}

static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};

/* Allocate and initialize this netns's inet_peer base. */
static int __net_init ipv6_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv6.peers = bp;
	return 0;
}

/* Tear down this netns's inet_peer base and free it. */
static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv6.peers;

	net->ipv6.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static struct pernet_operations ipv6_inetpeer_ops = {
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};

static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};

static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	/* run after addrconf's own notifier */
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};

void __init ip6_route_init_special_entries(void)
{
	/* Registering of the loopback is done before this portion of code,
	 * the loopback reference in rt6_info will not be taken, do it
	 * manually for init_net */
	init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev =
		init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#endif
}

/* Module init for the IPv6 routing subsystem: create the dst slab,
 * register the pernet subsystems, fib6/xfrm6/fib6-rules, the three
 * RTM_*ROUTE rtnetlink handlers, the netdevice notifier and the
 * per-cpu uncached-route lists. Errors unwind in reverse order via
 * the goto chain.
 */
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	/* blackhole dsts share the same slab as regular rt6_infos */
	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	/* GETROUTE may run without the rtnl lock */
	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

	/* error unwind, reverse order of registration */
out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}

/* Module teardown: unregister everything ip6_route_init() set up, in
 * reverse order.
 */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}