// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable.  otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/rtnh.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <net/ip.h>
#include <linux/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

static int ip6_rt_type_to_error(u8 fib6_type);

#define CREATE_TRACE_POINTS
#include <trace/events/fib6.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
#undef CREATE_TRACE_POINTS

enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};

static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
static unsigned int	 ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
static int		 ip6_dst_gc(struct dst_ops *ops);

static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int		ip6_pkt_prohibit(struct sk_buff *skb);
static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
			   int strict);
static size_t rt6_nlmsg_size(struct fib6_info *rt);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_add_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev,
					    unsigned int pref);
static struct fib6_info *rt6_get_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev);
#endif

struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);

void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}

static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
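/*
 * Note on the per-cpu uncached list above: RTF_CACHE clones created
 * outside the fib6 tree (see ip6_pol_route()) are tracked here so that
 * rt6_uncached_list_flush_dev() can re-point their dev/idev at the
 * loopback device when the underlying device goes away.
 *
 * Illustrative pairing a caller is expected to follow (sketch, not a
 * verbatim call site; "uncached_rt" is a hypothetical clone):
 *
 *	uncached_rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);
 *	if (uncached_rt)
 *		rt6_uncached_list_add(uncached_rt);
 *	...
 *	// rt6_uncached_list_del() later runs from ip6_dst_destroy()
 */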
static inline const void *choose_neigh_daddr(const struct in6_addr *p,
					     struct sk_buff *skb,
					     const void *daddr)
{
	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}

struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
{
	struct neighbour *n;

	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	if (n)
		return n;

	n = neigh_create(&nd_tbl, daddr, dev);
	return IS_ERR(n) ? NULL : n;
}

static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(rt6_nexthop(rt, &in6addr_any),
				dst->dev, skb, daddr);
}

static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}

static struct dst_ops ip6_dst_ops_template = {
	.family			= AF_INET6,
	.gc			= ip6_dst_gc,
	.gc_thresh		= 1024,
	.check			= ip6_dst_check,
	.default_advmss		= ip6_default_advmss,
	.mtu			= ip6_mtu,
	.cow_metrics		= dst_cow_metrics_generic,
	.destroy		= ip6_dst_destroy,
	.ifdown			= ip6_dst_ifdown,
	.negative_advice	= ip6_negative_advice,
	.link_failure		= ip6_link_failure,
	.update_pmtu		= ip6_rt_update_pmtu,
	.redirect		= rt6_do_redirect,
	.local_out		= __ip6_local_out,
	.neigh_lookup		= ip6_dst_neigh_lookup,
	.confirm_neigh		= ip6_confirm_neigh,
};

static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

static struct dst_ops ip6_dst_blackhole_ops = {
	.family			= AF_INET6,
	.destroy		= ip6_dst_destroy,
	.check			= ip6_dst_check,
	.mtu			= ip6_blackhole_mtu,
	.default_advmss		= ip6_default_advmss,
	.update_pmtu		= ip6_rt_blackhole_update_pmtu,
	.redirect		= ip6_rt_blackhole_redirect,
	.cow_metrics		= dst_cow_metrics_generic,
	.neigh_lookup		= ip6_dst_neigh_lookup,
};

static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol	= RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= REFCOUNT_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};

static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif

static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}

/* allocate dst with ip6_dst_ops */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);

static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	ip_dst_metrics_put(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	from = xchg((__force struct fib6_info **)&rt->from, NULL);
	fib6_info_release(from);
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}

static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}

void fib6_select_path(const struct net *net, struct fib6_result *res,
		      struct flowi6 *fl6, int oif, bool have_oif_match,
		      const struct sk_buff *skb, int strict)
{
	struct fib6_info *sibling, *next_sibling;
	struct fib6_info *match = res->f6i;

	if (!match->fib6_nsiblings || have_oif_match)
		goto out;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.fib_nh_upper_bound))
		goto out;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		const struct fib6_nh *nh = &sibling->fib6_nh;
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

out:
	res->f6i = match;
	res->nh = &match->fib6_nh;
}
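/*
 * Worked example for fib6_select_path() (illustrative only; the actual
 * fib_nh_upper_bound values are precomputed from the sibling weights
 * when the multipath route is inserted): with two equal-weight
 * siblings the bounds are roughly 0x3fffffff for the first nexthop and
 * 0x7fffffff for the second, since mp_hash is a 31-bit value.  A flow
 * hashing to 0x20000000 stays on the first nexthop; one hashing to
 * 0x60000000 walks the sibling list and selects the second.
 */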
/*
 *	Route lookup. rcu_read_lock() should be held.
 */

static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh,
			       const struct in6_addr *saddr, int oif, int flags)
{
	const struct net_device *dev;

	if (nh->fib_nh_flags & RTNH_F_DEAD)
		return false;

	dev = nh->fib_nh_dev;
	if (oif) {
		if (dev->ifindex == oif)
			return true;
	} else {
		if (ipv6_chk_addr(net, saddr, dev,
				  flags & RT6_LOOKUP_F_IFACE))
			return true;
	}

	return false;
}

static void rt6_device_match(struct net *net, struct fib6_result *res,
			     const struct in6_addr *saddr, int oif, int flags)
{
	struct fib6_info *f6i = res->f6i;
	struct fib6_info *spf6i;
	struct fib6_nh *nh;

	if (!oif && ipv6_addr_any(saddr)) {
		nh = &f6i->fib6_nh;
		if (!(nh->fib_nh_flags & RTNH_F_DEAD))
			goto out;
	}

	for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) {
		nh = &spf6i->fib6_nh;
		if (__rt6_device_match(net, nh, saddr, oif, flags)) {
			res->f6i = spf6i;
			goto out;
		}
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE) {
		res->f6i = net->ipv6.fib6_null_entry;
		nh = &res->f6i->fib6_nh;
		goto out;
	}

	nh = &f6i->fib6_nh;
	if (nh->fib_nh_flags & RTNH_F_DEAD) {
		res->f6i = net->ipv6.fib6_null_entry;
		nh = &res->f6i->fib6_nh;
	}
out:
	res->nh = nh;
	res->fib6_type = res->f6i->fib6_type;
	res->fib6_flags = res->f6i->fib6_flags;
}

#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}
static void rt6_probe(struct fib6_nh *fib6_nh)
{
	struct __rt6_probe_work *work = NULL;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;
	struct inet6_dev *idev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!fib6_nh->fib_nh_gw_family)
		return;

	nh_gw = &fib6_nh->fib_nh_gw6;
	dev = fib6_nh->fib_nh_dev;
	rcu_read_lock_bh();
	idev = __in6_dev_get(dev);
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else if (time_after(jiffies, fib6_nh->last_probe +
				       idev->cnf.rtr_probe_interval)) {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		fib6_nh->last_probe = jiffies;
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct fib6_nh *fib6_nh)
{
}
#endif

/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev,
					  &fib6_nh->fib_nh_gw6);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
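/*
 * rt6_score_route() below folds the interface match and the RFC 4191
 * route preference into a single integer.  Rough sketch of the scoring
 * (illustrative, not a stable ABI): an oif match contributes 2, and
 * with CONFIG_IPV6_ROUTER_PREF the decoded preference (low/medium/high)
 * is shifted into bits 2 and up, so a high-preference router on the
 * matching interface outranks a medium-preference one.  Negative
 * return values are the RT6_NUD_FAIL_* states from rt6_check_neigh().
 */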
static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
			   int strict)
{
	int m = 0;

	if (!oif || nh->fib_nh_dev->ifindex == oif)
		m = 2;

	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2;
#endif
	if ((strict & RT6_LOOKUP_F_REACHABLE) &&
	    !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) {
		int n = rt6_check_neigh(nh);
		if (n < 0)
			return n;
	}
	return m;
}

static bool find_match(struct fib6_nh *nh, u32 fib6_flags,
		       int oif, int strict, int *mpri, bool *do_rr)
{
	bool match_do_rr = false;
	bool rc = false;
	int m;

	if (nh->fib_nh_flags & RTNH_F_DEAD)
		goto out;

	if (ip6_ignore_linkdown(nh->fib_nh_dev) &&
	    nh->fib_nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	m = rt6_score_route(nh, fib6_flags, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(nh);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		rc = true;
	}
out:
	return rc;
}

static void __find_rr_leaf(struct fib6_info *f6i_start,
			   struct fib6_info *nomatch, u32 metric,
			   struct fib6_result *res, struct fib6_info **cont,
			   int oif, int strict, bool *do_rr, int *mpri)
{
	struct fib6_info *f6i;

	for (f6i = f6i_start;
	     f6i && f6i != nomatch;
	     f6i = rcu_dereference(f6i->fib6_next)) {
		struct fib6_nh *nh;

		if (cont && f6i->fib6_metric != metric) {
			*cont = f6i;
			return;
		}

		if (fib6_check_expired(f6i))
			continue;

		nh = &f6i->fib6_nh;
		if (find_match(nh, f6i->fib6_flags, oif, strict, mpri, do_rr)) {
			res->f6i = f6i;
			res->nh = nh;
			res->fib6_flags = f6i->fib6_flags;
			res->fib6_type = f6i->fib6_type;
		}
	}
}

static void find_rr_leaf(struct fib6_node *fn, struct fib6_info *leaf,
			 struct fib6_info *rr_head, int oif, int strict,
			 bool *do_rr, struct fib6_result *res)
{
	u32 metric = rr_head->fib6_metric;
	struct fib6_info *cont = NULL;
	int mpri = -1;

	__find_rr_leaf(rr_head, NULL, metric, res, &cont,
		       oif, strict, do_rr, &mpri);

	__find_rr_leaf(leaf, rr_head, metric, res, &cont,
		       oif, strict, do_rr, &mpri);

	if (res->f6i || !cont)
		return;

	__find_rr_leaf(cont, NULL, metric, res, NULL,
		       oif, strict, do_rr, &mpri);
}

static void rt6_select(struct net *net, struct fib6_node *fn, int oif,
		       struct fib6_result *res, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *rt0;
	bool do_rr = false;
	int key_plen;

	/* make sure this function or its helpers sets f6i */
	res->f6i = NULL;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		goto out;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		goto out;

	find_rr_leaf(fn, leaf, rt0, oif, strict, &do_rr, res);
	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

out:
	if (!res->f6i) {
		res->f6i = net->ipv6.fib6_null_entry;
		res->nh = &res->f6i->fib6_nh;
		res->fib6_flags = res->f6i->fib6_flags;
		res->fib6_type = res->f6i->fib6_type;
	}
}
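/*
 * Round-robin example for rt6_select() above (illustrative): if three
 * default routers R1, R2 and R3 share a metric and none is known to be
 * reachable, find_match() reports do_rr and fn->rr_ptr advances one
 * entry per lookup, so successive lookups yield R1, R2, R3, R1, ...
 * Once one router becomes (probably) reachable it keeps winning on
 * score and the rotation stops, matching the selection policy
 * described in the changelog at the top of this file.
 */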
static bool rt6_is_gw_or_nonexthop(const struct fib6_result *res)
{
	return (res->f6i->fib6_flags & RTF_NONEXTHOP) ||
	       res->nh->fib_nh_gw_family;
}

#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
#endif
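/*
 * Note on the sanity checks in rt6_route_rcv(): the RFC 4191 Route
 * Information Option length is in 8-octet units (1, 2 or 3), with the
 * prefix field holding 0, 8 or 16 octets.  Hence the checks above:
 * e.g. (illustrative) a prefix_len of 96 requires length >= 2, and
 * only a length-3 option carries the full 128-bit prefix verbatim;
 * shorter options are expanded via ipv6_addr_prefix().
 */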
/*
 *	Misc support functions
 */

/* called with rcu_lock held */
static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res)
{
	struct net_device *dev = res->nh->fib_nh_dev;

	if (res->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&res->f6i->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}

static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};

static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}

static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
{
	unsigned short flags = 0;

	if (rt->dst_nocount)
		flags |= DST_NOCOUNT;
	if (rt->dst_nopolicy)
		flags |= DST_NOPOLICY;
	if (rt->dst_host)
		flags |= DST_HOST;

	return flags;
}

static void ip6_rt_init_dst_reject(struct rt6_info *rt, u8 fib6_type)
{
	rt->dst.error = ip6_rt_type_to_error(fib6_type);

	switch (fib6_type) {
	case RTN_BLACKHOLE:
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}

static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res)
{
	struct fib6_info *f6i = res->f6i;

	if (res->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, res->fib6_type);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	if (res->fib6_type == RTN_LOCAL || res->fib6_type == RTN_ANYCAST) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&f6i->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (res->nh->fib_nh_lws) {
		rt->dst.lwtstate = lwtstate_get(res->nh->fib_nh_lws);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}

/* Caller must already hold reference to @from */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	rcu_assign_pointer(rt->from, from);
	ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
}
/* Caller must already hold reference to f6i in result */
static void ip6_rt_copy_init(struct rt6_info *rt, const struct fib6_result *res)
{
	const struct fib6_nh *nh = res->nh;
	const struct net_device *dev = nh->fib_nh_dev;
	struct fib6_info *f6i = res->f6i;

	ip6_rt_init_dst(rt, res);

	rt->rt6i_dst = f6i->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_flags = res->fib6_flags;
	if (nh->fib_nh_gw_family) {
		rt->rt6i_gateway = nh->fib_nh_gw6;
		rt->rt6i_flags |= RTF_GATEWAY;
	}
	rt6_set_from(rt, f6i);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = f6i->fib6_src;
#endif
}

static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}

static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (net) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}

/* called with rcu_lock held */
static struct rt6_info *ip6_create_rt_rcu(const struct fib6_result *res)
{
	struct net_device *dev = res->nh->fib_nh_dev;
	struct fib6_info *f6i = res->f6i;
	unsigned short flags;
	struct rt6_info *nrt;

	if (!fib6_info_hold_safe(f6i))
		goto fallback;

	flags = fib6_info_dst_flags(f6i);
	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (!nrt) {
		fib6_info_release(f6i);
		goto fallback;
	}

	ip6_rt_copy_init(nrt, res);
	return nrt;

fallback:
	nrt = dev_net(dev)->ipv6.ip6_null_entry;
	dst_hold(&nrt->dst);
	return nrt;
}

static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_result res = {};
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	res.f6i = rcu_dereference(fn->leaf);
	if (!res.f6i)
		res.f6i = net->ipv6.fib6_null_entry;
	else
		rt6_device_match(net, &res, &fl6->saddr, fl6->flowi6_oif,
				 flags);

	if (res.f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;

		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
		goto out;
	}

	fib6_select_path(net, &res, fl6, fl6->flowi6_oif,
			 fl6->flowi6_oif != 0, skb, flags);

	/* Search through exception table */
	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);
	} else {
		rt = ip6_create_rt_rcu(&res);
	}

out:
	trace_fib6_table_lookup(net, &res, table, fl6);

	rcu_read_unlock();

	return rt;
}

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);

/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason,
 * the route is released.
 * Caller must hold dst before calling it.
 */

static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = { .nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}

static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct fib6_info *f6i = res->f6i;
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (!fib6_info_hold_safe(f6i))
		return NULL;

	dev = ip6_rt_get_dev_rcu(res);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt) {
		fib6_info_release(f6i);
		return NULL;
	}

	ip6_rt_copy_init(rt, res);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(res)) {
		if (f6i->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&f6i->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}

static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res)
{
	struct fib6_info *f6i = res->f6i;
	unsigned short flags = fib6_info_dst_flags(f6i);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	if (!fib6_info_hold_safe(f6i))
		return NULL;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(res);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt) {
		fib6_info_release(f6i);
		return NULL;
	}
	ip6_rt_copy_init(pcpu_rt, res);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(res->f6i->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt);

	return pcpu_rt;
}
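/*
 * rt6_make_pcpu_route() below publishes the freshly allocated clone
 * with cmpxchg(); the BUG_ON(prev) relies on this CPU's rt6i_pcpu slot
 * being NULL and on the caller (see ip6_pol_route()) holding
 * rcu_read_lock() with BHs disabled, which keeps this context
 * exclusive, so no other writer can have installed an entry in
 * between.
 */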
static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    const struct fib6_result *res)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(res);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(res->f6i->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	if (res->f6i->fib6_destroying) {
		struct fib6_info *from;

		from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL);
		fib6_info_release(from);
	}

	return pcpu_rt;
}

/* exception hash table implementation
 */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct fib6_info *from;
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	net->ipv6.rt6_stats->fib_rt_cache--;

	/* purge completely the exception to allow releasing the held resources:
	 * some [sk] cache may keep the dst around for unlimited time
	 */
	from = xchg((__force struct fib6_info **)&rt6_ex->rt6i->from, NULL);
	fib6_info_release(from);
	dst_dev_put(&rt6_ex->rt6i->dst);

	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
}

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}

static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
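/*
 * Bucket indexing sketch: rt6_exception_hash() reduces the jhash value
 * to FIB6_EXCEPTION_BUCKET_SIZE_SHIFT bits, so the lookup helpers below
 * can do "*bucket += hval" to step from the base of the per-fib6_info
 * bucket array (FIB6_EXCEPTION_BUCKET_SIZE entries) straight to the
 * hlist chain for this (daddr, saddr) pair.
 */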
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

static unsigned int fib6_mtu(const struct fib6_result *res)
{
	const struct fib6_nh *nh = res->nh;
	unsigned int mtu;

	if (res->f6i->fib6_pmtu) {
		mtu = res->f6i->fib6_pmtu;
	} else {
		struct net_device *dev = nh->fib_nh_dev;
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
}
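/*
 * fib6_mtu() example (illustrative numbers): with a device mtu6 of
 * 1500 and a lightweight tunnel state whose encapsulation needs 20
 * bytes of headroom, the usable MTU reported here is 1480; without an
 * lwtunnel state, lwtunnel_headroom() contributes 0 and the device (or
 * fib6_pmtu) value is used unchanged.
 */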
static int rt6_insert_exception(struct rt6_info *nrt,
				const struct fib6_result *res)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *f6i = res->f6i;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	if (f6i->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(f6i->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* fib6_src.plen != 0 indicates f6i is in subtree
	 * and exception table is indexed by a hash of
	 * both fib6_dst and fib6_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only fib6_dst.
	 */
	if (f6i->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif
	/* rt6_mtu_change() might lower mtu on f6i.
	 * Only insert this exception route if its mtu
	 * is less than f6i's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(res)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&f6i->fib6_table->tb6_lock);
		fib6_update_sernum(net, f6i);
		spin_unlock_bh(&f6i->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}

void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() from recreating the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}
/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	const struct in6_addr *src_key = NULL;
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct rt6_info *ret = NULL;

#ifdef CONFIG_IPV6_SUBTREES
	/* fib6_src.plen != 0 indicates f6i is in subtree
	 * and exception table is indexed by a hash of
	 * both fib6_dst and fib6_src.
	 * However, the src addr used to create the hash
	 * might not be exactly the passed in saddr which
	 * is a /128 addr from the flow.
	 * So we need to use f6i->fib6_src to redo lookup
	 * if the passed in saddr does not find anything.
	 * (See the logic in ip6_rt_cache_alloc() on how
	 * rt->rt6i_src is updated.)
	 */
	if (res->f6i->fib6_src.plen)
		src_key = saddr;
find_ex:
#endif
	bucket = rcu_dereference(res->f6i->rt6i_exception_bucket);
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		ret = rt6_ex->rt6i;

#ifdef CONFIG_IPV6_SUBTREES
	/* Use fib6_src as src_key and redo lookup */
	if (!ret && src_key && src_key != &res->f6i->fib6_src.addr) {
		src_key = &res->f6i->fib6_src.addr;
		goto find_ex;
	}
#endif

	return ret;
}

/* Remove the passed in cached rt from the hash table that contains it */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	from = rcu_dereference(rt->from);
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}

/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	if (!from || !(rt->rt6i_flags & RTF_CACHE))
		goto unlock;

	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

unlock:
	rcu_read_unlock();
}
static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
{
	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
	 *
	 * If the new MTU is higher, and the route PMTU is equal to the local
	 * MTU, this means the old MTU is the lowest in the path, so allow
	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
	 * handle this.
	 */

	if (dst_mtu(&rt->dst) >= mtu)
		return true;

	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
		return true;

	return false;
}

static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}

#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non-gateway exceptions
	 * even if others still hold references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones - are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}

void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}

/* must be called with rcu lock held */
int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif,
		      struct flowi6 *fl6, struct fib6_result *res, int strict)
{
	struct fib6_node *fn, *saved_fn;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	rt6_select(net, fn, oif, res, strict);
	if (res->f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, res, table, fl6);

	return 0;
}
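/*
 * ip6_pol_route() below resolves a fib6_result into a dst in one of
 * three ways, tried in order: a cached exception route (a PMTU or
 * redirect clone), an uncached RTF_CACHE clone for the
 * FLOWI_FLAG_KNOWN_NH corner case (tracked on the per-cpu uncached
 * list), or the regular per-cpu copy of the fib entry, created on
 * first use by rt6_make_pcpu_route().
 */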
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_result res = {};
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	fib6_table_lookup(net, table, oif, fl6, &res, strict);
	if (res.f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		return rt;
	}

	fib6_select_path(net, &res, fl6, oif, false, skb, strict);

	/* Search through exception table */
	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !res.nh->fib_nh_gw_family)) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		return uncached_rt;
	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(&res);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, &res);

		local_bh_enable();
		rcu_read_unlock();

		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);

static struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
}

struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6,
					 const struct sk_buff *skb,
					 int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);

static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowlabel(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}
/* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}
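/*
 * Policy summary for rt6_multipath_hash() above (selected via the
 * net.ipv6.fib_multipath_hash_policy sysctl): policy 0 hashes the L3
 * tuple (saddr, daddr, flow label, next header), so e.g. two TCP
 * connections between the same pair of hosts follow one path; policy 1
 * adds the L4 ports, so they may diverge.  The result is shifted right
 * by one so it can be compared as a non-negative value against
 * fib_nh_upper_bound in fib6_select_path().
 */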

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);

struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_DEAD, 0);
	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

		new = &rt->dst;
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}

/*
 *	Destination cache support functions
 */

static bool fib6_check(struct fib6_info *f6i, u32 cookie)
{
	u32 rt_cookie = 0;

	if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
		return false;

	if (fib6_check_expired(f6i))
		return false;

	return true;
}

static struct dst_entry *rt6_check(struct rt6_info *rt,
				   struct fib6_info *from,
				   u32 cookie)
{
	u32 rt_cookie = 0;

	if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
	    rt_cookie != cookie)
		return NULL;

	if (rt6_check_expired(rt))
		return NULL;

	return &rt->dst;
}

static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
					    struct fib6_info *from,
					    u32 cookie)
{
	if (!__rt6_check_expired(rt) &&
	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
	    fib6_check(from, cookie))
		return &rt->dst;
	else
		return NULL;
}

static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct dst_entry *dst_ret;
	struct fib6_info *from;
	struct rt6_info *rt;

	rt = container_of(dst, struct rt6_info, dst);

	rcu_read_lock();

	/* All IPv6 dsts are created with ->obsolete set to
	 * DST_OBSOLETE_FORCE_CHK, which forces validation calls down
	 * into this function always.
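	 * The cookie is a snapshot of the fib6 tree sernum taken when the
	 * dst was created; any later tree change bumps the sernum, so a
	 * stale dst fails the check and the caller does a fresh lookup.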
	 */

	from = rcu_dereference(rt->from);

	if (from && (rt->rt6i_flags & RTF_PCPU ||
	    unlikely(!list_empty(&rt->rt6i_uncached))))
		dst_ret = rt6_dst_from_check(rt, from, cookie);
	else
		dst_ret = rt6_check(rt, from, cookie);

	rcu_read_unlock();

	return dst_ret;
}

static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			rcu_read_lock();
			if (rt6_check_expired(rt)) {
				rt6_remove_exception_rt(rt);
				dst = NULL;
			}
			rcu_read_unlock();
		} else {
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}

static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		rcu_read_lock();
		if (rt->rt6i_flags & RTF_CACHE) {
			rt6_remove_exception_rt(rt);
		} else {
			struct fib6_info *from;
			struct fib6_node *fn;

			from = rcu_dereference(rt->from);
			if (from) {
				fn = rcu_dereference(from->fib6_node);
				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
					fn->fn_sernum = -1;
			}
		}
		rcu_read_unlock();
	}
}

static void rt6_update_expires(struct rt6_info *rt0, int timeout)
{
	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
		struct fib6_info *from;

		rcu_read_lock();
		from = rcu_dereference(rt0->from);
		if (from)
			rt0->dst.expires = from->expires;
		rcu_read_unlock();
	}

	dst_set_expires(&rt0->dst, timeout);
	rt0->rt6i_flags |= RTF_EXPIRES;
}

static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
	struct net *net = dev_net(rt->dst.dev);

	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
	rt->rt6i_flags |= RTF_MODIFIED;
	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}

static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
{
	return !(rt->rt6i_flags & RTF_CACHE) &&
		(rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from));
}

static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}
	dst_confirm_neigh(dst, daddr);
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
		/* update rt6_ex->stamp for cache */
		if (rt6->rt6i_flags & RTF_CACHE)
			rt6_update_exception_stamp_rt(rt6);
	} else if (daddr) {
		struct fib6_result res = {};
		struct rt6_info *nrt6;

		rcu_read_lock();
		res.f6i = rcu_dereference(rt6->from);
		if (!res.f6i) {
			rcu_read_unlock();
			return;
		}
		res.nh = &res.f6i->fib6_nh;
		res.fib6_flags = res.f6i->fib6_flags;
		res.fib6_type = res.f6i->fib6_type;

		nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);
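			/* If the exception cannot be inserted, drop the
			 * clone we just allocated; otherwise the exception
			 * table now owns it.
			 */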
			if (rt6_insert_exception(nrt6, &res))
				dst_release_immediate(&nrt6->dst);
		}
		rcu_read_unlock();
	}
}

static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu)
{
	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
}

void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
		     int oif, u32 mark, kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_uid = uid,
	};

	dst = ip6_route_output(net, NULL, &fl6);
	if (!dst->error)
		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);

void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	int oif = sk->sk_bound_dev_if;
	struct dst_entry *dst;

	if (!oif && skb->dev)
		oif = l3mdev_master_ifindex(skb->dev);

	ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);

	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);

void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
			   const struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_SUBTREES
	struct ipv6_pinfo *np = inet6_sk(sk);
#endif

	ip6_dst_store(sk, dst,
		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
		      &sk->sk_v6_daddr : NULL,
#ifdef CONFIG_IPV6_SUBTREES
		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
		      &np->saddr :
#endif
		      NULL);
}

static bool ip6_redirect_nh_match(const struct fib6_result *res,
				  struct flowi6 *fl6,
				  const struct in6_addr *gw,
				  struct rt6_info **ret)
{
	const struct fib6_nh *nh = res->nh;

	if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family ||
	    fl6->flowi6_oif != nh->fib_nh_dev->ifindex)
		return false;

	/* rt_cache's gateway might be different from its 'parent'
	 * in the case of an ip redirect.
	 * So we keep searching in the exception table if the gateway
	 * is different.
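	 * rt6_find_cached_rt() below performs that exception-table lookup,
	 * keyed by the flow's destination (and source, with subtrees).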
	 */
	if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) {
		struct rt6_info *rt_cache;

		rt_cache = rt6_find_cached_rt(res, &fl6->daddr, &fl6->saddr);
		if (rt_cache &&
		    ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) {
			*ret = rt_cache;
			return true;
		}
		return false;
	}
	return true;
}

/* Handle redirects */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;
};

static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *ret = NULL;
	struct fib6_result res = {};
	struct fib6_info *rt;
	struct fib6_node *fn;

	/* l3mdev_update_flow overrides oif if the device is enslaved; in
	 * this case we must match on the real ingress device, so reset it
	 */
	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		fl6->flowi6_oif = skb->dev->ifindex;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from the appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	for_each_fib6_node_rt_rcu(fn) {
		res.f6i = rt;
		res.nh = &rt->fib6_nh;

		if (fib6_check_expired(rt))
			continue;
		if (rt->fib6_flags & RTF_REJECT)
			break;
		if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway, &ret))
			goto out;
	}

	if (!rt)
		rt = net->ipv6.fib6_null_entry;
	else if (rt->fib6_flags & RTF_REJECT) {
		ret = net->ipv6.ip6_null_entry;
		goto out;
	}

	if (rt == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	res.f6i = rt;
	res.nh = &rt->fib6_nh;
out:
	if (ret) {
		ip6_hold_safe(net, &ret);
	} else {
		res.fib6_flags = res.f6i->fib6_flags;
		res.fib6_type = res.f6i->fib6_type;
		ret = ip6_create_rt_rcu(&res);
	}

	rcu_read_unlock();

	trace_fib6_table_lookup(net, &res, table, fl6);
	return ret;
}

static struct dst_entry *ip6_route_redirect(struct net *net,
					    const struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    const struct in6_addr *gateway)
{
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip6rd_flowi rdfl;

	rdfl.fl6 = *fl6;
	rdfl.gateway = *gateway;

	return fib6_rule_lookup(net, &rdfl.fl6, skb,
				flags, __ip6_route_redirect);
}

void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
		  kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_iif = LOOPBACK_IFINDEX,
		.flowi6_oif = oif,
		.flowi6_mark = mark,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_uid = uid,
	};

	dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_redirect);

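/* Variant of ip6_redirect() for callers that no longer have the original
 * packet: the flow is built from the redirect message itself (rd_msg->dest)
 * rather than from an inner IPv6 header.
 */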
void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_iif = LOOPBACK_IFINDEX,
		.flowi6_oif = oif,
		.daddr = msg->dest,
		.saddr = iph->daddr,
		.flowi6_uid = sock_net_uid(net, NULL),
	};

	dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}

void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
		     sk->sk_uid);
}
EXPORT_SYMBOL_GPL(ip6_sk_redirect);

static unsigned int ip6_default_advmss(const struct dst_entry *dst)
{
	struct net_device *dev = dst->dev;
	unsigned int mtu = dst_mtu(dst);
	struct net *net = dev_net(dev);

	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);

	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;

	/*
	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
	 * IPV6_MAXPLEN is also valid and means: "any MSS,
	 * rely only on pmtu discovery"
	 */
	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
		mtu = IPV6_MAXPLEN;
	return mtu;
}

static unsigned int ip6_mtu(const struct dst_entry *dst)
{
	struct inet6_dev *idev;
	unsigned int mtu;

	mtu = dst_metric_raw(dst, RTAX_MTU);
	if (mtu)
		goto out;

	mtu = IPV6_MIN_MTU;

	rcu_read_lock();
	idev = __in6_dev_get(dst->dev);
	if (idev)
		mtu = idev->cnf.mtu6;
	rcu_read_unlock();

out:
	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}

/* MTU selection:
 * 1. mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device
 *
 * based on ip6_dst_mtu_forward and exception logic of
 * rt6_find_cached_rt; called with rcu_read_lock
 */
u32 ip6_mtu_from_fib6(const struct fib6_result *res,
		      const struct in6_addr *daddr,
		      const struct in6_addr *saddr)
{
	const struct fib6_nh *nh = res->nh;
	struct fib6_info *f6i = res->f6i;
	struct inet6_dev *idev;
	struct rt6_info *rt;
	u32 mtu = 0;

	if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
		mtu = f6i->fib6_pmtu;
		if (mtu)
			goto out;
	}

	rt = rt6_find_cached_rt(res, daddr, saddr);
	if (unlikely(rt)) {
		mtu = dst_metric_raw(&rt->dst, RTAX_MTU);
	} else {
		struct net_device *dev = nh->fib_nh_dev;

		mtu = IPV6_MIN_MTU;
		idev = __in6_dev_get(dev);
		if (idev && idev->cnf.mtu6 > mtu)
			mtu = idev->cnf.mtu6;
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
out:
	return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
}

struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		in6_dev_put(idev);
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output = ip6_output;
	rt->rt6i_gateway = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	rt->rt6i_idev = idev;
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Add this dst into uncached_list so that rt6_disable_ip() can
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);
	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}

static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout >> 1;
out:
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire >> rt_elasticity;
	return entries > rt_max_size;
}

static struct rt6_info *ip6_nh_lookup_table(struct net *net,
					    struct fib6_config *cfg,
					    const struct in6_addr *gw_addr,
					    u32 tbid, int flags)
{
	struct flowi6 fl6 = {
		.flowi6_oif = cfg->fc_ifindex,
		.daddr = *gw_addr,
		.saddr = cfg->fc_prefsrc,
	};
	struct fib6_table *table;
	struct rt6_info *rt;

	table = fib6_get_table(net, tbid);
	if (!table)
		return NULL;

	if (!ipv6_addr_any(&cfg->fc_prefsrc))
		flags |= RT6_LOOKUP_F_HAS_SADDR;
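
	/* Link state is deliberately ignored for this lookup so that a
	 * configured gateway can still be resolved while the egress link
	 * happens to be down.
	 */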

	flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);

	/* if table lookup failed, fall back to full lookup */
	if (rt == net->ipv6.ip6_null_entry) {
		ip6_rt_put(rt);
		rt = NULL;
	}

	return rt;
}

static int ip6_route_check_nh_onlink(struct net *net,
				     struct fib6_config *cfg,
				     const struct net_device *dev,
				     struct netlink_ext_ack *extack)
{
	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
	struct fib6_info *from;
	struct rt6_info *grt;
	int err;

	err = 0;
	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
	if (grt) {
		rcu_read_lock();
		from = rcu_dereference(grt->from);
		if (!grt->dst.error &&
		    /* ignore match if it is the default route */
		    from && !ipv6_addr_any(&from->fib6_dst.addr) &&
		    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop has invalid gateway or device mismatch");
			err = -EINVAL;
		}
		rcu_read_unlock();

		ip6_rt_put(grt);
	}

	return err;
}

static int ip6_route_check_nh(struct net *net,
			      struct fib6_config *cfg,
			      struct net_device **_dev,
			      struct inet6_dev **idev)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	struct net_device *dev = _dev ? *_dev : NULL;
	struct rt6_info *grt = NULL;
	int err = -EHOSTUNREACH;

	if (cfg->fc_table) {
		int flags = RT6_LOOKUP_F_IFACE;

		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
					  cfg->fc_table, flags);
		if (grt) {
			if (grt->rt6i_flags & RTF_GATEWAY ||
			    (dev && dev != grt->dst.dev)) {
				ip6_rt_put(grt);
				grt = NULL;
			}
		}
	}

	if (!grt)
		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);

	if (!grt)
		goto out;

	if (dev) {
		if (dev != grt->dst.dev) {
			ip6_rt_put(grt);
			goto out;
		}
	} else {
		*_dev = dev = grt->dst.dev;
		*idev = grt->rt6i_idev;
		dev_hold(dev);
		in6_dev_hold(grt->rt6i_idev);
	}

	if (!(grt->rt6i_flags & RTF_GATEWAY))
		err = 0;

	ip6_rt_put(grt);

out:
	return err;
}

static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
			   struct net_device **_dev, struct inet6_dev **idev,
			   struct netlink_ext_ack *extack)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	int gwa_type = ipv6_addr_type(gw_addr);
	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
	const struct net_device *dev = *_dev;
	bool need_addr_check = !dev;
	int err = -EINVAL;

	/* if gw_addr is local we will fail to detect this in case
	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
	 * will return already-added prefix route via interface that
	 * prefix route was assigned to, which might be non-loopback.
	 */
	if (dev &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
		/* IPv6 strictly inhibits using non-link-local
		 * addresses as nexthop addresses.
		 * Otherwise, the router will not be able to send redirects.
		 * It is very good, but in some (rare!) circumstances
		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
		 * some exceptions. --ANK
		 * We allow IPv4-mapped nexthops to support RFC4798-type
		 * addressing.
		 */
		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
			NL_SET_ERR_MSG(extack, "Invalid gateway address");
			goto out;
		}

		if (cfg->fc_flags & RTNH_F_ONLINK)
			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
		else
			err = ip6_route_check_nh(net, cfg, _dev, idev);

		if (err)
			goto out;
	}

	/* reload in case device was changed */
	dev = *_dev;

	err = -EINVAL;
	if (!dev) {
		NL_SET_ERR_MSG(extack, "Egress device not specified");
		goto out;
	} else if (dev->flags & IFF_LOOPBACK) {
		NL_SET_ERR_MSG(extack,
			       "Egress device can not be loopback device for this route");
		goto out;
	}

	/* if we did not check gw_addr above, do so now that the
	 * egress device has been resolved.
	 */
	if (need_addr_check &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	err = 0;
out:
	return err;
}

static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type)
{
	if ((flags & RTF_REJECT) ||
	    (dev && (dev->flags & IFF_LOOPBACK) &&
	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
	     !(flags & RTF_LOCAL)))
		return true;

	return false;
}

int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
		 struct fib6_config *cfg, gfp_t gfp_flags,
		 struct netlink_ext_ack *extack)
{
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	int addr_type;
	int err;

	fib6_nh->fib_nh_family = AF_INET6;

	err = -ENODEV;
	if (cfg->fc_ifindex) {
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (cfg->fc_flags & RTNH_F_ONLINK) {
		if (!dev) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop device required for onlink");
			goto out;
		}

		if (!(dev->flags & IFF_UP)) {
			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
			err = -ENETDOWN;
			goto out;
		}

		fib6_nh->fib_nh_flags |= RTNH_F_ONLINK;
	}

	fib6_nh->fib_nh_weight = 1;

	/* We cannot add true routes via loopback here,
	 * they would result in kernel looping; promote them to reject routes
	 */
	addr_type = ipv6_addr_type(&cfg->fc_dst);
	if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) {
		/* hold loopback dev/idev if we haven't done so.
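		 * (Reject routes are bound to the loopback device below,
		 * so the references taken on the original device are
		 * dropped first.)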
		 */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		goto set_dev;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
		if (err)
			goto out;

		fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
		fib6_nh->fib_nh_gw_family = AF_INET6;
	}

	err = -ENODEV;
	if (!dev)
		goto out;

	if (idev->cnf.disable_ipv6) {
		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
		err = -EACCES;
		goto out;
	}

	if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) {
		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
		err = -ENETDOWN;
		goto out;
	}

	if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
	    !netif_carrier_ok(dev))
		fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;

	err = fib_nh_common_init(&fib6_nh->nh_common, cfg->fc_encap,
				 cfg->fc_encap_type, cfg, gfp_flags, extack);
	if (err)
		goto out;
set_dev:
	fib6_nh->fib_nh_dev = dev;
	fib6_nh->fib_nh_oif = dev->ifindex;
	err = 0;
out:
	if (idev)
		in6_dev_put(idev);

	if (err) {
		lwtstate_put(fib6_nh->fib_nh_lws);
		fib6_nh->fib_nh_lws = NULL;
		if (dev)
			dev_put(dev);
	}

	return err;
}

void fib6_nh_release(struct fib6_nh *fib6_nh)
{
	fib_nh_common_release(&fib6_nh->nh_common);
}

static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
					       gfp_t gfp_flags,
					       struct netlink_ext_ack *extack)
{
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct fib6_info *rt = NULL;
	struct fib6_table *table;
	int err = -EINVAL;
	int addr_type;

	/* RTF_PCPU is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_PCPU) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
		goto out;
	}

	/* RTF_CACHE is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_CACHE) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
		goto out;
	}

	if (cfg->fc_type > RTN_MAX) {
		NL_SET_ERR_MSG(extack, "Invalid route type");
		goto out;
	}

	if (cfg->fc_dst_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		goto out;
	}
	if (cfg->fc_src_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid source address length");
		goto out;
	}
#ifndef CONFIG_IPV6_SUBTREES
	if (cfg->fc_src_len) {
		NL_SET_ERR_MSG(extack,
			       "Specifying source address requires IPV6_SUBTREES to be enabled");
		goto out;
	}
#endif

	err = -ENOBUFS;
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		table = fib6_get_table(net, cfg->fc_table);
		if (!table) {
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}

	if (!table)
		goto out;

	err = -ENOMEM;
	rt = fib6_info_alloc(gfp_flags);
	if (!rt)
		goto out;

	rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
					       extack);
	if (IS_ERR(rt->fib6_metrics)) {
		err = PTR_ERR(rt->fib6_metrics);
		/* Do not leave garbage there. */
		rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
		goto out;
	}

	if (cfg->fc_flags & RTF_ADDRCONF)
		rt->dst_nocount = true;

	if (cfg->fc_flags & RTF_EXPIRES)
		fib6_set_expires(rt, jiffies +
				 clock_t_to_jiffies(cfg->fc_expires));
	else
		fib6_clean_expires(rt);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->fib6_protocol = cfg->fc_protocol;

	rt->fib6_table = table;
	rt->fib6_metric = cfg->fc_metric;
	rt->fib6_type = cfg->fc_type ? : RTN_UNICAST;
	rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY;

	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->fib6_dst.plen = cfg->fc_dst_len;
	if (rt->fib6_dst.plen == 128)
		rt->dst_host = true;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->fib6_src.plen = cfg->fc_src_len;
#endif
	err = fib6_nh_init(net, &rt->fib6_nh, cfg, gfp_flags, extack);
	if (err)
		goto out;

	/* We cannot add true routes via loopback here,
	 * they would result in kernel looping; promote them to reject routes
	 */
	addr_type = ipv6_addr_type(&cfg->fc_dst);
	if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh.fib_nh_dev, addr_type))
		rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;

	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		struct net_device *dev = fib6_info_nh_dev(rt);

		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid source address");
			err = -EINVAL;
			goto out;
		}
		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
		rt->fib6_prefsrc.plen = 128;
	} else
		rt->fib6_prefsrc.plen = 0;

	return rt;
out:
	fib6_info_release(rt);
	return ERR_PTR(err);
}

int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
		  struct netlink_ext_ack *extack)
{
	struct fib6_info *rt;
	int err;

	rt = ip6_route_info_create(cfg, gfp_flags, extack);
	if (IS_ERR(rt))
		return PTR_ERR(rt);

	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
	fib6_info_release(rt);

	return err;
}

static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
{
	struct net *net = info->nl_net;
	struct fib6_table *table;
	int err;

	if (rt == net->ipv6.fib6_null_entry) {
		err = -ENOENT;
		goto out;
	}

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_del(rt, info);
	spin_unlock_bh(&table->tb6_lock);

out:
	fib6_info_release(rt);
	return err;
}

int ip6_del_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = { .nl_net = net };

	return __ip6_del_rt(rt, &info);
}

static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
{
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;
	int err = -ENOENT;

	if (rt == net->ipv6.fib6_null_entry)
		goto out_put;
	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);

	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
		struct fib6_info *sibling, *next_sibling;

		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

			if (rt6_fill_node(net, skb, rt, NULL,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				kfree_skb(skb);
				skb = NULL;
			} else
				info->skip_notify = 1;
		}

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings,
					 fib6_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out_unlock;
		}
	}

	err = fib6_del(rt, info);
out_unlock:
	spin_unlock_bh(&table->tb6_lock);
out_put:
	fib6_info_release(rt);

	if (skb) {
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
	}
	return err;
}

static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
{
	int rc = -ESRCH;

	if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
		goto out;

	if (cfg->fc_flags & RTF_GATEWAY &&
	    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
		goto out;

	rc = rt6_remove_exception_rt(rt);
out:
	return rc;
}

static int ip6_route_del(struct fib6_config *cfg,
			 struct netlink_ext_ack *extack)
{
	struct rt6_info *rt_cache;
	struct fib6_table *table;
	struct fib6_info *rt;
	struct fib6_node *fn;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (!table) {
		NL_SET_ERR_MSG(extack, "FIB table does not exist");
		return err;
	}

	rcu_read_lock();

	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len,
			 !(cfg->fc_flags & RTF_CACHE));

	if (fn) {
		for_each_fib6_node_rt_rcu(fn) {
			struct fib6_nh *nh;

			if (cfg->fc_flags & RTF_CACHE) {
				struct fib6_result res = {
					.f6i = rt,
				};
				int rc;

				rt_cache = rt6_find_cached_rt(&res,
							      &cfg->fc_dst,
							      &cfg->fc_src);
				if (rt_cache) {
					rc = ip6_del_cached_rt(rt_cache, cfg);
					if (rc != -ESRCH) {
						rcu_read_unlock();
						return rc;
					}
				}
				continue;
			}

			nh = &rt->fib6_nh;
			if (cfg->fc_ifindex &&
			    (!nh->fib_nh_dev ||
			     nh->fib_nh_dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6))
				continue;
			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
				continue;
			if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
				continue;
			if (!fib6_info_hold_safe(rt))
				continue;
			rcu_read_unlock();

			/* if gateway was specified only delete the one hop */
			if (cfg->fc_flags & RTF_GATEWAY)
				return __ip6_del_rt(rt, &cfg->fc_nlinfo);

			return __ip6_del_rt_siblings(rt, cfg);
		}
	}
	rcu_read_unlock();

	return err;
}

static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct netevent_redirect netevent;
	struct rt6_info *rt, *nrt = NULL;
	struct fib6_result res = {};
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
	struct rd_msg *msg;
	int optlen, on_link;
	u8 *lladdr;

	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
	optlen -= sizeof(*msg);

	if (optlen < 0) {
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
		return;
	}

	msg = (struct rd_msg *)icmp6_hdr(skb);

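	/* Basic validation per RFC 4861: the destination must not be
	 * multicast, and the target must be a link-local router address
	 * unless it equals the destination (the on-link case).
	 */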
	if (ipv6_addr_is_multicast(&msg->dest)) {
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
		return;
	}

	on_link = 0;
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
		on_link = 1;
	} else if (ipv6_addr_type(&msg->target) !=
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
		return;
	}

	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
		return;

	/* RFC2461 8.1:
	 *	The IP source address of the Redirect MUST be the same as the current
	 *	first-hop router for the specified ICMP Destination Address.
	 */

	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}

	lladdr = NULL;
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}

	rt = (struct rt6_info *) dst;
	if (rt->rt6i_flags & RTF_REJECT) {
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
		return;
	}

	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so this nexthop is apparently reachable. --ANK
	 */
	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);

	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
	if (!neigh)
		return;

	/*
	 *	We have finally decided to accept it.
	 */

	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER)),
		     NDISC_REDIRECT, &ndopts);

	rcu_read_lock();
	res.f6i = rcu_dereference(rt->from);
	if (!res.f6i)
		goto out;

	res.nh = &res.f6i->fib6_nh;
	res.fib6_flags = res.f6i->fib6_flags;
	res.fib6_type = res.f6i->fib6_type;
	nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL);
	if (!nrt)
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

	/* rt6_insert_exception() will take care of duplicated exceptions */
	if (rt6_insert_exception(nrt, &res)) {
		dst_release_immediate(&nrt->dst);
		goto out;
	}

	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
	netevent.daddr = &msg->dest;
	netevent.neigh = neigh;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

out:
	rcu_read_unlock();
	neigh_release(neigh);
}

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_get_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	int ifindex = dev->ifindex;
	struct fib6_node *fn;
	struct fib6_info *rt = NULL;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
	if (!fn)
		goto out;

	for_each_fib6_node_rt_rcu(fn) {
		if (rt->fib6_nh.fib_nh_dev->ifindex != ifindex)
			continue;
		if (!(rt->fib6_flags & RTF_ROUTEINFO) ||
		    !rt->fib6_nh.fib_nh_gw_family)
			continue;
		if (!ipv6_addr_equal(&rt->fib6_nh.fib_nh_gw6, gwaddr))
			continue;
		if (!fib6_info_hold_safe(rt))
			continue;
		break;
	}
out:
	rcu_read_unlock();
	return rt;
}

static struct fib6_info *rt6_add_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev,
					    unsigned int pref)
{
	struct fib6_config cfg = {
		.fc_metric = IP6_RT_PRIO_USER,
		.fc_ifindex = dev->ifindex,
		.fc_dst_len = prefixlen,
		.fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
			    RTF_UP | RTF_PREF(pref),
		.fc_protocol = RTPROT_RA,
		.fc_type = RTN_UNICAST,
		.fc_nlinfo.portid = 0,
		.fc_nlinfo.nlh = NULL,
		.fc_nlinfo.nl_net = net,
	};

	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	cfg.fc_dst = *prefix;
	cfg.fc_gateway = *gwaddr;

	/* We should treat it as a default route if prefix length is 0. */
	if (!prefixlen)
		cfg.fc_flags |= RTF_DEFAULT;

	ip6_route_add(&cfg, GFP_ATOMIC, NULL);

	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
}
#endif

struct fib6_info *rt6_get_dflt_router(struct net *net,
				      const struct in6_addr *addr,
				      struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
	struct fib6_info *rt;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		struct fib6_nh *nh = &rt->fib6_nh;

		if (dev == nh->fib_nh_dev &&
		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&nh->fib_nh_gw6, addr))
			break;
	}
	if (rt && !fib6_info_hold_safe(rt))
		rt = NULL;
	rcu_read_unlock();
	return rt;
}

struct fib6_info *rt6_add_dflt_router(struct net *net,
				      const struct in6_addr *gwaddr,
				      struct net_device *dev,
				      unsigned int pref)
{
	struct fib6_config cfg = {
		.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
		.fc_metric = IP6_RT_PRIO_USER,
		.fc_ifindex = dev->ifindex,
		.fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
			    RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
		.fc_protocol = RTPROT_RA,
		.fc_type = RTN_UNICAST,
		.fc_nlinfo.portid = 0,
		.fc_nlinfo.nlh = NULL,
		.fc_nlinfo.nl_net = net,
	};

	cfg.fc_gateway = *gwaddr;

	if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
		struct fib6_table *table;

		table = fib6_get_table(dev_net(dev), cfg.fc_table);
		if (table)
			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
	}

	return rt6_get_dflt_router(net, gwaddr, dev);
}

static void __rt6_purge_dflt_routers(struct net *net,
				     struct fib6_table *table)
{
	struct fib6_info *rt;

restart:
	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;

		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!idev || idev->cnf.accept_ra != 2) &&
		    fib6_info_hold_safe(rt)) {
			rcu_read_unlock();
			ip6_del_rt(net, rt);
			goto restart;
		}
	}
	rcu_read_unlock();

	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}

void rt6_purge_dflt_routers(struct net *net)
{
	struct fib6_table *table;
	struct hlist_head *head;
	unsigned int h;

	rcu_read_lock();

	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
		head = &net->ipv6.fib_table_hash[h];
		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
			if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
				__rt6_purge_dflt_routers(net, table);
		}
	}

	rcu_read_unlock();
}

static void rtmsg_to_fib6_config(struct net *net,
				 struct in6_rtmsg *rtmsg,
				 struct fib6_config *cfg)
{
	*cfg = (struct fib6_config){
		.fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
			  : RT6_TABLE_MAIN,
		.fc_ifindex = rtmsg->rtmsg_ifindex,
		.fc_metric = rtmsg->rtmsg_metric ? : IP6_RT_PRIO_USER,
		.fc_expires = rtmsg->rtmsg_info,
		.fc_dst_len = rtmsg->rtmsg_dst_len,
		.fc_src_len = rtmsg->rtmsg_src_len,
		.fc_flags = rtmsg->rtmsg_flags,
		.fc_type = rtmsg->rtmsg_type,

		.fc_nlinfo.nl_net = net,

		.fc_dst = rtmsg->rtmsg_dst,
		.fc_src = rtmsg->rtmsg_src,
		.fc_gateway = rtmsg->rtmsg_gateway,
	};
}

int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
{
	struct fib6_config cfg;
	struct in6_rtmsg rtmsg;
	int err;

	switch (cmd) {
	case SIOCADDRT:		/* Add a route */
	case SIOCDELRT:		/* Delete a route */
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			return -EPERM;
		err = copy_from_user(&rtmsg, arg,
				     sizeof(struct in6_rtmsg));
		if (err)
			return -EFAULT;

		rtmsg_to_fib6_config(net, &rtmsg, &cfg);

		rtnl_lock();
		switch (cmd) {
		case SIOCADDRT:
			err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
			break;
		case SIOCDELRT:
			err = ip6_route_del(&cfg, NULL);
			break;
		default:
			err = -EINVAL;
		}
		rtnl_unlock();

		return err;
	}

	return -EINVAL;
}

/*
 *	Drop the packet on the floor
 */

static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net *net = dev_net(dst->dev);
	struct inet6_dev *idev;
	int type;

	if (netif_is_l3_master(skb->dev) &&
	    dst->dev == net->loopback_dev)
		idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
	else
		idev = ip6_dst_idev(dst);

	switch (ipstats_mib_noroutes) {
	case IPSTATS_MIB_INNOROUTES:
		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
		if (type == IPV6_ADDR_ANY) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
			break;
		}
		/* FALLTHROUGH */
	case IPSTATS_MIB_OUTNOROUTES:
		IP6_INC_STATS(net, idev, ipstats_mib_noroutes);
		break;
	}

	/* Start over by dropping the dst for l3mdev case */
	if (netif_is_l3_master(skb->dev))
		skb_dst_drop(skb);

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
	kfree_skb(skb);
	return 0;
}

static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}

static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}

static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}

static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}

/*
 *	Allocate a dst for local (unicast / anycast) address.
 */

struct fib6_info *addrconf_f6i_alloc(struct net *net,
				     struct inet6_dev *idev,
				     const struct in6_addr *addr,
				     bool anycast, gfp_t gfp_flags)
{
	struct fib6_config cfg = {
		.fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL,
		.fc_ifindex = idev->dev->ifindex,
		.fc_flags = RTF_UP | RTF_ADDRCONF | RTF_NONEXTHOP,
		.fc_dst = *addr,
		.fc_dst_len = 128,
		.fc_protocol = RTPROT_KERNEL,
		.fc_nlinfo.nl_net = net,
		.fc_ignore_dev_down = true,
	};

	if (anycast) {
		cfg.fc_type = RTN_ANYCAST;
		cfg.fc_flags |= RTF_ANYCAST;
	} else {
		cfg.fc_type = RTN_LOCAL;
		cfg.fc_flags |= RTF_LOCAL;
	}

	return ip6_route_info_create(&cfg, gfp_flags, NULL);
}

/* remove deleted ip from prefsrc entries */
struct arg_dev_net_ip {
	struct net_device *dev;
	struct net *net;
	struct in6_addr *addr;
};

static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
{
	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;

	if (((void *)rt->fib6_nh.fib_nh_dev == dev || !dev) &&
	    rt != net->ipv6.fib6_null_entry &&
	    ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
		spin_lock_bh(&rt6_exception_lock);
		/* remove prefsrc entry */
		rt->fib6_prefsrc.plen = 0;
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}

void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
{
	struct net *net = dev_net(ifp->idev->dev);
	struct arg_dev_net_ip adni = {
		.dev = ifp->idev->dev,
		.net = net,
		.addr = &ifp->addr,
	};
	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
}

#define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT)

/* Remove routers and update dst entries when a gateway turns into a host. */
static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
{
	struct in6_addr *gateway = (struct in6_addr *)arg;

	if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
	    rt->fib6_nh.fib_nh_gw_family &&
	    ipv6_addr_equal(gateway, &rt->fib6_nh.fib_nh_gw6)) {
		return -1;
	}

	/* Further clean up cached routes in the exception table.
	 * This is needed because a cached route may have a different
	 * gateway than its 'parent' in the case of an ip redirect.
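	 * rt6_exceptions_clean_tohost() below walks that table and removes
	 * entries whose gateway matches the address that stopped being a
	 * router.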
	 */
	rt6_exceptions_clean_tohost(rt, gateway);

	return 0;
}

void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}

struct arg_netdev_event {
	const struct net_device *dev;
	union {
		unsigned char nh_flags;
		unsigned long event;
	};
};

static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
{
	struct fib6_info *iter;
	struct fib6_node *fn;

	fn = rcu_dereference_protected(rt->fib6_node,
			lockdep_is_held(&rt->fib6_table->tb6_lock));
	iter = rcu_dereference_protected(fn->leaf,
			lockdep_is_held(&rt->fib6_table->tb6_lock));
	while (iter) {
		if (iter->fib6_metric == rt->fib6_metric &&
		    rt6_qualify_for_ecmp(iter))
			return iter;
		iter = rcu_dereference_protected(iter->fib6_next,
				lockdep_is_held(&rt->fib6_table->tb6_lock));
	}

	return NULL;
}

static bool rt6_is_dead(const struct fib6_info *rt)
{
	if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD ||
	    (rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN &&
	     ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev)))
		return true;

	return false;
}

static int rt6_multipath_total_weight(const struct fib6_info *rt)
{
	struct fib6_info *iter;
	int total = 0;

	if (!rt6_is_dead(rt))
		total += rt->fib6_nh.fib_nh_weight;

	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
		if (!rt6_is_dead(iter))
			total += iter->fib6_nh.fib_nh_weight;
	}

	return total;
}

static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
{
	int upper_bound = -1;

	if (!rt6_is_dead(rt)) {
		*weight += rt->fib6_nh.fib_nh_weight;
		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
						    total) - 1;
	}
	atomic_set(&rt->fib6_nh.fib_nh_upper_bound, upper_bound);
}

static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
{
	struct fib6_info *iter;
	int weight = 0;

	rt6_upper_bound_set(rt, &weight, total);

	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		rt6_upper_bound_set(iter, &weight, total);
}

void rt6_multipath_rebalance(struct fib6_info *rt)
{
	struct fib6_info *first;
	int total;

	/* In case the entire multipath route was marked for flushing,
	 * then there is no need to rebalance upon the removal of every
	 * sibling route.
	 */
	if (!rt->fib6_nsiblings || rt->should_flush)
		return;

	/* During lookup routes are evaluated in order, so we need to
	 * make sure upper bounds are assigned from the first sibling
	 * onwards.
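	 * Each upper bound is a 31-bit threshold derived from the
	 * cumulative nexthop weight, ((weight << 31) / total) - 1, and is
	 * compared against the 31-bit multipath hash during selection.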
	 */
	first = rt6_multipath_first_sibling(rt);
	if (WARN_ON_ONCE(!first))
		return;

	total = rt6_multipath_total_weight(first);
	rt6_multipath_upper_bound_set(first, total);
}

static int fib6_ifup(struct fib6_info *rt, void *p_arg)
{
	const struct arg_netdev_event *arg = p_arg;
	struct net *net = dev_net(arg->dev);

	if (rt != net->ipv6.fib6_null_entry &&
	    rt->fib6_nh.fib_nh_dev == arg->dev) {
		rt->fib6_nh.fib_nh_flags &= ~arg->nh_flags;
		fib6_update_sernum_upto_root(net, rt);
		rt6_multipath_rebalance(rt);
	}

	return 0;
}

void rt6_sync_up(struct net_device *dev, unsigned char nh_flags)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.nh_flags = nh_flags,
		},
	};

	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
		arg.nh_flags |= RTNH_F_LINKDOWN;

	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
}

static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
				   const struct net_device *dev)
{
	struct fib6_info *iter;

	if (rt->fib6_nh.fib_nh_dev == dev)
		return true;
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		if (iter->fib6_nh.fib_nh_dev == dev)
			return true;

	return false;
}

static void rt6_multipath_flush(struct fib6_info *rt)
{
	struct fib6_info *iter;

	rt->should_flush = 1;
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		iter->should_flush = 1;
}

static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
					     const struct net_device *down_dev)
{
	struct fib6_info *iter;
	unsigned int dead = 0;

	if (rt->fib6_nh.fib_nh_dev == down_dev ||
	    rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
		dead++;
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		if (iter->fib6_nh.fib_nh_dev == down_dev ||
		    iter->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
			dead++;

	return dead;
}

static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
				       const struct net_device *dev,
				       unsigned char nh_flags)
{
	struct fib6_info *iter;

	if (rt->fib6_nh.fib_nh_dev == dev)
		rt->fib6_nh.fib_nh_flags |= nh_flags;
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		if (iter->fib6_nh.fib_nh_dev == dev)
			iter->fib6_nh.fib_nh_flags |= nh_flags;
}

/* called with write lock held for table with rt */
static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
{
	const struct arg_netdev_event *arg = p_arg;
	const struct net_device *dev = arg->dev;
	struct net *net = dev_net(dev);

	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	switch (arg->event) {
	case NETDEV_UNREGISTER:
		return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
	case NETDEV_DOWN:
		if (rt->should_flush)
			return -1;
		if (!rt->fib6_nsiblings)
			return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
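		/* Flush the whole multipath route only when every nexthop
		 * (this one plus all siblings) is dead; otherwise just mark
		 * the nexthops on this device dead/linkdown and rebalance.
		 */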
		if (rt6_multipath_uses_dev(rt, dev)) {
			unsigned int count;

			count = rt6_multipath_dead_count(rt, dev);
			if (rt->fib6_nsiblings + 1 == count) {
				rt6_multipath_flush(rt);
				return -1;
			}
			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
						   RTNH_F_LINKDOWN);
			fib6_update_sernum(net, rt);
			rt6_multipath_rebalance(rt);
		}
		return -2;
	case NETDEV_CHANGE:
		if (rt->fib6_nh.fib_nh_dev != dev ||
		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
			break;
		rt->fib6_nh.fib_nh_flags |= RTNH_F_LINKDOWN;
		rt6_multipath_rebalance(rt);
		break;
	}

	return 0;
}

void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.event = event,
		},
	};
	struct net *net = dev_net(dev);

	if (net->ipv6.sysctl.skip_notify_on_dev_down)
		fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
	else
		fib6_clean_all(net, fib6_ifdown, &arg);
}

void rt6_disable_ip(struct net_device *dev, unsigned long event)
{
	rt6_sync_down_dev(dev, event);
	rt6_uncached_list_flush_dev(dev_net(dev), dev);
	neigh_ifdown(&nd_tbl, dev);
}

struct rt6_mtu_change_arg {
	struct net_device *dev;
	unsigned int mtu;
};

static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 PMTU discovery is not optional,
	   so the RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	 */

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	/* For an administrative MTU increase, there is no way to discover
	   the IPv6 PMTU increase, so the PMTU has to be updated here.
	   Since RFC 1981 doesn't cover administrative MTU increases,
	   updating the PMTU on an increase is a MUST. (i.e.
jumbo frame) 4200 */ 4201 if (rt->fib6_nh.fib_nh_dev == arg->dev && 4202 !fib6_metric_locked(rt, RTAX_MTU)) { 4203 u32 mtu = rt->fib6_pmtu; 4204 4205 if (mtu >= arg->mtu || 4206 (mtu < arg->mtu && mtu == idev->cnf.mtu6)) 4207 fib6_metric_set(rt, RTAX_MTU, arg->mtu); 4208 4209 spin_lock_bh(&rt6_exception_lock); 4210 rt6_exceptions_update_pmtu(idev, rt, arg->mtu); 4211 spin_unlock_bh(&rt6_exception_lock); 4212 } 4213 return 0; 4214 } 4215 4216 void rt6_mtu_change(struct net_device *dev, unsigned int mtu) 4217 { 4218 struct rt6_mtu_change_arg arg = { 4219 .dev = dev, 4220 .mtu = mtu, 4221 }; 4222 4223 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg); 4224 } 4225 4226 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { 4227 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) }, 4228 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) }, 4229 [RTA_OIF] = { .type = NLA_U32 }, 4230 [RTA_IIF] = { .type = NLA_U32 }, 4231 [RTA_PRIORITY] = { .type = NLA_U32 }, 4232 [RTA_METRICS] = { .type = NLA_NESTED }, 4233 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, 4234 [RTA_PREF] = { .type = NLA_U8 }, 4235 [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, 4236 [RTA_ENCAP] = { .type = NLA_NESTED }, 4237 [RTA_EXPIRES] = { .type = NLA_U32 }, 4238 [RTA_UID] = { .type = NLA_U32 }, 4239 [RTA_MARK] = { .type = NLA_U32 }, 4240 [RTA_TABLE] = { .type = NLA_U32 }, 4241 [RTA_IP_PROTO] = { .type = NLA_U8 }, 4242 [RTA_SPORT] = { .type = NLA_U16 }, 4243 [RTA_DPORT] = { .type = NLA_U16 }, 4244 }; 4245 4246 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, 4247 struct fib6_config *cfg, 4248 struct netlink_ext_ack *extack) 4249 { 4250 struct rtmsg *rtm; 4251 struct nlattr *tb[RTA_MAX+1]; 4252 unsigned int pref; 4253 int err; 4254 4255 err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, 4256 rtm_ipv6_policy, extack); 4257 if (err < 0) 4258 goto errout; 4259 4260 err = -EINVAL; 4261 rtm = nlmsg_data(nlh); 4262 4263 *cfg = (struct fib6_config){ 4264 .fc_table = rtm->rtm_table, 4265 .fc_dst_len = rtm->rtm_dst_len, 4266 .fc_src_len = rtm->rtm_src_len, 4267 .fc_flags = RTF_UP, 4268 .fc_protocol = rtm->rtm_protocol, 4269 .fc_type = rtm->rtm_type, 4270 4271 .fc_nlinfo.portid = NETLINK_CB(skb).portid, 4272 .fc_nlinfo.nlh = nlh, 4273 .fc_nlinfo.nl_net = sock_net(skb->sk), 4274 }; 4275 4276 if (rtm->rtm_type == RTN_UNREACHABLE || 4277 rtm->rtm_type == RTN_BLACKHOLE || 4278 rtm->rtm_type == RTN_PROHIBIT || 4279 rtm->rtm_type == RTN_THROW) 4280 cfg->fc_flags |= RTF_REJECT; 4281 4282 if (rtm->rtm_type == RTN_LOCAL) 4283 cfg->fc_flags |= RTF_LOCAL; 4284 4285 if (rtm->rtm_flags & RTM_F_CLONED) 4286 cfg->fc_flags |= RTF_CACHE; 4287 4288 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK); 4289 4290 if (tb[RTA_GATEWAY]) { 4291 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); 4292 cfg->fc_flags |= RTF_GATEWAY; 4293 } 4294 if (tb[RTA_VIA]) { 4295 NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute"); 4296 goto errout; 4297 } 4298 4299 if (tb[RTA_DST]) { 4300 int plen = (rtm->rtm_dst_len + 7) >> 3; 4301 4302 if (nla_len(tb[RTA_DST]) < plen) 4303 goto errout; 4304 4305 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen); 4306 } 4307 4308 if (tb[RTA_SRC]) { 4309 int plen = (rtm->rtm_src_len + 7) >> 3; 4310 4311 if (nla_len(tb[RTA_SRC]) < plen) 4312 goto errout; 4313 4314 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen); 4315 } 4316 4317 if (tb[RTA_PREFSRC]) 4318 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]); 4319 4320 if (tb[RTA_OIF]) 4321 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]); 4322 4323 if 
(tb[RTA_PRIORITY]) 4324 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]); 4325 4326 if (tb[RTA_METRICS]) { 4327 cfg->fc_mx = nla_data(tb[RTA_METRICS]); 4328 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]); 4329 } 4330 4331 if (tb[RTA_TABLE]) 4332 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]); 4333 4334 if (tb[RTA_MULTIPATH]) { 4335 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]); 4336 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); 4337 4338 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp, 4339 cfg->fc_mp_len, extack); 4340 if (err < 0) 4341 goto errout; 4342 } 4343 4344 if (tb[RTA_PREF]) { 4345 pref = nla_get_u8(tb[RTA_PREF]); 4346 if (pref != ICMPV6_ROUTER_PREF_LOW && 4347 pref != ICMPV6_ROUTER_PREF_HIGH) 4348 pref = ICMPV6_ROUTER_PREF_MEDIUM; 4349 cfg->fc_flags |= RTF_PREF(pref); 4350 } 4351 4352 if (tb[RTA_ENCAP]) 4353 cfg->fc_encap = tb[RTA_ENCAP]; 4354 4355 if (tb[RTA_ENCAP_TYPE]) { 4356 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); 4357 4358 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack); 4359 if (err < 0) 4360 goto errout; 4361 } 4362 4363 if (tb[RTA_EXPIRES]) { 4364 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ); 4365 4366 if (addrconf_finite_timeout(timeout)) { 4367 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ); 4368 cfg->fc_flags |= RTF_EXPIRES; 4369 } 4370 } 4371 4372 err = 0; 4373 errout: 4374 return err; 4375 } 4376 4377 struct rt6_nh { 4378 struct fib6_info *fib6_info; 4379 struct fib6_config r_cfg; 4380 struct list_head next; 4381 }; 4382 4383 static int ip6_route_info_append(struct net *net, 4384 struct list_head *rt6_nh_list, 4385 struct fib6_info *rt, 4386 struct fib6_config *r_cfg) 4387 { 4388 struct rt6_nh *nh; 4389 int err = -EEXIST; 4390 4391 list_for_each_entry(nh, rt6_nh_list, next) { 4392 /* check if fib6_info already exists */ 4393 if (rt6_duplicate_nexthop(nh->fib6_info, rt)) 4394 return err; 4395 } 4396 4397 nh = kzalloc(sizeof(*nh), GFP_KERNEL); 4398 if (!nh) 4399 return -ENOMEM; 4400 nh->fib6_info = rt; 4401 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); 4402 list_add_tail(&nh->next, rt6_nh_list); 4403 4404 return 0; 4405 } 4406 4407 static void ip6_route_mpath_notify(struct fib6_info *rt, 4408 struct fib6_info *rt_last, 4409 struct nl_info *info, 4410 __u16 nlflags) 4411 { 4412 /* if this is an APPEND route, then rt points to the first route 4413 * inserted and rt_last points to last route inserted. Userspace 4414 * wants a consistent dump of the route which starts at the first 4415 * nexthop. Since sibling routes are always added at the end of 4416 * the list, find the first sibling of the last route appended 4417 */ 4418 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) { 4419 rt = list_first_entry(&rt_last->fib6_siblings, 4420 struct fib6_info, 4421 fib6_siblings); 4422 } 4423 4424 if (rt) 4425 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); 4426 } 4427 4428 static int ip6_route_multipath_add(struct fib6_config *cfg, 4429 struct netlink_ext_ack *extack) 4430 { 4431 struct fib6_info *rt_notif = NULL, *rt_last = NULL; 4432 struct nl_info *info = &cfg->fc_nlinfo; 4433 struct fib6_config r_cfg; 4434 struct rtnexthop *rtnh; 4435 struct fib6_info *rt; 4436 struct rt6_nh *err_nh; 4437 struct rt6_nh *nh, *nh_safe; 4438 __u16 nlflags; 4439 int remaining; 4440 int attrlen; 4441 int err = 1; 4442 int nhn = 0; 4443 int replace = (cfg->fc_nlinfo.nlh && 4444 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); 4445 LIST_HEAD(rt6_nh_list); 4446 4447 nlflags = replace ? 
NLM_F_REPLACE : NLM_F_CREATE;
4448 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4449 nlflags |= NLM_F_APPEND;
4450
4451 remaining = cfg->fc_mp_len;
4452 rtnh = (struct rtnexthop *)cfg->fc_mp;
4453
4454 /* Parse the RTA_MULTIPATH attribute and build a list (rt6_nh_list)
4455 * of fib6_info structs, one per nexthop
4456 */
4457 while (rtnh_ok(rtnh, remaining)) {
4458 memcpy(&r_cfg, cfg, sizeof(*cfg));
4459 if (rtnh->rtnh_ifindex)
4460 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4461
4462 attrlen = rtnh_attrlen(rtnh);
4463 if (attrlen > 0) {
4464 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4465
4466 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4467 if (nla) {
4468 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4469 r_cfg.fc_flags |= RTF_GATEWAY;
4470 }
4471 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4472 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4473 if (nla)
4474 r_cfg.fc_encap_type = nla_get_u16(nla);
4475 }
4476
4477 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4478 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4479 if (IS_ERR(rt)) {
4480 err = PTR_ERR(rt);
4481 rt = NULL;
4482 goto cleanup;
4483 }
4484 if (!rt6_qualify_for_ecmp(rt)) {
4485 err = -EINVAL;
4486 NL_SET_ERR_MSG(extack,
4487 "Device only routes can not be added for IPv6 using the multipath API.");
4488 fib6_info_release(rt);
4489 goto cleanup;
4490 }
4491
4492 rt->fib6_nh.fib_nh_weight = rtnh->rtnh_hops + 1;
4493
4494 err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4495 rt, &r_cfg);
4496 if (err) {
4497 fib6_info_release(rt);
4498 goto cleanup;
4499 }
4500
4501 rtnh = rtnh_next(rtnh, &remaining);
4502 }
4503
4504 /* For both add and replace, send one notification carrying all
4505 * nexthops: skip the notification in fib6_add_rt2node and send one
4506 * with the full route when done.
4507 */
4508 info->skip_notify = 1;
4509
4510 err_nh = NULL;
4511 list_for_each_entry(nh, &rt6_nh_list, next) {
4512 err = __ip6_ins_rt(nh->fib6_info, info, extack);
4513 fib6_info_release(nh->fib6_info);
4514
4515 if (!err) {
4516 /* save reference to last route successfully inserted */
4517 rt_last = nh->fib6_info;
4518
4519 /* save reference to first route for notification */
4520 if (!rt_notif)
4521 rt_notif = nh->fib6_info;
4522 }
4523
4524 /* nh->fib6_info has been used or freed at this point; reset it to NULL */
4525 nh->fib6_info = NULL;
4526 if (err) {
4527 if (replace && nhn)
4528 NL_SET_ERR_MSG_MOD(extack,
4529 "multipath route replace failed (check consistency of installed routes)");
4530 err_nh = nh;
4531 goto add_errout;
4532 }
4533
4534 /* Because each route is added as if it were a single route, we
4535 * remove these flags after the first nexthop: if there is a
4536 * collision, we have already failed to add the first nexthop,
4537 * since fib6_add_rt2node() has rejected it; when replacing, the
4538 * old nexthops have been replaced by the first new one, and the
4539 * rest should be appended to it.
4540 */
4541 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4542 NLM_F_REPLACE);
4543 nhn++;
4544 }
4545
4546 /* success ...
tell user about new route */ 4547 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 4548 goto cleanup; 4549 4550 add_errout: 4551 /* send notification for routes that were added so that 4552 * the delete notifications sent by ip6_route_del are 4553 * coherent 4554 */ 4555 if (rt_notif) 4556 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 4557 4558 /* Delete routes that were already added */ 4559 list_for_each_entry(nh, &rt6_nh_list, next) { 4560 if (err_nh == nh) 4561 break; 4562 ip6_route_del(&nh->r_cfg, extack); 4563 } 4564 4565 cleanup: 4566 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { 4567 if (nh->fib6_info) 4568 fib6_info_release(nh->fib6_info); 4569 list_del(&nh->next); 4570 kfree(nh); 4571 } 4572 4573 return err; 4574 } 4575 4576 static int ip6_route_multipath_del(struct fib6_config *cfg, 4577 struct netlink_ext_ack *extack) 4578 { 4579 struct fib6_config r_cfg; 4580 struct rtnexthop *rtnh; 4581 int remaining; 4582 int attrlen; 4583 int err = 1, last_err = 0; 4584 4585 remaining = cfg->fc_mp_len; 4586 rtnh = (struct rtnexthop *)cfg->fc_mp; 4587 4588 /* Parse a Multipath Entry */ 4589 while (rtnh_ok(rtnh, remaining)) { 4590 memcpy(&r_cfg, cfg, sizeof(*cfg)); 4591 if (rtnh->rtnh_ifindex) 4592 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 4593 4594 attrlen = rtnh_attrlen(rtnh); 4595 if (attrlen > 0) { 4596 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 4597 4598 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 4599 if (nla) { 4600 nla_memcpy(&r_cfg.fc_gateway, nla, 16); 4601 r_cfg.fc_flags |= RTF_GATEWAY; 4602 } 4603 } 4604 err = ip6_route_del(&r_cfg, extack); 4605 if (err) 4606 last_err = err; 4607 4608 rtnh = rtnh_next(rtnh, &remaining); 4609 } 4610 4611 return last_err; 4612 } 4613 4614 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, 4615 struct netlink_ext_ack *extack) 4616 { 4617 struct fib6_config cfg; 4618 int err; 4619 4620 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 4621 if (err < 0) 4622 return err; 4623 4624 if (cfg.fc_mp) 4625 return ip6_route_multipath_del(&cfg, extack); 4626 else { 4627 cfg.fc_delete_all_nh = 1; 4628 return ip6_route_del(&cfg, extack); 4629 } 4630 } 4631 4632 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, 4633 struct netlink_ext_ack *extack) 4634 { 4635 struct fib6_config cfg; 4636 int err; 4637 4638 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 4639 if (err < 0) 4640 return err; 4641 4642 if (cfg.fc_metric == 0) 4643 cfg.fc_metric = IP6_RT_PRIO_USER; 4644 4645 if (cfg.fc_mp) 4646 return ip6_route_multipath_add(&cfg, extack); 4647 else 4648 return ip6_route_add(&cfg, GFP_KERNEL, extack); 4649 } 4650 4651 static size_t rt6_nlmsg_size(struct fib6_info *rt) 4652 { 4653 int nexthop_len = 0; 4654 4655 if (rt->fib6_nsiblings) { 4656 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ 4657 + NLA_ALIGN(sizeof(struct rtnexthop)) 4658 + nla_total_size(16) /* RTA_GATEWAY */ 4659 + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws); 4660 4661 nexthop_len *= rt->fib6_nsiblings; 4662 } 4663 4664 return NLMSG_ALIGN(sizeof(struct rtmsg)) 4665 + nla_total_size(16) /* RTA_SRC */ 4666 + nla_total_size(16) /* RTA_DST */ 4667 + nla_total_size(16) /* RTA_GATEWAY */ 4668 + nla_total_size(16) /* RTA_PREFSRC */ 4669 + nla_total_size(4) /* RTA_TABLE */ 4670 + nla_total_size(4) /* RTA_IIF */ 4671 + nla_total_size(4) /* RTA_OIF */ 4672 + nla_total_size(4) /* RTA_PRIORITY */ 4673 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */ 4674 + nla_total_size(sizeof(struct rta_cacheinfo)) 4675 + 
nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ 4676 + nla_total_size(1) /* RTA_PREF */ 4677 + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws) 4678 + nexthop_len; 4679 } 4680 4681 static int rt6_fill_node(struct net *net, struct sk_buff *skb, 4682 struct fib6_info *rt, struct dst_entry *dst, 4683 struct in6_addr *dest, struct in6_addr *src, 4684 int iif, int type, u32 portid, u32 seq, 4685 unsigned int flags) 4686 { 4687 struct rt6_info *rt6 = (struct rt6_info *)dst; 4688 struct rt6key *rt6_dst, *rt6_src; 4689 u32 *pmetrics, table, rt6_flags; 4690 struct nlmsghdr *nlh; 4691 struct rtmsg *rtm; 4692 long expires = 0; 4693 4694 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags); 4695 if (!nlh) 4696 return -EMSGSIZE; 4697 4698 if (rt6) { 4699 rt6_dst = &rt6->rt6i_dst; 4700 rt6_src = &rt6->rt6i_src; 4701 rt6_flags = rt6->rt6i_flags; 4702 } else { 4703 rt6_dst = &rt->fib6_dst; 4704 rt6_src = &rt->fib6_src; 4705 rt6_flags = rt->fib6_flags; 4706 } 4707 4708 rtm = nlmsg_data(nlh); 4709 rtm->rtm_family = AF_INET6; 4710 rtm->rtm_dst_len = rt6_dst->plen; 4711 rtm->rtm_src_len = rt6_src->plen; 4712 rtm->rtm_tos = 0; 4713 if (rt->fib6_table) 4714 table = rt->fib6_table->tb6_id; 4715 else 4716 table = RT6_TABLE_UNSPEC; 4717 rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT; 4718 if (nla_put_u32(skb, RTA_TABLE, table)) 4719 goto nla_put_failure; 4720 4721 rtm->rtm_type = rt->fib6_type; 4722 rtm->rtm_flags = 0; 4723 rtm->rtm_scope = RT_SCOPE_UNIVERSE; 4724 rtm->rtm_protocol = rt->fib6_protocol; 4725 4726 if (rt6_flags & RTF_CACHE) 4727 rtm->rtm_flags |= RTM_F_CLONED; 4728 4729 if (dest) { 4730 if (nla_put_in6_addr(skb, RTA_DST, dest)) 4731 goto nla_put_failure; 4732 rtm->rtm_dst_len = 128; 4733 } else if (rtm->rtm_dst_len) 4734 if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr)) 4735 goto nla_put_failure; 4736 #ifdef CONFIG_IPV6_SUBTREES 4737 if (src) { 4738 if (nla_put_in6_addr(skb, RTA_SRC, src)) 4739 goto nla_put_failure; 4740 rtm->rtm_src_len = 128; 4741 } else if (rtm->rtm_src_len && 4742 nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr)) 4743 goto nla_put_failure; 4744 #endif 4745 if (iif) { 4746 #ifdef CONFIG_IPV6_MROUTE 4747 if (ipv6_addr_is_multicast(&rt6_dst->addr)) { 4748 int err = ip6mr_get_route(net, skb, rtm, portid); 4749 4750 if (err == 0) 4751 return 0; 4752 if (err < 0) 4753 goto nla_put_failure; 4754 } else 4755 #endif 4756 if (nla_put_u32(skb, RTA_IIF, iif)) 4757 goto nla_put_failure; 4758 } else if (dest) { 4759 struct in6_addr saddr_buf; 4760 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 && 4761 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 4762 goto nla_put_failure; 4763 } 4764 4765 if (rt->fib6_prefsrc.plen) { 4766 struct in6_addr saddr_buf; 4767 saddr_buf = rt->fib6_prefsrc.addr; 4768 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 4769 goto nla_put_failure; 4770 } 4771 4772 pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics; 4773 if (rtnetlink_put_metrics(skb, pmetrics) < 0) 4774 goto nla_put_failure; 4775 4776 if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric)) 4777 goto nla_put_failure; 4778 4779 /* For multipath routes, walk the siblings list and add 4780 * each as a nexthop within RTA_MULTIPATH. 
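 * (The first nexthop encoded is rt's own fib6_nh; each sibling
 * fib6_info then contributes one more nexthop, along with its
 * fib_nh_weight, via fib_add_nexthop().)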
4781 */ 4782 if (rt6) { 4783 if (rt6_flags & RTF_GATEWAY && 4784 nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway)) 4785 goto nla_put_failure; 4786 4787 if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex)) 4788 goto nla_put_failure; 4789 } else if (rt->fib6_nsiblings) { 4790 struct fib6_info *sibling, *next_sibling; 4791 struct nlattr *mp; 4792 4793 mp = nla_nest_start_noflag(skb, RTA_MULTIPATH); 4794 if (!mp) 4795 goto nla_put_failure; 4796 4797 if (fib_add_nexthop(skb, &rt->fib6_nh.nh_common, 4798 rt->fib6_nh.fib_nh_weight) < 0) 4799 goto nla_put_failure; 4800 4801 list_for_each_entry_safe(sibling, next_sibling, 4802 &rt->fib6_siblings, fib6_siblings) { 4803 if (fib_add_nexthop(skb, &sibling->fib6_nh.nh_common, 4804 sibling->fib6_nh.fib_nh_weight) < 0) 4805 goto nla_put_failure; 4806 } 4807 4808 nla_nest_end(skb, mp); 4809 } else { 4810 unsigned char nh_flags = 0; 4811 4812 if (fib_nexthop_info(skb, &rt->fib6_nh.nh_common, 4813 &nh_flags, false) < 0) 4814 goto nla_put_failure; 4815 4816 rtm->rtm_flags |= nh_flags; 4817 } 4818 4819 if (rt6_flags & RTF_EXPIRES) { 4820 expires = dst ? dst->expires : rt->expires; 4821 expires -= jiffies; 4822 } 4823 4824 if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0) 4825 goto nla_put_failure; 4826 4827 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags))) 4828 goto nla_put_failure; 4829 4830 4831 nlmsg_end(skb, nlh); 4832 return 0; 4833 4834 nla_put_failure: 4835 nlmsg_cancel(skb, nlh); 4836 return -EMSGSIZE; 4837 } 4838 4839 static bool fib6_info_uses_dev(const struct fib6_info *f6i, 4840 const struct net_device *dev) 4841 { 4842 if (f6i->fib6_nh.fib_nh_dev == dev) 4843 return true; 4844 4845 if (f6i->fib6_nsiblings) { 4846 struct fib6_info *sibling, *next_sibling; 4847 4848 list_for_each_entry_safe(sibling, next_sibling, 4849 &f6i->fib6_siblings, fib6_siblings) { 4850 if (sibling->fib6_nh.fib_nh_dev == dev) 4851 return true; 4852 } 4853 } 4854 4855 return false; 4856 } 4857 4858 int rt6_dump_route(struct fib6_info *rt, void *p_arg) 4859 { 4860 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; 4861 struct fib_dump_filter *filter = &arg->filter; 4862 unsigned int flags = NLM_F_MULTI; 4863 struct net *net = arg->net; 4864 4865 if (rt == net->ipv6.fib6_null_entry) 4866 return 0; 4867 4868 if ((filter->flags & RTM_F_PREFIX) && 4869 !(rt->fib6_flags & RTF_PREFIX_RT)) { 4870 /* success since this is not a prefix route */ 4871 return 1; 4872 } 4873 if (filter->filter_set) { 4874 if ((filter->rt_type && rt->fib6_type != filter->rt_type) || 4875 (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) || 4876 (filter->protocol && rt->fib6_protocol != filter->protocol)) { 4877 return 1; 4878 } 4879 flags |= NLM_F_DUMP_FILTERED; 4880 } 4881 4882 return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0, 4883 RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid, 4884 arg->cb->nlh->nlmsg_seq, flags); 4885 } 4886 4887 static int inet6_rtm_valid_getroute_req(struct sk_buff *skb, 4888 const struct nlmsghdr *nlh, 4889 struct nlattr **tb, 4890 struct netlink_ext_ack *extack) 4891 { 4892 struct rtmsg *rtm; 4893 int i, err; 4894 4895 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { 4896 NL_SET_ERR_MSG_MOD(extack, 4897 "Invalid header for get route request"); 4898 return -EINVAL; 4899 } 4900 4901 if (!netlink_strict_get_check(skb)) 4902 return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, 4903 rtm_ipv6_policy, extack); 4904 4905 rtm = nlmsg_data(nlh); 4906 if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) || 
4907 (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) || 4908 rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope || 4909 rtm->rtm_type) { 4910 NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request"); 4911 return -EINVAL; 4912 } 4913 if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) { 4914 NL_SET_ERR_MSG_MOD(extack, 4915 "Invalid flags for get route request"); 4916 return -EINVAL; 4917 } 4918 4919 err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX, 4920 rtm_ipv6_policy, extack); 4921 if (err) 4922 return err; 4923 4924 if ((tb[RTA_SRC] && !rtm->rtm_src_len) || 4925 (tb[RTA_DST] && !rtm->rtm_dst_len)) { 4926 NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6"); 4927 return -EINVAL; 4928 } 4929 4930 for (i = 0; i <= RTA_MAX; i++) { 4931 if (!tb[i]) 4932 continue; 4933 4934 switch (i) { 4935 case RTA_SRC: 4936 case RTA_DST: 4937 case RTA_IIF: 4938 case RTA_OIF: 4939 case RTA_MARK: 4940 case RTA_UID: 4941 case RTA_SPORT: 4942 case RTA_DPORT: 4943 case RTA_IP_PROTO: 4944 break; 4945 default: 4946 NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request"); 4947 return -EINVAL; 4948 } 4949 } 4950 4951 return 0; 4952 } 4953 4954 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, 4955 struct netlink_ext_ack *extack) 4956 { 4957 struct net *net = sock_net(in_skb->sk); 4958 struct nlattr *tb[RTA_MAX+1]; 4959 int err, iif = 0, oif = 0; 4960 struct fib6_info *from; 4961 struct dst_entry *dst; 4962 struct rt6_info *rt; 4963 struct sk_buff *skb; 4964 struct rtmsg *rtm; 4965 struct flowi6 fl6 = {}; 4966 bool fibmatch; 4967 4968 err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack); 4969 if (err < 0) 4970 goto errout; 4971 4972 err = -EINVAL; 4973 rtm = nlmsg_data(nlh); 4974 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0); 4975 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH); 4976 4977 if (tb[RTA_SRC]) { 4978 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr)) 4979 goto errout; 4980 4981 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]); 4982 } 4983 4984 if (tb[RTA_DST]) { 4985 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr)) 4986 goto errout; 4987 4988 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]); 4989 } 4990 4991 if (tb[RTA_IIF]) 4992 iif = nla_get_u32(tb[RTA_IIF]); 4993 4994 if (tb[RTA_OIF]) 4995 oif = nla_get_u32(tb[RTA_OIF]); 4996 4997 if (tb[RTA_MARK]) 4998 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]); 4999 5000 if (tb[RTA_UID]) 5001 fl6.flowi6_uid = make_kuid(current_user_ns(), 5002 nla_get_u32(tb[RTA_UID])); 5003 else 5004 fl6.flowi6_uid = iif ? 
INVALID_UID : current_uid(); 5005 5006 if (tb[RTA_SPORT]) 5007 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]); 5008 5009 if (tb[RTA_DPORT]) 5010 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]); 5011 5012 if (tb[RTA_IP_PROTO]) { 5013 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO], 5014 &fl6.flowi6_proto, AF_INET6, 5015 extack); 5016 if (err) 5017 goto errout; 5018 } 5019 5020 if (iif) { 5021 struct net_device *dev; 5022 int flags = 0; 5023 5024 rcu_read_lock(); 5025 5026 dev = dev_get_by_index_rcu(net, iif); 5027 if (!dev) { 5028 rcu_read_unlock(); 5029 err = -ENODEV; 5030 goto errout; 5031 } 5032 5033 fl6.flowi6_iif = iif; 5034 5035 if (!ipv6_addr_any(&fl6.saddr)) 5036 flags |= RT6_LOOKUP_F_HAS_SADDR; 5037 5038 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags); 5039 5040 rcu_read_unlock(); 5041 } else { 5042 fl6.flowi6_oif = oif; 5043 5044 dst = ip6_route_output(net, NULL, &fl6); 5045 } 5046 5047 5048 rt = container_of(dst, struct rt6_info, dst); 5049 if (rt->dst.error) { 5050 err = rt->dst.error; 5051 ip6_rt_put(rt); 5052 goto errout; 5053 } 5054 5055 if (rt == net->ipv6.ip6_null_entry) { 5056 err = rt->dst.error; 5057 ip6_rt_put(rt); 5058 goto errout; 5059 } 5060 5061 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 5062 if (!skb) { 5063 ip6_rt_put(rt); 5064 err = -ENOBUFS; 5065 goto errout; 5066 } 5067 5068 skb_dst_set(skb, &rt->dst); 5069 5070 rcu_read_lock(); 5071 from = rcu_dereference(rt->from); 5072 if (from) { 5073 if (fibmatch) 5074 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, 5075 iif, RTM_NEWROUTE, 5076 NETLINK_CB(in_skb).portid, 5077 nlh->nlmsg_seq, 0); 5078 else 5079 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr, 5080 &fl6.saddr, iif, RTM_NEWROUTE, 5081 NETLINK_CB(in_skb).portid, 5082 nlh->nlmsg_seq, 0); 5083 } else { 5084 err = -ENETUNREACH; 5085 } 5086 rcu_read_unlock(); 5087 5088 if (err < 0) { 5089 kfree_skb(skb); 5090 goto errout; 5091 } 5092 5093 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); 5094 errout: 5095 return err; 5096 } 5097 5098 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, 5099 unsigned int nlm_flags) 5100 { 5101 struct sk_buff *skb; 5102 struct net *net = info->nl_net; 5103 u32 seq; 5104 int err; 5105 5106 err = -ENOBUFS; 5107 seq = info->nlh ? 
info->nlh->nlmsg_seq : 0;
5108
5109 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
5110 if (!skb)
5111 goto errout;
5112
5113 err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
5114 event, info->portid, seq, nlm_flags);
5115 if (err < 0) {
5116 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
5117 WARN_ON(err == -EMSGSIZE);
5118 kfree_skb(skb);
5119 goto errout;
5120 }
5121 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
5122 info->nlh, gfp_any());
5123 return;
5124 errout:
5125 if (err < 0)
5126 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
5127 }
5128
5129 static int ip6_route_dev_notify(struct notifier_block *this,
5130 unsigned long event, void *ptr)
5131 {
5132 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
5133 struct net *net = dev_net(dev);
5134
5135 if (!(dev->flags & IFF_LOOPBACK))
5136 return NOTIFY_OK;
5137
5138 if (event == NETDEV_REGISTER) {
5139 net->ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = dev;
5140 net->ipv6.ip6_null_entry->dst.dev = dev;
5141 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
5142 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5143 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
5144 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
5145 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
5146 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
5147 #endif
5148 } else if (event == NETDEV_UNREGISTER &&
5149 dev->reg_state != NETREG_UNREGISTERED) {
5150 /* NETDEV_UNREGISTER can be fired multiple times by
5151 * netdev_wait_allrefs(). Make sure we only do this once.
5152 */
5153 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
5154 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5155 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
5156 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
5157 #endif
5158 }
5159
5160 return NOTIFY_OK;
5161 }
5162
5163 /*
5164 * /proc
5165 */
5166
5167 #ifdef CONFIG_PROC_FS
5168 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
5169 {
5170 struct net *net = (struct net *)seq->private;
5171 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
5172 net->ipv6.rt6_stats->fib_nodes,
5173 net->ipv6.rt6_stats->fib_route_nodes,
5174 atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
5175 net->ipv6.rt6_stats->fib_rt_entries,
5176 net->ipv6.rt6_stats->fib_rt_cache,
5177 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
5178 net->ipv6.rt6_stats->fib_discarded_routes);
5179
5180 return 0;
5181 }
5182 #endif /* CONFIG_PROC_FS */
5183
5184 #ifdef CONFIG_SYSCTL
5185
5186 static
5187 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
5188 void __user *buffer, size_t *lenp, loff_t *ppos)
5189 {
5190 struct net *net;
5191 int delay;
5192 int ret;
5193 if (!write)
5194 return -EINVAL;
5195
5196 net = (struct net *)ctl->extra1;
5197 delay = net->ipv6.sysctl.flush_delay;
5198 ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
5199 if (ret)
5200 return ret;
5201
5202 fib6_run_gc(delay <= 0 ?
0 : (unsigned long)delay, net, delay > 0); 5203 return 0; 5204 } 5205 5206 static int zero; 5207 static int one = 1; 5208 5209 static struct ctl_table ipv6_route_table_template[] = { 5210 { 5211 .procname = "flush", 5212 .data = &init_net.ipv6.sysctl.flush_delay, 5213 .maxlen = sizeof(int), 5214 .mode = 0200, 5215 .proc_handler = ipv6_sysctl_rtcache_flush 5216 }, 5217 { 5218 .procname = "gc_thresh", 5219 .data = &ip6_dst_ops_template.gc_thresh, 5220 .maxlen = sizeof(int), 5221 .mode = 0644, 5222 .proc_handler = proc_dointvec, 5223 }, 5224 { 5225 .procname = "max_size", 5226 .data = &init_net.ipv6.sysctl.ip6_rt_max_size, 5227 .maxlen = sizeof(int), 5228 .mode = 0644, 5229 .proc_handler = proc_dointvec, 5230 }, 5231 { 5232 .procname = "gc_min_interval", 5233 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 5234 .maxlen = sizeof(int), 5235 .mode = 0644, 5236 .proc_handler = proc_dointvec_jiffies, 5237 }, 5238 { 5239 .procname = "gc_timeout", 5240 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout, 5241 .maxlen = sizeof(int), 5242 .mode = 0644, 5243 .proc_handler = proc_dointvec_jiffies, 5244 }, 5245 { 5246 .procname = "gc_interval", 5247 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval, 5248 .maxlen = sizeof(int), 5249 .mode = 0644, 5250 .proc_handler = proc_dointvec_jiffies, 5251 }, 5252 { 5253 .procname = "gc_elasticity", 5254 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity, 5255 .maxlen = sizeof(int), 5256 .mode = 0644, 5257 .proc_handler = proc_dointvec, 5258 }, 5259 { 5260 .procname = "mtu_expires", 5261 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires, 5262 .maxlen = sizeof(int), 5263 .mode = 0644, 5264 .proc_handler = proc_dointvec_jiffies, 5265 }, 5266 { 5267 .procname = "min_adv_mss", 5268 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss, 5269 .maxlen = sizeof(int), 5270 .mode = 0644, 5271 .proc_handler = proc_dointvec, 5272 }, 5273 { 5274 .procname = "gc_min_interval_ms", 5275 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 5276 .maxlen = sizeof(int), 5277 .mode = 0644, 5278 .proc_handler = proc_dointvec_ms_jiffies, 5279 }, 5280 { 5281 .procname = "skip_notify_on_dev_down", 5282 .data = &init_net.ipv6.sysctl.skip_notify_on_dev_down, 5283 .maxlen = sizeof(int), 5284 .mode = 0644, 5285 .proc_handler = proc_dointvec_minmax, 5286 .extra1 = &zero, 5287 .extra2 = &one, 5288 }, 5289 { } 5290 }; 5291 5292 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net) 5293 { 5294 struct ctl_table *table; 5295 5296 table = kmemdup(ipv6_route_table_template, 5297 sizeof(ipv6_route_table_template), 5298 GFP_KERNEL); 5299 5300 if (table) { 5301 table[0].data = &net->ipv6.sysctl.flush_delay; 5302 table[0].extra1 = net; 5303 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh; 5304 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size; 5305 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; 5306 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout; 5307 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval; 5308 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity; 5309 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires; 5310 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss; 5311 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; 5312 table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down; 5313 5314 /* Don't export sysctls to unprivileged users */ 5315 if (net->user_ns != &init_user_ns) 5316 table[0].procname = NULL; 5317 } 5318 5319 return table; 5320 } 5321 #endif 5322 5323 static int __net_init ip6_route_net_init(struct net *net) 5324 { 5325 int ret = -ENOMEM; 
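/* Per-netns setup below: copy the dst_ops template, allocate the
 * fib6/ip6 null entries (plus the prohibit and blackhole entries
 * under CONFIG_IPV6_MULTIPLE_TABLES), then seed this namespace's
 * routing sysctl defaults.
 */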
5326 5327 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template, 5328 sizeof(net->ipv6.ip6_dst_ops)); 5329 5330 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0) 5331 goto out_ip6_dst_ops; 5332 5333 net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template, 5334 sizeof(*net->ipv6.fib6_null_entry), 5335 GFP_KERNEL); 5336 if (!net->ipv6.fib6_null_entry) 5337 goto out_ip6_dst_entries; 5338 5339 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template, 5340 sizeof(*net->ipv6.ip6_null_entry), 5341 GFP_KERNEL); 5342 if (!net->ipv6.ip6_null_entry) 5343 goto out_fib6_null_entry; 5344 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5345 dst_init_metrics(&net->ipv6.ip6_null_entry->dst, 5346 ip6_template_metrics, true); 5347 5348 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5349 net->ipv6.fib6_has_custom_rules = false; 5350 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template, 5351 sizeof(*net->ipv6.ip6_prohibit_entry), 5352 GFP_KERNEL); 5353 if (!net->ipv6.ip6_prohibit_entry) 5354 goto out_ip6_null_entry; 5355 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5356 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst, 5357 ip6_template_metrics, true); 5358 5359 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template, 5360 sizeof(*net->ipv6.ip6_blk_hole_entry), 5361 GFP_KERNEL); 5362 if (!net->ipv6.ip6_blk_hole_entry) 5363 goto out_ip6_prohibit_entry; 5364 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5365 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, 5366 ip6_template_metrics, true); 5367 #endif 5368 5369 net->ipv6.sysctl.flush_delay = 0; 5370 net->ipv6.sysctl.ip6_rt_max_size = 4096; 5371 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2; 5372 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ; 5373 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ; 5374 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9; 5375 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ; 5376 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; 5377 net->ipv6.sysctl.skip_notify_on_dev_down = 0; 5378 5379 net->ipv6.ip6_rt_gc_expire = 30*HZ; 5380 5381 ret = 0; 5382 out: 5383 return ret; 5384 5385 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5386 out_ip6_prohibit_entry: 5387 kfree(net->ipv6.ip6_prohibit_entry); 5388 out_ip6_null_entry: 5389 kfree(net->ipv6.ip6_null_entry); 5390 #endif 5391 out_fib6_null_entry: 5392 kfree(net->ipv6.fib6_null_entry); 5393 out_ip6_dst_entries: 5394 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 5395 out_ip6_dst_ops: 5396 goto out; 5397 } 5398 5399 static void __net_exit ip6_route_net_exit(struct net *net) 5400 { 5401 kfree(net->ipv6.fib6_null_entry); 5402 kfree(net->ipv6.ip6_null_entry); 5403 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5404 kfree(net->ipv6.ip6_prohibit_entry); 5405 kfree(net->ipv6.ip6_blk_hole_entry); 5406 #endif 5407 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 5408 } 5409 5410 static int __net_init ip6_route_net_init_late(struct net *net) 5411 { 5412 #ifdef CONFIG_PROC_FS 5413 proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops, 5414 sizeof(struct ipv6_route_iter)); 5415 proc_create_net_single("rt6_stats", 0444, net->proc_net, 5416 rt6_stats_seq_show, NULL); 5417 #endif 5418 return 0; 5419 } 5420 5421 static void __net_exit ip6_route_net_exit_late(struct net *net) 5422 { 5423 #ifdef CONFIG_PROC_FS 5424 remove_proc_entry("ipv6_route", net->proc_net); 5425 remove_proc_entry("rt6_stats", net->proc_net); 5426 #endif 5427 } 5428 5429 static struct pernet_operations ip6_route_net_ops = { 5430 .init = ip6_route_net_init, 5431 .exit = 
ip6_route_net_exit,
5432 };
5433
5434 static int __net_init ipv6_inetpeer_init(struct net *net)
5435 {
5436 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5437
5438 if (!bp)
5439 return -ENOMEM;
5440 inet_peer_base_init(bp);
5441 net->ipv6.peers = bp;
5442 return 0;
5443 }
5444
5445 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5446 {
5447 struct inet_peer_base *bp = net->ipv6.peers;
5448
5449 net->ipv6.peers = NULL;
5450 inetpeer_invalidate_tree(bp);
5451 kfree(bp);
5452 }
5453
5454 static struct pernet_operations ipv6_inetpeer_ops = {
5455 .init = ipv6_inetpeer_init,
5456 .exit = ipv6_inetpeer_exit,
5457 };
5458
5459 static struct pernet_operations ip6_route_net_late_ops = {
5460 .init = ip6_route_net_init_late,
5461 .exit = ip6_route_net_exit_late,
5462 };
5463
5464 static struct notifier_block ip6_route_dev_notifier = {
5465 .notifier_call = ip6_route_dev_notify,
5466 .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
5467 };
5468
5469 void __init ip6_route_init_special_entries(void)
5470 {
5471 /* Registration of the loopback device happens before this portion
5472 * of code runs, so the loopback reference in rt6_info will not have
5473 * been taken; take it manually for init_net. */
5474 init_net.ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = init_net.loopback_dev;
5475 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5476 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5477 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5478 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5479 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5480 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5481 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5482 #endif
5483 }
5484
5485 int __init ip6_route_init(void)
5486 {
5487 int ret;
5488 int cpu;
5489
5490 ret = -ENOMEM;
5491 ip6_dst_ops_template.kmem_cachep =
5492 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
5493 SLAB_HWCACHE_ALIGN, NULL);
5494 if (!ip6_dst_ops_template.kmem_cachep)
5495 goto out;
5496
5497 ret = dst_entries_init(&ip6_dst_blackhole_ops);
5498 if (ret)
5499 goto out_kmem_cache;
5500
5501 ret = register_pernet_subsys(&ipv6_inetpeer_ops);
5502 if (ret)
5503 goto out_dst_entries;
5504
5505 ret = register_pernet_subsys(&ip6_route_net_ops);
5506 if (ret)
5507 goto out_register_inetpeer;
5508
5509 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
5510
5511 ret = fib6_init();
5512 if (ret)
5513 goto out_register_subsys;
5514
5515 ret = xfrm6_init();
5516 if (ret)
5517 goto out_fib6_init;
5518
5519 ret = fib6_rules_init();
5520 if (ret)
5521 goto xfrm6_init;
5522
5523 ret = register_pernet_subsys(&ip6_route_net_late_ops);
5524 if (ret)
5525 goto fib6_rules_init;
5526
5527 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5528 inet6_rtm_newroute, NULL, 0);
5529 if (ret < 0)
5530 goto out_register_late_subsys;
5531
5532 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5533 inet6_rtm_delroute, NULL, 0);
5534 if (ret < 0)
5535 goto out_register_late_subsys;
5536
5537 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5538 inet6_rtm_getroute, NULL,
5539 RTNL_FLAG_DOIT_UNLOCKED);
5540 if (ret < 0)
5541 goto out_register_late_subsys;
5542
5543 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
5544 if (ret)
5545 goto out_register_late_subsys;
5546
5547 for_each_possible_cpu(cpu) {
5548 struct uncached_list *ul =
per_cpu_ptr(&rt6_uncached_list, cpu); 5549 5550 INIT_LIST_HEAD(&ul->head); 5551 spin_lock_init(&ul->lock); 5552 } 5553 5554 out: 5555 return ret; 5556 5557 out_register_late_subsys: 5558 rtnl_unregister_all(PF_INET6); 5559 unregister_pernet_subsys(&ip6_route_net_late_ops); 5560 fib6_rules_init: 5561 fib6_rules_cleanup(); 5562 xfrm6_init: 5563 xfrm6_fini(); 5564 out_fib6_init: 5565 fib6_gc_cleanup(); 5566 out_register_subsys: 5567 unregister_pernet_subsys(&ip6_route_net_ops); 5568 out_register_inetpeer: 5569 unregister_pernet_subsys(&ipv6_inetpeer_ops); 5570 out_dst_entries: 5571 dst_entries_destroy(&ip6_dst_blackhole_ops); 5572 out_kmem_cache: 5573 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); 5574 goto out; 5575 } 5576 5577 void ip6_route_cleanup(void) 5578 { 5579 unregister_netdevice_notifier(&ip6_route_dev_notifier); 5580 unregister_pernet_subsys(&ip6_route_net_late_ops); 5581 fib6_rules_cleanup(); 5582 xfrm6_fini(); 5583 fib6_gc_cleanup(); 5584 unregister_pernet_subsys(&ipv6_inetpeer_ops); 5585 unregister_pernet_subsys(&ip6_route_net_ops); 5586 dst_entries_destroy(&ip6_dst_blackhole_ops); 5587 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); 5588 } 5589
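/* Userspace usage sketch (assumed iproute2 invocation, shown for
 * illustration only): an ECMP route exercises the
 * ip6_route_multipath_add() path above, each "nexthop" stanza
 * arriving as one struct rtnexthop inside RTA_MULTIPATH:
 *
 *   ip -6 route add 2001:db8::/64 \
 *           nexthop via fe80::1 dev eth0 weight 1 \
 *           nexthop via fe80::2 dev eth1 weight 2
 *
 * iproute2 encodes weight - 1 into rtnh_hops, which the kernel
 * converts back with rtnh_hops + 1 when setting fib_nh_weight.
 */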