/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable.  otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/rtnh.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <net/ip.h>
#include <linux/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

static int ip6_rt_type_to_error(u8 fib6_type);

#define CREATE_TRACE_POINTS
#include <trace/events/fib6.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
#undef CREATE_TRACE_POINTS
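/* Ranking used when scoring a next hop by neighbour reachability (a short
 * summary drawn from how the values below are consumed in this file):
 * negative values are failures of increasing severity, so callers can
 * compare scores directly.  RT6_NUD_FAIL_HARD rules the route out,
 * RT6_NUD_FAIL_PROBE keeps the nexthop only as a last resort while a
 * probe may still revive it, RT6_NUD_FAIL_DO_RR asks rt6_select() to
 * round-robin to a sibling, and RT6_NUD_SUCCEED marks the nexthop as
 * (probably) reachable.
 */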
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};

static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ip6_default_advmss(const struct dst_entry *dst);
static unsigned int ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void ip6_dst_destroy(struct dst_entry *);
static void ip6_dst_ifdown(struct dst_entry *,
			   struct net_device *dev, int how);
static int ip6_dst_gc(struct dst_ops *ops);

static int ip6_pkt_discard(struct sk_buff *skb);
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void ip6_link_failure(struct sk_buff *skb);
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu);
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
			    struct sk_buff *skb);
static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
			   int strict);
static size_t rt6_nlmsg_size(struct fib6_info *rt);
static int rt6_fill_node(struct net *net,
			 struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_add_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev,
					    unsigned int pref);
static struct fib6_info *rt6_get_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev);
#endif

struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);

void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}

static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}

static inline const void *choose_neigh_daddr(const struct in6_addr *p,
					     struct sk_buff *skb,
					     const void *daddr)
{
	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}
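/* Resolve the neighbour entry for a route's next hop: prefer the gateway
 * address when one is set, else the packet's destination, else the
 * caller-supplied daddr (see choose_neigh_daddr() above), and fall back
 * to creating a fresh entry in nd_tbl when the lookup misses.  NULL is
 * returned when neigh_create() fails.
 */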
struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
{
	struct neighbour *n;

	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	if (n)
		return n;

	n = neigh_create(&nd_tbl, daddr, dev);
	return IS_ERR(n) ? NULL : n;
}

static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
}

static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}

static struct dst_ops ip6_dst_ops_template = {
	.family			= AF_INET6,
	.gc			= ip6_dst_gc,
	.gc_thresh		= 1024,
	.check			= ip6_dst_check,
	.default_advmss		= ip6_default_advmss,
	.mtu			= ip6_mtu,
	.cow_metrics		= dst_cow_metrics_generic,
	.destroy		= ip6_dst_destroy,
	.ifdown			= ip6_dst_ifdown,
	.negative_advice	= ip6_negative_advice,
	.link_failure		= ip6_link_failure,
	.update_pmtu		= ip6_rt_update_pmtu,
	.redirect		= rt6_do_redirect,
	.local_out		= __ip6_local_out,
	.neigh_lookup		= ip6_dst_neigh_lookup,
	.confirm_neigh		= ip6_confirm_neigh,
};

static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

static struct dst_ops ip6_dst_blackhole_ops = {
	.family			= AF_INET6,
	.destroy		= ip6_dst_destroy,
	.check			= ip6_dst_check,
	.mtu			= ip6_blackhole_mtu,
	.default_advmss		= ip6_default_advmss,
	.update_pmtu		= ip6_rt_blackhole_update_pmtu,
	.redirect		= ip6_rt_blackhole_redirect,
	.cow_metrics		= dst_cow_metrics_generic,
	.neigh_lookup		= ip6_dst_neigh_lookup,
};

static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol	= RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= REFCOUNT_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};

static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
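/* Template routes for the policy-routing special cases (a summary of the
 * initializers above and below): lookups that resolve to one of these
 * entries do not forward, they fail with a fixed error instead:
 *
 *	null entry	-> -ENETUNREACH	(discard)
 *	prohibit entry	-> -EACCES	(prohibit)
 *	blackhole entry	-> -EINVAL	(silent discard)
 *
 * The prohibit and blackhole templates only exist when multiple routing
 * tables are configured.
 */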
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif

static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}

/* allocate dst with ip6_dst_ops */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);

static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	ip_dst_metrics_put(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	from = xchg((__force struct fib6_info **)&rt->from, NULL);
	fib6_info_release(from);
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}

static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}

void fib6_select_path(const struct net *net, struct fib6_result *res,
		      struct flowi6 *fl6, int oif, bool have_oif_match,
		      const struct sk_buff *skb, int strict)
{
	struct fib6_info *sibling, *next_sibling;
	struct fib6_info *match = res->f6i;

	if (!match->fib6_nsiblings || have_oif_match)
		goto out;

	/* We might have already computed the hash for ICMPv6 errors. In
	 * such a case it will always be non-zero. Otherwise now is the
	 * time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh->fib_nh_upper_bound))
		goto out;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		const struct fib6_nh *nh = sibling->fib6_nh;
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

out:
	res->f6i = match;
	res->nh = match->fib6_nh;
}
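/* fib6_select_path() above implements weighted multipath via hash
 * thresholds: each sibling carries a fib_nh_upper_bound precomputed from
 * the nexthop weights (outside this file), and the first sibling whose
 * bound is >= the 31-bit flow hash wins.  A sketch with assumed weights:
 *
 *	weights 1:3 over two siblings ->
 *		upper bounds of roughly 0x1fffffff and 0x7fffffff;
 *	mp_hash = 0x2af37c21 > 0x1fffffff, so the second nexthop is used.
 */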
/*
 *	Route lookup. rcu_read_lock() should be held.
 */

static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh,
			       const struct in6_addr *saddr, int oif, int flags)
{
	const struct net_device *dev;

	if (nh->fib_nh_flags & RTNH_F_DEAD)
		return false;

	dev = nh->fib_nh_dev;
	if (oif) {
		if (dev->ifindex == oif)
			return true;
	} else {
		if (ipv6_chk_addr(net, saddr, dev,
				  flags & RT6_LOOKUP_F_IFACE))
			return true;
	}

	return false;
}

static void rt6_device_match(struct net *net, struct fib6_result *res,
			     const struct in6_addr *saddr, int oif, int flags)
{
	struct fib6_info *f6i = res->f6i;
	struct fib6_info *spf6i;
	struct fib6_nh *nh;

	if (!oif && ipv6_addr_any(saddr)) {
		nh = f6i->fib6_nh;
		if (!(nh->fib_nh_flags & RTNH_F_DEAD))
			goto out;
	}

	for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) {
		nh = spf6i->fib6_nh;
		if (__rt6_device_match(net, nh, saddr, oif, flags)) {
			res->f6i = spf6i;
			goto out;
		}
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE) {
		res->f6i = net->ipv6.fib6_null_entry;
		nh = res->f6i->fib6_nh;
		goto out;
	}

	nh = f6i->fib6_nh;
	if (nh->fib_nh_flags & RTNH_F_DEAD) {
		res->f6i = net->ipv6.fib6_null_entry;
		nh = res->f6i->fib6_nh;
	}
out:
	res->nh = nh;
	res->fib6_type = res->f6i->fib6_type;
	res->fib6_flags = res->f6i->fib6_flags;
}
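/* A nexthop matches the flow's device constraints when either an
 * outgoing interface was given and the nexthop device has that ifindex,
 * or no oif was given and the source address is configured on the
 * device.  rt6_device_match() above walks the siblings with these rules
 * and falls back to the null entry when RT6_LOOKUP_F_IFACE makes the
 * oif mandatory.
 */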
#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}

static void rt6_probe(struct fib6_nh *fib6_nh)
{
	struct __rt6_probe_work *work = NULL;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;
	struct inet6_dev *idev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!fib6_nh->fib_nh_gw_family)
		return;

	nh_gw = &fib6_nh->fib_nh_gw6;
	dev = fib6_nh->fib_nh_dev;
	rcu_read_lock_bh();
	idev = __in6_dev_get(dev);
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else if (time_after(jiffies, fib6_nh->last_probe +
				       idev->cnf.rtr_probe_interval)) {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		fib6_nh->last_probe = jiffies;
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct fib6_nh *fib6_nh)
{
}
#endif

/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev,
					  &fib6_nh->fib_nh_gw6);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
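/* Score layout used below (derived from the code): bit 1 is set when the
 * nexthop device matches the requested oif, and with router preference
 * support the RA preference is decoded into bits 2-3, so an interface
 * match always outranks preference alone.  Negative rt6_nud_state values
 * pass through unchanged as failures.
 */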
static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
			   int strict)
{
	int m = 0;

	if (!oif || nh->fib_nh_dev->ifindex == oif)
		m = 2;

	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2;
#endif
	if ((strict & RT6_LOOKUP_F_REACHABLE) &&
	    !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) {
		int n = rt6_check_neigh(nh);
		if (n < 0)
			return n;
	}
	return m;
}

static bool find_match(struct fib6_nh *nh, u32 fib6_flags,
		       int oif, int strict, int *mpri, bool *do_rr)
{
	bool match_do_rr = false;
	bool rc = false;
	int m;

	if (nh->fib_nh_flags & RTNH_F_DEAD)
		goto out;

	if (ip6_ignore_linkdown(nh->fib_nh_dev) &&
	    nh->fib_nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	m = rt6_score_route(nh, fib6_flags, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(nh);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		rc = true;
	}
out:
	return rc;
}
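/* Round-robin leaf scan: find_rr_leaf() below probes the current metric
 * group in two passes, rr_head to the end of the list, then leaf up to
 * rr_head, so every sibling of equal metric is considered exactly once
 * starting at the round-robin pointer.  If nothing matched and a
 * lower-priority metric group was seen (*cont), that group is scanned
 * as a final pass.
 */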
static void __find_rr_leaf(struct fib6_info *f6i_start,
			   struct fib6_info *nomatch, u32 metric,
			   struct fib6_result *res, struct fib6_info **cont,
			   int oif, int strict, bool *do_rr, int *mpri)
{
	struct fib6_info *f6i;

	for (f6i = f6i_start;
	     f6i && f6i != nomatch;
	     f6i = rcu_dereference(f6i->fib6_next)) {
		struct fib6_nh *nh;

		if (cont && f6i->fib6_metric != metric) {
			*cont = f6i;
			return;
		}

		if (fib6_check_expired(f6i))
			continue;

		nh = f6i->fib6_nh;
		if (find_match(nh, f6i->fib6_flags, oif, strict, mpri, do_rr)) {
			res->f6i = f6i;
			res->nh = nh;
			res->fib6_flags = f6i->fib6_flags;
			res->fib6_type = f6i->fib6_type;
		}
	}
}

static void find_rr_leaf(struct fib6_node *fn, struct fib6_info *leaf,
			 struct fib6_info *rr_head, int oif, int strict,
			 bool *do_rr, struct fib6_result *res)
{
	u32 metric = rr_head->fib6_metric;
	struct fib6_info *cont = NULL;
	int mpri = -1;

	__find_rr_leaf(rr_head, NULL, metric, res, &cont,
		       oif, strict, do_rr, &mpri);

	__find_rr_leaf(leaf, rr_head, metric, res, &cont,
		       oif, strict, do_rr, &mpri);

	if (res->f6i || !cont)
		return;

	__find_rr_leaf(cont, NULL, metric, res, NULL,
		       oif, strict, do_rr, &mpri);
}

static void rt6_select(struct net *net, struct fib6_node *fn, int oif,
		       struct fib6_result *res, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *rt0;
	bool do_rr = false;
	int key_plen;

	/* make sure this function or its helpers sets f6i */
	res->f6i = NULL;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		goto out;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		goto out;

	find_rr_leaf(fn, leaf, rt0, oif, strict, &do_rr, res);
	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

out:
	if (!res->f6i) {
		res->f6i = net->ipv6.fib6_null_entry;
		res->nh = res->f6i->fib6_nh;
		res->fib6_flags = res->f6i->fib6_flags;
		res->fib6_type = res->f6i->fib6_type;
	}
}

static bool rt6_is_gw_or_nonexthop(const struct fib6_result *res)
{
	return (res->f6i->fib6_flags & RTF_NONEXTHOP) ||
	       res->nh->fib_nh_gw_family;
}
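/* Handling of the Route Information option (RFC 4191).  The option's
 * length field counts units of 8 octets and bounds how much prefix is
 * actually present, which is what the sanity checks in rt6_route_rcv()
 * below enforce:
 *
 *	length > 3 or prefix_len > 128	-> rejected
 *	prefix_len > 64			-> needs length >= 2
 *	prefix_len > 0			-> needs length >= 1
 *
 * A zero lifetime deletes the route; otherwise it is added or refreshed
 * with the advertised preference.
 */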
#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
#endif

/*
 *	Misc support functions
 */

/* called with rcu_lock held */
static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res)
{
	struct net_device *dev = res->nh->fib_nh_dev;

	if (res->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * the device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&res->f6i->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}

static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};

static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}

static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
{
	unsigned short flags = 0;

	if (rt->dst_nocount)
		flags |= DST_NOCOUNT;
	if (rt->dst_nopolicy)
		flags |= DST_NOPOLICY;
	if (rt->dst_host)
		flags |= DST_HOST;

	return flags;
}

static void ip6_rt_init_dst_reject(struct rt6_info *rt, u8 fib6_type)
{
	rt->dst.error = ip6_rt_type_to_error(fib6_type);

	switch (fib6_type) {
	case RTN_BLACKHOLE:
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}

static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res)
{
	struct fib6_info *f6i = res->f6i;

	if (res->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, res->fib6_type);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	if (res->fib6_type == RTN_LOCAL || res->fib6_type == RTN_ANYCAST) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&f6i->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (res->nh->fib_nh_lws) {
		rt->dst.lwtstate = lwtstate_get(res->nh->fib_nh_lws);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}

/* Caller must already hold reference to @from */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	rcu_assign_pointer(rt->from, from);
	ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
}

/* Caller must already hold reference to f6i in result */
static void ip6_rt_copy_init(struct rt6_info *rt, const struct fib6_result *res)
{
	const struct fib6_nh *nh = res->nh;
	const struct net_device *dev = nh->fib_nh_dev;
	struct fib6_info *f6i = res->f6i;

	ip6_rt_init_dst(rt, res);

	rt->rt6i_dst = f6i->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_flags = res->fib6_flags;
	if (nh->fib_nh_gw_family) {
		rt->rt6i_gateway = nh->fib_nh_gw6;
		rt->rt6i_flags |= RTF_GATEWAY;
	}
	rt6_set_from(rt, f6i);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = f6i->fib6_src;
#endif
}
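/* When a lookup lands on a node with no usable route, walk back up the
 * tree: re-descend into the parent's source subtree when one exists and
 * we did not come from it, otherwise continue with the parent itself,
 * stopping at the first node that carries routes (RTN_RTINFO) or giving
 * up at the tree root (RTN_TL_ROOT).
 */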
static struct fib6_node *fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}

static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (net) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}

/* called with rcu_lock held */
static struct rt6_info *ip6_create_rt_rcu(const struct fib6_result *res)
{
	struct net_device *dev = res->nh->fib_nh_dev;
	struct fib6_info *f6i = res->f6i;
	unsigned short flags;
	struct rt6_info *nrt;

	if (!fib6_info_hold_safe(f6i))
		goto fallback;

	flags = fib6_info_dst_flags(f6i);
	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (!nrt) {
		fib6_info_release(f6i);
		goto fallback;
	}

	ip6_rt_copy_init(nrt, res);
	return nrt;

fallback:
	nrt = dev_net(dev)->ipv6.ip6_null_entry;
	dst_hold(&nrt->dst);
	return nrt;
}

static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_result res = {};
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	res.f6i = rcu_dereference(fn->leaf);
	if (!res.f6i)
		res.f6i = net->ipv6.fib6_null_entry;
	else
		rt6_device_match(net, &res, &fl6->saddr, fl6->flowi6_oif,
				 flags);

	if (res.f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;

		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
		goto out;
	}

	fib6_select_path(net, &res, fl6, fl6->flowi6_oif,
			 fl6->flowi6_oif != 0, skb, flags);

	/* Search through exception table */
	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);
	} else {
		rt = ip6_create_rt_rcu(&res);
	}

out:
	trace_fib6_table_lookup(net, &res, table, fl6);

	rcu_read_unlock();

	return rt;
}

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
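/* rt6_lookup() below is a convenience wrapper around the policy lookup:
 * it builds the flowi6 itself and turns error dsts into a NULL result,
 * so the caller owns a reference on success.  A minimal, hypothetical
 * caller would look like:
 *
 *	struct rt6_info *rt;
 *
 *	rt = rt6_lookup(net, &daddr, NULL, 0, NULL, 0);
 *	if (rt) {
 *		... use rt->dst ...
 *		ip6_rt_put(rt);
 *	}
 */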
struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);

/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason, the
 * route is released.
 * Caller must hold dst before calling it.
 */

static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = { .nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}

static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct fib6_info *f6i = res->f6i;
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (!fib6_info_hold_safe(f6i))
		return NULL;

	dev = ip6_rt_get_dev_rcu(res);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt) {
		fib6_info_release(f6i);
		return NULL;
	}

	ip6_rt_copy_init(rt, res);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(res)) {
		if (f6i->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&f6i->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}

static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res)
{
	struct fib6_info *f6i = res->f6i;
	unsigned short flags = fib6_info_dst_flags(f6i);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	if (!fib6_info_hold_safe(f6i))
		return NULL;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(res);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt) {
		fib6_info_release(f6i);
		return NULL;
	}
	ip6_rt_copy_init(pcpu_rt, res);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(res->nh->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt);

	return pcpu_rt;
}
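/* Per-CPU route cache: each fib6_nh keeps one cached rt6_info per CPU so
 * the common path can reuse a clone without touching shared state.  The
 * slot is claimed below with cmpxchg(); since writers run with
 * local_bh_disable() on their own CPU, a non-NULL previous value would
 * indicate a bug (hence the BUG_ON).
 */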
static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    const struct fib6_result *res)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(res);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(res->nh->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	if (res->f6i->fib6_destroying) {
		struct fib6_info *from;

		from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL);
		fib6_info_release(from);
	}

	return pcpu_rt;
}

/* exception hash table implementation
 */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct fib6_info *from;
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	net->ipv6.rt6_stats->fib_rt_cache--;

	/* completely purge the exception to allow releasing the held
	 * resources: some [sk] cache may keep the dst around for unlimited
	 * time
	 */
	from = xchg((__force struct fib6_info **)&rt6_ex->rt6i->from, NULL);
	fib6_info_release(from);
	dst_dev_put(&rt6_ex->rt6i->dst);

	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
}

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}

static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
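/* The exception table is a fixed-size hash: rt6_exception_hash() above
 * jhashes the destination under a boot-time random seed (and, with
 * subtrees enabled, folds the source address in as the initial value of
 * a second jhash), then hash_32() reduces the result to
 * FIB6_EXCEPTION_BUCKET_SIZE_SHIFT bits, i.e. a bucket index in
 * [0, FIB6_EXCEPTION_BUCKET_SIZE).
 */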
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

static unsigned int fib6_mtu(const struct fib6_result *res)
{
	const struct fib6_nh *nh = res->nh;
	unsigned int mtu;

	if (res->f6i->fib6_pmtu) {
		mtu = res->f6i->fib6_pmtu;
	} else {
		struct net_device *dev = nh->fib_nh_dev;
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
}

#define FIB6_EXCEPTION_BUCKET_FLUSHED  0x1UL

/* used when the flushed bit is not relevant, only access to the bucket
 * (i.e., all bucket users except rt6_insert_exception);
 *
 * called under rcu lock; sometimes called with rt6_exception_lock held
 */
static
struct rt6_exception_bucket *fib6_nh_get_excptn_bucket(const struct fib6_nh *nh,
						       spinlock_t *lock)
{
	struct rt6_exception_bucket *bucket;

	if (lock)
		bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
						   lockdep_is_held(lock));
	else
		bucket = rcu_dereference(nh->rt6i_exception_bucket);

	/* remove bucket flushed bit if set */
	if (bucket) {
		unsigned long p = (unsigned long)bucket;

		p &= ~FIB6_EXCEPTION_BUCKET_FLUSHED;
		bucket = (struct rt6_exception_bucket *)p;
	}

	return bucket;
}

static bool fib6_nh_excptn_bucket_flushed(struct rt6_exception_bucket *bucket)
{
	unsigned long p = (unsigned long)bucket;

	return !!(p & FIB6_EXCEPTION_BUCKET_FLUSHED);
}

/* called with rt6_exception_lock held */
static void fib6_nh_excptn_bucket_set_flushed(struct fib6_nh *nh,
					      spinlock_t *lock)
{
	struct rt6_exception_bucket *bucket;
	unsigned long p;

	bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
					   lockdep_is_held(lock));

	p = (unsigned long)bucket;
	p |= FIB6_EXCEPTION_BUCKET_FLUSHED;
	bucket = (struct rt6_exception_bucket *)p;
	rcu_assign_pointer(nh->rt6i_exception_bucket, bucket);
}
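/* Inserting a cached route (below) only succeeds when its MTU is
 * actually lower than the parent route's (so rt6_mtu_change() cannot be
 * undone by stale exceptions); it replaces any entry for the same
 * (daddr, saddr) key, evicts the oldest entry once a bucket exceeds
 * FIB6_MAX_DEPTH, and bumps the table sernum so cached dsts get
 * revalidated.
 */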
static int rt6_insert_exception(struct rt6_info *nrt,
				const struct fib6_result *res)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct fib6_info *f6i = res->f6i;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_nh *nh = res->nh;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(nh->rt6i_exception_bucket, bucket);
	} else if (fib6_nh_excptn_bucket_flushed(bucket)) {
		err = -EINVAL;
		goto out;
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* fib6_src.plen != 0 indicates f6i is in subtree
	 * and exception table is indexed by a hash of
	 * both fib6_dst and fib6_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only fib6_dst.
	 */
	if (f6i->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif
	/* rt6_mtu_change() might lower mtu on f6i.
	 * Only insert this exception route if its mtu
	 * is less than f6i's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(res)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&f6i->fib6_table->tb6_lock);
		fib6_update_sernum(net, f6i);
		spin_unlock_bh(&f6i->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}

static void fib6_nh_flush_exceptions(struct fib6_nh *nh, struct fib6_info *from)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);

	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
	if (!bucket)
		goto out;

	/* Prevent rt6_insert_exception() from recreating the bucket list */
	if (!from)
		fib6_nh_excptn_bucket_set_flushed(nh, &rt6_exception_lock);

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) {
			if (!from ||
			    rcu_access_pointer(rt6_ex->rt6i->from) == from)
				rt6_remove_exception(bucket, rt6_ex);
		}
		WARN_ON_ONCE(!from && bucket->depth);
		bucket++;
	}
out:
	spin_unlock_bh(&rt6_exception_lock);
}

void rt6_flush_exceptions(struct fib6_info *f6i)
{
	fib6_nh_flush_exceptions(f6i->fib6_nh, f6i);
}
/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	const struct in6_addr *src_key = NULL;
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct rt6_info *ret = NULL;

#ifdef CONFIG_IPV6_SUBTREES
	/* fib6_src.plen != 0 indicates f6i is in subtree
	 * and exception table is indexed by a hash of
	 * both fib6_dst and fib6_src.
	 * However, the src addr used to create the hash
	 * might not be exactly the passed in saddr which
	 * is a /128 addr from the flow.
	 * So we need to use f6i->fib6_src to redo the lookup
	 * if the passed in saddr does not find anything.
	 * (See the logic in ip6_rt_cache_alloc() on how
	 * rt->rt6i_src is updated.)
	 */
	if (res->f6i->fib6_src.plen)
		src_key = saddr;
find_ex:
#endif
	bucket = fib6_nh_get_excptn_bucket(res->nh, NULL);
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		ret = rt6_ex->rt6i;

#ifdef CONFIG_IPV6_SUBTREES
	/* Use fib6_src as src_key and redo lookup */
	if (!ret && src_key && src_key != &res->f6i->fib6_src.addr) {
		src_key = &res->f6i->fib6_src.addr;
		goto find_ex;
	}
#endif

	return ret;
}

/* Remove the passed in cached rt from the hash table that contains it */
static int fib6_nh_remove_exception(const struct fib6_nh *nh, int plen,
				    const struct rt6_info *rt)
{
	const struct in6_addr *src_key = NULL;
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int err;

	if (!rcu_access_pointer(nh->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}

static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);
	if (!from || !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	return fib6_nh_remove_exception(from->fib6_nh,
					from->fib6_src.plen, rt);
}
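/* Each exception carries a stamp of its last refresh; it is updated
 * below and consulted by rt6_exception_remove_oldest() when a full
 * bucket needs a victim.  Actual aging in rt6_age_examine_exception()
 * is driven by dst.lastuse and RTF_EXPIRES instead.
 */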
/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void fib6_nh_update_exception(const struct fib6_nh *nh, int plen,
				     const struct rt6_info *rt)
{
	const struct in6_addr *src_key = NULL;
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;

	bucket = fib6_nh_get_excptn_bucket(nh, NULL);
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, &rt->rt6i_dst.addr, src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;
}

static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct fib6_info *from;

	rcu_read_lock();

	from = rcu_dereference(rt->from);
	if (!from || !(rt->rt6i_flags & RTF_CACHE))
		goto unlock;

	fib6_nh_update_exception(from->fib6_nh, from->fib6_src.plen, rt);
unlock:
	rcu_read_unlock();
}

static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
{
	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
	 *
	 * If the new MTU is higher, and the route PMTU is equal to the local
	 * MTU, this means the old MTU is the lowest in the path, so allow
	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
	 * handle this.
	 */

	if (dst_mtu(&rt->dst) >= mtu)
		return true;

	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
		return true;

	return false;
}

static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       const struct fib6_nh *nh, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}

#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)
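/* RTF_CACHE_GATEWAY combines both flags so a single masked compare below
 * selects exactly the cached entries that actually go through a gateway;
 * fib6_nh_exceptions_clean_tohost() uses it to drop exceptions whose
 * gateway matches an address that (as the name suggests) now belongs to
 * the host itself.
 */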
static void fib6_nh_exceptions_clean_tohost(const struct fib6_nh *nh,
					    const struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(nh->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}

static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non-gateway exceptions
	 * even if others still have references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones - are pruned when
	 * expired, independently of their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}

static void fib6_nh_age_exceptions(const struct fib6_nh *nh,
				   struct fib6_gc_args *gc_args,
				   unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(nh->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}

void rt6_age_exceptions(struct fib6_info *f6i,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	fib6_nh_age_exceptions(f6i->fib6_nh, gc_args, now);
}

/* must be called with rcu lock held */
int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif,
		      struct flowi6 *fl6, struct fib6_result *res, int strict)
{
	struct fib6_node *fn, *saved_fn;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	rt6_select(net, fn, oif, res, strict);
	if (res->f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, res, table, fl6);

	return 0;
}

struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_result res = {};
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	fib6_table_lookup(net, table, oif, fl6, &res, strict);
	if (res.f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		return rt;
	}

	fib6_select_path(net, &res, fl6, oif, false, skb, strict);

	/* Search through exception table */
	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !res.nh->fib_nh_gw_family)) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look up the route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		return uncached_rt;
	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(&res);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, &res);

		local_bh_enable();
		rcu_read_unlock();

		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);

static struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
}

struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6,
					 const struct sk_buff *skb,
					 int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
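/* For multipath hashing of ICMPv6 errors, the flow keys are taken from
 * the offending packet quoted inside the error (see below): hashing on
 * the inner header makes the error follow the same path as the flow it
 * refers to, instead of being spread by the outer ICMP header.
 */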
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowlabel(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}
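/* Two hash policies are implemented below: policy 0 hashes the L3 keys
 * (addresses plus flow label and protocol), policy 1 hashes the full
 * five-tuple, reusing a pre-dissected skb hash when one is available.
 * The result is shifted right by one, presumably so it stays within the
 * signed 31-bit range that the fib_nh_upper_bound comparison expects.
 */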
key_iph->daddr; 2106 keys->tags.flow_label = ip6_flowlabel(key_iph); 2107 keys->basic.ip_proto = key_iph->nexthdr; 2108 } 2109 } 2110 2111 /* if skb is set it will be used and fl6 can be NULL */ 2112 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, 2113 const struct sk_buff *skb, struct flow_keys *flkeys) 2114 { 2115 struct flow_keys hash_keys; 2116 u32 mhash; 2117 2118 switch (ip6_multipath_hash_policy(net)) { 2119 case 0: 2120 memset(&hash_keys, 0, sizeof(hash_keys)); 2121 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2122 if (skb) { 2123 ip6_multipath_l3_keys(skb, &hash_keys, flkeys); 2124 } else { 2125 hash_keys.addrs.v6addrs.src = fl6->saddr; 2126 hash_keys.addrs.v6addrs.dst = fl6->daddr; 2127 hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); 2128 hash_keys.basic.ip_proto = fl6->flowi6_proto; 2129 } 2130 break; 2131 case 1: 2132 if (skb) { 2133 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP; 2134 struct flow_keys keys; 2135 2136 /* short-circuit if we already have L4 hash present */ 2137 if (skb->l4_hash) 2138 return skb_get_hash_raw(skb) >> 1; 2139 2140 memset(&hash_keys, 0, sizeof(hash_keys)); 2141 2142 if (!flkeys) { 2143 skb_flow_dissect_flow_keys(skb, &keys, flag); 2144 flkeys = &keys; 2145 } 2146 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2147 hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src; 2148 hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst; 2149 hash_keys.ports.src = flkeys->ports.src; 2150 hash_keys.ports.dst = flkeys->ports.dst; 2151 hash_keys.basic.ip_proto = flkeys->basic.ip_proto; 2152 } else { 2153 memset(&hash_keys, 0, sizeof(hash_keys)); 2154 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2155 hash_keys.addrs.v6addrs.src = fl6->saddr; 2156 hash_keys.addrs.v6addrs.dst = fl6->daddr; 2157 hash_keys.ports.src = fl6->fl6_sport; 2158 hash_keys.ports.dst = fl6->fl6_dport; 2159 hash_keys.basic.ip_proto = fl6->flowi6_proto; 2160 } 2161 break; 2162 } 2163 mhash = flow_hash_from_keys(&hash_keys); 2164 2165 return mhash >> 1; 2166 } 2167 2168 void ip6_route_input(struct sk_buff *skb) 2169 { 2170 const struct ipv6hdr *iph = ipv6_hdr(skb); 2171 struct net *net = dev_net(skb->dev); 2172 int flags = RT6_LOOKUP_F_HAS_SADDR; 2173 struct ip_tunnel_info *tun_info; 2174 struct flowi6 fl6 = { 2175 .flowi6_iif = skb->dev->ifindex, 2176 .daddr = iph->daddr, 2177 .saddr = iph->saddr, 2178 .flowlabel = ip6_flowinfo(iph), 2179 .flowi6_mark = skb->mark, 2180 .flowi6_proto = iph->nexthdr, 2181 }; 2182 struct flow_keys *flkeys = NULL, _flkeys; 2183 2184 tun_info = skb_tunnel_info(skb); 2185 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX)) 2186 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id; 2187 2188 if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys)) 2189 flkeys = &_flkeys; 2190 2191 if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6)) 2192 fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys); 2193 skb_dst_drop(skb); 2194 skb_dst_set(skb, 2195 ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags)); 2196 } 2197 2198 static struct rt6_info *ip6_pol_route_output(struct net *net, 2199 struct fib6_table *table, 2200 struct flowi6 *fl6, 2201 const struct sk_buff *skb, 2202 int flags) 2203 { 2204 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags); 2205 } 2206 2207 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, 2208 struct flowi6 *fl6, int flags) 2209 { 2210 bool any_src; 2211 2212 if (ipv6_addr_type(&fl6->daddr) & 2213 
(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) { 2214 struct dst_entry *dst; 2215 2216 dst = l3mdev_link_scope_lookup(net, fl6); 2217 if (dst) 2218 return dst; 2219 } 2220 2221 fl6->flowi6_iif = LOOPBACK_IFINDEX; 2222 2223 any_src = ipv6_addr_any(&fl6->saddr); 2224 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) || 2225 (fl6->flowi6_oif && any_src)) 2226 flags |= RT6_LOOKUP_F_IFACE; 2227 2228 if (!any_src) 2229 flags |= RT6_LOOKUP_F_HAS_SADDR; 2230 else if (sk) 2231 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs); 2232 2233 return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output); 2234 } 2235 EXPORT_SYMBOL_GPL(ip6_route_output_flags); 2236 2237 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) 2238 { 2239 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig; 2240 struct net_device *loopback_dev = net->loopback_dev; 2241 struct dst_entry *new = NULL; 2242 2243 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1, 2244 DST_OBSOLETE_DEAD, 0); 2245 if (rt) { 2246 rt6_info_init(rt); 2247 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); 2248 2249 new = &rt->dst; 2250 new->__use = 1; 2251 new->input = dst_discard; 2252 new->output = dst_discard_out; 2253 2254 dst_copy_metrics(new, &ort->dst); 2255 2256 rt->rt6i_idev = in6_dev_get(loopback_dev); 2257 rt->rt6i_gateway = ort->rt6i_gateway; 2258 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU; 2259 2260 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); 2261 #ifdef CONFIG_IPV6_SUBTREES 2262 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); 2263 #endif 2264 } 2265 2266 dst_release(dst_orig); 2267 return new ? new : ERR_PTR(-ENOMEM); 2268 } 2269 2270 /* 2271 * Destination cache support functions 2272 */ 2273 2274 static bool fib6_check(struct fib6_info *f6i, u32 cookie) 2275 { 2276 u32 rt_cookie = 0; 2277 2278 if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie) 2279 return false; 2280 2281 if (fib6_check_expired(f6i)) 2282 return false; 2283 2284 return true; 2285 } 2286 2287 static struct dst_entry *rt6_check(struct rt6_info *rt, 2288 struct fib6_info *from, 2289 u32 cookie) 2290 { 2291 u32 rt_cookie = 0; 2292 2293 if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) || 2294 rt_cookie != cookie) 2295 return NULL; 2296 2297 if (rt6_check_expired(rt)) 2298 return NULL; 2299 2300 return &rt->dst; 2301 } 2302 2303 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, 2304 struct fib6_info *from, 2305 u32 cookie) 2306 { 2307 if (!__rt6_check_expired(rt) && 2308 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && 2309 fib6_check(from, cookie)) 2310 return &rt->dst; 2311 else 2312 return NULL; 2313 } 2314 2315 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) 2316 { 2317 struct dst_entry *dst_ret; 2318 struct fib6_info *from; 2319 struct rt6_info *rt; 2320 2321 rt = container_of(dst, struct rt6_info, dst); 2322 2323 rcu_read_lock(); 2324 2325 /* All IPV6 dsts are created with ->obsolete set to the value 2326 * DST_OBSOLETE_FORCE_CHK which forces validation calls down 2327 * into this function always. 
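* The cookie is, in effect, the fib6 tree sernum sampled when the dst was handed out; any tree change bumps the sernum, so a stale cookie makes the checks below fail and forces the caller to re-look up the route (see ip6_sk_update_pmtu() further down for a caller that passes the socket's cached dst_cookie to dst->ops->check()).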
2328 */ 2329 2330 from = rcu_dereference(rt->from); 2331 2332 if (from && (rt->rt6i_flags & RTF_PCPU || 2333 unlikely(!list_empty(&rt->rt6i_uncached)))) 2334 dst_ret = rt6_dst_from_check(rt, from, cookie); 2335 else 2336 dst_ret = rt6_check(rt, from, cookie); 2337 2338 rcu_read_unlock(); 2339 2340 return dst_ret; 2341 } 2342 2343 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) 2344 { 2345 struct rt6_info *rt = (struct rt6_info *) dst; 2346 2347 if (rt) { 2348 if (rt->rt6i_flags & RTF_CACHE) { 2349 rcu_read_lock(); 2350 if (rt6_check_expired(rt)) { 2351 rt6_remove_exception_rt(rt); 2352 dst = NULL; 2353 } 2354 rcu_read_unlock(); 2355 } else { 2356 dst_release(dst); 2357 dst = NULL; 2358 } 2359 } 2360 return dst; 2361 } 2362 2363 static void ip6_link_failure(struct sk_buff *skb) 2364 { 2365 struct rt6_info *rt; 2366 2367 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); 2368 2369 rt = (struct rt6_info *) skb_dst(skb); 2370 if (rt) { 2371 rcu_read_lock(); 2372 if (rt->rt6i_flags & RTF_CACHE) { 2373 rt6_remove_exception_rt(rt); 2374 } else { 2375 struct fib6_info *from; 2376 struct fib6_node *fn; 2377 2378 from = rcu_dereference(rt->from); 2379 if (from) { 2380 fn = rcu_dereference(from->fib6_node); 2381 if (fn && (rt->rt6i_flags & RTF_DEFAULT)) 2382 fn->fn_sernum = -1; 2383 } 2384 } 2385 rcu_read_unlock(); 2386 } 2387 } 2388 2389 static void rt6_update_expires(struct rt6_info *rt0, int timeout) 2390 { 2391 if (!(rt0->rt6i_flags & RTF_EXPIRES)) { 2392 struct fib6_info *from; 2393 2394 rcu_read_lock(); 2395 from = rcu_dereference(rt0->from); 2396 if (from) 2397 rt0->dst.expires = from->expires; 2398 rcu_read_unlock(); 2399 } 2400 2401 dst_set_expires(&rt0->dst, timeout); 2402 rt0->rt6i_flags |= RTF_EXPIRES; 2403 } 2404 2405 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) 2406 { 2407 struct net *net = dev_net(rt->dst.dev); 2408 2409 dst_metric_set(&rt->dst, RTAX_MTU, mtu); 2410 rt->rt6i_flags |= RTF_MODIFIED; 2411 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires); 2412 } 2413 2414 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) 2415 { 2416 return !(rt->rt6i_flags & RTF_CACHE) && 2417 (rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from)); 2418 } 2419 2420 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, 2421 const struct ipv6hdr *iph, u32 mtu) 2422 { 2423 const struct in6_addr *daddr, *saddr; 2424 struct rt6_info *rt6 = (struct rt6_info *)dst; 2425 2426 if (dst_metric_locked(dst, RTAX_MTU)) 2427 return; 2428 2429 if (iph) { 2430 daddr = &iph->daddr; 2431 saddr = &iph->saddr; 2432 } else if (sk) { 2433 daddr = &sk->sk_v6_daddr; 2434 saddr = &inet6_sk(sk)->saddr; 2435 } else { 2436 daddr = NULL; 2437 saddr = NULL; 2438 } 2439 dst_confirm_neigh(dst, daddr); 2440 mtu = max_t(u32, mtu, IPV6_MIN_MTU); 2441 if (mtu >= dst_mtu(dst)) 2442 return; 2443 2444 if (!rt6_cache_allowed_for_pmtu(rt6)) { 2445 rt6_do_update_pmtu(rt6, mtu); 2446 /* update rt6_ex->stamp for cache */ 2447 if (rt6->rt6i_flags & RTF_CACHE) 2448 rt6_update_exception_stamp_rt(rt6); 2449 } else if (daddr) { 2450 struct fib6_result res = {}; 2451 struct rt6_info *nrt6; 2452 2453 rcu_read_lock(); 2454 res.f6i = rcu_dereference(rt6->from); 2455 if (!res.f6i) { 2456 rcu_read_unlock(); 2457 return; 2458 } 2459 res.nh = res.f6i->fib6_nh; 2460 res.fib6_flags = res.f6i->fib6_flags; 2461 res.fib6_type = res.f6i->fib6_type; 2462 2463 nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr); 2464 if (nrt6) { 2465 rt6_do_update_pmtu(nrt6, mtu); 2466 if 
(rt6_insert_exception(nrt6, &res)) 2467 dst_release_immediate(&nrt6->dst); 2468 } 2469 rcu_read_unlock(); 2470 } 2471 } 2472 2473 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, 2474 struct sk_buff *skb, u32 mtu) 2475 { 2476 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu); 2477 } 2478 2479 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, 2480 int oif, u32 mark, kuid_t uid) 2481 { 2482 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2483 struct dst_entry *dst; 2484 struct flowi6 fl6 = { 2485 .flowi6_oif = oif, 2486 .flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark), 2487 .daddr = iph->daddr, 2488 .saddr = iph->saddr, 2489 .flowlabel = ip6_flowinfo(iph), 2490 .flowi6_uid = uid, 2491 }; 2492 2493 dst = ip6_route_output(net, NULL, &fl6); 2494 if (!dst->error) 2495 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu)); 2496 dst_release(dst); 2497 } 2498 EXPORT_SYMBOL_GPL(ip6_update_pmtu); 2499 2500 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) 2501 { 2502 int oif = sk->sk_bound_dev_if; 2503 struct dst_entry *dst; 2504 2505 if (!oif && skb->dev) 2506 oif = l3mdev_master_ifindex(skb->dev); 2507 2508 ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid); 2509 2510 dst = __sk_dst_get(sk); 2511 if (!dst || !dst->obsolete || 2512 dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) 2513 return; 2514 2515 bh_lock_sock(sk); 2516 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) 2517 ip6_datagram_dst_update(sk, false); 2518 bh_unlock_sock(sk); 2519 } 2520 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu); 2521 2522 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, 2523 const struct flowi6 *fl6) 2524 { 2525 #ifdef CONFIG_IPV6_SUBTREES 2526 struct ipv6_pinfo *np = inet6_sk(sk); 2527 #endif 2528 2529 ip6_dst_store(sk, dst, 2530 ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ? 2531 &sk->sk_v6_daddr : NULL, 2532 #ifdef CONFIG_IPV6_SUBTREES 2533 ipv6_addr_equal(&fl6->saddr, &np->saddr) ? 2534 &np->saddr : 2535 #endif 2536 NULL); 2537 } 2538 2539 static bool ip6_redirect_nh_match(const struct fib6_result *res, 2540 struct flowi6 *fl6, 2541 const struct in6_addr *gw, 2542 struct rt6_info **ret) 2543 { 2544 const struct fib6_nh *nh = res->nh; 2545 2546 if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family || 2547 fl6->flowi6_oif != nh->fib_nh_dev->ifindex) 2548 return false; 2549 2550 /* rt_cache's gateway might be different from its 'parent' 2551 * in the case of an ip redirect. 2552 * So we keep searching in the exception table if the gateway 2553 * is different. 
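* e.g. after a redirect the exception's rt6i_gateway holds the new first hop while the parent fib6_nh still carries the old one, so only the cached entry can match the gateway being checked here.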
2554 */ 2555 if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) { 2556 struct rt6_info *rt_cache; 2557 2558 rt_cache = rt6_find_cached_rt(res, &fl6->daddr, &fl6->saddr); 2559 if (rt_cache && 2560 ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) { 2561 *ret = rt_cache; 2562 return true; 2563 } 2564 return false; 2565 } 2566 return true; 2567 } 2568 2569 /* Handle redirects */ 2570 struct ip6rd_flowi { 2571 struct flowi6 fl6; 2572 struct in6_addr gateway; 2573 }; 2574 2575 static struct rt6_info *__ip6_route_redirect(struct net *net, 2576 struct fib6_table *table, 2577 struct flowi6 *fl6, 2578 const struct sk_buff *skb, 2579 int flags) 2580 { 2581 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; 2582 struct rt6_info *ret = NULL; 2583 struct fib6_result res = {}; 2584 struct fib6_info *rt; 2585 struct fib6_node *fn; 2586 2587 /* l3mdev_update_flow overrides oif if the device is enslaved; in 2588 * this case we must match on the real ingress device, so reset it 2589 */ 2590 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) 2591 fl6->flowi6_oif = skb->dev->ifindex; 2592 2593 /* Get the "current" route for this destination and 2594 * check if the redirect has come from appropriate router. 2595 * 2596 * RFC 4861 specifies that redirects should only be 2597 * accepted if they come from the nexthop to the target. 2598 * Due to the way the routes are chosen, this notion 2599 * is a bit fuzzy and one might need to check all possible 2600 * routes. 2601 */ 2602 2603 rcu_read_lock(); 2604 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 2605 restart: 2606 for_each_fib6_node_rt_rcu(fn) { 2607 res.f6i = rt; 2608 res.nh = rt->fib6_nh; 2609 2610 if (fib6_check_expired(rt)) 2611 continue; 2612 if (rt->fib6_flags & RTF_REJECT) 2613 break; 2614 if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway, &ret)) 2615 goto out; 2616 } 2617 2618 if (!rt) 2619 rt = net->ipv6.fib6_null_entry; 2620 else if (rt->fib6_flags & RTF_REJECT) { 2621 ret = net->ipv6.ip6_null_entry; 2622 goto out; 2623 } 2624 2625 if (rt == net->ipv6.fib6_null_entry) { 2626 fn = fib6_backtrack(fn, &fl6->saddr); 2627 if (fn) 2628 goto restart; 2629 } 2630 2631 res.f6i = rt; 2632 res.nh = rt->fib6_nh; 2633 out: 2634 if (ret) { 2635 ip6_hold_safe(net, &ret); 2636 } else { 2637 res.fib6_flags = res.f6i->fib6_flags; 2638 res.fib6_type = res.f6i->fib6_type; 2639 ret = ip6_create_rt_rcu(&res); 2640 } 2641 2642 rcu_read_unlock(); 2643 2644 trace_fib6_table_lookup(net, &res, table, fl6); 2645 return ret; 2646 }; 2647 2648 static struct dst_entry *ip6_route_redirect(struct net *net, 2649 const struct flowi6 *fl6, 2650 const struct sk_buff *skb, 2651 const struct in6_addr *gateway) 2652 { 2653 int flags = RT6_LOOKUP_F_HAS_SADDR; 2654 struct ip6rd_flowi rdfl; 2655 2656 rdfl.fl6 = *fl6; 2657 rdfl.gateway = *gateway; 2658 2659 return fib6_rule_lookup(net, &rdfl.fl6, skb, 2660 flags, __ip6_route_redirect); 2661 } 2662 2663 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, 2664 kuid_t uid) 2665 { 2666 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2667 struct dst_entry *dst; 2668 struct flowi6 fl6 = { 2669 .flowi6_iif = LOOPBACK_IFINDEX, 2670 .flowi6_oif = oif, 2671 .flowi6_mark = mark, 2672 .daddr = iph->daddr, 2673 .saddr = iph->saddr, 2674 .flowlabel = ip6_flowinfo(iph), 2675 .flowi6_uid = uid, 2676 }; 2677 2678 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr); 2679 rt6_do_redirect(dst, NULL, skb); 2680 dst_release(dst); 2681 } 2682 EXPORT_SYMBOL_GPL(ip6_redirect); 2683 2684 void 
ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif) 2685 { 2686 const struct ipv6hdr *iph = ipv6_hdr(skb); 2687 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb); 2688 struct dst_entry *dst; 2689 struct flowi6 fl6 = { 2690 .flowi6_iif = LOOPBACK_IFINDEX, 2691 .flowi6_oif = oif, 2692 .daddr = msg->dest, 2693 .saddr = iph->daddr, 2694 .flowi6_uid = sock_net_uid(net, NULL), 2695 }; 2696 2697 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr); 2698 rt6_do_redirect(dst, NULL, skb); 2699 dst_release(dst); 2700 } 2701 2702 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) 2703 { 2704 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark, 2705 sk->sk_uid); 2706 } 2707 EXPORT_SYMBOL_GPL(ip6_sk_redirect); 2708 2709 static unsigned int ip6_default_advmss(const struct dst_entry *dst) 2710 { 2711 struct net_device *dev = dst->dev; 2712 unsigned int mtu = dst_mtu(dst); 2713 struct net *net = dev_net(dev); 2714 2715 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); 2716 2717 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) 2718 mtu = net->ipv6.sysctl.ip6_rt_min_advmss; 2719 2720 /* 2721 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 2722 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 2723 * IPV6_MAXPLEN is also valid and means: "any MSS, 2724 * rely only on pmtu discovery" 2725 */ 2726 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr)) 2727 mtu = IPV6_MAXPLEN; 2728 return mtu; 2729 } 2730 2731 static unsigned int ip6_mtu(const struct dst_entry *dst) 2732 { 2733 struct inet6_dev *idev; 2734 unsigned int mtu; 2735 2736 mtu = dst_metric_raw(dst, RTAX_MTU); 2737 if (mtu) 2738 goto out; 2739 2740 mtu = IPV6_MIN_MTU; 2741 2742 rcu_read_lock(); 2743 idev = __in6_dev_get(dst->dev); 2744 if (idev) 2745 mtu = idev->cnf.mtu6; 2746 rcu_read_unlock(); 2747 2748 out: 2749 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 2750 2751 return mtu - lwtunnel_headroom(dst->lwtstate, mtu); 2752 } 2753 2754 /* MTU selection: 2755 * 1. mtu on route is locked - use it 2756 * 2. mtu from nexthop exception 2757 * 3. 
mtu from egress device 2758 * 2759 * based on ip6_dst_mtu_forward and exception logic of 2760 * rt6_find_cached_rt; called with rcu_read_lock 2761 */ 2762 u32 ip6_mtu_from_fib6(const struct fib6_result *res, 2763 const struct in6_addr *daddr, 2764 const struct in6_addr *saddr) 2765 { 2766 const struct fib6_nh *nh = res->nh; 2767 struct fib6_info *f6i = res->f6i; 2768 struct inet6_dev *idev; 2769 struct rt6_info *rt; 2770 u32 mtu = 0; 2771 2772 if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) { 2773 mtu = f6i->fib6_pmtu; 2774 if (mtu) 2775 goto out; 2776 } 2777 2778 rt = rt6_find_cached_rt(res, daddr, saddr); 2779 if (unlikely(rt)) { 2780 mtu = dst_metric_raw(&rt->dst, RTAX_MTU); 2781 } else { 2782 struct net_device *dev = nh->fib_nh_dev; 2783 2784 mtu = IPV6_MIN_MTU; 2785 idev = __in6_dev_get(dev); 2786 if (idev && idev->cnf.mtu6 > mtu) 2787 mtu = idev->cnf.mtu6; 2788 } 2789 2790 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 2791 out: 2792 return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu); 2793 } 2794 2795 struct dst_entry *icmp6_dst_alloc(struct net_device *dev, 2796 struct flowi6 *fl6) 2797 { 2798 struct dst_entry *dst; 2799 struct rt6_info *rt; 2800 struct inet6_dev *idev = in6_dev_get(dev); 2801 struct net *net = dev_net(dev); 2802 2803 if (unlikely(!idev)) 2804 return ERR_PTR(-ENODEV); 2805 2806 rt = ip6_dst_alloc(net, dev, 0); 2807 if (unlikely(!rt)) { 2808 in6_dev_put(idev); 2809 dst = ERR_PTR(-ENOMEM); 2810 goto out; 2811 } 2812 2813 rt->dst.flags |= DST_HOST; 2814 rt->dst.input = ip6_input; 2815 rt->dst.output = ip6_output; 2816 rt->rt6i_gateway = fl6->daddr; 2817 rt->rt6i_dst.addr = fl6->daddr; 2818 rt->rt6i_dst.plen = 128; 2819 rt->rt6i_idev = idev; 2820 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0); 2821 2822 /* Add this dst into uncached_list so that rt6_disable_ip() can 2823 * do proper release of the net_device 2824 */ 2825 rt6_uncached_list_add(rt); 2826 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); 2827 2828 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0); 2829 2830 out: 2831 return dst; 2832 } 2833 2834 static int ip6_dst_gc(struct dst_ops *ops) 2835 { 2836 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); 2837 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; 2838 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size; 2839 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; 2840 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; 2841 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; 2842 int entries; 2843 2844 entries = dst_entries_get_fast(ops); 2845 if (time_after(rt_last_gc + rt_min_interval, jiffies) && 2846 entries <= rt_max_size) 2847 goto out; 2848 2849 net->ipv6.ip6_rt_gc_expire++; 2850 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true); 2851 entries = dst_entries_get_slow(ops); 2852 if (entries < ops->gc_thresh) 2853 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; 2854 out: 2855 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity; 2856 return entries > rt_max_size; 2857 } 2858 2859 static struct rt6_info *ip6_nh_lookup_table(struct net *net, 2860 struct fib6_config *cfg, 2861 const struct in6_addr *gw_addr, 2862 u32 tbid, int flags) 2863 { 2864 struct flowi6 fl6 = { 2865 .flowi6_oif = cfg->fc_ifindex, 2866 .daddr = *gw_addr, 2867 .saddr = cfg->fc_prefsrc, 2868 }; 2869 struct fib6_table *table; 2870 struct rt6_info *rt; 2871 2872 table = fib6_get_table(net, tbid); 2873 if (!table) 2874 return NULL; 2875 2876 if (!ipv6_addr_any(&cfg->fc_prefsrc)) 2877 flags |= RT6_LOOKUP_F_HAS_SADDR; 
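/* Link state is ignored below so that a gateway behind a temporarily carrier-down interface can still be resolved while the nexthop is being validated. */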
2878 2879 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE; 2880 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags); 2881 2882 /* if table lookup failed, fall back to full lookup */ 2883 if (rt == net->ipv6.ip6_null_entry) { 2884 ip6_rt_put(rt); 2885 rt = NULL; 2886 } 2887 2888 return rt; 2889 } 2890 2891 static int ip6_route_check_nh_onlink(struct net *net, 2892 struct fib6_config *cfg, 2893 const struct net_device *dev, 2894 struct netlink_ext_ack *extack) 2895 { 2896 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN; 2897 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2898 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT; 2899 struct fib6_info *from; 2900 struct rt6_info *grt; 2901 int err; 2902 2903 err = 0; 2904 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0); 2905 if (grt) { 2906 rcu_read_lock(); 2907 from = rcu_dereference(grt->from); 2908 if (!grt->dst.error && 2909 /* ignore match if it is the default route */ 2910 from && !ipv6_addr_any(&from->fib6_dst.addr) && 2911 (grt->rt6i_flags & flags || dev != grt->dst.dev)) { 2912 NL_SET_ERR_MSG(extack, 2913 "Nexthop has invalid gateway or device mismatch"); 2914 err = -EINVAL; 2915 } 2916 rcu_read_unlock(); 2917 2918 ip6_rt_put(grt); 2919 } 2920 2921 return err; 2922 } 2923 2924 static int ip6_route_check_nh(struct net *net, 2925 struct fib6_config *cfg, 2926 struct net_device **_dev, 2927 struct inet6_dev **idev) 2928 { 2929 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2930 struct net_device *dev = _dev ? *_dev : NULL; 2931 struct rt6_info *grt = NULL; 2932 int err = -EHOSTUNREACH; 2933 2934 if (cfg->fc_table) { 2935 int flags = RT6_LOOKUP_F_IFACE; 2936 2937 grt = ip6_nh_lookup_table(net, cfg, gw_addr, 2938 cfg->fc_table, flags); 2939 if (grt) { 2940 if (grt->rt6i_flags & RTF_GATEWAY || 2941 (dev && dev != grt->dst.dev)) { 2942 ip6_rt_put(grt); 2943 grt = NULL; 2944 } 2945 } 2946 } 2947 2948 if (!grt) 2949 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1); 2950 2951 if (!grt) 2952 goto out; 2953 2954 if (dev) { 2955 if (dev != grt->dst.dev) { 2956 ip6_rt_put(grt); 2957 goto out; 2958 } 2959 } else { 2960 *_dev = dev = grt->dst.dev; 2961 *idev = grt->rt6i_idev; 2962 dev_hold(dev); 2963 in6_dev_hold(grt->rt6i_idev); 2964 } 2965 2966 if (!(grt->rt6i_flags & RTF_GATEWAY)) 2967 err = 0; 2968 2969 ip6_rt_put(grt); 2970 2971 out: 2972 return err; 2973 } 2974 2975 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg, 2976 struct net_device **_dev, struct inet6_dev **idev, 2977 struct netlink_ext_ack *extack) 2978 { 2979 const struct in6_addr *gw_addr = &cfg->fc_gateway; 2980 int gwa_type = ipv6_addr_type(gw_addr); 2981 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true; 2982 const struct net_device *dev = *_dev; 2983 bool need_addr_check = !dev; 2984 int err = -EINVAL; 2985 2986 /* if gw_addr is local we will fail to detect this in case the 2987 * address is still TENTATIVE (DAD in progress). rt6_lookup() 2988 * will return the already-added prefix route via the interface that 2989 * the prefix route was assigned to, which might be non-loopback. 2990 */ 2991 if (dev && 2992 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 2993 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 2994 goto out; 2995 } 2996 2997 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) { 2998 /* IPv6 strictly inhibits using non-link-local 2999 * addresses as the nexthop address. 3000 * Otherwise, the router will not be able to send redirects. 3001 * It is very good, but in some (rare!) 
circumstances 3002 * (SIT, PtP, NBMA NOARP links) it is handy to allow 3003 * some exceptions. --ANK 3004 * We allow IPv4-mapped nexthops to support RFC4798-type 3005 * addressing 3006 */ 3007 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) { 3008 NL_SET_ERR_MSG(extack, "Invalid gateway address"); 3009 goto out; 3010 } 3011 3012 if (cfg->fc_flags & RTNH_F_ONLINK) 3013 err = ip6_route_check_nh_onlink(net, cfg, dev, extack); 3014 else 3015 err = ip6_route_check_nh(net, cfg, _dev, idev); 3016 3017 if (err) 3018 goto out; 3019 } 3020 3021 /* reload in case device was changed */ 3022 dev = *_dev; 3023 3024 err = -EINVAL; 3025 if (!dev) { 3026 NL_SET_ERR_MSG(extack, "Egress device not specified"); 3027 goto out; 3028 } else if (dev->flags & IFF_LOOPBACK) { 3029 NL_SET_ERR_MSG(extack, 3030 "Egress device can not be loopback device for this route"); 3031 goto out; 3032 } 3033 3034 /* if we did not check gw_addr above, do so now that the 3035 * egress device has been resolved. 3036 */ 3037 if (need_addr_check && 3038 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 3039 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 3040 goto out; 3041 } 3042 3043 err = 0; 3044 out: 3045 return err; 3046 } 3047 3048 static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type) 3049 { 3050 if ((flags & RTF_REJECT) || 3051 (dev && (dev->flags & IFF_LOOPBACK) && 3052 !(addr_type & IPV6_ADDR_LOOPBACK) && 3053 !(flags & RTF_LOCAL))) 3054 return true; 3055 3056 return false; 3057 } 3058 3059 int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, 3060 struct fib6_config *cfg, gfp_t gfp_flags, 3061 struct netlink_ext_ack *extack) 3062 { 3063 struct net_device *dev = NULL; 3064 struct inet6_dev *idev = NULL; 3065 int addr_type; 3066 int err; 3067 3068 fib6_nh->fib_nh_family = AF_INET6; 3069 3070 err = -ENODEV; 3071 if (cfg->fc_ifindex) { 3072 dev = dev_get_by_index(net, cfg->fc_ifindex); 3073 if (!dev) 3074 goto out; 3075 idev = in6_dev_get(dev); 3076 if (!idev) 3077 goto out; 3078 } 3079 3080 if (cfg->fc_flags & RTNH_F_ONLINK) { 3081 if (!dev) { 3082 NL_SET_ERR_MSG(extack, 3083 "Nexthop device required for onlink"); 3084 goto out; 3085 } 3086 3087 if (!(dev->flags & IFF_UP)) { 3088 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 3089 err = -ENETDOWN; 3090 goto out; 3091 } 3092 3093 fib6_nh->fib_nh_flags |= RTNH_F_ONLINK; 3094 } 3095 3096 fib6_nh->fib_nh_weight = 1; 3097 3098 /* We cannot add true routes via loopback here, 3099 * they would result in kernel looping; promote them to reject routes 3100 */ 3101 addr_type = ipv6_addr_type(&cfg->fc_dst); 3102 if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) { 3103 /* hold loopback dev/idev if we haven't done so. 
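* (a reject route must be bound to the loopback device, so the references taken on the user-supplied device and idev above are dropped and retaken on loopback below)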
*/ 3104 if (dev != net->loopback_dev) { 3105 if (dev) { 3106 dev_put(dev); 3107 in6_dev_put(idev); 3108 } 3109 dev = net->loopback_dev; 3110 dev_hold(dev); 3111 idev = in6_dev_get(dev); 3112 if (!idev) { 3113 err = -ENODEV; 3114 goto out; 3115 } 3116 } 3117 goto set_dev; 3118 } 3119 3120 if (cfg->fc_flags & RTF_GATEWAY) { 3121 err = ip6_validate_gw(net, cfg, &dev, &idev, extack); 3122 if (err) 3123 goto out; 3124 3125 fib6_nh->fib_nh_gw6 = cfg->fc_gateway; 3126 fib6_nh->fib_nh_gw_family = AF_INET6; 3127 } 3128 3129 err = -ENODEV; 3130 if (!dev) 3131 goto out; 3132 3133 if (idev->cnf.disable_ipv6) { 3134 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device"); 3135 err = -EACCES; 3136 goto out; 3137 } 3138 3139 if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) { 3140 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 3141 err = -ENETDOWN; 3142 goto out; 3143 } 3144 3145 if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) && 3146 !netif_carrier_ok(dev)) 3147 fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN; 3148 3149 fib6_nh->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags); 3150 if (!fib6_nh->rt6i_pcpu) { 3151 err = -ENOMEM; 3152 goto out; 3153 } 3154 3155 err = fib_nh_common_init(&fib6_nh->nh_common, cfg->fc_encap, 3156 cfg->fc_encap_type, cfg, gfp_flags, extack); 3157 if (err) 3158 goto out; 3159 set_dev: 3160 fib6_nh->fib_nh_dev = dev; 3161 fib6_nh->fib_nh_oif = dev->ifindex; 3162 err = 0; 3163 out: 3164 if (idev) 3165 in6_dev_put(idev); 3166 3167 if (err) { 3168 lwtstate_put(fib6_nh->fib_nh_lws); 3169 fib6_nh->fib_nh_lws = NULL; 3170 if (dev) 3171 dev_put(dev); 3172 } 3173 3174 return err; 3175 } 3176 3177 void fib6_nh_release(struct fib6_nh *fib6_nh) 3178 { 3179 struct rt6_exception_bucket *bucket; 3180 3181 rcu_read_lock(); 3182 3183 fib6_nh_flush_exceptions(fib6_nh, NULL); 3184 bucket = fib6_nh_get_excptn_bucket(fib6_nh, NULL); 3185 if (bucket) { 3186 rcu_assign_pointer(fib6_nh->rt6i_exception_bucket, NULL); 3187 kfree(bucket); 3188 } 3189 3190 rcu_read_unlock(); 3191 3192 if (fib6_nh->rt6i_pcpu) { 3193 int cpu; 3194 3195 for_each_possible_cpu(cpu) { 3196 struct rt6_info **ppcpu_rt; 3197 struct rt6_info *pcpu_rt; 3198 3199 ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu); 3200 pcpu_rt = *ppcpu_rt; 3201 if (pcpu_rt) { 3202 dst_dev_put(&pcpu_rt->dst); 3203 dst_release(&pcpu_rt->dst); 3204 *ppcpu_rt = NULL; 3205 } 3206 } 3207 3208 free_percpu(fib6_nh->rt6i_pcpu); 3209 } 3210 3211 fib_nh_common_release(&fib6_nh->nh_common); 3212 } 3213 3214 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, 3215 gfp_t gfp_flags, 3216 struct netlink_ext_ack *extack) 3217 { 3218 struct net *net = cfg->fc_nlinfo.nl_net; 3219 struct fib6_info *rt = NULL; 3220 struct fib6_table *table; 3221 int err = -EINVAL; 3222 int addr_type; 3223 3224 /* RTF_PCPU is an internal flag; can not be set by userspace */ 3225 if (cfg->fc_flags & RTF_PCPU) { 3226 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU"); 3227 goto out; 3228 } 3229 3230 /* RTF_CACHE is an internal flag; can not be set by userspace */ 3231 if (cfg->fc_flags & RTF_CACHE) { 3232 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE"); 3233 goto out; 3234 } 3235 3236 if (cfg->fc_type > RTN_MAX) { 3237 NL_SET_ERR_MSG(extack, "Invalid route type"); 3238 goto out; 3239 } 3240 3241 if (cfg->fc_dst_len > 128) { 3242 NL_SET_ERR_MSG(extack, "Invalid prefix length"); 3243 goto out; 3244 } 3245 if (cfg->fc_src_len > 128) { 3246 NL_SET_ERR_MSG(extack, "Invalid source address length"); 3247 goto out; 3248 } 3249 #ifndef 
CONFIG_IPV6_SUBTREES 3250 if (cfg->fc_src_len) { 3251 NL_SET_ERR_MSG(extack, 3252 "Specifying source address requires IPV6_SUBTREES to be enabled"); 3253 goto out; 3254 } 3255 #endif 3256 3257 err = -ENOBUFS; 3258 if (cfg->fc_nlinfo.nlh && 3259 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) { 3260 table = fib6_get_table(net, cfg->fc_table); 3261 if (!table) { 3262 pr_warn("NLM_F_CREATE should be specified when creating new route\n"); 3263 table = fib6_new_table(net, cfg->fc_table); 3264 } 3265 } else { 3266 table = fib6_new_table(net, cfg->fc_table); 3267 } 3268 3269 if (!table) 3270 goto out; 3271 3272 err = -ENOMEM; 3273 rt = fib6_info_alloc(gfp_flags, true); 3274 if (!rt) 3275 goto out; 3276 3277 rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len, 3278 extack); 3279 if (IS_ERR(rt->fib6_metrics)) { 3280 err = PTR_ERR(rt->fib6_metrics); 3281 /* Do not leave garbage there. */ 3282 rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics; 3283 goto out; 3284 } 3285 3286 if (cfg->fc_flags & RTF_ADDRCONF) 3287 rt->dst_nocount = true; 3288 3289 if (cfg->fc_flags & RTF_EXPIRES) 3290 fib6_set_expires(rt, jiffies + 3291 clock_t_to_jiffies(cfg->fc_expires)); 3292 else 3293 fib6_clean_expires(rt); 3294 3295 if (cfg->fc_protocol == RTPROT_UNSPEC) 3296 cfg->fc_protocol = RTPROT_BOOT; 3297 rt->fib6_protocol = cfg->fc_protocol; 3298 3299 rt->fib6_table = table; 3300 rt->fib6_metric = cfg->fc_metric; 3301 rt->fib6_type = cfg->fc_type; 3302 rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY; 3303 3304 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); 3305 rt->fib6_dst.plen = cfg->fc_dst_len; 3306 if (rt->fib6_dst.plen == 128) 3307 rt->dst_host = true; 3308 3309 #ifdef CONFIG_IPV6_SUBTREES 3310 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len); 3311 rt->fib6_src.plen = cfg->fc_src_len; 3312 #endif 3313 err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack); 3314 if (err) 3315 goto out; 3316 3317 /* We cannot add true routes via loopback here, 3318 * they would result in kernel looping; promote them to reject routes 3319 */ 3320 addr_type = ipv6_addr_type(&cfg->fc_dst); 3321 if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh->fib_nh_dev, addr_type)) 3322 rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP; 3323 3324 if (!ipv6_addr_any(&cfg->fc_prefsrc)) { 3325 struct net_device *dev = fib6_info_nh_dev(rt); 3326 3327 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { 3328 NL_SET_ERR_MSG(extack, "Invalid source address"); 3329 err = -EINVAL; 3330 goto out; 3331 } 3332 rt->fib6_prefsrc.addr = cfg->fc_prefsrc; 3333 rt->fib6_prefsrc.plen = 128; 3334 } else 3335 rt->fib6_prefsrc.plen = 0; 3336 3337 return rt; 3338 out: 3339 fib6_info_release(rt); 3340 return ERR_PTR(err); 3341 } 3342 3343 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, 3344 struct netlink_ext_ack *extack) 3345 { 3346 struct fib6_info *rt; 3347 int err; 3348 3349 rt = ip6_route_info_create(cfg, gfp_flags, extack); 3350 if (IS_ERR(rt)) 3351 return PTR_ERR(rt); 3352 3353 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack); 3354 fib6_info_release(rt); 3355 3356 return err; 3357 } 3358 3359 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info) 3360 { 3361 struct net *net = info->nl_net; 3362 struct fib6_table *table; 3363 int err; 3364 3365 if (rt == net->ipv6.fib6_null_entry) { 3366 err = -ENOENT; 3367 goto out; 3368 } 3369 3370 table = rt->fib6_table; 3371 spin_lock_bh(&table->tb6_lock); 3372 err = fib6_del(rt, info); 3373 spin_unlock_bh(&table->tb6_lock); 3374 3375 out: 
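/* the reference the caller held on rt is dropped here whether or not fib6_del() succeeded */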
3376 fib6_info_release(rt); 3377 return err; 3378 } 3379 3380 int ip6_del_rt(struct net *net, struct fib6_info *rt) 3381 { 3382 struct nl_info info = { .nl_net = net }; 3383 3384 return __ip6_del_rt(rt, &info); 3385 } 3386 3387 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg) 3388 { 3389 struct nl_info *info = &cfg->fc_nlinfo; 3390 struct net *net = info->nl_net; 3391 struct sk_buff *skb = NULL; 3392 struct fib6_table *table; 3393 int err = -ENOENT; 3394 3395 if (rt == net->ipv6.fib6_null_entry) 3396 goto out_put; 3397 table = rt->fib6_table; 3398 spin_lock_bh(&table->tb6_lock); 3399 3400 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) { 3401 struct fib6_info *sibling, *next_sibling; 3402 3403 /* prefer to send a single notification with all hops */ 3404 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 3405 if (skb) { 3406 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; 3407 3408 if (rt6_fill_node(net, skb, rt, NULL, 3409 NULL, NULL, 0, RTM_DELROUTE, 3410 info->portid, seq, 0) < 0) { 3411 kfree_skb(skb); 3412 skb = NULL; 3413 } else 3414 info->skip_notify = 1; 3415 } 3416 3417 list_for_each_entry_safe(sibling, next_sibling, 3418 &rt->fib6_siblings, 3419 fib6_siblings) { 3420 err = fib6_del(sibling, info); 3421 if (err) 3422 goto out_unlock; 3423 } 3424 } 3425 3426 err = fib6_del(rt, info); 3427 out_unlock: 3428 spin_unlock_bh(&table->tb6_lock); 3429 out_put: 3430 fib6_info_release(rt); 3431 3432 if (skb) { 3433 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 3434 info->nlh, gfp_any()); 3435 } 3436 return err; 3437 } 3438 3439 static int __ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) 3440 { 3441 int rc = -ESRCH; 3442 3443 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex) 3444 goto out; 3445 3446 if (cfg->fc_flags & RTF_GATEWAY && 3447 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) 3448 goto out; 3449 3450 rc = rt6_remove_exception_rt(rt); 3451 out: 3452 return rc; 3453 } 3454 3455 static int ip6_del_cached_rt(struct fib6_config *cfg, struct fib6_info *rt, 3456 struct fib6_nh *nh) 3457 { 3458 struct fib6_result res = { 3459 .f6i = rt, 3460 .nh = nh, 3461 }; 3462 struct rt6_info *rt_cache; 3463 3464 rt_cache = rt6_find_cached_rt(&res, &cfg->fc_dst, &cfg->fc_src); 3465 if (rt_cache) 3466 return __ip6_del_cached_rt(rt_cache, cfg); 3467 3468 return 0; 3469 } 3470 3471 static int ip6_route_del(struct fib6_config *cfg, 3472 struct netlink_ext_ack *extack) 3473 { 3474 struct fib6_table *table; 3475 struct fib6_info *rt; 3476 struct fib6_node *fn; 3477 int err = -ESRCH; 3478 3479 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); 3480 if (!table) { 3481 NL_SET_ERR_MSG(extack, "FIB table does not exist"); 3482 return err; 3483 } 3484 3485 rcu_read_lock(); 3486 3487 fn = fib6_locate(&table->tb6_root, 3488 &cfg->fc_dst, cfg->fc_dst_len, 3489 &cfg->fc_src, cfg->fc_src_len, 3490 !(cfg->fc_flags & RTF_CACHE)); 3491 3492 if (fn) { 3493 for_each_fib6_node_rt_rcu(fn) { 3494 struct fib6_nh *nh; 3495 3496 nh = rt->fib6_nh; 3497 if (cfg->fc_flags & RTF_CACHE) { 3498 int rc; 3499 3500 rc = ip6_del_cached_rt(cfg, rt, nh); 3501 if (rc != -ESRCH) { 3502 rcu_read_unlock(); 3503 return rc; 3504 } 3505 continue; 3506 } 3507 3508 if (cfg->fc_ifindex && 3509 (!nh->fib_nh_dev || 3510 nh->fib_nh_dev->ifindex != cfg->fc_ifindex)) 3511 continue; 3512 if (cfg->fc_flags & RTF_GATEWAY && 3513 !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6)) 3514 continue; 3515 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric) 3516 continue; 
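/* the protocol check below completes the attribute matching: an entry is deleted only if every attribute given in the request (ifindex, gateway, metric, protocol) matches it; fib6_info_hold_safe() then fails only if the entry's refcount already dropped to zero, i.e. it is on its way to being freed */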
3517 if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol) 3518 continue; 3519 if (!fib6_info_hold_safe(rt)) 3520 continue; 3521 rcu_read_unlock(); 3522 3523 /* if gateway was specified only delete the one hop */ 3524 if (cfg->fc_flags & RTF_GATEWAY) 3525 return __ip6_del_rt(rt, &cfg->fc_nlinfo); 3526 3527 return __ip6_del_rt_siblings(rt, cfg); 3528 } 3529 } 3530 rcu_read_unlock(); 3531 3532 return err; 3533 } 3534 3535 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) 3536 { 3537 struct netevent_redirect netevent; 3538 struct rt6_info *rt, *nrt = NULL; 3539 struct fib6_result res = {}; 3540 struct ndisc_options ndopts; 3541 struct inet6_dev *in6_dev; 3542 struct neighbour *neigh; 3543 struct rd_msg *msg; 3544 int optlen, on_link; 3545 u8 *lladdr; 3546 3547 optlen = skb_tail_pointer(skb) - skb_transport_header(skb); 3548 optlen -= sizeof(*msg); 3549 3550 if (optlen < 0) { 3551 net_dbg_ratelimited("rt6_do_redirect: packet too short\n"); 3552 return; 3553 } 3554 3555 msg = (struct rd_msg *)icmp6_hdr(skb); 3556 3557 if (ipv6_addr_is_multicast(&msg->dest)) { 3558 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n"); 3559 return; 3560 } 3561 3562 on_link = 0; 3563 if (ipv6_addr_equal(&msg->dest, &msg->target)) { 3564 on_link = 1; 3565 } else if (ipv6_addr_type(&msg->target) != 3566 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { 3567 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n"); 3568 return; 3569 } 3570 3571 in6_dev = __in6_dev_get(skb->dev); 3572 if (!in6_dev) 3573 return; 3574 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects) 3575 return; 3576 3577 /* RFC2461 8.1: 3578 * The IP source address of the Redirect MUST be the same as the current 3579 * first-hop router for the specified ICMP Destination Address. 3580 */ 3581 3582 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) { 3583 net_dbg_ratelimited("rt6_redirect: invalid ND options\n"); 3584 return; 3585 } 3586 3587 lladdr = NULL; 3588 if (ndopts.nd_opts_tgt_lladdr) { 3589 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, 3590 skb->dev); 3591 if (!lladdr) { 3592 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n"); 3593 return; 3594 } 3595 } 3596 3597 rt = (struct rt6_info *) dst; 3598 if (rt->rt6i_flags & RTF_REJECT) { 3599 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n"); 3600 return; 3601 } 3602 3603 /* Redirect received -> path was valid. 3604 * Look, redirects are sent only in response to data packets, 3605 * so that this nexthop apparently is reachable. --ANK 3606 */ 3607 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr); 3608 3609 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1); 3610 if (!neigh) 3611 return; 3612 3613 /* 3614 * We have finally decided to accept it. 3615 */ 3616 3617 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, 3618 NEIGH_UPDATE_F_WEAK_OVERRIDE| 3619 NEIGH_UPDATE_F_OVERRIDE| 3620 (on_link ? 
0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER| 3621 NEIGH_UPDATE_F_ISROUTER)), 3622 NDISC_REDIRECT, &ndopts); 3623 3624 rcu_read_lock(); 3625 res.f6i = rcu_dereference(rt->from); 3626 if (!res.f6i) 3627 goto out; 3628 3629 res.nh = res.f6i->fib6_nh; 3630 res.fib6_flags = res.f6i->fib6_flags; 3631 res.fib6_type = res.f6i->fib6_type; 3632 nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL); 3633 if (!nrt) 3634 goto out; 3635 3636 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE; 3637 if (on_link) 3638 nrt->rt6i_flags &= ~RTF_GATEWAY; 3639 3640 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; 3641 3642 /* rt6_insert_exception() will take care of duplicated exceptions */ 3643 if (rt6_insert_exception(nrt, &res)) { 3644 dst_release_immediate(&nrt->dst); 3645 goto out; 3646 } 3647 3648 netevent.old = &rt->dst; 3649 netevent.new = &nrt->dst; 3650 netevent.daddr = &msg->dest; 3651 netevent.neigh = neigh; 3652 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); 3653 3654 out: 3655 rcu_read_unlock(); 3656 neigh_release(neigh); 3657 } 3658 3659 #ifdef CONFIG_IPV6_ROUTE_INFO 3660 static struct fib6_info *rt6_get_route_info(struct net *net, 3661 const struct in6_addr *prefix, int prefixlen, 3662 const struct in6_addr *gwaddr, 3663 struct net_device *dev) 3664 { 3665 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; 3666 int ifindex = dev->ifindex; 3667 struct fib6_node *fn; 3668 struct fib6_info *rt = NULL; 3669 struct fib6_table *table; 3670 3671 table = fib6_get_table(net, tb_id); 3672 if (!table) 3673 return NULL; 3674 3675 rcu_read_lock(); 3676 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true); 3677 if (!fn) 3678 goto out; 3679 3680 for_each_fib6_node_rt_rcu(fn) { 3681 if (rt->fib6_nh->fib_nh_dev->ifindex != ifindex) 3682 continue; 3683 if (!(rt->fib6_flags & RTF_ROUTEINFO) || 3684 !rt->fib6_nh->fib_nh_gw_family) 3685 continue; 3686 if (!ipv6_addr_equal(&rt->fib6_nh->fib_nh_gw6, gwaddr)) 3687 continue; 3688 if (!fib6_info_hold_safe(rt)) 3689 continue; 3690 break; 3691 } 3692 out: 3693 rcu_read_unlock(); 3694 return rt; 3695 } 3696 3697 static struct fib6_info *rt6_add_route_info(struct net *net, 3698 const struct in6_addr *prefix, int prefixlen, 3699 const struct in6_addr *gwaddr, 3700 struct net_device *dev, 3701 unsigned int pref) 3702 { 3703 struct fib6_config cfg = { 3704 .fc_metric = IP6_RT_PRIO_USER, 3705 .fc_ifindex = dev->ifindex, 3706 .fc_dst_len = prefixlen, 3707 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | 3708 RTF_UP | RTF_PREF(pref), 3709 .fc_protocol = RTPROT_RA, 3710 .fc_type = RTN_UNICAST, 3711 .fc_nlinfo.portid = 0, 3712 .fc_nlinfo.nlh = NULL, 3713 .fc_nlinfo.nl_net = net, 3714 }; 3715 3716 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO, 3717 cfg.fc_dst = *prefix; 3718 cfg.fc_gateway = *gwaddr; 3719 3720 /* We should treat it as a default route if prefix length is 0. */ 3721 if (!prefixlen) 3722 cfg.fc_flags |= RTF_DEFAULT; 3723 3724 ip6_route_add(&cfg, GFP_ATOMIC, NULL); 3725 3726 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev); 3727 } 3728 #endif 3729 3730 struct fib6_info *rt6_get_dflt_router(struct net *net, 3731 const struct in6_addr *addr, 3732 struct net_device *dev) 3733 { 3734 u32 tb_id = l3mdev_fib_table(dev) ? 
: RT6_TABLE_DFLT; 3735 struct fib6_info *rt; 3736 struct fib6_table *table; 3737 3738 table = fib6_get_table(net, tb_id); 3739 if (!table) 3740 return NULL; 3741 3742 rcu_read_lock(); 3743 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3744 struct fib6_nh *nh = rt->fib6_nh; 3745 3746 if (dev == nh->fib_nh_dev && 3747 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && 3748 ipv6_addr_equal(&nh->fib_nh_gw6, addr)) 3749 break; 3750 } 3751 if (rt && !fib6_info_hold_safe(rt)) 3752 rt = NULL; 3753 rcu_read_unlock(); 3754 return rt; 3755 } 3756 3757 struct fib6_info *rt6_add_dflt_router(struct net *net, 3758 const struct in6_addr *gwaddr, 3759 struct net_device *dev, 3760 unsigned int pref) 3761 { 3762 struct fib6_config cfg = { 3763 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT, 3764 .fc_metric = IP6_RT_PRIO_USER, 3765 .fc_ifindex = dev->ifindex, 3766 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | 3767 RTF_UP | RTF_EXPIRES | RTF_PREF(pref), 3768 .fc_protocol = RTPROT_RA, 3769 .fc_type = RTN_UNICAST, 3770 .fc_nlinfo.portid = 0, 3771 .fc_nlinfo.nlh = NULL, 3772 .fc_nlinfo.nl_net = net, 3773 }; 3774 3775 cfg.fc_gateway = *gwaddr; 3776 3777 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) { 3778 struct fib6_table *table; 3779 3780 table = fib6_get_table(dev_net(dev), cfg.fc_table); 3781 if (table) 3782 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER; 3783 } 3784 3785 return rt6_get_dflt_router(net, gwaddr, dev); 3786 } 3787 3788 static void __rt6_purge_dflt_routers(struct net *net, 3789 struct fib6_table *table) 3790 { 3791 struct fib6_info *rt; 3792 3793 restart: 3794 rcu_read_lock(); 3795 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3796 struct net_device *dev = fib6_info_nh_dev(rt); 3797 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL; 3798 3799 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) && 3800 (!idev || idev->cnf.accept_ra != 2) && 3801 fib6_info_hold_safe(rt)) { 3802 rcu_read_unlock(); 3803 ip6_del_rt(net, rt); 3804 goto restart; 3805 } 3806 } 3807 rcu_read_unlock(); 3808 3809 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER; 3810 } 3811 3812 void rt6_purge_dflt_routers(struct net *net) 3813 { 3814 struct fib6_table *table; 3815 struct hlist_head *head; 3816 unsigned int h; 3817 3818 rcu_read_lock(); 3819 3820 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { 3821 head = &net->ipv6.fib_table_hash[h]; 3822 hlist_for_each_entry_rcu(table, head, tb6_hlist) { 3823 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER) 3824 __rt6_purge_dflt_routers(net, table); 3825 } 3826 } 3827 3828 rcu_read_unlock(); 3829 } 3830 3831 static void rtmsg_to_fib6_config(struct net *net, 3832 struct in6_rtmsg *rtmsg, 3833 struct fib6_config *cfg) 3834 { 3835 *cfg = (struct fib6_config){ 3836 .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? 3837 : RT6_TABLE_MAIN, 3838 .fc_ifindex = rtmsg->rtmsg_ifindex, 3839 .fc_metric = rtmsg->rtmsg_metric ? 
: IP6_RT_PRIO_USER, 3840 .fc_expires = rtmsg->rtmsg_info, 3841 .fc_dst_len = rtmsg->rtmsg_dst_len, 3842 .fc_src_len = rtmsg->rtmsg_src_len, 3843 .fc_flags = rtmsg->rtmsg_flags, 3844 .fc_type = rtmsg->rtmsg_type, 3845 3846 .fc_nlinfo.nl_net = net, 3847 3848 .fc_dst = rtmsg->rtmsg_dst, 3849 .fc_src = rtmsg->rtmsg_src, 3850 .fc_gateway = rtmsg->rtmsg_gateway, 3851 }; 3852 } 3853 3854 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) 3855 { 3856 struct fib6_config cfg; 3857 struct in6_rtmsg rtmsg; 3858 int err; 3859 3860 switch (cmd) { 3861 case SIOCADDRT: /* Add a route */ 3862 case SIOCDELRT: /* Delete a route */ 3863 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 3864 return -EPERM; 3865 err = copy_from_user(&rtmsg, arg, 3866 sizeof(struct in6_rtmsg)); 3867 if (err) 3868 return -EFAULT; 3869 3870 rtmsg_to_fib6_config(net, &rtmsg, &cfg); 3871 3872 rtnl_lock(); 3873 switch (cmd) { 3874 case SIOCADDRT: 3875 err = ip6_route_add(&cfg, GFP_KERNEL, NULL); 3876 break; 3877 case SIOCDELRT: 3878 err = ip6_route_del(&cfg, NULL); 3879 break; 3880 default: 3881 err = -EINVAL; 3882 } 3883 rtnl_unlock(); 3884 3885 return err; 3886 } 3887 3888 return -EINVAL; 3889 } 3890 3891 /* 3892 * Drop the packet on the floor 3893 */ 3894 3895 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) 3896 { 3897 struct dst_entry *dst = skb_dst(skb); 3898 struct net *net = dev_net(dst->dev); 3899 struct inet6_dev *idev; 3900 int type; 3901 3902 if (netif_is_l3_master(skb->dev) && 3903 dst->dev == net->loopback_dev) 3904 idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif)); 3905 else 3906 idev = ip6_dst_idev(dst); 3907 3908 switch (ipstats_mib_noroutes) { 3909 case IPSTATS_MIB_INNOROUTES: 3910 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); 3911 if (type == IPV6_ADDR_ANY) { 3912 IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); 3913 break; 3914 } 3915 /* FALLTHROUGH */ 3916 case IPSTATS_MIB_OUTNOROUTES: 3917 IP6_INC_STATS(net, idev, ipstats_mib_noroutes); 3918 break; 3919 } 3920 3921 /* Start over by dropping the dst for l3mdev case */ 3922 if (netif_is_l3_master(skb->dev)) 3923 skb_dst_drop(skb); 3924 3925 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0); 3926 kfree_skb(skb); 3927 return 0; 3928 } 3929 3930 static int ip6_pkt_discard(struct sk_buff *skb) 3931 { 3932 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES); 3933 } 3934 3935 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3936 { 3937 skb->dev = skb_dst(skb)->dev; 3938 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); 3939 } 3940 3941 static int ip6_pkt_prohibit(struct sk_buff *skb) 3942 { 3943 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES); 3944 } 3945 3946 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3947 { 3948 skb->dev = skb_dst(skb)->dev; 3949 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); 3950 } 3951 3952 /* 3953 * Allocate a dst for local (unicast / anycast) address. 3954 */ 3955 3956 struct fib6_info *addrconf_f6i_alloc(struct net *net, 3957 struct inet6_dev *idev, 3958 const struct in6_addr *addr, 3959 bool anycast, gfp_t gfp_flags) 3960 { 3961 struct fib6_config cfg = { 3962 .fc_table = l3mdev_fib_table(idev->dev) ? 
: RT6_TABLE_LOCAL, 3963 .fc_ifindex = idev->dev->ifindex, 3964 .fc_flags = RTF_UP | RTF_ADDRCONF | RTF_NONEXTHOP, 3965 .fc_dst = *addr, 3966 .fc_dst_len = 128, 3967 .fc_protocol = RTPROT_KERNEL, 3968 .fc_nlinfo.nl_net = net, 3969 .fc_ignore_dev_down = true, 3970 }; 3971 3972 if (anycast) { 3973 cfg.fc_type = RTN_ANYCAST; 3974 cfg.fc_flags |= RTF_ANYCAST; 3975 } else { 3976 cfg.fc_type = RTN_LOCAL; 3977 cfg.fc_flags |= RTF_LOCAL; 3978 } 3979 3980 return ip6_route_info_create(&cfg, gfp_flags, NULL); 3981 } 3982 3983 /* remove a deleted IP from prefsrc entries */ 3984 struct arg_dev_net_ip { 3985 struct net_device *dev; 3986 struct net *net; 3987 struct in6_addr *addr; 3988 }; 3989 3990 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg) 3991 { 3992 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev; 3993 struct net *net = ((struct arg_dev_net_ip *)arg)->net; 3994 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; 3995 3996 if (((void *)rt->fib6_nh->fib_nh_dev == dev || !dev) && 3997 rt != net->ipv6.fib6_null_entry && 3998 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) { 3999 spin_lock_bh(&rt6_exception_lock); 4000 /* remove prefsrc entry */ 4001 rt->fib6_prefsrc.plen = 0; 4002 spin_unlock_bh(&rt6_exception_lock); 4003 } 4004 return 0; 4005 } 4006 4007 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) 4008 { 4009 struct net *net = dev_net(ifp->idev->dev); 4010 struct arg_dev_net_ip adni = { 4011 .dev = ifp->idev->dev, 4012 .net = net, 4013 .addr = &ifp->addr, 4014 }; 4015 fib6_clean_all(net, fib6_remove_prefsrc, &adni); 4016 } 4017 4018 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT) 4019 4020 /* Remove routers and update dst entries when a gateway turns into a host. */ 4021 static int fib6_clean_tohost(struct fib6_info *rt, void *arg) 4022 { 4023 struct in6_addr *gateway = (struct in6_addr *)arg; 4024 struct fib6_nh *nh = rt->fib6_nh; 4025 4026 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && 4027 nh->fib_nh_gw_family && ipv6_addr_equal(gateway, &nh->fib_nh_gw6)) 4028 return -1; 4029 4030 /* Further clean up cached routes in the exception table. 4031 * This is needed because a cached route may have a different 4032 * gateway than its 'parent' in the case of an ip redirect. 
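* Such stale exceptions would otherwise keep steering traffic through the demoted router.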
4033 */ 4034 fib6_nh_exceptions_clean_tohost(nh, gateway); 4035 4036 return 0; 4037 } 4038 4039 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) 4040 { 4041 fib6_clean_all(net, fib6_clean_tohost, gateway); 4042 } 4043 4044 struct arg_netdev_event { 4045 const struct net_device *dev; 4046 union { 4047 unsigned char nh_flags; 4048 unsigned long event; 4049 }; 4050 }; 4051 4052 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) 4053 { 4054 struct fib6_info *iter; 4055 struct fib6_node *fn; 4056 4057 fn = rcu_dereference_protected(rt->fib6_node, 4058 lockdep_is_held(&rt->fib6_table->tb6_lock)); 4059 iter = rcu_dereference_protected(fn->leaf, 4060 lockdep_is_held(&rt->fib6_table->tb6_lock)); 4061 while (iter) { 4062 if (iter->fib6_metric == rt->fib6_metric && 4063 rt6_qualify_for_ecmp(iter)) 4064 return iter; 4065 iter = rcu_dereference_protected(iter->fib6_next, 4066 lockdep_is_held(&rt->fib6_table->tb6_lock)); 4067 } 4068 4069 return NULL; 4070 } 4071 4072 static bool rt6_is_dead(const struct fib6_info *rt) 4073 { 4074 if (rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD || 4075 (rt->fib6_nh->fib_nh_flags & RTNH_F_LINKDOWN && 4076 ip6_ignore_linkdown(rt->fib6_nh->fib_nh_dev))) 4077 return true; 4078 4079 return false; 4080 } 4081 4082 static int rt6_multipath_total_weight(const struct fib6_info *rt) 4083 { 4084 struct fib6_info *iter; 4085 int total = 0; 4086 4087 if (!rt6_is_dead(rt)) 4088 total += rt->fib6_nh->fib_nh_weight; 4089 4090 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { 4091 if (!rt6_is_dead(iter)) 4092 total += iter->fib6_nh->fib_nh_weight; 4093 } 4094 4095 return total; 4096 } 4097 4098 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total) 4099 { 4100 int upper_bound = -1; 4101 4102 if (!rt6_is_dead(rt)) { 4103 *weight += rt->fib6_nh->fib_nh_weight; 4104 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, 4105 total) - 1; 4106 } 4107 atomic_set(&rt->fib6_nh->fib_nh_upper_bound, upper_bound); 4108 } 4109 4110 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total) 4111 { 4112 struct fib6_info *iter; 4113 int weight = 0; 4114 4115 rt6_upper_bound_set(rt, &weight, total); 4116 4117 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4118 rt6_upper_bound_set(iter, &weight, total); 4119 } 4120 4121 void rt6_multipath_rebalance(struct fib6_info *rt) 4122 { 4123 struct fib6_info *first; 4124 int total; 4125 4126 /* In case the entire multipath route was marked for flushing, 4127 * then there is no need to rebalance upon the removal of every 4128 * sibling route. 4129 */ 4130 if (!rt->fib6_nsiblings || rt->should_flush) 4131 return; 4132 4133 /* During lookup routes are evaluated in order, so we need to 4134 * make sure upper bounds are assigned from the first sibling 4135 * onwards. 
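* Roughly, each live nexthop gets upper_bound = ((cumulative weight << 31) / total) - 1 and a dead one gets -1; for sibling weights 1 and 2, for instance, the bounds come out to 0x2AAAAAAA and 0x7FFFFFFF, and path selection picks the first sibling whose bound is >= the 31-bit multipath hash.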
4136 */ 4137 first = rt6_multipath_first_sibling(rt); 4138 if (WARN_ON_ONCE(!first)) 4139 return; 4140 4141 total = rt6_multipath_total_weight(first); 4142 rt6_multipath_upper_bound_set(first, total); 4143 } 4144 4145 static int fib6_ifup(struct fib6_info *rt, void *p_arg) 4146 { 4147 const struct arg_netdev_event *arg = p_arg; 4148 struct net *net = dev_net(arg->dev); 4149 4150 if (rt != net->ipv6.fib6_null_entry && 4151 rt->fib6_nh->fib_nh_dev == arg->dev) { 4152 rt->fib6_nh->fib_nh_flags &= ~arg->nh_flags; 4153 fib6_update_sernum_upto_root(net, rt); 4154 rt6_multipath_rebalance(rt); 4155 } 4156 4157 return 0; 4158 } 4159 4160 void rt6_sync_up(struct net_device *dev, unsigned char nh_flags) 4161 { 4162 struct arg_netdev_event arg = { 4163 .dev = dev, 4164 { 4165 .nh_flags = nh_flags, 4166 }, 4167 }; 4168 4169 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev)) 4170 arg.nh_flags |= RTNH_F_LINKDOWN; 4171 4172 fib6_clean_all(dev_net(dev), fib6_ifup, &arg); 4173 } 4174 4175 static bool rt6_multipath_uses_dev(const struct fib6_info *rt, 4176 const struct net_device *dev) 4177 { 4178 struct fib6_info *iter; 4179 4180 if (rt->fib6_nh->fib_nh_dev == dev) 4181 return true; 4182 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4183 if (iter->fib6_nh->fib_nh_dev == dev) 4184 return true; 4185 4186 return false; 4187 } 4188 4189 static void rt6_multipath_flush(struct fib6_info *rt) 4190 { 4191 struct fib6_info *iter; 4192 4193 rt->should_flush = 1; 4194 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4195 iter->should_flush = 1; 4196 } 4197 4198 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt, 4199 const struct net_device *down_dev) 4200 { 4201 struct fib6_info *iter; 4202 unsigned int dead = 0; 4203 4204 if (rt->fib6_nh->fib_nh_dev == down_dev || 4205 rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD) 4206 dead++; 4207 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4208 if (iter->fib6_nh->fib_nh_dev == down_dev || 4209 iter->fib6_nh->fib_nh_flags & RTNH_F_DEAD) 4210 dead++; 4211 4212 return dead; 4213 } 4214 4215 static void rt6_multipath_nh_flags_set(struct fib6_info *rt, 4216 const struct net_device *dev, 4217 unsigned char nh_flags) 4218 { 4219 struct fib6_info *iter; 4220 4221 if (rt->fib6_nh->fib_nh_dev == dev) 4222 rt->fib6_nh->fib_nh_flags |= nh_flags; 4223 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4224 if (iter->fib6_nh->fib_nh_dev == dev) 4225 iter->fib6_nh->fib_nh_flags |= nh_flags; 4226 } 4227 4228 /* called with write lock held for table with rt */ 4229 static int fib6_ifdown(struct fib6_info *rt, void *p_arg) 4230 { 4231 const struct arg_netdev_event *arg = p_arg; 4232 const struct net_device *dev = arg->dev; 4233 struct net *net = dev_net(dev); 4234 4235 if (rt == net->ipv6.fib6_null_entry) 4236 return 0; 4237 4238 switch (arg->event) { 4239 case NETDEV_UNREGISTER: 4240 return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0; 4241 case NETDEV_DOWN: 4242 if (rt->should_flush) 4243 return -1; 4244 if (!rt->fib6_nsiblings) 4245 return rt->fib6_nh->fib_nh_dev == dev ? 
-1 : 0;
4246 if (rt6_multipath_uses_dev(rt, dev)) {
4247 unsigned int count;
4248
4249 count = rt6_multipath_dead_count(rt, dev);
4250 if (rt->fib6_nsiblings + 1 == count) {
4251 rt6_multipath_flush(rt);
4252 return -1;
4253 }
4254 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4255 RTNH_F_LINKDOWN);
4256 fib6_update_sernum(net, rt);
4257 rt6_multipath_rebalance(rt);
4258 }
4259 return -2;
4260 case NETDEV_CHANGE:
4261 if (rt->fib6_nh->fib_nh_dev != dev ||
4262 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4263 break;
4264 rt->fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
4265 rt6_multipath_rebalance(rt);
4266 break;
4267 }
4268
4269 return 0;
4270 }
4271
4272 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4273 {
4274 struct arg_netdev_event arg = {
4275 .dev = dev,
4276 {
4277 .event = event,
4278 },
4279 };
4280 struct net *net = dev_net(dev);
4281
4282 if (net->ipv6.sysctl.skip_notify_on_dev_down)
4283 fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
4284 else
4285 fib6_clean_all(net, fib6_ifdown, &arg);
4286 }
4287
4288 void rt6_disable_ip(struct net_device *dev, unsigned long event)
4289 {
4290 rt6_sync_down_dev(dev, event);
4291 rt6_uncached_list_flush_dev(dev_net(dev), dev);
4292 neigh_ifdown(&nd_tbl, dev);
4293 }
4294
4295 struct rt6_mtu_change_arg {
4296 struct net_device *dev;
4297 unsigned int mtu;
4298 struct fib6_info *f6i;
4299 };
4300
4301 static int fib6_nh_mtu_change(struct fib6_nh *nh, void *_arg)
4302 {
4303 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *)_arg;
4304 struct fib6_info *f6i = arg->f6i;
4305
4306 /* For an administrative MTU increase, there is no way to discover
4307 * an IPv6 PMTU increase, so the PMTU has to be updated here.
4308 * Since RFC 1981 doesn't cover administrative MTU increases,
4309 * updating the PMTU on such an increase is a MUST (e.g. jumbo frames).
4310 */
4311 if (nh->fib_nh_dev == arg->dev) {
4312 struct inet6_dev *idev = __in6_dev_get(arg->dev);
4313 u32 mtu = f6i->fib6_pmtu;
4314
4315 if (mtu >= arg->mtu ||
4316 (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4317 fib6_metric_set(f6i, RTAX_MTU, arg->mtu);
4318
4319 spin_lock_bh(&rt6_exception_lock);
4320 rt6_exceptions_update_pmtu(idev, nh, arg->mtu);
4321 spin_unlock_bh(&rt6_exception_lock);
4322 }
4323
4324 return 0;
4325 }
4326
4327 static int rt6_mtu_change_route(struct fib6_info *f6i, void *p_arg)
4328 {
4329 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4330 struct inet6_dev *idev;
4331
4332 /* In IPv6, PMTU discovery is not optional,
4333 * so the RTAX_MTU lock cannot disable it.
4334 * We still use this lock to block changes
4335 * caused by addrconf/ndisc.
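 *
 * A route whose MTU metric was installed locked from userspace (for
 * illustration, something like "ip -6 route add 2001:db8::/64 dev eth0
 * mtu lock 1280"; an example command, not taken from this file) is
 * skipped by the fib6_metric_locked() check below and so keeps its
 * MTU across device MTU changes.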
4336 */ 4337 4338 idev = __in6_dev_get(arg->dev); 4339 if (!idev) 4340 return 0; 4341 4342 if (fib6_metric_locked(f6i, RTAX_MTU)) 4343 return 0; 4344 4345 arg->f6i = f6i; 4346 return fib6_nh_mtu_change(f6i->fib6_nh, arg); 4347 } 4348 4349 void rt6_mtu_change(struct net_device *dev, unsigned int mtu) 4350 { 4351 struct rt6_mtu_change_arg arg = { 4352 .dev = dev, 4353 .mtu = mtu, 4354 }; 4355 4356 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg); 4357 } 4358 4359 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { 4360 [RTA_UNSPEC] = { .strict_start_type = RTA_DPORT + 1 }, 4361 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) }, 4362 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) }, 4363 [RTA_OIF] = { .type = NLA_U32 }, 4364 [RTA_IIF] = { .type = NLA_U32 }, 4365 [RTA_PRIORITY] = { .type = NLA_U32 }, 4366 [RTA_METRICS] = { .type = NLA_NESTED }, 4367 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, 4368 [RTA_PREF] = { .type = NLA_U8 }, 4369 [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, 4370 [RTA_ENCAP] = { .type = NLA_NESTED }, 4371 [RTA_EXPIRES] = { .type = NLA_U32 }, 4372 [RTA_UID] = { .type = NLA_U32 }, 4373 [RTA_MARK] = { .type = NLA_U32 }, 4374 [RTA_TABLE] = { .type = NLA_U32 }, 4375 [RTA_IP_PROTO] = { .type = NLA_U8 }, 4376 [RTA_SPORT] = { .type = NLA_U16 }, 4377 [RTA_DPORT] = { .type = NLA_U16 }, 4378 }; 4379 4380 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, 4381 struct fib6_config *cfg, 4382 struct netlink_ext_ack *extack) 4383 { 4384 struct rtmsg *rtm; 4385 struct nlattr *tb[RTA_MAX+1]; 4386 unsigned int pref; 4387 int err; 4388 4389 err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, 4390 rtm_ipv6_policy, extack); 4391 if (err < 0) 4392 goto errout; 4393 4394 err = -EINVAL; 4395 rtm = nlmsg_data(nlh); 4396 4397 *cfg = (struct fib6_config){ 4398 .fc_table = rtm->rtm_table, 4399 .fc_dst_len = rtm->rtm_dst_len, 4400 .fc_src_len = rtm->rtm_src_len, 4401 .fc_flags = RTF_UP, 4402 .fc_protocol = rtm->rtm_protocol, 4403 .fc_type = rtm->rtm_type, 4404 4405 .fc_nlinfo.portid = NETLINK_CB(skb).portid, 4406 .fc_nlinfo.nlh = nlh, 4407 .fc_nlinfo.nl_net = sock_net(skb->sk), 4408 }; 4409 4410 if (rtm->rtm_type == RTN_UNREACHABLE || 4411 rtm->rtm_type == RTN_BLACKHOLE || 4412 rtm->rtm_type == RTN_PROHIBIT || 4413 rtm->rtm_type == RTN_THROW) 4414 cfg->fc_flags |= RTF_REJECT; 4415 4416 if (rtm->rtm_type == RTN_LOCAL) 4417 cfg->fc_flags |= RTF_LOCAL; 4418 4419 if (rtm->rtm_flags & RTM_F_CLONED) 4420 cfg->fc_flags |= RTF_CACHE; 4421 4422 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK); 4423 4424 if (tb[RTA_GATEWAY]) { 4425 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); 4426 cfg->fc_flags |= RTF_GATEWAY; 4427 } 4428 if (tb[RTA_VIA]) { 4429 NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute"); 4430 goto errout; 4431 } 4432 4433 if (tb[RTA_DST]) { 4434 int plen = (rtm->rtm_dst_len + 7) >> 3; 4435 4436 if (nla_len(tb[RTA_DST]) < plen) 4437 goto errout; 4438 4439 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen); 4440 } 4441 4442 if (tb[RTA_SRC]) { 4443 int plen = (rtm->rtm_src_len + 7) >> 3; 4444 4445 if (nla_len(tb[RTA_SRC]) < plen) 4446 goto errout; 4447 4448 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen); 4449 } 4450 4451 if (tb[RTA_PREFSRC]) 4452 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]); 4453 4454 if (tb[RTA_OIF]) 4455 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]); 4456 4457 if (tb[RTA_PRIORITY]) 4458 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]); 4459 4460 if (tb[RTA_METRICS]) { 4461 cfg->fc_mx = nla_data(tb[RTA_METRICS]); 
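/* fc_mx points at the raw nested RTA_METRICS payload; it is only
 * validated and converted to a metrics array later, when the
 * route itself is created.
 */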
4462 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]); 4463 } 4464 4465 if (tb[RTA_TABLE]) 4466 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]); 4467 4468 if (tb[RTA_MULTIPATH]) { 4469 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]); 4470 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); 4471 4472 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp, 4473 cfg->fc_mp_len, extack); 4474 if (err < 0) 4475 goto errout; 4476 } 4477 4478 if (tb[RTA_PREF]) { 4479 pref = nla_get_u8(tb[RTA_PREF]); 4480 if (pref != ICMPV6_ROUTER_PREF_LOW && 4481 pref != ICMPV6_ROUTER_PREF_HIGH) 4482 pref = ICMPV6_ROUTER_PREF_MEDIUM; 4483 cfg->fc_flags |= RTF_PREF(pref); 4484 } 4485 4486 if (tb[RTA_ENCAP]) 4487 cfg->fc_encap = tb[RTA_ENCAP]; 4488 4489 if (tb[RTA_ENCAP_TYPE]) { 4490 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); 4491 4492 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack); 4493 if (err < 0) 4494 goto errout; 4495 } 4496 4497 if (tb[RTA_EXPIRES]) { 4498 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ); 4499 4500 if (addrconf_finite_timeout(timeout)) { 4501 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ); 4502 cfg->fc_flags |= RTF_EXPIRES; 4503 } 4504 } 4505 4506 err = 0; 4507 errout: 4508 return err; 4509 } 4510 4511 struct rt6_nh { 4512 struct fib6_info *fib6_info; 4513 struct fib6_config r_cfg; 4514 struct list_head next; 4515 }; 4516 4517 static int ip6_route_info_append(struct net *net, 4518 struct list_head *rt6_nh_list, 4519 struct fib6_info *rt, 4520 struct fib6_config *r_cfg) 4521 { 4522 struct rt6_nh *nh; 4523 int err = -EEXIST; 4524 4525 list_for_each_entry(nh, rt6_nh_list, next) { 4526 /* check if fib6_info already exists */ 4527 if (rt6_duplicate_nexthop(nh->fib6_info, rt)) 4528 return err; 4529 } 4530 4531 nh = kzalloc(sizeof(*nh), GFP_KERNEL); 4532 if (!nh) 4533 return -ENOMEM; 4534 nh->fib6_info = rt; 4535 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); 4536 list_add_tail(&nh->next, rt6_nh_list); 4537 4538 return 0; 4539 } 4540 4541 static void ip6_route_mpath_notify(struct fib6_info *rt, 4542 struct fib6_info *rt_last, 4543 struct nl_info *info, 4544 __u16 nlflags) 4545 { 4546 /* if this is an APPEND route, then rt points to the first route 4547 * inserted and rt_last points to last route inserted. Userspace 4548 * wants a consistent dump of the route which starts at the first 4549 * nexthop. Since sibling routes are always added at the end of 4550 * the list, find the first sibling of the last route appended 4551 */ 4552 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) { 4553 rt = list_first_entry(&rt_last->fib6_siblings, 4554 struct fib6_info, 4555 fib6_siblings); 4556 } 4557 4558 if (rt) 4559 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); 4560 } 4561 4562 static int ip6_route_multipath_add(struct fib6_config *cfg, 4563 struct netlink_ext_ack *extack) 4564 { 4565 struct fib6_info *rt_notif = NULL, *rt_last = NULL; 4566 struct nl_info *info = &cfg->fc_nlinfo; 4567 struct fib6_config r_cfg; 4568 struct rtnexthop *rtnh; 4569 struct fib6_info *rt; 4570 struct rt6_nh *err_nh; 4571 struct rt6_nh *nh, *nh_safe; 4572 __u16 nlflags; 4573 int remaining; 4574 int attrlen; 4575 int err = 1; 4576 int nhn = 0; 4577 int replace = (cfg->fc_nlinfo.nlh && 4578 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); 4579 LIST_HEAD(rt6_nh_list); 4580 4581 nlflags = replace ? 
NLM_F_REPLACE : NLM_F_CREATE; 4582 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND) 4583 nlflags |= NLM_F_APPEND; 4584 4585 remaining = cfg->fc_mp_len; 4586 rtnh = (struct rtnexthop *)cfg->fc_mp; 4587 4588 /* Parse a Multipath Entry and build a list (rt6_nh_list) of 4589 * fib6_info structs per nexthop 4590 */ 4591 while (rtnh_ok(rtnh, remaining)) { 4592 memcpy(&r_cfg, cfg, sizeof(*cfg)); 4593 if (rtnh->rtnh_ifindex) 4594 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 4595 4596 attrlen = rtnh_attrlen(rtnh); 4597 if (attrlen > 0) { 4598 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 4599 4600 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 4601 if (nla) { 4602 r_cfg.fc_gateway = nla_get_in6_addr(nla); 4603 r_cfg.fc_flags |= RTF_GATEWAY; 4604 } 4605 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); 4606 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); 4607 if (nla) 4608 r_cfg.fc_encap_type = nla_get_u16(nla); 4609 } 4610 4611 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK); 4612 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack); 4613 if (IS_ERR(rt)) { 4614 err = PTR_ERR(rt); 4615 rt = NULL; 4616 goto cleanup; 4617 } 4618 if (!rt6_qualify_for_ecmp(rt)) { 4619 err = -EINVAL; 4620 NL_SET_ERR_MSG(extack, 4621 "Device only routes can not be added for IPv6 using the multipath API."); 4622 fib6_info_release(rt); 4623 goto cleanup; 4624 } 4625 4626 rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1; 4627 4628 err = ip6_route_info_append(info->nl_net, &rt6_nh_list, 4629 rt, &r_cfg); 4630 if (err) { 4631 fib6_info_release(rt); 4632 goto cleanup; 4633 } 4634 4635 rtnh = rtnh_next(rtnh, &remaining); 4636 } 4637 4638 /* for add and replace send one notification with all nexthops. 4639 * Skip the notification in fib6_add_rt2node and send one with 4640 * the full route when done 4641 */ 4642 info->skip_notify = 1; 4643 4644 err_nh = NULL; 4645 list_for_each_entry(nh, &rt6_nh_list, next) { 4646 err = __ip6_ins_rt(nh->fib6_info, info, extack); 4647 fib6_info_release(nh->fib6_info); 4648 4649 if (!err) { 4650 /* save reference to last route successfully inserted */ 4651 rt_last = nh->fib6_info; 4652 4653 /* save reference to first route for notification */ 4654 if (!rt_notif) 4655 rt_notif = nh->fib6_info; 4656 } 4657 4658 /* nh->fib6_info is used or freed at this point, reset to NULL*/ 4659 nh->fib6_info = NULL; 4660 if (err) { 4661 if (replace && nhn) 4662 NL_SET_ERR_MSG_MOD(extack, 4663 "multipath route replace failed (check consistency of installed routes)"); 4664 err_nh = nh; 4665 goto add_errout; 4666 } 4667 4668 /* Because each route is added like a single route we remove 4669 * these flags after the first nexthop: if there is a collision, 4670 * we have already failed to add the first nexthop: 4671 * fib6_add_rt2node() has rejected it; when replacing, old 4672 * nexthops have been replaced by first new, the rest should 4673 * be added to it. 4674 */ 4675 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | 4676 NLM_F_REPLACE); 4677 nhn++; 4678 } 4679 4680 /* success ... 
tell user about new route */ 4681 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 4682 goto cleanup; 4683 4684 add_errout: 4685 /* send notification for routes that were added so that 4686 * the delete notifications sent by ip6_route_del are 4687 * coherent 4688 */ 4689 if (rt_notif) 4690 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 4691 4692 /* Delete routes that were already added */ 4693 list_for_each_entry(nh, &rt6_nh_list, next) { 4694 if (err_nh == nh) 4695 break; 4696 ip6_route_del(&nh->r_cfg, extack); 4697 } 4698 4699 cleanup: 4700 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { 4701 if (nh->fib6_info) 4702 fib6_info_release(nh->fib6_info); 4703 list_del(&nh->next); 4704 kfree(nh); 4705 } 4706 4707 return err; 4708 } 4709 4710 static int ip6_route_multipath_del(struct fib6_config *cfg, 4711 struct netlink_ext_ack *extack) 4712 { 4713 struct fib6_config r_cfg; 4714 struct rtnexthop *rtnh; 4715 int remaining; 4716 int attrlen; 4717 int err = 1, last_err = 0; 4718 4719 remaining = cfg->fc_mp_len; 4720 rtnh = (struct rtnexthop *)cfg->fc_mp; 4721 4722 /* Parse a Multipath Entry */ 4723 while (rtnh_ok(rtnh, remaining)) { 4724 memcpy(&r_cfg, cfg, sizeof(*cfg)); 4725 if (rtnh->rtnh_ifindex) 4726 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 4727 4728 attrlen = rtnh_attrlen(rtnh); 4729 if (attrlen > 0) { 4730 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 4731 4732 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 4733 if (nla) { 4734 nla_memcpy(&r_cfg.fc_gateway, nla, 16); 4735 r_cfg.fc_flags |= RTF_GATEWAY; 4736 } 4737 } 4738 err = ip6_route_del(&r_cfg, extack); 4739 if (err) 4740 last_err = err; 4741 4742 rtnh = rtnh_next(rtnh, &remaining); 4743 } 4744 4745 return last_err; 4746 } 4747 4748 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, 4749 struct netlink_ext_ack *extack) 4750 { 4751 struct fib6_config cfg; 4752 int err; 4753 4754 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 4755 if (err < 0) 4756 return err; 4757 4758 if (cfg.fc_mp) 4759 return ip6_route_multipath_del(&cfg, extack); 4760 else { 4761 cfg.fc_delete_all_nh = 1; 4762 return ip6_route_del(&cfg, extack); 4763 } 4764 } 4765 4766 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, 4767 struct netlink_ext_ack *extack) 4768 { 4769 struct fib6_config cfg; 4770 int err; 4771 4772 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 4773 if (err < 0) 4774 return err; 4775 4776 if (cfg.fc_metric == 0) 4777 cfg.fc_metric = IP6_RT_PRIO_USER; 4778 4779 if (cfg.fc_mp) 4780 return ip6_route_multipath_add(&cfg, extack); 4781 else 4782 return ip6_route_add(&cfg, GFP_KERNEL, extack); 4783 } 4784 4785 static size_t rt6_nlmsg_size(struct fib6_info *rt) 4786 { 4787 int nexthop_len = 0; 4788 4789 if (rt->fib6_nsiblings) { 4790 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ 4791 + NLA_ALIGN(sizeof(struct rtnexthop)) 4792 + nla_total_size(16) /* RTA_GATEWAY */ 4793 + lwtunnel_get_encap_size(rt->fib6_nh->fib_nh_lws); 4794 4795 nexthop_len *= rt->fib6_nsiblings; 4796 } 4797 4798 return NLMSG_ALIGN(sizeof(struct rtmsg)) 4799 + nla_total_size(16) /* RTA_SRC */ 4800 + nla_total_size(16) /* RTA_DST */ 4801 + nla_total_size(16) /* RTA_GATEWAY */ 4802 + nla_total_size(16) /* RTA_PREFSRC */ 4803 + nla_total_size(4) /* RTA_TABLE */ 4804 + nla_total_size(4) /* RTA_IIF */ 4805 + nla_total_size(4) /* RTA_OIF */ 4806 + nla_total_size(4) /* RTA_PRIORITY */ 4807 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */ 4808 + nla_total_size(sizeof(struct rta_cacheinfo)) 4809 + 
nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ 4810 + nla_total_size(1) /* RTA_PREF */ 4811 + lwtunnel_get_encap_size(rt->fib6_nh->fib_nh_lws) 4812 + nexthop_len; 4813 } 4814 4815 static int rt6_fill_node(struct net *net, struct sk_buff *skb, 4816 struct fib6_info *rt, struct dst_entry *dst, 4817 struct in6_addr *dest, struct in6_addr *src, 4818 int iif, int type, u32 portid, u32 seq, 4819 unsigned int flags) 4820 { 4821 struct rt6_info *rt6 = (struct rt6_info *)dst; 4822 struct rt6key *rt6_dst, *rt6_src; 4823 u32 *pmetrics, table, rt6_flags; 4824 struct nlmsghdr *nlh; 4825 struct rtmsg *rtm; 4826 long expires = 0; 4827 4828 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags); 4829 if (!nlh) 4830 return -EMSGSIZE; 4831 4832 if (rt6) { 4833 rt6_dst = &rt6->rt6i_dst; 4834 rt6_src = &rt6->rt6i_src; 4835 rt6_flags = rt6->rt6i_flags; 4836 } else { 4837 rt6_dst = &rt->fib6_dst; 4838 rt6_src = &rt->fib6_src; 4839 rt6_flags = rt->fib6_flags; 4840 } 4841 4842 rtm = nlmsg_data(nlh); 4843 rtm->rtm_family = AF_INET6; 4844 rtm->rtm_dst_len = rt6_dst->plen; 4845 rtm->rtm_src_len = rt6_src->plen; 4846 rtm->rtm_tos = 0; 4847 if (rt->fib6_table) 4848 table = rt->fib6_table->tb6_id; 4849 else 4850 table = RT6_TABLE_UNSPEC; 4851 rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT; 4852 if (nla_put_u32(skb, RTA_TABLE, table)) 4853 goto nla_put_failure; 4854 4855 rtm->rtm_type = rt->fib6_type; 4856 rtm->rtm_flags = 0; 4857 rtm->rtm_scope = RT_SCOPE_UNIVERSE; 4858 rtm->rtm_protocol = rt->fib6_protocol; 4859 4860 if (rt6_flags & RTF_CACHE) 4861 rtm->rtm_flags |= RTM_F_CLONED; 4862 4863 if (dest) { 4864 if (nla_put_in6_addr(skb, RTA_DST, dest)) 4865 goto nla_put_failure; 4866 rtm->rtm_dst_len = 128; 4867 } else if (rtm->rtm_dst_len) 4868 if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr)) 4869 goto nla_put_failure; 4870 #ifdef CONFIG_IPV6_SUBTREES 4871 if (src) { 4872 if (nla_put_in6_addr(skb, RTA_SRC, src)) 4873 goto nla_put_failure; 4874 rtm->rtm_src_len = 128; 4875 } else if (rtm->rtm_src_len && 4876 nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr)) 4877 goto nla_put_failure; 4878 #endif 4879 if (iif) { 4880 #ifdef CONFIG_IPV6_MROUTE 4881 if (ipv6_addr_is_multicast(&rt6_dst->addr)) { 4882 int err = ip6mr_get_route(net, skb, rtm, portid); 4883 4884 if (err == 0) 4885 return 0; 4886 if (err < 0) 4887 goto nla_put_failure; 4888 } else 4889 #endif 4890 if (nla_put_u32(skb, RTA_IIF, iif)) 4891 goto nla_put_failure; 4892 } else if (dest) { 4893 struct in6_addr saddr_buf; 4894 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 && 4895 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 4896 goto nla_put_failure; 4897 } 4898 4899 if (rt->fib6_prefsrc.plen) { 4900 struct in6_addr saddr_buf; 4901 saddr_buf = rt->fib6_prefsrc.addr; 4902 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 4903 goto nla_put_failure; 4904 } 4905 4906 pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics; 4907 if (rtnetlink_put_metrics(skb, pmetrics) < 0) 4908 goto nla_put_failure; 4909 4910 if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric)) 4911 goto nla_put_failure; 4912 4913 /* For multipath routes, walk the siblings list and add 4914 * each as a nexthop within RTA_MULTIPATH. 
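 *
 * Schematically, the layout built below is:
 *
 *   RTA_MULTIPATH
 *     struct rtnexthop  (rtnh_hops = weight - 1, rtnh_ifindex, flags)
 *       per-nexthop attributes (RTA_GATEWAY, encap, ...)
 *     struct rtnexthop
 *       ...
 *
 * ip6_route_multipath_add() above parses the same layout on the
 * request side.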
4915 */ 4916 if (rt6) { 4917 if (rt6_flags & RTF_GATEWAY && 4918 nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway)) 4919 goto nla_put_failure; 4920 4921 if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex)) 4922 goto nla_put_failure; 4923 } else if (rt->fib6_nsiblings) { 4924 struct fib6_info *sibling, *next_sibling; 4925 struct nlattr *mp; 4926 4927 mp = nla_nest_start_noflag(skb, RTA_MULTIPATH); 4928 if (!mp) 4929 goto nla_put_failure; 4930 4931 if (fib_add_nexthop(skb, &rt->fib6_nh->nh_common, 4932 rt->fib6_nh->fib_nh_weight) < 0) 4933 goto nla_put_failure; 4934 4935 list_for_each_entry_safe(sibling, next_sibling, 4936 &rt->fib6_siblings, fib6_siblings) { 4937 if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common, 4938 sibling->fib6_nh->fib_nh_weight) < 0) 4939 goto nla_put_failure; 4940 } 4941 4942 nla_nest_end(skb, mp); 4943 } else { 4944 unsigned char nh_flags = 0; 4945 4946 if (fib_nexthop_info(skb, &rt->fib6_nh->nh_common, 4947 &nh_flags, false) < 0) 4948 goto nla_put_failure; 4949 4950 rtm->rtm_flags |= nh_flags; 4951 } 4952 4953 if (rt6_flags & RTF_EXPIRES) { 4954 expires = dst ? dst->expires : rt->expires; 4955 expires -= jiffies; 4956 } 4957 4958 if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0) 4959 goto nla_put_failure; 4960 4961 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags))) 4962 goto nla_put_failure; 4963 4964 4965 nlmsg_end(skb, nlh); 4966 return 0; 4967 4968 nla_put_failure: 4969 nlmsg_cancel(skb, nlh); 4970 return -EMSGSIZE; 4971 } 4972 4973 static bool fib6_info_uses_dev(const struct fib6_info *f6i, 4974 const struct net_device *dev) 4975 { 4976 if (f6i->fib6_nh->fib_nh_dev == dev) 4977 return true; 4978 4979 if (f6i->fib6_nsiblings) { 4980 struct fib6_info *sibling, *next_sibling; 4981 4982 list_for_each_entry_safe(sibling, next_sibling, 4983 &f6i->fib6_siblings, fib6_siblings) { 4984 if (sibling->fib6_nh->fib_nh_dev == dev) 4985 return true; 4986 } 4987 } 4988 4989 return false; 4990 } 4991 4992 int rt6_dump_route(struct fib6_info *rt, void *p_arg) 4993 { 4994 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; 4995 struct fib_dump_filter *filter = &arg->filter; 4996 unsigned int flags = NLM_F_MULTI; 4997 struct net *net = arg->net; 4998 4999 if (rt == net->ipv6.fib6_null_entry) 5000 return 0; 5001 5002 if ((filter->flags & RTM_F_PREFIX) && 5003 !(rt->fib6_flags & RTF_PREFIX_RT)) { 5004 /* success since this is not a prefix route */ 5005 return 1; 5006 } 5007 if (filter->filter_set) { 5008 if ((filter->rt_type && rt->fib6_type != filter->rt_type) || 5009 (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) || 5010 (filter->protocol && rt->fib6_protocol != filter->protocol)) { 5011 return 1; 5012 } 5013 flags |= NLM_F_DUMP_FILTERED; 5014 } 5015 5016 return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0, 5017 RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid, 5018 arg->cb->nlh->nlmsg_seq, flags); 5019 } 5020 5021 static int inet6_rtm_valid_getroute_req(struct sk_buff *skb, 5022 const struct nlmsghdr *nlh, 5023 struct nlattr **tb, 5024 struct netlink_ext_ack *extack) 5025 { 5026 struct rtmsg *rtm; 5027 int i, err; 5028 5029 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { 5030 NL_SET_ERR_MSG_MOD(extack, 5031 "Invalid header for get route request"); 5032 return -EINVAL; 5033 } 5034 5035 if (!netlink_strict_get_check(skb)) 5036 return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, 5037 rtm_ipv6_policy, extack); 5038 5039 rtm = nlmsg_data(nlh); 5040 if ((rtm->rtm_src_len && rtm->rtm_src_len != 
128) || 5041 (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) || 5042 rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope || 5043 rtm->rtm_type) { 5044 NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request"); 5045 return -EINVAL; 5046 } 5047 if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) { 5048 NL_SET_ERR_MSG_MOD(extack, 5049 "Invalid flags for get route request"); 5050 return -EINVAL; 5051 } 5052 5053 err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX, 5054 rtm_ipv6_policy, extack); 5055 if (err) 5056 return err; 5057 5058 if ((tb[RTA_SRC] && !rtm->rtm_src_len) || 5059 (tb[RTA_DST] && !rtm->rtm_dst_len)) { 5060 NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6"); 5061 return -EINVAL; 5062 } 5063 5064 for (i = 0; i <= RTA_MAX; i++) { 5065 if (!tb[i]) 5066 continue; 5067 5068 switch (i) { 5069 case RTA_SRC: 5070 case RTA_DST: 5071 case RTA_IIF: 5072 case RTA_OIF: 5073 case RTA_MARK: 5074 case RTA_UID: 5075 case RTA_SPORT: 5076 case RTA_DPORT: 5077 case RTA_IP_PROTO: 5078 break; 5079 default: 5080 NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request"); 5081 return -EINVAL; 5082 } 5083 } 5084 5085 return 0; 5086 } 5087 5088 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, 5089 struct netlink_ext_ack *extack) 5090 { 5091 struct net *net = sock_net(in_skb->sk); 5092 struct nlattr *tb[RTA_MAX+1]; 5093 int err, iif = 0, oif = 0; 5094 struct fib6_info *from; 5095 struct dst_entry *dst; 5096 struct rt6_info *rt; 5097 struct sk_buff *skb; 5098 struct rtmsg *rtm; 5099 struct flowi6 fl6 = {}; 5100 bool fibmatch; 5101 5102 err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack); 5103 if (err < 0) 5104 goto errout; 5105 5106 err = -EINVAL; 5107 rtm = nlmsg_data(nlh); 5108 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0); 5109 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH); 5110 5111 if (tb[RTA_SRC]) { 5112 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr)) 5113 goto errout; 5114 5115 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]); 5116 } 5117 5118 if (tb[RTA_DST]) { 5119 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr)) 5120 goto errout; 5121 5122 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]); 5123 } 5124 5125 if (tb[RTA_IIF]) 5126 iif = nla_get_u32(tb[RTA_IIF]); 5127 5128 if (tb[RTA_OIF]) 5129 oif = nla_get_u32(tb[RTA_OIF]); 5130 5131 if (tb[RTA_MARK]) 5132 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]); 5133 5134 if (tb[RTA_UID]) 5135 fl6.flowi6_uid = make_kuid(current_user_ns(), 5136 nla_get_u32(tb[RTA_UID])); 5137 else 5138 fl6.flowi6_uid = iif ? 
INVALID_UID : current_uid(); 5139 5140 if (tb[RTA_SPORT]) 5141 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]); 5142 5143 if (tb[RTA_DPORT]) 5144 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]); 5145 5146 if (tb[RTA_IP_PROTO]) { 5147 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO], 5148 &fl6.flowi6_proto, AF_INET6, 5149 extack); 5150 if (err) 5151 goto errout; 5152 } 5153 5154 if (iif) { 5155 struct net_device *dev; 5156 int flags = 0; 5157 5158 rcu_read_lock(); 5159 5160 dev = dev_get_by_index_rcu(net, iif); 5161 if (!dev) { 5162 rcu_read_unlock(); 5163 err = -ENODEV; 5164 goto errout; 5165 } 5166 5167 fl6.flowi6_iif = iif; 5168 5169 if (!ipv6_addr_any(&fl6.saddr)) 5170 flags |= RT6_LOOKUP_F_HAS_SADDR; 5171 5172 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags); 5173 5174 rcu_read_unlock(); 5175 } else { 5176 fl6.flowi6_oif = oif; 5177 5178 dst = ip6_route_output(net, NULL, &fl6); 5179 } 5180 5181 5182 rt = container_of(dst, struct rt6_info, dst); 5183 if (rt->dst.error) { 5184 err = rt->dst.error; 5185 ip6_rt_put(rt); 5186 goto errout; 5187 } 5188 5189 if (rt == net->ipv6.ip6_null_entry) { 5190 err = rt->dst.error; 5191 ip6_rt_put(rt); 5192 goto errout; 5193 } 5194 5195 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 5196 if (!skb) { 5197 ip6_rt_put(rt); 5198 err = -ENOBUFS; 5199 goto errout; 5200 } 5201 5202 skb_dst_set(skb, &rt->dst); 5203 5204 rcu_read_lock(); 5205 from = rcu_dereference(rt->from); 5206 if (from) { 5207 if (fibmatch) 5208 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, 5209 iif, RTM_NEWROUTE, 5210 NETLINK_CB(in_skb).portid, 5211 nlh->nlmsg_seq, 0); 5212 else 5213 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr, 5214 &fl6.saddr, iif, RTM_NEWROUTE, 5215 NETLINK_CB(in_skb).portid, 5216 nlh->nlmsg_seq, 0); 5217 } else { 5218 err = -ENETUNREACH; 5219 } 5220 rcu_read_unlock(); 5221 5222 if (err < 0) { 5223 kfree_skb(skb); 5224 goto errout; 5225 } 5226 5227 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); 5228 errout: 5229 return err; 5230 } 5231 5232 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, 5233 unsigned int nlm_flags) 5234 { 5235 struct sk_buff *skb; 5236 struct net *net = info->nl_net; 5237 u32 seq; 5238 int err; 5239 5240 err = -ENOBUFS; 5241 seq = info->nlh ? info->nlh->nlmsg_seq : 0; 5242 5243 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 5244 if (!skb) 5245 goto errout; 5246 5247 err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, 5248 event, info->portid, seq, nlm_flags); 5249 if (err < 0) { 5250 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ 5251 WARN_ON(err == -EMSGSIZE); 5252 kfree_skb(skb); 5253 goto errout; 5254 } 5255 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 5256 info->nlh, gfp_any()); 5257 return; 5258 errout: 5259 if (err < 0) 5260 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); 5261 } 5262 5263 void fib6_rt_update(struct net *net, struct fib6_info *rt, 5264 struct nl_info *info) 5265 { 5266 u32 seq = info->nlh ? 
info->nlh->nlmsg_seq : 0; 5267 struct sk_buff *skb; 5268 int err = -ENOBUFS; 5269 5270 /* call_fib6_entry_notifiers will be removed when in-kernel notifier 5271 * is implemented and supported for nexthop objects 5272 */ 5273 call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, rt, NULL); 5274 5275 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 5276 if (!skb) 5277 goto errout; 5278 5279 err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, 5280 RTM_NEWROUTE, info->portid, seq, NLM_F_REPLACE); 5281 if (err < 0) { 5282 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ 5283 WARN_ON(err == -EMSGSIZE); 5284 kfree_skb(skb); 5285 goto errout; 5286 } 5287 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 5288 info->nlh, gfp_any()); 5289 return; 5290 errout: 5291 if (err < 0) 5292 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); 5293 } 5294 5295 static int ip6_route_dev_notify(struct notifier_block *this, 5296 unsigned long event, void *ptr) 5297 { 5298 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 5299 struct net *net = dev_net(dev); 5300 5301 if (!(dev->flags & IFF_LOOPBACK)) 5302 return NOTIFY_OK; 5303 5304 if (event == NETDEV_REGISTER) { 5305 net->ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = dev; 5306 net->ipv6.ip6_null_entry->dst.dev = dev; 5307 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev); 5308 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5309 net->ipv6.ip6_prohibit_entry->dst.dev = dev; 5310 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev); 5311 net->ipv6.ip6_blk_hole_entry->dst.dev = dev; 5312 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev); 5313 #endif 5314 } else if (event == NETDEV_UNREGISTER && 5315 dev->reg_state != NETREG_UNREGISTERED) { 5316 /* NETDEV_UNREGISTER could be fired for multiple times by 5317 * netdev_wait_allrefs(). Make sure we only call this once. 5318 */ 5319 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev); 5320 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5321 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev); 5322 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev); 5323 #endif 5324 } 5325 5326 return NOTIFY_OK; 5327 } 5328 5329 /* 5330 * /proc 5331 */ 5332 5333 #ifdef CONFIG_PROC_FS 5334 static int rt6_stats_seq_show(struct seq_file *seq, void *v) 5335 { 5336 struct net *net = (struct net *)seq->private; 5337 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n", 5338 net->ipv6.rt6_stats->fib_nodes, 5339 net->ipv6.rt6_stats->fib_route_nodes, 5340 atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc), 5341 net->ipv6.rt6_stats->fib_rt_entries, 5342 net->ipv6.rt6_stats->fib_rt_cache, 5343 dst_entries_get_slow(&net->ipv6.ip6_dst_ops), 5344 net->ipv6.rt6_stats->fib_discarded_routes); 5345 5346 return 0; 5347 } 5348 #endif /* CONFIG_PROC_FS */ 5349 5350 #ifdef CONFIG_SYSCTL 5351 5352 static 5353 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, 5354 void __user *buffer, size_t *lenp, loff_t *ppos) 5355 { 5356 struct net *net; 5357 int delay; 5358 int ret; 5359 if (!write) 5360 return -EINVAL; 5361 5362 net = (struct net *)ctl->extra1; 5363 delay = net->ipv6.sysctl.flush_delay; 5364 ret = proc_dointvec(ctl, write, buffer, lenp, ppos); 5365 if (ret) 5366 return ret; 5367 5368 fib6_run_gc(delay <= 0 ? 
0 : (unsigned long)delay, net, delay > 0); 5369 return 0; 5370 } 5371 5372 static int zero; 5373 static int one = 1; 5374 5375 static struct ctl_table ipv6_route_table_template[] = { 5376 { 5377 .procname = "flush", 5378 .data = &init_net.ipv6.sysctl.flush_delay, 5379 .maxlen = sizeof(int), 5380 .mode = 0200, 5381 .proc_handler = ipv6_sysctl_rtcache_flush 5382 }, 5383 { 5384 .procname = "gc_thresh", 5385 .data = &ip6_dst_ops_template.gc_thresh, 5386 .maxlen = sizeof(int), 5387 .mode = 0644, 5388 .proc_handler = proc_dointvec, 5389 }, 5390 { 5391 .procname = "max_size", 5392 .data = &init_net.ipv6.sysctl.ip6_rt_max_size, 5393 .maxlen = sizeof(int), 5394 .mode = 0644, 5395 .proc_handler = proc_dointvec, 5396 }, 5397 { 5398 .procname = "gc_min_interval", 5399 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 5400 .maxlen = sizeof(int), 5401 .mode = 0644, 5402 .proc_handler = proc_dointvec_jiffies, 5403 }, 5404 { 5405 .procname = "gc_timeout", 5406 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout, 5407 .maxlen = sizeof(int), 5408 .mode = 0644, 5409 .proc_handler = proc_dointvec_jiffies, 5410 }, 5411 { 5412 .procname = "gc_interval", 5413 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval, 5414 .maxlen = sizeof(int), 5415 .mode = 0644, 5416 .proc_handler = proc_dointvec_jiffies, 5417 }, 5418 { 5419 .procname = "gc_elasticity", 5420 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity, 5421 .maxlen = sizeof(int), 5422 .mode = 0644, 5423 .proc_handler = proc_dointvec, 5424 }, 5425 { 5426 .procname = "mtu_expires", 5427 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires, 5428 .maxlen = sizeof(int), 5429 .mode = 0644, 5430 .proc_handler = proc_dointvec_jiffies, 5431 }, 5432 { 5433 .procname = "min_adv_mss", 5434 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss, 5435 .maxlen = sizeof(int), 5436 .mode = 0644, 5437 .proc_handler = proc_dointvec, 5438 }, 5439 { 5440 .procname = "gc_min_interval_ms", 5441 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 5442 .maxlen = sizeof(int), 5443 .mode = 0644, 5444 .proc_handler = proc_dointvec_ms_jiffies, 5445 }, 5446 { 5447 .procname = "skip_notify_on_dev_down", 5448 .data = &init_net.ipv6.sysctl.skip_notify_on_dev_down, 5449 .maxlen = sizeof(int), 5450 .mode = 0644, 5451 .proc_handler = proc_dointvec, 5452 .extra1 = &zero, 5453 .extra2 = &one, 5454 }, 5455 { } 5456 }; 5457 5458 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net) 5459 { 5460 struct ctl_table *table; 5461 5462 table = kmemdup(ipv6_route_table_template, 5463 sizeof(ipv6_route_table_template), 5464 GFP_KERNEL); 5465 5466 if (table) { 5467 table[0].data = &net->ipv6.sysctl.flush_delay; 5468 table[0].extra1 = net; 5469 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh; 5470 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size; 5471 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; 5472 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout; 5473 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval; 5474 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity; 5475 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires; 5476 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss; 5477 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; 5478 table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down; 5479 5480 /* Don't export sysctls to unprivileged users */ 5481 if (net->user_ns != &init_user_ns) 5482 table[0].procname = NULL; 5483 } 5484 5485 return table; 5486 } 5487 #endif 5488 5489 static int __net_init ip6_route_net_init(struct net *net) 5490 { 5491 int ret = -ENOMEM; 5492 
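/* Every namespace gets private copies of the dst ops and of the
 * null (and, with multiple tables, prohibit/blackhole) template
 * entries allocated below, so ip6_route_net_exit() can free them
 * without touching any other namespace.
 */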
5493 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template, 5494 sizeof(net->ipv6.ip6_dst_ops)); 5495 5496 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0) 5497 goto out_ip6_dst_ops; 5498 5499 net->ipv6.fib6_null_entry = fib6_info_alloc(GFP_KERNEL, true); 5500 if (!net->ipv6.fib6_null_entry) 5501 goto out_ip6_dst_entries; 5502 memcpy(net->ipv6.fib6_null_entry, &fib6_null_entry_template, 5503 sizeof(*net->ipv6.fib6_null_entry)); 5504 5505 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template, 5506 sizeof(*net->ipv6.ip6_null_entry), 5507 GFP_KERNEL); 5508 if (!net->ipv6.ip6_null_entry) 5509 goto out_fib6_null_entry; 5510 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5511 dst_init_metrics(&net->ipv6.ip6_null_entry->dst, 5512 ip6_template_metrics, true); 5513 5514 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5515 net->ipv6.fib6_has_custom_rules = false; 5516 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template, 5517 sizeof(*net->ipv6.ip6_prohibit_entry), 5518 GFP_KERNEL); 5519 if (!net->ipv6.ip6_prohibit_entry) 5520 goto out_ip6_null_entry; 5521 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5522 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst, 5523 ip6_template_metrics, true); 5524 5525 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template, 5526 sizeof(*net->ipv6.ip6_blk_hole_entry), 5527 GFP_KERNEL); 5528 if (!net->ipv6.ip6_blk_hole_entry) 5529 goto out_ip6_prohibit_entry; 5530 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5531 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, 5532 ip6_template_metrics, true); 5533 #endif 5534 5535 net->ipv6.sysctl.flush_delay = 0; 5536 net->ipv6.sysctl.ip6_rt_max_size = 4096; 5537 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2; 5538 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ; 5539 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ; 5540 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9; 5541 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ; 5542 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; 5543 net->ipv6.sysctl.skip_notify_on_dev_down = 0; 5544 5545 net->ipv6.ip6_rt_gc_expire = 30*HZ; 5546 5547 ret = 0; 5548 out: 5549 return ret; 5550 5551 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5552 out_ip6_prohibit_entry: 5553 kfree(net->ipv6.ip6_prohibit_entry); 5554 out_ip6_null_entry: 5555 kfree(net->ipv6.ip6_null_entry); 5556 #endif 5557 out_fib6_null_entry: 5558 kfree(net->ipv6.fib6_null_entry); 5559 out_ip6_dst_entries: 5560 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 5561 out_ip6_dst_ops: 5562 goto out; 5563 } 5564 5565 static void __net_exit ip6_route_net_exit(struct net *net) 5566 { 5567 kfree(net->ipv6.fib6_null_entry); 5568 kfree(net->ipv6.ip6_null_entry); 5569 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5570 kfree(net->ipv6.ip6_prohibit_entry); 5571 kfree(net->ipv6.ip6_blk_hole_entry); 5572 #endif 5573 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 5574 } 5575 5576 static int __net_init ip6_route_net_init_late(struct net *net) 5577 { 5578 #ifdef CONFIG_PROC_FS 5579 proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops, 5580 sizeof(struct ipv6_route_iter)); 5581 proc_create_net_single("rt6_stats", 0444, net->proc_net, 5582 rt6_stats_seq_show, NULL); 5583 #endif 5584 return 0; 5585 } 5586 5587 static void __net_exit ip6_route_net_exit_late(struct net *net) 5588 { 5589 #ifdef CONFIG_PROC_FS 5590 remove_proc_entry("ipv6_route", net->proc_net); 5591 remove_proc_entry("rt6_stats", net->proc_net); 5592 #endif 5593 } 5594 5595 static struct pernet_operations ip6_route_net_ops = { 5596 
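/* init/exit run once per network namespace; registered from
 * ip6_route_init() via register_pernet_subsys().
 */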
.init = ip6_route_net_init, 5597 .exit = ip6_route_net_exit, 5598 }; 5599 5600 static int __net_init ipv6_inetpeer_init(struct net *net) 5601 { 5602 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL); 5603 5604 if (!bp) 5605 return -ENOMEM; 5606 inet_peer_base_init(bp); 5607 net->ipv6.peers = bp; 5608 return 0; 5609 } 5610 5611 static void __net_exit ipv6_inetpeer_exit(struct net *net) 5612 { 5613 struct inet_peer_base *bp = net->ipv6.peers; 5614 5615 net->ipv6.peers = NULL; 5616 inetpeer_invalidate_tree(bp); 5617 kfree(bp); 5618 } 5619 5620 static struct pernet_operations ipv6_inetpeer_ops = { 5621 .init = ipv6_inetpeer_init, 5622 .exit = ipv6_inetpeer_exit, 5623 }; 5624 5625 static struct pernet_operations ip6_route_net_late_ops = { 5626 .init = ip6_route_net_init_late, 5627 .exit = ip6_route_net_exit_late, 5628 }; 5629 5630 static struct notifier_block ip6_route_dev_notifier = { 5631 .notifier_call = ip6_route_dev_notify, 5632 .priority = ADDRCONF_NOTIFY_PRIORITY - 10, 5633 }; 5634 5635 void __init ip6_route_init_special_entries(void) 5636 { 5637 /* Registering of the loopback is done before this portion of code, 5638 * the loopback reference in rt6_info will not be taken, do it 5639 * manually for init_net */ 5640 init_net.ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = init_net.loopback_dev; 5641 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; 5642 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5643 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5644 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev; 5645 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5646 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev; 5647 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5648 #endif 5649 } 5650 5651 int __init ip6_route_init(void) 5652 { 5653 int ret; 5654 int cpu; 5655 5656 ret = -ENOMEM; 5657 ip6_dst_ops_template.kmem_cachep = 5658 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0, 5659 SLAB_HWCACHE_ALIGN, NULL); 5660 if (!ip6_dst_ops_template.kmem_cachep) 5661 goto out; 5662 5663 ret = dst_entries_init(&ip6_dst_blackhole_ops); 5664 if (ret) 5665 goto out_kmem_cache; 5666 5667 ret = register_pernet_subsys(&ipv6_inetpeer_ops); 5668 if (ret) 5669 goto out_dst_entries; 5670 5671 ret = register_pernet_subsys(&ip6_route_net_ops); 5672 if (ret) 5673 goto out_register_inetpeer; 5674 5675 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep; 5676 5677 ret = fib6_init(); 5678 if (ret) 5679 goto out_register_subsys; 5680 5681 ret = xfrm6_init(); 5682 if (ret) 5683 goto out_fib6_init; 5684 5685 ret = fib6_rules_init(); 5686 if (ret) 5687 goto xfrm6_init; 5688 5689 ret = register_pernet_subsys(&ip6_route_net_late_ops); 5690 if (ret) 5691 goto fib6_rules_init; 5692 5693 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE, 5694 inet6_rtm_newroute, NULL, 0); 5695 if (ret < 0) 5696 goto out_register_late_subsys; 5697 5698 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE, 5699 inet6_rtm_delroute, NULL, 0); 5700 if (ret < 0) 5701 goto out_register_late_subsys; 5702 5703 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, 5704 inet6_rtm_getroute, NULL, 5705 RTNL_FLAG_DOIT_UNLOCKED); 5706 if (ret < 0) 5707 goto out_register_late_subsys; 5708 5709 ret = register_netdevice_notifier(&ip6_route_dev_notifier); 5710 if (ret) 5711 goto out_register_late_subsys; 5712 5713 for_each_possible_cpu(cpu) { 5714 struct 
uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); 5715 5716 INIT_LIST_HEAD(&ul->head); 5717 spin_lock_init(&ul->lock); 5718 } 5719 5720 out: 5721 return ret; 5722 5723 out_register_late_subsys: 5724 rtnl_unregister_all(PF_INET6); 5725 unregister_pernet_subsys(&ip6_route_net_late_ops); 5726 fib6_rules_init: 5727 fib6_rules_cleanup(); 5728 xfrm6_init: 5729 xfrm6_fini(); 5730 out_fib6_init: 5731 fib6_gc_cleanup(); 5732 out_register_subsys: 5733 unregister_pernet_subsys(&ip6_route_net_ops); 5734 out_register_inetpeer: 5735 unregister_pernet_subsys(&ipv6_inetpeer_ops); 5736 out_dst_entries: 5737 dst_entries_destroy(&ip6_dst_blackhole_ops); 5738 out_kmem_cache: 5739 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); 5740 goto out; 5741 } 5742 5743 void ip6_route_cleanup(void) 5744 { 5745 unregister_netdevice_notifier(&ip6_route_dev_notifier); 5746 unregister_pernet_subsys(&ip6_route_net_late_ops); 5747 fib6_rules_cleanup(); 5748 xfrm6_fini(); 5749 fib6_gc_cleanup(); 5750 unregister_pernet_subsys(&ipv6_inetpeer_ops); 5751 unregister_pernet_subsys(&ip6_route_net_ops); 5752 dst_entries_destroy(&ip6_dst_blackhole_ops); 5753 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); 5754 } 5755
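/* Usage sketch (illustrative userspace example; not part of the kernel
 * build, and every name below comes from the standard uapi headers):
 * a minimal RTM_GETROUTE query against the handlers registered above.
 * It exercises the rules enforced by inet6_rtm_valid_getroute_req()
 * once the socket opts in to strict checking: rtm_dst_len must be 128
 * when RTA_DST is present, header fields such as rtm_table and
 * rtm_protocol must be zero, and only the whitelisted attributes
 * (RTA_SRC/DST/IIF/OIF/MARK/UID/SPORT/DPORT/IP_PROTO) may appear.
 *
 *	#include <string.h>
 *	#include <unistd.h>
 *	#include <arpa/inet.h>
 *	#include <sys/socket.h>
 *	#include <linux/netlink.h>
 *	#include <linux/rtnetlink.h>
 *
 *	int main(void)
 *	{
 *		struct {
 *			struct nlmsghdr nlh;
 *			struct rtmsg rtm;
 *			struct rtattr rta;
 *			struct in6_addr dst;
 *		} req;	// packs without padding on common ABIs
 *		struct sockaddr_nl sa = { .nl_family = AF_NETLINK };
 *		char buf[4096];
 *		int one = 1;
 *		int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
 *
 *		if (fd < 0)
 *			return 1;
 *		// opt in to strict validation (kernel >= 4.20); without
 *		// this the deprecated, lenient parse above is used instead
 *		setsockopt(fd, SOL_NETLINK, NETLINK_GET_STRICT_CHK,
 *			   &one, sizeof(one));
 *
 *		memset(&req, 0, sizeof(req));
 *		req.nlh.nlmsg_len = sizeof(req);
 *		req.nlh.nlmsg_type = RTM_GETROUTE;
 *		req.nlh.nlmsg_flags = NLM_F_REQUEST;
 *		req.rtm.rtm_family = AF_INET6;
 *		req.rtm.rtm_dst_len = 128;	// required with RTA_DST
 *		req.rta.rta_type = RTA_DST;
 *		req.rta.rta_len = RTA_LENGTH(sizeof(req.dst));
 *		inet_pton(AF_INET6, "2001:db8::1", &req.dst);
 *
 *		if (sendto(fd, &req, sizeof(req), 0,
 *			   (struct sockaddr *)&sa, sizeof(sa)) < 0)
 *			return 1;
 *		// the reply is an RTM_NEWROUTE built by rt6_fill_node()
 *		if (recv(fd, buf, sizeof(buf), 0) < 0)
 *			return 1;
 *		close(fd);
 *		return 0;
 *	}
 */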