/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable.  otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <trace/events/fib6.h>

#include <linux/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};

static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ip6_default_advmss(const struct dst_entry *dst);
static unsigned int ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void ip6_dst_destroy(struct dst_entry *);
static void ip6_dst_ifdown(struct dst_entry *,
			   struct net_device *dev, int how);
static int ip6_dst_gc(struct dst_ops *ops);

static int ip6_pkt_discard(struct sk_buff *skb);
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void ip6_link_failure(struct sk_buff *skb);
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu);
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
			    struct sk_buff *skb);
static void rt6_dst_from_metrics_check(struct rt6_info *rt);
static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct rt6_info *rt);
static int rt6_fill_node(struct net *net,
			 struct sk_buff *skb, struct rt6_info *rt,
			 struct in6_addr *dst, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct rt6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev,
					   unsigned int pref);
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev);
#endif

struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);

static void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

static void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}

static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}

static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
{
	return dst_metrics_write_ptr(rt->dst.from);
}

static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (rt->rt6i_flags & RTF_PCPU)
		return rt6_pcpu_cow_metrics(rt);
	else if (rt->rt6i_flags & RTF_CACHE)
		return NULL;
	else
		return dst_cow_metrics_generic(dst, old);
}

static inline const void *choose_neigh_daddr(struct rt6_info *rt,
					     struct sk_buff *skb,
					     const void *daddr)
{
	struct in6_addr *p = &rt->rt6i_gateway;

	if (!ipv6_addr_any(p))
		return (const void *)p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}

static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
					  struct sk_buff *skb,
					  const void *daddr)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct neighbour *n;

	daddr = choose_neigh_daddr(rt, skb, daddr);
	n = __ipv6_neigh_lookup(dst->dev, daddr);
	if (n)
		return n;
	return neigh_create(&nd_tbl, daddr, dst->dev);
}
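/* Added commentary: choose_neigh_daddr() picks the neighbour key used by
 * the lookup above and the confirm helper below.  For a gateway route the
 * key is rt6i_gateway; for an on-link route it is the packet's own daddr
 * (or the caller-supplied daddr when no skb is available).
 */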
static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(rt, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}

static struct dst_ops ip6_dst_ops_template = {
	.family			= AF_INET6,
	.gc			= ip6_dst_gc,
	.gc_thresh		= 1024,
	.check			= ip6_dst_check,
	.default_advmss		= ip6_default_advmss,
	.mtu			= ip6_mtu,
	.cow_metrics		= ipv6_cow_metrics,
	.destroy		= ip6_dst_destroy,
	.ifdown			= ip6_dst_ifdown,
	.negative_advice	= ip6_negative_advice,
	.link_failure		= ip6_link_failure,
	.update_pmtu		= ip6_rt_update_pmtu,
	.redirect		= rt6_do_redirect,
	.local_out		= __ip6_local_out,
	.neigh_lookup		= ip6_neigh_lookup,
	.confirm_neigh		= ip6_confirm_neigh,
};

static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

static struct dst_ops ip6_dst_blackhole_ops = {
	.family			= AF_INET6,
	.destroy		= ip6_dst_destroy,
	.check			= ip6_dst_check,
	.mtu			= ip6_blackhole_mtu,
	.default_advmss		= ip6_default_advmss,
	.update_pmtu		= ip6_rt_blackhole_update_pmtu,
	.redirect		= ip6_rt_blackhole_redirect,
	.cow_metrics		= dst_cow_metrics_generic,
	.neigh_lookup		= ip6_neigh_lookup,
};

static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32)0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32)0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32)0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

#endif

static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_siblings);
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}

/* allocate dst with ip6_dst_ops */
static struct rt6_info *__ip6_dst_alloc(struct net *net,
					struct net_device *dev,
					int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}

struct rt6_info *ip6_dst_alloc(struct net *net,
			       struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);

	if (rt) {
		rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
		if (!rt->rt6i_pcpu) {
			dst_release_immediate(&rt->dst);
			return NULL;
		}
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);

static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct rt6_exception_bucket *bucket;
	struct dst_entry *from = dst->from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	free_percpu(rt->rt6i_pcpu);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
	if (bucket) {
		rt->rt6i_exception_bucket = NULL;
		kfree(bucket);
	}

	dst->from = NULL;
	dst_release(from);
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);

		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}

static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

static bool rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (rt->dst.from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
		       rt6_check_expired((struct rt6_info *)rt->dst.from);
	}
	return false;
}

static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
					     struct flowi6 *fl6, int oif,
					     int strict)
{
	struct rt6_info *sibling, *next_sibling;
	int route_chosen;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(fl6, NULL);

	route_chosen = fl6->mp_hash % (match->rt6i_nsiblings + 1);
	/* Don't change the route if route_chosen == 0
	 * (the sibling list does not include the route itself)
	 */
	if (route_chosen)
		list_for_each_entry_safe(sibling, next_sibling,
					 &match->rt6i_siblings, rt6i_siblings) {
			route_chosen--;
			if (route_chosen == 0) {
				if (rt6_score_route(sibling, oif, strict) < 0)
					break;
				match = sibling;
				break;
			}
		}
	return match;
}
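/* Illustrative example (not from the original source): with three
 * equal-cost siblings A->B->C, A->rt6i_nsiblings == 2, so
 * mp_hash % 3 yields 0, 1 or 2; 0 keeps A itself, while 1 or 2 walks
 * that many entries down A's sibling list, subject to the
 * rt6_score_route() check above.
 */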
/*
 *	Route lookup. rcu_read_lock() should be held.
 */

static inline struct rt6_info *rt6_device_match(struct net *net,
						struct rt6_info *rt,
						const struct in6_addr *saddr,
						int oif,
						int flags)
{
	struct rt6_info *local = NULL;
	struct rt6_info *sprt;

	if (!oif && ipv6_addr_any(saddr))
		goto out;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->dst.rt6_next)) {
		struct net_device *dev = sprt->dst.dev;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
			if (dev->flags & IFF_LOOPBACK) {
				if (!sprt->rt6i_idev ||
				    sprt->rt6i_idev->dev->ifindex != oif) {
					if (flags & RT6_LOOKUP_F_IFACE)
						continue;
					if (local &&
					    local->rt6i_idev->dev->ifindex == oif)
						continue;
				}
				local = sprt;
			}
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif) {
		if (local)
			return local;

		if (flags & RT6_LOOKUP_F_IFACE)
			return net->ipv6.ip6_null_entry;
	}
out:
	return rt;
}

#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}
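/* Added note: the neighbour solicitation itself is deferred to a
 * workqueue because rt6_probe() below runs in atomic context (under
 * rcu_read_lock_bh(), and briefly the neighbour lock); work->dev holds a
 * device reference until rt6_probe_deferred() has sent the probe.
 */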
static void rt6_probe(struct rt6_info *rt)
{
	struct __rt6_probe_work *work;
	struct neighbour *neigh;
	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
		return;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		work = NULL;
		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated +
			       rt->rt6i_idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = rt->rt6i_gateway;
		dev_hold(rt->dst.dev);
		work->dev = rt->dst.dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif

/*
 *	Default Router Selection (RFC 2461 6.3.6)
 */
static inline int rt6_check_dev(struct rt6_info *rt, int oif)
{
	struct net_device *dev = rt->dst.dev;

	if (!oif || dev->ifindex == oif)
		return 2;
	if ((dev->flags & IFF_LOOPBACK) &&
	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
		return 1;
	return 0;
}

static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
{
	struct neighbour *neigh;
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;

	if (rt->rt6i_flags & RTF_NONEXTHOP ||
	    !(rt->rt6i_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}

static int rt6_score_route(struct rt6_info *rt, int oif,
			   int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);

		if (n < 0)
			return n;
	}
	return m;
}

static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
				   int *mpri, struct rt6_info *match,
				   bool *do_rr)
{
	int m;
	bool match_do_rr = false;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *dev = rt->dst.dev;

	if (dev && !netif_carrier_ok(dev) &&
	    idev->cnf.ignore_routes_with_linkdown &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (rt6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}

static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
				     struct rt6_info *leaf,
				     struct rt6_info *rr_head,
				     u32 metric, int oif, int strict,
				     bool *do_rr)
{
	struct rt6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rcu_dereference(rt->dst.rt6_next)) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->dst.rt6_next)) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	for (rt = cont; rt; rt = rcu_dereference(rt->dst.rt6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
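/* Walk-order sketch (illustrative): with same-metric entries A->B->C->D
 * and rr_head == C, find_rr_leaf() scores C and D first, then wraps to
 * A and B; entries whose metric differs are parked in 'cont' and only
 * scored when nothing else matched.
 */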
static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
				   int oif, int strict)
{
	struct rt6_info *leaf = rcu_dereference(fn->leaf);
	struct rt6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.ip6_null_entry)
		return net->ipv6.ip6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->rt6i_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->rt6i_src.plen)
		key_plen = rt0->rt6i_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.ip6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct rt6_info *next = rcu_dereference(rt0->dst.rt6_next);

		/* no entries matched; do round-robin */
		if (!next || next->rt6i_metric != rt0->rt6i_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->rt6i_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->rt6i_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.ip6_null_entry;
}

static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
{
	return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
}

#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *)opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info))
		return -EINVAL;

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3)
		return -EINVAL;
	else if (rinfo->prefix_len > 128)
		return -EINVAL;
	else if (rinfo->prefix_len > 64 && rinfo->length < 2)
		return -EINVAL;
	else if (rinfo->prefix_len > 0 && rinfo->length < 1)
		return -EINVAL;

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			rt6_clean_expires(rt);
		else
			rt6_set_expires(rt, jiffies + HZ * lifetime);

		ip6_rt_put(rt);
	}
	return 0;
}
#endif

static struct fib6_node *fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;

	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}

static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
			  bool null_fallback)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (null_fallback) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}

static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6, int flags)
{
	struct rt6_info *rt, *rt_cache;
	struct fib6_node *fn;

	rcu_read_lock();
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	rt = rcu_dereference(fn->leaf);
	if (!rt) {
		rt = net->ipv6.ip6_null_entry;
	} else {
		rt = rt6_device_match(net, rt, &fl6->saddr,
				      fl6->flowi6_oif, flags);
		if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
			rt = rt6_multipath_select(rt, fl6,
						  fl6->flowi6_oif, flags);
	}
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}
	/* Search through exception table */
	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
	if (rt_cache)
		rt = rt_cache;

	if (ip6_hold_safe(net, &rt, true))
		dst_use_noref(&rt->dst, jiffies);

	rcu_read_unlock();

	trace_fib6_table_lookup(net, rt, table, fl6);

	return rt;
}

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   int flags)
{
	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);

struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *)dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);
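/* Usage sketch for rt6_lookup() (illustrative, not from the original
 * file); the caller owns the returned reference:
 *
 *	struct rt6_info *rt = rt6_lookup(net, &daddr, NULL, 0, 0);
 *
 *	if (rt) {
 *		// ... inspect rt->dst, rt->rt6i_gateway, etc. ...
 *		ip6_rt_put(rt);
 *	}
 */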
/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason,
 * the route is released.
 * The caller must hold a dst reference before calling this.
 */

static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
			struct mx6_config *mxc,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->rt6i_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct rt6_info *rt)
{
	struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
	struct mx6_config mxc = { .mx = NULL, };

	/* Hold dst to account for the reference from the fib6 tree */
	dst_hold(&rt->dst);
	return __ip6_ins_rt(rt, &info, &mxc, NULL);
}

/* called with rcu_lock held */
static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
{
	struct net_device *dev = rt->dst.dev;

	if (rt->rt6i_flags & RTF_LOCAL) {
		/* For copies of local routes, dst->dev needs to be the
		 * device itself if it is a master device, the master
		 * device if the device is enslaved, and the loopback
		 * as the default.
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->rt6i_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}

static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = (struct rt6_info *)ort->dst.from;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(ort);
	rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
	rcu_read_unlock();
	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->rt6i_metric = 0;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->rt6i_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}

static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
{
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
	rcu_read_unlock();
	if (!pcpu_rt)
		return NULL;
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
		rt6_dst_from_metrics_check(pcpu_rt);

	return pcpu_rt;
}
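/* Added note: each fib6 route carries a per-cpu array of RTF_PCPU clones
 * (rt->rt6i_pcpu).  rt6_get_pcpu_route() above returns the current CPU's
 * clone when one exists; otherwise rt6_make_pcpu_route() below installs a
 * fresh clone with cmpxchg() - the BUG_ON(prev) documents that no
 * concurrent setter is expected for this CPU's slot.
 */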
static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		struct net *net = dev_net(rt->dst.dev);

		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	rt6_dst_from_metrics_check(pcpu_rt);
	return pcpu_rt;
}

/* exception hash table implementation */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	rt6_ex->rt6i->rt6i_node = NULL;
	hlist_del_rcu(&rt6_ex->hlist);
	rt6_release(rt6_ex->rt6i);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
	net->ipv6.rt6_stats->fib_rt_cache--;
}

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}

static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
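/* Example (illustrative): with subtrees disabled, the bucket index for a
 * destination is hash_32(jhash(dst, sizeof(*dst), seed),
 * FIB6_EXCEPTION_BUCKET_SIZE_SHIFT), so a given daddr always lands in the
 * same one of the FIB6_EXCEPTION_BUCKET_SIZE chains.
 */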
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

static int rt6_insert_exception(struct rt6_info *nrt,
				struct rt6_info *ort)
{
	struct net *net = dev_net(ort->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	/* ort can't be a cache or pcpu route */
	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = (struct rt6_info *)ort->dst.from;
	WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));

	spin_lock_bh(&rt6_exception_lock);

	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->rt6i_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
	nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	atomic_inc(&nrt->rt6i_ref);
	nrt->rt6i_node = ort->rt6i_node;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		fib6_update_sernum(ort);
		fib6_force_start_gc(net);
	}

	return err;
}

void rt6_flush_exceptions(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() from recreating the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}

/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->rt6i_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}
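/* Added note: a hit from rt6_find_cached_rt() is an RTF_CACHE clone
 * (e.g. a PMTU exception created in __ip6_rt_update_pmtu() below); the
 * lookup key is daddr alone unless the parent route sits in a source
 * subtree, in which case saddr is hashed in as well.
 */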
/* Remove the passed in cached rt from the hash table that contains it */
int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_info *from = (struct rt6_info *)rt->dst.from;
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err;

	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->rt6i_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}

/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_info *from = (struct rt6_info *)rt->dst.from;
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;

	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return;

	rcu_read_lock();
	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->rt6i_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

	rcu_read_unlock();
}

static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist)
				rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
			bucket++;
		}
	}
}

static void rt6_exceptions_update_pmtu(struct rt6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;
				/* For RTF_CACHE with rt6i_pmtu == 0
				 * (i.e. a redirected route),
				 * the metrics of its rt->dst.from have
				 * already been updated.
				 */
				if (entry->rt6i_pmtu && entry->rt6i_pmtu > mtu)
					entry->rt6i_pmtu = mtu;
			}
			bucket++;
		}
	}
}

#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}

static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* We are pruning and obsoleting aged-out and non-gateway exceptions
	 * even if others still have references to them, so that on the next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones - are pruned when
	 * expired, independently of their aging, as per RFC 8201 section 4.
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES) &&
	    time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
		RT6_TRACE("aging clone %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	} else if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway);
		if (neigh) {
			neigh_flags = neigh->flags;
			neigh_release(neigh);
		}
		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (__rt6_check_expired(rt)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}
	gc_args->more++;
}

void rt6_age_exceptions(struct rt6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock_bh(&rt6_exception_lock);
}
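/* Added summary (commentary only): ip6_pol_route() below resolves to one
 * of four results: the null entry on failure, an RTF_CACHE exception hit,
 * an uncached RTF_CACHE clone for the FLOWI_FLAG_KNOWN_NH corner case, or
 * (the common path) this CPU's RTF_PCPU copy of the fib6 entry.
 */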
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6, int flags)
{
	struct fib6_node *fn, *saved_fn;
	struct rt6_info *rt, *rt_cache;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	rt = rt6_select(net, fn, oif, strict);
	if (rt->rt6i_nsiblings)
		rt = rt6_multipath_select(rt, fl6, oif, strict);
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	/* Search through exception table */
	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
	if (rt_cache)
		rt = rt_cache;

	if (rt == net->ipv6.ip6_null_entry) {
		rcu_read_unlock();
		dst_hold(&rt->dst);
		trace_fib6_table_lookup(net, rt, table, fl6);
		return rt;
	} else if (rt->rt6i_flags & RTF_CACHE) {
		if (ip6_hold_safe(net, &rt, true)) {
			dst_use_noref(&rt->dst, jiffies);
			rt6_dst_from_metrics_check(rt);
		}
		rcu_read_unlock();
		trace_fib6_table_lookup(net, rt, table, fl6);
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(rt->rt6i_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */

		struct rt6_info *uncached_rt;

		if (ip6_hold_safe(net, &rt, true)) {
			dst_use_noref(&rt->dst, jiffies);
		} else {
			rcu_read_unlock();
			uncached_rt = rt;
			goto uncached_rt_out;
		}
		rcu_read_unlock();

		uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
		dst_release(&rt->dst);

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

uncached_rt_out:
		trace_fib6_table_lookup(net, uncached_rt, table, fl6);
		return uncached_rt;

	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		dst_use_noref(&rt->dst, jiffies);
		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(rt);

		if (!pcpu_rt) {
			/* atomic_inc_not_zero() is needed when using rcu */
			if (atomic_inc_not_zero(&rt->rt6i_ref)) {
				/* No dst_hold() on rt is needed because grabbing
				 * rt->rt6i_ref makes sure rt can't be released.
				 */
				pcpu_rt = rt6_make_pcpu_route(rt);
				rt6_release(rt);
			} else {
				/* rt is already removed from tree */
				pcpu_rt = net->ipv6.ip6_null_entry;
				dst_hold(&pcpu_rt->dst);
			}
		}
		local_bh_enable();
		rcu_read_unlock();
		trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);

static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
					    struct flowi6 *fl6, int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
}

struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6, int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);

static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = icmp6_hdr(skb);
	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
out:
	memset(keys, 0, sizeof(*keys));
	keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
	keys->addrs.v6addrs.src = key_iph->saddr;
	keys->addrs.v6addrs.dst = key_iph->daddr;
	keys->tags.flow_label = ip6_flowinfo(key_iph);
	keys->basic.ip_proto = key_iph->nexthdr;
}

/* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
{
	struct flow_keys hash_keys;

	if (skb) {
		ip6_multipath_l3_keys(skb, &hash_keys);
		return flow_hash_from_keys(&hash_keys);
	}

	return get_hash_from_flowi6(fl6);
}

void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
	skb_dst_drop(skb);
	skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
}

static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
					     struct flowi6 *fl6, int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
}
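/* Usage sketch (illustrative): most callers reach the function below
 * through the ip6_route_output() inline wrapper in
 * include/net/ip6_route.h, and check dst->error rather than for NULL:
 *
 *	dst = ip6_route_output(net, sk, &fl6);
 *	if (dst->error) {
 *		dst_release(dst);
 *		// handle the lookup failure
 *	}
 */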
struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	if (rt6_need_strict(&fl6->daddr)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);

struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *)dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_DEAD, 0);
	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

		new = &rt->dst;
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
		rt->rt6i_metric = 0;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}

/*
 *	Destination cache support functions
 */

static void rt6_dst_from_metrics_check(struct rt6_info *rt)
{
	if (rt->dst.from &&
	    dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
		dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
}

static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
{
	u32 rt_cookie = 0;

	if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
		return NULL;

	if (rt6_check_expired(rt))
		return NULL;

	return &rt->dst;
}

static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
{
	if (!__rt6_check_expired(rt) &&
	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
	    rt6_check((struct rt6_info *)(rt->dst.from), cookie))
		return &rt->dst;
	else
		return NULL;
}
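/* Added note: because every IPv6 dst is born DST_OBSOLETE_FORCE_CHK,
 * sockets re-validate a cached dst via ->check() with the cookie they
 * saved (derived from the fib6 node's sernum); a table change that bumps
 * the sernum makes rt6_check() above return NULL.
 */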
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rt6_info *rt;

	rt = (struct rt6_info *)dst;

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */
	rt6_dst_from_metrics_check(rt);

	if (rt->rt6i_flags & RTF_PCPU ||
	    (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from))
		return rt6_dst_from_check(rt, cookie);
	else
		return rt6_check(rt, cookie);
}

static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			if (rt6_check_expired(rt)) {
				ip6_del_rt(rt);
				dst = NULL;
			}
		} else {
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}

static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *)skb_dst(skb);
	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			if (dst_hold_safe(&rt->dst))
				ip6_del_rt(rt);
		} else {
			struct fib6_node *fn;

			rcu_read_lock();
			fn = rcu_dereference(rt->rt6i_node);
			if (fn && (rt->rt6i_flags & RTF_DEFAULT))
				fn->fn_sernum = -1;
			rcu_read_unlock();
		}
	}
}

static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
	struct net *net = dev_net(rt->dst.dev);

	rt->rt6i_flags |= RTF_MODIFIED;
	rt->rt6i_pmtu = mtu;
	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}

static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
{
	return !(rt->rt6i_flags & RTF_CACHE) &&
	       (rt->rt6i_flags & RTF_PCPU ||
		rcu_access_pointer(rt->rt6i_node));
}

static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	if (rt6->rt6i_flags & RTF_LOCAL)
		return;

	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}
	dst_confirm_neigh(dst, daddr);
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
		/* update rt6_ex->stamp for cache */
		if (rt6->rt6i_flags & RTF_CACHE)
			rt6_update_exception_stamp_rt(rt6);
	} else if (daddr) {
		struct rt6_info *nrt6;

		nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);
			if (rt6_insert_exception(nrt6, rt6))
				dst_release_immediate(&nrt6->dst);
		}
	}
}

static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu)
{
	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
}
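/* Example flow (illustrative): on an ICMPv6 Packet Too Big,
 * __ip6_rt_update_pmtu() above either stamps rt6i_pmtu directly on a
 * cache/pcpu clone, or clones the route with ip6_rt_cache_alloc() and
 * parks the clone in the exception table via rt6_insert_exception(), so
 * later lookups pick the reduced MTU up through rt6_find_cached_rt().
 */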
mark : IP6_REPLY_MARK(net, skb->mark); 2092 fl6.daddr = iph->daddr; 2093 fl6.saddr = iph->saddr; 2094 fl6.flowlabel = ip6_flowinfo(iph); 2095 fl6.flowi6_uid = uid; 2096 2097 dst = ip6_route_output(net, NULL, &fl6); 2098 if (!dst->error) 2099 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu)); 2100 dst_release(dst); 2101 } 2102 EXPORT_SYMBOL_GPL(ip6_update_pmtu); 2103 2104 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) 2105 { 2106 struct dst_entry *dst; 2107 2108 ip6_update_pmtu(skb, sock_net(sk), mtu, 2109 sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid); 2110 2111 dst = __sk_dst_get(sk); 2112 if (!dst || !dst->obsolete || 2113 dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) 2114 return; 2115 2116 bh_lock_sock(sk); 2117 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) 2118 ip6_datagram_dst_update(sk, false); 2119 bh_unlock_sock(sk); 2120 } 2121 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu); 2122 2123 /* Handle redirects */ 2124 struct ip6rd_flowi { 2125 struct flowi6 fl6; 2126 struct in6_addr gateway; 2127 }; 2128 2129 static struct rt6_info *__ip6_route_redirect(struct net *net, 2130 struct fib6_table *table, 2131 struct flowi6 *fl6, 2132 int flags) 2133 { 2134 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; 2135 struct rt6_info *rt, *rt_cache; 2136 struct fib6_node *fn; 2137 2138 /* Get the "current" route for this destination and 2139 * check if the redirect has come from the appropriate router. 2140 * 2141 * RFC 4861 specifies that redirects should only be 2142 * accepted if they come from the nexthop to the target. 2143 * Due to the way the routes are chosen, this notion 2144 * is a bit fuzzy and one might need to check all possible 2145 * routes. 2146 */ 2147 2148 rcu_read_lock(); 2149 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 2150 restart: 2151 for_each_fib6_node_rt_rcu(fn) { 2152 if (rt6_check_expired(rt)) 2153 continue; 2154 if (rt->dst.error) 2155 break; 2156 if (!(rt->rt6i_flags & RTF_GATEWAY)) 2157 continue; 2158 if (fl6->flowi6_oif != rt->dst.dev->ifindex) 2159 continue; 2160 /* rt_cache's gateway might be different from its 'parent' 2161 * in the case of an ip redirect. 2162 * So we keep searching in the exception table if the gateway 2163 * is different.
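 * (rt6_find_cached_rt() below does that exception table lookup.)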
2164 */ 2165 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) { 2166 rt_cache = rt6_find_cached_rt(rt, 2167 &fl6->daddr, 2168 &fl6->saddr); 2169 if (rt_cache && 2170 ipv6_addr_equal(&rdfl->gateway, 2171 &rt_cache->rt6i_gateway)) { 2172 rt = rt_cache; 2173 break; 2174 } 2175 continue; 2176 } 2177 break; 2178 } 2179 2180 if (!rt) 2181 rt = net->ipv6.ip6_null_entry; 2182 else if (rt->dst.error) { 2183 rt = net->ipv6.ip6_null_entry; 2184 goto out; 2185 } 2186 2187 if (rt == net->ipv6.ip6_null_entry) { 2188 fn = fib6_backtrack(fn, &fl6->saddr); 2189 if (fn) 2190 goto restart; 2191 } 2192 2193 out: 2194 ip6_hold_safe(net, &rt, true); 2195 2196 rcu_read_unlock(); 2197 2198 trace_fib6_table_lookup(net, rt, table, fl6); 2199 return rt; 2200 } 2201 2202 static struct dst_entry *ip6_route_redirect(struct net *net, 2203 const struct flowi6 *fl6, 2204 const struct in6_addr *gateway) 2205 { 2206 int flags = RT6_LOOKUP_F_HAS_SADDR; 2207 struct ip6rd_flowi rdfl; 2208 2209 rdfl.fl6 = *fl6; 2210 rdfl.gateway = *gateway; 2211 2212 return fib6_rule_lookup(net, &rdfl.fl6, 2213 flags, __ip6_route_redirect); 2214 } 2215 2216 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, 2217 kuid_t uid) 2218 { 2219 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2220 struct dst_entry *dst; 2221 struct flowi6 fl6; 2222 2223 memset(&fl6, 0, sizeof(fl6)); 2224 fl6.flowi6_iif = LOOPBACK_IFINDEX; 2225 fl6.flowi6_oif = oif; 2226 fl6.flowi6_mark = mark; 2227 fl6.daddr = iph->daddr; 2228 fl6.saddr = iph->saddr; 2229 fl6.flowlabel = ip6_flowinfo(iph); 2230 fl6.flowi6_uid = uid; 2231 2232 dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr); 2233 rt6_do_redirect(dst, NULL, skb); 2234 dst_release(dst); 2235 } 2236 EXPORT_SYMBOL_GPL(ip6_redirect); 2237 2238 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif, 2239 u32 mark) 2240 { 2241 const struct ipv6hdr *iph = ipv6_hdr(skb); 2242 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb); 2243 struct dst_entry *dst; 2244 struct flowi6 fl6; 2245 2246 memset(&fl6, 0, sizeof(fl6)); 2247 fl6.flowi6_iif = LOOPBACK_IFINDEX; 2248 fl6.flowi6_oif = oif; 2249 fl6.flowi6_mark = mark; 2250 fl6.daddr = msg->dest; 2251 fl6.saddr = iph->daddr; 2252 fl6.flowi6_uid = sock_net_uid(net, NULL); 2253 2254 dst = ip6_route_redirect(net, &fl6, &iph->saddr); 2255 rt6_do_redirect(dst, NULL, skb); 2256 dst_release(dst); 2257 } 2258 2259 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) 2260 { 2261 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark, 2262 sk->sk_uid); 2263 } 2264 EXPORT_SYMBOL_GPL(ip6_sk_redirect); 2265 2266 static unsigned int ip6_default_advmss(const struct dst_entry *dst) 2267 { 2268 struct net_device *dev = dst->dev; 2269 unsigned int mtu = dst_mtu(dst); 2270 struct net *net = dev_net(dev); 2271 2272 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); 2273 2274 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) 2275 mtu = net->ipv6.sysctl.ip6_rt_min_advmss; 2276 2277 /* 2278 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 2279 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2280 * IPV6_MAXPLEN is also valid and means: "any MSS, 2281 * rely only on pmtu discovery" 2282 */ 2283 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr)) 2284 mtu = IPV6_MAXPLEN; 2285 return mtu; 2286 } 2287 2288 static unsigned int ip6_mtu(const struct dst_entry *dst) 2289 { 2290 const struct rt6_info *rt = (const struct rt6_info *)dst; 2291 unsigned int mtu = rt->rt6i_pmtu; 2292 struct inet6_dev *idev; 2293 2294 if (mtu) 2295 goto out; 2296 2297 mtu = dst_metric_raw(dst, RTAX_MTU); 2298 if (mtu) 2299 goto out; 2300 2301 mtu = IPV6_MIN_MTU; 2302 2303 rcu_read_lock(); 2304 idev = __in6_dev_get(dst->dev); 2305 if (idev) 2306 mtu = idev->cnf.mtu6; 2307 rcu_read_unlock(); 2308 2309 out: 2310 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 2311 2312 return mtu - lwtunnel_headroom(dst->lwtstate, mtu); 2313 } 2314 2315 struct dst_entry *icmp6_dst_alloc(struct net_device *dev, 2316 struct flowi6 *fl6) 2317 { 2318 struct dst_entry *dst; 2319 struct rt6_info *rt; 2320 struct inet6_dev *idev = in6_dev_get(dev); 2321 struct net *net = dev_net(dev); 2322 2323 if (unlikely(!idev)) 2324 return ERR_PTR(-ENODEV); 2325 2326 rt = ip6_dst_alloc(net, dev, 0); 2327 if (unlikely(!rt)) { 2328 in6_dev_put(idev); 2329 dst = ERR_PTR(-ENOMEM); 2330 goto out; 2331 } 2332 2333 rt->dst.flags |= DST_HOST; 2334 rt->dst.output = ip6_output; 2335 rt->rt6i_gateway = fl6->daddr; 2336 rt->rt6i_dst.addr = fl6->daddr; 2337 rt->rt6i_dst.plen = 128; 2338 rt->rt6i_idev = idev; 2339 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0); 2340 2341 /* Add this dst into uncached_list so that rt6_ifdown() can 2342 * do proper release of the net_device 2343 */ 2344 rt6_uncached_list_add(rt); 2345 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); 2346 2347 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0); 2348 2349 out: 2350 return dst; 2351 } 2352 2353 static int ip6_dst_gc(struct dst_ops *ops) 2354 { 2355 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); 2356 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; 2357 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size; 2358 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; 2359 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; 2360 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; 2361 int entries; 2362 2363 entries = dst_entries_get_fast(ops); 2364 if (time_after(rt_last_gc + rt_min_interval, jiffies) && 2365 entries <= rt_max_size) 2366 goto out; 2367 2368 net->ipv6.ip6_rt_gc_expire++; 2369 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true); 2370 entries = dst_entries_get_slow(ops); 2371 if (entries < ops->gc_thresh) 2372 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; 2373 out: 2374 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity; 2375 return entries > rt_max_size; 2376 } 2377 2378 static int ip6_convert_metrics(struct mx6_config *mxc, 2379 const struct fib6_config *cfg) 2380 { 2381 bool ecn_ca = false; 2382 struct nlattr *nla; 2383 int remaining; 2384 u32 *mp; 2385 2386 if (!cfg->fc_mx) 2387 return 0; 2388 2389 mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL); 2390 if (unlikely(!mp)) 2391 return -ENOMEM; 2392 2393 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) { 2394 int type = nla_type(nla); 2395 u32 val; 2396 2397 if (!type) 2398 continue; 2399 if (unlikely(type > RTAX_MAX)) 2400 goto err; 2401 2402 if (type == RTAX_CC_ALGO) { 2403 char tmp[TCP_CA_NAME_MAX]; 2404 2405 nla_strlcpy(tmp, nla, sizeof(tmp)); 2406 val = tcp_ca_get_key_by_name(tmp, &ecn_ca); 2407 if (val == TCP_CA_UNSPEC) 2408 goto err; 2409 
} else { 2410 val = nla_get_u32(nla); 2411 } 2412 if (type == RTAX_HOPLIMIT && val > 255) 2413 val = 255; 2414 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) 2415 goto err; 2416 2417 mp[type - 1] = val; 2418 __set_bit(type - 1, mxc->mx_valid); 2419 } 2420 2421 if (ecn_ca) { 2422 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid); 2423 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA; 2424 } 2425 2426 mxc->mx = mp; 2427 return 0; 2428 err: 2429 kfree(mp); 2430 return -EINVAL; 2431 } 2432 2433 static struct rt6_info *ip6_nh_lookup_table(struct net *net, 2434 struct fib6_config *cfg, 2435 const struct in6_addr *gw_addr) 2436 { 2437 struct flowi6 fl6 = { 2438 .flowi6_oif = cfg->fc_ifindex, 2439 .daddr = *gw_addr, 2440 .saddr = cfg->fc_prefsrc, 2441 }; 2442 struct fib6_table *table; 2443 struct rt6_info *rt; 2444 int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE; 2445 2446 table = fib6_get_table(net, cfg->fc_table); 2447 if (!table) 2448 return NULL; 2449 2450 if (!ipv6_addr_any(&cfg->fc_prefsrc)) 2451 flags |= RT6_LOOKUP_F_HAS_SADDR; 2452 2453 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags); 2454 2455 /* if table lookup failed, fall back to full lookup */ 2456 if (rt == net->ipv6.ip6_null_entry) { 2457 ip6_rt_put(rt); 2458 rt = NULL; 2459 } 2460 2461 return rt; 2462 } 2463 2464 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, 2465 struct netlink_ext_ack *extack) 2466 { 2467 struct net *net = cfg->fc_nlinfo.nl_net; 2468 struct rt6_info *rt = NULL; 2469 struct net_device *dev = NULL; 2470 struct inet6_dev *idev = NULL; 2471 struct fib6_table *table; 2472 int addr_type; 2473 int err = -EINVAL; 2474 2475 /* RTF_PCPU is an internal flag; can not be set by userspace */ 2476 if (cfg->fc_flags & RTF_PCPU) { 2477 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU"); 2478 goto out; 2479 } 2480 2481 /* RTF_CACHE is an internal flag; can not be set by userspace */ 2482 if (cfg->fc_flags & RTF_CACHE) { 2483 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE"); 2484 goto out; 2485 } 2486 2487 if (cfg->fc_dst_len > 128) { 2488 NL_SET_ERR_MSG(extack, "Invalid prefix length"); 2489 goto out; 2490 } 2491 if (cfg->fc_src_len > 128) { 2492 NL_SET_ERR_MSG(extack, "Invalid source address length"); 2493 goto out; 2494 } 2495 #ifndef CONFIG_IPV6_SUBTREES 2496 if (cfg->fc_src_len) { 2497 NL_SET_ERR_MSG(extack, 2498 "Specifying source address requires IPV6_SUBTREES to be enabled"); 2499 goto out; 2500 } 2501 #endif 2502 if (cfg->fc_ifindex) { 2503 err = -ENODEV; 2504 dev = dev_get_by_index(net, cfg->fc_ifindex); 2505 if (!dev) 2506 goto out; 2507 idev = in6_dev_get(dev); 2508 if (!idev) 2509 goto out; 2510 } 2511 2512 if (cfg->fc_metric == 0) 2513 cfg->fc_metric = IP6_RT_PRIO_USER; 2514 2515 err = -ENOBUFS; 2516 if (cfg->fc_nlinfo.nlh && 2517 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) { 2518 table = fib6_get_table(net, cfg->fc_table); 2519 if (!table) { 2520 pr_warn("NLM_F_CREATE should be specified when creating new route\n"); 2521 table = fib6_new_table(net, cfg->fc_table); 2522 } 2523 } else { 2524 table = fib6_new_table(net, cfg->fc_table); 2525 } 2526 2527 if (!table) 2528 goto out; 2529 2530 rt = ip6_dst_alloc(net, NULL, 2531 (cfg->fc_flags & RTF_ADDRCONF) ? 
0 : DST_NOCOUNT); 2532 2533 if (!rt) { 2534 err = -ENOMEM; 2535 goto out; 2536 } 2537 2538 if (cfg->fc_flags & RTF_EXPIRES) 2539 rt6_set_expires(rt, jiffies + 2540 clock_t_to_jiffies(cfg->fc_expires)); 2541 else 2542 rt6_clean_expires(rt); 2543 2544 if (cfg->fc_protocol == RTPROT_UNSPEC) 2545 cfg->fc_protocol = RTPROT_BOOT; 2546 rt->rt6i_protocol = cfg->fc_protocol; 2547 2548 addr_type = ipv6_addr_type(&cfg->fc_dst); 2549 2550 if (addr_type & IPV6_ADDR_MULTICAST) 2551 rt->dst.input = ip6_mc_input; 2552 else if (cfg->fc_flags & RTF_LOCAL) 2553 rt->dst.input = ip6_input; 2554 else 2555 rt->dst.input = ip6_forward; 2556 2557 rt->dst.output = ip6_output; 2558 2559 if (cfg->fc_encap) { 2560 struct lwtunnel_state *lwtstate; 2561 2562 err = lwtunnel_build_state(cfg->fc_encap_type, 2563 cfg->fc_encap, AF_INET6, cfg, 2564 &lwtstate, extack); 2565 if (err) 2566 goto out; 2567 rt->dst.lwtstate = lwtstate_get(lwtstate); 2568 if (lwtunnel_output_redirect(rt->dst.lwtstate)) { 2569 rt->dst.lwtstate->orig_output = rt->dst.output; 2570 rt->dst.output = lwtunnel_output; 2571 } 2572 if (lwtunnel_input_redirect(rt->dst.lwtstate)) { 2573 rt->dst.lwtstate->orig_input = rt->dst.input; 2574 rt->dst.input = lwtunnel_input; 2575 } 2576 } 2577 2578 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); 2579 rt->rt6i_dst.plen = cfg->fc_dst_len; 2580 if (rt->rt6i_dst.plen == 128) 2581 rt->dst.flags |= DST_HOST; 2582 2583 #ifdef CONFIG_IPV6_SUBTREES 2584 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len); 2585 rt->rt6i_src.plen = cfg->fc_src_len; 2586 #endif 2587 2588 rt->rt6i_metric = cfg->fc_metric; 2589 2590 /* We cannot add true routes via loopback here, as 2591 they would result in kernel looping; promote them to reject routes 2592 */ 2593 if ((cfg->fc_flags & RTF_REJECT) || 2594 (dev && (dev->flags & IFF_LOOPBACK) && 2595 !(addr_type & IPV6_ADDR_LOOPBACK) && 2596 !(cfg->fc_flags & RTF_LOCAL))) { 2597 /* hold loopback dev/idev if we haven't done so. */ 2598 if (dev != net->loopback_dev) { 2599 if (dev) { 2600 dev_put(dev); 2601 in6_dev_put(idev); 2602 } 2603 dev = net->loopback_dev; 2604 dev_hold(dev); 2605 idev = in6_dev_get(dev); 2606 if (!idev) { 2607 err = -ENODEV; 2608 goto out; 2609 } 2610 } 2611 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP; 2612 switch (cfg->fc_type) { 2613 case RTN_BLACKHOLE: 2614 rt->dst.error = -EINVAL; 2615 rt->dst.output = dst_discard_out; 2616 rt->dst.input = dst_discard; 2617 break; 2618 case RTN_PROHIBIT: 2619 rt->dst.error = -EACCES; 2620 rt->dst.output = ip6_pkt_prohibit_out; 2621 rt->dst.input = ip6_pkt_prohibit; 2622 break; 2623 case RTN_THROW: 2624 case RTN_UNREACHABLE: 2625 default: 2626 rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN 2627 : (cfg->fc_type == RTN_UNREACHABLE) 2628 ? -EHOSTUNREACH : -ENETUNREACH; 2629 rt->dst.output = ip6_pkt_discard_out; 2630 rt->dst.input = ip6_pkt_discard; 2631 break; 2632 } 2633 goto install_route; 2634 } 2635 2636 if (cfg->fc_flags & RTF_GATEWAY) { 2637 const struct in6_addr *gw_addr; 2638 int gwa_type; 2639 2640 gw_addr = &cfg->fc_gateway; 2641 gwa_type = ipv6_addr_type(gw_addr); 2642 2643 /* if gw_addr is local we will fail to detect this in case the 2644 * address is still TENTATIVE (DAD in progress). rt6_lookup() 2645 * will return the already-added prefix route via the interface 2646 * that the prefix route was assigned to, which might be non-loopback. 2647 */ 2648 err = -EINVAL; 2649 if (ipv6_chk_addr_and_flags(net, gw_addr, 2650 gwa_type & IPV6_ADDR_LINKLOCAL ?
2651 dev : NULL, 0, 0)) { 2652 NL_SET_ERR_MSG(extack, "Invalid gateway address"); 2653 goto out; 2654 } 2655 rt->rt6i_gateway = *gw_addr; 2656 2657 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) { 2658 struct rt6_info *grt = NULL; 2659 2660 /* IPv6 strictly inhibits using non-link-local 2661 addresses as nexthop addresses. 2662 Otherwise, a router will not be able to send redirects. 2663 It is very good, but in some (rare!) circumstances 2664 (SIT, PtP, NBMA NOARP links) it is handy to allow 2665 some exceptions. --ANK 2666 We allow IPv4-mapped nexthops to support RFC 4798-style 2667 addressing. 2668 */ 2669 if (!(gwa_type & (IPV6_ADDR_UNICAST | 2670 IPV6_ADDR_MAPPED))) { 2671 NL_SET_ERR_MSG(extack, 2672 "Invalid gateway address"); 2673 goto out; 2674 } 2675 2676 if (cfg->fc_table) { 2677 grt = ip6_nh_lookup_table(net, cfg, gw_addr); 2678 2679 if (grt) { 2680 if (grt->rt6i_flags & RTF_GATEWAY || 2681 (dev && dev != grt->dst.dev)) { 2682 ip6_rt_put(grt); 2683 grt = NULL; 2684 } 2685 } 2686 } 2687 2688 if (!grt) 2689 grt = rt6_lookup(net, gw_addr, NULL, 2690 cfg->fc_ifindex, 1); 2691 2692 err = -EHOSTUNREACH; 2693 if (!grt) 2694 goto out; 2695 if (dev) { 2696 if (dev != grt->dst.dev) { 2697 ip6_rt_put(grt); 2698 goto out; 2699 } 2700 } else { 2701 dev = grt->dst.dev; 2702 idev = grt->rt6i_idev; 2703 dev_hold(dev); 2704 in6_dev_hold(grt->rt6i_idev); 2705 } 2706 if (!(grt->rt6i_flags & RTF_GATEWAY)) 2707 err = 0; 2708 ip6_rt_put(grt); 2709 2710 if (err) 2711 goto out; 2712 } 2713 err = -EINVAL; 2714 if (!dev) { 2715 NL_SET_ERR_MSG(extack, "Egress device not specified"); 2716 goto out; 2717 } else if (dev->flags & IFF_LOOPBACK) { 2718 NL_SET_ERR_MSG(extack, 2719 "Egress device can not be loopback device for this route"); 2720 goto out; 2721 } 2722 } 2723 2724 err = -ENODEV; 2725 if (!dev) 2726 goto out; 2727 2728 if (!ipv6_addr_any(&cfg->fc_prefsrc)) { 2729 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { 2730 NL_SET_ERR_MSG(extack, "Invalid source address"); 2731 err = -EINVAL; 2732 goto out; 2733 } 2734 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc; 2735 rt->rt6i_prefsrc.plen = 128; 2736 } else 2737 rt->rt6i_prefsrc.plen = 0; 2738 2739 rt->rt6i_flags = cfg->fc_flags; 2740 2741 install_route: 2742 rt->dst.dev = dev; 2743 rt->rt6i_idev = idev; 2744 rt->rt6i_table = table; 2745 2746 cfg->fc_nlinfo.nl_net = dev_net(dev); 2747 2748 return rt; 2749 out: 2750 if (dev) 2751 dev_put(dev); 2752 if (idev) 2753 in6_dev_put(idev); 2754 if (rt) 2755 dst_release_immediate(&rt->dst); 2756 2757 return ERR_PTR(err); 2758 } 2759 2760 int ip6_route_add(struct fib6_config *cfg, 2761 struct netlink_ext_ack *extack) 2762 { 2763 struct mx6_config mxc = { .mx = NULL, }; 2764 struct rt6_info *rt; 2765 int err; 2766 2767 rt = ip6_route_info_create(cfg, extack); 2768 if (IS_ERR(rt)) { 2769 err = PTR_ERR(rt); 2770 rt = NULL; 2771 goto out; 2772 } 2773 2774 err = ip6_convert_metrics(&mxc, cfg); 2775 if (err) 2776 goto out; 2777 2778 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack); 2779 2780 kfree(mxc.mx); 2781 2782 return err; 2783 out: 2784 if (rt) 2785 dst_release_immediate(&rt->dst); 2786 2787 return err; 2788 } 2789 2790 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info) 2791 { 2792 int err; 2793 struct fib6_table *table; 2794 struct net *net = dev_net(rt->dst.dev); 2795 2796 if (rt == net->ipv6.ip6_null_entry) { 2797 err = -ENOENT; 2798 goto out; 2799 } 2800 2801 table = rt->rt6i_table; 2802 spin_lock_bh(&table->tb6_lock); 2803 err = fib6_del(rt, info); 2804 spin_unlock_bh(&table->tb6_lock); 2805
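/* drop the reference taken by the caller, even when deletion fails */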
2806 out: 2807 ip6_rt_put(rt); 2808 return err; 2809 } 2810 2811 int ip6_del_rt(struct rt6_info *rt) 2812 { 2813 struct nl_info info = { 2814 .nl_net = dev_net(rt->dst.dev), 2815 }; 2816 return __ip6_del_rt(rt, &info); 2817 } 2818 2819 static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg) 2820 { 2821 struct nl_info *info = &cfg->fc_nlinfo; 2822 struct net *net = info->nl_net; 2823 struct sk_buff *skb = NULL; 2824 struct fib6_table *table; 2825 int err = -ENOENT; 2826 2827 if (rt == net->ipv6.ip6_null_entry) 2828 goto out_put; 2829 table = rt->rt6i_table; 2830 spin_lock_bh(&table->tb6_lock); 2831 2832 if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) { 2833 struct rt6_info *sibling, *next_sibling; 2834 2835 /* prefer to send a single notification with all hops */ 2836 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 2837 if (skb) { 2838 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; 2839 2840 if (rt6_fill_node(net, skb, rt, 2841 NULL, NULL, 0, RTM_DELROUTE, 2842 info->portid, seq, 0) < 0) { 2843 kfree_skb(skb); 2844 skb = NULL; 2845 } else 2846 info->skip_notify = 1; 2847 } 2848 2849 list_for_each_entry_safe(sibling, next_sibling, 2850 &rt->rt6i_siblings, 2851 rt6i_siblings) { 2852 err = fib6_del(sibling, info); 2853 if (err) 2854 goto out_unlock; 2855 } 2856 } 2857 2858 err = fib6_del(rt, info); 2859 out_unlock: 2860 spin_unlock_bh(&table->tb6_lock); 2861 out_put: 2862 ip6_rt_put(rt); 2863 2864 if (skb) { 2865 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 2866 info->nlh, gfp_any()); 2867 } 2868 return err; 2869 } 2870 2871 static int ip6_route_del(struct fib6_config *cfg, 2872 struct netlink_ext_ack *extack) 2873 { 2874 struct rt6_info *rt, *rt_cache; 2875 struct fib6_table *table; 2876 struct fib6_node *fn; 2877 int err = -ESRCH; 2878 2879 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); 2880 if (!table) { 2881 NL_SET_ERR_MSG(extack, "FIB table does not exist"); 2882 return err; 2883 } 2884 2885 rcu_read_lock(); 2886 2887 fn = fib6_locate(&table->tb6_root, 2888 &cfg->fc_dst, cfg->fc_dst_len, 2889 &cfg->fc_src, cfg->fc_src_len, 2890 !(cfg->fc_flags & RTF_CACHE)); 2891 2892 if (fn) { 2893 for_each_fib6_node_rt_rcu(fn) { 2894 if (cfg->fc_flags & RTF_CACHE) { 2895 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst, 2896 &cfg->fc_src); 2897 if (!rt_cache) 2898 continue; 2899 rt = rt_cache; 2900 } 2901 if (cfg->fc_ifindex && 2902 (!rt->dst.dev || 2903 rt->dst.dev->ifindex != cfg->fc_ifindex)) 2904 continue; 2905 if (cfg->fc_flags & RTF_GATEWAY && 2906 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) 2907 continue; 2908 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric) 2909 continue; 2910 if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol) 2911 continue; 2912 if (!dst_hold_safe(&rt->dst)) 2913 break; 2914 rcu_read_unlock(); 2915 2916 /* if gateway was specified only delete the one hop */ 2917 if (cfg->fc_flags & RTF_GATEWAY) 2918 return __ip6_del_rt(rt, &cfg->fc_nlinfo); 2919 2920 return __ip6_del_rt_siblings(rt, cfg); 2921 } 2922 } 2923 rcu_read_unlock(); 2924 2925 return err; 2926 } 2927 2928 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) 2929 { 2930 struct netevent_redirect netevent; 2931 struct rt6_info *rt, *nrt = NULL; 2932 struct ndisc_options ndopts; 2933 struct inet6_dev *in6_dev; 2934 struct neighbour *neigh; 2935 struct rd_msg *msg; 2936 int optlen, on_link; 2937 u8 *lladdr; 2938 2939 optlen = skb_tail_pointer(skb) - skb_transport_header(skb); 2940 optlen -= sizeof(*msg); 2941 
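/* a redirect must be at least as long as the fixed rd_msg header */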
2942 if (optlen < 0) { 2943 net_dbg_ratelimited("rt6_do_redirect: packet too short\n"); 2944 return; 2945 } 2946 2947 msg = (struct rd_msg *)icmp6_hdr(skb); 2948 2949 if (ipv6_addr_is_multicast(&msg->dest)) { 2950 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n"); 2951 return; 2952 } 2953 2954 on_link = 0; 2955 if (ipv6_addr_equal(&msg->dest, &msg->target)) { 2956 on_link = 1; 2957 } else if (ipv6_addr_type(&msg->target) != 2958 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { 2959 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n"); 2960 return; 2961 } 2962 2963 in6_dev = __in6_dev_get(skb->dev); 2964 if (!in6_dev) 2965 return; 2966 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects) 2967 return; 2968 2969 /* RFC2461 8.1: 2970 * The IP source address of the Redirect MUST be the same as the current 2971 * first-hop router for the specified ICMP Destination Address. 2972 */ 2973 2974 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) { 2975 net_dbg_ratelimited("rt6_redirect: invalid ND options\n"); 2976 return; 2977 } 2978 2979 lladdr = NULL; 2980 if (ndopts.nd_opts_tgt_lladdr) { 2981 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, 2982 skb->dev); 2983 if (!lladdr) { 2984 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n"); 2985 return; 2986 } 2987 } 2988 2989 rt = (struct rt6_info *) dst; 2990 if (rt->rt6i_flags & RTF_REJECT) { 2991 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n"); 2992 return; 2993 } 2994 2995 /* Redirect received -> path was valid. 2996 * Look, redirects are sent only in response to data packets, 2997 * so this nexthop is apparently reachable. --ANK 2998 */ 2999 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr); 3000 3001 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1); 3002 if (!neigh) 3003 return; 3004 3005 /* 3006 * We have finally decided to accept it. 3007 */ 3008 3009 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, 3010 NEIGH_UPDATE_F_WEAK_OVERRIDE| 3011 NEIGH_UPDATE_F_OVERRIDE| 3012 (on_link ?
0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER| 3013 NEIGH_UPDATE_F_ISROUTER)), 3014 NDISC_REDIRECT, &ndopts); 3015 3016 nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL); 3017 if (!nrt) 3018 goto out; 3019 3020 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE; 3021 if (on_link) 3022 nrt->rt6i_flags &= ~RTF_GATEWAY; 3023 3024 nrt->rt6i_protocol = RTPROT_REDIRECT; 3025 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; 3026 3027 /* No need to remove rt from the exception table if rt is 3028 * a cached route because rt6_insert_exception() 3029 * takes care of it 3030 */ 3031 if (rt6_insert_exception(nrt, rt)) { 3032 dst_release_immediate(&nrt->dst); 3033 goto out; 3034 } 3035 3036 netevent.old = &rt->dst; 3037 netevent.new = &nrt->dst; 3038 netevent.daddr = &msg->dest; 3039 netevent.neigh = neigh; 3040 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); 3041 3042 out: 3043 neigh_release(neigh); 3044 } 3045 3046 /* 3047 * Misc support functions 3048 */ 3049 3050 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from) 3051 { 3052 BUG_ON(from->dst.from); 3053 3054 rt->rt6i_flags &= ~RTF_EXPIRES; 3055 dst_hold(&from->dst); 3056 rt->dst.from = &from->dst; 3057 dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true); 3058 } 3059 3060 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort) 3061 { 3062 rt->dst.input = ort->dst.input; 3063 rt->dst.output = ort->dst.output; 3064 rt->rt6i_dst = ort->rt6i_dst; 3065 rt->dst.error = ort->dst.error; 3066 rt->rt6i_idev = ort->rt6i_idev; 3067 if (rt->rt6i_idev) 3068 in6_dev_hold(rt->rt6i_idev); 3069 rt->dst.lastuse = jiffies; 3070 rt->rt6i_gateway = ort->rt6i_gateway; 3071 rt->rt6i_flags = ort->rt6i_flags; 3072 rt6_set_from(rt, ort); 3073 rt->rt6i_metric = ort->rt6i_metric; 3074 #ifdef CONFIG_IPV6_SUBTREES 3075 rt->rt6i_src = ort->rt6i_src; 3076 #endif 3077 rt->rt6i_prefsrc = ort->rt6i_prefsrc; 3078 rt->rt6i_table = ort->rt6i_table; 3079 rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate); 3080 } 3081 3082 #ifdef CONFIG_IPV6_ROUTE_INFO 3083 static struct rt6_info *rt6_get_route_info(struct net *net, 3084 const struct in6_addr *prefix, int prefixlen, 3085 const struct in6_addr *gwaddr, 3086 struct net_device *dev) 3087 { 3088 u32 tb_id = l3mdev_fib_table(dev) ?
: RT6_TABLE_INFO; 3089 int ifindex = dev->ifindex; 3090 struct fib6_node *fn; 3091 struct rt6_info *rt = NULL; 3092 struct fib6_table *table; 3093 3094 table = fib6_get_table(net, tb_id); 3095 if (!table) 3096 return NULL; 3097 3098 rcu_read_lock(); 3099 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true); 3100 if (!fn) 3101 goto out; 3102 3103 for_each_fib6_node_rt_rcu(fn) { 3104 if (rt->dst.dev->ifindex != ifindex) 3105 continue; 3106 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY)) 3107 continue; 3108 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr)) 3109 continue; 3110 ip6_hold_safe(NULL, &rt, false); 3111 break; 3112 } 3113 out: 3114 rcu_read_unlock(); 3115 return rt; 3116 } 3117 3118 static struct rt6_info *rt6_add_route_info(struct net *net, 3119 const struct in6_addr *prefix, int prefixlen, 3120 const struct in6_addr *gwaddr, 3121 struct net_device *dev, 3122 unsigned int pref) 3123 { 3124 struct fib6_config cfg = { 3125 .fc_metric = IP6_RT_PRIO_USER, 3126 .fc_ifindex = dev->ifindex, 3127 .fc_dst_len = prefixlen, 3128 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | 3129 RTF_UP | RTF_PREF(pref), 3130 .fc_protocol = RTPROT_RA, 3131 .fc_nlinfo.portid = 0, 3132 .fc_nlinfo.nlh = NULL, 3133 .fc_nlinfo.nl_net = net, 3134 }; 3135 3136 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; 3137 cfg.fc_dst = *prefix; 3138 cfg.fc_gateway = *gwaddr; 3139 3140 /* We should treat it as a default route if prefix length is 0. */ 3141 if (!prefixlen) 3142 cfg.fc_flags |= RTF_DEFAULT; 3143 3144 ip6_route_add(&cfg, NULL); 3145 3146 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev); 3147 } 3148 #endif 3149 3150 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev) 3151 { 3152 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT; 3153 struct rt6_info *rt; 3154 struct fib6_table *table; 3155 3156 table = fib6_get_table(dev_net(dev), tb_id); 3157 if (!table) 3158 return NULL; 3159 3160 rcu_read_lock(); 3161 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3162 if (dev == rt->dst.dev && 3163 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && 3164 ipv6_addr_equal(&rt->rt6i_gateway, addr)) 3165 break; 3166 } 3167 if (rt) 3168 ip6_hold_safe(NULL, &rt, false); 3169 rcu_read_unlock(); 3170 return rt; 3171 } 3172 3173 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr, 3174 struct net_device *dev, 3175 unsigned int pref) 3176 { 3177 struct fib6_config cfg = { 3178 .fc_table = l3mdev_fib_table(dev) ?
: RT6_TABLE_DFLT, 3179 .fc_metric = IP6_RT_PRIO_USER, 3180 .fc_ifindex = dev->ifindex, 3181 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | 3182 RTF_UP | RTF_EXPIRES | RTF_PREF(pref), 3183 .fc_protocol = RTPROT_RA, 3184 .fc_nlinfo.portid = 0, 3185 .fc_nlinfo.nlh = NULL, 3186 .fc_nlinfo.nl_net = dev_net(dev), 3187 }; 3188 3189 cfg.fc_gateway = *gwaddr; 3190 3191 if (!ip6_route_add(&cfg, NULL)) { 3192 struct fib6_table *table; 3193 3194 table = fib6_get_table(dev_net(dev), cfg.fc_table); 3195 if (table) 3196 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER; 3197 } 3198 3199 return rt6_get_dflt_router(gwaddr, dev); 3200 } 3201 3202 static void __rt6_purge_dflt_routers(struct fib6_table *table) 3203 { 3204 struct rt6_info *rt; 3205 3206 restart: 3207 rcu_read_lock(); 3208 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3209 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) && 3210 (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) { 3211 if (dst_hold_safe(&rt->dst)) { 3212 rcu_read_unlock(); 3213 ip6_del_rt(rt); 3214 } else { 3215 rcu_read_unlock(); 3216 } 3217 goto restart; 3218 } 3219 } 3220 rcu_read_unlock(); 3221 3222 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER; 3223 } 3224 3225 void rt6_purge_dflt_routers(struct net *net) 3226 { 3227 struct fib6_table *table; 3228 struct hlist_head *head; 3229 unsigned int h; 3230 3231 rcu_read_lock(); 3232 3233 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { 3234 head = &net->ipv6.fib_table_hash[h]; 3235 hlist_for_each_entry_rcu(table, head, tb6_hlist) { 3236 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER) 3237 __rt6_purge_dflt_routers(table); 3238 } 3239 } 3240 3241 rcu_read_unlock(); 3242 } 3243 3244 static void rtmsg_to_fib6_config(struct net *net, 3245 struct in6_rtmsg *rtmsg, 3246 struct fib6_config *cfg) 3247 { 3248 memset(cfg, 0, sizeof(*cfg)); 3249 3250 cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? 
3251 : RT6_TABLE_MAIN; 3252 cfg->fc_ifindex = rtmsg->rtmsg_ifindex; 3253 cfg->fc_metric = rtmsg->rtmsg_metric; 3254 cfg->fc_expires = rtmsg->rtmsg_info; 3255 cfg->fc_dst_len = rtmsg->rtmsg_dst_len; 3256 cfg->fc_src_len = rtmsg->rtmsg_src_len; 3257 cfg->fc_flags = rtmsg->rtmsg_flags; 3258 3259 cfg->fc_nlinfo.nl_net = net; 3260 3261 cfg->fc_dst = rtmsg->rtmsg_dst; 3262 cfg->fc_src = rtmsg->rtmsg_src; 3263 cfg->fc_gateway = rtmsg->rtmsg_gateway; 3264 } 3265 3266 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) 3267 { 3268 struct fib6_config cfg; 3269 struct in6_rtmsg rtmsg; 3270 int err; 3271 3272 switch (cmd) { 3273 case SIOCADDRT: /* Add a route */ 3274 case SIOCDELRT: /* Delete a route */ 3275 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 3276 return -EPERM; 3277 err = copy_from_user(&rtmsg, arg, 3278 sizeof(struct in6_rtmsg)); 3279 if (err) 3280 return -EFAULT; 3281 3282 rtmsg_to_fib6_config(net, &rtmsg, &cfg); 3283 3284 rtnl_lock(); 3285 switch (cmd) { 3286 case SIOCADDRT: 3287 err = ip6_route_add(&cfg, NULL); 3288 break; 3289 case SIOCDELRT: 3290 err = ip6_route_del(&cfg, NULL); 3291 break; 3292 default: 3293 err = -EINVAL; 3294 } 3295 rtnl_unlock(); 3296 3297 return err; 3298 } 3299 3300 return -EINVAL; 3301 } 3302 3303 /* 3304 * Drop the packet on the floor 3305 */ 3306 3307 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) 3308 { 3309 int type; 3310 struct dst_entry *dst = skb_dst(skb); 3311 switch (ipstats_mib_noroutes) { 3312 case IPSTATS_MIB_INNOROUTES: 3313 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); 3314 if (type == IPV6_ADDR_ANY) { 3315 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), 3316 IPSTATS_MIB_INADDRERRORS); 3317 break; 3318 } 3319 /* FALLTHROUGH */ 3320 case IPSTATS_MIB_OUTNOROUTES: 3321 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), 3322 ipstats_mib_noroutes); 3323 break; 3324 } 3325 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0); 3326 kfree_skb(skb); 3327 return 0; 3328 } 3329 3330 static int ip6_pkt_discard(struct sk_buff *skb) 3331 { 3332 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES); 3333 } 3334 3335 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3336 { 3337 skb->dev = skb_dst(skb)->dev; 3338 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); 3339 } 3340 3341 static int ip6_pkt_prohibit(struct sk_buff *skb) 3342 { 3343 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES); 3344 } 3345 3346 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3347 { 3348 skb->dev = skb_dst(skb)->dev; 3349 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); 3350 } 3351 3352 /* 3353 * Allocate a dst for local (unicast / anycast) address. 
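 * The resulting route carries RTF_LOCAL or RTF_ANYCAST and is keyed to the local table (or the l3mdev table when the device is enslaved).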
3354 */ 3355 3356 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, 3357 const struct in6_addr *addr, 3358 bool anycast) 3359 { 3360 u32 tb_id; 3361 struct net *net = dev_net(idev->dev); 3362 struct net_device *dev = idev->dev; 3363 struct rt6_info *rt; 3364 3365 rt = ip6_dst_alloc(net, dev, DST_NOCOUNT); 3366 if (!rt) 3367 return ERR_PTR(-ENOMEM); 3368 3369 in6_dev_hold(idev); 3370 3371 rt->dst.flags |= DST_HOST; 3372 rt->dst.input = ip6_input; 3373 rt->dst.output = ip6_output; 3374 rt->rt6i_idev = idev; 3375 3376 rt->rt6i_protocol = RTPROT_KERNEL; 3377 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP; 3378 if (anycast) 3379 rt->rt6i_flags |= RTF_ANYCAST; 3380 else 3381 rt->rt6i_flags |= RTF_LOCAL; 3382 3383 rt->rt6i_gateway = *addr; 3384 rt->rt6i_dst.addr = *addr; 3385 rt->rt6i_dst.plen = 128; 3386 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL; 3387 rt->rt6i_table = fib6_get_table(net, tb_id); 3388 3389 return rt; 3390 } 3391 3392 /* remove a deleted IP from prefsrc entries */ 3393 struct arg_dev_net_ip { 3394 struct net_device *dev; 3395 struct net *net; 3396 struct in6_addr *addr; 3397 }; 3398 3399 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg) 3400 { 3401 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev; 3402 struct net *net = ((struct arg_dev_net_ip *)arg)->net; 3403 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; 3404 3405 if (((void *)rt->dst.dev == dev || !dev) && 3406 rt != net->ipv6.ip6_null_entry && 3407 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) { 3408 spin_lock_bh(&rt6_exception_lock); 3409 /* remove prefsrc entry */ 3410 rt->rt6i_prefsrc.plen = 0; 3411 /* need to update cache as well */ 3412 rt6_exceptions_remove_prefsrc(rt); 3413 spin_unlock_bh(&rt6_exception_lock); 3414 } 3415 return 0; 3416 } 3417 3418 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) 3419 { 3420 struct net *net = dev_net(ifp->idev->dev); 3421 struct arg_dev_net_ip adni = { 3422 .dev = ifp->idev->dev, 3423 .net = net, 3424 .addr = &ifp->addr, 3425 }; 3426 fib6_clean_all(net, fib6_remove_prefsrc, &adni); 3427 } 3428 3429 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY) 3430 3431 /* Remove routers and update dst entries when a gateway turns into a host. */ 3432 static int fib6_clean_tohost(struct rt6_info *rt, void *arg) 3433 { 3434 struct in6_addr *gateway = (struct in6_addr *)arg; 3435 3436 if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && 3437 ipv6_addr_equal(gateway, &rt->rt6i_gateway)) { 3438 return -1; 3439 } 3440 3441 /* Further clean up cached routes in exception table. 3442 * This is needed because a cached route may have a different 3443 * gateway than its 'parent' in the case of an ip redirect.
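 * rt6_exceptions_clean_tohost() below drops those cached clones.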
3444 */ 3445 rt6_exceptions_clean_tohost(rt, gateway); 3446 3447 return 0; 3448 } 3449 3450 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) 3451 { 3452 fib6_clean_all(net, fib6_clean_tohost, gateway); 3453 } 3454 3455 struct arg_dev_net { 3456 struct net_device *dev; 3457 struct net *net; 3458 }; 3459 3460 /* called with write lock held for table with rt */ 3461 static int fib6_ifdown(struct rt6_info *rt, void *arg) 3462 { 3463 const struct arg_dev_net *adn = arg; 3464 const struct net_device *dev = adn->dev; 3465 3466 if ((rt->dst.dev == dev || !dev) && 3467 rt != adn->net->ipv6.ip6_null_entry && 3468 (rt->rt6i_nsiblings == 0 || 3469 (dev && netdev_unregistering(dev)) || 3470 !rt->rt6i_idev->cnf.ignore_routes_with_linkdown)) 3471 return -1; 3472 3473 return 0; 3474 } 3475 3476 void rt6_ifdown(struct net *net, struct net_device *dev) 3477 { 3478 struct arg_dev_net adn = { 3479 .dev = dev, 3480 .net = net, 3481 }; 3482 3483 fib6_clean_all(net, fib6_ifdown, &adn); 3484 if (dev) 3485 rt6_uncached_list_flush_dev(net, dev); 3486 } 3487 3488 struct rt6_mtu_change_arg { 3489 struct net_device *dev; 3490 unsigned int mtu; 3491 }; 3492 3493 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) 3494 { 3495 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; 3496 struct inet6_dev *idev; 3497 3498 /* In IPv6, PMTU discovery is not optional, 3499 so the RTAX_MTU lock cannot disable it. 3500 We still use this lock to block changes 3501 caused by addrconf/ndisc. 3502 */ 3503 3504 idev = __in6_dev_get(arg->dev); 3505 if (!idev) 3506 return 0; 3507 3508 /* There is no way to discover an administrative MTU increase 3509 via IPv6 PMTU discovery, so the PMTU should be updated here. 3510 Since RFC 1981 doesn't cover administrative MTU increases, 3511 updating the PMTU on an increase is a MUST (e.g. jumbo frames). 3512 */ 3513 /* 3514 If the new MTU is less than the route PMTU, the new MTU will be 3515 the lowest MTU in the path; update the route PMTU to reflect the 3516 decrease. If the new MTU is greater than the route PMTU, and the 3517 old MTU was the lowest MTU in the path, update the route PMTU 3518 to reflect the increase. In this case, if another node in the path 3519 has an even lower MTU, a Packet Too Big message will trigger 3520 PMTU discovery again.
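For example, if the device MTU is raised from 1280 to 1500 and the route PMTU still equals the old device MTU, the route PMTU is raised to 1500 as well.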
3521 */ 3522 if (rt->dst.dev == arg->dev && 3523 dst_metric_raw(&rt->dst, RTAX_MTU) && 3524 !dst_metric_locked(&rt->dst, RTAX_MTU)) { 3525 spin_lock_bh(&rt6_exception_lock); 3526 if (dst_mtu(&rt->dst) >= arg->mtu || 3527 (dst_mtu(&rt->dst) < arg->mtu && 3528 dst_mtu(&rt->dst) == idev->cnf.mtu6)) { 3529 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu); 3530 } 3531 rt6_exceptions_update_pmtu(rt, arg->mtu); 3532 spin_unlock_bh(&rt6_exception_lock); 3533 } 3534 return 0; 3535 } 3536 3537 void rt6_mtu_change(struct net_device *dev, unsigned int mtu) 3538 { 3539 struct rt6_mtu_change_arg arg = { 3540 .dev = dev, 3541 .mtu = mtu, 3542 }; 3543 3544 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg); 3545 } 3546 3547 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { 3548 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) }, 3549 [RTA_OIF] = { .type = NLA_U32 }, 3550 [RTA_IIF] = { .type = NLA_U32 }, 3551 [RTA_PRIORITY] = { .type = NLA_U32 }, 3552 [RTA_METRICS] = { .type = NLA_NESTED }, 3553 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, 3554 [RTA_PREF] = { .type = NLA_U8 }, 3555 [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, 3556 [RTA_ENCAP] = { .type = NLA_NESTED }, 3557 [RTA_EXPIRES] = { .type = NLA_U32 }, 3558 [RTA_UID] = { .type = NLA_U32 }, 3559 [RTA_MARK] = { .type = NLA_U32 }, 3560 }; 3561 3562 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, 3563 struct fib6_config *cfg, 3564 struct netlink_ext_ack *extack) 3565 { 3566 struct rtmsg *rtm; 3567 struct nlattr *tb[RTA_MAX+1]; 3568 unsigned int pref; 3569 int err; 3570 3571 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, 3572 NULL); 3573 if (err < 0) 3574 goto errout; 3575 3576 err = -EINVAL; 3577 rtm = nlmsg_data(nlh); 3578 memset(cfg, 0, sizeof(*cfg)); 3579 3580 cfg->fc_table = rtm->rtm_table; 3581 cfg->fc_dst_len = rtm->rtm_dst_len; 3582 cfg->fc_src_len = rtm->rtm_src_len; 3583 cfg->fc_flags = RTF_UP; 3584 cfg->fc_protocol = rtm->rtm_protocol; 3585 cfg->fc_type = rtm->rtm_type; 3586 3587 if (rtm->rtm_type == RTN_UNREACHABLE || 3588 rtm->rtm_type == RTN_BLACKHOLE || 3589 rtm->rtm_type == RTN_PROHIBIT || 3590 rtm->rtm_type == RTN_THROW) 3591 cfg->fc_flags |= RTF_REJECT; 3592 3593 if (rtm->rtm_type == RTN_LOCAL) 3594 cfg->fc_flags |= RTF_LOCAL; 3595 3596 if (rtm->rtm_flags & RTM_F_CLONED) 3597 cfg->fc_flags |= RTF_CACHE; 3598 3599 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid; 3600 cfg->fc_nlinfo.nlh = nlh; 3601 cfg->fc_nlinfo.nl_net = sock_net(skb->sk); 3602 3603 if (tb[RTA_GATEWAY]) { 3604 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); 3605 cfg->fc_flags |= RTF_GATEWAY; 3606 } 3607 3608 if (tb[RTA_DST]) { 3609 int plen = (rtm->rtm_dst_len + 7) >> 3; 3610 3611 if (nla_len(tb[RTA_DST]) < plen) 3612 goto errout; 3613 3614 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen); 3615 } 3616 3617 if (tb[RTA_SRC]) { 3618 int plen = (rtm->rtm_src_len + 7) >> 3; 3619 3620 if (nla_len(tb[RTA_SRC]) < plen) 3621 goto errout; 3622 3623 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen); 3624 } 3625 3626 if (tb[RTA_PREFSRC]) 3627 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]); 3628 3629 if (tb[RTA_OIF]) 3630 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]); 3631 3632 if (tb[RTA_PRIORITY]) 3633 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]); 3634 3635 if (tb[RTA_METRICS]) { 3636 cfg->fc_mx = nla_data(tb[RTA_METRICS]); 3637 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]); 3638 } 3639 3640 if (tb[RTA_TABLE]) 3641 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]); 3642 3643 if (tb[RTA_MULTIPATH]) { 3644 cfg->fc_mp = 
nla_data(tb[RTA_MULTIPATH]); 3645 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); 3646 3647 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp, 3648 cfg->fc_mp_len, extack); 3649 if (err < 0) 3650 goto errout; 3651 } 3652 3653 if (tb[RTA_PREF]) { 3654 pref = nla_get_u8(tb[RTA_PREF]); 3655 if (pref != ICMPV6_ROUTER_PREF_LOW && 3656 pref != ICMPV6_ROUTER_PREF_HIGH) 3657 pref = ICMPV6_ROUTER_PREF_MEDIUM; 3658 cfg->fc_flags |= RTF_PREF(pref); 3659 } 3660 3661 if (tb[RTA_ENCAP]) 3662 cfg->fc_encap = tb[RTA_ENCAP]; 3663 3664 if (tb[RTA_ENCAP_TYPE]) { 3665 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); 3666 3667 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack); 3668 if (err < 0) 3669 goto errout; 3670 } 3671 3672 if (tb[RTA_EXPIRES]) { 3673 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ); 3674 3675 if (addrconf_finite_timeout(timeout)) { 3676 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ); 3677 cfg->fc_flags |= RTF_EXPIRES; 3678 } 3679 } 3680 3681 err = 0; 3682 errout: 3683 return err; 3684 } 3685 3686 struct rt6_nh { 3687 struct rt6_info *rt6_info; 3688 struct fib6_config r_cfg; 3689 struct mx6_config mxc; 3690 struct list_head next; 3691 }; 3692 3693 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list) 3694 { 3695 struct rt6_nh *nh; 3696 3697 list_for_each_entry(nh, rt6_nh_list, next) { 3698 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n", 3699 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway, 3700 nh->r_cfg.fc_ifindex); 3701 } 3702 } 3703 3704 static int ip6_route_info_append(struct list_head *rt6_nh_list, 3705 struct rt6_info *rt, struct fib6_config *r_cfg) 3706 { 3707 struct rt6_nh *nh; 3708 int err = -EEXIST; 3709 3710 list_for_each_entry(nh, rt6_nh_list, next) { 3711 /* check if rt6_info already exists */ 3712 if (rt6_duplicate_nexthop(nh->rt6_info, rt)) 3713 return err; 3714 } 3715 3716 nh = kzalloc(sizeof(*nh), GFP_KERNEL); 3717 if (!nh) 3718 return -ENOMEM; 3719 nh->rt6_info = rt; 3720 err = ip6_convert_metrics(&nh->mxc, r_cfg); 3721 if (err) { 3722 kfree(nh); 3723 return err; 3724 } 3725 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); 3726 list_add_tail(&nh->next, rt6_nh_list); 3727 3728 return 0; 3729 } 3730 3731 static void ip6_route_mpath_notify(struct rt6_info *rt, 3732 struct rt6_info *rt_last, 3733 struct nl_info *info, 3734 __u16 nlflags) 3735 { 3736 /* if this is an APPEND route, then rt points to the first route 3737 * inserted and rt_last points to last route inserted. Userspace 3738 * wants a consistent dump of the route which starts at the first 3739 * nexthop. 
Since sibling routes are always added at the end of 3740 * the list, find the first sibling of the last route appended 3741 */ 3742 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) { 3743 rt = list_first_entry(&rt_last->rt6i_siblings, 3744 struct rt6_info, 3745 rt6i_siblings); 3746 } 3747 3748 if (rt) 3749 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); 3750 } 3751 3752 static int ip6_route_multipath_add(struct fib6_config *cfg, 3753 struct netlink_ext_ack *extack) 3754 { 3755 struct rt6_info *rt_notif = NULL, *rt_last = NULL; 3756 struct nl_info *info = &cfg->fc_nlinfo; 3757 struct fib6_config r_cfg; 3758 struct rtnexthop *rtnh; 3759 struct rt6_info *rt; 3760 struct rt6_nh *err_nh; 3761 struct rt6_nh *nh, *nh_safe; 3762 __u16 nlflags; 3763 int remaining; 3764 int attrlen; 3765 int err = 1; 3766 int nhn = 0; 3767 int replace = (cfg->fc_nlinfo.nlh && 3768 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); 3769 LIST_HEAD(rt6_nh_list); 3770 3771 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE; 3772 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND) 3773 nlflags |= NLM_F_APPEND; 3774 3775 remaining = cfg->fc_mp_len; 3776 rtnh = (struct rtnexthop *)cfg->fc_mp; 3777 3778 /* Parse a Multipath Entry and build a list (rt6_nh_list) of 3779 * rt6_info structs per nexthop 3780 */ 3781 while (rtnh_ok(rtnh, remaining)) { 3782 memcpy(&r_cfg, cfg, sizeof(*cfg)); 3783 if (rtnh->rtnh_ifindex) 3784 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 3785 3786 attrlen = rtnh_attrlen(rtnh); 3787 if (attrlen > 0) { 3788 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 3789 3790 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 3791 if (nla) { 3792 r_cfg.fc_gateway = nla_get_in6_addr(nla); 3793 r_cfg.fc_flags |= RTF_GATEWAY; 3794 } 3795 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); 3796 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); 3797 if (nla) 3798 r_cfg.fc_encap_type = nla_get_u16(nla); 3799 } 3800 3801 rt = ip6_route_info_create(&r_cfg, extack); 3802 if (IS_ERR(rt)) { 3803 err = PTR_ERR(rt); 3804 rt = NULL; 3805 goto cleanup; 3806 } 3807 3808 err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg); 3809 if (err) { 3810 dst_release_immediate(&rt->dst); 3811 goto cleanup; 3812 } 3813 3814 rtnh = rtnh_next(rtnh, &remaining); 3815 } 3816 3817 /* for add and replace send one notification with all nexthops. 3818 * Skip the notification in fib6_add_rt2node and send one with 3819 * the full route when done 3820 */ 3821 info->skip_notify = 1; 3822 3823 err_nh = NULL; 3824 list_for_each_entry(nh, &rt6_nh_list, next) { 3825 rt_last = nh->rt6_info; 3826 err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack); 3827 /* save reference to first route for notification */ 3828 if (!rt_notif && !err) 3829 rt_notif = nh->rt6_info; 3830 3831 /* nh->rt6_info is used or freed at this point, reset to NULL */ 3832 nh->rt6_info = NULL; 3833 if (err) { 3834 if (replace && nhn) 3835 ip6_print_replace_route_err(&rt6_nh_list); 3836 err_nh = nh; 3837 goto add_errout; 3838 } 3839 3840 /* Because each route is added like a single route we remove 3841 * these flags after the first nexthop: if there is a collision, 3842 * we have already failed to add the first nexthop: 3843 * fib6_add_rt2node() has rejected it; when replacing, the old 3844 * nexthops have been replaced by the first new one, and the rest 3845 * should be added to it. 3846 */ 3847 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | 3848 NLM_F_REPLACE); 3849 nhn++; 3850 } 3851 3852 /* success ...
tell user about new route */ 3853 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 3854 goto cleanup; 3855 3856 add_errout: 3857 /* send notification for routes that were added so that 3858 * the delete notifications sent by ip6_route_del are 3859 * coherent 3860 */ 3861 if (rt_notif) 3862 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 3863 3864 /* Delete routes that were already added */ 3865 list_for_each_entry(nh, &rt6_nh_list, next) { 3866 if (err_nh == nh) 3867 break; 3868 ip6_route_del(&nh->r_cfg, extack); 3869 } 3870 3871 cleanup: 3872 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { 3873 if (nh->rt6_info) 3874 dst_release_immediate(&nh->rt6_info->dst); 3875 kfree(nh->mxc.mx); 3876 list_del(&nh->next); 3877 kfree(nh); 3878 } 3879 3880 return err; 3881 } 3882 3883 static int ip6_route_multipath_del(struct fib6_config *cfg, 3884 struct netlink_ext_ack *extack) 3885 { 3886 struct fib6_config r_cfg; 3887 struct rtnexthop *rtnh; 3888 int remaining; 3889 int attrlen; 3890 int err = 1, last_err = 0; 3891 3892 remaining = cfg->fc_mp_len; 3893 rtnh = (struct rtnexthop *)cfg->fc_mp; 3894 3895 /* Parse a Multipath Entry */ 3896 while (rtnh_ok(rtnh, remaining)) { 3897 memcpy(&r_cfg, cfg, sizeof(*cfg)); 3898 if (rtnh->rtnh_ifindex) 3899 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 3900 3901 attrlen = rtnh_attrlen(rtnh); 3902 if (attrlen > 0) { 3903 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 3904 3905 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 3906 if (nla) { 3907 nla_memcpy(&r_cfg.fc_gateway, nla, 16); 3908 r_cfg.fc_flags |= RTF_GATEWAY; 3909 } 3910 } 3911 err = ip6_route_del(&r_cfg, extack); 3912 if (err) 3913 last_err = err; 3914 3915 rtnh = rtnh_next(rtnh, &remaining); 3916 } 3917 3918 return last_err; 3919 } 3920 3921 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, 3922 struct netlink_ext_ack *extack) 3923 { 3924 struct fib6_config cfg; 3925 int err; 3926 3927 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 3928 if (err < 0) 3929 return err; 3930 3931 if (cfg.fc_mp) 3932 return ip6_route_multipath_del(&cfg, extack); 3933 else { 3934 cfg.fc_delete_all_nh = 1; 3935 return ip6_route_del(&cfg, extack); 3936 } 3937 } 3938 3939 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, 3940 struct netlink_ext_ack *extack) 3941 { 3942 struct fib6_config cfg; 3943 int err; 3944 3945 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 3946 if (err < 0) 3947 return err; 3948 3949 if (cfg.fc_mp) 3950 return ip6_route_multipath_add(&cfg, extack); 3951 else 3952 return ip6_route_add(&cfg, extack); 3953 } 3954 3955 static size_t rt6_nlmsg_size(struct rt6_info *rt) 3956 { 3957 int nexthop_len = 0; 3958 3959 if (rt->rt6i_nsiblings) { 3960 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ 3961 + NLA_ALIGN(sizeof(struct rtnexthop)) 3962 + nla_total_size(16) /* RTA_GATEWAY */ 3963 + lwtunnel_get_encap_size(rt->dst.lwtstate); 3964 3965 nexthop_len *= rt->rt6i_nsiblings; 3966 } 3967 3968 return NLMSG_ALIGN(sizeof(struct rtmsg)) 3969 + nla_total_size(16) /* RTA_SRC */ 3970 + nla_total_size(16) /* RTA_DST */ 3971 + nla_total_size(16) /* RTA_GATEWAY */ 3972 + nla_total_size(16) /* RTA_PREFSRC */ 3973 + nla_total_size(4) /* RTA_TABLE */ 3974 + nla_total_size(4) /* RTA_IIF */ 3975 + nla_total_size(4) /* RTA_OIF */ 3976 + nla_total_size(4) /* RTA_PRIORITY */ 3977 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */ 3978 + nla_total_size(sizeof(struct rta_cacheinfo)) 3979 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ 3980 + nla_total_size(1) /* 
RTA_PREF */ 3981 + lwtunnel_get_encap_size(rt->dst.lwtstate) 3982 + nexthop_len; 3983 } 3984 3985 static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt, 3986 unsigned int *flags, bool skip_oif) 3987 { 3988 if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) { 3989 *flags |= RTNH_F_LINKDOWN; 3990 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown) 3991 *flags |= RTNH_F_DEAD; 3992 } 3993 3994 if (rt->rt6i_flags & RTF_GATEWAY) { 3995 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0) 3996 goto nla_put_failure; 3997 } 3998 3999 if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD) 4000 *flags |= RTNH_F_OFFLOAD; 4001 4002 /* not needed for multipath encoding b/c it has a rtnexthop struct */ 4003 if (!skip_oif && rt->dst.dev && 4004 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex)) 4005 goto nla_put_failure; 4006 4007 if (rt->dst.lwtstate && 4008 lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0) 4009 goto nla_put_failure; 4010 4011 return 0; 4012 4013 nla_put_failure: 4014 return -EMSGSIZE; 4015 } 4016 4017 /* add multipath next hop */ 4018 static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt) 4019 { 4020 struct rtnexthop *rtnh; 4021 unsigned int flags = 0; 4022 4023 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh)); 4024 if (!rtnh) 4025 goto nla_put_failure; 4026 4027 rtnh->rtnh_hops = 0; 4028 rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0; 4029 4030 if (rt6_nexthop_info(skb, rt, &flags, true) < 0) 4031 goto nla_put_failure; 4032 4033 rtnh->rtnh_flags = flags; 4034 4035 /* length of rtnetlink header + attributes */ 4036 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh; 4037 4038 return 0; 4039 4040 nla_put_failure: 4041 return -EMSGSIZE; 4042 } 4043 4044 static int rt6_fill_node(struct net *net, 4045 struct sk_buff *skb, struct rt6_info *rt, 4046 struct in6_addr *dst, struct in6_addr *src, 4047 int iif, int type, u32 portid, u32 seq, 4048 unsigned int flags) 4049 { 4050 u32 metrics[RTAX_MAX]; 4051 struct rtmsg *rtm; 4052 struct nlmsghdr *nlh; 4053 long expires; 4054 u32 table; 4055 4056 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags); 4057 if (!nlh) 4058 return -EMSGSIZE; 4059 4060 rtm = nlmsg_data(nlh); 4061 rtm->rtm_family = AF_INET6; 4062 rtm->rtm_dst_len = rt->rt6i_dst.plen; 4063 rtm->rtm_src_len = rt->rt6i_src.plen; 4064 rtm->rtm_tos = 0; 4065 if (rt->rt6i_table) 4066 table = rt->rt6i_table->tb6_id; 4067 else 4068 table = RT6_TABLE_UNSPEC; 4069 rtm->rtm_table = table; 4070 if (nla_put_u32(skb, RTA_TABLE, table)) 4071 goto nla_put_failure; 4072 if (rt->rt6i_flags & RTF_REJECT) { 4073 switch (rt->dst.error) { 4074 case -EINVAL: 4075 rtm->rtm_type = RTN_BLACKHOLE; 4076 break; 4077 case -EACCES: 4078 rtm->rtm_type = RTN_PROHIBIT; 4079 break; 4080 case -EAGAIN: 4081 rtm->rtm_type = RTN_THROW; 4082 break; 4083 default: 4084 rtm->rtm_type = RTN_UNREACHABLE; 4085 break; 4086 } 4087 } 4088 else if (rt->rt6i_flags & RTF_LOCAL) 4089 rtm->rtm_type = RTN_LOCAL; 4090 else if (rt->rt6i_flags & RTF_ANYCAST) 4091 rtm->rtm_type = RTN_ANYCAST; 4092 else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK)) 4093 rtm->rtm_type = RTN_LOCAL; 4094 else 4095 rtm->rtm_type = RTN_UNICAST; 4096 rtm->rtm_flags = 0; 4097 rtm->rtm_scope = RT_SCOPE_UNIVERSE; 4098 rtm->rtm_protocol = rt->rt6i_protocol; 4099 4100 if (rt->rt6i_flags & RTF_CACHE) 4101 rtm->rtm_flags |= RTM_F_CLONED; 4102 4103 if (dst) { 4104 if (nla_put_in6_addr(skb, RTA_DST, dst)) 4105 goto nla_put_failure; 4106 rtm->rtm_dst_len = 128; 4107 } else if (rtm->rtm_dst_len) 4108 if 
(nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr)) 4109 goto nla_put_failure; 4110 #ifdef CONFIG_IPV6_SUBTREES 4111 if (src) { 4112 if (nla_put_in6_addr(skb, RTA_SRC, src)) 4113 goto nla_put_failure; 4114 rtm->rtm_src_len = 128; 4115 } else if (rtm->rtm_src_len && 4116 nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr)) 4117 goto nla_put_failure; 4118 #endif 4119 if (iif) { 4120 #ifdef CONFIG_IPV6_MROUTE 4121 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) { 4122 int err = ip6mr_get_route(net, skb, rtm, portid); 4123 4124 if (err == 0) 4125 return 0; 4126 if (err < 0) 4127 goto nla_put_failure; 4128 } else 4129 #endif 4130 if (nla_put_u32(skb, RTA_IIF, iif)) 4131 goto nla_put_failure; 4132 } else if (dst) { 4133 struct in6_addr saddr_buf; 4134 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 && 4135 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 4136 goto nla_put_failure; 4137 } 4138 4139 if (rt->rt6i_prefsrc.plen) { 4140 struct in6_addr saddr_buf; 4141 saddr_buf = rt->rt6i_prefsrc.addr; 4142 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 4143 goto nla_put_failure; 4144 } 4145 4146 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics)); 4147 if (rt->rt6i_pmtu) 4148 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu; 4149 if (rtnetlink_put_metrics(skb, metrics) < 0) 4150 goto nla_put_failure; 4151 4152 if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric)) 4153 goto nla_put_failure; 4154 4155 /* For multipath routes, walk the siblings list and add 4156 * each as a nexthop within RTA_MULTIPATH. 4157 */ 4158 if (rt->rt6i_nsiblings) { 4159 struct rt6_info *sibling, *next_sibling; 4160 struct nlattr *mp; 4161 4162 mp = nla_nest_start(skb, RTA_MULTIPATH); 4163 if (!mp) 4164 goto nla_put_failure; 4165 4166 if (rt6_add_nexthop(skb, rt) < 0) 4167 goto nla_put_failure; 4168 4169 list_for_each_entry_safe(sibling, next_sibling, 4170 &rt->rt6i_siblings, rt6i_siblings) { 4171 if (rt6_add_nexthop(skb, sibling) < 0) 4172 goto nla_put_failure; 4173 } 4174 4175 nla_nest_end(skb, mp); 4176 } else { 4177 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0) 4178 goto nla_put_failure; 4179 } 4180 4181 expires = (rt->rt6i_flags & RTF_EXPIRES) ? 
int rt6_dump_route(struct rt6_info *rt, void *p_arg)
{
	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
	struct net *net = arg->net;

	if (rt == net->ipv6.ip6_null_entry)
		return 0;

	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);

		/* user wants prefix routes only */
		if (rtm->rtm_flags & RTM_F_PREFIX &&
		    !(rt->rt6i_flags & RTF_PREFIX_RT)) {
			/* success since this is not a prefix route */
			return 1;
		}
	}

	return rt6_fill_node(net,
		     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
		     NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
		     NLM_F_MULTI);
}
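/* Handler for RTM_GETROUTE requests. From userspace this path is
 * typically exercised via iproute2, e.g. (hypothetical addresses):
 *
 *   ip -6 route get 2001:db8::1
 *   ip -6 route get 2001:db8::1 fibmatch
 *
 * The second form sets RTM_F_FIB_MATCH, asking for the matching FIB
 * entry itself rather than the dst the full lookup would produce.
 */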
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	int err, iif = 0, oif = 0;
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6;
	bool fibmatch;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	memset(&fl6, 0, sizeof(fl6));
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

	if (iif) {
		struct net_device *dev;
		int flags = 0;

		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			rcu_read_unlock();
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		if (!fibmatch)
			dst = ip6_route_input_lookup(net, dev, &fl6, flags);
		else
			dst = ip6_route_lookup(net, &fl6, 0);

		rcu_read_unlock();
	} else {
		fl6.flowi6_oif = oif;

		if (!fibmatch)
			dst = ip6_route_output(net, NULL, &fl6);
		else
			dst = ip6_route_lookup(net, &fl6, 0);
	}

	rt = container_of(dst, struct rt6_info, dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	skb_dst_set(skb, &rt->dst);
	if (fibmatch)
		err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	else
		err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}
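/* Route add/delete notifications built below are multicast to the
 * RTNLGRP_IPV6_ROUTE netlink group; any listener that joins the group
 * sees them - e.g. iproute2's "ip -6 monitor route".
 */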
void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
		     unsigned int nlm_flags)
{
	struct sk_buff *skb;
	struct net *net = info->nl_net;
	u32 seq;
	int err;

	err = -ENOBUFS;
	seq = info->nlh ? info->nlh->nlmsg_seq : 0;

	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
	if (!skb)
		goto errout;

	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
			    event, info->portid, seq, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}

static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	} else if (event == NETDEV_UNREGISTER &&
		   dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER can be fired multiple times by
		 * netdev_wait_allrefs(). Make sure we only drop the
		 * references once.
		 */
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}

/*
 *	/proc
 */

#ifdef CONFIG_PROC_FS

static const struct file_operations ipv6_route_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= ipv6_route_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};
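/* Reading /proc/net/rt6_stats yields one line of seven hex fields in
 * the order printed below, e.g. (values invented for illustration):
 *
 *   $ cat /proc/net/rt6_stats
 *   0012 0089 0000 004a 0000 0005 0000
 *
 * i.e. fib_nodes, fib_route_nodes, fib_rt_alloc, fib_rt_entries,
 * fib_rt_cache, dst entries in use, fib_discarded_routes.
 */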
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;

	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);

	return 0;
}

static int rt6_stats_seq_open(struct inode *inode, struct file *file)
{
	return single_open_net(inode, file, rt6_stats_seq_show);
}

static const struct file_operations rt6_stats_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= rt6_stats_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release_net,
};
#endif	/* CONFIG_PROC_FS */

#ifdef CONFIG_SYSCTL

static
int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
			      void __user *buffer, size_t *lenp, loff_t *ppos)
{
	struct net *net;
	int delay;

	if (!write)
		return -EINVAL;

	net = (struct net *)ctl->extra1;
	delay = net->ipv6.sysctl.flush_delay;
	proc_dointvec(ctl, write, buffer, lenp, ppos);
	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
	return 0;
}

struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	= "flush",
		.data		= &init_net.ipv6.sysctl.flush_delay,
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv6_sysctl_rtcache_flush
	},
	{
		.procname	= "gc_thresh",
		.data		= &ip6_dst_ops_template.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_min_interval",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{ }
};
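/* The template above is instantiated per network namespace and appears
 * under /proc/sys/net/ipv6/route/. For example (a sketch, not output
 * captured from a live system):
 *
 *   # request a flush of cached routes
 *   echo 0 > /proc/sys/net/ipv6/route/flush
 *
 *   # inspect the GC threshold
 *   sysctl net.ipv6.route.gc_thresh
 *
 * Note that "flush" is write-only (mode 0200): the handler rejects
 * reads with -EINVAL and kicks fib6_run_gc() on writes.
 */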
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
#endif

static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_ip6_dst_entries;
	net->ipv6.ip6_null_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}

static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}

static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
	proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
#endif
	return 0;
}

static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
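/* The per-netns state above is wired up through the pernet_operations
 * below: .init runs for every network namespace as it is created (e.g.
 * a hypothetical "ip netns add blue" ends up calling
 * ip6_route_net_init() for the new netns), and .exit runs on namespace
 * teardown, in reverse registration order.
 */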
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};

static int __net_init ipv6_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv6.peers = bp;
	return 0;
}

static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv6.peers;

	net->ipv6.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static struct pernet_operations ipv6_inetpeer_ops = {
	.init = ipv6_inetpeer_init,
	.exit = ipv6_inetpeer_exit,
};

static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};

static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};

void __init ip6_route_init_special_entries(void)
{
	/* Registration of the loopback device happens before this code
	 * runs, so the loopback reference in rt6_info is not taken
	 * there; take it manually for init_net.
	 */
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#endif
}

int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	ret = -ENOBUFS;
	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, 0) ||
	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, 0) ||
	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL,
			    RTNL_FLAG_DOIT_UNLOCKED))
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

out_register_late_subsys:
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
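/* Module teardown: mirrors ip6_route_init(), releasing resources in
 * the reverse order they were acquired.
 */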
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}