/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable.  otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <trace/events/fib6.h>

#include <linux/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};

static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ip6_default_advmss(const struct dst_entry *dst);
static unsigned int ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void ip6_dst_destroy(struct dst_entry *);
static void ip6_dst_ifdown(struct dst_entry *,
			   struct net_device *dev, int how);
static int ip6_dst_gc(struct dst_ops *ops);

static int ip6_pkt_discard(struct sk_buff *skb);
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void ip6_link_failure(struct sk_buff *skb);
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu);
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
			    struct sk_buff *skb);
static void rt6_dst_from_metrics_check(struct rt6_info *rt);
static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct rt6_info *rt);
static int rt6_fill_node(struct net *net,
			 struct sk_buff *skb, struct rt6_info *rt,
			 struct in6_addr *dst, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct rt6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev,
					   unsigned int pref);
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev);
#endif

struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);

static void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

static void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		spin_unlock_bh(&ul->lock);
	}
}

static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}

static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
{
	return dst_metrics_write_ptr(rt->dst.from);
}

static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (rt->rt6i_flags & RTF_PCPU)
		return rt6_pcpu_cow_metrics(rt);
	else if (rt->rt6i_flags & RTF_CACHE)
		return NULL;
	else
		return dst_cow_metrics_generic(dst, old);
}

static inline const void *choose_neigh_daddr(struct rt6_info *rt,
					     struct sk_buff *skb,
					     const void *daddr)
{
	struct in6_addr *p = &rt->rt6i_gateway;

	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}

static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
					  struct sk_buff *skb,
					  const void *daddr)
{
	struct rt6_info *rt = (struct rt6_info *) dst;
	struct neighbour *n;

	daddr = choose_neigh_daddr(rt, skb, daddr);
	n = __ipv6_neigh_lookup(dst->dev, daddr);
	if (n)
		return n;
	return neigh_create(&nd_tbl, daddr, dst->dev);
}

static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(rt, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}
*)daddr)) 236 return; 237 __ipv6_confirm_neigh(dev, daddr); 238 } 239 240 static struct dst_ops ip6_dst_ops_template = { 241 .family = AF_INET6, 242 .gc = ip6_dst_gc, 243 .gc_thresh = 1024, 244 .check = ip6_dst_check, 245 .default_advmss = ip6_default_advmss, 246 .mtu = ip6_mtu, 247 .cow_metrics = ipv6_cow_metrics, 248 .destroy = ip6_dst_destroy, 249 .ifdown = ip6_dst_ifdown, 250 .negative_advice = ip6_negative_advice, 251 .link_failure = ip6_link_failure, 252 .update_pmtu = ip6_rt_update_pmtu, 253 .redirect = rt6_do_redirect, 254 .local_out = __ip6_local_out, 255 .neigh_lookup = ip6_neigh_lookup, 256 .confirm_neigh = ip6_confirm_neigh, 257 }; 258 259 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst) 260 { 261 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); 262 263 return mtu ? : dst->dev->mtu; 264 } 265 266 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, 267 struct sk_buff *skb, u32 mtu) 268 { 269 } 270 271 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk, 272 struct sk_buff *skb) 273 { 274 } 275 276 static struct dst_ops ip6_dst_blackhole_ops = { 277 .family = AF_INET6, 278 .destroy = ip6_dst_destroy, 279 .check = ip6_dst_check, 280 .mtu = ip6_blackhole_mtu, 281 .default_advmss = ip6_default_advmss, 282 .update_pmtu = ip6_rt_blackhole_update_pmtu, 283 .redirect = ip6_rt_blackhole_redirect, 284 .cow_metrics = dst_cow_metrics_generic, 285 .neigh_lookup = ip6_neigh_lookup, 286 }; 287 288 static const u32 ip6_template_metrics[RTAX_MAX] = { 289 [RTAX_HOPLIMIT - 1] = 0, 290 }; 291 292 static const struct rt6_info ip6_null_entry_template = { 293 .dst = { 294 .__refcnt = ATOMIC_INIT(1), 295 .__use = 1, 296 .obsolete = DST_OBSOLETE_FORCE_CHK, 297 .error = -ENETUNREACH, 298 .input = ip6_pkt_discard, 299 .output = ip6_pkt_discard_out, 300 }, 301 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), 302 .rt6i_protocol = RTPROT_KERNEL, 303 .rt6i_metric = ~(u32) 0, 304 .rt6i_ref = ATOMIC_INIT(1), 305 }; 306 307 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 308 309 static const struct rt6_info ip6_prohibit_entry_template = { 310 .dst = { 311 .__refcnt = ATOMIC_INIT(1), 312 .__use = 1, 313 .obsolete = DST_OBSOLETE_FORCE_CHK, 314 .error = -EACCES, 315 .input = ip6_pkt_prohibit, 316 .output = ip6_pkt_prohibit_out, 317 }, 318 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), 319 .rt6i_protocol = RTPROT_KERNEL, 320 .rt6i_metric = ~(u32) 0, 321 .rt6i_ref = ATOMIC_INIT(1), 322 }; 323 324 static const struct rt6_info ip6_blk_hole_entry_template = { 325 .dst = { 326 .__refcnt = ATOMIC_INIT(1), 327 .__use = 1, 328 .obsolete = DST_OBSOLETE_FORCE_CHK, 329 .error = -EINVAL, 330 .input = dst_discard, 331 .output = dst_discard_out, 332 }, 333 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), 334 .rt6i_protocol = RTPROT_KERNEL, 335 .rt6i_metric = ~(u32) 0, 336 .rt6i_ref = ATOMIC_INIT(1), 337 }; 338 339 #endif 340 341 static void rt6_info_init(struct rt6_info *rt) 342 { 343 struct dst_entry *dst = &rt->dst; 344 345 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst)); 346 INIT_LIST_HEAD(&rt->rt6i_siblings); 347 INIT_LIST_HEAD(&rt->rt6i_uncached); 348 } 349 350 /* allocate dst with ip6_dst_ops */ 351 static struct rt6_info *__ip6_dst_alloc(struct net *net, 352 struct net_device *dev, 353 int flags) 354 { 355 struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev, 356 1, DST_OBSOLETE_FORCE_CHK, flags); 357 358 if (rt) 359 rt6_info_init(rt); 360 361 return rt; 362 } 363 364 struct rt6_info *ip6_dst_alloc(struct net *net, 365 struct net_device *dev, 366 int flags) 367 { 368 
struct rt6_info *ip6_dst_alloc(struct net *net,
			       struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);

	if (rt) {
		rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
		if (rt->rt6i_pcpu) {
			int cpu;

			for_each_possible_cpu(cpu) {
				struct rt6_info **p;

				p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
				/* no one shares rt */
				*p = NULL;
			}
		} else {
			dst_release_immediate(&rt->dst);
			return NULL;
		}
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);

static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct dst_entry *from = dst->from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	free_percpu(rt->rt6i_pcpu);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	dst->from = NULL;
	dst_release(from);
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}

static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

static bool rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (rt->dst.from) {
		return rt6_check_expired((struct rt6_info *) rt->dst.from);
	}
	return false;
}

static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
					     struct flowi6 *fl6, int oif,
					     int strict)
{
	struct rt6_info *sibling, *next_sibling;
	int route_chosen;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(fl6, NULL);

	route_chosen = fl6->mp_hash % (match->rt6i_nsiblings + 1);
	/* Don't change the route if route_chosen == 0
	 * (the sibling list does not include the current route)
	 */
	if (route_chosen)
		list_for_each_entry_safe(sibling, next_sibling,
				&match->rt6i_siblings, rt6i_siblings) {
			route_chosen--;
			if (route_chosen == 0) {
				if (rt6_score_route(sibling, oif, strict) < 0)
					break;
				match = sibling;
				break;
			}
		}
	return match;
}

/*
 *	Route lookup. Any table->tb6_lock is implied.
 */
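/* Walk the leaf's route list and pick the entry that matches the
 * requested output interface (oif) or, when no oif is given, the one
 * whose device owns the source address.  Falls back to the original
 * route, or to ip6_null_entry under RT6_LOOKUP_F_IFACE.
 */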
static inline struct rt6_info *rt6_device_match(struct net *net,
						struct rt6_info *rt,
						const struct in6_addr *saddr,
						int oif,
						int flags)
{
	struct rt6_info *local = NULL;
	struct rt6_info *sprt;

	if (!oif && ipv6_addr_any(saddr))
		goto out;

	for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
		struct net_device *dev = sprt->dst.dev;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
			if (dev->flags & IFF_LOOPBACK) {
				if (!sprt->rt6i_idev ||
				    sprt->rt6i_idev->dev->ifindex != oif) {
					if (flags & RT6_LOOKUP_F_IFACE)
						continue;
					if (local &&
					    local->rt6i_idev->dev->ifindex == oif)
						continue;
				}
				local = sprt;
			}
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif) {
		if (local)
			return local;

		if (flags & RT6_LOOKUP_F_IFACE)
			return net->ipv6.ip6_null_entry;
	}
out:
	return rt;
}

#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}
static void rt6_probe(struct rt6_info *rt)
{
	struct __rt6_probe_work *work;
	struct neighbour *neigh;
	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
		return;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		work = NULL;
		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated +
			       rt->rt6i_idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = rt->rt6i_gateway;
		dev_hold(rt->dst.dev);
		work->dev = rt->dst.dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif

/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
static inline int rt6_check_dev(struct rt6_info *rt, int oif)
{
	struct net_device *dev = rt->dst.dev;
	if (!oif || dev->ifindex == oif)
		return 2;
	if ((dev->flags & IFF_LOOPBACK) &&
	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
		return 1;
	return 0;
}

static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
{
	struct neighbour *neigh;
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;

	if (rt->rt6i_flags & RTF_NONEXTHOP ||
	    !(rt->rt6i_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
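/* Score a route for default router selection: the two low bits carry
 * the device match from rt6_check_dev(), the bits above them the
 * RFC 4191 route preference.  Returns a negative rt6_nud_state value
 * when the neighbour check fails under RT6_LOOKUP_F_REACHABLE.
 */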
static int rt6_score_route(struct rt6_info *rt, int oif,
			   int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}

static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
				   int *mpri, struct rt6_info *match,
				   bool *do_rr)
{
	int m;
	bool match_do_rr = false;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *dev = rt->dst.dev;

	if (dev && !netif_carrier_ok(dev) &&
	    idev->cnf.ignore_routes_with_linkdown &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (rt6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}

static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
				     struct rt6_info *rr_head,
				     u32 metric, int oif, int strict,
				     bool *do_rr)
{
	struct rt6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	for (rt = cont; rt; rt = rt->dst.rt6_next)
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}

static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
{
	struct rt6_info *match, *rt0;
	struct net *net;
	bool do_rr = false;

	rt0 = fn->rr_ptr;
	if (!rt0)
		fn->rr_ptr = rt0 = fn->leaf;

	match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct rt6_info *next = rt0->dst.rt6_next;

		/* no entries matched; do round-robin */
		if (!next || next->rt6i_metric != rt0->rt6i_metric)
			next = fn->leaf;

		if (next != rt0)
			fn->rr_ptr = next;
	}

	net = dev_net(rt0->dst.dev);
	return match ? match : net->ipv6.ip6_null_entry;
}

static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
{
	return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
}
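/* Process a Route Information Option from a Router Advertisement
 * (RFC 4191): validate the option length and prefix, then add, refresh
 * or delete the corresponding RTF_ROUTEINFO route.
 */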
#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			rt6_clean_expires(rt);
		else
			rt6_set_expires(rt, jiffies + HZ * lifetime);

		ip6_rt_put(rt);
	}
	return 0;
}
#endif

static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = fn->parent;
		if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
			fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}

static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6, int flags)
{
	struct fib6_node *fn;
	struct rt6_info *rt;

	read_lock_bh(&table->tb6_lock);
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	rt = fn->leaf;
	rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
	if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
		rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}
	dst_use(&rt->dst, jiffies);
	read_unlock_bh(&table->tb6_lock);

	trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);

	return rt;
}

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   int flags)
{
	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
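/* Convenience wrapper around fib6_rule_lookup(): builds a flow from
 * daddr/saddr/oif and returns the matching rt6_info with a reference
 * held, or NULL if the lookup ended in an error route.
 */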
struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);

/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason,
 * the route is released.
 * Caller must hold dst before calling it.
 */

static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
			struct mx6_config *mxc,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->rt6i_table;
	write_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
	write_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct rt6_info *rt)
{
	struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
	struct mx6_config mxc = { .mx = NULL, };

	/* Hold dst to account for the reference from the fib6 tree */
	dst_hold(&rt->dst);
	return __ip6_ins_rt(rt, &info, &mxc, NULL);
}

/* called with rcu_lock held */
static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
{
	struct net_device *dev = rt->dst.dev;

	if (rt->rt6i_flags & RTF_LOCAL) {
		/* for copies of local routes, dst->dev needs to be the
		 * device itself if it is a master device, the master
		 * device if the device is enslaved, and the loopback
		 * device as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->rt6i_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}
static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = (struct rt6_info *)ort->dst.from;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(ort);
	rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
	rcu_read_unlock();
	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->rt6i_metric = 0;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->rt6i_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}

static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
{
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
	rcu_read_unlock();
	if (!pcpu_rt)
		return NULL;
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

/* It should be called with read_lock_bh(&tb6_lock) acquired */
static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt) {
		dst_hold(&pcpu_rt->dst);
		rt6_dst_from_metrics_check(pcpu_rt);
	}
	return pcpu_rt;
}
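/* Install a new RTF_PCPU clone of rt in this CPU's slot.  cmpxchg()
 * guards against a concurrent writer; if the slot was filled first by
 * someone else, the freshly allocated clone is dropped and the
 * existing one is returned instead.
 */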
static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
{
	struct fib6_table *table = rt->rt6i_table;
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		struct net *net = dev_net(rt->dst.dev);

		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	read_lock_bh(&table->tb6_lock);
	if (rt->rt6i_pcpu) {
		p = this_cpu_ptr(rt->rt6i_pcpu);
		prev = cmpxchg(p, NULL, pcpu_rt);
		if (prev) {
			/* If someone did it before us, return prev instead */
			dst_release_immediate(&pcpu_rt->dst);
			pcpu_rt = prev;
		}
	} else {
		/* rt has been removed from the fib6 tree
		 * before we have a chance to acquire the read_lock.
		 * In this case, don't bother to create a pcpu rt
		 * since rt is going away anyway.  The next
		 * dst_check() will trigger a re-lookup.
		 */
		dst_release_immediate(&pcpu_rt->dst);
		pcpu_rt = rt;
	}
	dst_hold(&pcpu_rt->dst);
	rt6_dst_from_metrics_check(pcpu_rt);
	read_unlock_bh(&table->tb6_lock);
	return pcpu_rt;
}
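/* Core policy-routing lookup.  The result is one of:
 *   - the null entry or an RTF_CACHE entry, returned as-is,
 *   - an uncached RTF_CACHE clone (FLOWI_FLAG_KNOWN_NH without a
 *     gateway), kept on the uncached list instead of the fib6 tree,
 *   - a per-cpu copy of the matched fib6 route entry.
 */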
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6, int flags)
{
	struct fib6_node *fn, *saved_fn;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	read_lock_bh(&table->tb6_lock);

	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	rt = rt6_select(fn, oif, strict);
	if (rt->rt6i_nsiblings)
		rt = rt6_multipath_select(rt, fl6, oif, strict);
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
		dst_use(&rt->dst, jiffies);
		read_unlock_bh(&table->tb6_lock);

		rt6_dst_from_metrics_check(rt);

		trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(rt->rt6i_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */

		struct rt6_info *uncached_rt;

		dst_use(&rt->dst, jiffies);
		read_unlock_bh(&table->tb6_lock);

		uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
		dst_release(&rt->dst);

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
		return uncached_rt;

	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		rt->dst.lastuse = jiffies;
		rt->dst.__use++;
		pcpu_rt = rt6_get_pcpu_route(rt);

		if (pcpu_rt) {
			read_unlock_bh(&table->tb6_lock);
		} else {
			/* We have to do the read_unlock first
			 * because rt6_make_pcpu_route() may trigger
			 * ip6_dst_gc() which will take the write_lock.
			 */
			dst_hold(&rt->dst);
			read_unlock_bh(&table->tb6_lock);
			pcpu_rt = rt6_make_pcpu_route(rt);
			dst_release(&rt->dst);
		}

		trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);

static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
					    struct flowi6 *fl6, int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
}

struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6, int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);

static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = icmp6_hdr(skb);
	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
out:
	memset(keys, 0, sizeof(*keys));
	keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
	keys->addrs.v6addrs.src = key_iph->saddr;
	keys->addrs.v6addrs.dst = key_iph->daddr;
	keys->tags.flow_label = ip6_flowinfo(key_iph);
	keys->basic.ip_proto = key_iph->nexthdr;
}

/* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
{
	struct flow_keys hash_keys;

	if (skb) {
		ip6_multipath_l3_keys(skb, &hash_keys);
		return flow_hash_from_keys(&hash_keys);
	}

	return get_hash_from_flowi6(fl6);
}

void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
	skb_dst_drop(skb);
	skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
}

static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
					     struct flowi6 *fl6, int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
}
struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	if (rt6_need_strict(&fl6->daddr)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);

struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_NONE, 0);
	if (rt) {
		rt6_info_init(rt);

		new = &rt->dst;
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
		rt->rt6i_metric = 0;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}

/*
 *	Destination cache support functions
 */

static void rt6_dst_from_metrics_check(struct rt6_info *rt)
{
	if (rt->dst.from &&
	    dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
		dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
}

static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
{
	if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
		return NULL;

	if (rt6_check_expired(rt))
		return NULL;

	return &rt->dst;
}

static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
{
	if (!__rt6_check_expired(rt) &&
	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
	    rt6_check((struct rt6_info *)(rt->dst.from), cookie))
		return &rt->dst;
	else
		return NULL;
}
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rt6_info *rt;

	rt = (struct rt6_info *) dst;

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	rt6_dst_from_metrics_check(rt);

	if (rt->rt6i_flags & RTF_PCPU ||
	    (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from))
		return rt6_dst_from_check(rt, cookie);
	else
		return rt6_check(rt, cookie);
}

static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			if (rt6_check_expired(rt)) {
				ip6_del_rt(rt);
				dst = NULL;
			}
		} else {
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}

static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			if (dst_hold_safe(&rt->dst))
				ip6_del_rt(rt);
		} else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) {
			rt->rt6i_node->fn_sernum = -1;
		}
	}
}

static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
	struct net *net = dev_net(rt->dst.dev);

	rt->rt6i_flags |= RTF_MODIFIED;
	rt->rt6i_pmtu = mtu;
	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}

static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
{
	return !(rt->rt6i_flags & RTF_CACHE) &&
		(rt->rt6i_flags & RTF_PCPU || rt->rt6i_node);
}

static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	if (rt6->rt6i_flags & RTF_LOCAL)
		return;

	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}
	dst_confirm_neigh(dst, daddr);
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
	} else if (daddr) {
		struct rt6_info *nrt6;

		nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);

			/* ip6_ins_rt(nrt6) will bump the
			 * rt6->rt6i_node->fn_sernum
			 * which will fail the next rt6_check() and
			 * invalidate the sk->sk_dst_cache.
			 */
			ip6_ins_rt(nrt6);
			/* Release the reference taken in
			 * ip6_rt_cache_alloc()
			 */
			dst_release(&nrt6->dst);
		}
	}
}

static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu)
{
	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
}
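/* Apply an ICMPv6 Packet Too Big update for a packet we do not have a
 * socket for: rebuild the flow from the offending packet's IPv6 header,
 * look up the route and update its PMTU.
 */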
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
		     int oif, u32 mark, kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
	fl6.daddr = iph->daddr;
	fl6.saddr = iph->saddr;
	fl6.flowlabel = ip6_flowinfo(iph);
	fl6.flowi6_uid = uid;

	dst = ip6_route_output(net, NULL, &fl6);
	if (!dst->error)
		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);

void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	struct dst_entry *dst;

	ip6_update_pmtu(skb, sock_net(sk), mtu,
			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);

	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);

/* Handle redirects */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;
};

static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *rt;
	struct fib6_node *fn;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from the appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	read_lock_bh(&table->tb6_lock);
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
		if (rt6_check_expired(rt))
			continue;
		if (rt->dst.error)
			break;
		if (!(rt->rt6i_flags & RTF_GATEWAY))
			continue;
		if (fl6->flowi6_oif != rt->dst.dev->ifindex)
			continue;
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
			continue;
		break;
	}

	if (!rt)
		rt = net->ipv6.ip6_null_entry;
	else if (rt->dst.error) {
		rt = net->ipv6.ip6_null_entry;
		goto out;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

out:
	dst_hold(&rt->dst);

	read_unlock_bh(&table->tb6_lock);

	trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
	return rt;
};

static struct dst_entry *ip6_route_redirect(struct net *net,
					    const struct flowi6 *fl6,
					    const struct in6_addr *gateway)
{
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip6rd_flowi rdfl;

	rdfl.fl6 = *fl6;
	rdfl.gateway = *gateway;

	return fib6_rule_lookup(net, &rdfl.fl6,
				flags, __ip6_route_redirect);
}
void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
		  kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_iif = LOOPBACK_IFINDEX;
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark;
	fl6.daddr = iph->daddr;
	fl6.saddr = iph->saddr;
	fl6.flowlabel = ip6_flowinfo(iph);
	fl6.flowi6_uid = uid;

	dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_redirect);

void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
			    u32 mark)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_iif = LOOPBACK_IFINDEX;
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark;
	fl6.daddr = msg->dest;
	fl6.saddr = iph->daddr;
	fl6.flowi6_uid = sock_net_uid(net, NULL);

	dst = ip6_route_redirect(net, &fl6, &iph->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}

void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
		     sk->sk_uid);
}
EXPORT_SYMBOL_GPL(ip6_sk_redirect);

static unsigned int ip6_default_advmss(const struct dst_entry *dst)
{
	struct net_device *dev = dst->dev;
	unsigned int mtu = dst_mtu(dst);
	struct net *net = dev_net(dev);

	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);

	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;

	/*
	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
	 * IPV6_MAXPLEN is also valid and means: "any MSS,
	 * rely only on pmtu discovery"
	 */
	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
		mtu = IPV6_MAXPLEN;
	return mtu;
}

static unsigned int ip6_mtu(const struct dst_entry *dst)
{
	const struct rt6_info *rt = (const struct rt6_info *)dst;
	unsigned int mtu = rt->rt6i_pmtu;
	struct inet6_dev *idev;

	if (mtu)
		goto out;

	mtu = dst_metric_raw(dst, RTAX_MTU);
	if (mtu)
		goto out;

	mtu = IPV6_MIN_MTU;

	rcu_read_lock();
	idev = __in6_dev_get(dst->dev);
	if (idev)
		mtu = idev->cnf.mtu6;
	rcu_read_unlock();

out:
	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}

struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		in6_dev_put(idev);
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	rt->dst.flags |= DST_HOST;
	rt->dst.output = ip6_output;
	rt->rt6i_gateway = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	rt->rt6i_idev = idev;
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Add this dst into uncached_list so that rt6_ifdown() can
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}
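/* dst garbage collector.  Skips the scan entirely while the entry
 * count stays under ip6_rt_max_size and the minimum interval has not
 * elapsed; otherwise runs fib6_run_gc() with an expiry that grows on
 * every pass and decays by 2^-elasticity afterwards.
 */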
static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout >> 1;
out:
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire >> rt_elasticity;
	return entries > rt_max_size;
}

static int ip6_convert_metrics(struct mx6_config *mxc,
			       const struct fib6_config *cfg)
{
	bool ecn_ca = false;
	struct nlattr *nla;
	int remaining;
	u32 *mp;

	if (!cfg->fc_mx)
		return 0;

	mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
	if (unlikely(!mp))
		return -ENOMEM;

	nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
		int type = nla_type(nla);
		u32 val;

		if (!type)
			continue;
		if (unlikely(type > RTAX_MAX))
			goto err;

		if (type == RTAX_CC_ALGO) {
			char tmp[TCP_CA_NAME_MAX];

			nla_strlcpy(tmp, nla, sizeof(tmp));
			val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
			if (val == TCP_CA_UNSPEC)
				goto err;
		} else {
			val = nla_get_u32(nla);
		}
		if (type == RTAX_HOPLIMIT && val > 255)
			val = 255;
		if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
			goto err;

		mp[type - 1] = val;
		__set_bit(type - 1, mxc->mx_valid);
	}

	if (ecn_ca) {
		__set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
		mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
	}

	mxc->mx = mp;
	return 0;
err:
	kfree(mp);
	return -EINVAL;
}

static struct rt6_info *ip6_nh_lookup_table(struct net *net,
					    struct fib6_config *cfg,
					    const struct in6_addr *gw_addr)
{
	struct flowi6 fl6 = {
		.flowi6_oif = cfg->fc_ifindex,
		.daddr = *gw_addr,
		.saddr = cfg->fc_prefsrc,
	};
	struct fib6_table *table;
	struct rt6_info *rt;
	int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE;

	table = fib6_get_table(net, cfg->fc_table);
	if (!table)
		return NULL;

	if (!ipv6_addr_any(&cfg->fc_prefsrc))
		flags |= RT6_LOOKUP_F_HAS_SADDR;

	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);

	/* if table lookup failed, fall back to full lookup */
	if (rt == net->ipv6.ip6_null_entry) {
		ip6_rt_put(rt);
		rt = NULL;
	}

	return rt;
}
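/* Translate a netlink/ioctl route request (fib6_config) into a fully
 * initialised rt6_info.  Validates prefix lengths, resolves the egress
 * device and gateway, and classifies reject routes; the entry is not
 * yet linked into the fib6 tree.
 */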
static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
					      struct netlink_ext_ack *extack)
{
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct rt6_info *rt = NULL;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	struct fib6_table *table;
	int addr_type;
	int err = -EINVAL;

	/* RTF_PCPU is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_PCPU) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
		goto out;
	}

	if (cfg->fc_dst_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		goto out;
	}
	if (cfg->fc_src_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid source address length");
		goto out;
	}
#ifndef CONFIG_IPV6_SUBTREES
	if (cfg->fc_src_len) {
		NL_SET_ERR_MSG(extack,
			       "Specifying source address requires IPV6_SUBTREES to be enabled");
		goto out;
	}
#endif
	if (cfg->fc_ifindex) {
		err = -ENODEV;
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (cfg->fc_metric == 0)
		cfg->fc_metric = IP6_RT_PRIO_USER;

	err = -ENOBUFS;
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		table = fib6_get_table(net, cfg->fc_table);
		if (!table) {
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}

	if (!table)
		goto out;

	rt = ip6_dst_alloc(net, NULL,
			   (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);

	if (!rt) {
		err = -ENOMEM;
		goto out;
	}

	if (cfg->fc_flags & RTF_EXPIRES)
		rt6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
	else
		rt6_clean_expires(rt);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->rt6i_protocol = cfg->fc_protocol;

	addr_type = ipv6_addr_type(&cfg->fc_dst);

	if (addr_type & IPV6_ADDR_MULTICAST)
		rt->dst.input = ip6_mc_input;
	else if (cfg->fc_flags & RTF_LOCAL)
		rt->dst.input = ip6_input;
	else
		rt->dst.input = ip6_forward;

	rt->dst.output = ip6_output;

	if (cfg->fc_encap) {
		struct lwtunnel_state *lwtstate;

		err = lwtunnel_build_state(cfg->fc_encap_type,
					   cfg->fc_encap, AF_INET6, cfg,
					   &lwtstate, extack);
		if (err)
			goto out;
		rt->dst.lwtstate = lwtstate_get(lwtstate);
		if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
			rt->dst.lwtstate->orig_output = rt->dst.output;
			rt->dst.output = lwtunnel_output;
		}
		if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
			rt->dst.lwtstate->orig_input = rt->dst.input;
			rt->dst.input = lwtunnel_input;
		}
	}

	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->rt6i_dst.plen = cfg->fc_dst_len;
	if (rt->rt6i_dst.plen == 128)
		rt->dst.flags |= DST_HOST;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->rt6i_src.plen = cfg->fc_src_len;
#endif

	rt->rt6i_metric = cfg->fc_metric;

	/* We cannot add true routes via loopback here,
	   they would result in kernel looping; promote them to reject routes
	 */
	if ((cfg->fc_flags & RTF_REJECT) ||
	    (dev && (dev->flags & IFF_LOOPBACK) &&
	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
	     !(cfg->fc_flags & RTF_LOCAL))) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
		switch (cfg->fc_type) {
		case RTN_BLACKHOLE:
			rt->dst.error = -EINVAL;
			rt->dst.output = dst_discard_out;
			rt->dst.input = dst_discard;
			break;
		case RTN_PROHIBIT:
			rt->dst.error = -EACCES;
			rt->dst.output = ip6_pkt_prohibit_out;
			rt->dst.input = ip6_pkt_prohibit;
			break;
		case RTN_THROW:
		case RTN_UNREACHABLE:
		default:
			rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
					: (cfg->fc_type == RTN_UNREACHABLE)
					? -EHOSTUNREACH : -ENETUNREACH;
			rt->dst.output = ip6_pkt_discard_out;
			rt->dst.input = ip6_pkt_discard;
			break;
		}
		goto install_route;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		const struct in6_addr *gw_addr;
		int gwa_type;

		gw_addr = &cfg->fc_gateway;
		gwa_type = ipv6_addr_type(gw_addr);

		/* if gw_addr is local we will fail to detect this in case
		 * address is still TENTATIVE (DAD in progress). rt6_lookup()
		 * will return already-added prefix route via interface that
		 * prefix route was assigned to, which might be non-loopback.
		 */
		err = -EINVAL;
		if (ipv6_chk_addr_and_flags(net, gw_addr,
					    gwa_type & IPV6_ADDR_LINKLOCAL ?
					    dev : NULL, 0, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid gateway address");
			goto out;
		}
		rt->rt6i_gateway = *gw_addr;

		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
			struct rt6_info *grt = NULL;

			/* IPv6 strictly forbids using non-link-local
			   addresses as nexthop addresses.
			   Otherwise, the router will not be able to send
			   redirects.  That is very good, but in some (rare!)
			   circumstances (SIT, PtP, NBMA NOARP links) it is
			   handy to allow some exceptions. --ANK
			   We allow IPv4-mapped nexthops to support RFC4798-type
			   addressing
			 */
--ANK 2073 We allow IPv4-mapped nexthops to support RFC4798-type 2074 addressing 2075 */ 2076 if (!(gwa_type & (IPV6_ADDR_UNICAST | 2077 IPV6_ADDR_MAPPED))) { 2078 NL_SET_ERR_MSG(extack, 2079 "Invalid gateway address"); 2080 goto out; 2081 } 2082 2083 if (cfg->fc_table) { 2084 grt = ip6_nh_lookup_table(net, cfg, gw_addr); 2085 2086 if (grt) { 2087 if (grt->rt6i_flags & RTF_GATEWAY || 2088 (dev && dev != grt->dst.dev)) { 2089 ip6_rt_put(grt); 2090 grt = NULL; 2091 } 2092 } 2093 } 2094 2095 if (!grt) 2096 grt = rt6_lookup(net, gw_addr, NULL, 2097 cfg->fc_ifindex, 1); 2098 2099 err = -EHOSTUNREACH; 2100 if (!grt) 2101 goto out; 2102 if (dev) { 2103 if (dev != grt->dst.dev) { 2104 ip6_rt_put(grt); 2105 goto out; 2106 } 2107 } else { 2108 dev = grt->dst.dev; 2109 idev = grt->rt6i_idev; 2110 dev_hold(dev); 2111 in6_dev_hold(grt->rt6i_idev); 2112 } 2113 if (!(grt->rt6i_flags & RTF_GATEWAY)) 2114 err = 0; 2115 ip6_rt_put(grt); 2116 2117 if (err) 2118 goto out; 2119 } 2120 err = -EINVAL; 2121 if (!dev) { 2122 NL_SET_ERR_MSG(extack, "Egress device not specified"); 2123 goto out; 2124 } else if (dev->flags & IFF_LOOPBACK) { 2125 NL_SET_ERR_MSG(extack, 2126 "Egress device can not be loopback device for this route"); 2127 goto out; 2128 } 2129 } 2130 2131 err = -ENODEV; 2132 if (!dev) 2133 goto out; 2134 2135 if (!ipv6_addr_any(&cfg->fc_prefsrc)) { 2136 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { 2137 NL_SET_ERR_MSG(extack, "Invalid source address"); 2138 err = -EINVAL; 2139 goto out; 2140 } 2141 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc; 2142 rt->rt6i_prefsrc.plen = 128; 2143 } else 2144 rt->rt6i_prefsrc.plen = 0; 2145 2146 rt->rt6i_flags = cfg->fc_flags; 2147 2148 install_route: 2149 rt->dst.dev = dev; 2150 rt->rt6i_idev = idev; 2151 rt->rt6i_table = table; 2152 2153 cfg->fc_nlinfo.nl_net = dev_net(dev); 2154 2155 return rt; 2156 out: 2157 if (dev) 2158 dev_put(dev); 2159 if (idev) 2160 in6_dev_put(idev); 2161 if (rt) 2162 dst_release_immediate(&rt->dst); 2163 2164 return ERR_PTR(err); 2165 } 2166 2167 int ip6_route_add(struct fib6_config *cfg, 2168 struct netlink_ext_ack *extack) 2169 { 2170 struct mx6_config mxc = { .mx = NULL, }; 2171 struct rt6_info *rt; 2172 int err; 2173 2174 rt = ip6_route_info_create(cfg, extack); 2175 if (IS_ERR(rt)) { 2176 err = PTR_ERR(rt); 2177 rt = NULL; 2178 goto out; 2179 } 2180 2181 err = ip6_convert_metrics(&mxc, cfg); 2182 if (err) 2183 goto out; 2184 2185 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack); 2186 2187 kfree(mxc.mx); 2188 2189 return err; 2190 out: 2191 if (rt) 2192 dst_release_immediate(&rt->dst); 2193 2194 return err; 2195 } 2196 2197 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info) 2198 { 2199 int err; 2200 struct fib6_table *table; 2201 struct net *net = dev_net(rt->dst.dev); 2202 2203 if (rt == net->ipv6.ip6_null_entry) { 2204 err = -ENOENT; 2205 goto out; 2206 } 2207 2208 table = rt->rt6i_table; 2209 write_lock_bh(&table->tb6_lock); 2210 err = fib6_del(rt, info); 2211 write_unlock_bh(&table->tb6_lock); 2212 2213 out: 2214 ip6_rt_put(rt); 2215 return err; 2216 } 2217 2218 int ip6_del_rt(struct rt6_info *rt) 2219 { 2220 struct nl_info info = { 2221 .nl_net = dev_net(rt->dst.dev), 2222 }; 2223 return __ip6_del_rt(rt, &info); 2224 } 2225 2226 static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg) 2227 { 2228 struct nl_info *info = &cfg->fc_nlinfo; 2229 struct net *net = info->nl_net; 2230 struct sk_buff *skb = NULL; 2231 struct fib6_table *table; 2232 int err = -ENOENT; 2233 2234 if (rt == 
net->ipv6.ip6_null_entry) 2235 goto out_put; 2236 table = rt->rt6i_table; 2237 write_lock_bh(&table->tb6_lock); 2238 2239 if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) { 2240 struct rt6_info *sibling, *next_sibling; 2241 2242 /* prefer to send a single notification with all hops */ 2243 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 2244 if (skb) { 2245 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; 2246 2247 if (rt6_fill_node(net, skb, rt, 2248 NULL, NULL, 0, RTM_DELROUTE, 2249 info->portid, seq, 0) < 0) { 2250 kfree_skb(skb); 2251 skb = NULL; 2252 } else 2253 info->skip_notify = 1; 2254 } 2255 2256 list_for_each_entry_safe(sibling, next_sibling, 2257 &rt->rt6i_siblings, 2258 rt6i_siblings) { 2259 err = fib6_del(sibling, info); 2260 if (err) 2261 goto out_unlock; 2262 } 2263 } 2264 2265 err = fib6_del(rt, info); 2266 out_unlock: 2267 write_unlock_bh(&table->tb6_lock); 2268 out_put: 2269 ip6_rt_put(rt); 2270 2271 if (skb) { 2272 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 2273 info->nlh, gfp_any()); 2274 } 2275 return err; 2276 } 2277 2278 static int ip6_route_del(struct fib6_config *cfg, 2279 struct netlink_ext_ack *extack) 2280 { 2281 struct fib6_table *table; 2282 struct fib6_node *fn; 2283 struct rt6_info *rt; 2284 int err = -ESRCH; 2285 2286 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); 2287 if (!table) { 2288 NL_SET_ERR_MSG(extack, "FIB table does not exist"); 2289 return err; 2290 } 2291 2292 read_lock_bh(&table->tb6_lock); 2293 2294 fn = fib6_locate(&table->tb6_root, 2295 &cfg->fc_dst, cfg->fc_dst_len, 2296 &cfg->fc_src, cfg->fc_src_len); 2297 2298 if (fn) { 2299 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { 2300 if ((rt->rt6i_flags & RTF_CACHE) && 2301 !(cfg->fc_flags & RTF_CACHE)) 2302 continue; 2303 if (cfg->fc_ifindex && 2304 (!rt->dst.dev || 2305 rt->dst.dev->ifindex != cfg->fc_ifindex)) 2306 continue; 2307 if (cfg->fc_flags & RTF_GATEWAY && 2308 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) 2309 continue; 2310 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric) 2311 continue; 2312 if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol) 2313 continue; 2314 dst_hold(&rt->dst); 2315 read_unlock_bh(&table->tb6_lock); 2316 2317 /* if gateway was specified only delete the one hop */ 2318 if (cfg->fc_flags & RTF_GATEWAY) 2319 return __ip6_del_rt(rt, &cfg->fc_nlinfo); 2320 2321 return __ip6_del_rt_siblings(rt, cfg); 2322 } 2323 } 2324 read_unlock_bh(&table->tb6_lock); 2325 2326 return err; 2327 } 2328 2329 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) 2330 { 2331 struct netevent_redirect netevent; 2332 struct rt6_info *rt, *nrt = NULL; 2333 struct ndisc_options ndopts; 2334 struct inet6_dev *in6_dev; 2335 struct neighbour *neigh; 2336 struct rd_msg *msg; 2337 int optlen, on_link; 2338 u8 *lladdr; 2339 2340 optlen = skb_tail_pointer(skb) - skb_transport_header(skb); 2341 optlen -= sizeof(*msg); 2342 2343 if (optlen < 0) { 2344 net_dbg_ratelimited("rt6_do_redirect: packet too short\n"); 2345 return; 2346 } 2347 2348 msg = (struct rd_msg *)icmp6_hdr(skb); 2349 2350 if (ipv6_addr_is_multicast(&msg->dest)) { 2351 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n"); 2352 return; 2353 } 2354 2355 on_link = 0; 2356 if (ipv6_addr_equal(&msg->dest, &msg->target)) { 2357 on_link = 1; 2358 } else if (ipv6_addr_type(&msg->target) != 2359 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { 2360 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n"); 2361 
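/* Per RFC 4861, 8.1, the Target of a valid redirect is either the link-local address of a better first-hop router or, for an on-link redirect, identical to the Destination; the two checks above enforce exactly that. */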
return; 2362 } 2363 2364 in6_dev = __in6_dev_get(skb->dev); 2365 if (!in6_dev) 2366 return; 2367 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects) 2368 return; 2369 2370 /* RFC2461 8.1: 2371 * The IP source address of the Redirect MUST be the same as the current 2372 * first-hop router for the specified ICMP Destination Address. 2373 */ 2374 2375 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) { 2376 net_dbg_ratelimited("rt6_redirect: invalid ND options\n"); 2377 return; 2378 } 2379 2380 lladdr = NULL; 2381 if (ndopts.nd_opts_tgt_lladdr) { 2382 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, 2383 skb->dev); 2384 if (!lladdr) { 2385 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n"); 2386 return; 2387 } 2388 } 2389 2390 rt = (struct rt6_info *) dst; 2391 if (rt->rt6i_flags & RTF_REJECT) { 2392 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n"); 2393 return; 2394 } 2395 2396 /* Redirect received -> path was valid. 2397 * Look, redirects are sent only in response to data packets, 2398 * so that this nexthop apparently is reachable. --ANK 2399 */ 2400 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr); 2401 2402 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1); 2403 if (!neigh) 2404 return; 2405 2406 /* 2407 * We have finally decided to accept it. 2408 */ 2409 2410 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, 2411 NEIGH_UPDATE_F_WEAK_OVERRIDE| 2412 NEIGH_UPDATE_F_OVERRIDE| 2413 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER| 2414 NEIGH_UPDATE_F_ISROUTER)), 2415 NDISC_REDIRECT, &ndopts); 2416 2417 nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL); 2418 if (!nrt) 2419 goto out; 2420 2421 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE; 2422 if (on_link) 2423 nrt->rt6i_flags &= ~RTF_GATEWAY; 2424 2425 nrt->rt6i_protocol = RTPROT_REDIRECT; 2426 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; 2427 2428 if (ip6_ins_rt(nrt)) 2429 goto out_release; 2430 2431 netevent.old = &rt->dst; 2432 netevent.new = &nrt->dst; 2433 netevent.daddr = &msg->dest; 2434 netevent.neigh = neigh; 2435 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); 2436 2437 if (rt->rt6i_flags & RTF_CACHE) { 2438 rt = (struct rt6_info *) dst_clone(&rt->dst); 2439 ip6_del_rt(rt); 2440 } 2441 2442 out_release: 2443 /* Release the reference taken in 2444 * ip6_rt_cache_alloc() 2445 */ 2446 dst_release(&nrt->dst); 2447 2448 out: 2449 neigh_release(neigh); 2450 } 2451 2452 /* 2453 * Misc support functions 2454 */ 2455 2456 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from) 2457 { 2458 BUG_ON(from->dst.from); 2459 2460 rt->rt6i_flags &= ~RTF_EXPIRES; 2461 dst_hold(&from->dst); 2462 rt->dst.from = &from->dst; 2463 dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true); 2464 } 2465 2466 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort) 2467 { 2468 rt->dst.input = ort->dst.input; 2469 rt->dst.output = ort->dst.output; 2470 rt->rt6i_dst = ort->rt6i_dst; 2471 rt->dst.error = ort->dst.error; 2472 rt->rt6i_idev = ort->rt6i_idev; 2473 if (rt->rt6i_idev) 2474 in6_dev_hold(rt->rt6i_idev); 2475 rt->dst.lastuse = jiffies; 2476 rt->rt6i_gateway = ort->rt6i_gateway; 2477 rt->rt6i_flags = ort->rt6i_flags; 2478 rt6_set_from(rt, ort); 2479 rt->rt6i_metric = ort->rt6i_metric; 2480 #ifdef CONFIG_IPV6_SUBTREES 2481 rt->rt6i_src = ort->rt6i_src; 2482 #endif 2483 rt->rt6i_prefsrc = ort->rt6i_prefsrc; 2484 rt->rt6i_table = ort->rt6i_table; 2485 rt->dst.lwtstate = 
lwtstate_get(ort->dst.lwtstate); 2486 } 2487 2488 #ifdef CONFIG_IPV6_ROUTE_INFO 2489 static struct rt6_info *rt6_get_route_info(struct net *net, 2490 const struct in6_addr *prefix, int prefixlen, 2491 const struct in6_addr *gwaddr, 2492 struct net_device *dev) 2493 { 2494 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; 2495 int ifindex = dev->ifindex; 2496 struct fib6_node *fn; 2497 struct rt6_info *rt = NULL; 2498 struct fib6_table *table; 2499 2500 table = fib6_get_table(net, tb_id); 2501 if (!table) 2502 return NULL; 2503 2504 read_lock_bh(&table->tb6_lock); 2505 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0); 2506 if (!fn) 2507 goto out; 2508 2509 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { 2510 if (rt->dst.dev->ifindex != ifindex) 2511 continue; 2512 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY)) 2513 continue; 2514 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr)) 2515 continue; 2516 dst_hold(&rt->dst); 2517 break; 2518 } 2519 out: 2520 read_unlock_bh(&table->tb6_lock); 2521 return rt; 2522 } 2523 2524 static struct rt6_info *rt6_add_route_info(struct net *net, 2525 const struct in6_addr *prefix, int prefixlen, 2526 const struct in6_addr *gwaddr, 2527 struct net_device *dev, 2528 unsigned int pref) 2529 { 2530 struct fib6_config cfg = { 2531 .fc_metric = IP6_RT_PRIO_USER, 2532 .fc_ifindex = dev->ifindex, 2533 .fc_dst_len = prefixlen, 2534 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | 2535 RTF_UP | RTF_PREF(pref), 2536 .fc_protocol = RTPROT_RA, 2537 .fc_nlinfo.portid = 0, 2538 .fc_nlinfo.nlh = NULL, 2539 .fc_nlinfo.nl_net = net, 2540 }; 2541 2542 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO, 2543 cfg.fc_dst = *prefix; 2544 cfg.fc_gateway = *gwaddr; 2545 2546 /* We should treat it as a default route if prefix length is 0. */ 2547 if (!prefixlen) 2548 cfg.fc_flags |= RTF_DEFAULT; 2549 2550 ip6_route_add(&cfg, NULL); 2551 2552 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev); 2553 } 2554 #endif 2555 2556 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev) 2557 { 2558 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT; 2559 struct rt6_info *rt; 2560 struct fib6_table *table; 2561 2562 table = fib6_get_table(dev_net(dev), tb_id); 2563 if (!table) 2564 return NULL; 2565 2566 read_lock_bh(&table->tb6_lock); 2567 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) { 2568 if (dev == rt->dst.dev && 2569 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && 2570 ipv6_addr_equal(&rt->rt6i_gateway, addr)) 2571 break; 2572 } 2573 if (rt) 2574 dst_hold(&rt->dst); 2575 read_unlock_bh(&table->tb6_lock); 2576 return rt; 2577 } 2578 2579 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr, 2580 struct net_device *dev, 2581 unsigned int pref) 2582 { 2583 struct fib6_config cfg = { 2584 .fc_table = l3mdev_fib_table(dev) ? 
: RT6_TABLE_DFLT, 2585 .fc_metric = IP6_RT_PRIO_USER, 2586 .fc_ifindex = dev->ifindex, 2587 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | 2588 RTF_UP | RTF_EXPIRES | RTF_PREF(pref), 2589 .fc_protocol = RTPROT_RA, 2590 .fc_nlinfo.portid = 0, 2591 .fc_nlinfo.nlh = NULL, 2592 .fc_nlinfo.nl_net = dev_net(dev), 2593 }; 2594 2595 cfg.fc_gateway = *gwaddr; 2596 2597 if (!ip6_route_add(&cfg, NULL)) { 2598 struct fib6_table *table; 2599 2600 table = fib6_get_table(dev_net(dev), cfg.fc_table); 2601 if (table) 2602 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER; 2603 } 2604 2605 return rt6_get_dflt_router(gwaddr, dev); 2606 } 2607 2608 static void __rt6_purge_dflt_routers(struct fib6_table *table) 2609 { 2610 struct rt6_info *rt; 2611 2612 restart: 2613 read_lock_bh(&table->tb6_lock); 2614 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) { 2615 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) && 2616 (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) { 2617 dst_hold(&rt->dst); 2618 read_unlock_bh(&table->tb6_lock); 2619 ip6_del_rt(rt); 2620 goto restart; 2621 } 2622 } 2623 read_unlock_bh(&table->tb6_lock); 2624 2625 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER; 2626 } 2627 2628 void rt6_purge_dflt_routers(struct net *net) 2629 { 2630 struct fib6_table *table; 2631 struct hlist_head *head; 2632 unsigned int h; 2633 2634 rcu_read_lock(); 2635 2636 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { 2637 head = &net->ipv6.fib_table_hash[h]; 2638 hlist_for_each_entry_rcu(table, head, tb6_hlist) { 2639 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER) 2640 __rt6_purge_dflt_routers(table); 2641 } 2642 } 2643 2644 rcu_read_unlock(); 2645 } 2646 2647 static void rtmsg_to_fib6_config(struct net *net, 2648 struct in6_rtmsg *rtmsg, 2649 struct fib6_config *cfg) 2650 { 2651 memset(cfg, 0, sizeof(*cfg)); 2652 2653 cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? 
2654 : RT6_TABLE_MAIN; 2655 cfg->fc_ifindex = rtmsg->rtmsg_ifindex; 2656 cfg->fc_metric = rtmsg->rtmsg_metric; 2657 cfg->fc_expires = rtmsg->rtmsg_info; 2658 cfg->fc_dst_len = rtmsg->rtmsg_dst_len; 2659 cfg->fc_src_len = rtmsg->rtmsg_src_len; 2660 cfg->fc_flags = rtmsg->rtmsg_flags; 2661 2662 cfg->fc_nlinfo.nl_net = net; 2663 2664 cfg->fc_dst = rtmsg->rtmsg_dst; 2665 cfg->fc_src = rtmsg->rtmsg_src; 2666 cfg->fc_gateway = rtmsg->rtmsg_gateway; 2667 } 2668 2669 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) 2670 { 2671 struct fib6_config cfg; 2672 struct in6_rtmsg rtmsg; 2673 int err; 2674 2675 switch (cmd) { 2676 case SIOCADDRT: /* Add a route */ 2677 case SIOCDELRT: /* Delete a route */ 2678 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 2679 return -EPERM; 2680 err = copy_from_user(&rtmsg, arg, 2681 sizeof(struct in6_rtmsg)); 2682 if (err) 2683 return -EFAULT; 2684 2685 rtmsg_to_fib6_config(net, &rtmsg, &cfg); 2686 2687 rtnl_lock(); 2688 switch (cmd) { 2689 case SIOCADDRT: 2690 err = ip6_route_add(&cfg, NULL); 2691 break; 2692 case SIOCDELRT: 2693 err = ip6_route_del(&cfg, NULL); 2694 break; 2695 default: 2696 err = -EINVAL; 2697 } 2698 rtnl_unlock(); 2699 2700 return err; 2701 } 2702 2703 return -EINVAL; 2704 } 2705 2706 /* 2707 * Drop the packet on the floor 2708 */ 2709 2710 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) 2711 { 2712 int type; 2713 struct dst_entry *dst = skb_dst(skb); 2714 switch (ipstats_mib_noroutes) { 2715 case IPSTATS_MIB_INNOROUTES: 2716 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); 2717 if (type == IPV6_ADDR_ANY) { 2718 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), 2719 IPSTATS_MIB_INADDRERRORS); 2720 break; 2721 } 2722 /* FALLTHROUGH */ 2723 case IPSTATS_MIB_OUTNOROUTES: 2724 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), 2725 ipstats_mib_noroutes); 2726 break; 2727 } 2728 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0); 2729 kfree_skb(skb); 2730 return 0; 2731 } 2732 2733 static int ip6_pkt_discard(struct sk_buff *skb) 2734 { 2735 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES); 2736 } 2737 2738 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) 2739 { 2740 skb->dev = skb_dst(skb)->dev; 2741 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); 2742 } 2743 2744 static int ip6_pkt_prohibit(struct sk_buff *skb) 2745 { 2746 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES); 2747 } 2748 2749 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb) 2750 { 2751 skb->dev = skb_dst(skb)->dev; 2752 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); 2753 } 2754 2755 /* 2756 * Allocate a dst for local (unicast / anycast) address. 
2757 */ 2758 2759 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, 2760 const struct in6_addr *addr, 2761 bool anycast) 2762 { 2763 u32 tb_id; 2764 struct net *net = dev_net(idev->dev); 2765 struct net_device *dev = idev->dev; 2766 struct rt6_info *rt; 2767 2768 rt = ip6_dst_alloc(net, dev, DST_NOCOUNT); 2769 if (!rt) 2770 return ERR_PTR(-ENOMEM); 2771 2772 in6_dev_hold(idev); 2773 2774 rt->dst.flags |= DST_HOST; 2775 rt->dst.input = ip6_input; 2776 rt->dst.output = ip6_output; 2777 rt->rt6i_idev = idev; 2778 2779 rt->rt6i_protocol = RTPROT_KERNEL; 2780 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP; 2781 if (anycast) 2782 rt->rt6i_flags |= RTF_ANYCAST; 2783 else 2784 rt->rt6i_flags |= RTF_LOCAL; 2785 2786 rt->rt6i_gateway = *addr; 2787 rt->rt6i_dst.addr = *addr; 2788 rt->rt6i_dst.plen = 128; 2789 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL; 2790 rt->rt6i_table = fib6_get_table(net, tb_id); 2791 2792 return rt; 2793 } 2794 2795 /* remove a deleted IP address from prefsrc entries */ 2796 struct arg_dev_net_ip { 2797 struct net_device *dev; 2798 struct net *net; 2799 struct in6_addr *addr; 2800 }; 2801 2802 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg) 2803 { 2804 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev; 2805 struct net *net = ((struct arg_dev_net_ip *)arg)->net; 2806 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; 2807 2808 if (((void *)rt->dst.dev == dev || !dev) && 2809 rt != net->ipv6.ip6_null_entry && 2810 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) { 2811 /* remove prefsrc entry */ 2812 rt->rt6i_prefsrc.plen = 0; 2813 } 2814 return 0; 2815 } 2816 2817 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) 2818 { 2819 struct net *net = dev_net(ifp->idev->dev); 2820 struct arg_dev_net_ip adni = { 2821 .dev = ifp->idev->dev, 2822 .net = net, 2823 .addr = &ifp->addr, 2824 }; 2825 fib6_clean_all(net, fib6_remove_prefsrc, &adni); 2826 } 2827 2828 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY) 2829 #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE) 2830
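/*
 * Illustrative sketch (not part of the original file): the callback
 * pattern followed by every fib6_clean_all() walker in this file, e.g.
 * fib6_remove_prefsrc() above and fib6_clean_tohost()/fib6_ifdown()
 * below. Returning -1 asks the tree walker to delete the route;
 * returning 0 keeps it. The function name and the drop-by-device
 * policy here are hypothetical.
 */
static int __maybe_unused fib6_drop_by_dev_example(struct rt6_info *rt, void *arg)
{
	struct net_device *dev = arg;

	/* hypothetical policy: drop every route egressing through @dev */
	return rt->dst.dev == dev ? -1 : 0;
}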
2831 /* Remove routers and update dst entries when a gateway turns into a host. */ 2832 static int fib6_clean_tohost(struct rt6_info *rt, void *arg) 2833 { 2834 struct in6_addr *gateway = (struct in6_addr *)arg; 2835 2836 if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) || 2837 ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) && 2838 ipv6_addr_equal(gateway, &rt->rt6i_gateway)) { 2839 return -1; 2840 } 2841 return 0; 2842 } 2843 2844 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) 2845 { 2846 fib6_clean_all(net, fib6_clean_tohost, gateway); 2847 } 2848 2849 struct arg_dev_net { 2850 struct net_device *dev; 2851 struct net *net; 2852 }; 2853 2854 /* called with write lock held for table with rt */ 2855 static int fib6_ifdown(struct rt6_info *rt, void *arg) 2856 { 2857 const struct arg_dev_net *adn = arg; 2858 const struct net_device *dev = adn->dev; 2859 2860 if ((rt->dst.dev == dev || !dev) && 2861 rt != adn->net->ipv6.ip6_null_entry && 2862 (rt->rt6i_nsiblings == 0 || 2863 (dev && netdev_unregistering(dev)) || 2864 !rt->rt6i_idev->cnf.ignore_routes_with_linkdown)) 2865 return -1; 2866 2867 return 0; 2868 } 2869 2870 void rt6_ifdown(struct net *net, struct net_device *dev) 2871 { 2872 struct arg_dev_net adn = { 2873 .dev = dev, 2874 .net = net, 2875 }; 2876 2877 fib6_clean_all(net, fib6_ifdown, &adn); 2878 if (dev) 2879 rt6_uncached_list_flush_dev(net, dev); 2880 } 2881 2882 struct rt6_mtu_change_arg { 2883 struct net_device *dev; 2884 unsigned int mtu; 2885 }; 2886 2887 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) 2888 { 2889 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; 2890 struct inet6_dev *idev; 2891 2892 /* In IPv6, PMTU discovery is not optional, 2893 so the RTAX_MTU lock cannot disable it. 2894 We still use this lock to block changes 2895 caused by addrconf/ndisc. 2896 */ 2897 2898 idev = __in6_dev_get(arg->dev); 2899 if (!idev) 2900 return 0; 2901 2902 /* For an administrative MTU increase there is no way to discover 2903 the resulting IPv6 PMTU increase, so it has to be applied here. 2904 RFC 1981 does not cover administrative MTU increases (e.g. enabling 2905 jumbo frames), so updating the PMTU on such an increase is a MUST. 2906 */ 2907 /* 2908 If the new MTU is less than the route PMTU, the new MTU will be the 2909 lowest MTU in the path; update the route PMTU to reflect the 2910 decrease. If the new MTU is greater than the route PMTU, and the 2911 old MTU was the lowest MTU in the path, update the route PMTU 2912 to reflect the increase. In that case, if another node along the 2913 path still has a smaller MTU, its Packet Too Big messages will 2914 drive PMTU discovery back down. 2915 */ 2916 if (rt->dst.dev == arg->dev && 2917 dst_metric_raw(&rt->dst, RTAX_MTU) && 2918 !dst_metric_locked(&rt->dst, RTAX_MTU)) { 2919 if (rt->rt6i_flags & RTF_CACHE) { 2920 /* For RTF_CACHE with rt6i_pmtu == 0 2921 * (i.e. a redirected route), 2922 * the metrics of its rt->dst.from have already 2923 * been updated.
2924 */ 2925 if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu) 2926 rt->rt6i_pmtu = arg->mtu; 2927 } else if (dst_mtu(&rt->dst) >= arg->mtu || 2928 (dst_mtu(&rt->dst) < arg->mtu && 2929 dst_mtu(&rt->dst) == idev->cnf.mtu6)) { 2930 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu); 2931 } 2932 } 2933 return 0; 2934 } 2935 2936 void rt6_mtu_change(struct net_device *dev, unsigned int mtu) 2937 { 2938 struct rt6_mtu_change_arg arg = { 2939 .dev = dev, 2940 .mtu = mtu, 2941 }; 2942 2943 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg); 2944 } 2945 2946 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { 2947 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) }, 2948 [RTA_OIF] = { .type = NLA_U32 }, 2949 [RTA_IIF] = { .type = NLA_U32 }, 2950 [RTA_PRIORITY] = { .type = NLA_U32 }, 2951 [RTA_METRICS] = { .type = NLA_NESTED }, 2952 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, 2953 [RTA_PREF] = { .type = NLA_U8 }, 2954 [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, 2955 [RTA_ENCAP] = { .type = NLA_NESTED }, 2956 [RTA_EXPIRES] = { .type = NLA_U32 }, 2957 [RTA_UID] = { .type = NLA_U32 }, 2958 [RTA_MARK] = { .type = NLA_U32 }, 2959 }; 2960 2961 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, 2962 struct fib6_config *cfg, 2963 struct netlink_ext_ack *extack) 2964 { 2965 struct rtmsg *rtm; 2966 struct nlattr *tb[RTA_MAX+1]; 2967 unsigned int pref; 2968 int err; 2969 2970 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, 2971 NULL); 2972 if (err < 0) 2973 goto errout; 2974 2975 err = -EINVAL; 2976 rtm = nlmsg_data(nlh); 2977 memset(cfg, 0, sizeof(*cfg)); 2978 2979 cfg->fc_table = rtm->rtm_table; 2980 cfg->fc_dst_len = rtm->rtm_dst_len; 2981 cfg->fc_src_len = rtm->rtm_src_len; 2982 cfg->fc_flags = RTF_UP; 2983 cfg->fc_protocol = rtm->rtm_protocol; 2984 cfg->fc_type = rtm->rtm_type; 2985 2986 if (rtm->rtm_type == RTN_UNREACHABLE || 2987 rtm->rtm_type == RTN_BLACKHOLE || 2988 rtm->rtm_type == RTN_PROHIBIT || 2989 rtm->rtm_type == RTN_THROW) 2990 cfg->fc_flags |= RTF_REJECT; 2991 2992 if (rtm->rtm_type == RTN_LOCAL) 2993 cfg->fc_flags |= RTF_LOCAL; 2994 2995 if (rtm->rtm_flags & RTM_F_CLONED) 2996 cfg->fc_flags |= RTF_CACHE; 2997 2998 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid; 2999 cfg->fc_nlinfo.nlh = nlh; 3000 cfg->fc_nlinfo.nl_net = sock_net(skb->sk); 3001 3002 if (tb[RTA_GATEWAY]) { 3003 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); 3004 cfg->fc_flags |= RTF_GATEWAY; 3005 } 3006 3007 if (tb[RTA_DST]) { 3008 int plen = (rtm->rtm_dst_len + 7) >> 3; 3009 3010 if (nla_len(tb[RTA_DST]) < plen) 3011 goto errout; 3012 3013 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen); 3014 } 3015 3016 if (tb[RTA_SRC]) { 3017 int plen = (rtm->rtm_src_len + 7) >> 3; 3018 3019 if (nla_len(tb[RTA_SRC]) < plen) 3020 goto errout; 3021 3022 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen); 3023 } 3024 3025 if (tb[RTA_PREFSRC]) 3026 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]); 3027 3028 if (tb[RTA_OIF]) 3029 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]); 3030 3031 if (tb[RTA_PRIORITY]) 3032 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]); 3033 3034 if (tb[RTA_METRICS]) { 3035 cfg->fc_mx = nla_data(tb[RTA_METRICS]); 3036 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]); 3037 } 3038 3039 if (tb[RTA_TABLE]) 3040 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]); 3041 3042 if (tb[RTA_MULTIPATH]) { 3043 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]); 3044 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); 3045 3046 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp, 3047 cfg->fc_mp_len, extack); 3048 if (err 
< 0) 3049 goto errout; 3050 } 3051 3052 if (tb[RTA_PREF]) { 3053 pref = nla_get_u8(tb[RTA_PREF]); 3054 if (pref != ICMPV6_ROUTER_PREF_LOW && 3055 pref != ICMPV6_ROUTER_PREF_HIGH) 3056 pref = ICMPV6_ROUTER_PREF_MEDIUM; 3057 cfg->fc_flags |= RTF_PREF(pref); 3058 } 3059 3060 if (tb[RTA_ENCAP]) 3061 cfg->fc_encap = tb[RTA_ENCAP]; 3062 3063 if (tb[RTA_ENCAP_TYPE]) { 3064 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); 3065 3066 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack); 3067 if (err < 0) 3068 goto errout; 3069 } 3070 3071 if (tb[RTA_EXPIRES]) { 3072 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ); 3073 3074 if (addrconf_finite_timeout(timeout)) { 3075 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ); 3076 cfg->fc_flags |= RTF_EXPIRES; 3077 } 3078 } 3079 3080 err = 0; 3081 errout: 3082 return err; 3083 } 3084 3085 struct rt6_nh { 3086 struct rt6_info *rt6_info; 3087 struct fib6_config r_cfg; 3088 struct mx6_config mxc; 3089 struct list_head next; 3090 }; 3091 3092 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list) 3093 { 3094 struct rt6_nh *nh; 3095 3096 list_for_each_entry(nh, rt6_nh_list, next) { 3097 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n", 3098 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway, 3099 nh->r_cfg.fc_ifindex); 3100 } 3101 } 3102 3103 static int ip6_route_info_append(struct list_head *rt6_nh_list, 3104 struct rt6_info *rt, struct fib6_config *r_cfg) 3105 { 3106 struct rt6_nh *nh; 3107 int err = -EEXIST; 3108 3109 list_for_each_entry(nh, rt6_nh_list, next) { 3110 /* check if rt6_info already exists */ 3111 if (rt6_duplicate_nexthop(nh->rt6_info, rt)) 3112 return err; 3113 } 3114 3115 nh = kzalloc(sizeof(*nh), GFP_KERNEL); 3116 if (!nh) 3117 return -ENOMEM; 3118 nh->rt6_info = rt; 3119 err = ip6_convert_metrics(&nh->mxc, r_cfg); 3120 if (err) { 3121 kfree(nh); 3122 return err; 3123 } 3124 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); 3125 list_add_tail(&nh->next, rt6_nh_list); 3126 3127 return 0; 3128 } 3129 3130 static void ip6_route_mpath_notify(struct rt6_info *rt, 3131 struct rt6_info *rt_last, 3132 struct nl_info *info, 3133 __u16 nlflags) 3134 { 3135 /* if this is an APPEND route, then rt points to the first route 3136 * inserted and rt_last points to last route inserted. Userspace 3137 * wants a consistent dump of the route which starts at the first 3138 * nexthop. Since sibling routes are always added at the end of 3139 * the list, find the first sibling of the last route appended 3140 */ 3141 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) { 3142 rt = list_first_entry(&rt_last->rt6i_siblings, 3143 struct rt6_info, 3144 rt6i_siblings); 3145 } 3146 3147 if (rt) 3148 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); 3149 } 3150 3151 static int ip6_route_multipath_add(struct fib6_config *cfg, 3152 struct netlink_ext_ack *extack) 3153 { 3154 struct rt6_info *rt_notif = NULL, *rt_last = NULL; 3155 struct nl_info *info = &cfg->fc_nlinfo; 3156 struct fib6_config r_cfg; 3157 struct rtnexthop *rtnh; 3158 struct rt6_info *rt; 3159 struct rt6_nh *err_nh; 3160 struct rt6_nh *nh, *nh_safe; 3161 __u16 nlflags; 3162 int remaining; 3163 int attrlen; 3164 int err = 1; 3165 int nhn = 0; 3166 int replace = (cfg->fc_nlinfo.nlh && 3167 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); 3168 LIST_HEAD(rt6_nh_list); 3169 3170 nlflags = replace ? 
NLM_F_REPLACE : NLM_F_CREATE; 3171 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND) 3172 nlflags |= NLM_F_APPEND; 3173 3174 remaining = cfg->fc_mp_len; 3175 rtnh = (struct rtnexthop *)cfg->fc_mp; 3176 3177 /* Parse a Multipath Entry and build a list (rt6_nh_list) of 3178 * rt6_info structs per nexthop 3179 */ 3180 while (rtnh_ok(rtnh, remaining)) { 3181 memcpy(&r_cfg, cfg, sizeof(*cfg)); 3182 if (rtnh->rtnh_ifindex) 3183 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 3184 3185 attrlen = rtnh_attrlen(rtnh); 3186 if (attrlen > 0) { 3187 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 3188 3189 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 3190 if (nla) { 3191 r_cfg.fc_gateway = nla_get_in6_addr(nla); 3192 r_cfg.fc_flags |= RTF_GATEWAY; 3193 } 3194 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); 3195 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); 3196 if (nla) 3197 r_cfg.fc_encap_type = nla_get_u16(nla); 3198 } 3199 3200 rt = ip6_route_info_create(&r_cfg, extack); 3201 if (IS_ERR(rt)) { 3202 err = PTR_ERR(rt); 3203 rt = NULL; 3204 goto cleanup; 3205 } 3206 3207 err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg); 3208 if (err) { 3209 dst_release_immediate(&rt->dst); 3210 goto cleanup; 3211 } 3212 3213 rtnh = rtnh_next(rtnh, &remaining); 3214 } 3215 3216 /* for add and replace send one notification with all nexthops. 3217 * Skip the notification in fib6_add_rt2node and send one with 3218 * the full route when done 3219 */ 3220 info->skip_notify = 1; 3221 3222 err_nh = NULL; 3223 list_for_each_entry(nh, &rt6_nh_list, next) { 3224 rt_last = nh->rt6_info; 3225 err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack); 3226 /* save reference to first route for notification */ 3227 if (!rt_notif && !err) 3228 rt_notif = nh->rt6_info; 3229 3230 /* nh->rt6_info is used or freed at this point, reset to NULL*/ 3231 nh->rt6_info = NULL; 3232 if (err) { 3233 if (replace && nhn) 3234 ip6_print_replace_route_err(&rt6_nh_list); 3235 err_nh = nh; 3236 goto add_errout; 3237 } 3238 3239 /* Because each route is added like a single route we remove 3240 * these flags after the first nexthop: if there is a collision, 3241 * we have already failed to add the first nexthop: 3242 * fib6_add_rt2node() has rejected it; when replacing, old 3243 * nexthops have been replaced by first new, the rest should 3244 * be added to it. 3245 */ 3246 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | 3247 NLM_F_REPLACE); 3248 nhn++; 3249 } 3250 3251 /* success ... 
tell user about new route */ 3252 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 3253 goto cleanup; 3254 3255 add_errout: 3256 /* send notification for routes that were added so that 3257 * the delete notifications sent by ip6_route_del are 3258 * coherent 3259 */ 3260 if (rt_notif) 3261 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 3262 3263 /* Delete routes that were already added */ 3264 list_for_each_entry(nh, &rt6_nh_list, next) { 3265 if (err_nh == nh) 3266 break; 3267 ip6_route_del(&nh->r_cfg, extack); 3268 } 3269 3270 cleanup: 3271 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { 3272 if (nh->rt6_info) 3273 dst_release_immediate(&nh->rt6_info->dst); 3274 kfree(nh->mxc.mx); 3275 list_del(&nh->next); 3276 kfree(nh); 3277 } 3278 3279 return err; 3280 } 3281 3282 static int ip6_route_multipath_del(struct fib6_config *cfg, 3283 struct netlink_ext_ack *extack) 3284 { 3285 struct fib6_config r_cfg; 3286 struct rtnexthop *rtnh; 3287 int remaining; 3288 int attrlen; 3289 int err = 1, last_err = 0; 3290 3291 remaining = cfg->fc_mp_len; 3292 rtnh = (struct rtnexthop *)cfg->fc_mp; 3293 3294 /* Parse a Multipath Entry */ 3295 while (rtnh_ok(rtnh, remaining)) { 3296 memcpy(&r_cfg, cfg, sizeof(*cfg)); 3297 if (rtnh->rtnh_ifindex) 3298 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 3299 3300 attrlen = rtnh_attrlen(rtnh); 3301 if (attrlen > 0) { 3302 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 3303 3304 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 3305 if (nla) { 3306 nla_memcpy(&r_cfg.fc_gateway, nla, 16); 3307 r_cfg.fc_flags |= RTF_GATEWAY; 3308 } 3309 } 3310 err = ip6_route_del(&r_cfg, extack); 3311 if (err) 3312 last_err = err; 3313 3314 rtnh = rtnh_next(rtnh, &remaining); 3315 } 3316 3317 return last_err; 3318 } 3319 3320 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, 3321 struct netlink_ext_ack *extack) 3322 { 3323 struct fib6_config cfg; 3324 int err; 3325 3326 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 3327 if (err < 0) 3328 return err; 3329 3330 if (cfg.fc_mp) 3331 return ip6_route_multipath_del(&cfg, extack); 3332 else { 3333 cfg.fc_delete_all_nh = 1; 3334 return ip6_route_del(&cfg, extack); 3335 } 3336 } 3337 3338 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, 3339 struct netlink_ext_ack *extack) 3340 { 3341 struct fib6_config cfg; 3342 int err; 3343 3344 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 3345 if (err < 0) 3346 return err; 3347 3348 if (cfg.fc_mp) 3349 return ip6_route_multipath_add(&cfg, extack); 3350 else 3351 return ip6_route_add(&cfg, extack); 3352 } 3353 3354 static size_t rt6_nlmsg_size(struct rt6_info *rt) 3355 { 3356 int nexthop_len = 0; 3357 3358 if (rt->rt6i_nsiblings) { 3359 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ 3360 + NLA_ALIGN(sizeof(struct rtnexthop)) 3361 + nla_total_size(16) /* RTA_GATEWAY */ 3362 + lwtunnel_get_encap_size(rt->dst.lwtstate); 3363 3364 nexthop_len *= rt->rt6i_nsiblings; 3365 } 3366 3367 return NLMSG_ALIGN(sizeof(struct rtmsg)) 3368 + nla_total_size(16) /* RTA_SRC */ 3369 + nla_total_size(16) /* RTA_DST */ 3370 + nla_total_size(16) /* RTA_GATEWAY */ 3371 + nla_total_size(16) /* RTA_PREFSRC */ 3372 + nla_total_size(4) /* RTA_TABLE */ 3373 + nla_total_size(4) /* RTA_IIF */ 3374 + nla_total_size(4) /* RTA_OIF */ 3375 + nla_total_size(4) /* RTA_PRIORITY */ 3376 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */ 3377 + nla_total_size(sizeof(struct rta_cacheinfo)) 3378 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ 3379 + nla_total_size(1) /* 
RTA_PREF */ 3380 + lwtunnel_get_encap_size(rt->dst.lwtstate) 3381 + nexthop_len; 3382 } 3383 3384 static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt, 3385 unsigned int *flags, bool skip_oif) 3386 { 3387 if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) { 3388 *flags |= RTNH_F_LINKDOWN; 3389 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown) 3390 *flags |= RTNH_F_DEAD; 3391 } 3392 3393 if (rt->rt6i_flags & RTF_GATEWAY) { 3394 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0) 3395 goto nla_put_failure; 3396 } 3397 3398 if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD) 3399 *flags |= RTNH_F_OFFLOAD; 3400 3401 /* not needed for multipath encoding b/c it has a rtnexthop struct */ 3402 if (!skip_oif && rt->dst.dev && 3403 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex)) 3404 goto nla_put_failure; 3405 3406 if (rt->dst.lwtstate && 3407 lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0) 3408 goto nla_put_failure; 3409 3410 return 0; 3411 3412 nla_put_failure: 3413 return -EMSGSIZE; 3414 } 3415 3416 /* add multipath next hop */ 3417 static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt) 3418 { 3419 struct rtnexthop *rtnh; 3420 unsigned int flags = 0; 3421 3422 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh)); 3423 if (!rtnh) 3424 goto nla_put_failure; 3425 3426 rtnh->rtnh_hops = 0; 3427 rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0; 3428 3429 if (rt6_nexthop_info(skb, rt, &flags, true) < 0) 3430 goto nla_put_failure; 3431 3432 rtnh->rtnh_flags = flags; 3433 3434 /* length of rtnetlink header + attributes */ 3435 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh; 3436 3437 return 0; 3438 3439 nla_put_failure: 3440 return -EMSGSIZE; 3441 } 3442 3443 static int rt6_fill_node(struct net *net, 3444 struct sk_buff *skb, struct rt6_info *rt, 3445 struct in6_addr *dst, struct in6_addr *src, 3446 int iif, int type, u32 portid, u32 seq, 3447 unsigned int flags) 3448 { 3449 u32 metrics[RTAX_MAX]; 3450 struct rtmsg *rtm; 3451 struct nlmsghdr *nlh; 3452 long expires; 3453 u32 table; 3454 3455 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags); 3456 if (!nlh) 3457 return -EMSGSIZE; 3458 3459 rtm = nlmsg_data(nlh); 3460 rtm->rtm_family = AF_INET6; 3461 rtm->rtm_dst_len = rt->rt6i_dst.plen; 3462 rtm->rtm_src_len = rt->rt6i_src.plen; 3463 rtm->rtm_tos = 0; 3464 if (rt->rt6i_table) 3465 table = rt->rt6i_table->tb6_id; 3466 else 3467 table = RT6_TABLE_UNSPEC; 3468 rtm->rtm_table = table; 3469 if (nla_put_u32(skb, RTA_TABLE, table)) 3470 goto nla_put_failure; 3471 if (rt->rt6i_flags & RTF_REJECT) { 3472 switch (rt->dst.error) { 3473 case -EINVAL: 3474 rtm->rtm_type = RTN_BLACKHOLE; 3475 break; 3476 case -EACCES: 3477 rtm->rtm_type = RTN_PROHIBIT; 3478 break; 3479 case -EAGAIN: 3480 rtm->rtm_type = RTN_THROW; 3481 break; 3482 default: 3483 rtm->rtm_type = RTN_UNREACHABLE; 3484 break; 3485 } 3486 } 3487 else if (rt->rt6i_flags & RTF_LOCAL) 3488 rtm->rtm_type = RTN_LOCAL; 3489 else if (rt->rt6i_flags & RTF_ANYCAST) 3490 rtm->rtm_type = RTN_ANYCAST; 3491 else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK)) 3492 rtm->rtm_type = RTN_LOCAL; 3493 else 3494 rtm->rtm_type = RTN_UNICAST; 3495 rtm->rtm_flags = 0; 3496 rtm->rtm_scope = RT_SCOPE_UNIVERSE; 3497 rtm->rtm_protocol = rt->rt6i_protocol; 3498 3499 if (rt->rt6i_flags & RTF_CACHE) 3500 rtm->rtm_flags |= RTM_F_CLONED; 3501 3502 if (dst) { 3503 if (nla_put_in6_addr(skb, RTA_DST, dst)) 3504 goto nla_put_failure; 3505 rtm->rtm_dst_len = 128; 3506 } else if (rtm->rtm_dst_len) 3507 if 
(nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr)) 3508 goto nla_put_failure; 3509 #ifdef CONFIG_IPV6_SUBTREES 3510 if (src) { 3511 if (nla_put_in6_addr(skb, RTA_SRC, src)) 3512 goto nla_put_failure; 3513 rtm->rtm_src_len = 128; 3514 } else if (rtm->rtm_src_len && 3515 nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr)) 3516 goto nla_put_failure; 3517 #endif 3518 if (iif) { 3519 #ifdef CONFIG_IPV6_MROUTE 3520 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) { 3521 int err = ip6mr_get_route(net, skb, rtm, portid); 3522 3523 if (err == 0) 3524 return 0; 3525 if (err < 0) 3526 goto nla_put_failure; 3527 } else 3528 #endif 3529 if (nla_put_u32(skb, RTA_IIF, iif)) 3530 goto nla_put_failure; 3531 } else if (dst) { 3532 struct in6_addr saddr_buf; 3533 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 && 3534 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 3535 goto nla_put_failure; 3536 } 3537 3538 if (rt->rt6i_prefsrc.plen) { 3539 struct in6_addr saddr_buf; 3540 saddr_buf = rt->rt6i_prefsrc.addr; 3541 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 3542 goto nla_put_failure; 3543 } 3544 3545 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics)); 3546 if (rt->rt6i_pmtu) 3547 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu; 3548 if (rtnetlink_put_metrics(skb, metrics) < 0) 3549 goto nla_put_failure; 3550 3551 if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric)) 3552 goto nla_put_failure; 3553 3554 /* For multipath routes, walk the siblings list and add 3555 * each as a nexthop within RTA_MULTIPATH. 3556 */ 3557 if (rt->rt6i_nsiblings) { 3558 struct rt6_info *sibling, *next_sibling; 3559 struct nlattr *mp; 3560 3561 mp = nla_nest_start(skb, RTA_MULTIPATH); 3562 if (!mp) 3563 goto nla_put_failure; 3564 3565 if (rt6_add_nexthop(skb, rt) < 0) 3566 goto nla_put_failure; 3567 3568 list_for_each_entry_safe(sibling, next_sibling, 3569 &rt->rt6i_siblings, rt6i_siblings) { 3570 if (rt6_add_nexthop(skb, sibling) < 0) 3571 goto nla_put_failure; 3572 } 3573 3574 nla_nest_end(skb, mp); 3575 } else { 3576 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0) 3577 goto nla_put_failure; 3578 } 3579 3580 expires = (rt->rt6i_flags & RTF_EXPIRES) ? 
rt->dst.expires - jiffies : 0; 3581 3582 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0) 3583 goto nla_put_failure; 3584 3585 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags))) 3586 goto nla_put_failure; 3587 3588 3589 nlmsg_end(skb, nlh); 3590 return 0; 3591 3592 nla_put_failure: 3593 nlmsg_cancel(skb, nlh); 3594 return -EMSGSIZE; 3595 } 3596 3597 int rt6_dump_route(struct rt6_info *rt, void *p_arg) 3598 { 3599 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; 3600 struct net *net = arg->net; 3601 3602 if (rt == net->ipv6.ip6_null_entry) 3603 return 0; 3604 3605 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) { 3606 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh); 3607 3608 /* user wants prefix routes only */ 3609 if (rtm->rtm_flags & RTM_F_PREFIX && 3610 !(rt->rt6i_flags & RTF_PREFIX_RT)) { 3611 /* success since this is not a prefix route */ 3612 return 1; 3613 } 3614 } 3615 3616 return rt6_fill_node(net, 3617 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE, 3618 NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq, 3619 NLM_F_MULTI); 3620 } 3621 3622 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, 3623 struct netlink_ext_ack *extack) 3624 { 3625 struct net *net = sock_net(in_skb->sk); 3626 struct nlattr *tb[RTA_MAX+1]; 3627 int err, iif = 0, oif = 0; 3628 struct dst_entry *dst; 3629 struct rt6_info *rt; 3630 struct sk_buff *skb; 3631 struct rtmsg *rtm; 3632 struct flowi6 fl6; 3633 bool fibmatch; 3634 3635 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, 3636 extack); 3637 if (err < 0) 3638 goto errout; 3639 3640 err = -EINVAL; 3641 memset(&fl6, 0, sizeof(fl6)); 3642 rtm = nlmsg_data(nlh); 3643 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0); 3644 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH); 3645 3646 if (tb[RTA_SRC]) { 3647 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr)) 3648 goto errout; 3649 3650 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]); 3651 } 3652 3653 if (tb[RTA_DST]) { 3654 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr)) 3655 goto errout; 3656 3657 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]); 3658 } 3659 3660 if (tb[RTA_IIF]) 3661 iif = nla_get_u32(tb[RTA_IIF]); 3662 3663 if (tb[RTA_OIF]) 3664 oif = nla_get_u32(tb[RTA_OIF]); 3665 3666 if (tb[RTA_MARK]) 3667 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]); 3668 3669 if (tb[RTA_UID]) 3670 fl6.flowi6_uid = make_kuid(current_user_ns(), 3671 nla_get_u32(tb[RTA_UID])); 3672 else 3673 fl6.flowi6_uid = iif ? 
INVALID_UID : current_uid(); 3674 3675 if (iif) { 3676 struct net_device *dev; 3677 int flags = 0; 3678 3679 rcu_read_lock(); 3680 3681 dev = dev_get_by_index_rcu(net, iif); 3682 if (!dev) { 3683 rcu_read_unlock(); 3684 err = -ENODEV; 3685 goto errout; 3686 } 3687 3688 fl6.flowi6_iif = iif; 3689 3690 if (!ipv6_addr_any(&fl6.saddr)) 3691 flags |= RT6_LOOKUP_F_HAS_SADDR; 3692 3693 if (!fibmatch) 3694 dst = ip6_route_input_lookup(net, dev, &fl6, flags); 3695 else 3696 dst = ip6_route_lookup(net, &fl6, 0); 3697 3698 rcu_read_unlock(); 3699 } else { 3700 fl6.flowi6_oif = oif; 3701 3702 if (!fibmatch) 3703 dst = ip6_route_output(net, NULL, &fl6); 3704 else 3705 dst = ip6_route_lookup(net, &fl6, 0); 3706 } 3707 3708 3709 rt = container_of(dst, struct rt6_info, dst); 3710 if (rt->dst.error) { 3711 err = rt->dst.error; 3712 ip6_rt_put(rt); 3713 goto errout; 3714 } 3715 3716 if (rt == net->ipv6.ip6_null_entry) { 3717 err = rt->dst.error; 3718 ip6_rt_put(rt); 3719 goto errout; 3720 } 3721 3722 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 3723 if (!skb) { 3724 ip6_rt_put(rt); 3725 err = -ENOBUFS; 3726 goto errout; 3727 } 3728 3729 skb_dst_set(skb, &rt->dst); 3730 if (fibmatch) 3731 err = rt6_fill_node(net, skb, rt, NULL, NULL, iif, 3732 RTM_NEWROUTE, NETLINK_CB(in_skb).portid, 3733 nlh->nlmsg_seq, 0); 3734 else 3735 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif, 3736 RTM_NEWROUTE, NETLINK_CB(in_skb).portid, 3737 nlh->nlmsg_seq, 0); 3738 if (err < 0) { 3739 kfree_skb(skb); 3740 goto errout; 3741 } 3742 3743 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); 3744 errout: 3745 return err; 3746 } 3747 3748 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info, 3749 unsigned int nlm_flags) 3750 { 3751 struct sk_buff *skb; 3752 struct net *net = info->nl_net; 3753 u32 seq; 3754 int err; 3755 3756 err = -ENOBUFS; 3757 seq = info->nlh ? info->nlh->nlmsg_seq : 0; 3758 3759 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 3760 if (!skb) 3761 goto errout; 3762 3763 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0, 3764 event, info->portid, seq, nlm_flags); 3765 if (err < 0) { 3766 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ 3767 WARN_ON(err == -EMSGSIZE); 3768 kfree_skb(skb); 3769 goto errout; 3770 } 3771 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 3772 info->nlh, gfp_any()); 3773 return; 3774 errout: 3775 if (err < 0) 3776 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); 3777 } 3778 3779 static int ip6_route_dev_notify(struct notifier_block *this, 3780 unsigned long event, void *ptr) 3781 { 3782 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 3783 struct net *net = dev_net(dev); 3784 3785 if (!(dev->flags & IFF_LOOPBACK)) 3786 return NOTIFY_OK; 3787 3788 if (event == NETDEV_REGISTER) { 3789 net->ipv6.ip6_null_entry->dst.dev = dev; 3790 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev); 3791 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 3792 net->ipv6.ip6_prohibit_entry->dst.dev = dev; 3793 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev); 3794 net->ipv6.ip6_blk_hole_entry->dst.dev = dev; 3795 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev); 3796 #endif 3797 } else if (event == NETDEV_UNREGISTER && 3798 dev->reg_state != NETREG_UNREGISTERED) { 3799 /* NETDEV_UNREGISTER could be fired for multiple times by 3800 * netdev_wait_allrefs(). Make sure we only call this once. 
3801 */ 3802 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev); 3803 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 3804 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev); 3805 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev); 3806 #endif 3807 } 3808 3809 return NOTIFY_OK; 3810 } 3811 3812 /* 3813 * /proc 3814 */ 3815 3816 #ifdef CONFIG_PROC_FS 3817 3818 static const struct file_operations ipv6_route_proc_fops = { 3819 .owner = THIS_MODULE, 3820 .open = ipv6_route_open, 3821 .read = seq_read, 3822 .llseek = seq_lseek, 3823 .release = seq_release_net, 3824 }; 3825 3826 static int rt6_stats_seq_show(struct seq_file *seq, void *v) 3827 { 3828 struct net *net = (struct net *)seq->private; 3829 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n", 3830 net->ipv6.rt6_stats->fib_nodes, 3831 net->ipv6.rt6_stats->fib_route_nodes, 3832 net->ipv6.rt6_stats->fib_rt_alloc, 3833 net->ipv6.rt6_stats->fib_rt_entries, 3834 net->ipv6.rt6_stats->fib_rt_cache, 3835 dst_entries_get_slow(&net->ipv6.ip6_dst_ops), 3836 net->ipv6.rt6_stats->fib_discarded_routes); 3837 3838 return 0; 3839 } 3840 3841 static int rt6_stats_seq_open(struct inode *inode, struct file *file) 3842 { 3843 return single_open_net(inode, file, rt6_stats_seq_show); 3844 } 3845 3846 static const struct file_operations rt6_stats_seq_fops = { 3847 .owner = THIS_MODULE, 3848 .open = rt6_stats_seq_open, 3849 .read = seq_read, 3850 .llseek = seq_lseek, 3851 .release = single_release_net, 3852 }; 3853 #endif /* CONFIG_PROC_FS */ 3854 3855 #ifdef CONFIG_SYSCTL 3856 3857 static 3858 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, 3859 void __user *buffer, size_t *lenp, loff_t *ppos) 3860 { 3861 struct net *net; 3862 int delay; 3863 if (!write) 3864 return -EINVAL; 3865 3866 net = (struct net *)ctl->extra1; 3867 delay = net->ipv6.sysctl.flush_delay; 3868 proc_dointvec(ctl, write, buffer, lenp, ppos); 3869 fib6_run_gc(delay <= 0 ? 
0 : (unsigned long)delay, net, delay > 0); 3870 return 0; 3871 } 3872 3873 struct ctl_table ipv6_route_table_template[] = { 3874 { 3875 .procname = "flush", 3876 .data = &init_net.ipv6.sysctl.flush_delay, 3877 .maxlen = sizeof(int), 3878 .mode = 0200, 3879 .proc_handler = ipv6_sysctl_rtcache_flush 3880 }, 3881 { 3882 .procname = "gc_thresh", 3883 .data = &ip6_dst_ops_template.gc_thresh, 3884 .maxlen = sizeof(int), 3885 .mode = 0644, 3886 .proc_handler = proc_dointvec, 3887 }, 3888 { 3889 .procname = "max_size", 3890 .data = &init_net.ipv6.sysctl.ip6_rt_max_size, 3891 .maxlen = sizeof(int), 3892 .mode = 0644, 3893 .proc_handler = proc_dointvec, 3894 }, 3895 { 3896 .procname = "gc_min_interval", 3897 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 3898 .maxlen = sizeof(int), 3899 .mode = 0644, 3900 .proc_handler = proc_dointvec_jiffies, 3901 }, 3902 { 3903 .procname = "gc_timeout", 3904 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout, 3905 .maxlen = sizeof(int), 3906 .mode = 0644, 3907 .proc_handler = proc_dointvec_jiffies, 3908 }, 3909 { 3910 .procname = "gc_interval", 3911 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval, 3912 .maxlen = sizeof(int), 3913 .mode = 0644, 3914 .proc_handler = proc_dointvec_jiffies, 3915 }, 3916 { 3917 .procname = "gc_elasticity", 3918 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity, 3919 .maxlen = sizeof(int), 3920 .mode = 0644, 3921 .proc_handler = proc_dointvec, 3922 }, 3923 { 3924 .procname = "mtu_expires", 3925 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires, 3926 .maxlen = sizeof(int), 3927 .mode = 0644, 3928 .proc_handler = proc_dointvec_jiffies, 3929 }, 3930 { 3931 .procname = "min_adv_mss", 3932 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss, 3933 .maxlen = sizeof(int), 3934 .mode = 0644, 3935 .proc_handler = proc_dointvec, 3936 }, 3937 { 3938 .procname = "gc_min_interval_ms", 3939 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 3940 .maxlen = sizeof(int), 3941 .mode = 0644, 3942 .proc_handler = proc_dointvec_ms_jiffies, 3943 }, 3944 { } 3945 }; 3946 3947 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net) 3948 { 3949 struct ctl_table *table; 3950 3951 table = kmemdup(ipv6_route_table_template, 3952 sizeof(ipv6_route_table_template), 3953 GFP_KERNEL); 3954 3955 if (table) { 3956 table[0].data = &net->ipv6.sysctl.flush_delay; 3957 table[0].extra1 = net; 3958 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh; 3959 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size; 3960 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; 3961 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout; 3962 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval; 3963 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity; 3964 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires; 3965 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss; 3966 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; 3967 3968 /* Don't export sysctls to unprivileged users */ 3969 if (net->user_ns != &init_user_ns) 3970 table[0].procname = NULL; 3971 } 3972 3973 return table; 3974 } 3975 #endif 3976 3977 static int __net_init ip6_route_net_init(struct net *net) 3978 { 3979 int ret = -ENOMEM; 3980 3981 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template, 3982 sizeof(net->ipv6.ip6_dst_ops)); 3983 3984 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0) 3985 goto out_ip6_dst_ops; 3986 3987 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template, 3988 sizeof(*net->ipv6.ip6_null_entry), 3989 GFP_KERNEL); 3990 if (!net->ipv6.ip6_null_entry) 3991 goto 
out_ip6_dst_entries; 3992 net->ipv6.ip6_null_entry->dst.path = 3993 (struct dst_entry *)net->ipv6.ip6_null_entry; 3994 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; 3995 dst_init_metrics(&net->ipv6.ip6_null_entry->dst, 3996 ip6_template_metrics, true); 3997 3998 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 3999 net->ipv6.fib6_has_custom_rules = false; 4000 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template, 4001 sizeof(*net->ipv6.ip6_prohibit_entry), 4002 GFP_KERNEL); 4003 if (!net->ipv6.ip6_prohibit_entry) 4004 goto out_ip6_null_entry; 4005 net->ipv6.ip6_prohibit_entry->dst.path = 4006 (struct dst_entry *)net->ipv6.ip6_prohibit_entry; 4007 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops; 4008 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst, 4009 ip6_template_metrics, true); 4010 4011 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template, 4012 sizeof(*net->ipv6.ip6_blk_hole_entry), 4013 GFP_KERNEL); 4014 if (!net->ipv6.ip6_blk_hole_entry) 4015 goto out_ip6_prohibit_entry; 4016 net->ipv6.ip6_blk_hole_entry->dst.path = 4017 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry; 4018 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops; 4019 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, 4020 ip6_template_metrics, true); 4021 #endif 4022 4023 net->ipv6.sysctl.flush_delay = 0; 4024 net->ipv6.sysctl.ip6_rt_max_size = 4096; 4025 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2; 4026 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ; 4027 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ; 4028 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9; 4029 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ; 4030 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; 4031 4032 net->ipv6.ip6_rt_gc_expire = 30*HZ; 4033 4034 ret = 0; 4035 out: 4036 return ret; 4037 4038 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 4039 out_ip6_prohibit_entry: 4040 kfree(net->ipv6.ip6_prohibit_entry); 4041 out_ip6_null_entry: 4042 kfree(net->ipv6.ip6_null_entry); 4043 #endif 4044 out_ip6_dst_entries: 4045 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 4046 out_ip6_dst_ops: 4047 goto out; 4048 } 4049 4050 static void __net_exit ip6_route_net_exit(struct net *net) 4051 { 4052 kfree(net->ipv6.ip6_null_entry); 4053 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 4054 kfree(net->ipv6.ip6_prohibit_entry); 4055 kfree(net->ipv6.ip6_blk_hole_entry); 4056 #endif 4057 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 4058 } 4059 4060 static int __net_init ip6_route_net_init_late(struct net *net) 4061 { 4062 #ifdef CONFIG_PROC_FS 4063 proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops); 4064 proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops); 4065 #endif 4066 return 0; 4067 } 4068 4069 static void __net_exit ip6_route_net_exit_late(struct net *net) 4070 { 4071 #ifdef CONFIG_PROC_FS 4072 remove_proc_entry("ipv6_route", net->proc_net); 4073 remove_proc_entry("rt6_stats", net->proc_net); 4074 #endif 4075 } 4076 4077 static struct pernet_operations ip6_route_net_ops = { 4078 .init = ip6_route_net_init, 4079 .exit = ip6_route_net_exit, 4080 }; 4081 4082 static int __net_init ipv6_inetpeer_init(struct net *net) 4083 { 4084 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL); 4085 4086 if (!bp) 4087 return -ENOMEM; 4088 inet_peer_base_init(bp); 4089 net->ipv6.peers = bp; 4090 return 0; 4091 } 4092 4093 static void __net_exit ipv6_inetpeer_exit(struct net *net) 4094 { 4095 struct inet_peer_base *bp = net->ipv6.peers; 4096 4097 net->ipv6.peers = NULL; 4098 inetpeer_invalidate_tree(bp); 
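/* inetpeer_invalidate_tree() has released every peer entry; the now-empty base can simply be freed */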
	kfree(bp);
}

static struct pernet_operations ipv6_inetpeer_ops = {
	.init = ipv6_inetpeer_init,
	.exit = ipv6_inetpeer_exit,
};

static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};

static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};

void __init ip6_route_init_special_entries(void)
{
	/* Registration of the loopback device happens before this portion
	 * of code runs, so the loopback reference in rt6_info is not taken
	 * automatically; take it manually for init_net.
	 */
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#endif
}

int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	ret = -ENOBUFS;
	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, 0) ||
	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, 0) ||
	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL,
			    RTNL_FLAG_DOIT_UNLOCKED))
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

out_register_late_subsys:
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}

void ip6_route_cleanup(void)
{
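	/* Tear down everything ip6_route_init() registered: the netdevice
	 * notifier, the pernet ops, fib6 and xfrm state, dst accounting,
	 * and finally the rt6_info slab cache.
	 */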
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}