1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (c) 2013 Nicira, Inc. 4 */ 5 6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 7 8 #include <linux/capability.h> 9 #include <linux/module.h> 10 #include <linux/types.h> 11 #include <linux/kernel.h> 12 #include <linux/slab.h> 13 #include <linux/uaccess.h> 14 #include <linux/skbuff.h> 15 #include <linux/netdevice.h> 16 #include <linux/in.h> 17 #include <linux/tcp.h> 18 #include <linux/udp.h> 19 #include <linux/if_arp.h> 20 #include <linux/init.h> 21 #include <linux/in6.h> 22 #include <linux/inetdevice.h> 23 #include <linux/igmp.h> 24 #include <linux/netfilter_ipv4.h> 25 #include <linux/etherdevice.h> 26 #include <linux/if_ether.h> 27 #include <linux/if_vlan.h> 28 #include <linux/rculist.h> 29 #include <linux/err.h> 30 31 #include <net/sock.h> 32 #include <net/ip.h> 33 #include <net/icmp.h> 34 #include <net/protocol.h> 35 #include <net/ip_tunnels.h> 36 #include <net/arp.h> 37 #include <net/checksum.h> 38 #include <net/dsfield.h> 39 #include <net/inet_ecn.h> 40 #include <net/xfrm.h> 41 #include <net/net_namespace.h> 42 #include <net/netns/generic.h> 43 #include <net/rtnetlink.h> 44 #include <net/udp.h> 45 #include <net/dst_metadata.h> 46 47 #if IS_ENABLED(CONFIG_IPV6) 48 #include <net/ipv6.h> 49 #include <net/ip6_fib.h> 50 #include <net/ip6_route.h> 51 #endif 52 53 static unsigned int ip_tunnel_hash(__be32 key, __be32 remote) 54 { 55 return hash_32((__force u32)key ^ (__force u32)remote, 56 IP_TNL_HASH_BITS); 57 } 58 59 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p, 60 __be16 flags, __be32 key) 61 { 62 if (p->i_flags & TUNNEL_KEY) { 63 if (flags & TUNNEL_KEY) 64 return key == p->i_key; 65 else 66 /* key expected, none present */ 67 return false; 68 } else 69 return !(flags & TUNNEL_KEY); 70 } 71 72 /* Fallback tunnel: no source, no destination, no key, no options 73 74 Tunnel hash table: 75 We require exact key match i.e. if a key is present in packet 76 it will match only tunnel with the same key; if it is not present, 77 it will match only keyless tunnel. 78 79 All keysless packets, if not matched configured keyless tunnels 80 will match fallback tunnel. 81 Given src, dst and key, find appropriate for input tunnel. 82 */ 83 struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, 84 int link, __be16 flags, 85 __be32 remote, __be32 local, 86 __be32 key) 87 { 88 struct ip_tunnel *t, *cand = NULL; 89 struct hlist_head *head; 90 struct net_device *ndev; 91 unsigned int hash; 92 93 hash = ip_tunnel_hash(key, remote); 94 head = &itn->tunnels[hash]; 95 96 hlist_for_each_entry_rcu(t, head, hash_node) { 97 if (local != t->parms.iph.saddr || 98 remote != t->parms.iph.daddr || 99 !(t->dev->flags & IFF_UP)) 100 continue; 101 102 if (!ip_tunnel_key_match(&t->parms, flags, key)) 103 continue; 104 105 if (t->parms.link == link) 106 return t; 107 else 108 cand = t; 109 } 110 111 hlist_for_each_entry_rcu(t, head, hash_node) { 112 if (remote != t->parms.iph.daddr || 113 t->parms.iph.saddr != 0 || 114 !(t->dev->flags & IFF_UP)) 115 continue; 116 117 if (!ip_tunnel_key_match(&t->parms, flags, key)) 118 continue; 119 120 if (t->parms.link == link) 121 return t; 122 else if (!cand) 123 cand = t; 124 } 125 126 hash = ip_tunnel_hash(key, 0); 127 head = &itn->tunnels[hash]; 128 129 hlist_for_each_entry_rcu(t, head, hash_node) { 130 if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) && 131 (local != t->parms.iph.daddr || !ipv4_is_multicast(local))) 132 continue; 133 134 if (!(t->dev->flags & IFF_UP)) 135 continue; 136 137 if (!ip_tunnel_key_match(&t->parms, flags, key)) 138 continue; 139 140 if (t->parms.link == link) 141 return t; 142 else if (!cand) 143 cand = t; 144 } 145 146 hlist_for_each_entry_rcu(t, head, hash_node) { 147 if ((!(flags & TUNNEL_NO_KEY) && t->parms.i_key != key) || 148 t->parms.iph.saddr != 0 || 149 t->parms.iph.daddr != 0 || 150 !(t->dev->flags & IFF_UP)) 151 continue; 152 153 if (t->parms.link == link) 154 return t; 155 else if (!cand) 156 cand = t; 157 } 158 159 if (cand) 160 return cand; 161 162 t = rcu_dereference(itn->collect_md_tun); 163 if (t && t->dev->flags & IFF_UP) 164 return t; 165 166 ndev = READ_ONCE(itn->fb_tunnel_dev); 167 if (ndev && ndev->flags & IFF_UP) 168 return netdev_priv(ndev); 169 170 return NULL; 171 } 172 EXPORT_SYMBOL_GPL(ip_tunnel_lookup); 173 174 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn, 175 struct ip_tunnel_parm *parms) 176 { 177 unsigned int h; 178 __be32 remote; 179 __be32 i_key = parms->i_key; 180 181 if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr)) 182 remote = parms->iph.daddr; 183 else 184 remote = 0; 185 186 if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI)) 187 i_key = 0; 188 189 h = ip_tunnel_hash(i_key, remote); 190 return &itn->tunnels[h]; 191 } 192 193 static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t) 194 { 195 struct hlist_head *head = ip_bucket(itn, &t->parms); 196 197 if (t->collect_md) 198 rcu_assign_pointer(itn->collect_md_tun, t); 199 hlist_add_head_rcu(&t->hash_node, head); 200 } 201 202 static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t) 203 { 204 if (t->collect_md) 205 rcu_assign_pointer(itn->collect_md_tun, NULL); 206 hlist_del_init_rcu(&t->hash_node); 207 } 208 209 static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn, 210 struct ip_tunnel_parm *parms, 211 int type) 212 { 213 __be32 remote = parms->iph.daddr; 214 __be32 local = parms->iph.saddr; 215 __be32 key = parms->i_key; 216 __be16 flags = parms->i_flags; 217 int link = parms->link; 218 struct ip_tunnel *t = NULL; 219 struct hlist_head *head = ip_bucket(itn, parms); 220 221 hlist_for_each_entry_rcu(t, head, hash_node) { 222 if (local == t->parms.iph.saddr && 223 remote == t->parms.iph.daddr && 224 link == t->parms.link && 225 type == t->dev->type && 226 ip_tunnel_key_match(&t->parms, flags, key)) 227 break; 228 } 229 return t; 230 } 231 232 static struct net_device *__ip_tunnel_create(struct net *net, 233 const struct rtnl_link_ops *ops, 234 struct ip_tunnel_parm *parms) 235 { 236 int err; 237 struct ip_tunnel *tunnel; 238 struct net_device *dev; 239 char name[IFNAMSIZ]; 240 241 err = -E2BIG; 242 if (parms->name[0]) { 243 if (!dev_valid_name(parms->name)) 244 goto failed; 245 strscpy(name, parms->name, IFNAMSIZ); 246 } else { 247 if (strlen(ops->kind) > (IFNAMSIZ - 3)) 248 goto failed; 249 strcpy(name, ops->kind); 250 strcat(name, "%d"); 251 } 252 253 ASSERT_RTNL(); 254 dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup); 255 if (!dev) { 256 err = -ENOMEM; 257 goto failed; 258 } 259 dev_net_set(dev, net); 260 261 dev->rtnl_link_ops = ops; 262 263 tunnel = netdev_priv(dev); 264 tunnel->parms = *parms; 265 tunnel->net = net; 266 267 err = register_netdevice(dev); 268 if (err) 269 goto failed_free; 270 271 return dev; 272 273 failed_free: 274 free_netdev(dev); 275 failed: 276 return ERR_PTR(err); 277 } 278 279 static int ip_tunnel_bind_dev(struct net_device *dev) 280 { 281 struct net_device *tdev = NULL; 282 struct ip_tunnel *tunnel = netdev_priv(dev); 283 const struct iphdr *iph; 284 int hlen = LL_MAX_HEADER; 285 int mtu = ETH_DATA_LEN; 286 int t_hlen = tunnel->hlen + sizeof(struct iphdr); 287 288 iph = &tunnel->parms.iph; 289 290 /* Guess output device to choose reasonable mtu and needed_headroom */ 291 if (iph->daddr) { 292 struct flowi4 fl4; 293 struct rtable *rt; 294 295 ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr, 296 iph->saddr, tunnel->parms.o_key, 297 RT_TOS(iph->tos), dev_net(dev), 298 tunnel->parms.link, tunnel->fwmark, 0, 0); 299 rt = ip_route_output_key(tunnel->net, &fl4); 300 301 if (!IS_ERR(rt)) { 302 tdev = rt->dst.dev; 303 ip_rt_put(rt); 304 } 305 if (dev->type != ARPHRD_ETHER) 306 dev->flags |= IFF_POINTOPOINT; 307 308 dst_cache_reset(&tunnel->dst_cache); 309 } 310 311 if (!tdev && tunnel->parms.link) 312 tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link); 313 314 if (tdev) { 315 hlen = tdev->hard_header_len + tdev->needed_headroom; 316 mtu = min(tdev->mtu, IP_MAX_MTU); 317 } 318 319 dev->needed_headroom = t_hlen + hlen; 320 mtu -= t_hlen + (dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0); 321 322 if (mtu < IPV4_MIN_MTU) 323 mtu = IPV4_MIN_MTU; 324 325 return mtu; 326 } 327 328 static struct ip_tunnel *ip_tunnel_create(struct net *net, 329 struct ip_tunnel_net *itn, 330 struct ip_tunnel_parm *parms) 331 { 332 struct ip_tunnel *nt; 333 struct net_device *dev; 334 int t_hlen; 335 int mtu; 336 int err; 337 338 dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms); 339 if (IS_ERR(dev)) 340 return ERR_CAST(dev); 341 342 mtu = ip_tunnel_bind_dev(dev); 343 err = dev_set_mtu(dev, mtu); 344 if (err) 345 goto err_dev_set_mtu; 346 347 nt = netdev_priv(dev); 348 t_hlen = nt->hlen + sizeof(struct iphdr); 349 dev->min_mtu = ETH_MIN_MTU; 350 dev->max_mtu = IP_MAX_MTU - t_hlen; 351 if (dev->type == ARPHRD_ETHER) 352 dev->max_mtu -= dev->hard_header_len; 353 354 ip_tunnel_add(itn, nt); 355 return nt; 356 357 err_dev_set_mtu: 358 unregister_netdevice(dev); 359 return ERR_PTR(err); 360 } 361 362 void ip_tunnel_md_udp_encap(struct sk_buff *skb, struct ip_tunnel_info *info) 363 { 364 const struct iphdr *iph = ip_hdr(skb); 365 const struct udphdr *udph; 366 367 if (iph->protocol != IPPROTO_UDP) 368 return; 369 370 udph = (struct udphdr *)((__u8 *)iph + (iph->ihl << 2)); 371 info->encap.sport = udph->source; 372 info->encap.dport = udph->dest; 373 } 374 EXPORT_SYMBOL(ip_tunnel_md_udp_encap); 375 376 int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, 377 const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst, 378 bool log_ecn_error) 379 { 380 const struct iphdr *iph = ip_hdr(skb); 381 int err; 382 383 #ifdef CONFIG_NET_IPGRE_BROADCAST 384 if (ipv4_is_multicast(iph->daddr)) { 385 DEV_STATS_INC(tunnel->dev, multicast); 386 skb->pkt_type = PACKET_BROADCAST; 387 } 388 #endif 389 390 if ((!(tpi->flags&TUNNEL_CSUM) && (tunnel->parms.i_flags&TUNNEL_CSUM)) || 391 ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) { 392 DEV_STATS_INC(tunnel->dev, rx_crc_errors); 393 DEV_STATS_INC(tunnel->dev, rx_errors); 394 goto drop; 395 } 396 397 if (tunnel->parms.i_flags&TUNNEL_SEQ) { 398 if (!(tpi->flags&TUNNEL_SEQ) || 399 (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) { 400 DEV_STATS_INC(tunnel->dev, rx_fifo_errors); 401 DEV_STATS_INC(tunnel->dev, rx_errors); 402 goto drop; 403 } 404 tunnel->i_seqno = ntohl(tpi->seq) + 1; 405 } 406 407 skb_set_network_header(skb, (tunnel->dev->type == ARPHRD_ETHER) ? ETH_HLEN : 0); 408 409 err = IP_ECN_decapsulate(iph, skb); 410 if (unlikely(err)) { 411 if (log_ecn_error) 412 net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n", 413 &iph->saddr, iph->tos); 414 if (err > 1) { 415 DEV_STATS_INC(tunnel->dev, rx_frame_errors); 416 DEV_STATS_INC(tunnel->dev, rx_errors); 417 goto drop; 418 } 419 } 420 421 dev_sw_netstats_rx_add(tunnel->dev, skb->len); 422 skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev))); 423 424 if (tunnel->dev->type == ARPHRD_ETHER) { 425 skb->protocol = eth_type_trans(skb, tunnel->dev); 426 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); 427 } else { 428 skb->dev = tunnel->dev; 429 } 430 431 if (tun_dst) 432 skb_dst_set(skb, (struct dst_entry *)tun_dst); 433 434 gro_cells_receive(&tunnel->gro_cells, skb); 435 return 0; 436 437 drop: 438 if (tun_dst) 439 dst_release((struct dst_entry *)tun_dst); 440 kfree_skb(skb); 441 return 0; 442 } 443 EXPORT_SYMBOL_GPL(ip_tunnel_rcv); 444 445 int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops, 446 unsigned int num) 447 { 448 if (num >= MAX_IPTUN_ENCAP_OPS) 449 return -ERANGE; 450 451 return !cmpxchg((const struct ip_tunnel_encap_ops **) 452 &iptun_encaps[num], 453 NULL, ops) ? 0 : -1; 454 } 455 EXPORT_SYMBOL(ip_tunnel_encap_add_ops); 456 457 int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops, 458 unsigned int num) 459 { 460 int ret; 461 462 if (num >= MAX_IPTUN_ENCAP_OPS) 463 return -ERANGE; 464 465 ret = (cmpxchg((const struct ip_tunnel_encap_ops **) 466 &iptun_encaps[num], 467 ops, NULL) == ops) ? 0 : -1; 468 469 synchronize_net(); 470 471 return ret; 472 } 473 EXPORT_SYMBOL(ip_tunnel_encap_del_ops); 474 475 int ip_tunnel_encap_setup(struct ip_tunnel *t, 476 struct ip_tunnel_encap *ipencap) 477 { 478 int hlen; 479 480 memset(&t->encap, 0, sizeof(t->encap)); 481 482 hlen = ip_encap_hlen(ipencap); 483 if (hlen < 0) 484 return hlen; 485 486 t->encap.type = ipencap->type; 487 t->encap.sport = ipencap->sport; 488 t->encap.dport = ipencap->dport; 489 t->encap.flags = ipencap->flags; 490 491 t->encap_hlen = hlen; 492 t->hlen = t->encap_hlen + t->tun_hlen; 493 494 return 0; 495 } 496 EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup); 497 498 static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, 499 struct rtable *rt, __be16 df, 500 const struct iphdr *inner_iph, 501 int tunnel_hlen, __be32 dst, bool md) 502 { 503 struct ip_tunnel *tunnel = netdev_priv(dev); 504 int pkt_size; 505 int mtu; 506 507 tunnel_hlen = md ? tunnel_hlen : tunnel->hlen; 508 pkt_size = skb->len - tunnel_hlen; 509 pkt_size -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0; 510 511 if (df) { 512 mtu = dst_mtu(&rt->dst) - (sizeof(struct iphdr) + tunnel_hlen); 513 mtu -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0; 514 } else { 515 mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; 516 } 517 518 if (skb_valid_dst(skb)) 519 skb_dst_update_pmtu_no_confirm(skb, mtu); 520 521 if (skb->protocol == htons(ETH_P_IP)) { 522 if (!skb_is_gso(skb) && 523 (inner_iph->frag_off & htons(IP_DF)) && 524 mtu < pkt_size) { 525 icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); 526 return -E2BIG; 527 } 528 } 529 #if IS_ENABLED(CONFIG_IPV6) 530 else if (skb->protocol == htons(ETH_P_IPV6)) { 531 struct rt6_info *rt6; 532 __be32 daddr; 533 534 rt6 = skb_valid_dst(skb) ? (struct rt6_info *)skb_dst(skb) : 535 NULL; 536 daddr = md ? dst : tunnel->parms.iph.daddr; 537 538 if (rt6 && mtu < dst_mtu(skb_dst(skb)) && 539 mtu >= IPV6_MIN_MTU) { 540 if ((daddr && !ipv4_is_multicast(daddr)) || 541 rt6->rt6i_dst.plen == 128) { 542 rt6->rt6i_flags |= RTF_MODIFIED; 543 dst_metric_set(skb_dst(skb), RTAX_MTU, mtu); 544 } 545 } 546 547 if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU && 548 mtu < pkt_size) { 549 icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); 550 return -E2BIG; 551 } 552 } 553 #endif 554 return 0; 555 } 556 557 void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, 558 u8 proto, int tunnel_hlen) 559 { 560 struct ip_tunnel *tunnel = netdev_priv(dev); 561 u32 headroom = sizeof(struct iphdr); 562 struct ip_tunnel_info *tun_info; 563 const struct ip_tunnel_key *key; 564 const struct iphdr *inner_iph; 565 struct rtable *rt = NULL; 566 struct flowi4 fl4; 567 __be16 df = 0; 568 u8 tos, ttl; 569 bool use_cache; 570 571 tun_info = skb_tunnel_info(skb); 572 if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || 573 ip_tunnel_info_af(tun_info) != AF_INET)) 574 goto tx_error; 575 key = &tun_info->key; 576 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); 577 inner_iph = (const struct iphdr *)skb_inner_network_header(skb); 578 tos = key->tos; 579 if (tos == 1) { 580 if (skb->protocol == htons(ETH_P_IP)) 581 tos = inner_iph->tos; 582 else if (skb->protocol == htons(ETH_P_IPV6)) 583 tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph); 584 } 585 ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, 586 tunnel_id_to_key32(key->tun_id), RT_TOS(tos), 587 dev_net(dev), 0, skb->mark, skb_get_hash(skb), 588 key->flow_flags); 589 590 if (!tunnel_hlen) 591 tunnel_hlen = ip_encap_hlen(&tun_info->encap); 592 593 if (ip_tunnel_encap(skb, &tun_info->encap, &proto, &fl4) < 0) 594 goto tx_error; 595 596 use_cache = ip_tunnel_dst_cache_usable(skb, tun_info); 597 if (use_cache) 598 rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr); 599 if (!rt) { 600 rt = ip_route_output_key(tunnel->net, &fl4); 601 if (IS_ERR(rt)) { 602 DEV_STATS_INC(dev, tx_carrier_errors); 603 goto tx_error; 604 } 605 if (use_cache) 606 dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst, 607 fl4.saddr); 608 } 609 if (rt->dst.dev == dev) { 610 ip_rt_put(rt); 611 DEV_STATS_INC(dev, collisions); 612 goto tx_error; 613 } 614 615 if (key->tun_flags & TUNNEL_DONT_FRAGMENT) 616 df = htons(IP_DF); 617 if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen, 618 key->u.ipv4.dst, true)) { 619 ip_rt_put(rt); 620 goto tx_error; 621 } 622 623 tos = ip_tunnel_ecn_encap(tos, inner_iph, skb); 624 ttl = key->ttl; 625 if (ttl == 0) { 626 if (skb->protocol == htons(ETH_P_IP)) 627 ttl = inner_iph->ttl; 628 else if (skb->protocol == htons(ETH_P_IPV6)) 629 ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit; 630 else 631 ttl = ip4_dst_hoplimit(&rt->dst); 632 } 633 634 headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len; 635 if (headroom > READ_ONCE(dev->needed_headroom)) 636 WRITE_ONCE(dev->needed_headroom, headroom); 637 638 if (skb_cow_head(skb, READ_ONCE(dev->needed_headroom))) { 639 ip_rt_put(rt); 640 goto tx_dropped; 641 } 642 iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl, 643 df, !net_eq(tunnel->net, dev_net(dev))); 644 return; 645 tx_error: 646 DEV_STATS_INC(dev, tx_errors); 647 goto kfree; 648 tx_dropped: 649 DEV_STATS_INC(dev, tx_dropped); 650 kfree: 651 kfree_skb(skb); 652 } 653 EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit); 654 655 void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, 656 const struct iphdr *tnl_params, u8 protocol) 657 { 658 struct ip_tunnel *tunnel = netdev_priv(dev); 659 struct ip_tunnel_info *tun_info = NULL; 660 const struct iphdr *inner_iph; 661 unsigned int max_headroom; /* The extra header space needed */ 662 struct rtable *rt = NULL; /* Route to the other host */ 663 __be16 payload_protocol; 664 bool use_cache = false; 665 struct flowi4 fl4; 666 bool md = false; 667 bool connected; 668 u8 tos, ttl; 669 __be32 dst; 670 __be16 df; 671 672 inner_iph = (const struct iphdr *)skb_inner_network_header(skb); 673 connected = (tunnel->parms.iph.daddr != 0); 674 payload_protocol = skb_protocol(skb, true); 675 676 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); 677 678 dst = tnl_params->daddr; 679 if (dst == 0) { 680 /* NBMA tunnel */ 681 682 if (!skb_dst(skb)) { 683 DEV_STATS_INC(dev, tx_fifo_errors); 684 goto tx_error; 685 } 686 687 tun_info = skb_tunnel_info(skb); 688 if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) && 689 ip_tunnel_info_af(tun_info) == AF_INET && 690 tun_info->key.u.ipv4.dst) { 691 dst = tun_info->key.u.ipv4.dst; 692 md = true; 693 connected = true; 694 } else if (payload_protocol == htons(ETH_P_IP)) { 695 rt = skb_rtable(skb); 696 dst = rt_nexthop(rt, inner_iph->daddr); 697 } 698 #if IS_ENABLED(CONFIG_IPV6) 699 else if (payload_protocol == htons(ETH_P_IPV6)) { 700 const struct in6_addr *addr6; 701 struct neighbour *neigh; 702 bool do_tx_error_icmp; 703 int addr_type; 704 705 neigh = dst_neigh_lookup(skb_dst(skb), 706 &ipv6_hdr(skb)->daddr); 707 if (!neigh) 708 goto tx_error; 709 710 addr6 = (const struct in6_addr *)&neigh->primary_key; 711 addr_type = ipv6_addr_type(addr6); 712 713 if (addr_type == IPV6_ADDR_ANY) { 714 addr6 = &ipv6_hdr(skb)->daddr; 715 addr_type = ipv6_addr_type(addr6); 716 } 717 718 if ((addr_type & IPV6_ADDR_COMPATv4) == 0) 719 do_tx_error_icmp = true; 720 else { 721 do_tx_error_icmp = false; 722 dst = addr6->s6_addr32[3]; 723 } 724 neigh_release(neigh); 725 if (do_tx_error_icmp) 726 goto tx_error_icmp; 727 } 728 #endif 729 else 730 goto tx_error; 731 732 if (!md) 733 connected = false; 734 } 735 736 tos = tnl_params->tos; 737 if (tos & 0x1) { 738 tos &= ~0x1; 739 if (payload_protocol == htons(ETH_P_IP)) { 740 tos = inner_iph->tos; 741 connected = false; 742 } else if (payload_protocol == htons(ETH_P_IPV6)) { 743 tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph); 744 connected = false; 745 } 746 } 747 748 ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr, 749 tunnel->parms.o_key, RT_TOS(tos), 750 dev_net(dev), tunnel->parms.link, 751 tunnel->fwmark, skb_get_hash(skb), 0); 752 753 if (ip_tunnel_encap(skb, &tunnel->encap, &protocol, &fl4) < 0) 754 goto tx_error; 755 756 if (connected && md) { 757 use_cache = ip_tunnel_dst_cache_usable(skb, tun_info); 758 if (use_cache) 759 rt = dst_cache_get_ip4(&tun_info->dst_cache, 760 &fl4.saddr); 761 } else { 762 rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, 763 &fl4.saddr) : NULL; 764 } 765 766 if (!rt) { 767 rt = ip_route_output_key(tunnel->net, &fl4); 768 769 if (IS_ERR(rt)) { 770 DEV_STATS_INC(dev, tx_carrier_errors); 771 goto tx_error; 772 } 773 if (use_cache) 774 dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst, 775 fl4.saddr); 776 else if (!md && connected) 777 dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst, 778 fl4.saddr); 779 } 780 781 if (rt->dst.dev == dev) { 782 ip_rt_put(rt); 783 DEV_STATS_INC(dev, collisions); 784 goto tx_error; 785 } 786 787 df = tnl_params->frag_off; 788 if (payload_protocol == htons(ETH_P_IP) && !tunnel->ignore_df) 789 df |= (inner_iph->frag_off & htons(IP_DF)); 790 791 if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, 0, 0, false)) { 792 ip_rt_put(rt); 793 goto tx_error; 794 } 795 796 if (tunnel->err_count > 0) { 797 if (time_before(jiffies, 798 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { 799 tunnel->err_count--; 800 801 dst_link_failure(skb); 802 } else 803 tunnel->err_count = 0; 804 } 805 806 tos = ip_tunnel_ecn_encap(tos, inner_iph, skb); 807 ttl = tnl_params->ttl; 808 if (ttl == 0) { 809 if (payload_protocol == htons(ETH_P_IP)) 810 ttl = inner_iph->ttl; 811 #if IS_ENABLED(CONFIG_IPV6) 812 else if (payload_protocol == htons(ETH_P_IPV6)) 813 ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit; 814 #endif 815 else 816 ttl = ip4_dst_hoplimit(&rt->dst); 817 } 818 819 max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr) 820 + rt->dst.header_len + ip_encap_hlen(&tunnel->encap); 821 if (max_headroom > READ_ONCE(dev->needed_headroom)) 822 WRITE_ONCE(dev->needed_headroom, max_headroom); 823 824 if (skb_cow_head(skb, READ_ONCE(dev->needed_headroom))) { 825 ip_rt_put(rt); 826 DEV_STATS_INC(dev, tx_dropped); 827 kfree_skb(skb); 828 return; 829 } 830 831 iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl, 832 df, !net_eq(tunnel->net, dev_net(dev))); 833 return; 834 835 #if IS_ENABLED(CONFIG_IPV6) 836 tx_error_icmp: 837 dst_link_failure(skb); 838 #endif 839 tx_error: 840 DEV_STATS_INC(dev, tx_errors); 841 kfree_skb(skb); 842 } 843 EXPORT_SYMBOL_GPL(ip_tunnel_xmit); 844 845 static void ip_tunnel_update(struct ip_tunnel_net *itn, 846 struct ip_tunnel *t, 847 struct net_device *dev, 848 struct ip_tunnel_parm *p, 849 bool set_mtu, 850 __u32 fwmark) 851 { 852 ip_tunnel_del(itn, t); 853 t->parms.iph.saddr = p->iph.saddr; 854 t->parms.iph.daddr = p->iph.daddr; 855 t->parms.i_key = p->i_key; 856 t->parms.o_key = p->o_key; 857 if (dev->type != ARPHRD_ETHER) { 858 __dev_addr_set(dev, &p->iph.saddr, 4); 859 memcpy(dev->broadcast, &p->iph.daddr, 4); 860 } 861 ip_tunnel_add(itn, t); 862 863 t->parms.iph.ttl = p->iph.ttl; 864 t->parms.iph.tos = p->iph.tos; 865 t->parms.iph.frag_off = p->iph.frag_off; 866 867 if (t->parms.link != p->link || t->fwmark != fwmark) { 868 int mtu; 869 870 t->parms.link = p->link; 871 t->fwmark = fwmark; 872 mtu = ip_tunnel_bind_dev(dev); 873 if (set_mtu) 874 dev->mtu = mtu; 875 } 876 dst_cache_reset(&t->dst_cache); 877 netdev_state_change(dev); 878 } 879 880 int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) 881 { 882 int err = 0; 883 struct ip_tunnel *t = netdev_priv(dev); 884 struct net *net = t->net; 885 struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id); 886 887 switch (cmd) { 888 case SIOCGETTUNNEL: 889 if (dev == itn->fb_tunnel_dev) { 890 t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type); 891 if (!t) 892 t = netdev_priv(dev); 893 } 894 memcpy(p, &t->parms, sizeof(*p)); 895 break; 896 897 case SIOCADDTUNNEL: 898 case SIOCCHGTUNNEL: 899 err = -EPERM; 900 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 901 goto done; 902 if (p->iph.ttl) 903 p->iph.frag_off |= htons(IP_DF); 904 if (!(p->i_flags & VTI_ISVTI)) { 905 if (!(p->i_flags & TUNNEL_KEY)) 906 p->i_key = 0; 907 if (!(p->o_flags & TUNNEL_KEY)) 908 p->o_key = 0; 909 } 910 911 t = ip_tunnel_find(itn, p, itn->type); 912 913 if (cmd == SIOCADDTUNNEL) { 914 if (!t) { 915 t = ip_tunnel_create(net, itn, p); 916 err = PTR_ERR_OR_ZERO(t); 917 break; 918 } 919 920 err = -EEXIST; 921 break; 922 } 923 if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { 924 if (t) { 925 if (t->dev != dev) { 926 err = -EEXIST; 927 break; 928 } 929 } else { 930 unsigned int nflags = 0; 931 932 if (ipv4_is_multicast(p->iph.daddr)) 933 nflags = IFF_BROADCAST; 934 else if (p->iph.daddr) 935 nflags = IFF_POINTOPOINT; 936 937 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) { 938 err = -EINVAL; 939 break; 940 } 941 942 t = netdev_priv(dev); 943 } 944 } 945 946 if (t) { 947 err = 0; 948 ip_tunnel_update(itn, t, dev, p, true, 0); 949 } else { 950 err = -ENOENT; 951 } 952 break; 953 954 case SIOCDELTUNNEL: 955 err = -EPERM; 956 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 957 goto done; 958 959 if (dev == itn->fb_tunnel_dev) { 960 err = -ENOENT; 961 t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type); 962 if (!t) 963 goto done; 964 err = -EPERM; 965 if (t == netdev_priv(itn->fb_tunnel_dev)) 966 goto done; 967 dev = t->dev; 968 } 969 unregister_netdevice(dev); 970 err = 0; 971 break; 972 973 default: 974 err = -EINVAL; 975 } 976 977 done: 978 return err; 979 } 980 EXPORT_SYMBOL_GPL(ip_tunnel_ctl); 981 982 int ip_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr, 983 void __user *data, int cmd) 984 { 985 struct ip_tunnel_parm p; 986 int err; 987 988 if (copy_from_user(&p, data, sizeof(p))) 989 return -EFAULT; 990 err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd); 991 if (!err && copy_to_user(data, &p, sizeof(p))) 992 return -EFAULT; 993 return err; 994 } 995 EXPORT_SYMBOL_GPL(ip_tunnel_siocdevprivate); 996 997 int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict) 998 { 999 struct ip_tunnel *tunnel = netdev_priv(dev); 1000 int t_hlen = tunnel->hlen + sizeof(struct iphdr); 1001 int max_mtu = IP_MAX_MTU - t_hlen; 1002 1003 if (dev->type == ARPHRD_ETHER) 1004 max_mtu -= dev->hard_header_len; 1005 1006 if (new_mtu < ETH_MIN_MTU) 1007 return -EINVAL; 1008 1009 if (new_mtu > max_mtu) { 1010 if (strict) 1011 return -EINVAL; 1012 1013 new_mtu = max_mtu; 1014 } 1015 1016 dev->mtu = new_mtu; 1017 return 0; 1018 } 1019 EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu); 1020 1021 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu) 1022 { 1023 return __ip_tunnel_change_mtu(dev, new_mtu, true); 1024 } 1025 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu); 1026 1027 static void ip_tunnel_dev_free(struct net_device *dev) 1028 { 1029 struct ip_tunnel *tunnel = netdev_priv(dev); 1030 1031 gro_cells_destroy(&tunnel->gro_cells); 1032 dst_cache_destroy(&tunnel->dst_cache); 1033 free_percpu(dev->tstats); 1034 } 1035 1036 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head) 1037 { 1038 struct ip_tunnel *tunnel = netdev_priv(dev); 1039 struct ip_tunnel_net *itn; 1040 1041 itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id); 1042 1043 if (itn->fb_tunnel_dev != dev) { 1044 ip_tunnel_del(itn, netdev_priv(dev)); 1045 unregister_netdevice_queue(dev, head); 1046 } 1047 } 1048 EXPORT_SYMBOL_GPL(ip_tunnel_dellink); 1049 1050 struct net *ip_tunnel_get_link_net(const struct net_device *dev) 1051 { 1052 struct ip_tunnel *tunnel = netdev_priv(dev); 1053 1054 return tunnel->net; 1055 } 1056 EXPORT_SYMBOL(ip_tunnel_get_link_net); 1057 1058 int ip_tunnel_get_iflink(const struct net_device *dev) 1059 { 1060 struct ip_tunnel *tunnel = netdev_priv(dev); 1061 1062 return tunnel->parms.link; 1063 } 1064 EXPORT_SYMBOL(ip_tunnel_get_iflink); 1065 1066 int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, 1067 struct rtnl_link_ops *ops, char *devname) 1068 { 1069 struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id); 1070 struct ip_tunnel_parm parms; 1071 unsigned int i; 1072 1073 itn->rtnl_link_ops = ops; 1074 for (i = 0; i < IP_TNL_HASH_SIZE; i++) 1075 INIT_HLIST_HEAD(&itn->tunnels[i]); 1076 1077 if (!ops || !net_has_fallback_tunnels(net)) { 1078 struct ip_tunnel_net *it_init_net; 1079 1080 it_init_net = net_generic(&init_net, ip_tnl_net_id); 1081 itn->type = it_init_net->type; 1082 itn->fb_tunnel_dev = NULL; 1083 return 0; 1084 } 1085 1086 memset(&parms, 0, sizeof(parms)); 1087 if (devname) 1088 strscpy(parms.name, devname, IFNAMSIZ); 1089 1090 rtnl_lock(); 1091 itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms); 1092 /* FB netdevice is special: we have one, and only one per netns. 1093 * Allowing to move it to another netns is clearly unsafe. 1094 */ 1095 if (!IS_ERR(itn->fb_tunnel_dev)) { 1096 itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL; 1097 itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev); 1098 ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev)); 1099 itn->type = itn->fb_tunnel_dev->type; 1100 } 1101 rtnl_unlock(); 1102 1103 return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev); 1104 } 1105 EXPORT_SYMBOL_GPL(ip_tunnel_init_net); 1106 1107 static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn, 1108 struct list_head *head, 1109 struct rtnl_link_ops *ops) 1110 { 1111 struct net_device *dev, *aux; 1112 int h; 1113 1114 for_each_netdev_safe(net, dev, aux) 1115 if (dev->rtnl_link_ops == ops) 1116 unregister_netdevice_queue(dev, head); 1117 1118 for (h = 0; h < IP_TNL_HASH_SIZE; h++) { 1119 struct ip_tunnel *t; 1120 struct hlist_node *n; 1121 struct hlist_head *thead = &itn->tunnels[h]; 1122 1123 hlist_for_each_entry_safe(t, n, thead, hash_node) 1124 /* If dev is in the same netns, it has already 1125 * been added to the list by the previous loop. 1126 */ 1127 if (!net_eq(dev_net(t->dev), net)) 1128 unregister_netdevice_queue(t->dev, head); 1129 } 1130 } 1131 1132 void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id, 1133 struct rtnl_link_ops *ops, 1134 struct list_head *dev_to_kill) 1135 { 1136 struct ip_tunnel_net *itn; 1137 struct net *net; 1138 1139 ASSERT_RTNL(); 1140 list_for_each_entry(net, net_list, exit_list) { 1141 itn = net_generic(net, id); 1142 ip_tunnel_destroy(net, itn, dev_to_kill, ops); 1143 } 1144 } 1145 EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets); 1146 1147 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], 1148 struct ip_tunnel_parm *p, __u32 fwmark) 1149 { 1150 struct ip_tunnel *nt; 1151 struct net *net = dev_net(dev); 1152 struct ip_tunnel_net *itn; 1153 int mtu; 1154 int err; 1155 1156 nt = netdev_priv(dev); 1157 itn = net_generic(net, nt->ip_tnl_net_id); 1158 1159 if (nt->collect_md) { 1160 if (rtnl_dereference(itn->collect_md_tun)) 1161 return -EEXIST; 1162 } else { 1163 if (ip_tunnel_find(itn, p, dev->type)) 1164 return -EEXIST; 1165 } 1166 1167 nt->net = net; 1168 nt->parms = *p; 1169 nt->fwmark = fwmark; 1170 err = register_netdevice(dev); 1171 if (err) 1172 goto err_register_netdevice; 1173 1174 if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS]) 1175 eth_hw_addr_random(dev); 1176 1177 mtu = ip_tunnel_bind_dev(dev); 1178 if (tb[IFLA_MTU]) { 1179 unsigned int max = IP_MAX_MTU - (nt->hlen + sizeof(struct iphdr)); 1180 1181 if (dev->type == ARPHRD_ETHER) 1182 max -= dev->hard_header_len; 1183 1184 mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU, max); 1185 } 1186 1187 err = dev_set_mtu(dev, mtu); 1188 if (err) 1189 goto err_dev_set_mtu; 1190 1191 ip_tunnel_add(itn, nt); 1192 return 0; 1193 1194 err_dev_set_mtu: 1195 unregister_netdevice(dev); 1196 err_register_netdevice: 1197 return err; 1198 } 1199 EXPORT_SYMBOL_GPL(ip_tunnel_newlink); 1200 1201 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], 1202 struct ip_tunnel_parm *p, __u32 fwmark) 1203 { 1204 struct ip_tunnel *t; 1205 struct ip_tunnel *tunnel = netdev_priv(dev); 1206 struct net *net = tunnel->net; 1207 struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id); 1208 1209 if (dev == itn->fb_tunnel_dev) 1210 return -EINVAL; 1211 1212 t = ip_tunnel_find(itn, p, dev->type); 1213 1214 if (t) { 1215 if (t->dev != dev) 1216 return -EEXIST; 1217 } else { 1218 t = tunnel; 1219 1220 if (dev->type != ARPHRD_ETHER) { 1221 unsigned int nflags = 0; 1222 1223 if (ipv4_is_multicast(p->iph.daddr)) 1224 nflags = IFF_BROADCAST; 1225 else if (p->iph.daddr) 1226 nflags = IFF_POINTOPOINT; 1227 1228 if ((dev->flags ^ nflags) & 1229 (IFF_POINTOPOINT | IFF_BROADCAST)) 1230 return -EINVAL; 1231 } 1232 } 1233 1234 ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark); 1235 return 0; 1236 } 1237 EXPORT_SYMBOL_GPL(ip_tunnel_changelink); 1238 1239 int ip_tunnel_init(struct net_device *dev) 1240 { 1241 struct ip_tunnel *tunnel = netdev_priv(dev); 1242 struct iphdr *iph = &tunnel->parms.iph; 1243 int err; 1244 1245 dev->needs_free_netdev = true; 1246 dev->priv_destructor = ip_tunnel_dev_free; 1247 dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); 1248 if (!dev->tstats) 1249 return -ENOMEM; 1250 1251 err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); 1252 if (err) { 1253 free_percpu(dev->tstats); 1254 return err; 1255 } 1256 1257 err = gro_cells_init(&tunnel->gro_cells, dev); 1258 if (err) { 1259 dst_cache_destroy(&tunnel->dst_cache); 1260 free_percpu(dev->tstats); 1261 return err; 1262 } 1263 1264 tunnel->dev = dev; 1265 tunnel->net = dev_net(dev); 1266 strcpy(tunnel->parms.name, dev->name); 1267 iph->version = 4; 1268 iph->ihl = 5; 1269 1270 if (tunnel->collect_md) 1271 netif_keep_dst(dev); 1272 netdev_lockdep_set_classes(dev); 1273 return 0; 1274 } 1275 EXPORT_SYMBOL_GPL(ip_tunnel_init); 1276 1277 void ip_tunnel_uninit(struct net_device *dev) 1278 { 1279 struct ip_tunnel *tunnel = netdev_priv(dev); 1280 struct net *net = tunnel->net; 1281 struct ip_tunnel_net *itn; 1282 1283 itn = net_generic(net, tunnel->ip_tnl_net_id); 1284 ip_tunnel_del(itn, netdev_priv(dev)); 1285 if (itn->fb_tunnel_dev == dev) 1286 WRITE_ONCE(itn->fb_tunnel_dev, NULL); 1287 1288 dst_cache_reset(&tunnel->dst_cache); 1289 } 1290 EXPORT_SYMBOL_GPL(ip_tunnel_uninit); 1291 1292 /* Do least required initialization, rest of init is done in tunnel_init call */ 1293 void ip_tunnel_setup(struct net_device *dev, unsigned int net_id) 1294 { 1295 struct ip_tunnel *tunnel = netdev_priv(dev); 1296 tunnel->ip_tnl_net_id = net_id; 1297 } 1298 EXPORT_SYMBOL_GPL(ip_tunnel_setup); 1299 1300 MODULE_LICENSE("GPL"); 1301