// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2013 Nicira, Inc.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>
#include <linux/err.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/udp.h>
#include <net/dst_metadata.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
	return hash_32((__force u32)key ^ (__force u32)remote,
		       IP_TNL_HASH_BITS);
}

static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
				__be16 flags, __be32 key)
{
	if (p->i_flags & TUNNEL_KEY) {
		if (flags & TUNNEL_KEY)
			return key == p->i_key;
		else
			/* key expected, none present */
			return false;
	} else
		return !(flags & TUNNEL_KEY);
}

/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets that do not match a configured keyless tunnel
   will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for an input packet.
*/
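
/* Illustrative sketch (not part of this file): a protocol's decapsulation
 * handler typically resolves the receiving tunnel with something like
 *
 *	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
 *				  iph->saddr, iph->daddr, tpi->key);
 *
 * where @itn is the protocol's per-netns ip_tunnel_net and @tpi is the
 * parsed tunnel header.
 */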

struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;
	struct net_device *ndev;
	unsigned int hash;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (READ_ONCE(t->parms.link) == link)
			return t;
		cand = t;
	}

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    t->parms.iph.saddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (READ_ONCE(t->parms.link) == link)
			return t;
		if (!cand)
			cand = t;
	}

	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
			continue;

		if (!(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (READ_ONCE(t->parms.link) == link)
			return t;
		if (!cand)
			cand = t;
	}

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((!(flags & TUNNEL_NO_KEY) && t->parms.i_key != key) ||
		    t->parms.iph.saddr != 0 ||
		    t->parms.iph.daddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (READ_ONCE(t->parms.link) == link)
			return t;
		if (!cand)
			cand = t;
	}

	if (cand)
		return cand;

	t = rcu_dereference(itn->collect_md_tun);
	if (t && t->dev->flags & IFF_UP)
		return t;

	ndev = READ_ONCE(itn->fb_tunnel_dev);
	if (ndev && ndev->flags & IFF_UP)
		return netdev_priv(ndev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);

static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
				    struct ip_tunnel_parm *parms)
{
	unsigned int h;
	__be32 remote;
	__be32 i_key = parms->i_key;

	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
		remote = parms->iph.daddr;
	else
		remote = 0;

	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
		i_key = 0;

	h = ip_tunnel_hash(i_key, remote);
	return &itn->tunnels[h];
}

static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	struct hlist_head *head = ip_bucket(itn, &t->parms);

	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, t);
	hlist_add_head_rcu(&t->hash_node, head);
}

static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, NULL);
	hlist_del_init_rcu(&t->hash_node);
}
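
/* Find the tunnel whose configured endpoints, link, device type and key
 * match @parms exactly; used by the ioctl and netlink paths to detect an
 * already-existing tunnel before creating or reconfiguring one.
 */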

static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
					struct ip_tunnel_parm *parms,
					int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	__be16 flags = parms->i_flags;
	int link = parms->link;
	struct ip_tunnel *t = NULL;
	struct hlist_head *head = ip_bucket(itn, parms);

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    link == READ_ONCE(t->parms.link) &&
		    type == t->dev->type &&
		    ip_tunnel_key_match(&t->parms, flags, key))
			break;
	}
	return t;
}

static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	err = -E2BIG;
	if (parms->name[0]) {
		if (!dev_valid_name(parms->name))
			goto failed;
		strscpy(name, parms->name, IFNAMSIZ);
	} else {
		if (strlen(ops->kind) > (IFNAMSIZ - 3))
			goto failed;
		strcpy(name, ops->kind);
		strcat(name, "%d");
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}
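
/* Pick a plausible underlay device for the tunnel (via a route lookup on the
 * configured destination, or the bound link) so that needed_headroom can be
 * sized, and return a suitable MTU for the tunnel device.
 */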

static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
				    iph->saddr, tunnel->parms.o_key,
				    RT_TOS(iph->tos), dev_net(dev),
				    tunnel->parms.link, tunnel->fwmark, 0, 0);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;

		dst_cache_reset(&tunnel->dst_cache);
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = min(tdev->mtu, IP_MAX_MTU);
	}

	dev->needed_headroom = t_hlen + hlen;
	mtu -= t_hlen + (dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0);

	if (mtu < IPV4_MIN_MTU)
		mtu = IPV4_MIN_MTU;

	return mtu;
}

static struct ip_tunnel *ip_tunnel_create(struct net *net,
					  struct ip_tunnel_net *itn,
					  struct ip_tunnel_parm *parms)
{
	struct ip_tunnel *nt;
	struct net_device *dev;
	int t_hlen;
	int mtu;
	int err;

	dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
	if (IS_ERR(dev))
		return ERR_CAST(dev);

	mtu = ip_tunnel_bind_dev(dev);
	err = dev_set_mtu(dev, mtu);
	if (err)
		goto err_dev_set_mtu;

	nt = netdev_priv(dev);
	t_hlen = nt->hlen + sizeof(struct iphdr);
	dev->min_mtu = ETH_MIN_MTU;
	dev->max_mtu = IP_MAX_MTU - t_hlen;
	if (dev->type == ARPHRD_ETHER)
		dev->max_mtu -= dev->hard_header_len;

	ip_tunnel_add(itn, nt);
	return nt;

err_dev_set_mtu:
	unregister_netdevice(dev);
	return ERR_PTR(err);
}

void ip_tunnel_md_udp_encap(struct sk_buff *skb, struct ip_tunnel_info *info)
{
	const struct iphdr *iph = ip_hdr(skb);
	const struct udphdr *udph;

	if (iph->protocol != IPPROTO_UDP)
		return;

	udph = (struct udphdr *)((__u8 *)iph + (iph->ihl << 2));
	info->encap.sport = udph->source;
	info->encap.dport = udph->dest;
}
EXPORT_SYMBOL(ip_tunnel_md_udp_encap);
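
/* Common receive path for IP tunnels: validate the TUNNEL_CSUM and
 * TUNNEL_SEQ expectations against the parsed header, decapsulate ECN,
 * update per-cpu stats and hand the inner packet to gro_cells. Consumes
 * @skb and @tun_dst in all cases and always returns 0.
 */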

int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
		  bool log_ecn_error)
{
	const struct iphdr *iph = ip_hdr(skb);
	int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		DEV_STATS_INC(tunnel->dev, multicast);
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	if ((!(tpi->flags&TUNNEL_CSUM) && (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
	    ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
		DEV_STATS_INC(tunnel->dev, rx_crc_errors);
		DEV_STATS_INC(tunnel->dev, rx_errors);
		goto drop;
	}

	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
		if (!(tpi->flags&TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			DEV_STATS_INC(tunnel->dev, rx_fifo_errors);
			DEV_STATS_INC(tunnel->dev, rx_errors);
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	skb_set_network_header(skb, (tunnel->dev->type == ARPHRD_ETHER) ? ETH_HLEN : 0);

	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					     &iph->saddr, iph->tos);
		if (err > 1) {
			DEV_STATS_INC(tunnel->dev, rx_frame_errors);
			DEV_STATS_INC(tunnel->dev, rx_errors);
			goto drop;
		}
	}

	dev_sw_netstats_rx_add(tunnel->dev, skb->len);
	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	if (tun_dst)
		skb_dst_set(skb, (struct dst_entry *)tun_dst);

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	if (tun_dst)
		dst_release((struct dst_entry *)tun_dst);
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);

int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	return !cmpxchg((const struct ip_tunnel_encap_ops **)
			&iptun_encaps[num],
			NULL, ops) ? 0 : -1;
}
EXPORT_SYMBOL(ip_tunnel_encap_add_ops);

int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	int ret;

	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
		       &iptun_encaps[num],
		       ops, NULL) == ops) ? 0 : -1;

	synchronize_net();

	return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
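
/* Illustrative sketch (not part of this file): an encapsulation provider
 * such as FOU registers its ops for a slot at module init, roughly
 *
 *	static const struct ip_tunnel_encap_ops my_encap_ops = {
 *		.encap_hlen	= my_encap_hlen,
 *		.build_header	= my_build_header,
 *	};
 *	err = ip_tunnel_encap_add_ops(&my_encap_ops, TUNNEL_ENCAP_FOU);
 *
 * and unregisters with ip_tunnel_encap_del_ops() on exit. The my_* names
 * above are placeholders, not real symbols.
 */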

int ip_tunnel_encap_setup(struct ip_tunnel *t,
			  struct ip_tunnel_encap *ipencap)
{
	int hlen;

	memset(&t->encap, 0, sizeof(t->encap));

	hlen = ip_encap_hlen(ipencap);
	if (hlen < 0)
		return hlen;

	t->encap.type = ipencap->type;
	t->encap.sport = ipencap->sport;
	t->encap.dport = ipencap->dport;
	t->encap.flags = ipencap->flags;

	t->encap_hlen = hlen;
	t->hlen = t->encap_hlen + t->tun_hlen;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);

static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			   struct rtable *rt, __be16 df,
			   const struct iphdr *inner_iph,
			   int tunnel_hlen, __be32 dst, bool md)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size;
	int mtu;

	tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
	pkt_size = skb->len - tunnel_hlen;
	pkt_size -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;

	if (df) {
		mtu = dst_mtu(&rt->dst) - (sizeof(struct iphdr) + tunnel_hlen);
		mtu -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;
	} else {
		mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
	}

	if (skb_valid_dst(skb))
		skb_dst_update_pmtu_no_confirm(skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (inner_iph->frag_off & htons(IP_DF)) &&
		    mtu < pkt_size) {
			icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6;
		__be32 daddr;

		rt6 = skb_valid_dst(skb) ? (struct rt6_info *)skb_dst(skb) :
					   NULL;
		daddr = md ? dst : tunnel->parms.iph.daddr;

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
		    mtu >= IPV6_MIN_MTU) {
			if ((daddr && !ipv4_is_multicast(daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
		    mtu < pkt_size) {
			icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}

void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		       u8 proto, int tunnel_hlen)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	u32 headroom = sizeof(struct iphdr);
	struct ip_tunnel_info *tun_info;
	const struct ip_tunnel_key *key;
	const struct iphdr *inner_iph;
	struct rtable *rt = NULL;
	struct flowi4 fl4;
	__be16 df = 0;
	u8 tos, ttl;
	bool use_cache;

	tun_info = skb_tunnel_info(skb);
	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
		     ip_tunnel_info_af(tun_info) != AF_INET))
		goto tx_error;
	key = &tun_info->key;
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	tos = key->tos;
	if (tos == 1) {
		if (skb->protocol == htons(ETH_P_IP))
			tos = inner_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
	}
	ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
			    tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
			    dev_net(dev), 0, skb->mark, skb_get_hash(skb),
			    key->flow_flags);

	if (!tunnel_hlen)
		tunnel_hlen = ip_encap_hlen(&tun_info->encap);

	if (ip_tunnel_encap(skb, &tun_info->encap, &proto, &fl4) < 0)
		goto tx_error;

	use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
	if (use_cache)
		rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr);
	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);
		if (IS_ERR(rt)) {
			DEV_STATS_INC(dev, tx_carrier_errors);
			goto tx_error;
		}
		if (use_cache)
			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
					  fl4.saddr);
	}
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		DEV_STATS_INC(dev, collisions);
		goto tx_error;
	}

	if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
		df = htons(IP_DF);
	if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
			    key->u.ipv4.dst, true)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = key->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
	if (headroom > READ_ONCE(dev->needed_headroom))
		WRITE_ONCE(dev->needed_headroom, headroom);

	if (skb_cow_head(skb, READ_ONCE(dev->needed_headroom))) {
		ip_rt_put(rt);
		goto tx_dropped;
	}
	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;
tx_error:
	DEV_STATS_INC(dev, tx_errors);
	goto kfree;
tx_dropped:
	DEV_STATS_INC(dev, tx_dropped);
kfree:
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
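
/* Classic (non-metadata) transmit path: route the outer packet based on the
 * tunnel's configured parameters in @tnl_params, apply ECN/TTL/DF policy,
 * update PMTU state and hand the encapsulated skb to iptunnel_xmit().
 */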

void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_info *tun_info = NULL;
	const struct iphdr *inner_iph;
	unsigned int max_headroom;	/* The extra header space needed */
	struct rtable *rt = NULL;	/* Route to the other host */
	__be16 payload_protocol;
	bool use_cache = false;
	struct flowi4 fl4;
	bool md = false;
	bool connected;
	u8 tos, ttl;
	__be32 dst;
	__be16 df;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	connected = (tunnel->parms.iph.daddr != 0);
	payload_protocol = skb_protocol(skb, true);

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel */

		if (!skb_dst(skb)) {
			DEV_STATS_INC(dev, tx_fifo_errors);
			goto tx_error;
		}

		tun_info = skb_tunnel_info(skb);
		if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
		    ip_tunnel_info_af(tun_info) == AF_INET &&
		    tun_info->key.u.ipv4.dst) {
			dst = tun_info->key.u.ipv4.dst;
			md = true;
			connected = true;
		} else if (payload_protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (payload_protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (!neigh)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		if (!md)
			connected = false;
	}

	tos = tnl_params->tos;
	if (tos & 0x1) {
		tos &= ~0x1;
		if (payload_protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (payload_protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
			    tunnel->parms.o_key, RT_TOS(tos),
			    dev_net(dev), READ_ONCE(tunnel->parms.link),
			    tunnel->fwmark, skb_get_hash(skb), 0);

	if (ip_tunnel_encap(skb, &tunnel->encap, &protocol, &fl4) < 0)
		goto tx_error;

	if (connected && md) {
		use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
		if (use_cache)
			rt = dst_cache_get_ip4(&tun_info->dst_cache,
					       &fl4.saddr);
	} else {
		rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache,
						   &fl4.saddr) : NULL;
	}

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			DEV_STATS_INC(dev, tx_carrier_errors);
			goto tx_error;
		}
		if (use_cache)
			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
					  fl4.saddr);
		else if (!md && connected)
			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
					  fl4.saddr);
	}

	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		DEV_STATS_INC(dev, collisions);
		goto tx_error;
	}

	df = tnl_params->frag_off;
	if (payload_protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
		df |= (inner_iph->frag_off & htons(IP_DF));

	if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, 0, 0, false)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		if (payload_protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (payload_protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
	if (max_headroom > READ_ONCE(dev->needed_headroom))
		WRITE_ONCE(dev->needed_headroom, max_headroom);

	if (skb_cow_head(skb, READ_ONCE(dev->needed_headroom))) {
		ip_rt_put(rt);
		DEV_STATS_INC(dev, tx_dropped);
		kfree_skb(skb);
		return;
	}

	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	DEV_STATS_INC(dev, tx_errors);
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);

static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu,
			     __u32 fwmark)
{
	ip_tunnel_del(itn, t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		__dev_addr_set(dev, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link || t->fwmark != fwmark) {
		int mtu;

		WRITE_ONCE(t->parms.link, p->link);
		t->fwmark = fwmark;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	dst_cache_reset(&t->dst_cache);
	netdev_state_change(dev);
}
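
/* Backend for the legacy SIOCGETTUNNEL/SIOCADDTUNNEL/SIOCCHGTUNNEL/
 * SIOCDELTUNNEL ioctls; tunnel drivers expose it through their
 * ndo_tunnel_ctl callback and ip_tunnel_siocdevprivate() below.
 */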

int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		if (dev == itn->fb_tunnel_dev) {
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		if (!(p->i_flags & VTI_ISVTI)) {
			if (!(p->i_flags & TUNNEL_KEY))
				p->i_key = 0;
			if (!(p->o_flags & TUNNEL_KEY))
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->type);

		if (cmd == SIOCADDTUNNEL) {
			if (!t) {
				t = ip_tunnel_create(net, itn, p);
				err = PTR_ERR_OR_ZERO(t);
				break;
			}

			err = -EEXIST;
			break;
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true, 0);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ctl);

int ip_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
			     void __user *data, int cmd)
{
	struct ip_tunnel_parm p;
	int err;

	if (copy_from_user(&p, data, sizeof(p)))
		return -EFAULT;
	err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd);
	if (!err && copy_to_user(data, &p, sizeof(p)))
		return -EFAULT;
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_siocdevprivate);
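
/* Illustrative sketch (not part of this file): a tunnel driver typically
 * wires the two helpers above into its netdev ops, e.g.
 *
 *	static const struct net_device_ops my_tunnel_netdev_ops = {
 *		.ndo_tunnel_ctl		= my_tunnel_ctl,
 *		.ndo_siocdevprivate	= ip_tunnel_siocdevprivate,
 *	};
 *
 * where my_tunnel_ctl() stands in for the driver's own wrapper that
 * validates protocol-specific fields before calling ip_tunnel_ctl().
 */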

int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
	int max_mtu = IP_MAX_MTU - t_hlen;

	if (dev->type == ARPHRD_ETHER)
		max_mtu -= dev->hard_header_len;

	if (new_mtu < ETH_MIN_MTU)
		return -EINVAL;

	if (new_mtu > max_mtu) {
		if (strict)
			return -EINVAL;

		new_mtu = max_mtu;
	}

	dev->mtu = new_mtu;
	return 0;
}
EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);

int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	return __ip_tunnel_change_mtu(dev, new_mtu, true);
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);

static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	dst_cache_destroy(&tunnel->dst_cache);
	free_percpu(dev->tstats);
}

void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_net *itn;

	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

	if (itn->fb_tunnel_dev != dev) {
		ip_tunnel_del(itn, netdev_priv(dev));
		unregister_netdevice_queue(dev, head);
	}
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);

struct net *ip_tunnel_get_link_net(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->net;
}
EXPORT_SYMBOL(ip_tunnel_get_link_net);

int ip_tunnel_get_iflink(const struct net_device *dev)
{
	const struct ip_tunnel *tunnel = netdev_priv(dev);

	return READ_ONCE(tunnel->parms.link);
}
EXPORT_SYMBOL(ip_tunnel_get_iflink);
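
/* Per-netns initialization: set up the hash table and, when fallback
 * tunnels are enabled for this namespace, create the namespace-local
 * fallback device named @devname.
 */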

int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
		       struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm parms;
	unsigned int i;

	itn->rtnl_link_ops = ops;
	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	if (!ops || !net_has_fallback_tunnels(net)) {
		struct ip_tunnel_net *it_init_net;

		it_init_net = net_generic(&init_net, ip_tnl_net_id);
		itn->type = it_init_net->type;
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strscpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* FB netdevice is special: we have one, and only one per netns.
	 * Allowing to move it to another netns is clearly unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
		itn->type = itn->fb_tunnel_dev->type;
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);

static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
			      struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net_device *dev, *aux;
	int h;

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}

void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
			   struct rtnl_link_ops *ops,
			   struct list_head *dev_to_kill)
{
	struct ip_tunnel_net *itn;
	struct net *net;

	ASSERT_RTNL();
	list_for_each_entry(net, net_list, exit_list) {
		itn = net_generic(net, id);
		ip_tunnel_destroy(net, itn, dev_to_kill, ops);
	}
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);

int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
		      struct ip_tunnel_parm *p, __u32 fwmark)
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ip_tunnel_net *itn;
	int mtu;
	int err;

	nt = netdev_priv(dev);
	itn = net_generic(net, nt->ip_tnl_net_id);

	if (nt->collect_md) {
		if (rtnl_dereference(itn->collect_md_tun))
			return -EEXIST;
	} else {
		if (ip_tunnel_find(itn, p, dev->type))
			return -EEXIST;
	}

	nt->net = net;
	nt->parms = *p;
	nt->fwmark = fwmark;
	err = register_netdevice(dev);
	if (err)
		goto err_register_netdevice;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ip_tunnel_bind_dev(dev);
	if (tb[IFLA_MTU]) {
		unsigned int max = IP_MAX_MTU - (nt->hlen + sizeof(struct iphdr));

		if (dev->type == ARPHRD_ETHER)
			max -= dev->hard_header_len;

		mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU, max);
	}

	err = dev_set_mtu(dev, mtu);
	if (err)
		goto err_dev_set_mtu;

	ip_tunnel_add(itn, nt);
	return 0;

err_dev_set_mtu:
	unregister_netdevice(dev);
err_register_netdevice:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);

int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
			 struct ip_tunnel_parm *p, __u32 fwmark)
{
	struct ip_tunnel *t;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	if (dev == itn->fb_tunnel_dev)
		return -EINVAL;

	t = ip_tunnel_find(itn, p, dev->type);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = tunnel;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p->iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p->iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}
	}

	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
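
/* Common ndo_init for tunnel devices: allocate the per-cpu stats, the
 * per-tunnel dst cache and the GRO cells, and seed the outer IP header
 * template in tunnel->parms.iph.
 */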

int ip_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	int err;

	dev->needs_free_netdev = true;
	dev->priv_destructor = ip_tunnel_dev_free;
	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	if (!dev->tstats)
		return -ENOMEM;

	err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
	if (err) {
		free_percpu(dev->tstats);
		return err;
	}

	err = gro_cells_init(&tunnel->gro_cells, dev);
	if (err) {
		dst_cache_destroy(&tunnel->dst_cache);
		free_percpu(dev->tstats);
		return err;
	}

	tunnel->dev = dev;
	tunnel->net = dev_net(dev);
	strcpy(tunnel->parms.name, dev->name);
	iph->version = 4;
	iph->ihl = 5;

	if (tunnel->collect_md)
		netif_keep_dst(dev);
	netdev_lockdep_set_classes(dev);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);

void ip_tunnel_uninit(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn;

	itn = net_generic(net, tunnel->ip_tnl_net_id);
	ip_tunnel_del(itn, netdev_priv(dev));
	if (itn->fb_tunnel_dev == dev)
		WRITE_ONCE(itn->fb_tunnel_dev, NULL);

	dst_cache_reset(&tunnel->dst_cache);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);

/* Do least required initialization, rest of init is done in tunnel_init call */
void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);

MODULE_DESCRIPTION("IPv4 tunnel implementation library");
MODULE_LICENSE("GPL");