// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * ip_vs_xmit.c: various packet transmitters for IPVS
 *
 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
 *              Julian Anastasov <ja@ssi.bg>
 *
 * Changes:
 *
 * Description of forwarding methods:
 * - all transmitters are called from LOCAL_IN (remote clients) and
 *   LOCAL_OUT (local clients) but for ICMP can be called from FORWARD
 * - not all connections have a destination server, for example,
 *   connections in the backup server when fwmark is used
 * - bypass connections use daddr from packet
 * - we can use dst without ref while sending in RCU section, we use
 *   ref when returning NF_ACCEPT for NAT-ed packet via loopback
 * LOCAL_OUT rules:
 * - skb->dev is NULL, skb->protocol is not set (both are set in POST_ROUTING)
 * - skb->pkt_type is not set yet
 * - the only place where we can see skb->sk != NULL
 */

#define pr_fmt(fmt) "IPVS: " fmt

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/tcp.h>			/* for tcphdr */
#include <net/ip.h>
#include <net/gue.h>
#include <net/gre.h>
#include <net/tcp.h>			/* for csum_tcpudp_magic */
#include <net/udp.h>
#include <net/icmp.h>			/* for icmp_send */
#include <net/route.h>			/* for ip_route_output */
#include <net/ipv6.h>
#include <net/ip6_route.h>
#include <net/ip_tunnels.h>
#include <net/ip6_checksum.h>
#include <net/addrconf.h>
#include <linux/icmpv6.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>

#include <net/ip_vs.h>

enum {
	IP_VS_RT_MODE_LOCAL	= 1, /* Allow local dest */
	IP_VS_RT_MODE_NON_LOCAL	= 2, /* Allow non-local dest */
	IP_VS_RT_MODE_RDR	= 4, /* Allow redirect from remote daddr to
				      * local
				      */
	IP_VS_RT_MODE_CONNECT	= 8, /* Always bind route to saddr */
	IP_VS_RT_MODE_KNOWN_NH	= 16, /* Route via remote addr */
	IP_VS_RT_MODE_TUNNEL	= 32, /* Tunnel mode */
};

static inline struct ip_vs_dest_dst *ip_vs_dest_dst_alloc(void)
{
	return kmalloc(sizeof(struct ip_vs_dest_dst), GFP_ATOMIC);
}

static inline void ip_vs_dest_dst_free(struct ip_vs_dest_dst *dest_dst)
{
	kfree(dest_dst);
}

/*
 *      Destination cache to speed up outgoing route lookup
 */
static inline void
__ip_vs_dst_set(struct ip_vs_dest *dest, struct ip_vs_dest_dst *dest_dst,
		struct dst_entry *dst, u32 dst_cookie)
{
	struct ip_vs_dest_dst *old;

	old = rcu_dereference_protected(dest->dest_dst,
					lockdep_is_held(&dest->dst_lock));

	if (dest_dst) {
		dest_dst->dst_cache = dst;
		dest_dst->dst_cookie = dst_cookie;
	}
	rcu_assign_pointer(dest->dest_dst, dest_dst);

	if (old)
		call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free);
}

static inline struct ip_vs_dest_dst *
__ip_vs_dst_check(struct ip_vs_dest *dest)
{
	struct ip_vs_dest_dst *dest_dst = rcu_dereference(dest->dest_dst);
	struct dst_entry *dst;

	if (!dest_dst)
		return NULL;
	dst = dest_dst->dst_cache;
	if (READ_ONCE(dst->obsolete) &&
	    dst->ops->check(dst, dest_dst->dst_cookie) == NULL)
		return NULL;
	return dest_dst;
}
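
/*
 * Summary of the locking scheme above (derived from the code, not extra
 * behaviour): readers call __ip_vs_dst_check() under rcu_read_lock() and
 * may use the cached dst without taking a reference, while writers
 * replace dest->dest_dst only under dest->dst_lock and retire the old
 * entry through call_rcu(), so a concurrent reader never sees a freed
 * dst_cache.
 */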

static inline bool
__mtu_check_toobig_v6(const struct sk_buff *skb, u32 mtu)
{
	if (IP6CB(skb)->frag_max_size) {
		/* frag_max_size tells us that this packet has been
		 * defragmented by the netfilter IPv6 conntrack module.
		 */
		if (IP6CB(skb)->frag_max_size > mtu)
			return true;	/* largest fragment violates MTU */
	} else if (skb->len > mtu && !skb_is_gso(skb)) {
		return true;		/* packet size violates MTU */
	}
	return false;
}

/* Get route to daddr, optionally bind route to saddr */
static struct rtable *do_output_route4(struct net *net, __be32 daddr,
				       int rt_mode, __be32 *ret_saddr)
{
	struct flowi4 fl4;
	struct rtable *rt;

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = daddr;
	fl4.flowi4_flags = (rt_mode & IP_VS_RT_MODE_KNOWN_NH) ?
			   FLOWI_FLAG_KNOWN_NH : 0;

retry:
	rt = ip_route_output_key(net, &fl4);
	if (IS_ERR(rt)) {
		IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &daddr);
		return NULL;
	}
	if (rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) {
		ip_rt_put(rt);
		flowi4_update_output(&fl4, 0, daddr, fl4.saddr);
		rt_mode = 0;
		goto retry;
	}
	if (ret_saddr)
		*ret_saddr = fl4.saddr;
	return rt;
}

#ifdef CONFIG_IP_VS_IPV6
static inline int __ip_vs_is_local_route6(struct rt6_info *rt)
{
	return rt->dst.dev && rt->dst.dev->flags & IFF_LOOPBACK;
}
#endif

static inline bool crosses_local_route_boundary(int skb_af, struct sk_buff *skb,
						int rt_mode,
						bool new_rt_is_local)
{
	bool rt_mode_allow_local = !!(rt_mode & IP_VS_RT_MODE_LOCAL);
	bool rt_mode_allow_non_local = !!(rt_mode & IP_VS_RT_MODE_NON_LOCAL);
	bool rt_mode_allow_redirect = !!(rt_mode & IP_VS_RT_MODE_RDR);
	bool source_is_loopback;
	bool old_rt_is_local;

#ifdef CONFIG_IP_VS_IPV6
	if (skb_af == AF_INET6) {
		int addr_type = ipv6_addr_type(&ipv6_hdr(skb)->saddr);

		source_is_loopback =
			(!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
			(addr_type & IPV6_ADDR_LOOPBACK);
		old_rt_is_local = __ip_vs_is_local_route6(
			dst_rt6_info(skb_dst(skb)));
	} else
#endif
	{
		source_is_loopback = ipv4_is_loopback(ip_hdr(skb)->saddr);
		old_rt_is_local = skb_rtable(skb)->rt_flags & RTCF_LOCAL;
	}

	if (unlikely(new_rt_is_local)) {
		if (!rt_mode_allow_local)
			return true;
		if (!rt_mode_allow_redirect && !old_rt_is_local)
			return true;
	} else {
		if (!rt_mode_allow_non_local)
			return true;
		if (source_is_loopback)
			return true;
	}
	return false;
}
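
/*
 * For reference, the checks above collapse to this decision matrix
 * (a restatement of the code, nothing more):
 *
 *	new route	old route	allowed when
 *	---------	---------	------------------------------------
 *	local		local		rt_mode has IP_VS_RT_MODE_LOCAL
 *	local		non-local	rt_mode also has IP_VS_RT_MODE_RDR
 *	non-local	any		rt_mode has IP_VS_RT_MODE_NON_LOCAL
 *					and the source is not loopback
 */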

static inline void maybe_update_pmtu(int skb_af, struct sk_buff *skb, int mtu)
{
	struct sock *sk = skb->sk;
	struct rtable *ort = skb_rtable(skb);

	if (!skb->dev && sk && sk_fullsock(sk))
		ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu, true);
}

static inline bool ensure_mtu_is_adequate(struct netns_ipvs *ipvs, int skb_af,
					  int rt_mode,
					  struct ip_vs_iphdr *ipvsh,
					  struct sk_buff *skb, int mtu)
{
#ifdef CONFIG_IP_VS_IPV6
	if (skb_af == AF_INET6) {
		struct net *net = ipvs->net;

		if (unlikely(__mtu_check_toobig_v6(skb, mtu))) {
			if (!skb->dev)
				skb->dev = net->loopback_dev;
			/* only send ICMP too big on first fragment */
			if (!ipvsh->fragoffs && !ip_vs_iph_icmp(ipvsh))
				icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			IP_VS_DBG(1, "frag needed for %pI6c\n",
				  &ipv6_hdr(skb)->saddr);
			return false;
		}
	} else
#endif
	{
		/* If we're going to tunnel the packet and pmtu discovery
		 * is disabled, we'll just fragment it anyway
		 */
		if ((rt_mode & IP_VS_RT_MODE_TUNNEL) && !sysctl_pmtu_disc(ipvs))
			return true;

		if (unlikely(ip_hdr(skb)->frag_off & htons(IP_DF) &&
			     skb->len > mtu && !skb_is_gso(skb) &&
			     !ip_vs_iph_icmp(ipvsh))) {
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
				  htonl(mtu));
			IP_VS_DBG(1, "frag needed for %pI4\n",
				  &ip_hdr(skb)->saddr);
			return false;
		}
	}

	return true;
}

static inline bool decrement_ttl(struct netns_ipvs *ipvs,
				 int skb_af,
				 struct sk_buff *skb)
{
	struct net *net = ipvs->net;

#ifdef CONFIG_IP_VS_IPV6
	if (skb_af == AF_INET6) {
		struct dst_entry *dst = skb_dst(skb);

		/* check and decrement ttl */
		if (ipv6_hdr(skb)->hop_limit <= 1) {
			struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);

			/* Force OUTPUT device used as source address */
			skb->dev = dst->dev;
			icmpv6_send(skb, ICMPV6_TIME_EXCEED,
				    ICMPV6_EXC_HOPLIMIT, 0);
			IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

			return false;
		}

		/* don't propagate ttl change to cloned packets */
		if (skb_ensure_writable(skb, sizeof(struct ipv6hdr)))
			return false;

		ipv6_hdr(skb)->hop_limit--;
	} else
#endif
	{
		if (ip_hdr(skb)->ttl <= 1) {
			/* Tell the sender its packet died... */
			IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
			icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
			return false;
		}

		/* don't propagate ttl change to cloned packets */
		if (skb_ensure_writable(skb, sizeof(struct iphdr)))
			return false;

		/* Decrease ttl */
		ip_decrease_ttl(ip_hdr(skb));
	}

	return true;
}

/* Get route to destination or remote server */
static int
__ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
		   struct ip_vs_dest *dest,
		   __be32 daddr, int rt_mode, __be32 *ret_saddr,
		   struct ip_vs_iphdr *ipvsh)
{
	struct net *net = ipvs->net;
	struct ip_vs_dest_dst *dest_dst;
	struct rtable *rt;			/* Route to the other host */
	int mtu;
	int local, noref = 1;

	if (dest) {
		dest_dst = __ip_vs_dst_check(dest);
		if (likely(dest_dst))
			rt = dst_rtable(dest_dst->dst_cache);
		else {
			dest_dst = ip_vs_dest_dst_alloc();
			spin_lock_bh(&dest->dst_lock);
			if (!dest_dst) {
				__ip_vs_dst_set(dest, NULL, NULL, 0);
				spin_unlock_bh(&dest->dst_lock);
				goto err_unreach;
			}
			rt = do_output_route4(net, dest->addr.ip, rt_mode,
					      &dest_dst->dst_saddr.ip);
			if (!rt) {
				__ip_vs_dst_set(dest, NULL, NULL, 0);
				spin_unlock_bh(&dest->dst_lock);
				ip_vs_dest_dst_free(dest_dst);
				goto err_unreach;
			}
			__ip_vs_dst_set(dest, dest_dst, &rt->dst, 0);
			spin_unlock_bh(&dest->dst_lock);
			IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n",
				  &dest->addr.ip, &dest_dst->dst_saddr.ip,
				  rcuref_read(&rt->dst.__rcuref));
		}
		if (ret_saddr)
			*ret_saddr = dest_dst->dst_saddr.ip;
	} else {
		noref = 0;

		/* For such unconfigured boxes avoid many route lookups
		 * for performance reasons because we do not remember saddr
		 */
		rt_mode &= ~IP_VS_RT_MODE_CONNECT;
		rt = do_output_route4(net, daddr, rt_mode, ret_saddr);
		if (!rt)
			goto err_unreach;
	}
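
	/* At this point rt is resolved.  Recap of the paths below: the
	 * function returns -1 on error, 0 for a non-local destination
	 * (dst installed on the skb) and 1 for a local destination (old
	 * route preserved).
	 */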
	local = (rt->rt_flags & RTCF_LOCAL) ? 1 : 0;
	if (unlikely(crosses_local_route_boundary(skb_af, skb, rt_mode,
						  local))) {
		IP_VS_DBG_RL("We are crossing local and non-local addresses"
			     " daddr=%pI4\n", &daddr);
		goto err_put;
	}

	if (unlikely(local)) {
		/* skb to local stack, preserve old route */
		if (!noref)
			ip_rt_put(rt);
		return local;
	}

	if (!decrement_ttl(ipvs, skb_af, skb))
		goto err_put;

	if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) {
		mtu = dst_mtu(&rt->dst);
	} else {
		mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
		if (!dest)
			goto err_put;
		if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
			mtu -= sizeof(struct udphdr) + sizeof(struct guehdr);
			if ((dest->tun_flags &
			     IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
			    skb->ip_summed == CHECKSUM_PARTIAL)
				mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
		} else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
			IP_TUNNEL_DECLARE_FLAGS(tflags) = { };

			if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
				__set_bit(IP_TUNNEL_CSUM_BIT, tflags);
			mtu -= gre_calc_hlen(tflags);
		}
		if (mtu < 68) {
			IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
			goto err_put;
		}
		maybe_update_pmtu(skb_af, skb, mtu);
	}

	if (!ensure_mtu_is_adequate(ipvs, skb_af, rt_mode, ipvsh, skb, mtu))
		goto err_put;

	skb_dst_drop(skb);
	if (noref)
		skb_dst_set_noref(skb, &rt->dst);
	else
		skb_dst_set(skb, &rt->dst);

	return local;

err_put:
	if (!noref)
		ip_rt_put(rt);
	return -1;

err_unreach:
	if (!skb->dev)
		skb->dev = skb_dst(skb)->dev;

	dst_link_failure(skb);
	return -1;
}

#ifdef CONFIG_IP_VS_IPV6
static struct dst_entry *
__ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr,
			struct in6_addr *ret_saddr, int do_xfrm, int rt_mode)
{
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.daddr = *daddr,
	};

	if (rt_mode & IP_VS_RT_MODE_KNOWN_NH)
		fl6.flowi6_flags = FLOWI_FLAG_KNOWN_NH;

	dst = ip6_route_output(net, NULL, &fl6);
	if (dst->error)
		goto out_err;
	if (!ret_saddr)
		return dst;
	if (ipv6_addr_any(&fl6.saddr) &&
	    ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev,
			       &fl6.daddr, 0, &fl6.saddr) < 0)
		goto out_err;
	if (do_xfrm) {
		dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
		if (IS_ERR(dst)) {
			dst = NULL;
			goto out_err;
		}
	}
	*ret_saddr = fl6.saddr;
	return dst;

out_err:
	dst_release(dst);
	IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", daddr);
	return NULL;
}

/*
 * Get route to destination or remote server
 */
static int
__ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
		      struct ip_vs_dest *dest,
		      struct in6_addr *daddr, struct in6_addr *ret_saddr,
		      struct ip_vs_iphdr *ipvsh, int do_xfrm, int rt_mode)
{
	struct net *net = ipvs->net;
	struct ip_vs_dest_dst *dest_dst;
	struct rt6_info *rt;			/* Route to the other host */
	struct dst_entry *dst;
	int mtu;
	int local, noref = 1;

	if (dest) {
		dest_dst = __ip_vs_dst_check(dest);
		if (likely(dest_dst))
			rt = dst_rt6_info(dest_dst->dst_cache);
		else {
			u32 cookie;

			dest_dst = ip_vs_dest_dst_alloc();
			spin_lock_bh(&dest->dst_lock);
			if (!dest_dst) {
				__ip_vs_dst_set(dest, NULL, NULL, 0);
				spin_unlock_bh(&dest->dst_lock);
				goto err_unreach;
			}
			dst = __ip_vs_route_output_v6(net, &dest->addr.in6,
						      &dest_dst->dst_saddr.in6,
						      do_xfrm, rt_mode);
			if (!dst) {
				__ip_vs_dst_set(dest, NULL, NULL, 0);
				spin_unlock_bh(&dest->dst_lock);
				ip_vs_dest_dst_free(dest_dst);
				goto err_unreach;
			}
			rt = dst_rt6_info(dst);
			cookie = rt6_get_cookie(rt);
			__ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie);
			spin_unlock_bh(&dest->dst_lock);
			IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
				  &dest->addr.in6, &dest_dst->dst_saddr.in6,
				  rcuref_read(&rt->dst.__rcuref));
		}
		if (ret_saddr)
			*ret_saddr = dest_dst->dst_saddr.in6;
	} else {
		noref = 0;
		dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm,
					      rt_mode);
		if (!dst)
			goto err_unreach;
		rt = dst_rt6_info(dst);
	}

	local = __ip_vs_is_local_route6(rt);

	if (unlikely(crosses_local_route_boundary(skb_af, skb, rt_mode,
						  local))) {
		IP_VS_DBG_RL("We are crossing local and non-local addresses"
			     " daddr=%pI6\n", daddr);
		goto err_put;
	}

	if (unlikely(local)) {
		/* skb to local stack, preserve old route */
		if (!noref)
			dst_release(&rt->dst);
		return local;
	}

	if (!decrement_ttl(ipvs, skb_af, skb))
		goto err_put;

	/* MTU checking */
	if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL)))
		mtu = dst_mtu(&rt->dst);
	else {
		mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
		if (!dest)
			goto err_put;
		if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
			mtu -= sizeof(struct udphdr) + sizeof(struct guehdr);
			if ((dest->tun_flags &
			     IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
			    skb->ip_summed == CHECKSUM_PARTIAL)
				mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
		} else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
			IP_TUNNEL_DECLARE_FLAGS(tflags) = { };

			if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
				__set_bit(IP_TUNNEL_CSUM_BIT, tflags);
			mtu -= gre_calc_hlen(tflags);
		}
		if (mtu < IPV6_MIN_MTU) {
			IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
				     IPV6_MIN_MTU);
			goto err_put;
		}
		maybe_update_pmtu(skb_af, skb, mtu);
	}

	if (!ensure_mtu_is_adequate(ipvs, skb_af, rt_mode, ipvsh, skb, mtu))
		goto err_put;

	skb_dst_drop(skb);
	if (noref)
		skb_dst_set_noref(skb, &rt->dst);
	else
		skb_dst_set(skb, &rt->dst);

	return local;

err_put:
	if (!noref)
		dst_release(&rt->dst);
	return -1;

err_unreach:
	/* The ip6_link_failure function requires the dev field to be set
	 * in order to get the net (further for the sake of fwmark
	 * reflection).
	 */
	if (!skb->dev)
		skb->dev = skb_dst(skb)->dev;

	dst_link_failure(skb);
	return -1;
}
#endif


/* return NF_ACCEPT to allow forwarding or other NF_xxx on error */
static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb,
					    struct ip_vs_conn *cp)
{
	int ret = NF_ACCEPT;

	skb->ipvs_property = 1;
	if (unlikely(cp->flags & IP_VS_CONN_F_NFCT))
		ret = ip_vs_confirm_conntrack(skb);
	if (ret == NF_ACCEPT) {
		nf_reset_ct(skb);
		skb_forward_csum(skb);
		if (skb->dev)
			skb_clear_tstamp(skb);
	}
	return ret;
}
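
/*
 * Typical caller pattern (this mirrors ip_vs_tunnel_xmit() further
 * below):
 *
 *	ret = ip_vs_tunnel_xmit_prepare(skb, cp);
 *	if (ret == NF_ACCEPT)
 *		ip_local_out(net, skb->sk, skb);
 *	else if (ret == NF_DROP)
 *		kfree_skb(skb);
 */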

/* In the event of a remote destination, it's possible that we would have
 * matches against an old socket (particularly a TIME-WAIT socket). This
 * causes havoc down the line (ip_local_out et al. expect regular sockets
 * and invalid memory accesses will happen) so simply drop the association
 * in this case.
 */
static inline void ip_vs_drop_early_demux_sk(struct sk_buff *skb)
{
	/* If dev is set, the packet came from the LOCAL_IN callback and
	 * not from a local TCP socket.
	 */
	if (skb->dev)
		skb_orphan(skb);
}

/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */
static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb,
					 struct ip_vs_conn *cp, int local)
{
	int ret = NF_STOLEN;

	skb->ipvs_property = 1;
	if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
		ip_vs_notrack(skb);
	else
		ip_vs_update_conntrack(skb, cp, 1);

	/* Remove the early_demux association unless it's bound for the
	 * exact same port and address on this host after translation.
	 */
	if (!local || cp->vport != cp->dport ||
	    !ip_vs_addr_equal(cp->af, &cp->vaddr, &cp->daddr))
		ip_vs_drop_early_demux_sk(skb);

	if (!local) {
		skb_forward_csum(skb);
		if (skb->dev)
			skb_clear_tstamp(skb);
		NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb,
			NULL, skb_dst(skb)->dev, dst_output);
	} else
		ret = NF_ACCEPT;

	return ret;
}

/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */
static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb,
				     struct ip_vs_conn *cp, int local)
{
	int ret = NF_STOLEN;

	skb->ipvs_property = 1;
	if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
		ip_vs_notrack(skb);
	if (!local) {
		ip_vs_drop_early_demux_sk(skb);
		skb_forward_csum(skb);
		if (skb->dev)
			skb_clear_tstamp(skb);
		NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb,
			NULL, skb_dst(skb)->dev, dst_output);
	} else
		ret = NF_ACCEPT;
	return ret;
}
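
/*
 * The two helpers above differ only in conntrack handling: the NAT
 * variant updates the existing conntrack entry for translated packets
 * (ip_vs_update_conntrack), while the plain variant never tracks.
 * Both re-inject non-local packets through the LOCAL_OUT hook.
 */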


/*
 *      NULL transmitter (do nothing except return NF_ACCEPT)
 */
int
ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
		struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	/* we do not touch skb and do not need pskb ptr */
	return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
}


/*
 *      Bypass transmitter
 *      Let packets bypass the destination when the destination is not
 *      available; it may only be used in a transparent cache cluster.
 */
int
ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
		  struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	struct iphdr *iph = ip_hdr(skb);

	if (__ip_vs_get_out_rt(cp->ipvs, cp->af, skb, NULL, iph->daddr,
			       IP_VS_RT_MODE_NON_LOCAL, NULL, ipvsh) < 0)
		goto tx_error;

	ip_send_check(iph);

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);

	return NF_STOLEN;

tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}

#ifdef CONFIG_IP_VS_IPV6
int
ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
		     struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	struct ipv6hdr *iph = ipv6_hdr(skb);

	if (__ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, NULL,
				  &iph->daddr, NULL,
				  ipvsh, 0, IP_VS_RT_MODE_NON_LOCAL) < 0)
		goto tx_error;

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);

	return NF_STOLEN;

tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}
#endif

/*
 *      NAT transmitter (only for outside-to-inside nat forwarding)
 *      Not used for related ICMP
 */
int
ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
	       struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	struct rtable *rt;		/* Route to the other host */
	int local, rc, was_input;

	/* check if it is a connection of no-client-port */
	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
		__be16 _pt, *p;

		p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt);
		if (p == NULL)
			goto tx_error;
		ip_vs_conn_fill_cport(cp, *p);
		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
	}

	was_input = rt_is_input_route(skb_rtable(skb));
	local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
				   IP_VS_RT_MODE_LOCAL |
				   IP_VS_RT_MODE_NON_LOCAL |
				   IP_VS_RT_MODE_RDR, NULL, ipvsh);
	if (local < 0)
		goto tx_error;
	rt = skb_rtable(skb);
	/*
	 * Avoid duplicate tuple in reply direction for NAT traffic
	 * to local address when connection is sync-ed
	 */
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
		enum ip_conntrack_info ctinfo;
		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);

		if (ct) {
			IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, ipvsh->off,
					 "ip_vs_nat_xmit(): "
					 "stopping DNAT to local address");
			goto tx_error;
		}
	}
#endif

	/* From world but DNAT to loopback address? */
	if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) {
		IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, ipvsh->off,
				 "ip_vs_nat_xmit(): stopping DNAT to loopback "
				 "address");
		goto tx_error;
	}

	/* copy-on-write the packet before mangling it */
	if (skb_ensure_writable(skb, sizeof(struct iphdr)))
		goto tx_error;

	if (skb_cow(skb, rt->dst.dev->hard_header_len))
		goto tx_error;

	/* mangle the packet */
	if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh))
		goto tx_error;
	ip_hdr(skb)->daddr = cp->daddr.ip;
	ip_send_check(ip_hdr(skb));

	IP_VS_DBG_PKT(10, AF_INET, pp, skb, ipvsh->off, "After DNAT");

	/* FIXME: when the application helper enlarges the packet and the
	 * length is larger than the MTU of the outgoing device, there will
	 * still be an MTU problem.
	 */

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);

	return rc;

tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}

#ifdef CONFIG_IP_VS_IPV6
int
ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
		  struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	struct rt6_info *rt;		/* Route to the other host */
	int local, rc;

	/* check if it is a connection of no-client-port */
	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT && !ipvsh->fragoffs)) {
		__be16 _pt, *p;

		p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt);
		if (p == NULL)
			goto tx_error;
		ip_vs_conn_fill_cport(cp, *p);
		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
	}

	local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
				      &cp->daddr.in6,
				      NULL, ipvsh, 0,
				      IP_VS_RT_MODE_LOCAL |
				      IP_VS_RT_MODE_NON_LOCAL |
				      IP_VS_RT_MODE_RDR);
	if (local < 0)
		goto tx_error;
	rt = dst_rt6_info(skb_dst(skb));
	/*
	 * Avoid duplicate tuple in reply direction for NAT traffic
	 * to local address when connection is sync-ed
	 */
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
		enum ip_conntrack_info ctinfo;
		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);

		if (ct) {
			IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, ipvsh->off,
					 "ip_vs_nat_xmit_v6(): "
					 "stopping DNAT to local address");
			goto tx_error;
		}
	}
#endif

	/* From world but DNAT to loopback address? */
	if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
	    ipv6_addr_type(&cp->daddr.in6) & IPV6_ADDR_LOOPBACK) {
		IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, ipvsh->off,
				 "ip_vs_nat_xmit_v6(): "
				 "stopping DNAT to loopback address");
		goto tx_error;
	}

	/* copy-on-write the packet before mangling it */
	if (skb_ensure_writable(skb, sizeof(struct ipv6hdr)))
		goto tx_error;

	if (skb_cow(skb, rt->dst.dev->hard_header_len))
		goto tx_error;

	/* mangle the packet */
	if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh))
		goto tx_error;
	ipv6_hdr(skb)->daddr = cp->daddr.in6;

	IP_VS_DBG_PKT(10, AF_INET6, pp, skb, ipvsh->off, "After DNAT");

	/* FIXME: when the application helper enlarges the packet and the
	 * length is larger than the MTU of the outgoing device, there will
	 * still be an MTU problem.
	 */

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);

	return rc;

tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}
#endif

/* When forwarding a packet, we must ensure that we've got enough headroom
 * for the encapsulation packet in the skb.  This also gives us an
 * opportunity to figure out what the payload_len, dsfield, ttl, and df
 * values should be, so that we won't need to look at the old ip header
 * again.
 */
static struct sk_buff *
ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af,
			   unsigned int max_headroom, __u8 *next_protocol,
			   __u32 *payload_len, __u8 *dsfield, __u8 *ttl,
			   __be16 *df)
{
	struct sk_buff *new_skb = NULL;
	struct iphdr *old_iph = NULL;
	__u8 old_dsfield;
#ifdef CONFIG_IP_VS_IPV6
	struct ipv6hdr *old_ipv6h = NULL;
#endif

	ip_vs_drop_early_demux_sk(skb);

	if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) {
		new_skb = skb_realloc_headroom(skb, max_headroom);
		if (!new_skb)
			goto error;
		if (skb->sk)
			skb_set_owner_w(new_skb, skb->sk);
		consume_skb(skb);
		skb = new_skb;
	}

#ifdef CONFIG_IP_VS_IPV6
	if (skb_af == AF_INET6) {
		old_ipv6h = ipv6_hdr(skb);
		*next_protocol = IPPROTO_IPV6;
		if (payload_len)
			*payload_len =
				ntohs(old_ipv6h->payload_len) +
				sizeof(*old_ipv6h);
		old_dsfield = ipv6_get_dsfield(old_ipv6h);
		*ttl = old_ipv6h->hop_limit;
		if (df)
			*df = 0;
	} else
#endif
	{
		old_iph = ip_hdr(skb);
		/* Copy DF, reset fragment offset and MF */
		if (df)
			*df = (old_iph->frag_off & htons(IP_DF));
		*next_protocol = IPPROTO_IPIP;

		/* fix old IP header checksum */
		ip_send_check(old_iph);
		old_dsfield = ipv4_get_dsfield(old_iph);
		*ttl = old_iph->ttl;
		if (payload_len)
			*payload_len = skb_ip_totlen(skb);
	}

	/* Implement full-functionality option for ECN encapsulation */
	*dsfield = INET_ECN_encapsulate(old_dsfield, old_dsfield);

	return skb;
error:
	kfree_skb(skb);
	return ERR_PTR(-ENOMEM);
}

static inline int __tun_gso_type_mask(int encaps_af, int orig_af)
{
	switch (encaps_af) {
	case AF_INET:
		return SKB_GSO_IPXIP4;
	case AF_INET6:
		return SKB_GSO_IPXIP6;
	default:
		return 0;
	}
}
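
/*
 * ipvs_gue_encap() below prepends the GUE and UDP headers, so the packet
 * that finally goes out (after the caller adds the outer IP header) looks
 * like:
 *
 *	outer IP | UDP | GUE [+ remcsum private option] | inner packet
 *
 * The remote checksum offload option is only added when the destination
 * has IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM set and the skb needs checksum
 * help (CHECKSUM_PARTIAL).
 */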

static int
ipvs_gue_encap(struct net *net, struct sk_buff *skb,
	       struct ip_vs_conn *cp, __u8 *next_protocol)
{
	__be16 dport;
	__be16 sport = udp_flow_src_port(net, skb, 0, 0, false);
	struct udphdr *udph;	/* Our new UDP header */
	struct guehdr *gueh;	/* Our new GUE header */
	size_t hdrlen, optlen = 0;
	void *data;
	bool need_priv = false;

	if ((cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
	    skb->ip_summed == CHECKSUM_PARTIAL) {
		optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
		need_priv = true;
	}

	hdrlen = sizeof(struct guehdr) + optlen;

	skb_push(skb, hdrlen);

	gueh = (struct guehdr *)skb->data;

	gueh->control = 0;
	gueh->version = 0;
	gueh->hlen = optlen >> 2;
	gueh->flags = 0;
	gueh->proto_ctype = *next_protocol;

	data = &gueh[1];

	if (need_priv) {
		__be32 *flags = data;
		u16 csum_start = skb_checksum_start_offset(skb);
		__be16 *pd;

		gueh->flags |= GUE_FLAG_PRIV;
		*flags = 0;
		data += GUE_LEN_PRIV;

		if (csum_start < hdrlen)
			return -EINVAL;

		csum_start -= hdrlen;
		pd = data;
		pd[0] = htons(csum_start);
		pd[1] = htons(csum_start + skb->csum_offset);

		if (!skb_is_gso(skb)) {
			skb->ip_summed = CHECKSUM_NONE;
			skb->encapsulation = 0;
		}

		*flags |= GUE_PFLAG_REMCSUM;
		data += GUE_PLEN_REMCSUM;
	}

	skb_push(skb, sizeof(struct udphdr));
	skb_reset_transport_header(skb);

	udph = udp_hdr(skb);

	dport = cp->dest->tun_port;
	udph->dest = dport;
	udph->source = sport;
	udph->len = htons(skb->len);
	udph->check = 0;

	*next_protocol = IPPROTO_UDP;

	return 0;
}

static void
ipvs_gre_encap(struct net *net, struct sk_buff *skb,
	       struct ip_vs_conn *cp, __u8 *next_protocol)
{
	__be16 proto = *next_protocol == IPPROTO_IPIP ?
				htons(ETH_P_IP) : htons(ETH_P_IPV6);
	IP_TUNNEL_DECLARE_FLAGS(tflags) = { };
	size_t hdrlen;

	if (cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
		__set_bit(IP_TUNNEL_CSUM_BIT, tflags);

	hdrlen = gre_calc_hlen(tflags);
	gre_build_header(skb, hdrlen, tflags, proto, 0, 0);

	*next_protocol = IPPROTO_GRE;
}

/*
 *   IP Tunneling transmitter
 *
 *   This function encapsulates the packet in a new IP packet, its
 *   destination will be set to cp->daddr. Most code of this function
 *   is taken from ipip.c.
 *
 *   It is used in VS/TUN cluster. The load balancer selects a real
 *   server from a cluster based on a scheduling algorithm,
 *   encapsulates the request packet and forwards it to the selected
 *   server. For example, all real servers are configured with
 *   "ifconfig tunl0 <Virtual IP Address> up". When the server receives
 *   the encapsulated packet, it will decapsulate the packet, process
 *   the request and return the response packets directly to the client
 *   without passing through the load balancer. This can greatly increase
 *   the scalability of the virtual server.
 *
 *   Used for ANY protocol
 */
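
/*
 * Illustrative real-server setup for the plain IPIP case, a sketch using
 * the iproute2 equivalents of the "ifconfig tunl0" example above (the
 * rp_filter knob is the usual companion setting on VS/TUN real servers;
 * adjust names to the actual deployment):
 *
 *	ip link set tunl0 up
 *	ip addr add <Virtual IP Address>/32 dev tunl0
 *	sysctl -w net.ipv4.conf.tunl0.rp_filter=0
 */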
int
ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
		  struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	struct netns_ipvs *ipvs = cp->ipvs;
	struct net *net = ipvs->net;
	struct rtable *rt;			/* Route to the other host */
	__be32 saddr;				/* Source for tunnel */
	struct net_device *tdev;		/* Device to other host */
	__u8 next_protocol = 0;
	__u8 dsfield = 0;
	__u8 ttl = 0;
	__be16 df = 0;
	__be16 *dfp = NULL;
	struct iphdr *iph;			/* Our new IP header */
	unsigned int max_headroom;		/* The extra header space needed */
	int ret, local;
	int tun_type, gso_type;
	int tun_flags;

	local = __ip_vs_get_out_rt(ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
				   IP_VS_RT_MODE_LOCAL |
				   IP_VS_RT_MODE_NON_LOCAL |
				   IP_VS_RT_MODE_CONNECT |
				   IP_VS_RT_MODE_TUNNEL, &saddr, ipvsh);
	if (local < 0)
		goto tx_error;
	if (local)
		return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);

	rt = skb_rtable(skb);
	tdev = rt->dst.dev;

	/*
	 * Okay, now see if we can stuff it in the buffer as-is.
	 */
	max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);

	tun_type = cp->dest->tun_type;
	tun_flags = cp->dest->tun_flags;

	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
		size_t gue_hdrlen, gue_optlen = 0;

		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
		    skb->ip_summed == CHECKSUM_PARTIAL) {
			gue_optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
		}
		gue_hdrlen = sizeof(struct guehdr) + gue_optlen;

		max_headroom += sizeof(struct udphdr) + gue_hdrlen;
	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
		IP_TUNNEL_DECLARE_FLAGS(tflags) = { };
		size_t gre_hdrlen;

		if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
			__set_bit(IP_TUNNEL_CSUM_BIT, tflags);
		gre_hdrlen = gre_calc_hlen(tflags);

		max_headroom += gre_hdrlen;
	}

	/* We only care about the df field if sysctl_pmtu_disc(ipvs) is set */
	dfp = sysctl_pmtu_disc(ipvs) ? &df : NULL;
	skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom,
					 &next_protocol, NULL, &dsfield,
					 &ttl, dfp);
	if (IS_ERR(skb))
		return NF_STOLEN;

	gso_type = __tun_gso_type_mask(AF_INET, cp->af);
	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
		    (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
			gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
		else
			gso_type |= SKB_GSO_UDP_TUNNEL;
		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
		    skb->ip_summed == CHECKSUM_PARTIAL) {
			gso_type |= SKB_GSO_TUNNEL_REMCSUM;
		}
	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
		if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
			gso_type |= SKB_GSO_GRE_CSUM;
		else
			gso_type |= SKB_GSO_GRE;
	}

	if (iptunnel_handle_offloads(skb, gso_type))
		goto tx_error;

	skb->transport_header = skb->network_header;

	skb_set_inner_ipproto(skb, next_protocol);
	skb_set_inner_mac_header(skb, skb_inner_network_offset(skb));

	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
		bool check = false;

		if (ipvs_gue_encap(net, skb, cp, &next_protocol))
			goto tx_error;

		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
		    (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
			check = true;

		udp_set_csum(!check, skb, saddr, cp->daddr.ip, skb->len);
	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE)
		ipvs_gre_encap(net, skb, cp, &next_protocol);

	skb_push(skb, sizeof(struct iphdr));
	skb_reset_network_header(skb);
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
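
	/* The outer header fields were derived in
	 * ip_vs_prepare_tunneled_skb(): dsfield carries the inner DSCP with
	 * ECN re-encapsulated, ttl is copied from the inner header, and df
	 * is only propagated when the pmtu_disc sysctl is set (dfp above).
	 */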

	/*
	 *	Push down and install the IPIP header.
	 */
	iph			=	ip_hdr(skb);
	iph->version		=	4;
	iph->ihl		=	sizeof(struct iphdr) >> 2;
	iph->frag_off		=	df;
	iph->protocol		=	next_protocol;
	iph->tos		=	dsfield;
	iph->daddr		=	cp->daddr.ip;
	iph->saddr		=	saddr;
	iph->ttl		=	ttl;
	ip_select_ident(net, skb, NULL);

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	ret = ip_vs_tunnel_xmit_prepare(skb, cp);
	if (ret == NF_ACCEPT)
		ip_local_out(net, skb->sk, skb);
	else if (ret == NF_DROP)
		kfree_skb(skb);

	return NF_STOLEN;

tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}

#ifdef CONFIG_IP_VS_IPV6
int
ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
		     struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	struct netns_ipvs *ipvs = cp->ipvs;
	struct net *net = ipvs->net;
	struct rt6_info *rt;			/* Route to the other host */
	struct in6_addr saddr;			/* Source for tunnel */
	struct net_device *tdev;		/* Device to other host */
	__u8 next_protocol = 0;
	__u32 payload_len = 0;
	__u8 dsfield = 0;
	__u8 ttl = 0;
	struct ipv6hdr *iph;			/* Our new IP header */
	unsigned int max_headroom;		/* The extra header space needed */
	int ret, local;
	int tun_type, gso_type;
	int tun_flags;

	local = __ip_vs_get_out_rt_v6(ipvs, cp->af, skb, cp->dest,
				      &cp->daddr.in6,
				      &saddr, ipvsh, 1,
				      IP_VS_RT_MODE_LOCAL |
				      IP_VS_RT_MODE_NON_LOCAL |
				      IP_VS_RT_MODE_TUNNEL);
	if (local < 0)
		goto tx_error;
	if (local)
		return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);

	rt = dst_rt6_info(skb_dst(skb));
	tdev = rt->dst.dev;

	/*
	 * Okay, now see if we can stuff it in the buffer as-is.
	 */
	max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr);

	tun_type = cp->dest->tun_type;
	tun_flags = cp->dest->tun_flags;

	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
		size_t gue_hdrlen, gue_optlen = 0;

		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
		    skb->ip_summed == CHECKSUM_PARTIAL) {
			gue_optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
		}
		gue_hdrlen = sizeof(struct guehdr) + gue_optlen;

		max_headroom += sizeof(struct udphdr) + gue_hdrlen;
	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
		IP_TUNNEL_DECLARE_FLAGS(tflags) = { };
		size_t gre_hdrlen;

		if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
			__set_bit(IP_TUNNEL_CSUM_BIT, tflags);
		gre_hdrlen = gre_calc_hlen(tflags);

		max_headroom += gre_hdrlen;
	}

	skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom,
					 &next_protocol, &payload_len,
					 &dsfield, &ttl, NULL);
	if (IS_ERR(skb))
		return NF_STOLEN;

	gso_type = __tun_gso_type_mask(AF_INET6, cp->af);
	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
		    (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
			gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
		else
			gso_type |= SKB_GSO_UDP_TUNNEL;
		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
		    skb->ip_summed == CHECKSUM_PARTIAL) {
			gso_type |= SKB_GSO_TUNNEL_REMCSUM;
		}
	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
		if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
			gso_type |= SKB_GSO_GRE_CSUM;
		else
			gso_type |= SKB_GSO_GRE;
	}

	if (iptunnel_handle_offloads(skb, gso_type))
		goto tx_error;

	skb->transport_header = skb->network_header;

	skb_set_inner_ipproto(skb, next_protocol);
	skb_set_inner_mac_header(skb, skb_inner_network_offset(skb));

	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
		bool check = false;

		if (ipvs_gue_encap(net, skb, cp, &next_protocol))
			goto tx_error;

		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
		    (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
			check = true;

		udp6_set_csum(!check, skb, &saddr, &cp->daddr.in6, skb->len);
	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE)
		ipvs_gre_encap(net, skb, cp, &next_protocol);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	/*
	 *	Push down and install the IPIP header.
	 */
	iph			=	ipv6_hdr(skb);
	iph->version		=	6;
	iph->nexthdr		=	next_protocol;
	iph->payload_len	=	htons(payload_len);
	memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl));
	ipv6_change_dsfield(iph, 0, dsfield);
	iph->daddr		=	cp->daddr.in6;
	iph->saddr		=	saddr;
	iph->hop_limit		=	ttl;

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	ret = ip_vs_tunnel_xmit_prepare(skb, cp);
	if (ret == NF_ACCEPT)
		ip6_local_out(net, skb->sk, skb);
	else if (ret == NF_DROP)
		kfree_skb(skb);

	return NF_STOLEN;

tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}
#endif


/*
 *      Direct Routing transmitter
 *      Used for ANY protocol
 */
int
ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
	      struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	int local;

	local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
				   IP_VS_RT_MODE_LOCAL |
				   IP_VS_RT_MODE_NON_LOCAL |
				   IP_VS_RT_MODE_KNOWN_NH, NULL, ipvsh);
	if (local < 0)
		goto tx_error;
	if (local)
		return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);

	ip_send_check(ip_hdr(skb));

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);

	return NF_STOLEN;

tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}

#ifdef CONFIG_IP_VS_IPV6
int
ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
		 struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	int local;

	local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
				      &cp->daddr.in6,
				      NULL, ipvsh, 0,
				      IP_VS_RT_MODE_LOCAL |
				      IP_VS_RT_MODE_NON_LOCAL |
				      IP_VS_RT_MODE_KNOWN_NH);
	if (local < 0)
		goto tx_error;
	if (local)
		return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);

	return NF_STOLEN;

tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}
#endif
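
/*
 * In the DR transmitters above the packet itself is left untouched;
 * IP_VS_RT_MODE_KNOWN_NH makes the route use the real server's address
 * as the next hop ("Route via remote addr" in the enum above), so
 * delivery relies on the real server being reachable at L2 and having
 * the VIP configured locally.
 */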


/*
 *	ICMP packet transmitter
 *	called by the ip_vs_in_icmp
 */
int
ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
		struct ip_vs_protocol *pp, int offset, unsigned int hooknum,
		struct ip_vs_iphdr *iph)
{
	struct rtable *rt;	/* Route to the other host */
	int rc;
	int local;
	int rt_mode, was_input;

	/* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
	 * forwarded directly here, because there is no need to
	 * translate address/port back
	 */
	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
		if (cp->packet_xmit)
			rc = cp->packet_xmit(skb, cp, pp, iph);
		else
			rc = NF_ACCEPT;
		/* do not touch skb anymore */
		atomic_inc(&cp->in_pkts);
		return rc;
	}

	/*
	 * mangle and send the packet here (only for VS/NAT)
	 */
	was_input = rt_is_input_route(skb_rtable(skb));

	/* LOCALNODE from FORWARD hook is not supported */
	rt_mode = (hooknum != NF_INET_FORWARD) ?
		  IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
		  IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
	local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip, rt_mode,
				   NULL, iph);
	if (local < 0)
		goto tx_error;
	rt = skb_rtable(skb);

	/*
	 * Avoid duplicate tuple in reply direction for NAT traffic
	 * to local address when connection is sync-ed
	 */
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
		enum ip_conntrack_info ctinfo;
		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);

		if (ct) {
			IP_VS_DBG(10, "%s(): "
				  "stopping DNAT to local address %pI4\n",
				  __func__, &cp->daddr.ip);
			goto tx_error;
		}
	}
#endif

	/* From world but DNAT to loopback address? */
	if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) {
		IP_VS_DBG(1, "%s(): "
			  "stopping DNAT to loopback %pI4\n",
			  __func__, &cp->daddr.ip);
		goto tx_error;
	}

	/* copy-on-write the packet before mangling it */
	if (skb_ensure_writable(skb, offset))
		goto tx_error;

	if (skb_cow(skb, rt->dst.dev->hard_header_len))
		goto tx_error;

	ip_vs_nat_icmp(skb, pp, cp, 0);

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	return ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);

tx_error:
	kfree_skb(skb);
	rc = NF_STOLEN;
	return rc;
}

#ifdef CONFIG_IP_VS_IPV6
int
ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
		   struct ip_vs_protocol *pp, int offset, unsigned int hooknum,
		   struct ip_vs_iphdr *ipvsh)
{
	struct rt6_info *rt;	/* Route to the other host */
	int rc;
	int local;
	int rt_mode;

	/* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
	 * forwarded directly here, because there is no need to
	 * translate address/port back
	 */
	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
		if (cp->packet_xmit)
			rc = cp->packet_xmit(skb, cp, pp, ipvsh);
		else
			rc = NF_ACCEPT;
		/* do not touch skb anymore */
		atomic_inc(&cp->in_pkts);
		return rc;
	}

	/*
	 * mangle and send the packet here (only for VS/NAT)
	 */

	/* LOCALNODE from FORWARD hook is not supported */
	rt_mode = (hooknum != NF_INET_FORWARD) ?
		  IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
		  IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
	local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
				      &cp->daddr.in6, NULL, ipvsh, 0, rt_mode);
	if (local < 0)
		goto tx_error;
	rt = dst_rt6_info(skb_dst(skb));
	/*
	 * Avoid duplicate tuple in reply direction for NAT traffic
	 * to local address when connection is sync-ed
	 */
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
		enum ip_conntrack_info ctinfo;
		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);

		if (ct) {
			IP_VS_DBG(10, "%s(): "
				  "stopping DNAT to local address %pI6\n",
				  __func__, &cp->daddr.in6);
			goto tx_error;
		}
	}
#endif

	/* From world but DNAT to loopback address? */
	if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
	    ipv6_addr_type(&cp->daddr.in6) & IPV6_ADDR_LOOPBACK) {
		IP_VS_DBG(1, "%s(): "
			  "stopping DNAT to loopback %pI6\n",
			  __func__, &cp->daddr.in6);
		goto tx_error;
	}

	/* copy-on-write the packet before mangling it */
	if (skb_ensure_writable(skb, offset))
		goto tx_error;

	if (skb_cow(skb, rt->dst.dev->hard_header_len))
		goto tx_error;

	ip_vs_nat_icmp_v6(skb, pp, cp, 0);

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	return ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);

tx_error:
	kfree_skb(skb);
	rc = NF_STOLEN;
	return rc;
}
#endif