// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * ip_vs_xmit.c: various packet transmitters for IPVS
 *
 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
 *              Julian Anastasov <ja@ssi.bg>
 *
 * Changes:
 *
 * Description of forwarding methods:
 * - all transmitters are called from LOCAL_IN (remote clients) and
 *   LOCAL_OUT (local clients) but for ICMP they can also be called
 *   from FORWARD
 * - not all connections have a destination server, for example,
 *   connections in backup server when fwmark is used
 * - bypass connections use daddr from packet
 * - we can use dst without ref while sending in RCU section, we use
 *   ref when returning NF_ACCEPT for NAT-ed packet via loopback
 * LOCAL_OUT rules:
 * - skb->dev is NULL, skb->protocol is not set (both are set in POST_ROUTING)
 * - skb->pkt_type is not set yet
 * - the only place where we can see skb->sk != NULL
 */

#define pr_fmt(fmt) "IPVS: " fmt

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/tcp.h>                  /* for tcphdr */
#include <net/ip.h>
#include <net/gue.h>
#include <net/gre.h>
#include <net/tcp.h>                    /* for csum_tcpudp_magic */
#include <net/udp.h>
#include <net/icmp.h>                   /* for icmp_send */
#include <net/route.h>                  /* for ip_route_output */
#include <net/ipv6.h>
#include <net/ip6_route.h>
#include <net/ip_tunnels.h>
#include <net/ip6_checksum.h>
#include <net/addrconf.h>
#include <linux/icmpv6.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>

#include <net/ip_vs.h>

enum {
	IP_VS_RT_MODE_LOCAL	= 1, /* Allow local dest */
	IP_VS_RT_MODE_NON_LOCAL	= 2, /* Allow non-local dest */
	IP_VS_RT_MODE_RDR	= 4, /* Allow redirect from remote daddr to
				      * local
				      */
	IP_VS_RT_MODE_CONNECT	= 8, /* Always bind route to saddr */
	IP_VS_RT_MODE_KNOWN_NH	= 16,/* Route via remote addr */
	IP_VS_RT_MODE_TUNNEL	= 32,/* Tunnel mode */
};

static inline struct ip_vs_dest_dst *ip_vs_dest_dst_alloc(void)
{
	return kmalloc(sizeof(struct ip_vs_dest_dst), GFP_ATOMIC);
}

static inline void ip_vs_dest_dst_free(struct ip_vs_dest_dst *dest_dst)
{
	kfree(dest_dst);
}

/* RCU callback used by __ip_vs_dst_set(): release the cached route
 * once all readers are done with it.
 */
static void ip_vs_dest_dst_rcu_free(struct rcu_head *head)
{
	struct ip_vs_dest_dst *dest_dst = container_of(head,
						       struct ip_vs_dest_dst,
						       rcu_head);

	dst_release(dest_dst->dst_cache);
	kfree(dest_dst);
}

/*
 *      Destination cache to speed up outgoing route lookup
 */
static inline void
__ip_vs_dst_set(struct ip_vs_dest *dest, struct ip_vs_dest_dst *dest_dst,
		struct dst_entry *dst, u32 dst_cookie)
{
	struct ip_vs_dest_dst *old;

	old = rcu_dereference_protected(dest->dest_dst,
					lockdep_is_held(&dest->dst_lock));

	if (dest_dst) {
		dest_dst->dst_cache = dst;
		dest_dst->dst_cookie = dst_cookie;
	}
	rcu_assign_pointer(dest->dest_dst, dest_dst);

	if (old)
		call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free);
}

static inline struct ip_vs_dest_dst *
__ip_vs_dst_check(struct ip_vs_dest *dest)
{
	struct ip_vs_dest_dst *dest_dst = rcu_dereference(dest->dest_dst);
	struct dst_entry *dst;

	if (!dest_dst)
		return NULL;
	dst = dest_dst->dst_cache;
	if (READ_ONCE(dst->obsolete) &&
	    dst->ops->check(dst, dest_dst->dst_cookie) == NULL)
		return NULL;
	return dest_dst;
}
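/* Check whether an IPv6 packet would exceed the route MTU; for packets
 * that conntrack has defragmented, judge by the largest original
 * fragment instead of the reassembled length.
 */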
static inline bool
__mtu_check_toobig_v6(const struct sk_buff *skb, u32 mtu)
{
	if (IP6CB(skb)->frag_max_size) {
		/* frag_max_size tells us that this packet has been
		 * defragmented by the netfilter IPv6 conntrack module.
		 */
		if (IP6CB(skb)->frag_max_size > mtu)
			return true;	/* largest fragment violates MTU */
	} else if (skb->len > mtu && !skb_is_gso(skb)) {
		return true;		/* packet size violates MTU */
	}
	return false;
}

/* Get route to daddr, optionally bind route to saddr */
static struct rtable *do_output_route4(struct net *net, __be32 daddr,
				       int rt_mode, __be32 *ret_saddr)
{
	struct flowi4 fl4;
	struct rtable *rt;

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = daddr;
	fl4.flowi4_flags = (rt_mode & IP_VS_RT_MODE_KNOWN_NH) ?
			   FLOWI_FLAG_KNOWN_NH : 0;

retry:
	rt = ip_route_output_key(net, &fl4);
	if (IS_ERR(rt)) {
		IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &daddr);
		return NULL;
	}
	if (rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) {
		ip_rt_put(rt);
		flowi4_update_output(&fl4, 0, daddr, fl4.saddr);
		rt_mode = 0;
		goto retry;
	}
	if (ret_saddr)
		*ret_saddr = fl4.saddr;
	return rt;
}

#ifdef CONFIG_IP_VS_IPV6
static inline int __ip_vs_is_local_route6(struct rt6_info *rt)
{
	return rt->dst.dev && rt->dst.dev->flags & IFF_LOOPBACK;
}
#endif

static inline bool crosses_local_route_boundary(int skb_af, struct sk_buff *skb,
						int rt_mode,
						bool new_rt_is_local)
{
	bool rt_mode_allow_local = !!(rt_mode & IP_VS_RT_MODE_LOCAL);
	bool rt_mode_allow_non_local = !!(rt_mode & IP_VS_RT_MODE_NON_LOCAL);
	bool rt_mode_allow_redirect = !!(rt_mode & IP_VS_RT_MODE_RDR);
	bool source_is_loopback;
	bool old_rt_is_local;

#ifdef CONFIG_IP_VS_IPV6
	if (skb_af == AF_INET6) {
		int addr_type = ipv6_addr_type(&ipv6_hdr(skb)->saddr);

		source_is_loopback =
			(!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
			(addr_type & IPV6_ADDR_LOOPBACK);
		old_rt_is_local = __ip_vs_is_local_route6(
			dst_rt6_info(skb_dst(skb)));
	} else
#endif
	{
		source_is_loopback = ipv4_is_loopback(ip_hdr(skb)->saddr);
		old_rt_is_local = skb_rtable(skb)->rt_flags & RTCF_LOCAL;
	}

	if (unlikely(new_rt_is_local)) {
		if (!rt_mode_allow_local)
			return true;
		if (!rt_mode_allow_redirect && !old_rt_is_local)
			return true;
	} else {
		if (!rt_mode_allow_non_local)
			return true;
		if (source_is_loopback)
			return true;
	}
	return false;
}

static inline void maybe_update_pmtu(int skb_af, struct sk_buff *skb, int mtu)
{
	struct sock *sk = skb->sk;
	struct rtable *ort = skb_rtable(skb);

	if (!skb->dev && sk && sk_fullsock(sk))
		ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu, true);
}
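/* Return true when the packet fits within @mtu (or will be fragmented
 * anyway because we tunnel with PMTU discovery disabled); otherwise
 * send the matching "packet too big" / "fragmentation needed" ICMP
 * error and return false.
 */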
static inline bool ensure_mtu_is_adequate(struct netns_ipvs *ipvs, int skb_af,
					  int rt_mode,
					  struct ip_vs_iphdr *ipvsh,
					  struct sk_buff *skb, int mtu)
{
#ifdef CONFIG_IP_VS_IPV6
	if (skb_af == AF_INET6) {
		struct net *net = ipvs->net;

		if (unlikely(__mtu_check_toobig_v6(skb, mtu))) {
			if (!skb->dev)
				skb->dev = net->loopback_dev;
			/* only send ICMP too big on first fragment */
			if (!ipvsh->fragoffs && !ip_vs_iph_icmp(ipvsh))
				icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			IP_VS_DBG(1, "frag needed for %pI6c\n",
				  &ipv6_hdr(skb)->saddr);
			return false;
		}
	} else
#endif
	{
		/* If we're going to tunnel the packet and pmtu discovery
		 * is disabled, we'll just fragment it anyway
		 */
		if ((rt_mode & IP_VS_RT_MODE_TUNNEL) && !sysctl_pmtu_disc(ipvs))
			return true;

		if (unlikely(ip_hdr(skb)->frag_off & htons(IP_DF) &&
			     skb->len > mtu && !skb_is_gso(skb) &&
			     !ip_vs_iph_icmp(ipvsh))) {
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
				  htonl(mtu));
			IP_VS_DBG(1, "frag needed for %pI4\n",
				  &ip_hdr(skb)->saddr);
			return false;
		}
	}

	return true;
}

static inline bool decrement_ttl(struct netns_ipvs *ipvs,
				 int skb_af,
				 struct sk_buff *skb)
{
	struct net *net = ipvs->net;

#ifdef CONFIG_IP_VS_IPV6
	if (skb_af == AF_INET6) {
		struct dst_entry *dst = skb_dst(skb);

		/* check and decrement ttl */
		if (ipv6_hdr(skb)->hop_limit <= 1) {
			struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);

			/* Force OUTPUT device used as source address */
			skb->dev = dst->dev;
			icmpv6_send(skb, ICMPV6_TIME_EXCEED,
				    ICMPV6_EXC_HOPLIMIT, 0);
			IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

			return false;
		}

		/* don't propagate ttl change to cloned packets */
		if (skb_ensure_writable(skb, sizeof(struct ipv6hdr)))
			return false;

		ipv6_hdr(skb)->hop_limit--;
	} else
#endif
	{
		if (ip_hdr(skb)->ttl <= 1) {
			/* Tell the sender its packet died... */
			IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
			icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
			return false;
		}

		/* don't propagate ttl change to cloned packets */
		if (skb_ensure_writable(skb, sizeof(struct iphdr)))
			return false;

		/* Decrease ttl */
		ip_decrease_ttl(ip_hdr(skb));
	}

	return true;
}

/* Get route to destination or remote server */
static int
__ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
		   struct ip_vs_dest *dest,
		   __be32 daddr, int rt_mode, __be32 *ret_saddr,
		   struct ip_vs_iphdr *ipvsh)
{
	struct net *net = ipvs->net;
	struct ip_vs_dest_dst *dest_dst;
	struct rtable *rt;			/* Route to the other host */
	int mtu;
	int local, noref = 1;

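	/* When a real server (dest) is known, use its RCU-protected dst
	 * cache; otherwise (e.g. bypass, or fwmark connections on a
	 * backup) do a fresh route lookup on the packet's own daddr.
	 */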
	if (dest) {
		dest_dst = __ip_vs_dst_check(dest);
		if (likely(dest_dst))
			rt = dst_rtable(dest_dst->dst_cache);
		else {
			dest_dst = ip_vs_dest_dst_alloc();
			spin_lock_bh(&dest->dst_lock);
			if (!dest_dst) {
				__ip_vs_dst_set(dest, NULL, NULL, 0);
				spin_unlock_bh(&dest->dst_lock);
				goto err_unreach;
			}
			rt = do_output_route4(net, dest->addr.ip, rt_mode,
					      &dest_dst->dst_saddr.ip);
			if (!rt) {
				__ip_vs_dst_set(dest, NULL, NULL, 0);
				spin_unlock_bh(&dest->dst_lock);
				ip_vs_dest_dst_free(dest_dst);
				goto err_unreach;
			}
			__ip_vs_dst_set(dest, dest_dst, &rt->dst, 0);
			spin_unlock_bh(&dest->dst_lock);
			IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n",
				  &dest->addr.ip, &dest_dst->dst_saddr.ip,
				  rcuref_read(&rt->dst.__rcuref));
		}
		if (ret_saddr)
			*ret_saddr = dest_dst->dst_saddr.ip;
	} else {
		noref = 0;

		/* For such unconfigured boxes avoid many route lookups
		 * for performance reasons because we do not remember saddr
		 */
		rt_mode &= ~IP_VS_RT_MODE_CONNECT;
		rt = do_output_route4(net, daddr, rt_mode, ret_saddr);
		if (!rt)
			goto err_unreach;
	}

	local = (rt->rt_flags & RTCF_LOCAL) ? 1 : 0;
	if (unlikely(crosses_local_route_boundary(skb_af, skb, rt_mode,
						  local))) {
		IP_VS_DBG_RL("We are crossing local and non-local addresses"
			     " daddr=%pI4\n", &daddr);
		goto err_put;
	}

	if (unlikely(local)) {
		/* skb to local stack, preserve old route */
		if (!noref)
			ip_rt_put(rt);
		return local;
	}

	if (!decrement_ttl(ipvs, skb_af, skb))
		goto err_put;

	if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) {
		mtu = dst_mtu(&rt->dst);
	} else {
		mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
		if (!dest)
			goto err_put;
		if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
			mtu -= sizeof(struct udphdr) + sizeof(struct guehdr);
			if ((dest->tun_flags &
			     IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
			    skb->ip_summed == CHECKSUM_PARTIAL)
				mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
		} else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
			IP_TUNNEL_DECLARE_FLAGS(tflags) = { };

			if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
				__set_bit(IP_TUNNEL_CSUM_BIT, tflags);
			mtu -= gre_calc_hlen(tflags);
		}
		if (mtu < 68) {
			IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
			goto err_put;
		}
		maybe_update_pmtu(skb_af, skb, mtu);
	}

	if (!ensure_mtu_is_adequate(ipvs, skb_af, rt_mode, ipvsh, skb, mtu))
		goto err_put;

	skb_dst_drop(skb);
	if (noref)
		skb_dst_set_noref(skb, &rt->dst);
	else
		skb_dst_set(skb, &rt->dst);

	return local;

err_put:
	if (!noref)
		ip_rt_put(rt);
	return -1;

err_unreach:
	dst_link_failure(skb);
	return -1;
}

#ifdef CONFIG_IP_VS_IPV6
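/* Look up an IPv6 route to daddr, optionally selecting a source
 * address on the output device and resolving xfrm policy on top of
 * the plain route.
 */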
static struct dst_entry *
__ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr,
			struct in6_addr *ret_saddr, int do_xfrm, int rt_mode)
{
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.daddr = *daddr,
	};

	if (rt_mode & IP_VS_RT_MODE_KNOWN_NH)
		fl6.flowi6_flags = FLOWI_FLAG_KNOWN_NH;

	dst = ip6_route_output(net, NULL, &fl6);
	if (dst->error)
		goto out_err;
	if (!ret_saddr)
		return dst;
	if (ipv6_addr_any(&fl6.saddr) &&
	    ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev,
			       &fl6.daddr, 0, &fl6.saddr) < 0)
		goto out_err;
	if (do_xfrm) {
		dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
		if (IS_ERR(dst)) {
			dst = NULL;
			goto out_err;
		}
	}
	*ret_saddr = fl6.saddr;
	return dst;

out_err:
	dst_release(dst);
	IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", daddr);
	return NULL;
}

/*
 * Get route to destination or remote server
 */
static int
__ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
		      struct ip_vs_dest *dest,
		      struct in6_addr *daddr, struct in6_addr *ret_saddr,
		      struct ip_vs_iphdr *ipvsh, int do_xfrm, int rt_mode)
{
	struct net *net = ipvs->net;
	struct ip_vs_dest_dst *dest_dst;
	struct rt6_info *rt;			/* Route to the other host */
	struct dst_entry *dst;
	int mtu;
	int local, noref = 1;

	if (dest) {
		dest_dst = __ip_vs_dst_check(dest);
		if (likely(dest_dst))
			rt = dst_rt6_info(dest_dst->dst_cache);
		else {
			u32 cookie;

			dest_dst = ip_vs_dest_dst_alloc();
			spin_lock_bh(&dest->dst_lock);
			if (!dest_dst) {
				__ip_vs_dst_set(dest, NULL, NULL, 0);
				spin_unlock_bh(&dest->dst_lock);
				goto err_unreach;
			}
			dst = __ip_vs_route_output_v6(net, &dest->addr.in6,
						      &dest_dst->dst_saddr.in6,
						      do_xfrm, rt_mode);
			if (!dst) {
				__ip_vs_dst_set(dest, NULL, NULL, 0);
				spin_unlock_bh(&dest->dst_lock);
				ip_vs_dest_dst_free(dest_dst);
				goto err_unreach;
			}
			rt = dst_rt6_info(dst);
			cookie = rt6_get_cookie(rt);
			__ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie);
			spin_unlock_bh(&dest->dst_lock);
			IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
				  &dest->addr.in6, &dest_dst->dst_saddr.in6,
				  rcuref_read(&rt->dst.__rcuref));
		}
		if (ret_saddr)
			*ret_saddr = dest_dst->dst_saddr.in6;
	} else {
		noref = 0;
		dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm,
					      rt_mode);
		if (!dst)
			goto err_unreach;
		rt = dst_rt6_info(dst);
	}

	local = __ip_vs_is_local_route6(rt);

	if (unlikely(crosses_local_route_boundary(skb_af, skb, rt_mode,
						  local))) {
		IP_VS_DBG_RL("We are crossing local and non-local addresses"
			     " daddr=%pI6\n", daddr);
		goto err_put;
	}

	if (unlikely(local)) {
		/* skb to local stack, preserve old route */
		if (!noref)
			dst_release(&rt->dst);
		return local;
	}

	if (!decrement_ttl(ipvs, skb_af, skb))
		goto err_put;

	/* MTU checking */
	if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL)))
		mtu = dst_mtu(&rt->dst);
	else {
		mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
		if (!dest)
			goto err_put;
		if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
			mtu -= sizeof(struct udphdr) + sizeof(struct guehdr);
			if ((dest->tun_flags &
			     IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
			    skb->ip_summed == CHECKSUM_PARTIAL)
				mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
		} else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
			IP_TUNNEL_DECLARE_FLAGS(tflags) = { };

			if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
				__set_bit(IP_TUNNEL_CSUM_BIT, tflags);
			mtu -= gre_calc_hlen(tflags);
		}
		if (mtu < IPV6_MIN_MTU) {
			IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
				     IPV6_MIN_MTU);
			goto err_put;
		}
		maybe_update_pmtu(skb_af, skb, mtu);
	}

	if (!ensure_mtu_is_adequate(ipvs, skb_af, rt_mode, ipvsh, skb, mtu))
		goto err_put;

	skb_dst_drop(skb);
	if (noref)
		skb_dst_set_noref(skb, &rt->dst);
	else
		skb_dst_set(skb, &rt->dst);

	return local;

err_put:
	if (!noref)
		dst_release(&rt->dst);
	return -1;

err_unreach:
	/* The ip6_link_failure function requires the dev field to be set
	 * in order to get the net (further for the sake of fwmark
	 * reflection).
	 */
	if (!skb->dev)
		skb->dev = skb_dst(skb)->dev;

	dst_link_failure(skb);
	return -1;
}
#endif

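/* Common pre-transmit work for the tunnel transmitters: mark the skb
 * as IPVS property and, for NFCT-flagged connections, confirm the
 * conntrack entry before the packet leaves.
 */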
/* return NF_ACCEPT to allow forwarding or other NF_xxx on error */
static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb,
					    struct ip_vs_conn *cp)
{
	int ret = NF_ACCEPT;

	skb->ipvs_property = 1;
	if (unlikely(cp->flags & IP_VS_CONN_F_NFCT))
		ret = ip_vs_confirm_conntrack(skb);
	if (ret == NF_ACCEPT) {
		nf_reset_ct(skb);
		skb_forward_csum(skb);
		if (skb->dev)
			skb_clear_tstamp(skb);
	}
	return ret;
}

/* In the event of a remote destination, it's possible that we may have
 * matched against an old socket (particularly a TIME-WAIT socket). This
 * causes havoc down the line (ip_local_out et al. expect regular sockets
 * and invalid memory accesses will happen), so simply drop the association
 * in this case.
 */
static inline void ip_vs_drop_early_demux_sk(struct sk_buff *skb)
{
	/* If dev is set, the packet came from the LOCAL_IN callback and
	 * not from a local TCP socket.
	 */
	if (skb->dev)
		skb_orphan(skb);
}

/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */
static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb,
					 struct ip_vs_conn *cp, int local)
{
	int ret = NF_STOLEN;

	skb->ipvs_property = 1;
	if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
		ip_vs_notrack(skb);
	else
		ip_vs_update_conntrack(skb, cp, 1);

	/* Remove the early_demux association unless it's bound for the
	 * exact same port and address on this host after translation.
	 */
	if (!local || cp->vport != cp->dport ||
	    !ip_vs_addr_equal(cp->af, &cp->vaddr, &cp->daddr))
		ip_vs_drop_early_demux_sk(skb);

	if (!local) {
		skb_forward_csum(skb);
		if (skb->dev)
			skb_clear_tstamp(skb);
		NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb,
			NULL, skb_dst(skb)->dev, dst_output);
	} else
		ret = NF_ACCEPT;

	return ret;
}

/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */
static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb,
				     struct ip_vs_conn *cp, int local)
{
	int ret = NF_STOLEN;

	skb->ipvs_property = 1;
	if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
		ip_vs_notrack(skb);
	if (!local) {
		ip_vs_drop_early_demux_sk(skb);
		skb_forward_csum(skb);
		if (skb->dev)
			skb_clear_tstamp(skb);
		NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb,
			NULL, skb_dst(skb)->dev, dst_output);
	} else
		ret = NF_ACCEPT;
	return ret;
}


/*
 *      NULL transmitter (do nothing except return NF_ACCEPT)
 */
int
ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
		struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	/* we do not touch skb and do not need pskb ptr */
	return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
}

/*
 *      Bypass transmitter
 *      Let packets bypass the destination when the destination is not
 *      available; it should only be used in a transparent cache cluster.
 */
int
ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
		  struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	struct iphdr  *iph = ip_hdr(skb);

	if (__ip_vs_get_out_rt(cp->ipvs, cp->af, skb, NULL, iph->daddr,
			       IP_VS_RT_MODE_NON_LOCAL, NULL, ipvsh) < 0)
		goto tx_error;

	ip_send_check(iph);

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);

	return NF_STOLEN;

tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}

#ifdef CONFIG_IP_VS_IPV6
int
ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
		     struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	struct ipv6hdr *iph = ipv6_hdr(skb);

	if (__ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, NULL,
				  &iph->daddr, NULL,
				  ipvsh, 0, IP_VS_RT_MODE_NON_LOCAL) < 0)
		goto tx_error;

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);

	return NF_STOLEN;

tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}
#endif

/*
 *      NAT transmitter (only for outside-to-inside nat forwarding)
 *      Not used for related ICMP
 */
int
ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
	       struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	struct rtable *rt;		/* Route to the other host */
	int local, rc, was_input;

	/* check if it is a connection of no-client-port */
	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
		__be16 _pt, *p;

		p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt);
		if (p == NULL)
			goto tx_error;
		ip_vs_conn_fill_cport(cp, *p);
		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
	}

	was_input = rt_is_input_route(skb_rtable(skb));
	local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
				   IP_VS_RT_MODE_LOCAL |
				   IP_VS_RT_MODE_NON_LOCAL |
				   IP_VS_RT_MODE_RDR, NULL, ipvsh);
	if (local < 0)
		goto tx_error;
	rt = skb_rtable(skb);
	/*
	 * Avoid duplicate tuple in reply direction for NAT traffic
	 * to local address when connection is sync-ed
	 */
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
		enum ip_conntrack_info ctinfo;
		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);

		if (ct) {
			IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, ipvsh->off,
					 "ip_vs_nat_xmit(): "
					 "stopping DNAT to local address");
			goto tx_error;
		}
	}
#endif

	/* From world but DNAT to loopback address? */
	if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) {
		IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, ipvsh->off,
				 "ip_vs_nat_xmit(): stopping DNAT to loopback "
				 "address");
		goto tx_error;
	}

	/* copy-on-write the packet before mangling it */
	if (skb_ensure_writable(skb, sizeof(struct iphdr)))
		goto tx_error;

	if (skb_cow(skb, rt->dst.dev->hard_header_len))
		goto tx_error;

	/* mangle the packet */
	if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh))
		goto tx_error;
	ip_hdr(skb)->daddr = cp->daddr.ip;
	ip_send_check(ip_hdr(skb));

	IP_VS_DBG_PKT(10, AF_INET, pp, skb, ipvsh->off, "After DNAT");

	/* FIXME: when the application helper enlarges the packet and the
	 * length becomes larger than the MTU of the outgoing device, there
	 * will still be an MTU problem.
	 */

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);

	return rc;

tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}

#ifdef CONFIG_IP_VS_IPV6
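/* IPv6 counterpart of ip_vs_nat_xmit(); note that the client port can
 * only be learned from the first fragment (fragoffs == 0).
 */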
int
ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
		  struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	struct rt6_info *rt;		/* Route to the other host */
	int local, rc;

	/* check if it is a connection of no-client-port */
	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT && !ipvsh->fragoffs)) {
		__be16 _pt, *p;

		p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt);
		if (p == NULL)
			goto tx_error;
		ip_vs_conn_fill_cport(cp, *p);
		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
	}

	local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
				      &cp->daddr.in6,
				      NULL, ipvsh, 0,
				      IP_VS_RT_MODE_LOCAL |
				      IP_VS_RT_MODE_NON_LOCAL |
				      IP_VS_RT_MODE_RDR);
	if (local < 0)
		goto tx_error;
	rt = dst_rt6_info(skb_dst(skb));
	/*
	 * Avoid duplicate tuple in reply direction for NAT traffic
	 * to local address when connection is sync-ed
	 */
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
		enum ip_conntrack_info ctinfo;
		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);

		if (ct) {
			IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, ipvsh->off,
					 "ip_vs_nat_xmit_v6(): "
					 "stopping DNAT to local address");
			goto tx_error;
		}
	}
#endif

	/* From world but DNAT to loopback address? */
	if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
	    ipv6_addr_type(&cp->daddr.in6) & IPV6_ADDR_LOOPBACK) {
		IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, ipvsh->off,
				 "ip_vs_nat_xmit_v6(): "
				 "stopping DNAT to loopback address");
		goto tx_error;
	}

	/* copy-on-write the packet before mangling it */
	if (skb_ensure_writable(skb, sizeof(struct ipv6hdr)))
		goto tx_error;

	if (skb_cow(skb, rt->dst.dev->hard_header_len))
		goto tx_error;

	/* mangle the packet */
	if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh))
		goto tx_error;
	ipv6_hdr(skb)->daddr = cp->daddr.in6;

	IP_VS_DBG_PKT(10, AF_INET6, pp, skb, ipvsh->off, "After DNAT");

	/* FIXME: when the application helper enlarges the packet and the
	 * length becomes larger than the MTU of the outgoing device, there
	 * will still be an MTU problem.
	 */

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);

	return rc;

tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}
#endif

/* When forwarding a packet, we must ensure that we've got enough headroom
 * for the encapsulation packet in the skb.  This also gives us an
 * opportunity to figure out what the payload_len, dsfield, ttl, and df
 * values should be, so that we won't need to look at the old IP header
 * again
 */
static struct sk_buff *
ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af,
			   unsigned int max_headroom, __u8 *next_protocol,
			   __u32 *payload_len, __u8 *dsfield, __u8 *ttl,
			   __be16 *df)
{
	struct sk_buff *new_skb = NULL;
	struct iphdr *old_iph = NULL;
	__u8 old_dsfield;
#ifdef CONFIG_IP_VS_IPV6
	struct ipv6hdr *old_ipv6h = NULL;
#endif

	ip_vs_drop_early_demux_sk(skb);

	if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) {
		new_skb = skb_realloc_headroom(skb, max_headroom);
		if (!new_skb)
			goto error;
		if (skb->sk)
			skb_set_owner_w(new_skb, skb->sk);
		consume_skb(skb);
		skb = new_skb;
	}

#ifdef CONFIG_IP_VS_IPV6
	if (skb_af == AF_INET6) {
		old_ipv6h = ipv6_hdr(skb);
		*next_protocol = IPPROTO_IPV6;
		if (payload_len)
			*payload_len =
				ntohs(old_ipv6h->payload_len) +
				sizeof(*old_ipv6h);
		old_dsfield = ipv6_get_dsfield(old_ipv6h);
		*ttl = old_ipv6h->hop_limit;
		if (df)
			*df = 0;
	} else
#endif
	{
		old_iph = ip_hdr(skb);
		/* Copy DF, reset fragment offset and MF */
		if (df)
			*df = (old_iph->frag_off & htons(IP_DF));
		*next_protocol = IPPROTO_IPIP;

		/* fix old IP header checksum */
		ip_send_check(old_iph);
		old_dsfield = ipv4_get_dsfield(old_iph);
		*ttl = old_iph->ttl;
		if (payload_len)
			*payload_len = skb_ip_totlen(skb);
	}

	/* Implement full-functionality option for ECN encapsulation */
	*dsfield = INET_ECN_encapsulate(old_dsfield, old_dsfield);

	return skb;
error:
	kfree_skb(skb);
	return ERR_PTR(-ENOMEM);
}

static inline int __tun_gso_type_mask(int encaps_af, int orig_af)
{
	switch (encaps_af) {
	case AF_INET:
		return SKB_GSO_IPXIP4;
	case AF_INET6:
		return SKB_GSO_IPXIP6;
	default:
		return 0;
	}
}

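/* Encapsulate the packet in GUE over UDP, optionally advertising
 * remote checksum offload through a GUE private-flags extension.
 */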
static int
ipvs_gue_encap(struct net *net, struct sk_buff *skb,
	       struct ip_vs_conn *cp, __u8 *next_protocol)
{
	__be16 dport;
	__be16 sport = udp_flow_src_port(net, skb, 0, 0, false);
	struct udphdr  *udph;	/* Our new UDP header */
	struct guehdr  *gueh;	/* Our new GUE header */
	size_t hdrlen, optlen = 0;
	void *data;
	bool need_priv = false;

	if ((cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
	    skb->ip_summed == CHECKSUM_PARTIAL) {
		optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
		need_priv = true;
	}

	hdrlen = sizeof(struct guehdr) + optlen;

	skb_push(skb, hdrlen);

	gueh = (struct guehdr *)skb->data;

	gueh->control = 0;
	gueh->version = 0;
	gueh->hlen = optlen >> 2;
	gueh->flags = 0;
	gueh->proto_ctype = *next_protocol;

	data = &gueh[1];

	if (need_priv) {
		__be32 *flags = data;
		u16 csum_start = skb_checksum_start_offset(skb);
		__be16 *pd;

		gueh->flags |= GUE_FLAG_PRIV;
		*flags = 0;
		data += GUE_LEN_PRIV;

		if (csum_start < hdrlen)
			return -EINVAL;

		csum_start -= hdrlen;
		pd = data;
		pd[0] = htons(csum_start);
		pd[1] = htons(csum_start + skb->csum_offset);

		if (!skb_is_gso(skb)) {
			skb->ip_summed = CHECKSUM_NONE;
			skb->encapsulation = 0;
		}

		*flags |= GUE_PFLAG_REMCSUM;
		data += GUE_PLEN_REMCSUM;
	}

	skb_push(skb, sizeof(struct udphdr));
	skb_reset_transport_header(skb);

	udph = udp_hdr(skb);

	dport = cp->dest->tun_port;
	udph->dest = dport;
	udph->source = sport;
	udph->len = htons(skb->len);
	udph->check = 0;

	*next_protocol = IPPROTO_UDP;

	return 0;
}

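/* Encapsulate the packet in a GRE header; a checksum field is included
 * when the destination was configured with the csum tunnel flag.
 */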
static void
ipvs_gre_encap(struct net *net, struct sk_buff *skb,
	       struct ip_vs_conn *cp, __u8 *next_protocol)
{
	__be16 proto = *next_protocol == IPPROTO_IPIP ?
				htons(ETH_P_IP) : htons(ETH_P_IPV6);
	IP_TUNNEL_DECLARE_FLAGS(tflags) = { };
	size_t hdrlen;

	if (cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
		__set_bit(IP_TUNNEL_CSUM_BIT, tflags);

	hdrlen = gre_calc_hlen(tflags);
	gre_build_header(skb, hdrlen, tflags, proto, 0, 0);

	*next_protocol = IPPROTO_GRE;
}

/*
 *   IP Tunneling transmitter
 *
 *   This function encapsulates the packet in a new IP packet, its
 *   destination will be set to cp->daddr. Most code of this function
 *   is taken from ipip.c.
 *
 *   It is used in VS/TUN cluster. The load balancer selects a real
 *   server from a cluster based on a scheduling algorithm,
 *   encapsulates the request packet and forwards it to the selected
 *   server. For example, all real servers are configured with
 *   "ifconfig tunl0 <Virtual IP Address> up". When the server receives
 *   the encapsulated packet, it will decapsulate the packet, process
 *   the request and return the response packets directly to the client
 *   without passing through the load balancer. This can greatly
 *   increase the scalability of a virtual server.
 *
 *   Used for ANY protocol
 */
int
ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
		  struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	struct netns_ipvs *ipvs = cp->ipvs;
	struct net *net = ipvs->net;
	struct rtable *rt;			/* Route to the other host */
	__be32 saddr;				/* Source for tunnel */
	struct net_device *tdev;		/* Device to other host */
	__u8 next_protocol = 0;
	__u8 dsfield = 0;
	__u8 ttl = 0;
	__be16 df = 0;
	__be16 *dfp = NULL;
	struct iphdr  *iph;			/* Our new IP header */
	unsigned int max_headroom;		/* The extra header space needed */
	int ret, local;
	int tun_type, gso_type;
	int tun_flags;

	local = __ip_vs_get_out_rt(ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
				   IP_VS_RT_MODE_LOCAL |
				   IP_VS_RT_MODE_NON_LOCAL |
				   IP_VS_RT_MODE_CONNECT |
				   IP_VS_RT_MODE_TUNNEL, &saddr, ipvsh);
	if (local < 0)
		goto tx_error;
	if (local)
		return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);

	rt = skb_rtable(skb);
	tdev = rt->dst.dev;

	/*
	 * Okay, now see if we can stuff it in the buffer as-is.
	 */
	max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);

	tun_type = cp->dest->tun_type;
	tun_flags = cp->dest->tun_flags;

	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
		size_t gue_hdrlen, gue_optlen = 0;

		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
		    skb->ip_summed == CHECKSUM_PARTIAL) {
			gue_optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
		}
		gue_hdrlen = sizeof(struct guehdr) + gue_optlen;

		max_headroom += sizeof(struct udphdr) + gue_hdrlen;
	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
		IP_TUNNEL_DECLARE_FLAGS(tflags) = { };
		size_t gre_hdrlen;

		if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
			__set_bit(IP_TUNNEL_CSUM_BIT, tflags);
		gre_hdrlen = gre_calc_hlen(tflags);

		max_headroom += gre_hdrlen;
	}

	/* We only care about the df field if sysctl_pmtu_disc(ipvs) is set */
	dfp = sysctl_pmtu_disc(ipvs) ? &df : NULL;
	skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom,
					 &next_protocol, NULL, &dsfield,
					 &ttl, dfp);
	if (IS_ERR(skb))
		return NF_STOLEN;

	gso_type = __tun_gso_type_mask(AF_INET, cp->af);
	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
		    (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
			gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
		else
			gso_type |= SKB_GSO_UDP_TUNNEL;
		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
		    skb->ip_summed == CHECKSUM_PARTIAL) {
			gso_type |= SKB_GSO_TUNNEL_REMCSUM;
		}
	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
		if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
			gso_type |= SKB_GSO_GRE_CSUM;
		else
			gso_type |= SKB_GSO_GRE;
	}

	if (iptunnel_handle_offloads(skb, gso_type))
		goto tx_error;

	skb->transport_header = skb->network_header;

	skb_set_inner_ipproto(skb, next_protocol);
	skb_set_inner_mac_header(skb, skb_inner_network_offset(skb));

	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
		bool check = false;

		if (ipvs_gue_encap(net, skb, cp, &next_protocol))
			goto tx_error;

		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
		    (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
			check = true;

		udp_set_csum(!check, skb, saddr, cp->daddr.ip, skb->len);
	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE)
		ipvs_gre_encap(net, skb, cp, &next_protocol);

	skb_push(skb, sizeof(struct iphdr));
	skb_reset_network_header(skb);
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	/*
	 *	Push down and install the outer IP header.
	 */
	iph = ip_hdr(skb);
	iph->version = 4;
	iph->ihl = sizeof(struct iphdr) >> 2;
	iph->frag_off = df;
	iph->protocol = next_protocol;
	iph->tos = dsfield;
	iph->daddr = cp->daddr.ip;
	iph->saddr = saddr;
	iph->ttl = ttl;
	ip_select_ident(net, skb, NULL);

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	ret = ip_vs_tunnel_xmit_prepare(skb, cp);
	if (ret == NF_ACCEPT)
		ip_local_out(net, skb->sk, skb);
	else if (ret == NF_DROP)
		kfree_skb(skb);

	return NF_STOLEN;

tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}

#ifdef CONFIG_IP_VS_IPV6
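/* IPv6 tunnel transmitter: same logic as ip_vs_tunnel_xmit(), but the
 * packet is wrapped in an outer IPv6 header and xfrm policy is
 * consulted (do_xfrm = 1) when looking up the tunnel route.
 */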
int
ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
		     struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	struct netns_ipvs *ipvs = cp->ipvs;
	struct net *net = ipvs->net;
	struct rt6_info *rt;		/* Route to the other host */
	struct in6_addr saddr;		/* Source for tunnel */
	struct net_device *tdev;	/* Device to other host */
	__u8 next_protocol = 0;
	__u32 payload_len = 0;
	__u8 dsfield = 0;
	__u8 ttl = 0;
	struct ipv6hdr  *iph;		/* Our new IP header */
	unsigned int max_headroom;	/* The extra header space needed */
	int ret, local;
	int tun_type, gso_type;
	int tun_flags;

	local = __ip_vs_get_out_rt_v6(ipvs, cp->af, skb, cp->dest,
				      &cp->daddr.in6,
				      &saddr, ipvsh, 1,
				      IP_VS_RT_MODE_LOCAL |
				      IP_VS_RT_MODE_NON_LOCAL |
				      IP_VS_RT_MODE_TUNNEL);
	if (local < 0)
		goto tx_error;
	if (local)
		return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);

	rt = dst_rt6_info(skb_dst(skb));
	tdev = rt->dst.dev;

	/*
	 * Okay, now see if we can stuff it in the buffer as-is.
	 */
	max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr);

	tun_type = cp->dest->tun_type;
	tun_flags = cp->dest->tun_flags;

	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
		size_t gue_hdrlen, gue_optlen = 0;

		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
		    skb->ip_summed == CHECKSUM_PARTIAL) {
			gue_optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
		}
		gue_hdrlen = sizeof(struct guehdr) + gue_optlen;

		max_headroom += sizeof(struct udphdr) + gue_hdrlen;
	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
		IP_TUNNEL_DECLARE_FLAGS(tflags) = { };
		size_t gre_hdrlen;

		if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
			__set_bit(IP_TUNNEL_CSUM_BIT, tflags);
		gre_hdrlen = gre_calc_hlen(tflags);

		max_headroom += gre_hdrlen;
	}

	skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom,
					 &next_protocol, &payload_len,
					 &dsfield, &ttl, NULL);
	if (IS_ERR(skb))
		return NF_STOLEN;

	gso_type = __tun_gso_type_mask(AF_INET6, cp->af);
	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
		    (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
			gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
		else
			gso_type |= SKB_GSO_UDP_TUNNEL;
		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
		    skb->ip_summed == CHECKSUM_PARTIAL) {
			gso_type |= SKB_GSO_TUNNEL_REMCSUM;
		}
	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
		if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
			gso_type |= SKB_GSO_GRE_CSUM;
		else
			gso_type |= SKB_GSO_GRE;
	}

	if (iptunnel_handle_offloads(skb, gso_type))
		goto tx_error;

	skb->transport_header = skb->network_header;

	skb_set_inner_ipproto(skb, next_protocol);
	skb_set_inner_mac_header(skb, skb_inner_network_offset(skb));

	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
		bool check = false;

		if (ipvs_gue_encap(net, skb, cp, &next_protocol))
			goto tx_error;

		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
		    (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
			check = true;

		udp6_set_csum(!check, skb, &saddr, &cp->daddr.in6, skb->len);
	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE)
		ipvs_gre_encap(net, skb, cp, &next_protocol);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	/*
	 *	Push down and install the outer IPv6 header.
	 */
	iph = ipv6_hdr(skb);
	iph->version = 6;
	iph->nexthdr = next_protocol;
	iph->payload_len = htons(payload_len);
	memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl));
	ipv6_change_dsfield(iph, 0, dsfield);
	iph->daddr = cp->daddr.in6;
	iph->saddr = saddr;
	iph->hop_limit = ttl;

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	ret = ip_vs_tunnel_xmit_prepare(skb, cp);
	if (ret == NF_ACCEPT)
		ip6_local_out(net, skb->sk, skb);
	else if (ret == NF_DROP)
		kfree_skb(skb);

	return NF_STOLEN;

tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}
#endif

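/* In DR mode the packet itself is not modified: it is routed straight
 * to the real server's address (KNOWN_NH), so the real server must be
 * configured to accept traffic for the VIP, e.g. on a loopback alias.
 */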
/*
 *      Direct Routing transmitter
 *      Used for ANY protocol
 */
int
ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
	      struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	int local;

	local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
				   IP_VS_RT_MODE_LOCAL |
				   IP_VS_RT_MODE_NON_LOCAL |
				   IP_VS_RT_MODE_KNOWN_NH, NULL, ipvsh);
	if (local < 0)
		goto tx_error;
	if (local)
		return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);

	ip_send_check(ip_hdr(skb));

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);

	return NF_STOLEN;

tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}

#ifdef CONFIG_IP_VS_IPV6
int
ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
		 struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	int local;

	local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
				      &cp->daddr.in6,
				      NULL, ipvsh, 0,
				      IP_VS_RT_MODE_LOCAL |
				      IP_VS_RT_MODE_NON_LOCAL |
				      IP_VS_RT_MODE_KNOWN_NH);
	if (local < 0)
		goto tx_error;
	if (local)
		return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);

	return NF_STOLEN;

tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}
#endif


/*
 *	ICMP packet transmitter
 *	called by the ip_vs_in_icmp
 */
int
ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
		struct ip_vs_protocol *pp, int offset, unsigned int hooknum,
		struct ip_vs_iphdr *iph)
{
	struct rtable	*rt;	/* Route to the other host */
	int rc;
	int local;
	int rt_mode, was_input;

	/* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
	 * forwarded directly here, because there is no need to
	 * translate address/port back
	 */
	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
		if (cp->packet_xmit)
			rc = cp->packet_xmit(skb, cp, pp, iph);
		else
			rc = NF_ACCEPT;
		/* do not touch skb anymore */
		atomic_inc(&cp->in_pkts);
		return rc;
	}

	/*
	 * mangle and send the packet here (only for VS/NAT)
	 */
	was_input = rt_is_input_route(skb_rtable(skb));

	/* LOCALNODE from FORWARD hook is not supported */
	rt_mode = (hooknum != NF_INET_FORWARD) ?
		  IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
		  IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
	local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip, rt_mode,
				   NULL, iph);
	if (local < 0)
		goto tx_error;
	rt = skb_rtable(skb);

	/*
	 * Avoid duplicate tuple in reply direction for NAT traffic
	 * to local address when connection is sync-ed
	 */
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
		enum ip_conntrack_info ctinfo;
		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);

		if (ct) {
			IP_VS_DBG(10, "%s(): "
				  "stopping DNAT to local address %pI4\n",
				  __func__, &cp->daddr.ip);
			goto tx_error;
		}
	}
#endif

	/* From world but DNAT to loopback address? */
	if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) {
		IP_VS_DBG(1, "%s(): "
			  "stopping DNAT to loopback %pI4\n",
			  __func__, &cp->daddr.ip);
		goto tx_error;
	}

	/* copy-on-write the packet before mangling it */
	if (skb_ensure_writable(skb, offset))
		goto tx_error;

	if (skb_cow(skb, rt->dst.dev->hard_header_len))
		goto tx_error;

	ip_vs_nat_icmp(skb, pp, cp, 0);

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	return ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);

tx_error:
	kfree_skb(skb);
	rc = NF_STOLEN;
	return rc;
}

#ifdef CONFIG_IP_VS_IPV6
int
ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
		   struct ip_vs_protocol *pp, int offset, unsigned int hooknum,
		   struct ip_vs_iphdr *ipvsh)
{
	struct rt6_info	*rt;	/* Route to the other host */
	int rc;
	int local;
	int rt_mode;

	/* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
	 * forwarded directly here, because there is no need to
	 * translate address/port back
	 */
	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
		if (cp->packet_xmit)
			rc = cp->packet_xmit(skb, cp, pp, ipvsh);
		else
			rc = NF_ACCEPT;
		/* do not touch skb anymore */
		atomic_inc(&cp->in_pkts);
		return rc;
	}

	/*
	 * mangle and send the packet here (only for VS/NAT)
	 */

	/* LOCALNODE from FORWARD hook is not supported */
	rt_mode = (hooknum != NF_INET_FORWARD) ?
		  IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
		  IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
	local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
				      &cp->daddr.in6, NULL, ipvsh, 0, rt_mode);
	if (local < 0)
		goto tx_error;
	rt = dst_rt6_info(skb_dst(skb));
	/*
	 * Avoid duplicate tuple in reply direction for NAT traffic
	 * to local address when connection is sync-ed
	 */
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
		enum ip_conntrack_info ctinfo;
		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);

		if (ct) {
			IP_VS_DBG(10, "%s(): "
				  "stopping DNAT to local address %pI6\n",
				  __func__, &cp->daddr.in6);
			goto tx_error;
		}
	}
#endif

	/* From world but DNAT to loopback address? */
	if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
	    ipv6_addr_type(&cp->daddr.in6) & IPV6_ADDR_LOOPBACK) {
		IP_VS_DBG(1, "%s(): "
			  "stopping DNAT to loopback %pI6\n",
			  __func__, &cp->daddr.in6);
		goto tx_error;
	}

	/* copy-on-write the packet before mangling it */
	if (skb_ensure_writable(skb, offset))
		goto tx_error;

	if (skb_cow(skb, rt->dst.dev->hard_header_len))
		goto tx_error;

	ip_vs_nat_icmp_v6(skb, pp, cp, 0);

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	return ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);

tx_error:
	kfree_skb(skb);
	rc = NF_STOLEN;
	return rc;
}
#endif