xref: /linux/net/netfilter/ipvs/ip_vs_xmit.c (revision 2c89c1b655c0b06823f4ee8b055140d8628fc4da)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * ip_vs_xmit.c: various packet transmitters for IPVS
4  *
5  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
6  *              Julian Anastasov <ja@ssi.bg>
7  *
8  * Changes:
9  *
10  * Description of forwarding methods:
11  * - all transmitters are called from LOCAL_IN (remote clients) and
12  * LOCAL_OUT (local clients), but for ICMP they can also be called from FORWARD
13  * - not all connections have a destination server; for example,
14  * connections on the backup server when fwmark is used
15  * - bypass connections use daddr from packet
16  * - we can use the dst without a ref while sending inside an RCU section;
17  * we take a ref when returning NF_ACCEPT for a NAT-ed packet via loopback
18  * LOCAL_OUT rules:
19  * - skb->dev is NULL, skb->protocol is not set (both are set in POST_ROUTING)
20  * - skb->pkt_type is not set yet
21  * - the only place where we can see skb->sk != NULL
22  */
23 
24 #define KMSG_COMPONENT "IPVS"
25 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
26 
27 #include <linux/kernel.h>
28 #include <linux/slab.h>
29 #include <linux/tcp.h>                  /* for tcphdr */
30 #include <net/ip.h>
31 #include <net/gue.h>
32 #include <net/gre.h>
33 #include <net/tcp.h>                    /* for csum_tcpudp_magic */
34 #include <net/udp.h>
35 #include <net/icmp.h>                   /* for icmp_send */
36 #include <net/route.h>                  /* for ip_route_output */
37 #include <net/ipv6.h>
38 #include <net/ip6_route.h>
39 #include <net/ip_tunnels.h>
40 #include <net/ip6_checksum.h>
41 #include <net/addrconf.h>
42 #include <linux/icmpv6.h>
43 #include <linux/netfilter.h>
44 #include <linux/netfilter_ipv4.h>
45 
46 #include <net/ip_vs.h>
47 
48 enum {
49 	IP_VS_RT_MODE_LOCAL	= 1, /* Allow local dest */
50 	IP_VS_RT_MODE_NON_LOCAL	= 2, /* Allow non-local dest */
51 	IP_VS_RT_MODE_RDR	= 4, /* Allow redirect from remote daddr to
52 				      * local
53 				      */
54 	IP_VS_RT_MODE_CONNECT	= 8, /* Always bind route to saddr */
55 	IP_VS_RT_MODE_KNOWN_NH	= 16,/* Route via remote addr */
56 	IP_VS_RT_MODE_TUNNEL	= 32,/* Tunnel mode */
57 };
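/* Illustrative note (added commentary, not in the original file): the
 * transmitters below combine these flags per forwarding method. For
 * example, the NAT transmitter requests
 *
 *	IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_RDR
 *
 * while the tunnel transmitter adds IP_VS_RT_MODE_CONNECT (bind the outer
 * source address) and IP_VS_RT_MODE_TUNNEL (reserve encapsulation room
 * when checking the MTU).
 */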
58 
59 static inline struct ip_vs_dest_dst *ip_vs_dest_dst_alloc(void)
60 {
61 	return kmalloc(sizeof(struct ip_vs_dest_dst), GFP_ATOMIC);
62 }
63 
64 static inline void ip_vs_dest_dst_free(struct ip_vs_dest_dst *dest_dst)
65 {
66 	kfree(dest_dst);
67 }
68 
69 /*
70  *      Destination cache to speed up outgoing route lookup
71  */
72 static inline void
73 __ip_vs_dst_set(struct ip_vs_dest *dest, struct ip_vs_dest_dst *dest_dst,
74 		struct dst_entry *dst, u32 dst_cookie)
75 {
76 	struct ip_vs_dest_dst *old;
77 
78 	old = rcu_dereference_protected(dest->dest_dst,
79 					lockdep_is_held(&dest->dst_lock));
80 
81 	if (dest_dst) {
82 		dest_dst->dst_cache = dst;
83 		dest_dst->dst_cookie = dst_cookie;
84 	}
85 	rcu_assign_pointer(dest->dest_dst, dest_dst);
86 
87 	if (old)
88 		call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free);
89 }
90 
91 static inline struct ip_vs_dest_dst *
92 __ip_vs_dst_check(struct ip_vs_dest *dest)
93 {
94 	struct ip_vs_dest_dst *dest_dst = rcu_dereference(dest->dest_dst);
95 	struct dst_entry *dst;
96 
97 	if (!dest_dst)
98 		return NULL;
99 	dst = dest_dst->dst_cache;
100 	if (dst->obsolete &&
101 	    dst->ops->check(dst, dest_dst->dst_cookie) == NULL)
102 		return NULL;
103 	return dest_dst;
104 }
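/* A minimal usage sketch, assuming the caller already runs inside an RCU
 * read-side critical section (as all transmitters do). The "example_"
 * helper is hypothetical and not part of IPVS.
 */
static inline struct rtable *example_cached_route(struct ip_vs_dest *dest)
{
	struct ip_vs_dest_dst *dd = __ip_vs_dst_check(dest);

	/* On a miss, callers take dest->dst_lock, redo the route lookup
	 * and publish the result with __ip_vs_dst_set(), exactly as
	 * __ip_vs_get_out_rt() does below.
	 */
	return dd ? dst_rtable(dd->dst_cache) : NULL;
}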
105 
106 static inline bool
107 __mtu_check_toobig_v6(const struct sk_buff *skb, u32 mtu)
108 {
109 	if (IP6CB(skb)->frag_max_size) {
110 		/* A non-zero frag_max_size tells us that this packet has
111 		 * been defragmented by the netfilter IPv6 conntrack module.
112 		 */
113 		if (IP6CB(skb)->frag_max_size > mtu)
114 			return true; /* largest fragment violates MTU */
115 	}
116 	else if (skb->len > mtu && !skb_is_gso(skb)) {
117 		return true; /* packet size violates MTU */
118 	}
119 	return false;
120 }
121 
122 /* Get route to daddr, optionally bind route to saddr */
123 static struct rtable *do_output_route4(struct net *net, __be32 daddr,
124 				       int rt_mode, __be32 *ret_saddr)
125 {
126 	struct flowi4 fl4;
127 	struct rtable *rt;
128 
129 	memset(&fl4, 0, sizeof(fl4));
130 	fl4.daddr = daddr;
131 	fl4.flowi4_flags = (rt_mode & IP_VS_RT_MODE_KNOWN_NH) ?
132 			   FLOWI_FLAG_KNOWN_NH : 0;
133 
134 retry:
135 	rt = ip_route_output_key(net, &fl4);
136 	if (IS_ERR(rt)) {
137 		IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &daddr);
138 		return NULL;
139 	}
140 	if (rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) {
141 		ip_rt_put(rt);
142 		flowi4_update_output(&fl4, 0, daddr, fl4.saddr);
143 		rt_mode = 0;
144 		goto retry;
145 	}
146 	if (ret_saddr)
147 		*ret_saddr = fl4.saddr;
148 	return rt;
149 }
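/* A minimal sketch of a CONNECT-mode lookup (hypothetical "example_"
 * helper, not in the original file): the first lookup learns the source
 * address the stack prefers, and do_output_route4() then retries
 * internally with the output route bound to that saddr.
 */
static inline struct rtable *example_connect_route(struct net *net,
						   __be32 daddr,
						   __be32 *saddr)
{
	return do_output_route4(net, daddr, IP_VS_RT_MODE_CONNECT, saddr);
}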
150 
151 #ifdef CONFIG_IP_VS_IPV6
152 static inline int __ip_vs_is_local_route6(struct rt6_info *rt)
153 {
154 	return rt->dst.dev && rt->dst.dev->flags & IFF_LOOPBACK;
155 }
156 #endif
157 
158 static inline bool crosses_local_route_boundary(int skb_af, struct sk_buff *skb,
159 						int rt_mode,
160 						bool new_rt_is_local)
161 {
162 	bool rt_mode_allow_local = !!(rt_mode & IP_VS_RT_MODE_LOCAL);
163 	bool rt_mode_allow_non_local = !!(rt_mode & IP_VS_RT_MODE_NON_LOCAL);
164 	bool rt_mode_allow_redirect = !!(rt_mode & IP_VS_RT_MODE_RDR);
165 	bool source_is_loopback;
166 	bool old_rt_is_local;
167 
168 #ifdef CONFIG_IP_VS_IPV6
169 	if (skb_af == AF_INET6) {
170 		int addr_type = ipv6_addr_type(&ipv6_hdr(skb)->saddr);
171 
172 		source_is_loopback =
173 			(!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
174 			(addr_type & IPV6_ADDR_LOOPBACK);
175 		old_rt_is_local = __ip_vs_is_local_route6(
176 			dst_rt6_info(skb_dst(skb)));
177 	} else
178 #endif
179 	{
180 		source_is_loopback = ipv4_is_loopback(ip_hdr(skb)->saddr);
181 		old_rt_is_local = skb_rtable(skb)->rt_flags & RTCF_LOCAL;
182 	}
183 
184 	if (unlikely(new_rt_is_local)) {
185 		if (!rt_mode_allow_local)
186 			return true;
187 		if (!rt_mode_allow_redirect && !old_rt_is_local)
188 			return true;
189 	} else {
190 		if (!rt_mode_allow_non_local)
191 			return true;
192 		if (source_is_loopback)
193 			return true;
194 	}
195 	return false;
196 }
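/* Summary of the checks above (added commentary):
 *
 *	new route local,     local not allowed             -> crosses
 *	new route local,     old route non-local, no RDR   -> crosses
 *	new route non-local, non-local not allowed         -> crosses
 *	new route non-local, source address is loopback    -> crosses
 *	anything else                                      -> ok (false)
 */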
197 
198 static inline void maybe_update_pmtu(int skb_af, struct sk_buff *skb, int mtu)
199 {
200 	struct sock *sk = skb->sk;
201 	struct rtable *ort = skb_rtable(skb);
202 
203 	if (!skb->dev && sk && sk_fullsock(sk))
204 		ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu, true);
205 }
206 
207 static inline bool ensure_mtu_is_adequate(struct netns_ipvs *ipvs, int skb_af,
208 					  int rt_mode,
209 					  struct ip_vs_iphdr *ipvsh,
210 					  struct sk_buff *skb, int mtu)
211 {
212 #ifdef CONFIG_IP_VS_IPV6
213 	if (skb_af == AF_INET6) {
214 		struct net *net = ipvs->net;
215 
216 		if (unlikely(__mtu_check_toobig_v6(skb, mtu))) {
217 			if (!skb->dev)
218 				skb->dev = net->loopback_dev;
219 			/* only send ICMP too big on first fragment */
220 			if (!ipvsh->fragoffs && !ip_vs_iph_icmp(ipvsh))
221 				icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
222 			IP_VS_DBG(1, "frag needed for %pI6c\n",
223 				  &ipv6_hdr(skb)->saddr);
224 			return false;
225 		}
226 	} else
227 #endif
228 	{
229 		/* If we're going to tunnel the packet and pmtu discovery
230 		 * is disabled, we'll just fragment it anyway
231 		 */
232 		if ((rt_mode & IP_VS_RT_MODE_TUNNEL) && !sysctl_pmtu_disc(ipvs))
233 			return true;
234 
235 		if (unlikely(ip_hdr(skb)->frag_off & htons(IP_DF) &&
236 			     skb->len > mtu && !skb_is_gso(skb) &&
237 			     !ip_vs_iph_icmp(ipvsh))) {
238 			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
239 				  htonl(mtu));
240 			IP_VS_DBG(1, "frag needed for %pI4\n",
241 				  &ip_hdr(skb)->saddr);
242 			return false;
243 		}
244 	}
245 
246 	return true;
247 }
248 
249 static inline bool decrement_ttl(struct netns_ipvs *ipvs,
250 				 int skb_af,
251 				 struct sk_buff *skb)
252 {
253 	struct net *net = ipvs->net;
254 
255 #ifdef CONFIG_IP_VS_IPV6
256 	if (skb_af == AF_INET6) {
257 		struct dst_entry *dst = skb_dst(skb);
258 
259 		/* check and decrement ttl */
260 		if (ipv6_hdr(skb)->hop_limit <= 1) {
261 			struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
262 
263 			/* Force OUTPUT device used as source address */
264 			skb->dev = dst->dev;
265 			icmpv6_send(skb, ICMPV6_TIME_EXCEED,
266 				    ICMPV6_EXC_HOPLIMIT, 0);
267 			IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
268 
269 			return false;
270 		}
271 
272 		/* don't propagate ttl change to cloned packets */
273 		if (skb_ensure_writable(skb, sizeof(struct ipv6hdr)))
274 			return false;
275 
276 		ipv6_hdr(skb)->hop_limit--;
277 	} else
278 #endif
279 	{
280 		if (ip_hdr(skb)->ttl <= 1) {
281 			/* Tell the sender its packet died... */
282 			IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
283 			icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
284 			return false;
285 		}
286 
287 		/* don't propagate ttl change to cloned packets */
288 		if (skb_ensure_writable(skb, sizeof(struct iphdr)))
289 			return false;
290 
291 		/* Decrease ttl */
292 		ip_decrease_ttl(ip_hdr(skb));
293 	}
294 
295 	return true;
296 }
297 
298 /* Get route to destination or remote server */
299 static int
300 __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
301 		   struct ip_vs_dest *dest,
302 		   __be32 daddr, int rt_mode, __be32 *ret_saddr,
303 		   struct ip_vs_iphdr *ipvsh)
304 {
305 	struct net *net = ipvs->net;
306 	struct ip_vs_dest_dst *dest_dst;
307 	struct rtable *rt;			/* Route to the other host */
308 	int mtu;
309 	int local, noref = 1;
310 
311 	if (dest) {
312 		dest_dst = __ip_vs_dst_check(dest);
313 		if (likely(dest_dst))
314 			rt = dst_rtable(dest_dst->dst_cache);
315 		else {
316 			dest_dst = ip_vs_dest_dst_alloc();
317 			spin_lock_bh(&dest->dst_lock);
318 			if (!dest_dst) {
319 				__ip_vs_dst_set(dest, NULL, NULL, 0);
320 				spin_unlock_bh(&dest->dst_lock);
321 				goto err_unreach;
322 			}
323 			rt = do_output_route4(net, dest->addr.ip, rt_mode,
324 					      &dest_dst->dst_saddr.ip);
325 			if (!rt) {
326 				__ip_vs_dst_set(dest, NULL, NULL, 0);
327 				spin_unlock_bh(&dest->dst_lock);
328 				ip_vs_dest_dst_free(dest_dst);
329 				goto err_unreach;
330 			}
331 			__ip_vs_dst_set(dest, dest_dst, &rt->dst, 0);
332 			spin_unlock_bh(&dest->dst_lock);
333 			IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n",
334 				  &dest->addr.ip, &dest_dst->dst_saddr.ip,
335 				  rcuref_read(&rt->dst.__rcuref));
336 		}
337 		if (ret_saddr)
338 			*ret_saddr = dest_dst->dst_saddr.ip;
339 	} else {
340 		noref = 0;
341 
342 		/* For such unconfigured boxes avoid many route lookups
343 		 * for performance reasons because we do not remember saddr
344 		 */
345 		rt_mode &= ~IP_VS_RT_MODE_CONNECT;
346 		rt = do_output_route4(net, daddr, rt_mode, ret_saddr);
347 		if (!rt)
348 			goto err_unreach;
349 	}
350 
351 	local = (rt->rt_flags & RTCF_LOCAL) ? 1 : 0;
352 	if (unlikely(crosses_local_route_boundary(skb_af, skb, rt_mode,
353 						  local))) {
354 		IP_VS_DBG_RL("We are crossing local and non-local addresses"
355 			     " daddr=%pI4\n", &daddr);
356 		goto err_put;
357 	}
358 
359 	if (unlikely(local)) {
360 		/* skb to local stack, preserve old route */
361 		if (!noref)
362 			ip_rt_put(rt);
363 		return local;
364 	}
365 
366 	if (!decrement_ttl(ipvs, skb_af, skb))
367 		goto err_put;
368 
369 	if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) {
370 		mtu = dst_mtu(&rt->dst);
371 	} else {
372 		mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
373 		if (!dest)
374 			goto err_put;
375 		if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
376 			mtu -= sizeof(struct udphdr) + sizeof(struct guehdr);
377 			if ((dest->tun_flags &
378 			     IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
379 			    skb->ip_summed == CHECKSUM_PARTIAL)
380 				mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
381 		} else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
382 			IP_TUNNEL_DECLARE_FLAGS(tflags) = { };
383 
384 			if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
385 				__set_bit(IP_TUNNEL_CSUM_BIT, tflags);
386 			mtu -= gre_calc_hlen(tflags);
387 		}
388 		if (mtu < 68) {
389 			IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
390 			goto err_put;
391 		}
392 		maybe_update_pmtu(skb_af, skb, mtu);
393 	}
394 
395 	if (!ensure_mtu_is_adequate(ipvs, skb_af, rt_mode, ipvsh, skb, mtu))
396 		goto err_put;
397 
398 	skb_dst_drop(skb);
399 	if (noref)
400 		skb_dst_set_noref(skb, &rt->dst);
401 	else
402 		skb_dst_set(skb, &rt->dst);
403 
404 	return local;
405 
406 err_put:
407 	if (!noref)
408 		ip_rt_put(rt);
409 	return -1;
410 
411 err_unreach:
412 	dst_link_failure(skb);
413 	return -1;
414 }
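/* Return convention (added commentary), shared with __ip_vs_get_out_rt_v6()
 * and relied upon by every transmitter below: 1 means daddr is a local
 * address and the skb keeps its old route, 0 means a route to the remote
 * host was attached to the skb, -1 means error and the caller must free
 * the skb.
 */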
415 
416 #ifdef CONFIG_IP_VS_IPV6
417 static struct dst_entry *
418 __ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr,
419 			struct in6_addr *ret_saddr, int do_xfrm, int rt_mode)
420 {
421 	struct dst_entry *dst;
422 	struct flowi6 fl6 = {
423 		.daddr = *daddr,
424 	};
425 
426 	if (rt_mode & IP_VS_RT_MODE_KNOWN_NH)
427 		fl6.flowi6_flags = FLOWI_FLAG_KNOWN_NH;
428 
429 	dst = ip6_route_output(net, NULL, &fl6);
430 	if (dst->error)
431 		goto out_err;
432 	if (!ret_saddr)
433 		return dst;
434 	if (ipv6_addr_any(&fl6.saddr) &&
435 	    ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev,
436 			       &fl6.daddr, 0, &fl6.saddr) < 0)
437 		goto out_err;
438 	if (do_xfrm) {
439 		dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
440 		if (IS_ERR(dst)) {
441 			dst = NULL;
442 			goto out_err;
443 		}
444 	}
445 	*ret_saddr = fl6.saddr;
446 	return dst;
447 
448 out_err:
449 	dst_release(dst);
450 	IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", daddr);
451 	return NULL;
452 }
453 
454 /*
455  * Get route to destination or remote server
456  */
457 static int
458 __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
459 		      struct ip_vs_dest *dest,
460 		      struct in6_addr *daddr, struct in6_addr *ret_saddr,
461 		      struct ip_vs_iphdr *ipvsh, int do_xfrm, int rt_mode)
462 {
463 	struct net *net = ipvs->net;
464 	struct ip_vs_dest_dst *dest_dst;
465 	struct rt6_info *rt;			/* Route to the other host */
466 	struct dst_entry *dst;
467 	int mtu;
468 	int local, noref = 1;
469 
470 	if (dest) {
471 		dest_dst = __ip_vs_dst_check(dest);
472 		if (likely(dest_dst))
473 			rt = dst_rt6_info(dest_dst->dst_cache);
474 		else {
475 			u32 cookie;
476 
477 			dest_dst = ip_vs_dest_dst_alloc();
478 			spin_lock_bh(&dest->dst_lock);
479 			if (!dest_dst) {
480 				__ip_vs_dst_set(dest, NULL, NULL, 0);
481 				spin_unlock_bh(&dest->dst_lock);
482 				goto err_unreach;
483 			}
484 			dst = __ip_vs_route_output_v6(net, &dest->addr.in6,
485 						      &dest_dst->dst_saddr.in6,
486 						      do_xfrm, rt_mode);
487 			if (!dst) {
488 				__ip_vs_dst_set(dest, NULL, NULL, 0);
489 				spin_unlock_bh(&dest->dst_lock);
490 				ip_vs_dest_dst_free(dest_dst);
491 				goto err_unreach;
492 			}
493 			rt = dst_rt6_info(dst);
494 			cookie = rt6_get_cookie(rt);
495 			__ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie);
496 			spin_unlock_bh(&dest->dst_lock);
497 			IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
498 				  &dest->addr.in6, &dest_dst->dst_saddr.in6,
499 				  rcuref_read(&rt->dst.__rcuref));
500 		}
501 		if (ret_saddr)
502 			*ret_saddr = dest_dst->dst_saddr.in6;
503 	} else {
504 		noref = 0;
505 		dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm,
506 					      rt_mode);
507 		if (!dst)
508 			goto err_unreach;
509 		rt = dst_rt6_info(dst);
510 	}
511 
512 	local = __ip_vs_is_local_route6(rt);
513 
514 	if (unlikely(crosses_local_route_boundary(skb_af, skb, rt_mode,
515 						  local))) {
516 		IP_VS_DBG_RL("We are crossing local and non-local addresses"
517 			     " daddr=%pI6\n", daddr);
518 		goto err_put;
519 	}
520 
521 	if (unlikely(local)) {
522 		/* skb to local stack, preserve old route */
523 		if (!noref)
524 			dst_release(&rt->dst);
525 		return local;
526 	}
527 
528 	if (!decrement_ttl(ipvs, skb_af, skb))
529 		goto err_put;
530 
531 	/* MTU checking */
532 	if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL)))
533 		mtu = dst_mtu(&rt->dst);
534 	else {
535 		mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
536 		if (!dest)
537 			goto err_put;
538 		if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
539 			mtu -= sizeof(struct udphdr) + sizeof(struct guehdr);
540 			if ((dest->tun_flags &
541 			     IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
542 			    skb->ip_summed == CHECKSUM_PARTIAL)
543 				mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
544 		} else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
545 			IP_TUNNEL_DECLARE_FLAGS(tflags) = { };
546 
547 			if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
548 				__set_bit(IP_TUNNEL_CSUM_BIT, tflags);
549 			mtu -= gre_calc_hlen(tflags);
550 		}
551 		if (mtu < IPV6_MIN_MTU) {
552 			IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
553 				     IPV6_MIN_MTU);
554 			goto err_put;
555 		}
556 		maybe_update_pmtu(skb_af, skb, mtu);
557 	}
558 
559 	if (!ensure_mtu_is_adequate(ipvs, skb_af, rt_mode, ipvsh, skb, mtu))
560 		goto err_put;
561 
562 	skb_dst_drop(skb);
563 	if (noref)
564 		skb_dst_set_noref(skb, &rt->dst);
565 	else
566 		skb_dst_set(skb, &rt->dst);
567 
568 	return local;
569 
570 err_put:
571 	if (!noref)
572 		dst_release(&rt->dst);
573 	return -1;
574 
575 err_unreach:
576 	/* The ip6_link_failure function requires the dev field to be set
577 	 * in order to get the net (which is needed further on for fwmark
578 	 * reflection).
579 	 */
580 	if (!skb->dev)
581 		skb->dev = skb_dst(skb)->dev;
582 
583 	dst_link_failure(skb);
584 	return -1;
585 }
586 #endif
587 
588 
589 /* return NF_ACCEPT to allow forwarding or other NF_xxx on error */
590 static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb,
591 					    struct ip_vs_conn *cp)
592 {
593 	int ret = NF_ACCEPT;
594 
595 	skb->ipvs_property = 1;
596 	if (unlikely(cp->flags & IP_VS_CONN_F_NFCT))
597 		ret = ip_vs_confirm_conntrack(skb);
598 	if (ret == NF_ACCEPT) {
599 		nf_reset_ct(skb);
600 		skb_forward_csum(skb);
601 		if (skb->dev)
602 			skb_clear_tstamp(skb);
603 	}
604 	return ret;
605 }
606 
607 /* In the event of a remote destination, it's possible that we would have
608  * matches against an old socket (particularly a TIME-WAIT socket). This
609  * causes havoc down the line (ip_local_out et al. expect regular sockets
610  * and invalid memory accesses will happen) so simply drop the association
611  * in this case.
612  */
613 static inline void ip_vs_drop_early_demux_sk(struct sk_buff *skb)
614 {
615 	/* If dev is set, the packet came from the LOCAL_IN callback and
616 	 * not from a local TCP socket.
617 	 */
618 	if (skb->dev)
619 		skb_orphan(skb);
620 }
621 
622 /* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */
623 static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb,
624 					 struct ip_vs_conn *cp, int local)
625 {
626 	int ret = NF_STOLEN;
627 
628 	skb->ipvs_property = 1;
629 	if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
630 		ip_vs_notrack(skb);
631 	else
632 		ip_vs_update_conntrack(skb, cp, 1);
633 
634 	/* Remove the early_demux association unless it's bound for the
635 	 * exact same port and address on this host after translation.
636 	 */
637 	if (!local || cp->vport != cp->dport ||
638 	    !ip_vs_addr_equal(cp->af, &cp->vaddr, &cp->daddr))
639 		ip_vs_drop_early_demux_sk(skb);
640 
641 	if (!local) {
642 		skb_forward_csum(skb);
643 		if (skb->dev)
644 			skb_clear_tstamp(skb);
645 		NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb,
646 			NULL, skb_dst(skb)->dev, dst_output);
647 	} else
648 		ret = NF_ACCEPT;
649 
650 	return ret;
651 }
652 
653 /* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */
654 static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb,
655 				     struct ip_vs_conn *cp, int local)
656 {
657 	int ret = NF_STOLEN;
658 
659 	skb->ipvs_property = 1;
660 	if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
661 		ip_vs_notrack(skb);
662 	if (!local) {
663 		ip_vs_drop_early_demux_sk(skb);
664 		skb_forward_csum(skb);
665 		if (skb->dev)
666 			skb_clear_tstamp(skb);
667 		NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb,
668 			NULL, skb_dst(skb)->dev, dst_output);
669 	} else
670 		ret = NF_ACCEPT;
671 	return ret;
672 }
673 
674 
675 /*
676  *      NULL transmitter (do nothing except return NF_ACCEPT)
677  */
678 int
679 ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
680 		struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
681 {
682 	/* we do not touch skb and do not need pskb ptr */
683 	return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
684 }
685 
686 
687 /*
688  *      Bypass transmitter
689  *      Let packets bypass the destination when the destination is not
690  *      available; it may only be used in a transparent cache cluster.
691  */
692 int
693 ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
694 		  struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
695 {
696 	struct iphdr  *iph = ip_hdr(skb);
697 
698 	if (__ip_vs_get_out_rt(cp->ipvs, cp->af, skb, NULL, iph->daddr,
699 			       IP_VS_RT_MODE_NON_LOCAL, NULL, ipvsh) < 0)
700 		goto tx_error;
701 
702 	ip_send_check(iph);
703 
704 	/* Another hack: avoid icmp_send in ip_fragment */
705 	skb->ignore_df = 1;
706 
707 	ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);
708 
709 	return NF_STOLEN;
710 
711  tx_error:
712 	kfree_skb(skb);
713 	return NF_STOLEN;
714 }
715 
716 #ifdef CONFIG_IP_VS_IPV6
717 int
718 ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
719 		     struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
720 {
721 	struct ipv6hdr *iph = ipv6_hdr(skb);
722 
723 	if (__ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, NULL,
724 				  &iph->daddr, NULL,
725 				  ipvsh, 0, IP_VS_RT_MODE_NON_LOCAL) < 0)
726 		goto tx_error;
727 
728 	/* Another hack: avoid icmp_send in ip_fragment */
729 	skb->ignore_df = 1;
730 
731 	ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);
732 
733 	return NF_STOLEN;
734 
735  tx_error:
736 	kfree_skb(skb);
737 	return NF_STOLEN;
738 }
739 #endif
740 
741 /*
742  *      NAT transmitter (only for outside-to-inside nat forwarding)
743  *      Not used for related ICMP
744  */
745 int
746 ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
747 	       struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
748 {
749 	struct rtable *rt;		/* Route to the other host */
750 	int local, rc, was_input;
751 
752 	/* check if it is a connection of no-client-port */
753 	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
754 		__be16 _pt, *p;
755 
756 		p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt);
757 		if (p == NULL)
758 			goto tx_error;
759 		ip_vs_conn_fill_cport(cp, *p);
760 		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
761 	}
762 
763 	was_input = rt_is_input_route(skb_rtable(skb));
764 	local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
765 				   IP_VS_RT_MODE_LOCAL |
766 				   IP_VS_RT_MODE_NON_LOCAL |
767 				   IP_VS_RT_MODE_RDR, NULL, ipvsh);
768 	if (local < 0)
769 		goto tx_error;
770 	rt = skb_rtable(skb);
771 	/*
772 	 * Avoid duplicate tuple in reply direction for NAT traffic
773 	 * to local address when connection is sync-ed
774 	 */
775 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
776 	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
777 		enum ip_conntrack_info ctinfo;
778 		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
779 
780 		if (ct) {
781 			IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, ipvsh->off,
782 					 "ip_vs_nat_xmit(): "
783 					 "stopping DNAT to local address");
784 			goto tx_error;
785 		}
786 	}
787 #endif
788 
789 	/* From world but DNAT to loopback address? */
790 	if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) {
791 		IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, ipvsh->off,
792 				 "ip_vs_nat_xmit(): stopping DNAT to loopback "
793 				 "address");
794 		goto tx_error;
795 	}
796 
797 	/* copy-on-write the packet before mangling it */
798 	if (skb_ensure_writable(skb, sizeof(struct iphdr)))
799 		goto tx_error;
800 
801 	if (skb_cow(skb, rt->dst.dev->hard_header_len))
802 		goto tx_error;
803 
804 	/* mangle the packet */
805 	if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh))
806 		goto tx_error;
807 	ip_hdr(skb)->daddr = cp->daddr.ip;
808 	ip_send_check(ip_hdr(skb));
809 
810 	IP_VS_DBG_PKT(10, AF_INET, pp, skb, ipvsh->off, "After DNAT");
811 
812 	/* FIXME: when the application helper enlarges the packet and the
813 	   length becomes larger than the MTU of the outgoing device, there
814 	   will still be an MTU problem. */
815 
816 	/* Another hack: avoid icmp_send in ip_fragment */
817 	skb->ignore_df = 1;
818 
819 	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
820 
821 	return rc;
822 
823   tx_error:
824 	kfree_skb(skb);
825 	return NF_STOLEN;
826 }
827 
828 #ifdef CONFIG_IP_VS_IPV6
829 int
830 ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
831 		  struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
832 {
833 	struct rt6_info *rt;		/* Route to the other host */
834 	int local, rc;
835 
836 	/* check if it is a connection of no-client-port */
837 	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT && !ipvsh->fragoffs)) {
838 		__be16 _pt, *p;
839 		p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt);
840 		if (p == NULL)
841 			goto tx_error;
842 		ip_vs_conn_fill_cport(cp, *p);
843 		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
844 	}
845 
846 	local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
847 				      &cp->daddr.in6,
848 				      NULL, ipvsh, 0,
849 				      IP_VS_RT_MODE_LOCAL |
850 				      IP_VS_RT_MODE_NON_LOCAL |
851 				      IP_VS_RT_MODE_RDR);
852 	if (local < 0)
853 		goto tx_error;
854 	rt = dst_rt6_info(skb_dst(skb));
855 	/*
856 	 * Avoid duplicate tuple in reply direction for NAT traffic
857 	 * to local address when connection is sync-ed
858 	 */
859 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
860 	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
861 		enum ip_conntrack_info ctinfo;
862 		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
863 
864 		if (ct) {
865 			IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, ipvsh->off,
866 					 "ip_vs_nat_xmit_v6(): "
867 					 "stopping DNAT to local address");
868 			goto tx_error;
869 		}
870 	}
871 #endif
872 
873 	/* From world but DNAT to loopback address? */
874 	if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
875 	    ipv6_addr_type(&cp->daddr.in6) & IPV6_ADDR_LOOPBACK) {
876 		IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, ipvsh->off,
877 				 "ip_vs_nat_xmit_v6(): "
878 				 "stopping DNAT to loopback address");
879 		goto tx_error;
880 	}
881 
882 	/* copy-on-write the packet before mangling it */
883 	if (skb_ensure_writable(skb, sizeof(struct ipv6hdr)))
884 		goto tx_error;
885 
886 	if (skb_cow(skb, rt->dst.dev->hard_header_len))
887 		goto tx_error;
888 
889 	/* mangle the packet */
890 	if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh))
891 		goto tx_error;
892 	ipv6_hdr(skb)->daddr = cp->daddr.in6;
893 
894 	IP_VS_DBG_PKT(10, AF_INET6, pp, skb, ipvsh->off, "After DNAT");
895 
896 	/* FIXME: when the application helper enlarges the packet and the
897 	   length becomes larger than the MTU of the outgoing device, there
898 	   will still be an MTU problem. */
899 
900 	/* Another hack: avoid icmp_send in ip_fragment */
901 	skb->ignore_df = 1;
902 
903 	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);
904 
905 	return rc;
906 
907 tx_error:
908 	kfree_skb(skb);
909 	return NF_STOLEN;
910 }
911 #endif
912 
913 /* When forwarding a packet, we must ensure that we've got enough headroom
914  * for the encapsulation packet in the skb.  This also gives us an
915  * opportunity to figure out what the payload_len, dsfield, ttl, and df
916  * values should be, so that we won't need to look at the old ip header
917  * again.
918  */
919 static struct sk_buff *
920 ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af,
921 			   unsigned int max_headroom, __u8 *next_protocol,
922 			   __u32 *payload_len, __u8 *dsfield, __u8 *ttl,
923 			   __be16 *df)
924 {
925 	struct sk_buff *new_skb = NULL;
926 	struct iphdr *old_iph = NULL;
927 	__u8 old_dsfield;
928 #ifdef CONFIG_IP_VS_IPV6
929 	struct ipv6hdr *old_ipv6h = NULL;
930 #endif
931 
932 	ip_vs_drop_early_demux_sk(skb);
933 
934 	if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) {
935 		new_skb = skb_realloc_headroom(skb, max_headroom);
936 		if (!new_skb)
937 			goto error;
938 		if (skb->sk)
939 			skb_set_owner_w(new_skb, skb->sk);
940 		consume_skb(skb);
941 		skb = new_skb;
942 	}
943 
944 #ifdef CONFIG_IP_VS_IPV6
945 	if (skb_af == AF_INET6) {
946 		old_ipv6h = ipv6_hdr(skb);
947 		*next_protocol = IPPROTO_IPV6;
948 		if (payload_len)
949 			*payload_len =
950 				ntohs(old_ipv6h->payload_len) +
951 				sizeof(*old_ipv6h);
952 		old_dsfield = ipv6_get_dsfield(old_ipv6h);
953 		*ttl = old_ipv6h->hop_limit;
954 		if (df)
955 			*df = 0;
956 	} else
957 #endif
958 	{
959 		old_iph = ip_hdr(skb);
960 		/* Copy DF, reset fragment offset and MF */
961 		if (df)
962 			*df = (old_iph->frag_off & htons(IP_DF));
963 		*next_protocol = IPPROTO_IPIP;
964 
965 		/* fix old IP header checksum */
966 		ip_send_check(old_iph);
967 		old_dsfield = ipv4_get_dsfield(old_iph);
968 		*ttl = old_iph->ttl;
969 		if (payload_len)
970 			*payload_len = skb_ip_totlen(skb);
971 	}
972 
973 	/* Implement full-functionality option for ECN encapsulation */
974 	*dsfield = INET_ECN_encapsulate(old_dsfield, old_dsfield);
975 
976 	return skb;
977 error:
978 	kfree_skb(skb);
979 	return ERR_PTR(-ENOMEM);
980 }
981 
982 static inline int __tun_gso_type_mask(int encaps_af, int orig_af)
983 {
984 	switch (encaps_af) {
985 	case AF_INET:
986 		return SKB_GSO_IPXIP4;
987 	case AF_INET6:
988 		return SKB_GSO_IPXIP6;
989 	default:
990 		return 0;
991 	}
992 }
993 
994 static int
995 ipvs_gue_encap(struct net *net, struct sk_buff *skb,
996 	       struct ip_vs_conn *cp, __u8 *next_protocol)
997 {
998 	__be16 dport;
999 	__be16 sport = udp_flow_src_port(net, skb, 0, 0, false);
1000 	struct udphdr  *udph;	/* Our new UDP header */
1001 	struct guehdr  *gueh;	/* Our new GUE header */
1002 	size_t hdrlen, optlen = 0;
1003 	void *data;
1004 	bool need_priv = false;
1005 
1006 	if ((cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
1007 	    skb->ip_summed == CHECKSUM_PARTIAL) {
1008 		optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
1009 		need_priv = true;
1010 	}
1011 
1012 	hdrlen = sizeof(struct guehdr) + optlen;
1013 
1014 	skb_push(skb, hdrlen);
1015 
1016 	gueh = (struct guehdr *)skb->data;
1017 
1018 	gueh->control = 0;
1019 	gueh->version = 0;
1020 	gueh->hlen = optlen >> 2;
1021 	gueh->flags = 0;
1022 	gueh->proto_ctype = *next_protocol;
1023 
1024 	data = &gueh[1];
1025 
1026 	if (need_priv) {
1027 		__be32 *flags = data;
1028 		u16 csum_start = skb_checksum_start_offset(skb);
1029 		__be16 *pd;
1030 
1031 		gueh->flags |= GUE_FLAG_PRIV;
1032 		*flags = 0;
1033 		data += GUE_LEN_PRIV;
1034 
1035 		if (csum_start < hdrlen)
1036 			return -EINVAL;
1037 
1038 		csum_start -= hdrlen;
1039 		pd = data;
1040 		pd[0] = htons(csum_start);
1041 		pd[1] = htons(csum_start + skb->csum_offset);
1042 
1043 		if (!skb_is_gso(skb)) {
1044 			skb->ip_summed = CHECKSUM_NONE;
1045 			skb->encapsulation = 0;
1046 		}
1047 
1048 		*flags |= GUE_PFLAG_REMCSUM;
1049 		data += GUE_PLEN_REMCSUM;
1050 	}
1051 
1052 	skb_push(skb, sizeof(struct udphdr));
1053 	skb_reset_transport_header(skb);
1054 
1055 	udph = udp_hdr(skb);
1056 
1057 	dport = cp->dest->tun_port;
1058 	udph->dest = dport;
1059 	udph->source = sport;
1060 	udph->len = htons(skb->len);
1061 	udph->check = 0;
1062 
1063 	*next_protocol = IPPROTO_UDP;
1064 
1065 	return 0;
1066 }
1067 
1068 static void
1069 ipvs_gre_encap(struct net *net, struct sk_buff *skb,
1070 	       struct ip_vs_conn *cp, __u8 *next_protocol)
1071 {
1072 	__be16 proto = *next_protocol == IPPROTO_IPIP ?
1073 				htons(ETH_P_IP) : htons(ETH_P_IPV6);
1074 	IP_TUNNEL_DECLARE_FLAGS(tflags) = { };
1075 	size_t hdrlen;
1076 
1077 	if (cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
1078 		__set_bit(IP_TUNNEL_CSUM_BIT, tflags);
1079 
1080 	hdrlen = gre_calc_hlen(tflags);
1081 	gre_build_header(skb, hdrlen, tflags, proto, 0, 0);
1082 
1083 	*next_protocol = IPPROTO_GRE;
1084 }
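/* Resulting encapsulation layouts (added commentary), outermost first; the
 * outer IP/IPv6 header itself is pushed later by the tunnel transmitters:
 *
 *	GUE: outer IP | UDP (dest = cp->dest->tun_port) | GUE header
 *	     (+ optional remote-checksum private data)  | inner packet
 *	GRE: outer IP | GRE header (+ optional checksum) | inner packet
 */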
1085 
1086 /*
1087  *   IP Tunneling transmitter
1088  *
1089  *   This function encapsulates the packet in a new IP packet, its
1090  *   destination will be set to cp->daddr. Most code of this function
1091  *   is taken from ipip.c.
1092  *
1093  *   It is used in VS/TUN cluster. The load balancer selects a real
1094  *   server from a cluster based on a scheduling algorithm,
1095  *   encapsulates the request packet and forwards it to the selected
1096  *   server. For example, all real servers are configured with
1097  *   "ifconfig tunl0 <Virtual IP Address> up". When the server receives
1098  *   the encapsulated packet, it will decapsulate the packet, process
1099  *   the request and return the response packets directly to the client
1100  *   without passing through the load balancer. This can greatly
1101  *   increase the scalability of the virtual server.
1102  *
1103  *   Used for ANY protocol
1104  */
1105 int
1106 ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
1107 		  struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
1108 {
1109 	struct netns_ipvs *ipvs = cp->ipvs;
1110 	struct net *net = ipvs->net;
1111 	struct rtable *rt;			/* Route to the other host */
1112 	__be32 saddr;				/* Source for tunnel */
1113 	struct net_device *tdev;		/* Device to other host */
1114 	__u8 next_protocol = 0;
1115 	__u8 dsfield = 0;
1116 	__u8 ttl = 0;
1117 	__be16 df = 0;
1118 	__be16 *dfp = NULL;
1119 	struct iphdr  *iph;			/* Our new IP header */
1120 	unsigned int max_headroom;		/* The extra header space needed */
1121 	int ret, local;
1122 	int tun_type, gso_type;
1123 	int tun_flags;
1124 
1125 	local = __ip_vs_get_out_rt(ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
1126 				   IP_VS_RT_MODE_LOCAL |
1127 				   IP_VS_RT_MODE_NON_LOCAL |
1128 				   IP_VS_RT_MODE_CONNECT |
1129 				   IP_VS_RT_MODE_TUNNEL, &saddr, ipvsh);
1130 	if (local < 0)
1131 		goto tx_error;
1132 	if (local)
1133 		return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
1134 
1135 	rt = skb_rtable(skb);
1136 	tdev = rt->dst.dev;
1137 
1138 	/*
1139 	 * Okay, now see if we can stuff it in the buffer as-is.
1140 	 */
1141 	max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);
1142 
1143 	tun_type = cp->dest->tun_type;
1144 	tun_flags = cp->dest->tun_flags;
1145 
1146 	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
1147 		size_t gue_hdrlen, gue_optlen = 0;
1148 
1149 		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
1150 		    skb->ip_summed == CHECKSUM_PARTIAL) {
1151 			gue_optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
1152 		}
1153 		gue_hdrlen = sizeof(struct guehdr) + gue_optlen;
1154 
1155 		max_headroom += sizeof(struct udphdr) + gue_hdrlen;
1156 	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
1157 		IP_TUNNEL_DECLARE_FLAGS(tflags) = { };
1158 		size_t gre_hdrlen;
1159 
1160 		if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
1161 			__set_bit(IP_TUNNEL_CSUM_BIT, tflags);
1162 		gre_hdrlen = gre_calc_hlen(tflags);
1163 
1164 		max_headroom += gre_hdrlen;
1165 	}
1166 
1167 	/* We only care about the df field if sysctl_pmtu_disc(ipvs) is set */
1168 	dfp = sysctl_pmtu_disc(ipvs) ? &df : NULL;
1169 	skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom,
1170 					 &next_protocol, NULL, &dsfield,
1171 					 &ttl, dfp);
1172 	if (IS_ERR(skb))
1173 		return NF_STOLEN;
1174 
1175 	gso_type = __tun_gso_type_mask(AF_INET, cp->af);
1176 	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
1177 		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
1178 		    (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
1179 			gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
1180 		else
1181 			gso_type |= SKB_GSO_UDP_TUNNEL;
1182 		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
1183 		    skb->ip_summed == CHECKSUM_PARTIAL) {
1184 			gso_type |= SKB_GSO_TUNNEL_REMCSUM;
1185 		}
1186 	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
1187 		if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
1188 			gso_type |= SKB_GSO_GRE_CSUM;
1189 		else
1190 			gso_type |= SKB_GSO_GRE;
1191 	}
1192 
1193 	if (iptunnel_handle_offloads(skb, gso_type))
1194 		goto tx_error;
1195 
1196 	skb->transport_header = skb->network_header;
1197 
1198 	skb_set_inner_ipproto(skb, next_protocol);
1199 	skb_set_inner_mac_header(skb, skb_inner_network_offset(skb));
1200 
1201 	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
1202 		bool check = false;
1203 
1204 		if (ipvs_gue_encap(net, skb, cp, &next_protocol))
1205 			goto tx_error;
1206 
1207 		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
1208 		    (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
1209 			check = true;
1210 
1211 		udp_set_csum(!check, skb, saddr, cp->daddr.ip, skb->len);
1212 	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE)
1213 		ipvs_gre_encap(net, skb, cp, &next_protocol);
1214 
1215 	skb_push(skb, sizeof(struct iphdr));
1216 	skb_reset_network_header(skb);
1217 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
1218 
1219 	/*
1220 	 *	Push down and install the IPIP header.
1221 	 */
1222 	iph			=	ip_hdr(skb);
1223 	iph->version		=	4;
1224 	iph->ihl		=	sizeof(struct iphdr)>>2;
1225 	iph->frag_off		=	df;
1226 	iph->protocol		=	next_protocol;
1227 	iph->tos		=	dsfield;
1228 	iph->daddr		=	cp->daddr.ip;
1229 	iph->saddr		=	saddr;
1230 	iph->ttl		=	ttl;
1231 	ip_select_ident(net, skb, NULL);
1232 
1233 	/* Another hack: avoid icmp_send in ip_fragment */
1234 	skb->ignore_df = 1;
1235 
1236 	ret = ip_vs_tunnel_xmit_prepare(skb, cp);
1237 	if (ret == NF_ACCEPT)
1238 		ip_local_out(net, skb->sk, skb);
1239 	else if (ret == NF_DROP)
1240 		kfree_skb(skb);
1241 
1242 	return NF_STOLEN;
1243 
1244   tx_error:
1245 	kfree_skb(skb);
1246 	return NF_STOLEN;
1247 }
1248 
1249 #ifdef CONFIG_IP_VS_IPV6
1250 int
1251 ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
1252 		     struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
1253 {
1254 	struct netns_ipvs *ipvs = cp->ipvs;
1255 	struct net *net = ipvs->net;
1256 	struct rt6_info *rt;		/* Route to the other host */
1257 	struct in6_addr saddr;		/* Source for tunnel */
1258 	struct net_device *tdev;	/* Device to other host */
1259 	__u8 next_protocol = 0;
1260 	__u32 payload_len = 0;
1261 	__u8 dsfield = 0;
1262 	__u8 ttl = 0;
1263 	struct ipv6hdr  *iph;		/* Our new IP header */
1264 	unsigned int max_headroom;	/* The extra header space needed */
1265 	int ret, local;
1266 	int tun_type, gso_type;
1267 	int tun_flags;
1268 
1269 	local = __ip_vs_get_out_rt_v6(ipvs, cp->af, skb, cp->dest,
1270 				      &cp->daddr.in6,
1271 				      &saddr, ipvsh, 1,
1272 				      IP_VS_RT_MODE_LOCAL |
1273 				      IP_VS_RT_MODE_NON_LOCAL |
1274 				      IP_VS_RT_MODE_TUNNEL);
1275 	if (local < 0)
1276 		goto tx_error;
1277 	if (local)
1278 		return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);
1279 
1280 	rt = dst_rt6_info(skb_dst(skb));
1281 	tdev = rt->dst.dev;
1282 
1283 	/*
1284 	 * Okay, now see if we can stuff it in the buffer as-is.
1285 	 */
1286 	max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr);
1287 
1288 	tun_type = cp->dest->tun_type;
1289 	tun_flags = cp->dest->tun_flags;
1290 
1291 	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
1292 		size_t gue_hdrlen, gue_optlen = 0;
1293 
1294 		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
1295 		    skb->ip_summed == CHECKSUM_PARTIAL) {
1296 			gue_optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
1297 		}
1298 		gue_hdrlen = sizeof(struct guehdr) + gue_optlen;
1299 
1300 		max_headroom += sizeof(struct udphdr) + gue_hdrlen;
1301 	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
1302 		IP_TUNNEL_DECLARE_FLAGS(tflags) = { };
1303 		size_t gre_hdrlen;
1304 
1305 		if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
1306 			__set_bit(IP_TUNNEL_CSUM_BIT, tflags);
1307 		gre_hdrlen = gre_calc_hlen(tflags);
1308 
1309 		max_headroom += gre_hdrlen;
1310 	}
1311 
1312 	skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom,
1313 					 &next_protocol, &payload_len,
1314 					 &dsfield, &ttl, NULL);
1315 	if (IS_ERR(skb))
1316 		return NF_STOLEN;
1317 
1318 	gso_type = __tun_gso_type_mask(AF_INET6, cp->af);
1319 	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
1320 		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
1321 		    (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
1322 			gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
1323 		else
1324 			gso_type |= SKB_GSO_UDP_TUNNEL;
1325 		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
1326 		    skb->ip_summed == CHECKSUM_PARTIAL) {
1327 			gso_type |= SKB_GSO_TUNNEL_REMCSUM;
1328 		}
1329 	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
1330 		if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
1331 			gso_type |= SKB_GSO_GRE_CSUM;
1332 		else
1333 			gso_type |= SKB_GSO_GRE;
1334 	}
1335 
1336 	if (iptunnel_handle_offloads(skb, gso_type))
1337 		goto tx_error;
1338 
1339 	skb->transport_header = skb->network_header;
1340 
1341 	skb_set_inner_ipproto(skb, next_protocol);
1342 	skb_set_inner_mac_header(skb, skb_inner_network_offset(skb));
1343 
1344 	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
1345 		bool check = false;
1346 
1347 		if (ipvs_gue_encap(net, skb, cp, &next_protocol))
1348 			goto tx_error;
1349 
1350 		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
1351 		    (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
1352 			check = true;
1353 
1354 		udp6_set_csum(!check, skb, &saddr, &cp->daddr.in6, skb->len);
1355 	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE)
1356 		ipvs_gre_encap(net, skb, cp, &next_protocol);
1357 
1358 	skb_push(skb, sizeof(struct ipv6hdr));
1359 	skb_reset_network_header(skb);
1360 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
1361 
1362 	/*
1363 	 *	Push down and install the IPIP header.
1364 	 */
1365 	iph			=	ipv6_hdr(skb);
1366 	iph->version		=	6;
1367 	iph->nexthdr		=	next_protocol;
1368 	iph->payload_len	=	htons(payload_len);
1369 	memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl));
1370 	ipv6_change_dsfield(iph, 0, dsfield);
1371 	iph->daddr = cp->daddr.in6;
1372 	iph->saddr = saddr;
1373 	iph->hop_limit		=	ttl;
1374 
1375 	/* Another hack: avoid icmp_send in ip_fragment */
1376 	skb->ignore_df = 1;
1377 
1378 	ret = ip_vs_tunnel_xmit_prepare(skb, cp);
1379 	if (ret == NF_ACCEPT)
1380 		ip6_local_out(net, skb->sk, skb);
1381 	else if (ret == NF_DROP)
1382 		kfree_skb(skb);
1383 
1384 	return NF_STOLEN;
1385 
1386 tx_error:
1387 	kfree_skb(skb);
1388 	return NF_STOLEN;
1389 }
1390 #endif
1391 
1392 
1393 /*
1394  *      Direct Routing transmitter
1395  *      Used for ANY protocol
1396  */
1397 int
1398 ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
1399 	      struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
1400 {
1401 	int local;
1402 
1403 	local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
1404 				   IP_VS_RT_MODE_LOCAL |
1405 				   IP_VS_RT_MODE_NON_LOCAL |
1406 				   IP_VS_RT_MODE_KNOWN_NH, NULL, ipvsh);
1407 	if (local < 0)
1408 		goto tx_error;
1409 	if (local)
1410 		return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
1411 
1412 	ip_send_check(ip_hdr(skb));
1413 
1414 	/* Another hack: avoid icmp_send in ip_fragment */
1415 	skb->ignore_df = 1;
1416 
1417 	ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);
1418 
1419 	return NF_STOLEN;
1420 
1421   tx_error:
1422 	kfree_skb(skb);
1423 	return NF_STOLEN;
1424 }
1425 
1426 #ifdef CONFIG_IP_VS_IPV6
1427 int
1428 ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
1429 		 struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
1430 {
1431 	int local;
1432 
1433 	local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
1434 				      &cp->daddr.in6,
1435 				      NULL, ipvsh, 0,
1436 				      IP_VS_RT_MODE_LOCAL |
1437 				      IP_VS_RT_MODE_NON_LOCAL |
1438 				      IP_VS_RT_MODE_KNOWN_NH);
1439 	if (local < 0)
1440 		goto tx_error;
1441 	if (local)
1442 		return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);
1443 
1444 	/* Another hack: avoid icmp_send in ip_fragment */
1445 	skb->ignore_df = 1;
1446 
1447 	ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);
1448 
1449 	return NF_STOLEN;
1450 
1451 tx_error:
1452 	kfree_skb(skb);
1453 	return NF_STOLEN;
1454 }
1455 #endif
1456 
1457 
1458 /*
1459  *	ICMP packet transmitter
1460  *	called by the ip_vs_in_icmp
1461  */
1462 int
1463 ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
1464 		struct ip_vs_protocol *pp, int offset, unsigned int hooknum,
1465 		struct ip_vs_iphdr *iph)
1466 {
1467 	struct rtable	*rt;	/* Route to the other host */
1468 	int rc;
1469 	int local;
1470 	int rt_mode, was_input;
1471 
1472 	/* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
1473 	   forwarded directly here, because there is no need to
1474 	   translate address/port back */
1475 	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
1476 		if (cp->packet_xmit)
1477 			rc = cp->packet_xmit(skb, cp, pp, iph);
1478 		else
1479 			rc = NF_ACCEPT;
1480 		/* do not touch skb anymore */
1481 		atomic_inc(&cp->in_pkts);
1482 		return rc;
1483 	}
1484 
1485 	/*
1486 	 * mangle and send the packet here (only for VS/NAT)
1487 	 */
1488 	was_input = rt_is_input_route(skb_rtable(skb));
1489 
1490 	/* LOCALNODE from FORWARD hook is not supported */
1491 	rt_mode = (hooknum != NF_INET_FORWARD) ?
1492 		  IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
1493 		  IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
1494 	local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip, rt_mode,
1495 				   NULL, iph);
1496 	if (local < 0)
1497 		goto tx_error;
1498 	rt = skb_rtable(skb);
1499 
1500 	/*
1501 	 * Avoid duplicate tuple in reply direction for NAT traffic
1502 	 * to local address when connection is sync-ed
1503 	 */
1504 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
1505 	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
1506 		enum ip_conntrack_info ctinfo;
1507 		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
1508 
1509 		if (ct) {
1510 			IP_VS_DBG(10, "%s(): "
1511 				  "stopping DNAT to local address %pI4\n",
1512 				  __func__, &cp->daddr.ip);
1513 			goto tx_error;
1514 		}
1515 	}
1516 #endif
1517 
1518 	/* From world but DNAT to loopback address? */
1519 	if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) {
1520 		IP_VS_DBG(1, "%s(): "
1521 			  "stopping DNAT to loopback %pI4\n",
1522 			  __func__, &cp->daddr.ip);
1523 		goto tx_error;
1524 	}
1525 
1526 	/* copy-on-write the packet before mangling it */
1527 	if (skb_ensure_writable(skb, offset))
1528 		goto tx_error;
1529 
1530 	if (skb_cow(skb, rt->dst.dev->hard_header_len))
1531 		goto tx_error;
1532 
1533 	ip_vs_nat_icmp(skb, pp, cp, 0);
1534 
1535 	/* Another hack: avoid icmp_send in ip_fragment */
1536 	skb->ignore_df = 1;
1537 
1538 	return ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
1539 
1540   tx_error:
1541 	kfree_skb(skb);
1542 	rc = NF_STOLEN;
1543 	return rc;
1544 }
1545 
1546 #ifdef CONFIG_IP_VS_IPV6
1547 int
1548 ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
1549 		struct ip_vs_protocol *pp, int offset, unsigned int hooknum,
1550 		struct ip_vs_iphdr *ipvsh)
1551 {
1552 	struct rt6_info	*rt;	/* Route to the other host */
1553 	int rc;
1554 	int local;
1555 	int rt_mode;
1556 
1557 	/* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
1558 	   forwarded directly here, because there is no need to
1559 	   translate address/port back */
1560 	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
1561 		if (cp->packet_xmit)
1562 			rc = cp->packet_xmit(skb, cp, pp, ipvsh);
1563 		else
1564 			rc = NF_ACCEPT;
1565 		/* do not touch skb anymore */
1566 		atomic_inc(&cp->in_pkts);
1567 		return rc;
1568 	}
1569 
1570 	/*
1571 	 * mangle and send the packet here (only for VS/NAT)
1572 	 */
1573 
1574 	/* LOCALNODE from FORWARD hook is not supported */
1575 	rt_mode = (hooknum != NF_INET_FORWARD) ?
1576 		  IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
1577 		  IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
1578 	local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
1579 				      &cp->daddr.in6, NULL, ipvsh, 0, rt_mode);
1580 	if (local < 0)
1581 		goto tx_error;
1582 	rt = dst_rt6_info(skb_dst(skb));
1583 	/*
1584 	 * Avoid duplicate tuple in reply direction for NAT traffic
1585 	 * to local address when connection is sync-ed
1586 	 */
1587 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
1588 	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
1589 		enum ip_conntrack_info ctinfo;
1590 		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
1591 
1592 		if (ct) {
1593 			IP_VS_DBG(10, "%s(): "
1594 				  "stopping DNAT to local address %pI6\n",
1595 				  __func__, &cp->daddr.in6);
1596 			goto tx_error;
1597 		}
1598 	}
1599 #endif
1600 
1601 	/* From world but DNAT to loopback address? */
1602 	if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
1603 	    ipv6_addr_type(&cp->daddr.in6) & IPV6_ADDR_LOOPBACK) {
1604 		IP_VS_DBG(1, "%s(): "
1605 			  "stopping DNAT to loopback %pI6\n",
1606 			  __func__, &cp->daddr.in6);
1607 		goto tx_error;
1608 	}
1609 
1610 	/* copy-on-write the packet before mangling it */
1611 	if (skb_ensure_writable(skb, offset))
1612 		goto tx_error;
1613 
1614 	if (skb_cow(skb, rt->dst.dev->hard_header_len))
1615 		goto tx_error;
1616 
1617 	ip_vs_nat_icmp_v6(skb, pp, cp, 0);
1618 
1619 	/* Another hack: avoid icmp_send in ip_fragment */
1620 	skb->ignore_df = 1;
1621 
1622 	return ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);
1623 
1624 tx_error:
1625 	kfree_skb(skb);
1626 	rc = NF_STOLEN;
1627 	return rc;
1628 }
1629 #endif
1630