// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * ip_vs_xmit.c: various packet transmitters for IPVS
 *
 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
 *              Julian Anastasov <ja@ssi.bg>
 *
 * Changes:
 *
 * Description of forwarding methods:
 * - all transmitters are called from LOCAL_IN (remote clients) and
 *   LOCAL_OUT (local clients) but for ICMP can be called from FORWARD
 * - not all connections have a destination server, for example,
 *   connections in the backup server when fwmark is used
 * - bypass connections use daddr from the packet
 * - we can use dst without a ref while sending in an RCU section, we take
 *   a ref when returning NF_ACCEPT for a NAT-ed packet via loopback
 * LOCAL_OUT rules:
 * - skb->dev is NULL, skb->protocol is not set (both are set in POST_ROUTING)
 * - skb->pkt_type is not set yet
 * - the only place where we can see skb->sk != NULL
 */
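
/* Editor's aside (not part of the original file): the transmitters below are
 * not called directly; each connection's cp->packet_xmit is bound to one of
 * them according to the forwarding method (see ip_vs_bind_xmit() in
 * ip_vs_conn.c). A minimal sketch of that dispatch, assuming the usual
 * method flags; the exact upstream body may differ:
 */
#if 0	/* illustrative sketch only */
static inline void example_bind_xmit(struct ip_vs_conn *cp)
{
	switch (IP_VS_FWD_METHOD(cp)) {
	case IP_VS_CONN_F_MASQ:		/* NAT */
		cp->packet_xmit = ip_vs_nat_xmit;
		break;
	case IP_VS_CONN_F_TUNNEL:	/* VS/TUN */
		cp->packet_xmit = ip_vs_tunnel_xmit;
		break;
	case IP_VS_CONN_F_DROUTE:	/* VS/DR */
		cp->packet_xmit = ip_vs_dr_xmit;
		break;
	case IP_VS_CONN_F_LOCALNODE:	/* deliver locally */
		cp->packet_xmit = ip_vs_null_xmit;
		break;
	case IP_VS_CONN_F_BYPASS:	/* cache bypass */
		cp->packet_xmit = ip_vs_bypass_xmit;
		break;
	}
}
#endif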

#define KMSG_COMPONENT "IPVS"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/tcp.h>			/* for tcphdr */
#include <net/ip.h>
#include <net/gue.h>
#include <net/gre.h>
#include <net/tcp.h>			/* for csum_tcpudp_magic */
#include <net/udp.h>
#include <net/icmp.h>			/* for icmp_send */
#include <net/route.h>			/* for ip_route_output */
#include <net/ipv6.h>
#include <net/ip6_route.h>
#include <net/ip_tunnels.h>
#include <net/ip6_checksum.h>
#include <net/addrconf.h>
#include <linux/icmpv6.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>

#include <net/ip_vs.h>

enum {
	IP_VS_RT_MODE_LOCAL	= 1, /* Allow local dest */
	IP_VS_RT_MODE_NON_LOCAL	= 2, /* Allow non-local dest */
	IP_VS_RT_MODE_RDR	= 4, /* Allow redirect from remote daddr to
				      * local
				      */
	IP_VS_RT_MODE_CONNECT	= 8, /* Always bind route to saddr */
	IP_VS_RT_MODE_KNOWN_NH	= 16,/* Route via remote addr */
	IP_VS_RT_MODE_TUNNEL	= 32,/* Tunnel mode */
};
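
/* Editor's aside: callers OR these bits together to state which kinds of
 * route __ip_vs_get_out_rt*() may accept. For example, the NAT transmitter
 * below passes IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
 * IP_VS_RT_MODE_RDR, while the tunnel transmitter requests
 * IP_VS_RT_MODE_CONNECT and IP_VS_RT_MODE_TUNNEL as well.
 */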

static inline struct ip_vs_dest_dst *ip_vs_dest_dst_alloc(void)
{
	return kmalloc(sizeof(struct ip_vs_dest_dst), GFP_ATOMIC);
}

static inline void ip_vs_dest_dst_free(struct ip_vs_dest_dst *dest_dst)
{
	kfree(dest_dst);
}

/*
 *      Destination cache to speed up outgoing route lookup
 */
static inline void
__ip_vs_dst_set(struct ip_vs_dest *dest, struct ip_vs_dest_dst *dest_dst,
		struct dst_entry *dst, u32 dst_cookie)
{
	struct ip_vs_dest_dst *old;

	old = rcu_dereference_protected(dest->dest_dst,
					lockdep_is_held(&dest->dst_lock));

	if (dest_dst) {
		dest_dst->dst_cache = dst;
		dest_dst->dst_cookie = dst_cookie;
	}
	rcu_assign_pointer(dest->dest_dst, dest_dst);

	if (old)
		call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free);
}

static inline struct ip_vs_dest_dst *
__ip_vs_dst_check(struct ip_vs_dest *dest)
{
	struct ip_vs_dest_dst *dest_dst = rcu_dereference(dest->dest_dst);
	struct dst_entry *dst;

	if (!dest_dst)
		return NULL;
	dst = dest_dst->dst_cache;
	if (dst->obsolete &&
	    dst->ops->check(dst, dest_dst->dst_cookie) == NULL)
		return NULL;
	return dest_dst;
}
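
/* Editor's aside: the dest->dest_dst cache follows the usual RCU pattern
 * visible above: writers publish under dest->dst_lock via __ip_vs_dst_set()
 * (rcu_assign_pointer plus call_rcu for the old entry), while readers such
 * as __ip_vs_dst_check() run under the RCU read lock only and revalidate
 * the cached dst with dst->ops->check() and the saved cookie.
 */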

static inline bool
__mtu_check_toobig_v6(const struct sk_buff *skb, u32 mtu)
{
	if (IP6CB(skb)->frag_max_size) {
		/* frag_max_size tells us that this packet has been
		 * defragmented by the netfilter IPv6 conntrack module.
		 */
		if (IP6CB(skb)->frag_max_size > mtu)
			return true; /* largest fragment violates MTU */
	}
	else if (skb->len > mtu && !skb_is_gso(skb)) {
		return true; /* packet size violates MTU */
	}
	return false;
}
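
/* Editor's aside, a worked example of the check above (numbers are
 * illustrative): a packet reassembled by conntrack from 1500-byte fragments
 * has frag_max_size == 1500; against a tunnel-reduced MTU of 1480 the check
 * returns true, because the refragmented pieces would not fit even when
 * skb->len itself is below the MTU.
 */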

/* Get route to daddr, optionally bind route to saddr */
static struct rtable *do_output_route4(struct net *net, __be32 daddr,
				       int rt_mode, __be32 *ret_saddr)
{
	struct flowi4 fl4;
	struct rtable *rt;

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = daddr;
	fl4.flowi4_flags = (rt_mode & IP_VS_RT_MODE_KNOWN_NH) ?
			   FLOWI_FLAG_KNOWN_NH : 0;

retry:
	rt = ip_route_output_key(net, &fl4);
	if (IS_ERR(rt)) {
		IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &daddr);
		return NULL;
	}
	if (rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) {
		/* The first lookup selected a source address; redo the
		 * lookup with the route bound to that saddr.
		 */
		ip_rt_put(rt);
		flowi4_update_output(&fl4, 0, daddr, fl4.saddr);
		rt_mode = 0;
		goto retry;
	}
	if (ret_saddr)
		*ret_saddr = fl4.saddr;
	return rt;
}

#ifdef CONFIG_IP_VS_IPV6
static inline int __ip_vs_is_local_route6(struct rt6_info *rt)
{
	return rt->dst.dev && rt->dst.dev->flags & IFF_LOOPBACK;
}
#endif

static inline bool crosses_local_route_boundary(int skb_af, struct sk_buff *skb,
						int rt_mode,
						bool new_rt_is_local)
{
	bool rt_mode_allow_local = !!(rt_mode & IP_VS_RT_MODE_LOCAL);
	bool rt_mode_allow_non_local = !!(rt_mode & IP_VS_RT_MODE_NON_LOCAL);
	bool rt_mode_allow_redirect = !!(rt_mode & IP_VS_RT_MODE_RDR);
	bool source_is_loopback;
	bool old_rt_is_local;

#ifdef CONFIG_IP_VS_IPV6
	if (skb_af == AF_INET6) {
		int addr_type = ipv6_addr_type(&ipv6_hdr(skb)->saddr);

		source_is_loopback =
			(!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
			(addr_type & IPV6_ADDR_LOOPBACK);
		old_rt_is_local = __ip_vs_is_local_route6(
			dst_rt6_info(skb_dst(skb)));
	} else
#endif
	{
		source_is_loopback = ipv4_is_loopback(ip_hdr(skb)->saddr);
		old_rt_is_local = skb_rtable(skb)->rt_flags & RTCF_LOCAL;
	}

	if (unlikely(new_rt_is_local)) {
		if (!rt_mode_allow_local)
			return true;
		if (!rt_mode_allow_redirect && !old_rt_is_local)
			return true;
	} else {
		if (!rt_mode_allow_non_local)
			return true;
		if (source_is_loopback)
			return true;
	}
	return false;
}
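
/* Editor's aside, summarizing the boundary check above: a route to a local
 * address is rejected unless IP_VS_RT_MODE_LOCAL is set, and a redirect
 * from a remote daddr to a local one additionally needs IP_VS_RT_MODE_RDR;
 * a route to a non-local address is rejected unless IP_VS_RT_MODE_NON_LOCAL
 * is set, and never allowed with a loopback source, since such packets must
 * not leave the host.
 */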

static inline void maybe_update_pmtu(int skb_af, struct sk_buff *skb, int mtu)
{
	struct sock *sk = skb->sk;
	struct rtable *ort = skb_rtable(skb);

	if (!skb->dev && sk && sk_fullsock(sk))
		ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu, true);
}

static inline bool ensure_mtu_is_adequate(struct netns_ipvs *ipvs, int skb_af,
					  int rt_mode,
					  struct ip_vs_iphdr *ipvsh,
					  struct sk_buff *skb, int mtu)
{
#ifdef CONFIG_IP_VS_IPV6
	if (skb_af == AF_INET6) {
		struct net *net = ipvs->net;

		if (unlikely(__mtu_check_toobig_v6(skb, mtu))) {
			if (!skb->dev)
				skb->dev = net->loopback_dev;
			/* only send ICMP too big on first fragment */
			if (!ipvsh->fragoffs && !ip_vs_iph_icmp(ipvsh))
				icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			IP_VS_DBG(1, "frag needed for %pI6c\n",
				  &ipv6_hdr(skb)->saddr);
			return false;
		}
	} else
#endif
	{
		/* If we're going to tunnel the packet and pmtu discovery
		 * is disabled, we'll just fragment it anyway
		 */
		if ((rt_mode & IP_VS_RT_MODE_TUNNEL) && !sysctl_pmtu_disc(ipvs))
			return true;

		if (unlikely(ip_hdr(skb)->frag_off & htons(IP_DF) &&
			     skb->len > mtu && !skb_is_gso(skb) &&
			     !ip_vs_iph_icmp(ipvsh))) {
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
				  htonl(mtu));
			IP_VS_DBG(1, "frag needed for %pI4\n",
				  &ip_hdr(skb)->saddr);
			return false;
		}
	}

	return true;
}

static inline bool decrement_ttl(struct netns_ipvs *ipvs,
				 int skb_af,
				 struct sk_buff *skb)
{
	struct net *net = ipvs->net;

#ifdef CONFIG_IP_VS_IPV6
	if (skb_af == AF_INET6) {
		struct dst_entry *dst = skb_dst(skb);

		/* check and decrement ttl */
		if (ipv6_hdr(skb)->hop_limit <= 1) {
			struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);

			/* Force OUTPUT device used as source address */
			skb->dev = dst->dev;
			icmpv6_send(skb, ICMPV6_TIME_EXCEED,
				    ICMPV6_EXC_HOPLIMIT, 0);
			IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

			return false;
		}

		/* don't propagate ttl change to cloned packets */
		if (skb_ensure_writable(skb, sizeof(struct ipv6hdr)))
			return false;

		ipv6_hdr(skb)->hop_limit--;
	} else
#endif
	{
		if (ip_hdr(skb)->ttl <= 1) {
			/* Tell the sender its packet died... */
			IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
			icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
			return false;
		}

		/* don't propagate ttl change to cloned packets */
		if (skb_ensure_writable(skb, sizeof(struct iphdr)))
			return false;

		/* Decrease ttl */
		ip_decrease_ttl(ip_hdr(skb));
	}

	return true;
}
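
/* Editor's aside: the TTL/hop-limit handling above mirrors what a router
 * does when forwarding: a packet arriving with ttl <= 1 is answered with
 * ICMP TIME_EXCEEDED (or ICMPv6 hop-limit exceeded) and dropped; otherwise
 * the header is made writable first so the decrement never touches data
 * shared with clones.
 */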

/* Get route to destination or remote server */
static int
__ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
		   struct ip_vs_dest *dest,
		   __be32 daddr, int rt_mode, __be32 *ret_saddr,
		   struct ip_vs_iphdr *ipvsh)
{
	struct net *net = ipvs->net;
	struct ip_vs_dest_dst *dest_dst;
	struct rtable *rt;			/* Route to the other host */
	int mtu;
	int local, noref = 1;

	if (dest) {
		dest_dst = __ip_vs_dst_check(dest);
		if (likely(dest_dst))
			rt = dst_rtable(dest_dst->dst_cache);
		else {
			dest_dst = ip_vs_dest_dst_alloc();
			spin_lock_bh(&dest->dst_lock);
			if (!dest_dst) {
				__ip_vs_dst_set(dest, NULL, NULL, 0);
				spin_unlock_bh(&dest->dst_lock);
				goto err_unreach;
			}
			rt = do_output_route4(net, dest->addr.ip, rt_mode,
					      &dest_dst->dst_saddr.ip);
			if (!rt) {
				__ip_vs_dst_set(dest, NULL, NULL, 0);
				spin_unlock_bh(&dest->dst_lock);
				ip_vs_dest_dst_free(dest_dst);
				goto err_unreach;
			}
			__ip_vs_dst_set(dest, dest_dst, &rt->dst, 0);
			spin_unlock_bh(&dest->dst_lock);
			IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n",
				  &dest->addr.ip, &dest_dst->dst_saddr.ip,
				  rcuref_read(&rt->dst.__rcuref));
		}
		if (ret_saddr)
			*ret_saddr = dest_dst->dst_saddr.ip;
	} else {
		noref = 0;

		/* For such unconfigured boxes avoid many route lookups
		 * for performance reasons because we do not remember saddr
		 */
		rt_mode &= ~IP_VS_RT_MODE_CONNECT;
		rt = do_output_route4(net, daddr, rt_mode, ret_saddr);
		if (!rt)
			goto err_unreach;
	}

	local = (rt->rt_flags & RTCF_LOCAL) ? 1 : 0;
	if (unlikely(crosses_local_route_boundary(skb_af, skb, rt_mode,
						  local))) {
		IP_VS_DBG_RL("We are crossing local and non-local addresses"
			     " daddr=%pI4\n", &daddr);
		goto err_put;
	}

	if (unlikely(local)) {
		/* skb to local stack, preserve old route */
		if (!noref)
			ip_rt_put(rt);
		return local;
	}

	if (!decrement_ttl(ipvs, skb_af, skb))
		goto err_put;

	if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) {
		mtu = dst_mtu(&rt->dst);
	} else {
		mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
		if (!dest)
			goto err_put;
		if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
			mtu -= sizeof(struct udphdr) + sizeof(struct guehdr);
			if ((dest->tun_flags &
			     IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
			    skb->ip_summed == CHECKSUM_PARTIAL)
				mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
		} else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
			IP_TUNNEL_DECLARE_FLAGS(tflags) = { };

			if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
				__set_bit(IP_TUNNEL_CSUM_BIT, tflags);
			mtu -= gre_calc_hlen(tflags);
		}
		if (mtu < 68) {
			IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
			goto err_put;
		}
		maybe_update_pmtu(skb_af, skb, mtu);
	}

	if (!ensure_mtu_is_adequate(ipvs, skb_af, rt_mode, ipvsh, skb, mtu))
		goto err_put;

	skb_dst_drop(skb);
	if (noref)
		skb_dst_set_noref(skb, &rt->dst);
	else
		skb_dst_set(skb, &rt->dst);

	return local;

err_put:
	if (!noref)
		ip_rt_put(rt);
	return -1;

err_unreach:
	dst_link_failure(skb);
	return -1;
}
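
/* Editor's aside on the return convention of __ip_vs_get_out_rt() and its
 * IPv6 twin below, which the transmitters rely on: -1 means error (route
 * lookup failed or the packet was rejected), 0 means a route to a remote
 * host was attached to the skb, and 1 means the destination is local, in
 * which case the old route is preserved and the caller hands the skb to
 * the local stack.
 */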

#ifdef CONFIG_IP_VS_IPV6
static struct dst_entry *
__ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr,
			struct in6_addr *ret_saddr, int do_xfrm, int rt_mode)
{
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.daddr = *daddr,
	};

	if (rt_mode & IP_VS_RT_MODE_KNOWN_NH)
		fl6.flowi6_flags = FLOWI_FLAG_KNOWN_NH;

	dst = ip6_route_output(net, NULL, &fl6);
	if (dst->error)
		goto out_err;
	if (!ret_saddr)
		return dst;
	if (ipv6_addr_any(&fl6.saddr) &&
	    ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev,
			       &fl6.daddr, 0, &fl6.saddr) < 0)
		goto out_err;
	if (do_xfrm) {
		dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
		if (IS_ERR(dst)) {
			dst = NULL;
			goto out_err;
		}
	}
	*ret_saddr = fl6.saddr;
	return dst;

out_err:
	dst_release(dst);
	IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", daddr);
	return NULL;
}

/*
 * Get route to destination or remote server
 */
static int
__ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
		      struct ip_vs_dest *dest,
		      struct in6_addr *daddr, struct in6_addr *ret_saddr,
		      struct ip_vs_iphdr *ipvsh, int do_xfrm, int rt_mode)
{
	struct net *net = ipvs->net;
	struct ip_vs_dest_dst *dest_dst;
	struct rt6_info *rt;			/* Route to the other host */
	struct dst_entry *dst;
	int mtu;
	int local, noref = 1;

	if (dest) {
		dest_dst = __ip_vs_dst_check(dest);
		if (likely(dest_dst))
			rt = dst_rt6_info(dest_dst->dst_cache);
		else {
			u32 cookie;

			dest_dst = ip_vs_dest_dst_alloc();
			spin_lock_bh(&dest->dst_lock);
			if (!dest_dst) {
				__ip_vs_dst_set(dest, NULL, NULL, 0);
				spin_unlock_bh(&dest->dst_lock);
				goto err_unreach;
			}
			dst = __ip_vs_route_output_v6(net, &dest->addr.in6,
						      &dest_dst->dst_saddr.in6,
						      do_xfrm, rt_mode);
			if (!dst) {
				__ip_vs_dst_set(dest, NULL, NULL, 0);
				spin_unlock_bh(&dest->dst_lock);
				ip_vs_dest_dst_free(dest_dst);
				goto err_unreach;
			}
			rt = dst_rt6_info(dst);
			cookie = rt6_get_cookie(rt);
			__ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie);
			spin_unlock_bh(&dest->dst_lock);
			IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
				  &dest->addr.in6, &dest_dst->dst_saddr.in6,
				  rcuref_read(&rt->dst.__rcuref));
		}
		if (ret_saddr)
			*ret_saddr = dest_dst->dst_saddr.in6;
	} else {
		noref = 0;
		dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm,
					      rt_mode);
		if (!dst)
			goto err_unreach;
		rt = dst_rt6_info(dst);
	}

	local = __ip_vs_is_local_route6(rt);

	if (unlikely(crosses_local_route_boundary(skb_af, skb, rt_mode,
						  local))) {
		IP_VS_DBG_RL("We are crossing local and non-local addresses"
			     " daddr=%pI6\n", daddr);
		goto err_put;
	}

	if (unlikely(local)) {
		/* skb to local stack, preserve old route */
		if (!noref)
			dst_release(&rt->dst);
		return local;
	}

	if (!decrement_ttl(ipvs, skb_af, skb))
		goto err_put;

	/* MTU checking */
	if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL)))
		mtu = dst_mtu(&rt->dst);
	else {
		mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
		if (!dest)
			goto err_put;
		if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
			mtu -= sizeof(struct udphdr) + sizeof(struct guehdr);
			if ((dest->tun_flags &
			     IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
			    skb->ip_summed == CHECKSUM_PARTIAL)
				mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
		} else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
			IP_TUNNEL_DECLARE_FLAGS(tflags) = { };

			if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
				__set_bit(IP_TUNNEL_CSUM_BIT, tflags);
			mtu -= gre_calc_hlen(tflags);
		}
		if (mtu < IPV6_MIN_MTU) {
			IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
				     IPV6_MIN_MTU);
			goto err_put;
		}
		maybe_update_pmtu(skb_af, skb, mtu);
	}

	if (!ensure_mtu_is_adequate(ipvs, skb_af, rt_mode, ipvsh, skb, mtu))
		goto err_put;

	skb_dst_drop(skb);
	if (noref)
		skb_dst_set_noref(skb, &rt->dst);
	else
		skb_dst_set(skb, &rt->dst);

	return local;

err_put:
	if (!noref)
		dst_release(&rt->dst);
	return -1;

err_unreach:
	/* The ip6_link_failure function requires the dev field to be set
	 * in order to get the net (further for the sake of fwmark
	 * reflection).
	 */
	if (!skb->dev)
		skb->dev = skb_dst(skb)->dev;

	dst_link_failure(skb);
	return -1;
}
#endif


/* return NF_ACCEPT to allow forwarding or other NF_xxx on error */
static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb,
					    struct ip_vs_conn *cp)
{
	int ret = NF_ACCEPT;

	skb->ipvs_property = 1;
	if (unlikely(cp->flags & IP_VS_CONN_F_NFCT))
		ret = ip_vs_confirm_conntrack(skb);
	if (ret == NF_ACCEPT) {
		nf_reset_ct(skb);
		skb_forward_csum(skb);
		if (skb->dev)
			skb_clear_tstamp(skb);
	}
	return ret;
}

/* In the event of a remote destination, it's possible that we would have
 * matches against an old socket (particularly a TIME-WAIT socket). This
 * causes havoc down the line (ip_local_out et al. expect regular sockets
 * and invalid memory accesses will happen) so simply drop the association
 * in this case.
 */
static inline void ip_vs_drop_early_demux_sk(struct sk_buff *skb)
{
	/* If dev is set, the packet came from the LOCAL_IN callback and
	 * not from a local TCP socket.
	 */
	if (skb->dev)
		skb_orphan(skb);
}

/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */
static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb,
					 struct ip_vs_conn *cp, int local)
{
	int ret = NF_STOLEN;

	skb->ipvs_property = 1;
	if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
		ip_vs_notrack(skb);
	else
		ip_vs_update_conntrack(skb, cp, 1);

	/* Remove the early_demux association unless it's bound for the
	 * exact same port and address on this host after translation.
	 */
	if (!local || cp->vport != cp->dport ||
	    !ip_vs_addr_equal(cp->af, &cp->vaddr, &cp->daddr))
		ip_vs_drop_early_demux_sk(skb);

	if (!local) {
		skb_forward_csum(skb);
		if (skb->dev)
			skb_clear_tstamp(skb);
		NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb,
			NULL, skb_dst(skb)->dev, dst_output);
	} else
		ret = NF_ACCEPT;

	return ret;
}

/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */
static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb,
				     struct ip_vs_conn *cp, int local)
{
	int ret = NF_STOLEN;

	skb->ipvs_property = 1;
	if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
		ip_vs_notrack(skb);
	if (!local) {
		ip_vs_drop_early_demux_sk(skb);
		skb_forward_csum(skb);
		if (skb->dev)
			skb_clear_tstamp(skb);
		NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb,
			NULL, skb_dst(skb)->dev, dst_output);
	} else
		ret = NF_ACCEPT;
	return ret;
}


/*
 *      NULL transmitter (do nothing except return NF_ACCEPT)
 */
int
ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
		struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	/* we do not touch skb and do not need pskb ptr */
	return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
}


/*
 *      Bypass transmitter
 *      Let packets bypass the destination when the destination is not
 *      available; it may only be used in a transparent cache cluster.
 */
int
ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
		  struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	struct iphdr *iph = ip_hdr(skb);

	if (__ip_vs_get_out_rt(cp->ipvs, cp->af, skb, NULL, iph->daddr,
			       IP_VS_RT_MODE_NON_LOCAL, NULL, ipvsh) < 0)
		goto tx_error;

	ip_send_check(iph);

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);

	return NF_STOLEN;

tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}

#ifdef CONFIG_IP_VS_IPV6
int
ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
		     struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	struct ipv6hdr *iph = ipv6_hdr(skb);

	if (__ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, NULL,
				  &iph->daddr, NULL,
				  ipvsh, 0, IP_VS_RT_MODE_NON_LOCAL) < 0)
		goto tx_error;

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);

	return NF_STOLEN;

tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}
#endif

/*
 *      NAT transmitter (only for outside-to-inside nat forwarding)
 *      Not used for related ICMP
 */
int
ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
	       struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	struct rtable *rt;		/* Route to the other host */
	int local, rc, was_input;

	/* check if it is a connection of no-client-port */
	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
		__be16 _pt, *p;

		p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt);
		if (p == NULL)
			goto tx_error;
		ip_vs_conn_fill_cport(cp, *p);
		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
	}

	was_input = rt_is_input_route(skb_rtable(skb));
	local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
				   IP_VS_RT_MODE_LOCAL |
				   IP_VS_RT_MODE_NON_LOCAL |
				   IP_VS_RT_MODE_RDR, NULL, ipvsh);
	if (local < 0)
		goto tx_error;
	rt = skb_rtable(skb);
	/*
	 * Avoid duplicate tuple in reply direction for NAT traffic
	 * to local address when connection is sync-ed
	 */
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
		enum ip_conntrack_info ctinfo;
		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);

		if (ct) {
			IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, ipvsh->off,
					 "ip_vs_nat_xmit(): "
					 "stopping DNAT to local address");
			goto tx_error;
		}
	}
#endif

	/* From world but DNAT to loopback address? */
	if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) {
		IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, ipvsh->off,
				 "ip_vs_nat_xmit(): stopping DNAT to loopback "
				 "address");
		goto tx_error;
	}

	/* copy-on-write the packet before mangling it */
	if (skb_ensure_writable(skb, sizeof(struct iphdr)))
		goto tx_error;

	if (skb_cow(skb, rt->dst.dev->hard_header_len))
		goto tx_error;

	/* mangle the packet */
	if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh))
		goto tx_error;
	ip_hdr(skb)->daddr = cp->daddr.ip;
	ip_send_check(ip_hdr(skb));

	IP_VS_DBG_PKT(10, AF_INET, pp, skb, ipvsh->off, "After DNAT");

	/* FIXME: when an application helper enlarges the packet and the
	   length becomes larger than the MTU of the outgoing device, there
	   will still be an MTU problem. */

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);

	return rc;

tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}

#ifdef CONFIG_IP_VS_IPV6
int
ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
		  struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	struct rt6_info *rt;		/* Route to the other host */
	int local, rc;

	/* check if it is a connection of no-client-port */
	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT && !ipvsh->fragoffs)) {
		__be16 _pt, *p;
		p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt);
		if (p == NULL)
			goto tx_error;
		ip_vs_conn_fill_cport(cp, *p);
		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
	}

	local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
				      &cp->daddr.in6,
				      NULL, ipvsh, 0,
				      IP_VS_RT_MODE_LOCAL |
				      IP_VS_RT_MODE_NON_LOCAL |
				      IP_VS_RT_MODE_RDR);
	if (local < 0)
		goto tx_error;
	rt = dst_rt6_info(skb_dst(skb));
	/*
	 * Avoid duplicate tuple in reply direction for NAT traffic
	 * to local address when connection is sync-ed
	 */
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
		enum ip_conntrack_info ctinfo;
		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);

		if (ct) {
			IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, ipvsh->off,
					 "ip_vs_nat_xmit_v6(): "
					 "stopping DNAT to local address");
			goto tx_error;
		}
	}
#endif

	/* From world but DNAT to loopback address? */
	if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
	    ipv6_addr_type(&cp->daddr.in6) & IPV6_ADDR_LOOPBACK) {
		IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, ipvsh->off,
				 "ip_vs_nat_xmit_v6(): "
				 "stopping DNAT to loopback address");
		goto tx_error;
	}

	/* copy-on-write the packet before mangling it */
	if (skb_ensure_writable(skb, sizeof(struct ipv6hdr)))
		goto tx_error;

	if (skb_cow(skb, rt->dst.dev->hard_header_len))
		goto tx_error;

	/* mangle the packet */
	if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh))
		goto tx_error;
	ipv6_hdr(skb)->daddr = cp->daddr.in6;

	IP_VS_DBG_PKT(10, AF_INET6, pp, skb, ipvsh->off, "After DNAT");

	/* FIXME: when an application helper enlarges the packet and the
	   length becomes larger than the MTU of the outgoing device, there
	   will still be an MTU problem. */

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);

	return rc;

tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}
#endif

/* When forwarding a packet, we must ensure that we've got enough headroom
 * for the encapsulation packet in the skb.  This also gives us an
 * opportunity to figure out what the payload_len, dsfield, ttl, and df
 * values should be, so that we won't need to look at the old ip header
 * again
 */
static struct sk_buff *
ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af,
			   unsigned int max_headroom, __u8 *next_protocol,
			   __u32 *payload_len, __u8 *dsfield, __u8 *ttl,
			   __be16 *df)
{
	struct sk_buff *new_skb = NULL;
	struct iphdr *old_iph = NULL;
	__u8 old_dsfield;
#ifdef CONFIG_IP_VS_IPV6
	struct ipv6hdr *old_ipv6h = NULL;
#endif

	ip_vs_drop_early_demux_sk(skb);

	if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) {
		new_skb = skb_realloc_headroom(skb, max_headroom);
		if (!new_skb)
			goto error;
		if (skb->sk)
			skb_set_owner_w(new_skb, skb->sk);
		consume_skb(skb);
		skb = new_skb;
	}

#ifdef CONFIG_IP_VS_IPV6
	if (skb_af == AF_INET6) {
		old_ipv6h = ipv6_hdr(skb);
		*next_protocol = IPPROTO_IPV6;
		if (payload_len)
			*payload_len =
				ntohs(old_ipv6h->payload_len) +
				sizeof(*old_ipv6h);
		old_dsfield = ipv6_get_dsfield(old_ipv6h);
		*ttl = old_ipv6h->hop_limit;
		if (df)
			*df = 0;
	} else
#endif
	{
		old_iph = ip_hdr(skb);
		/* Copy DF, reset fragment offset and MF */
		if (df)
			*df = (old_iph->frag_off & htons(IP_DF));
		*next_protocol = IPPROTO_IPIP;

		/* fix old IP header checksum */
		ip_send_check(old_iph);
		old_dsfield = ipv4_get_dsfield(old_iph);
		*ttl = old_iph->ttl;
		if (payload_len)
			*payload_len = skb_ip_totlen(skb);
	}

	/* Implement full-functionality option for ECN encapsulation */
	*dsfield = INET_ECN_encapsulate(old_dsfield, old_dsfield);

	return skb;
error:
	kfree_skb(skb);
	return ERR_PTR(-ENOMEM);
}

static inline int __tun_gso_type_mask(int encaps_af, int orig_af)
{
	switch (encaps_af) {
	case AF_INET:
		return SKB_GSO_IPXIP4;
	case AF_INET6:
		return SKB_GSO_IPXIP6;
	default:
		return 0;
	}
}

static int
ipvs_gue_encap(struct net *net, struct sk_buff *skb,
	       struct ip_vs_conn *cp, __u8 *next_protocol)
{
	__be16 dport;
	__be16 sport = udp_flow_src_port(net, skb, 0, 0, false);
	struct udphdr *udph;	/* Our new UDP header */
	struct guehdr *gueh;	/* Our new GUE header */
	size_t hdrlen, optlen = 0;
	void *data;
	bool need_priv = false;

	if ((cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
	    skb->ip_summed == CHECKSUM_PARTIAL) {
		optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
		need_priv = true;
	}

	hdrlen = sizeof(struct guehdr) + optlen;

	skb_push(skb, hdrlen);

	gueh = (struct guehdr *)skb->data;

	gueh->control = 0;
	gueh->version = 0;
	gueh->hlen = optlen >> 2;
	gueh->flags = 0;
	gueh->proto_ctype = *next_protocol;

	data = &gueh[1];

	if (need_priv) {
		__be32 *flags = data;
		u16 csum_start = skb_checksum_start_offset(skb);
		__be16 *pd;

		gueh->flags |= GUE_FLAG_PRIV;
		*flags = 0;
		data += GUE_LEN_PRIV;

		if (csum_start < hdrlen)
			return -EINVAL;

		csum_start -= hdrlen;
		pd = data;
		pd[0] = htons(csum_start);
		pd[1] = htons(csum_start + skb->csum_offset);

		if (!skb_is_gso(skb)) {
			skb->ip_summed = CHECKSUM_NONE;
			skb->encapsulation = 0;
		}

		*flags |= GUE_PFLAG_REMCSUM;
		data += GUE_PLEN_REMCSUM;
	}

	skb_push(skb, sizeof(struct udphdr));
	skb_reset_transport_header(skb);

	udph = udp_hdr(skb);

	dport = cp->dest->tun_port;
	udph->dest = dport;
	udph->source = sport;
	udph->len = htons(skb->len);
	udph->check = 0;

	*next_protocol = IPPROTO_UDP;

	return 0;
}
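
/* Editor's aside: after ipvs_gue_encap() (plus the outer header pushed by
 * the tunnel transmitters below) the packet looks like
 *
 *   outer IP | UDP | GUE [+ private remote-checksum option] | inner IP ...
 *
 * with *next_protocol rewritten to IPPROTO_UDP so the outer header is built
 * accordingly; ipvs_gre_encap() below plays the same role for GRE.
 */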

static void
ipvs_gre_encap(struct net *net, struct sk_buff *skb,
	       struct ip_vs_conn *cp, __u8 *next_protocol)
{
	__be16 proto = *next_protocol == IPPROTO_IPIP ?
				htons(ETH_P_IP) : htons(ETH_P_IPV6);
	IP_TUNNEL_DECLARE_FLAGS(tflags) = { };
	size_t hdrlen;

	if (cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
		__set_bit(IP_TUNNEL_CSUM_BIT, tflags);

	hdrlen = gre_calc_hlen(tflags);
	gre_build_header(skb, hdrlen, tflags, proto, 0, 0);

	*next_protocol = IPPROTO_GRE;
}

/*
 *   IP Tunneling transmitter
 *
 *   This function encapsulates the packet in a new IP packet, its
 *   destination will be set to cp->daddr. Most code of this function
 *   is taken from ipip.c.
 *
 *   It is used in VS/TUN cluster. The load balancer selects a real
 *   server from a cluster based on a scheduling algorithm,
 *   encapsulates the request packet and forwards it to the selected
 *   server. For example, all real servers are configured with
 *   "ifconfig tunl0 <Virtual IP Address> up". When the server receives
 *   the encapsulated packet, it will decapsulate the packet, process
 *   the request and return the response packets directly to the client
 *   without passing through the load balancer. This can greatly increase
 *   the scalability of a virtual server.
 *
 *   Used for ANY protocol
 */
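
/* Editor's aside, a hedged modern equivalent of the ifconfig line above for
 * configuring a real server (illustrative only; exact sysctl needs vary by
 * setup, e.g. rp_filter and ARP settings usually need adjusting too):
 *
 *   ip addr add <Virtual IP Address>/32 dev tunl0
 *   ip link set tunl0 up
 */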
int
ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
		  struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	struct netns_ipvs *ipvs = cp->ipvs;
	struct net *net = ipvs->net;
	struct rtable *rt;			/* Route to the other host */
	__be32 saddr;				/* Source for tunnel */
	struct net_device *tdev;		/* Device to other host */
	__u8 next_protocol = 0;
	__u8 dsfield = 0;
	__u8 ttl = 0;
	__be16 df = 0;
	__be16 *dfp = NULL;
	struct iphdr *iph;			/* Our new IP header */
	unsigned int max_headroom;		/* The extra header space needed */
	int ret, local;
	int tun_type, gso_type;
	int tun_flags;

	local = __ip_vs_get_out_rt(ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
				   IP_VS_RT_MODE_LOCAL |
				   IP_VS_RT_MODE_NON_LOCAL |
				   IP_VS_RT_MODE_CONNECT |
				   IP_VS_RT_MODE_TUNNEL, &saddr, ipvsh);
	if (local < 0)
		goto tx_error;
	if (local)
		return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);

	rt = skb_rtable(skb);
	tdev = rt->dst.dev;

	/*
	 * Okay, now see if we can stuff it in the buffer as-is.
	 */
	max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);

	tun_type = cp->dest->tun_type;
	tun_flags = cp->dest->tun_flags;

	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
		size_t gue_hdrlen, gue_optlen = 0;

		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
		    skb->ip_summed == CHECKSUM_PARTIAL) {
			gue_optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
		}
		gue_hdrlen = sizeof(struct guehdr) + gue_optlen;

		max_headroom += sizeof(struct udphdr) + gue_hdrlen;
	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
		IP_TUNNEL_DECLARE_FLAGS(tflags) = { };
		size_t gre_hdrlen;

		if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
			__set_bit(IP_TUNNEL_CSUM_BIT, tflags);
		gre_hdrlen = gre_calc_hlen(tflags);

		max_headroom += gre_hdrlen;
	}

	/* We only care about the df field if sysctl_pmtu_disc(ipvs) is set */
	dfp = sysctl_pmtu_disc(ipvs) ? &df : NULL;
	skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom,
					 &next_protocol, NULL, &dsfield,
					 &ttl, dfp);
	if (IS_ERR(skb))
		return NF_STOLEN;

	gso_type = __tun_gso_type_mask(AF_INET, cp->af);
	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
		    (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
			gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
		else
			gso_type |= SKB_GSO_UDP_TUNNEL;
		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
		    skb->ip_summed == CHECKSUM_PARTIAL) {
			gso_type |= SKB_GSO_TUNNEL_REMCSUM;
		}
	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
		if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
			gso_type |= SKB_GSO_GRE_CSUM;
		else
			gso_type |= SKB_GSO_GRE;
	}

	if (iptunnel_handle_offloads(skb, gso_type))
		goto tx_error;

	skb->transport_header = skb->network_header;

	skb_set_inner_ipproto(skb, next_protocol);
	skb_set_inner_mac_header(skb, skb_inner_network_offset(skb));

	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
		bool check = false;

		if (ipvs_gue_encap(net, skb, cp, &next_protocol))
			goto tx_error;

		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
		    (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
			check = true;

		udp_set_csum(!check, skb, saddr, cp->daddr.ip, skb->len);
	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE)
		ipvs_gre_encap(net, skb, cp, &next_protocol);

	skb_push(skb, sizeof(struct iphdr));
	skb_reset_network_header(skb);
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	/*
	 * Push down and install the IPIP header.
	 */
	iph = ip_hdr(skb);
	iph->version = 4;
	iph->ihl = sizeof(struct iphdr) >> 2;
	iph->frag_off = df;
	iph->protocol = next_protocol;
	iph->tos = dsfield;
	iph->daddr = cp->daddr.ip;
	iph->saddr = saddr;
	iph->ttl = ttl;
	ip_select_ident(net, skb, NULL);

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	ret = ip_vs_tunnel_xmit_prepare(skb, cp);
	if (ret == NF_ACCEPT)
		ip_local_out(net, skb->sk, skb);
	else if (ret == NF_DROP)
		kfree_skb(skb);

	return NF_STOLEN;

tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}

#ifdef CONFIG_IP_VS_IPV6
int
ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
		     struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	struct netns_ipvs *ipvs = cp->ipvs;
	struct net *net = ipvs->net;
	struct rt6_info *rt;		/* Route to the other host */
	struct in6_addr saddr;		/* Source for tunnel */
	struct net_device *tdev;	/* Device to other host */
	__u8 next_protocol = 0;
	__u32 payload_len = 0;
	__u8 dsfield = 0;
	__u8 ttl = 0;
	struct ipv6hdr *iph;		/* Our new IP header */
	unsigned int max_headroom;	/* The extra header space needed */
	int ret, local;
	int tun_type, gso_type;
	int tun_flags;

	local = __ip_vs_get_out_rt_v6(ipvs, cp->af, skb, cp->dest,
				      &cp->daddr.in6,
				      &saddr, ipvsh, 1,
				      IP_VS_RT_MODE_LOCAL |
				      IP_VS_RT_MODE_NON_LOCAL |
				      IP_VS_RT_MODE_TUNNEL);
	if (local < 0)
		goto tx_error;
	if (local)
		return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);

	rt = dst_rt6_info(skb_dst(skb));
	tdev = rt->dst.dev;

	/*
	 * Okay, now see if we can stuff it in the buffer as-is.
	 */
	max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr);

	tun_type = cp->dest->tun_type;
	tun_flags = cp->dest->tun_flags;

	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
		size_t gue_hdrlen, gue_optlen = 0;

		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
		    skb->ip_summed == CHECKSUM_PARTIAL) {
			gue_optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
		}
		gue_hdrlen = sizeof(struct guehdr) + gue_optlen;

		max_headroom += sizeof(struct udphdr) + gue_hdrlen;
	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
		IP_TUNNEL_DECLARE_FLAGS(tflags) = { };
		size_t gre_hdrlen;

		if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
			__set_bit(IP_TUNNEL_CSUM_BIT, tflags);
		gre_hdrlen = gre_calc_hlen(tflags);

		max_headroom += gre_hdrlen;
	}

	skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom,
					 &next_protocol, &payload_len,
					 &dsfield, &ttl, NULL);
	if (IS_ERR(skb))
		return NF_STOLEN;

	gso_type = __tun_gso_type_mask(AF_INET6, cp->af);
	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
		    (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
			gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
		else
			gso_type |= SKB_GSO_UDP_TUNNEL;
		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
		    skb->ip_summed == CHECKSUM_PARTIAL) {
			gso_type |= SKB_GSO_TUNNEL_REMCSUM;
		}
	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
		if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
			gso_type |= SKB_GSO_GRE_CSUM;
		else
			gso_type |= SKB_GSO_GRE;
	}

	if (iptunnel_handle_offloads(skb, gso_type))
		goto tx_error;

	skb->transport_header = skb->network_header;

	skb_set_inner_ipproto(skb, next_protocol);
	skb_set_inner_mac_header(skb, skb_inner_network_offset(skb));

	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
		bool check = false;

		if (ipvs_gue_encap(net, skb, cp, &next_protocol))
			goto tx_error;

		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
		    (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
			check = true;

		udp6_set_csum(!check, skb, &saddr, &cp->daddr.in6, skb->len);
	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE)
		ipvs_gre_encap(net, skb, cp, &next_protocol);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	/*
	 * Push down and install the IPIP header.
	 */
	iph = ipv6_hdr(skb);
	iph->version = 6;
	iph->nexthdr = next_protocol;
	iph->payload_len = htons(payload_len);
	memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl));
	ipv6_change_dsfield(iph, 0, dsfield);
	iph->daddr = cp->daddr.in6;
	iph->saddr = saddr;
	iph->hop_limit = ttl;

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	ret = ip_vs_tunnel_xmit_prepare(skb, cp);
	if (ret == NF_ACCEPT)
		ip6_local_out(net, skb->sk, skb);
	else if (ret == NF_DROP)
		kfree_skb(skb);

	return NF_STOLEN;

tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}
#endif


/*
 *      Direct Routing transmitter
 *      Used for ANY protocol
 */
int
ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
	      struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	int local;

	local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
				   IP_VS_RT_MODE_LOCAL |
				   IP_VS_RT_MODE_NON_LOCAL |
				   IP_VS_RT_MODE_KNOWN_NH, NULL, ipvsh);
	if (local < 0)
		goto tx_error;
	if (local)
		return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);

	ip_send_check(ip_hdr(skb));

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);

	return NF_STOLEN;

tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}

#ifdef CONFIG_IP_VS_IPV6
int
ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
		 struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	int local;

	local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
				      &cp->daddr.in6,
				      NULL, ipvsh, 0,
				      IP_VS_RT_MODE_LOCAL |
				      IP_VS_RT_MODE_NON_LOCAL |
				      IP_VS_RT_MODE_KNOWN_NH);
	if (local < 0)
		goto tx_error;
	if (local)
		return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);

	return NF_STOLEN;

tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}
#endif


/*
 *	ICMP packet transmitter
 *	called by ip_vs_in_icmp
 */
int
ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
		struct ip_vs_protocol *pp, int offset, unsigned int hooknum,
		struct ip_vs_iphdr *iph)
{
	struct rtable *rt;	/* Route to the other host */
	int rc;
	int local;
	int rt_mode, was_input;

	/* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
	   forwarded directly here, because there is no need to
	   translate address/port back */
	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
		if (cp->packet_xmit)
			rc = cp->packet_xmit(skb, cp, pp, iph);
		else
			rc = NF_ACCEPT;
		/* do not touch skb anymore */
		atomic_inc(&cp->in_pkts);
		return rc;
	}

	/*
	 * mangle and send the packet here (only for VS/NAT)
	 */
	was_input = rt_is_input_route(skb_rtable(skb));

	/* LOCALNODE from FORWARD hook is not supported */
	rt_mode = (hooknum != NF_INET_FORWARD) ?
		  IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
		  IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
	local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip, rt_mode,
				   NULL, iph);
	if (local < 0)
		goto tx_error;
	rt = skb_rtable(skb);

	/*
	 * Avoid duplicate tuple in reply direction for NAT traffic
	 * to local address when connection is sync-ed
	 */
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
		enum ip_conntrack_info ctinfo;
		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);

		if (ct) {
			IP_VS_DBG(10, "%s(): "
				  "stopping DNAT to local address %pI4\n",
				  __func__, &cp->daddr.ip);
			goto tx_error;
		}
	}
#endif

	/* From world but DNAT to loopback address? */
	if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) {
		IP_VS_DBG(1, "%s(): "
			  "stopping DNAT to loopback %pI4\n",
			  __func__, &cp->daddr.ip);
		goto tx_error;
	}

	/* copy-on-write the packet before mangling it */
	if (skb_ensure_writable(skb, offset))
		goto tx_error;

	if (skb_cow(skb, rt->dst.dev->hard_header_len))
		goto tx_error;

	ip_vs_nat_icmp(skb, pp, cp, 0);

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	return ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);

tx_error:
	kfree_skb(skb);
	rc = NF_STOLEN;
	return rc;
}

#ifdef CONFIG_IP_VS_IPV6
int
ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
		   struct ip_vs_protocol *pp, int offset, unsigned int hooknum,
		   struct ip_vs_iphdr *ipvsh)
{
	struct rt6_info *rt;	/* Route to the other host */
	int rc;
	int local;
	int rt_mode;

	/* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
	   forwarded directly here, because there is no need to
	   translate address/port back */
	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
		if (cp->packet_xmit)
			rc = cp->packet_xmit(skb, cp, pp, ipvsh);
		else
			rc = NF_ACCEPT;
		/* do not touch skb anymore */
		atomic_inc(&cp->in_pkts);
		return rc;
	}

	/*
	 * mangle and send the packet here (only for VS/NAT)
	 */

	/* LOCALNODE from FORWARD hook is not supported */
	rt_mode = (hooknum != NF_INET_FORWARD) ?
		  IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
		  IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
	local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
				      &cp->daddr.in6, NULL, ipvsh, 0, rt_mode);
	if (local < 0)
		goto tx_error;
	rt = dst_rt6_info(skb_dst(skb));
	/*
	 * Avoid duplicate tuple in reply direction for NAT traffic
	 * to local address when connection is sync-ed
	 */
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
		enum ip_conntrack_info ctinfo;
		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);

		if (ct) {
			IP_VS_DBG(10, "%s(): "
				  "stopping DNAT to local address %pI6\n",
				  __func__, &cp->daddr.in6);
			goto tx_error;
		}
	}
#endif

	/* From world but DNAT to loopback address? */
	if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
	    ipv6_addr_type(&cp->daddr.in6) & IPV6_ADDR_LOOPBACK) {
		IP_VS_DBG(1, "%s(): "
			  "stopping DNAT to loopback %pI6\n",
			  __func__, &cp->daddr.in6);
		goto tx_error;
	}

	/* copy-on-write the packet before mangling it */
	if (skb_ensure_writable(skb, offset))
		goto tx_error;

	if (skb_cow(skb, rt->dst.dev->hard_header_len))
		goto tx_error;

	ip_vs_nat_icmp_v6(skb, pp, cp, 0);

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	return ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);

tx_error:
	kfree_skb(skb);
	rc = NF_STOLEN;
	return rc;
}
#endif