1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * ip_vs_xmit.c: various packet transmitters for IPVS
4 *
5 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
6 * Julian Anastasov <ja@ssi.bg>
7 *
8 * Changes:
9 *
10 * Description of forwarding methods:
11 * - all transmitters are called from LOCAL_IN (remote clients) and
12 * LOCAL_OUT (local clients) but for ICMP can be called from FORWARD
13 * - not all connections have destination server, for example,
14 * connections in backup server when fwmark is used
15 * - bypass connections use daddr from packet
16 * - we can use dst without ref while sending in RCU section, we use
17 * ref when returning NF_ACCEPT for NAT-ed packet via loopback
18 * LOCAL_OUT rules:
19 * - skb->dev is NULL, skb->protocol is not set (both are set in POST_ROUTING)
20 * - skb->pkt_type is not set yet
21 * - the only place where we can see skb->sk != NULL
22 */
23
24 #define pr_fmt(fmt) "IPVS: " fmt
25
26 #include <linux/kernel.h>
27 #include <linux/slab.h>
28 #include <linux/tcp.h> /* for tcphdr */
29 #include <net/ip.h>
30 #include <net/gue.h>
31 #include <net/gre.h>
32 #include <net/tcp.h> /* for csum_tcpudp_magic */
33 #include <net/udp.h>
34 #include <net/icmp.h> /* for icmp_send */
35 #include <net/route.h> /* for ip_route_output */
36 #include <net/ipv6.h>
37 #include <net/ip6_route.h>
38 #include <net/ip_tunnels.h>
39 #include <net/ip6_checksum.h>
40 #include <net/addrconf.h>
41 #include <linux/icmpv6.h>
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv4.h>
44
45 #include <net/ip_vs.h>
46
/* Route-lookup mode bits passed to __ip_vs_get_out_rt{,_v6}(): they select
 * which destination kinds are acceptable and how the route is obtained.
 */
enum {
	IP_VS_RT_MODE_LOCAL	= 1, /* Allow local dest */
	IP_VS_RT_MODE_NON_LOCAL	= 2, /* Allow non-local dest */
	IP_VS_RT_MODE_RDR	= 4, /* Allow redirect from remote daddr to
				      * local
				      */
	IP_VS_RT_MODE_CONNECT	= 8, /* Always bind route to saddr */
	IP_VS_RT_MODE_KNOWN_NH	= 16,/* Route via remote addr */
	IP_VS_RT_MODE_TUNNEL	= 32,/* Tunnel mode */
};
57
ip_vs_dest_dst_alloc(void)58 static inline struct ip_vs_dest_dst *ip_vs_dest_dst_alloc(void)
59 {
60 return kmalloc_obj(struct ip_vs_dest_dst, GFP_ATOMIC);
61 }
62
/* Free an entry from ip_vs_dest_dst_alloc(); NULL is a harmless no-op. */
static inline void ip_vs_dest_dst_free(struct ip_vs_dest_dst *dd)
{
	kfree(dd);
}
67
68 /*
69 * Destination cache to speed up outgoing route lookup
70 */
/* Publish a new cached route for @dest (or clear the cache when
 * @dest_dst is NULL).  Caller must hold dest->dst_lock; readers walk
 * dest->dest_dst under RCU, so the old entry is freed only after a
 * grace period via call_rcu().
 */
static inline void
__ip_vs_dst_set(struct ip_vs_dest *dest, struct ip_vs_dest_dst *dest_dst,
		struct dst_entry *dst, u32 dst_cookie)
{
	struct ip_vs_dest_dst *old;

	old = rcu_dereference_protected(dest->dest_dst,
					lockdep_is_held(&dest->dst_lock));

	/* Fill the new entry completely before making it visible. */
	if (dest_dst) {
		dest_dst->dst_cache = dst;
		dest_dst->dst_cookie = dst_cookie;
	}
	rcu_assign_pointer(dest->dest_dst, dest_dst);

	/* ip_vs_dest_dst_rcu_free is defined elsewhere in this file;
	 * presumably it releases the dst reference and kfrees the entry.
	 */
	if (old)
		call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free);
}
89
/* Return the cached route entry for @dest, or NULL when nothing is
 * cached or the cached dst is obsolete and fails revalidation.  Must be
 * called inside an RCU read-side section.
 */
static inline struct ip_vs_dest_dst *
__ip_vs_dst_check(struct ip_vs_dest *dest)
{
	struct ip_vs_dest_dst *dest_dst = rcu_dereference(dest->dest_dst);
	struct dst_entry *dst;

	if (!dest_dst)
		return NULL;
	dst = dest_dst->dst_cache;
	/* An obsolete dst may still be valid: ->check() revalidates it
	 * against the cookie saved at cache time; NULL means stale.
	 */
	if (READ_ONCE(dst->obsolete) &&
	    dst->ops->check(dst, dest_dst->dst_cookie) == NULL)
		return NULL;
	return dest_dst;
}
104
105 static inline bool
__mtu_check_toobig_v6(const struct sk_buff * skb,u32 mtu)106 __mtu_check_toobig_v6(const struct sk_buff *skb, u32 mtu)
107 {
108 if (IP6CB(skb)->frag_max_size) {
109 /* frag_max_size tell us that, this packet have been
110 * defragmented by netfilter IPv6 conntrack module.
111 */
112 if (IP6CB(skb)->frag_max_size > mtu)
113 return true; /* largest fragment violate MTU */
114 }
115 else if (skb->len > mtu && !skb_is_gso(skb)) {
116 return true; /* Packet size violate MTU size */
117 }
118 return false;
119 }
120
/* Get route to daddr, optionally bind route to saddr.
 * Returns the route or NULL on lookup failure; on success *ret_saddr
 * (when non-NULL) receives the chosen source address.
 */
static struct rtable *do_output_route4(struct net *net, __be32 daddr,
				       int rt_mode, __be32 *ret_saddr)
{
	struct flowi4 fl4;
	struct rtable *rt;

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = daddr;
	/* KNOWN_NH: route via the remote address itself (see enum above) */
	fl4.flowi4_flags = (rt_mode & IP_VS_RT_MODE_KNOWN_NH) ?
			   FLOWI_FLAG_KNOWN_NH : 0;

retry:
	rt = ip_route_output_key(net, &fl4);
	if (IS_ERR(rt)) {
		IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &daddr);
		return NULL;
	}
	/* CONNECT mode: the first lookup picked a source address; redo
	 * the lookup with the output explicitly bound to that saddr.
	 * rt_mode is zeroed so the retry can happen at most once.
	 */
	if (rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) {
		ip_rt_put(rt);
		flowi4_update_output(&fl4, 0, daddr, fl4.saddr);
		rt_mode = 0;
		goto retry;
	}
	if (ret_saddr)
		*ret_saddr = fl4.saddr;
	return rt;
}
149
150 #ifdef CONFIG_IP_VS_IPV6
__ip_vs_is_local_route6(struct rt6_info * rt)151 static inline int __ip_vs_is_local_route6(struct rt6_info *rt)
152 {
153 return rt->dst.dev && rt->dst.dev->flags & IFF_LOOPBACK;
154 }
155 #endif
156
/* Decide whether rerouting @skb onto the new route would cross the
 * local/non-local boundary in a way @rt_mode forbids.  Returns true
 * when forwarding must be refused.
 */
static inline bool crosses_local_route_boundary(int skb_af, struct sk_buff *skb,
						int rt_mode,
						bool new_rt_is_local)
{
	bool rt_mode_allow_local = !!(rt_mode & IP_VS_RT_MODE_LOCAL);
	bool rt_mode_allow_non_local = !!(rt_mode & IP_VS_RT_MODE_NON_LOCAL);
	bool rt_mode_allow_redirect = !!(rt_mode & IP_VS_RT_MODE_RDR);
	bool source_is_loopback;
	bool old_rt_is_local;

#ifdef CONFIG_IP_VS_IPV6
	if (skb_af == AF_INET6) {
		int addr_type = ipv6_addr_type(&ipv6_hdr(skb)->saddr);

		/* skb->dev == NULL means a locally generated packet
		 * (LOCAL_OUT), treated like a loopback device here.
		 */
		source_is_loopback =
			(!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
			(addr_type & IPV6_ADDR_LOOPBACK);
		old_rt_is_local = __ip_vs_is_local_route6(
			dst_rt6_info(skb_dst(skb)));
	} else
#endif
	{
		source_is_loopback = ipv4_is_loopback(ip_hdr(skb)->saddr);
		old_rt_is_local = skb_rtable(skb)->rt_flags & RTCF_LOCAL;
	}

	if (unlikely(new_rt_is_local)) {
		/* Delivering locally: LOCAL must be allowed, and a
		 * remote->local redirect additionally requires RDR.
		 */
		if (!rt_mode_allow_local)
			return true;
		if (!rt_mode_allow_redirect && !old_rt_is_local)
			return true;
	} else {
		/* Sending out: NON_LOCAL must be allowed and a loopback
		 * source address must never leave the box.
		 */
		if (!rt_mode_allow_non_local)
			return true;
		if (source_is_loopback)
			return true;
	}
	return false;
}
196
maybe_update_pmtu(int skb_af,struct sk_buff * skb,int mtu)197 static inline void maybe_update_pmtu(int skb_af, struct sk_buff *skb, int mtu)
198 {
199 struct sock *sk = skb->sk;
200 struct rtable *ort = skb_rtable(skb);
201
202 if (!skb->dev && sk && sk_fullsock(sk))
203 ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu, true);
204 }
205
/* Verify the packet fits @mtu, emitting the appropriate "packet too
 * big" ICMP error towards the sender when it does not.  Returns false
 * when the packet must be dropped by the caller.
 */
static inline bool ensure_mtu_is_adequate(struct netns_ipvs *ipvs, int skb_af,
					  int rt_mode,
					  struct ip_vs_iphdr *ipvsh,
					  struct sk_buff *skb, int mtu)
{
#ifdef CONFIG_IP_VS_IPV6
	if (skb_af == AF_INET6) {
		struct net *net = ipvs->net;

		if (unlikely(__mtu_check_toobig_v6(skb, mtu))) {
			/* icmpv6_send needs a device to resolve the net */
			if (!skb->dev)
				skb->dev = net->loopback_dev;
			/* only send ICMP too big on first fragment */
			if (!ipvsh->fragoffs && !ip_vs_iph_icmp(ipvsh))
				icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			IP_VS_DBG(1, "frag needed for %pI6c\n",
				  &ipv6_hdr(skb)->saddr);
			return false;
		}
	} else
#endif
	{
		/* If we're going to tunnel the packet and pmtu discovery
		 * is disabled, we'll just fragment it anyway
		 */
		if ((rt_mode & IP_VS_RT_MODE_TUNNEL) && !sysctl_pmtu_disc(ipvs))
			return true;

		/* DF set and oversized (non-GSO): reply FRAG_NEEDED unless
		 * the payload is itself an embedded ICMP error.
		 */
		if (unlikely(ip_hdr(skb)->frag_off & htons(IP_DF) &&
			     skb->len > mtu && !skb_is_gso(skb) &&
			     !ip_vs_iph_icmp(ipvsh))) {
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
				  htonl(mtu));
			IP_VS_DBG(1, "frag needed for %pI4\n",
				  &ip_hdr(skb)->saddr);
			return false;
		}
	}

	return true;
}
247
/* Check and decrement the packet's TTL/hop limit before forwarding.
 * When the limit is exhausted, a TIME_EXCEEDED ICMP error is sent and
 * false is returned (caller drops the packet).  Also returns false if
 * the header cannot be made writable for a cloned skb.
 */
static inline bool decrement_ttl(struct netns_ipvs *ipvs,
				 int skb_af,
				 struct sk_buff *skb)
{
	struct net *net = ipvs->net;

#ifdef CONFIG_IP_VS_IPV6
	if (skb_af == AF_INET6) {
		struct dst_entry *dst = skb_dst(skb);

		/* check and decrement ttl */
		if (ipv6_hdr(skb)->hop_limit <= 1) {
			struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);

			/* Force OUTPUT device used as source address */
			skb->dev = dst->dev;
			icmpv6_send(skb, ICMPV6_TIME_EXCEED,
				    ICMPV6_EXC_HOPLIMIT, 0);
			IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

			return false;
		}

		/* don't propagate ttl change to cloned packets */
		if (skb_ensure_writable(skb, sizeof(struct ipv6hdr)))
			return false;

		ipv6_hdr(skb)->hop_limit--;
	} else
#endif
	{
		if (ip_hdr(skb)->ttl <= 1) {
			/* Tell the sender its packet died... */
			IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
			icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
			return false;
		}

		/* don't propagate ttl change to cloned packets */
		if (skb_ensure_writable(skb, sizeof(struct iphdr)))
			return false;

		/* Decrease ttl, refreshing the IP checksum incrementally */
		ip_decrease_ttl(ip_hdr(skb));
	}

	return true;
}
296
297 /* rt has device that is down */
rt_dev_is_down(const struct net_device * dev)298 static bool rt_dev_is_down(const struct net_device *dev)
299 {
300 return dev && !netif_running(dev);
301 }
302
/* Get route to destination or remote server.
 *
 * Returns 1 when the route is local (packet goes to the local stack and
 * the old route is preserved), 0 when @skb was rerouted onto the new
 * dst, -1 on error (caller frees @skb).  Runs under RCU: when the route
 * comes from dest's cache (noref == 1) the skb holds the dst without a
 * reference; otherwise a referenced dst is attached.
 */
static int
__ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
		   struct ip_vs_dest *dest,
		   __be32 daddr, int rt_mode, __be32 *ret_saddr,
		   struct ip_vs_iphdr *ipvsh)
{
	struct net *net = ipvs->net;
	struct ip_vs_dest_dst *dest_dst;
	struct rtable *rt;			/* Route to the other host */
	int mtu;
	int local, noref = 1;

	if (dest) {
		/* Fast path: reuse the per-dest cached route */
		dest_dst = __ip_vs_dst_check(dest);
		if (likely(dest_dst)) {
			rt = dst_rtable(dest_dst->dst_cache);
			if (ret_saddr)
				*ret_saddr = dest_dst->dst_saddr.ip;
		} else {
			/* Slow path: look up a fresh route and try to
			 * install it in the cache under dest->dst_lock.
			 */
			dest_dst = ip_vs_dest_dst_alloc();
			spin_lock_bh(&dest->dst_lock);
			if (!dest_dst) {
				__ip_vs_dst_set(dest, NULL, NULL, 0);
				spin_unlock_bh(&dest->dst_lock);
				goto err_unreach;
			}
			rt = do_output_route4(net, dest->addr.ip, rt_mode,
					      &dest_dst->dst_saddr.ip);
			if (!rt) {
				__ip_vs_dst_set(dest, NULL, NULL, 0);
				spin_unlock_bh(&dest->dst_lock);
				ip_vs_dest_dst_free(dest_dst);
				goto err_unreach;
			}
			/* It is forbidden to attach dest->dest_dst if
			 * device is going down.
			 */
			if (!rt_dev_is_down(dst_dev_rcu(&rt->dst)))
				__ip_vs_dst_set(dest, dest_dst, &rt->dst, 0);
			else
				noref = 0;	/* not cached: keep our own ref */
			spin_unlock_bh(&dest->dst_lock);
			IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n",
				  &dest->addr.ip, &dest_dst->dst_saddr.ip,
				  rcuref_read(&rt->dst.__rcuref));
			if (ret_saddr)
				*ret_saddr = dest_dst->dst_saddr.ip;
			/* Cache rejected the entry: free it, we still use rt */
			if (!noref)
				ip_vs_dest_dst_free(dest_dst);
		}
	} else {
		noref = 0;

		/* For such unconfigured boxes avoid many route lookups
		 * for performance reasons because we do not remember saddr
		 */
		rt_mode &= ~IP_VS_RT_MODE_CONNECT;
		rt = do_output_route4(net, daddr, rt_mode, ret_saddr);
		if (!rt)
			goto err_unreach;
	}

	local = (rt->rt_flags & RTCF_LOCAL) ? 1 : 0;
	if (unlikely(crosses_local_route_boundary(skb_af, skb, rt_mode,
						  local))) {
		IP_VS_DBG_RL("We are crossing local and non-local addresses"
			     " daddr=%pI4\n", &daddr);
		goto err_put;
	}

	if (unlikely(local)) {
		/* skb to local stack, preserve old route */
		if (!noref)
			ip_rt_put(rt);
		return local;
	}

	if (!decrement_ttl(ipvs, skb_af, skb))
		goto err_put;

	if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) {
		mtu = dst_mtu(&rt->dst);
	} else {
		/* Tunnel mode: shrink the usable MTU by the outer IP
		 * header plus the per-tunnel-type encapsulation overhead.
		 */
		mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
		if (!dest)
			goto err_put;
		if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
			mtu -= sizeof(struct udphdr) + sizeof(struct guehdr);
			if ((dest->tun_flags &
			     IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
			    skb->ip_summed == CHECKSUM_PARTIAL)
				mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
		} else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
			IP_TUNNEL_DECLARE_FLAGS(tflags) = { };

			if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
				__set_bit(IP_TUNNEL_CSUM_BIT, tflags);
			mtu -= gre_calc_hlen(tflags);
		}
		/* 68 is the minimum IPv4 MTU per RFC 791 — TODO confirm
		 * that is the rationale for this bound
		 */
		if (mtu < 68) {
			IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
			goto err_put;
		}
		maybe_update_pmtu(skb_af, skb, mtu);
	}

	if (!ensure_mtu_is_adequate(ipvs, skb_af, rt_mode, ipvsh, skb, mtu))
		goto err_put;

	/* Replace skb's route: without a ref when rt stays in the RCU-
	 * protected cache, with a ref otherwise.
	 */
	skb_dst_drop(skb);
	if (noref)
		skb_dst_set_noref(skb, &rt->dst);
	else
		skb_dst_set(skb, &rt->dst);

	return local;

err_put:
	if (!noref)
		ip_rt_put(rt);
	return -1;

err_unreach:
	/* dst_link_failure needs a device to report against */
	if (!skb->dev)
		skb->dev = skb_dst(skb)->dev;

	dst_link_failure(skb);
	return -1;
}
433
434 #ifdef CONFIG_IP_VS_IPV6
/* IPv6 route lookup towards @daddr.  On success returns the dst (with a
 * reference) and, when @ret_saddr is non-NULL, fills in the selected
 * source address; @do_xfrm additionally resolves IPsec transformations.
 * Returns NULL on failure (no reference held).
 */
static struct dst_entry *
__ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr,
			struct in6_addr *ret_saddr, int do_xfrm, int rt_mode)
{
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.daddr = *daddr,
	};

	if (rt_mode & IP_VS_RT_MODE_KNOWN_NH)
		fl6.flowi6_flags = FLOWI_FLAG_KNOWN_NH;

	dst = ip6_route_output(net, NULL, &fl6);
	if (dst->error)
		goto out_err;
	/* Caller does not need saddr: skip source selection and xfrm */
	if (!ret_saddr)
		return dst;
	if (ipv6_addr_any(&fl6.saddr) &&
	    ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev,
			       &fl6.daddr, 0, &fl6.saddr) < 0)
		goto out_err;
	if (do_xfrm) {
		dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
		if (IS_ERR(dst)) {
			/* xfrm_lookup consumed the dst; avoid double release */
			dst = NULL;
			goto out_err;
		}
	}
	*ret_saddr = fl6.saddr;
	return dst;

out_err:
	dst_release(dst);
	IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", daddr);
	return NULL;
}
471
/*
 *	Get route to destination or remote server (IPv6 twin of
 *	__ip_vs_get_out_rt).  Returns 1 for a local route (old route
 *	preserved), 0 when skb was rerouted, -1 on error (caller frees
 *	skb).  noref == 1 means skb uses the RCU-cached dst without a
 *	reference.
 */
static int
__ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
		      struct ip_vs_dest *dest,
		      struct in6_addr *daddr, struct in6_addr *ret_saddr,
		      struct ip_vs_iphdr *ipvsh, int do_xfrm, int rt_mode)
{
	struct net *net = ipvs->net;
	struct ip_vs_dest_dst *dest_dst;
	struct rt6_info *rt;			/* Route to the other host */
	struct dst_entry *dst;
	int mtu;
	int local, noref = 1;

	if (dest) {
		/* Fast path: per-dest route cache */
		dest_dst = __ip_vs_dst_check(dest);
		if (likely(dest_dst)) {
			rt = dst_rt6_info(dest_dst->dst_cache);
			if (ret_saddr)
				*ret_saddr = dest_dst->dst_saddr.in6;
		} else {
			u32 cookie;

			/* Slow path: fresh lookup, install under dst_lock */
			dest_dst = ip_vs_dest_dst_alloc();
			spin_lock_bh(&dest->dst_lock);
			if (!dest_dst) {
				__ip_vs_dst_set(dest, NULL, NULL, 0);
				spin_unlock_bh(&dest->dst_lock);
				goto err_unreach;
			}
			dst = __ip_vs_route_output_v6(net, &dest->addr.in6,
						      &dest_dst->dst_saddr.in6,
						      do_xfrm, rt_mode);
			if (!dst) {
				__ip_vs_dst_set(dest, NULL, NULL, 0);
				spin_unlock_bh(&dest->dst_lock);
				ip_vs_dest_dst_free(dest_dst);
				goto err_unreach;
			}
			rt = dst_rt6_info(dst);
			/* cookie lets __ip_vs_dst_check revalidate later */
			cookie = rt6_get_cookie(rt);
			/* It is forbidden to attach dest->dest_dst if
			 * device is going down.
			 */
			if (!rt_dev_is_down(dst_dev_rcu(&rt->dst)))
				__ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie);
			else
				noref = 0;	/* not cached: keep our own ref */
			spin_unlock_bh(&dest->dst_lock);
			IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
				  &dest->addr.in6, &dest_dst->dst_saddr.in6,
				  rcuref_read(&rt->dst.__rcuref));
			if (ret_saddr)
				*ret_saddr = dest_dst->dst_saddr.in6;
			if (!noref)
				ip_vs_dest_dst_free(dest_dst);
		}
	} else {
		noref = 0;
		dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm,
					      rt_mode);
		if (!dst)
			goto err_unreach;
		rt = dst_rt6_info(dst);
	}

	local = __ip_vs_is_local_route6(rt);

	if (unlikely(crosses_local_route_boundary(skb_af, skb, rt_mode,
						  local))) {
		IP_VS_DBG_RL("We are crossing local and non-local addresses"
			     " daddr=%pI6\n", daddr);
		goto err_put;
	}

	if (unlikely(local)) {
		/* skb to local stack, preserve old route */
		if (!noref)
			dst_release(&rt->dst);
		return local;
	}

	if (!decrement_ttl(ipvs, skb_af, skb))
		goto err_put;

	/* MTU checking */
	if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL)))
		mtu = dst_mtu(&rt->dst);
	else {
		/* Tunnel mode: subtract outer header + encap overhead */
		mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
		if (!dest)
			goto err_put;
		if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
			mtu -= sizeof(struct udphdr) + sizeof(struct guehdr);
			if ((dest->tun_flags &
			     IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
			    skb->ip_summed == CHECKSUM_PARTIAL)
				mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
		} else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
			IP_TUNNEL_DECLARE_FLAGS(tflags) = { };

			if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
				__set_bit(IP_TUNNEL_CSUM_BIT, tflags);
			mtu -= gre_calc_hlen(tflags);
		}
		if (mtu < IPV6_MIN_MTU) {
			IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
				     IPV6_MIN_MTU);
			goto err_put;
		}
		maybe_update_pmtu(skb_af, skb, mtu);
	}

	if (!ensure_mtu_is_adequate(ipvs, skb_af, rt_mode, ipvsh, skb, mtu))
		goto err_put;

	/* Replace skb's route: refless when rt lives in the RCU cache */
	skb_dst_drop(skb);
	if (noref)
		skb_dst_set_noref(skb, &rt->dst);
	else
		skb_dst_set(skb, &rt->dst);

	return local;

err_put:
	if (!noref)
		dst_release(&rt->dst);
	return -1;

err_unreach:
	/* The ip6_link_failure function requires the dev field to be set
	 * in order to get the net (further for the sake of fwmark
	 * reflection).
	 */
	if (!skb->dev)
		skb->dev = skb_dst(skb)->dev;

	dst_link_failure(skb);
	return -1;
}
614 #endif
615
616
617 /* return NF_ACCEPT to allow forwarding or other NF_xxx on error */
ip_vs_tunnel_xmit_prepare(struct sk_buff * skb,struct ip_vs_conn * cp)618 static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb,
619 struct ip_vs_conn *cp)
620 {
621 int ret = NF_ACCEPT;
622
623 skb->ipvs_property = 1;
624 if (unlikely(cp->flags & IP_VS_CONN_F_NFCT))
625 ret = ip_vs_confirm_conntrack(skb);
626 if (ret == NF_ACCEPT) {
627 nf_reset_ct(skb);
628 skb_forward_csum(skb);
629 if (skb->dev)
630 skb_clear_tstamp(skb);
631 }
632 return ret;
633 }
634
635 /* In the event of a remote destination, it's possible that we would have
636 * matches against an old socket (particularly a TIME-WAIT socket). This
637 * causes havoc down the line (ip_local_out et. al. expect regular sockets
638 * and invalid memory accesses will happen) so simply drop the association
639 * in this case.
640 */
ip_vs_drop_early_demux_sk(struct sk_buff * skb)641 static inline void ip_vs_drop_early_demux_sk(struct sk_buff *skb)
642 {
643 /* If dev is set, the packet came from the LOCAL_IN callback and
644 * not from a local TCP socket.
645 */
646 if (skb->dev)
647 skb_orphan(skb);
648 }
649
650 /* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */
ip_vs_nat_send_or_cont(int pf,struct sk_buff * skb,struct ip_vs_conn * cp,int local)651 static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb,
652 struct ip_vs_conn *cp, int local)
653 {
654 int ret = NF_STOLEN;
655
656 skb->ipvs_property = 1;
657 if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
658 ip_vs_notrack(skb);
659 else
660 ip_vs_update_conntrack(skb, cp, 1);
661
662 /* Remove the early_demux association unless it's bound for the
663 * exact same port and address on this host after translation.
664 */
665 if (!local || cp->vport != cp->dport ||
666 !ip_vs_addr_equal(cp->af, &cp->vaddr, &cp->daddr))
667 ip_vs_drop_early_demux_sk(skb);
668
669 if (!local) {
670 skb_forward_csum(skb);
671 if (skb->dev)
672 skb_clear_tstamp(skb);
673 NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb,
674 NULL, skb_dst(skb)->dev, dst_output);
675 } else
676 ret = NF_ACCEPT;
677
678 return ret;
679 }
680
681 /* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */
ip_vs_send_or_cont(int pf,struct sk_buff * skb,struct ip_vs_conn * cp,int local)682 static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb,
683 struct ip_vs_conn *cp, int local)
684 {
685 int ret = NF_STOLEN;
686
687 skb->ipvs_property = 1;
688 if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
689 ip_vs_notrack(skb);
690 if (!local) {
691 ip_vs_drop_early_demux_sk(skb);
692 skb_forward_csum(skb);
693 if (skb->dev)
694 skb_clear_tstamp(skb);
695 NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb,
696 NULL, skb_dst(skb)->dev, dst_output);
697 } else
698 ret = NF_ACCEPT;
699 return ret;
700 }
701
702
703 /*
704 * NULL transmitter (do nothing except return NF_ACCEPT)
705 */
706 int
ip_vs_null_xmit(struct sk_buff * skb,struct ip_vs_conn * cp,struct ip_vs_protocol * pp,struct ip_vs_iphdr * ipvsh)707 ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
708 struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
709 {
710 /* we do not touch skb and do not need pskb ptr */
711 return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
712 }
713
714
715 /*
716 * Bypass transmitter
717 * Let packets bypass the destination when the destination is not
718 * available, it may be only used in transparent cache cluster.
719 */
720 int
ip_vs_bypass_xmit(struct sk_buff * skb,struct ip_vs_conn * cp,struct ip_vs_protocol * pp,struct ip_vs_iphdr * ipvsh)721 ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
722 struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
723 {
724 struct iphdr *iph = ip_hdr(skb);
725
726 if (__ip_vs_get_out_rt(cp->ipvs, cp->af, skb, NULL, iph->daddr,
727 IP_VS_RT_MODE_NON_LOCAL, NULL, ipvsh) < 0)
728 goto tx_error;
729
730 ip_send_check(iph);
731
732 /* Another hack: avoid icmp_send in ip_fragment */
733 skb->ignore_df = 1;
734
735 ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);
736
737 return NF_STOLEN;
738
739 tx_error:
740 kfree_skb(skb);
741 return NF_STOLEN;
742 }
743
744 #ifdef CONFIG_IP_VS_IPV6
745 int
ip_vs_bypass_xmit_v6(struct sk_buff * skb,struct ip_vs_conn * cp,struct ip_vs_protocol * pp,struct ip_vs_iphdr * ipvsh)746 ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
747 struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
748 {
749 struct ipv6hdr *iph = ipv6_hdr(skb);
750
751 if (__ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, NULL,
752 &iph->daddr, NULL,
753 ipvsh, 0, IP_VS_RT_MODE_NON_LOCAL) < 0)
754 goto tx_error;
755
756 /* Another hack: avoid icmp_send in ip_fragment */
757 skb->ignore_df = 1;
758
759 ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);
760
761 return NF_STOLEN;
762
763 tx_error:
764 kfree_skb(skb);
765 return NF_STOLEN;
766 }
767 #endif
768
769 /*
770 * NAT transmitter (only for outside-to-inside nat forwarding)
771 * Not used for related ICMP
772 */
/* NAT transmitter for IPv4 (see comment above): rewrite daddr to the
 * real server and send via LOCAL_OUT, or hand a local destination back
 * to the stack.  Returns the verdict from ip_vs_nat_send_or_cont(), or
 * NF_STOLEN after freeing the skb on error.
 */
int
ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
	       struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	struct rtable *rt;		/* Route to the other host */
	int local, rc, was_input;

	/* check if it is a connection of no-client-port */
	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
		__be16 _pt, *p;

		/* Learn the client port from the transport header now
		 * that the first data packet has arrived.
		 */
		p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt);
		if (p == NULL)
			goto tx_error;
		ip_vs_conn_fill_cport(cp, *p);
		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
	}

	/* Remember whether the packet came from the network before the
	 * route is replaced — needed for the loopback-DNAT check below.
	 */
	was_input = rt_is_input_route(skb_rtable(skb));
	local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
				   IP_VS_RT_MODE_LOCAL |
				   IP_VS_RT_MODE_NON_LOCAL |
				   IP_VS_RT_MODE_RDR, NULL, ipvsh);
	if (local < 0)
		goto tx_error;
	rt = skb_rtable(skb);
	/*
	 * Avoid duplicate tuple in reply direction for NAT traffic
	 * to local address when connection is sync-ed
	 */
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
		enum ip_conntrack_info ctinfo;
		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);

		if (ct) {
			IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, ipvsh->off,
					 "ip_vs_nat_xmit(): "
					 "stopping DNAT to local address");
			goto tx_error;
		}
	}
#endif

	/* From world but DNAT to loopback address? */
	if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) {
		IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, ipvsh->off,
				 "ip_vs_nat_xmit(): stopping DNAT to loopback "
				 "address");
		goto tx_error;
	}

	/* copy-on-write the packet before mangling it */
	if (skb_ensure_writable(skb, sizeof(struct iphdr)))
		goto tx_error;

	/* make room for the output device's link-layer header */
	if (skb_cow(skb, rt->dst.dev->hard_header_len))
		goto tx_error;

	/* mangle the packet */
	if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh))
		goto tx_error;
	ip_hdr(skb)->daddr = cp->daddr.ip;
	ip_send_check(ip_hdr(skb));

	IP_VS_DBG_PKT(10, AF_INET, pp, skb, ipvsh->off, "After DNAT");

	/* FIXME: when application helper enlarges the packet and the length
	   is larger than the MTU of outgoing device, there will be still
	   MTU problem. */

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);

	return rc;

tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}
855
856 #ifdef CONFIG_IP_VS_IPV6
/* NAT transmitter for IPv6: rewrite the destination address to the real
 * server and send via LOCAL_OUT, or hand a local destination back to
 * the stack.  Returns the verdict from ip_vs_nat_send_or_cont(), or
 * NF_STOLEN after freeing the skb on error.
 */
int
ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
		  struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	struct rt6_info *rt;		/* Route to the other host */
	int local, rc;

	/* check if it is a connection of no-client-port; only learn the
	 * port from a first (non-)fragment that carries the header
	 */
	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT && !ipvsh->fragoffs)) {
		__be16 _pt, *p;
		p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt);
		if (p == NULL)
			goto tx_error;
		ip_vs_conn_fill_cport(cp, *p);
		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
	}

	local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
				      &cp->daddr.in6,
				      NULL, ipvsh, 0,
				      IP_VS_RT_MODE_LOCAL |
				      IP_VS_RT_MODE_NON_LOCAL |
				      IP_VS_RT_MODE_RDR);
	if (local < 0)
		goto tx_error;
	rt = dst_rt6_info(skb_dst(skb));
	/*
	 * Avoid duplicate tuple in reply direction for NAT traffic
	 * to local address when connection is sync-ed
	 */
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
		enum ip_conntrack_info ctinfo;
		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);

		if (ct) {
			IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, ipvsh->off,
					 "ip_vs_nat_xmit_v6(): "
					 "stopping DNAT to local address");
			goto tx_error;
		}
	}
#endif

	/* From world but DNAT to loopback address? */
	if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
	    ipv6_addr_type(&cp->daddr.in6) & IPV6_ADDR_LOOPBACK) {
		IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, ipvsh->off,
				 "ip_vs_nat_xmit_v6(): "
				 "stopping DNAT to loopback address");
		goto tx_error;
	}

	/* copy-on-write the packet before mangling it */
	if (skb_ensure_writable(skb, sizeof(struct ipv6hdr)))
		goto tx_error;

	/* make room for the output device's link-layer header */
	if (skb_cow(skb, rt->dst.dev->hard_header_len))
		goto tx_error;

	/* mangle the packet; note: no checksum fixup needed for IPv6 */
	if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh))
		goto tx_error;
	ipv6_hdr(skb)->daddr = cp->daddr.in6;

	IP_VS_DBG_PKT(10, AF_INET6, pp, skb, ipvsh->off, "After DNAT");

	/* FIXME: when application helper enlarges the packet and the length
	   is larger than the MTU of outgoing device, there will be still
	   MTU problem. */

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);

	return rc;

tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}
939 #endif
940
941 /* When forwarding a packet, we must ensure that we've got enough headroom
942 * for the encapsulation packet in the skb. This also gives us an
943 * opportunity to figure out what the payload_len, dsfield, ttl, and df
944 * values should be, so that we won't need to look at the old ip header
945 * again
946 */
/* See the header comment above: guarantee @max_headroom of headroom for
 * encapsulation (possibly replacing the skb) and extract next-protocol,
 * payload length, DS field, TTL and DF from the inner header.  Returns
 * the (possibly new) skb, or an ERR_PTR on allocation failure — the
 * original skb is consumed either way.
 */
static struct sk_buff *
ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af,
			   unsigned int max_headroom, __u8 *next_protocol,
			   __u32 *payload_len, __u8 *dsfield, __u8 *ttl,
			   __be16 *df)
{
	struct sk_buff *new_skb = NULL;
	struct iphdr *old_iph = NULL;
	__u8 old_dsfield;
#ifdef CONFIG_IP_VS_IPV6
	struct ipv6hdr *old_ipv6h = NULL;
#endif

	ip_vs_drop_early_demux_sk(skb);

	/* Reallocate when headroom is short or the skb is shared; the
	 * socket ownership must follow to the replacement skb.
	 */
	if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) {
		new_skb = skb_realloc_headroom(skb, max_headroom);
		if (!new_skb)
			goto error;
		if (skb->sk)
			skb_set_owner_w(new_skb, skb->sk);
		consume_skb(skb);
		skb = new_skb;
	}

#ifdef CONFIG_IP_VS_IPV6
	if (skb_af == AF_INET6) {
		old_ipv6h = ipv6_hdr(skb);
		*next_protocol = IPPROTO_IPV6;
		if (payload_len)
			*payload_len =
				ipv6_payload_len(skb, old_ipv6h) +
				sizeof(*old_ipv6h);
		old_dsfield = ipv6_get_dsfield(old_ipv6h);
		*ttl = old_ipv6h->hop_limit;
		/* IPv6 has no DF bit; never set it on the outer header */
		if (df)
			*df = 0;
	} else
#endif
	{
		old_iph = ip_hdr(skb);
		/* Copy DF, reset fragment offset and MF */
		if (df)
			*df = (old_iph->frag_off & htons(IP_DF));
		*next_protocol = IPPROTO_IPIP;

		/* fix old IP header checksum */
		ip_send_check(old_iph);
		old_dsfield = ipv4_get_dsfield(old_iph);
		*ttl = old_iph->ttl;
		if (payload_len)
			*payload_len = skb_ip_totlen(skb);
	}

	/* Implement full-functionality option for ECN encapsulation */
	*dsfield = INET_ECN_encapsulate(old_dsfield, old_dsfield);

	return skb;
error:
	kfree_skb(skb);
	return ERR_PTR(-ENOMEM);
}
1009
__tun_gso_type_mask(int encaps_af,int orig_af)1010 static inline int __tun_gso_type_mask(int encaps_af, int orig_af)
1011 {
1012 switch (encaps_af) {
1013 case AF_INET:
1014 return SKB_GSO_IPXIP4;
1015 case AF_INET6:
1016 return SKB_GSO_IPXIP6;
1017 default:
1018 return 0;
1019 }
1020 }
1021
/* Push a GUE (Generic UDP Encapsulation) header plus UDP header in
 * front of the current payload and switch *next_protocol to UDP.
 * Optionally emits a GUE private remote-checksum-offload extension when
 * the destination requests it and the skb carries a partial checksum.
 * Returns 0 on success or -EINVAL when the checksum start lies inside
 * the pushed headers.
 */
static int
ipvs_gue_encap(struct net *net, struct sk_buff *skb,
	       struct ip_vs_conn *cp, __u8 *next_protocol)
{
	__be16 dport;
	__be16 sport = udp_flow_src_port(net, skb, 0, 0, false);
	struct udphdr *udph;	/* Our new UDP header */
	struct guehdr *gueh;	/* Our new GUE header */
	size_t hdrlen, optlen = 0;
	void *data;
	bool need_priv = false;

	if ((cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
	    skb->ip_summed == CHECKSUM_PARTIAL) {
		optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
		need_priv = true;
	}

	hdrlen = sizeof(struct guehdr) + optlen;

	skb_push(skb, hdrlen);

	gueh = (struct guehdr *)skb->data;

	gueh->control = 0;
	gueh->version = 0;
	/* hlen counts the optional data in 32-bit words */
	gueh->hlen = optlen >> 2;
	gueh->flags = 0;
	gueh->proto_ctype = *next_protocol;

	data = &gueh[1];

	if (need_priv) {
		__be32 *flags = data;
		u16 csum_start = skb_checksum_start_offset(skb);
		__be16 *pd;

		gueh->flags |= GUE_FLAG_PRIV;
		*flags = 0;
		data += GUE_LEN_PRIV;

		/* Checksum must start after the headers we just pushed */
		if (csum_start < hdrlen)
			return -EINVAL;

		/* Record checksum start/offset relative to the GUE payload
		 * so the receiver can complete the remote checksum.
		 */
		csum_start -= hdrlen;
		pd = data;
		pd[0] = htons(csum_start);
		pd[1] = htons(csum_start + skb->csum_offset);

		if (!skb_is_gso(skb)) {
			skb->ip_summed = CHECKSUM_NONE;
			skb->encapsulation = 0;
		}

		*flags |= GUE_PFLAG_REMCSUM;
		data += GUE_PLEN_REMCSUM;
	}

	skb_push(skb, sizeof(struct udphdr));
	skb_reset_transport_header(skb);

	udph = udp_hdr(skb);

	dport = cp->dest->tun_port;
	udph->dest = dport;
	udph->source = sport;
	udph->len = htons(skb->len);
	udph->check = 0;

	*next_protocol = IPPROTO_UDP;

	return 0;
}
1095
1096 static void
ipvs_gre_encap(struct net * net,struct sk_buff * skb,struct ip_vs_conn * cp,__u8 * next_protocol)1097 ipvs_gre_encap(struct net *net, struct sk_buff *skb,
1098 struct ip_vs_conn *cp, __u8 *next_protocol)
1099 {
1100 __be16 proto = *next_protocol == IPPROTO_IPIP ?
1101 htons(ETH_P_IP) : htons(ETH_P_IPV6);
1102 IP_TUNNEL_DECLARE_FLAGS(tflags) = { };
1103 size_t hdrlen;
1104
1105 if (cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
1106 __set_bit(IP_TUNNEL_CSUM_BIT, tflags);
1107
1108 hdrlen = gre_calc_hlen(tflags);
1109 gre_build_header(skb, hdrlen, tflags, proto, 0, 0);
1110
1111 *next_protocol = IPPROTO_GRE;
1112 }
1113
1114 /*
1115 * IP Tunneling transmitter
1116 *
1117 * This function encapsulates the packet in a new IP packet, its
1118 * destination will be set to cp->daddr. Most code of this function
1119 * is taken from ipip.c.
1120 *
1121 * It is used in VS/TUN cluster. The load balancer selects a real
1122 * server from a cluster based on a scheduling algorithm,
1123 * encapsulates the request packet and forwards it to the selected
1124 * server. For example, all real servers are configured with
1125 * "ifconfig tunl0 <Virtual IP Address> up". When the server receives
 *	the encapsulated packet, it will decapsulate the packet, process
1127 * the request and return the response packets directly to the client
1128 * without passing the load balancer. This can greatly increase the
1129 * scalability of virtual server.
1130 *
1131 * Used for ANY protocol
1132 */
/*
 * IPv4 flavour of the VS/TUN transmitter described above: route to the
 * real server, prepend the configured encapsulation (plain IPIP, GUE
 * over UDP, or GRE) plus a new outer IPv4 header, then send via
 * ip_local_out().  Returns NF_STOLEN on the tunnel path (skb consumed),
 * or the verdict of ip_vs_send_or_cont() for a local destination.
 */
int
ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
		  struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	struct netns_ipvs *ipvs = cp->ipvs;
	struct net *net = ipvs->net;
	struct rtable *rt;			/* Route to the other host */
	__be32 saddr;				/* Source for tunnel */
	struct net_device *tdev;		/* Device to other host */
	__u8 next_protocol = 0;
	__u8 dsfield = 0;
	__u8 ttl = 0;
	__be16 df = 0;
	__be16 *dfp = NULL;
	struct iphdr *iph;			/* Our new IP header */
	unsigned int max_headroom;		/* The extra header space needed */
	int ret, local;
	int tun_type, gso_type;
	int tun_flags;

	local = __ip_vs_get_out_rt(ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
				   IP_VS_RT_MODE_LOCAL |
				   IP_VS_RT_MODE_NON_LOCAL |
				   IP_VS_RT_MODE_CONNECT |
				   IP_VS_RT_MODE_TUNNEL, &saddr, ipvsh);
	if (local < 0)
		goto tx_error;
	/* Local destination: no encapsulation, just pass the packet on */
	if (local)
		return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);

	rt = skb_rtable(skb);
	tdev = rt->dst.dev;

	/*
	 * Okay, now see if we can stuff it in the buffer as-is.
	 */
	max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);

	tun_type = cp->dest->tun_type;
	tun_flags = cp->dest->tun_flags;

	/* Account for the extra headroom the chosen encapsulation needs */
	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
		size_t gue_hdrlen, gue_optlen = 0;

		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
		    skb->ip_summed == CHECKSUM_PARTIAL) {
			gue_optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
		}
		gue_hdrlen = sizeof(struct guehdr) + gue_optlen;

		max_headroom += sizeof(struct udphdr) + gue_hdrlen;
	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
		IP_TUNNEL_DECLARE_FLAGS(tflags) = { };
		size_t gre_hdrlen;

		if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
			__set_bit(IP_TUNNEL_CSUM_BIT, tflags);
		gre_hdrlen = gre_calc_hlen(tflags);

		max_headroom += gre_hdrlen;
	}

	/* We only care about the df field if sysctl_pmtu_disc(ipvs) is set */
	dfp = sysctl_pmtu_disc(ipvs) ? &df : NULL;
	skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom,
					 &next_protocol, NULL, &dsfield,
					 &ttl, dfp);
	if (IS_ERR(skb))
		return NF_STOLEN;	/* skb already freed on error */

	/* Build the GSO type mask matching the encapsulation in use */
	gso_type = __tun_gso_type_mask(AF_INET, cp->af);
	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
		    (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
			gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
		else
			gso_type |= SKB_GSO_UDP_TUNNEL;
		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
		    skb->ip_summed == CHECKSUM_PARTIAL) {
			gso_type |= SKB_GSO_TUNNEL_REMCSUM;
		}
	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
		if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
			gso_type |= SKB_GSO_GRE_CSUM;
		else
			gso_type |= SKB_GSO_GRE;
	}

	if (iptunnel_handle_offloads(skb, gso_type))
		goto tx_error;

	skb->transport_header = skb->network_header;

	skb_set_inner_ipproto(skb, next_protocol);
	skb_set_inner_mac_header(skb, skb_inner_network_offset(skb));

	/* Push the UDP/GUE or GRE encapsulation, if configured */
	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
		bool check = false;

		if (ipvs_gue_encap(net, skb, cp, &next_protocol))
			goto tx_error;

		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
		    (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
			check = true;

		udp_set_csum(!check, skb, saddr, cp->daddr.ip, skb->len);
	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE)
		ipvs_gre_encap(net, skb, cp, &next_protocol);

	skb_push(skb, sizeof(struct iphdr));
	skb_reset_network_header(skb);
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	/*
	 * Push down and install the IPIP header.
	 */
	iph = ip_hdr(skb);
	iph->version = 4;
	iph->ihl = sizeof(struct iphdr)>>2;
	iph->frag_off = df;		/* DF copied from inner hdr if pmtu_disc */
	iph->protocol = next_protocol;
	iph->tos = dsfield;
	iph->daddr = cp->daddr.ip;
	iph->saddr = saddr;
	iph->ttl = ttl;
	ip_select_ident(net, skb, NULL);

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	ret = ip_vs_tunnel_xmit_prepare(skb, cp);
	if (ret == NF_ACCEPT)
		ip_local_out(net, skb->sk, skb);
	else if (ret == NF_DROP)
		kfree_skb(skb);

	return NF_STOLEN;

  tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}
1276
#ifdef CONFIG_IP_VS_IPV6
/*
 * IPv6 flavour of the VS/TUN transmitter: same scheme as
 * ip_vs_tunnel_xmit() but builds an outer IPv6 header and sends via
 * ip6_local_out().  Returns NF_STOLEN on the tunnel path (skb
 * consumed), or the verdict of ip_vs_send_or_cont() for a local
 * destination.
 */
int
ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
		     struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	struct netns_ipvs *ipvs = cp->ipvs;
	struct net *net = ipvs->net;
	struct rt6_info *rt;		/* Route to the other host */
	struct in6_addr saddr;		/* Source for tunnel */
	struct net_device *tdev;	/* Device to other host */
	__u8 next_protocol = 0;
	__u32 payload_len = 0;
	__u8 dsfield = 0;
	__u8 ttl = 0;
	struct ipv6hdr *iph;		/* Our new IP header */
	unsigned int max_headroom;	/* The extra header space needed */
	int ret, local;
	int tun_type, gso_type;
	int tun_flags;

	local = __ip_vs_get_out_rt_v6(ipvs, cp->af, skb, cp->dest,
				      &cp->daddr.in6,
				      &saddr, ipvsh, 1,
				      IP_VS_RT_MODE_LOCAL |
				      IP_VS_RT_MODE_NON_LOCAL |
				      IP_VS_RT_MODE_TUNNEL);
	if (local < 0)
		goto tx_error;
	/* Local destination: no encapsulation, just pass the packet on */
	if (local)
		return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);

	rt = dst_rt6_info(skb_dst(skb));
	tdev = rt->dst.dev;

	/*
	 * Okay, now see if we can stuff it in the buffer as-is.
	 */
	max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr);

	tun_type = cp->dest->tun_type;
	tun_flags = cp->dest->tun_flags;

	/* Account for the extra headroom the chosen encapsulation needs */
	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
		size_t gue_hdrlen, gue_optlen = 0;

		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
		    skb->ip_summed == CHECKSUM_PARTIAL) {
			gue_optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
		}
		gue_hdrlen = sizeof(struct guehdr) + gue_optlen;

		max_headroom += sizeof(struct udphdr) + gue_hdrlen;
	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
		IP_TUNNEL_DECLARE_FLAGS(tflags) = { };
		size_t gre_hdrlen;

		if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
			__set_bit(IP_TUNNEL_CSUM_BIT, tflags);
		gre_hdrlen = gre_calc_hlen(tflags);

		max_headroom += gre_hdrlen;
	}

	skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom,
					 &next_protocol, &payload_len,
					 &dsfield, &ttl, NULL);
	if (IS_ERR(skb))
		return NF_STOLEN;	/* skb already freed on error */

	/* Build the GSO type mask matching the encapsulation in use */
	gso_type = __tun_gso_type_mask(AF_INET6, cp->af);
	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
		    (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
			gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
		else
			gso_type |= SKB_GSO_UDP_TUNNEL;
		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
		    skb->ip_summed == CHECKSUM_PARTIAL) {
			gso_type |= SKB_GSO_TUNNEL_REMCSUM;
		}
	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
		if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
			gso_type |= SKB_GSO_GRE_CSUM;
		else
			gso_type |= SKB_GSO_GRE;
	}

	if (iptunnel_handle_offloads(skb, gso_type))
		goto tx_error;

	skb->transport_header = skb->network_header;

	skb_set_inner_ipproto(skb, next_protocol);
	skb_set_inner_mac_header(skb, skb_inner_network_offset(skb));

	/* Push the UDP/GUE or GRE encapsulation, if configured */
	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
		bool check = false;

		if (ipvs_gue_encap(net, skb, cp, &next_protocol))
			goto tx_error;

		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
		    (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
			check = true;

		udp6_set_csum(!check, skb, &saddr, &cp->daddr.in6, skb->len);
	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE)
		ipvs_gre_encap(net, skb, cp, &next_protocol);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	/* NOTE(review): IPCB (the IPv4 cb view) is used on the v6 path
	 * too, mirroring the v4 code — this zeroes the cb-resident
	 * options area; confirm intent against upstream history.
	 */
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	/*
	 * Push down and install the IPIP header.
	 */
	iph = ipv6_hdr(skb);
	iph->version = 6;
	iph->nexthdr = next_protocol;
	iph->payload_len = htons(payload_len);
	memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl));
	ipv6_change_dsfield(iph, 0, dsfield);
	iph->daddr = cp->daddr.in6;
	iph->saddr = saddr;
	iph->hop_limit = ttl;

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	ret = ip_vs_tunnel_xmit_prepare(skb, cp);
	if (ret == NF_ACCEPT)
		ip6_local_out(net, skb->sk, skb);
	else if (ret == NF_DROP)
		kfree_skb(skb);

	return NF_STOLEN;

tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}
#endif
1419
1420
1421 /*
1422 * Direct Routing transmitter
1423 * Used for ANY protocol
1424 */
1425 int
ip_vs_dr_xmit(struct sk_buff * skb,struct ip_vs_conn * cp,struct ip_vs_protocol * pp,struct ip_vs_iphdr * ipvsh)1426 ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
1427 struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
1428 {
1429 int local;
1430
1431 local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
1432 IP_VS_RT_MODE_LOCAL |
1433 IP_VS_RT_MODE_NON_LOCAL |
1434 IP_VS_RT_MODE_KNOWN_NH, NULL, ipvsh);
1435 if (local < 0)
1436 goto tx_error;
1437 if (local)
1438 return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
1439
1440 ip_send_check(ip_hdr(skb));
1441
1442 /* Another hack: avoid icmp_send in ip_fragment */
1443 skb->ignore_df = 1;
1444
1445 ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);
1446
1447 return NF_STOLEN;
1448
1449 tx_error:
1450 kfree_skb(skb);
1451 return NF_STOLEN;
1452 }
1453
#ifdef CONFIG_IP_VS_IPV6
/*
 *      Direct Routing transmitter (IPv6): forward the packet to the
 *      real server without any modification.  Used for ANY protocol.
 */
int
ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
		 struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	int rt_res;

	rt_res = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
				       &cp->daddr.in6,
				       NULL, ipvsh, 0,
				       IP_VS_RT_MODE_LOCAL |
				       IP_VS_RT_MODE_NON_LOCAL |
				       IP_VS_RT_MODE_KNOWN_NH);
	if (rt_res < 0) {
		kfree_skb(skb);
		return NF_STOLEN;
	}
	if (rt_res)
		return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);

	return NF_STOLEN;
}
#endif
1484
1485
1486 /*
1487 * ICMP packet transmitter
1488 * called by the ip_vs_in_icmp
1489 */
/*
 * IPv4 ICMP transmitter, called by ip_vs_in_icmp().  For non-NAT
 * forwarding methods the packet is handed to the connection's regular
 * packet_xmit (no address translation needed); for VS/NAT the embedded
 * headers are mangled here and the packet re-sent.  @offset is the
 * number of bytes that must be made writable before mangling.
 * Returns an NF_* verdict; on error the skb is freed (NF_STOLEN).
 */
int
ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
		struct ip_vs_protocol *pp, int offset, unsigned int hooknum,
		struct ip_vs_iphdr *iph)
{
	struct rtable *rt;	/* Route to the other host */
	int rc;
	int local;
	int rt_mode, was_input;

	/* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
	   forwarded directly here, because there is no need to
	   translate address/port back */
	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
		if (cp->packet_xmit)
			rc = cp->packet_xmit(skb, cp, pp, iph);
		else
			rc = NF_ACCEPT;
		/* do not touch skb anymore */
		atomic_inc(&cp->in_pkts);
		return rc;
	}

	/*
	 * mangle and send the packet here (only for VS/NAT)
	 */
	was_input = rt_is_input_route(skb_rtable(skb));

	/* LOCALNODE from FORWARD hook is not supported */
	rt_mode = (hooknum != NF_INET_FORWARD) ?
		  IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
		  IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
	local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip, rt_mode,
				   NULL, iph);
	if (local < 0)
		goto tx_error;
	rt = skb_rtable(skb);

	/*
	 * Avoid duplicate tuple in reply direction for NAT traffic
	 * to local address when connection is sync-ed
	 */
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
		enum ip_conntrack_info ctinfo;
		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);

		if (ct) {
			IP_VS_DBG(10, "%s(): "
				  "stopping DNAT to local address %pI4\n",
				  __func__, &cp->daddr.ip);
			goto tx_error;
		}
	}
#endif

	/* From world but DNAT to loopback address? */
	if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) {
		IP_VS_DBG(1, "%s(): "
			  "stopping DNAT to loopback %pI4\n",
			  __func__, &cp->daddr.ip);
		goto tx_error;
	}

	/* copy-on-write the packet before mangling it */
	if (skb_ensure_writable(skb, offset))
		goto tx_error;

	/* make room for the link-layer header before transmit */
	if (skb_cow(skb, rt->dst.dev->hard_header_len))
		goto tx_error;

	ip_vs_nat_icmp(skb, pp, cp, 0);

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	return ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);

  tx_error:
	kfree_skb(skb);
	rc = NF_STOLEN;
	return rc;
}
1573
#ifdef CONFIG_IP_VS_IPV6
/*
 * IPv6 ICMP transmitter, called by the ICMPv6 input path.  Same logic
 * as ip_vs_icmp_xmit(): non-NAT methods forward via the connection's
 * packet_xmit, VS/NAT mangles the embedded headers here.  @offset is
 * the number of bytes that must be made writable before mangling.
 * Returns an NF_* verdict; on error the skb is freed (NF_STOLEN).
 */
int
ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
		   struct ip_vs_protocol *pp, int offset, unsigned int hooknum,
		   struct ip_vs_iphdr *ipvsh)
{
	struct rt6_info *rt;	/* Route to the other host */
	int rc;
	int local;
	int rt_mode;

	/* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
	   forwarded directly here, because there is no need to
	   translate address/port back */
	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
		if (cp->packet_xmit)
			rc = cp->packet_xmit(skb, cp, pp, ipvsh);
		else
			rc = NF_ACCEPT;
		/* do not touch skb anymore */
		atomic_inc(&cp->in_pkts);
		return rc;
	}

	/*
	 * mangle and send the packet here (only for VS/NAT)
	 */

	/* LOCALNODE from FORWARD hook is not supported */
	rt_mode = (hooknum != NF_INET_FORWARD) ?
		  IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
		  IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
	local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
				      &cp->daddr.in6, NULL, ipvsh, 0, rt_mode);
	if (local < 0)
		goto tx_error;
	rt = dst_rt6_info(skb_dst(skb));
	/*
	 * Avoid duplicate tuple in reply direction for NAT traffic
	 * to local address when connection is sync-ed
	 */
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
		enum ip_conntrack_info ctinfo;
		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);

		if (ct) {
			IP_VS_DBG(10, "%s(): "
				  "stopping DNAT to local address %pI6\n",
				  __func__, &cp->daddr.in6);
			goto tx_error;
		}
	}
#endif

	/* From world but DNAT to loopback address? */
	if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
	    ipv6_addr_type(&cp->daddr.in6) & IPV6_ADDR_LOOPBACK) {
		IP_VS_DBG(1, "%s(): "
			  "stopping DNAT to loopback %pI6\n",
			  __func__, &cp->daddr.in6);
		goto tx_error;
	}

	/* copy-on-write the packet before mangling it */
	if (skb_ensure_writable(skb, offset))
		goto tx_error;

	/* make room for the link-layer header before transmit */
	if (skb_cow(skb, rt->dst.dev->hard_header_len))
		goto tx_error;

	ip_vs_nat_icmp_v6(skb, pp, cp, 0);

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	return ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);

tx_error:
	kfree_skb(skb);
	rc = NF_STOLEN;
	return rc;
}
#endif
1658