xref: /linux/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c (revision 24168c5e6dfbdd5b414f048f47f75d64533296ca)
1 // SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause
2 /* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
3 
4 #include "vmlinux.h"
5 
6 #include <bpf/bpf_helpers.h>
7 #include <bpf/bpf_endian.h>
8 #include <asm/errno.h>
9 
10 #include "bpf_compiler.h"
11 
12 #define TC_ACT_OK 0
13 #define TC_ACT_SHOT 2
14 
15 #define NSEC_PER_SEC 1000000000L
16 
17 #define ETH_ALEN 6
18 #define ETH_P_IP 0x0800
19 #define ETH_P_IPV6 0x86DD
20 
21 #define tcp_flag_word(tp) (((union tcp_word_hdr *)(tp))->words[3])
22 
23 #define IP_DF 0x4000
24 #define IP_MF 0x2000
25 #define IP_OFFSET 0x1fff
26 
27 #define NEXTHDR_TCP 6
28 
29 #define TCPOPT_NOP 1
30 #define TCPOPT_EOL 0
31 #define TCPOPT_MSS 2
32 #define TCPOPT_WINDOW 3
33 #define TCPOPT_SACK_PERM 4
34 #define TCPOPT_TIMESTAMP 8
35 
36 #define TCPOLEN_MSS 4
37 #define TCPOLEN_WINDOW 3
38 #define TCPOLEN_SACK_PERM 2
39 #define TCPOLEN_TIMESTAMP 10
40 
41 #define TCP_TS_HZ 1000
42 #define TS_OPT_WSCALE_MASK 0xf
43 #define TS_OPT_SACK (1 << 4)
44 #define TS_OPT_ECN (1 << 5)
45 #define TSBITS 6
46 #define TSMASK (((__u32)1 << TSBITS) - 1)
47 #define TCP_MAX_WSCALE 14U
48 
49 #define IPV4_MAXLEN 60
50 #define TCP_MAXLEN 60
51 
52 #define DEFAULT_MSS4 1460
53 #define DEFAULT_MSS6 1440
54 #define DEFAULT_WSCALE 7
55 #define DEFAULT_TTL 64
56 #define MAX_ALLOWED_PORTS 8
57 
58 #define MAX_PACKET_OFF 0xffff
59 
60 #define swap(a, b) \
61 	do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
62 
63 #define __get_unaligned_t(type, ptr) ({						\
64 	const struct { type x; } __attribute__((__packed__)) *__pptr = (typeof(__pptr))(ptr); \
65 	__pptr->x;								\
66 })
67 
68 #define get_unaligned(ptr) __get_unaligned_t(typeof(*(ptr)), (ptr))
69 
70 struct {
71 	__uint(type, BPF_MAP_TYPE_ARRAY);
72 	__type(key, __u32);
73 	__type(value, __u64);
74 	__uint(max_entries, 2);
75 } values SEC(".maps");
76 
77 struct {
78 	__uint(type, BPF_MAP_TYPE_ARRAY);
79 	__type(key, __u32);
80 	__type(value, __u16);
81 	__uint(max_entries, MAX_ALLOWED_PORTS);
82 } allowed_ports SEC(".maps");
83 
84 /* Some symbols defined in net/netfilter/nf_conntrack_bpf.c are unavailable in
85  * vmlinux.h if CONFIG_NF_CONNTRACK=m, so they are redefined locally.
86  */
87 
88 struct bpf_ct_opts___local {
89 	s32 netns_id;
90 	s32 error;
91 	u8 l4proto;
92 	u8 dir;
93 	u8 reserved[2];
94 } __attribute__((preserve_access_index));
95 
96 #define BPF_F_CURRENT_NETNS (-1)
97 
98 extern struct nf_conn *bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx,
99 					 struct bpf_sock_tuple *bpf_tuple,
100 					 __u32 len_tuple,
101 					 struct bpf_ct_opts___local *opts,
102 					 __u32 len_opts) __ksym;
103 
104 extern struct nf_conn *bpf_skb_ct_lookup(struct __sk_buff *skb_ctx,
105 					 struct bpf_sock_tuple *bpf_tuple,
106 					 u32 len_tuple,
107 					 struct bpf_ct_opts___local *opts,
108 					 u32 len_opts) __ksym;
109 
110 extern void bpf_ct_release(struct nf_conn *ct) __ksym;
111 
112 static __always_inline void swap_eth_addr(__u8 *a, __u8 *b)
113 {
114 	__u8 tmp[ETH_ALEN];
115 
116 	__builtin_memcpy(tmp, a, ETH_ALEN);
117 	__builtin_memcpy(a, b, ETH_ALEN);
118 	__builtin_memcpy(b, tmp, ETH_ALEN);
119 }
120 
121 static __always_inline __u16 csum_fold(__u32 csum)
122 {
123 	csum = (csum & 0xffff) + (csum >> 16);
124 	csum = (csum & 0xffff) + (csum >> 16);
125 	return (__u16)~csum;
126 }
127 
128 static __always_inline __u16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
129 					       __u32 len, __u8 proto,
130 					       __u32 csum)
131 {
132 	__u64 s = csum;
133 
134 	s += (__u32)saddr;
135 	s += (__u32)daddr;
136 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
137 	s += proto + len;
138 #elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
139 	s += (proto + len) << 8;
140 #else
141 #error Unknown endian
142 #endif
143 	s = (s & 0xffffffff) + (s >> 32);
144 	s = (s & 0xffffffff) + (s >> 32);
145 
146 	return csum_fold((__u32)s);
147 }
148 
149 static __always_inline __u16 csum_ipv6_magic(const struct in6_addr *saddr,
150 					     const struct in6_addr *daddr,
151 					     __u32 len, __u8 proto, __u32 csum)
152 {
153 	__u64 sum = csum;
154 	int i;
155 
156 	__pragma_loop_unroll
157 	for (i = 0; i < 4; i++)
158 		sum += (__u32)saddr->in6_u.u6_addr32[i];
159 
160 	__pragma_loop_unroll
161 	for (i = 0; i < 4; i++)
162 		sum += (__u32)daddr->in6_u.u6_addr32[i];
163 
164 	/* Don't combine additions to avoid 32-bit overflow. */
165 	sum += bpf_htonl(len);
166 	sum += bpf_htonl(proto);
167 
168 	sum = (sum & 0xffffffff) + (sum >> 32);
169 	sum = (sum & 0xffffffff) + (sum >> 32);
170 
171 	return csum_fold((__u32)sum);
172 }
173 
174 static __always_inline __u64 tcp_clock_ns(void)
175 {
176 	return bpf_ktime_get_ns();
177 }
178 
179 static __always_inline __u32 tcp_ns_to_ts(__u64 ns)
180 {
181 	return ns / (NSEC_PER_SEC / TCP_TS_HZ);
182 }
183 
184 static __always_inline __u32 tcp_clock_ms(void)
185 {
186 	return tcp_ns_to_ts(tcp_clock_ns());
187 }
188 
189 struct tcpopt_context {
190 	void *data;
191 	void *data_end;
192 	__be32 *tsecr;
193 	__u8 wscale;
194 	bool option_timestamp;
195 	bool option_sack;
196 	__u32 off;
197 };
198 
199 static __always_inline u8 *next(struct tcpopt_context *ctx, __u32 sz)
200 {
201 	__u64 off = ctx->off;
202 	__u8 *data;
203 
204 	/* Verifier forbids access to packet when offset exceeds MAX_PACKET_OFF */
205 	if (off > MAX_PACKET_OFF - sz)
206 		return NULL;
207 
208 	data = ctx->data + off;
209 	barrier_var(data);
210 	if (data + sz >= ctx->data_end)
211 		return NULL;
212 
213 	ctx->off += sz;
214 	return data;
215 }
216 
217 static int tscookie_tcpopt_parse(struct tcpopt_context *ctx)
218 {
219 	__u8 *opcode, *opsize, *wscale, *tsecr;
220 	__u32 off = ctx->off;
221 
222 	opcode = next(ctx, 1);
223 	if (!opcode)
224 		return 1;
225 
226 	if (*opcode == TCPOPT_EOL)
227 		return 1;
228 	if (*opcode == TCPOPT_NOP)
229 		return 0;
230 
231 	opsize = next(ctx, 1);
232 	if (!opsize || *opsize < 2)
233 		return 1;
234 
235 	switch (*opcode) {
236 	case TCPOPT_WINDOW:
237 		wscale = next(ctx, 1);
238 		if (!wscale)
239 			return 1;
240 		if (*opsize == TCPOLEN_WINDOW)
241 			ctx->wscale = *wscale < TCP_MAX_WSCALE ? *wscale : TCP_MAX_WSCALE;
242 		break;
243 	case TCPOPT_TIMESTAMP:
244 		tsecr = next(ctx, 4);
245 		if (!tsecr)
246 			return 1;
247 		if (*opsize == TCPOLEN_TIMESTAMP) {
248 			ctx->option_timestamp = true;
249 			/* Client's tsval becomes our tsecr. */
250 			*ctx->tsecr = get_unaligned((__be32 *)tsecr);
251 		}
252 		break;
253 	case TCPOPT_SACK_PERM:
254 		if (*opsize == TCPOLEN_SACK_PERM)
255 			ctx->option_sack = true;
256 		break;
257 	}
258 
259 	ctx->off = off + *opsize;
260 
261 	return 0;
262 }
263 
264 static int tscookie_tcpopt_parse_batch(__u32 index, void *context)
265 {
266 	int i;
267 
268 	for (i = 0; i < 7; i++)
269 		if (tscookie_tcpopt_parse(context))
270 			return 1;
271 	return 0;
272 }
273 
274 static __always_inline bool tscookie_init(struct tcphdr *tcp_header,
275 					  __u16 tcp_len, __be32 *tsval,
276 					  __be32 *tsecr, void *data, void *data_end)
277 {
278 	struct tcpopt_context loop_ctx = {
279 		.data = data,
280 		.data_end = data_end,
281 		.tsecr = tsecr,
282 		.wscale = TS_OPT_WSCALE_MASK,
283 		.option_timestamp = false,
284 		.option_sack = false,
285 		/* Note: currently verifier would track .off as unbound scalar.
286 		 *       In case if verifier would at some point get smarter and
287 		 *       compute bounded value for this var, beware that it might
288 		 *       hinder bpf_loop() convergence validation.
289 		 */
290 		.off = (__u8 *)(tcp_header + 1) - (__u8 *)data,
291 	};
292 	u32 cookie;
293 
294 	bpf_loop(6, tscookie_tcpopt_parse_batch, &loop_ctx, 0);
295 
296 	if (!loop_ctx.option_timestamp)
297 		return false;
298 
299 	cookie = tcp_clock_ms() & ~TSMASK;
300 	cookie |= loop_ctx.wscale & TS_OPT_WSCALE_MASK;
301 	if (loop_ctx.option_sack)
302 		cookie |= TS_OPT_SACK;
303 	if (tcp_header->ece && tcp_header->cwr)
304 		cookie |= TS_OPT_ECN;
305 	*tsval = bpf_htonl(cookie);
306 
307 	return true;
308 }
309 
310 static __always_inline void values_get_tcpipopts(__u16 *mss, __u8 *wscale,
311 						 __u8 *ttl, bool ipv6)
312 {
313 	__u32 key = 0;
314 	__u64 *value;
315 
316 	value = bpf_map_lookup_elem(&values, &key);
317 	if (value && *value != 0) {
318 		if (ipv6)
319 			*mss = (*value >> 32) & 0xffff;
320 		else
321 			*mss = *value & 0xffff;
322 		*wscale = (*value >> 16) & 0xf;
323 		*ttl = (*value >> 24) & 0xff;
324 		return;
325 	}
326 
327 	*mss = ipv6 ? DEFAULT_MSS6 : DEFAULT_MSS4;
328 	*wscale = DEFAULT_WSCALE;
329 	*ttl = DEFAULT_TTL;
330 }
331 
332 static __always_inline void values_inc_synacks(void)
333 {
334 	__u32 key = 1;
335 	__u64 *value;
336 
337 	value = bpf_map_lookup_elem(&values, &key);
338 	if (value)
339 		__sync_fetch_and_add(value, 1);
340 }
341 
342 static __always_inline bool check_port_allowed(__u16 port)
343 {
344 	__u32 i;
345 
346 	for (i = 0; i < MAX_ALLOWED_PORTS; i++) {
347 		__u32 key = i;
348 		__u16 *value;
349 
350 		value = bpf_map_lookup_elem(&allowed_ports, &key);
351 
352 		if (!value)
353 			break;
354 		/* 0 is a terminator value. Check it first to avoid matching on
355 		 * a forbidden port == 0 and returning true.
356 		 */
357 		if (*value == 0)
358 			break;
359 
360 		if (*value == port)
361 			return true;
362 	}
363 
364 	return false;
365 }
366 
367 struct header_pointers {
368 	struct ethhdr *eth;
369 	struct iphdr *ipv4;
370 	struct ipv6hdr *ipv6;
371 	struct tcphdr *tcp;
372 	__u16 tcp_len;
373 };
374 
375 static __always_inline int tcp_dissect(void *data, void *data_end,
376 				       struct header_pointers *hdr)
377 {
378 	hdr->eth = data;
379 	if (hdr->eth + 1 > data_end)
380 		return XDP_DROP;
381 
382 	switch (bpf_ntohs(hdr->eth->h_proto)) {
383 	case ETH_P_IP:
384 		hdr->ipv6 = NULL;
385 
386 		hdr->ipv4 = (void *)hdr->eth + sizeof(*hdr->eth);
387 		if (hdr->ipv4 + 1 > data_end)
388 			return XDP_DROP;
389 		if (hdr->ipv4->ihl * 4 < sizeof(*hdr->ipv4))
390 			return XDP_DROP;
391 		if (hdr->ipv4->version != 4)
392 			return XDP_DROP;
393 
394 		if (hdr->ipv4->protocol != IPPROTO_TCP)
395 			return XDP_PASS;
396 
397 		hdr->tcp = (void *)hdr->ipv4 + hdr->ipv4->ihl * 4;
398 		break;
399 	case ETH_P_IPV6:
400 		hdr->ipv4 = NULL;
401 
402 		hdr->ipv6 = (void *)hdr->eth + sizeof(*hdr->eth);
403 		if (hdr->ipv6 + 1 > data_end)
404 			return XDP_DROP;
405 		if (hdr->ipv6->version != 6)
406 			return XDP_DROP;
407 
408 		/* XXX: Extension headers are not supported and could circumvent
409 		 * XDP SYN flood protection.
410 		 */
411 		if (hdr->ipv6->nexthdr != NEXTHDR_TCP)
412 			return XDP_PASS;
413 
414 		hdr->tcp = (void *)hdr->ipv6 + sizeof(*hdr->ipv6);
415 		break;
416 	default:
417 		/* XXX: VLANs will circumvent XDP SYN flood protection. */
418 		return XDP_PASS;
419 	}
420 
421 	if (hdr->tcp + 1 > data_end)
422 		return XDP_DROP;
423 	hdr->tcp_len = hdr->tcp->doff * 4;
424 	if (hdr->tcp_len < sizeof(*hdr->tcp))
425 		return XDP_DROP;
426 
427 	return XDP_TX;
428 }
429 
430 static __always_inline int tcp_lookup(void *ctx, struct header_pointers *hdr, bool xdp)
431 {
432 	struct bpf_ct_opts___local ct_lookup_opts = {
433 		.netns_id = BPF_F_CURRENT_NETNS,
434 		.l4proto = IPPROTO_TCP,
435 	};
436 	struct bpf_sock_tuple tup = {};
437 	struct nf_conn *ct;
438 	__u32 tup_size;
439 
440 	if (hdr->ipv4) {
441 		/* TCP doesn't normally use fragments, and XDP can't reassemble
442 		 * them.
443 		 */
444 		if ((hdr->ipv4->frag_off & bpf_htons(IP_DF | IP_MF | IP_OFFSET)) != bpf_htons(IP_DF))
445 			return XDP_DROP;
446 
447 		tup.ipv4.saddr = hdr->ipv4->saddr;
448 		tup.ipv4.daddr = hdr->ipv4->daddr;
449 		tup.ipv4.sport = hdr->tcp->source;
450 		tup.ipv4.dport = hdr->tcp->dest;
451 		tup_size = sizeof(tup.ipv4);
452 	} else if (hdr->ipv6) {
453 		__builtin_memcpy(tup.ipv6.saddr, &hdr->ipv6->saddr, sizeof(tup.ipv6.saddr));
454 		__builtin_memcpy(tup.ipv6.daddr, &hdr->ipv6->daddr, sizeof(tup.ipv6.daddr));
455 		tup.ipv6.sport = hdr->tcp->source;
456 		tup.ipv6.dport = hdr->tcp->dest;
457 		tup_size = sizeof(tup.ipv6);
458 	} else {
459 		/* The verifier can't track that either ipv4 or ipv6 is not
460 		 * NULL.
461 		 */
462 		return XDP_ABORTED;
463 	}
464 	if (xdp)
465 		ct = bpf_xdp_ct_lookup(ctx, &tup, tup_size, &ct_lookup_opts, sizeof(ct_lookup_opts));
466 	else
467 		ct = bpf_skb_ct_lookup(ctx, &tup, tup_size, &ct_lookup_opts, sizeof(ct_lookup_opts));
468 	if (ct) {
469 		unsigned long status = ct->status;
470 
471 		bpf_ct_release(ct);
472 		if (status & IPS_CONFIRMED)
473 			return XDP_PASS;
474 	} else if (ct_lookup_opts.error != -ENOENT) {
475 		return XDP_ABORTED;
476 	}
477 
478 	/* error == -ENOENT || !(status & IPS_CONFIRMED) */
479 	return XDP_TX;
480 }
481 
482 static __always_inline __u8 tcp_mkoptions(__be32 *buf, __be32 *tsopt, __u16 mss,
483 					  __u8 wscale)
484 {
485 	__be32 *start = buf;
486 
487 	*buf++ = bpf_htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss);
488 
489 	if (!tsopt)
490 		return buf - start;
491 
492 	if (tsopt[0] & bpf_htonl(1 << 4))
493 		*buf++ = bpf_htonl((TCPOPT_SACK_PERM << 24) |
494 				   (TCPOLEN_SACK_PERM << 16) |
495 				   (TCPOPT_TIMESTAMP << 8) |
496 				   TCPOLEN_TIMESTAMP);
497 	else
498 		*buf++ = bpf_htonl((TCPOPT_NOP << 24) |
499 				   (TCPOPT_NOP << 16) |
500 				   (TCPOPT_TIMESTAMP << 8) |
501 				   TCPOLEN_TIMESTAMP);
502 	*buf++ = tsopt[0];
503 	*buf++ = tsopt[1];
504 
505 	if ((tsopt[0] & bpf_htonl(0xf)) != bpf_htonl(0xf))
506 		*buf++ = bpf_htonl((TCPOPT_NOP << 24) |
507 				   (TCPOPT_WINDOW << 16) |
508 				   (TCPOLEN_WINDOW << 8) |
509 				   wscale);
510 
511 	return buf - start;
512 }
513 
514 static __always_inline void tcp_gen_synack(struct tcphdr *tcp_header,
515 					   __u32 cookie, __be32 *tsopt,
516 					   __u16 mss, __u8 wscale)
517 {
518 	void *tcp_options;
519 
520 	tcp_flag_word(tcp_header) = TCP_FLAG_SYN | TCP_FLAG_ACK;
521 	if (tsopt && (tsopt[0] & bpf_htonl(1 << 5)))
522 		tcp_flag_word(tcp_header) |= TCP_FLAG_ECE;
523 	tcp_header->doff = 5; /* doff is part of tcp_flag_word. */
524 	swap(tcp_header->source, tcp_header->dest);
525 	tcp_header->ack_seq = bpf_htonl(bpf_ntohl(tcp_header->seq) + 1);
526 	tcp_header->seq = bpf_htonl(cookie);
527 	tcp_header->window = 0;
528 	tcp_header->urg_ptr = 0;
529 	tcp_header->check = 0; /* Calculate checksum later. */
530 
531 	tcp_options = (void *)(tcp_header + 1);
532 	tcp_header->doff += tcp_mkoptions(tcp_options, tsopt, mss, wscale);
533 }
534 
535 static __always_inline void tcpv4_gen_synack(struct header_pointers *hdr,
536 					     __u32 cookie, __be32 *tsopt)
537 {
538 	__u8 wscale;
539 	__u16 mss;
540 	__u8 ttl;
541 
542 	values_get_tcpipopts(&mss, &wscale, &ttl, false);
543 
544 	swap_eth_addr(hdr->eth->h_source, hdr->eth->h_dest);
545 
546 	swap(hdr->ipv4->saddr, hdr->ipv4->daddr);
547 	hdr->ipv4->check = 0; /* Calculate checksum later. */
548 	hdr->ipv4->tos = 0;
549 	hdr->ipv4->id = 0;
550 	hdr->ipv4->ttl = ttl;
551 
552 	tcp_gen_synack(hdr->tcp, cookie, tsopt, mss, wscale);
553 
554 	hdr->tcp_len = hdr->tcp->doff * 4;
555 	hdr->ipv4->tot_len = bpf_htons(sizeof(*hdr->ipv4) + hdr->tcp_len);
556 }
557 
558 static __always_inline void tcpv6_gen_synack(struct header_pointers *hdr,
559 					     __u32 cookie, __be32 *tsopt)
560 {
561 	__u8 wscale;
562 	__u16 mss;
563 	__u8 ttl;
564 
565 	values_get_tcpipopts(&mss, &wscale, &ttl, true);
566 
567 	swap_eth_addr(hdr->eth->h_source, hdr->eth->h_dest);
568 
569 	swap(hdr->ipv6->saddr, hdr->ipv6->daddr);
570 	*(__be32 *)hdr->ipv6 = bpf_htonl(0x60000000);
571 	hdr->ipv6->hop_limit = ttl;
572 
573 	tcp_gen_synack(hdr->tcp, cookie, tsopt, mss, wscale);
574 
575 	hdr->tcp_len = hdr->tcp->doff * 4;
576 	hdr->ipv6->payload_len = bpf_htons(hdr->tcp_len);
577 }
578 
579 static __always_inline int syncookie_handle_syn(struct header_pointers *hdr,
580 						void *ctx,
581 						void *data, void *data_end,
582 						bool xdp)
583 {
584 	__u32 old_pkt_size, new_pkt_size;
585 	/* Unlike clang 10, clang 11 and 12 generate code that doesn't pass the
586 	 * BPF verifier if tsopt is not volatile. Volatile forces it to store
587 	 * the pointer value and use it directly, otherwise tcp_mkoptions is
588 	 * (mis)compiled like this:
589 	 *   if (!tsopt)
590 	 *       return buf - start;
591 	 *   reg = stored_return_value_of_tscookie_init;
592 	 *   if (reg)
593 	 *       tsopt = tsopt_buf;
594 	 *   else
595 	 *       tsopt = NULL;
596 	 *   ...
597 	 *   *buf++ = tsopt[1];
598 	 * It creates a dead branch where tsopt is assigned NULL, but the
599 	 * verifier can't prove it's dead and blocks the program.
600 	 */
601 	__be32 * volatile tsopt = NULL;
602 	__be32 tsopt_buf[2] = {};
603 	__u16 ip_len;
604 	__u32 cookie;
605 	__s64 value;
606 
607 	/* Checksum is not yet verified, but both checksum failure and TCP
608 	 * header checks return XDP_DROP, so the order doesn't matter.
609 	 */
610 	if (hdr->tcp->fin || hdr->tcp->rst)
611 		return XDP_DROP;
612 
613 	/* Issue SYN cookies on allowed ports, drop SYN packets on blocked
614 	 * ports.
615 	 */
616 	if (!check_port_allowed(bpf_ntohs(hdr->tcp->dest)))
617 		return XDP_DROP;
618 
619 	if (hdr->ipv4) {
620 		/* Check the IPv4 and TCP checksums before creating a SYNACK. */
621 		value = bpf_csum_diff(0, 0, (void *)hdr->ipv4, hdr->ipv4->ihl * 4, 0);
622 		if (value < 0)
623 			return XDP_ABORTED;
624 		if (csum_fold(value) != 0)
625 			return XDP_DROP; /* Bad IPv4 checksum. */
626 
627 		value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0);
628 		if (value < 0)
629 			return XDP_ABORTED;
630 		if (csum_tcpudp_magic(hdr->ipv4->saddr, hdr->ipv4->daddr,
631 				      hdr->tcp_len, IPPROTO_TCP, value) != 0)
632 			return XDP_DROP; /* Bad TCP checksum. */
633 
634 		ip_len = sizeof(*hdr->ipv4);
635 
636 		value = bpf_tcp_raw_gen_syncookie_ipv4(hdr->ipv4, hdr->tcp,
637 						       hdr->tcp_len);
638 	} else if (hdr->ipv6) {
639 		/* Check the TCP checksum before creating a SYNACK. */
640 		value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0);
641 		if (value < 0)
642 			return XDP_ABORTED;
643 		if (csum_ipv6_magic(&hdr->ipv6->saddr, &hdr->ipv6->daddr,
644 				    hdr->tcp_len, IPPROTO_TCP, value) != 0)
645 			return XDP_DROP; /* Bad TCP checksum. */
646 
647 		ip_len = sizeof(*hdr->ipv6);
648 
649 		value = bpf_tcp_raw_gen_syncookie_ipv6(hdr->ipv6, hdr->tcp,
650 						       hdr->tcp_len);
651 	} else {
652 		return XDP_ABORTED;
653 	}
654 
655 	if (value < 0)
656 		return XDP_ABORTED;
657 	cookie = (__u32)value;
658 
659 	if (tscookie_init((void *)hdr->tcp, hdr->tcp_len,
660 			  &tsopt_buf[0], &tsopt_buf[1], data, data_end))
661 		tsopt = tsopt_buf;
662 
663 	/* Check that there is enough space for a SYNACK. It also covers
664 	 * the check that the destination of the __builtin_memmove below
665 	 * doesn't overflow.
666 	 */
667 	if (data + sizeof(*hdr->eth) + ip_len + TCP_MAXLEN > data_end)
668 		return XDP_ABORTED;
669 
670 	if (hdr->ipv4) {
671 		if (hdr->ipv4->ihl * 4 > sizeof(*hdr->ipv4)) {
672 			struct tcphdr *new_tcp_header;
673 
674 			new_tcp_header = data + sizeof(*hdr->eth) + sizeof(*hdr->ipv4);
675 			__builtin_memmove(new_tcp_header, hdr->tcp, sizeof(*hdr->tcp));
676 			hdr->tcp = new_tcp_header;
677 
678 			hdr->ipv4->ihl = sizeof(*hdr->ipv4) / 4;
679 		}
680 
681 		tcpv4_gen_synack(hdr, cookie, tsopt);
682 	} else if (hdr->ipv6) {
683 		tcpv6_gen_synack(hdr, cookie, tsopt);
684 	} else {
685 		return XDP_ABORTED;
686 	}
687 
688 	/* Recalculate checksums. */
689 	hdr->tcp->check = 0;
690 	value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0);
691 	if (value < 0)
692 		return XDP_ABORTED;
693 	if (hdr->ipv4) {
694 		hdr->tcp->check = csum_tcpudp_magic(hdr->ipv4->saddr,
695 						    hdr->ipv4->daddr,
696 						    hdr->tcp_len,
697 						    IPPROTO_TCP,
698 						    value);
699 
700 		hdr->ipv4->check = 0;
701 		value = bpf_csum_diff(0, 0, (void *)hdr->ipv4, sizeof(*hdr->ipv4), 0);
702 		if (value < 0)
703 			return XDP_ABORTED;
704 		hdr->ipv4->check = csum_fold(value);
705 	} else if (hdr->ipv6) {
706 		hdr->tcp->check = csum_ipv6_magic(&hdr->ipv6->saddr,
707 						  &hdr->ipv6->daddr,
708 						  hdr->tcp_len,
709 						  IPPROTO_TCP,
710 						  value);
711 	} else {
712 		return XDP_ABORTED;
713 	}
714 
715 	/* Set the new packet size. */
716 	old_pkt_size = data_end - data;
717 	new_pkt_size = sizeof(*hdr->eth) + ip_len + hdr->tcp->doff * 4;
718 	if (xdp) {
719 		if (bpf_xdp_adjust_tail(ctx, new_pkt_size - old_pkt_size))
720 			return XDP_ABORTED;
721 	} else {
722 		if (bpf_skb_change_tail(ctx, new_pkt_size, 0))
723 			return XDP_ABORTED;
724 	}
725 
726 	values_inc_synacks();
727 
728 	return XDP_TX;
729 }
730 
731 static __always_inline int syncookie_handle_ack(struct header_pointers *hdr)
732 {
733 	int err;
734 
735 	if (hdr->tcp->rst)
736 		return XDP_DROP;
737 
738 	if (hdr->ipv4)
739 		err = bpf_tcp_raw_check_syncookie_ipv4(hdr->ipv4, hdr->tcp);
740 	else if (hdr->ipv6)
741 		err = bpf_tcp_raw_check_syncookie_ipv6(hdr->ipv6, hdr->tcp);
742 	else
743 		return XDP_ABORTED;
744 	if (err)
745 		return XDP_DROP;
746 
747 	return XDP_PASS;
748 }
749 
750 static __always_inline int syncookie_part1(void *ctx, void *data, void *data_end,
751 					   struct header_pointers *hdr, bool xdp)
752 {
753 	int ret;
754 
755 	ret = tcp_dissect(data, data_end, hdr);
756 	if (ret != XDP_TX)
757 		return ret;
758 
759 	ret = tcp_lookup(ctx, hdr, xdp);
760 	if (ret != XDP_TX)
761 		return ret;
762 
763 	/* Packet is TCP and doesn't belong to an established connection. */
764 
765 	if ((hdr->tcp->syn ^ hdr->tcp->ack) != 1)
766 		return XDP_DROP;
767 
768 	/* Grow the TCP header to TCP_MAXLEN to be able to pass any hdr->tcp_len
769 	 * to bpf_tcp_raw_gen_syncookie_ipv{4,6} and pass the verifier.
770 	 */
771 	if (xdp) {
772 		if (bpf_xdp_adjust_tail(ctx, TCP_MAXLEN - hdr->tcp_len))
773 			return XDP_ABORTED;
774 	} else {
775 		/* Without volatile the verifier throws this error:
776 		 * R9 32-bit pointer arithmetic prohibited
777 		 */
778 		volatile u64 old_len = data_end - data;
779 
780 		if (bpf_skb_change_tail(ctx, old_len + TCP_MAXLEN - hdr->tcp_len, 0))
781 			return XDP_ABORTED;
782 	}
783 
784 	return XDP_TX;
785 }
786 
787 static __always_inline int syncookie_part2(void *ctx, void *data, void *data_end,
788 					   struct header_pointers *hdr, bool xdp)
789 {
790 	if (hdr->ipv4) {
791 		hdr->eth = data;
792 		hdr->ipv4 = (void *)hdr->eth + sizeof(*hdr->eth);
793 		/* IPV4_MAXLEN is needed when calculating checksum.
794 		 * At least sizeof(struct iphdr) is needed here to access ihl.
795 		 */
796 		if ((void *)hdr->ipv4 + IPV4_MAXLEN > data_end)
797 			return XDP_ABORTED;
798 		hdr->tcp = (void *)hdr->ipv4 + hdr->ipv4->ihl * 4;
799 	} else if (hdr->ipv6) {
800 		hdr->eth = data;
801 		hdr->ipv6 = (void *)hdr->eth + sizeof(*hdr->eth);
802 		hdr->tcp = (void *)hdr->ipv6 + sizeof(*hdr->ipv6);
803 	} else {
804 		return XDP_ABORTED;
805 	}
806 
807 	if ((void *)hdr->tcp + TCP_MAXLEN > data_end)
808 		return XDP_ABORTED;
809 
810 	/* We run out of registers, tcp_len gets spilled to the stack, and the
811 	 * verifier forgets its min and max values checked above in tcp_dissect.
812 	 */
813 	hdr->tcp_len = hdr->tcp->doff * 4;
814 	if (hdr->tcp_len < sizeof(*hdr->tcp))
815 		return XDP_ABORTED;
816 
817 	return hdr->tcp->syn ? syncookie_handle_syn(hdr, ctx, data, data_end, xdp) :
818 			       syncookie_handle_ack(hdr);
819 }
820 
821 SEC("xdp")
822 int syncookie_xdp(struct xdp_md *ctx)
823 {
824 	void *data_end = (void *)(long)ctx->data_end;
825 	void *data = (void *)(long)ctx->data;
826 	struct header_pointers hdr;
827 	int ret;
828 
829 	ret = syncookie_part1(ctx, data, data_end, &hdr, true);
830 	if (ret != XDP_TX)
831 		return ret;
832 
833 	data_end = (void *)(long)ctx->data_end;
834 	data = (void *)(long)ctx->data;
835 
836 	return syncookie_part2(ctx, data, data_end, &hdr, true);
837 }
838 
839 SEC("tc")
840 int syncookie_tc(struct __sk_buff *skb)
841 {
842 	void *data_end = (void *)(long)skb->data_end;
843 	void *data = (void *)(long)skb->data;
844 	struct header_pointers hdr;
845 	int ret;
846 
847 	ret = syncookie_part1(skb, data, data_end, &hdr, false);
848 	if (ret != XDP_TX)
849 		return ret == XDP_PASS ? TC_ACT_OK : TC_ACT_SHOT;
850 
851 	data_end = (void *)(long)skb->data_end;
852 	data = (void *)(long)skb->data;
853 
854 	ret = syncookie_part2(skb, data, data_end, &hdr, false);
855 	switch (ret) {
856 	case XDP_PASS:
857 		return TC_ACT_OK;
858 	case XDP_TX:
859 		return bpf_redirect(skb->ifindex, 0);
860 	default:
861 		return TC_ACT_SHOT;
862 	}
863 }
864 
865 char _license[] SEC("license") = "GPL";
866