1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * Linux Socket Filter - Kernel level socket filtering
4 *
5 * Based on the design of the Berkeley Packet Filter. The new
6 * internal format has been designed by PLUMgrid:
7 *
8 * Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
9 *
10 * Authors:
11 *
12 * Jay Schulist <jschlst@samba.org>
13 * Alexei Starovoitov <ast@plumgrid.com>
14 * Daniel Borkmann <dborkman@redhat.com>
15 *
16 * Andi Kleen - Fix a few bad bugs and races.
17 * Kris Katterjohn - Added many additional checks in bpf_check_classic()
18 */
19
20 #include <linux/atomic.h>
21 #include <linux/bpf_verifier.h>
22 #include <linux/module.h>
23 #include <linux/types.h>
24 #include <linux/mm.h>
25 #include <linux/fcntl.h>
26 #include <linux/socket.h>
27 #include <linux/sock_diag.h>
28 #include <linux/in.h>
29 #include <linux/inet.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_packet.h>
32 #include <linux/if_arp.h>
33 #include <linux/gfp.h>
34 #include <net/inet_common.h>
35 #include <net/ip.h>
36 #include <net/protocol.h>
37 #include <net/netlink.h>
38 #include <linux/skbuff.h>
39 #include <linux/skmsg.h>
40 #include <net/sock.h>
41 #include <net/flow_dissector.h>
42 #include <linux/errno.h>
43 #include <linux/timer.h>
44 #include <linux/uaccess.h>
45 #include <linux/unaligned.h>
46 #include <linux/filter.h>
47 #include <linux/ratelimit.h>
48 #include <linux/seccomp.h>
49 #include <linux/if_vlan.h>
50 #include <linux/bpf.h>
51 #include <linux/btf.h>
52 #include <net/sch_generic.h>
53 #include <net/cls_cgroup.h>
54 #include <net/dst_metadata.h>
55 #include <net/dst.h>
56 #include <net/sock_reuseport.h>
57 #include <net/busy_poll.h>
58 #include <net/tcp.h>
59 #include <net/xfrm.h>
60 #include <net/udp.h>
61 #include <linux/bpf_trace.h>
62 #include <net/xdp_sock.h>
63 #include <linux/inetdevice.h>
64 #include <net/inet_hashtables.h>
65 #include <net/inet6_hashtables.h>
66 #include <net/ip_fib.h>
67 #include <net/nexthop.h>
68 #include <net/flow.h>
69 #include <net/arp.h>
70 #include <net/ipv6.h>
71 #include <net/net_namespace.h>
72 #include <linux/seg6_local.h>
73 #include <net/seg6.h>
74 #include <net/seg6_local.h>
75 #include <net/lwtunnel.h>
76 #include <net/ipv6_stubs.h>
77 #include <net/bpf_sk_storage.h>
78 #include <net/transp_v6.h>
79 #include <linux/btf_ids.h>
80 #include <net/tls.h>
81 #include <net/xdp.h>
82 #include <net/mptcp.h>
83 #include <net/netfilter/nf_conntrack_bpf.h>
84 #include <net/netkit.h>
85 #include <linux/un.h>
86 #include <net/xdp_sock_drv.h>
87 #include <net/inet_dscp.h>
88
89 #include "dev.h"
90
91 /* Keep the struct bpf_fib_lookup small so that it fits into a cacheline */
92 static_assert(sizeof(struct bpf_fib_lookup) == 64, "struct bpf_fib_lookup size check");
93
94 static const struct bpf_func_proto *
95 bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog);
96
97 int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len)
98 {
99 if (in_compat_syscall()) {
100 struct compat_sock_fprog f32;
101
102 if (len != sizeof(f32))
103 return -EINVAL;
104 if (copy_from_sockptr(&f32, src, sizeof(f32)))
105 return -EFAULT;
106 memset(dst, 0, sizeof(*dst));
107 dst->len = f32.len;
108 dst->filter = compat_ptr(f32.filter);
109 } else {
110 if (len != sizeof(*dst))
111 return -EINVAL;
112 if (copy_from_sockptr(dst, src, sizeof(*dst)))
113 return -EFAULT;
114 }
115
116 return 0;
117 }
118 EXPORT_SYMBOL_GPL(copy_bpf_fprog_from_user);
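
/* Illustrative only: a rough sketch (not the exact net/core/sock.c code) of
 * how a setsockopt() handler is expected to use this helper, assuming optval
 * and optlen come straight from the SO_ATTACH_FILTER option:
 *
 *	struct sock_fprog fprog;
 *	int err;
 *
 *	err = copy_bpf_fprog_from_user(&fprog, optval, optlen);
 *	if (!err)
 *		err = sk_attach_filter(&fprog, sk);
 *
 * The compat handling above means callers do not need their own
 * compat_sock_fprog conversion.
 */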
119
120 /**
121 * sk_filter_trim_cap - run a packet through a socket filter
122 * @sk: sock associated with &sk_buff
123 * @skb: buffer to filter
124 * @cap: limit on how short the eBPF program may trim the packet
125 * @reason: record drop reason on errors (negative return value)
126 *
127 * Run the eBPF program and then cut skb->data to the correct size returned by
128 * the program. If pkt_len is 0 we toss the packet. If skb->len is smaller
129 * than pkt_len we keep the whole skb->data. This is the socket level
130 * wrapper to bpf_prog_run. It returns 0 if the packet should
131 * be accepted or -EPERM if the packet should be tossed.
132 *
133 */
134 int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb,
135 unsigned int cap, enum skb_drop_reason *reason)
136 {
137 int err;
138 struct sk_filter *filter;
139
140 /*
141 * If the skb was allocated from pfmemalloc reserves, only
142 * allow SOCK_MEMALLOC sockets to use it as this socket is
143 * helping free memory
144 */
145 if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) {
146 NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
147 *reason = SKB_DROP_REASON_PFMEMALLOC;
148 return -ENOMEM;
149 }
150 err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb);
151 if (err) {
152 *reason = SKB_DROP_REASON_SOCKET_FILTER;
153 return err;
154 }
155
156 err = security_sock_rcv_skb(sk, skb);
157 if (err) {
158 *reason = SKB_DROP_REASON_SECURITY_HOOK;
159 return err;
160 }
161
162 rcu_read_lock();
163 filter = rcu_dereference(sk->sk_filter);
164 if (filter) {
165 struct sock *save_sk = skb->sk;
166 unsigned int pkt_len;
167
168 skb->sk = sk;
169 pkt_len = bpf_prog_run_save_cb(filter->prog, skb);
170 skb->sk = save_sk;
171 err = pkt_len ? pskb_trim(skb, max(cap, pkt_len)) : -EPERM;
172 if (err)
173 *reason = SKB_DROP_REASON_SOCKET_FILTER;
174 }
175 rcu_read_unlock();
176
177 return err;
178 }
179 EXPORT_SYMBOL(sk_filter_trim_cap);
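
/* Illustrative only: a minimal sketch of the calling convention, assuming a
 * hypothetical receive path that passes cap == 1 so the filter can never trim
 * the packet below one byte:
 *
 *	enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
 *
 *	if (sk_filter_trim_cap(sk, skb, 1, &reason)) {
 *		kfree_skb_reason(skb, reason);
 *		return -EPERM;
 *	}
 *
 * On success skb may have been trimmed and receive processing continues.
 */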
180
181 BPF_CALL_1(bpf_skb_get_pay_offset, struct sk_buff *, skb)
182 {
183 return skb_get_poff(skb);
184 }
185
186 BPF_CALL_3(bpf_skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x)
187 {
188 struct nlattr *nla;
189
190 if (skb_is_nonlinear(skb))
191 return 0;
192
193 if (skb->len < sizeof(struct nlattr))
194 return 0;
195
196 if (a > skb->len - sizeof(struct nlattr))
197 return 0;
198
199 nla = nla_find((struct nlattr *) &skb->data[a], skb->len - a, x);
200 if (nla)
201 return (void *) nla - (void *) skb->data;
202
203 return 0;
204 }
205
206 BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x)
207 {
208 struct nlattr *nla;
209
210 if (skb_is_nonlinear(skb))
211 return 0;
212
213 if (skb->len < sizeof(struct nlattr))
214 return 0;
215
216 if (a > skb->len - sizeof(struct nlattr))
217 return 0;
218
219 nla = (struct nlattr *) &skb->data[a];
220 if (!nla_ok(nla, skb->len - a))
221 return 0;
222
223 nla = nla_find_nested(nla, x);
224 if (nla)
225 return (void *) nla - (void *) skb->data;
226
227 return 0;
228 }
229
230 static int bpf_skb_load_helper_convert_offset(const struct sk_buff *skb, int offset)
231 {
232 if (likely(offset >= 0))
233 return offset;
234
235 if (offset >= SKF_NET_OFF)
236 return offset - SKF_NET_OFF + skb_network_offset(skb);
237
238 if (offset >= SKF_LL_OFF && skb_mac_header_was_set(skb))
239 return offset - SKF_LL_OFF + skb_mac_offset(skb);
240
241 return INT_MIN;
242 }
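
/* Illustrative only: the negative "magic" offsets come from classic BPF,
 * e.g. a tcpdump-style filter can read the IPv4 protocol byte relative to
 * the network header no matter what the link layer looks like:
 *
 *	struct sock_filter insns[] = {
 *		BPF_STMT(BPF_LD  | BPF_B | BPF_ABS, SKF_NET_OFF + 9),
 *		BPF_STMT(BPF_RET | BPF_K, 0xffff),
 *	};
 *
 * For such a load, this helper maps SKF_NET_OFF + 9 onto
 * skb_network_offset(skb) + 9 before the actual read happens.
 */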
243
244 BPF_CALL_4(bpf_skb_load_helper_8, const struct sk_buff *, skb, const void *,
245 data, int, headlen, int, offset)
246 {
247 u8 tmp;
248 const int len = sizeof(tmp);
249
250 offset = bpf_skb_load_helper_convert_offset(skb, offset);
251 if (offset == INT_MIN)
252 return -EFAULT;
253
254 if (headlen - offset >= len)
255 return *(u8 *)(data + offset);
256 if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
257 return tmp;
258 else
259 return -EFAULT;
260 }
261
262 BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb,
263 int, offset)
264 {
265 return ____bpf_skb_load_helper_8(skb, skb->data, skb->len - skb->data_len,
266 offset);
267 }
268
269 BPF_CALL_4(bpf_skb_load_helper_16, const struct sk_buff *, skb, const void *,
270 data, int, headlen, int, offset)
271 {
272 __be16 tmp;
273 const int len = sizeof(tmp);
274
275 offset = bpf_skb_load_helper_convert_offset(skb, offset);
276 if (offset == INT_MIN)
277 return -EFAULT;
278
279 if (headlen - offset >= len)
280 return get_unaligned_be16(data + offset);
281 if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
282 return be16_to_cpu(tmp);
283 else
284 return -EFAULT;
285 }
286
287 BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb,
288 int, offset)
289 {
290 return ____bpf_skb_load_helper_16(skb, skb->data, skb->len - skb->data_len,
291 offset);
292 }
293
294 BPF_CALL_4(bpf_skb_load_helper_32, const struct sk_buff *, skb, const void *,
295 data, int, headlen, int, offset)
296 {
297 __be32 tmp;
298 const int len = sizeof(tmp);
299
300 offset = bpf_skb_load_helper_convert_offset(skb, offset);
301 if (offset == INT_MIN)
302 return -EFAULT;
303
304 if (headlen - offset >= len)
305 return get_unaligned_be32(data + offset);
306 if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
307 return be32_to_cpu(tmp);
308 else
309 return -EFAULT;
310 }
311
312 BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb,
313 int, offset)
314 {
315 return ____bpf_skb_load_helper_32(skb, skb->data, skb->len - skb->data_len,
316 offset);
317 }
318
319 static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg,
320 struct bpf_insn *insn_buf)
321 {
322 struct bpf_insn *insn = insn_buf;
323
324 switch (skb_field) {
325 case SKF_AD_MARK:
326 BUILD_BUG_ON(sizeof_field(struct sk_buff, mark) != 4);
327
328 *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
329 offsetof(struct sk_buff, mark));
330 break;
331
332 case SKF_AD_PKTTYPE:
333 *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_TYPE_OFFSET);
334 *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, PKT_TYPE_MAX);
335 #ifdef __BIG_ENDIAN_BITFIELD
336 *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 5);
337 #endif
338 break;
339
340 case SKF_AD_QUEUE:
341 BUILD_BUG_ON(sizeof_field(struct sk_buff, queue_mapping) != 2);
342
343 *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
344 offsetof(struct sk_buff, queue_mapping));
345 break;
346
347 case SKF_AD_VLAN_TAG:
348 BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_tci) != 2);
349
350 /* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */
351 *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
352 offsetof(struct sk_buff, vlan_tci));
353 break;
354 case SKF_AD_VLAN_TAG_PRESENT:
355 BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_all) != 4);
356 *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
357 offsetof(struct sk_buff, vlan_all));
358 *insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1);
359 *insn++ = BPF_ALU32_IMM(BPF_MOV, dst_reg, 1);
360 break;
361 }
362
363 return insn - insn_buf;
364 }
365
366 static bool convert_bpf_extensions(struct sock_filter *fp,
367 struct bpf_insn **insnp)
368 {
369 struct bpf_insn *insn = *insnp;
370 u32 cnt;
371
372 switch (fp->k) {
373 case SKF_AD_OFF + SKF_AD_PROTOCOL:
374 BUILD_BUG_ON(sizeof_field(struct sk_buff, protocol) != 2);
375
376 /* A = *(u16 *) (CTX + offsetof(protocol)) */
377 *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
378 offsetof(struct sk_buff, protocol));
379 /* A = ntohs(A) [emitting a nop or swap16] */
380 *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16);
381 break;
382
383 case SKF_AD_OFF + SKF_AD_PKTTYPE:
384 cnt = convert_skb_access(SKF_AD_PKTTYPE, BPF_REG_A, BPF_REG_CTX, insn);
385 insn += cnt - 1;
386 break;
387
388 case SKF_AD_OFF + SKF_AD_IFINDEX:
389 case SKF_AD_OFF + SKF_AD_HATYPE:
390 BUILD_BUG_ON(sizeof_field(struct net_device, ifindex) != 4);
391 BUILD_BUG_ON(sizeof_field(struct net_device, type) != 2);
392
393 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
394 BPF_REG_TMP, BPF_REG_CTX,
395 offsetof(struct sk_buff, dev));
396 /* if (tmp != 0) goto pc + 1 */
397 *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_TMP, 0, 1);
398 *insn++ = BPF_EXIT_INSN();
399 if (fp->k == SKF_AD_OFF + SKF_AD_IFINDEX)
400 *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_TMP,
401 offsetof(struct net_device, ifindex));
402 else
403 *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_TMP,
404 offsetof(struct net_device, type));
405 break;
406
407 case SKF_AD_OFF + SKF_AD_MARK:
408 cnt = convert_skb_access(SKF_AD_MARK, BPF_REG_A, BPF_REG_CTX, insn);
409 insn += cnt - 1;
410 break;
411
412 case SKF_AD_OFF + SKF_AD_RXHASH:
413 BUILD_BUG_ON(sizeof_field(struct sk_buff, hash) != 4);
414
415 *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX,
416 offsetof(struct sk_buff, hash));
417 break;
418
419 case SKF_AD_OFF + SKF_AD_QUEUE:
420 cnt = convert_skb_access(SKF_AD_QUEUE, BPF_REG_A, BPF_REG_CTX, insn);
421 insn += cnt - 1;
422 break;
423
424 case SKF_AD_OFF + SKF_AD_VLAN_TAG:
425 cnt = convert_skb_access(SKF_AD_VLAN_TAG,
426 BPF_REG_A, BPF_REG_CTX, insn);
427 insn += cnt - 1;
428 break;
429
430 case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT:
431 cnt = convert_skb_access(SKF_AD_VLAN_TAG_PRESENT,
432 BPF_REG_A, BPF_REG_CTX, insn);
433 insn += cnt - 1;
434 break;
435
436 case SKF_AD_OFF + SKF_AD_VLAN_TPID:
437 BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_proto) != 2);
438
439 /* A = *(u16 *) (CTX + offsetof(vlan_proto)) */
440 *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
441 offsetof(struct sk_buff, vlan_proto));
442 /* A = ntohs(A) [emitting a nop or swap16] */
443 *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16);
444 break;
445
446 case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
447 case SKF_AD_OFF + SKF_AD_NLATTR:
448 case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
449 case SKF_AD_OFF + SKF_AD_CPU:
450 case SKF_AD_OFF + SKF_AD_RANDOM:
451 /* arg1 = CTX */
452 *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX);
453 /* arg2 = A */
454 *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_A);
455 /* arg3 = X */
456 *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_X);
457 /* Emit call(arg1=CTX, arg2=A, arg3=X) */
458 switch (fp->k) {
459 case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
460 *insn = BPF_EMIT_CALL(bpf_skb_get_pay_offset);
461 break;
462 case SKF_AD_OFF + SKF_AD_NLATTR:
463 *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr);
464 break;
465 case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
466 *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr_nest);
467 break;
468 case SKF_AD_OFF + SKF_AD_CPU:
469 *insn = BPF_EMIT_CALL(bpf_get_raw_cpu_id);
470 break;
471 case SKF_AD_OFF + SKF_AD_RANDOM:
472 *insn = BPF_EMIT_CALL(bpf_user_rnd_u32);
473 bpf_user_rnd_init_once();
474 break;
475 }
476 break;
477
478 case SKF_AD_OFF + SKF_AD_ALU_XOR_X:
479 /* A ^= X */
480 *insn = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_X);
481 break;
482
483 default:
484 /* This is just a dummy call to avoid letting the compiler
485 * evict __bpf_call_base() as an optimization. Placed here
486 * where no-one bothers.
487 */
488 BUG_ON(__bpf_call_base(0, 0, 0, 0, 0) != 0);
489 return false;
490 }
491
492 *insnp = insn;
493 return true;
494 }
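
/* Illustrative only: an example of an "overloaded" classic BPF load that is
 * rewritten by convert_bpf_extensions() instead of being treated as a packet
 * access, here fetching skb->pkt_type via the SKF_AD_PKTTYPE extension:
 *
 *	BPF_STMT(BPF_LD | BPF_W | BPF_ABS, SKF_AD_OFF + SKF_AD_PKTTYPE)
 *
 * Anything with k >= SKF_AD_OFF that is not a known extension has already
 * been rejected by bpf_check_classic().
 */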
495
496 static bool convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp)
497 {
498 const bool unaligned_ok = IS_BUILTIN(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS);
499 int size = bpf_size_to_bytes(BPF_SIZE(fp->code));
500 bool endian = BPF_SIZE(fp->code) == BPF_H ||
501 BPF_SIZE(fp->code) == BPF_W;
502 bool indirect = BPF_MODE(fp->code) == BPF_IND;
503 const int ip_align = NET_IP_ALIGN;
504 struct bpf_insn *insn = *insnp;
505 int offset = fp->k;
506
507 if (!indirect &&
508 ((unaligned_ok && offset >= 0) ||
509 (!unaligned_ok && offset >= 0 &&
510 offset + ip_align >= 0 &&
511 offset + ip_align % size == 0))) {
512 bool ldx_off_ok = offset <= S16_MAX;
513
514 *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H);
515 if (offset)
516 *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset);
517 *insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP,
518 size, 2 + endian + (!ldx_off_ok * 2));
519 if (ldx_off_ok) {
520 *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A,
521 BPF_REG_D, offset);
522 } else {
523 *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_D);
524 *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_TMP, offset);
525 *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A,
526 BPF_REG_TMP, 0);
527 }
528 if (endian)
529 *insn++ = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, size * 8);
530 *insn++ = BPF_JMP_A(8);
531 }
532
533 *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX);
534 *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_D);
535 *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_H);
536 if (!indirect) {
537 *insn++ = BPF_MOV64_IMM(BPF_REG_ARG4, offset);
538 } else {
539 *insn++ = BPF_MOV64_REG(BPF_REG_ARG4, BPF_REG_X);
540 if (fp->k)
541 *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG4, offset);
542 }
543
544 switch (BPF_SIZE(fp->code)) {
545 case BPF_B:
546 *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8);
547 break;
548 case BPF_H:
549 *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16);
550 break;
551 case BPF_W:
552 *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32);
553 break;
554 default:
555 return false;
556 }
557
558 *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_A, 0, 2);
559 *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
560 *insn = BPF_EXIT_INSN();
561
562 *insnp = insn;
563 return true;
564 }
565
566 /**
567 * bpf_convert_filter - convert filter program
568 * @prog: the user passed filter program
569 * @len: the length of the user passed filter program
570 * @new_prog: allocated 'struct bpf_prog' or NULL
571 * @new_len: pointer to store length of converted program
572 * @seen_ld_abs: bool whether we've seen ld_abs/ind
573 *
574 * Remap 'sock_filter' style classic BPF (cBPF) instruction set to 'bpf_insn'
575 * style extended BPF (eBPF).
576 * Conversion workflow:
577 *
578 * 1) First pass for calculating the new program length:
579 * bpf_convert_filter(old_prog, old_len, NULL, &new_len, &seen_ld_abs)
580 *
581 * 2) 2nd pass to remap in two passes: 1st pass finds new
582 * jump offsets, 2nd pass remapping:
583 * bpf_convert_filter(old_prog, old_len, new_prog, &new_len, &seen_ld_abs)
584 */
585 static int bpf_convert_filter(struct sock_filter *prog, int len,
586 struct bpf_prog *new_prog, int *new_len,
587 bool *seen_ld_abs)
588 {
589 int new_flen = 0, pass = 0, target, i, stack_off;
590 struct bpf_insn *new_insn, *first_insn = NULL;
591 struct sock_filter *fp;
592 int *addrs = NULL;
593 u8 bpf_src;
594
595 BUILD_BUG_ON(BPF_MEMWORDS * sizeof(u32) > MAX_BPF_STACK);
596 BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG);
597
598 if (len <= 0 || len > BPF_MAXINSNS)
599 return -EINVAL;
600
601 if (new_prog) {
602 first_insn = new_prog->insnsi;
603 addrs = kcalloc(len, sizeof(*addrs),
604 GFP_KERNEL | __GFP_NOWARN);
605 if (!addrs)
606 return -ENOMEM;
607 }
608
609 do_pass:
610 new_insn = first_insn;
611 fp = prog;
612
613 /* Classic BPF related prologue emission. */
614 if (new_prog) {
615 /* Classic BPF expects A and X to be reset first. These need
616 * to be guaranteed to be the first two instructions.
617 */
618 *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
619 *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_X, BPF_REG_X);
620
621 /* All programs must keep CTX in callee saved BPF_REG_CTX.
622 * In the eBPF case it's done by the compiler; here we need to
623 * do this ourselves. Initial CTX is present in BPF_REG_ARG1.
624 */
625 *new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1);
626 if (*seen_ld_abs) {
627 /* For packet access in classic BPF, cache skb->data
628 * in callee-saved BPF R8 and skb->len - skb->data_len
629 * (headlen) in BPF R9. Since classic BPF is read-only
630 * on CTX, we only need to cache it once.
631 */
632 *new_insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
633 BPF_REG_D, BPF_REG_CTX,
634 offsetof(struct sk_buff, data));
635 *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_H, BPF_REG_CTX,
636 offsetof(struct sk_buff, len));
637 *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_TMP, BPF_REG_CTX,
638 offsetof(struct sk_buff, data_len));
639 *new_insn++ = BPF_ALU32_REG(BPF_SUB, BPF_REG_H, BPF_REG_TMP);
640 }
641 } else {
642 new_insn += 3;
643 }
644
645 for (i = 0; i < len; fp++, i++) {
646 struct bpf_insn tmp_insns[32] = { };
647 struct bpf_insn *insn = tmp_insns;
648
649 if (addrs)
650 addrs[i] = new_insn - first_insn;
651
652 switch (fp->code) {
653 /* All arithmetic insns and skb loads map as-is. */
654 case BPF_ALU | BPF_ADD | BPF_X:
655 case BPF_ALU | BPF_ADD | BPF_K:
656 case BPF_ALU | BPF_SUB | BPF_X:
657 case BPF_ALU | BPF_SUB | BPF_K:
658 case BPF_ALU | BPF_AND | BPF_X:
659 case BPF_ALU | BPF_AND | BPF_K:
660 case BPF_ALU | BPF_OR | BPF_X:
661 case BPF_ALU | BPF_OR | BPF_K:
662 case BPF_ALU | BPF_LSH | BPF_X:
663 case BPF_ALU | BPF_LSH | BPF_K:
664 case BPF_ALU | BPF_RSH | BPF_X:
665 case BPF_ALU | BPF_RSH | BPF_K:
666 case BPF_ALU | BPF_XOR | BPF_X:
667 case BPF_ALU | BPF_XOR | BPF_K:
668 case BPF_ALU | BPF_MUL | BPF_X:
669 case BPF_ALU | BPF_MUL | BPF_K:
670 case BPF_ALU | BPF_DIV | BPF_X:
671 case BPF_ALU | BPF_DIV | BPF_K:
672 case BPF_ALU | BPF_MOD | BPF_X:
673 case BPF_ALU | BPF_MOD | BPF_K:
674 case BPF_ALU | BPF_NEG:
675 case BPF_LD | BPF_ABS | BPF_W:
676 case BPF_LD | BPF_ABS | BPF_H:
677 case BPF_LD | BPF_ABS | BPF_B:
678 case BPF_LD | BPF_IND | BPF_W:
679 case BPF_LD | BPF_IND | BPF_H:
680 case BPF_LD | BPF_IND | BPF_B:
681 /* Check for overloaded BPF extension and
682 * directly convert it if found, otherwise
683 * just move on with mapping.
684 */
685 if (BPF_CLASS(fp->code) == BPF_LD &&
686 BPF_MODE(fp->code) == BPF_ABS &&
687 convert_bpf_extensions(fp, &insn))
688 break;
689 if (BPF_CLASS(fp->code) == BPF_LD &&
690 convert_bpf_ld_abs(fp, &insn)) {
691 *seen_ld_abs = true;
692 break;
693 }
694
695 if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) ||
696 fp->code == (BPF_ALU | BPF_MOD | BPF_X)) {
697 *insn++ = BPF_MOV32_REG(BPF_REG_X, BPF_REG_X);
698 /* Error with exception code on div/mod by 0.
699 * For cBPF programs, this was always return 0.
700 */
701 *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_X, 0, 2);
702 *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
703 *insn++ = BPF_EXIT_INSN();
704 }
705
706 *insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k);
707 break;
708
709 /* Jump transformation cannot use BPF block macros
710 * everywhere as offset calculation and target updates
711 * require a bit more work than the rest, i.e. jump
712 * opcodes map as-is, but offsets need adjustment.
713 */
714
715 #define BPF_EMIT_JMP \
716 do { \
717 const s32 off_min = S16_MIN, off_max = S16_MAX; \
718 s32 off; \
719 \
720 if (target >= len || target < 0) \
721 goto err; \
722 off = addrs ? addrs[target] - addrs[i] - 1 : 0; \
723 /* Adjust pc relative offset for 2nd or 3rd insn. */ \
724 off -= insn - tmp_insns; \
725 /* Reject anything not fitting into insn->off. */ \
726 if (off < off_min || off > off_max) \
727 goto err; \
728 insn->off = off; \
729 } while (0)
730
731 case BPF_JMP | BPF_JA:
732 target = i + fp->k + 1;
733 insn->code = fp->code;
734 BPF_EMIT_JMP;
735 break;
736
737 case BPF_JMP | BPF_JEQ | BPF_K:
738 case BPF_JMP | BPF_JEQ | BPF_X:
739 case BPF_JMP | BPF_JSET | BPF_K:
740 case BPF_JMP | BPF_JSET | BPF_X:
741 case BPF_JMP | BPF_JGT | BPF_K:
742 case BPF_JMP | BPF_JGT | BPF_X:
743 case BPF_JMP | BPF_JGE | BPF_K:
744 case BPF_JMP | BPF_JGE | BPF_X:
745 if (BPF_SRC(fp->code) == BPF_K && (int) fp->k < 0) {
746 /* BPF immediates are signed, zero extend
747 * immediate into tmp register and use it
748 * in compare insn.
749 */
750 *insn++ = BPF_MOV32_IMM(BPF_REG_TMP, fp->k);
751
752 insn->dst_reg = BPF_REG_A;
753 insn->src_reg = BPF_REG_TMP;
754 bpf_src = BPF_X;
755 } else {
756 insn->dst_reg = BPF_REG_A;
757 insn->imm = fp->k;
758 bpf_src = BPF_SRC(fp->code);
759 insn->src_reg = bpf_src == BPF_X ? BPF_REG_X : 0;
760 }
761
762 /* Common case where 'jump_false' is next insn. */
763 if (fp->jf == 0) {
764 insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
765 target = i + fp->jt + 1;
766 BPF_EMIT_JMP;
767 break;
768 }
769
770 /* Convert some jumps when 'jump_true' is next insn. */
771 if (fp->jt == 0) {
772 switch (BPF_OP(fp->code)) {
773 case BPF_JEQ:
774 insn->code = BPF_JMP | BPF_JNE | bpf_src;
775 break;
776 case BPF_JGT:
777 insn->code = BPF_JMP | BPF_JLE | bpf_src;
778 break;
779 case BPF_JGE:
780 insn->code = BPF_JMP | BPF_JLT | bpf_src;
781 break;
782 default:
783 goto jmp_rest;
784 }
785
786 target = i + fp->jf + 1;
787 BPF_EMIT_JMP;
788 break;
789 }
790 jmp_rest:
791 /* Other jumps are mapped into two insns: Jxx and JA. */
792 target = i + fp->jt + 1;
793 insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
794 BPF_EMIT_JMP;
795 insn++;
796
797 insn->code = BPF_JMP | BPF_JA;
798 target = i + fp->jf + 1;
799 BPF_EMIT_JMP;
800 break;
801
802 /* ldxb 4 * ([14] & 0xf) is remapped into 6 insns. */
803 case BPF_LDX | BPF_MSH | BPF_B: {
804 struct sock_filter tmp = {
805 .code = BPF_LD | BPF_ABS | BPF_B,
806 .k = fp->k,
807 };
808
809 *seen_ld_abs = true;
810
811 /* X = A */
812 *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
813 /* A = BPF_R0 = *(u8 *) (skb->data + K) */
814 convert_bpf_ld_abs(&tmp, &insn);
815 insn++;
816 /* A &= 0xf */
817 *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf);
818 /* A <<= 2 */
819 *insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2);
820 /* tmp = X */
821 *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_X);
822 /* X = A */
823 *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
824 /* A = tmp */
825 *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP);
826 break;
827 }
828 /* RET_K is remapped into 2 insns. RET_A case doesn't need an
829 * extra mov as BPF_REG_0 is already mapped into BPF_REG_A.
830 */
831 case BPF_RET | BPF_A:
832 case BPF_RET | BPF_K:
833 if (BPF_RVAL(fp->code) == BPF_K)
834 *insn++ = BPF_MOV32_RAW(BPF_K, BPF_REG_0,
835 0, fp->k);
836 *insn = BPF_EXIT_INSN();
837 break;
838
839 /* Store to stack. */
840 case BPF_ST:
841 case BPF_STX:
842 stack_off = fp->k * 4 + 4;
843 *insn = BPF_STX_MEM(BPF_W, BPF_REG_FP, BPF_CLASS(fp->code) ==
844 BPF_ST ? BPF_REG_A : BPF_REG_X,
845 -stack_off);
846 /* check_load_and_stores() verifies that classic BPF can
847 * load from stack only after write, so tracking
848 * stack_depth for ST|STX insns is enough
849 */
850 if (new_prog && new_prog->aux->stack_depth < stack_off)
851 new_prog->aux->stack_depth = stack_off;
852 break;
853
854 /* Load from stack. */
855 case BPF_LD | BPF_MEM:
856 case BPF_LDX | BPF_MEM:
857 stack_off = fp->k * 4 + 4;
858 *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ?
859 BPF_REG_A : BPF_REG_X, BPF_REG_FP,
860 -stack_off);
861 break;
862
863 /* A = K or X = K */
864 case BPF_LD | BPF_IMM:
865 case BPF_LDX | BPF_IMM:
866 *insn = BPF_MOV32_IMM(BPF_CLASS(fp->code) == BPF_LD ?
867 BPF_REG_A : BPF_REG_X, fp->k);
868 break;
869
870 /* X = A */
871 case BPF_MISC | BPF_TAX:
872 *insn = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
873 break;
874
875 /* A = X */
876 case BPF_MISC | BPF_TXA:
877 *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_X);
878 break;
879
880 /* A = skb->len or X = skb->len */
881 case BPF_LD | BPF_W | BPF_LEN:
882 case BPF_LDX | BPF_W | BPF_LEN:
883 *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ?
884 BPF_REG_A : BPF_REG_X, BPF_REG_CTX,
885 offsetof(struct sk_buff, len));
886 break;
887
888 /* Access seccomp_data fields. */
889 case BPF_LDX | BPF_ABS | BPF_W:
890 /* A = *(u32 *) (ctx + K) */
891 *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, fp->k);
892 break;
893
894 /* Unknown instruction. */
895 default:
896 goto err;
897 }
898
899 insn++;
900 if (new_prog)
901 memcpy(new_insn, tmp_insns,
902 sizeof(*insn) * (insn - tmp_insns));
903 new_insn += insn - tmp_insns;
904 }
905
906 if (!new_prog) {
907 /* Only calculating new length. */
908 *new_len = new_insn - first_insn;
909 if (*seen_ld_abs)
910 *new_len += 4; /* Prologue bits. */
911 return 0;
912 }
913
914 pass++;
915 if (new_flen != new_insn - first_insn) {
916 new_flen = new_insn - first_insn;
917 if (pass > 2)
918 goto err;
919 goto do_pass;
920 }
921
922 kfree(addrs);
923 BUG_ON(*new_len != new_flen);
924 return 0;
925 err:
926 kfree(addrs);
927 return -EINVAL;
928 }
929
930 /* Security:
931 *
932 * As we don't want to clear the mem[] array for each packet going through
933 * __bpf_prog_run(), we check that a filter loaded by the user never tries to read
934 * a cell that was not previously written, and we check all branches to be sure
935 * a malicious user doesn't try to abuse us.
936 */
937 static int check_load_and_stores(const struct sock_filter *filter, int flen)
938 {
939 u16 *masks, memvalid = 0; /* One bit per cell, 16 cells */
940 int pc, ret = 0;
941
942 BUILD_BUG_ON(BPF_MEMWORDS > 16);
943
944 masks = kmalloc_array(flen, sizeof(*masks), GFP_KERNEL);
945 if (!masks)
946 return -ENOMEM;
947
948 memset(masks, 0xff, flen * sizeof(*masks));
949
950 for (pc = 0; pc < flen; pc++) {
951 memvalid &= masks[pc];
952
953 switch (filter[pc].code) {
954 case BPF_ST:
955 case BPF_STX:
956 memvalid |= (1 << filter[pc].k);
957 break;
958 case BPF_LD | BPF_MEM:
959 case BPF_LDX | BPF_MEM:
960 if (!(memvalid & (1 << filter[pc].k))) {
961 ret = -EINVAL;
962 goto error;
963 }
964 break;
965 case BPF_JMP | BPF_JA:
966 /* A jump must set masks on target */
967 masks[pc + 1 + filter[pc].k] &= memvalid;
968 memvalid = ~0;
969 break;
970 case BPF_JMP | BPF_JEQ | BPF_K:
971 case BPF_JMP | BPF_JEQ | BPF_X:
972 case BPF_JMP | BPF_JGE | BPF_K:
973 case BPF_JMP | BPF_JGE | BPF_X:
974 case BPF_JMP | BPF_JGT | BPF_K:
975 case BPF_JMP | BPF_JGT | BPF_X:
976 case BPF_JMP | BPF_JSET | BPF_K:
977 case BPF_JMP | BPF_JSET | BPF_X:
978 /* A jump must set masks on targets */
979 masks[pc + 1 + filter[pc].jt] &= memvalid;
980 masks[pc + 1 + filter[pc].jf] &= memvalid;
981 memvalid = ~0;
982 break;
983 }
984 }
985 error:
986 kfree(masks);
987 return ret;
988 }
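
/* Illustrative only: the kind of filter this check rejects, reading scratch
 * memory cell M[0] that no path has written yet:
 *
 *	struct sock_filter bad[] = {
 *		BPF_STMT(BPF_LD  | BPF_MEM, 0),
 *		BPF_STMT(BPF_RET | BPF_K, 0),
 *	};
 *
 * The first instruction reads M[0] before any store, so the program is
 * rejected with -EINVAL. Inserting BPF_STMT(BPF_ST, 0) (store A into M[0])
 * before the load makes the same program pass.
 */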
989
990 static bool chk_code_allowed(u16 code_to_probe)
991 {
992 static const bool codes[] = {
993 /* 32 bit ALU operations */
994 [BPF_ALU | BPF_ADD | BPF_K] = true,
995 [BPF_ALU | BPF_ADD | BPF_X] = true,
996 [BPF_ALU | BPF_SUB | BPF_K] = true,
997 [BPF_ALU | BPF_SUB | BPF_X] = true,
998 [BPF_ALU | BPF_MUL | BPF_K] = true,
999 [BPF_ALU | BPF_MUL | BPF_X] = true,
1000 [BPF_ALU | BPF_DIV | BPF_K] = true,
1001 [BPF_ALU | BPF_DIV | BPF_X] = true,
1002 [BPF_ALU | BPF_MOD | BPF_K] = true,
1003 [BPF_ALU | BPF_MOD | BPF_X] = true,
1004 [BPF_ALU | BPF_AND | BPF_K] = true,
1005 [BPF_ALU | BPF_AND | BPF_X] = true,
1006 [BPF_ALU | BPF_OR | BPF_K] = true,
1007 [BPF_ALU | BPF_OR | BPF_X] = true,
1008 [BPF_ALU | BPF_XOR | BPF_K] = true,
1009 [BPF_ALU | BPF_XOR | BPF_X] = true,
1010 [BPF_ALU | BPF_LSH | BPF_K] = true,
1011 [BPF_ALU | BPF_LSH | BPF_X] = true,
1012 [BPF_ALU | BPF_RSH | BPF_K] = true,
1013 [BPF_ALU | BPF_RSH | BPF_X] = true,
1014 [BPF_ALU | BPF_NEG] = true,
1015 /* Load instructions */
1016 [BPF_LD | BPF_W | BPF_ABS] = true,
1017 [BPF_LD | BPF_H | BPF_ABS] = true,
1018 [BPF_LD | BPF_B | BPF_ABS] = true,
1019 [BPF_LD | BPF_W | BPF_LEN] = true,
1020 [BPF_LD | BPF_W | BPF_IND] = true,
1021 [BPF_LD | BPF_H | BPF_IND] = true,
1022 [BPF_LD | BPF_B | BPF_IND] = true,
1023 [BPF_LD | BPF_IMM] = true,
1024 [BPF_LD | BPF_MEM] = true,
1025 [BPF_LDX | BPF_W | BPF_LEN] = true,
1026 [BPF_LDX | BPF_B | BPF_MSH] = true,
1027 [BPF_LDX | BPF_IMM] = true,
1028 [BPF_LDX | BPF_MEM] = true,
1029 /* Store instructions */
1030 [BPF_ST] = true,
1031 [BPF_STX] = true,
1032 /* Misc instructions */
1033 [BPF_MISC | BPF_TAX] = true,
1034 [BPF_MISC | BPF_TXA] = true,
1035 /* Return instructions */
1036 [BPF_RET | BPF_K] = true,
1037 [BPF_RET | BPF_A] = true,
1038 /* Jump instructions */
1039 [BPF_JMP | BPF_JA] = true,
1040 [BPF_JMP | BPF_JEQ | BPF_K] = true,
1041 [BPF_JMP | BPF_JEQ | BPF_X] = true,
1042 [BPF_JMP | BPF_JGE | BPF_K] = true,
1043 [BPF_JMP | BPF_JGE | BPF_X] = true,
1044 [BPF_JMP | BPF_JGT | BPF_K] = true,
1045 [BPF_JMP | BPF_JGT | BPF_X] = true,
1046 [BPF_JMP | BPF_JSET | BPF_K] = true,
1047 [BPF_JMP | BPF_JSET | BPF_X] = true,
1048 };
1049
1050 if (code_to_probe >= ARRAY_SIZE(codes))
1051 return false;
1052
1053 return codes[code_to_probe];
1054 }
1055
1056 static bool bpf_check_basics_ok(const struct sock_filter *filter,
1057 unsigned int flen)
1058 {
1059 if (filter == NULL)
1060 return false;
1061 if (flen == 0 || flen > BPF_MAXINSNS)
1062 return false;
1063
1064 return true;
1065 }
1066
1067 /**
1068 * bpf_check_classic - verify socket filter code
1069 * @filter: filter to verify
1070 * @flen: length of filter
1071 *
1072 * Check the user's filter code. If we let some ugly
1073 * filter code slip through, kaboom! The filter must contain
1074 * no references or jumps that are out of range, no illegal
1075 * instructions, and must end with a RET instruction.
1076 *
1077 * All jumps are forward as they are not signed.
1078 *
1079 * Returns 0 if the rule set is legal or -EINVAL if not.
1080 */
1081 static int bpf_check_classic(const struct sock_filter *filter,
1082 unsigned int flen)
1083 {
1084 bool anc_found;
1085 int pc;
1086
1087 /* Check the filter code now */
1088 for (pc = 0; pc < flen; pc++) {
1089 const struct sock_filter *ftest = &filter[pc];
1090
1091 /* May we actually operate on this code? */
1092 if (!chk_code_allowed(ftest->code))
1093 return -EINVAL;
1094
1095 /* Some instructions need special checks */
1096 switch (ftest->code) {
1097 case BPF_ALU | BPF_DIV | BPF_K:
1098 case BPF_ALU | BPF_MOD | BPF_K:
1099 /* Check for division by zero */
1100 if (ftest->k == 0)
1101 return -EINVAL;
1102 break;
1103 case BPF_ALU | BPF_LSH | BPF_K:
1104 case BPF_ALU | BPF_RSH | BPF_K:
1105 if (ftest->k >= 32)
1106 return -EINVAL;
1107 break;
1108 case BPF_LD | BPF_MEM:
1109 case BPF_LDX | BPF_MEM:
1110 case BPF_ST:
1111 case BPF_STX:
1112 /* Check for invalid memory addresses */
1113 if (ftest->k >= BPF_MEMWORDS)
1114 return -EINVAL;
1115 break;
1116 case BPF_JMP | BPF_JA:
1117 /* Note, the large ftest->k might cause loops.
1118 * Compare this with conditional jumps below,
1119 * where offsets are limited. --ANK (981016)
1120 */
1121 if (ftest->k >= (unsigned int)(flen - pc - 1))
1122 return -EINVAL;
1123 break;
1124 case BPF_JMP | BPF_JEQ | BPF_K:
1125 case BPF_JMP | BPF_JEQ | BPF_X:
1126 case BPF_JMP | BPF_JGE | BPF_K:
1127 case BPF_JMP | BPF_JGE | BPF_X:
1128 case BPF_JMP | BPF_JGT | BPF_K:
1129 case BPF_JMP | BPF_JGT | BPF_X:
1130 case BPF_JMP | BPF_JSET | BPF_K:
1131 case BPF_JMP | BPF_JSET | BPF_X:
1132 /* Both conditionals must be safe */
1133 if (pc + ftest->jt + 1 >= flen ||
1134 pc + ftest->jf + 1 >= flen)
1135 return -EINVAL;
1136 break;
1137 case BPF_LD | BPF_W | BPF_ABS:
1138 case BPF_LD | BPF_H | BPF_ABS:
1139 case BPF_LD | BPF_B | BPF_ABS:
1140 anc_found = false;
1141 if (bpf_anc_helper(ftest) & BPF_ANC)
1142 anc_found = true;
1143 /* Ancillary operation unknown or unsupported */
1144 if (anc_found == false && ftest->k >= SKF_AD_OFF)
1145 return -EINVAL;
1146 }
1147 }
1148
1149 /* Last instruction must be a RET code */
1150 switch (filter[flen - 1].code) {
1151 case BPF_RET | BPF_K:
1152 case BPF_RET | BPF_A:
1153 return check_load_and_stores(filter, flen);
1154 }
1155
1156 return -EINVAL;
1157 }
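
/* Illustrative only: the smallest program that passes bpf_check_classic() is
 * a lone return, e.g. accepting up to 64 bytes of every packet:
 *
 *	struct sock_filter accept_64[] = {
 *		BPF_STMT(BPF_RET | BPF_K, 64),
 *	};
 *
 * It ends in a RET, contains no out-of-range jumps and touches no scratch
 * memory, so check_load_and_stores() trivially succeeds as well.
 */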
1158
1159 static int bpf_prog_store_orig_filter(struct bpf_prog *fp,
1160 const struct sock_fprog *fprog)
1161 {
1162 unsigned int fsize = bpf_classic_proglen(fprog);
1163 struct sock_fprog_kern *fkprog;
1164
1165 fp->orig_prog = kmalloc(sizeof(*fkprog), GFP_KERNEL);
1166 if (!fp->orig_prog)
1167 return -ENOMEM;
1168
1169 fkprog = fp->orig_prog;
1170 fkprog->len = fprog->len;
1171
1172 fkprog->filter = kmemdup(fp->insns, fsize,
1173 GFP_KERNEL | __GFP_NOWARN);
1174 if (!fkprog->filter) {
1175 kfree(fp->orig_prog);
1176 return -ENOMEM;
1177 }
1178
1179 return 0;
1180 }
1181
1182 static void bpf_release_orig_filter(struct bpf_prog *fp)
1183 {
1184 struct sock_fprog_kern *fprog = fp->orig_prog;
1185
1186 if (fprog) {
1187 kfree(fprog->filter);
1188 kfree(fprog);
1189 }
1190 }
1191
1192 static void __bpf_prog_release(struct bpf_prog *prog)
1193 {
1194 if (prog->type == BPF_PROG_TYPE_SOCKET_FILTER) {
1195 bpf_prog_put(prog);
1196 } else {
1197 bpf_release_orig_filter(prog);
1198 bpf_prog_free(prog);
1199 }
1200 }
1201
1202 static void __sk_filter_release(struct sk_filter *fp)
1203 {
1204 __bpf_prog_release(fp->prog);
1205 kfree(fp);
1206 }
1207
1208 /**
1209 * sk_filter_release_rcu - Release a socket filter by rcu_head
1210 * @rcu: rcu_head that contains the sk_filter to free
1211 */
1212 static void sk_filter_release_rcu(struct rcu_head *rcu)
1213 {
1214 struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu);
1215
1216 __sk_filter_release(fp);
1217 }
1218
1219 /**
1220 * sk_filter_release - release a socket filter
1221 * @fp: filter to remove
1222 *
1223 * Remove a filter from a socket and release its resources.
1224 */
1225 static void sk_filter_release(struct sk_filter *fp)
1226 {
1227 if (refcount_dec_and_test(&fp->refcnt))
1228 call_rcu(&fp->rcu, sk_filter_release_rcu);
1229 }
1230
1231 void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp)
1232 {
1233 u32 filter_size = bpf_prog_size(fp->prog->len);
1234
1235 atomic_sub(filter_size, &sk->sk_omem_alloc);
1236 sk_filter_release(fp);
1237 }
1238
1239 /* try to charge the socket memory if there is space available
1240 * return true on success
1241 */
1242 static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp)
1243 {
1244 int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
1245 u32 filter_size = bpf_prog_size(fp->prog->len);
1246
1247 /* same check as in sock_kmalloc() */
1248 if (filter_size <= optmem_max &&
1249 atomic_read(&sk->sk_omem_alloc) + filter_size < optmem_max) {
1250 atomic_add(filter_size, &sk->sk_omem_alloc);
1251 return true;
1252 }
1253 return false;
1254 }
1255
1256 bool sk_filter_charge(struct sock *sk, struct sk_filter *fp)
1257 {
1258 if (!refcount_inc_not_zero(&fp->refcnt))
1259 return false;
1260
1261 if (!__sk_filter_charge(sk, fp)) {
1262 sk_filter_release(fp);
1263 return false;
1264 }
1265 return true;
1266 }
1267
1268 static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
1269 {
1270 struct sock_filter *old_prog;
1271 struct bpf_prog *old_fp;
1272 int err, new_len, old_len = fp->len;
1273 bool seen_ld_abs = false;
1274
1275 /* We are free to overwrite insns et al. right here as they won't be used
1276 * internally anymore after the migration to the eBPF
1277 * instruction representation.
1278 */
1279 BUILD_BUG_ON(sizeof(struct sock_filter) !=
1280 sizeof(struct bpf_insn));
1281
1282 /* Conversion cannot happen on overlapping memory areas,
1283 * so we need to keep the user BPF around until the 2nd
1284 * pass. At this time, the user BPF is stored in fp->insns.
1285 */
1286 old_prog = kmemdup_array(fp->insns, old_len, sizeof(struct sock_filter),
1287 GFP_KERNEL | __GFP_NOWARN);
1288 if (!old_prog) {
1289 err = -ENOMEM;
1290 goto out_err;
1291 }
1292
1293 /* 1st pass: calculate the new program length. */
1294 err = bpf_convert_filter(old_prog, old_len, NULL, &new_len,
1295 &seen_ld_abs);
1296 if (err)
1297 goto out_err_free;
1298
1299 /* Expand fp for appending the new filter representation. */
1300 old_fp = fp;
1301 fp = bpf_prog_realloc(old_fp, bpf_prog_size(new_len), 0);
1302 if (!fp) {
1303 /* The old_fp is still around in case we couldn't
1304 * allocate new memory, so uncharge on that one.
1305 */
1306 fp = old_fp;
1307 err = -ENOMEM;
1308 goto out_err_free;
1309 }
1310
1311 fp->len = new_len;
1312
1313 /* 2nd pass: remap sock_filter insns into bpf_insn insns. */
1314 err = bpf_convert_filter(old_prog, old_len, fp, &new_len,
1315 &seen_ld_abs);
1316 if (err)
1317 /* 2nd bpf_convert_filter() can fail only if it fails
1318 * to allocate memory, remapping must succeed. Note,
1319 * that at this time old_fp has already been released
1320 * by krealloc().
1321 */
1322 goto out_err_free;
1323
1324 fp = bpf_prog_select_runtime(fp, &err);
1325 if (err)
1326 goto out_err_free;
1327
1328 kfree(old_prog);
1329 return fp;
1330
1331 out_err_free:
1332 kfree(old_prog);
1333 out_err:
1334 __bpf_prog_release(fp);
1335 return ERR_PTR(err);
1336 }
1337
1338 static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp,
1339 bpf_aux_classic_check_t trans)
1340 {
1341 int err;
1342
1343 fp->bpf_func = NULL;
1344 fp->jited = 0;
1345
1346 err = bpf_check_classic(fp->insns, fp->len);
1347 if (err) {
1348 __bpf_prog_release(fp);
1349 return ERR_PTR(err);
1350 }
1351
1352 /* There might be additional checks and transformations
1353 * needed on classic filters, f.e. in case of seccomp.
1354 */
1355 if (trans) {
1356 err = trans(fp->insns, fp->len);
1357 if (err) {
1358 __bpf_prog_release(fp);
1359 return ERR_PTR(err);
1360 }
1361 }
1362
1363 /* Probe if we can JIT compile the filter and if so, do
1364 * the compilation of the filter.
1365 */
1366 bpf_jit_compile(fp);
1367
1368 /* JIT compiler couldn't process this filter, so do the eBPF translation
1369 * for the optimized interpreter.
1370 */
1371 if (!fp->jited)
1372 fp = bpf_migrate_filter(fp);
1373
1374 return fp;
1375 }
1376
1377 /**
1378 * bpf_prog_create - create an unattached filter
1379 * @pfp: the unattached filter that is created
1380 * @fprog: the filter program
1381 *
1382 * Create a filter independent of any socket. We first run some
1383 * sanity checks on it to make sure it does not explode on us later.
1384 * If an error occurs or there is insufficient memory for the filter
1385 * a negative errno code is returned. On success the return is zero.
1386 */
1387 int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog)
1388 {
1389 unsigned int fsize = bpf_classic_proglen(fprog);
1390 struct bpf_prog *fp;
1391
1392 /* Make sure new filter is there and in the right amounts. */
1393 if (!bpf_check_basics_ok(fprog->filter, fprog->len))
1394 return -EINVAL;
1395
1396 fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
1397 if (!fp)
1398 return -ENOMEM;
1399
1400 memcpy(fp->insns, fprog->filter, fsize);
1401
1402 fp->len = fprog->len;
1403 /* Since unattached filters are not copied back to user
1404 * space through sk_get_filter(), we do not need to hold
1405 * a copy here, and can spare us the work.
1406 */
1407 fp->orig_prog = NULL;
1408
1409 /* bpf_prepare_filter() already takes care of freeing
1410 * memory in case something goes wrong.
1411 */
1412 fp = bpf_prepare_filter(fp, NULL);
1413 if (IS_ERR(fp))
1414 return PTR_ERR(fp);
1415
1416 *pfp = fp;
1417 return 0;
1418 }
1419 EXPORT_SYMBOL_GPL(bpf_prog_create);
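
/* Illustrative only: a hedged sketch of how an in-kernel user (e.g. a driver
 * doing its own packet classification) would typically use this API; the
 * classic program below simply accepts every packet:
 *
 *	static struct sock_filter insns[] = {
 *		BPF_STMT(BPF_RET | BPF_K, 0xffff),
 *	};
 *	struct sock_fprog_kern fprog = {
 *		.len	= ARRAY_SIZE(insns),
 *		.filter	= insns,
 *	};
 *	struct bpf_prog *prog;
 *	u32 res;
 *
 *	if (bpf_prog_create(&prog, &fprog))
 *		return -EINVAL;
 *	res = bpf_prog_run(prog, skb);
 *	bpf_prog_destroy(prog);
 *
 * res carries the classic filter semantics: the number of bytes to keep,
 * with 0 meaning drop.
 */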
1420
1421 /**
1422 * bpf_prog_create_from_user - create an unattached filter from user buffer
1423 * @pfp: the unattached filter that is created
1424 * @fprog: the filter program
1425 * @trans: post-classic verifier transformation handler
1426 * @save_orig: save classic BPF program
1427 *
1428 * This function effectively does the same as bpf_prog_create(), only
1429 * that it builds up its insns buffer from a user space provided buffer.
1430 * It also allows for passing a bpf_aux_classic_check_t handler.
1431 */
1432 int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog,
1433 bpf_aux_classic_check_t trans, bool save_orig)
1434 {
1435 unsigned int fsize = bpf_classic_proglen(fprog);
1436 struct bpf_prog *fp;
1437 int err;
1438
1439 /* Make sure new filter is there and in the right amounts. */
1440 if (!bpf_check_basics_ok(fprog->filter, fprog->len))
1441 return -EINVAL;
1442
1443 fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
1444 if (!fp)
1445 return -ENOMEM;
1446
1447 if (copy_from_user(fp->insns, fprog->filter, fsize)) {
1448 __bpf_prog_free(fp);
1449 return -EFAULT;
1450 }
1451
1452 fp->len = fprog->len;
1453 fp->orig_prog = NULL;
1454
1455 if (save_orig) {
1456 err = bpf_prog_store_orig_filter(fp, fprog);
1457 if (err) {
1458 __bpf_prog_free(fp);
1459 return -ENOMEM;
1460 }
1461 }
1462
1463 /* bpf_prepare_filter() already takes care of freeing
1464 * memory in case something goes wrong.
1465 */
1466 fp = bpf_prepare_filter(fp, trans);
1467 if (IS_ERR(fp))
1468 return PTR_ERR(fp);
1469
1470 *pfp = fp;
1471 return 0;
1472 }
1473 EXPORT_SYMBOL_GPL(bpf_prog_create_from_user);
1474
1475 void bpf_prog_destroy(struct bpf_prog *fp)
1476 {
1477 __bpf_prog_release(fp);
1478 }
1479 EXPORT_SYMBOL_GPL(bpf_prog_destroy);
1480
1481 static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk)
1482 {
1483 struct sk_filter *fp, *old_fp;
1484
1485 fp = kmalloc(sizeof(*fp), GFP_KERNEL);
1486 if (!fp)
1487 return -ENOMEM;
1488
1489 fp->prog = prog;
1490
1491 if (!__sk_filter_charge(sk, fp)) {
1492 kfree(fp);
1493 return -ENOMEM;
1494 }
1495 refcount_set(&fp->refcnt, 1);
1496
1497 old_fp = rcu_dereference_protected(sk->sk_filter,
1498 lockdep_sock_is_held(sk));
1499 rcu_assign_pointer(sk->sk_filter, fp);
1500
1501 if (old_fp)
1502 sk_filter_uncharge(sk, old_fp);
1503
1504 return 0;
1505 }
1506
1507 static
1508 struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk)
1509 {
1510 unsigned int fsize = bpf_classic_proglen(fprog);
1511 struct bpf_prog *prog;
1512 int err;
1513
1514 if (sock_flag(sk, SOCK_FILTER_LOCKED))
1515 return ERR_PTR(-EPERM);
1516
1517 /* Make sure new filter is there and in the right amounts. */
1518 if (!bpf_check_basics_ok(fprog->filter, fprog->len))
1519 return ERR_PTR(-EINVAL);
1520
1521 prog = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
1522 if (!prog)
1523 return ERR_PTR(-ENOMEM);
1524
1525 if (copy_from_user(prog->insns, fprog->filter, fsize)) {
1526 __bpf_prog_free(prog);
1527 return ERR_PTR(-EFAULT);
1528 }
1529
1530 prog->len = fprog->len;
1531
1532 err = bpf_prog_store_orig_filter(prog, fprog);
1533 if (err) {
1534 __bpf_prog_free(prog);
1535 return ERR_PTR(-ENOMEM);
1536 }
1537
1538 /* bpf_prepare_filter() already takes care of freeing
1539 * memory in case something goes wrong.
1540 */
1541 return bpf_prepare_filter(prog, NULL);
1542 }
1543
1544 /**
1545 * sk_attach_filter - attach a socket filter
1546 * @fprog: the filter program
1547 * @sk: the socket to use
1548 *
1549 * Attach the user's filter code. We first run some sanity checks on
1550 * it to make sure it does not explode on us later. If an error
1551 * occurs or there is insufficient memory for the filter a negative
1552 * errno code is returned. On success the return is zero.
1553 */
1554 int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
1555 {
1556 struct bpf_prog *prog = __get_filter(fprog, sk);
1557 int err;
1558
1559 if (IS_ERR(prog))
1560 return PTR_ERR(prog);
1561
1562 err = __sk_attach_prog(prog, sk);
1563 if (err < 0) {
1564 __bpf_prog_release(prog);
1565 return err;
1566 }
1567
1568 return 0;
1569 }
1570 EXPORT_SYMBOL_GPL(sk_attach_filter);
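
/* Illustrative only: the user space side that ends up here, attaching a
 * classic filter that accepts only ARP frames (assumes <linux/filter.h>,
 * <linux/if_ether.h> and <sys/socket.h> are included):
 *
 *	struct sock_filter code[] = {
 *		BPF_STMT(BPF_LD  | BPF_H | BPF_ABS, 12),
 *		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, ETH_P_ARP, 0, 1),
 *		BPF_STMT(BPF_RET | BPF_K, 0xffff),
 *		BPF_STMT(BPF_RET | BPF_K, 0),
 *	};
 *	struct sock_fprog fprog = {
 *		.len	= sizeof(code) / sizeof(code[0]),
 *		.filter	= code,
 *	};
 *
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog, sizeof(fprog));
 */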
1571
1572 int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk)
1573 {
1574 struct bpf_prog *prog = __get_filter(fprog, sk);
1575 int err, optmem_max;
1576
1577 if (IS_ERR(prog))
1578 return PTR_ERR(prog);
1579
1580 optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
1581 if (bpf_prog_size(prog->len) > optmem_max)
1582 err = -ENOMEM;
1583 else
1584 err = reuseport_attach_prog(sk, prog);
1585
1586 if (err)
1587 __bpf_prog_release(prog);
1588
1589 return err;
1590 }
1591
1592 static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk)
1593 {
1594 if (sock_flag(sk, SOCK_FILTER_LOCKED))
1595 return ERR_PTR(-EPERM);
1596
1597 return bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER);
1598 }
1599
1600 int sk_attach_bpf(u32 ufd, struct sock *sk)
1601 {
1602 struct bpf_prog *prog = __get_bpf(ufd, sk);
1603 int err;
1604
1605 if (IS_ERR(prog))
1606 return PTR_ERR(prog);
1607
1608 err = __sk_attach_prog(prog, sk);
1609 if (err < 0) {
1610 bpf_prog_put(prog);
1611 return err;
1612 }
1613
1614 return 0;
1615 }
1616
1617 int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk)
1618 {
1619 struct bpf_prog *prog;
1620 int err, optmem_max;
1621
1622 if (sock_flag(sk, SOCK_FILTER_LOCKED))
1623 return -EPERM;
1624
1625 prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER);
1626 if (PTR_ERR(prog) == -EINVAL)
1627 prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SK_REUSEPORT);
1628 if (IS_ERR(prog))
1629 return PTR_ERR(prog);
1630
1631 if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) {
1632 /* Like other non-BPF_PROG_TYPE_SOCKET_FILTER
1633 * bpf progs (e.g. sockmap), it relies on the
1634 * limit imposed by bpf_prog_load().
1635 * Hence, sysctl_optmem_max is not checked.
1636 */
1637 if ((sk->sk_type != SOCK_STREAM &&
1638 sk->sk_type != SOCK_DGRAM) ||
1639 (sk->sk_protocol != IPPROTO_UDP &&
1640 sk->sk_protocol != IPPROTO_TCP) ||
1641 (sk->sk_family != AF_INET &&
1642 sk->sk_family != AF_INET6)) {
1643 err = -ENOTSUPP;
1644 goto err_prog_put;
1645 }
1646 } else {
1647 /* BPF_PROG_TYPE_SOCKET_FILTER */
1648 optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
1649 if (bpf_prog_size(prog->len) > optmem_max) {
1650 err = -ENOMEM;
1651 goto err_prog_put;
1652 }
1653 }
1654
1655 err = reuseport_attach_prog(sk, prog);
1656 err_prog_put:
1657 if (err)
1658 bpf_prog_put(prog);
1659
1660 return err;
1661 }
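
/* Illustrative only: from user space, the eBPF variant handled above is
 * reached by attaching an already loaded program by fd to a socket that has
 * SO_REUSEPORT enabled:
 *
 *	int one = 1;
 *
 *	setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one));
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_REUSEPORT_EBPF,
 *		   &prog_fd, sizeof(prog_fd));
 *
 * where prog_fd refers to a BPF_PROG_TYPE_SK_REUSEPORT (or plain socket
 * filter) program obtained via bpf(BPF_PROG_LOAD, ...).
 */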
1662
1663 void sk_reuseport_prog_free(struct bpf_prog *prog)
1664 {
1665 if (!prog)
1666 return;
1667
1668 if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT)
1669 bpf_prog_put(prog);
1670 else
1671 bpf_prog_destroy(prog);
1672 }
1673
1674 static inline int __bpf_try_make_writable(struct sk_buff *skb,
1675 unsigned int write_len)
1676 {
1677 #ifdef CONFIG_DEBUG_NET
1678 /* Avoid a splat in pskb_may_pull_reason() */
1679 if (write_len > INT_MAX)
1680 return -EINVAL;
1681 #endif
1682 return skb_ensure_writable(skb, write_len);
1683 }
1684
1685 static inline int bpf_try_make_writable(struct sk_buff *skb,
1686 unsigned int write_len)
1687 {
1688 int err = __bpf_try_make_writable(skb, write_len);
1689
1690 bpf_compute_data_pointers(skb);
1691 return err;
1692 }
1693
1694 static int bpf_try_make_head_writable(struct sk_buff *skb)
1695 {
1696 return bpf_try_make_writable(skb, skb_headlen(skb));
1697 }
1698
1699 static inline void bpf_push_mac_rcsum(struct sk_buff *skb)
1700 {
1701 if (skb_at_tc_ingress(skb))
1702 skb_postpush_rcsum(skb, skb_mac_header(skb), skb->mac_len);
1703 }
1704
1705 static inline void bpf_pull_mac_rcsum(struct sk_buff *skb)
1706 {
1707 if (skb_at_tc_ingress(skb))
1708 skb_postpull_rcsum(skb, skb_mac_header(skb), skb->mac_len);
1709 }
1710
1711 BPF_CALL_5(bpf_skb_store_bytes, struct sk_buff *, skb, u32, offset,
1712 const void *, from, u32, len, u64, flags)
1713 {
1714 void *ptr;
1715
1716 if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH)))
1717 return -EINVAL;
1718 if (unlikely(offset > INT_MAX))
1719 return -EFAULT;
1720 if (unlikely(bpf_try_make_writable(skb, offset + len)))
1721 return -EFAULT;
1722
1723 ptr = skb->data + offset;
1724 if (flags & BPF_F_RECOMPUTE_CSUM)
1725 __skb_postpull_rcsum(skb, ptr, len, offset);
1726
1727 memcpy(ptr, from, len);
1728
1729 if (flags & BPF_F_RECOMPUTE_CSUM)
1730 __skb_postpush_rcsum(skb, ptr, len, offset);
1731 if (flags & BPF_F_INVALIDATE_HASH)
1732 skb_clear_hash(skb);
1733
1734 return 0;
1735 }
1736
1737 static const struct bpf_func_proto bpf_skb_store_bytes_proto = {
1738 .func = bpf_skb_store_bytes,
1739 .gpl_only = false,
1740 .ret_type = RET_INTEGER,
1741 .arg1_type = ARG_PTR_TO_CTX,
1742 .arg2_type = ARG_ANYTHING,
1743 .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
1744 .arg4_type = ARG_CONST_SIZE,
1745 .arg5_type = ARG_ANYTHING,
1746 };
1747
1748 int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from,
1749 u32 len, u64 flags)
1750 {
1751 return ____bpf_skb_store_bytes(skb, offset, from, len, flags);
1752 }
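
/* Illustrative only: how this helper looks from a BPF program's point of
 * view, e.g. a tc classifier (built against libbpf's bpf_helpers.h, with
 * TC_ACT_* from <linux/pkt_cls.h>) rewriting the destination MAC address of
 * each frame:
 *
 *	SEC("tc")
 *	int set_dmac(struct __sk_buff *skb)
 *	{
 *		const unsigned char dmac[6] = { 0x02, 0, 0, 0, 0, 0x01 };
 *
 *		if (bpf_skb_store_bytes(skb, 0, dmac, sizeof(dmac), 0))
 *			return TC_ACT_SHOT;
 *		return TC_ACT_OK;
 *	}
 *
 * BPF_F_RECOMPUTE_CSUM would additionally be passed in flags when the bytes
 * written are covered by skb->csum.
 */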
1753
1754 BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset,
1755 void *, to, u32, len)
1756 {
1757 void *ptr;
1758
1759 if (unlikely(offset > INT_MAX))
1760 goto err_clear;
1761
1762 ptr = skb_header_pointer(skb, offset, len, to);
1763 if (unlikely(!ptr))
1764 goto err_clear;
1765 if (ptr != to)
1766 memcpy(to, ptr, len);
1767
1768 return 0;
1769 err_clear:
1770 memset(to, 0, len);
1771 return -EFAULT;
1772 }
1773
1774 static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
1775 .func = bpf_skb_load_bytes,
1776 .gpl_only = false,
1777 .ret_type = RET_INTEGER,
1778 .arg1_type = ARG_PTR_TO_CTX,
1779 .arg2_type = ARG_ANYTHING,
1780 .arg3_type = ARG_PTR_TO_UNINIT_MEM,
1781 .arg4_type = ARG_CONST_SIZE,
1782 };
1783
1784 int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len)
1785 {
1786 return ____bpf_skb_load_bytes(skb, offset, to, len);
1787 }
1788
1789 BPF_CALL_4(bpf_flow_dissector_load_bytes,
1790 const struct bpf_flow_dissector *, ctx, u32, offset,
1791 void *, to, u32, len)
1792 {
1793 void *ptr;
1794
1795 if (unlikely(offset > 0xffff))
1796 goto err_clear;
1797
1798 if (unlikely(!ctx->skb))
1799 goto err_clear;
1800
1801 ptr = skb_header_pointer(ctx->skb, offset, len, to);
1802 if (unlikely(!ptr))
1803 goto err_clear;
1804 if (ptr != to)
1805 memcpy(to, ptr, len);
1806
1807 return 0;
1808 err_clear:
1809 memset(to, 0, len);
1810 return -EFAULT;
1811 }
1812
1813 static const struct bpf_func_proto bpf_flow_dissector_load_bytes_proto = {
1814 .func = bpf_flow_dissector_load_bytes,
1815 .gpl_only = false,
1816 .ret_type = RET_INTEGER,
1817 .arg1_type = ARG_PTR_TO_CTX,
1818 .arg2_type = ARG_ANYTHING,
1819 .arg3_type = ARG_PTR_TO_UNINIT_MEM,
1820 .arg4_type = ARG_CONST_SIZE,
1821 };
1822
1823 BPF_CALL_5(bpf_skb_load_bytes_relative, const struct sk_buff *, skb,
1824 u32, offset, void *, to, u32, len, u32, start_header)
1825 {
1826 u8 *end = skb_tail_pointer(skb);
1827 u8 *start, *ptr;
1828
1829 if (unlikely(offset > 0xffff))
1830 goto err_clear;
1831
1832 switch (start_header) {
1833 case BPF_HDR_START_MAC:
1834 if (unlikely(!skb_mac_header_was_set(skb)))
1835 goto err_clear;
1836 start = skb_mac_header(skb);
1837 break;
1838 case BPF_HDR_START_NET:
1839 start = skb_network_header(skb);
1840 break;
1841 default:
1842 goto err_clear;
1843 }
1844
1845 ptr = start + offset;
1846
1847 if (likely(ptr + len <= end)) {
1848 memcpy(to, ptr, len);
1849 return 0;
1850 }
1851
1852 err_clear:
1853 memset(to, 0, len);
1854 return -EFAULT;
1855 }
1856
1857 static const struct bpf_func_proto bpf_skb_load_bytes_relative_proto = {
1858 .func = bpf_skb_load_bytes_relative,
1859 .gpl_only = false,
1860 .ret_type = RET_INTEGER,
1861 .arg1_type = ARG_PTR_TO_CTX,
1862 .arg2_type = ARG_ANYTHING,
1863 .arg3_type = ARG_PTR_TO_UNINIT_MEM,
1864 .arg4_type = ARG_CONST_SIZE,
1865 .arg5_type = ARG_ANYTHING,
1866 };
1867
1868 BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len)
1869 {
1870 /* The idea is the following: should a needed direct read/write
1871 * test fail at runtime, we can pull in more data and redo the
1872 * test, since this implicitly invalidates the previous checks.
1873 *
1874 * Alternatively, since we know how much we need to make readable
1875 * or writable, this can be done once at the start of the program
1876 * for the direct access case. This overcomes the limitation that
1877 * only the current headroom is accessible.
1878 */
1879 return bpf_try_make_writable(skb, len ? : skb_headlen(skb));
1880 }
1881
1882 static const struct bpf_func_proto bpf_skb_pull_data_proto = {
1883 .func = bpf_skb_pull_data,
1884 .gpl_only = false,
1885 .ret_type = RET_INTEGER,
1886 .arg1_type = ARG_PTR_TO_CTX,
1887 .arg2_type = ARG_ANYTHING,
1888 };
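/* Program-side sketch (illustrative, assuming a tc classifier): after a
 * successful bpf_skb_pull_data() the packet pointers seen by the program
 * are invalidated, so they must be reloaded and re-checked before any
 * further direct access:
 *
 *	if (bpf_skb_pull_data(skb, ETH_HLEN + sizeof(struct iphdr)))
 *		return TC_ACT_SHOT;
 *	data     = (void *)(long)skb->data;
 *	data_end = (void *)(long)skb->data_end;
 *	if (data + ETH_HLEN + sizeof(struct iphdr) > data_end)
 *		return TC_ACT_SHOT;
 */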
1889
1890 BPF_CALL_1(bpf_sk_fullsock, struct sock *, sk)
1891 {
1892 return sk_fullsock(sk) ? (unsigned long)sk : (unsigned long)NULL;
1893 }
1894
1895 static const struct bpf_func_proto bpf_sk_fullsock_proto = {
1896 .func = bpf_sk_fullsock,
1897 .gpl_only = false,
1898 .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
1899 .arg1_type = ARG_PTR_TO_SOCK_COMMON,
1900 };
1901
1902 static inline int sk_skb_try_make_writable(struct sk_buff *skb,
1903 unsigned int write_len)
1904 {
1905 return __bpf_try_make_writable(skb, write_len);
1906 }
1907
1908 BPF_CALL_2(sk_skb_pull_data, struct sk_buff *, skb, u32, len)
1909 {
1910 /* The idea is the following: should a needed direct read/write
1911 * test fail at runtime, we can pull in more data and redo the
1912 * test, since this implicitly invalidates the previous checks.
1913 *
1914 * Alternatively, since we know how much we need to make readable
1915 * or writable, this can be done once at the start of the program
1916 * for the direct access case. This overcomes the limitation that
1917 * only the current headroom is accessible.
1918 */
1919 return sk_skb_try_make_writable(skb, len ? : skb_headlen(skb));
1920 }
1921
1922 static const struct bpf_func_proto sk_skb_pull_data_proto = {
1923 .func = sk_skb_pull_data,
1924 .gpl_only = false,
1925 .ret_type = RET_INTEGER,
1926 .arg1_type = ARG_PTR_TO_CTX,
1927 .arg2_type = ARG_ANYTHING,
1928 };
1929
1930 BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset,
1931 u64, from, u64, to, u64, flags)
1932 {
1933 __sum16 *ptr;
1934
1935 if (unlikely(flags & ~(BPF_F_HDR_FIELD_MASK)))
1936 return -EINVAL;
1937 if (unlikely(offset > 0xffff || offset & 1))
1938 return -EFAULT;
1939 if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr))))
1940 return -EFAULT;
1941
1942 ptr = (__sum16 *)(skb->data + offset);
1943 switch (flags & BPF_F_HDR_FIELD_MASK) {
1944 case 0:
1945 if (unlikely(from != 0))
1946 return -EINVAL;
1947
1948 csum_replace_by_diff(ptr, to);
1949 break;
1950 case 2:
1951 csum_replace2(ptr, from, to);
1952 break;
1953 case 4:
1954 csum_replace4(ptr, from, to);
1955 break;
1956 default:
1957 return -EINVAL;
1958 }
1959
1960 return 0;
1961 }
1962
1963 static const struct bpf_func_proto bpf_l3_csum_replace_proto = {
1964 .func = bpf_l3_csum_replace,
1965 .gpl_only = false,
1966 .ret_type = RET_INTEGER,
1967 .arg1_type = ARG_PTR_TO_CTX,
1968 .arg2_type = ARG_ANYTHING,
1969 .arg3_type = ARG_ANYTHING,
1970 .arg4_type = ARG_ANYTHING,
1971 .arg5_type = ARG_ANYTHING,
1972 };
1973
1974 BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset,
1975 u64, from, u64, to, u64, flags)
1976 {
1977 bool is_pseudo = flags & BPF_F_PSEUDO_HDR;
1978 bool is_mmzero = flags & BPF_F_MARK_MANGLED_0;
1979 bool do_mforce = flags & BPF_F_MARK_ENFORCE;
1980 bool is_ipv6 = flags & BPF_F_IPV6;
1981 __sum16 *ptr;
1982
1983 if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_MARK_ENFORCE |
1984 BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK | BPF_F_IPV6)))
1985 return -EINVAL;
1986 if (unlikely(offset > 0xffff || offset & 1))
1987 return -EFAULT;
1988 if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr))))
1989 return -EFAULT;
1990
1991 ptr = (__sum16 *)(skb->data + offset);
1992 if (is_mmzero && !do_mforce && !*ptr)
1993 return 0;
1994
1995 switch (flags & BPF_F_HDR_FIELD_MASK) {
1996 case 0:
1997 if (unlikely(from != 0))
1998 return -EINVAL;
1999
2000 inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo, is_ipv6);
2001 break;
2002 case 2:
2003 inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo);
2004 break;
2005 case 4:
2006 inet_proto_csum_replace4(ptr, skb, from, to, is_pseudo);
2007 break;
2008 default:
2009 return -EINVAL;
2010 }
2011
2012 if (is_mmzero && !*ptr)
2013 *ptr = CSUM_MANGLED_0;
2014 return 0;
2015 }
2016
2017 static const struct bpf_func_proto bpf_l4_csum_replace_proto = {
2018 .func = bpf_l4_csum_replace,
2019 .gpl_only = false,
2020 .ret_type = RET_INTEGER,
2021 .arg1_type = ARG_PTR_TO_CTX,
2022 .arg2_type = ARG_ANYTHING,
2023 .arg3_type = ARG_ANYTHING,
2024 .arg4_type = ARG_ANYTHING,
2025 .arg5_type = ARG_ANYTHING,
2026 };
2027
2028 BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size,
2029 __be32 *, to, u32, to_size, __wsum, seed)
2030 {
2031 /* This is quite flexible, some examples:
2032 *
2033 * from_size == 0, to_size > 0, seed := csum --> pushing data
2034 * from_size > 0, to_size == 0, seed := csum --> pulling data
2035 * from_size > 0, to_size > 0, seed := 0 --> diffing data
2036 *
2037 * Even for diffing, from_size and to_size don't need to be equal.
2038 */
2039
2040 __wsum ret = seed;
2041
2042 if (from_size && to_size)
2043 ret = csum_sub(csum_partial(to, to_size, ret),
2044 csum_partial(from, from_size, 0));
2045 else if (to_size)
2046 ret = csum_partial(to, to_size, ret);
2048 else if (from_size)
2049 ret = ~csum_partial(from, from_size, ~ret);
2050
2051 return csum_from32to16((__force unsigned int)ret);
2052 }
2053
2054 static const struct bpf_func_proto bpf_csum_diff_proto = {
2055 .func = bpf_csum_diff,
2056 .gpl_only = false,
2057 .pkt_access = true,
2058 .ret_type = RET_INTEGER,
2059 .arg1_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
2060 .arg2_type = ARG_CONST_SIZE_OR_ZERO,
2061 .arg3_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
2062 .arg4_type = ARG_CONST_SIZE_OR_ZERO,
2063 .arg5_type = ARG_ANYTHING,
2064 };
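/* Program-side sketch (illustrative; offsets and values are placeholders):
 * computing the checksum delta for an IPv4 address rewrite and feeding it
 * into the csum replace helpers:
 *
 *	__be32 old_ip = iph->saddr, new_ip = NEW_SRC_IP;
 *	__wsum diff;
 *
 *	diff = bpf_csum_diff(&old_ip, sizeof(old_ip),
 *			     &new_ip, sizeof(new_ip), 0);
 *	bpf_l4_csum_replace(skb, l4_csum_off, 0, diff, BPF_F_PSEUDO_HDR);
 *	bpf_l3_csum_replace(skb, l3_csum_off, 0, diff, 0);
 *	bpf_skb_store_bytes(skb, ip_saddr_off, &new_ip, sizeof(new_ip), 0);
 *
 * Passing 0 as 'from' with a non-zero diff selects the replace-by-diff
 * cases in the helpers above.
 */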
2065
2066 BPF_CALL_2(bpf_csum_update, struct sk_buff *, skb, __wsum, csum)
2067 {
2068 /* The interface is to be used in combination with bpf_csum_diff()
2069 * for direct packet writes. csum rotation for alignment as well
2070 * as emulating csum_sub() can be done from the eBPF program.
2071 */
2072 if (skb->ip_summed == CHECKSUM_COMPLETE)
2073 return (skb->csum = csum_add(skb->csum, csum));
2074
2075 return -ENOTSUPP;
2076 }
2077
2078 static const struct bpf_func_proto bpf_csum_update_proto = {
2079 .func = bpf_csum_update,
2080 .gpl_only = false,
2081 .ret_type = RET_INTEGER,
2082 .arg1_type = ARG_PTR_TO_CTX,
2083 .arg2_type = ARG_ANYTHING,
2084 };
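/* Illustrative combination (sketch; old_word/new_word are placeholders):
 * a program doing direct packet writes on a CHECKSUM_COMPLETE skb can keep
 * skb->csum in sync by feeding the bpf_csum_diff() result into this helper:
 *
 *	diff = bpf_csum_diff(&old_word, sizeof(old_word),
 *			     &new_word, sizeof(new_word), 0);
 *	bpf_csum_update(skb, diff);
 */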
2085
2086 BPF_CALL_2(bpf_csum_level, struct sk_buff *, skb, u64, level)
2087 {
2088 /* The interface is to be used in combination with bpf_skb_adjust_room()
2089 * for encap/decap of packet headers when BPF_F_ADJ_ROOM_NO_CSUM_RESET
2090 * is passed as flags, for example.
2091 */
2092 switch (level) {
2093 case BPF_CSUM_LEVEL_INC:
2094 __skb_incr_checksum_unnecessary(skb);
2095 break;
2096 case BPF_CSUM_LEVEL_DEC:
2097 __skb_decr_checksum_unnecessary(skb);
2098 break;
2099 case BPF_CSUM_LEVEL_RESET:
2100 __skb_reset_checksum_unnecessary(skb);
2101 break;
2102 case BPF_CSUM_LEVEL_QUERY:
2103 return skb->ip_summed == CHECKSUM_UNNECESSARY ?
2104 skb->csum_level : -EACCES;
2105 default:
2106 return -EINVAL;
2107 }
2108
2109 return 0;
2110 }
2111
2112 static const struct bpf_func_proto bpf_csum_level_proto = {
2113 .func = bpf_csum_level,
2114 .gpl_only = false,
2115 .ret_type = RET_INTEGER,
2116 .arg1_type = ARG_PTR_TO_CTX,
2117 .arg2_type = ARG_ANYTHING,
2118 };
2119
2120 static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb)
2121 {
2122 return dev_forward_skb_nomtu(dev, skb);
2123 }
2124
2125 static inline int __bpf_rx_skb_no_mac(struct net_device *dev,
2126 struct sk_buff *skb)
2127 {
2128 int ret = ____dev_forward_skb(dev, skb, false);
2129
2130 if (likely(!ret)) {
2131 skb->dev = dev;
2132 ret = netif_rx(skb);
2133 }
2134
2135 return ret;
2136 }
2137
2138 static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
2139 {
2140 int ret;
2141
2142 if (dev_xmit_recursion()) {
2143 net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
2144 kfree_skb(skb);
2145 return -ENETDOWN;
2146 }
2147
2148 skb->dev = dev;
2149 skb_set_redirected_noclear(skb, skb_at_tc_ingress(skb));
2150 skb_clear_tstamp(skb);
2151
2152 dev_xmit_recursion_inc();
2153 ret = dev_queue_xmit(skb);
2154 dev_xmit_recursion_dec();
2155
2156 return ret;
2157 }
2158
2159 static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev,
2160 u32 flags)
2161 {
2162 unsigned int mlen = skb_network_offset(skb);
2163
2164 if (unlikely(skb->len <= mlen)) {
2165 kfree_skb(skb);
2166 return -ERANGE;
2167 }
2168
2169 if (mlen) {
2170 __skb_pull(skb, mlen);
2171
2172 /* At ingress, the mac header has already been pulled once.
2173 * At egress, skb_postpull_rcsum() has to be done in case the
2174 * skb originated from ingress (i.e. is a forwarded skb)
2175 * to ensure that rcsum starts at net header.
2176 */
2177 if (!skb_at_tc_ingress(skb))
2178 skb_postpull_rcsum(skb, skb_mac_header(skb), mlen);
2179 }
2180 skb_pop_mac_header(skb);
2181 skb_reset_mac_len(skb);
2182 return flags & BPF_F_INGRESS ?
2183 __bpf_rx_skb_no_mac(dev, skb) : __bpf_tx_skb(dev, skb);
2184 }
2185
2186 static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev,
2187 u32 flags)
2188 {
2189 /* Verify that a link layer header is carried */
2190 if (unlikely(skb->mac_header >= skb->network_header || skb->len == 0)) {
2191 kfree_skb(skb);
2192 return -ERANGE;
2193 }
2194
2195 bpf_push_mac_rcsum(skb);
2196 return flags & BPF_F_INGRESS ?
2197 __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb);
2198 }
2199
2200 static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev,
2201 u32 flags)
2202 {
2203 if (dev_is_mac_header_xmit(dev))
2204 return __bpf_redirect_common(skb, dev, flags);
2205 else
2206 return __bpf_redirect_no_mac(skb, dev, flags);
2207 }
2208
2209 #if IS_ENABLED(CONFIG_IPV6)
2210 static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb,
2211 struct net_device *dev, struct bpf_nh_params *nh)
2212 {
2213 u32 hh_len = LL_RESERVED_SPACE(dev);
2214 const struct in6_addr *nexthop;
2215 struct dst_entry *dst = NULL;
2216 struct neighbour *neigh;
2217
2218 if (dev_xmit_recursion()) {
2219 net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
2220 goto out_drop;
2221 }
2222
2223 skb->dev = dev;
2224 skb_clear_tstamp(skb);
2225
2226 if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
2227 skb = skb_expand_head(skb, hh_len);
2228 if (!skb)
2229 return -ENOMEM;
2230 }
2231
2232 rcu_read_lock();
2233 if (!nh) {
2234 dst = skb_dst(skb);
2235 nexthop = rt6_nexthop(dst_rt6_info(dst),
2236 &ipv6_hdr(skb)->daddr);
2237 } else {
2238 nexthop = &nh->ipv6_nh;
2239 }
2240 neigh = ip_neigh_gw6(dev, nexthop);
2241 if (likely(!IS_ERR(neigh))) {
2242 int ret;
2243
2244 sock_confirm_neigh(skb, neigh);
2245 local_bh_disable();
2246 dev_xmit_recursion_inc();
2247 ret = neigh_output(neigh, skb, false);
2248 dev_xmit_recursion_dec();
2249 local_bh_enable();
2250 rcu_read_unlock();
2251 return ret;
2252 }
2253 rcu_read_unlock();
2254 if (dst)
2255 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
2256 out_drop:
2257 kfree_skb(skb);
2258 return -ENETDOWN;
2259 }
2260
2261 static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev,
2262 struct bpf_nh_params *nh)
2263 {
2264 const struct ipv6hdr *ip6h = ipv6_hdr(skb);
2265 struct net *net = dev_net(dev);
2266 int err, ret = NET_XMIT_DROP;
2267
2268 if (!nh) {
2269 struct dst_entry *dst;
2270 struct flowi6 fl6 = {
2271 .flowi6_flags = FLOWI_FLAG_ANYSRC,
2272 .flowi6_mark = skb->mark,
2273 .flowlabel = ip6_flowinfo(ip6h),
2274 .flowi6_oif = dev->ifindex,
2275 .flowi6_proto = ip6h->nexthdr,
2276 .daddr = ip6h->daddr,
2277 .saddr = ip6h->saddr,
2278 };
2279
2280 dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL);
2281 if (IS_ERR(dst))
2282 goto out_drop;
2283
2284 skb_dst_set(skb, dst);
2285 } else if (nh->nh_family != AF_INET6) {
2286 goto out_drop;
2287 }
2288
2289 err = bpf_out_neigh_v6(net, skb, dev, nh);
2290 if (unlikely(net_xmit_eval(err)))
2291 DEV_STATS_INC(dev, tx_errors);
2292 else
2293 ret = NET_XMIT_SUCCESS;
2294 goto out_xmit;
2295 out_drop:
2296 DEV_STATS_INC(dev, tx_errors);
2297 kfree_skb(skb);
2298 out_xmit:
2299 return ret;
2300 }
2301 #else
2302 static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev,
2303 struct bpf_nh_params *nh)
2304 {
2305 kfree_skb(skb);
2306 return NET_XMIT_DROP;
2307 }
2308 #endif /* CONFIG_IPV6 */
2309
2310 #if IS_ENABLED(CONFIG_INET)
2311 static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb,
2312 struct net_device *dev, struct bpf_nh_params *nh)
2313 {
2314 u32 hh_len = LL_RESERVED_SPACE(dev);
2315 struct neighbour *neigh;
2316 bool is_v6gw = false;
2317
2318 if (dev_xmit_recursion()) {
2319 net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
2320 goto out_drop;
2321 }
2322
2323 skb->dev = dev;
2324 skb_clear_tstamp(skb);
2325
2326 if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
2327 skb = skb_expand_head(skb, hh_len);
2328 if (!skb)
2329 return -ENOMEM;
2330 }
2331
2332 rcu_read_lock();
2333 if (!nh) {
2334 struct rtable *rt = skb_rtable(skb);
2335
2336 neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
2337 } else if (nh->nh_family == AF_INET6) {
2338 neigh = ip_neigh_gw6(dev, &nh->ipv6_nh);
2339 is_v6gw = true;
2340 } else if (nh->nh_family == AF_INET) {
2341 neigh = ip_neigh_gw4(dev, nh->ipv4_nh);
2342 } else {
2343 rcu_read_unlock();
2344 goto out_drop;
2345 }
2346
2347 if (likely(!IS_ERR(neigh))) {
2348 int ret;
2349
2350 sock_confirm_neigh(skb, neigh);
2351 local_bh_disable();
2352 dev_xmit_recursion_inc();
2353 ret = neigh_output(neigh, skb, is_v6gw);
2354 dev_xmit_recursion_dec();
2355 local_bh_enable();
2356 rcu_read_unlock();
2357 return ret;
2358 }
2359 rcu_read_unlock();
2360 out_drop:
2361 kfree_skb(skb);
2362 return -ENETDOWN;
2363 }
2364
2365 static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev,
2366 struct bpf_nh_params *nh)
2367 {
2368 const struct iphdr *ip4h = ip_hdr(skb);
2369 struct net *net = dev_net(dev);
2370 int err, ret = NET_XMIT_DROP;
2371
2372 if (!nh) {
2373 struct flowi4 fl4 = {
2374 .flowi4_flags = FLOWI_FLAG_ANYSRC,
2375 .flowi4_mark = skb->mark,
2376 .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip4h)),
2377 .flowi4_oif = dev->ifindex,
2378 .flowi4_proto = ip4h->protocol,
2379 .daddr = ip4h->daddr,
2380 .saddr = ip4h->saddr,
2381 };
2382 struct rtable *rt;
2383
2384 rt = ip_route_output_flow(net, &fl4, NULL);
2385 if (IS_ERR(rt))
2386 goto out_drop;
2387 if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) {
2388 ip_rt_put(rt);
2389 goto out_drop;
2390 }
2391
2392 skb_dst_set(skb, &rt->dst);
2393 }
2394
2395 err = bpf_out_neigh_v4(net, skb, dev, nh);
2396 if (unlikely(net_xmit_eval(err)))
2397 DEV_STATS_INC(dev, tx_errors);
2398 else
2399 ret = NET_XMIT_SUCCESS;
2400 goto out_xmit;
2401 out_drop:
2402 DEV_STATS_INC(dev, tx_errors);
2403 kfree_skb(skb);
2404 out_xmit:
2405 return ret;
2406 }
2407 #else
2408 static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev,
2409 struct bpf_nh_params *nh)
2410 {
2411 kfree_skb(skb);
2412 return NET_XMIT_DROP;
2413 }
2414 #endif /* CONFIG_INET */
2415
2416 static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev,
2417 struct bpf_nh_params *nh)
2418 {
2419 struct ethhdr *ethh = eth_hdr(skb);
2420
2421 if (unlikely(skb->mac_header >= skb->network_header))
2422 goto out;
2423 bpf_push_mac_rcsum(skb);
2424 if (is_multicast_ether_addr(ethh->h_dest))
2425 goto out;
2426
2427 skb_pull(skb, sizeof(*ethh));
2428 skb_unset_mac_header(skb);
2429 skb_reset_network_header(skb);
2430
2431 if (skb->protocol == htons(ETH_P_IP))
2432 return __bpf_redirect_neigh_v4(skb, dev, nh);
2433 else if (skb->protocol == htons(ETH_P_IPV6))
2434 return __bpf_redirect_neigh_v6(skb, dev, nh);
2435 out:
2436 kfree_skb(skb);
2437 return -ENOTSUPP;
2438 }
2439
2440 /* Internal, non-exposed redirect flags. */
2441 enum {
2442 BPF_F_NEIGH = (1ULL << 16),
2443 BPF_F_PEER = (1ULL << 17),
2444 BPF_F_NEXTHOP = (1ULL << 18),
2445 #define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH | BPF_F_PEER | BPF_F_NEXTHOP)
2446 };
2447
2448 BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
2449 {
2450 struct net_device *dev;
2451 struct sk_buff *clone;
2452 int ret;
2453
2454 BUILD_BUG_ON(BPF_F_REDIRECT_INTERNAL & BPF_F_REDIRECT_FLAGS);
2455
2456 if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL)))
2457 return -EINVAL;
2458
2459 dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex);
2460 if (unlikely(!dev))
2461 return -EINVAL;
2462
2463 clone = skb_clone(skb, GFP_ATOMIC);
2464 if (unlikely(!clone))
2465 return -ENOMEM;
2466
2467 /* For direct write, we need to keep the invariant that the skbs
2468 * we're dealing with are uncloned. Should uncloning fail here,
2469 * we need to free the just-generated clone so that the skb
2470 * becomes uncloned once again.
2471 */
2472 ret = bpf_try_make_head_writable(skb);
2473 if (unlikely(ret)) {
2474 kfree_skb(clone);
2475 return -ENOMEM;
2476 }
2477
2478 return __bpf_redirect(clone, dev, flags);
2479 }
2480
2481 static const struct bpf_func_proto bpf_clone_redirect_proto = {
2482 .func = bpf_clone_redirect,
2483 .gpl_only = false,
2484 .ret_type = RET_INTEGER,
2485 .arg1_type = ARG_PTR_TO_CTX,
2486 .arg2_type = ARG_ANYTHING,
2487 .arg3_type = ARG_ANYTHING,
2488 };
2489
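/* Resolve the peer of netns-crossing devices such as netkit or veth via
 * ndo_get_peer_dev(); used by the BPF_F_PEER redirect mode handled in
 * skb_do_redirect() below.
 */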
2490 static struct net_device *skb_get_peer_dev(struct net_device *dev)
2491 {
2492 const struct net_device_ops *ops = dev->netdev_ops;
2493
2494 if (likely(ops->ndo_get_peer_dev))
2495 return INDIRECT_CALL_1(ops->ndo_get_peer_dev,
2496 netkit_peer_dev, dev);
2497 return NULL;
2498 }
2499
2500 int skb_do_redirect(struct sk_buff *skb)
2501 {
2502 struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
2503 struct net *net = dev_net(skb->dev);
2504 struct net_device *dev;
2505 u32 flags = ri->flags;
2506
2507 dev = dev_get_by_index_rcu(net, ri->tgt_index);
2508 ri->tgt_index = 0;
2509 ri->flags = 0;
2510 if (unlikely(!dev))
2511 goto out_drop;
2512 if (flags & BPF_F_PEER) {
2513 if (unlikely(!skb_at_tc_ingress(skb)))
2514 goto out_drop;
2515 dev = skb_get_peer_dev(dev);
2516 if (unlikely(!dev ||
2517 !(dev->flags & IFF_UP) ||
2518 net_eq(net, dev_net(dev))))
2519 goto out_drop;
2520 skb->dev = dev;
2521 dev_sw_netstats_rx_add(dev, skb->len);
2522 skb_scrub_packet(skb, false);
2523 return -EAGAIN;
2524 }
2525 return flags & BPF_F_NEIGH ?
2526 __bpf_redirect_neigh(skb, dev, flags & BPF_F_NEXTHOP ?
2527 &ri->nh : NULL) :
2528 __bpf_redirect(skb, dev, flags);
2529 out_drop:
2530 kfree_skb(skb);
2531 return -EINVAL;
2532 }
2533
2534 BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
2535 {
2536 struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
2537
2538 if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL)))
2539 return TC_ACT_SHOT;
2540
2541 ri->flags = flags;
2542 ri->tgt_index = ifindex;
2543
2544 return TC_ACT_REDIRECT;
2545 }
2546
2547 static const struct bpf_func_proto bpf_redirect_proto = {
2548 .func = bpf_redirect,
2549 .gpl_only = false,
2550 .ret_type = RET_INTEGER,
2551 .arg1_type = ARG_ANYTHING,
2552 .arg2_type = ARG_ANYTHING,
2553 };
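/* Program-side sketch (illustrative; IFX is a placeholder ifindex): a
 * minimal tc classifier redirecting everything to another device:
 *
 *	SEC("tc")
 *	int redir(struct __sk_buff *skb)
 *	{
 *		return bpf_redirect(IFX, 0);
 *	}
 *
 * The helper only records the target; the TC_ACT_REDIRECT return code is
 * acted upon afterwards by the caller invoking skb_do_redirect() above.
 */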
2554
2555 BPF_CALL_2(bpf_redirect_peer, u32, ifindex, u64, flags)
2556 {
2557 struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
2558
2559 if (unlikely(flags))
2560 return TC_ACT_SHOT;
2561
2562 ri->flags = BPF_F_PEER;
2563 ri->tgt_index = ifindex;
2564
2565 return TC_ACT_REDIRECT;
2566 }
2567
2568 static const struct bpf_func_proto bpf_redirect_peer_proto = {
2569 .func = bpf_redirect_peer,
2570 .gpl_only = false,
2571 .ret_type = RET_INTEGER,
2572 .arg1_type = ARG_ANYTHING,
2573 .arg2_type = ARG_ANYTHING,
2574 };
2575
2576 BPF_CALL_4(bpf_redirect_neigh, u32, ifindex, struct bpf_redir_neigh *, params,
2577 int, plen, u64, flags)
2578 {
2579 struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
2580
2581 if (unlikely((plen && plen < sizeof(*params)) || flags))
2582 return TC_ACT_SHOT;
2583
2584 ri->flags = BPF_F_NEIGH | (plen ? BPF_F_NEXTHOP : 0);
2585 ri->tgt_index = ifindex;
2586
2587 BUILD_BUG_ON(sizeof(struct bpf_redir_neigh) != sizeof(struct bpf_nh_params));
2588 if (plen)
2589 memcpy(&ri->nh, params, sizeof(ri->nh));
2590
2591 return TC_ACT_REDIRECT;
2592 }
2593
2594 static const struct bpf_func_proto bpf_redirect_neigh_proto = {
2595 .func = bpf_redirect_neigh,
2596 .gpl_only = false,
2597 .ret_type = RET_INTEGER,
2598 .arg1_type = ARG_ANYTHING,
2599 .arg2_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
2600 .arg3_type = ARG_CONST_SIZE_OR_ZERO,
2601 .arg4_type = ARG_ANYTHING,
2602 };
2603
2604 BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg *, msg, u32, bytes)
2605 {
2606 msg->apply_bytes = bytes;
2607 return 0;
2608 }
2609
2610 static const struct bpf_func_proto bpf_msg_apply_bytes_proto = {
2611 .func = bpf_msg_apply_bytes,
2612 .gpl_only = false,
2613 .ret_type = RET_INTEGER,
2614 .arg1_type = ARG_PTR_TO_CTX,
2615 .arg2_type = ARG_ANYTHING,
2616 };
2617
2618 BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg *, msg, u32, bytes)
2619 {
2620 msg->cork_bytes = bytes;
2621 return 0;
2622 }
2623
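/* Reset the msg's current copy position after its scatterlist layout has
 * changed: with no data, point back at the start; otherwise point at the
 * last populated element with copybreak set to its end.
 */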
2624 static void sk_msg_reset_curr(struct sk_msg *msg)
2625 {
2626 if (!msg->sg.size) {
2627 msg->sg.curr = msg->sg.start;
2628 msg->sg.copybreak = 0;
2629 } else {
2630 u32 i = msg->sg.end;
2631
2632 sk_msg_iter_var_prev(i);
2633 msg->sg.curr = i;
2634 msg->sg.copybreak = msg->sg.data[i].length;
2635 }
2636 }
2637
2638 static const struct bpf_func_proto bpf_msg_cork_bytes_proto = {
2639 .func = bpf_msg_cork_bytes,
2640 .gpl_only = false,
2641 .ret_type = RET_INTEGER,
2642 .arg1_type = ARG_PTR_TO_CTX,
2643 .arg2_type = ARG_ANYTHING,
2644 };
2645
2646 BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
2647 u32, end, u64, flags)
2648 {
2649 u32 len = 0, offset = 0, copy = 0, poffset = 0, bytes = end - start;
2650 u32 first_sge, last_sge, i, shift, bytes_sg_total;
2651 struct scatterlist *sge;
2652 u8 *raw, *to, *from;
2653 struct page *page;
2654
2655 if (unlikely(flags || end <= start))
2656 return -EINVAL;
2657
2658 /* First find the starting scatterlist element */
2659 i = msg->sg.start;
2660 do {
2661 offset += len;
2662 len = sk_msg_elem(msg, i)->length;
2663 if (start < offset + len)
2664 break;
2665 sk_msg_iter_var_next(i);
2666 } while (i != msg->sg.end);
2667
2668 if (unlikely(start >= offset + len))
2669 return -EINVAL;
2670
2671 first_sge = i;
2672 /* The start may point into the sg element so we need to also
2673 * account for the headroom.
2674 */
2675 bytes_sg_total = start - offset + bytes;
2676 if (!test_bit(i, msg->sg.copy) && bytes_sg_total <= len)
2677 goto out;
2678
2679 /* At this point we need to linearize multiple scatterlist
2680 * elements or a single shared page. Either way we need to
2681 * copy into a linear buffer exclusively owned by BPF. Then
2682 * place the buffer in the scatterlist and fixup the original
2683 * entries by removing the entries now in the linear buffer
2684 * and shifting the remaining entries. For now we do not try
2685 * to copy partial entries to avoid complexity of running out
2686 * of sg_entry slots. The downside is reading a single byte
2687 * will copy the entire sg entry.
2688 */
2689 do {
2690 copy += sk_msg_elem(msg, i)->length;
2691 sk_msg_iter_var_next(i);
2692 if (bytes_sg_total <= copy)
2693 break;
2694 } while (i != msg->sg.end);
2695 last_sge = i;
2696
2697 if (unlikely(bytes_sg_total > copy))
2698 return -EINVAL;
2699
2700 page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP,
2701 get_order(copy));
2702 if (unlikely(!page))
2703 return -ENOMEM;
2704
2705 raw = page_address(page);
2706 i = first_sge;
2707 do {
2708 sge = sk_msg_elem(msg, i);
2709 from = sg_virt(sge);
2710 len = sge->length;
2711 to = raw + poffset;
2712
2713 memcpy(to, from, len);
2714 poffset += len;
2715 sge->length = 0;
2716 put_page(sg_page(sge));
2717
2718 sk_msg_iter_var_next(i);
2719 } while (i != last_sge);
2720
2721 sg_set_page(&msg->sg.data[first_sge], page, copy, 0);
2722
2723 /* To repair sg ring we need to shift entries. If we only
2724 * had a single entry though we can just replace it and
2725 * be done. Otherwise walk the ring and shift the entries.
2726 */
2727 WARN_ON_ONCE(last_sge == first_sge);
2728 shift = last_sge > first_sge ?
2729 last_sge - first_sge - 1 :
2730 NR_MSG_FRAG_IDS - first_sge + last_sge - 1;
2731 if (!shift)
2732 goto out;
2733
2734 i = first_sge;
2735 sk_msg_iter_var_next(i);
2736 do {
2737 u32 move_from;
2738
2739 if (i + shift >= NR_MSG_FRAG_IDS)
2740 move_from = i + shift - NR_MSG_FRAG_IDS;
2741 else
2742 move_from = i + shift;
2743 if (move_from == msg->sg.end)
2744 break;
2745
2746 msg->sg.data[i] = msg->sg.data[move_from];
2747 msg->sg.data[move_from].length = 0;
2748 msg->sg.data[move_from].page_link = 0;
2749 msg->sg.data[move_from].offset = 0;
2750 sk_msg_iter_var_next(i);
2751 } while (1);
2752
2753 msg->sg.end = msg->sg.end - shift > msg->sg.end ?
2754 msg->sg.end - shift + NR_MSG_FRAG_IDS :
2755 msg->sg.end - shift;
2756 out:
2757 sk_msg_reset_curr(msg);
2758 msg->data = sg_virt(&msg->sg.data[first_sge]) + start - offset;
2759 msg->data_end = msg->data + bytes;
2760 return 0;
2761 }
2762
2763 static const struct bpf_func_proto bpf_msg_pull_data_proto = {
2764 .func = bpf_msg_pull_data,
2765 .gpl_only = false,
2766 .ret_type = RET_INTEGER,
2767 .arg1_type = ARG_PTR_TO_CTX,
2768 .arg2_type = ARG_ANYTHING,
2769 .arg3_type = ARG_ANYTHING,
2770 .arg4_type = ARG_ANYTHING,
2771 };
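/* Program-side sketch (illustrative, assuming an sk_msg program): making
 * the first 20 bytes linear before parsing them via the msg data pointers,
 * which must be reloaded after the call:
 *
 *	if (bpf_msg_pull_data(msg, 0, 20, 0))
 *		return SK_DROP;
 *	data     = (void *)(long)msg->data;
 *	data_end = (void *)(long)msg->data_end;
 *	if (data + 20 > data_end)
 *		return SK_DROP;
 */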
2772
2773 BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
2774 u32, len, u64, flags)
2775 {
2776 struct scatterlist sge, nsge, nnsge, rsge = {0}, *psge;
2777 u32 new, i = 0, l = 0, space, copy = 0, offset = 0;
2778 u8 *raw, *to, *from;
2779 struct page *page;
2780
2781 if (unlikely(flags))
2782 return -EINVAL;
2783
2784 if (unlikely(len == 0))
2785 return 0;
2786
2787 /* First find the starting scatterlist element */
2788 i = msg->sg.start;
2789 do {
2790 offset += l;
2791 l = sk_msg_elem(msg, i)->length;
2792
2793 if (start < offset + l)
2794 break;
2795 sk_msg_iter_var_next(i);
2796 } while (i != msg->sg.end);
2797
2798 if (start > offset + l)
2799 return -EINVAL;
2800
2801 space = MAX_MSG_FRAGS - sk_msg_elem_used(msg);
2802
2803 /* If no space is available, we will fall back to a copy. We need
2804 * at least one free scatterlist element to push data into when
2805 * start aligns with the beginning of an element, or two when it
2806 * falls inside an element. We handle the start == offset case
2807 * because it is the common case for inserting a header.
2808 */
2810 if (!space || (space == 1 && start != offset))
2811 copy = msg->sg.data[i].length;
2812
2813 page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP,
2814 get_order(copy + len));
2815 if (unlikely(!page))
2816 return -ENOMEM;
2817
2818 if (copy) {
2819 int front, back;
2820
2821 raw = page_address(page);
2822
2823 if (i == msg->sg.end)
2824 sk_msg_iter_var_prev(i);
2825 psge = sk_msg_elem(msg, i);
2826 front = start - offset;
2827 back = psge->length - front;
2828 from = sg_virt(psge);
2829
2830 if (front)
2831 memcpy(raw, from, front);
2832
2833 if (back) {
2834 from += front;
2835 to = raw + front + len;
2836
2837 memcpy(to, from, back);
2838 }
2839
2840 put_page(sg_page(psge));
2841 new = i;
2842 goto place_new;
2843 }
2844
2845 if (start - offset) {
2846 if (i == msg->sg.end)
2847 sk_msg_iter_var_prev(i);
2848 psge = sk_msg_elem(msg, i);
2849 rsge = sk_msg_elem_cpy(msg, i);
2850
2851 psge->length = start - offset;
2852 rsge.length -= psge->length;
2853 rsge.offset += start;
2854
2855 sk_msg_iter_var_next(i);
2856 sg_unmark_end(psge);
2857 sg_unmark_end(&rsge);
2858 }
2859
2860 /* Slot(s) to place newly allocated data */
2861 sk_msg_iter_next(msg, end);
2862 new = i;
2863 sk_msg_iter_var_next(i);
2864
2865 if (i == msg->sg.end) {
2866 if (!rsge.length)
2867 goto place_new;
2868 sk_msg_iter_next(msg, end);
2869 goto place_new;
2870 }
2871
2872 /* Shift one or two slots as needed */
2873 sge = sk_msg_elem_cpy(msg, new);
2874 sg_unmark_end(&sge);
2875
2876 nsge = sk_msg_elem_cpy(msg, i);
2877 if (rsge.length) {
2878 sk_msg_iter_var_next(i);
2879 nnsge = sk_msg_elem_cpy(msg, i);
2880 sk_msg_iter_next(msg, end);
2881 }
2882
2883 while (i != msg->sg.end) {
2884 msg->sg.data[i] = sge;
2885 sge = nsge;
2886 sk_msg_iter_var_next(i);
2887 if (rsge.length) {
2888 nsge = nnsge;
2889 nnsge = sk_msg_elem_cpy(msg, i);
2890 } else {
2891 nsge = sk_msg_elem_cpy(msg, i);
2892 }
2893 }
2894
2895 place_new:
2896 /* Place newly allocated data buffer */
2897 sk_mem_charge(msg->sk, len);
2898 msg->sg.size += len;
2899 __clear_bit(new, msg->sg.copy);
2900 sg_set_page(&msg->sg.data[new], page, len + copy, 0);
2901 if (rsge.length) {
2902 get_page(sg_page(&rsge));
2903 sk_msg_iter_var_next(new);
2904 msg->sg.data[new] = rsge;
2905 }
2906
2907 sk_msg_reset_curr(msg);
2908 sk_msg_compute_data_pointers(msg);
2909 return 0;
2910 }
2911
2912 static const struct bpf_func_proto bpf_msg_push_data_proto = {
2913 .func = bpf_msg_push_data,
2914 .gpl_only = false,
2915 .ret_type = RET_INTEGER,
2916 .arg1_type = ARG_PTR_TO_CTX,
2917 .arg2_type = ARG_ANYTHING,
2918 .arg3_type = ARG_ANYTHING,
2919 .arg4_type = ARG_ANYTHING,
2920 };
2921
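/* Ring maintenance helpers for bpf_msg_pop_data(): sk_msg_shift_left()
 * drops element i and moves the following elements one slot down, while
 * sk_msg_shift_right() shifts element i and its successors one slot up,
 * leaving slot i to be overwritten by the caller.
 */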
2922 static void sk_msg_shift_left(struct sk_msg *msg, int i)
2923 {
2924 struct scatterlist *sge = sk_msg_elem(msg, i);
2925 int prev;
2926
2927 put_page(sg_page(sge));
2928 do {
2929 prev = i;
2930 sk_msg_iter_var_next(i);
2931 msg->sg.data[prev] = msg->sg.data[i];
2932 } while (i != msg->sg.end);
2933
2934 sk_msg_iter_prev(msg, end);
2935 }
2936
2937 static void sk_msg_shift_right(struct sk_msg *msg, int i)
2938 {
2939 struct scatterlist tmp, sge;
2940
2941 sk_msg_iter_next(msg, end);
2942 sge = sk_msg_elem_cpy(msg, i);
2943 sk_msg_iter_var_next(i);
2944 tmp = sk_msg_elem_cpy(msg, i);
2945
2946 while (i != msg->sg.end) {
2947 msg->sg.data[i] = sge;
2948 sk_msg_iter_var_next(i);
2949 sge = tmp;
2950 tmp = sk_msg_elem_cpy(msg, i);
2951 }
2952 }
2953
2954 BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
2955 u32, len, u64, flags)
2956 {
2957 u32 i = 0, l = 0, space, offset = 0;
2958 u64 last = start + len;
2959 int pop;
2960
2961 if (unlikely(flags))
2962 return -EINVAL;
2963
2964 if (unlikely(len == 0))
2965 return 0;
2966
2967 /* First find the starting scatterlist element */
2968 i = msg->sg.start;
2969 do {
2970 offset += l;
2971 l = sk_msg_elem(msg, i)->length;
2972
2973 if (start < offset + l)
2974 break;
2975 sk_msg_iter_var_next(i);
2976 } while (i != msg->sg.end);
2977
2978 /* Bounds checks: start and pop must be inside message */
2979 if (start >= offset + l || last > msg->sg.size)
2980 return -EINVAL;
2981
2982 space = MAX_MSG_FRAGS - sk_msg_elem_used(msg);
2983
2984 pop = len;
2985 /* --------------| offset
2986 * -| start |-------- len -------|
2987 *
2988 * |----- a ----|-------- pop -------|----- b ----|
2989 * |______________________________________________| length
2990 *
2991 *
2992 * a: region at front of scatter element to save
2993 * b: region at back of scatter element to save when length > a + pop
2994 * pop: region to pop from element; same as the input 'pop', it is
2995 * decremented below per iteration.
2996 *
2997 * Two top-level cases need handling when start != offset: first, b is
2998 * non-zero, and second, b is zero, which corresponds to a pop that
2999 * spans more than one element.
3000 *
3001 * Then, if b is non-zero AND there is no space, allocate space and
3002 * compact the a and b regions into a page. If there is space, shift
3003 * the ring to the right, freeing the next element in the ring to place
3004 * b, leaving a untouched except for reducing its length.
3005 */
3006 if (start != offset) {
3007 struct scatterlist *nsge, *sge = sk_msg_elem(msg, i);
3008 int a = start - offset;
3009 int b = sge->length - pop - a;
3010
3011 sk_msg_iter_var_next(i);
3012
3013 if (b > 0) {
3014 if (space) {
3015 sge->length = a;
3016 sk_msg_shift_right(msg, i);
3017 nsge = sk_msg_elem(msg, i);
3018 get_page(sg_page(sge));
3019 sg_set_page(nsge,
3020 sg_page(sge),
3021 b, sge->offset + pop + a);
3022 } else {
3023 struct page *page, *orig;
3024 u8 *to, *from;
3025
3026 page = alloc_pages(__GFP_NOWARN |
3027 __GFP_COMP | GFP_ATOMIC,
3028 get_order(a + b));
3029 if (unlikely(!page))
3030 return -ENOMEM;
3031
3032 orig = sg_page(sge);
3033 from = sg_virt(sge);
3034 to = page_address(page);
3035 memcpy(to, from, a);
3036 memcpy(to + a, from + a + pop, b);
3037 sg_set_page(sge, page, a + b, 0);
3038 put_page(orig);
3039 }
3040 pop = 0;
3041 } else {
3042 pop -= (sge->length - a);
3043 sge->length = a;
3044 }
3045 }
3046
3047 /* From above the current layout _must_ be as follows,
3048 *
3049 * -| offset
3050 * -| start
3051 *
3052 * |---- pop ---|---------------- b ------------|
3053 * |____________________________________________| length
3054 *
3055 * Offset and start of the current msg elem are equal because in the
3056 * previous case we handled offset != start and either consumed the
3057 * entire element and advanced to the next element OR pop == 0.
3058 *
3059 * Two cases to handle here are first pop is less than the length
3060 * leaving some remainder b above. Simply adjust the element's layout
3061 * in this case. Or pop >= length of the element so that b = 0. In this
3062 * case advance to next element decrementing pop.
3063 */
3064 while (pop) {
3065 struct scatterlist *sge = sk_msg_elem(msg, i);
3066
3067 if (pop < sge->length) {
3068 sge->length -= pop;
3069 sge->offset += pop;
3070 pop = 0;
3071 } else {
3072 pop -= sge->length;
3073 sk_msg_shift_left(msg, i);
3074 }
3075 }
3076
3077 sk_mem_uncharge(msg->sk, len - pop);
3078 msg->sg.size -= (len - pop);
3079 sk_msg_reset_curr(msg);
3080 sk_msg_compute_data_pointers(msg);
3081 return 0;
3082 }
3083
3084 static const struct bpf_func_proto bpf_msg_pop_data_proto = {
3085 .func = bpf_msg_pop_data,
3086 .gpl_only = false,
3087 .ret_type = RET_INTEGER,
3088 .arg1_type = ARG_PTR_TO_CTX,
3089 .arg2_type = ARG_ANYTHING,
3090 .arg3_type = ARG_ANYTHING,
3091 .arg4_type = ARG_ANYTHING,
3092 };
3093
3094 #ifdef CONFIG_CGROUP_NET_CLASSID
3095 BPF_CALL_0(bpf_get_cgroup_classid_curr)
3096 {
3097 return __task_get_classid(current);
3098 }
3099
3100 const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto = {
3101 .func = bpf_get_cgroup_classid_curr,
3102 .gpl_only = false,
3103 .ret_type = RET_INTEGER,
3104 };
3105
3106 BPF_CALL_1(bpf_skb_cgroup_classid, const struct sk_buff *, skb)
3107 {
3108 struct sock *sk = skb_to_full_sk(skb);
3109
3110 if (!sk || !sk_fullsock(sk))
3111 return 0;
3112
3113 return sock_cgroup_classid(&sk->sk_cgrp_data);
3114 }
3115
3116 static const struct bpf_func_proto bpf_skb_cgroup_classid_proto = {
3117 .func = bpf_skb_cgroup_classid,
3118 .gpl_only = false,
3119 .ret_type = RET_INTEGER,
3120 .arg1_type = ARG_PTR_TO_CTX,
3121 };
3122 #endif
3123
3124 BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
3125 {
3126 return task_get_classid(skb);
3127 }
3128
3129 static const struct bpf_func_proto bpf_get_cgroup_classid_proto = {
3130 .func = bpf_get_cgroup_classid,
3131 .gpl_only = false,
3132 .ret_type = RET_INTEGER,
3133 .arg1_type = ARG_PTR_TO_CTX,
3134 };
3135
3136 BPF_CALL_1(bpf_get_route_realm, const struct sk_buff *, skb)
3137 {
3138 return dst_tclassid(skb);
3139 }
3140
3141 static const struct bpf_func_proto bpf_get_route_realm_proto = {
3142 .func = bpf_get_route_realm,
3143 .gpl_only = false,
3144 .ret_type = RET_INTEGER,
3145 .arg1_type = ARG_PTR_TO_CTX,
3146 };
3147
3148 BPF_CALL_1(bpf_get_hash_recalc, struct sk_buff *, skb)
3149 {
3150 /* If skb_clear_hash() was called due to mangling, we can
3151 * trigger SW recalculation here. Later access to hash
3152 * can then use the inline skb->hash via context directly
3153 * instead of calling this helper again.
3154 */
3155 return skb_get_hash(skb);
3156 }
3157
3158 static const struct bpf_func_proto bpf_get_hash_recalc_proto = {
3159 .func = bpf_get_hash_recalc,
3160 .gpl_only = false,
3161 .ret_type = RET_INTEGER,
3162 .arg1_type = ARG_PTR_TO_CTX,
3163 };
3164
3165 BPF_CALL_1(bpf_set_hash_invalid, struct sk_buff *, skb)
3166 {
3167 /* After all direct packet writes, this can be used once to
3168 * trigger a lazy recalc on the next skb_get_hash() invocation.
3169 */
3170 skb_clear_hash(skb);
3171 return 0;
3172 }
3173
3174 static const struct bpf_func_proto bpf_set_hash_invalid_proto = {
3175 .func = bpf_set_hash_invalid,
3176 .gpl_only = false,
3177 .ret_type = RET_INTEGER,
3178 .arg1_type = ARG_PTR_TO_CTX,
3179 };
3180
3181 BPF_CALL_2(bpf_set_hash, struct sk_buff *, skb, u32, hash)
3182 {
3183 /* Set user specified hash as L4(+), so that it gets returned
3184 * on skb_get_hash() call unless BPF prog later on triggers a
3185 * skb_clear_hash().
3186 */
3187 __skb_set_sw_hash(skb, hash, true);
3188 return 0;
3189 }
3190
3191 static const struct bpf_func_proto bpf_set_hash_proto = {
3192 .func = bpf_set_hash,
3193 .gpl_only = false,
3194 .ret_type = RET_INTEGER,
3195 .arg1_type = ARG_PTR_TO_CTX,
3196 .arg2_type = ARG_ANYTHING,
3197 };
3198
3199 BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto,
3200 u16, vlan_tci)
3201 {
3202 int ret;
3203
3204 if (unlikely(vlan_proto != htons(ETH_P_8021Q) &&
3205 vlan_proto != htons(ETH_P_8021AD)))
3206 vlan_proto = htons(ETH_P_8021Q);
3207
3208 bpf_push_mac_rcsum(skb);
3209 ret = skb_vlan_push(skb, vlan_proto, vlan_tci);
3210 bpf_pull_mac_rcsum(skb);
3211 skb_reset_mac_len(skb);
3212
3213 bpf_compute_data_pointers(skb);
3214 return ret;
3215 }
3216
3217 static const struct bpf_func_proto bpf_skb_vlan_push_proto = {
3218 .func = bpf_skb_vlan_push,
3219 .gpl_only = false,
3220 .ret_type = RET_INTEGER,
3221 .arg1_type = ARG_PTR_TO_CTX,
3222 .arg2_type = ARG_ANYTHING,
3223 .arg3_type = ARG_ANYTHING,
3224 };
3225
3226 BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb)
3227 {
3228 int ret;
3229
3230 bpf_push_mac_rcsum(skb);
3231 ret = skb_vlan_pop(skb);
3232 bpf_pull_mac_rcsum(skb);
3233
3234 bpf_compute_data_pointers(skb);
3235 return ret;
3236 }
3237
3238 static const struct bpf_func_proto bpf_skb_vlan_pop_proto = {
3239 .func = bpf_skb_vlan_pop,
3240 .gpl_only = false,
3241 .ret_type = RET_INTEGER,
3242 .arg1_type = ARG_PTR_TO_CTX,
3243 };
3244
3245 static void bpf_skb_change_protocol(struct sk_buff *skb, u16 proto)
3246 {
3247 skb->protocol = htons(proto);
3248 if (skb_valid_dst(skb))
3249 skb_dst_drop(skb);
3250 }
3251
3252 static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len)
3253 {
3254 /* Caller already did skb_cow() with len as headroom,
3255 * so no need to do it here.
3256 */
3257 skb_push(skb, len);
3258 memmove(skb->data, skb->data + len, off);
3259 memset(skb->data + off, 0, len);
3260
3261 /* No skb_postpush_rcsum(skb, skb->data + off, len)
3262 * needed here as it does not change the skb->csum
3263 * result for checksum complete when summing over
3264 * zeroed blocks.
3265 */
3266 return 0;
3267 }
3268
3269 static int bpf_skb_generic_pop(struct sk_buff *skb, u32 off, u32 len)
3270 {
3271 void *old_data;
3272
3273 /* skb_ensure_writable() is not needed here, as we're
3274 * already working on an uncloned skb.
3275 */
3276 if (unlikely(!pskb_may_pull(skb, off + len)))
3277 return -ENOMEM;
3278
3279 old_data = skb->data;
3280 __skb_pull(skb, len);
3281 skb_postpull_rcsum(skb, old_data + off, len);
3282 memmove(skb->data, old_data, off);
3283
3284 return 0;
3285 }
3286
3287 static int bpf_skb_net_hdr_push(struct sk_buff *skb, u32 off, u32 len)
3288 {
3289 bool trans_same = skb->transport_header == skb->network_header;
3290 int ret;
3291
3292 /* There's no need for __skb_push()/__skb_pull() pair to
3293 * get to the start of the mac header as we're guaranteed
3294 * to always start from here under eBPF.
3295 */
3296 ret = bpf_skb_generic_push(skb, off, len);
3297 if (likely(!ret)) {
3298 skb->mac_header -= len;
3299 skb->network_header -= len;
3300 if (trans_same)
3301 skb->transport_header = skb->network_header;
3302 }
3303
3304 return ret;
3305 }
3306
3307 static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len)
3308 {
3309 bool trans_same = skb->transport_header == skb->network_header;
3310 int ret;
3311
3312 /* Same here, __skb_push()/__skb_pull() pair not needed. */
3313 ret = bpf_skb_generic_pop(skb, off, len);
3314 if (likely(!ret)) {
3315 skb->mac_header += len;
3316 skb->network_header += len;
3317 if (trans_same)
3318 skb->transport_header = skb->network_header;
3319 }
3320
3321 return ret;
3322 }
3323
3324 static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
3325 {
3326 const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr);
3327 u32 off = skb_mac_header_len(skb);
3328 int ret;
3329
3330 ret = skb_cow(skb, len_diff);
3331 if (unlikely(ret < 0))
3332 return ret;
3333
3334 ret = bpf_skb_net_hdr_push(skb, off, len_diff);
3335 if (unlikely(ret < 0))
3336 return ret;
3337
3338 if (skb_is_gso(skb)) {
3339 struct skb_shared_info *shinfo = skb_shinfo(skb);
3340
3341 /* SKB_GSO_TCPV4 needs to be changed into SKB_GSO_TCPV6. */
3342 if (shinfo->gso_type & SKB_GSO_TCPV4) {
3343 shinfo->gso_type &= ~SKB_GSO_TCPV4;
3344 shinfo->gso_type |= SKB_GSO_TCPV6;
3345 }
3346 }
3347
3348 bpf_skb_change_protocol(skb, ETH_P_IPV6);
3349 skb_clear_hash(skb);
3350
3351 return 0;
3352 }
3353
3354 static int bpf_skb_proto_6_to_4(struct sk_buff *skb)
3355 {
3356 const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr);
3357 u32 off = skb_mac_header_len(skb);
3358 int ret;
3359
3360 ret = skb_unclone(skb, GFP_ATOMIC);
3361 if (unlikely(ret < 0))
3362 return ret;
3363
3364 ret = bpf_skb_net_hdr_pop(skb, off, len_diff);
3365 if (unlikely(ret < 0))
3366 return ret;
3367
3368 if (skb_is_gso(skb)) {
3369 struct skb_shared_info *shinfo = skb_shinfo(skb);
3370
3371 /* SKB_GSO_TCPV6 needs to be changed into SKB_GSO_TCPV4. */
3372 if (shinfo->gso_type & SKB_GSO_TCPV6) {
3373 shinfo->gso_type &= ~SKB_GSO_TCPV6;
3374 shinfo->gso_type |= SKB_GSO_TCPV4;
3375 }
3376 }
3377
3378 bpf_skb_change_protocol(skb, ETH_P_IP);
3379 skb_clear_hash(skb);
3380
3381 return 0;
3382 }
3383
3384 static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto)
3385 {
3386 __be16 from_proto = skb->protocol;
3387
3388 if (from_proto == htons(ETH_P_IP) &&
3389 to_proto == htons(ETH_P_IPV6))
3390 return bpf_skb_proto_4_to_6(skb);
3391
3392 if (from_proto == htons(ETH_P_IPV6) &&
3393 to_proto == htons(ETH_P_IP))
3394 return bpf_skb_proto_6_to_4(skb);
3395
3396 return -ENOTSUPP;
3397 }
3398
3399 BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto,
3400 u64, flags)
3401 {
3402 int ret;
3403
3404 if (unlikely(flags))
3405 return -EINVAL;
3406
3407 /* General idea is that this helper does the basic groundwork
3408 * needed for changing the protocol, and eBPF program fills the
3409 * rest through bpf_skb_store_bytes(), bpf_lX_csum_replace()
3410 * and other helpers, rather than passing a raw buffer here.
3411 *
3412 * The rationale is to keep this minimal and avoid the need to
3413 * deal with raw packet data. E.g. even if we passed buffers
3414 * here, the program would still need to call the
3415 * bpf_lX_csum_replace() helpers anyway. Plus, this way we also
3416 * keep a separation of concerns, since e.g. bpf_skb_store_bytes()
3417 * should only take care of stores.
3418 *
3419 * Currently, additional options and extension header space are
3420 * not supported, but flags register is reserved so we can adapt
3421 * that. For offloads, we mark packet as dodgy, so that headers
3422 * need to be verified first.
3423 */
3424 ret = bpf_skb_proto_xlat(skb, proto);
3425 bpf_compute_data_pointers(skb);
3426 return ret;
3427 }
3428
3429 static const struct bpf_func_proto bpf_skb_change_proto_proto = {
3430 .func = bpf_skb_change_proto,
3431 .gpl_only = false,
3432 .ret_type = RET_INTEGER,
3433 .arg1_type = ARG_PTR_TO_CTX,
3434 .arg2_type = ARG_ANYTHING,
3435 .arg3_type = ARG_ANYTHING,
3436 };
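/* Program-side sketch (illustrative only; ip6h is a fully prepared
 * struct ipv6hdr built by the program): a 4-to-6 translation continues
 * with the rewrite helpers after the groundwork done above:
 *
 *	if (bpf_skb_change_proto(skb, bpf_htons(ETH_P_IPV6), 0))
 *		return TC_ACT_SHOT;
 *	bpf_skb_store_bytes(skb, ETH_HLEN, &ip6h, sizeof(ip6h), 0);
 *
 * followed by bpf_l4_csum_replace() for the pseudo-header delta.
 */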
3437
3438 BPF_CALL_2(bpf_skb_change_type, struct sk_buff *, skb, u32, pkt_type)
3439 {
3440 /* We only allow a restricted subset to be changed for now. */
3441 if (unlikely(!skb_pkt_type_ok(skb->pkt_type) ||
3442 !skb_pkt_type_ok(pkt_type)))
3443 return -EINVAL;
3444
3445 skb->pkt_type = pkt_type;
3446 return 0;
3447 }
3448
3449 static const struct bpf_func_proto bpf_skb_change_type_proto = {
3450 .func = bpf_skb_change_type,
3451 .gpl_only = false,
3452 .ret_type = RET_INTEGER,
3453 .arg1_type = ARG_PTR_TO_CTX,
3454 .arg2_type = ARG_ANYTHING,
3455 };
3456
3457 static u32 bpf_skb_net_base_len(const struct sk_buff *skb)
3458 {
3459 switch (skb->protocol) {
3460 case htons(ETH_P_IP):
3461 return sizeof(struct iphdr);
3462 case htons(ETH_P_IPV6):
3463 return sizeof(struct ipv6hdr);
3464 default:
3465 return ~0U;
3466 }
3467 }
3468
3469 #define BPF_F_ADJ_ROOM_ENCAP_L3_MASK (BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | \
3470 BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
3471
3472 #define BPF_F_ADJ_ROOM_DECAP_L3_MASK (BPF_F_ADJ_ROOM_DECAP_L3_IPV4 | \
3473 BPF_F_ADJ_ROOM_DECAP_L3_IPV6)
3474
3475 #define BPF_F_ADJ_ROOM_MASK (BPF_F_ADJ_ROOM_FIXED_GSO | \
3476 BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \
3477 BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \
3478 BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \
3479 BPF_F_ADJ_ROOM_ENCAP_L2_ETH | \
3480 BPF_F_ADJ_ROOM_ENCAP_L2( \
3481 BPF_ADJ_ROOM_ENCAP_L2_MASK) | \
3482 BPF_F_ADJ_ROOM_DECAP_L3_MASK)
3483
3484 static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
3485 u64 flags)
3486 {
3487 u8 inner_mac_len = flags >> BPF_ADJ_ROOM_ENCAP_L2_SHIFT;
3488 bool encap = flags & BPF_F_ADJ_ROOM_ENCAP_L3_MASK;
3489 u16 mac_len = 0, inner_net = 0, inner_trans = 0;
3490 unsigned int gso_type = SKB_GSO_DODGY;
3491 int ret;
3492
3493 if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) {
3494 /* udp gso_size delineates datagrams, only allow if fixed */
3495 if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) ||
3496 !(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
3497 return -ENOTSUPP;
3498 }
3499
3500 ret = skb_cow_head(skb, len_diff);
3501 if (unlikely(ret < 0))
3502 return ret;
3503
3504 if (encap) {
3505 if (skb->protocol != htons(ETH_P_IP) &&
3506 skb->protocol != htons(ETH_P_IPV6))
3507 return -ENOTSUPP;
3508
3509 if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 &&
3510 flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
3511 return -EINVAL;
3512
3513 if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE &&
3514 flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP)
3515 return -EINVAL;
3516
3517 if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH &&
3518 inner_mac_len < ETH_HLEN)
3519 return -EINVAL;
3520
3521 if (skb->encapsulation)
3522 return -EALREADY;
3523
3524 mac_len = skb->network_header - skb->mac_header;
3525 inner_net = skb->network_header;
3526 if (inner_mac_len > len_diff)
3527 return -EINVAL;
3528 inner_trans = skb->transport_header;
3529 }
3530
3531 ret = bpf_skb_net_hdr_push(skb, off, len_diff);
3532 if (unlikely(ret < 0))
3533 return ret;
3534
3535 if (encap) {
3536 skb->inner_mac_header = inner_net - inner_mac_len;
3537 skb->inner_network_header = inner_net;
3538 skb->inner_transport_header = inner_trans;
3539
3540 if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH)
3541 skb_set_inner_protocol(skb, htons(ETH_P_TEB));
3542 else
3543 skb_set_inner_protocol(skb, skb->protocol);
3544
3545 skb->encapsulation = 1;
3546 skb_set_network_header(skb, mac_len);
3547
3548 if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP)
3549 gso_type |= SKB_GSO_UDP_TUNNEL;
3550 else if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE)
3551 gso_type |= SKB_GSO_GRE;
3552 else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
3553 gso_type |= SKB_GSO_IPXIP6;
3554 else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4)
3555 gso_type |= SKB_GSO_IPXIP4;
3556
3557 if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE ||
3558 flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) {
3559 int nh_len = flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 ?
3560 sizeof(struct ipv6hdr) :
3561 sizeof(struct iphdr);
3562
3563 skb_set_transport_header(skb, mac_len + nh_len);
3564 }
3565
3566 /* Match skb->protocol to new outer l3 protocol */
3567 if (skb->protocol == htons(ETH_P_IP) &&
3568 flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
3569 bpf_skb_change_protocol(skb, ETH_P_IPV6);
3570 else if (skb->protocol == htons(ETH_P_IPV6) &&
3571 flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4)
3572 bpf_skb_change_protocol(skb, ETH_P_IP);
3573 }
3574
3575 if (skb_is_gso(skb)) {
3576 struct skb_shared_info *shinfo = skb_shinfo(skb);
3577
3578 /* Header must be checked, and gso_segs recomputed. */
3579 shinfo->gso_type |= gso_type;
3580 shinfo->gso_segs = 0;
3581
3582 /* Due to header growth, MSS needs to be downgraded.
3583 * There is a BUG_ON() when segmenting the frag_list with
3584 * head_frag true, so linearize the skb after downgrading
3585 * the MSS.
3586 */
3587 if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) {
3588 skb_decrease_gso_size(shinfo, len_diff);
3589 if (shinfo->frag_list)
3590 return skb_linearize(skb);
3591 }
3592 }
3593
3594 return 0;
3595 }
3596
3597 static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff,
3598 u64 flags)
3599 {
3600 int ret;
3601
3602 if (unlikely(flags & ~(BPF_F_ADJ_ROOM_FIXED_GSO |
3603 BPF_F_ADJ_ROOM_DECAP_L3_MASK |
3604 BPF_F_ADJ_ROOM_NO_CSUM_RESET)))
3605 return -EINVAL;
3606
3607 if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) {
3608 /* udp gso_size delineates datagrams, only allow if fixed */
3609 if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) ||
3610 !(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
3611 return -ENOTSUPP;
3612 }
3613
3614 ret = skb_unclone(skb, GFP_ATOMIC);
3615 if (unlikely(ret < 0))
3616 return ret;
3617
3618 ret = bpf_skb_net_hdr_pop(skb, off, len_diff);
3619 if (unlikely(ret < 0))
3620 return ret;
3621
3622 /* Match skb->protocol to new outer l3 protocol */
3623 if (skb->protocol == htons(ETH_P_IP) &&
3624 flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV6)
3625 bpf_skb_change_protocol(skb, ETH_P_IPV6);
3626 else if (skb->protocol == htons(ETH_P_IPV6) &&
3627 flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV4)
3628 bpf_skb_change_protocol(skb, ETH_P_IP);
3629
3630 if (skb_is_gso(skb)) {
3631 struct skb_shared_info *shinfo = skb_shinfo(skb);
3632
3633 /* Due to header shrink, MSS can be upgraded. */
3634 if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
3635 skb_increase_gso_size(shinfo, len_diff);
3636
3637 /* Header must be checked, and gso_segs recomputed. */
3638 shinfo->gso_type |= SKB_GSO_DODGY;
3639 shinfo->gso_segs = 0;
3640 }
3641
3642 return 0;
3643 }
3644
3645 #define BPF_SKB_MAX_LEN SKB_MAX_ALLOC
3646
3647 BPF_CALL_4(sk_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
3648 u32, mode, u64, flags)
3649 {
3650 u32 len_diff_abs = abs(len_diff);
3651 bool shrink = len_diff < 0;
3652 int ret = 0;
3653
3654 if (unlikely(flags || mode))
3655 return -EINVAL;
3656 if (unlikely(len_diff_abs > 0xfffU))
3657 return -EFAULT;
3658
3659 if (!shrink) {
3660 ret = skb_cow(skb, len_diff);
3661 if (unlikely(ret < 0))
3662 return ret;
3663 __skb_push(skb, len_diff_abs);
3664 memset(skb->data, 0, len_diff_abs);
3665 } else {
3666 if (unlikely(!pskb_may_pull(skb, len_diff_abs)))
3667 return -ENOMEM;
3668 __skb_pull(skb, len_diff_abs);
3669 }
3670 if (tls_sw_has_ctx_rx(skb->sk)) {
3671 struct strp_msg *rxm = strp_msg(skb);
3672
3673 rxm->full_len += len_diff;
3674 }
3675 return ret;
3676 }
3677
3678 static const struct bpf_func_proto sk_skb_adjust_room_proto = {
3679 .func = sk_skb_adjust_room,
3680 .gpl_only = false,
3681 .ret_type = RET_INTEGER,
3682 .arg1_type = ARG_PTR_TO_CTX,
3683 .arg2_type = ARG_ANYTHING,
3684 .arg3_type = ARG_ANYTHING,
3685 .arg4_type = ARG_ANYTHING,
3686 };
3687
3688 BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
3689 u32, mode, u64, flags)
3690 {
3691 u32 len_cur, len_diff_abs = abs(len_diff);
3692 u32 len_min = bpf_skb_net_base_len(skb);
3693 u32 len_max = BPF_SKB_MAX_LEN;
3694 __be16 proto = skb->protocol;
3695 bool shrink = len_diff < 0;
3696 u32 off;
3697 int ret;
3698
3699 if (unlikely(flags & ~(BPF_F_ADJ_ROOM_MASK |
3700 BPF_F_ADJ_ROOM_NO_CSUM_RESET)))
3701 return -EINVAL;
3702 if (unlikely(len_diff_abs > 0xfffU))
3703 return -EFAULT;
3704 if (unlikely(proto != htons(ETH_P_IP) &&
3705 proto != htons(ETH_P_IPV6)))
3706 return -ENOTSUPP;
3707
3708 off = skb_mac_header_len(skb);
3709 switch (mode) {
3710 case BPF_ADJ_ROOM_NET:
3711 off += bpf_skb_net_base_len(skb);
3712 break;
3713 case BPF_ADJ_ROOM_MAC:
3714 break;
3715 default:
3716 return -ENOTSUPP;
3717 }
3718
3719 if (flags & BPF_F_ADJ_ROOM_DECAP_L3_MASK) {
3720 if (!shrink)
3721 return -EINVAL;
3722
3723 switch (flags & BPF_F_ADJ_ROOM_DECAP_L3_MASK) {
3724 case BPF_F_ADJ_ROOM_DECAP_L3_IPV4:
3725 len_min = sizeof(struct iphdr);
3726 break;
3727 case BPF_F_ADJ_ROOM_DECAP_L3_IPV6:
3728 len_min = sizeof(struct ipv6hdr);
3729 break;
3730 default:
3731 return -EINVAL;
3732 }
3733 }
3734
3735 len_cur = skb->len - skb_network_offset(skb);
3736 if ((shrink && (len_diff_abs >= len_cur ||
3737 len_cur - len_diff_abs < len_min)) ||
3738 (!shrink && (skb->len + len_diff_abs > len_max &&
3739 !skb_is_gso(skb))))
3740 return -ENOTSUPP;
3741
3742 ret = shrink ? bpf_skb_net_shrink(skb, off, len_diff_abs, flags) :
3743 bpf_skb_net_grow(skb, off, len_diff_abs, flags);
3744 if (!ret && !(flags & BPF_F_ADJ_ROOM_NO_CSUM_RESET))
3745 __skb_reset_checksum_unnecessary(skb);
3746
3747 bpf_compute_data_pointers(skb);
3748 return ret;
3749 }
3750
3751 static const struct bpf_func_proto bpf_skb_adjust_room_proto = {
3752 .func = bpf_skb_adjust_room,
3753 .gpl_only = false,
3754 .ret_type = RET_INTEGER,
3755 .arg1_type = ARG_PTR_TO_CTX,
3756 .arg2_type = ARG_ANYTHING,
3757 .arg3_type = ARG_ANYTHING,
3758 .arg4_type = ARG_ANYTHING,
3759 };
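/* Usage sketch (illustrative only, not part of this file; assumes the usual
 * clang/libbpf BPF build and the UAPI constants from include/uapi/linux/bpf.h).
 * A tc/BPF program calls bpf_skb_adjust_room() to open room at the network
 * layer and then writes the new header itself, e.g. via bpf_skb_store_bytes().
 * The 8-byte size below is an arbitrary example value.
 *
 *	SEC("tc")
 *	int open_room(struct __sk_buff *skb)
 *	{
 *		// Insert 8 bytes of room right after the fixed network header.
 *		if (bpf_skb_adjust_room(skb, 8, BPF_ADJ_ROOM_NET, 0))
 *			return TC_ACT_SHOT;
 *		// ... fill the new bytes with bpf_skb_store_bytes() ...
 *		return TC_ACT_OK;
 *	}
 */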
3760
3761 static u32 __bpf_skb_min_len(const struct sk_buff *skb)
3762 {
3763 int offset = skb_network_offset(skb);
3764 u32 min_len = 0;
3765
3766 if (offset > 0)
3767 min_len = offset;
3768 if (skb_transport_header_was_set(skb)) {
3769 offset = skb_transport_offset(skb);
3770 if (offset > 0)
3771 min_len = offset;
3772 }
3773 if (skb->ip_summed == CHECKSUM_PARTIAL) {
3774 offset = skb_checksum_start_offset(skb) +
3775 skb->csum_offset + sizeof(__sum16);
3776 if (offset > 0)
3777 min_len = offset;
3778 }
3779 return min_len;
3780 }
3781
3782 static int bpf_skb_grow_rcsum(struct sk_buff *skb, unsigned int new_len)
3783 {
3784 unsigned int old_len = skb->len;
3785 int ret;
3786
3787 ret = __skb_grow_rcsum(skb, new_len);
3788 if (!ret)
3789 memset(skb->data + old_len, 0, new_len - old_len);
3790 return ret;
3791 }
3792
3793 static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len)
3794 {
3795 return __skb_trim_rcsum(skb, new_len);
3796 }
3797
3798 static inline int __bpf_skb_change_tail(struct sk_buff *skb, u32 new_len,
3799 u64 flags)
3800 {
3801 u32 max_len = BPF_SKB_MAX_LEN;
3802 u32 min_len = __bpf_skb_min_len(skb);
3803 int ret;
3804
3805 if (unlikely(flags || new_len > max_len || new_len < min_len))
3806 return -EINVAL;
3807 if (skb->encapsulation)
3808 return -ENOTSUPP;
3809
3810 /* The basic idea of this helper is that it's performing the
3811 * needed work to either grow or trim an skb, and the eBPF program
3812 * rewrites the rest via helpers like bpf_skb_store_bytes(),
3813 * bpf_lX_csum_replace() and others rather than passing a raw
3814 * buffer here. This one is a slow path helper and intended
3815 * for replies with control messages.
3816 *
3817 * Like in bpf_skb_change_proto(), we want to keep this rather
3818 * minimal and without protocol specifics so that we are able
3819 * to separate concerns: bpf_skb_store_bytes() should be the
3820 * only one responsible for writing buffers.
3821 *
3822 * It's really expected to be a slow path operation here for
3823 * control message replies, so we're implicitly linearizing,
3824 * uncloning and dropping offloads from the skb by this.
3825 */
3826 ret = __bpf_try_make_writable(skb, skb->len);
3827 if (!ret) {
3828 if (new_len > skb->len)
3829 ret = bpf_skb_grow_rcsum(skb, new_len);
3830 else if (new_len < skb->len)
3831 ret = bpf_skb_trim_rcsum(skb, new_len);
3832 if (!ret && skb_is_gso(skb))
3833 skb_gso_reset(skb);
3834 }
3835 return ret;
3836 }
3837
3838 BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len,
3839 u64, flags)
3840 {
3841 int ret = __bpf_skb_change_tail(skb, new_len, flags);
3842
3843 bpf_compute_data_pointers(skb);
3844 return ret;
3845 }
3846
3847 static const struct bpf_func_proto bpf_skb_change_tail_proto = {
3848 .func = bpf_skb_change_tail,
3849 .gpl_only = false,
3850 .ret_type = RET_INTEGER,
3851 .arg1_type = ARG_PTR_TO_CTX,
3852 .arg2_type = ARG_ANYTHING,
3853 .arg3_type = ARG_ANYTHING,
3854 };
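/* Usage sketch (illustrative only, not part of this file): bpf_skb_change_tail()
 * is the slow-path way for a tc/BPF program to resize a packet, e.g. to trim
 * a request down before rewriting it into a small reply. The 64-byte target
 * below is just an example value.
 *
 *	SEC("tc")
 *	int trim_reply(struct __sk_buff *skb)
 *	{
 *		if (skb->len > 64 && bpf_skb_change_tail(skb, 64, 0))
 *			return TC_ACT_SHOT;
 *		// ... rewrite headers/payload with bpf_skb_store_bytes(),
 *		// bpf_l3_csum_replace(), etc. ...
 *		return TC_ACT_OK;
 *	}
 */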
3855
3856 BPF_CALL_3(sk_skb_change_tail, struct sk_buff *, skb, u32, new_len,
3857 u64, flags)
3858 {
3859 return __bpf_skb_change_tail(skb, new_len, flags);
3860 }
3861
3862 static const struct bpf_func_proto sk_skb_change_tail_proto = {
3863 .func = sk_skb_change_tail,
3864 .gpl_only = false,
3865 .ret_type = RET_INTEGER,
3866 .arg1_type = ARG_PTR_TO_CTX,
3867 .arg2_type = ARG_ANYTHING,
3868 .arg3_type = ARG_ANYTHING,
3869 };
3870
3871 static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room,
3872 u64 flags)
3873 {
3874 u32 max_len = BPF_SKB_MAX_LEN;
3875 u32 new_len = skb->len + head_room;
3876 int ret;
3877
3878 if (unlikely(flags || (!skb_is_gso(skb) && new_len > max_len) ||
3879 new_len < skb->len))
3880 return -EINVAL;
3881
3882 ret = skb_cow(skb, head_room);
3883 if (likely(!ret)) {
3884 /* The idea of this helper is that we currently only
3885 * allow expanding on the mac header. This means that
3886 * skb->protocol, network header, etc, stay as is.
3887 * Compared to bpf_skb_change_tail(), we're more
3888 * flexible since we don't need to linearize or
3889 * reset GSO. The intention is for this helper to be
3890 * used on an L3 skb that needs a mac header pushed
3891 * for redirection into an L2 device.
3892 */
3893 __skb_push(skb, head_room);
3894 memset(skb->data, 0, head_room);
3895 skb_reset_mac_header(skb);
3896 skb_reset_mac_len(skb);
3897 }
3898
3899 return ret;
3900 }
3901
3902 BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room,
3903 u64, flags)
3904 {
3905 int ret = __bpf_skb_change_head(skb, head_room, flags);
3906
3907 bpf_compute_data_pointers(skb);
3908 return ret;
3909 }
3910
3911 static const struct bpf_func_proto bpf_skb_change_head_proto = {
3912 .func = bpf_skb_change_head,
3913 .gpl_only = false,
3914 .ret_type = RET_INTEGER,
3915 .arg1_type = ARG_PTR_TO_CTX,
3916 .arg2_type = ARG_ANYTHING,
3917 .arg3_type = ARG_ANYTHING,
3918 };
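/* Usage sketch (illustrative only, not part of this file; assumes the usual
 * <bpf/bpf_helpers.h>/<bpf/bpf_endian.h> setup). An L3-only skb, e.g. in an
 * lwt_xmit program, can use bpf_skb_change_head() to make room for an
 * Ethernet header before redirecting into an L2 device. The ethhdr contents
 * and TARGET_IFINDEX are placeholders a real program would fill in properly.
 *
 *	SEC("lwt_xmit")
 *	int push_eth(struct __sk_buff *skb)
 *	{
 *		struct ethhdr eth = { .h_proto = bpf_htons(ETH_P_IP) };
 *
 *		if (bpf_skb_change_head(skb, sizeof(eth), 0))
 *			return BPF_DROP;
 *		if (bpf_skb_store_bytes(skb, 0, &eth, sizeof(eth), 0))
 *			return BPF_DROP;
 *		return bpf_redirect(TARGET_IFINDEX, 0);
 *	}
 */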
3919
3920 BPF_CALL_3(sk_skb_change_head, struct sk_buff *, skb, u32, head_room,
3921 u64, flags)
3922 {
3923 return __bpf_skb_change_head(skb, head_room, flags);
3924 }
3925
3926 static const struct bpf_func_proto sk_skb_change_head_proto = {
3927 .func = sk_skb_change_head,
3928 .gpl_only = false,
3929 .ret_type = RET_INTEGER,
3930 .arg1_type = ARG_PTR_TO_CTX,
3931 .arg2_type = ARG_ANYTHING,
3932 .arg3_type = ARG_ANYTHING,
3933 };
3934
3935 BPF_CALL_1(bpf_xdp_get_buff_len, struct xdp_buff*, xdp)
3936 {
3937 return xdp_get_buff_len(xdp);
3938 }
3939
3940 static const struct bpf_func_proto bpf_xdp_get_buff_len_proto = {
3941 .func = bpf_xdp_get_buff_len,
3942 .gpl_only = false,
3943 .ret_type = RET_INTEGER,
3944 .arg1_type = ARG_PTR_TO_CTX,
3945 };
3946
3947 BTF_ID_LIST_SINGLE(bpf_xdp_get_buff_len_bpf_ids, struct, xdp_buff)
3948
3949 const struct bpf_func_proto bpf_xdp_get_buff_len_trace_proto = {
3950 .func = bpf_xdp_get_buff_len,
3951 .gpl_only = false,
3952 .arg1_type = ARG_PTR_TO_BTF_ID,
3953 .arg1_btf_id = &bpf_xdp_get_buff_len_bpf_ids[0],
3954 };
3955
3956 static unsigned long xdp_get_metalen(const struct xdp_buff *xdp)
3957 {
3958 return xdp_data_meta_unsupported(xdp) ? 0 :
3959 xdp->data - xdp->data_meta;
3960 }
3961
3962 BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
3963 {
3964 void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame);
3965 unsigned long metalen = xdp_get_metalen(xdp);
3966 void *data_start = xdp_frame_end + metalen;
3967 void *data = xdp->data + offset;
3968
3969 if (unlikely(data < data_start ||
3970 data > xdp->data_end - ETH_HLEN))
3971 return -EINVAL;
3972
3973 if (metalen)
3974 memmove(xdp->data_meta + offset,
3975 xdp->data_meta, metalen);
3976 xdp->data_meta += offset;
3977 xdp->data = data;
3978
3979 return 0;
3980 }
3981
3982 static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
3983 .func = bpf_xdp_adjust_head,
3984 .gpl_only = false,
3985 .ret_type = RET_INTEGER,
3986 .arg1_type = ARG_PTR_TO_CTX,
3987 .arg2_type = ARG_ANYTHING,
3988 };
3989
3990 void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off,
3991 void *buf, unsigned long len, bool flush)
3992 {
3993 unsigned long ptr_len, ptr_off = 0;
3994 skb_frag_t *next_frag, *end_frag;
3995 struct skb_shared_info *sinfo;
3996 void *src, *dst;
3997 u8 *ptr_buf;
3998
3999 if (likely(xdp->data_end - xdp->data >= off + len)) {
4000 src = flush ? buf : xdp->data + off;
4001 dst = flush ? xdp->data + off : buf;
4002 memcpy(dst, src, len);
4003 return;
4004 }
4005
4006 sinfo = xdp_get_shared_info_from_buff(xdp);
4007 end_frag = &sinfo->frags[sinfo->nr_frags];
4008 next_frag = &sinfo->frags[0];
4009
4010 ptr_len = xdp->data_end - xdp->data;
4011 ptr_buf = xdp->data;
4012
4013 while (true) {
4014 if (off < ptr_off + ptr_len) {
4015 unsigned long copy_off = off - ptr_off;
4016 unsigned long copy_len = min(len, ptr_len - copy_off);
4017
4018 src = flush ? buf : ptr_buf + copy_off;
4019 dst = flush ? ptr_buf + copy_off : buf;
4020 memcpy(dst, src, copy_len);
4021
4022 off += copy_len;
4023 len -= copy_len;
4024 buf += copy_len;
4025 }
4026
4027 if (!len || next_frag == end_frag)
4028 break;
4029
4030 ptr_off += ptr_len;
4031 ptr_buf = skb_frag_address(next_frag);
4032 ptr_len = skb_frag_size(next_frag);
4033 next_frag++;
4034 }
4035 }
4036
4037 void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len)
4038 {
4039 u32 size = xdp->data_end - xdp->data;
4040 struct skb_shared_info *sinfo;
4041 void *addr = xdp->data;
4042 int i;
4043
4044 if (unlikely(offset > 0xffff || len > 0xffff))
4045 return ERR_PTR(-EFAULT);
4046
4047 if (unlikely(offset + len > xdp_get_buff_len(xdp)))
4048 return ERR_PTR(-EINVAL);
4049
4050 if (likely(offset < size)) /* linear area */
4051 goto out;
4052
4053 sinfo = xdp_get_shared_info_from_buff(xdp);
4054 offset -= size;
4055 for (i = 0; i < sinfo->nr_frags; i++) { /* paged area */
4056 u32 frag_size = skb_frag_size(&sinfo->frags[i]);
4057
4058 if (offset < frag_size) {
4059 addr = skb_frag_address(&sinfo->frags[i]);
4060 size = frag_size;
4061 break;
4062 }
4063 offset -= frag_size;
4064 }
4065 out:
4066 return offset + len <= size ? addr + offset : NULL;
4067 }
4068
4069 BPF_CALL_4(bpf_xdp_load_bytes, struct xdp_buff *, xdp, u32, offset,
4070 void *, buf, u32, len)
4071 {
4072 void *ptr;
4073
4074 ptr = bpf_xdp_pointer(xdp, offset, len);
4075 if (IS_ERR(ptr))
4076 return PTR_ERR(ptr);
4077
4078 if (!ptr)
4079 bpf_xdp_copy_buf(xdp, offset, buf, len, false);
4080 else
4081 memcpy(buf, ptr, len);
4082
4083 return 0;
4084 }
4085
4086 static const struct bpf_func_proto bpf_xdp_load_bytes_proto = {
4087 .func = bpf_xdp_load_bytes,
4088 .gpl_only = false,
4089 .ret_type = RET_INTEGER,
4090 .arg1_type = ARG_PTR_TO_CTX,
4091 .arg2_type = ARG_ANYTHING,
4092 .arg3_type = ARG_PTR_TO_UNINIT_MEM,
4093 .arg4_type = ARG_CONST_SIZE,
4094 };
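/* Usage sketch (illustrative only, not part of this file): unlike direct
 * packet access via ctx->data/ctx->data_end, bpf_xdp_load_bytes() also works
 * when the requested range spans into the fragments of a multi-buffer frame,
 * at the cost of a copy. "xdp.frags" is the libbpf section name for
 * frags-aware programs; the 14-byte buffer is just an example.
 *
 *	SEC("xdp.frags")
 *	int sample_bytes(struct xdp_md *ctx)
 *	{
 *		__u8 buf[14];
 *
 *		// Copy 14 bytes even if they are not in the linear area.
 *		if (bpf_xdp_load_bytes(ctx, 0, buf, sizeof(buf)))
 *			return XDP_DROP;
 *		// ... inspect buf ...
 *		return XDP_PASS;
 *	}
 */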
4095
4096 int __bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len)
4097 {
4098 return ____bpf_xdp_load_bytes(xdp, offset, buf, len);
4099 }
4100
4101 BPF_CALL_4(bpf_xdp_store_bytes, struct xdp_buff *, xdp, u32, offset,
4102 void *, buf, u32, len)
4103 {
4104 void *ptr;
4105
4106 ptr = bpf_xdp_pointer(xdp, offset, len);
4107 if (IS_ERR(ptr))
4108 return PTR_ERR(ptr);
4109
4110 if (!ptr)
4111 bpf_xdp_copy_buf(xdp, offset, buf, len, true);
4112 else
4113 memcpy(ptr, buf, len);
4114
4115 return 0;
4116 }
4117
4118 static const struct bpf_func_proto bpf_xdp_store_bytes_proto = {
4119 .func = bpf_xdp_store_bytes,
4120 .gpl_only = false,
4121 .ret_type = RET_INTEGER,
4122 .arg1_type = ARG_PTR_TO_CTX,
4123 .arg2_type = ARG_ANYTHING,
4124 .arg3_type = ARG_PTR_TO_UNINIT_MEM,
4125 .arg4_type = ARG_CONST_SIZE,
4126 };
4127
4128 int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len)
4129 {
4130 return ____bpf_xdp_store_bytes(xdp, offset, buf, len);
4131 }
4132
4133 static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset)
4134 {
4135 struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
4136 skb_frag_t *frag = &sinfo->frags[sinfo->nr_frags - 1];
4137 struct xdp_rxq_info *rxq = xdp->rxq;
4138 unsigned int tailroom;
4139
4140 if (!rxq->frag_size || rxq->frag_size > xdp->frame_sz)
4141 return -EOPNOTSUPP;
4142
4143 tailroom = rxq->frag_size - skb_frag_size(frag) - skb_frag_off(frag);
4144 if (unlikely(offset > tailroom))
4145 return -EINVAL;
4146
4147 memset(skb_frag_address(frag) + skb_frag_size(frag), 0, offset);
4148 skb_frag_size_add(frag, offset);
4149 sinfo->xdp_frags_size += offset;
4150 if (rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL)
4151 xsk_buff_get_tail(xdp)->data_end += offset;
4152
4153 return 0;
4154 }
4155
4156 static void bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink,
4157 enum xdp_mem_type mem_type, bool release)
4158 {
4159 struct xdp_buff *zc_frag = xsk_buff_get_tail(xdp);
4160
4161 if (release) {
4162 xsk_buff_del_tail(zc_frag);
4163 __xdp_return(0, mem_type, false, zc_frag);
4164 } else {
4165 zc_frag->data_end -= shrink;
4166 }
4167 }
4168
4169 static bool bpf_xdp_shrink_data(struct xdp_buff *xdp, skb_frag_t *frag,
4170 int shrink)
4171 {
4172 enum xdp_mem_type mem_type = xdp->rxq->mem.type;
4173 bool release = skb_frag_size(frag) == shrink;
4174
4175 if (mem_type == MEM_TYPE_XSK_BUFF_POOL) {
4176 bpf_xdp_shrink_data_zc(xdp, shrink, mem_type, release);
4177 goto out;
4178 }
4179
4180 if (release)
4181 __xdp_return(skb_frag_netmem(frag), mem_type, false, NULL);
4182
4183 out:
4184 return release;
4185 }
4186
4187 static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset)
4188 {
4189 struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
4190 int i, n_frags_free = 0, len_free = 0;
4191
4192 if (unlikely(offset > (int)xdp_get_buff_len(xdp) - ETH_HLEN))
4193 return -EINVAL;
4194
4195 for (i = sinfo->nr_frags - 1; i >= 0 && offset > 0; i--) {
4196 skb_frag_t *frag = &sinfo->frags[i];
4197 int shrink = min_t(int, offset, skb_frag_size(frag));
4198
4199 len_free += shrink;
4200 offset -= shrink;
4201 if (bpf_xdp_shrink_data(xdp, frag, shrink)) {
4202 n_frags_free++;
4203 } else {
4204 skb_frag_size_sub(frag, shrink);
4205 break;
4206 }
4207 }
4208 sinfo->nr_frags -= n_frags_free;
4209 sinfo->xdp_frags_size -= len_free;
4210
4211 if (unlikely(!sinfo->nr_frags)) {
4212 xdp_buff_clear_frags_flag(xdp);
4213 xdp->data_end -= offset;
4214 }
4215
4216 return 0;
4217 }
4218
4219 BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset)
4220 {
4221 void *data_hard_end = xdp_data_hard_end(xdp); /* use xdp->frame_sz */
4222 void *data_end = xdp->data_end + offset;
4223
4224 if (unlikely(xdp_buff_has_frags(xdp))) { /* non-linear xdp buff */
4225 if (offset < 0)
4226 return bpf_xdp_frags_shrink_tail(xdp, -offset);
4227
4228 return bpf_xdp_frags_increase_tail(xdp, offset);
4229 }
4230
4231 /* Notice that xdp_data_hard_end has reserved some tailroom */
4232 if (unlikely(data_end > data_hard_end))
4233 return -EINVAL;
4234
4235 if (unlikely(data_end < xdp->data + ETH_HLEN))
4236 return -EINVAL;
4237
4238 /* Clear memory area on grow, can contain uninit kernel memory */
4239 if (offset > 0)
4240 memset(xdp->data_end, 0, offset);
4241
4242 xdp->data_end = data_end;
4243
4244 return 0;
4245 }
4246
4247 static const struct bpf_func_proto bpf_xdp_adjust_tail_proto = {
4248 .func = bpf_xdp_adjust_tail,
4249 .gpl_only = false,
4250 .ret_type = RET_INTEGER,
4251 .arg1_type = ARG_PTR_TO_CTX,
4252 .arg2_type = ARG_ANYTHING,
4253 };
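/* Usage sketch (illustrative only, not part of this file): a negative offset
 * trims the frame, a positive one grows it within the frame's tailroom. The
 * 256-byte cap below is just an example limit.
 *
 *	SEC("xdp")
 *	int cap_len(struct xdp_md *ctx)
 *	{
 *		int len = ctx->data_end - ctx->data;
 *
 *		if (len > 256 && bpf_xdp_adjust_tail(ctx, 256 - len))
 *			return XDP_DROP;
 *		return XDP_PASS;
 *	}
 */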
4254
4255 BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset)
4256 {
4257 void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame);
4258 void *meta = xdp->data_meta + offset;
4259 unsigned long metalen = xdp->data - meta;
4260
4261 if (xdp_data_meta_unsupported(xdp))
4262 return -ENOTSUPP;
4263 if (unlikely(meta < xdp_frame_end ||
4264 meta > xdp->data))
4265 return -EINVAL;
4266 if (unlikely(xdp_metalen_invalid(metalen)))
4267 return -EACCES;
4268
4269 xdp->data_meta = meta;
4270
4271 return 0;
4272 }
4273
4274 static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = {
4275 .func = bpf_xdp_adjust_meta,
4276 .gpl_only = false,
4277 .ret_type = RET_INTEGER,
4278 .arg1_type = ARG_PTR_TO_CTX,
4279 .arg2_type = ARG_ANYTHING,
4280 };
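/* Usage sketch (illustrative only, not part of this file): a negative offset
 * grows the metadata area in front of xdp->data; the resulting length must
 * pass xdp_metalen_invalid() (4-byte aligned and bounded). The 32-bit tag
 * stored here could later be read by a cooperating tc program or an AF_XDP
 * consumer; the value 0xcafe is a placeholder.
 *
 *	SEC("xdp")
 *	int tag_pkt(struct xdp_md *ctx)
 *	{
 *		__u32 *meta;
 *
 *		if (bpf_xdp_adjust_meta(ctx, -(int)sizeof(*meta)))
 *			return XDP_PASS;
 *		meta = (void *)(long)ctx->data_meta;
 *		if ((void *)(meta + 1) > (void *)(long)ctx->data)
 *			return XDP_PASS; // keep the verifier happy
 *		*meta = 0xcafe;
 *		return XDP_PASS;
 *	}
 */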
4281
4282 /**
4283 * DOC: xdp redirect
4284 *
4285 * XDP_REDIRECT works by a three-step process, implemented in the functions
4286 * below:
4287 *
4288 * 1. The bpf_redirect() and bpf_redirect_map() helpers will lookup the target
4289 * of the redirect and store it (along with some other metadata) in a per-CPU
4290 * struct bpf_redirect_info.
4291 *
4292 * 2. When the program returns the XDP_REDIRECT return code, the driver will
4293 * call xdp_do_redirect() which will use the information in struct
4294 * bpf_redirect_info to actually enqueue the frame into a map type-specific
4295 * bulk queue structure.
4296 *
4297 * 3. Before exiting its NAPI poll loop, the driver will call
4298 * xdp_do_flush(), which will flush all the different bulk queues,
4299 * thus completing the redirect. Note that xdp_do_flush() must be
4300 * called before napi_complete_done() in the driver, as the
4301 * XDP_REDIRECT logic relies on being inside a single NAPI instance
4302 * through to the xdp_do_flush() call for RCU protection of all
4303 * in-kernel data structures.
4304 */
4305 /*
4306 * Pointers to the map entries will be kept around for this whole sequence of
4307 * steps, protected by RCU. However, there is no top-level rcu_read_lock() in
4308 * the core code; instead, the RCU protection relies on everything happening
4309 * inside a single NAPI poll sequence, which means it's between a pair of calls
4310 * to local_bh_disable()/local_bh_enable().
4311 *
4312 * The map entries are marked as __rcu and the map code makes sure to
4313 * dereference those pointers with rcu_dereference_check() in a way that works
4314 * for both sections that hold an rcu_read_lock() and sections that are
4315 * called from NAPI without a separate rcu_read_lock(). The code below does not
4316 * use RCU annotations, but relies on those in the map code.
4317 */
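/* Usage sketch (illustrative only, not part of this file): the program side
 * of the sequence above is just a helper call plus the XDP_REDIRECT return
 * code. The devmap below is an example; user space is assumed to have
 * populated key 0 with the target ifindex. Passing XDP_PASS in the low bits
 * of the flags selects the action to fall back to if the lookup fails.
 *
 *	struct {
 *		__uint(type, BPF_MAP_TYPE_DEVMAP);
 *		__uint(max_entries, 1);
 *		__type(key, __u32);
 *		__type(value, __u32);
 *	} tx_port SEC(".maps");
 *
 *	SEC("xdp")
 *	int redirect_prog(struct xdp_md *ctx)
 *	{
 *		return bpf_redirect_map(&tx_port, 0, XDP_PASS);
 *	}
 */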
4318 void xdp_do_flush(void)
4319 {
4320 struct list_head *lh_map, *lh_dev, *lh_xsk;
4321
4322 bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk);
4323 if (lh_dev)
4324 __dev_flush(lh_dev);
4325 if (lh_map)
4326 __cpu_map_flush(lh_map);
4327 if (lh_xsk)
4328 __xsk_map_flush(lh_xsk);
4329 }
4330 EXPORT_SYMBOL_GPL(xdp_do_flush);
4331
4332 #if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL)
4333 void xdp_do_check_flushed(struct napi_struct *napi)
4334 {
4335 struct list_head *lh_map, *lh_dev, *lh_xsk;
4336 bool missed = false;
4337
4338 bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk);
4339 if (lh_dev) {
4340 __dev_flush(lh_dev);
4341 missed = true;
4342 }
4343 if (lh_map) {
4344 __cpu_map_flush(lh_map);
4345 missed = true;
4346 }
4347 if (lh_xsk) {
4348 __xsk_map_flush(lh_xsk);
4349 missed = true;
4350 }
4351
4352 WARN_ONCE(missed, "Missing xdp_do_flush() invocation after NAPI by %ps\n",
4353 napi->poll);
4354 }
4355 #endif
4356
4357 DEFINE_STATIC_KEY_FALSE(bpf_master_redirect_enabled_key);
4358 EXPORT_SYMBOL_GPL(bpf_master_redirect_enabled_key);
4359
4360 u32 xdp_master_redirect(struct xdp_buff *xdp)
4361 {
4362 struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
4363 struct net_device *master, *slave;
4364
4365 master = netdev_master_upper_dev_get_rcu(xdp->rxq->dev);
4366 slave = master->netdev_ops->ndo_xdp_get_xmit_slave(master, xdp);
4367 if (slave && slave != xdp->rxq->dev) {
4368 /* The target device is different from the receiving device, so
4369 * redirect the packet to the new device.
4370 * Using XDP_REDIRECT gets the correct behaviour from XDP-enabled
4371 * drivers to unmap the packet from their rx ring.
4372 */
4373 ri->tgt_index = slave->ifindex;
4374 ri->map_id = INT_MAX;
4375 ri->map_type = BPF_MAP_TYPE_UNSPEC;
4376 return XDP_REDIRECT;
4377 }
4378 return XDP_TX;
4379 }
4380 EXPORT_SYMBOL_GPL(xdp_master_redirect);
4381
4382 static inline int __xdp_do_redirect_xsk(struct bpf_redirect_info *ri,
4383 const struct net_device *dev,
4384 struct xdp_buff *xdp,
4385 const struct bpf_prog *xdp_prog)
4386 {
4387 enum bpf_map_type map_type = ri->map_type;
4388 void *fwd = ri->tgt_value;
4389 u32 map_id = ri->map_id;
4390 int err;
4391
4392 ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
4393 ri->map_type = BPF_MAP_TYPE_UNSPEC;
4394
4395 err = __xsk_map_redirect(fwd, xdp);
4396 if (unlikely(err))
4397 goto err;
4398
4399 _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index);
4400 return 0;
4401 err:
4402 _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err);
4403 return err;
4404 }
4405
4406 static __always_inline int
4407 __xdp_do_redirect_frame(struct bpf_redirect_info *ri, struct net_device *dev,
4408 struct xdp_frame *xdpf,
4409 const struct bpf_prog *xdp_prog)
4410 {
4411 enum bpf_map_type map_type = ri->map_type;
4412 void *fwd = ri->tgt_value;
4413 u32 map_id = ri->map_id;
4414 u32 flags = ri->flags;
4415 struct bpf_map *map;
4416 int err;
4417
4418 ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
4419 ri->flags = 0;
4420 ri->map_type = BPF_MAP_TYPE_UNSPEC;
4421
4422 if (unlikely(!xdpf)) {
4423 err = -EOVERFLOW;
4424 goto err;
4425 }
4426
4427 switch (map_type) {
4428 case BPF_MAP_TYPE_DEVMAP:
4429 fallthrough;
4430 case BPF_MAP_TYPE_DEVMAP_HASH:
4431 if (unlikely(flags & BPF_F_BROADCAST)) {
4432 map = READ_ONCE(ri->map);
4433
4434 /* The map pointer is cleared when the map is being torn
4435 * down by dev_map_free()
4436 */
4437 if (unlikely(!map)) {
4438 err = -ENOENT;
4439 break;
4440 }
4441
4442 WRITE_ONCE(ri->map, NULL);
4443 err = dev_map_enqueue_multi(xdpf, dev, map,
4444 flags & BPF_F_EXCLUDE_INGRESS);
4445 } else {
4446 err = dev_map_enqueue(fwd, xdpf, dev);
4447 }
4448 break;
4449 case BPF_MAP_TYPE_CPUMAP:
4450 err = cpu_map_enqueue(fwd, xdpf, dev);
4451 break;
4452 case BPF_MAP_TYPE_UNSPEC:
4453 if (map_id == INT_MAX) {
4454 fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index);
4455 if (unlikely(!fwd)) {
4456 err = -EINVAL;
4457 break;
4458 }
4459 err = dev_xdp_enqueue(fwd, xdpf, dev);
4460 break;
4461 }
4462 fallthrough;
4463 default:
4464 err = -EBADRQC;
4465 }
4466
4467 if (unlikely(err))
4468 goto err;
4469
4470 _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index);
4471 return 0;
4472 err:
4473 _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err);
4474 return err;
4475 }
4476
4477 int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
4478 const struct bpf_prog *xdp_prog)
4479 {
4480 struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
4481 enum bpf_map_type map_type = ri->map_type;
4482
4483 if (map_type == BPF_MAP_TYPE_XSKMAP)
4484 return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog);
4485
4486 return __xdp_do_redirect_frame(ri, dev, xdp_convert_buff_to_frame(xdp),
4487 xdp_prog);
4488 }
4489 EXPORT_SYMBOL_GPL(xdp_do_redirect);
4490
4491 int xdp_do_redirect_frame(struct net_device *dev, struct xdp_buff *xdp,
4492 struct xdp_frame *xdpf,
4493 const struct bpf_prog *xdp_prog)
4494 {
4495 struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
4496 enum bpf_map_type map_type = ri->map_type;
4497
4498 if (map_type == BPF_MAP_TYPE_XSKMAP)
4499 return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog);
4500
4501 return __xdp_do_redirect_frame(ri, dev, xdpf, xdp_prog);
4502 }
4503 EXPORT_SYMBOL_GPL(xdp_do_redirect_frame);
4504
4505 static int xdp_do_generic_redirect_map(struct net_device *dev,
4506 struct sk_buff *skb,
4507 struct xdp_buff *xdp,
4508 const struct bpf_prog *xdp_prog,
4509 void *fwd, enum bpf_map_type map_type,
4510 u32 map_id, u32 flags)
4511 {
4512 struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
4513 struct bpf_map *map;
4514 int err;
4515
4516 switch (map_type) {
4517 case BPF_MAP_TYPE_DEVMAP:
4518 fallthrough;
4519 case BPF_MAP_TYPE_DEVMAP_HASH:
4520 if (unlikely(flags & BPF_F_BROADCAST)) {
4521 map = READ_ONCE(ri->map);
4522
4523 /* The map pointer is cleared when the map is being torn
4524 * down by dev_map_free()
4525 */
4526 if (unlikely(!map)) {
4527 err = -ENOENT;
4528 break;
4529 }
4530
4531 WRITE_ONCE(ri->map, NULL);
4532 err = dev_map_redirect_multi(dev, skb, xdp_prog, map,
4533 flags & BPF_F_EXCLUDE_INGRESS);
4534 } else {
4535 err = dev_map_generic_redirect(fwd, skb, xdp_prog);
4536 }
4537 if (unlikely(err))
4538 goto err;
4539 break;
4540 case BPF_MAP_TYPE_XSKMAP:
4541 err = xsk_generic_rcv(fwd, xdp);
4542 if (err)
4543 goto err;
4544 consume_skb(skb);
4545 break;
4546 case BPF_MAP_TYPE_CPUMAP:
4547 err = cpu_map_generic_redirect(fwd, skb);
4548 if (unlikely(err))
4549 goto err;
4550 break;
4551 default:
4552 err = -EBADRQC;
4553 goto err;
4554 }
4555
4556 _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index);
4557 return 0;
4558 err:
4559 _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err);
4560 return err;
4561 }
4562
4563 int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
4564 struct xdp_buff *xdp,
4565 const struct bpf_prog *xdp_prog)
4566 {
4567 struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
4568 enum bpf_map_type map_type = ri->map_type;
4569 void *fwd = ri->tgt_value;
4570 u32 map_id = ri->map_id;
4571 u32 flags = ri->flags;
4572 int err;
4573
4574 ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
4575 ri->flags = 0;
4576 ri->map_type = BPF_MAP_TYPE_UNSPEC;
4577
4578 if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) {
4579 fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index);
4580 if (unlikely(!fwd)) {
4581 err = -EINVAL;
4582 goto err;
4583 }
4584
4585 err = xdp_ok_fwd_dev(fwd, skb->len);
4586 if (unlikely(err))
4587 goto err;
4588
4589 skb->dev = fwd;
4590 _trace_xdp_redirect(dev, xdp_prog, ri->tgt_index);
4591 generic_xdp_tx(skb, xdp_prog);
4592 return 0;
4593 }
4594
4595 return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, fwd, map_type, map_id, flags);
4596 err:
4597 _trace_xdp_redirect_err(dev, xdp_prog, ri->tgt_index, err);
4598 return err;
4599 }
4600
4601 BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags)
4602 {
4603 struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
4604
4605 if (unlikely(flags))
4606 return XDP_ABORTED;
4607
4608 /* NB! Map type UNSPEC and map_id == INT_MAX (never generated
4609 * by map_idr) are used for ifindex-based XDP redirect.
4610 */
4611 ri->tgt_index = ifindex;
4612 ri->map_id = INT_MAX;
4613 ri->map_type = BPF_MAP_TYPE_UNSPEC;
4614
4615 return XDP_REDIRECT;
4616 }
4617
4618 static const struct bpf_func_proto bpf_xdp_redirect_proto = {
4619 .func = bpf_xdp_redirect,
4620 .gpl_only = false,
4621 .ret_type = RET_INTEGER,
4622 .arg1_type = ARG_ANYTHING,
4623 .arg2_type = ARG_ANYTHING,
4624 };
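/* Usage sketch (illustrative only, not part of this file): the map-less
 * variant simply names an ifindex. TARGET_IFINDEX is a placeholder that the
 * loader would normally supply, e.g. via a global variable or a map.
 *
 *	SEC("xdp")
 *	int redirect_ifindex(struct xdp_md *ctx)
 *	{
 *		return bpf_redirect(TARGET_IFINDEX, 0);
 *	}
 */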
4625
4626 BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u64, key,
4627 u64, flags)
4628 {
4629 return map->ops->map_redirect(map, key, flags);
4630 }
4631
4632 static const struct bpf_func_proto bpf_xdp_redirect_map_proto = {
4633 .func = bpf_xdp_redirect_map,
4634 .gpl_only = false,
4635 .ret_type = RET_INTEGER,
4636 .arg1_type = ARG_CONST_MAP_PTR,
4637 .arg2_type = ARG_ANYTHING,
4638 .arg3_type = ARG_ANYTHING,
4639 };
4640
4641 static unsigned long bpf_skb_copy(void *dst_buff, const void *skb,
4642 unsigned long off, unsigned long len)
4643 {
4644 void *ptr = skb_header_pointer(skb, off, len, dst_buff);
4645
4646 if (unlikely(!ptr))
4647 return len;
4648 if (ptr != dst_buff)
4649 memcpy(dst_buff, ptr, len);
4650
4651 return 0;
4652 }
4653
4654 BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map *, map,
4655 u64, flags, void *, meta, u64, meta_size)
4656 {
4657 u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32;
4658
4659 if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
4660 return -EINVAL;
4661 if (unlikely(!skb || skb_size > skb->len))
4662 return -EFAULT;
4663
4664 return bpf_event_output(map, flags, meta, meta_size, skb, skb_size,
4665 bpf_skb_copy);
4666 }
4667
4668 static const struct bpf_func_proto bpf_skb_event_output_proto = {
4669 .func = bpf_skb_event_output,
4670 .gpl_only = true,
4671 .ret_type = RET_INTEGER,
4672 .arg1_type = ARG_PTR_TO_CTX,
4673 .arg2_type = ARG_CONST_MAP_PTR,
4674 .arg3_type = ARG_ANYTHING,
4675 .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
4676 .arg5_type = ARG_CONST_SIZE_OR_ZERO,
4677 };
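/* Usage sketch (illustrative only, not part of this file): from BPF this is
 * exposed as bpf_perf_event_output(). Encoding a length in the upper 32 bits
 * of the flags (BPF_F_CTXLEN_MASK) appends that many bytes of packet data,
 * copied via bpf_skb_copy() above, after the user-supplied metadata. The map
 * name and the 64-byte sample size are examples.
 *
 *	struct {
 *		__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
 *		__uint(key_size, sizeof(int));
 *		__uint(value_size, sizeof(int));
 *	} events SEC(".maps");
 *
 *	SEC("tc")
 *	int sample(struct __sk_buff *skb)
 *	{
 *		__u64 cookie = bpf_get_socket_cookie(skb);
 *		__u64 len = skb->len < 64 ? skb->len : 64;
 *		__u64 flags = BPF_F_CURRENT_CPU | (len << 32);
 *
 *		bpf_perf_event_output(skb, &events, flags,
 *				      &cookie, sizeof(cookie));
 *		return TC_ACT_OK;
 *	}
 */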
4678
4679 BTF_ID_LIST_SINGLE(bpf_skb_output_btf_ids, struct, sk_buff)
4680
4681 const struct bpf_func_proto bpf_skb_output_proto = {
4682 .func = bpf_skb_event_output,
4683 .gpl_only = true,
4684 .ret_type = RET_INTEGER,
4685 .arg1_type = ARG_PTR_TO_BTF_ID,
4686 .arg1_btf_id = &bpf_skb_output_btf_ids[0],
4687 .arg2_type = ARG_CONST_MAP_PTR,
4688 .arg3_type = ARG_ANYTHING,
4689 .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
4690 .arg5_type = ARG_CONST_SIZE_OR_ZERO,
4691 };
4692
4693 static unsigned short bpf_tunnel_key_af(u64 flags)
4694 {
4695 return flags & BPF_F_TUNINFO_IPV6 ? AF_INET6 : AF_INET;
4696 }
4697
4698 BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key *, to,
4699 u32, size, u64, flags)
4700 {
4701 const struct ip_tunnel_info *info = skb_tunnel_info(skb);
4702 u8 compat[sizeof(struct bpf_tunnel_key)];
4703 void *to_orig = to;
4704 int err;
4705
4706 if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6 |
4707 BPF_F_TUNINFO_FLAGS)))) {
4708 err = -EINVAL;
4709 goto err_clear;
4710 }
4711 if (ip_tunnel_info_af(info) != bpf_tunnel_key_af(flags)) {
4712 err = -EPROTO;
4713 goto err_clear;
4714 }
4715 if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
4716 err = -EINVAL;
4717 switch (size) {
4718 case offsetof(struct bpf_tunnel_key, local_ipv6[0]):
4719 case offsetof(struct bpf_tunnel_key, tunnel_label):
4720 case offsetof(struct bpf_tunnel_key, tunnel_ext):
4721 goto set_compat;
4722 case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
4723 /* Fixup deprecated structure layouts here, so we have
4724 * a common path later on.
4725 */
4726 if (ip_tunnel_info_af(info) != AF_INET)
4727 goto err_clear;
4728 set_compat:
4729 to = (struct bpf_tunnel_key *)compat;
4730 break;
4731 default:
4732 goto err_clear;
4733 }
4734 }
4735
4736 to->tunnel_id = be64_to_cpu(info->key.tun_id);
4737 to->tunnel_tos = info->key.tos;
4738 to->tunnel_ttl = info->key.ttl;
4739 if (flags & BPF_F_TUNINFO_FLAGS)
4740 to->tunnel_flags = ip_tunnel_flags_to_be16(info->key.tun_flags);
4741 else
4742 to->tunnel_ext = 0;
4743
4744 if (flags & BPF_F_TUNINFO_IPV6) {
4745 memcpy(to->remote_ipv6, &info->key.u.ipv6.src,
4746 sizeof(to->remote_ipv6));
4747 memcpy(to->local_ipv6, &info->key.u.ipv6.dst,
4748 sizeof(to->local_ipv6));
4749 to->tunnel_label = be32_to_cpu(info->key.label);
4750 } else {
4751 to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src);
4752 memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3);
4753 to->local_ipv4 = be32_to_cpu(info->key.u.ipv4.dst);
4754 memset(&to->local_ipv6[1], 0, sizeof(__u32) * 3);
4755 to->tunnel_label = 0;
4756 }
4757
4758 if (unlikely(size != sizeof(struct bpf_tunnel_key)))
4759 memcpy(to_orig, to, size);
4760
4761 return 0;
4762 err_clear:
4763 memset(to_orig, 0, size);
4764 return err;
4765 }
4766
4767 static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
4768 .func = bpf_skb_get_tunnel_key,
4769 .gpl_only = false,
4770 .ret_type = RET_INTEGER,
4771 .arg1_type = ARG_PTR_TO_CTX,
4772 .arg2_type = ARG_PTR_TO_UNINIT_MEM,
4773 .arg3_type = ARG_CONST_SIZE,
4774 .arg4_type = ARG_ANYTHING,
4775 };
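/* Usage sketch (illustrative only, not part of this file): on the ingress of
 * a metadata-mode (collect_md) tunnel device, a tc program can read the outer
 * addressing and key/VNI that the tunnel driver recorded for the packet.
 *
 *	SEC("tc")
 *	int tun_ingress(struct __sk_buff *skb)
 *	{
 *		struct bpf_tunnel_key key = {};
 *
 *		if (bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0))
 *			return TC_ACT_OK;
 *		// key.tunnel_id, key.remote_ipv4, ... are now usable
 *		return TC_ACT_OK;
 *	}
 */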
4776
4777 BPF_CALL_3(bpf_skb_get_tunnel_opt, struct sk_buff *, skb, u8 *, to, u32, size)
4778 {
4779 const struct ip_tunnel_info *info = skb_tunnel_info(skb);
4780 int err;
4781
4782 if (unlikely(!info ||
4783 !ip_tunnel_is_options_present(info->key.tun_flags))) {
4784 err = -ENOENT;
4785 goto err_clear;
4786 }
4787 if (unlikely(size < info->options_len)) {
4788 err = -ENOMEM;
4789 goto err_clear;
4790 }
4791
4792 ip_tunnel_info_opts_get(to, info);
4793 if (size > info->options_len)
4794 memset(to + info->options_len, 0, size - info->options_len);
4795
4796 return info->options_len;
4797 err_clear:
4798 memset(to, 0, size);
4799 return err;
4800 }
4801
4802 static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = {
4803 .func = bpf_skb_get_tunnel_opt,
4804 .gpl_only = false,
4805 .ret_type = RET_INTEGER,
4806 .arg1_type = ARG_PTR_TO_CTX,
4807 .arg2_type = ARG_PTR_TO_UNINIT_MEM,
4808 .arg3_type = ARG_CONST_SIZE,
4809 };
4810
4811 static struct metadata_dst __percpu *md_dst;
4812
4813 BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb,
4814 const struct bpf_tunnel_key *, from, u32, size, u64, flags)
4815 {
4816 struct metadata_dst *md = this_cpu_ptr(md_dst);
4817 u8 compat[sizeof(struct bpf_tunnel_key)];
4818 struct ip_tunnel_info *info;
4819
4820 if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX |
4821 BPF_F_DONT_FRAGMENT | BPF_F_SEQ_NUMBER |
4822 BPF_F_NO_TUNNEL_KEY)))
4823 return -EINVAL;
4824 if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
4825 switch (size) {
4826 case offsetof(struct bpf_tunnel_key, local_ipv6[0]):
4827 case offsetof(struct bpf_tunnel_key, tunnel_label):
4828 case offsetof(struct bpf_tunnel_key, tunnel_ext):
4829 case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
4830 /* Fixup deprecated structure layouts here, so we have
4831 * a common path later on.
4832 */
4833 memcpy(compat, from, size);
4834 memset(compat + size, 0, sizeof(compat) - size);
4835 from = (const struct bpf_tunnel_key *) compat;
4836 break;
4837 default:
4838 return -EINVAL;
4839 }
4840 }
4841 if (unlikely((!(flags & BPF_F_TUNINFO_IPV6) && from->tunnel_label) ||
4842 from->tunnel_ext))
4843 return -EINVAL;
4844
4845 skb_dst_drop(skb);
4846 dst_hold((struct dst_entry *) md);
4847 skb_dst_set(skb, (struct dst_entry *) md);
4848
4849 info = &md->u.tun_info;
4850 memset(info, 0, sizeof(*info));
4851 info->mode = IP_TUNNEL_INFO_TX;
4852
4853 __set_bit(IP_TUNNEL_NOCACHE_BIT, info->key.tun_flags);
4854 __assign_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, info->key.tun_flags,
4855 flags & BPF_F_DONT_FRAGMENT);
4856 __assign_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags,
4857 !(flags & BPF_F_ZERO_CSUM_TX));
4858 __assign_bit(IP_TUNNEL_SEQ_BIT, info->key.tun_flags,
4859 flags & BPF_F_SEQ_NUMBER);
4860 __assign_bit(IP_TUNNEL_KEY_BIT, info->key.tun_flags,
4861 !(flags & BPF_F_NO_TUNNEL_KEY));
4862
4863 info->key.tun_id = cpu_to_be64(from->tunnel_id);
4864 info->key.tos = from->tunnel_tos;
4865 info->key.ttl = from->tunnel_ttl;
4866
4867 if (flags & BPF_F_TUNINFO_IPV6) {
4868 info->mode |= IP_TUNNEL_INFO_IPV6;
4869 memcpy(&info->key.u.ipv6.dst, from->remote_ipv6,
4870 sizeof(from->remote_ipv6));
4871 memcpy(&info->key.u.ipv6.src, from->local_ipv6,
4872 sizeof(from->local_ipv6));
4873 info->key.label = cpu_to_be32(from->tunnel_label) &
4874 IPV6_FLOWLABEL_MASK;
4875 } else {
4876 info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4);
4877 info->key.u.ipv4.src = cpu_to_be32(from->local_ipv4);
4878 info->key.flow_flags = FLOWI_FLAG_ANYSRC;
4879 }
4880
4881 return 0;
4882 }
4883
4884 static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
4885 .func = bpf_skb_set_tunnel_key,
4886 .gpl_only = false,
4887 .ret_type = RET_INTEGER,
4888 .arg1_type = ARG_PTR_TO_CTX,
4889 .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
4890 .arg3_type = ARG_CONST_SIZE,
4891 .arg4_type = ARG_ANYTHING,
4892 };
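/* Usage sketch (illustrative only, not part of this file): the set side
 * attaches a per-packet metadata dst (md_dst above) so that a collect_md
 * tunnel device encapsulates on egress with the parameters chosen by the
 * program. The id and address below are placeholders; remote_ipv4 is given
 * in host byte order.
 *
 *	SEC("tc")
 *	int tun_egress(struct __sk_buff *skb)
 *	{
 *		struct bpf_tunnel_key key = {
 *			.tunnel_id	= 42,
 *			.remote_ipv4	= 0xac100164, // 172.16.1.100, example
 *			.tunnel_ttl	= 64,
 *		};
 *
 *		if (bpf_skb_set_tunnel_key(skb, &key, sizeof(key),
 *					   BPF_F_ZERO_CSUM_TX))
 *			return TC_ACT_SHOT;
 *		return TC_ACT_OK;
 *	}
 */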
4893
4894 BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb,
4895 const u8 *, from, u32, size)
4896 {
4897 struct ip_tunnel_info *info = skb_tunnel_info(skb);
4898 const struct metadata_dst *md = this_cpu_ptr(md_dst);
4899 IP_TUNNEL_DECLARE_FLAGS(present) = { };
4900
4901 if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1))))
4902 return -EINVAL;
4903 if (unlikely(size > IP_TUNNEL_OPTS_MAX))
4904 return -ENOMEM;
4905
4906 ip_tunnel_set_options_present(present);
4907 ip_tunnel_info_opts_set(info, from, size, present);
4908
4909 return 0;
4910 }
4911
4912 static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = {
4913 .func = bpf_skb_set_tunnel_opt,
4914 .gpl_only = false,
4915 .ret_type = RET_INTEGER,
4916 .arg1_type = ARG_PTR_TO_CTX,
4917 .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
4918 .arg3_type = ARG_CONST_SIZE,
4919 };
4920
4921 static const struct bpf_func_proto *
4922 bpf_get_skb_set_tunnel_proto(enum bpf_func_id which)
4923 {
4924 if (!md_dst) {
4925 struct metadata_dst __percpu *tmp;
4926
4927 tmp = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX,
4928 METADATA_IP_TUNNEL,
4929 GFP_KERNEL);
4930 if (!tmp)
4931 return NULL;
4932 if (cmpxchg(&md_dst, NULL, tmp))
4933 metadata_dst_free_percpu(tmp);
4934 }
4935
4936 switch (which) {
4937 case BPF_FUNC_skb_set_tunnel_key:
4938 return &bpf_skb_set_tunnel_key_proto;
4939 case BPF_FUNC_skb_set_tunnel_opt:
4940 return &bpf_skb_set_tunnel_opt_proto;
4941 default:
4942 return NULL;
4943 }
4944 }
4945
4946 BPF_CALL_3(bpf_skb_under_cgroup, struct sk_buff *, skb, struct bpf_map *, map,
4947 u32, idx)
4948 {
4949 struct bpf_array *array = container_of(map, struct bpf_array, map);
4950 struct cgroup *cgrp;
4951 struct sock *sk;
4952
4953 sk = skb_to_full_sk(skb);
4954 if (!sk || !sk_fullsock(sk))
4955 return -ENOENT;
4956 if (unlikely(idx >= array->map.max_entries))
4957 return -E2BIG;
4958
4959 cgrp = READ_ONCE(array->ptrs[idx]);
4960 if (unlikely(!cgrp))
4961 return -EAGAIN;
4962
4963 return sk_under_cgroup_hierarchy(sk, cgrp);
4964 }
4965
4966 static const struct bpf_func_proto bpf_skb_under_cgroup_proto = {
4967 .func = bpf_skb_under_cgroup,
4968 .gpl_only = false,
4969 .ret_type = RET_INTEGER,
4970 .arg1_type = ARG_PTR_TO_CTX,
4971 .arg2_type = ARG_CONST_MAP_PTR,
4972 .arg3_type = ARG_ANYTHING,
4973 };
4974
4975 #ifdef CONFIG_SOCK_CGROUP_DATA
4976 static inline u64 __bpf_sk_cgroup_id(struct sock *sk)
4977 {
4978 struct cgroup *cgrp;
4979
4980 sk = sk_to_full_sk(sk);
4981 if (!sk || !sk_fullsock(sk))
4982 return 0;
4983
4984 cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
4985 return cgroup_id(cgrp);
4986 }
4987
4988 BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb)
4989 {
4990 return __bpf_sk_cgroup_id(skb->sk);
4991 }
4992
4993 static const struct bpf_func_proto bpf_skb_cgroup_id_proto = {
4994 .func = bpf_skb_cgroup_id,
4995 .gpl_only = false,
4996 .ret_type = RET_INTEGER,
4997 .arg1_type = ARG_PTR_TO_CTX,
4998 };
4999
5000 static inline u64 __bpf_sk_ancestor_cgroup_id(struct sock *sk,
5001 int ancestor_level)
5002 {
5003 struct cgroup *ancestor;
5004 struct cgroup *cgrp;
5005
5006 sk = sk_to_full_sk(sk);
5007 if (!sk || !sk_fullsock(sk))
5008 return 0;
5009
5010 cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
5011 ancestor = cgroup_ancestor(cgrp, ancestor_level);
5012 if (!ancestor)
5013 return 0;
5014
5015 return cgroup_id(ancestor);
5016 }
5017
5018 BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int,
5019 ancestor_level)
5020 {
5021 return __bpf_sk_ancestor_cgroup_id(skb->sk, ancestor_level);
5022 }
5023
5024 static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = {
5025 .func = bpf_skb_ancestor_cgroup_id,
5026 .gpl_only = false,
5027 .ret_type = RET_INTEGER,
5028 .arg1_type = ARG_PTR_TO_CTX,
5029 .arg2_type = ARG_ANYTHING,
5030 };
5031
5032 BPF_CALL_1(bpf_sk_cgroup_id, struct sock *, sk)
5033 {
5034 return __bpf_sk_cgroup_id(sk);
5035 }
5036
5037 static const struct bpf_func_proto bpf_sk_cgroup_id_proto = {
5038 .func = bpf_sk_cgroup_id,
5039 .gpl_only = false,
5040 .ret_type = RET_INTEGER,
5041 .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
5042 };
5043
5044 BPF_CALL_2(bpf_sk_ancestor_cgroup_id, struct sock *, sk, int, ancestor_level)
5045 {
5046 return __bpf_sk_ancestor_cgroup_id(sk, ancestor_level);
5047 }
5048
5049 static const struct bpf_func_proto bpf_sk_ancestor_cgroup_id_proto = {
5050 .func = bpf_sk_ancestor_cgroup_id,
5051 .gpl_only = false,
5052 .ret_type = RET_INTEGER,
5053 .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
5054 .arg2_type = ARG_ANYTHING,
5055 };
5056 #endif
5057
5058 static unsigned long bpf_xdp_copy(void *dst, const void *ctx,
5059 unsigned long off, unsigned long len)
5060 {
5061 struct xdp_buff *xdp = (struct xdp_buff *)ctx;
5062
5063 bpf_xdp_copy_buf(xdp, off, dst, len, false);
5064 return 0;
5065 }
5066
5067 BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map,
5068 u64, flags, void *, meta, u64, meta_size)
5069 {
5070 u64 xdp_size = (flags & BPF_F_CTXLEN_MASK) >> 32;
5071
5072 if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
5073 return -EINVAL;
5074
5075 if (unlikely(!xdp || xdp_size > xdp_get_buff_len(xdp)))
5076 return -EFAULT;
5077
5078 return bpf_event_output(map, flags, meta, meta_size, xdp,
5079 xdp_size, bpf_xdp_copy);
5080 }
5081
5082 static const struct bpf_func_proto bpf_xdp_event_output_proto = {
5083 .func = bpf_xdp_event_output,
5084 .gpl_only = true,
5085 .ret_type = RET_INTEGER,
5086 .arg1_type = ARG_PTR_TO_CTX,
5087 .arg2_type = ARG_CONST_MAP_PTR,
5088 .arg3_type = ARG_ANYTHING,
5089 .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
5090 .arg5_type = ARG_CONST_SIZE_OR_ZERO,
5091 };
5092
5093 BTF_ID_LIST_SINGLE(bpf_xdp_output_btf_ids, struct, xdp_buff)
5094
5095 const struct bpf_func_proto bpf_xdp_output_proto = {
5096 .func = bpf_xdp_event_output,
5097 .gpl_only = true,
5098 .ret_type = RET_INTEGER,
5099 .arg1_type = ARG_PTR_TO_BTF_ID,
5100 .arg1_btf_id = &bpf_xdp_output_btf_ids[0],
5101 .arg2_type = ARG_CONST_MAP_PTR,
5102 .arg3_type = ARG_ANYTHING,
5103 .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
5104 .arg5_type = ARG_CONST_SIZE_OR_ZERO,
5105 };
5106
5107 BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb)
5108 {
5109 return skb->sk ? __sock_gen_cookie(skb->sk) : 0;
5110 }
5111
5112 static const struct bpf_func_proto bpf_get_socket_cookie_proto = {
5113 .func = bpf_get_socket_cookie,
5114 .gpl_only = false,
5115 .ret_type = RET_INTEGER,
5116 .arg1_type = ARG_PTR_TO_CTX,
5117 };
5118
5119 BPF_CALL_1(bpf_get_socket_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx)
5120 {
5121 return __sock_gen_cookie(ctx->sk);
5122 }
5123
5124 static const struct bpf_func_proto bpf_get_socket_cookie_sock_addr_proto = {
5125 .func = bpf_get_socket_cookie_sock_addr,
5126 .gpl_only = false,
5127 .ret_type = RET_INTEGER,
5128 .arg1_type = ARG_PTR_TO_CTX,
5129 };
5130
5131 BPF_CALL_1(bpf_get_socket_cookie_sock, struct sock *, ctx)
5132 {
5133 return __sock_gen_cookie(ctx);
5134 }
5135
5136 static const struct bpf_func_proto bpf_get_socket_cookie_sock_proto = {
5137 .func = bpf_get_socket_cookie_sock,
5138 .gpl_only = false,
5139 .ret_type = RET_INTEGER,
5140 .arg1_type = ARG_PTR_TO_CTX,
5141 };
5142
5143 BPF_CALL_1(bpf_get_socket_ptr_cookie, struct sock *, sk)
5144 {
5145 return sk ? sock_gen_cookie(sk) : 0;
5146 }
5147
5148 const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto = {
5149 .func = bpf_get_socket_ptr_cookie,
5150 .gpl_only = false,
5151 .ret_type = RET_INTEGER,
5152 .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON | PTR_MAYBE_NULL,
5153 };
5154
5155 BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx)
5156 {
5157 return __sock_gen_cookie(ctx->sk);
5158 }
5159
5160 static const struct bpf_func_proto bpf_get_socket_cookie_sock_ops_proto = {
5161 .func = bpf_get_socket_cookie_sock_ops,
5162 .gpl_only = false,
5163 .ret_type = RET_INTEGER,
5164 .arg1_type = ARG_PTR_TO_CTX,
5165 };
5166
5167 static u64 __bpf_get_netns_cookie(struct sock *sk)
5168 {
5169 const struct net *net = sk ? sock_net(sk) : &init_net;
5170
5171 return net->net_cookie;
5172 }
5173
5174 BPF_CALL_1(bpf_get_netns_cookie, struct sk_buff *, skb)
5175 {
5176 return __bpf_get_netns_cookie(skb && skb->sk ? skb->sk : NULL);
5177 }
5178
5179 static const struct bpf_func_proto bpf_get_netns_cookie_proto = {
5180 .func = bpf_get_netns_cookie,
5181 .ret_type = RET_INTEGER,
5182 .arg1_type = ARG_PTR_TO_CTX_OR_NULL,
5183 };
5184
5185 BPF_CALL_1(bpf_get_netns_cookie_sock, struct sock *, ctx)
5186 {
5187 return __bpf_get_netns_cookie(ctx);
5188 }
5189
5190 static const struct bpf_func_proto bpf_get_netns_cookie_sock_proto = {
5191 .func = bpf_get_netns_cookie_sock,
5192 .gpl_only = false,
5193 .ret_type = RET_INTEGER,
5194 .arg1_type = ARG_PTR_TO_CTX_OR_NULL,
5195 };
5196
5197 BPF_CALL_1(bpf_get_netns_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx)
5198 {
5199 return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL);
5200 }
5201
5202 static const struct bpf_func_proto bpf_get_netns_cookie_sock_addr_proto = {
5203 .func = bpf_get_netns_cookie_sock_addr,
5204 .gpl_only = false,
5205 .ret_type = RET_INTEGER,
5206 .arg1_type = ARG_PTR_TO_CTX_OR_NULL,
5207 };
5208
5209 BPF_CALL_1(bpf_get_netns_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx)
5210 {
5211 return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL);
5212 }
5213
5214 static const struct bpf_func_proto bpf_get_netns_cookie_sock_ops_proto = {
5215 .func = bpf_get_netns_cookie_sock_ops,
5216 .gpl_only = false,
5217 .ret_type = RET_INTEGER,
5218 .arg1_type = ARG_PTR_TO_CTX_OR_NULL,
5219 };
5220
5221 BPF_CALL_1(bpf_get_netns_cookie_sk_msg, struct sk_msg *, ctx)
5222 {
5223 return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL);
5224 }
5225
5226 static const struct bpf_func_proto bpf_get_netns_cookie_sk_msg_proto = {
5227 .func = bpf_get_netns_cookie_sk_msg,
5228 .gpl_only = false,
5229 .ret_type = RET_INTEGER,
5230 .arg1_type = ARG_PTR_TO_CTX_OR_NULL,
5231 };
5232
5233 BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb)
5234 {
5235 struct sock *sk = sk_to_full_sk(skb->sk);
5236 kuid_t kuid;
5237
5238 if (!sk || !sk_fullsock(sk))
5239 return overflowuid;
5240 kuid = sock_net_uid(sock_net(sk), sk);
5241 return from_kuid_munged(sock_net(sk)->user_ns, kuid);
5242 }
5243
5244 static const struct bpf_func_proto bpf_get_socket_uid_proto = {
5245 .func = bpf_get_socket_uid,
5246 .gpl_only = false,
5247 .ret_type = RET_INTEGER,
5248 .arg1_type = ARG_PTR_TO_CTX,
5249 };
5250
5251 static int sk_bpf_set_get_cb_flags(struct sock *sk, char *optval, bool getopt)
5252 {
5253 u32 sk_bpf_cb_flags;
5254
5255 if (getopt) {
5256 *(u32 *)optval = sk->sk_bpf_cb_flags;
5257 return 0;
5258 }
5259
5260 sk_bpf_cb_flags = *(u32 *)optval;
5261
5262 if (sk_bpf_cb_flags & ~SK_BPF_CB_MASK)
5263 return -EINVAL;
5264
5265 sk->sk_bpf_cb_flags = sk_bpf_cb_flags;
5266
5267 return 0;
5268 }
5269
5270 static int sol_socket_sockopt(struct sock *sk, int optname,
5271 char *optval, int *optlen,
5272 bool getopt)
5273 {
5274 switch (optname) {
5275 case SO_REUSEADDR:
5276 case SO_SNDBUF:
5277 case SO_RCVBUF:
5278 case SO_KEEPALIVE:
5279 case SO_PRIORITY:
5280 case SO_REUSEPORT:
5281 case SO_RCVLOWAT:
5282 case SO_MARK:
5283 case SO_MAX_PACING_RATE:
5284 case SO_BINDTOIFINDEX:
5285 case SO_TXREHASH:
5286 case SK_BPF_CB_FLAGS:
5287 if (*optlen != sizeof(int))
5288 return -EINVAL;
5289 break;
5290 case SO_BINDTODEVICE:
5291 break;
5292 default:
5293 return -EINVAL;
5294 }
5295
5296 if (optname == SK_BPF_CB_FLAGS)
5297 return sk_bpf_set_get_cb_flags(sk, optval, getopt);
5298
5299 if (getopt) {
5300 if (optname == SO_BINDTODEVICE)
5301 return -EINVAL;
5302 return sk_getsockopt(sk, SOL_SOCKET, optname,
5303 KERNEL_SOCKPTR(optval),
5304 KERNEL_SOCKPTR(optlen));
5305 }
5306
5307 return sk_setsockopt(sk, SOL_SOCKET, optname,
5308 KERNEL_SOCKPTR(optval), *optlen);
5309 }
5310
5311 static int bpf_sol_tcp_getsockopt(struct sock *sk, int optname,
5312 char *optval, int optlen)
5313 {
5314 if (optlen != sizeof(int))
5315 return -EINVAL;
5316
5317 switch (optname) {
5318 case TCP_BPF_SOCK_OPS_CB_FLAGS: {
5319 int cb_flags = tcp_sk(sk)->bpf_sock_ops_cb_flags;
5320
5321 memcpy(optval, &cb_flags, optlen);
5322 break;
5323 }
5324 case TCP_BPF_RTO_MIN: {
5325 int rto_min_us = jiffies_to_usecs(inet_csk(sk)->icsk_rto_min);
5326
5327 memcpy(optval, &rto_min_us, optlen);
5328 break;
5329 }
5330 case TCP_BPF_DELACK_MAX: {
5331 int delack_max_us = jiffies_to_usecs(inet_csk(sk)->icsk_delack_max);
5332
5333 memcpy(optval, &delack_max_us, optlen);
5334 break;
5335 }
5336 default:
5337 return -EINVAL;
5338 }
5339
5340 return 0;
5341 }
5342
5343 static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname,
5344 char *optval, int optlen)
5345 {
5346 struct tcp_sock *tp = tcp_sk(sk);
5347 unsigned long timeout;
5348 int val;
5349
5350 if (optlen != sizeof(int))
5351 return -EINVAL;
5352
5353 val = *(int *)optval;
5354
5355 /* Only some options are supported */
5356 switch (optname) {
5357 case TCP_BPF_IW:
5358 if (val <= 0 || tp->data_segs_out > tp->syn_data)
5359 return -EINVAL;
5360 tcp_snd_cwnd_set(tp, val);
5361 break;
5362 case TCP_BPF_SNDCWND_CLAMP:
5363 if (val <= 0)
5364 return -EINVAL;
5365 tp->snd_cwnd_clamp = val;
5366 tp->snd_ssthresh = val;
5367 break;
5368 case TCP_BPF_DELACK_MAX:
5369 timeout = usecs_to_jiffies(val);
5370 if (timeout > TCP_DELACK_MAX ||
5371 timeout < TCP_TIMEOUT_MIN)
5372 return -EINVAL;
5373 inet_csk(sk)->icsk_delack_max = timeout;
5374 break;
5375 case TCP_BPF_RTO_MIN:
5376 timeout = usecs_to_jiffies(val);
5377 if (timeout > TCP_RTO_MIN ||
5378 timeout < TCP_TIMEOUT_MIN)
5379 return -EINVAL;
5380 inet_csk(sk)->icsk_rto_min = timeout;
5381 break;
5382 case TCP_BPF_SOCK_OPS_CB_FLAGS:
5383 if (val & ~(BPF_SOCK_OPS_ALL_CB_FLAGS))
5384 return -EINVAL;
5385 tp->bpf_sock_ops_cb_flags = val;
5386 break;
5387 default:
5388 return -EINVAL;
5389 }
5390
5391 return 0;
5392 }
5393
5394 static int sol_tcp_sockopt_congestion(struct sock *sk, char *optval,
5395 int *optlen, bool getopt)
5396 {
5397 struct tcp_sock *tp;
5398 int ret;
5399
5400 if (*optlen < 2)
5401 return -EINVAL;
5402
5403 if (getopt) {
5404 if (!inet_csk(sk)->icsk_ca_ops)
5405 return -EINVAL;
5406 /* BPF expects NULL-terminated tcp-cc string */
5407 optval[--(*optlen)] = '\0';
5408 return do_tcp_getsockopt(sk, SOL_TCP, TCP_CONGESTION,
5409 KERNEL_SOCKPTR(optval),
5410 KERNEL_SOCKPTR(optlen));
5411 }
5412
5413 /* "cdg" is the only cc that alloc a ptr
5414 * in inet_csk_ca area. The bpf-tcp-cc may
5415 * overwrite this ptr after switching to cdg.
5416 */
5417 if (*optlen >= sizeof("cdg") - 1 && !strncmp("cdg", optval, *optlen))
5418 return -ENOTSUPP;
5419
5420 /* This stops the following loop:
5421 *
5422 * .init => bpf_setsockopt(tcp_cc) => .init =>
5423 * bpf_setsockopt(tcp_cc) => .init => ....
5424 *
5425 * The second bpf_setsockopt(tcp_cc) is not allowed
5426 * in order to break the loop when both .init
5427 * are the same bpf prog.
5428 *
5429 * This applies even if the second bpf_setsockopt(tcp_cc)
5430 * does not cause a loop. It means only the first
5431 * '.init' can call bpf_setsockopt(TCP_CONGESTION) to
5432 * pick a fallback cc (e.g. the peer does not support ECN)
5433 * and the second '.init' cannot fall back to
5434 * another.
5435 */
5436 tp = tcp_sk(sk);
5437 if (tp->bpf_chg_cc_inprogress)
5438 return -EBUSY;
5439
5440 tp->bpf_chg_cc_inprogress = 1;
5441 ret = do_tcp_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
5442 KERNEL_SOCKPTR(optval), *optlen);
5443 tp->bpf_chg_cc_inprogress = 0;
5444 return ret;
5445 }
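/* Usage sketch (illustrative only, not part of this file; assumes the usual
 * SOL_TCP/TCP_CONGESTION definitions are available to the BPF program): a
 * typical caller of the TCP_CONGESTION path above is a sockops program
 * switching the congestion control once a connection is established. The
 * "cubic" choice is just an example.
 *
 *	SEC("sockops")
 *	int set_cc(struct bpf_sock_ops *skops)
 *	{
 *		char cc[] = "cubic";
 *
 *		if (skops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB)
 *			bpf_setsockopt(skops, SOL_TCP, TCP_CONGESTION,
 *				       cc, sizeof(cc));
 *		return 1;
 *	}
 */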
5446
5447 static int sol_tcp_sockopt(struct sock *sk, int optname,
5448 char *optval, int *optlen,
5449 bool getopt)
5450 {
5451 if (sk->sk_protocol != IPPROTO_TCP)
5452 return -EINVAL;
5453
5454 switch (optname) {
5455 case TCP_NODELAY:
5456 case TCP_MAXSEG:
5457 case TCP_KEEPIDLE:
5458 case TCP_KEEPINTVL:
5459 case TCP_KEEPCNT:
5460 case TCP_SYNCNT:
5461 case TCP_WINDOW_CLAMP:
5462 case TCP_THIN_LINEAR_TIMEOUTS:
5463 case TCP_USER_TIMEOUT:
5464 case TCP_NOTSENT_LOWAT:
5465 case TCP_SAVE_SYN:
5466 case TCP_RTO_MAX_MS:
5467 if (*optlen != sizeof(int))
5468 return -EINVAL;
5469 break;
5470 case TCP_CONGESTION:
5471 return sol_tcp_sockopt_congestion(sk, optval, optlen, getopt);
5472 case TCP_SAVED_SYN:
5473 if (*optlen < 1)
5474 return -EINVAL;
5475 break;
5476 default:
5477 if (getopt)
5478 return bpf_sol_tcp_getsockopt(sk, optname, optval, *optlen);
5479 return bpf_sol_tcp_setsockopt(sk, optname, optval, *optlen);
5480 }
5481
5482 if (getopt) {
5483 if (optname == TCP_SAVED_SYN) {
5484 struct tcp_sock *tp = tcp_sk(sk);
5485
5486 if (!tp->saved_syn ||
5487 *optlen > tcp_saved_syn_len(tp->saved_syn))
5488 return -EINVAL;
5489 memcpy(optval, tp->saved_syn->data, *optlen);
5490 /* We cannot free tp->saved_syn here because we
5491 * do not know whether user space still needs it.
5492 */
5493 return 0;
5494 }
5495
5496 return do_tcp_getsockopt(sk, SOL_TCP, optname,
5497 KERNEL_SOCKPTR(optval),
5498 KERNEL_SOCKPTR(optlen));
5499 }
5500
5501 return do_tcp_setsockopt(sk, SOL_TCP, optname,
5502 KERNEL_SOCKPTR(optval), *optlen);
5503 }
5504
5505 static int sol_ip_sockopt(struct sock *sk, int optname,
5506 char *optval, int *optlen,
5507 bool getopt)
5508 {
5509 if (sk->sk_family != AF_INET)
5510 return -EINVAL;
5511
5512 switch (optname) {
5513 case IP_TOS:
5514 if (*optlen != sizeof(int))
5515 return -EINVAL;
5516 break;
5517 default:
5518 return -EINVAL;
5519 }
5520
5521 if (getopt)
5522 return do_ip_getsockopt(sk, SOL_IP, optname,
5523 KERNEL_SOCKPTR(optval),
5524 KERNEL_SOCKPTR(optlen));
5525
5526 return do_ip_setsockopt(sk, SOL_IP, optname,
5527 KERNEL_SOCKPTR(optval), *optlen);
5528 }
5529
5530 static int sol_ipv6_sockopt(struct sock *sk, int optname,
5531 char *optval, int *optlen,
5532 bool getopt)
5533 {
5534 if (sk->sk_family != AF_INET6)
5535 return -EINVAL;
5536
5537 switch (optname) {
5538 case IPV6_TCLASS:
5539 case IPV6_AUTOFLOWLABEL:
5540 if (*optlen != sizeof(int))
5541 return -EINVAL;
5542 break;
5543 default:
5544 return -EINVAL;
5545 }
5546
5547 if (getopt)
5548 return ipv6_bpf_stub->ipv6_getsockopt(sk, SOL_IPV6, optname,
5549 KERNEL_SOCKPTR(optval),
5550 KERNEL_SOCKPTR(optlen));
5551
5552 return ipv6_bpf_stub->ipv6_setsockopt(sk, SOL_IPV6, optname,
5553 KERNEL_SOCKPTR(optval), *optlen);
5554 }
5555
5556 static int __bpf_setsockopt(struct sock *sk, int level, int optname,
5557 char *optval, int optlen)
5558 {
5559 if (!sk_fullsock(sk))
5560 return -EINVAL;
5561
5562 if (level == SOL_SOCKET)
5563 return sol_socket_sockopt(sk, optname, optval, &optlen, false);
5564 else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP)
5565 return sol_ip_sockopt(sk, optname, optval, &optlen, false);
5566 else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6)
5567 return sol_ipv6_sockopt(sk, optname, optval, &optlen, false);
5568 else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP)
5569 return sol_tcp_sockopt(sk, optname, optval, &optlen, false);
5570
5571 return -EINVAL;
5572 }
5573
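/* sock_ops callbacks at or below BPF_SOCK_OPS_WRITE_HDR_OPT_CB are run
 * with the socket owned by the caller; only those may use the locked
 * _bpf_setsockopt()/_bpf_getsockopt() paths below.
 */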
5574 static bool is_locked_tcp_sock_ops(struct bpf_sock_ops_kern *bpf_sock)
5575 {
5576 return bpf_sock->op <= BPF_SOCK_OPS_WRITE_HDR_OPT_CB;
5577 }
5578
5579 static int _bpf_setsockopt(struct sock *sk, int level, int optname,
5580 char *optval, int optlen)
5581 {
5582 if (sk_fullsock(sk))
5583 sock_owned_by_me(sk);
5584 return __bpf_setsockopt(sk, level, optname, optval, optlen);
5585 }
5586
5587 static int __bpf_getsockopt(struct sock *sk, int level, int optname,
5588 char *optval, int optlen)
5589 {
5590 int err, saved_optlen = optlen;
5591
5592 if (!sk_fullsock(sk)) {
5593 err = -EINVAL;
5594 goto done;
5595 }
5596
5597 if (level == SOL_SOCKET)
5598 err = sol_socket_sockopt(sk, optname, optval, &optlen, true);
5599 else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP)
5600 err = sol_tcp_sockopt(sk, optname, optval, &optlen, true);
5601 else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP)
5602 err = sol_ip_sockopt(sk, optname, optval, &optlen, true);
5603 else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6)
5604 err = sol_ipv6_sockopt(sk, optname, optval, &optlen, true);
5605 else
5606 err = -EINVAL;
5607
5608 done:
5609 if (err)
5610 optlen = 0;
5611 if (optlen < saved_optlen)
5612 memset(optval + optlen, 0, saved_optlen - optlen);
5613 return err;
5614 }
5615
5616 static int _bpf_getsockopt(struct sock *sk, int level, int optname,
5617 char *optval, int optlen)
5618 {
5619 if (sk_fullsock(sk))
5620 sock_owned_by_me(sk);
5621 return __bpf_getsockopt(sk, level, optname, optval, optlen);
5622 }
5623
5624 BPF_CALL_5(bpf_sk_setsockopt, struct sock *, sk, int, level,
5625 int, optname, char *, optval, int, optlen)
5626 {
5627 return _bpf_setsockopt(sk, level, optname, optval, optlen);
5628 }
5629
5630 const struct bpf_func_proto bpf_sk_setsockopt_proto = {
5631 .func = bpf_sk_setsockopt,
5632 .gpl_only = false,
5633 .ret_type = RET_INTEGER,
5634 .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
5635 .arg2_type = ARG_ANYTHING,
5636 .arg3_type = ARG_ANYTHING,
5637 .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
5638 .arg5_type = ARG_CONST_SIZE,
5639 };
5640
5641 BPF_CALL_5(bpf_sk_getsockopt, struct sock *, sk, int, level,
5642 int, optname, char *, optval, int, optlen)
5643 {
5644 return _bpf_getsockopt(sk, level, optname, optval, optlen);
5645 }
5646
5647 const struct bpf_func_proto bpf_sk_getsockopt_proto = {
5648 .func = bpf_sk_getsockopt,
5649 .gpl_only = false,
5650 .ret_type = RET_INTEGER,
5651 .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
5652 .arg2_type = ARG_ANYTHING,
5653 .arg3_type = ARG_ANYTHING,
5654 .arg4_type = ARG_PTR_TO_UNINIT_MEM,
5655 .arg5_type = ARG_CONST_SIZE,
5656 };
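
/* Illustrative sketch (not from this file): bpf_sk_setsockopt() and
 * bpf_sk_getsockopt() operate on a locked fullsock, e.g. from a
 * bpf_iter/tcp or struct_ops program (the calling context here is an
 * assumption):
 *
 *	int one = 1;
 *
 *	bpf_sk_setsockopt(sk, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
 *	bpf_sk_getsockopt(sk, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
 */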
5657
5658 BPF_CALL_5(bpf_unlocked_sk_setsockopt, struct sock *, sk, int, level,
5659 int, optname, char *, optval, int, optlen)
5660 {
5661 return __bpf_setsockopt(sk, level, optname, optval, optlen);
5662 }
5663
5664 const struct bpf_func_proto bpf_unlocked_sk_setsockopt_proto = {
5665 .func = bpf_unlocked_sk_setsockopt,
5666 .gpl_only = false,
5667 .ret_type = RET_INTEGER,
5668 .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
5669 .arg2_type = ARG_ANYTHING,
5670 .arg3_type = ARG_ANYTHING,
5671 .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
5672 .arg5_type = ARG_CONST_SIZE,
5673 };
5674
5675 BPF_CALL_5(bpf_unlocked_sk_getsockopt, struct sock *, sk, int, level,
5676 int, optname, char *, optval, int, optlen)
5677 {
5678 return __bpf_getsockopt(sk, level, optname, optval, optlen);
5679 }
5680
5681 const struct bpf_func_proto bpf_unlocked_sk_getsockopt_proto = {
5682 .func = bpf_unlocked_sk_getsockopt,
5683 .gpl_only = false,
5684 .ret_type = RET_INTEGER,
5685 .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
5686 .arg2_type = ARG_ANYTHING,
5687 .arg3_type = ARG_ANYTHING,
5688 .arg4_type = ARG_PTR_TO_UNINIT_MEM,
5689 .arg5_type = ARG_CONST_SIZE,
5690 };
5691
5692 BPF_CALL_5(bpf_sock_addr_setsockopt, struct bpf_sock_addr_kern *, ctx,
5693 int, level, int, optname, char *, optval, int, optlen)
5694 {
5695 return _bpf_setsockopt(ctx->sk, level, optname, optval, optlen);
5696 }
5697
5698 static const struct bpf_func_proto bpf_sock_addr_setsockopt_proto = {
5699 .func = bpf_sock_addr_setsockopt,
5700 .gpl_only = false,
5701 .ret_type = RET_INTEGER,
5702 .arg1_type = ARG_PTR_TO_CTX,
5703 .arg2_type = ARG_ANYTHING,
5704 .arg3_type = ARG_ANYTHING,
5705 .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
5706 .arg5_type = ARG_CONST_SIZE,
5707 };
5708
5709 BPF_CALL_5(bpf_sock_addr_getsockopt, struct bpf_sock_addr_kern *, ctx,
5710 int, level, int, optname, char *, optval, int, optlen)
5711 {
5712 return _bpf_getsockopt(ctx->sk, level, optname, optval, optlen);
5713 }
5714
5715 static const struct bpf_func_proto bpf_sock_addr_getsockopt_proto = {
5716 .func = bpf_sock_addr_getsockopt,
5717 .gpl_only = false,
5718 .ret_type = RET_INTEGER,
5719 .arg1_type = ARG_PTR_TO_CTX,
5720 .arg2_type = ARG_ANYTHING,
5721 .arg3_type = ARG_ANYTHING,
5722 .arg4_type = ARG_PTR_TO_UNINIT_MEM,
5723 .arg5_type = ARG_CONST_SIZE,
5724 };
5725
5726 BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
5727 int, level, int, optname, char *, optval, int, optlen)
5728 {
5729 if (!is_locked_tcp_sock_ops(bpf_sock))
5730 return -EOPNOTSUPP;
5731
5732 return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen);
5733 }
5734
5735 static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = {
5736 .func = bpf_sock_ops_setsockopt,
5737 .gpl_only = false,
5738 .ret_type = RET_INTEGER,
5739 .arg1_type = ARG_PTR_TO_CTX,
5740 .arg2_type = ARG_ANYTHING,
5741 .arg3_type = ARG_ANYTHING,
5742 .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
5743 .arg5_type = ARG_CONST_SIZE,
5744 };
5745
5746 static int bpf_sock_ops_get_syn(struct bpf_sock_ops_kern *bpf_sock,
5747 int optname, const u8 **start)
5748 {
5749 struct sk_buff *syn_skb = bpf_sock->syn_skb;
5750 const u8 *hdr_start;
5751 int ret;
5752
5753 if (syn_skb) {
5754 /* sk is a request_sock here */
5755
5756 if (optname == TCP_BPF_SYN) {
5757 hdr_start = syn_skb->data;
5758 ret = tcp_hdrlen(syn_skb);
5759 } else if (optname == TCP_BPF_SYN_IP) {
5760 hdr_start = skb_network_header(syn_skb);
5761 ret = skb_network_header_len(syn_skb) +
5762 tcp_hdrlen(syn_skb);
5763 } else {
5764 /* optname == TCP_BPF_SYN_MAC */
5765 hdr_start = skb_mac_header(syn_skb);
5766 ret = skb_mac_header_len(syn_skb) +
5767 skb_network_header_len(syn_skb) +
5768 tcp_hdrlen(syn_skb);
5769 }
5770 } else {
5771 struct sock *sk = bpf_sock->sk;
5772 struct saved_syn *saved_syn;
5773
5774 if (sk->sk_state == TCP_NEW_SYN_RECV)
5775 /* This is a synack retransmit: bpf_sock->syn_skb
5776 * is not available, so fall back to
5777 * saved_syn (if the SYN was saved).
5778 */
5779 saved_syn = inet_reqsk(sk)->saved_syn;
5780 else
5781 saved_syn = tcp_sk(sk)->saved_syn;
5782
5783 if (!saved_syn)
5784 return -ENOENT;
5785
5786 if (optname == TCP_BPF_SYN) {
5787 hdr_start = saved_syn->data +
5788 saved_syn->mac_hdrlen +
5789 saved_syn->network_hdrlen;
5790 ret = saved_syn->tcp_hdrlen;
5791 } else if (optname == TCP_BPF_SYN_IP) {
5792 hdr_start = saved_syn->data +
5793 saved_syn->mac_hdrlen;
5794 ret = saved_syn->network_hdrlen +
5795 saved_syn->tcp_hdrlen;
5796 } else {
5797 /* optname == TCP_BPF_SYN_MAC */
5798
5799 /* TCP_SAVE_SYN may not have saved the mac hdr */
5800 if (!saved_syn->mac_hdrlen)
5801 return -ENOENT;
5802
5803 hdr_start = saved_syn->data;
5804 ret = saved_syn->mac_hdrlen +
5805 saved_syn->network_hdrlen +
5806 saved_syn->tcp_hdrlen;
5807 }
5808 }
5809
5810 *start = hdr_start;
5811 return ret;
5812 }
5813
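/* Illustrative sketch (not from this file): a sock_ops program can read
 * back the received SYN headers via bpf_getsockopt() with the TCP_BPF_SYN*
 * options handled below, assuming the SYN skb is still around or the SYN
 * was saved with TCP_SAVE_SYN:
 *
 *	char hdrs[128];
 *	int ret;
 *
 *	ret = bpf_getsockopt(skops, SOL_TCP, TCP_BPF_SYN_IP,
 *			     hdrs, sizeof(hdrs));
 *	// ret > 0: total header length, buffer tail zero-padded
 *	// ret == -ENOSPC: headers longer than the buffer, copy truncated
 */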
5814 BPF_CALL_5(bpf_sock_ops_getsockopt, struct bpf_sock_ops_kern *, bpf_sock,
5815 int, level, int, optname, char *, optval, int, optlen)
5816 {
5817 if (!is_locked_tcp_sock_ops(bpf_sock))
5818 return -EOPNOTSUPP;
5819
5820 if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP &&
5821 optname >= TCP_BPF_SYN && optname <= TCP_BPF_SYN_MAC) {
5822 int ret, copy_len = 0;
5823 const u8 *start;
5824
5825 ret = bpf_sock_ops_get_syn(bpf_sock, optname, &start);
5826 if (ret > 0) {
5827 copy_len = ret;
5828 if (optlen < copy_len) {
5829 copy_len = optlen;
5830 ret = -ENOSPC;
5831 }
5832
5833 memcpy(optval, start, copy_len);
5834 }
5835
5836 /* Zero out unused buffer at the end */
5837 memset(optval + copy_len, 0, optlen - copy_len);
5838
5839 return ret;
5840 }
5841
5842 return _bpf_getsockopt(bpf_sock->sk, level, optname, optval, optlen);
5843 }
5844
5845 static const struct bpf_func_proto bpf_sock_ops_getsockopt_proto = {
5846 .func = bpf_sock_ops_getsockopt,
5847 .gpl_only = false,
5848 .ret_type = RET_INTEGER,
5849 .arg1_type = ARG_PTR_TO_CTX,
5850 .arg2_type = ARG_ANYTHING,
5851 .arg3_type = ARG_ANYTHING,
5852 .arg4_type = ARG_PTR_TO_UNINIT_MEM,
5853 .arg5_type = ARG_CONST_SIZE,
5854 };
5855
5856 BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock,
5857 int, argval)
5858 {
5859 struct sock *sk = bpf_sock->sk;
5860 int val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS;
5861
5862 if (!is_locked_tcp_sock_ops(bpf_sock))
5863 return -EOPNOTSUPP;
5864
5865 if (!IS_ENABLED(CONFIG_INET) || !sk_fullsock(sk))
5866 return -EINVAL;
5867
5868 tcp_sk(sk)->bpf_sock_ops_cb_flags = val;
5869
5870 return argval & (~BPF_SOCK_OPS_ALL_CB_FLAGS);
5871 }
5872
5873 static const struct bpf_func_proto bpf_sock_ops_cb_flags_set_proto = {
5874 .func = bpf_sock_ops_cb_flags_set,
5875 .gpl_only = false,
5876 .ret_type = RET_INTEGER,
5877 .arg1_type = ARG_PTR_TO_CTX,
5878 .arg2_type = ARG_ANYTHING,
5879 };
5880
5881 const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly;
5882 EXPORT_SYMBOL_GPL(ipv6_bpf_stub);
5883
5884 BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr,
5885 int, addr_len)
5886 {
5887 #ifdef CONFIG_INET
5888 struct sock *sk = ctx->sk;
5889 u32 flags = BIND_FROM_BPF;
5890 int err;
5891
5892 err = -EINVAL;
5893 if (addr_len < offsetofend(struct sockaddr, sa_family))
5894 return err;
5895 if (addr->sa_family == AF_INET) {
5896 if (addr_len < sizeof(struct sockaddr_in))
5897 return err;
5898 if (((struct sockaddr_in *)addr)->sin_port == htons(0))
5899 flags |= BIND_FORCE_ADDRESS_NO_PORT;
5900 return __inet_bind(sk, addr, addr_len, flags);
5901 #if IS_ENABLED(CONFIG_IPV6)
5902 } else if (addr->sa_family == AF_INET6) {
5903 if (addr_len < SIN6_LEN_RFC2133)
5904 return err;
5905 if (((struct sockaddr_in6 *)addr)->sin6_port == htons(0))
5906 flags |= BIND_FORCE_ADDRESS_NO_PORT;
5907 /* ipv6_bpf_stub cannot be NULL, since it's called from
5908 * bpf_cgroup_inet6_connect hook and ipv6 is already loaded
5909 */
5910 return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, flags);
5911 #endif /* CONFIG_IPV6 */
5912 }
5913 #endif /* CONFIG_INET */
5914
5915 return -EAFNOSUPPORT;
5916 }
5917
5918 static const struct bpf_func_proto bpf_bind_proto = {
5919 .func = bpf_bind,
5920 .gpl_only = false,
5921 .ret_type = RET_INTEGER,
5922 .arg1_type = ARG_PTR_TO_CTX,
5923 .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
5924 .arg3_type = ARG_CONST_SIZE,
5925 };
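
/* Illustrative sketch (not from this file): bpf_bind() is typically used
 * from a cgroup/connect4 or cgroup/connect6 program to pin the source
 * address before the connect proceeds:
 *
 *	struct sockaddr_in addr = {
 *		.sin_family = AF_INET,
 *		.sin_addr.s_addr = bpf_htonl(0x7f000001),	// 127.0.0.1
 *	};
 *
 *	bpf_bind(ctx, (struct sockaddr *)&addr, sizeof(addr));
 *
 * Leaving the port zero sets BIND_FORCE_ADDRESS_NO_PORT, deferring port
 * allocation to connect() (see the flag handling above).
 */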
5926
5927 #ifdef CONFIG_XFRM
5928
5929 #if (IS_BUILTIN(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \
5930 (IS_MODULE(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES))
5931
5932 struct metadata_dst __percpu *xfrm_bpf_md_dst;
5933 EXPORT_SYMBOL_GPL(xfrm_bpf_md_dst);
5934
5935 #endif
5936
5937 BPF_CALL_5(bpf_skb_get_xfrm_state, struct sk_buff *, skb, u32, index,
5938 struct bpf_xfrm_state *, to, u32, size, u64, flags)
5939 {
5940 const struct sec_path *sp = skb_sec_path(skb);
5941 const struct xfrm_state *x;
5942
5943 if (!sp || unlikely(index >= sp->len || flags))
5944 goto err_clear;
5945
5946 x = sp->xvec[index];
5947
5948 if (unlikely(size != sizeof(struct bpf_xfrm_state)))
5949 goto err_clear;
5950
5951 to->reqid = x->props.reqid;
5952 to->spi = x->id.spi;
5953 to->family = x->props.family;
5954 to->ext = 0;
5955
5956 if (to->family == AF_INET6) {
5957 memcpy(to->remote_ipv6, x->props.saddr.a6,
5958 sizeof(to->remote_ipv6));
5959 } else {
5960 to->remote_ipv4 = x->props.saddr.a4;
5961 memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3);
5962 }
5963
5964 return 0;
5965 err_clear:
5966 memset(to, 0, size);
5967 return -EINVAL;
5968 }
5969
5970 static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = {
5971 .func = bpf_skb_get_xfrm_state,
5972 .gpl_only = false,
5973 .ret_type = RET_INTEGER,
5974 .arg1_type = ARG_PTR_TO_CTX,
5975 .arg2_type = ARG_ANYTHING,
5976 .arg3_type = ARG_PTR_TO_UNINIT_MEM,
5977 .arg4_type = ARG_CONST_SIZE,
5978 .arg5_type = ARG_ANYTHING,
5979 };
5980 #endif
5981
5982 #if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6)
5983 static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params, u32 mtu)
5984 {
5985 params->h_vlan_TCI = 0;
5986 params->h_vlan_proto = 0;
5987 if (mtu)
5988 params->mtu_result = mtu; /* union with tot_len */
5989
5990 return 0;
5991 }
5992 #endif
5993
5994 #if IS_ENABLED(CONFIG_INET)
5995 static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
5996 u32 flags, bool check_mtu)
5997 {
5998 struct fib_nh_common *nhc;
5999 struct in_device *in_dev;
6000 struct neighbour *neigh;
6001 struct net_device *dev;
6002 struct fib_result res;
6003 struct flowi4 fl4;
6004 u32 mtu = 0;
6005 int err;
6006
6007 dev = dev_get_by_index_rcu(net, params->ifindex);
6008 if (unlikely(!dev))
6009 return -ENODEV;
6010
6011 /* verify forwarding is enabled on this interface */
6012 in_dev = __in_dev_get_rcu(dev);
6013 if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev)))
6014 return BPF_FIB_LKUP_RET_FWD_DISABLED;
6015
6016 if (flags & BPF_FIB_LOOKUP_OUTPUT) {
6017 fl4.flowi4_iif = 1;
6018 fl4.flowi4_oif = params->ifindex;
6019 } else {
6020 fl4.flowi4_iif = params->ifindex;
6021 fl4.flowi4_oif = 0;
6022 }
6023 fl4.flowi4_tos = params->tos & INET_DSCP_MASK;
6024 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
6025 fl4.flowi4_flags = 0;
6026
6027 fl4.flowi4_proto = params->l4_protocol;
6028 fl4.daddr = params->ipv4_dst;
6029 fl4.saddr = params->ipv4_src;
6030 fl4.fl4_sport = params->sport;
6031 fl4.fl4_dport = params->dport;
6032 fl4.flowi4_multipath_hash = 0;
6033
6034 if (flags & BPF_FIB_LOOKUP_DIRECT) {
6035 u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
6036 struct fib_table *tb;
6037
6038 if (flags & BPF_FIB_LOOKUP_TBID) {
6039 tbid = params->tbid;
6040 /* zero out for vlan output */
6041 params->tbid = 0;
6042 }
6043
6044 tb = fib_get_table(net, tbid);
6045 if (unlikely(!tb))
6046 return BPF_FIB_LKUP_RET_NOT_FWDED;
6047
6048 err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
6049 } else {
6050 if (flags & BPF_FIB_LOOKUP_MARK)
6051 fl4.flowi4_mark = params->mark;
6052 else
6053 fl4.flowi4_mark = 0;
6054 fl4.flowi4_secid = 0;
6055 fl4.flowi4_tun_key.tun_id = 0;
6056 fl4.flowi4_uid = sock_net_uid(net, NULL);
6057
6058 err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF);
6059 }
6060
6061 if (err) {
6062 /* map fib lookup errors to RTN_ type */
6063 if (err == -EINVAL)
6064 return BPF_FIB_LKUP_RET_BLACKHOLE;
6065 if (err == -EHOSTUNREACH)
6066 return BPF_FIB_LKUP_RET_UNREACHABLE;
6067 if (err == -EACCES)
6068 return BPF_FIB_LKUP_RET_PROHIBIT;
6069
6070 return BPF_FIB_LKUP_RET_NOT_FWDED;
6071 }
6072
6073 if (res.type != RTN_UNICAST)
6074 return BPF_FIB_LKUP_RET_NOT_FWDED;
6075
6076 if (fib_info_num_path(res.fi) > 1)
6077 fib_select_path(net, &res, &fl4, NULL);
6078
6079 if (check_mtu) {
6080 mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst);
6081 if (params->tot_len > mtu) {
6082 params->mtu_result = mtu; /* union with tot_len */
6083 return BPF_FIB_LKUP_RET_FRAG_NEEDED;
6084 }
6085 }
6086
6087 nhc = res.nhc;
6088
6089 /* do not handle lwt encaps right now */
6090 if (nhc->nhc_lwtstate)
6091 return BPF_FIB_LKUP_RET_UNSUPP_LWT;
6092
6093 dev = nhc->nhc_dev;
6094
6095 params->rt_metric = res.fi->fib_priority;
6096 params->ifindex = dev->ifindex;
6097
6098 if (flags & BPF_FIB_LOOKUP_SRC)
6099 params->ipv4_src = fib_result_prefsrc(net, &res);
6100
6101 /* xdp and cls_bpf programs are run in RCU-bh so
6102 * rcu_read_lock_bh is not needed here
6103 */
6104 if (likely(nhc->nhc_gw_family != AF_INET6)) {
6105 if (nhc->nhc_gw_family)
6106 params->ipv4_dst = nhc->nhc_gw.ipv4;
6107 } else {
6108 struct in6_addr *dst = (struct in6_addr *)params->ipv6_dst;
6109
6110 params->family = AF_INET6;
6111 *dst = nhc->nhc_gw.ipv6;
6112 }
6113
6114 if (flags & BPF_FIB_LOOKUP_SKIP_NEIGH)
6115 goto set_fwd_params;
6116
6117 if (likely(nhc->nhc_gw_family != AF_INET6))
6118 neigh = __ipv4_neigh_lookup_noref(dev,
6119 (__force u32)params->ipv4_dst);
6120 else
6121 neigh = __ipv6_neigh_lookup_noref_stub(dev, params->ipv6_dst);
6122
6123 if (!neigh || !(READ_ONCE(neigh->nud_state) & NUD_VALID))
6124 return BPF_FIB_LKUP_RET_NO_NEIGH;
6125 memcpy(params->dmac, neigh->ha, ETH_ALEN);
6126 memcpy(params->smac, dev->dev_addr, ETH_ALEN);
6127
6128 set_fwd_params:
6129 return bpf_fib_set_fwd_params(params, mtu);
6130 }
6131 #endif
6132
6133 #if IS_ENABLED(CONFIG_IPV6)
6134 static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
6135 u32 flags, bool check_mtu)
6136 {
6137 struct in6_addr *src = (struct in6_addr *) params->ipv6_src;
6138 struct in6_addr *dst = (struct in6_addr *) params->ipv6_dst;
6139 struct fib6_result res = {};
6140 struct neighbour *neigh;
6141 struct net_device *dev;
6142 struct inet6_dev *idev;
6143 struct flowi6 fl6;
6144 int strict = 0;
6145 int oif, err;
6146 u32 mtu = 0;
6147
6148 /* link local addresses are never forwarded */
6149 if (rt6_need_strict(dst) || rt6_need_strict(src))
6150 return BPF_FIB_LKUP_RET_NOT_FWDED;
6151
6152 dev = dev_get_by_index_rcu(net, params->ifindex);
6153 if (unlikely(!dev))
6154 return -ENODEV;
6155
6156 idev = __in6_dev_get_safely(dev);
6157 if (unlikely(!idev || !READ_ONCE(idev->cnf.forwarding)))
6158 return BPF_FIB_LKUP_RET_FWD_DISABLED;
6159
6160 if (flags & BPF_FIB_LOOKUP_OUTPUT) {
6161 fl6.flowi6_iif = 1;
6162 oif = fl6.flowi6_oif = params->ifindex;
6163 } else {
6164 oif = fl6.flowi6_iif = params->ifindex;
6165 fl6.flowi6_oif = 0;
6166 strict = RT6_LOOKUP_F_HAS_SADDR;
6167 }
6168 fl6.flowlabel = params->flowinfo;
6169 fl6.flowi6_scope = 0;
6170 fl6.flowi6_flags = 0;
6171 fl6.mp_hash = 0;
6172
6173 fl6.flowi6_proto = params->l4_protocol;
6174 fl6.daddr = *dst;
6175 fl6.saddr = *src;
6176 fl6.fl6_sport = params->sport;
6177 fl6.fl6_dport = params->dport;
6178
6179 if (flags & BPF_FIB_LOOKUP_DIRECT) {
6180 u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
6181 struct fib6_table *tb;
6182
6183 if (flags & BPF_FIB_LOOKUP_TBID) {
6184 tbid = params->tbid;
6185 /* zero out for vlan output */
6186 params->tbid = 0;
6187 }
6188
6189 tb = ipv6_stub->fib6_get_table(net, tbid);
6190 if (unlikely(!tb))
6191 return BPF_FIB_LKUP_RET_NOT_FWDED;
6192
6193 err = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, &res,
6194 strict);
6195 } else {
6196 if (flags & BPF_FIB_LOOKUP_MARK)
6197 fl6.flowi6_mark = params->mark;
6198 else
6199 fl6.flowi6_mark = 0;
6200 fl6.flowi6_secid = 0;
6201 fl6.flowi6_tun_key.tun_id = 0;
6202 fl6.flowi6_uid = sock_net_uid(net, NULL);
6203
6204 err = ipv6_stub->fib6_lookup(net, oif, &fl6, &res, strict);
6205 }
6206
6207 if (unlikely(err || IS_ERR_OR_NULL(res.f6i) ||
6208 res.f6i == net->ipv6.fib6_null_entry))
6209 return BPF_FIB_LKUP_RET_NOT_FWDED;
6210
6211 switch (res.fib6_type) {
6212 /* only unicast is forwarded */
6213 case RTN_UNICAST:
6214 break;
6215 case RTN_BLACKHOLE:
6216 return BPF_FIB_LKUP_RET_BLACKHOLE;
6217 case RTN_UNREACHABLE:
6218 return BPF_FIB_LKUP_RET_UNREACHABLE;
6219 case RTN_PROHIBIT:
6220 return BPF_FIB_LKUP_RET_PROHIBIT;
6221 default:
6222 return BPF_FIB_LKUP_RET_NOT_FWDED;
6223 }
6224
6225 ipv6_stub->fib6_select_path(net, &res, &fl6, fl6.flowi6_oif,
6226 fl6.flowi6_oif != 0, NULL, strict);
6227
6228 if (check_mtu) {
6229 mtu = ipv6_stub->ip6_mtu_from_fib6(&res, dst, src);
6230 if (params->tot_len > mtu) {
6231 params->mtu_result = mtu; /* union with tot_len */
6232 return BPF_FIB_LKUP_RET_FRAG_NEEDED;
6233 }
6234 }
6235
6236 if (res.nh->fib_nh_lws)
6237 return BPF_FIB_LKUP_RET_UNSUPP_LWT;
6238
6239 if (res.nh->fib_nh_gw_family)
6240 *dst = res.nh->fib_nh_gw6;
6241
6242 dev = res.nh->fib_nh_dev;
6243 params->rt_metric = res.f6i->fib6_metric;
6244 params->ifindex = dev->ifindex;
6245
6246 if (flags & BPF_FIB_LOOKUP_SRC) {
6247 if (res.f6i->fib6_prefsrc.plen) {
6248 *src = res.f6i->fib6_prefsrc.addr;
6249 } else {
6250 err = ipv6_bpf_stub->ipv6_dev_get_saddr(net, dev,
6251 &fl6.daddr, 0,
6252 src);
6253 if (err)
6254 return BPF_FIB_LKUP_RET_NO_SRC_ADDR;
6255 }
6256 }
6257
6258 if (flags & BPF_FIB_LOOKUP_SKIP_NEIGH)
6259 goto set_fwd_params;
6260
6261 /* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is
6262 * not needed here.
6263 */
6264 neigh = __ipv6_neigh_lookup_noref_stub(dev, dst);
6265 if (!neigh || !(READ_ONCE(neigh->nud_state) & NUD_VALID))
6266 return BPF_FIB_LKUP_RET_NO_NEIGH;
6267 memcpy(params->dmac, neigh->ha, ETH_ALEN);
6268 memcpy(params->smac, dev->dev_addr, ETH_ALEN);
6269
6270 set_fwd_params:
6271 return bpf_fib_set_fwd_params(params, mtu);
6272 }
6273 #endif
6274
6275 #define BPF_FIB_LOOKUP_MASK (BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT | \
6276 BPF_FIB_LOOKUP_SKIP_NEIGH | BPF_FIB_LOOKUP_TBID | \
6277 BPF_FIB_LOOKUP_SRC | BPF_FIB_LOOKUP_MARK)
6278
6279 BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx,
6280 struct bpf_fib_lookup *, params, int, plen, u32, flags)
6281 {
6282 if (plen < sizeof(*params))
6283 return -EINVAL;
6284
6285 if (flags & ~BPF_FIB_LOOKUP_MASK)
6286 return -EINVAL;
6287
6288 switch (params->family) {
6289 #if IS_ENABLED(CONFIG_INET)
6290 case AF_INET:
6291 return bpf_ipv4_fib_lookup(dev_net(ctx->rxq->dev), params,
6292 flags, true);
6293 #endif
6294 #if IS_ENABLED(CONFIG_IPV6)
6295 case AF_INET6:
6296 return bpf_ipv6_fib_lookup(dev_net(ctx->rxq->dev), params,
6297 flags, true);
6298 #endif
6299 }
6300 return -EAFNOSUPPORT;
6301 }
6302
6303 static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = {
6304 .func = bpf_xdp_fib_lookup,
6305 .gpl_only = true,
6306 .ret_type = RET_INTEGER,
6307 .arg1_type = ARG_PTR_TO_CTX,
6308 .arg2_type = ARG_PTR_TO_MEM,
6309 .arg3_type = ARG_CONST_SIZE,
6310 .arg4_type = ARG_ANYTHING,
6311 };
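
/* Illustrative sketch (not from this file, iph is assumed to be parsed
 * from the packet): typical XDP forwarding usage of bpf_fib_lookup():
 *
 *	struct bpf_fib_lookup fib = {};
 *	int rc;
 *
 *	fib.family   = AF_INET;
 *	fib.ifindex  = ctx->ingress_ifindex;
 *	fib.ipv4_src = iph->saddr;
 *	fib.ipv4_dst = iph->daddr;
 *	fib.tot_len  = bpf_ntohs(iph->tot_len);
 *
 *	rc = bpf_fib_lookup(ctx, &fib, sizeof(fib), 0);
 *	if (rc == BPF_FIB_LKUP_RET_SUCCESS) {
 *		// fib.smac/fib.dmac/fib.ifindex now describe the next hop:
 *		// rewrite the Ethernet header and bpf_redirect(fib.ifindex, 0)
 *	}
 */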
6312
6313 BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
6314 struct bpf_fib_lookup *, params, int, plen, u32, flags)
6315 {
6316 struct net *net = dev_net(skb->dev);
6317 int rc = -EAFNOSUPPORT;
6318 bool check_mtu = false;
6319
6320 if (plen < sizeof(*params))
6321 return -EINVAL;
6322
6323 if (flags & ~BPF_FIB_LOOKUP_MASK)
6324 return -EINVAL;
6325
6326 if (params->tot_len)
6327 check_mtu = true;
6328
6329 switch (params->family) {
6330 #if IS_ENABLED(CONFIG_INET)
6331 case AF_INET:
6332 rc = bpf_ipv4_fib_lookup(net, params, flags, check_mtu);
6333 break;
6334 #endif
6335 #if IS_ENABLED(CONFIG_IPV6)
6336 case AF_INET6:
6337 rc = bpf_ipv6_fib_lookup(net, params, flags, check_mtu);
6338 break;
6339 #endif
6340 }
6341
6342 if (rc == BPF_FIB_LKUP_RET_SUCCESS && !check_mtu) {
6343 struct net_device *dev;
6344
6345 /* When tot_len isn't provided by the user, check the skb
6346 * against the MTU of the net_device found by the FIB lookup
6347 */
6348 dev = dev_get_by_index_rcu(net, params->ifindex);
6349 if (!is_skb_forwardable(dev, skb))
6350 rc = BPF_FIB_LKUP_RET_FRAG_NEEDED;
6351
6352 params->mtu_result = dev->mtu; /* union with tot_len */
6353 }
6354
6355 return rc;
6356 }
6357
6358 static const struct bpf_func_proto bpf_skb_fib_lookup_proto = {
6359 .func = bpf_skb_fib_lookup,
6360 .gpl_only = true,
6361 .ret_type = RET_INTEGER,
6362 .arg1_type = ARG_PTR_TO_CTX,
6363 .arg2_type = ARG_PTR_TO_MEM,
6364 .arg3_type = ARG_CONST_SIZE,
6365 .arg4_type = ARG_ANYTHING,
6366 };
6367
6368 static struct net_device *__dev_via_ifindex(struct net_device *dev_curr,
6369 u32 ifindex)
6370 {
6371 struct net *netns = dev_net(dev_curr);
6372
6373 /* Non-redirect use-cases can use ifindex=0 and save ifindex lookup */
6374 if (ifindex == 0)
6375 return dev_curr;
6376
6377 return dev_get_by_index_rcu(netns, ifindex);
6378 }
6379
6380 BPF_CALL_5(bpf_skb_check_mtu, struct sk_buff *, skb,
6381 u32, ifindex, u32 *, mtu_len, s32, len_diff, u64, flags)
6382 {
6383 int ret = BPF_MTU_CHK_RET_FRAG_NEEDED;
6384 struct net_device *dev = skb->dev;
6385 int mtu, dev_len, skb_len;
6386
6387 if (unlikely(flags & ~(BPF_MTU_CHK_SEGS)))
6388 return -EINVAL;
6389 if (unlikely(flags & BPF_MTU_CHK_SEGS && (len_diff || *mtu_len)))
6390 return -EINVAL;
6391
6392 dev = __dev_via_ifindex(dev, ifindex);
6393 if (unlikely(!dev))
6394 return -ENODEV;
6395
6396 mtu = READ_ONCE(dev->mtu);
6397 dev_len = mtu + dev->hard_header_len;
6398
6399 /* If set, use *mtu_len as the input L3 length (like iph->tot_len in fib_lookup) */
6400 skb_len = *mtu_len ? *mtu_len + dev->hard_header_len : skb->len;
6401
6402 skb_len += len_diff; /* minus result pass check */
6403 if (skb_len <= dev_len) {
6404 ret = BPF_MTU_CHK_RET_SUCCESS;
6405 goto out;
6406 }
6407 /* At this point, skb->len exceeds the MTU, but as it includes the length
6408 * of all segments, each segment can still be below the MTU. The SKB can
6409 * possibly get re-segmented in the transmit path (see validate_xmit_skb).
6410 * Thus, the user must choose whether the segs are to be MTU checked.
6411 */
6412 if (skb_is_gso(skb)) {
6413 ret = BPF_MTU_CHK_RET_SUCCESS;
6414 if (flags & BPF_MTU_CHK_SEGS &&
6415 !skb_gso_validate_network_len(skb, mtu))
6416 ret = BPF_MTU_CHK_RET_SEGS_TOOBIG;
6417 }
6418 out:
6419 *mtu_len = mtu;
6420 return ret;
6421 }
6422
6423 BPF_CALL_5(bpf_xdp_check_mtu, struct xdp_buff *, xdp,
6424 u32, ifindex, u32 *, mtu_len, s32, len_diff, u64, flags)
6425 {
6426 struct net_device *dev = xdp->rxq->dev;
6427 int xdp_len = xdp->data_end - xdp->data;
6428 int ret = BPF_MTU_CHK_RET_SUCCESS;
6429 int mtu, dev_len;
6430
6431 /* XDP variant doesn't support multi-buffer segment check (yet) */
6432 if (unlikely(flags))
6433 return -EINVAL;
6434
6435 dev = __dev_via_ifindex(dev, ifindex);
6436 if (unlikely(!dev))
6437 return -ENODEV;
6438
6439 mtu = READ_ONCE(dev->mtu);
6440 dev_len = mtu + dev->hard_header_len;
6441
6442 /* Use *mtu_len as input, L3 as iph->tot_len (like fib_lookup) */
6443 if (*mtu_len)
6444 xdp_len = *mtu_len + dev->hard_header_len;
6445
6446 xdp_len += len_diff; /* minus result pass check */
6447 if (xdp_len > dev_len)
6448 ret = BPF_MTU_CHK_RET_FRAG_NEEDED;
6449
6450 *mtu_len = mtu;
6451 return ret;
6452 }
6453
6454 static const struct bpf_func_proto bpf_skb_check_mtu_proto = {
6455 .func = bpf_skb_check_mtu,
6456 .gpl_only = true,
6457 .ret_type = RET_INTEGER,
6458 .arg1_type = ARG_PTR_TO_CTX,
6459 .arg2_type = ARG_ANYTHING,
6460 .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_WRITE | MEM_ALIGNED,
6461 .arg3_size = sizeof(u32),
6462 .arg4_type = ARG_ANYTHING,
6463 .arg5_type = ARG_ANYTHING,
6464 };
6465
6466 static const struct bpf_func_proto bpf_xdp_check_mtu_proto = {
6467 .func = bpf_xdp_check_mtu,
6468 .gpl_only = true,
6469 .ret_type = RET_INTEGER,
6470 .arg1_type = ARG_PTR_TO_CTX,
6471 .arg2_type = ARG_ANYTHING,
6472 .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_WRITE | MEM_ALIGNED,
6473 .arg3_size = sizeof(u32),
6474 .arg4_type = ARG_ANYTHING,
6475 .arg5_type = ARG_ANYTHING,
6476 };
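
/* Illustrative sketch (not from this file, encap_len is an assumed
 * value): bpf_check_mtu() with *mtu_len == 0 checks the current packet
 * length against the device MTU; a positive len_diff accounts for bytes
 * the program is about to add:
 *
 *	__u32 mtu_len = 0;
 *
 *	if (bpf_check_mtu(ctx, 0, &mtu_len, encap_len, 0))
 *		return XDP_DROP;	// or TC_ACT_SHOT for skb programs
 *	// on return, mtu_len holds the device MTU
 */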
6477
6478 #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
6479 static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len)
6480 {
6481 int err;
6482 struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)hdr;
6483
6484 if (!seg6_validate_srh(srh, len, false))
6485 return -EINVAL;
6486
6487 switch (type) {
6488 case BPF_LWT_ENCAP_SEG6_INLINE:
6489 if (skb->protocol != htons(ETH_P_IPV6))
6490 return -EBADMSG;
6491
6492 err = seg6_do_srh_inline(skb, srh);
6493 break;
6494 case BPF_LWT_ENCAP_SEG6:
6495 skb_reset_inner_headers(skb);
6496 skb->encapsulation = 1;
6497 err = seg6_do_srh_encap(skb, srh, IPPROTO_IPV6);
6498 break;
6499 default:
6500 return -EINVAL;
6501 }
6502
6503 bpf_compute_data_pointers(skb);
6504 if (err)
6505 return err;
6506
6507 skb_set_transport_header(skb, sizeof(struct ipv6hdr));
6508
6509 return seg6_lookup_nexthop(skb, NULL, 0);
6510 }
6511 #endif /* CONFIG_IPV6_SEG6_BPF */
6512
6513 #if IS_ENABLED(CONFIG_LWTUNNEL_BPF)
6514 static int bpf_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len,
6515 bool ingress)
6516 {
6517 return bpf_lwt_push_ip_encap(skb, hdr, len, ingress);
6518 }
6519 #endif
6520
6521 BPF_CALL_4(bpf_lwt_in_push_encap, struct sk_buff *, skb, u32, type, void *, hdr,
6522 u32, len)
6523 {
6524 switch (type) {
6525 #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
6526 case BPF_LWT_ENCAP_SEG6:
6527 case BPF_LWT_ENCAP_SEG6_INLINE:
6528 return bpf_push_seg6_encap(skb, type, hdr, len);
6529 #endif
6530 #if IS_ENABLED(CONFIG_LWTUNNEL_BPF)
6531 case BPF_LWT_ENCAP_IP:
6532 return bpf_push_ip_encap(skb, hdr, len, true /* ingress */);
6533 #endif
6534 default:
6535 return -EINVAL;
6536 }
6537 }
6538
6539 BPF_CALL_4(bpf_lwt_xmit_push_encap, struct sk_buff *, skb, u32, type,
6540 void *, hdr, u32, len)
6541 {
6542 switch (type) {
6543 #if IS_ENABLED(CONFIG_LWTUNNEL_BPF)
6544 case BPF_LWT_ENCAP_IP:
6545 return bpf_push_ip_encap(skb, hdr, len, false /* egress */);
6546 #endif
6547 default:
6548 return -EINVAL;
6549 }
6550 }
6551
6552 static const struct bpf_func_proto bpf_lwt_in_push_encap_proto = {
6553 .func = bpf_lwt_in_push_encap,
6554 .gpl_only = false,
6555 .ret_type = RET_INTEGER,
6556 .arg1_type = ARG_PTR_TO_CTX,
6557 .arg2_type = ARG_ANYTHING,
6558 .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
6559 .arg4_type = ARG_CONST_SIZE
6560 };
6561
6562 static const struct bpf_func_proto bpf_lwt_xmit_push_encap_proto = {
6563 .func = bpf_lwt_xmit_push_encap,
6564 .gpl_only = false,
6565 .ret_type = RET_INTEGER,
6566 .arg1_type = ARG_PTR_TO_CTX,
6567 .arg2_type = ARG_ANYTHING,
6568 .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
6569 .arg4_type = ARG_CONST_SIZE
6570 };
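
/* Illustrative sketch (not from this file, srh/srh_len are assumed to be
 * a pre-built struct ipv6_sr_hdr and its length): a lwt_in program can
 * push an SRv6 encapsulation with:
 *
 *	err = bpf_lwt_push_encap(skb, BPF_LWT_ENCAP_SEG6, srh, srh_len);
 *
 * The SEG6 encap types are only accepted by the lwt in flavour above,
 * while BPF_LWT_ENCAP_IP is accepted by both, mapping to the ingress or
 * egress variant of bpf_push_ip_encap().
 */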
6571
6572 #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
6573 BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset,
6574 const void *, from, u32, len)
6575 {
6576 struct seg6_bpf_srh_state *srh_state =
6577 this_cpu_ptr(&seg6_bpf_srh_states);
6578 struct ipv6_sr_hdr *srh = srh_state->srh;
6579 void *srh_tlvs, *srh_end, *ptr;
6580 int srhoff = 0;
6581
6582 lockdep_assert_held(&srh_state->bh_lock);
6583 if (srh == NULL)
6584 return -EINVAL;
6585
6586 srh_tlvs = (void *)((char *)srh + ((srh->first_segment + 1) << 4));
6587 srh_end = (void *)((char *)srh + sizeof(*srh) + srh_state->hdrlen);
6588
6589 ptr = skb->data + offset;
6590 if (ptr >= srh_tlvs && ptr + len <= srh_end)
6591 srh_state->valid = false;
6592 else if (ptr < (void *)&srh->flags ||
6593 ptr + len > (void *)&srh->segments)
6594 return -EFAULT;
6595
6596 if (unlikely(bpf_try_make_writable(skb, offset + len)))
6597 return -EFAULT;
6598 if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
6599 return -EINVAL;
6600 srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
6601
6602 memcpy(skb->data + offset, from, len);
6603 return 0;
6604 }
6605
6606 static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = {
6607 .func = bpf_lwt_seg6_store_bytes,
6608 .gpl_only = false,
6609 .ret_type = RET_INTEGER,
6610 .arg1_type = ARG_PTR_TO_CTX,
6611 .arg2_type = ARG_ANYTHING,
6612 .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
6613 .arg4_type = ARG_CONST_SIZE
6614 };
6615
6616 static void bpf_update_srh_state(struct sk_buff *skb)
6617 {
6618 struct seg6_bpf_srh_state *srh_state =
6619 this_cpu_ptr(&seg6_bpf_srh_states);
6620 int srhoff = 0;
6621
6622 if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) {
6623 srh_state->srh = NULL;
6624 } else {
6625 srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
6626 srh_state->hdrlen = srh_state->srh->hdrlen << 3;
6627 srh_state->valid = true;
6628 }
6629 }
6630
6631 BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb,
6632 u32, action, void *, param, u32, param_len)
6633 {
6634 struct seg6_bpf_srh_state *srh_state =
6635 this_cpu_ptr(&seg6_bpf_srh_states);
6636 int hdroff = 0;
6637 int err;
6638
6639 lockdep_assert_held(&srh_state->bh_lock);
6640 switch (action) {
6641 case SEG6_LOCAL_ACTION_END_X:
6642 if (!seg6_bpf_has_valid_srh(skb))
6643 return -EBADMSG;
6644 if (param_len != sizeof(struct in6_addr))
6645 return -EINVAL;
6646 return seg6_lookup_nexthop(skb, (struct in6_addr *)param, 0);
6647 case SEG6_LOCAL_ACTION_END_T:
6648 if (!seg6_bpf_has_valid_srh(skb))
6649 return -EBADMSG;
6650 if (param_len != sizeof(int))
6651 return -EINVAL;
6652 return seg6_lookup_nexthop(skb, NULL, *(int *)param);
6653 case SEG6_LOCAL_ACTION_END_DT6:
6654 if (!seg6_bpf_has_valid_srh(skb))
6655 return -EBADMSG;
6656 if (param_len != sizeof(int))
6657 return -EINVAL;
6658
6659 if (ipv6_find_hdr(skb, &hdroff, IPPROTO_IPV6, NULL, NULL) < 0)
6660 return -EBADMSG;
6661 if (!pskb_pull(skb, hdroff))
6662 return -EBADMSG;
6663
6664 skb_postpull_rcsum(skb, skb_network_header(skb), hdroff);
6665 skb_reset_network_header(skb);
6666 skb_reset_transport_header(skb);
6667 skb->encapsulation = 0;
6668
6669 bpf_compute_data_pointers(skb);
6670 bpf_update_srh_state(skb);
6671 return seg6_lookup_nexthop(skb, NULL, *(int *)param);
6672 case SEG6_LOCAL_ACTION_END_B6:
6673 if (srh_state->srh && !seg6_bpf_has_valid_srh(skb))
6674 return -EBADMSG;
6675 err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6_INLINE,
6676 param, param_len);
6677 if (!err)
6678 bpf_update_srh_state(skb);
6679
6680 return err;
6681 case SEG6_LOCAL_ACTION_END_B6_ENCAP:
6682 if (srh_state->srh && !seg6_bpf_has_valid_srh(skb))
6683 return -EBADMSG;
6684 err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6,
6685 param, param_len);
6686 if (!err)
6687 bpf_update_srh_state(skb);
6688
6689 return err;
6690 default:
6691 return -EINVAL;
6692 }
6693 }
6694
6695 static const struct bpf_func_proto bpf_lwt_seg6_action_proto = {
6696 .func = bpf_lwt_seg6_action,
6697 .gpl_only = false,
6698 .ret_type = RET_INTEGER,
6699 .arg1_type = ARG_PTR_TO_CTX,
6700 .arg2_type = ARG_ANYTHING,
6701 .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
6702 .arg4_type = ARG_CONST_SIZE
6703 };
6704
6705 BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset,
6706 s32, len)
6707 {
6708 struct seg6_bpf_srh_state *srh_state =
6709 this_cpu_ptr(&seg6_bpf_srh_states);
6710 struct ipv6_sr_hdr *srh = srh_state->srh;
6711 void *srh_end, *srh_tlvs, *ptr;
6712 struct ipv6hdr *hdr;
6713 int srhoff = 0;
6714 int ret;
6715
6716 lockdep_assert_held(&srh_state->bh_lock);
6717 if (unlikely(srh == NULL))
6718 return -EINVAL;
6719
6720 srh_tlvs = (void *)((unsigned char *)srh + sizeof(*srh) +
6721 ((srh->first_segment + 1) << 4));
6722 srh_end = (void *)((unsigned char *)srh + sizeof(*srh) +
6723 srh_state->hdrlen);
6724 ptr = skb->data + offset;
6725
6726 if (unlikely(ptr < srh_tlvs || ptr > srh_end))
6727 return -EFAULT;
6728 if (unlikely(len < 0 && (void *)((char *)ptr - len) > srh_end))
6729 return -EFAULT;
6730
6731 if (len > 0) {
6732 ret = skb_cow_head(skb, len);
6733 if (unlikely(ret < 0))
6734 return ret;
6735
6736 ret = bpf_skb_net_hdr_push(skb, offset, len);
6737 } else {
6738 ret = bpf_skb_net_hdr_pop(skb, offset, -1 * len);
6739 }
6740
6741 bpf_compute_data_pointers(skb);
6742 if (unlikely(ret < 0))
6743 return ret;
6744
6745 hdr = (struct ipv6hdr *)skb->data;
6746 hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
6747
6748 if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
6749 return -EINVAL;
6750 srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
6751 srh_state->hdrlen += len;
6752 srh_state->valid = false;
6753 return 0;
6754 }
6755
6756 static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = {
6757 .func = bpf_lwt_seg6_adjust_srh,
6758 .gpl_only = false,
6759 .ret_type = RET_INTEGER,
6760 .arg1_type = ARG_PTR_TO_CTX,
6761 .arg2_type = ARG_ANYTHING,
6762 .arg3_type = ARG_ANYTHING,
6763 };
6764 #endif /* CONFIG_IPV6_SEG6_BPF */
6765
6766 #ifdef CONFIG_INET
6767 static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
6768 int dif, int sdif, u8 family, u8 proto)
6769 {
6770 struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo;
6771 bool refcounted = false;
6772 struct sock *sk = NULL;
6773
6774 if (family == AF_INET) {
6775 __be32 src4 = tuple->ipv4.saddr;
6776 __be32 dst4 = tuple->ipv4.daddr;
6777
6778 if (proto == IPPROTO_TCP)
6779 sk = __inet_lookup(net, hinfo, NULL, 0,
6780 src4, tuple->ipv4.sport,
6781 dst4, tuple->ipv4.dport,
6782 dif, sdif, &refcounted);
6783 else
6784 sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport,
6785 dst4, tuple->ipv4.dport,
6786 dif, sdif, net->ipv4.udp_table, NULL);
6787 #if IS_ENABLED(CONFIG_IPV6)
6788 } else {
6789 struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr;
6790 struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr;
6791
6792 if (proto == IPPROTO_TCP)
6793 sk = __inet6_lookup(net, hinfo, NULL, 0,
6794 src6, tuple->ipv6.sport,
6795 dst6, ntohs(tuple->ipv6.dport),
6796 dif, sdif, &refcounted);
6797 else if (likely(ipv6_bpf_stub))
6798 sk = ipv6_bpf_stub->udp6_lib_lookup(net,
6799 src6, tuple->ipv6.sport,
6800 dst6, tuple->ipv6.dport,
6801 dif, sdif,
6802 net->ipv4.udp_table, NULL);
6803 #endif
6804 }
6805
6806 if (unlikely(sk && !refcounted && !sock_flag(sk, SOCK_RCU_FREE))) {
6807 WARN_ONCE(1, "Found non-RCU, unreferenced socket!");
6808 sk = NULL;
6809 }
6810 return sk;
6811 }
6812
6813 /* bpf_skc_lookup performs the core lookup for different types of sockets,
6814 * taking a reference on the socket if it doesn't have the flag SOCK_RCU_FREE.
6815 */
6816 static struct sock *
6817 __bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
6818 struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id,
6819 u64 flags, int sdif)
6820 {
6821 struct sock *sk = NULL;
6822 struct net *net;
6823 u8 family;
6824
6825 if (len == sizeof(tuple->ipv4))
6826 family = AF_INET;
6827 else if (len == sizeof(tuple->ipv6))
6828 family = AF_INET6;
6829 else
6830 return NULL;
6831
6832 if (unlikely(flags || !((s32)netns_id < 0 || netns_id <= S32_MAX)))
6833 goto out;
6834
6835 if (sdif < 0) {
6836 if (family == AF_INET)
6837 sdif = inet_sdif(skb);
6838 else
6839 sdif = inet6_sdif(skb);
6840 }
6841
6842 if ((s32)netns_id < 0) {
6843 net = caller_net;
6844 sk = sk_lookup(net, tuple, ifindex, sdif, family, proto);
6845 } else {
6846 net = get_net_ns_by_id(caller_net, netns_id);
6847 if (unlikely(!net))
6848 goto out;
6849 sk = sk_lookup(net, tuple, ifindex, sdif, family, proto);
6850 put_net(net);
6851 }
6852
6853 out:
6854 return sk;
6855 }
6856
6857 static struct sock *
6858 __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
6859 struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id,
6860 u64 flags, int sdif)
6861 {
6862 struct sock *sk = __bpf_skc_lookup(skb, tuple, len, caller_net,
6863 ifindex, proto, netns_id, flags,
6864 sdif);
6865
6866 if (sk) {
6867 struct sock *sk2 = sk_to_full_sk(sk);
6868
6869 /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk
6870 * sock refcnt is decremented to prevent a request_sock leak.
6871 */
6872 if (sk2 != sk) {
6873 sock_gen_put(sk);
6874 /* Ensure there is no need to bump sk2 refcnt */
6875 if (unlikely(sk2 && !sock_flag(sk2, SOCK_RCU_FREE))) {
6876 WARN_ONCE(1, "Found non-RCU, unreferenced socket!");
6877 return NULL;
6878 }
6879 sk = sk2;
6880 }
6881 }
6882
6883 return sk;
6884 }
6885
6886 static struct sock *
6887 bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
6888 u8 proto, u64 netns_id, u64 flags)
6889 {
6890 struct net *caller_net;
6891 int ifindex;
6892
6893 if (skb->dev) {
6894 caller_net = dev_net(skb->dev);
6895 ifindex = skb->dev->ifindex;
6896 } else {
6897 caller_net = sock_net(skb->sk);
6898 ifindex = 0;
6899 }
6900
6901 return __bpf_skc_lookup(skb, tuple, len, caller_net, ifindex, proto,
6902 netns_id, flags, -1);
6903 }
6904
6905 static struct sock *
6906 bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
6907 u8 proto, u64 netns_id, u64 flags)
6908 {
6909 struct sock *sk = bpf_skc_lookup(skb, tuple, len, proto, netns_id,
6910 flags);
6911
6912 if (sk) {
6913 struct sock *sk2 = sk_to_full_sk(sk);
6914
6915 /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk
6916 * sock refcnt is decremented to prevent a request_sock leak.
6917 */
6918 if (sk2 != sk) {
6919 sock_gen_put(sk);
6920 /* Ensure there is no need to bump sk2 refcnt */
6921 if (unlikely(sk2 && !sock_flag(sk2, SOCK_RCU_FREE))) {
6922 WARN_ONCE(1, "Found non-RCU, unreferenced socket!");
6923 return NULL;
6924 }
6925 sk = sk2;
6926 }
6927 }
6928
6929 return sk;
6930 }
6931
6932 BPF_CALL_5(bpf_skc_lookup_tcp, struct sk_buff *, skb,
6933 struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
6934 {
6935 return (unsigned long)bpf_skc_lookup(skb, tuple, len, IPPROTO_TCP,
6936 netns_id, flags);
6937 }
6938
6939 static const struct bpf_func_proto bpf_skc_lookup_tcp_proto = {
6940 .func = bpf_skc_lookup_tcp,
6941 .gpl_only = false,
6942 .pkt_access = true,
6943 .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL,
6944 .arg1_type = ARG_PTR_TO_CTX,
6945 .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
6946 .arg3_type = ARG_CONST_SIZE_OR_ZERO,
6947 .arg4_type = ARG_ANYTHING,
6948 .arg5_type = ARG_ANYTHING,
6949 };
6950
6951 BPF_CALL_5(bpf_sk_lookup_tcp, struct sk_buff *, skb,
6952 struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
6953 {
6954 return (unsigned long)bpf_sk_lookup(skb, tuple, len, IPPROTO_TCP,
6955 netns_id, flags);
6956 }
6957
6958 static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = {
6959 .func = bpf_sk_lookup_tcp,
6960 .gpl_only = false,
6961 .pkt_access = true,
6962 .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
6963 .arg1_type = ARG_PTR_TO_CTX,
6964 .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
6965 .arg3_type = ARG_CONST_SIZE_OR_ZERO,
6966 .arg4_type = ARG_ANYTHING,
6967 .arg5_type = ARG_ANYTHING,
6968 };
6969
6970 BPF_CALL_5(bpf_sk_lookup_udp, struct sk_buff *, skb,
6971 struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
6972 {
6973 return (unsigned long)bpf_sk_lookup(skb, tuple, len, IPPROTO_UDP,
6974 netns_id, flags);
6975 }
6976
6977 static const struct bpf_func_proto bpf_sk_lookup_udp_proto = {
6978 .func = bpf_sk_lookup_udp,
6979 .gpl_only = false,
6980 .pkt_access = true,
6981 .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
6982 .arg1_type = ARG_PTR_TO_CTX,
6983 .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
6984 .arg3_type = ARG_CONST_SIZE_OR_ZERO,
6985 .arg4_type = ARG_ANYTHING,
6986 .arg5_type = ARG_ANYTHING,
6987 };
6988
6989 BPF_CALL_5(bpf_tc_skc_lookup_tcp, struct sk_buff *, skb,
6990 struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
6991 {
6992 struct net_device *dev = skb->dev;
6993 int ifindex = dev->ifindex, sdif = dev_sdif(dev);
6994 struct net *caller_net = dev_net(dev);
6995
6996 return (unsigned long)__bpf_skc_lookup(skb, tuple, len, caller_net,
6997 ifindex, IPPROTO_TCP, netns_id,
6998 flags, sdif);
6999 }
7000
7001 static const struct bpf_func_proto bpf_tc_skc_lookup_tcp_proto = {
7002 .func = bpf_tc_skc_lookup_tcp,
7003 .gpl_only = false,
7004 .pkt_access = true,
7005 .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL,
7006 .arg1_type = ARG_PTR_TO_CTX,
7007 .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
7008 .arg3_type = ARG_CONST_SIZE_OR_ZERO,
7009 .arg4_type = ARG_ANYTHING,
7010 .arg5_type = ARG_ANYTHING,
7011 };
7012
7013 BPF_CALL_5(bpf_tc_sk_lookup_tcp, struct sk_buff *, skb,
7014 struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
7015 {
7016 struct net_device *dev = skb->dev;
7017 int ifindex = dev->ifindex, sdif = dev_sdif(dev);
7018 struct net *caller_net = dev_net(dev);
7019
7020 return (unsigned long)__bpf_sk_lookup(skb, tuple, len, caller_net,
7021 ifindex, IPPROTO_TCP, netns_id,
7022 flags, sdif);
7023 }
7024
7025 static const struct bpf_func_proto bpf_tc_sk_lookup_tcp_proto = {
7026 .func = bpf_tc_sk_lookup_tcp,
7027 .gpl_only = false,
7028 .pkt_access = true,
7029 .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
7030 .arg1_type = ARG_PTR_TO_CTX,
7031 .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
7032 .arg3_type = ARG_CONST_SIZE_OR_ZERO,
7033 .arg4_type = ARG_ANYTHING,
7034 .arg5_type = ARG_ANYTHING,
7035 };
7036
7037 BPF_CALL_5(bpf_tc_sk_lookup_udp, struct sk_buff *, skb,
7038 struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
7039 {
7040 struct net_device *dev = skb->dev;
7041 int ifindex = dev->ifindex, sdif = dev_sdif(dev);
7042 struct net *caller_net = dev_net(dev);
7043
7044 return (unsigned long)__bpf_sk_lookup(skb, tuple, len, caller_net,
7045 ifindex, IPPROTO_UDP, netns_id,
7046 flags, sdif);
7047 }
7048
7049 static const struct bpf_func_proto bpf_tc_sk_lookup_udp_proto = {
7050 .func = bpf_tc_sk_lookup_udp,
7051 .gpl_only = false,
7052 .pkt_access = true,
7053 .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
7054 .arg1_type = ARG_PTR_TO_CTX,
7055 .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
7056 .arg3_type = ARG_CONST_SIZE_OR_ZERO,
7057 .arg4_type = ARG_ANYTHING,
7058 .arg5_type = ARG_ANYTHING,
7059 };
7060
7061 BPF_CALL_1(bpf_sk_release, struct sock *, sk)
7062 {
7063 if (sk && sk_is_refcounted(sk))
7064 sock_gen_put(sk);
7065 return 0;
7066 }
7067
7068 static const struct bpf_func_proto bpf_sk_release_proto = {
7069 .func = bpf_sk_release,
7070 .gpl_only = false,
7071 .ret_type = RET_INTEGER,
7072 .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON | OBJ_RELEASE,
7073 };
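
/* Illustrative sketch (not from this file): a reference acquired by a
 * socket lookup must always be handed back with bpf_sk_release(), or the
 * verifier rejects the program:
 *
 *	struct bpf_sock_tuple tuple = {};	// filled from the packet
 *	struct bpf_sock *sk;
 *
 *	sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4),
 *			       BPF_F_CURRENT_NETNS, 0);
 *	if (sk) {
 *		// inspect sk->state, sk->mark, ...
 *		bpf_sk_release(sk);
 *	}
 */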
7074
7075 BPF_CALL_5(bpf_xdp_sk_lookup_udp, struct xdp_buff *, ctx,
7076 struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
7077 {
7078 struct net_device *dev = ctx->rxq->dev;
7079 int ifindex = dev->ifindex, sdif = dev_sdif(dev);
7080 struct net *caller_net = dev_net(dev);
7081
7082 return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, caller_net,
7083 ifindex, IPPROTO_UDP, netns_id,
7084 flags, sdif);
7085 }
7086
7087 static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = {
7088 .func = bpf_xdp_sk_lookup_udp,
7089 .gpl_only = false,
7090 .pkt_access = true,
7091 .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
7092 .arg1_type = ARG_PTR_TO_CTX,
7093 .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
7094 .arg3_type = ARG_CONST_SIZE_OR_ZERO,
7095 .arg4_type = ARG_ANYTHING,
7096 .arg5_type = ARG_ANYTHING,
7097 };
7098
7099 BPF_CALL_5(bpf_xdp_skc_lookup_tcp, struct xdp_buff *, ctx,
7100 struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
7101 {
7102 struct net_device *dev = ctx->rxq->dev;
7103 int ifindex = dev->ifindex, sdif = dev_sdif(dev);
7104 struct net *caller_net = dev_net(dev);
7105
7106 return (unsigned long)__bpf_skc_lookup(NULL, tuple, len, caller_net,
7107 ifindex, IPPROTO_TCP, netns_id,
7108 flags, sdif);
7109 }
7110
7111 static const struct bpf_func_proto bpf_xdp_skc_lookup_tcp_proto = {
7112 .func = bpf_xdp_skc_lookup_tcp,
7113 .gpl_only = false,
7114 .pkt_access = true,
7115 .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL,
7116 .arg1_type = ARG_PTR_TO_CTX,
7117 .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
7118 .arg3_type = ARG_CONST_SIZE_OR_ZERO,
7119 .arg4_type = ARG_ANYTHING,
7120 .arg5_type = ARG_ANYTHING,
7121 };
7122
7123 BPF_CALL_5(bpf_xdp_sk_lookup_tcp, struct xdp_buff *, ctx,
7124 struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
7125 {
7126 struct net_device *dev = ctx->rxq->dev;
7127 int ifindex = dev->ifindex, sdif = dev_sdif(dev);
7128 struct net *caller_net = dev_net(dev);
7129
7130 return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, caller_net,
7131 ifindex, IPPROTO_TCP, netns_id,
7132 flags, sdif);
7133 }
7134
7135 static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = {
7136 .func = bpf_xdp_sk_lookup_tcp,
7137 .gpl_only = false,
7138 .pkt_access = true,
7139 .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
7140 .arg1_type = ARG_PTR_TO_CTX,
7141 .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
7142 .arg3_type = ARG_CONST_SIZE_OR_ZERO,
7143 .arg4_type = ARG_ANYTHING,
7144 .arg5_type = ARG_ANYTHING,
7145 };
7146
7147 BPF_CALL_5(bpf_sock_addr_skc_lookup_tcp, struct bpf_sock_addr_kern *, ctx,
7148 struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
7149 {
7150 return (unsigned long)__bpf_skc_lookup(NULL, tuple, len,
7151 sock_net(ctx->sk), 0,
7152 IPPROTO_TCP, netns_id, flags,
7153 -1);
7154 }
7155
7156 static const struct bpf_func_proto bpf_sock_addr_skc_lookup_tcp_proto = {
7157 .func = bpf_sock_addr_skc_lookup_tcp,
7158 .gpl_only = false,
7159 .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL,
7160 .arg1_type = ARG_PTR_TO_CTX,
7161 .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
7162 .arg3_type = ARG_CONST_SIZE_OR_ZERO,
7163 .arg4_type = ARG_ANYTHING,
7164 .arg5_type = ARG_ANYTHING,
7165 };
7166
7167 BPF_CALL_5(bpf_sock_addr_sk_lookup_tcp, struct bpf_sock_addr_kern *, ctx,
7168 struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
7169 {
7170 return (unsigned long)__bpf_sk_lookup(NULL, tuple, len,
7171 sock_net(ctx->sk), 0, IPPROTO_TCP,
7172 netns_id, flags, -1);
7173 }
7174
7175 static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = {
7176 .func = bpf_sock_addr_sk_lookup_tcp,
7177 .gpl_only = false,
7178 .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
7179 .arg1_type = ARG_PTR_TO_CTX,
7180 .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
7181 .arg3_type = ARG_CONST_SIZE_OR_ZERO,
7182 .arg4_type = ARG_ANYTHING,
7183 .arg5_type = ARG_ANYTHING,
7184 };
7185
7186 BPF_CALL_5(bpf_sock_addr_sk_lookup_udp, struct bpf_sock_addr_kern *, ctx,
7187 struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
7188 {
7189 return (unsigned long)__bpf_sk_lookup(NULL, tuple, len,
7190 sock_net(ctx->sk), 0, IPPROTO_UDP,
7191 netns_id, flags, -1);
7192 }
7193
7194 static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = {
7195 .func = bpf_sock_addr_sk_lookup_udp,
7196 .gpl_only = false,
7197 .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
7198 .arg1_type = ARG_PTR_TO_CTX,
7199 .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
7200 .arg3_type = ARG_CONST_SIZE_OR_ZERO,
7201 .arg4_type = ARG_ANYTHING,
7202 .arg5_type = ARG_ANYTHING,
7203 };
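/* Usage note (illustrative sketch, not part of this file): from a cgroup
 * sock_addr program, the UAPI helpers backed by the protos above are
 * bpf_sk_lookup_tcp()/bpf_sk_lookup_udp()/bpf_skc_lookup_tcp(). Any
 * non-NULL socket they return holds a reference that the verifier
 * requires to be dropped with bpf_sk_release(), e.g.:
 *
 *	struct bpf_sock_tuple tuple = { .ipv4 = { ... } };
 *	struct bpf_sock *sk;
 *
 *	sk = bpf_sk_lookup_tcp(ctx, &tuple, sizeof(tuple.ipv4),
 *			       BPF_F_CURRENT_NETNS, 0);
 *	if (sk)
 *		bpf_sk_release(sk);
 */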
7204
7205 bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
7206 struct bpf_insn_access_aux *info)
7207 {
7208 if (off < 0 || off >= offsetofend(struct bpf_tcp_sock,
7209 icsk_retransmits))
7210 return false;
7211
7212 if (off % size != 0)
7213 return false;
7214
7215 switch (off) {
7216 case offsetof(struct bpf_tcp_sock, bytes_received):
7217 case offsetof(struct bpf_tcp_sock, bytes_acked):
7218 return size == sizeof(__u64);
7219 default:
7220 return size == sizeof(__u32);
7221 }
7222 }
7223
7224 u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
7225 const struct bpf_insn *si,
7226 struct bpf_insn *insn_buf,
7227 struct bpf_prog *prog, u32 *target_size)
7228 {
7229 struct bpf_insn *insn = insn_buf;
7230
7231 #define BPF_TCP_SOCK_GET_COMMON(FIELD) \
7232 do { \
7233 BUILD_BUG_ON(sizeof_field(struct tcp_sock, FIELD) > \
7234 sizeof_field(struct bpf_tcp_sock, FIELD)); \
7235 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_sock, FIELD),\
7236 si->dst_reg, si->src_reg, \
7237 offsetof(struct tcp_sock, FIELD)); \
7238 } while (0)
7239
7240 #define BPF_INET_SOCK_GET_COMMON(FIELD) \
7241 do { \
7242 BUILD_BUG_ON(sizeof_field(struct inet_connection_sock, \
7243 FIELD) > \
7244 sizeof_field(struct bpf_tcp_sock, FIELD)); \
7245 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
7246 struct inet_connection_sock, \
7247 FIELD), \
7248 si->dst_reg, si->src_reg, \
7249 offsetof( \
7250 struct inet_connection_sock, \
7251 FIELD)); \
7252 } while (0)
7253
7254 BTF_TYPE_EMIT(struct bpf_tcp_sock);
7255
7256 switch (si->off) {
7257 case offsetof(struct bpf_tcp_sock, rtt_min):
7258 BUILD_BUG_ON(sizeof_field(struct tcp_sock, rtt_min) !=
7259 sizeof(struct minmax));
7260 BUILD_BUG_ON(sizeof(struct minmax) <
7261 sizeof(struct minmax_sample));
7262
7263 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
7264 offsetof(struct tcp_sock, rtt_min) +
7265 offsetof(struct minmax_sample, v));
7266 break;
7267 case offsetof(struct bpf_tcp_sock, snd_cwnd):
7268 BPF_TCP_SOCK_GET_COMMON(snd_cwnd);
7269 break;
7270 case offsetof(struct bpf_tcp_sock, srtt_us):
7271 BPF_TCP_SOCK_GET_COMMON(srtt_us);
7272 break;
7273 case offsetof(struct bpf_tcp_sock, snd_ssthresh):
7274 BPF_TCP_SOCK_GET_COMMON(snd_ssthresh);
7275 break;
7276 case offsetof(struct bpf_tcp_sock, rcv_nxt):
7277 BPF_TCP_SOCK_GET_COMMON(rcv_nxt);
7278 break;
7279 case offsetof(struct bpf_tcp_sock, snd_nxt):
7280 BPF_TCP_SOCK_GET_COMMON(snd_nxt);
7281 break;
7282 case offsetof(struct bpf_tcp_sock, snd_una):
7283 BPF_TCP_SOCK_GET_COMMON(snd_una);
7284 break;
7285 case offsetof(struct bpf_tcp_sock, mss_cache):
7286 BPF_TCP_SOCK_GET_COMMON(mss_cache);
7287 break;
7288 case offsetof(struct bpf_tcp_sock, ecn_flags):
7289 BPF_TCP_SOCK_GET_COMMON(ecn_flags);
7290 break;
7291 case offsetof(struct bpf_tcp_sock, rate_delivered):
7292 BPF_TCP_SOCK_GET_COMMON(rate_delivered);
7293 break;
7294 case offsetof(struct bpf_tcp_sock, rate_interval_us):
7295 BPF_TCP_SOCK_GET_COMMON(rate_interval_us);
7296 break;
7297 case offsetof(struct bpf_tcp_sock, packets_out):
7298 BPF_TCP_SOCK_GET_COMMON(packets_out);
7299 break;
7300 case offsetof(struct bpf_tcp_sock, retrans_out):
7301 BPF_TCP_SOCK_GET_COMMON(retrans_out);
7302 break;
7303 case offsetof(struct bpf_tcp_sock, total_retrans):
7304 BPF_TCP_SOCK_GET_COMMON(total_retrans);
7305 break;
7306 case offsetof(struct bpf_tcp_sock, segs_in):
7307 BPF_TCP_SOCK_GET_COMMON(segs_in);
7308 break;
7309 case offsetof(struct bpf_tcp_sock, data_segs_in):
7310 BPF_TCP_SOCK_GET_COMMON(data_segs_in);
7311 break;
7312 case offsetof(struct bpf_tcp_sock, segs_out):
7313 BPF_TCP_SOCK_GET_COMMON(segs_out);
7314 break;
7315 case offsetof(struct bpf_tcp_sock, data_segs_out):
7316 BPF_TCP_SOCK_GET_COMMON(data_segs_out);
7317 break;
7318 case offsetof(struct bpf_tcp_sock, lost_out):
7319 BPF_TCP_SOCK_GET_COMMON(lost_out);
7320 break;
7321 case offsetof(struct bpf_tcp_sock, sacked_out):
7322 BPF_TCP_SOCK_GET_COMMON(sacked_out);
7323 break;
7324 case offsetof(struct bpf_tcp_sock, bytes_received):
7325 BPF_TCP_SOCK_GET_COMMON(bytes_received);
7326 break;
7327 case offsetof(struct bpf_tcp_sock, bytes_acked):
7328 BPF_TCP_SOCK_GET_COMMON(bytes_acked);
7329 break;
7330 case offsetof(struct bpf_tcp_sock, dsack_dups):
7331 BPF_TCP_SOCK_GET_COMMON(dsack_dups);
7332 break;
7333 case offsetof(struct bpf_tcp_sock, delivered):
7334 BPF_TCP_SOCK_GET_COMMON(delivered);
7335 break;
7336 case offsetof(struct bpf_tcp_sock, delivered_ce):
7337 BPF_TCP_SOCK_GET_COMMON(delivered_ce);
7338 break;
7339 case offsetof(struct bpf_tcp_sock, icsk_retransmits):
7340 BPF_INET_SOCK_GET_COMMON(icsk_retransmits);
7341 break;
7342 }
7343
7344 return insn - insn_buf;
7345 }
7346
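/* Backs the bpf_tcp_sock() helper: returns the same socket recast as a
 * read-only struct bpf_tcp_sock view, but only for full TCP sockets
 * (not request/timewait minisocks). The RET_PTR_TO_TCP_SOCK_OR_NULL
 * return type makes the verifier enforce a NULL check in the program.
 */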
7347 BPF_CALL_1(bpf_tcp_sock, struct sock *, sk)
7348 {
7349 if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP)
7350 return (unsigned long)sk;
7351
7352 return (unsigned long)NULL;
7353 }
7354
7355 const struct bpf_func_proto bpf_tcp_sock_proto = {
7356 .func = bpf_tcp_sock,
7357 .gpl_only = false,
7358 .ret_type = RET_PTR_TO_TCP_SOCK_OR_NULL,
7359 .arg1_type = ARG_PTR_TO_SOCK_COMMON,
7360 };
7361
7362 BPF_CALL_1(bpf_get_listener_sock, struct sock *, sk)
7363 {
7364 sk = sk_to_full_sk(sk);
7365
7366 if (sk && sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_RCU_FREE))
7367 return (unsigned long)sk;
7368
7369 return (unsigned long)NULL;
7370 }
7371
7372 static const struct bpf_func_proto bpf_get_listener_sock_proto = {
7373 .func = bpf_get_listener_sock,
7374 .gpl_only = false,
7375 .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
7376 .arg1_type = ARG_PTR_TO_SOCK_COMMON,
7377 };
7378
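/* Backs the bpf_skb_ecn_set_ce() helper: sets the CE (Congestion
 * Experienced) codepoint on an IPv4/IPv6 skb when the header is
 * writable. Returns the INET_ECN_set_ce() result (non-zero if CE is now
 * set or was already set) or 0 if the packet was left untouched.
 */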
7379 BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb)
7380 {
7381 unsigned int iphdr_len;
7382
7383 switch (skb_protocol(skb, true)) {
7384 case cpu_to_be16(ETH_P_IP):
7385 iphdr_len = sizeof(struct iphdr);
7386 break;
7387 case cpu_to_be16(ETH_P_IPV6):
7388 iphdr_len = sizeof(struct ipv6hdr);
7389 break;
7390 default:
7391 return 0;
7392 }
7393
7394 if (skb_headlen(skb) < iphdr_len)
7395 return 0;
7396
7397 if (skb_cloned(skb) && !skb_clone_writable(skb, iphdr_len))
7398 return 0;
7399
7400 return INET_ECN_set_ce(skb);
7401 }
7402
7403 bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
7404 struct bpf_insn_access_aux *info)
7405 {
7406 if (off < 0 || off >= offsetofend(struct bpf_xdp_sock, queue_id))
7407 return false;
7408
7409 if (off % size != 0)
7410 return false;
7411
7412 switch (off) {
7413 default:
7414 return size == sizeof(__u32);
7415 }
7416 }
7417
7418 u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
7419 const struct bpf_insn *si,
7420 struct bpf_insn *insn_buf,
7421 struct bpf_prog *prog, u32 *target_size)
7422 {
7423 struct bpf_insn *insn = insn_buf;
7424
7425 #define BPF_XDP_SOCK_GET(FIELD) \
7426 do { \
7427 BUILD_BUG_ON(sizeof_field(struct xdp_sock, FIELD) > \
7428 sizeof_field(struct bpf_xdp_sock, FIELD)); \
7429 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_sock, FIELD),\
7430 si->dst_reg, si->src_reg, \
7431 offsetof(struct xdp_sock, FIELD)); \
7432 } while (0)
7433
7434 switch (si->off) {
7435 case offsetof(struct bpf_xdp_sock, queue_id):
7436 BPF_XDP_SOCK_GET(queue_id);
7437 break;
7438 }
7439
7440 return insn - insn_buf;
7441 }
7442
7443 static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = {
7444 .func = bpf_skb_ecn_set_ce,
7445 .gpl_only = false,
7446 .ret_type = RET_INTEGER,
7447 .arg1_type = ARG_PTR_TO_CTX,
7448 };
7449
7450 BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len,
7451 struct tcphdr *, th, u32, th_len)
7452 {
7453 #ifdef CONFIG_SYN_COOKIES
7454 int ret;
7455
7456 if (unlikely(!sk || th_len < sizeof(*th)))
7457 return -EINVAL;
7458
7459 /* sk_listener() allows TCP_NEW_SYN_RECV, which makes no sense here. */
7460 if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN)
7461 return -EINVAL;
7462
7463 if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies))
7464 return -EINVAL;
7465
7466 if (!th->ack || th->rst || th->syn)
7467 return -ENOENT;
7468
7469 if (unlikely(iph_len < sizeof(struct iphdr)))
7470 return -EINVAL;
7471
7472 if (tcp_synq_no_recent_overflow(sk))
7473 return -ENOENT;
7474
7475 /* Both struct iphdr and struct ipv6hdr have the version field at the
7476 * same offset so we can cast to the shorter header (struct iphdr).
7477 */
7478 switch (((struct iphdr *)iph)->version) {
7479 case 4:
7480 if (sk->sk_family == AF_INET6 && ipv6_only_sock(sk))
7481 return -EINVAL;
7482
7483 ret = __cookie_v4_check((struct iphdr *)iph, th);
7484 break;
7485
7486 #if IS_BUILTIN(CONFIG_IPV6)
7487 case 6:
7488 if (unlikely(iph_len < sizeof(struct ipv6hdr)))
7489 return -EINVAL;
7490
7491 if (sk->sk_family != AF_INET6)
7492 return -EINVAL;
7493
7494 ret = __cookie_v6_check((struct ipv6hdr *)iph, th);
7495 break;
7496 #endif /* CONFIG_IPV6 */
7497
7498 default:
7499 return -EPROTONOSUPPORT;
7500 }
7501
7502 if (ret > 0)
7503 return 0;
7504
7505 return -ENOENT;
7506 #else
7507 return -ENOTSUPP;
7508 #endif
7509 }
7510
7511 static const struct bpf_func_proto bpf_tcp_check_syncookie_proto = {
7512 .func = bpf_tcp_check_syncookie,
7513 .gpl_only = true,
7514 .pkt_access = true,
7515 .ret_type = RET_INTEGER,
7516 .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
7517 .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
7518 .arg3_type = ARG_CONST_SIZE,
7519 .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
7520 .arg5_type = ARG_CONST_SIZE,
7521 };
7522
7523 BPF_CALL_5(bpf_tcp_gen_syncookie, struct sock *, sk, void *, iph, u32, iph_len,
7524 struct tcphdr *, th, u32, th_len)
7525 {
7526 #ifdef CONFIG_SYN_COOKIES
7527 u32 cookie;
7528 u16 mss;
7529
7530 if (unlikely(!sk || th_len < sizeof(*th) || th_len != th->doff * 4))
7531 return -EINVAL;
7532
7533 if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN)
7534 return -EINVAL;
7535
7536 if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies))
7537 return -ENOENT;
7538
7539 if (!th->syn || th->ack || th->fin || th->rst)
7540 return -EINVAL;
7541
7542 if (unlikely(iph_len < sizeof(struct iphdr)))
7543 return -EINVAL;
7544
7545 /* Both struct iphdr and struct ipv6hdr have the version field at the
7546 * same offset so we can cast to the shorter header (struct iphdr).
7547 */
7548 switch (((struct iphdr *)iph)->version) {
7549 case 4:
7550 if (sk->sk_family == AF_INET6 && ipv6_only_sock(sk))
7551 return -EINVAL;
7552
7553 mss = tcp_v4_get_syncookie(sk, iph, th, &cookie);
7554 break;
7555
7556 #if IS_BUILTIN(CONFIG_IPV6)
7557 case 6:
7558 if (unlikely(iph_len < sizeof(struct ipv6hdr)))
7559 return -EINVAL;
7560
7561 if (sk->sk_family != AF_INET6)
7562 return -EINVAL;
7563
7564 mss = tcp_v6_get_syncookie(sk, iph, th, &cookie);
7565 break;
7566 #endif /* CONFIG_IPV6 */
7567
7568 default:
7569 return -EPROTONOSUPPORT;
7570 }
7571 if (mss == 0)
7572 return -ENOENT;
7573
7574 return cookie | ((u64)mss << 32);
7575 #else
7576 return -EOPNOTSUPP;
7577 #endif /* CONFIG_SYN_COOKIES */
7578 }
7579
7580 static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = {
7581 .func = bpf_tcp_gen_syncookie,
7582 .gpl_only = true, /* __cookie_v*_init_sequence() is GPL */
7583 .pkt_access = true,
7584 .ret_type = RET_INTEGER,
7585 .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
7586 .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
7587 .arg3_type = ARG_CONST_SIZE,
7588 .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
7589 .arg5_type = ARG_CONST_SIZE,
7590 };
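/* Consumer-side sketch (illustrative, not from this file): the helper
 * packs the 32-bit ISN cookie in the low half of the return value and
 * the peer MSS in the upper half, so a TC/XDP program typically does:
 *
 *	__s64 ret = bpf_tcp_gen_syncookie(sk, iph, iph_len, th, th_len);
 *	if (ret >= 0) {
 *		__u32 cookie = (__u32)ret;
 *		__u16 mss = ret >> 32;
 *		// craft the SYN-ACK with seq = cookie, advertise mss
 *	}
 */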
7591
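/* Backs the bpf_sk_assign() helper for TC ingress: it steers the skb to
 * a previously looked-up socket (e.g. for a TPROXY-style redirect) by
 * orphaning the skb and attaching the chosen sk, with sock_pfree()
 * releasing any reference taken here when the skb is freed.
 */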
7592 BPF_CALL_3(bpf_sk_assign, struct sk_buff *, skb, struct sock *, sk, u64, flags)
7593 {
7594 if (!sk || flags != 0)
7595 return -EINVAL;
7596 if (!skb_at_tc_ingress(skb))
7597 return -EOPNOTSUPP;
7598 if (unlikely(dev_net(skb->dev) != sock_net(sk)))
7599 return -ENETUNREACH;
7600 if (sk_unhashed(sk))
7601 return -EOPNOTSUPP;
7602 if (sk_is_refcounted(sk) &&
7603 unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
7604 return -ENOENT;
7605
7606 skb_orphan(skb);
7607 skb->sk = sk;
7608 skb->destructor = sock_pfree;
7609
7610 return 0;
7611 }
7612
7613 static const struct bpf_func_proto bpf_sk_assign_proto = {
7614 .func = bpf_sk_assign,
7615 .gpl_only = false,
7616 .ret_type = RET_INTEGER,
7617 .arg1_type = ARG_PTR_TO_CTX,
7618 .arg2_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
7619 .arg3_type = ARG_ANYTHING,
7620 };
7621
7622 static const u8 *bpf_search_tcp_opt(const u8 *op, const u8 *opend,
7623 u8 search_kind, const u8 *magic,
7624 u8 magic_len, bool *eol)
7625 {
7626 u8 kind, kind_len;
7627
7628 *eol = false;
7629
7630 while (op < opend) {
7631 kind = op[0];
7632
7633 if (kind == TCPOPT_EOL) {
7634 *eol = true;
7635 return ERR_PTR(-ENOMSG);
7636 } else if (kind == TCPOPT_NOP) {
7637 op++;
7638 continue;
7639 }
7640
7641 if (opend - op < 2 || opend - op < op[1] || op[1] < 2)
7642 /* Something is wrong in the received header.
7643 * Follow the TCP stack's tcp_parse_options()
7644 * and just bail here.
7645 */
7646 return ERR_PTR(-EFAULT);
7647
7648 kind_len = op[1];
7649 if (search_kind == kind) {
7650 if (!magic_len)
7651 return op;
7652
7653 if (magic_len > kind_len - 2)
7654 return ERR_PTR(-ENOMSG);
7655
7656 if (!memcmp(&op[2], magic, magic_len))
7657 return op;
7658 }
7659
7660 op += kind_len;
7661 }
7662
7663 return ERR_PTR(-ENOMSG);
7664 }
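/* TCP options on the wire are a (kind, length, data) sequence, e.g. the
 * MSS option is the four bytes 0x02 0x04 <mss_hi> <mss_lo>; TCPOPT_NOP
 * (0x01) is a lone padding byte and TCPOPT_EOL (0x00) terminates the
 * list, which is why the walk above special-cases both before trusting
 * op[1] as the option length.
 */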
7665
7666 BPF_CALL_4(bpf_sock_ops_load_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock,
7667 void *, search_res, u32, len, u64, flags)
7668 {
7669 bool eol, load_syn = flags & BPF_LOAD_HDR_OPT_TCP_SYN;
7670 const u8 *op, *opend, *magic, *search = search_res;
7671 u8 search_kind, search_len, copy_len, magic_len;
7672 int ret;
7673
7674 if (!is_locked_tcp_sock_ops(bpf_sock))
7675 return -EOPNOTSUPP;
7676
7677 	/* Two bytes (kind + length) is the minimal option length. TCPOPT_NOP
7678 	 * and TCPOPT_EOL carry nothing useful for a bpf prog to learn, so
7679 	 * this helper also disallows loading them.
7680 	 */
7681 if (len < 2 || flags & ~BPF_LOAD_HDR_OPT_TCP_SYN)
7682 return -EINVAL;
7683
7684 search_kind = search[0];
7685 search_len = search[1];
7686
7687 if (search_len > len || search_kind == TCPOPT_NOP ||
7688 search_kind == TCPOPT_EOL)
7689 return -EINVAL;
7690
7691 if (search_kind == TCPOPT_EXP || search_kind == 253) {
7692 /* 16 or 32 bit magic. +2 for kind and kind length */
7693 if (search_len != 4 && search_len != 6)
7694 return -EINVAL;
7695 magic = &search[2];
7696 magic_len = search_len - 2;
7697 } else {
7698 if (search_len)
7699 return -EINVAL;
7700 magic = NULL;
7701 magic_len = 0;
7702 }
7703
7704 if (load_syn) {
7705 ret = bpf_sock_ops_get_syn(bpf_sock, TCP_BPF_SYN, &op);
7706 if (ret < 0)
7707 return ret;
7708
7709 opend = op + ret;
7710 op += sizeof(struct tcphdr);
7711 } else {
7712 if (!bpf_sock->skb ||
7713 bpf_sock->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB)
7714 /* This bpf_sock->op cannot call this helper */
7715 return -EPERM;
7716
7717 opend = bpf_sock->skb_data_end;
7718 op = bpf_sock->skb->data + sizeof(struct tcphdr);
7719 }
7720
7721 op = bpf_search_tcp_opt(op, opend, search_kind, magic, magic_len,
7722 &eol);
7723 if (IS_ERR(op))
7724 return PTR_ERR(op);
7725
7726 copy_len = op[1];
7727 ret = copy_len;
7728 if (copy_len > len) {
7729 ret = -ENOSPC;
7730 copy_len = len;
7731 }
7732
7733 memcpy(search_res, op, copy_len);
7734 return ret;
7735 }
7736
7737 static const struct bpf_func_proto bpf_sock_ops_load_hdr_opt_proto = {
7738 .func = bpf_sock_ops_load_hdr_opt,
7739 .gpl_only = false,
7740 .ret_type = RET_INTEGER,
7741 .arg1_type = ARG_PTR_TO_CTX,
7742 .arg2_type = ARG_PTR_TO_MEM | MEM_WRITE,
7743 .arg3_type = ARG_CONST_SIZE,
7744 .arg4_type = ARG_ANYTHING,
7745 };
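/* Usage sketch (illustrative, not from this file) from a sockops program
 * searching for an experimental option (TCPOPT_EXP == 254) that carries
 * a 2-byte magic; the 0xeB9F magic is just an example value:
 *
 *	struct {
 *		__u8 kind, len;
 *		__be16 magic;
 *		__u8 data[4];
 *	} opt = { .kind = 254, .len = 4, .magic = bpf_htons(0xeB9F) };
 *	int ret;
 *
 *	ret = bpf_load_hdr_opt(skops, &opt, sizeof(opt), 0);
 *	// ret > 0:  full option length copied into &opt
 *	// -ENOMSG:  option not present
 *	// -ENOSPC:  option longer than sizeof(opt)
 */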
7746
7747 BPF_CALL_4(bpf_sock_ops_store_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock,
7748 const void *, from, u32, len, u64, flags)
7749 {
7750 u8 new_kind, new_kind_len, magic_len = 0, *opend;
7751 const u8 *op, *new_op, *magic = NULL;
7752 struct sk_buff *skb;
7753 bool eol;
7754
7755 if (bpf_sock->op != BPF_SOCK_OPS_WRITE_HDR_OPT_CB)
7756 return -EPERM;
7757
7758 if (len < 2 || flags)
7759 return -EINVAL;
7760
7761 new_op = from;
7762 new_kind = new_op[0];
7763 new_kind_len = new_op[1];
7764
7765 if (new_kind_len > len || new_kind == TCPOPT_NOP ||
7766 new_kind == TCPOPT_EOL)
7767 return -EINVAL;
7768
7769 if (new_kind_len > bpf_sock->remaining_opt_len)
7770 return -ENOSPC;
7771
7772 /* 253 is another experimental kind */
7773 if (new_kind == TCPOPT_EXP || new_kind == 253) {
7774 if (new_kind_len < 4)
7775 return -EINVAL;
7776 		/* Also match on the 2 byte magic.
7777 		 * Per RFC 6994, the magic may be 2 or 4 bytes.
7778 		 * Matching on only 2 bytes is therefore on the
7779 		 * conservative side, but it is the right
7780 		 * thing to do for the 'search-for-duplication'
7781 		 * purpose.
7782 		 */
7783 magic = &new_op[2];
7784 magic_len = 2;
7785 }
7786
7787 /* Check for duplication */
7788 skb = bpf_sock->skb;
7789 op = skb->data + sizeof(struct tcphdr);
7790 opend = bpf_sock->skb_data_end;
7791
7792 op = bpf_search_tcp_opt(op, opend, new_kind, magic, magic_len,
7793 &eol);
7794 if (!IS_ERR(op))
7795 return -EEXIST;
7796
7797 if (PTR_ERR(op) != -ENOMSG)
7798 return PTR_ERR(op);
7799
7800 if (eol)
7801 		/* The options have been terminated by TCPOPT_EOL.
7802 		 * Treat it as if no more header options can be written.
7803 		 */
7804 return -ENOSPC;
7805
7806 /* No duplication found. Store the header option. */
7807 memcpy(opend, from, new_kind_len);
7808
7809 bpf_sock->remaining_opt_len -= new_kind_len;
7810 bpf_sock->skb_data_end += new_kind_len;
7811
7812 return 0;
7813 }
7814
7815 static const struct bpf_func_proto bpf_sock_ops_store_hdr_opt_proto = {
7816 .func = bpf_sock_ops_store_hdr_opt,
7817 .gpl_only = false,
7818 .ret_type = RET_INTEGER,
7819 .arg1_type = ARG_PTR_TO_CTX,
7820 .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
7821 .arg3_type = ARG_CONST_SIZE,
7822 .arg4_type = ARG_ANYTHING,
7823 };
7824
7825 BPF_CALL_3(bpf_sock_ops_reserve_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock,
7826 u32, len, u64, flags)
7827 {
7828 if (bpf_sock->op != BPF_SOCK_OPS_HDR_OPT_LEN_CB)
7829 return -EPERM;
7830
7831 if (flags || len < 2)
7832 return -EINVAL;
7833
7834 if (len > bpf_sock->remaining_opt_len)
7835 return -ENOSPC;
7836
7837 bpf_sock->remaining_opt_len -= len;
7838
7839 return 0;
7840 }
7841
7842 static const struct bpf_func_proto bpf_sock_ops_reserve_hdr_opt_proto = {
7843 .func = bpf_sock_ops_reserve_hdr_opt,
7844 .gpl_only = false,
7845 .ret_type = RET_INTEGER,
7846 .arg1_type = ARG_PTR_TO_CTX,
7847 .arg2_type = ARG_ANYTHING,
7848 .arg3_type = ARG_ANYTHING,
7849 };
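/* The three header-option helpers above cooperate across sock_ops
 * callbacks: a program reserves space with bpf_reserve_hdr_opt() from
 * BPF_SOCK_OPS_HDR_OPT_LEN_CB, writes the bytes with bpf_store_hdr_opt()
 * from BPF_SOCK_OPS_WRITE_HDR_OPT_CB, and reads options back from
 * received segments with bpf_load_hdr_opt(). remaining_opt_len is the
 * budget shared by the reserve and store steps.
 */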
7850
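/* Backs the bpf_skb_set_tstamp() helper (exposed to tc programs via
 * tc_cls_act_func_proto() below): (re)stamps an IPv4/IPv6 skb with a
 * delivery time. A zero tstamp is only accepted with
 * BPF_SKB_CLOCK_REALTIME, where it effectively clears the timestamp;
 * MONOTONIC and TAI require a non-zero value.
 */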
7851 BPF_CALL_3(bpf_skb_set_tstamp, struct sk_buff *, skb,
7852 u64, tstamp, u32, tstamp_type)
7853 {
7854 /* skb_clear_delivery_time() is done for inet protocol */
7855 if (skb->protocol != htons(ETH_P_IP) &&
7856 skb->protocol != htons(ETH_P_IPV6))
7857 return -EOPNOTSUPP;
7858
7859 switch (tstamp_type) {
7860 case BPF_SKB_CLOCK_REALTIME:
7861 skb->tstamp = tstamp;
7862 skb->tstamp_type = SKB_CLOCK_REALTIME;
7863 break;
7864 case BPF_SKB_CLOCK_MONOTONIC:
7865 if (!tstamp)
7866 return -EINVAL;
7867 skb->tstamp = tstamp;
7868 skb->tstamp_type = SKB_CLOCK_MONOTONIC;
7869 break;
7870 case BPF_SKB_CLOCK_TAI:
7871 if (!tstamp)
7872 return -EINVAL;
7873 skb->tstamp = tstamp;
7874 skb->tstamp_type = SKB_CLOCK_TAI;
7875 break;
7876 default:
7877 return -EINVAL;
7878 }
7879
7880 return 0;
7881 }
7882
7883 static const struct bpf_func_proto bpf_skb_set_tstamp_proto = {
7884 .func = bpf_skb_set_tstamp,
7885 .gpl_only = false,
7886 .ret_type = RET_INTEGER,
7887 .arg1_type = ARG_PTR_TO_CTX,
7888 .arg2_type = ARG_ANYTHING,
7889 .arg3_type = ARG_ANYTHING,
7890 };
7891
7892 #ifdef CONFIG_SYN_COOKIES
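/* Unlike bpf_tcp_check_syncookie()/bpf_tcp_gen_syncookie() above, the
 * tcp_raw_* helpers below do not take a listening socket: they operate
 * directly on the packet headers, which makes them usable from XDP/TC
 * SYN-proxy setups where no local listener exists for the flow.
 */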
7893 BPF_CALL_3(bpf_tcp_raw_gen_syncookie_ipv4, struct iphdr *, iph,
7894 struct tcphdr *, th, u32, th_len)
7895 {
7896 u32 cookie;
7897 u16 mss;
7898
7899 if (unlikely(th_len < sizeof(*th) || th_len != th->doff * 4))
7900 return -EINVAL;
7901
7902 mss = tcp_parse_mss_option(th, 0) ?: TCP_MSS_DEFAULT;
7903 cookie = __cookie_v4_init_sequence(iph, th, &mss);
7904
7905 return cookie | ((u64)mss << 32);
7906 }
7907
7908 static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv4_proto = {
7909 .func = bpf_tcp_raw_gen_syncookie_ipv4,
7910 .gpl_only = true, /* __cookie_v4_init_sequence() is GPL */
7911 .pkt_access = true,
7912 .ret_type = RET_INTEGER,
7913 .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM,
7914 .arg1_size = sizeof(struct iphdr),
7915 .arg2_type = ARG_PTR_TO_MEM,
7916 .arg3_type = ARG_CONST_SIZE_OR_ZERO,
7917 };
7918
7919 BPF_CALL_3(bpf_tcp_raw_gen_syncookie_ipv6, struct ipv6hdr *, iph,
7920 struct tcphdr *, th, u32, th_len)
7921 {
7922 #if IS_BUILTIN(CONFIG_IPV6)
7923 const u16 mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) -
7924 sizeof(struct ipv6hdr);
7925 u32 cookie;
7926 u16 mss;
7927
7928 if (unlikely(th_len < sizeof(*th) || th_len != th->doff * 4))
7929 return -EINVAL;
7930
7931 mss = tcp_parse_mss_option(th, 0) ?: mss_clamp;
7932 cookie = __cookie_v6_init_sequence(iph, th, &mss);
7933
7934 return cookie | ((u64)mss << 32);
7935 #else
7936 return -EPROTONOSUPPORT;
7937 #endif
7938 }
7939
7940 static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv6_proto = {
7941 .func = bpf_tcp_raw_gen_syncookie_ipv6,
7942 .gpl_only = true, /* __cookie_v6_init_sequence() is GPL */
7943 .pkt_access = true,
7944 .ret_type = RET_INTEGER,
7945 .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM,
7946 .arg1_size = sizeof(struct ipv6hdr),
7947 .arg2_type = ARG_PTR_TO_MEM,
7948 .arg3_type = ARG_CONST_SIZE_OR_ZERO,
7949 };
7950
7951 BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv4, struct iphdr *, iph,
7952 struct tcphdr *, th)
7953 {
7954 if (__cookie_v4_check(iph, th) > 0)
7955 return 0;
7956
7957 return -EACCES;
7958 }
7959
7960 static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv4_proto = {
7961 .func = bpf_tcp_raw_check_syncookie_ipv4,
7962 .gpl_only = true, /* __cookie_v4_check is GPL */
7963 .pkt_access = true,
7964 .ret_type = RET_INTEGER,
7965 .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM,
7966 .arg1_size = sizeof(struct iphdr),
7967 .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM,
7968 .arg2_size = sizeof(struct tcphdr),
7969 };
7970
7971 BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv6, struct ipv6hdr *, iph,
7972 struct tcphdr *, th)
7973 {
7974 #if IS_BUILTIN(CONFIG_IPV6)
7975 if (__cookie_v6_check(iph, th) > 0)
7976 return 0;
7977
7978 return -EACCES;
7979 #else
7980 return -EPROTONOSUPPORT;
7981 #endif
7982 }
7983
7984 static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv6_proto = {
7985 .func = bpf_tcp_raw_check_syncookie_ipv6,
7986 .gpl_only = true, /* __cookie_v6_check is GPL */
7987 .pkt_access = true,
7988 .ret_type = RET_INTEGER,
7989 .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM,
7990 .arg1_size = sizeof(struct ipv6hdr),
7991 .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM,
7992 .arg2_size = sizeof(struct tcphdr),
7993 };
7994 #endif /* CONFIG_SYN_COOKIES */
7995
7996 #endif /* CONFIG_INET */
7997
7998 bool bpf_helper_changes_pkt_data(enum bpf_func_id func_id)
7999 {
8000 switch (func_id) {
8001 case BPF_FUNC_clone_redirect:
8002 case BPF_FUNC_l3_csum_replace:
8003 case BPF_FUNC_l4_csum_replace:
8004 case BPF_FUNC_lwt_push_encap:
8005 case BPF_FUNC_lwt_seg6_action:
8006 case BPF_FUNC_lwt_seg6_adjust_srh:
8007 case BPF_FUNC_lwt_seg6_store_bytes:
8008 case BPF_FUNC_msg_pop_data:
8009 case BPF_FUNC_msg_pull_data:
8010 case BPF_FUNC_msg_push_data:
8011 case BPF_FUNC_skb_adjust_room:
8012 case BPF_FUNC_skb_change_head:
8013 case BPF_FUNC_skb_change_proto:
8014 case BPF_FUNC_skb_change_tail:
8015 case BPF_FUNC_skb_pull_data:
8016 case BPF_FUNC_skb_store_bytes:
8017 case BPF_FUNC_skb_vlan_pop:
8018 case BPF_FUNC_skb_vlan_push:
8019 case BPF_FUNC_store_hdr_opt:
8020 case BPF_FUNC_xdp_adjust_head:
8021 case BPF_FUNC_xdp_adjust_meta:
8022 case BPF_FUNC_xdp_adjust_tail:
8023 /* tail-called program could call any of the above */
8024 case BPF_FUNC_tail_call:
8025 return true;
8026 default:
8027 return false;
8028 }
8029 }
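/* The verifier consults this to invalidate all packet pointers (skb/xdp
 * data, data_end, data_meta) after a call to one of these helpers, so
 * programs must re-derive and re-bound-check them, e.g. (sketch):
 *
 *	bpf_skb_pull_data(skb, len);
 *	data = (void *)(long)skb->data;		// stale pointers are rejected
 *	data_end = (void *)(long)skb->data_end;
 *	if (data + sizeof(struct ethhdr) > data_end)
 *		return TC_ACT_SHOT;
 */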
8030
8031 const struct bpf_func_proto bpf_event_output_data_proto __weak;
8032 const struct bpf_func_proto bpf_sk_storage_get_cg_sock_proto __weak;
8033
8034 static const struct bpf_func_proto *
8035 sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
8036 {
8037 const struct bpf_func_proto *func_proto;
8038
8039 func_proto = cgroup_common_func_proto(func_id, prog);
8040 if (func_proto)
8041 return func_proto;
8042
8043 switch (func_id) {
8044 case BPF_FUNC_get_socket_cookie:
8045 return &bpf_get_socket_cookie_sock_proto;
8046 case BPF_FUNC_get_netns_cookie:
8047 return &bpf_get_netns_cookie_sock_proto;
8048 case BPF_FUNC_perf_event_output:
8049 return &bpf_event_output_data_proto;
8050 case BPF_FUNC_sk_storage_get:
8051 return &bpf_sk_storage_get_cg_sock_proto;
8052 case BPF_FUNC_ktime_get_coarse_ns:
8053 return &bpf_ktime_get_coarse_ns_proto;
8054 default:
8055 return bpf_base_func_proto(func_id, prog);
8056 }
8057 }
8058
8059 static const struct bpf_func_proto *
8060 sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
8061 {
8062 const struct bpf_func_proto *func_proto;
8063
8064 func_proto = cgroup_common_func_proto(func_id, prog);
8065 if (func_proto)
8066 return func_proto;
8067
8068 switch (func_id) {
8069 case BPF_FUNC_bind:
8070 switch (prog->expected_attach_type) {
8071 case BPF_CGROUP_INET4_CONNECT:
8072 case BPF_CGROUP_INET6_CONNECT:
8073 return &bpf_bind_proto;
8074 default:
8075 return NULL;
8076 }
8077 case BPF_FUNC_get_socket_cookie:
8078 return &bpf_get_socket_cookie_sock_addr_proto;
8079 case BPF_FUNC_get_netns_cookie:
8080 return &bpf_get_netns_cookie_sock_addr_proto;
8081 case BPF_FUNC_perf_event_output:
8082 return &bpf_event_output_data_proto;
8083 #ifdef CONFIG_INET
8084 case BPF_FUNC_sk_lookup_tcp:
8085 return &bpf_sock_addr_sk_lookup_tcp_proto;
8086 case BPF_FUNC_sk_lookup_udp:
8087 return &bpf_sock_addr_sk_lookup_udp_proto;
8088 case BPF_FUNC_sk_release:
8089 return &bpf_sk_release_proto;
8090 case BPF_FUNC_skc_lookup_tcp:
8091 return &bpf_sock_addr_skc_lookup_tcp_proto;
8092 #endif /* CONFIG_INET */
8093 case BPF_FUNC_sk_storage_get:
8094 return &bpf_sk_storage_get_proto;
8095 case BPF_FUNC_sk_storage_delete:
8096 return &bpf_sk_storage_delete_proto;
8097 case BPF_FUNC_setsockopt:
8098 switch (prog->expected_attach_type) {
8099 case BPF_CGROUP_INET4_BIND:
8100 case BPF_CGROUP_INET6_BIND:
8101 case BPF_CGROUP_INET4_CONNECT:
8102 case BPF_CGROUP_INET6_CONNECT:
8103 case BPF_CGROUP_UNIX_CONNECT:
8104 case BPF_CGROUP_UDP4_RECVMSG:
8105 case BPF_CGROUP_UDP6_RECVMSG:
8106 case BPF_CGROUP_UNIX_RECVMSG:
8107 case BPF_CGROUP_UDP4_SENDMSG:
8108 case BPF_CGROUP_UDP6_SENDMSG:
8109 case BPF_CGROUP_UNIX_SENDMSG:
8110 case BPF_CGROUP_INET4_GETPEERNAME:
8111 case BPF_CGROUP_INET6_GETPEERNAME:
8112 case BPF_CGROUP_UNIX_GETPEERNAME:
8113 case BPF_CGROUP_INET4_GETSOCKNAME:
8114 case BPF_CGROUP_INET6_GETSOCKNAME:
8115 case BPF_CGROUP_UNIX_GETSOCKNAME:
8116 return &bpf_sock_addr_setsockopt_proto;
8117 default:
8118 return NULL;
8119 }
8120 case BPF_FUNC_getsockopt:
8121 switch (prog->expected_attach_type) {
8122 case BPF_CGROUP_INET4_BIND:
8123 case BPF_CGROUP_INET6_BIND:
8124 case BPF_CGROUP_INET4_CONNECT:
8125 case BPF_CGROUP_INET6_CONNECT:
8126 case BPF_CGROUP_UNIX_CONNECT:
8127 case BPF_CGROUP_UDP4_RECVMSG:
8128 case BPF_CGROUP_UDP6_RECVMSG:
8129 case BPF_CGROUP_UNIX_RECVMSG:
8130 case BPF_CGROUP_UDP4_SENDMSG:
8131 case BPF_CGROUP_UDP6_SENDMSG:
8132 case BPF_CGROUP_UNIX_SENDMSG:
8133 case BPF_CGROUP_INET4_GETPEERNAME:
8134 case BPF_CGROUP_INET6_GETPEERNAME:
8135 case BPF_CGROUP_UNIX_GETPEERNAME:
8136 case BPF_CGROUP_INET4_GETSOCKNAME:
8137 case BPF_CGROUP_INET6_GETSOCKNAME:
8138 case BPF_CGROUP_UNIX_GETSOCKNAME:
8139 return &bpf_sock_addr_getsockopt_proto;
8140 default:
8141 return NULL;
8142 }
8143 default:
8144 return bpf_sk_base_func_proto(func_id, prog);
8145 }
8146 }
8147
8148 static const struct bpf_func_proto *
8149 sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
8150 {
8151 switch (func_id) {
8152 case BPF_FUNC_skb_load_bytes:
8153 return &bpf_skb_load_bytes_proto;
8154 case BPF_FUNC_skb_load_bytes_relative:
8155 return &bpf_skb_load_bytes_relative_proto;
8156 case BPF_FUNC_get_socket_cookie:
8157 return &bpf_get_socket_cookie_proto;
8158 case BPF_FUNC_get_netns_cookie:
8159 return &bpf_get_netns_cookie_proto;
8160 case BPF_FUNC_get_socket_uid:
8161 return &bpf_get_socket_uid_proto;
8162 case BPF_FUNC_perf_event_output:
8163 return &bpf_skb_event_output_proto;
8164 default:
8165 return bpf_sk_base_func_proto(func_id, prog);
8166 }
8167 }
8168
8169 const struct bpf_func_proto bpf_sk_storage_get_proto __weak;
8170 const struct bpf_func_proto bpf_sk_storage_delete_proto __weak;
8171
8172 static const struct bpf_func_proto *
8173 cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
8174 {
8175 const struct bpf_func_proto *func_proto;
8176
8177 func_proto = cgroup_common_func_proto(func_id, prog);
8178 if (func_proto)
8179 return func_proto;
8180
8181 switch (func_id) {
8182 case BPF_FUNC_sk_fullsock:
8183 return &bpf_sk_fullsock_proto;
8184 case BPF_FUNC_sk_storage_get:
8185 return &bpf_sk_storage_get_proto;
8186 case BPF_FUNC_sk_storage_delete:
8187 return &bpf_sk_storage_delete_proto;
8188 case BPF_FUNC_perf_event_output:
8189 return &bpf_skb_event_output_proto;
8190 #ifdef CONFIG_SOCK_CGROUP_DATA
8191 case BPF_FUNC_skb_cgroup_id:
8192 return &bpf_skb_cgroup_id_proto;
8193 case BPF_FUNC_skb_ancestor_cgroup_id:
8194 return &bpf_skb_ancestor_cgroup_id_proto;
8195 case BPF_FUNC_sk_cgroup_id:
8196 return &bpf_sk_cgroup_id_proto;
8197 case BPF_FUNC_sk_ancestor_cgroup_id:
8198 return &bpf_sk_ancestor_cgroup_id_proto;
8199 #endif
8200 #ifdef CONFIG_INET
8201 case BPF_FUNC_sk_lookup_tcp:
8202 return &bpf_sk_lookup_tcp_proto;
8203 case BPF_FUNC_sk_lookup_udp:
8204 return &bpf_sk_lookup_udp_proto;
8205 case BPF_FUNC_sk_release:
8206 return &bpf_sk_release_proto;
8207 case BPF_FUNC_skc_lookup_tcp:
8208 return &bpf_skc_lookup_tcp_proto;
8209 case BPF_FUNC_tcp_sock:
8210 return &bpf_tcp_sock_proto;
8211 case BPF_FUNC_get_listener_sock:
8212 return &bpf_get_listener_sock_proto;
8213 case BPF_FUNC_skb_ecn_set_ce:
8214 return &bpf_skb_ecn_set_ce_proto;
8215 #endif
8216 default:
8217 return sk_filter_func_proto(func_id, prog);
8218 }
8219 }
8220
8221 static const struct bpf_func_proto *
8222 tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
8223 {
8224 switch (func_id) {
8225 case BPF_FUNC_skb_store_bytes:
8226 return &bpf_skb_store_bytes_proto;
8227 case BPF_FUNC_skb_load_bytes:
8228 return &bpf_skb_load_bytes_proto;
8229 case BPF_FUNC_skb_load_bytes_relative:
8230 return &bpf_skb_load_bytes_relative_proto;
8231 case BPF_FUNC_skb_pull_data:
8232 return &bpf_skb_pull_data_proto;
8233 case BPF_FUNC_csum_diff:
8234 return &bpf_csum_diff_proto;
8235 case BPF_FUNC_csum_update:
8236 return &bpf_csum_update_proto;
8237 case BPF_FUNC_csum_level:
8238 return &bpf_csum_level_proto;
8239 case BPF_FUNC_l3_csum_replace:
8240 return &bpf_l3_csum_replace_proto;
8241 case BPF_FUNC_l4_csum_replace:
8242 return &bpf_l4_csum_replace_proto;
8243 case BPF_FUNC_clone_redirect:
8244 return &bpf_clone_redirect_proto;
8245 case BPF_FUNC_get_cgroup_classid:
8246 return &bpf_get_cgroup_classid_proto;
8247 case BPF_FUNC_skb_vlan_push:
8248 return &bpf_skb_vlan_push_proto;
8249 case BPF_FUNC_skb_vlan_pop:
8250 return &bpf_skb_vlan_pop_proto;
8251 case BPF_FUNC_skb_change_proto:
8252 return &bpf_skb_change_proto_proto;
8253 case BPF_FUNC_skb_change_type:
8254 return &bpf_skb_change_type_proto;
8255 case BPF_FUNC_skb_adjust_room:
8256 return &bpf_skb_adjust_room_proto;
8257 case BPF_FUNC_skb_change_tail:
8258 return &bpf_skb_change_tail_proto;
8259 case BPF_FUNC_skb_change_head:
8260 return &bpf_skb_change_head_proto;
8261 case BPF_FUNC_skb_get_tunnel_key:
8262 return &bpf_skb_get_tunnel_key_proto;
8263 case BPF_FUNC_skb_set_tunnel_key:
8264 return bpf_get_skb_set_tunnel_proto(func_id);
8265 case BPF_FUNC_skb_get_tunnel_opt:
8266 return &bpf_skb_get_tunnel_opt_proto;
8267 case BPF_FUNC_skb_set_tunnel_opt:
8268 return bpf_get_skb_set_tunnel_proto(func_id);
8269 case BPF_FUNC_redirect:
8270 return &bpf_redirect_proto;
8271 case BPF_FUNC_redirect_neigh:
8272 return &bpf_redirect_neigh_proto;
8273 case BPF_FUNC_redirect_peer:
8274 return &bpf_redirect_peer_proto;
8275 case BPF_FUNC_get_route_realm:
8276 return &bpf_get_route_realm_proto;
8277 case BPF_FUNC_get_hash_recalc:
8278 return &bpf_get_hash_recalc_proto;
8279 case BPF_FUNC_set_hash_invalid:
8280 return &bpf_set_hash_invalid_proto;
8281 case BPF_FUNC_set_hash:
8282 return &bpf_set_hash_proto;
8283 case BPF_FUNC_perf_event_output:
8284 return &bpf_skb_event_output_proto;
8285 case BPF_FUNC_get_smp_processor_id:
8286 return &bpf_get_smp_processor_id_proto;
8287 case BPF_FUNC_skb_under_cgroup:
8288 return &bpf_skb_under_cgroup_proto;
8289 case BPF_FUNC_get_socket_cookie:
8290 return &bpf_get_socket_cookie_proto;
8291 case BPF_FUNC_get_netns_cookie:
8292 return &bpf_get_netns_cookie_proto;
8293 case BPF_FUNC_get_socket_uid:
8294 return &bpf_get_socket_uid_proto;
8295 case BPF_FUNC_fib_lookup:
8296 return &bpf_skb_fib_lookup_proto;
8297 case BPF_FUNC_check_mtu:
8298 return &bpf_skb_check_mtu_proto;
8299 case BPF_FUNC_sk_fullsock:
8300 return &bpf_sk_fullsock_proto;
8301 case BPF_FUNC_sk_storage_get:
8302 return &bpf_sk_storage_get_proto;
8303 case BPF_FUNC_sk_storage_delete:
8304 return &bpf_sk_storage_delete_proto;
8305 #ifdef CONFIG_XFRM
8306 case BPF_FUNC_skb_get_xfrm_state:
8307 return &bpf_skb_get_xfrm_state_proto;
8308 #endif
8309 #ifdef CONFIG_CGROUP_NET_CLASSID
8310 case BPF_FUNC_skb_cgroup_classid:
8311 return &bpf_skb_cgroup_classid_proto;
8312 #endif
8313 #ifdef CONFIG_SOCK_CGROUP_DATA
8314 case BPF_FUNC_skb_cgroup_id:
8315 return &bpf_skb_cgroup_id_proto;
8316 case BPF_FUNC_skb_ancestor_cgroup_id:
8317 return &bpf_skb_ancestor_cgroup_id_proto;
8318 #endif
8319 #ifdef CONFIG_INET
8320 case BPF_FUNC_sk_lookup_tcp:
8321 return &bpf_tc_sk_lookup_tcp_proto;
8322 case BPF_FUNC_sk_lookup_udp:
8323 return &bpf_tc_sk_lookup_udp_proto;
8324 case BPF_FUNC_sk_release:
8325 return &bpf_sk_release_proto;
8326 case BPF_FUNC_tcp_sock:
8327 return &bpf_tcp_sock_proto;
8328 case BPF_FUNC_get_listener_sock:
8329 return &bpf_get_listener_sock_proto;
8330 case BPF_FUNC_skc_lookup_tcp:
8331 return &bpf_tc_skc_lookup_tcp_proto;
8332 case BPF_FUNC_tcp_check_syncookie:
8333 return &bpf_tcp_check_syncookie_proto;
8334 case BPF_FUNC_skb_ecn_set_ce:
8335 return &bpf_skb_ecn_set_ce_proto;
8336 case BPF_FUNC_tcp_gen_syncookie:
8337 return &bpf_tcp_gen_syncookie_proto;
8338 case BPF_FUNC_sk_assign:
8339 return &bpf_sk_assign_proto;
8340 case BPF_FUNC_skb_set_tstamp:
8341 return &bpf_skb_set_tstamp_proto;
8342 #ifdef CONFIG_SYN_COOKIES
8343 case BPF_FUNC_tcp_raw_gen_syncookie_ipv4:
8344 return &bpf_tcp_raw_gen_syncookie_ipv4_proto;
8345 case BPF_FUNC_tcp_raw_gen_syncookie_ipv6:
8346 return &bpf_tcp_raw_gen_syncookie_ipv6_proto;
8347 case BPF_FUNC_tcp_raw_check_syncookie_ipv4:
8348 return &bpf_tcp_raw_check_syncookie_ipv4_proto;
8349 case BPF_FUNC_tcp_raw_check_syncookie_ipv6:
8350 return &bpf_tcp_raw_check_syncookie_ipv6_proto;
8351 #endif
8352 #endif
8353 default:
8354 return bpf_sk_base_func_proto(func_id, prog);
8355 }
8356 }
8357
8358 static const struct bpf_func_proto *
8359 xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
8360 {
8361 switch (func_id) {
8362 case BPF_FUNC_perf_event_output:
8363 return &bpf_xdp_event_output_proto;
8364 case BPF_FUNC_get_smp_processor_id:
8365 return &bpf_get_smp_processor_id_proto;
8366 case BPF_FUNC_csum_diff:
8367 return &bpf_csum_diff_proto;
8368 case BPF_FUNC_xdp_adjust_head:
8369 return &bpf_xdp_adjust_head_proto;
8370 case BPF_FUNC_xdp_adjust_meta:
8371 return &bpf_xdp_adjust_meta_proto;
8372 case BPF_FUNC_redirect:
8373 return &bpf_xdp_redirect_proto;
8374 case BPF_FUNC_redirect_map:
8375 return &bpf_xdp_redirect_map_proto;
8376 case BPF_FUNC_xdp_adjust_tail:
8377 return &bpf_xdp_adjust_tail_proto;
8378 case BPF_FUNC_xdp_get_buff_len:
8379 return &bpf_xdp_get_buff_len_proto;
8380 case BPF_FUNC_xdp_load_bytes:
8381 return &bpf_xdp_load_bytes_proto;
8382 case BPF_FUNC_xdp_store_bytes:
8383 return &bpf_xdp_store_bytes_proto;
8384 case BPF_FUNC_fib_lookup:
8385 return &bpf_xdp_fib_lookup_proto;
8386 case BPF_FUNC_check_mtu:
8387 return &bpf_xdp_check_mtu_proto;
8388 #ifdef CONFIG_INET
8389 case BPF_FUNC_sk_lookup_udp:
8390 return &bpf_xdp_sk_lookup_udp_proto;
8391 case BPF_FUNC_sk_lookup_tcp:
8392 return &bpf_xdp_sk_lookup_tcp_proto;
8393 case BPF_FUNC_sk_release:
8394 return &bpf_sk_release_proto;
8395 case BPF_FUNC_skc_lookup_tcp:
8396 return &bpf_xdp_skc_lookup_tcp_proto;
8397 case BPF_FUNC_tcp_check_syncookie:
8398 return &bpf_tcp_check_syncookie_proto;
8399 case BPF_FUNC_tcp_gen_syncookie:
8400 return &bpf_tcp_gen_syncookie_proto;
8401 #ifdef CONFIG_SYN_COOKIES
8402 case BPF_FUNC_tcp_raw_gen_syncookie_ipv4:
8403 return &bpf_tcp_raw_gen_syncookie_ipv4_proto;
8404 case BPF_FUNC_tcp_raw_gen_syncookie_ipv6:
8405 return &bpf_tcp_raw_gen_syncookie_ipv6_proto;
8406 case BPF_FUNC_tcp_raw_check_syncookie_ipv4:
8407 return &bpf_tcp_raw_check_syncookie_ipv4_proto;
8408 case BPF_FUNC_tcp_raw_check_syncookie_ipv6:
8409 return &bpf_tcp_raw_check_syncookie_ipv6_proto;
8410 #endif
8411 #endif
8412 default:
8413 return bpf_sk_base_func_proto(func_id, prog);
8414 }
8415
8416 #if IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)
8417 /* The nf_conn___init type is used in the NF_CONNTRACK kfuncs. The
8418 * kfuncs are defined in two different modules, and we want to be able
8419 * to use them interchangeably with the same BTF type ID. Because modules
8420 * can't de-duplicate BTF IDs between each other, we need the type to be
8421 * referenced in the vmlinux BTF or the verifier will get confused about
8422 * the different types. So we add this dummy type reference which will
8423 * be included in vmlinux BTF, allowing both modules to refer to the
8424 * same type ID.
8425 */
8426 BTF_TYPE_EMIT(struct nf_conn___init);
8427 #endif
8428 }
8429
8430 const struct bpf_func_proto bpf_sock_map_update_proto __weak;
8431 const struct bpf_func_proto bpf_sock_hash_update_proto __weak;
8432
8433 static const struct bpf_func_proto *
8434 sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
8435 {
8436 const struct bpf_func_proto *func_proto;
8437
8438 func_proto = cgroup_common_func_proto(func_id, prog);
8439 if (func_proto)
8440 return func_proto;
8441
8442 switch (func_id) {
8443 case BPF_FUNC_setsockopt:
8444 return &bpf_sock_ops_setsockopt_proto;
8445 case BPF_FUNC_getsockopt:
8446 return &bpf_sock_ops_getsockopt_proto;
8447 case BPF_FUNC_sock_ops_cb_flags_set:
8448 return &bpf_sock_ops_cb_flags_set_proto;
8449 case BPF_FUNC_sock_map_update:
8450 return &bpf_sock_map_update_proto;
8451 case BPF_FUNC_sock_hash_update:
8452 return &bpf_sock_hash_update_proto;
8453 case BPF_FUNC_get_socket_cookie:
8454 return &bpf_get_socket_cookie_sock_ops_proto;
8455 case BPF_FUNC_perf_event_output:
8456 return &bpf_event_output_data_proto;
8457 case BPF_FUNC_sk_storage_get:
8458 return &bpf_sk_storage_get_proto;
8459 case BPF_FUNC_sk_storage_delete:
8460 return &bpf_sk_storage_delete_proto;
8461 case BPF_FUNC_get_netns_cookie:
8462 return &bpf_get_netns_cookie_sock_ops_proto;
8463 #ifdef CONFIG_INET
8464 case BPF_FUNC_load_hdr_opt:
8465 return &bpf_sock_ops_load_hdr_opt_proto;
8466 case BPF_FUNC_store_hdr_opt:
8467 return &bpf_sock_ops_store_hdr_opt_proto;
8468 case BPF_FUNC_reserve_hdr_opt:
8469 return &bpf_sock_ops_reserve_hdr_opt_proto;
8470 case BPF_FUNC_tcp_sock:
8471 return &bpf_tcp_sock_proto;
8472 #endif /* CONFIG_INET */
8473 default:
8474 return bpf_sk_base_func_proto(func_id, prog);
8475 }
8476 }
8477
8478 const struct bpf_func_proto bpf_msg_redirect_map_proto __weak;
8479 const struct bpf_func_proto bpf_msg_redirect_hash_proto __weak;
8480
8481 static const struct bpf_func_proto *
8482 sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
8483 {
8484 switch (func_id) {
8485 case BPF_FUNC_msg_redirect_map:
8486 return &bpf_msg_redirect_map_proto;
8487 case BPF_FUNC_msg_redirect_hash:
8488 return &bpf_msg_redirect_hash_proto;
8489 case BPF_FUNC_msg_apply_bytes:
8490 return &bpf_msg_apply_bytes_proto;
8491 case BPF_FUNC_msg_cork_bytes:
8492 return &bpf_msg_cork_bytes_proto;
8493 case BPF_FUNC_msg_pull_data:
8494 return &bpf_msg_pull_data_proto;
8495 case BPF_FUNC_msg_push_data:
8496 return &bpf_msg_push_data_proto;
8497 case BPF_FUNC_msg_pop_data:
8498 return &bpf_msg_pop_data_proto;
8499 case BPF_FUNC_perf_event_output:
8500 return &bpf_event_output_data_proto;
8501 case BPF_FUNC_sk_storage_get:
8502 return &bpf_sk_storage_get_proto;
8503 case BPF_FUNC_sk_storage_delete:
8504 return &bpf_sk_storage_delete_proto;
8505 case BPF_FUNC_get_netns_cookie:
8506 return &bpf_get_netns_cookie_sk_msg_proto;
8507 default:
8508 return bpf_sk_base_func_proto(func_id, prog);
8509 }
8510 }
8511
8512 const struct bpf_func_proto bpf_sk_redirect_map_proto __weak;
8513 const struct bpf_func_proto bpf_sk_redirect_hash_proto __weak;
8514
8515 static const struct bpf_func_proto *
8516 sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
8517 {
8518 switch (func_id) {
8519 case BPF_FUNC_skb_store_bytes:
8520 return &bpf_skb_store_bytes_proto;
8521 case BPF_FUNC_skb_load_bytes:
8522 return &bpf_skb_load_bytes_proto;
8523 case BPF_FUNC_skb_pull_data:
8524 return &sk_skb_pull_data_proto;
8525 case BPF_FUNC_skb_change_tail:
8526 return &sk_skb_change_tail_proto;
8527 case BPF_FUNC_skb_change_head:
8528 return &sk_skb_change_head_proto;
8529 case BPF_FUNC_skb_adjust_room:
8530 return &sk_skb_adjust_room_proto;
8531 case BPF_FUNC_get_socket_cookie:
8532 return &bpf_get_socket_cookie_proto;
8533 case BPF_FUNC_get_socket_uid:
8534 return &bpf_get_socket_uid_proto;
8535 case BPF_FUNC_sk_redirect_map:
8536 return &bpf_sk_redirect_map_proto;
8537 case BPF_FUNC_sk_redirect_hash:
8538 return &bpf_sk_redirect_hash_proto;
8539 case BPF_FUNC_perf_event_output:
8540 return &bpf_skb_event_output_proto;
8541 #ifdef CONFIG_INET
8542 case BPF_FUNC_sk_lookup_tcp:
8543 return &bpf_sk_lookup_tcp_proto;
8544 case BPF_FUNC_sk_lookup_udp:
8545 return &bpf_sk_lookup_udp_proto;
8546 case BPF_FUNC_sk_release:
8547 return &bpf_sk_release_proto;
8548 case BPF_FUNC_skc_lookup_tcp:
8549 return &bpf_skc_lookup_tcp_proto;
8550 #endif
8551 default:
8552 return bpf_sk_base_func_proto(func_id, prog);
8553 }
8554 }
8555
8556 static const struct bpf_func_proto *
8557 flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
8558 {
8559 switch (func_id) {
8560 case BPF_FUNC_skb_load_bytes:
8561 return &bpf_flow_dissector_load_bytes_proto;
8562 default:
8563 return bpf_sk_base_func_proto(func_id, prog);
8564 }
8565 }
8566
8567 static const struct bpf_func_proto *
8568 lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
8569 {
8570 switch (func_id) {
8571 case BPF_FUNC_skb_load_bytes:
8572 return &bpf_skb_load_bytes_proto;
8573 case BPF_FUNC_skb_pull_data:
8574 return &bpf_skb_pull_data_proto;
8575 case BPF_FUNC_csum_diff:
8576 return &bpf_csum_diff_proto;
8577 case BPF_FUNC_get_cgroup_classid:
8578 return &bpf_get_cgroup_classid_proto;
8579 case BPF_FUNC_get_route_realm:
8580 return &bpf_get_route_realm_proto;
8581 case BPF_FUNC_get_hash_recalc:
8582 return &bpf_get_hash_recalc_proto;
8583 case BPF_FUNC_perf_event_output:
8584 return &bpf_skb_event_output_proto;
8585 case BPF_FUNC_get_smp_processor_id:
8586 return &bpf_get_smp_processor_id_proto;
8587 case BPF_FUNC_skb_under_cgroup:
8588 return &bpf_skb_under_cgroup_proto;
8589 default:
8590 return bpf_sk_base_func_proto(func_id, prog);
8591 }
8592 }
8593
8594 static const struct bpf_func_proto *
8595 lwt_in_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
8596 {
8597 switch (func_id) {
8598 case BPF_FUNC_lwt_push_encap:
8599 return &bpf_lwt_in_push_encap_proto;
8600 default:
8601 return lwt_out_func_proto(func_id, prog);
8602 }
8603 }
8604
8605 static const struct bpf_func_proto *
8606 lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
8607 {
8608 switch (func_id) {
8609 case BPF_FUNC_skb_get_tunnel_key:
8610 return &bpf_skb_get_tunnel_key_proto;
8611 case BPF_FUNC_skb_set_tunnel_key:
8612 return bpf_get_skb_set_tunnel_proto(func_id);
8613 case BPF_FUNC_skb_get_tunnel_opt:
8614 return &bpf_skb_get_tunnel_opt_proto;
8615 case BPF_FUNC_skb_set_tunnel_opt:
8616 return bpf_get_skb_set_tunnel_proto(func_id);
8617 case BPF_FUNC_redirect:
8618 return &bpf_redirect_proto;
8619 case BPF_FUNC_clone_redirect:
8620 return &bpf_clone_redirect_proto;
8621 case BPF_FUNC_skb_change_tail:
8622 return &bpf_skb_change_tail_proto;
8623 case BPF_FUNC_skb_change_head:
8624 return &bpf_skb_change_head_proto;
8625 case BPF_FUNC_skb_store_bytes:
8626 return &bpf_skb_store_bytes_proto;
8627 case BPF_FUNC_csum_update:
8628 return &bpf_csum_update_proto;
8629 case BPF_FUNC_csum_level:
8630 return &bpf_csum_level_proto;
8631 case BPF_FUNC_l3_csum_replace:
8632 return &bpf_l3_csum_replace_proto;
8633 case BPF_FUNC_l4_csum_replace:
8634 return &bpf_l4_csum_replace_proto;
8635 case BPF_FUNC_set_hash_invalid:
8636 return &bpf_set_hash_invalid_proto;
8637 case BPF_FUNC_lwt_push_encap:
8638 return &bpf_lwt_xmit_push_encap_proto;
8639 default:
8640 return lwt_out_func_proto(func_id, prog);
8641 }
8642 }
8643
8644 static const struct bpf_func_proto *
8645 lwt_seg6local_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
8646 {
8647 switch (func_id) {
8648 #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
8649 case BPF_FUNC_lwt_seg6_store_bytes:
8650 return &bpf_lwt_seg6_store_bytes_proto;
8651 case BPF_FUNC_lwt_seg6_action:
8652 return &bpf_lwt_seg6_action_proto;
8653 case BPF_FUNC_lwt_seg6_adjust_srh:
8654 return &bpf_lwt_seg6_adjust_srh_proto;
8655 #endif
8656 default:
8657 return lwt_out_func_proto(func_id, prog);
8658 }
8659 }
8660
8661 static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type,
8662 const struct bpf_prog *prog,
8663 struct bpf_insn_access_aux *info)
8664 {
8665 const int size_default = sizeof(__u32);
8666
8667 if (off < 0 || off >= sizeof(struct __sk_buff))
8668 return false;
8669
8670 /* The verifier guarantees that size > 0. */
8671 if (off % size != 0)
8672 return false;
8673
8674 switch (off) {
8675 case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
8676 if (off + size > offsetofend(struct __sk_buff, cb[4]))
8677 return false;
8678 break;
8679 case bpf_ctx_range(struct __sk_buff, data):
8680 case bpf_ctx_range(struct __sk_buff, data_meta):
8681 case bpf_ctx_range(struct __sk_buff, data_end):
8682 if (info->is_ldsx || size != size_default)
8683 return false;
8684 break;
8685 case bpf_ctx_range_till(struct __sk_buff, remote_ip6[0], remote_ip6[3]):
8686 case bpf_ctx_range_till(struct __sk_buff, local_ip6[0], local_ip6[3]):
8687 case bpf_ctx_range_till(struct __sk_buff, remote_ip4, remote_ip4):
8688 case bpf_ctx_range_till(struct __sk_buff, local_ip4, local_ip4):
8689 if (size != size_default)
8690 return false;
8691 break;
8692 case bpf_ctx_range_ptr(struct __sk_buff, flow_keys):
8693 return false;
8694 case bpf_ctx_range(struct __sk_buff, hwtstamp):
8695 if (type == BPF_WRITE || size != sizeof(__u64))
8696 return false;
8697 break;
8698 case bpf_ctx_range(struct __sk_buff, tstamp):
8699 if (size != sizeof(__u64))
8700 return false;
8701 break;
8702 case bpf_ctx_range_ptr(struct __sk_buff, sk):
8703 if (type == BPF_WRITE || size != sizeof(__u64))
8704 return false;
8705 info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL;
8706 break;
8707 case offsetof(struct __sk_buff, tstamp_type):
8708 return false;
8709 case offsetofend(struct __sk_buff, tstamp_type) ... offsetof(struct __sk_buff, hwtstamp) - 1:
8710 /* Explicitly prohibit access to padding in __sk_buff. */
8711 return false;
8712 default:
8713 /* Only narrow read access allowed for now. */
8714 if (type == BPF_WRITE) {
8715 if (size != size_default)
8716 return false;
8717 } else {
8718 bpf_ctx_record_field_size(info, size_default);
8719 if (!bpf_ctx_narrow_access_ok(off, size, size_default))
8720 return false;
8721 }
8722 }
8723
8724 return true;
8725 }
8726
8727 static bool sk_filter_is_valid_access(int off, int size,
8728 enum bpf_access_type type,
8729 const struct bpf_prog *prog,
8730 struct bpf_insn_access_aux *info)
8731 {
8732 switch (off) {
8733 case bpf_ctx_range(struct __sk_buff, tc_classid):
8734 case bpf_ctx_range(struct __sk_buff, data):
8735 case bpf_ctx_range(struct __sk_buff, data_meta):
8736 case bpf_ctx_range(struct __sk_buff, data_end):
8737 case bpf_ctx_range_till(struct __sk_buff, family, local_port):
8738 case bpf_ctx_range(struct __sk_buff, tstamp):
8739 case bpf_ctx_range(struct __sk_buff, wire_len):
8740 case bpf_ctx_range(struct __sk_buff, hwtstamp):
8741 return false;
8742 }
8743
8744 if (type == BPF_WRITE) {
8745 switch (off) {
8746 case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
8747 break;
8748 default:
8749 return false;
8750 }
8751 }
8752
8753 return bpf_skb_is_valid_access(off, size, type, prog, info);
8754 }
8755
8756 static bool cg_skb_is_valid_access(int off, int size,
8757 enum bpf_access_type type,
8758 const struct bpf_prog *prog,
8759 struct bpf_insn_access_aux *info)
8760 {
8761 switch (off) {
8762 case bpf_ctx_range(struct __sk_buff, tc_classid):
8763 case bpf_ctx_range(struct __sk_buff, data_meta):
8764 case bpf_ctx_range(struct __sk_buff, wire_len):
8765 return false;
8766 case bpf_ctx_range(struct __sk_buff, data):
8767 case bpf_ctx_range(struct __sk_buff, data_end):
8768 if (!bpf_token_capable(prog->aux->token, CAP_BPF))
8769 return false;
8770 break;
8771 }
8772
8773 if (type == BPF_WRITE) {
8774 switch (off) {
8775 case bpf_ctx_range(struct __sk_buff, mark):
8776 case bpf_ctx_range(struct __sk_buff, priority):
8777 case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
8778 break;
8779 case bpf_ctx_range(struct __sk_buff, tstamp):
8780 if (!bpf_token_capable(prog->aux->token, CAP_BPF))
8781 return false;
8782 break;
8783 default:
8784 return false;
8785 }
8786 }
8787
8788 switch (off) {
8789 case bpf_ctx_range(struct __sk_buff, data):
8790 info->reg_type = PTR_TO_PACKET;
8791 break;
8792 case bpf_ctx_range(struct __sk_buff, data_end):
8793 info->reg_type = PTR_TO_PACKET_END;
8794 break;
8795 }
8796
8797 return bpf_skb_is_valid_access(off, size, type, prog, info);
8798 }
8799
8800 static bool lwt_is_valid_access(int off, int size,
8801 enum bpf_access_type type,
8802 const struct bpf_prog *prog,
8803 struct bpf_insn_access_aux *info)
8804 {
8805 switch (off) {
8806 case bpf_ctx_range(struct __sk_buff, tc_classid):
8807 case bpf_ctx_range_till(struct __sk_buff, family, local_port):
8808 case bpf_ctx_range(struct __sk_buff, data_meta):
8809 case bpf_ctx_range(struct __sk_buff, tstamp):
8810 case bpf_ctx_range(struct __sk_buff, wire_len):
8811 case bpf_ctx_range(struct __sk_buff, hwtstamp):
8812 return false;
8813 }
8814
8815 if (type == BPF_WRITE) {
8816 switch (off) {
8817 case bpf_ctx_range(struct __sk_buff, mark):
8818 case bpf_ctx_range(struct __sk_buff, priority):
8819 case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
8820 break;
8821 default:
8822 return false;
8823 }
8824 }
8825
8826 switch (off) {
8827 case bpf_ctx_range(struct __sk_buff, data):
8828 info->reg_type = PTR_TO_PACKET;
8829 break;
8830 case bpf_ctx_range(struct __sk_buff, data_end):
8831 info->reg_type = PTR_TO_PACKET_END;
8832 break;
8833 }
8834
8835 return bpf_skb_is_valid_access(off, size, type, prog, info);
8836 }
8837
8838 /* Attach type specific accesses */
8839 static bool __sock_filter_check_attach_type(int off,
8840 enum bpf_access_type access_type,
8841 enum bpf_attach_type attach_type)
8842 {
8843 switch (off) {
8844 case offsetof(struct bpf_sock, bound_dev_if):
8845 case offsetof(struct bpf_sock, mark):
8846 case offsetof(struct bpf_sock, priority):
8847 switch (attach_type) {
8848 case BPF_CGROUP_INET_SOCK_CREATE:
8849 case BPF_CGROUP_INET_SOCK_RELEASE:
8850 goto full_access;
8851 default:
8852 return false;
8853 }
8854 case bpf_ctx_range(struct bpf_sock, src_ip4):
8855 switch (attach_type) {
8856 case BPF_CGROUP_INET4_POST_BIND:
8857 goto read_only;
8858 default:
8859 return false;
8860 }
8861 case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]):
8862 switch (attach_type) {
8863 case BPF_CGROUP_INET6_POST_BIND:
8864 goto read_only;
8865 default:
8866 return false;
8867 }
8868 case bpf_ctx_range(struct bpf_sock, src_port):
8869 switch (attach_type) {
8870 case BPF_CGROUP_INET4_POST_BIND:
8871 case BPF_CGROUP_INET6_POST_BIND:
8872 goto read_only;
8873 default:
8874 return false;
8875 }
8876 }
8877 read_only:
8878 return access_type == BPF_READ;
8879 full_access:
8880 return true;
8881 }
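/* Illustrative sketch (not part of this file; SEC() and the section name come
 * from libbpf conventions, and the values written are hypothetical): a minimal
 * BPF_CGROUP_INET_SOCK_CREATE program exercising the full_access case above.
 *
 *	SEC("cgroup/sock")
 *	int sock_create_prog(struct bpf_sock *sk)
 *	{
 *		sk->mark = 42;		// hypothetical mark
 *		sk->bound_dev_if = 1;	// hypothetical ifindex
 *		return 1;		// allow the socket
 *	}
 *
 * Under the POST_BIND attach types, by contrast, src_ip4/src_ip6[]/src_port
 * are accessible but only for reads (the read_only label above).
 */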
8882
8883 bool bpf_sock_common_is_valid_access(int off, int size,
8884 enum bpf_access_type type,
8885 struct bpf_insn_access_aux *info)
8886 {
8887 switch (off) {
8888 case bpf_ctx_range_till(struct bpf_sock, type, priority):
8889 return false;
8890 default:
8891 return bpf_sock_is_valid_access(off, size, type, info);
8892 }
8893 }
8894
8895 bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type,
8896 struct bpf_insn_access_aux *info)
8897 {
8898 const int size_default = sizeof(__u32);
8899 int field_size;
8900
8901 if (off < 0 || off >= sizeof(struct bpf_sock))
8902 return false;
8903 if (off % size != 0)
8904 return false;
8905
8906 switch (off) {
8907 case offsetof(struct bpf_sock, state):
8908 case offsetof(struct bpf_sock, family):
8909 case offsetof(struct bpf_sock, type):
8910 case offsetof(struct bpf_sock, protocol):
8911 case offsetof(struct bpf_sock, src_port):
8912 case offsetof(struct bpf_sock, rx_queue_mapping):
8913 case bpf_ctx_range(struct bpf_sock, src_ip4):
8914 case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]):
8915 case bpf_ctx_range(struct bpf_sock, dst_ip4):
8916 case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]):
8917 bpf_ctx_record_field_size(info, size_default);
8918 return bpf_ctx_narrow_access_ok(off, size, size_default);
8919 case bpf_ctx_range(struct bpf_sock, dst_port):
8920 field_size = size == size_default ?
8921 size_default : sizeof_field(struct bpf_sock, dst_port);
8922 bpf_ctx_record_field_size(info, field_size);
8923 return bpf_ctx_narrow_access_ok(off, size, field_size);
8924 case offsetofend(struct bpf_sock, dst_port) ...
8925 offsetof(struct bpf_sock, dst_ip4) - 1:
8926 return false;
8927 }
8928
8929 return size == size_default;
8930 }
8931
8932 static bool sock_filter_is_valid_access(int off, int size,
8933 enum bpf_access_type type,
8934 const struct bpf_prog *prog,
8935 struct bpf_insn_access_aux *info)
8936 {
8937 if (!bpf_sock_is_valid_access(off, size, type, info))
8938 return false;
8939 return __sock_filter_check_attach_type(off, type,
8940 prog->expected_attach_type);
8941 }
8942
8943 static int bpf_noop_prologue(struct bpf_insn *insn_buf, bool direct_write,
8944 const struct bpf_prog *prog)
8945 {
8946 /* Neither direct read nor direct write requires any preliminary
8947 * action.
8948 */
8949 return 0;
8950 }
8951
8952 static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write,
8953 const struct bpf_prog *prog, int drop_verdict)
8954 {
8955 struct bpf_insn *insn = insn_buf;
8956
8957 if (!direct_write)
8958 return 0;
8959
8960 /* if (!skb->cloned)
8961 * goto start;
8962 *
8963 * (Fast path; otherwise we conservatively assume the skb
8964 * might be a clone and do the rest in the helper.)
8965 */
8966 *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET);
8967 *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_6, CLONED_MASK);
8968 *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 7);
8969
8970 /* ret = bpf_skb_pull_data(skb, 0); */
8971 *insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1);
8972 *insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_2, BPF_REG_2);
8973 *insn++ = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
8974 BPF_FUNC_skb_pull_data);
8975 /* if (!ret)
8976 * goto restore;
8977 * return TC_ACT_SHOT;
8978 */
8979 *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2);
8980 *insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, drop_verdict);
8981 *insn++ = BPF_EXIT_INSN();
8982
8983 /* restore: */
8984 *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6);
8985 /* start: */
8986 *insn++ = prog->insnsi[0];
8987
8988 return insn - insn_buf;
8989 }
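/* Roughly, the prologue emitted above is equivalent to prefixing the program
 * with (sketch, not literal kernel code):
 *
 *	if (skb_cloned(skb) && bpf_skb_pull_data(skb, 0))
 *		return drop_verdict;
 *
 * so that direct packet writes never touch shared (cloned) data.
 */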
8990
8991 static int bpf_gen_ld_abs(const struct bpf_insn *orig,
8992 struct bpf_insn *insn_buf)
8993 {
8994 bool indirect = BPF_MODE(orig->code) == BPF_IND;
8995 struct bpf_insn *insn = insn_buf;
8996
8997 if (!indirect) {
8998 *insn++ = BPF_MOV64_IMM(BPF_REG_2, orig->imm);
8999 } else {
9000 *insn++ = BPF_MOV64_REG(BPF_REG_2, orig->src_reg);
9001 if (orig->imm)
9002 *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, orig->imm);
9003 }
9004 /* We're guaranteed here that CTX is in R6. */
9005 *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_CTX);
9006
9007 switch (BPF_SIZE(orig->code)) {
9008 case BPF_B:
9009 *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8_no_cache);
9010 break;
9011 case BPF_H:
9012 *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16_no_cache);
9013 break;
9014 case BPF_W:
9015 *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32_no_cache);
9016 break;
9017 }
9018
9019 *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 2);
9020 *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_0, BPF_REG_0);
9021 *insn++ = BPF_EXIT_INSN();
9022
9023 return insn - insn_buf;
9024 }
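/* Example (sketch): the single eBPF instruction BPF_LD_ABS(BPF_H, 12), i.e. a
 * classic "load halfword at absolute offset 12", is expanded by the function
 * above into roughly:
 *
 *	R2 = 12;
 *	R1 = R6;				// skb is kept in R6
 *	R0 = bpf_skb_load_helper_16_no_cache(R1, R2);
 *	if (R0 < 0) {				// signed check
 *		R0 = 0;
 *		return R0;			// failed load terminates the prog
 *	}
 */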
9025
9026 static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
9027 const struct bpf_prog *prog)
9028 {
9029 return bpf_unclone_prologue(insn_buf, direct_write, prog, TC_ACT_SHOT);
9030 }
9031
9032 static bool tc_cls_act_is_valid_access(int off, int size,
9033 enum bpf_access_type type,
9034 const struct bpf_prog *prog,
9035 struct bpf_insn_access_aux *info)
9036 {
9037 if (type == BPF_WRITE) {
9038 switch (off) {
9039 case bpf_ctx_range(struct __sk_buff, mark):
9040 case bpf_ctx_range(struct __sk_buff, tc_index):
9041 case bpf_ctx_range(struct __sk_buff, priority):
9042 case bpf_ctx_range(struct __sk_buff, tc_classid):
9043 case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
9044 case bpf_ctx_range(struct __sk_buff, tstamp):
9045 case bpf_ctx_range(struct __sk_buff, queue_mapping):
9046 break;
9047 default:
9048 return false;
9049 }
9050 }
9051
9052 switch (off) {
9053 case bpf_ctx_range(struct __sk_buff, data):
9054 info->reg_type = PTR_TO_PACKET;
9055 break;
9056 case bpf_ctx_range(struct __sk_buff, data_meta):
9057 info->reg_type = PTR_TO_PACKET_META;
9058 break;
9059 case bpf_ctx_range(struct __sk_buff, data_end):
9060 info->reg_type = PTR_TO_PACKET_END;
9061 break;
9062 case bpf_ctx_range_till(struct __sk_buff, family, local_port):
9063 return false;
9064 case offsetof(struct __sk_buff, tstamp_type):
9065 /* The convert_ctx_access() on reading and writing
9066 * __sk_buff->tstamp depends on whether the bpf prog
9067 * has used __sk_buff->tstamp_type or not.
9068 * Thus, we need to set prog->tstamp_type_access
9069 * earlier during is_valid_access() here.
9070 */
9071 ((struct bpf_prog *)prog)->tstamp_type_access = 1;
9072 return size == sizeof(__u8);
9073 }
9074
9075 return bpf_skb_is_valid_access(off, size, type, prog, info);
9076 }
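/* Illustrative sketch (not part of this file; libbpf conventions assumed):
 * a tc program that reads tstamp_type. The 1-byte read below is what flips
 * prog->tstamp_type_access and thereby changes how tstamp accesses are
 * converted (see bpf_convert_tstamp_read/write further down).
 *
 *	SEC("tc")
 *	int tc_prog(struct __sk_buff *skb)
 *	{
 *		__u64 t = 0;
 *
 *		if (skb->tstamp_type == BPF_SKB_CLOCK_MONOTONIC)
 *			t = skb->tstamp;	// delivery time, read as-is
 *		return TC_ACT_OK;
 *	}
 */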
9077
9078 DEFINE_MUTEX(nf_conn_btf_access_lock);
9079 EXPORT_SYMBOL_GPL(nf_conn_btf_access_lock);
9080
9081 int (*nfct_btf_struct_access)(struct bpf_verifier_log *log,
9082 const struct bpf_reg_state *reg,
9083 int off, int size);
9084 EXPORT_SYMBOL_GPL(nfct_btf_struct_access);
9085
9086 static int tc_cls_act_btf_struct_access(struct bpf_verifier_log *log,
9087 const struct bpf_reg_state *reg,
9088 int off, int size)
9089 {
9090 int ret = -EACCES;
9091
9092 mutex_lock(&nf_conn_btf_access_lock);
9093 if (nfct_btf_struct_access)
9094 ret = nfct_btf_struct_access(log, reg, off, size);
9095 mutex_unlock(&nf_conn_btf_access_lock);
9096
9097 return ret;
9098 }
9099
9100 static bool __is_valid_xdp_access(int off, int size)
9101 {
9102 if (off < 0 || off >= sizeof(struct xdp_md))
9103 return false;
9104 if (off % size != 0)
9105 return false;
9106 if (size != sizeof(__u32))
9107 return false;
9108
9109 return true;
9110 }
9111
9112 static bool xdp_is_valid_access(int off, int size,
9113 enum bpf_access_type type,
9114 const struct bpf_prog *prog,
9115 struct bpf_insn_access_aux *info)
9116 {
9117 if (prog->expected_attach_type != BPF_XDP_DEVMAP) {
9118 switch (off) {
9119 case offsetof(struct xdp_md, egress_ifindex):
9120 return false;
9121 }
9122 }
9123
9124 if (type == BPF_WRITE) {
9125 if (bpf_prog_is_offloaded(prog->aux)) {
9126 switch (off) {
9127 case offsetof(struct xdp_md, rx_queue_index):
9128 return __is_valid_xdp_access(off, size);
9129 }
9130 }
9131 return false;
9132 } else {
9133 switch (off) {
9134 case offsetof(struct xdp_md, data_meta):
9135 case offsetof(struct xdp_md, data):
9136 case offsetof(struct xdp_md, data_end):
9137 if (info->is_ldsx)
9138 return false;
9139 }
9140 }
9141
9142 switch (off) {
9143 case offsetof(struct xdp_md, data):
9144 info->reg_type = PTR_TO_PACKET;
9145 break;
9146 case offsetof(struct xdp_md, data_meta):
9147 info->reg_type = PTR_TO_PACKET_META;
9148 break;
9149 case offsetof(struct xdp_md, data_end):
9150 info->reg_type = PTR_TO_PACKET_END;
9151 break;
9152 }
9153
9154 return __is_valid_xdp_access(off, size);
9155 }
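/* Illustrative sketch (not part of this file; SEC() and bpf_htons() come from
 * libbpf headers, ETH_P_IP from linux/if_ether.h): the canonical way an XDP
 * program consumes the data/data_end pointers whose register types are set
 * above; the verifier enforces the explicit bounds check.
 *
 *	SEC("xdp")
 *	int xdp_prog(struct xdp_md *ctx)
 *	{
 *		void *data = (void *)(long)ctx->data;
 *		void *data_end = (void *)(long)ctx->data_end;
 *		struct ethhdr *eth = data;
 *
 *		if ((void *)(eth + 1) > data_end)
 *			return XDP_DROP;	// out-of-bounds access rejected
 *		return eth->h_proto == bpf_htons(ETH_P_IP) ? XDP_PASS : XDP_DROP;
 *	}
 */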
9156
9157 void bpf_warn_invalid_xdp_action(const struct net_device *dev,
9158 const struct bpf_prog *prog, u32 act)
9159 {
9160 const u32 act_max = XDP_REDIRECT;
9161
9162 pr_warn_once("%s XDP return value %u on prog %s (id %d) dev %s, expect packet loss!\n",
9163 act > act_max ? "Illegal" : "Driver unsupported",
9164 act, prog->aux->name, prog->aux->id, dev ? dev->name : "N/A");
9165 }
9166 EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
9167
9168 static int xdp_btf_struct_access(struct bpf_verifier_log *log,
9169 const struct bpf_reg_state *reg,
9170 int off, int size)
9171 {
9172 int ret = -EACCES;
9173
9174 mutex_lock(&nf_conn_btf_access_lock);
9175 if (nfct_btf_struct_access)
9176 ret = nfct_btf_struct_access(log, reg, off, size);
9177 mutex_unlock(&nf_conn_btf_access_lock);
9178
9179 return ret;
9180 }
9181
9182 static bool sock_addr_is_valid_access(int off, int size,
9183 enum bpf_access_type type,
9184 const struct bpf_prog *prog,
9185 struct bpf_insn_access_aux *info)
9186 {
9187 const int size_default = sizeof(__u32);
9188
9189 if (off < 0 || off >= sizeof(struct bpf_sock_addr))
9190 return false;
9191 if (off % size != 0)
9192 return false;
9193
9194 /* Disallow access to fields not belonging to the attach type's address
9195 * family.
9196 */
9197 switch (off) {
9198 case bpf_ctx_range(struct bpf_sock_addr, user_ip4):
9199 switch (prog->expected_attach_type) {
9200 case BPF_CGROUP_INET4_BIND:
9201 case BPF_CGROUP_INET4_CONNECT:
9202 case BPF_CGROUP_INET4_GETPEERNAME:
9203 case BPF_CGROUP_INET4_GETSOCKNAME:
9204 case BPF_CGROUP_UDP4_SENDMSG:
9205 case BPF_CGROUP_UDP4_RECVMSG:
9206 break;
9207 default:
9208 return false;
9209 }
9210 break;
9211 case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]):
9212 switch (prog->expected_attach_type) {
9213 case BPF_CGROUP_INET6_BIND:
9214 case BPF_CGROUP_INET6_CONNECT:
9215 case BPF_CGROUP_INET6_GETPEERNAME:
9216 case BPF_CGROUP_INET6_GETSOCKNAME:
9217 case BPF_CGROUP_UDP6_SENDMSG:
9218 case BPF_CGROUP_UDP6_RECVMSG:
9219 break;
9220 default:
9221 return false;
9222 }
9223 break;
9224 case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4):
9225 switch (prog->expected_attach_type) {
9226 case BPF_CGROUP_UDP4_SENDMSG:
9227 break;
9228 default:
9229 return false;
9230 }
9231 break;
9232 case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
9233 msg_src_ip6[3]):
9234 switch (prog->expected_attach_type) {
9235 case BPF_CGROUP_UDP6_SENDMSG:
9236 break;
9237 default:
9238 return false;
9239 }
9240 break;
9241 }
9242
9243 switch (off) {
9244 case bpf_ctx_range(struct bpf_sock_addr, user_ip4):
9245 case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]):
9246 case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4):
9247 case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
9248 msg_src_ip6[3]):
9249 case bpf_ctx_range(struct bpf_sock_addr, user_port):
9250 if (type == BPF_READ) {
9251 bpf_ctx_record_field_size(info, size_default);
9252
9253 if (bpf_ctx_wide_access_ok(off, size,
9254 struct bpf_sock_addr,
9255 user_ip6))
9256 return true;
9257
9258 if (bpf_ctx_wide_access_ok(off, size,
9259 struct bpf_sock_addr,
9260 msg_src_ip6))
9261 return true;
9262
9263 if (!bpf_ctx_narrow_access_ok(off, size, size_default))
9264 return false;
9265 } else {
9266 if (bpf_ctx_wide_access_ok(off, size,
9267 struct bpf_sock_addr,
9268 user_ip6))
9269 return true;
9270
9271 if (bpf_ctx_wide_access_ok(off, size,
9272 struct bpf_sock_addr,
9273 msg_src_ip6))
9274 return true;
9275
9276 if (size != size_default)
9277 return false;
9278 }
9279 break;
9280 case bpf_ctx_range_ptr(struct bpf_sock_addr, sk):
9281 if (type != BPF_READ)
9282 return false;
9283 if (size != sizeof(__u64))
9284 return false;
9285 info->reg_type = PTR_TO_SOCKET;
9286 break;
9287 default:
9288 if (type == BPF_READ) {
9289 if (size != size_default)
9290 return false;
9291 } else {
9292 return false;
9293 }
9294 }
9295
9296 return true;
9297 }
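/* Illustrative sketch (not part of this file; libbpf conventions assumed and
 * the addresses/ports are hypothetical): a connect4 program that both reads
 * and rewrites the IPv4 fields permitted for its attach type above.
 *
 *	SEC("cgroup/connect4")
 *	int connect4_prog(struct bpf_sock_addr *ctx)
 *	{
 *		if (ctx->user_ip4 == bpf_htonl(0x0a000001) &&	// 10.0.0.1
 *		    ctx->user_port == bpf_htons(80)) {
 *			ctx->user_ip4 = bpf_htonl(0x0a000002);	// 10.0.0.2
 *			ctx->user_port = bpf_htons(8080);
 *		}
 *		return 1;	// let the connect() proceed
 *	}
 */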
9298
9299 static bool sock_ops_is_valid_access(int off, int size,
9300 enum bpf_access_type type,
9301 const struct bpf_prog *prog,
9302 struct bpf_insn_access_aux *info)
9303 {
9304 const int size_default = sizeof(__u32);
9305
9306 if (off < 0 || off >= sizeof(struct bpf_sock_ops))
9307 return false;
9308
9309 /* The verifier guarantees that size > 0. */
9310 if (off % size != 0)
9311 return false;
9312
9313 if (type == BPF_WRITE) {
9314 switch (off) {
9315 case offsetof(struct bpf_sock_ops, reply):
9316 case offsetof(struct bpf_sock_ops, sk_txhash):
9317 if (size != size_default)
9318 return false;
9319 break;
9320 default:
9321 return false;
9322 }
9323 } else {
9324 switch (off) {
9325 case bpf_ctx_range_till(struct bpf_sock_ops, bytes_received,
9326 bytes_acked):
9327 if (size != sizeof(__u64))
9328 return false;
9329 break;
9330 case bpf_ctx_range_ptr(struct bpf_sock_ops, sk):
9331 if (size != sizeof(__u64))
9332 return false;
9333 info->reg_type = PTR_TO_SOCKET_OR_NULL;
9334 break;
9335 case bpf_ctx_range_ptr(struct bpf_sock_ops, skb_data):
9336 if (size != sizeof(__u64))
9337 return false;
9338 info->reg_type = PTR_TO_PACKET;
9339 break;
9340 case bpf_ctx_range_ptr(struct bpf_sock_ops, skb_data_end):
9341 if (size != sizeof(__u64))
9342 return false;
9343 info->reg_type = PTR_TO_PACKET_END;
9344 break;
9345 case offsetof(struct bpf_sock_ops, skb_tcp_flags):
9346 bpf_ctx_record_field_size(info, size_default);
9347 return bpf_ctx_narrow_access_ok(off, size,
9348 size_default);
9349 case bpf_ctx_range(struct bpf_sock_ops, skb_hwtstamp):
9350 if (size != sizeof(__u64))
9351 return false;
9352 break;
9353 default:
9354 if (size != size_default)
9355 return false;
9356 break;
9357 }
9358 }
9359
9360 return true;
9361 }
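/* Illustrative sketch (not part of this file; libbpf conventions assumed, the
 * rwnd value is hypothetical): a sock_ops program writing one of the two
 * writable fields (reply, sk_txhash) checked above.
 *
 *	SEC("sockops")
 *	int sockops_prog(struct bpf_sock_ops *skops)
 *	{
 *		if (skops->op == BPF_SOCK_OPS_RWND_INIT) {
 *			skops->reply = 40;	// suggested initial rwnd
 *			return 1;
 *		}
 *		return 1;
 *	}
 */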
9362
9363 static int sk_skb_prologue(struct bpf_insn *insn_buf, bool direct_write,
9364 const struct bpf_prog *prog)
9365 {
9366 return bpf_unclone_prologue(insn_buf, direct_write, prog, SK_DROP);
9367 }
9368
9369 static bool sk_skb_is_valid_access(int off, int size,
9370 enum bpf_access_type type,
9371 const struct bpf_prog *prog,
9372 struct bpf_insn_access_aux *info)
9373 {
9374 switch (off) {
9375 case bpf_ctx_range(struct __sk_buff, tc_classid):
9376 case bpf_ctx_range(struct __sk_buff, data_meta):
9377 case bpf_ctx_range(struct __sk_buff, tstamp):
9378 case bpf_ctx_range(struct __sk_buff, wire_len):
9379 case bpf_ctx_range(struct __sk_buff, hwtstamp):
9380 return false;
9381 }
9382
9383 if (type == BPF_WRITE) {
9384 switch (off) {
9385 case bpf_ctx_range(struct __sk_buff, tc_index):
9386 case bpf_ctx_range(struct __sk_buff, priority):
9387 break;
9388 default:
9389 return false;
9390 }
9391 }
9392
9393 switch (off) {
9394 case bpf_ctx_range(struct __sk_buff, mark):
9395 return false;
9396 case bpf_ctx_range(struct __sk_buff, data):
9397 info->reg_type = PTR_TO_PACKET;
9398 break;
9399 case bpf_ctx_range(struct __sk_buff, data_end):
9400 info->reg_type = PTR_TO_PACKET_END;
9401 break;
9402 }
9403
9404 return bpf_skb_is_valid_access(off, size, type, prog, info);
9405 }
9406
9407 static bool sk_msg_is_valid_access(int off, int size,
9408 enum bpf_access_type type,
9409 const struct bpf_prog *prog,
9410 struct bpf_insn_access_aux *info)
9411 {
9412 if (type == BPF_WRITE)
9413 return false;
9414
9415 if (off % size != 0)
9416 return false;
9417
9418 switch (off) {
9419 case bpf_ctx_range_ptr(struct sk_msg_md, data):
9420 info->reg_type = PTR_TO_PACKET;
9421 if (size != sizeof(__u64))
9422 return false;
9423 break;
9424 case bpf_ctx_range_ptr(struct sk_msg_md, data_end):
9425 info->reg_type = PTR_TO_PACKET_END;
9426 if (size != sizeof(__u64))
9427 return false;
9428 break;
9429 case bpf_ctx_range_ptr(struct sk_msg_md, sk):
9430 if (size != sizeof(__u64))
9431 return false;
9432 info->reg_type = PTR_TO_SOCKET;
9433 break;
9434 case bpf_ctx_range(struct sk_msg_md, family):
9435 case bpf_ctx_range(struct sk_msg_md, remote_ip4):
9436 case bpf_ctx_range(struct sk_msg_md, local_ip4):
9437 case bpf_ctx_range_till(struct sk_msg_md, remote_ip6[0], remote_ip6[3]):
9438 case bpf_ctx_range_till(struct sk_msg_md, local_ip6[0], local_ip6[3]):
9439 case bpf_ctx_range(struct sk_msg_md, remote_port):
9440 case bpf_ctx_range(struct sk_msg_md, local_port):
9441 case bpf_ctx_range(struct sk_msg_md, size):
9442 if (size != sizeof(__u32))
9443 return false;
9444 break;
9445 default:
9446 return false;
9447 }
9448 return true;
9449 }
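/* Illustrative sketch (not part of this file; libbpf conventions assumed):
 * an sk_msg program using the data/data_end pointers typed above.
 *
 *	SEC("sk_msg")
 *	int msg_prog(struct sk_msg_md *msg)
 *	{
 *		void *data = (void *)(long)msg->data;
 *		void *data_end = (void *)(long)msg->data_end;
 *
 *		if (data + 4 > data_end)
 *			return SK_PASS;		// not enough linear data
 *		return *(__u32 *)data ? SK_PASS : SK_DROP;
 *	}
 */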
9450
9451 static bool flow_dissector_is_valid_access(int off, int size,
9452 enum bpf_access_type type,
9453 const struct bpf_prog *prog,
9454 struct bpf_insn_access_aux *info)
9455 {
9456 const int size_default = sizeof(__u32);
9457
9458 if (off < 0 || off >= sizeof(struct __sk_buff))
9459 return false;
9460
9461 if (off % size != 0)
9462 return false;
9463
9464 if (type == BPF_WRITE)
9465 return false;
9466
9467 switch (off) {
9468 case bpf_ctx_range(struct __sk_buff, data):
9469 if (info->is_ldsx || size != size_default)
9470 return false;
9471 info->reg_type = PTR_TO_PACKET;
9472 return true;
9473 case bpf_ctx_range(struct __sk_buff, data_end):
9474 if (info->is_ldsx || size != size_default)
9475 return false;
9476 info->reg_type = PTR_TO_PACKET_END;
9477 return true;
9478 case bpf_ctx_range_ptr(struct __sk_buff, flow_keys):
9479 if (size != sizeof(__u64))
9480 return false;
9481 info->reg_type = PTR_TO_FLOW_KEYS;
9482 return true;
9483 default:
9484 return false;
9485 }
9486 }
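/* Illustrative sketch (not part of this file; libbpf conventions assumed and
 * this is not a functional dissector): flow_keys is handed to the program as
 * a plain pointer (PTR_TO_FLOW_KEYS), so its fields are accessed directly,
 * without the data/data_end style bounds check.
 *
 *	SEC("flow_dissector")
 *	int dissect(struct __sk_buff *skb)
 *	{
 *		struct bpf_flow_keys *keys = skb->flow_keys;
 *
 *		keys->ip_proto = IPPROTO_TCP;	// hypothetical result
 *		return BPF_OK;
 *	}
 */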
9487
9488 static u32 flow_dissector_convert_ctx_access(enum bpf_access_type type,
9489 const struct bpf_insn *si,
9490 struct bpf_insn *insn_buf,
9491 struct bpf_prog *prog,
9492 u32 *target_size)
9493
9494 {
9495 struct bpf_insn *insn = insn_buf;
9496
9497 switch (si->off) {
9498 case offsetof(struct __sk_buff, data):
9499 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, data),
9500 si->dst_reg, si->src_reg,
9501 offsetof(struct bpf_flow_dissector, data));
9502 break;
9503
9504 case offsetof(struct __sk_buff, data_end):
9505 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, data_end),
9506 si->dst_reg, si->src_reg,
9507 offsetof(struct bpf_flow_dissector, data_end));
9508 break;
9509
9510 case offsetof(struct __sk_buff, flow_keys):
9511 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, flow_keys),
9512 si->dst_reg, si->src_reg,
9513 offsetof(struct bpf_flow_dissector, flow_keys));
9514 break;
9515 }
9516
9517 return insn - insn_buf;
9518 }
9519
9520 static struct bpf_insn *bpf_convert_tstamp_type_read(const struct bpf_insn *si,
9521 struct bpf_insn *insn)
9522 {
9523 __u8 value_reg = si->dst_reg;
9524 __u8 skb_reg = si->src_reg;
9525 BUILD_BUG_ON(__SKB_CLOCK_MAX != (int)BPF_SKB_CLOCK_TAI);
9526 BUILD_BUG_ON(SKB_CLOCK_REALTIME != (int)BPF_SKB_CLOCK_REALTIME);
9527 BUILD_BUG_ON(SKB_CLOCK_MONOTONIC != (int)BPF_SKB_CLOCK_MONOTONIC);
9528 BUILD_BUG_ON(SKB_CLOCK_TAI != (int)BPF_SKB_CLOCK_TAI);
9529 *insn++ = BPF_LDX_MEM(BPF_B, value_reg, skb_reg, SKB_BF_MONO_TC_OFFSET);
9530 *insn++ = BPF_ALU32_IMM(BPF_AND, value_reg, SKB_TSTAMP_TYPE_MASK);
9531 #ifdef __BIG_ENDIAN_BITFIELD
9532 *insn++ = BPF_ALU32_IMM(BPF_RSH, value_reg, SKB_TSTAMP_TYPE_RSHIFT);
9533 #else
9534 BUILD_BUG_ON(!(SKB_TSTAMP_TYPE_MASK & 0x1));
9535 #endif
9536
9537 return insn;
9538 }
9539
9540 static struct bpf_insn *bpf_convert_shinfo_access(__u8 dst_reg, __u8 skb_reg,
9541 struct bpf_insn *insn)
9542 {
9543 /* si->dst_reg = skb_shinfo(SKB); */
9544 #ifdef NET_SKBUFF_DATA_USES_OFFSET
9545 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end),
9546 BPF_REG_AX, skb_reg,
9547 offsetof(struct sk_buff, end));
9548 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, head),
9549 dst_reg, skb_reg,
9550 offsetof(struct sk_buff, head));
9551 *insn++ = BPF_ALU64_REG(BPF_ADD, dst_reg, BPF_REG_AX);
9552 #else
9553 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end),
9554 dst_reg, skb_reg,
9555 offsetof(struct sk_buff, end));
9556 #endif
9557
9558 return insn;
9559 }
9560
9561 static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog,
9562 const struct bpf_insn *si,
9563 struct bpf_insn *insn)
9564 {
9565 __u8 value_reg = si->dst_reg;
9566 __u8 skb_reg = si->src_reg;
9567
9568 #ifdef CONFIG_NET_XGRESS
9569 /* If tstamp_type has been read,
9570 * the bpf prog is aware that tstamp may carry a delivery time.
9571 * Thus, read skb->tstamp as-is if tstamp_type_access is true.
9572 */
9573 if (!prog->tstamp_type_access) {
9574 /* AX is needed because src_reg and dst_reg could be the same */
9575 __u8 tmp_reg = BPF_REG_AX;
9576
9577 *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET);
9578 /* check if the ingress bit is set */
9579 *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1);
9580 *insn++ = BPF_JMP_A(4);
9581 *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, SKB_TSTAMP_TYPE_MASK, 1);
9582 *insn++ = BPF_JMP_A(2);
9583 /* skb->tc_at_ingress && skb->tstamp_type,
9584 * read 0 as the (rcv) timestamp.
9585 */
9586 *insn++ = BPF_MOV64_IMM(value_reg, 0);
9587 *insn++ = BPF_JMP_A(1);
9588 }
9589 #endif
9590
9591 *insn++ = BPF_LDX_MEM(BPF_DW, value_reg, skb_reg,
9592 offsetof(struct sk_buff, tstamp));
9593 return insn;
9594 }
9595
9596 static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog,
9597 const struct bpf_insn *si,
9598 struct bpf_insn *insn)
9599 {
9600 __u8 value_reg = si->src_reg;
9601 __u8 skb_reg = si->dst_reg;
9602
9603 #ifdef CONFIG_NET_XGRESS
9604 /* If tstamp_type has been read,
9605 * the bpf prog is aware that tstamp may carry a delivery time.
9606 * Thus, write skb->tstamp as-is if tstamp_type_access is true.
9607 * Otherwise, a write at ingress also has to clear the
9608 * skb->tstamp_type bits.
9609 */
9610 if (!prog->tstamp_type_access) {
9611 __u8 tmp_reg = BPF_REG_AX;
9612
9613 *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET);
9614 /* Writing __sk_buff->tstamp as ingress, goto <clear> */
9615 *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1);
9616 /* goto <store> */
9617 *insn++ = BPF_JMP_A(2);
9618 /* <clear>: skb->tstamp_type */
9619 *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_TSTAMP_TYPE_MASK);
9620 *insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, SKB_BF_MONO_TC_OFFSET);
9621 }
9622 #endif
9623
9624 /* <store>: skb->tstamp = tstamp */
9625 *insn++ = BPF_RAW_INSN(BPF_CLASS(si->code) | BPF_DW | BPF_MEM,
9626 skb_reg, value_reg, offsetof(struct sk_buff, tstamp), si->imm);
9627 return insn;
9628 }
9629
9630 #define BPF_EMIT_STORE(size, si, off) \
9631 BPF_RAW_INSN(BPF_CLASS((si)->code) | (size) | BPF_MEM, \
9632 (si)->dst_reg, (si)->src_reg, (off), (si)->imm)
9633
9634 static u32 bpf_convert_ctx_access(enum bpf_access_type type,
9635 const struct bpf_insn *si,
9636 struct bpf_insn *insn_buf,
9637 struct bpf_prog *prog, u32 *target_size)
9638 {
9639 struct bpf_insn *insn = insn_buf;
9640 int off;
9641
9642 switch (si->off) {
9643 case offsetof(struct __sk_buff, len):
9644 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
9645 bpf_target_off(struct sk_buff, len, 4,
9646 target_size));
9647 break;
9648
9649 case offsetof(struct __sk_buff, protocol):
9650 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
9651 bpf_target_off(struct sk_buff, protocol, 2,
9652 target_size));
9653 break;
9654
9655 case offsetof(struct __sk_buff, vlan_proto):
9656 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
9657 bpf_target_off(struct sk_buff, vlan_proto, 2,
9658 target_size));
9659 break;
9660
9661 case offsetof(struct __sk_buff, priority):
9662 if (type == BPF_WRITE)
9663 *insn++ = BPF_EMIT_STORE(BPF_W, si,
9664 bpf_target_off(struct sk_buff, priority, 4,
9665 target_size));
9666 else
9667 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
9668 bpf_target_off(struct sk_buff, priority, 4,
9669 target_size));
9670 break;
9671
9672 case offsetof(struct __sk_buff, ingress_ifindex):
9673 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
9674 bpf_target_off(struct sk_buff, skb_iif, 4,
9675 target_size));
9676 break;
9677
9678 case offsetof(struct __sk_buff, ifindex):
9679 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
9680 si->dst_reg, si->src_reg,
9681 offsetof(struct sk_buff, dev));
9682 *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
9683 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
9684 bpf_target_off(struct net_device, ifindex, 4,
9685 target_size));
9686 break;
9687
9688 case offsetof(struct __sk_buff, hash):
9689 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
9690 bpf_target_off(struct sk_buff, hash, 4,
9691 target_size));
9692 break;
9693
9694 case offsetof(struct __sk_buff, mark):
9695 if (type == BPF_WRITE)
9696 *insn++ = BPF_EMIT_STORE(BPF_W, si,
9697 bpf_target_off(struct sk_buff, mark, 4,
9698 target_size));
9699 else
9700 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
9701 bpf_target_off(struct sk_buff, mark, 4,
9702 target_size));
9703 break;
9704
9705 case offsetof(struct __sk_buff, pkt_type):
9706 *target_size = 1;
9707 *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg,
9708 PKT_TYPE_OFFSET);
9709 *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, PKT_TYPE_MAX);
9710 #ifdef __BIG_ENDIAN_BITFIELD
9711 *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 5);
9712 #endif
9713 break;
9714
9715 case offsetof(struct __sk_buff, queue_mapping):
9716 if (type == BPF_WRITE) {
9717 u32 offset = bpf_target_off(struct sk_buff, queue_mapping, 2, target_size);
9718
9719 if (BPF_CLASS(si->code) == BPF_ST && si->imm >= NO_QUEUE_MAPPING) {
9720 *insn++ = BPF_JMP_A(0); /* noop */
9721 break;
9722 }
9723
9724 if (BPF_CLASS(si->code) == BPF_STX)
9725 *insn++ = BPF_JMP_IMM(BPF_JGE, si->src_reg, NO_QUEUE_MAPPING, 1);
9726 *insn++ = BPF_EMIT_STORE(BPF_H, si, offset);
9727 } else {
9728 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
9729 bpf_target_off(struct sk_buff,
9730 queue_mapping,
9731 2, target_size));
9732 }
9733 break;
9734
9735 case offsetof(struct __sk_buff, vlan_present):
9736 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
9737 bpf_target_off(struct sk_buff,
9738 vlan_all, 4, target_size));
9739 *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
9740 *insn++ = BPF_ALU32_IMM(BPF_MOV, si->dst_reg, 1);
9741 break;
9742
9743 case offsetof(struct __sk_buff, vlan_tci):
9744 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
9745 bpf_target_off(struct sk_buff, vlan_tci, 2,
9746 target_size));
9747 break;
9748
9749 case offsetof(struct __sk_buff, cb[0]) ...
9750 offsetofend(struct __sk_buff, cb[4]) - 1:
9751 BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, data) < 20);
9752 BUILD_BUG_ON((offsetof(struct sk_buff, cb) +
9753 offsetof(struct qdisc_skb_cb, data)) %
9754 sizeof(__u64));
9755
9756 prog->cb_access = 1;
9757 off = si->off;
9758 off -= offsetof(struct __sk_buff, cb[0]);
9759 off += offsetof(struct sk_buff, cb);
9760 off += offsetof(struct qdisc_skb_cb, data);
9761 if (type == BPF_WRITE)
9762 *insn++ = BPF_EMIT_STORE(BPF_SIZE(si->code), si, off);
9763 else
9764 *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg,
9765 si->src_reg, off);
9766 break;
9767
9768 case offsetof(struct __sk_buff, tc_classid):
9769 BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, tc_classid) != 2);
9770
9771 off = si->off;
9772 off -= offsetof(struct __sk_buff, tc_classid);
9773 off += offsetof(struct sk_buff, cb);
9774 off += offsetof(struct qdisc_skb_cb, tc_classid);
9775 *target_size = 2;
9776 if (type == BPF_WRITE)
9777 *insn++ = BPF_EMIT_STORE(BPF_H, si, off);
9778 else
9779 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg,
9780 si->src_reg, off);
9781 break;
9782
9783 case offsetof(struct __sk_buff, data):
9784 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
9785 si->dst_reg, si->src_reg,
9786 offsetof(struct sk_buff, data));
9787 break;
9788
9789 case offsetof(struct __sk_buff, data_meta):
9790 off = si->off;
9791 off -= offsetof(struct __sk_buff, data_meta);
9792 off += offsetof(struct sk_buff, cb);
9793 off += offsetof(struct bpf_skb_data_end, data_meta);
9794 *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
9795 si->src_reg, off);
9796 break;
9797
9798 case offsetof(struct __sk_buff, data_end):
9799 off = si->off;
9800 off -= offsetof(struct __sk_buff, data_end);
9801 off += offsetof(struct sk_buff, cb);
9802 off += offsetof(struct bpf_skb_data_end, data_end);
9803 *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
9804 si->src_reg, off);
9805 break;
9806
9807 case offsetof(struct __sk_buff, tc_index):
9808 #ifdef CONFIG_NET_SCHED
9809 if (type == BPF_WRITE)
9810 *insn++ = BPF_EMIT_STORE(BPF_H, si,
9811 bpf_target_off(struct sk_buff, tc_index, 2,
9812 target_size));
9813 else
9814 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
9815 bpf_target_off(struct sk_buff, tc_index, 2,
9816 target_size));
9817 #else
9818 *target_size = 2;
9819 if (type == BPF_WRITE)
9820 *insn++ = BPF_MOV64_REG(si->dst_reg, si->dst_reg);
9821 else
9822 *insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
9823 #endif
9824 break;
9825
9826 case offsetof(struct __sk_buff, napi_id):
9827 #if defined(CONFIG_NET_RX_BUSY_POLL)
9828 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
9829 bpf_target_off(struct sk_buff, napi_id, 4,
9830 target_size));
9831 *insn++ = BPF_JMP_IMM(BPF_JGE, si->dst_reg, MIN_NAPI_ID, 1);
9832 *insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
9833 #else
9834 *target_size = 4;
9835 *insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
9836 #endif
9837 break;
9838 case offsetof(struct __sk_buff, family):
9839 BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2);
9840
9841 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
9842 si->dst_reg, si->src_reg,
9843 offsetof(struct sk_buff, sk));
9844 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
9845 bpf_target_off(struct sock_common,
9846 skc_family,
9847 2, target_size));
9848 break;
9849 case offsetof(struct __sk_buff, remote_ip4):
9850 BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4);
9851
9852 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
9853 si->dst_reg, si->src_reg,
9854 offsetof(struct sk_buff, sk));
9855 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
9856 bpf_target_off(struct sock_common,
9857 skc_daddr,
9858 4, target_size));
9859 break;
9860 case offsetof(struct __sk_buff, local_ip4):
9861 BUILD_BUG_ON(sizeof_field(struct sock_common,
9862 skc_rcv_saddr) != 4);
9863
9864 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
9865 si->dst_reg, si->src_reg,
9866 offsetof(struct sk_buff, sk));
9867 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
9868 bpf_target_off(struct sock_common,
9869 skc_rcv_saddr,
9870 4, target_size));
9871 break;
9872 case offsetof(struct __sk_buff, remote_ip6[0]) ...
9873 offsetof(struct __sk_buff, remote_ip6[3]):
9874 #if IS_ENABLED(CONFIG_IPV6)
9875 BUILD_BUG_ON(sizeof_field(struct sock_common,
9876 skc_v6_daddr.s6_addr32[0]) != 4);
9877
9878 off = si->off;
9879 off -= offsetof(struct __sk_buff, remote_ip6[0]);
9880
9881 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
9882 si->dst_reg, si->src_reg,
9883 offsetof(struct sk_buff, sk));
9884 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
9885 offsetof(struct sock_common,
9886 skc_v6_daddr.s6_addr32[0]) +
9887 off);
9888 #else
9889 *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
9890 #endif
9891 break;
9892 case offsetof(struct __sk_buff, local_ip6[0]) ...
9893 offsetof(struct __sk_buff, local_ip6[3]):
9894 #if IS_ENABLED(CONFIG_IPV6)
9895 BUILD_BUG_ON(sizeof_field(struct sock_common,
9896 skc_v6_rcv_saddr.s6_addr32[0]) != 4);
9897
9898 off = si->off;
9899 off -= offsetof(struct __sk_buff, local_ip6[0]);
9900
9901 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
9902 si->dst_reg, si->src_reg,
9903 offsetof(struct sk_buff, sk));
9904 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
9905 offsetof(struct sock_common,
9906 skc_v6_rcv_saddr.s6_addr32[0]) +
9907 off);
9908 #else
9909 *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
9910 #endif
9911 break;
9912
9913 case offsetof(struct __sk_buff, remote_port):
9914 BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2);
9915
9916 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
9917 si->dst_reg, si->src_reg,
9918 offsetof(struct sk_buff, sk));
9919 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
9920 bpf_target_off(struct sock_common,
9921 skc_dport,
9922 2, target_size));
9923 #ifndef __BIG_ENDIAN_BITFIELD
9924 *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16);
9925 #endif
9926 break;
9927
9928 case offsetof(struct __sk_buff, local_port):
9929 BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2);
9930
9931 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
9932 si->dst_reg, si->src_reg,
9933 offsetof(struct sk_buff, sk));
9934 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
9935 bpf_target_off(struct sock_common,
9936 skc_num, 2, target_size));
9937 break;
9938
9939 case offsetof(struct __sk_buff, tstamp):
9940 BUILD_BUG_ON(sizeof_field(struct sk_buff, tstamp) != 8);
9941
9942 if (type == BPF_WRITE)
9943 insn = bpf_convert_tstamp_write(prog, si, insn);
9944 else
9945 insn = bpf_convert_tstamp_read(prog, si, insn);
9946 break;
9947
9948 case offsetof(struct __sk_buff, tstamp_type):
9949 insn = bpf_convert_tstamp_type_read(si, insn);
9950 break;
9951
9952 case offsetof(struct __sk_buff, gso_segs):
9953 insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn);
9954 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_segs),
9955 si->dst_reg, si->dst_reg,
9956 bpf_target_off(struct skb_shared_info,
9957 gso_segs, 2,
9958 target_size));
9959 break;
9960 case offsetof(struct __sk_buff, gso_size):
9961 insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn);
9962 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_size),
9963 si->dst_reg, si->dst_reg,
9964 bpf_target_off(struct skb_shared_info,
9965 gso_size, 2,
9966 target_size));
9967 break;
9968 case offsetof(struct __sk_buff, wire_len):
9969 BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, pkt_len) != 4);
9970
9971 off = si->off;
9972 off -= offsetof(struct __sk_buff, wire_len);
9973 off += offsetof(struct sk_buff, cb);
9974 off += offsetof(struct qdisc_skb_cb, pkt_len);
9975 *target_size = 4;
9976 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, off);
9977 break;
9978
9979 case offsetof(struct __sk_buff, sk):
9980 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
9981 si->dst_reg, si->src_reg,
9982 offsetof(struct sk_buff, sk));
9983 break;
9984 case offsetof(struct __sk_buff, hwtstamp):
9985 BUILD_BUG_ON(sizeof_field(struct skb_shared_hwtstamps, hwtstamp) != 8);
9986 BUILD_BUG_ON(offsetof(struct skb_shared_hwtstamps, hwtstamp) != 0);
9987
9988 insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn);
9989 *insn++ = BPF_LDX_MEM(BPF_DW,
9990 si->dst_reg, si->dst_reg,
9991 bpf_target_off(struct skb_shared_info,
9992 hwtstamps, 8,
9993 target_size));
9994 break;
9995 }
9996
9997 return insn - insn_buf;
9998 }
9999
10000 u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
10001 const struct bpf_insn *si,
10002 struct bpf_insn *insn_buf,
10003 struct bpf_prog *prog, u32 *target_size)
10004 {
10005 struct bpf_insn *insn = insn_buf;
10006 int off;
10007
10008 switch (si->off) {
10009 case offsetof(struct bpf_sock, bound_dev_if):
10010 BUILD_BUG_ON(sizeof_field(struct sock, sk_bound_dev_if) != 4);
10011
10012 if (type == BPF_WRITE)
10013 *insn++ = BPF_EMIT_STORE(BPF_W, si,
10014 offsetof(struct sock, sk_bound_dev_if));
10015 else
10016 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
10017 offsetof(struct sock, sk_bound_dev_if));
10018 break;
10019
10020 case offsetof(struct bpf_sock, mark):
10021 BUILD_BUG_ON(sizeof_field(struct sock, sk_mark) != 4);
10022
10023 if (type == BPF_WRITE)
10024 *insn++ = BPF_EMIT_STORE(BPF_W, si,
10025 offsetof(struct sock, sk_mark));
10026 else
10027 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
10028 offsetof(struct sock, sk_mark));
10029 break;
10030
10031 case offsetof(struct bpf_sock, priority):
10032 BUILD_BUG_ON(sizeof_field(struct sock, sk_priority) != 4);
10033
10034 if (type == BPF_WRITE)
10035 *insn++ = BPF_EMIT_STORE(BPF_W, si,
10036 offsetof(struct sock, sk_priority));
10037 else
10038 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
10039 offsetof(struct sock, sk_priority));
10040 break;
10041
10042 case offsetof(struct bpf_sock, family):
10043 *insn++ = BPF_LDX_MEM(
10044 BPF_FIELD_SIZEOF(struct sock_common, skc_family),
10045 si->dst_reg, si->src_reg,
10046 bpf_target_off(struct sock_common,
10047 skc_family,
10048 sizeof_field(struct sock_common,
10049 skc_family),
10050 target_size));
10051 break;
10052
10053 case offsetof(struct bpf_sock, type):
10054 *insn++ = BPF_LDX_MEM(
10055 BPF_FIELD_SIZEOF(struct sock, sk_type),
10056 si->dst_reg, si->src_reg,
10057 bpf_target_off(struct sock, sk_type,
10058 sizeof_field(struct sock, sk_type),
10059 target_size));
10060 break;
10061
10062 case offsetof(struct bpf_sock, protocol):
10063 *insn++ = BPF_LDX_MEM(
10064 BPF_FIELD_SIZEOF(struct sock, sk_protocol),
10065 si->dst_reg, si->src_reg,
10066 bpf_target_off(struct sock, sk_protocol,
10067 sizeof_field(struct sock, sk_protocol),
10068 target_size));
10069 break;
10070
10071 case offsetof(struct bpf_sock, src_ip4):
10072 *insn++ = BPF_LDX_MEM(
10073 BPF_SIZE(si->code), si->dst_reg, si->src_reg,
10074 bpf_target_off(struct sock_common, skc_rcv_saddr,
10075 sizeof_field(struct sock_common,
10076 skc_rcv_saddr),
10077 target_size));
10078 break;
10079
10080 case offsetof(struct bpf_sock, dst_ip4):
10081 *insn++ = BPF_LDX_MEM(
10082 BPF_SIZE(si->code), si->dst_reg, si->src_reg,
10083 bpf_target_off(struct sock_common, skc_daddr,
10084 sizeof_field(struct sock_common,
10085 skc_daddr),
10086 target_size));
10087 break;
10088
10089 case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]):
10090 #if IS_ENABLED(CONFIG_IPV6)
10091 off = si->off;
10092 off -= offsetof(struct bpf_sock, src_ip6[0]);
10093 *insn++ = BPF_LDX_MEM(
10094 BPF_SIZE(si->code), si->dst_reg, si->src_reg,
10095 bpf_target_off(
10096 struct sock_common,
10097 skc_v6_rcv_saddr.s6_addr32[0],
10098 sizeof_field(struct sock_common,
10099 skc_v6_rcv_saddr.s6_addr32[0]),
10100 target_size) + off);
10101 #else
10102 (void)off;
10103 *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
10104 #endif
10105 break;
10106
10107 case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]):
10108 #if IS_ENABLED(CONFIG_IPV6)
10109 off = si->off;
10110 off -= offsetof(struct bpf_sock, dst_ip6[0]);
10111 *insn++ = BPF_LDX_MEM(
10112 BPF_SIZE(si->code), si->dst_reg, si->src_reg,
10113 bpf_target_off(struct sock_common,
10114 skc_v6_daddr.s6_addr32[0],
10115 sizeof_field(struct sock_common,
10116 skc_v6_daddr.s6_addr32[0]),
10117 target_size) + off);
10118 #else
10119 *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
10120 *target_size = 4;
10121 #endif
10122 break;
10123
10124 case offsetof(struct bpf_sock, src_port):
10125 *insn++ = BPF_LDX_MEM(
10126 BPF_FIELD_SIZEOF(struct sock_common, skc_num),
10127 si->dst_reg, si->src_reg,
10128 bpf_target_off(struct sock_common, skc_num,
10129 sizeof_field(struct sock_common,
10130 skc_num),
10131 target_size));
10132 break;
10133
10134 case offsetof(struct bpf_sock, dst_port):
10135 *insn++ = BPF_LDX_MEM(
10136 BPF_FIELD_SIZEOF(struct sock_common, skc_dport),
10137 si->dst_reg, si->src_reg,
10138 bpf_target_off(struct sock_common, skc_dport,
10139 sizeof_field(struct sock_common,
10140 skc_dport),
10141 target_size));
10142 break;
10143
10144 case offsetof(struct bpf_sock, state):
10145 *insn++ = BPF_LDX_MEM(
10146 BPF_FIELD_SIZEOF(struct sock_common, skc_state),
10147 si->dst_reg, si->src_reg,
10148 bpf_target_off(struct sock_common, skc_state,
10149 sizeof_field(struct sock_common,
10150 skc_state),
10151 target_size));
10152 break;
10153 case offsetof(struct bpf_sock, rx_queue_mapping):
10154 #ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
10155 *insn++ = BPF_LDX_MEM(
10156 BPF_FIELD_SIZEOF(struct sock, sk_rx_queue_mapping),
10157 si->dst_reg, si->src_reg,
10158 bpf_target_off(struct sock, sk_rx_queue_mapping,
10159 sizeof_field(struct sock,
10160 sk_rx_queue_mapping),
10161 target_size));
10162 *insn++ = BPF_JMP_IMM(BPF_JNE, si->dst_reg, NO_QUEUE_MAPPING,
10163 1);
10164 *insn++ = BPF_MOV64_IMM(si->dst_reg, -1);
10165 #else
10166 *insn++ = BPF_MOV64_IMM(si->dst_reg, -1);
10167 *target_size = 2;
10168 #endif
10169 break;
10170 }
10171
10172 return insn - insn_buf;
10173 }
10174
10175 static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type,
10176 const struct bpf_insn *si,
10177 struct bpf_insn *insn_buf,
10178 struct bpf_prog *prog, u32 *target_size)
10179 {
10180 struct bpf_insn *insn = insn_buf;
10181
10182 switch (si->off) {
10183 case offsetof(struct __sk_buff, ifindex):
10184 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
10185 si->dst_reg, si->src_reg,
10186 offsetof(struct sk_buff, dev));
10187 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
10188 bpf_target_off(struct net_device, ifindex, 4,
10189 target_size));
10190 break;
10191 default:
10192 return bpf_convert_ctx_access(type, si, insn_buf, prog,
10193 target_size);
10194 }
10195
10196 return insn - insn_buf;
10197 }
10198
10199 static u32 xdp_convert_ctx_access(enum bpf_access_type type,
10200 const struct bpf_insn *si,
10201 struct bpf_insn *insn_buf,
10202 struct bpf_prog *prog, u32 *target_size)
10203 {
10204 struct bpf_insn *insn = insn_buf;
10205
10206 switch (si->off) {
10207 case offsetof(struct xdp_md, data):
10208 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data),
10209 si->dst_reg, si->src_reg,
10210 offsetof(struct xdp_buff, data));
10211 break;
10212 case offsetof(struct xdp_md, data_meta):
10213 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_meta),
10214 si->dst_reg, si->src_reg,
10215 offsetof(struct xdp_buff, data_meta));
10216 break;
10217 case offsetof(struct xdp_md, data_end):
10218 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end),
10219 si->dst_reg, si->src_reg,
10220 offsetof(struct xdp_buff, data_end));
10221 break;
10222 case offsetof(struct xdp_md, ingress_ifindex):
10223 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq),
10224 si->dst_reg, si->src_reg,
10225 offsetof(struct xdp_buff, rxq));
10226 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_rxq_info, dev),
10227 si->dst_reg, si->dst_reg,
10228 offsetof(struct xdp_rxq_info, dev));
10229 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
10230 offsetof(struct net_device, ifindex));
10231 break;
10232 case offsetof(struct xdp_md, rx_queue_index):
10233 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq),
10234 si->dst_reg, si->src_reg,
10235 offsetof(struct xdp_buff, rxq));
10236 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
10237 offsetof(struct xdp_rxq_info,
10238 queue_index));
10239 break;
10240 case offsetof(struct xdp_md, egress_ifindex):
10241 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, txq),
10242 si->dst_reg, si->src_reg,
10243 offsetof(struct xdp_buff, txq));
10244 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_txq_info, dev),
10245 si->dst_reg, si->dst_reg,
10246 offsetof(struct xdp_txq_info, dev));
10247 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
10248 offsetof(struct net_device, ifindex));
10249 break;
10250 }
10251
10252 return insn - insn_buf;
10253 }
10254
10255 /* SOCK_ADDR_LOAD_NESTED_FIELD() loads Nested Field S.F.NF where S is type of
10256 * context Structure, F is Field in context structure that contains a pointer
10257 * to Nested Structure of type NS that has the field NF.
10258 *
10259 * SIZE encodes the load size (BPF_B, BPF_H, etc). It's up to caller to make
10260 * sure that SIZE is not greater than actual size of S.F.NF.
10261 *
10262 * If offset OFF is provided, the load happens from that offset relative to
10263 * offset of NF.
10264 */
10265 #define SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF) \
10266 do { \
10267 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), si->dst_reg, \
10268 si->src_reg, offsetof(S, F)); \
10269 *insn++ = BPF_LDX_MEM( \
10270 SIZE, si->dst_reg, si->dst_reg, \
10271 bpf_target_off(NS, NF, sizeof_field(NS, NF), \
10272 target_size) \
10273 + OFF); \
10274 } while (0)
10275
10276 #define SOCK_ADDR_LOAD_NESTED_FIELD(S, NS, F, NF) \
10277 SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, \
10278 BPF_FIELD_SIZEOF(NS, NF), 0)
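/* For example (sketch), SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
 * struct sockaddr, uaddr, sa_family) emits two loads that are roughly
 * equivalent to:
 *
 *	dst_reg = ((struct bpf_sock_addr_kern *)src_reg)->uaddr;
 *	dst_reg = ((struct sockaddr *)dst_reg)->sa_family;
 */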
10279
10280 /* SOCK_ADDR_STORE_NESTED_FIELD_OFF() has semantic similar to
10281 * SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF() but for store operation.
10282 *
10283 * In addition it uses Temporary Field TF (member of struct S) as the 3rd
10284 * "register", since the two registers available in convert_ctx_access are not
10285 * enough: we can overwrite neither SRC, since it contains the value to store,
10286 * nor DST, since it contains the pointer to the context that may be used by
10287 * later instructions. But we need a temporary place to save the pointer to the
10288 * nested structure whose field we want to store to.
10289 */
10290 #define SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE, OFF, TF) \
10291 do { \
10292 int tmp_reg = BPF_REG_9; \
10293 if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg) \
10294 --tmp_reg; \
10295 if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg) \
10296 --tmp_reg; \
10297 *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, tmp_reg, \
10298 offsetof(S, TF)); \
10299 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), tmp_reg, \
10300 si->dst_reg, offsetof(S, F)); \
10301 *insn++ = BPF_RAW_INSN(SIZE | BPF_MEM | BPF_CLASS(si->code), \
10302 tmp_reg, si->src_reg, \
10303 bpf_target_off(NS, NF, sizeof_field(NS, NF), \
10304 target_size) \
10305 + OFF, \
10306 si->imm); \
10307 *insn++ = BPF_LDX_MEM(BPF_DW, tmp_reg, si->dst_reg, \
10308 offsetof(S, TF)); \
10309 } while (0)
10310
10311 #define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF, \
10312 TF) \
10313 do { \
10314 if (type == BPF_WRITE) { \
10315 SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE, \
10316 OFF, TF); \
10317 } else { \
10318 SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF( \
10319 S, NS, F, NF, SIZE, OFF); \
10320 } \
10321 } while (0)
10322
10323 static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
10324 const struct bpf_insn *si,
10325 struct bpf_insn *insn_buf,
10326 struct bpf_prog *prog, u32 *target_size)
10327 {
10328 int off, port_size = sizeof_field(struct sockaddr_in6, sin6_port);
10329 struct bpf_insn *insn = insn_buf;
10330
10331 switch (si->off) {
10332 case offsetof(struct bpf_sock_addr, user_family):
10333 SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
10334 struct sockaddr, uaddr, sa_family);
10335 break;
10336
10337 case offsetof(struct bpf_sock_addr, user_ip4):
10338 SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
10339 struct bpf_sock_addr_kern, struct sockaddr_in, uaddr,
10340 sin_addr, BPF_SIZE(si->code), 0, tmp_reg);
10341 break;
10342
10343 case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]):
10344 off = si->off;
10345 off -= offsetof(struct bpf_sock_addr, user_ip6[0]);
10346 SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
10347 struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr,
10348 sin6_addr.s6_addr32[0], BPF_SIZE(si->code), off,
10349 tmp_reg);
10350 break;
10351
10352 case offsetof(struct bpf_sock_addr, user_port):
10353 /* To get port we need to know sa_family first and then treat
10354 * sockaddr as either sockaddr_in or sockaddr_in6.
10355 * Though we can simplify things, since the port field has the
10356 * same offset and size in both structures.
10357 * Here we check this invariant and, if it holds, use just one
10358 * of the structures.
10359 */
10360 BUILD_BUG_ON(offsetof(struct sockaddr_in, sin_port) !=
10361 offsetof(struct sockaddr_in6, sin6_port));
10362 BUILD_BUG_ON(sizeof_field(struct sockaddr_in, sin_port) !=
10363 sizeof_field(struct sockaddr_in6, sin6_port));
10364 /* Account for sin6_port being smaller than user_port. */
10365 port_size = min(port_size, BPF_LDST_BYTES(si));
10366 SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
10367 struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr,
10368 sin6_port, bytes_to_bpf_size(port_size), 0, tmp_reg);
10369 break;
10370
10371 case offsetof(struct bpf_sock_addr, family):
10372 SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
10373 struct sock, sk, sk_family);
10374 break;
10375
10376 case offsetof(struct bpf_sock_addr, type):
10377 SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
10378 struct sock, sk, sk_type);
10379 break;
10380
10381 case offsetof(struct bpf_sock_addr, protocol):
10382 SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
10383 struct sock, sk, sk_protocol);
10384 break;
10385
10386 case offsetof(struct bpf_sock_addr, msg_src_ip4):
10387 /* Treat t_ctx as struct in_addr for msg_src_ip4. */
10388 SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
10389 struct bpf_sock_addr_kern, struct in_addr, t_ctx,
10390 s_addr, BPF_SIZE(si->code), 0, tmp_reg);
10391 break;
10392
10393 case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
10394 msg_src_ip6[3]):
10395 off = si->off;
10396 off -= offsetof(struct bpf_sock_addr, msg_src_ip6[0]);
10397 /* Treat t_ctx as struct in6_addr for msg_src_ip6. */
10398 SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
10399 struct bpf_sock_addr_kern, struct in6_addr, t_ctx,
10400 s6_addr32[0], BPF_SIZE(si->code), off, tmp_reg);
10401 break;
10402 case offsetof(struct bpf_sock_addr, sk):
10403 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_addr_kern, sk),
10404 si->dst_reg, si->src_reg,
10405 offsetof(struct bpf_sock_addr_kern, sk));
10406 break;
10407 }
10408
10409 return insn - insn_buf;
10410 }
10411
10412 static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
10413 const struct bpf_insn *si,
10414 struct bpf_insn *insn_buf,
10415 struct bpf_prog *prog,
10416 u32 *target_size)
10417 {
10418 struct bpf_insn *insn = insn_buf;
10419 int off;
10420
10421 /* Helper macro for adding read access to tcp_sock or sock fields. */
10422 #define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \
10423 do { \
10424 int fullsock_reg = si->dst_reg, reg = BPF_REG_9, jmp = 2; \
10425 BUILD_BUG_ON(sizeof_field(OBJ, OBJ_FIELD) > \
10426 sizeof_field(struct bpf_sock_ops, BPF_FIELD)); \
10427 if (si->dst_reg == reg || si->src_reg == reg) \
10428 reg--; \
10429 if (si->dst_reg == reg || si->src_reg == reg) \
10430 reg--; \
10431 if (si->dst_reg == si->src_reg) { \
10432 *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg, \
10433 offsetof(struct bpf_sock_ops_kern, \
10434 temp)); \
10435 fullsock_reg = reg; \
10436 jmp += 2; \
10437 } \
10438 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
10439 struct bpf_sock_ops_kern, \
10440 is_locked_tcp_sock), \
10441 fullsock_reg, si->src_reg, \
10442 offsetof(struct bpf_sock_ops_kern, \
10443 is_locked_tcp_sock)); \
10444 *insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp); \
10445 if (si->dst_reg == si->src_reg) \
10446 *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \
10447 offsetof(struct bpf_sock_ops_kern, \
10448 temp)); \
10449 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
10450 struct bpf_sock_ops_kern, sk),\
10451 si->dst_reg, si->src_reg, \
10452 offsetof(struct bpf_sock_ops_kern, sk));\
10453 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(OBJ, \
10454 OBJ_FIELD), \
10455 si->dst_reg, si->dst_reg, \
10456 offsetof(OBJ, OBJ_FIELD)); \
10457 if (si->dst_reg == si->src_reg) { \
10458 *insn++ = BPF_JMP_A(1); \
10459 *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \
10460 offsetof(struct bpf_sock_ops_kern, \
10461 temp)); \
10462 } \
10463 } while (0)
10464
10465 #define SOCK_OPS_GET_SK() \
10466 do { \
10467 int fullsock_reg = si->dst_reg, reg = BPF_REG_9, jmp = 1; \
10468 if (si->dst_reg == reg || si->src_reg == reg) \
10469 reg--; \
10470 if (si->dst_reg == reg || si->src_reg == reg) \
10471 reg--; \
10472 if (si->dst_reg == si->src_reg) { \
10473 *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg, \
10474 offsetof(struct bpf_sock_ops_kern, \
10475 temp)); \
10476 fullsock_reg = reg; \
10477 jmp += 2; \
10478 } \
10479 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
10480 struct bpf_sock_ops_kern, \
10481 is_fullsock), \
10482 fullsock_reg, si->src_reg, \
10483 offsetof(struct bpf_sock_ops_kern, \
10484 is_fullsock)); \
10485 *insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp); \
10486 if (si->dst_reg == si->src_reg) \
10487 *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \
10488 offsetof(struct bpf_sock_ops_kern, \
10489 temp)); \
10490 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
10491 struct bpf_sock_ops_kern, sk),\
10492 si->dst_reg, si->src_reg, \
10493 offsetof(struct bpf_sock_ops_kern, sk));\
10494 if (si->dst_reg == si->src_reg) { \
10495 *insn++ = BPF_JMP_A(1); \
10496 *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \
10497 offsetof(struct bpf_sock_ops_kern, \
10498 temp)); \
10499 } \
10500 } while (0)
10501
10502 #define SOCK_OPS_GET_TCP_SOCK_FIELD(FIELD) \
10503 SOCK_OPS_GET_FIELD(FIELD, FIELD, struct tcp_sock)
10504
10505 /* Helper macro for adding write access to tcp_sock or sock fields.
10506 * The macro is called with two registers, dst_reg which contains a pointer
10507 * to ctx (context) and src_reg which contains the value that should be
10508 * stored. However, we need an additional register since we cannot overwrite
10509 * dst_reg because it may be used later in the program.
10510 * Instead we "borrow" one of the other registers. We first save its value
10511 * into a new (temp) field in bpf_sock_ops_kern, use it, and then restore
10512 * it at the end of the macro.
10513 */
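/* Roughly, the store sequence emitted below is (pseudo-code; dst_reg holds
 * the ctx pointer and "reg" is the borrowed register):
 *
 *	ctx->temp = reg;			// spill the borrowed register
 *	reg = ctx->is_locked_tcp_sock;
 *	if (reg != 0) {
 *		reg = ctx->sk;
 *		((OBJ *)reg)->OBJ_FIELD = src_reg;	// or the immediate for BPF_ST
 *	}
 *	reg = ctx->temp;			// restore the borrowed register
 */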
10514 #define SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \
10515 do { \
10516 int reg = BPF_REG_9; \
10517 BUILD_BUG_ON(sizeof_field(OBJ, OBJ_FIELD) > \
10518 sizeof_field(struct bpf_sock_ops, BPF_FIELD)); \
10519 if (si->dst_reg == reg || si->src_reg == reg) \
10520 reg--; \
10521 if (si->dst_reg == reg || si->src_reg == reg) \
10522 reg--; \
10523 *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, reg, \
10524 offsetof(struct bpf_sock_ops_kern, \
10525 temp)); \
10526 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
10527 struct bpf_sock_ops_kern, \
10528 is_locked_tcp_sock), \
10529 reg, si->dst_reg, \
10530 offsetof(struct bpf_sock_ops_kern, \
10531 is_locked_tcp_sock)); \
10532 *insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2); \
10533 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
10534 struct bpf_sock_ops_kern, sk),\
10535 reg, si->dst_reg, \
10536 offsetof(struct bpf_sock_ops_kern, sk));\
10537 *insn++ = BPF_RAW_INSN(BPF_FIELD_SIZEOF(OBJ, OBJ_FIELD) | \
10538 BPF_MEM | BPF_CLASS(si->code), \
10539 reg, si->src_reg, \
10540 offsetof(OBJ, OBJ_FIELD), \
10541 si->imm); \
10542 *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->dst_reg, \
10543 offsetof(struct bpf_sock_ops_kern, \
10544 temp)); \
10545 } while (0)
10546
10547 #define SOCK_OPS_GET_OR_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ, TYPE) \
10548 do { \
10549 if (TYPE == BPF_WRITE) \
10550 SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \
10551 else \
10552 SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \
10553 } while (0)
10554
10555 switch (si->off) {
10556 case offsetof(struct bpf_sock_ops, op):
10557 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
10558 op),
10559 si->dst_reg, si->src_reg,
10560 offsetof(struct bpf_sock_ops_kern, op));
10561 break;
10562
10563 case offsetof(struct bpf_sock_ops, replylong[0]) ...
10564 offsetof(struct bpf_sock_ops, replylong[3]):
10565 BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, reply) !=
10566 sizeof_field(struct bpf_sock_ops_kern, reply));
10567 BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, replylong) !=
10568 sizeof_field(struct bpf_sock_ops_kern, replylong));
10569 off = si->off;
10570 off -= offsetof(struct bpf_sock_ops, replylong[0]);
10571 off += offsetof(struct bpf_sock_ops_kern, replylong[0]);
10572 if (type == BPF_WRITE)
10573 *insn++ = BPF_EMIT_STORE(BPF_W, si, off);
10574 else
10575 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
10576 off);
10577 break;
10578
10579 case offsetof(struct bpf_sock_ops, family):
10580 BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2);
10581
10582 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
10583 struct bpf_sock_ops_kern, sk),
10584 si->dst_reg, si->src_reg,
10585 offsetof(struct bpf_sock_ops_kern, sk));
10586 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
10587 offsetof(struct sock_common, skc_family));
10588 break;
10589
10590 case offsetof(struct bpf_sock_ops, remote_ip4):
10591 BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4);
10592
10593 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
10594 struct bpf_sock_ops_kern, sk),
10595 si->dst_reg, si->src_reg,
10596 offsetof(struct bpf_sock_ops_kern, sk));
10597 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
10598 offsetof(struct sock_common, skc_daddr));
10599 break;
10600
10601 case offsetof(struct bpf_sock_ops, local_ip4):
10602 BUILD_BUG_ON(sizeof_field(struct sock_common,
10603 skc_rcv_saddr) != 4);
10604
10605 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
10606 struct bpf_sock_ops_kern, sk),
10607 si->dst_reg, si->src_reg,
10608 offsetof(struct bpf_sock_ops_kern, sk));
10609 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
10610 offsetof(struct sock_common,
10611 skc_rcv_saddr));
10612 break;
10613
10614 case offsetof(struct bpf_sock_ops, remote_ip6[0]) ...
10615 offsetof(struct bpf_sock_ops, remote_ip6[3]):
10616 #if IS_ENABLED(CONFIG_IPV6)
10617 BUILD_BUG_ON(sizeof_field(struct sock_common,
10618 skc_v6_daddr.s6_addr32[0]) != 4);
10619
10620 off = si->off;
10621 off -= offsetof(struct bpf_sock_ops, remote_ip6[0]);
10622 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
10623 struct bpf_sock_ops_kern, sk),
10624 si->dst_reg, si->src_reg,
10625 offsetof(struct bpf_sock_ops_kern, sk));
10626 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
10627 offsetof(struct sock_common,
10628 skc_v6_daddr.s6_addr32[0]) +
10629 off);
10630 #else
10631 *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
10632 #endif
10633 break;
10634
10635 case offsetof(struct bpf_sock_ops, local_ip6[0]) ...
10636 offsetof(struct bpf_sock_ops, local_ip6[3]):
10637 #if IS_ENABLED(CONFIG_IPV6)
10638 BUILD_BUG_ON(sizeof_field(struct sock_common,
10639 skc_v6_rcv_saddr.s6_addr32[0]) != 4);
10640
10641 off = si->off;
10642 off -= offsetof(struct bpf_sock_ops, local_ip6[0]);
10643 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
10644 struct bpf_sock_ops_kern, sk),
10645 si->dst_reg, si->src_reg,
10646 offsetof(struct bpf_sock_ops_kern, sk));
10647 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
10648 offsetof(struct sock_common,
10649 skc_v6_rcv_saddr.s6_addr32[0]) +
10650 off);
10651 #else
10652 *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
10653 #endif
10654 break;
10655
10656 case offsetof(struct bpf_sock_ops, remote_port):
10657 BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2);
10658
10659 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
10660 struct bpf_sock_ops_kern, sk),
10661 si->dst_reg, si->src_reg,
10662 offsetof(struct bpf_sock_ops_kern, sk));
10663 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
10664 offsetof(struct sock_common, skc_dport));
10665 #ifndef __BIG_ENDIAN_BITFIELD
10666 *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16);
10667 #endif
10668 break;
10669
10670 case offsetof(struct bpf_sock_ops, local_port):
10671 BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2);
10672
10673 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
10674 struct bpf_sock_ops_kern, sk),
10675 si->dst_reg, si->src_reg,
10676 offsetof(struct bpf_sock_ops_kern, sk));
10677 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
10678 offsetof(struct sock_common, skc_num));
10679 break;
10680
10681 case offsetof(struct bpf_sock_ops, is_fullsock):
10682 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
10683 struct bpf_sock_ops_kern,
10684 is_fullsock),
10685 si->dst_reg, si->src_reg,
10686 offsetof(struct bpf_sock_ops_kern,
10687 is_fullsock));
10688 break;
10689
10690 case offsetof(struct bpf_sock_ops, state):
10691 BUILD_BUG_ON(sizeof_field(struct sock_common, skc_state) != 1);
10692
10693 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
10694 struct bpf_sock_ops_kern, sk),
10695 si->dst_reg, si->src_reg,
10696 offsetof(struct bpf_sock_ops_kern, sk));
10697 *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->dst_reg,
10698 offsetof(struct sock_common, skc_state));
10699 break;
10700
10701 case offsetof(struct bpf_sock_ops, rtt_min):
10702 BUILD_BUG_ON(sizeof_field(struct tcp_sock, rtt_min) !=
10703 sizeof(struct minmax));
10704 BUILD_BUG_ON(sizeof(struct minmax) <
10705 sizeof(struct minmax_sample));
10706
10707 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
10708 struct bpf_sock_ops_kern, sk),
10709 si->dst_reg, si->src_reg,
10710 offsetof(struct bpf_sock_ops_kern, sk));
10711 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
10712 offsetof(struct tcp_sock, rtt_min) +
10713 sizeof_field(struct minmax_sample, t));
10714 break;
10715
10716 case offsetof(struct bpf_sock_ops, bpf_sock_ops_cb_flags):
10717 SOCK_OPS_GET_FIELD(bpf_sock_ops_cb_flags, bpf_sock_ops_cb_flags,
10718 struct tcp_sock);
10719 break;
10720
10721 case offsetof(struct bpf_sock_ops, sk_txhash):
10722 SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash,
10723 struct sock, type);
10724 break;
10725 case offsetof(struct bpf_sock_ops, snd_cwnd):
10726 SOCK_OPS_GET_TCP_SOCK_FIELD(snd_cwnd);
10727 break;
10728 case offsetof(struct bpf_sock_ops, srtt_us):
10729 SOCK_OPS_GET_TCP_SOCK_FIELD(srtt_us);
10730 break;
10731 case offsetof(struct bpf_sock_ops, snd_ssthresh):
10732 SOCK_OPS_GET_TCP_SOCK_FIELD(snd_ssthresh);
10733 break;
10734 case offsetof(struct bpf_sock_ops, rcv_nxt):
10735 SOCK_OPS_GET_TCP_SOCK_FIELD(rcv_nxt);
10736 break;
10737 case offsetof(struct bpf_sock_ops, snd_nxt):
10738 SOCK_OPS_GET_TCP_SOCK_FIELD(snd_nxt);
10739 break;
10740 case offsetof(struct bpf_sock_ops, snd_una):
10741 SOCK_OPS_GET_TCP_SOCK_FIELD(snd_una);
10742 break;
10743 case offsetof(struct bpf_sock_ops, mss_cache):
10744 SOCK_OPS_GET_TCP_SOCK_FIELD(mss_cache);
10745 break;
10746 case offsetof(struct bpf_sock_ops, ecn_flags):
10747 SOCK_OPS_GET_TCP_SOCK_FIELD(ecn_flags);
10748 break;
10749 case offsetof(struct bpf_sock_ops, rate_delivered):
10750 SOCK_OPS_GET_TCP_SOCK_FIELD(rate_delivered);
10751 break;
10752 case offsetof(struct bpf_sock_ops, rate_interval_us):
10753 SOCK_OPS_GET_TCP_SOCK_FIELD(rate_interval_us);
10754 break;
10755 case offsetof(struct bpf_sock_ops, packets_out):
10756 SOCK_OPS_GET_TCP_SOCK_FIELD(packets_out);
10757 break;
10758 case offsetof(struct bpf_sock_ops, retrans_out):
10759 SOCK_OPS_GET_TCP_SOCK_FIELD(retrans_out);
10760 break;
10761 case offsetof(struct bpf_sock_ops, total_retrans):
10762 SOCK_OPS_GET_TCP_SOCK_FIELD(total_retrans);
10763 break;
10764 case offsetof(struct bpf_sock_ops, segs_in):
10765 SOCK_OPS_GET_TCP_SOCK_FIELD(segs_in);
10766 break;
10767 case offsetof(struct bpf_sock_ops, data_segs_in):
10768 SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_in);
10769 break;
10770 case offsetof(struct bpf_sock_ops, segs_out):
10771 SOCK_OPS_GET_TCP_SOCK_FIELD(segs_out);
10772 break;
10773 case offsetof(struct bpf_sock_ops, data_segs_out):
10774 SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_out);
10775 break;
10776 case offsetof(struct bpf_sock_ops, lost_out):
10777 SOCK_OPS_GET_TCP_SOCK_FIELD(lost_out);
10778 break;
10779 case offsetof(struct bpf_sock_ops, sacked_out):
10780 SOCK_OPS_GET_TCP_SOCK_FIELD(sacked_out);
10781 break;
10782 case offsetof(struct bpf_sock_ops, bytes_received):
10783 SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_received);
10784 break;
10785 case offsetof(struct bpf_sock_ops, bytes_acked):
10786 SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_acked);
10787 break;
10788 case offsetof(struct bpf_sock_ops, sk):
10789 SOCK_OPS_GET_SK();
10790 break;
10791 case offsetof(struct bpf_sock_ops, skb_data_end):
10792 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
10793 skb_data_end),
10794 si->dst_reg, si->src_reg,
10795 offsetof(struct bpf_sock_ops_kern,
10796 skb_data_end));
10797 break;
10798 case offsetof(struct bpf_sock_ops, skb_data):
10799 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
10800 skb),
10801 si->dst_reg, si->src_reg,
10802 offsetof(struct bpf_sock_ops_kern,
10803 skb));
10804 *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
10805 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
10806 si->dst_reg, si->dst_reg,
10807 offsetof(struct sk_buff, data));
10808 break;
10809 case offsetof(struct bpf_sock_ops, skb_len):
10810 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
10811 skb),
10812 si->dst_reg, si->src_reg,
10813 offsetof(struct bpf_sock_ops_kern,
10814 skb));
10815 *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
10816 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len),
10817 si->dst_reg, si->dst_reg,
10818 offsetof(struct sk_buff, len));
10819 break;
10820 case offsetof(struct bpf_sock_ops, skb_tcp_flags):
10821 off = offsetof(struct sk_buff, cb);
10822 off += offsetof(struct tcp_skb_cb, tcp_flags);
10823 *target_size = sizeof_field(struct tcp_skb_cb, tcp_flags);
10824 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
10825 skb),
10826 si->dst_reg, si->src_reg,
10827 offsetof(struct bpf_sock_ops_kern,
10828 skb));
10829 *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
10830 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_skb_cb,
10831 tcp_flags),
10832 si->dst_reg, si->dst_reg, off);
10833 break;
10834 case offsetof(struct bpf_sock_ops, skb_hwtstamp): {
10835 struct bpf_insn *jmp_on_null_skb;
10836
10837 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
10838 skb),
10839 si->dst_reg, si->src_reg,
10840 offsetof(struct bpf_sock_ops_kern,
10841 skb));
10842 /* Reserve one insn to test skb == NULL */
10843 jmp_on_null_skb = insn++;
10844 insn = bpf_convert_shinfo_access(si->dst_reg, si->dst_reg, insn);
10845 *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg,
10846 bpf_target_off(struct skb_shared_info,
10847 hwtstamps, 8,
10848 target_size));
10849 *jmp_on_null_skb = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0,
10850 insn - jmp_on_null_skb - 1);
10851 break;
10852 }
10853 }
10854 return insn - insn_buf;
10855 }
10856
10857 /* data_end = skb->data + skb_headlen() */
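/* skb_headlen(skb) is skb->len - skb->data_len, so the sequence emitted
 * below computes skb->data + skb->len - skb->data_len, using BPF_REG_AX as
 * scratch. If src_reg and dst_reg are the same register, an extra register
 * is borrowed and spilled to sk_skb_cb::temp_reg (inside skb->cb) for the
 * duration of the computation.
 */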
10858 static struct bpf_insn *bpf_convert_data_end_access(const struct bpf_insn *si,
10859 struct bpf_insn *insn)
10860 {
10861 int reg;
10862 int temp_reg_off = offsetof(struct sk_buff, cb) +
10863 offsetof(struct sk_skb_cb, temp_reg);
10864
10865 if (si->src_reg == si->dst_reg) {
10866 /* We need an extra register, choose and save a register. */
10867 reg = BPF_REG_9;
10868 if (si->src_reg == reg || si->dst_reg == reg)
10869 reg--;
10870 if (si->src_reg == reg || si->dst_reg == reg)
10871 reg--;
10872 *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg, temp_reg_off);
10873 } else {
10874 reg = si->dst_reg;
10875 }
10876
10877 /* reg = skb->data */
10878 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
10879 reg, si->src_reg,
10880 offsetof(struct sk_buff, data));
10881 /* AX = skb->len */
10882 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len),
10883 BPF_REG_AX, si->src_reg,
10884 offsetof(struct sk_buff, len));
10885 /* reg = skb->data + skb->len */
10886 *insn++ = BPF_ALU64_REG(BPF_ADD, reg, BPF_REG_AX);
10887 /* AX = skb->data_len */
10888 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data_len),
10889 BPF_REG_AX, si->src_reg,
10890 offsetof(struct sk_buff, data_len));
10891
10892 /* reg = skb->data + skb->len - skb->data_len */
10893 *insn++ = BPF_ALU64_REG(BPF_SUB, reg, BPF_REG_AX);
10894
10895 if (si->src_reg == si->dst_reg) {
10896 /* Restore the saved register */
10897 *insn++ = BPF_MOV64_REG(BPF_REG_AX, si->src_reg);
10898 *insn++ = BPF_MOV64_REG(si->dst_reg, reg);
10899 *insn++ = BPF_LDX_MEM(BPF_DW, reg, BPF_REG_AX, temp_reg_off);
10900 }
10901
10902 return insn;
10903 }
10904
10905 static u32 sk_skb_convert_ctx_access(enum bpf_access_type type,
10906 const struct bpf_insn *si,
10907 struct bpf_insn *insn_buf,
10908 struct bpf_prog *prog, u32 *target_size)
10909 {
10910 struct bpf_insn *insn = insn_buf;
10911 int off;
10912
10913 switch (si->off) {
10914 case offsetof(struct __sk_buff, data_end):
10915 insn = bpf_convert_data_end_access(si, insn);
10916 break;
10917 case offsetof(struct __sk_buff, cb[0]) ...
10918 offsetofend(struct __sk_buff, cb[4]) - 1:
10919 BUILD_BUG_ON(sizeof_field(struct sk_skb_cb, data) < 20);
10920 BUILD_BUG_ON((offsetof(struct sk_buff, cb) +
10921 offsetof(struct sk_skb_cb, data)) %
10922 sizeof(__u64));
10923
10924 prog->cb_access = 1;
10925 off = si->off;
10926 off -= offsetof(struct __sk_buff, cb[0]);
10927 off += offsetof(struct sk_buff, cb);
10928 off += offsetof(struct sk_skb_cb, data);
10929 if (type == BPF_WRITE)
10930 *insn++ = BPF_EMIT_STORE(BPF_SIZE(si->code), si, off);
10931 else
10932 *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg,
10933 si->src_reg, off);
10934 break;
10935
10936
10937 default:
10938 return bpf_convert_ctx_access(type, si, insn_buf, prog,
10939 target_size);
10940 }
10941
10942 return insn - insn_buf;
10943 }
10944
10945 static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
10946 const struct bpf_insn *si,
10947 struct bpf_insn *insn_buf,
10948 struct bpf_prog *prog, u32 *target_size)
10949 {
10950 struct bpf_insn *insn = insn_buf;
10951 #if IS_ENABLED(CONFIG_IPV6)
10952 int off;
10953 #endif
10954
10955 /* convert ctx uses the fact that the sg element is first in the struct */
10956 BUILD_BUG_ON(offsetof(struct sk_msg, sg) != 0);
10957
10958 switch (si->off) {
10959 case offsetof(struct sk_msg_md, data):
10960 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data),
10961 si->dst_reg, si->src_reg,
10962 offsetof(struct sk_msg, data));
10963 break;
10964 case offsetof(struct sk_msg_md, data_end):
10965 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data_end),
10966 si->dst_reg, si->src_reg,
10967 offsetof(struct sk_msg, data_end));
10968 break;
10969 case offsetof(struct sk_msg_md, family):
10970 BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2);
10971
10972 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
10973 struct sk_msg, sk),
10974 si->dst_reg, si->src_reg,
10975 offsetof(struct sk_msg, sk));
10976 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
10977 offsetof(struct sock_common, skc_family));
10978 break;
10979
10980 case offsetof(struct sk_msg_md, remote_ip4):
10981 BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4);
10982
10983 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
10984 struct sk_msg, sk),
10985 si->dst_reg, si->src_reg,
10986 offsetof(struct sk_msg, sk));
10987 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
10988 offsetof(struct sock_common, skc_daddr));
10989 break;
10990
10991 case offsetof(struct sk_msg_md, local_ip4):
10992 BUILD_BUG_ON(sizeof_field(struct sock_common,
10993 skc_rcv_saddr) != 4);
10994
10995 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
10996 struct sk_msg, sk),
10997 si->dst_reg, si->src_reg,
10998 offsetof(struct sk_msg, sk));
10999 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
11000 offsetof(struct sock_common,
11001 skc_rcv_saddr));
11002 break;
11003
11004 case offsetof(struct sk_msg_md, remote_ip6[0]) ...
11005 offsetof(struct sk_msg_md, remote_ip6[3]):
11006 #if IS_ENABLED(CONFIG_IPV6)
11007 BUILD_BUG_ON(sizeof_field(struct sock_common,
11008 skc_v6_daddr.s6_addr32[0]) != 4);
11009
11010 off = si->off;
11011 off -= offsetof(struct sk_msg_md, remote_ip6[0]);
11012 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
11013 struct sk_msg, sk),
11014 si->dst_reg, si->src_reg,
11015 offsetof(struct sk_msg, sk));
11016 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
11017 offsetof(struct sock_common,
11018 skc_v6_daddr.s6_addr32[0]) +
11019 off);
11020 #else
11021 *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
11022 #endif
11023 break;
11024
11025 case offsetof(struct sk_msg_md, local_ip6[0]) ...
11026 offsetof(struct sk_msg_md, local_ip6[3]):
11027 #if IS_ENABLED(CONFIG_IPV6)
11028 BUILD_BUG_ON(sizeof_field(struct sock_common,
11029 skc_v6_rcv_saddr.s6_addr32[0]) != 4);
11030
11031 off = si->off;
11032 off -= offsetof(struct sk_msg_md, local_ip6[0]);
11033 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
11034 struct sk_msg, sk),
11035 si->dst_reg, si->src_reg,
11036 offsetof(struct sk_msg, sk));
11037 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
11038 offsetof(struct sock_common,
11039 skc_v6_rcv_saddr.s6_addr32[0]) +
11040 off);
11041 #else
11042 *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
11043 #endif
11044 break;
11045
11046 case offsetof(struct sk_msg_md, remote_port):
11047 BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2);
11048
11049 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
11050 struct sk_msg, sk),
11051 si->dst_reg, si->src_reg,
11052 offsetof(struct sk_msg, sk));
11053 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
11054 offsetof(struct sock_common, skc_dport));
11055 #ifndef __BIG_ENDIAN_BITFIELD
11056 *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16);
11057 #endif
11058 break;
11059
11060 case offsetof(struct sk_msg_md, local_port):
11061 BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2);
11062
11063 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
11064 struct sk_msg, sk),
11065 si->dst_reg, si->src_reg,
11066 offsetof(struct sk_msg, sk));
11067 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
11068 offsetof(struct sock_common, skc_num));
11069 break;
11070
11071 case offsetof(struct sk_msg_md, size):
11072 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_sg, size),
11073 si->dst_reg, si->src_reg,
11074 offsetof(struct sk_msg_sg, size));
11075 break;
11076
11077 case offsetof(struct sk_msg_md, sk):
11078 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, sk),
11079 si->dst_reg, si->src_reg,
11080 offsetof(struct sk_msg, sk));
11081 break;
11082 }
11083
11084 return insn - insn_buf;
11085 }
11086
11087 const struct bpf_verifier_ops sk_filter_verifier_ops = {
11088 .get_func_proto = sk_filter_func_proto,
11089 .is_valid_access = sk_filter_is_valid_access,
11090 .convert_ctx_access = bpf_convert_ctx_access,
11091 .gen_ld_abs = bpf_gen_ld_abs,
11092 };
11093
11094 const struct bpf_prog_ops sk_filter_prog_ops = {
11095 .test_run = bpf_prog_test_run_skb,
11096 };
11097
11098 const struct bpf_verifier_ops tc_cls_act_verifier_ops = {
11099 .get_func_proto = tc_cls_act_func_proto,
11100 .is_valid_access = tc_cls_act_is_valid_access,
11101 .convert_ctx_access = tc_cls_act_convert_ctx_access,
11102 .gen_prologue = tc_cls_act_prologue,
11103 .gen_ld_abs = bpf_gen_ld_abs,
11104 .btf_struct_access = tc_cls_act_btf_struct_access,
11105 };
11106
11107 const struct bpf_prog_ops tc_cls_act_prog_ops = {
11108 .test_run = bpf_prog_test_run_skb,
11109 };
11110
11111 const struct bpf_verifier_ops xdp_verifier_ops = {
11112 .get_func_proto = xdp_func_proto,
11113 .is_valid_access = xdp_is_valid_access,
11114 .convert_ctx_access = xdp_convert_ctx_access,
11115 .gen_prologue = bpf_noop_prologue,
11116 .btf_struct_access = xdp_btf_struct_access,
11117 };
11118
11119 const struct bpf_prog_ops xdp_prog_ops = {
11120 .test_run = bpf_prog_test_run_xdp,
11121 };
11122
11123 const struct bpf_verifier_ops cg_skb_verifier_ops = {
11124 .get_func_proto = cg_skb_func_proto,
11125 .is_valid_access = cg_skb_is_valid_access,
11126 .convert_ctx_access = bpf_convert_ctx_access,
11127 };
11128
11129 const struct bpf_prog_ops cg_skb_prog_ops = {
11130 .test_run = bpf_prog_test_run_skb,
11131 };
11132
11133 const struct bpf_verifier_ops lwt_in_verifier_ops = {
11134 .get_func_proto = lwt_in_func_proto,
11135 .is_valid_access = lwt_is_valid_access,
11136 .convert_ctx_access = bpf_convert_ctx_access,
11137 };
11138
11139 const struct bpf_prog_ops lwt_in_prog_ops = {
11140 .test_run = bpf_prog_test_run_skb,
11141 };
11142
11143 const struct bpf_verifier_ops lwt_out_verifier_ops = {
11144 .get_func_proto = lwt_out_func_proto,
11145 .is_valid_access = lwt_is_valid_access,
11146 .convert_ctx_access = bpf_convert_ctx_access,
11147 };
11148
11149 const struct bpf_prog_ops lwt_out_prog_ops = {
11150 .test_run = bpf_prog_test_run_skb,
11151 };
11152
11153 const struct bpf_verifier_ops lwt_xmit_verifier_ops = {
11154 .get_func_proto = lwt_xmit_func_proto,
11155 .is_valid_access = lwt_is_valid_access,
11156 .convert_ctx_access = bpf_convert_ctx_access,
11157 .gen_prologue = tc_cls_act_prologue,
11158 };
11159
11160 const struct bpf_prog_ops lwt_xmit_prog_ops = {
11161 .test_run = bpf_prog_test_run_skb,
11162 };
11163
11164 const struct bpf_verifier_ops lwt_seg6local_verifier_ops = {
11165 .get_func_proto = lwt_seg6local_func_proto,
11166 .is_valid_access = lwt_is_valid_access,
11167 .convert_ctx_access = bpf_convert_ctx_access,
11168 };
11169
11170 const struct bpf_prog_ops lwt_seg6local_prog_ops = {
11171 };
11172
11173 const struct bpf_verifier_ops cg_sock_verifier_ops = {
11174 .get_func_proto = sock_filter_func_proto,
11175 .is_valid_access = sock_filter_is_valid_access,
11176 .convert_ctx_access = bpf_sock_convert_ctx_access,
11177 };
11178
11179 const struct bpf_prog_ops cg_sock_prog_ops = {
11180 };
11181
11182 const struct bpf_verifier_ops cg_sock_addr_verifier_ops = {
11183 .get_func_proto = sock_addr_func_proto,
11184 .is_valid_access = sock_addr_is_valid_access,
11185 .convert_ctx_access = sock_addr_convert_ctx_access,
11186 };
11187
11188 const struct bpf_prog_ops cg_sock_addr_prog_ops = {
11189 };
11190
11191 const struct bpf_verifier_ops sock_ops_verifier_ops = {
11192 .get_func_proto = sock_ops_func_proto,
11193 .is_valid_access = sock_ops_is_valid_access,
11194 .convert_ctx_access = sock_ops_convert_ctx_access,
11195 };
11196
11197 const struct bpf_prog_ops sock_ops_prog_ops = {
11198 };
11199
11200 const struct bpf_verifier_ops sk_skb_verifier_ops = {
11201 .get_func_proto = sk_skb_func_proto,
11202 .is_valid_access = sk_skb_is_valid_access,
11203 .convert_ctx_access = sk_skb_convert_ctx_access,
11204 .gen_prologue = sk_skb_prologue,
11205 };
11206
11207 const struct bpf_prog_ops sk_skb_prog_ops = {
11208 };
11209
11210 const struct bpf_verifier_ops sk_msg_verifier_ops = {
11211 .get_func_proto = sk_msg_func_proto,
11212 .is_valid_access = sk_msg_is_valid_access,
11213 .convert_ctx_access = sk_msg_convert_ctx_access,
11214 .gen_prologue = bpf_noop_prologue,
11215 };
11216
11217 const struct bpf_prog_ops sk_msg_prog_ops = {
11218 };
11219
11220 const struct bpf_verifier_ops flow_dissector_verifier_ops = {
11221 .get_func_proto = flow_dissector_func_proto,
11222 .is_valid_access = flow_dissector_is_valid_access,
11223 .convert_ctx_access = flow_dissector_convert_ctx_access,
11224 };
11225
11226 const struct bpf_prog_ops flow_dissector_prog_ops = {
11227 .test_run = bpf_prog_test_run_flow_dissector,
11228 };
11229
11230 int sk_detach_filter(struct sock *sk)
11231 {
11232 int ret = -ENOENT;
11233 struct sk_filter *filter;
11234
11235 if (sock_flag(sk, SOCK_FILTER_LOCKED))
11236 return -EPERM;
11237
11238 filter = rcu_dereference_protected(sk->sk_filter,
11239 lockdep_sock_is_held(sk));
11240 if (filter) {
11241 RCU_INIT_POINTER(sk->sk_filter, NULL);
11242 sk_filter_uncharge(sk, filter);
11243 ret = 0;
11244 }
11245
11246 return ret;
11247 }
11248 EXPORT_SYMBOL_GPL(sk_detach_filter);
11249
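/* Dump the originally attached classic filter, used e.g. by
 * getsockopt(SO_GET_FILTER): with @len == 0 only the number of filter
 * blocks is returned, otherwise the sock_filter array is copied to @optval
 * provided @len covers it. Programs attached as eBPF only (no saved
 * original) cannot be dumped and yield -EACCES.
 *
 * A rough user-space sketch of the usual two-step call (illustrative only,
 * error handling omitted):
 *
 *	socklen_t optlen = 0;
 *	getsockopt(fd, SOL_SOCKET, SO_GET_FILTER, NULL, &optlen);
 *	struct sock_filter *insns = calloc(optlen, sizeof(*insns));
 *	getsockopt(fd, SOL_SOCKET, SO_GET_FILTER, insns, &optlen);
 */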
11250 int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len)
11251 {
11252 struct sock_fprog_kern *fprog;
11253 struct sk_filter *filter;
11254 int ret = 0;
11255
11256 sockopt_lock_sock(sk);
11257 filter = rcu_dereference_protected(sk->sk_filter,
11258 lockdep_sock_is_held(sk));
11259 if (!filter)
11260 goto out;
11261
11262 /* We're copying the filter that was originally attached,
11263 * so no conversion/decode is needed anymore. eBPF programs that
11264 * have no original program cannot be dumped through this.
11265 */
11266 ret = -EACCES;
11267 fprog = filter->prog->orig_prog;
11268 if (!fprog)
11269 goto out;
11270
11271 ret = fprog->len;
11272 if (!len)
11273 /* User space only asks for the number of filter blocks. */
11274 goto out;
11275
11276 ret = -EINVAL;
11277 if (len < fprog->len)
11278 goto out;
11279
11280 ret = -EFAULT;
11281 if (copy_to_sockptr(optval, fprog->filter, bpf_classic_proglen(fprog)))
11282 goto out;
11283
11284 /* The API expects us to return the number of filter blocks
11285 * rather than the number of bytes.
11286 */
11287 ret = fprog->len;
11288 out:
11289 sockopt_release_sock(sk);
11290 return ret;
11291 }
11292
11293 #ifdef CONFIG_INET
11294 static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern,
11295 struct sock_reuseport *reuse,
11296 struct sock *sk, struct sk_buff *skb,
11297 struct sock *migrating_sk,
11298 u32 hash)
11299 {
11300 reuse_kern->skb = skb;
11301 reuse_kern->sk = sk;
11302 reuse_kern->selected_sk = NULL;
11303 reuse_kern->migrating_sk = migrating_sk;
11304 reuse_kern->data_end = skb->data + skb_headlen(skb);
11305 reuse_kern->hash = hash;
11306 reuse_kern->reuseport_id = reuse->reuseport_id;
11307 reuse_kern->bind_inany = reuse->bind_inany;
11308 }
11309
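/* Run the attached SK_REUSEPORT program against @skb. On SK_PASS the socket
 * selected via the bpf_sk_select_reuseport() helper (reuse_kern.selected_sk,
 * which may be NULL when the program made no explicit selection) is
 * returned; any other verdict becomes ERR_PTR(-ECONNREFUSED).
 */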
11310 struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
11311 struct bpf_prog *prog, struct sk_buff *skb,
11312 struct sock *migrating_sk,
11313 u32 hash)
11314 {
11315 struct sk_reuseport_kern reuse_kern;
11316 enum sk_action action;
11317
11318 bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, migrating_sk, hash);
11319 action = bpf_prog_run(prog, &reuse_kern);
11320
11321 if (action == SK_PASS)
11322 return reuse_kern.selected_sk;
11323 else
11324 return ERR_PTR(-ECONNREFUSED);
11325 }
11326
11327 BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern,
11328 struct bpf_map *, map, void *, key, u32, flags)
11329 {
11330 bool is_sockarray = map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY;
11331 struct sock_reuseport *reuse;
11332 struct sock *selected_sk;
11333 int err;
11334
11335 selected_sk = map->ops->map_lookup_elem(map, key);
11336 if (!selected_sk)
11337 return -ENOENT;
11338
11339 reuse = rcu_dereference(selected_sk->sk_reuseport_cb);
11340 if (!reuse) {
11341 /* reuseport_array only holds sockets with a non-NULL sk_reuseport_cb.
11342 * The only (!reuse) case here is that the sk has already been
11343 * unhashed (e.g. by close()), so treat it as -ENOENT.
11344 *
11345 * Other maps (e.g. sock_map) do not provide this guarantee and
11346 * the sk may never be in the reuseport group to begin with.
11347 */
11348 err = is_sockarray ? -ENOENT : -EINVAL;
11349 goto error;
11350 }
11351
11352 if (unlikely(reuse->reuseport_id != reuse_kern->reuseport_id)) {
11353 struct sock *sk = reuse_kern->sk;
11354
11355 if (sk->sk_protocol != selected_sk->sk_protocol) {
11356 err = -EPROTOTYPE;
11357 } else if (sk->sk_family != selected_sk->sk_family) {
11358 err = -EAFNOSUPPORT;
11359 } else {
11360 /* Catch all. Likely bound to a different sockaddr. */
11361 err = -EBADFD;
11362 }
11363 goto error;
11364 }
11365
11366 reuse_kern->selected_sk = selected_sk;
11367
11368 return 0;
11369 error:
11370 /* Lookup in sock_map can return TCP ESTABLISHED sockets. */
11371 if (sk_is_refcounted(selected_sk))
11372 sock_put(selected_sk);
11373
11374 return err;
11375 }
11376
11377 static const struct bpf_func_proto sk_select_reuseport_proto = {
11378 .func = sk_select_reuseport,
11379 .gpl_only = false,
11380 .ret_type = RET_INTEGER,
11381 .arg1_type = ARG_PTR_TO_CTX,
11382 .arg2_type = ARG_CONST_MAP_PTR,
11383 .arg3_type = ARG_PTR_TO_MAP_KEY,
11384 .arg4_type = ARG_ANYTHING,
11385 };
11386
11387 BPF_CALL_4(sk_reuseport_load_bytes,
11388 const struct sk_reuseport_kern *, reuse_kern, u32, offset,
11389 void *, to, u32, len)
11390 {
11391 return ____bpf_skb_load_bytes(reuse_kern->skb, offset, to, len);
11392 }
11393
11394 static const struct bpf_func_proto sk_reuseport_load_bytes_proto = {
11395 .func = sk_reuseport_load_bytes,
11396 .gpl_only = false,
11397 .ret_type = RET_INTEGER,
11398 .arg1_type = ARG_PTR_TO_CTX,
11399 .arg2_type = ARG_ANYTHING,
11400 .arg3_type = ARG_PTR_TO_UNINIT_MEM,
11401 .arg4_type = ARG_CONST_SIZE,
11402 };
11403
11404 BPF_CALL_5(sk_reuseport_load_bytes_relative,
11405 const struct sk_reuseport_kern *, reuse_kern, u32, offset,
11406 void *, to, u32, len, u32, start_header)
11407 {
11408 return ____bpf_skb_load_bytes_relative(reuse_kern->skb, offset, to,
11409 len, start_header);
11410 }
11411
11412 static const struct bpf_func_proto sk_reuseport_load_bytes_relative_proto = {
11413 .func = sk_reuseport_load_bytes_relative,
11414 .gpl_only = false,
11415 .ret_type = RET_INTEGER,
11416 .arg1_type = ARG_PTR_TO_CTX,
11417 .arg2_type = ARG_ANYTHING,
11418 .arg3_type = ARG_PTR_TO_UNINIT_MEM,
11419 .arg4_type = ARG_CONST_SIZE,
11420 .arg5_type = ARG_ANYTHING,
11421 };
11422
11423 static const struct bpf_func_proto *
11424 sk_reuseport_func_proto(enum bpf_func_id func_id,
11425 const struct bpf_prog *prog)
11426 {
11427 switch (func_id) {
11428 case BPF_FUNC_sk_select_reuseport:
11429 return &sk_select_reuseport_proto;
11430 case BPF_FUNC_skb_load_bytes:
11431 return &sk_reuseport_load_bytes_proto;
11432 case BPF_FUNC_skb_load_bytes_relative:
11433 return &sk_reuseport_load_bytes_relative_proto;
11434 case BPF_FUNC_get_socket_cookie:
11435 return &bpf_get_socket_ptr_cookie_proto;
11436 case BPF_FUNC_ktime_get_coarse_ns:
11437 return &bpf_ktime_get_coarse_ns_proto;
11438 default:
11439 return bpf_base_func_proto(func_id, prog);
11440 }
11441 }
11442
11443 static bool
11444 sk_reuseport_is_valid_access(int off, int size,
11445 enum bpf_access_type type,
11446 const struct bpf_prog *prog,
11447 struct bpf_insn_access_aux *info)
11448 {
11449 const u32 size_default = sizeof(__u32);
11450
11451 if (off < 0 || off >= sizeof(struct sk_reuseport_md) ||
11452 off % size || type != BPF_READ)
11453 return false;
11454
11455 switch (off) {
11456 case offsetof(struct sk_reuseport_md, data):
11457 info->reg_type = PTR_TO_PACKET;
11458 return size == sizeof(__u64);
11459
11460 case offsetof(struct sk_reuseport_md, data_end):
11461 info->reg_type = PTR_TO_PACKET_END;
11462 return size == sizeof(__u64);
11463
11464 case offsetof(struct sk_reuseport_md, hash):
11465 return size == size_default;
11466
11467 case offsetof(struct sk_reuseport_md, sk):
11468 info->reg_type = PTR_TO_SOCKET;
11469 return size == sizeof(__u64);
11470
11471 case offsetof(struct sk_reuseport_md, migrating_sk):
11472 info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL;
11473 return size == sizeof(__u64);
11474
11475 /* Fields that allow narrowing */
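/* A load narrower than 4 bytes (e.g. a 1-byte read of ip_protocol) is
 * accepted here; bpf_ctx_record_field_size() records the full field size
 * and the verifier later rewrites the narrow load against it. eth_protocol
 * must cover at least the 2 bytes of skb->protocol.
 */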
11476 case bpf_ctx_range(struct sk_reuseport_md, eth_protocol):
11477 if (size < sizeof_field(struct sk_buff, protocol))
11478 return false;
11479 fallthrough;
11480 case bpf_ctx_range(struct sk_reuseport_md, ip_protocol):
11481 case bpf_ctx_range(struct sk_reuseport_md, bind_inany):
11482 case bpf_ctx_range(struct sk_reuseport_md, len):
11483 bpf_ctx_record_field_size(info, size_default);
11484 return bpf_ctx_narrow_access_ok(off, size, size_default);
11485
11486 default:
11487 return false;
11488 }
11489 }
11490
11491 #define SK_REUSEPORT_LOAD_FIELD(F) ({ \
11492 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_reuseport_kern, F), \
11493 si->dst_reg, si->src_reg, \
11494 bpf_target_off(struct sk_reuseport_kern, F, \
11495 sizeof_field(struct sk_reuseport_kern, F), \
11496 target_size)); \
11497 })
11498
11499 #define SK_REUSEPORT_LOAD_SKB_FIELD(SKB_FIELD) \
11500 SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern, \
11501 struct sk_buff, \
11502 skb, \
11503 SKB_FIELD)
11504
11505 #define SK_REUSEPORT_LOAD_SK_FIELD(SK_FIELD) \
11506 SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern, \
11507 struct sock, \
11508 sk, \
11509 SK_FIELD)
11510
11511 static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type,
11512 const struct bpf_insn *si,
11513 struct bpf_insn *insn_buf,
11514 struct bpf_prog *prog,
11515 u32 *target_size)
11516 {
11517 struct bpf_insn *insn = insn_buf;
11518
11519 switch (si->off) {
11520 case offsetof(struct sk_reuseport_md, data):
11521 SK_REUSEPORT_LOAD_SKB_FIELD(data);
11522 break;
11523
11524 case offsetof(struct sk_reuseport_md, len):
11525 SK_REUSEPORT_LOAD_SKB_FIELD(len);
11526 break;
11527
11528 case offsetof(struct sk_reuseport_md, eth_protocol):
11529 SK_REUSEPORT_LOAD_SKB_FIELD(protocol);
11530 break;
11531
11532 case offsetof(struct sk_reuseport_md, ip_protocol):
11533 SK_REUSEPORT_LOAD_SK_FIELD(sk_protocol);
11534 break;
11535
11536 case offsetof(struct sk_reuseport_md, data_end):
11537 SK_REUSEPORT_LOAD_FIELD(data_end);
11538 break;
11539
11540 case offsetof(struct sk_reuseport_md, hash):
11541 SK_REUSEPORT_LOAD_FIELD(hash);
11542 break;
11543
11544 case offsetof(struct sk_reuseport_md, bind_inany):
11545 SK_REUSEPORT_LOAD_FIELD(bind_inany);
11546 break;
11547
11548 case offsetof(struct sk_reuseport_md, sk):
11549 SK_REUSEPORT_LOAD_FIELD(sk);
11550 break;
11551
11552 case offsetof(struct sk_reuseport_md, migrating_sk):
11553 SK_REUSEPORT_LOAD_FIELD(migrating_sk);
11554 break;
11555 }
11556
11557 return insn - insn_buf;
11558 }
11559
11560 const struct bpf_verifier_ops sk_reuseport_verifier_ops = {
11561 .get_func_proto = sk_reuseport_func_proto,
11562 .is_valid_access = sk_reuseport_is_valid_access,
11563 .convert_ctx_access = sk_reuseport_convert_ctx_access,
11564 };
11565
11566 const struct bpf_prog_ops sk_reuseport_prog_ops = {
11567 };
11568
11569 DEFINE_STATIC_KEY_FALSE(bpf_sk_lookup_enabled);
11570 EXPORT_SYMBOL(bpf_sk_lookup_enabled);
11571
11572 BPF_CALL_3(bpf_sk_lookup_assign, struct bpf_sk_lookup_kern *, ctx,
11573 struct sock *, sk, u64, flags)
11574 {
11575 if (unlikely(flags & ~(BPF_SK_LOOKUP_F_REPLACE |
11576 BPF_SK_LOOKUP_F_NO_REUSEPORT)))
11577 return -EINVAL;
11578 if (unlikely(sk && sk_is_refcounted(sk)))
11579 return -ESOCKTNOSUPPORT; /* reject non-RCU freed sockets */
11580 if (unlikely(sk && sk_is_tcp(sk) && sk->sk_state != TCP_LISTEN))
11581 return -ESOCKTNOSUPPORT; /* only accept TCP socket in LISTEN */
11582 if (unlikely(sk && sk_is_udp(sk) && sk->sk_state != TCP_CLOSE))
11583 return -ESOCKTNOSUPPORT; /* only accept UDP socket in CLOSE */
11584
11585 /* Check if socket is suitable for packet L3/L4 protocol */
11586 if (sk && sk->sk_protocol != ctx->protocol)
11587 return -EPROTOTYPE;
11588 if (sk && sk->sk_family != ctx->family &&
11589 (sk->sk_family == AF_INET || ipv6_only_sock(sk)))
11590 return -EAFNOSUPPORT;
11591
11592 if (ctx->selected_sk && !(flags & BPF_SK_LOOKUP_F_REPLACE))
11593 return -EEXIST;
11594
11595 /* Select socket as lookup result */
11596 ctx->selected_sk = sk;
11597 ctx->no_reuseport = flags & BPF_SK_LOOKUP_F_NO_REUSEPORT;
11598 return 0;
11599 }
11600
11601 static const struct bpf_func_proto bpf_sk_lookup_assign_proto = {
11602 .func = bpf_sk_lookup_assign,
11603 .gpl_only = false,
11604 .ret_type = RET_INTEGER,
11605 .arg1_type = ARG_PTR_TO_CTX,
11606 .arg2_type = ARG_PTR_TO_SOCKET_OR_NULL,
11607 .arg3_type = ARG_ANYTHING,
11608 };
11609
11610 static const struct bpf_func_proto *
11611 sk_lookup_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
11612 {
11613 switch (func_id) {
11614 case BPF_FUNC_perf_event_output:
11615 return &bpf_event_output_data_proto;
11616 case BPF_FUNC_sk_assign:
11617 return &bpf_sk_lookup_assign_proto;
11618 case BPF_FUNC_sk_release:
11619 return &bpf_sk_release_proto;
11620 default:
11621 return bpf_sk_base_func_proto(func_id, prog);
11622 }
11623 }
11624
11625 static bool sk_lookup_is_valid_access(int off, int size,
11626 enum bpf_access_type type,
11627 const struct bpf_prog *prog,
11628 struct bpf_insn_access_aux *info)
11629 {
11630 if (off < 0 || off >= sizeof(struct bpf_sk_lookup))
11631 return false;
11632 if (off % size != 0)
11633 return false;
11634 if (type != BPF_READ)
11635 return false;
11636
11637 switch (off) {
11638 case bpf_ctx_range_ptr(struct bpf_sk_lookup, sk):
11639 info->reg_type = PTR_TO_SOCKET_OR_NULL;
11640 return size == sizeof(__u64);
11641
11642 case bpf_ctx_range(struct bpf_sk_lookup, family):
11643 case bpf_ctx_range(struct bpf_sk_lookup, protocol):
11644 case bpf_ctx_range(struct bpf_sk_lookup, remote_ip4):
11645 case bpf_ctx_range(struct bpf_sk_lookup, local_ip4):
11646 case bpf_ctx_range_till(struct bpf_sk_lookup, remote_ip6[0], remote_ip6[3]):
11647 case bpf_ctx_range_till(struct bpf_sk_lookup, local_ip6[0], local_ip6[3]):
11648 case bpf_ctx_range(struct bpf_sk_lookup, local_port):
11649 case bpf_ctx_range(struct bpf_sk_lookup, ingress_ifindex):
11650 bpf_ctx_record_field_size(info, sizeof(__u32));
11651 return bpf_ctx_narrow_access_ok(off, size, sizeof(__u32));
11652
11653 case bpf_ctx_range(struct bpf_sk_lookup, remote_port):
11654 /* Allow 4-byte access to 2-byte field for backward compatibility */
11655 if (size == sizeof(__u32))
11656 return true;
11657 bpf_ctx_record_field_size(info, sizeof(__be16));
11658 return bpf_ctx_narrow_access_ok(off, size, sizeof(__be16));
11659
11660 case offsetofend(struct bpf_sk_lookup, remote_port) ...
11661 offsetof(struct bpf_sk_lookup, local_ip4) - 1:
11662 /* Allow access to zero padding for backward compatibility */
11663 bpf_ctx_record_field_size(info, sizeof(__u16));
11664 return bpf_ctx_narrow_access_ok(off, size, sizeof(__u16));
11665
11666 default:
11667 return false;
11668 }
11669 }
11670
11671 static u32 sk_lookup_convert_ctx_access(enum bpf_access_type type,
11672 const struct bpf_insn *si,
11673 struct bpf_insn *insn_buf,
11674 struct bpf_prog *prog,
11675 u32 *target_size)
11676 {
11677 struct bpf_insn *insn = insn_buf;
11678
11679 switch (si->off) {
11680 case offsetof(struct bpf_sk_lookup, sk):
11681 *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg,
11682 offsetof(struct bpf_sk_lookup_kern, selected_sk));
11683 break;
11684
11685 case offsetof(struct bpf_sk_lookup, family):
11686 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
11687 bpf_target_off(struct bpf_sk_lookup_kern,
11688 family, 2, target_size));
11689 break;
11690
11691 case offsetof(struct bpf_sk_lookup, protocol):
11692 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
11693 bpf_target_off(struct bpf_sk_lookup_kern,
11694 protocol, 2, target_size));
11695 break;
11696
11697 case offsetof(struct bpf_sk_lookup, remote_ip4):
11698 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
11699 bpf_target_off(struct bpf_sk_lookup_kern,
11700 v4.saddr, 4, target_size));
11701 break;
11702
11703 case offsetof(struct bpf_sk_lookup, local_ip4):
11704 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
11705 bpf_target_off(struct bpf_sk_lookup_kern,
11706 v4.daddr, 4, target_size));
11707 break;
11708
11709 case bpf_ctx_range_till(struct bpf_sk_lookup,
11710 remote_ip6[0], remote_ip6[3]): {
11711 #if IS_ENABLED(CONFIG_IPV6)
11712 int off = si->off;
11713
11714 off -= offsetof(struct bpf_sk_lookup, remote_ip6[0]);
11715 off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size);
11716 *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg,
11717 offsetof(struct bpf_sk_lookup_kern, v6.saddr));
11718 *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
11719 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off);
11720 #else
11721 *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
11722 #endif
11723 break;
11724 }
11725 case bpf_ctx_range_till(struct bpf_sk_lookup,
11726 local_ip6[0], local_ip6[3]): {
11727 #if IS_ENABLED(CONFIG_IPV6)
11728 int off = si->off;
11729
11730 off -= offsetof(struct bpf_sk_lookup, local_ip6[0]);
11731 off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size);
11732 *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg,
11733 offsetof(struct bpf_sk_lookup_kern, v6.daddr));
11734 *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
11735 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off);
11736 #else
11737 *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
11738 #endif
11739 break;
11740 }
11741 case offsetof(struct bpf_sk_lookup, remote_port):
11742 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
11743 bpf_target_off(struct bpf_sk_lookup_kern,
11744 sport, 2, target_size));
11745 break;
11746
11747 case offsetofend(struct bpf_sk_lookup, remote_port):
11748 *target_size = 2;
11749 *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
11750 break;
11751
11752 case offsetof(struct bpf_sk_lookup, local_port):
11753 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
11754 bpf_target_off(struct bpf_sk_lookup_kern,
11755 dport, 2, target_size));
11756 break;
11757
11758 case offsetof(struct bpf_sk_lookup, ingress_ifindex):
11759 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
11760 bpf_target_off(struct bpf_sk_lookup_kern,
11761 ingress_ifindex, 4, target_size));
11762 break;
11763 }
11764
11765 return insn - insn_buf;
11766 }
11767
11768 const struct bpf_prog_ops sk_lookup_prog_ops = {
11769 .test_run = bpf_prog_test_run_sk_lookup,
11770 };
11771
11772 const struct bpf_verifier_ops sk_lookup_verifier_ops = {
11773 .get_func_proto = sk_lookup_func_proto,
11774 .is_valid_access = sk_lookup_is_valid_access,
11775 .convert_ctx_access = sk_lookup_convert_ctx_access,
11776 };
11777
11778 #endif /* CONFIG_INET */
11779
11780 DEFINE_BPF_DISPATCHER(xdp)
11781
11782 void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog)
11783 {
11784 bpf_dispatcher_change_prog(BPF_DISPATCHER_PTR(xdp), prev_prog, prog);
11785 }
11786
11787 BTF_ID_LIST_GLOBAL(btf_sock_ids, MAX_BTF_SOCK_TYPE)
11788 #define BTF_SOCK_TYPE(name, type) BTF_ID(struct, type)
11789 BTF_SOCK_TYPE_xxx
11790 #undef BTF_SOCK_TYPE
11791
11792 BPF_CALL_1(bpf_skc_to_tcp6_sock, struct sock *, sk)
11793 {
11794 /* The tcp6_sock type is not generated in DWARF and hence not in BTF,
11795 * so trigger an explicit type generation here.
11796 */
11797 BTF_TYPE_EMIT(struct tcp6_sock);
11798 if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP &&
11799 sk->sk_family == AF_INET6)
11800 return (unsigned long)sk;
11801
11802 return (unsigned long)NULL;
11803 }
11804
11805 const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto = {
11806 .func = bpf_skc_to_tcp6_sock,
11807 .gpl_only = false,
11808 .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
11809 .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
11810 .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP6],
11811 };
11812
11813 BPF_CALL_1(bpf_skc_to_tcp_sock, struct sock *, sk)
11814 {
11815 if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP)
11816 return (unsigned long)sk;
11817
11818 return (unsigned long)NULL;
11819 }
11820
11821 const struct bpf_func_proto bpf_skc_to_tcp_sock_proto = {
11822 .func = bpf_skc_to_tcp_sock,
11823 .gpl_only = false,
11824 .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
11825 .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
11826 .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP],
11827 };
11828
11829 BPF_CALL_1(bpf_skc_to_tcp_timewait_sock, struct sock *, sk)
11830 {
11831 /* BTF types for tcp_timewait_sock and inet_timewait_sock are not
11832 * generated if CONFIG_INET=n. Trigger an explicit generation here.
11833 */
11834 BTF_TYPE_EMIT(struct inet_timewait_sock);
11835 BTF_TYPE_EMIT(struct tcp_timewait_sock);
11836
11837 #ifdef CONFIG_INET
11838 if (sk && sk->sk_prot == &tcp_prot && sk->sk_state == TCP_TIME_WAIT)
11839 return (unsigned long)sk;
11840 #endif
11841
11842 #if IS_BUILTIN(CONFIG_IPV6)
11843 if (sk && sk->sk_prot == &tcpv6_prot && sk->sk_state == TCP_TIME_WAIT)
11844 return (unsigned long)sk;
11845 #endif
11846
11847 return (unsigned long)NULL;
11848 }
11849
11850 const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto = {
11851 .func = bpf_skc_to_tcp_timewait_sock,
11852 .gpl_only = false,
11853 .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
11854 .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
11855 .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_TW],
11856 };
11857
11858 BPF_CALL_1(bpf_skc_to_tcp_request_sock, struct sock *, sk)
11859 {
11860 #ifdef CONFIG_INET
11861 if (sk && sk->sk_prot == &tcp_prot && sk->sk_state == TCP_NEW_SYN_RECV)
11862 return (unsigned long)sk;
11863 #endif
11864
11865 #if IS_BUILTIN(CONFIG_IPV6)
11866 if (sk && sk->sk_prot == &tcpv6_prot && sk->sk_state == TCP_NEW_SYN_RECV)
11867 return (unsigned long)sk;
11868 #endif
11869
11870 return (unsigned long)NULL;
11871 }
11872
11873 const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto = {
11874 .func = bpf_skc_to_tcp_request_sock,
11875 .gpl_only = false,
11876 .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
11877 .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
11878 .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_REQ],
11879 };
11880
11881 BPF_CALL_1(bpf_skc_to_udp6_sock, struct sock *, sk)
11882 {
11883 /* The udp6_sock type is not generated in DWARF and hence not in BTF,
11884 * so trigger an explicit type generation here.
11885 */
11886 BTF_TYPE_EMIT(struct udp6_sock);
11887 if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_UDP &&
11888 sk->sk_type == SOCK_DGRAM && sk->sk_family == AF_INET6)
11889 return (unsigned long)sk;
11890
11891 return (unsigned long)NULL;
11892 }
11893
11894 const struct bpf_func_proto bpf_skc_to_udp6_sock_proto = {
11895 .func = bpf_skc_to_udp6_sock,
11896 .gpl_only = false,
11897 .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
11898 .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
11899 .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_UDP6],
11900 };
11901
11902 BPF_CALL_1(bpf_skc_to_unix_sock, struct sock *, sk)
11903 {
11904 /* The unix_sock type is not generated in DWARF and hence not in BTF,
11905 * so trigger an explicit type generation here.
11906 */
11907 BTF_TYPE_EMIT(struct unix_sock);
11908 if (sk && sk_fullsock(sk) && sk->sk_family == AF_UNIX)
11909 return (unsigned long)sk;
11910
11911 return (unsigned long)NULL;
11912 }
11913
11914 const struct bpf_func_proto bpf_skc_to_unix_sock_proto = {
11915 .func = bpf_skc_to_unix_sock,
11916 .gpl_only = false,
11917 .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
11918 .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
11919 .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_UNIX],
11920 };
11921
11922 BPF_CALL_1(bpf_skc_to_mptcp_sock, struct sock *, sk)
11923 {
11924 BTF_TYPE_EMIT(struct mptcp_sock);
11925 return (unsigned long)bpf_mptcp_sock_from_subflow(sk);
11926 }
11927
11928 const struct bpf_func_proto bpf_skc_to_mptcp_sock_proto = {
11929 .func = bpf_skc_to_mptcp_sock,
11930 .gpl_only = false,
11931 .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
11932 .arg1_type = ARG_PTR_TO_SOCK_COMMON,
11933 .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_MPTCP],
11934 };
11935
11936 BPF_CALL_1(bpf_sock_from_file, struct file *, file)
11937 {
11938 return (unsigned long)sock_from_file(file);
11939 }
11940
11941 BTF_ID_LIST(bpf_sock_from_file_btf_ids)
11942 BTF_ID(struct, socket)
11943 BTF_ID(struct, file)
11944
11945 const struct bpf_func_proto bpf_sock_from_file_proto = {
11946 .func = bpf_sock_from_file,
11947 .gpl_only = false,
11948 .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
11949 .ret_btf_id = &bpf_sock_from_file_btf_ids[0],
11950 .arg1_type = ARG_PTR_TO_BTF_ID,
11951 .arg1_btf_id = &bpf_sock_from_file_btf_ids[1],
11952 };
11953
11954 static const struct bpf_func_proto *
11955 bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
11956 {
11957 const struct bpf_func_proto *func;
11958
11959 switch (func_id) {
11960 case BPF_FUNC_skc_to_tcp6_sock:
11961 func = &bpf_skc_to_tcp6_sock_proto;
11962 break;
11963 case BPF_FUNC_skc_to_tcp_sock:
11964 func = &bpf_skc_to_tcp_sock_proto;
11965 break;
11966 case BPF_FUNC_skc_to_tcp_timewait_sock:
11967 func = &bpf_skc_to_tcp_timewait_sock_proto;
11968 break;
11969 case BPF_FUNC_skc_to_tcp_request_sock:
11970 func = &bpf_skc_to_tcp_request_sock_proto;
11971 break;
11972 case BPF_FUNC_skc_to_udp6_sock:
11973 func = &bpf_skc_to_udp6_sock_proto;
11974 break;
11975 case BPF_FUNC_skc_to_unix_sock:
11976 func = &bpf_skc_to_unix_sock_proto;
11977 break;
11978 case BPF_FUNC_skc_to_mptcp_sock:
11979 func = &bpf_skc_to_mptcp_sock_proto;
11980 break;
11981 case BPF_FUNC_ktime_get_coarse_ns:
11982 return &bpf_ktime_get_coarse_ns_proto;
11983 default:
11984 return bpf_base_func_proto(func_id, prog);
11985 }
11986
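/* The skc_to_*() casts above all return PTR_TO_BTF_ID pointers, so they are
 * additionally gated on CAP_PERFMON, checked against the program's token.
 */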
11987 if (!bpf_token_capable(prog->aux->token, CAP_PERFMON))
11988 return NULL;
11989
11990 return func;
11991 }
11992
11993 __bpf_kfunc_start_defs();
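/* kfunc: set up a dynptr over the skb. @flags must currently be zero and the
 * dynptr is set to null on failure. A rough BPF-side usage sketch
 * (illustrative only, assuming the generic dynptr read helper):
 *
 *	struct bpf_dynptr ptr;
 *	__u8 buf[16];
 *
 *	if (!bpf_dynptr_from_skb(skb, 0, &ptr))
 *		bpf_dynptr_read(buf, sizeof(buf), &ptr, 0, 0);
 */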
11994 __bpf_kfunc int bpf_dynptr_from_skb(struct __sk_buff *s, u64 flags,
11995 struct bpf_dynptr *ptr__uninit)
11996 {
11997 struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit;
11998 struct sk_buff *skb = (struct sk_buff *)s;
11999
12000 if (flags) {
12001 bpf_dynptr_set_null(ptr);
12002 return -EINVAL;
12003 }
12004
12005 bpf_dynptr_init(ptr, skb, BPF_DYNPTR_TYPE_SKB, 0, skb->len);
12006
12007 return 0;
12008 }
12009
12010 __bpf_kfunc int bpf_dynptr_from_xdp(struct xdp_md *x, u64 flags,
12011 struct bpf_dynptr *ptr__uninit)
12012 {
12013 struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit;
12014 struct xdp_buff *xdp = (struct xdp_buff *)x;
12015
12016 if (flags) {
12017 bpf_dynptr_set_null(ptr);
12018 return -EINVAL;
12019 }
12020
12021 bpf_dynptr_init(ptr, xdp, BPF_DYNPTR_TYPE_XDP, 0, xdp_get_buff_len(xdp));
12022
12023 return 0;
12024 }
12025
12026 __bpf_kfunc int bpf_sock_addr_set_sun_path(struct bpf_sock_addr_kern *sa_kern,
12027 const u8 *sun_path, u32 sun_path__sz)
12028 {
12029 struct sockaddr_un *un;
12030
12031 if (sa_kern->sk->sk_family != AF_UNIX)
12032 return -EINVAL;
12033
12034 /* We do not allow changing the address to an unnamed one, nor to one
12035 * larger than the maximum allowed address size for a unix sockaddr.
12036 */
12037 if (sun_path__sz == 0 || sun_path__sz > UNIX_PATH_MAX)
12038 return -EINVAL;
12039
12040 un = (struct sockaddr_un *)sa_kern->uaddr;
12041 memcpy(un->sun_path, sun_path, sun_path__sz);
12042 sa_kern->uaddrlen = offsetof(struct sockaddr_un, sun_path) + sun_path__sz;
12043
12044 return 0;
12045 }
12046
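/* kfunc for tc ingress programs that accept TCP SYN cookies in BPF: validate
 * @attrs against the listener @sk and the netns sysctls, allocate a request
 * socket pre-filled from @attrs, and attach it to @s so that the regular TCP
 * receive path can complete the handshake for this cookie ACK. Only plain
 * (non-MPTCP) TCP listeners in the skb's netns are accepted; without
 * CONFIG_SYN_COOKIES the kfunc returns -EOPNOTSUPP.
 */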
12047 __bpf_kfunc int bpf_sk_assign_tcp_reqsk(struct __sk_buff *s, struct sock *sk,
12048 struct bpf_tcp_req_attrs *attrs, int attrs__sz)
12049 {
12050 #if IS_ENABLED(CONFIG_SYN_COOKIES)
12051 struct sk_buff *skb = (struct sk_buff *)s;
12052 const struct request_sock_ops *ops;
12053 struct inet_request_sock *ireq;
12054 struct tcp_request_sock *treq;
12055 struct request_sock *req;
12056 struct net *net;
12057 __u16 min_mss;
12058 u32 tsoff = 0;
12059
12060 if (attrs__sz != sizeof(*attrs) ||
12061 attrs->reserved[0] || attrs->reserved[1] || attrs->reserved[2])
12062 return -EINVAL;
12063
12064 if (!skb_at_tc_ingress(skb))
12065 return -EINVAL;
12066
12067 net = dev_net(skb->dev);
12068 if (net != sock_net(sk))
12069 return -ENETUNREACH;
12070
12071 switch (skb->protocol) {
12072 case htons(ETH_P_IP):
12073 ops = &tcp_request_sock_ops;
12074 min_mss = 536;
12075 break;
12076 #if IS_BUILTIN(CONFIG_IPV6)
12077 case htons(ETH_P_IPV6):
12078 ops = &tcp6_request_sock_ops;
12079 min_mss = IPV6_MIN_MTU - 60;
12080 break;
12081 #endif
12082 default:
12083 return -EINVAL;
12084 }
12085
12086 if (sk->sk_type != SOCK_STREAM || sk->sk_state != TCP_LISTEN ||
12087 sk_is_mptcp(sk))
12088 return -EINVAL;
12089
12090 if (attrs->mss < min_mss)
12091 return -EINVAL;
12092
12093 if (attrs->wscale_ok) {
12094 if (!READ_ONCE(net->ipv4.sysctl_tcp_window_scaling))
12095 return -EINVAL;
12096
12097 if (attrs->snd_wscale > TCP_MAX_WSCALE ||
12098 attrs->rcv_wscale > TCP_MAX_WSCALE)
12099 return -EINVAL;
12100 }
12101
12102 if (attrs->sack_ok && !READ_ONCE(net->ipv4.sysctl_tcp_sack))
12103 return -EINVAL;
12104
12105 if (attrs->tstamp_ok) {
12106 if (!READ_ONCE(net->ipv4.sysctl_tcp_timestamps))
12107 return -EINVAL;
12108
12109 tsoff = attrs->rcv_tsecr - tcp_ns_to_ts(attrs->usec_ts_ok, tcp_clock_ns());
12110 }
12111
12112 req = inet_reqsk_alloc(ops, sk, false);
12113 if (!req)
12114 return -ENOMEM;
12115
12116 ireq = inet_rsk(req);
12117 treq = tcp_rsk(req);
12118
12119 req->rsk_listener = sk;
12120 req->syncookie = 1;
12121 req->mss = attrs->mss;
12122 req->ts_recent = attrs->rcv_tsval;
12123
12124 ireq->snd_wscale = attrs->snd_wscale;
12125 ireq->rcv_wscale = attrs->rcv_wscale;
12126 ireq->tstamp_ok = !!attrs->tstamp_ok;
12127 ireq->sack_ok = !!attrs->sack_ok;
12128 ireq->wscale_ok = !!attrs->wscale_ok;
12129 ireq->ecn_ok = !!attrs->ecn_ok;
12130
12131 treq->req_usec_ts = !!attrs->usec_ts_ok;
12132 treq->ts_off = tsoff;
12133
12134 skb_orphan(skb);
12135 skb->sk = req_to_sk(req);
12136 skb->destructor = sock_pfree;
12137
12138 return 0;
12139 #else
12140 return -EOPNOTSUPP;
12141 #endif
12142 }
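/*
 * Illustrative sketch only, loosely following the intended use: a tc ingress
 * program that has validated its own SYN cookie in an incoming ACK recovers
 * the request attributes and binds a request socket to the listener before
 * the packet reaches the stack. The parsing, the cookie format, the SEC()
 * name and the lookup/cast sequence are assumptions made for the example;
 * helper and kfunc extern declarations are omitted.
 *
 *	SEC("tc")
 *	int accept_cookie_ack(struct __sk_buff *skb)
 *	{
 *		struct bpf_tcp_req_attrs attrs = {};
 *		struct bpf_sock_tuple tuple = {};
 *		struct bpf_sock *skc;
 *		struct sock *sk;
 *
 *		// ...parse the ACK, check the custom cookie, then fill in
 *		// attrs.mss, attrs.wscale_ok/snd_wscale/rcv_wscale,
 *		// attrs.sack_ok, attrs.tstamp_ok/rcv_tsval/rcv_tsecr,
 *		// attrs.ecn_ok and the connection 4-tuple...
 *
 *		skc = bpf_skc_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4),
 *					 BPF_F_CURRENT_NETNS, 0);
 *		if (!skc)
 *			return 2;	// TC_ACT_SHOT
 *		sk = (struct sock *)bpf_skc_to_tcp_sock(skc);
 *		if (sk && sk->sk_state == TCP_LISTEN)
 *			bpf_sk_assign_tcp_reqsk(skb, sk, &attrs, sizeof(attrs));
 *		bpf_sk_release(skc);
 *		return 0;	// TC_ACT_OK
 *	}
 */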
12143
12144 __bpf_kfunc int bpf_sock_ops_enable_tx_tstamp(struct bpf_sock_ops_kern *skops,
12145 u64 flags)
12146 {
12147 struct sk_buff *skb;
12148
12149 if (skops->op != BPF_SOCK_OPS_TSTAMP_SENDMSG_CB)
12150 return -EOPNOTSUPP;
12151
12152 if (flags)
12153 return -EINVAL;
12154
12155 skb = skops->skb;
12156 skb_shinfo(skb)->tx_flags |= SKBTX_BPF;
12157 TCP_SKB_CB(skb)->txstamp_ack |= TSTAMP_ACK_BPF;
12158 skb_shinfo(skb)->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;
12159
12160 return 0;
12161 }
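/*
 * Illustrative sketch only: a sockops program that opts a sendmsg skb into
 * BPF-triggered TX timestamping from the BPF_SOCK_OPS_TSTAMP_SENDMSG_CB
 * callback. Obtaining the trusted bpf_sock_ops_kern pointer via
 * bpf_cast_to_kern_ctx() is an assumption of the example; flags must be zero.
 *
 *	SEC("sockops")
 *	int enable_tx_tstamp(struct bpf_sock_ops *skops)
 *	{
 *		struct bpf_sock_ops_kern *skops_kern;
 *
 *		if (skops->op != BPF_SOCK_OPS_TSTAMP_SENDMSG_CB)
 *			return 1;
 *
 *		skops_kern = bpf_cast_to_kern_ctx(skops);
 *		bpf_sock_ops_enable_tx_tstamp(skops_kern, 0);
 *		return 1;
 *	}
 */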
12162
12163 __bpf_kfunc_end_defs();
12164
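/* Read-only twin of bpf_dynptr_from_skb(): the verifier substitutes this
 * variant for program types that may not write skb data, so the resulting
 * dynptr is marked read-only.
 */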
12165 int bpf_dynptr_from_skb_rdonly(struct __sk_buff *skb, u64 flags,
12166 struct bpf_dynptr *ptr__uninit)
12167 {
12168 struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit;
12169 int err;
12170
12171 err = bpf_dynptr_from_skb(skb, flags, ptr__uninit);
12172 if (err)
12173 return err;
12174
12175 bpf_dynptr_set_rdonly(ptr);
12176
12177 return 0;
12178 }
12179
12180 BTF_KFUNCS_START(bpf_kfunc_check_set_skb)
12181 BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS)
12182 BTF_KFUNCS_END(bpf_kfunc_check_set_skb)
12183
12184 BTF_KFUNCS_START(bpf_kfunc_check_set_xdp)
12185 BTF_ID_FLAGS(func, bpf_dynptr_from_xdp)
12186 BTF_KFUNCS_END(bpf_kfunc_check_set_xdp)
12187
12188 BTF_KFUNCS_START(bpf_kfunc_check_set_sock_addr)
12189 BTF_ID_FLAGS(func, bpf_sock_addr_set_sun_path)
12190 BTF_KFUNCS_END(bpf_kfunc_check_set_sock_addr)
12191
12192 BTF_KFUNCS_START(bpf_kfunc_check_set_tcp_reqsk)
12193 BTF_ID_FLAGS(func, bpf_sk_assign_tcp_reqsk, KF_TRUSTED_ARGS)
12194 BTF_KFUNCS_END(bpf_kfunc_check_set_tcp_reqsk)
12195
12196 BTF_KFUNCS_START(bpf_kfunc_check_set_sock_ops)
12197 BTF_ID_FLAGS(func, bpf_sock_ops_enable_tx_tstamp, KF_TRUSTED_ARGS)
12198 BTF_KFUNCS_END(bpf_kfunc_check_set_sock_ops)
12199
12200 static const struct btf_kfunc_id_set bpf_kfunc_set_skb = {
12201 .owner = THIS_MODULE,
12202 .set = &bpf_kfunc_check_set_skb,
12203 };
12204
12205 static const struct btf_kfunc_id_set bpf_kfunc_set_xdp = {
12206 .owner = THIS_MODULE,
12207 .set = &bpf_kfunc_check_set_xdp,
12208 };
12209
12210 static const struct btf_kfunc_id_set bpf_kfunc_set_sock_addr = {
12211 .owner = THIS_MODULE,
12212 .set = &bpf_kfunc_check_set_sock_addr,
12213 };
12214
12215 static const struct btf_kfunc_id_set bpf_kfunc_set_tcp_reqsk = {
12216 .owner = THIS_MODULE,
12217 .set = &bpf_kfunc_check_set_tcp_reqsk,
12218 };
12219
12220 static const struct btf_kfunc_id_set bpf_kfunc_set_sock_ops = {
12221 .owner = THIS_MODULE,
12222 .set = &bpf_kfunc_check_set_sock_ops,
12223 };
12224
12225 static int __init bpf_kfunc_init(void)
12226 {
12227 int ret;
12228
12229 ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_skb);
12230 ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_ACT, &bpf_kfunc_set_skb);
12231 ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SK_SKB, &bpf_kfunc_set_skb);
12232 ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SOCKET_FILTER, &bpf_kfunc_set_skb);
12233 ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SKB, &bpf_kfunc_set_skb);
12234 ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_OUT, &bpf_kfunc_set_skb);
12235 ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_IN, &bpf_kfunc_set_skb);
12236 ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_XMIT, &bpf_kfunc_set_skb);
12237 ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb);
12238 ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_NETFILTER, &bpf_kfunc_set_skb);
12239 ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_kfunc_set_skb);
12240 ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp);
12241 ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
12242 &bpf_kfunc_set_sock_addr);
12243 ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_tcp_reqsk);
12244 return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SOCK_OPS, &bpf_kfunc_set_sock_ops);
12245 }
12246 late_initcall(bpf_kfunc_init);
12247
12248 __bpf_kfunc_start_defs();
12249
12250 /* bpf_sock_destroy: Destroy the given socket with ECONNABORTED error code.
12251 *
12252 * The function expects a non-NULL pointer to a socket, and invokes the
12253 * protocol specific socket destroy handlers.
12254 *
12255 * The helper can only be called from BPF contexts that have acquired the socket
12256 * locks.
12257 *
12258 * Parameters:
12259 * @sock: Pointer to socket to be destroyed
12260 *
12261 * Return:
12262 * On error, may return EOPNOTSUPP or EINVAL.
12263 * EOPNOTSUPP is returned if the protocol specific destroy handler is not
12264 * supported (only TCP and UDP are); 0 is returned on success.
12265 */
12266 __bpf_kfunc int bpf_sock_destroy(struct sock_common *sock)
12267 {
12268 struct sock *sk = (struct sock *)sock;
12269
12270 /* The locking semantics that allow for synchronous execution of the
12271 * destroy handlers are only supported for TCP and UDP.
12272 * Supporting protocols will need to acquire sock lock in the BPF context
12273 * prior to invoking this kfunc.
12274 */
12275 if (!sk->sk_prot->diag_destroy || (sk->sk_protocol != IPPROTO_TCP &&
12276 sk->sk_protocol != IPPROTO_UDP))
12277 return -EOPNOTSUPP;
12278
12279 return sk->sk_prot->diag_destroy(sk, ECONNABORTED);
12280 }
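/*
 * Illustrative sketch only: this kfunc is restricted to BPF_TRACE_ITER
 * programs (see tracing_iter_filter() below), so a typical caller is a
 * tcp/udp socket iterator, which visits each socket with its lock held.
 * Filtering by local port 4242 is just an assumption for the example.
 *
 *	SEC("iter/tcp")
 *	int destroy_port_4242(struct bpf_iter__tcp *ctx)
 *	{
 *		struct sock_common *sk_common = ctx->sk_common;
 *
 *		if (!sk_common)
 *			return 0;
 *		if (sk_common->skc_num == 4242)
 *			bpf_sock_destroy(sk_common);
 *		return 0;
 *	}
 */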
12281
12282 __bpf_kfunc_end_defs();
12283
12284 BTF_KFUNCS_START(bpf_sk_iter_kfunc_ids)
12285 BTF_ID_FLAGS(func, bpf_sock_destroy, KF_TRUSTED_ARGS)
12286 BTF_KFUNCS_END(bpf_sk_iter_kfunc_ids)
12287
12288 static int tracing_iter_filter(const struct bpf_prog *prog, u32 kfunc_id)
12289 {
12290 if (btf_id_set8_contains(&bpf_sk_iter_kfunc_ids, kfunc_id) &&
12291 prog->expected_attach_type != BPF_TRACE_ITER)
12292 return -EACCES;
12293 return 0;
12294 }
12295
12296 static const struct btf_kfunc_id_set bpf_sk_iter_kfunc_set = {
12297 .owner = THIS_MODULE,
12298 .set = &bpf_sk_iter_kfunc_ids,
12299 .filter = tracing_iter_filter,
12300 };
12301
12302 static int init_subsystem(void)
12303 {
12304 return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_sk_iter_kfunc_set);
12305 }
12306 late_initcall(init_subsystem);
12307