1 // SPDX-License-Identifier: GPL-2.0-only 2 /* Unstable Conntrack Helpers for XDP and TC-BPF hook 3 * 4 * These are called from the XDP and SCHED_CLS BPF programs. Note that it is 5 * allowed to break compatibility for these functions since the interface they 6 * are exposed through to BPF programs is explicitly unstable. 7 */ 8 9 #include <linux/bpf_verifier.h> 10 #include <linux/bpf.h> 11 #include <linux/btf.h> 12 #include <linux/filter.h> 13 #include <linux/mutex.h> 14 #include <linux/types.h> 15 #include <linux/btf_ids.h> 16 #include <linux/net_namespace.h> 17 #include <net/sock.h> 18 #include <net/xdp.h> 19 #include <net/netfilter/nf_conntrack_bpf.h> 20 #include <net/netfilter/nf_conntrack_core.h> 21 22 /* bpf_ct_opts - Options for CT lookup helpers 23 * 24 * Members: 25 * @netns_id - Specify the network namespace for lookup 26 * Values: 27 * BPF_F_CURRENT_NETNS (-1) 28 * Use namespace associated with ctx (xdp_md, __sk_buff) 29 * [0, S32_MAX] 30 * Network Namespace ID 31 * @error - Out parameter, set for any errors encountered 32 * Values: 33 * -EINVAL - Passed NULL for bpf_tuple pointer 34 * -EINVAL - opts->reserved is not 0 35 * -EINVAL - netns_id is less than -1 36 * -EINVAL - opts__sz isn't NF_BPF_CT_OPTS_SZ (16) or 12 37 * -EINVAL - opts->ct_zone_id set when 38 opts__sz isn't NF_BPF_CT_OPTS_SZ (16) 39 * -EPROTO - l4proto isn't one of IPPROTO_TCP or IPPROTO_UDP 40 * -ENONET - No network namespace found for netns_id 41 * -ENOENT - Conntrack lookup could not find entry for tuple 42 * -EAFNOSUPPORT - tuple__sz isn't one of sizeof(tuple->ipv4) 43 * or sizeof(tuple->ipv6) 44 * @l4proto - Layer 4 protocol 45 * Values: 46 * IPPROTO_TCP, IPPROTO_UDP 47 * @dir: - connection tracking tuple direction. 48 * @ct_zone_id - connection tracking zone id. 49 * @ct_zone_dir - connection tracking zone direction. 50 * @reserved - Reserved member, will be reused for more options in future 51 * Values: 52 * 0 53 */ 54 struct bpf_ct_opts { 55 s32 netns_id; 56 s32 error; 57 u8 l4proto; 58 u8 dir; 59 u16 ct_zone_id; 60 u8 ct_zone_dir; 61 u8 reserved[3]; 62 }; 63 64 enum { 65 NF_BPF_CT_OPTS_SZ = 16, 66 }; 67 68 static int bpf_nf_ct_tuple_parse(struct bpf_sock_tuple *bpf_tuple, 69 u32 tuple_len, u8 protonum, u8 dir, 70 struct nf_conntrack_tuple *tuple) 71 { 72 union nf_inet_addr *src = dir ? &tuple->dst.u3 : &tuple->src.u3; 73 union nf_inet_addr *dst = dir ? &tuple->src.u3 : &tuple->dst.u3; 74 union nf_conntrack_man_proto *sport = dir ? (void *)&tuple->dst.u 75 : &tuple->src.u; 76 union nf_conntrack_man_proto *dport = dir ? &tuple->src.u 77 : (void *)&tuple->dst.u; 78 79 if (unlikely(protonum != IPPROTO_TCP && protonum != IPPROTO_UDP)) 80 return -EPROTO; 81 82 memset(tuple, 0, sizeof(*tuple)); 83 84 switch (tuple_len) { 85 case sizeof(bpf_tuple->ipv4): 86 tuple->src.l3num = AF_INET; 87 src->ip = bpf_tuple->ipv4.saddr; 88 sport->tcp.port = bpf_tuple->ipv4.sport; 89 dst->ip = bpf_tuple->ipv4.daddr; 90 dport->tcp.port = bpf_tuple->ipv4.dport; 91 break; 92 case sizeof(bpf_tuple->ipv6): 93 tuple->src.l3num = AF_INET6; 94 memcpy(src->ip6, bpf_tuple->ipv6.saddr, sizeof(bpf_tuple->ipv6.saddr)); 95 sport->tcp.port = bpf_tuple->ipv6.sport; 96 memcpy(dst->ip6, bpf_tuple->ipv6.daddr, sizeof(bpf_tuple->ipv6.daddr)); 97 dport->tcp.port = bpf_tuple->ipv6.dport; 98 break; 99 default: 100 return -EAFNOSUPPORT; 101 } 102 tuple->dst.protonum = protonum; 103 tuple->dst.dir = dir; 104 105 return 0; 106 } 107 108 static struct nf_conn * 109 __bpf_nf_ct_alloc_entry(struct net *net, struct bpf_sock_tuple *bpf_tuple, 110 u32 tuple_len, struct bpf_ct_opts *opts, u32 opts_len, 111 u32 timeout) 112 { 113 struct nf_conntrack_tuple otuple, rtuple; 114 struct nf_conntrack_zone ct_zone; 115 struct nf_conn *ct; 116 int err; 117 118 if (!(opts_len == NF_BPF_CT_OPTS_SZ || opts_len == 12)) 119 return ERR_PTR(-EINVAL); 120 if (opts_len == NF_BPF_CT_OPTS_SZ) { 121 if (opts->reserved[0] || opts->reserved[1] || opts->reserved[2]) 122 return ERR_PTR(-EINVAL); 123 } else { 124 if (opts->ct_zone_id) 125 return ERR_PTR(-EINVAL); 126 } 127 128 if (unlikely(opts->netns_id < BPF_F_CURRENT_NETNS)) 129 return ERR_PTR(-EINVAL); 130 131 err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, opts->l4proto, 132 IP_CT_DIR_ORIGINAL, &otuple); 133 if (err < 0) 134 return ERR_PTR(err); 135 136 err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, opts->l4proto, 137 IP_CT_DIR_REPLY, &rtuple); 138 if (err < 0) 139 return ERR_PTR(err); 140 141 if (opts->netns_id >= 0) { 142 net = get_net_ns_by_id(net, opts->netns_id); 143 if (unlikely(!net)) 144 return ERR_PTR(-ENONET); 145 } 146 147 if (opts_len == NF_BPF_CT_OPTS_SZ) { 148 if (opts->ct_zone_dir == 0) 149 opts->ct_zone_dir = NF_CT_DEFAULT_ZONE_DIR; 150 nf_ct_zone_init(&ct_zone, 151 opts->ct_zone_id, opts->ct_zone_dir, 0); 152 } else { 153 ct_zone = nf_ct_zone_dflt; 154 } 155 156 ct = nf_conntrack_alloc(net, &ct_zone, &otuple, &rtuple, 157 GFP_ATOMIC); 158 if (IS_ERR(ct)) 159 goto out; 160 161 memset(&ct->proto, 0, sizeof(ct->proto)); 162 __nf_ct_set_timeout(ct, timeout * HZ); 163 164 out: 165 if (opts->netns_id >= 0) 166 put_net(net); 167 168 return ct; 169 } 170 171 static struct nf_conn *__bpf_nf_ct_lookup(struct net *net, 172 struct bpf_sock_tuple *bpf_tuple, 173 u32 tuple_len, struct bpf_ct_opts *opts, 174 u32 opts_len) 175 { 176 struct nf_conntrack_tuple_hash *hash; 177 struct nf_conntrack_tuple tuple; 178 struct nf_conntrack_zone ct_zone; 179 struct nf_conn *ct; 180 int err; 181 182 if (!opts || !bpf_tuple) 183 return ERR_PTR(-EINVAL); 184 if (!(opts_len == NF_BPF_CT_OPTS_SZ || opts_len == 12)) 185 return ERR_PTR(-EINVAL); 186 if (opts_len == NF_BPF_CT_OPTS_SZ) { 187 if (opts->reserved[0] || opts->reserved[1] || opts->reserved[2]) 188 return ERR_PTR(-EINVAL); 189 } else { 190 if (opts->ct_zone_id) 191 return ERR_PTR(-EINVAL); 192 } 193 if (unlikely(opts->l4proto != IPPROTO_TCP && opts->l4proto != IPPROTO_UDP)) 194 return ERR_PTR(-EPROTO); 195 if (unlikely(opts->netns_id < BPF_F_CURRENT_NETNS)) 196 return ERR_PTR(-EINVAL); 197 198 err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, opts->l4proto, 199 IP_CT_DIR_ORIGINAL, &tuple); 200 if (err < 0) 201 return ERR_PTR(err); 202 203 if (opts->netns_id >= 0) { 204 net = get_net_ns_by_id(net, opts->netns_id); 205 if (unlikely(!net)) 206 return ERR_PTR(-ENONET); 207 } 208 209 if (opts_len == NF_BPF_CT_OPTS_SZ) { 210 if (opts->ct_zone_dir == 0) 211 opts->ct_zone_dir = NF_CT_DEFAULT_ZONE_DIR; 212 nf_ct_zone_init(&ct_zone, 213 opts->ct_zone_id, opts->ct_zone_dir, 0); 214 } else { 215 ct_zone = nf_ct_zone_dflt; 216 } 217 218 hash = nf_conntrack_find_get(net, &ct_zone, &tuple); 219 if (opts->netns_id >= 0) 220 put_net(net); 221 if (!hash) 222 return ERR_PTR(-ENOENT); 223 224 ct = nf_ct_tuplehash_to_ctrack(hash); 225 opts->dir = NF_CT_DIRECTION(hash); 226 227 return ct; 228 } 229 230 BTF_ID_LIST(btf_nf_conn_ids) 231 BTF_ID(struct, nf_conn) 232 BTF_ID(struct, nf_conn___init) 233 234 /* Check writes into `struct nf_conn` */ 235 static int _nf_conntrack_btf_struct_access(struct bpf_verifier_log *log, 236 const struct bpf_reg_state *reg, 237 int off, int size) 238 { 239 const struct btf_type *ncit, *nct, *t; 240 size_t end; 241 242 ncit = btf_type_by_id(reg->btf, btf_nf_conn_ids[1]); 243 nct = btf_type_by_id(reg->btf, btf_nf_conn_ids[0]); 244 t = btf_type_by_id(reg->btf, reg->btf_id); 245 if (t != nct && t != ncit) { 246 bpf_log(log, "only read is supported\n"); 247 return -EACCES; 248 } 249 250 /* `struct nf_conn` and `struct nf_conn___init` have the same layout 251 * so we are safe to simply merge offset checks here 252 */ 253 switch (off) { 254 #if defined(CONFIG_NF_CONNTRACK_MARK) 255 case offsetof(struct nf_conn, mark): 256 end = offsetofend(struct nf_conn, mark); 257 break; 258 #endif 259 default: 260 bpf_log(log, "no write support to nf_conn at off %d\n", off); 261 return -EACCES; 262 } 263 264 if (off + size > end) { 265 bpf_log(log, 266 "write access at off %d with size %d beyond the member of nf_conn ended at %zu\n", 267 off, size, end); 268 return -EACCES; 269 } 270 271 return 0; 272 } 273 274 __bpf_kfunc_start_defs(); 275 276 /* bpf_xdp_ct_alloc - Allocate a new CT entry 277 * 278 * Parameters: 279 * @xdp_ctx - Pointer to ctx (xdp_md) in XDP program 280 * Cannot be NULL 281 * @bpf_tuple - Pointer to memory representing the tuple to look up 282 * Cannot be NULL 283 * @tuple__sz - Length of the tuple structure 284 * Must be one of sizeof(bpf_tuple->ipv4) or 285 * sizeof(bpf_tuple->ipv6) 286 * @opts - Additional options for allocation (documented above) 287 * Cannot be NULL 288 * @opts__sz - Length of the bpf_ct_opts structure 289 * Must be NF_BPF_CT_OPTS_SZ (16) or 12 290 */ 291 __bpf_kfunc struct nf_conn___init * 292 bpf_xdp_ct_alloc(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple, 293 u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz) 294 { 295 struct xdp_buff *ctx = (struct xdp_buff *)xdp_ctx; 296 struct nf_conn *nfct; 297 298 nfct = __bpf_nf_ct_alloc_entry(dev_net(ctx->rxq->dev), bpf_tuple, tuple__sz, 299 opts, opts__sz, 10); 300 if (IS_ERR(nfct)) { 301 opts->error = PTR_ERR(nfct); 302 return NULL; 303 } 304 305 return (struct nf_conn___init *)nfct; 306 } 307 308 /* bpf_xdp_ct_lookup - Lookup CT entry for the given tuple, and acquire a 309 * reference to it 310 * 311 * Parameters: 312 * @xdp_ctx - Pointer to ctx (xdp_md) in XDP program 313 * Cannot be NULL 314 * @bpf_tuple - Pointer to memory representing the tuple to look up 315 * Cannot be NULL 316 * @tuple__sz - Length of the tuple structure 317 * Must be one of sizeof(bpf_tuple->ipv4) or 318 * sizeof(bpf_tuple->ipv6) 319 * @opts - Additional options for lookup (documented above) 320 * Cannot be NULL 321 * @opts__sz - Length of the bpf_ct_opts structure 322 * Must be NF_BPF_CT_OPTS_SZ (16) or 12 323 */ 324 __bpf_kfunc struct nf_conn * 325 bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple, 326 u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz) 327 { 328 struct xdp_buff *ctx = (struct xdp_buff *)xdp_ctx; 329 struct net *caller_net; 330 struct nf_conn *nfct; 331 332 caller_net = dev_net(ctx->rxq->dev); 333 nfct = __bpf_nf_ct_lookup(caller_net, bpf_tuple, tuple__sz, opts, opts__sz); 334 if (IS_ERR(nfct)) { 335 opts->error = PTR_ERR(nfct); 336 return NULL; 337 } 338 return nfct; 339 } 340 341 /* bpf_skb_ct_alloc - Allocate a new CT entry 342 * 343 * Parameters: 344 * @skb_ctx - Pointer to ctx (__sk_buff) in TC program 345 * Cannot be NULL 346 * @bpf_tuple - Pointer to memory representing the tuple to look up 347 * Cannot be NULL 348 * @tuple__sz - Length of the tuple structure 349 * Must be one of sizeof(bpf_tuple->ipv4) or 350 * sizeof(bpf_tuple->ipv6) 351 * @opts - Additional options for allocation (documented above) 352 * Cannot be NULL 353 * @opts__sz - Length of the bpf_ct_opts structure 354 * Must be NF_BPF_CT_OPTS_SZ (16) or 12 355 */ 356 __bpf_kfunc struct nf_conn___init * 357 bpf_skb_ct_alloc(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple, 358 u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz) 359 { 360 struct sk_buff *skb = (struct sk_buff *)skb_ctx; 361 struct nf_conn *nfct; 362 struct net *net; 363 364 net = skb->dev ? dev_net(skb->dev) : sock_net(skb->sk); 365 nfct = __bpf_nf_ct_alloc_entry(net, bpf_tuple, tuple__sz, opts, opts__sz, 10); 366 if (IS_ERR(nfct)) { 367 opts->error = PTR_ERR(nfct); 368 return NULL; 369 } 370 371 return (struct nf_conn___init *)nfct; 372 } 373 374 /* bpf_skb_ct_lookup - Lookup CT entry for the given tuple, and acquire a 375 * reference to it 376 * 377 * Parameters: 378 * @skb_ctx - Pointer to ctx (__sk_buff) in TC program 379 * Cannot be NULL 380 * @bpf_tuple - Pointer to memory representing the tuple to look up 381 * Cannot be NULL 382 * @tuple__sz - Length of the tuple structure 383 * Must be one of sizeof(bpf_tuple->ipv4) or 384 * sizeof(bpf_tuple->ipv6) 385 * @opts - Additional options for lookup (documented above) 386 * Cannot be NULL 387 * @opts__sz - Length of the bpf_ct_opts structure 388 * Must be NF_BPF_CT_OPTS_SZ (16) or 12 389 */ 390 __bpf_kfunc struct nf_conn * 391 bpf_skb_ct_lookup(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple, 392 u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz) 393 { 394 struct sk_buff *skb = (struct sk_buff *)skb_ctx; 395 struct net *caller_net; 396 struct nf_conn *nfct; 397 398 caller_net = skb->dev ? dev_net(skb->dev) : sock_net(skb->sk); 399 nfct = __bpf_nf_ct_lookup(caller_net, bpf_tuple, tuple__sz, opts, opts__sz); 400 if (IS_ERR(nfct)) { 401 opts->error = PTR_ERR(nfct); 402 return NULL; 403 } 404 return nfct; 405 } 406 407 /* bpf_ct_insert_entry - Add the provided entry into a CT map 408 * 409 * This must be invoked for referenced PTR_TO_BTF_ID. 410 * 411 * @nfct - Pointer to referenced nf_conn___init object, obtained 412 * using bpf_xdp_ct_alloc or bpf_skb_ct_alloc. 413 */ 414 __bpf_kfunc struct nf_conn *bpf_ct_insert_entry(struct nf_conn___init *nfct_i) 415 { 416 struct nf_conn *nfct = (struct nf_conn *)nfct_i; 417 int err; 418 419 if (!nf_ct_is_confirmed(nfct)) 420 nfct->timeout += nfct_time_stamp; 421 nfct->status |= IPS_CONFIRMED; 422 err = nf_conntrack_hash_check_insert(nfct); 423 if (err < 0) { 424 nf_conntrack_free(nfct); 425 return NULL; 426 } 427 return nfct; 428 } 429 430 /* bpf_ct_release - Release acquired nf_conn object 431 * 432 * This must be invoked for referenced PTR_TO_BTF_ID, and the verifier rejects 433 * the program if any references remain in the program in all of the explored 434 * states. 435 * 436 * Parameters: 437 * @nf_conn - Pointer to referenced nf_conn object, obtained using 438 * bpf_xdp_ct_lookup or bpf_skb_ct_lookup. 439 */ 440 __bpf_kfunc void bpf_ct_release(struct nf_conn *nfct) 441 { 442 nf_ct_put(nfct); 443 } 444 445 /* bpf_ct_set_timeout - Set timeout of allocated nf_conn 446 * 447 * Sets the default timeout of newly allocated nf_conn before insertion. 448 * This helper must be invoked for refcounted pointer to nf_conn___init. 449 * 450 * Parameters: 451 * @nfct - Pointer to referenced nf_conn object, obtained using 452 * bpf_xdp_ct_alloc or bpf_skb_ct_alloc. 453 * @timeout - Timeout in msecs. 454 */ 455 __bpf_kfunc void bpf_ct_set_timeout(struct nf_conn___init *nfct, u32 timeout) 456 { 457 __nf_ct_set_timeout((struct nf_conn *)nfct, msecs_to_jiffies(timeout)); 458 } 459 460 /* bpf_ct_change_timeout - Change timeout of inserted nf_conn 461 * 462 * Change timeout associated of the inserted or looked up nf_conn. 463 * This helper must be invoked for refcounted pointer to nf_conn. 464 * 465 * Parameters: 466 * @nfct - Pointer to referenced nf_conn object, obtained using 467 * bpf_ct_insert_entry, bpf_xdp_ct_lookup, or bpf_skb_ct_lookup. 468 * @timeout - New timeout in msecs. 469 */ 470 __bpf_kfunc int bpf_ct_change_timeout(struct nf_conn *nfct, u32 timeout) 471 { 472 return __nf_ct_change_timeout(nfct, msecs_to_jiffies(timeout)); 473 } 474 475 /* bpf_ct_set_status - Set status field of allocated nf_conn 476 * 477 * Set the status field of the newly allocated nf_conn before insertion. 478 * This must be invoked for referenced PTR_TO_BTF_ID to nf_conn___init. 479 * 480 * Parameters: 481 * @nfct - Pointer to referenced nf_conn object, obtained using 482 * bpf_xdp_ct_alloc or bpf_skb_ct_alloc. 483 * @status - New status value. 484 */ 485 __bpf_kfunc int bpf_ct_set_status(const struct nf_conn___init *nfct, u32 status) 486 { 487 return nf_ct_change_status_common((struct nf_conn *)nfct, status); 488 } 489 490 /* bpf_ct_change_status - Change status of inserted nf_conn 491 * 492 * Change the status field of the provided connection tracking entry. 493 * This must be invoked for referenced PTR_TO_BTF_ID to nf_conn. 494 * 495 * Parameters: 496 * @nfct - Pointer to referenced nf_conn object, obtained using 497 * bpf_ct_insert_entry, bpf_xdp_ct_lookup or bpf_skb_ct_lookup. 498 * @status - New status value. 499 */ 500 __bpf_kfunc int bpf_ct_change_status(struct nf_conn *nfct, u32 status) 501 { 502 return nf_ct_change_status_common(nfct, status); 503 } 504 505 __bpf_kfunc_end_defs(); 506 507 BTF_KFUNCS_START(nf_ct_kfunc_set) 508 BTF_ID_FLAGS(func, bpf_xdp_ct_alloc, KF_ACQUIRE | KF_RET_NULL) 509 BTF_ID_FLAGS(func, bpf_xdp_ct_lookup, KF_ACQUIRE | KF_RET_NULL) 510 BTF_ID_FLAGS(func, bpf_skb_ct_alloc, KF_ACQUIRE | KF_RET_NULL) 511 BTF_ID_FLAGS(func, bpf_skb_ct_lookup, KF_ACQUIRE | KF_RET_NULL) 512 BTF_ID_FLAGS(func, bpf_ct_insert_entry, KF_ACQUIRE | KF_RET_NULL | KF_RELEASE) 513 BTF_ID_FLAGS(func, bpf_ct_release, KF_RELEASE) 514 BTF_ID_FLAGS(func, bpf_ct_set_timeout) 515 BTF_ID_FLAGS(func, bpf_ct_change_timeout) 516 BTF_ID_FLAGS(func, bpf_ct_set_status) 517 BTF_ID_FLAGS(func, bpf_ct_change_status) 518 BTF_KFUNCS_END(nf_ct_kfunc_set) 519 520 static const struct btf_kfunc_id_set nf_conntrack_kfunc_set = { 521 .owner = THIS_MODULE, 522 .set = &nf_ct_kfunc_set, 523 }; 524 525 int register_nf_conntrack_bpf(void) 526 { 527 int ret; 528 529 ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &nf_conntrack_kfunc_set); 530 ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &nf_conntrack_kfunc_set); 531 if (!ret) { 532 mutex_lock(&nf_conn_btf_access_lock); 533 nfct_btf_struct_access = _nf_conntrack_btf_struct_access; 534 mutex_unlock(&nf_conn_btf_access_lock); 535 } 536 537 return ret; 538 } 539 540 void cleanup_nf_conntrack_bpf(void) 541 { 542 mutex_lock(&nf_conn_btf_access_lock); 543 nfct_btf_struct_access = NULL; 544 mutex_unlock(&nf_conn_btf_access_lock); 545 } 546