// SPDX-License-Identifier: GPL-2.0-only
/* Connection state tracking for netfilter.  This is separated from,
   but required by, the NAT layer; it can also be used by an iptables
   extension. */

/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
 * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
 * (C) 2005-2012 Patrick McHardy <kaber@trash.net>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/types.h>
#include <linux/netfilter.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/vmalloc.h>
#include <linux/stddef.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/siphash.h>
#include <linux/err.h>
#include <linux/percpu.h>
#include <linux/moduleparam.h>
#include <linux/notifier.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/socket.h>
#include <linux/mm.h>
#include <linux/nsproxy.h>
#include <linux/rculist_nulls.h>

#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_bpf.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_extend.h>
#include <net/netfilter/nf_conntrack_acct.h>
#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/nf_conntrack_timestamp.h>
#include <net/netfilter/nf_conntrack_timeout.h>
#include <net/netfilter/nf_conntrack_labels.h>
#include <net/netfilter/nf_conntrack_synproxy.h>
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_helper.h>
#include <net/netns/hash.h>
#include <net/ip.h>

#include "nf_internals.h"

__cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];
EXPORT_SYMBOL_GPL(nf_conntrack_locks);

__cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock);
EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock);

struct hlist_nulls_head *nf_conntrack_hash __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_hash);

struct conntrack_gc_work {
	struct delayed_work	dwork;
	u32			next_bucket;
	u32			avg_timeout;
	u32			count;
	u32			start_time;
	bool			exiting;
	bool			early_drop;
};

static __read_mostly struct kmem_cache *nf_conntrack_cachep;
static DEFINE_SPINLOCK(nf_conntrack_locks_all_lock);
static __read_mostly bool nf_conntrack_locks_all;

/* serialize hash resizes and nf_ct_iterate_cleanup */
static DEFINE_MUTEX(nf_conntrack_mutex);

#define GC_SCAN_INTERVAL_MAX	(60ul * HZ)
#define GC_SCAN_INTERVAL_MIN	(1ul * HZ)

/* clamp timeouts to this value (TCP unacked) */
#define GC_SCAN_INTERVAL_CLAMP	(300ul * HZ)

/* Initial bias pretending we have 100 entries at the upper bound so we don't
 * wake up often just because we have three entries with a 1s timeout while
 * still allowing non-idle machines to wake up more often when needed.
 */
#define GC_SCAN_INITIAL_COUNT	100
#define GC_SCAN_INTERVAL_INIT	GC_SCAN_INTERVAL_MAX

#define GC_SCAN_MAX_DURATION	msecs_to_jiffies(10)
#define GC_SCAN_EXPIRED_MAX	(64000u / HZ)

#define MIN_CHAINLEN	50u
#define MAX_CHAINLEN	(80u - MIN_CHAINLEN)

static struct conntrack_gc_work conntrack_gc_work;

void nf_conntrack_lock(spinlock_t *lock) __acquires(lock)
{
	/* 1) Acquire the lock */
	spin_lock(lock);

	/* 2) read nf_conntrack_locks_all, with ACQUIRE semantics
	 * It pairs with the smp_store_release() in nf_conntrack_all_unlock()
	 */
	if (likely(smp_load_acquire(&nf_conntrack_locks_all) == false))
		return;

	/* fast path failed, unlock */
	spin_unlock(lock);

	/* Slow path 1) get global lock */
	spin_lock(&nf_conntrack_locks_all_lock);

	/* Slow path 2) get the lock we want */
	spin_lock(lock);

	/* Slow path 3) release the global lock */
	spin_unlock(&nf_conntrack_locks_all_lock);
}
EXPORT_SYMBOL_GPL(nf_conntrack_lock);

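/* Editor's sketch (not part of the original file): how the locking scheme
 * above is meant to be used.  A hypothetical walker that mutates a single
 * hash chain takes the corresponding per-bucket lock through
 * nf_conntrack_lock(); code that must exclude all such walkers (e.g. a table
 * resize) brackets its critical section with nf_conntrack_all_lock() and
 * nf_conntrack_all_unlock() further below.  Kept under #if 0, documentation
 * only.
 */
#if 0
static void example_walk_one_bucket(unsigned int bucket)
{
	bucket %= CONNTRACK_LOCKS;

	nf_conntrack_lock(&nf_conntrack_locks[bucket]);
	/* ... modify the chain covered by this lock ... */
	spin_unlock(&nf_conntrack_locks[bucket]);
}
#endif
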
static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2)
{
	h1 %= CONNTRACK_LOCKS;
	h2 %= CONNTRACK_LOCKS;
	spin_unlock(&nf_conntrack_locks[h1]);
	if (h1 != h2)
		spin_unlock(&nf_conntrack_locks[h2]);
}

/* return true if we need to recompute hashes (in case hash table was resized) */
static bool nf_conntrack_double_lock(unsigned int h1, unsigned int h2,
				     unsigned int sequence)
{
	h1 %= CONNTRACK_LOCKS;
	h2 %= CONNTRACK_LOCKS;
	if (h1 <= h2) {
		nf_conntrack_lock(&nf_conntrack_locks[h1]);
		if (h1 != h2)
			spin_lock_nested(&nf_conntrack_locks[h2],
					 SINGLE_DEPTH_NESTING);
	} else {
		nf_conntrack_lock(&nf_conntrack_locks[h2]);
		spin_lock_nested(&nf_conntrack_locks[h1],
				 SINGLE_DEPTH_NESTING);
	}
	if (read_seqcount_retry(&nf_conntrack_generation, sequence)) {
		nf_conntrack_double_unlock(h1, h2);
		return true;
	}
	return false;
}

static void nf_conntrack_all_lock(void)
	__acquires(&nf_conntrack_locks_all_lock)
{
	int i;

	spin_lock(&nf_conntrack_locks_all_lock);

	/* For nf_conntrack_locks_all, only the latest time when another
	 * CPU will see an update is controlled, by the "release" of the
	 * spin_lock below.
	 * The earliest time is not controlled, and thus KCSAN could detect
	 * a race when nf_conntrack_lock() reads the variable.
	 * WRITE_ONCE() is used to ensure the compiler will not
	 * optimize the write.
	 */
	WRITE_ONCE(nf_conntrack_locks_all, true);

	for (i = 0; i < CONNTRACK_LOCKS; i++) {
		spin_lock(&nf_conntrack_locks[i]);

		/* This spin_unlock provides the "release" to ensure that
		 * nf_conntrack_locks_all==true is visible to everyone that
		 * acquired spin_lock(&nf_conntrack_locks[]).
		 */
		spin_unlock(&nf_conntrack_locks[i]);
	}
}

static void nf_conntrack_all_unlock(void)
	__releases(&nf_conntrack_locks_all_lock)
{
	/* All prior stores must be complete before we clear
	 * 'nf_conntrack_locks_all'. Otherwise nf_conntrack_lock()
	 * might observe the false value but not the entire
	 * critical section.
	 * It pairs with the smp_load_acquire() in nf_conntrack_lock()
	 */
	smp_store_release(&nf_conntrack_locks_all, false);
	spin_unlock(&nf_conntrack_locks_all_lock);
}

unsigned int nf_conntrack_htable_size __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);

unsigned int nf_conntrack_max __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_max);
seqcount_spinlock_t nf_conntrack_generation __read_mostly;
static siphash_aligned_key_t nf_conntrack_hash_rnd;

static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple,
			      unsigned int zoneid,
			      const struct net *net)
{
	siphash_key_t key;

	get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd));

	key = nf_conntrack_hash_rnd;

	key.key[0] ^= zoneid;
	key.key[1] ^= net_hash_mix(net);

	return siphash((void *)tuple,
		       offsetofend(struct nf_conntrack_tuple, dst.__nfct_hash_offsetend),
		       &key);
}

static u32 scale_hash(u32 hash)
{
	return reciprocal_scale(hash, nf_conntrack_htable_size);
}

static u32 __hash_conntrack(const struct net *net,
			    const struct nf_conntrack_tuple *tuple,
			    unsigned int zoneid,
			    unsigned int size)
{
	return reciprocal_scale(hash_conntrack_raw(tuple, zoneid, net), size);
}

static u32 hash_conntrack(const struct net *net,
			  const struct nf_conntrack_tuple *tuple,
			  unsigned int zoneid)
{
	return scale_hash(hash_conntrack_raw(tuple, zoneid, net));
}

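/* Editor's sketch (not part of the original file): how the hashing helpers
 * above combine.  hash_conntrack_raw() produces a 32-bit siphash over the
 * tuple, zone id and netns mix; reciprocal_scale() then maps it onto a table
 * bucket.  The hypothetical function below is equivalent to hash_conntrack()
 * and exists purely as a usage illustration, hence the #if 0.
 */
#if 0
static unsigned int example_pick_bucket(const struct net *net,
					const struct nf_conntrack_tuple *tuple,
					unsigned int zone_id)
{
	u32 raw = hash_conntrack_raw(tuple, zone_id, net);

	/* scale the raw hash to the current table size */
	return reciprocal_scale(raw, nf_conntrack_htable_size);
}
#endif
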
static bool nf_ct_get_tuple_ports(const struct sk_buff *skb,
				  unsigned int dataoff,
				  struct nf_conntrack_tuple *tuple)
{	struct {
		__be16 sport;
		__be16 dport;
	} _inet_hdr, *inet_hdr;

	/* Actually only need first 4 bytes to get ports. */
	inet_hdr = skb_header_pointer(skb, dataoff, sizeof(_inet_hdr), &_inet_hdr);
	if (!inet_hdr)
		return false;

	tuple->src.u.udp.port = inet_hdr->sport;
	tuple->dst.u.udp.port = inet_hdr->dport;
	return true;
}

static bool
nf_ct_get_tuple(const struct sk_buff *skb,
		unsigned int nhoff,
		unsigned int dataoff,
		u_int16_t l3num,
		u_int8_t protonum,
		struct net *net,
		struct nf_conntrack_tuple *tuple)
{
	unsigned int size;
	const __be32 *ap;
	__be32 _addrs[8];

	memset(tuple, 0, sizeof(*tuple));

	tuple->src.l3num = l3num;
	switch (l3num) {
	case NFPROTO_IPV4:
		nhoff += offsetof(struct iphdr, saddr);
		size = 2 * sizeof(__be32);
		break;
	case NFPROTO_IPV6:
		nhoff += offsetof(struct ipv6hdr, saddr);
		size = sizeof(_addrs);
		break;
	default:
		return true;
	}

	ap = skb_header_pointer(skb, nhoff, size, _addrs);
	if (!ap)
		return false;

	switch (l3num) {
	case NFPROTO_IPV4:
		tuple->src.u3.ip = ap[0];
		tuple->dst.u3.ip = ap[1];
		break;
	case NFPROTO_IPV6:
		memcpy(tuple->src.u3.ip6, ap, sizeof(tuple->src.u3.ip6));
		memcpy(tuple->dst.u3.ip6, ap + 4, sizeof(tuple->dst.u3.ip6));
		break;
	}

	tuple->dst.protonum = protonum;
	tuple->dst.dir = IP_CT_DIR_ORIGINAL;

	switch (protonum) {
#if IS_ENABLED(CONFIG_IPV6)
	case IPPROTO_ICMPV6:
		return icmpv6_pkt_to_tuple(skb, dataoff, net, tuple);
#endif
	case IPPROTO_ICMP:
		return icmp_pkt_to_tuple(skb, dataoff, net, tuple);
#ifdef CONFIG_NF_CT_PROTO_GRE
	case IPPROTO_GRE:
		return gre_pkt_to_tuple(skb, dataoff, net, tuple);
#endif
	case IPPROTO_TCP:
	case IPPROTO_UDP:
#ifdef CONFIG_NF_CT_PROTO_UDPLITE
	case IPPROTO_UDPLITE:
#endif
#ifdef CONFIG_NF_CT_PROTO_SCTP
	case IPPROTO_SCTP:
#endif
		/* fallthrough */
		return nf_ct_get_tuple_ports(skb, dataoff, tuple);
	default:
		break;
	}

	return true;
}

static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
			    u_int8_t *protonum)
{
	int dataoff = -1;
	const struct iphdr *iph;
	struct iphdr _iph;

	iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
	if (!iph)
		return -1;

	/* Conntrack defragments packets, we might still see fragments
	 * inside ICMP packets though.
	 */
	if (iph->frag_off & htons(IP_OFFSET))
		return -1;

	dataoff = nhoff + (iph->ihl << 2);
	*protonum = iph->protocol;

	/* Check bogus IP headers */
	if (dataoff > skb->len) {
		pr_debug("bogus IPv4 packet: nhoff %u, ihl %u, skblen %u\n",
			 nhoff, iph->ihl << 2, skb->len);
		return -1;
	}
	return dataoff;
}

#if IS_ENABLED(CONFIG_IPV6)
static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
			    u8 *protonum)
{
	int protoff = -1;
	unsigned int extoff = nhoff + sizeof(struct ipv6hdr);
	__be16 frag_off;
	u8 nexthdr;

	if (skb_copy_bits(skb, nhoff + offsetof(struct ipv6hdr, nexthdr),
			  &nexthdr, sizeof(nexthdr)) != 0) {
		pr_debug("can't get nexthdr\n");
		return -1;
	}
	protoff = ipv6_skip_exthdr(skb, extoff, &nexthdr, &frag_off);
	/*
	 * (protoff == skb->len) means the packet has no data, just
	 * IPv6 and possibly extension headers, but it is tracked anyway
	 */
	if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
		pr_debug("can't find proto in pkt\n");
		return -1;
	}

	*protonum = nexthdr;
	return protoff;
}
#endif

static int get_l4proto(const struct sk_buff *skb,
		       unsigned int nhoff, u8 pf, u8 *l4num)
{
	switch (pf) {
	case NFPROTO_IPV4:
		return ipv4_get_l4proto(skb, nhoff, l4num);
#if IS_ENABLED(CONFIG_IPV6)
	case NFPROTO_IPV6:
		return ipv6_get_l4proto(skb, nhoff, l4num);
#endif
	default:
		*l4num = 0;
		break;
	}
	return -1;
}

bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
		       u_int16_t l3num,
		       struct net *net, struct nf_conntrack_tuple *tuple)
{
	u8 protonum;
	int protoff;

	protoff = get_l4proto(skb, nhoff, l3num, &protonum);
	if (protoff <= 0)
		return false;

	return nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, net, tuple);
}
EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr);

bool
nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
		   const struct nf_conntrack_tuple *orig)
{
	memset(inverse, 0, sizeof(*inverse));

	inverse->src.l3num = orig->src.l3num;

	switch (orig->src.l3num) {
	case NFPROTO_IPV4:
		inverse->src.u3.ip = orig->dst.u3.ip;
		inverse->dst.u3.ip = orig->src.u3.ip;
		break;
	case NFPROTO_IPV6:
		inverse->src.u3.in6 = orig->dst.u3.in6;
		inverse->dst.u3.in6 = orig->src.u3.in6;
		break;
	default:
		break;
	}

	inverse->dst.dir = !orig->dst.dir;

	inverse->dst.protonum = orig->dst.protonum;

	switch (orig->dst.protonum) {
	case IPPROTO_ICMP:
		return nf_conntrack_invert_icmp_tuple(inverse, orig);
#if IS_ENABLED(CONFIG_IPV6)
	case IPPROTO_ICMPV6:
		return nf_conntrack_invert_icmpv6_tuple(inverse, orig);
#endif
	}

	inverse->src.u.all = orig->dst.u.all;
	inverse->dst.u.all = orig->src.u.all;
	return true;
}
EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);

/* Generate an almost-unique pseudo-id for a given conntrack.
 *
 * Intentionally doesn't reuse any of the seeds used for hash
 * table location; we assume the id gets exposed to userspace.
 *
 * Following nf_conn items do not change throughout lifetime
 * of the nf_conn:
 *
 * 1. nf_conn address
 * 2. nf_conn->master address (normally NULL)
 * 3. the associated net namespace
 * 4. the original direction tuple
 */
u32 nf_ct_get_id(const struct nf_conn *ct)
{
	static siphash_aligned_key_t ct_id_seed;
	unsigned long a, b, c, d;

	net_get_random_once(&ct_id_seed, sizeof(ct_id_seed));

	a = (unsigned long)ct;
	b = (unsigned long)ct->master;
	c = (unsigned long)nf_ct_net(ct);
	d = (unsigned long)siphash(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				   sizeof(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple),
				   &ct_id_seed);
#ifdef CONFIG_64BIT
	return siphash_4u64((u64)a, (u64)b, (u64)c, (u64)d, &ct_id_seed);
#else
	return siphash_4u32((u32)a, (u32)b, (u32)c, (u32)d, &ct_id_seed);
#endif
}
EXPORT_SYMBOL_GPL(nf_ct_get_id);

static u32 nf_conntrack_get_id(const struct nf_conntrack *nfct)
{
	return nf_ct_get_id(nf_ct_to_nf_conn(nfct));
}

static void
clean_from_lists(struct nf_conn *ct)
{
	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode);

	/* Destroy all pending expectations */
	nf_ct_remove_expectations(ct);
}

#define NFCT_ALIGN(len)	(((len) + NFCT_INFOMASK) & ~NFCT_INFOMASK)

/* Released via nf_ct_destroy() */
struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
				 const struct nf_conntrack_zone *zone,
				 gfp_t flags)
{
	struct nf_conn *tmpl, *p;

	if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK) {
		tmpl = kzalloc(sizeof(*tmpl) + NFCT_INFOMASK, flags);
		if (!tmpl)
			return NULL;

		p = tmpl;
		tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
		if (tmpl != p)
			tmpl->proto.tmpl_padto = (char *)tmpl - (char *)p;
	} else {
		tmpl = kzalloc(sizeof(*tmpl), flags);
		if (!tmpl)
			return NULL;
	}

	tmpl->status = IPS_TEMPLATE;
	write_pnet(&tmpl->ct_net, net);
	nf_ct_zone_add(tmpl, zone);
	refcount_set(&tmpl->ct_general.use, 1);

	return tmpl;
}
EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc);

void nf_ct_tmpl_free(struct nf_conn *tmpl)
{
	kfree(tmpl->ext);

	if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK)
		kfree((char *)tmpl - tmpl->proto.tmpl_padto);
	else
		kfree(tmpl);
}
EXPORT_SYMBOL_GPL(nf_ct_tmpl_free);

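/* Worked example (editor's note, not from the original source), assuming a
 * hypothetical NFCT_INFOMASK of 7: skb->_nfct keeps the ctinfo bits in the
 * low bits of the nf_conn pointer, so a template allocated at an address
 * ending in ...0x1004 is moved up by NFCT_ALIGN(): (0x1004 + 7) & ~7 ==
 * 0x1008, i.e. tmpl == p + 4 and tmpl->proto.tmpl_padto == 4, which lets
 * nf_ct_tmpl_free() hand the original pointer back to kfree().
 */
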
static void destroy_gre_conntrack(struct nf_conn *ct)
{
#ifdef CONFIG_NF_CT_PROTO_GRE
	struct nf_conn *master = ct->master;

	if (master)
		nf_ct_gre_keymap_destroy(master);
#endif
}

void nf_ct_destroy(struct nf_conntrack *nfct)
{
	struct nf_conn *ct = (struct nf_conn *)nfct;

	WARN_ON(refcount_read(&nfct->use) != 0);

	if (unlikely(nf_ct_is_template(ct))) {
		nf_ct_tmpl_free(ct);
		return;
	}

	if (unlikely(nf_ct_protonum(ct) == IPPROTO_GRE))
		destroy_gre_conntrack(ct);

	/* Expectations will have been removed in clean_from_lists,
	 * except TFTP can create an expectation on the first packet,
	 * before connection is in the list, so we need to clean here,
	 * too.
	 */
	nf_ct_remove_expectations(ct);

	if (ct->master)
		nf_ct_put(ct->master);

	nf_conntrack_free(ct);
}
EXPORT_SYMBOL(nf_ct_destroy);

static void __nf_ct_delete_from_lists(struct nf_conn *ct)
{
	struct net *net = nf_ct_net(ct);
	unsigned int hash, reply_hash;
	unsigned int sequence;

	do {
		sequence = read_seqcount_begin(&nf_conntrack_generation);
		hash = hash_conntrack(net,
				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				      nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL));
		reply_hash = hash_conntrack(net,
					    &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
					    nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
	} while (nf_conntrack_double_lock(hash, reply_hash, sequence));

	clean_from_lists(ct);
	nf_conntrack_double_unlock(hash, reply_hash);
}

static void nf_ct_delete_from_lists(struct nf_conn *ct)
{
	nf_ct_helper_destroy(ct);
	local_bh_disable();

	__nf_ct_delete_from_lists(ct);

	local_bh_enable();
}

static void nf_ct_add_to_ecache_list(struct nf_conn *ct)
{
#ifdef CONFIG_NF_CONNTRACK_EVENTS
	struct nf_conntrack_net *cnet = nf_ct_pernet(nf_ct_net(ct));

	spin_lock(&cnet->ecache.dying_lock);
	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
				 &cnet->ecache.dying_list);
	spin_unlock(&cnet->ecache.dying_lock);
#endif
}

bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report)
{
	struct nf_conn_tstamp *tstamp;
	struct net *net;

	if (test_and_set_bit(IPS_DYING_BIT, &ct->status))
		return false;

	tstamp = nf_conn_tstamp_find(ct);
	if (tstamp) {
		s32 timeout = READ_ONCE(ct->timeout) - nfct_time_stamp;

		tstamp->stop = ktime_get_real_ns();
		if (timeout < 0)
			tstamp->stop -= jiffies_to_nsecs(-timeout);
	}

	if (nf_conntrack_event_report(IPCT_DESTROY, ct,
				      portid, report) < 0) {
		/* destroy event was not delivered. nf_ct_put will
		 * be done by event cache worker on redelivery.
		 */
		nf_ct_helper_destroy(ct);
		local_bh_disable();
		__nf_ct_delete_from_lists(ct);
		nf_ct_add_to_ecache_list(ct);
		local_bh_enable();

		nf_conntrack_ecache_work(nf_ct_net(ct), NFCT_ECACHE_DESTROY_FAIL);
		return false;
	}

	net = nf_ct_net(ct);
	if (nf_conntrack_ecache_dwork_pending(net))
		nf_conntrack_ecache_work(net, NFCT_ECACHE_DESTROY_SENT);
	nf_ct_delete_from_lists(ct);
	nf_ct_put(ct);
	return true;
}
EXPORT_SYMBOL_GPL(nf_ct_delete);

static inline bool
nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
		const struct nf_conntrack_tuple *tuple,
		const struct nf_conntrack_zone *zone,
		const struct net *net)
{
	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);

	/* A conntrack can be recreated with the equal tuple,
	 * so we need to check that the conntrack is confirmed
	 */
	return nf_ct_tuple_equal(tuple, &h->tuple) &&
	       nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h)) &&
	       nf_ct_is_confirmed(ct) &&
	       net_eq(net, nf_ct_net(ct));
}

static inline bool
nf_ct_match(const struct nf_conn *ct1, const struct nf_conn *ct2)
{
	return nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				 &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple) &&
	       nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple,
				 &ct2->tuplehash[IP_CT_DIR_REPLY].tuple) &&
	       nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL) &&
	       nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_REPLY) &&
	       net_eq(nf_ct_net(ct1), nf_ct_net(ct2));
}

/* caller must hold rcu readlock and none of the nf_conntrack_locks */
static void nf_ct_gc_expired(struct nf_conn *ct)
{
	if (!refcount_inc_not_zero(&ct->ct_general.use))
		return;

	/* load ->status after refcount increase */
	smp_acquire__after_ctrl_dep();

	if (nf_ct_should_gc(ct))
		nf_ct_kill(ct);

	nf_ct_put(ct);
}

/*
 * Warning:
 * - Caller must take a reference on returned object
 *   and recheck nf_ct_tuple_equal(tuple, &h->tuple)
 */
static struct nf_conntrack_tuple_hash *
____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone,
		      const struct nf_conntrack_tuple *tuple, u32 hash)
{
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_head *ct_hash;
	struct hlist_nulls_node *n;
	unsigned int bucket, hsize;

begin:
	nf_conntrack_get_ht(&ct_hash, &hsize);
	bucket = reciprocal_scale(hash, hsize);

	hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) {
		struct nf_conn *ct;

		ct = nf_ct_tuplehash_to_ctrack(h);
		if (nf_ct_is_expired(ct)) {
			nf_ct_gc_expired(ct);
			continue;
		}

		if (nf_ct_key_equal(h, tuple, zone, net))
			return h;
	}
	/*
	 * if the nulls value we got at the end of this lookup is
	 * not the expected one, we must restart lookup.
	 * We probably met an item that was moved to another chain.
	 */
	if (get_nulls_value(n) != bucket) {
		NF_CT_STAT_INC_ATOMIC(net, search_restart);
		goto begin;
	}

	return NULL;
}

/* Find a connection corresponding to a tuple. */
static struct nf_conntrack_tuple_hash *
__nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
			const struct nf_conntrack_tuple *tuple, u32 hash)
{
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;

	h = ____nf_conntrack_find(net, zone, tuple, hash);
	if (h) {
		/* We have a candidate that matches the tuple we're interested
		 * in, try to obtain a reference and re-check tuple
		 */
		ct = nf_ct_tuplehash_to_ctrack(h);
		if (likely(refcount_inc_not_zero(&ct->ct_general.use))) {
			/* re-check key after refcount */
			smp_acquire__after_ctrl_dep();

			if (likely(nf_ct_key_equal(h, tuple, zone, net)))
				return h;

			/* TYPESAFE_BY_RCU recycled the candidate */
			nf_ct_put(ct);
		}

		h = NULL;
	}

	return h;
}

struct nf_conntrack_tuple_hash *
nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
		      const struct nf_conntrack_tuple *tuple)
{
	unsigned int rid, zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL);
	struct nf_conntrack_tuple_hash *thash;

	rcu_read_lock();

	thash = __nf_conntrack_find_get(net, zone, tuple,
					hash_conntrack_raw(tuple, zone_id, net));

	if (thash)
		goto out_unlock;

	rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY);
	if (rid != zone_id)
		thash = __nf_conntrack_find_get(net, zone, tuple,
						hash_conntrack_raw(tuple, rid, net));

out_unlock:
	rcu_read_unlock();
	return thash;
}
EXPORT_SYMBOL_GPL(nf_conntrack_find_get);

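/* Editor's sketch (not part of the original file): how a hypothetical caller
 * might use nf_conntrack_find_get() above.  The lookup returns an entry with
 * its reference count already taken, so the caller must drop it with
 * nf_ct_put() when done; nf_ct_zone_dflt is the default zone.  Disabled with
 * #if 0, illustration only.
 */
#if 0
static bool example_tuple_is_tracked(struct net *net,
				     const struct nf_conntrack_tuple *tuple)
{
	struct nf_conntrack_tuple_hash *h;

	h = nf_conntrack_find_get(net, &nf_ct_zone_dflt, tuple);
	if (!h)
		return false;

	/* the lookup took a reference on the entry; release it again */
	nf_ct_put(nf_ct_tuplehash_to_ctrack(h));
	return true;
}
#endif
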
static void __nf_conntrack_hash_insert(struct nf_conn *ct,
				       unsigned int hash,
				       unsigned int reply_hash)
{
	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
				 &nf_conntrack_hash[hash]);
	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
				 &nf_conntrack_hash[reply_hash]);
}

static bool nf_ct_ext_valid_pre(const struct nf_ct_ext *ext)
{
	/* if ext->gen_id is not equal to nf_conntrack_ext_genid, some extensions
	 * may contain stale pointers to e.g. helper that has been removed.
	 *
	 * The helper can't clear this because the nf_conn object isn't in
	 * any hash and synchronize_rcu() isn't enough because associated skb
	 * might sit in a queue.
	 */
	return !ext || ext->gen_id == atomic_read(&nf_conntrack_ext_genid);
}

static bool nf_ct_ext_valid_post(struct nf_ct_ext *ext)
{
	if (!ext)
		return true;

	if (ext->gen_id != atomic_read(&nf_conntrack_ext_genid))
		return false;

	/* inserted into conntrack table, nf_ct_iterate_cleanup()
	 * will find it.  Disable nf_ct_ext_find() id check.
	 */
	WRITE_ONCE(ext->gen_id, 0);
	return true;
}

int
nf_conntrack_hash_check_insert(struct nf_conn *ct)
{
	const struct nf_conntrack_zone *zone;
	struct net *net = nf_ct_net(ct);
	unsigned int hash, reply_hash;
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	unsigned int max_chainlen;
	unsigned int chainlen = 0;
	unsigned int sequence;
	int err = -EEXIST;

	zone = nf_ct_zone(ct);

	if (!nf_ct_ext_valid_pre(ct->ext))
		return -EAGAIN;

	local_bh_disable();
	do {
		sequence = read_seqcount_begin(&nf_conntrack_generation);
		hash = hash_conntrack(net,
				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				      nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL));
		reply_hash = hash_conntrack(net,
					    &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
					    nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
	} while (nf_conntrack_double_lock(hash, reply_hash, sequence));

	max_chainlen = MIN_CHAINLEN + get_random_u32_below(MAX_CHAINLEN);

	/* See if there's one in the list already, including reverse */
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) {
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				    zone, net))
			goto out;

		if (chainlen++ > max_chainlen)
			goto chaintoolong;
	}

	chainlen = 0;

	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) {
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
				    zone, net))
			goto out;
		if (chainlen++ > max_chainlen)
			goto chaintoolong;
	}

	/* If genid has changed, we can't insert anymore because ct
	 * extensions could have stale pointers and nf_ct_iterate_destroy
	 * might have completed its table scan already.
	 *
	 * Increment of the ext genid right after this check is fine:
	 * nf_ct_iterate_destroy blocks until locks are released.
	 */
	if (!nf_ct_ext_valid_post(ct->ext)) {
		err = -EAGAIN;
		goto out;
	}

	smp_wmb();
	/* The caller holds a reference to this object */
	refcount_set(&ct->ct_general.use, 2);
	__nf_conntrack_hash_insert(ct, hash, reply_hash);
	nf_conntrack_double_unlock(hash, reply_hash);
	NF_CT_STAT_INC(net, insert);
	local_bh_enable();

	return 0;
chaintoolong:
	NF_CT_STAT_INC(net, chaintoolong);
	err = -ENOSPC;
out:
	nf_conntrack_double_unlock(hash, reply_hash);
	local_bh_enable();
	return err;
}
EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);

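/* Editor's sketch (not part of the original file): how a hypothetical caller
 * that built an unconfirmed entry might drive nf_conntrack_hash_check_insert()
 * above and react to its return codes.  Disabled with #if 0; error handling is
 * reduced to the bare minimum.
 */
#if 0
static int example_insert_ct(struct nf_conn *ct)
{
	int err = nf_conntrack_hash_check_insert(ct);

	switch (err) {
	case 0:		/* inserted, the table now holds an extra reference */
		return 0;
	case -EEXIST:	/* an identical (zone, tuple) entry already exists */
	case -ENOSPC:	/* a chain exceeded the randomized length limit */
	case -EAGAIN:	/* extension genid changed, the caller may retry */
	default:
		return err;
	}
}
#endif
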
void nf_ct_acct_add(struct nf_conn *ct, u32 dir, unsigned int packets,
		    unsigned int bytes)
{
	struct nf_conn_acct *acct;

	acct = nf_conn_acct_find(ct);
	if (acct) {
		struct nf_conn_counter *counter = acct->counter;

		atomic64_add(packets, &counter[dir].packets);
		atomic64_add(bytes, &counter[dir].bytes);
	}
}
EXPORT_SYMBOL_GPL(nf_ct_acct_add);

static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
			     const struct nf_conn *loser_ct)
{
	struct nf_conn_acct *acct;

	acct = nf_conn_acct_find(loser_ct);
	if (acct) {
		struct nf_conn_counter *counter = acct->counter;
		unsigned int bytes;

		/* u32 should be fine since we must have seen one packet. */
		bytes = atomic64_read(&counter[CTINFO2DIR(ctinfo)].bytes);
		nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), bytes);
	}
}

static void __nf_conntrack_insert_prepare(struct nf_conn *ct)
{
	struct nf_conn_tstamp *tstamp;

	refcount_inc(&ct->ct_general.use);

	/* set conntrack timestamp, if enabled. */
	tstamp = nf_conn_tstamp_find(ct);
	if (tstamp)
		tstamp->start = ktime_get_real_ns();
}

/**
 * nf_ct_match_reverse - check if ct1 and ct2 refer to identical flow
 * @ct1: conntrack in hash table to check against
 * @ct2: merge candidate
 *
 * returns true if ct1 and ct2 happen to refer to the same flow, but
 * in opposing directions, i.e.
 * ct1: a:b -> c:d
 * ct2: c:d -> a:b
 * for both directions.  If so, @ct2 should not have been created
 * as the skb should have been picked up as an ESTABLISHED flow, but
 * ct1 was not yet committed to the hash table when the skb that
 * created ct2 arrived.
 *
 * Note we don't compare netns because ct entries in different net
 * namespaces cannot clash to begin with.
 *
 * @return: true if ct1 and ct2 are identical when swapping origin/reply.
 */
static bool
nf_ct_match_reverse(const struct nf_conn *ct1, const struct nf_conn *ct2)
{
	u16 id1, id2;

	if (!nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
			       &ct2->tuplehash[IP_CT_DIR_REPLY].tuple))
		return false;

	if (!nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple,
			       &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple))
		return false;

	id1 = nf_ct_zone_id(nf_ct_zone(ct1), IP_CT_DIR_ORIGINAL);
	id2 = nf_ct_zone_id(nf_ct_zone(ct2), IP_CT_DIR_REPLY);
	if (id1 != id2)
		return false;

	id1 = nf_ct_zone_id(nf_ct_zone(ct1), IP_CT_DIR_REPLY);
	id2 = nf_ct_zone_id(nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL);

	return id1 == id2;
}

static int nf_ct_can_merge(const struct nf_conn *ct,
			   const struct nf_conn *loser_ct)
{
	return nf_ct_match(ct, loser_ct) ||
	       nf_ct_match_reverse(ct, loser_ct);
}

/* caller must hold locks to prevent concurrent changes */
static int __nf_ct_resolve_clash(struct sk_buff *skb,
				 struct nf_conntrack_tuple_hash *h)
{
	/* This is the conntrack entry already in hashes that won race. */
	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
	enum ip_conntrack_info ctinfo;
	struct nf_conn *loser_ct;

	loser_ct = nf_ct_get(skb, &ctinfo);

	if (nf_ct_can_merge(ct, loser_ct)) {
		struct net *net = nf_ct_net(ct);

		nf_conntrack_get(&ct->ct_general);

		nf_ct_acct_merge(ct, ctinfo, loser_ct);
		nf_ct_put(loser_ct);
		nf_ct_set(skb, ct, ctinfo);

		NF_CT_STAT_INC(net, clash_resolve);
		return NF_ACCEPT;
	}

	return NF_DROP;
}

/**
 * nf_ct_resolve_clash_harder - attempt to insert clashing conntrack entry
 *
 * @skb: skb that causes the collision
 * @repl_idx: hash slot for reply direction
 *
 * Called when origin or reply direction had a clash.
 * The skb can be handled without packet drop provided the reply direction
 * is unique or the existing entry has the identical tuple in both
 * directions.
 *
 * Caller must hold conntrack table locks to prevent concurrent updates.
 *
 * Returns NF_DROP if the clash could not be handled.
 */
static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx)
{
	struct nf_conn *loser_ct = (struct nf_conn *)skb_nfct(skb);
	const struct nf_conntrack_zone *zone;
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	struct net *net;

	zone = nf_ct_zone(loser_ct);
	net = nf_ct_net(loser_ct);

	/* Reply direction must never result in a clash, unless both origin
	 * and reply tuples are identical.
	 */
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[repl_idx], hnnode) {
		if (nf_ct_key_equal(h,
				    &loser_ct->tuplehash[IP_CT_DIR_REPLY].tuple,
				    zone, net))
			return __nf_ct_resolve_clash(skb, h);
	}

	/* We want the clashing entry to go away real soon: 1 second timeout. */
	WRITE_ONCE(loser_ct->timeout, nfct_time_stamp + HZ);

	/* IPS_NAT_CLASH removes the entry automatically on the first
	 * reply.  Also prevents UDP tracker from moving the entry to
	 * ASSURED state, i.e. the entry can always be evicted under
	 * pressure.
	 */
	loser_ct->status |= IPS_FIXED_TIMEOUT | IPS_NAT_CLASH;

	__nf_conntrack_insert_prepare(loser_ct);

	/* fake add for ORIGINAL dir: we want lookups to only find the entry
	 * already in the table.  This also hides the clashing entry from
	 * ctnetlink iteration, i.e. conntrack -L won't show them.
	 */
	hlist_nulls_add_fake(&loser_ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);

	hlist_nulls_add_head_rcu(&loser_ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
				 &nf_conntrack_hash[repl_idx]);
	/* confirmed bit must be set after hlist add, not before:
	 * loser_ct can still be visible to other cpu due to
	 * SLAB_TYPESAFE_BY_RCU.
	 */
	smp_mb__before_atomic();
	set_bit(IPS_CONFIRMED_BIT, &loser_ct->status);

	NF_CT_STAT_INC(net, clash_resolve);
	return NF_ACCEPT;
}

/**
 * nf_ct_resolve_clash - attempt to handle clash without packet drop
 *
 * @skb: skb that causes the clash
 * @h: tuplehash of the clashing entry already in table
 * @reply_hash: hash slot for reply direction
 *
 * A conntrack entry can be inserted to the connection tracking table
 * if there is no existing entry with an identical tuple.
 *
 * If there is one, @skb (and the associated, unconfirmed conntrack) has
 * to be dropped.  In case @skb is retransmitted, next conntrack lookup
 * will find the already-existing entry.
 *
 * The major problem with such packet drop is the extra delay added by
 * the packet loss -- it will take some time for a retransmit to occur
 * (or the sender to time out when waiting for a reply).
 *
 * This function attempts to handle the situation without packet drop.
 *
 * If @skb has no NAT transformation or if the colliding entries are
 * exactly the same, only the to-be-confirmed conntrack entry is discarded
 * and @skb is associated with the conntrack entry already in the table.
 *
 * Failing that, the new, unconfirmed conntrack is still added to the table
 * provided that the collision only occurs in the ORIGINAL direction.
 * The new entry will be added only in the non-clashing REPLY direction,
 * so packets in the ORIGINAL direction will continue to match the existing
 * entry.  The new entry will also have a fixed timeout so it expires --
 * due to the collision, it will only see reply traffic.
 *
 * Returns NF_DROP if the clash could not be resolved.
 */
static __cold noinline int
nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h,
		    u32 reply_hash)
{
	/* This is the conntrack entry already in hashes that won race. */
	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
	const struct nf_conntrack_l4proto *l4proto;
	enum ip_conntrack_info ctinfo;
	struct nf_conn *loser_ct;
	struct net *net;
	int ret;

	loser_ct = nf_ct_get(skb, &ctinfo);
	net = nf_ct_net(loser_ct);

	l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct));
	if (!l4proto->allow_clash)
		goto drop;

	ret = __nf_ct_resolve_clash(skb, h);
	if (ret == NF_ACCEPT)
		return ret;

	ret = nf_ct_resolve_clash_harder(skb, reply_hash);
	if (ret == NF_ACCEPT)
		return ret;

drop:
	NF_CT_STAT_INC(net, drop);
	NF_CT_STAT_INC(net, insert_failed);
	return NF_DROP;
}

/* Confirm a connection given skb; places it in hash table */
int
__nf_conntrack_confirm(struct sk_buff *skb)
{
	unsigned int chainlen = 0, sequence, max_chainlen;
	const struct nf_conntrack_zone *zone;
	unsigned int hash, reply_hash;
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;
	struct nf_conn_help *help;
	struct hlist_nulls_node *n;
	enum ip_conntrack_info ctinfo;
	struct net *net;
	int ret = NF_DROP;

	ct = nf_ct_get(skb, &ctinfo);
	net = nf_ct_net(ct);

	/* ipt_REJECT uses nf_conntrack_attach to attach related
	   ICMP/TCP RST packets in the other direction.  The actual packet
	   which created the connection will be IP_CT_NEW, or IP_CT_RELATED
	   for an expected connection. */
	if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
		return NF_ACCEPT;

	zone = nf_ct_zone(ct);
	local_bh_disable();

	do {
		sequence = read_seqcount_begin(&nf_conntrack_generation);
		/* reuse the hash saved before */
		hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
		hash = scale_hash(hash);
		reply_hash = hash_conntrack(net,
					    &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
					    nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
	} while (nf_conntrack_double_lock(hash, reply_hash, sequence));

	/* We're not in hash table, and we refuse to set up related
	 * connections for unconfirmed conns.  But packet copies and
	 * REJECT will give spurious warnings here.
	 */

	/* Another skb with the same unconfirmed conntrack may
	 * win the race.  This may happen for bridge (br_flood)
	 * or broadcast/multicast packets that do skb_clone with an
	 * unconfirmed conntrack.
	 */
	if (unlikely(nf_ct_is_confirmed(ct))) {
		WARN_ON_ONCE(1);
		nf_conntrack_double_unlock(hash, reply_hash);
		local_bh_enable();
		return NF_DROP;
	}

	if (!nf_ct_ext_valid_pre(ct->ext)) {
		NF_CT_STAT_INC(net, insert_failed);
		goto dying;
	}

	/* We have to check the DYING flag after unlink to prevent
	 * a race against nf_ct_get_next_corpse() possibly called from
	 * user context, else we insert an already 'dead' hash, blocking
	 * further use of that particular connection -JM.
	 */
	if (unlikely(nf_ct_is_dying(ct))) {
		NF_CT_STAT_INC(net, insert_failed);
		goto dying;
	}

	max_chainlen = MIN_CHAINLEN + get_random_u32_below(MAX_CHAINLEN);
	/* See if there's one in the list already, including reverse:
	   NAT could have grabbed it without realizing, since we're
	   not in the hash.  If there is, we lost the race. */
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) {
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				    zone, net))
			goto out;
		if (chainlen++ > max_chainlen)
			goto chaintoolong;
	}

	chainlen = 0;
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) {
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
				    zone, net))
			goto out;
		if (chainlen++ > max_chainlen) {
chaintoolong:
			NF_CT_STAT_INC(net, chaintoolong);
			NF_CT_STAT_INC(net, insert_failed);
			ret = NF_DROP;
			goto dying;
		}
	}

	/* Timeout is relative to confirmation time, not original
	   setting time, otherwise we'd get timer wrap in
	   weird delay cases. */
	ct->timeout += nfct_time_stamp;

	__nf_conntrack_insert_prepare(ct);

	/* Since the lookup is lockless, hash insertion must be done after
	 * setting ct->timeout.  The RCU barriers guarantee that no other CPU
	 * can find the conntrack before the above stores are visible.
	 */
	__nf_conntrack_hash_insert(ct, hash, reply_hash);

	/* IPS_CONFIRMED unset means 'ct not (yet) in hash', conntrack lookups
	 * skip entries that lack this bit.  This happens when a CPU is looking
	 * at a stale entry that is being recycled due to SLAB_TYPESAFE_BY_RCU
	 * or when another CPU encounters this entry right after the insertion
	 * but before the set-confirm-bit below.  This bit must not be set until
	 * after __nf_conntrack_hash_insert().
	 */
	smp_mb__before_atomic();
	set_bit(IPS_CONFIRMED_BIT, &ct->status);

	nf_conntrack_double_unlock(hash, reply_hash);
	local_bh_enable();

	/* ext area is still valid (rcu read lock is held, but will go out
	 * of scope soon), so we need to remove this conntrack again.
	 */
	if (!nf_ct_ext_valid_post(ct->ext)) {
		nf_ct_kill(ct);
		NF_CT_STAT_INC_ATOMIC(net, drop);
		return NF_DROP;
	}

	help = nfct_help(ct);
	if (help && help->helper)
		nf_conntrack_event_cache(IPCT_HELPER, ct);

	nf_conntrack_event_cache(master_ct(ct) ?
				 IPCT_RELATED : IPCT_NEW, ct);
	return NF_ACCEPT;

out:
	ret = nf_ct_resolve_clash(skb, h, reply_hash);
dying:
	nf_conntrack_double_unlock(hash, reply_hash);
	local_bh_enable();
	return ret;
}
EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);

/* Returns true if a connection corresponds to the tuple (required
   for NAT). */
int
nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
			 const struct nf_conn *ignored_conntrack)
{
	struct net *net = nf_ct_net(ignored_conntrack);
	const struct nf_conntrack_zone *zone;
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_head *ct_hash;
	unsigned int hash, hsize;
	struct hlist_nulls_node *n;
	struct nf_conn *ct;

	zone = nf_ct_zone(ignored_conntrack);

	rcu_read_lock();
 begin:
	nf_conntrack_get_ht(&ct_hash, &hsize);
	hash = __hash_conntrack(net, tuple, nf_ct_zone_id(zone, IP_CT_DIR_REPLY), hsize);

	hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) {
		ct = nf_ct_tuplehash_to_ctrack(h);

		if (ct == ignored_conntrack)
			continue;

		if (nf_ct_is_expired(ct)) {
			nf_ct_gc_expired(ct);
			continue;
		}

		if (nf_ct_key_equal(h, tuple, zone, net)) {
			/* Tuple is taken already, so caller will need to find
			 * a new source port to use.
			 *
			 * Only exception:
			 * If the *original tuples* are identical, then both
			 * conntracks refer to the same flow.
			 * This is a rare situation, it can occur e.g. when
			 * more than one UDP packet is sent from same socket
			 * in different threads.
			 *
			 * Let nf_ct_resolve_clash() deal with this later.
			 */
			if (nf_ct_tuple_equal(&ignored_conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
					      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple) &&
			    nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL))
				continue;

			NF_CT_STAT_INC_ATOMIC(net, found);
			rcu_read_unlock();
			return 1;
		}
	}

	if (get_nulls_value(n) != hash) {
		NF_CT_STAT_INC_ATOMIC(net, search_restart);
		goto begin;
	}

	rcu_read_unlock();

	return 0;
}
EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);

#define NF_CT_EVICTION_RANGE	8

/* There's a small race here where we may free a just-assured
   connection.  Too bad: we're in trouble anyway. */
static unsigned int early_drop_list(struct net *net,
				    struct hlist_nulls_head *head)
{
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	unsigned int drops = 0;
	struct nf_conn *tmp;

	hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) {
		tmp = nf_ct_tuplehash_to_ctrack(h);

		if (nf_ct_is_expired(tmp)) {
			nf_ct_gc_expired(tmp);
			continue;
		}

		if (test_bit(IPS_ASSURED_BIT, &tmp->status) ||
		    !net_eq(nf_ct_net(tmp), net) ||
		    nf_ct_is_dying(tmp))
			continue;

		if (!refcount_inc_not_zero(&tmp->ct_general.use))
			continue;

		/* load ->ct_net and ->status after refcount increase */
		smp_acquire__after_ctrl_dep();

		/* kill only if still in same netns -- might have moved due to
		 * SLAB_TYPESAFE_BY_RCU rules.
		 *
		 * We steal the timer reference.  If that fails timer has
		 * already fired or someone else deleted it.  Just drop ref
		 * and move to next entry.
		 */
		if (net_eq(nf_ct_net(tmp), net) &&
		    nf_ct_is_confirmed(tmp) &&
		    nf_ct_delete(tmp, 0, 0))
			drops++;

		nf_ct_put(tmp);
	}

	return drops;
}

static noinline int early_drop(struct net *net, unsigned int hash)
{
	unsigned int i, bucket;

	for (i = 0; i < NF_CT_EVICTION_RANGE; i++) {
		struct hlist_nulls_head *ct_hash;
		unsigned int hsize, drops;

		rcu_read_lock();
		nf_conntrack_get_ht(&ct_hash, &hsize);
		if (!i)
			bucket = reciprocal_scale(hash, hsize);
		else
			bucket = (bucket + 1) % hsize;

		drops = early_drop_list(net, &ct_hash[bucket]);
		rcu_read_unlock();

		if (drops) {
			NF_CT_STAT_ADD_ATOMIC(net, early_drop, drops);
			return true;
		}
	}

	return false;
}

static bool gc_worker_skip_ct(const struct nf_conn *ct)
{
	return !nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct);
}

static bool gc_worker_can_early_drop(const struct nf_conn *ct)
{
	const struct nf_conntrack_l4proto *l4proto;
	u8 protonum = nf_ct_protonum(ct);

	if (!test_bit(IPS_ASSURED_BIT, &ct->status))
		return true;

	l4proto = nf_ct_l4proto_find(protonum);
	if (l4proto->can_early_drop && l4proto->can_early_drop(ct))
		return true;

	return false;
}

static void gc_worker(struct work_struct *work)
{
	unsigned int i, hashsz, nf_conntrack_max95 = 0;
	u32 end_time, start_time = nfct_time_stamp;
	struct conntrack_gc_work *gc_work;
	unsigned int expired_count = 0;
	unsigned long next_run;
	s32 delta_time;
	long count;

	gc_work = container_of(work, struct conntrack_gc_work, dwork.work);

	i = gc_work->next_bucket;
	if (gc_work->early_drop)
		nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;

	if (i == 0) {
		gc_work->avg_timeout = GC_SCAN_INTERVAL_INIT;
		gc_work->count = GC_SCAN_INITIAL_COUNT;
		gc_work->start_time = start_time;
	}

	next_run = gc_work->avg_timeout;
	count = gc_work->count;

	end_time = start_time + GC_SCAN_MAX_DURATION;

	do {
		struct nf_conntrack_tuple_hash *h;
		struct hlist_nulls_head *ct_hash;
		struct hlist_nulls_node *n;
		struct nf_conn *tmp;

		rcu_read_lock();

		nf_conntrack_get_ht(&ct_hash, &hashsz);
		if (i >= hashsz) {
			rcu_read_unlock();
			break;
		}

		hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
			struct nf_conntrack_net *cnet;
			struct net *net;
			long expires;

			tmp = nf_ct_tuplehash_to_ctrack(h);

			if (expired_count > GC_SCAN_EXPIRED_MAX) {
				rcu_read_unlock();

				gc_work->next_bucket = i;
				gc_work->avg_timeout = next_run;
				gc_work->count = count;

				delta_time = nfct_time_stamp - gc_work->start_time;

				/* re-sched immediately if total cycle time is exceeded */
				next_run = delta_time < (s32)GC_SCAN_INTERVAL_MAX;
				goto early_exit;
			}

			if (nf_ct_is_expired(tmp)) {
				nf_ct_gc_expired(tmp);
				expired_count++;
				continue;
			}

			expires = clamp(nf_ct_expires(tmp), GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_CLAMP);
			expires = (expires - (long)next_run) / ++count;
			next_run += expires;

			if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp))
				continue;

			net = nf_ct_net(tmp);
			cnet = nf_ct_pernet(net);
			if (atomic_read(&cnet->count) < nf_conntrack_max95)
				continue;

			/* need to take reference to avoid possible races */
			if (!refcount_inc_not_zero(&tmp->ct_general.use))
				continue;

			/* load ->status after refcount increase */
			smp_acquire__after_ctrl_dep();

			if (gc_worker_skip_ct(tmp)) {
				nf_ct_put(tmp);
				continue;
			}

			if (gc_worker_can_early_drop(tmp)) {
				nf_ct_kill(tmp);
				expired_count++;
			}

			nf_ct_put(tmp);
		}

		/* could check get_nulls_value() here and restart if ct
		 * was moved to another chain.  But given gc is best-effort
		 * we will just continue with next hash slot.
		 */
		rcu_read_unlock();
		cond_resched();
		i++;

		delta_time = nfct_time_stamp - end_time;
		if (delta_time > 0 && i < hashsz) {
			gc_work->avg_timeout = next_run;
			gc_work->count = count;
			gc_work->next_bucket = i;
			next_run = 0;
			goto early_exit;
		}
	} while (i < hashsz);

	gc_work->next_bucket = 0;

	next_run = clamp(next_run, GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_MAX);

	delta_time = max_t(s32, nfct_time_stamp - gc_work->start_time, 1);
	if (next_run > (unsigned long)delta_time)
		next_run -= delta_time;
	else
		next_run = 1;

early_exit:
	if (gc_work->exiting)
		return;

	if (next_run)
		gc_work->early_drop = false;

	queue_delayed_work(system_power_efficient_wq, &gc_work->dwork, next_run);
}

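/* Editor's note (not in the original source): the avg_timeout bookkeeping in
 * gc_worker() above keeps a running mean of the clamped per-entry expiry
 * times: for each scanned entry,
 *	next_run += (expires - next_run) / ++count;
 * so next_run converges towards the average "expires" seen so far, seeded
 * with GC_SCAN_INITIAL_COUNT pseudo-entries at GC_SCAN_INTERVAL_INIT so a
 * handful of short-lived entries cannot force a very early re-scan.  For
 * example, with next_run = 60*HZ, count = 100 and one entry expiring in
 * 10*HZ, next_run drops by (10*HZ - 60*HZ) / 101, i.e. by roughly 0.5*HZ.
 */
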
static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work)
{
	INIT_DELAYED_WORK(&gc_work->dwork, gc_worker);
	gc_work->exiting = false;
}

static struct nf_conn *
__nf_conntrack_alloc(struct net *net,
		     const struct nf_conntrack_zone *zone,
		     const struct nf_conntrack_tuple *orig,
		     const struct nf_conntrack_tuple *repl,
		     gfp_t gfp, u32 hash)
{
	struct nf_conntrack_net *cnet = nf_ct_pernet(net);
	unsigned int ct_count;
	struct nf_conn *ct;

	/* We don't want any race condition at early drop stage */
	ct_count = atomic_inc_return(&cnet->count);

	if (unlikely(ct_count > nf_conntrack_max)) {
		if (!early_drop(net, hash)) {
			if (!conntrack_gc_work.early_drop)
				conntrack_gc_work.early_drop = true;
			atomic_dec(&cnet->count);
			if (net == &init_net)
				net_warn_ratelimited("nf_conntrack: table full, dropping packet\n");
			else
				net_warn_ratelimited("nf_conntrack: table full in netns %u, dropping packet\n",
						     net->ns.inum);
			return ERR_PTR(-ENOMEM);
		}
	}

	/*
	 * Do not use kmem_cache_zalloc(), as this cache uses
	 * SLAB_TYPESAFE_BY_RCU.
	 */
	ct = kmem_cache_alloc(nf_conntrack_cachep, gfp);
	if (ct == NULL)
		goto out;

	spin_lock_init(&ct->lock);
	ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
	ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
	ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
	/* save hash for reusing when confirming */
	*(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash;
	ct->status = 0;
	WRITE_ONCE(ct->timeout, 0);
	write_pnet(&ct->ct_net, net);
	memset_after(ct, 0, __nfct_init_offset);

	nf_ct_zone_add(ct, zone);

	/* Because we use RCU lookups, we set ct_general.use to zero before
	 * this is inserted in any list.
	 */
	refcount_set(&ct->ct_general.use, 0);
	return ct;
out:
	atomic_dec(&cnet->count);
	return ERR_PTR(-ENOMEM);
}

struct nf_conn *nf_conntrack_alloc(struct net *net,
				   const struct nf_conntrack_zone *zone,
				   const struct nf_conntrack_tuple *orig,
				   const struct nf_conntrack_tuple *repl,
				   gfp_t gfp)
{
	return __nf_conntrack_alloc(net, zone, orig, repl, gfp, 0);
}
EXPORT_SYMBOL_GPL(nf_conntrack_alloc);

void nf_conntrack_free(struct nf_conn *ct)
{
	struct net *net = nf_ct_net(ct);
	struct nf_conntrack_net *cnet;

	/* A freed object has refcnt == 0, that's
	 * the golden rule for SLAB_TYPESAFE_BY_RCU
	 */
	WARN_ON(refcount_read(&ct->ct_general.use) != 0);

	if (ct->status & IPS_SRC_NAT_DONE) {
		const struct nf_nat_hook *nat_hook;

		rcu_read_lock();
		nat_hook = rcu_dereference(nf_nat_hook);
		if (nat_hook)
			nat_hook->remove_nat_bysrc(ct);
		rcu_read_unlock();
	}

	kfree(ct->ext);
	kmem_cache_free(nf_conntrack_cachep, ct);
	cnet = nf_ct_pernet(net);

	smp_mb__before_atomic();
	atomic_dec(&cnet->count);
}
EXPORT_SYMBOL_GPL(nf_conntrack_free);


/* Allocate a new conntrack: we return -ENOMEM if classification
   failed due to stress.  Otherwise it really is unclassifiable. */
static noinline struct nf_conntrack_tuple_hash *
init_conntrack(struct net *net, struct nf_conn *tmpl,
	       const struct nf_conntrack_tuple *tuple,
	       struct sk_buff *skb,
	       unsigned int dataoff, u32 hash)
{
	struct nf_conn *ct;
	struct nf_conn_help *help;
	struct nf_conntrack_tuple repl_tuple;
#ifdef CONFIG_NF_CONNTRACK_EVENTS
	struct nf_conntrack_ecache *ecache;
#endif
	struct nf_conntrack_expect *exp = NULL;
	const struct nf_conntrack_zone *zone;
	struct nf_conn_timeout *timeout_ext;
	struct nf_conntrack_zone tmp;
	struct nf_conntrack_net *cnet;

	if (!nf_ct_invert_tuple(&repl_tuple, tuple))
		return NULL;

	zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
	ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC,
				  hash);
	if (IS_ERR(ct))
		return ERR_CAST(ct);

	if (!nf_ct_add_synproxy(ct, tmpl)) {
		nf_conntrack_free(ct);
		return ERR_PTR(-ENOMEM);
	}

	timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL;

	if (timeout_ext)
		nf_ct_timeout_ext_add(ct, rcu_dereference(timeout_ext->timeout),
				      GFP_ATOMIC);

	nf_ct_acct_ext_add(ct, GFP_ATOMIC);
	nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
	nf_ct_labels_ext_add(ct);

#ifdef CONFIG_NF_CONNTRACK_EVENTS
	ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL;

	if ((ecache || net->ct.sysctl_events) &&
	    !nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0,
				  ecache ? ecache->expmask : 0,
				  GFP_ATOMIC)) {
		nf_conntrack_free(ct);
		return ERR_PTR(-ENOMEM);
	}
#endif

	cnet = nf_ct_pernet(net);
	if (cnet->expect_count) {
		spin_lock_bh(&nf_conntrack_expect_lock);
		exp = nf_ct_find_expectation(net, zone, tuple, !tmpl || nf_ct_is_confirmed(tmpl));
		if (exp) {
			/* Welcome, Mr. Bond.  We've been expecting you... */
			__set_bit(IPS_EXPECTED_BIT, &ct->status);
			/* exp->master safe, refcnt bumped in nf_ct_find_expectation */
			ct->master = exp->master;
			if (exp->helper) {
				help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
				if (help)
					rcu_assign_pointer(help->helper, exp->helper);
			}

#ifdef CONFIG_NF_CONNTRACK_MARK
			ct->mark = READ_ONCE(exp->master->mark);
#endif
#ifdef CONFIG_NF_CONNTRACK_SECMARK
			ct->secmark = exp->master->secmark;
#endif
			NF_CT_STAT_INC(net, expect_new);
		}
		spin_unlock_bh(&nf_conntrack_expect_lock);
	}
	if (!exp && tmpl)
		__nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC);

	/* Another CPU might have obtained a pointer to this object before it
	 * was released.  Because refcount is 0, refcount_inc_not_zero() will
	 * fail.
	 *
	 * After refcount_set(1) it will succeed; ensure that zeroing of
	 * ct->status and the correct ct->net pointer are visible; else another
	 * core might observe the CONFIRMED bit, which means the entry is valid
	 * and in the hash table, but it's not (anymore).
	 */
	smp_wmb();

	/* Now it is going to be associated with an sk_buff, set refcount to 1. */
	refcount_set(&ct->ct_general.use, 1);

	if (exp) {
		if (exp->expectfn)
			exp->expectfn(ct, exp);
		nf_ct_expect_put(exp);
	}

	return &ct->tuplehash[IP_CT_DIR_ORIGINAL];
}

/* On success, returns 0, sets skb->_nfct | ctinfo */
static int
resolve_normal_ct(struct nf_conn *tmpl,
		  struct sk_buff *skb,
		  unsigned int dataoff,
		  u_int8_t protonum,
		  const struct nf_hook_state *state)
{
	const struct nf_conntrack_zone *zone;
	struct nf_conntrack_tuple tuple;
	struct nf_conntrack_tuple_hash *h;
	enum ip_conntrack_info ctinfo;
	struct nf_conntrack_zone tmp;
	u32 hash, zone_id, rid;
	struct nf_conn *ct;

	if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
			     dataoff, state->pf, protonum, state->net,
			     &tuple))
		return 0;

	/* look for tuple match */
	zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);

	zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL);
	hash = hash_conntrack_raw(&tuple, zone_id, state->net);
	h = __nf_conntrack_find_get(state->net, zone, &tuple, hash);

	if (!h) {
		rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY);
		if (zone_id != rid) {
			u32 tmp = hash_conntrack_raw(&tuple, rid, state->net);

			h = __nf_conntrack_find_get(state->net, zone, &tuple, tmp);
		}
	}

	if (!h) {
		h = init_conntrack(state->net, tmpl, &tuple,
				   skb, dataoff, hash);
		if (!h)
			return 0;
		if (IS_ERR(h))
			return PTR_ERR(h);
	}
	ct = nf_ct_tuplehash_to_ctrack(h);

	/* It exists; we have (non-exclusive) reference. */
	if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
		ctinfo = IP_CT_ESTABLISHED_REPLY;
	} else {
		unsigned long status = READ_ONCE(ct->status);

		/* Once we've had two way comms, always ESTABLISHED. */
		if (likely(status & IPS_SEEN_REPLY))
			ctinfo = IP_CT_ESTABLISHED;
		else if (status & IPS_EXPECTED)
			ctinfo = IP_CT_RELATED;
		else
			ctinfo = IP_CT_NEW;
	}
	nf_ct_set(skb, ct, ctinfo);
	return 0;
}

/*
 * icmp packets need special treatment to handle error messages that are
 * related to a connection.
/*
 * ICMP packets need special treatment to handle error messages that are
 * related to a connection.
 *
 * Callers need to check if skb has a conntrack assigned when this
 * helper returns; in that case the skb belongs to an already known connection.
 */
static unsigned int __cold
nf_conntrack_handle_icmp(struct nf_conn *tmpl,
			 struct sk_buff *skb,
			 unsigned int dataoff,
			 u8 protonum,
			 const struct nf_hook_state *state)
{
	int ret;

	if (state->pf == NFPROTO_IPV4 && protonum == IPPROTO_ICMP)
		ret = nf_conntrack_icmpv4_error(tmpl, skb, dataoff, state);
#if IS_ENABLED(CONFIG_IPV6)
	else if (state->pf == NFPROTO_IPV6 && protonum == IPPROTO_ICMPV6)
		ret = nf_conntrack_icmpv6_error(tmpl, skb, dataoff, state);
#endif
	else
		return NF_ACCEPT;

	if (ret <= 0)
		NF_CT_STAT_INC_ATOMIC(state->net, error);

	return ret;
}

static int generic_packet(struct nf_conn *ct, struct sk_buff *skb,
			  enum ip_conntrack_info ctinfo)
{
	const unsigned int *timeout = nf_ct_timeout_lookup(ct);

	if (!timeout)
		timeout = &nf_generic_pernet(nf_ct_net(ct))->timeout;

	nf_ct_refresh_acct(ct, ctinfo, skb, *timeout);
	return NF_ACCEPT;
}

/* Returns verdict for packet, or -1 for invalid. */
static int nf_conntrack_handle_packet(struct nf_conn *ct,
				      struct sk_buff *skb,
				      unsigned int dataoff,
				      enum ip_conntrack_info ctinfo,
				      const struct nf_hook_state *state)
{
	switch (nf_ct_protonum(ct)) {
	case IPPROTO_TCP:
		return nf_conntrack_tcp_packet(ct, skb, dataoff,
					       ctinfo, state);
	case IPPROTO_UDP:
		return nf_conntrack_udp_packet(ct, skb, dataoff,
					       ctinfo, state);
	case IPPROTO_ICMP:
		return nf_conntrack_icmp_packet(ct, skb, ctinfo, state);
#if IS_ENABLED(CONFIG_IPV6)
	case IPPROTO_ICMPV6:
		return nf_conntrack_icmpv6_packet(ct, skb, ctinfo, state);
#endif
#ifdef CONFIG_NF_CT_PROTO_UDPLITE
	case IPPROTO_UDPLITE:
		return nf_conntrack_udplite_packet(ct, skb, dataoff,
						   ctinfo, state);
#endif
#ifdef CONFIG_NF_CT_PROTO_SCTP
	case IPPROTO_SCTP:
		return nf_conntrack_sctp_packet(ct, skb, dataoff,
						ctinfo, state);
#endif
#ifdef CONFIG_NF_CT_PROTO_GRE
	case IPPROTO_GRE:
		return nf_conntrack_gre_packet(ct, skb, dataoff,
					       ctinfo, state);
#endif
	}

	return generic_packet(ct, skb, ctinfo);
}

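/* Editor's note -- illustrative sketch, not part of the original file.
 *
 * The per-protocol handlers above return NF_ACCEPT for packets that were
 * tracked successfully and a negated verdict otherwise; nf_conntrack_in()
 * below undoes the negation, roughly:
 *
 *	ret = nf_conntrack_handle_packet(ct, skb, dataoff, ctinfo, state);
 *	if (ret <= 0) {
 *		if (ret == -NF_REPEAT)
 *			;	// TCP reopened a closed connection: retry
 *		ret = -ret;	// e.g. -NF_ACCEPT becomes NF_ACCEPT
 *	}
 *
 * This mirrors the error handling in nf_conntrack_in(); it is a reading
 * aid only, not additional behaviour.
 */
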
unsigned int
nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state)
{
	enum ip_conntrack_info ctinfo;
	struct nf_conn *ct, *tmpl;
	u_int8_t protonum;
	int dataoff, ret;

	tmpl = nf_ct_get(skb, &ctinfo);
	if (tmpl || ctinfo == IP_CT_UNTRACKED) {
		/* Previously seen (loopback or untracked)? Ignore. */
		if ((tmpl && !nf_ct_is_template(tmpl)) ||
		    ctinfo == IP_CT_UNTRACKED)
			return NF_ACCEPT;
		skb->_nfct = 0;
	}

	/* rcu_read_lock()ed by nf_hook_thresh */
	dataoff = get_l4proto(skb, skb_network_offset(skb), state->pf, &protonum);
	if (dataoff <= 0) {
		NF_CT_STAT_INC_ATOMIC(state->net, invalid);
		ret = NF_ACCEPT;
		goto out;
	}

	if (protonum == IPPROTO_ICMP || protonum == IPPROTO_ICMPV6) {
		ret = nf_conntrack_handle_icmp(tmpl, skb, dataoff,
					       protonum, state);
		if (ret <= 0) {
			ret = -ret;
			goto out;
		}
		/* ICMP[v6] protocol trackers may assign one conntrack. */
		if (skb->_nfct)
			goto out;
	}
repeat:
	ret = resolve_normal_ct(tmpl, skb, dataoff,
				protonum, state);
	if (ret < 0) {
		/* Too stressed to deal. */
		NF_CT_STAT_INC_ATOMIC(state->net, drop);
		ret = NF_DROP;
		goto out;
	}

	ct = nf_ct_get(skb, &ctinfo);
	if (!ct) {
		/* Not valid part of a connection */
		NF_CT_STAT_INC_ATOMIC(state->net, invalid);
		ret = NF_ACCEPT;
		goto out;
	}

	ret = nf_conntrack_handle_packet(ct, skb, dataoff, ctinfo, state);
	if (ret <= 0) {
		/* Invalid: inverse of the return code tells
		 * the netfilter core what to do */
		nf_ct_put(ct);
		skb->_nfct = 0;
		/* Special case: TCP tracker reports an attempt to reopen a
		 * closed/aborted connection. We have to go back and create a
		 * fresh conntrack.
		 */
		if (ret == -NF_REPEAT)
			goto repeat;

		NF_CT_STAT_INC_ATOMIC(state->net, invalid);
		if (ret == NF_DROP)
			NF_CT_STAT_INC_ATOMIC(state->net, drop);

		ret = -ret;
		goto out;
	}

	if (ctinfo == IP_CT_ESTABLISHED_REPLY &&
	    !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
		nf_conntrack_event_cache(IPCT_REPLY, ct);
out:
	if (tmpl)
		nf_ct_put(tmpl);

	return ret;
}
EXPORT_SYMBOL_GPL(nf_conntrack_in);

/* Refresh conntrack for this many jiffies and do accounting if @bytes is non-zero */
void __nf_ct_refresh_acct(struct nf_conn *ct,
			  enum ip_conntrack_info ctinfo,
			  u32 extra_jiffies,
			  unsigned int bytes)
{
	/* Only update if this is not a fixed timeout */
	if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
		goto acct;

	/* If not in hash table, timer will not be active yet */
	if (nf_ct_is_confirmed(ct))
		extra_jiffies += nfct_time_stamp;

	if (READ_ONCE(ct->timeout) != extra_jiffies)
		WRITE_ONCE(ct->timeout, extra_jiffies);
acct:
	if (bytes)
		nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), bytes);
}
EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct);

bool nf_ct_kill_acct(struct nf_conn *ct,
		     enum ip_conntrack_info ctinfo,
		     const struct sk_buff *skb)
{
	nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len);

	return nf_ct_delete(ct, 0, 0);
}
EXPORT_SYMBOL_GPL(nf_ct_kill_acct);

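/* Editor's note -- illustrative sketch, not part of the original file.
 *
 * Protocol trackers normally go through the nf_ct_refresh_acct() wrapper
 * (as generic_packet() above does), e.g. to give a flow another 30 seconds
 * and account the packet in a single call:
 *
 *	nf_ct_refresh_acct(ct, ctinfo, skb, 30 * HZ);
 *
 * For a confirmed entry the timeout is stored as an absolute deadline
 * (nfct_time_stamp + extra_jiffies), which is why __nf_ct_refresh_acct()
 * adds nfct_time_stamp above.
 */
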
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)

#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_conntrack.h>
#include <linux/mutex.h>

/* Generic function for tcp/udp/sctp/dccp and the like. */
int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb,
			       const struct nf_conntrack_tuple *tuple)
{
	if (nla_put_be16(skb, CTA_PROTO_SRC_PORT, tuple->src.u.tcp.port) ||
	    nla_put_be16(skb, CTA_PROTO_DST_PORT, tuple->dst.u.tcp.port))
		goto nla_put_failure;
	return 0;

nla_put_failure:
	return -1;
}
EXPORT_SYMBOL_GPL(nf_ct_port_tuple_to_nlattr);

const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = {
	[CTA_PROTO_SRC_PORT]  = { .type = NLA_U16 },
	[CTA_PROTO_DST_PORT]  = { .type = NLA_U16 },
};
EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy);

int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[],
			       struct nf_conntrack_tuple *t,
			       u_int32_t flags)
{
	if (flags & CTA_FILTER_FLAG(CTA_PROTO_SRC_PORT)) {
		if (!tb[CTA_PROTO_SRC_PORT])
			return -EINVAL;

		t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]);
	}

	if (flags & CTA_FILTER_FLAG(CTA_PROTO_DST_PORT)) {
		if (!tb[CTA_PROTO_DST_PORT])
			return -EINVAL;

		t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]);
	}

	return 0;
}
EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_to_tuple);

unsigned int nf_ct_port_nlattr_tuple_size(void)
{
	static unsigned int size __read_mostly;

	if (!size)
		size = nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1);

	return size;
}
EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size);
#endif

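/* Editor's note -- illustrative sketch, not part of the original file.
 *
 * Port-based trackers reuse the helpers above for their ctnetlink
 * attribute handling, roughly like this (hypothetical tracker shown only
 * as an example; the field names follow struct nf_conntrack_l4proto as
 * used by the existing TCP/UDP trackers):
 *
 *	static const struct nf_conntrack_l4proto nf_conntrack_l4proto_foo = {
 *		...
 *	#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 *		.tuple_to_nlattr	= nf_ct_port_tuple_to_nlattr,
 *		.nlattr_tuple_size	= nf_ct_port_nlattr_tuple_size,
 *		.nlattr_to_tuple	= nf_ct_port_nlattr_to_tuple,
 *		.nla_policy		= nf_ct_port_nla_policy,
 *	#endif
 *	};
 */
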
/* Used by ipt_REJECT and ip6t_REJECT. */
static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb)
{
	struct nf_conn *ct;
	enum ip_conntrack_info ctinfo;

	/* This ICMP is in reverse direction to the packet which caused it */
	ct = nf_ct_get(skb, &ctinfo);
	if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
		ctinfo = IP_CT_RELATED_REPLY;
	else
		ctinfo = IP_CT_RELATED;

	/* Attach to new skbuff, and increment count */
	nf_ct_set(nskb, ct, ctinfo);
	nf_conntrack_get(skb_nfct(nskb));
}

/* This packet is coming from userspace via nf_queue, complete the packet
 * processing after the helper invocation in nf_confirm().
 */
static int nf_confirm_cthelper(struct sk_buff *skb, struct nf_conn *ct,
			       enum ip_conntrack_info ctinfo)
{
	const struct nf_conntrack_helper *helper;
	const struct nf_conn_help *help;
	int protoff;

	help = nfct_help(ct);
	if (!help)
		return NF_ACCEPT;

	helper = rcu_dereference(help->helper);
	if (!helper)
		return NF_ACCEPT;

	if (!(helper->flags & NF_CT_HELPER_F_USERSPACE))
		return NF_ACCEPT;

	switch (nf_ct_l3num(ct)) {
	case NFPROTO_IPV4:
		protoff = skb_network_offset(skb) + ip_hdrlen(skb);
		break;
#if IS_ENABLED(CONFIG_IPV6)
	case NFPROTO_IPV6: {
		__be16 frag_off;
		u8 pnum;

		pnum = ipv6_hdr(skb)->nexthdr;
		protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum,
					   &frag_off);
		if (protoff < 0 || (frag_off & htons(~0x7)) != 0)
			return NF_ACCEPT;
		break;
	}
#endif
	default:
		return NF_ACCEPT;
	}

	if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
	    !nf_is_loopback_packet(skb)) {
		if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) {
			NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
			return NF_DROP;
		}
	}

	/* We've seen it coming out the other side: confirm it */
	return nf_conntrack_confirm(skb);
}

static int nf_conntrack_update(struct net *net, struct sk_buff *skb)
{
	enum ip_conntrack_info ctinfo;
	struct nf_conn *ct;

	ct = nf_ct_get(skb, &ctinfo);
	if (!ct)
		return NF_ACCEPT;

	return nf_confirm_cthelper(skb, ct, ctinfo);
}

static bool nf_conntrack_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
				       const struct sk_buff *skb)
{
	const struct nf_conntrack_tuple *src_tuple;
	const struct nf_conntrack_tuple_hash *hash;
	struct nf_conntrack_tuple srctuple;
	enum ip_conntrack_info ctinfo;
	struct nf_conn *ct;

	ct = nf_ct_get(skb, &ctinfo);
	if (ct) {
		src_tuple = nf_ct_tuple(ct, CTINFO2DIR(ctinfo));
		memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple));
		return true;
	}

	if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
			       NFPROTO_IPV4, dev_net(skb->dev),
			       &srctuple))
		return false;

	hash = nf_conntrack_find_get(dev_net(skb->dev),
				     &nf_ct_zone_dflt,
				     &srctuple);
	if (!hash)
		return false;

	ct = nf_ct_tuplehash_to_ctrack(hash);
	src_tuple = nf_ct_tuple(ct, !hash->tuple.dst.dir);
	memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple));
	nf_ct_put(ct);

	return true;
}

/* Bring out ya dead! */
static struct nf_conn *
get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
		const struct nf_ct_iter_data *iter_data, unsigned int *bucket)
{
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;
	struct hlist_nulls_node *n;
	spinlock_t *lockp;

	for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
		struct hlist_nulls_head *hslot = &nf_conntrack_hash[*bucket];

		if (hlist_nulls_empty(hslot))
			continue;

		lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS];
		local_bh_disable();
		nf_conntrack_lock(lockp);
		hlist_nulls_for_each_entry(h, n, hslot, hnnode) {
			if (NF_CT_DIRECTION(h) != IP_CT_DIR_REPLY)
				continue;
			/* All nf_conn objects are added to the hash table twice,
			 * once for the original direction tuple, once for the
			 * reply tuple.
			 *
			 * Exception: In the IPS_NAT_CLASH case, only the reply
			 * tuple is added (the original tuple already existed for
			 * a different object).
			 *
			 * We only need to call the iterator once for each
			 * conntrack, so we just use the 'reply' direction
			 * tuple while iterating.
			 */
			ct = nf_ct_tuplehash_to_ctrack(h);

			if (iter_data->net &&
			    !net_eq(iter_data->net, nf_ct_net(ct)))
				continue;

			if (iter(ct, iter_data->data))
				goto found;
		}
		spin_unlock(lockp);
		local_bh_enable();
		cond_resched();
	}

	return NULL;
found:
	refcount_inc(&ct->ct_general.use);
	spin_unlock(lockp);
	local_bh_enable();
	return ct;
}

static void nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data),
				  const struct nf_ct_iter_data *iter_data)
{
	unsigned int bucket = 0;
	struct nf_conn *ct;

	might_sleep();

	mutex_lock(&nf_conntrack_mutex);
	while ((ct = get_next_corpse(iter, iter_data, &bucket)) != NULL) {
		/* Time to push up daisies... */

		nf_ct_delete(ct, iter_data->portid, iter_data->report);
		nf_ct_put(ct);
		cond_resched();
	}
	mutex_unlock(&nf_conntrack_mutex);
}

void nf_ct_iterate_cleanup_net(int (*iter)(struct nf_conn *i, void *data),
			       const struct nf_ct_iter_data *iter_data)
{
	struct net *net = iter_data->net;
	struct nf_conntrack_net *cnet = nf_ct_pernet(net);

	might_sleep();

	if (atomic_read(&cnet->count) == 0)
		return;

	nf_ct_iterate_cleanup(iter, iter_data);
}
EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup_net);

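/* Editor's note -- illustrative sketch, not part of the original file.
 *
 * A typical caller flushes a subset of entries by passing a predicate that
 * returns non-zero for conntracks that should be deleted (the callback and
 * the mark-based filter are hypothetical, shown only as an example):
 *
 *	static int kill_by_mark(struct nf_conn *ct, void *data)
 *	{
 *		return ct->mark == *(u32 *)data;	// needs CONFIG_NF_CONNTRACK_MARK
 *	}
 *
 *	struct nf_ct_iter_data iter_data = {
 *		.net	= net,
 *		.data	= &mark,
 *	};
 *	nf_ct_iterate_cleanup_net(kill_by_mark, &iter_data);
 *
 * The same iterator contract is used by kill_all() further below.
 */
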
2446 */ 2447 synchronize_rcu(); 2448 } 2449 EXPORT_SYMBOL_GPL(nf_ct_iterate_destroy); 2450 2451 static int kill_all(struct nf_conn *i, void *data) 2452 { 2453 return 1; 2454 } 2455 2456 void nf_conntrack_cleanup_start(void) 2457 { 2458 cleanup_nf_conntrack_bpf(); 2459 conntrack_gc_work.exiting = true; 2460 } 2461 2462 void nf_conntrack_cleanup_end(void) 2463 { 2464 RCU_INIT_POINTER(nf_ct_hook, NULL); 2465 cancel_delayed_work_sync(&conntrack_gc_work.dwork); 2466 kvfree(nf_conntrack_hash); 2467 2468 nf_conntrack_proto_fini(); 2469 nf_conntrack_helper_fini(); 2470 nf_conntrack_expect_fini(); 2471 2472 kmem_cache_destroy(nf_conntrack_cachep); 2473 } 2474 2475 /* 2476 * Mishearing the voices in his head, our hero wonders how he's 2477 * supposed to kill the mall. 2478 */ 2479 void nf_conntrack_cleanup_net(struct net *net) 2480 { 2481 LIST_HEAD(single); 2482 2483 list_add(&net->exit_list, &single); 2484 nf_conntrack_cleanup_net_list(&single); 2485 } 2486 2487 void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list) 2488 { 2489 struct nf_ct_iter_data iter_data = {}; 2490 struct net *net; 2491 int busy; 2492 2493 /* 2494 * This makes sure all current packets have passed through 2495 * netfilter framework. Roll on, two-stage module 2496 * delete... 2497 */ 2498 synchronize_rcu_expedited(); 2499 i_see_dead_people: 2500 busy = 0; 2501 list_for_each_entry(net, net_exit_list, exit_list) { 2502 struct nf_conntrack_net *cnet = nf_ct_pernet(net); 2503 2504 iter_data.net = net; 2505 nf_ct_iterate_cleanup_net(kill_all, &iter_data); 2506 if (atomic_read(&cnet->count) != 0) 2507 busy = 1; 2508 } 2509 if (busy) { 2510 schedule(); 2511 goto i_see_dead_people; 2512 } 2513 2514 list_for_each_entry(net, net_exit_list, exit_list) { 2515 nf_conntrack_ecache_pernet_fini(net); 2516 nf_conntrack_expect_pernet_fini(net); 2517 free_percpu(net->ct.stat); 2518 } 2519 } 2520 2521 void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls) 2522 { 2523 struct hlist_nulls_head *hash; 2524 unsigned int nr_slots, i; 2525 2526 if (*sizep > (INT_MAX / sizeof(struct hlist_nulls_head))) 2527 return NULL; 2528 2529 BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head)); 2530 nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head)); 2531 2532 if (nr_slots > (INT_MAX / sizeof(struct hlist_nulls_head))) 2533 return NULL; 2534 2535 hash = kvcalloc(nr_slots, sizeof(struct hlist_nulls_head), GFP_KERNEL); 2536 2537 if (hash && nulls) 2538 for (i = 0; i < nr_slots; i++) 2539 INIT_HLIST_NULLS_HEAD(&hash[i], i); 2540 2541 return hash; 2542 } 2543 EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable); 2544 2545 int nf_conntrack_hash_resize(unsigned int hashsize) 2546 { 2547 int i, bucket; 2548 unsigned int old_size; 2549 struct hlist_nulls_head *hash, *old_hash; 2550 struct nf_conntrack_tuple_hash *h; 2551 struct nf_conn *ct; 2552 2553 if (!hashsize) 2554 return -EINVAL; 2555 2556 hash = nf_ct_alloc_hashtable(&hashsize, 1); 2557 if (!hash) 2558 return -ENOMEM; 2559 2560 mutex_lock(&nf_conntrack_mutex); 2561 old_size = nf_conntrack_htable_size; 2562 if (old_size == hashsize) { 2563 mutex_unlock(&nf_conntrack_mutex); 2564 kvfree(hash); 2565 return 0; 2566 } 2567 2568 local_bh_disable(); 2569 nf_conntrack_all_lock(); 2570 write_seqcount_begin(&nf_conntrack_generation); 2571 2572 /* Lookups in the old hash might happen in parallel, which means we 2573 * might get false negatives during connection lookup. 
int nf_conntrack_hash_resize(unsigned int hashsize)
{
	int i, bucket;
	unsigned int old_size;
	struct hlist_nulls_head *hash, *old_hash;
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;

	if (!hashsize)
		return -EINVAL;

	hash = nf_ct_alloc_hashtable(&hashsize, 1);
	if (!hash)
		return -ENOMEM;

	mutex_lock(&nf_conntrack_mutex);
	old_size = nf_conntrack_htable_size;
	if (old_size == hashsize) {
		mutex_unlock(&nf_conntrack_mutex);
		kvfree(hash);
		return 0;
	}

	local_bh_disable();
	nf_conntrack_all_lock();
	write_seqcount_begin(&nf_conntrack_generation);

	/* Lookups in the old hash might happen in parallel, which means we
	 * might get false negatives during connection lookup. New connections
	 * created because of a false negative won't make it into the hash
	 * though since that requires taking the locks.
	 */

	for (i = 0; i < nf_conntrack_htable_size; i++) {
		while (!hlist_nulls_empty(&nf_conntrack_hash[i])) {
			unsigned int zone_id;

			h = hlist_nulls_entry(nf_conntrack_hash[i].first,
					      struct nf_conntrack_tuple_hash, hnnode);
			ct = nf_ct_tuplehash_to_ctrack(h);
			hlist_nulls_del_rcu(&h->hnnode);

			zone_id = nf_ct_zone_id(nf_ct_zone(ct), NF_CT_DIRECTION(h));
			bucket = __hash_conntrack(nf_ct_net(ct),
						  &h->tuple, zone_id, hashsize);
			hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
		}
	}
	old_hash = nf_conntrack_hash;

	nf_conntrack_hash = hash;
	nf_conntrack_htable_size = hashsize;

	write_seqcount_end(&nf_conntrack_generation);
	nf_conntrack_all_unlock();
	local_bh_enable();

	mutex_unlock(&nf_conntrack_mutex);

	synchronize_net();
	kvfree(old_hash);
	return 0;
}

int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp)
{
	unsigned int hashsize;
	int rc;

	if (current->nsproxy->net_ns != &init_net)
		return -EOPNOTSUPP;

	/* On boot, we can set this without any fancy locking. */
	if (!nf_conntrack_hash)
		return param_set_uint(val, kp);

	rc = kstrtouint(val, 0, &hashsize);
	if (rc)
		return rc;

	return nf_conntrack_hash_resize(hashsize);
}

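/* Editor's note -- illustrative note, not part of the original file.
 *
 * nf_conntrack_set_hashsize() is the handler behind the writable
 * "hashsize" module parameter (registered elsewhere in the conntrack
 * module), so on a typical system the table can be resized at runtime
 * with something like:
 *
 *	echo 262144 > /sys/module/nf_conntrack/parameters/hashsize
 *
 * Only the init_net namespace may do this, per the check above; the exact
 * sysfs path depends on how the module parameter is registered.
 */
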
int nf_conntrack_init_start(void)
{
	unsigned long nr_pages = totalram_pages();
	int max_factor = 8;
	int ret = -ENOMEM;
	int i;

	seqcount_spinlock_init(&nf_conntrack_generation,
			       &nf_conntrack_locks_all_lock);

	for (i = 0; i < CONNTRACK_LOCKS; i++)
		spin_lock_init(&nf_conntrack_locks[i]);

	if (!nf_conntrack_htable_size) {
		nf_conntrack_htable_size
			= (((nr_pages << PAGE_SHIFT) / 16384)
			   / sizeof(struct hlist_head));
		if (BITS_PER_LONG >= 64 &&
		    nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE)))
			nf_conntrack_htable_size = 262144;
		else if (nr_pages > (1024 * 1024 * 1024 / PAGE_SIZE))
			nf_conntrack_htable_size = 65536;

		if (nf_conntrack_htable_size < 1024)
			nf_conntrack_htable_size = 1024;
		/* Use a max. factor of one by default to keep the average
		 * hash chain length at 2 entries. Each entry has to be added
		 * twice (once for original direction, once for reply).
		 * When a table size is given we use the old value of 8 to
		 * avoid implicit reduction of the max entries setting.
		 */
		max_factor = 1;
	}

	nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1);
	if (!nf_conntrack_hash)
		return -ENOMEM;

	nf_conntrack_max = max_factor * nf_conntrack_htable_size;

	nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
						sizeof(struct nf_conn),
						NFCT_INFOMASK + 1,
						SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN, NULL);
	if (!nf_conntrack_cachep)
		goto err_cachep;

	ret = nf_conntrack_expect_init();
	if (ret < 0)
		goto err_expect;

	ret = nf_conntrack_helper_init();
	if (ret < 0)
		goto err_helper;

	ret = nf_conntrack_proto_init();
	if (ret < 0)
		goto err_proto;

	conntrack_gc_work_init(&conntrack_gc_work);
	queue_delayed_work(system_power_efficient_wq, &conntrack_gc_work.dwork, HZ);

	ret = register_nf_conntrack_bpf();
	if (ret < 0)
		goto err_kfunc;

	return 0;

err_kfunc:
	cancel_delayed_work_sync(&conntrack_gc_work.dwork);
	nf_conntrack_proto_fini();
err_proto:
	nf_conntrack_helper_fini();
err_helper:
	nf_conntrack_expect_fini();
err_expect:
	kmem_cache_destroy(nf_conntrack_cachep);
err_cachep:
	kvfree(nf_conntrack_hash);
	return ret;
}

static void nf_conntrack_set_closing(struct nf_conntrack *nfct)
{
	struct nf_conn *ct = nf_ct_to_nf_conn(nfct);

	switch (nf_ct_protonum(ct)) {
	case IPPROTO_TCP:
		nf_conntrack_tcp_set_closing(ct);
		break;
	}
}

static const struct nf_ct_hook nf_conntrack_hook = {
	.update		= nf_conntrack_update,
	.destroy	= nf_ct_destroy,
	.get_tuple_skb	= nf_conntrack_get_tuple_skb,
	.attach		= nf_conntrack_attach,
	.set_closing	= nf_conntrack_set_closing,
	.confirm	= __nf_conntrack_confirm,
	.get_id		= nf_conntrack_get_id,
};

void nf_conntrack_init_end(void)
{
	RCU_INIT_POINTER(nf_ct_hook, &nf_conntrack_hook);
}

/*
 * We need to use special "null" values, not used in hash table
 */
#define UNCONFIRMED_NULLS_VAL	((1<<30)+0)

int nf_conntrack_init_net(struct net *net)
{
	struct nf_conntrack_net *cnet = nf_ct_pernet(net);
	int ret = -ENOMEM;

	BUILD_BUG_ON(IP_CT_UNTRACKED == IP_CT_NUMBER);
	BUILD_BUG_ON_NOT_POWER_OF_2(CONNTRACK_LOCKS);
	atomic_set(&cnet->count, 0);

	net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
	if (!net->ct.stat)
		return ret;

	ret = nf_conntrack_expect_pernet_init(net);
	if (ret < 0)
		goto err_expect;

	nf_conntrack_acct_pernet_init(net);
	nf_conntrack_tstamp_pernet_init(net);
	nf_conntrack_ecache_pernet_init(net);
	nf_conntrack_proto_pernet_init(net);

	return 0;

err_expect:
	free_percpu(net->ct.stat);
	return ret;
}

/* ctnetlink code shared by both ctnetlink and nf_conntrack_bpf */

int __nf_ct_change_timeout(struct nf_conn *ct, u64 timeout)
{
	if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
		return -EPERM;

	__nf_ct_set_timeout(ct, timeout);

	if (test_bit(IPS_DYING_BIT, &ct->status))
		return -ETIME;

	return 0;
}
EXPORT_SYMBOL_GPL(__nf_ct_change_timeout);

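/* Editor's note -- illustrative sketch, not part of the original file.
 *
 * __nf_ct_change_timeout() expects the new timeout in jiffies; a caller
 * that receives seconds (for example ctnetlink handling CTA_TIMEOUT)
 * would typically do roughly:
 *
 *	err = __nf_ct_change_timeout(ct, (u64)timeout_secs * HZ);
 *	if (err == -ETIME)
 *		;	// the entry is already dying
 *
 * The conversion and the exact caller behaviour live outside this file;
 * this is only a sketch of the intended use.
 */
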
void __nf_ct_change_status(struct nf_conn *ct, unsigned long on, unsigned long off)
{
	unsigned int bit;

	/* Ignore these unchangeable bits */
	on &= ~IPS_UNCHANGEABLE_MASK;
	off &= ~IPS_UNCHANGEABLE_MASK;

	for (bit = 0; bit < __IPS_MAX_BIT; bit++) {
		if (on & (1 << bit))
			set_bit(bit, &ct->status);
		else if (off & (1 << bit))
			clear_bit(bit, &ct->status);
	}
}
EXPORT_SYMBOL_GPL(__nf_ct_change_status);

int nf_ct_change_status_common(struct nf_conn *ct, unsigned int status)
{
	unsigned long d;

	d = ct->status ^ status;

	if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING))
		/* unchangeable */
		return -EBUSY;

	if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY))
		/* SEEN_REPLY bit can only be set */
		return -EBUSY;

	if (d & IPS_ASSURED && !(status & IPS_ASSURED))
		/* ASSURED bit can only be set */
		return -EBUSY;

	__nf_ct_change_status(ct, status, 0);
	return 0;
}
EXPORT_SYMBOL_GPL(nf_ct_change_status_common);
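
/* Editor's note -- illustrative note, not part of the original file.
 *
 * nf_ct_change_status_common() enforces the rules spelled out above:
 * IPS_EXPECTED, IPS_CONFIRMED and IPS_DYING may never be toggled, and
 * IPS_SEEN_REPLY / IPS_ASSURED may only be turned on.  For example, a
 * request that tries to clear IPS_ASSURED on an assured flow gets -EBUSY,
 * while one that merely sets additional changeable bits ends up in
 *
 *	__nf_ct_change_status(ct, status, 0);
 *
 * as in the function body above.
 */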