// SPDX-License-Identifier: GPL-2.0-only
/* Connection state tracking for netfilter.  This is separated from,
   but required by, the NAT layer; it can also be used by an iptables
   extension. */

/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
 * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
 * (C) 2005-2012 Patrick McHardy <kaber@trash.net>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/types.h>
#include <linux/netfilter.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/vmalloc.h>
#include <linux/stddef.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/siphash.h>
#include <linux/err.h>
#include <linux/percpu.h>
#include <linux/moduleparam.h>
#include <linux/notifier.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/socket.h>
#include <linux/mm.h>
#include <linux/nsproxy.h>
#include <linux/rculist_nulls.h>

#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_bpf.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_extend.h>
#include <net/netfilter/nf_conntrack_acct.h>
#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/nf_conntrack_timestamp.h>
#include <net/netfilter/nf_conntrack_timeout.h>
#include <net/netfilter/nf_conntrack_labels.h>
#include <net/netfilter/nf_conntrack_synproxy.h>
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_helper.h>
#include <net/netns/hash.h>
#include <net/ip.h>

#include "nf_internals.h"

__cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];
EXPORT_SYMBOL_GPL(nf_conntrack_locks);

__cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock);
EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock);

struct hlist_nulls_head *nf_conntrack_hash __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_hash);

struct conntrack_gc_work {
	struct delayed_work	dwork;
	u32			next_bucket;
	u32			avg_timeout;
	u32			count;
	u32			start_time;
	bool			exiting;
	bool			early_drop;
};

static __read_mostly struct kmem_cache *nf_conntrack_cachep;
static DEFINE_SPINLOCK(nf_conntrack_locks_all_lock);
static __read_mostly bool nf_conntrack_locks_all;

/* serialize hash resizes and nf_ct_iterate_cleanup */
static DEFINE_MUTEX(nf_conntrack_mutex);

#define GC_SCAN_INTERVAL_MAX	(60ul * HZ)
#define GC_SCAN_INTERVAL_MIN	(1ul * HZ)

/* clamp timeouts to this value (TCP unacked) */
#define GC_SCAN_INTERVAL_CLAMP	(300ul * HZ)

/* Initial bias pretending we have 100 entries at the upper bound so we don't
 * wakeup often just because we have three entries with a 1s timeout while still
 * allowing non-idle machines to wakeup more often when needed.
 */
#define GC_SCAN_INITIAL_COUNT	100
#define GC_SCAN_INTERVAL_INIT	GC_SCAN_INTERVAL_MAX

#define GC_SCAN_MAX_DURATION	msecs_to_jiffies(10)
#define GC_SCAN_EXPIRED_MAX	(64000u / HZ)

#define MIN_CHAINLEN	50u
#define MAX_CHAINLEN	(80u - MIN_CHAINLEN)

static struct conntrack_gc_work conntrack_gc_work;

void nf_conntrack_lock(spinlock_t *lock) __acquires(lock)
{
	/* 1) Acquire the lock */
	spin_lock(lock);

	/* 2) read nf_conntrack_locks_all, with ACQUIRE semantics
	 * It pairs with the smp_store_release() in nf_conntrack_all_unlock()
	 */
	if (likely(smp_load_acquire(&nf_conntrack_locks_all) == false))
		return;

	/* fast path failed, unlock */
	spin_unlock(lock);

	/* Slow path 1) get global lock */
	spin_lock(&nf_conntrack_locks_all_lock);

	/* Slow path 2) get the lock we want */
	spin_lock(lock);

	/* Slow path 3) release the global lock */
	spin_unlock(&nf_conntrack_locks_all_lock);
}
EXPORT_SYMBOL_GPL(nf_conntrack_lock);

static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2)
{
	h1 %= CONNTRACK_LOCKS;
	h2 %= CONNTRACK_LOCKS;
	spin_unlock(&nf_conntrack_locks[h1]);
	if (h1 != h2)
		spin_unlock(&nf_conntrack_locks[h2]);
}

/* return true if we need to recompute hashes (in case hash table was resized) */
static bool nf_conntrack_double_lock(struct net *net, unsigned int h1,
				     unsigned int h2, unsigned int sequence)
{
	h1 %= CONNTRACK_LOCKS;
	h2 %= CONNTRACK_LOCKS;
	if (h1 <= h2) {
		nf_conntrack_lock(&nf_conntrack_locks[h1]);
		if (h1 != h2)
			spin_lock_nested(&nf_conntrack_locks[h2],
					 SINGLE_DEPTH_NESTING);
	} else {
		nf_conntrack_lock(&nf_conntrack_locks[h2]);
		spin_lock_nested(&nf_conntrack_locks[h1],
				 SINGLE_DEPTH_NESTING);
	}
	if (read_seqcount_retry(&nf_conntrack_generation, sequence)) {
		nf_conntrack_double_unlock(h1, h2);
		return true;
	}
	return false;
}

static void nf_conntrack_all_lock(void)
	__acquires(&nf_conntrack_locks_all_lock)
{
	int i;

	spin_lock(&nf_conntrack_locks_all_lock);

	/* For nf_conntrack_locks_all, only the latest time when another
	 * CPU will see an update is controlled by the "release" of the
	 * spin_lock below.
	 * The earliest time is not controlled, and thus KCSAN could detect
	 * a race when nf_conntrack_lock() reads the variable.
	 * WRITE_ONCE() is used to ensure the compiler will not
	 * optimize the write.
	 */
	WRITE_ONCE(nf_conntrack_locks_all, true);

	for (i = 0; i < CONNTRACK_LOCKS; i++) {
		spin_lock(&nf_conntrack_locks[i]);

		/* This spin_unlock provides the "release" to ensure that
		 * nf_conntrack_locks_all==true is visible to everyone that
		 * acquired spin_lock(&nf_conntrack_locks[]).
		 */
		spin_unlock(&nf_conntrack_locks[i]);
	}
}

static void nf_conntrack_all_unlock(void)
	__releases(&nf_conntrack_locks_all_lock)
{
	/* All prior stores must be complete before we clear
	 * 'nf_conntrack_locks_all'. Otherwise nf_conntrack_lock()
	 * might observe the false value but not the entire
	 * critical section.
	 * It pairs with the smp_load_acquire() in nf_conntrack_lock()
	 */
	smp_store_release(&nf_conntrack_locks_all, false);
	spin_unlock(&nf_conntrack_locks_all_lock);
}
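
/* Global table parameters: the bucket count and the upper bound on tracked
 * connections (both exported for other conntrack modules), the generation
 * seqcount used to detect concurrent hash resizes, and the siphash key used
 * for bucket selection.
 */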
unsigned int nf_conntrack_htable_size __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);

unsigned int nf_conntrack_max __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_max);
seqcount_spinlock_t nf_conntrack_generation __read_mostly;
static siphash_aligned_key_t nf_conntrack_hash_rnd;

static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple,
			      unsigned int zoneid,
			      const struct net *net)
{
	siphash_key_t key;

	get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd));

	key = nf_conntrack_hash_rnd;

	key.key[0] ^= zoneid;
	key.key[1] ^= net_hash_mix(net);

	return siphash((void *)tuple,
		       offsetofend(struct nf_conntrack_tuple, dst.__nfct_hash_offsetend),
		       &key);
}

static u32 scale_hash(u32 hash)
{
	return reciprocal_scale(hash, nf_conntrack_htable_size);
}

static u32 __hash_conntrack(const struct net *net,
			    const struct nf_conntrack_tuple *tuple,
			    unsigned int zoneid,
			    unsigned int size)
{
	return reciprocal_scale(hash_conntrack_raw(tuple, zoneid, net), size);
}

static u32 hash_conntrack(const struct net *net,
			  const struct nf_conntrack_tuple *tuple,
			  unsigned int zoneid)
{
	return scale_hash(hash_conntrack_raw(tuple, zoneid, net));
}

static bool nf_ct_get_tuple_ports(const struct sk_buff *skb,
				  unsigned int dataoff,
				  struct nf_conntrack_tuple *tuple)
{	struct {
		__be16 sport;
		__be16 dport;
	} _inet_hdr, *inet_hdr;

	/* Actually only need first 4 bytes to get ports. */
	inet_hdr = skb_header_pointer(skb, dataoff, sizeof(_inet_hdr), &_inet_hdr);
	if (!inet_hdr)
		return false;

	tuple->src.u.udp.port = inet_hdr->sport;
	tuple->dst.u.udp.port = inet_hdr->dport;
	return true;
}

static bool
nf_ct_get_tuple(const struct sk_buff *skb,
		unsigned int nhoff,
		unsigned int dataoff,
		u_int16_t l3num,
		u_int8_t protonum,
		struct net *net,
		struct nf_conntrack_tuple *tuple)
{
	unsigned int size;
	const __be32 *ap;
	__be32 _addrs[8];

	memset(tuple, 0, sizeof(*tuple));

	tuple->src.l3num = l3num;
	switch (l3num) {
	case NFPROTO_IPV4:
		nhoff += offsetof(struct iphdr, saddr);
		size = 2 * sizeof(__be32);
		break;
	case NFPROTO_IPV6:
		nhoff += offsetof(struct ipv6hdr, saddr);
		size = sizeof(_addrs);
		break;
	default:
		return true;
	}

	ap = skb_header_pointer(skb, nhoff, size, _addrs);
	if (!ap)
		return false;

	switch (l3num) {
	case NFPROTO_IPV4:
		tuple->src.u3.ip = ap[0];
		tuple->dst.u3.ip = ap[1];
		break;
	case NFPROTO_IPV6:
		memcpy(tuple->src.u3.ip6, ap, sizeof(tuple->src.u3.ip6));
		memcpy(tuple->dst.u3.ip6, ap + 4, sizeof(tuple->dst.u3.ip6));
		break;
	}

	tuple->dst.protonum = protonum;
	tuple->dst.dir = IP_CT_DIR_ORIGINAL;

	switch (protonum) {
#if IS_ENABLED(CONFIG_IPV6)
	case IPPROTO_ICMPV6:
		return icmpv6_pkt_to_tuple(skb, dataoff, net, tuple);
#endif
	case IPPROTO_ICMP:
		return icmp_pkt_to_tuple(skb, dataoff, net, tuple);
#ifdef CONFIG_NF_CT_PROTO_GRE
	case IPPROTO_GRE:
		return gre_pkt_to_tuple(skb, dataoff, net, tuple);
#endif
	case IPPROTO_TCP:
	case IPPROTO_UDP:
#ifdef CONFIG_NF_CT_PROTO_UDPLITE
	case IPPROTO_UDPLITE:
#endif
#ifdef CONFIG_NF_CT_PROTO_SCTP
	case IPPROTO_SCTP:
#endif
#ifdef CONFIG_NF_CT_PROTO_DCCP
	case IPPROTO_DCCP:
#endif
		/* fallthrough */
		return nf_ct_get_tuple_ports(skb, dataoff, tuple);
	default:
		break;
	}

	return true;
}

static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
			    u_int8_t *protonum)
{
	int dataoff = -1;
	const struct iphdr *iph;
	struct iphdr _iph;

	iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
	if (!iph)
		return -1;

	/* Conntrack defragments packets, we might still see fragments
	 * inside ICMP packets though.
	 */
	if (iph->frag_off & htons(IP_OFFSET))
		return -1;

	dataoff = nhoff + (iph->ihl << 2);
	*protonum = iph->protocol;

	/* Check bogus IP headers */
	if (dataoff > skb->len) {
		pr_debug("bogus IPv4 packet: nhoff %u, ihl %u, skblen %u\n",
			 nhoff, iph->ihl << 2, skb->len);
		return -1;
	}
	return dataoff;
}

#if IS_ENABLED(CONFIG_IPV6)
static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
			    u8 *protonum)
{
	int protoff = -1;
	unsigned int extoff = nhoff + sizeof(struct ipv6hdr);
	__be16 frag_off;
	u8 nexthdr;

	if (skb_copy_bits(skb, nhoff + offsetof(struct ipv6hdr, nexthdr),
			  &nexthdr, sizeof(nexthdr)) != 0) {
		pr_debug("can't get nexthdr\n");
		return -1;
	}
	protoff = ipv6_skip_exthdr(skb, extoff, &nexthdr, &frag_off);
	/*
	 * (protoff == skb->len) means the packet has no data, just
	 * IPv6 and possibly extension headers, but it is tracked anyway
	 */
	if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
		pr_debug("can't find proto in pkt\n");
		return -1;
	}

	*protonum = nexthdr;
	return protoff;
}
#endif

static int get_l4proto(const struct sk_buff *skb,
		       unsigned int nhoff, u8 pf, u8 *l4num)
{
	switch (pf) {
	case NFPROTO_IPV4:
		return ipv4_get_l4proto(skb, nhoff, l4num);
#if IS_ENABLED(CONFIG_IPV6)
	case NFPROTO_IPV6:
		return ipv6_get_l4proto(skb, nhoff, l4num);
#endif
	default:
		*l4num = 0;
		break;
	}
	return -1;
}

bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
		       u_int16_t l3num,
		       struct net *net, struct nf_conntrack_tuple *tuple)
{
	u8 protonum;
	int protoff;

	protoff = get_l4proto(skb, nhoff, l3num, &protonum);
	if (protoff <= 0)
		return false;

	return nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, net, tuple);
}
EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr);

bool
nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
		   const struct nf_conntrack_tuple *orig)
{
	memset(inverse, 0, sizeof(*inverse));

	inverse->src.l3num = orig->src.l3num;

	switch (orig->src.l3num) {
	case NFPROTO_IPV4:
		inverse->src.u3.ip = orig->dst.u3.ip;
		inverse->dst.u3.ip = orig->src.u3.ip;
		break;
	case NFPROTO_IPV6:
		inverse->src.u3.in6 = orig->dst.u3.in6;
		inverse->dst.u3.in6 = orig->src.u3.in6;
		break;
	default:
		break;
	}

	inverse->dst.dir = !orig->dst.dir;

	inverse->dst.protonum = orig->dst.protonum;

	switch (orig->dst.protonum) {
	case IPPROTO_ICMP:
		return nf_conntrack_invert_icmp_tuple(inverse, orig);
#if IS_ENABLED(CONFIG_IPV6)
	case IPPROTO_ICMPV6:
		return nf_conntrack_invert_icmpv6_tuple(inverse, orig);
#endif
	}

	inverse->src.u.all = orig->dst.u.all;
	inverse->dst.u.all = orig->src.u.all;
	return true;
}
EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);

/* Generate an almost-unique pseudo-id for a given conntrack.
 *
 * intentionally doesn't re-use any of the seeds used for hash
 * table location, we assume id gets exposed to userspace.
 *
 * Following nf_conn items do not change throughout lifetime
 * of the nf_conn:
 *
 * 1. nf_conn address
 * 2. nf_conn->master address (normally NULL)
 * 3. the associated net namespace
 * 4. the original direction tuple
 */
u32 nf_ct_get_id(const struct nf_conn *ct)
{
	static siphash_aligned_key_t ct_id_seed;
	unsigned long a, b, c, d;

	net_get_random_once(&ct_id_seed, sizeof(ct_id_seed));

	a = (unsigned long)ct;
	b = (unsigned long)ct->master;
	c = (unsigned long)nf_ct_net(ct);
	d = (unsigned long)siphash(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				   sizeof(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple),
				   &ct_id_seed);
#ifdef CONFIG_64BIT
	return siphash_4u64((u64)a, (u64)b, (u64)c, (u64)d, &ct_id_seed);
#else
	return siphash_4u32((u32)a, (u32)b, (u32)c, (u32)d, &ct_id_seed);
#endif
}
EXPORT_SYMBOL_GPL(nf_ct_get_id);

static u32 nf_conntrack_get_id(const struct nf_conntrack *nfct)
{
	return nf_ct_get_id(nf_ct_to_nf_conn(nfct));
}

static void
clean_from_lists(struct nf_conn *ct)
{
	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode);

	/* Destroy all pending expectations */
	nf_ct_remove_expectations(ct);
}

#define NFCT_ALIGN(len)	(((len) + NFCT_INFOMASK) & ~NFCT_INFOMASK)

/* Released via nf_ct_destroy() */
struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
				 const struct nf_conntrack_zone *zone,
				 gfp_t flags)
{
	struct nf_conn *tmpl, *p;

	if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK) {
		tmpl = kzalloc(sizeof(*tmpl) + NFCT_INFOMASK, flags);
		if (!tmpl)
			return NULL;

		p = tmpl;
		tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
		if (tmpl != p)
			tmpl->proto.tmpl_padto = (char *)tmpl - (char *)p;
	} else {
		tmpl = kzalloc(sizeof(*tmpl), flags);
		if (!tmpl)
			return NULL;
	}

	tmpl->status = IPS_TEMPLATE;
	write_pnet(&tmpl->ct_net, net);
	nf_ct_zone_add(tmpl, zone);
	refcount_set(&tmpl->ct_general.use, 1);

	return tmpl;
}
EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc);

void nf_ct_tmpl_free(struct nf_conn *tmpl)
{
	kfree(tmpl->ext);

	if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK)
		kfree((char *)tmpl - tmpl->proto.tmpl_padto);
	else
		kfree(tmpl);
}
EXPORT_SYMBOL_GPL(nf_ct_tmpl_free);

static void destroy_gre_conntrack(struct nf_conn *ct)
{
#ifdef CONFIG_NF_CT_PROTO_GRE
	struct nf_conn *master = ct->master;

	if (master)
		nf_ct_gre_keymap_destroy(master);
#endif
}

void nf_ct_destroy(struct nf_conntrack *nfct)
{
	struct nf_conn *ct = (struct nf_conn *)nfct;

	WARN_ON(refcount_read(&nfct->use) != 0);

	if (unlikely(nf_ct_is_template(ct))) {
		nf_ct_tmpl_free(ct);
		return;
	}

	if (unlikely(nf_ct_protonum(ct) == IPPROTO_GRE))
		destroy_gre_conntrack(ct);

	/* Expectations will have been removed in clean_from_lists,
	 * except TFTP can create an expectation on the first packet,
	 * before connection is in the list, so we need to clean here,
	 * too.
	 */
	nf_ct_remove_expectations(ct);

	if (ct->master)
		nf_ct_put(ct->master);

	nf_conntrack_free(ct);
}
EXPORT_SYMBOL(nf_ct_destroy);

static void __nf_ct_delete_from_lists(struct nf_conn *ct)
{
	struct net *net = nf_ct_net(ct);
	unsigned int hash, reply_hash;
	unsigned int sequence;

	do {
		sequence = read_seqcount_begin(&nf_conntrack_generation);
		hash = hash_conntrack(net,
				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				      nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL));
		reply_hash = hash_conntrack(net,
					    &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
					    nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));

	clean_from_lists(ct);
	nf_conntrack_double_unlock(hash, reply_hash);
}

static void nf_ct_delete_from_lists(struct nf_conn *ct)
{
	nf_ct_helper_destroy(ct);
	local_bh_disable();

	__nf_ct_delete_from_lists(ct);

	local_bh_enable();
}

static void nf_ct_add_to_ecache_list(struct nf_conn *ct)
{
#ifdef CONFIG_NF_CONNTRACK_EVENTS
	struct nf_conntrack_net *cnet = nf_ct_pernet(nf_ct_net(ct));

	spin_lock(&cnet->ecache.dying_lock);
	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
				 &cnet->ecache.dying_list);
	spin_unlock(&cnet->ecache.dying_lock);
#endif
}

bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report)
{
	struct nf_conn_tstamp *tstamp;
	struct net *net;

	if (test_and_set_bit(IPS_DYING_BIT, &ct->status))
		return false;

	tstamp = nf_conn_tstamp_find(ct);
	if (tstamp) {
		s32 timeout = READ_ONCE(ct->timeout) - nfct_time_stamp;

		tstamp->stop = ktime_get_real_ns();
		if (timeout < 0)
			tstamp->stop -= jiffies_to_nsecs(-timeout);
	}

	if (nf_conntrack_event_report(IPCT_DESTROY, ct,
				      portid, report) < 0) {
		/* destroy event was not delivered. nf_ct_put will
		 * be done by event cache worker on redelivery.
		 */
		nf_ct_helper_destroy(ct);
		local_bh_disable();
		__nf_ct_delete_from_lists(ct);
		nf_ct_add_to_ecache_list(ct);
		local_bh_enable();

		nf_conntrack_ecache_work(nf_ct_net(ct), NFCT_ECACHE_DESTROY_FAIL);
		return false;
	}

	net = nf_ct_net(ct);
	if (nf_conntrack_ecache_dwork_pending(net))
		nf_conntrack_ecache_work(net, NFCT_ECACHE_DESTROY_SENT);
	nf_ct_delete_from_lists(ct);
	nf_ct_put(ct);
	return true;
}
EXPORT_SYMBOL_GPL(nf_ct_delete);

static inline bool
nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
		const struct nf_conntrack_tuple *tuple,
		const struct nf_conntrack_zone *zone,
		const struct net *net)
{
	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);

	/* A conntrack can be recreated with an equal tuple,
	 * so we need to check that the conntrack is confirmed
	 */
	return nf_ct_tuple_equal(tuple, &h->tuple) &&
	       nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h)) &&
	       nf_ct_is_confirmed(ct) &&
	       net_eq(net, nf_ct_net(ct));
}

static inline bool
nf_ct_match(const struct nf_conn *ct1, const struct nf_conn *ct2)
{
	return nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				 &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple) &&
	       nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple,
				 &ct2->tuplehash[IP_CT_DIR_REPLY].tuple) &&
	       nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL) &&
	       nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_REPLY) &&
	       net_eq(nf_ct_net(ct1), nf_ct_net(ct2));
}

/* caller must hold rcu readlock and none of the nf_conntrack_locks */
static void nf_ct_gc_expired(struct nf_conn *ct)
{
	if (!refcount_inc_not_zero(&ct->ct_general.use))
		return;

	/* load ->status after refcount increase */
	smp_acquire__after_ctrl_dep();

	if (nf_ct_should_gc(ct))
		nf_ct_kill(ct);

	nf_ct_put(ct);
}

/*
 * Warning :
 * - Caller must take a reference on returned object
 *   and recheck nf_ct_tuple_equal(tuple, &h->tuple)
 */
static struct nf_conntrack_tuple_hash *
____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone,
		      const struct nf_conntrack_tuple *tuple, u32 hash)
{
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_head *ct_hash;
	struct hlist_nulls_node *n;
	unsigned int bucket, hsize;

begin:
	nf_conntrack_get_ht(&ct_hash, &hsize);
	bucket = reciprocal_scale(hash, hsize);

	hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) {
		struct nf_conn *ct;

		ct = nf_ct_tuplehash_to_ctrack(h);
		if (nf_ct_is_expired(ct)) {
			nf_ct_gc_expired(ct);
			continue;
		}

		if (nf_ct_key_equal(h, tuple, zone, net))
			return h;
	}
	/*
	 * if the nulls value we got at the end of this lookup is
	 * not the expected one, we must restart lookup.
	 * We probably met an item that was moved to another chain.
	 */
	if (get_nulls_value(n) != bucket) {
		NF_CT_STAT_INC_ATOMIC(net, search_restart);
		goto begin;
	}

	return NULL;
}

/* Find a connection corresponding to a tuple. */
static struct nf_conntrack_tuple_hash *
__nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
			const struct nf_conntrack_tuple *tuple, u32 hash)
{
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;

	h = ____nf_conntrack_find(net, zone, tuple, hash);
	if (h) {
		/* We have a candidate that matches the tuple we're interested
		 * in, try to obtain a reference and re-check tuple
		 */
		ct = nf_ct_tuplehash_to_ctrack(h);
		if (likely(refcount_inc_not_zero(&ct->ct_general.use))) {
			/* re-check key after refcount */
			smp_acquire__after_ctrl_dep();

			if (likely(nf_ct_key_equal(h, tuple, zone, net)))
				return h;

			/* TYPESAFE_BY_RCU recycled the candidate */
			nf_ct_put(ct);
		}

		h = NULL;
	}

	return h;
}

struct nf_conntrack_tuple_hash *
nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
		      const struct nf_conntrack_tuple *tuple)
{
	unsigned int rid, zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL);
	struct nf_conntrack_tuple_hash *thash;

	rcu_read_lock();

	thash = __nf_conntrack_find_get(net, zone, tuple,
					hash_conntrack_raw(tuple, zone_id, net));

	if (thash)
		goto out_unlock;

	rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY);
	if (rid != zone_id)
		thash = __nf_conntrack_find_get(net, zone, tuple,
						hash_conntrack_raw(tuple, rid, net));

out_unlock:
	rcu_read_unlock();
	return thash;
}
EXPORT_SYMBOL_GPL(nf_conntrack_find_get);

static void __nf_conntrack_hash_insert(struct nf_conn *ct,
				       unsigned int hash,
				       unsigned int reply_hash)
{
	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
				 &nf_conntrack_hash[hash]);
	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
				 &nf_conntrack_hash[reply_hash]);
}

static bool nf_ct_ext_valid_pre(const struct nf_ct_ext *ext)
{
	/* if ext->gen_id is not equal to nf_conntrack_ext_genid, some extensions
	 * may contain stale pointers to e.g. helper that has been removed.
	 *
	 * The helper can't clear this because the nf_conn object isn't in
	 * any hash and synchronize_rcu() isn't enough because associated skb
	 * might sit in a queue.
	 */
	return !ext || ext->gen_id == atomic_read(&nf_conntrack_ext_genid);
}

static bool nf_ct_ext_valid_post(struct nf_ct_ext *ext)
{
	if (!ext)
		return true;

	if (ext->gen_id != atomic_read(&nf_conntrack_ext_genid))
		return false;

	/* inserted into conntrack table, nf_ct_iterate_cleanup()
	 * will find it.  Disable nf_ct_ext_find() id check.
	 */
	WRITE_ONCE(ext->gen_id, 0);
	return true;
}
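
/* Insert @ct into the hash table unless an entry with a clashing tuple
 * already exists (checked in both directions).  Returns 0 on success,
 * -EEXIST on a tuple clash, -ENOSPC if a hash chain grew too long and
 * -EAGAIN if the extension area went stale underneath us.
 */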
int
nf_conntrack_hash_check_insert(struct nf_conn *ct)
{
	const struct nf_conntrack_zone *zone;
	struct net *net = nf_ct_net(ct);
	unsigned int hash, reply_hash;
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	unsigned int max_chainlen;
	unsigned int chainlen = 0;
	unsigned int sequence;
	int err = -EEXIST;

	zone = nf_ct_zone(ct);

	if (!nf_ct_ext_valid_pre(ct->ext))
		return -EAGAIN;

	local_bh_disable();
	do {
		sequence = read_seqcount_begin(&nf_conntrack_generation);
		hash = hash_conntrack(net,
				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				      nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL));
		reply_hash = hash_conntrack(net,
					    &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
					    nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));

	max_chainlen = MIN_CHAINLEN + get_random_u32_below(MAX_CHAINLEN);

	/* See if there's one in the list already, including reverse */
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) {
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				    zone, net))
			goto out;

		if (chainlen++ > max_chainlen)
			goto chaintoolong;
	}

	chainlen = 0;

	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) {
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
				    zone, net))
			goto out;
		if (chainlen++ > max_chainlen)
			goto chaintoolong;
	}

	/* If genid has changed, we can't insert anymore because ct
	 * extensions could have stale pointers and nf_ct_iterate_destroy
	 * might have completed its table scan already.
	 *
	 * Increment of the ext genid right after this check is fine:
	 * nf_ct_iterate_destroy blocks until locks are released.
	 */
	if (!nf_ct_ext_valid_post(ct->ext)) {
		err = -EAGAIN;
		goto out;
	}

	smp_wmb();
	/* The caller holds a reference to this object */
	refcount_set(&ct->ct_general.use, 2);
	__nf_conntrack_hash_insert(ct, hash, reply_hash);
	nf_conntrack_double_unlock(hash, reply_hash);
	NF_CT_STAT_INC(net, insert);
	local_bh_enable();

	return 0;
chaintoolong:
	NF_CT_STAT_INC(net, chaintoolong);
	err = -ENOSPC;
out:
	nf_conntrack_double_unlock(hash, reply_hash);
	local_bh_enable();
	return err;
}
EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);

void nf_ct_acct_add(struct nf_conn *ct, u32 dir, unsigned int packets,
		    unsigned int bytes)
{
	struct nf_conn_acct *acct;

	acct = nf_conn_acct_find(ct);
	if (acct) {
		struct nf_conn_counter *counter = acct->counter;

		atomic64_add(packets, &counter[dir].packets);
		atomic64_add(bytes, &counter[dir].bytes);
	}
}
EXPORT_SYMBOL_GPL(nf_ct_acct_add);

static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
			     const struct nf_conn *loser_ct)
{
	struct nf_conn_acct *acct;

	acct = nf_conn_acct_find(loser_ct);
	if (acct) {
		struct nf_conn_counter *counter = acct->counter;
		unsigned int bytes;

		/* u32 should be fine since we must have seen one packet. */
		bytes = atomic64_read(&counter[CTINFO2DIR(ctinfo)].bytes);
		nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), bytes);
	}
}
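
/* Take the hash table's reference on @ct and, if timestamping is enabled,
 * record the flow start time just before the entry is linked into the table.
 */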
static void __nf_conntrack_insert_prepare(struct nf_conn *ct)
{
	struct nf_conn_tstamp *tstamp;

	refcount_inc(&ct->ct_general.use);

	/* set conntrack timestamp, if enabled. */
	tstamp = nf_conn_tstamp_find(ct);
	if (tstamp)
		tstamp->start = ktime_get_real_ns();
}

/**
 * nf_ct_match_reverse - check if ct1 and ct2 refer to identical flow
 * @ct1: conntrack in hash table to check against
 * @ct2: merge candidate
 *
 * returns true if ct1 and ct2 happen to refer to the same flow, but
 * in opposing directions, i.e.
 * ct1: a:b -> c:d
 * ct2: c:d -> a:b
 * for both directions.  If so, @ct2 should not have been created
 * as the skb should have been picked up as ESTABLISHED flow.
 * But ct1 was not yet committed to hash table before skb that created
 * ct2 had arrived.
 *
 * Note we don't compare netns because ct entries in different net
 * namespace cannot clash to begin with.
 *
 * @return: true if ct1 and ct2 are identical when swapping origin/reply.
 */
static bool
nf_ct_match_reverse(const struct nf_conn *ct1, const struct nf_conn *ct2)
{
	u16 id1, id2;

	if (!nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
			       &ct2->tuplehash[IP_CT_DIR_REPLY].tuple))
		return false;

	if (!nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple,
			       &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple))
		return false;

	id1 = nf_ct_zone_id(nf_ct_zone(ct1), IP_CT_DIR_ORIGINAL);
	id2 = nf_ct_zone_id(nf_ct_zone(ct2), IP_CT_DIR_REPLY);
	if (id1 != id2)
		return false;

	id1 = nf_ct_zone_id(nf_ct_zone(ct1), IP_CT_DIR_REPLY);
	id2 = nf_ct_zone_id(nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL);

	return id1 == id2;
}

static int nf_ct_can_merge(const struct nf_conn *ct,
			   const struct nf_conn *loser_ct)
{
	return nf_ct_match(ct, loser_ct) ||
	       nf_ct_match_reverse(ct, loser_ct);
}

/* caller must hold locks to prevent concurrent changes */
static int __nf_ct_resolve_clash(struct sk_buff *skb,
				 struct nf_conntrack_tuple_hash *h)
{
	/* This is the conntrack entry already in hashes that won race. */
	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
	enum ip_conntrack_info ctinfo;
	struct nf_conn *loser_ct;

	loser_ct = nf_ct_get(skb, &ctinfo);

	if (nf_ct_can_merge(ct, loser_ct)) {
		struct net *net = nf_ct_net(ct);

		nf_conntrack_get(&ct->ct_general);

		nf_ct_acct_merge(ct, ctinfo, loser_ct);
		nf_ct_put(loser_ct);
		nf_ct_set(skb, ct, ctinfo);

		NF_CT_STAT_INC(net, clash_resolve);
		return NF_ACCEPT;
	}

	return NF_DROP;
}

/**
 * nf_ct_resolve_clash_harder - attempt to insert clashing conntrack entry
 *
 * @skb: skb that causes the collision
 * @repl_idx: hash slot for reply direction
 *
 * Called when origin or reply direction had a clash.
 * The skb can be handled without packet drop provided the reply direction
 * is unique or the existing entry has an identical tuple in both
 * directions.
 *
 * Caller must hold conntrack table locks to prevent concurrent updates.
 *
 * Returns NF_DROP if the clash could not be handled.
 */
static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx)
{
	struct nf_conn *loser_ct = (struct nf_conn *)skb_nfct(skb);
	const struct nf_conntrack_zone *zone;
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	struct net *net;

	zone = nf_ct_zone(loser_ct);
	net = nf_ct_net(loser_ct);

	/* Reply direction must never result in a clash, unless both origin
	 * and reply tuples are identical.
	 */
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[repl_idx], hnnode) {
		if (nf_ct_key_equal(h,
				    &loser_ct->tuplehash[IP_CT_DIR_REPLY].tuple,
				    zone, net))
			return __nf_ct_resolve_clash(skb, h);
	}

	/* We want the clashing entry to go away real soon: 1 second timeout. */
	WRITE_ONCE(loser_ct->timeout, nfct_time_stamp + HZ);

	/* IPS_NAT_CLASH removes the entry automatically on the first
	 * reply.  Also prevents UDP tracker from moving the entry to
	 * ASSURED state, i.e. the entry can always be evicted under
	 * pressure.
	 */
	loser_ct->status |= IPS_FIXED_TIMEOUT | IPS_NAT_CLASH;

	__nf_conntrack_insert_prepare(loser_ct);

	/* fake add for ORIGINAL dir: we want lookups to only find the entry
	 * already in the table.  This also hides the clashing entry from
	 * ctnetlink iteration, i.e. conntrack -L won't show them.
	 */
	hlist_nulls_add_fake(&loser_ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);

	hlist_nulls_add_head_rcu(&loser_ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
				 &nf_conntrack_hash[repl_idx]);

	NF_CT_STAT_INC(net, clash_resolve);
	return NF_ACCEPT;
}

/**
 * nf_ct_resolve_clash - attempt to handle clash without packet drop
 *
 * @skb: skb that causes the clash
 * @h: tuplehash of the clashing entry already in table
 * @reply_hash: hash slot for reply direction
 *
 * A conntrack entry can be inserted to the connection tracking table
 * if there is no existing entry with an identical tuple.
 *
 * If there is one, @skb (and the associated, unconfirmed conntrack) has
 * to be dropped.  In case @skb is retransmitted, next conntrack lookup
 * will find the already-existing entry.
 *
 * The major problem with such packet drop is the extra delay added by
 * the packet loss -- it will take some time for a retransmit to occur
 * (or the sender to time out when waiting for a reply).
 *
 * This function attempts to handle the situation without packet drop.
 *
 * If @skb has no NAT transformation or if the colliding entries are
 * exactly the same, only the to-be-confirmed conntrack entry is discarded
 * and @skb is associated with the conntrack entry already in the table.
 *
 * Failing that, the new, unconfirmed conntrack is still added to the table
 * provided that the collision only occurs in the ORIGINAL direction.
 * The new entry will be added only in the non-clashing REPLY direction,
 * so packets in the ORIGINAL direction will continue to match the existing
 * entry.  The new entry will also have a fixed timeout so it expires --
 * due to the collision, it will only see reply traffic.
 *
 * Returns NF_DROP if the clash could not be resolved.
 */
static __cold noinline int
nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h,
		    u32 reply_hash)
{
	/* This is the conntrack entry already in hashes that won race. */
	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
	const struct nf_conntrack_l4proto *l4proto;
	enum ip_conntrack_info ctinfo;
	struct nf_conn *loser_ct;
	struct net *net;
	int ret;

	loser_ct = nf_ct_get(skb, &ctinfo);
	net = nf_ct_net(loser_ct);

	l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct));
	if (!l4proto->allow_clash)
		goto drop;

	ret = __nf_ct_resolve_clash(skb, h);
	if (ret == NF_ACCEPT)
		return ret;

	ret = nf_ct_resolve_clash_harder(skb, reply_hash);
	if (ret == NF_ACCEPT)
		return ret;

drop:
	NF_CT_STAT_INC(net, drop);
	NF_CT_STAT_INC(net, insert_failed);
	return NF_DROP;
}

/* Confirm a connection given skb; places it in hash table */
int
__nf_conntrack_confirm(struct sk_buff *skb)
{
	unsigned int chainlen = 0, sequence, max_chainlen;
	const struct nf_conntrack_zone *zone;
	unsigned int hash, reply_hash;
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;
	struct nf_conn_help *help;
	struct hlist_nulls_node *n;
	enum ip_conntrack_info ctinfo;
	struct net *net;
	int ret = NF_DROP;

	ct = nf_ct_get(skb, &ctinfo);
	net = nf_ct_net(ct);

	/* ipt_REJECT uses nf_conntrack_attach to attach related
	   ICMP/TCP RST packets in the other direction.  Actual packet
	   which created connection will be IP_CT_NEW or for an
	   expected connection, IP_CT_RELATED. */
	if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
		return NF_ACCEPT;

	zone = nf_ct_zone(ct);
	local_bh_disable();

	do {
		sequence = read_seqcount_begin(&nf_conntrack_generation);
		/* reuse the hash saved before */
		hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
		hash = scale_hash(hash);
		reply_hash = hash_conntrack(net,
					    &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
					    nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));

	/* We're not in hash table, and we refuse to set up related
	 * connections for unconfirmed conns.  But packet copies and
	 * REJECT will give spurious warnings here.
	 */

	/* Another skb with the same unconfirmed conntrack may
	 * win the race.  This may happen for bridge (br_flood) or
	 * broadcast/multicast packets that do skb_clone with an
	 * unconfirmed conntrack.
	 */
	if (unlikely(nf_ct_is_confirmed(ct))) {
		WARN_ON_ONCE(1);
		nf_conntrack_double_unlock(hash, reply_hash);
		local_bh_enable();
		return NF_DROP;
	}

	if (!nf_ct_ext_valid_pre(ct->ext)) {
		NF_CT_STAT_INC(net, insert_failed);
		goto dying;
	}

	/* We have to check the DYING flag after unlink to prevent
	 * a race against nf_ct_get_next_corpse() possibly called from
	 * user context, else we insert an already 'dead' hash, blocking
	 * further use of that particular connection -JM.
	 */
	ct->status |= IPS_CONFIRMED;

	if (unlikely(nf_ct_is_dying(ct))) {
		NF_CT_STAT_INC(net, insert_failed);
		goto dying;
	}

	max_chainlen = MIN_CHAINLEN + get_random_u32_below(MAX_CHAINLEN);
	/* See if there's one in the list already, including reverse:
	   NAT could have grabbed it without realizing, since we're
	   not in the hash.  If there is, we lost race. */
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) {
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				    zone, net))
			goto out;
		if (chainlen++ > max_chainlen)
			goto chaintoolong;
	}

	chainlen = 0;
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) {
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
				    zone, net))
			goto out;
		if (chainlen++ > max_chainlen) {
chaintoolong:
			NF_CT_STAT_INC(net, chaintoolong);
			NF_CT_STAT_INC(net, insert_failed);
			ret = NF_DROP;
			goto dying;
		}
	}

	/* Timer relative to confirmation time, not original
	   setting time, otherwise we'd get timer wrap in
	   weird delay cases. */
	ct->timeout += nfct_time_stamp;

	__nf_conntrack_insert_prepare(ct);

	/* Since the lookup is lockless, hash insertion must be done after
	 * starting the timer and setting the CONFIRMED bit. The RCU barriers
	 * guarantee that no other CPU can find the conntrack before the above
	 * stores are visible.
	 */
	__nf_conntrack_hash_insert(ct, hash, reply_hash);
	nf_conntrack_double_unlock(hash, reply_hash);
	local_bh_enable();

	/* ext area is still valid (rcu read lock is held), but it will go
	 * out of scope soon; we need to remove this conntrack again.
	 */
	if (!nf_ct_ext_valid_post(ct->ext)) {
		nf_ct_kill(ct);
		NF_CT_STAT_INC_ATOMIC(net, drop);
		return NF_DROP;
	}

	help = nfct_help(ct);
	if (help && help->helper)
		nf_conntrack_event_cache(IPCT_HELPER, ct);

	nf_conntrack_event_cache(master_ct(ct) ?
				 IPCT_RELATED : IPCT_NEW, ct);
	return NF_ACCEPT;

out:
	ret = nf_ct_resolve_clash(skb, h, reply_hash);
dying:
	nf_conntrack_double_unlock(hash, reply_hash);
	local_bh_enable();
	return ret;
}
EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);

/* Returns true if a connection corresponds to the tuple (required
   for NAT). */
int
nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
			 const struct nf_conn *ignored_conntrack)
{
	struct net *net = nf_ct_net(ignored_conntrack);
	const struct nf_conntrack_zone *zone;
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_head *ct_hash;
	unsigned int hash, hsize;
	struct hlist_nulls_node *n;
	struct nf_conn *ct;

	zone = nf_ct_zone(ignored_conntrack);

	rcu_read_lock();
begin:
	nf_conntrack_get_ht(&ct_hash, &hsize);
	hash = __hash_conntrack(net, tuple, nf_ct_zone_id(zone, IP_CT_DIR_REPLY), hsize);

	hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) {
		ct = nf_ct_tuplehash_to_ctrack(h);

		if (ct == ignored_conntrack)
			continue;

		if (nf_ct_is_expired(ct)) {
			nf_ct_gc_expired(ct);
			continue;
		}

		if (nf_ct_key_equal(h, tuple, zone, net)) {
			/* Tuple is taken already, so caller will need to find
			 * a new source port to use.
			 *
			 * Only exception:
			 * If the *original tuples* are identical, then both
			 * conntracks refer to the same flow.
			 * This is a rare situation, it can occur e.g. when
			 * more than one UDP packet is sent from same socket
			 * in different threads.
			 *
			 * Let nf_ct_resolve_clash() deal with this later.
			 */
			if (nf_ct_tuple_equal(&ignored_conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
					      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple) &&
			    nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL))
				continue;

			NF_CT_STAT_INC_ATOMIC(net, found);
			rcu_read_unlock();
			return 1;
		}
	}

	if (get_nulls_value(n) != hash) {
		NF_CT_STAT_INC_ATOMIC(net, search_restart);
		goto begin;
	}

	rcu_read_unlock();

	return 0;
}
EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);

#define NF_CT_EVICTION_RANGE	8

/* There's a small race here where we may free a just-assured
   connection.  Too bad: we're in trouble anyway. */
static unsigned int early_drop_list(struct net *net,
				    struct hlist_nulls_head *head)
{
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	unsigned int drops = 0;
	struct nf_conn *tmp;

	hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) {
		tmp = nf_ct_tuplehash_to_ctrack(h);

		if (nf_ct_is_expired(tmp)) {
			nf_ct_gc_expired(tmp);
			continue;
		}

		if (test_bit(IPS_ASSURED_BIT, &tmp->status) ||
		    !net_eq(nf_ct_net(tmp), net) ||
		    nf_ct_is_dying(tmp))
			continue;

		if (!refcount_inc_not_zero(&tmp->ct_general.use))
			continue;

		/* load ->ct_net and ->status after refcount increase */
		smp_acquire__after_ctrl_dep();

		/* kill only if still in same netns -- might have moved due to
		 * SLAB_TYPESAFE_BY_RCU rules.
		 *
		 * We steal the timer reference.  If that fails timer has
		 * already fired or someone else deleted it. Just drop ref
		 * and move to next entry.
		 */
		if (net_eq(nf_ct_net(tmp), net) &&
		    nf_ct_is_confirmed(tmp) &&
		    nf_ct_delete(tmp, 0, 0))
			drops++;

		nf_ct_put(tmp);
	}

	return drops;
}

static noinline int early_drop(struct net *net, unsigned int hash)
{
	unsigned int i, bucket;

	for (i = 0; i < NF_CT_EVICTION_RANGE; i++) {
		struct hlist_nulls_head *ct_hash;
		unsigned int hsize, drops;

		rcu_read_lock();
		nf_conntrack_get_ht(&ct_hash, &hsize);
		if (!i)
			bucket = reciprocal_scale(hash, hsize);
		else
			bucket = (bucket + 1) % hsize;

		drops = early_drop_list(net, &ct_hash[bucket]);
		rcu_read_unlock();

		if (drops) {
			NF_CT_STAT_ADD_ATOMIC(net, early_drop, drops);
			return true;
		}
	}

	return false;
}

static bool gc_worker_skip_ct(const struct nf_conn *ct)
{
	return !nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct);
}

static bool gc_worker_can_early_drop(const struct nf_conn *ct)
{
	const struct nf_conntrack_l4proto *l4proto;
	u8 protonum = nf_ct_protonum(ct);

	if (!test_bit(IPS_ASSURED_BIT, &ct->status))
		return true;

	l4proto = nf_ct_l4proto_find(protonum);
	if (l4proto->can_early_drop && l4proto->can_early_drop(ct))
		return true;

	return false;
}

static void gc_worker(struct work_struct *work)
{
	unsigned int i, hashsz, nf_conntrack_max95 = 0;
	u32 end_time, start_time = nfct_time_stamp;
	struct conntrack_gc_work *gc_work;
	unsigned int expired_count = 0;
	unsigned long next_run;
	s32 delta_time;
	long count;

	gc_work = container_of(work, struct conntrack_gc_work, dwork.work);

	i = gc_work->next_bucket;
	if (gc_work->early_drop)
		nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;

	if (i == 0) {
		gc_work->avg_timeout = GC_SCAN_INTERVAL_INIT;
		gc_work->count = GC_SCAN_INITIAL_COUNT;
		gc_work->start_time = start_time;
	}

	next_run = gc_work->avg_timeout;
	count = gc_work->count;

	end_time = start_time + GC_SCAN_MAX_DURATION;

	do {
		struct nf_conntrack_tuple_hash *h;
		struct hlist_nulls_head *ct_hash;
		struct hlist_nulls_node *n;
		struct nf_conn *tmp;

		rcu_read_lock();

		nf_conntrack_get_ht(&ct_hash, &hashsz);
		if (i >= hashsz) {
			rcu_read_unlock();
			break;
		}

		hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
			struct nf_conntrack_net *cnet;
			struct net *net;
			long expires;

			tmp = nf_ct_tuplehash_to_ctrack(h);

			if (expired_count > GC_SCAN_EXPIRED_MAX) {
				rcu_read_unlock();

				gc_work->next_bucket = i;
				gc_work->avg_timeout = next_run;
				gc_work->count = count;

				delta_time = nfct_time_stamp - gc_work->start_time;

				/* re-sched immediately if total cycle time is exceeded */
				next_run = delta_time < (s32)GC_SCAN_INTERVAL_MAX;
				goto early_exit;
			}

			if (nf_ct_is_expired(tmp)) {
				nf_ct_gc_expired(tmp);
				expired_count++;
				continue;
			}

			expires = clamp(nf_ct_expires(tmp), GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_CLAMP);
			expires = (expires - (long)next_run) / ++count;
			next_run += expires;

			if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp))
				continue;

			net = nf_ct_net(tmp);
			cnet = nf_ct_pernet(net);
			if (atomic_read(&cnet->count) < nf_conntrack_max95)
				continue;

			/* need to take reference to avoid possible races */
			if (!refcount_inc_not_zero(&tmp->ct_general.use))
				continue;

			/* load ->status after refcount increase */
			smp_acquire__after_ctrl_dep();

			if (gc_worker_skip_ct(tmp)) {
				nf_ct_put(tmp);
				continue;
			}

			if (gc_worker_can_early_drop(tmp)) {
				nf_ct_kill(tmp);
				expired_count++;
			}

			nf_ct_put(tmp);
		}

		/* could check get_nulls_value() here and restart if ct
		 * was moved to another chain.  But given gc is best-effort
		 * we will just continue with next hash slot.
		 */
		rcu_read_unlock();
		cond_resched();
		i++;

		delta_time = nfct_time_stamp - end_time;
		if (delta_time > 0 && i < hashsz) {
			gc_work->avg_timeout = next_run;
			gc_work->count = count;
			gc_work->next_bucket = i;
			next_run = 0;
			goto early_exit;
		}
	} while (i < hashsz);

	gc_work->next_bucket = 0;

	next_run = clamp(next_run, GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_MAX);

	delta_time = max_t(s32, nfct_time_stamp - gc_work->start_time, 1);
	if (next_run > (unsigned long)delta_time)
		next_run -= delta_time;
	else
		next_run = 1;

early_exit:
	if (gc_work->exiting)
		return;

	if (next_run)
		gc_work->early_drop = false;

	queue_delayed_work(system_power_efficient_wq, &gc_work->dwork, next_run);
}

static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work)
{
	INIT_DELAYED_WORK(&gc_work->dwork, gc_worker);
	gc_work->exiting = false;
}

static struct nf_conn *
__nf_conntrack_alloc(struct net *net,
		     const struct nf_conntrack_zone *zone,
		     const struct nf_conntrack_tuple *orig,
		     const struct nf_conntrack_tuple *repl,
		     gfp_t gfp, u32 hash)
{
	struct nf_conntrack_net *cnet = nf_ct_pernet(net);
	unsigned int ct_count;
	struct nf_conn *ct;

	/* We don't want any race condition at early drop stage */
	ct_count = atomic_inc_return(&cnet->count);

	if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) {
		if (!early_drop(net, hash)) {
			if (!conntrack_gc_work.early_drop)
				conntrack_gc_work.early_drop = true;
			atomic_dec(&cnet->count);
			net_warn_ratelimited("nf_conntrack: table full, dropping packet\n");
			return ERR_PTR(-ENOMEM);
		}
	}

	/*
	 * Do not use kmem_cache_zalloc(), as this cache uses
	 * SLAB_TYPESAFE_BY_RCU.
	 */
	ct = kmem_cache_alloc(nf_conntrack_cachep, gfp);
	if (ct == NULL)
		goto out;

	spin_lock_init(&ct->lock);
	ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
	ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
	ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
	/* save hash for reusing when confirming */
	*(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash;
	ct->status = 0;
	WRITE_ONCE(ct->timeout, 0);
	write_pnet(&ct->ct_net, net);
	memset_after(ct, 0, __nfct_init_offset);

	nf_ct_zone_add(ct, zone);

	/* Because we use RCU lookups, we set ct_general.use to zero before
	 * this is inserted in any list.
	 */
	refcount_set(&ct->ct_general.use, 0);
	return ct;
out:
	atomic_dec(&cnet->count);
	return ERR_PTR(-ENOMEM);
}

struct nf_conn *nf_conntrack_alloc(struct net *net,
				   const struct nf_conntrack_zone *zone,
				   const struct nf_conntrack_tuple *orig,
				   const struct nf_conntrack_tuple *repl,
				   gfp_t gfp)
{
	return __nf_conntrack_alloc(net, zone, orig, repl, gfp, 0);
}
EXPORT_SYMBOL_GPL(nf_conntrack_alloc);

void nf_conntrack_free(struct nf_conn *ct)
{
	struct net *net = nf_ct_net(ct);
	struct nf_conntrack_net *cnet;

	/* A freed object has refcnt == 0, that's
	 * the golden rule for SLAB_TYPESAFE_BY_RCU
	 */
	WARN_ON(refcount_read(&ct->ct_general.use) != 0);

	if (ct->status & IPS_SRC_NAT_DONE) {
		const struct nf_nat_hook *nat_hook;

		rcu_read_lock();
		nat_hook = rcu_dereference(nf_nat_hook);
		if (nat_hook)
			nat_hook->remove_nat_bysrc(ct);
		rcu_read_unlock();
	}

	kfree(ct->ext);
	kmem_cache_free(nf_conntrack_cachep, ct);
	cnet = nf_ct_pernet(net);

	smp_mb__before_atomic();
	atomic_dec(&cnet->count);
}
EXPORT_SYMBOL_GPL(nf_conntrack_free);

/* Allocate a new conntrack: we return -ENOMEM if classification
   failed due to stress.  Otherwise it really is unclassifiable. */
static noinline struct nf_conntrack_tuple_hash *
init_conntrack(struct net *net, struct nf_conn *tmpl,
	       const struct nf_conntrack_tuple *tuple,
	       struct sk_buff *skb,
	       unsigned int dataoff, u32 hash)
{
	struct nf_conn *ct;
	struct nf_conn_help *help;
	struct nf_conntrack_tuple repl_tuple;
#ifdef CONFIG_NF_CONNTRACK_EVENTS
	struct nf_conntrack_ecache *ecache;
#endif
	struct nf_conntrack_expect *exp = NULL;
	const struct nf_conntrack_zone *zone;
	struct nf_conn_timeout *timeout_ext;
	struct nf_conntrack_zone tmp;
	struct nf_conntrack_net *cnet;

	if (!nf_ct_invert_tuple(&repl_tuple, tuple))
		return NULL;

	zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
	ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC,
				  hash);
	if (IS_ERR(ct))
		return ERR_CAST(ct);

	if (!nf_ct_add_synproxy(ct, tmpl)) {
		nf_conntrack_free(ct);
		return ERR_PTR(-ENOMEM);
	}

	timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL;

	if (timeout_ext)
		nf_ct_timeout_ext_add(ct, rcu_dereference(timeout_ext->timeout),
				      GFP_ATOMIC);

	nf_ct_acct_ext_add(ct, GFP_ATOMIC);
	nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
	nf_ct_labels_ext_add(ct);

#ifdef CONFIG_NF_CONNTRACK_EVENTS
	ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL;

	if ((ecache || net->ct.sysctl_events) &&
	    !nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0,
				  ecache ? ecache->expmask : 0,
				  GFP_ATOMIC)) {
		nf_conntrack_free(ct);
		return ERR_PTR(-ENOMEM);
	}
#endif

	cnet = nf_ct_pernet(net);
	if (cnet->expect_count) {
		spin_lock_bh(&nf_conntrack_expect_lock);
		exp = nf_ct_find_expectation(net, zone, tuple, !tmpl || nf_ct_is_confirmed(tmpl));
		if (exp) {
			/* Welcome, Mr. Bond.  We've been expecting you... */
			__set_bit(IPS_EXPECTED_BIT, &ct->status);
			/* exp->master safe, refcnt bumped in nf_ct_find_expectation */
			ct->master = exp->master;
			if (exp->helper) {
				help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
				if (help)
					rcu_assign_pointer(help->helper, exp->helper);
			}

#ifdef CONFIG_NF_CONNTRACK_MARK
			ct->mark = READ_ONCE(exp->master->mark);
#endif
#ifdef CONFIG_NF_CONNTRACK_SECMARK
			ct->secmark = exp->master->secmark;
#endif
			NF_CT_STAT_INC(net, expect_new);
		}
		spin_unlock_bh(&nf_conntrack_expect_lock);
	}
	if (!exp && tmpl)
		__nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC);

	/* Another CPU might have obtained a pointer to this object before it
	 * was released.  Because refcount is 0, refcount_inc_not_zero() will fail.
	 *
	 * After refcount_set(1) it will succeed; ensure that zeroing of
	 * ct->status and the correct ct->net pointer are visible; else other
	 * core might observe CONFIRMED bit which means the entry is valid and
	 * in the hash table, but it's not (anymore).
	 */
	smp_wmb();

	/* Now it is going to be associated with an sk_buff, set refcount to 1. */
	refcount_set(&ct->ct_general.use, 1);

	if (exp) {
		if (exp->expectfn)
			exp->expectfn(ct, exp);
		nf_ct_expect_put(exp);
	}

	return &ct->tuplehash[IP_CT_DIR_ORIGINAL];
}

/* On success, returns 0, sets skb->_nfct | ctinfo */
static int
resolve_normal_ct(struct nf_conn *tmpl,
		  struct sk_buff *skb,
		  unsigned int dataoff,
		  u_int8_t protonum,
		  const struct nf_hook_state *state)
{
	const struct nf_conntrack_zone *zone;
	struct nf_conntrack_tuple tuple;
	struct nf_conntrack_tuple_hash *h;
	enum ip_conntrack_info ctinfo;
	struct nf_conntrack_zone tmp;
	u32 hash, zone_id, rid;
	struct nf_conn *ct;

	if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
			     dataoff, state->pf, protonum, state->net,
			     &tuple))
		return 0;

	/* look for tuple match */
	zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);

	zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL);
	hash = hash_conntrack_raw(&tuple, zone_id, state->net);
	h = __nf_conntrack_find_get(state->net, zone, &tuple, hash);

	if (!h) {
		rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY);
		if (zone_id != rid) {
			u32 tmp = hash_conntrack_raw(&tuple, rid, state->net);

			h = __nf_conntrack_find_get(state->net, zone, &tuple, tmp);
		}
	}

	if (!h) {
		h = init_conntrack(state->net, tmpl, &tuple,
				   skb, dataoff, hash);
		if (!h)
			return 0;
		if (IS_ERR(h))
			return PTR_ERR(h);
	}
	ct = nf_ct_tuplehash_to_ctrack(h);

	/* It exists; we have (non-exclusive) reference. */
	if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
		ctinfo = IP_CT_ESTABLISHED_REPLY;
	} else {
		unsigned long status = READ_ONCE(ct->status);

		/* Once we've had two way comms, always ESTABLISHED. */
		if (likely(status & IPS_SEEN_REPLY))
			ctinfo = IP_CT_ESTABLISHED;
		else if (status & IPS_EXPECTED)
			ctinfo = IP_CT_RELATED;
		else
			ctinfo = IP_CT_NEW;
	}
	nf_ct_set(skb, ct, ctinfo);
	return 0;
}

/*
 * icmp packets need special treatment to handle error messages that are
 * related to a connection.
1915 * 1916 * Callers need to check if skb has a conntrack assigned when this 1917 * helper returns; in such case skb belongs to an already known connection. 1918 */ 1919 static unsigned int __cold 1920 nf_conntrack_handle_icmp(struct nf_conn *tmpl, 1921 struct sk_buff *skb, 1922 unsigned int dataoff, 1923 u8 protonum, 1924 const struct nf_hook_state *state) 1925 { 1926 int ret; 1927 1928 if (state->pf == NFPROTO_IPV4 && protonum == IPPROTO_ICMP) 1929 ret = nf_conntrack_icmpv4_error(tmpl, skb, dataoff, state); 1930 #if IS_ENABLED(CONFIG_IPV6) 1931 else if (state->pf == NFPROTO_IPV6 && protonum == IPPROTO_ICMPV6) 1932 ret = nf_conntrack_icmpv6_error(tmpl, skb, dataoff, state); 1933 #endif 1934 else 1935 return NF_ACCEPT; 1936 1937 if (ret <= 0) 1938 NF_CT_STAT_INC_ATOMIC(state->net, error); 1939 1940 return ret; 1941 } 1942 1943 static int generic_packet(struct nf_conn *ct, struct sk_buff *skb, 1944 enum ip_conntrack_info ctinfo) 1945 { 1946 const unsigned int *timeout = nf_ct_timeout_lookup(ct); 1947 1948 if (!timeout) 1949 timeout = &nf_generic_pernet(nf_ct_net(ct))->timeout; 1950 1951 nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); 1952 return NF_ACCEPT; 1953 } 1954 1955 /* Returns verdict for packet, or -1 for invalid. */ 1956 static int nf_conntrack_handle_packet(struct nf_conn *ct, 1957 struct sk_buff *skb, 1958 unsigned int dataoff, 1959 enum ip_conntrack_info ctinfo, 1960 const struct nf_hook_state *state) 1961 { 1962 switch (nf_ct_protonum(ct)) { 1963 case IPPROTO_TCP: 1964 return nf_conntrack_tcp_packet(ct, skb, dataoff, 1965 ctinfo, state); 1966 case IPPROTO_UDP: 1967 return nf_conntrack_udp_packet(ct, skb, dataoff, 1968 ctinfo, state); 1969 case IPPROTO_ICMP: 1970 return nf_conntrack_icmp_packet(ct, skb, ctinfo, state); 1971 #if IS_ENABLED(CONFIG_IPV6) 1972 case IPPROTO_ICMPV6: 1973 return nf_conntrack_icmpv6_packet(ct, skb, ctinfo, state); 1974 #endif 1975 #ifdef CONFIG_NF_CT_PROTO_UDPLITE 1976 case IPPROTO_UDPLITE: 1977 return nf_conntrack_udplite_packet(ct, skb, dataoff, 1978 ctinfo, state); 1979 #endif 1980 #ifdef CONFIG_NF_CT_PROTO_SCTP 1981 case IPPROTO_SCTP: 1982 return nf_conntrack_sctp_packet(ct, skb, dataoff, 1983 ctinfo, state); 1984 #endif 1985 #ifdef CONFIG_NF_CT_PROTO_DCCP 1986 case IPPROTO_DCCP: 1987 return nf_conntrack_dccp_packet(ct, skb, dataoff, 1988 ctinfo, state); 1989 #endif 1990 #ifdef CONFIG_NF_CT_PROTO_GRE 1991 case IPPROTO_GRE: 1992 return nf_conntrack_gre_packet(ct, skb, dataoff, 1993 ctinfo, state); 1994 #endif 1995 } 1996 1997 return generic_packet(ct, skb, ctinfo); 1998 } 1999 2000 unsigned int 2001 nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state) 2002 { 2003 enum ip_conntrack_info ctinfo; 2004 struct nf_conn *ct, *tmpl; 2005 u_int8_t protonum; 2006 int dataoff, ret; 2007 2008 tmpl = nf_ct_get(skb, &ctinfo); 2009 if (tmpl || ctinfo == IP_CT_UNTRACKED) { 2010 /* Previously seen (loopback or untracked)? Ignore. 
*/ 2011 if ((tmpl && !nf_ct_is_template(tmpl)) || 2012 ctinfo == IP_CT_UNTRACKED) 2013 return NF_ACCEPT; 2014 skb->_nfct = 0; 2015 } 2016 2017 /* rcu_read_lock()ed by nf_hook_thresh */ 2018 dataoff = get_l4proto(skb, skb_network_offset(skb), state->pf, &protonum); 2019 if (dataoff <= 0) { 2020 NF_CT_STAT_INC_ATOMIC(state->net, invalid); 2021 ret = NF_ACCEPT; 2022 goto out; 2023 } 2024 2025 if (protonum == IPPROTO_ICMP || protonum == IPPROTO_ICMPV6) { 2026 ret = nf_conntrack_handle_icmp(tmpl, skb, dataoff, 2027 protonum, state); 2028 if (ret <= 0) { 2029 ret = -ret; 2030 goto out; 2031 } 2032 /* ICMP[v6] protocol trackers may assign one conntrack. */ 2033 if (skb->_nfct) 2034 goto out; 2035 } 2036 repeat: 2037 ret = resolve_normal_ct(tmpl, skb, dataoff, 2038 protonum, state); 2039 if (ret < 0) { 2040 /* Too stressed to deal. */ 2041 NF_CT_STAT_INC_ATOMIC(state->net, drop); 2042 ret = NF_DROP; 2043 goto out; 2044 } 2045 2046 ct = nf_ct_get(skb, &ctinfo); 2047 if (!ct) { 2048 /* Not valid part of a connection */ 2049 NF_CT_STAT_INC_ATOMIC(state->net, invalid); 2050 ret = NF_ACCEPT; 2051 goto out; 2052 } 2053 2054 ret = nf_conntrack_handle_packet(ct, skb, dataoff, ctinfo, state); 2055 if (ret <= 0) { 2056 /* Invalid: inverse of the return code tells 2057 * the netfilter core what to do */ 2058 nf_ct_put(ct); 2059 skb->_nfct = 0; 2060 /* Special case: TCP tracker reports an attempt to reopen a 2061 * closed/aborted connection. We have to go back and create a 2062 * fresh conntrack. 2063 */ 2064 if (ret == -NF_REPEAT) 2065 goto repeat; 2066 2067 NF_CT_STAT_INC_ATOMIC(state->net, invalid); 2068 if (ret == NF_DROP) 2069 NF_CT_STAT_INC_ATOMIC(state->net, drop); 2070 2071 ret = -ret; 2072 goto out; 2073 } 2074 2075 if (ctinfo == IP_CT_ESTABLISHED_REPLY && 2076 !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status)) 2077 nf_conntrack_event_cache(IPCT_REPLY, ct); 2078 out: 2079 if (tmpl) 2080 nf_ct_put(tmpl); 2081 2082 return ret; 2083 } 2084 EXPORT_SYMBOL_GPL(nf_conntrack_in); 2085 2086 /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */ 2087 void __nf_ct_refresh_acct(struct nf_conn *ct, 2088 enum ip_conntrack_info ctinfo, 2089 u32 extra_jiffies, 2090 unsigned int bytes) 2091 { 2092 /* Only update if this is not a fixed timeout */ 2093 if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) 2094 goto acct; 2095 2096 /* If not in hash table, timer will not be active yet */ 2097 if (nf_ct_is_confirmed(ct)) 2098 extra_jiffies += nfct_time_stamp; 2099 2100 if (READ_ONCE(ct->timeout) != extra_jiffies) 2101 WRITE_ONCE(ct->timeout, extra_jiffies); 2102 acct: 2103 if (bytes) 2104 nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), bytes); 2105 } 2106 EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct); 2107 2108 bool nf_ct_kill_acct(struct nf_conn *ct, 2109 enum ip_conntrack_info ctinfo, 2110 const struct sk_buff *skb) 2111 { 2112 nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len); 2113 2114 return nf_ct_delete(ct, 0, 0); 2115 } 2116 EXPORT_SYMBOL_GPL(nf_ct_kill_acct); 2117 2118 #if IS_ENABLED(CONFIG_NF_CT_NETLINK) 2119 2120 #include <linux/netfilter/nfnetlink.h> 2121 #include <linux/netfilter/nfnetlink_conntrack.h> 2122 #include <linux/mutex.h> 2123 2124 /* Generic function for tcp/udp/sctp/dccp and alike. 
*/ 2125 int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb, 2126 const struct nf_conntrack_tuple *tuple) 2127 { 2128 if (nla_put_be16(skb, CTA_PROTO_SRC_PORT, tuple->src.u.tcp.port) || 2129 nla_put_be16(skb, CTA_PROTO_DST_PORT, tuple->dst.u.tcp.port)) 2130 goto nla_put_failure; 2131 return 0; 2132 2133 nla_put_failure: 2134 return -1; 2135 } 2136 EXPORT_SYMBOL_GPL(nf_ct_port_tuple_to_nlattr); 2137 2138 const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = { 2139 [CTA_PROTO_SRC_PORT] = { .type = NLA_U16 }, 2140 [CTA_PROTO_DST_PORT] = { .type = NLA_U16 }, 2141 }; 2142 EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy); 2143 2144 int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[], 2145 struct nf_conntrack_tuple *t, 2146 u_int32_t flags) 2147 { 2148 if (flags & CTA_FILTER_FLAG(CTA_PROTO_SRC_PORT)) { 2149 if (!tb[CTA_PROTO_SRC_PORT]) 2150 return -EINVAL; 2151 2152 t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]); 2153 } 2154 2155 if (flags & CTA_FILTER_FLAG(CTA_PROTO_DST_PORT)) { 2156 if (!tb[CTA_PROTO_DST_PORT]) 2157 return -EINVAL; 2158 2159 t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]); 2160 } 2161 2162 return 0; 2163 } 2164 EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_to_tuple); 2165 2166 unsigned int nf_ct_port_nlattr_tuple_size(void) 2167 { 2168 static unsigned int size __read_mostly; 2169 2170 if (!size) 2171 size = nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1); 2172 2173 return size; 2174 } 2175 EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size); 2176 #endif 2177 2178 /* Used by ipt_REJECT and ip6t_REJECT. */ 2179 static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb) 2180 { 2181 struct nf_conn *ct; 2182 enum ip_conntrack_info ctinfo; 2183 2184 /* This ICMP is in reverse direction to the packet which caused it */ 2185 ct = nf_ct_get(skb, &ctinfo); 2186 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) 2187 ctinfo = IP_CT_RELATED_REPLY; 2188 else 2189 ctinfo = IP_CT_RELATED; 2190 2191 /* Attach to new skbuff, and increment count */ 2192 nf_ct_set(nskb, ct, ctinfo); 2193 nf_conntrack_get(skb_nfct(nskb)); 2194 } 2195 2196 /* This packet is coming from userspace via nf_queue, complete the packet 2197 * processing after the helper invocation in nf_confirm(). 
2198 */ 2199 static int nf_confirm_cthelper(struct sk_buff *skb, struct nf_conn *ct, 2200 enum ip_conntrack_info ctinfo) 2201 { 2202 const struct nf_conntrack_helper *helper; 2203 const struct nf_conn_help *help; 2204 int protoff; 2205 2206 help = nfct_help(ct); 2207 if (!help) 2208 return NF_ACCEPT; 2209 2210 helper = rcu_dereference(help->helper); 2211 if (!helper) 2212 return NF_ACCEPT; 2213 2214 if (!(helper->flags & NF_CT_HELPER_F_USERSPACE)) 2215 return NF_ACCEPT; 2216 2217 switch (nf_ct_l3num(ct)) { 2218 case NFPROTO_IPV4: 2219 protoff = skb_network_offset(skb) + ip_hdrlen(skb); 2220 break; 2221 #if IS_ENABLED(CONFIG_IPV6) 2222 case NFPROTO_IPV6: { 2223 __be16 frag_off; 2224 u8 pnum; 2225 2226 pnum = ipv6_hdr(skb)->nexthdr; 2227 protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, 2228 &frag_off); 2229 if (protoff < 0 || (frag_off & htons(~0x7)) != 0) 2230 return NF_ACCEPT; 2231 break; 2232 } 2233 #endif 2234 default: 2235 return NF_ACCEPT; 2236 } 2237 2238 if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && 2239 !nf_is_loopback_packet(skb)) { 2240 if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) { 2241 NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); 2242 return NF_DROP; 2243 } 2244 } 2245 2246 /* We've seen it coming out the other side: confirm it */ 2247 return nf_conntrack_confirm(skb); 2248 } 2249 2250 static int nf_conntrack_update(struct net *net, struct sk_buff *skb) 2251 { 2252 enum ip_conntrack_info ctinfo; 2253 struct nf_conn *ct; 2254 2255 ct = nf_ct_get(skb, &ctinfo); 2256 if (!ct) 2257 return NF_ACCEPT; 2258 2259 return nf_confirm_cthelper(skb, ct, ctinfo); 2260 } 2261 2262 static bool nf_conntrack_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, 2263 const struct sk_buff *skb) 2264 { 2265 const struct nf_conntrack_tuple *src_tuple; 2266 const struct nf_conntrack_tuple_hash *hash; 2267 struct nf_conntrack_tuple srctuple; 2268 enum ip_conntrack_info ctinfo; 2269 struct nf_conn *ct; 2270 2271 ct = nf_ct_get(skb, &ctinfo); 2272 if (ct) { 2273 src_tuple = nf_ct_tuple(ct, CTINFO2DIR(ctinfo)); 2274 memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple)); 2275 return true; 2276 } 2277 2278 if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), 2279 NFPROTO_IPV4, dev_net(skb->dev), 2280 &srctuple)) 2281 return false; 2282 2283 hash = nf_conntrack_find_get(dev_net(skb->dev), 2284 &nf_ct_zone_dflt, 2285 &srctuple); 2286 if (!hash) 2287 return false; 2288 2289 ct = nf_ct_tuplehash_to_ctrack(hash); 2290 src_tuple = nf_ct_tuple(ct, !hash->tuple.dst.dir); 2291 memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple)); 2292 nf_ct_put(ct); 2293 2294 return true; 2295 } 2296 2297 /* Bring out ya dead! */ 2298 static struct nf_conn * 2299 get_next_corpse(int (*iter)(struct nf_conn *i, void *data), 2300 const struct nf_ct_iter_data *iter_data, unsigned int *bucket) 2301 { 2302 struct nf_conntrack_tuple_hash *h; 2303 struct nf_conn *ct; 2304 struct hlist_nulls_node *n; 2305 spinlock_t *lockp; 2306 2307 for (; *bucket < nf_conntrack_htable_size; (*bucket)++) { 2308 struct hlist_nulls_head *hslot = &nf_conntrack_hash[*bucket]; 2309 2310 if (hlist_nulls_empty(hslot)) 2311 continue; 2312 2313 lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS]; 2314 local_bh_disable(); 2315 nf_conntrack_lock(lockp); 2316 hlist_nulls_for_each_entry(h, n, hslot, hnnode) { 2317 if (NF_CT_DIRECTION(h) != IP_CT_DIR_REPLY) 2318 continue; 2319 /* All nf_conn objects are added to hash table twice, one 2320 * for original direction tuple, once for the reply tuple. 
2321 *
2322 * Exception: In the IPS_NAT_CLASH case, only the reply
2323 * tuple is added (the original tuple already existed for
2324 * a different object).
2325 *
2326 * We only need to call the iterator once for each
2327 * conntrack, so we just use the 'reply' direction
2328 * tuple while iterating.
2329 */
2330 ct = nf_ct_tuplehash_to_ctrack(h);
2331
2332 if (iter_data->net &&
2333 !net_eq(iter_data->net, nf_ct_net(ct)))
2334 continue;
2335
2336 if (iter(ct, iter_data->data))
2337 goto found;
2338 }
2339 spin_unlock(lockp);
2340 local_bh_enable();
2341 cond_resched();
2342 }
2343
2344 return NULL;
2345 found:
2346 refcount_inc(&ct->ct_general.use);
2347 spin_unlock(lockp);
2348 local_bh_enable();
2349 return ct;
2350 }
2351
2352 static void nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data),
2353 const struct nf_ct_iter_data *iter_data)
2354 {
2355 unsigned int bucket = 0;
2356 struct nf_conn *ct;
2357
2358 might_sleep();
2359
2360 mutex_lock(&nf_conntrack_mutex);
2361 while ((ct = get_next_corpse(iter, iter_data, &bucket)) != NULL) {
2362 /* Time to push up daisies... */
2363
2364 nf_ct_delete(ct, iter_data->portid, iter_data->report);
2365 nf_ct_put(ct);
2366 cond_resched();
2367 }
2368 mutex_unlock(&nf_conntrack_mutex);
2369 }
2370
2371 void nf_ct_iterate_cleanup_net(int (*iter)(struct nf_conn *i, void *data),
2372 const struct nf_ct_iter_data *iter_data)
2373 {
2374 struct net *net = iter_data->net;
2375 struct nf_conntrack_net *cnet = nf_ct_pernet(net);
2376
2377 might_sleep();
2378
2379 if (atomic_read(&cnet->count) == 0)
2380 return;
2381
2382 nf_ct_iterate_cleanup(iter, iter_data);
2383 }
2384 EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup_net);
2385
2386 /**
2387 * nf_ct_iterate_destroy - destroy unconfirmed conntracks and iterate table
2388 * @iter: callback to invoke for each conntrack
2389 * @data: data to pass to @iter
2390 *
2391 * Like nf_ct_iterate_cleanup, but first marks conntracks on the
2392 * unconfirmed list as dying (so they will not be inserted into the
2393 * main table).
2394 *
2395 * Can only be called in the module exit path.
2396 */
2397 void
2398 nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data)
2399 {
2400 struct nf_ct_iter_data iter_data = {};
2401 struct net *net;
2402
2403 down_read(&net_rwsem);
2404 for_each_net(net) {
2405 struct nf_conntrack_net *cnet = nf_ct_pernet(net);
2406
2407 if (atomic_read(&cnet->count) == 0)
2408 continue;
2409 nf_queue_nf_hook_drop(net);
2410 }
2411 up_read(&net_rwsem);
2412
2413 /* Need to wait for the netns cleanup worker to finish, if it's
2414 * running -- it might have deleted a net namespace from
2415 * the global list, so the hook drop above might not have
2416 * affected all namespaces.
2417 */
2418 net_ns_barrier();
2419
2420 /* An skb with an unconfirmed conntrack could have been reinjected just
2421 * before we called nf_queue_nf_hook_drop().
2422 *
2423 * This makes sure it's inserted into the conntrack table.
2424 */
2425 synchronize_net();
2426
2427 nf_ct_ext_bump_genid();
2428 iter_data.data = data;
2429 nf_ct_iterate_cleanup(iter, &iter_data);
2430
2431 /* Another CPU might be in an RCU read-side section with the
2432 * RCU-protected pointer cleared in the iter callback
2433 * or hidden via nf_ct_ext_bump_genid() above.
2434 *
2435 * Wait until those are done.
2436 */ 2437 synchronize_rcu(); 2438 } 2439 EXPORT_SYMBOL_GPL(nf_ct_iterate_destroy); 2440 2441 static int kill_all(struct nf_conn *i, void *data) 2442 { 2443 return 1; 2444 } 2445 2446 void nf_conntrack_cleanup_start(void) 2447 { 2448 cleanup_nf_conntrack_bpf(); 2449 conntrack_gc_work.exiting = true; 2450 } 2451 2452 void nf_conntrack_cleanup_end(void) 2453 { 2454 RCU_INIT_POINTER(nf_ct_hook, NULL); 2455 cancel_delayed_work_sync(&conntrack_gc_work.dwork); 2456 kvfree(nf_conntrack_hash); 2457 2458 nf_conntrack_proto_fini(); 2459 nf_conntrack_helper_fini(); 2460 nf_conntrack_expect_fini(); 2461 2462 kmem_cache_destroy(nf_conntrack_cachep); 2463 } 2464 2465 /* 2466 * Mishearing the voices in his head, our hero wonders how he's 2467 * supposed to kill the mall. 2468 */ 2469 void nf_conntrack_cleanup_net(struct net *net) 2470 { 2471 LIST_HEAD(single); 2472 2473 list_add(&net->exit_list, &single); 2474 nf_conntrack_cleanup_net_list(&single); 2475 } 2476 2477 void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list) 2478 { 2479 struct nf_ct_iter_data iter_data = {}; 2480 struct net *net; 2481 int busy; 2482 2483 /* 2484 * This makes sure all current packets have passed through 2485 * netfilter framework. Roll on, two-stage module 2486 * delete... 2487 */ 2488 synchronize_rcu_expedited(); 2489 i_see_dead_people: 2490 busy = 0; 2491 list_for_each_entry(net, net_exit_list, exit_list) { 2492 struct nf_conntrack_net *cnet = nf_ct_pernet(net); 2493 2494 iter_data.net = net; 2495 nf_ct_iterate_cleanup_net(kill_all, &iter_data); 2496 if (atomic_read(&cnet->count) != 0) 2497 busy = 1; 2498 } 2499 if (busy) { 2500 schedule(); 2501 goto i_see_dead_people; 2502 } 2503 2504 list_for_each_entry(net, net_exit_list, exit_list) { 2505 nf_conntrack_ecache_pernet_fini(net); 2506 nf_conntrack_expect_pernet_fini(net); 2507 free_percpu(net->ct.stat); 2508 } 2509 } 2510 2511 void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls) 2512 { 2513 struct hlist_nulls_head *hash; 2514 unsigned int nr_slots, i; 2515 2516 if (*sizep > (INT_MAX / sizeof(struct hlist_nulls_head))) 2517 return NULL; 2518 2519 BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head)); 2520 nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head)); 2521 2522 if (nr_slots > (INT_MAX / sizeof(struct hlist_nulls_head))) 2523 return NULL; 2524 2525 hash = kvcalloc(nr_slots, sizeof(struct hlist_nulls_head), GFP_KERNEL); 2526 2527 if (hash && nulls) 2528 for (i = 0; i < nr_slots; i++) 2529 INIT_HLIST_NULLS_HEAD(&hash[i], i); 2530 2531 return hash; 2532 } 2533 EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable); 2534 2535 int nf_conntrack_hash_resize(unsigned int hashsize) 2536 { 2537 int i, bucket; 2538 unsigned int old_size; 2539 struct hlist_nulls_head *hash, *old_hash; 2540 struct nf_conntrack_tuple_hash *h; 2541 struct nf_conn *ct; 2542 2543 if (!hashsize) 2544 return -EINVAL; 2545 2546 hash = nf_ct_alloc_hashtable(&hashsize, 1); 2547 if (!hash) 2548 return -ENOMEM; 2549 2550 mutex_lock(&nf_conntrack_mutex); 2551 old_size = nf_conntrack_htable_size; 2552 if (old_size == hashsize) { 2553 mutex_unlock(&nf_conntrack_mutex); 2554 kvfree(hash); 2555 return 0; 2556 } 2557 2558 local_bh_disable(); 2559 nf_conntrack_all_lock(); 2560 write_seqcount_begin(&nf_conntrack_generation); 2561 2562 /* Lookups in the old hash might happen in parallel, which means we 2563 * might get false negatives during connection lookup. 
New connections 2564 * created because of a false negative won't make it into the hash 2565 * though since that required taking the locks. 2566 */ 2567 2568 for (i = 0; i < nf_conntrack_htable_size; i++) { 2569 while (!hlist_nulls_empty(&nf_conntrack_hash[i])) { 2570 unsigned int zone_id; 2571 2572 h = hlist_nulls_entry(nf_conntrack_hash[i].first, 2573 struct nf_conntrack_tuple_hash, hnnode); 2574 ct = nf_ct_tuplehash_to_ctrack(h); 2575 hlist_nulls_del_rcu(&h->hnnode); 2576 2577 zone_id = nf_ct_zone_id(nf_ct_zone(ct), NF_CT_DIRECTION(h)); 2578 bucket = __hash_conntrack(nf_ct_net(ct), 2579 &h->tuple, zone_id, hashsize); 2580 hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]); 2581 } 2582 } 2583 old_hash = nf_conntrack_hash; 2584 2585 nf_conntrack_hash = hash; 2586 nf_conntrack_htable_size = hashsize; 2587 2588 write_seqcount_end(&nf_conntrack_generation); 2589 nf_conntrack_all_unlock(); 2590 local_bh_enable(); 2591 2592 mutex_unlock(&nf_conntrack_mutex); 2593 2594 synchronize_net(); 2595 kvfree(old_hash); 2596 return 0; 2597 } 2598 2599 int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp) 2600 { 2601 unsigned int hashsize; 2602 int rc; 2603 2604 if (current->nsproxy->net_ns != &init_net) 2605 return -EOPNOTSUPP; 2606 2607 /* On boot, we can set this without any fancy locking. */ 2608 if (!nf_conntrack_hash) 2609 return param_set_uint(val, kp); 2610 2611 rc = kstrtouint(val, 0, &hashsize); 2612 if (rc) 2613 return rc; 2614 2615 return nf_conntrack_hash_resize(hashsize); 2616 } 2617 2618 int nf_conntrack_init_start(void) 2619 { 2620 unsigned long nr_pages = totalram_pages(); 2621 int max_factor = 8; 2622 int ret = -ENOMEM; 2623 int i; 2624 2625 seqcount_spinlock_init(&nf_conntrack_generation, 2626 &nf_conntrack_locks_all_lock); 2627 2628 for (i = 0; i < CONNTRACK_LOCKS; i++) 2629 spin_lock_init(&nf_conntrack_locks[i]); 2630 2631 if (!nf_conntrack_htable_size) { 2632 nf_conntrack_htable_size 2633 = (((nr_pages << PAGE_SHIFT) / 16384) 2634 / sizeof(struct hlist_head)); 2635 if (BITS_PER_LONG >= 64 && 2636 nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE))) 2637 nf_conntrack_htable_size = 262144; 2638 else if (nr_pages > (1024 * 1024 * 1024 / PAGE_SIZE)) 2639 nf_conntrack_htable_size = 65536; 2640 2641 if (nf_conntrack_htable_size < 1024) 2642 nf_conntrack_htable_size = 1024; 2643 /* Use a max. factor of one by default to keep the average 2644 * hash chain length at 2 entries. Each entry has to be added 2645 * twice (once for original direction, once for reply). 2646 * When a table size is given we use the old value of 8 to 2647 * avoid implicit reduction of the max entries setting. 
2648 */ 2649 max_factor = 1; 2650 } 2651 2652 nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1); 2653 if (!nf_conntrack_hash) 2654 return -ENOMEM; 2655 2656 nf_conntrack_max = max_factor * nf_conntrack_htable_size; 2657 2658 nf_conntrack_cachep = kmem_cache_create("nf_conntrack", 2659 sizeof(struct nf_conn), 2660 NFCT_INFOMASK + 1, 2661 SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN, NULL); 2662 if (!nf_conntrack_cachep) 2663 goto err_cachep; 2664 2665 ret = nf_conntrack_expect_init(); 2666 if (ret < 0) 2667 goto err_expect; 2668 2669 ret = nf_conntrack_helper_init(); 2670 if (ret < 0) 2671 goto err_helper; 2672 2673 ret = nf_conntrack_proto_init(); 2674 if (ret < 0) 2675 goto err_proto; 2676 2677 conntrack_gc_work_init(&conntrack_gc_work); 2678 queue_delayed_work(system_power_efficient_wq, &conntrack_gc_work.dwork, HZ); 2679 2680 ret = register_nf_conntrack_bpf(); 2681 if (ret < 0) 2682 goto err_kfunc; 2683 2684 return 0; 2685 2686 err_kfunc: 2687 cancel_delayed_work_sync(&conntrack_gc_work.dwork); 2688 nf_conntrack_proto_fini(); 2689 err_proto: 2690 nf_conntrack_helper_fini(); 2691 err_helper: 2692 nf_conntrack_expect_fini(); 2693 err_expect: 2694 kmem_cache_destroy(nf_conntrack_cachep); 2695 err_cachep: 2696 kvfree(nf_conntrack_hash); 2697 return ret; 2698 } 2699 2700 static void nf_conntrack_set_closing(struct nf_conntrack *nfct) 2701 { 2702 struct nf_conn *ct = nf_ct_to_nf_conn(nfct); 2703 2704 switch (nf_ct_protonum(ct)) { 2705 case IPPROTO_TCP: 2706 nf_conntrack_tcp_set_closing(ct); 2707 break; 2708 } 2709 } 2710 2711 static const struct nf_ct_hook nf_conntrack_hook = { 2712 .update = nf_conntrack_update, 2713 .destroy = nf_ct_destroy, 2714 .get_tuple_skb = nf_conntrack_get_tuple_skb, 2715 .attach = nf_conntrack_attach, 2716 .set_closing = nf_conntrack_set_closing, 2717 .confirm = __nf_conntrack_confirm, 2718 .get_id = nf_conntrack_get_id, 2719 }; 2720 2721 void nf_conntrack_init_end(void) 2722 { 2723 RCU_INIT_POINTER(nf_ct_hook, &nf_conntrack_hook); 2724 } 2725 2726 /* 2727 * We need to use special "null" values, not used in hash table 2728 */ 2729 #define UNCONFIRMED_NULLS_VAL ((1<<30)+0) 2730 2731 int nf_conntrack_init_net(struct net *net) 2732 { 2733 struct nf_conntrack_net *cnet = nf_ct_pernet(net); 2734 int ret = -ENOMEM; 2735 2736 BUILD_BUG_ON(IP_CT_UNTRACKED == IP_CT_NUMBER); 2737 BUILD_BUG_ON_NOT_POWER_OF_2(CONNTRACK_LOCKS); 2738 atomic_set(&cnet->count, 0); 2739 2740 net->ct.stat = alloc_percpu(struct ip_conntrack_stat); 2741 if (!net->ct.stat) 2742 return ret; 2743 2744 ret = nf_conntrack_expect_pernet_init(net); 2745 if (ret < 0) 2746 goto err_expect; 2747 2748 nf_conntrack_acct_pernet_init(net); 2749 nf_conntrack_tstamp_pernet_init(net); 2750 nf_conntrack_ecache_pernet_init(net); 2751 nf_conntrack_proto_pernet_init(net); 2752 2753 return 0; 2754 2755 err_expect: 2756 free_percpu(net->ct.stat); 2757 return ret; 2758 } 2759 2760 /* ctnetlink code shared by both ctnetlink and nf_conntrack_bpf */ 2761 2762 int __nf_ct_change_timeout(struct nf_conn *ct, u64 timeout) 2763 { 2764 if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) 2765 return -EPERM; 2766 2767 __nf_ct_set_timeout(ct, timeout); 2768 2769 if (test_bit(IPS_DYING_BIT, &ct->status)) 2770 return -ETIME; 2771 2772 return 0; 2773 } 2774 EXPORT_SYMBOL_GPL(__nf_ct_change_timeout); 2775 2776 void __nf_ct_change_status(struct nf_conn *ct, unsigned long on, unsigned long off) 2777 { 2778 unsigned int bit; 2779 2780 /* Ignore these unchangable bits */ 2781 on &= ~IPS_UNCHANGEABLE_MASK; 2782 off &= 
~IPS_UNCHANGEABLE_MASK; 2783 2784 for (bit = 0; bit < __IPS_MAX_BIT; bit++) { 2785 if (on & (1 << bit)) 2786 set_bit(bit, &ct->status); 2787 else if (off & (1 << bit)) 2788 clear_bit(bit, &ct->status); 2789 } 2790 } 2791 EXPORT_SYMBOL_GPL(__nf_ct_change_status); 2792 2793 int nf_ct_change_status_common(struct nf_conn *ct, unsigned int status) 2794 { 2795 unsigned long d; 2796 2797 d = ct->status ^ status; 2798 2799 if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING)) 2800 /* unchangeable */ 2801 return -EBUSY; 2802 2803 if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY)) 2804 /* SEEN_REPLY bit can only be set */ 2805 return -EBUSY; 2806 2807 if (d & IPS_ASSURED && !(status & IPS_ASSURED)) 2808 /* ASSURED bit can only be set */ 2809 return -EBUSY; 2810 2811 __nf_ct_change_status(ct, status, 0); 2812 return 0; 2813 } 2814 EXPORT_SYMBOL_GPL(nf_ct_change_status_common); 2815
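
/* A minimal sketch of how a caller of nf_conntrack_alloc() is expected to
 * publish a new entry: the object comes back with a zero refcount, so,
 * mirroring init_conntrack() above, extensions are added while the object is
 * still private and the refcount is set to 1 only afterwards.  The function
 * name example_ct_create() and the choice of extensions are hypothetical.
 */
static struct nf_conn *
example_ct_create(struct net *net, const struct nf_conntrack_zone *zone,
		  const struct nf_conntrack_tuple *orig,
		  const struct nf_conntrack_tuple *repl)
{
	struct nf_conn *ct;

	ct = nf_conntrack_alloc(net, zone, orig, repl, GFP_ATOMIC);
	if (IS_ERR(ct))
		return ct;

	/* Add extensions while no other CPU can see the object yet. */
	nf_ct_acct_ext_add(ct, GFP_ATOMIC);
	nf_ct_labels_ext_add(ct);

	/* Make the initialised fields visible before readers can succeed
	 * with refcount_inc_not_zero() (SLAB_TYPESAFE_BY_RCU rules).
	 */
	smp_wmb();
	refcount_set(&ct->ct_general.use, 1);

	/* Release with nf_ct_put(); nf_conntrack_free() itself expects the
	 * refcount to have dropped to zero already.
	 */
	return ct;
}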
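
/* nf_conntrack_in() is the per-packet entry point.  In-tree it is attached
 * to the PRE_ROUTING/LOCAL_OUT hooks by the protocol glue in
 * nf_conntrack_proto.c; the sketch below only illustrates roughly what such
 * a registration looks like.  The names example_ct_hook(), example_ct_ops
 * and example_ct_register() are hypothetical, and NF_IP_PRI_CONNTRACK comes
 * from <linux/netfilter_ipv4.h>.
 */
static unsigned int example_ct_hook(void *priv, struct sk_buff *skb,
				    const struct nf_hook_state *state)
{
	return nf_conntrack_in(skb, state);
}

static const struct nf_hook_ops example_ct_ops = {
	.hook		= example_ct_hook,
	.pf		= NFPROTO_IPV4,
	.hooknum	= NF_INET_PRE_ROUTING,
	.priority	= NF_IP_PRI_CONNTRACK,
};

static int example_ct_register(struct net *net)
{
	/* Paired with nf_unregister_net_hook(net, &example_ct_ops). */
	return nf_register_net_hook(net, &example_ct_ops);
}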
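
/* The nf_ct_port_*() helpers above are intended to be plugged into the
 * netlink callbacks of a port-based struct nf_conntrack_l4proto tracker, as
 * tcp/udp/sctp/dccp do.  A sketch, assuming the callback members keep their
 * usual names (tuple_to_nlattr, nlattr_tuple_size, nlattr_to_tuple,
 * nla_policy); the tracker itself (protocol number 253, reserved for
 * experimentation) is hypothetical and only the netlink-related fields are
 * shown.
 */
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
static const struct nf_conntrack_l4proto example_l4proto = {
	.l4proto		= 253,
	.tuple_to_nlattr	= nf_ct_port_tuple_to_nlattr,
	.nlattr_tuple_size	= nf_ct_port_nlattr_tuple_size,
	.nlattr_to_tuple	= nf_ct_port_nlattr_to_tuple,
	.nla_policy		= nf_ct_port_nla_policy,
};
#endif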
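
/* nf_ct_iterate_cleanup_net() above deletes every entry for which the
 * iterator returns non-zero (kill_all() being the degenerate case).  A
 * minimal sketch of a more selective caller; example_kill_by_mark() and
 * example_flush_by_mark() are hypothetical, and the mark comparison assumes
 * CONFIG_NF_CONNTRACK_MARK.
 */
static int example_kill_by_mark(struct nf_conn *ct, void *data)
{
#ifdef CONFIG_NF_CONNTRACK_MARK
	const u32 *mark = data;

	/* A non-zero return value means "delete this entry". */
	return READ_ONCE(ct->mark) == *mark;
#else
	return 0;
#endif
}

/* Must run in process context: the iteration takes mutexes and may sleep. */
static void example_flush_by_mark(struct net *net, u32 mark)
{
	struct nf_ct_iter_data iter_data = {
		.net	= net,
		.data	= &mark,
	};

	nf_ct_iterate_cleanup_net(example_kill_by_mark, &iter_data);
}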
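
/* The two helpers above are shared with ctnetlink and the conntrack BPF
 * kfuncs.  A small sketch of a caller that holds a reference to @ct;
 * example_assure_and_extend() is hypothetical, the 600 second value is
 * arbitrary, and the timeout is assumed to be a relative value in jiffies,
 * as implied by the __nf_ct_set_timeout() call above.
 */
static int example_assure_and_extend(struct nf_conn *ct)
{
	int err;

	/* Setting IPS_ASSURED is allowed; clearing SEEN_REPLY/ASSURED or
	 * changing EXPECTED/CONFIRMED/DYING is rejected with -EBUSY.
	 */
	err = nf_ct_change_status_common(ct, ct->status | IPS_ASSURED);
	if (err)
		return err;

	/* Re-arm the timeout relative to now; fixed-timeout entries yield
	 * -EPERM and already-dying entries -ETIME.
	 */
	return __nf_ct_change_timeout(ct, 600ULL * HZ);
}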