1e48c414eSArnaldo Carvalho de Melo /* 2e48c414eSArnaldo Carvalho de Melo * INET An implementation of the TCP/IP protocol suite for the LINUX 3e48c414eSArnaldo Carvalho de Melo * operating system. INET is implemented using the BSD Socket 4e48c414eSArnaldo Carvalho de Melo * interface as the means of communication with the user level. 5e48c414eSArnaldo Carvalho de Melo * 6e48c414eSArnaldo Carvalho de Melo * Generic TIME_WAIT sockets functions 7e48c414eSArnaldo Carvalho de Melo * 8e48c414eSArnaldo Carvalho de Melo * From code orinally in TCP 9e48c414eSArnaldo Carvalho de Melo */ 10e48c414eSArnaldo Carvalho de Melo 11172589ccSIlpo Järvinen #include <linux/kernel.h> 129e337b0fSVegard Nossum #include <linux/kmemcheck.h> 135a0e3ad6STejun Heo #include <linux/slab.h> 143a9a231dSPaul Gortmaker #include <linux/module.h> 15e48c414eSArnaldo Carvalho de Melo #include <net/inet_hashtables.h> 16e48c414eSArnaldo Carvalho de Melo #include <net/inet_timewait_sock.h> 17696ab2d3SArnaldo Carvalho de Melo #include <net/ip.h> 18e48c414eSArnaldo Carvalho de Melo 1913475a30SEric Dumazet 202a8875e7SEric Dumazet /** 212a8875e7SEric Dumazet * inet_twsk_unhash - unhash a timewait socket from established hash 222a8875e7SEric Dumazet * @tw: timewait socket 232a8875e7SEric Dumazet * 242a8875e7SEric Dumazet * unhash a timewait socket from established hash, if hashed. 252a8875e7SEric Dumazet * ehash lock must be held by caller. 262a8875e7SEric Dumazet * Returns 1 if caller should call inet_twsk_put() after lock release. 2713475a30SEric Dumazet */ 2813475a30SEric Dumazet int inet_twsk_unhash(struct inet_timewait_sock *tw) 2913475a30SEric Dumazet { 3013475a30SEric Dumazet if (hlist_nulls_unhashed(&tw->tw_node)) 3113475a30SEric Dumazet return 0; 3213475a30SEric Dumazet 3313475a30SEric Dumazet hlist_nulls_del_rcu(&tw->tw_node); 3413475a30SEric Dumazet sk_nulls_node_init(&tw->tw_node); 352a8875e7SEric Dumazet /* 362a8875e7SEric Dumazet * We cannot call inet_twsk_put() ourself under lock, 372a8875e7SEric Dumazet * caller must call it for us. 382a8875e7SEric Dumazet */ 3913475a30SEric Dumazet return 1; 4013475a30SEric Dumazet } 4113475a30SEric Dumazet 422a8875e7SEric Dumazet /** 432a8875e7SEric Dumazet * inet_twsk_bind_unhash - unhash a timewait socket from bind hash 442a8875e7SEric Dumazet * @tw: timewait socket 452a8875e7SEric Dumazet * @hashinfo: hashinfo pointer 462a8875e7SEric Dumazet * 472a8875e7SEric Dumazet * unhash a timewait socket from bind hash, if hashed. 482a8875e7SEric Dumazet * bind hash lock must be held by caller. 492a8875e7SEric Dumazet * Returns 1 if caller should call inet_twsk_put() after lock release. 503cdaedaeSEric Dumazet */ 513cdaedaeSEric Dumazet int inet_twsk_bind_unhash(struct inet_timewait_sock *tw, 523cdaedaeSEric Dumazet struct inet_hashinfo *hashinfo) 533cdaedaeSEric Dumazet { 543cdaedaeSEric Dumazet struct inet_bind_bucket *tb = tw->tw_tb; 553cdaedaeSEric Dumazet 563cdaedaeSEric Dumazet if (!tb) 573cdaedaeSEric Dumazet return 0; 583cdaedaeSEric Dumazet 593cdaedaeSEric Dumazet __hlist_del(&tw->tw_bind_node); 603cdaedaeSEric Dumazet tw->tw_tb = NULL; 613cdaedaeSEric Dumazet inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); 622a8875e7SEric Dumazet /* 632a8875e7SEric Dumazet * We cannot call inet_twsk_put() ourself under lock, 642a8875e7SEric Dumazet * caller must call it for us. 652a8875e7SEric Dumazet */ 663cdaedaeSEric Dumazet return 1; 673cdaedaeSEric Dumazet } 683cdaedaeSEric Dumazet 69e48c414eSArnaldo Carvalho de Melo /* Must be called with locally disabled BHs. */ 70acd159b6SAdrian Bunk static void __inet_twsk_kill(struct inet_timewait_sock *tw, 71acd159b6SAdrian Bunk struct inet_hashinfo *hashinfo) 72e48c414eSArnaldo Carvalho de Melo { 73e48c414eSArnaldo Carvalho de Melo struct inet_bind_hashbucket *bhead; 7413475a30SEric Dumazet int refcnt; 75e48c414eSArnaldo Carvalho de Melo /* Unlink from established hashes. */ 769db66bdcSEric Dumazet spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash); 77e48c414eSArnaldo Carvalho de Melo 789db66bdcSEric Dumazet spin_lock(lock); 7913475a30SEric Dumazet refcnt = inet_twsk_unhash(tw); 809db66bdcSEric Dumazet spin_unlock(lock); 81e48c414eSArnaldo Carvalho de Melo 82e48c414eSArnaldo Carvalho de Melo /* Disassociate with bind bucket. */ 837f635ab7SPavel Emelyanov bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num, 847f635ab7SPavel Emelyanov hashinfo->bhash_size)]; 853cdaedaeSEric Dumazet 86e48c414eSArnaldo Carvalho de Melo spin_lock(&bhead->lock); 873cdaedaeSEric Dumazet refcnt += inet_twsk_bind_unhash(tw, hashinfo); 88e48c414eSArnaldo Carvalho de Melo spin_unlock(&bhead->lock); 893cdaedaeSEric Dumazet 9005dbc7b5SEric Dumazet BUG_ON(refcnt >= atomic_read(&tw->tw_refcnt)); 9105dbc7b5SEric Dumazet atomic_sub(refcnt, &tw->tw_refcnt); 92e48c414eSArnaldo Carvalho de Melo } 93e48c414eSArnaldo Carvalho de Melo 9405dbc7b5SEric Dumazet void inet_twsk_free(struct inet_timewait_sock *tw) 957054fb93SPavel Emelyanov { 967054fb93SPavel Emelyanov struct module *owner = tw->tw_prot->owner; 977054fb93SPavel Emelyanov twsk_destructor((struct sock *)tw); 987054fb93SPavel Emelyanov #ifdef SOCK_REFCNT_DEBUG 994dbc8ef7SArnaldo Carvalho de Melo pr_debug("%s timewait_sock %p released\n", tw->tw_prot->name, tw); 1007054fb93SPavel Emelyanov #endif 1017054fb93SPavel Emelyanov kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw); 1027054fb93SPavel Emelyanov module_put(owner); 1037054fb93SPavel Emelyanov } 1044dbc8ef7SArnaldo Carvalho de Melo 1054dbc8ef7SArnaldo Carvalho de Melo void inet_twsk_put(struct inet_timewait_sock *tw) 1064dbc8ef7SArnaldo Carvalho de Melo { 1074dbc8ef7SArnaldo Carvalho de Melo if (atomic_dec_and_test(&tw->tw_refcnt)) 1084dbc8ef7SArnaldo Carvalho de Melo inet_twsk_free(tw); 1097054fb93SPavel Emelyanov } 1107054fb93SPavel Emelyanov EXPORT_SYMBOL_GPL(inet_twsk_put); 1117054fb93SPavel Emelyanov 11205dbc7b5SEric Dumazet static void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw, 11305dbc7b5SEric Dumazet struct hlist_nulls_head *list) 11405dbc7b5SEric Dumazet { 11505dbc7b5SEric Dumazet hlist_nulls_add_head_rcu(&tw->tw_node, list); 11605dbc7b5SEric Dumazet } 11705dbc7b5SEric Dumazet 11805dbc7b5SEric Dumazet static void inet_twsk_add_bind_node(struct inet_timewait_sock *tw, 11905dbc7b5SEric Dumazet struct hlist_head *list) 12005dbc7b5SEric Dumazet { 12105dbc7b5SEric Dumazet hlist_add_head(&tw->tw_bind_node, list); 12205dbc7b5SEric Dumazet } 12305dbc7b5SEric Dumazet 124e48c414eSArnaldo Carvalho de Melo /* 125e48c414eSArnaldo Carvalho de Melo * Enter the time wait state. This is called with locally disabled BH. 126e48c414eSArnaldo Carvalho de Melo * Essentially we whip up a timewait bucket, copy the relevant info into it 127e48c414eSArnaldo Carvalho de Melo * from the SK, and mess with hash chains and list linkage. 128e48c414eSArnaldo Carvalho de Melo */ 129e48c414eSArnaldo Carvalho de Melo void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, 130e48c414eSArnaldo Carvalho de Melo struct inet_hashinfo *hashinfo) 131e48c414eSArnaldo Carvalho de Melo { 132e48c414eSArnaldo Carvalho de Melo const struct inet_sock *inet = inet_sk(sk); 133463c84b9SArnaldo Carvalho de Melo const struct inet_connection_sock *icsk = inet_csk(sk); 13481c3d547SEric Dumazet struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash); 1359db66bdcSEric Dumazet spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash); 136e48c414eSArnaldo Carvalho de Melo struct inet_bind_hashbucket *bhead; 137e48c414eSArnaldo Carvalho de Melo /* Step 1: Put TW into bind hash. Original socket stays there too. 138e48c414eSArnaldo Carvalho de Melo Note, that any socket with inet->num != 0 MUST be bound in 139e48c414eSArnaldo Carvalho de Melo binding cache, even if it is closed. 140e48c414eSArnaldo Carvalho de Melo */ 141c720c7e8SEric Dumazet bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num, 1427f635ab7SPavel Emelyanov hashinfo->bhash_size)]; 143e48c414eSArnaldo Carvalho de Melo spin_lock(&bhead->lock); 144463c84b9SArnaldo Carvalho de Melo tw->tw_tb = icsk->icsk_bind_hash; 145547b792cSIlpo Järvinen WARN_ON(!icsk->icsk_bind_hash); 146e48c414eSArnaldo Carvalho de Melo inet_twsk_add_bind_node(tw, &tw->tw_tb->owners); 147e48c414eSArnaldo Carvalho de Melo spin_unlock(&bhead->lock); 148e48c414eSArnaldo Carvalho de Melo 1499db66bdcSEric Dumazet spin_lock(lock); 150e48c414eSArnaldo Carvalho de Melo 1513ab5aee7SEric Dumazet /* 15205dbc7b5SEric Dumazet * Step 2: Hash TW into tcp ehash chain. 15305dbc7b5SEric Dumazet * Notes : 15405dbc7b5SEric Dumazet * - tw_refcnt is set to 3 because : 15505dbc7b5SEric Dumazet * - We have one reference from bhash chain. 15605dbc7b5SEric Dumazet * - We have one reference from ehash chain. 15705dbc7b5SEric Dumazet * We can use atomic_set() because prior spin_lock()/spin_unlock() 15805dbc7b5SEric Dumazet * committed into memory all tw fields. 1593ab5aee7SEric Dumazet */ 16005dbc7b5SEric Dumazet atomic_set(&tw->tw_refcnt, 1 + 1 + 1); 16105dbc7b5SEric Dumazet inet_twsk_add_node_rcu(tw, &ehead->chain); 1623ab5aee7SEric Dumazet 16305dbc7b5SEric Dumazet /* Step 3: Remove SK from hash chain */ 1643ab5aee7SEric Dumazet if (__sk_nulls_del_node_init_rcu(sk)) 1653ab5aee7SEric Dumazet sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 166e48c414eSArnaldo Carvalho de Melo 1679db66bdcSEric Dumazet spin_unlock(lock); 168e48c414eSArnaldo Carvalho de Melo } 169696ab2d3SArnaldo Carvalho de Melo EXPORT_SYMBOL_GPL(__inet_twsk_hashdance); 170696ab2d3SArnaldo Carvalho de Melo 171c676270bSArnaldo Carvalho de Melo struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state) 172c676270bSArnaldo Carvalho de Melo { 1736d6ee43eSArnaldo Carvalho de Melo struct inet_timewait_sock *tw = 1746d6ee43eSArnaldo Carvalho de Melo kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab, 17554e6ecb2SChristoph Lameter GFP_ATOMIC); 176*00db4124SIan Morris if (tw) { 177c676270bSArnaldo Carvalho de Melo const struct inet_sock *inet = inet_sk(sk); 178c676270bSArnaldo Carvalho de Melo 1799e337b0fSVegard Nossum kmemcheck_annotate_bitfield(tw, flags); 1809e337b0fSVegard Nossum 181c676270bSArnaldo Carvalho de Melo /* Give us an identity. */ 182c720c7e8SEric Dumazet tw->tw_daddr = inet->inet_daddr; 183c720c7e8SEric Dumazet tw->tw_rcv_saddr = inet->inet_rcv_saddr; 184c676270bSArnaldo Carvalho de Melo tw->tw_bound_dev_if = sk->sk_bound_dev_if; 18566b13d99SEric Dumazet tw->tw_tos = inet->tos; 186c720c7e8SEric Dumazet tw->tw_num = inet->inet_num; 187c676270bSArnaldo Carvalho de Melo tw->tw_state = TCP_TIME_WAIT; 188c676270bSArnaldo Carvalho de Melo tw->tw_substate = state; 189c720c7e8SEric Dumazet tw->tw_sport = inet->inet_sport; 190c720c7e8SEric Dumazet tw->tw_dport = inet->inet_dport; 191c676270bSArnaldo Carvalho de Melo tw->tw_family = sk->sk_family; 192c676270bSArnaldo Carvalho de Melo tw->tw_reuse = sk->sk_reuse; 19381c3d547SEric Dumazet tw->tw_hash = sk->sk_hash; 194c676270bSArnaldo Carvalho de Melo tw->tw_ipv6only = 0; 195f5715aeaSKOVACS Krisztian tw->tw_transparent = inet->transparent; 196c676270bSArnaldo Carvalho de Melo tw->tw_prot = sk->sk_prot_creator; 19733cf7c90SEric Dumazet atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie)); 198efd7ef1cSEric W. Biederman twsk_net_set(tw, sock_net(sk)); 19947e1c323SEric Dumazet /* 20047e1c323SEric Dumazet * Because we use RCU lookups, we should not set tw_refcnt 20147e1c323SEric Dumazet * to a non null value before everything is setup for this 20247e1c323SEric Dumazet * timewait socket. 20347e1c323SEric Dumazet */ 20447e1c323SEric Dumazet atomic_set(&tw->tw_refcnt, 0); 205c676270bSArnaldo Carvalho de Melo inet_twsk_dead_node_init(tw); 206eeb2b856SArnaldo Carvalho de Melo __module_get(tw->tw_prot->owner); 207c676270bSArnaldo Carvalho de Melo } 208c676270bSArnaldo Carvalho de Melo 209c676270bSArnaldo Carvalho de Melo return tw; 210c676270bSArnaldo Carvalho de Melo } 211696ab2d3SArnaldo Carvalho de Melo EXPORT_SYMBOL_GPL(inet_twsk_alloc); 212696ab2d3SArnaldo Carvalho de Melo 213696ab2d3SArnaldo Carvalho de Melo /* Returns non-zero if quota exceeded. */ 214696ab2d3SArnaldo Carvalho de Melo static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr, 215696ab2d3SArnaldo Carvalho de Melo const int slot) 216696ab2d3SArnaldo Carvalho de Melo { 217696ab2d3SArnaldo Carvalho de Melo struct inet_timewait_sock *tw; 218696ab2d3SArnaldo Carvalho de Melo unsigned int killed; 219696ab2d3SArnaldo Carvalho de Melo int ret; 220696ab2d3SArnaldo Carvalho de Melo 221696ab2d3SArnaldo Carvalho de Melo /* NOTE: compare this to previous version where lock 222696ab2d3SArnaldo Carvalho de Melo * was released after detaching chain. It was racy, 223696ab2d3SArnaldo Carvalho de Melo * because tw buckets are scheduled in not serialized context 224696ab2d3SArnaldo Carvalho de Melo * in 2.3 (with netfilter), and with softnet it is common, because 225696ab2d3SArnaldo Carvalho de Melo * soft irqs are not sequenced. 226696ab2d3SArnaldo Carvalho de Melo */ 227696ab2d3SArnaldo Carvalho de Melo killed = 0; 228696ab2d3SArnaldo Carvalho de Melo ret = 0; 229696ab2d3SArnaldo Carvalho de Melo rescan: 230b67bfe0dSSasha Levin inet_twsk_for_each_inmate(tw, &twdr->cells[slot]) { 231696ab2d3SArnaldo Carvalho de Melo __inet_twsk_del_dead_node(tw); 232696ab2d3SArnaldo Carvalho de Melo spin_unlock(&twdr->death_lock); 233696ab2d3SArnaldo Carvalho de Melo __inet_twsk_kill(tw, twdr->hashinfo); 234f2bf415cSPavel Emelyanov #ifdef CONFIG_NET_NS 235f2bf415cSPavel Emelyanov NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITED); 236f2bf415cSPavel Emelyanov #endif 237696ab2d3SArnaldo Carvalho de Melo inet_twsk_put(tw); 238696ab2d3SArnaldo Carvalho de Melo killed++; 239696ab2d3SArnaldo Carvalho de Melo spin_lock(&twdr->death_lock); 240696ab2d3SArnaldo Carvalho de Melo if (killed > INET_TWDR_TWKILL_QUOTA) { 241696ab2d3SArnaldo Carvalho de Melo ret = 1; 242696ab2d3SArnaldo Carvalho de Melo break; 243696ab2d3SArnaldo Carvalho de Melo } 244696ab2d3SArnaldo Carvalho de Melo 245696ab2d3SArnaldo Carvalho de Melo /* While we dropped twdr->death_lock, another cpu may have 246696ab2d3SArnaldo Carvalho de Melo * killed off the next TW bucket in the list, therefore 247696ab2d3SArnaldo Carvalho de Melo * do a fresh re-read of the hlist head node with the 248696ab2d3SArnaldo Carvalho de Melo * lock reacquired. We still use the hlist traversal 249696ab2d3SArnaldo Carvalho de Melo * macro in order to get the prefetches. 250696ab2d3SArnaldo Carvalho de Melo */ 251696ab2d3SArnaldo Carvalho de Melo goto rescan; 252696ab2d3SArnaldo Carvalho de Melo } 253696ab2d3SArnaldo Carvalho de Melo 254696ab2d3SArnaldo Carvalho de Melo twdr->tw_count -= killed; 255f2bf415cSPavel Emelyanov #ifndef CONFIG_NET_NS 256f2bf415cSPavel Emelyanov NET_ADD_STATS_BH(&init_net, LINUX_MIB_TIMEWAITED, killed); 257f2bf415cSPavel Emelyanov #endif 258696ab2d3SArnaldo Carvalho de Melo return ret; 259696ab2d3SArnaldo Carvalho de Melo } 260696ab2d3SArnaldo Carvalho de Melo 261696ab2d3SArnaldo Carvalho de Melo void inet_twdr_hangman(unsigned long data) 262696ab2d3SArnaldo Carvalho de Melo { 263696ab2d3SArnaldo Carvalho de Melo struct inet_timewait_death_row *twdr; 26495c96174SEric Dumazet unsigned int need_timer; 265696ab2d3SArnaldo Carvalho de Melo 266696ab2d3SArnaldo Carvalho de Melo twdr = (struct inet_timewait_death_row *)data; 267696ab2d3SArnaldo Carvalho de Melo spin_lock(&twdr->death_lock); 268696ab2d3SArnaldo Carvalho de Melo 269696ab2d3SArnaldo Carvalho de Melo if (twdr->tw_count == 0) 270696ab2d3SArnaldo Carvalho de Melo goto out; 271696ab2d3SArnaldo Carvalho de Melo 272696ab2d3SArnaldo Carvalho de Melo need_timer = 0; 273696ab2d3SArnaldo Carvalho de Melo if (inet_twdr_do_twkill_work(twdr, twdr->slot)) { 274696ab2d3SArnaldo Carvalho de Melo twdr->thread_slots |= (1 << twdr->slot); 275696ab2d3SArnaldo Carvalho de Melo schedule_work(&twdr->twkill_work); 276696ab2d3SArnaldo Carvalho de Melo need_timer = 1; 277696ab2d3SArnaldo Carvalho de Melo } else { 278696ab2d3SArnaldo Carvalho de Melo /* We purged the entire slot, anything left? */ 279696ab2d3SArnaldo Carvalho de Melo if (twdr->tw_count) 280696ab2d3SArnaldo Carvalho de Melo need_timer = 1; 281696ab2d3SArnaldo Carvalho de Melo twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1)); 28280a1096bSOctavian Purdila } 283696ab2d3SArnaldo Carvalho de Melo if (need_timer) 284696ab2d3SArnaldo Carvalho de Melo mod_timer(&twdr->tw_timer, jiffies + twdr->period); 285696ab2d3SArnaldo Carvalho de Melo out: 286696ab2d3SArnaldo Carvalho de Melo spin_unlock(&twdr->death_lock); 287696ab2d3SArnaldo Carvalho de Melo } 288696ab2d3SArnaldo Carvalho de Melo EXPORT_SYMBOL_GPL(inet_twdr_hangman); 289696ab2d3SArnaldo Carvalho de Melo 29065f27f38SDavid Howells void inet_twdr_twkill_work(struct work_struct *work) 291696ab2d3SArnaldo Carvalho de Melo { 29265f27f38SDavid Howells struct inet_timewait_death_row *twdr = 29365f27f38SDavid Howells container_of(work, struct inet_timewait_death_row, twkill_work); 294696ab2d3SArnaldo Carvalho de Melo int i; 295696ab2d3SArnaldo Carvalho de Melo 29695c9382aSPavel Emelyanov BUILD_BUG_ON((INET_TWDR_TWKILL_SLOTS - 1) > 29795c9382aSPavel Emelyanov (sizeof(twdr->thread_slots) * 8)); 298696ab2d3SArnaldo Carvalho de Melo 299696ab2d3SArnaldo Carvalho de Melo while (twdr->thread_slots) { 300696ab2d3SArnaldo Carvalho de Melo spin_lock_bh(&twdr->death_lock); 301696ab2d3SArnaldo Carvalho de Melo for (i = 0; i < INET_TWDR_TWKILL_SLOTS; i++) { 302696ab2d3SArnaldo Carvalho de Melo if (!(twdr->thread_slots & (1 << i))) 303696ab2d3SArnaldo Carvalho de Melo continue; 304696ab2d3SArnaldo Carvalho de Melo 305696ab2d3SArnaldo Carvalho de Melo while (inet_twdr_do_twkill_work(twdr, i) != 0) { 306696ab2d3SArnaldo Carvalho de Melo if (need_resched()) { 307696ab2d3SArnaldo Carvalho de Melo spin_unlock_bh(&twdr->death_lock); 308696ab2d3SArnaldo Carvalho de Melo schedule(); 309696ab2d3SArnaldo Carvalho de Melo spin_lock_bh(&twdr->death_lock); 310696ab2d3SArnaldo Carvalho de Melo } 311696ab2d3SArnaldo Carvalho de Melo } 312696ab2d3SArnaldo Carvalho de Melo 313696ab2d3SArnaldo Carvalho de Melo twdr->thread_slots &= ~(1 << i); 314696ab2d3SArnaldo Carvalho de Melo } 315696ab2d3SArnaldo Carvalho de Melo spin_unlock_bh(&twdr->death_lock); 316696ab2d3SArnaldo Carvalho de Melo } 317696ab2d3SArnaldo Carvalho de Melo } 318696ab2d3SArnaldo Carvalho de Melo EXPORT_SYMBOL_GPL(inet_twdr_twkill_work); 319696ab2d3SArnaldo Carvalho de Melo 320696ab2d3SArnaldo Carvalho de Melo /* These are always called from BH context. See callers in 321696ab2d3SArnaldo Carvalho de Melo * tcp_input.c to verify this. 322696ab2d3SArnaldo Carvalho de Melo */ 323696ab2d3SArnaldo Carvalho de Melo 324696ab2d3SArnaldo Carvalho de Melo /* This is for handling early-kills of TIME_WAIT sockets. */ 325696ab2d3SArnaldo Carvalho de Melo void inet_twsk_deschedule(struct inet_timewait_sock *tw, 326696ab2d3SArnaldo Carvalho de Melo struct inet_timewait_death_row *twdr) 327696ab2d3SArnaldo Carvalho de Melo { 328696ab2d3SArnaldo Carvalho de Melo spin_lock(&twdr->death_lock); 329696ab2d3SArnaldo Carvalho de Melo if (inet_twsk_del_dead_node(tw)) { 330696ab2d3SArnaldo Carvalho de Melo inet_twsk_put(tw); 331696ab2d3SArnaldo Carvalho de Melo if (--twdr->tw_count == 0) 332696ab2d3SArnaldo Carvalho de Melo del_timer(&twdr->tw_timer); 333696ab2d3SArnaldo Carvalho de Melo } 334696ab2d3SArnaldo Carvalho de Melo spin_unlock(&twdr->death_lock); 335696ab2d3SArnaldo Carvalho de Melo __inet_twsk_kill(tw, twdr->hashinfo); 336696ab2d3SArnaldo Carvalho de Melo } 337696ab2d3SArnaldo Carvalho de Melo EXPORT_SYMBOL(inet_twsk_deschedule); 338696ab2d3SArnaldo Carvalho de Melo 339696ab2d3SArnaldo Carvalho de Melo void inet_twsk_schedule(struct inet_timewait_sock *tw, 340696ab2d3SArnaldo Carvalho de Melo struct inet_timewait_death_row *twdr, 341696ab2d3SArnaldo Carvalho de Melo const int timeo, const int timewait_len) 342696ab2d3SArnaldo Carvalho de Melo { 343696ab2d3SArnaldo Carvalho de Melo struct hlist_head *list; 344696ab2d3SArnaldo Carvalho de Melo int slot; 345696ab2d3SArnaldo Carvalho de Melo 346696ab2d3SArnaldo Carvalho de Melo /* timeout := RTO * 3.5 347696ab2d3SArnaldo Carvalho de Melo * 348696ab2d3SArnaldo Carvalho de Melo * 3.5 = 1+2+0.5 to wait for two retransmits. 349696ab2d3SArnaldo Carvalho de Melo * 350696ab2d3SArnaldo Carvalho de Melo * RATIONALE: if FIN arrived and we entered TIME-WAIT state, 351696ab2d3SArnaldo Carvalho de Melo * our ACK acking that FIN can be lost. If N subsequent retransmitted 352696ab2d3SArnaldo Carvalho de Melo * FINs (or previous seqments) are lost (probability of such event 353696ab2d3SArnaldo Carvalho de Melo * is p^(N+1), where p is probability to lose single packet and 354696ab2d3SArnaldo Carvalho de Melo * time to detect the loss is about RTO*(2^N - 1) with exponential 355696ab2d3SArnaldo Carvalho de Melo * backoff). Normal timewait length is calculated so, that we 356696ab2d3SArnaldo Carvalho de Melo * waited at least for one retransmitted FIN (maximal RTO is 120sec). 357696ab2d3SArnaldo Carvalho de Melo * [ BTW Linux. following BSD, violates this requirement waiting 358696ab2d3SArnaldo Carvalho de Melo * only for 60sec, we should wait at least for 240 secs. 359696ab2d3SArnaldo Carvalho de Melo * Well, 240 consumes too much of resources 8) 360696ab2d3SArnaldo Carvalho de Melo * ] 361696ab2d3SArnaldo Carvalho de Melo * This interval is not reduced to catch old duplicate and 362696ab2d3SArnaldo Carvalho de Melo * responces to our wandering segments living for two MSLs. 363696ab2d3SArnaldo Carvalho de Melo * However, if we use PAWS to detect 364696ab2d3SArnaldo Carvalho de Melo * old duplicates, we can reduce the interval to bounds required 365696ab2d3SArnaldo Carvalho de Melo * by RTO, rather than MSL. So, if peer understands PAWS, we 366696ab2d3SArnaldo Carvalho de Melo * kill tw bucket after 3.5*RTO (it is important that this number 367696ab2d3SArnaldo Carvalho de Melo * is greater than TS tick!) and detect old duplicates with help 368696ab2d3SArnaldo Carvalho de Melo * of PAWS. 369696ab2d3SArnaldo Carvalho de Melo */ 370696ab2d3SArnaldo Carvalho de Melo slot = (timeo + (1 << INET_TWDR_RECYCLE_TICK) - 1) >> INET_TWDR_RECYCLE_TICK; 371696ab2d3SArnaldo Carvalho de Melo 372696ab2d3SArnaldo Carvalho de Melo spin_lock(&twdr->death_lock); 373696ab2d3SArnaldo Carvalho de Melo 374696ab2d3SArnaldo Carvalho de Melo /* Unlink it, if it was scheduled */ 375696ab2d3SArnaldo Carvalho de Melo if (inet_twsk_del_dead_node(tw)) 376696ab2d3SArnaldo Carvalho de Melo twdr->tw_count--; 377696ab2d3SArnaldo Carvalho de Melo else 378696ab2d3SArnaldo Carvalho de Melo atomic_inc(&tw->tw_refcnt); 379696ab2d3SArnaldo Carvalho de Melo 380696ab2d3SArnaldo Carvalho de Melo if (slot >= INET_TWDR_RECYCLE_SLOTS) { 381696ab2d3SArnaldo Carvalho de Melo /* Schedule to slow timer */ 382696ab2d3SArnaldo Carvalho de Melo if (timeo >= timewait_len) { 383696ab2d3SArnaldo Carvalho de Melo slot = INET_TWDR_TWKILL_SLOTS - 1; 384696ab2d3SArnaldo Carvalho de Melo } else { 385172589ccSIlpo Järvinen slot = DIV_ROUND_UP(timeo, twdr->period); 386696ab2d3SArnaldo Carvalho de Melo if (slot >= INET_TWDR_TWKILL_SLOTS) 387696ab2d3SArnaldo Carvalho de Melo slot = INET_TWDR_TWKILL_SLOTS - 1; 388696ab2d3SArnaldo Carvalho de Melo } 38996f817feSEric Dumazet tw->tw_ttd = inet_tw_time_stamp() + timeo; 390696ab2d3SArnaldo Carvalho de Melo slot = (twdr->slot + slot) & (INET_TWDR_TWKILL_SLOTS - 1); 391696ab2d3SArnaldo Carvalho de Melo list = &twdr->cells[slot]; 392696ab2d3SArnaldo Carvalho de Melo } else { 39396f817feSEric Dumazet tw->tw_ttd = inet_tw_time_stamp() + (slot << INET_TWDR_RECYCLE_TICK); 394696ab2d3SArnaldo Carvalho de Melo 395696ab2d3SArnaldo Carvalho de Melo if (twdr->twcal_hand < 0) { 396696ab2d3SArnaldo Carvalho de Melo twdr->twcal_hand = 0; 397696ab2d3SArnaldo Carvalho de Melo twdr->twcal_jiffie = jiffies; 398696ab2d3SArnaldo Carvalho de Melo twdr->twcal_timer.expires = twdr->twcal_jiffie + 399696ab2d3SArnaldo Carvalho de Melo (slot << INET_TWDR_RECYCLE_TICK); 400696ab2d3SArnaldo Carvalho de Melo add_timer(&twdr->twcal_timer); 401696ab2d3SArnaldo Carvalho de Melo } else { 402696ab2d3SArnaldo Carvalho de Melo if (time_after(twdr->twcal_timer.expires, 403696ab2d3SArnaldo Carvalho de Melo jiffies + (slot << INET_TWDR_RECYCLE_TICK))) 404696ab2d3SArnaldo Carvalho de Melo mod_timer(&twdr->twcal_timer, 405696ab2d3SArnaldo Carvalho de Melo jiffies + (slot << INET_TWDR_RECYCLE_TICK)); 406696ab2d3SArnaldo Carvalho de Melo slot = (twdr->twcal_hand + slot) & (INET_TWDR_RECYCLE_SLOTS - 1); 407696ab2d3SArnaldo Carvalho de Melo } 408696ab2d3SArnaldo Carvalho de Melo list = &twdr->twcal_row[slot]; 409696ab2d3SArnaldo Carvalho de Melo } 410696ab2d3SArnaldo Carvalho de Melo 411696ab2d3SArnaldo Carvalho de Melo hlist_add_head(&tw->tw_death_node, list); 412696ab2d3SArnaldo Carvalho de Melo 413696ab2d3SArnaldo Carvalho de Melo if (twdr->tw_count++ == 0) 414696ab2d3SArnaldo Carvalho de Melo mod_timer(&twdr->tw_timer, jiffies + twdr->period); 415696ab2d3SArnaldo Carvalho de Melo spin_unlock(&twdr->death_lock); 416696ab2d3SArnaldo Carvalho de Melo } 417696ab2d3SArnaldo Carvalho de Melo EXPORT_SYMBOL_GPL(inet_twsk_schedule); 418696ab2d3SArnaldo Carvalho de Melo 419696ab2d3SArnaldo Carvalho de Melo void inet_twdr_twcal_tick(unsigned long data) 420696ab2d3SArnaldo Carvalho de Melo { 421696ab2d3SArnaldo Carvalho de Melo struct inet_timewait_death_row *twdr; 422696ab2d3SArnaldo Carvalho de Melo int n, slot; 423696ab2d3SArnaldo Carvalho de Melo unsigned long j; 424696ab2d3SArnaldo Carvalho de Melo unsigned long now = jiffies; 425696ab2d3SArnaldo Carvalho de Melo int killed = 0; 426696ab2d3SArnaldo Carvalho de Melo int adv = 0; 427696ab2d3SArnaldo Carvalho de Melo 428696ab2d3SArnaldo Carvalho de Melo twdr = (struct inet_timewait_death_row *)data; 429696ab2d3SArnaldo Carvalho de Melo 430696ab2d3SArnaldo Carvalho de Melo spin_lock(&twdr->death_lock); 431696ab2d3SArnaldo Carvalho de Melo if (twdr->twcal_hand < 0) 432696ab2d3SArnaldo Carvalho de Melo goto out; 433696ab2d3SArnaldo Carvalho de Melo 434696ab2d3SArnaldo Carvalho de Melo slot = twdr->twcal_hand; 435696ab2d3SArnaldo Carvalho de Melo j = twdr->twcal_jiffie; 436696ab2d3SArnaldo Carvalho de Melo 437696ab2d3SArnaldo Carvalho de Melo for (n = 0; n < INET_TWDR_RECYCLE_SLOTS; n++) { 438696ab2d3SArnaldo Carvalho de Melo if (time_before_eq(j, now)) { 439b67bfe0dSSasha Levin struct hlist_node *safe; 440696ab2d3SArnaldo Carvalho de Melo struct inet_timewait_sock *tw; 441696ab2d3SArnaldo Carvalho de Melo 442b67bfe0dSSasha Levin inet_twsk_for_each_inmate_safe(tw, safe, 443696ab2d3SArnaldo Carvalho de Melo &twdr->twcal_row[slot]) { 444696ab2d3SArnaldo Carvalho de Melo __inet_twsk_del_dead_node(tw); 445696ab2d3SArnaldo Carvalho de Melo __inet_twsk_kill(tw, twdr->hashinfo); 446f2bf415cSPavel Emelyanov #ifdef CONFIG_NET_NS 447f2bf415cSPavel Emelyanov NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED); 448f2bf415cSPavel Emelyanov #endif 449696ab2d3SArnaldo Carvalho de Melo inet_twsk_put(tw); 450696ab2d3SArnaldo Carvalho de Melo killed++; 451696ab2d3SArnaldo Carvalho de Melo } 452696ab2d3SArnaldo Carvalho de Melo } else { 453696ab2d3SArnaldo Carvalho de Melo if (!adv) { 454696ab2d3SArnaldo Carvalho de Melo adv = 1; 455696ab2d3SArnaldo Carvalho de Melo twdr->twcal_jiffie = j; 456696ab2d3SArnaldo Carvalho de Melo twdr->twcal_hand = slot; 457696ab2d3SArnaldo Carvalho de Melo } 458696ab2d3SArnaldo Carvalho de Melo 459696ab2d3SArnaldo Carvalho de Melo if (!hlist_empty(&twdr->twcal_row[slot])) { 460696ab2d3SArnaldo Carvalho de Melo mod_timer(&twdr->twcal_timer, j); 461696ab2d3SArnaldo Carvalho de Melo goto out; 462696ab2d3SArnaldo Carvalho de Melo } 463696ab2d3SArnaldo Carvalho de Melo } 464696ab2d3SArnaldo Carvalho de Melo j += 1 << INET_TWDR_RECYCLE_TICK; 465696ab2d3SArnaldo Carvalho de Melo slot = (slot + 1) & (INET_TWDR_RECYCLE_SLOTS - 1); 466696ab2d3SArnaldo Carvalho de Melo } 467696ab2d3SArnaldo Carvalho de Melo twdr->twcal_hand = -1; 468696ab2d3SArnaldo Carvalho de Melo 469696ab2d3SArnaldo Carvalho de Melo out: 470696ab2d3SArnaldo Carvalho de Melo if ((twdr->tw_count -= killed) == 0) 471696ab2d3SArnaldo Carvalho de Melo del_timer(&twdr->tw_timer); 472f2bf415cSPavel Emelyanov #ifndef CONFIG_NET_NS 473f2bf415cSPavel Emelyanov NET_ADD_STATS_BH(&init_net, LINUX_MIB_TIMEWAITKILLED, killed); 474f2bf415cSPavel Emelyanov #endif 475696ab2d3SArnaldo Carvalho de Melo spin_unlock(&twdr->death_lock); 476696ab2d3SArnaldo Carvalho de Melo } 477696ab2d3SArnaldo Carvalho de Melo EXPORT_SYMBOL_GPL(inet_twdr_twcal_tick); 478d315492bSDaniel Lezcano 479b099ce26SEric W. Biederman void inet_twsk_purge(struct inet_hashinfo *hashinfo, 480d315492bSDaniel Lezcano struct inet_timewait_death_row *twdr, int family) 481d315492bSDaniel Lezcano { 482d315492bSDaniel Lezcano struct inet_timewait_sock *tw; 483d315492bSDaniel Lezcano struct sock *sk; 4843ab5aee7SEric Dumazet struct hlist_nulls_node *node; 485575f4cd5SEric W. Biederman unsigned int slot; 486d315492bSDaniel Lezcano 487575f4cd5SEric W. Biederman for (slot = 0; slot <= hashinfo->ehash_mask; slot++) { 488575f4cd5SEric W. Biederman struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; 489575f4cd5SEric W. Biederman restart_rcu: 490738e6d30SEric Dumazet cond_resched(); 491575f4cd5SEric W. Biederman rcu_read_lock(); 492d315492bSDaniel Lezcano restart: 49305dbc7b5SEric Dumazet sk_nulls_for_each_rcu(sk, node, &head->chain) { 49405dbc7b5SEric Dumazet if (sk->sk_state != TCP_TIME_WAIT) 49505dbc7b5SEric Dumazet continue; 496d315492bSDaniel Lezcano tw = inet_twsk(sk); 497b099ce26SEric W. Biederman if ((tw->tw_family != family) || 498b099ce26SEric W. Biederman atomic_read(&twsk_net(tw)->count)) 499d315492bSDaniel Lezcano continue; 500d315492bSDaniel Lezcano 501575f4cd5SEric W. Biederman if (unlikely(!atomic_inc_not_zero(&tw->tw_refcnt))) 502575f4cd5SEric W. Biederman continue; 503d315492bSDaniel Lezcano 504b099ce26SEric W. Biederman if (unlikely((tw->tw_family != family) || 505b099ce26SEric W. Biederman atomic_read(&twsk_net(tw)->count))) { 506575f4cd5SEric W. Biederman inet_twsk_put(tw); 507d315492bSDaniel Lezcano goto restart; 508d315492bSDaniel Lezcano } 509575f4cd5SEric W. Biederman 510575f4cd5SEric W. Biederman rcu_read_unlock(); 51191035f0bSEric Dumazet local_bh_disable(); 512575f4cd5SEric W. Biederman inet_twsk_deschedule(tw, twdr); 51391035f0bSEric Dumazet local_bh_enable(); 514575f4cd5SEric W. Biederman inet_twsk_put(tw); 515575f4cd5SEric W. Biederman goto restart_rcu; 516d315492bSDaniel Lezcano } 517575f4cd5SEric W. Biederman /* If the nulls value we got at the end of this lookup is 518575f4cd5SEric W. Biederman * not the expected one, we must restart lookup. 519575f4cd5SEric W. Biederman * We probably met an item that was moved to another chain. 520575f4cd5SEric W. Biederman */ 521575f4cd5SEric W. Biederman if (get_nulls_value(node) != slot) 522575f4cd5SEric W. Biederman goto restart; 523575f4cd5SEric W. Biederman rcu_read_unlock(); 524575f4cd5SEric W. Biederman } 525d315492bSDaniel Lezcano } 526d315492bSDaniel Lezcano EXPORT_SYMBOL_GPL(inet_twsk_purge); 527