12874c5fdSThomas Gleixner // SPDX-License-Identifier: GPL-2.0-or-later 277d8bf9cSArnaldo Carvalho de Melo /* 377d8bf9cSArnaldo Carvalho de Melo * INET An implementation of the TCP/IP protocol suite for the LINUX 477d8bf9cSArnaldo Carvalho de Melo * operating system. INET is implemented using the BSD Socket 577d8bf9cSArnaldo Carvalho de Melo * interface as the means of communication with the user level. 677d8bf9cSArnaldo Carvalho de Melo * 777d8bf9cSArnaldo Carvalho de Melo * Generic INET transport hashtables 877d8bf9cSArnaldo Carvalho de Melo * 977d8bf9cSArnaldo Carvalho de Melo * Authors: Lotsa people, from code originally in tcp 1077d8bf9cSArnaldo Carvalho de Melo */ 1177d8bf9cSArnaldo Carvalho de Melo 122d8c4ce5SArnaldo Carvalho de Melo #include <linux/module.h> 13a7f5e7f1SArnaldo Carvalho de Melo #include <linux/random.h> 14f3f05f70SArnaldo Carvalho de Melo #include <linux/sched.h> 1577d8bf9cSArnaldo Carvalho de Melo #include <linux/slab.h> 16f3f05f70SArnaldo Carvalho de Melo #include <linux/wait.h> 17095dc8e0SEric Dumazet #include <linux/vmalloc.h> 1857c8a661SMike Rapoport #include <linux/memblock.h> 1977d8bf9cSArnaldo Carvalho de Melo 20c125e80bSCraig Gallek #include <net/addrconf.h> 21463c84b9SArnaldo Carvalho de Melo #include <net/inet_connection_sock.h> 2277d8bf9cSArnaldo Carvalho de Melo #include <net/inet_hashtables.h> 2301770a16SRicardo Dias #if IS_ENABLED(CONFIG_IPV6) 2401770a16SRicardo Dias #include <net/inet6_hashtables.h> 2501770a16SRicardo Dias #endif 266e5714eaSDavid S. Miller #include <net/secure_seq.h> 27a7f5e7f1SArnaldo Carvalho de Melo #include <net/ip.h> 28a04a480dSDavid Ahern #include <net/tcp.h> 29c125e80bSCraig Gallek #include <net/sock_reuseport.h> 3077d8bf9cSArnaldo Carvalho de Melo 316eada011SEric Dumazet static u32 inet_ehashfn(const struct net *net, const __be32 laddr, 3265cd8033SHannes Frederic Sowa const __u16 lport, const __be32 faddr, 3365cd8033SHannes Frederic Sowa const __be16 fport) 3465cd8033SHannes Frederic Sowa { 351bbdceefSHannes Frederic Sowa static u32 inet_ehash_secret __read_mostly; 361bbdceefSHannes Frederic Sowa 371bbdceefSHannes Frederic Sowa net_get_random_once(&inet_ehash_secret, sizeof(inet_ehash_secret)); 381bbdceefSHannes Frederic Sowa 3965cd8033SHannes Frederic Sowa return __inet_ehashfn(laddr, lport, faddr, fport, 4065cd8033SHannes Frederic Sowa inet_ehash_secret + net_hash_mix(net)); 4165cd8033SHannes Frederic Sowa } 4265cd8033SHannes Frederic Sowa 43d1e559d0SEric Dumazet /* This function handles inet_sock, but also timewait and request sockets 44d1e559d0SEric Dumazet * for IPv4/IPv6. 45d1e559d0SEric Dumazet */ 46784c372aSEric Dumazet static u32 sk_ehashfn(const struct sock *sk) 4765cd8033SHannes Frederic Sowa { 48d1e559d0SEric Dumazet #if IS_ENABLED(CONFIG_IPV6) 49d1e559d0SEric Dumazet if (sk->sk_family == AF_INET6 && 50d1e559d0SEric Dumazet !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) 51d1e559d0SEric Dumazet return inet6_ehashfn(sock_net(sk), 52d1e559d0SEric Dumazet &sk->sk_v6_rcv_saddr, sk->sk_num, 53d1e559d0SEric Dumazet &sk->sk_v6_daddr, sk->sk_dport); 54d1e559d0SEric Dumazet #endif 555b441f76SEric Dumazet return inet_ehashfn(sock_net(sk), 565b441f76SEric Dumazet sk->sk_rcv_saddr, sk->sk_num, 575b441f76SEric Dumazet sk->sk_daddr, sk->sk_dport); 5865cd8033SHannes Frederic Sowa } 5965cd8033SHannes Frederic Sowa 6077d8bf9cSArnaldo Carvalho de Melo /* 6177d8bf9cSArnaldo Carvalho de Melo * Allocate and initialize a new local port bind bucket. 6277d8bf9cSArnaldo Carvalho de Melo * The bindhash mutex for snum's hash chain must be held here. 6377d8bf9cSArnaldo Carvalho de Melo */ 64e18b890bSChristoph Lameter struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, 65941b1d22SPavel Emelyanov struct net *net, 6677d8bf9cSArnaldo Carvalho de Melo struct inet_bind_hashbucket *head, 673c82a21fSRobert Shearman const unsigned short snum, 683c82a21fSRobert Shearman int l3mdev) 6977d8bf9cSArnaldo Carvalho de Melo { 7054e6ecb2SChristoph Lameter struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC); 7177d8bf9cSArnaldo Carvalho de Melo 7200db4124SIan Morris if (tb) { 73efd7ef1cSEric W. Biederman write_pnet(&tb->ib_net, net); 743c82a21fSRobert Shearman tb->l3mdev = l3mdev; 7577d8bf9cSArnaldo Carvalho de Melo tb->port = snum; 7677d8bf9cSArnaldo Carvalho de Melo tb->fastreuse = 0; 77da5e3630STom Herbert tb->fastreuseport = 0; 7877d8bf9cSArnaldo Carvalho de Melo INIT_HLIST_HEAD(&tb->owners); 7977d8bf9cSArnaldo Carvalho de Melo hlist_add_head(&tb->node, &head->chain); 8077d8bf9cSArnaldo Carvalho de Melo } 8177d8bf9cSArnaldo Carvalho de Melo return tb; 8277d8bf9cSArnaldo Carvalho de Melo } 8377d8bf9cSArnaldo Carvalho de Melo 8477d8bf9cSArnaldo Carvalho de Melo /* 8577d8bf9cSArnaldo Carvalho de Melo * Caller must hold hashbucket lock for this tb with local BH disabled 8677d8bf9cSArnaldo Carvalho de Melo */ 87e18b890bSChristoph Lameter void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb) 8877d8bf9cSArnaldo Carvalho de Melo { 8977d8bf9cSArnaldo Carvalho de Melo if (hlist_empty(&tb->owners)) { 9077d8bf9cSArnaldo Carvalho de Melo __hlist_del(&tb->node); 9177d8bf9cSArnaldo Carvalho de Melo kmem_cache_free(cachep, tb); 9277d8bf9cSArnaldo Carvalho de Melo } 9377d8bf9cSArnaldo Carvalho de Melo } 942d8c4ce5SArnaldo Carvalho de Melo 952d8c4ce5SArnaldo Carvalho de Melo void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, 962d8c4ce5SArnaldo Carvalho de Melo const unsigned short snum) 972d8c4ce5SArnaldo Carvalho de Melo { 98c720c7e8SEric Dumazet inet_sk(sk)->inet_num = snum; 992d8c4ce5SArnaldo Carvalho de Melo sk_add_bind_node(sk, &tb->owners); 100463c84b9SArnaldo Carvalho de Melo inet_csk(sk)->icsk_bind_hash = tb; 1012d8c4ce5SArnaldo Carvalho de Melo } 1022d8c4ce5SArnaldo Carvalho de Melo 1032d8c4ce5SArnaldo Carvalho de Melo /* 1042d8c4ce5SArnaldo Carvalho de Melo * Get rid of any references to a local port held by the given sock. 1052d8c4ce5SArnaldo Carvalho de Melo */ 106ab1e0a13SArnaldo Carvalho de Melo static void __inet_put_port(struct sock *sk) 1072d8c4ce5SArnaldo Carvalho de Melo { 10839d8cda7SPavel Emelyanov struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; 109c720c7e8SEric Dumazet const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->inet_num, 1107f635ab7SPavel Emelyanov hashinfo->bhash_size); 1112d8c4ce5SArnaldo Carvalho de Melo struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash]; 1122d8c4ce5SArnaldo Carvalho de Melo struct inet_bind_bucket *tb; 1132d8c4ce5SArnaldo Carvalho de Melo 1142d8c4ce5SArnaldo Carvalho de Melo spin_lock(&head->lock); 115463c84b9SArnaldo Carvalho de Melo tb = inet_csk(sk)->icsk_bind_hash; 1162d8c4ce5SArnaldo Carvalho de Melo __sk_del_bind_node(sk); 117463c84b9SArnaldo Carvalho de Melo inet_csk(sk)->icsk_bind_hash = NULL; 118c720c7e8SEric Dumazet inet_sk(sk)->inet_num = 0; 1192d8c4ce5SArnaldo Carvalho de Melo inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); 1202d8c4ce5SArnaldo Carvalho de Melo spin_unlock(&head->lock); 1212d8c4ce5SArnaldo Carvalho de Melo } 1222d8c4ce5SArnaldo Carvalho de Melo 123ab1e0a13SArnaldo Carvalho de Melo void inet_put_port(struct sock *sk) 1242d8c4ce5SArnaldo Carvalho de Melo { 1252d8c4ce5SArnaldo Carvalho de Melo local_bh_disable(); 126ab1e0a13SArnaldo Carvalho de Melo __inet_put_port(sk); 1272d8c4ce5SArnaldo Carvalho de Melo local_bh_enable(); 1282d8c4ce5SArnaldo Carvalho de Melo } 1292d8c4ce5SArnaldo Carvalho de Melo EXPORT_SYMBOL(inet_put_port); 130f3f05f70SArnaldo Carvalho de Melo 1311ce31c9eSEric Dumazet int __inet_inherit_port(const struct sock *sk, struct sock *child) 13253083773SPavel Emelyanov { 13353083773SPavel Emelyanov struct inet_hashinfo *table = sk->sk_prot->h.hashinfo; 134093d2823SBalazs Scheidler unsigned short port = inet_sk(child)->inet_num; 135093d2823SBalazs Scheidler const int bhash = inet_bhashfn(sock_net(sk), port, 1367f635ab7SPavel Emelyanov table->bhash_size); 13753083773SPavel Emelyanov struct inet_bind_hashbucket *head = &table->bhash[bhash]; 13853083773SPavel Emelyanov struct inet_bind_bucket *tb; 1393c82a21fSRobert Shearman int l3mdev; 14053083773SPavel Emelyanov 14153083773SPavel Emelyanov spin_lock(&head->lock); 14253083773SPavel Emelyanov tb = inet_csk(sk)->icsk_bind_hash; 143c2f34a65SEric Dumazet if (unlikely(!tb)) { 144c2f34a65SEric Dumazet spin_unlock(&head->lock); 145c2f34a65SEric Dumazet return -ENOENT; 146c2f34a65SEric Dumazet } 147093d2823SBalazs Scheidler if (tb->port != port) { 1483c82a21fSRobert Shearman l3mdev = inet_sk_bound_l3mdev(sk); 1493c82a21fSRobert Shearman 150093d2823SBalazs Scheidler /* NOTE: using tproxy and redirecting skbs to a proxy 151093d2823SBalazs Scheidler * on a different listener port breaks the assumption 152093d2823SBalazs Scheidler * that the listener socket's icsk_bind_hash is the same 153093d2823SBalazs Scheidler * as that of the child socket. We have to look up or 154093d2823SBalazs Scheidler * create a new bind bucket for the child here. */ 155b67bfe0dSSasha Levin inet_bind_bucket_for_each(tb, &head->chain) { 156093d2823SBalazs Scheidler if (net_eq(ib_net(tb), sock_net(sk)) && 1573c82a21fSRobert Shearman tb->l3mdev == l3mdev && tb->port == port) 158093d2823SBalazs Scheidler break; 159093d2823SBalazs Scheidler } 160b67bfe0dSSasha Levin if (!tb) { 161093d2823SBalazs Scheidler tb = inet_bind_bucket_create(table->bind_bucket_cachep, 1623c82a21fSRobert Shearman sock_net(sk), head, port, 1633c82a21fSRobert Shearman l3mdev); 164093d2823SBalazs Scheidler if (!tb) { 165093d2823SBalazs Scheidler spin_unlock(&head->lock); 166093d2823SBalazs Scheidler return -ENOMEM; 167093d2823SBalazs Scheidler } 168093d2823SBalazs Scheidler } 169d76f3351STim Froidcoeur inet_csk_update_fastreuse(tb, child); 170093d2823SBalazs Scheidler } 171b4ff3c90SNagendra Tomar inet_bind_hash(child, tb, port); 17253083773SPavel Emelyanov spin_unlock(&head->lock); 173093d2823SBalazs Scheidler 174093d2823SBalazs Scheidler return 0; 17553083773SPavel Emelyanov } 17653083773SPavel Emelyanov EXPORT_SYMBOL_GPL(__inet_inherit_port); 17753083773SPavel Emelyanov 17861b7c691SMartin KaFai Lau static struct inet_listen_hashbucket * 17961b7c691SMartin KaFai Lau inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk) 18061b7c691SMartin KaFai Lau { 18161b7c691SMartin KaFai Lau u32 hash; 18261b7c691SMartin KaFai Lau 18361b7c691SMartin KaFai Lau #if IS_ENABLED(CONFIG_IPV6) 18461b7c691SMartin KaFai Lau if (sk->sk_family == AF_INET6) 18561b7c691SMartin KaFai Lau hash = ipv6_portaddr_hash(sock_net(sk), 18661b7c691SMartin KaFai Lau &sk->sk_v6_rcv_saddr, 18761b7c691SMartin KaFai Lau inet_sk(sk)->inet_num); 18861b7c691SMartin KaFai Lau else 18961b7c691SMartin KaFai Lau #endif 19061b7c691SMartin KaFai Lau hash = ipv4_portaddr_hash(sock_net(sk), 19161b7c691SMartin KaFai Lau inet_sk(sk)->inet_rcv_saddr, 19261b7c691SMartin KaFai Lau inet_sk(sk)->inet_num); 19361b7c691SMartin KaFai Lau return inet_lhash2_bucket(h, hash); 19461b7c691SMartin KaFai Lau } 19561b7c691SMartin KaFai Lau 19661b7c691SMartin KaFai Lau static void inet_hash2(struct inet_hashinfo *h, struct sock *sk) 19761b7c691SMartin KaFai Lau { 19861b7c691SMartin KaFai Lau struct inet_listen_hashbucket *ilb2; 19961b7c691SMartin KaFai Lau 20061b7c691SMartin KaFai Lau if (!h->lhash2) 20161b7c691SMartin KaFai Lau return; 20261b7c691SMartin KaFai Lau 20361b7c691SMartin KaFai Lau ilb2 = inet_lhash2_bucket_sk(h, sk); 20461b7c691SMartin KaFai Lau 20561b7c691SMartin KaFai Lau spin_lock(&ilb2->lock); 20661b7c691SMartin KaFai Lau if (sk->sk_reuseport && sk->sk_family == AF_INET6) 20761b7c691SMartin KaFai Lau hlist_add_tail_rcu(&inet_csk(sk)->icsk_listen_portaddr_node, 20861b7c691SMartin KaFai Lau &ilb2->head); 20961b7c691SMartin KaFai Lau else 21061b7c691SMartin KaFai Lau hlist_add_head_rcu(&inet_csk(sk)->icsk_listen_portaddr_node, 21161b7c691SMartin KaFai Lau &ilb2->head); 21261b7c691SMartin KaFai Lau ilb2->count++; 21361b7c691SMartin KaFai Lau spin_unlock(&ilb2->lock); 21461b7c691SMartin KaFai Lau } 21561b7c691SMartin KaFai Lau 21661b7c691SMartin KaFai Lau static void inet_unhash2(struct inet_hashinfo *h, struct sock *sk) 21761b7c691SMartin KaFai Lau { 21861b7c691SMartin KaFai Lau struct inet_listen_hashbucket *ilb2; 21961b7c691SMartin KaFai Lau 22061b7c691SMartin KaFai Lau if (!h->lhash2 || 22161b7c691SMartin KaFai Lau WARN_ON_ONCE(hlist_unhashed(&inet_csk(sk)->icsk_listen_portaddr_node))) 22261b7c691SMartin KaFai Lau return; 22361b7c691SMartin KaFai Lau 22461b7c691SMartin KaFai Lau ilb2 = inet_lhash2_bucket_sk(h, sk); 22561b7c691SMartin KaFai Lau 22661b7c691SMartin KaFai Lau spin_lock(&ilb2->lock); 22761b7c691SMartin KaFai Lau hlist_del_init_rcu(&inet_csk(sk)->icsk_listen_portaddr_node); 22861b7c691SMartin KaFai Lau ilb2->count--; 22961b7c691SMartin KaFai Lau spin_unlock(&ilb2->lock); 23061b7c691SMartin KaFai Lau } 23161b7c691SMartin KaFai Lau 232c25eb3bfSEric Dumazet static inline int compute_score(struct sock *sk, struct net *net, 233c25eb3bfSEric Dumazet const unsigned short hnum, const __be32 daddr, 23434e1ec31SMiaohe Lin const int dif, const int sdif) 235c25eb3bfSEric Dumazet { 236c25eb3bfSEric Dumazet int score = -1; 237c25eb3bfSEric Dumazet 238d9fbc7f6SPeter Oskolkov if (net_eq(sock_net(sk), net) && sk->sk_num == hnum && 239c25eb3bfSEric Dumazet !ipv6_only_sock(sk)) { 240d9fbc7f6SPeter Oskolkov if (sk->sk_rcv_saddr != daddr) 241c25eb3bfSEric Dumazet return -1; 242e7819058SMike Manning 243d9fbc7f6SPeter Oskolkov if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) 244d9fbc7f6SPeter Oskolkov return -1; 2458d6c414cSMike Manning score = sk->sk_bound_dev_if ? 2 : 1; 246d9fbc7f6SPeter Oskolkov 2478d6c414cSMike Manning if (sk->sk_family == PF_INET) 2488d6c414cSMike Manning score++; 2497170a977SEric Dumazet if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) 25070da268bSEric Dumazet score++; 251c25eb3bfSEric Dumazet } 252c25eb3bfSEric Dumazet return score; 253c25eb3bfSEric Dumazet } 254c25eb3bfSEric Dumazet 25580b373f7SJakub Sitnicki static inline struct sock *lookup_reuseport(struct net *net, struct sock *sk, 25680b373f7SJakub Sitnicki struct sk_buff *skb, int doff, 25780b373f7SJakub Sitnicki __be32 saddr, __be16 sport, 25880b373f7SJakub Sitnicki __be32 daddr, unsigned short hnum) 25980b373f7SJakub Sitnicki { 26080b373f7SJakub Sitnicki struct sock *reuse_sk = NULL; 26180b373f7SJakub Sitnicki u32 phash; 26280b373f7SJakub Sitnicki 26380b373f7SJakub Sitnicki if (sk->sk_reuseport) { 26480b373f7SJakub Sitnicki phash = inet_ehashfn(net, daddr, hnum, saddr, sport); 26580b373f7SJakub Sitnicki reuse_sk = reuseport_select_sock(sk, phash, skb, doff); 26680b373f7SJakub Sitnicki } 26780b373f7SJakub Sitnicki return reuse_sk; 26880b373f7SJakub Sitnicki } 26980b373f7SJakub Sitnicki 270f3f05f70SArnaldo Carvalho de Melo /* 2713b24d854SEric Dumazet * Here are some nice properties to exploit here. The BSD API 2723b24d854SEric Dumazet * does not allow a listening sock to specify the remote port nor the 27333b62231SArnaldo Carvalho de Melo * remote address for the connection. So always assume those are both 27433b62231SArnaldo Carvalho de Melo * wildcarded during the search since they can never be otherwise. 27533b62231SArnaldo Carvalho de Melo */ 27633b62231SArnaldo Carvalho de Melo 2773b24d854SEric Dumazet /* called with rcu_read_lock() : No refcount taken on the socket */ 27861b7c691SMartin KaFai Lau static struct sock *inet_lhash2_lookup(struct net *net, 27961b7c691SMartin KaFai Lau struct inet_listen_hashbucket *ilb2, 28061b7c691SMartin KaFai Lau struct sk_buff *skb, int doff, 28161b7c691SMartin KaFai Lau const __be32 saddr, __be16 sport, 28261b7c691SMartin KaFai Lau const __be32 daddr, const unsigned short hnum, 28361b7c691SMartin KaFai Lau const int dif, const int sdif) 28461b7c691SMartin KaFai Lau { 28561b7c691SMartin KaFai Lau struct inet_connection_sock *icsk; 28661b7c691SMartin KaFai Lau struct sock *sk, *result = NULL; 28761b7c691SMartin KaFai Lau int score, hiscore = 0; 28861b7c691SMartin KaFai Lau 28961b7c691SMartin KaFai Lau inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) { 29061b7c691SMartin KaFai Lau sk = (struct sock *)icsk; 29134e1ec31SMiaohe Lin score = compute_score(sk, net, hnum, daddr, dif, sdif); 29261b7c691SMartin KaFai Lau if (score > hiscore) { 29380b373f7SJakub Sitnicki result = lookup_reuseport(net, sk, skb, doff, 29480b373f7SJakub Sitnicki saddr, sport, daddr, hnum); 29561b7c691SMartin KaFai Lau if (result) 29661b7c691SMartin KaFai Lau return result; 29780b373f7SJakub Sitnicki 29861b7c691SMartin KaFai Lau result = sk; 29961b7c691SMartin KaFai Lau hiscore = score; 30061b7c691SMartin KaFai Lau } 30161b7c691SMartin KaFai Lau } 30261b7c691SMartin KaFai Lau 30361b7c691SMartin KaFai Lau return result; 30461b7c691SMartin KaFai Lau } 30561b7c691SMartin KaFai Lau 3061559b4aaSJakub Sitnicki static inline struct sock *inet_lookup_run_bpf(struct net *net, 3071559b4aaSJakub Sitnicki struct inet_hashinfo *hashinfo, 3081559b4aaSJakub Sitnicki struct sk_buff *skb, int doff, 3091559b4aaSJakub Sitnicki __be32 saddr, __be16 sport, 310f8931565SMark Pashmfouroush __be32 daddr, u16 hnum, const int dif) 3111559b4aaSJakub Sitnicki { 3121559b4aaSJakub Sitnicki struct sock *sk, *reuse_sk; 3131559b4aaSJakub Sitnicki bool no_reuseport; 3141559b4aaSJakub Sitnicki 3151559b4aaSJakub Sitnicki if (hashinfo != &tcp_hashinfo) 3161559b4aaSJakub Sitnicki return NULL; /* only TCP is supported */ 3171559b4aaSJakub Sitnicki 318f8931565SMark Pashmfouroush no_reuseport = bpf_sk_lookup_run_v4(net, IPPROTO_TCP, saddr, sport, 319f8931565SMark Pashmfouroush daddr, hnum, dif, &sk); 3201559b4aaSJakub Sitnicki if (no_reuseport || IS_ERR_OR_NULL(sk)) 3211559b4aaSJakub Sitnicki return sk; 3221559b4aaSJakub Sitnicki 3231559b4aaSJakub Sitnicki reuse_sk = lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum); 3241559b4aaSJakub Sitnicki if (reuse_sk) 3251559b4aaSJakub Sitnicki sk = reuse_sk; 3261559b4aaSJakub Sitnicki return sk; 3271559b4aaSJakub Sitnicki } 3281559b4aaSJakub Sitnicki 329c67499c0SPavel Emelyanov struct sock *__inet_lookup_listener(struct net *net, 330c67499c0SPavel Emelyanov struct inet_hashinfo *hashinfo, 331a583636aSCraig Gallek struct sk_buff *skb, int doff, 332da5e3630STom Herbert const __be32 saddr, __be16 sport, 333fb99c848SAl Viro const __be32 daddr, const unsigned short hnum, 3343fa6f616SDavid Ahern const int dif, const int sdif) 33599a92ff5SHerbert Xu { 33661b7c691SMartin KaFai Lau struct inet_listen_hashbucket *ilb2; 337d9fbc7f6SPeter Oskolkov struct sock *result = NULL; 33861b7c691SMartin KaFai Lau unsigned int hash2; 33961b7c691SMartin KaFai Lau 3401559b4aaSJakub Sitnicki /* Lookup redirect from BPF */ 3411559b4aaSJakub Sitnicki if (static_branch_unlikely(&bpf_sk_lookup_enabled)) { 3421559b4aaSJakub Sitnicki result = inet_lookup_run_bpf(net, hashinfo, skb, doff, 343f8931565SMark Pashmfouroush saddr, sport, daddr, hnum, dif); 3441559b4aaSJakub Sitnicki if (result) 3451559b4aaSJakub Sitnicki goto done; 3461559b4aaSJakub Sitnicki } 3471559b4aaSJakub Sitnicki 34861b7c691SMartin KaFai Lau hash2 = ipv4_portaddr_hash(net, daddr, hnum); 34961b7c691SMartin KaFai Lau ilb2 = inet_lhash2_bucket(hashinfo, hash2); 35061b7c691SMartin KaFai Lau 35161b7c691SMartin KaFai Lau result = inet_lhash2_lookup(net, ilb2, skb, doff, 35261b7c691SMartin KaFai Lau saddr, sport, daddr, hnum, 35361b7c691SMartin KaFai Lau dif, sdif); 35461b7c691SMartin KaFai Lau if (result) 3558217ca65SMartin KaFai Lau goto done; 35661b7c691SMartin KaFai Lau 35761b7c691SMartin KaFai Lau /* Lookup lhash2 with INADDR_ANY */ 35861b7c691SMartin KaFai Lau hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum); 35961b7c691SMartin KaFai Lau ilb2 = inet_lhash2_bucket(hashinfo, hash2); 36061b7c691SMartin KaFai Lau 3618217ca65SMartin KaFai Lau result = inet_lhash2_lookup(net, ilb2, skb, doff, 362d9fbc7f6SPeter Oskolkov saddr, sport, htonl(INADDR_ANY), hnum, 36361b7c691SMartin KaFai Lau dif, sdif); 3648217ca65SMartin KaFai Lau done: 36588e235b8SEnrico Weigelt if (IS_ERR(result)) 3668217ca65SMartin KaFai Lau return NULL; 367c25eb3bfSEric Dumazet return result; 36899a92ff5SHerbert Xu } 3698f491069SHerbert Xu EXPORT_SYMBOL_GPL(__inet_lookup_listener); 370a7f5e7f1SArnaldo Carvalho de Melo 37105dbc7b5SEric Dumazet /* All sockets share common refcount, but have different destructors */ 37205dbc7b5SEric Dumazet void sock_gen_put(struct sock *sk) 37305dbc7b5SEric Dumazet { 37441c6d650SReshetova, Elena if (!refcount_dec_and_test(&sk->sk_refcnt)) 37505dbc7b5SEric Dumazet return; 37605dbc7b5SEric Dumazet 37705dbc7b5SEric Dumazet if (sk->sk_state == TCP_TIME_WAIT) 37805dbc7b5SEric Dumazet inet_twsk_free(inet_twsk(sk)); 37941b822c5SEric Dumazet else if (sk->sk_state == TCP_NEW_SYN_RECV) 38041b822c5SEric Dumazet reqsk_free(inet_reqsk(sk)); 38105dbc7b5SEric Dumazet else 38205dbc7b5SEric Dumazet sk_free(sk); 38305dbc7b5SEric Dumazet } 38405dbc7b5SEric Dumazet EXPORT_SYMBOL_GPL(sock_gen_put); 38505dbc7b5SEric Dumazet 3862c13270bSEric Dumazet void sock_edemux(struct sk_buff *skb) 3872c13270bSEric Dumazet { 3882c13270bSEric Dumazet sock_gen_put(skb->sk); 3892c13270bSEric Dumazet } 3902c13270bSEric Dumazet EXPORT_SYMBOL(sock_edemux); 3912c13270bSEric Dumazet 392c67499c0SPavel Emelyanov struct sock *__inet_lookup_established(struct net *net, 393c67499c0SPavel Emelyanov struct inet_hashinfo *hashinfo, 39477a5ba55SPavel Emelyanov const __be32 saddr, const __be16 sport, 39577a5ba55SPavel Emelyanov const __be32 daddr, const u16 hnum, 3963fa6f616SDavid Ahern const int dif, const int sdif) 39777a5ba55SPavel Emelyanov { 398c7228317SJoe Perches INET_ADDR_COOKIE(acookie, saddr, daddr); 39977a5ba55SPavel Emelyanov const __portpair ports = INET_COMBINED_PORTS(sport, hnum); 40077a5ba55SPavel Emelyanov struct sock *sk; 4013ab5aee7SEric Dumazet const struct hlist_nulls_node *node; 40277a5ba55SPavel Emelyanov /* Optimize here for direct hit, only listening connections can 40377a5ba55SPavel Emelyanov * have wildcards anyways. 40477a5ba55SPavel Emelyanov */ 4059f26b3adSPavel Emelyanov unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport); 406f373b53bSEric Dumazet unsigned int slot = hash & hashinfo->ehash_mask; 4073ab5aee7SEric Dumazet struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; 40877a5ba55SPavel Emelyanov 4093ab5aee7SEric Dumazet begin: 4103ab5aee7SEric Dumazet sk_nulls_for_each_rcu(sk, node, &head->chain) { 411ce43b03eSEric Dumazet if (sk->sk_hash != hash) 412ce43b03eSEric Dumazet continue; 413ce43b03eSEric Dumazet if (likely(INET_MATCH(sk, net, acookie, 4143fa6f616SDavid Ahern saddr, daddr, ports, dif, sdif))) { 41541c6d650SReshetova, Elena if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) 41605dbc7b5SEric Dumazet goto out; 417ce43b03eSEric Dumazet if (unlikely(!INET_MATCH(sk, net, acookie, 4183fa6f616SDavid Ahern saddr, daddr, ports, 4193fa6f616SDavid Ahern dif, sdif))) { 42005dbc7b5SEric Dumazet sock_gen_put(sk); 4213ab5aee7SEric Dumazet goto begin; 42277a5ba55SPavel Emelyanov } 42305dbc7b5SEric Dumazet goto found; 4243ab5aee7SEric Dumazet } 4253ab5aee7SEric Dumazet } 4263ab5aee7SEric Dumazet /* 4273ab5aee7SEric Dumazet * if the nulls value we got at the end of this lookup is 4283ab5aee7SEric Dumazet * not the expected one, we must restart lookup. 4293ab5aee7SEric Dumazet * We probably met an item that was moved to another chain. 4303ab5aee7SEric Dumazet */ 4313ab5aee7SEric Dumazet if (get_nulls_value(node) != slot) 4323ab5aee7SEric Dumazet goto begin; 43377a5ba55SPavel Emelyanov out: 43405dbc7b5SEric Dumazet sk = NULL; 43505dbc7b5SEric Dumazet found: 43677a5ba55SPavel Emelyanov return sk; 43777a5ba55SPavel Emelyanov } 43877a5ba55SPavel Emelyanov EXPORT_SYMBOL_GPL(__inet_lookup_established); 43977a5ba55SPavel Emelyanov 440a7f5e7f1SArnaldo Carvalho de Melo /* called with local bh disabled */ 441a7f5e7f1SArnaldo Carvalho de Melo static int __inet_check_established(struct inet_timewait_death_row *death_row, 442a7f5e7f1SArnaldo Carvalho de Melo struct sock *sk, __u16 lport, 443a7f5e7f1SArnaldo Carvalho de Melo struct inet_timewait_sock **twp) 444a7f5e7f1SArnaldo Carvalho de Melo { 445a7f5e7f1SArnaldo Carvalho de Melo struct inet_hashinfo *hinfo = death_row->hashinfo; 446a7f5e7f1SArnaldo Carvalho de Melo struct inet_sock *inet = inet_sk(sk); 447c720c7e8SEric Dumazet __be32 daddr = inet->inet_rcv_saddr; 448c720c7e8SEric Dumazet __be32 saddr = inet->inet_daddr; 449a7f5e7f1SArnaldo Carvalho de Melo int dif = sk->sk_bound_dev_if; 4503fa6f616SDavid Ahern struct net *net = sock_net(sk); 4513fa6f616SDavid Ahern int sdif = l3mdev_master_ifindex_by_index(net, dif); 452c7228317SJoe Perches INET_ADDR_COOKIE(acookie, saddr, daddr); 453c720c7e8SEric Dumazet const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport); 454c720c7e8SEric Dumazet unsigned int hash = inet_ehashfn(net, daddr, lport, 455c720c7e8SEric Dumazet saddr, inet->inet_dport); 456a7f5e7f1SArnaldo Carvalho de Melo struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); 4579db66bdcSEric Dumazet spinlock_t *lock = inet_ehash_lockp(hinfo, hash); 458a7f5e7f1SArnaldo Carvalho de Melo struct sock *sk2; 4593ab5aee7SEric Dumazet const struct hlist_nulls_node *node; 46005dbc7b5SEric Dumazet struct inet_timewait_sock *tw = NULL; 461a7f5e7f1SArnaldo Carvalho de Melo 4629db66bdcSEric Dumazet spin_lock(lock); 463a7f5e7f1SArnaldo Carvalho de Melo 4643ab5aee7SEric Dumazet sk_nulls_for_each(sk2, node, &head->chain) { 465ce43b03eSEric Dumazet if (sk2->sk_hash != hash) 466ce43b03eSEric Dumazet continue; 46705dbc7b5SEric Dumazet 468ce43b03eSEric Dumazet if (likely(INET_MATCH(sk2, net, acookie, 4693fa6f616SDavid Ahern saddr, daddr, ports, dif, sdif))) { 47005dbc7b5SEric Dumazet if (sk2->sk_state == TCP_TIME_WAIT) { 47105dbc7b5SEric Dumazet tw = inet_twsk(sk2); 47205dbc7b5SEric Dumazet if (twsk_unique(sk, sk2, twp)) 47305dbc7b5SEric Dumazet break; 47405dbc7b5SEric Dumazet } 475a7f5e7f1SArnaldo Carvalho de Melo goto not_unique; 476a7f5e7f1SArnaldo Carvalho de Melo } 47705dbc7b5SEric Dumazet } 478a7f5e7f1SArnaldo Carvalho de Melo 479a7f5e7f1SArnaldo Carvalho de Melo /* Must record num and sport now. Otherwise we will see 48005dbc7b5SEric Dumazet * in hash table socket with a funny identity. 48105dbc7b5SEric Dumazet */ 482c720c7e8SEric Dumazet inet->inet_num = lport; 483c720c7e8SEric Dumazet inet->inet_sport = htons(lport); 484a7f5e7f1SArnaldo Carvalho de Melo sk->sk_hash = hash; 485547b792cSIlpo Järvinen WARN_ON(!sk_unhashed(sk)); 4863ab5aee7SEric Dumazet __sk_nulls_add_node_rcu(sk, &head->chain); 48713475a30SEric Dumazet if (tw) { 488fc01538fSEric Dumazet sk_nulls_del_node_init_rcu((struct sock *)tw); 48902a1d6e7SEric Dumazet __NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED); 49013475a30SEric Dumazet } 4919db66bdcSEric Dumazet spin_unlock(lock); 492c29a0bc4SPavel Emelyanov sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 493a7f5e7f1SArnaldo Carvalho de Melo 494a7f5e7f1SArnaldo Carvalho de Melo if (twp) { 495a7f5e7f1SArnaldo Carvalho de Melo *twp = tw; 496a7f5e7f1SArnaldo Carvalho de Melo } else if (tw) { 497a7f5e7f1SArnaldo Carvalho de Melo /* Silly. Should hash-dance instead... */ 498dbe7faa4SEric Dumazet inet_twsk_deschedule_put(tw); 499a7f5e7f1SArnaldo Carvalho de Melo } 500a7f5e7f1SArnaldo Carvalho de Melo return 0; 501a7f5e7f1SArnaldo Carvalho de Melo 502a7f5e7f1SArnaldo Carvalho de Melo not_unique: 5039db66bdcSEric Dumazet spin_unlock(lock); 504a7f5e7f1SArnaldo Carvalho de Melo return -EADDRNOTAVAIL; 505a7f5e7f1SArnaldo Carvalho de Melo } 506a7f5e7f1SArnaldo Carvalho de Melo 507b2d05756SWilly Tarreau static u64 inet_sk_port_offset(const struct sock *sk) 508a7f5e7f1SArnaldo Carvalho de Melo { 509a7f5e7f1SArnaldo Carvalho de Melo const struct inet_sock *inet = inet_sk(sk); 510e2baad9eSEric Dumazet 511c720c7e8SEric Dumazet return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr, 512c720c7e8SEric Dumazet inet->inet_daddr, 513c720c7e8SEric Dumazet inet->inet_dport); 514a7f5e7f1SArnaldo Carvalho de Melo } 515a7f5e7f1SArnaldo Carvalho de Melo 51601770a16SRicardo Dias /* Searches for an exsiting socket in the ehash bucket list. 51701770a16SRicardo Dias * Returns true if found, false otherwise. 518079096f1SEric Dumazet */ 51901770a16SRicardo Dias static bool inet_ehash_lookup_by_sk(struct sock *sk, 52001770a16SRicardo Dias struct hlist_nulls_head *list) 52101770a16SRicardo Dias { 52201770a16SRicardo Dias const __portpair ports = INET_COMBINED_PORTS(sk->sk_dport, sk->sk_num); 52301770a16SRicardo Dias const int sdif = sk->sk_bound_dev_if; 52401770a16SRicardo Dias const int dif = sk->sk_bound_dev_if; 52501770a16SRicardo Dias const struct hlist_nulls_node *node; 52601770a16SRicardo Dias struct net *net = sock_net(sk); 52701770a16SRicardo Dias struct sock *esk; 52801770a16SRicardo Dias 52901770a16SRicardo Dias INET_ADDR_COOKIE(acookie, sk->sk_daddr, sk->sk_rcv_saddr); 53001770a16SRicardo Dias 53101770a16SRicardo Dias sk_nulls_for_each_rcu(esk, node, list) { 53201770a16SRicardo Dias if (esk->sk_hash != sk->sk_hash) 53301770a16SRicardo Dias continue; 53401770a16SRicardo Dias if (sk->sk_family == AF_INET) { 53501770a16SRicardo Dias if (unlikely(INET_MATCH(esk, net, acookie, 53601770a16SRicardo Dias sk->sk_daddr, 53701770a16SRicardo Dias sk->sk_rcv_saddr, 53801770a16SRicardo Dias ports, dif, sdif))) { 53901770a16SRicardo Dias return true; 54001770a16SRicardo Dias } 54101770a16SRicardo Dias } 54201770a16SRicardo Dias #if IS_ENABLED(CONFIG_IPV6) 54301770a16SRicardo Dias else if (sk->sk_family == AF_INET6) { 54401770a16SRicardo Dias if (unlikely(INET6_MATCH(esk, net, 54501770a16SRicardo Dias &sk->sk_v6_daddr, 54601770a16SRicardo Dias &sk->sk_v6_rcv_saddr, 54701770a16SRicardo Dias ports, dif, sdif))) { 54801770a16SRicardo Dias return true; 54901770a16SRicardo Dias } 55001770a16SRicardo Dias } 55101770a16SRicardo Dias #endif 55201770a16SRicardo Dias } 55301770a16SRicardo Dias return false; 55401770a16SRicardo Dias } 55501770a16SRicardo Dias 55601770a16SRicardo Dias /* Insert a socket into ehash, and eventually remove another one 55701770a16SRicardo Dias * (The another one can be a SYN_RECV or TIMEWAIT) 55801770a16SRicardo Dias * If an existing socket already exists, socket sk is not inserted, 55901770a16SRicardo Dias * and sets found_dup_sk parameter to true. 56001770a16SRicardo Dias */ 56101770a16SRicardo Dias bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk) 562152da81dSPavel Emelyanov { 56339d8cda7SPavel Emelyanov struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; 5643ab5aee7SEric Dumazet struct hlist_nulls_head *list; 565152da81dSPavel Emelyanov struct inet_ehash_bucket *head; 5665b441f76SEric Dumazet spinlock_t *lock; 5675e0724d0SEric Dumazet bool ret = true; 568152da81dSPavel Emelyanov 569079096f1SEric Dumazet WARN_ON_ONCE(!sk_unhashed(sk)); 570152da81dSPavel Emelyanov 5715b441f76SEric Dumazet sk->sk_hash = sk_ehashfn(sk); 572152da81dSPavel Emelyanov head = inet_ehash_bucket(hashinfo, sk->sk_hash); 573152da81dSPavel Emelyanov list = &head->chain; 574152da81dSPavel Emelyanov lock = inet_ehash_lockp(hashinfo, sk->sk_hash); 575152da81dSPavel Emelyanov 5769db66bdcSEric Dumazet spin_lock(lock); 577fc01538fSEric Dumazet if (osk) { 5785e0724d0SEric Dumazet WARN_ON_ONCE(sk->sk_hash != osk->sk_hash); 5795e0724d0SEric Dumazet ret = sk_nulls_del_node_init_rcu(osk); 58001770a16SRicardo Dias } else if (found_dup_sk) { 58101770a16SRicardo Dias *found_dup_sk = inet_ehash_lookup_by_sk(sk, list); 58201770a16SRicardo Dias if (*found_dup_sk) 58301770a16SRicardo Dias ret = false; 5849327f705SEric Dumazet } 58501770a16SRicardo Dias 5865e0724d0SEric Dumazet if (ret) 5875e0724d0SEric Dumazet __sk_nulls_add_node_rcu(sk, list); 58801770a16SRicardo Dias 5899db66bdcSEric Dumazet spin_unlock(lock); 59001770a16SRicardo Dias 591079096f1SEric Dumazet return ret; 592079096f1SEric Dumazet } 593079096f1SEric Dumazet 59401770a16SRicardo Dias bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk) 595079096f1SEric Dumazet { 59601770a16SRicardo Dias bool ok = inet_ehash_insert(sk, osk, found_dup_sk); 5975e0724d0SEric Dumazet 5985e0724d0SEric Dumazet if (ok) { 599c29a0bc4SPavel Emelyanov sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 6005e0724d0SEric Dumazet } else { 60119757cebSEric Dumazet this_cpu_inc(*sk->sk_prot->orphan_count); 602563e0bb0SYafang Shao inet_sk_set_state(sk, TCP_CLOSE); 6035e0724d0SEric Dumazet sock_set_flag(sk, SOCK_DEAD); 6045e0724d0SEric Dumazet inet_csk_destroy_sock(sk); 605152da81dSPavel Emelyanov } 6065e0724d0SEric Dumazet return ok; 6075e0724d0SEric Dumazet } 6085e0724d0SEric Dumazet EXPORT_SYMBOL_GPL(inet_ehash_nolisten); 609152da81dSPavel Emelyanov 610c125e80bSCraig Gallek static int inet_reuseport_add_sock(struct sock *sk, 611fe38d2a1SJosef Bacik struct inet_listen_hashbucket *ilb) 612c125e80bSCraig Gallek { 61390e5d0dbSCraig Gallek struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash; 6148dbd76e7SEric Dumazet const struct hlist_nulls_node *node; 615c125e80bSCraig Gallek struct sock *sk2; 616c125e80bSCraig Gallek kuid_t uid = sock_i_uid(sk); 617c125e80bSCraig Gallek 6188dbd76e7SEric Dumazet sk_nulls_for_each_rcu(sk2, node, &ilb->nulls_head) { 619c125e80bSCraig Gallek if (sk2 != sk && 620c125e80bSCraig Gallek sk2->sk_family == sk->sk_family && 621c125e80bSCraig Gallek ipv6_only_sock(sk2) == ipv6_only_sock(sk) && 622c125e80bSCraig Gallek sk2->sk_bound_dev_if == sk->sk_bound_dev_if && 62390e5d0dbSCraig Gallek inet_csk(sk2)->icsk_bind_hash == tb && 624c125e80bSCraig Gallek sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && 625fe38d2a1SJosef Bacik inet_rcv_saddr_equal(sk, sk2, false)) 6262dbb9b9eSMartin KaFai Lau return reuseport_add_sock(sk, sk2, 6272dbb9b9eSMartin KaFai Lau inet_rcv_saddr_any(sk)); 628c125e80bSCraig Gallek } 629c125e80bSCraig Gallek 6302dbb9b9eSMartin KaFai Lau return reuseport_alloc(sk, inet_rcv_saddr_any(sk)); 631c125e80bSCraig Gallek } 632c125e80bSCraig Gallek 633fe38d2a1SJosef Bacik int __inet_hash(struct sock *sk, struct sock *osk) 634152da81dSPavel Emelyanov { 63539d8cda7SPavel Emelyanov struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; 6365caea4eaSEric Dumazet struct inet_listen_hashbucket *ilb; 637c125e80bSCraig Gallek int err = 0; 638152da81dSPavel Emelyanov 6395e0724d0SEric Dumazet if (sk->sk_state != TCP_LISTEN) { 6404f9bf2a2SSebastian Andrzej Siewior local_bh_disable(); 64101770a16SRicardo Dias inet_ehash_nolisten(sk, osk, NULL); 6424f9bf2a2SSebastian Andrzej Siewior local_bh_enable(); 643c125e80bSCraig Gallek return 0; 6445e0724d0SEric Dumazet } 645547b792cSIlpo Järvinen WARN_ON(!sk_unhashed(sk)); 6465caea4eaSEric Dumazet ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; 647152da81dSPavel Emelyanov 6485caea4eaSEric Dumazet spin_lock(&ilb->lock); 649c125e80bSCraig Gallek if (sk->sk_reuseport) { 650fe38d2a1SJosef Bacik err = inet_reuseport_add_sock(sk, ilb); 651c125e80bSCraig Gallek if (err) 652c125e80bSCraig Gallek goto unlock; 653c125e80bSCraig Gallek } 654d296ba60SCraig Gallek if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && 655d296ba60SCraig Gallek sk->sk_family == AF_INET6) 6568dbd76e7SEric Dumazet __sk_nulls_add_node_tail_rcu(sk, &ilb->nulls_head); 657d296ba60SCraig Gallek else 6588dbd76e7SEric Dumazet __sk_nulls_add_node_rcu(sk, &ilb->nulls_head); 65961b7c691SMartin KaFai Lau inet_hash2(hashinfo, sk); 66076d013b2SMartin KaFai Lau ilb->count++; 6613b24d854SEric Dumazet sock_set_flag(sk, SOCK_RCU_FREE); 662c29a0bc4SPavel Emelyanov sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 663c125e80bSCraig Gallek unlock: 6645caea4eaSEric Dumazet spin_unlock(&ilb->lock); 665c125e80bSCraig Gallek 666c125e80bSCraig Gallek return err; 667152da81dSPavel Emelyanov } 66877a6a471SEric Dumazet EXPORT_SYMBOL(__inet_hash); 669ab1e0a13SArnaldo Carvalho de Melo 670086c653fSCraig Gallek int inet_hash(struct sock *sk) 671ab1e0a13SArnaldo Carvalho de Melo { 672c125e80bSCraig Gallek int err = 0; 673c125e80bSCraig Gallek 6744f9bf2a2SSebastian Andrzej Siewior if (sk->sk_state != TCP_CLOSE) 675fe38d2a1SJosef Bacik err = __inet_hash(sk, NULL); 676086c653fSCraig Gallek 677c125e80bSCraig Gallek return err; 678ab1e0a13SArnaldo Carvalho de Melo } 679ab1e0a13SArnaldo Carvalho de Melo EXPORT_SYMBOL_GPL(inet_hash); 680ab1e0a13SArnaldo Carvalho de Melo 6814f9bf2a2SSebastian Andrzej Siewior static void __inet_unhash(struct sock *sk, struct inet_listen_hashbucket *ilb) 682ab1e0a13SArnaldo Carvalho de Melo { 683ab1e0a13SArnaldo Carvalho de Melo if (sk_unhashed(sk)) 6845caea4eaSEric Dumazet return; 685ab1e0a13SArnaldo Carvalho de Melo 686c125e80bSCraig Gallek if (rcu_access_pointer(sk->sk_reuseport_cb)) 687333bb73fSKuniyuki Iwashima reuseport_stop_listen_sock(sk); 6880ba98718SGeert Uytterhoeven if (ilb) { 6894f9bf2a2SSebastian Andrzej Siewior struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; 6904f9bf2a2SSebastian Andrzej Siewior 69161b7c691SMartin KaFai Lau inet_unhash2(hashinfo, sk); 69276d013b2SMartin KaFai Lau ilb->count--; 69376d013b2SMartin KaFai Lau } 6948dbd76e7SEric Dumazet __sk_nulls_del_node_init_rcu(sk); 69561b7c691SMartin KaFai Lau sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 6964f9bf2a2SSebastian Andrzej Siewior } 6974f9bf2a2SSebastian Andrzej Siewior 6984f9bf2a2SSebastian Andrzej Siewior void inet_unhash(struct sock *sk) 6994f9bf2a2SSebastian Andrzej Siewior { 7004f9bf2a2SSebastian Andrzej Siewior struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; 7014f9bf2a2SSebastian Andrzej Siewior 7024f9bf2a2SSebastian Andrzej Siewior if (sk_unhashed(sk)) 7034f9bf2a2SSebastian Andrzej Siewior return; 7044f9bf2a2SSebastian Andrzej Siewior 7054f9bf2a2SSebastian Andrzej Siewior if (sk->sk_state == TCP_LISTEN) { 7064f9bf2a2SSebastian Andrzej Siewior struct inet_listen_hashbucket *ilb; 7074f9bf2a2SSebastian Andrzej Siewior 7084f9bf2a2SSebastian Andrzej Siewior ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; 7094f9bf2a2SSebastian Andrzej Siewior /* Don't disable bottom halves while acquiring the lock to 7104f9bf2a2SSebastian Andrzej Siewior * avoid circular locking dependency on PREEMPT_RT. 7114f9bf2a2SSebastian Andrzej Siewior */ 7124f9bf2a2SSebastian Andrzej Siewior spin_lock(&ilb->lock); 7134f9bf2a2SSebastian Andrzej Siewior __inet_unhash(sk, ilb); 7144f9bf2a2SSebastian Andrzej Siewior spin_unlock(&ilb->lock); 7154f9bf2a2SSebastian Andrzej Siewior } else { 7164f9bf2a2SSebastian Andrzej Siewior spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash); 7174f9bf2a2SSebastian Andrzej Siewior 7184f9bf2a2SSebastian Andrzej Siewior spin_lock_bh(lock); 7194f9bf2a2SSebastian Andrzej Siewior __inet_unhash(sk, NULL); 720920de804SEric Dumazet spin_unlock_bh(lock); 721ab1e0a13SArnaldo Carvalho de Melo } 7224f9bf2a2SSebastian Andrzej Siewior } 723ab1e0a13SArnaldo Carvalho de Melo EXPORT_SYMBOL_GPL(inet_unhash); 724152da81dSPavel Emelyanov 725190cc824SEric Dumazet /* RFC 6056 3.3.4. Algorithm 4: Double-Hash Port Selection Algorithm 726190cc824SEric Dumazet * Note that we use 32bit integers (vs RFC 'short integers') 727190cc824SEric Dumazet * because 2^16 is not a multiple of num_ephemeral and this 728190cc824SEric Dumazet * property might be used by clever attacker. 7294c2c8f03SWilly Tarreau * RFC claims using TABLE_LENGTH=10 buckets gives an improvement, though 7304c2c8f03SWilly Tarreau * attacks were since demonstrated, thus we use 65536 instead to really 7314c2c8f03SWilly Tarreau * give more isolation and privacy, at the expense of 256kB of kernel 7324c2c8f03SWilly Tarreau * memory. 733190cc824SEric Dumazet */ 7344c2c8f03SWilly Tarreau #define INET_TABLE_PERTURB_SHIFT 16 735e9261476SWilly Tarreau #define INET_TABLE_PERTURB_SIZE (1 << INET_TABLE_PERTURB_SHIFT) 736e9261476SWilly Tarreau static u32 *table_perturb; 737190cc824SEric Dumazet 7385ee31fc1SPavel Emelyanov int __inet_hash_connect(struct inet_timewait_death_row *death_row, 739b2d05756SWilly Tarreau struct sock *sk, u64 port_offset, 7405ee31fc1SPavel Emelyanov int (*check_established)(struct inet_timewait_death_row *, 741b4d6444eSEric Dumazet struct sock *, __u16, struct inet_timewait_sock **)) 742a7f5e7f1SArnaldo Carvalho de Melo { 743a7f5e7f1SArnaldo Carvalho de Melo struct inet_hashinfo *hinfo = death_row->hashinfo; 744a7f5e7f1SArnaldo Carvalho de Melo struct inet_timewait_sock *tw = NULL; 7451580ab63SEric Dumazet struct inet_bind_hashbucket *head; 7461580ab63SEric Dumazet int port = inet_sk(sk)->inet_num; 7471580ab63SEric Dumazet struct net *net = sock_net(sk); 7481580ab63SEric Dumazet struct inet_bind_bucket *tb; 7491580ab63SEric Dumazet u32 remaining, offset; 7501580ab63SEric Dumazet int ret, i, low, high; 7513c82a21fSRobert Shearman int l3mdev; 752190cc824SEric Dumazet u32 index; 7531580ab63SEric Dumazet 7541580ab63SEric Dumazet if (port) { 7551580ab63SEric Dumazet head = &hinfo->bhash[inet_bhashfn(net, port, 7561580ab63SEric Dumazet hinfo->bhash_size)]; 7571580ab63SEric Dumazet tb = inet_csk(sk)->icsk_bind_hash; 7581580ab63SEric Dumazet spin_lock_bh(&head->lock); 7591580ab63SEric Dumazet if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { 76001770a16SRicardo Dias inet_ehash_nolisten(sk, NULL, NULL); 7611580ab63SEric Dumazet spin_unlock_bh(&head->lock); 7621580ab63SEric Dumazet return 0; 7631580ab63SEric Dumazet } 7641580ab63SEric Dumazet spin_unlock(&head->lock); 7651580ab63SEric Dumazet /* No definite answer... Walk to established hash table */ 7661580ab63SEric Dumazet ret = check_established(death_row, sk, port, NULL); 7671580ab63SEric Dumazet local_bh_enable(); 7681580ab63SEric Dumazet return ret; 7691580ab63SEric Dumazet } 770a7f5e7f1SArnaldo Carvalho de Melo 7713c82a21fSRobert Shearman l3mdev = inet_sk_bound_l3mdev(sk); 7723c82a21fSRobert Shearman 7730bbf87d8SEric W. Biederman inet_get_local_port_range(net, &low, &high); 7741580ab63SEric Dumazet high++; /* [32768, 60999] -> [32768, 61000[ */ 7751580ab63SEric Dumazet remaining = high - low; 7761580ab63SEric Dumazet if (likely(remaining > 1)) 7771580ab63SEric Dumazet remaining &= ~1U; 778227b60f5SStephen Hemminger 779e9261476SWilly Tarreau net_get_random_once(table_perturb, 780e9261476SWilly Tarreau INET_TABLE_PERTURB_SIZE * sizeof(*table_perturb)); 781*e8161345SWilly Tarreau index = port_offset & (INET_TABLE_PERTURB_SIZE - 1); 782190cc824SEric Dumazet 7839e9b70aeSWilly Tarreau offset = READ_ONCE(table_perturb[index]) + (port_offset >> 32); 784b2d05756SWilly Tarreau offset %= remaining; 785b2d05756SWilly Tarreau 7861580ab63SEric Dumazet /* In first pass we try ports of @low parity. 7871580ab63SEric Dumazet * inet_csk_get_port() does the opposite choice. 78807f4c900SEric Dumazet */ 7891580ab63SEric Dumazet offset &= ~1U; 7901580ab63SEric Dumazet other_parity_scan: 7911580ab63SEric Dumazet port = low + offset; 7921580ab63SEric Dumazet for (i = 0; i < remaining; i += 2, port += 2) { 7931580ab63SEric Dumazet if (unlikely(port >= high)) 7941580ab63SEric Dumazet port -= remaining; 795122ff243SWANG Cong if (inet_is_local_reserved_port(net, port)) 796e3826f1eSAmerigo Wang continue; 7977f635ab7SPavel Emelyanov head = &hinfo->bhash[inet_bhashfn(net, port, 7987f635ab7SPavel Emelyanov hinfo->bhash_size)]; 7991580ab63SEric Dumazet spin_lock_bh(&head->lock); 800a7f5e7f1SArnaldo Carvalho de Melo 8011580ab63SEric Dumazet /* Does not bother with rcv_saddr checks, because 8021580ab63SEric Dumazet * the established check is already unique enough. 803a7f5e7f1SArnaldo Carvalho de Melo */ 804b67bfe0dSSasha Levin inet_bind_bucket_for_each(tb, &head->chain) { 8053c82a21fSRobert Shearman if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev && 8063c82a21fSRobert Shearman tb->port == port) { 807da5e3630STom Herbert if (tb->fastreuse >= 0 || 808da5e3630STom Herbert tb->fastreuseport >= 0) 809a7f5e7f1SArnaldo Carvalho de Melo goto next_port; 810a9d8f911SEvgeniy Polyakov WARN_ON(hlist_empty(&tb->owners)); 8115ee31fc1SPavel Emelyanov if (!check_established(death_row, sk, 8125ee31fc1SPavel Emelyanov port, &tw)) 813a7f5e7f1SArnaldo Carvalho de Melo goto ok; 814a7f5e7f1SArnaldo Carvalho de Melo goto next_port; 815a7f5e7f1SArnaldo Carvalho de Melo } 816a7f5e7f1SArnaldo Carvalho de Melo } 817a7f5e7f1SArnaldo Carvalho de Melo 818941b1d22SPavel Emelyanov tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, 8193c82a21fSRobert Shearman net, head, port, l3mdev); 820a7f5e7f1SArnaldo Carvalho de Melo if (!tb) { 8211580ab63SEric Dumazet spin_unlock_bh(&head->lock); 8221580ab63SEric Dumazet return -ENOMEM; 823a7f5e7f1SArnaldo Carvalho de Melo } 824a7f5e7f1SArnaldo Carvalho de Melo tb->fastreuse = -1; 825da5e3630STom Herbert tb->fastreuseport = -1; 826a7f5e7f1SArnaldo Carvalho de Melo goto ok; 827a7f5e7f1SArnaldo Carvalho de Melo next_port: 8281580ab63SEric Dumazet spin_unlock_bh(&head->lock); 8291580ab63SEric Dumazet cond_resched(); 830a7f5e7f1SArnaldo Carvalho de Melo } 8311580ab63SEric Dumazet 8321580ab63SEric Dumazet offset++; 8331580ab63SEric Dumazet if ((offset & 1) && remaining > 1) 8341580ab63SEric Dumazet goto other_parity_scan; 835a7f5e7f1SArnaldo Carvalho de Melo 836a7f5e7f1SArnaldo Carvalho de Melo return -EADDRNOTAVAIL; 837a7f5e7f1SArnaldo Carvalho de Melo 838a7f5e7f1SArnaldo Carvalho de Melo ok: 839ca7af040SWilly Tarreau /* Here we want to add a little bit of randomness to the next source 840ca7af040SWilly Tarreau * port that will be chosen. We use a max() with a random here so that 841ca7af040SWilly Tarreau * on low contention the randomness is maximal and on high contention 842ca7af040SWilly Tarreau * it may be inexistent. 843c579bd1bSEric Dumazet */ 844ca7af040SWilly Tarreau i = max_t(int, i, (prandom_u32() & 7) * 2); 845190cc824SEric Dumazet WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + 2); 846a7f5e7f1SArnaldo Carvalho de Melo 847a7f5e7f1SArnaldo Carvalho de Melo /* Head lock still held and bh's disabled */ 848a7f5e7f1SArnaldo Carvalho de Melo inet_bind_hash(sk, tb, port); 849a7f5e7f1SArnaldo Carvalho de Melo if (sk_unhashed(sk)) { 850c720c7e8SEric Dumazet inet_sk(sk)->inet_sport = htons(port); 85101770a16SRicardo Dias inet_ehash_nolisten(sk, (struct sock *)tw, NULL); 852a7f5e7f1SArnaldo Carvalho de Melo } 8533cdaedaeSEric Dumazet if (tw) 854fc01538fSEric Dumazet inet_twsk_bind_unhash(tw, hinfo); 855a7f5e7f1SArnaldo Carvalho de Melo spin_unlock(&head->lock); 856dbe7faa4SEric Dumazet if (tw) 857dbe7faa4SEric Dumazet inet_twsk_deschedule_put(tw); 858a7f5e7f1SArnaldo Carvalho de Melo local_bh_enable(); 8591580ab63SEric Dumazet return 0; 860a7f5e7f1SArnaldo Carvalho de Melo } 8615ee31fc1SPavel Emelyanov 8625ee31fc1SPavel Emelyanov /* 8635ee31fc1SPavel Emelyanov * Bind a port for a connect operation and hash it. 8645ee31fc1SPavel Emelyanov */ 8655ee31fc1SPavel Emelyanov int inet_hash_connect(struct inet_timewait_death_row *death_row, 8665ee31fc1SPavel Emelyanov struct sock *sk) 8675ee31fc1SPavel Emelyanov { 868b2d05756SWilly Tarreau u64 port_offset = 0; 869e2baad9eSEric Dumazet 870e2baad9eSEric Dumazet if (!inet_sk(sk)->inet_num) 871e2baad9eSEric Dumazet port_offset = inet_sk_port_offset(sk); 872e2baad9eSEric Dumazet return __inet_hash_connect(death_row, sk, port_offset, 873b4d6444eSEric Dumazet __inet_check_established); 8745ee31fc1SPavel Emelyanov } 875a7f5e7f1SArnaldo Carvalho de Melo EXPORT_SYMBOL_GPL(inet_hash_connect); 8765caea4eaSEric Dumazet 8775caea4eaSEric Dumazet void inet_hashinfo_init(struct inet_hashinfo *h) 8785caea4eaSEric Dumazet { 8795caea4eaSEric Dumazet int i; 8805caea4eaSEric Dumazet 881c25eb3bfSEric Dumazet for (i = 0; i < INET_LHTABLE_SIZE; i++) { 8825caea4eaSEric Dumazet spin_lock_init(&h->listening_hash[i].lock); 8838dbd76e7SEric Dumazet INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].nulls_head, 8848dbd76e7SEric Dumazet i + LISTENING_NULLS_BASE); 88576d013b2SMartin KaFai Lau h->listening_hash[i].count = 0; 886c25eb3bfSEric Dumazet } 88761b7c691SMartin KaFai Lau 88861b7c691SMartin KaFai Lau h->lhash2 = NULL; 8895caea4eaSEric Dumazet } 8905caea4eaSEric Dumazet EXPORT_SYMBOL_GPL(inet_hashinfo_init); 891095dc8e0SEric Dumazet 892c92c81dfSPeter Oskolkov static void init_hashinfo_lhash2(struct inet_hashinfo *h) 893c92c81dfSPeter Oskolkov { 894c92c81dfSPeter Oskolkov int i; 895c92c81dfSPeter Oskolkov 896c92c81dfSPeter Oskolkov for (i = 0; i <= h->lhash2_mask; i++) { 897c92c81dfSPeter Oskolkov spin_lock_init(&h->lhash2[i].lock); 898c92c81dfSPeter Oskolkov INIT_HLIST_HEAD(&h->lhash2[i].head); 899c92c81dfSPeter Oskolkov h->lhash2[i].count = 0; 900c92c81dfSPeter Oskolkov } 901c92c81dfSPeter Oskolkov } 902c92c81dfSPeter Oskolkov 90361b7c691SMartin KaFai Lau void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, 90461b7c691SMartin KaFai Lau unsigned long numentries, int scale, 90561b7c691SMartin KaFai Lau unsigned long low_limit, 90661b7c691SMartin KaFai Lau unsigned long high_limit) 90761b7c691SMartin KaFai Lau { 90861b7c691SMartin KaFai Lau h->lhash2 = alloc_large_system_hash(name, 90961b7c691SMartin KaFai Lau sizeof(*h->lhash2), 91061b7c691SMartin KaFai Lau numentries, 91161b7c691SMartin KaFai Lau scale, 91261b7c691SMartin KaFai Lau 0, 91361b7c691SMartin KaFai Lau NULL, 91461b7c691SMartin KaFai Lau &h->lhash2_mask, 91561b7c691SMartin KaFai Lau low_limit, 91661b7c691SMartin KaFai Lau high_limit); 917c92c81dfSPeter Oskolkov init_hashinfo_lhash2(h); 918e9261476SWilly Tarreau 919e9261476SWilly Tarreau /* this one is used for source ports of outgoing connections */ 920e9261476SWilly Tarreau table_perturb = kmalloc_array(INET_TABLE_PERTURB_SIZE, 921e9261476SWilly Tarreau sizeof(*table_perturb), GFP_KERNEL); 922e9261476SWilly Tarreau if (!table_perturb) 923e9261476SWilly Tarreau panic("TCP: failed to alloc table_perturb"); 924c92c81dfSPeter Oskolkov } 92561b7c691SMartin KaFai Lau 926c92c81dfSPeter Oskolkov int inet_hashinfo2_init_mod(struct inet_hashinfo *h) 927c92c81dfSPeter Oskolkov { 928c92c81dfSPeter Oskolkov h->lhash2 = kmalloc_array(INET_LHTABLE_SIZE, sizeof(*h->lhash2), GFP_KERNEL); 929c92c81dfSPeter Oskolkov if (!h->lhash2) 930c92c81dfSPeter Oskolkov return -ENOMEM; 931c92c81dfSPeter Oskolkov 932c92c81dfSPeter Oskolkov h->lhash2_mask = INET_LHTABLE_SIZE - 1; 933c92c81dfSPeter Oskolkov /* INET_LHTABLE_SIZE must be a power of 2 */ 934c92c81dfSPeter Oskolkov BUG_ON(INET_LHTABLE_SIZE & h->lhash2_mask); 935c92c81dfSPeter Oskolkov 936c92c81dfSPeter Oskolkov init_hashinfo_lhash2(h); 937c92c81dfSPeter Oskolkov return 0; 93861b7c691SMartin KaFai Lau } 939c92c81dfSPeter Oskolkov EXPORT_SYMBOL_GPL(inet_hashinfo2_init_mod); 94061b7c691SMartin KaFai Lau 941095dc8e0SEric Dumazet int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) 942095dc8e0SEric Dumazet { 94389e478a2SEric Dumazet unsigned int locksz = sizeof(spinlock_t); 944095dc8e0SEric Dumazet unsigned int i, nblocks = 1; 945095dc8e0SEric Dumazet 94689e478a2SEric Dumazet if (locksz != 0) { 947095dc8e0SEric Dumazet /* allocate 2 cache lines or at least one spinlock per cpu */ 94889e478a2SEric Dumazet nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U); 949095dc8e0SEric Dumazet nblocks = roundup_pow_of_two(nblocks * num_possible_cpus()); 950095dc8e0SEric Dumazet 951095dc8e0SEric Dumazet /* no more locks than number of hash buckets */ 952095dc8e0SEric Dumazet nblocks = min(nblocks, hashinfo->ehash_mask + 1); 953095dc8e0SEric Dumazet 954752ade68SMichal Hocko hashinfo->ehash_locks = kvmalloc_array(nblocks, locksz, GFP_KERNEL); 955095dc8e0SEric Dumazet if (!hashinfo->ehash_locks) 956095dc8e0SEric Dumazet return -ENOMEM; 957095dc8e0SEric Dumazet 958095dc8e0SEric Dumazet for (i = 0; i < nblocks; i++) 959095dc8e0SEric Dumazet spin_lock_init(&hashinfo->ehash_locks[i]); 960095dc8e0SEric Dumazet } 961095dc8e0SEric Dumazet hashinfo->ehash_locks_mask = nblocks - 1; 962095dc8e0SEric Dumazet return 0; 963095dc8e0SEric Dumazet } 964095dc8e0SEric Dumazet EXPORT_SYMBOL_GPL(inet_ehash_locks_alloc); 965