12874c5fdSThomas Gleixner // SPDX-License-Identifier: GPL-2.0-or-later 277d8bf9cSArnaldo Carvalho de Melo /* 377d8bf9cSArnaldo Carvalho de Melo * INET An implementation of the TCP/IP protocol suite for the LINUX 477d8bf9cSArnaldo Carvalho de Melo * operating system. INET is implemented using the BSD Socket 577d8bf9cSArnaldo Carvalho de Melo * interface as the means of communication with the user level. 677d8bf9cSArnaldo Carvalho de Melo * 777d8bf9cSArnaldo Carvalho de Melo * Generic INET transport hashtables 877d8bf9cSArnaldo Carvalho de Melo * 977d8bf9cSArnaldo Carvalho de Melo * Authors: Lotsa people, from code originally in tcp 1077d8bf9cSArnaldo Carvalho de Melo */ 1177d8bf9cSArnaldo Carvalho de Melo 122d8c4ce5SArnaldo Carvalho de Melo #include <linux/module.h> 13a7f5e7f1SArnaldo Carvalho de Melo #include <linux/random.h> 14f3f05f70SArnaldo Carvalho de Melo #include <linux/sched.h> 1577d8bf9cSArnaldo Carvalho de Melo #include <linux/slab.h> 16f3f05f70SArnaldo Carvalho de Melo #include <linux/wait.h> 17095dc8e0SEric Dumazet #include <linux/vmalloc.h> 1857c8a661SMike Rapoport #include <linux/memblock.h> 1977d8bf9cSArnaldo Carvalho de Melo 20c125e80bSCraig Gallek #include <net/addrconf.h> 21463c84b9SArnaldo Carvalho de Melo #include <net/inet_connection_sock.h> 2277d8bf9cSArnaldo Carvalho de Melo #include <net/inet_hashtables.h> 2301770a16SRicardo Dias #if IS_ENABLED(CONFIG_IPV6) 2401770a16SRicardo Dias #include <net/inet6_hashtables.h> 2501770a16SRicardo Dias #endif 266e5714eaSDavid S. 
Miller #include <net/secure_seq.h> 27a7f5e7f1SArnaldo Carvalho de Melo #include <net/ip.h> 28a04a480dSDavid Ahern #include <net/tcp.h> 29c125e80bSCraig Gallek #include <net/sock_reuseport.h> 3077d8bf9cSArnaldo Carvalho de Melo 310f495f76SLorenz Bauer u32 inet_ehashfn(const struct net *net, const __be32 laddr, 3265cd8033SHannes Frederic Sowa const __u16 lport, const __be32 faddr, 3365cd8033SHannes Frederic Sowa const __be16 fport) 3465cd8033SHannes Frederic Sowa { 351bbdceefSHannes Frederic Sowa static u32 inet_ehash_secret __read_mostly; 361bbdceefSHannes Frederic Sowa 371bbdceefSHannes Frederic Sowa net_get_random_once(&inet_ehash_secret, sizeof(inet_ehash_secret)); 381bbdceefSHannes Frederic Sowa 3965cd8033SHannes Frederic Sowa return __inet_ehashfn(laddr, lport, faddr, fport, 4065cd8033SHannes Frederic Sowa inet_ehash_secret + net_hash_mix(net)); 4165cd8033SHannes Frederic Sowa } 420f495f76SLorenz Bauer EXPORT_SYMBOL_GPL(inet_ehashfn); 4365cd8033SHannes Frederic Sowa 44d1e559d0SEric Dumazet /* This function handles inet_sock, but also timewait and request sockets 45d1e559d0SEric Dumazet * for IPv4/IPv6. 46d1e559d0SEric Dumazet */ 47784c372aSEric Dumazet static u32 sk_ehashfn(const struct sock *sk) 4865cd8033SHannes Frederic Sowa { 49d1e559d0SEric Dumazet #if IS_ENABLED(CONFIG_IPV6) 50d1e559d0SEric Dumazet if (sk->sk_family == AF_INET6 && 51d1e559d0SEric Dumazet !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) 52d1e559d0SEric Dumazet return inet6_ehashfn(sock_net(sk), 53d1e559d0SEric Dumazet &sk->sk_v6_rcv_saddr, sk->sk_num, 54d1e559d0SEric Dumazet &sk->sk_v6_daddr, sk->sk_dport); 55d1e559d0SEric Dumazet #endif 565b441f76SEric Dumazet return inet_ehashfn(sock_net(sk), 575b441f76SEric Dumazet sk->sk_rcv_saddr, sk->sk_num, 585b441f76SEric Dumazet sk->sk_daddr, sk->sk_dport); 5965cd8033SHannes Frederic Sowa } 6065cd8033SHannes Frederic Sowa 6177d8bf9cSArnaldo Carvalho de Melo /* 6277d8bf9cSArnaldo Carvalho de Melo * Allocate and initialize a new local port bind bucket. 
6377d8bf9cSArnaldo Carvalho de Melo * The bindhash mutex for snum's hash chain must be held here. 6477d8bf9cSArnaldo Carvalho de Melo */ 65e18b890bSChristoph Lameter struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, 66941b1d22SPavel Emelyanov struct net *net, 6777d8bf9cSArnaldo Carvalho de Melo struct inet_bind_hashbucket *head, 683c82a21fSRobert Shearman const unsigned short snum, 693c82a21fSRobert Shearman int l3mdev) 7077d8bf9cSArnaldo Carvalho de Melo { 7154e6ecb2SChristoph Lameter struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC); 7277d8bf9cSArnaldo Carvalho de Melo 7300db4124SIan Morris if (tb) { 74efd7ef1cSEric W. Biederman write_pnet(&tb->ib_net, net); 753c82a21fSRobert Shearman tb->l3mdev = l3mdev; 7677d8bf9cSArnaldo Carvalho de Melo tb->port = snum; 7777d8bf9cSArnaldo Carvalho de Melo tb->fastreuse = 0; 78da5e3630STom Herbert tb->fastreuseport = 0; 7977d8bf9cSArnaldo Carvalho de Melo INIT_HLIST_HEAD(&tb->owners); 8077d8bf9cSArnaldo Carvalho de Melo hlist_add_head(&tb->node, &head->chain); 8177d8bf9cSArnaldo Carvalho de Melo } 8277d8bf9cSArnaldo Carvalho de Melo return tb; 8377d8bf9cSArnaldo Carvalho de Melo } 8477d8bf9cSArnaldo Carvalho de Melo 8577d8bf9cSArnaldo Carvalho de Melo /* 8677d8bf9cSArnaldo Carvalho de Melo * Caller must hold hashbucket lock for this tb with local BH disabled 8777d8bf9cSArnaldo Carvalho de Melo */ 88e18b890bSChristoph Lameter void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb) 8977d8bf9cSArnaldo Carvalho de Melo { 9077d8bf9cSArnaldo Carvalho de Melo if (hlist_empty(&tb->owners)) { 9177d8bf9cSArnaldo Carvalho de Melo __hlist_del(&tb->node); 9277d8bf9cSArnaldo Carvalho de Melo kmem_cache_free(cachep, tb); 9377d8bf9cSArnaldo Carvalho de Melo } 9477d8bf9cSArnaldo Carvalho de Melo } 952d8c4ce5SArnaldo Carvalho de Melo 9628044fc1SJoanne Koong bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, const struct net *net, 9728044fc1SJoanne Koong 
unsigned short port, int l3mdev) 982d8c4ce5SArnaldo Carvalho de Melo { 9928044fc1SJoanne Koong return net_eq(ib_net(tb), net) && tb->port == port && 10028044fc1SJoanne Koong tb->l3mdev == l3mdev; 10128044fc1SJoanne Koong } 10228044fc1SJoanne Koong 10328044fc1SJoanne Koong static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb, 10428044fc1SJoanne Koong struct net *net, 10528044fc1SJoanne Koong struct inet_bind_hashbucket *head, 10628044fc1SJoanne Koong unsigned short port, int l3mdev, 10728044fc1SJoanne Koong const struct sock *sk) 10828044fc1SJoanne Koong { 10928044fc1SJoanne Koong write_pnet(&tb->ib_net, net); 11028044fc1SJoanne Koong tb->l3mdev = l3mdev; 11128044fc1SJoanne Koong tb->port = port; 11228044fc1SJoanne Koong #if IS_ENABLED(CONFIG_IPV6) 1135456262dSMartin KaFai Lau tb->family = sk->sk_family; 11428044fc1SJoanne Koong if (sk->sk_family == AF_INET6) 11528044fc1SJoanne Koong tb->v6_rcv_saddr = sk->sk_v6_rcv_saddr; 11628044fc1SJoanne Koong else 11728044fc1SJoanne Koong #endif 11828044fc1SJoanne Koong tb->rcv_saddr = sk->sk_rcv_saddr; 11928044fc1SJoanne Koong INIT_HLIST_HEAD(&tb->owners); 120936a192fSKuniyuki Iwashima INIT_HLIST_HEAD(&tb->deathrow); 12128044fc1SJoanne Koong hlist_add_head(&tb->node, &head->chain); 12228044fc1SJoanne Koong } 12328044fc1SJoanne Koong 12428044fc1SJoanne Koong struct inet_bind2_bucket *inet_bind2_bucket_create(struct kmem_cache *cachep, 12528044fc1SJoanne Koong struct net *net, 12628044fc1SJoanne Koong struct inet_bind_hashbucket *head, 12728044fc1SJoanne Koong unsigned short port, 12828044fc1SJoanne Koong int l3mdev, 12928044fc1SJoanne Koong const struct sock *sk) 13028044fc1SJoanne Koong { 13128044fc1SJoanne Koong struct inet_bind2_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC); 13228044fc1SJoanne Koong 13328044fc1SJoanne Koong if (tb) 13428044fc1SJoanne Koong inet_bind2_bucket_init(tb, net, head, port, l3mdev, sk); 13528044fc1SJoanne Koong 13628044fc1SJoanne Koong return tb; 13728044fc1SJoanne Koong } 

/* Unlink @tb from its hash chain and return it to @cachep, but only when
 * both its owners list and its deathrow list are empty; otherwise this is
 * a no-op.
 * Caller must hold hashbucket lock for this tb with local BH disabled
 */
void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb)
{
	if (hlist_empty(&tb->owners) && hlist_empty(&tb->deathrow)) {
		__hlist_del(&tb->node);
		kmem_cache_free(cachep, tb);
	}
}

/* Return true when the address recorded in @tb2 matches @sk's receive
 * address.
 *
 * With IPv6 enabled and differing families, the IPv6 side must hold a
 * v4-mapped address whose last 32-bit word equals the IPv4 side's address.
 * With equal families the addresses are compared directly.
 */
static bool inet_bind2_bucket_addr_match(const struct inet_bind2_bucket *tb2,
					 const struct sock *sk)
{
#if IS_ENABLED(CONFIG_IPV6)
	if (sk->sk_family != tb2->family) {
		/* IPv4 socket against a bucket of another family. */
		if (sk->sk_family == AF_INET)
			return ipv6_addr_v4mapped(&tb2->v6_rcv_saddr) &&
				tb2->v6_rcv_saddr.s6_addr32[3] == sk->sk_rcv_saddr;

		/* Non-IPv4 socket against a bucket of another family. */
		return ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr) &&
			sk->sk_v6_rcv_saddr.s6_addr32[3] == tb2->rcv_saddr;
	}

	if (sk->sk_family == AF_INET6)
		return ipv6_addr_equal(&tb2->v6_rcv_saddr,
				       &sk->sk_v6_rcv_saddr);
#endif
	return tb2->rcv_saddr == sk->sk_rcv_saddr;
}

/* Bind @sk to @port: store the port in the inet socket, add the socket to
 * the owners list of both @tb and @tb2, and remember both buckets in the
 * connection socket (icsk_bind_hash / icsk_bind2_hash).
 */
void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
		    struct inet_bind2_bucket *tb2, unsigned short port)
{
	inet_sk(sk)->inet_num = port;
	sk_add_bind_node(sk, &tb->owners);
	inet_csk(sk)->icsk_bind_hash = tb;
	sk_add_bind2_node(sk, &tb2->owners);
	inet_csk(sk)->icsk_bind2_hash = tb2;
}

/*
 * Get rid of any references to a local port held by the given sock.
 *
 * Takes the bhash bucket lock, then the bhash2 bucket lock nested inside
 * it. Plain spin_lock() is used; the only caller visible here,
 * inet_put_port(), disables BH around the call.
 */
static void __inet_put_port(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk);
	struct inet_bind_hashbucket *head, *head2;
	struct net *net = sock_net(sk);
	struct inet_bind_bucket *tb;
	int bhash;

	/* Both buckets are located from inet_num before it is cleared below. */
	bhash = inet_bhashfn(net, inet_sk(sk)->inet_num, hashinfo->bhash_size);
	head = &hashinfo->bhash[bhash];
	head2 = inet_bhashfn_portaddr(hashinfo, sk, net, inet_sk(sk)->inet_num);

	spin_lock(&head->lock);
	tb = inet_csk(sk)->icsk_bind_hash;
	__sk_del_bind_node(sk);
	inet_csk(sk)->icsk_bind_hash = NULL;
	inet_sk(sk)->inet_num = 0;
	/* Frees tb only if sk was its last owner. */
	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);

	spin_lock(&head2->lock);
	if (inet_csk(sk)->icsk_bind2_hash) {
		struct inet_bind2_bucket *tb2 = inet_csk(sk)->icsk_bind2_hash;

		__sk_del_bind2_node(sk);
		inet_csk(sk)->icsk_bind2_hash = NULL;
		/* Frees tb2 only if its owners and deathrow lists are empty. */
		inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2);
	}
	spin_unlock(&head2->lock);

	spin_unlock(&head->lock);
}

/* Release @sk's local port with bottom halves disabled around the work. */
void inet_put_port(struct sock *sk)
{
	local_bh_disable();
	__inet_put_port(sk);
	local_bh_enable();
}
EXPORT_SYMBOL(inet_put_port);

/* Attach @child to bind buckets for its local port, starting from the
 * buckets of the listener @sk.
 *
 * Returns 0 on success, -ENOENT when @sk has no bind bucket or no bind2
 * bucket, and -ENOMEM when a needed bucket cannot be allocated.
 *
 * Takes the bhash bucket lock and then the bhash2 bucket lock; both are
 * dropped on every return path.
 */
int __inet_inherit_port(const struct sock *sk, struct sock *child)
{
	struct inet_hashinfo *table = tcp_or_dccp_get_hashinfo(sk);
	unsigned short port = inet_sk(child)->inet_num;
	struct inet_bind_hashbucket *head, *head2;
	bool created_inet_bind_bucket = false;
	struct net *net = sock_net(sk);
	bool update_fastreuse = false;
	struct inet_bind2_bucket *tb2;
	struct inet_bind_bucket *tb;
	int bhash, l3mdev;

	bhash = inet_bhashfn(net, port, table->bhash_size);
	head = &table->bhash[bhash];
	head2 = inet_bhashfn_portaddr(table, child, net, port);

	spin_lock(&head->lock);
	spin_lock(&head2->lock);
	tb = inet_csk(sk)->icsk_bind_hash;
	tb2 = inet_csk(sk)->icsk_bind2_hash;
	if (unlikely(!tb || !tb2)) {
		spin_unlock(&head2->lock);
		spin_unlock(&head->lock);
		return -ENOENT;
	}
	if (tb->port != port) {
		l3mdev = inet_sk_bound_l3mdev(sk);

		/* NOTE: using tproxy and redirecting skbs to a proxy
		 * on a different listener port breaks the assumption
		 * that the listener socket's icsk_bind_hash is the same
		 * as that of the child socket. We have to look up or
		 * create a new bind bucket for the child here. */
		inet_bind_bucket_for_each(tb, &head->chain) {
			if (inet_bind_bucket_match(tb, net, port, l3mdev))
				break;
		}
		if (!tb) {
			tb = inet_bind_bucket_create(table->bind_bucket_cachep,
						     net, head, port, l3mdev);
			if (!tb) {
				spin_unlock(&head2->lock);
				spin_unlock(&head->lock);
				return -ENOMEM;
			}
			/* Remembered so the error path can undo the creation. */
			created_inet_bind_bucket = true;
		}
		update_fastreuse = true;

		/* l3mdev is already set; jump into the branch below, past
		 * its own l3mdev assignment, to find or create tb2.
		 */
		goto bhash2_find;
	} else if (!inet_bind2_bucket_addr_match(tb2, child)) {
		l3mdev = inet_sk_bound_l3mdev(sk);

bhash2_find:
		tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, child);
		if (!tb2) {
			tb2 = inet_bind2_bucket_create(table->bind2_bucket_cachep,
						       net, head2, port,
						       l3mdev, child);
			if (!tb2)
				goto error;
		}
	}
	/* Only set when the child's port differed from the listener's bucket. */
	if (update_fastreuse)
		inet_csk_update_fastreuse(tb, child);
	inet_bind_hash(child, tb, tb2, port);
	spin_unlock(&head2->lock);
	spin_unlock(&head->lock);

	return 0;

error:
	/* Only a bucket created by this call is torn down again. */
	if (created_inet_bind_bucket)
		inet_bind_bucket_destroy(table->bind_bucket_cachep, tb);
	spin_unlock(&head2->lock);
	spin_unlock(&head->lock);
	return -ENOMEM;
}
EXPORT_SYMBOL_GPL(__inet_inherit_port);

/* Pick the lhash2 bucket for @sk: hash its netns, receive address and
 * local port (IPv6 or IPv4 variant depending on sk_family) and map the
 * hash to a bucket with inet_lhash2_bucket().
 */
static struct inet_listen_hashbucket *
inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk)
{
	u32 hash;

#if IS_ENABLED(CONFIG_IPV6)
	if (sk->sk_family == AF_INET6)
		hash = ipv6_portaddr_hash(sock_net(sk),
					  &sk->sk_v6_rcv_saddr,
					  inet_sk(sk)->inet_num);
	else
#endif
		hash = ipv4_portaddr_hash(sock_net(sk),
					  inet_sk(sk)->inet_rcv_saddr,
					  inet_sk(sk)->inet_num);
	return inet_lhash2_bucket(h, hash);
}

/* Score @sk as a candidate for a packet to (@daddr, @hnum) received on
 * @dif/@sdif in @net.
 *
 * Returns -1 when @sk does not match. Otherwise the score starts at 1,
 * or 2 when the socket is bound to a device, and gains one point each
 * for an AF_INET socket and for sk_incoming_cpu being the current CPU.
 */
static inline int compute_score(struct sock *sk, struct net *net,
				const unsigned short hnum, const __be32 daddr,
				const int dif, const int sdif)
{
	int score = -1;

	if (net_eq(sock_net(sk), net) && sk->sk_num == hnum &&
			!ipv6_only_sock(sk)) {
		if (sk->sk_rcv_saddr != daddr)
			return -1;

		if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))
			return -1;
		score =  sk->sk_bound_dev_if ? 2 : 1;

		if (sk->sk_family == PF_INET)
			score++;
		if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
			score++;
	}
	return score;
}

/**
 * inet_lookup_reuseport() - execute reuseport logic on AF_INET socket if necessary.
 * @net: network namespace.
 * @sk: AF_INET socket, must be in TCP_LISTEN state for TCP or TCP_CLOSE for UDP.
 * @skb: context for a potential SK_REUSEPORT program.
 * @doff: header offset.
 * @saddr: source address.
 * @sport: source port.
 * @daddr: destination address.
 * @hnum: destination port in host byte order.
 * @ehashfn: hash function used to generate the fallback hash.
 *
 * Return: NULL if sk doesn't have SO_REUSEPORT set, otherwise a pointer to
 *         the selected sock or an error.
 */
struct sock *inet_lookup_reuseport(struct net *net, struct sock *sk,
				   struct sk_buff *skb, int doff,
				   __be32 saddr, __be16 sport,
				   __be32 daddr, unsigned short hnum,
				   inet_ehashfn_t *ehashfn)
{
	struct sock *reuse_sk = NULL;
	u32 phash;

	if (sk->sk_reuseport) {
		/* The hash is computed with the packet's destination as the
		 * local side and its source as the foreign side.
		 */
		phash = INDIRECT_CALL_2(ehashfn, udp_ehashfn, inet_ehashfn,
					net, daddr, hnum, saddr, sport);
		reuse_sk = reuseport_select_sock(sk, phash, skb, doff);
	}
	return reuse_sk;
}
EXPORT_SYMBOL_GPL(inet_lookup_reuseport);

/*
 * Here are some nice properties to exploit here. The BSD API
 * does not allow a listening sock to specify the remote port nor the
 * remote address for the connection. So always assume those are both
 * wildcarded during the search since they can never be otherwise.
 */

/* called with rcu_read_lock() : No refcount taken on the socket
 *
 * Walk one lhash2 bucket and return the best-scoring listener, or NULL
 * when no socket scores above 0. Each time a new best candidate is
 * found, reuseport selection is tried on it first; a non-NULL result
 * from inet_lookup_reuseport() is returned immediately.
 */
static struct sock *inet_lhash2_lookup(struct net *net,
				struct inet_listen_hashbucket *ilb2,
				struct sk_buff *skb, int doff,
				const __be32 saddr, __be16 sport,
				const __be32 daddr, const unsigned short hnum,
				const int dif, const int sdif)
{
	struct sock *sk, *result = NULL;
	struct hlist_nulls_node *node;
	int score, hiscore = 0;

	sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) {
		score = compute_score(sk, net, hnum, daddr, dif, sdif);
		if (score > hiscore) {
			result = inet_lookup_reuseport(net, sk, skb, doff,
						       saddr, sport, daddr, hnum, inet_ehashfn);
			if (result)
				return result;

			result = sk;
			hiscore = score;
		}
	}

	return result;
}

/* Run the BPF sk_lookup program for an IPv4 packet, then apply reuseport
 * selection to the socket it picked.
 *
 * The BPF result is returned unchanged when the program reports
 * no_reuseport or when the result is NULL or an error pointer. Otherwise
 * the reuseport-selected socket is returned if there is one, else the
 * BPF-selected socket.
 */
struct sock *inet_lookup_run_sk_lookup(struct net *net,
				       int protocol,
				       struct sk_buff *skb, int doff,
				       __be32 saddr, __be16 sport,
				       __be32 daddr, u16 hnum, const int dif,
				       inet_ehashfn_t *ehashfn)
{
	struct sock *sk, *reuse_sk;
	bool no_reuseport;

	no_reuseport = bpf_sk_lookup_run_v4(net, protocol, saddr, sport,
					    daddr, hnum, dif, &sk);
	if (no_reuseport || IS_ERR_OR_NULL(sk))
		return sk;

	reuse_sk = inet_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum,
					 ehashfn);
	if (reuse_sk)
		sk = reuse_sk;
	return sk;
}

/* Find a listening socket for a packet addressed to (@daddr, @hnum).
 *
 * Lookup order: the BPF sk_lookup redirect (only when the static branch
 * is enabled and @hashinfo is the netns TCP hashinfo), then the lhash2
 * bucket for the exact destination address, then the bucket for
 * INADDR_ANY. An error-pointer result is turned into NULL.
 */
struct sock *__inet_lookup_listener(struct net *net,
				    struct inet_hashinfo *hashinfo,
				    struct sk_buff *skb, int doff,
				    const __be32 saddr, __be16 sport,
				    const __be32 daddr, const unsigned short hnum,
				    const int dif, const int sdif)
{
	struct inet_listen_hashbucket *ilb2;
	struct sock *result = NULL;
	unsigned int hash2;

	/* Lookup redirect from BPF */
	if (static_branch_unlikely(&bpf_sk_lookup_enabled) &&
	    hashinfo == net->ipv4.tcp_death_row.hashinfo) {
		result = inet_lookup_run_sk_lookup(net, IPPROTO_TCP, skb, doff,
						   saddr, sport, daddr, hnum, dif,
						   inet_ehashfn);
		if (result)
			goto done;
	}

	hash2 = ipv4_portaddr_hash(net, daddr, hnum);
	ilb2 = inet_lhash2_bucket(hashinfo, hash2);

	result = inet_lhash2_lookup(net, ilb2, skb, doff,
				    saddr, sport, daddr, hnum,
				    dif, sdif);
	if (result)
		goto done;

	/* Lookup lhash2 with INADDR_ANY */
	hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
	ilb2 = inet_lhash2_bucket(hashinfo, hash2);

	result = inet_lhash2_lookup(net, ilb2, skb, doff,
				    saddr, sport, htonl(INADDR_ANY), hnum,
				    dif, sdif);
done:
	/* Callers get NULL rather than an error pointer. */
	if (IS_ERR(result))
		return NULL;
	return result;
}
EXPORT_SYMBOL_GPL(__inet_lookup_listener);

/* All sockets share common refcount, but have different destructors
 *
 * Drop one reference on @sk. When it was the last one, free the object
 * with the routine selected by sk_state: inet_twsk_free() for
 * TCP_TIME_WAIT, reqsk_free() for TCP_NEW_SYN_RECV, sk_free() otherwise.
 */
void sock_gen_put(struct sock *sk)
{
	if (!refcount_dec_and_test(&sk->sk_refcnt))
		return;

	if (sk->sk_state == TCP_TIME_WAIT)
		inet_twsk_free(inet_twsk(sk));
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		reqsk_free(inet_reqsk(sk));
	else
		sk_free(sk);
}
EXPORT_SYMBOL_GPL(sock_gen_put);

/* Drop the reference on the socket attached to @skb via sock_gen_put(). */
void sock_edemux(struct sk_buff *skb)
{
	sock_gen_put(skb->sk);
}
EXPORT_SYMBOL(sock_edemux);

struct sock *__inet_lookup_established(struct net *net, 498c67499c0SPavel Emelyanov struct inet_hashinfo *hashinfo, 49977a5ba55SPavel Emelyanov const __be32 saddr, const __be16 sport, 50077a5ba55SPavel Emelyanov const __be32 daddr, const u16 hnum, 5013fa6f616SDavid Ahern const int dif, const int sdif) 50277a5ba55SPavel Emelyanov { 503c7228317SJoe Perches INET_ADDR_COOKIE(acookie, saddr, daddr); 50477a5ba55SPavel Emelyanov const __portpair ports = INET_COMBINED_PORTS(sport, hnum); 50577a5ba55SPavel Emelyanov struct sock *sk; 5063ab5aee7SEric Dumazet const struct hlist_nulls_node *node; 50777a5ba55SPavel Emelyanov /* Optimize here for direct hit, only listening connections can 50877a5ba55SPavel Emelyanov * have wildcards anyways. 50977a5ba55SPavel Emelyanov */ 5109f26b3adSPavel Emelyanov unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport); 511f373b53bSEric Dumazet unsigned int slot = hash & hashinfo->ehash_mask; 5123ab5aee7SEric Dumazet struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; 51377a5ba55SPavel Emelyanov 5143ab5aee7SEric Dumazet begin: 5153ab5aee7SEric Dumazet sk_nulls_for_each_rcu(sk, node, &head->chain) { 516ce43b03eSEric Dumazet if (sk->sk_hash != hash) 517ce43b03eSEric Dumazet continue; 518eda090c3SEric Dumazet if (likely(inet_match(net, sk, acookie, ports, dif, sdif))) { 51941c6d650SReshetova, Elena if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) 52005dbc7b5SEric Dumazet goto out; 521eda090c3SEric Dumazet if (unlikely(!inet_match(net, sk, acookie, 5224915d50eSEric Dumazet ports, dif, sdif))) { 52305dbc7b5SEric Dumazet sock_gen_put(sk); 5243ab5aee7SEric Dumazet goto begin; 52577a5ba55SPavel Emelyanov } 52605dbc7b5SEric Dumazet goto found; 5273ab5aee7SEric Dumazet } 5283ab5aee7SEric Dumazet } 5293ab5aee7SEric Dumazet /* 5303ab5aee7SEric Dumazet * if the nulls value we got at the end of this lookup is 5313ab5aee7SEric Dumazet * not the expected one, we must restart lookup. 
5323ab5aee7SEric Dumazet * We probably met an item that was moved to another chain. 5333ab5aee7SEric Dumazet */ 5343ab5aee7SEric Dumazet if (get_nulls_value(node) != slot) 5353ab5aee7SEric Dumazet goto begin; 53677a5ba55SPavel Emelyanov out: 53705dbc7b5SEric Dumazet sk = NULL; 53805dbc7b5SEric Dumazet found: 53977a5ba55SPavel Emelyanov return sk; 54077a5ba55SPavel Emelyanov } 54177a5ba55SPavel Emelyanov EXPORT_SYMBOL_GPL(__inet_lookup_established); 54277a5ba55SPavel Emelyanov 543a7f5e7f1SArnaldo Carvalho de Melo /* called with local bh disabled */ 544a7f5e7f1SArnaldo Carvalho de Melo static int __inet_check_established(struct inet_timewait_death_row *death_row, 545a7f5e7f1SArnaldo Carvalho de Melo struct sock *sk, __u16 lport, 546a7f5e7f1SArnaldo Carvalho de Melo struct inet_timewait_sock **twp) 547a7f5e7f1SArnaldo Carvalho de Melo { 548a7f5e7f1SArnaldo Carvalho de Melo struct inet_hashinfo *hinfo = death_row->hashinfo; 549a7f5e7f1SArnaldo Carvalho de Melo struct inet_sock *inet = inet_sk(sk); 550c720c7e8SEric Dumazet __be32 daddr = inet->inet_rcv_saddr; 551c720c7e8SEric Dumazet __be32 saddr = inet->inet_daddr; 552a7f5e7f1SArnaldo Carvalho de Melo int dif = sk->sk_bound_dev_if; 5533fa6f616SDavid Ahern struct net *net = sock_net(sk); 5543fa6f616SDavid Ahern int sdif = l3mdev_master_ifindex_by_index(net, dif); 555c7228317SJoe Perches INET_ADDR_COOKIE(acookie, saddr, daddr); 556c720c7e8SEric Dumazet const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport); 557c720c7e8SEric Dumazet unsigned int hash = inet_ehashfn(net, daddr, lport, 558c720c7e8SEric Dumazet saddr, inet->inet_dport); 559a7f5e7f1SArnaldo Carvalho de Melo struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); 5609db66bdcSEric Dumazet spinlock_t *lock = inet_ehash_lockp(hinfo, hash); 561a7f5e7f1SArnaldo Carvalho de Melo struct sock *sk2; 5623ab5aee7SEric Dumazet const struct hlist_nulls_node *node; 56305dbc7b5SEric Dumazet struct inet_timewait_sock *tw = NULL; 564a7f5e7f1SArnaldo 
Carvalho de Melo 5659db66bdcSEric Dumazet spin_lock(lock); 566a7f5e7f1SArnaldo Carvalho de Melo 5673ab5aee7SEric Dumazet sk_nulls_for_each(sk2, node, &head->chain) { 568ce43b03eSEric Dumazet if (sk2->sk_hash != hash) 569ce43b03eSEric Dumazet continue; 57005dbc7b5SEric Dumazet 571eda090c3SEric Dumazet if (likely(inet_match(net, sk2, acookie, ports, dif, sdif))) { 57205dbc7b5SEric Dumazet if (sk2->sk_state == TCP_TIME_WAIT) { 57305dbc7b5SEric Dumazet tw = inet_twsk(sk2); 57405dbc7b5SEric Dumazet if (twsk_unique(sk, sk2, twp)) 57505dbc7b5SEric Dumazet break; 57605dbc7b5SEric Dumazet } 577a7f5e7f1SArnaldo Carvalho de Melo goto not_unique; 578a7f5e7f1SArnaldo Carvalho de Melo } 57905dbc7b5SEric Dumazet } 580a7f5e7f1SArnaldo Carvalho de Melo 581a7f5e7f1SArnaldo Carvalho de Melo /* Must record num and sport now. Otherwise we will see 58205dbc7b5SEric Dumazet * in hash table socket with a funny identity. 58305dbc7b5SEric Dumazet */ 584c720c7e8SEric Dumazet inet->inet_num = lport; 585c720c7e8SEric Dumazet inet->inet_sport = htons(lport); 586a7f5e7f1SArnaldo Carvalho de Melo sk->sk_hash = hash; 587547b792cSIlpo Järvinen WARN_ON(!sk_unhashed(sk)); 5883ab5aee7SEric Dumazet __sk_nulls_add_node_rcu(sk, &head->chain); 58913475a30SEric Dumazet if (tw) { 590fc01538fSEric Dumazet sk_nulls_del_node_init_rcu((struct sock *)tw); 59102a1d6e7SEric Dumazet __NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED); 59213475a30SEric Dumazet } 5939db66bdcSEric Dumazet spin_unlock(lock); 594c29a0bc4SPavel Emelyanov sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 595a7f5e7f1SArnaldo Carvalho de Melo 596a7f5e7f1SArnaldo Carvalho de Melo if (twp) { 597a7f5e7f1SArnaldo Carvalho de Melo *twp = tw; 598a7f5e7f1SArnaldo Carvalho de Melo } else if (tw) { 599a7f5e7f1SArnaldo Carvalho de Melo /* Silly. Should hash-dance instead... 
*/ 600dbe7faa4SEric Dumazet inet_twsk_deschedule_put(tw); 601a7f5e7f1SArnaldo Carvalho de Melo } 602a7f5e7f1SArnaldo Carvalho de Melo return 0; 603a7f5e7f1SArnaldo Carvalho de Melo 604a7f5e7f1SArnaldo Carvalho de Melo not_unique: 6059db66bdcSEric Dumazet spin_unlock(lock); 606a7f5e7f1SArnaldo Carvalho de Melo return -EADDRNOTAVAIL; 607a7f5e7f1SArnaldo Carvalho de Melo } 608a7f5e7f1SArnaldo Carvalho de Melo 609b2d05756SWilly Tarreau static u64 inet_sk_port_offset(const struct sock *sk) 610a7f5e7f1SArnaldo Carvalho de Melo { 611a7f5e7f1SArnaldo Carvalho de Melo const struct inet_sock *inet = inet_sk(sk); 612e2baad9eSEric Dumazet 613c720c7e8SEric Dumazet return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr, 614c720c7e8SEric Dumazet inet->inet_daddr, 615c720c7e8SEric Dumazet inet->inet_dport); 616a7f5e7f1SArnaldo Carvalho de Melo } 617a7f5e7f1SArnaldo Carvalho de Melo 61801770a16SRicardo Dias /* Searches for an exsiting socket in the ehash bucket list. 61901770a16SRicardo Dias * Returns true if found, false otherwise. 
620079096f1SEric Dumazet */ 62101770a16SRicardo Dias static bool inet_ehash_lookup_by_sk(struct sock *sk, 62201770a16SRicardo Dias struct hlist_nulls_head *list) 62301770a16SRicardo Dias { 62401770a16SRicardo Dias const __portpair ports = INET_COMBINED_PORTS(sk->sk_dport, sk->sk_num); 62501770a16SRicardo Dias const int sdif = sk->sk_bound_dev_if; 62601770a16SRicardo Dias const int dif = sk->sk_bound_dev_if; 62701770a16SRicardo Dias const struct hlist_nulls_node *node; 62801770a16SRicardo Dias struct net *net = sock_net(sk); 62901770a16SRicardo Dias struct sock *esk; 63001770a16SRicardo Dias 63101770a16SRicardo Dias INET_ADDR_COOKIE(acookie, sk->sk_daddr, sk->sk_rcv_saddr); 63201770a16SRicardo Dias 63301770a16SRicardo Dias sk_nulls_for_each_rcu(esk, node, list) { 63401770a16SRicardo Dias if (esk->sk_hash != sk->sk_hash) 63501770a16SRicardo Dias continue; 63601770a16SRicardo Dias if (sk->sk_family == AF_INET) { 637eda090c3SEric Dumazet if (unlikely(inet_match(net, esk, acookie, 63801770a16SRicardo Dias ports, dif, sdif))) { 63901770a16SRicardo Dias return true; 64001770a16SRicardo Dias } 64101770a16SRicardo Dias } 64201770a16SRicardo Dias #if IS_ENABLED(CONFIG_IPV6) 64301770a16SRicardo Dias else if (sk->sk_family == AF_INET6) { 6445d368f03SEric Dumazet if (unlikely(inet6_match(net, esk, 64501770a16SRicardo Dias &sk->sk_v6_daddr, 64601770a16SRicardo Dias &sk->sk_v6_rcv_saddr, 64701770a16SRicardo Dias ports, dif, sdif))) { 64801770a16SRicardo Dias return true; 64901770a16SRicardo Dias } 65001770a16SRicardo Dias } 65101770a16SRicardo Dias #endif 65201770a16SRicardo Dias } 65301770a16SRicardo Dias return false; 65401770a16SRicardo Dias } 65501770a16SRicardo Dias 65601770a16SRicardo Dias /* Insert a socket into ehash, and eventually remove another one 65701770a16SRicardo Dias * (The another one can be a SYN_RECV or TIMEWAIT) 65801770a16SRicardo Dias * If an existing socket already exists, socket sk is not inserted, 65901770a16SRicardo Dias * and sets found_dup_sk 
parameter to true. 66001770a16SRicardo Dias */ 66101770a16SRicardo Dias bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk) 662152da81dSPavel Emelyanov { 663429e42c1SKuniyuki Iwashima struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); 664152da81dSPavel Emelyanov struct inet_ehash_bucket *head; 66508eaef90SKuniyuki Iwashima struct hlist_nulls_head *list; 6665b441f76SEric Dumazet spinlock_t *lock; 6675e0724d0SEric Dumazet bool ret = true; 668152da81dSPavel Emelyanov 669079096f1SEric Dumazet WARN_ON_ONCE(!sk_unhashed(sk)); 670152da81dSPavel Emelyanov 6715b441f76SEric Dumazet sk->sk_hash = sk_ehashfn(sk); 672152da81dSPavel Emelyanov head = inet_ehash_bucket(hashinfo, sk->sk_hash); 673152da81dSPavel Emelyanov list = &head->chain; 674152da81dSPavel Emelyanov lock = inet_ehash_lockp(hashinfo, sk->sk_hash); 675152da81dSPavel Emelyanov 6769db66bdcSEric Dumazet spin_lock(lock); 677fc01538fSEric Dumazet if (osk) { 6785e0724d0SEric Dumazet WARN_ON_ONCE(sk->sk_hash != osk->sk_hash); 67981b3ade5SKuniyuki Iwashima ret = sk_nulls_del_node_init_rcu(osk); 68081b3ade5SKuniyuki Iwashima } else if (found_dup_sk) { 68101770a16SRicardo Dias *found_dup_sk = inet_ehash_lookup_by_sk(sk, list); 68201770a16SRicardo Dias if (*found_dup_sk) 68301770a16SRicardo Dias ret = false; 6849327f705SEric Dumazet } 68501770a16SRicardo Dias 6865e0724d0SEric Dumazet if (ret) 6875e0724d0SEric Dumazet __sk_nulls_add_node_rcu(sk, list); 68801770a16SRicardo Dias 6899db66bdcSEric Dumazet spin_unlock(lock); 69001770a16SRicardo Dias 691079096f1SEric Dumazet return ret; 692079096f1SEric Dumazet } 693079096f1SEric Dumazet 69401770a16SRicardo Dias bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk) 695079096f1SEric Dumazet { 69601770a16SRicardo Dias bool ok = inet_ehash_insert(sk, osk, found_dup_sk); 6975e0724d0SEric Dumazet 6985e0724d0SEric Dumazet if (ok) { 699c29a0bc4SPavel Emelyanov sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 
7005e0724d0SEric Dumazet } else { 70119757cebSEric Dumazet this_cpu_inc(*sk->sk_prot->orphan_count); 702563e0bb0SYafang Shao inet_sk_set_state(sk, TCP_CLOSE); 7035e0724d0SEric Dumazet sock_set_flag(sk, SOCK_DEAD); 7045e0724d0SEric Dumazet inet_csk_destroy_sock(sk); 705152da81dSPavel Emelyanov } 7065e0724d0SEric Dumazet return ok; 7075e0724d0SEric Dumazet } 7085e0724d0SEric Dumazet EXPORT_SYMBOL_GPL(inet_ehash_nolisten); 709152da81dSPavel Emelyanov 710c125e80bSCraig Gallek static int inet_reuseport_add_sock(struct sock *sk, 711fe38d2a1SJosef Bacik struct inet_listen_hashbucket *ilb) 712c125e80bSCraig Gallek { 71390e5d0dbSCraig Gallek struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash; 7148dbd76e7SEric Dumazet const struct hlist_nulls_node *node; 715c125e80bSCraig Gallek struct sock *sk2; 716c125e80bSCraig Gallek kuid_t uid = sock_i_uid(sk); 717c125e80bSCraig Gallek 7188dbd76e7SEric Dumazet sk_nulls_for_each_rcu(sk2, node, &ilb->nulls_head) { 719c125e80bSCraig Gallek if (sk2 != sk && 720c125e80bSCraig Gallek sk2->sk_family == sk->sk_family && 721c125e80bSCraig Gallek ipv6_only_sock(sk2) == ipv6_only_sock(sk) && 722c125e80bSCraig Gallek sk2->sk_bound_dev_if == sk->sk_bound_dev_if && 72390e5d0dbSCraig Gallek inet_csk(sk2)->icsk_bind_hash == tb && 724c125e80bSCraig Gallek sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && 725fe38d2a1SJosef Bacik inet_rcv_saddr_equal(sk, sk2, false)) 7262dbb9b9eSMartin KaFai Lau return reuseport_add_sock(sk, sk2, 7272dbb9b9eSMartin KaFai Lau inet_rcv_saddr_any(sk)); 728c125e80bSCraig Gallek } 729c125e80bSCraig Gallek 7302dbb9b9eSMartin KaFai Lau return reuseport_alloc(sk, inet_rcv_saddr_any(sk)); 731c125e80bSCraig Gallek } 732c125e80bSCraig Gallek 733fe38d2a1SJosef Bacik int __inet_hash(struct sock *sk, struct sock *osk) 734152da81dSPavel Emelyanov { 735429e42c1SKuniyuki Iwashima struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); 736e8d00590SMartin KaFai Lau struct inet_listen_hashbucket *ilb2; 737c125e80bSCraig 
Gallek int err = 0; 738152da81dSPavel Emelyanov 7395e0724d0SEric Dumazet if (sk->sk_state != TCP_LISTEN) { 7404f9bf2a2SSebastian Andrzej Siewior local_bh_disable(); 74101770a16SRicardo Dias inet_ehash_nolisten(sk, osk, NULL); 7424f9bf2a2SSebastian Andrzej Siewior local_bh_enable(); 743c125e80bSCraig Gallek return 0; 7445e0724d0SEric Dumazet } 745547b792cSIlpo Järvinen WARN_ON(!sk_unhashed(sk)); 746e8d00590SMartin KaFai Lau ilb2 = inet_lhash2_bucket_sk(hashinfo, sk); 747152da81dSPavel Emelyanov 748e8d00590SMartin KaFai Lau spin_lock(&ilb2->lock); 749c125e80bSCraig Gallek if (sk->sk_reuseport) { 750cae3873cSMartin KaFai Lau err = inet_reuseport_add_sock(sk, ilb2); 751c125e80bSCraig Gallek if (err) 752c125e80bSCraig Gallek goto unlock; 753c125e80bSCraig Gallek } 754871019b2SStanislav Fomichev sock_set_flag(sk, SOCK_RCU_FREE); 755d296ba60SCraig Gallek if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && 756cae3873cSMartin KaFai Lau sk->sk_family == AF_INET6) 757cae3873cSMartin KaFai Lau __sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head); 758cae3873cSMartin KaFai Lau else 759cae3873cSMartin KaFai Lau __sk_nulls_add_node_rcu(sk, &ilb2->nulls_head); 760c29a0bc4SPavel Emelyanov sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 761c125e80bSCraig Gallek unlock: 762e8d00590SMartin KaFai Lau spin_unlock(&ilb2->lock); 763c125e80bSCraig Gallek 764c125e80bSCraig Gallek return err; 765152da81dSPavel Emelyanov } 76677a6a471SEric Dumazet EXPORT_SYMBOL(__inet_hash); 767ab1e0a13SArnaldo Carvalho de Melo 768086c653fSCraig Gallek int inet_hash(struct sock *sk) 769ab1e0a13SArnaldo Carvalho de Melo { 770c125e80bSCraig Gallek int err = 0; 771c125e80bSCraig Gallek 7724f9bf2a2SSebastian Andrzej Siewior if (sk->sk_state != TCP_CLOSE) 773fe38d2a1SJosef Bacik err = __inet_hash(sk, NULL); 774086c653fSCraig Gallek 775c125e80bSCraig Gallek return err; 776ab1e0a13SArnaldo Carvalho de Melo } 777ab1e0a13SArnaldo Carvalho de Melo EXPORT_SYMBOL_GPL(inet_hash); 778ab1e0a13SArnaldo Carvalho de Melo 
7794f9bf2a2SSebastian Andrzej Siewior void inet_unhash(struct sock *sk) 7804f9bf2a2SSebastian Andrzej Siewior { 781429e42c1SKuniyuki Iwashima struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); 7824f9bf2a2SSebastian Andrzej Siewior 7834f9bf2a2SSebastian Andrzej Siewior if (sk_unhashed(sk)) 7844f9bf2a2SSebastian Andrzej Siewior return; 7854f9bf2a2SSebastian Andrzej Siewior 7864f9bf2a2SSebastian Andrzej Siewior if (sk->sk_state == TCP_LISTEN) { 787e8d00590SMartin KaFai Lau struct inet_listen_hashbucket *ilb2; 7884f9bf2a2SSebastian Andrzej Siewior 789e8d00590SMartin KaFai Lau ilb2 = inet_lhash2_bucket_sk(hashinfo, sk); 7904f9bf2a2SSebastian Andrzej Siewior /* Don't disable bottom halves while acquiring the lock to 7914f9bf2a2SSebastian Andrzej Siewior * avoid circular locking dependency on PREEMPT_RT. 7924f9bf2a2SSebastian Andrzej Siewior */ 793e8d00590SMartin KaFai Lau spin_lock(&ilb2->lock); 794e8d00590SMartin KaFai Lau if (sk_unhashed(sk)) { 795e8d00590SMartin KaFai Lau spin_unlock(&ilb2->lock); 796e8d00590SMartin KaFai Lau return; 797e8d00590SMartin KaFai Lau } 798e8d00590SMartin KaFai Lau 799e8d00590SMartin KaFai Lau if (rcu_access_pointer(sk->sk_reuseport_cb)) 800e8d00590SMartin KaFai Lau reuseport_stop_listen_sock(sk); 801e8d00590SMartin KaFai Lau 802e8d00590SMartin KaFai Lau __sk_nulls_del_node_init_rcu(sk); 803e8d00590SMartin KaFai Lau sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 804e8d00590SMartin KaFai Lau spin_unlock(&ilb2->lock); 8054f9bf2a2SSebastian Andrzej Siewior } else { 8064f9bf2a2SSebastian Andrzej Siewior spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash); 8074f9bf2a2SSebastian Andrzej Siewior 8084f9bf2a2SSebastian Andrzej Siewior spin_lock_bh(lock); 809e8d00590SMartin KaFai Lau if (sk_unhashed(sk)) { 810e8d00590SMartin KaFai Lau spin_unlock_bh(lock); 811e8d00590SMartin KaFai Lau return; 812e8d00590SMartin KaFai Lau } 813e8d00590SMartin KaFai Lau __sk_nulls_del_node_init_rcu(sk); 814e8d00590SMartin KaFai Lau 
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 815920de804SEric Dumazet spin_unlock_bh(lock); 816ab1e0a13SArnaldo Carvalho de Melo } 8174f9bf2a2SSebastian Andrzej Siewior } 818ab1e0a13SArnaldo Carvalho de Melo EXPORT_SYMBOL_GPL(inet_unhash); 819152da81dSPavel Emelyanov 82028044fc1SJoanne Koong static bool inet_bind2_bucket_match(const struct inet_bind2_bucket *tb, 82128044fc1SJoanne Koong const struct net *net, unsigned short port, 82228044fc1SJoanne Koong int l3mdev, const struct sock *sk) 82328044fc1SJoanne Koong { 824c6d27706SKuniyuki Iwashima if (!net_eq(ib2_net(tb), net) || tb->port != port || 825c6d27706SKuniyuki Iwashima tb->l3mdev != l3mdev) 826c6d27706SKuniyuki Iwashima return false; 827c6d27706SKuniyuki Iwashima 8288702cf12SKuniyuki Iwashima return inet_bind2_bucket_addr_match(tb, sk); 82928044fc1SJoanne Koong } 83028044fc1SJoanne Koong 83128044fc1SJoanne Koong bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb, const struct net *net, 83228044fc1SJoanne Koong unsigned short port, int l3mdev, const struct sock *sk) 83328044fc1SJoanne Koong { 834c6d27706SKuniyuki Iwashima if (!net_eq(ib2_net(tb), net) || tb->port != port || 835c6d27706SKuniyuki Iwashima tb->l3mdev != l3mdev) 836c6d27706SKuniyuki Iwashima return false; 837c6d27706SKuniyuki Iwashima 83828044fc1SJoanne Koong #if IS_ENABLED(CONFIG_IPV6) 839d9ba9934SKuniyuki Iwashima if (sk->sk_family != tb->family) { 840d9ba9934SKuniyuki Iwashima if (sk->sk_family == AF_INET) 841aa99e5f8SKuniyuki Iwashima return ipv6_addr_any(&tb->v6_rcv_saddr) || 842aa99e5f8SKuniyuki Iwashima ipv6_addr_v4mapped_any(&tb->v6_rcv_saddr); 843d9ba9934SKuniyuki Iwashima 844*5e07e672SKuniyuki Iwashima return ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr) && 845*5e07e672SKuniyuki Iwashima tb->rcv_saddr == 0; 846d9ba9934SKuniyuki Iwashima } 8475456262dSMartin KaFai Lau 84828044fc1SJoanne Koong if (sk->sk_family == AF_INET6) 849c6d27706SKuniyuki Iwashima return ipv6_addr_any(&tb->v6_rcv_saddr); 85028044fc1SJoanne 
Koong #endif 851c6d27706SKuniyuki Iwashima return tb->rcv_saddr == 0; 85228044fc1SJoanne Koong } 85328044fc1SJoanne Koong 85428044fc1SJoanne Koong /* The socket's bhash2 hashbucket spinlock must be held when this is called */ 85528044fc1SJoanne Koong struct inet_bind2_bucket * 85628044fc1SJoanne Koong inet_bind2_bucket_find(const struct inet_bind_hashbucket *head, const struct net *net, 85728044fc1SJoanne Koong unsigned short port, int l3mdev, const struct sock *sk) 85828044fc1SJoanne Koong { 85928044fc1SJoanne Koong struct inet_bind2_bucket *bhash2 = NULL; 86028044fc1SJoanne Koong 86128044fc1SJoanne Koong inet_bind_bucket_for_each(bhash2, &head->chain) 86228044fc1SJoanne Koong if (inet_bind2_bucket_match(bhash2, net, port, l3mdev, sk)) 86328044fc1SJoanne Koong break; 86428044fc1SJoanne Koong 86528044fc1SJoanne Koong return bhash2; 86628044fc1SJoanne Koong } 86728044fc1SJoanne Koong 86828044fc1SJoanne Koong struct inet_bind_hashbucket * 86928044fc1SJoanne Koong inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, int port) 87028044fc1SJoanne Koong { 871429e42c1SKuniyuki Iwashima struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); 87228044fc1SJoanne Koong u32 hash; 87328044fc1SJoanne Koong 8748cdc3223SKuniyuki Iwashima #if IS_ENABLED(CONFIG_IPV6) 87528044fc1SJoanne Koong if (sk->sk_family == AF_INET6) 8768cdc3223SKuniyuki Iwashima hash = ipv6_portaddr_hash(net, &in6addr_any, port); 87728044fc1SJoanne Koong else 87828044fc1SJoanne Koong #endif 87928044fc1SJoanne Koong hash = ipv4_portaddr_hash(net, 0, port); 88028044fc1SJoanne Koong 88128044fc1SJoanne Koong return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)]; 88228044fc1SJoanne Koong } 88328044fc1SJoanne Koong 8848c5dae4cSKuniyuki Iwashima static void inet_update_saddr(struct sock *sk, void *saddr, int family) 8858c5dae4cSKuniyuki Iwashima { 8868c5dae4cSKuniyuki Iwashima if (family == AF_INET) { 8878c5dae4cSKuniyuki Iwashima inet_sk(sk)->inet_saddr = *(__be32 *)saddr; 
8888c5dae4cSKuniyuki Iwashima sk_rcv_saddr_set(sk, inet_sk(sk)->inet_saddr); 8898c5dae4cSKuniyuki Iwashima } 8908c5dae4cSKuniyuki Iwashima #if IS_ENABLED(CONFIG_IPV6) 8918c5dae4cSKuniyuki Iwashima else { 8928c5dae4cSKuniyuki Iwashima sk->sk_v6_rcv_saddr = *(struct in6_addr *)saddr; 8938c5dae4cSKuniyuki Iwashima } 8948c5dae4cSKuniyuki Iwashima #endif 8958c5dae4cSKuniyuki Iwashima } 8968c5dae4cSKuniyuki Iwashima 897e0833d1fSKuniyuki Iwashima static int __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family, bool reset) 89828044fc1SJoanne Koong { 899429e42c1SKuniyuki Iwashima struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); 9008c5dae4cSKuniyuki Iwashima struct inet_bind_hashbucket *head, *head2; 90128044fc1SJoanne Koong struct inet_bind2_bucket *tb2, *new_tb2; 90228044fc1SJoanne Koong int l3mdev = inet_sk_bound_l3mdev(sk); 90328044fc1SJoanne Koong int port = inet_sk(sk)->inet_num; 90428044fc1SJoanne Koong struct net *net = sock_net(sk); 9058c5dae4cSKuniyuki Iwashima int bhash; 9068c5dae4cSKuniyuki Iwashima 9078c5dae4cSKuniyuki Iwashima if (!inet_csk(sk)->icsk_bind2_hash) { 9088c5dae4cSKuniyuki Iwashima /* Not bind()ed before. */ 909e0833d1fSKuniyuki Iwashima if (reset) 910e0833d1fSKuniyuki Iwashima inet_reset_saddr(sk); 911e0833d1fSKuniyuki Iwashima else 9128c5dae4cSKuniyuki Iwashima inet_update_saddr(sk, saddr, family); 913e0833d1fSKuniyuki Iwashima 9148c5dae4cSKuniyuki Iwashima return 0; 9158c5dae4cSKuniyuki Iwashima } 91628044fc1SJoanne Koong 91728044fc1SJoanne Koong /* Allocate a bind2 bucket ahead of time to avoid permanently putting 91828044fc1SJoanne Koong * the bhash2 table in an inconsistent state if a new tb2 bucket 91928044fc1SJoanne Koong * allocation fails. 
92028044fc1SJoanne Koong */ 92128044fc1SJoanne Koong new_tb2 = kmem_cache_alloc(hinfo->bind2_bucket_cachep, GFP_ATOMIC); 922e0833d1fSKuniyuki Iwashima if (!new_tb2) { 923e0833d1fSKuniyuki Iwashima if (reset) { 924e0833d1fSKuniyuki Iwashima /* The (INADDR_ANY, port) bucket might have already 925e0833d1fSKuniyuki Iwashima * been freed, then we cannot fixup icsk_bind2_hash, 926e0833d1fSKuniyuki Iwashima * so we give up and unlink sk from bhash/bhash2 not 927e0833d1fSKuniyuki Iwashima * to leave inconsistency in bhash2. 928e0833d1fSKuniyuki Iwashima */ 929e0833d1fSKuniyuki Iwashima inet_put_port(sk); 930e0833d1fSKuniyuki Iwashima inet_reset_saddr(sk); 931e0833d1fSKuniyuki Iwashima } 932e0833d1fSKuniyuki Iwashima 93328044fc1SJoanne Koong return -ENOMEM; 934e0833d1fSKuniyuki Iwashima } 93528044fc1SJoanne Koong 9368c5dae4cSKuniyuki Iwashima bhash = inet_bhashfn(net, port, hinfo->bhash_size); 9378c5dae4cSKuniyuki Iwashima head = &hinfo->bhash[bhash]; 93828044fc1SJoanne Koong head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); 93928044fc1SJoanne Koong 9408c5dae4cSKuniyuki Iwashima /* If we change saddr locklessly, another thread 9418c5dae4cSKuniyuki Iwashima * iterating over bhash might see corrupted address. 
9428c5dae4cSKuniyuki Iwashima */ 9438c5dae4cSKuniyuki Iwashima spin_lock_bh(&head->lock); 9448c5dae4cSKuniyuki Iwashima 9458c5dae4cSKuniyuki Iwashima spin_lock(&head2->lock); 94628044fc1SJoanne Koong __sk_del_bind2_node(sk); 9478acdad37SKuniyuki Iwashima inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, inet_csk(sk)->icsk_bind2_hash); 9488c5dae4cSKuniyuki Iwashima spin_unlock(&head2->lock); 94928044fc1SJoanne Koong 950e0833d1fSKuniyuki Iwashima if (reset) 951e0833d1fSKuniyuki Iwashima inet_reset_saddr(sk); 952e0833d1fSKuniyuki Iwashima else 9538c5dae4cSKuniyuki Iwashima inet_update_saddr(sk, saddr, family); 9548c5dae4cSKuniyuki Iwashima 9558c5dae4cSKuniyuki Iwashima head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); 9568c5dae4cSKuniyuki Iwashima 9578c5dae4cSKuniyuki Iwashima spin_lock(&head2->lock); 95828044fc1SJoanne Koong tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk); 95928044fc1SJoanne Koong if (!tb2) { 96028044fc1SJoanne Koong tb2 = new_tb2; 96128044fc1SJoanne Koong inet_bind2_bucket_init(tb2, net, head2, port, l3mdev, sk); 96228044fc1SJoanne Koong } 96328044fc1SJoanne Koong sk_add_bind2_node(sk, &tb2->owners); 96428044fc1SJoanne Koong inet_csk(sk)->icsk_bind2_hash = tb2; 9658c5dae4cSKuniyuki Iwashima spin_unlock(&head2->lock); 9668c5dae4cSKuniyuki Iwashima 9678c5dae4cSKuniyuki Iwashima spin_unlock_bh(&head->lock); 96828044fc1SJoanne Koong 96928044fc1SJoanne Koong if (tb2 != new_tb2) 97028044fc1SJoanne Koong kmem_cache_free(hinfo->bind2_bucket_cachep, new_tb2); 97128044fc1SJoanne Koong 97228044fc1SJoanne Koong return 0; 97328044fc1SJoanne Koong } 974e0833d1fSKuniyuki Iwashima 975e0833d1fSKuniyuki Iwashima int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family) 976e0833d1fSKuniyuki Iwashima { 977e0833d1fSKuniyuki Iwashima return __inet_bhash2_update_saddr(sk, saddr, family, false); 978e0833d1fSKuniyuki Iwashima } 97928044fc1SJoanne Koong EXPORT_SYMBOL_GPL(inet_bhash2_update_saddr); 98028044fc1SJoanne Koong 981e0833d1fSKuniyuki 
Iwashima void inet_bhash2_reset_saddr(struct sock *sk) 982e0833d1fSKuniyuki Iwashima { 983e0833d1fSKuniyuki Iwashima if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) 984e0833d1fSKuniyuki Iwashima __inet_bhash2_update_saddr(sk, NULL, 0, true); 985e0833d1fSKuniyuki Iwashima } 986e0833d1fSKuniyuki Iwashima EXPORT_SYMBOL_GPL(inet_bhash2_reset_saddr); 987e0833d1fSKuniyuki Iwashima 988190cc824SEric Dumazet /* RFC 6056 3.3.4. Algorithm 4: Double-Hash Port Selection Algorithm 989190cc824SEric Dumazet * Note that we use 32bit integers (vs RFC 'short integers') 990190cc824SEric Dumazet * because 2^16 is not a multiple of num_ephemeral and this 991190cc824SEric Dumazet * property might be used by clever attacker. 992aeac4ec8SGleb Mazovetskiy * 9934c2c8f03SWilly Tarreau * RFC claims using TABLE_LENGTH=10 buckets gives an improvement, though 994aeac4ec8SGleb Mazovetskiy * attacks were since demonstrated, thus we use 65536 by default instead 995aeac4ec8SGleb Mazovetskiy * to really give more isolation and privacy, at the expense of 256kB 996aeac4ec8SGleb Mazovetskiy * of kernel memory. 
997190cc824SEric Dumazet */ 998aeac4ec8SGleb Mazovetskiy #define INET_TABLE_PERTURB_SIZE (1 << CONFIG_INET_TABLE_PERTURB_ORDER) 999e9261476SWilly Tarreau static u32 *table_perturb; 1000190cc824SEric Dumazet 10015ee31fc1SPavel Emelyanov int __inet_hash_connect(struct inet_timewait_death_row *death_row, 1002b2d05756SWilly Tarreau struct sock *sk, u64 port_offset, 10035ee31fc1SPavel Emelyanov int (*check_established)(struct inet_timewait_death_row *, 1004b4d6444eSEric Dumazet struct sock *, __u16, struct inet_timewait_sock **)) 1005a7f5e7f1SArnaldo Carvalho de Melo { 1006a7f5e7f1SArnaldo Carvalho de Melo struct inet_hashinfo *hinfo = death_row->hashinfo; 100728044fc1SJoanne Koong struct inet_bind_hashbucket *head, *head2; 1008a7f5e7f1SArnaldo Carvalho de Melo struct inet_timewait_sock *tw = NULL; 10091580ab63SEric Dumazet int port = inet_sk(sk)->inet_num; 10101580ab63SEric Dumazet struct net *net = sock_net(sk); 101128044fc1SJoanne Koong struct inet_bind2_bucket *tb2; 10121580ab63SEric Dumazet struct inet_bind_bucket *tb; 101328044fc1SJoanne Koong bool tb_created = false; 10141580ab63SEric Dumazet u32 remaining, offset; 10151580ab63SEric Dumazet int ret, i, low, high; 101620718485SEric Dumazet bool local_ports; 101720718485SEric Dumazet int step, l3mdev; 1018190cc824SEric Dumazet u32 index; 10191580ab63SEric Dumazet 10201580ab63SEric Dumazet if (port) { 102121cbd90aSPietro Borrello local_bh_disable(); 10221580ab63SEric Dumazet ret = check_established(death_row, sk, port, NULL); 10231580ab63SEric Dumazet local_bh_enable(); 10241580ab63SEric Dumazet return ret; 10251580ab63SEric Dumazet } 1026a7f5e7f1SArnaldo Carvalho de Melo 10273c82a21fSRobert Shearman l3mdev = inet_sk_bound_l3mdev(sk); 10283c82a21fSRobert Shearman 102920718485SEric Dumazet local_ports = inet_sk_get_local_port_range(sk, &low, &high); 103020718485SEric Dumazet step = local_ports ? 
1 : 2; 103120718485SEric Dumazet 10321580ab63SEric Dumazet high++; /* [32768, 60999] -> [32768, 61000[ */ 10331580ab63SEric Dumazet remaining = high - low; 103420718485SEric Dumazet if (!local_ports && remaining > 1) 10351580ab63SEric Dumazet remaining &= ~1U; 1036227b60f5SStephen Hemminger 10372a4187f4SJason A. Donenfeld get_random_sleepable_once(table_perturb, 1038e9261476SWilly Tarreau INET_TABLE_PERTURB_SIZE * sizeof(*table_perturb)); 1039e8161345SWilly Tarreau index = port_offset & (INET_TABLE_PERTURB_SIZE - 1); 1040190cc824SEric Dumazet 10419e9b70aeSWilly Tarreau offset = READ_ONCE(table_perturb[index]) + (port_offset >> 32); 1042b2d05756SWilly Tarreau offset %= remaining; 1043b2d05756SWilly Tarreau 10441580ab63SEric Dumazet /* In first pass we try ports of @low parity. 10451580ab63SEric Dumazet * inet_csk_get_port() does the opposite choice. 104607f4c900SEric Dumazet */ 104720718485SEric Dumazet if (!local_ports) 10481580ab63SEric Dumazet offset &= ~1U; 10491580ab63SEric Dumazet other_parity_scan: 10501580ab63SEric Dumazet port = low + offset; 105120718485SEric Dumazet for (i = 0; i < remaining; i += step, port += step) { 10521580ab63SEric Dumazet if (unlikely(port >= high)) 10531580ab63SEric Dumazet port -= remaining; 1054122ff243SWANG Cong if (inet_is_local_reserved_port(net, port)) 1055e3826f1eSAmerigo Wang continue; 10567f635ab7SPavel Emelyanov head = &hinfo->bhash[inet_bhashfn(net, port, 10577f635ab7SPavel Emelyanov hinfo->bhash_size)]; 10581580ab63SEric Dumazet spin_lock_bh(&head->lock); 1059a7f5e7f1SArnaldo Carvalho de Melo 10601580ab63SEric Dumazet /* Does not bother with rcv_saddr checks, because 10611580ab63SEric Dumazet * the established check is already unique enough. 
1062a7f5e7f1SArnaldo Carvalho de Melo */ 1063b67bfe0dSSasha Levin inet_bind_bucket_for_each(tb, &head->chain) { 106428044fc1SJoanne Koong if (inet_bind_bucket_match(tb, net, port, l3mdev)) { 1065da5e3630STom Herbert if (tb->fastreuse >= 0 || 1066da5e3630STom Herbert tb->fastreuseport >= 0) 1067a7f5e7f1SArnaldo Carvalho de Melo goto next_port; 1068a9d8f911SEvgeniy Polyakov WARN_ON(hlist_empty(&tb->owners)); 10695ee31fc1SPavel Emelyanov if (!check_established(death_row, sk, 10705ee31fc1SPavel Emelyanov port, &tw)) 1071a7f5e7f1SArnaldo Carvalho de Melo goto ok; 1072a7f5e7f1SArnaldo Carvalho de Melo goto next_port; 1073a7f5e7f1SArnaldo Carvalho de Melo } 1074a7f5e7f1SArnaldo Carvalho de Melo } 1075a7f5e7f1SArnaldo Carvalho de Melo 1076941b1d22SPavel Emelyanov tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, 10773c82a21fSRobert Shearman net, head, port, l3mdev); 1078a7f5e7f1SArnaldo Carvalho de Melo if (!tb) { 10791580ab63SEric Dumazet spin_unlock_bh(&head->lock); 10801580ab63SEric Dumazet return -ENOMEM; 1081a7f5e7f1SArnaldo Carvalho de Melo } 108228044fc1SJoanne Koong tb_created = true; 1083a7f5e7f1SArnaldo Carvalho de Melo tb->fastreuse = -1; 1084da5e3630STom Herbert tb->fastreuseport = -1; 1085a7f5e7f1SArnaldo Carvalho de Melo goto ok; 1086a7f5e7f1SArnaldo Carvalho de Melo next_port: 10871580ab63SEric Dumazet spin_unlock_bh(&head->lock); 10881580ab63SEric Dumazet cond_resched(); 1089a7f5e7f1SArnaldo Carvalho de Melo } 10901580ab63SEric Dumazet 109120718485SEric Dumazet if (!local_ports) { 10921580ab63SEric Dumazet offset++; 10931580ab63SEric Dumazet if ((offset & 1) && remaining > 1) 10941580ab63SEric Dumazet goto other_parity_scan; 109520718485SEric Dumazet } 1096a7f5e7f1SArnaldo Carvalho de Melo return -EADDRNOTAVAIL; 1097a7f5e7f1SArnaldo Carvalho de Melo 1098a7f5e7f1SArnaldo Carvalho de Melo ok: 109928044fc1SJoanne Koong /* Find the corresponding tb2 bucket since we need to 110028044fc1SJoanne Koong * add the socket to the bhash2 table as well 
110128044fc1SJoanne Koong */ 110228044fc1SJoanne Koong head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); 110328044fc1SJoanne Koong spin_lock(&head2->lock); 110428044fc1SJoanne Koong 110528044fc1SJoanne Koong tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk); 110628044fc1SJoanne Koong if (!tb2) { 110728044fc1SJoanne Koong tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep, net, 110828044fc1SJoanne Koong head2, port, l3mdev, sk); 110928044fc1SJoanne Koong if (!tb2) 111028044fc1SJoanne Koong goto error; 111128044fc1SJoanne Koong } 111228044fc1SJoanne Koong 1113ca7af040SWilly Tarreau /* Here we want to add a little bit of randomness to the next source 1114ca7af040SWilly Tarreau * port that will be chosen. We use a max() with a random here so that 1115ca7af040SWilly Tarreau * on low contention the randomness is maximal and on high contention 1116ca7af040SWilly Tarreau * it may be inexistent. 1117c579bd1bSEric Dumazet */ 111820718485SEric Dumazet i = max_t(int, i, get_random_u32_below(8) * step); 111920718485SEric Dumazet WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + step); 1120a7f5e7f1SArnaldo Carvalho de Melo 1121a7f5e7f1SArnaldo Carvalho de Melo /* Head lock still held and bh's disabled */ 112228044fc1SJoanne Koong inet_bind_hash(sk, tb, tb2, port); 112328044fc1SJoanne Koong 1124a7f5e7f1SArnaldo Carvalho de Melo if (sk_unhashed(sk)) { 1125c720c7e8SEric Dumazet inet_sk(sk)->inet_sport = htons(port); 112601770a16SRicardo Dias inet_ehash_nolisten(sk, (struct sock *)tw, NULL); 1127a7f5e7f1SArnaldo Carvalho de Melo } 11283cdaedaeSEric Dumazet if (tw) 1129fc01538fSEric Dumazet inet_twsk_bind_unhash(tw, hinfo); 1130936a192fSKuniyuki Iwashima 1131936a192fSKuniyuki Iwashima spin_unlock(&head2->lock); 1132a7f5e7f1SArnaldo Carvalho de Melo spin_unlock(&head->lock); 1133936a192fSKuniyuki Iwashima 1134dbe7faa4SEric Dumazet if (tw) 1135dbe7faa4SEric Dumazet inet_twsk_deschedule_put(tw); 1136a7f5e7f1SArnaldo Carvalho de Melo 
local_bh_enable(); 11371580ab63SEric Dumazet return 0; 113828044fc1SJoanne Koong 113928044fc1SJoanne Koong error: 114028044fc1SJoanne Koong spin_unlock(&head2->lock); 114128044fc1SJoanne Koong if (tb_created) 114228044fc1SJoanne Koong inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, tb); 114328044fc1SJoanne Koong spin_unlock_bh(&head->lock); 114428044fc1SJoanne Koong return -ENOMEM; 1145a7f5e7f1SArnaldo Carvalho de Melo } 11465ee31fc1SPavel Emelyanov 11475ee31fc1SPavel Emelyanov /* 11485ee31fc1SPavel Emelyanov * Bind a port for a connect operation and hash it. 11495ee31fc1SPavel Emelyanov */ 11505ee31fc1SPavel Emelyanov int inet_hash_connect(struct inet_timewait_death_row *death_row, 11515ee31fc1SPavel Emelyanov struct sock *sk) 11525ee31fc1SPavel Emelyanov { 1153b2d05756SWilly Tarreau u64 port_offset = 0; 1154e2baad9eSEric Dumazet 1155e2baad9eSEric Dumazet if (!inet_sk(sk)->inet_num) 1156e2baad9eSEric Dumazet port_offset = inet_sk_port_offset(sk); 1157e2baad9eSEric Dumazet return __inet_hash_connect(death_row, sk, port_offset, 1158b4d6444eSEric Dumazet __inet_check_established); 11595ee31fc1SPavel Emelyanov } 1160a7f5e7f1SArnaldo Carvalho de Melo EXPORT_SYMBOL_GPL(inet_hash_connect); 11615caea4eaSEric Dumazet 1162c92c81dfSPeter Oskolkov static void init_hashinfo_lhash2(struct inet_hashinfo *h) 1163c92c81dfSPeter Oskolkov { 1164c92c81dfSPeter Oskolkov int i; 1165c92c81dfSPeter Oskolkov 1166c92c81dfSPeter Oskolkov for (i = 0; i <= h->lhash2_mask; i++) { 1167c92c81dfSPeter Oskolkov spin_lock_init(&h->lhash2[i].lock); 1168cae3873cSMartin KaFai Lau INIT_HLIST_NULLS_HEAD(&h->lhash2[i].nulls_head, 1169cae3873cSMartin KaFai Lau i + LISTENING_NULLS_BASE); 1170c92c81dfSPeter Oskolkov } 1171c92c81dfSPeter Oskolkov } 1172c92c81dfSPeter Oskolkov 117361b7c691SMartin KaFai Lau void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, 117461b7c691SMartin KaFai Lau unsigned long numentries, int scale, 117561b7c691SMartin KaFai Lau unsigned long low_limit, 
117661b7c691SMartin KaFai Lau unsigned long high_limit) 117761b7c691SMartin KaFai Lau { 117861b7c691SMartin KaFai Lau h->lhash2 = alloc_large_system_hash(name, 117961b7c691SMartin KaFai Lau sizeof(*h->lhash2), 118061b7c691SMartin KaFai Lau numentries, 118161b7c691SMartin KaFai Lau scale, 118261b7c691SMartin KaFai Lau 0, 118361b7c691SMartin KaFai Lau NULL, 118461b7c691SMartin KaFai Lau &h->lhash2_mask, 118561b7c691SMartin KaFai Lau low_limit, 118661b7c691SMartin KaFai Lau high_limit); 1187c92c81dfSPeter Oskolkov init_hashinfo_lhash2(h); 1188e9261476SWilly Tarreau 1189e9261476SWilly Tarreau /* this one is used for source ports of outgoing connections */ 1190e67b72b9SMuchun Song table_perturb = alloc_large_system_hash("Table-perturb", 1191e67b72b9SMuchun Song sizeof(*table_perturb), 1192e67b72b9SMuchun Song INET_TABLE_PERTURB_SIZE, 1193e67b72b9SMuchun Song 0, 0, NULL, NULL, 1194e67b72b9SMuchun Song INET_TABLE_PERTURB_SIZE, 1195e67b72b9SMuchun Song INET_TABLE_PERTURB_SIZE); 1196c92c81dfSPeter Oskolkov } 119761b7c691SMartin KaFai Lau 1198c92c81dfSPeter Oskolkov int inet_hashinfo2_init_mod(struct inet_hashinfo *h) 1199c92c81dfSPeter Oskolkov { 1200c92c81dfSPeter Oskolkov h->lhash2 = kmalloc_array(INET_LHTABLE_SIZE, sizeof(*h->lhash2), GFP_KERNEL); 1201c92c81dfSPeter Oskolkov if (!h->lhash2) 1202c92c81dfSPeter Oskolkov return -ENOMEM; 1203c92c81dfSPeter Oskolkov 1204c92c81dfSPeter Oskolkov h->lhash2_mask = INET_LHTABLE_SIZE - 1; 1205c92c81dfSPeter Oskolkov /* INET_LHTABLE_SIZE must be a power of 2 */ 1206c92c81dfSPeter Oskolkov BUG_ON(INET_LHTABLE_SIZE & h->lhash2_mask); 1207c92c81dfSPeter Oskolkov 1208c92c81dfSPeter Oskolkov init_hashinfo_lhash2(h); 1209c92c81dfSPeter Oskolkov return 0; 121061b7c691SMartin KaFai Lau } 1211c92c81dfSPeter Oskolkov EXPORT_SYMBOL_GPL(inet_hashinfo2_init_mod); 121261b7c691SMartin KaFai Lau 1213095dc8e0SEric Dumazet int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) 1214095dc8e0SEric Dumazet { 121589e478a2SEric Dumazet unsigned int 
locksz = sizeof(spinlock_t); 1216095dc8e0SEric Dumazet unsigned int i, nblocks = 1; 1217095dc8e0SEric Dumazet 121889e478a2SEric Dumazet if (locksz != 0) { 1219095dc8e0SEric Dumazet /* allocate 2 cache lines or at least one spinlock per cpu */ 122089e478a2SEric Dumazet nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U); 1221095dc8e0SEric Dumazet nblocks = roundup_pow_of_two(nblocks * num_possible_cpus()); 1222095dc8e0SEric Dumazet 1223095dc8e0SEric Dumazet /* no more locks than number of hash buckets */ 1224095dc8e0SEric Dumazet nblocks = min(nblocks, hashinfo->ehash_mask + 1); 1225095dc8e0SEric Dumazet 1226752ade68SMichal Hocko hashinfo->ehash_locks = kvmalloc_array(nblocks, locksz, GFP_KERNEL); 1227095dc8e0SEric Dumazet if (!hashinfo->ehash_locks) 1228095dc8e0SEric Dumazet return -ENOMEM; 1229095dc8e0SEric Dumazet 1230095dc8e0SEric Dumazet for (i = 0; i < nblocks; i++) 1231095dc8e0SEric Dumazet spin_lock_init(&hashinfo->ehash_locks[i]); 1232095dc8e0SEric Dumazet } 1233095dc8e0SEric Dumazet hashinfo->ehash_locks_mask = nblocks - 1; 1234095dc8e0SEric Dumazet return 0; 1235095dc8e0SEric Dumazet } 1236095dc8e0SEric Dumazet EXPORT_SYMBOL_GPL(inet_ehash_locks_alloc); 1237d1e5e640SKuniyuki Iwashima 1238d1e5e640SKuniyuki Iwashima struct inet_hashinfo *inet_pernet_hashinfo_alloc(struct inet_hashinfo *hashinfo, 1239d1e5e640SKuniyuki Iwashima unsigned int ehash_entries) 1240d1e5e640SKuniyuki Iwashima { 1241d1e5e640SKuniyuki Iwashima struct inet_hashinfo *new_hashinfo; 1242d1e5e640SKuniyuki Iwashima int i; 1243d1e5e640SKuniyuki Iwashima 1244d1e5e640SKuniyuki Iwashima new_hashinfo = kmemdup(hashinfo, sizeof(*hashinfo), GFP_KERNEL); 1245d1e5e640SKuniyuki Iwashima if (!new_hashinfo) 1246d1e5e640SKuniyuki Iwashima goto err; 1247d1e5e640SKuniyuki Iwashima 1248d1e5e640SKuniyuki Iwashima new_hashinfo->ehash = vmalloc_huge(ehash_entries * sizeof(struct inet_ehash_bucket), 1249d1e5e640SKuniyuki Iwashima GFP_KERNEL_ACCOUNT); 1250d1e5e640SKuniyuki Iwashima if (!new_hashinfo->ehash) 
1251d1e5e640SKuniyuki Iwashima goto free_hashinfo; 1252d1e5e640SKuniyuki Iwashima 1253d1e5e640SKuniyuki Iwashima new_hashinfo->ehash_mask = ehash_entries - 1; 1254d1e5e640SKuniyuki Iwashima 1255d1e5e640SKuniyuki Iwashima if (inet_ehash_locks_alloc(new_hashinfo)) 1256d1e5e640SKuniyuki Iwashima goto free_ehash; 1257d1e5e640SKuniyuki Iwashima 1258d1e5e640SKuniyuki Iwashima for (i = 0; i < ehash_entries; i++) 1259d1e5e640SKuniyuki Iwashima INIT_HLIST_NULLS_HEAD(&new_hashinfo->ehash[i].chain, i); 1260d1e5e640SKuniyuki Iwashima 1261d1e5e640SKuniyuki Iwashima new_hashinfo->pernet = true; 1262d1e5e640SKuniyuki Iwashima 1263d1e5e640SKuniyuki Iwashima return new_hashinfo; 1264d1e5e640SKuniyuki Iwashima 1265d1e5e640SKuniyuki Iwashima free_ehash: 1266d1e5e640SKuniyuki Iwashima vfree(new_hashinfo->ehash); 1267d1e5e640SKuniyuki Iwashima free_hashinfo: 1268d1e5e640SKuniyuki Iwashima kfree(new_hashinfo); 1269d1e5e640SKuniyuki Iwashima err: 1270d1e5e640SKuniyuki Iwashima return NULL; 1271d1e5e640SKuniyuki Iwashima } 1272d1e5e640SKuniyuki Iwashima EXPORT_SYMBOL_GPL(inet_pernet_hashinfo_alloc); 1273d1e5e640SKuniyuki Iwashima 1274d1e5e640SKuniyuki Iwashima void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo) 1275d1e5e640SKuniyuki Iwashima { 1276d1e5e640SKuniyuki Iwashima if (!hashinfo->pernet) 1277d1e5e640SKuniyuki Iwashima return; 1278d1e5e640SKuniyuki Iwashima 1279d1e5e640SKuniyuki Iwashima inet_ehash_locks_free(hashinfo); 1280d1e5e640SKuniyuki Iwashima vfree(hashinfo->ehash); 1281d1e5e640SKuniyuki Iwashima kfree(hashinfo); 1282d1e5e640SKuniyuki Iwashima } 1283d1e5e640SKuniyuki Iwashima EXPORT_SYMBOL_GPL(inet_pernet_hashinfo_free); 1284