// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic INET transport hashtables
 *
 * Authors:	Lotsa people, from code originally in tcp
 */

#include <linux/module.h>
#include <linux/random.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/wait.h>
#include <linux/vmalloc.h>
#include <linux/memblock.h>

#include <net/addrconf.h>
#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <net/inet6_hashtables.h>
#endif
Miller #include <net/secure_seq.h> 27a7f5e7f1SArnaldo Carvalho de Melo #include <net/ip.h> 28a04a480dSDavid Ahern #include <net/tcp.h> 29c125e80bSCraig Gallek #include <net/sock_reuseport.h> 3077d8bf9cSArnaldo Carvalho de Melo 316eada011SEric Dumazet static u32 inet_ehashfn(const struct net *net, const __be32 laddr, 3265cd8033SHannes Frederic Sowa const __u16 lport, const __be32 faddr, 3365cd8033SHannes Frederic Sowa const __be16 fport) 3465cd8033SHannes Frederic Sowa { 351bbdceefSHannes Frederic Sowa static u32 inet_ehash_secret __read_mostly; 361bbdceefSHannes Frederic Sowa 371bbdceefSHannes Frederic Sowa net_get_random_once(&inet_ehash_secret, sizeof(inet_ehash_secret)); 381bbdceefSHannes Frederic Sowa 3965cd8033SHannes Frederic Sowa return __inet_ehashfn(laddr, lport, faddr, fport, 4065cd8033SHannes Frederic Sowa inet_ehash_secret + net_hash_mix(net)); 4165cd8033SHannes Frederic Sowa } 4265cd8033SHannes Frederic Sowa 43d1e559d0SEric Dumazet /* This function handles inet_sock, but also timewait and request sockets 44d1e559d0SEric Dumazet * for IPv4/IPv6. 45d1e559d0SEric Dumazet */ 46784c372aSEric Dumazet static u32 sk_ehashfn(const struct sock *sk) 4765cd8033SHannes Frederic Sowa { 48d1e559d0SEric Dumazet #if IS_ENABLED(CONFIG_IPV6) 49d1e559d0SEric Dumazet if (sk->sk_family == AF_INET6 && 50d1e559d0SEric Dumazet !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) 51d1e559d0SEric Dumazet return inet6_ehashfn(sock_net(sk), 52d1e559d0SEric Dumazet &sk->sk_v6_rcv_saddr, sk->sk_num, 53d1e559d0SEric Dumazet &sk->sk_v6_daddr, sk->sk_dport); 54d1e559d0SEric Dumazet #endif 555b441f76SEric Dumazet return inet_ehashfn(sock_net(sk), 565b441f76SEric Dumazet sk->sk_rcv_saddr, sk->sk_num, 575b441f76SEric Dumazet sk->sk_daddr, sk->sk_dport); 5865cd8033SHannes Frederic Sowa } 5965cd8033SHannes Frederic Sowa 6077d8bf9cSArnaldo Carvalho de Melo /* 6177d8bf9cSArnaldo Carvalho de Melo * Allocate and initialize a new local port bind bucket. 
6277d8bf9cSArnaldo Carvalho de Melo * The bindhash mutex for snum's hash chain must be held here. 6377d8bf9cSArnaldo Carvalho de Melo */ 64e18b890bSChristoph Lameter struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, 65941b1d22SPavel Emelyanov struct net *net, 6677d8bf9cSArnaldo Carvalho de Melo struct inet_bind_hashbucket *head, 673c82a21fSRobert Shearman const unsigned short snum, 683c82a21fSRobert Shearman int l3mdev) 6977d8bf9cSArnaldo Carvalho de Melo { 7054e6ecb2SChristoph Lameter struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC); 7177d8bf9cSArnaldo Carvalho de Melo 7200db4124SIan Morris if (tb) { 73efd7ef1cSEric W. Biederman write_pnet(&tb->ib_net, net); 743c82a21fSRobert Shearman tb->l3mdev = l3mdev; 7577d8bf9cSArnaldo Carvalho de Melo tb->port = snum; 7677d8bf9cSArnaldo Carvalho de Melo tb->fastreuse = 0; 77da5e3630STom Herbert tb->fastreuseport = 0; 7877d8bf9cSArnaldo Carvalho de Melo INIT_HLIST_HEAD(&tb->owners); 7977d8bf9cSArnaldo Carvalho de Melo hlist_add_head(&tb->node, &head->chain); 8077d8bf9cSArnaldo Carvalho de Melo } 8177d8bf9cSArnaldo Carvalho de Melo return tb; 8277d8bf9cSArnaldo Carvalho de Melo } 8377d8bf9cSArnaldo Carvalho de Melo 8477d8bf9cSArnaldo Carvalho de Melo /* 8577d8bf9cSArnaldo Carvalho de Melo * Caller must hold hashbucket lock for this tb with local BH disabled 8677d8bf9cSArnaldo Carvalho de Melo */ 87e18b890bSChristoph Lameter void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb) 8877d8bf9cSArnaldo Carvalho de Melo { 8977d8bf9cSArnaldo Carvalho de Melo if (hlist_empty(&tb->owners)) { 9077d8bf9cSArnaldo Carvalho de Melo __hlist_del(&tb->node); 9177d8bf9cSArnaldo Carvalho de Melo kmem_cache_free(cachep, tb); 9277d8bf9cSArnaldo Carvalho de Melo } 9377d8bf9cSArnaldo Carvalho de Melo } 942d8c4ce5SArnaldo Carvalho de Melo 9528044fc1SJoanne Koong bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, const struct net *net, 9628044fc1SJoanne Koong 
unsigned short port, int l3mdev) 972d8c4ce5SArnaldo Carvalho de Melo { 9828044fc1SJoanne Koong return net_eq(ib_net(tb), net) && tb->port == port && 9928044fc1SJoanne Koong tb->l3mdev == l3mdev; 10028044fc1SJoanne Koong } 10128044fc1SJoanne Koong 10228044fc1SJoanne Koong static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb, 10328044fc1SJoanne Koong struct net *net, 10428044fc1SJoanne Koong struct inet_bind_hashbucket *head, 10528044fc1SJoanne Koong unsigned short port, int l3mdev, 10628044fc1SJoanne Koong const struct sock *sk) 10728044fc1SJoanne Koong { 10828044fc1SJoanne Koong write_pnet(&tb->ib_net, net); 10928044fc1SJoanne Koong tb->l3mdev = l3mdev; 11028044fc1SJoanne Koong tb->port = port; 11128044fc1SJoanne Koong #if IS_ENABLED(CONFIG_IPV6) 1125456262dSMartin KaFai Lau tb->family = sk->sk_family; 11328044fc1SJoanne Koong if (sk->sk_family == AF_INET6) 11428044fc1SJoanne Koong tb->v6_rcv_saddr = sk->sk_v6_rcv_saddr; 11528044fc1SJoanne Koong else 11628044fc1SJoanne Koong #endif 11728044fc1SJoanne Koong tb->rcv_saddr = sk->sk_rcv_saddr; 11828044fc1SJoanne Koong INIT_HLIST_HEAD(&tb->owners); 11928044fc1SJoanne Koong hlist_add_head(&tb->node, &head->chain); 12028044fc1SJoanne Koong } 12128044fc1SJoanne Koong 12228044fc1SJoanne Koong struct inet_bind2_bucket *inet_bind2_bucket_create(struct kmem_cache *cachep, 12328044fc1SJoanne Koong struct net *net, 12428044fc1SJoanne Koong struct inet_bind_hashbucket *head, 12528044fc1SJoanne Koong unsigned short port, 12628044fc1SJoanne Koong int l3mdev, 12728044fc1SJoanne Koong const struct sock *sk) 12828044fc1SJoanne Koong { 12928044fc1SJoanne Koong struct inet_bind2_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC); 13028044fc1SJoanne Koong 13128044fc1SJoanne Koong if (tb) 13228044fc1SJoanne Koong inet_bind2_bucket_init(tb, net, head, port, l3mdev, sk); 13328044fc1SJoanne Koong 13428044fc1SJoanne Koong return tb; 13528044fc1SJoanne Koong } 13628044fc1SJoanne Koong 13728044fc1SJoanne Koong /* Caller must hold 
hashbucket lock for this tb with local BH disabled */ 13828044fc1SJoanne Koong void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb) 13928044fc1SJoanne Koong { 14028044fc1SJoanne Koong if (hlist_empty(&tb->owners)) { 14128044fc1SJoanne Koong __hlist_del(&tb->node); 14228044fc1SJoanne Koong kmem_cache_free(cachep, tb); 14328044fc1SJoanne Koong } 14428044fc1SJoanne Koong } 14528044fc1SJoanne Koong 14628044fc1SJoanne Koong static bool inet_bind2_bucket_addr_match(const struct inet_bind2_bucket *tb2, 14728044fc1SJoanne Koong const struct sock *sk) 14828044fc1SJoanne Koong { 14928044fc1SJoanne Koong #if IS_ENABLED(CONFIG_IPV6) 1505456262dSMartin KaFai Lau if (sk->sk_family != tb2->family) 1515456262dSMartin KaFai Lau return false; 1525456262dSMartin KaFai Lau 15328044fc1SJoanne Koong if (sk->sk_family == AF_INET6) 15428044fc1SJoanne Koong return ipv6_addr_equal(&tb2->v6_rcv_saddr, 15528044fc1SJoanne Koong &sk->sk_v6_rcv_saddr); 15628044fc1SJoanne Koong #endif 15728044fc1SJoanne Koong return tb2->rcv_saddr == sk->sk_rcv_saddr; 15828044fc1SJoanne Koong } 15928044fc1SJoanne Koong 16028044fc1SJoanne Koong void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, 16128044fc1SJoanne Koong struct inet_bind2_bucket *tb2, unsigned short port) 16228044fc1SJoanne Koong { 16328044fc1SJoanne Koong inet_sk(sk)->inet_num = port; 1642d8c4ce5SArnaldo Carvalho de Melo sk_add_bind_node(sk, &tb->owners); 165463c84b9SArnaldo Carvalho de Melo inet_csk(sk)->icsk_bind_hash = tb; 16628044fc1SJoanne Koong sk_add_bind2_node(sk, &tb2->owners); 16728044fc1SJoanne Koong inet_csk(sk)->icsk_bind2_hash = tb2; 1682d8c4ce5SArnaldo Carvalho de Melo } 1692d8c4ce5SArnaldo Carvalho de Melo 1702d8c4ce5SArnaldo Carvalho de Melo /* 1712d8c4ce5SArnaldo Carvalho de Melo * Get rid of any references to a local port held by the given sock. 
1722d8c4ce5SArnaldo Carvalho de Melo */ 173ab1e0a13SArnaldo Carvalho de Melo static void __inet_put_port(struct sock *sk) 1742d8c4ce5SArnaldo Carvalho de Melo { 175429e42c1SKuniyuki Iwashima struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); 17608eaef90SKuniyuki Iwashima struct inet_bind_hashbucket *head, *head2; 17708eaef90SKuniyuki Iwashima struct net *net = sock_net(sk); 1782d8c4ce5SArnaldo Carvalho de Melo struct inet_bind_bucket *tb; 17908eaef90SKuniyuki Iwashima int bhash; 18008eaef90SKuniyuki Iwashima 18108eaef90SKuniyuki Iwashima bhash = inet_bhashfn(net, inet_sk(sk)->inet_num, hashinfo->bhash_size); 18208eaef90SKuniyuki Iwashima head = &hashinfo->bhash[bhash]; 18308eaef90SKuniyuki Iwashima head2 = inet_bhashfn_portaddr(hashinfo, sk, net, inet_sk(sk)->inet_num); 1842d8c4ce5SArnaldo Carvalho de Melo 1852d8c4ce5SArnaldo Carvalho de Melo spin_lock(&head->lock); 186463c84b9SArnaldo Carvalho de Melo tb = inet_csk(sk)->icsk_bind_hash; 1872d8c4ce5SArnaldo Carvalho de Melo __sk_del_bind_node(sk); 188463c84b9SArnaldo Carvalho de Melo inet_csk(sk)->icsk_bind_hash = NULL; 189c720c7e8SEric Dumazet inet_sk(sk)->inet_num = 0; 1902d8c4ce5SArnaldo Carvalho de Melo inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); 19128044fc1SJoanne Koong 19228044fc1SJoanne Koong spin_lock(&head2->lock); 19328044fc1SJoanne Koong if (inet_csk(sk)->icsk_bind2_hash) { 19428044fc1SJoanne Koong struct inet_bind2_bucket *tb2 = inet_csk(sk)->icsk_bind2_hash; 19528044fc1SJoanne Koong 19628044fc1SJoanne Koong __sk_del_bind2_node(sk); 19728044fc1SJoanne Koong inet_csk(sk)->icsk_bind2_hash = NULL; 19828044fc1SJoanne Koong inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2); 19928044fc1SJoanne Koong } 20028044fc1SJoanne Koong spin_unlock(&head2->lock); 20128044fc1SJoanne Koong 2022d8c4ce5SArnaldo Carvalho de Melo spin_unlock(&head->lock); 2032d8c4ce5SArnaldo Carvalho de Melo } 2042d8c4ce5SArnaldo Carvalho de Melo 205ab1e0a13SArnaldo Carvalho de Melo void 
inet_put_port(struct sock *sk) 2062d8c4ce5SArnaldo Carvalho de Melo { 2072d8c4ce5SArnaldo Carvalho de Melo local_bh_disable(); 208ab1e0a13SArnaldo Carvalho de Melo __inet_put_port(sk); 2092d8c4ce5SArnaldo Carvalho de Melo local_bh_enable(); 2102d8c4ce5SArnaldo Carvalho de Melo } 2112d8c4ce5SArnaldo Carvalho de Melo EXPORT_SYMBOL(inet_put_port); 212f3f05f70SArnaldo Carvalho de Melo 2131ce31c9eSEric Dumazet int __inet_inherit_port(const struct sock *sk, struct sock *child) 21453083773SPavel Emelyanov { 215429e42c1SKuniyuki Iwashima struct inet_hashinfo *table = tcp_or_dccp_get_hashinfo(sk); 216093d2823SBalazs Scheidler unsigned short port = inet_sk(child)->inet_num; 21708eaef90SKuniyuki Iwashima struct inet_bind_hashbucket *head, *head2; 21828044fc1SJoanne Koong bool created_inet_bind_bucket = false; 21928044fc1SJoanne Koong struct net *net = sock_net(sk); 22008eaef90SKuniyuki Iwashima bool update_fastreuse = false; 22128044fc1SJoanne Koong struct inet_bind2_bucket *tb2; 22253083773SPavel Emelyanov struct inet_bind_bucket *tb; 22308eaef90SKuniyuki Iwashima int bhash, l3mdev; 22408eaef90SKuniyuki Iwashima 22508eaef90SKuniyuki Iwashima bhash = inet_bhashfn(net, port, table->bhash_size); 22608eaef90SKuniyuki Iwashima head = &table->bhash[bhash]; 22708eaef90SKuniyuki Iwashima head2 = inet_bhashfn_portaddr(table, child, net, port); 22853083773SPavel Emelyanov 22953083773SPavel Emelyanov spin_lock(&head->lock); 23028044fc1SJoanne Koong spin_lock(&head2->lock); 23153083773SPavel Emelyanov tb = inet_csk(sk)->icsk_bind_hash; 23228044fc1SJoanne Koong tb2 = inet_csk(sk)->icsk_bind2_hash; 23328044fc1SJoanne Koong if (unlikely(!tb || !tb2)) { 23428044fc1SJoanne Koong spin_unlock(&head2->lock); 235c2f34a65SEric Dumazet spin_unlock(&head->lock); 236c2f34a65SEric Dumazet return -ENOENT; 237c2f34a65SEric Dumazet } 238093d2823SBalazs Scheidler if (tb->port != port) { 2393c82a21fSRobert Shearman l3mdev = inet_sk_bound_l3mdev(sk); 2403c82a21fSRobert Shearman 241093d2823SBalazs Scheidler 
/* NOTE: using tproxy and redirecting skbs to a proxy 242093d2823SBalazs Scheidler * on a different listener port breaks the assumption 243093d2823SBalazs Scheidler * that the listener socket's icsk_bind_hash is the same 244093d2823SBalazs Scheidler * as that of the child socket. We have to look up or 245093d2823SBalazs Scheidler * create a new bind bucket for the child here. */ 246b67bfe0dSSasha Levin inet_bind_bucket_for_each(tb, &head->chain) { 24728044fc1SJoanne Koong if (inet_bind_bucket_match(tb, net, port, l3mdev)) 248093d2823SBalazs Scheidler break; 249093d2823SBalazs Scheidler } 250b67bfe0dSSasha Levin if (!tb) { 251093d2823SBalazs Scheidler tb = inet_bind_bucket_create(table->bind_bucket_cachep, 25228044fc1SJoanne Koong net, head, port, l3mdev); 253093d2823SBalazs Scheidler if (!tb) { 25428044fc1SJoanne Koong spin_unlock(&head2->lock); 255093d2823SBalazs Scheidler spin_unlock(&head->lock); 256093d2823SBalazs Scheidler return -ENOMEM; 257093d2823SBalazs Scheidler } 25828044fc1SJoanne Koong created_inet_bind_bucket = true; 259093d2823SBalazs Scheidler } 26028044fc1SJoanne Koong update_fastreuse = true; 26128044fc1SJoanne Koong 26228044fc1SJoanne Koong goto bhash2_find; 26328044fc1SJoanne Koong } else if (!inet_bind2_bucket_addr_match(tb2, child)) { 26428044fc1SJoanne Koong l3mdev = inet_sk_bound_l3mdev(sk); 26528044fc1SJoanne Koong 26628044fc1SJoanne Koong bhash2_find: 26728044fc1SJoanne Koong tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, child); 26828044fc1SJoanne Koong if (!tb2) { 26928044fc1SJoanne Koong tb2 = inet_bind2_bucket_create(table->bind2_bucket_cachep, 27028044fc1SJoanne Koong net, head2, port, 27128044fc1SJoanne Koong l3mdev, child); 27228044fc1SJoanne Koong if (!tb2) 27328044fc1SJoanne Koong goto error; 27428044fc1SJoanne Koong } 27528044fc1SJoanne Koong } 27628044fc1SJoanne Koong if (update_fastreuse) 277d76f3351STim Froidcoeur inet_csk_update_fastreuse(tb, child); 27828044fc1SJoanne Koong inet_bind_hash(child, tb, tb2, port); 
27928044fc1SJoanne Koong spin_unlock(&head2->lock); 28053083773SPavel Emelyanov spin_unlock(&head->lock); 281093d2823SBalazs Scheidler 282093d2823SBalazs Scheidler return 0; 28328044fc1SJoanne Koong 28428044fc1SJoanne Koong error: 28528044fc1SJoanne Koong if (created_inet_bind_bucket) 28628044fc1SJoanne Koong inet_bind_bucket_destroy(table->bind_bucket_cachep, tb); 28728044fc1SJoanne Koong spin_unlock(&head2->lock); 28828044fc1SJoanne Koong spin_unlock(&head->lock); 28928044fc1SJoanne Koong return -ENOMEM; 29053083773SPavel Emelyanov } 29153083773SPavel Emelyanov EXPORT_SYMBOL_GPL(__inet_inherit_port); 29253083773SPavel Emelyanov 29361b7c691SMartin KaFai Lau static struct inet_listen_hashbucket * 29461b7c691SMartin KaFai Lau inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk) 29561b7c691SMartin KaFai Lau { 29661b7c691SMartin KaFai Lau u32 hash; 29761b7c691SMartin KaFai Lau 29861b7c691SMartin KaFai Lau #if IS_ENABLED(CONFIG_IPV6) 29961b7c691SMartin KaFai Lau if (sk->sk_family == AF_INET6) 30061b7c691SMartin KaFai Lau hash = ipv6_portaddr_hash(sock_net(sk), 30161b7c691SMartin KaFai Lau &sk->sk_v6_rcv_saddr, 30261b7c691SMartin KaFai Lau inet_sk(sk)->inet_num); 30361b7c691SMartin KaFai Lau else 30461b7c691SMartin KaFai Lau #endif 30561b7c691SMartin KaFai Lau hash = ipv4_portaddr_hash(sock_net(sk), 30661b7c691SMartin KaFai Lau inet_sk(sk)->inet_rcv_saddr, 30761b7c691SMartin KaFai Lau inet_sk(sk)->inet_num); 30861b7c691SMartin KaFai Lau return inet_lhash2_bucket(h, hash); 30961b7c691SMartin KaFai Lau } 31061b7c691SMartin KaFai Lau 311c25eb3bfSEric Dumazet static inline int compute_score(struct sock *sk, struct net *net, 312c25eb3bfSEric Dumazet const unsigned short hnum, const __be32 daddr, 31334e1ec31SMiaohe Lin const int dif, const int sdif) 314c25eb3bfSEric Dumazet { 315c25eb3bfSEric Dumazet int score = -1; 316c25eb3bfSEric Dumazet 317d9fbc7f6SPeter Oskolkov if (net_eq(sock_net(sk), net) && sk->sk_num == hnum && 318c25eb3bfSEric Dumazet 
!ipv6_only_sock(sk)) { 319d9fbc7f6SPeter Oskolkov if (sk->sk_rcv_saddr != daddr) 320c25eb3bfSEric Dumazet return -1; 321e7819058SMike Manning 322d9fbc7f6SPeter Oskolkov if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) 323d9fbc7f6SPeter Oskolkov return -1; 3248d6c414cSMike Manning score = sk->sk_bound_dev_if ? 2 : 1; 325d9fbc7f6SPeter Oskolkov 3268d6c414cSMike Manning if (sk->sk_family == PF_INET) 3278d6c414cSMike Manning score++; 3287170a977SEric Dumazet if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) 32970da268bSEric Dumazet score++; 330c25eb3bfSEric Dumazet } 331c25eb3bfSEric Dumazet return score; 332c25eb3bfSEric Dumazet } 333c25eb3bfSEric Dumazet 33480b373f7SJakub Sitnicki static inline struct sock *lookup_reuseport(struct net *net, struct sock *sk, 33580b373f7SJakub Sitnicki struct sk_buff *skb, int doff, 33680b373f7SJakub Sitnicki __be32 saddr, __be16 sport, 33780b373f7SJakub Sitnicki __be32 daddr, unsigned short hnum) 33880b373f7SJakub Sitnicki { 33980b373f7SJakub Sitnicki struct sock *reuse_sk = NULL; 34080b373f7SJakub Sitnicki u32 phash; 34180b373f7SJakub Sitnicki 34280b373f7SJakub Sitnicki if (sk->sk_reuseport) { 34380b373f7SJakub Sitnicki phash = inet_ehashfn(net, daddr, hnum, saddr, sport); 34480b373f7SJakub Sitnicki reuse_sk = reuseport_select_sock(sk, phash, skb, doff); 34580b373f7SJakub Sitnicki } 34680b373f7SJakub Sitnicki return reuse_sk; 34780b373f7SJakub Sitnicki } 34880b373f7SJakub Sitnicki 349f3f05f70SArnaldo Carvalho de Melo /* 3503b24d854SEric Dumazet * Here are some nice properties to exploit here. The BSD API 3513b24d854SEric Dumazet * does not allow a listening sock to specify the remote port nor the 35233b62231SArnaldo Carvalho de Melo * remote address for the connection. So always assume those are both 35333b62231SArnaldo Carvalho de Melo * wildcarded during the search since they can never be otherwise. 
35433b62231SArnaldo Carvalho de Melo */ 35533b62231SArnaldo Carvalho de Melo 3563b24d854SEric Dumazet /* called with rcu_read_lock() : No refcount taken on the socket */ 35761b7c691SMartin KaFai Lau static struct sock *inet_lhash2_lookup(struct net *net, 35861b7c691SMartin KaFai Lau struct inet_listen_hashbucket *ilb2, 35961b7c691SMartin KaFai Lau struct sk_buff *skb, int doff, 36061b7c691SMartin KaFai Lau const __be32 saddr, __be16 sport, 36161b7c691SMartin KaFai Lau const __be32 daddr, const unsigned short hnum, 36261b7c691SMartin KaFai Lau const int dif, const int sdif) 36361b7c691SMartin KaFai Lau { 36461b7c691SMartin KaFai Lau struct sock *sk, *result = NULL; 365cae3873cSMartin KaFai Lau struct hlist_nulls_node *node; 36661b7c691SMartin KaFai Lau int score, hiscore = 0; 36761b7c691SMartin KaFai Lau 368cae3873cSMartin KaFai Lau sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) { 36934e1ec31SMiaohe Lin score = compute_score(sk, net, hnum, daddr, dif, sdif); 37061b7c691SMartin KaFai Lau if (score > hiscore) { 37180b373f7SJakub Sitnicki result = lookup_reuseport(net, sk, skb, doff, 37280b373f7SJakub Sitnicki saddr, sport, daddr, hnum); 37361b7c691SMartin KaFai Lau if (result) 37461b7c691SMartin KaFai Lau return result; 37580b373f7SJakub Sitnicki 37661b7c691SMartin KaFai Lau result = sk; 37761b7c691SMartin KaFai Lau hiscore = score; 37861b7c691SMartin KaFai Lau } 37961b7c691SMartin KaFai Lau } 38061b7c691SMartin KaFai Lau 38161b7c691SMartin KaFai Lau return result; 38261b7c691SMartin KaFai Lau } 38361b7c691SMartin KaFai Lau 3841559b4aaSJakub Sitnicki static inline struct sock *inet_lookup_run_bpf(struct net *net, 3851559b4aaSJakub Sitnicki struct inet_hashinfo *hashinfo, 3861559b4aaSJakub Sitnicki struct sk_buff *skb, int doff, 3871559b4aaSJakub Sitnicki __be32 saddr, __be16 sport, 388f8931565SMark Pashmfouroush __be32 daddr, u16 hnum, const int dif) 3891559b4aaSJakub Sitnicki { 3901559b4aaSJakub Sitnicki struct sock *sk, *reuse_sk; 3911559b4aaSJakub Sitnicki bool 
no_reuseport; 3921559b4aaSJakub Sitnicki 3934461568aSKuniyuki Iwashima if (hashinfo != net->ipv4.tcp_death_row.hashinfo) 3941559b4aaSJakub Sitnicki return NULL; /* only TCP is supported */ 3951559b4aaSJakub Sitnicki 396f8931565SMark Pashmfouroush no_reuseport = bpf_sk_lookup_run_v4(net, IPPROTO_TCP, saddr, sport, 397f8931565SMark Pashmfouroush daddr, hnum, dif, &sk); 3981559b4aaSJakub Sitnicki if (no_reuseport || IS_ERR_OR_NULL(sk)) 3991559b4aaSJakub Sitnicki return sk; 4001559b4aaSJakub Sitnicki 4011559b4aaSJakub Sitnicki reuse_sk = lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum); 4021559b4aaSJakub Sitnicki if (reuse_sk) 4031559b4aaSJakub Sitnicki sk = reuse_sk; 4041559b4aaSJakub Sitnicki return sk; 4051559b4aaSJakub Sitnicki } 4061559b4aaSJakub Sitnicki 407c67499c0SPavel Emelyanov struct sock *__inet_lookup_listener(struct net *net, 408c67499c0SPavel Emelyanov struct inet_hashinfo *hashinfo, 409a583636aSCraig Gallek struct sk_buff *skb, int doff, 410da5e3630STom Herbert const __be32 saddr, __be16 sport, 411fb99c848SAl Viro const __be32 daddr, const unsigned short hnum, 4123fa6f616SDavid Ahern const int dif, const int sdif) 41399a92ff5SHerbert Xu { 41461b7c691SMartin KaFai Lau struct inet_listen_hashbucket *ilb2; 415d9fbc7f6SPeter Oskolkov struct sock *result = NULL; 41661b7c691SMartin KaFai Lau unsigned int hash2; 41761b7c691SMartin KaFai Lau 4181559b4aaSJakub Sitnicki /* Lookup redirect from BPF */ 4191559b4aaSJakub Sitnicki if (static_branch_unlikely(&bpf_sk_lookup_enabled)) { 4201559b4aaSJakub Sitnicki result = inet_lookup_run_bpf(net, hashinfo, skb, doff, 421f8931565SMark Pashmfouroush saddr, sport, daddr, hnum, dif); 4221559b4aaSJakub Sitnicki if (result) 4231559b4aaSJakub Sitnicki goto done; 4241559b4aaSJakub Sitnicki } 4251559b4aaSJakub Sitnicki 42661b7c691SMartin KaFai Lau hash2 = ipv4_portaddr_hash(net, daddr, hnum); 42761b7c691SMartin KaFai Lau ilb2 = inet_lhash2_bucket(hashinfo, hash2); 42861b7c691SMartin KaFai Lau 42961b7c691SMartin 
KaFai Lau result = inet_lhash2_lookup(net, ilb2, skb, doff, 43061b7c691SMartin KaFai Lau saddr, sport, daddr, hnum, 43161b7c691SMartin KaFai Lau dif, sdif); 43261b7c691SMartin KaFai Lau if (result) 4338217ca65SMartin KaFai Lau goto done; 43461b7c691SMartin KaFai Lau 43561b7c691SMartin KaFai Lau /* Lookup lhash2 with INADDR_ANY */ 43661b7c691SMartin KaFai Lau hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum); 43761b7c691SMartin KaFai Lau ilb2 = inet_lhash2_bucket(hashinfo, hash2); 43861b7c691SMartin KaFai Lau 4398217ca65SMartin KaFai Lau result = inet_lhash2_lookup(net, ilb2, skb, doff, 440d9fbc7f6SPeter Oskolkov saddr, sport, htonl(INADDR_ANY), hnum, 44161b7c691SMartin KaFai Lau dif, sdif); 4428217ca65SMartin KaFai Lau done: 44388e235b8SEnrico Weigelt if (IS_ERR(result)) 4448217ca65SMartin KaFai Lau return NULL; 445c25eb3bfSEric Dumazet return result; 44699a92ff5SHerbert Xu } 4478f491069SHerbert Xu EXPORT_SYMBOL_GPL(__inet_lookup_listener); 448a7f5e7f1SArnaldo Carvalho de Melo 44905dbc7b5SEric Dumazet /* All sockets share common refcount, but have different destructors */ 45005dbc7b5SEric Dumazet void sock_gen_put(struct sock *sk) 45105dbc7b5SEric Dumazet { 45241c6d650SReshetova, Elena if (!refcount_dec_and_test(&sk->sk_refcnt)) 45305dbc7b5SEric Dumazet return; 45405dbc7b5SEric Dumazet 45505dbc7b5SEric Dumazet if (sk->sk_state == TCP_TIME_WAIT) 45605dbc7b5SEric Dumazet inet_twsk_free(inet_twsk(sk)); 45741b822c5SEric Dumazet else if (sk->sk_state == TCP_NEW_SYN_RECV) 45841b822c5SEric Dumazet reqsk_free(inet_reqsk(sk)); 45905dbc7b5SEric Dumazet else 46005dbc7b5SEric Dumazet sk_free(sk); 46105dbc7b5SEric Dumazet } 46205dbc7b5SEric Dumazet EXPORT_SYMBOL_GPL(sock_gen_put); 46305dbc7b5SEric Dumazet 4642c13270bSEric Dumazet void sock_edemux(struct sk_buff *skb) 4652c13270bSEric Dumazet { 4662c13270bSEric Dumazet sock_gen_put(skb->sk); 4672c13270bSEric Dumazet } 4682c13270bSEric Dumazet EXPORT_SYMBOL(sock_edemux); 4692c13270bSEric Dumazet 470c67499c0SPavel Emelyanov 
struct sock *__inet_lookup_established(struct net *net, 471c67499c0SPavel Emelyanov struct inet_hashinfo *hashinfo, 47277a5ba55SPavel Emelyanov const __be32 saddr, const __be16 sport, 47377a5ba55SPavel Emelyanov const __be32 daddr, const u16 hnum, 4743fa6f616SDavid Ahern const int dif, const int sdif) 47577a5ba55SPavel Emelyanov { 476c7228317SJoe Perches INET_ADDR_COOKIE(acookie, saddr, daddr); 47777a5ba55SPavel Emelyanov const __portpair ports = INET_COMBINED_PORTS(sport, hnum); 47877a5ba55SPavel Emelyanov struct sock *sk; 4793ab5aee7SEric Dumazet const struct hlist_nulls_node *node; 48077a5ba55SPavel Emelyanov /* Optimize here for direct hit, only listening connections can 48177a5ba55SPavel Emelyanov * have wildcards anyways. 48277a5ba55SPavel Emelyanov */ 4839f26b3adSPavel Emelyanov unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport); 484f373b53bSEric Dumazet unsigned int slot = hash & hashinfo->ehash_mask; 4853ab5aee7SEric Dumazet struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; 48677a5ba55SPavel Emelyanov 4873ab5aee7SEric Dumazet begin: 4883ab5aee7SEric Dumazet sk_nulls_for_each_rcu(sk, node, &head->chain) { 489ce43b03eSEric Dumazet if (sk->sk_hash != hash) 490ce43b03eSEric Dumazet continue; 491eda090c3SEric Dumazet if (likely(inet_match(net, sk, acookie, ports, dif, sdif))) { 49241c6d650SReshetova, Elena if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) 49305dbc7b5SEric Dumazet goto out; 494eda090c3SEric Dumazet if (unlikely(!inet_match(net, sk, acookie, 4954915d50eSEric Dumazet ports, dif, sdif))) { 49605dbc7b5SEric Dumazet sock_gen_put(sk); 4973ab5aee7SEric Dumazet goto begin; 49877a5ba55SPavel Emelyanov } 49905dbc7b5SEric Dumazet goto found; 5003ab5aee7SEric Dumazet } 5013ab5aee7SEric Dumazet } 5023ab5aee7SEric Dumazet /* 5033ab5aee7SEric Dumazet * if the nulls value we got at the end of this lookup is 5043ab5aee7SEric Dumazet * not the expected one, we must restart lookup. 
5053ab5aee7SEric Dumazet * We probably met an item that was moved to another chain. 5063ab5aee7SEric Dumazet */ 5073ab5aee7SEric Dumazet if (get_nulls_value(node) != slot) 5083ab5aee7SEric Dumazet goto begin; 50977a5ba55SPavel Emelyanov out: 51005dbc7b5SEric Dumazet sk = NULL; 51105dbc7b5SEric Dumazet found: 51277a5ba55SPavel Emelyanov return sk; 51377a5ba55SPavel Emelyanov } 51477a5ba55SPavel Emelyanov EXPORT_SYMBOL_GPL(__inet_lookup_established); 51577a5ba55SPavel Emelyanov 516a7f5e7f1SArnaldo Carvalho de Melo /* called with local bh disabled */ 517a7f5e7f1SArnaldo Carvalho de Melo static int __inet_check_established(struct inet_timewait_death_row *death_row, 518a7f5e7f1SArnaldo Carvalho de Melo struct sock *sk, __u16 lport, 519a7f5e7f1SArnaldo Carvalho de Melo struct inet_timewait_sock **twp) 520a7f5e7f1SArnaldo Carvalho de Melo { 521a7f5e7f1SArnaldo Carvalho de Melo struct inet_hashinfo *hinfo = death_row->hashinfo; 522a7f5e7f1SArnaldo Carvalho de Melo struct inet_sock *inet = inet_sk(sk); 523c720c7e8SEric Dumazet __be32 daddr = inet->inet_rcv_saddr; 524c720c7e8SEric Dumazet __be32 saddr = inet->inet_daddr; 525a7f5e7f1SArnaldo Carvalho de Melo int dif = sk->sk_bound_dev_if; 5263fa6f616SDavid Ahern struct net *net = sock_net(sk); 5273fa6f616SDavid Ahern int sdif = l3mdev_master_ifindex_by_index(net, dif); 528c7228317SJoe Perches INET_ADDR_COOKIE(acookie, saddr, daddr); 529c720c7e8SEric Dumazet const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport); 530c720c7e8SEric Dumazet unsigned int hash = inet_ehashfn(net, daddr, lport, 531c720c7e8SEric Dumazet saddr, inet->inet_dport); 532a7f5e7f1SArnaldo Carvalho de Melo struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); 5339db66bdcSEric Dumazet spinlock_t *lock = inet_ehash_lockp(hinfo, hash); 534a7f5e7f1SArnaldo Carvalho de Melo struct sock *sk2; 5353ab5aee7SEric Dumazet const struct hlist_nulls_node *node; 53605dbc7b5SEric Dumazet struct inet_timewait_sock *tw = NULL; 537a7f5e7f1SArnaldo 
Carvalho de Melo 5389db66bdcSEric Dumazet spin_lock(lock); 539a7f5e7f1SArnaldo Carvalho de Melo 5403ab5aee7SEric Dumazet sk_nulls_for_each(sk2, node, &head->chain) { 541ce43b03eSEric Dumazet if (sk2->sk_hash != hash) 542ce43b03eSEric Dumazet continue; 54305dbc7b5SEric Dumazet 544eda090c3SEric Dumazet if (likely(inet_match(net, sk2, acookie, ports, dif, sdif))) { 54505dbc7b5SEric Dumazet if (sk2->sk_state == TCP_TIME_WAIT) { 54605dbc7b5SEric Dumazet tw = inet_twsk(sk2); 54705dbc7b5SEric Dumazet if (twsk_unique(sk, sk2, twp)) 54805dbc7b5SEric Dumazet break; 54905dbc7b5SEric Dumazet } 550a7f5e7f1SArnaldo Carvalho de Melo goto not_unique; 551a7f5e7f1SArnaldo Carvalho de Melo } 55205dbc7b5SEric Dumazet } 553a7f5e7f1SArnaldo Carvalho de Melo 554a7f5e7f1SArnaldo Carvalho de Melo /* Must record num and sport now. Otherwise we will see 55505dbc7b5SEric Dumazet * in hash table socket with a funny identity. 55605dbc7b5SEric Dumazet */ 557c720c7e8SEric Dumazet inet->inet_num = lport; 558c720c7e8SEric Dumazet inet->inet_sport = htons(lport); 559a7f5e7f1SArnaldo Carvalho de Melo sk->sk_hash = hash; 560547b792cSIlpo Järvinen WARN_ON(!sk_unhashed(sk)); 5613ab5aee7SEric Dumazet __sk_nulls_add_node_rcu(sk, &head->chain); 56213475a30SEric Dumazet if (tw) { 563fc01538fSEric Dumazet sk_nulls_del_node_init_rcu((struct sock *)tw); 56402a1d6e7SEric Dumazet __NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED); 56513475a30SEric Dumazet } 5669db66bdcSEric Dumazet spin_unlock(lock); 567c29a0bc4SPavel Emelyanov sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 568a7f5e7f1SArnaldo Carvalho de Melo 569a7f5e7f1SArnaldo Carvalho de Melo if (twp) { 570a7f5e7f1SArnaldo Carvalho de Melo *twp = tw; 571a7f5e7f1SArnaldo Carvalho de Melo } else if (tw) { 572a7f5e7f1SArnaldo Carvalho de Melo /* Silly. Should hash-dance instead... 
*/ 573dbe7faa4SEric Dumazet inet_twsk_deschedule_put(tw); 574a7f5e7f1SArnaldo Carvalho de Melo } 575a7f5e7f1SArnaldo Carvalho de Melo return 0; 576a7f5e7f1SArnaldo Carvalho de Melo 577a7f5e7f1SArnaldo Carvalho de Melo not_unique: 5789db66bdcSEric Dumazet spin_unlock(lock); 579a7f5e7f1SArnaldo Carvalho de Melo return -EADDRNOTAVAIL; 580a7f5e7f1SArnaldo Carvalho de Melo } 581a7f5e7f1SArnaldo Carvalho de Melo 582b2d05756SWilly Tarreau static u64 inet_sk_port_offset(const struct sock *sk) 583a7f5e7f1SArnaldo Carvalho de Melo { 584a7f5e7f1SArnaldo Carvalho de Melo const struct inet_sock *inet = inet_sk(sk); 585e2baad9eSEric Dumazet 586c720c7e8SEric Dumazet return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr, 587c720c7e8SEric Dumazet inet->inet_daddr, 588c720c7e8SEric Dumazet inet->inet_dport); 589a7f5e7f1SArnaldo Carvalho de Melo } 590a7f5e7f1SArnaldo Carvalho de Melo 59101770a16SRicardo Dias /* Searches for an exsiting socket in the ehash bucket list. 59201770a16SRicardo Dias * Returns true if found, false otherwise. 
593079096f1SEric Dumazet */ 59401770a16SRicardo Dias static bool inet_ehash_lookup_by_sk(struct sock *sk, 59501770a16SRicardo Dias struct hlist_nulls_head *list) 59601770a16SRicardo Dias { 59701770a16SRicardo Dias const __portpair ports = INET_COMBINED_PORTS(sk->sk_dport, sk->sk_num); 59801770a16SRicardo Dias const int sdif = sk->sk_bound_dev_if; 59901770a16SRicardo Dias const int dif = sk->sk_bound_dev_if; 60001770a16SRicardo Dias const struct hlist_nulls_node *node; 60101770a16SRicardo Dias struct net *net = sock_net(sk); 60201770a16SRicardo Dias struct sock *esk; 60301770a16SRicardo Dias 60401770a16SRicardo Dias INET_ADDR_COOKIE(acookie, sk->sk_daddr, sk->sk_rcv_saddr); 60501770a16SRicardo Dias 60601770a16SRicardo Dias sk_nulls_for_each_rcu(esk, node, list) { 60701770a16SRicardo Dias if (esk->sk_hash != sk->sk_hash) 60801770a16SRicardo Dias continue; 60901770a16SRicardo Dias if (sk->sk_family == AF_INET) { 610eda090c3SEric Dumazet if (unlikely(inet_match(net, esk, acookie, 61101770a16SRicardo Dias ports, dif, sdif))) { 61201770a16SRicardo Dias return true; 61301770a16SRicardo Dias } 61401770a16SRicardo Dias } 61501770a16SRicardo Dias #if IS_ENABLED(CONFIG_IPV6) 61601770a16SRicardo Dias else if (sk->sk_family == AF_INET6) { 6175d368f03SEric Dumazet if (unlikely(inet6_match(net, esk, 61801770a16SRicardo Dias &sk->sk_v6_daddr, 61901770a16SRicardo Dias &sk->sk_v6_rcv_saddr, 62001770a16SRicardo Dias ports, dif, sdif))) { 62101770a16SRicardo Dias return true; 62201770a16SRicardo Dias } 62301770a16SRicardo Dias } 62401770a16SRicardo Dias #endif 62501770a16SRicardo Dias } 62601770a16SRicardo Dias return false; 62701770a16SRicardo Dias } 62801770a16SRicardo Dias 62901770a16SRicardo Dias /* Insert a socket into ehash, and eventually remove another one 63001770a16SRicardo Dias * (The another one can be a SYN_RECV or TIMEWAIT) 63101770a16SRicardo Dias * If an existing socket already exists, socket sk is not inserted, 63201770a16SRicardo Dias * and sets found_dup_sk 
parameter to true. 63301770a16SRicardo Dias */ 63401770a16SRicardo Dias bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk) 635152da81dSPavel Emelyanov { 636429e42c1SKuniyuki Iwashima struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); 637152da81dSPavel Emelyanov struct inet_ehash_bucket *head; 63808eaef90SKuniyuki Iwashima struct hlist_nulls_head *list; 6395b441f76SEric Dumazet spinlock_t *lock; 6405e0724d0SEric Dumazet bool ret = true; 641152da81dSPavel Emelyanov 642079096f1SEric Dumazet WARN_ON_ONCE(!sk_unhashed(sk)); 643152da81dSPavel Emelyanov 6445b441f76SEric Dumazet sk->sk_hash = sk_ehashfn(sk); 645152da81dSPavel Emelyanov head = inet_ehash_bucket(hashinfo, sk->sk_hash); 646152da81dSPavel Emelyanov list = &head->chain; 647152da81dSPavel Emelyanov lock = inet_ehash_lockp(hashinfo, sk->sk_hash); 648152da81dSPavel Emelyanov 6499db66bdcSEric Dumazet spin_lock(lock); 650fc01538fSEric Dumazet if (osk) { 6515e0724d0SEric Dumazet WARN_ON_ONCE(sk->sk_hash != osk->sk_hash); 6525e0724d0SEric Dumazet ret = sk_nulls_del_node_init_rcu(osk); 65301770a16SRicardo Dias } else if (found_dup_sk) { 65401770a16SRicardo Dias *found_dup_sk = inet_ehash_lookup_by_sk(sk, list); 65501770a16SRicardo Dias if (*found_dup_sk) 65601770a16SRicardo Dias ret = false; 6579327f705SEric Dumazet } 65801770a16SRicardo Dias 6595e0724d0SEric Dumazet if (ret) 6605e0724d0SEric Dumazet __sk_nulls_add_node_rcu(sk, list); 66101770a16SRicardo Dias 6629db66bdcSEric Dumazet spin_unlock(lock); 66301770a16SRicardo Dias 664079096f1SEric Dumazet return ret; 665079096f1SEric Dumazet } 666079096f1SEric Dumazet 66701770a16SRicardo Dias bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk) 668079096f1SEric Dumazet { 66901770a16SRicardo Dias bool ok = inet_ehash_insert(sk, osk, found_dup_sk); 6705e0724d0SEric Dumazet 6715e0724d0SEric Dumazet if (ok) { 672c29a0bc4SPavel Emelyanov sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 6735e0724d0SEric Dumazet 
} else { 67419757cebSEric Dumazet this_cpu_inc(*sk->sk_prot->orphan_count); 675563e0bb0SYafang Shao inet_sk_set_state(sk, TCP_CLOSE); 6765e0724d0SEric Dumazet sock_set_flag(sk, SOCK_DEAD); 6775e0724d0SEric Dumazet inet_csk_destroy_sock(sk); 678152da81dSPavel Emelyanov } 6795e0724d0SEric Dumazet return ok; 6805e0724d0SEric Dumazet } 6815e0724d0SEric Dumazet EXPORT_SYMBOL_GPL(inet_ehash_nolisten); 682152da81dSPavel Emelyanov 683c125e80bSCraig Gallek static int inet_reuseport_add_sock(struct sock *sk, 684fe38d2a1SJosef Bacik struct inet_listen_hashbucket *ilb) 685c125e80bSCraig Gallek { 68690e5d0dbSCraig Gallek struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash; 6878dbd76e7SEric Dumazet const struct hlist_nulls_node *node; 688c125e80bSCraig Gallek struct sock *sk2; 689c125e80bSCraig Gallek kuid_t uid = sock_i_uid(sk); 690c125e80bSCraig Gallek 6918dbd76e7SEric Dumazet sk_nulls_for_each_rcu(sk2, node, &ilb->nulls_head) { 692c125e80bSCraig Gallek if (sk2 != sk && 693c125e80bSCraig Gallek sk2->sk_family == sk->sk_family && 694c125e80bSCraig Gallek ipv6_only_sock(sk2) == ipv6_only_sock(sk) && 695c125e80bSCraig Gallek sk2->sk_bound_dev_if == sk->sk_bound_dev_if && 69690e5d0dbSCraig Gallek inet_csk(sk2)->icsk_bind_hash == tb && 697c125e80bSCraig Gallek sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && 698fe38d2a1SJosef Bacik inet_rcv_saddr_equal(sk, sk2, false)) 6992dbb9b9eSMartin KaFai Lau return reuseport_add_sock(sk, sk2, 7002dbb9b9eSMartin KaFai Lau inet_rcv_saddr_any(sk)); 701c125e80bSCraig Gallek } 702c125e80bSCraig Gallek 7032dbb9b9eSMartin KaFai Lau return reuseport_alloc(sk, inet_rcv_saddr_any(sk)); 704c125e80bSCraig Gallek } 705c125e80bSCraig Gallek 706fe38d2a1SJosef Bacik int __inet_hash(struct sock *sk, struct sock *osk) 707152da81dSPavel Emelyanov { 708429e42c1SKuniyuki Iwashima struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); 709e8d00590SMartin KaFai Lau struct inet_listen_hashbucket *ilb2; 710c125e80bSCraig Gallek int err = 0; 
711152da81dSPavel Emelyanov 7125e0724d0SEric Dumazet if (sk->sk_state != TCP_LISTEN) { 7134f9bf2a2SSebastian Andrzej Siewior local_bh_disable(); 71401770a16SRicardo Dias inet_ehash_nolisten(sk, osk, NULL); 7154f9bf2a2SSebastian Andrzej Siewior local_bh_enable(); 716c125e80bSCraig Gallek return 0; 7175e0724d0SEric Dumazet } 718547b792cSIlpo Järvinen WARN_ON(!sk_unhashed(sk)); 719e8d00590SMartin KaFai Lau ilb2 = inet_lhash2_bucket_sk(hashinfo, sk); 720152da81dSPavel Emelyanov 721e8d00590SMartin KaFai Lau spin_lock(&ilb2->lock); 722c125e80bSCraig Gallek if (sk->sk_reuseport) { 723cae3873cSMartin KaFai Lau err = inet_reuseport_add_sock(sk, ilb2); 724c125e80bSCraig Gallek if (err) 725c125e80bSCraig Gallek goto unlock; 726c125e80bSCraig Gallek } 727d296ba60SCraig Gallek if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && 728cae3873cSMartin KaFai Lau sk->sk_family == AF_INET6) 729cae3873cSMartin KaFai Lau __sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head); 730cae3873cSMartin KaFai Lau else 731cae3873cSMartin KaFai Lau __sk_nulls_add_node_rcu(sk, &ilb2->nulls_head); 7323b24d854SEric Dumazet sock_set_flag(sk, SOCK_RCU_FREE); 733c29a0bc4SPavel Emelyanov sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 734c125e80bSCraig Gallek unlock: 735e8d00590SMartin KaFai Lau spin_unlock(&ilb2->lock); 736c125e80bSCraig Gallek 737c125e80bSCraig Gallek return err; 738152da81dSPavel Emelyanov } 73977a6a471SEric Dumazet EXPORT_SYMBOL(__inet_hash); 740ab1e0a13SArnaldo Carvalho de Melo 741086c653fSCraig Gallek int inet_hash(struct sock *sk) 742ab1e0a13SArnaldo Carvalho de Melo { 743c125e80bSCraig Gallek int err = 0; 744c125e80bSCraig Gallek 7454f9bf2a2SSebastian Andrzej Siewior if (sk->sk_state != TCP_CLOSE) 746fe38d2a1SJosef Bacik err = __inet_hash(sk, NULL); 747086c653fSCraig Gallek 748c125e80bSCraig Gallek return err; 749ab1e0a13SArnaldo Carvalho de Melo } 750ab1e0a13SArnaldo Carvalho de Melo EXPORT_SYMBOL_GPL(inet_hash); 751ab1e0a13SArnaldo Carvalho de Melo 7524f9bf2a2SSebastian Andrzej 
Siewior void inet_unhash(struct sock *sk) 7534f9bf2a2SSebastian Andrzej Siewior { 754429e42c1SKuniyuki Iwashima struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); 7554f9bf2a2SSebastian Andrzej Siewior 7564f9bf2a2SSebastian Andrzej Siewior if (sk_unhashed(sk)) 7574f9bf2a2SSebastian Andrzej Siewior return; 7584f9bf2a2SSebastian Andrzej Siewior 7594f9bf2a2SSebastian Andrzej Siewior if (sk->sk_state == TCP_LISTEN) { 760e8d00590SMartin KaFai Lau struct inet_listen_hashbucket *ilb2; 7614f9bf2a2SSebastian Andrzej Siewior 762e8d00590SMartin KaFai Lau ilb2 = inet_lhash2_bucket_sk(hashinfo, sk); 7634f9bf2a2SSebastian Andrzej Siewior /* Don't disable bottom halves while acquiring the lock to 7644f9bf2a2SSebastian Andrzej Siewior * avoid circular locking dependency on PREEMPT_RT. 7654f9bf2a2SSebastian Andrzej Siewior */ 766e8d00590SMartin KaFai Lau spin_lock(&ilb2->lock); 767e8d00590SMartin KaFai Lau if (sk_unhashed(sk)) { 768e8d00590SMartin KaFai Lau spin_unlock(&ilb2->lock); 769e8d00590SMartin KaFai Lau return; 770e8d00590SMartin KaFai Lau } 771e8d00590SMartin KaFai Lau 772e8d00590SMartin KaFai Lau if (rcu_access_pointer(sk->sk_reuseport_cb)) 773e8d00590SMartin KaFai Lau reuseport_stop_listen_sock(sk); 774e8d00590SMartin KaFai Lau 775e8d00590SMartin KaFai Lau __sk_nulls_del_node_init_rcu(sk); 776e8d00590SMartin KaFai Lau sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 777e8d00590SMartin KaFai Lau spin_unlock(&ilb2->lock); 7784f9bf2a2SSebastian Andrzej Siewior } else { 7794f9bf2a2SSebastian Andrzej Siewior spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash); 7804f9bf2a2SSebastian Andrzej Siewior 7814f9bf2a2SSebastian Andrzej Siewior spin_lock_bh(lock); 782e8d00590SMartin KaFai Lau if (sk_unhashed(sk)) { 783e8d00590SMartin KaFai Lau spin_unlock_bh(lock); 784e8d00590SMartin KaFai Lau return; 785e8d00590SMartin KaFai Lau } 786e8d00590SMartin KaFai Lau __sk_nulls_del_node_init_rcu(sk); 787e8d00590SMartin KaFai Lau sock_prot_inuse_add(sock_net(sk), 
sk->sk_prot, -1); 788920de804SEric Dumazet spin_unlock_bh(lock); 789ab1e0a13SArnaldo Carvalho de Melo } 7904f9bf2a2SSebastian Andrzej Siewior } 791ab1e0a13SArnaldo Carvalho de Melo EXPORT_SYMBOL_GPL(inet_unhash); 792152da81dSPavel Emelyanov 79328044fc1SJoanne Koong static bool inet_bind2_bucket_match(const struct inet_bind2_bucket *tb, 79428044fc1SJoanne Koong const struct net *net, unsigned short port, 79528044fc1SJoanne Koong int l3mdev, const struct sock *sk) 79628044fc1SJoanne Koong { 79728044fc1SJoanne Koong #if IS_ENABLED(CONFIG_IPV6) 7985456262dSMartin KaFai Lau if (sk->sk_family != tb->family) 7995456262dSMartin KaFai Lau return false; 8005456262dSMartin KaFai Lau 80128044fc1SJoanne Koong if (sk->sk_family == AF_INET6) 80228044fc1SJoanne Koong return net_eq(ib2_net(tb), net) && tb->port == port && 80328044fc1SJoanne Koong tb->l3mdev == l3mdev && 80428044fc1SJoanne Koong ipv6_addr_equal(&tb->v6_rcv_saddr, &sk->sk_v6_rcv_saddr); 80528044fc1SJoanne Koong else 80628044fc1SJoanne Koong #endif 80728044fc1SJoanne Koong return net_eq(ib2_net(tb), net) && tb->port == port && 80828044fc1SJoanne Koong tb->l3mdev == l3mdev && tb->rcv_saddr == sk->sk_rcv_saddr; 80928044fc1SJoanne Koong } 81028044fc1SJoanne Koong 81128044fc1SJoanne Koong bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb, const struct net *net, 81228044fc1SJoanne Koong unsigned short port, int l3mdev, const struct sock *sk) 81328044fc1SJoanne Koong { 81428044fc1SJoanne Koong #if IS_ENABLED(CONFIG_IPV6) 81528044fc1SJoanne Koong struct in6_addr addr_any = {}; 81628044fc1SJoanne Koong 8175456262dSMartin KaFai Lau if (sk->sk_family != tb->family) 8185456262dSMartin KaFai Lau return false; 8195456262dSMartin KaFai Lau 82028044fc1SJoanne Koong if (sk->sk_family == AF_INET6) 82128044fc1SJoanne Koong return net_eq(ib2_net(tb), net) && tb->port == port && 82228044fc1SJoanne Koong tb->l3mdev == l3mdev && 82328044fc1SJoanne Koong ipv6_addr_equal(&tb->v6_rcv_saddr, &addr_any); 
82428044fc1SJoanne Koong else 82528044fc1SJoanne Koong #endif 82628044fc1SJoanne Koong return net_eq(ib2_net(tb), net) && tb->port == port && 82728044fc1SJoanne Koong tb->l3mdev == l3mdev && tb->rcv_saddr == 0; 82828044fc1SJoanne Koong } 82928044fc1SJoanne Koong 83028044fc1SJoanne Koong /* The socket's bhash2 hashbucket spinlock must be held when this is called */ 83128044fc1SJoanne Koong struct inet_bind2_bucket * 83228044fc1SJoanne Koong inet_bind2_bucket_find(const struct inet_bind_hashbucket *head, const struct net *net, 83328044fc1SJoanne Koong unsigned short port, int l3mdev, const struct sock *sk) 83428044fc1SJoanne Koong { 83528044fc1SJoanne Koong struct inet_bind2_bucket *bhash2 = NULL; 83628044fc1SJoanne Koong 83728044fc1SJoanne Koong inet_bind_bucket_for_each(bhash2, &head->chain) 83828044fc1SJoanne Koong if (inet_bind2_bucket_match(bhash2, net, port, l3mdev, sk)) 83928044fc1SJoanne Koong break; 84028044fc1SJoanne Koong 84128044fc1SJoanne Koong return bhash2; 84228044fc1SJoanne Koong } 84328044fc1SJoanne Koong 84428044fc1SJoanne Koong struct inet_bind_hashbucket * 84528044fc1SJoanne Koong inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, int port) 84628044fc1SJoanne Koong { 847429e42c1SKuniyuki Iwashima struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); 84828044fc1SJoanne Koong u32 hash; 84928044fc1SJoanne Koong #if IS_ENABLED(CONFIG_IPV6) 85028044fc1SJoanne Koong struct in6_addr addr_any = {}; 85128044fc1SJoanne Koong 85228044fc1SJoanne Koong if (sk->sk_family == AF_INET6) 85328044fc1SJoanne Koong hash = ipv6_portaddr_hash(net, &addr_any, port); 85428044fc1SJoanne Koong else 85528044fc1SJoanne Koong #endif 85628044fc1SJoanne Koong hash = ipv4_portaddr_hash(net, 0, port); 85728044fc1SJoanne Koong 85828044fc1SJoanne Koong return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)]; 85928044fc1SJoanne Koong } 86028044fc1SJoanne Koong 86128044fc1SJoanne Koong int inet_bhash2_update_saddr(struct inet_bind_hashbucket 
*prev_saddr, struct sock *sk) 86228044fc1SJoanne Koong { 863429e42c1SKuniyuki Iwashima struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); 86428044fc1SJoanne Koong struct inet_bind2_bucket *tb2, *new_tb2; 86528044fc1SJoanne Koong int l3mdev = inet_sk_bound_l3mdev(sk); 86628044fc1SJoanne Koong struct inet_bind_hashbucket *head2; 86728044fc1SJoanne Koong int port = inet_sk(sk)->inet_num; 86828044fc1SJoanne Koong struct net *net = sock_net(sk); 86928044fc1SJoanne Koong 87028044fc1SJoanne Koong /* Allocate a bind2 bucket ahead of time to avoid permanently putting 87128044fc1SJoanne Koong * the bhash2 table in an inconsistent state if a new tb2 bucket 87228044fc1SJoanne Koong * allocation fails. 87328044fc1SJoanne Koong */ 87428044fc1SJoanne Koong new_tb2 = kmem_cache_alloc(hinfo->bind2_bucket_cachep, GFP_ATOMIC); 87528044fc1SJoanne Koong if (!new_tb2) 87628044fc1SJoanne Koong return -ENOMEM; 87728044fc1SJoanne Koong 87828044fc1SJoanne Koong head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); 87928044fc1SJoanne Koong 88028044fc1SJoanne Koong if (prev_saddr) { 88128044fc1SJoanne Koong spin_lock_bh(&prev_saddr->lock); 88228044fc1SJoanne Koong __sk_del_bind2_node(sk); 88328044fc1SJoanne Koong inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, 88428044fc1SJoanne Koong inet_csk(sk)->icsk_bind2_hash); 88528044fc1SJoanne Koong spin_unlock_bh(&prev_saddr->lock); 88628044fc1SJoanne Koong } 88728044fc1SJoanne Koong 88828044fc1SJoanne Koong spin_lock_bh(&head2->lock); 88928044fc1SJoanne Koong tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk); 89028044fc1SJoanne Koong if (!tb2) { 89128044fc1SJoanne Koong tb2 = new_tb2; 89228044fc1SJoanne Koong inet_bind2_bucket_init(tb2, net, head2, port, l3mdev, sk); 89328044fc1SJoanne Koong } 89428044fc1SJoanne Koong sk_add_bind2_node(sk, &tb2->owners); 89528044fc1SJoanne Koong inet_csk(sk)->icsk_bind2_hash = tb2; 89628044fc1SJoanne Koong spin_unlock_bh(&head2->lock); 89728044fc1SJoanne Koong 89828044fc1SJoanne Koong if (tb2 
!= new_tb2) 89928044fc1SJoanne Koong kmem_cache_free(hinfo->bind2_bucket_cachep, new_tb2); 90028044fc1SJoanne Koong 90128044fc1SJoanne Koong return 0; 90228044fc1SJoanne Koong } 90328044fc1SJoanne Koong EXPORT_SYMBOL_GPL(inet_bhash2_update_saddr); 90428044fc1SJoanne Koong 905190cc824SEric Dumazet /* RFC 6056 3.3.4. Algorithm 4: Double-Hash Port Selection Algorithm 906190cc824SEric Dumazet * Note that we use 32bit integers (vs RFC 'short integers') 907190cc824SEric Dumazet * because 2^16 is not a multiple of num_ephemeral and this 908190cc824SEric Dumazet * property might be used by clever attacker. 9094c2c8f03SWilly Tarreau * RFC claims using TABLE_LENGTH=10 buckets gives an improvement, though 9104c2c8f03SWilly Tarreau * attacks were since demonstrated, thus we use 65536 instead to really 9114c2c8f03SWilly Tarreau * give more isolation and privacy, at the expense of 256kB of kernel 9124c2c8f03SWilly Tarreau * memory. 913190cc824SEric Dumazet */ 9144c2c8f03SWilly Tarreau #define INET_TABLE_PERTURB_SHIFT 16 915e9261476SWilly Tarreau #define INET_TABLE_PERTURB_SIZE (1 << INET_TABLE_PERTURB_SHIFT) 916e9261476SWilly Tarreau static u32 *table_perturb; 917190cc824SEric Dumazet 9185ee31fc1SPavel Emelyanov int __inet_hash_connect(struct inet_timewait_death_row *death_row, 919b2d05756SWilly Tarreau struct sock *sk, u64 port_offset, 9205ee31fc1SPavel Emelyanov int (*check_established)(struct inet_timewait_death_row *, 921b4d6444eSEric Dumazet struct sock *, __u16, struct inet_timewait_sock **)) 922a7f5e7f1SArnaldo Carvalho de Melo { 923a7f5e7f1SArnaldo Carvalho de Melo struct inet_hashinfo *hinfo = death_row->hashinfo; 92428044fc1SJoanne Koong struct inet_bind_hashbucket *head, *head2; 925a7f5e7f1SArnaldo Carvalho de Melo struct inet_timewait_sock *tw = NULL; 9261580ab63SEric Dumazet int port = inet_sk(sk)->inet_num; 9271580ab63SEric Dumazet struct net *net = sock_net(sk); 92828044fc1SJoanne Koong struct inet_bind2_bucket *tb2; 9291580ab63SEric Dumazet struct 
inet_bind_bucket *tb; 93028044fc1SJoanne Koong bool tb_created = false; 9311580ab63SEric Dumazet u32 remaining, offset; 9321580ab63SEric Dumazet int ret, i, low, high; 9333c82a21fSRobert Shearman int l3mdev; 934190cc824SEric Dumazet u32 index; 9351580ab63SEric Dumazet 9361580ab63SEric Dumazet if (port) { 9371580ab63SEric Dumazet head = &hinfo->bhash[inet_bhashfn(net, port, 9381580ab63SEric Dumazet hinfo->bhash_size)]; 9391580ab63SEric Dumazet tb = inet_csk(sk)->icsk_bind_hash; 9401580ab63SEric Dumazet spin_lock_bh(&head->lock); 9411580ab63SEric Dumazet if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { 94201770a16SRicardo Dias inet_ehash_nolisten(sk, NULL, NULL); 9431580ab63SEric Dumazet spin_unlock_bh(&head->lock); 9441580ab63SEric Dumazet return 0; 9451580ab63SEric Dumazet } 9461580ab63SEric Dumazet spin_unlock(&head->lock); 9471580ab63SEric Dumazet /* No definite answer... Walk to established hash table */ 9481580ab63SEric Dumazet ret = check_established(death_row, sk, port, NULL); 9491580ab63SEric Dumazet local_bh_enable(); 9501580ab63SEric Dumazet return ret; 9511580ab63SEric Dumazet } 952a7f5e7f1SArnaldo Carvalho de Melo 9533c82a21fSRobert Shearman l3mdev = inet_sk_bound_l3mdev(sk); 9543c82a21fSRobert Shearman 9550bbf87d8SEric W. Biederman inet_get_local_port_range(net, &low, &high); 9561580ab63SEric Dumazet high++; /* [32768, 60999] -> [32768, 61000[ */ 9571580ab63SEric Dumazet remaining = high - low; 9581580ab63SEric Dumazet if (likely(remaining > 1)) 9591580ab63SEric Dumazet remaining &= ~1U; 960227b60f5SStephen Hemminger 9612a4187f4SJason A. 
Donenfeld get_random_sleepable_once(table_perturb, 962e9261476SWilly Tarreau INET_TABLE_PERTURB_SIZE * sizeof(*table_perturb)); 963e8161345SWilly Tarreau index = port_offset & (INET_TABLE_PERTURB_SIZE - 1); 964190cc824SEric Dumazet 9659e9b70aeSWilly Tarreau offset = READ_ONCE(table_perturb[index]) + (port_offset >> 32); 966b2d05756SWilly Tarreau offset %= remaining; 967b2d05756SWilly Tarreau 9681580ab63SEric Dumazet /* In first pass we try ports of @low parity. 9691580ab63SEric Dumazet * inet_csk_get_port() does the opposite choice. 97007f4c900SEric Dumazet */ 9711580ab63SEric Dumazet offset &= ~1U; 9721580ab63SEric Dumazet other_parity_scan: 9731580ab63SEric Dumazet port = low + offset; 9741580ab63SEric Dumazet for (i = 0; i < remaining; i += 2, port += 2) { 9751580ab63SEric Dumazet if (unlikely(port >= high)) 9761580ab63SEric Dumazet port -= remaining; 977122ff243SWANG Cong if (inet_is_local_reserved_port(net, port)) 978e3826f1eSAmerigo Wang continue; 9797f635ab7SPavel Emelyanov head = &hinfo->bhash[inet_bhashfn(net, port, 9807f635ab7SPavel Emelyanov hinfo->bhash_size)]; 9811580ab63SEric Dumazet spin_lock_bh(&head->lock); 982a7f5e7f1SArnaldo Carvalho de Melo 9831580ab63SEric Dumazet /* Does not bother with rcv_saddr checks, because 9841580ab63SEric Dumazet * the established check is already unique enough. 
985a7f5e7f1SArnaldo Carvalho de Melo */ 986b67bfe0dSSasha Levin inet_bind_bucket_for_each(tb, &head->chain) { 98728044fc1SJoanne Koong if (inet_bind_bucket_match(tb, net, port, l3mdev)) { 988da5e3630STom Herbert if (tb->fastreuse >= 0 || 989da5e3630STom Herbert tb->fastreuseport >= 0) 990a7f5e7f1SArnaldo Carvalho de Melo goto next_port; 991a9d8f911SEvgeniy Polyakov WARN_ON(hlist_empty(&tb->owners)); 9925ee31fc1SPavel Emelyanov if (!check_established(death_row, sk, 9935ee31fc1SPavel Emelyanov port, &tw)) 994a7f5e7f1SArnaldo Carvalho de Melo goto ok; 995a7f5e7f1SArnaldo Carvalho de Melo goto next_port; 996a7f5e7f1SArnaldo Carvalho de Melo } 997a7f5e7f1SArnaldo Carvalho de Melo } 998a7f5e7f1SArnaldo Carvalho de Melo 999941b1d22SPavel Emelyanov tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, 10003c82a21fSRobert Shearman net, head, port, l3mdev); 1001a7f5e7f1SArnaldo Carvalho de Melo if (!tb) { 10021580ab63SEric Dumazet spin_unlock_bh(&head->lock); 10031580ab63SEric Dumazet return -ENOMEM; 1004a7f5e7f1SArnaldo Carvalho de Melo } 100528044fc1SJoanne Koong tb_created = true; 1006a7f5e7f1SArnaldo Carvalho de Melo tb->fastreuse = -1; 1007da5e3630STom Herbert tb->fastreuseport = -1; 1008a7f5e7f1SArnaldo Carvalho de Melo goto ok; 1009a7f5e7f1SArnaldo Carvalho de Melo next_port: 10101580ab63SEric Dumazet spin_unlock_bh(&head->lock); 10111580ab63SEric Dumazet cond_resched(); 1012a7f5e7f1SArnaldo Carvalho de Melo } 10131580ab63SEric Dumazet 10141580ab63SEric Dumazet offset++; 10151580ab63SEric Dumazet if ((offset & 1) && remaining > 1) 10161580ab63SEric Dumazet goto other_parity_scan; 1017a7f5e7f1SArnaldo Carvalho de Melo 1018a7f5e7f1SArnaldo Carvalho de Melo return -EADDRNOTAVAIL; 1019a7f5e7f1SArnaldo Carvalho de Melo 1020a7f5e7f1SArnaldo Carvalho de Melo ok: 102128044fc1SJoanne Koong /* Find the corresponding tb2 bucket since we need to 102228044fc1SJoanne Koong * add the socket to the bhash2 table as well 102328044fc1SJoanne Koong */ 102428044fc1SJoanne Koong head2 = 
inet_bhashfn_portaddr(hinfo, sk, net, port); 102528044fc1SJoanne Koong spin_lock(&head2->lock); 102628044fc1SJoanne Koong 102728044fc1SJoanne Koong tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk); 102828044fc1SJoanne Koong if (!tb2) { 102928044fc1SJoanne Koong tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep, net, 103028044fc1SJoanne Koong head2, port, l3mdev, sk); 103128044fc1SJoanne Koong if (!tb2) 103228044fc1SJoanne Koong goto error; 103328044fc1SJoanne Koong } 103428044fc1SJoanne Koong 1035ca7af040SWilly Tarreau /* Here we want to add a little bit of randomness to the next source 1036ca7af040SWilly Tarreau * port that will be chosen. We use a max() with a random here so that 1037ca7af040SWilly Tarreau * on low contention the randomness is maximal and on high contention 1038ca7af040SWilly Tarreau * it may be inexistent. 1039c579bd1bSEric Dumazet */ 1040*8032bf12SJason A. Donenfeld i = max_t(int, i, get_random_u32_below(8) * 2); 1041190cc824SEric Dumazet WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + 2); 1042a7f5e7f1SArnaldo Carvalho de Melo 1043a7f5e7f1SArnaldo Carvalho de Melo /* Head lock still held and bh's disabled */ 104428044fc1SJoanne Koong inet_bind_hash(sk, tb, tb2, port); 104528044fc1SJoanne Koong 104628044fc1SJoanne Koong spin_unlock(&head2->lock); 104728044fc1SJoanne Koong 1048a7f5e7f1SArnaldo Carvalho de Melo if (sk_unhashed(sk)) { 1049c720c7e8SEric Dumazet inet_sk(sk)->inet_sport = htons(port); 105001770a16SRicardo Dias inet_ehash_nolisten(sk, (struct sock *)tw, NULL); 1051a7f5e7f1SArnaldo Carvalho de Melo } 10523cdaedaeSEric Dumazet if (tw) 1053fc01538fSEric Dumazet inet_twsk_bind_unhash(tw, hinfo); 1054a7f5e7f1SArnaldo Carvalho de Melo spin_unlock(&head->lock); 1055dbe7faa4SEric Dumazet if (tw) 1056dbe7faa4SEric Dumazet inet_twsk_deschedule_put(tw); 1057a7f5e7f1SArnaldo Carvalho de Melo local_bh_enable(); 10581580ab63SEric Dumazet return 0; 105928044fc1SJoanne Koong 106028044fc1SJoanne Koong error: 
106128044fc1SJoanne Koong spin_unlock(&head2->lock); 106228044fc1SJoanne Koong if (tb_created) 106328044fc1SJoanne Koong inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, tb); 106428044fc1SJoanne Koong spin_unlock_bh(&head->lock); 106528044fc1SJoanne Koong return -ENOMEM; 1066a7f5e7f1SArnaldo Carvalho de Melo } 10675ee31fc1SPavel Emelyanov 10685ee31fc1SPavel Emelyanov /* 10695ee31fc1SPavel Emelyanov * Bind a port for a connect operation and hash it. 10705ee31fc1SPavel Emelyanov */ 10715ee31fc1SPavel Emelyanov int inet_hash_connect(struct inet_timewait_death_row *death_row, 10725ee31fc1SPavel Emelyanov struct sock *sk) 10735ee31fc1SPavel Emelyanov { 1074b2d05756SWilly Tarreau u64 port_offset = 0; 1075e2baad9eSEric Dumazet 1076e2baad9eSEric Dumazet if (!inet_sk(sk)->inet_num) 1077e2baad9eSEric Dumazet port_offset = inet_sk_port_offset(sk); 1078e2baad9eSEric Dumazet return __inet_hash_connect(death_row, sk, port_offset, 1079b4d6444eSEric Dumazet __inet_check_established); 10805ee31fc1SPavel Emelyanov } 1081a7f5e7f1SArnaldo Carvalho de Melo EXPORT_SYMBOL_GPL(inet_hash_connect); 10825caea4eaSEric Dumazet 1083c92c81dfSPeter Oskolkov static void init_hashinfo_lhash2(struct inet_hashinfo *h) 1084c92c81dfSPeter Oskolkov { 1085c92c81dfSPeter Oskolkov int i; 1086c92c81dfSPeter Oskolkov 1087c92c81dfSPeter Oskolkov for (i = 0; i <= h->lhash2_mask; i++) { 1088c92c81dfSPeter Oskolkov spin_lock_init(&h->lhash2[i].lock); 1089cae3873cSMartin KaFai Lau INIT_HLIST_NULLS_HEAD(&h->lhash2[i].nulls_head, 1090cae3873cSMartin KaFai Lau i + LISTENING_NULLS_BASE); 1091c92c81dfSPeter Oskolkov } 1092c92c81dfSPeter Oskolkov } 1093c92c81dfSPeter Oskolkov 109461b7c691SMartin KaFai Lau void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, 109561b7c691SMartin KaFai Lau unsigned long numentries, int scale, 109661b7c691SMartin KaFai Lau unsigned long low_limit, 109761b7c691SMartin KaFai Lau unsigned long high_limit) 109861b7c691SMartin KaFai Lau { 109961b7c691SMartin KaFai Lau 
h->lhash2 = alloc_large_system_hash(name, 110061b7c691SMartin KaFai Lau sizeof(*h->lhash2), 110161b7c691SMartin KaFai Lau numentries, 110261b7c691SMartin KaFai Lau scale, 110361b7c691SMartin KaFai Lau 0, 110461b7c691SMartin KaFai Lau NULL, 110561b7c691SMartin KaFai Lau &h->lhash2_mask, 110661b7c691SMartin KaFai Lau low_limit, 110761b7c691SMartin KaFai Lau high_limit); 1108c92c81dfSPeter Oskolkov init_hashinfo_lhash2(h); 1109e9261476SWilly Tarreau 1110e9261476SWilly Tarreau /* this one is used for source ports of outgoing connections */ 1111e67b72b9SMuchun Song table_perturb = alloc_large_system_hash("Table-perturb", 1112e67b72b9SMuchun Song sizeof(*table_perturb), 1113e67b72b9SMuchun Song INET_TABLE_PERTURB_SIZE, 1114e67b72b9SMuchun Song 0, 0, NULL, NULL, 1115e67b72b9SMuchun Song INET_TABLE_PERTURB_SIZE, 1116e67b72b9SMuchun Song INET_TABLE_PERTURB_SIZE); 1117c92c81dfSPeter Oskolkov } 111861b7c691SMartin KaFai Lau 1119c92c81dfSPeter Oskolkov int inet_hashinfo2_init_mod(struct inet_hashinfo *h) 1120c92c81dfSPeter Oskolkov { 1121c92c81dfSPeter Oskolkov h->lhash2 = kmalloc_array(INET_LHTABLE_SIZE, sizeof(*h->lhash2), GFP_KERNEL); 1122c92c81dfSPeter Oskolkov if (!h->lhash2) 1123c92c81dfSPeter Oskolkov return -ENOMEM; 1124c92c81dfSPeter Oskolkov 1125c92c81dfSPeter Oskolkov h->lhash2_mask = INET_LHTABLE_SIZE - 1; 1126c92c81dfSPeter Oskolkov /* INET_LHTABLE_SIZE must be a power of 2 */ 1127c92c81dfSPeter Oskolkov BUG_ON(INET_LHTABLE_SIZE & h->lhash2_mask); 1128c92c81dfSPeter Oskolkov 1129c92c81dfSPeter Oskolkov init_hashinfo_lhash2(h); 1130c92c81dfSPeter Oskolkov return 0; 113161b7c691SMartin KaFai Lau } 1132c92c81dfSPeter Oskolkov EXPORT_SYMBOL_GPL(inet_hashinfo2_init_mod); 113361b7c691SMartin KaFai Lau 1134095dc8e0SEric Dumazet int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) 1135095dc8e0SEric Dumazet { 113689e478a2SEric Dumazet unsigned int locksz = sizeof(spinlock_t); 1137095dc8e0SEric Dumazet unsigned int i, nblocks = 1; 1138095dc8e0SEric Dumazet 
113989e478a2SEric Dumazet if (locksz != 0) { 1140095dc8e0SEric Dumazet /* allocate 2 cache lines or at least one spinlock per cpu */ 114189e478a2SEric Dumazet nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U); 1142095dc8e0SEric Dumazet nblocks = roundup_pow_of_two(nblocks * num_possible_cpus()); 1143095dc8e0SEric Dumazet 1144095dc8e0SEric Dumazet /* no more locks than number of hash buckets */ 1145095dc8e0SEric Dumazet nblocks = min(nblocks, hashinfo->ehash_mask + 1); 1146095dc8e0SEric Dumazet 1147752ade68SMichal Hocko hashinfo->ehash_locks = kvmalloc_array(nblocks, locksz, GFP_KERNEL); 1148095dc8e0SEric Dumazet if (!hashinfo->ehash_locks) 1149095dc8e0SEric Dumazet return -ENOMEM; 1150095dc8e0SEric Dumazet 1151095dc8e0SEric Dumazet for (i = 0; i < nblocks; i++) 1152095dc8e0SEric Dumazet spin_lock_init(&hashinfo->ehash_locks[i]); 1153095dc8e0SEric Dumazet } 1154095dc8e0SEric Dumazet hashinfo->ehash_locks_mask = nblocks - 1; 1155095dc8e0SEric Dumazet return 0; 1156095dc8e0SEric Dumazet } 1157095dc8e0SEric Dumazet EXPORT_SYMBOL_GPL(inet_ehash_locks_alloc); 1158d1e5e640SKuniyuki Iwashima 1159d1e5e640SKuniyuki Iwashima struct inet_hashinfo *inet_pernet_hashinfo_alloc(struct inet_hashinfo *hashinfo, 1160d1e5e640SKuniyuki Iwashima unsigned int ehash_entries) 1161d1e5e640SKuniyuki Iwashima { 1162d1e5e640SKuniyuki Iwashima struct inet_hashinfo *new_hashinfo; 1163d1e5e640SKuniyuki Iwashima int i; 1164d1e5e640SKuniyuki Iwashima 1165d1e5e640SKuniyuki Iwashima new_hashinfo = kmemdup(hashinfo, sizeof(*hashinfo), GFP_KERNEL); 1166d1e5e640SKuniyuki Iwashima if (!new_hashinfo) 1167d1e5e640SKuniyuki Iwashima goto err; 1168d1e5e640SKuniyuki Iwashima 1169d1e5e640SKuniyuki Iwashima new_hashinfo->ehash = vmalloc_huge(ehash_entries * sizeof(struct inet_ehash_bucket), 1170d1e5e640SKuniyuki Iwashima GFP_KERNEL_ACCOUNT); 1171d1e5e640SKuniyuki Iwashima if (!new_hashinfo->ehash) 1172d1e5e640SKuniyuki Iwashima goto free_hashinfo; 1173d1e5e640SKuniyuki Iwashima 1174d1e5e640SKuniyuki Iwashima 
new_hashinfo->ehash_mask = ehash_entries - 1; 1175d1e5e640SKuniyuki Iwashima 1176d1e5e640SKuniyuki Iwashima if (inet_ehash_locks_alloc(new_hashinfo)) 1177d1e5e640SKuniyuki Iwashima goto free_ehash; 1178d1e5e640SKuniyuki Iwashima 1179d1e5e640SKuniyuki Iwashima for (i = 0; i < ehash_entries; i++) 1180d1e5e640SKuniyuki Iwashima INIT_HLIST_NULLS_HEAD(&new_hashinfo->ehash[i].chain, i); 1181d1e5e640SKuniyuki Iwashima 1182d1e5e640SKuniyuki Iwashima new_hashinfo->pernet = true; 1183d1e5e640SKuniyuki Iwashima 1184d1e5e640SKuniyuki Iwashima return new_hashinfo; 1185d1e5e640SKuniyuki Iwashima 1186d1e5e640SKuniyuki Iwashima free_ehash: 1187d1e5e640SKuniyuki Iwashima vfree(new_hashinfo->ehash); 1188d1e5e640SKuniyuki Iwashima free_hashinfo: 1189d1e5e640SKuniyuki Iwashima kfree(new_hashinfo); 1190d1e5e640SKuniyuki Iwashima err: 1191d1e5e640SKuniyuki Iwashima return NULL; 1192d1e5e640SKuniyuki Iwashima } 1193d1e5e640SKuniyuki Iwashima EXPORT_SYMBOL_GPL(inet_pernet_hashinfo_alloc); 1194d1e5e640SKuniyuki Iwashima 1195d1e5e640SKuniyuki Iwashima void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo) 1196d1e5e640SKuniyuki Iwashima { 1197d1e5e640SKuniyuki Iwashima if (!hashinfo->pernet) 1198d1e5e640SKuniyuki Iwashima return; 1199d1e5e640SKuniyuki Iwashima 1200d1e5e640SKuniyuki Iwashima inet_ehash_locks_free(hashinfo); 1201d1e5e640SKuniyuki Iwashima vfree(hashinfo->ehash); 1202d1e5e640SKuniyuki Iwashima kfree(hashinfo); 1203d1e5e640SKuniyuki Iwashima } 1204d1e5e640SKuniyuki Iwashima EXPORT_SYMBOL_GPL(inet_pernet_hashinfo_free); 1205