xref: /linux/net/ipv4/inet_hashtables.c (revision 2ed4b46b4fc77749cb0f8dd31a01441b82c8dbaa)
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic INET transport hashtables
 *
 * Authors:	Lotsa people, from code originally in tcp
 */

#include <linux/module.h>
#include <linux/random.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/wait.h>
#include <linux/vmalloc.h>
#include <linux/memblock.h>

#include <net/addrconf.h>
#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <net/inet6_hashtables.h>
#endif
#include <net/hotdata.h>
#include <net/ip.h>
#include <net/rps.h>
#include <net/secure_seq.h>
#include <net/sock_reuseport.h>
#include <net/tcp.h>

static void inet_init_ehash_secret(void)
{
	net_get_random_sleepable_once(&inet_ehash_secret,
				      sizeof(inet_ehash_secret));
}

u32 inet_ehashfn(const struct net *net, const __be32 laddr,
		 const __u16 lport, const __be32 faddr,
		 const __be16 fport)
{
	return lport + __inet_ehashfn(laddr, 0, faddr, fport,
				      inet_ehash_secret + net_hash_mix(net));
}
EXPORT_SYMBOL_GPL(inet_ehashfn);
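
/* Example (a sketch, not part of the build): how a lookup key maps to an
 * ehash slot.  This mirrors what __inet_lookup_established() does below;
 * "hashinfo" is assumed to be the per-netns TCP hash table.
 *
 *	u32 hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
 *	unsigned int slot = hash & hashinfo->ehash_mask;
 *	struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
 */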

/* This function handles inet_sock, but also timewait and request sockets
 * for IPv4/IPv6.
 */
static u32 sk_ehashfn(const struct sock *sk)
{
#if IS_ENABLED(CONFIG_IPV6)
	if (sk->sk_family == AF_INET6 &&
	    !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		return inet6_ehashfn(sock_net(sk),
				     &sk->sk_v6_rcv_saddr, sk->sk_num,
				     &sk->sk_v6_daddr, sk->sk_dport);
#endif
	return inet_ehashfn(sock_net(sk),
			    sk->sk_rcv_saddr, sk->sk_num,
			    sk->sk_daddr, sk->sk_dport);
}

static bool sk_is_connect_bind(const struct sock *sk)
{
	if (sk->sk_state == TCP_TIME_WAIT)
		return inet_twsk(sk)->tw_connect_bind;
	else
		return sk->sk_userlocks & SOCK_CONNECT_BIND;
}

/*
 * Allocate and initialize a new local port bind bucket.
 * The bindhash mutex for snum's hash chain must be held here.
 */
struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
						 struct net *net,
						 struct inet_bind_hashbucket *head,
						 const unsigned short snum,
						 int l3mdev)
{
	struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);

	if (tb) {
		write_pnet(&tb->ib_net, net);
		tb->l3mdev    = l3mdev;
		tb->port      = snum;
		tb->fastreuse = 0;
		tb->fastreuseport = 0;
		INIT_HLIST_HEAD(&tb->bhash2);
		hlist_add_head_rcu(&tb->node, &head->chain);
	}
	return tb;
}
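
/* Sketch of the usual lookup-or-create pattern for these buckets, as used
 * by __inet_inherit_port() and __inet_hash_connect() further down; the
 * bhash chain lock must already be held:
 *
 *	inet_bind_bucket_for_each(tb, &head->chain)
 *		if (inet_bind_bucket_match(tb, net, port, l3mdev))
 *			break;
 *	if (!tb)
 *		tb = inet_bind_bucket_create(cachep, net, head, port, l3mdev);
 */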

/*
 * Caller must hold hashbucket lock for this tb with local BH disabled
 */
void inet_bind_bucket_destroy(struct inet_bind_bucket *tb)
{
	const struct inet_bind2_bucket *tb2;

	if (hlist_empty(&tb->bhash2)) {
		hlist_del_rcu(&tb->node);
		kfree_rcu(tb, rcu);
		return;
	}

	if (tb->fastreuse == -1 && tb->fastreuseport == -1)
		return;
	hlist_for_each_entry(tb2, &tb->bhash2, bhash_node) {
		if (tb2->fastreuse != -1 || tb2->fastreuseport != -1)
			return;
	}
	tb->fastreuse = -1;
	tb->fastreuseport = -1;
}

bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, const struct net *net,
			    unsigned short port, int l3mdev)
{
	return net_eq(ib_net(tb), net) && tb->port == port &&
		tb->l3mdev == l3mdev;
}

static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb2,
				   struct net *net,
				   struct inet_bind_hashbucket *head,
				   struct inet_bind_bucket *tb,
				   const struct sock *sk)
{
	write_pnet(&tb2->ib_net, net);
	tb2->l3mdev = tb->l3mdev;
	tb2->port = tb->port;
#if IS_ENABLED(CONFIG_IPV6)
	BUILD_BUG_ON(USHRT_MAX < (IPV6_ADDR_ANY | IPV6_ADDR_MAPPED));
	if (sk->sk_family == AF_INET6) {
		tb2->addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr);
		tb2->v6_rcv_saddr = sk->sk_v6_rcv_saddr;
	} else {
		tb2->addr_type = IPV6_ADDR_MAPPED;
		ipv6_addr_set_v4mapped(sk->sk_rcv_saddr, &tb2->v6_rcv_saddr);
	}
#else
	tb2->rcv_saddr = sk->sk_rcv_saddr;
#endif
	tb2->fastreuse = 0;
	tb2->fastreuseport = 0;
	INIT_HLIST_HEAD(&tb2->owners);
	hlist_add_head(&tb2->node, &head->chain);
	hlist_add_head(&tb2->bhash_node, &tb->bhash2);
}

struct inet_bind2_bucket *inet_bind2_bucket_create(struct kmem_cache *cachep,
						   struct net *net,
						   struct inet_bind_hashbucket *head,
						   struct inet_bind_bucket *tb,
						   const struct sock *sk)
{
	struct inet_bind2_bucket *tb2 = kmem_cache_alloc(cachep, GFP_ATOMIC);

	if (tb2)
		inet_bind2_bucket_init(tb2, net, head, tb, sk);

	return tb2;
}

/* Caller must hold hashbucket lock for this tb with local BH disabled */
void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb)
{
	const struct sock *sk;

	if (hlist_empty(&tb->owners)) {
		__hlist_del(&tb->node);
		__hlist_del(&tb->bhash_node);
		kmem_cache_free(cachep, tb);
		return;
	}

	if (tb->fastreuse == -1 && tb->fastreuseport == -1)
		return;
	sk_for_each_bound(sk, &tb->owners) {
		if (!sk_is_connect_bind(sk))
			return;
	}
	tb->fastreuse = -1;
	tb->fastreuseport = -1;
}

static bool inet_bind2_bucket_addr_match(const struct inet_bind2_bucket *tb2,
					 const struct sock *sk)
{
#if IS_ENABLED(CONFIG_IPV6)
	if (sk->sk_family == AF_INET6)
		return ipv6_addr_equal(&tb2->v6_rcv_saddr, &sk->sk_v6_rcv_saddr);

	if (tb2->addr_type != IPV6_ADDR_MAPPED)
		return false;
#endif
	return tb2->rcv_saddr == sk->sk_rcv_saddr;
}

void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
		    struct inet_bind2_bucket *tb2, unsigned short port)
{
	WRITE_ONCE(inet_sk(sk)->inet_num, port);
	inet_csk(sk)->icsk_bind_hash = tb;
	inet_csk(sk)->icsk_bind2_hash = tb2;
	sk_add_bind_node(sk, &tb2->owners);
}

/*
 * Get rid of any references to a local port held by the given sock.
 */
static void __inet_put_port(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk);
	struct inet_bind_hashbucket *head, *head2;
	struct net *net = sock_net(sk);
	struct inet_bind_bucket *tb;
	int bhash;

	bhash = inet_bhashfn(net, inet_sk(sk)->inet_num, hashinfo->bhash_size);
	head = &hashinfo->bhash[bhash];
	head2 = inet_bhashfn_portaddr(hashinfo, sk, net, inet_sk(sk)->inet_num);

	spin_lock(&head->lock);
	tb = inet_csk(sk)->icsk_bind_hash;
	inet_csk(sk)->icsk_bind_hash = NULL;
	WRITE_ONCE(inet_sk(sk)->inet_num, 0);
	sk->sk_userlocks &= ~SOCK_CONNECT_BIND;

	spin_lock(&head2->lock);
	if (inet_csk(sk)->icsk_bind2_hash) {
		struct inet_bind2_bucket *tb2 = inet_csk(sk)->icsk_bind2_hash;

		__sk_del_bind_node(sk);
		inet_csk(sk)->icsk_bind2_hash = NULL;
		inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2);
	}
	spin_unlock(&head2->lock);

	inet_bind_bucket_destroy(tb);
	spin_unlock(&head->lock);
}

void inet_put_port(struct sock *sk)
{
	local_bh_disable();
	__inet_put_port(sk);
	local_bh_enable();
}
EXPORT_SYMBOL(inet_put_port);

int __inet_inherit_port(const struct sock *sk, struct sock *child)
{
	struct inet_hashinfo *table = tcp_get_hashinfo(sk);
	unsigned short port = inet_sk(child)->inet_num;
	struct inet_bind_hashbucket *head, *head2;
	bool created_inet_bind_bucket = false;
	struct net *net = sock_net(sk);
	bool update_fastreuse = false;
	struct inet_bind2_bucket *tb2;
	struct inet_bind_bucket *tb;
	int bhash, l3mdev;

	bhash = inet_bhashfn(net, port, table->bhash_size);
	head = &table->bhash[bhash];
	head2 = inet_bhashfn_portaddr(table, child, net, port);

	spin_lock(&head->lock);
	spin_lock(&head2->lock);
	tb = inet_csk(sk)->icsk_bind_hash;
	tb2 = inet_csk(sk)->icsk_bind2_hash;
	if (unlikely(!tb || !tb2)) {
		spin_unlock(&head2->lock);
		spin_unlock(&head->lock);
		return -ENOENT;
	}
	if (tb->port != port) {
		l3mdev = inet_sk_bound_l3mdev(sk);

		/* NOTE: using tproxy and redirecting skbs to a proxy
		 * on a different listener port breaks the assumption
		 * that the listener socket's icsk_bind_hash is the same
		 * as that of the child socket. We have to look up or
		 * create a new bind bucket for the child here. */
		inet_bind_bucket_for_each(tb, &head->chain) {
			if (inet_bind_bucket_match(tb, net, port, l3mdev))
				break;
		}
		if (!tb) {
			tb = inet_bind_bucket_create(table->bind_bucket_cachep,
						     net, head, port, l3mdev);
			if (!tb) {
				spin_unlock(&head2->lock);
				spin_unlock(&head->lock);
				return -ENOMEM;
			}
			created_inet_bind_bucket = true;
		}
		update_fastreuse = true;

		goto bhash2_find;
	} else if (!inet_bind2_bucket_addr_match(tb2, child)) {
		l3mdev = inet_sk_bound_l3mdev(sk);

bhash2_find:
		tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, child);
		if (!tb2) {
			tb2 = inet_bind2_bucket_create(table->bind2_bucket_cachep,
						       net, head2, tb, child);
			if (!tb2)
				goto error;
		}
	}
	if (update_fastreuse)
		inet_csk_update_fastreuse(child, tb, tb2);
	inet_bind_hash(child, tb, tb2, port);
	spin_unlock(&head2->lock);
	spin_unlock(&head->lock);

	return 0;

error:
	if (created_inet_bind_bucket)
		inet_bind_bucket_destroy(tb);
	spin_unlock(&head2->lock);
	spin_unlock(&head->lock);
	return -ENOMEM;
}
EXPORT_SYMBOL_GPL(__inet_inherit_port);

static struct inet_listen_hashbucket *
inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk)
{
	u32 hash;

#if IS_ENABLED(CONFIG_IPV6)
	if (sk->sk_family == AF_INET6)
		hash = ipv6_portaddr_hash(sock_net(sk),
					  &sk->sk_v6_rcv_saddr,
					  inet_sk(sk)->inet_num);
	else
#endif
		hash = ipv4_portaddr_hash(sock_net(sk),
					  inet_sk(sk)->inet_rcv_saddr,
					  inet_sk(sk)->inet_num);
	return inet_lhash2_bucket(h, hash);
}

static inline int compute_score(struct sock *sk, const struct net *net,
				const unsigned short hnum, const __be32 daddr,
				const int dif, const int sdif)
{
	int score = -1;

	if (net_eq(sock_net(sk), net) && READ_ONCE(sk->sk_num) == hnum &&
			!ipv6_only_sock(sk)) {
		if (sk->sk_rcv_saddr != daddr)
			return -1;

		if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))
			return -1;
		score = sk->sk_bound_dev_if ? 2 : 1;

		if (sk->sk_family == PF_INET)
			score++;
		if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
			score++;
	}
	return score;
}
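
/* Worked example of the scoring above (illustrative):
 *
 *	netns/port/address/device mismatch:	-1 (socket skipped)
 *	daddr match, no bound device:		 1
 *	daddr match, bound device match:	 2
 *	... and a plain AF_INET socket:		+1
 *	... and sk_incoming_cpu == this CPU:	+1
 *
 * so the best possible score is 4, and among equal scores the first
 * socket found on the chain wins.
 */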

/**
 * inet_lookup_reuseport() - execute reuseport logic on AF_INET socket if necessary.
 * @net: network namespace.
 * @sk: AF_INET socket, must be in TCP_LISTEN state for TCP or TCP_CLOSE for UDP.
 * @skb: context for a potential SK_REUSEPORT program.
 * @doff: header offset.
 * @saddr: source address.
 * @sport: source port.
 * @daddr: destination address.
 * @hnum: destination port in host byte order.
 * @ehashfn: hash function used to generate the fallback hash.
 *
 * Return: NULL if sk doesn't have SO_REUSEPORT set, otherwise a pointer to
 *         the selected sock or an error.
 */
struct sock *inet_lookup_reuseport(const struct net *net, struct sock *sk,
				   struct sk_buff *skb, int doff,
				   __be32 saddr, __be16 sport,
				   __be32 daddr, unsigned short hnum,
				   inet_ehashfn_t *ehashfn)
{
	struct sock *reuse_sk = NULL;
	u32 phash;

	if (sk->sk_reuseport) {
		phash = INDIRECT_CALL_2(ehashfn, udp_ehashfn, inet_ehashfn,
					net, daddr, hnum, saddr, sport);
		reuse_sk = reuseport_select_sock(sk, phash, skb, doff);
	}
	return reuse_sk;
}
EXPORT_SYMBOL_GPL(inet_lookup_reuseport);

/*
 * There are some nice properties to exploit here. The BSD API
 * does not allow a listening sock to specify the remote port or the
 * remote address for the connection. So always assume those are both
 * wildcarded during the search since they can never be otherwise.
 */

/* called with rcu_read_lock(): No refcount taken on the socket */
static struct sock *inet_lhash2_lookup(const struct net *net,
				struct inet_listen_hashbucket *ilb2,
				struct sk_buff *skb, int doff,
				const __be32 saddr, __be16 sport,
				const __be32 daddr, const unsigned short hnum,
				const int dif, const int sdif)
{
	struct sock *sk, *result = NULL;
	struct hlist_nulls_node *node;
	int score, hiscore = 0;

	sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) {
		score = compute_score(sk, net, hnum, daddr, dif, sdif);
		if (score > hiscore) {
			result = inet_lookup_reuseport(net, sk, skb, doff,
						       saddr, sport, daddr, hnum, inet_ehashfn);
			if (result)
				return result;

			result = sk;
			hiscore = score;
		}
	}

	return result;
}

struct sock *inet_lookup_run_sk_lookup(const struct net *net,
				       int protocol,
				       struct sk_buff *skb, int doff,
				       __be32 saddr, __be16 sport,
				       __be32 daddr, u16 hnum, const int dif,
				       inet_ehashfn_t *ehashfn)
{
	struct sock *sk, *reuse_sk;
	bool no_reuseport;

	no_reuseport = bpf_sk_lookup_run_v4(net, protocol, saddr, sport,
					    daddr, hnum, dif, &sk);
	if (no_reuseport || IS_ERR_OR_NULL(sk))
		return sk;

	reuse_sk = inet_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum,
					 ehashfn);
	if (reuse_sk)
		sk = reuse_sk;
	return sk;
}

struct sock *__inet_lookup_listener(const struct net *net,
				    struct sk_buff *skb, int doff,
				    const __be32 saddr, __be16 sport,
				    const __be32 daddr, const unsigned short hnum,
				    const int dif, const int sdif)
{
	struct inet_listen_hashbucket *ilb2;
	struct inet_hashinfo *hashinfo;
	struct sock *result = NULL;
	unsigned int hash2;

	/* Lookup redirect from BPF */
	if (static_branch_unlikely(&bpf_sk_lookup_enabled)) {
		result = inet_lookup_run_sk_lookup(net, IPPROTO_TCP, skb, doff,
						   saddr, sport, daddr, hnum, dif,
						   inet_ehashfn);
		if (result)
			goto done;
	}

	hashinfo = net->ipv4.tcp_death_row.hashinfo;
	hash2 = ipv4_portaddr_hash(net, daddr, hnum);
	ilb2 = inet_lhash2_bucket(hashinfo, hash2);

	result = inet_lhash2_lookup(net, ilb2, skb, doff,
				    saddr, sport, daddr, hnum,
				    dif, sdif);
	if (result)
		goto done;

	/* Lookup lhash2 with INADDR_ANY */
	hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
	ilb2 = inet_lhash2_bucket(hashinfo, hash2);

	result = inet_lhash2_lookup(net, ilb2, skb, doff,
				    saddr, sport, htonl(INADDR_ANY), hnum,
				    dif, sdif);
done:
	if (IS_ERR(result))
		return NULL;
	return result;
}
EXPORT_SYMBOL_GPL(__inet_lookup_listener);
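
/* Listener lookup thus proceeds in order: optional BPF sk_lookup redirect,
 * then the lhash2 bucket for the exact destination address, then the
 * INADDR_ANY bucket.  A minimal caller sketch from a TCP-input style
 * context (iph/th are assumed parsed headers):
 *
 *	sk = __inet_lookup_listener(net, skb, __tcp_hdrlen(th),
 *				    iph->saddr, th->source,
 *				    iph->daddr, ntohs(th->dest),
 *				    inet_iif(skb), inet_sdif(skb));
 */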

/* All sockets share common refcount, but have different destructors */
void sock_gen_put(struct sock *sk)
{
	if (!refcount_dec_and_test(&sk->sk_refcnt))
		return;

	if (sk->sk_state == TCP_TIME_WAIT)
		inet_twsk_free(inet_twsk(sk));
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		reqsk_free(inet_reqsk(sk));
	else
		sk_free(sk);
}
EXPORT_SYMBOL_GPL(sock_gen_put);

void sock_edemux(struct sk_buff *skb)
{
	sock_gen_put(skb->sk);
}
EXPORT_SYMBOL(sock_edemux);

struct sock *__inet_lookup_established(const struct net *net,
				       const __be32 saddr, const __be16 sport,
				       const __be32 daddr, const u16 hnum,
				       const int dif, const int sdif)
{
	const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
	INET_ADDR_COOKIE(acookie, saddr, daddr);
	const struct hlist_nulls_node *node;
	struct inet_ehash_bucket *head;
	struct inet_hashinfo *hashinfo;
	unsigned int hash, slot;
	struct sock *sk;

	hashinfo = net->ipv4.tcp_death_row.hashinfo;
	hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
	slot = hash & hashinfo->ehash_mask;
	head = &hashinfo->ehash[slot];

begin:
	sk_nulls_for_each_rcu(sk, node, &head->chain) {
		if (sk->sk_hash != hash)
			continue;
		if (likely(inet_match(net, sk, acookie, ports, dif, sdif))) {
			if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
				goto out;
			if (unlikely(!inet_match(net, sk, acookie,
						 ports, dif, sdif))) {
				sock_gen_put(sk);
				goto begin;
			}
			goto found;
		}
	}
	/*
	 * if the nulls value we got at the end of this lookup is
	 * not the expected one, we must restart lookup.
	 * We probably met an item that was moved to another chain.
	 */
	if (get_nulls_value(node) != slot)
		goto begin;
out:
	sk = NULL;
found:
	return sk;
}
EXPORT_SYMBOL_GPL(__inet_lookup_established);
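
/* The lookup above runs locklessly under RCU and returns with a reference
 * taken on the matching socket.  A caller sketch (assumed variables):
 *
 *	rcu_read_lock();
 *	sk = __inet_lookup_established(net, saddr, sport, daddr, hnum,
 *				       dif, sdif);
 *	rcu_read_unlock();
 *	if (sk) {
 *		... use sk ...
 *		sock_gen_put(sk);	// drop the reference taken above
 *	}
 */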

/* called with local bh disabled */
static int __inet_check_established(struct inet_timewait_death_row *death_row,
				    struct sock *sk, __u16 lport,
				    struct inet_timewait_sock **twp,
				    bool rcu_lookup,
				    u32 hash)
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	struct inet_sock *inet = inet_sk(sk);
	__be32 daddr = inet->inet_rcv_saddr;
	__be32 saddr = inet->inet_daddr;
	int dif = sk->sk_bound_dev_if;
	struct net *net = sock_net(sk);
	int sdif = l3mdev_master_ifindex_by_index(net, dif);
	INET_ADDR_COOKIE(acookie, saddr, daddr);
	const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
	struct inet_timewait_sock *tw = NULL;
	const struct hlist_nulls_node *node;
	struct sock *sk2;
	spinlock_t *lock;

	if (rcu_lookup) {
		sk_nulls_for_each(sk2, node, &head->chain) {
			if (sk2->sk_hash != hash ||
			    !inet_match(net, sk2, acookie, ports, dif, sdif))
				continue;
			if (sk2->sk_state == TCP_TIME_WAIT)
				break;
			return -EADDRNOTAVAIL;
		}
		return 0;
	}

	lock = inet_ehash_lockp(hinfo, hash);
	spin_lock(lock);

	sk_nulls_for_each(sk2, node, &head->chain) {
		if (sk2->sk_hash != hash)
			continue;

		if (likely(inet_match(net, sk2, acookie, ports, dif, sdif))) {
			if (sk2->sk_state == TCP_TIME_WAIT) {
				tw = inet_twsk(sk2);
				if (tcp_twsk_unique(sk, sk2, twp))
					break;
			}
			goto not_unique;
		}
	}
	/* Must record num and sport now. Otherwise we will see
	 * in the hash table a socket with a funny identity.
	 */
	inet->inet_num = lport;
	inet->inet_sport = htons(lport);
	sk->sk_hash = hash;
	WARN_ON(!sk_unhashed(sk));
	__sk_nulls_add_node_rcu(sk, &head->chain);
	if (tw) {
		sk_nulls_del_node_init_rcu((struct sock *)tw);
		__NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED);
	}
	spin_unlock(lock);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);

	if (twp) {
		*twp = tw;
	} else if (tw) {
		/* Silly. Should hash-dance instead... */
		inet_twsk_deschedule_put(tw);
	}
	return 0;

not_unique:
	spin_unlock(lock);
	return -EADDRNOTAVAIL;
}

static u64 inet_sk_port_offset(const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);

	return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr,
					  inet->inet_daddr,
					  inet->inet_dport);
}

/* Searches for an existing socket in the ehash bucket list.
 * Returns true if found, false otherwise.
 */
static bool inet_ehash_lookup_by_sk(struct sock *sk,
				    struct hlist_nulls_head *list)
{
	const __portpair ports = INET_COMBINED_PORTS(sk->sk_dport, sk->sk_num);
	const int sdif = sk->sk_bound_dev_if;
	const int dif = sk->sk_bound_dev_if;
	const struct hlist_nulls_node *node;
	struct net *net = sock_net(sk);
	struct sock *esk;

	INET_ADDR_COOKIE(acookie, sk->sk_daddr, sk->sk_rcv_saddr);

	sk_nulls_for_each_rcu(esk, node, list) {
		if (esk->sk_hash != sk->sk_hash)
			continue;
		if (sk->sk_family == AF_INET) {
			if (unlikely(inet_match(net, esk, acookie,
						ports, dif, sdif))) {
				return true;
			}
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (sk->sk_family == AF_INET6) {
			if (unlikely(inet6_match(net, esk,
						 &sk->sk_v6_daddr,
						 &sk->sk_v6_rcv_saddr,
						 ports, dif, sdif))) {
				return true;
			}
		}
#endif
	}
	return false;
}

/* Insert a socket into ehash, and possibly remove another one
 * (the other one can be a SYN_RECV or TIMEWAIT socket).
 * If a duplicate socket already exists, sk is not inserted
 * and the found_dup_sk parameter is set to true.
 */
bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk)
{
	struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk);
	struct inet_ehash_bucket *head;
	struct hlist_nulls_head *list;
	spinlock_t *lock;
	bool ret = true;

	WARN_ON_ONCE(!sk_unhashed(sk));

	sk->sk_hash = sk_ehashfn(sk);
	head = inet_ehash_bucket(hashinfo, sk->sk_hash);
	list = &head->chain;
	lock = inet_ehash_lockp(hashinfo, sk->sk_hash);

	spin_lock(lock);
	if (osk) {
		WARN_ON_ONCE(sk->sk_hash != osk->sk_hash);
		ret = sk_nulls_replace_node_init_rcu(osk, sk);
		goto unlock;
	}

	if (found_dup_sk) {
		*found_dup_sk = inet_ehash_lookup_by_sk(sk, list);
		if (*found_dup_sk)
			ret = false;
	}

	if (ret)
		__sk_nulls_add_node_rcu(sk, list);

unlock:
	spin_unlock(lock);

	return ret;
}

bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk)
{
	bool ok = inet_ehash_insert(sk, osk, found_dup_sk);

	if (ok) {
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	} else {
		tcp_orphan_count_inc();
		inet_sk_set_state(sk, TCP_CLOSE);
		sock_set_flag(sk, SOCK_DEAD);
		inet_csk_destroy_sock(sk);
	}
	return ok;
}
EXPORT_IPV6_MOD(inet_ehash_nolisten);
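
/* Typical use from the TCP mini-socket code (a sketch; the surrounding
 * tcp_v4_syn_recv_sock()-style context and variables are assumed):
 *
 *	bool found_dup_sk = false;
 *
 *	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
 *				       &found_dup_sk);
 *	if (found_dup_sk)
 *		... newsk was not inserted; resolve the race ...
 */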

static int inet_reuseport_add_sock(struct sock *sk,
				   struct inet_listen_hashbucket *ilb)
{
	struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash;
	const struct hlist_nulls_node *node;
	kuid_t uid = sk_uid(sk);
	struct sock *sk2;

	sk_nulls_for_each_rcu(sk2, node, &ilb->nulls_head) {
		if (sk2 != sk &&
		    sk2->sk_family == sk->sk_family &&
		    ipv6_only_sock(sk2) == ipv6_only_sock(sk) &&
		    sk2->sk_bound_dev_if == sk->sk_bound_dev_if &&
		    inet_csk(sk2)->icsk_bind_hash == tb &&
		    sk2->sk_reuseport && uid_eq(uid, sk_uid(sk2)) &&
		    inet_rcv_saddr_equal(sk, sk2, false))
			return reuseport_add_sock(sk, sk2,
						  inet_rcv_saddr_any(sk));
	}

	return reuseport_alloc(sk, inet_rcv_saddr_any(sk));
}

int inet_hash(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk);
	struct inet_listen_hashbucket *ilb2;
	int err = 0;

	if (sk->sk_state == TCP_CLOSE)
		return 0;

	if (sk->sk_state != TCP_LISTEN) {
		local_bh_disable();
		inet_ehash_nolisten(sk, NULL, NULL);
		local_bh_enable();
		return 0;
	}

#if IS_ENABLED(CONFIG_IPV6)
	if (sk->sk_family == AF_INET6)
		inet6_init_ehash_secret();
#endif
	inet_init_ehash_secret();

	WARN_ON(!sk_unhashed(sk));
	ilb2 = inet_lhash2_bucket_sk(hashinfo, sk);

	spin_lock(&ilb2->lock);
	if (sk->sk_reuseport) {
		err = inet_reuseport_add_sock(sk, ilb2);
		if (err)
			goto unlock;
	}
	sock_set_flag(sk, SOCK_RCU_FREE);
	if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
		sk->sk_family == AF_INET6)
		__sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head);
	else
		__sk_nulls_add_node_rcu(sk, &ilb2->nulls_head);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
unlock:
	spin_unlock(&ilb2->lock);

	return err;
}
EXPORT_IPV6_MOD(inet_hash);

void inet_unhash(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk);

	if (sk_unhashed(sk))
		return;

	sock_rps_delete_flow(sk);
	if (sk->sk_state == TCP_LISTEN) {
		struct inet_listen_hashbucket *ilb2;

		ilb2 = inet_lhash2_bucket_sk(hashinfo, sk);
		/* Don't disable bottom halves while acquiring the lock to
		 * avoid circular locking dependency on PREEMPT_RT.
		 */
		spin_lock(&ilb2->lock);
		if (rcu_access_pointer(sk->sk_reuseport_cb))
			reuseport_stop_listen_sock(sk);

		__sk_nulls_del_node_init_rcu(sk);
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
		spin_unlock(&ilb2->lock);
	} else {
		spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);

		spin_lock_bh(lock);
		__sk_nulls_del_node_init_rcu(sk);
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
		spin_unlock_bh(lock);
	}
}
EXPORT_IPV6_MOD(inet_unhash);

static bool inet_bind2_bucket_match(const struct inet_bind2_bucket *tb,
				    const struct net *net, unsigned short port,
				    int l3mdev, const struct sock *sk)
{
	if (!net_eq(ib2_net(tb), net) || tb->port != port ||
	    tb->l3mdev != l3mdev)
		return false;

	return inet_bind2_bucket_addr_match(tb, sk);
}

bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb, const struct net *net,
				      unsigned short port, int l3mdev, const struct sock *sk)
{
	if (!net_eq(ib2_net(tb), net) || tb->port != port ||
	    tb->l3mdev != l3mdev)
		return false;

#if IS_ENABLED(CONFIG_IPV6)
	if (tb->addr_type == IPV6_ADDR_ANY)
		return true;

	if (tb->addr_type != IPV6_ADDR_MAPPED)
		return false;

	if (sk->sk_family == AF_INET6 &&
	    !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
		return false;
#endif
	return tb->rcv_saddr == 0;
}

/* The socket's bhash2 hashbucket spinlock must be held when this is called */
struct inet_bind2_bucket *
inet_bind2_bucket_find(const struct inet_bind_hashbucket *head, const struct net *net,
		       unsigned short port, int l3mdev, const struct sock *sk)
{
	struct inet_bind2_bucket *bhash2 = NULL;

	inet_bind_bucket_for_each(bhash2, &head->chain)
		if (inet_bind2_bucket_match(bhash2, net, port, l3mdev, sk))
			break;

	return bhash2;
}

struct inet_bind_hashbucket *
inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, int port)
{
	struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk);
	u32 hash;

#if IS_ENABLED(CONFIG_IPV6)
	if (sk->sk_family == AF_INET6)
		hash = ipv6_portaddr_hash(net, &in6addr_any, port);
	else
#endif
		hash = ipv4_portaddr_hash(net, 0, port);

	return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)];
}

static void inet_update_saddr(struct sock *sk, void *saddr, int family)
{
	if (family == AF_INET) {
		inet_sk(sk)->inet_saddr = *(__be32 *)saddr;
		sk_rcv_saddr_set(sk, inet_sk(sk)->inet_saddr);
	}
#if IS_ENABLED(CONFIG_IPV6)
	else {
		sk->sk_v6_rcv_saddr = *(struct in6_addr *)saddr;
	}
#endif
}

static int __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family, bool reset)
{
	struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk);
	struct inet_bind_hashbucket *head, *head2;
	struct inet_bind2_bucket *tb2, *new_tb2;
	int l3mdev = inet_sk_bound_l3mdev(sk);
	int port = inet_sk(sk)->inet_num;
	struct net *net = sock_net(sk);
	int bhash;

	if (!inet_csk(sk)->icsk_bind2_hash) {
		/* Not bind()ed before. */
		if (reset)
			inet_reset_saddr(sk);
		else
			inet_update_saddr(sk, saddr, family);

		return 0;
	}

	/* Allocate a bind2 bucket ahead of time to avoid permanently putting
	 * the bhash2 table in an inconsistent state if a new tb2 bucket
	 * allocation fails.
	 */
	new_tb2 = kmem_cache_alloc(hinfo->bind2_bucket_cachep, GFP_ATOMIC);
	if (!new_tb2) {
		if (reset) {
			/* The (INADDR_ANY, port) bucket might have already
			 * been freed; in that case we cannot fix up
			 * icsk_bind2_hash, so we give up and unlink sk from
			 * bhash/bhash2 so as not to leave bhash2 inconsistent.
			 */
			inet_put_port(sk);
			inet_reset_saddr(sk);
		}

		return -ENOMEM;
	}

	bhash = inet_bhashfn(net, port, hinfo->bhash_size);
	head = &hinfo->bhash[bhash];
	head2 = inet_bhashfn_portaddr(hinfo, sk, net, port);

	/* If we change saddr locklessly, another thread
	 * iterating over bhash might see corrupted address.
	 */
	spin_lock_bh(&head->lock);

	spin_lock(&head2->lock);
	__sk_del_bind_node(sk);
	inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, inet_csk(sk)->icsk_bind2_hash);
	spin_unlock(&head2->lock);

	if (reset)
		inet_reset_saddr(sk);
	else
		inet_update_saddr(sk, saddr, family);

	head2 = inet_bhashfn_portaddr(hinfo, sk, net, port);

	spin_lock(&head2->lock);
	tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk);
	if (!tb2) {
		tb2 = new_tb2;
		inet_bind2_bucket_init(tb2, net, head2, inet_csk(sk)->icsk_bind_hash, sk);
		if (sk_is_connect_bind(sk)) {
			tb2->fastreuse = -1;
			tb2->fastreuseport = -1;
		}
	}
	inet_csk(sk)->icsk_bind2_hash = tb2;
	sk_add_bind_node(sk, &tb2->owners);
	spin_unlock(&head2->lock);

	spin_unlock_bh(&head->lock);

	if (tb2 != new_tb2)
		kmem_cache_free(hinfo->bind2_bucket_cachep, new_tb2);

	return 0;
}

int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family)
{
	return __inet_bhash2_update_saddr(sk, saddr, family, false);
}
EXPORT_IPV6_MOD(inet_bhash2_update_saddr);

void inet_bhash2_reset_saddr(struct sock *sk)
{
	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
		__inet_bhash2_update_saddr(sk, NULL, 0, true);
}
EXPORT_IPV6_MOD(inet_bhash2_reset_saddr);

/* RFC 6056 3.3.4.  Algorithm 4: Double-Hash Port Selection Algorithm
 * Note that we use 32bit integers (vs RFC 'short integers')
 * because 2^16 is not a multiple of num_ephemeral and this
 * property might be used by a clever attacker.
 *
 * The RFC claims that using TABLE_LENGTH=10 buckets gives an improvement,
 * though attacks have since been demonstrated, thus we use 65536 by default
 * instead to really give more isolation and privacy, at the expense of
 * 256kB of kernel memory.
 */
#define INET_TABLE_PERTURB_SIZE (1 << CONFIG_INET_TABLE_PERTURB_ORDER)
static u32 *table_perturb;
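
/* Sketch of the double-hash selection performed by __inet_hash_connect()
 * below (names as in that function, values illustrative):
 *
 *	index  = port_offset & (INET_TABLE_PERTURB_SIZE - 1);
 *	offset = (table_perturb[index] + (port_offset >> 32)) % remaining;
 *	port   = low + offset;	// then scan by step, skipping reserved ports
 *
 * On success, table_perturb[index] is bumped so the next connect() with the
 * same (saddr, daddr, dport) tuple does not start at the same port.
 */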

int __inet_hash_connect(struct inet_timewait_death_row *death_row,
		struct sock *sk, u64 port_offset,
		u32 hash_port0,
		int (*check_established)(struct inet_timewait_death_row *,
			struct sock *, __u16, struct inet_timewait_sock **,
			bool rcu_lookup, u32 hash))
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	struct inet_bind_hashbucket *head, *head2;
	struct inet_timewait_sock *tw = NULL;
	int port = inet_sk(sk)->inet_num;
	struct net *net = sock_net(sk);
	struct inet_bind2_bucket *tb2;
	struct inet_bind_bucket *tb;
	bool tb_created = false;
	u32 remaining, offset;
	int ret, i, low, high;
	bool local_ports;
	int step, l3mdev;
	u32 index;

	if (port) {
		local_bh_disable();
		ret = check_established(death_row, sk, port, NULL, false,
					hash_port0 + port);
		local_bh_enable();
		return ret;
	}

	l3mdev = inet_sk_bound_l3mdev(sk);

	local_ports = inet_sk_get_local_port_range(sk, &low, &high);
	step = local_ports ? 1 : 2;

	high++; /* [32768, 60999] -> [32768, 61000[ */
	remaining = high - low;
	if (!local_ports && remaining > 1)
		remaining &= ~1U;

	get_random_sleepable_once(table_perturb,
				  INET_TABLE_PERTURB_SIZE * sizeof(*table_perturb));
	index = port_offset & (INET_TABLE_PERTURB_SIZE - 1);

	offset = READ_ONCE(table_perturb[index]) + (port_offset >> 32);
	offset %= remaining;

	/* In first pass we try ports of @low parity.
	 * inet_csk_get_port() does the opposite choice.
	 */
	if (!local_ports)
		offset &= ~1U;
other_parity_scan:
	port = low + offset;
	for (i = 0; i < remaining; i += step, port += step) {
		if (unlikely(port >= high))
			port -= remaining;
		if (inet_is_local_reserved_port(net, port))
			continue;
		head = &hinfo->bhash[inet_bhashfn(net, port,
						  hinfo->bhash_size)];
		rcu_read_lock();
		hlist_for_each_entry_rcu(tb, &head->chain, node) {
			if (!inet_bind_bucket_match(tb, net, port, l3mdev))
				continue;
			if (tb->fastreuse >= 0 || tb->fastreuseport >= 0) {
				rcu_read_unlock();
				goto next_port;
			}
			if (!check_established(death_row, sk, port, &tw, true,
					       hash_port0 + port))
				break;
			rcu_read_unlock();
			goto next_port;
		}
		rcu_read_unlock();

		spin_lock_bh(&head->lock);

		/* Does not bother with rcv_saddr checks, because
		 * the established check is already unique enough.
		 */
		inet_bind_bucket_for_each(tb, &head->chain) {
			if (inet_bind_bucket_match(tb, net, port, l3mdev)) {
				if (tb->fastreuse >= 0 ||
				    tb->fastreuseport >= 0)
					goto next_port_unlock;
				WARN_ON(hlist_empty(&tb->bhash2));
				if (!check_established(death_row, sk,
						       port, &tw, false,
						       hash_port0 + port))
					goto ok;
				goto next_port_unlock;
			}
		}

		tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
					     net, head, port, l3mdev);
		if (!tb) {
			spin_unlock_bh(&head->lock);
			return -ENOMEM;
		}
		tb_created = true;
		tb->fastreuse = -1;
		tb->fastreuseport = -1;
		goto ok;
next_port_unlock:
		spin_unlock_bh(&head->lock);
next_port:
		cond_resched();
	}

	if (!local_ports) {
		offset++;
		if ((offset & 1) && remaining > 1)
			goto other_parity_scan;
	}
	return -EADDRNOTAVAIL;

ok:
	/* Find the corresponding tb2 bucket since we need to
	 * add the socket to the bhash2 table as well
	 */
	head2 = inet_bhashfn_portaddr(hinfo, sk, net, port);
	spin_lock(&head2->lock);

	tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk);
	if (!tb2) {
		tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep, net,
					       head2, tb, sk);
		if (!tb2)
			goto error;
		tb2->fastreuse = -1;
		tb2->fastreuseport = -1;
	}

	/* Here we want to add a little bit of randomness to the next source
	 * port that will be chosen. We use a max() with a random value here
	 * so that on low contention the randomness is maximal and on high
	 * contention it may be nonexistent.
	 */
	i = max_t(int, i, get_random_u32_below(8) * step);
	WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + step);

	/* Head lock still held and bh's disabled */
	inet_bind_hash(sk, tb, tb2, port);
	sk->sk_userlocks |= SOCK_CONNECT_BIND;

	if (sk_unhashed(sk)) {
		inet_sk(sk)->inet_sport = htons(port);
		inet_ehash_nolisten(sk, (struct sock *)tw, NULL);
	}
	if (tw)
		inet_twsk_bind_unhash(tw, hinfo);

	spin_unlock(&head2->lock);
	spin_unlock(&head->lock);

	if (tw)
		inet_twsk_deschedule_put(tw);
	local_bh_enable();
	return 0;

error:
	if (sk_hashed(sk)) {
		spinlock_t *lock = inet_ehash_lockp(hinfo, sk->sk_hash);

		sock_prot_inuse_add(net, sk->sk_prot, -1);

		spin_lock(lock);
		__sk_nulls_del_node_init_rcu(sk);
		spin_unlock(lock);

		sk->sk_hash = 0;
		inet_sk(sk)->inet_sport = 0;
		WRITE_ONCE(inet_sk(sk)->inet_num, 0);

		if (tw)
			inet_twsk_bind_unhash(tw, hinfo);
	}

	spin_unlock(&head2->lock);
	if (tb_created)
		inet_bind_bucket_destroy(tb);
	spin_unlock(&head->lock);

	if (tw)
		inet_twsk_deschedule_put(tw);

	local_bh_enable();

	return -ENOMEM;
}

/*
 * Bind a port for a connect operation and hash it.
 */
int inet_hash_connect(struct inet_timewait_death_row *death_row,
		      struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	const struct net *net = sock_net(sk);
	u64 port_offset = 0;
	u32 hash_port0;

	if (!inet_sk(sk)->inet_num)
		port_offset = inet_sk_port_offset(sk);

	inet_init_ehash_secret();

	hash_port0 = inet_ehashfn(net, inet->inet_rcv_saddr, 0,
				  inet->inet_daddr, inet->inet_dport);

	return __inet_hash_connect(death_row, sk, port_offset, hash_port0,
				   __inet_check_established);
}

void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name,
				unsigned long numentries, int scale,
				unsigned long low_limit,
				unsigned long high_limit)
{
	unsigned int i;

	h->lhash2 = alloc_large_system_hash(name,
					    sizeof(*h->lhash2),
					    numentries,
					    scale,
					    0,
					    NULL,
					    &h->lhash2_mask,
					    low_limit,
					    high_limit);

	for (i = 0; i <= h->lhash2_mask; i++) {
		spin_lock_init(&h->lhash2[i].lock);
		INIT_HLIST_NULLS_HEAD(&h->lhash2[i].nulls_head,
				      i + LISTENING_NULLS_BASE);
	}

	/* this one is used for source ports of outgoing connections */
	table_perturb = alloc_large_system_hash("Table-perturb",
						sizeof(*table_perturb),
						INET_TABLE_PERTURB_SIZE,
						0, 0, NULL, NULL,
						INET_TABLE_PERTURB_SIZE,
						INET_TABLE_PERTURB_SIZE);
}

int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
{
	unsigned int locksz = sizeof(spinlock_t);
	unsigned int i, nblocks = 1;
	spinlock_t *ptr = NULL;

	if (locksz == 0)
		goto set_mask;

	/* Allocate 2 cache lines or at least one spinlock per cpu. */
	nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U) * num_possible_cpus();

	/* At least one page per NUMA node. */
	nblocks = max(nblocks, num_online_nodes() * PAGE_SIZE / locksz);

	nblocks = roundup_pow_of_two(nblocks);

	/* No more locks than number of hash buckets. */
	nblocks = min(nblocks, hashinfo->ehash_mask + 1);

	if (num_online_nodes() > 1) {
		/* Use vmalloc() to allow NUMA policy to spread pages
		 * on all available nodes if desired.
		 */
		ptr = vmalloc_array(nblocks, locksz);
	}
	if (!ptr) {
		ptr = kvmalloc_array(nblocks, locksz, GFP_KERNEL);
		if (!ptr)
			return -ENOMEM;
	}
	for (i = 0; i < nblocks; i++)
		spin_lock_init(&ptr[i]);
	hashinfo->ehash_locks = ptr;
set_mask:
	hashinfo->ehash_locks_mask = nblocks - 1;
	return 0;
}
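
/* Worked example of the sizing above with assumed values: locksz = 4
 * (spinlock_t without lock debugging), L1_CACHE_BYTES = 64, 8 possible
 * CPUs, one NUMA node and PAGE_SIZE = 4096:
 *
 *	2 * 64 / 4 = 32 locks per CPU  -> 32 * 8 = 256
 *	max(256, 1 * 4096 / 4 = 1024)  -> 1024
 *	roundup_pow_of_two(1024)       -> 1024
 *
 * then capped at ehash_mask + 1 so there is at most one lock per bucket.
 */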

struct inet_hashinfo *inet_pernet_hashinfo_alloc(struct inet_hashinfo *hashinfo,
						 unsigned int ehash_entries)
{
	struct inet_hashinfo *new_hashinfo;
	int i;

	new_hashinfo = kmemdup(hashinfo, sizeof(*hashinfo), GFP_KERNEL);
	if (!new_hashinfo)
		goto err;

	new_hashinfo->ehash = vmalloc_huge(ehash_entries * sizeof(struct inet_ehash_bucket),
					   GFP_KERNEL_ACCOUNT);
	if (!new_hashinfo->ehash)
		goto free_hashinfo;

	new_hashinfo->ehash_mask = ehash_entries - 1;

	if (inet_ehash_locks_alloc(new_hashinfo))
		goto free_ehash;

	for (i = 0; i < ehash_entries; i++)
		INIT_HLIST_NULLS_HEAD(&new_hashinfo->ehash[i].chain, i);

	new_hashinfo->pernet = true;

	return new_hashinfo;

free_ehash:
	vfree(new_hashinfo->ehash);
free_hashinfo:
	kfree(new_hashinfo);
err:
	return NULL;
}

void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo)
{
	if (!hashinfo->pernet)
		return;

	inet_ehash_locks_free(hashinfo);
	vfree(hashinfo->ehash);
	kfree(hashinfo);
}