1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * INET An implementation of the TCP/IP protocol suite for the LINUX 4 * operating system. INET is implemented using the BSD Socket 5 * interface as the means of communication with the user level. 6 * 7 * Generic INET transport hashtables 8 * 9 * Authors: Lotsa people, from code originally in tcp 10 */ 11 12 #include <linux/module.h> 13 #include <linux/random.h> 14 #include <linux/sched.h> 15 #include <linux/slab.h> 16 #include <linux/wait.h> 17 #include <linux/vmalloc.h> 18 #include <linux/memblock.h> 19 #include <linux/gcd.h> 20 21 #include <net/addrconf.h> 22 #include <net/inet_connection_sock.h> 23 #include <net/inet_hashtables.h> 24 #if IS_ENABLED(CONFIG_IPV6) 25 #include <net/inet6_hashtables.h> 26 #endif 27 #include <net/hotdata.h> 28 #include <net/ip.h> 29 #include <net/rps.h> 30 #include <net/secure_seq.h> 31 #include <net/sock_reuseport.h> 32 #include <net/tcp.h> 33 34 static void inet_init_ehash_secret(void) 35 { 36 net_get_random_sleepable_once(&inet_ehash_secret, 37 sizeof(inet_ehash_secret)); 38 } 39 40 u32 inet_ehashfn(const struct net *net, const __be32 laddr, 41 const __u16 lport, const __be32 faddr, 42 const __be16 fport) 43 { 44 return lport + __inet_ehashfn(laddr, 0, faddr, fport, 45 inet_ehash_secret + net_hash_mix(net)); 46 } 47 EXPORT_SYMBOL_GPL(inet_ehashfn); 48 49 /* This function handles inet_sock, but also timewait and request sockets 50 * for IPv4/IPv6. 51 */ 52 static u32 sk_ehashfn(const struct sock *sk) 53 { 54 #if IS_ENABLED(CONFIG_IPV6) 55 if (sk->sk_family == AF_INET6 && 56 !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) 57 return inet6_ehashfn(sock_net(sk), 58 &sk->sk_v6_rcv_saddr, sk->sk_num, 59 &sk->sk_v6_daddr, sk->sk_dport); 60 #endif 61 return inet_ehashfn(sock_net(sk), 62 sk->sk_rcv_saddr, sk->sk_num, 63 sk->sk_daddr, sk->sk_dport); 64 } 65 66 static bool sk_is_connect_bind(const struct sock *sk) 67 { 68 if (sk->sk_state == TCP_TIME_WAIT) 69 return inet_twsk(sk)->tw_connect_bind; 70 else 71 return sk->sk_userlocks & SOCK_CONNECT_BIND; 72 } 73 74 /* 75 * Allocate and initialize a new local port bind bucket. 76 * The bindhash mutex for snum's hash chain must be held here. 77 */ 78 struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, 79 struct net *net, 80 struct inet_bind_hashbucket *head, 81 const unsigned short snum, 82 int l3mdev) 83 { 84 struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC); 85 86 if (tb) { 87 write_pnet(&tb->ib_net, net); 88 tb->l3mdev = l3mdev; 89 tb->port = snum; 90 tb->fastreuse = 0; 91 tb->fastreuseport = 0; 92 INIT_HLIST_HEAD(&tb->bhash2); 93 hlist_add_head_rcu(&tb->node, &head->chain); 94 } 95 return tb; 96 } 97 98 /* 99 * Caller must hold hashbucket lock for this tb with local BH disabled 100 */ 101 void inet_bind_bucket_destroy(struct inet_bind_bucket *tb) 102 { 103 const struct inet_bind2_bucket *tb2; 104 105 if (hlist_empty(&tb->bhash2)) { 106 hlist_del_rcu(&tb->node); 107 kfree_rcu(tb, rcu); 108 return; 109 } 110 111 if (tb->fastreuse == -1 && tb->fastreuseport == -1) 112 return; 113 hlist_for_each_entry(tb2, &tb->bhash2, bhash_node) { 114 if (tb2->fastreuse != -1 || tb2->fastreuseport != -1) 115 return; 116 } 117 tb->fastreuse = -1; 118 tb->fastreuseport = -1; 119 } 120 121 bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, const struct net *net, 122 unsigned short port, int l3mdev) 123 { 124 return net_eq(ib_net(tb), net) && tb->port == port && 125 tb->l3mdev == l3mdev; 126 } 127 128 static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb2, 129 struct net *net, 130 struct inet_bind_hashbucket *head, 131 struct inet_bind_bucket *tb, 132 const struct sock *sk) 133 { 134 write_pnet(&tb2->ib_net, net); 135 tb2->l3mdev = tb->l3mdev; 136 tb2->port = tb->port; 137 #if IS_ENABLED(CONFIG_IPV6) 138 BUILD_BUG_ON(USHRT_MAX < (IPV6_ADDR_ANY | IPV6_ADDR_MAPPED)); 139 if (sk->sk_family == AF_INET6) { 140 tb2->addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr); 141 tb2->v6_rcv_saddr = sk->sk_v6_rcv_saddr; 142 } else { 143 tb2->addr_type = IPV6_ADDR_MAPPED; 144 ipv6_addr_set_v4mapped(sk->sk_rcv_saddr, &tb2->v6_rcv_saddr); 145 } 146 #else 147 tb2->rcv_saddr = sk->sk_rcv_saddr; 148 #endif 149 tb2->fastreuse = 0; 150 tb2->fastreuseport = 0; 151 INIT_HLIST_HEAD(&tb2->owners); 152 hlist_add_head(&tb2->node, &head->chain); 153 hlist_add_head(&tb2->bhash_node, &tb->bhash2); 154 } 155 156 struct inet_bind2_bucket *inet_bind2_bucket_create(struct kmem_cache *cachep, 157 struct net *net, 158 struct inet_bind_hashbucket *head, 159 struct inet_bind_bucket *tb, 160 const struct sock *sk) 161 { 162 struct inet_bind2_bucket *tb2 = kmem_cache_alloc(cachep, GFP_ATOMIC); 163 164 if (tb2) 165 inet_bind2_bucket_init(tb2, net, head, tb, sk); 166 167 return tb2; 168 } 169 170 /* Caller must hold hashbucket lock for this tb with local BH disabled */ 171 void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb) 172 { 173 const struct sock *sk; 174 175 if (hlist_empty(&tb->owners)) { 176 __hlist_del(&tb->node); 177 __hlist_del(&tb->bhash_node); 178 kmem_cache_free(cachep, tb); 179 return; 180 } 181 182 if (tb->fastreuse == -1 && tb->fastreuseport == -1) 183 return; 184 sk_for_each_bound(sk, &tb->owners) { 185 if (!sk_is_connect_bind(sk)) 186 return; 187 } 188 tb->fastreuse = -1; 189 tb->fastreuseport = -1; 190 } 191 192 static bool inet_bind2_bucket_addr_match(const struct inet_bind2_bucket *tb2, 193 const struct sock *sk) 194 { 195 #if IS_ENABLED(CONFIG_IPV6) 196 if (sk->sk_family == AF_INET6) 197 return ipv6_addr_equal(&tb2->v6_rcv_saddr, &sk->sk_v6_rcv_saddr); 198 199 if (tb2->addr_type != IPV6_ADDR_MAPPED) 200 return false; 201 #endif 202 return tb2->rcv_saddr == sk->sk_rcv_saddr; 203 } 204 205 void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, 206 struct inet_bind2_bucket *tb2, unsigned short port) 207 { 208 WRITE_ONCE(inet_sk(sk)->inet_num, port); 209 inet_csk(sk)->icsk_bind_hash = tb; 210 inet_csk(sk)->icsk_bind2_hash = tb2; 211 sk_add_bind_node(sk, &tb2->owners); 212 } 213 214 /* 215 * Get rid of any references to a local port held by the given sock. 216 */ 217 static void __inet_put_port(struct sock *sk) 218 { 219 struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); 220 struct inet_bind_hashbucket *head, *head2; 221 struct net *net = sock_net(sk); 222 struct inet_bind_bucket *tb; 223 int bhash; 224 225 bhash = inet_bhashfn(net, inet_sk(sk)->inet_num, hashinfo->bhash_size); 226 head = &hashinfo->bhash[bhash]; 227 head2 = inet_bhashfn_portaddr(hashinfo, sk, net, inet_sk(sk)->inet_num); 228 229 spin_lock(&head->lock); 230 tb = inet_csk(sk)->icsk_bind_hash; 231 inet_csk(sk)->icsk_bind_hash = NULL; 232 WRITE_ONCE(inet_sk(sk)->inet_num, 0); 233 sk->sk_userlocks &= ~SOCK_CONNECT_BIND; 234 235 spin_lock(&head2->lock); 236 if (inet_csk(sk)->icsk_bind2_hash) { 237 struct inet_bind2_bucket *tb2 = inet_csk(sk)->icsk_bind2_hash; 238 239 __sk_del_bind_node(sk); 240 inet_csk(sk)->icsk_bind2_hash = NULL; 241 inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2); 242 } 243 spin_unlock(&head2->lock); 244 245 inet_bind_bucket_destroy(tb); 246 spin_unlock(&head->lock); 247 } 248 249 void inet_put_port(struct sock *sk) 250 { 251 local_bh_disable(); 252 __inet_put_port(sk); 253 local_bh_enable(); 254 } 255 256 int __inet_inherit_port(const struct sock *sk, struct sock *child) 257 { 258 struct inet_hashinfo *table = tcp_get_hashinfo(sk); 259 unsigned short port = inet_sk(child)->inet_num; 260 struct inet_bind_hashbucket *head, *head2; 261 bool created_inet_bind_bucket = false; 262 struct net *net = sock_net(sk); 263 bool update_fastreuse = false; 264 struct inet_bind2_bucket *tb2; 265 struct inet_bind_bucket *tb; 266 int bhash, l3mdev; 267 268 bhash = inet_bhashfn(net, port, table->bhash_size); 269 head = &table->bhash[bhash]; 270 head2 = inet_bhashfn_portaddr(table, child, net, port); 271 272 spin_lock(&head->lock); 273 spin_lock(&head2->lock); 274 tb = inet_csk(sk)->icsk_bind_hash; 275 tb2 = inet_csk(sk)->icsk_bind2_hash; 276 if (unlikely(!tb || !tb2)) { 277 spin_unlock(&head2->lock); 278 spin_unlock(&head->lock); 279 return -ENOENT; 280 } 281 if (tb->port != port) { 282 l3mdev = inet_sk_bound_l3mdev(sk); 283 284 /* NOTE: using tproxy and redirecting skbs to a proxy 285 * on a different listener port breaks the assumption 286 * that the listener socket's icsk_bind_hash is the same 287 * as that of the child socket. We have to look up or 288 * create a new bind bucket for the child here. */ 289 inet_bind_bucket_for_each(tb, &head->chain) { 290 if (inet_bind_bucket_match(tb, net, port, l3mdev)) 291 break; 292 } 293 if (!tb) { 294 tb = inet_bind_bucket_create(table->bind_bucket_cachep, 295 net, head, port, l3mdev); 296 if (!tb) { 297 spin_unlock(&head2->lock); 298 spin_unlock(&head->lock); 299 return -ENOMEM; 300 } 301 created_inet_bind_bucket = true; 302 } 303 update_fastreuse = true; 304 305 goto bhash2_find; 306 } else if (!inet_bind2_bucket_addr_match(tb2, child)) { 307 l3mdev = inet_sk_bound_l3mdev(sk); 308 309 bhash2_find: 310 tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, child); 311 if (!tb2) { 312 tb2 = inet_bind2_bucket_create(table->bind2_bucket_cachep, 313 net, head2, tb, child); 314 if (!tb2) 315 goto error; 316 } 317 } 318 if (update_fastreuse) 319 inet_csk_update_fastreuse(child, tb, tb2); 320 inet_bind_hash(child, tb, tb2, port); 321 spin_unlock(&head2->lock); 322 spin_unlock(&head->lock); 323 324 return 0; 325 326 error: 327 if (created_inet_bind_bucket) 328 inet_bind_bucket_destroy(tb); 329 spin_unlock(&head2->lock); 330 spin_unlock(&head->lock); 331 return -ENOMEM; 332 } 333 334 static struct inet_listen_hashbucket * 335 inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk) 336 { 337 u32 hash; 338 339 #if IS_ENABLED(CONFIG_IPV6) 340 if (sk->sk_family == AF_INET6) 341 hash = ipv6_portaddr_hash(sock_net(sk), 342 &sk->sk_v6_rcv_saddr, 343 inet_sk(sk)->inet_num); 344 else 345 #endif 346 hash = ipv4_portaddr_hash(sock_net(sk), 347 inet_sk(sk)->inet_rcv_saddr, 348 inet_sk(sk)->inet_num); 349 return inet_lhash2_bucket(h, hash); 350 } 351 352 static inline int compute_score(struct sock *sk, const struct net *net, 353 const unsigned short hnum, const __be32 daddr, 354 const int dif, const int sdif) 355 { 356 int score = -1; 357 358 if (net_eq(sock_net(sk), net) && READ_ONCE(sk->sk_num) == hnum && 359 !ipv6_only_sock(sk)) { 360 if (sk->sk_rcv_saddr != daddr) 361 return -1; 362 363 if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) 364 return -1; 365 score = sk->sk_bound_dev_if ? 2 : 1; 366 367 if (sk->sk_family == PF_INET) 368 score++; 369 if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) 370 score++; 371 } 372 return score; 373 } 374 375 /** 376 * inet_lookup_reuseport() - execute reuseport logic on AF_INET socket if necessary. 377 * @net: network namespace. 378 * @sk: AF_INET socket, must be in TCP_LISTEN state for TCP or TCP_CLOSE for UDP. 379 * @skb: context for a potential SK_REUSEPORT program. 380 * @doff: header offset. 381 * @saddr: source address. 382 * @sport: source port. 383 * @daddr: destination address. 384 * @hnum: destination port in host byte order. 385 * @ehashfn: hash function used to generate the fallback hash. 386 * 387 * Return: NULL if sk doesn't have SO_REUSEPORT set, otherwise a pointer to 388 * the selected sock or an error. 389 */ 390 struct sock *inet_lookup_reuseport(const struct net *net, struct sock *sk, 391 struct sk_buff *skb, int doff, 392 __be32 saddr, __be16 sport, 393 __be32 daddr, unsigned short hnum, 394 inet_ehashfn_t *ehashfn) 395 { 396 struct sock *reuse_sk = NULL; 397 u32 phash; 398 399 if (sk->sk_reuseport) { 400 phash = INDIRECT_CALL_2(ehashfn, udp_ehashfn, inet_ehashfn, 401 net, daddr, hnum, saddr, sport); 402 reuse_sk = reuseport_select_sock(sk, phash, skb, doff); 403 } 404 return reuse_sk; 405 } 406 EXPORT_SYMBOL_GPL(inet_lookup_reuseport); 407 408 /* 409 * Here are some nice properties to exploit here. The BSD API 410 * does not allow a listening sock to specify the remote port nor the 411 * remote address for the connection. So always assume those are both 412 * wildcarded during the search since they can never be otherwise. 413 */ 414 415 /* called with rcu_read_lock() : No refcount taken on the socket */ 416 static struct sock *inet_lhash2_lookup(const struct net *net, 417 struct inet_listen_hashbucket *ilb2, 418 struct sk_buff *skb, int doff, 419 const __be32 saddr, __be16 sport, 420 const __be32 daddr, const unsigned short hnum, 421 const int dif, const int sdif) 422 { 423 struct sock *sk, *result = NULL; 424 struct hlist_nulls_node *node; 425 int score, hiscore = 0; 426 427 sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) { 428 score = compute_score(sk, net, hnum, daddr, dif, sdif); 429 if (score > hiscore) { 430 result = inet_lookup_reuseport(net, sk, skb, doff, 431 saddr, sport, daddr, hnum, inet_ehashfn); 432 if (result) 433 return result; 434 435 result = sk; 436 hiscore = score; 437 } 438 } 439 440 return result; 441 } 442 443 struct sock *inet_lookup_run_sk_lookup(const struct net *net, 444 int protocol, 445 struct sk_buff *skb, int doff, 446 __be32 saddr, __be16 sport, 447 __be32 daddr, u16 hnum, const int dif, 448 inet_ehashfn_t *ehashfn) 449 { 450 struct sock *sk, *reuse_sk; 451 bool no_reuseport; 452 453 no_reuseport = bpf_sk_lookup_run_v4(net, protocol, saddr, sport, 454 daddr, hnum, dif, &sk); 455 if (no_reuseport || IS_ERR_OR_NULL(sk)) 456 return sk; 457 458 reuse_sk = inet_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum, 459 ehashfn); 460 if (reuse_sk) 461 sk = reuse_sk; 462 return sk; 463 } 464 465 struct sock *__inet_lookup_listener(const struct net *net, 466 struct sk_buff *skb, int doff, 467 const __be32 saddr, __be16 sport, 468 const __be32 daddr, const unsigned short hnum, 469 const int dif, const int sdif) 470 { 471 struct inet_listen_hashbucket *ilb2; 472 struct inet_hashinfo *hashinfo; 473 struct sock *result = NULL; 474 unsigned int hash2; 475 476 /* Lookup redirect from BPF */ 477 if (static_branch_unlikely(&bpf_sk_lookup_enabled)) { 478 result = inet_lookup_run_sk_lookup(net, IPPROTO_TCP, skb, doff, 479 saddr, sport, daddr, hnum, dif, 480 inet_ehashfn); 481 if (result) 482 goto done; 483 } 484 485 hashinfo = net->ipv4.tcp_death_row.hashinfo; 486 hash2 = ipv4_portaddr_hash(net, daddr, hnum); 487 ilb2 = inet_lhash2_bucket(hashinfo, hash2); 488 489 result = inet_lhash2_lookup(net, ilb2, skb, doff, 490 saddr, sport, daddr, hnum, 491 dif, sdif); 492 if (result) 493 goto done; 494 495 /* Lookup lhash2 with INADDR_ANY */ 496 hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum); 497 ilb2 = inet_lhash2_bucket(hashinfo, hash2); 498 499 result = inet_lhash2_lookup(net, ilb2, skb, doff, 500 saddr, sport, htonl(INADDR_ANY), hnum, 501 dif, sdif); 502 done: 503 if (IS_ERR(result)) 504 return NULL; 505 return result; 506 } 507 EXPORT_SYMBOL_GPL(__inet_lookup_listener); 508 509 /* All sockets share common refcount, but have different destructors */ 510 void sock_gen_put(struct sock *sk) 511 { 512 if (!refcount_dec_and_test(&sk->sk_refcnt)) 513 return; 514 515 if (sk->sk_state == TCP_TIME_WAIT) 516 inet_twsk_free(inet_twsk(sk)); 517 else if (sk->sk_state == TCP_NEW_SYN_RECV) 518 reqsk_free(inet_reqsk(sk)); 519 else 520 sk_free(sk); 521 } 522 EXPORT_SYMBOL_GPL(sock_gen_put); 523 524 void sock_edemux(struct sk_buff *skb) 525 { 526 sock_gen_put(skb->sk); 527 } 528 EXPORT_SYMBOL(sock_edemux); 529 530 struct sock *__inet_lookup_established(const struct net *net, 531 const __be32 saddr, const __be16 sport, 532 const __be32 daddr, const u16 hnum, 533 const int dif, const int sdif) 534 { 535 const __portpair ports = INET_COMBINED_PORTS(sport, hnum); 536 INET_ADDR_COOKIE(acookie, saddr, daddr); 537 const struct hlist_nulls_node *node; 538 struct inet_ehash_bucket *head; 539 struct inet_hashinfo *hashinfo; 540 unsigned int hash, slot; 541 struct sock *sk; 542 543 hashinfo = net->ipv4.tcp_death_row.hashinfo; 544 hash = inet_ehashfn(net, daddr, hnum, saddr, sport); 545 slot = hash & hashinfo->ehash_mask; 546 head = &hashinfo->ehash[slot]; 547 548 begin: 549 sk_nulls_for_each_rcu(sk, node, &head->chain) { 550 if (sk->sk_hash != hash) 551 continue; 552 if (likely(inet_match(net, sk, acookie, ports, dif, sdif))) { 553 if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) 554 goto out; 555 if (unlikely(!inet_match(net, sk, acookie, 556 ports, dif, sdif))) { 557 sock_gen_put(sk); 558 goto begin; 559 } 560 goto found; 561 } 562 } 563 /* 564 * if the nulls value we got at the end of this lookup is 565 * not the expected one, we must restart lookup. 566 * We probably met an item that was moved to another chain. 567 */ 568 if (get_nulls_value(node) != slot) 569 goto begin; 570 out: 571 sk = NULL; 572 found: 573 return sk; 574 } 575 EXPORT_SYMBOL_GPL(__inet_lookup_established); 576 577 /* called with local bh disabled */ 578 static int __inet_check_established(struct inet_timewait_death_row *death_row, 579 struct sock *sk, __u16 lport, 580 struct inet_timewait_sock **twp, 581 bool rcu_lookup, 582 u32 hash) 583 { 584 struct inet_hashinfo *hinfo = death_row->hashinfo; 585 struct inet_sock *inet = inet_sk(sk); 586 __be32 daddr = inet->inet_rcv_saddr; 587 __be32 saddr = inet->inet_daddr; 588 int dif = sk->sk_bound_dev_if; 589 struct net *net = sock_net(sk); 590 int sdif = l3mdev_master_ifindex_by_index(net, dif); 591 INET_ADDR_COOKIE(acookie, saddr, daddr); 592 const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport); 593 struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); 594 struct inet_timewait_sock *tw = NULL; 595 const struct hlist_nulls_node *node; 596 struct sock *sk2; 597 spinlock_t *lock; 598 599 if (rcu_lookup) { 600 sk_nulls_for_each(sk2, node, &head->chain) { 601 if (sk2->sk_hash != hash || 602 !inet_match(net, sk2, acookie, ports, dif, sdif)) 603 continue; 604 if (sk2->sk_state == TCP_TIME_WAIT) 605 break; 606 return -EADDRNOTAVAIL; 607 } 608 return 0; 609 } 610 611 lock = inet_ehash_lockp(hinfo, hash); 612 spin_lock(lock); 613 614 sk_nulls_for_each(sk2, node, &head->chain) { 615 if (sk2->sk_hash != hash) 616 continue; 617 618 if (likely(inet_match(net, sk2, acookie, ports, dif, sdif))) { 619 if (sk2->sk_state == TCP_TIME_WAIT) { 620 tw = inet_twsk(sk2); 621 if (tcp_twsk_unique(sk, sk2, twp)) 622 break; 623 } 624 goto not_unique; 625 } 626 } 627 628 /* Must record num and sport now. Otherwise we will see 629 * in hash table socket with a funny identity. 630 */ 631 inet->inet_num = lport; 632 inet->inet_sport = htons(lport); 633 sk->sk_hash = hash; 634 WARN_ON(!sk_unhashed(sk)); 635 __sk_nulls_add_node_rcu(sk, &head->chain); 636 if (tw) { 637 sk_nulls_del_node_init_rcu((struct sock *)tw); 638 __NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED); 639 } 640 spin_unlock(lock); 641 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 642 643 if (twp) { 644 *twp = tw; 645 } else if (tw) { 646 /* Silly. Should hash-dance instead... */ 647 inet_twsk_deschedule_put(tw); 648 } 649 return 0; 650 651 not_unique: 652 spin_unlock(lock); 653 return -EADDRNOTAVAIL; 654 } 655 656 static u64 inet_sk_port_offset(const struct sock *sk) 657 { 658 const struct inet_sock *inet = inet_sk(sk); 659 660 return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr, 661 inet->inet_daddr, 662 inet->inet_dport); 663 } 664 665 /* Searches for an exsiting socket in the ehash bucket list. 666 * Returns true if found, false otherwise. 667 */ 668 static bool inet_ehash_lookup_by_sk(struct sock *sk, 669 struct hlist_nulls_head *list) 670 { 671 const __portpair ports = INET_COMBINED_PORTS(sk->sk_dport, sk->sk_num); 672 const int sdif = sk->sk_bound_dev_if; 673 const int dif = sk->sk_bound_dev_if; 674 const struct hlist_nulls_node *node; 675 struct net *net = sock_net(sk); 676 struct sock *esk; 677 678 INET_ADDR_COOKIE(acookie, sk->sk_daddr, sk->sk_rcv_saddr); 679 680 sk_nulls_for_each_rcu(esk, node, list) { 681 if (esk->sk_hash != sk->sk_hash) 682 continue; 683 if (sk->sk_family == AF_INET) { 684 if (unlikely(inet_match(net, esk, acookie, 685 ports, dif, sdif))) { 686 return true; 687 } 688 } 689 #if IS_ENABLED(CONFIG_IPV6) 690 else if (sk->sk_family == AF_INET6) { 691 if (unlikely(inet6_match(net, esk, 692 &sk->sk_v6_daddr, 693 &sk->sk_v6_rcv_saddr, 694 ports, dif, sdif))) { 695 return true; 696 } 697 } 698 #endif 699 } 700 return false; 701 } 702 703 /* Insert a socket into ehash, and eventually remove another one 704 * (The another one can be a SYN_RECV or TIMEWAIT) 705 * If an existing socket already exists, socket sk is not inserted, 706 * and sets found_dup_sk parameter to true. 707 */ 708 bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk) 709 { 710 struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); 711 struct inet_ehash_bucket *head; 712 struct hlist_nulls_head *list; 713 spinlock_t *lock; 714 bool ret = true; 715 716 WARN_ON_ONCE(!sk_unhashed(sk)); 717 718 sk->sk_hash = sk_ehashfn(sk); 719 head = inet_ehash_bucket(hashinfo, sk->sk_hash); 720 list = &head->chain; 721 lock = inet_ehash_lockp(hashinfo, sk->sk_hash); 722 723 spin_lock(lock); 724 if (osk) { 725 WARN_ON_ONCE(sk->sk_hash != osk->sk_hash); 726 ret = sk_nulls_replace_node_init_rcu(osk, sk); 727 goto unlock; 728 } 729 730 if (found_dup_sk) { 731 *found_dup_sk = inet_ehash_lookup_by_sk(sk, list); 732 if (*found_dup_sk) 733 ret = false; 734 } 735 736 if (ret) 737 __sk_nulls_add_node_rcu(sk, list); 738 739 unlock: 740 spin_unlock(lock); 741 742 return ret; 743 } 744 745 bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk) 746 { 747 bool ok = inet_ehash_insert(sk, osk, found_dup_sk); 748 749 if (ok) { 750 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 751 } else { 752 tcp_orphan_count_inc(); 753 inet_sk_set_state(sk, TCP_CLOSE); 754 sock_set_flag(sk, SOCK_DEAD); 755 inet_csk_destroy_sock(sk); 756 } 757 return ok; 758 } 759 760 static int inet_reuseport_add_sock(struct sock *sk, 761 struct inet_listen_hashbucket *ilb) 762 { 763 struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash; 764 const struct hlist_nulls_node *node; 765 kuid_t uid = sk_uid(sk); 766 struct sock *sk2; 767 768 sk_nulls_for_each_rcu(sk2, node, &ilb->nulls_head) { 769 if (sk2 != sk && 770 sk2->sk_family == sk->sk_family && 771 ipv6_only_sock(sk2) == ipv6_only_sock(sk) && 772 sk2->sk_bound_dev_if == sk->sk_bound_dev_if && 773 inet_csk(sk2)->icsk_bind_hash == tb && 774 sk2->sk_reuseport && uid_eq(uid, sk_uid(sk2)) && 775 inet_rcv_saddr_equal(sk, sk2, false)) 776 return reuseport_add_sock(sk, sk2, 777 inet_rcv_saddr_any(sk)); 778 } 779 780 return reuseport_alloc(sk, inet_rcv_saddr_any(sk)); 781 } 782 783 int inet_hash(struct sock *sk) 784 { 785 struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); 786 struct inet_listen_hashbucket *ilb2; 787 int err = 0; 788 789 if (sk->sk_state == TCP_CLOSE) 790 return 0; 791 792 if (sk->sk_state != TCP_LISTEN) { 793 local_bh_disable(); 794 inet_ehash_nolisten(sk, NULL, NULL); 795 local_bh_enable(); 796 return 0; 797 } 798 799 #if IS_ENABLED(CONFIG_IPV6) 800 if (sk->sk_family == AF_INET6) 801 inet6_init_ehash_secret(); 802 #endif 803 inet_init_ehash_secret(); 804 805 WARN_ON(!sk_unhashed(sk)); 806 ilb2 = inet_lhash2_bucket_sk(hashinfo, sk); 807 808 spin_lock(&ilb2->lock); 809 if (sk->sk_reuseport) { 810 err = inet_reuseport_add_sock(sk, ilb2); 811 if (err) 812 goto unlock; 813 } 814 sock_set_flag(sk, SOCK_RCU_FREE); 815 if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && 816 sk->sk_family == AF_INET6) 817 __sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head); 818 else 819 __sk_nulls_add_node_rcu(sk, &ilb2->nulls_head); 820 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 821 unlock: 822 spin_unlock(&ilb2->lock); 823 824 return err; 825 } 826 827 void inet_unhash(struct sock *sk) 828 { 829 struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); 830 831 if (sk_unhashed(sk)) 832 return; 833 834 sock_rps_delete_flow(sk); 835 if (sk->sk_state == TCP_LISTEN) { 836 struct inet_listen_hashbucket *ilb2; 837 838 ilb2 = inet_lhash2_bucket_sk(hashinfo, sk); 839 /* Don't disable bottom halves while acquiring the lock to 840 * avoid circular locking dependency on PREEMPT_RT. 841 */ 842 spin_lock(&ilb2->lock); 843 if (rcu_access_pointer(sk->sk_reuseport_cb)) 844 reuseport_stop_listen_sock(sk); 845 846 __sk_nulls_del_node_init_rcu(sk); 847 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 848 spin_unlock(&ilb2->lock); 849 } else { 850 spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash); 851 852 spin_lock_bh(lock); 853 __sk_nulls_del_node_init_rcu(sk); 854 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 855 spin_unlock_bh(lock); 856 } 857 } 858 859 static bool inet_bind2_bucket_match(const struct inet_bind2_bucket *tb, 860 const struct net *net, unsigned short port, 861 int l3mdev, const struct sock *sk) 862 { 863 if (!net_eq(ib2_net(tb), net) || tb->port != port || 864 tb->l3mdev != l3mdev) 865 return false; 866 867 return inet_bind2_bucket_addr_match(tb, sk); 868 } 869 870 bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb, const struct net *net, 871 unsigned short port, int l3mdev, const struct sock *sk) 872 { 873 if (!net_eq(ib2_net(tb), net) || tb->port != port || 874 tb->l3mdev != l3mdev) 875 return false; 876 877 #if IS_ENABLED(CONFIG_IPV6) 878 if (tb->addr_type == IPV6_ADDR_ANY) 879 return true; 880 881 if (tb->addr_type != IPV6_ADDR_MAPPED) 882 return false; 883 884 if (sk->sk_family == AF_INET6 && 885 !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) 886 return false; 887 #endif 888 return tb->rcv_saddr == 0; 889 } 890 891 /* The socket's bhash2 hashbucket spinlock must be held when this is called */ 892 struct inet_bind2_bucket * 893 inet_bind2_bucket_find(const struct inet_bind_hashbucket *head, const struct net *net, 894 unsigned short port, int l3mdev, const struct sock *sk) 895 { 896 struct inet_bind2_bucket *bhash2 = NULL; 897 898 inet_bind_bucket_for_each(bhash2, &head->chain) 899 if (inet_bind2_bucket_match(bhash2, net, port, l3mdev, sk)) 900 break; 901 902 return bhash2; 903 } 904 905 struct inet_bind_hashbucket * 906 inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, int port) 907 { 908 struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk); 909 u32 hash; 910 911 #if IS_ENABLED(CONFIG_IPV6) 912 if (sk->sk_family == AF_INET6) 913 hash = ipv6_portaddr_hash(net, &in6addr_any, port); 914 else 915 #endif 916 hash = ipv4_portaddr_hash(net, 0, port); 917 918 return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)]; 919 } 920 921 static void inet_update_saddr(struct sock *sk, void *saddr, int family) 922 { 923 if (family == AF_INET) { 924 inet_sk(sk)->inet_saddr = *(__be32 *)saddr; 925 sk_rcv_saddr_set(sk, inet_sk(sk)->inet_saddr); 926 } 927 #if IS_ENABLED(CONFIG_IPV6) 928 else { 929 sk->sk_v6_rcv_saddr = *(struct in6_addr *)saddr; 930 } 931 #endif 932 } 933 934 static int __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family, bool reset) 935 { 936 struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk); 937 struct inet_bind_hashbucket *head, *head2; 938 struct inet_bind2_bucket *tb2, *new_tb2; 939 int l3mdev = inet_sk_bound_l3mdev(sk); 940 int port = inet_sk(sk)->inet_num; 941 struct net *net = sock_net(sk); 942 int bhash; 943 944 if (!inet_csk(sk)->icsk_bind2_hash) { 945 /* Not bind()ed before. */ 946 if (reset) 947 inet_reset_saddr(sk); 948 else 949 inet_update_saddr(sk, saddr, family); 950 951 return 0; 952 } 953 954 /* Allocate a bind2 bucket ahead of time to avoid permanently putting 955 * the bhash2 table in an inconsistent state if a new tb2 bucket 956 * allocation fails. 957 */ 958 new_tb2 = kmem_cache_alloc(hinfo->bind2_bucket_cachep, GFP_ATOMIC); 959 if (!new_tb2) { 960 if (reset) { 961 /* The (INADDR_ANY, port) bucket might have already 962 * been freed, then we cannot fixup icsk_bind2_hash, 963 * so we give up and unlink sk from bhash/bhash2 not 964 * to leave inconsistency in bhash2. 965 */ 966 inet_put_port(sk); 967 inet_reset_saddr(sk); 968 } 969 970 return -ENOMEM; 971 } 972 973 bhash = inet_bhashfn(net, port, hinfo->bhash_size); 974 head = &hinfo->bhash[bhash]; 975 head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); 976 977 /* If we change saddr locklessly, another thread 978 * iterating over bhash might see corrupted address. 979 */ 980 spin_lock_bh(&head->lock); 981 982 spin_lock(&head2->lock); 983 __sk_del_bind_node(sk); 984 inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, inet_csk(sk)->icsk_bind2_hash); 985 spin_unlock(&head2->lock); 986 987 if (reset) 988 inet_reset_saddr(sk); 989 else 990 inet_update_saddr(sk, saddr, family); 991 992 head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); 993 994 spin_lock(&head2->lock); 995 tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk); 996 if (!tb2) { 997 tb2 = new_tb2; 998 inet_bind2_bucket_init(tb2, net, head2, inet_csk(sk)->icsk_bind_hash, sk); 999 if (sk_is_connect_bind(sk)) { 1000 tb2->fastreuse = -1; 1001 tb2->fastreuseport = -1; 1002 } 1003 } 1004 inet_csk(sk)->icsk_bind2_hash = tb2; 1005 sk_add_bind_node(sk, &tb2->owners); 1006 spin_unlock(&head2->lock); 1007 1008 spin_unlock_bh(&head->lock); 1009 1010 if (tb2 != new_tb2) 1011 kmem_cache_free(hinfo->bind2_bucket_cachep, new_tb2); 1012 1013 return 0; 1014 } 1015 1016 int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family) 1017 { 1018 return __inet_bhash2_update_saddr(sk, saddr, family, false); 1019 } 1020 1021 void inet_bhash2_reset_saddr(struct sock *sk) 1022 { 1023 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) 1024 __inet_bhash2_update_saddr(sk, NULL, 0, true); 1025 } 1026 1027 /* RFC 6056 3.3.4. Algorithm 4: Double-Hash Port Selection Algorithm 1028 * Note that we use 32bit integers (vs RFC 'short integers') 1029 * because 2^16 is not a multiple of num_ephemeral and this 1030 * property might be used by clever attacker. 1031 * 1032 * RFC claims using TABLE_LENGTH=10 buckets gives an improvement, though 1033 * attacks were since demonstrated, thus we use 65536 by default instead 1034 * to really give more isolation and privacy, at the expense of 256kB 1035 * of kernel memory. 1036 */ 1037 #define INET_TABLE_PERTURB_SIZE (1 << CONFIG_INET_TABLE_PERTURB_ORDER) 1038 static u32 *table_perturb; 1039 1040 int __inet_hash_connect(struct inet_timewait_death_row *death_row, 1041 struct sock *sk, u64 port_offset, 1042 u32 hash_port0, 1043 int (*check_established)(struct inet_timewait_death_row *, 1044 struct sock *, __u16, struct inet_timewait_sock **, 1045 bool rcu_lookup, u32 hash)) 1046 { 1047 struct inet_hashinfo *hinfo = death_row->hashinfo; 1048 struct inet_bind_hashbucket *head, *head2; 1049 struct inet_timewait_sock *tw = NULL; 1050 int port = inet_sk(sk)->inet_num; 1051 struct net *net = sock_net(sk); 1052 struct inet_bind2_bucket *tb2; 1053 struct inet_bind_bucket *tb; 1054 int step, scan_step, l3mdev; 1055 u32 index, max_rand_step; 1056 bool tb_created = false; 1057 u32 remaining, offset; 1058 int ret, i, low, high; 1059 bool local_ports; 1060 1061 if (port) { 1062 local_bh_disable(); 1063 ret = check_established(death_row, sk, port, NULL, false, 1064 hash_port0 + port); 1065 local_bh_enable(); 1066 return ret; 1067 } 1068 1069 l3mdev = inet_sk_bound_l3mdev(sk); 1070 1071 local_ports = inet_sk_get_local_port_range(sk, &low, &high); 1072 step = local_ports ? 1 : 2; 1073 scan_step = step; 1074 max_rand_step = READ_ONCE(net->ipv4.sysctl_ip_local_port_step_width); 1075 1076 high++; /* [32768, 60999] -> [32768, 61000[ */ 1077 remaining = high - low; 1078 if (!local_ports && remaining > 1) 1079 remaining &= ~1U; 1080 1081 get_random_sleepable_once(table_perturb, 1082 INET_TABLE_PERTURB_SIZE * sizeof(*table_perturb)); 1083 index = port_offset & (INET_TABLE_PERTURB_SIZE - 1); 1084 1085 offset = READ_ONCE(table_perturb[index]) + (port_offset >> 32); 1086 offset %= remaining; 1087 1088 /* In first pass we try ports of @low parity. 1089 * inet_csk_get_port() does the opposite choice. 1090 */ 1091 if (!local_ports) 1092 offset &= ~1U; 1093 1094 if (max_rand_step && remaining > 1) { 1095 u32 range = remaining / step; 1096 u32 upper_bound; 1097 1098 upper_bound = min(range, max_rand_step); 1099 scan_step = get_random_u32_inclusive(1, upper_bound); 1100 while (gcd(scan_step, range) != 1) { 1101 scan_step++; 1102 /* if both scan_step and range are even gcd won't be 1 */ 1103 if (!(scan_step & 1) && !(range & 1)) 1104 scan_step++; 1105 if (unlikely(scan_step > upper_bound)) { 1106 scan_step = 1; 1107 break; 1108 } 1109 } 1110 scan_step *= step; 1111 } 1112 other_parity_scan: 1113 port = low + offset; 1114 for (i = 0; i < remaining; i += step, port += scan_step) { 1115 if (unlikely(port >= high)) 1116 port -= remaining; 1117 if (inet_is_local_reserved_port(net, port)) 1118 continue; 1119 head = &hinfo->bhash[inet_bhashfn(net, port, 1120 hinfo->bhash_size)]; 1121 rcu_read_lock(); 1122 hlist_for_each_entry_rcu(tb, &head->chain, node) { 1123 if (!inet_bind_bucket_match(tb, net, port, l3mdev)) 1124 continue; 1125 if (tb->fastreuse >= 0 || tb->fastreuseport >= 0) { 1126 rcu_read_unlock(); 1127 goto next_port; 1128 } 1129 if (!check_established(death_row, sk, port, &tw, true, 1130 hash_port0 + port)) 1131 break; 1132 rcu_read_unlock(); 1133 goto next_port; 1134 } 1135 rcu_read_unlock(); 1136 1137 spin_lock_bh(&head->lock); 1138 1139 /* Does not bother with rcv_saddr checks, because 1140 * the established check is already unique enough. 1141 */ 1142 inet_bind_bucket_for_each(tb, &head->chain) { 1143 if (inet_bind_bucket_match(tb, net, port, l3mdev)) { 1144 if (tb->fastreuse >= 0 || 1145 tb->fastreuseport >= 0) 1146 goto next_port_unlock; 1147 WARN_ON(hlist_empty(&tb->bhash2)); 1148 if (!check_established(death_row, sk, 1149 port, &tw, false, 1150 hash_port0 + port)) 1151 goto ok; 1152 goto next_port_unlock; 1153 } 1154 } 1155 1156 tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, 1157 net, head, port, l3mdev); 1158 if (!tb) { 1159 spin_unlock_bh(&head->lock); 1160 return -ENOMEM; 1161 } 1162 tb_created = true; 1163 tb->fastreuse = -1; 1164 tb->fastreuseport = -1; 1165 goto ok; 1166 next_port_unlock: 1167 spin_unlock_bh(&head->lock); 1168 next_port: 1169 cond_resched(); 1170 } 1171 1172 if (!local_ports) { 1173 offset++; 1174 if ((offset & 1) && remaining > 1) 1175 goto other_parity_scan; 1176 } 1177 return -EADDRNOTAVAIL; 1178 1179 ok: 1180 /* Find the corresponding tb2 bucket since we need to 1181 * add the socket to the bhash2 table as well 1182 */ 1183 head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); 1184 spin_lock(&head2->lock); 1185 1186 tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk); 1187 if (!tb2) { 1188 tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep, net, 1189 head2, tb, sk); 1190 if (!tb2) 1191 goto error; 1192 tb2->fastreuse = -1; 1193 tb2->fastreuseport = -1; 1194 } 1195 1196 /* Here we want to add a little bit of randomness to the next source 1197 * port that will be chosen. We use a max() with a random here so that 1198 * on low contention the randomness is maximal and on high contention 1199 * it may be inexistent. 1200 */ 1201 i = max_t(int, i, get_random_u32_below(8) * step); 1202 WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + step); 1203 1204 /* Head lock still held and bh's disabled */ 1205 inet_bind_hash(sk, tb, tb2, port); 1206 sk->sk_userlocks |= SOCK_CONNECT_BIND; 1207 1208 if (sk_unhashed(sk)) { 1209 inet_sk(sk)->inet_sport = htons(port); 1210 inet_ehash_nolisten(sk, (struct sock *)tw, NULL); 1211 } 1212 if (tw) 1213 inet_twsk_bind_unhash(tw, hinfo); 1214 1215 spin_unlock(&head2->lock); 1216 spin_unlock(&head->lock); 1217 1218 if (tw) 1219 inet_twsk_deschedule_put(tw); 1220 local_bh_enable(); 1221 return 0; 1222 1223 error: 1224 if (sk_hashed(sk)) { 1225 spinlock_t *lock = inet_ehash_lockp(hinfo, sk->sk_hash); 1226 1227 sock_prot_inuse_add(net, sk->sk_prot, -1); 1228 1229 spin_lock(lock); 1230 __sk_nulls_del_node_init_rcu(sk); 1231 spin_unlock(lock); 1232 1233 sk->sk_hash = 0; 1234 inet_sk(sk)->inet_sport = 0; 1235 WRITE_ONCE(inet_sk(sk)->inet_num, 0); 1236 1237 if (tw) 1238 inet_twsk_bind_unhash(tw, hinfo); 1239 } 1240 1241 spin_unlock(&head2->lock); 1242 if (tb_created) 1243 inet_bind_bucket_destroy(tb); 1244 spin_unlock(&head->lock); 1245 1246 if (tw) 1247 inet_twsk_deschedule_put(tw); 1248 1249 local_bh_enable(); 1250 1251 return -ENOMEM; 1252 } 1253 1254 /* 1255 * Bind a port for a connect operation and hash it. 1256 */ 1257 int inet_hash_connect(struct inet_timewait_death_row *death_row, 1258 struct sock *sk) 1259 { 1260 const struct inet_sock *inet = inet_sk(sk); 1261 const struct net *net = sock_net(sk); 1262 u64 port_offset = 0; 1263 u32 hash_port0; 1264 1265 if (!inet_sk(sk)->inet_num) 1266 port_offset = inet_sk_port_offset(sk); 1267 1268 inet_init_ehash_secret(); 1269 1270 hash_port0 = inet_ehashfn(net, inet->inet_rcv_saddr, 0, 1271 inet->inet_daddr, inet->inet_dport); 1272 1273 return __inet_hash_connect(death_row, sk, port_offset, hash_port0, 1274 __inet_check_established); 1275 } 1276 1277 void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, 1278 unsigned long numentries, int scale, 1279 unsigned long low_limit, 1280 unsigned long high_limit) 1281 { 1282 unsigned int i; 1283 1284 h->lhash2 = alloc_large_system_hash(name, 1285 sizeof(*h->lhash2), 1286 numentries, 1287 scale, 1288 0, 1289 NULL, 1290 &h->lhash2_mask, 1291 low_limit, 1292 high_limit); 1293 1294 for (i = 0; i <= h->lhash2_mask; i++) { 1295 spin_lock_init(&h->lhash2[i].lock); 1296 INIT_HLIST_NULLS_HEAD(&h->lhash2[i].nulls_head, 1297 i + LISTENING_NULLS_BASE); 1298 } 1299 1300 /* this one is used for source ports of outgoing connections */ 1301 table_perturb = alloc_large_system_hash("Table-perturb", 1302 sizeof(*table_perturb), 1303 INET_TABLE_PERTURB_SIZE, 1304 0, 0, NULL, NULL, 1305 INET_TABLE_PERTURB_SIZE, 1306 INET_TABLE_PERTURB_SIZE); 1307 } 1308 1309 int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) 1310 { 1311 unsigned int locksz = sizeof(spinlock_t); 1312 unsigned int i, nblocks = 1; 1313 spinlock_t *ptr = NULL; 1314 1315 if (locksz == 0) 1316 goto set_mask; 1317 1318 /* Allocate 2 cache lines or at least one spinlock per cpu. */ 1319 nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U) * num_possible_cpus(); 1320 1321 /* At least one page per NUMA node. */ 1322 nblocks = max(nblocks, num_online_nodes() * PAGE_SIZE / locksz); 1323 1324 nblocks = roundup_pow_of_two(nblocks); 1325 1326 /* No more locks than number of hash buckets. */ 1327 nblocks = min(nblocks, hashinfo->ehash_mask + 1); 1328 1329 if (num_online_nodes() > 1) { 1330 /* Use vmalloc() to allow NUMA policy to spread pages 1331 * on all available nodes if desired. 1332 */ 1333 ptr = vmalloc_array(nblocks, locksz); 1334 } 1335 if (!ptr) { 1336 ptr = kvmalloc_array(nblocks, locksz, GFP_KERNEL); 1337 if (!ptr) 1338 return -ENOMEM; 1339 } 1340 for (i = 0; i < nblocks; i++) 1341 spin_lock_init(&ptr[i]); 1342 hashinfo->ehash_locks = ptr; 1343 set_mask: 1344 hashinfo->ehash_locks_mask = nblocks - 1; 1345 return 0; 1346 } 1347 1348 struct inet_hashinfo *inet_pernet_hashinfo_alloc(struct inet_hashinfo *hashinfo, 1349 unsigned int ehash_entries) 1350 { 1351 struct inet_hashinfo *new_hashinfo; 1352 int i; 1353 1354 new_hashinfo = kmemdup(hashinfo, sizeof(*hashinfo), GFP_KERNEL); 1355 if (!new_hashinfo) 1356 goto err; 1357 1358 new_hashinfo->ehash = vmalloc_huge(ehash_entries * sizeof(struct inet_ehash_bucket), 1359 GFP_KERNEL_ACCOUNT); 1360 if (!new_hashinfo->ehash) 1361 goto free_hashinfo; 1362 1363 new_hashinfo->ehash_mask = ehash_entries - 1; 1364 1365 if (inet_ehash_locks_alloc(new_hashinfo)) 1366 goto free_ehash; 1367 1368 for (i = 0; i < ehash_entries; i++) 1369 INIT_HLIST_NULLS_HEAD(&new_hashinfo->ehash[i].chain, i); 1370 1371 new_hashinfo->pernet = true; 1372 1373 return new_hashinfo; 1374 1375 free_ehash: 1376 vfree(new_hashinfo->ehash); 1377 free_hashinfo: 1378 kfree(new_hashinfo); 1379 err: 1380 return NULL; 1381 } 1382 1383 void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo) 1384 { 1385 if (!hashinfo->pernet) 1386 return; 1387 1388 inet_ehash_locks_free(hashinfo); 1389 vfree(hashinfo->ehash); 1390 kfree(hashinfo); 1391 } 1392