1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * INET An implementation of the TCP/IP protocol suite for the LINUX 4 * operating system. INET is implemented using the BSD Socket 5 * interface as the means of communication with the user level. 6 * 7 * Generic INET transport hashtables 8 * 9 * Authors: Lotsa people, from code originally in tcp 10 */ 11 12 #include <linux/module.h> 13 #include <linux/random.h> 14 #include <linux/sched.h> 15 #include <linux/slab.h> 16 #include <linux/wait.h> 17 #include <linux/vmalloc.h> 18 #include <linux/memblock.h> 19 20 #include <net/addrconf.h> 21 #include <net/inet_connection_sock.h> 22 #include <net/inet_hashtables.h> 23 #include <net/secure_seq.h> 24 #include <net/ip.h> 25 #include <net/tcp.h> 26 #include <net/sock_reuseport.h> 27 28 static u32 inet_ehashfn(const struct net *net, const __be32 laddr, 29 const __u16 lport, const __be32 faddr, 30 const __be16 fport) 31 { 32 static u32 inet_ehash_secret __read_mostly; 33 34 net_get_random_once(&inet_ehash_secret, sizeof(inet_ehash_secret)); 35 36 return __inet_ehashfn(laddr, lport, faddr, fport, 37 inet_ehash_secret + net_hash_mix(net)); 38 } 39 40 /* This function handles inet_sock, but also timewait and request sockets 41 * for IPv4/IPv6. 42 */ 43 static u32 sk_ehashfn(const struct sock *sk) 44 { 45 #if IS_ENABLED(CONFIG_IPV6) 46 if (sk->sk_family == AF_INET6 && 47 !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) 48 return inet6_ehashfn(sock_net(sk), 49 &sk->sk_v6_rcv_saddr, sk->sk_num, 50 &sk->sk_v6_daddr, sk->sk_dport); 51 #endif 52 return inet_ehashfn(sock_net(sk), 53 sk->sk_rcv_saddr, sk->sk_num, 54 sk->sk_daddr, sk->sk_dport); 55 } 56 57 /* 58 * Allocate and initialize a new local port bind bucket. 59 * The bindhash mutex for snum's hash chain must be held here. 60 */ 61 struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, 62 struct net *net, 63 struct inet_bind_hashbucket *head, 64 const unsigned short snum, 65 int l3mdev) 66 { 67 struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC); 68 69 if (tb) { 70 write_pnet(&tb->ib_net, net); 71 tb->l3mdev = l3mdev; 72 tb->port = snum; 73 tb->fastreuse = 0; 74 tb->fastreuseport = 0; 75 INIT_HLIST_HEAD(&tb->owners); 76 hlist_add_head(&tb->node, &head->chain); 77 } 78 return tb; 79 } 80 81 /* 82 * Caller must hold hashbucket lock for this tb with local BH disabled 83 */ 84 void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb) 85 { 86 if (hlist_empty(&tb->owners)) { 87 __hlist_del(&tb->node); 88 kmem_cache_free(cachep, tb); 89 } 90 } 91 92 void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, 93 const unsigned short snum) 94 { 95 inet_sk(sk)->inet_num = snum; 96 sk_add_bind_node(sk, &tb->owners); 97 inet_csk(sk)->icsk_bind_hash = tb; 98 } 99 100 /* 101 * Get rid of any references to a local port held by the given sock. 102 */ 103 static void __inet_put_port(struct sock *sk) 104 { 105 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; 106 const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->inet_num, 107 hashinfo->bhash_size); 108 struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash]; 109 struct inet_bind_bucket *tb; 110 111 spin_lock(&head->lock); 112 tb = inet_csk(sk)->icsk_bind_hash; 113 __sk_del_bind_node(sk); 114 inet_csk(sk)->icsk_bind_hash = NULL; 115 inet_sk(sk)->inet_num = 0; 116 inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); 117 spin_unlock(&head->lock); 118 } 119 120 void inet_put_port(struct sock *sk) 121 { 122 local_bh_disable(); 123 __inet_put_port(sk); 124 local_bh_enable(); 125 } 126 EXPORT_SYMBOL(inet_put_port); 127 128 int __inet_inherit_port(const struct sock *sk, struct sock *child) 129 { 130 struct inet_hashinfo *table = sk->sk_prot->h.hashinfo; 131 unsigned short port = inet_sk(child)->inet_num; 132 const int bhash = inet_bhashfn(sock_net(sk), port, 133 table->bhash_size); 134 struct inet_bind_hashbucket *head = &table->bhash[bhash]; 135 struct inet_bind_bucket *tb; 136 int l3mdev; 137 138 spin_lock(&head->lock); 139 tb = inet_csk(sk)->icsk_bind_hash; 140 if (unlikely(!tb)) { 141 spin_unlock(&head->lock); 142 return -ENOENT; 143 } 144 if (tb->port != port) { 145 l3mdev = inet_sk_bound_l3mdev(sk); 146 147 /* NOTE: using tproxy and redirecting skbs to a proxy 148 * on a different listener port breaks the assumption 149 * that the listener socket's icsk_bind_hash is the same 150 * as that of the child socket. We have to look up or 151 * create a new bind bucket for the child here. */ 152 inet_bind_bucket_for_each(tb, &head->chain) { 153 if (net_eq(ib_net(tb), sock_net(sk)) && 154 tb->l3mdev == l3mdev && tb->port == port) 155 break; 156 } 157 if (!tb) { 158 tb = inet_bind_bucket_create(table->bind_bucket_cachep, 159 sock_net(sk), head, port, 160 l3mdev); 161 if (!tb) { 162 spin_unlock(&head->lock); 163 return -ENOMEM; 164 } 165 } 166 } 167 inet_bind_hash(child, tb, port); 168 spin_unlock(&head->lock); 169 170 return 0; 171 } 172 EXPORT_SYMBOL_GPL(__inet_inherit_port); 173 174 static struct inet_listen_hashbucket * 175 inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk) 176 { 177 u32 hash; 178 179 #if IS_ENABLED(CONFIG_IPV6) 180 if (sk->sk_family == AF_INET6) 181 hash = ipv6_portaddr_hash(sock_net(sk), 182 &sk->sk_v6_rcv_saddr, 183 inet_sk(sk)->inet_num); 184 else 185 #endif 186 hash = ipv4_portaddr_hash(sock_net(sk), 187 inet_sk(sk)->inet_rcv_saddr, 188 inet_sk(sk)->inet_num); 189 return inet_lhash2_bucket(h, hash); 190 } 191 192 static void inet_hash2(struct inet_hashinfo *h, struct sock *sk) 193 { 194 struct inet_listen_hashbucket *ilb2; 195 196 if (!h->lhash2) 197 return; 198 199 ilb2 = inet_lhash2_bucket_sk(h, sk); 200 201 spin_lock(&ilb2->lock); 202 if (sk->sk_reuseport && sk->sk_family == AF_INET6) 203 hlist_add_tail_rcu(&inet_csk(sk)->icsk_listen_portaddr_node, 204 &ilb2->head); 205 else 206 hlist_add_head_rcu(&inet_csk(sk)->icsk_listen_portaddr_node, 207 &ilb2->head); 208 ilb2->count++; 209 spin_unlock(&ilb2->lock); 210 } 211 212 static void inet_unhash2(struct inet_hashinfo *h, struct sock *sk) 213 { 214 struct inet_listen_hashbucket *ilb2; 215 216 if (!h->lhash2 || 217 WARN_ON_ONCE(hlist_unhashed(&inet_csk(sk)->icsk_listen_portaddr_node))) 218 return; 219 220 ilb2 = inet_lhash2_bucket_sk(h, sk); 221 222 spin_lock(&ilb2->lock); 223 hlist_del_init_rcu(&inet_csk(sk)->icsk_listen_portaddr_node); 224 ilb2->count--; 225 spin_unlock(&ilb2->lock); 226 } 227 228 static inline int compute_score(struct sock *sk, struct net *net, 229 const unsigned short hnum, const __be32 daddr, 230 const int dif, const int sdif, bool exact_dif) 231 { 232 int score = -1; 233 234 if (net_eq(sock_net(sk), net) && sk->sk_num == hnum && 235 !ipv6_only_sock(sk)) { 236 if (sk->sk_rcv_saddr != daddr) 237 return -1; 238 239 if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) 240 return -1; 241 242 score = sk->sk_family == PF_INET ? 2 : 1; 243 if (sk->sk_incoming_cpu == raw_smp_processor_id()) 244 score++; 245 } 246 return score; 247 } 248 249 /* 250 * Here are some nice properties to exploit here. The BSD API 251 * does not allow a listening sock to specify the remote port nor the 252 * remote address for the connection. So always assume those are both 253 * wildcarded during the search since they can never be otherwise. 254 */ 255 256 /* called with rcu_read_lock() : No refcount taken on the socket */ 257 static struct sock *inet_lhash2_lookup(struct net *net, 258 struct inet_listen_hashbucket *ilb2, 259 struct sk_buff *skb, int doff, 260 const __be32 saddr, __be16 sport, 261 const __be32 daddr, const unsigned short hnum, 262 const int dif, const int sdif) 263 { 264 bool exact_dif = inet_exact_dif_match(net, skb); 265 struct inet_connection_sock *icsk; 266 struct sock *sk, *result = NULL; 267 int score, hiscore = 0; 268 u32 phash = 0; 269 270 inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) { 271 sk = (struct sock *)icsk; 272 score = compute_score(sk, net, hnum, daddr, 273 dif, sdif, exact_dif); 274 if (score > hiscore) { 275 if (sk->sk_reuseport) { 276 phash = inet_ehashfn(net, daddr, hnum, 277 saddr, sport); 278 result = reuseport_select_sock(sk, phash, 279 skb, doff); 280 if (result) 281 return result; 282 } 283 result = sk; 284 hiscore = score; 285 } 286 } 287 288 return result; 289 } 290 291 struct sock *__inet_lookup_listener(struct net *net, 292 struct inet_hashinfo *hashinfo, 293 struct sk_buff *skb, int doff, 294 const __be32 saddr, __be16 sport, 295 const __be32 daddr, const unsigned short hnum, 296 const int dif, const int sdif) 297 { 298 struct inet_listen_hashbucket *ilb2; 299 struct sock *result = NULL; 300 unsigned int hash2; 301 302 hash2 = ipv4_portaddr_hash(net, daddr, hnum); 303 ilb2 = inet_lhash2_bucket(hashinfo, hash2); 304 305 result = inet_lhash2_lookup(net, ilb2, skb, doff, 306 saddr, sport, daddr, hnum, 307 dif, sdif); 308 if (result) 309 goto done; 310 311 /* Lookup lhash2 with INADDR_ANY */ 312 hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum); 313 ilb2 = inet_lhash2_bucket(hashinfo, hash2); 314 315 result = inet_lhash2_lookup(net, ilb2, skb, doff, 316 saddr, sport, htonl(INADDR_ANY), hnum, 317 dif, sdif); 318 done: 319 if (IS_ERR(result)) 320 return NULL; 321 return result; 322 } 323 EXPORT_SYMBOL_GPL(__inet_lookup_listener); 324 325 /* All sockets share common refcount, but have different destructors */ 326 void sock_gen_put(struct sock *sk) 327 { 328 if (!refcount_dec_and_test(&sk->sk_refcnt)) 329 return; 330 331 if (sk->sk_state == TCP_TIME_WAIT) 332 inet_twsk_free(inet_twsk(sk)); 333 else if (sk->sk_state == TCP_NEW_SYN_RECV) 334 reqsk_free(inet_reqsk(sk)); 335 else 336 sk_free(sk); 337 } 338 EXPORT_SYMBOL_GPL(sock_gen_put); 339 340 void sock_edemux(struct sk_buff *skb) 341 { 342 sock_gen_put(skb->sk); 343 } 344 EXPORT_SYMBOL(sock_edemux); 345 346 struct sock *__inet_lookup_established(struct net *net, 347 struct inet_hashinfo *hashinfo, 348 const __be32 saddr, const __be16 sport, 349 const __be32 daddr, const u16 hnum, 350 const int dif, const int sdif) 351 { 352 INET_ADDR_COOKIE(acookie, saddr, daddr); 353 const __portpair ports = INET_COMBINED_PORTS(sport, hnum); 354 struct sock *sk; 355 const struct hlist_nulls_node *node; 356 /* Optimize here for direct hit, only listening connections can 357 * have wildcards anyways. 358 */ 359 unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport); 360 unsigned int slot = hash & hashinfo->ehash_mask; 361 struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; 362 363 begin: 364 sk_nulls_for_each_rcu(sk, node, &head->chain) { 365 if (sk->sk_hash != hash) 366 continue; 367 if (likely(INET_MATCH(sk, net, acookie, 368 saddr, daddr, ports, dif, sdif))) { 369 if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) 370 goto out; 371 if (unlikely(!INET_MATCH(sk, net, acookie, 372 saddr, daddr, ports, 373 dif, sdif))) { 374 sock_gen_put(sk); 375 goto begin; 376 } 377 goto found; 378 } 379 } 380 /* 381 * if the nulls value we got at the end of this lookup is 382 * not the expected one, we must restart lookup. 383 * We probably met an item that was moved to another chain. 384 */ 385 if (get_nulls_value(node) != slot) 386 goto begin; 387 out: 388 sk = NULL; 389 found: 390 return sk; 391 } 392 EXPORT_SYMBOL_GPL(__inet_lookup_established); 393 394 /* called with local bh disabled */ 395 static int __inet_check_established(struct inet_timewait_death_row *death_row, 396 struct sock *sk, __u16 lport, 397 struct inet_timewait_sock **twp) 398 { 399 struct inet_hashinfo *hinfo = death_row->hashinfo; 400 struct inet_sock *inet = inet_sk(sk); 401 __be32 daddr = inet->inet_rcv_saddr; 402 __be32 saddr = inet->inet_daddr; 403 int dif = sk->sk_bound_dev_if; 404 struct net *net = sock_net(sk); 405 int sdif = l3mdev_master_ifindex_by_index(net, dif); 406 INET_ADDR_COOKIE(acookie, saddr, daddr); 407 const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport); 408 unsigned int hash = inet_ehashfn(net, daddr, lport, 409 saddr, inet->inet_dport); 410 struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); 411 spinlock_t *lock = inet_ehash_lockp(hinfo, hash); 412 struct sock *sk2; 413 const struct hlist_nulls_node *node; 414 struct inet_timewait_sock *tw = NULL; 415 416 spin_lock(lock); 417 418 sk_nulls_for_each(sk2, node, &head->chain) { 419 if (sk2->sk_hash != hash) 420 continue; 421 422 if (likely(INET_MATCH(sk2, net, acookie, 423 saddr, daddr, ports, dif, sdif))) { 424 if (sk2->sk_state == TCP_TIME_WAIT) { 425 tw = inet_twsk(sk2); 426 if (twsk_unique(sk, sk2, twp)) 427 break; 428 } 429 goto not_unique; 430 } 431 } 432 433 /* Must record num and sport now. Otherwise we will see 434 * in hash table socket with a funny identity. 435 */ 436 inet->inet_num = lport; 437 inet->inet_sport = htons(lport); 438 sk->sk_hash = hash; 439 WARN_ON(!sk_unhashed(sk)); 440 __sk_nulls_add_node_rcu(sk, &head->chain); 441 if (tw) { 442 sk_nulls_del_node_init_rcu((struct sock *)tw); 443 __NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED); 444 } 445 spin_unlock(lock); 446 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 447 448 if (twp) { 449 *twp = tw; 450 } else if (tw) { 451 /* Silly. Should hash-dance instead... */ 452 inet_twsk_deschedule_put(tw); 453 } 454 return 0; 455 456 not_unique: 457 spin_unlock(lock); 458 return -EADDRNOTAVAIL; 459 } 460 461 static u32 inet_sk_port_offset(const struct sock *sk) 462 { 463 const struct inet_sock *inet = inet_sk(sk); 464 465 return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr, 466 inet->inet_daddr, 467 inet->inet_dport); 468 } 469 470 /* insert a socket into ehash, and eventually remove another one 471 * (The another one can be a SYN_RECV or TIMEWAIT 472 */ 473 bool inet_ehash_insert(struct sock *sk, struct sock *osk) 474 { 475 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; 476 struct hlist_nulls_head *list; 477 struct inet_ehash_bucket *head; 478 spinlock_t *lock; 479 bool ret = true; 480 481 WARN_ON_ONCE(!sk_unhashed(sk)); 482 483 sk->sk_hash = sk_ehashfn(sk); 484 head = inet_ehash_bucket(hashinfo, sk->sk_hash); 485 list = &head->chain; 486 lock = inet_ehash_lockp(hashinfo, sk->sk_hash); 487 488 spin_lock(lock); 489 if (osk) { 490 WARN_ON_ONCE(sk->sk_hash != osk->sk_hash); 491 ret = sk_nulls_del_node_init_rcu(osk); 492 } 493 if (ret) 494 __sk_nulls_add_node_rcu(sk, list); 495 spin_unlock(lock); 496 return ret; 497 } 498 499 bool inet_ehash_nolisten(struct sock *sk, struct sock *osk) 500 { 501 bool ok = inet_ehash_insert(sk, osk); 502 503 if (ok) { 504 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 505 } else { 506 percpu_counter_inc(sk->sk_prot->orphan_count); 507 inet_sk_set_state(sk, TCP_CLOSE); 508 sock_set_flag(sk, SOCK_DEAD); 509 inet_csk_destroy_sock(sk); 510 } 511 return ok; 512 } 513 EXPORT_SYMBOL_GPL(inet_ehash_nolisten); 514 515 static int inet_reuseport_add_sock(struct sock *sk, 516 struct inet_listen_hashbucket *ilb) 517 { 518 struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash; 519 struct sock *sk2; 520 kuid_t uid = sock_i_uid(sk); 521 522 sk_for_each_rcu(sk2, &ilb->head) { 523 if (sk2 != sk && 524 sk2->sk_family == sk->sk_family && 525 ipv6_only_sock(sk2) == ipv6_only_sock(sk) && 526 sk2->sk_bound_dev_if == sk->sk_bound_dev_if && 527 inet_csk(sk2)->icsk_bind_hash == tb && 528 sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && 529 inet_rcv_saddr_equal(sk, sk2, false)) 530 return reuseport_add_sock(sk, sk2, 531 inet_rcv_saddr_any(sk)); 532 } 533 534 return reuseport_alloc(sk, inet_rcv_saddr_any(sk)); 535 } 536 537 int __inet_hash(struct sock *sk, struct sock *osk) 538 { 539 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; 540 struct inet_listen_hashbucket *ilb; 541 int err = 0; 542 543 if (sk->sk_state != TCP_LISTEN) { 544 inet_ehash_nolisten(sk, osk); 545 return 0; 546 } 547 WARN_ON(!sk_unhashed(sk)); 548 ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; 549 550 spin_lock(&ilb->lock); 551 if (sk->sk_reuseport) { 552 err = inet_reuseport_add_sock(sk, ilb); 553 if (err) 554 goto unlock; 555 } 556 if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && 557 sk->sk_family == AF_INET6) 558 hlist_add_tail_rcu(&sk->sk_node, &ilb->head); 559 else 560 hlist_add_head_rcu(&sk->sk_node, &ilb->head); 561 inet_hash2(hashinfo, sk); 562 ilb->count++; 563 sock_set_flag(sk, SOCK_RCU_FREE); 564 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 565 unlock: 566 spin_unlock(&ilb->lock); 567 568 return err; 569 } 570 EXPORT_SYMBOL(__inet_hash); 571 572 int inet_hash(struct sock *sk) 573 { 574 int err = 0; 575 576 if (sk->sk_state != TCP_CLOSE) { 577 local_bh_disable(); 578 err = __inet_hash(sk, NULL); 579 local_bh_enable(); 580 } 581 582 return err; 583 } 584 EXPORT_SYMBOL_GPL(inet_hash); 585 586 void inet_unhash(struct sock *sk) 587 { 588 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; 589 struct inet_listen_hashbucket *ilb = NULL; 590 spinlock_t *lock; 591 592 if (sk_unhashed(sk)) 593 return; 594 595 if (sk->sk_state == TCP_LISTEN) { 596 ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; 597 lock = &ilb->lock; 598 } else { 599 lock = inet_ehash_lockp(hashinfo, sk->sk_hash); 600 } 601 spin_lock_bh(lock); 602 if (sk_unhashed(sk)) 603 goto unlock; 604 605 if (rcu_access_pointer(sk->sk_reuseport_cb)) 606 reuseport_detach_sock(sk); 607 if (ilb) { 608 inet_unhash2(hashinfo, sk); 609 __sk_del_node_init(sk); 610 ilb->count--; 611 } else { 612 __sk_nulls_del_node_init_rcu(sk); 613 } 614 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 615 unlock: 616 spin_unlock_bh(lock); 617 } 618 EXPORT_SYMBOL_GPL(inet_unhash); 619 620 int __inet_hash_connect(struct inet_timewait_death_row *death_row, 621 struct sock *sk, u32 port_offset, 622 int (*check_established)(struct inet_timewait_death_row *, 623 struct sock *, __u16, struct inet_timewait_sock **)) 624 { 625 struct inet_hashinfo *hinfo = death_row->hashinfo; 626 struct inet_timewait_sock *tw = NULL; 627 struct inet_bind_hashbucket *head; 628 int port = inet_sk(sk)->inet_num; 629 struct net *net = sock_net(sk); 630 struct inet_bind_bucket *tb; 631 u32 remaining, offset; 632 int ret, i, low, high; 633 static u32 hint; 634 int l3mdev; 635 636 if (port) { 637 head = &hinfo->bhash[inet_bhashfn(net, port, 638 hinfo->bhash_size)]; 639 tb = inet_csk(sk)->icsk_bind_hash; 640 spin_lock_bh(&head->lock); 641 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { 642 inet_ehash_nolisten(sk, NULL); 643 spin_unlock_bh(&head->lock); 644 return 0; 645 } 646 spin_unlock(&head->lock); 647 /* No definite answer... Walk to established hash table */ 648 ret = check_established(death_row, sk, port, NULL); 649 local_bh_enable(); 650 return ret; 651 } 652 653 l3mdev = inet_sk_bound_l3mdev(sk); 654 655 inet_get_local_port_range(net, &low, &high); 656 high++; /* [32768, 60999] -> [32768, 61000[ */ 657 remaining = high - low; 658 if (likely(remaining > 1)) 659 remaining &= ~1U; 660 661 offset = (hint + port_offset) % remaining; 662 /* In first pass we try ports of @low parity. 663 * inet_csk_get_port() does the opposite choice. 664 */ 665 offset &= ~1U; 666 other_parity_scan: 667 port = low + offset; 668 for (i = 0; i < remaining; i += 2, port += 2) { 669 if (unlikely(port >= high)) 670 port -= remaining; 671 if (inet_is_local_reserved_port(net, port)) 672 continue; 673 head = &hinfo->bhash[inet_bhashfn(net, port, 674 hinfo->bhash_size)]; 675 spin_lock_bh(&head->lock); 676 677 /* Does not bother with rcv_saddr checks, because 678 * the established check is already unique enough. 679 */ 680 inet_bind_bucket_for_each(tb, &head->chain) { 681 if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev && 682 tb->port == port) { 683 if (tb->fastreuse >= 0 || 684 tb->fastreuseport >= 0) 685 goto next_port; 686 WARN_ON(hlist_empty(&tb->owners)); 687 if (!check_established(death_row, sk, 688 port, &tw)) 689 goto ok; 690 goto next_port; 691 } 692 } 693 694 tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, 695 net, head, port, l3mdev); 696 if (!tb) { 697 spin_unlock_bh(&head->lock); 698 return -ENOMEM; 699 } 700 tb->fastreuse = -1; 701 tb->fastreuseport = -1; 702 goto ok; 703 next_port: 704 spin_unlock_bh(&head->lock); 705 cond_resched(); 706 } 707 708 offset++; 709 if ((offset & 1) && remaining > 1) 710 goto other_parity_scan; 711 712 return -EADDRNOTAVAIL; 713 714 ok: 715 hint += i + 2; 716 717 /* Head lock still held and bh's disabled */ 718 inet_bind_hash(sk, tb, port); 719 if (sk_unhashed(sk)) { 720 inet_sk(sk)->inet_sport = htons(port); 721 inet_ehash_nolisten(sk, (struct sock *)tw); 722 } 723 if (tw) 724 inet_twsk_bind_unhash(tw, hinfo); 725 spin_unlock(&head->lock); 726 if (tw) 727 inet_twsk_deschedule_put(tw); 728 local_bh_enable(); 729 return 0; 730 } 731 732 /* 733 * Bind a port for a connect operation and hash it. 734 */ 735 int inet_hash_connect(struct inet_timewait_death_row *death_row, 736 struct sock *sk) 737 { 738 u32 port_offset = 0; 739 740 if (!inet_sk(sk)->inet_num) 741 port_offset = inet_sk_port_offset(sk); 742 return __inet_hash_connect(death_row, sk, port_offset, 743 __inet_check_established); 744 } 745 EXPORT_SYMBOL_GPL(inet_hash_connect); 746 747 void inet_hashinfo_init(struct inet_hashinfo *h) 748 { 749 int i; 750 751 for (i = 0; i < INET_LHTABLE_SIZE; i++) { 752 spin_lock_init(&h->listening_hash[i].lock); 753 INIT_HLIST_HEAD(&h->listening_hash[i].head); 754 h->listening_hash[i].count = 0; 755 } 756 757 h->lhash2 = NULL; 758 } 759 EXPORT_SYMBOL_GPL(inet_hashinfo_init); 760 761 static void init_hashinfo_lhash2(struct inet_hashinfo *h) 762 { 763 int i; 764 765 for (i = 0; i <= h->lhash2_mask; i++) { 766 spin_lock_init(&h->lhash2[i].lock); 767 INIT_HLIST_HEAD(&h->lhash2[i].head); 768 h->lhash2[i].count = 0; 769 } 770 } 771 772 void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, 773 unsigned long numentries, int scale, 774 unsigned long low_limit, 775 unsigned long high_limit) 776 { 777 h->lhash2 = alloc_large_system_hash(name, 778 sizeof(*h->lhash2), 779 numentries, 780 scale, 781 0, 782 NULL, 783 &h->lhash2_mask, 784 low_limit, 785 high_limit); 786 init_hashinfo_lhash2(h); 787 } 788 789 int inet_hashinfo2_init_mod(struct inet_hashinfo *h) 790 { 791 h->lhash2 = kmalloc_array(INET_LHTABLE_SIZE, sizeof(*h->lhash2), GFP_KERNEL); 792 if (!h->lhash2) 793 return -ENOMEM; 794 795 h->lhash2_mask = INET_LHTABLE_SIZE - 1; 796 /* INET_LHTABLE_SIZE must be a power of 2 */ 797 BUG_ON(INET_LHTABLE_SIZE & h->lhash2_mask); 798 799 init_hashinfo_lhash2(h); 800 return 0; 801 } 802 EXPORT_SYMBOL_GPL(inet_hashinfo2_init_mod); 803 804 int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) 805 { 806 unsigned int locksz = sizeof(spinlock_t); 807 unsigned int i, nblocks = 1; 808 809 if (locksz != 0) { 810 /* allocate 2 cache lines or at least one spinlock per cpu */ 811 nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U); 812 nblocks = roundup_pow_of_two(nblocks * num_possible_cpus()); 813 814 /* no more locks than number of hash buckets */ 815 nblocks = min(nblocks, hashinfo->ehash_mask + 1); 816 817 hashinfo->ehash_locks = kvmalloc_array(nblocks, locksz, GFP_KERNEL); 818 if (!hashinfo->ehash_locks) 819 return -ENOMEM; 820 821 for (i = 0; i < nblocks; i++) 822 spin_lock_init(&hashinfo->ehash_locks[i]); 823 } 824 hashinfo->ehash_locks_mask = nblocks - 1; 825 return 0; 826 } 827 EXPORT_SYMBOL_GPL(inet_ehash_locks_alloc); 828