1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * INET An implementation of the TCP/IP protocol suite for the LINUX 4 * operating system. INET is implemented using the BSD Socket 5 * interface as the means of communication with the user level. 6 * 7 * Generic INET transport hashtables 8 * 9 * Authors: Lotsa people, from code originally in tcp 10 */ 11 12 #include <linux/module.h> 13 #include <linux/random.h> 14 #include <linux/sched.h> 15 #include <linux/slab.h> 16 #include <linux/wait.h> 17 #include <linux/vmalloc.h> 18 #include <linux/memblock.h> 19 20 #include <net/addrconf.h> 21 #include <net/inet_connection_sock.h> 22 #include <net/inet_hashtables.h> 23 #if IS_ENABLED(CONFIG_IPV6) 24 #include <net/inet6_hashtables.h> 25 #endif 26 #include <net/hotdata.h> 27 #include <net/ip.h> 28 #include <net/rps.h> 29 #include <net/secure_seq.h> 30 #include <net/sock_reuseport.h> 31 #include <net/tcp.h> 32 33 u32 inet_ehashfn(const struct net *net, const __be32 laddr, 34 const __u16 lport, const __be32 faddr, 35 const __be16 fport) 36 { 37 net_get_random_once(&inet_ehash_secret, sizeof(inet_ehash_secret)); 38 39 return lport + __inet_ehashfn(laddr, 0, faddr, fport, 40 inet_ehash_secret + net_hash_mix(net)); 41 } 42 EXPORT_SYMBOL_GPL(inet_ehashfn); 43 44 /* This function handles inet_sock, but also timewait and request sockets 45 * for IPv4/IPv6. 46 */ 47 static u32 sk_ehashfn(const struct sock *sk) 48 { 49 #if IS_ENABLED(CONFIG_IPV6) 50 if (sk->sk_family == AF_INET6 && 51 !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) 52 return inet6_ehashfn(sock_net(sk), 53 &sk->sk_v6_rcv_saddr, sk->sk_num, 54 &sk->sk_v6_daddr, sk->sk_dport); 55 #endif 56 return inet_ehashfn(sock_net(sk), 57 sk->sk_rcv_saddr, sk->sk_num, 58 sk->sk_daddr, sk->sk_dport); 59 } 60 61 static bool sk_is_connect_bind(const struct sock *sk) 62 { 63 if (sk->sk_state == TCP_TIME_WAIT) 64 return inet_twsk(sk)->tw_connect_bind; 65 else 66 return sk->sk_userlocks & SOCK_CONNECT_BIND; 67 } 68 69 /* 70 * Allocate and initialize a new local port bind bucket. 71 * The bindhash mutex for snum's hash chain must be held here. 72 */ 73 struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, 74 struct net *net, 75 struct inet_bind_hashbucket *head, 76 const unsigned short snum, 77 int l3mdev) 78 { 79 struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC); 80 81 if (tb) { 82 write_pnet(&tb->ib_net, net); 83 tb->l3mdev = l3mdev; 84 tb->port = snum; 85 tb->fastreuse = 0; 86 tb->fastreuseport = 0; 87 INIT_HLIST_HEAD(&tb->bhash2); 88 hlist_add_head_rcu(&tb->node, &head->chain); 89 } 90 return tb; 91 } 92 93 /* 94 * Caller must hold hashbucket lock for this tb with local BH disabled 95 */ 96 void inet_bind_bucket_destroy(struct inet_bind_bucket *tb) 97 { 98 const struct inet_bind2_bucket *tb2; 99 100 if (hlist_empty(&tb->bhash2)) { 101 hlist_del_rcu(&tb->node); 102 kfree_rcu(tb, rcu); 103 return; 104 } 105 106 if (tb->fastreuse == -1 && tb->fastreuseport == -1) 107 return; 108 hlist_for_each_entry(tb2, &tb->bhash2, bhash_node) { 109 if (tb2->fastreuse != -1 || tb2->fastreuseport != -1) 110 return; 111 } 112 tb->fastreuse = -1; 113 tb->fastreuseport = -1; 114 } 115 116 bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, const struct net *net, 117 unsigned short port, int l3mdev) 118 { 119 return net_eq(ib_net(tb), net) && tb->port == port && 120 tb->l3mdev == l3mdev; 121 } 122 123 static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb2, 124 struct net *net, 125 struct inet_bind_hashbucket *head, 126 struct inet_bind_bucket *tb, 127 const struct sock *sk) 128 { 129 write_pnet(&tb2->ib_net, net); 130 tb2->l3mdev = tb->l3mdev; 131 tb2->port = tb->port; 132 #if IS_ENABLED(CONFIG_IPV6) 133 BUILD_BUG_ON(USHRT_MAX < (IPV6_ADDR_ANY | IPV6_ADDR_MAPPED)); 134 if (sk->sk_family == AF_INET6) { 135 tb2->addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr); 136 tb2->v6_rcv_saddr = sk->sk_v6_rcv_saddr; 137 } else { 138 tb2->addr_type = IPV6_ADDR_MAPPED; 139 ipv6_addr_set_v4mapped(sk->sk_rcv_saddr, &tb2->v6_rcv_saddr); 140 } 141 #else 142 tb2->rcv_saddr = sk->sk_rcv_saddr; 143 #endif 144 tb2->fastreuse = 0; 145 tb2->fastreuseport = 0; 146 INIT_HLIST_HEAD(&tb2->owners); 147 hlist_add_head(&tb2->node, &head->chain); 148 hlist_add_head(&tb2->bhash_node, &tb->bhash2); 149 } 150 151 struct inet_bind2_bucket *inet_bind2_bucket_create(struct kmem_cache *cachep, 152 struct net *net, 153 struct inet_bind_hashbucket *head, 154 struct inet_bind_bucket *tb, 155 const struct sock *sk) 156 { 157 struct inet_bind2_bucket *tb2 = kmem_cache_alloc(cachep, GFP_ATOMIC); 158 159 if (tb2) 160 inet_bind2_bucket_init(tb2, net, head, tb, sk); 161 162 return tb2; 163 } 164 165 /* Caller must hold hashbucket lock for this tb with local BH disabled */ 166 void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb) 167 { 168 const struct sock *sk; 169 170 if (hlist_empty(&tb->owners)) { 171 __hlist_del(&tb->node); 172 __hlist_del(&tb->bhash_node); 173 kmem_cache_free(cachep, tb); 174 return; 175 } 176 177 if (tb->fastreuse == -1 && tb->fastreuseport == -1) 178 return; 179 sk_for_each_bound(sk, &tb->owners) { 180 if (!sk_is_connect_bind(sk)) 181 return; 182 } 183 tb->fastreuse = -1; 184 tb->fastreuseport = -1; 185 } 186 187 static bool inet_bind2_bucket_addr_match(const struct inet_bind2_bucket *tb2, 188 const struct sock *sk) 189 { 190 #if IS_ENABLED(CONFIG_IPV6) 191 if (sk->sk_family == AF_INET6) 192 return ipv6_addr_equal(&tb2->v6_rcv_saddr, &sk->sk_v6_rcv_saddr); 193 194 if (tb2->addr_type != IPV6_ADDR_MAPPED) 195 return false; 196 #endif 197 return tb2->rcv_saddr == sk->sk_rcv_saddr; 198 } 199 200 void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, 201 struct inet_bind2_bucket *tb2, unsigned short port) 202 { 203 inet_sk(sk)->inet_num = port; 204 inet_csk(sk)->icsk_bind_hash = tb; 205 inet_csk(sk)->icsk_bind2_hash = tb2; 206 sk_add_bind_node(sk, &tb2->owners); 207 } 208 209 /* 210 * Get rid of any references to a local port held by the given sock. 211 */ 212 static void __inet_put_port(struct sock *sk) 213 { 214 struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); 215 struct inet_bind_hashbucket *head, *head2; 216 struct net *net = sock_net(sk); 217 struct inet_bind_bucket *tb; 218 int bhash; 219 220 bhash = inet_bhashfn(net, inet_sk(sk)->inet_num, hashinfo->bhash_size); 221 head = &hashinfo->bhash[bhash]; 222 head2 = inet_bhashfn_portaddr(hashinfo, sk, net, inet_sk(sk)->inet_num); 223 224 spin_lock(&head->lock); 225 tb = inet_csk(sk)->icsk_bind_hash; 226 inet_csk(sk)->icsk_bind_hash = NULL; 227 inet_sk(sk)->inet_num = 0; 228 sk->sk_userlocks &= ~SOCK_CONNECT_BIND; 229 230 spin_lock(&head2->lock); 231 if (inet_csk(sk)->icsk_bind2_hash) { 232 struct inet_bind2_bucket *tb2 = inet_csk(sk)->icsk_bind2_hash; 233 234 __sk_del_bind_node(sk); 235 inet_csk(sk)->icsk_bind2_hash = NULL; 236 inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2); 237 } 238 spin_unlock(&head2->lock); 239 240 inet_bind_bucket_destroy(tb); 241 spin_unlock(&head->lock); 242 } 243 244 void inet_put_port(struct sock *sk) 245 { 246 local_bh_disable(); 247 __inet_put_port(sk); 248 local_bh_enable(); 249 } 250 EXPORT_SYMBOL(inet_put_port); 251 252 int __inet_inherit_port(const struct sock *sk, struct sock *child) 253 { 254 struct inet_hashinfo *table = tcp_get_hashinfo(sk); 255 unsigned short port = inet_sk(child)->inet_num; 256 struct inet_bind_hashbucket *head, *head2; 257 bool created_inet_bind_bucket = false; 258 struct net *net = sock_net(sk); 259 bool update_fastreuse = false; 260 struct inet_bind2_bucket *tb2; 261 struct inet_bind_bucket *tb; 262 int bhash, l3mdev; 263 264 bhash = inet_bhashfn(net, port, table->bhash_size); 265 head = &table->bhash[bhash]; 266 head2 = inet_bhashfn_portaddr(table, child, net, port); 267 268 spin_lock(&head->lock); 269 spin_lock(&head2->lock); 270 tb = inet_csk(sk)->icsk_bind_hash; 271 tb2 = inet_csk(sk)->icsk_bind2_hash; 272 if (unlikely(!tb || !tb2)) { 273 spin_unlock(&head2->lock); 274 spin_unlock(&head->lock); 275 return -ENOENT; 276 } 277 if (tb->port != port) { 278 l3mdev = inet_sk_bound_l3mdev(sk); 279 280 /* NOTE: using tproxy and redirecting skbs to a proxy 281 * on a different listener port breaks the assumption 282 * that the listener socket's icsk_bind_hash is the same 283 * as that of the child socket. We have to look up or 284 * create a new bind bucket for the child here. */ 285 inet_bind_bucket_for_each(tb, &head->chain) { 286 if (inet_bind_bucket_match(tb, net, port, l3mdev)) 287 break; 288 } 289 if (!tb) { 290 tb = inet_bind_bucket_create(table->bind_bucket_cachep, 291 net, head, port, l3mdev); 292 if (!tb) { 293 spin_unlock(&head2->lock); 294 spin_unlock(&head->lock); 295 return -ENOMEM; 296 } 297 created_inet_bind_bucket = true; 298 } 299 update_fastreuse = true; 300 301 goto bhash2_find; 302 } else if (!inet_bind2_bucket_addr_match(tb2, child)) { 303 l3mdev = inet_sk_bound_l3mdev(sk); 304 305 bhash2_find: 306 tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, child); 307 if (!tb2) { 308 tb2 = inet_bind2_bucket_create(table->bind2_bucket_cachep, 309 net, head2, tb, child); 310 if (!tb2) 311 goto error; 312 } 313 } 314 if (update_fastreuse) 315 inet_csk_update_fastreuse(child, tb, tb2); 316 inet_bind_hash(child, tb, tb2, port); 317 spin_unlock(&head2->lock); 318 spin_unlock(&head->lock); 319 320 return 0; 321 322 error: 323 if (created_inet_bind_bucket) 324 inet_bind_bucket_destroy(tb); 325 spin_unlock(&head2->lock); 326 spin_unlock(&head->lock); 327 return -ENOMEM; 328 } 329 EXPORT_SYMBOL_GPL(__inet_inherit_port); 330 331 static struct inet_listen_hashbucket * 332 inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk) 333 { 334 u32 hash; 335 336 #if IS_ENABLED(CONFIG_IPV6) 337 if (sk->sk_family == AF_INET6) 338 hash = ipv6_portaddr_hash(sock_net(sk), 339 &sk->sk_v6_rcv_saddr, 340 inet_sk(sk)->inet_num); 341 else 342 #endif 343 hash = ipv4_portaddr_hash(sock_net(sk), 344 inet_sk(sk)->inet_rcv_saddr, 345 inet_sk(sk)->inet_num); 346 return inet_lhash2_bucket(h, hash); 347 } 348 349 static inline int compute_score(struct sock *sk, const struct net *net, 350 const unsigned short hnum, const __be32 daddr, 351 const int dif, const int sdif) 352 { 353 int score = -1; 354 355 if (net_eq(sock_net(sk), net) && sk->sk_num == hnum && 356 !ipv6_only_sock(sk)) { 357 if (sk->sk_rcv_saddr != daddr) 358 return -1; 359 360 if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) 361 return -1; 362 score = sk->sk_bound_dev_if ? 2 : 1; 363 364 if (sk->sk_family == PF_INET) 365 score++; 366 if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) 367 score++; 368 } 369 return score; 370 } 371 372 /** 373 * inet_lookup_reuseport() - execute reuseport logic on AF_INET socket if necessary. 374 * @net: network namespace. 375 * @sk: AF_INET socket, must be in TCP_LISTEN state for TCP or TCP_CLOSE for UDP. 376 * @skb: context for a potential SK_REUSEPORT program. 377 * @doff: header offset. 378 * @saddr: source address. 379 * @sport: source port. 380 * @daddr: destination address. 381 * @hnum: destination port in host byte order. 382 * @ehashfn: hash function used to generate the fallback hash. 383 * 384 * Return: NULL if sk doesn't have SO_REUSEPORT set, otherwise a pointer to 385 * the selected sock or an error. 386 */ 387 struct sock *inet_lookup_reuseport(const struct net *net, struct sock *sk, 388 struct sk_buff *skb, int doff, 389 __be32 saddr, __be16 sport, 390 __be32 daddr, unsigned short hnum, 391 inet_ehashfn_t *ehashfn) 392 { 393 struct sock *reuse_sk = NULL; 394 u32 phash; 395 396 if (sk->sk_reuseport) { 397 phash = INDIRECT_CALL_2(ehashfn, udp_ehashfn, inet_ehashfn, 398 net, daddr, hnum, saddr, sport); 399 reuse_sk = reuseport_select_sock(sk, phash, skb, doff); 400 } 401 return reuse_sk; 402 } 403 EXPORT_SYMBOL_GPL(inet_lookup_reuseport); 404 405 /* 406 * Here are some nice properties to exploit here. The BSD API 407 * does not allow a listening sock to specify the remote port nor the 408 * remote address for the connection. So always assume those are both 409 * wildcarded during the search since they can never be otherwise. 410 */ 411 412 /* called with rcu_read_lock() : No refcount taken on the socket */ 413 static struct sock *inet_lhash2_lookup(const struct net *net, 414 struct inet_listen_hashbucket *ilb2, 415 struct sk_buff *skb, int doff, 416 const __be32 saddr, __be16 sport, 417 const __be32 daddr, const unsigned short hnum, 418 const int dif, const int sdif) 419 { 420 struct sock *sk, *result = NULL; 421 struct hlist_nulls_node *node; 422 int score, hiscore = 0; 423 424 sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) { 425 score = compute_score(sk, net, hnum, daddr, dif, sdif); 426 if (score > hiscore) { 427 result = inet_lookup_reuseport(net, sk, skb, doff, 428 saddr, sport, daddr, hnum, inet_ehashfn); 429 if (result) 430 return result; 431 432 result = sk; 433 hiscore = score; 434 } 435 } 436 437 return result; 438 } 439 440 struct sock *inet_lookup_run_sk_lookup(const struct net *net, 441 int protocol, 442 struct sk_buff *skb, int doff, 443 __be32 saddr, __be16 sport, 444 __be32 daddr, u16 hnum, const int dif, 445 inet_ehashfn_t *ehashfn) 446 { 447 struct sock *sk, *reuse_sk; 448 bool no_reuseport; 449 450 no_reuseport = bpf_sk_lookup_run_v4(net, protocol, saddr, sport, 451 daddr, hnum, dif, &sk); 452 if (no_reuseport || IS_ERR_OR_NULL(sk)) 453 return sk; 454 455 reuse_sk = inet_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum, 456 ehashfn); 457 if (reuse_sk) 458 sk = reuse_sk; 459 return sk; 460 } 461 462 struct sock *__inet_lookup_listener(const struct net *net, 463 struct sk_buff *skb, int doff, 464 const __be32 saddr, __be16 sport, 465 const __be32 daddr, const unsigned short hnum, 466 const int dif, const int sdif) 467 { 468 struct inet_listen_hashbucket *ilb2; 469 struct inet_hashinfo *hashinfo; 470 struct sock *result = NULL; 471 unsigned int hash2; 472 473 /* Lookup redirect from BPF */ 474 if (static_branch_unlikely(&bpf_sk_lookup_enabled)) { 475 result = inet_lookup_run_sk_lookup(net, IPPROTO_TCP, skb, doff, 476 saddr, sport, daddr, hnum, dif, 477 inet_ehashfn); 478 if (result) 479 goto done; 480 } 481 482 hashinfo = net->ipv4.tcp_death_row.hashinfo; 483 hash2 = ipv4_portaddr_hash(net, daddr, hnum); 484 ilb2 = inet_lhash2_bucket(hashinfo, hash2); 485 486 result = inet_lhash2_lookup(net, ilb2, skb, doff, 487 saddr, sport, daddr, hnum, 488 dif, sdif); 489 if (result) 490 goto done; 491 492 /* Lookup lhash2 with INADDR_ANY */ 493 hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum); 494 ilb2 = inet_lhash2_bucket(hashinfo, hash2); 495 496 result = inet_lhash2_lookup(net, ilb2, skb, doff, 497 saddr, sport, htonl(INADDR_ANY), hnum, 498 dif, sdif); 499 done: 500 if (IS_ERR(result)) 501 return NULL; 502 return result; 503 } 504 EXPORT_SYMBOL_GPL(__inet_lookup_listener); 505 506 /* All sockets share common refcount, but have different destructors */ 507 void sock_gen_put(struct sock *sk) 508 { 509 if (!refcount_dec_and_test(&sk->sk_refcnt)) 510 return; 511 512 if (sk->sk_state == TCP_TIME_WAIT) 513 inet_twsk_free(inet_twsk(sk)); 514 else if (sk->sk_state == TCP_NEW_SYN_RECV) 515 reqsk_free(inet_reqsk(sk)); 516 else 517 sk_free(sk); 518 } 519 EXPORT_SYMBOL_GPL(sock_gen_put); 520 521 void sock_edemux(struct sk_buff *skb) 522 { 523 sock_gen_put(skb->sk); 524 } 525 EXPORT_SYMBOL(sock_edemux); 526 527 struct sock *__inet_lookup_established(const struct net *net, 528 const __be32 saddr, const __be16 sport, 529 const __be32 daddr, const u16 hnum, 530 const int dif, const int sdif) 531 { 532 const __portpair ports = INET_COMBINED_PORTS(sport, hnum); 533 INET_ADDR_COOKIE(acookie, saddr, daddr); 534 const struct hlist_nulls_node *node; 535 struct inet_ehash_bucket *head; 536 struct inet_hashinfo *hashinfo; 537 unsigned int hash, slot; 538 struct sock *sk; 539 540 hashinfo = net->ipv4.tcp_death_row.hashinfo; 541 hash = inet_ehashfn(net, daddr, hnum, saddr, sport); 542 slot = hash & hashinfo->ehash_mask; 543 head = &hashinfo->ehash[slot]; 544 545 begin: 546 sk_nulls_for_each_rcu(sk, node, &head->chain) { 547 if (sk->sk_hash != hash) 548 continue; 549 if (likely(inet_match(net, sk, acookie, ports, dif, sdif))) { 550 if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) 551 goto out; 552 if (unlikely(!inet_match(net, sk, acookie, 553 ports, dif, sdif))) { 554 sock_gen_put(sk); 555 goto begin; 556 } 557 goto found; 558 } 559 } 560 /* 561 * if the nulls value we got at the end of this lookup is 562 * not the expected one, we must restart lookup. 563 * We probably met an item that was moved to another chain. 564 */ 565 if (get_nulls_value(node) != slot) 566 goto begin; 567 out: 568 sk = NULL; 569 found: 570 return sk; 571 } 572 EXPORT_SYMBOL_GPL(__inet_lookup_established); 573 574 /* called with local bh disabled */ 575 static int __inet_check_established(struct inet_timewait_death_row *death_row, 576 struct sock *sk, __u16 lport, 577 struct inet_timewait_sock **twp, 578 bool rcu_lookup, 579 u32 hash) 580 { 581 struct inet_hashinfo *hinfo = death_row->hashinfo; 582 struct inet_sock *inet = inet_sk(sk); 583 __be32 daddr = inet->inet_rcv_saddr; 584 __be32 saddr = inet->inet_daddr; 585 int dif = sk->sk_bound_dev_if; 586 struct net *net = sock_net(sk); 587 int sdif = l3mdev_master_ifindex_by_index(net, dif); 588 INET_ADDR_COOKIE(acookie, saddr, daddr); 589 const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport); 590 struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); 591 struct inet_timewait_sock *tw = NULL; 592 const struct hlist_nulls_node *node; 593 struct sock *sk2; 594 spinlock_t *lock; 595 596 if (rcu_lookup) { 597 sk_nulls_for_each(sk2, node, &head->chain) { 598 if (sk2->sk_hash != hash || 599 !inet_match(net, sk2, acookie, ports, dif, sdif)) 600 continue; 601 if (sk2->sk_state == TCP_TIME_WAIT) 602 break; 603 return -EADDRNOTAVAIL; 604 } 605 return 0; 606 } 607 608 lock = inet_ehash_lockp(hinfo, hash); 609 spin_lock(lock); 610 611 sk_nulls_for_each(sk2, node, &head->chain) { 612 if (sk2->sk_hash != hash) 613 continue; 614 615 if (likely(inet_match(net, sk2, acookie, ports, dif, sdif))) { 616 if (sk2->sk_state == TCP_TIME_WAIT) { 617 tw = inet_twsk(sk2); 618 if (tcp_twsk_unique(sk, sk2, twp)) 619 break; 620 } 621 goto not_unique; 622 } 623 } 624 625 /* Must record num and sport now. Otherwise we will see 626 * in hash table socket with a funny identity. 627 */ 628 inet->inet_num = lport; 629 inet->inet_sport = htons(lport); 630 sk->sk_hash = hash; 631 WARN_ON(!sk_unhashed(sk)); 632 __sk_nulls_add_node_rcu(sk, &head->chain); 633 if (tw) { 634 sk_nulls_del_node_init_rcu((struct sock *)tw); 635 __NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED); 636 } 637 spin_unlock(lock); 638 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 639 640 if (twp) { 641 *twp = tw; 642 } else if (tw) { 643 /* Silly. Should hash-dance instead... */ 644 inet_twsk_deschedule_put(tw); 645 } 646 return 0; 647 648 not_unique: 649 spin_unlock(lock); 650 return -EADDRNOTAVAIL; 651 } 652 653 static u64 inet_sk_port_offset(const struct sock *sk) 654 { 655 const struct inet_sock *inet = inet_sk(sk); 656 657 return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr, 658 inet->inet_daddr, 659 inet->inet_dport); 660 } 661 662 /* Searches for an exsiting socket in the ehash bucket list. 663 * Returns true if found, false otherwise. 664 */ 665 static bool inet_ehash_lookup_by_sk(struct sock *sk, 666 struct hlist_nulls_head *list) 667 { 668 const __portpair ports = INET_COMBINED_PORTS(sk->sk_dport, sk->sk_num); 669 const int sdif = sk->sk_bound_dev_if; 670 const int dif = sk->sk_bound_dev_if; 671 const struct hlist_nulls_node *node; 672 struct net *net = sock_net(sk); 673 struct sock *esk; 674 675 INET_ADDR_COOKIE(acookie, sk->sk_daddr, sk->sk_rcv_saddr); 676 677 sk_nulls_for_each_rcu(esk, node, list) { 678 if (esk->sk_hash != sk->sk_hash) 679 continue; 680 if (sk->sk_family == AF_INET) { 681 if (unlikely(inet_match(net, esk, acookie, 682 ports, dif, sdif))) { 683 return true; 684 } 685 } 686 #if IS_ENABLED(CONFIG_IPV6) 687 else if (sk->sk_family == AF_INET6) { 688 if (unlikely(inet6_match(net, esk, 689 &sk->sk_v6_daddr, 690 &sk->sk_v6_rcv_saddr, 691 ports, dif, sdif))) { 692 return true; 693 } 694 } 695 #endif 696 } 697 return false; 698 } 699 700 /* Insert a socket into ehash, and eventually remove another one 701 * (The another one can be a SYN_RECV or TIMEWAIT) 702 * If an existing socket already exists, socket sk is not inserted, 703 * and sets found_dup_sk parameter to true. 704 */ 705 bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk) 706 { 707 struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); 708 struct inet_ehash_bucket *head; 709 struct hlist_nulls_head *list; 710 spinlock_t *lock; 711 bool ret = true; 712 713 WARN_ON_ONCE(!sk_unhashed(sk)); 714 715 sk->sk_hash = sk_ehashfn(sk); 716 head = inet_ehash_bucket(hashinfo, sk->sk_hash); 717 list = &head->chain; 718 lock = inet_ehash_lockp(hashinfo, sk->sk_hash); 719 720 spin_lock(lock); 721 if (osk) { 722 WARN_ON_ONCE(sk->sk_hash != osk->sk_hash); 723 ret = sk_nulls_del_node_init_rcu(osk); 724 } else if (found_dup_sk) { 725 *found_dup_sk = inet_ehash_lookup_by_sk(sk, list); 726 if (*found_dup_sk) 727 ret = false; 728 } 729 730 if (ret) 731 __sk_nulls_add_node_rcu(sk, list); 732 733 spin_unlock(lock); 734 735 return ret; 736 } 737 738 bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk) 739 { 740 bool ok = inet_ehash_insert(sk, osk, found_dup_sk); 741 742 if (ok) { 743 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 744 } else { 745 tcp_orphan_count_inc(); 746 inet_sk_set_state(sk, TCP_CLOSE); 747 sock_set_flag(sk, SOCK_DEAD); 748 inet_csk_destroy_sock(sk); 749 } 750 return ok; 751 } 752 EXPORT_IPV6_MOD(inet_ehash_nolisten); 753 754 static int inet_reuseport_add_sock(struct sock *sk, 755 struct inet_listen_hashbucket *ilb) 756 { 757 struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash; 758 const struct hlist_nulls_node *node; 759 kuid_t uid = sk_uid(sk); 760 struct sock *sk2; 761 762 sk_nulls_for_each_rcu(sk2, node, &ilb->nulls_head) { 763 if (sk2 != sk && 764 sk2->sk_family == sk->sk_family && 765 ipv6_only_sock(sk2) == ipv6_only_sock(sk) && 766 sk2->sk_bound_dev_if == sk->sk_bound_dev_if && 767 inet_csk(sk2)->icsk_bind_hash == tb && 768 sk2->sk_reuseport && uid_eq(uid, sk_uid(sk2)) && 769 inet_rcv_saddr_equal(sk, sk2, false)) 770 return reuseport_add_sock(sk, sk2, 771 inet_rcv_saddr_any(sk)); 772 } 773 774 return reuseport_alloc(sk, inet_rcv_saddr_any(sk)); 775 } 776 777 int inet_hash(struct sock *sk) 778 { 779 struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); 780 struct inet_listen_hashbucket *ilb2; 781 int err = 0; 782 783 if (sk->sk_state == TCP_CLOSE) 784 return 0; 785 786 if (sk->sk_state != TCP_LISTEN) { 787 local_bh_disable(); 788 inet_ehash_nolisten(sk, NULL, NULL); 789 local_bh_enable(); 790 return 0; 791 } 792 WARN_ON(!sk_unhashed(sk)); 793 ilb2 = inet_lhash2_bucket_sk(hashinfo, sk); 794 795 spin_lock(&ilb2->lock); 796 if (sk->sk_reuseport) { 797 err = inet_reuseport_add_sock(sk, ilb2); 798 if (err) 799 goto unlock; 800 } 801 sock_set_flag(sk, SOCK_RCU_FREE); 802 if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && 803 sk->sk_family == AF_INET6) 804 __sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head); 805 else 806 __sk_nulls_add_node_rcu(sk, &ilb2->nulls_head); 807 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 808 unlock: 809 spin_unlock(&ilb2->lock); 810 811 return err; 812 } 813 EXPORT_IPV6_MOD(inet_hash); 814 815 void inet_unhash(struct sock *sk) 816 { 817 struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); 818 819 if (sk_unhashed(sk)) 820 return; 821 822 sock_rps_delete_flow(sk); 823 if (sk->sk_state == TCP_LISTEN) { 824 struct inet_listen_hashbucket *ilb2; 825 826 ilb2 = inet_lhash2_bucket_sk(hashinfo, sk); 827 /* Don't disable bottom halves while acquiring the lock to 828 * avoid circular locking dependency on PREEMPT_RT. 829 */ 830 spin_lock(&ilb2->lock); 831 if (rcu_access_pointer(sk->sk_reuseport_cb)) 832 reuseport_stop_listen_sock(sk); 833 834 __sk_nulls_del_node_init_rcu(sk); 835 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 836 spin_unlock(&ilb2->lock); 837 } else { 838 spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash); 839 840 spin_lock_bh(lock); 841 __sk_nulls_del_node_init_rcu(sk); 842 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 843 spin_unlock_bh(lock); 844 } 845 } 846 EXPORT_IPV6_MOD(inet_unhash); 847 848 static bool inet_bind2_bucket_match(const struct inet_bind2_bucket *tb, 849 const struct net *net, unsigned short port, 850 int l3mdev, const struct sock *sk) 851 { 852 if (!net_eq(ib2_net(tb), net) || tb->port != port || 853 tb->l3mdev != l3mdev) 854 return false; 855 856 return inet_bind2_bucket_addr_match(tb, sk); 857 } 858 859 bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb, const struct net *net, 860 unsigned short port, int l3mdev, const struct sock *sk) 861 { 862 if (!net_eq(ib2_net(tb), net) || tb->port != port || 863 tb->l3mdev != l3mdev) 864 return false; 865 866 #if IS_ENABLED(CONFIG_IPV6) 867 if (tb->addr_type == IPV6_ADDR_ANY) 868 return true; 869 870 if (tb->addr_type != IPV6_ADDR_MAPPED) 871 return false; 872 873 if (sk->sk_family == AF_INET6 && 874 !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) 875 return false; 876 #endif 877 return tb->rcv_saddr == 0; 878 } 879 880 /* The socket's bhash2 hashbucket spinlock must be held when this is called */ 881 struct inet_bind2_bucket * 882 inet_bind2_bucket_find(const struct inet_bind_hashbucket *head, const struct net *net, 883 unsigned short port, int l3mdev, const struct sock *sk) 884 { 885 struct inet_bind2_bucket *bhash2 = NULL; 886 887 inet_bind_bucket_for_each(bhash2, &head->chain) 888 if (inet_bind2_bucket_match(bhash2, net, port, l3mdev, sk)) 889 break; 890 891 return bhash2; 892 } 893 894 struct inet_bind_hashbucket * 895 inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, int port) 896 { 897 struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk); 898 u32 hash; 899 900 #if IS_ENABLED(CONFIG_IPV6) 901 if (sk->sk_family == AF_INET6) 902 hash = ipv6_portaddr_hash(net, &in6addr_any, port); 903 else 904 #endif 905 hash = ipv4_portaddr_hash(net, 0, port); 906 907 return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)]; 908 } 909 910 static void inet_update_saddr(struct sock *sk, void *saddr, int family) 911 { 912 if (family == AF_INET) { 913 inet_sk(sk)->inet_saddr = *(__be32 *)saddr; 914 sk_rcv_saddr_set(sk, inet_sk(sk)->inet_saddr); 915 } 916 #if IS_ENABLED(CONFIG_IPV6) 917 else { 918 sk->sk_v6_rcv_saddr = *(struct in6_addr *)saddr; 919 } 920 #endif 921 } 922 923 static int __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family, bool reset) 924 { 925 struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk); 926 struct inet_bind_hashbucket *head, *head2; 927 struct inet_bind2_bucket *tb2, *new_tb2; 928 int l3mdev = inet_sk_bound_l3mdev(sk); 929 int port = inet_sk(sk)->inet_num; 930 struct net *net = sock_net(sk); 931 int bhash; 932 933 if (!inet_csk(sk)->icsk_bind2_hash) { 934 /* Not bind()ed before. */ 935 if (reset) 936 inet_reset_saddr(sk); 937 else 938 inet_update_saddr(sk, saddr, family); 939 940 return 0; 941 } 942 943 /* Allocate a bind2 bucket ahead of time to avoid permanently putting 944 * the bhash2 table in an inconsistent state if a new tb2 bucket 945 * allocation fails. 946 */ 947 new_tb2 = kmem_cache_alloc(hinfo->bind2_bucket_cachep, GFP_ATOMIC); 948 if (!new_tb2) { 949 if (reset) { 950 /* The (INADDR_ANY, port) bucket might have already 951 * been freed, then we cannot fixup icsk_bind2_hash, 952 * so we give up and unlink sk from bhash/bhash2 not 953 * to leave inconsistency in bhash2. 954 */ 955 inet_put_port(sk); 956 inet_reset_saddr(sk); 957 } 958 959 return -ENOMEM; 960 } 961 962 bhash = inet_bhashfn(net, port, hinfo->bhash_size); 963 head = &hinfo->bhash[bhash]; 964 head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); 965 966 /* If we change saddr locklessly, another thread 967 * iterating over bhash might see corrupted address. 968 */ 969 spin_lock_bh(&head->lock); 970 971 spin_lock(&head2->lock); 972 __sk_del_bind_node(sk); 973 inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, inet_csk(sk)->icsk_bind2_hash); 974 spin_unlock(&head2->lock); 975 976 if (reset) 977 inet_reset_saddr(sk); 978 else 979 inet_update_saddr(sk, saddr, family); 980 981 head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); 982 983 spin_lock(&head2->lock); 984 tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk); 985 if (!tb2) { 986 tb2 = new_tb2; 987 inet_bind2_bucket_init(tb2, net, head2, inet_csk(sk)->icsk_bind_hash, sk); 988 if (sk_is_connect_bind(sk)) { 989 tb2->fastreuse = -1; 990 tb2->fastreuseport = -1; 991 } 992 } 993 inet_csk(sk)->icsk_bind2_hash = tb2; 994 sk_add_bind_node(sk, &tb2->owners); 995 spin_unlock(&head2->lock); 996 997 spin_unlock_bh(&head->lock); 998 999 if (tb2 != new_tb2) 1000 kmem_cache_free(hinfo->bind2_bucket_cachep, new_tb2); 1001 1002 return 0; 1003 } 1004 1005 int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family) 1006 { 1007 return __inet_bhash2_update_saddr(sk, saddr, family, false); 1008 } 1009 EXPORT_IPV6_MOD(inet_bhash2_update_saddr); 1010 1011 void inet_bhash2_reset_saddr(struct sock *sk) 1012 { 1013 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) 1014 __inet_bhash2_update_saddr(sk, NULL, 0, true); 1015 } 1016 EXPORT_IPV6_MOD(inet_bhash2_reset_saddr); 1017 1018 /* RFC 6056 3.3.4. Algorithm 4: Double-Hash Port Selection Algorithm 1019 * Note that we use 32bit integers (vs RFC 'short integers') 1020 * because 2^16 is not a multiple of num_ephemeral and this 1021 * property might be used by clever attacker. 1022 * 1023 * RFC claims using TABLE_LENGTH=10 buckets gives an improvement, though 1024 * attacks were since demonstrated, thus we use 65536 by default instead 1025 * to really give more isolation and privacy, at the expense of 256kB 1026 * of kernel memory. 1027 */ 1028 #define INET_TABLE_PERTURB_SIZE (1 << CONFIG_INET_TABLE_PERTURB_ORDER) 1029 static u32 *table_perturb; 1030 1031 int __inet_hash_connect(struct inet_timewait_death_row *death_row, 1032 struct sock *sk, u64 port_offset, 1033 u32 hash_port0, 1034 int (*check_established)(struct inet_timewait_death_row *, 1035 struct sock *, __u16, struct inet_timewait_sock **, 1036 bool rcu_lookup, u32 hash)) 1037 { 1038 struct inet_hashinfo *hinfo = death_row->hashinfo; 1039 struct inet_bind_hashbucket *head, *head2; 1040 struct inet_timewait_sock *tw = NULL; 1041 int port = inet_sk(sk)->inet_num; 1042 struct net *net = sock_net(sk); 1043 struct inet_bind2_bucket *tb2; 1044 struct inet_bind_bucket *tb; 1045 bool tb_created = false; 1046 u32 remaining, offset; 1047 int ret, i, low, high; 1048 bool local_ports; 1049 int step, l3mdev; 1050 u32 index; 1051 1052 if (port) { 1053 local_bh_disable(); 1054 ret = check_established(death_row, sk, port, NULL, false, 1055 hash_port0 + port); 1056 local_bh_enable(); 1057 return ret; 1058 } 1059 1060 l3mdev = inet_sk_bound_l3mdev(sk); 1061 1062 local_ports = inet_sk_get_local_port_range(sk, &low, &high); 1063 step = local_ports ? 1 : 2; 1064 1065 high++; /* [32768, 60999] -> [32768, 61000[ */ 1066 remaining = high - low; 1067 if (!local_ports && remaining > 1) 1068 remaining &= ~1U; 1069 1070 get_random_sleepable_once(table_perturb, 1071 INET_TABLE_PERTURB_SIZE * sizeof(*table_perturb)); 1072 index = port_offset & (INET_TABLE_PERTURB_SIZE - 1); 1073 1074 offset = READ_ONCE(table_perturb[index]) + (port_offset >> 32); 1075 offset %= remaining; 1076 1077 /* In first pass we try ports of @low parity. 1078 * inet_csk_get_port() does the opposite choice. 1079 */ 1080 if (!local_ports) 1081 offset &= ~1U; 1082 other_parity_scan: 1083 port = low + offset; 1084 for (i = 0; i < remaining; i += step, port += step) { 1085 if (unlikely(port >= high)) 1086 port -= remaining; 1087 if (inet_is_local_reserved_port(net, port)) 1088 continue; 1089 head = &hinfo->bhash[inet_bhashfn(net, port, 1090 hinfo->bhash_size)]; 1091 rcu_read_lock(); 1092 hlist_for_each_entry_rcu(tb, &head->chain, node) { 1093 if (!inet_bind_bucket_match(tb, net, port, l3mdev)) 1094 continue; 1095 if (tb->fastreuse >= 0 || tb->fastreuseport >= 0) { 1096 rcu_read_unlock(); 1097 goto next_port; 1098 } 1099 if (!check_established(death_row, sk, port, &tw, true, 1100 hash_port0 + port)) 1101 break; 1102 rcu_read_unlock(); 1103 goto next_port; 1104 } 1105 rcu_read_unlock(); 1106 1107 spin_lock_bh(&head->lock); 1108 1109 /* Does not bother with rcv_saddr checks, because 1110 * the established check is already unique enough. 1111 */ 1112 inet_bind_bucket_for_each(tb, &head->chain) { 1113 if (inet_bind_bucket_match(tb, net, port, l3mdev)) { 1114 if (tb->fastreuse >= 0 || 1115 tb->fastreuseport >= 0) 1116 goto next_port_unlock; 1117 WARN_ON(hlist_empty(&tb->bhash2)); 1118 if (!check_established(death_row, sk, 1119 port, &tw, false, 1120 hash_port0 + port)) 1121 goto ok; 1122 goto next_port_unlock; 1123 } 1124 } 1125 1126 tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, 1127 net, head, port, l3mdev); 1128 if (!tb) { 1129 spin_unlock_bh(&head->lock); 1130 return -ENOMEM; 1131 } 1132 tb_created = true; 1133 tb->fastreuse = -1; 1134 tb->fastreuseport = -1; 1135 goto ok; 1136 next_port_unlock: 1137 spin_unlock_bh(&head->lock); 1138 next_port: 1139 cond_resched(); 1140 } 1141 1142 if (!local_ports) { 1143 offset++; 1144 if ((offset & 1) && remaining > 1) 1145 goto other_parity_scan; 1146 } 1147 return -EADDRNOTAVAIL; 1148 1149 ok: 1150 /* Find the corresponding tb2 bucket since we need to 1151 * add the socket to the bhash2 table as well 1152 */ 1153 head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); 1154 spin_lock(&head2->lock); 1155 1156 tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk); 1157 if (!tb2) { 1158 tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep, net, 1159 head2, tb, sk); 1160 if (!tb2) 1161 goto error; 1162 tb2->fastreuse = -1; 1163 tb2->fastreuseport = -1; 1164 } 1165 1166 /* Here we want to add a little bit of randomness to the next source 1167 * port that will be chosen. We use a max() with a random here so that 1168 * on low contention the randomness is maximal and on high contention 1169 * it may be inexistent. 1170 */ 1171 i = max_t(int, i, get_random_u32_below(8) * step); 1172 WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + step); 1173 1174 /* Head lock still held and bh's disabled */ 1175 inet_bind_hash(sk, tb, tb2, port); 1176 sk->sk_userlocks |= SOCK_CONNECT_BIND; 1177 1178 if (sk_unhashed(sk)) { 1179 inet_sk(sk)->inet_sport = htons(port); 1180 inet_ehash_nolisten(sk, (struct sock *)tw, NULL); 1181 } 1182 if (tw) 1183 inet_twsk_bind_unhash(tw, hinfo); 1184 1185 spin_unlock(&head2->lock); 1186 spin_unlock(&head->lock); 1187 1188 if (tw) 1189 inet_twsk_deschedule_put(tw); 1190 local_bh_enable(); 1191 return 0; 1192 1193 error: 1194 if (sk_hashed(sk)) { 1195 spinlock_t *lock = inet_ehash_lockp(hinfo, sk->sk_hash); 1196 1197 sock_prot_inuse_add(net, sk->sk_prot, -1); 1198 1199 spin_lock(lock); 1200 __sk_nulls_del_node_init_rcu(sk); 1201 spin_unlock(lock); 1202 1203 sk->sk_hash = 0; 1204 inet_sk(sk)->inet_sport = 0; 1205 inet_sk(sk)->inet_num = 0; 1206 1207 if (tw) 1208 inet_twsk_bind_unhash(tw, hinfo); 1209 } 1210 1211 spin_unlock(&head2->lock); 1212 if (tb_created) 1213 inet_bind_bucket_destroy(tb); 1214 spin_unlock(&head->lock); 1215 1216 if (tw) 1217 inet_twsk_deschedule_put(tw); 1218 1219 local_bh_enable(); 1220 1221 return -ENOMEM; 1222 } 1223 1224 /* 1225 * Bind a port for a connect operation and hash it. 1226 */ 1227 int inet_hash_connect(struct inet_timewait_death_row *death_row, 1228 struct sock *sk) 1229 { 1230 const struct inet_sock *inet = inet_sk(sk); 1231 const struct net *net = sock_net(sk); 1232 u64 port_offset = 0; 1233 u32 hash_port0; 1234 1235 if (!inet_sk(sk)->inet_num) 1236 port_offset = inet_sk_port_offset(sk); 1237 1238 hash_port0 = inet_ehashfn(net, inet->inet_rcv_saddr, 0, 1239 inet->inet_daddr, inet->inet_dport); 1240 1241 return __inet_hash_connect(death_row, sk, port_offset, hash_port0, 1242 __inet_check_established); 1243 } 1244 1245 static void init_hashinfo_lhash2(struct inet_hashinfo *h) 1246 { 1247 int i; 1248 1249 for (i = 0; i <= h->lhash2_mask; i++) { 1250 spin_lock_init(&h->lhash2[i].lock); 1251 INIT_HLIST_NULLS_HEAD(&h->lhash2[i].nulls_head, 1252 i + LISTENING_NULLS_BASE); 1253 } 1254 } 1255 1256 void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, 1257 unsigned long numentries, int scale, 1258 unsigned long low_limit, 1259 unsigned long high_limit) 1260 { 1261 h->lhash2 = alloc_large_system_hash(name, 1262 sizeof(*h->lhash2), 1263 numentries, 1264 scale, 1265 0, 1266 NULL, 1267 &h->lhash2_mask, 1268 low_limit, 1269 high_limit); 1270 init_hashinfo_lhash2(h); 1271 1272 /* this one is used for source ports of outgoing connections */ 1273 table_perturb = alloc_large_system_hash("Table-perturb", 1274 sizeof(*table_perturb), 1275 INET_TABLE_PERTURB_SIZE, 1276 0, 0, NULL, NULL, 1277 INET_TABLE_PERTURB_SIZE, 1278 INET_TABLE_PERTURB_SIZE); 1279 } 1280 1281 int inet_hashinfo2_init_mod(struct inet_hashinfo *h) 1282 { 1283 h->lhash2 = kmalloc_array(INET_LHTABLE_SIZE, sizeof(*h->lhash2), GFP_KERNEL); 1284 if (!h->lhash2) 1285 return -ENOMEM; 1286 1287 h->lhash2_mask = INET_LHTABLE_SIZE - 1; 1288 /* INET_LHTABLE_SIZE must be a power of 2 */ 1289 BUG_ON(INET_LHTABLE_SIZE & h->lhash2_mask); 1290 1291 init_hashinfo_lhash2(h); 1292 return 0; 1293 } 1294 1295 int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) 1296 { 1297 unsigned int locksz = sizeof(spinlock_t); 1298 unsigned int i, nblocks = 1; 1299 spinlock_t *ptr = NULL; 1300 1301 if (locksz == 0) 1302 goto set_mask; 1303 1304 /* Allocate 2 cache lines or at least one spinlock per cpu. */ 1305 nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U) * num_possible_cpus(); 1306 1307 /* At least one page per NUMA node. */ 1308 nblocks = max(nblocks, num_online_nodes() * PAGE_SIZE / locksz); 1309 1310 nblocks = roundup_pow_of_two(nblocks); 1311 1312 /* No more locks than number of hash buckets. */ 1313 nblocks = min(nblocks, hashinfo->ehash_mask + 1); 1314 1315 if (num_online_nodes() > 1) { 1316 /* Use vmalloc() to allow NUMA policy to spread pages 1317 * on all available nodes if desired. 1318 */ 1319 ptr = vmalloc_array(nblocks, locksz); 1320 } 1321 if (!ptr) { 1322 ptr = kvmalloc_array(nblocks, locksz, GFP_KERNEL); 1323 if (!ptr) 1324 return -ENOMEM; 1325 } 1326 for (i = 0; i < nblocks; i++) 1327 spin_lock_init(&ptr[i]); 1328 hashinfo->ehash_locks = ptr; 1329 set_mask: 1330 hashinfo->ehash_locks_mask = nblocks - 1; 1331 return 0; 1332 } 1333 1334 struct inet_hashinfo *inet_pernet_hashinfo_alloc(struct inet_hashinfo *hashinfo, 1335 unsigned int ehash_entries) 1336 { 1337 struct inet_hashinfo *new_hashinfo; 1338 int i; 1339 1340 new_hashinfo = kmemdup(hashinfo, sizeof(*hashinfo), GFP_KERNEL); 1341 if (!new_hashinfo) 1342 goto err; 1343 1344 new_hashinfo->ehash = vmalloc_huge(ehash_entries * sizeof(struct inet_ehash_bucket), 1345 GFP_KERNEL_ACCOUNT); 1346 if (!new_hashinfo->ehash) 1347 goto free_hashinfo; 1348 1349 new_hashinfo->ehash_mask = ehash_entries - 1; 1350 1351 if (inet_ehash_locks_alloc(new_hashinfo)) 1352 goto free_ehash; 1353 1354 for (i = 0; i < ehash_entries; i++) 1355 INIT_HLIST_NULLS_HEAD(&new_hashinfo->ehash[i].chain, i); 1356 1357 new_hashinfo->pernet = true; 1358 1359 return new_hashinfo; 1360 1361 free_ehash: 1362 vfree(new_hashinfo->ehash); 1363 free_hashinfo: 1364 kfree(new_hashinfo); 1365 err: 1366 return NULL; 1367 } 1368 1369 void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo) 1370 { 1371 if (!hashinfo->pernet) 1372 return; 1373 1374 inet_ehash_locks_free(hashinfo); 1375 vfree(hashinfo->ehash); 1376 kfree(hashinfo); 1377 } 1378