1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * INET An implementation of the TCP/IP protocol suite for the LINUX 4 * operating system. INET is implemented using the BSD Socket 5 * interface as the means of communication with the user level. 6 * 7 * Generic INET transport hashtables 8 * 9 * Authors: Lotsa people, from code originally in tcp 10 */ 11 12 #include <linux/module.h> 13 #include <linux/random.h> 14 #include <linux/sched.h> 15 #include <linux/slab.h> 16 #include <linux/wait.h> 17 #include <linux/vmalloc.h> 18 #include <linux/memblock.h> 19 20 #include <net/addrconf.h> 21 #include <net/inet_connection_sock.h> 22 #include <net/inet_hashtables.h> 23 #if IS_ENABLED(CONFIG_IPV6) 24 #include <net/inet6_hashtables.h> 25 #endif 26 #include <net/hotdata.h> 27 #include <net/ip.h> 28 #include <net/rps.h> 29 #include <net/secure_seq.h> 30 #include <net/sock_reuseport.h> 31 #include <net/tcp.h> 32 33 u32 inet_ehashfn(const struct net *net, const __be32 laddr, 34 const __u16 lport, const __be32 faddr, 35 const __be16 fport) 36 { 37 net_get_random_once(&inet_ehash_secret, sizeof(inet_ehash_secret)); 38 39 return lport + __inet_ehashfn(laddr, 0, faddr, fport, 40 inet_ehash_secret + net_hash_mix(net)); 41 } 42 EXPORT_SYMBOL_GPL(inet_ehashfn); 43 44 /* This function handles inet_sock, but also timewait and request sockets 45 * for IPv4/IPv6. 46 */ 47 static u32 sk_ehashfn(const struct sock *sk) 48 { 49 #if IS_ENABLED(CONFIG_IPV6) 50 if (sk->sk_family == AF_INET6 && 51 !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) 52 return inet6_ehashfn(sock_net(sk), 53 &sk->sk_v6_rcv_saddr, sk->sk_num, 54 &sk->sk_v6_daddr, sk->sk_dport); 55 #endif 56 return inet_ehashfn(sock_net(sk), 57 sk->sk_rcv_saddr, sk->sk_num, 58 sk->sk_daddr, sk->sk_dport); 59 } 60 61 static bool sk_is_connect_bind(const struct sock *sk) 62 { 63 if (sk->sk_state == TCP_TIME_WAIT) 64 return inet_twsk(sk)->tw_connect_bind; 65 else 66 return sk->sk_userlocks & SOCK_CONNECT_BIND; 67 } 68 69 /* 70 * Allocate and initialize a new local port bind bucket. 71 * The bindhash mutex for snum's hash chain must be held here. 72 */ 73 struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, 74 struct net *net, 75 struct inet_bind_hashbucket *head, 76 const unsigned short snum, 77 int l3mdev) 78 { 79 struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC); 80 81 if (tb) { 82 write_pnet(&tb->ib_net, net); 83 tb->l3mdev = l3mdev; 84 tb->port = snum; 85 tb->fastreuse = 0; 86 tb->fastreuseport = 0; 87 INIT_HLIST_HEAD(&tb->bhash2); 88 hlist_add_head_rcu(&tb->node, &head->chain); 89 } 90 return tb; 91 } 92 93 /* 94 * Caller must hold hashbucket lock for this tb with local BH disabled 95 */ 96 void inet_bind_bucket_destroy(struct inet_bind_bucket *tb) 97 { 98 const struct inet_bind2_bucket *tb2; 99 100 if (hlist_empty(&tb->bhash2)) { 101 hlist_del_rcu(&tb->node); 102 kfree_rcu(tb, rcu); 103 return; 104 } 105 106 if (tb->fastreuse == -1 && tb->fastreuseport == -1) 107 return; 108 hlist_for_each_entry(tb2, &tb->bhash2, bhash_node) { 109 if (tb2->fastreuse != -1 || tb2->fastreuseport != -1) 110 return; 111 } 112 tb->fastreuse = -1; 113 tb->fastreuseport = -1; 114 } 115 116 bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, const struct net *net, 117 unsigned short port, int l3mdev) 118 { 119 return net_eq(ib_net(tb), net) && tb->port == port && 120 tb->l3mdev == l3mdev; 121 } 122 123 static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb2, 124 struct net *net, 125 struct inet_bind_hashbucket *head, 126 struct inet_bind_bucket *tb, 127 const struct sock *sk) 128 { 129 write_pnet(&tb2->ib_net, net); 130 tb2->l3mdev = tb->l3mdev; 131 tb2->port = tb->port; 132 #if IS_ENABLED(CONFIG_IPV6) 133 BUILD_BUG_ON(USHRT_MAX < (IPV6_ADDR_ANY | IPV6_ADDR_MAPPED)); 134 if (sk->sk_family == AF_INET6) { 135 tb2->addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr); 136 tb2->v6_rcv_saddr = sk->sk_v6_rcv_saddr; 137 } else { 138 tb2->addr_type = IPV6_ADDR_MAPPED; 139 ipv6_addr_set_v4mapped(sk->sk_rcv_saddr, &tb2->v6_rcv_saddr); 140 } 141 #else 142 tb2->rcv_saddr = sk->sk_rcv_saddr; 143 #endif 144 tb2->fastreuse = 0; 145 tb2->fastreuseport = 0; 146 INIT_HLIST_HEAD(&tb2->owners); 147 hlist_add_head(&tb2->node, &head->chain); 148 hlist_add_head(&tb2->bhash_node, &tb->bhash2); 149 } 150 151 struct inet_bind2_bucket *inet_bind2_bucket_create(struct kmem_cache *cachep, 152 struct net *net, 153 struct inet_bind_hashbucket *head, 154 struct inet_bind_bucket *tb, 155 const struct sock *sk) 156 { 157 struct inet_bind2_bucket *tb2 = kmem_cache_alloc(cachep, GFP_ATOMIC); 158 159 if (tb2) 160 inet_bind2_bucket_init(tb2, net, head, tb, sk); 161 162 return tb2; 163 } 164 165 /* Caller must hold hashbucket lock for this tb with local BH disabled */ 166 void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb) 167 { 168 const struct sock *sk; 169 170 if (hlist_empty(&tb->owners)) { 171 __hlist_del(&tb->node); 172 __hlist_del(&tb->bhash_node); 173 kmem_cache_free(cachep, tb); 174 return; 175 } 176 177 if (tb->fastreuse == -1 && tb->fastreuseport == -1) 178 return; 179 sk_for_each_bound(sk, &tb->owners) { 180 if (!sk_is_connect_bind(sk)) 181 return; 182 } 183 tb->fastreuse = -1; 184 tb->fastreuseport = -1; 185 } 186 187 static bool inet_bind2_bucket_addr_match(const struct inet_bind2_bucket *tb2, 188 const struct sock *sk) 189 { 190 #if IS_ENABLED(CONFIG_IPV6) 191 if (sk->sk_family == AF_INET6) 192 return ipv6_addr_equal(&tb2->v6_rcv_saddr, &sk->sk_v6_rcv_saddr); 193 194 if (tb2->addr_type != IPV6_ADDR_MAPPED) 195 return false; 196 #endif 197 return tb2->rcv_saddr == sk->sk_rcv_saddr; 198 } 199 200 void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, 201 struct inet_bind2_bucket *tb2, unsigned short port) 202 { 203 inet_sk(sk)->inet_num = port; 204 inet_csk(sk)->icsk_bind_hash = tb; 205 inet_csk(sk)->icsk_bind2_hash = tb2; 206 sk_add_bind_node(sk, &tb2->owners); 207 } 208 209 /* 210 * Get rid of any references to a local port held by the given sock. 211 */ 212 static void __inet_put_port(struct sock *sk) 213 { 214 struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); 215 struct inet_bind_hashbucket *head, *head2; 216 struct net *net = sock_net(sk); 217 struct inet_bind_bucket *tb; 218 int bhash; 219 220 bhash = inet_bhashfn(net, inet_sk(sk)->inet_num, hashinfo->bhash_size); 221 head = &hashinfo->bhash[bhash]; 222 head2 = inet_bhashfn_portaddr(hashinfo, sk, net, inet_sk(sk)->inet_num); 223 224 spin_lock(&head->lock); 225 tb = inet_csk(sk)->icsk_bind_hash; 226 inet_csk(sk)->icsk_bind_hash = NULL; 227 inet_sk(sk)->inet_num = 0; 228 sk->sk_userlocks &= ~SOCK_CONNECT_BIND; 229 230 spin_lock(&head2->lock); 231 if (inet_csk(sk)->icsk_bind2_hash) { 232 struct inet_bind2_bucket *tb2 = inet_csk(sk)->icsk_bind2_hash; 233 234 __sk_del_bind_node(sk); 235 inet_csk(sk)->icsk_bind2_hash = NULL; 236 inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2); 237 } 238 spin_unlock(&head2->lock); 239 240 inet_bind_bucket_destroy(tb); 241 spin_unlock(&head->lock); 242 } 243 244 void inet_put_port(struct sock *sk) 245 { 246 local_bh_disable(); 247 __inet_put_port(sk); 248 local_bh_enable(); 249 } 250 EXPORT_SYMBOL(inet_put_port); 251 252 int __inet_inherit_port(const struct sock *sk, struct sock *child) 253 { 254 struct inet_hashinfo *table = tcp_get_hashinfo(sk); 255 unsigned short port = inet_sk(child)->inet_num; 256 struct inet_bind_hashbucket *head, *head2; 257 bool created_inet_bind_bucket = false; 258 struct net *net = sock_net(sk); 259 bool update_fastreuse = false; 260 struct inet_bind2_bucket *tb2; 261 struct inet_bind_bucket *tb; 262 int bhash, l3mdev; 263 264 bhash = inet_bhashfn(net, port, table->bhash_size); 265 head = &table->bhash[bhash]; 266 head2 = inet_bhashfn_portaddr(table, child, net, port); 267 268 spin_lock(&head->lock); 269 spin_lock(&head2->lock); 270 tb = inet_csk(sk)->icsk_bind_hash; 271 tb2 = inet_csk(sk)->icsk_bind2_hash; 272 if (unlikely(!tb || !tb2)) { 273 spin_unlock(&head2->lock); 274 spin_unlock(&head->lock); 275 return -ENOENT; 276 } 277 if (tb->port != port) { 278 l3mdev = inet_sk_bound_l3mdev(sk); 279 280 /* NOTE: using tproxy and redirecting skbs to a proxy 281 * on a different listener port breaks the assumption 282 * that the listener socket's icsk_bind_hash is the same 283 * as that of the child socket. We have to look up or 284 * create a new bind bucket for the child here. */ 285 inet_bind_bucket_for_each(tb, &head->chain) { 286 if (inet_bind_bucket_match(tb, net, port, l3mdev)) 287 break; 288 } 289 if (!tb) { 290 tb = inet_bind_bucket_create(table->bind_bucket_cachep, 291 net, head, port, l3mdev); 292 if (!tb) { 293 spin_unlock(&head2->lock); 294 spin_unlock(&head->lock); 295 return -ENOMEM; 296 } 297 created_inet_bind_bucket = true; 298 } 299 update_fastreuse = true; 300 301 goto bhash2_find; 302 } else if (!inet_bind2_bucket_addr_match(tb2, child)) { 303 l3mdev = inet_sk_bound_l3mdev(sk); 304 305 bhash2_find: 306 tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, child); 307 if (!tb2) { 308 tb2 = inet_bind2_bucket_create(table->bind2_bucket_cachep, 309 net, head2, tb, child); 310 if (!tb2) 311 goto error; 312 } 313 } 314 if (update_fastreuse) 315 inet_csk_update_fastreuse(child, tb, tb2); 316 inet_bind_hash(child, tb, tb2, port); 317 spin_unlock(&head2->lock); 318 spin_unlock(&head->lock); 319 320 return 0; 321 322 error: 323 if (created_inet_bind_bucket) 324 inet_bind_bucket_destroy(tb); 325 spin_unlock(&head2->lock); 326 spin_unlock(&head->lock); 327 return -ENOMEM; 328 } 329 EXPORT_SYMBOL_GPL(__inet_inherit_port); 330 331 static struct inet_listen_hashbucket * 332 inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk) 333 { 334 u32 hash; 335 336 #if IS_ENABLED(CONFIG_IPV6) 337 if (sk->sk_family == AF_INET6) 338 hash = ipv6_portaddr_hash(sock_net(sk), 339 &sk->sk_v6_rcv_saddr, 340 inet_sk(sk)->inet_num); 341 else 342 #endif 343 hash = ipv4_portaddr_hash(sock_net(sk), 344 inet_sk(sk)->inet_rcv_saddr, 345 inet_sk(sk)->inet_num); 346 return inet_lhash2_bucket(h, hash); 347 } 348 349 static inline int compute_score(struct sock *sk, const struct net *net, 350 const unsigned short hnum, const __be32 daddr, 351 const int dif, const int sdif) 352 { 353 int score = -1; 354 355 if (net_eq(sock_net(sk), net) && sk->sk_num == hnum && 356 !ipv6_only_sock(sk)) { 357 if (sk->sk_rcv_saddr != daddr) 358 return -1; 359 360 if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) 361 return -1; 362 score = sk->sk_bound_dev_if ? 2 : 1; 363 364 if (sk->sk_family == PF_INET) 365 score++; 366 if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) 367 score++; 368 } 369 return score; 370 } 371 372 /** 373 * inet_lookup_reuseport() - execute reuseport logic on AF_INET socket if necessary. 374 * @net: network namespace. 375 * @sk: AF_INET socket, must be in TCP_LISTEN state for TCP or TCP_CLOSE for UDP. 376 * @skb: context for a potential SK_REUSEPORT program. 377 * @doff: header offset. 378 * @saddr: source address. 379 * @sport: source port. 380 * @daddr: destination address. 381 * @hnum: destination port in host byte order. 382 * @ehashfn: hash function used to generate the fallback hash. 383 * 384 * Return: NULL if sk doesn't have SO_REUSEPORT set, otherwise a pointer to 385 * the selected sock or an error. 386 */ 387 struct sock *inet_lookup_reuseport(const struct net *net, struct sock *sk, 388 struct sk_buff *skb, int doff, 389 __be32 saddr, __be16 sport, 390 __be32 daddr, unsigned short hnum, 391 inet_ehashfn_t *ehashfn) 392 { 393 struct sock *reuse_sk = NULL; 394 u32 phash; 395 396 if (sk->sk_reuseport) { 397 phash = INDIRECT_CALL_2(ehashfn, udp_ehashfn, inet_ehashfn, 398 net, daddr, hnum, saddr, sport); 399 reuse_sk = reuseport_select_sock(sk, phash, skb, doff); 400 } 401 return reuse_sk; 402 } 403 EXPORT_SYMBOL_GPL(inet_lookup_reuseport); 404 405 /* 406 * Here are some nice properties to exploit here. The BSD API 407 * does not allow a listening sock to specify the remote port nor the 408 * remote address for the connection. So always assume those are both 409 * wildcarded during the search since they can never be otherwise. 410 */ 411 412 /* called with rcu_read_lock() : No refcount taken on the socket */ 413 static struct sock *inet_lhash2_lookup(const struct net *net, 414 struct inet_listen_hashbucket *ilb2, 415 struct sk_buff *skb, int doff, 416 const __be32 saddr, __be16 sport, 417 const __be32 daddr, const unsigned short hnum, 418 const int dif, const int sdif) 419 { 420 struct sock *sk, *result = NULL; 421 struct hlist_nulls_node *node; 422 int score, hiscore = 0; 423 424 sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) { 425 score = compute_score(sk, net, hnum, daddr, dif, sdif); 426 if (score > hiscore) { 427 result = inet_lookup_reuseport(net, sk, skb, doff, 428 saddr, sport, daddr, hnum, inet_ehashfn); 429 if (result) 430 return result; 431 432 result = sk; 433 hiscore = score; 434 } 435 } 436 437 return result; 438 } 439 440 struct sock *inet_lookup_run_sk_lookup(const struct net *net, 441 int protocol, 442 struct sk_buff *skb, int doff, 443 __be32 saddr, __be16 sport, 444 __be32 daddr, u16 hnum, const int dif, 445 inet_ehashfn_t *ehashfn) 446 { 447 struct sock *sk, *reuse_sk; 448 bool no_reuseport; 449 450 no_reuseport = bpf_sk_lookup_run_v4(net, protocol, saddr, sport, 451 daddr, hnum, dif, &sk); 452 if (no_reuseport || IS_ERR_OR_NULL(sk)) 453 return sk; 454 455 reuse_sk = inet_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum, 456 ehashfn); 457 if (reuse_sk) 458 sk = reuse_sk; 459 return sk; 460 } 461 462 struct sock *__inet_lookup_listener(const struct net *net, 463 struct sk_buff *skb, int doff, 464 const __be32 saddr, __be16 sport, 465 const __be32 daddr, const unsigned short hnum, 466 const int dif, const int sdif) 467 { 468 struct inet_listen_hashbucket *ilb2; 469 struct inet_hashinfo *hashinfo; 470 struct sock *result = NULL; 471 unsigned int hash2; 472 473 /* Lookup redirect from BPF */ 474 if (static_branch_unlikely(&bpf_sk_lookup_enabled)) { 475 result = inet_lookup_run_sk_lookup(net, IPPROTO_TCP, skb, doff, 476 saddr, sport, daddr, hnum, dif, 477 inet_ehashfn); 478 if (result) 479 goto done; 480 } 481 482 hashinfo = net->ipv4.tcp_death_row.hashinfo; 483 hash2 = ipv4_portaddr_hash(net, daddr, hnum); 484 ilb2 = inet_lhash2_bucket(hashinfo, hash2); 485 486 result = inet_lhash2_lookup(net, ilb2, skb, doff, 487 saddr, sport, daddr, hnum, 488 dif, sdif); 489 if (result) 490 goto done; 491 492 /* Lookup lhash2 with INADDR_ANY */ 493 hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum); 494 ilb2 = inet_lhash2_bucket(hashinfo, hash2); 495 496 result = inet_lhash2_lookup(net, ilb2, skb, doff, 497 saddr, sport, htonl(INADDR_ANY), hnum, 498 dif, sdif); 499 done: 500 if (IS_ERR(result)) 501 return NULL; 502 return result; 503 } 504 EXPORT_SYMBOL_GPL(__inet_lookup_listener); 505 506 /* All sockets share common refcount, but have different destructors */ 507 void sock_gen_put(struct sock *sk) 508 { 509 if (!refcount_dec_and_test(&sk->sk_refcnt)) 510 return; 511 512 if (sk->sk_state == TCP_TIME_WAIT) 513 inet_twsk_free(inet_twsk(sk)); 514 else if (sk->sk_state == TCP_NEW_SYN_RECV) 515 reqsk_free(inet_reqsk(sk)); 516 else 517 sk_free(sk); 518 } 519 EXPORT_SYMBOL_GPL(sock_gen_put); 520 521 void sock_edemux(struct sk_buff *skb) 522 { 523 sock_gen_put(skb->sk); 524 } 525 EXPORT_SYMBOL(sock_edemux); 526 527 struct sock *__inet_lookup_established(const struct net *net, 528 const __be32 saddr, const __be16 sport, 529 const __be32 daddr, const u16 hnum, 530 const int dif, const int sdif) 531 { 532 const __portpair ports = INET_COMBINED_PORTS(sport, hnum); 533 INET_ADDR_COOKIE(acookie, saddr, daddr); 534 const struct hlist_nulls_node *node; 535 struct inet_ehash_bucket *head; 536 struct inet_hashinfo *hashinfo; 537 unsigned int hash, slot; 538 struct sock *sk; 539 540 hashinfo = net->ipv4.tcp_death_row.hashinfo; 541 hash = inet_ehashfn(net, daddr, hnum, saddr, sport); 542 slot = hash & hashinfo->ehash_mask; 543 head = &hashinfo->ehash[slot]; 544 545 begin: 546 sk_nulls_for_each_rcu(sk, node, &head->chain) { 547 if (sk->sk_hash != hash) 548 continue; 549 if (likely(inet_match(net, sk, acookie, ports, dif, sdif))) { 550 if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) 551 goto out; 552 if (unlikely(!inet_match(net, sk, acookie, 553 ports, dif, sdif))) { 554 sock_gen_put(sk); 555 goto begin; 556 } 557 goto found; 558 } 559 } 560 /* 561 * if the nulls value we got at the end of this lookup is 562 * not the expected one, we must restart lookup. 563 * We probably met an item that was moved to another chain. 564 */ 565 if (get_nulls_value(node) != slot) 566 goto begin; 567 out: 568 sk = NULL; 569 found: 570 return sk; 571 } 572 EXPORT_SYMBOL_GPL(__inet_lookup_established); 573 574 /* called with local bh disabled */ 575 static int __inet_check_established(struct inet_timewait_death_row *death_row, 576 struct sock *sk, __u16 lport, 577 struct inet_timewait_sock **twp, 578 bool rcu_lookup, 579 u32 hash) 580 { 581 struct inet_hashinfo *hinfo = death_row->hashinfo; 582 struct inet_sock *inet = inet_sk(sk); 583 __be32 daddr = inet->inet_rcv_saddr; 584 __be32 saddr = inet->inet_daddr; 585 int dif = sk->sk_bound_dev_if; 586 struct net *net = sock_net(sk); 587 int sdif = l3mdev_master_ifindex_by_index(net, dif); 588 INET_ADDR_COOKIE(acookie, saddr, daddr); 589 const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport); 590 struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); 591 struct inet_timewait_sock *tw = NULL; 592 const struct hlist_nulls_node *node; 593 struct sock *sk2; 594 spinlock_t *lock; 595 596 if (rcu_lookup) { 597 sk_nulls_for_each(sk2, node, &head->chain) { 598 if (sk2->sk_hash != hash || 599 !inet_match(net, sk2, acookie, ports, dif, sdif)) 600 continue; 601 if (sk2->sk_state == TCP_TIME_WAIT) 602 break; 603 return -EADDRNOTAVAIL; 604 } 605 return 0; 606 } 607 608 lock = inet_ehash_lockp(hinfo, hash); 609 spin_lock(lock); 610 611 sk_nulls_for_each(sk2, node, &head->chain) { 612 if (sk2->sk_hash != hash) 613 continue; 614 615 if (likely(inet_match(net, sk2, acookie, ports, dif, sdif))) { 616 if (sk2->sk_state == TCP_TIME_WAIT) { 617 tw = inet_twsk(sk2); 618 if (tcp_twsk_unique(sk, sk2, twp)) 619 break; 620 } 621 goto not_unique; 622 } 623 } 624 625 /* Must record num and sport now. Otherwise we will see 626 * in hash table socket with a funny identity. 627 */ 628 inet->inet_num = lport; 629 inet->inet_sport = htons(lport); 630 sk->sk_hash = hash; 631 WARN_ON(!sk_unhashed(sk)); 632 __sk_nulls_add_node_rcu(sk, &head->chain); 633 if (tw) { 634 sk_nulls_del_node_init_rcu((struct sock *)tw); 635 __NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED); 636 } 637 spin_unlock(lock); 638 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 639 640 if (twp) { 641 *twp = tw; 642 } else if (tw) { 643 /* Silly. Should hash-dance instead... */ 644 inet_twsk_deschedule_put(tw); 645 } 646 return 0; 647 648 not_unique: 649 spin_unlock(lock); 650 return -EADDRNOTAVAIL; 651 } 652 653 static u64 inet_sk_port_offset(const struct sock *sk) 654 { 655 const struct inet_sock *inet = inet_sk(sk); 656 657 return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr, 658 inet->inet_daddr, 659 inet->inet_dport); 660 } 661 662 /* Searches for an exsiting socket in the ehash bucket list. 663 * Returns true if found, false otherwise. 664 */ 665 static bool inet_ehash_lookup_by_sk(struct sock *sk, 666 struct hlist_nulls_head *list) 667 { 668 const __portpair ports = INET_COMBINED_PORTS(sk->sk_dport, sk->sk_num); 669 const int sdif = sk->sk_bound_dev_if; 670 const int dif = sk->sk_bound_dev_if; 671 const struct hlist_nulls_node *node; 672 struct net *net = sock_net(sk); 673 struct sock *esk; 674 675 INET_ADDR_COOKIE(acookie, sk->sk_daddr, sk->sk_rcv_saddr); 676 677 sk_nulls_for_each_rcu(esk, node, list) { 678 if (esk->sk_hash != sk->sk_hash) 679 continue; 680 if (sk->sk_family == AF_INET) { 681 if (unlikely(inet_match(net, esk, acookie, 682 ports, dif, sdif))) { 683 return true; 684 } 685 } 686 #if IS_ENABLED(CONFIG_IPV6) 687 else if (sk->sk_family == AF_INET6) { 688 if (unlikely(inet6_match(net, esk, 689 &sk->sk_v6_daddr, 690 &sk->sk_v6_rcv_saddr, 691 ports, dif, sdif))) { 692 return true; 693 } 694 } 695 #endif 696 } 697 return false; 698 } 699 700 /* Insert a socket into ehash, and eventually remove another one 701 * (The another one can be a SYN_RECV or TIMEWAIT) 702 * If an existing socket already exists, socket sk is not inserted, 703 * and sets found_dup_sk parameter to true. 704 */ 705 bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk) 706 { 707 struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); 708 struct inet_ehash_bucket *head; 709 struct hlist_nulls_head *list; 710 spinlock_t *lock; 711 bool ret = true; 712 713 WARN_ON_ONCE(!sk_unhashed(sk)); 714 715 sk->sk_hash = sk_ehashfn(sk); 716 head = inet_ehash_bucket(hashinfo, sk->sk_hash); 717 list = &head->chain; 718 lock = inet_ehash_lockp(hashinfo, sk->sk_hash); 719 720 spin_lock(lock); 721 if (osk) { 722 WARN_ON_ONCE(sk->sk_hash != osk->sk_hash); 723 ret = sk_nulls_replace_node_init_rcu(osk, sk); 724 goto unlock; 725 } 726 727 if (found_dup_sk) { 728 *found_dup_sk = inet_ehash_lookup_by_sk(sk, list); 729 if (*found_dup_sk) 730 ret = false; 731 } 732 733 if (ret) 734 __sk_nulls_add_node_rcu(sk, list); 735 736 unlock: 737 spin_unlock(lock); 738 739 return ret; 740 } 741 742 bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk) 743 { 744 bool ok = inet_ehash_insert(sk, osk, found_dup_sk); 745 746 if (ok) { 747 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 748 } else { 749 tcp_orphan_count_inc(); 750 inet_sk_set_state(sk, TCP_CLOSE); 751 sock_set_flag(sk, SOCK_DEAD); 752 inet_csk_destroy_sock(sk); 753 } 754 return ok; 755 } 756 EXPORT_IPV6_MOD(inet_ehash_nolisten); 757 758 static int inet_reuseport_add_sock(struct sock *sk, 759 struct inet_listen_hashbucket *ilb) 760 { 761 struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash; 762 const struct hlist_nulls_node *node; 763 kuid_t uid = sk_uid(sk); 764 struct sock *sk2; 765 766 sk_nulls_for_each_rcu(sk2, node, &ilb->nulls_head) { 767 if (sk2 != sk && 768 sk2->sk_family == sk->sk_family && 769 ipv6_only_sock(sk2) == ipv6_only_sock(sk) && 770 sk2->sk_bound_dev_if == sk->sk_bound_dev_if && 771 inet_csk(sk2)->icsk_bind_hash == tb && 772 sk2->sk_reuseport && uid_eq(uid, sk_uid(sk2)) && 773 inet_rcv_saddr_equal(sk, sk2, false)) 774 return reuseport_add_sock(sk, sk2, 775 inet_rcv_saddr_any(sk)); 776 } 777 778 return reuseport_alloc(sk, inet_rcv_saddr_any(sk)); 779 } 780 781 int inet_hash(struct sock *sk) 782 { 783 struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); 784 struct inet_listen_hashbucket *ilb2; 785 int err = 0; 786 787 if (sk->sk_state == TCP_CLOSE) 788 return 0; 789 790 if (sk->sk_state != TCP_LISTEN) { 791 local_bh_disable(); 792 inet_ehash_nolisten(sk, NULL, NULL); 793 local_bh_enable(); 794 return 0; 795 } 796 WARN_ON(!sk_unhashed(sk)); 797 ilb2 = inet_lhash2_bucket_sk(hashinfo, sk); 798 799 spin_lock(&ilb2->lock); 800 if (sk->sk_reuseport) { 801 err = inet_reuseport_add_sock(sk, ilb2); 802 if (err) 803 goto unlock; 804 } 805 sock_set_flag(sk, SOCK_RCU_FREE); 806 if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && 807 sk->sk_family == AF_INET6) 808 __sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head); 809 else 810 __sk_nulls_add_node_rcu(sk, &ilb2->nulls_head); 811 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 812 unlock: 813 spin_unlock(&ilb2->lock); 814 815 return err; 816 } 817 EXPORT_IPV6_MOD(inet_hash); 818 819 void inet_unhash(struct sock *sk) 820 { 821 struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); 822 823 if (sk_unhashed(sk)) 824 return; 825 826 sock_rps_delete_flow(sk); 827 if (sk->sk_state == TCP_LISTEN) { 828 struct inet_listen_hashbucket *ilb2; 829 830 ilb2 = inet_lhash2_bucket_sk(hashinfo, sk); 831 /* Don't disable bottom halves while acquiring the lock to 832 * avoid circular locking dependency on PREEMPT_RT. 833 */ 834 spin_lock(&ilb2->lock); 835 if (rcu_access_pointer(sk->sk_reuseport_cb)) 836 reuseport_stop_listen_sock(sk); 837 838 __sk_nulls_del_node_init_rcu(sk); 839 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 840 spin_unlock(&ilb2->lock); 841 } else { 842 spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash); 843 844 spin_lock_bh(lock); 845 __sk_nulls_del_node_init_rcu(sk); 846 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 847 spin_unlock_bh(lock); 848 } 849 } 850 EXPORT_IPV6_MOD(inet_unhash); 851 852 static bool inet_bind2_bucket_match(const struct inet_bind2_bucket *tb, 853 const struct net *net, unsigned short port, 854 int l3mdev, const struct sock *sk) 855 { 856 if (!net_eq(ib2_net(tb), net) || tb->port != port || 857 tb->l3mdev != l3mdev) 858 return false; 859 860 return inet_bind2_bucket_addr_match(tb, sk); 861 } 862 863 bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb, const struct net *net, 864 unsigned short port, int l3mdev, const struct sock *sk) 865 { 866 if (!net_eq(ib2_net(tb), net) || tb->port != port || 867 tb->l3mdev != l3mdev) 868 return false; 869 870 #if IS_ENABLED(CONFIG_IPV6) 871 if (tb->addr_type == IPV6_ADDR_ANY) 872 return true; 873 874 if (tb->addr_type != IPV6_ADDR_MAPPED) 875 return false; 876 877 if (sk->sk_family == AF_INET6 && 878 !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) 879 return false; 880 #endif 881 return tb->rcv_saddr == 0; 882 } 883 884 /* The socket's bhash2 hashbucket spinlock must be held when this is called */ 885 struct inet_bind2_bucket * 886 inet_bind2_bucket_find(const struct inet_bind_hashbucket *head, const struct net *net, 887 unsigned short port, int l3mdev, const struct sock *sk) 888 { 889 struct inet_bind2_bucket *bhash2 = NULL; 890 891 inet_bind_bucket_for_each(bhash2, &head->chain) 892 if (inet_bind2_bucket_match(bhash2, net, port, l3mdev, sk)) 893 break; 894 895 return bhash2; 896 } 897 898 struct inet_bind_hashbucket * 899 inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, int port) 900 { 901 struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk); 902 u32 hash; 903 904 #if IS_ENABLED(CONFIG_IPV6) 905 if (sk->sk_family == AF_INET6) 906 hash = ipv6_portaddr_hash(net, &in6addr_any, port); 907 else 908 #endif 909 hash = ipv4_portaddr_hash(net, 0, port); 910 911 return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)]; 912 } 913 914 static void inet_update_saddr(struct sock *sk, void *saddr, int family) 915 { 916 if (family == AF_INET) { 917 inet_sk(sk)->inet_saddr = *(__be32 *)saddr; 918 sk_rcv_saddr_set(sk, inet_sk(sk)->inet_saddr); 919 } 920 #if IS_ENABLED(CONFIG_IPV6) 921 else { 922 sk->sk_v6_rcv_saddr = *(struct in6_addr *)saddr; 923 } 924 #endif 925 } 926 927 static int __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family, bool reset) 928 { 929 struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk); 930 struct inet_bind_hashbucket *head, *head2; 931 struct inet_bind2_bucket *tb2, *new_tb2; 932 int l3mdev = inet_sk_bound_l3mdev(sk); 933 int port = inet_sk(sk)->inet_num; 934 struct net *net = sock_net(sk); 935 int bhash; 936 937 if (!inet_csk(sk)->icsk_bind2_hash) { 938 /* Not bind()ed before. */ 939 if (reset) 940 inet_reset_saddr(sk); 941 else 942 inet_update_saddr(sk, saddr, family); 943 944 return 0; 945 } 946 947 /* Allocate a bind2 bucket ahead of time to avoid permanently putting 948 * the bhash2 table in an inconsistent state if a new tb2 bucket 949 * allocation fails. 950 */ 951 new_tb2 = kmem_cache_alloc(hinfo->bind2_bucket_cachep, GFP_ATOMIC); 952 if (!new_tb2) { 953 if (reset) { 954 /* The (INADDR_ANY, port) bucket might have already 955 * been freed, then we cannot fixup icsk_bind2_hash, 956 * so we give up and unlink sk from bhash/bhash2 not 957 * to leave inconsistency in bhash2. 958 */ 959 inet_put_port(sk); 960 inet_reset_saddr(sk); 961 } 962 963 return -ENOMEM; 964 } 965 966 bhash = inet_bhashfn(net, port, hinfo->bhash_size); 967 head = &hinfo->bhash[bhash]; 968 head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); 969 970 /* If we change saddr locklessly, another thread 971 * iterating over bhash might see corrupted address. 972 */ 973 spin_lock_bh(&head->lock); 974 975 spin_lock(&head2->lock); 976 __sk_del_bind_node(sk); 977 inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, inet_csk(sk)->icsk_bind2_hash); 978 spin_unlock(&head2->lock); 979 980 if (reset) 981 inet_reset_saddr(sk); 982 else 983 inet_update_saddr(sk, saddr, family); 984 985 head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); 986 987 spin_lock(&head2->lock); 988 tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk); 989 if (!tb2) { 990 tb2 = new_tb2; 991 inet_bind2_bucket_init(tb2, net, head2, inet_csk(sk)->icsk_bind_hash, sk); 992 if (sk_is_connect_bind(sk)) { 993 tb2->fastreuse = -1; 994 tb2->fastreuseport = -1; 995 } 996 } 997 inet_csk(sk)->icsk_bind2_hash = tb2; 998 sk_add_bind_node(sk, &tb2->owners); 999 spin_unlock(&head2->lock); 1000 1001 spin_unlock_bh(&head->lock); 1002 1003 if (tb2 != new_tb2) 1004 kmem_cache_free(hinfo->bind2_bucket_cachep, new_tb2); 1005 1006 return 0; 1007 } 1008 1009 int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family) 1010 { 1011 return __inet_bhash2_update_saddr(sk, saddr, family, false); 1012 } 1013 EXPORT_IPV6_MOD(inet_bhash2_update_saddr); 1014 1015 void inet_bhash2_reset_saddr(struct sock *sk) 1016 { 1017 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) 1018 __inet_bhash2_update_saddr(sk, NULL, 0, true); 1019 } 1020 EXPORT_IPV6_MOD(inet_bhash2_reset_saddr); 1021 1022 /* RFC 6056 3.3.4. Algorithm 4: Double-Hash Port Selection Algorithm 1023 * Note that we use 32bit integers (vs RFC 'short integers') 1024 * because 2^16 is not a multiple of num_ephemeral and this 1025 * property might be used by clever attacker. 1026 * 1027 * RFC claims using TABLE_LENGTH=10 buckets gives an improvement, though 1028 * attacks were since demonstrated, thus we use 65536 by default instead 1029 * to really give more isolation and privacy, at the expense of 256kB 1030 * of kernel memory. 1031 */ 1032 #define INET_TABLE_PERTURB_SIZE (1 << CONFIG_INET_TABLE_PERTURB_ORDER) 1033 static u32 *table_perturb; 1034 1035 int __inet_hash_connect(struct inet_timewait_death_row *death_row, 1036 struct sock *sk, u64 port_offset, 1037 u32 hash_port0, 1038 int (*check_established)(struct inet_timewait_death_row *, 1039 struct sock *, __u16, struct inet_timewait_sock **, 1040 bool rcu_lookup, u32 hash)) 1041 { 1042 struct inet_hashinfo *hinfo = death_row->hashinfo; 1043 struct inet_bind_hashbucket *head, *head2; 1044 struct inet_timewait_sock *tw = NULL; 1045 int port = inet_sk(sk)->inet_num; 1046 struct net *net = sock_net(sk); 1047 struct inet_bind2_bucket *tb2; 1048 struct inet_bind_bucket *tb; 1049 bool tb_created = false; 1050 u32 remaining, offset; 1051 int ret, i, low, high; 1052 bool local_ports; 1053 int step, l3mdev; 1054 u32 index; 1055 1056 if (port) { 1057 local_bh_disable(); 1058 ret = check_established(death_row, sk, port, NULL, false, 1059 hash_port0 + port); 1060 local_bh_enable(); 1061 return ret; 1062 } 1063 1064 l3mdev = inet_sk_bound_l3mdev(sk); 1065 1066 local_ports = inet_sk_get_local_port_range(sk, &low, &high); 1067 step = local_ports ? 1 : 2; 1068 1069 high++; /* [32768, 60999] -> [32768, 61000[ */ 1070 remaining = high - low; 1071 if (!local_ports && remaining > 1) 1072 remaining &= ~1U; 1073 1074 get_random_sleepable_once(table_perturb, 1075 INET_TABLE_PERTURB_SIZE * sizeof(*table_perturb)); 1076 index = port_offset & (INET_TABLE_PERTURB_SIZE - 1); 1077 1078 offset = READ_ONCE(table_perturb[index]) + (port_offset >> 32); 1079 offset %= remaining; 1080 1081 /* In first pass we try ports of @low parity. 1082 * inet_csk_get_port() does the opposite choice. 1083 */ 1084 if (!local_ports) 1085 offset &= ~1U; 1086 other_parity_scan: 1087 port = low + offset; 1088 for (i = 0; i < remaining; i += step, port += step) { 1089 if (unlikely(port >= high)) 1090 port -= remaining; 1091 if (inet_is_local_reserved_port(net, port)) 1092 continue; 1093 head = &hinfo->bhash[inet_bhashfn(net, port, 1094 hinfo->bhash_size)]; 1095 rcu_read_lock(); 1096 hlist_for_each_entry_rcu(tb, &head->chain, node) { 1097 if (!inet_bind_bucket_match(tb, net, port, l3mdev)) 1098 continue; 1099 if (tb->fastreuse >= 0 || tb->fastreuseport >= 0) { 1100 rcu_read_unlock(); 1101 goto next_port; 1102 } 1103 if (!check_established(death_row, sk, port, &tw, true, 1104 hash_port0 + port)) 1105 break; 1106 rcu_read_unlock(); 1107 goto next_port; 1108 } 1109 rcu_read_unlock(); 1110 1111 spin_lock_bh(&head->lock); 1112 1113 /* Does not bother with rcv_saddr checks, because 1114 * the established check is already unique enough. 1115 */ 1116 inet_bind_bucket_for_each(tb, &head->chain) { 1117 if (inet_bind_bucket_match(tb, net, port, l3mdev)) { 1118 if (tb->fastreuse >= 0 || 1119 tb->fastreuseport >= 0) 1120 goto next_port_unlock; 1121 WARN_ON(hlist_empty(&tb->bhash2)); 1122 if (!check_established(death_row, sk, 1123 port, &tw, false, 1124 hash_port0 + port)) 1125 goto ok; 1126 goto next_port_unlock; 1127 } 1128 } 1129 1130 tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, 1131 net, head, port, l3mdev); 1132 if (!tb) { 1133 spin_unlock_bh(&head->lock); 1134 return -ENOMEM; 1135 } 1136 tb_created = true; 1137 tb->fastreuse = -1; 1138 tb->fastreuseport = -1; 1139 goto ok; 1140 next_port_unlock: 1141 spin_unlock_bh(&head->lock); 1142 next_port: 1143 cond_resched(); 1144 } 1145 1146 if (!local_ports) { 1147 offset++; 1148 if ((offset & 1) && remaining > 1) 1149 goto other_parity_scan; 1150 } 1151 return -EADDRNOTAVAIL; 1152 1153 ok: 1154 /* Find the corresponding tb2 bucket since we need to 1155 * add the socket to the bhash2 table as well 1156 */ 1157 head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); 1158 spin_lock(&head2->lock); 1159 1160 tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk); 1161 if (!tb2) { 1162 tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep, net, 1163 head2, tb, sk); 1164 if (!tb2) 1165 goto error; 1166 tb2->fastreuse = -1; 1167 tb2->fastreuseport = -1; 1168 } 1169 1170 /* Here we want to add a little bit of randomness to the next source 1171 * port that will be chosen. We use a max() with a random here so that 1172 * on low contention the randomness is maximal and on high contention 1173 * it may be inexistent. 1174 */ 1175 i = max_t(int, i, get_random_u32_below(8) * step); 1176 WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + step); 1177 1178 /* Head lock still held and bh's disabled */ 1179 inet_bind_hash(sk, tb, tb2, port); 1180 sk->sk_userlocks |= SOCK_CONNECT_BIND; 1181 1182 if (sk_unhashed(sk)) { 1183 inet_sk(sk)->inet_sport = htons(port); 1184 inet_ehash_nolisten(sk, (struct sock *)tw, NULL); 1185 } 1186 if (tw) 1187 inet_twsk_bind_unhash(tw, hinfo); 1188 1189 spin_unlock(&head2->lock); 1190 spin_unlock(&head->lock); 1191 1192 if (tw) 1193 inet_twsk_deschedule_put(tw); 1194 local_bh_enable(); 1195 return 0; 1196 1197 error: 1198 if (sk_hashed(sk)) { 1199 spinlock_t *lock = inet_ehash_lockp(hinfo, sk->sk_hash); 1200 1201 sock_prot_inuse_add(net, sk->sk_prot, -1); 1202 1203 spin_lock(lock); 1204 __sk_nulls_del_node_init_rcu(sk); 1205 spin_unlock(lock); 1206 1207 sk->sk_hash = 0; 1208 inet_sk(sk)->inet_sport = 0; 1209 inet_sk(sk)->inet_num = 0; 1210 1211 if (tw) 1212 inet_twsk_bind_unhash(tw, hinfo); 1213 } 1214 1215 spin_unlock(&head2->lock); 1216 if (tb_created) 1217 inet_bind_bucket_destroy(tb); 1218 spin_unlock(&head->lock); 1219 1220 if (tw) 1221 inet_twsk_deschedule_put(tw); 1222 1223 local_bh_enable(); 1224 1225 return -ENOMEM; 1226 } 1227 1228 /* 1229 * Bind a port for a connect operation and hash it. 1230 */ 1231 int inet_hash_connect(struct inet_timewait_death_row *death_row, 1232 struct sock *sk) 1233 { 1234 const struct inet_sock *inet = inet_sk(sk); 1235 const struct net *net = sock_net(sk); 1236 u64 port_offset = 0; 1237 u32 hash_port0; 1238 1239 if (!inet_sk(sk)->inet_num) 1240 port_offset = inet_sk_port_offset(sk); 1241 1242 hash_port0 = inet_ehashfn(net, inet->inet_rcv_saddr, 0, 1243 inet->inet_daddr, inet->inet_dport); 1244 1245 return __inet_hash_connect(death_row, sk, port_offset, hash_port0, 1246 __inet_check_established); 1247 } 1248 1249 static void init_hashinfo_lhash2(struct inet_hashinfo *h) 1250 { 1251 int i; 1252 1253 for (i = 0; i <= h->lhash2_mask; i++) { 1254 spin_lock_init(&h->lhash2[i].lock); 1255 INIT_HLIST_NULLS_HEAD(&h->lhash2[i].nulls_head, 1256 i + LISTENING_NULLS_BASE); 1257 } 1258 } 1259 1260 void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, 1261 unsigned long numentries, int scale, 1262 unsigned long low_limit, 1263 unsigned long high_limit) 1264 { 1265 h->lhash2 = alloc_large_system_hash(name, 1266 sizeof(*h->lhash2), 1267 numentries, 1268 scale, 1269 0, 1270 NULL, 1271 &h->lhash2_mask, 1272 low_limit, 1273 high_limit); 1274 init_hashinfo_lhash2(h); 1275 1276 /* this one is used for source ports of outgoing connections */ 1277 table_perturb = alloc_large_system_hash("Table-perturb", 1278 sizeof(*table_perturb), 1279 INET_TABLE_PERTURB_SIZE, 1280 0, 0, NULL, NULL, 1281 INET_TABLE_PERTURB_SIZE, 1282 INET_TABLE_PERTURB_SIZE); 1283 } 1284 1285 int inet_hashinfo2_init_mod(struct inet_hashinfo *h) 1286 { 1287 h->lhash2 = kmalloc_array(INET_LHTABLE_SIZE, sizeof(*h->lhash2), GFP_KERNEL); 1288 if (!h->lhash2) 1289 return -ENOMEM; 1290 1291 h->lhash2_mask = INET_LHTABLE_SIZE - 1; 1292 /* INET_LHTABLE_SIZE must be a power of 2 */ 1293 BUG_ON(INET_LHTABLE_SIZE & h->lhash2_mask); 1294 1295 init_hashinfo_lhash2(h); 1296 return 0; 1297 } 1298 1299 int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) 1300 { 1301 unsigned int locksz = sizeof(spinlock_t); 1302 unsigned int i, nblocks = 1; 1303 spinlock_t *ptr = NULL; 1304 1305 if (locksz == 0) 1306 goto set_mask; 1307 1308 /* Allocate 2 cache lines or at least one spinlock per cpu. */ 1309 nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U) * num_possible_cpus(); 1310 1311 /* At least one page per NUMA node. */ 1312 nblocks = max(nblocks, num_online_nodes() * PAGE_SIZE / locksz); 1313 1314 nblocks = roundup_pow_of_two(nblocks); 1315 1316 /* No more locks than number of hash buckets. */ 1317 nblocks = min(nblocks, hashinfo->ehash_mask + 1); 1318 1319 if (num_online_nodes() > 1) { 1320 /* Use vmalloc() to allow NUMA policy to spread pages 1321 * on all available nodes if desired. 1322 */ 1323 ptr = vmalloc_array(nblocks, locksz); 1324 } 1325 if (!ptr) { 1326 ptr = kvmalloc_array(nblocks, locksz, GFP_KERNEL); 1327 if (!ptr) 1328 return -ENOMEM; 1329 } 1330 for (i = 0; i < nblocks; i++) 1331 spin_lock_init(&ptr[i]); 1332 hashinfo->ehash_locks = ptr; 1333 set_mask: 1334 hashinfo->ehash_locks_mask = nblocks - 1; 1335 return 0; 1336 } 1337 1338 struct inet_hashinfo *inet_pernet_hashinfo_alloc(struct inet_hashinfo *hashinfo, 1339 unsigned int ehash_entries) 1340 { 1341 struct inet_hashinfo *new_hashinfo; 1342 int i; 1343 1344 new_hashinfo = kmemdup(hashinfo, sizeof(*hashinfo), GFP_KERNEL); 1345 if (!new_hashinfo) 1346 goto err; 1347 1348 new_hashinfo->ehash = vmalloc_huge(ehash_entries * sizeof(struct inet_ehash_bucket), 1349 GFP_KERNEL_ACCOUNT); 1350 if (!new_hashinfo->ehash) 1351 goto free_hashinfo; 1352 1353 new_hashinfo->ehash_mask = ehash_entries - 1; 1354 1355 if (inet_ehash_locks_alloc(new_hashinfo)) 1356 goto free_ehash; 1357 1358 for (i = 0; i < ehash_entries; i++) 1359 INIT_HLIST_NULLS_HEAD(&new_hashinfo->ehash[i].chain, i); 1360 1361 new_hashinfo->pernet = true; 1362 1363 return new_hashinfo; 1364 1365 free_ehash: 1366 vfree(new_hashinfo->ehash); 1367 free_hashinfo: 1368 kfree(new_hashinfo); 1369 err: 1370 return NULL; 1371 } 1372 1373 void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo) 1374 { 1375 if (!hashinfo->pernet) 1376 return; 1377 1378 inet_ehash_locks_free(hashinfo); 1379 vfree(hashinfo->ehash); 1380 kfree(hashinfo); 1381 } 1382