// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic INET transport hashtables
 *
 * Authors:	Lotsa people, from code originally in tcp
 */

#include <linux/module.h>
#include <linux/random.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/wait.h>
#include <linux/vmalloc.h>
#include <linux/memblock.h>

#include <net/addrconf.h>
#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <net/inet6_hashtables.h>
#endif
#include <net/hotdata.h>
#include <net/ip.h>
#include <net/rps.h>
#include <net/secure_seq.h>
#include <net/sock_reuseport.h>
#include <net/tcp.h>

static void inet_init_ehash_secret(void)
{
	net_get_random_sleepable_once(&inet_ehash_secret,
				      sizeof(inet_ehash_secret));
}

u32 inet_ehashfn(const struct net *net, const __be32 laddr,
		 const __u16 lport, const __be32 faddr,
		 const __be16 fport)
{
	return lport + __inet_ehashfn(laddr, 0, faddr, fport,
				      inet_ehash_secret + net_hash_mix(net));
}
EXPORT_SYMBOL_GPL(inet_ehashfn);

/* This function handles inet_sock, but also timewait and request sockets
 * for IPv4/IPv6.
 */
static u32 sk_ehashfn(const struct sock *sk)
{
#if IS_ENABLED(CONFIG_IPV6)
	if (sk->sk_family == AF_INET6 &&
	    !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		return inet6_ehashfn(sock_net(sk),
				     &sk->sk_v6_rcv_saddr, sk->sk_num,
				     &sk->sk_v6_daddr, sk->sk_dport);
#endif
	return inet_ehashfn(sock_net(sk),
			    sk->sk_rcv_saddr, sk->sk_num,
			    sk->sk_daddr, sk->sk_dport);
}

static bool sk_is_connect_bind(const struct sock *sk)
{
	if (sk->sk_state == TCP_TIME_WAIT)
		return inet_twsk(sk)->tw_connect_bind;
	else
		return sk->sk_userlocks & SOCK_CONNECT_BIND;
}
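/* A note on the fastreuse/fastreuseport fields seen below (a sketch of
 * the convention as used in this file):
 *
 *	0 / 1	cached result of the SO_REUSEADDR / SO_REUSEPORT conflict
 *		checks for the bucket's owners, refreshed on bind() via
 *		inet_csk_update_fastreuse()
 *	-1	every owner is a connect()-bound socket (see
 *		sk_is_connect_bind() above), so __inet_hash_connect() may
 *		still pick this port if the full 4-tuple stays unique
 */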
76 */ 77 struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, 78 struct net *net, 79 struct inet_bind_hashbucket *head, 80 const unsigned short snum, 81 int l3mdev) 82 { 83 struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC); 84 85 if (tb) { 86 write_pnet(&tb->ib_net, net); 87 tb->l3mdev = l3mdev; 88 tb->port = snum; 89 tb->fastreuse = 0; 90 tb->fastreuseport = 0; 91 INIT_HLIST_HEAD(&tb->bhash2); 92 hlist_add_head_rcu(&tb->node, &head->chain); 93 } 94 return tb; 95 } 96 97 /* 98 * Caller must hold hashbucket lock for this tb with local BH disabled 99 */ 100 void inet_bind_bucket_destroy(struct inet_bind_bucket *tb) 101 { 102 const struct inet_bind2_bucket *tb2; 103 104 if (hlist_empty(&tb->bhash2)) { 105 hlist_del_rcu(&tb->node); 106 kfree_rcu(tb, rcu); 107 return; 108 } 109 110 if (tb->fastreuse == -1 && tb->fastreuseport == -1) 111 return; 112 hlist_for_each_entry(tb2, &tb->bhash2, bhash_node) { 113 if (tb2->fastreuse != -1 || tb2->fastreuseport != -1) 114 return; 115 } 116 tb->fastreuse = -1; 117 tb->fastreuseport = -1; 118 } 119 120 bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, const struct net *net, 121 unsigned short port, int l3mdev) 122 { 123 return net_eq(ib_net(tb), net) && tb->port == port && 124 tb->l3mdev == l3mdev; 125 } 126 127 static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb2, 128 struct net *net, 129 struct inet_bind_hashbucket *head, 130 struct inet_bind_bucket *tb, 131 const struct sock *sk) 132 { 133 write_pnet(&tb2->ib_net, net); 134 tb2->l3mdev = tb->l3mdev; 135 tb2->port = tb->port; 136 #if IS_ENABLED(CONFIG_IPV6) 137 BUILD_BUG_ON(USHRT_MAX < (IPV6_ADDR_ANY | IPV6_ADDR_MAPPED)); 138 if (sk->sk_family == AF_INET6) { 139 tb2->addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr); 140 tb2->v6_rcv_saddr = sk->sk_v6_rcv_saddr; 141 } else { 142 tb2->addr_type = IPV6_ADDR_MAPPED; 143 ipv6_addr_set_v4mapped(sk->sk_rcv_saddr, &tb2->v6_rcv_saddr); 144 } 145 #else 146 tb2->rcv_saddr = sk->sk_rcv_saddr; 147 #endif 148 tb2->fastreuse = 0; 149 tb2->fastreuseport = 0; 150 INIT_HLIST_HEAD(&tb2->owners); 151 hlist_add_head(&tb2->node, &head->chain); 152 hlist_add_head(&tb2->bhash_node, &tb->bhash2); 153 } 154 155 struct inet_bind2_bucket *inet_bind2_bucket_create(struct kmem_cache *cachep, 156 struct net *net, 157 struct inet_bind_hashbucket *head, 158 struct inet_bind_bucket *tb, 159 const struct sock *sk) 160 { 161 struct inet_bind2_bucket *tb2 = kmem_cache_alloc(cachep, GFP_ATOMIC); 162 163 if (tb2) 164 inet_bind2_bucket_init(tb2, net, head, tb, sk); 165 166 return tb2; 167 } 168 169 /* Caller must hold hashbucket lock for this tb with local BH disabled */ 170 void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb) 171 { 172 const struct sock *sk; 173 174 if (hlist_empty(&tb->owners)) { 175 __hlist_del(&tb->node); 176 __hlist_del(&tb->bhash_node); 177 kmem_cache_free(cachep, tb); 178 return; 179 } 180 181 if (tb->fastreuse == -1 && tb->fastreuseport == -1) 182 return; 183 sk_for_each_bound(sk, &tb->owners) { 184 if (!sk_is_connect_bind(sk)) 185 return; 186 } 187 tb->fastreuse = -1; 188 tb->fastreuseport = -1; 189 } 190 191 static bool inet_bind2_bucket_addr_match(const struct inet_bind2_bucket *tb2, 192 const struct sock *sk) 193 { 194 #if IS_ENABLED(CONFIG_IPV6) 195 if (sk->sk_family == AF_INET6) 196 return ipv6_addr_equal(&tb2->v6_rcv_saddr, &sk->sk_v6_rcv_saddr); 197 198 if (tb2->addr_type != IPV6_ADDR_MAPPED) 199 return false; 200 #endif 201 return tb2->rcv_saddr == 
void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
		    struct inet_bind2_bucket *tb2, unsigned short port)
{
	WRITE_ONCE(inet_sk(sk)->inet_num, port);
	inet_csk(sk)->icsk_bind_hash = tb;
	inet_csk(sk)->icsk_bind2_hash = tb2;
	sk_add_bind_node(sk, &tb2->owners);
}

/*
 * Get rid of any references to a local port held by the given sock.
 */
static void __inet_put_port(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk);
	struct inet_bind_hashbucket *head, *head2;
	struct net *net = sock_net(sk);
	struct inet_bind_bucket *tb;
	int bhash;

	bhash = inet_bhashfn(net, inet_sk(sk)->inet_num, hashinfo->bhash_size);
	head = &hashinfo->bhash[bhash];
	head2 = inet_bhashfn_portaddr(hashinfo, sk, net, inet_sk(sk)->inet_num);

	spin_lock(&head->lock);
	tb = inet_csk(sk)->icsk_bind_hash;
	inet_csk(sk)->icsk_bind_hash = NULL;
	WRITE_ONCE(inet_sk(sk)->inet_num, 0);
	sk->sk_userlocks &= ~SOCK_CONNECT_BIND;

	spin_lock(&head2->lock);
	if (inet_csk(sk)->icsk_bind2_hash) {
		struct inet_bind2_bucket *tb2 = inet_csk(sk)->icsk_bind2_hash;

		__sk_del_bind_node(sk);
		inet_csk(sk)->icsk_bind2_hash = NULL;
		inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2);
	}
	spin_unlock(&head2->lock);

	inet_bind_bucket_destroy(tb);
	spin_unlock(&head->lock);
}

void inet_put_port(struct sock *sk)
{
	local_bh_disable();
	__inet_put_port(sk);
	local_bh_enable();
}
EXPORT_SYMBOL(inet_put_port);
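/* Lock ordering used by the bind-table updates in this file (a sketch;
 * see __inet_put_port() above and __inet_inherit_port() below): the
 * bhash bucket lock is always taken before the bhash2 bucket lock, with
 * BHs disabled either by the caller or via spin_lock_bh():
 *
 *	spin_lock(&head->lock);		// bhash
 *	spin_lock(&head2->lock);	// bhash2
 *	...
 *	spin_unlock(&head2->lock);
 *	spin_unlock(&head->lock);
 */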
int __inet_inherit_port(const struct sock *sk, struct sock *child)
{
	struct inet_hashinfo *table = tcp_get_hashinfo(sk);
	unsigned short port = inet_sk(child)->inet_num;
	struct inet_bind_hashbucket *head, *head2;
	bool created_inet_bind_bucket = false;
	struct net *net = sock_net(sk);
	bool update_fastreuse = false;
	struct inet_bind2_bucket *tb2;
	struct inet_bind_bucket *tb;
	int bhash, l3mdev;

	bhash = inet_bhashfn(net, port, table->bhash_size);
	head = &table->bhash[bhash];
	head2 = inet_bhashfn_portaddr(table, child, net, port);

	spin_lock(&head->lock);
	spin_lock(&head2->lock);
	tb = inet_csk(sk)->icsk_bind_hash;
	tb2 = inet_csk(sk)->icsk_bind2_hash;
	if (unlikely(!tb || !tb2)) {
		spin_unlock(&head2->lock);
		spin_unlock(&head->lock);
		return -ENOENT;
	}
	if (tb->port != port) {
		l3mdev = inet_sk_bound_l3mdev(sk);

		/* NOTE: using tproxy and redirecting skbs to a proxy
		 * on a different listener port breaks the assumption
		 * that the listener socket's icsk_bind_hash is the same
		 * as that of the child socket. We have to look up or
		 * create a new bind bucket for the child here.
		 */
		inet_bind_bucket_for_each(tb, &head->chain) {
			if (inet_bind_bucket_match(tb, net, port, l3mdev))
				break;
		}
		if (!tb) {
			tb = inet_bind_bucket_create(table->bind_bucket_cachep,
						     net, head, port, l3mdev);
			if (!tb) {
				spin_unlock(&head2->lock);
				spin_unlock(&head->lock);
				return -ENOMEM;
			}
			created_inet_bind_bucket = true;
		}
		update_fastreuse = true;

		goto bhash2_find;
	} else if (!inet_bind2_bucket_addr_match(tb2, child)) {
		l3mdev = inet_sk_bound_l3mdev(sk);

bhash2_find:
		tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, child);
		if (!tb2) {
			tb2 = inet_bind2_bucket_create(table->bind2_bucket_cachep,
						       net, head2, tb, child);
			if (!tb2)
				goto error;
		}
	}
	if (update_fastreuse)
		inet_csk_update_fastreuse(child, tb, tb2);
	inet_bind_hash(child, tb, tb2, port);
	spin_unlock(&head2->lock);
	spin_unlock(&head->lock);

	return 0;

error:
	if (created_inet_bind_bucket)
		inet_bind_bucket_destroy(tb);
	spin_unlock(&head2->lock);
	spin_unlock(&head->lock);
	return -ENOMEM;
}
EXPORT_SYMBOL_GPL(__inet_inherit_port);

static struct inet_listen_hashbucket *
inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk)
{
	u32 hash;

#if IS_ENABLED(CONFIG_IPV6)
	if (sk->sk_family == AF_INET6)
		hash = ipv6_portaddr_hash(sock_net(sk),
					  &sk->sk_v6_rcv_saddr,
					  inet_sk(sk)->inet_num);
	else
#endif
		hash = ipv4_portaddr_hash(sock_net(sk),
					  inet_sk(sk)->inet_rcv_saddr,
					  inet_sk(sk)->inet_num);
	return inet_lhash2_bucket(h, hash);
}

static inline int compute_score(struct sock *sk, const struct net *net,
				const unsigned short hnum, const __be32 daddr,
				const int dif, const int sdif)
{
	int score = -1;

	if (net_eq(sock_net(sk), net) && READ_ONCE(sk->sk_num) == hnum &&
	    !ipv6_only_sock(sk)) {
		if (sk->sk_rcv_saddr != daddr)
			return -1;

		if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))
			return -1;
		score = sk->sk_bound_dev_if ? 2 : 1;

		if (sk->sk_family == PF_INET)
			score++;
		if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
			score++;
	}
	return score;
}

/**
 * inet_lookup_reuseport() - execute reuseport logic on AF_INET socket if necessary.
 * @net: network namespace.
 * @sk: AF_INET socket, must be in TCP_LISTEN state for TCP or TCP_CLOSE for UDP.
 * @skb: context for a potential SK_REUSEPORT program.
 * @doff: header offset.
 * @saddr: source address.
 * @sport: source port.
 * @daddr: destination address.
 * @hnum: destination port in host byte order.
 * @ehashfn: hash function used to generate the fallback hash.
 *
 * Return: NULL if sk doesn't have SO_REUSEPORT set, otherwise a pointer to
 *         the selected sock or an error.
 */
struct sock *inet_lookup_reuseport(const struct net *net, struct sock *sk,
				   struct sk_buff *skb, int doff,
				   __be32 saddr, __be16 sport,
				   __be32 daddr, unsigned short hnum,
				   inet_ehashfn_t *ehashfn)
{
	struct sock *reuse_sk = NULL;
	u32 phash;

	if (sk->sk_reuseport) {
		phash = INDIRECT_CALL_2(ehashfn, udp_ehashfn, inet_ehashfn,
					net, daddr, hnum, saddr, sport);
		reuse_sk = reuseport_select_sock(sk, phash, skb, doff);
	}
	return reuse_sk;
}
EXPORT_SYMBOL_GPL(inet_lookup_reuseport);
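/* What compute_score() above encodes (a rough summary): a listener is a
 * candidate only if it sits in the right netns, is bound to hnum, is not
 * IPv6-only, matches daddr exactly (the INADDR_ANY case is handled by a
 * second bucket probe in __inet_lookup_listener()), and is compatible
 * with the incoming device. Candidates are then ranked:
 *
 *	bound to a specific device	score 2 (vs 1)
 *	native AF_INET socket		+1
 *	sk_incoming_cpu == current cpu	+1
 *
 * inet_lhash2_lookup() below keeps the highest-scoring match.
 */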
/*
 * There are some nice properties to exploit here. The BSD API
 * does not allow a listening sock to specify the remote port nor the
 * remote address for the connection. So always assume those are both
 * wildcarded during the search since they can never be otherwise.
 */

/* called with rcu_read_lock() : No refcount taken on the socket */
static struct sock *inet_lhash2_lookup(const struct net *net,
				struct inet_listen_hashbucket *ilb2,
				struct sk_buff *skb, int doff,
				const __be32 saddr, __be16 sport,
				const __be32 daddr, const unsigned short hnum,
				const int dif, const int sdif)
{
	struct sock *sk, *result = NULL;
	struct hlist_nulls_node *node;
	int score, hiscore = 0;

	sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) {
		score = compute_score(sk, net, hnum, daddr, dif, sdif);
		if (score > hiscore) {
			result = inet_lookup_reuseport(net, sk, skb, doff,
						       saddr, sport, daddr, hnum, inet_ehashfn);
			if (result)
				return result;

			result = sk;
			hiscore = score;
		}
	}

	return result;
}

struct sock *inet_lookup_run_sk_lookup(const struct net *net,
				       int protocol,
				       struct sk_buff *skb, int doff,
				       __be32 saddr, __be16 sport,
				       __be32 daddr, u16 hnum, const int dif,
				       inet_ehashfn_t *ehashfn)
{
	struct sock *sk, *reuse_sk;
	bool no_reuseport;

	no_reuseport = bpf_sk_lookup_run_v4(net, protocol, saddr, sport,
					    daddr, hnum, dif, &sk);
	if (no_reuseport || IS_ERR_OR_NULL(sk))
		return sk;

	reuse_sk = inet_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum,
					 ehashfn);
	if (reuse_sk)
		sk = reuse_sk;
	return sk;
}

struct sock *__inet_lookup_listener(const struct net *net,
				    struct sk_buff *skb, int doff,
				    const __be32 saddr, __be16 sport,
				    const __be32 daddr, const unsigned short hnum,
				    const int dif, const int sdif)
{
	struct inet_listen_hashbucket *ilb2;
	struct inet_hashinfo *hashinfo;
	struct sock *result = NULL;
	unsigned int hash2;

	/* Lookup redirect from BPF */
	if (static_branch_unlikely(&bpf_sk_lookup_enabled)) {
		result = inet_lookup_run_sk_lookup(net, IPPROTO_TCP, skb, doff,
						   saddr, sport, daddr, hnum, dif,
						   inet_ehashfn);
		if (result)
			goto done;
	}

	hashinfo = net->ipv4.tcp_death_row.hashinfo;
	hash2 = ipv4_portaddr_hash(net, daddr, hnum);
	ilb2 = inet_lhash2_bucket(hashinfo, hash2);

	result = inet_lhash2_lookup(net, ilb2, skb, doff,
				    saddr, sport, daddr, hnum,
				    dif, sdif);
	if (result)
		goto done;

	/* Lookup lhash2 with INADDR_ANY */
	hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
	ilb2 = inet_lhash2_bucket(hashinfo, hash2);

	result = inet_lhash2_lookup(net, ilb2, skb, doff,
				    saddr, sport, htonl(INADDR_ANY), hnum,
				    dif, sdif);
done:
	if (IS_ERR(result))
		return NULL;
	return result;
}
EXPORT_SYMBOL_GPL(__inet_lookup_listener);

/* All sockets share common refcount, but have different destructors */
void sock_gen_put(struct sock *sk)
{
	if (!refcount_dec_and_test(&sk->sk_refcnt))
		return;

	if (sk->sk_state == TCP_TIME_WAIT)
		inet_twsk_free(inet_twsk(sk));
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		reqsk_free(inet_reqsk(sk));
	else
		sk_free(sk);
}
EXPORT_SYMBOL_GPL(sock_gen_put);

void sock_edemux(struct sk_buff *skb)
{
	sock_gen_put(skb->sk);
}
EXPORT_SYMBOL(sock_edemux);
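/* A note on the lockless lookup below (a sketch): TCP sockets come from
 * a SLAB_TYPESAFE_BY_RCU cache, so an entry on the chain may be freed
 * and reused for another connection while we traverse it. Hence the
 * pattern:
 *
 *	if (sk->sk_hash != hash)			// cheap filter
 *		continue;
 *	if (!refcount_inc_not_zero(&sk->sk_refcnt))	// being freed
 *		goto out;
 *	if (!inet_match(...)) {				// reused meanwhile
 *		sock_gen_put(sk);
 *		goto begin;				// restart the walk
 *	}
 */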
struct sock *__inet_lookup_established(const struct net *net,
				       const __be32 saddr, const __be16 sport,
				       const __be32 daddr, const u16 hnum,
				       const int dif, const int sdif)
{
	const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
	INET_ADDR_COOKIE(acookie, saddr, daddr);
	const struct hlist_nulls_node *node;
	struct inet_ehash_bucket *head;
	struct inet_hashinfo *hashinfo;
	unsigned int hash, slot;
	struct sock *sk;

	hashinfo = net->ipv4.tcp_death_row.hashinfo;
	hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
	slot = hash & hashinfo->ehash_mask;
	head = &hashinfo->ehash[slot];

begin:
	sk_nulls_for_each_rcu(sk, node, &head->chain) {
		if (sk->sk_hash != hash)
			continue;
		if (likely(inet_match(net, sk, acookie, ports, dif, sdif))) {
			if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
				goto out;
			if (unlikely(!inet_match(net, sk, acookie,
						 ports, dif, sdif))) {
				sock_gen_put(sk);
				goto begin;
			}
			goto found;
		}
	}
	/*
	 * if the nulls value we got at the end of this lookup is
	 * not the expected one, we must restart lookup.
	 * We probably met an item that was moved to another chain.
	 */
	if (get_nulls_value(node) != slot)
		goto begin;
out:
	sk = NULL;
found:
	return sk;
}
EXPORT_SYMBOL_GPL(__inet_lookup_established);

/* called with local bh disabled */
static int __inet_check_established(struct inet_timewait_death_row *death_row,
				    struct sock *sk, __u16 lport,
				    struct inet_timewait_sock **twp,
				    bool rcu_lookup,
				    u32 hash)
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	struct inet_sock *inet = inet_sk(sk);
	__be32 daddr = inet->inet_rcv_saddr;
	__be32 saddr = inet->inet_daddr;
	int dif = sk->sk_bound_dev_if;
	struct net *net = sock_net(sk);
	int sdif = l3mdev_master_ifindex_by_index(net, dif);
	INET_ADDR_COOKIE(acookie, saddr, daddr);
	const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
	struct inet_timewait_sock *tw = NULL;
	const struct hlist_nulls_node *node;
	struct sock *sk2;
	spinlock_t *lock;

	if (rcu_lookup) {
		sk_nulls_for_each(sk2, node, &head->chain) {
			if (sk2->sk_hash != hash ||
			    !inet_match(net, sk2, acookie, ports, dif, sdif))
				continue;
			if (sk2->sk_state == TCP_TIME_WAIT)
				break;
			return -EADDRNOTAVAIL;
		}
		return 0;
	}

	lock = inet_ehash_lockp(hinfo, hash);
	spin_lock(lock);

	sk_nulls_for_each(sk2, node, &head->chain) {
		if (sk2->sk_hash != hash)
			continue;

		if (likely(inet_match(net, sk2, acookie, ports, dif, sdif))) {
			if (sk2->sk_state == TCP_TIME_WAIT) {
				tw = inet_twsk(sk2);
				if (tcp_twsk_unique(sk, sk2, twp))
					break;
			}
			goto not_unique;
		}
	}

	/* Must record num and sport now. Otherwise we will see
	 * a socket with a funny identity in the hash table.
	 */
	inet->inet_num = lport;
	inet->inet_sport = htons(lport);
	sk->sk_hash = hash;
	WARN_ON(!sk_unhashed(sk));
	__sk_nulls_add_node_rcu(sk, &head->chain);
	if (tw) {
		sk_nulls_del_node_init_rcu((struct sock *)tw);
		__NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED);
	}
	spin_unlock(lock);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);

	if (twp) {
		*twp = tw;
	} else if (tw) {
		/* Silly. Should hash-dance instead... */
		inet_twsk_deschedule_put(tw);
	}
	return 0;

not_unique:
	spin_unlock(lock);
	return -EADDRNOTAVAIL;
}
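/* Usage note for __inet_check_established() (a sketch): when twp is
 * non-NULL and a reusable TIME_WAIT socket occupied the 4-tuple, the
 * timewait socket is returned via *twp, already removed from the ehash
 * chain; the caller (see __inet_hash_connect()) then performs
 * inet_twsk_bind_unhash() and inet_twsk_deschedule_put() once it holds
 * the bind bucket locks. With twp == NULL the function disposes of the
 * timewait socket itself.
 */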
static u64 inet_sk_port_offset(const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);

	return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr,
					  inet->inet_daddr,
					  inet->inet_dport);
}

/* Searches for an existing socket in the ehash bucket list.
 * Returns true if found, false otherwise.
 */
static bool inet_ehash_lookup_by_sk(struct sock *sk,
				    struct hlist_nulls_head *list)
{
	const __portpair ports = INET_COMBINED_PORTS(sk->sk_dport, sk->sk_num);
	const int sdif = sk->sk_bound_dev_if;
	const int dif = sk->sk_bound_dev_if;
	const struct hlist_nulls_node *node;
	struct net *net = sock_net(sk);
	struct sock *esk;

	INET_ADDR_COOKIE(acookie, sk->sk_daddr, sk->sk_rcv_saddr);

	sk_nulls_for_each_rcu(esk, node, list) {
		if (esk->sk_hash != sk->sk_hash)
			continue;
		if (sk->sk_family == AF_INET) {
			if (unlikely(inet_match(net, esk, acookie,
						ports, dif, sdif))) {
				return true;
			}
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (sk->sk_family == AF_INET6) {
			if (unlikely(inet6_match(net, esk,
						 &sk->sk_v6_daddr,
						 &sk->sk_v6_rcv_saddr,
						 ports, dif, sdif))) {
				return true;
			}
		}
#endif
	}
	return false;
}

/* Insert a socket into ehash, and eventually remove another one
 * (the other one can be a SYN_RECV or TIMEWAIT socket).
 * If a duplicate socket already exists, sk is not inserted
 * and the found_dup_sk parameter is set to true.
 */
bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk)
{
	struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk);
	struct inet_ehash_bucket *head;
	struct hlist_nulls_head *list;
	spinlock_t *lock;
	bool ret = true;

	WARN_ON_ONCE(!sk_unhashed(sk));

	sk->sk_hash = sk_ehashfn(sk);
	head = inet_ehash_bucket(hashinfo, sk->sk_hash);
	list = &head->chain;
	lock = inet_ehash_lockp(hashinfo, sk->sk_hash);

	spin_lock(lock);
	if (osk) {
		WARN_ON_ONCE(sk->sk_hash != osk->sk_hash);
		ret = sk_nulls_replace_node_init_rcu(osk, sk);
		goto unlock;
	}

	if (found_dup_sk) {
		*found_dup_sk = inet_ehash_lookup_by_sk(sk, list);
		if (*found_dup_sk)
			ret = false;
	}

	if (ret)
		__sk_nulls_add_node_rcu(sk, list);

unlock:
	spin_unlock(lock);

	return ret;
}

bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk)
{
	bool ok = inet_ehash_insert(sk, osk, found_dup_sk);

	if (ok) {
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	} else {
		tcp_orphan_count_inc();
		inet_sk_set_state(sk, TCP_CLOSE);
		sock_set_flag(sk, SOCK_DEAD);
		inet_csk_destroy_sock(sk);
	}
	return ok;
}
EXPORT_IPV6_MOD(inet_ehash_nolisten);
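/* Typical use of inet_ehash_nolisten() (a sketch, mirroring the call in
 * __inet_hash_connect() below): a connecting socket enters the
 * established table directly, optionally displacing the timewait socket
 * that owned the same 4-tuple:
 *
 *	inet_ehash_nolisten(sk, (struct sock *)tw, NULL);
 *
 * If insertion fails, the socket has already been moved to TCP_CLOSE,
 * marked SOCK_DEAD and destroyed, so the caller must not use it again.
 */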
static int inet_reuseport_add_sock(struct sock *sk,
				   struct inet_listen_hashbucket *ilb)
{
	struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash;
	const struct hlist_nulls_node *node;
	kuid_t uid = sk_uid(sk);
	struct sock *sk2;

	sk_nulls_for_each_rcu(sk2, node, &ilb->nulls_head) {
		if (sk2 != sk &&
		    sk2->sk_family == sk->sk_family &&
		    ipv6_only_sock(sk2) == ipv6_only_sock(sk) &&
		    sk2->sk_bound_dev_if == sk->sk_bound_dev_if &&
		    inet_csk(sk2)->icsk_bind_hash == tb &&
		    sk2->sk_reuseport && uid_eq(uid, sk_uid(sk2)) &&
		    inet_rcv_saddr_equal(sk, sk2, false))
			return reuseport_add_sock(sk, sk2,
						  inet_rcv_saddr_any(sk));
	}

	return reuseport_alloc(sk, inet_rcv_saddr_any(sk));
}

int inet_hash(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk);
	struct inet_listen_hashbucket *ilb2;
	int err = 0;

	if (sk->sk_state == TCP_CLOSE)
		return 0;

	if (sk->sk_state != TCP_LISTEN) {
		local_bh_disable();
		inet_ehash_nolisten(sk, NULL, NULL);
		local_bh_enable();
		return 0;
	}

#if IS_ENABLED(CONFIG_IPV6)
	if (sk->sk_family == AF_INET6)
		inet6_init_ehash_secret();
#endif
	inet_init_ehash_secret();

	WARN_ON(!sk_unhashed(sk));
	ilb2 = inet_lhash2_bucket_sk(hashinfo, sk);

	spin_lock(&ilb2->lock);
	if (sk->sk_reuseport) {
		err = inet_reuseport_add_sock(sk, ilb2);
		if (err)
			goto unlock;
	}
	sock_set_flag(sk, SOCK_RCU_FREE);
	if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
	    sk->sk_family == AF_INET6)
		__sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head);
	else
		__sk_nulls_add_node_rcu(sk, &ilb2->nulls_head);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
unlock:
	spin_unlock(&ilb2->lock);

	return err;
}
EXPORT_IPV6_MOD(inet_hash);

void inet_unhash(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk);

	if (sk_unhashed(sk))
		return;

	sock_rps_delete_flow(sk);
	if (sk->sk_state == TCP_LISTEN) {
		struct inet_listen_hashbucket *ilb2;

		ilb2 = inet_lhash2_bucket_sk(hashinfo, sk);
		/* Don't disable bottom halves while acquiring the lock to
		 * avoid circular locking dependency on PREEMPT_RT.
		 */
		spin_lock(&ilb2->lock);
		if (rcu_access_pointer(sk->sk_reuseport_cb))
			reuseport_stop_listen_sock(sk);

		__sk_nulls_del_node_init_rcu(sk);
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
		spin_unlock(&ilb2->lock);
	} else {
		spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);

		spin_lock_bh(lock);
		__sk_nulls_del_node_init_rcu(sk);
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
		spin_unlock_bh(lock);
	}
}
EXPORT_IPV6_MOD(inet_unhash);

static bool inet_bind2_bucket_match(const struct inet_bind2_bucket *tb,
				    const struct net *net, unsigned short port,
				    int l3mdev, const struct sock *sk)
{
	if (!net_eq(ib2_net(tb), net) || tb->port != port ||
	    tb->l3mdev != l3mdev)
		return false;

	return inet_bind2_bucket_addr_match(tb, sk);
}

bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb, const struct net *net,
				      unsigned short port, int l3mdev, const struct sock *sk)
{
	if (!net_eq(ib2_net(tb), net) || tb->port != port ||
	    tb->l3mdev != l3mdev)
		return false;

#if IS_ENABLED(CONFIG_IPV6)
	if (tb->addr_type == IPV6_ADDR_ANY)
		return true;

	if (tb->addr_type != IPV6_ADDR_MAPPED)
		return false;

	if (sk->sk_family == AF_INET6 &&
	    !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
		return false;
#endif
	return tb->rcv_saddr == 0;
}
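/* How the two bhash2 match helpers above are meant to differ (a rough
 * note): inet_bind2_bucket_match() requires an exact saddr match and is
 * what inet_bind2_bucket_find() below uses, while the _addr_any variant
 * only matches the wildcard (INADDR_ANY / in6addr_any) bucket, letting
 * bind() conflict checks find a wildcard bucket that could clash with a
 * specific local address.
 */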
/* The socket's bhash2 hashbucket spinlock must be held when this is called */
struct inet_bind2_bucket *
inet_bind2_bucket_find(const struct inet_bind_hashbucket *head, const struct net *net,
		       unsigned short port, int l3mdev, const struct sock *sk)
{
	struct inet_bind2_bucket *bhash2 = NULL;

	inet_bind_bucket_for_each(bhash2, &head->chain)
		if (inet_bind2_bucket_match(bhash2, net, port, l3mdev, sk))
			break;

	return bhash2;
}

struct inet_bind_hashbucket *
inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, int port)
{
	struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk);
	u32 hash;

#if IS_ENABLED(CONFIG_IPV6)
	if (sk->sk_family == AF_INET6)
		hash = ipv6_portaddr_hash(net, &in6addr_any, port);
	else
#endif
		hash = ipv4_portaddr_hash(net, 0, port);

	return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)];
}

static void inet_update_saddr(struct sock *sk, void *saddr, int family)
{
	if (family == AF_INET) {
		inet_sk(sk)->inet_saddr = *(__be32 *)saddr;
		sk_rcv_saddr_set(sk, inet_sk(sk)->inet_saddr);
	}
#if IS_ENABLED(CONFIG_IPV6)
	else {
		sk->sk_v6_rcv_saddr = *(struct in6_addr *)saddr;
	}
#endif
}
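/* bhash2 is keyed on (addr, port), so changing a bound socket's source
 * address moves it to a different bucket. __inet_bhash2_update_saddr()
 * below therefore unlinks sk from its current tb2, updates the address,
 * and relinks it into the bucket matching the new address, all under
 * the bhash/bhash2 locks.
 */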
static int __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family, bool reset)
{
	struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk);
	struct inet_bind_hashbucket *head, *head2;
	struct inet_bind2_bucket *tb2, *new_tb2;
	int l3mdev = inet_sk_bound_l3mdev(sk);
	int port = inet_sk(sk)->inet_num;
	struct net *net = sock_net(sk);
	int bhash;

	if (!inet_csk(sk)->icsk_bind2_hash) {
		/* Not bind()ed before. */
		if (reset)
			inet_reset_saddr(sk);
		else
			inet_update_saddr(sk, saddr, family);

		return 0;
	}

	/* Allocate a bind2 bucket ahead of time to avoid permanently putting
	 * the bhash2 table in an inconsistent state if a new tb2 bucket
	 * allocation fails.
	 */
	new_tb2 = kmem_cache_alloc(hinfo->bind2_bucket_cachep, GFP_ATOMIC);
	if (!new_tb2) {
		if (reset) {
			/* The (INADDR_ANY, port) bucket might have already
			 * been freed, then we cannot fixup icsk_bind2_hash,
			 * so we give up and unlink sk from bhash/bhash2 not
			 * to leave inconsistency in bhash2.
			 */
			inet_put_port(sk);
			inet_reset_saddr(sk);
		}

		return -ENOMEM;
	}

	bhash = inet_bhashfn(net, port, hinfo->bhash_size);
	head = &hinfo->bhash[bhash];
	head2 = inet_bhashfn_portaddr(hinfo, sk, net, port);

	/* If we change saddr locklessly, another thread
	 * iterating over bhash might see corrupted address.
	 */
	spin_lock_bh(&head->lock);

	spin_lock(&head2->lock);
	__sk_del_bind_node(sk);
	inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, inet_csk(sk)->icsk_bind2_hash);
	spin_unlock(&head2->lock);

	if (reset)
		inet_reset_saddr(sk);
	else
		inet_update_saddr(sk, saddr, family);

	head2 = inet_bhashfn_portaddr(hinfo, sk, net, port);

	spin_lock(&head2->lock);
	tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk);
	if (!tb2) {
		tb2 = new_tb2;
		inet_bind2_bucket_init(tb2, net, head2, inet_csk(sk)->icsk_bind_hash, sk);
		if (sk_is_connect_bind(sk)) {
			tb2->fastreuse = -1;
			tb2->fastreuseport = -1;
		}
	}
	inet_csk(sk)->icsk_bind2_hash = tb2;
	sk_add_bind_node(sk, &tb2->owners);
	spin_unlock(&head2->lock);

	spin_unlock_bh(&head->lock);

	if (tb2 != new_tb2)
		kmem_cache_free(hinfo->bind2_bucket_cachep, new_tb2);

	return 0;
}

int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family)
{
	return __inet_bhash2_update_saddr(sk, saddr, family, false);
}
EXPORT_IPV6_MOD(inet_bhash2_update_saddr);

void inet_bhash2_reset_saddr(struct sock *sk)
{
	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
		__inet_bhash2_update_saddr(sk, NULL, 0, true);
}
EXPORT_IPV6_MOD(inet_bhash2_reset_saddr);

/* RFC 6056 3.3.4.  Algorithm 4: Double-Hash Port Selection Algorithm
 * Note that we use 32bit integers (vs RFC 'short integers')
 * because 2^16 is not a multiple of num_ephemeral and this
 * property might be used by a clever attacker.
 *
 * The RFC claims using TABLE_LENGTH=10 buckets gives an improvement, though
 * attacks were since demonstrated, thus we use 65536 by default instead
 * to really give more isolation and privacy, at the expense of 256kB
 * of kernel memory.
 */
#define INET_TABLE_PERTURB_SIZE (1 << CONFIG_INET_TABLE_PERTURB_ORDER)
static u32 *table_perturb;
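/* Rough shape of the algorithm implemented below, for the ephemeral
 * range [low, high):
 *
 *	index  = F & (INET_TABLE_PERTURB_SIZE - 1)		// low bits
 *	offset = (table_perturb[index] + (F >> 32)) % remaining
 *	port   = low + offset, then probe linearly by 'step'
 *
 * where F is the 64-bit secure hash of (saddr, daddr, dport) passed in
 * as port_offset. Once a port is chosen, table_perturb[index] is
 * advanced so that the next connection to the same destination does not
 * land on a neighbouring port.
 */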
int __inet_hash_connect(struct inet_timewait_death_row *death_row,
		struct sock *sk, u64 port_offset,
		u32 hash_port0,
		int (*check_established)(struct inet_timewait_death_row *,
			struct sock *, __u16, struct inet_timewait_sock **,
			bool rcu_lookup, u32 hash))
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	struct inet_bind_hashbucket *head, *head2;
	struct inet_timewait_sock *tw = NULL;
	int port = inet_sk(sk)->inet_num;
	struct net *net = sock_net(sk);
	struct inet_bind2_bucket *tb2;
	struct inet_bind_bucket *tb;
	bool tb_created = false;
	u32 remaining, offset;
	int ret, i, low, high;
	bool local_ports;
	int step, l3mdev;
	u32 index;

	if (port) {
		local_bh_disable();
		ret = check_established(death_row, sk, port, NULL, false,
					hash_port0 + port);
		local_bh_enable();
		return ret;
	}

	l3mdev = inet_sk_bound_l3mdev(sk);

	local_ports = inet_sk_get_local_port_range(sk, &low, &high);
	step = local_ports ? 1 : 2;

	high++; /* [32768, 60999] -> [32768, 61000[ */
	remaining = high - low;
	if (!local_ports && remaining > 1)
		remaining &= ~1U;

	get_random_sleepable_once(table_perturb,
				  INET_TABLE_PERTURB_SIZE * sizeof(*table_perturb));
	index = port_offset & (INET_TABLE_PERTURB_SIZE - 1);

	offset = READ_ONCE(table_perturb[index]) + (port_offset >> 32);
	offset %= remaining;

	/* In first pass we try ports of @low parity.
	 * inet_csk_get_port() does the opposite choice.
	 */
	if (!local_ports)
		offset &= ~1U;
other_parity_scan:
	port = low + offset;
	for (i = 0; i < remaining; i += step, port += step) {
		if (unlikely(port >= high))
			port -= remaining;
		if (inet_is_local_reserved_port(net, port))
			continue;
		head = &hinfo->bhash[inet_bhashfn(net, port,
						  hinfo->bhash_size)];
		rcu_read_lock();
		hlist_for_each_entry_rcu(tb, &head->chain, node) {
			if (!inet_bind_bucket_match(tb, net, port, l3mdev))
				continue;
			if (tb->fastreuse >= 0 || tb->fastreuseport >= 0) {
				rcu_read_unlock();
				goto next_port;
			}
			if (!check_established(death_row, sk, port, &tw, true,
					       hash_port0 + port))
				break;
			rcu_read_unlock();
			goto next_port;
		}
		rcu_read_unlock();

		spin_lock_bh(&head->lock);

		/* Does not bother with rcv_saddr checks, because
		 * the established check is already unique enough.
		 */
		inet_bind_bucket_for_each(tb, &head->chain) {
			if (inet_bind_bucket_match(tb, net, port, l3mdev)) {
				if (tb->fastreuse >= 0 ||
				    tb->fastreuseport >= 0)
					goto next_port_unlock;
				WARN_ON(hlist_empty(&tb->bhash2));
				if (!check_established(death_row, sk,
						       port, &tw, false,
						       hash_port0 + port))
					goto ok;
				goto next_port_unlock;
			}
		}

		tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
					     net, head, port, l3mdev);
		if (!tb) {
			spin_unlock_bh(&head->lock);
			return -ENOMEM;
		}
		tb_created = true;
		tb->fastreuse = -1;
		tb->fastreuseport = -1;
		goto ok;
next_port_unlock:
		spin_unlock_bh(&head->lock);
next_port:
		cond_resched();
	}

	if (!local_ports) {
		offset++;
		if ((offset & 1) && remaining > 1)
			goto other_parity_scan;
	}
	return -EADDRNOTAVAIL;

ok:
	/* Find the corresponding tb2 bucket since we need to
	 * add the socket to the bhash2 table as well
	 */
	head2 = inet_bhashfn_portaddr(hinfo, sk, net, port);
	spin_lock(&head2->lock);

	tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk);
	if (!tb2) {
		tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep, net,
					       head2, tb, sk);
		if (!tb2)
			goto error;
		tb2->fastreuse = -1;
		tb2->fastreuseport = -1;
	}

	/* Here we want to add a little bit of randomness to the next source
	 * port that will be chosen. We use a max() with a random here so that
	 * on low contention the randomness is maximal and on high contention
	 * it may be nonexistent.
	 */
	i = max_t(int, i, get_random_u32_below(8) * step);
	WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + step);

	/* Head lock still held and bh's disabled */
	inet_bind_hash(sk, tb, tb2, port);
	sk->sk_userlocks |= SOCK_CONNECT_BIND;

	if (sk_unhashed(sk)) {
		inet_sk(sk)->inet_sport = htons(port);
		inet_ehash_nolisten(sk, (struct sock *)tw, NULL);
	}
	if (tw)
		inet_twsk_bind_unhash(tw, hinfo);

	spin_unlock(&head2->lock);
	spin_unlock(&head->lock);

	if (tw)
		inet_twsk_deschedule_put(tw);
	local_bh_enable();
	return 0;

error:
	if (sk_hashed(sk)) {
		spinlock_t *lock = inet_ehash_lockp(hinfo, sk->sk_hash);

		sock_prot_inuse_add(net, sk->sk_prot, -1);

		spin_lock(lock);
		__sk_nulls_del_node_init_rcu(sk);
		spin_unlock(lock);

		sk->sk_hash = 0;
		inet_sk(sk)->inet_sport = 0;
		WRITE_ONCE(inet_sk(sk)->inet_num, 0);

		if (tw)
			inet_twsk_bind_unhash(tw, hinfo);
	}

	spin_unlock(&head2->lock);
	if (tb_created)
		inet_bind_bucket_destroy(tb);
	spin_unlock(&head->lock);

	if (tw)
		inet_twsk_deschedule_put(tw);

	local_bh_enable();

	return -ENOMEM;
}
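/* Why the scan above skips buckets with fastreuse/fastreuseport >= 0
 * (a note based on the conventions in this file): 0 or 1 means the
 * bucket holds at least one bind()ed socket, so the port cannot be
 * shared safely by an outgoing connection. Only buckets whose owners
 * are all connect()-bound (the -1 sentinel) are considered, and then
 * only if check_established() proves the 4-tuple is unique.
 */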
/*
 * Bind a port for a connect operation and hash it.
 */
int inet_hash_connect(struct inet_timewait_death_row *death_row,
		      struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	const struct net *net = sock_net(sk);
	u64 port_offset = 0;
	u32 hash_port0;

	if (!inet_sk(sk)->inet_num)
		port_offset = inet_sk_port_offset(sk);

	inet_init_ehash_secret();

	hash_port0 = inet_ehashfn(net, inet->inet_rcv_saddr, 0,
				  inet->inet_daddr, inet->inet_dport);

	return __inet_hash_connect(death_row, sk, port_offset, hash_port0,
				   __inet_check_established);
}

void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name,
				unsigned long numentries, int scale,
				unsigned long low_limit,
				unsigned long high_limit)
{
	unsigned int i;

	h->lhash2 = alloc_large_system_hash(name,
					    sizeof(*h->lhash2),
					    numentries,
					    scale,
					    0,
					    NULL,
					    &h->lhash2_mask,
					    low_limit,
					    high_limit);

	for (i = 0; i <= h->lhash2_mask; i++) {
		spin_lock_init(&h->lhash2[i].lock);
		INIT_HLIST_NULLS_HEAD(&h->lhash2[i].nulls_head,
				      i + LISTENING_NULLS_BASE);
	}

	/* this one is used for source ports of outgoing connections */
	table_perturb = alloc_large_system_hash("Table-perturb",
						sizeof(*table_perturb),
						INET_TABLE_PERTURB_SIZE,
						0, 0, NULL, NULL,
						INET_TABLE_PERTURB_SIZE,
						INET_TABLE_PERTURB_SIZE);
}
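/* Example sizing for the lock array allocated below (an illustration,
 * assuming 64-byte cache lines, 4-byte spinlocks, 16 possible CPUs and
 * one NUMA node with 4096-byte pages):
 *
 *	nblocks = max(2 * 64 / 4, 1) * 16	= 512
 *	nblocks = max(512, 1 * 4096 / 4)	= 1024
 *
 * then rounded up to a power of two and capped at ehash_mask + 1.
 */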
int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
{
	unsigned int locksz = sizeof(spinlock_t);
	unsigned int i, nblocks = 1;
	spinlock_t *ptr = NULL;

	if (locksz == 0)
		goto set_mask;

	/* Allocate 2 cache lines or at least one spinlock per cpu. */
	nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U) * num_possible_cpus();

	/* At least one page per NUMA node. */
	nblocks = max(nblocks, num_online_nodes() * PAGE_SIZE / locksz);

	nblocks = roundup_pow_of_two(nblocks);

	/* No more locks than number of hash buckets. */
	nblocks = min(nblocks, hashinfo->ehash_mask + 1);

	if (num_online_nodes() > 1) {
		/* Use vmalloc() to allow NUMA policy to spread pages
		 * on all available nodes if desired.
		 */
		ptr = vmalloc_array(nblocks, locksz);
	}
	if (!ptr) {
		ptr = kvmalloc_array(nblocks, locksz, GFP_KERNEL);
		if (!ptr)
			return -ENOMEM;
	}
	for (i = 0; i < nblocks; i++)
		spin_lock_init(&ptr[i]);
	hashinfo->ehash_locks = ptr;
set_mask:
	hashinfo->ehash_locks_mask = nblocks - 1;
	return 0;
}

struct inet_hashinfo *inet_pernet_hashinfo_alloc(struct inet_hashinfo *hashinfo,
						 unsigned int ehash_entries)
{
	struct inet_hashinfo *new_hashinfo;
	int i;

	new_hashinfo = kmemdup(hashinfo, sizeof(*hashinfo), GFP_KERNEL);
	if (!new_hashinfo)
		goto err;

	new_hashinfo->ehash = vmalloc_huge(ehash_entries * sizeof(struct inet_ehash_bucket),
					   GFP_KERNEL_ACCOUNT);
	if (!new_hashinfo->ehash)
		goto free_hashinfo;

	new_hashinfo->ehash_mask = ehash_entries - 1;

	if (inet_ehash_locks_alloc(new_hashinfo))
		goto free_ehash;

	for (i = 0; i < ehash_entries; i++)
		INIT_HLIST_NULLS_HEAD(&new_hashinfo->ehash[i].chain, i);

	new_hashinfo->pernet = true;

	return new_hashinfo;

free_ehash:
	vfree(new_hashinfo->ehash);
free_hashinfo:
	kfree(new_hashinfo);
err:
	return NULL;
}

void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo)
{
	if (!hashinfo->pernet)
		return;

	inet_ehash_locks_free(hashinfo);
	vfree(hashinfo->ehash);
	kfree(hashinfo);
}