// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic INET transport hashtables
 *
 * Authors:	Lotsa people, from code originally in tcp
 */

#include <linux/module.h>
#include <linux/random.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/wait.h>
#include <linux/vmalloc.h>
#include <linux/memblock.h>

#include <net/addrconf.h>
#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <net/inet6_hashtables.h>
#endif
#include <net/secure_seq.h>
#include <net/ip.h>
#include <net/tcp.h>
#include <net/sock_reuseport.h>

static u32 inet_ehashfn(const struct net *net, const __be32 laddr,
			const __u16 lport, const __be32 faddr,
			const __be16 fport)
{
	static u32 inet_ehash_secret __read_mostly;

	net_get_random_once(&inet_ehash_secret, sizeof(inet_ehash_secret));

	return __inet_ehashfn(laddr, lport, faddr, fport,
			      inet_ehash_secret + net_hash_mix(net));
}

/* This function handles inet_sock, but also timewait and request sockets
 * for IPv4/IPv6.
 */
static u32 sk_ehashfn(const struct sock *sk)
{
#if IS_ENABLED(CONFIG_IPV6)
	if (sk->sk_family == AF_INET6 &&
	    !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		return inet6_ehashfn(sock_net(sk),
				     &sk->sk_v6_rcv_saddr, sk->sk_num,
				     &sk->sk_v6_daddr, sk->sk_dport);
#endif
	return inet_ehashfn(sock_net(sk),
			    sk->sk_rcv_saddr, sk->sk_num,
			    sk->sk_daddr, sk->sk_dport);
}

/*
 * Allocate and initialize a new local port bind bucket.
 * The bindhash mutex for snum's hash chain must be held here.
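 * Allocation is GFP_ATOMIC because the caller already holds the bucket lock.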
 */
struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
						 struct net *net,
						 struct inet_bind_hashbucket *head,
						 const unsigned short snum,
						 int l3mdev)
{
	struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);

	if (tb) {
		write_pnet(&tb->ib_net, net);
		tb->l3mdev = l3mdev;
		tb->port = snum;
		tb->fastreuse = 0;
		tb->fastreuseport = 0;
		INIT_HLIST_HEAD(&tb->owners);
		hlist_add_head(&tb->node, &head->chain);
	}
	return tb;
}

/*
 * Caller must hold hashbucket lock for this tb with local BH disabled
 */
void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb)
{
	if (hlist_empty(&tb->owners)) {
		__hlist_del(&tb->node);
		kmem_cache_free(cachep, tb);
	}
}

bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, const struct net *net,
			    unsigned short port, int l3mdev)
{
	return net_eq(ib_net(tb), net) && tb->port == port &&
	       tb->l3mdev == l3mdev;
}

static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb,
				   struct net *net,
				   struct inet_bind_hashbucket *head,
				   unsigned short port, int l3mdev,
				   const struct sock *sk)
{
	write_pnet(&tb->ib_net, net);
	tb->l3mdev = l3mdev;
	tb->port = port;
#if IS_ENABLED(CONFIG_IPV6)
	if (sk->sk_family == AF_INET6)
		tb->v6_rcv_saddr = sk->sk_v6_rcv_saddr;
	else
#endif
		tb->rcv_saddr = sk->sk_rcv_saddr;
	INIT_HLIST_HEAD(&tb->owners);
	hlist_add_head(&tb->node, &head->chain);
}

struct inet_bind2_bucket *inet_bind2_bucket_create(struct kmem_cache *cachep,
						   struct net *net,
						   struct inet_bind_hashbucket *head,
						   unsigned short port,
						   int l3mdev,
						   const struct sock *sk)
{
	struct inet_bind2_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);

	if (tb)
		inet_bind2_bucket_init(tb, net, head, port, l3mdev, sk);

	return tb;
}

/* Caller must hold hashbucket lock for this tb with local BH disabled */
void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb)
{
	if (hlist_empty(&tb->owners)) {
		__hlist_del(&tb->node);
		kmem_cache_free(cachep, tb);
	}
}

static bool inet_bind2_bucket_addr_match(const struct inet_bind2_bucket *tb2,
					 const struct sock *sk)
{
#if IS_ENABLED(CONFIG_IPV6)
	if (sk->sk_family == AF_INET6)
		return ipv6_addr_equal(&tb2->v6_rcv_saddr,
				       &sk->sk_v6_rcv_saddr);
#endif
	return tb2->rcv_saddr == sk->sk_rcv_saddr;
}

void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
		    struct inet_bind2_bucket *tb2, unsigned short port)
{
	inet_sk(sk)->inet_num = port;
	sk_add_bind_node(sk, &tb->owners);
	inet_csk(sk)->icsk_bind_hash = tb;
	sk_add_bind2_node(sk, &tb2->owners);
	inet_csk(sk)->icsk_bind2_hash = tb2;
}

/*
 * Get rid of any references to a local port held by the given sock.
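 * Both the bhash and bhash2 entries are removed; buckets left empty are freed.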
 */
static void __inet_put_port(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk);
	struct inet_bind_hashbucket *head, *head2;
	struct net *net = sock_net(sk);
	struct inet_bind_bucket *tb;
	int bhash;

	bhash = inet_bhashfn(net, inet_sk(sk)->inet_num, hashinfo->bhash_size);
	head = &hashinfo->bhash[bhash];
	head2 = inet_bhashfn_portaddr(hashinfo, sk, net, inet_sk(sk)->inet_num);

	spin_lock(&head->lock);
	tb = inet_csk(sk)->icsk_bind_hash;
	__sk_del_bind_node(sk);
	inet_csk(sk)->icsk_bind_hash = NULL;
	inet_sk(sk)->inet_num = 0;
	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);

	spin_lock(&head2->lock);
	if (inet_csk(sk)->icsk_bind2_hash) {
		struct inet_bind2_bucket *tb2 = inet_csk(sk)->icsk_bind2_hash;

		__sk_del_bind2_node(sk);
		inet_csk(sk)->icsk_bind2_hash = NULL;
		inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2);
	}
	spin_unlock(&head2->lock);

	spin_unlock(&head->lock);
}

void inet_put_port(struct sock *sk)
{
	local_bh_disable();
	__inet_put_port(sk);
	local_bh_enable();
}
EXPORT_SYMBOL(inet_put_port);

int __inet_inherit_port(const struct sock *sk, struct sock *child)
{
	struct inet_hashinfo *table = tcp_or_dccp_get_hashinfo(sk);
	unsigned short port = inet_sk(child)->inet_num;
	struct inet_bind_hashbucket *head, *head2;
	bool created_inet_bind_bucket = false;
	struct net *net = sock_net(sk);
	bool update_fastreuse = false;
	struct inet_bind2_bucket *tb2;
	struct inet_bind_bucket *tb;
	int bhash, l3mdev;

	bhash = inet_bhashfn(net, port, table->bhash_size);
	head = &table->bhash[bhash];
	head2 = inet_bhashfn_portaddr(table, child, net, port);

	spin_lock(&head->lock);
	spin_lock(&head2->lock);
	tb = inet_csk(sk)->icsk_bind_hash;
	tb2 = inet_csk(sk)->icsk_bind2_hash;
	if (unlikely(!tb || !tb2)) {
		spin_unlock(&head2->lock);
		spin_unlock(&head->lock);
		return -ENOENT;
	}
	if (tb->port != port) {
		l3mdev = inet_sk_bound_l3mdev(sk);

		/* NOTE: using tproxy and redirecting skbs to a proxy
		 * on a different listener port breaks the assumption
		 * that the listener socket's icsk_bind_hash is the same
		 * as that of the child socket. We have to look up or
		 * create a new bind bucket for the child here.
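		 * The matching bhash2 bucket is then found or created
		 * at the bhash2_find label below.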
		 */
		inet_bind_bucket_for_each(tb, &head->chain) {
			if (inet_bind_bucket_match(tb, net, port, l3mdev))
				break;
		}
		if (!tb) {
			tb = inet_bind_bucket_create(table->bind_bucket_cachep,
						     net, head, port, l3mdev);
			if (!tb) {
				spin_unlock(&head2->lock);
				spin_unlock(&head->lock);
				return -ENOMEM;
			}
			created_inet_bind_bucket = true;
		}
		update_fastreuse = true;

		goto bhash2_find;
	} else if (!inet_bind2_bucket_addr_match(tb2, child)) {
		l3mdev = inet_sk_bound_l3mdev(sk);

bhash2_find:
		tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, child);
		if (!tb2) {
			tb2 = inet_bind2_bucket_create(table->bind2_bucket_cachep,
						       net, head2, port,
						       l3mdev, child);
			if (!tb2)
				goto error;
		}
	}
	if (update_fastreuse)
		inet_csk_update_fastreuse(tb, child);
	inet_bind_hash(child, tb, tb2, port);
	spin_unlock(&head2->lock);
	spin_unlock(&head->lock);

	return 0;

error:
	if (created_inet_bind_bucket)
		inet_bind_bucket_destroy(table->bind_bucket_cachep, tb);
	spin_unlock(&head2->lock);
	spin_unlock(&head->lock);
	return -ENOMEM;
}
EXPORT_SYMBOL_GPL(__inet_inherit_port);

static struct inet_listen_hashbucket *
inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk)
{
	u32 hash;

#if IS_ENABLED(CONFIG_IPV6)
	if (sk->sk_family == AF_INET6)
		hash = ipv6_portaddr_hash(sock_net(sk),
					  &sk->sk_v6_rcv_saddr,
					  inet_sk(sk)->inet_num);
	else
#endif
		hash = ipv4_portaddr_hash(sock_net(sk),
					  inet_sk(sk)->inet_rcv_saddr,
					  inet_sk(sk)->inet_num);
	return inet_lhash2_bucket(h, hash);
}

static inline int compute_score(struct sock *sk, struct net *net,
				const unsigned short hnum, const __be32 daddr,
				const int dif, const int sdif)
{
	int score = -1;

	if (net_eq(sock_net(sk), net) && sk->sk_num == hnum &&
	    !ipv6_only_sock(sk)) {
		if (sk->sk_rcv_saddr != daddr)
			return -1;

		if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))
			return -1;
		score = sk->sk_bound_dev_if ? 2 : 1;

		if (sk->sk_family == PF_INET)
			score++;
		if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
			score++;
	}
	return score;
}

static inline struct sock *lookup_reuseport(struct net *net, struct sock *sk,
					    struct sk_buff *skb, int doff,
					    __be32 saddr, __be16 sport,
					    __be32 daddr, unsigned short hnum)
{
	struct sock *reuse_sk = NULL;
	u32 phash;

	if (sk->sk_reuseport) {
		phash = inet_ehashfn(net, daddr, hnum, saddr, sport);
		reuse_sk = reuseport_select_sock(sk, phash, skb, doff);
	}
	return reuse_sk;
}

/*
 * There are some nice properties to exploit here. The BSD API
 * does not allow a listening sock to specify the remote port nor the
 * remote address for the connection. So always assume those are both
 * wildcarded during the search since they can never be otherwise.
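 * Only the local address and port therefore matter when scoring listeners.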
 */

/* called with rcu_read_lock() : No refcount taken on the socket */
static struct sock *inet_lhash2_lookup(struct net *net,
				struct inet_listen_hashbucket *ilb2,
				struct sk_buff *skb, int doff,
				const __be32 saddr, __be16 sport,
				const __be32 daddr, const unsigned short hnum,
				const int dif, const int sdif)
{
	struct sock *sk, *result = NULL;
	struct hlist_nulls_node *node;
	int score, hiscore = 0;

	sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) {
		score = compute_score(sk, net, hnum, daddr, dif, sdif);
		if (score > hiscore) {
			result = lookup_reuseport(net, sk, skb, doff,
						  saddr, sport, daddr, hnum);
			if (result)
				return result;

			result = sk;
			hiscore = score;
		}
	}

	return result;
}

static inline struct sock *inet_lookup_run_bpf(struct net *net,
					       struct inet_hashinfo *hashinfo,
					       struct sk_buff *skb, int doff,
					       __be32 saddr, __be16 sport,
					       __be32 daddr, u16 hnum, const int dif)
{
	struct sock *sk, *reuse_sk;
	bool no_reuseport;

	if (hashinfo != net->ipv4.tcp_death_row.hashinfo)
		return NULL; /* only TCP is supported */

	no_reuseport = bpf_sk_lookup_run_v4(net, IPPROTO_TCP, saddr, sport,
					    daddr, hnum, dif, &sk);
	if (no_reuseport || IS_ERR_OR_NULL(sk))
		return sk;

	reuse_sk = lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum);
	if (reuse_sk)
		sk = reuse_sk;
	return sk;
}

struct sock *__inet_lookup_listener(struct net *net,
				    struct inet_hashinfo *hashinfo,
				    struct sk_buff *skb, int doff,
				    const __be32 saddr, __be16 sport,
				    const __be32 daddr, const unsigned short hnum,
				    const int dif, const int sdif)
{
	struct inet_listen_hashbucket *ilb2;
	struct sock *result = NULL;
	unsigned int hash2;

	/* Lookup redirect from BPF */
	if (static_branch_unlikely(&bpf_sk_lookup_enabled)) {
		result = inet_lookup_run_bpf(net, hashinfo, skb, doff,
					     saddr, sport, daddr, hnum, dif);
		if (result)
			goto done;
	}

	hash2 = ipv4_portaddr_hash(net, daddr, hnum);
	ilb2 = inet_lhash2_bucket(hashinfo, hash2);

	result = inet_lhash2_lookup(net, ilb2, skb, doff,
				    saddr, sport, daddr, hnum,
				    dif, sdif);
	if (result)
		goto done;

	/* Lookup lhash2 with INADDR_ANY */
	hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
	ilb2 = inet_lhash2_bucket(hashinfo, hash2);

	result = inet_lhash2_lookup(net, ilb2, skb, doff,
				    saddr, sport, htonl(INADDR_ANY), hnum,
				    dif, sdif);
done:
	if (IS_ERR(result))
		return NULL;
	return result;
}
EXPORT_SYMBOL_GPL(__inet_lookup_listener);

/* All sockets share common refcount, but have different destructors */
void sock_gen_put(struct sock *sk)
{
	if (!refcount_dec_and_test(&sk->sk_refcnt))
		return;

	if (sk->sk_state == TCP_TIME_WAIT)
		inet_twsk_free(inet_twsk(sk));
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		reqsk_free(inet_reqsk(sk));
	else
		sk_free(sk);
}
EXPORT_SYMBOL_GPL(sock_gen_put);

void sock_edemux(struct sk_buff *skb)
{
	sock_gen_put(skb->sk);
}
EXPORT_SYMBOL(sock_edemux);

struct sock *__inet_lookup_established(struct net *net,
				       struct inet_hashinfo *hashinfo,
				       const __be32 saddr, const __be16 sport,
				       const __be32 daddr, const u16 hnum,
				       const int dif, const int sdif)
{
	INET_ADDR_COOKIE(acookie, saddr, daddr);
	const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
	struct sock *sk;
	const struct hlist_nulls_node *node;
	/* Optimize here for direct hit, only listening connections can
	 * have wildcards anyway.
	 */
	unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
	unsigned int slot = hash & hashinfo->ehash_mask;
	struct inet_ehash_bucket *head = &hashinfo->ehash[slot];

begin:
	sk_nulls_for_each_rcu(sk, node, &head->chain) {
		if (sk->sk_hash != hash)
			continue;
		if (likely(inet_match(net, sk, acookie, ports, dif, sdif))) {
			if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
				goto out;
			if (unlikely(!inet_match(net, sk, acookie,
						 ports, dif, sdif))) {
				sock_gen_put(sk);
				goto begin;
			}
			goto found;
		}
	}
	/*
	 * if the nulls value we got at the end of this lookup is
	 * not the expected one, we must restart lookup.
	 * We probably met an item that was moved to another chain.
	 */
	if (get_nulls_value(node) != slot)
		goto begin;
out:
	sk = NULL;
found:
	return sk;
}
EXPORT_SYMBOL_GPL(__inet_lookup_established);

/* called with local bh disabled */
static int __inet_check_established(struct inet_timewait_death_row *death_row,
				    struct sock *sk, __u16 lport,
				    struct inet_timewait_sock **twp)
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	struct inet_sock *inet = inet_sk(sk);
	__be32 daddr = inet->inet_rcv_saddr;
	__be32 saddr = inet->inet_daddr;
	int dif = sk->sk_bound_dev_if;
	struct net *net = sock_net(sk);
	int sdif = l3mdev_master_ifindex_by_index(net, dif);
	INET_ADDR_COOKIE(acookie, saddr, daddr);
	const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
	unsigned int hash = inet_ehashfn(net, daddr, lport,
					 saddr, inet->inet_dport);
	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
	spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
	struct sock *sk2;
	const struct hlist_nulls_node *node;
	struct inet_timewait_sock *tw = NULL;

	spin_lock(lock);

	sk_nulls_for_each(sk2, node, &head->chain) {
		if (sk2->sk_hash != hash)
			continue;

		if (likely(inet_match(net, sk2, acookie, ports, dif, sdif))) {
			if (sk2->sk_state == TCP_TIME_WAIT) {
				tw = inet_twsk(sk2);
				if (twsk_unique(sk, sk2, twp))
					break;
			}
			goto not_unique;
		}
	}

	/* Must record num and sport now. Otherwise we will see
	 * a socket with a funny identity in the hash table.
	 */
	inet->inet_num = lport;
	inet->inet_sport = htons(lport);
	sk->sk_hash = hash;
	WARN_ON(!sk_unhashed(sk));
	__sk_nulls_add_node_rcu(sk, &head->chain);
	if (tw) {
		sk_nulls_del_node_init_rcu((struct sock *)tw);
		__NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED);
	}
	spin_unlock(lock);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);

	if (twp) {
		*twp = tw;
	} else if (tw) {
		/* Silly. Should hash-dance instead... */
		inet_twsk_deschedule_put(tw);
	}
	return 0;

not_unique:
	spin_unlock(lock);
	return -EADDRNOTAVAIL;
}

static u64 inet_sk_port_offset(const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);

	return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr,
					  inet->inet_daddr,
					  inet->inet_dport);
}

/* Searches for an existing socket in the ehash bucket list.
 * Returns true if found, false otherwise.
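 * Caller must hold the ehash bucket lock covering @list.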
 */
static bool inet_ehash_lookup_by_sk(struct sock *sk,
				    struct hlist_nulls_head *list)
{
	const __portpair ports = INET_COMBINED_PORTS(sk->sk_dport, sk->sk_num);
	const int sdif = sk->sk_bound_dev_if;
	const int dif = sk->sk_bound_dev_if;
	const struct hlist_nulls_node *node;
	struct net *net = sock_net(sk);
	struct sock *esk;

	INET_ADDR_COOKIE(acookie, sk->sk_daddr, sk->sk_rcv_saddr);

	sk_nulls_for_each_rcu(esk, node, list) {
		if (esk->sk_hash != sk->sk_hash)
			continue;
		if (sk->sk_family == AF_INET) {
			if (unlikely(inet_match(net, esk, acookie,
						ports, dif, sdif))) {
				return true;
			}
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (sk->sk_family == AF_INET6) {
			if (unlikely(inet6_match(net, esk,
						 &sk->sk_v6_daddr,
						 &sk->sk_v6_rcv_saddr,
						 ports, dif, sdif))) {
				return true;
			}
		}
#endif
	}
	return false;
}

/* Insert a socket into ehash, and possibly remove another one
 * (the other one may be a SYN_RECV or TIMEWAIT socket).
 * If a matching socket already exists, sk is not inserted
 * and the found_dup_sk parameter is set to true.
 */
bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk)
{
	struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk);
	struct inet_ehash_bucket *head;
	struct hlist_nulls_head *list;
	spinlock_t *lock;
	bool ret = true;

	WARN_ON_ONCE(!sk_unhashed(sk));

	sk->sk_hash = sk_ehashfn(sk);
	head = inet_ehash_bucket(hashinfo, sk->sk_hash);
	list = &head->chain;
	lock = inet_ehash_lockp(hashinfo, sk->sk_hash);

	spin_lock(lock);
	if (osk) {
		WARN_ON_ONCE(sk->sk_hash != osk->sk_hash);
		ret = sk_nulls_del_node_init_rcu(osk);
	} else if (found_dup_sk) {
		*found_dup_sk = inet_ehash_lookup_by_sk(sk, list);
		if (*found_dup_sk)
			ret = false;
	}

	if (ret)
		__sk_nulls_add_node_rcu(sk, list);

	spin_unlock(lock);

	return ret;
}

bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk)
{
	bool ok = inet_ehash_insert(sk, osk, found_dup_sk);

	if (ok) {
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	} else {
		this_cpu_inc(*sk->sk_prot->orphan_count);
		inet_sk_set_state(sk, TCP_CLOSE);
		sock_set_flag(sk, SOCK_DEAD);
		inet_csk_destroy_sock(sk);
	}
	return ok;
}
EXPORT_SYMBOL_GPL(inet_ehash_nolisten);

static int inet_reuseport_add_sock(struct sock *sk,
				   struct inet_listen_hashbucket *ilb)
{
	struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash;
	const struct hlist_nulls_node *node;
	struct sock *sk2;
	kuid_t uid = sock_i_uid(sk);

	sk_nulls_for_each_rcu(sk2, node, &ilb->nulls_head) {
		if (sk2 != sk &&
		    sk2->sk_family == sk->sk_family &&
		    ipv6_only_sock(sk2) == ipv6_only_sock(sk) &&
		    sk2->sk_bound_dev_if == sk->sk_bound_dev_if &&
		    inet_csk(sk2)->icsk_bind_hash == tb &&
		    sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
		    inet_rcv_saddr_equal(sk, sk2, false))
			return reuseport_add_sock(sk, sk2,
						  inet_rcv_saddr_any(sk));
	}

	return reuseport_alloc(sk, inet_rcv_saddr_any(sk));
}

int __inet_hash(struct sock *sk, struct sock *osk)
{
	struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk);
	struct inet_listen_hashbucket *ilb2;
	int err = 0;

	if (sk->sk_state != TCP_LISTEN) {
		local_bh_disable();
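		/* Non-listening sockets go straight into the established hash. */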
		inet_ehash_nolisten(sk, osk, NULL);
		local_bh_enable();
		return 0;
	}
	WARN_ON(!sk_unhashed(sk));
	ilb2 = inet_lhash2_bucket_sk(hashinfo, sk);

	spin_lock(&ilb2->lock);
	if (sk->sk_reuseport) {
		err = inet_reuseport_add_sock(sk, ilb2);
		if (err)
			goto unlock;
	}
	if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
	    sk->sk_family == AF_INET6)
		__sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head);
	else
		__sk_nulls_add_node_rcu(sk, &ilb2->nulls_head);
	sock_set_flag(sk, SOCK_RCU_FREE);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
unlock:
	spin_unlock(&ilb2->lock);

	return err;
}
EXPORT_SYMBOL(__inet_hash);

int inet_hash(struct sock *sk)
{
	int err = 0;

	if (sk->sk_state != TCP_CLOSE)
		err = __inet_hash(sk, NULL);

	return err;
}
EXPORT_SYMBOL_GPL(inet_hash);

void inet_unhash(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk);

	if (sk_unhashed(sk))
		return;

	if (sk->sk_state == TCP_LISTEN) {
		struct inet_listen_hashbucket *ilb2;

		ilb2 = inet_lhash2_bucket_sk(hashinfo, sk);
		/* Don't disable bottom halves while acquiring the lock to
		 * avoid circular locking dependency on PREEMPT_RT.
		 */
		spin_lock(&ilb2->lock);
		if (sk_unhashed(sk)) {
			spin_unlock(&ilb2->lock);
			return;
		}

		if (rcu_access_pointer(sk->sk_reuseport_cb))
			reuseport_stop_listen_sock(sk);

		__sk_nulls_del_node_init_rcu(sk);
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
		spin_unlock(&ilb2->lock);
	} else {
		spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);

		spin_lock_bh(lock);
		if (sk_unhashed(sk)) {
			spin_unlock_bh(lock);
			return;
		}
		__sk_nulls_del_node_init_rcu(sk);
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
		spin_unlock_bh(lock);
	}
}
EXPORT_SYMBOL_GPL(inet_unhash);

static bool inet_bind2_bucket_match(const struct inet_bind2_bucket *tb,
				    const struct net *net, unsigned short port,
				    int l3mdev, const struct sock *sk)
{
#if IS_ENABLED(CONFIG_IPV6)
	if (sk->sk_family == AF_INET6)
		return net_eq(ib2_net(tb), net) && tb->port == port &&
			tb->l3mdev == l3mdev &&
			ipv6_addr_equal(&tb->v6_rcv_saddr, &sk->sk_v6_rcv_saddr);
	else
#endif
		return net_eq(ib2_net(tb), net) && tb->port == port &&
			tb->l3mdev == l3mdev && tb->rcv_saddr == sk->sk_rcv_saddr;
}

bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb, const struct net *net,
				      unsigned short port, int l3mdev, const struct sock *sk)
{
#if IS_ENABLED(CONFIG_IPV6)
	struct in6_addr addr_any = {};

	if (sk->sk_family == AF_INET6)
		return net_eq(ib2_net(tb), net) && tb->port == port &&
			tb->l3mdev == l3mdev &&
			ipv6_addr_equal(&tb->v6_rcv_saddr, &addr_any);
	else
#endif
		return net_eq(ib2_net(tb), net) && tb->port == port &&
			tb->l3mdev == l3mdev && tb->rcv_saddr == 0;
}

/* The socket's bhash2 hashbucket spinlock must be held when this is called */
struct inet_bind2_bucket *
inet_bind2_bucket_find(const struct inet_bind_hashbucket *head, const struct net *net,
		       unsigned short port, int l3mdev, const struct sock *sk)
{
	struct inet_bind2_bucket *bhash2 = NULL;

	inet_bind_bucket_for_each(bhash2, &head->chain)
		if (inet_bind2_bucket_match(bhash2, net, port, l3mdev, sk))
			break;

	return bhash2;
}

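/* Return the bhash2 bucket that holds wildcard (INADDR_ANY / in6addr_any)
 * binds to @port in @net.
 */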
struct inet_bind_hashbucket *
inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, int port)
{
	struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk);
	u32 hash;
#if IS_ENABLED(CONFIG_IPV6)
	struct in6_addr addr_any = {};

	if (sk->sk_family == AF_INET6)
		hash = ipv6_portaddr_hash(net, &addr_any, port);
	else
#endif
		hash = ipv4_portaddr_hash(net, 0, port);

	return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)];
}

int inet_bhash2_update_saddr(struct inet_bind_hashbucket *prev_saddr, struct sock *sk)
{
	struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk);
	struct inet_bind2_bucket *tb2, *new_tb2;
	int l3mdev = inet_sk_bound_l3mdev(sk);
	struct inet_bind_hashbucket *head2;
	int port = inet_sk(sk)->inet_num;
	struct net *net = sock_net(sk);

	/* Allocate a bind2 bucket ahead of time to avoid permanently putting
	 * the bhash2 table in an inconsistent state if a new tb2 bucket
	 * allocation fails.
	 */
	new_tb2 = kmem_cache_alloc(hinfo->bind2_bucket_cachep, GFP_ATOMIC);
	if (!new_tb2)
		return -ENOMEM;

	head2 = inet_bhashfn_portaddr(hinfo, sk, net, port);

	if (prev_saddr) {
		spin_lock_bh(&prev_saddr->lock);
		__sk_del_bind2_node(sk);
		inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep,
					  inet_csk(sk)->icsk_bind2_hash);
		spin_unlock_bh(&prev_saddr->lock);
	}

	spin_lock_bh(&head2->lock);
	tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk);
	if (!tb2) {
		tb2 = new_tb2;
		inet_bind2_bucket_init(tb2, net, head2, port, l3mdev, sk);
	}
	sk_add_bind2_node(sk, &tb2->owners);
	inet_csk(sk)->icsk_bind2_hash = tb2;
	spin_unlock_bh(&head2->lock);

	if (tb2 != new_tb2)
		kmem_cache_free(hinfo->bind2_bucket_cachep, new_tb2);

	return 0;
}
EXPORT_SYMBOL_GPL(inet_bhash2_update_saddr);

/* RFC 6056 3.3.4. Algorithm 4: Double-Hash Port Selection Algorithm
 * Note that we use 32bit integers (vs RFC 'short integers')
 * because 2^16 is not a multiple of num_ephemeral and this
 * property might be used by a clever attacker.
 * The RFC claims using TABLE_LENGTH=10 buckets gives an improvement, but
 * attacks have since been demonstrated, so we use 65536 instead to really
 * give more isolation and privacy, at the expense of 256kB of kernel
 * memory.
 */
#define INET_TABLE_PERTURB_SHIFT 16
#define INET_TABLE_PERTURB_SIZE (1 << INET_TABLE_PERTURB_SHIFT)
static u32 *table_perturb;

int __inet_hash_connect(struct inet_timewait_death_row *death_row,
		struct sock *sk, u64 port_offset,
		int (*check_established)(struct inet_timewait_death_row *,
			struct sock *, __u16, struct inet_timewait_sock **))
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	struct inet_bind_hashbucket *head, *head2;
	struct inet_timewait_sock *tw = NULL;
	int port = inet_sk(sk)->inet_num;
	struct net *net = sock_net(sk);
	struct inet_bind2_bucket *tb2;
	struct inet_bind_bucket *tb;
	bool tb_created = false;
	u32 remaining, offset;
	int ret, i, low, high;
	int l3mdev;
	u32 index;

	if (port) {
		head = &hinfo->bhash[inet_bhashfn(net, port,
						  hinfo->bhash_size)];
		tb = inet_csk(sk)->icsk_bind_hash;
		spin_lock_bh(&head->lock);
		if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
			inet_ehash_nolisten(sk, NULL, NULL);
			spin_unlock_bh(&head->lock);
			return 0;
		}
		spin_unlock(&head->lock);
		/* No definite answer... Walk to established hash table */
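		/* BH stays disabled here; local_bh_enable() follows check_established(). */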
		ret = check_established(death_row, sk, port, NULL);
		local_bh_enable();
		return ret;
	}

	l3mdev = inet_sk_bound_l3mdev(sk);

	inet_get_local_port_range(net, &low, &high);
	high++; /* [32768, 60999] -> [32768, 61000[ */
	remaining = high - low;
	if (likely(remaining > 1))
		remaining &= ~1U;

	net_get_random_once(table_perturb,
			    INET_TABLE_PERTURB_SIZE * sizeof(*table_perturb));
	index = port_offset & (INET_TABLE_PERTURB_SIZE - 1);

	offset = READ_ONCE(table_perturb[index]) + (port_offset >> 32);
	offset %= remaining;

	/* In the first pass we try ports of @low parity.
	 * inet_csk_get_port() does the opposite choice.
	 */
	offset &= ~1U;
other_parity_scan:
	port = low + offset;
	for (i = 0; i < remaining; i += 2, port += 2) {
		if (unlikely(port >= high))
			port -= remaining;
		if (inet_is_local_reserved_port(net, port))
			continue;
		head = &hinfo->bhash[inet_bhashfn(net, port,
						  hinfo->bhash_size)];
		spin_lock_bh(&head->lock);

		/* Does not bother with rcv_saddr checks, because
		 * the established check is already unique enough.
		 */
		inet_bind_bucket_for_each(tb, &head->chain) {
			if (inet_bind_bucket_match(tb, net, port, l3mdev)) {
				if (tb->fastreuse >= 0 ||
				    tb->fastreuseport >= 0)
					goto next_port;
				WARN_ON(hlist_empty(&tb->owners));
				if (!check_established(death_row, sk,
						       port, &tw))
					goto ok;
				goto next_port;
			}
		}

		tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
					     net, head, port, l3mdev);
		if (!tb) {
			spin_unlock_bh(&head->lock);
			return -ENOMEM;
		}
		tb_created = true;
		tb->fastreuse = -1;
		tb->fastreuseport = -1;
		goto ok;
next_port:
		spin_unlock_bh(&head->lock);
		cond_resched();
	}

	offset++;
	if ((offset & 1) && remaining > 1)
		goto other_parity_scan;

	return -EADDRNOTAVAIL;

ok:
	/* Find the corresponding tb2 bucket since we need to
	 * add the socket to the bhash2 table as well
	 */
	head2 = inet_bhashfn_portaddr(hinfo, sk, net, port);
	spin_lock(&head2->lock);

	tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk);
	if (!tb2) {
		tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep, net,
					       head2, port, l3mdev, sk);
		if (!tb2)
			goto error;
	}

	/* Here we want to add a little bit of randomness to the next source
	 * port that will be chosen. We use a max() with a random here so that
	 * on low contention the randomness is maximal and on high contention
	 * it may be nonexistent.
	 */
	i = max_t(int, i, (prandom_u32() & 7) * 2);
	WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + 2);

	/* Head lock still held and bh's disabled */
	inet_bind_hash(sk, tb, tb2, port);

	spin_unlock(&head2->lock);

	if (sk_unhashed(sk)) {
		inet_sk(sk)->inet_sport = htons(port);
		inet_ehash_nolisten(sk, (struct sock *)tw, NULL);
	}
	if (tw)
		inet_twsk_bind_unhash(tw, hinfo);
	spin_unlock(&head->lock);
	if (tw)
		inet_twsk_deschedule_put(tw);
	local_bh_enable();
	return 0;

error:
	spin_unlock(&head2->lock);
	if (tb_created)
		inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, tb);
	spin_unlock_bh(&head->lock);
	return -ENOMEM;
}

/*
 * Bind a port for a connect operation and hash it.
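 * If the socket is not already bound to a port, the ephemeral port search
 * starts from a secure per-destination offset (inet_sk_port_offset()).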
 */
int inet_hash_connect(struct inet_timewait_death_row *death_row,
		      struct sock *sk)
{
	u64 port_offset = 0;

	if (!inet_sk(sk)->inet_num)
		port_offset = inet_sk_port_offset(sk);
	return __inet_hash_connect(death_row, sk, port_offset,
				   __inet_check_established);
}
EXPORT_SYMBOL_GPL(inet_hash_connect);

static void init_hashinfo_lhash2(struct inet_hashinfo *h)
{
	int i;

	for (i = 0; i <= h->lhash2_mask; i++) {
		spin_lock_init(&h->lhash2[i].lock);
		INIT_HLIST_NULLS_HEAD(&h->lhash2[i].nulls_head,
				      i + LISTENING_NULLS_BASE);
	}
}

void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name,
				unsigned long numentries, int scale,
				unsigned long low_limit,
				unsigned long high_limit)
{
	h->lhash2 = alloc_large_system_hash(name,
					    sizeof(*h->lhash2),
					    numentries,
					    scale,
					    0,
					    NULL,
					    &h->lhash2_mask,
					    low_limit,
					    high_limit);
	init_hashinfo_lhash2(h);

	/* this one is used for source ports of outgoing connections */
	table_perturb = alloc_large_system_hash("Table-perturb",
						sizeof(*table_perturb),
						INET_TABLE_PERTURB_SIZE,
						0, 0, NULL, NULL,
						INET_TABLE_PERTURB_SIZE,
						INET_TABLE_PERTURB_SIZE);
}

int inet_hashinfo2_init_mod(struct inet_hashinfo *h)
{
	h->lhash2 = kmalloc_array(INET_LHTABLE_SIZE, sizeof(*h->lhash2), GFP_KERNEL);
	if (!h->lhash2)
		return -ENOMEM;

	h->lhash2_mask = INET_LHTABLE_SIZE - 1;
	/* INET_LHTABLE_SIZE must be a power of 2 */
	BUG_ON(INET_LHTABLE_SIZE & h->lhash2_mask);

	init_hashinfo_lhash2(h);
	return 0;
}
EXPORT_SYMBOL_GPL(inet_hashinfo2_init_mod);

int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
{
	unsigned int locksz = sizeof(spinlock_t);
	unsigned int i, nblocks = 1;

	if (locksz != 0) {
		/* allocate 2 cache lines or at least one spinlock per cpu */
		nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U);
		nblocks = roundup_pow_of_two(nblocks * num_possible_cpus());

		/* no more locks than number of hash buckets */
		nblocks = min(nblocks, hashinfo->ehash_mask + 1);

		hashinfo->ehash_locks = kvmalloc_array(nblocks, locksz, GFP_KERNEL);
		if (!hashinfo->ehash_locks)
			return -ENOMEM;

		for (i = 0; i < nblocks; i++)
			spin_lock_init(&hashinfo->ehash_locks[i]);
	}
	hashinfo->ehash_locks_mask = nblocks - 1;
	return 0;
}
EXPORT_SYMBOL_GPL(inet_ehash_locks_alloc);