/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic INET transport hashtables
 *
 * Authors:	Lotsa people, from code originally in tcp
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <linux/random.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/wait.h>

#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
#include <net/ip.h>

/*
 * Allocate and initialize a new local port bind bucket.
 * The bindhash mutex for snum's hash chain must be held here.
 */
struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
						 struct net *net,
						 struct inet_bind_hashbucket *head,
						 const unsigned short snum)
{
	struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);

	if (tb != NULL) {
		write_pnet(&tb->ib_net, hold_net(net));
		tb->port      = snum;
		tb->fastreuse = 0;
		tb->num_owners = 0;
		INIT_HLIST_HEAD(&tb->owners);
		hlist_add_head(&tb->node, &head->chain);
	}
	return tb;
}

/*
 * Caller must hold hashbucket lock for this tb with local BH disabled
 */
void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb)
{
	if (hlist_empty(&tb->owners)) {
		__hlist_del(&tb->node);
		release_net(ib_net(tb));
		kmem_cache_free(cachep, tb);
	}
}

void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
		    const unsigned short snum)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;

	atomic_inc(&hashinfo->bsockets);

	inet_sk(sk)->inet_num = snum;
	sk_add_bind_node(sk, &tb->owners);
	tb->num_owners++;
	inet_csk(sk)->icsk_bind_hash = tb;
}
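
/*
 * Usage sketch (illustrative only, not part of the original file): a
 * bind()-time caller that finds no bucket for its port on the bhash
 * chain pairs inet_bind_bucket_create() with inet_bind_hash() under the
 * bucket lock.  The function name below is hypothetical.
 */
#if 0
static int example_grab_port(struct sock *sk, struct inet_hashinfo *hinfo,
			     struct inet_bind_hashbucket *head,
			     unsigned short snum)
{
	struct inet_bind_bucket *tb;

	/* head->lock must already be held, with BHs disabled */
	tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
				     sock_net(sk), head, snum);
	if (!tb)
		return -ENOMEM;
	inet_bind_hash(sk, tb, snum);	/* links sk onto tb->owners */
	return 0;
}
#endif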
/*
 * Get rid of any references to a local port held by the given sock.
 */
static void __inet_put_port(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->inet_num,
			hashinfo->bhash_size);
	struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
	struct inet_bind_bucket *tb;

	atomic_dec(&hashinfo->bsockets);

	spin_lock(&head->lock);
	tb = inet_csk(sk)->icsk_bind_hash;
	__sk_del_bind_node(sk);
	tb->num_owners--;
	inet_csk(sk)->icsk_bind_hash = NULL;
	inet_sk(sk)->inet_num = 0;
	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
	spin_unlock(&head->lock);
}

void inet_put_port(struct sock *sk)
{
	local_bh_disable();
	__inet_put_port(sk);
	local_bh_enable();
}
EXPORT_SYMBOL(inet_put_port);

void __inet_inherit_port(struct sock *sk, struct sock *child)
{
	struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
	const int bhash = inet_bhashfn(sock_net(sk), inet_sk(child)->inet_num,
			table->bhash_size);
	struct inet_bind_hashbucket *head = &table->bhash[bhash];
	struct inet_bind_bucket *tb;

	spin_lock(&head->lock);
	tb = inet_csk(sk)->icsk_bind_hash;
	sk_add_bind_node(child, &tb->owners);
	inet_csk(child)->icsk_bind_hash = tb;
	spin_unlock(&head->lock);
}
EXPORT_SYMBOL_GPL(__inet_inherit_port);

/*
 * Score a listening socket against an incoming packet: -1 means "no
 * match", otherwise a higher score means a more specific match (an
 * exactly matching bound address and bound device each add 2).
 */
static inline int compute_score(struct sock *sk, struct net *net,
				const unsigned short hnum, const __be32 daddr,
				const int dif)
{
	int score = -1;
	struct inet_sock *inet = inet_sk(sk);

	if (net_eq(sock_net(sk), net) && inet->inet_num == hnum &&
			!ipv6_only_sock(sk)) {
		__be32 rcv_saddr = inet->inet_rcv_saddr;
		score = sk->sk_family == PF_INET ? 1 : 0;
		if (rcv_saddr) {
			if (rcv_saddr != daddr)
				return -1;
			score += 2;
		}
		if (sk->sk_bound_dev_if) {
			if (sk->sk_bound_dev_if != dif)
				return -1;
			score += 2;
		}
	}
	return score;
}

/*
 * Don't inline this cruft. There are some nice properties to exploit here.
 * The BSD API does not allow a listening sock to specify the remote port
 * nor the remote address for the connection. So always assume those are
 * both wildcarded during the search since they can never be otherwise.
 */

struct sock *__inet_lookup_listener(struct net *net,
				    struct inet_hashinfo *hashinfo,
				    const __be32 daddr, const unsigned short hnum,
				    const int dif)
{
	struct sock *sk, *result;
	struct hlist_nulls_node *node;
	unsigned int hash = inet_lhashfn(net, hnum);
	struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
	int score, hiscore;

	rcu_read_lock();
begin:
	result = NULL;
	hiscore = -1;
	sk_nulls_for_each_rcu(sk, node, &ilb->head) {
		score = compute_score(sk, net, hnum, daddr, dif);
		if (score > hiscore) {
			result = sk;
			hiscore = score;
		}
	}
	/*
	 * if the nulls value we got at the end of this lookup is
	 * not the expected one, we must restart lookup.
	 * We probably met an item that was moved to another chain.
	 */
	if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
		goto begin;
	if (result) {
		if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
			result = NULL;
		else if (unlikely(compute_score(result, net, hnum, daddr,
				  dif) < hiscore)) {
			sock_put(result);
			goto begin;
		}
	}
	rcu_read_unlock();
	return result;
}
EXPORT_SYMBOL_GPL(__inet_lookup_listener);
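
/*
 * Usage sketch (illustrative only): resolving an incoming SYN to a
 * listener.  In the real tree callers normally go through the
 * inet_lookup_listener() wrapper; the hypothetical helper below calls
 * the exported function directly.
 */
#if 0
static struct sock *example_find_listener(struct net *net,
					  struct inet_hashinfo *hinfo,
					  __be32 daddr, __be16 dport, int dif)
{
	/* The port argument (hnum) must be in host byte order. */
	return __inet_lookup_listener(net, hinfo, daddr, ntohs(dport), dif);
}
#endif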
struct sock * __inet_lookup_established(struct net *net,
				  struct inet_hashinfo *hashinfo,
				  const __be32 saddr, const __be16 sport,
				  const __be32 daddr, const u16 hnum,
				  const int dif)
{
	INET_ADDR_COOKIE(acookie, saddr, daddr)
	const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
	struct sock *sk;
	const struct hlist_nulls_node *node;
	/* Optimize here for direct hit, only listening connections can
	 * have wildcards anyways.
	 */
	unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
	unsigned int slot = hash & hashinfo->ehash_mask;
	struct inet_ehash_bucket *head = &hashinfo->ehash[slot];

	rcu_read_lock();
begin:
	sk_nulls_for_each_rcu(sk, node, &head->chain) {
		if (INET_MATCH(sk, net, hash, acookie,
					saddr, daddr, ports, dif)) {
			if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
				goto begintw;
			if (unlikely(!INET_MATCH(sk, net, hash, acookie,
				saddr, daddr, ports, dif))) {
				sock_put(sk);
				goto begin;
			}
			goto out;
		}
	}
	/*
	 * if the nulls value we got at the end of this lookup is
	 * not the expected one, we must restart lookup.
	 * We probably met an item that was moved to another chain.
	 */
	if (get_nulls_value(node) != slot)
		goto begin;

begintw:
	/* Must check for a TIME_WAIT'er before going to listener hash. */
	sk_nulls_for_each_rcu(sk, node, &head->twchain) {
		if (INET_TW_MATCH(sk, net, hash, acookie,
					saddr, daddr, ports, dif)) {
			if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) {
				sk = NULL;
				goto out;
			}
			if (unlikely(!INET_TW_MATCH(sk, net, hash, acookie,
				 saddr, daddr, ports, dif))) {
				sock_put(sk);
				goto begintw;
			}
			goto out;
		}
	}
	/*
	 * if the nulls value we got at the end of this lookup is
	 * not the expected one, we must restart lookup.
	 * We probably met an item that was moved to another chain.
	 */
	if (get_nulls_value(node) != slot)
		goto begintw;
	sk = NULL;
out:
	rcu_read_unlock();
	return sk;
}
EXPORT_SYMBOL_GPL(__inet_lookup_established);
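
/*
 * Note on the repeated INET_MATCH/INET_TW_MATCH tests above (added
 * explanation, not upstream text): established sockets come from a
 * SLAB_DESTROY_BY_RCU cache, so during an RCU walk an entry can be
 * freed and reused for a different connection.  Taking the refcount
 * with atomic_inc_not_zero() and then re-checking the keys guarantees
 * that the socket we return still matches the packet.
 */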
/* called with local bh disabled */
static int __inet_check_established(struct inet_timewait_death_row *death_row,
				    struct sock *sk, __u16 lport,
				    struct inet_timewait_sock **twp)
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	struct inet_sock *inet = inet_sk(sk);
	__be32 daddr = inet->inet_rcv_saddr;
	__be32 saddr = inet->inet_daddr;
	int dif = sk->sk_bound_dev_if;
	INET_ADDR_COOKIE(acookie, saddr, daddr)
	const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
	struct net *net = sock_net(sk);
	unsigned int hash = inet_ehashfn(net, daddr, lport,
					 saddr, inet->inet_dport);
	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
	spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
	struct sock *sk2;
	const struct hlist_nulls_node *node;
	struct inet_timewait_sock *tw;
	int twrefcnt = 0;

	spin_lock(lock);

	/* Check TIME-WAIT sockets first. */
	sk_nulls_for_each(sk2, node, &head->twchain) {
		tw = inet_twsk(sk2);

		if (INET_TW_MATCH(sk2, net, hash, acookie,
					saddr, daddr, ports, dif)) {
			if (twsk_unique(sk, sk2, twp))
				goto unique;
			else
				goto not_unique;
		}
	}
	tw = NULL;

	/* And established part... */
	sk_nulls_for_each(sk2, node, &head->chain) {
		if (INET_MATCH(sk2, net, hash, acookie,
					saddr, daddr, ports, dif))
			goto not_unique;
	}

unique:
	/* Must record num and sport now. Otherwise we will see in the
	 * hash table a socket with a funny identity. */
	inet->inet_num = lport;
	inet->inet_sport = htons(lport);
	sk->sk_hash = hash;
	WARN_ON(!sk_unhashed(sk));
	__sk_nulls_add_node_rcu(sk, &head->chain);
	if (tw) {
		twrefcnt = inet_twsk_unhash(tw);
		NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
	}
	spin_unlock(lock);
	if (twrefcnt)
		inet_twsk_put(tw);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);

	if (twp) {
		*twp = tw;
	} else if (tw) {
		/* Silly. Should hash-dance instead... */
		inet_twsk_deschedule(tw, death_row);

		inet_twsk_put(tw);
	}
	return 0;

not_unique:
	spin_unlock(lock);
	return -EADDRNOTAVAIL;
}

static inline u32 inet_sk_port_offset(const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr,
					  inet->inet_daddr,
					  inet->inet_dport);
}

int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	struct hlist_nulls_head *list;
	spinlock_t *lock;
	struct inet_ehash_bucket *head;
	int twrefcnt = 0;

	WARN_ON(!sk_unhashed(sk));

	sk->sk_hash = inet_sk_ehashfn(sk);
	head = inet_ehash_bucket(hashinfo, sk->sk_hash);
	list = &head->chain;
	lock = inet_ehash_lockp(hashinfo, sk->sk_hash);

	spin_lock(lock);
	__sk_nulls_add_node_rcu(sk, list);
	if (tw) {
		WARN_ON(sk->sk_hash != tw->tw_hash);
		twrefcnt = inet_twsk_unhash(tw);
	}
	spin_unlock(lock);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	return twrefcnt;
}
EXPORT_SYMBOL_GPL(__inet_hash_nolisten);

static void __inet_hash(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	struct inet_listen_hashbucket *ilb;

	if (sk->sk_state != TCP_LISTEN) {
		__inet_hash_nolisten(sk, NULL);
		return;
	}

	WARN_ON(!sk_unhashed(sk));
	ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];

	spin_lock(&ilb->lock);
	__sk_nulls_add_node_rcu(sk, &ilb->head);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	spin_unlock(&ilb->lock);
}

void inet_hash(struct sock *sk)
{
	if (sk->sk_state != TCP_CLOSE) {
		local_bh_disable();
		__inet_hash(sk);
		local_bh_enable();
	}
}
EXPORT_SYMBOL_GPL(inet_hash);

void inet_unhash(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	spinlock_t *lock;
	int done;

	if (sk_unhashed(sk))
		return;

	if (sk->sk_state == TCP_LISTEN)
		lock = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)].lock;
	else
		lock = inet_ehash_lockp(hashinfo, sk->sk_hash);

	spin_lock_bh(lock);
	done = __sk_nulls_del_node_init_rcu(sk);
	if (done)
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
	spin_unlock_bh(lock);
}
EXPORT_SYMBOL_GPL(inet_unhash);
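
/*
 * Ephemeral port selection (explanatory note, not upstream text): when
 * the socket has no local port yet, __inet_hash_connect() below scans
 * the local port range starting at an offset derived from a global
 * rolling hint plus the caller-supplied, 4-tuple-based port_offset:
 *
 *	port = low + (i + hint + port_offset) % remaining;
 *
 * Successive connects thus spread over the range, while different
 * destinations begin their search at different points.
 */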
int __inet_hash_connect(struct inet_timewait_death_row *death_row,
		struct sock *sk, u32 port_offset,
		int (*check_established)(struct inet_timewait_death_row *,
			struct sock *, __u16, struct inet_timewait_sock **),
		int (*hash)(struct sock *sk, struct inet_timewait_sock *twp))
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	const unsigned short snum = inet_sk(sk)->inet_num;
	struct inet_bind_hashbucket *head;
	struct inet_bind_bucket *tb;
	int ret;
	struct net *net = sock_net(sk);
	int twrefcnt = 1;

	if (!snum) {
		int i, remaining, low, high, port;
		static u32 hint;
		u32 offset = hint + port_offset;
		struct hlist_node *node;
		struct inet_timewait_sock *tw = NULL;

		inet_get_local_port_range(&low, &high);
		remaining = (high - low) + 1;

		local_bh_disable();
		for (i = 1; i <= remaining; i++) {
			port = low + (i + offset) % remaining;
			if (inet_is_reserved_local_port(port))
				continue;
			head = &hinfo->bhash[inet_bhashfn(net, port,
					hinfo->bhash_size)];
			spin_lock(&head->lock);

			/* Does not bother with rcv_saddr checks,
			 * because the established check is already
			 * unique enough.
			 */
			inet_bind_bucket_for_each(tb, node, &head->chain) {
				if (net_eq(ib_net(tb), net) &&
				    tb->port == port) {
					if (tb->fastreuse >= 0)
						goto next_port;
					WARN_ON(hlist_empty(&tb->owners));
					if (!check_established(death_row, sk,
							       port, &tw))
						goto ok;
					goto next_port;
				}
			}

			tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
					net, head, port);
			if (!tb) {
				spin_unlock(&head->lock);
				break;
			}
			tb->fastreuse = -1;
			goto ok;

		next_port:
			spin_unlock(&head->lock);
		}
		local_bh_enable();

		return -EADDRNOTAVAIL;

ok:
		hint += i;

		/* Head lock still held and bh's disabled */
		inet_bind_hash(sk, tb, port);
		if (sk_unhashed(sk)) {
			inet_sk(sk)->inet_sport = htons(port);
			twrefcnt += hash(sk, tw);
		}
		if (tw)
			twrefcnt += inet_twsk_bind_unhash(tw, hinfo);
		spin_unlock(&head->lock);

		if (tw) {
			inet_twsk_deschedule(tw, death_row);
			while (twrefcnt) {
				twrefcnt--;
				inet_twsk_put(tw);
			}
		}

		ret = 0;
		goto out;
	}

	head = &hinfo->bhash[inet_bhashfn(net, snum, hinfo->bhash_size)];
	tb  = inet_csk(sk)->icsk_bind_hash;
	spin_lock_bh(&head->lock);
	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
		hash(sk, NULL);
		spin_unlock_bh(&head->lock);
		return 0;
	} else {
		spin_unlock(&head->lock);	/* BHs stay disabled until "out" */
		/* No definite answer... Walk to established hash table */
		ret = check_established(death_row, sk, snum, NULL);
out:
		local_bh_enable();
		return ret;
	}
}

/*
 * Bind a port for a connect operation and hash it.
 */
int inet_hash_connect(struct inet_timewait_death_row *death_row,
		      struct sock *sk)
{
	return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk),
				   __inet_check_established, __inet_hash_nolisten);
}
EXPORT_SYMBOL_GPL(inet_hash_connect);

void inet_hashinfo_init(struct inet_hashinfo *h)
{
	int i;

	atomic_set(&h->bsockets, 0);
	for (i = 0; i < INET_LHTABLE_SIZE; i++) {
		spin_lock_init(&h->listening_hash[i].lock);
		INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].head,
				      i + LISTENING_NULLS_BASE);
	}
}
EXPORT_SYMBOL_GPL(inet_hashinfo_init);
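
/*
 * Usage sketch (illustrative, not part of this file): a transport
 * protocol typically owns a single inet_hashinfo, initializes its
 * listening hash once at boot, and points sk_prot->h.hashinfo at it,
 * e.g. TCP with its tcp_hashinfo:
 *
 *	struct inet_hashinfo tcp_hashinfo;
 *	...
 *	inet_hashinfo_init(&tcp_hashinfo);
 *	...
 *	struct proto tcp_prot = {
 *		...
 *		.h.hashinfo	= &tcp_hashinfo,
 *	};
 */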