/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic INET transport hashtables
 *
 * Authors:	Lotsa people, from code originally in tcp
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <linux/random.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/wait.h>

#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
#include <net/ip.h>

/*
 * Allocate and initialize a new local port bind bucket.
 * The bindhash mutex for snum's hash chain must be held here.
 */
struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
						 struct net *net,
						 struct inet_bind_hashbucket *head,
						 const unsigned short snum)
{
	struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);

	if (tb != NULL) {
		tb->ib_net    = net;
		tb->port      = snum;
		tb->fastreuse = 0;
		INIT_HLIST_HEAD(&tb->owners);
		hlist_add_head(&tb->node, &head->chain);
	}
	return tb;
}

/*
 * Caller must hold hashbucket lock for this tb with local BH disabled
 */
void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb)
{
	if (hlist_empty(&tb->owners)) {
		__hlist_del(&tb->node);
		kmem_cache_free(cachep, tb);
	}
}

void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
		    const unsigned short snum)
{
	inet_sk(sk)->num = snum;
	sk_add_bind_node(sk, &tb->owners);
	inet_csk(sk)->icsk_bind_hash = tb;
}

/*
 * Get rid of any references to a local port held by the given sock.
 */
static void __inet_put_port(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->hashinfo;
	const int bhash = inet_bhashfn(inet_sk(sk)->num, hashinfo->bhash_size);
	struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
	struct inet_bind_bucket *tb;

	spin_lock(&head->lock);
	tb = inet_csk(sk)->icsk_bind_hash;
	__sk_del_bind_node(sk);
	inet_csk(sk)->icsk_bind_hash = NULL;
	inet_sk(sk)->num = 0;
	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
	spin_unlock(&head->lock);
}

void inet_put_port(struct sock *sk)
{
	local_bh_disable();
	__inet_put_port(sk);
	local_bh_enable();
}

EXPORT_SYMBOL(inet_put_port);

/*
 * This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad
 * on SMP.  Look, when several writers sleep and reader wakes them up, all
 * but one immediately hit write lock and grab all the cpus.  Exclusive sleep
 * solves this, _but_ remember, it adds useless work on UP machines (wake up
 * each exclusive lock release).  It should be ifdefed really.
 */
void inet_listen_wlock(struct inet_hashinfo *hashinfo)
	__acquires(hashinfo->lhash_lock)
{
	write_lock(&hashinfo->lhash_lock);

	if (atomic_read(&hashinfo->lhash_users)) {
		DEFINE_WAIT(wait);

		for (;;) {
			prepare_to_wait_exclusive(&hashinfo->lhash_wait,
						  &wait, TASK_UNINTERRUPTIBLE);
			if (!atomic_read(&hashinfo->lhash_users))
				break;
			write_unlock_bh(&hashinfo->lhash_lock);
			schedule();
			write_lock_bh(&hashinfo->lhash_lock);
		}

		finish_wait(&hashinfo->lhash_wait, &wait);
	}
}

EXPORT_SYMBOL(inet_listen_wlock);
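
/*
 * Note on the slow listener lookup below: candidates are scored rather
 * than just matched.  An AF_INET socket starts at 1 (a non-IPv6-only
 * AF_INET6 socket at 0), a matching bound address adds 2 and a matching
 * bound device adds 2, so a score of 5 is an exact match and ends the
 * walk immediately; otherwise the highest-scoring socket wins.
 */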

/*
 * Don't inline this cruft.  Here are some nice properties to exploit here.
 * The BSD API does not allow a listening sock to specify the remote port nor
 * the remote address for the connection.  So always assume those are both
 * wildcarded during the search since they can never be otherwise.
 */
static struct sock *inet_lookup_listener_slow(struct net *net,
					      const struct hlist_head *head,
					      const __be32 daddr,
					      const unsigned short hnum,
					      const int dif)
{
	struct sock *result = NULL, *sk;
	const struct hlist_node *node;
	int hiscore = -1;

	sk_for_each(sk, node, head) {
		const struct inet_sock *inet = inet_sk(sk);

		if (sk->sk_net == net && inet->num == hnum &&
		    !ipv6_only_sock(sk)) {
			const __be32 rcv_saddr = inet->rcv_saddr;
			int score = sk->sk_family == PF_INET ? 1 : 0;

			if (rcv_saddr) {
				if (rcv_saddr != daddr)
					continue;
				score += 2;
			}
			if (sk->sk_bound_dev_if) {
				if (sk->sk_bound_dev_if != dif)
					continue;
				score += 2;
			}
			if (score == 5)
				return sk;
			if (score > hiscore) {
				hiscore = score;
				result  = sk;
			}
		}
	}
	return result;
}

/* Optimize the common listener case. */
struct sock *__inet_lookup_listener(struct net *net,
				    struct inet_hashinfo *hashinfo,
				    const __be32 daddr, const unsigned short hnum,
				    const int dif)
{
	struct sock *sk = NULL;
	const struct hlist_head *head;

	read_lock(&hashinfo->lhash_lock);
	head = &hashinfo->listening_hash[inet_lhashfn(hnum)];
	if (!hlist_empty(head)) {
		const struct inet_sock *inet = inet_sk((sk = __sk_head(head)));

		if (inet->num == hnum && !sk->sk_node.next &&
		    (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
		    (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
		    !sk->sk_bound_dev_if && sk->sk_net == net)
			goto sherry_cache;
		sk = inet_lookup_listener_slow(net, head, daddr, hnum, dif);
	}
	if (sk) {
sherry_cache:
		sock_hold(sk);
	}
	read_unlock(&hashinfo->lhash_lock);
	return sk;
}
EXPORT_SYMBOL_GPL(__inet_lookup_listener);
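
/*
 * The established lookup below walks a single ehash bucket under its
 * read lock: first the chain of fully established sockets, then the
 * twchain of TIME_WAIT sockets.  A hit is returned with a reference
 * taken via sock_hold(); the caller must drop it.
 */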

struct sock * __inet_lookup_established(struct net *net,
				  struct inet_hashinfo *hashinfo,
				  const __be32 saddr, const __be16 sport,
				  const __be32 daddr, const u16 hnum,
				  const int dif)
{
	INET_ADDR_COOKIE(acookie, saddr, daddr)
	const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
	struct sock *sk;
	const struct hlist_node *node;
	/* Optimize here for direct hit, only listening connections can
	 * have wildcards anyways.
	 */
	unsigned int hash = inet_ehashfn(daddr, hnum, saddr, sport);
	struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash);
	rwlock_t *lock = inet_ehash_lockp(hashinfo, hash);

	prefetch(head->chain.first);
	read_lock(lock);
	sk_for_each(sk, node, &head->chain) {
		if (INET_MATCH(sk, net, hash, acookie,
			       saddr, daddr, ports, dif))
			goto hit; /* You sunk my battleship! */
	}

	/* Must check for a TIME_WAIT'er before going to listener hash. */
	sk_for_each(sk, node, &head->twchain) {
		if (INET_TW_MATCH(sk, net, hash, acookie,
				  saddr, daddr, ports, dif))
			goto hit;
	}
	sk = NULL;
out:
	read_unlock(lock);
	return sk;
hit:
	sock_hold(sk);
	goto out;
}
EXPORT_SYMBOL_GPL(__inet_lookup_established);
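
/*
 * __inet_check_established() is the uniqueness check used while picking
 * a source port for connect(): under the bucket's write lock it verifies
 * that no established socket already uses the same four-tuple, and when a
 * matching TIME_WAIT socket is found it asks twsk_unique() whether that
 * old identity may be recycled.  On success the socket is hashed into the
 * established chain before the lock is dropped.
 */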

/* called with local bh disabled */
static int __inet_check_established(struct inet_timewait_death_row *death_row,
				    struct sock *sk, __u16 lport,
				    struct inet_timewait_sock **twp)
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	struct inet_sock *inet = inet_sk(sk);
	__be32 daddr = inet->rcv_saddr;
	__be32 saddr = inet->daddr;
	int dif = sk->sk_bound_dev_if;
	INET_ADDR_COOKIE(acookie, saddr, daddr)
	const __portpair ports = INET_COMBINED_PORTS(inet->dport, lport);
	unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport);
	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
	rwlock_t *lock = inet_ehash_lockp(hinfo, hash);
	struct sock *sk2;
	const struct hlist_node *node;
	struct inet_timewait_sock *tw;
	struct net *net = sk->sk_net;

	prefetch(head->chain.first);
	write_lock(lock);

	/* Check TIME-WAIT sockets first. */
	sk_for_each(sk2, node, &head->twchain) {
		tw = inet_twsk(sk2);

		if (INET_TW_MATCH(sk2, net, hash, acookie,
				  saddr, daddr, ports, dif)) {
			if (twsk_unique(sk, sk2, twp))
				goto unique;
			else
				goto not_unique;
		}
	}
	tw = NULL;

	/* And established part... */
	sk_for_each(sk2, node, &head->chain) {
		if (INET_MATCH(sk2, net, hash, acookie,
			       saddr, daddr, ports, dif))
			goto not_unique;
	}

unique:
	/* Must record num and sport now. Otherwise we will see
	 * in hash table socket with a funny identity.
	 */
	inet->num = lport;
	inet->sport = htons(lport);
	sk->sk_hash = hash;
	BUG_TRAP(sk_unhashed(sk));
	__sk_add_node(sk, &head->chain);
	sock_prot_inuse_add(sk->sk_prot, 1);
	write_unlock(lock);

	if (twp) {
		*twp = tw;
		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
	} else if (tw) {
		/* Silly. Should hash-dance instead... */
		inet_twsk_deschedule(tw, death_row);
		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);

		inet_twsk_put(tw);
	}

	return 0;

not_unique:
	write_unlock(lock);
	return -EADDRNOTAVAIL;
}

static inline u32 inet_sk_port_offset(const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	return secure_ipv4_port_ephemeral(inet->rcv_saddr, inet->daddr,
					  inet->dport);
}

void __inet_hash_nolisten(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->hashinfo;
	struct hlist_head *list;
	rwlock_t *lock;
	struct inet_ehash_bucket *head;

	BUG_TRAP(sk_unhashed(sk));

	sk->sk_hash = inet_sk_ehashfn(sk);
	head = inet_ehash_bucket(hashinfo, sk->sk_hash);
	list = &head->chain;
	lock = inet_ehash_lockp(hashinfo, sk->sk_hash);

	write_lock(lock);
	__sk_add_node(sk, list);
	sock_prot_inuse_add(sk->sk_prot, 1);
	write_unlock(lock);
}
EXPORT_SYMBOL_GPL(__inet_hash_nolisten);

static void __inet_hash(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->hashinfo;
	struct hlist_head *list;
	rwlock_t *lock;

	if (sk->sk_state != TCP_LISTEN) {
		__inet_hash_nolisten(sk);
		return;
	}

	BUG_TRAP(sk_unhashed(sk));
	list = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
	lock = &hashinfo->lhash_lock;

	inet_listen_wlock(hashinfo);
	__sk_add_node(sk, list);
	sock_prot_inuse_add(sk->sk_prot, 1);
	write_unlock(lock);
	wake_up(&hashinfo->lhash_wait);
}

void inet_hash(struct sock *sk)
{
	if (sk->sk_state != TCP_CLOSE) {
		local_bh_disable();
		__inet_hash(sk);
		local_bh_enable();
	}
}
EXPORT_SYMBOL_GPL(inet_hash);

void inet_unhash(struct sock *sk)
{
	rwlock_t *lock;
	struct inet_hashinfo *hashinfo = sk->sk_prot->hashinfo;

	if (sk_unhashed(sk))
		goto out;

	if (sk->sk_state == TCP_LISTEN) {
		local_bh_disable();
		inet_listen_wlock(hashinfo);
		lock = &hashinfo->lhash_lock;
	} else {
		lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
		write_lock_bh(lock);
	}

	if (__sk_del_node_init(sk))
		sock_prot_inuse_add(sk->sk_prot, -1);
	write_unlock_bh(lock);
out:
	if (sk->sk_state == TCP_LISTEN)
		wake_up(&hashinfo->lhash_wait);
}
EXPORT_SYMBOL_GPL(inet_unhash);
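
/*
 * __inet_hash_connect() picks and binds a local port for an outgoing
 * connection, then hashes the socket.  When no port is bound yet it walks
 * the local port range starting at an offset derived from a static rolling
 * hint plus the caller-supplied port_offset: ports whose bind bucket has
 * fastreuse >= 0 (i.e. taken by an ordinary bind()) are skipped, existing
 * connect-only buckets are reused once check_established() proves the
 * four-tuple unique, and newly created buckets are marked fastreuse = -1.
 */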

int __inet_hash_connect(struct inet_timewait_death_row *death_row,
		struct sock *sk, u32 port_offset,
		int (*check_established)(struct inet_timewait_death_row *,
			struct sock *, __u16, struct inet_timewait_sock **),
		void (*hash)(struct sock *sk))
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	const unsigned short snum = inet_sk(sk)->num;
	struct inet_bind_hashbucket *head;
	struct inet_bind_bucket *tb;
	int ret;
	struct net *net = sk->sk_net;

	if (!snum) {
		int i, remaining, low, high, port;
		static u32 hint;
		u32 offset = hint + port_offset;
		struct hlist_node *node;
		struct inet_timewait_sock *tw = NULL;

		inet_get_local_port_range(&low, &high);
		remaining = (high - low) + 1;

		local_bh_disable();
		for (i = 1; i <= remaining; i++) {
			port = low + (i + offset) % remaining;
			head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size)];
			spin_lock(&head->lock);

			/* Does not bother with rcv_saddr checks,
			 * because the established check is already
			 * unique enough.
			 */
			inet_bind_bucket_for_each(tb, node, &head->chain) {
				if (tb->ib_net == net && tb->port == port) {
					BUG_TRAP(!hlist_empty(&tb->owners));
					if (tb->fastreuse >= 0)
						goto next_port;
					if (!check_established(death_row, sk,
							       port, &tw))
						goto ok;
					goto next_port;
				}
			}

			tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
						     net, head, port);
			if (!tb) {
				spin_unlock(&head->lock);
				break;
			}
			tb->fastreuse = -1;
			goto ok;

		next_port:
			spin_unlock(&head->lock);
		}
		local_bh_enable();

		return -EADDRNOTAVAIL;

ok:
		hint += i;

		/* Head lock still held and bh's disabled */
		inet_bind_hash(sk, tb, port);
		if (sk_unhashed(sk)) {
			inet_sk(sk)->sport = htons(port);
			hash(sk);
		}
		spin_unlock(&head->lock);

		if (tw) {
			inet_twsk_deschedule(tw, death_row);
			inet_twsk_put(tw);
		}

		ret = 0;
		goto out;
	}

	head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size)];
	tb = inet_csk(sk)->icsk_bind_hash;
	spin_lock_bh(&head->lock);
	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
		hash(sk);
		spin_unlock_bh(&head->lock);
		return 0;
	} else {
		spin_unlock(&head->lock);
		/* No definite answer... Walk to established hash table */
		ret = check_established(death_row, sk, snum, NULL);
out:
		local_bh_enable();
		return ret;
	}
}
EXPORT_SYMBOL_GPL(__inet_hash_connect);

/*
 * Bind a port for a connect operation and hash it.
 */
int inet_hash_connect(struct inet_timewait_death_row *death_row,
		      struct sock *sk)
{
	return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk),
				   __inet_check_established, __inet_hash_nolisten);
}

EXPORT_SYMBOL_GPL(inet_hash_connect);
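
/*
 * Rough sketch of a typical caller (for illustration only): a
 * connection-oriented transport calls inet_hash_connect() from its
 * connect path once the destination is known, e.g. TCP does roughly
 *
 *	err = inet_hash_connect(&tcp_death_row, sk);
 *	if (err)
 *		goto failure;
 *
 * where tcp_death_row is TCP's inet_timewait_death_row instance.
 */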