/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system. INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Support for INET connection oriented protocols.
 *
 * Authors:	See the TCP sources
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <linux/jhash.h>

#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
#include <net/inet_timewait_sock.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/tcp_states.h>
#include <net/xfrm.h>

#ifdef INET_CSK_DEBUG
const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
EXPORT_SYMBOL(inet_csk_timer_bug_msg);
#endif

void inet_get_local_port_range(struct net *net, int *low, int *high)
{
	unsigned int seq;

	do {
		seq = read_seqbegin(&net->ipv4.ip_local_ports.lock);

		*low = net->ipv4.ip_local_ports.range[0];
		*high = net->ipv4.ip_local_ports.range[1];
	} while (read_seqretry(&net->ipv4.ip_local_ports.lock, seq));
}
EXPORT_SYMBOL(inet_get_local_port_range);

int inet_csk_bind_conflict(const struct sock *sk,
			   const struct inet_bind_bucket *tb, bool relax)
{
	struct sock *sk2;
	int reuse = sk->sk_reuse;
	int reuseport = sk->sk_reuseport;
	kuid_t uid = sock_i_uid((struct sock *)sk);

	/*
	 * Unlike other sk lookup places we do not check
	 * for sk_net here, since _all_ the socks listed
	 * in tb->owners list belong to the same net - the
	 * one this bucket belongs to.
	 */

	sk_for_each_bound(sk2, &tb->owners) {
		if (sk != sk2 &&
		    !inet_v6_ipv6only(sk2) &&
		    (!sk->sk_bound_dev_if ||
		     !sk2->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
			if ((!reuse || !sk2->sk_reuse ||
			     sk2->sk_state == TCP_LISTEN) &&
			    (!reuseport || !sk2->sk_reuseport ||
			     (sk2->sk_state != TCP_TIME_WAIT &&
			      !uid_eq(uid, sock_i_uid(sk2))))) {

				if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr ||
				    sk2->sk_rcv_saddr == sk->sk_rcv_saddr)
					break;
			}
			if (!relax && reuse && sk2->sk_reuse &&
			    sk2->sk_state != TCP_LISTEN) {

				if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr ||
				    sk2->sk_rcv_saddr == sk->sk_rcv_saddr)
					break;
			}
		}
	}
	return sk2 != NULL;
}
EXPORT_SYMBOL_GPL(inet_csk_bind_conflict);

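/* Note on the 'relax' argument of inet_csk_bind_conflict() above: when it
 * is false, two non-listening sockets that both set SO_REUSEADDR (and whose
 * bound addresses overlap) are also reported as a conflict; when it is true
 * that clause is skipped, making the check more permissive.
 * inet_csk_get_port() below uses the strict variant while it is still
 * scanning for an ephemeral port and the relaxed variant once a bind bucket
 * has been picked.
 */
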
/* Obtain a reference to a local port for the given sock,
 * if snum is zero it means select any available local port.
 */
int inet_csk_get_port(struct sock *sk, unsigned short snum)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	struct inet_bind_hashbucket *head;
	struct inet_bind_bucket *tb;
	int ret, attempts = 5;
	struct net *net = sock_net(sk);
	int smallest_size = -1, smallest_rover;
	kuid_t uid = sock_i_uid(sk);

	local_bh_disable();
	if (!snum) {
		int remaining, rover, low, high;

again:
		inet_get_local_port_range(net, &low, &high);
		remaining = (high - low) + 1;
		smallest_rover = rover = prandom_u32() % remaining + low;

		smallest_size = -1;
		do {
			if (inet_is_local_reserved_port(net, rover))
				goto next_nolock;
			head = &hashinfo->bhash[inet_bhashfn(net, rover,
					hashinfo->bhash_size)];
			spin_lock(&head->lock);
			inet_bind_bucket_for_each(tb, &head->chain)
				if (net_eq(ib_net(tb), net) && tb->port == rover) {
					if (((tb->fastreuse > 0 &&
					      sk->sk_reuse &&
					      sk->sk_state != TCP_LISTEN) ||
					     (tb->fastreuseport > 0 &&
					      sk->sk_reuseport &&
					      uid_eq(tb->fastuid, uid))) &&
					    (tb->num_owners < smallest_size || smallest_size == -1)) {
						smallest_size = tb->num_owners;
						smallest_rover = rover;
						if (atomic_read(&hashinfo->bsockets) > (high - low) + 1 &&
						    !inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
							snum = smallest_rover;
							goto tb_found;
						}
					}
					if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
						snum = rover;
						goto tb_found;
					}
					goto next;
				}
			break;
		next:
			spin_unlock(&head->lock);
		next_nolock:
			if (++rover > high)
				rover = low;
		} while (--remaining > 0);

		/* Exhausted local port range during search? It is not
		 * possible for us to be holding one of the bind hash
		 * locks if this test triggers, because if 'remaining'
		 * drops to zero, we broke out of the do/while loop at
		 * the top level, not from the 'break;' statement.
		 */
		ret = 1;
		if (remaining <= 0) {
			if (smallest_size != -1) {
				snum = smallest_rover;
				goto have_snum;
			}
			goto fail;
		}
		/* OK, here is the one we will use. HEAD is
		 * non-NULL and we hold its mutex.
		 */
		snum = rover;
	} else {
have_snum:
		head = &hashinfo->bhash[inet_bhashfn(net, snum,
				hashinfo->bhash_size)];
		spin_lock(&head->lock);
		inet_bind_bucket_for_each(tb, &head->chain)
			if (net_eq(ib_net(tb), net) && tb->port == snum)
				goto tb_found;
	}
	tb = NULL;
	goto tb_not_found;
tb_found:
	if (!hlist_empty(&tb->owners)) {
		if (sk->sk_reuse == SK_FORCE_REUSE)
			goto success;

		if (((tb->fastreuse > 0 &&
		      sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
		     (tb->fastreuseport > 0 &&
		      sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
		    smallest_size == -1) {
			goto success;
		} else {
			ret = 1;
			if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
				if (((sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
				     (tb->fastreuseport > 0 &&
				      sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
				    smallest_size != -1 && --attempts >= 0) {
					spin_unlock(&head->lock);
					goto again;
				}

				goto fail_unlock;
			}
		}
	}
tb_not_found:
	ret = 1;
	if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep,
					net, head, snum)) == NULL)
		goto fail_unlock;
	if (hlist_empty(&tb->owners)) {
		if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
			tb->fastreuse = 1;
		else
			tb->fastreuse = 0;
		if (sk->sk_reuseport) {
			tb->fastreuseport = 1;
			tb->fastuid = uid;
		} else
			tb->fastreuseport = 0;
	} else {
		if (tb->fastreuse &&
		    (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
			tb->fastreuse = 0;
		if (tb->fastreuseport &&
		    (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid)))
			tb->fastreuseport = 0;
	}
success:
	if (!inet_csk(sk)->icsk_bind_hash)
		inet_bind_hash(sk, tb, snum);
	WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
	ret = 0;

fail_unlock:
	spin_unlock(&head->lock);
fail:
	local_bh_enable();
	return ret;
}
EXPORT_SYMBOL_GPL(inet_csk_get_port);

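/* Note on the ephemeral-port search in inet_csk_get_port() above: while the
 * range is scanned, the bucket with the fewest owners that still permits
 * reuse is remembered in smallest_rover/smallest_size. If the whole range
 * turns out to be occupied, that bucket is used as a fallback (the
 * have_snum path) rather than failing outright, and if it then conflicts
 * the whole search is retried, at most 'attempts' times.
 */
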
/*
 * Wait for an incoming connection, avoid race conditions. This must be called
 * with the socket locked.
 */
static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	DEFINE_WAIT(wait);
	int err;

	/*
	 * True wake-one mechanism for incoming connections: only
	 * one process gets woken up, not the 'whole herd'.
	 * Since we do not 'race & poll' for established sockets
	 * anymore, the common case will execute the loop only once.
	 *
	 * Subtle issue: "add_wait_queue_exclusive()" will be added
	 * after any current non-exclusive waiters, and we know that
	 * it will always _stay_ after any new non-exclusive waiters
	 * because all non-exclusive waiters are added at the
	 * beginning of the wait-queue. As such, it's ok to "drop"
	 * our exclusiveness temporarily when we get woken up without
	 * having to remove and re-insert us on the wait queue.
	 */
	for (;;) {
		prepare_to_wait_exclusive(sk_sleep(sk), &wait,
					  TASK_INTERRUPTIBLE);
		release_sock(sk);
		if (reqsk_queue_empty(&icsk->icsk_accept_queue))
			timeo = schedule_timeout(timeo);
		sched_annotate_sleep();
		lock_sock(sk);
		err = 0;
		if (!reqsk_queue_empty(&icsk->icsk_accept_queue))
			break;
		err = -EINVAL;
		if (sk->sk_state != TCP_LISTEN)
			break;
		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			break;
		err = -EAGAIN;
		if (!timeo)
			break;
	}
	finish_wait(sk_sleep(sk), &wait);
	return err;
}

/*
 * This will accept the next outstanding connection.
 */
struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
	struct sock *newsk;
	struct request_sock *req;
	int error;

	lock_sock(sk);

	/* We need to make sure that this socket is listening,
	 * and that it has something pending.
	 */
	error = -EINVAL;
	if (sk->sk_state != TCP_LISTEN)
		goto out_err;

	/* Find already established connection */
	if (reqsk_queue_empty(queue)) {
		long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);

		/* If this is a non-blocking socket, don't sleep */
		error = -EAGAIN;
		if (!timeo)
			goto out_err;

		error = inet_csk_wait_for_connect(sk, timeo);
		if (error)
			goto out_err;
	}
	req = reqsk_queue_remove(queue);
	newsk = req->sk;

	sk_acceptq_removed(sk);
	if (sk->sk_protocol == IPPROTO_TCP && queue->fastopenq != NULL) {
		spin_lock_bh(&queue->fastopenq->lock);
		if (tcp_rsk(req)->listener) {
			/* We are still waiting for the final ACK from 3WHS
			 * so can't free req now. Instead, we set req->sk to
			 * NULL to signify that the child socket is taken
			 * so reqsk_fastopen_remove() will free the req
			 * when 3WHS finishes (or is aborted).
			 */
			req->sk = NULL;
			req = NULL;
		}
		spin_unlock_bh(&queue->fastopenq->lock);
	}
out:
	release_sock(sk);
	if (req)
		__reqsk_free(req);
	return newsk;
out_err:
	newsk = NULL;
	req = NULL;
	*err = error;
	goto out;
}
EXPORT_SYMBOL(inet_csk_accept);

/*
 * Using different timers for retransmit, delayed acks and probes
 * We may wish to use just one timer maintaining a list of expire jiffies
 * to optimize.
 */
void inet_csk_init_xmit_timers(struct sock *sk,
			       void (*retransmit_handler)(unsigned long),
			       void (*delack_handler)(unsigned long),
			       void (*keepalive_handler)(unsigned long))
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	setup_timer(&icsk->icsk_retransmit_timer, retransmit_handler,
			(unsigned long)sk);
	setup_timer(&icsk->icsk_delack_timer, delack_handler,
			(unsigned long)sk);
	setup_timer(&sk->sk_timer, keepalive_handler, (unsigned long)sk);
	icsk->icsk_pending = icsk->icsk_ack.pending = 0;
}
EXPORT_SYMBOL(inet_csk_init_xmit_timers);

void inet_csk_clear_xmit_timers(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	icsk->icsk_pending = icsk->icsk_ack.pending = icsk->icsk_ack.blocked = 0;

	sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
	sk_stop_timer(sk, &icsk->icsk_delack_timer);
	sk_stop_timer(sk, &sk->sk_timer);
}
EXPORT_SYMBOL(inet_csk_clear_xmit_timers);

void inet_csk_delete_keepalive_timer(struct sock *sk)
{
	sk_stop_timer(sk, &sk->sk_timer);
}
EXPORT_SYMBOL(inet_csk_delete_keepalive_timer);

void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len)
{
	sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
}
EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);

struct dst_entry *inet_csk_route_req(struct sock *sk,
				     struct flowi4 *fl4,
				     const struct request_sock *req)
{
	struct rtable *rt;
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct ip_options_rcu *opt = inet_rsk(req)->opt;
	struct net *net = sock_net(sk);
	int flags = inet_sk_flowi_flags(sk);

	flowi4_init_output(fl4, sk->sk_bound_dev_if, ireq->ir_mark,
			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
			   sk->sk_protocol,
			   flags,
			   (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
			   ireq->ir_loc_addr, ireq->ir_rmt_port, inet_sk(sk)->inet_sport);
	security_req_classify_flow(req, flowi4_to_flowi(fl4));
	rt = ip_route_output_flow(net, fl4, sk);
	if (IS_ERR(rt))
		goto no_route;
	if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)
		goto route_err;
	return &rt->dst;

route_err:
	ip_rt_put(rt);
no_route:
	IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
	return NULL;
}
EXPORT_SYMBOL_GPL(inet_csk_route_req);

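/* inet_csk_route_child_sock() below performs the same flow lookup as
 * inet_csk_route_req() above, but it builds the flow directly in the child
 * socket's inet cork (newinet->cork.fl.u.ip4) so the child can reuse it,
 * and it reads the child's inet_opt under rcu_read_lock() rather than using
 * the option block carried in the request sock.
 */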
struct dst_entry *inet_csk_route_child_sock(struct sock *sk,
					    struct sock *newsk,
					    const struct request_sock *req)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct inet_sock *newinet = inet_sk(newsk);
	struct ip_options_rcu *opt;
	struct net *net = sock_net(sk);
	struct flowi4 *fl4;
	struct rtable *rt;

	fl4 = &newinet->cork.fl.u.ip4;

	rcu_read_lock();
	opt = rcu_dereference(newinet->inet_opt);
	flowi4_init_output(fl4, sk->sk_bound_dev_if, inet_rsk(req)->ir_mark,
			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
			   sk->sk_protocol, inet_sk_flowi_flags(sk),
			   (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
			   ireq->ir_loc_addr, ireq->ir_rmt_port, inet_sk(sk)->inet_sport);
	security_req_classify_flow(req, flowi4_to_flowi(fl4));
	rt = ip_route_output_flow(net, fl4, sk);
	if (IS_ERR(rt))
		goto no_route;
	if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)
		goto route_err;
	rcu_read_unlock();
	return &rt->dst;

route_err:
	ip_rt_put(rt);
no_route:
	rcu_read_unlock();
	IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
	return NULL;
}
EXPORT_SYMBOL_GPL(inet_csk_route_child_sock);

static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport,
				 const u32 rnd, const u32 synq_hsize)
{
	return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1);
}

#if IS_ENABLED(CONFIG_IPV6)
#define AF_INET_FAMILY(fam) ((fam) == AF_INET)
#else
#define AF_INET_FAMILY(fam) 1
#endif

struct request_sock *inet_csk_search_req(const struct sock *sk,
					 struct request_sock ***prevp,
					 const __be16 rport, const __be32 raddr,
					 const __be32 laddr)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
	struct request_sock *req, **prev;

	for (prev = &lopt->syn_table[inet_synq_hash(raddr, rport, lopt->hash_rnd,
						    lopt->nr_table_entries)];
	     (req = *prev) != NULL;
	     prev = &req->dl_next) {
		const struct inet_request_sock *ireq = inet_rsk(req);

		if (ireq->ir_rmt_port == rport &&
		    ireq->ir_rmt_addr == raddr &&
		    ireq->ir_loc_addr == laddr &&
		    AF_INET_FAMILY(req->rsk_ops->family)) {
			WARN_ON(req->sk);
			*prevp = prev;
			break;
		}
	}

	return req;
}
EXPORT_SYMBOL_GPL(inet_csk_search_req);

void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
				   unsigned long timeout)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
	const u32 h = inet_synq_hash(inet_rsk(req)->ir_rmt_addr,
				     inet_rsk(req)->ir_rmt_port,
				     lopt->hash_rnd, lopt->nr_table_entries);

	reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout);
	inet_csk_reqsk_queue_added(sk, timeout);
}
EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);

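/* Note: inet_synq_hash() folds the jhash result with '& (synq_hsize - 1)',
 * so lopt->nr_table_entries must be a power of two. The same hash (and the
 * same per-listener hash_rnd) is used by inet_csk_search_req() and
 * inet_csk_reqsk_queue_hash_add() above, so lookups and insertions always
 * land in the same syn_table bucket.
 */
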
/* Only thing we need from tcp.h */
extern int sysctl_tcp_synack_retries;

/* Decide when to expire the request and when to resend SYN-ACK */
static inline void syn_ack_recalc(struct request_sock *req, const int thresh,
				  const int max_retries,
				  const u8 rskq_defer_accept,
				  int *expire, int *resend)
{
	if (!rskq_defer_accept) {
		*expire = req->num_timeout >= thresh;
		*resend = 1;
		return;
	}
	*expire = req->num_timeout >= thresh &&
		  (!inet_rsk(req)->acked || req->num_timeout >= max_retries);
	/*
	 * Do not resend while waiting for data after ACK,
	 * start to resend on end of deferring period to give
	 * last chance for data or ACK to create established socket.
	 */
	*resend = !inet_rsk(req)->acked ||
		  req->num_timeout >= rskq_defer_accept - 1;
}

int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req)
{
	int err = req->rsk_ops->rtx_syn_ack(parent, req);

	if (!err)
		req->num_retrans++;
	return err;
}
EXPORT_SYMBOL(inet_rtx_syn_ack);

void inet_csk_reqsk_queue_prune(struct sock *parent,
				const unsigned long interval,
				const unsigned long timeout,
				const unsigned long max_rto)
{
	struct inet_connection_sock *icsk = inet_csk(parent);
	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
	struct listen_sock *lopt = queue->listen_opt;
	int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
	int thresh = max_retries;
	unsigned long now = jiffies;
	struct request_sock **reqp, *req;
	int i, budget;

	if (lopt == NULL || lopt->qlen == 0)
		return;

	/* Normally all the openreqs are young and become mature
	 * (i.e. converted to an established socket) within the first timeout.
	 * If the synack was not acknowledged for 1 second, it means
	 * one of the following things: the synack was lost, the ack was lost,
	 * the rtt is high or nobody planned to ack (i.e. synflood).
	 * When the server is a bit loaded, the queue is populated with old
	 * open requests, reducing the effective size of the queue.
	 * When the server is heavily loaded, the queue size reduces to zero
	 * after several minutes of work. It is not a synflood,
	 * it is normal operation. The solution is to prune
	 * entries that are too old, overriding the normal timeout, when
	 * the situation becomes dangerous.
	 *
	 * Essentially, we reserve half of the room for young
	 * embryos; and abort old ones without pity, if old
	 * ones are about to clog our table.
	 */
	if (lopt->qlen >> (lopt->max_qlen_log - 1)) {
		int young = (lopt->qlen_young << 1);

		while (thresh > 2) {
			if (lopt->qlen < young)
				break;
			thresh--;
			young <<= 1;
		}
	}

	if (queue->rskq_defer_accept)
		max_retries = queue->rskq_defer_accept;

	budget = 2 * (lopt->nr_table_entries / (timeout / interval));
	i = lopt->clock_hand;

	do {
		reqp = &lopt->syn_table[i];
		while ((req = *reqp) != NULL) {
			if (time_after_eq(now, req->expires)) {
				int expire = 0, resend = 0;

				syn_ack_recalc(req, thresh, max_retries,
					       queue->rskq_defer_accept,
					       &expire, &resend);
				req->rsk_ops->syn_ack_timeout(parent, req);
				if (!expire &&
				    (!resend ||
				     !inet_rtx_syn_ack(parent, req) ||
				     inet_rsk(req)->acked)) {
					unsigned long timeo;

					if (req->num_timeout++ == 0)
						lopt->qlen_young--;
					timeo = min(timeout << req->num_timeout,
						    max_rto);
					req->expires = now + timeo;
					reqp = &req->dl_next;
					continue;
				}

				/* Drop this request */
				inet_csk_reqsk_queue_unlink(parent, req, reqp);
				reqsk_queue_removed(queue, req);
				reqsk_free(req);
				continue;
			}
			reqp = &req->dl_next;
		}

		i = (i + 1) & (lopt->nr_table_entries - 1);

	} while (--budget > 0);

	lopt->clock_hand = i;

	if (lopt->qlen)
		inet_csk_reset_keepalive_timer(parent, interval);
}
EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune);

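/* Notes on inet_csk_reqsk_queue_prune() above:
 * - the scan is budgeted so that, when called every 'interval' jiffies, it
 *   walks the whole syn_table roughly twice per 'timeout' period; the
 *   current position is kept in lopt->clock_hand between invocations.
 * - requests that are kept get an exponentially growing expiry,
 *   timeout << num_timeout, capped at max_rto.
 */
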
/**
 * inet_csk_clone_lock - clone an inet socket, and lock its clone
 * @sk: the socket to clone
 * @req: request_sock
 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *
 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
 */
struct sock *inet_csk_clone_lock(const struct sock *sk,
				 const struct request_sock *req,
				 const gfp_t priority)
{
	struct sock *newsk = sk_clone_lock(sk, priority);

	if (newsk != NULL) {
		struct inet_connection_sock *newicsk = inet_csk(newsk);

		newsk->sk_state = TCP_SYN_RECV;
		newicsk->icsk_bind_hash = NULL;

		inet_sk(newsk)->inet_dport = inet_rsk(req)->ir_rmt_port;
		inet_sk(newsk)->inet_num = inet_rsk(req)->ir_num;
		inet_sk(newsk)->inet_sport = htons(inet_rsk(req)->ir_num);
		newsk->sk_write_space = sk_stream_write_space;

		newsk->sk_mark = inet_rsk(req)->ir_mark;

		newicsk->icsk_retransmits = 0;
		newicsk->icsk_backoff = 0;
		newicsk->icsk_probes_out = 0;

		/* Deinitialize accept_queue to trap illegal accesses. */
		memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue));

		security_inet_csk_clone(newsk, req);
	}
	return newsk;
}
EXPORT_SYMBOL_GPL(inet_csk_clone_lock);

/*
 * At this point, there should be no process reference to this
 * socket, and thus no user references at all. Therefore we
 * can assume the socket waitqueue is inactive and nobody will
 * try to jump onto it.
 */
void inet_csk_destroy_sock(struct sock *sk)
{
	WARN_ON(sk->sk_state != TCP_CLOSE);
	WARN_ON(!sock_flag(sk, SOCK_DEAD));

	/* It cannot be in hash table! */
	WARN_ON(!sk_unhashed(sk));

	/* If it has a non-zero inet_sk(sk)->inet_num, it must be bound */
	WARN_ON(inet_sk(sk)->inet_num && !inet_csk(sk)->icsk_bind_hash);

	sk->sk_prot->destroy(sk);

	sk_stream_kill_queues(sk);

	xfrm_sk_free_policy(sk);

	sk_refcnt_debug_release(sk);

	percpu_counter_dec(sk->sk_prot->orphan_count);
	sock_put(sk);
}
EXPORT_SYMBOL(inet_csk_destroy_sock);

/* This function allows us to force the closure of a socket after the call to
 * tcp/dccp_create_openreq_child().
 */
void inet_csk_prepare_forced_close(struct sock *sk)
	__releases(&sk->sk_lock.slock)
{
	/* sk_clone_lock locked the socket and set refcnt to 2 */
	bh_unlock_sock(sk);
	sock_put(sk);

	/* The below has to be done to allow calling inet_csk_destroy_sock */
	sock_set_flag(sk, SOCK_DEAD);
	percpu_counter_inc(sk->sk_prot->orphan_count);
	inet_sk(sk)->inet_num = 0;
}
EXPORT_SYMBOL(inet_csk_prepare_forced_close);

int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
{
	struct inet_sock *inet = inet_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);

	if (rc != 0)
		return rc;

	sk->sk_max_ack_backlog = 0;
	sk->sk_ack_backlog = 0;
	inet_csk_delack_init(sk);

	/* There is a race window here: we announce ourselves listening,
	 * but this transition is still not validated by get_port().
	 * It is OK, because this socket enters the hash table only
	 * after validation is complete.
	 */
	sk->sk_state = TCP_LISTEN;
	if (!sk->sk_prot->get_port(sk, inet->inet_num)) {
		inet->inet_sport = htons(inet->inet_num);

		sk_dst_reset(sk);
		sk->sk_prot->hash(sk);

		return 0;
	}

	sk->sk_state = TCP_CLOSE;
	__reqsk_queue_destroy(&icsk->icsk_accept_queue);
	return -EADDRINUSE;
}
EXPORT_SYMBOL_GPL(inet_csk_listen_start);

/*
 * This routine closes sockets which have been at least partially
 * opened, but not yet accepted.
 */
void inet_csk_listen_stop(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
	struct request_sock *acc_req;
	struct request_sock *req;

	inet_csk_delete_keepalive_timer(sk);

	/* make all the listen_opt local to us */
	acc_req = reqsk_queue_yank_acceptq(queue);

	/* Following specs, it would be better either to send FIN
	 * (and enter FIN-WAIT-1, it is normal close)
	 * or to send active reset (abort).
	 * Certainly, it is pretty dangerous while synflood, but it is
	 * bad justification for our negligence 8)
	 * To be honest, we are not able to make either
	 * of the variants now.			--ANK
	 */
	reqsk_queue_destroy(queue);

	while ((req = acc_req) != NULL) {
		struct sock *child = req->sk;

		acc_req = req->dl_next;

		local_bh_disable();
		bh_lock_sock(child);
		WARN_ON(sock_owned_by_user(child));
		sock_hold(child);

		sk->sk_prot->disconnect(child, O_NONBLOCK);

		sock_orphan(child);

		percpu_counter_inc(sk->sk_prot->orphan_count);

		if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->listener) {
			BUG_ON(tcp_sk(child)->fastopen_rsk != req);
			BUG_ON(sk != tcp_rsk(req)->listener);

			/* Paranoid, to prevent race condition if
			 * an inbound pkt destined for child is
			 * blocked by sock lock in tcp_v4_rcv().
			 * Also to satisfy an assertion in
			 * tcp_v4_destroy_sock().
			 */
			tcp_sk(child)->fastopen_rsk = NULL;
			sock_put(sk);
		}
		inet_csk_destroy_sock(child);

		bh_unlock_sock(child);
		local_bh_enable();
		sock_put(child);

		sk_acceptq_removed(sk);
		__reqsk_free(req);
	}
	if (queue->fastopenq != NULL) {
		/* Free all the reqs queued in rskq_rst_head. */
		spin_lock_bh(&queue->fastopenq->lock);
		acc_req = queue->fastopenq->rskq_rst_head;
		queue->fastopenq->rskq_rst_head = NULL;
		spin_unlock_bh(&queue->fastopenq->lock);
		while ((req = acc_req) != NULL) {
			acc_req = req->dl_next;
			__reqsk_free(req);
		}
	}
	WARN_ON(sk->sk_ack_backlog);
}
EXPORT_SYMBOL_GPL(inet_csk_listen_stop);

void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
{
	struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
	const struct inet_sock *inet = inet_sk(sk);

	sin->sin_family = AF_INET;
	sin->sin_addr.s_addr = inet->inet_daddr;
	sin->sin_port = inet->inet_dport;
}
EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr);

#ifdef CONFIG_COMPAT
int inet_csk_compat_getsockopt(struct sock *sk, int level, int optname,
			       char __user *optval, int __user *optlen)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);

	if (icsk->icsk_af_ops->compat_getsockopt != NULL)
		return icsk->icsk_af_ops->compat_getsockopt(sk, level, optname,
							    optval, optlen);
	return icsk->icsk_af_ops->getsockopt(sk, level, optname,
					     optval, optlen);
}
EXPORT_SYMBOL_GPL(inet_csk_compat_getsockopt);

int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname,
			       char __user *optval, unsigned int optlen)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);

	if (icsk->icsk_af_ops->compat_setsockopt != NULL)
		return icsk->icsk_af_ops->compat_setsockopt(sk, level, optname,
							    optval, optlen);
	return icsk->icsk_af_ops->setsockopt(sk, level, optname,
					     optval, optlen);
}
EXPORT_SYMBOL_GPL(inet_csk_compat_setsockopt);
#endif

static struct dst_entry *inet_csk_rebuild_route(struct sock *sk, struct flowi *fl)
{
	const struct inet_sock *inet = inet_sk(sk);
	const struct ip_options_rcu *inet_opt;
	__be32 daddr = inet->inet_daddr;
	struct flowi4 *fl4;
	struct rtable *rt;

	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	if (inet_opt && inet_opt->opt.srr)
		daddr = inet_opt->opt.faddr;
	fl4 = &fl->u.ip4;
	rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr,
				   inet->inet_saddr, inet->inet_dport,
				   inet->inet_sport, sk->sk_protocol,
				   RT_CONN_FLAGS(sk), sk->sk_bound_dev_if);
	if (IS_ERR(rt))
		rt = NULL;
	if (rt)
		sk_setup_caps(sk, &rt->dst);
	rcu_read_unlock();

	return &rt->dst;
}

struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);
	struct inet_sock *inet = inet_sk(sk);

	if (!dst) {
		dst = inet_csk_rebuild_route(sk, &inet->cork.fl);
		if (!dst)
			goto out;
	}
	dst->ops->update_pmtu(dst, sk, NULL, mtu);

	dst = __sk_dst_check(sk, 0);
	if (!dst)
		dst = inet_csk_rebuild_route(sk, &inet->cork.fl);
out:
	return dst;
}
EXPORT_SYMBOL_GPL(inet_csk_update_pmtu);
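
/* Note on inet_csk_update_pmtu() above: ->update_pmtu() may invalidate the
 * cached route, so the dst is checked a second time and, if necessary,
 * rebuilt from the socket's cached flow before being returned.
 */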