1 /* 2 * linux/net/sunrpc/svcsock.c 3 * 4 * These are the RPC server socket internals. 5 * 6 * The server scheduling algorithm does not always distribute the load 7 * evenly when servicing a single client. May need to modify the 8 * svc_sock_enqueue procedure... 9 * 10 * TCP support is largely untested and may be a little slow. The problem 11 * is that we currently do two separate recvfrom's, one for the 4-byte 12 * record length, and the second for the actual record. This could possibly 13 * be improved by always reading a minimum size of around 100 bytes and 14 * tucking any superfluous bytes away in a temporary store. Still, that 15 * leaves write requests out in the rain. An alternative may be to peek at 16 * the first skb in the queue, and if it matches the next TCP sequence 17 * number, to extract the record marker. Yuck. 18 * 19 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> 20 */ 21 22 #include <linux/sched.h> 23 #include <linux/errno.h> 24 #include <linux/fcntl.h> 25 #include <linux/net.h> 26 #include <linux/in.h> 27 #include <linux/inet.h> 28 #include <linux/udp.h> 29 #include <linux/tcp.h> 30 #include <linux/unistd.h> 31 #include <linux/slab.h> 32 #include <linux/netdevice.h> 33 #include <linux/skbuff.h> 34 #include <linux/file.h> 35 #include <linux/freezer.h> 36 #include <net/sock.h> 37 #include <net/checksum.h> 38 #include <net/ip.h> 39 #include <net/tcp_states.h> 40 #include <asm/uaccess.h> 41 #include <asm/ioctls.h> 42 43 #include <linux/sunrpc/types.h> 44 #include <linux/sunrpc/xdr.h> 45 #include <linux/sunrpc/svcsock.h> 46 #include <linux/sunrpc/stats.h> 47 48 /* SMP locking strategy: 49 * 50 * svc_pool->sp_lock protects most of the fields of that pool. 51 * svc_serv->sv_lock protects sv_tempsocks, sv_permsocks, sv_tmpcnt. 52 * when both need to be taken (rare), svc_serv->sv_lock is first. 53 * BKL protects svc_serv->sv_nrthread. 
54 * svc_sock->sk_defer_lock protects the svc_sock->sk_deferred list 55 * svc_sock->sk_flags.SK_BUSY prevents a svc_sock being enqueued multiply. 56 * 57 * Some flags can be set to certain values at any time 58 * providing that certain rules are followed: 59 * 60 * SK_CONN, SK_DATA, can be set or cleared at any time. 61 * after a set, svc_sock_enqueue must be called. 62 * after a clear, the socket must be read/accepted 63 * if this succeeds, it must be set again. 64 * SK_CLOSE can set at any time. It is never cleared. 65 * 66 */ 67 68 #define RPCDBG_FACILITY RPCDBG_SVCSOCK 69 70 71 static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *, 72 int *errp, int pmap_reg); 73 static void svc_udp_data_ready(struct sock *, int); 74 static int svc_udp_recvfrom(struct svc_rqst *); 75 static int svc_udp_sendto(struct svc_rqst *); 76 77 static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk); 78 static int svc_deferred_recv(struct svc_rqst *rqstp); 79 static struct cache_deferred_req *svc_defer(struct cache_req *req); 80 81 /* apparently the "standard" is that clients close 82 * idle connections after 5 minutes, servers after 83 * 6 minutes 84 * http://www.connectathon.org/talks96/nfstcp.pdf 85 */ 86 static int svc_conn_age_period = 6*60; 87 88 #ifdef CONFIG_DEBUG_LOCK_ALLOC 89 static struct lock_class_key svc_key[2]; 90 static struct lock_class_key svc_slock_key[2]; 91 92 static inline void svc_reclassify_socket(struct socket *sock) 93 { 94 struct sock *sk = sock->sk; 95 BUG_ON(sk->sk_lock.owner != NULL); 96 switch (sk->sk_family) { 97 case AF_INET: 98 sock_lock_init_class_and_name(sk, "slock-AF_INET-NFSD", 99 &svc_slock_key[0], "sk_lock-AF_INET-NFSD", &svc_key[0]); 100 break; 101 102 case AF_INET6: 103 sock_lock_init_class_and_name(sk, "slock-AF_INET6-NFSD", 104 &svc_slock_key[1], "sk_lock-AF_INET6-NFSD", &svc_key[1]); 105 break; 106 107 default: 108 BUG(); 109 } 110 } 111 #else 112 static inline void svc_reclassify_socket(struct socket 
*sock) 113 { 114 } 115 #endif 116 117 /* 118 * Queue up an idle server thread. Must have pool->sp_lock held. 119 * Note: this is really a stack rather than a queue, so that we only 120 * use as many different threads as we need, and the rest don't pollute 121 * the cache. 122 */ 123 static inline void 124 svc_thread_enqueue(struct svc_pool *pool, struct svc_rqst *rqstp) 125 { 126 list_add(&rqstp->rq_list, &pool->sp_threads); 127 } 128 129 /* 130 * Dequeue an nfsd thread. Must have pool->sp_lock held. 131 */ 132 static inline void 133 svc_thread_dequeue(struct svc_pool *pool, struct svc_rqst *rqstp) 134 { 135 list_del(&rqstp->rq_list); 136 } 137 138 /* 139 * Release an skbuff after use 140 */ 141 static inline void 142 svc_release_skb(struct svc_rqst *rqstp) 143 { 144 struct sk_buff *skb = rqstp->rq_skbuff; 145 struct svc_deferred_req *dr = rqstp->rq_deferred; 146 147 if (skb) { 148 rqstp->rq_skbuff = NULL; 149 150 dprintk("svc: service %p, releasing skb %p\n", rqstp, skb); 151 skb_free_datagram(rqstp->rq_sock->sk_sk, skb); 152 } 153 if (dr) { 154 rqstp->rq_deferred = NULL; 155 kfree(dr); 156 } 157 } 158 159 /* 160 * Any space to write? 161 */ 162 static inline unsigned long 163 svc_sock_wspace(struct svc_sock *svsk) 164 { 165 int wspace; 166 167 if (svsk->sk_sock->type == SOCK_STREAM) 168 wspace = sk_stream_wspace(svsk->sk_sk); 169 else 170 wspace = sock_wspace(svsk->sk_sk); 171 172 return wspace; 173 } 174 175 /* 176 * Queue up a socket with data pending. If there are idle nfsd 177 * processes, wake 'em up. 
178 * 179 */ 180 static void 181 svc_sock_enqueue(struct svc_sock *svsk) 182 { 183 struct svc_serv *serv = svsk->sk_server; 184 struct svc_pool *pool; 185 struct svc_rqst *rqstp; 186 int cpu; 187 188 if (!(svsk->sk_flags & 189 ( (1<<SK_CONN)|(1<<SK_DATA)|(1<<SK_CLOSE)|(1<<SK_DEFERRED)) )) 190 return; 191 if (test_bit(SK_DEAD, &svsk->sk_flags)) 192 return; 193 194 cpu = get_cpu(); 195 pool = svc_pool_for_cpu(svsk->sk_server, cpu); 196 put_cpu(); 197 198 spin_lock_bh(&pool->sp_lock); 199 200 if (!list_empty(&pool->sp_threads) && 201 !list_empty(&pool->sp_sockets)) 202 printk(KERN_ERR 203 "svc_sock_enqueue: threads and sockets both waiting??\n"); 204 205 if (test_bit(SK_DEAD, &svsk->sk_flags)) { 206 /* Don't enqueue dead sockets */ 207 dprintk("svc: socket %p is dead, not enqueued\n", svsk->sk_sk); 208 goto out_unlock; 209 } 210 211 /* Mark socket as busy. It will remain in this state until the 212 * server has processed all pending data and put the socket back 213 * on the idle list. We update SK_BUSY atomically because 214 * it also guards against trying to enqueue the svc_sock twice. 
215 */ 216 if (test_and_set_bit(SK_BUSY, &svsk->sk_flags)) { 217 /* Don't enqueue socket while already enqueued */ 218 dprintk("svc: socket %p busy, not enqueued\n", svsk->sk_sk); 219 goto out_unlock; 220 } 221 BUG_ON(svsk->sk_pool != NULL); 222 svsk->sk_pool = pool; 223 224 set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); 225 if (((atomic_read(&svsk->sk_reserved) + serv->sv_max_mesg)*2 226 > svc_sock_wspace(svsk)) 227 && !test_bit(SK_CLOSE, &svsk->sk_flags) 228 && !test_bit(SK_CONN, &svsk->sk_flags)) { 229 /* Don't enqueue while not enough space for reply */ 230 dprintk("svc: socket %p no space, %d*2 > %ld, not enqueued\n", 231 svsk->sk_sk, atomic_read(&svsk->sk_reserved)+serv->sv_max_mesg, 232 svc_sock_wspace(svsk)); 233 svsk->sk_pool = NULL; 234 clear_bit(SK_BUSY, &svsk->sk_flags); 235 goto out_unlock; 236 } 237 clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); 238 239 240 if (!list_empty(&pool->sp_threads)) { 241 rqstp = list_entry(pool->sp_threads.next, 242 struct svc_rqst, 243 rq_list); 244 dprintk("svc: socket %p served by daemon %p\n", 245 svsk->sk_sk, rqstp); 246 svc_thread_dequeue(pool, rqstp); 247 if (rqstp->rq_sock) 248 printk(KERN_ERR 249 "svc_sock_enqueue: server %p, rq_sock=%p!\n", 250 rqstp, rqstp->rq_sock); 251 rqstp->rq_sock = svsk; 252 atomic_inc(&svsk->sk_inuse); 253 rqstp->rq_reserved = serv->sv_max_mesg; 254 atomic_add(rqstp->rq_reserved, &svsk->sk_reserved); 255 BUG_ON(svsk->sk_pool != pool); 256 wake_up(&rqstp->rq_wait); 257 } else { 258 dprintk("svc: socket %p put into queue\n", svsk->sk_sk); 259 list_add_tail(&svsk->sk_ready, &pool->sp_sockets); 260 BUG_ON(svsk->sk_pool != pool); 261 } 262 263 out_unlock: 264 spin_unlock_bh(&pool->sp_lock); 265 } 266 267 /* 268 * Dequeue the first socket. Must be called with the pool->sp_lock held. 
269 */ 270 static inline struct svc_sock * 271 svc_sock_dequeue(struct svc_pool *pool) 272 { 273 struct svc_sock *svsk; 274 275 if (list_empty(&pool->sp_sockets)) 276 return NULL; 277 278 svsk = list_entry(pool->sp_sockets.next, 279 struct svc_sock, sk_ready); 280 list_del_init(&svsk->sk_ready); 281 282 dprintk("svc: socket %p dequeued, inuse=%d\n", 283 svsk->sk_sk, atomic_read(&svsk->sk_inuse)); 284 285 return svsk; 286 } 287 288 /* 289 * Having read something from a socket, check whether it 290 * needs to be re-enqueued. 291 * Note: SK_DATA only gets cleared when a read-attempt finds 292 * no (or insufficient) data. 293 */ 294 static inline void 295 svc_sock_received(struct svc_sock *svsk) 296 { 297 svsk->sk_pool = NULL; 298 clear_bit(SK_BUSY, &svsk->sk_flags); 299 svc_sock_enqueue(svsk); 300 } 301 302 303 /** 304 * svc_reserve - change the space reserved for the reply to a request. 305 * @rqstp: The request in question 306 * @space: new max space to reserve 307 * 308 * Each request reserves some space on the output queue of the socket 309 * to make sure the reply fits. This function reduces that reserved 310 * space to be the amount of space used already, plus @space. 311 * 312 */ 313 void svc_reserve(struct svc_rqst *rqstp, int space) 314 { 315 space += rqstp->rq_res.head[0].iov_len; 316 317 if (space < rqstp->rq_reserved) { 318 struct svc_sock *svsk = rqstp->rq_sock; 319 atomic_sub((rqstp->rq_reserved - space), &svsk->sk_reserved); 320 rqstp->rq_reserved = space; 321 322 svc_sock_enqueue(svsk); 323 } 324 } 325 326 /* 327 * Release a socket after use. 
328 */ 329 static inline void 330 svc_sock_put(struct svc_sock *svsk) 331 { 332 if (atomic_dec_and_test(&svsk->sk_inuse) && 333 test_bit(SK_DEAD, &svsk->sk_flags)) { 334 dprintk("svc: releasing dead socket\n"); 335 if (svsk->sk_sock->file) 336 sockfd_put(svsk->sk_sock); 337 else 338 sock_release(svsk->sk_sock); 339 if (svsk->sk_info_authunix != NULL) 340 svcauth_unix_info_release(svsk->sk_info_authunix); 341 kfree(svsk); 342 } 343 } 344 345 static void 346 svc_sock_release(struct svc_rqst *rqstp) 347 { 348 struct svc_sock *svsk = rqstp->rq_sock; 349 350 svc_release_skb(rqstp); 351 352 svc_free_res_pages(rqstp); 353 rqstp->rq_res.page_len = 0; 354 rqstp->rq_res.page_base = 0; 355 356 357 /* Reset response buffer and release 358 * the reservation. 359 * But first, check that enough space was reserved 360 * for the reply, otherwise we have a bug! 361 */ 362 if ((rqstp->rq_res.len) > rqstp->rq_reserved) 363 printk(KERN_ERR "RPC request reserved %d but used %d\n", 364 rqstp->rq_reserved, 365 rqstp->rq_res.len); 366 367 rqstp->rq_res.head[0].iov_len = 0; 368 svc_reserve(rqstp, 0); 369 rqstp->rq_sock = NULL; 370 371 svc_sock_put(svsk); 372 } 373 374 /* 375 * External function to wake up a server waiting for data 376 * This really only makes sense for services like lockd 377 * which have exactly one thread anyway. 
378 */ 379 void 380 svc_wake_up(struct svc_serv *serv) 381 { 382 struct svc_rqst *rqstp; 383 unsigned int i; 384 struct svc_pool *pool; 385 386 for (i = 0; i < serv->sv_nrpools; i++) { 387 pool = &serv->sv_pools[i]; 388 389 spin_lock_bh(&pool->sp_lock); 390 if (!list_empty(&pool->sp_threads)) { 391 rqstp = list_entry(pool->sp_threads.next, 392 struct svc_rqst, 393 rq_list); 394 dprintk("svc: daemon %p woken up.\n", rqstp); 395 /* 396 svc_thread_dequeue(pool, rqstp); 397 rqstp->rq_sock = NULL; 398 */ 399 wake_up(&rqstp->rq_wait); 400 } 401 spin_unlock_bh(&pool->sp_lock); 402 } 403 } 404 405 /* 406 * Generic sendto routine 407 */ 408 static int 409 svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr) 410 { 411 struct svc_sock *svsk = rqstp->rq_sock; 412 struct socket *sock = svsk->sk_sock; 413 int slen; 414 char buffer[CMSG_SPACE(sizeof(struct in_pktinfo))]; 415 struct cmsghdr *cmh = (struct cmsghdr *)buffer; 416 struct in_pktinfo *pki = (struct in_pktinfo *)CMSG_DATA(cmh); 417 int len = 0; 418 int result; 419 int size; 420 struct page **ppage = xdr->pages; 421 size_t base = xdr->page_base; 422 unsigned int pglen = xdr->page_len; 423 unsigned int flags = MSG_MORE; 424 425 slen = xdr->len; 426 427 if (rqstp->rq_prot == IPPROTO_UDP) { 428 /* set the source and destination */ 429 struct msghdr msg; 430 msg.msg_name = &rqstp->rq_addr; 431 msg.msg_namelen = sizeof(rqstp->rq_addr); 432 msg.msg_iov = NULL; 433 msg.msg_iovlen = 0; 434 msg.msg_flags = MSG_MORE; 435 436 msg.msg_control = cmh; 437 msg.msg_controllen = sizeof(buffer); 438 cmh->cmsg_len = CMSG_LEN(sizeof(*pki)); 439 cmh->cmsg_level = SOL_IP; 440 cmh->cmsg_type = IP_PKTINFO; 441 pki->ipi_ifindex = 0; 442 pki->ipi_spec_dst.s_addr = rqstp->rq_daddr; 443 444 if (sock_sendmsg(sock, &msg, 0) < 0) 445 goto out; 446 } 447 448 /* send head */ 449 if (slen == xdr->head[0].iov_len) 450 flags = 0; 451 len = kernel_sendpage(sock, rqstp->rq_respages[0], 0, 452 xdr->head[0].iov_len, flags); 453 if (len != 
xdr->head[0].iov_len) 454 goto out; 455 slen -= xdr->head[0].iov_len; 456 if (slen == 0) 457 goto out; 458 459 /* send page data */ 460 size = PAGE_SIZE - base < pglen ? PAGE_SIZE - base : pglen; 461 while (pglen > 0) { 462 if (slen == size) 463 flags = 0; 464 result = kernel_sendpage(sock, *ppage, base, size, flags); 465 if (result > 0) 466 len += result; 467 if (result != size) 468 goto out; 469 slen -= size; 470 pglen -= size; 471 size = PAGE_SIZE < pglen ? PAGE_SIZE : pglen; 472 base = 0; 473 ppage++; 474 } 475 /* send tail */ 476 if (xdr->tail[0].iov_len) { 477 result = kernel_sendpage(sock, rqstp->rq_respages[0], 478 ((unsigned long)xdr->tail[0].iov_base) 479 & (PAGE_SIZE-1), 480 xdr->tail[0].iov_len, 0); 481 482 if (result > 0) 483 len += result; 484 } 485 out: 486 dprintk("svc: socket %p sendto([%p %Zu... ], %d) = %d (addr %x)\n", 487 rqstp->rq_sock, xdr->head[0].iov_base, xdr->head[0].iov_len, xdr->len, len, 488 rqstp->rq_addr.sin_addr.s_addr); 489 490 return len; 491 } 492 493 /* 494 * Report socket names for nfsdfs 495 */ 496 static int one_sock_name(char *buf, struct svc_sock *svsk) 497 { 498 int len; 499 500 switch(svsk->sk_sk->sk_family) { 501 case AF_INET: 502 len = sprintf(buf, "ipv4 %s %u.%u.%u.%u %d\n", 503 svsk->sk_sk->sk_protocol==IPPROTO_UDP? 
504 "udp" : "tcp", 505 NIPQUAD(inet_sk(svsk->sk_sk)->rcv_saddr), 506 inet_sk(svsk->sk_sk)->num); 507 break; 508 default: 509 len = sprintf(buf, "*unknown-%d*\n", 510 svsk->sk_sk->sk_family); 511 } 512 return len; 513 } 514 515 int 516 svc_sock_names(char *buf, struct svc_serv *serv, char *toclose) 517 { 518 struct svc_sock *svsk, *closesk = NULL; 519 int len = 0; 520 521 if (!serv) 522 return 0; 523 spin_lock(&serv->sv_lock); 524 list_for_each_entry(svsk, &serv->sv_permsocks, sk_list) { 525 int onelen = one_sock_name(buf+len, svsk); 526 if (toclose && strcmp(toclose, buf+len) == 0) 527 closesk = svsk; 528 else 529 len += onelen; 530 } 531 spin_unlock(&serv->sv_lock); 532 if (closesk) 533 /* Should unregister with portmap, but you cannot 534 * unregister just one protocol... 535 */ 536 svc_delete_socket(closesk); 537 else if (toclose) 538 return -ENOENT; 539 return len; 540 } 541 EXPORT_SYMBOL(svc_sock_names); 542 543 /* 544 * Check input queue length 545 */ 546 static int 547 svc_recv_available(struct svc_sock *svsk) 548 { 549 struct socket *sock = svsk->sk_sock; 550 int avail, err; 551 552 err = kernel_sock_ioctl(sock, TIOCINQ, (unsigned long) &avail); 553 554 return (err >= 0)? avail : err; 555 } 556 557 /* 558 * Generic recvfrom routine. 559 */ 560 static int 561 svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr, int buflen) 562 { 563 struct msghdr msg; 564 struct socket *sock; 565 int len, alen; 566 567 rqstp->rq_addrlen = sizeof(rqstp->rq_addr); 568 sock = rqstp->rq_sock->sk_sock; 569 570 msg.msg_name = &rqstp->rq_addr; 571 msg.msg_namelen = sizeof(rqstp->rq_addr); 572 msg.msg_control = NULL; 573 msg.msg_controllen = 0; 574 575 msg.msg_flags = MSG_DONTWAIT; 576 577 len = kernel_recvmsg(sock, &msg, iov, nr, buflen, MSG_DONTWAIT); 578 579 /* sock_recvmsg doesn't fill in the name/namelen, so we must.. 580 * possibly we should cache this in the svc_sock structure 581 * at accept time. 
FIXME 582 */ 583 alen = sizeof(rqstp->rq_addr); 584 kernel_getpeername(sock, (struct sockaddr *)&rqstp->rq_addr, &alen); 585 586 dprintk("svc: socket %p recvfrom(%p, %Zu) = %d\n", 587 rqstp->rq_sock, iov[0].iov_base, iov[0].iov_len, len); 588 589 return len; 590 } 591 592 /* 593 * Set socket snd and rcv buffer lengths 594 */ 595 static inline void 596 svc_sock_setbufsize(struct socket *sock, unsigned int snd, unsigned int rcv) 597 { 598 #if 0 599 mm_segment_t oldfs; 600 oldfs = get_fs(); set_fs(KERNEL_DS); 601 sock_setsockopt(sock, SOL_SOCKET, SO_SNDBUF, 602 (char*)&snd, sizeof(snd)); 603 sock_setsockopt(sock, SOL_SOCKET, SO_RCVBUF, 604 (char*)&rcv, sizeof(rcv)); 605 #else 606 /* sock_setsockopt limits use to sysctl_?mem_max, 607 * which isn't acceptable. Until that is made conditional 608 * on not having CAP_SYS_RESOURCE or similar, we go direct... 609 * DaveM said I could! 610 */ 611 lock_sock(sock->sk); 612 sock->sk->sk_sndbuf = snd * 2; 613 sock->sk->sk_rcvbuf = rcv * 2; 614 sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK|SOCK_RCVBUF_LOCK; 615 release_sock(sock->sk); 616 #endif 617 } 618 /* 619 * INET callback when data has been received on the socket. 620 */ 621 static void 622 svc_udp_data_ready(struct sock *sk, int count) 623 { 624 struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; 625 626 if (svsk) { 627 dprintk("svc: socket %p(inet %p), count=%d, busy=%d\n", 628 svsk, sk, count, test_bit(SK_BUSY, &svsk->sk_flags)); 629 set_bit(SK_DATA, &svsk->sk_flags); 630 svc_sock_enqueue(svsk); 631 } 632 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) 633 wake_up_interruptible(sk->sk_sleep); 634 } 635 636 /* 637 * INET callback when space is newly available on the socket. 
638 */ 639 static void 640 svc_write_space(struct sock *sk) 641 { 642 struct svc_sock *svsk = (struct svc_sock *)(sk->sk_user_data); 643 644 if (svsk) { 645 dprintk("svc: socket %p(inet %p), write_space busy=%d\n", 646 svsk, sk, test_bit(SK_BUSY, &svsk->sk_flags)); 647 svc_sock_enqueue(svsk); 648 } 649 650 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) { 651 dprintk("RPC svc_write_space: someone sleeping on %p\n", 652 svsk); 653 wake_up_interruptible(sk->sk_sleep); 654 } 655 } 656 657 /* 658 * Receive a datagram from a UDP socket. 659 */ 660 static int 661 svc_udp_recvfrom(struct svc_rqst *rqstp) 662 { 663 struct svc_sock *svsk = rqstp->rq_sock; 664 struct svc_serv *serv = svsk->sk_server; 665 struct sk_buff *skb; 666 int err, len; 667 668 if (test_and_clear_bit(SK_CHNGBUF, &svsk->sk_flags)) 669 /* udp sockets need large rcvbuf as all pending 670 * requests are still in that buffer. sndbuf must 671 * also be large enough that there is enough space 672 * for one reply per thread. We count all threads 673 * rather than threads in a particular pool, which 674 * provides an upper bound on the number of threads 675 * which will access the socket. 
676 */ 677 svc_sock_setbufsize(svsk->sk_sock, 678 (serv->sv_nrthreads+3) * serv->sv_max_mesg, 679 (serv->sv_nrthreads+3) * serv->sv_max_mesg); 680 681 if ((rqstp->rq_deferred = svc_deferred_dequeue(svsk))) { 682 svc_sock_received(svsk); 683 return svc_deferred_recv(rqstp); 684 } 685 686 clear_bit(SK_DATA, &svsk->sk_flags); 687 while ((skb = skb_recv_datagram(svsk->sk_sk, 0, 1, &err)) == NULL) { 688 if (err == -EAGAIN) { 689 svc_sock_received(svsk); 690 return err; 691 } 692 /* possibly an icmp error */ 693 dprintk("svc: recvfrom returned error %d\n", -err); 694 } 695 if (skb->tstamp.off_sec == 0) { 696 struct timeval tv; 697 698 tv.tv_sec = xtime.tv_sec; 699 tv.tv_usec = xtime.tv_nsec / NSEC_PER_USEC; 700 skb_set_timestamp(skb, &tv); 701 /* Don't enable netstamp, sunrpc doesn't 702 need that much accuracy */ 703 } 704 skb_get_timestamp(skb, &svsk->sk_sk->sk_stamp); 705 set_bit(SK_DATA, &svsk->sk_flags); /* there may be more data... */ 706 707 /* 708 * Maybe more packets - kick another thread ASAP. 
709 */ 710 svc_sock_received(svsk); 711 712 len = skb->len - sizeof(struct udphdr); 713 rqstp->rq_arg.len = len; 714 715 rqstp->rq_prot = IPPROTO_UDP; 716 717 /* Get sender address */ 718 rqstp->rq_addr.sin_family = AF_INET; 719 rqstp->rq_addr.sin_port = skb->h.uh->source; 720 rqstp->rq_addr.sin_addr.s_addr = skb->nh.iph->saddr; 721 rqstp->rq_daddr = skb->nh.iph->daddr; 722 723 if (skb_is_nonlinear(skb)) { 724 /* we have to copy */ 725 local_bh_disable(); 726 if (csum_partial_copy_to_xdr(&rqstp->rq_arg, skb)) { 727 local_bh_enable(); 728 /* checksum error */ 729 skb_free_datagram(svsk->sk_sk, skb); 730 return 0; 731 } 732 local_bh_enable(); 733 skb_free_datagram(svsk->sk_sk, skb); 734 } else { 735 /* we can use it in-place */ 736 rqstp->rq_arg.head[0].iov_base = skb->data + sizeof(struct udphdr); 737 rqstp->rq_arg.head[0].iov_len = len; 738 if (skb_checksum_complete(skb)) { 739 skb_free_datagram(svsk->sk_sk, skb); 740 return 0; 741 } 742 rqstp->rq_skbuff = skb; 743 } 744 745 rqstp->rq_arg.page_base = 0; 746 if (len <= rqstp->rq_arg.head[0].iov_len) { 747 rqstp->rq_arg.head[0].iov_len = len; 748 rqstp->rq_arg.page_len = 0; 749 rqstp->rq_respages = rqstp->rq_pages+1; 750 } else { 751 rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len; 752 rqstp->rq_respages = rqstp->rq_pages + 1 + 753 (rqstp->rq_arg.page_len + PAGE_SIZE - 1)/ PAGE_SIZE; 754 } 755 756 if (serv->sv_stats) 757 serv->sv_stats->netudpcnt++; 758 759 return len; 760 } 761 762 static int 763 svc_udp_sendto(struct svc_rqst *rqstp) 764 { 765 int error; 766 767 error = svc_sendto(rqstp, &rqstp->rq_res); 768 if (error == -ECONNREFUSED) 769 /* ICMP error on earlier request. 
*/ 770 error = svc_sendto(rqstp, &rqstp->rq_res); 771 772 return error; 773 } 774 775 static void 776 svc_udp_init(struct svc_sock *svsk) 777 { 778 svsk->sk_sk->sk_data_ready = svc_udp_data_ready; 779 svsk->sk_sk->sk_write_space = svc_write_space; 780 svsk->sk_recvfrom = svc_udp_recvfrom; 781 svsk->sk_sendto = svc_udp_sendto; 782 783 /* initialise setting must have enough space to 784 * receive and respond to one request. 785 * svc_udp_recvfrom will re-adjust if necessary 786 */ 787 svc_sock_setbufsize(svsk->sk_sock, 788 3 * svsk->sk_server->sv_max_mesg, 789 3 * svsk->sk_server->sv_max_mesg); 790 791 set_bit(SK_DATA, &svsk->sk_flags); /* might have come in before data_ready set up */ 792 set_bit(SK_CHNGBUF, &svsk->sk_flags); 793 } 794 795 /* 796 * A data_ready event on a listening socket means there's a connection 797 * pending. Do not use state_change as a substitute for it. 798 */ 799 static void 800 svc_tcp_listen_data_ready(struct sock *sk, int count_unused) 801 { 802 struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; 803 804 dprintk("svc: socket %p TCP (listen) state change %d\n", 805 sk, sk->sk_state); 806 807 /* 808 * This callback may called twice when a new connection 809 * is established as a child socket inherits everything 810 * from a parent LISTEN socket. 811 * 1) data_ready method of the parent socket will be called 812 * when one of child sockets become ESTABLISHED. 813 * 2) data_ready method of the child socket may be called 814 * when it receives data before the socket is accepted. 815 * In case of 2, we should ignore it silently. 816 */ 817 if (sk->sk_state == TCP_LISTEN) { 818 if (svsk) { 819 set_bit(SK_CONN, &svsk->sk_flags); 820 svc_sock_enqueue(svsk); 821 } else 822 printk("svc: socket %p: no user data\n", sk); 823 } 824 825 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) 826 wake_up_interruptible_all(sk->sk_sleep); 827 } 828 829 /* 830 * A state change on a connected socket means it's dying or dead. 
831 */ 832 static void 833 svc_tcp_state_change(struct sock *sk) 834 { 835 struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; 836 837 dprintk("svc: socket %p TCP (connected) state change %d (svsk %p)\n", 838 sk, sk->sk_state, sk->sk_user_data); 839 840 if (!svsk) 841 printk("svc: socket %p: no user data\n", sk); 842 else { 843 set_bit(SK_CLOSE, &svsk->sk_flags); 844 svc_sock_enqueue(svsk); 845 } 846 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) 847 wake_up_interruptible_all(sk->sk_sleep); 848 } 849 850 static void 851 svc_tcp_data_ready(struct sock *sk, int count) 852 { 853 struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; 854 855 dprintk("svc: socket %p TCP data ready (svsk %p)\n", 856 sk, sk->sk_user_data); 857 if (svsk) { 858 set_bit(SK_DATA, &svsk->sk_flags); 859 svc_sock_enqueue(svsk); 860 } 861 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) 862 wake_up_interruptible(sk->sk_sleep); 863 } 864 865 /* 866 * Accept a TCP connection 867 */ 868 static void 869 svc_tcp_accept(struct svc_sock *svsk) 870 { 871 struct sockaddr_in sin; 872 struct svc_serv *serv = svsk->sk_server; 873 struct socket *sock = svsk->sk_sock; 874 struct socket *newsock; 875 struct svc_sock *newsvsk; 876 int err, slen; 877 878 dprintk("svc: tcp_accept %p sock %p\n", svsk, sock); 879 if (!sock) 880 return; 881 882 clear_bit(SK_CONN, &svsk->sk_flags); 883 err = kernel_accept(sock, &newsock, O_NONBLOCK); 884 if (err < 0) { 885 if (err == -ENOMEM) 886 printk(KERN_WARNING "%s: no more sockets!\n", 887 serv->sv_name); 888 else if (err != -EAGAIN && net_ratelimit()) 889 printk(KERN_WARNING "%s: accept failed (err %d)!\n", 890 serv->sv_name, -err); 891 return; 892 } 893 894 set_bit(SK_CONN, &svsk->sk_flags); 895 svc_sock_enqueue(svsk); 896 897 slen = sizeof(sin); 898 err = kernel_getpeername(newsock, (struct sockaddr *) &sin, &slen); 899 if (err < 0) { 900 if (net_ratelimit()) 901 printk(KERN_WARNING "%s: peername failed (err %d)!\n", 902 serv->sv_name, -err); 903 goto 
failed; /* aborted connection or whatever */ 904 } 905 906 /* Ideally, we would want to reject connections from unauthorized 907 * hosts here, but when we get encription, the IP of the host won't 908 * tell us anything. For now just warn about unpriv connections. 909 */ 910 if (ntohs(sin.sin_port) >= 1024) { 911 dprintk(KERN_WARNING 912 "%s: connect from unprivileged port: %u.%u.%u.%u:%d\n", 913 serv->sv_name, 914 NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port)); 915 } 916 917 dprintk("%s: connect from %u.%u.%u.%u:%04x\n", serv->sv_name, 918 NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port)); 919 920 /* make sure that a write doesn't block forever when 921 * low on memory 922 */ 923 newsock->sk->sk_sndtimeo = HZ*30; 924 925 if (!(newsvsk = svc_setup_socket(serv, newsock, &err, 0))) 926 goto failed; 927 928 929 /* make sure that we don't have too many active connections. 930 * If we have, something must be dropped. 931 * 932 * There's no point in trying to do random drop here for 933 * DoS prevention. The NFS clients does 1 reconnect in 15 934 * seconds. An attacker can easily beat that. 935 * 936 * The only somewhat efficient mechanism would be if drop 937 * old connections from the same IP first. But right now 938 * we don't even record the client IP in svc_sock. 939 */ 940 if (serv->sv_tmpcnt > (serv->sv_nrthreads+3)*20) { 941 struct svc_sock *svsk = NULL; 942 spin_lock_bh(&serv->sv_lock); 943 if (!list_empty(&serv->sv_tempsocks)) { 944 if (net_ratelimit()) { 945 /* Try to help the admin */ 946 printk(KERN_NOTICE "%s: too many open TCP " 947 "sockets, consider increasing the " 948 "number of nfsd threads\n", 949 serv->sv_name); 950 printk(KERN_NOTICE "%s: last TCP connect from " 951 "%u.%u.%u.%u:%d\n", 952 serv->sv_name, 953 NIPQUAD(sin.sin_addr.s_addr), 954 ntohs(sin.sin_port)); 955 } 956 /* 957 * Always select the oldest socket. 
It's not fair, 958 * but so is life 959 */ 960 svsk = list_entry(serv->sv_tempsocks.prev, 961 struct svc_sock, 962 sk_list); 963 set_bit(SK_CLOSE, &svsk->sk_flags); 964 atomic_inc(&svsk->sk_inuse); 965 } 966 spin_unlock_bh(&serv->sv_lock); 967 968 if (svsk) { 969 svc_sock_enqueue(svsk); 970 svc_sock_put(svsk); 971 } 972 973 } 974 975 if (serv->sv_stats) 976 serv->sv_stats->nettcpconn++; 977 978 return; 979 980 failed: 981 sock_release(newsock); 982 return; 983 } 984 985 /* 986 * Receive data from a TCP socket. 987 */ 988 static int 989 svc_tcp_recvfrom(struct svc_rqst *rqstp) 990 { 991 struct svc_sock *svsk = rqstp->rq_sock; 992 struct svc_serv *serv = svsk->sk_server; 993 int len; 994 struct kvec *vec; 995 int pnum, vlen; 996 997 dprintk("svc: tcp_recv %p data %d conn %d close %d\n", 998 svsk, test_bit(SK_DATA, &svsk->sk_flags), 999 test_bit(SK_CONN, &svsk->sk_flags), 1000 test_bit(SK_CLOSE, &svsk->sk_flags)); 1001 1002 if ((rqstp->rq_deferred = svc_deferred_dequeue(svsk))) { 1003 svc_sock_received(svsk); 1004 return svc_deferred_recv(rqstp); 1005 } 1006 1007 if (test_bit(SK_CLOSE, &svsk->sk_flags)) { 1008 svc_delete_socket(svsk); 1009 return 0; 1010 } 1011 1012 if (svsk->sk_sk->sk_state == TCP_LISTEN) { 1013 svc_tcp_accept(svsk); 1014 svc_sock_received(svsk); 1015 return 0; 1016 } 1017 1018 if (test_and_clear_bit(SK_CHNGBUF, &svsk->sk_flags)) 1019 /* sndbuf needs to have room for one request 1020 * per thread, otherwise we can stall even when the 1021 * network isn't a bottleneck. 1022 * 1023 * We count all threads rather than threads in a 1024 * particular pool, which provides an upper bound 1025 * on the number of threads which will access the socket. 1026 * 1027 * rcvbuf just needs to be able to hold a few requests. 1028 * Normally they will be removed from the queue 1029 * as soon a a complete request arrives. 
*/
		svc_sock_setbufsize(svsk->sk_sock,
				    (serv->sv_nrthreads+3) * serv->sv_max_mesg,
				    3 * serv->sv_max_mesg);

	clear_bit(SK_DATA, &svsk->sk_flags);

	/* Receive data. If we haven't got the record length yet, get
	 * the next four bytes. Otherwise try to gobble up as much as
	 * possible up to the complete record length.
	 */
	if (svsk->sk_tcplen < 4) {
		unsigned long	want = 4 - svsk->sk_tcplen;
		struct kvec	iov;

		iov.iov_base = ((char *) &svsk->sk_reclen) + svsk->sk_tcplen;
		iov.iov_len  = want;
		if ((len = svc_recvfrom(rqstp, &iov, 1, want)) < 0)
			goto error;
		svsk->sk_tcplen += len;

		if (len < want) {
			dprintk("svc: short recvfrom while reading record length (%d of %lu)\n",
				len, want);
			svc_sock_received(svsk);
			return -EAGAIN; /* record header not complete */
		}

		svsk->sk_reclen = ntohl(svsk->sk_reclen);
		if (!(svsk->sk_reclen & 0x80000000)) {
			/* FIXME: technically, a record can be fragmented,
			 * and non-terminal fragments will not have the top
			 * bit set in the fragment length header.
			 * But apparently no known nfs clients send fragmented
			 * records. */
			if (net_ratelimit())
				printk(KERN_NOTICE "RPC: bad TCP reclen 0x%08lx"
				       " (non-terminal)\n",
				       (unsigned long) svsk->sk_reclen);
			goto err_delete;
		}
		svsk->sk_reclen &= 0x7fffffff;
		dprintk("svc: TCP record, %d bytes\n", svsk->sk_reclen);
		if (svsk->sk_reclen > serv->sv_max_mesg) {
			if (net_ratelimit())
				printk(KERN_NOTICE "RPC: bad TCP reclen 0x%08lx"
				       " (large)\n",
				       (unsigned long) svsk->sk_reclen);
			goto err_delete;
		}
	}

	/* Check whether enough data is available */
	len = svc_recv_available(svsk);
	if (len < 0)
		goto error;

	if (len < svsk->sk_reclen) {
		dprintk("svc: incomplete TCP record (%d of %d)\n",
			len, svsk->sk_reclen);
		svc_sock_received(svsk);
		return -EAGAIN;	/* record not complete */
	}
	len = svsk->sk_reclen;
	set_bit(SK_DATA, &svsk->sk_flags);

	vec = rqstp->rq_vec;
	vec[0] = rqstp->rq_arg.head[0];
	vlen = PAGE_SIZE;
	pnum = 1;
	while (vlen < len) {
		vec[pnum].iov_base = page_address(rqstp->rq_pages[pnum]);
		vec[pnum].iov_len = PAGE_SIZE;
		pnum++;
		vlen += PAGE_SIZE;
	}
	rqstp->rq_respages = &rqstp->rq_pages[pnum];

	/* Now receive data */
	len = svc_recvfrom(rqstp, vec, pnum, len);
	if (len < 0)
		goto error;

	dprintk("svc: TCP complete record (%d bytes)\n", len);
	rqstp->rq_arg.len = len;
	rqstp->rq_arg.page_base = 0;
	if (len <= rqstp->rq_arg.head[0].iov_len) {
		rqstp->rq_arg.head[0].iov_len = len;
		rqstp->rq_arg.page_len = 0;
	} else {
		rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len;
	}

	rqstp->rq_skbuff      = NULL;
	rqstp->rq_prot	      = IPPROTO_TCP;

	/* Reset TCP read info */
	svsk->sk_reclen = 0;
	svsk->sk_tcplen = 0;

	svc_sock_received(svsk);
	if (serv->sv_stats)
		serv->sv_stats->nettcpcnt++;

	return len;

 err_delete:
	svc_delete_socket(svsk);
	return -EAGAIN;

 error:
	if (len == -EAGAIN) {
		dprintk("RPC: TCP recvfrom got EAGAIN\n");
		svc_sock_received(svsk);
	} else {
		printk(KERN_NOTICE "%s: recvfrom returned errno %d\n",
					svsk->sk_server->sv_name, -len);
		goto err_delete;
	}

	return len;
}

/*
 * Send out data on TCP socket.
 *
 * Stores the record marker in the first four bytes of the reply head
 * and then passes rqstp->rq_res to svc_sendto().  The marker is the
 * reply length minus the four marker bytes themselves, with the top
 * bit (0x80000000) set; the receive path in this file treats a marker
 * without that bit as a non-terminal fragment.
 *
 * Returns the value of svc_sendto() when the whole reply went out,
 * -ENOTCONN when the socket is already flagged SK_DEAD, and -EAGAIN
 * after an error or short send, in which case the socket has been
 * handed to svc_delete_socket().
 */
static int
svc_tcp_sendto(struct svc_rqst *rqstp)
{
	struct xdr_buf	*xbufp = &rqstp->rq_res;
	int sent;
	__be32 reclen;

	/* Set up the first element of the reply kvec.
	 * Any other kvecs that may be in use have been taken
	 * care of by the server implementation itself.
	 */
	reclen = htonl(0x80000000|((xbufp->len ) - 4));
	memcpy(xbufp->head[0].iov_base, &reclen, 4);

	/* the marker is written even if the socket turns out to be dead */
	if (test_bit(SK_DEAD, &rqstp->rq_sock->sk_flags))
		return -ENOTCONN;

	sent = svc_sendto(rqstp, &rqstp->rq_res);
	if (sent != xbufp->len) {
		/* covers both a negative errno and a partial write */
		printk(KERN_NOTICE "rpc-srv/tcp: %s: %s %d when sending %d bytes - shutting down socket\n",
		       rqstp->rq_sock->sk_server->sv_name,
		       (sent<0)?"got error":"sent only",
		       sent, xbufp->len);
		svc_delete_socket(rqstp->rq_sock);
		sent = -EAGAIN;
	}
	return sent;
}

/*
 * Install the TCP receive/send methods on @svsk and replace the
 * callbacks of the underlying sock according to its current state.
 *
 * A sock in TCP_LISTEN state only gets a new data_ready callback and
 * is flagged SK_CONN.  Any other sock gets new state_change,
 * data_ready and write_space callbacks, has the record-reading state
 * (sk_reclen/sk_tcplen) reset, and is flagged SK_CHNGBUF and SK_DATA,
 * plus SK_CLOSE when it is not in TCP_ESTABLISHED state.
 */
static void
svc_tcp_init(struct svc_sock *svsk)
{
	struct sock	*sk = svsk->sk_sk;
	struct tcp_sock *tp = tcp_sk(sk);

	svsk->sk_recvfrom = svc_tcp_recvfrom;
	svsk->sk_sendto = svc_tcp_sendto;

	if (sk->sk_state == TCP_LISTEN) {
		dprintk("setting up TCP socket for listening\n");
		sk->sk_data_ready = svc_tcp_listen_data_ready;
		set_bit(SK_CONN, &svsk->sk_flags);
	} else {
		dprintk("setting up TCP socket for reading\n");
		sk->sk_state_change = svc_tcp_state_change;
		sk->sk_data_ready = svc_tcp_data_ready;
		sk->sk_write_space = svc_write_space;

		svsk->sk_reclen = 0;
		svsk->sk_tcplen = 0;

		tp->nonagle = 1;        /* disable Nagle's algorithm */

		/* initialise setting must have enough space to
		 * receive and respond to one request.
		 * svc_tcp_recvfrom will re-adjust if necessary
		 */
		svc_sock_setbufsize(svsk->sk_sock,
				    3 * svsk->sk_server->sv_max_mesg,
				    3 * svsk->sk_server->sv_max_mesg);

		set_bit(SK_CHNGBUF, &svsk->sk_flags);
		set_bit(SK_DATA, &svsk->sk_flags);
		/* a connection that is not established gets flagged for closing */
		if (sk->sk_state != TCP_ESTABLISHED)
			set_bit(SK_CLOSE, &svsk->sk_flags);
	}
}

/*
 * Flag every permanent and temporary socket of @serv with SK_CHNGBUF.
 * Both lists are walked under serv->sv_lock.
 */
void
svc_sock_update_bufs(struct svc_serv *serv)
{
	/*
	 * The number of server threads has changed. Update
	 * rcvbuf and sndbuf accordingly on all sockets
	 */
	struct list_head *le;

	spin_lock_bh(&serv->sv_lock);
	list_for_each(le, &serv->sv_permsocks) {
		struct svc_sock *svsk =
			list_entry(le, struct svc_sock, sk_list);
		set_bit(SK_CHNGBUF, &svsk->sk_flags);
	}
	list_for_each(le, &serv->sv_tempsocks) {
		struct svc_sock *svsk =
			list_entry(le, struct svc_sock, sk_list);
		set_bit(SK_CHNGBUF, &svsk->sk_flags);
	}
	spin_unlock_bh(&serv->sv_lock);
}

/*
 * Receive the next request on any socket. This code is carefully
 * organised not to touch any cachelines in the shared svc_serv
 * structure, only cachelines in the local svc_pool.
1251 */ 1252 int 1253 svc_recv(struct svc_rqst *rqstp, long timeout) 1254 { 1255 struct svc_sock *svsk =NULL; 1256 struct svc_serv *serv = rqstp->rq_server; 1257 struct svc_pool *pool = rqstp->rq_pool; 1258 int len, i; 1259 int pages; 1260 struct xdr_buf *arg; 1261 DECLARE_WAITQUEUE(wait, current); 1262 1263 dprintk("svc: server %p waiting for data (to = %ld)\n", 1264 rqstp, timeout); 1265 1266 if (rqstp->rq_sock) 1267 printk(KERN_ERR 1268 "svc_recv: service %p, socket not NULL!\n", 1269 rqstp); 1270 if (waitqueue_active(&rqstp->rq_wait)) 1271 printk(KERN_ERR 1272 "svc_recv: service %p, wait queue active!\n", 1273 rqstp); 1274 1275 1276 /* now allocate needed pages. If we get a failure, sleep briefly */ 1277 pages = (serv->sv_max_mesg + PAGE_SIZE) / PAGE_SIZE; 1278 for (i=0; i < pages ; i++) 1279 while (rqstp->rq_pages[i] == NULL) { 1280 struct page *p = alloc_page(GFP_KERNEL); 1281 if (!p) 1282 schedule_timeout_uninterruptible(msecs_to_jiffies(500)); 1283 rqstp->rq_pages[i] = p; 1284 } 1285 rqstp->rq_pages[i++] = NULL; /* this might be seen in nfs_read_actor */ 1286 BUG_ON(pages >= RPCSVC_MAXPAGES); 1287 1288 /* Make arg->head point to first page and arg->pages point to rest */ 1289 arg = &rqstp->rq_arg; 1290 arg->head[0].iov_base = page_address(rqstp->rq_pages[0]); 1291 arg->head[0].iov_len = PAGE_SIZE; 1292 arg->pages = rqstp->rq_pages + 1; 1293 arg->page_base = 0; 1294 /* save at least one page for response */ 1295 arg->page_len = (pages-2)*PAGE_SIZE; 1296 arg->len = (pages-1)*PAGE_SIZE; 1297 arg->tail[0].iov_len = 0; 1298 1299 try_to_freeze(); 1300 cond_resched(); 1301 if (signalled()) 1302 return -EINTR; 1303 1304 spin_lock_bh(&pool->sp_lock); 1305 if ((svsk = svc_sock_dequeue(pool)) != NULL) { 1306 rqstp->rq_sock = svsk; 1307 atomic_inc(&svsk->sk_inuse); 1308 rqstp->rq_reserved = serv->sv_max_mesg; 1309 atomic_add(rqstp->rq_reserved, &svsk->sk_reserved); 1310 } else { 1311 /* No data pending. 
Go to sleep */ 1312 svc_thread_enqueue(pool, rqstp); 1313 1314 /* 1315 * We have to be able to interrupt this wait 1316 * to bring down the daemons ... 1317 */ 1318 set_current_state(TASK_INTERRUPTIBLE); 1319 add_wait_queue(&rqstp->rq_wait, &wait); 1320 spin_unlock_bh(&pool->sp_lock); 1321 1322 schedule_timeout(timeout); 1323 1324 try_to_freeze(); 1325 1326 spin_lock_bh(&pool->sp_lock); 1327 remove_wait_queue(&rqstp->rq_wait, &wait); 1328 1329 if (!(svsk = rqstp->rq_sock)) { 1330 svc_thread_dequeue(pool, rqstp); 1331 spin_unlock_bh(&pool->sp_lock); 1332 dprintk("svc: server %p, no data yet\n", rqstp); 1333 return signalled()? -EINTR : -EAGAIN; 1334 } 1335 } 1336 spin_unlock_bh(&pool->sp_lock); 1337 1338 dprintk("svc: server %p, pool %u, socket %p, inuse=%d\n", 1339 rqstp, pool->sp_id, svsk, atomic_read(&svsk->sk_inuse)); 1340 len = svsk->sk_recvfrom(rqstp); 1341 dprintk("svc: got len=%d\n", len); 1342 1343 /* No data, incomplete (TCP) read, or accept() */ 1344 if (len == 0 || len == -EAGAIN) { 1345 rqstp->rq_res.len = 0; 1346 svc_sock_release(rqstp); 1347 return -EAGAIN; 1348 } 1349 svsk->sk_lastrecv = get_seconds(); 1350 clear_bit(SK_OLD, &svsk->sk_flags); 1351 1352 rqstp->rq_secure = ntohs(rqstp->rq_addr.sin_port) < 1024; 1353 rqstp->rq_chandle.defer = svc_defer; 1354 1355 if (serv->sv_stats) 1356 serv->sv_stats->netcnt++; 1357 return len; 1358 } 1359 1360 /* 1361 * Drop request 1362 */ 1363 void 1364 svc_drop(struct svc_rqst *rqstp) 1365 { 1366 dprintk("svc: socket %p dropped request\n", rqstp->rq_sock); 1367 svc_sock_release(rqstp); 1368 } 1369 1370 /* 1371 * Return reply to client. 
1372 */ 1373 int 1374 svc_send(struct svc_rqst *rqstp) 1375 { 1376 struct svc_sock *svsk; 1377 int len; 1378 struct xdr_buf *xb; 1379 1380 if ((svsk = rqstp->rq_sock) == NULL) { 1381 printk(KERN_WARNING "NULL socket pointer in %s:%d\n", 1382 __FILE__, __LINE__); 1383 return -EFAULT; 1384 } 1385 1386 /* release the receive skb before sending the reply */ 1387 svc_release_skb(rqstp); 1388 1389 /* calculate over-all length */ 1390 xb = & rqstp->rq_res; 1391 xb->len = xb->head[0].iov_len + 1392 xb->page_len + 1393 xb->tail[0].iov_len; 1394 1395 /* Grab svsk->sk_mutex to serialize outgoing data. */ 1396 mutex_lock(&svsk->sk_mutex); 1397 if (test_bit(SK_DEAD, &svsk->sk_flags)) 1398 len = -ENOTCONN; 1399 else 1400 len = svsk->sk_sendto(rqstp); 1401 mutex_unlock(&svsk->sk_mutex); 1402 svc_sock_release(rqstp); 1403 1404 if (len == -ECONNREFUSED || len == -ENOTCONN || len == -EAGAIN) 1405 return 0; 1406 return len; 1407 } 1408 1409 /* 1410 * Timer function to close old temporary sockets, using 1411 * a mark-and-sweep algorithm. 
1412 */ 1413 static void 1414 svc_age_temp_sockets(unsigned long closure) 1415 { 1416 struct svc_serv *serv = (struct svc_serv *)closure; 1417 struct svc_sock *svsk; 1418 struct list_head *le, *next; 1419 LIST_HEAD(to_be_aged); 1420 1421 dprintk("svc_age_temp_sockets\n"); 1422 1423 if (!spin_trylock_bh(&serv->sv_lock)) { 1424 /* busy, try again 1 sec later */ 1425 dprintk("svc_age_temp_sockets: busy\n"); 1426 mod_timer(&serv->sv_temptimer, jiffies + HZ); 1427 return; 1428 } 1429 1430 list_for_each_safe(le, next, &serv->sv_tempsocks) { 1431 svsk = list_entry(le, struct svc_sock, sk_list); 1432 1433 if (!test_and_set_bit(SK_OLD, &svsk->sk_flags)) 1434 continue; 1435 if (atomic_read(&svsk->sk_inuse) || test_bit(SK_BUSY, &svsk->sk_flags)) 1436 continue; 1437 atomic_inc(&svsk->sk_inuse); 1438 list_move(le, &to_be_aged); 1439 set_bit(SK_CLOSE, &svsk->sk_flags); 1440 set_bit(SK_DETACHED, &svsk->sk_flags); 1441 } 1442 spin_unlock_bh(&serv->sv_lock); 1443 1444 while (!list_empty(&to_be_aged)) { 1445 le = to_be_aged.next; 1446 /* fiddling the sk_list node is safe 'cos we're SK_DETACHED */ 1447 list_del_init(le); 1448 svsk = list_entry(le, struct svc_sock, sk_list); 1449 1450 dprintk("queuing svsk %p for closing, %lu seconds old\n", 1451 svsk, get_seconds() - svsk->sk_lastrecv); 1452 1453 /* a thread will dequeue and close it soon */ 1454 svc_sock_enqueue(svsk); 1455 svc_sock_put(svsk); 1456 } 1457 1458 mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ); 1459 } 1460 1461 /* 1462 * Initialize socket for RPC use and create svc_sock struct 1463 * XXX: May want to setsockopt SO_SNDBUF and SO_RCVBUF. 
1464 */ 1465 static struct svc_sock * 1466 svc_setup_socket(struct svc_serv *serv, struct socket *sock, 1467 int *errp, int pmap_register) 1468 { 1469 struct svc_sock *svsk; 1470 struct sock *inet; 1471 1472 dprintk("svc: svc_setup_socket %p\n", sock); 1473 if (!(svsk = kzalloc(sizeof(*svsk), GFP_KERNEL))) { 1474 *errp = -ENOMEM; 1475 return NULL; 1476 } 1477 1478 inet = sock->sk; 1479 1480 /* Register socket with portmapper */ 1481 if (*errp >= 0 && pmap_register) 1482 *errp = svc_register(serv, inet->sk_protocol, 1483 ntohs(inet_sk(inet)->sport)); 1484 1485 if (*errp < 0) { 1486 kfree(svsk); 1487 return NULL; 1488 } 1489 1490 set_bit(SK_BUSY, &svsk->sk_flags); 1491 inet->sk_user_data = svsk; 1492 svsk->sk_sock = sock; 1493 svsk->sk_sk = inet; 1494 svsk->sk_ostate = inet->sk_state_change; 1495 svsk->sk_odata = inet->sk_data_ready; 1496 svsk->sk_owspace = inet->sk_write_space; 1497 svsk->sk_server = serv; 1498 atomic_set(&svsk->sk_inuse, 0); 1499 svsk->sk_lastrecv = get_seconds(); 1500 spin_lock_init(&svsk->sk_defer_lock); 1501 INIT_LIST_HEAD(&svsk->sk_deferred); 1502 INIT_LIST_HEAD(&svsk->sk_ready); 1503 mutex_init(&svsk->sk_mutex); 1504 1505 /* Initialize the socket */ 1506 if (sock->type == SOCK_DGRAM) 1507 svc_udp_init(svsk); 1508 else 1509 svc_tcp_init(svsk); 1510 1511 spin_lock_bh(&serv->sv_lock); 1512 if (!pmap_register) { 1513 set_bit(SK_TEMP, &svsk->sk_flags); 1514 list_add(&svsk->sk_list, &serv->sv_tempsocks); 1515 serv->sv_tmpcnt++; 1516 if (serv->sv_temptimer.function == NULL) { 1517 /* setup timer to age temp sockets */ 1518 setup_timer(&serv->sv_temptimer, svc_age_temp_sockets, 1519 (unsigned long)serv); 1520 mod_timer(&serv->sv_temptimer, 1521 jiffies + svc_conn_age_period * HZ); 1522 } 1523 } else { 1524 clear_bit(SK_TEMP, &svsk->sk_flags); 1525 list_add(&svsk->sk_list, &serv->sv_permsocks); 1526 } 1527 spin_unlock_bh(&serv->sv_lock); 1528 1529 dprintk("svc: svc_setup_socket created %p (inet %p)\n", 1530 svsk, svsk->sk_sk); 1531 1532 
clear_bit(SK_BUSY, &svsk->sk_flags); 1533 svc_sock_enqueue(svsk); 1534 return svsk; 1535 } 1536 1537 int svc_addsock(struct svc_serv *serv, 1538 int fd, 1539 char *name_return, 1540 int *proto) 1541 { 1542 int err = 0; 1543 struct socket *so = sockfd_lookup(fd, &err); 1544 struct svc_sock *svsk = NULL; 1545 1546 if (!so) 1547 return err; 1548 if (so->sk->sk_family != AF_INET) 1549 err = -EAFNOSUPPORT; 1550 else if (so->sk->sk_protocol != IPPROTO_TCP && 1551 so->sk->sk_protocol != IPPROTO_UDP) 1552 err = -EPROTONOSUPPORT; 1553 else if (so->state > SS_UNCONNECTED) 1554 err = -EISCONN; 1555 else { 1556 svsk = svc_setup_socket(serv, so, &err, 1); 1557 if (svsk) 1558 err = 0; 1559 } 1560 if (err) { 1561 sockfd_put(so); 1562 return err; 1563 } 1564 if (proto) *proto = so->sk->sk_protocol; 1565 return one_sock_name(name_return, svsk); 1566 } 1567 EXPORT_SYMBOL_GPL(svc_addsock); 1568 1569 /* 1570 * Create socket for RPC service. 1571 */ 1572 static int 1573 svc_create_socket(struct svc_serv *serv, int protocol, struct sockaddr_in *sin) 1574 { 1575 struct svc_sock *svsk; 1576 struct socket *sock; 1577 int error; 1578 int type; 1579 1580 dprintk("svc: svc_create_socket(%s, %d, %u.%u.%u.%u:%d)\n", 1581 serv->sv_program->pg_name, protocol, 1582 NIPQUAD(sin->sin_addr.s_addr), 1583 ntohs(sin->sin_port)); 1584 1585 if (protocol != IPPROTO_UDP && protocol != IPPROTO_TCP) { 1586 printk(KERN_WARNING "svc: only UDP and TCP " 1587 "sockets supported\n"); 1588 return -EINVAL; 1589 } 1590 type = (protocol == IPPROTO_UDP)? 
SOCK_DGRAM : SOCK_STREAM; 1591 1592 if ((error = sock_create_kern(PF_INET, type, protocol, &sock)) < 0) 1593 return error; 1594 1595 svc_reclassify_socket(sock); 1596 1597 if (type == SOCK_STREAM) 1598 sock->sk->sk_reuse = 1; /* allow address reuse */ 1599 error = kernel_bind(sock, (struct sockaddr *) sin, 1600 sizeof(*sin)); 1601 if (error < 0) 1602 goto bummer; 1603 1604 if (protocol == IPPROTO_TCP) { 1605 if ((error = kernel_listen(sock, 64)) < 0) 1606 goto bummer; 1607 } 1608 1609 if ((svsk = svc_setup_socket(serv, sock, &error, 1)) != NULL) 1610 return 0; 1611 1612 bummer: 1613 dprintk("svc: svc_create_socket error = %d\n", -error); 1614 sock_release(sock); 1615 return error; 1616 } 1617 1618 /* 1619 * Remove a dead socket 1620 */ 1621 void 1622 svc_delete_socket(struct svc_sock *svsk) 1623 { 1624 struct svc_serv *serv; 1625 struct sock *sk; 1626 1627 dprintk("svc: svc_delete_socket(%p)\n", svsk); 1628 1629 serv = svsk->sk_server; 1630 sk = svsk->sk_sk; 1631 1632 sk->sk_state_change = svsk->sk_ostate; 1633 sk->sk_data_ready = svsk->sk_odata; 1634 sk->sk_write_space = svsk->sk_owspace; 1635 1636 spin_lock_bh(&serv->sv_lock); 1637 1638 if (!test_and_set_bit(SK_DETACHED, &svsk->sk_flags)) 1639 list_del_init(&svsk->sk_list); 1640 /* 1641 * We used to delete the svc_sock from whichever list 1642 * it's sk_ready node was on, but we don't actually 1643 * need to. This is because the only time we're called 1644 * while still attached to a queue, the queue itself 1645 * is about to be destroyed (in svc_destroy). 1646 */ 1647 if (!test_and_set_bit(SK_DEAD, &svsk->sk_flags)) 1648 if (test_bit(SK_TEMP, &svsk->sk_flags)) 1649 serv->sv_tmpcnt--; 1650 1651 /* This atomic_inc should be needed - svc_delete_socket 1652 * should have the semantic of dropping a reference. 1653 * But it doesn't yet.... 
1654 */ 1655 atomic_inc(&svsk->sk_inuse); 1656 spin_unlock_bh(&serv->sv_lock); 1657 svc_sock_put(svsk); 1658 } 1659 1660 /* 1661 * Make a socket for nfsd and lockd 1662 */ 1663 int 1664 svc_makesock(struct svc_serv *serv, int protocol, unsigned short port) 1665 { 1666 struct sockaddr_in sin; 1667 1668 dprintk("svc: creating socket proto = %d\n", protocol); 1669 sin.sin_family = AF_INET; 1670 sin.sin_addr.s_addr = INADDR_ANY; 1671 sin.sin_port = htons(port); 1672 return svc_create_socket(serv, protocol, &sin); 1673 } 1674 1675 /* 1676 * Handle defer and revisit of requests 1677 */ 1678 1679 static void svc_revisit(struct cache_deferred_req *dreq, int too_many) 1680 { 1681 struct svc_deferred_req *dr = container_of(dreq, struct svc_deferred_req, handle); 1682 struct svc_sock *svsk; 1683 1684 if (too_many) { 1685 svc_sock_put(dr->svsk); 1686 kfree(dr); 1687 return; 1688 } 1689 dprintk("revisit queued\n"); 1690 svsk = dr->svsk; 1691 dr->svsk = NULL; 1692 spin_lock_bh(&svsk->sk_defer_lock); 1693 list_add(&dr->handle.recent, &svsk->sk_deferred); 1694 spin_unlock_bh(&svsk->sk_defer_lock); 1695 set_bit(SK_DEFERRED, &svsk->sk_flags); 1696 svc_sock_enqueue(svsk); 1697 svc_sock_put(svsk); 1698 } 1699 1700 static struct cache_deferred_req * 1701 svc_defer(struct cache_req *req) 1702 { 1703 struct svc_rqst *rqstp = container_of(req, struct svc_rqst, rq_chandle); 1704 int size = sizeof(struct svc_deferred_req) + (rqstp->rq_arg.len); 1705 struct svc_deferred_req *dr; 1706 1707 if (rqstp->rq_arg.page_len) 1708 return NULL; /* if more than a page, give up FIXME */ 1709 if (rqstp->rq_deferred) { 1710 dr = rqstp->rq_deferred; 1711 rqstp->rq_deferred = NULL; 1712 } else { 1713 int skip = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len; 1714 /* FIXME maybe discard if size too large */ 1715 dr = kmalloc(size, GFP_KERNEL); 1716 if (dr == NULL) 1717 return NULL; 1718 1719 dr->handle.owner = rqstp->rq_server; 1720 dr->prot = rqstp->rq_prot; 1721 dr->addr = rqstp->rq_addr; 1722 dr->daddr 
= rqstp->rq_daddr; 1723 dr->argslen = rqstp->rq_arg.len >> 2; 1724 memcpy(dr->args, rqstp->rq_arg.head[0].iov_base-skip, dr->argslen<<2); 1725 } 1726 atomic_inc(&rqstp->rq_sock->sk_inuse); 1727 dr->svsk = rqstp->rq_sock; 1728 1729 dr->handle.revisit = svc_revisit; 1730 return &dr->handle; 1731 } 1732 1733 /* 1734 * recv data from a deferred request into an active one 1735 */ 1736 static int svc_deferred_recv(struct svc_rqst *rqstp) 1737 { 1738 struct svc_deferred_req *dr = rqstp->rq_deferred; 1739 1740 rqstp->rq_arg.head[0].iov_base = dr->args; 1741 rqstp->rq_arg.head[0].iov_len = dr->argslen<<2; 1742 rqstp->rq_arg.page_len = 0; 1743 rqstp->rq_arg.len = dr->argslen<<2; 1744 rqstp->rq_prot = dr->prot; 1745 rqstp->rq_addr = dr->addr; 1746 rqstp->rq_daddr = dr->daddr; 1747 rqstp->rq_respages = rqstp->rq_pages; 1748 return dr->argslen<<2; 1749 } 1750 1751 1752 static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk) 1753 { 1754 struct svc_deferred_req *dr = NULL; 1755 1756 if (!test_bit(SK_DEFERRED, &svsk->sk_flags)) 1757 return NULL; 1758 spin_lock_bh(&svsk->sk_defer_lock); 1759 clear_bit(SK_DEFERRED, &svsk->sk_flags); 1760 if (!list_empty(&svsk->sk_deferred)) { 1761 dr = list_entry(svsk->sk_deferred.next, 1762 struct svc_deferred_req, 1763 handle.recent); 1764 list_del_init(&dr->handle.recent); 1765 set_bit(SK_DEFERRED, &svsk->sk_flags); 1766 } 1767 spin_unlock_bh(&svsk->sk_defer_lock); 1768 return dr; 1769 } 1770