1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T 28 * All Rights Reserved 29 */ 30 31 /* 32 * Portions of this source code were derived from Berkeley 4.3 BSD 33 * under license from the Regents of the University of California. 34 */ 35 36 37 /* 38 * Implements a kernel based, client side RPC over Connection Oriented 39 * Transports (COTS). 40 */ 41 42 /* 43 * Much of this file has been re-written to let NFS work better over slow 44 * transports. A description follows. 45 * 46 * One of the annoying things about kRPC/COTS is that it will temporarily 47 * create more than one connection between a client and server. This 48 * happens because when a connection is made, the end-points entry in the 49 * linked list of connections (headed by cm_hd), is removed so that other 50 * threads don't mess with it. Went ahead and bit the bullet by keeping 51 * the endpoint on the connection list and introducing state bits, 52 * condition variables etc. 
to the connection entry data structure (struct cm_xprt).
 *
 * Here is a summary of the changes to cm_xprt:
 *
 *	x_ctime is the timestamp of when the endpoint was last
 *	connected or disconnected. If an end-point is ever disconnected
 *	or re-connected, then any outstanding RPC request is presumed
 *	lost, telling clnt_cots_kcallit that it needs to re-send the
 *	request, not just wait for the original request's reply to
 *	arrive.
 *
 *	x_thread flag which tells us if a thread is doing a connection
 *	attempt.
 *
 *	x_waitdis flag which tells us we are waiting for a disconnect ACK.
 *
 *	x_needdis flag which tells us we need to send a T_DISCON_REQ
 *	to kill the connection.
 *
 *	x_needrel flag which tells us we need to send a T_ORDREL_REQ to
 *	gracefully close the connection.
 *
 *	#defined bitmasks for all the b_* bits so that more
 *	efficient (and at times less clumsy) masks can be used to
 *	manipulate state in cases where multiple bits have to be
 *	set/cleared/checked in the same critical section.
 *
 *	x_conn_cv and x_dis_cv are new condition variables to let
 *	threads know when the connection attempt is done, and to let
 *	the connecting thread know when the disconnect handshake is
 *	done.
 *
 * Added the CONN_HOLD() macro so that all reference holds have the same
 * look and feel.
 *
 * In the private (cku_private) portion of the client handle,
 *
 *	cku_flags replaces the cku_sent boolean. cku_flags keeps
 *	track of whether a request has been sent, and whether the
 *	client handle's call record is on the dispatch list (so that
 *	the reply can be matched by XID to the right client handle).
 *	The idea of CKU_ONQUEUE is that we can exit clnt_cots_kcallit()
 *	and still have the response find the right client handle so
 *	that the retry of CLNT_CALL() gets the result. Testing found
 *	situations where if the timeout was increased, performance
 *	degraded. This was due to us hitting a window where the thread
 *	was back in rfscall() (probably printing server not responding)
 *	while the response came back but had no place to be put.
 *
 *	cku_ctime is just a cache of x_ctime. If they match,
 *	clnt_cots_kcallit() won't send a retry (unless the maximum
 *	receive count limit has been reached). If they don't match, then
 *	we assume the request has been lost, and a retry of the request
 *	is needed.
 *
 *	cku_recv_attempts counts the number of receive attempts
 *	after one try is sent on the wire.
 *
 * Added the clnt_delay() routine so that interruptible and
 * noninterruptible delays are possible.
 *
 * CLNT_MIN_TIMEOUT has been bumped to 10 seconds from 3. This is used to
 * control how long the client delays before returning after getting
 * ECONNREFUSED. At 3 seconds, 8 client threads per mount really do bash
 * a server that may be booting and not yet started nfsd.
 *
 * CLNT_MAXRECV_WITHOUT_RETRY is a new macro (value of 3) (with a tunable).
 * Why don't we just wait forever (receive an infinite # of times)?
 * Because the server may have rebooted. More insidious is that some
 * servers (ours) will drop NFS/TCP requests in some cases. This is bad,
 * but it is a reality.
 *
 * The case of a server doing orderly release really messes up the
 * client's recovery, especially if the server's TCP implementation is
 * buggy. It was found that the kRPC/COTS client was breaking some
 * TPI rules, such as not waiting for the acknowledgement of a
 * T_DISCON_REQ (hence the added case statements T_ERROR_ACK, T_OK_ACK and
 * T_DISCON_REQ in clnt_dispatch_notifyall()).
 *
 * One of the things that we've seen is that a kRPC TCP endpoint goes into
 * TIMEWAIT and thus a reconnect takes a long time to satisfy because
 * the TIMEWAIT state takes a while to finish. If a server sends a
 * T_ORDREL_IND, there is little point in an RPC client doing a
 * T_ORDREL_REQ, because the RPC request isn't going to make it (the
 * server is saying that it won't accept any more data). So kRPC was
 * changed to send a T_DISCON_REQ when we get a T_ORDREL_IND. So now the
 * connection skips the TIMEWAIT state and goes straight to a bound state
 * that kRPC can quickly switch to connected.
 *
 * Code that issues TPI requests must use waitforack() to wait for the
 * corresponding ack (assuming there is one) in any future modifications.
 * This works around problems that may be introduced by breaking TPI rules
 * (by submitting new calls before earlier requests have been acked) in the
 * case of a signal or other early return. waitforack() depends on
 * clnt_dispatch_notifyconn() to issue the wakeup when the ack
 * arrives, so adding new TPI calls may require corresponding changes
 * to clnt_dispatch_notifyconn(). Presently, the timeout period is based on
 * CLNT_MIN_TIMEOUT which is 10 seconds. If you modify this value, be sure
 * not to set it too low or TPI ACKS will be lost.
151 */ 152 153 #include <sys/param.h> 154 #include <sys/types.h> 155 #include <sys/user.h> 156 #include <sys/systm.h> 157 #include <sys/sysmacros.h> 158 #include <sys/proc.h> 159 #include <sys/socket.h> 160 #include <sys/file.h> 161 #include <sys/stream.h> 162 #include <sys/strsubr.h> 163 #include <sys/stropts.h> 164 #include <sys/strsun.h> 165 #include <sys/timod.h> 166 #include <sys/tiuser.h> 167 #include <sys/tihdr.h> 168 #include <sys/t_kuser.h> 169 #include <sys/fcntl.h> 170 #include <sys/errno.h> 171 #include <sys/kmem.h> 172 #include <sys/debug.h> 173 #include <sys/systm.h> 174 #include <sys/kstat.h> 175 #include <sys/t_lock.h> 176 #include <sys/ddi.h> 177 #include <sys/cmn_err.h> 178 #include <sys/time.h> 179 #include <sys/isa_defs.h> 180 #include <sys/callb.h> 181 #include <sys/sunddi.h> 182 #include <sys/atomic.h> 183 #include <sys/sdt.h> 184 185 #include <netinet/in.h> 186 #include <netinet/tcp.h> 187 188 #include <rpc/types.h> 189 #include <rpc/xdr.h> 190 #include <rpc/auth.h> 191 #include <rpc/clnt.h> 192 #include <rpc/rpc_msg.h> 193 194 #define COTS_DEFAULT_ALLOCSIZE 2048 195 196 #define WIRE_HDR_SIZE 20 /* serialized call header, sans proc number */ 197 #define MSG_OFFSET 128 /* offset of call into the mblk */ 198 199 const char *kinet_ntop6(uchar_t *, char *, size_t); 200 201 static int clnt_cots_ksettimers(CLIENT *, struct rpc_timers *, 202 struct rpc_timers *, int, void(*)(int, int, caddr_t), caddr_t, uint32_t); 203 static enum clnt_stat clnt_cots_kcallit(CLIENT *, rpcproc_t, xdrproc_t, 204 caddr_t, xdrproc_t, caddr_t, struct timeval); 205 static void clnt_cots_kabort(CLIENT *); 206 static void clnt_cots_kerror(CLIENT *, struct rpc_err *); 207 static bool_t clnt_cots_kfreeres(CLIENT *, xdrproc_t, caddr_t); 208 static void clnt_cots_kdestroy(CLIENT *); 209 static bool_t clnt_cots_kcontrol(CLIENT *, int, char *); 210 211 212 /* List of transports managed by the connection manager. 
*/ 213 struct cm_xprt { 214 TIUSER *x_tiptr; /* transport handle */ 215 queue_t *x_wq; /* send queue */ 216 clock_t x_time; /* last time we handed this xprt out */ 217 clock_t x_ctime; /* time we went to CONNECTED */ 218 int x_tidu_size; /* TIDU size of this transport */ 219 union { 220 struct { 221 unsigned int 222 #ifdef _BIT_FIELDS_HTOL 223 b_closing: 1, /* we've sent a ord rel on this conn */ 224 b_dead: 1, /* transport is closed or disconn */ 225 b_doomed: 1, /* too many conns, let this go idle */ 226 b_connected: 1, /* this connection is connected */ 227 228 b_ordrel: 1, /* do an orderly release? */ 229 b_thread: 1, /* thread doing connect */ 230 b_waitdis: 1, /* waiting for disconnect ACK */ 231 b_needdis: 1, /* need T_DISCON_REQ */ 232 233 b_needrel: 1, /* need T_ORDREL_REQ */ 234 b_early_disc: 1, /* got a T_ORDREL_IND or T_DISCON_IND */ 235 /* disconnect during connect */ 236 237 b_pad: 22; 238 239 #endif 240 241 #ifdef _BIT_FIELDS_LTOH 242 b_pad: 22, 243 244 b_early_disc: 1, /* got a T_ORDREL_IND or T_DISCON_IND */ 245 /* disconnect during connect */ 246 b_needrel: 1, /* need T_ORDREL_REQ */ 247 248 b_needdis: 1, /* need T_DISCON_REQ */ 249 b_waitdis: 1, /* waiting for disconnect ACK */ 250 b_thread: 1, /* thread doing connect */ 251 b_ordrel: 1, /* do an orderly release? 
*/ 252 253 b_connected: 1, /* this connection is connected */ 254 b_doomed: 1, /* too many conns, let this go idle */ 255 b_dead: 1, /* transport is closed or disconn */ 256 b_closing: 1; /* we've sent a ord rel on this conn */ 257 #endif 258 } bit; unsigned int word; 259 260 #define x_closing x_state.bit.b_closing 261 #define x_dead x_state.bit.b_dead 262 #define x_doomed x_state.bit.b_doomed 263 #define x_connected x_state.bit.b_connected 264 265 #define x_ordrel x_state.bit.b_ordrel 266 #define x_thread x_state.bit.b_thread 267 #define x_waitdis x_state.bit.b_waitdis 268 #define x_needdis x_state.bit.b_needdis 269 270 #define x_needrel x_state.bit.b_needrel 271 #define x_early_disc x_state.bit.b_early_disc 272 273 #define x_state_flags x_state.word 274 275 #define X_CLOSING 0x80000000 276 #define X_DEAD 0x40000000 277 #define X_DOOMED 0x20000000 278 #define X_CONNECTED 0x10000000 279 280 #define X_ORDREL 0x08000000 281 #define X_THREAD 0x04000000 282 #define X_WAITDIS 0x02000000 283 #define X_NEEDDIS 0x01000000 284 285 #define X_NEEDREL 0x00800000 286 #define X_EARLYDISC 0x00400000 287 288 #define X_BADSTATES (X_CLOSING | X_DEAD | X_DOOMED) 289 290 } x_state; 291 int x_ref; /* number of users of this xprt */ 292 int x_family; /* address family of transport */ 293 dev_t x_rdev; /* device number of transport */ 294 struct cm_xprt *x_next; 295 296 struct netbuf x_server; /* destination address */ 297 struct netbuf x_src; /* src address (for retries) */ 298 kmutex_t x_lock; /* lock on this entry */ 299 kcondvar_t x_cv; /* to signal when can be closed */ 300 kcondvar_t x_conn_cv; /* to signal when connection attempt */ 301 /* is complete */ 302 kstat_t *x_ksp; 303 304 kcondvar_t x_dis_cv; /* to signal when disconnect attempt */ 305 /* is complete */ 306 zoneid_t x_zoneid; /* zone this xprt belongs to */ 307 }; 308 309 typedef struct cm_kstat_xprt { 310 kstat_named_t x_wq; 311 kstat_named_t x_server; 312 kstat_named_t x_family; 313 kstat_named_t x_rdev; 314 
kstat_named_t x_time; 315 kstat_named_t x_state; 316 kstat_named_t x_ref; 317 kstat_named_t x_port; 318 } cm_kstat_xprt_t; 319 320 static cm_kstat_xprt_t cm_kstat_template = { 321 { "write_queue", KSTAT_DATA_UINT32 }, 322 { "server", KSTAT_DATA_STRING }, 323 { "addr_family", KSTAT_DATA_UINT32 }, 324 { "device", KSTAT_DATA_UINT32 }, 325 { "time_stamp", KSTAT_DATA_UINT32 }, 326 { "status", KSTAT_DATA_UINT32 }, 327 { "ref_count", KSTAT_DATA_INT32 }, 328 { "port", KSTAT_DATA_UINT32 }, 329 }; 330 331 /* 332 * The inverse of this is connmgr_release(). 333 */ 334 #define CONN_HOLD(Cm_entry) {\ 335 mutex_enter(&(Cm_entry)->x_lock); \ 336 (Cm_entry)->x_ref++; \ 337 mutex_exit(&(Cm_entry)->x_lock); \ 338 } 339 340 341 /* 342 * Private data per rpc handle. This structure is allocated by 343 * clnt_cots_kcreate, and freed by clnt_cots_kdestroy. 344 */ 345 typedef struct cku_private_s { 346 CLIENT cku_client; /* client handle */ 347 calllist_t cku_call; /* for dispatching calls */ 348 struct rpc_err cku_err; /* error status */ 349 350 struct netbuf cku_srcaddr; /* source address for retries */ 351 int cku_addrfmly; /* for binding port */ 352 struct netbuf cku_addr; /* remote address */ 353 dev_t cku_device; /* device to use */ 354 uint_t cku_flags; 355 #define CKU_ONQUEUE 0x1 356 #define CKU_SENT 0x2 357 358 bool_t cku_progress; /* for CLSET_PROGRESS */ 359 uint32_t cku_xid; /* current XID */ 360 clock_t cku_ctime; /* time stamp of when */ 361 /* connection was created */ 362 uint_t cku_recv_attempts; 363 XDR cku_outxdr; /* xdr routine for output */ 364 XDR cku_inxdr; /* xdr routine for input */ 365 char cku_rpchdr[WIRE_HDR_SIZE + 4]; 366 /* pre-serialized rpc header */ 367 368 uint_t cku_outbuflen; /* default output mblk length */ 369 struct cred *cku_cred; /* credentials */ 370 bool_t cku_nodelayonerr; 371 /* for CLSET_NODELAYONERR */ 372 int cku_useresvport; /* Use reserved port */ 373 struct rpc_cots_client *cku_stats; /* stats for zone */ 374 } cku_private_t; 375 376 
static struct cm_xprt *connmgr_wrapconnect(struct cm_xprt *, 377 const struct timeval *, struct netbuf *, int, struct netbuf *, 378 struct rpc_err *, bool_t, bool_t, cred_t *); 379 380 static bool_t connmgr_connect(struct cm_xprt *, queue_t *, struct netbuf *, 381 int, calllist_t *, int *, bool_t reconnect, 382 const struct timeval *, bool_t, cred_t *); 383 384 static void *connmgr_opt_getoff(mblk_t *mp, t_uscalar_t offset, 385 t_uscalar_t length, uint_t align_size); 386 static bool_t connmgr_setbufsz(calllist_t *e, queue_t *wq, cred_t *cr); 387 static bool_t connmgr_getopt_int(queue_t *wq, int level, int name, int *val, 388 calllist_t *e, cred_t *cr); 389 static bool_t connmgr_setopt_int(queue_t *wq, int level, int name, int val, 390 calllist_t *e, cred_t *cr); 391 static bool_t connmgr_setopt(queue_t *, int, int, calllist_t *, cred_t *cr); 392 static void connmgr_sndrel(struct cm_xprt *); 393 static void connmgr_snddis(struct cm_xprt *); 394 static void connmgr_close(struct cm_xprt *); 395 static void connmgr_release(struct cm_xprt *); 396 static struct cm_xprt *connmgr_wrapget(struct netbuf *, const struct timeval *, 397 cku_private_t *); 398 399 static struct cm_xprt *connmgr_get(struct netbuf *, const struct timeval *, 400 struct netbuf *, int, struct netbuf *, struct rpc_err *, dev_t, 401 bool_t, int, cred_t *); 402 403 static void connmgr_cancelconn(struct cm_xprt *); 404 static enum clnt_stat connmgr_cwait(struct cm_xprt *, const struct timeval *, 405 bool_t); 406 static void connmgr_dis_and_wait(struct cm_xprt *); 407 408 static int clnt_dispatch_send(queue_t *, mblk_t *, calllist_t *, uint_t, 409 uint_t); 410 411 static int clnt_delay(clock_t, bool_t); 412 413 static int waitforack(calllist_t *, t_scalar_t, const struct timeval *, bool_t); 414 415 /* 416 * Operations vector for TCP/IP based RPC 417 */ 418 static struct clnt_ops tcp_ops = { 419 clnt_cots_kcallit, /* do rpc call */ 420 clnt_cots_kabort, /* abort call */ 421 clnt_cots_kerror, /* return error 
status */ 422 clnt_cots_kfreeres, /* free results */ 423 clnt_cots_kdestroy, /* destroy rpc handle */ 424 clnt_cots_kcontrol, /* the ioctl() of rpc */ 425 clnt_cots_ksettimers, /* set retry timers */ 426 }; 427 428 static int rpc_kstat_instance = 0; /* keeps the current instance */ 429 /* number for the next kstat_create */ 430 431 static struct cm_xprt *cm_hd = NULL; 432 static kmutex_t connmgr_lock; /* for connection mngr's list of transports */ 433 434 extern kmutex_t clnt_max_msg_lock; 435 436 static calllist_t *clnt_pending = NULL; 437 extern kmutex_t clnt_pending_lock; 438 439 static int clnt_cots_hash_size = DEFAULT_HASH_SIZE; 440 441 static call_table_t *cots_call_ht; 442 443 static const struct rpc_cots_client { 444 kstat_named_t rccalls; 445 kstat_named_t rcbadcalls; 446 kstat_named_t rcbadxids; 447 kstat_named_t rctimeouts; 448 kstat_named_t rcnewcreds; 449 kstat_named_t rcbadverfs; 450 kstat_named_t rctimers; 451 kstat_named_t rccantconn; 452 kstat_named_t rcnomem; 453 kstat_named_t rcintrs; 454 } cots_rcstat_tmpl = { 455 { "calls", KSTAT_DATA_UINT64 }, 456 { "badcalls", KSTAT_DATA_UINT64 }, 457 { "badxids", KSTAT_DATA_UINT64 }, 458 { "timeouts", KSTAT_DATA_UINT64 }, 459 { "newcreds", KSTAT_DATA_UINT64 }, 460 { "badverfs", KSTAT_DATA_UINT64 }, 461 { "timers", KSTAT_DATA_UINT64 }, 462 { "cantconn", KSTAT_DATA_UINT64 }, 463 { "nomem", KSTAT_DATA_UINT64 }, 464 { "interrupts", KSTAT_DATA_UINT64 } 465 }; 466 467 #define COTSRCSTAT_INCR(p, x) \ 468 atomic_add_64(&(p)->x.value.ui64, 1) 469 470 #define CLNT_MAX_CONNS 1 /* concurrent connections between clnt/srvr */ 471 int clnt_max_conns = CLNT_MAX_CONNS; 472 473 #define CLNT_MIN_TIMEOUT 10 /* seconds to wait after we get a */ 474 /* connection reset */ 475 #define CLNT_MIN_CONNTIMEOUT 5 /* seconds to wait for a connection */ 476 477 478 int clnt_cots_min_tout = CLNT_MIN_TIMEOUT; 479 int clnt_cots_min_conntout = CLNT_MIN_CONNTIMEOUT; 480 481 /* 482 * Limit the number of times we will attempt to receive a reply 
without 483 * re-sending a response. 484 */ 485 #define CLNT_MAXRECV_WITHOUT_RETRY 3 486 uint_t clnt_cots_maxrecv = CLNT_MAXRECV_WITHOUT_RETRY; 487 488 uint_t *clnt_max_msg_sizep; 489 void (*clnt_stop_idle)(queue_t *wq); 490 491 #define ptoh(p) (&((p)->cku_client)) 492 #define htop(h) ((cku_private_t *)((h)->cl_private)) 493 494 /* 495 * Times to retry 496 */ 497 #define REFRESHES 2 /* authentication refreshes */ 498 499 /* 500 * The following is used to determine the global default behavior for 501 * COTS when binding to a local port. 502 * 503 * If the value is set to 1 the default will be to select a reserved 504 * (aka privileged) port, if the value is zero the default will be to 505 * use non-reserved ports. Users of kRPC may override this by using 506 * CLNT_CONTROL() and CLSET_BINDRESVPORT. 507 */ 508 int clnt_cots_do_bindresvport = 1; 509 510 static zone_key_t zone_cots_key; 511 512 /* 513 * Defaults TCP send and receive buffer size for RPC connections. 514 * These values can be tuned by /etc/system. 515 */ 516 int rpc_send_bufsz = 1024*1024; 517 int rpc_recv_bufsz = 1024*1024; 518 /* 519 * To use system-wide default for TCP send and receive buffer size, 520 * use /etc/system to set rpc_default_tcp_bufsz to 1: 521 * 522 * set rpcmod:rpc_default_tcp_bufsz=1 523 */ 524 int rpc_default_tcp_bufsz = 0; 525 526 /* 527 * We need to do this after all kernel threads in the zone have exited. 
528 */ 529 /* ARGSUSED */ 530 static void 531 clnt_zone_destroy(zoneid_t zoneid, void *unused) 532 { 533 struct cm_xprt **cmp; 534 struct cm_xprt *cm_entry; 535 struct cm_xprt *freelist = NULL; 536 537 mutex_enter(&connmgr_lock); 538 cmp = &cm_hd; 539 while ((cm_entry = *cmp) != NULL) { 540 if (cm_entry->x_zoneid == zoneid) { 541 *cmp = cm_entry->x_next; 542 cm_entry->x_next = freelist; 543 freelist = cm_entry; 544 } else { 545 cmp = &cm_entry->x_next; 546 } 547 } 548 mutex_exit(&connmgr_lock); 549 while ((cm_entry = freelist) != NULL) { 550 freelist = cm_entry->x_next; 551 connmgr_close(cm_entry); 552 } 553 } 554 555 int 556 clnt_cots_kcreate(dev_t dev, struct netbuf *addr, int family, rpcprog_t prog, 557 rpcvers_t vers, uint_t max_msgsize, cred_t *cred, CLIENT **ncl) 558 { 559 CLIENT *h; 560 cku_private_t *p; 561 struct rpc_msg call_msg; 562 struct rpcstat *rpcstat; 563 564 RPCLOG(8, "clnt_cots_kcreate: prog %u\n", prog); 565 566 rpcstat = zone_getspecific(rpcstat_zone_key, rpc_zone()); 567 ASSERT(rpcstat != NULL); 568 569 /* Allocate and intialize the client handle. */ 570 p = kmem_zalloc(sizeof (*p), KM_SLEEP); 571 572 h = ptoh(p); 573 574 h->cl_private = (caddr_t)p; 575 h->cl_auth = authkern_create(); 576 h->cl_ops = &tcp_ops; 577 578 cv_init(&p->cku_call.call_cv, NULL, CV_DEFAULT, NULL); 579 mutex_init(&p->cku_call.call_lock, NULL, MUTEX_DEFAULT, NULL); 580 581 /* 582 * If the current sanity check size in rpcmod is smaller 583 * than the size needed, then increase the sanity check. 
584 */ 585 if (max_msgsize != 0 && clnt_max_msg_sizep != NULL && 586 max_msgsize > *clnt_max_msg_sizep) { 587 mutex_enter(&clnt_max_msg_lock); 588 if (max_msgsize > *clnt_max_msg_sizep) 589 *clnt_max_msg_sizep = max_msgsize; 590 mutex_exit(&clnt_max_msg_lock); 591 } 592 593 p->cku_outbuflen = COTS_DEFAULT_ALLOCSIZE; 594 595 /* Preserialize the call message header */ 596 597 call_msg.rm_xid = 0; 598 call_msg.rm_direction = CALL; 599 call_msg.rm_call.cb_rpcvers = RPC_MSG_VERSION; 600 call_msg.rm_call.cb_prog = prog; 601 call_msg.rm_call.cb_vers = vers; 602 603 xdrmem_create(&p->cku_outxdr, p->cku_rpchdr, WIRE_HDR_SIZE, XDR_ENCODE); 604 605 if (!xdr_callhdr(&p->cku_outxdr, &call_msg)) { 606 RPCLOG0(1, "clnt_cots_kcreate - Fatal header serialization " 607 "error\n"); 608 auth_destroy(h->cl_auth); 609 kmem_free(p, sizeof (cku_private_t)); 610 RPCLOG0(1, "clnt_cots_kcreate: create failed error EINVAL\n"); 611 return (EINVAL); /* XXX */ 612 } 613 614 /* 615 * The zalloc initialized the fields below. 616 * p->cku_xid = 0; 617 * p->cku_flags = 0; 618 * p->cku_srcaddr.len = 0; 619 * p->cku_srcaddr.maxlen = 0; 620 */ 621 622 p->cku_cred = cred; 623 p->cku_device = dev; 624 p->cku_addrfmly = family; 625 p->cku_addr.buf = kmem_zalloc(addr->maxlen, KM_SLEEP); 626 p->cku_addr.maxlen = addr->maxlen; 627 p->cku_addr.len = addr->len; 628 bcopy(addr->buf, p->cku_addr.buf, addr->len); 629 p->cku_stats = rpcstat->rpc_cots_client; 630 p->cku_useresvport = -1; /* value is has not been set */ 631 632 *ncl = h; 633 return (0); 634 } 635 636 /*ARGSUSED*/ 637 static void 638 clnt_cots_kabort(CLIENT *h) 639 { 640 } 641 642 /* 643 * Return error info on this handle. 
644 */ 645 static void 646 clnt_cots_kerror(CLIENT *h, struct rpc_err *err) 647 { 648 /* LINTED pointer alignment */ 649 cku_private_t *p = htop(h); 650 651 *err = p->cku_err; 652 } 653 654 static bool_t 655 clnt_cots_kfreeres(CLIENT *h, xdrproc_t xdr_res, caddr_t res_ptr) 656 { 657 /* LINTED pointer alignment */ 658 cku_private_t *p = htop(h); 659 XDR *xdrs; 660 661 xdrs = &(p->cku_outxdr); 662 xdrs->x_op = XDR_FREE; 663 return ((*xdr_res)(xdrs, res_ptr)); 664 } 665 666 static bool_t 667 clnt_cots_kcontrol(CLIENT *h, int cmd, char *arg) 668 { 669 cku_private_t *p = htop(h); 670 671 switch (cmd) { 672 case CLSET_PROGRESS: 673 p->cku_progress = TRUE; 674 return (TRUE); 675 676 case CLSET_XID: 677 if (arg == NULL) 678 return (FALSE); 679 680 p->cku_xid = *((uint32_t *)arg); 681 return (TRUE); 682 683 case CLGET_XID: 684 if (arg == NULL) 685 return (FALSE); 686 687 *((uint32_t *)arg) = p->cku_xid; 688 return (TRUE); 689 690 case CLSET_NODELAYONERR: 691 if (arg == NULL) 692 return (FALSE); 693 694 if (*((bool_t *)arg) == TRUE) { 695 p->cku_nodelayonerr = TRUE; 696 return (TRUE); 697 } 698 if (*((bool_t *)arg) == FALSE) { 699 p->cku_nodelayonerr = FALSE; 700 return (TRUE); 701 } 702 return (FALSE); 703 704 case CLGET_NODELAYONERR: 705 if (arg == NULL) 706 return (FALSE); 707 708 *((bool_t *)arg) = p->cku_nodelayonerr; 709 return (TRUE); 710 711 case CLSET_BINDRESVPORT: 712 if (arg == NULL) 713 return (FALSE); 714 715 if (*(int *)arg != 1 && *(int *)arg != 0) 716 return (FALSE); 717 718 p->cku_useresvport = *(int *)arg; 719 720 return (TRUE); 721 722 case CLGET_BINDRESVPORT: 723 if (arg == NULL) 724 return (FALSE); 725 726 *(int *)arg = p->cku_useresvport; 727 728 return (TRUE); 729 730 default: 731 return (FALSE); 732 } 733 } 734 735 /* 736 * Destroy rpc handle. Frees the space used for output buffer, 737 * private data, and handle structure. 
738 */ 739 static void 740 clnt_cots_kdestroy(CLIENT *h) 741 { 742 /* LINTED pointer alignment */ 743 cku_private_t *p = htop(h); 744 calllist_t *call = &p->cku_call; 745 746 RPCLOG(8, "clnt_cots_kdestroy h: %p\n", (void *)h); 747 RPCLOG(8, "clnt_cots_kdestroy h: xid=0x%x\n", p->cku_xid); 748 749 if (p->cku_flags & CKU_ONQUEUE) { 750 RPCLOG(64, "clnt_cots_kdestroy h: removing call for xid 0x%x " 751 "from dispatch list\n", p->cku_xid); 752 call_table_remove(call); 753 } 754 755 if (call->call_reply) 756 freemsg(call->call_reply); 757 cv_destroy(&call->call_cv); 758 mutex_destroy(&call->call_lock); 759 760 kmem_free(p->cku_srcaddr.buf, p->cku_srcaddr.maxlen); 761 kmem_free(p->cku_addr.buf, p->cku_addr.maxlen); 762 kmem_free(p, sizeof (*p)); 763 } 764 765 static int clnt_cots_pulls; 766 #define RM_HDR_SIZE 4 /* record mark header size */ 767 768 /* 769 * Call remote procedure. 770 */ 771 static enum clnt_stat 772 clnt_cots_kcallit(CLIENT *h, rpcproc_t procnum, xdrproc_t xdr_args, 773 caddr_t argsp, xdrproc_t xdr_results, caddr_t resultsp, struct timeval wait) 774 { 775 /* LINTED pointer alignment */ 776 cku_private_t *p = htop(h); 777 calllist_t *call = &p->cku_call; 778 XDR *xdrs; 779 struct rpc_msg reply_msg; 780 mblk_t *mp; 781 #ifdef RPCDEBUG 782 clock_t time_sent; 783 #endif 784 struct netbuf *retryaddr; 785 struct cm_xprt *cm_entry = NULL; 786 queue_t *wq; 787 int len, waitsecs, max_waitsecs; 788 int mpsize; 789 int refreshes = REFRESHES; 790 int interrupted; 791 int tidu_size; 792 enum clnt_stat status; 793 struct timeval cwait; 794 bool_t delay_first = FALSE; 795 clock_t ticks, now; 796 797 RPCLOG(2, "clnt_cots_kcallit, procnum %u\n", procnum); 798 COTSRCSTAT_INCR(p->cku_stats, rccalls); 799 800 RPCLOG(2, "clnt_cots_kcallit: wait.tv_sec: %ld\n", wait.tv_sec); 801 RPCLOG(2, "clnt_cots_kcallit: wait.tv_usec: %ld\n", wait.tv_usec); 802 /* 803 * Bug ID 1240234: 804 * Look out for zero length timeouts. 
We don't want to 805 * wait zero seconds for a connection to be established. 806 */ 807 if (wait.tv_sec < clnt_cots_min_conntout) { 808 cwait.tv_sec = clnt_cots_min_conntout; 809 cwait.tv_usec = 0; 810 RPCLOG(8, "clnt_cots_kcallit: wait.tv_sec (%ld) too low,", 811 wait.tv_sec); 812 RPCLOG(8, " setting to: %d\n", clnt_cots_min_conntout); 813 } else { 814 cwait = wait; 815 } 816 817 call_again: 818 if (cm_entry) { 819 connmgr_release(cm_entry); 820 cm_entry = NULL; 821 } 822 823 mp = NULL; 824 825 /* 826 * If the call is not a retry, allocate a new xid and cache it 827 * for future retries. 828 * Bug ID 1246045: 829 * Treat call as a retry for purposes of binding the source 830 * port only if we actually attempted to send anything on 831 * the previous call. 832 */ 833 if (p->cku_xid == 0) { 834 p->cku_xid = alloc_xid(); 835 call->call_zoneid = rpc_zoneid(); 836 837 /* 838 * We need to ASSERT here that our xid != 0 because this 839 * determines whether or not our call record gets placed on 840 * the hash table or the linked list. By design, we mandate 841 * that RPC calls over cots must have xid's != 0, so we can 842 * ensure proper management of the hash table. 
843 */ 844 ASSERT(p->cku_xid != 0); 845 846 retryaddr = NULL; 847 p->cku_flags &= ~CKU_SENT; 848 849 if (p->cku_flags & CKU_ONQUEUE) { 850 RPCLOG(8, "clnt_cots_kcallit: new call, dequeuing old" 851 " one (%p)\n", (void *)call); 852 call_table_remove(call); 853 p->cku_flags &= ~CKU_ONQUEUE; 854 RPCLOG(64, "clnt_cots_kcallit: removing call from " 855 "dispatch list because xid was zero (now 0x%x)\n", 856 p->cku_xid); 857 } 858 859 if (call->call_reply != NULL) { 860 freemsg(call->call_reply); 861 call->call_reply = NULL; 862 } 863 } else if (p->cku_srcaddr.buf == NULL || p->cku_srcaddr.len == 0) { 864 retryaddr = NULL; 865 866 } else if (p->cku_flags & CKU_SENT) { 867 retryaddr = &p->cku_srcaddr; 868 869 } else { 870 /* 871 * Bug ID 1246045: Nothing was sent, so set retryaddr to 872 * NULL and let connmgr_get() bind to any source port it 873 * can get. 874 */ 875 retryaddr = NULL; 876 } 877 878 RPCLOG(64, "clnt_cots_kcallit: xid = 0x%x", p->cku_xid); 879 RPCLOG(64, " flags = 0x%x\n", p->cku_flags); 880 881 p->cku_err.re_status = RPC_TIMEDOUT; 882 p->cku_err.re_errno = p->cku_err.re_terrno = 0; 883 884 cm_entry = connmgr_wrapget(retryaddr, &cwait, p); 885 886 if (cm_entry == NULL) { 887 RPCLOG(1, "clnt_cots_kcallit: can't connect status %s\n", 888 clnt_sperrno(p->cku_err.re_status)); 889 890 /* 891 * The reasons why we fail to create a connection are 892 * varied. In most cases we don't want the caller to 893 * immediately retry. This could have one or more 894 * bad effects. This includes flooding the net with 895 * connect requests to ports with no listener; a hard 896 * kernel loop due to all the "reserved" TCP ports being 897 * in use. 898 */ 899 delay_first = TRUE; 900 901 /* 902 * Even if we end up returning EINTR, we still count a 903 * a "can't connect", because the connection manager 904 * might have been committed to waiting for or timing out on 905 * a connection. 
906 */ 907 COTSRCSTAT_INCR(p->cku_stats, rccantconn); 908 switch (p->cku_err.re_status) { 909 case RPC_INTR: 910 p->cku_err.re_errno = EINTR; 911 912 /* 913 * No need to delay because a UNIX signal(2) 914 * interrupted us. The caller likely won't 915 * retry the CLNT_CALL() and even if it does, 916 * we assume the caller knows what it is doing. 917 */ 918 delay_first = FALSE; 919 break; 920 921 case RPC_TIMEDOUT: 922 p->cku_err.re_errno = ETIMEDOUT; 923 924 /* 925 * No need to delay because timed out already 926 * on the connection request and assume that the 927 * transport time out is longer than our minimum 928 * timeout, or least not too much smaller. 929 */ 930 delay_first = FALSE; 931 break; 932 933 case RPC_SYSTEMERROR: 934 case RPC_TLIERROR: 935 /* 936 * We want to delay here because a transient 937 * system error has a better chance of going away 938 * if we delay a bit. If it's not transient, then 939 * we don't want end up in a hard kernel loop 940 * due to retries. 941 */ 942 ASSERT(p->cku_err.re_errno != 0); 943 break; 944 945 946 case RPC_CANTCONNECT: 947 /* 948 * RPC_CANTCONNECT is set on T_ERROR_ACK which 949 * implies some error down in the TCP layer or 950 * below. If cku_nodelayonerror is set then we 951 * assume the caller knows not to try too hard. 952 */ 953 RPCLOG0(8, "clnt_cots_kcallit: connection failed,"); 954 RPCLOG0(8, " re_status=RPC_CANTCONNECT,"); 955 RPCLOG(8, " re_errno=%d,", p->cku_err.re_errno); 956 RPCLOG(8, " cku_nodelayonerr=%d", p->cku_nodelayonerr); 957 if (p->cku_nodelayonerr == TRUE) 958 delay_first = FALSE; 959 960 p->cku_err.re_errno = EIO; 961 962 break; 963 964 case RPC_XPRTFAILED: 965 /* 966 * We want to delay here because we likely 967 * got a refused connection. 
968 */ 969 if (p->cku_err.re_errno == 0) 970 p->cku_err.re_errno = EIO; 971 972 RPCLOG(1, "clnt_cots_kcallit: transport failed: %d\n", 973 p->cku_err.re_errno); 974 975 break; 976 977 default: 978 /* 979 * We delay here because it is better to err 980 * on the side of caution. If we got here then 981 * status could have been RPC_SUCCESS, but we 982 * know that we did not get a connection, so 983 * force the rpc status to RPC_CANTCONNECT. 984 */ 985 p->cku_err.re_status = RPC_CANTCONNECT; 986 p->cku_err.re_errno = EIO; 987 break; 988 } 989 if (delay_first == TRUE) 990 ticks = clnt_cots_min_tout * drv_usectohz(1000000); 991 goto cots_done; 992 } 993 994 /* 995 * If we've never sent any request on this connection (send count 996 * is zero, or the connection has been reset), cache the 997 * the connection's create time and send a request (possibly a retry) 998 */ 999 if ((p->cku_flags & CKU_SENT) == 0 || 1000 p->cku_ctime != cm_entry->x_ctime) { 1001 p->cku_ctime = cm_entry->x_ctime; 1002 1003 } else if ((p->cku_flags & CKU_SENT) && (p->cku_flags & CKU_ONQUEUE) && 1004 (call->call_reply != NULL || 1005 p->cku_recv_attempts < clnt_cots_maxrecv)) { 1006 1007 /* 1008 * If we've sent a request and our call is on the dispatch 1009 * queue and we haven't made too many receive attempts, then 1010 * don't re-send, just receive. 1011 */ 1012 p->cku_recv_attempts++; 1013 goto read_again; 1014 } 1015 1016 /* 1017 * Now we create the RPC request in a STREAMS message. We have to do 1018 * this after the call to connmgr_get so that we have the correct 1019 * TIDU size for the transport. 
1020 */ 1021 tidu_size = cm_entry->x_tidu_size; 1022 len = MSG_OFFSET + MAX(tidu_size, RM_HDR_SIZE + WIRE_HDR_SIZE); 1023 1024 while ((mp = allocb(len, BPRI_MED)) == NULL) { 1025 if (strwaitbuf(len, BPRI_MED)) { 1026 p->cku_err.re_status = RPC_SYSTEMERROR; 1027 p->cku_err.re_errno = ENOSR; 1028 COTSRCSTAT_INCR(p->cku_stats, rcnomem); 1029 goto cots_done; 1030 } 1031 } 1032 xdrs = &p->cku_outxdr; 1033 xdrmblk_init(xdrs, mp, XDR_ENCODE, tidu_size); 1034 mpsize = MBLKSIZE(mp); 1035 ASSERT(mpsize >= len); 1036 ASSERT(mp->b_rptr == mp->b_datap->db_base); 1037 1038 /* 1039 * If the size of mblk is not appreciably larger than what we 1040 * asked, then resize the mblk to exactly len bytes. The reason for 1041 * this: suppose len is 1600 bytes, the tidu is 1460 bytes 1042 * (from TCP over ethernet), and the arguments to the RPC require 1043 * 2800 bytes. Ideally we want the protocol to render two 1044 * ~1400 byte segments over the wire. However if allocb() gives us a 2k 1045 * mblk, and we allocate a second mblk for the remainder, the protocol 1046 * module may generate 3 segments over the wire: 1047 * 1460 bytes for the first, 448 (2048 - 1600) for the second, and 1048 * 892 for the third. If we "waste" 448 bytes in the first mblk, 1049 * the XDR encoding will generate two ~1400 byte mblks, and the 1050 * protocol module is more likely to produce properly sized segments. 1051 */ 1052 if ((mpsize >> 1) <= len) 1053 mp->b_rptr += (mpsize - len); 1054 1055 /* 1056 * Adjust b_rptr to reserve space for the non-data protocol headers 1057 * any downstream modules might like to add, and for the 1058 * record marking header. 1059 */ 1060 mp->b_rptr += (MSG_OFFSET + RM_HDR_SIZE); 1061 1062 if (h->cl_auth->ah_cred.oa_flavor != RPCSEC_GSS) { 1063 /* Copy in the preserialized RPC header information. */ 1064 bcopy(p->cku_rpchdr, mp->b_rptr, WIRE_HDR_SIZE); 1065 1066 /* Use XDR_SETPOS() to set the b_wptr to past the RPC header. 
*/ 1067 XDR_SETPOS(xdrs, (uint_t)(mp->b_rptr - mp->b_datap->db_base + 1068 WIRE_HDR_SIZE)); 1069 1070 ASSERT((mp->b_wptr - mp->b_rptr) == WIRE_HDR_SIZE); 1071 1072 /* Serialize the procedure number and the arguments. */ 1073 if ((!XDR_PUTINT32(xdrs, (int32_t *)&procnum)) || 1074 (!AUTH_MARSHALL(h->cl_auth, xdrs, p->cku_cred)) || 1075 (!(*xdr_args)(xdrs, argsp))) { 1076 p->cku_err.re_status = RPC_CANTENCODEARGS; 1077 p->cku_err.re_errno = EIO; 1078 goto cots_done; 1079 } 1080 1081 (*(uint32_t *)(mp->b_rptr)) = p->cku_xid; 1082 } else { 1083 uint32_t *uproc = (uint32_t *)&p->cku_rpchdr[WIRE_HDR_SIZE]; 1084 IXDR_PUT_U_INT32(uproc, procnum); 1085 1086 (*(uint32_t *)(&p->cku_rpchdr[0])) = p->cku_xid; 1087 1088 /* Use XDR_SETPOS() to set the b_wptr. */ 1089 XDR_SETPOS(xdrs, (uint_t)(mp->b_rptr - mp->b_datap->db_base)); 1090 1091 /* Serialize the procedure number and the arguments. */ 1092 if (!AUTH_WRAP(h->cl_auth, p->cku_rpchdr, WIRE_HDR_SIZE+4, 1093 xdrs, xdr_args, argsp)) { 1094 p->cku_err.re_status = RPC_CANTENCODEARGS; 1095 p->cku_err.re_errno = EIO; 1096 goto cots_done; 1097 } 1098 } 1099 1100 RPCLOG(2, "clnt_cots_kcallit: connected, sending call, tidu_size %d\n", 1101 tidu_size); 1102 1103 wq = cm_entry->x_wq; 1104 waitsecs = 0; 1105 1106 dispatch_again: 1107 status = clnt_dispatch_send(wq, mp, call, p->cku_xid, 1108 (p->cku_flags & CKU_ONQUEUE)); 1109 1110 if ((status == RPC_CANTSEND) && (call->call_reason == ENOBUFS)) { 1111 /* 1112 * QFULL condition, allow some time for queue to drain 1113 * and try again. Give up after waiting for all timeout 1114 * specified for the call, or zone is going away. 1115 */ 1116 max_waitsecs = wait.tv_sec ? 
wait.tv_sec : clnt_cots_min_tout; 1117 if ((waitsecs++ < max_waitsecs) && 1118 !(zone_status_get(curproc->p_zone) >= 1119 ZONE_IS_SHUTTING_DOWN)) { 1120 1121 /* wait 1 sec for queue to drain */ 1122 if (clnt_delay(drv_usectohz(1000000), 1123 h->cl_nosignal) == EINTR) { 1124 p->cku_err.re_errno = EINTR; 1125 p->cku_err.re_status = RPC_INTR; 1126 1127 goto cots_done; 1128 } 1129 1130 /* and try again */ 1131 goto dispatch_again; 1132 } 1133 p->cku_err.re_status = status; 1134 p->cku_err.re_errno = call->call_reason; 1135 DTRACE_PROBE(krpc__e__clntcots__kcallit__cantsend); 1136 1137 goto cots_done; 1138 } 1139 1140 if (waitsecs) { 1141 /* adjust timeout to account for time wait to send */ 1142 wait.tv_sec -= waitsecs; 1143 if (wait.tv_sec < 0) { 1144 /* pick up reply on next retry */ 1145 wait.tv_sec = 0; 1146 } 1147 DTRACE_PROBE2(clnt_cots__sendwait, CLIENT *, h, 1148 int, waitsecs); 1149 } 1150 1151 RPCLOG(64, "clnt_cots_kcallit: sent call for xid 0x%x\n", 1152 (uint_t)p->cku_xid); 1153 p->cku_flags = (CKU_ONQUEUE|CKU_SENT); 1154 p->cku_recv_attempts = 1; 1155 1156 #ifdef RPCDEBUG 1157 time_sent = ddi_get_lbolt(); 1158 #endif 1159 1160 /* 1161 * Wait for a reply or a timeout. If there is no error or timeout, 1162 * (both indicated by call_status), call->call_reply will contain 1163 * the RPC reply message. 1164 */ 1165 read_again: 1166 mutex_enter(&call->call_lock); 1167 interrupted = 0; 1168 if (call->call_status == RPC_TIMEDOUT) { 1169 /* 1170 * Indicate that the lwp is not to be stopped while waiting 1171 * for this network traffic. This is to avoid deadlock while 1172 * debugging a process via /proc and also to avoid recursive 1173 * mutex_enter()s due to NFS page faults while stopping 1174 * (NFS holds locks when it calls here). 
1175 */ 1176 clock_t cv_wait_ret; 1177 clock_t timout; 1178 clock_t oldlbolt; 1179 1180 klwp_t *lwp = ttolwp(curthread); 1181 1182 if (lwp != NULL) 1183 lwp->lwp_nostop++; 1184 1185 oldlbolt = ddi_get_lbolt(); 1186 timout = wait.tv_sec * drv_usectohz(1000000) + 1187 drv_usectohz(wait.tv_usec) + oldlbolt; 1188 /* 1189 * Iterate until the call_status is changed to something 1190 * other that RPC_TIMEDOUT, or if cv_timedwait_sig() returns 1191 * something <=0 zero. The latter means that we timed 1192 * out. 1193 */ 1194 if (h->cl_nosignal) 1195 while ((cv_wait_ret = cv_timedwait(&call->call_cv, 1196 &call->call_lock, timout)) > 0 && 1197 call->call_status == RPC_TIMEDOUT) 1198 ; 1199 else 1200 while ((cv_wait_ret = cv_timedwait_sig( 1201 &call->call_cv, 1202 &call->call_lock, timout)) > 0 && 1203 call->call_status == RPC_TIMEDOUT) 1204 ; 1205 1206 switch (cv_wait_ret) { 1207 case 0: 1208 /* 1209 * If we got out of the above loop with 1210 * cv_timedwait_sig() returning 0, then we were 1211 * interrupted regardless what call_status is. 1212 */ 1213 interrupted = 1; 1214 break; 1215 case -1: 1216 /* cv_timedwait_sig() timed out */ 1217 break; 1218 default: 1219 1220 /* 1221 * We were cv_signaled(). If we didn't 1222 * get a successful call_status and returned 1223 * before time expired, delay up to clnt_cots_min_tout 1224 * seconds so that the caller doesn't immediately 1225 * try to call us again and thus force the 1226 * same condition that got us here (such 1227 * as a RPC_XPRTFAILED due to the server not 1228 * listening on the end-point. 
1229 */ 1230 if (call->call_status != RPC_SUCCESS) { 1231 clock_t curlbolt; 1232 clock_t diff; 1233 1234 curlbolt = ddi_get_lbolt(); 1235 ticks = clnt_cots_min_tout * 1236 drv_usectohz(1000000); 1237 diff = curlbolt - oldlbolt; 1238 if (diff < ticks) { 1239 delay_first = TRUE; 1240 if (diff > 0) 1241 ticks -= diff; 1242 } 1243 } 1244 break; 1245 } 1246 1247 if (lwp != NULL) 1248 lwp->lwp_nostop--; 1249 } 1250 /* 1251 * Get the reply message, if any. This will be freed at the end 1252 * whether or not an error occurred. 1253 */ 1254 mp = call->call_reply; 1255 call->call_reply = NULL; 1256 1257 /* 1258 * call_err is the error info when the call is on dispatch queue. 1259 * cku_err is the error info returned to the caller. 1260 * Sync cku_err with call_err for local message processing. 1261 */ 1262 1263 status = call->call_status; 1264 p->cku_err = call->call_err; 1265 mutex_exit(&call->call_lock); 1266 1267 if (status != RPC_SUCCESS) { 1268 switch (status) { 1269 case RPC_TIMEDOUT: 1270 now = ddi_get_lbolt(); 1271 if (interrupted) { 1272 COTSRCSTAT_INCR(p->cku_stats, rcintrs); 1273 p->cku_err.re_status = RPC_INTR; 1274 p->cku_err.re_errno = EINTR; 1275 RPCLOG(1, "clnt_cots_kcallit: xid 0x%x", 1276 p->cku_xid); 1277 RPCLOG(1, "signal interrupted at %ld", now); 1278 RPCLOG(1, ", was sent at %ld\n", time_sent); 1279 } else { 1280 COTSRCSTAT_INCR(p->cku_stats, rctimeouts); 1281 p->cku_err.re_errno = ETIMEDOUT; 1282 RPCLOG(1, "clnt_cots_kcallit: timed out at %ld", 1283 now); 1284 RPCLOG(1, ", was sent at %ld\n", time_sent); 1285 } 1286 break; 1287 1288 case RPC_XPRTFAILED: 1289 if (p->cku_err.re_errno == 0) 1290 p->cku_err.re_errno = EIO; 1291 1292 RPCLOG(1, "clnt_cots_kcallit: transport failed: %d\n", 1293 p->cku_err.re_errno); 1294 break; 1295 1296 case RPC_SYSTEMERROR: 1297 ASSERT(p->cku_err.re_errno); 1298 RPCLOG(1, "clnt_cots_kcallit: system error: %d\n", 1299 p->cku_err.re_errno); 1300 break; 1301 1302 default: 1303 p->cku_err.re_status = RPC_SYSTEMERROR; 1304 
p->cku_err.re_errno = EIO; 1305 RPCLOG(1, "clnt_cots_kcallit: error: %s\n", 1306 clnt_sperrno(status)); 1307 break; 1308 } 1309 if (p->cku_err.re_status != RPC_TIMEDOUT) { 1310 1311 if (p->cku_flags & CKU_ONQUEUE) { 1312 call_table_remove(call); 1313 p->cku_flags &= ~CKU_ONQUEUE; 1314 } 1315 1316 RPCLOG(64, "clnt_cots_kcallit: non TIMEOUT so xid 0x%x " 1317 "taken off dispatch list\n", p->cku_xid); 1318 if (call->call_reply) { 1319 freemsg(call->call_reply); 1320 call->call_reply = NULL; 1321 } 1322 } else if (wait.tv_sec != 0) { 1323 /* 1324 * We've sent the request over TCP and so we have 1325 * every reason to believe it will get 1326 * delivered. In which case returning a timeout is not 1327 * appropriate. 1328 */ 1329 if (p->cku_progress == TRUE && 1330 p->cku_recv_attempts < clnt_cots_maxrecv) { 1331 p->cku_err.re_status = RPC_INPROGRESS; 1332 } 1333 } 1334 goto cots_done; 1335 } 1336 1337 xdrs = &p->cku_inxdr; 1338 xdrmblk_init(xdrs, mp, XDR_DECODE, 0); 1339 1340 reply_msg.rm_direction = REPLY; 1341 reply_msg.rm_reply.rp_stat = MSG_ACCEPTED; 1342 reply_msg.acpted_rply.ar_stat = SUCCESS; 1343 1344 reply_msg.acpted_rply.ar_verf = _null_auth; 1345 /* 1346 * xdr_results will be done in AUTH_UNWRAP. 1347 */ 1348 reply_msg.acpted_rply.ar_results.where = NULL; 1349 reply_msg.acpted_rply.ar_results.proc = xdr_void; 1350 1351 if (xdr_replymsg(xdrs, &reply_msg)) { 1352 enum clnt_stat re_status; 1353 1354 _seterr_reply(&reply_msg, &p->cku_err); 1355 1356 re_status = p->cku_err.re_status; 1357 if (re_status == RPC_SUCCESS) { 1358 /* 1359 * Reply is good, check auth. 
1360 */ 1361 if (!AUTH_VALIDATE(h->cl_auth, 1362 &reply_msg.acpted_rply.ar_verf)) { 1363 COTSRCSTAT_INCR(p->cku_stats, rcbadverfs); 1364 RPCLOG0(1, "clnt_cots_kcallit: validation " 1365 "failure\n"); 1366 freemsg(mp); 1367 (void) xdr_rpc_free_verifier(xdrs, &reply_msg); 1368 mutex_enter(&call->call_lock); 1369 if (call->call_reply == NULL) 1370 call->call_status = RPC_TIMEDOUT; 1371 mutex_exit(&call->call_lock); 1372 goto read_again; 1373 } else if (!AUTH_UNWRAP(h->cl_auth, xdrs, 1374 xdr_results, resultsp)) { 1375 RPCLOG0(1, "clnt_cots_kcallit: validation " 1376 "failure (unwrap)\n"); 1377 p->cku_err.re_status = RPC_CANTDECODERES; 1378 p->cku_err.re_errno = EIO; 1379 } 1380 } else { 1381 /* set errno in case we can't recover */ 1382 if (re_status != RPC_VERSMISMATCH && 1383 re_status != RPC_AUTHERROR && 1384 re_status != RPC_PROGVERSMISMATCH) 1385 p->cku_err.re_errno = EIO; 1386 1387 if (re_status == RPC_AUTHERROR) { 1388 /* 1389 * Maybe our credential need to be refreshed 1390 */ 1391 if (cm_entry) { 1392 /* 1393 * There is the potential that the 1394 * cm_entry has/will be marked dead, 1395 * so drop the connection altogether, 1396 * force REFRESH to establish new 1397 * connection. 
1398 */ 1399 connmgr_cancelconn(cm_entry); 1400 cm_entry = NULL; 1401 } 1402 1403 (void) xdr_rpc_free_verifier(xdrs, 1404 &reply_msg); 1405 1406 if (p->cku_flags & CKU_ONQUEUE) { 1407 call_table_remove(call); 1408 p->cku_flags &= ~CKU_ONQUEUE; 1409 } 1410 RPCLOG(64, 1411 "clnt_cots_kcallit: AUTH_ERROR, xid" 1412 " 0x%x removed off dispatch list\n", 1413 p->cku_xid); 1414 if (call->call_reply) { 1415 freemsg(call->call_reply); 1416 call->call_reply = NULL; 1417 } 1418 1419 if ((refreshes > 0) && 1420 AUTH_REFRESH(h->cl_auth, &reply_msg, 1421 p->cku_cred)) { 1422 refreshes--; 1423 freemsg(mp); 1424 mp = NULL; 1425 1426 COTSRCSTAT_INCR(p->cku_stats, 1427 rcbadcalls); 1428 COTSRCSTAT_INCR(p->cku_stats, 1429 rcnewcreds); 1430 goto call_again; 1431 } 1432 1433 /* 1434 * We have used the client handle to 1435 * do an AUTH_REFRESH and the RPC status may 1436 * be set to RPC_SUCCESS; Let's make sure to 1437 * set it to RPC_AUTHERROR. 1438 */ 1439 p->cku_err.re_status = RPC_AUTHERROR; 1440 1441 /* 1442 * Map recoverable and unrecoverable 1443 * authentication errors to appropriate errno 1444 */ 1445 switch (p->cku_err.re_why) { 1446 case AUTH_TOOWEAK: 1447 /* 1448 * This could be a failure where the 1449 * server requires use of a reserved 1450 * port, check and optionally set the 1451 * client handle useresvport trying 1452 * one more time. Next go round we 1453 * fall out with the tooweak error. 
1454 */ 1455 if (p->cku_useresvport != 1) { 1456 p->cku_useresvport = 1; 1457 p->cku_xid = 0; 1458 freemsg(mp); 1459 mp = NULL; 1460 goto call_again; 1461 } 1462 /* FALLTHRU */ 1463 case AUTH_BADCRED: 1464 case AUTH_BADVERF: 1465 case AUTH_INVALIDRESP: 1466 case AUTH_FAILED: 1467 case RPCSEC_GSS_NOCRED: 1468 case RPCSEC_GSS_FAILED: 1469 p->cku_err.re_errno = EACCES; 1470 break; 1471 case AUTH_REJECTEDCRED: 1472 case AUTH_REJECTEDVERF: 1473 default: p->cku_err.re_errno = EIO; 1474 break; 1475 } 1476 RPCLOG(1, "clnt_cots_kcallit : authentication" 1477 " failed with RPC_AUTHERROR of type %d\n", 1478 (int)p->cku_err.re_why); 1479 goto cots_done; 1480 } 1481 } 1482 } else { 1483 /* reply didn't decode properly. */ 1484 p->cku_err.re_status = RPC_CANTDECODERES; 1485 p->cku_err.re_errno = EIO; 1486 RPCLOG0(1, "clnt_cots_kcallit: decode failure\n"); 1487 } 1488 1489 (void) xdr_rpc_free_verifier(xdrs, &reply_msg); 1490 1491 if (p->cku_flags & CKU_ONQUEUE) { 1492 call_table_remove(call); 1493 p->cku_flags &= ~CKU_ONQUEUE; 1494 } 1495 1496 RPCLOG(64, "clnt_cots_kcallit: xid 0x%x taken off dispatch list", 1497 p->cku_xid); 1498 RPCLOG(64, " status is %s\n", clnt_sperrno(p->cku_err.re_status)); 1499 cots_done: 1500 if (cm_entry) 1501 connmgr_release(cm_entry); 1502 1503 if (mp != NULL) 1504 freemsg(mp); 1505 if ((p->cku_flags & CKU_ONQUEUE) == 0 && call->call_reply) { 1506 freemsg(call->call_reply); 1507 call->call_reply = NULL; 1508 } 1509 if (p->cku_err.re_status != RPC_SUCCESS) { 1510 RPCLOG0(1, "clnt_cots_kcallit: tail-end failure\n"); 1511 COTSRCSTAT_INCR(p->cku_stats, rcbadcalls); 1512 } 1513 1514 /* 1515 * No point in delaying if the zone is going away. 
1516 */ 1517 if (delay_first == TRUE && 1518 !(zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN)) { 1519 if (clnt_delay(ticks, h->cl_nosignal) == EINTR) { 1520 p->cku_err.re_errno = EINTR; 1521 p->cku_err.re_status = RPC_INTR; 1522 } 1523 } 1524 return (p->cku_err.re_status); 1525 } 1526 1527 /* 1528 * Kinit routine for cots. This sets up the correct operations in 1529 * the client handle, as the handle may have previously been a clts 1530 * handle, and clears the xid field so there is no way a new call 1531 * could be mistaken for a retry. It also sets in the handle the 1532 * information that is passed at create/kinit time but needed at 1533 * call time, as cots creates the transport at call time - device, 1534 * address of the server, protocol family. 1535 */ 1536 void 1537 clnt_cots_kinit(CLIENT *h, dev_t dev, int family, struct netbuf *addr, 1538 int max_msgsize, cred_t *cred) 1539 { 1540 /* LINTED pointer alignment */ 1541 cku_private_t *p = htop(h); 1542 calllist_t *call = &p->cku_call; 1543 1544 h->cl_ops = &tcp_ops; 1545 if (p->cku_flags & CKU_ONQUEUE) { 1546 call_table_remove(call); 1547 p->cku_flags &= ~CKU_ONQUEUE; 1548 RPCLOG(64, "clnt_cots_kinit: removing call for xid 0x%x from" 1549 " dispatch list\n", p->cku_xid); 1550 } 1551 1552 if (call->call_reply != NULL) { 1553 freemsg(call->call_reply); 1554 call->call_reply = NULL; 1555 } 1556 1557 call->call_bucket = NULL; 1558 call->call_hash = 0; 1559 1560 /* 1561 * We don't clear cku_flags here, because clnt_cots_kcallit() 1562 * takes care of handling the cku_flags reset. 
1563 */ 1564 p->cku_xid = 0; 1565 p->cku_device = dev; 1566 p->cku_addrfmly = family; 1567 p->cku_cred = cred; 1568 1569 if (p->cku_addr.maxlen < addr->len) { 1570 if (p->cku_addr.maxlen != 0 && p->cku_addr.buf != NULL) 1571 kmem_free(p->cku_addr.buf, p->cku_addr.maxlen); 1572 p->cku_addr.buf = kmem_zalloc(addr->maxlen, KM_SLEEP); 1573 p->cku_addr.maxlen = addr->maxlen; 1574 } 1575 1576 p->cku_addr.len = addr->len; 1577 bcopy(addr->buf, p->cku_addr.buf, addr->len); 1578 1579 /* 1580 * If the current sanity check size in rpcmod is smaller 1581 * than the size needed, then increase the sanity check. 1582 */ 1583 if (max_msgsize != 0 && clnt_max_msg_sizep != NULL && 1584 max_msgsize > *clnt_max_msg_sizep) { 1585 mutex_enter(&clnt_max_msg_lock); 1586 if (max_msgsize > *clnt_max_msg_sizep) 1587 *clnt_max_msg_sizep = max_msgsize; 1588 mutex_exit(&clnt_max_msg_lock); 1589 } 1590 } 1591 1592 /* 1593 * ksettimers is a no-op for cots, with the exception of setting the xid. 1594 */ 1595 /* ARGSUSED */ 1596 static int 1597 clnt_cots_ksettimers(CLIENT *h, struct rpc_timers *t, struct rpc_timers *all, 1598 int minimum, void (*feedback)(int, int, caddr_t), caddr_t arg, 1599 uint32_t xid) 1600 { 1601 /* LINTED pointer alignment */ 1602 cku_private_t *p = htop(h); 1603 1604 if (xid) 1605 p->cku_xid = xid; 1606 COTSRCSTAT_INCR(p->cku_stats, rctimers); 1607 return (0); 1608 } 1609 1610 extern void rpc_poptimod(struct vnode *); 1611 extern int kstr_push(struct vnode *, char *); 1612 1613 int 1614 conn_kstat_update(kstat_t *ksp, int rw) 1615 { 1616 struct cm_xprt *cm_entry; 1617 struct cm_kstat_xprt *cm_ksp_data; 1618 uchar_t *b; 1619 char *fbuf; 1620 1621 if (rw == KSTAT_WRITE) 1622 return (EACCES); 1623 if (ksp == NULL || ksp->ks_private == NULL) 1624 return (EIO); 1625 cm_entry = (struct cm_xprt *)ksp->ks_private; 1626 cm_ksp_data = (struct cm_kstat_xprt *)ksp->ks_data; 1627 1628 cm_ksp_data->x_wq.value.ui32 = (uint32_t)(uintptr_t)cm_entry->x_wq; 1629 
cm_ksp_data->x_family.value.ui32 = cm_entry->x_family; 1630 cm_ksp_data->x_rdev.value.ui32 = (uint32_t)cm_entry->x_rdev; 1631 cm_ksp_data->x_time.value.ui32 = cm_entry->x_time; 1632 cm_ksp_data->x_ref.value.ui32 = cm_entry->x_ref; 1633 cm_ksp_data->x_state.value.ui32 = cm_entry->x_state_flags; 1634 1635 if (cm_entry->x_server.buf) { 1636 fbuf = cm_ksp_data->x_server.value.str.addr.ptr; 1637 if (cm_entry->x_family == AF_INET && 1638 cm_entry->x_server.len == 1639 sizeof (struct sockaddr_in)) { 1640 struct sockaddr_in *sa; 1641 sa = (struct sockaddr_in *) 1642 cm_entry->x_server.buf; 1643 b = (uchar_t *)&sa->sin_addr; 1644 (void) sprintf(fbuf, 1645 "%03d.%03d.%03d.%03d", b[0] & 0xFF, b[1] & 0xFF, 1646 b[2] & 0xFF, b[3] & 0xFF); 1647 cm_ksp_data->x_port.value.ui32 = 1648 (uint32_t)sa->sin_port; 1649 } else if (cm_entry->x_family == AF_INET6 && 1650 cm_entry->x_server.len >= 1651 sizeof (struct sockaddr_in6)) { 1652 /* extract server IP address & port */ 1653 struct sockaddr_in6 *sin6; 1654 sin6 = (struct sockaddr_in6 *)cm_entry->x_server.buf; 1655 (void) kinet_ntop6((uchar_t *)&sin6->sin6_addr, fbuf, 1656 INET6_ADDRSTRLEN); 1657 cm_ksp_data->x_port.value.ui32 = sin6->sin6_port; 1658 } else { 1659 struct sockaddr_in *sa; 1660 1661 sa = (struct sockaddr_in *)cm_entry->x_server.buf; 1662 b = (uchar_t *)&sa->sin_addr; 1663 (void) sprintf(fbuf, 1664 "%03d.%03d.%03d.%03d", b[0] & 0xFF, b[1] & 0xFF, 1665 b[2] & 0xFF, b[3] & 0xFF); 1666 } 1667 KSTAT_NAMED_STR_BUFLEN(&cm_ksp_data->x_server) = 1668 strlen(fbuf) + 1; 1669 } 1670 1671 return (0); 1672 } 1673 1674 1675 /* 1676 * We want a version of delay which is interruptible by a UNIX signal 1677 * Return EINTR if an interrupt occured. 
1678 */ 1679 static int 1680 clnt_delay(clock_t ticks, bool_t nosignal) 1681 { 1682 if (nosignal == TRUE) { 1683 delay(ticks); 1684 return (0); 1685 } 1686 return (delay_sig(ticks)); 1687 } 1688 1689 /* 1690 * Wait for a connection until a timeout, or until we are 1691 * signalled that there has been a connection state change. 1692 */ 1693 static enum clnt_stat 1694 connmgr_cwait(struct cm_xprt *cm_entry, const struct timeval *waitp, 1695 bool_t nosignal) 1696 { 1697 bool_t interrupted; 1698 clock_t timout, cv_stat; 1699 enum clnt_stat clstat; 1700 unsigned int old_state; 1701 1702 ASSERT(MUTEX_HELD(&connmgr_lock)); 1703 /* 1704 * We wait for the transport connection to be made, or an 1705 * indication that it could not be made. 1706 */ 1707 clstat = RPC_TIMEDOUT; 1708 interrupted = FALSE; 1709 1710 old_state = cm_entry->x_state_flags; 1711 /* 1712 * Now loop until cv_timedwait{_sig} returns because of 1713 * a signal(0) or timeout(-1) or cv_signal(>0). But it may be 1714 * cv_signalled for various other reasons too. So loop 1715 * until there is a state change on the connection. 1716 */ 1717 1718 timout = waitp->tv_sec * drv_usectohz(1000000) + 1719 drv_usectohz(waitp->tv_usec) + ddi_get_lbolt(); 1720 1721 if (nosignal) { 1722 while ((cv_stat = cv_timedwait(&cm_entry->x_conn_cv, 1723 &connmgr_lock, timout)) > 0 && 1724 cm_entry->x_state_flags == old_state) 1725 ; 1726 } else { 1727 while ((cv_stat = cv_timedwait_sig(&cm_entry->x_conn_cv, 1728 &connmgr_lock, timout)) > 0 && 1729 cm_entry->x_state_flags == old_state) 1730 ; 1731 1732 if (cv_stat == 0) /* got intr signal? 
*/ 1733 interrupted = TRUE; 1734 } 1735 1736 if ((cm_entry->x_state_flags & (X_BADSTATES|X_CONNECTED)) == 1737 X_CONNECTED) { 1738 clstat = RPC_SUCCESS; 1739 } else { 1740 if (interrupted == TRUE) 1741 clstat = RPC_INTR; 1742 RPCLOG(1, "connmgr_cwait: can't connect, error: %s\n", 1743 clnt_sperrno(clstat)); 1744 } 1745 1746 return (clstat); 1747 } 1748 1749 /* 1750 * Primary interface for how RPC grabs a connection. 1751 */ 1752 static struct cm_xprt * 1753 connmgr_wrapget( 1754 struct netbuf *retryaddr, 1755 const struct timeval *waitp, 1756 cku_private_t *p) 1757 { 1758 struct cm_xprt *cm_entry; 1759 1760 cm_entry = connmgr_get(retryaddr, waitp, &p->cku_addr, p->cku_addrfmly, 1761 &p->cku_srcaddr, &p->cku_err, p->cku_device, 1762 p->cku_client.cl_nosignal, p->cku_useresvport, p->cku_cred); 1763 1764 if (cm_entry == NULL) { 1765 /* 1766 * Re-map the call status to RPC_INTR if the err code is 1767 * EINTR. This can happen if calls status is RPC_TLIERROR. 1768 * However, don't re-map if signalling has been turned off. 1769 * XXX Really need to create a separate thread whenever 1770 * there isn't an existing connection. 1771 */ 1772 if (p->cku_err.re_errno == EINTR) { 1773 if (p->cku_client.cl_nosignal == TRUE) 1774 p->cku_err.re_errno = EIO; 1775 else 1776 p->cku_err.re_status = RPC_INTR; 1777 } 1778 } 1779 1780 return (cm_entry); 1781 } 1782 1783 /* 1784 * Obtains a transport to the server specified in addr. If a suitable transport 1785 * does not already exist in the list of cached transports, a new connection 1786 * is created, connected, and added to the list. The connection is for sending 1787 * only - the reply message may come back on another transport connection. 1788 * 1789 * To implement round-robin load balancing with multiple client connections, 1790 * the last entry on the list is always selected. Once the entry is selected 1791 * it's re-inserted to the head of the list. 
1792 */ 1793 static struct cm_xprt * 1794 connmgr_get( 1795 struct netbuf *retryaddr, 1796 const struct timeval *waitp, /* changed to a ptr to converse stack */ 1797 struct netbuf *destaddr, 1798 int addrfmly, 1799 struct netbuf *srcaddr, 1800 struct rpc_err *rpcerr, 1801 dev_t device, 1802 bool_t nosignal, 1803 int useresvport, 1804 cred_t *cr) 1805 { 1806 struct cm_xprt *cm_entry; 1807 struct cm_xprt *lru_entry; 1808 struct cm_xprt **cmp, **prev; 1809 queue_t *wq; 1810 TIUSER *tiptr; 1811 int i; 1812 int retval; 1813 int tidu_size; 1814 bool_t connected; 1815 zoneid_t zoneid = rpc_zoneid(); 1816 1817 /* 1818 * If the call is not a retry, look for a transport entry that 1819 * goes to the server of interest. 1820 */ 1821 mutex_enter(&connmgr_lock); 1822 1823 if (retryaddr == NULL) { 1824 use_new_conn: 1825 i = 0; 1826 cm_entry = lru_entry = NULL; 1827 1828 prev = cmp = &cm_hd; 1829 while ((cm_entry = *cmp) != NULL) { 1830 ASSERT(cm_entry != cm_entry->x_next); 1831 /* 1832 * Garbage collect conections that are marked 1833 * for needs disconnect. 1834 */ 1835 if (cm_entry->x_needdis) { 1836 CONN_HOLD(cm_entry); 1837 connmgr_dis_and_wait(cm_entry); 1838 connmgr_release(cm_entry); 1839 /* 1840 * connmgr_lock could have been 1841 * dropped for the disconnect 1842 * processing so start over. 1843 */ 1844 goto use_new_conn; 1845 } 1846 1847 /* 1848 * Garbage collect the dead connections that have 1849 * no threads working on them. 1850 */ 1851 if ((cm_entry->x_state_flags & (X_DEAD|X_THREAD)) == 1852 X_DEAD) { 1853 mutex_enter(&cm_entry->x_lock); 1854 if (cm_entry->x_ref != 0) { 1855 /* 1856 * Currently in use. 1857 * Cleanup later. 
1858 */ 1859 cmp = &cm_entry->x_next; 1860 mutex_exit(&cm_entry->x_lock); 1861 continue; 1862 } 1863 mutex_exit(&cm_entry->x_lock); 1864 *cmp = cm_entry->x_next; 1865 mutex_exit(&connmgr_lock); 1866 connmgr_close(cm_entry); 1867 mutex_enter(&connmgr_lock); 1868 goto use_new_conn; 1869 } 1870 1871 1872 if ((cm_entry->x_state_flags & X_BADSTATES) == 0 && 1873 cm_entry->x_zoneid == zoneid && 1874 cm_entry->x_rdev == device && 1875 destaddr->len == cm_entry->x_server.len && 1876 bcmp(destaddr->buf, cm_entry->x_server.buf, 1877 destaddr->len) == 0) { 1878 /* 1879 * If the matching entry isn't connected, 1880 * attempt to reconnect it. 1881 */ 1882 if (cm_entry->x_connected == FALSE) { 1883 /* 1884 * We don't go through trying 1885 * to find the least recently 1886 * used connected because 1887 * connmgr_reconnect() briefly 1888 * dropped the connmgr_lock, 1889 * allowing a window for our 1890 * accounting to be messed up. 1891 * In any case, a re-connected 1892 * connection is as good as 1893 * a LRU connection. 1894 */ 1895 return (connmgr_wrapconnect(cm_entry, 1896 waitp, destaddr, addrfmly, srcaddr, 1897 rpcerr, TRUE, nosignal, cr)); 1898 } 1899 i++; 1900 1901 /* keep track of the last entry */ 1902 lru_entry = cm_entry; 1903 prev = cmp; 1904 } 1905 cmp = &cm_entry->x_next; 1906 } 1907 1908 if (i > clnt_max_conns) { 1909 RPCLOG(8, "connmgr_get: too many conns, dooming entry" 1910 " %p\n", (void *)lru_entry->x_tiptr); 1911 lru_entry->x_doomed = TRUE; 1912 goto use_new_conn; 1913 } 1914 1915 /* 1916 * If we are at the maximum number of connections to 1917 * the server, hand back the least recently used one. 1918 */ 1919 if (i == clnt_max_conns) { 1920 /* 1921 * Copy into the handle the source address of 1922 * the connection, which we will use in case of 1923 * a later retry. 
1924 */ 1925 if (srcaddr->len != lru_entry->x_src.len) { 1926 if (srcaddr->len > 0) 1927 kmem_free(srcaddr->buf, 1928 srcaddr->maxlen); 1929 srcaddr->buf = kmem_zalloc( 1930 lru_entry->x_src.len, KM_SLEEP); 1931 srcaddr->maxlen = srcaddr->len = 1932 lru_entry->x_src.len; 1933 } 1934 bcopy(lru_entry->x_src.buf, srcaddr->buf, srcaddr->len); 1935 RPCLOG(2, "connmgr_get: call going out on %p\n", 1936 (void *)lru_entry); 1937 lru_entry->x_time = ddi_get_lbolt(); 1938 CONN_HOLD(lru_entry); 1939 1940 if ((i > 1) && (prev != &cm_hd)) { 1941 /* 1942 * remove and re-insert entry at head of list. 1943 */ 1944 *prev = lru_entry->x_next; 1945 lru_entry->x_next = cm_hd; 1946 cm_hd = lru_entry; 1947 } 1948 1949 mutex_exit(&connmgr_lock); 1950 return (lru_entry); 1951 } 1952 1953 } else { 1954 /* 1955 * This is the retry case (retryaddr != NULL). Retries must 1956 * be sent on the same source port as the original call. 1957 */ 1958 1959 /* 1960 * Walk the list looking for a connection with a source address 1961 * that matches the retry address. 1962 */ 1963 start_retry_loop: 1964 cmp = &cm_hd; 1965 while ((cm_entry = *cmp) != NULL) { 1966 ASSERT(cm_entry != cm_entry->x_next); 1967 1968 /* 1969 * determine if this connection matches the passed 1970 * in retry address. If it does not match, advance 1971 * to the next element on the list. 1972 */ 1973 if (zoneid != cm_entry->x_zoneid || 1974 device != cm_entry->x_rdev || 1975 retryaddr->len != cm_entry->x_src.len || 1976 bcmp(retryaddr->buf, cm_entry->x_src.buf, 1977 retryaddr->len) != 0) { 1978 cmp = &cm_entry->x_next; 1979 continue; 1980 } 1981 /* 1982 * Garbage collect conections that are marked 1983 * for needs disconnect. 1984 */ 1985 if (cm_entry->x_needdis) { 1986 CONN_HOLD(cm_entry); 1987 connmgr_dis_and_wait(cm_entry); 1988 connmgr_release(cm_entry); 1989 /* 1990 * connmgr_lock could have been 1991 * dropped for the disconnect 1992 * processing so start over. 
1993 */ 1994 goto start_retry_loop; 1995 } 1996 /* 1997 * Garbage collect the dead connections that have 1998 * no threads working on them. 1999 */ 2000 if ((cm_entry->x_state_flags & (X_DEAD|X_THREAD)) == 2001 X_DEAD) { 2002 mutex_enter(&cm_entry->x_lock); 2003 if (cm_entry->x_ref != 0) { 2004 /* 2005 * Currently in use. 2006 * Cleanup later. 2007 */ 2008 cmp = &cm_entry->x_next; 2009 mutex_exit(&cm_entry->x_lock); 2010 continue; 2011 } 2012 mutex_exit(&cm_entry->x_lock); 2013 *cmp = cm_entry->x_next; 2014 mutex_exit(&connmgr_lock); 2015 connmgr_close(cm_entry); 2016 mutex_enter(&connmgr_lock); 2017 goto start_retry_loop; 2018 } 2019 2020 /* 2021 * Sanity check: if the connection with our source 2022 * port is going to some other server, something went 2023 * wrong, as we never delete connections (i.e. release 2024 * ports) unless they have been idle. In this case, 2025 * it is probably better to send the call out using 2026 * a new source address than to fail it altogether, 2027 * since that port may never be released. 2028 */ 2029 if (destaddr->len != cm_entry->x_server.len || 2030 bcmp(destaddr->buf, cm_entry->x_server.buf, 2031 destaddr->len) != 0) { 2032 RPCLOG(1, "connmgr_get: tiptr %p" 2033 " is going to a different server" 2034 " with the port that belongs" 2035 " to us!\n", (void *)cm_entry->x_tiptr); 2036 retryaddr = NULL; 2037 goto use_new_conn; 2038 } 2039 2040 /* 2041 * If the connection of interest is not connected and we 2042 * can't reconnect it, then the server is probably 2043 * still down. Return NULL to the caller and let it 2044 * retry later if it wants to. We have a delay so the 2045 * machine doesn't go into a tight retry loop. If the 2046 * entry was already connected, or the reconnected was 2047 * successful, return this entry. 
2048 */ 2049 if (cm_entry->x_connected == FALSE) { 2050 return (connmgr_wrapconnect(cm_entry, 2051 waitp, destaddr, addrfmly, NULL, 2052 rpcerr, TRUE, nosignal, cr)); 2053 } else { 2054 CONN_HOLD(cm_entry); 2055 2056 cm_entry->x_time = ddi_get_lbolt(); 2057 mutex_exit(&connmgr_lock); 2058 RPCLOG(2, "connmgr_get: found old " 2059 "transport %p for retry\n", 2060 (void *)cm_entry); 2061 return (cm_entry); 2062 } 2063 } 2064 2065 /* 2066 * We cannot find an entry in the list for this retry. 2067 * Either the entry has been removed temporarily to be 2068 * reconnected by another thread, or the original call 2069 * got a port but never got connected, 2070 * and hence the transport never got put in the 2071 * list. Fall through to the "create new connection" code - 2072 * the former case will fail there trying to rebind the port, 2073 * and the later case (and any other pathological cases) will 2074 * rebind and reconnect and not hang the client machine. 2075 */ 2076 RPCLOG0(8, "connmgr_get: no entry in list for retry\n"); 2077 } 2078 /* 2079 * Set up a transport entry in the connection manager's list. 2080 */ 2081 cm_entry = (struct cm_xprt *) 2082 kmem_zalloc(sizeof (struct cm_xprt), KM_SLEEP); 2083 2084 cm_entry->x_server.buf = kmem_zalloc(destaddr->len, KM_SLEEP); 2085 bcopy(destaddr->buf, cm_entry->x_server.buf, destaddr->len); 2086 cm_entry->x_server.len = cm_entry->x_server.maxlen = destaddr->len; 2087 2088 cm_entry->x_state_flags = X_THREAD; 2089 cm_entry->x_ref = 1; 2090 cm_entry->x_family = addrfmly; 2091 cm_entry->x_rdev = device; 2092 cm_entry->x_zoneid = zoneid; 2093 mutex_init(&cm_entry->x_lock, NULL, MUTEX_DEFAULT, NULL); 2094 cv_init(&cm_entry->x_cv, NULL, CV_DEFAULT, NULL); 2095 cv_init(&cm_entry->x_conn_cv, NULL, CV_DEFAULT, NULL); 2096 cv_init(&cm_entry->x_dis_cv, NULL, CV_DEFAULT, NULL); 2097 2098 /* 2099 * Note that we add this partially initialized entry to the 2100 * connection list. 
This is so that we don't have connections to 2101 * the same server. 2102 * 2103 * Note that x_src is not initialized at this point. This is because 2104 * retryaddr might be NULL in which case x_src is whatever 2105 * t_kbind/bindresvport gives us. If another thread wants a 2106 * connection to the same server, seemingly we have an issue, but we 2107 * don't. If the other thread comes in with retryaddr == NULL, then it 2108 * will never look at x_src, and it will end up waiting in 2109 * connmgr_cwait() for the first thread to finish the connection 2110 * attempt. If the other thread comes in with retryaddr != NULL, then 2111 * that means there was a request sent on a connection, in which case 2112 * the the connection should already exist. Thus the first thread 2113 * never gets here ... it finds the connection it its server in the 2114 * connection list. 2115 * 2116 * But even if theory is wrong, in the retryaddr != NULL case, the 2nd 2117 * thread will skip us because x_src.len == 0. 2118 */ 2119 cm_entry->x_next = cm_hd; 2120 cm_hd = cm_entry; 2121 mutex_exit(&connmgr_lock); 2122 2123 /* 2124 * Either we didn't find an entry to the server of interest, or we 2125 * don't have the maximum number of connections to that server - 2126 * create a new connection. 
2127 */ 2128 RPCLOG0(8, "connmgr_get: creating new connection\n"); 2129 rpcerr->re_status = RPC_TLIERROR; 2130 2131 i = t_kopen(NULL, device, FREAD|FWRITE|FNDELAY, &tiptr, zone_kcred()); 2132 if (i) { 2133 RPCLOG(1, "connmgr_get: can't open cots device, error %d\n", i); 2134 rpcerr->re_errno = i; 2135 connmgr_cancelconn(cm_entry); 2136 return (NULL); 2137 } 2138 rpc_poptimod(tiptr->fp->f_vnode); 2139 2140 if (i = strioctl(tiptr->fp->f_vnode, I_PUSH, (intptr_t)"rpcmod", 0, 2141 K_TO_K, kcred, &retval)) { 2142 RPCLOG(1, "connmgr_get: can't push cots module, %d\n", i); 2143 (void) t_kclose(tiptr, 1); 2144 rpcerr->re_errno = i; 2145 connmgr_cancelconn(cm_entry); 2146 return (NULL); 2147 } 2148 2149 if (i = strioctl(tiptr->fp->f_vnode, RPC_CLIENT, 0, 0, K_TO_K, 2150 kcred, &retval)) { 2151 RPCLOG(1, "connmgr_get: can't set client status with cots " 2152 "module, %d\n", i); 2153 (void) t_kclose(tiptr, 1); 2154 rpcerr->re_errno = i; 2155 connmgr_cancelconn(cm_entry); 2156 return (NULL); 2157 } 2158 2159 mutex_enter(&connmgr_lock); 2160 2161 wq = tiptr->fp->f_vnode->v_stream->sd_wrq->q_next; 2162 cm_entry->x_wq = wq; 2163 2164 mutex_exit(&connmgr_lock); 2165 2166 if (i = strioctl(tiptr->fp->f_vnode, I_PUSH, (intptr_t)"timod", 0, 2167 K_TO_K, kcred, &retval)) { 2168 RPCLOG(1, "connmgr_get: can't push timod, %d\n", i); 2169 (void) t_kclose(tiptr, 1); 2170 rpcerr->re_errno = i; 2171 connmgr_cancelconn(cm_entry); 2172 return (NULL); 2173 } 2174 2175 /* 2176 * If the caller has not specified reserved port usage then 2177 * take the system default. 
2178 */ 2179 if (useresvport == -1) 2180 useresvport = clnt_cots_do_bindresvport; 2181 2182 if ((useresvport || retryaddr != NULL) && 2183 (addrfmly == AF_INET || addrfmly == AF_INET6)) { 2184 bool_t alloc_src = FALSE; 2185 2186 if (srcaddr->len != destaddr->len) { 2187 kmem_free(srcaddr->buf, srcaddr->maxlen); 2188 srcaddr->buf = kmem_zalloc(destaddr->len, KM_SLEEP); 2189 srcaddr->maxlen = destaddr->len; 2190 srcaddr->len = destaddr->len; 2191 alloc_src = TRUE; 2192 } 2193 2194 if ((i = bindresvport(tiptr, retryaddr, srcaddr, TRUE)) != 0) { 2195 (void) t_kclose(tiptr, 1); 2196 RPCLOG(1, "connmgr_get: couldn't bind, retryaddr: " 2197 "%p\n", (void *)retryaddr); 2198 2199 /* 2200 * 1225408: If we allocated a source address, then it 2201 * is either garbage or all zeroes. In that case 2202 * we need to clear srcaddr. 2203 */ 2204 if (alloc_src == TRUE) { 2205 kmem_free(srcaddr->buf, srcaddr->maxlen); 2206 srcaddr->maxlen = srcaddr->len = 0; 2207 srcaddr->buf = NULL; 2208 } 2209 rpcerr->re_errno = i; 2210 connmgr_cancelconn(cm_entry); 2211 return (NULL); 2212 } 2213 } else { 2214 if ((i = t_kbind(tiptr, NULL, NULL)) != 0) { 2215 RPCLOG(1, "clnt_cots_kcreate: t_kbind: %d\n", i); 2216 (void) t_kclose(tiptr, 1); 2217 rpcerr->re_errno = i; 2218 connmgr_cancelconn(cm_entry); 2219 return (NULL); 2220 } 2221 } 2222 2223 { 2224 /* 2225 * Keep the kernel stack lean. Don't move this call 2226 * declaration to the top of this function because a 2227 * call is declared in connmgr_wrapconnect() 2228 */ 2229 calllist_t call; 2230 2231 bzero(&call, sizeof (call)); 2232 cv_init(&call.call_cv, NULL, CV_DEFAULT, NULL); 2233 2234 /* 2235 * This is a bound end-point so don't close it's stream. 
2236 */ 2237 connected = connmgr_connect(cm_entry, wq, destaddr, addrfmly, 2238 &call, &tidu_size, FALSE, waitp, nosignal, cr); 2239 *rpcerr = call.call_err; 2240 cv_destroy(&call.call_cv); 2241 2242 } 2243 2244 mutex_enter(&connmgr_lock); 2245 2246 /* 2247 * Set up a transport entry in the connection manager's list. 2248 */ 2249 cm_entry->x_src.buf = kmem_zalloc(srcaddr->len, KM_SLEEP); 2250 bcopy(srcaddr->buf, cm_entry->x_src.buf, srcaddr->len); 2251 cm_entry->x_src.len = cm_entry->x_src.maxlen = srcaddr->len; 2252 2253 cm_entry->x_tiptr = tiptr; 2254 cm_entry->x_time = ddi_get_lbolt(); 2255 2256 if (tiptr->tp_info.servtype == T_COTS_ORD) 2257 cm_entry->x_ordrel = TRUE; 2258 else 2259 cm_entry->x_ordrel = FALSE; 2260 2261 cm_entry->x_tidu_size = tidu_size; 2262 2263 if (cm_entry->x_early_disc) { 2264 /* 2265 * We need to check if a disconnect request has come 2266 * while we are connected, if so, then we need to 2267 * set rpcerr->re_status appropriately before returning 2268 * NULL to caller. 2269 */ 2270 if (rpcerr->re_status == RPC_SUCCESS) 2271 rpcerr->re_status = RPC_XPRTFAILED; 2272 cm_entry->x_connected = FALSE; 2273 } else 2274 cm_entry->x_connected = connected; 2275 2276 /* 2277 * There could be a discrepancy here such that 2278 * x_early_disc is TRUE yet connected is TRUE as well 2279 * and the connection is actually connected. In that case 2280 * lets be conservative and declare the connection as not 2281 * connected. 2282 */ 2283 cm_entry->x_early_disc = FALSE; 2284 cm_entry->x_needdis = (cm_entry->x_connected == FALSE); 2285 cm_entry->x_ctime = ddi_get_lbolt(); 2286 2287 /* 2288 * Notify any threads waiting that the connection attempt is done. 
2289 */ 2290 cm_entry->x_thread = FALSE; 2291 cv_broadcast(&cm_entry->x_conn_cv); 2292 2293 if (cm_entry->x_connected == FALSE) { 2294 mutex_exit(&connmgr_lock); 2295 connmgr_release(cm_entry); 2296 return (NULL); 2297 } 2298 2299 mutex_exit(&connmgr_lock); 2300 2301 return (cm_entry); 2302 } 2303 2304 /* 2305 * Keep the cm_xprt entry on the connecton list when making a connection. This 2306 * is to prevent multiple connections to a slow server from appearing. 2307 * We use the bit field x_thread to tell if a thread is doing a connection 2308 * which keeps other interested threads from messing with connection. 2309 * Those other threads just wait if x_thread is set. 2310 * 2311 * If x_thread is not set, then we do the actual work of connecting via 2312 * connmgr_connect(). 2313 * 2314 * mutex convention: called with connmgr_lock held, returns with it released. 2315 */ 2316 static struct cm_xprt * 2317 connmgr_wrapconnect( 2318 struct cm_xprt *cm_entry, 2319 const struct timeval *waitp, 2320 struct netbuf *destaddr, 2321 int addrfmly, 2322 struct netbuf *srcaddr, 2323 struct rpc_err *rpcerr, 2324 bool_t reconnect, 2325 bool_t nosignal, 2326 cred_t *cr) 2327 { 2328 ASSERT(MUTEX_HELD(&connmgr_lock)); 2329 /* 2330 * Hold this entry as we are about to drop connmgr_lock. 2331 */ 2332 CONN_HOLD(cm_entry); 2333 2334 /* 2335 * If there is a thread already making a connection for us, then 2336 * wait for it to complete the connection. 
2337 */ 2338 if (cm_entry->x_thread == TRUE) { 2339 rpcerr->re_status = connmgr_cwait(cm_entry, waitp, nosignal); 2340 2341 if (rpcerr->re_status != RPC_SUCCESS) { 2342 mutex_exit(&connmgr_lock); 2343 connmgr_release(cm_entry); 2344 return (NULL); 2345 } 2346 } else { 2347 bool_t connected; 2348 calllist_t call; 2349 2350 cm_entry->x_thread = TRUE; 2351 2352 while (cm_entry->x_needrel == TRUE) { 2353 cm_entry->x_needrel = FALSE; 2354 2355 connmgr_sndrel(cm_entry); 2356 delay(drv_usectohz(1000000)); 2357 2358 mutex_enter(&connmgr_lock); 2359 } 2360 2361 /* 2362 * If we need to send a T_DISCON_REQ, send one. 2363 */ 2364 connmgr_dis_and_wait(cm_entry); 2365 2366 mutex_exit(&connmgr_lock); 2367 2368 bzero(&call, sizeof (call)); 2369 cv_init(&call.call_cv, NULL, CV_DEFAULT, NULL); 2370 2371 connected = connmgr_connect(cm_entry, cm_entry->x_wq, 2372 destaddr, addrfmly, &call, &cm_entry->x_tidu_size, 2373 reconnect, waitp, nosignal, cr); 2374 2375 *rpcerr = call.call_err; 2376 cv_destroy(&call.call_cv); 2377 2378 mutex_enter(&connmgr_lock); 2379 2380 2381 if (cm_entry->x_early_disc) { 2382 /* 2383 * We need to check if a disconnect request has come 2384 * while we are connected, if so, then we need to 2385 * set rpcerr->re_status appropriately before returning 2386 * NULL to caller. 2387 */ 2388 if (rpcerr->re_status == RPC_SUCCESS) 2389 rpcerr->re_status = RPC_XPRTFAILED; 2390 cm_entry->x_connected = FALSE; 2391 } else 2392 cm_entry->x_connected = connected; 2393 2394 /* 2395 * There could be a discrepancy here such that 2396 * x_early_disc is TRUE yet connected is TRUE as well 2397 * and the connection is actually connected. In that case 2398 * lets be conservative and declare the connection as not 2399 * connected. 2400 */ 2401 2402 cm_entry->x_early_disc = FALSE; 2403 cm_entry->x_needdis = (cm_entry->x_connected == FALSE); 2404 2405 2406 /* 2407 * connmgr_connect() may have given up before the connection 2408 * actually timed out. 
So ensure that before the next 2409 * connection attempt we do a disconnect. 2410 */ 2411 cm_entry->x_ctime = ddi_get_lbolt(); 2412 cm_entry->x_thread = FALSE; 2413 2414 cv_broadcast(&cm_entry->x_conn_cv); 2415 2416 if (cm_entry->x_connected == FALSE) { 2417 mutex_exit(&connmgr_lock); 2418 connmgr_release(cm_entry); 2419 return (NULL); 2420 } 2421 } 2422 2423 if (srcaddr != NULL) { 2424 /* 2425 * Copy into the handle the 2426 * source address of the 2427 * connection, which we will use 2428 * in case of a later retry. 2429 */ 2430 if (srcaddr->len != cm_entry->x_src.len) { 2431 if (srcaddr->maxlen > 0) 2432 kmem_free(srcaddr->buf, srcaddr->maxlen); 2433 srcaddr->buf = kmem_zalloc(cm_entry->x_src.len, 2434 KM_SLEEP); 2435 srcaddr->maxlen = srcaddr->len = 2436 cm_entry->x_src.len; 2437 } 2438 bcopy(cm_entry->x_src.buf, srcaddr->buf, srcaddr->len); 2439 } 2440 cm_entry->x_time = ddi_get_lbolt(); 2441 mutex_exit(&connmgr_lock); 2442 return (cm_entry); 2443 } 2444 2445 /* 2446 * If we need to send a T_DISCON_REQ, send one. 2447 */ 2448 static void 2449 connmgr_dis_and_wait(struct cm_xprt *cm_entry) 2450 { 2451 ASSERT(MUTEX_HELD(&connmgr_lock)); 2452 for (;;) { 2453 while (cm_entry->x_needdis == TRUE) { 2454 RPCLOG(8, "connmgr_dis_and_wait: need " 2455 "T_DISCON_REQ for connection 0x%p\n", 2456 (void *)cm_entry); 2457 cm_entry->x_needdis = FALSE; 2458 cm_entry->x_waitdis = TRUE; 2459 2460 connmgr_snddis(cm_entry); 2461 2462 mutex_enter(&connmgr_lock); 2463 } 2464 2465 if (cm_entry->x_waitdis == TRUE) { 2466 clock_t timout; 2467 2468 RPCLOG(8, "connmgr_dis_and_wait waiting for " 2469 "T_DISCON_REQ's ACK for connection %p\n", 2470 (void *)cm_entry); 2471 2472 timout = clnt_cots_min_conntout * drv_usectohz(1000000); 2473 2474 /* 2475 * The TPI spec says that the T_DISCON_REQ 2476 * will get acknowledged, but in practice 2477 * the ACK may never get sent. So don't 2478 * block forever. 
2479 */ 2480 (void) cv_reltimedwait(&cm_entry->x_dis_cv, 2481 &connmgr_lock, timout, TR_CLOCK_TICK); 2482 } 2483 /* 2484 * If we got the ACK, break. If we didn't, 2485 * then send another T_DISCON_REQ. 2486 */ 2487 if (cm_entry->x_waitdis == FALSE) { 2488 break; 2489 } else { 2490 RPCLOG(8, "connmgr_dis_and_wait: did" 2491 "not get T_DISCON_REQ's ACK for " 2492 "connection %p\n", (void *)cm_entry); 2493 cm_entry->x_needdis = TRUE; 2494 } 2495 } 2496 } 2497 2498 static void 2499 connmgr_cancelconn(struct cm_xprt *cm_entry) 2500 { 2501 /* 2502 * Mark the connection table entry as dead; the next thread that 2503 * goes through connmgr_release() will notice this and deal with it. 2504 */ 2505 mutex_enter(&connmgr_lock); 2506 cm_entry->x_dead = TRUE; 2507 2508 /* 2509 * Notify any threads waiting for the connection that it isn't 2510 * going to happen. 2511 */ 2512 cm_entry->x_thread = FALSE; 2513 cv_broadcast(&cm_entry->x_conn_cv); 2514 mutex_exit(&connmgr_lock); 2515 2516 connmgr_release(cm_entry); 2517 } 2518 2519 static void 2520 connmgr_close(struct cm_xprt *cm_entry) 2521 { 2522 mutex_enter(&cm_entry->x_lock); 2523 while (cm_entry->x_ref != 0) { 2524 /* 2525 * Must be a noninterruptible wait. 
2526 */ 2527 cv_wait(&cm_entry->x_cv, &cm_entry->x_lock); 2528 } 2529 2530 if (cm_entry->x_tiptr != NULL) 2531 (void) t_kclose(cm_entry->x_tiptr, 1); 2532 2533 mutex_exit(&cm_entry->x_lock); 2534 if (cm_entry->x_ksp != NULL) { 2535 mutex_enter(&connmgr_lock); 2536 cm_entry->x_ksp->ks_private = NULL; 2537 mutex_exit(&connmgr_lock); 2538 2539 /* 2540 * Must free the buffer we allocated for the 2541 * server address in the update function 2542 */ 2543 if (((struct cm_kstat_xprt *)(cm_entry->x_ksp->ks_data))-> 2544 x_server.value.str.addr.ptr != NULL) 2545 kmem_free(((struct cm_kstat_xprt *)(cm_entry->x_ksp-> 2546 ks_data))->x_server.value.str.addr.ptr, 2547 INET6_ADDRSTRLEN); 2548 kmem_free(cm_entry->x_ksp->ks_data, 2549 cm_entry->x_ksp->ks_data_size); 2550 kstat_delete(cm_entry->x_ksp); 2551 } 2552 2553 mutex_destroy(&cm_entry->x_lock); 2554 cv_destroy(&cm_entry->x_cv); 2555 cv_destroy(&cm_entry->x_conn_cv); 2556 cv_destroy(&cm_entry->x_dis_cv); 2557 2558 if (cm_entry->x_server.buf != NULL) 2559 kmem_free(cm_entry->x_server.buf, cm_entry->x_server.maxlen); 2560 if (cm_entry->x_src.buf != NULL) 2561 kmem_free(cm_entry->x_src.buf, cm_entry->x_src.maxlen); 2562 kmem_free(cm_entry, sizeof (struct cm_xprt)); 2563 } 2564 2565 /* 2566 * Called by KRPC after sending the call message to release the connection 2567 * it was using. 2568 */ 2569 static void 2570 connmgr_release(struct cm_xprt *cm_entry) 2571 { 2572 mutex_enter(&cm_entry->x_lock); 2573 cm_entry->x_ref--; 2574 if (cm_entry->x_ref == 0) 2575 cv_signal(&cm_entry->x_cv); 2576 mutex_exit(&cm_entry->x_lock); 2577 } 2578 2579 /* 2580 * Set TCP receive and xmit buffer size for RPC connections. 2581 */ 2582 static bool_t 2583 connmgr_setbufsz(calllist_t *e, queue_t *wq, cred_t *cr) 2584 { 2585 int ok = FALSE; 2586 int val; 2587 2588 if (rpc_default_tcp_bufsz) 2589 return (FALSE); 2590 2591 /* 2592 * Only set new buffer size if it's larger than the system 2593 * default buffer size. 
If smaller buffer size is needed 2594 * then use /etc/system to set rpc_default_tcp_bufsz to 1. 2595 */ 2596 ok = connmgr_getopt_int(wq, SOL_SOCKET, SO_RCVBUF, &val, e, cr); 2597 if ((ok == TRUE) && (val < rpc_send_bufsz)) { 2598 ok = connmgr_setopt_int(wq, SOL_SOCKET, SO_RCVBUF, 2599 rpc_send_bufsz, e, cr); 2600 DTRACE_PROBE2(krpc__i__connmgr_rcvbufsz, 2601 int, ok, calllist_t *, e); 2602 } 2603 2604 ok = connmgr_getopt_int(wq, SOL_SOCKET, SO_SNDBUF, &val, e, cr); 2605 if ((ok == TRUE) && (val < rpc_recv_bufsz)) { 2606 ok = connmgr_setopt_int(wq, SOL_SOCKET, SO_SNDBUF, 2607 rpc_recv_bufsz, e, cr); 2608 DTRACE_PROBE2(krpc__i__connmgr_sndbufsz, 2609 int, ok, calllist_t *, e); 2610 } 2611 return (TRUE); 2612 } 2613 2614 /* 2615 * Given an open stream, connect to the remote. Returns true if connected, 2616 * false otherwise. 2617 */ 2618 static bool_t 2619 connmgr_connect( 2620 struct cm_xprt *cm_entry, 2621 queue_t *wq, 2622 struct netbuf *addr, 2623 int addrfmly, 2624 calllist_t *e, 2625 int *tidu_ptr, 2626 bool_t reconnect, 2627 const struct timeval *waitp, 2628 bool_t nosignal, 2629 cred_t *cr) 2630 { 2631 mblk_t *mp; 2632 struct T_conn_req *tcr; 2633 struct T_info_ack *tinfo; 2634 int interrupted, error; 2635 int tidu_size, kstat_instance; 2636 2637 /* if it's a reconnect, flush any lingering data messages */ 2638 if (reconnect) 2639 (void) putctl1(wq, M_FLUSH, FLUSHRW); 2640 2641 /* 2642 * Note: if the receiver uses SCM_UCRED/getpeerucred the pid will 2643 * appear as -1. 2644 */ 2645 mp = allocb_cred(sizeof (*tcr) + addr->len, cr, NOPID); 2646 if (mp == NULL) { 2647 /* 2648 * This is unfortunate, but we need to look up the stats for 2649 * this zone to increment the "memory allocation failed" 2650 * counter. curproc->p_zone is safe since we're initiating a 2651 * connection and not in some strange streams context. 
2652 */ 2653 struct rpcstat *rpcstat; 2654 2655 rpcstat = zone_getspecific(rpcstat_zone_key, rpc_zone()); 2656 ASSERT(rpcstat != NULL); 2657 2658 RPCLOG0(1, "connmgr_connect: cannot alloc mp for " 2659 "sending conn request\n"); 2660 COTSRCSTAT_INCR(rpcstat->rpc_cots_client, rcnomem); 2661 e->call_status = RPC_SYSTEMERROR; 2662 e->call_reason = ENOSR; 2663 return (FALSE); 2664 } 2665 2666 /* Set TCP buffer size for RPC connections if needed */ 2667 if (addrfmly == AF_INET || addrfmly == AF_INET6) 2668 (void) connmgr_setbufsz(e, wq, cr); 2669 2670 mp->b_datap->db_type = M_PROTO; 2671 tcr = (struct T_conn_req *)mp->b_rptr; 2672 bzero(tcr, sizeof (*tcr)); 2673 tcr->PRIM_type = T_CONN_REQ; 2674 tcr->DEST_length = addr->len; 2675 tcr->DEST_offset = sizeof (struct T_conn_req); 2676 mp->b_wptr = mp->b_rptr + sizeof (*tcr); 2677 2678 bcopy(addr->buf, mp->b_wptr, tcr->DEST_length); 2679 mp->b_wptr += tcr->DEST_length; 2680 2681 RPCLOG(8, "connmgr_connect: sending conn request on queue " 2682 "%p", (void *)wq); 2683 RPCLOG(8, " call %p\n", (void *)wq); 2684 /* 2685 * We use the entry in the handle that is normally used for 2686 * waiting for RPC replies to wait for the connection accept. 2687 */ 2688 if (clnt_dispatch_send(wq, mp, e, 0, 0) != RPC_SUCCESS) { 2689 DTRACE_PROBE(krpc__e__connmgr__connect__cantsend); 2690 freemsg(mp); 2691 return (FALSE); 2692 } 2693 2694 mutex_enter(&clnt_pending_lock); 2695 2696 /* 2697 * We wait for the transport connection to be made, or an 2698 * indication that it could not be made. 2699 */ 2700 interrupted = 0; 2701 2702 /* 2703 * waitforack should have been called with T_OK_ACK, but the 2704 * present implementation needs to be passed T_INFO_ACK to 2705 * work correctly. 2706 */ 2707 error = waitforack(e, T_INFO_ACK, waitp, nosignal); 2708 if (error == EINTR) 2709 interrupted = 1; 2710 if (zone_status_get(curproc->p_zone) >= ZONE_IS_EMPTY) { 2711 /* 2712 * No time to lose; we essentially have been signaled to 2713 * quit. 
2714 */ 2715 interrupted = 1; 2716 } 2717 #ifdef RPCDEBUG 2718 if (error == ETIME) 2719 RPCLOG0(8, "connmgr_connect: giving up " 2720 "on connection attempt; " 2721 "clnt_dispatch notifyconn " 2722 "diagnostic 'no one waiting for " 2723 "connection' should not be " 2724 "unexpected\n"); 2725 #endif 2726 if (e->call_prev) 2727 e->call_prev->call_next = e->call_next; 2728 else 2729 clnt_pending = e->call_next; 2730 if (e->call_next) 2731 e->call_next->call_prev = e->call_prev; 2732 mutex_exit(&clnt_pending_lock); 2733 2734 if (e->call_status != RPC_SUCCESS || error != 0) { 2735 if (interrupted) 2736 e->call_status = RPC_INTR; 2737 else if (error == ETIME) 2738 e->call_status = RPC_TIMEDOUT; 2739 else if (error == EPROTO) { 2740 e->call_status = RPC_SYSTEMERROR; 2741 e->call_reason = EPROTO; 2742 } 2743 2744 RPCLOG(8, "connmgr_connect: can't connect, status: " 2745 "%s\n", clnt_sperrno(e->call_status)); 2746 2747 if (e->call_reply) { 2748 freemsg(e->call_reply); 2749 e->call_reply = NULL; 2750 } 2751 2752 return (FALSE); 2753 } 2754 /* 2755 * The result of the "connection accept" is a T_info_ack 2756 * in the call_reply field. 2757 */ 2758 ASSERT(e->call_reply != NULL); 2759 mp = e->call_reply; 2760 e->call_reply = NULL; 2761 tinfo = (struct T_info_ack *)mp->b_rptr; 2762 2763 tidu_size = tinfo->TIDU_size; 2764 tidu_size -= (tidu_size % BYTES_PER_XDR_UNIT); 2765 if (tidu_size > COTS_DEFAULT_ALLOCSIZE || (tidu_size <= 0)) 2766 tidu_size = COTS_DEFAULT_ALLOCSIZE; 2767 *tidu_ptr = tidu_size; 2768 2769 freemsg(mp); 2770 2771 /* 2772 * Set up the pertinent options. NODELAY is so the transport doesn't 2773 * buffer up RPC messages on either end. This may not be valid for 2774 * all transports. Failure to set this option is not cause to 2775 * bail out so we return success anyway. Note that lack of NODELAY 2776 * or some other way to flush the message on both ends will cause 2777 * lots of retries and terrible performance. 
2778 */ 2779 if (addrfmly == AF_INET || addrfmly == AF_INET6) { 2780 (void) connmgr_setopt(wq, IPPROTO_TCP, TCP_NODELAY, e, cr); 2781 if (e->call_status == RPC_XPRTFAILED) 2782 return (FALSE); 2783 } 2784 2785 /* 2786 * Since we have a connection, we now need to figure out if 2787 * we need to create a kstat. If x_ksp is not NULL then we 2788 * are reusing a connection and so we do not need to create 2789 * another kstat -- lets just return. 2790 */ 2791 if (cm_entry->x_ksp != NULL) 2792 return (TRUE); 2793 2794 /* 2795 * We need to increment rpc_kstat_instance atomically to prevent 2796 * two kstats being created with the same instance. 2797 */ 2798 kstat_instance = atomic_add_32_nv((uint32_t *)&rpc_kstat_instance, 1); 2799 2800 if ((cm_entry->x_ksp = kstat_create_zone("unix", kstat_instance, 2801 "rpc_cots_connections", "rpc", KSTAT_TYPE_NAMED, 2802 (uint_t)(sizeof (cm_kstat_xprt_t) / sizeof (kstat_named_t)), 2803 KSTAT_FLAG_VIRTUAL, cm_entry->x_zoneid)) == NULL) { 2804 return (TRUE); 2805 } 2806 2807 cm_entry->x_ksp->ks_lock = &connmgr_lock; 2808 cm_entry->x_ksp->ks_private = cm_entry; 2809 cm_entry->x_ksp->ks_data_size = ((INET6_ADDRSTRLEN * sizeof (char)) 2810 + sizeof (cm_kstat_template)); 2811 cm_entry->x_ksp->ks_data = kmem_alloc(cm_entry->x_ksp->ks_data_size, 2812 KM_SLEEP); 2813 bcopy(&cm_kstat_template, cm_entry->x_ksp->ks_data, 2814 cm_entry->x_ksp->ks_data_size); 2815 ((struct cm_kstat_xprt *)(cm_entry->x_ksp->ks_data))-> 2816 x_server.value.str.addr.ptr = 2817 kmem_alloc(INET6_ADDRSTRLEN, KM_SLEEP); 2818 2819 cm_entry->x_ksp->ks_update = conn_kstat_update; 2820 kstat_install(cm_entry->x_ksp); 2821 return (TRUE); 2822 } 2823 2824 /* 2825 * Verify that the specified offset falls within the mblk and 2826 * that the resulting pointer is aligned. 2827 * Returns NULL if not. 
2828 * 2829 * code from fs/sockfs/socksubr.c 2830 */ 2831 static void * 2832 connmgr_opt_getoff(mblk_t *mp, t_uscalar_t offset, 2833 t_uscalar_t length, uint_t align_size) 2834 { 2835 uintptr_t ptr1, ptr2; 2836 2837 ASSERT(mp && mp->b_wptr >= mp->b_rptr); 2838 ptr1 = (uintptr_t)mp->b_rptr + offset; 2839 ptr2 = (uintptr_t)ptr1 + length; 2840 if (ptr1 < (uintptr_t)mp->b_rptr || ptr2 > (uintptr_t)mp->b_wptr) { 2841 return (NULL); 2842 } 2843 if ((ptr1 & (align_size - 1)) != 0) { 2844 return (NULL); 2845 } 2846 return ((void *)ptr1); 2847 } 2848 2849 static bool_t 2850 connmgr_getopt_int(queue_t *wq, int level, int name, int *val, 2851 calllist_t *e, cred_t *cr) 2852 { 2853 mblk_t *mp; 2854 struct opthdr *opt, *opt_res; 2855 struct T_optmgmt_req *tor; 2856 struct T_optmgmt_ack *opt_ack; 2857 struct timeval waitp; 2858 int error; 2859 2860 mp = allocb_cred(sizeof (struct T_optmgmt_req) + 2861 sizeof (struct opthdr) + sizeof (int), cr, NOPID); 2862 if (mp == NULL) 2863 return (FALSE); 2864 2865 mp->b_datap->db_type = M_PROTO; 2866 tor = (struct T_optmgmt_req *)(mp->b_rptr); 2867 tor->PRIM_type = T_SVR4_OPTMGMT_REQ; 2868 tor->MGMT_flags = T_CURRENT; 2869 tor->OPT_length = sizeof (struct opthdr) + sizeof (int); 2870 tor->OPT_offset = sizeof (struct T_optmgmt_req); 2871 2872 opt = (struct opthdr *)(mp->b_rptr + sizeof (struct T_optmgmt_req)); 2873 opt->level = level; 2874 opt->name = name; 2875 opt->len = sizeof (int); 2876 mp->b_wptr += sizeof (struct T_optmgmt_req) + sizeof (struct opthdr) + 2877 sizeof (int); 2878 2879 /* 2880 * We will use this connection regardless 2881 * of whether or not the option is readable. 
2882 */ 2883 if (clnt_dispatch_send(wq, mp, e, 0, 0) != RPC_SUCCESS) { 2884 DTRACE_PROBE(krpc__e__connmgr__getopt__cantsend); 2885 freemsg(mp); 2886 return (FALSE); 2887 } 2888 2889 mutex_enter(&clnt_pending_lock); 2890 2891 waitp.tv_sec = clnt_cots_min_conntout; 2892 waitp.tv_usec = 0; 2893 error = waitforack(e, T_OPTMGMT_ACK, &waitp, 1); 2894 2895 if (e->call_prev) 2896 e->call_prev->call_next = e->call_next; 2897 else 2898 clnt_pending = e->call_next; 2899 if (e->call_next) 2900 e->call_next->call_prev = e->call_prev; 2901 mutex_exit(&clnt_pending_lock); 2902 2903 /* get reply message */ 2904 mp = e->call_reply; 2905 e->call_reply = NULL; 2906 2907 if ((!mp) || (e->call_status != RPC_SUCCESS) || (error != 0)) { 2908 2909 DTRACE_PROBE4(krpc__e__connmgr_getopt, int, name, 2910 int, e->call_status, int, error, mblk_t *, mp); 2911 2912 if (mp) 2913 freemsg(mp); 2914 return (FALSE); 2915 } 2916 2917 opt_ack = (struct T_optmgmt_ack *)mp->b_rptr; 2918 opt_res = (struct opthdr *)connmgr_opt_getoff(mp, opt_ack->OPT_offset, 2919 opt_ack->OPT_length, __TPI_ALIGN_SIZE); 2920 2921 if (!opt_res) { 2922 DTRACE_PROBE4(krpc__e__connmgr_optres, mblk_t *, mp, int, name, 2923 int, opt_ack->OPT_offset, int, opt_ack->OPT_length); 2924 freemsg(mp); 2925 return (FALSE); 2926 } 2927 *val = *(int *)&opt_res[1]; 2928 2929 DTRACE_PROBE2(connmgr_getopt__ok, int, name, int, *val); 2930 2931 freemsg(mp); 2932 return (TRUE); 2933 } 2934 2935 /* 2936 * Called by connmgr_connect to set an option on the new stream. 
2937 */ 2938 static bool_t 2939 connmgr_setopt_int(queue_t *wq, int level, int name, int val, 2940 calllist_t *e, cred_t *cr) 2941 { 2942 mblk_t *mp; 2943 struct opthdr *opt; 2944 struct T_optmgmt_req *tor; 2945 struct timeval waitp; 2946 int error; 2947 2948 mp = allocb_cred(sizeof (struct T_optmgmt_req) + 2949 sizeof (struct opthdr) + sizeof (int), cr, NOPID); 2950 if (mp == NULL) { 2951 RPCLOG0(1, "connmgr_setopt: cannot alloc mp for option " 2952 "request\n"); 2953 return (FALSE); 2954 } 2955 2956 mp->b_datap->db_type = M_PROTO; 2957 tor = (struct T_optmgmt_req *)(mp->b_rptr); 2958 tor->PRIM_type = T_SVR4_OPTMGMT_REQ; 2959 tor->MGMT_flags = T_NEGOTIATE; 2960 tor->OPT_length = sizeof (struct opthdr) + sizeof (int); 2961 tor->OPT_offset = sizeof (struct T_optmgmt_req); 2962 2963 opt = (struct opthdr *)(mp->b_rptr + sizeof (struct T_optmgmt_req)); 2964 opt->level = level; 2965 opt->name = name; 2966 opt->len = sizeof (int); 2967 *(int *)((char *)opt + sizeof (*opt)) = val; 2968 mp->b_wptr += sizeof (struct T_optmgmt_req) + sizeof (struct opthdr) + 2969 sizeof (int); 2970 2971 /* 2972 * We will use this connection regardless 2973 * of whether or not the option is settable. 
2974 */ 2975 if (clnt_dispatch_send(wq, mp, e, 0, 0) != RPC_SUCCESS) { 2976 DTRACE_PROBE(krpc__e__connmgr__setopt__cantsend); 2977 freemsg(mp); 2978 return (FALSE); 2979 } 2980 2981 mutex_enter(&clnt_pending_lock); 2982 2983 waitp.tv_sec = clnt_cots_min_conntout; 2984 waitp.tv_usec = 0; 2985 error = waitforack(e, T_OPTMGMT_ACK, &waitp, 1); 2986 2987 if (e->call_prev) 2988 e->call_prev->call_next = e->call_next; 2989 else 2990 clnt_pending = e->call_next; 2991 if (e->call_next) 2992 e->call_next->call_prev = e->call_prev; 2993 mutex_exit(&clnt_pending_lock); 2994 2995 if (e->call_reply != NULL) { 2996 freemsg(e->call_reply); 2997 e->call_reply = NULL; 2998 } 2999 3000 if (e->call_status != RPC_SUCCESS || error != 0) { 3001 RPCLOG(1, "connmgr_setopt: can't set option: %d\n", name); 3002 return (FALSE); 3003 } 3004 RPCLOG(8, "connmgr_setopt: successfully set option: %d\n", name); 3005 return (TRUE); 3006 } 3007 3008 static bool_t 3009 connmgr_setopt(queue_t *wq, int level, int name, calllist_t *e, cred_t *cr) 3010 { 3011 return (connmgr_setopt_int(wq, level, name, 1, e, cr)); 3012 } 3013 3014 #ifdef DEBUG 3015 3016 /* 3017 * This is a knob to let us force code coverage in allocation failure 3018 * case. 3019 */ 3020 static int connmgr_failsnd; 3021 #define CONN_SND_ALLOC(Size, Pri) \ 3022 ((connmgr_failsnd-- > 0) ? NULL : allocb(Size, Pri)) 3023 3024 #else 3025 3026 #define CONN_SND_ALLOC(Size, Pri) allocb(Size, Pri) 3027 3028 #endif 3029 3030 /* 3031 * Sends an orderly release on the specified queue. 3032 * Entered with connmgr_lock. 
Exited without connmgr_lock 3033 */ 3034 static void 3035 connmgr_sndrel(struct cm_xprt *cm_entry) 3036 { 3037 struct T_ordrel_req *torr; 3038 mblk_t *mp; 3039 queue_t *q = cm_entry->x_wq; 3040 ASSERT(MUTEX_HELD(&connmgr_lock)); 3041 mp = CONN_SND_ALLOC(sizeof (struct T_ordrel_req), BPRI_LO); 3042 if (mp == NULL) { 3043 cm_entry->x_needrel = TRUE; 3044 mutex_exit(&connmgr_lock); 3045 RPCLOG(1, "connmgr_sndrel: cannot alloc mp for sending ordrel " 3046 "to queue %p\n", (void *)q); 3047 return; 3048 } 3049 mutex_exit(&connmgr_lock); 3050 3051 mp->b_datap->db_type = M_PROTO; 3052 torr = (struct T_ordrel_req *)(mp->b_rptr); 3053 torr->PRIM_type = T_ORDREL_REQ; 3054 mp->b_wptr = mp->b_rptr + sizeof (struct T_ordrel_req); 3055 3056 RPCLOG(8, "connmgr_sndrel: sending ordrel to queue %p\n", (void *)q); 3057 put(q, mp); 3058 } 3059 3060 /* 3061 * Sends an disconnect on the specified queue. 3062 * Entered with connmgr_lock. Exited without connmgr_lock 3063 */ 3064 static void 3065 connmgr_snddis(struct cm_xprt *cm_entry) 3066 { 3067 struct T_discon_req *tdis; 3068 mblk_t *mp; 3069 queue_t *q = cm_entry->x_wq; 3070 3071 ASSERT(MUTEX_HELD(&connmgr_lock)); 3072 mp = CONN_SND_ALLOC(sizeof (*tdis), BPRI_LO); 3073 if (mp == NULL) { 3074 cm_entry->x_needdis = TRUE; 3075 mutex_exit(&connmgr_lock); 3076 RPCLOG(1, "connmgr_snddis: cannot alloc mp for sending discon " 3077 "to queue %p\n", (void *)q); 3078 return; 3079 } 3080 mutex_exit(&connmgr_lock); 3081 3082 mp->b_datap->db_type = M_PROTO; 3083 tdis = (struct T_discon_req *)mp->b_rptr; 3084 tdis->PRIM_type = T_DISCON_REQ; 3085 mp->b_wptr = mp->b_rptr + sizeof (*tdis); 3086 3087 RPCLOG(8, "connmgr_snddis: sending discon to queue %p\n", (void *)q); 3088 put(q, mp); 3089 } 3090 3091 /* 3092 * Sets up the entry for receiving replies, and calls rpcmod's write put proc 3093 * (through put) to send the call. 
3094 */ 3095 static int 3096 clnt_dispatch_send(queue_t *q, mblk_t *mp, calllist_t *e, uint_t xid, 3097 uint_t queue_flag) 3098 { 3099 ASSERT(e != NULL); 3100 3101 e->call_status = RPC_TIMEDOUT; /* optimistic, eh? */ 3102 e->call_reason = 0; 3103 e->call_wq = q; 3104 e->call_xid = xid; 3105 e->call_notified = FALSE; 3106 3107 if (!canput(q)) { 3108 e->call_status = RPC_CANTSEND; 3109 e->call_reason = ENOBUFS; 3110 return (RPC_CANTSEND); 3111 } 3112 3113 /* 3114 * If queue_flag is set then the calllist_t is already on the hash 3115 * queue. In this case just send the message and return. 3116 */ 3117 if (queue_flag) { 3118 put(q, mp); 3119 return (RPC_SUCCESS); 3120 3121 } 3122 3123 /* 3124 * Set up calls for RPC requests (with XID != 0) on the hash 3125 * queue for fast lookups and place other calls (i.e. 3126 * connection management) on the linked list. 3127 */ 3128 if (xid != 0) { 3129 RPCLOG(64, "clnt_dispatch_send: putting xid 0x%x on " 3130 "dispatch list\n", xid); 3131 e->call_hash = call_hash(xid, clnt_cots_hash_size); 3132 e->call_bucket = &cots_call_ht[e->call_hash]; 3133 call_table_enter(e); 3134 } else { 3135 mutex_enter(&clnt_pending_lock); 3136 if (clnt_pending) 3137 clnt_pending->call_prev = e; 3138 e->call_next = clnt_pending; 3139 e->call_prev = NULL; 3140 clnt_pending = e; 3141 mutex_exit(&clnt_pending_lock); 3142 } 3143 3144 put(q, mp); 3145 return (RPC_SUCCESS); 3146 } 3147 3148 /* 3149 * Called by rpcmod to notify a client with a clnt_pending call that its reply 3150 * has arrived. If we can't find a client waiting for this reply, we log 3151 * the error and return. 
3152 */ 3153 bool_t 3154 clnt_dispatch_notify(mblk_t *mp, zoneid_t zoneid) 3155 { 3156 calllist_t *e = NULL; 3157 call_table_t *chtp; 3158 uint32_t xid; 3159 uint_t hash; 3160 3161 if ((IS_P2ALIGNED(mp->b_rptr, sizeof (uint32_t))) && 3162 (mp->b_wptr - mp->b_rptr) >= sizeof (xid)) 3163 xid = *((uint32_t *)mp->b_rptr); 3164 else { 3165 int i = 0; 3166 unsigned char *p = (unsigned char *)&xid; 3167 unsigned char *rptr; 3168 mblk_t *tmp = mp; 3169 3170 /* 3171 * Copy the xid, byte-by-byte into xid. 3172 */ 3173 while (tmp) { 3174 rptr = tmp->b_rptr; 3175 while (rptr < tmp->b_wptr) { 3176 *p++ = *rptr++; 3177 if (++i >= sizeof (xid)) 3178 goto done_xid_copy; 3179 } 3180 tmp = tmp->b_cont; 3181 } 3182 3183 /* 3184 * If we got here, we ran out of mblk space before the 3185 * xid could be copied. 3186 */ 3187 ASSERT(tmp == NULL && i < sizeof (xid)); 3188 3189 RPCLOG0(1, 3190 "clnt_dispatch_notify: message less than size of xid\n"); 3191 return (FALSE); 3192 3193 } 3194 done_xid_copy: 3195 3196 hash = call_hash(xid, clnt_cots_hash_size); 3197 chtp = &cots_call_ht[hash]; 3198 /* call_table_find returns with the hash bucket locked */ 3199 call_table_find(chtp, xid, e); 3200 3201 if (e != NULL) { 3202 /* 3203 * Found thread waiting for this reply 3204 */ 3205 mutex_enter(&e->call_lock); 3206 3207 /* 3208 * verify that the reply is coming in on 3209 * the same zone that it was sent from. 3210 */ 3211 if (e->call_zoneid != zoneid) { 3212 mutex_exit(&e->call_lock); 3213 mutex_exit(&chtp->ct_lock); 3214 RPCLOG0(1, "clnt_dispatch_notify: incorrect zoneid\n"); 3215 return (FALSE); 3216 } 3217 3218 if (e->call_reply) 3219 /* 3220 * This can happen under the following scenario: 3221 * clnt_cots_kcallit() times out on the response, 3222 * rfscall() repeats the CLNT_CALL() with 3223 * the same xid, clnt_cots_kcallit() sends the retry, 3224 * thereby putting the clnt handle on the pending list, 3225 * the first response arrives, signalling the thread 3226 * in clnt_cots_kcallit(). 
Before that thread is 3227 * dispatched, the second response arrives as well, 3228 * and clnt_dispatch_notify still finds the handle on 3229 * the pending list, with call_reply set. So free the 3230 * old reply now. 3231 * 3232 * It is also possible for a response intended for 3233 * an RPC call with a different xid to reside here. 3234 * This can happen if the thread that owned this 3235 * client handle prior to the current owner bailed 3236 * out and left its call record on the dispatch 3237 * queue. A window exists where the response can 3238 * arrive before the current owner dispatches its 3239 * RPC call. 3240 * 3241 * In any case, this is the very last point where we 3242 * can safely check the call_reply field before 3243 * placing the new response there. 3244 */ 3245 freemsg(e->call_reply); 3246 e->call_reply = mp; 3247 e->call_status = RPC_SUCCESS; 3248 e->call_notified = TRUE; 3249 cv_signal(&e->call_cv); 3250 mutex_exit(&e->call_lock); 3251 mutex_exit(&chtp->ct_lock); 3252 return (TRUE); 3253 } else { 3254 zone_t *zone; 3255 struct rpcstat *rpcstat; 3256 3257 mutex_exit(&chtp->ct_lock); 3258 RPCLOG(65, "clnt_dispatch_notify: no caller for reply 0x%x\n", 3259 xid); 3260 /* 3261 * This is unfortunate, but we need to lookup the zone so we 3262 * can increment its "rcbadxids" counter. 3263 */ 3264 zone = zone_find_by_id(zoneid); 3265 if (zone == NULL) { 3266 /* 3267 * The zone went away... 3268 */ 3269 return (FALSE); 3270 } 3271 rpcstat = zone_getspecific(rpcstat_zone_key, zone); 3272 if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) { 3273 /* 3274 * Not interested 3275 */ 3276 zone_rele(zone); 3277 return (FALSE); 3278 } 3279 COTSRCSTAT_INCR(rpcstat->rpc_cots_client, rcbadxids); 3280 zone_rele(zone); 3281 } 3282 return (FALSE); 3283 } 3284 3285 /* 3286 * Called by rpcmod when a non-data indication arrives. The ones in which we 3287 * are interested are connection indications and options acks. 
We dispatch 3288 * based on the queue the indication came in on. If we are not interested in 3289 * what came in, we return false to rpcmod, who will then pass it upstream. 3290 */ 3291 bool_t 3292 clnt_dispatch_notifyconn(queue_t *q, mblk_t *mp) 3293 { 3294 calllist_t *e; 3295 int type; 3296 3297 ASSERT((q->q_flag & QREADR) == 0); 3298 3299 type = ((union T_primitives *)mp->b_rptr)->type; 3300 RPCLOG(8, "clnt_dispatch_notifyconn: prim type: [%s]\n", 3301 rpc_tpiprim2name(type)); 3302 mutex_enter(&clnt_pending_lock); 3303 for (e = clnt_pending; /* NO CONDITION */; e = e->call_next) { 3304 if (e == NULL) { 3305 mutex_exit(&clnt_pending_lock); 3306 RPCLOG(1, "clnt_dispatch_notifyconn: no one waiting " 3307 "for connection on queue 0x%p\n", (void *)q); 3308 return (FALSE); 3309 } 3310 if (e->call_wq == q) 3311 break; 3312 } 3313 3314 switch (type) { 3315 case T_CONN_CON: 3316 /* 3317 * The transport is now connected, send a T_INFO_REQ to get 3318 * the tidu size. 3319 */ 3320 mutex_exit(&clnt_pending_lock); 3321 ASSERT(mp->b_datap->db_lim - mp->b_datap->db_base >= 3322 sizeof (struct T_info_req)); 3323 mp->b_rptr = mp->b_datap->db_base; 3324 ((union T_primitives *)mp->b_rptr)->type = T_INFO_REQ; 3325 mp->b_wptr = mp->b_rptr + sizeof (struct T_info_req); 3326 mp->b_datap->db_type = M_PCPROTO; 3327 put(q, mp); 3328 return (TRUE); 3329 case T_INFO_ACK: 3330 case T_OPTMGMT_ACK: 3331 e->call_status = RPC_SUCCESS; 3332 e->call_reply = mp; 3333 e->call_notified = TRUE; 3334 cv_signal(&e->call_cv); 3335 break; 3336 case T_ERROR_ACK: 3337 e->call_status = RPC_CANTCONNECT; 3338 e->call_reply = mp; 3339 e->call_notified = TRUE; 3340 cv_signal(&e->call_cv); 3341 break; 3342 case T_OK_ACK: 3343 /* 3344 * Great, but we are really waiting for a T_CONN_CON 3345 */ 3346 freemsg(mp); 3347 break; 3348 default: 3349 mutex_exit(&clnt_pending_lock); 3350 RPCLOG(1, "clnt_dispatch_notifyconn: bad type %d\n", type); 3351 return (FALSE); 3352 } 3353 3354 mutex_exit(&clnt_pending_lock); 3355 
return (TRUE); 3356 } 3357 3358 /* 3359 * Called by rpcmod when the transport is (or should be) going away. Informs 3360 * all callers waiting for replies and marks the entry in the connection 3361 * manager's list as unconnected, and either closing (close handshake in 3362 * progress) or dead. 3363 */ 3364 void 3365 clnt_dispatch_notifyall(queue_t *q, int32_t msg_type, int32_t reason) 3366 { 3367 calllist_t *e; 3368 call_table_t *ctp; 3369 struct cm_xprt *cm_entry; 3370 int have_connmgr_lock; 3371 int i; 3372 3373 ASSERT((q->q_flag & QREADR) == 0); 3374 3375 RPCLOG(1, "clnt_dispatch_notifyall on queue %p", (void *)q); 3376 RPCLOG(1, " received a notifcation prim type [%s]", 3377 rpc_tpiprim2name(msg_type)); 3378 RPCLOG(1, " and reason %d\n", reason); 3379 3380 /* 3381 * Find the transport entry in the connection manager's list, close 3382 * the transport and delete the entry. In the case where rpcmod's 3383 * idle timer goes off, it sends us a T_ORDREL_REQ, indicating we 3384 * should gracefully close the connection. 3385 */ 3386 have_connmgr_lock = 1; 3387 mutex_enter(&connmgr_lock); 3388 for (cm_entry = cm_hd; cm_entry; cm_entry = cm_entry->x_next) { 3389 ASSERT(cm_entry != cm_entry->x_next); 3390 if (cm_entry->x_wq == q) { 3391 ASSERT(MUTEX_HELD(&connmgr_lock)); 3392 ASSERT(have_connmgr_lock == 1); 3393 switch (msg_type) { 3394 case T_ORDREL_REQ: 3395 3396 if (cm_entry->x_dead) { 3397 RPCLOG(1, "idle timeout on dead " 3398 "connection: %p\n", 3399 (void *)cm_entry); 3400 if (clnt_stop_idle != NULL) 3401 (*clnt_stop_idle)(q); 3402 break; 3403 } 3404 3405 /* 3406 * Only mark the connection as dead if it is 3407 * connected and idle. 3408 * An unconnected connection has probably 3409 * gone idle because the server is down, 3410 * and when it comes back up there will be 3411 * retries that need to use that connection. 
3412 */ 3413 if (cm_entry->x_connected || 3414 cm_entry->x_doomed) { 3415 if (cm_entry->x_ordrel) { 3416 if (cm_entry->x_closing == 3417 TRUE) { 3418 /* 3419 * The connection is 3420 * obviously wedged due 3421 * to a bug or problem 3422 * with the transport. 3423 * Mark it as dead. 3424 * Otherwise we can 3425 * leak connections. 3426 */ 3427 cm_entry->x_dead = TRUE; 3428 mutex_exit( 3429 &connmgr_lock); 3430 have_connmgr_lock = 0; 3431 if (clnt_stop_idle != 3432 NULL) 3433 (*clnt_stop_idle)(q); 3434 break; 3435 } 3436 cm_entry->x_closing = TRUE; 3437 connmgr_sndrel(cm_entry); 3438 have_connmgr_lock = 0; 3439 } else { 3440 cm_entry->x_dead = TRUE; 3441 mutex_exit(&connmgr_lock); 3442 have_connmgr_lock = 0; 3443 if (clnt_stop_idle != NULL) 3444 (*clnt_stop_idle)(q); 3445 } 3446 } else { 3447 /* 3448 * We don't mark the connection 3449 * as dead, but we turn off the 3450 * idle timer. 3451 */ 3452 mutex_exit(&connmgr_lock); 3453 have_connmgr_lock = 0; 3454 if (clnt_stop_idle != NULL) 3455 (*clnt_stop_idle)(q); 3456 RPCLOG(1, "clnt_dispatch_notifyall:" 3457 " ignoring timeout from rpcmod" 3458 " (q %p) because we are not " 3459 " connected\n", (void *)q); 3460 } 3461 break; 3462 case T_ORDREL_IND: 3463 /* 3464 * If this entry is marked closing, then we are 3465 * completing a close handshake, and the 3466 * connection is dead. Otherwise, the server is 3467 * trying to close. Since the server will not 3468 * be sending any more RPC replies, we abort 3469 * the connection, including flushing 3470 * any RPC requests that are in-transit. 3471 * In either case, mark the entry as dead so 3472 * that it can be closed by the connection 3473 * manager's garbage collector. 
3474 */ 3475 cm_entry->x_dead = TRUE; 3476 if (cm_entry->x_closing) { 3477 mutex_exit(&connmgr_lock); 3478 have_connmgr_lock = 0; 3479 if (clnt_stop_idle != NULL) 3480 (*clnt_stop_idle)(q); 3481 } else { 3482 /* 3483 * if we're getting a disconnect 3484 * before we've finished our 3485 * connect attempt, mark it for 3486 * later processing 3487 */ 3488 if (cm_entry->x_thread) 3489 cm_entry->x_early_disc = TRUE; 3490 else 3491 cm_entry->x_connected = FALSE; 3492 cm_entry->x_waitdis = TRUE; 3493 connmgr_snddis(cm_entry); 3494 have_connmgr_lock = 0; 3495 } 3496 break; 3497 3498 case T_ERROR_ACK: 3499 case T_OK_ACK: 3500 cm_entry->x_waitdis = FALSE; 3501 cv_signal(&cm_entry->x_dis_cv); 3502 mutex_exit(&connmgr_lock); 3503 return; 3504 3505 case T_DISCON_REQ: 3506 if (cm_entry->x_thread) 3507 cm_entry->x_early_disc = TRUE; 3508 else 3509 cm_entry->x_connected = FALSE; 3510 cm_entry->x_waitdis = TRUE; 3511 3512 connmgr_snddis(cm_entry); 3513 have_connmgr_lock = 0; 3514 break; 3515 3516 case T_DISCON_IND: 3517 default: 3518 /* 3519 * if we're getting a disconnect before 3520 * we've finished our connect attempt, 3521 * mark it for later processing 3522 */ 3523 if (cm_entry->x_closing) { 3524 cm_entry->x_dead = TRUE; 3525 mutex_exit(&connmgr_lock); 3526 have_connmgr_lock = 0; 3527 if (clnt_stop_idle != NULL) 3528 (*clnt_stop_idle)(q); 3529 } else { 3530 if (cm_entry->x_thread) { 3531 cm_entry->x_early_disc = TRUE; 3532 } else { 3533 cm_entry->x_dead = TRUE; 3534 cm_entry->x_connected = FALSE; 3535 } 3536 } 3537 break; 3538 } 3539 break; 3540 } 3541 } 3542 3543 if (have_connmgr_lock) 3544 mutex_exit(&connmgr_lock); 3545 3546 if (msg_type == T_ERROR_ACK || msg_type == T_OK_ACK) { 3547 RPCLOG(1, "clnt_dispatch_notifyall: (wq %p) could not find " 3548 "connmgr entry for discon ack\n", (void *)q); 3549 return; 3550 } 3551 3552 /* 3553 * Then kick all the clnt_pending calls out of their wait. 
There 3554 * should be no clnt_pending calls in the case of rpcmod's idle 3555 * timer firing. 3556 */ 3557 for (i = 0; i < clnt_cots_hash_size; i++) { 3558 ctp = &cots_call_ht[i]; 3559 mutex_enter(&ctp->ct_lock); 3560 for (e = ctp->ct_call_next; 3561 e != (calllist_t *)ctp; 3562 e = e->call_next) { 3563 if (e->call_wq == q && e->call_notified == FALSE) { 3564 RPCLOG(1, 3565 "clnt_dispatch_notifyall for queue %p ", 3566 (void *)q); 3567 RPCLOG(1, "aborting clnt_pending call %p\n", 3568 (void *)e); 3569 3570 if (msg_type == T_DISCON_IND) 3571 e->call_reason = reason; 3572 e->call_notified = TRUE; 3573 e->call_status = RPC_XPRTFAILED; 3574 cv_signal(&e->call_cv); 3575 } 3576 } 3577 mutex_exit(&ctp->ct_lock); 3578 } 3579 3580 mutex_enter(&clnt_pending_lock); 3581 for (e = clnt_pending; e; e = e->call_next) { 3582 /* 3583 * Only signal those RPC handles that haven't been 3584 * signalled yet. Otherwise we can get a bogus call_reason. 3585 * This can happen if thread A is making a call over a 3586 * connection. If the server is killed, it will cause 3587 * reset, and reason will default to EIO as a result of 3588 * a T_ORDREL_IND. Thread B then attempts to recreate 3589 * the connection but gets a T_DISCON_IND. If we set the 3590 * call_reason code for all threads, then if thread A 3591 * hasn't been dispatched yet, it will get the wrong 3592 * reason. The bogus call_reason can make it harder to 3593 * discriminate between calls that fail because the 3594 * connection attempt failed versus those where the call 3595 * may have been executed on the server. 3596 */ 3597 if (e->call_wq == q && e->call_notified == FALSE) { 3598 RPCLOG(1, "clnt_dispatch_notifyall for queue %p ", 3599 (void *)q); 3600 RPCLOG(1, " aborting clnt_pending call %p\n", 3601 (void *)e); 3602 3603 if (msg_type == T_DISCON_IND) 3604 e->call_reason = reason; 3605 e->call_notified = TRUE; 3606 /* 3607 * Let the caller timeout, else he will retry 3608 * immediately. 
3609 */ 3610 e->call_status = RPC_XPRTFAILED; 3611 3612 /* 3613 * We used to just signal those threads 3614 * waiting for a connection, (call_xid = 0). 3615 * That meant that threads waiting for a response 3616 * waited till their timeout expired. This 3617 * could be a long time if they've specified a 3618 * maximum timeout. (2^31 - 1). So we 3619 * Signal all threads now. 3620 */ 3621 cv_signal(&e->call_cv); 3622 } 3623 } 3624 mutex_exit(&clnt_pending_lock); 3625 } 3626 3627 3628 /*ARGSUSED*/ 3629 /* 3630 * after resuming a system that's been suspended for longer than the 3631 * NFS server's idle timeout (svc_idle_timeout for Solaris 2), rfscall() 3632 * generates "NFS server X not responding" and "NFS server X ok" messages; 3633 * here we reset inet connections to cause a re-connect and avoid those 3634 * NFS messages. see 4045054 3635 */ 3636 boolean_t 3637 connmgr_cpr_reset(void *arg, int code) 3638 { 3639 struct cm_xprt *cxp; 3640 3641 if (code == CB_CODE_CPR_CHKPT) 3642 return (B_TRUE); 3643 3644 if (mutex_tryenter(&connmgr_lock) == 0) 3645 return (B_FALSE); 3646 for (cxp = cm_hd; cxp; cxp = cxp->x_next) { 3647 if ((cxp->x_family == AF_INET || cxp->x_family == AF_INET6) && 3648 cxp->x_connected == TRUE) { 3649 if (cxp->x_thread) 3650 cxp->x_early_disc = TRUE; 3651 else 3652 cxp->x_connected = FALSE; 3653 cxp->x_needdis = TRUE; 3654 } 3655 } 3656 mutex_exit(&connmgr_lock); 3657 return (B_TRUE); 3658 } 3659 3660 void 3661 clnt_cots_stats_init(zoneid_t zoneid, struct rpc_cots_client **statsp) 3662 { 3663 3664 *statsp = (struct rpc_cots_client *)rpcstat_zone_init_common(zoneid, 3665 "unix", "rpc_cots_client", (const kstat_named_t *)&cots_rcstat_tmpl, 3666 sizeof (cots_rcstat_tmpl)); 3667 } 3668 3669 void 3670 clnt_cots_stats_fini(zoneid_t zoneid, struct rpc_cots_client **statsp) 3671 { 3672 rpcstat_zone_fini_common(zoneid, "unix", "rpc_cots_client"); 3673 kmem_free(*statsp, sizeof (cots_rcstat_tmpl)); 3674 } 3675 3676 void 3677 clnt_cots_init(void) 3678 { 3679 
mutex_init(&connmgr_lock, NULL, MUTEX_DEFAULT, NULL); 3680 mutex_init(&clnt_pending_lock, NULL, MUTEX_DEFAULT, NULL); 3681 3682 if (clnt_cots_hash_size < DEFAULT_MIN_HASH_SIZE) 3683 clnt_cots_hash_size = DEFAULT_MIN_HASH_SIZE; 3684 3685 cots_call_ht = call_table_init(clnt_cots_hash_size); 3686 zone_key_create(&zone_cots_key, NULL, NULL, clnt_zone_destroy); 3687 } 3688 3689 void 3690 clnt_cots_fini(void) 3691 { 3692 (void) zone_key_delete(zone_cots_key); 3693 } 3694 3695 /* 3696 * Wait for TPI ack, returns success only if expected ack is received 3697 * within timeout period. 3698 */ 3699 3700 static int 3701 waitforack(calllist_t *e, t_scalar_t ack_prim, const struct timeval *waitp, 3702 bool_t nosignal) 3703 { 3704 union T_primitives *tpr; 3705 clock_t timout; 3706 int cv_stat = 1; 3707 3708 ASSERT(MUTEX_HELD(&clnt_pending_lock)); 3709 while (e->call_reply == NULL) { 3710 if (waitp != NULL) { 3711 timout = waitp->tv_sec * drv_usectohz(MICROSEC) + 3712 drv_usectohz(waitp->tv_usec); 3713 if (nosignal) 3714 cv_stat = cv_reltimedwait(&e->call_cv, 3715 &clnt_pending_lock, timout, TR_CLOCK_TICK); 3716 else 3717 cv_stat = cv_reltimedwait_sig(&e->call_cv, 3718 &clnt_pending_lock, timout, TR_CLOCK_TICK); 3719 } else { 3720 if (nosignal) 3721 cv_wait(&e->call_cv, &clnt_pending_lock); 3722 else 3723 cv_stat = cv_wait_sig(&e->call_cv, 3724 &clnt_pending_lock); 3725 } 3726 if (cv_stat == -1) 3727 return (ETIME); 3728 if (cv_stat == 0) 3729 return (EINTR); 3730 /* 3731 * if we received an error from the server and we know a reply 3732 * is not going to be sent, do not wait for the full timeout, 3733 * return now. 
3734 */ 3735 if (e->call_status == RPC_XPRTFAILED) 3736 return (e->call_reason); 3737 } 3738 tpr = (union T_primitives *)e->call_reply->b_rptr; 3739 if (tpr->type == ack_prim) 3740 return (0); /* Success */ 3741 3742 if (tpr->type == T_ERROR_ACK) { 3743 if (tpr->error_ack.TLI_error == TSYSERR) 3744 return (tpr->error_ack.UNIX_error); 3745 else 3746 return (t_tlitosyserr(tpr->error_ack.TLI_error)); 3747 } 3748 3749 return (EPROTO); /* unknown or unexpected primitive */ 3750 } 3751