1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2016 Nexenta Systems, Inc. All rights reserved. 24 * Copyright (c) 2016 by Delphix. All rights reserved. 25 * Copyright 2019 Joyent, Inc. 26 */ 27 28 /* 29 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 30 * Use is subject to license terms. 31 */ 32 33 /* 34 * Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T 35 * All Rights Reserved 36 */ 37 38 /* 39 * Portions of this source code were derived from Berkeley 4.3 BSD 40 * under license from the Regents of the University of California. 41 */ 42 43 44 /* 45 * Implements a kernel based, client side RPC over Connection Oriented 46 * Transports (COTS). 47 */ 48 49 /* 50 * Much of this file has been re-written to let NFS work better over slow 51 * transports. A description follows. 52 * 53 * One of the annoying things about kRPC/COTS is that it will temporarily 54 * create more than one connection between a client and server. This 55 * happens because when a connection is made, the end-points entry in the 56 * linked list of connections (headed by cm_hd), is removed so that other 57 * threads don't mess with it. 
Went ahead and bit the bullet by keeping 58 * the endpoint on the connection list and introducing state bits, 59 * condition variables etc. to the connection entry data structure (struct 60 * cm_xprt). 61 * 62 * Here is a summary of the changes to cm_xprt: 63 * 64 * x_ctime is the timestamp of when the endpoint was last 65 * connected or disconnected. If an end-point is ever disconnected 66 * or re-connected, then any outstanding RPC request is presumed 67 * lost, telling clnt_cots_kcallit that it needs to re-send the 68 * request, not just wait for the original request's reply to 69 * arrive. 70 * 71 * x_thread flag which tells us if a thread is doing a connection attempt. 72 * 73 * x_waitdis flag which tells us we are waiting for a disconnect ACK. 74 * 75 * x_needdis flag which tells us we need to send a T_DISCON_REQ 76 * to kill the connection. 77 * 78 * x_needrel flag which tells us we need to send a T_ORDREL_REQ to 79 * gracefully close the connection. 80 * 81 * #defined bitmasks for all the b_* bits so that more 82 * efficient (and at times less clumsy) masks can be used to 83 * manipulate state in cases where multiple bits have to be 84 * set/cleared/checked in the same critical section. 85 * 86 * x_conn_cv and x_dis_cv are new condition variables to let 87 * threads know when the connection attempt is done, and to let 88 * the connecting thread know when the disconnect handshake is 89 * done. 90 * 91 * Added the CONN_HOLD() macro so that all reference holds have the same 92 * look and feel. 93 * 94 * In the private (cku_private) portion of the client handle, 95 * 96 * cku_flags replaces the cku_sent boolean. cku_flags keeps 97 * track of whether a request has been sent, and whether the 98 * client handle's call record is on the dispatch list (so that 99 * the reply can be matched by XID to the right client handle). 
100 * The idea of CKU_ONQUEUE is that we can exit clnt_cots_kcallit() 101 * and still have the response find the right client handle so 102 * that the retry of CLNT_CALL() gets the result. Testing found 103 * situations where if the timeout was increased, performance 104 * degraded. This was due to us hitting a window where the thread 105 * was back in rfscall() (probably printing server not responding) 106 * while the response came back but there was no place to put it. 107 * 108 * cku_ctime is just a cache of x_ctime. If they match, 109 * clnt_cots_kcallit() won't send a retry (unless the maximum 110 * receive count limit has been reached). If they don't match, then 111 * we assume the request has been lost, and a retry of the request 112 * is needed. 113 * 114 * cku_recv_attempts counts the number of receive attempts 115 * after one try is sent on the wire. 116 * 117 * Added the clnt_delay() routine so that interruptible and 118 * noninterruptible delays are possible. 119 * 120 * CLNT_MIN_TIMEOUT has been bumped to 10 seconds from 3. This is used to 121 * control how long the client delays before returning after getting 122 * ECONNREFUSED. At 3 seconds, 8 client threads per mount really does bash 123 * a server that may be booting and not yet started nfsd. 124 * 125 * CLNT_MAXRECV_WITHOUT_RETRY is a new macro (value of 3) (with a tunable). 126 * Why don't we just wait forever (receive an infinite # of times)? 127 * Because the server may have rebooted. More insidious is that some 128 * servers (ours) will drop NFS/TCP requests in some cases. This is bad, 129 * but it is a reality. 130 * 131 * The case of a server doing orderly release really messes up the 132 * client's recovery, especially if the server's TCP implementation is 133 * buggy. 
It was found that the kRPC/COTS client was breaking some 134 * TPI rules, such as not waiting for the acknowledgement of a 135 * T_DISCON_REQ (hence the added case statements T_ERROR_ACK, T_OK_ACK and 136 * T_DISCON_REQ in clnt_dispatch_notifyall()). 137 * 138 * One of the things that we've seen is that a kRPC TCP endpoint goes into 139 * TIMEWAIT and thus a reconnect takes a long time to satisfy because 140 * the TIMEWAIT state takes a while to finish. If a server sends a 141 * T_ORDREL_IND, there is little point in an RPC client doing a 142 * T_ORDREL_REQ, because the RPC request isn't going to make it (the 143 * server is saying that it won't accept any more data). So kRPC was 144 * changed to send a T_DISCON_REQ when we get a T_ORDREL_IND. So now the 145 * connection skips the TIMEWAIT state and goes straight to a bound state 146 * that kRPC can quickly switch to connected. 147 * 148 * Code that issues TPI requests must use waitforack() to wait for the 149 * corresponding ack (assuming there is one) in any future modifications. 150 * This works around problems that may be introduced by breaking TPI rules 151 * (by submitting new calls before earlier requests have been acked) in the 152 * case of a signal or other early return. waitforack() depends on 153 * clnt_dispatch_notifyconn() to issue the wakeup when the ack 154 * arrives, so adding new TPI calls may require corresponding changes 155 * to clnt_dispatch_notifyconn(). Presently, the timeout period is based on 156 * CLNT_MIN_TIMEOUT which is 10 seconds. If you modify this value, be sure 157 * not to set it too low or TPI ACKS will be lost. 
 */

#include <sys/param.h>
#include <sys/types.h>
#include <sys/user.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/file.h>
#include <sys/stream.h>
#include <sys/strsubr.h>
#include <sys/stropts.h>
#include <sys/strsun.h>
#include <sys/timod.h>
#include <sys/tiuser.h>
#include <sys/tihdr.h>
#include <sys/t_kuser.h>
#include <sys/fcntl.h>
#include <sys/errno.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/systm.h>
#include <sys/kstat.h>
#include <sys/t_lock.h>
#include <sys/ddi.h>
#include <sys/cmn_err.h>
#include <sys/time.h>
#include <sys/isa_defs.h>
#include <sys/callb.h>
#include <sys/sunddi.h>
#include <sys/atomic.h>
#include <sys/sdt.h>

#include <netinet/in.h>
#include <netinet/tcp.h>

#include <rpc/types.h>
#include <rpc/xdr.h>
#include <rpc/auth.h>
#include <rpc/clnt.h>
#include <rpc/rpc_msg.h>

/*
 * Value stored in cku_outbuflen ("default output mblk length") when a
 * client handle is created by clnt_cots_kcreate().
 */
#define	COTS_DEFAULT_ALLOCSIZE	2048

/*
 * WIRE_HDR_SIZE is the length handed to xdrmem_create() when the call
 * header is pre-serialized into cku_rpchdr; that buffer is declared four
 * bytes larger so the procedure number can be placed right after the header.
 */
#define	WIRE_HDR_SIZE	20	/* serialized call header, sans proc number */
#define	MSG_OFFSET	128	/* offset of call into the mblk */

/* Declared here directly; the definition is not in this part of the file. */
const char *kinet_ntop6(uchar_t *, char *, size_t);

/*
 * Forward declarations of the client handle operations.  All of them are
 * installed in the tcp_ops operations vector further down.
 */
static int	clnt_cots_ksettimers(CLIENT *, struct rpc_timers *,
    struct rpc_timers *, int, void(*)(int, int, caddr_t), caddr_t, uint32_t);
static enum clnt_stat	clnt_cots_kcallit(CLIENT *, rpcproc_t, xdrproc_t,
    caddr_t, xdrproc_t, caddr_t, struct timeval);
static void	clnt_cots_kabort(CLIENT *);
static void	clnt_cots_kerror(CLIENT *, struct rpc_err *);
static bool_t	clnt_cots_kfreeres(CLIENT *, xdrproc_t, caddr_t);
static void	clnt_cots_kdestroy(CLIENT *);
static bool_t	clnt_cots_kcontrol(CLIENT *, int, char *);


/* List of transports managed by the connection manager.
 */
/*
 * One entry per transport endpoint.  Entries are chained through x_next on
 * the list headed by cm_hd.  The connection state is kept in x_state, which
 * can be accessed either one flag at a time (the x_* bit-field accessors)
 * or as a whole word (x_state_flags together with the X_* masks).
 */
struct cm_xprt {
	TIUSER		*x_tiptr;	/* transport handle */
	queue_t		*x_wq;		/* send queue */
	clock_t		x_time;		/* last time we handed this xprt out */
	clock_t		x_ctime;	/* time we went to CONNECTED */
	int		x_tidu_size;	/* TIDU size of this transport */
	union {
	    struct {
		unsigned int
		/*
		 * The two layouts below list the same flags in opposite
		 * order, one per bit-field ordering convention, so that a
		 * given flag has the same X_* mask value in either case.
		 */
#ifdef	_BIT_FIELDS_HTOL
		b_closing:	1,	/* we've sent a ord rel on this conn */
		b_dead:		1,	/* transport is closed or disconn */
		b_doomed:	1,	/* too many conns, let this go idle */
		b_connected:	1,	/* this connection is connected */

		b_ordrel:	1,	/* do an orderly release? */
		b_thread:	1,	/* thread doing connect */
		b_waitdis:	1,	/* waiting for disconnect ACK */
		b_needdis:	1,	/* need T_DISCON_REQ */

		b_needrel:	1,	/* need T_ORDREL_REQ */
		b_early_disc:	1,	/* got a T_ORDREL_IND or T_DISCON_IND */
					/* disconnect during connect */

		b_pad:		22;

#endif

#ifdef	_BIT_FIELDS_LTOH
		b_pad:		22,

		b_early_disc:	1,	/* got a T_ORDREL_IND or T_DISCON_IND */
					/* disconnect during connect */
		b_needrel:	1,	/* need T_ORDREL_REQ */

		b_needdis:	1,	/* need T_DISCON_REQ */
		b_waitdis:	1,	/* waiting for disconnect ACK */
		b_thread:	1,	/* thread doing connect */
		b_ordrel:	1,	/* do an orderly release? */

		b_connected:	1,	/* this connection is connected */
		b_doomed:	1,	/* too many conns, let this go idle */
		b_dead:		1,	/* transport is closed or disconn */
		b_closing:	1;	/* we've sent a ord rel on this conn */
#endif
	    } bit;	    unsigned int word;

/* Single-flag accessors. */
#define	x_closing	x_state.bit.b_closing
#define	x_dead		x_state.bit.b_dead
#define	x_doomed	x_state.bit.b_doomed
#define	x_connected	x_state.bit.b_connected

#define	x_ordrel	x_state.bit.b_ordrel
#define	x_thread	x_state.bit.b_thread
#define	x_waitdis	x_state.bit.b_waitdis
#define	x_needdis	x_state.bit.b_needdis

#define	x_needrel	x_state.bit.b_needrel
#define	x_early_disc	x_state.bit.b_early_disc

/* Whole-word view of the same flags, for use with the X_* masks below. */
#define	x_state_flags	x_state.word

/*
 * Mask equivalents of the b_* bits, in the same order, starting from the
 * most significant bit of the 32-bit word.
 */
#define	X_CLOSING	0x80000000
#define	X_DEAD		0x40000000
#define	X_DOOMED	0x20000000
#define	X_CONNECTED	0x10000000

#define	X_ORDREL	0x08000000
#define	X_THREAD	0x04000000
#define	X_WAITDIS	0x02000000
#define	X_NEEDDIS	0x01000000

#define	X_NEEDREL	0x00800000
#define	X_EARLYDISC	0x00400000

/* Any of closing, dead or doomed. */
#define	X_BADSTATES	(X_CLOSING | X_DEAD | X_DOOMED)

	}		x_state;
	int		x_ref;		/* number of users of this xprt */
	int		x_family;	/* address family of transport */
	dev_t		x_rdev;		/* device number of transport */
	struct cm_xprt	*x_next;

	struct netbuf	x_server;	/* destination address */
	struct netbuf	x_src;		/* src address (for retries) */
	kmutex_t	x_lock;		/* lock on this entry */
	kcondvar_t	x_cv;		/* to signal when can be closed */
	kcondvar_t	x_conn_cv;	/* to signal when connection attempt */
					/* is complete */
	kstat_t		*x_ksp;

	kcondvar_t	x_dis_cv;	/* to signal when disconnect attempt */
					/* is complete */
	zoneid_t	x_zoneid;	/* zone this xprt belongs to */
};

/*
 * Named kstat layout for one transport; the member names mirror the
 * struct cm_xprt fields.  cm_kstat_template below supplies the kstat name
 * and data type for each member, in the same order.
 */
typedef struct cm_kstat_xprt {
	kstat_named_t	x_wq;
	kstat_named_t	x_server;
	kstat_named_t	x_family;
	kstat_named_t	x_rdev;
	kstat_named_t	x_time;
	kstat_named_t	x_state;
	kstat_named_t	x_ref;
	kstat_named_t	x_port;
} cm_kstat_xprt_t;

static cm_kstat_xprt_t cm_kstat_template = {
	{ "write_queue", KSTAT_DATA_UINT32 },
	{ "server", KSTAT_DATA_STRING },
	{ "addr_family", KSTAT_DATA_UINT32 },
	{ "device", KSTAT_DATA_UINT32 },
	{ "time_stamp", KSTAT_DATA_UINT32 },
	{ "status", KSTAT_DATA_UINT32 },
	{ "ref_count", KSTAT_DATA_INT32 },
	{ "port", KSTAT_DATA_UINT32 },
};

/*
 * Take a reference on a connection entry: bump x_ref while holding the
 * entry's x_lock.  The expansion is a bare brace block (not do/while), so
 * take care when using it as the body of an unbraced if/else.
 *
 * The inverse of this is connmgr_release().
 */
#define	CONN_HOLD(Cm_entry)	{\
	mutex_enter(&(Cm_entry)->x_lock);	\
	(Cm_entry)->x_ref++;	\
	mutex_exit(&(Cm_entry)->x_lock);	\
}


/*
 * Private data per rpc handle.  This structure is allocated by
 * clnt_cots_kcreate, and freed by clnt_cots_kdestroy.
 *
 * The CLIENT handle is embedded as the first member; ptoh()/htop() convert
 * between the handle and this structure.
 */
typedef struct cku_private_s {
	CLIENT			cku_client;	/* client handle */
	calllist_t		cku_call;	/* for dispatching calls */
	struct rpc_err		cku_err;	/* error status */

	struct netbuf		cku_srcaddr;	/* source address for retries */
	int			cku_addrfmly;	/* for binding port */
	struct netbuf		cku_addr;	/* remote address */
	dev_t			cku_device;	/* device to use */
	uint_t			cku_flags;	/* CKU_* bits below */
#define	CKU_ONQUEUE		0x1	/* call record is on dispatch list */
#define	CKU_SENT		0x2	/* a request has been sent */

	bool_t			cku_progress;	/* for CLSET_PROGRESS */
	uint32_t		cku_xid;	/* current XID */
	clock_t			cku_ctime;	/* time stamp of when */
						/* connection was created */
						/* (cached copy of x_ctime) */
	uint_t			cku_recv_attempts;
						/* receive attempts made */
						/* without re-sending */
	XDR			cku_outxdr;	/* XDR stream for output */
	XDR			cku_inxdr;	/* XDR stream for input */
	char			cku_rpchdr[WIRE_HDR_SIZE + 4];
						/* pre-serialized rpc header */

	uint_t			cku_outbuflen;	/* default output mblk length */
	struct cred		*cku_cred;	/* credentials */
	bool_t			cku_nodelayonerr;
						/* for CLSET_NODELAYONERR */
	int			cku_useresvport;	/* Use reserved port */
						/* -1 until set explicitly */
	struct rpc_cots_client	*cku_stats;	/* stats for zone */
} cku_private_t;

/*
 * Forward declarations for the connection manager (connmgr_*) and for the
 * dispatch/wait helpers used by the client operations.
 */
static struct cm_xprt	*connmgr_wrapconnect(struct cm_xprt *,
	const struct timeval *, struct netbuf *, int, struct netbuf *,
	struct rpc_err *, bool_t, bool_t, cred_t *);

static bool_t	connmgr_connect(struct cm_xprt *, queue_t *, struct netbuf *,
		int, calllist_t *, int *, bool_t reconnect,
		const struct timeval *, bool_t, cred_t *);

static void	*connmgr_opt_getoff(mblk_t *mp, t_uscalar_t offset,
		t_uscalar_t length, uint_t align_size);
static bool_t	connmgr_setbufsz(calllist_t *e, queue_t *wq, cred_t *cr);
static bool_t	connmgr_getopt_int(queue_t *wq, int level, int name, int *val,
		calllist_t *e, cred_t *cr);
static bool_t	connmgr_setopt_int(queue_t *wq, int level, int name, int val,
		calllist_t *e, cred_t *cr);
static bool_t	connmgr_setopt(queue_t *, int, int, calllist_t *, cred_t *cr);
static void	connmgr_sndrel(struct cm_xprt *);
static void	connmgr_snddis(struct cm_xprt *);
static void	connmgr_close(struct cm_xprt *);
static void	connmgr_release(struct cm_xprt *);
static struct cm_xprt	*connmgr_wrapget(struct netbuf *, const struct timeval *,
	cku_private_t *);

static struct cm_xprt	*connmgr_get(struct netbuf *, const struct timeval *,
	struct netbuf *, int, struct netbuf *, struct rpc_err *, dev_t,
	bool_t, int, cred_t *);

static void connmgr_cancelconn(struct cm_xprt *);
static enum clnt_stat connmgr_cwait(struct cm_xprt *, const struct timeval *,
	bool_t);
static void connmgr_dis_and_wait(struct cm_xprt *);

static int	clnt_dispatch_send(queue_t *, mblk_t *, calllist_t *, uint_t,
	uint_t);

static int clnt_delay(clock_t, bool_t);

static int waitforack(calllist_t *, t_scalar_t, const struct timeval *, bool_t);

/*
 * Operations vector for TCP/IP based RPC.  clnt_cots_kcreate() installs
 * it as cl_ops in every handle it creates.
 */
static struct clnt_ops tcp_ops = {
	clnt_cots_kcallit,	/* do rpc call */
	clnt_cots_kabort,	/* abort call */
	clnt_cots_kerror,	/* return error status */
	clnt_cots_kfreeres,	/* free results */
	clnt_cots_kdestroy,	/* destroy rpc handle */
	clnt_cots_kcontrol,	/* the ioctl() of rpc */
	clnt_cots_ksettimers,	/* set retry timers */
};

static int rpc_kstat_instance = 0;  /* keeps the current instance */
				/* number for the next kstat_create */

/* Head of the connection manager's transport list; see connmgr_lock. */
static struct cm_xprt *cm_hd = NULL;
static kmutex_t connmgr_lock;   /* for connection mngr's list of transports */

/* Taken by clnt_cots_kcreate() while raising *clnt_max_msg_sizep. */
extern kmutex_t clnt_max_msg_lock;

static calllist_t *clnt_pending = NULL;
extern kmutex_t clnt_pending_lock;

static int clnt_cots_hash_size = DEFAULT_HASH_SIZE;

static call_table_t *cots_call_ht;

/*
 * Layout and template (kstat names and data types) of the client-side
 * COTS RPC counters.  A handle reaches its zone's copy through cku_stats.
 */
static const struct rpc_cots_client {
	kstat_named_t	rccalls;
	kstat_named_t	rcbadcalls;
	kstat_named_t	rcbadxids;
	kstat_named_t	rctimeouts;
	kstat_named_t	rcnewcreds;
	kstat_named_t	rcbadverfs;
	kstat_named_t	rctimers;
	kstat_named_t	rccantconn;
	kstat_named_t	rcnomem;
	kstat_named_t	rcintrs;
} cots_rcstat_tmpl = {
	{ "calls",	KSTAT_DATA_UINT64 },
	{ "badcalls",	KSTAT_DATA_UINT64 },
	{ "badxids",	KSTAT_DATA_UINT64 },
	{ "timeouts",	KSTAT_DATA_UINT64 },
	{ "newcreds",	KSTAT_DATA_UINT64 },
	{ "badverfs",	KSTAT_DATA_UINT64 },
	{ "timers",	KSTAT_DATA_UINT64 },
	{ "cantconn",	KSTAT_DATA_UINT64 },
	{ "nomem",	KSTAT_DATA_UINT64 },
	{ "interrupts", KSTAT_DATA_UINT64 }
};

/* Atomically increment counter x in the rpc_cots_client pointed to by p. */
#define	COTSRCSTAT_INCR(p, x)	\
	atomic_inc_64(&(p)->x.value.ui64)

#define	CLNT_MAX_CONNS	1	/* concurrent connections between clnt/srvr */
int clnt_max_conns = CLNT_MAX_CONNS;

#define	CLNT_MIN_TIMEOUT	10	/* seconds to wait after we get a */
					/* connection reset */
#define	CLNT_MIN_CONNTIMEOUT	5	/* seconds to wait for a connection */


/*
 * Tunable copies of the two limits above.  clnt_cots_kcallit() uses
 * clnt_cots_min_conntout as the floor for its connection wait and
 * clnt_cots_min_tout to size the delay after a failed connection.
 */
int clnt_cots_min_tout = CLNT_MIN_TIMEOUT;
int clnt_cots_min_conntout = CLNT_MIN_CONNTIMEOUT;

/*
 * Limit the number of times we will attempt to receive a reply without
 * re-sending the request.
 */
#define	CLNT_MAXRECV_WITHOUT_RETRY	3
uint_t clnt_cots_maxrecv	= CLNT_MAXRECV_WITHOUT_RETRY;

/*
 * Pointer to the message-size sanity limit ("sanity check size in rpcmod");
 * may be NULL, in which case clnt_cots_kcreate() leaves it alone.
 */
uint_t *clnt_max_msg_sizep;
/* NOTE(review): hook taking a write queue; not called in the visible code. */
void (*clnt_stop_idle)(queue_t *wq);

/* Convert between the private structure and the CLIENT handle inside it. */
#define	ptoh(p)		(&((p)->cku_client))
#define	htop(h)		((cku_private_t *)((h)->cl_private))

/*
 * Times to retry
 */
#define	REFRESHES	2	/* authentication refreshes */

/*
 * The following is used to determine the global default behavior for
 * COTS when binding to a local port.
 *
 * If the value is set to 1 the default will be to select a reserved
 * (aka privileged) port, if the value is zero the default will be to
 * use non-reserved ports.  Users of kRPC may override this by using
 * CLNT_CONTROL() and CLSET_BINDRESVPORT.
 */
int clnt_cots_do_bindresvport = 1;

static zone_key_t zone_cots_key;

/*
 * Default TCP send and receive buffer sizes for RPC connections.
 * These values can be tuned by /etc/system.
 */
int rpc_send_bufsz = 1024*1024;
int rpc_recv_bufsz = 1024*1024;
/*
 * To use system-wide default for TCP send and receive buffer size,
 * use /etc/system to set rpc_default_tcp_bufsz to 1:
 *
 * set rpcmod:rpc_default_tcp_bufsz=1
 */
int rpc_default_tcp_bufsz = 0;

/*
 * We need to do this after all kernel threads in the zone have exited.
535 */ 536 /* ARGSUSED */ 537 static void 538 clnt_zone_destroy(zoneid_t zoneid, void *unused) 539 { 540 struct cm_xprt **cmp; 541 struct cm_xprt *cm_entry; 542 struct cm_xprt *freelist = NULL; 543 544 mutex_enter(&connmgr_lock); 545 cmp = &cm_hd; 546 while ((cm_entry = *cmp) != NULL) { 547 if (cm_entry->x_zoneid == zoneid) { 548 *cmp = cm_entry->x_next; 549 cm_entry->x_next = freelist; 550 freelist = cm_entry; 551 } else { 552 cmp = &cm_entry->x_next; 553 } 554 } 555 mutex_exit(&connmgr_lock); 556 while ((cm_entry = freelist) != NULL) { 557 freelist = cm_entry->x_next; 558 connmgr_close(cm_entry); 559 } 560 } 561 562 int 563 clnt_cots_kcreate(dev_t dev, struct netbuf *addr, int family, rpcprog_t prog, 564 rpcvers_t vers, uint_t max_msgsize, cred_t *cred, CLIENT **ncl) 565 { 566 CLIENT *h; 567 cku_private_t *p; 568 struct rpc_msg call_msg; 569 struct rpcstat *rpcstat; 570 571 RPCLOG(8, "clnt_cots_kcreate: prog %u\n", prog); 572 573 rpcstat = zone_getspecific(rpcstat_zone_key, rpc_zone()); 574 ASSERT(rpcstat != NULL); 575 576 /* Allocate and intialize the client handle. */ 577 p = kmem_zalloc(sizeof (*p), KM_SLEEP); 578 579 h = ptoh(p); 580 581 h->cl_private = (caddr_t)p; 582 h->cl_auth = authkern_create(); 583 h->cl_ops = &tcp_ops; 584 585 cv_init(&p->cku_call.call_cv, NULL, CV_DEFAULT, NULL); 586 mutex_init(&p->cku_call.call_lock, NULL, MUTEX_DEFAULT, NULL); 587 588 /* 589 * If the current sanity check size in rpcmod is smaller 590 * than the size needed, then increase the sanity check. 
591 */ 592 if (max_msgsize != 0 && clnt_max_msg_sizep != NULL && 593 max_msgsize > *clnt_max_msg_sizep) { 594 mutex_enter(&clnt_max_msg_lock); 595 if (max_msgsize > *clnt_max_msg_sizep) 596 *clnt_max_msg_sizep = max_msgsize; 597 mutex_exit(&clnt_max_msg_lock); 598 } 599 600 p->cku_outbuflen = COTS_DEFAULT_ALLOCSIZE; 601 602 /* Preserialize the call message header */ 603 604 call_msg.rm_xid = 0; 605 call_msg.rm_direction = CALL; 606 call_msg.rm_call.cb_rpcvers = RPC_MSG_VERSION; 607 call_msg.rm_call.cb_prog = prog; 608 call_msg.rm_call.cb_vers = vers; 609 610 xdrmem_create(&p->cku_outxdr, p->cku_rpchdr, WIRE_HDR_SIZE, XDR_ENCODE); 611 612 if (!xdr_callhdr(&p->cku_outxdr, &call_msg)) { 613 XDR_DESTROY(&p->cku_outxdr); 614 RPCLOG0(1, "clnt_cots_kcreate - Fatal header serialization " 615 "error\n"); 616 auth_destroy(h->cl_auth); 617 kmem_free(p, sizeof (cku_private_t)); 618 RPCLOG0(1, "clnt_cots_kcreate: create failed error EINVAL\n"); 619 return (EINVAL); /* XXX */ 620 } 621 XDR_DESTROY(&p->cku_outxdr); 622 623 /* 624 * The zalloc initialized the fields below. 625 * p->cku_xid = 0; 626 * p->cku_flags = 0; 627 * p->cku_srcaddr.buf = NULL; 628 * p->cku_srcaddr.len = 0; 629 * p->cku_srcaddr.maxlen = 0; 630 */ 631 632 p->cku_cred = cred; 633 p->cku_device = dev; 634 p->cku_addrfmly = family; 635 p->cku_addr.buf = kmem_zalloc(addr->maxlen, KM_SLEEP); 636 p->cku_addr.maxlen = addr->maxlen; 637 p->cku_addr.len = addr->len; 638 bcopy(addr->buf, p->cku_addr.buf, addr->len); 639 p->cku_stats = rpcstat->rpc_cots_client; 640 p->cku_useresvport = -1; /* value is has not been set */ 641 642 *ncl = h; 643 return (0); 644 } 645 646 /*ARGSUSED*/ 647 static void 648 clnt_cots_kabort(CLIENT *h) 649 { 650 } 651 652 /* 653 * Return error info on this handle. 
654 */ 655 static void 656 clnt_cots_kerror(CLIENT *h, struct rpc_err *err) 657 { 658 /* LINTED pointer alignment */ 659 cku_private_t *p = htop(h); 660 661 *err = p->cku_err; 662 } 663 664 /*ARGSUSED*/ 665 static bool_t 666 clnt_cots_kfreeres(CLIENT *h, xdrproc_t xdr_res, caddr_t res_ptr) 667 { 668 xdr_free(xdr_res, res_ptr); 669 670 return (TRUE); 671 } 672 673 static bool_t 674 clnt_cots_kcontrol(CLIENT *h, int cmd, char *arg) 675 { 676 cku_private_t *p = htop(h); 677 678 switch (cmd) { 679 case CLSET_PROGRESS: 680 p->cku_progress = TRUE; 681 return (TRUE); 682 683 case CLSET_XID: 684 if (arg == NULL) 685 return (FALSE); 686 687 p->cku_xid = *((uint32_t *)arg); 688 return (TRUE); 689 690 case CLGET_XID: 691 if (arg == NULL) 692 return (FALSE); 693 694 *((uint32_t *)arg) = p->cku_xid; 695 return (TRUE); 696 697 case CLSET_NODELAYONERR: 698 if (arg == NULL) 699 return (FALSE); 700 701 if (*((bool_t *)arg) == TRUE) { 702 p->cku_nodelayonerr = TRUE; 703 return (TRUE); 704 } 705 if (*((bool_t *)arg) == FALSE) { 706 p->cku_nodelayonerr = FALSE; 707 return (TRUE); 708 } 709 return (FALSE); 710 711 case CLGET_NODELAYONERR: 712 if (arg == NULL) 713 return (FALSE); 714 715 *((bool_t *)arg) = p->cku_nodelayonerr; 716 return (TRUE); 717 718 case CLSET_BINDRESVPORT: 719 if (arg == NULL) 720 return (FALSE); 721 722 if (*(int *)arg != 1 && *(int *)arg != 0) 723 return (FALSE); 724 725 p->cku_useresvport = *(int *)arg; 726 727 return (TRUE); 728 729 case CLGET_BINDRESVPORT: 730 if (arg == NULL) 731 return (FALSE); 732 733 *(int *)arg = p->cku_useresvport; 734 735 return (TRUE); 736 737 default: 738 return (FALSE); 739 } 740 } 741 742 /* 743 * Destroy rpc handle. Frees the space used for output buffer, 744 * private data, and handle structure. 
 */
static void
clnt_cots_kdestroy(CLIENT *h)
{
	/* LINTED pointer alignment */
	cku_private_t *p = htop(h);
	calllist_t *call = &p->cku_call;

	RPCLOG(8, "clnt_cots_kdestroy h: %p\n", (void *)h);
	RPCLOG(8, "clnt_cots_kdestroy h: xid=0x%x\n", p->cku_xid);

	/* Take the call record off the dispatch list if it is still there. */
	if (p->cku_flags & CKU_ONQUEUE) {
		RPCLOG(64, "clnt_cots_kdestroy h: removing call for xid 0x%x "
		    "from dispatch list\n", p->cku_xid);
		call_table_remove(call);
	}

	/* Drop a reply that was received but never consumed. */
	if (call->call_reply)
		freemsg(call->call_reply);
	cv_destroy(&call->call_cv);
	mutex_destroy(&call->call_lock);

	/*
	 * cku_srcaddr is still zeroed (NULL buffer, zero maxlen) if it was
	 * never filled in after clnt_cots_kcreate().
	 * NOTE(review): cl_auth, created in clnt_cots_kcreate(), is not
	 * released here; presumably the caller owns it -- confirm.
	 */
	kmem_free(p->cku_srcaddr.buf, p->cku_srcaddr.maxlen);
	kmem_free(p->cku_addr.buf, p->cku_addr.maxlen);
	kmem_free(p, sizeof (*p));
}

/* NOTE(review): not referenced in the visible part of the file. */
static int clnt_cots_pulls;
#define	RM_HDR_SIZE	4	/* record mark header size */

/*
 * Call remote procedure.
 */
static enum clnt_stat
clnt_cots_kcallit(CLIENT *h, rpcproc_t procnum, xdrproc_t xdr_args,
    caddr_t argsp, xdrproc_t xdr_results, caddr_t resultsp, struct timeval wait)
{
	/* LINTED pointer alignment */
	cku_private_t *p = htop(h);
	calllist_t *call = &p->cku_call;
	XDR *xdrs;
	struct rpc_msg reply_msg;
	mblk_t *mp;
#ifdef	RPCDEBUG
	clock_t time_sent;
#endif
	struct netbuf *retryaddr;
	struct cm_xprt *cm_entry = NULL;
	queue_t *wq;
	int len, waitsecs, max_waitsecs;
	int mpsize;
	int refreshes = REFRESHES;
	int interrupted;
	int tidu_size;
	enum clnt_stat status;
	struct timeval cwait;
	bool_t delay_first = FALSE;
	clock_t ticks, now;

	RPCLOG(2, "clnt_cots_kcallit, procnum %u\n", procnum);
	COTSRCSTAT_INCR(p->cku_stats, rccalls);

	RPCLOG(2, "clnt_cots_kcallit: wait.tv_sec: %ld\n", wait.tv_sec);
	RPCLOG(2, "clnt_cots_kcallit: wait.tv_usec: %ld\n", wait.tv_usec);
	/*
	 * Bug ID 1240234:
	 * Look out for zero length timeouts.
We don't want to 812 * wait zero seconds for a connection to be established. 813 */ 814 if (wait.tv_sec < clnt_cots_min_conntout) { 815 cwait.tv_sec = clnt_cots_min_conntout; 816 cwait.tv_usec = 0; 817 RPCLOG(8, "clnt_cots_kcallit: wait.tv_sec (%ld) too low,", 818 wait.tv_sec); 819 RPCLOG(8, " setting to: %d\n", clnt_cots_min_conntout); 820 } else { 821 cwait = wait; 822 } 823 824 call_again: 825 if (cm_entry) { 826 connmgr_release(cm_entry); 827 cm_entry = NULL; 828 } 829 830 mp = NULL; 831 832 /* 833 * If the call is not a retry, allocate a new xid and cache it 834 * for future retries. 835 * Bug ID 1246045: 836 * Treat call as a retry for purposes of binding the source 837 * port only if we actually attempted to send anything on 838 * the previous call. 839 */ 840 if (p->cku_xid == 0) { 841 p->cku_xid = alloc_xid(); 842 call->call_zoneid = rpc_zoneid(); 843 844 /* 845 * We need to ASSERT here that our xid != 0 because this 846 * determines whether or not our call record gets placed on 847 * the hash table or the linked list. By design, we mandate 848 * that RPC calls over cots must have xid's != 0, so we can 849 * ensure proper management of the hash table. 
850 */ 851 ASSERT(p->cku_xid != 0); 852 853 retryaddr = NULL; 854 p->cku_flags &= ~CKU_SENT; 855 856 if (p->cku_flags & CKU_ONQUEUE) { 857 RPCLOG(8, "clnt_cots_kcallit: new call, dequeuing old" 858 " one (%p)\n", (void *)call); 859 call_table_remove(call); 860 p->cku_flags &= ~CKU_ONQUEUE; 861 RPCLOG(64, "clnt_cots_kcallit: removing call from " 862 "dispatch list because xid was zero (now 0x%x)\n", 863 p->cku_xid); 864 } 865 866 if (call->call_reply != NULL) { 867 freemsg(call->call_reply); 868 call->call_reply = NULL; 869 } 870 } else if (p->cku_srcaddr.buf == NULL || p->cku_srcaddr.len == 0) { 871 retryaddr = NULL; 872 873 } else if (p->cku_flags & CKU_SENT) { 874 retryaddr = &p->cku_srcaddr; 875 876 } else { 877 /* 878 * Bug ID 1246045: Nothing was sent, so set retryaddr to 879 * NULL and let connmgr_get() bind to any source port it 880 * can get. 881 */ 882 retryaddr = NULL; 883 } 884 885 RPCLOG(64, "clnt_cots_kcallit: xid = 0x%x", p->cku_xid); 886 RPCLOG(64, " flags = 0x%x\n", p->cku_flags); 887 888 p->cku_err.re_status = RPC_TIMEDOUT; 889 p->cku_err.re_errno = p->cku_err.re_terrno = 0; 890 891 cm_entry = connmgr_wrapget(retryaddr, &cwait, p); 892 893 if (cm_entry == NULL) { 894 RPCLOG(1, "clnt_cots_kcallit: can't connect status %s\n", 895 clnt_sperrno(p->cku_err.re_status)); 896 897 /* 898 * The reasons why we fail to create a connection are 899 * varied. In most cases we don't want the caller to 900 * immediately retry. This could have one or more 901 * bad effects. This includes flooding the net with 902 * connect requests to ports with no listener; a hard 903 * kernel loop due to all the "reserved" TCP ports being 904 * in use. 905 */ 906 delay_first = TRUE; 907 908 /* 909 * Even if we end up returning EINTR, we still count a 910 * a "can't connect", because the connection manager 911 * might have been committed to waiting for or timing out on 912 * a connection. 
913 */ 914 COTSRCSTAT_INCR(p->cku_stats, rccantconn); 915 switch (p->cku_err.re_status) { 916 case RPC_INTR: 917 p->cku_err.re_errno = EINTR; 918 919 /* 920 * No need to delay because a UNIX signal(2) 921 * interrupted us. The caller likely won't 922 * retry the CLNT_CALL() and even if it does, 923 * we assume the caller knows what it is doing. 924 */ 925 delay_first = FALSE; 926 break; 927 928 case RPC_TIMEDOUT: 929 p->cku_err.re_errno = ETIMEDOUT; 930 931 /* 932 * No need to delay because timed out already 933 * on the connection request and assume that the 934 * transport time out is longer than our minimum 935 * timeout, or least not too much smaller. 936 */ 937 delay_first = FALSE; 938 break; 939 940 case RPC_SYSTEMERROR: 941 case RPC_TLIERROR: 942 /* 943 * We want to delay here because a transient 944 * system error has a better chance of going away 945 * if we delay a bit. If it's not transient, then 946 * we don't want end up in a hard kernel loop 947 * due to retries. 948 */ 949 ASSERT(p->cku_err.re_errno != 0); 950 break; 951 952 953 case RPC_CANTCONNECT: 954 /* 955 * RPC_CANTCONNECT is set on T_ERROR_ACK which 956 * implies some error down in the TCP layer or 957 * below. If cku_nodelayonerror is set then we 958 * assume the caller knows not to try too hard. 959 */ 960 RPCLOG0(8, "clnt_cots_kcallit: connection failed,"); 961 RPCLOG0(8, " re_status=RPC_CANTCONNECT,"); 962 RPCLOG(8, " re_errno=%d,", p->cku_err.re_errno); 963 RPCLOG(8, " cku_nodelayonerr=%d", p->cku_nodelayonerr); 964 if (p->cku_nodelayonerr == TRUE) 965 delay_first = FALSE; 966 967 p->cku_err.re_errno = EIO; 968 969 break; 970 971 case RPC_XPRTFAILED: 972 /* 973 * We want to delay here because we likely 974 * got a refused connection. 
975 */ 976 if (p->cku_err.re_errno == 0) 977 p->cku_err.re_errno = EIO; 978 979 RPCLOG(1, "clnt_cots_kcallit: transport failed: %d\n", 980 p->cku_err.re_errno); 981 982 break; 983 984 default: 985 /* 986 * We delay here because it is better to err 987 * on the side of caution. If we got here then 988 * status could have been RPC_SUCCESS, but we 989 * know that we did not get a connection, so 990 * force the rpc status to RPC_CANTCONNECT. 991 */ 992 p->cku_err.re_status = RPC_CANTCONNECT; 993 p->cku_err.re_errno = EIO; 994 break; 995 } 996 if (delay_first == TRUE) 997 ticks = clnt_cots_min_tout * drv_usectohz(1000000); 998 goto cots_done; 999 } 1000 1001 /* 1002 * If we've never sent any request on this connection (send count 1003 * is zero, or the connection has been reset), cache the 1004 * the connection's create time and send a request (possibly a retry) 1005 */ 1006 if ((p->cku_flags & CKU_SENT) == 0 || 1007 p->cku_ctime != cm_entry->x_ctime) { 1008 p->cku_ctime = cm_entry->x_ctime; 1009 1010 } else if ((p->cku_flags & CKU_SENT) && (p->cku_flags & CKU_ONQUEUE) && 1011 (call->call_reply != NULL || 1012 p->cku_recv_attempts < clnt_cots_maxrecv)) { 1013 1014 /* 1015 * If we've sent a request and our call is on the dispatch 1016 * queue and we haven't made too many receive attempts, then 1017 * don't re-send, just receive. 1018 */ 1019 p->cku_recv_attempts++; 1020 goto read_again; 1021 } 1022 1023 /* 1024 * Now we create the RPC request in a STREAMS message. We have to do 1025 * this after the call to connmgr_get so that we have the correct 1026 * TIDU size for the transport. 
1027 */ 1028 tidu_size = cm_entry->x_tidu_size; 1029 len = MSG_OFFSET + MAX(tidu_size, RM_HDR_SIZE + WIRE_HDR_SIZE); 1030 1031 while ((mp = allocb(len, BPRI_MED)) == NULL) { 1032 if (strwaitbuf(len, BPRI_MED)) { 1033 p->cku_err.re_status = RPC_SYSTEMERROR; 1034 p->cku_err.re_errno = ENOSR; 1035 COTSRCSTAT_INCR(p->cku_stats, rcnomem); 1036 goto cots_done; 1037 } 1038 } 1039 xdrs = &p->cku_outxdr; 1040 xdrmblk_init(xdrs, mp, XDR_ENCODE, tidu_size); 1041 mpsize = MBLKSIZE(mp); 1042 ASSERT(mpsize >= len); 1043 ASSERT(mp->b_rptr == mp->b_datap->db_base); 1044 1045 /* 1046 * If the size of mblk is not appreciably larger than what we 1047 * asked, then resize the mblk to exactly len bytes. The reason for 1048 * this: suppose len is 1600 bytes, the tidu is 1460 bytes 1049 * (from TCP over ethernet), and the arguments to the RPC require 1050 * 2800 bytes. Ideally we want the protocol to render two 1051 * ~1400 byte segments over the wire. However if allocb() gives us a 2k 1052 * mblk, and we allocate a second mblk for the remainder, the protocol 1053 * module may generate 3 segments over the wire: 1054 * 1460 bytes for the first, 448 (2048 - 1600) for the second, and 1055 * 892 for the third. If we "waste" 448 bytes in the first mblk, 1056 * the XDR encoding will generate two ~1400 byte mblks, and the 1057 * protocol module is more likely to produce properly sized segments. 1058 */ 1059 if ((mpsize >> 1) <= len) 1060 mp->b_rptr += (mpsize - len); 1061 1062 /* 1063 * Adjust b_rptr to reserve space for the non-data protocol headers 1064 * any downstream modules might like to add, and for the 1065 * record marking header. 1066 */ 1067 mp->b_rptr += (MSG_OFFSET + RM_HDR_SIZE); 1068 1069 if (h->cl_auth->ah_cred.oa_flavor != RPCSEC_GSS) { 1070 /* Copy in the preserialized RPC header information. */ 1071 bcopy(p->cku_rpchdr, mp->b_rptr, WIRE_HDR_SIZE); 1072 1073 /* Use XDR_SETPOS() to set the b_wptr to past the RPC header. 
*/ 1074 XDR_SETPOS(xdrs, (uint_t)(mp->b_rptr - mp->b_datap->db_base + 1075 WIRE_HDR_SIZE)); 1076 1077 ASSERT((mp->b_wptr - mp->b_rptr) == WIRE_HDR_SIZE); 1078 1079 /* Serialize the procedure number and the arguments. */ 1080 if ((!XDR_PUTINT32(xdrs, (int32_t *)&procnum)) || 1081 (!AUTH_MARSHALL(h->cl_auth, xdrs, p->cku_cred)) || 1082 (!(*xdr_args)(xdrs, argsp))) { 1083 XDR_DESTROY(xdrs); 1084 p->cku_err.re_status = RPC_CANTENCODEARGS; 1085 p->cku_err.re_errno = EIO; 1086 goto cots_done; 1087 } 1088 1089 (*(uint32_t *)(mp->b_rptr)) = p->cku_xid; 1090 } else { 1091 uint32_t *uproc = (uint32_t *)&p->cku_rpchdr[WIRE_HDR_SIZE]; 1092 IXDR_PUT_U_INT32(uproc, procnum); 1093 1094 (*(uint32_t *)(&p->cku_rpchdr[0])) = p->cku_xid; 1095 1096 /* Use XDR_SETPOS() to set the b_wptr. */ 1097 XDR_SETPOS(xdrs, (uint_t)(mp->b_rptr - mp->b_datap->db_base)); 1098 1099 /* Serialize the procedure number and the arguments. */ 1100 if (!AUTH_WRAP(h->cl_auth, p->cku_rpchdr, WIRE_HDR_SIZE+4, 1101 xdrs, xdr_args, argsp)) { 1102 XDR_DESTROY(xdrs); 1103 p->cku_err.re_status = RPC_CANTENCODEARGS; 1104 p->cku_err.re_errno = EIO; 1105 goto cots_done; 1106 } 1107 } 1108 1109 XDR_DESTROY(xdrs); 1110 1111 RPCLOG(2, "clnt_cots_kcallit: connected, sending call, tidu_size %d\n", 1112 tidu_size); 1113 1114 wq = cm_entry->x_wq; 1115 waitsecs = 0; 1116 1117 dispatch_again: 1118 status = clnt_dispatch_send(wq, mp, call, p->cku_xid, 1119 (p->cku_flags & CKU_ONQUEUE)); 1120 1121 if ((status == RPC_CANTSEND) && (call->call_reason == ENOBUFS)) { 1122 /* 1123 * QFULL condition, allow some time for queue to drain 1124 * and try again. Give up after waiting for all timeout 1125 * specified for the call, or zone is going away. 1126 */ 1127 max_waitsecs = wait.tv_sec ? 
wait.tv_sec : clnt_cots_min_tout; 1128 if ((waitsecs++ < max_waitsecs) && 1129 !(zone_status_get(curproc->p_zone) >= 1130 ZONE_IS_SHUTTING_DOWN)) { 1131 1132 /* wait 1 sec for queue to drain */ 1133 if (clnt_delay(drv_usectohz(1000000), 1134 h->cl_nosignal) == EINTR) { 1135 p->cku_err.re_errno = EINTR; 1136 p->cku_err.re_status = RPC_INTR; 1137 1138 goto cots_done; 1139 } 1140 1141 /* and try again */ 1142 goto dispatch_again; 1143 } 1144 p->cku_err.re_status = status; 1145 p->cku_err.re_errno = call->call_reason; 1146 DTRACE_PROBE(krpc__e__clntcots__kcallit__cantsend); 1147 1148 goto cots_done; 1149 } 1150 1151 if (waitsecs) { 1152 /* adjust timeout to account for time wait to send */ 1153 wait.tv_sec -= waitsecs; 1154 if (wait.tv_sec < 0) { 1155 /* pick up reply on next retry */ 1156 wait.tv_sec = 0; 1157 } 1158 DTRACE_PROBE2(clnt_cots__sendwait, CLIENT *, h, 1159 int, waitsecs); 1160 } 1161 1162 RPCLOG(64, "clnt_cots_kcallit: sent call for xid 0x%x\n", 1163 (uint_t)p->cku_xid); 1164 p->cku_flags = (CKU_ONQUEUE|CKU_SENT); 1165 p->cku_recv_attempts = 1; 1166 1167 #ifdef RPCDEBUG 1168 time_sent = ddi_get_lbolt(); 1169 #endif 1170 1171 /* 1172 * Wait for a reply or a timeout. If there is no error or timeout, 1173 * (both indicated by call_status), call->call_reply will contain 1174 * the RPC reply message. 1175 */ 1176 read_again: 1177 mutex_enter(&call->call_lock); 1178 interrupted = 0; 1179 if (call->call_status == RPC_TIMEDOUT) { 1180 /* 1181 * Indicate that the lwp is not to be stopped while waiting 1182 * for this network traffic. This is to avoid deadlock while 1183 * debugging a process via /proc and also to avoid recursive 1184 * mutex_enter()s due to NFS page faults while stopping 1185 * (NFS holds locks when it calls here). 
1186 */ 1187 clock_t cv_wait_ret; 1188 clock_t timout; 1189 clock_t oldlbolt; 1190 1191 klwp_t *lwp = ttolwp(curthread); 1192 1193 if (lwp != NULL) 1194 lwp->lwp_nostop++; 1195 1196 oldlbolt = ddi_get_lbolt(); 1197 timout = wait.tv_sec * drv_usectohz(1000000) + 1198 drv_usectohz(wait.tv_usec) + oldlbolt; 1199 /* 1200 * Iterate until the call_status is changed to something 1201 * other that RPC_TIMEDOUT, or if cv_timedwait_sig() returns 1202 * something <=0 zero. The latter means that we timed 1203 * out. 1204 */ 1205 if (h->cl_nosignal) 1206 while ((cv_wait_ret = cv_timedwait(&call->call_cv, 1207 &call->call_lock, timout)) > 0 && 1208 call->call_status == RPC_TIMEDOUT) 1209 ; 1210 else 1211 while ((cv_wait_ret = cv_timedwait_sig( 1212 &call->call_cv, 1213 &call->call_lock, timout)) > 0 && 1214 call->call_status == RPC_TIMEDOUT) 1215 ; 1216 1217 switch (cv_wait_ret) { 1218 case 0: 1219 /* 1220 * If we got out of the above loop with 1221 * cv_timedwait_sig() returning 0, then we were 1222 * interrupted regardless what call_status is. 1223 */ 1224 interrupted = 1; 1225 break; 1226 case -1: 1227 /* cv_timedwait_sig() timed out */ 1228 break; 1229 default: 1230 1231 /* 1232 * We were cv_signaled(). If we didn't 1233 * get a successful call_status and returned 1234 * before time expired, delay up to clnt_cots_min_tout 1235 * seconds so that the caller doesn't immediately 1236 * try to call us again and thus force the 1237 * same condition that got us here (such 1238 * as a RPC_XPRTFAILED due to the server not 1239 * listening on the end-point. 
1240 */ 1241 if (call->call_status != RPC_SUCCESS) { 1242 clock_t curlbolt; 1243 clock_t diff; 1244 1245 curlbolt = ddi_get_lbolt(); 1246 ticks = clnt_cots_min_tout * 1247 drv_usectohz(1000000); 1248 diff = curlbolt - oldlbolt; 1249 if (diff < ticks) { 1250 delay_first = TRUE; 1251 if (diff > 0) 1252 ticks -= diff; 1253 } 1254 } 1255 break; 1256 } 1257 1258 if (lwp != NULL) 1259 lwp->lwp_nostop--; 1260 } 1261 /* 1262 * Get the reply message, if any. This will be freed at the end 1263 * whether or not an error occurred. 1264 */ 1265 mp = call->call_reply; 1266 call->call_reply = NULL; 1267 1268 /* 1269 * call_err is the error info when the call is on dispatch queue. 1270 * cku_err is the error info returned to the caller. 1271 * Sync cku_err with call_err for local message processing. 1272 */ 1273 1274 status = call->call_status; 1275 p->cku_err = call->call_err; 1276 mutex_exit(&call->call_lock); 1277 1278 if (status != RPC_SUCCESS) { 1279 switch (status) { 1280 case RPC_TIMEDOUT: 1281 now = ddi_get_lbolt(); 1282 if (interrupted) { 1283 COTSRCSTAT_INCR(p->cku_stats, rcintrs); 1284 p->cku_err.re_status = RPC_INTR; 1285 p->cku_err.re_errno = EINTR; 1286 RPCLOG(1, "clnt_cots_kcallit: xid 0x%x", 1287 p->cku_xid); 1288 RPCLOG(1, "signal interrupted at %ld", now); 1289 RPCLOG(1, ", was sent at %ld\n", time_sent); 1290 } else { 1291 COTSRCSTAT_INCR(p->cku_stats, rctimeouts); 1292 p->cku_err.re_errno = ETIMEDOUT; 1293 RPCLOG(1, "clnt_cots_kcallit: timed out at %ld", 1294 now); 1295 RPCLOG(1, ", was sent at %ld\n", time_sent); 1296 } 1297 break; 1298 1299 case RPC_XPRTFAILED: 1300 if (p->cku_err.re_errno == 0) 1301 p->cku_err.re_errno = EIO; 1302 1303 RPCLOG(1, "clnt_cots_kcallit: transport failed: %d\n", 1304 p->cku_err.re_errno); 1305 break; 1306 1307 case RPC_SYSTEMERROR: 1308 ASSERT(p->cku_err.re_errno); 1309 RPCLOG(1, "clnt_cots_kcallit: system error: %d\n", 1310 p->cku_err.re_errno); 1311 break; 1312 1313 default: 1314 p->cku_err.re_status = RPC_SYSTEMERROR; 1315 
p->cku_err.re_errno = EIO; 1316 RPCLOG(1, "clnt_cots_kcallit: error: %s\n", 1317 clnt_sperrno(status)); 1318 break; 1319 } 1320 if (p->cku_err.re_status != RPC_TIMEDOUT) { 1321 1322 if (p->cku_flags & CKU_ONQUEUE) { 1323 call_table_remove(call); 1324 p->cku_flags &= ~CKU_ONQUEUE; 1325 } 1326 1327 RPCLOG(64, "clnt_cots_kcallit: non TIMEOUT so xid 0x%x " 1328 "taken off dispatch list\n", p->cku_xid); 1329 if (call->call_reply) { 1330 freemsg(call->call_reply); 1331 call->call_reply = NULL; 1332 } 1333 } else if (wait.tv_sec != 0) { 1334 /* 1335 * We've sent the request over TCP and so we have 1336 * every reason to believe it will get 1337 * delivered. In which case returning a timeout is not 1338 * appropriate. 1339 */ 1340 if (p->cku_progress == TRUE && 1341 p->cku_recv_attempts < clnt_cots_maxrecv) { 1342 p->cku_err.re_status = RPC_INPROGRESS; 1343 } 1344 } 1345 goto cots_done; 1346 } 1347 1348 xdrs = &p->cku_inxdr; 1349 xdrmblk_init(xdrs, mp, XDR_DECODE, 0); 1350 1351 reply_msg.rm_direction = REPLY; 1352 reply_msg.rm_reply.rp_stat = MSG_ACCEPTED; 1353 reply_msg.acpted_rply.ar_stat = SUCCESS; 1354 1355 reply_msg.acpted_rply.ar_verf = _null_auth; 1356 /* 1357 * xdr_results will be done in AUTH_UNWRAP. 1358 */ 1359 reply_msg.acpted_rply.ar_results.where = NULL; 1360 reply_msg.acpted_rply.ar_results.proc = xdr_void; 1361 1362 if (xdr_replymsg(xdrs, &reply_msg)) { 1363 enum clnt_stat re_status; 1364 1365 _seterr_reply(&reply_msg, &p->cku_err); 1366 1367 re_status = p->cku_err.re_status; 1368 if (re_status == RPC_SUCCESS) { 1369 /* 1370 * Reply is good, check auth. 
1371 */ 1372 if (!AUTH_VALIDATE(h->cl_auth, 1373 &reply_msg.acpted_rply.ar_verf)) { 1374 COTSRCSTAT_INCR(p->cku_stats, rcbadverfs); 1375 RPCLOG0(1, "clnt_cots_kcallit: validation " 1376 "failure\n"); 1377 freemsg(mp); 1378 (void) xdr_rpc_free_verifier(xdrs, &reply_msg); 1379 XDR_DESTROY(xdrs); 1380 mutex_enter(&call->call_lock); 1381 if (call->call_reply == NULL) 1382 call->call_status = RPC_TIMEDOUT; 1383 mutex_exit(&call->call_lock); 1384 goto read_again; 1385 } else if (!AUTH_UNWRAP(h->cl_auth, xdrs, 1386 xdr_results, resultsp)) { 1387 RPCLOG0(1, "clnt_cots_kcallit: validation " 1388 "failure (unwrap)\n"); 1389 p->cku_err.re_status = RPC_CANTDECODERES; 1390 p->cku_err.re_errno = EIO; 1391 } 1392 } else { 1393 /* set errno in case we can't recover */ 1394 if (re_status != RPC_VERSMISMATCH && 1395 re_status != RPC_AUTHERROR && 1396 re_status != RPC_PROGVERSMISMATCH) 1397 p->cku_err.re_errno = EIO; 1398 1399 if (re_status == RPC_AUTHERROR) { 1400 /* 1401 * Maybe our credential need to be refreshed 1402 */ 1403 if (cm_entry) { 1404 /* 1405 * There is the potential that the 1406 * cm_entry has/will be marked dead, 1407 * so drop the connection altogether, 1408 * force REFRESH to establish new 1409 * connection. 
1410 */ 1411 connmgr_cancelconn(cm_entry); 1412 cm_entry = NULL; 1413 } 1414 1415 (void) xdr_rpc_free_verifier(xdrs, 1416 &reply_msg); 1417 XDR_DESTROY(xdrs); 1418 1419 if (p->cku_flags & CKU_ONQUEUE) { 1420 call_table_remove(call); 1421 p->cku_flags &= ~CKU_ONQUEUE; 1422 } 1423 RPCLOG(64, 1424 "clnt_cots_kcallit: AUTH_ERROR, xid" 1425 " 0x%x removed off dispatch list\n", 1426 p->cku_xid); 1427 if (call->call_reply) { 1428 freemsg(call->call_reply); 1429 call->call_reply = NULL; 1430 } 1431 1432 if ((refreshes > 0) && 1433 AUTH_REFRESH(h->cl_auth, &reply_msg, 1434 p->cku_cred)) { 1435 refreshes--; 1436 freemsg(mp); 1437 mp = NULL; 1438 1439 COTSRCSTAT_INCR(p->cku_stats, 1440 rcbadcalls); 1441 COTSRCSTAT_INCR(p->cku_stats, 1442 rcnewcreds); 1443 goto call_again; 1444 } 1445 1446 /* 1447 * We have used the client handle to 1448 * do an AUTH_REFRESH and the RPC status may 1449 * be set to RPC_SUCCESS; Let's make sure to 1450 * set it to RPC_AUTHERROR. 1451 */ 1452 p->cku_err.re_status = RPC_AUTHERROR; 1453 1454 /* 1455 * Map recoverable and unrecoverable 1456 * authentication errors to appropriate errno 1457 */ 1458 switch (p->cku_err.re_why) { 1459 case AUTH_TOOWEAK: 1460 /* 1461 * This could be a failure where the 1462 * server requires use of a reserved 1463 * port, check and optionally set the 1464 * client handle useresvport trying 1465 * one more time. Next go round we 1466 * fall out with the tooweak error. 
1467 */ 1468 if (p->cku_useresvport != 1) { 1469 p->cku_useresvport = 1; 1470 p->cku_xid = 0; 1471 freemsg(mp); 1472 mp = NULL; 1473 goto call_again; 1474 } 1475 /* FALLTHRU */ 1476 case AUTH_BADCRED: 1477 case AUTH_BADVERF: 1478 case AUTH_INVALIDRESP: 1479 case AUTH_FAILED: 1480 case RPCSEC_GSS_NOCRED: 1481 case RPCSEC_GSS_FAILED: 1482 p->cku_err.re_errno = EACCES; 1483 break; 1484 case AUTH_REJECTEDCRED: 1485 case AUTH_REJECTEDVERF: 1486 default: p->cku_err.re_errno = EIO; 1487 break; 1488 } 1489 RPCLOG(1, "clnt_cots_kcallit : authentication" 1490 " failed with RPC_AUTHERROR of type %d\n", 1491 (int)p->cku_err.re_why); 1492 goto cots_done; 1493 } 1494 } 1495 } else { 1496 /* reply didn't decode properly. */ 1497 p->cku_err.re_status = RPC_CANTDECODERES; 1498 p->cku_err.re_errno = EIO; 1499 RPCLOG0(1, "clnt_cots_kcallit: decode failure\n"); 1500 } 1501 1502 (void) xdr_rpc_free_verifier(xdrs, &reply_msg); 1503 XDR_DESTROY(xdrs); 1504 1505 if (p->cku_flags & CKU_ONQUEUE) { 1506 call_table_remove(call); 1507 p->cku_flags &= ~CKU_ONQUEUE; 1508 } 1509 1510 RPCLOG(64, "clnt_cots_kcallit: xid 0x%x taken off dispatch list", 1511 p->cku_xid); 1512 RPCLOG(64, " status is %s\n", clnt_sperrno(p->cku_err.re_status)); 1513 cots_done: 1514 if (cm_entry) 1515 connmgr_release(cm_entry); 1516 1517 if (mp != NULL) 1518 freemsg(mp); 1519 if ((p->cku_flags & CKU_ONQUEUE) == 0 && call->call_reply) { 1520 freemsg(call->call_reply); 1521 call->call_reply = NULL; 1522 } 1523 if (p->cku_err.re_status != RPC_SUCCESS) { 1524 RPCLOG0(1, "clnt_cots_kcallit: tail-end failure\n"); 1525 COTSRCSTAT_INCR(p->cku_stats, rcbadcalls); 1526 } 1527 1528 /* 1529 * No point in delaying if the zone is going away. 
1530 */ 1531 if (delay_first == TRUE && 1532 !(zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN)) { 1533 if (clnt_delay(ticks, h->cl_nosignal) == EINTR) { 1534 p->cku_err.re_errno = EINTR; 1535 p->cku_err.re_status = RPC_INTR; 1536 } 1537 } 1538 return (p->cku_err.re_status); 1539 } 1540 1541 /* 1542 * Kinit routine for cots. This sets up the correct operations in 1543 * the client handle, as the handle may have previously been a clts 1544 * handle, and clears the xid field so there is no way a new call 1545 * could be mistaken for a retry. It also sets in the handle the 1546 * information that is passed at create/kinit time but needed at 1547 * call time, as cots creates the transport at call time - device, 1548 * address of the server, protocol family. 1549 */ 1550 void 1551 clnt_cots_kinit(CLIENT *h, dev_t dev, int family, struct netbuf *addr, 1552 int max_msgsize, cred_t *cred) 1553 { 1554 /* LINTED pointer alignment */ 1555 cku_private_t *p = htop(h); 1556 calllist_t *call = &p->cku_call; 1557 1558 h->cl_ops = &tcp_ops; 1559 if (p->cku_flags & CKU_ONQUEUE) { 1560 call_table_remove(call); 1561 p->cku_flags &= ~CKU_ONQUEUE; 1562 RPCLOG(64, "clnt_cots_kinit: removing call for xid 0x%x from" 1563 " dispatch list\n", p->cku_xid); 1564 } 1565 1566 if (call->call_reply != NULL) { 1567 freemsg(call->call_reply); 1568 call->call_reply = NULL; 1569 } 1570 1571 call->call_bucket = NULL; 1572 call->call_hash = 0; 1573 1574 /* 1575 * We don't clear cku_flags here, because clnt_cots_kcallit() 1576 * takes care of handling the cku_flags reset. 
1577 */ 1578 p->cku_xid = 0; 1579 p->cku_device = dev; 1580 p->cku_addrfmly = family; 1581 p->cku_cred = cred; 1582 1583 if (p->cku_addr.maxlen < addr->len) { 1584 kmem_free(p->cku_addr.buf, p->cku_addr.maxlen); 1585 p->cku_addr.buf = kmem_zalloc(addr->maxlen, KM_SLEEP); 1586 p->cku_addr.maxlen = addr->maxlen; 1587 } 1588 1589 p->cku_addr.len = addr->len; 1590 bcopy(addr->buf, p->cku_addr.buf, addr->len); 1591 1592 /* 1593 * If the current sanity check size in rpcmod is smaller 1594 * than the size needed, then increase the sanity check. 1595 */ 1596 if (max_msgsize != 0 && clnt_max_msg_sizep != NULL && 1597 max_msgsize > *clnt_max_msg_sizep) { 1598 mutex_enter(&clnt_max_msg_lock); 1599 if (max_msgsize > *clnt_max_msg_sizep) 1600 *clnt_max_msg_sizep = max_msgsize; 1601 mutex_exit(&clnt_max_msg_lock); 1602 } 1603 } 1604 1605 /* 1606 * ksettimers is a no-op for cots, with the exception of setting the xid. 1607 */ 1608 /* ARGSUSED */ 1609 static int 1610 clnt_cots_ksettimers(CLIENT *h, struct rpc_timers *t, struct rpc_timers *all, 1611 int minimum, void (*feedback)(int, int, caddr_t), caddr_t arg, uint32_t xid) 1612 { 1613 /* LINTED pointer alignment */ 1614 cku_private_t *p = htop(h); 1615 1616 if (xid) 1617 p->cku_xid = xid; 1618 COTSRCSTAT_INCR(p->cku_stats, rctimers); 1619 return (0); 1620 } 1621 1622 extern void rpc_poptimod(struct vnode *); 1623 extern int kstr_push(struct vnode *, char *); 1624 1625 int 1626 conn_kstat_update(kstat_t *ksp, int rw) 1627 { 1628 struct cm_xprt *cm_entry; 1629 struct cm_kstat_xprt *cm_ksp_data; 1630 uchar_t *b; 1631 char *fbuf; 1632 1633 if (rw == KSTAT_WRITE) 1634 return (EACCES); 1635 if (ksp == NULL || ksp->ks_private == NULL) 1636 return (EIO); 1637 cm_entry = (struct cm_xprt *)ksp->ks_private; 1638 cm_ksp_data = (struct cm_kstat_xprt *)ksp->ks_data; 1639 1640 cm_ksp_data->x_wq.value.ui32 = (uint32_t)(uintptr_t)cm_entry->x_wq; 1641 cm_ksp_data->x_family.value.ui32 = cm_entry->x_family; 1642 cm_ksp_data->x_rdev.value.ui32 = 
(uint32_t)cm_entry->x_rdev; 1643 cm_ksp_data->x_time.value.ui32 = cm_entry->x_time; 1644 cm_ksp_data->x_ref.value.ui32 = cm_entry->x_ref; 1645 cm_ksp_data->x_state.value.ui32 = cm_entry->x_state_flags; 1646 1647 if (cm_entry->x_server.buf) { 1648 fbuf = cm_ksp_data->x_server.value.str.addr.ptr; 1649 if (cm_entry->x_family == AF_INET && 1650 cm_entry->x_server.len == 1651 sizeof (struct sockaddr_in)) { 1652 struct sockaddr_in *sa; 1653 sa = (struct sockaddr_in *) 1654 cm_entry->x_server.buf; 1655 b = (uchar_t *)&sa->sin_addr; 1656 (void) sprintf(fbuf, 1657 "%d.%d.%d.%d", b[0] & 0xFF, b[1] & 0xFF, 1658 b[2] & 0xFF, b[3] & 0xFF); 1659 cm_ksp_data->x_port.value.ui32 = ntohs(sa->sin_port); 1660 } else if (cm_entry->x_family == AF_INET6 && 1661 cm_entry->x_server.len >= 1662 sizeof (struct sockaddr_in6)) { 1663 /* extract server IP address & port */ 1664 struct sockaddr_in6 *sin6; 1665 sin6 = (struct sockaddr_in6 *)cm_entry->x_server.buf; 1666 (void) kinet_ntop6((uchar_t *)&sin6->sin6_addr, fbuf, 1667 INET6_ADDRSTRLEN); 1668 cm_ksp_data->x_port.value.ui32 = ntohs(sin6->sin6_port); 1669 } else { 1670 struct sockaddr_in *sa; 1671 1672 sa = (struct sockaddr_in *)cm_entry->x_server.buf; 1673 b = (uchar_t *)&sa->sin_addr; 1674 (void) sprintf(fbuf, 1675 "%d.%d.%d.%d", b[0] & 0xFF, b[1] & 0xFF, 1676 b[2] & 0xFF, b[3] & 0xFF); 1677 } 1678 KSTAT_NAMED_STR_BUFLEN(&cm_ksp_data->x_server) = 1679 strlen(fbuf) + 1; 1680 } 1681 1682 return (0); 1683 } 1684 1685 1686 /* 1687 * We want a version of delay which is interruptible by a UNIX signal 1688 * Return EINTR if an interrupt occured. 1689 */ 1690 static int 1691 clnt_delay(clock_t ticks, bool_t nosignal) 1692 { 1693 if (nosignal == TRUE) { 1694 delay(ticks); 1695 return (0); 1696 } 1697 return (delay_sig(ticks)); 1698 } 1699 1700 /* 1701 * Wait for a connection until a timeout, or until we are 1702 * signalled that there has been a connection state change. 
1703 */ 1704 static enum clnt_stat 1705 connmgr_cwait(struct cm_xprt *cm_entry, const struct timeval *waitp, 1706 bool_t nosignal) 1707 { 1708 bool_t interrupted; 1709 clock_t timout, cv_stat; 1710 enum clnt_stat clstat; 1711 unsigned int old_state; 1712 1713 ASSERT(MUTEX_HELD(&connmgr_lock)); 1714 /* 1715 * We wait for the transport connection to be made, or an 1716 * indication that it could not be made. 1717 */ 1718 clstat = RPC_TIMEDOUT; 1719 interrupted = FALSE; 1720 1721 old_state = cm_entry->x_state_flags; 1722 /* 1723 * Now loop until cv_timedwait{_sig} returns because of 1724 * a signal(0) or timeout(-1) or cv_signal(>0). But it may be 1725 * cv_signalled for various other reasons too. So loop 1726 * until there is a state change on the connection. 1727 */ 1728 1729 timout = waitp->tv_sec * drv_usectohz(1000000) + 1730 drv_usectohz(waitp->tv_usec) + ddi_get_lbolt(); 1731 1732 if (nosignal) { 1733 while ((cv_stat = cv_timedwait(&cm_entry->x_conn_cv, 1734 &connmgr_lock, timout)) > 0 && 1735 cm_entry->x_state_flags == old_state) 1736 ; 1737 } else { 1738 while ((cv_stat = cv_timedwait_sig(&cm_entry->x_conn_cv, 1739 &connmgr_lock, timout)) > 0 && 1740 cm_entry->x_state_flags == old_state) 1741 ; 1742 1743 if (cv_stat == 0) /* got intr signal? */ 1744 interrupted = TRUE; 1745 } 1746 1747 if ((cm_entry->x_state_flags & (X_BADSTATES|X_CONNECTED)) == 1748 X_CONNECTED) { 1749 clstat = RPC_SUCCESS; 1750 } else { 1751 if (interrupted == TRUE) 1752 clstat = RPC_INTR; 1753 RPCLOG(1, "connmgr_cwait: can't connect, error: %s\n", 1754 clnt_sperrno(clstat)); 1755 } 1756 1757 return (clstat); 1758 } 1759 1760 /* 1761 * Primary interface for how RPC grabs a connection. 
1762 */ 1763 static struct cm_xprt * 1764 connmgr_wrapget( 1765 struct netbuf *retryaddr, 1766 const struct timeval *waitp, 1767 cku_private_t *p) 1768 { 1769 struct cm_xprt *cm_entry; 1770 1771 cm_entry = connmgr_get(retryaddr, waitp, &p->cku_addr, p->cku_addrfmly, 1772 &p->cku_srcaddr, &p->cku_err, p->cku_device, 1773 p->cku_client.cl_nosignal, p->cku_useresvport, p->cku_cred); 1774 1775 if (cm_entry == NULL) { 1776 /* 1777 * Re-map the call status to RPC_INTR if the err code is 1778 * EINTR. This can happen if calls status is RPC_TLIERROR. 1779 * However, don't re-map if signalling has been turned off. 1780 * XXX Really need to create a separate thread whenever 1781 * there isn't an existing connection. 1782 */ 1783 if (p->cku_err.re_errno == EINTR) { 1784 if (p->cku_client.cl_nosignal == TRUE) 1785 p->cku_err.re_errno = EIO; 1786 else 1787 p->cku_err.re_status = RPC_INTR; 1788 } 1789 } 1790 1791 return (cm_entry); 1792 } 1793 1794 /* 1795 * Obtains a transport to the server specified in addr. If a suitable transport 1796 * does not already exist in the list of cached transports, a new connection 1797 * is created, connected, and added to the list. The connection is for sending 1798 * only - the reply message may come back on another transport connection. 1799 * 1800 * To implement round-robin load balancing with multiple client connections, 1801 * the last entry on the list is always selected. Once the entry is selected 1802 * it's re-inserted to the head of the list. 
1803 */ 1804 static struct cm_xprt * 1805 connmgr_get( 1806 struct netbuf *retryaddr, 1807 const struct timeval *waitp, /* changed to a ptr to converse stack */ 1808 struct netbuf *destaddr, 1809 int addrfmly, 1810 struct netbuf *srcaddr, 1811 struct rpc_err *rpcerr, 1812 dev_t device, 1813 bool_t nosignal, 1814 int useresvport, 1815 cred_t *cr) 1816 { 1817 struct cm_xprt *cm_entry; 1818 struct cm_xprt *lru_entry; 1819 struct cm_xprt **cmp, **prev; 1820 queue_t *wq; 1821 TIUSER *tiptr; 1822 int i; 1823 int retval; 1824 int tidu_size; 1825 bool_t connected; 1826 zoneid_t zoneid = rpc_zoneid(); 1827 1828 /* 1829 * If the call is not a retry, look for a transport entry that 1830 * goes to the server of interest. 1831 */ 1832 mutex_enter(&connmgr_lock); 1833 1834 if (retryaddr == NULL) { 1835 use_new_conn: 1836 i = 0; 1837 cm_entry = lru_entry = NULL; 1838 1839 prev = cmp = &cm_hd; 1840 while ((cm_entry = *cmp) != NULL) { 1841 ASSERT(cm_entry != cm_entry->x_next); 1842 /* 1843 * Garbage collect conections that are marked 1844 * for needs disconnect. 1845 */ 1846 if (cm_entry->x_needdis) { 1847 CONN_HOLD(cm_entry); 1848 connmgr_dis_and_wait(cm_entry); 1849 connmgr_release(cm_entry); 1850 /* 1851 * connmgr_lock could have been 1852 * dropped for the disconnect 1853 * processing so start over. 1854 */ 1855 goto use_new_conn; 1856 } 1857 1858 /* 1859 * Garbage collect the dead connections that have 1860 * no threads working on them. 1861 */ 1862 if ((cm_entry->x_state_flags & (X_DEAD|X_THREAD)) == 1863 X_DEAD) { 1864 mutex_enter(&cm_entry->x_lock); 1865 if (cm_entry->x_ref != 0) { 1866 /* 1867 * Currently in use. 1868 * Cleanup later. 
1869 */ 1870 cmp = &cm_entry->x_next; 1871 mutex_exit(&cm_entry->x_lock); 1872 continue; 1873 } 1874 mutex_exit(&cm_entry->x_lock); 1875 *cmp = cm_entry->x_next; 1876 mutex_exit(&connmgr_lock); 1877 connmgr_close(cm_entry); 1878 mutex_enter(&connmgr_lock); 1879 goto use_new_conn; 1880 } 1881 1882 1883 if ((cm_entry->x_state_flags & X_BADSTATES) == 0 && 1884 cm_entry->x_zoneid == zoneid && 1885 cm_entry->x_rdev == device && 1886 destaddr->len == cm_entry->x_server.len && 1887 bcmp(destaddr->buf, cm_entry->x_server.buf, 1888 destaddr->len) == 0) { 1889 /* 1890 * If the matching entry isn't connected, 1891 * attempt to reconnect it. 1892 */ 1893 if (cm_entry->x_connected == FALSE) { 1894 /* 1895 * We don't go through trying 1896 * to find the least recently 1897 * used connected because 1898 * connmgr_reconnect() briefly 1899 * dropped the connmgr_lock, 1900 * allowing a window for our 1901 * accounting to be messed up. 1902 * In any case, a re-connected 1903 * connection is as good as 1904 * a LRU connection. 1905 */ 1906 return (connmgr_wrapconnect(cm_entry, 1907 waitp, destaddr, addrfmly, srcaddr, 1908 rpcerr, TRUE, nosignal, cr)); 1909 } 1910 i++; 1911 1912 /* keep track of the last entry */ 1913 lru_entry = cm_entry; 1914 prev = cmp; 1915 } 1916 cmp = &cm_entry->x_next; 1917 } 1918 1919 if (i > clnt_max_conns) { 1920 RPCLOG(8, "connmgr_get: too many conns, dooming entry" 1921 " %p\n", (void *)lru_entry->x_tiptr); 1922 lru_entry->x_doomed = TRUE; 1923 goto use_new_conn; 1924 } 1925 1926 /* 1927 * If we are at the maximum number of connections to 1928 * the server, hand back the least recently used one. 1929 */ 1930 if (i == clnt_max_conns) { 1931 /* 1932 * Copy into the handle the source address of 1933 * the connection, which we will use in case of 1934 * a later retry. 
1935 */ 1936 if (srcaddr->len != lru_entry->x_src.len) { 1937 kmem_free(srcaddr->buf, srcaddr->maxlen); 1938 ASSERT(lru_entry->x_src.len != 0); 1939 srcaddr->buf = kmem_alloc( 1940 lru_entry->x_src.len, KM_SLEEP); 1941 srcaddr->maxlen = srcaddr->len = 1942 lru_entry->x_src.len; 1943 } 1944 bcopy(lru_entry->x_src.buf, srcaddr->buf, srcaddr->len); 1945 RPCLOG(2, "connmgr_get: call going out on %p\n", 1946 (void *)lru_entry); 1947 lru_entry->x_time = ddi_get_lbolt(); 1948 CONN_HOLD(lru_entry); 1949 1950 if ((i > 1) && (prev != &cm_hd)) { 1951 /* 1952 * remove and re-insert entry at head of list. 1953 */ 1954 *prev = lru_entry->x_next; 1955 lru_entry->x_next = cm_hd; 1956 cm_hd = lru_entry; 1957 } 1958 1959 mutex_exit(&connmgr_lock); 1960 return (lru_entry); 1961 } 1962 1963 } else { 1964 /* 1965 * This is the retry case (retryaddr != NULL). Retries must 1966 * be sent on the same source port as the original call. 1967 */ 1968 1969 /* 1970 * Walk the list looking for a connection with a source address 1971 * that matches the retry address. 1972 */ 1973 start_retry_loop: 1974 cmp = &cm_hd; 1975 while ((cm_entry = *cmp) != NULL) { 1976 ASSERT(cm_entry != cm_entry->x_next); 1977 1978 /* 1979 * determine if this connection matches the passed 1980 * in retry address. If it does not match, advance 1981 * to the next element on the list. 1982 */ 1983 if (zoneid != cm_entry->x_zoneid || 1984 device != cm_entry->x_rdev || 1985 retryaddr->len != cm_entry->x_src.len || 1986 bcmp(retryaddr->buf, cm_entry->x_src.buf, 1987 retryaddr->len) != 0) { 1988 cmp = &cm_entry->x_next; 1989 continue; 1990 } 1991 /* 1992 * Garbage collect conections that are marked 1993 * for needs disconnect. 1994 */ 1995 if (cm_entry->x_needdis) { 1996 CONN_HOLD(cm_entry); 1997 connmgr_dis_and_wait(cm_entry); 1998 connmgr_release(cm_entry); 1999 /* 2000 * connmgr_lock could have been 2001 * dropped for the disconnect 2002 * processing so start over. 
2003 */ 2004 goto start_retry_loop; 2005 } 2006 /* 2007 * Garbage collect the dead connections that have 2008 * no threads working on them. 2009 */ 2010 if ((cm_entry->x_state_flags & (X_DEAD|X_THREAD)) == 2011 X_DEAD) { 2012 mutex_enter(&cm_entry->x_lock); 2013 if (cm_entry->x_ref != 0) { 2014 /* 2015 * Currently in use. 2016 * Cleanup later. 2017 */ 2018 cmp = &cm_entry->x_next; 2019 mutex_exit(&cm_entry->x_lock); 2020 continue; 2021 } 2022 mutex_exit(&cm_entry->x_lock); 2023 *cmp = cm_entry->x_next; 2024 mutex_exit(&connmgr_lock); 2025 connmgr_close(cm_entry); 2026 mutex_enter(&connmgr_lock); 2027 goto start_retry_loop; 2028 } 2029 2030 /* 2031 * Sanity check: if the connection with our source 2032 * port is going to some other server, something went 2033 * wrong, as we never delete connections (i.e. release 2034 * ports) unless they have been idle. In this case, 2035 * it is probably better to send the call out using 2036 * a new source address than to fail it altogether, 2037 * since that port may never be released. 2038 */ 2039 if (destaddr->len != cm_entry->x_server.len || 2040 bcmp(destaddr->buf, cm_entry->x_server.buf, 2041 destaddr->len) != 0) { 2042 RPCLOG(1, "connmgr_get: tiptr %p" 2043 " is going to a different server" 2044 " with the port that belongs" 2045 " to us!\n", (void *)cm_entry->x_tiptr); 2046 retryaddr = NULL; 2047 goto use_new_conn; 2048 } 2049 2050 /* 2051 * If the connection of interest is not connected and we 2052 * can't reconnect it, then the server is probably 2053 * still down. Return NULL to the caller and let it 2054 * retry later if it wants to. We have a delay so the 2055 * machine doesn't go into a tight retry loop. If the 2056 * entry was already connected, or the reconnected was 2057 * successful, return this entry. 
2058 */ 2059 if (cm_entry->x_connected == FALSE) { 2060 return (connmgr_wrapconnect(cm_entry, 2061 waitp, destaddr, addrfmly, NULL, 2062 rpcerr, TRUE, nosignal, cr)); 2063 } else { 2064 CONN_HOLD(cm_entry); 2065 2066 cm_entry->x_time = ddi_get_lbolt(); 2067 mutex_exit(&connmgr_lock); 2068 RPCLOG(2, "connmgr_get: found old " 2069 "transport %p for retry\n", 2070 (void *)cm_entry); 2071 return (cm_entry); 2072 } 2073 } 2074 2075 /* 2076 * We cannot find an entry in the list for this retry. 2077 * Either the entry has been removed temporarily to be 2078 * reconnected by another thread, or the original call 2079 * got a port but never got connected, 2080 * and hence the transport never got put in the 2081 * list. Fall through to the "create new connection" code - 2082 * the former case will fail there trying to rebind the port, 2083 * and the later case (and any other pathological cases) will 2084 * rebind and reconnect and not hang the client machine. 2085 */ 2086 RPCLOG0(8, "connmgr_get: no entry in list for retry\n"); 2087 } 2088 /* 2089 * Set up a transport entry in the connection manager's list. 2090 */ 2091 cm_entry = (struct cm_xprt *) 2092 kmem_zalloc(sizeof (struct cm_xprt), KM_SLEEP); 2093 2094 cm_entry->x_server.buf = kmem_alloc(destaddr->len, KM_SLEEP); 2095 bcopy(destaddr->buf, cm_entry->x_server.buf, destaddr->len); 2096 cm_entry->x_server.len = cm_entry->x_server.maxlen = destaddr->len; 2097 2098 cm_entry->x_state_flags = X_THREAD; 2099 cm_entry->x_ref = 1; 2100 cm_entry->x_family = addrfmly; 2101 cm_entry->x_rdev = device; 2102 cm_entry->x_zoneid = zoneid; 2103 mutex_init(&cm_entry->x_lock, NULL, MUTEX_DEFAULT, NULL); 2104 cv_init(&cm_entry->x_cv, NULL, CV_DEFAULT, NULL); 2105 cv_init(&cm_entry->x_conn_cv, NULL, CV_DEFAULT, NULL); 2106 cv_init(&cm_entry->x_dis_cv, NULL, CV_DEFAULT, NULL); 2107 2108 /* 2109 * Note that we add this partially initialized entry to the 2110 * connection list. 
This is so that we don't have connections to 2111 * the same server. 2112 * 2113 * Note that x_src is not initialized at this point. This is because 2114 * retryaddr might be NULL in which case x_src is whatever 2115 * t_kbind/bindresvport gives us. If another thread wants a 2116 * connection to the same server, seemingly we have an issue, but we 2117 * don't. If the other thread comes in with retryaddr == NULL, then it 2118 * will never look at x_src, and it will end up waiting in 2119 * connmgr_cwait() for the first thread to finish the connection 2120 * attempt. If the other thread comes in with retryaddr != NULL, then 2121 * that means there was a request sent on a connection, in which case 2122 * the the connection should already exist. Thus the first thread 2123 * never gets here ... it finds the connection it its server in the 2124 * connection list. 2125 * 2126 * But even if theory is wrong, in the retryaddr != NULL case, the 2nd 2127 * thread will skip us because x_src.len == 0. 2128 */ 2129 cm_entry->x_next = cm_hd; 2130 cm_hd = cm_entry; 2131 mutex_exit(&connmgr_lock); 2132 2133 /* 2134 * Either we didn't find an entry to the server of interest, or we 2135 * don't have the maximum number of connections to that server - 2136 * create a new connection. 
2137 */ 2138 RPCLOG0(8, "connmgr_get: creating new connection\n"); 2139 rpcerr->re_status = RPC_TLIERROR; 2140 2141 i = t_kopen(NULL, device, FREAD|FWRITE|FNDELAY, &tiptr, zone_kcred()); 2142 if (i) { 2143 RPCLOG(1, "connmgr_get: can't open cots device, error %d\n", i); 2144 rpcerr->re_errno = i; 2145 connmgr_cancelconn(cm_entry); 2146 return (NULL); 2147 } 2148 rpc_poptimod(tiptr->fp->f_vnode); 2149 2150 if (i = strioctl(tiptr->fp->f_vnode, I_PUSH, (intptr_t)"rpcmod", 0, 2151 K_TO_K, kcred, &retval)) { 2152 RPCLOG(1, "connmgr_get: can't push cots module, %d\n", i); 2153 (void) t_kclose(tiptr, 1); 2154 rpcerr->re_errno = i; 2155 connmgr_cancelconn(cm_entry); 2156 return (NULL); 2157 } 2158 2159 if (i = strioctl(tiptr->fp->f_vnode, RPC_CLIENT, 0, 0, K_TO_K, 2160 kcred, &retval)) { 2161 RPCLOG(1, "connmgr_get: can't set client status with cots " 2162 "module, %d\n", i); 2163 (void) t_kclose(tiptr, 1); 2164 rpcerr->re_errno = i; 2165 connmgr_cancelconn(cm_entry); 2166 return (NULL); 2167 } 2168 2169 mutex_enter(&connmgr_lock); 2170 2171 wq = tiptr->fp->f_vnode->v_stream->sd_wrq->q_next; 2172 cm_entry->x_wq = wq; 2173 2174 mutex_exit(&connmgr_lock); 2175 2176 if (i = strioctl(tiptr->fp->f_vnode, I_PUSH, (intptr_t)"timod", 0, 2177 K_TO_K, kcred, &retval)) { 2178 RPCLOG(1, "connmgr_get: can't push timod, %d\n", i); 2179 (void) t_kclose(tiptr, 1); 2180 rpcerr->re_errno = i; 2181 connmgr_cancelconn(cm_entry); 2182 return (NULL); 2183 } 2184 2185 /* 2186 * If the caller has not specified reserved port usage then 2187 * take the system default. 
2188 */ 2189 if (useresvport == -1) 2190 useresvport = clnt_cots_do_bindresvport; 2191 2192 if ((useresvport || retryaddr != NULL) && 2193 (addrfmly == AF_INET || addrfmly == AF_INET6)) { 2194 bool_t alloc_src = FALSE; 2195 2196 if (srcaddr->len != destaddr->len) { 2197 kmem_free(srcaddr->buf, srcaddr->maxlen); 2198 srcaddr->buf = kmem_zalloc(destaddr->len, KM_SLEEP); 2199 srcaddr->maxlen = destaddr->len; 2200 srcaddr->len = destaddr->len; 2201 alloc_src = TRUE; 2202 } 2203 2204 if ((i = bindresvport(tiptr, retryaddr, srcaddr, TRUE)) != 0) { 2205 (void) t_kclose(tiptr, 1); 2206 RPCLOG(1, "connmgr_get: couldn't bind, retryaddr: " 2207 "%p\n", (void *)retryaddr); 2208 2209 /* 2210 * 1225408: If we allocated a source address, then it 2211 * is either garbage or all zeroes. In that case 2212 * we need to clear srcaddr. 2213 */ 2214 if (alloc_src == TRUE) { 2215 kmem_free(srcaddr->buf, srcaddr->maxlen); 2216 srcaddr->maxlen = srcaddr->len = 0; 2217 srcaddr->buf = NULL; 2218 } 2219 rpcerr->re_errno = i; 2220 connmgr_cancelconn(cm_entry); 2221 return (NULL); 2222 } 2223 } else { 2224 if ((i = t_kbind(tiptr, NULL, NULL)) != 0) { 2225 RPCLOG(1, "clnt_cots_kcreate: t_kbind: %d\n", i); 2226 (void) t_kclose(tiptr, 1); 2227 rpcerr->re_errno = i; 2228 connmgr_cancelconn(cm_entry); 2229 return (NULL); 2230 } 2231 } 2232 2233 { 2234 /* 2235 * Keep the kernel stack lean. Don't move this call 2236 * declaration to the top of this function because a 2237 * call is declared in connmgr_wrapconnect() 2238 */ 2239 calllist_t call; 2240 2241 bzero(&call, sizeof (call)); 2242 cv_init(&call.call_cv, NULL, CV_DEFAULT, NULL); 2243 2244 /* 2245 * This is a bound end-point so don't close it's stream. 
2246 */ 2247 connected = connmgr_connect(cm_entry, wq, destaddr, addrfmly, 2248 &call, &tidu_size, FALSE, waitp, nosignal, cr); 2249 *rpcerr = call.call_err; 2250 cv_destroy(&call.call_cv); 2251 2252 } 2253 2254 mutex_enter(&connmgr_lock); 2255 2256 /* 2257 * Set up a transport entry in the connection manager's list. 2258 */ 2259 if (srcaddr->len > 0) { 2260 cm_entry->x_src.buf = kmem_alloc(srcaddr->len, KM_SLEEP); 2261 bcopy(srcaddr->buf, cm_entry->x_src.buf, srcaddr->len); 2262 cm_entry->x_src.len = cm_entry->x_src.maxlen = srcaddr->len; 2263 } /* Else kmem_zalloc() of cm_entry already sets its x_src to NULL. */ 2264 2265 cm_entry->x_tiptr = tiptr; 2266 cm_entry->x_time = ddi_get_lbolt(); 2267 2268 if (tiptr->tp_info.servtype == T_COTS_ORD) 2269 cm_entry->x_ordrel = TRUE; 2270 else 2271 cm_entry->x_ordrel = FALSE; 2272 2273 cm_entry->x_tidu_size = tidu_size; 2274 2275 if (cm_entry->x_early_disc) { 2276 /* 2277 * We need to check if a disconnect request has come 2278 * while we are connected, if so, then we need to 2279 * set rpcerr->re_status appropriately before returning 2280 * NULL to caller. 2281 */ 2282 if (rpcerr->re_status == RPC_SUCCESS) 2283 rpcerr->re_status = RPC_XPRTFAILED; 2284 cm_entry->x_connected = FALSE; 2285 } else 2286 cm_entry->x_connected = connected; 2287 2288 /* 2289 * There could be a discrepancy here such that 2290 * x_early_disc is TRUE yet connected is TRUE as well 2291 * and the connection is actually connected. In that case 2292 * lets be conservative and declare the connection as not 2293 * connected. 2294 */ 2295 cm_entry->x_early_disc = FALSE; 2296 cm_entry->x_needdis = (cm_entry->x_connected == FALSE); 2297 cm_entry->x_ctime = ddi_get_lbolt(); 2298 2299 /* 2300 * Notify any threads waiting that the connection attempt is done. 
2301 */ 2302 cm_entry->x_thread = FALSE; 2303 cv_broadcast(&cm_entry->x_conn_cv); 2304 2305 if (cm_entry->x_connected == FALSE) { 2306 mutex_exit(&connmgr_lock); 2307 connmgr_release(cm_entry); 2308 return (NULL); 2309 } 2310 2311 mutex_exit(&connmgr_lock); 2312 2313 return (cm_entry); 2314 } 2315 2316 /* 2317 * Keep the cm_xprt entry on the connecton list when making a connection. This 2318 * is to prevent multiple connections to a slow server from appearing. 2319 * We use the bit field x_thread to tell if a thread is doing a connection 2320 * which keeps other interested threads from messing with connection. 2321 * Those other threads just wait if x_thread is set. 2322 * 2323 * If x_thread is not set, then we do the actual work of connecting via 2324 * connmgr_connect(). 2325 * 2326 * mutex convention: called with connmgr_lock held, returns with it released. 2327 */ 2328 static struct cm_xprt * 2329 connmgr_wrapconnect( 2330 struct cm_xprt *cm_entry, 2331 const struct timeval *waitp, 2332 struct netbuf *destaddr, 2333 int addrfmly, 2334 struct netbuf *srcaddr, 2335 struct rpc_err *rpcerr, 2336 bool_t reconnect, 2337 bool_t nosignal, 2338 cred_t *cr) 2339 { 2340 ASSERT(MUTEX_HELD(&connmgr_lock)); 2341 /* 2342 * Hold this entry as we are about to drop connmgr_lock. 2343 */ 2344 CONN_HOLD(cm_entry); 2345 2346 /* 2347 * If there is a thread already making a connection for us, then 2348 * wait for it to complete the connection. 
2349 */ 2350 if (cm_entry->x_thread == TRUE) { 2351 rpcerr->re_status = connmgr_cwait(cm_entry, waitp, nosignal); 2352 2353 if (rpcerr->re_status != RPC_SUCCESS) { 2354 mutex_exit(&connmgr_lock); 2355 connmgr_release(cm_entry); 2356 return (NULL); 2357 } 2358 } else { 2359 bool_t connected; 2360 calllist_t call; 2361 2362 cm_entry->x_thread = TRUE; 2363 2364 while (cm_entry->x_needrel == TRUE) { 2365 cm_entry->x_needrel = FALSE; 2366 2367 connmgr_sndrel(cm_entry); 2368 delay(drv_usectohz(1000000)); 2369 2370 mutex_enter(&connmgr_lock); 2371 } 2372 2373 /* 2374 * If we need to send a T_DISCON_REQ, send one. 2375 */ 2376 connmgr_dis_and_wait(cm_entry); 2377 2378 mutex_exit(&connmgr_lock); 2379 2380 bzero(&call, sizeof (call)); 2381 cv_init(&call.call_cv, NULL, CV_DEFAULT, NULL); 2382 2383 connected = connmgr_connect(cm_entry, cm_entry->x_wq, 2384 destaddr, addrfmly, &call, &cm_entry->x_tidu_size, 2385 reconnect, waitp, nosignal, cr); 2386 2387 *rpcerr = call.call_err; 2388 cv_destroy(&call.call_cv); 2389 2390 mutex_enter(&connmgr_lock); 2391 2392 2393 if (cm_entry->x_early_disc) { 2394 /* 2395 * We need to check if a disconnect request has come 2396 * while we are connected, if so, then we need to 2397 * set rpcerr->re_status appropriately before returning 2398 * NULL to caller. 2399 */ 2400 if (rpcerr->re_status == RPC_SUCCESS) 2401 rpcerr->re_status = RPC_XPRTFAILED; 2402 cm_entry->x_connected = FALSE; 2403 } else 2404 cm_entry->x_connected = connected; 2405 2406 /* 2407 * There could be a discrepancy here such that 2408 * x_early_disc is TRUE yet connected is TRUE as well 2409 * and the connection is actually connected. In that case 2410 * lets be conservative and declare the connection as not 2411 * connected. 2412 */ 2413 2414 cm_entry->x_early_disc = FALSE; 2415 cm_entry->x_needdis = (cm_entry->x_connected == FALSE); 2416 2417 2418 /* 2419 * connmgr_connect() may have given up before the connection 2420 * actually timed out. 
So ensure that before the next 2421 * connection attempt we do a disconnect. 2422 */ 2423 cm_entry->x_ctime = ddi_get_lbolt(); 2424 cm_entry->x_thread = FALSE; 2425 2426 cv_broadcast(&cm_entry->x_conn_cv); 2427 2428 if (cm_entry->x_connected == FALSE) { 2429 mutex_exit(&connmgr_lock); 2430 connmgr_release(cm_entry); 2431 return (NULL); 2432 } 2433 } 2434 2435 if (srcaddr != NULL) { 2436 /* 2437 * Copy into the handle the 2438 * source address of the 2439 * connection, which we will use 2440 * in case of a later retry. 2441 */ 2442 if (srcaddr->len != cm_entry->x_src.len) { 2443 kmem_free(srcaddr->buf, srcaddr->maxlen); 2444 ASSERT(cm_entry->x_src.len != 0); 2445 srcaddr->buf = kmem_alloc(cm_entry->x_src.len, 2446 KM_SLEEP); 2447 srcaddr->maxlen = srcaddr->len = cm_entry->x_src.len; 2448 } 2449 bcopy(cm_entry->x_src.buf, srcaddr->buf, srcaddr->len); 2450 } 2451 cm_entry->x_time = ddi_get_lbolt(); 2452 mutex_exit(&connmgr_lock); 2453 return (cm_entry); 2454 } 2455 2456 /* 2457 * If we need to send a T_DISCON_REQ, send one. 2458 */ 2459 static void 2460 connmgr_dis_and_wait(struct cm_xprt *cm_entry) 2461 { 2462 ASSERT(MUTEX_HELD(&connmgr_lock)); 2463 for (;;) { 2464 while (cm_entry->x_needdis == TRUE) { 2465 RPCLOG(8, "connmgr_dis_and_wait: need " 2466 "T_DISCON_REQ for connection 0x%p\n", 2467 (void *)cm_entry); 2468 cm_entry->x_needdis = FALSE; 2469 cm_entry->x_waitdis = TRUE; 2470 2471 connmgr_snddis(cm_entry); 2472 2473 mutex_enter(&connmgr_lock); 2474 } 2475 2476 if (cm_entry->x_waitdis == TRUE) { 2477 clock_t timout; 2478 2479 RPCLOG(8, "connmgr_dis_and_wait waiting for " 2480 "T_DISCON_REQ's ACK for connection %p\n", 2481 (void *)cm_entry); 2482 2483 timout = clnt_cots_min_conntout * drv_usectohz(1000000); 2484 2485 /* 2486 * The TPI spec says that the T_DISCON_REQ 2487 * will get acknowledged, but in practice 2488 * the ACK may never get sent. So don't 2489 * block forever. 
2490 */ 2491 (void) cv_reltimedwait(&cm_entry->x_dis_cv, 2492 &connmgr_lock, timout, TR_CLOCK_TICK); 2493 } 2494 /* 2495 * If we got the ACK, break. If we didn't, 2496 * then send another T_DISCON_REQ. 2497 */ 2498 if (cm_entry->x_waitdis == FALSE) { 2499 break; 2500 } else { 2501 RPCLOG(8, "connmgr_dis_and_wait: did" 2502 "not get T_DISCON_REQ's ACK for " 2503 "connection %p\n", (void *)cm_entry); 2504 cm_entry->x_needdis = TRUE; 2505 } 2506 } 2507 } 2508 2509 static void 2510 connmgr_cancelconn(struct cm_xprt *cm_entry) 2511 { 2512 /* 2513 * Mark the connection table entry as dead; the next thread that 2514 * goes through connmgr_release() will notice this and deal with it. 2515 */ 2516 mutex_enter(&connmgr_lock); 2517 cm_entry->x_dead = TRUE; 2518 2519 /* 2520 * Notify any threads waiting for the connection that it isn't 2521 * going to happen. 2522 */ 2523 cm_entry->x_thread = FALSE; 2524 cv_broadcast(&cm_entry->x_conn_cv); 2525 mutex_exit(&connmgr_lock); 2526 2527 connmgr_release(cm_entry); 2528 } 2529 2530 static void 2531 connmgr_close(struct cm_xprt *cm_entry) 2532 { 2533 mutex_enter(&cm_entry->x_lock); 2534 while (cm_entry->x_ref != 0) { 2535 /* 2536 * Must be a noninterruptible wait. 
2537 */ 2538 cv_wait(&cm_entry->x_cv, &cm_entry->x_lock); 2539 } 2540 2541 if (cm_entry->x_tiptr != NULL) 2542 (void) t_kclose(cm_entry->x_tiptr, 1); 2543 2544 mutex_exit(&cm_entry->x_lock); 2545 if (cm_entry->x_ksp != NULL) { 2546 mutex_enter(&connmgr_lock); 2547 cm_entry->x_ksp->ks_private = NULL; 2548 mutex_exit(&connmgr_lock); 2549 2550 /* 2551 * Must free the buffer we allocated for the 2552 * server address in the update function 2553 */ 2554 if (((struct cm_kstat_xprt *)(cm_entry->x_ksp->ks_data))-> 2555 x_server.value.str.addr.ptr != NULL) 2556 kmem_free(((struct cm_kstat_xprt *)(cm_entry->x_ksp-> 2557 ks_data))->x_server.value.str.addr.ptr, 2558 INET6_ADDRSTRLEN); 2559 kmem_free(cm_entry->x_ksp->ks_data, 2560 cm_entry->x_ksp->ks_data_size); 2561 kstat_delete(cm_entry->x_ksp); 2562 } 2563 2564 mutex_destroy(&cm_entry->x_lock); 2565 cv_destroy(&cm_entry->x_cv); 2566 cv_destroy(&cm_entry->x_conn_cv); 2567 cv_destroy(&cm_entry->x_dis_cv); 2568 2569 kmem_free(cm_entry->x_server.buf, cm_entry->x_server.maxlen); 2570 kmem_free(cm_entry->x_src.buf, cm_entry->x_src.maxlen); 2571 kmem_free(cm_entry, sizeof (struct cm_xprt)); 2572 } 2573 2574 /* 2575 * Called by KRPC after sending the call message to release the connection 2576 * it was using. 2577 */ 2578 static void 2579 connmgr_release(struct cm_xprt *cm_entry) 2580 { 2581 mutex_enter(&cm_entry->x_lock); 2582 cm_entry->x_ref--; 2583 if (cm_entry->x_ref == 0) 2584 cv_signal(&cm_entry->x_cv); 2585 mutex_exit(&cm_entry->x_lock); 2586 } 2587 2588 /* 2589 * Set TCP receive and xmit buffer size for RPC connections. 2590 */ 2591 static bool_t 2592 connmgr_setbufsz(calllist_t *e, queue_t *wq, cred_t *cr) 2593 { 2594 int ok = FALSE; 2595 int val; 2596 2597 if (rpc_default_tcp_bufsz) 2598 return (FALSE); 2599 2600 /* 2601 * Only set new buffer size if it's larger than the system 2602 * default buffer size. If smaller buffer size is needed 2603 * then use /etc/system to set rpc_default_tcp_bufsz to 1. 
2604 */ 2605 ok = connmgr_getopt_int(wq, SOL_SOCKET, SO_RCVBUF, &val, e, cr); 2606 if ((ok == TRUE) && (val < rpc_send_bufsz)) { 2607 ok = connmgr_setopt_int(wq, SOL_SOCKET, SO_RCVBUF, 2608 rpc_send_bufsz, e, cr); 2609 DTRACE_PROBE2(krpc__i__connmgr_rcvbufsz, 2610 int, ok, calllist_t *, e); 2611 } 2612 2613 ok = connmgr_getopt_int(wq, SOL_SOCKET, SO_SNDBUF, &val, e, cr); 2614 if ((ok == TRUE) && (val < rpc_recv_bufsz)) { 2615 ok = connmgr_setopt_int(wq, SOL_SOCKET, SO_SNDBUF, 2616 rpc_recv_bufsz, e, cr); 2617 DTRACE_PROBE2(krpc__i__connmgr_sndbufsz, 2618 int, ok, calllist_t *, e); 2619 } 2620 return (TRUE); 2621 } 2622 2623 /* 2624 * Given an open stream, connect to the remote. Returns true if connected, 2625 * false otherwise. 2626 */ 2627 static bool_t 2628 connmgr_connect( 2629 struct cm_xprt *cm_entry, 2630 queue_t *wq, 2631 struct netbuf *addr, 2632 int addrfmly, 2633 calllist_t *e, 2634 int *tidu_ptr, 2635 bool_t reconnect, 2636 const struct timeval *waitp, 2637 bool_t nosignal, 2638 cred_t *cr) 2639 { 2640 mblk_t *mp; 2641 struct T_conn_req *tcr; 2642 struct T_info_ack *tinfo; 2643 int interrupted, error; 2644 int tidu_size, kstat_instance; 2645 2646 /* if it's a reconnect, flush any lingering data messages */ 2647 if (reconnect) 2648 (void) putctl1(wq, M_FLUSH, FLUSHRW); 2649 2650 /* 2651 * Note: if the receiver uses SCM_UCRED/getpeerucred the pid will 2652 * appear as -1. 2653 */ 2654 mp = allocb_cred(sizeof (*tcr) + addr->len, cr, NOPID); 2655 if (mp == NULL) { 2656 /* 2657 * This is unfortunate, but we need to look up the stats for 2658 * this zone to increment the "memory allocation failed" 2659 * counter. curproc->p_zone is safe since we're initiating a 2660 * connection and not in some strange streams context. 
2661 */ 2662 struct rpcstat *rpcstat; 2663 2664 rpcstat = zone_getspecific(rpcstat_zone_key, rpc_zone()); 2665 ASSERT(rpcstat != NULL); 2666 2667 RPCLOG0(1, "connmgr_connect: cannot alloc mp for " 2668 "sending conn request\n"); 2669 COTSRCSTAT_INCR(rpcstat->rpc_cots_client, rcnomem); 2670 e->call_status = RPC_SYSTEMERROR; 2671 e->call_reason = ENOSR; 2672 return (FALSE); 2673 } 2674 2675 /* Set TCP buffer size for RPC connections if needed */ 2676 if (addrfmly == AF_INET || addrfmly == AF_INET6) 2677 (void) connmgr_setbufsz(e, wq, cr); 2678 2679 mp->b_datap->db_type = M_PROTO; 2680 tcr = (struct T_conn_req *)mp->b_rptr; 2681 bzero(tcr, sizeof (*tcr)); 2682 tcr->PRIM_type = T_CONN_REQ; 2683 tcr->DEST_length = addr->len; 2684 tcr->DEST_offset = sizeof (struct T_conn_req); 2685 mp->b_wptr = mp->b_rptr + sizeof (*tcr); 2686 2687 bcopy(addr->buf, mp->b_wptr, tcr->DEST_length); 2688 mp->b_wptr += tcr->DEST_length; 2689 2690 RPCLOG(8, "connmgr_connect: sending conn request on queue " 2691 "%p", (void *)wq); 2692 RPCLOG(8, " call %p\n", (void *)wq); 2693 /* 2694 * We use the entry in the handle that is normally used for 2695 * waiting for RPC replies to wait for the connection accept. 2696 */ 2697 if (clnt_dispatch_send(wq, mp, e, 0, 0) != RPC_SUCCESS) { 2698 DTRACE_PROBE(krpc__e__connmgr__connect__cantsend); 2699 freemsg(mp); 2700 return (FALSE); 2701 } 2702 2703 mutex_enter(&clnt_pending_lock); 2704 2705 /* 2706 * We wait for the transport connection to be made, or an 2707 * indication that it could not be made. 2708 */ 2709 interrupted = 0; 2710 2711 /* 2712 * waitforack should have been called with T_OK_ACK, but the 2713 * present implementation needs to be passed T_INFO_ACK to 2714 * work correctly. 2715 */ 2716 error = waitforack(e, T_INFO_ACK, waitp, nosignal); 2717 if (error == EINTR) 2718 interrupted = 1; 2719 if (zone_status_get(curproc->p_zone) >= ZONE_IS_EMPTY) { 2720 /* 2721 * No time to lose; we essentially have been signaled to 2722 * quit. 
2723 */ 2724 interrupted = 1; 2725 } 2726 #ifdef RPCDEBUG 2727 if (error == ETIME) 2728 RPCLOG0(8, "connmgr_connect: giving up " 2729 "on connection attempt; " 2730 "clnt_dispatch notifyconn " 2731 "diagnostic 'no one waiting for " 2732 "connection' should not be " 2733 "unexpected\n"); 2734 #endif 2735 if (e->call_prev) 2736 e->call_prev->call_next = e->call_next; 2737 else 2738 clnt_pending = e->call_next; 2739 if (e->call_next) 2740 e->call_next->call_prev = e->call_prev; 2741 mutex_exit(&clnt_pending_lock); 2742 2743 if (e->call_status != RPC_SUCCESS || error != 0) { 2744 if (interrupted) 2745 e->call_status = RPC_INTR; 2746 else if (error == ETIME) 2747 e->call_status = RPC_TIMEDOUT; 2748 else if (error == EPROTO) { 2749 e->call_status = RPC_SYSTEMERROR; 2750 e->call_reason = EPROTO; 2751 } 2752 2753 RPCLOG(8, "connmgr_connect: can't connect, status: " 2754 "%s\n", clnt_sperrno(e->call_status)); 2755 2756 if (e->call_reply) { 2757 freemsg(e->call_reply); 2758 e->call_reply = NULL; 2759 } 2760 2761 return (FALSE); 2762 } 2763 /* 2764 * The result of the "connection accept" is a T_info_ack 2765 * in the call_reply field. 2766 */ 2767 ASSERT(e->call_reply != NULL); 2768 mp = e->call_reply; 2769 e->call_reply = NULL; 2770 tinfo = (struct T_info_ack *)mp->b_rptr; 2771 2772 tidu_size = tinfo->TIDU_size; 2773 tidu_size -= (tidu_size % BYTES_PER_XDR_UNIT); 2774 if (tidu_size > COTS_DEFAULT_ALLOCSIZE || (tidu_size <= 0)) 2775 tidu_size = COTS_DEFAULT_ALLOCSIZE; 2776 *tidu_ptr = tidu_size; 2777 2778 freemsg(mp); 2779 2780 /* 2781 * Set up the pertinent options. NODELAY is so the transport doesn't 2782 * buffer up RPC messages on either end. This may not be valid for 2783 * all transports. Failure to set this option is not cause to 2784 * bail out so we return success anyway. Note that lack of NODELAY 2785 * or some other way to flush the message on both ends will cause 2786 * lots of retries and terrible performance. 
2787 */ 2788 if (addrfmly == AF_INET || addrfmly == AF_INET6) { 2789 (void) connmgr_setopt(wq, IPPROTO_TCP, TCP_NODELAY, e, cr); 2790 if (e->call_status == RPC_XPRTFAILED) 2791 return (FALSE); 2792 } 2793 2794 /* 2795 * Since we have a connection, we now need to figure out if 2796 * we need to create a kstat. If x_ksp is not NULL then we 2797 * are reusing a connection and so we do not need to create 2798 * another kstat -- lets just return. 2799 */ 2800 if (cm_entry->x_ksp != NULL) 2801 return (TRUE); 2802 2803 /* 2804 * We need to increment rpc_kstat_instance atomically to prevent 2805 * two kstats being created with the same instance. 2806 */ 2807 kstat_instance = atomic_inc_32_nv((uint32_t *)&rpc_kstat_instance); 2808 2809 if ((cm_entry->x_ksp = kstat_create_zone("unix", kstat_instance, 2810 "rpc_cots_connections", "rpc", KSTAT_TYPE_NAMED, 2811 (uint_t)(sizeof (cm_kstat_xprt_t) / sizeof (kstat_named_t)), 2812 KSTAT_FLAG_VIRTUAL, cm_entry->x_zoneid)) == NULL) { 2813 return (TRUE); 2814 } 2815 2816 cm_entry->x_ksp->ks_lock = &connmgr_lock; 2817 cm_entry->x_ksp->ks_private = cm_entry; 2818 cm_entry->x_ksp->ks_data_size = ((INET6_ADDRSTRLEN * sizeof (char)) 2819 + sizeof (cm_kstat_template)); 2820 cm_entry->x_ksp->ks_data = kmem_alloc(cm_entry->x_ksp->ks_data_size, 2821 KM_SLEEP); 2822 bcopy(&cm_kstat_template, cm_entry->x_ksp->ks_data, 2823 cm_entry->x_ksp->ks_data_size); 2824 ((struct cm_kstat_xprt *)(cm_entry->x_ksp->ks_data))-> 2825 x_server.value.str.addr.ptr = 2826 kmem_alloc(INET6_ADDRSTRLEN, KM_SLEEP); 2827 2828 cm_entry->x_ksp->ks_update = conn_kstat_update; 2829 kstat_install(cm_entry->x_ksp); 2830 return (TRUE); 2831 } 2832 2833 /* 2834 * Verify that the specified offset falls within the mblk and 2835 * that the resulting pointer is aligned. 2836 * Returns NULL if not. 
2837 * 2838 * code from fs/sockfs/socksubr.c 2839 */ 2840 static void * 2841 connmgr_opt_getoff(mblk_t *mp, t_uscalar_t offset, 2842 t_uscalar_t length, uint_t align_size) 2843 { 2844 uintptr_t ptr1, ptr2; 2845 2846 ASSERT(mp && mp->b_wptr >= mp->b_rptr); 2847 ptr1 = (uintptr_t)mp->b_rptr + offset; 2848 ptr2 = (uintptr_t)ptr1 + length; 2849 if (ptr1 < (uintptr_t)mp->b_rptr || ptr2 > (uintptr_t)mp->b_wptr) { 2850 return (NULL); 2851 } 2852 if ((ptr1 & (align_size - 1)) != 0) { 2853 return (NULL); 2854 } 2855 return ((void *)ptr1); 2856 } 2857 2858 static bool_t 2859 connmgr_getopt_int(queue_t *wq, int level, int name, int *val, 2860 calllist_t *e, cred_t *cr) 2861 { 2862 mblk_t *mp; 2863 struct opthdr *opt, *opt_res; 2864 struct T_optmgmt_req *tor; 2865 struct T_optmgmt_ack *opt_ack; 2866 struct timeval waitp; 2867 int error; 2868 2869 mp = allocb_cred(sizeof (struct T_optmgmt_req) + 2870 sizeof (struct opthdr) + sizeof (int), cr, NOPID); 2871 if (mp == NULL) 2872 return (FALSE); 2873 2874 mp->b_datap->db_type = M_PROTO; 2875 tor = (struct T_optmgmt_req *)(mp->b_rptr); 2876 tor->PRIM_type = T_SVR4_OPTMGMT_REQ; 2877 tor->MGMT_flags = T_CURRENT; 2878 tor->OPT_length = sizeof (struct opthdr) + sizeof (int); 2879 tor->OPT_offset = sizeof (struct T_optmgmt_req); 2880 2881 opt = (struct opthdr *)(mp->b_rptr + sizeof (struct T_optmgmt_req)); 2882 opt->level = level; 2883 opt->name = name; 2884 opt->len = sizeof (int); 2885 mp->b_wptr += sizeof (struct T_optmgmt_req) + sizeof (struct opthdr) + 2886 sizeof (int); 2887 2888 /* 2889 * We will use this connection regardless 2890 * of whether or not the option is readable. 
2891 */ 2892 if (clnt_dispatch_send(wq, mp, e, 0, 0) != RPC_SUCCESS) { 2893 DTRACE_PROBE(krpc__e__connmgr__getopt__cantsend); 2894 freemsg(mp); 2895 return (FALSE); 2896 } 2897 2898 mutex_enter(&clnt_pending_lock); 2899 2900 waitp.tv_sec = clnt_cots_min_conntout; 2901 waitp.tv_usec = 0; 2902 error = waitforack(e, T_OPTMGMT_ACK, &waitp, 1); 2903 2904 if (e->call_prev) 2905 e->call_prev->call_next = e->call_next; 2906 else 2907 clnt_pending = e->call_next; 2908 if (e->call_next) 2909 e->call_next->call_prev = e->call_prev; 2910 mutex_exit(&clnt_pending_lock); 2911 2912 /* get reply message */ 2913 mp = e->call_reply; 2914 e->call_reply = NULL; 2915 2916 if ((!mp) || (e->call_status != RPC_SUCCESS) || (error != 0)) { 2917 2918 DTRACE_PROBE4(krpc__e__connmgr_getopt, int, name, 2919 int, e->call_status, int, error, mblk_t *, mp); 2920 2921 if (mp) 2922 freemsg(mp); 2923 return (FALSE); 2924 } 2925 2926 opt_ack = (struct T_optmgmt_ack *)mp->b_rptr; 2927 opt_res = (struct opthdr *)connmgr_opt_getoff(mp, opt_ack->OPT_offset, 2928 opt_ack->OPT_length, __TPI_ALIGN_SIZE); 2929 2930 if (!opt_res) { 2931 DTRACE_PROBE4(krpc__e__connmgr_optres, mblk_t *, mp, int, name, 2932 int, opt_ack->OPT_offset, int, opt_ack->OPT_length); 2933 freemsg(mp); 2934 return (FALSE); 2935 } 2936 *val = *(int *)&opt_res[1]; 2937 2938 DTRACE_PROBE2(connmgr_getopt__ok, int, name, int, *val); 2939 2940 freemsg(mp); 2941 return (TRUE); 2942 } 2943 2944 /* 2945 * Called by connmgr_connect to set an option on the new stream. 
2946 */ 2947 static bool_t 2948 connmgr_setopt_int(queue_t *wq, int level, int name, int val, 2949 calllist_t *e, cred_t *cr) 2950 { 2951 mblk_t *mp; 2952 struct opthdr *opt; 2953 struct T_optmgmt_req *tor; 2954 struct timeval waitp; 2955 int error; 2956 2957 mp = allocb_cred(sizeof (struct T_optmgmt_req) + 2958 sizeof (struct opthdr) + sizeof (int), cr, NOPID); 2959 if (mp == NULL) { 2960 RPCLOG0(1, "connmgr_setopt: cannot alloc mp for option " 2961 "request\n"); 2962 return (FALSE); 2963 } 2964 2965 mp->b_datap->db_type = M_PROTO; 2966 tor = (struct T_optmgmt_req *)(mp->b_rptr); 2967 tor->PRIM_type = T_SVR4_OPTMGMT_REQ; 2968 tor->MGMT_flags = T_NEGOTIATE; 2969 tor->OPT_length = sizeof (struct opthdr) + sizeof (int); 2970 tor->OPT_offset = sizeof (struct T_optmgmt_req); 2971 2972 opt = (struct opthdr *)(mp->b_rptr + sizeof (struct T_optmgmt_req)); 2973 opt->level = level; 2974 opt->name = name; 2975 opt->len = sizeof (int); 2976 *(int *)((char *)opt + sizeof (*opt)) = val; 2977 mp->b_wptr += sizeof (struct T_optmgmt_req) + sizeof (struct opthdr) + 2978 sizeof (int); 2979 2980 /* 2981 * We will use this connection regardless 2982 * of whether or not the option is settable. 
2983 */ 2984 if (clnt_dispatch_send(wq, mp, e, 0, 0) != RPC_SUCCESS) { 2985 DTRACE_PROBE(krpc__e__connmgr__setopt__cantsend); 2986 freemsg(mp); 2987 return (FALSE); 2988 } 2989 2990 mutex_enter(&clnt_pending_lock); 2991 2992 waitp.tv_sec = clnt_cots_min_conntout; 2993 waitp.tv_usec = 0; 2994 error = waitforack(e, T_OPTMGMT_ACK, &waitp, 1); 2995 2996 if (e->call_prev) 2997 e->call_prev->call_next = e->call_next; 2998 else 2999 clnt_pending = e->call_next; 3000 if (e->call_next) 3001 e->call_next->call_prev = e->call_prev; 3002 mutex_exit(&clnt_pending_lock); 3003 3004 if (e->call_reply != NULL) { 3005 freemsg(e->call_reply); 3006 e->call_reply = NULL; 3007 } 3008 3009 if (e->call_status != RPC_SUCCESS || error != 0) { 3010 RPCLOG(1, "connmgr_setopt: can't set option: %d\n", name); 3011 return (FALSE); 3012 } 3013 RPCLOG(8, "connmgr_setopt: successfully set option: %d\n", name); 3014 return (TRUE); 3015 } 3016 3017 static bool_t 3018 connmgr_setopt(queue_t *wq, int level, int name, calllist_t *e, cred_t *cr) 3019 { 3020 return (connmgr_setopt_int(wq, level, name, 1, e, cr)); 3021 } 3022 3023 #ifdef DEBUG 3024 3025 /* 3026 * This is a knob to let us force code coverage in allocation failure 3027 * case. 3028 */ 3029 static int connmgr_failsnd; 3030 #define CONN_SND_ALLOC(Size, Pri) \ 3031 ((connmgr_failsnd-- > 0) ? NULL : allocb(Size, Pri)) 3032 3033 #else 3034 3035 #define CONN_SND_ALLOC(Size, Pri) allocb(Size, Pri) 3036 3037 #endif 3038 3039 /* 3040 * Sends an orderly release on the specified queue. 3041 * Entered with connmgr_lock. 
Exited without connmgr_lock 3042 */ 3043 static void 3044 connmgr_sndrel(struct cm_xprt *cm_entry) 3045 { 3046 struct T_ordrel_req *torr; 3047 mblk_t *mp; 3048 queue_t *q = cm_entry->x_wq; 3049 ASSERT(MUTEX_HELD(&connmgr_lock)); 3050 mp = CONN_SND_ALLOC(sizeof (struct T_ordrel_req), BPRI_LO); 3051 if (mp == NULL) { 3052 cm_entry->x_needrel = TRUE; 3053 mutex_exit(&connmgr_lock); 3054 RPCLOG(1, "connmgr_sndrel: cannot alloc mp for sending ordrel " 3055 "to queue %p\n", (void *)q); 3056 return; 3057 } 3058 mutex_exit(&connmgr_lock); 3059 3060 mp->b_datap->db_type = M_PROTO; 3061 torr = (struct T_ordrel_req *)(mp->b_rptr); 3062 torr->PRIM_type = T_ORDREL_REQ; 3063 mp->b_wptr = mp->b_rptr + sizeof (struct T_ordrel_req); 3064 3065 RPCLOG(8, "connmgr_sndrel: sending ordrel to queue %p\n", (void *)q); 3066 put(q, mp); 3067 } 3068 3069 /* 3070 * Sends an disconnect on the specified queue. 3071 * Entered with connmgr_lock. Exited without connmgr_lock 3072 */ 3073 static void 3074 connmgr_snddis(struct cm_xprt *cm_entry) 3075 { 3076 struct T_discon_req *tdis; 3077 mblk_t *mp; 3078 queue_t *q = cm_entry->x_wq; 3079 3080 ASSERT(MUTEX_HELD(&connmgr_lock)); 3081 mp = CONN_SND_ALLOC(sizeof (*tdis), BPRI_LO); 3082 if (mp == NULL) { 3083 cm_entry->x_needdis = TRUE; 3084 mutex_exit(&connmgr_lock); 3085 RPCLOG(1, "connmgr_snddis: cannot alloc mp for sending discon " 3086 "to queue %p\n", (void *)q); 3087 return; 3088 } 3089 mutex_exit(&connmgr_lock); 3090 3091 mp->b_datap->db_type = M_PROTO; 3092 tdis = (struct T_discon_req *)mp->b_rptr; 3093 tdis->PRIM_type = T_DISCON_REQ; 3094 mp->b_wptr = mp->b_rptr + sizeof (*tdis); 3095 3096 RPCLOG(8, "connmgr_snddis: sending discon to queue %p\n", (void *)q); 3097 put(q, mp); 3098 } 3099 3100 /* 3101 * Sets up the entry for receiving replies, and calls rpcmod's write put proc 3102 * (through put) to send the call. 
3103 */ 3104 static int 3105 clnt_dispatch_send(queue_t *q, mblk_t *mp, calllist_t *e, uint_t xid, 3106 uint_t queue_flag) 3107 { 3108 ASSERT(e != NULL); 3109 3110 e->call_status = RPC_TIMEDOUT; /* optimistic, eh? */ 3111 e->call_reason = 0; 3112 e->call_wq = q; 3113 e->call_xid = xid; 3114 e->call_notified = FALSE; 3115 3116 if (!canput(q)) { 3117 e->call_status = RPC_CANTSEND; 3118 e->call_reason = ENOBUFS; 3119 return (RPC_CANTSEND); 3120 } 3121 3122 /* 3123 * If queue_flag is set then the calllist_t is already on the hash 3124 * queue. In this case just send the message and return. 3125 */ 3126 if (queue_flag) { 3127 put(q, mp); 3128 return (RPC_SUCCESS); 3129 3130 } 3131 3132 /* 3133 * Set up calls for RPC requests (with XID != 0) on the hash 3134 * queue for fast lookups and place other calls (i.e. 3135 * connection management) on the linked list. 3136 */ 3137 if (xid != 0) { 3138 RPCLOG(64, "clnt_dispatch_send: putting xid 0x%x on " 3139 "dispatch list\n", xid); 3140 e->call_hash = call_hash(xid, clnt_cots_hash_size); 3141 e->call_bucket = &cots_call_ht[e->call_hash]; 3142 call_table_enter(e); 3143 } else { 3144 mutex_enter(&clnt_pending_lock); 3145 if (clnt_pending) 3146 clnt_pending->call_prev = e; 3147 e->call_next = clnt_pending; 3148 e->call_prev = NULL; 3149 clnt_pending = e; 3150 mutex_exit(&clnt_pending_lock); 3151 } 3152 3153 put(q, mp); 3154 return (RPC_SUCCESS); 3155 } 3156 3157 /* 3158 * Called by rpcmod to notify a client with a clnt_pending call that its reply 3159 * has arrived. If we can't find a client waiting for this reply, we log 3160 * the error and return. 
3161 */ 3162 bool_t 3163 clnt_dispatch_notify(mblk_t *mp, zoneid_t zoneid) 3164 { 3165 calllist_t *e = NULL; 3166 call_table_t *chtp; 3167 uint32_t xid; 3168 uint_t hash; 3169 3170 if ((IS_P2ALIGNED(mp->b_rptr, sizeof (uint32_t))) && 3171 (mp->b_wptr - mp->b_rptr) >= sizeof (xid)) 3172 xid = *((uint32_t *)mp->b_rptr); 3173 else { 3174 int i = 0; 3175 unsigned char *p = (unsigned char *)&xid; 3176 unsigned char *rptr; 3177 mblk_t *tmp = mp; 3178 3179 /* 3180 * Copy the xid, byte-by-byte into xid. 3181 */ 3182 while (tmp) { 3183 rptr = tmp->b_rptr; 3184 while (rptr < tmp->b_wptr) { 3185 *p++ = *rptr++; 3186 if (++i >= sizeof (xid)) 3187 goto done_xid_copy; 3188 } 3189 tmp = tmp->b_cont; 3190 } 3191 3192 /* 3193 * If we got here, we ran out of mblk space before the 3194 * xid could be copied. 3195 */ 3196 ASSERT(tmp == NULL && i < sizeof (xid)); 3197 3198 RPCLOG0(1, 3199 "clnt_dispatch_notify: message less than size of xid\n"); 3200 return (FALSE); 3201 3202 } 3203 done_xid_copy: 3204 3205 hash = call_hash(xid, clnt_cots_hash_size); 3206 chtp = &cots_call_ht[hash]; 3207 /* call_table_find returns with the hash bucket locked */ 3208 call_table_find(chtp, xid, e); 3209 3210 if (e != NULL) { 3211 /* 3212 * Found thread waiting for this reply 3213 */ 3214 mutex_enter(&e->call_lock); 3215 3216 /* 3217 * verify that the reply is coming in on 3218 * the same zone that it was sent from. 3219 */ 3220 if (e->call_zoneid != zoneid) { 3221 mutex_exit(&e->call_lock); 3222 mutex_exit(&chtp->ct_lock); 3223 RPCLOG0(1, "clnt_dispatch_notify: incorrect zoneid\n"); 3224 return (FALSE); 3225 } 3226 3227 if (e->call_reply) 3228 /* 3229 * This can happen under the following scenario: 3230 * clnt_cots_kcallit() times out on the response, 3231 * rfscall() repeats the CLNT_CALL() with 3232 * the same xid, clnt_cots_kcallit() sends the retry, 3233 * thereby putting the clnt handle on the pending list, 3234 * the first response arrives, signalling the thread 3235 * in clnt_cots_kcallit(). 
Before that thread is 3236 * dispatched, the second response arrives as well, 3237 * and clnt_dispatch_notify still finds the handle on 3238 * the pending list, with call_reply set. So free the 3239 * old reply now. 3240 * 3241 * It is also possible for a response intended for 3242 * an RPC call with a different xid to reside here. 3243 * This can happen if the thread that owned this 3244 * client handle prior to the current owner bailed 3245 * out and left its call record on the dispatch 3246 * queue. A window exists where the response can 3247 * arrive before the current owner dispatches its 3248 * RPC call. 3249 * 3250 * In any case, this is the very last point where we 3251 * can safely check the call_reply field before 3252 * placing the new response there. 3253 */ 3254 freemsg(e->call_reply); 3255 e->call_reply = mp; 3256 e->call_status = RPC_SUCCESS; 3257 e->call_notified = TRUE; 3258 cv_signal(&e->call_cv); 3259 mutex_exit(&e->call_lock); 3260 mutex_exit(&chtp->ct_lock); 3261 return (TRUE); 3262 } else { 3263 zone_t *zone; 3264 struct rpcstat *rpcstat; 3265 3266 mutex_exit(&chtp->ct_lock); 3267 RPCLOG(65, "clnt_dispatch_notify: no caller for reply 0x%x\n", 3268 xid); 3269 /* 3270 * This is unfortunate, but we need to lookup the zone so we 3271 * can increment its "rcbadxids" counter. 3272 */ 3273 zone = zone_find_by_id(zoneid); 3274 if (zone == NULL) { 3275 /* 3276 * The zone went away... 3277 */ 3278 return (FALSE); 3279 } 3280 rpcstat = zone_getspecific(rpcstat_zone_key, zone); 3281 if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) { 3282 /* 3283 * Not interested 3284 */ 3285 zone_rele(zone); 3286 return (FALSE); 3287 } 3288 COTSRCSTAT_INCR(rpcstat->rpc_cots_client, rcbadxids); 3289 zone_rele(zone); 3290 } 3291 return (FALSE); 3292 } 3293 3294 /* 3295 * Called by rpcmod when a non-data indication arrives. The ones in which we 3296 * are interested are connection indications and options acks. 
We dispatch 3297 * based on the queue the indication came in on. If we are not interested in 3298 * what came in, we return false to rpcmod, who will then pass it upstream. 3299 */ 3300 bool_t 3301 clnt_dispatch_notifyconn(queue_t *q, mblk_t *mp) 3302 { 3303 calllist_t *e; 3304 int type; 3305 3306 ASSERT((q->q_flag & QREADR) == 0); 3307 3308 type = ((union T_primitives *)mp->b_rptr)->type; 3309 RPCLOG(8, "clnt_dispatch_notifyconn: prim type: [%s]\n", 3310 rpc_tpiprim2name(type)); 3311 mutex_enter(&clnt_pending_lock); 3312 for (e = clnt_pending; /* NO CONDITION */; e = e->call_next) { 3313 if (e == NULL) { 3314 mutex_exit(&clnt_pending_lock); 3315 RPCLOG(1, "clnt_dispatch_notifyconn: no one waiting " 3316 "for connection on queue 0x%p\n", (void *)q); 3317 return (FALSE); 3318 } 3319 if (e->call_wq == q) 3320 break; 3321 } 3322 3323 switch (type) { 3324 case T_CONN_CON: 3325 /* 3326 * The transport is now connected, send a T_INFO_REQ to get 3327 * the tidu size. 3328 */ 3329 mutex_exit(&clnt_pending_lock); 3330 ASSERT(mp->b_datap->db_lim - mp->b_datap->db_base >= 3331 sizeof (struct T_info_req)); 3332 mp->b_rptr = mp->b_datap->db_base; 3333 ((union T_primitives *)mp->b_rptr)->type = T_INFO_REQ; 3334 mp->b_wptr = mp->b_rptr + sizeof (struct T_info_req); 3335 mp->b_datap->db_type = M_PCPROTO; 3336 put(q, mp); 3337 return (TRUE); 3338 case T_INFO_ACK: 3339 case T_OPTMGMT_ACK: 3340 e->call_status = RPC_SUCCESS; 3341 e->call_reply = mp; 3342 e->call_notified = TRUE; 3343 cv_signal(&e->call_cv); 3344 break; 3345 case T_ERROR_ACK: 3346 e->call_status = RPC_CANTCONNECT; 3347 e->call_reply = mp; 3348 e->call_notified = TRUE; 3349 cv_signal(&e->call_cv); 3350 break; 3351 case T_OK_ACK: 3352 /* 3353 * Great, but we are really waiting for a T_CONN_CON 3354 */ 3355 freemsg(mp); 3356 break; 3357 default: 3358 mutex_exit(&clnt_pending_lock); 3359 RPCLOG(1, "clnt_dispatch_notifyconn: bad type %d\n", type); 3360 return (FALSE); 3361 } 3362 3363 mutex_exit(&clnt_pending_lock); 3364 
return (TRUE); 3365 } 3366 3367 /* 3368 * Called by rpcmod when the transport is (or should be) going away. Informs 3369 * all callers waiting for replies and marks the entry in the connection 3370 * manager's list as unconnected, and either closing (close handshake in 3371 * progress) or dead. 3372 */ 3373 void 3374 clnt_dispatch_notifyall(queue_t *q, int32_t msg_type, int32_t reason) 3375 { 3376 calllist_t *e; 3377 call_table_t *ctp; 3378 struct cm_xprt *cm_entry; 3379 int have_connmgr_lock; 3380 int i; 3381 3382 ASSERT((q->q_flag & QREADR) == 0); 3383 3384 RPCLOG(1, "clnt_dispatch_notifyall on queue %p", (void *)q); 3385 RPCLOG(1, " received a notifcation prim type [%s]", 3386 rpc_tpiprim2name(msg_type)); 3387 RPCLOG(1, " and reason %d\n", reason); 3388 3389 /* 3390 * Find the transport entry in the connection manager's list, close 3391 * the transport and delete the entry. In the case where rpcmod's 3392 * idle timer goes off, it sends us a T_ORDREL_REQ, indicating we 3393 * should gracefully close the connection. 3394 */ 3395 have_connmgr_lock = 1; 3396 mutex_enter(&connmgr_lock); 3397 for (cm_entry = cm_hd; cm_entry; cm_entry = cm_entry->x_next) { 3398 ASSERT(cm_entry != cm_entry->x_next); 3399 if (cm_entry->x_wq == q) { 3400 ASSERT(MUTEX_HELD(&connmgr_lock)); 3401 ASSERT(have_connmgr_lock == 1); 3402 switch (msg_type) { 3403 case T_ORDREL_REQ: 3404 3405 if (cm_entry->x_dead) { 3406 RPCLOG(1, "idle timeout on dead " 3407 "connection: %p\n", 3408 (void *)cm_entry); 3409 if (clnt_stop_idle != NULL) 3410 (*clnt_stop_idle)(q); 3411 break; 3412 } 3413 3414 /* 3415 * Only mark the connection as dead if it is 3416 * connected and idle. 3417 * An unconnected connection has probably 3418 * gone idle because the server is down, 3419 * and when it comes back up there will be 3420 * retries that need to use that connection. 
3421 */ 3422 if (cm_entry->x_connected || 3423 cm_entry->x_doomed) { 3424 if (cm_entry->x_ordrel) { 3425 if (cm_entry->x_closing == 3426 TRUE) { 3427 /* 3428 * The connection is 3429 * obviously wedged due 3430 * to a bug or problem 3431 * with the transport. 3432 * Mark it as dead. 3433 * Otherwise we can 3434 * leak connections. 3435 */ 3436 cm_entry->x_dead = TRUE; 3437 mutex_exit( 3438 &connmgr_lock); 3439 have_connmgr_lock = 0; 3440 if (clnt_stop_idle != 3441 NULL) 3442 (*clnt_stop_idle)(q); 3443 break; 3444 } 3445 cm_entry->x_closing = TRUE; 3446 connmgr_sndrel(cm_entry); 3447 have_connmgr_lock = 0; 3448 } else { 3449 cm_entry->x_dead = TRUE; 3450 mutex_exit(&connmgr_lock); 3451 have_connmgr_lock = 0; 3452 if (clnt_stop_idle != NULL) 3453 (*clnt_stop_idle)(q); 3454 } 3455 } else { 3456 /* 3457 * We don't mark the connection 3458 * as dead, but we turn off the 3459 * idle timer. 3460 */ 3461 mutex_exit(&connmgr_lock); 3462 have_connmgr_lock = 0; 3463 if (clnt_stop_idle != NULL) 3464 (*clnt_stop_idle)(q); 3465 RPCLOG(1, "clnt_dispatch_notifyall:" 3466 " ignoring timeout from rpcmod" 3467 " (q %p) because we are not " 3468 " connected\n", (void *)q); 3469 } 3470 break; 3471 case T_ORDREL_IND: 3472 /* 3473 * If this entry is marked closing, then we are 3474 * completing a close handshake, and the 3475 * connection is dead. Otherwise, the server is 3476 * trying to close. Since the server will not 3477 * be sending any more RPC replies, we abort 3478 * the connection, including flushing 3479 * any RPC requests that are in-transit. 3480 * In either case, mark the entry as dead so 3481 * that it can be closed by the connection 3482 * manager's garbage collector. 
3483 */ 3484 cm_entry->x_dead = TRUE; 3485 if (cm_entry->x_closing) { 3486 mutex_exit(&connmgr_lock); 3487 have_connmgr_lock = 0; 3488 if (clnt_stop_idle != NULL) 3489 (*clnt_stop_idle)(q); 3490 } else { 3491 /* 3492 * if we're getting a disconnect 3493 * before we've finished our 3494 * connect attempt, mark it for 3495 * later processing 3496 */ 3497 if (cm_entry->x_thread) 3498 cm_entry->x_early_disc = TRUE; 3499 else 3500 cm_entry->x_connected = FALSE; 3501 cm_entry->x_waitdis = TRUE; 3502 connmgr_snddis(cm_entry); 3503 have_connmgr_lock = 0; 3504 } 3505 break; 3506 3507 case T_ERROR_ACK: 3508 case T_OK_ACK: 3509 cm_entry->x_waitdis = FALSE; 3510 cv_signal(&cm_entry->x_dis_cv); 3511 mutex_exit(&connmgr_lock); 3512 return; 3513 3514 case T_DISCON_REQ: 3515 if (cm_entry->x_thread) 3516 cm_entry->x_early_disc = TRUE; 3517 else 3518 cm_entry->x_connected = FALSE; 3519 cm_entry->x_waitdis = TRUE; 3520 3521 connmgr_snddis(cm_entry); 3522 have_connmgr_lock = 0; 3523 break; 3524 3525 case T_DISCON_IND: 3526 default: 3527 /* 3528 * if we're getting a disconnect before 3529 * we've finished our connect attempt, 3530 * mark it for later processing 3531 */ 3532 if (cm_entry->x_closing) { 3533 cm_entry->x_dead = TRUE; 3534 mutex_exit(&connmgr_lock); 3535 have_connmgr_lock = 0; 3536 if (clnt_stop_idle != NULL) 3537 (*clnt_stop_idle)(q); 3538 } else { 3539 if (cm_entry->x_thread) { 3540 cm_entry->x_early_disc = TRUE; 3541 } else { 3542 cm_entry->x_dead = TRUE; 3543 cm_entry->x_connected = FALSE; 3544 } 3545 } 3546 break; 3547 } 3548 break; 3549 } 3550 } 3551 3552 if (have_connmgr_lock) 3553 mutex_exit(&connmgr_lock); 3554 3555 if (msg_type == T_ERROR_ACK || msg_type == T_OK_ACK) { 3556 RPCLOG(1, "clnt_dispatch_notifyall: (wq %p) could not find " 3557 "connmgr entry for discon ack\n", (void *)q); 3558 return; 3559 } 3560 3561 /* 3562 * Then kick all the clnt_pending calls out of their wait. 
There 3563 * should be no clnt_pending calls in the case of rpcmod's idle 3564 * timer firing. 3565 */ 3566 for (i = 0; i < clnt_cots_hash_size; i++) { 3567 ctp = &cots_call_ht[i]; 3568 mutex_enter(&ctp->ct_lock); 3569 for (e = ctp->ct_call_next; 3570 e != (calllist_t *)ctp; 3571 e = e->call_next) { 3572 if (e->call_wq == q && e->call_notified == FALSE) { 3573 RPCLOG(1, 3574 "clnt_dispatch_notifyall for queue %p ", 3575 (void *)q); 3576 RPCLOG(1, "aborting clnt_pending call %p\n", 3577 (void *)e); 3578 3579 if (msg_type == T_DISCON_IND) 3580 e->call_reason = reason; 3581 e->call_notified = TRUE; 3582 e->call_status = RPC_XPRTFAILED; 3583 cv_signal(&e->call_cv); 3584 } 3585 } 3586 mutex_exit(&ctp->ct_lock); 3587 } 3588 3589 mutex_enter(&clnt_pending_lock); 3590 for (e = clnt_pending; e; e = e->call_next) { 3591 /* 3592 * Only signal those RPC handles that haven't been 3593 * signalled yet. Otherwise we can get a bogus call_reason. 3594 * This can happen if thread A is making a call over a 3595 * connection. If the server is killed, it will cause 3596 * reset, and reason will default to EIO as a result of 3597 * a T_ORDREL_IND. Thread B then attempts to recreate 3598 * the connection but gets a T_DISCON_IND. If we set the 3599 * call_reason code for all threads, then if thread A 3600 * hasn't been dispatched yet, it will get the wrong 3601 * reason. The bogus call_reason can make it harder to 3602 * discriminate between calls that fail because the 3603 * connection attempt failed versus those where the call 3604 * may have been executed on the server. 3605 */ 3606 if (e->call_wq == q && e->call_notified == FALSE) { 3607 RPCLOG(1, "clnt_dispatch_notifyall for queue %p ", 3608 (void *)q); 3609 RPCLOG(1, " aborting clnt_pending call %p\n", 3610 (void *)e); 3611 3612 if (msg_type == T_DISCON_IND) 3613 e->call_reason = reason; 3614 e->call_notified = TRUE; 3615 /* 3616 * Let the caller timeout, else it will retry 3617 * immediately. 
3618 */ 3619 e->call_status = RPC_XPRTFAILED; 3620 3621 /* 3622 * We used to just signal those threads 3623 * waiting for a connection, (call_xid = 0). 3624 * That meant that threads waiting for a response 3625 * waited till their timeout expired. This 3626 * could be a long time if they've specified a 3627 * maximum timeout. (2^31 - 1). So we 3628 * Signal all threads now. 3629 */ 3630 cv_signal(&e->call_cv); 3631 } 3632 } 3633 mutex_exit(&clnt_pending_lock); 3634 } 3635 3636 3637 /*ARGSUSED*/ 3638 /* 3639 * after resuming a system that's been suspended for longer than the 3640 * NFS server's idle timeout (svc_idle_timeout for Solaris 2), rfscall() 3641 * generates "NFS server X not responding" and "NFS server X ok" messages; 3642 * here we reset inet connections to cause a re-connect and avoid those 3643 * NFS messages. see 4045054 3644 */ 3645 boolean_t 3646 connmgr_cpr_reset(void *arg, int code) 3647 { 3648 struct cm_xprt *cxp; 3649 3650 if (code == CB_CODE_CPR_CHKPT) 3651 return (B_TRUE); 3652 3653 if (mutex_tryenter(&connmgr_lock) == 0) 3654 return (B_FALSE); 3655 for (cxp = cm_hd; cxp; cxp = cxp->x_next) { 3656 if ((cxp->x_family == AF_INET || cxp->x_family == AF_INET6) && 3657 cxp->x_connected == TRUE) { 3658 if (cxp->x_thread) 3659 cxp->x_early_disc = TRUE; 3660 else 3661 cxp->x_connected = FALSE; 3662 cxp->x_needdis = TRUE; 3663 } 3664 } 3665 mutex_exit(&connmgr_lock); 3666 return (B_TRUE); 3667 } 3668 3669 void 3670 clnt_cots_stats_init(zoneid_t zoneid, struct rpc_cots_client **statsp) 3671 { 3672 3673 *statsp = (struct rpc_cots_client *)rpcstat_zone_init_common(zoneid, 3674 "unix", "rpc_cots_client", (const kstat_named_t *)&cots_rcstat_tmpl, 3675 sizeof (cots_rcstat_tmpl)); 3676 } 3677 3678 void 3679 clnt_cots_stats_fini(zoneid_t zoneid, struct rpc_cots_client **statsp) 3680 { 3681 rpcstat_zone_fini_common(zoneid, "unix", "rpc_cots_client"); 3682 kmem_free(*statsp, sizeof (cots_rcstat_tmpl)); 3683 } 3684 3685 void 3686 clnt_cots_init(void) 3687 { 3688 
mutex_init(&connmgr_lock, NULL, MUTEX_DEFAULT, NULL); 3689 mutex_init(&clnt_pending_lock, NULL, MUTEX_DEFAULT, NULL); 3690 3691 if (clnt_cots_hash_size < DEFAULT_MIN_HASH_SIZE) 3692 clnt_cots_hash_size = DEFAULT_MIN_HASH_SIZE; 3693 3694 cots_call_ht = call_table_init(clnt_cots_hash_size); 3695 zone_key_create(&zone_cots_key, NULL, NULL, clnt_zone_destroy); 3696 } 3697 3698 void 3699 clnt_cots_fini(void) 3700 { 3701 (void) zone_key_delete(zone_cots_key); 3702 } 3703 3704 /* 3705 * Wait for TPI ack, returns success only if expected ack is received 3706 * within timeout period. 3707 */ 3708 3709 static int 3710 waitforack(calllist_t *e, t_scalar_t ack_prim, const struct timeval *waitp, 3711 bool_t nosignal) 3712 { 3713 union T_primitives *tpr; 3714 clock_t timout; 3715 int cv_stat = 1; 3716 3717 ASSERT(MUTEX_HELD(&clnt_pending_lock)); 3718 while (e->call_reply == NULL) { 3719 if (waitp != NULL) { 3720 timout = waitp->tv_sec * drv_usectohz(MICROSEC) + 3721 drv_usectohz(waitp->tv_usec); 3722 if (nosignal) 3723 cv_stat = cv_reltimedwait(&e->call_cv, 3724 &clnt_pending_lock, timout, TR_CLOCK_TICK); 3725 else 3726 cv_stat = cv_reltimedwait_sig(&e->call_cv, 3727 &clnt_pending_lock, timout, TR_CLOCK_TICK); 3728 } else { 3729 if (nosignal) 3730 cv_wait(&e->call_cv, &clnt_pending_lock); 3731 else 3732 cv_stat = cv_wait_sig(&e->call_cv, 3733 &clnt_pending_lock); 3734 } 3735 if (cv_stat == -1) 3736 return (ETIME); 3737 if (cv_stat == 0) 3738 return (EINTR); 3739 /* 3740 * if we received an error from the server and we know a reply 3741 * is not going to be sent, do not wait for the full timeout, 3742 * return now. 
3743 */ 3744 if (e->call_status == RPC_XPRTFAILED) 3745 return (e->call_reason); 3746 } 3747 tpr = (union T_primitives *)e->call_reply->b_rptr; 3748 if (tpr->type == ack_prim) 3749 return (0); /* Success */ 3750 3751 if (tpr->type == T_ERROR_ACK) { 3752 if (tpr->error_ack.TLI_error == TSYSERR) 3753 return (tpr->error_ack.UNIX_error); 3754 else 3755 return (t_tlitosyserr(tpr->error_ack.TLI_error)); 3756 } 3757 3758 return (EPROTO); /* unknown or unexpected primitive */ 3759 } 3760