/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T
 * All Rights Reserved
 */

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */


/*
 * Implements a kernel based, client side RPC over Connection Oriented
 * Transports (COTS).
 */

/*
 * Much of this file has been re-written to let NFS work better over slow
 * transports. A description follows.
 *
 * One of the annoying things about kRPC/COTS is that it will temporarily
 * create more than one connection between a client and server. This
 * happens because when a connection is made, the end-point's entry in the
 * linked list of connections (headed by cm_hd) is removed so that other
 * threads don't mess with it. We went ahead and bit the bullet by keeping
 * the end-point on the connection list and introducing state bits,
 * condition variables etc. to the connection entry data structure (struct
 * cm_xprt).
 *
 * Here is a summary of the changes to cm_xprt:
 *
 *	x_ctime is the timestamp of when the endpoint was last
 *	connected or disconnected. If an end-point is ever disconnected
 *	or re-connected, then any outstanding RPC request is presumed
 *	lost, telling clnt_cots_kcallit that it needs to re-send the
 *	request, not just wait for the original request's reply to
 *	arrive.
 *
 *	x_thread flag which tells us if a thread is doing a connection attempt.
 *
 *	x_waitdis flag which tells us we are waiting for a disconnect ACK.
 *
 *	x_needdis flag which tells us we need to send a T_DISCON_REQ
 *	to kill the connection.
 *
 *	x_needrel flag which tells us we need to send a T_ORDREL_REQ to
 *	gracefully close the connection.
 *
 *	#defined bitmasks for all the b_* bits so that more
 *	efficient (and at times less clumsy) masks can be used to
 *	manipulate state in cases where multiple bits have to be
 *	set/cleared/checked in the same critical section.
 *
 *	x_conn_cv and x_dis_cv are new condition variables to let
 *	threads know when the connection attempt is done, and to let
 *	the connecting thread know when the disconnect handshake is
 *	done.
 *
 * Added the CONN_HOLD() macro so that all reference holds have the same
 * look and feel.
 *
 * In the private (cku_private) portion of the client handle,
 *
 *	cku_flags replaces cku_sent, a boolean. cku_flags keeps
 *	track of whether a request has been sent, and whether the
 *	client handle's call record is on the dispatch list (so that
 *	the reply can be matched by XID to the right client handle).
 *	The idea of CKU_ONQUEUE is that we can exit clnt_cots_kcallit()
 *	and still have the response find the right client handle so
 *	that the retry of CLNT_CALL() gets the result. Testing found
 *	situations where if the timeout was increased, performance
 *	degraded. This was due to us hitting a window where the thread
 *	was back in rfscall() (probably printing server not responding)
 *	while the response came back with no place to put it.
 *
 *	cku_ctime is just a cache of x_ctime. If they match,
 *	clnt_cots_kcallit() won't send a retry (unless the maximum
 *	receive count limit has been reached). If they don't match, then
 *	we assume the request has been lost, and a retry of the request
 *	is needed.
 *
 *	cku_recv_attempts counts the number of receive attempts made
 *	after one try is sent on the wire.
 *
 * Added the clnt_delay() routine so that interruptible and
 * noninterruptible delays are possible.
 *
 * CLNT_MIN_TIMEOUT has been bumped to 10 seconds from 3. This is used to
 * control how long the client delays before returning after getting
 * ECONNREFUSED. At 3 seconds, 8 client threads per mount really does bash
 * a server that may be booting and not yet have started nfsd.
 *
 * CLNT_MAXRECV_WITHOUT_RETRY is a new macro (value of 3) (with a tunable).
 * Why don't we just wait forever (receive an infinite # of times)?
 * Because the server may have rebooted. More insidious is that some
 * servers (ours) will drop NFS/TCP requests in some cases. This is bad,
 * but it is a reality.
 *
 * The case of a server doing orderly release really messes up the
 * client's recovery, especially if the server's TCP implementation is
 * buggy. It was found that the kRPC/COTS client was breaking some
 * TPI rules, such as not waiting for the acknowledgement of a
 * T_DISCON_REQ (hence the added case statements T_ERROR_ACK, T_OK_ACK and
 * T_DISCON_REQ in clnt_dispatch_notifyall()).
 *
 * One of the things that we've seen is that a kRPC TCP endpoint goes into
 * TIMEWAIT and thus a reconnect takes a long time to satisfy because
 * the TIMEWAIT state takes a while to finish. If a server sends a
 * T_ORDREL_IND, there is little point in an RPC client doing a
 * T_ORDREL_REQ, because the RPC request isn't going to make it (the
 * server is saying that it won't accept any more data). So kRPC was
 * changed to send a T_DISCON_REQ when we get a T_ORDREL_IND. So now the
 * connection skips the TIMEWAIT state and goes straight to a bound state
 * from which kRPC can quickly switch to connected.
 *
 * Code that issues TPI requests must use waitforack() to wait for the
 * corresponding ack (assuming there is one) in any future modifications.
 * This works around problems that may be introduced by breaking TPI rules
 * (by submitting new calls before earlier requests have been acked) in the
 * case of a signal or other early return. waitforack() depends on
 * clnt_dispatch_notifyconn() to issue the wakeup when the ack
 * arrives, so adding new TPI calls may require corresponding changes
 * to clnt_dispatch_notifyconn(). Presently, the timeout period is based on
 * CLNT_MIN_TIMEOUT which is 10 seconds. If you modify this value, be sure
 * not to set it too low or TPI ACKs will be lost.
 */
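
/*
 * Illustrative sketch only (hedged; connmgr_snddis() and waitforack()
 * later in this module are the authoritative pattern): a TPI request
 * followed by the mandatory wait for its ack, as described above,
 * might look like
 *
 *	put(cm_entry->x_wq, mp);	send the T_DISCON_REQ mblk
 *	(void) waitforack(&call, T_OK_ACK, waitp, nosignal);
 *
 * where the calllist_t has been registered so that
 * clnt_dispatch_notifyconn() can wake the waiter when the ack arrives.
 */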

#include <sys/param.h>
#include <sys/types.h>
#include <sys/user.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/file.h>
#include <sys/stream.h>
#include <sys/strsubr.h>
#include <sys/stropts.h>
#include <sys/strsun.h>
#include <sys/timod.h>
#include <sys/tiuser.h>
#include <sys/tihdr.h>
#include <sys/t_kuser.h>
#include <sys/fcntl.h>
#include <sys/errno.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/systm.h>
#include <sys/kstat.h>
#include <sys/t_lock.h>
#include <sys/ddi.h>
#include <sys/cmn_err.h>
#include <sys/time.h>
#include <sys/isa_defs.h>
#include <sys/callb.h>
#include <sys/sunddi.h>
#include <sys/atomic.h>
#include <sys/sdt.h>

#include <netinet/in.h>
#include <netinet/tcp.h>

#include <rpc/types.h>
#include <rpc/xdr.h>
#include <rpc/auth.h>
#include <rpc/clnt.h>
#include <rpc/rpc_msg.h>

#define	COTS_DEFAULT_ALLOCSIZE	2048

#define	WIRE_HDR_SIZE	20	/* serialized call header, sans proc number */
#define	MSG_OFFSET	128	/* offset of call into the mblk */

const char *kinet_ntop6(uchar_t *, char *, size_t);

static int	clnt_cots_ksettimers(CLIENT *, struct rpc_timers *,
    struct rpc_timers *, int, void(*)(int, int, caddr_t), caddr_t, uint32_t);
static enum clnt_stat	clnt_cots_kcallit(CLIENT *, rpcproc_t, xdrproc_t,
    caddr_t, xdrproc_t, caddr_t, struct timeval);
static void	clnt_cots_kabort(CLIENT *);
static void	clnt_cots_kerror(CLIENT *, struct rpc_err *);
static bool_t	clnt_cots_kfreeres(CLIENT *, xdrproc_t, caddr_t);
static void	clnt_cots_kdestroy(CLIENT *);
static bool_t	clnt_cots_kcontrol(CLIENT *, int, char *);

/*
 * List of transports managed by the connection manager.
 */
struct cm_xprt {
	TIUSER		*x_tiptr;	/* transport handle */
	queue_t		*x_wq;		/* send queue */
	clock_t		x_time;		/* last time we handed this xprt out */
	clock_t		x_ctime;	/* time we went to CONNECTED */
	int		x_tidu_size;	/* TIDU size of this transport */
	union {
	    struct {
		unsigned int
#ifdef	_BIT_FIELDS_HTOL
		b_closing:	1,	/* we've sent an ord rel on this conn */
		b_dead:		1,	/* transport is closed or disconn */
		b_doomed:	1,	/* too many conns, let this go idle */
		b_connected:	1,	/* this connection is connected */

		b_ordrel:	1,	/* do an orderly release? */
		b_thread:	1,	/* thread doing connect */
		b_waitdis:	1,	/* waiting for disconnect ACK */
		b_needdis:	1,	/* need T_DISCON_REQ */

		b_needrel:	1,	/* need T_ORDREL_REQ */
		b_early_disc:	1,	/* got a T_ORDREL_IND or T_DISCON_IND */
					/* disconnect during connect */

		b_pad:		22;

#endif

#ifdef	_BIT_FIELDS_LTOH
		b_pad:		22,

		b_early_disc:	1,	/* got a T_ORDREL_IND or T_DISCON_IND */
					/* disconnect during connect */
		b_needrel:	1,	/* need T_ORDREL_REQ */

		b_needdis:	1,	/* need T_DISCON_REQ */
		b_waitdis:	1,	/* waiting for disconnect ACK */
		b_thread:	1,	/* thread doing connect */
		b_ordrel:	1,	/* do an orderly release? */

		b_connected:	1,	/* this connection is connected */
		b_doomed:	1,	/* too many conns, let this go idle */
		b_dead:		1,	/* transport is closed or disconn */
		b_closing:	1;	/* we've sent an ord rel on this conn */
#endif
	    } bit;	    unsigned int word;

#define	x_closing	x_state.bit.b_closing
#define	x_dead		x_state.bit.b_dead
#define	x_doomed	x_state.bit.b_doomed
#define	x_connected	x_state.bit.b_connected

#define	x_ordrel	x_state.bit.b_ordrel
#define	x_thread	x_state.bit.b_thread
#define	x_waitdis	x_state.bit.b_waitdis
#define	x_needdis	x_state.bit.b_needdis

#define	x_needrel	x_state.bit.b_needrel
#define	x_early_disc	x_state.bit.b_early_disc

#define	x_state_flags	x_state.word

#define	X_CLOSING	0x80000000
#define	X_DEAD		0x40000000
#define	X_DOOMED	0x20000000
#define	X_CONNECTED	0x10000000

#define	X_ORDREL	0x08000000
#define	X_THREAD	0x04000000
#define	X_WAITDIS	0x02000000
#define	X_NEEDDIS	0x01000000

#define	X_NEEDREL	0x00800000
#define	X_EARLYDISC	0x00400000

#define	X_BADSTATES	(X_CLOSING | X_DEAD | X_DOOMED)

	} x_state;
	int		x_ref;		/* number of users of this xprt */
	int		x_family;	/* address family of transport */
	dev_t		x_rdev;		/* device number of transport */
	struct cm_xprt	*x_next;

	struct netbuf	x_server;	/* destination address */
	struct netbuf	x_src;		/* src address (for retries) */
	kmutex_t	x_lock;		/* lock on this entry */
	kcondvar_t	x_cv;		/* to signal when can be closed */
	kcondvar_t	x_conn_cv;	/* to signal when connection attempt */
					/* is complete */
	kstat_t		*x_ksp;

	kcondvar_t	x_dis_cv;	/* to signal when disconnect attempt */
					/* is complete */
	zoneid_t	x_zoneid;	/* zone this xprt belongs to */
};
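
/*
 * Illustrative sketch only (an assumption, not code from the original
 * file): because the b_* bits overlay the "word" member, state tests
 * that involve several bits can be done with one masked compare, e.g.
 *
 *	if ((cm_entry->x_state_flags & (X_BADSTATES | X_CONNECTED)) ==
 *	    X_CONNECTED)
 *		connection is connected and not closing/dead/doomed
 *
 * connmgr_cwait() below uses exactly this idiom.
 */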

typedef struct cm_kstat_xprt {
	kstat_named_t	x_wq;
	kstat_named_t	x_server;
	kstat_named_t	x_family;
	kstat_named_t	x_rdev;
	kstat_named_t	x_time;
	kstat_named_t	x_state;
	kstat_named_t	x_ref;
	kstat_named_t	x_port;
} cm_kstat_xprt_t;

static cm_kstat_xprt_t cm_kstat_template = {
	{ "write_queue", KSTAT_DATA_UINT32 },
	{ "server", KSTAT_DATA_STRING },
	{ "addr_family", KSTAT_DATA_UINT32 },
	{ "device", KSTAT_DATA_UINT32 },
	{ "time_stamp", KSTAT_DATA_UINT32 },
	{ "status", KSTAT_DATA_UINT32 },
	{ "ref_count", KSTAT_DATA_INT32 },
	{ "port", KSTAT_DATA_UINT32 },
};

/*
 * The inverse of this is connmgr_release().
 */
#define	CONN_HOLD(Cm_entry)	{\
	mutex_enter(&(Cm_entry)->x_lock);	\
	(Cm_entry)->x_ref++;	\
	mutex_exit(&(Cm_entry)->x_lock);	\
}
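
/*
 * Illustrative sketch only (an assumption drawn from the uses later in
 * this file): every CONN_HOLD() must be balanced by a connmgr_release()
 * once the caller is done with the transport, e.g.
 *
 *	CONN_HOLD(cm_entry);
 *	connmgr_dis_and_wait(cm_entry);
 *	connmgr_release(cm_entry);
 *
 * connmgr_get() uses this pairing when garbage collecting connections
 * that need a disconnect.
 */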

/*
 * Private data per rpc handle.  This structure is allocated by
 * clnt_cots_kcreate, and freed by clnt_cots_kdestroy.
 */
typedef struct cku_private_s {
	CLIENT			cku_client;	/* client handle */
	calllist_t		cku_call;	/* for dispatching calls */
	struct rpc_err		cku_err;	/* error status */

	struct netbuf		cku_srcaddr;	/* source address for retries */
	int			cku_addrfmly;	/* for binding port */
	struct netbuf		cku_addr;	/* remote address */
	dev_t			cku_device;	/* device to use */
	uint_t			cku_flags;
#define	CKU_ONQUEUE		0x1
#define	CKU_SENT		0x2

	bool_t			cku_progress;	/* for CLSET_PROGRESS */
	uint32_t		cku_xid;	/* current XID */
	clock_t			cku_ctime;	/* time stamp of when */
						/* connection was created */
	uint_t			cku_recv_attempts;
	XDR			cku_outxdr;	/* xdr routine for output */
	XDR			cku_inxdr;	/* xdr routine for input */
	char			cku_rpchdr[WIRE_HDR_SIZE + 4];
						/* pre-serialized rpc header */

	uint_t			cku_outbuflen;	/* default output mblk length */
	struct cred		*cku_cred;	/* credentials */
	bool_t			cku_nodelayonerr;
						/* for CLSET_NODELAYONERR */
	int			cku_useresvport; /* Use reserved port */
	struct rpc_cots_client	*cku_stats;	/* stats for zone */
} cku_private_t;
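
/*
 * Illustrative sketch only (an assumption summarizing the transitions
 * coded in clnt_cots_kcallit()): cku_flags for a single request moves
 * roughly as follows:
 *
 *	0				nothing sent yet (new XID)
 *	CKU_ONQUEUE | CKU_SENT		request sent; the call record is
 *					on the dispatch list so a reply
 *					can be matched by XID even after
 *					clnt_cots_kcallit() returns
 *
 * CKU_ONQUEUE is cleared (via call_table_remove()) on non-timeout
 * errors, when a new XID is allocated, and when the record is removed
 * at handle destruction.
 */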

static struct cm_xprt *connmgr_wrapconnect(struct cm_xprt *,
    const struct timeval *, struct netbuf *, int, struct netbuf *,
    struct rpc_err *, bool_t, bool_t, cred_t *);

static bool_t	connmgr_connect(struct cm_xprt *, queue_t *, struct netbuf *,
    int, calllist_t *, int *, bool_t reconnect,
    const struct timeval *, bool_t, cred_t *);

static void	*connmgr_opt_getoff(mblk_t *mp, t_uscalar_t offset,
    t_uscalar_t length, uint_t align_size);
static bool_t	connmgr_setbufsz(calllist_t *e, queue_t *wq, cred_t *cr);
static bool_t	connmgr_getopt_int(queue_t *wq, int level, int name, int *val,
    calllist_t *e, cred_t *cr);
static bool_t	connmgr_setopt_int(queue_t *wq, int level, int name, int val,
    calllist_t *e, cred_t *cr);
static bool_t	connmgr_setopt(queue_t *, int, int, calllist_t *, cred_t *cr);
static void	connmgr_sndrel(struct cm_xprt *);
static void	connmgr_snddis(struct cm_xprt *);
static void	connmgr_close(struct cm_xprt *);
static void	connmgr_release(struct cm_xprt *);
static struct cm_xprt *connmgr_wrapget(struct netbuf *, const struct timeval *,
    cku_private_t *);

static struct cm_xprt *connmgr_get(struct netbuf *, const struct timeval *,
    struct netbuf *, int, struct netbuf *, struct rpc_err *, dev_t,
    bool_t, int, cred_t *);

static void connmgr_cancelconn(struct cm_xprt *);
static enum clnt_stat connmgr_cwait(struct cm_xprt *, const struct timeval *,
    bool_t);
static void connmgr_dis_and_wait(struct cm_xprt *);

static int	clnt_dispatch_send(queue_t *, mblk_t *, calllist_t *, uint_t,
    uint_t);

static int clnt_delay(clock_t, bool_t);

static int waitforack(calllist_t *, t_scalar_t, const struct timeval *, bool_t);

/*
 * Operations vector for TCP/IP based RPC
 */
static struct clnt_ops tcp_ops = {
	clnt_cots_kcallit,	/* do rpc call */
	clnt_cots_kabort,	/* abort call */
	clnt_cots_kerror,	/* return error status */
	clnt_cots_kfreeres,	/* free results */
	clnt_cots_kdestroy,	/* destroy rpc handle */
	clnt_cots_kcontrol,	/* the ioctl() of rpc */
	clnt_cots_ksettimers,	/* set retry timers */
};
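
/*
 * Illustrative note (an assumption based on the CLNT_* macros in
 * <rpc/clnt.h>): consumers do not call the functions above directly;
 * they go through the ops vector, e.g.
 *
 *	status = CLNT_CALL(h, proc, xdr_args, argsp,
 *	    xdr_results, resultsp, wait);
 *
 * expands to (*(h)->cl_ops->cl_call)(...), which for a COTS handle is
 * clnt_cots_kcallit().
 */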

static int rpc_kstat_instance = 0;	/* keeps the current instance */
					/* number for the next kstat_create */

static struct cm_xprt *cm_hd = NULL;
static kmutex_t connmgr_lock;	/* for connection mngr's list of transports */

extern kmutex_t clnt_max_msg_lock;

static calllist_t *clnt_pending = NULL;
extern kmutex_t clnt_pending_lock;

static int clnt_cots_hash_size = DEFAULT_HASH_SIZE;

static call_table_t *cots_call_ht;

static const struct rpc_cots_client {
	kstat_named_t	rccalls;
	kstat_named_t	rcbadcalls;
	kstat_named_t	rcbadxids;
	kstat_named_t	rctimeouts;
	kstat_named_t	rcnewcreds;
	kstat_named_t	rcbadverfs;
	kstat_named_t	rctimers;
	kstat_named_t	rccantconn;
	kstat_named_t	rcnomem;
	kstat_named_t	rcintrs;
} cots_rcstat_tmpl = {
	{ "calls",	KSTAT_DATA_UINT64 },
	{ "badcalls",	KSTAT_DATA_UINT64 },
	{ "badxids",	KSTAT_DATA_UINT64 },
	{ "timeouts",	KSTAT_DATA_UINT64 },
	{ "newcreds",	KSTAT_DATA_UINT64 },
	{ "badverfs",	KSTAT_DATA_UINT64 },
	{ "timers",	KSTAT_DATA_UINT64 },
	{ "cantconn",	KSTAT_DATA_UINT64 },
	{ "nomem",	KSTAT_DATA_UINT64 },
	{ "interrupts", KSTAT_DATA_UINT64 }
};

#define	COTSRCSTAT_INCR(p, x)	\
	atomic_inc_64(&(p)->x.value.ui64)

#define	CLNT_MAX_CONNS	1	/* concurrent connections between clnt/srvr */
int clnt_max_conns = CLNT_MAX_CONNS;

#define	CLNT_MIN_TIMEOUT	10	/* seconds to wait after we get a */
					/* connection reset */
#define	CLNT_MIN_CONNTIMEOUT	5	/* seconds to wait for a connection */

int clnt_cots_min_tout = CLNT_MIN_TIMEOUT;
int clnt_cots_min_conntout = CLNT_MIN_CONNTIMEOUT;

/*
 * Limit the number of times we will attempt to receive a reply without
 * re-sending the request.
 */
#define	CLNT_MAXRECV_WITHOUT_RETRY	3
uint_t clnt_cots_maxrecv	= CLNT_MAXRECV_WITHOUT_RETRY;

uint_t *clnt_max_msg_sizep;
void (*clnt_stop_idle)(queue_t *wq);

#define	ptoh(p)		(&((p)->cku_client))
#define	htop(h)		((cku_private_t *)((h)->cl_private))

/*
 * Times to retry
 */
#define	REFRESHES	2	/* authentication refreshes */

/*
 * The following is used to determine the global default behavior for
 * COTS when binding to a local port.
 *
 * If the value is set to 1 the default will be to select a reserved
 * (aka privileged) port, if the value is zero the default will be to
 * use non-reserved ports.  Users of kRPC may override this by using
 * CLNT_CONTROL() and CLSET_BINDRESVPORT.
 */
int clnt_cots_do_bindresvport = 1;

static zone_key_t zone_cots_key;

/*
 * Default TCP send and receive buffer sizes for RPC connections.
 * These values can be tuned by /etc/system.
 */
int rpc_send_bufsz = 1024*1024;
int rpc_recv_bufsz = 1024*1024;
/*
 * To use system-wide default for TCP send and receive buffer size,
 * use /etc/system to set rpc_default_tcp_bufsz to 1:
 *
 * set rpcmod:rpc_default_tcp_bufsz=1
 */
int rpc_default_tcp_bufsz = 0;
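
/*
 * Illustrative note (hedged: the variable-to-module mapping is an
 * assumption patterned on the rpc_default_tcp_bufsz example above):
 * the other integer knobs in this file, such as clnt_cots_maxrecv and
 * clnt_max_conns, can likewise be tuned from /etc/system, e.g.
 *
 *	set rpcmod:clnt_cots_maxrecv=5
 *	set rpcmod:clnt_max_conns=2
 */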

/*
 * We need to do this after all kernel threads in the zone have exited.
 */
/* ARGSUSED */
static void
clnt_zone_destroy(zoneid_t zoneid, void *unused)
{
	struct cm_xprt **cmp;
	struct cm_xprt *cm_entry;
	struct cm_xprt *freelist = NULL;

	mutex_enter(&connmgr_lock);
	cmp = &cm_hd;
	while ((cm_entry = *cmp) != NULL) {
		if (cm_entry->x_zoneid == zoneid) {
			*cmp = cm_entry->x_next;
			cm_entry->x_next = freelist;
			freelist = cm_entry;
		} else {
			cmp = &cm_entry->x_next;
		}
	}
	mutex_exit(&connmgr_lock);
	while ((cm_entry = freelist) != NULL) {
		freelist = cm_entry->x_next;
		connmgr_close(cm_entry);
	}
}

int
clnt_cots_kcreate(dev_t dev, struct netbuf *addr, int family, rpcprog_t prog,
    rpcvers_t vers, uint_t max_msgsize, cred_t *cred, CLIENT **ncl)
{
	CLIENT *h;
	cku_private_t *p;
	struct rpc_msg call_msg;
	struct rpcstat *rpcstat;

	RPCLOG(8, "clnt_cots_kcreate: prog %u\n", prog);

	rpcstat = zone_getspecific(rpcstat_zone_key, rpc_zone());
	ASSERT(rpcstat != NULL);

	/* Allocate and initialize the client handle. */
	p = kmem_zalloc(sizeof (*p), KM_SLEEP);

	h = ptoh(p);

	h->cl_private = (caddr_t)p;
	h->cl_auth = authkern_create();
	h->cl_ops = &tcp_ops;

	cv_init(&p->cku_call.call_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&p->cku_call.call_lock, NULL, MUTEX_DEFAULT, NULL);

	/*
	 * If the current sanity check size in rpcmod is smaller
	 * than the size needed, then increase the sanity check.
	 */
	if (max_msgsize != 0 && clnt_max_msg_sizep != NULL &&
	    max_msgsize > *clnt_max_msg_sizep) {
		mutex_enter(&clnt_max_msg_lock);
		if (max_msgsize > *clnt_max_msg_sizep)
			*clnt_max_msg_sizep = max_msgsize;
		mutex_exit(&clnt_max_msg_lock);
	}

	p->cku_outbuflen = COTS_DEFAULT_ALLOCSIZE;

	/* Preserialize the call message header */

	call_msg.rm_xid = 0;
	call_msg.rm_direction = CALL;
	call_msg.rm_call.cb_rpcvers = RPC_MSG_VERSION;
	call_msg.rm_call.cb_prog = prog;
	call_msg.rm_call.cb_vers = vers;

	xdrmem_create(&p->cku_outxdr, p->cku_rpchdr, WIRE_HDR_SIZE, XDR_ENCODE);

	if (!xdr_callhdr(&p->cku_outxdr, &call_msg)) {
		XDR_DESTROY(&p->cku_outxdr);
		RPCLOG0(1, "clnt_cots_kcreate - Fatal header serialization "
		    "error\n");
		auth_destroy(h->cl_auth);
		kmem_free(p, sizeof (cku_private_t));
		RPCLOG0(1, "clnt_cots_kcreate: create failed error EINVAL\n");
		return (EINVAL);		/* XXX */
	}
	XDR_DESTROY(&p->cku_outxdr);

	/*
	 * The zalloc initialized the fields below.
	 * p->cku_xid = 0;
	 * p->cku_flags = 0;
	 * p->cku_srcaddr.len = 0;
	 * p->cku_srcaddr.maxlen = 0;
	 */

	p->cku_cred = cred;
	p->cku_device = dev;
	p->cku_addrfmly = family;
	p->cku_addr.buf = kmem_zalloc(addr->maxlen, KM_SLEEP);
	p->cku_addr.maxlen = addr->maxlen;
	p->cku_addr.len = addr->len;
	bcopy(addr->buf, p->cku_addr.buf, addr->len);
	p->cku_stats = rpcstat->rpc_cots_client;
	p->cku_useresvport = -1; /* value has not been set */

	*ncl = h;
	return (0);
}
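
/*
 * Illustrative sketch only (hypothetical caller; the device and server
 * address setup is elided, and the program constants are examples): a
 * kRPC consumer creates a COTS handle and then issues calls through
 * the ops vector:
 *
 *	CLIENT *client;
 *	int error;
 *
 *	error = clnt_cots_kcreate(dev, &srvaddr, AF_INET, NFS_PROGRAM,
 *	    NFS_VERSION, 0, cred, &client);
 *	if (error == 0) {
 *		... CLNT_CALL(client, ...); ...
 *		CLNT_DESTROY(client);
 *	}
 */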

/*ARGSUSED*/
static void
clnt_cots_kabort(CLIENT *h)
{
}

/*
 * Return error info on this handle.
 */
static void
clnt_cots_kerror(CLIENT *h, struct rpc_err *err)
{
	/* LINTED pointer alignment */
	cku_private_t *p = htop(h);

	*err = p->cku_err;
}

/*ARGSUSED*/
static bool_t
clnt_cots_kfreeres(CLIENT *h, xdrproc_t xdr_res, caddr_t res_ptr)
{
	xdr_free(xdr_res, res_ptr);

	return (TRUE);
}

static bool_t
clnt_cots_kcontrol(CLIENT *h, int cmd, char *arg)
{
	cku_private_t *p = htop(h);

	switch (cmd) {
	case CLSET_PROGRESS:
		p->cku_progress = TRUE;
		return (TRUE);

	case CLSET_XID:
		if (arg == NULL)
			return (FALSE);

		p->cku_xid = *((uint32_t *)arg);
		return (TRUE);

	case CLGET_XID:
		if (arg == NULL)
			return (FALSE);

		*((uint32_t *)arg) = p->cku_xid;
		return (TRUE);

	case CLSET_NODELAYONERR:
		if (arg == NULL)
			return (FALSE);

		if (*((bool_t *)arg) == TRUE) {
			p->cku_nodelayonerr = TRUE;
			return (TRUE);
		}
		if (*((bool_t *)arg) == FALSE) {
			p->cku_nodelayonerr = FALSE;
			return (TRUE);
		}
		return (FALSE);

	case CLGET_NODELAYONERR:
		if (arg == NULL)
			return (FALSE);

		*((bool_t *)arg) = p->cku_nodelayonerr;
		return (TRUE);

	case CLSET_BINDRESVPORT:
		if (arg == NULL)
			return (FALSE);

		if (*(int *)arg != 1 && *(int *)arg != 0)
			return (FALSE);

		p->cku_useresvport = *(int *)arg;

		return (TRUE);

	case CLGET_BINDRESVPORT:
		if (arg == NULL)
			return (FALSE);

		*(int *)arg = p->cku_useresvport;

		return (TRUE);

	default:
		return (FALSE);
	}
}
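
/*
 * Illustrative sketch only (an assumption based on the CLNT_CONTROL()
 * macro in <rpc/clnt.h>): overriding the reserved-port default on one
 * handle, per the clnt_cots_do_bindresvport comment above:
 *
 *	int one = 1;
 *
 *	if (!CLNT_CONTROL(client, CLSET_BINDRESVPORT, (char *)&one))
 *		... handle the failure ...
 */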

/*
 * Destroy rpc handle.  Frees the space used for output buffer,
 * private data, and handle structure.
 */
static void
clnt_cots_kdestroy(CLIENT *h)
{
	/* LINTED pointer alignment */
	cku_private_t *p = htop(h);
	calllist_t *call = &p->cku_call;

	RPCLOG(8, "clnt_cots_kdestroy h: %p\n", (void *)h);
	RPCLOG(8, "clnt_cots_kdestroy h: xid=0x%x\n", p->cku_xid);

	if (p->cku_flags & CKU_ONQUEUE) {
		RPCLOG(64, "clnt_cots_kdestroy h: removing call for xid 0x%x "
		    "from dispatch list\n", p->cku_xid);
		call_table_remove(call);
	}

	if (call->call_reply)
		freemsg(call->call_reply);
	cv_destroy(&call->call_cv);
	mutex_destroy(&call->call_lock);

	kmem_free(p->cku_srcaddr.buf, p->cku_srcaddr.maxlen);
	kmem_free(p->cku_addr.buf, p->cku_addr.maxlen);
	kmem_free(p, sizeof (*p));
}

static int clnt_cots_pulls;

#define	RM_HDR_SIZE	4	/* record mark header size */

/*
 * Call remote procedure.
 */
static enum clnt_stat
clnt_cots_kcallit(CLIENT *h, rpcproc_t procnum, xdrproc_t xdr_args,
    caddr_t argsp, xdrproc_t xdr_results, caddr_t resultsp,
    struct timeval wait)
{
	/* LINTED pointer alignment */
	cku_private_t *p = htop(h);
	calllist_t *call = &p->cku_call;
	XDR *xdrs;
	struct rpc_msg reply_msg;
	mblk_t *mp;
#ifdef	RPCDEBUG
	clock_t time_sent;
#endif
	struct netbuf *retryaddr;
	struct cm_xprt *cm_entry = NULL;
	queue_t *wq;
	int len, waitsecs, max_waitsecs;
	int mpsize;
	int refreshes = REFRESHES;
	int interrupted;
	int tidu_size;
	enum clnt_stat status;
	struct timeval cwait;
	bool_t delay_first = FALSE;
	clock_t ticks, now;

	RPCLOG(2, "clnt_cots_kcallit, procnum %u\n", procnum);
	COTSRCSTAT_INCR(p->cku_stats, rccalls);

	RPCLOG(2, "clnt_cots_kcallit: wait.tv_sec: %ld\n", wait.tv_sec);
	RPCLOG(2, "clnt_cots_kcallit: wait.tv_usec: %ld\n", wait.tv_usec);
	/*
	 * Bug ID 1240234:
	 * Look out for zero length timeouts. We don't want to
	 * wait zero seconds for a connection to be established.
	 */
	if (wait.tv_sec < clnt_cots_min_conntout) {
		cwait.tv_sec = clnt_cots_min_conntout;
		cwait.tv_usec = 0;
		RPCLOG(8, "clnt_cots_kcallit: wait.tv_sec (%ld) too low,",
		    wait.tv_sec);
		RPCLOG(8, " setting to: %d\n", clnt_cots_min_conntout);
	} else {
		cwait = wait;
	}

call_again:
	if (cm_entry) {
		connmgr_release(cm_entry);
		cm_entry = NULL;
	}

	mp = NULL;

	/*
	 * If the call is not a retry, allocate a new xid and cache it
	 * for future retries.
	 * Bug ID 1246045:
	 * Treat call as a retry for purposes of binding the source
	 * port only if we actually attempted to send anything on
	 * the previous call.
	 */
	if (p->cku_xid == 0) {
		p->cku_xid = alloc_xid();
		call->call_zoneid = rpc_zoneid();

		/*
		 * We need to ASSERT here that our xid != 0 because this
		 * determines whether or not our call record gets placed on
		 * the hash table or the linked list.  By design, we mandate
		 * that RPC calls over cots must have xid's != 0, so we can
		 * ensure proper management of the hash table.
		 */
		ASSERT(p->cku_xid != 0);

		retryaddr = NULL;
		p->cku_flags &= ~CKU_SENT;

		if (p->cku_flags & CKU_ONQUEUE) {
			RPCLOG(8, "clnt_cots_kcallit: new call, dequeuing old"
			    " one (%p)\n", (void *)call);
			call_table_remove(call);
			p->cku_flags &= ~CKU_ONQUEUE;
			RPCLOG(64, "clnt_cots_kcallit: removing call from "
			    "dispatch list because xid was zero (now 0x%x)\n",
			    p->cku_xid);
		}

		if (call->call_reply != NULL) {
			freemsg(call->call_reply);
			call->call_reply = NULL;
		}
	} else if (p->cku_srcaddr.buf == NULL || p->cku_srcaddr.len == 0) {
		retryaddr = NULL;

	} else if (p->cku_flags & CKU_SENT) {
		retryaddr = &p->cku_srcaddr;

	} else {
		/*
		 * Bug ID 1246045: Nothing was sent, so set retryaddr to
		 * NULL and let connmgr_get() bind to any source port it
		 * can get.
		 */
		retryaddr = NULL;
	}

	RPCLOG(64, "clnt_cots_kcallit: xid = 0x%x", p->cku_xid);
	RPCLOG(64, " flags = 0x%x\n", p->cku_flags);

	p->cku_err.re_status = RPC_TIMEDOUT;
	p->cku_err.re_errno = p->cku_err.re_terrno = 0;

	cm_entry = connmgr_wrapget(retryaddr, &cwait, p);

	if (cm_entry == NULL) {
		RPCLOG(1, "clnt_cots_kcallit: can't connect status %s\n",
		    clnt_sperrno(p->cku_err.re_status));

		/*
		 * The reasons why we fail to create a connection are
		 * varied. In most cases we don't want the caller to
		 * immediately retry. This could have one or more
		 * bad effects. This includes flooding the net with
		 * connect requests to ports with no listener; a hard
		 * kernel loop due to all the "reserved" TCP ports being
		 * in use.
		 */
		delay_first = TRUE;

		/*
		 * Even if we end up returning EINTR, we still count a
		 * "can't connect", because the connection manager
		 * might have been committed to waiting for or timing out on
		 * a connection.
		 */
		COTSRCSTAT_INCR(p->cku_stats, rccantconn);
		switch (p->cku_err.re_status) {
		case RPC_INTR:
			p->cku_err.re_errno = EINTR;

			/*
			 * No need to delay because a UNIX signal(2)
			 * interrupted us. The caller likely won't
			 * retry the CLNT_CALL() and even if it does,
			 * we assume the caller knows what it is doing.
			 */
			delay_first = FALSE;
			break;

		case RPC_TIMEDOUT:
			p->cku_err.re_errno = ETIMEDOUT;

			/*
			 * No need to delay because we already timed out
			 * on the connection request and assume that the
			 * transport timeout is longer than our minimum
			 * timeout, or at least not too much smaller.
			 */
			delay_first = FALSE;
			break;

		case RPC_SYSTEMERROR:
		case RPC_TLIERROR:
			/*
			 * We want to delay here because a transient
			 * system error has a better chance of going away
			 * if we delay a bit. If it's not transient, then
			 * we don't want to end up in a hard kernel loop
			 * due to retries.
			 */
			ASSERT(p->cku_err.re_errno != 0);
			break;


		case RPC_CANTCONNECT:
			/*
			 * RPC_CANTCONNECT is set on T_ERROR_ACK which
			 * implies some error down in the TCP layer or
			 * below. If cku_nodelayonerr is set then we
			 * assume the caller knows not to try too hard.
			 */
			RPCLOG0(8, "clnt_cots_kcallit: connection failed,");
			RPCLOG0(8, " re_status=RPC_CANTCONNECT,");
			RPCLOG(8, " re_errno=%d,", p->cku_err.re_errno);
			RPCLOG(8, " cku_nodelayonerr=%d", p->cku_nodelayonerr);
			if (p->cku_nodelayonerr == TRUE)
				delay_first = FALSE;

			p->cku_err.re_errno = EIO;

			break;

		case RPC_XPRTFAILED:
			/*
			 * We want to delay here because we likely
			 * got a refused connection.
			 */
			if (p->cku_err.re_errno == 0)
				p->cku_err.re_errno = EIO;

			RPCLOG(1, "clnt_cots_kcallit: transport failed: %d\n",
			    p->cku_err.re_errno);

			break;

		default:
			/*
			 * We delay here because it is better to err
			 * on the side of caution. If we got here then
			 * status could have been RPC_SUCCESS, but we
			 * know that we did not get a connection, so
			 * force the rpc status to RPC_CANTCONNECT.
			 */
			p->cku_err.re_status = RPC_CANTCONNECT;
			p->cku_err.re_errno = EIO;
			break;
		}
		if (delay_first == TRUE)
			ticks = clnt_cots_min_tout * drv_usectohz(1000000);
		goto cots_done;
	}

	/*
	 * If we've never sent any request on this connection (send count
	 * is zero, or the connection has been reset), cache the
	 * connection's create time and send a request (possibly a retry).
	 */
	if ((p->cku_flags & CKU_SENT) == 0 ||
	    p->cku_ctime != cm_entry->x_ctime) {
		p->cku_ctime = cm_entry->x_ctime;

	} else if ((p->cku_flags & CKU_SENT) && (p->cku_flags & CKU_ONQUEUE) &&
	    (call->call_reply != NULL ||
	    p->cku_recv_attempts < clnt_cots_maxrecv)) {

		/*
		 * If we've sent a request and our call is on the dispatch
		 * queue and we haven't made too many receive attempts, then
		 * don't re-send, just receive.
		 */
		p->cku_recv_attempts++;
		goto read_again;
	}

	/*
	 * Now we create the RPC request in a STREAMS message. We have to do
	 * this after the call to connmgr_get so that we have the correct
	 * TIDU size for the transport.
	 */
	tidu_size = cm_entry->x_tidu_size;
	len = MSG_OFFSET + MAX(tidu_size, RM_HDR_SIZE + WIRE_HDR_SIZE);

	while ((mp = allocb(len, BPRI_MED)) == NULL) {
		if (strwaitbuf(len, BPRI_MED)) {
			p->cku_err.re_status = RPC_SYSTEMERROR;
			p->cku_err.re_errno = ENOSR;
			COTSRCSTAT_INCR(p->cku_stats, rcnomem);
			goto cots_done;
		}
	}
	xdrs = &p->cku_outxdr;
	xdrmblk_init(xdrs, mp, XDR_ENCODE, tidu_size);
	mpsize = MBLKSIZE(mp);
	ASSERT(mpsize >= len);
	ASSERT(mp->b_rptr == mp->b_datap->db_base);

	/*
	 * If the size of mblk is not appreciably larger than what we
	 * asked, then resize the mblk to exactly len bytes. The reason for
	 * this: suppose len is 1600 bytes, the tidu is 1460 bytes
	 * (from TCP over ethernet), and the arguments to the RPC require
	 * 2800 bytes. Ideally we want the protocol to render two
	 * ~1400 byte segments over the wire. However if allocb() gives us a 2k
	 * mblk, and we allocate a second mblk for the remainder, the protocol
	 * module may generate 3 segments over the wire:
	 * 1460 bytes for the first, 448 (2048 - 1600) for the second, and
	 * 892 for the third. If we "waste" 448 bytes in the first mblk,
	 * the XDR encoding will generate two ~1400 byte mblks, and the
	 * protocol module is more likely to produce properly sized segments.
	 */
	if ((mpsize >> 1) <= len)
		mp->b_rptr += (mpsize - len);

	/*
	 * Adjust b_rptr to reserve space for the non-data protocol headers
	 * any downstream modules might like to add, and for the
	 * record marking header.
	 */
	mp->b_rptr += (MSG_OFFSET + RM_HDR_SIZE);

	if (h->cl_auth->ah_cred.oa_flavor != RPCSEC_GSS) {
		/* Copy in the preserialized RPC header information. */
		bcopy(p->cku_rpchdr, mp->b_rptr, WIRE_HDR_SIZE);

		/* Use XDR_SETPOS() to set the b_wptr to past the RPC header. */
		XDR_SETPOS(xdrs, (uint_t)(mp->b_rptr - mp->b_datap->db_base +
		    WIRE_HDR_SIZE));

		ASSERT((mp->b_wptr - mp->b_rptr) == WIRE_HDR_SIZE);

		/* Serialize the procedure number and the arguments. */
		if ((!XDR_PUTINT32(xdrs, (int32_t *)&procnum)) ||
		    (!AUTH_MARSHALL(h->cl_auth, xdrs, p->cku_cred)) ||
		    (!(*xdr_args)(xdrs, argsp))) {
			XDR_DESTROY(xdrs);
			p->cku_err.re_status = RPC_CANTENCODEARGS;
			p->cku_err.re_errno = EIO;
			goto cots_done;
		}

		(*(uint32_t *)(mp->b_rptr)) = p->cku_xid;
	} else {
		uint32_t *uproc = (uint32_t *)&p->cku_rpchdr[WIRE_HDR_SIZE];
		IXDR_PUT_U_INT32(uproc, procnum);

		(*(uint32_t *)(&p->cku_rpchdr[0])) = p->cku_xid;

		/* Use XDR_SETPOS() to set the b_wptr. */
		XDR_SETPOS(xdrs, (uint_t)(mp->b_rptr - mp->b_datap->db_base));

		/* Serialize the procedure number and the arguments. */
		if (!AUTH_WRAP(h->cl_auth, p->cku_rpchdr, WIRE_HDR_SIZE+4,
		    xdrs, xdr_args, argsp)) {
			XDR_DESTROY(xdrs);
			p->cku_err.re_status = RPC_CANTENCODEARGS;
			p->cku_err.re_errno = EIO;
			goto cots_done;
		}
	}

	XDR_DESTROY(xdrs);

	RPCLOG(2, "clnt_cots_kcallit: connected, sending call, tidu_size %d\n",
	    tidu_size);

	wq = cm_entry->x_wq;
	waitsecs = 0;

dispatch_again:
	status = clnt_dispatch_send(wq, mp, call, p->cku_xid,
	    (p->cku_flags & CKU_ONQUEUE));

	if ((status == RPC_CANTSEND) && (call->call_reason == ENOBUFS)) {
		/*
		 * QFULL condition, allow some time for queue to drain
		 * and try again. Give up after waiting for the full
		 * timeout specified for the call, or if the zone is
		 * going away.
		 */
		max_waitsecs = wait.tv_sec ? wait.tv_sec : clnt_cots_min_tout;
		if ((waitsecs++ < max_waitsecs) &&
		    !(zone_status_get(curproc->p_zone) >=
		    ZONE_IS_SHUTTING_DOWN)) {

			/* wait 1 sec for queue to drain */
			if (clnt_delay(drv_usectohz(1000000),
			    h->cl_nosignal) == EINTR) {
				p->cku_err.re_errno = EINTR;
				p->cku_err.re_status = RPC_INTR;

				goto cots_done;
			}

			/* and try again */
			goto dispatch_again;
		}
		p->cku_err.re_status = status;
		p->cku_err.re_errno = call->call_reason;
		DTRACE_PROBE(krpc__e__clntcots__kcallit__cantsend);

		goto cots_done;
	}

	if (waitsecs) {
		/* adjust timeout to account for time waited to send */
		wait.tv_sec -= waitsecs;
		if (wait.tv_sec < 0) {
			/* pick up reply on next retry */
			wait.tv_sec = 0;
		}
		DTRACE_PROBE2(clnt_cots__sendwait, CLIENT *, h,
		    int, waitsecs);
	}

	RPCLOG(64, "clnt_cots_kcallit: sent call for xid 0x%x\n",
	    (uint_t)p->cku_xid);
	p->cku_flags = (CKU_ONQUEUE|CKU_SENT);
	p->cku_recv_attempts = 1;

#ifdef	RPCDEBUG
	time_sent = ddi_get_lbolt();
#endif

	/*
	 * Wait for a reply or a timeout.  If there is no error or timeout,
	 * (both indicated by call_status), call->call_reply will contain
	 * the RPC reply message.
	 */
read_again:
	mutex_enter(&call->call_lock);
	interrupted = 0;
	if (call->call_status == RPC_TIMEDOUT) {
		/*
		 * Indicate that the lwp is not to be stopped while waiting
		 * for this network traffic.  This is to avoid deadlock while
		 * debugging a process via /proc and also to avoid recursive
		 * mutex_enter()s due to NFS page faults while stopping
		 * (NFS holds locks when it calls here).
		 */
		clock_t cv_wait_ret;
		clock_t timout;
		clock_t oldlbolt;

		klwp_t *lwp = ttolwp(curthread);

		if (lwp != NULL)
			lwp->lwp_nostop++;

		oldlbolt = ddi_get_lbolt();
		timout = wait.tv_sec * drv_usectohz(1000000) +
		    drv_usectohz(wait.tv_usec) + oldlbolt;
		/*
		 * Iterate until the call_status is changed to something
		 * other than RPC_TIMEDOUT, or if cv_timedwait_sig() returns
		 * something <= 0. The latter means that we timed
		 * out.
		 */
		if (h->cl_nosignal)
			while ((cv_wait_ret = cv_timedwait(&call->call_cv,
			    &call->call_lock, timout)) > 0 &&
			    call->call_status == RPC_TIMEDOUT)
				;
		else
			while ((cv_wait_ret = cv_timedwait_sig(
			    &call->call_cv,
			    &call->call_lock, timout)) > 0 &&
			    call->call_status == RPC_TIMEDOUT)
				;

		switch (cv_wait_ret) {
		case 0:
			/*
			 * If we got out of the above loop with
			 * cv_timedwait_sig() returning 0, then we were
			 * interrupted regardless what call_status is.
			 */
			interrupted = 1;
			break;
		case -1:
			/* cv_timedwait_sig() timed out */
			break;
		default:

			/*
			 * We were cv_signaled(). If we didn't
			 * get a successful call_status and returned
			 * before time expired, delay up to clnt_cots_min_tout
			 * seconds so that the caller doesn't immediately
			 * try to call us again and thus force the
			 * same condition that got us here (such
			 * as a RPC_XPRTFAILED due to the server not
			 * listening on the end-point).
			 */
			if (call->call_status != RPC_SUCCESS) {
				clock_t curlbolt;
				clock_t diff;

				curlbolt = ddi_get_lbolt();
				ticks = clnt_cots_min_tout *
				    drv_usectohz(1000000);
				diff = curlbolt - oldlbolt;
				if (diff < ticks) {
					delay_first = TRUE;
					if (diff > 0)
						ticks -= diff;
				}
			}
			break;
		}

		if (lwp != NULL)
			lwp->lwp_nostop--;
	}
	/*
	 * Get the reply message, if any.  This will be freed at the end
	 * whether or not an error occurred.
	 */
	mp = call->call_reply;
	call->call_reply = NULL;

	/*
	 * call_err is the error info when the call is on dispatch queue.
	 * cku_err is the error info returned to the caller.
	 * Sync cku_err with call_err for local message processing.
	 */

	status = call->call_status;
	p->cku_err = call->call_err;
	mutex_exit(&call->call_lock);

	if (status != RPC_SUCCESS) {
		switch (status) {
		case RPC_TIMEDOUT:
			now = ddi_get_lbolt();
			if (interrupted) {
				COTSRCSTAT_INCR(p->cku_stats, rcintrs);
				p->cku_err.re_status = RPC_INTR;
				p->cku_err.re_errno = EINTR;
				RPCLOG(1, "clnt_cots_kcallit: xid 0x%x",
				    p->cku_xid);
				RPCLOG(1, "signal interrupted at %ld", now);
				RPCLOG(1, ", was sent at %ld\n", time_sent);
			} else {
				COTSRCSTAT_INCR(p->cku_stats, rctimeouts);
				p->cku_err.re_errno = ETIMEDOUT;
				RPCLOG(1, "clnt_cots_kcallit: timed out at %ld",
				    now);
				RPCLOG(1, ", was sent at %ld\n", time_sent);
			}
			break;

		case RPC_XPRTFAILED:
			if (p->cku_err.re_errno == 0)
				p->cku_err.re_errno = EIO;

			RPCLOG(1, "clnt_cots_kcallit: transport failed: %d\n",
			    p->cku_err.re_errno);
			break;

		case RPC_SYSTEMERROR:
			ASSERT(p->cku_err.re_errno);
			RPCLOG(1, "clnt_cots_kcallit: system error: %d\n",
			    p->cku_err.re_errno);
			break;

		default:
			p->cku_err.re_status = RPC_SYSTEMERROR;
			p->cku_err.re_errno = EIO;
			RPCLOG(1, "clnt_cots_kcallit: error: %s\n",
			    clnt_sperrno(status));
			break;
		}
		if (p->cku_err.re_status != RPC_TIMEDOUT) {

			if (p->cku_flags & CKU_ONQUEUE) {
				call_table_remove(call);
				p->cku_flags &= ~CKU_ONQUEUE;
			}

			RPCLOG(64, "clnt_cots_kcallit: non TIMEOUT so xid 0x%x "
			    "taken off dispatch list\n", p->cku_xid);
			if (call->call_reply) {
				freemsg(call->call_reply);
				call->call_reply = NULL;
			}
		} else if (wait.tv_sec != 0) {
			/*
			 * We've sent the request over TCP and so we have
			 * every reason to believe it will get
			 * delivered. In which case returning a timeout is not
			 * appropriate.
			 */
			if (p->cku_progress == TRUE &&
			    p->cku_recv_attempts < clnt_cots_maxrecv) {
				p->cku_err.re_status = RPC_INPROGRESS;
			}
		}
		goto cots_done;
	}

	xdrs = &p->cku_inxdr;
	xdrmblk_init(xdrs, mp, XDR_DECODE, 0);

	reply_msg.rm_direction = REPLY;
	reply_msg.rm_reply.rp_stat = MSG_ACCEPTED;
	reply_msg.acpted_rply.ar_stat = SUCCESS;

	reply_msg.acpted_rply.ar_verf = _null_auth;
	/*
	 * xdr_results will be done in AUTH_UNWRAP.
	 */
	reply_msg.acpted_rply.ar_results.where = NULL;
	reply_msg.acpted_rply.ar_results.proc = xdr_void;

	if (xdr_replymsg(xdrs, &reply_msg)) {
		enum clnt_stat re_status;

		_seterr_reply(&reply_msg, &p->cku_err);

		re_status = p->cku_err.re_status;
		if (re_status == RPC_SUCCESS) {
			/*
			 * Reply is good, check auth.
			 */
			if (!AUTH_VALIDATE(h->cl_auth,
			    &reply_msg.acpted_rply.ar_verf)) {
				COTSRCSTAT_INCR(p->cku_stats, rcbadverfs);
				RPCLOG0(1, "clnt_cots_kcallit: validation "
				    "failure\n");
				freemsg(mp);
				(void) xdr_rpc_free_verifier(xdrs, &reply_msg);
				XDR_DESTROY(xdrs);
				mutex_enter(&call->call_lock);
				if (call->call_reply == NULL)
					call->call_status = RPC_TIMEDOUT;
				mutex_exit(&call->call_lock);
				goto read_again;
			} else if (!AUTH_UNWRAP(h->cl_auth, xdrs,
			    xdr_results, resultsp)) {
				RPCLOG0(1, "clnt_cots_kcallit: validation "
				    "failure (unwrap)\n");
				p->cku_err.re_status = RPC_CANTDECODERES;
				p->cku_err.re_errno = EIO;
			}
		} else {
			/* set errno in case we can't recover */
			if (re_status != RPC_VERSMISMATCH &&
			    re_status != RPC_AUTHERROR &&
			    re_status != RPC_PROGVERSMISMATCH)
				p->cku_err.re_errno = EIO;

			if (re_status == RPC_AUTHERROR) {
				/*
				 * Maybe our credential needs to be refreshed
				 */
				if (cm_entry) {
					/*
					 * There is the potential that the
					 * cm_entry has/will be marked dead,
					 * so drop the connection altogether,
					 * force REFRESH to establish new
					 * connection.
					 */
					connmgr_cancelconn(cm_entry);
					cm_entry = NULL;
				}

				(void) xdr_rpc_free_verifier(xdrs,
				    &reply_msg);
				XDR_DESTROY(xdrs);

				if (p->cku_flags & CKU_ONQUEUE) {
					call_table_remove(call);
					p->cku_flags &= ~CKU_ONQUEUE;
				}
				RPCLOG(64,
				    "clnt_cots_kcallit: AUTH_ERROR, xid"
				    " 0x%x removed off dispatch list\n",
				    p->cku_xid);
				if (call->call_reply) {
					freemsg(call->call_reply);
					call->call_reply = NULL;
				}

				if ((refreshes > 0) &&
				    AUTH_REFRESH(h->cl_auth, &reply_msg,
				    p->cku_cred)) {
					refreshes--;
					freemsg(mp);
					mp = NULL;

					COTSRCSTAT_INCR(p->cku_stats,
					    rcbadcalls);
					COTSRCSTAT_INCR(p->cku_stats,
					    rcnewcreds);
					goto call_again;
				}

				/*
				 * We have used the client handle to
				 * do an AUTH_REFRESH and the RPC status may
				 * be set to RPC_SUCCESS; Let's make sure to
				 * set it to RPC_AUTHERROR.
				 */
				p->cku_err.re_status = RPC_AUTHERROR;

				/*
				 * Map recoverable and unrecoverable
				 * authentication errors to appropriate errno
				 */
				switch (p->cku_err.re_why) {
				case AUTH_TOOWEAK:
					/*
					 * This could be a failure where the
					 * server requires use of a reserved
					 * port, check and optionally set the
					 * client handle useresvport trying
					 * one more time. Next go round we
					 * fall out with the tooweak error.
					 */
					if (p->cku_useresvport != 1) {
						p->cku_useresvport = 1;
						p->cku_xid = 0;
						freemsg(mp);
						mp = NULL;
						goto call_again;
					}
					/* FALLTHRU */
				case AUTH_BADCRED:
				case AUTH_BADVERF:
				case AUTH_INVALIDRESP:
				case AUTH_FAILED:
				case RPCSEC_GSS_NOCRED:
				case RPCSEC_GSS_FAILED:
					p->cku_err.re_errno = EACCES;
					break;
				case AUTH_REJECTEDCRED:
				case AUTH_REJECTEDVERF:
				default:
					p->cku_err.re_errno = EIO;
					break;
				}
				RPCLOG(1, "clnt_cots_kcallit : authentication"
				    " failed with RPC_AUTHERROR of type %d\n",
				    (int)p->cku_err.re_why);
				goto cots_done;
			}
		}
	} else {
		/* reply didn't decode properly. */
		p->cku_err.re_status = RPC_CANTDECODERES;
		p->cku_err.re_errno = EIO;
		RPCLOG0(1, "clnt_cots_kcallit: decode failure\n");
	}

	(void) xdr_rpc_free_verifier(xdrs, &reply_msg);
	XDR_DESTROY(xdrs);

	if (p->cku_flags & CKU_ONQUEUE) {
		call_table_remove(call);
		p->cku_flags &= ~CKU_ONQUEUE;
	}

	RPCLOG(64, "clnt_cots_kcallit: xid 0x%x taken off dispatch list",
	    p->cku_xid);
	RPCLOG(64, " status is %s\n", clnt_sperrno(p->cku_err.re_status));
cots_done:
	if (cm_entry)
		connmgr_release(cm_entry);

	if (mp != NULL)
		freemsg(mp);
	if ((p->cku_flags & CKU_ONQUEUE) == 0 && call->call_reply) {
		freemsg(call->call_reply);
		call->call_reply = NULL;
	}
	if (p->cku_err.re_status != RPC_SUCCESS) {
		RPCLOG0(1, "clnt_cots_kcallit: tail-end failure\n");
		COTSRCSTAT_INCR(p->cku_stats, rcbadcalls);
	}

	/*
	 * No point in delaying if the zone is going away.
	 */
	if (delay_first == TRUE &&
	    !(zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN)) {
		if (clnt_delay(ticks, h->cl_nosignal) == EINTR) {
			p->cku_err.re_errno = EINTR;
			p->cku_err.re_status = RPC_INTR;
		}
	}
	return (p->cku_err.re_status);
}

/*
 * Kinit routine for cots.  This sets up the correct operations in
 * the client handle, as the handle may have previously been a clts
 * handle, and clears the xid field so there is no way a new call
 * could be mistaken for a retry.  It also sets in the handle the
 * information that is passed at create/kinit time but needed at
 * call time, as cots creates the transport at call time - device,
 * address of the server, protocol family.
 */
void
clnt_cots_kinit(CLIENT *h, dev_t dev, int family, struct netbuf *addr,
    int max_msgsize, cred_t *cred)
{
	/* LINTED pointer alignment */
	cku_private_t *p = htop(h);
	calllist_t *call = &p->cku_call;

	h->cl_ops = &tcp_ops;
	if (p->cku_flags & CKU_ONQUEUE) {
		call_table_remove(call);
		p->cku_flags &= ~CKU_ONQUEUE;
		RPCLOG(64, "clnt_cots_kinit: removing call for xid 0x%x from"
		    " dispatch list\n", p->cku_xid);
	}

	if (call->call_reply != NULL) {
		freemsg(call->call_reply);
		call->call_reply = NULL;
	}

	call->call_bucket = NULL;
	call->call_hash = 0;

	/*
	 * We don't clear cku_flags here, because clnt_cots_kcallit()
	 * takes care of handling the cku_flags reset.
	 */
	p->cku_xid = 0;
	p->cku_device = dev;
	p->cku_addrfmly = family;
	p->cku_cred = cred;

	if (p->cku_addr.maxlen < addr->len) {
		if (p->cku_addr.maxlen != 0 && p->cku_addr.buf != NULL)
			kmem_free(p->cku_addr.buf, p->cku_addr.maxlen);
		p->cku_addr.buf = kmem_zalloc(addr->maxlen, KM_SLEEP);
		p->cku_addr.maxlen = addr->maxlen;
	}

	p->cku_addr.len = addr->len;
	bcopy(addr->buf, p->cku_addr.buf, addr->len);

	/*
	 * If the current sanity check size in rpcmod is smaller
	 * than the size needed, then increase the sanity check.
	 */
	if (max_msgsize != 0 && clnt_max_msg_sizep != NULL &&
	    max_msgsize > *clnt_max_msg_sizep) {
		mutex_enter(&clnt_max_msg_lock);
		if (max_msgsize > *clnt_max_msg_sizep)
			*clnt_max_msg_sizep = max_msgsize;
		mutex_exit(&clnt_max_msg_lock);
	}
}
1831 */ 1832 mutex_enter(&connmgr_lock); 1833 1834 if (retryaddr == NULL) { 1835 use_new_conn: 1836 i = 0; 1837 cm_entry = lru_entry = NULL; 1838 1839 prev = cmp = &cm_hd; 1840 while ((cm_entry = *cmp) != NULL) { 1841 ASSERT(cm_entry != cm_entry->x_next); 1842 /* 1843 * Garbage collect connections that are marked 1844 * for needs disconnect. 1845 */ 1846 if (cm_entry->x_needdis) { 1847 CONN_HOLD(cm_entry); 1848 connmgr_dis_and_wait(cm_entry); 1849 connmgr_release(cm_entry); 1850 /* 1851 * connmgr_lock could have been 1852 * dropped for the disconnect 1853 * processing so start over. 1854 */ 1855 goto use_new_conn; 1856 } 1857 1858 /* 1859 * Garbage collect the dead connections that have 1860 * no threads working on them. 1861 */ 1862 if ((cm_entry->x_state_flags & (X_DEAD|X_THREAD)) == 1863 X_DEAD) { 1864 mutex_enter(&cm_entry->x_lock); 1865 if (cm_entry->x_ref != 0) { 1866 /* 1867 * Currently in use. 1868 * Cleanup later. 1869 */ 1870 cmp = &cm_entry->x_next; 1871 mutex_exit(&cm_entry->x_lock); 1872 continue; 1873 } 1874 mutex_exit(&cm_entry->x_lock); 1875 *cmp = cm_entry->x_next; 1876 mutex_exit(&connmgr_lock); 1877 connmgr_close(cm_entry); 1878 mutex_enter(&connmgr_lock); 1879 goto use_new_conn; 1880 } 1881 1882 1883 if ((cm_entry->x_state_flags & X_BADSTATES) == 0 && 1884 cm_entry->x_zoneid == zoneid && 1885 cm_entry->x_rdev == device && 1886 destaddr->len == cm_entry->x_server.len && 1887 bcmp(destaddr->buf, cm_entry->x_server.buf, 1888 destaddr->len) == 0) { 1889 /* 1890 * If the matching entry isn't connected, 1891 * attempt to reconnect it. 1892 */ 1893 if (cm_entry->x_connected == FALSE) { 1894 /* 1895 * We don't go through trying 1896 * to find the least recently 1897 * used connection because 1898 * connmgr_reconnect() briefly 1899 * dropped the connmgr_lock, 1900 * allowing a window for our 1901 * accounting to be messed up. 1902 * In any case, a re-connected 1903 * connection is as good as 1904 * a LRU connection. 1905 */ 1906 return (connmgr_wrapconnect(cm_entry, 1907 waitp, destaddr, addrfmly, srcaddr, 1908 rpcerr, TRUE, nosignal, cr)); 1909 } 1910 i++; 1911 1912 /* keep track of the last entry */ 1913 lru_entry = cm_entry; 1914 prev = cmp; 1915 } 1916 cmp = &cm_entry->x_next; 1917 } 1918 1919 if (i > clnt_max_conns) { 1920 RPCLOG(8, "connmgr_get: too many conns, dooming entry" 1921 " %p\n", (void *)lru_entry->x_tiptr); 1922 lru_entry->x_doomed = TRUE; 1923 goto use_new_conn; 1924 } 1925 1926 /* 1927 * If we are at the maximum number of connections to 1928 * the server, hand back the least recently used one. 1929 */ 1930 if (i == clnt_max_conns) { 1931 /* 1932 * Copy into the handle the source address of 1933 * the connection, which we will use in case of 1934 * a later retry. 1935 */ 1936 if (srcaddr->len != lru_entry->x_src.len) { 1937 if (srcaddr->len > 0) 1938 kmem_free(srcaddr->buf, 1939 srcaddr->maxlen); 1940 srcaddr->buf = kmem_zalloc( 1941 lru_entry->x_src.len, KM_SLEEP); 1942 srcaddr->maxlen = srcaddr->len = 1943 lru_entry->x_src.len; 1944 } 1945 bcopy(lru_entry->x_src.buf, srcaddr->buf, srcaddr->len); 1946 RPCLOG(2, "connmgr_get: call going out on %p\n", 1947 (void *)lru_entry); 1948 lru_entry->x_time = ddi_get_lbolt(); 1949 CONN_HOLD(lru_entry); 1950 1951 if ((i > 1) && (prev != &cm_hd)) { 1952 /* 1953 * remove and re-insert entry at head of list.
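 *
 * For example (a sketch): with the list A -> B -> C and
 * lru_entry == C, prev points at B's x_next field, so
 *	*prev = lru_entry->x_next;	unlinks C (B->x_next = NULL)
 *	lru_entry->x_next = cm_hd;	C now points at old head A
 *	cm_hd = lru_entry;		list becomes C -> A -> B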
1954 */ 1955 *prev = lru_entry->x_next; 1956 lru_entry->x_next = cm_hd; 1957 cm_hd = lru_entry; 1958 } 1959 1960 mutex_exit(&connmgr_lock); 1961 return (lru_entry); 1962 } 1963 1964 } else { 1965 /* 1966 * This is the retry case (retryaddr != NULL). Retries must 1967 * be sent on the same source port as the original call. 1968 */ 1969 1970 /* 1971 * Walk the list looking for a connection with a source address 1972 * that matches the retry address. 1973 */ 1974 start_retry_loop: 1975 cmp = &cm_hd; 1976 while ((cm_entry = *cmp) != NULL) { 1977 ASSERT(cm_entry != cm_entry->x_next); 1978 1979 /* 1980 * determine if this connection matches the passed 1981 * in retry address. If it does not match, advance 1982 * to the next element on the list. 1983 */ 1984 if (zoneid != cm_entry->x_zoneid || 1985 device != cm_entry->x_rdev || 1986 retryaddr->len != cm_entry->x_src.len || 1987 bcmp(retryaddr->buf, cm_entry->x_src.buf, 1988 retryaddr->len) != 0) { 1989 cmp = &cm_entry->x_next; 1990 continue; 1991 } 1992 /* 1993 * Garbage collect connections that are marked 1994 * for needs disconnect. 1995 */ 1996 if (cm_entry->x_needdis) { 1997 CONN_HOLD(cm_entry); 1998 connmgr_dis_and_wait(cm_entry); 1999 connmgr_release(cm_entry); 2000 /* 2001 * connmgr_lock could have been 2002 * dropped for the disconnect 2003 * processing so start over. 2004 */ 2005 goto start_retry_loop; 2006 } 2007 /* 2008 * Garbage collect the dead connections that have 2009 * no threads working on them. 2010 */ 2011 if ((cm_entry->x_state_flags & (X_DEAD|X_THREAD)) == 2012 X_DEAD) { 2013 mutex_enter(&cm_entry->x_lock); 2014 if (cm_entry->x_ref != 0) { 2015 /* 2016 * Currently in use. 2017 * Cleanup later. 2018 */ 2019 cmp = &cm_entry->x_next; 2020 mutex_exit(&cm_entry->x_lock); 2021 continue; 2022 } 2023 mutex_exit(&cm_entry->x_lock); 2024 *cmp = cm_entry->x_next; 2025 mutex_exit(&connmgr_lock); 2026 connmgr_close(cm_entry); 2027 mutex_enter(&connmgr_lock); 2028 goto start_retry_loop; 2029 } 2030 2031 /* 2032 * Sanity check: if the connection with our source 2033 * port is going to some other server, something went 2034 * wrong, as we never delete connections (i.e. release 2035 * ports) unless they have been idle. In this case, 2036 * it is probably better to send the call out using 2037 * a new source address than to fail it altogether, 2038 * since that port may never be released. 2039 */ 2040 if (destaddr->len != cm_entry->x_server.len || 2041 bcmp(destaddr->buf, cm_entry->x_server.buf, 2042 destaddr->len) != 0) { 2043 RPCLOG(1, "connmgr_get: tiptr %p" 2044 " is going to a different server" 2045 " with the port that belongs" 2046 " to us!\n", (void *)cm_entry->x_tiptr); 2047 retryaddr = NULL; 2048 goto use_new_conn; 2049 } 2050 2051 /* 2052 * If the connection of interest is not connected and we 2053 * can't reconnect it, then the server is probably 2054 * still down. Return NULL to the caller and let it 2055 * retry later if it wants to. We have a delay so the 2056 * machine doesn't go into a tight retry loop. If the 2057 * entry was already connected, or the reconnect was 2058 * successful, return this entry.
2059 */ 2060 if (cm_entry->x_connected == FALSE) { 2061 return (connmgr_wrapconnect(cm_entry, 2062 waitp, destaddr, addrfmly, NULL, 2063 rpcerr, TRUE, nosignal, cr)); 2064 } else { 2065 CONN_HOLD(cm_entry); 2066 2067 cm_entry->x_time = ddi_get_lbolt(); 2068 mutex_exit(&connmgr_lock); 2069 RPCLOG(2, "connmgr_get: found old " 2070 "transport %p for retry\n", 2071 (void *)cm_entry); 2072 return (cm_entry); 2073 } 2074 } 2075 2076 /* 2077 * We cannot find an entry in the list for this retry. 2078 * Either the entry has been removed temporarily to be 2079 * reconnected by another thread, or the original call 2080 * got a port but never got connected, 2081 * and hence the transport never got put in the 2082 * list. Fall through to the "create new connection" code - 2083 * the former case will fail there trying to rebind the port, 2084 * and the latter case (and any other pathological cases) will 2085 * rebind and reconnect and not hang the client machine. 2086 */ 2087 RPCLOG0(8, "connmgr_get: no entry in list for retry\n"); 2088 } 2089 /* 2090 * Set up a transport entry in the connection manager's list. 2091 */ 2092 cm_entry = (struct cm_xprt *) 2093 kmem_zalloc(sizeof (struct cm_xprt), KM_SLEEP); 2094 2095 cm_entry->x_server.buf = kmem_zalloc(destaddr->len, KM_SLEEP); 2096 bcopy(destaddr->buf, cm_entry->x_server.buf, destaddr->len); 2097 cm_entry->x_server.len = cm_entry->x_server.maxlen = destaddr->len; 2098 2099 cm_entry->x_state_flags = X_THREAD; 2100 cm_entry->x_ref = 1; 2101 cm_entry->x_family = addrfmly; 2102 cm_entry->x_rdev = device; 2103 cm_entry->x_zoneid = zoneid; 2104 mutex_init(&cm_entry->x_lock, NULL, MUTEX_DEFAULT, NULL); 2105 cv_init(&cm_entry->x_cv, NULL, CV_DEFAULT, NULL); 2106 cv_init(&cm_entry->x_conn_cv, NULL, CV_DEFAULT, NULL); 2107 cv_init(&cm_entry->x_dis_cv, NULL, CV_DEFAULT, NULL); 2108 2109 /* 2110 * Note that we add this partially initialized entry to the 2111 * connection list. This is so that we don't end up with multiple 2112 * connections to the same server. 2113 * 2114 * Note that x_src is not initialized at this point. This is because 2115 * retryaddr might be NULL in which case x_src is whatever 2116 * t_kbind/bindresvport gives us. If another thread wants a 2117 * connection to the same server, seemingly we have an issue, but we 2118 * don't. If the other thread comes in with retryaddr == NULL, then it 2119 * will never look at x_src, and it will end up waiting in 2120 * connmgr_cwait() for the first thread to finish the connection 2121 * attempt. If the other thread comes in with retryaddr != NULL, then 2122 * that means there was a request sent on a connection, in which case 2123 * the connection should already exist. Thus the first thread 2124 * never gets here ... it finds the connection to its server in the 2125 * connection list. 2126 * 2127 * But even if this theory is wrong, in the retryaddr != NULL case, the 2nd 2128 * thread will skip us because x_src.len == 0. 2129 */ 2130 cm_entry->x_next = cm_hd; 2131 cm_hd = cm_entry; 2132 mutex_exit(&connmgr_lock); 2133 2134 /* 2135 * Either we didn't find an entry to the server of interest, or we 2136 * don't have the maximum number of connections to that server - 2137 * create a new connection.
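 *
 * A sketch of how the partial entry avoids duplicates
 * (hypothetical threads T1 and T2, both with retryaddr == NULL):
 *	T1: inserts the entry with X_THREAD set, drops
 *	    connmgr_lock, runs connmgr_connect().
 *	T2: finds T1's entry first, sees x_thread == TRUE, and
 *	    parks in connmgr_cwait() until T1's cv_broadcast()
 *	    on x_conn_cv announces the outcome.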
2138 */ 2139 RPCLOG0(8, "connmgr_get: creating new connection\n"); 2140 rpcerr->re_status = RPC_TLIERROR; 2141 2142 i = t_kopen(NULL, device, FREAD|FWRITE|FNDELAY, &tiptr, zone_kcred()); 2143 if (i) { 2144 RPCLOG(1, "connmgr_get: can't open cots device, error %d\n", i); 2145 rpcerr->re_errno = i; 2146 connmgr_cancelconn(cm_entry); 2147 return (NULL); 2148 } 2149 rpc_poptimod(tiptr->fp->f_vnode); 2150 2151 if (i = strioctl(tiptr->fp->f_vnode, I_PUSH, (intptr_t)"rpcmod", 0, 2152 K_TO_K, kcred, &retval)) { 2153 RPCLOG(1, "connmgr_get: can't push cots module, %d\n", i); 2154 (void) t_kclose(tiptr, 1); 2155 rpcerr->re_errno = i; 2156 connmgr_cancelconn(cm_entry); 2157 return (NULL); 2158 } 2159 2160 if (i = strioctl(tiptr->fp->f_vnode, RPC_CLIENT, 0, 0, K_TO_K, 2161 kcred, &retval)) { 2162 RPCLOG(1, "connmgr_get: can't set client status with cots " 2163 "module, %d\n", i); 2164 (void) t_kclose(tiptr, 1); 2165 rpcerr->re_errno = i; 2166 connmgr_cancelconn(cm_entry); 2167 return (NULL); 2168 } 2169 2170 mutex_enter(&connmgr_lock); 2171 2172 wq = tiptr->fp->f_vnode->v_stream->sd_wrq->q_next; 2173 cm_entry->x_wq = wq; 2174 2175 mutex_exit(&connmgr_lock); 2176 2177 if (i = strioctl(tiptr->fp->f_vnode, I_PUSH, (intptr_t)"timod", 0, 2178 K_TO_K, kcred, &retval)) { 2179 RPCLOG(1, "connmgr_get: can't push timod, %d\n", i); 2180 (void) t_kclose(tiptr, 1); 2181 rpcerr->re_errno = i; 2182 connmgr_cancelconn(cm_entry); 2183 return (NULL); 2184 } 2185 2186 /* 2187 * If the caller has not specified reserved port usage then 2188 * take the system default. 2189 */ 2190 if (useresvport == -1) 2191 useresvport = clnt_cots_do_bindresvport; 2192 2193 if ((useresvport || retryaddr != NULL) && 2194 (addrfmly == AF_INET || addrfmly == AF_INET6)) { 2195 bool_t alloc_src = FALSE; 2196 2197 if (srcaddr->len != destaddr->len) { 2198 kmem_free(srcaddr->buf, srcaddr->maxlen); 2199 srcaddr->buf = kmem_zalloc(destaddr->len, KM_SLEEP); 2200 srcaddr->maxlen = destaddr->len; 2201 srcaddr->len = destaddr->len; 2202 alloc_src = TRUE; 2203 } 2204 2205 if ((i = bindresvport(tiptr, retryaddr, srcaddr, TRUE)) != 0) { 2206 (void) t_kclose(tiptr, 1); 2207 RPCLOG(1, "connmgr_get: couldn't bind, retryaddr: " 2208 "%p\n", (void *)retryaddr); 2209 2210 /* 2211 * 1225408: If we allocated a source address, then it 2212 * is either garbage or all zeroes. In that case 2213 * we need to clear srcaddr. 2214 */ 2215 if (alloc_src == TRUE) { 2216 kmem_free(srcaddr->buf, srcaddr->maxlen); 2217 srcaddr->maxlen = srcaddr->len = 0; 2218 srcaddr->buf = NULL; 2219 } 2220 rpcerr->re_errno = i; 2221 connmgr_cancelconn(cm_entry); 2222 return (NULL); 2223 } 2224 } else { 2225 if ((i = t_kbind(tiptr, NULL, NULL)) != 0) { 2226 RPCLOG(1, "clnt_cots_kcreate: t_kbind: %d\n", i); 2227 (void) t_kclose(tiptr, 1); 2228 rpcerr->re_errno = i; 2229 connmgr_cancelconn(cm_entry); 2230 return (NULL); 2231 } 2232 } 2233 2234 { 2235 /* 2236 * Keep the kernel stack lean. Don't move this calllist_t 2237 * declaration to the top of this function because another 2238 * one is declared in connmgr_wrapconnect(). 2239 */ 2240 calllist_t call; 2241 2242 bzero(&call, sizeof (call)); 2243 cv_init(&call.call_cv, NULL, CV_DEFAULT, NULL); 2244 2245 /* 2246 * This is a bound end-point so don't close its stream.
2247 */ 2248 connected = connmgr_connect(cm_entry, wq, destaddr, addrfmly, 2249 &call, &tidu_size, FALSE, waitp, nosignal, cr); 2250 *rpcerr = call.call_err; 2251 cv_destroy(&call.call_cv); 2252 2253 } 2254 2255 mutex_enter(&connmgr_lock); 2256 2257 /* 2258 * Set up a transport entry in the connection manager's list. 2259 */ 2260 cm_entry->x_src.buf = kmem_zalloc(srcaddr->len, KM_SLEEP); 2261 bcopy(srcaddr->buf, cm_entry->x_src.buf, srcaddr->len); 2262 cm_entry->x_src.len = cm_entry->x_src.maxlen = srcaddr->len; 2263 2264 cm_entry->x_tiptr = tiptr; 2265 cm_entry->x_time = ddi_get_lbolt(); 2266 2267 if (tiptr->tp_info.servtype == T_COTS_ORD) 2268 cm_entry->x_ordrel = TRUE; 2269 else 2270 cm_entry->x_ordrel = FALSE; 2271 2272 cm_entry->x_tidu_size = tidu_size; 2273 2274 if (cm_entry->x_early_disc) { 2275 /* 2276 * We need to check if a disconnect request has come 2277 * in while we were connecting; if so, we need to 2278 * set rpcerr->re_status appropriately before returning 2279 * NULL to the caller. 2280 */ 2281 if (rpcerr->re_status == RPC_SUCCESS) 2282 rpcerr->re_status = RPC_XPRTFAILED; 2283 cm_entry->x_connected = FALSE; 2284 } else 2285 cm_entry->x_connected = connected; 2286 2287 /* 2288 * There could be a discrepancy here such that 2289 * x_early_disc is TRUE yet connected is TRUE as well 2290 * and the connection is actually connected. In that case 2291 * let's be conservative and declare the connection as not 2292 * connected. 2293 */ 2294 cm_entry->x_early_disc = FALSE; 2295 cm_entry->x_needdis = (cm_entry->x_connected == FALSE); 2296 cm_entry->x_ctime = ddi_get_lbolt(); 2297 2298 /* 2299 * Notify any threads waiting that the connection attempt is done. 2300 */ 2301 cm_entry->x_thread = FALSE; 2302 cv_broadcast(&cm_entry->x_conn_cv); 2303 2304 if (cm_entry->x_connected == FALSE) { 2305 mutex_exit(&connmgr_lock); 2306 connmgr_release(cm_entry); 2307 return (NULL); 2308 } 2309 2310 mutex_exit(&connmgr_lock); 2311 2312 return (cm_entry); 2313 } 2314 2315 /* 2316 * Keep the cm_xprt entry on the connection list when making a connection. This 2317 * is to prevent multiple connections to a slow server from appearing. 2318 * We use the bit field x_thread to tell if a thread is doing a connection 2319 * which keeps other interested threads from messing with the connection. 2320 * Those other threads just wait if x_thread is set. 2321 * 2322 * If x_thread is not set, then we do the actual work of connecting via 2323 * connmgr_connect(). 2324 * 2325 * mutex convention: called with connmgr_lock held, returns with it released. 2326 */ 2327 static struct cm_xprt * 2328 connmgr_wrapconnect( 2329 struct cm_xprt *cm_entry, 2330 const struct timeval *waitp, 2331 struct netbuf *destaddr, 2332 int addrfmly, 2333 struct netbuf *srcaddr, 2334 struct rpc_err *rpcerr, 2335 bool_t reconnect, 2336 bool_t nosignal, 2337 cred_t *cr) 2338 { 2339 ASSERT(MUTEX_HELD(&connmgr_lock)); 2340 /* 2341 * Hold this entry as we are about to drop connmgr_lock. 2342 */ 2343 CONN_HOLD(cm_entry); 2344 2345 /* 2346 * If there is a thread already making a connection for us, then 2347 * wait for it to complete the connection.
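 *
 * In outline, the two paths below (a sketch of the code that
 * follows, not a substitute for it):
 *	x_thread == TRUE:  another thread owns the attempt, so
 *		sleep in connmgr_cwait() on x_conn_cv and judge
 *		the result from x_state_flags;
 *	x_thread == FALSE: take ownership, drain any pending
 *		orderly release (x_needrel) and disconnect
 *		(connmgr_dis_and_wait()), then connmgr_connect().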
2348 */ 2349 if (cm_entry->x_thread == TRUE) { 2350 rpcerr->re_status = connmgr_cwait(cm_entry, waitp, nosignal); 2351 2352 if (rpcerr->re_status != RPC_SUCCESS) { 2353 mutex_exit(&connmgr_lock); 2354 connmgr_release(cm_entry); 2355 return (NULL); 2356 } 2357 } else { 2358 bool_t connected; 2359 calllist_t call; 2360 2361 cm_entry->x_thread = TRUE; 2362 2363 while (cm_entry->x_needrel == TRUE) { 2364 cm_entry->x_needrel = FALSE; 2365 2366 connmgr_sndrel(cm_entry); 2367 delay(drv_usectohz(1000000)); 2368 2369 mutex_enter(&connmgr_lock); 2370 } 2371 2372 /* 2373 * If we need to send a T_DISCON_REQ, send one. 2374 */ 2375 connmgr_dis_and_wait(cm_entry); 2376 2377 mutex_exit(&connmgr_lock); 2378 2379 bzero(&call, sizeof (call)); 2380 cv_init(&call.call_cv, NULL, CV_DEFAULT, NULL); 2381 2382 connected = connmgr_connect(cm_entry, cm_entry->x_wq, 2383 destaddr, addrfmly, &call, &cm_entry->x_tidu_size, 2384 reconnect, waitp, nosignal, cr); 2385 2386 *rpcerr = call.call_err; 2387 cv_destroy(&call.call_cv); 2388 2389 mutex_enter(&connmgr_lock); 2390 2391 2392 if (cm_entry->x_early_disc) { 2393 /* 2394 * We need to check if a disconnect request has come 2395 * in while we were connecting; if so, we need to 2396 * set rpcerr->re_status appropriately before returning 2397 * NULL to the caller. 2398 */ 2399 if (rpcerr->re_status == RPC_SUCCESS) 2400 rpcerr->re_status = RPC_XPRTFAILED; 2401 cm_entry->x_connected = FALSE; 2402 } else 2403 cm_entry->x_connected = connected; 2404 2405 /* 2406 * There could be a discrepancy here such that 2407 * x_early_disc is TRUE yet connected is TRUE as well 2408 * and the connection is actually connected. In that case 2409 * let's be conservative and declare the connection as not 2410 * connected. 2411 */ 2412 2413 cm_entry->x_early_disc = FALSE; 2414 cm_entry->x_needdis = (cm_entry->x_connected == FALSE); 2415 2416 2417 /* 2418 * connmgr_connect() may have given up before the connection 2419 * actually timed out. So ensure that before the next 2420 * connection attempt we do a disconnect. 2421 */ 2422 cm_entry->x_ctime = ddi_get_lbolt(); 2423 cm_entry->x_thread = FALSE; 2424 2425 cv_broadcast(&cm_entry->x_conn_cv); 2426 2427 if (cm_entry->x_connected == FALSE) { 2428 mutex_exit(&connmgr_lock); 2429 connmgr_release(cm_entry); 2430 return (NULL); 2431 } 2432 } 2433 2434 if (srcaddr != NULL) { 2435 /* 2436 * Copy into the handle the 2437 * source address of the 2438 * connection, which we will use 2439 * in case of a later retry. 2440 */ 2441 if (srcaddr->len != cm_entry->x_src.len) { 2442 if (srcaddr->maxlen > 0) 2443 kmem_free(srcaddr->buf, srcaddr->maxlen); 2444 srcaddr->buf = kmem_zalloc(cm_entry->x_src.len, 2445 KM_SLEEP); 2446 srcaddr->maxlen = srcaddr->len = 2447 cm_entry->x_src.len; 2448 } 2449 bcopy(cm_entry->x_src.buf, srcaddr->buf, srcaddr->len); 2450 } 2451 cm_entry->x_time = ddi_get_lbolt(); 2452 mutex_exit(&connmgr_lock); 2453 return (cm_entry); 2454 } 2455 2456 /* 2457 * If we need to send a T_DISCON_REQ, send one.
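 *
 * The flag handshake driven below, in outline:
 *	x_needdis set -> clear it, set x_waitdis, connmgr_snddis()
 *	T_OK_ACK / T_ERROR_ACK arrives -> clnt_dispatch_notifyall()
 *		clears x_waitdis and signals x_dis_cv
 *	timeout with x_waitdis still set -> re-arm x_needdis and
 *		send another T_DISCON_REQ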
2458 */ 2459 static void 2460 connmgr_dis_and_wait(struct cm_xprt *cm_entry) 2461 { 2462 ASSERT(MUTEX_HELD(&connmgr_lock)); 2463 for (;;) { 2464 while (cm_entry->x_needdis == TRUE) { 2465 RPCLOG(8, "connmgr_dis_and_wait: need " 2466 "T_DISCON_REQ for connection 0x%p\n", 2467 (void *)cm_entry); 2468 cm_entry->x_needdis = FALSE; 2469 cm_entry->x_waitdis = TRUE; 2470 2471 connmgr_snddis(cm_entry); 2472 2473 mutex_enter(&connmgr_lock); 2474 } 2475 2476 if (cm_entry->x_waitdis == TRUE) { 2477 clock_t timout; 2478 2479 RPCLOG(8, "connmgr_dis_and_wait waiting for " 2480 "T_DISCON_REQ's ACK for connection %p\n", 2481 (void *)cm_entry); 2482 2483 timout = clnt_cots_min_conntout * drv_usectohz(1000000); 2484 2485 /* 2486 * The TPI spec says that the T_DISCON_REQ 2487 * will get acknowledged, but in practice 2488 * the ACK may never get sent. So don't 2489 * block forever. 2490 */ 2491 (void) cv_reltimedwait(&cm_entry->x_dis_cv, 2492 &connmgr_lock, timout, TR_CLOCK_TICK); 2493 } 2494 /* 2495 * If we got the ACK, break. If we didn't, 2496 * then send another T_DISCON_REQ. 2497 */ 2498 if (cm_entry->x_waitdis == FALSE) { 2499 break; 2500 } else { 2501 RPCLOG(8, "connmgr_dis_and_wait: did " 2502 "not get T_DISCON_REQ's ACK for " 2503 "connection %p\n", (void *)cm_entry); 2504 cm_entry->x_needdis = TRUE; 2505 } 2506 } 2507 } 2508 2509 static void 2510 connmgr_cancelconn(struct cm_xprt *cm_entry) 2511 { 2512 /* 2513 * Mark the connection table entry as dead; the next thread that 2514 * goes through connmgr_release() will notice this and deal with it. 2515 */ 2516 mutex_enter(&connmgr_lock); 2517 cm_entry->x_dead = TRUE; 2518 2519 /* 2520 * Notify any threads waiting for the connection that it isn't 2521 * going to happen. 2522 */ 2523 cm_entry->x_thread = FALSE; 2524 cv_broadcast(&cm_entry->x_conn_cv); 2525 mutex_exit(&connmgr_lock); 2526 2527 connmgr_release(cm_entry); 2528 } 2529 2530 static void 2531 connmgr_close(struct cm_xprt *cm_entry) 2532 { 2533 mutex_enter(&cm_entry->x_lock); 2534 while (cm_entry->x_ref != 0) { 2535 /* 2536 * Must be a noninterruptible wait. 2537 */ 2538 cv_wait(&cm_entry->x_cv, &cm_entry->x_lock); 2539 } 2540 2541 if (cm_entry->x_tiptr != NULL) 2542 (void) t_kclose(cm_entry->x_tiptr, 1); 2543 2544 mutex_exit(&cm_entry->x_lock); 2545 if (cm_entry->x_ksp != NULL) { 2546 mutex_enter(&connmgr_lock); 2547 cm_entry->x_ksp->ks_private = NULL; 2548 mutex_exit(&connmgr_lock); 2549 2550 /* 2551 * Must free the buffer we allocated for the 2552 * server address in the update function. 2553 */ 2554 if (((struct cm_kstat_xprt *)(cm_entry->x_ksp->ks_data))-> 2555 x_server.value.str.addr.ptr != NULL) 2556 kmem_free(((struct cm_kstat_xprt *)(cm_entry->x_ksp-> 2557 ks_data))->x_server.value.str.addr.ptr, 2558 INET6_ADDRSTRLEN); 2559 kmem_free(cm_entry->x_ksp->ks_data, 2560 cm_entry->x_ksp->ks_data_size); 2561 kstat_delete(cm_entry->x_ksp); 2562 } 2563 2564 mutex_destroy(&cm_entry->x_lock); 2565 cv_destroy(&cm_entry->x_cv); 2566 cv_destroy(&cm_entry->x_conn_cv); 2567 cv_destroy(&cm_entry->x_dis_cv); 2568 2569 if (cm_entry->x_server.buf != NULL) 2570 kmem_free(cm_entry->x_server.buf, cm_entry->x_server.maxlen); 2571 if (cm_entry->x_src.buf != NULL) 2572 kmem_free(cm_entry->x_src.buf, cm_entry->x_src.maxlen); 2573 kmem_free(cm_entry, sizeof (struct cm_xprt)); 2574 } 2575 2576 /* 2577 * Called by KRPC after sending the call message to release the connection 2578 * it was using.
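 *
 * Reference counting, in outline: every CONN_HOLD() is paired
 * with exactly one connmgr_release(), e.g.
 *	CONN_HOLD(cm);		x_ref++ under x_lock
 *	... transmit on cm->x_wq ...
 *	connmgr_release(cm);	x_ref--; the final release
 * cv_signal()s x_cv, on which connmgr_close() above waits for
 * x_ref to drain to zero before tearing the entry down.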
2579 */ 2580 static void 2581 connmgr_release(struct cm_xprt *cm_entry) 2582 { 2583 mutex_enter(&cm_entry->x_lock); 2584 cm_entry->x_ref--; 2585 if (cm_entry->x_ref == 0) 2586 cv_signal(&cm_entry->x_cv); 2587 mutex_exit(&cm_entry->x_lock); 2588 } 2589 2590 /* 2591 * Set TCP receive and xmit buffer size for RPC connections. 2592 */ 2593 static bool_t 2594 connmgr_setbufsz(calllist_t *e, queue_t *wq, cred_t *cr) 2595 { 2596 int ok = FALSE; 2597 int val; 2598 2599 if (rpc_default_tcp_bufsz) 2600 return (FALSE); 2601 2602 /* 2603 * Only set new buffer size if it's larger than the system 2604 * default buffer size. If smaller buffer size is needed 2605 * then use /etc/system to set rpc_default_tcp_bufsz to 1. 2606 */ 2607 ok = connmgr_getopt_int(wq, SOL_SOCKET, SO_RCVBUF, &val, e, cr); 2608 if ((ok == TRUE) && (val < rpc_send_bufsz)) { 2609 ok = connmgr_setopt_int(wq, SOL_SOCKET, SO_RCVBUF, 2610 rpc_send_bufsz, e, cr); 2611 DTRACE_PROBE2(krpc__i__connmgr_rcvbufsz, 2612 int, ok, calllist_t *, e); 2613 } 2614 2615 ok = connmgr_getopt_int(wq, SOL_SOCKET, SO_SNDBUF, &val, e, cr); 2616 if ((ok == TRUE) && (val < rpc_recv_bufsz)) { 2617 ok = connmgr_setopt_int(wq, SOL_SOCKET, SO_SNDBUF, 2618 rpc_recv_bufsz, e, cr); 2619 DTRACE_PROBE2(krpc__i__connmgr_sndbufsz, 2620 int, ok, calllist_t *, e); 2621 } 2622 return (TRUE); 2623 } 2624 2625 /* 2626 * Given an open stream, connect to the remote. Returns true if connected, 2627 * false otherwise. 2628 */ 2629 static bool_t 2630 connmgr_connect( 2631 struct cm_xprt *cm_entry, 2632 queue_t *wq, 2633 struct netbuf *addr, 2634 int addrfmly, 2635 calllist_t *e, 2636 int *tidu_ptr, 2637 bool_t reconnect, 2638 const struct timeval *waitp, 2639 bool_t nosignal, 2640 cred_t *cr) 2641 { 2642 mblk_t *mp; 2643 struct T_conn_req *tcr; 2644 struct T_info_ack *tinfo; 2645 int interrupted, error; 2646 int tidu_size, kstat_instance; 2647 2648 /* if it's a reconnect, flush any lingering data messages */ 2649 if (reconnect) 2650 (void) putctl1(wq, M_FLUSH, FLUSHRW); 2651 2652 /* 2653 * Note: if the receiver uses SCM_UCRED/getpeerucred the pid will 2654 * appear as -1. 2655 */ 2656 mp = allocb_cred(sizeof (*tcr) + addr->len, cr, NOPID); 2657 if (mp == NULL) { 2658 /* 2659 * This is unfortunate, but we need to look up the stats for 2660 * this zone to increment the "memory allocation failed" 2661 * counter. curproc->p_zone is safe since we're initiating a 2662 * connection and not in some strange streams context. 
2663 */ 2664 struct rpcstat *rpcstat; 2665 2666 rpcstat = zone_getspecific(rpcstat_zone_key, rpc_zone()); 2667 ASSERT(rpcstat != NULL); 2668 2669 RPCLOG0(1, "connmgr_connect: cannot alloc mp for " 2670 "sending conn request\n"); 2671 COTSRCSTAT_INCR(rpcstat->rpc_cots_client, rcnomem); 2672 e->call_status = RPC_SYSTEMERROR; 2673 e->call_reason = ENOSR; 2674 return (FALSE); 2675 } 2676 2677 /* Set TCP buffer size for RPC connections if needed */ 2678 if (addrfmly == AF_INET || addrfmly == AF_INET6) 2679 (void) connmgr_setbufsz(e, wq, cr); 2680 2681 mp->b_datap->db_type = M_PROTO; 2682 tcr = (struct T_conn_req *)mp->b_rptr; 2683 bzero(tcr, sizeof (*tcr)); 2684 tcr->PRIM_type = T_CONN_REQ; 2685 tcr->DEST_length = addr->len; 2686 tcr->DEST_offset = sizeof (struct T_conn_req); 2687 mp->b_wptr = mp->b_rptr + sizeof (*tcr); 2688 2689 bcopy(addr->buf, mp->b_wptr, tcr->DEST_length); 2690 mp->b_wptr += tcr->DEST_length; 2691 2692 RPCLOG(8, "connmgr_connect: sending conn request on queue " 2693 "%p", (void *)wq); 2694 RPCLOG(8, " call %p\n", (void *)wq); 2695 /* 2696 * We use the entry in the handle that is normally used for 2697 * waiting for RPC replies to wait for the connection accept. 2698 */ 2699 if (clnt_dispatch_send(wq, mp, e, 0, 0) != RPC_SUCCESS) { 2700 DTRACE_PROBE(krpc__e__connmgr__connect__cantsend); 2701 freemsg(mp); 2702 return (FALSE); 2703 } 2704 2705 mutex_enter(&clnt_pending_lock); 2706 2707 /* 2708 * We wait for the transport connection to be made, or an 2709 * indication that it could not be made. 2710 */ 2711 interrupted = 0; 2712 2713 /* 2714 * waitforack should have been called with T_OK_ACK, but the 2715 * present implementation needs to be passed T_INFO_ACK to 2716 * work correctly. 2717 */ 2718 error = waitforack(e, T_INFO_ACK, waitp, nosignal); 2719 if (error == EINTR) 2720 interrupted = 1; 2721 if (zone_status_get(curproc->p_zone) >= ZONE_IS_EMPTY) { 2722 /* 2723 * No time to lose; we essentially have been signaled to 2724 * quit. 2725 */ 2726 interrupted = 1; 2727 } 2728 #ifdef RPCDEBUG 2729 if (error == ETIME) 2730 RPCLOG0(8, "connmgr_connect: giving up " 2731 "on connection attempt; " 2732 "clnt_dispatch notifyconn " 2733 "diagnostic 'no one waiting for " 2734 "connection' should not be " 2735 "unexpected\n"); 2736 #endif 2737 if (e->call_prev) 2738 e->call_prev->call_next = e->call_next; 2739 else 2740 clnt_pending = e->call_next; 2741 if (e->call_next) 2742 e->call_next->call_prev = e->call_prev; 2743 mutex_exit(&clnt_pending_lock); 2744 2745 if (e->call_status != RPC_SUCCESS || error != 0) { 2746 if (interrupted) 2747 e->call_status = RPC_INTR; 2748 else if (error == ETIME) 2749 e->call_status = RPC_TIMEDOUT; 2750 else if (error == EPROTO) { 2751 e->call_status = RPC_SYSTEMERROR; 2752 e->call_reason = EPROTO; 2753 } 2754 2755 RPCLOG(8, "connmgr_connect: can't connect, status: " 2756 "%s\n", clnt_sperrno(e->call_status)); 2757 2758 if (e->call_reply) { 2759 freemsg(e->call_reply); 2760 e->call_reply = NULL; 2761 } 2762 2763 return (FALSE); 2764 } 2765 /* 2766 * The result of the "connection accept" is a T_info_ack 2767 * in the call_reply field. 
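 *
 * The advertised TIDU size is rounded down to an XDR unit. A
 * worked example (a sketch, assuming BYTES_PER_XDR_UNIT == 4): a
 * transport reporting TIDU_size == 1461 yields
 *	1461 - (1461 % 4) == 1460
 * while an unusable value (<= 0, e.g. a negative "unlimited"
 * TIDU, or anything larger than COTS_DEFAULT_ALLOCSIZE) falls
 * back to COTS_DEFAULT_ALLOCSIZE.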
2768 */ 2769 ASSERT(e->call_reply != NULL); 2770 mp = e->call_reply; 2771 e->call_reply = NULL; 2772 tinfo = (struct T_info_ack *)mp->b_rptr; 2773 2774 tidu_size = tinfo->TIDU_size; 2775 tidu_size -= (tidu_size % BYTES_PER_XDR_UNIT); 2776 if (tidu_size > COTS_DEFAULT_ALLOCSIZE || (tidu_size <= 0)) 2777 tidu_size = COTS_DEFAULT_ALLOCSIZE; 2778 *tidu_ptr = tidu_size; 2779 2780 freemsg(mp); 2781 2782 /* 2783 * Set up the pertinent options. NODELAY is so the transport doesn't 2784 * buffer up RPC messages on either end. This may not be valid for 2785 * all transports. Failure to set this option is not cause to 2786 * bail out so we return success anyway. Note that lack of NODELAY 2787 * or some other way to flush the message on both ends will cause 2788 * lots of retries and terrible performance. 2789 */ 2790 if (addrfmly == AF_INET || addrfmly == AF_INET6) { 2791 (void) connmgr_setopt(wq, IPPROTO_TCP, TCP_NODELAY, e, cr); 2792 if (e->call_status == RPC_XPRTFAILED) 2793 return (FALSE); 2794 } 2795 2796 /* 2797 * Since we have a connection, we now need to figure out if 2798 * we need to create a kstat. If x_ksp is not NULL then we 2799 * are reusing a connection and so we do not need to create 2800 * another kstat -- let's just return. 2801 */ 2802 if (cm_entry->x_ksp != NULL) 2803 return (TRUE); 2804 2805 /* 2806 * We need to increment rpc_kstat_instance atomically to prevent 2807 * two kstats being created with the same instance. 2808 */ 2809 kstat_instance = atomic_inc_32_nv((uint32_t *)&rpc_kstat_instance); 2810 2811 if ((cm_entry->x_ksp = kstat_create_zone("unix", kstat_instance, 2812 "rpc_cots_connections", "rpc", KSTAT_TYPE_NAMED, 2813 (uint_t)(sizeof (cm_kstat_xprt_t) / sizeof (kstat_named_t)), 2814 KSTAT_FLAG_VIRTUAL, cm_entry->x_zoneid)) == NULL) { 2815 return (TRUE); 2816 } 2817 2818 cm_entry->x_ksp->ks_lock = &connmgr_lock; 2819 cm_entry->x_ksp->ks_private = cm_entry; 2820 cm_entry->x_ksp->ks_data_size = ((INET6_ADDRSTRLEN * sizeof (char)) 2821 + sizeof (cm_kstat_template)); 2822 cm_entry->x_ksp->ks_data = kmem_alloc(cm_entry->x_ksp->ks_data_size, 2823 KM_SLEEP); 2824 bcopy(&cm_kstat_template, cm_entry->x_ksp->ks_data, 2825 cm_entry->x_ksp->ks_data_size); 2826 ((struct cm_kstat_xprt *)(cm_entry->x_ksp->ks_data))-> 2827 x_server.value.str.addr.ptr = 2828 kmem_alloc(INET6_ADDRSTRLEN, KM_SLEEP); 2829 2830 cm_entry->x_ksp->ks_update = conn_kstat_update; 2831 kstat_install(cm_entry->x_ksp); 2832 return (TRUE); 2833 } 2834 2835 /* 2836 * Verify that the specified offset falls within the mblk and 2837 * that the resulting pointer is aligned. 2838 * Returns NULL if not.
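 *
 * For example (a sketch, assuming a word-aligned b_rptr and
 * align_size == 4): a 4-byte option whose OPT_offset lands at
 * b_rptr + 5 fails the (ptr1 & (align_size - 1)) test, b_rptr + 8
 * passes, and any offset/length pair reaching past b_wptr is
 * rejected outright.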
2839 * 2840 * code from fs/sockfs/socksubr.c 2841 */ 2842 static void * 2843 connmgr_opt_getoff(mblk_t *mp, t_uscalar_t offset, 2844 t_uscalar_t length, uint_t align_size) 2845 { 2846 uintptr_t ptr1, ptr2; 2847 2848 ASSERT(mp && mp->b_wptr >= mp->b_rptr); 2849 ptr1 = (uintptr_t)mp->b_rptr + offset; 2850 ptr2 = (uintptr_t)ptr1 + length; 2851 if (ptr1 < (uintptr_t)mp->b_rptr || ptr2 > (uintptr_t)mp->b_wptr) { 2852 return (NULL); 2853 } 2854 if ((ptr1 & (align_size - 1)) != 0) { 2855 return (NULL); 2856 } 2857 return ((void *)ptr1); 2858 } 2859 2860 static bool_t 2861 connmgr_getopt_int(queue_t *wq, int level, int name, int *val, 2862 calllist_t *e, cred_t *cr) 2863 { 2864 mblk_t *mp; 2865 struct opthdr *opt, *opt_res; 2866 struct T_optmgmt_req *tor; 2867 struct T_optmgmt_ack *opt_ack; 2868 struct timeval waitp; 2869 int error; 2870 2871 mp = allocb_cred(sizeof (struct T_optmgmt_req) + 2872 sizeof (struct opthdr) + sizeof (int), cr, NOPID); 2873 if (mp == NULL) 2874 return (FALSE); 2875 2876 mp->b_datap->db_type = M_PROTO; 2877 tor = (struct T_optmgmt_req *)(mp->b_rptr); 2878 tor->PRIM_type = T_SVR4_OPTMGMT_REQ; 2879 tor->MGMT_flags = T_CURRENT; 2880 tor->OPT_length = sizeof (struct opthdr) + sizeof (int); 2881 tor->OPT_offset = sizeof (struct T_optmgmt_req); 2882 2883 opt = (struct opthdr *)(mp->b_rptr + sizeof (struct T_optmgmt_req)); 2884 opt->level = level; 2885 opt->name = name; 2886 opt->len = sizeof (int); 2887 mp->b_wptr += sizeof (struct T_optmgmt_req) + sizeof (struct opthdr) + 2888 sizeof (int); 2889 2890 /* 2891 * We will use this connection regardless 2892 * of whether or not the option is readable. 2893 */ 2894 if (clnt_dispatch_send(wq, mp, e, 0, 0) != RPC_SUCCESS) { 2895 DTRACE_PROBE(krpc__e__connmgr__getopt__cantsend); 2896 freemsg(mp); 2897 return (FALSE); 2898 } 2899 2900 mutex_enter(&clnt_pending_lock); 2901 2902 waitp.tv_sec = clnt_cots_min_conntout; 2903 waitp.tv_usec = 0; 2904 error = waitforack(e, T_OPTMGMT_ACK, &waitp, 1); 2905 2906 if (e->call_prev) 2907 e->call_prev->call_next = e->call_next; 2908 else 2909 clnt_pending = e->call_next; 2910 if (e->call_next) 2911 e->call_next->call_prev = e->call_prev; 2912 mutex_exit(&clnt_pending_lock); 2913 2914 /* get reply message */ 2915 mp = e->call_reply; 2916 e->call_reply = NULL; 2917 2918 if ((!mp) || (e->call_status != RPC_SUCCESS) || (error != 0)) { 2919 2920 DTRACE_PROBE4(krpc__e__connmgr_getopt, int, name, 2921 int, e->call_status, int, error, mblk_t *, mp); 2922 2923 if (mp) 2924 freemsg(mp); 2925 return (FALSE); 2926 } 2927 2928 opt_ack = (struct T_optmgmt_ack *)mp->b_rptr; 2929 opt_res = (struct opthdr *)connmgr_opt_getoff(mp, opt_ack->OPT_offset, 2930 opt_ack->OPT_length, __TPI_ALIGN_SIZE); 2931 2932 if (!opt_res) { 2933 DTRACE_PROBE4(krpc__e__connmgr_optres, mblk_t *, mp, int, name, 2934 int, opt_ack->OPT_offset, int, opt_ack->OPT_length); 2935 freemsg(mp); 2936 return (FALSE); 2937 } 2938 *val = *(int *)&opt_res[1]; 2939 2940 DTRACE_PROBE2(connmgr_getopt__ok, int, name, int, *val); 2941 2942 freemsg(mp); 2943 return (TRUE); 2944 } 2945 2946 /* 2947 * Called by connmgr_connect to set an option on the new stream. 
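 *
 * connmgr_setopt() below is the boolean shorthand: the
 * TCP_NODELAY setting in connmgr_connect(),
 *	connmgr_setopt(wq, IPPROTO_TCP, TCP_NODELAY, e, cr);
 * is equivalent to
 *	connmgr_setopt_int(wq, IPPROTO_TCP, TCP_NODELAY, 1, e, cr);
 * while the buffer-sizing code in connmgr_setbufsz() passes
 * explicit byte counts through connmgr_setopt_int().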
2948 */ 2949 static bool_t 2950 connmgr_setopt_int(queue_t *wq, int level, int name, int val, 2951 calllist_t *e, cred_t *cr) 2952 { 2953 mblk_t *mp; 2954 struct opthdr *opt; 2955 struct T_optmgmt_req *tor; 2956 struct timeval waitp; 2957 int error; 2958 2959 mp = allocb_cred(sizeof (struct T_optmgmt_req) + 2960 sizeof (struct opthdr) + sizeof (int), cr, NOPID); 2961 if (mp == NULL) { 2962 RPCLOG0(1, "connmgr_setopt: cannot alloc mp for option " 2963 "request\n"); 2964 return (FALSE); 2965 } 2966 2967 mp->b_datap->db_type = M_PROTO; 2968 tor = (struct T_optmgmt_req *)(mp->b_rptr); 2969 tor->PRIM_type = T_SVR4_OPTMGMT_REQ; 2970 tor->MGMT_flags = T_NEGOTIATE; 2971 tor->OPT_length = sizeof (struct opthdr) + sizeof (int); 2972 tor->OPT_offset = sizeof (struct T_optmgmt_req); 2973 2974 opt = (struct opthdr *)(mp->b_rptr + sizeof (struct T_optmgmt_req)); 2975 opt->level = level; 2976 opt->name = name; 2977 opt->len = sizeof (int); 2978 *(int *)((char *)opt + sizeof (*opt)) = val; 2979 mp->b_wptr += sizeof (struct T_optmgmt_req) + sizeof (struct opthdr) + 2980 sizeof (int); 2981 2982 /* 2983 * We will use this connection regardless 2984 * of whether or not the option is settable. 2985 */ 2986 if (clnt_dispatch_send(wq, mp, e, 0, 0) != RPC_SUCCESS) { 2987 DTRACE_PROBE(krpc__e__connmgr__setopt__cantsend); 2988 freemsg(mp); 2989 return (FALSE); 2990 } 2991 2992 mutex_enter(&clnt_pending_lock); 2993 2994 waitp.tv_sec = clnt_cots_min_conntout; 2995 waitp.tv_usec = 0; 2996 error = waitforack(e, T_OPTMGMT_ACK, &waitp, 1); 2997 2998 if (e->call_prev) 2999 e->call_prev->call_next = e->call_next; 3000 else 3001 clnt_pending = e->call_next; 3002 if (e->call_next) 3003 e->call_next->call_prev = e->call_prev; 3004 mutex_exit(&clnt_pending_lock); 3005 3006 if (e->call_reply != NULL) { 3007 freemsg(e->call_reply); 3008 e->call_reply = NULL; 3009 } 3010 3011 if (e->call_status != RPC_SUCCESS || error != 0) { 3012 RPCLOG(1, "connmgr_setopt: can't set option: %d\n", name); 3013 return (FALSE); 3014 } 3015 RPCLOG(8, "connmgr_setopt: successfully set option: %d\n", name); 3016 return (TRUE); 3017 } 3018 3019 static bool_t 3020 connmgr_setopt(queue_t *wq, int level, int name, calllist_t *e, cred_t *cr) 3021 { 3022 return (connmgr_setopt_int(wq, level, name, 1, e, cr)); 3023 } 3024 3025 #ifdef DEBUG 3026 3027 /* 3028 * This is a knob to let us force code coverage in allocation failure 3029 * case. 3030 */ 3031 static int connmgr_failsnd; 3032 #define CONN_SND_ALLOC(Size, Pri) \ 3033 ((connmgr_failsnd-- > 0) ? NULL : allocb(Size, Pri)) 3034 3035 #else 3036 3037 #define CONN_SND_ALLOC(Size, Pri) allocb(Size, Pri) 3038 3039 #endif 3040 3041 /* 3042 * Sends an orderly release on the specified queue. 3043 * Entered with connmgr_lock. 
Exited without connmgr_lock. 3044 */ 3045 static void 3046 connmgr_sndrel(struct cm_xprt *cm_entry) 3047 { 3048 struct T_ordrel_req *torr; 3049 mblk_t *mp; 3050 queue_t *q = cm_entry->x_wq; 3051 ASSERT(MUTEX_HELD(&connmgr_lock)); 3052 mp = CONN_SND_ALLOC(sizeof (struct T_ordrel_req), BPRI_LO); 3053 if (mp == NULL) { 3054 cm_entry->x_needrel = TRUE; 3055 mutex_exit(&connmgr_lock); 3056 RPCLOG(1, "connmgr_sndrel: cannot alloc mp for sending ordrel " 3057 "to queue %p\n", (void *)q); 3058 return; 3059 } 3060 mutex_exit(&connmgr_lock); 3061 3062 mp->b_datap->db_type = M_PROTO; 3063 torr = (struct T_ordrel_req *)(mp->b_rptr); 3064 torr->PRIM_type = T_ORDREL_REQ; 3065 mp->b_wptr = mp->b_rptr + sizeof (struct T_ordrel_req); 3066 3067 RPCLOG(8, "connmgr_sndrel: sending ordrel to queue %p\n", (void *)q); 3068 put(q, mp); 3069 } 3070 3071 /* 3072 * Sends a disconnect on the specified queue. 3073 * Entered with connmgr_lock. Exited without connmgr_lock. 3074 */ 3075 static void 3076 connmgr_snddis(struct cm_xprt *cm_entry) 3077 { 3078 struct T_discon_req *tdis; 3079 mblk_t *mp; 3080 queue_t *q = cm_entry->x_wq; 3081 3082 ASSERT(MUTEX_HELD(&connmgr_lock)); 3083 mp = CONN_SND_ALLOC(sizeof (*tdis), BPRI_LO); 3084 if (mp == NULL) { 3085 cm_entry->x_needdis = TRUE; 3086 mutex_exit(&connmgr_lock); 3087 RPCLOG(1, "connmgr_snddis: cannot alloc mp for sending discon " 3088 "to queue %p\n", (void *)q); 3089 return; 3090 } 3091 mutex_exit(&connmgr_lock); 3092 3093 mp->b_datap->db_type = M_PROTO; 3094 tdis = (struct T_discon_req *)mp->b_rptr; 3095 tdis->PRIM_type = T_DISCON_REQ; 3096 mp->b_wptr = mp->b_rptr + sizeof (*tdis); 3097 3098 RPCLOG(8, "connmgr_snddis: sending discon to queue %p\n", (void *)q); 3099 put(q, mp); 3100 } 3101 3102 /* 3103 * Sets up the entry for receiving replies, and calls rpcmod's write put proc 3104 * (through put) to send the call. 3105 */ 3106 static int 3107 clnt_dispatch_send(queue_t *q, mblk_t *mp, calllist_t *e, uint_t xid, 3108 uint_t queue_flag) 3109 { 3110 ASSERT(e != NULL); 3111 3112 e->call_status = RPC_TIMEDOUT; /* optimistic, eh? */ 3113 e->call_reason = 0; 3114 e->call_wq = q; 3115 e->call_xid = xid; 3116 e->call_notified = FALSE; 3117 3118 if (!canput(q)) { 3119 e->call_status = RPC_CANTSEND; 3120 e->call_reason = ENOBUFS; 3121 return (RPC_CANTSEND); 3122 } 3123 3124 /* 3125 * If queue_flag is set then the calllist_t is already on the hash 3126 * queue. In this case just send the message and return. 3127 */ 3128 if (queue_flag) { 3129 put(q, mp); 3130 return (RPC_SUCCESS); 3131 3132 } 3133 3134 /* 3135 * Set up calls for RPC requests (with XID != 0) on the hash 3136 * queue for fast lookups and place other calls (i.e. 3137 * connection management) on the linked list. 3138 */ 3139 if (xid != 0) { 3140 RPCLOG(64, "clnt_dispatch_send: putting xid 0x%x on " 3141 "dispatch list\n", xid); 3142 e->call_hash = call_hash(xid, clnt_cots_hash_size); 3143 e->call_bucket = &cots_call_ht[e->call_hash]; 3144 call_table_enter(e); 3145 } else { 3146 mutex_enter(&clnt_pending_lock); 3147 if (clnt_pending) 3148 clnt_pending->call_prev = e; 3149 e->call_next = clnt_pending; 3150 e->call_prev = NULL; 3151 clnt_pending = e; 3152 mutex_exit(&clnt_pending_lock); 3153 } 3154 3155 put(q, mp); 3156 return (RPC_SUCCESS); 3157 } 3158 3159 /* 3160 * Called by rpcmod to notify a client with a clnt_pending call that its reply 3161 * has arrived. If we can't find a client waiting for this reply, we log 3162 * the error and return.
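 *
 * The lookup mirrors clnt_dispatch_send(): replies carrying an
 * XID are matched through the hash table, roughly
 *	hash = call_hash(xid, clnt_cots_hash_size);
 *	chtp = &cots_call_ht[hash];
 *	call_table_find(chtp, xid, e);	returns with ct_lock held
 * so only one bucket is locked per incoming reply.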
3163 */ 3164 bool_t 3165 clnt_dispatch_notify(mblk_t *mp, zoneid_t zoneid) 3166 { 3167 calllist_t *e = NULL; 3168 call_table_t *chtp; 3169 uint32_t xid; 3170 uint_t hash; 3171 3172 if ((IS_P2ALIGNED(mp->b_rptr, sizeof (uint32_t))) && 3173 (mp->b_wptr - mp->b_rptr) >= sizeof (xid)) 3174 xid = *((uint32_t *)mp->b_rptr); 3175 else { 3176 int i = 0; 3177 unsigned char *p = (unsigned char *)&xid; 3178 unsigned char *rptr; 3179 mblk_t *tmp = mp; 3180 3181 /* 3182 * Copy the xid, byte-by-byte into xid. 3183 */ 3184 while (tmp) { 3185 rptr = tmp->b_rptr; 3186 while (rptr < tmp->b_wptr) { 3187 *p++ = *rptr++; 3188 if (++i >= sizeof (xid)) 3189 goto done_xid_copy; 3190 } 3191 tmp = tmp->b_cont; 3192 } 3193 3194 /* 3195 * If we got here, we ran out of mblk space before the 3196 * xid could be copied. 3197 */ 3198 ASSERT(tmp == NULL && i < sizeof (xid)); 3199 3200 RPCLOG0(1, 3201 "clnt_dispatch_notify: message less than size of xid\n"); 3202 return (FALSE); 3203 3204 } 3205 done_xid_copy: 3206 3207 hash = call_hash(xid, clnt_cots_hash_size); 3208 chtp = &cots_call_ht[hash]; 3209 /* call_table_find returns with the hash bucket locked */ 3210 call_table_find(chtp, xid, e); 3211 3212 if (e != NULL) { 3213 /* 3214 * Found thread waiting for this reply 3215 */ 3216 mutex_enter(&e->call_lock); 3217 3218 /* 3219 * verify that the reply is coming in on 3220 * the same zone that it was sent from. 3221 */ 3222 if (e->call_zoneid != zoneid) { 3223 mutex_exit(&e->call_lock); 3224 mutex_exit(&chtp->ct_lock); 3225 RPCLOG0(1, "clnt_dispatch_notify: incorrect zoneid\n"); 3226 return (FALSE); 3227 } 3228 3229 if (e->call_reply) 3230 /* 3231 * This can happen under the following scenario: 3232 * clnt_cots_kcallit() times out on the response, 3233 * rfscall() repeats the CLNT_CALL() with 3234 * the same xid, clnt_cots_kcallit() sends the retry, 3235 * thereby putting the clnt handle on the pending list, 3236 * the first response arrives, signalling the thread 3237 * in clnt_cots_kcallit(). Before that thread is 3238 * dispatched, the second response arrives as well, 3239 * and clnt_dispatch_notify still finds the handle on 3240 * the pending list, with call_reply set. So free the 3241 * old reply now. 3242 * 3243 * It is also possible for a response intended for 3244 * an RPC call with a different xid to reside here. 3245 * This can happen if the thread that owned this 3246 * client handle prior to the current owner bailed 3247 * out and left its call record on the dispatch 3248 * queue. A window exists where the response can 3249 * arrive before the current owner dispatches its 3250 * RPC call. 3251 * 3252 * In any case, this is the very last point where we 3253 * can safely check the call_reply field before 3254 * placing the new response there. 3255 */ 3256 freemsg(e->call_reply); 3257 e->call_reply = mp; 3258 e->call_status = RPC_SUCCESS; 3259 e->call_notified = TRUE; 3260 cv_signal(&e->call_cv); 3261 mutex_exit(&e->call_lock); 3262 mutex_exit(&chtp->ct_lock); 3263 return (TRUE); 3264 } else { 3265 zone_t *zone; 3266 struct rpcstat *rpcstat; 3267 3268 mutex_exit(&chtp->ct_lock); 3269 RPCLOG(65, "clnt_dispatch_notify: no caller for reply 0x%x\n", 3270 xid); 3271 /* 3272 * This is unfortunate, but we need to lookup the zone so we 3273 * can increment its "rcbadxids" counter. 3274 */ 3275 zone = zone_find_by_id(zoneid); 3276 if (zone == NULL) { 3277 /* 3278 * The zone went away... 
3279 */ 3280 return (FALSE); 3281 } 3282 rpcstat = zone_getspecific(rpcstat_zone_key, zone); 3283 if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) { 3284 /* 3285 * Not interested 3286 */ 3287 zone_rele(zone); 3288 return (FALSE); 3289 } 3290 COTSRCSTAT_INCR(rpcstat->rpc_cots_client, rcbadxids); 3291 zone_rele(zone); 3292 } 3293 return (FALSE); 3294 } 3295 3296 /* 3297 * Called by rpcmod when a non-data indication arrives. The ones in which we 3298 * are interested are connection indications and option acks. We dispatch 3299 * based on the queue the indication came in on. If we are not interested in 3300 * what came in, we return false to rpcmod, who will then pass it upstream. 3301 */ 3302 bool_t 3303 clnt_dispatch_notifyconn(queue_t *q, mblk_t *mp) 3304 { 3305 calllist_t *e; 3306 int type; 3307 3308 ASSERT((q->q_flag & QREADR) == 0); 3309 3310 type = ((union T_primitives *)mp->b_rptr)->type; 3311 RPCLOG(8, "clnt_dispatch_notifyconn: prim type: [%s]\n", 3312 rpc_tpiprim2name(type)); 3313 mutex_enter(&clnt_pending_lock); 3314 for (e = clnt_pending; /* NO CONDITION */; e = e->call_next) { 3315 if (e == NULL) { 3316 mutex_exit(&clnt_pending_lock); 3317 RPCLOG(1, "clnt_dispatch_notifyconn: no one waiting " 3318 "for connection on queue 0x%p\n", (void *)q); 3319 return (FALSE); 3320 } 3321 if (e->call_wq == q) 3322 break; 3323 } 3324 3325 switch (type) { 3326 case T_CONN_CON: 3327 /* 3328 * The transport is now connected, send a T_INFO_REQ to get 3329 * the tidu size. 3330 */ 3331 mutex_exit(&clnt_pending_lock); 3332 ASSERT(mp->b_datap->db_lim - mp->b_datap->db_base >= 3333 sizeof (struct T_info_req)); 3334 mp->b_rptr = mp->b_datap->db_base; 3335 ((union T_primitives *)mp->b_rptr)->type = T_INFO_REQ; 3336 mp->b_wptr = mp->b_rptr + sizeof (struct T_info_req); 3337 mp->b_datap->db_type = M_PCPROTO; 3338 put(q, mp); 3339 return (TRUE); 3340 case T_INFO_ACK: 3341 case T_OPTMGMT_ACK: 3342 e->call_status = RPC_SUCCESS; 3343 e->call_reply = mp; 3344 e->call_notified = TRUE; 3345 cv_signal(&e->call_cv); 3346 break; 3347 case T_ERROR_ACK: 3348 e->call_status = RPC_CANTCONNECT; 3349 e->call_reply = mp; 3350 e->call_notified = TRUE; 3351 cv_signal(&e->call_cv); 3352 break; 3353 case T_OK_ACK: 3354 /* 3355 * Great, but we are really waiting for a T_CONN_CON 3356 */ 3357 freemsg(mp); 3358 break; 3359 default: 3360 mutex_exit(&clnt_pending_lock); 3361 RPCLOG(1, "clnt_dispatch_notifyconn: bad type %d\n", type); 3362 return (FALSE); 3363 } 3364 3365 mutex_exit(&clnt_pending_lock); 3366 return (TRUE); 3367 } 3368 3369 /* 3370 * Called by rpcmod when the transport is (or should be) going away. Informs 3371 * all callers waiting for replies and marks the entry in the connection 3372 * manager's list as unconnected, and either closing (close handshake in 3373 * progress) or dead. 3374 */ 3375 void 3376 clnt_dispatch_notifyall(queue_t *q, int32_t msg_type, int32_t reason) 3377 { 3378 calllist_t *e; 3379 call_table_t *ctp; 3380 struct cm_xprt *cm_entry; 3381 int have_connmgr_lock; 3382 int i; 3383 3384 ASSERT((q->q_flag & QREADR) == 0); 3385 3386 RPCLOG(1, "clnt_dispatch_notifyall on queue %p", (void *)q); 3387 RPCLOG(1, " received a notification prim type [%s]", 3388 rpc_tpiprim2name(msg_type)); 3389 RPCLOG(1, " and reason %d\n", reason); 3390 3391 /* 3392 * Find the transport entry in the connection manager's list, close 3393 * the transport and delete the entry. In the case where rpcmod's 3394 * idle timer goes off, it sends us a T_ORDREL_REQ, indicating we 3395 * should gracefully close the connection.
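 *
 * A summary of the dispositions below (a sketch of the switch
 * that follows):
 *	T_ORDREL_REQ	idle timer fired; orderly release if the
 *			connection is up, otherwise just stop the
 *			idle timer
 *	T_ORDREL_IND	peer is closing; finish the close handshake
 *			or answer with T_DISCON_REQ
 *	T_ERROR_ACK /
 *	T_OK_ACK	ACK of our T_DISCON_REQ; clear x_waitdis and
 *			wake the x_dis_cv waiter
 *	T_DISCON_REQ /
 *	T_DISCON_IND	abortive teardown; mark unconnected or dead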
3396 */ 3397 have_connmgr_lock = 1; 3398 mutex_enter(&connmgr_lock); 3399 for (cm_entry = cm_hd; cm_entry; cm_entry = cm_entry->x_next) { 3400 ASSERT(cm_entry != cm_entry->x_next); 3401 if (cm_entry->x_wq == q) { 3402 ASSERT(MUTEX_HELD(&connmgr_lock)); 3403 ASSERT(have_connmgr_lock == 1); 3404 switch (msg_type) { 3405 case T_ORDREL_REQ: 3406 3407 if (cm_entry->x_dead) { 3408 RPCLOG(1, "idle timeout on dead " 3409 "connection: %p\n", 3410 (void *)cm_entry); 3411 if (clnt_stop_idle != NULL) 3412 (*clnt_stop_idle)(q); 3413 break; 3414 } 3415 3416 /* 3417 * Only mark the connection as dead if it is 3418 * connected and idle. 3419 * An unconnected connection has probably 3420 * gone idle because the server is down, 3421 * and when it comes back up there will be 3422 * retries that need to use that connection. 3423 */ 3424 if (cm_entry->x_connected || 3425 cm_entry->x_doomed) { 3426 if (cm_entry->x_ordrel) { 3427 if (cm_entry->x_closing == 3428 TRUE) { 3429 /* 3430 * The connection is 3431 * obviously wedged due 3432 * to a bug or problem 3433 * with the transport. 3434 * Mark it as dead. 3435 * Otherwise we can 3436 * leak connections. 3437 */ 3438 cm_entry->x_dead = TRUE; 3439 mutex_exit( 3440 &connmgr_lock); 3441 have_connmgr_lock = 0; 3442 if (clnt_stop_idle != 3443 NULL) 3444 (*clnt_stop_idle)(q); 3445 break; 3446 } 3447 cm_entry->x_closing = TRUE; 3448 connmgr_sndrel(cm_entry); 3449 have_connmgr_lock = 0; 3450 } else { 3451 cm_entry->x_dead = TRUE; 3452 mutex_exit(&connmgr_lock); 3453 have_connmgr_lock = 0; 3454 if (clnt_stop_idle != NULL) 3455 (*clnt_stop_idle)(q); 3456 } 3457 } else { 3458 /* 3459 * We don't mark the connection 3460 * as dead, but we turn off the 3461 * idle timer. 3462 */ 3463 mutex_exit(&connmgr_lock); 3464 have_connmgr_lock = 0; 3465 if (clnt_stop_idle != NULL) 3466 (*clnt_stop_idle)(q); 3467 RPCLOG(1, "clnt_dispatch_notifyall:" 3468 " ignoring timeout from rpcmod" 3469 " (q %p) because we are not " 3470 " connected\n", (void *)q); 3471 } 3472 break; 3473 case T_ORDREL_IND: 3474 /* 3475 * If this entry is marked closing, then we are 3476 * completing a close handshake, and the 3477 * connection is dead. Otherwise, the server is 3478 * trying to close. Since the server will not 3479 * be sending any more RPC replies, we abort 3480 * the connection, including flushing 3481 * any RPC requests that are in-transit. 3482 * In either case, mark the entry as dead so 3483 * that it can be closed by the connection 3484 * manager's garbage collector. 
3485 */ 3486 cm_entry->x_dead = TRUE; 3487 if (cm_entry->x_closing) { 3488 mutex_exit(&connmgr_lock); 3489 have_connmgr_lock = 0; 3490 if (clnt_stop_idle != NULL) 3491 (*clnt_stop_idle)(q); 3492 } else { 3493 /* 3494 * if we're getting a disconnect 3495 * before we've finished our 3496 * connect attempt, mark it for 3497 * later processing 3498 */ 3499 if (cm_entry->x_thread) 3500 cm_entry->x_early_disc = TRUE; 3501 else 3502 cm_entry->x_connected = FALSE; 3503 cm_entry->x_waitdis = TRUE; 3504 connmgr_snddis(cm_entry); 3505 have_connmgr_lock = 0; 3506 } 3507 break; 3508 3509 case T_ERROR_ACK: 3510 case T_OK_ACK: 3511 cm_entry->x_waitdis = FALSE; 3512 cv_signal(&cm_entry->x_dis_cv); 3513 mutex_exit(&connmgr_lock); 3514 return; 3515 3516 case T_DISCON_REQ: 3517 if (cm_entry->x_thread) 3518 cm_entry->x_early_disc = TRUE; 3519 else 3520 cm_entry->x_connected = FALSE; 3521 cm_entry->x_waitdis = TRUE; 3522 3523 connmgr_snddis(cm_entry); 3524 have_connmgr_lock = 0; 3525 break; 3526 3527 case T_DISCON_IND: 3528 default: 3529 /* 3530 * if we're getting a disconnect before 3531 * we've finished our connect attempt, 3532 * mark it for later processing 3533 */ 3534 if (cm_entry->x_closing) { 3535 cm_entry->x_dead = TRUE; 3536 mutex_exit(&connmgr_lock); 3537 have_connmgr_lock = 0; 3538 if (clnt_stop_idle != NULL) 3539 (*clnt_stop_idle)(q); 3540 } else { 3541 if (cm_entry->x_thread) { 3542 cm_entry->x_early_disc = TRUE; 3543 } else { 3544 cm_entry->x_dead = TRUE; 3545 cm_entry->x_connected = FALSE; 3546 } 3547 } 3548 break; 3549 } 3550 break; 3551 } 3552 } 3553 3554 if (have_connmgr_lock) 3555 mutex_exit(&connmgr_lock); 3556 3557 if (msg_type == T_ERROR_ACK || msg_type == T_OK_ACK) { 3558 RPCLOG(1, "clnt_dispatch_notifyall: (wq %p) could not find " 3559 "connmgr entry for discon ack\n", (void *)q); 3560 return; 3561 } 3562 3563 /* 3564 * Then kick all the clnt_pending calls out of their wait. There 3565 * should be no clnt_pending calls in the case of rpcmod's idle 3566 * timer firing. 3567 */ 3568 for (i = 0; i < clnt_cots_hash_size; i++) { 3569 ctp = &cots_call_ht[i]; 3570 mutex_enter(&ctp->ct_lock); 3571 for (e = ctp->ct_call_next; 3572 e != (calllist_t *)ctp; 3573 e = e->call_next) { 3574 if (e->call_wq == q && e->call_notified == FALSE) { 3575 RPCLOG(1, 3576 "clnt_dispatch_notifyall for queue %p ", 3577 (void *)q); 3578 RPCLOG(1, "aborting clnt_pending call %p\n", 3579 (void *)e); 3580 3581 if (msg_type == T_DISCON_IND) 3582 e->call_reason = reason; 3583 e->call_notified = TRUE; 3584 e->call_status = RPC_XPRTFAILED; 3585 cv_signal(&e->call_cv); 3586 } 3587 } 3588 mutex_exit(&ctp->ct_lock); 3589 } 3590 3591 mutex_enter(&clnt_pending_lock); 3592 for (e = clnt_pending; e; e = e->call_next) { 3593 /* 3594 * Only signal those RPC handles that haven't been 3595 * signalled yet. Otherwise we can get a bogus call_reason. 3596 * This can happen if thread A is making a call over a 3597 * connection. If the server is killed, it will cause 3598 * reset, and reason will default to EIO as a result of 3599 * a T_ORDREL_IND. Thread B then attempts to recreate 3600 * the connection but gets a T_DISCON_IND. If we set the 3601 * call_reason code for all threads, then if thread A 3602 * hasn't been dispatched yet, it will get the wrong 3603 * reason. The bogus call_reason can make it harder to 3604 * discriminate between calls that fail because the 3605 * connection attempt failed versus those where the call 3606 * may have been executed on the server. 
3607 */ 3608 if (e->call_wq == q && e->call_notified == FALSE) { 3609 RPCLOG(1, "clnt_dispatch_notifyall for queue %p ", 3610 (void *)q); 3611 RPCLOG(1, " aborting clnt_pending call %p\n", 3612 (void *)e); 3613 3614 if (msg_type == T_DISCON_IND) 3615 e->call_reason = reason; 3616 e->call_notified = TRUE; 3617 /* 3618 * Let the caller time out, else it will retry 3619 * immediately. 3620 */ 3621 e->call_status = RPC_XPRTFAILED; 3622 3623 /* 3624 * We used to just signal those threads 3625 * waiting for a connection (call_xid == 0). 3626 * That meant that threads waiting for a response 3627 * waited till their timeout expired. This 3628 * could be a long time if they've specified a 3629 * maximum timeout ((2^31 - 1) ticks). So we 3630 * signal all threads now. 3631 */ 3632 cv_signal(&e->call_cv); 3633 } 3634 } 3635 mutex_exit(&clnt_pending_lock); 3636 } 3637 3638 3639 /*ARGSUSED*/ 3640 /* 3641 * After resuming a system that's been suspended for longer than the 3642 * NFS server's idle timeout (svc_idle_timeout for Solaris 2), rfscall() 3643 * generates "NFS server X not responding" and "NFS server X ok" messages; 3644 * here we reset inet connections to cause a re-connect and avoid those 3645 * NFS messages. See 4045054. 3646 */ 3647 boolean_t 3648 connmgr_cpr_reset(void *arg, int code) 3649 { 3650 struct cm_xprt *cxp; 3651 3652 if (code == CB_CODE_CPR_CHKPT) 3653 return (B_TRUE); 3654 3655 if (mutex_tryenter(&connmgr_lock) == 0) 3656 return (B_FALSE); 3657 for (cxp = cm_hd; cxp; cxp = cxp->x_next) { 3658 if ((cxp->x_family == AF_INET || cxp->x_family == AF_INET6) && 3659 cxp->x_connected == TRUE) { 3660 if (cxp->x_thread) 3661 cxp->x_early_disc = TRUE; 3662 else 3663 cxp->x_connected = FALSE; 3664 cxp->x_needdis = TRUE; 3665 } 3666 } 3667 mutex_exit(&connmgr_lock); 3668 return (B_TRUE); 3669 } 3670 3671 void 3672 clnt_cots_stats_init(zoneid_t zoneid, struct rpc_cots_client **statsp) 3673 { 3674 3675 *statsp = (struct rpc_cots_client *)rpcstat_zone_init_common(zoneid, 3676 "unix", "rpc_cots_client", (const kstat_named_t *)&cots_rcstat_tmpl, 3677 sizeof (cots_rcstat_tmpl)); 3678 } 3679 3680 void 3681 clnt_cots_stats_fini(zoneid_t zoneid, struct rpc_cots_client **statsp) 3682 { 3683 rpcstat_zone_fini_common(zoneid, "unix", "rpc_cots_client"); 3684 kmem_free(*statsp, sizeof (cots_rcstat_tmpl)); 3685 } 3686 3687 void 3688 clnt_cots_init(void) 3689 { 3690 mutex_init(&connmgr_lock, NULL, MUTEX_DEFAULT, NULL); 3691 mutex_init(&clnt_pending_lock, NULL, MUTEX_DEFAULT, NULL); 3692 3693 if (clnt_cots_hash_size < DEFAULT_MIN_HASH_SIZE) 3694 clnt_cots_hash_size = DEFAULT_MIN_HASH_SIZE; 3695 3696 cots_call_ht = call_table_init(clnt_cots_hash_size); 3697 zone_key_create(&zone_cots_key, NULL, NULL, clnt_zone_destroy); 3698 } 3699 3700 void 3701 clnt_cots_fini(void) 3702 { 3703 (void) zone_key_delete(zone_cots_key); 3704 } 3705 3706 /* 3707 * Wait for a TPI ack; return success only if the expected ack is received 3708 * within the timeout period.
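 *
 * Typical use, as in connmgr_connect() above (a sketch):
 *	mutex_enter(&clnt_pending_lock);
 *	error = waitforack(e, T_INFO_ACK, waitp, nosignal);
 * where 0 means e->call_reply now holds the expected primitive,
 * ETIME and EINTR report a timeout or a signal, and a T_ERROR_ACK
 * is translated into the corresponding errno.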
3709 */ 3710 3711 static int 3712 waitforack(calllist_t *e, t_scalar_t ack_prim, const struct timeval *waitp, 3713 bool_t nosignal) 3714 { 3715 union T_primitives *tpr; 3716 clock_t timout; 3717 int cv_stat = 1; 3718 3719 ASSERT(MUTEX_HELD(&clnt_pending_lock)); 3720 while (e->call_reply == NULL) { 3721 if (waitp != NULL) { 3722 timout = waitp->tv_sec * drv_usectohz(MICROSEC) + 3723 drv_usectohz(waitp->tv_usec); 3724 if (nosignal) 3725 cv_stat = cv_reltimedwait(&e->call_cv, 3726 &clnt_pending_lock, timout, TR_CLOCK_TICK); 3727 else 3728 cv_stat = cv_reltimedwait_sig(&e->call_cv, 3729 &clnt_pending_lock, timout, TR_CLOCK_TICK); 3730 } else { 3731 if (nosignal) 3732 cv_wait(&e->call_cv, &clnt_pending_lock); 3733 else 3734 cv_stat = cv_wait_sig(&e->call_cv, 3735 &clnt_pending_lock); 3736 } 3737 if (cv_stat == -1) 3738 return (ETIME); 3739 if (cv_stat == 0) 3740 return (EINTR); 3741 /* 3742 * if we received an error from the server and we know a reply 3743 * is not going to be sent, do not wait for the full timeout, 3744 * return now. 3745 */ 3746 if (e->call_status == RPC_XPRTFAILED) 3747 return (e->call_reason); 3748 } 3749 tpr = (union T_primitives *)e->call_reply->b_rptr; 3750 if (tpr->type == ack_prim) 3751 return (0); /* Success */ 3752 3753 if (tpr->type == T_ERROR_ACK) { 3754 if (tpr->error_ack.TLI_error == TSYSERR) 3755 return (tpr->error_ack.UNIX_error); 3756 else 3757 return (t_tlitosyserr(tpr->error_ack.TLI_error)); 3758 } 3759 3760 return (EPROTO); /* unknown or unexpected primitive */ 3761 } 3762