1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* 28 * Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T 29 * All Rights Reserved 30 */ 31 32 /* 33 * Portions of this source code were derived from Berkeley 4.3 BSD 34 * under license from the Regents of the University of California. 35 */ 36 37 #pragma ident "%Z%%M% %I% %E% SMI" 38 39 /* 40 * Implements a kernel based, client side RPC. 41 */ 42 43 #include <sys/param.h> 44 #include <sys/types.h> 45 #include <sys/systm.h> 46 #include <sys/sysmacros.h> 47 #include <sys/stream.h> 48 #include <sys/strsubr.h> 49 #include <sys/ddi.h> 50 #include <sys/tiuser.h> 51 #include <sys/tihdr.h> 52 #include <sys/t_kuser.h> 53 #include <sys/errno.h> 54 #include <sys/kmem.h> 55 #include <sys/debug.h> 56 #include <sys/kstat.h> 57 #include <sys/t_lock.h> 58 #include <sys/cmn_err.h> 59 #include <sys/conf.h> 60 #include <sys/disp.h> 61 #include <sys/taskq.h> 62 #include <sys/list.h> 63 #include <sys/atomic.h> 64 #include <sys/zone.h> 65 #include <netinet/in.h> 66 #include <rpc/types.h> 67 #include <rpc/xdr.h> 68 #include <rpc/auth.h> 69 #include <rpc/clnt.h> 70 #include <rpc/rpc_msg.h> 71 72 static enum clnt_stat clnt_clts_kcallit(CLIENT *, rpcproc_t, xdrproc_t, 73 caddr_t, xdrproc_t, caddr_t, struct timeval); 74 static void clnt_clts_kabort(CLIENT *); 75 static void clnt_clts_kerror(CLIENT *, struct rpc_err *); 76 static bool_t clnt_clts_kfreeres(CLIENT *, xdrproc_t, caddr_t); 77 static bool_t clnt_clts_kcontrol(CLIENT *, int, char *); 78 static void clnt_clts_kdestroy(CLIENT *); 79 static int clnt_clts_ksettimers(CLIENT *, struct rpc_timers *, 80 struct rpc_timers *, int, void (*)(), caddr_t, uint32_t); 81 82 /* 83 * Operations vector for CLTS based RPC 84 */ 85 static struct clnt_ops clts_ops = { 86 clnt_clts_kcallit, /* do rpc call */ 87 clnt_clts_kabort, /* abort call */ 88 clnt_clts_kerror, /* return error status */ 89 clnt_clts_kfreeres, /* free results */ 90 clnt_clts_kdestroy, /* destroy rpc handle */ 91 clnt_clts_kcontrol, /* the ioctl() of rpc */ 92 clnt_clts_ksettimers /* set retry timers */ 93 }; 94 95 /* 96 * Endpoint for CLTS (INET, INET6, loopback, etc.) 97 */ 98 typedef struct endpnt_type { 99 struct endpnt_type *e_next; /* pointer to next endpoint type */ 100 list_t e_pool; /* list of available endpoints */ 101 list_t e_ilist; /* list of idle endpints */ 102 struct endpnt *e_pcurr; /* pointer to current endpoint */ 103 char e_protofmly[KNC_STRSIZE]; /* protocol family */ 104 dev_t e_rdev; /* device */ 105 kmutex_t e_plock; /* pool lock */ 106 kmutex_t e_ilock; /* idle list lock */ 107 timeout_id_t e_itimer; /* timer to dispatch the taskq */ 108 uint_t e_cnt; /* number of endpoints in the pool */ 109 zoneid_t e_zoneid; /* zoneid of endpoint type */ 110 kcondvar_t e_async_cv; /* cv for asynchronous reap threads */ 111 uint_t e_async_count; /* count of asynchronous reap threads */ 112 } endpnt_type_t; 113 114 typedef struct endpnt { 115 list_node_t e_node; /* link to the pool */ 116 list_node_t e_idle; /* link to the idle list */ 117 endpnt_type_t *e_type; /* back pointer to endpoint type */ 118 TIUSER *e_tiptr; /* pointer to transport endpoint */ 119 queue_t *e_wq; /* write queue */ 120 uint_t e_flags; /* endpoint flags */ 121 uint_t e_ref; /* ref count on endpoint */ 122 kcondvar_t e_cv; /* condition variable */ 123 kmutex_t e_lock; /* protects cv and flags */ 124 time_t e_itime; /* time when rele'd */ 125 } endpnt_t; 126 127 #define ENDPNT_ESTABLISHED 0x1 /* endpoint is established */ 128 #define ENDPNT_WAITING 0x2 /* thread waiting for endpoint */ 129 #define ENDPNT_BOUND 0x4 /* endpoint is bound */ 130 #define ENDPNT_STALE 0x8 /* endpoint is dead */ 131 #define ENDPNT_ONIDLE 0x10 /* endpoint is on the idle list */ 132 133 static krwlock_t endpnt_type_lock; /* protects endpnt_type_list */ 134 static endpnt_type_t *endpnt_type_list = NULL; /* list of CLTS endpoints */ 135 static struct kmem_cache *endpnt_cache; /* cache of endpnt_t's */ 136 static taskq_t *endpnt_taskq; /* endpnt_t reaper thread */ 137 static bool_t taskq_created; /* flag for endpnt_taskq */ 138 static kmutex_t endpnt_taskq_lock; /* taskq lock */ 139 static zone_key_t endpnt_destructor_key; 140 141 #define DEFAULT_ENDPOINT_REAP_INTERVAL 60 /* 1 minute */ 142 #define DEFAULT_INTERVAL_SHIFT 30 /* 30 seconds */ 143 144 /* 145 * Endpoint tunables 146 */ 147 static int clnt_clts_max_endpoints = -1; 148 static int clnt_clts_hash_size = DEFAULT_HASH_SIZE; 149 static time_t clnt_clts_endpoint_reap_interval = -1; 150 static clock_t clnt_clts_taskq_dispatch_interval; 151 152 /* 153 * Response completion hash queue 154 */ 155 static call_table_t *clts_call_ht; 156 157 /* 158 * Routines for the endpoint manager 159 */ 160 static struct endpnt_type *endpnt_type_create(struct knetconfig *); 161 static void endpnt_type_free(struct endpnt_type *); 162 static int check_endpnt(struct endpnt *, struct endpnt **); 163 static struct endpnt *endpnt_get(struct knetconfig *); 164 static void endpnt_rele(struct endpnt *); 165 static void endpnt_reap_settimer(endpnt_type_t *); 166 static void endpnt_reap(endpnt_type_t *); 167 static void endpnt_reap_dispatch(void *); 168 static void endpnt_reclaim(zoneid_t); 169 170 171 /* 172 * Request dipatching function. 173 */ 174 static int clnt_clts_dispatch_send(queue_t *q, mblk_t *, struct netbuf *addr, 175 calllist_t *, uint_t); 176 177 /* 178 * The size of the preserialized RPC header information. 179 */ 180 #define CKU_HDRSIZE 20 181 /* 182 * The initial allocation size. It is small to reduce space requirements. 183 */ 184 #define CKU_INITSIZE 2048 185 /* 186 * The size of additional allocations, if required. It is larger to 187 * reduce the number of actual allocations. 188 */ 189 #define CKU_ALLOCSIZE 8192 190 191 /* 192 * Private data per rpc handle. This structure is allocated by 193 * clnt_clts_kcreate, and freed by clnt_clts_kdestroy. 194 */ 195 struct cku_private { 196 CLIENT cku_client; /* client handle */ 197 int cku_retrys; /* request retrys */ 198 calllist_t cku_call; 199 struct endpnt *cku_endpnt; /* open end point */ 200 struct knetconfig cku_config; 201 struct netbuf cku_addr; /* remote address */ 202 struct rpc_err cku_err; /* error status */ 203 XDR cku_outxdr; /* xdr stream for output */ 204 XDR cku_inxdr; /* xdr stream for input */ 205 char cku_rpchdr[CKU_HDRSIZE + 4]; /* rpc header */ 206 struct cred *cku_cred; /* credentials */ 207 struct rpc_timers *cku_timers; /* for estimating RTT */ 208 struct rpc_timers *cku_timeall; /* for estimating RTT */ 209 void (*cku_feedback)(int, int, caddr_t); 210 /* ptr to feedback rtn */ 211 caddr_t cku_feedarg; /* argument for feedback func */ 212 uint32_t cku_xid; /* current XID */ 213 bool_t cku_bcast; /* RPC broadcast hint */ 214 struct rpc_clts_client *cku_stats; /* counters for the zone */ 215 }; 216 217 static const struct rpc_clts_client { 218 kstat_named_t rccalls; 219 kstat_named_t rcbadcalls; 220 kstat_named_t rcretrans; 221 kstat_named_t rcbadxids; 222 kstat_named_t rctimeouts; 223 kstat_named_t rcnewcreds; 224 kstat_named_t rcbadverfs; 225 kstat_named_t rctimers; 226 kstat_named_t rcnomem; 227 kstat_named_t rccantsend; 228 } clts_rcstat_tmpl = { 229 { "calls", KSTAT_DATA_UINT64 }, 230 { "badcalls", KSTAT_DATA_UINT64 }, 231 { "retrans", KSTAT_DATA_UINT64 }, 232 { "badxids", KSTAT_DATA_UINT64 }, 233 { "timeouts", KSTAT_DATA_UINT64 }, 234 { "newcreds", KSTAT_DATA_UINT64 }, 235 { "badverfs", KSTAT_DATA_UINT64 }, 236 { "timers", KSTAT_DATA_UINT64 }, 237 { "nomem", KSTAT_DATA_UINT64 }, 238 { "cantsend", KSTAT_DATA_UINT64 }, 239 }; 240 241 static uint_t clts_rcstat_ndata = 242 sizeof (clts_rcstat_tmpl) / sizeof (kstat_named_t); 243 244 #define RCSTAT_INCR(s, x) \ 245 atomic_add_64(&(s)->x.value.ui64, 1) 246 247 #define ptoh(p) (&((p)->cku_client)) 248 #define htop(h) ((struct cku_private *)((h)->cl_private)) 249 250 /* 251 * Times to retry 252 */ 253 #define SNDTRIES 4 254 #define REFRESHES 2 /* authentication refreshes */ 255 256 static int clnt_clts_do_bindresvport = 1; /* bind to reserved port */ 257 #define BINDRESVPORT_RETRIES 5 258 259 void 260 clnt_clts_stats_init(zoneid_t zoneid, struct rpc_clts_client **statsp) 261 { 262 kstat_t *ksp; 263 kstat_named_t *knp; 264 265 knp = rpcstat_zone_init_common(zoneid, "unix", "rpc_clts_client", 266 (const kstat_named_t *)&clts_rcstat_tmpl, 267 sizeof (clts_rcstat_tmpl)); 268 /* 269 * Backwards compatibility for old kstat clients 270 */ 271 ksp = kstat_create_zone("unix", 0, "rpc_client", "rpc", 272 KSTAT_TYPE_NAMED, clts_rcstat_ndata, 273 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid); 274 if (ksp) { 275 ksp->ks_data = knp; 276 kstat_install(ksp); 277 } 278 *statsp = (struct rpc_clts_client *)knp; 279 } 280 281 void 282 clnt_clts_stats_fini(zoneid_t zoneid, struct rpc_clts_client **statsp) 283 { 284 rpcstat_zone_fini_common(zoneid, "unix", "rpc_clts_client"); 285 kstat_delete_byname_zone("unix", 0, "rpc_client", zoneid); 286 kmem_free(*statsp, sizeof (clts_rcstat_tmpl)); 287 } 288 289 /* 290 * Create an rpc handle for a clts rpc connection. 291 * Allocates space for the handle structure and the private data. 292 */ 293 /* ARGSUSED */ 294 int 295 clnt_clts_kcreate(struct knetconfig *config, struct netbuf *addr, 296 rpcprog_t pgm, rpcvers_t vers, int retrys, struct cred *cred, 297 CLIENT **cl) 298 { 299 CLIENT *h; 300 struct cku_private *p; 301 struct rpc_msg call_msg; 302 int error; 303 int plen; 304 305 if (cl == NULL) 306 return (EINVAL); 307 308 *cl = NULL; 309 error = 0; 310 311 p = kmem_zalloc(sizeof (*p), KM_SLEEP); 312 313 h = ptoh(p); 314 315 /* handle */ 316 h->cl_ops = &clts_ops; 317 h->cl_private = (caddr_t)p; 318 h->cl_auth = authkern_create(); 319 320 /* call message, just used to pre-serialize below */ 321 call_msg.rm_xid = 0; 322 call_msg.rm_direction = CALL; 323 call_msg.rm_call.cb_rpcvers = RPC_MSG_VERSION; 324 call_msg.rm_call.cb_prog = pgm; 325 call_msg.rm_call.cb_vers = vers; 326 327 /* private */ 328 clnt_clts_kinit(h, addr, retrys, cred); 329 330 xdrmem_create(&p->cku_outxdr, p->cku_rpchdr, CKU_HDRSIZE, XDR_ENCODE); 331 332 /* pre-serialize call message header */ 333 if (!xdr_callhdr(&p->cku_outxdr, &call_msg)) { 334 error = EINVAL; /* XXX */ 335 goto bad; 336 } 337 338 p->cku_config.knc_rdev = config->knc_rdev; 339 p->cku_config.knc_semantics = config->knc_semantics; 340 plen = strlen(config->knc_protofmly) + 1; 341 p->cku_config.knc_protofmly = kmem_alloc(plen, KM_SLEEP); 342 bcopy(config->knc_protofmly, p->cku_config.knc_protofmly, plen); 343 344 cv_init(&p->cku_call.call_cv, NULL, CV_DEFAULT, NULL); 345 mutex_init(&p->cku_call.call_lock, NULL, MUTEX_DEFAULT, NULL); 346 347 *cl = h; 348 return (0); 349 350 bad: 351 auth_destroy(h->cl_auth); 352 kmem_free(p->cku_addr.buf, addr->maxlen); 353 kmem_free(p, sizeof (struct cku_private)); 354 355 return (error); 356 } 357 358 void 359 clnt_clts_kinit(CLIENT *h, struct netbuf *addr, int retrys, cred_t *cred) 360 { 361 /* LINTED pointer alignment */ 362 struct cku_private *p = htop(h); 363 struct rpcstat *rsp; 364 365 rsp = zone_getspecific(rpcstat_zone_key, curproc->p_zone); 366 ASSERT(rsp != NULL); 367 368 p->cku_retrys = retrys; 369 370 if (p->cku_addr.maxlen < addr->len) { 371 if (p->cku_addr.maxlen != 0 && p->cku_addr.buf != NULL) 372 kmem_free(p->cku_addr.buf, p->cku_addr.maxlen); 373 374 p->cku_addr.buf = kmem_zalloc(addr->maxlen, KM_SLEEP); 375 p->cku_addr.maxlen = addr->maxlen; 376 } 377 378 p->cku_addr.len = addr->len; 379 bcopy(addr->buf, p->cku_addr.buf, addr->len); 380 381 p->cku_cred = cred; 382 p->cku_xid = 0; 383 p->cku_timers = NULL; 384 p->cku_timeall = NULL; 385 p->cku_feedback = NULL; 386 p->cku_bcast = FALSE; 387 p->cku_call.call_xid = 0; 388 p->cku_call.call_hash = 0; 389 p->cku_call.call_notified = FALSE; 390 p->cku_call.call_next = NULL; 391 p->cku_call.call_prev = NULL; 392 p->cku_call.call_reply = NULL; 393 p->cku_call.call_wq = NULL; 394 p->cku_stats = rsp->rpc_clts_client; 395 } 396 397 /* 398 * set the timers. Return current retransmission timeout. 399 */ 400 static int 401 clnt_clts_ksettimers(CLIENT *h, struct rpc_timers *t, struct rpc_timers *all, 402 int minimum, void (*feedback)(int, int, caddr_t), caddr_t arg, 403 uint32_t xid) 404 { 405 /* LINTED pointer alignment */ 406 struct cku_private *p = htop(h); 407 int value; 408 409 p->cku_feedback = feedback; 410 p->cku_feedarg = arg; 411 p->cku_timers = t; 412 p->cku_timeall = all; 413 if (xid) 414 p->cku_xid = xid; 415 value = all->rt_rtxcur; 416 value += t->rt_rtxcur; 417 if (value < minimum) 418 return (minimum); 419 RCSTAT_INCR(p->cku_stats, rctimers); 420 return (value); 421 } 422 423 /* 424 * Time out back off function. tim is in HZ 425 */ 426 #define MAXTIMO (20 * hz) 427 #define backoff(tim) (((tim) < MAXTIMO) ? dobackoff(tim) : (tim)) 428 #define dobackoff(tim) ((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1)) 429 430 #define RETRY_POLL_TIMO 30 431 432 /* 433 * Call remote procedure. 434 * Most of the work of rpc is done here. We serialize what is left 435 * of the header (some was pre-serialized in the handle), serialize 436 * the arguments, and send it off. We wait for a reply or a time out. 437 * Timeout causes an immediate return, other packet problems may cause 438 * a retry on the receive. When a good packet is received we deserialize 439 * it, and check verification. A bad reply code will cause one retry 440 * with full (longhand) credentials. 441 */ 442 enum clnt_stat 443 clnt_clts_kcallit_addr(CLIENT *h, rpcproc_t procnum, xdrproc_t xdr_args, 444 caddr_t argsp, xdrproc_t xdr_results, caddr_t resultsp, 445 struct timeval wait, struct netbuf *sin) 446 { 447 /* LINTED pointer alignment */ 448 struct cku_private *p = htop(h); 449 XDR *xdrs; 450 int stries = p->cku_retrys; 451 int refreshes = REFRESHES; /* number of times to refresh cred */ 452 int round_trip; /* time the RPC */ 453 int error; 454 int hdrsz; 455 mblk_t *mp; 456 mblk_t *mpdup; 457 mblk_t *resp = NULL; 458 mblk_t *tmp; 459 calllist_t *call = &p->cku_call; 460 clock_t timout = 0; 461 bool_t interrupted; 462 enum clnt_stat status; 463 struct rpc_msg reply_msg; 464 enum clnt_stat re_status; 465 466 RCSTAT_INCR(p->cku_stats, rccalls); 467 468 RPCLOG(2, "clnt_clts_kcallit_addr: wait.tv_sec: %ld\n", wait.tv_sec); 469 RPCLOG(2, "clnt_clts_kcallit_addr: wait.tv_usec: %ld\n", wait.tv_usec); 470 471 timout = TIMEVAL_TO_TICK(&wait); 472 473 if (p->cku_xid == 0) { 474 p->cku_xid = alloc_xid(); 475 if (p->cku_endpnt != NULL) 476 endpnt_rele(p->cku_endpnt); 477 p->cku_endpnt = NULL; 478 } 479 480 mpdup = NULL; 481 call_again: 482 483 if (mpdup == NULL) { 484 485 while ((mp = allocb(CKU_INITSIZE, BPRI_LO)) == NULL) { 486 if (strwaitbuf(CKU_INITSIZE, BPRI_LO)) { 487 p->cku_err.re_status = RPC_SYSTEMERROR; 488 p->cku_err.re_errno = ENOSR; 489 goto done; 490 } 491 } 492 493 xdrs = &p->cku_outxdr; 494 xdrmblk_init(xdrs, mp, XDR_ENCODE, CKU_ALLOCSIZE); 495 496 if (h->cl_auth->ah_cred.oa_flavor != RPCSEC_GSS) { 497 /* 498 * Copy in the preserialized RPC header 499 * information. 500 */ 501 bcopy(p->cku_rpchdr, mp->b_rptr, CKU_HDRSIZE); 502 503 /* 504 * transaction id is the 1st thing in the output 505 * buffer. 506 */ 507 /* LINTED pointer alignment */ 508 (*(uint32_t *)(mp->b_rptr)) = p->cku_xid; 509 510 /* Skip the preserialized stuff. */ 511 XDR_SETPOS(xdrs, CKU_HDRSIZE); 512 513 /* Serialize dynamic stuff into the output buffer. */ 514 if ((!XDR_PUTINT32(xdrs, (int32_t *)&procnum)) || 515 (!AUTH_MARSHALL(h->cl_auth, xdrs, p->cku_cred)) || 516 (!(*xdr_args)(xdrs, argsp))) { 517 freemsg(mp); 518 p->cku_err.re_status = RPC_CANTENCODEARGS; 519 p->cku_err.re_errno = EIO; 520 goto done; 521 } 522 } else { 523 uint32_t *uproc = (uint32_t *) 524 &p->cku_rpchdr[CKU_HDRSIZE]; 525 IXDR_PUT_U_INT32(uproc, procnum); 526 527 (*(uint32_t *)(&p->cku_rpchdr[0])) = p->cku_xid; 528 XDR_SETPOS(xdrs, 0); 529 530 /* Serialize the procedure number and the arguments. */ 531 if (!AUTH_WRAP(h->cl_auth, (caddr_t)p->cku_rpchdr, 532 CKU_HDRSIZE+4, xdrs, xdr_args, argsp)) { 533 freemsg(mp); 534 p->cku_err.re_status = RPC_CANTENCODEARGS; 535 p->cku_err.re_errno = EIO; 536 goto done; 537 } 538 } 539 } else 540 mp = mpdup; 541 542 mpdup = dupmsg(mp); 543 if (mpdup == NULL) { 544 freemsg(mp); 545 p->cku_err.re_status = RPC_SYSTEMERROR; 546 p->cku_err.re_errno = ENOSR; 547 goto done; 548 } 549 550 /* 551 * Grab an endpnt only if the endpoint is NULL. We could be retrying 552 * the request and in this case we want to go through the same 553 * source port, so that the duplicate request cache may detect a 554 * retry. 555 */ 556 if (p->cku_endpnt == NULL) 557 p->cku_endpnt = endpnt_get(&p->cku_config); 558 559 if (p->cku_endpnt == NULL) { 560 freemsg(mp); 561 p->cku_err.re_status = RPC_SYSTEMERROR; 562 p->cku_err.re_errno = ENOSR; 563 goto done; 564 } 565 566 round_trip = lbolt; 567 568 error = clnt_clts_dispatch_send(p->cku_endpnt->e_wq, mp, 569 &p->cku_addr, call, p->cku_xid); 570 571 if (error != 0) { 572 freemsg(mp); 573 p->cku_err.re_status = RPC_CANTSEND; 574 p->cku_err.re_errno = error; 575 RCSTAT_INCR(p->cku_stats, rccantsend); 576 goto done1; 577 } 578 579 RPCLOG(64, "clnt_clts_kcallit_addr: sent call for xid 0x%x\n", 580 p->cku_xid); 581 582 /* 583 * There are two reasons for which we go back to to tryread. 584 * 585 * a) In case the status is RPC_PROCUNAVAIL and we sent out a 586 * broadcast we should not get any invalid messages with the 587 * RPC_PROCUNAVAIL error back. Some broken RPC implementations 588 * send them and for this we have to ignore them ( as we would 589 * have never received them ) and look for another message 590 * which might contain the valid response because we don't know 591 * how many broken implementations are in the network. So we are 592 * going to loop until 593 * - we received a valid response 594 * - we have processed all invalid responses and 595 * got a time out when we try to receive again a 596 * message. 597 * 598 * b) We will jump back to tryread also in case we failed 599 * within the AUTH_VALIDATE. In this case we should move 600 * on and loop until we received a valid response or we 601 * have processed all responses with broken authentication 602 * and we got a time out when we try to receive a message. 603 */ 604 tryread: 605 mutex_enter(&call->call_lock); 606 interrupted = FALSE; 607 if (call->call_notified == FALSE) { 608 klwp_t *lwp = ttolwp(curthread); 609 clock_t cv_wait_ret = 1; /* init to > 0 */ 610 clock_t cv_timout = timout; 611 612 if (lwp != NULL) 613 lwp->lwp_nostop++; 614 615 cv_timout += lbolt; 616 617 if (h->cl_nosignal) 618 while ((cv_wait_ret = 619 cv_timedwait(&call->call_cv, 620 &call->call_lock, cv_timout)) > 0 && 621 call->call_notified == FALSE); 622 else 623 while ((cv_wait_ret = 624 cv_timedwait_sig(&call->call_cv, 625 &call->call_lock, cv_timout)) > 0 && 626 call->call_notified == FALSE); 627 628 if (cv_wait_ret == 0) 629 interrupted = TRUE; 630 631 if (lwp != NULL) 632 lwp->lwp_nostop--; 633 } 634 resp = call->call_reply; 635 call->call_reply = NULL; 636 status = call->call_status; 637 /* 638 * We have to reset the call_notified here. In case we have 639 * to do a retry ( e.g. in case we got a RPC_PROCUNAVAIL 640 * error ) we need to set this to false to ensure that 641 * we will wait for the next message. When the next message 642 * is going to arrive the function clnt_clts_dispatch_notify 643 * will set this to true again. 644 */ 645 call->call_notified = FALSE; 646 mutex_exit(&call->call_lock); 647 648 if (status == RPC_TIMEDOUT) { 649 if (interrupted) { 650 /* 651 * We got interrupted, bail out 652 */ 653 p->cku_err.re_status = RPC_INTR; 654 p->cku_err.re_errno = EINTR; 655 goto done1; 656 } else { 657 /* 658 * It's possible that our response arrived 659 * right after we timed out. Check to see 660 * if it has arrived before we remove the 661 * calllist from the dispatch queue. 662 */ 663 mutex_enter(&call->call_lock); 664 if (call->call_notified == TRUE) { 665 resp = call->call_reply; 666 call->call_reply = NULL; 667 mutex_exit(&call->call_lock); 668 RPCLOG(8, "clnt_clts_kcallit_addr: " 669 "response received for request " 670 "w/xid 0x%x after timeout\n", 671 p->cku_xid); 672 goto getresponse; 673 } 674 mutex_exit(&call->call_lock); 675 676 RPCLOG(8, "clnt_clts_kcallit_addr: " 677 "request w/xid 0x%x timedout " 678 "waiting for reply\n", p->cku_xid); 679 #if 0 /* XXX not yet */ 680 /* 681 * Timeout may be due to a dead gateway. Send 682 * an ioctl downstream advising deletion of 683 * route when we reach the half-way point to 684 * timing out. 685 */ 686 if (stries == p->cku_retrys/2) { 687 t_kadvise(p->cku_endpnt->e_tiptr, 688 (uchar_t *)p->cku_addr.buf, 689 p->cku_addr.len); 690 } 691 #endif /* not yet */ 692 p->cku_err.re_status = RPC_TIMEDOUT; 693 p->cku_err.re_errno = ETIMEDOUT; 694 RCSTAT_INCR(p->cku_stats, rctimeouts); 695 goto done1; 696 } 697 } 698 699 getresponse: 700 /* 701 * Check to see if a response arrived. If it one is 702 * present then proceed to process the reponse. Otherwise 703 * fall through to retry or retransmit the request. This 704 * is probably not the optimal thing to do, but since we 705 * are most likely dealing with a unrealiable transport it 706 * is the safe thing to so. 707 */ 708 if (resp == NULL) { 709 p->cku_err.re_status = RPC_CANTRECV; 710 p->cku_err.re_errno = EIO; 711 goto done1; 712 } 713 714 /* 715 * Prepare the message for further processing. We need to remove 716 * the datagram header and copy the source address if necessary. No 717 * need to verify the header since rpcmod took care of that. 718 */ 719 /* 720 * Copy the source address if the caller has supplied a netbuf. 721 */ 722 if (sin != NULL) { 723 union T_primitives *pptr; 724 725 pptr = (union T_primitives *)resp->b_rptr; 726 bcopy(resp->b_rptr + pptr->unitdata_ind.SRC_offset, sin->buf, 727 pptr->unitdata_ind.SRC_length); 728 sin->len = pptr->unitdata_ind.SRC_length; 729 } 730 731 /* 732 * Pop off the datagram header. 733 */ 734 hdrsz = resp->b_wptr - resp->b_rptr; 735 if ((resp->b_wptr - (resp->b_rptr + hdrsz)) == 0) { 736 tmp = resp; 737 resp = resp->b_cont; 738 tmp->b_cont = NULL; 739 freeb(tmp); 740 } else { 741 unsigned char *ud_off = resp->b_rptr; 742 resp->b_rptr += hdrsz; 743 tmp = dupb(resp); 744 if (tmp == NULL) { 745 p->cku_err.re_status = RPC_SYSTEMERROR; 746 p->cku_err.re_errno = ENOSR; 747 freemsg(resp); 748 goto done1; 749 } 750 tmp->b_cont = resp->b_cont; 751 resp->b_rptr = ud_off; 752 freeb(resp); 753 resp = tmp; 754 } 755 756 round_trip = lbolt - round_trip; 757 /* 758 * Van Jacobson timer algorithm here, only if NOT a retransmission. 759 */ 760 if (p->cku_timers != NULL && stries == p->cku_retrys) { 761 int rt; 762 763 rt = round_trip; 764 rt -= (p->cku_timers->rt_srtt >> 3); 765 p->cku_timers->rt_srtt += rt; 766 if (rt < 0) 767 rt = - rt; 768 rt -= (p->cku_timers->rt_deviate >> 2); 769 p->cku_timers->rt_deviate += rt; 770 p->cku_timers->rt_rtxcur = 771 (clock_t)((p->cku_timers->rt_srtt >> 2) + 772 p->cku_timers->rt_deviate) >> 1; 773 774 rt = round_trip; 775 rt -= (p->cku_timeall->rt_srtt >> 3); 776 p->cku_timeall->rt_srtt += rt; 777 if (rt < 0) 778 rt = - rt; 779 rt -= (p->cku_timeall->rt_deviate >> 2); 780 p->cku_timeall->rt_deviate += rt; 781 p->cku_timeall->rt_rtxcur = 782 (clock_t)((p->cku_timeall->rt_srtt >> 2) + 783 p->cku_timeall->rt_deviate) >> 1; 784 if (p->cku_feedback != NULL) { 785 (*p->cku_feedback)(FEEDBACK_OK, procnum, 786 p->cku_feedarg); 787 } 788 } 789 790 /* 791 * Process reply 792 */ 793 xdrs = &(p->cku_inxdr); 794 xdrmblk_init(xdrs, resp, XDR_DECODE, 0); 795 796 reply_msg.rm_direction = REPLY; 797 reply_msg.rm_reply.rp_stat = MSG_ACCEPTED; 798 reply_msg.acpted_rply.ar_stat = SUCCESS; 799 reply_msg.acpted_rply.ar_verf = _null_auth; 800 /* 801 * xdr_results will be done in AUTH_UNWRAP. 802 */ 803 reply_msg.acpted_rply.ar_results.where = NULL; 804 reply_msg.acpted_rply.ar_results.proc = xdr_void; 805 806 /* 807 * Decode and validate the response. 808 */ 809 if (!xdr_replymsg(xdrs, &reply_msg)) { 810 p->cku_err.re_status = RPC_CANTDECODERES; 811 p->cku_err.re_errno = EIO; 812 (void) xdr_rpc_free_verifier(xdrs, &reply_msg); 813 goto done1; 814 } 815 816 _seterr_reply(&reply_msg, &(p->cku_err)); 817 818 re_status = p->cku_err.re_status; 819 if (re_status == RPC_SUCCESS) { 820 /* 821 * Reply is good, check auth. 822 */ 823 if (!AUTH_VALIDATE(h->cl_auth, 824 &reply_msg.acpted_rply.ar_verf)) { 825 p->cku_err.re_status = RPC_AUTHERROR; 826 p->cku_err.re_why = AUTH_INVALIDRESP; 827 RCSTAT_INCR(p->cku_stats, rcbadverfs); 828 (void) xdr_rpc_free_verifier(xdrs, &reply_msg); 829 goto tryread; 830 } 831 if (!AUTH_UNWRAP(h->cl_auth, xdrs, xdr_results, resultsp)) { 832 p->cku_err.re_status = RPC_CANTDECODERES; 833 p->cku_err.re_errno = EIO; 834 } 835 (void) xdr_rpc_free_verifier(xdrs, &reply_msg); 836 goto done1; 837 } 838 /* set errno in case we can't recover */ 839 if (re_status != RPC_VERSMISMATCH && 840 re_status != RPC_AUTHERROR && 841 re_status != RPC_PROGVERSMISMATCH) 842 p->cku_err.re_errno = EIO; 843 /* 844 * Determine whether or not we're doing an RPC 845 * broadcast. Some server implementations don't 846 * follow RFC 1050, section 7.4.2 in that they 847 * don't remain silent when they see a proc 848 * they don't support. Therefore we keep trying 849 * to receive on RPC_PROCUNAVAIL, hoping to get 850 * a valid response from a compliant server. 851 */ 852 if (re_status == RPC_PROCUNAVAIL && p->cku_bcast) { 853 (void) xdr_rpc_free_verifier(xdrs, &reply_msg); 854 goto tryread; 855 } 856 if (re_status == RPC_AUTHERROR) { 857 /* 858 * Maybe our credential need to be refreshed 859 */ 860 if (refreshes > 0 && 861 AUTH_REFRESH(h->cl_auth, &reply_msg, p->cku_cred)) { 862 /* 863 * The credential is refreshed. Try the request again. 864 * Even if stries == 0, we still retry as long as 865 * refreshes > 0. This prevents a soft authentication 866 * error turning into a hard one at an upper level. 867 */ 868 refreshes--; 869 RCSTAT_INCR(p->cku_stats, rcbadcalls); 870 RCSTAT_INCR(p->cku_stats, rcnewcreds); 871 872 (void) xdr_rpc_free_verifier(xdrs, &reply_msg); 873 freemsg(mpdup); 874 call_table_remove(call); 875 mutex_enter(&call->call_lock); 876 if (call->call_reply != NULL) { 877 freemsg(call->call_reply); 878 call->call_reply = NULL; 879 } 880 mutex_exit(&call->call_lock); 881 882 freemsg(resp); 883 mpdup = NULL; 884 goto call_again; 885 } 886 /* 887 * We have used the client handle to do an AUTH_REFRESH 888 * and the RPC status may be set to RPC_SUCCESS; 889 * Let's make sure to set it to RPC_AUTHERROR. 890 */ 891 p->cku_err.re_status = RPC_CANTDECODERES; 892 893 /* 894 * Map recoverable and unrecoverable 895 * authentication errors to appropriate errno 896 */ 897 switch (p->cku_err.re_why) { 898 case AUTH_BADCRED: 899 case AUTH_BADVERF: 900 case AUTH_INVALIDRESP: 901 case AUTH_TOOWEAK: 902 case AUTH_FAILED: 903 case RPCSEC_GSS_NOCRED: 904 case RPCSEC_GSS_FAILED: 905 p->cku_err.re_errno = EACCES; 906 break; 907 case AUTH_REJECTEDCRED: 908 case AUTH_REJECTEDVERF: 909 default: 910 p->cku_err.re_errno = EIO; 911 break; 912 } 913 RPCLOG(1, "clnt_clts_kcallit : authentication failed " 914 "with RPC_AUTHERROR of type %d\n", 915 p->cku_err.re_why); 916 } 917 918 (void) xdr_rpc_free_verifier(xdrs, &reply_msg); 919 920 done1: 921 call_table_remove(call); 922 mutex_enter(&call->call_lock); 923 if (call->call_reply != NULL) { 924 freemsg(call->call_reply); 925 call->call_reply = NULL; 926 } 927 mutex_exit(&call->call_lock); 928 RPCLOG(64, "clnt_clts_kcallit_addr: xid 0x%x taken off dispatch list", 929 p->cku_xid); 930 931 done: 932 if (resp != NULL) { 933 freemsg(resp); 934 resp = NULL; 935 } 936 937 if ((p->cku_err.re_status != RPC_SUCCESS) && 938 (p->cku_err.re_status != RPC_INTR) && 939 (p->cku_err.re_status != RPC_UDERROR) && 940 !IS_UNRECOVERABLE_RPC(p->cku_err.re_status)) { 941 if (p->cku_feedback != NULL && stries == p->cku_retrys) { 942 (*p->cku_feedback)(FEEDBACK_REXMIT1, procnum, 943 p->cku_feedarg); 944 } 945 946 timout = backoff(timout); 947 if (p->cku_timeall != (struct rpc_timers *)0) 948 p->cku_timeall->rt_rtxcur = timout; 949 950 if (p->cku_err.re_status == RPC_SYSTEMERROR || 951 p->cku_err.re_status == RPC_CANTSEND) { 952 /* 953 * Errors due to lack of resources, wait a bit 954 * and try again. 955 */ 956 (void) delay(hz/10); 957 /* (void) sleep((caddr_t)&lbolt, PZERO-4); */ 958 } 959 if (stries-- > 0) { 960 RCSTAT_INCR(p->cku_stats, rcretrans); 961 goto call_again; 962 } 963 } 964 965 if (mpdup != NULL) 966 freemsg(mpdup); 967 968 if (p->cku_err.re_status != RPC_SUCCESS) { 969 RCSTAT_INCR(p->cku_stats, rcbadcalls); 970 } 971 972 /* 973 * Allow the endpoint to be held by the client handle in case this 974 * RPC was not successful. A retry may occur at a higher level and 975 * in this case we may want to send the request over the same 976 * source port. 977 */ 978 if (p->cku_err.re_status == RPC_SUCCESS && p->cku_endpnt != NULL) { 979 endpnt_rele(p->cku_endpnt); 980 p->cku_endpnt = NULL; 981 } 982 983 return (p->cku_err.re_status); 984 } 985 986 static enum clnt_stat 987 clnt_clts_kcallit(CLIENT *h, rpcproc_t procnum, xdrproc_t xdr_args, 988 caddr_t argsp, xdrproc_t xdr_results, caddr_t resultsp, 989 struct timeval wait) 990 { 991 return (clnt_clts_kcallit_addr(h, procnum, xdr_args, argsp, 992 xdr_results, resultsp, wait, NULL)); 993 } 994 995 /* 996 * Return error info on this handle. 997 */ 998 static void 999 clnt_clts_kerror(CLIENT *h, struct rpc_err *err) 1000 { 1001 /* LINTED pointer alignment */ 1002 struct cku_private *p = htop(h); 1003 1004 *err = p->cku_err; 1005 } 1006 1007 static bool_t 1008 clnt_clts_kfreeres(CLIENT *h, xdrproc_t xdr_res, caddr_t res_ptr) 1009 { 1010 /* LINTED pointer alignment */ 1011 struct cku_private *p = htop(h); 1012 XDR *xdrs; 1013 1014 xdrs = &(p->cku_outxdr); 1015 xdrs->x_op = XDR_FREE; 1016 return ((*xdr_res)(xdrs, res_ptr)); 1017 } 1018 1019 /*ARGSUSED*/ 1020 static void 1021 clnt_clts_kabort(CLIENT *h) 1022 { 1023 } 1024 1025 static bool_t 1026 clnt_clts_kcontrol(CLIENT *h, int cmd, char *arg) 1027 { 1028 /* LINTED pointer alignment */ 1029 struct cku_private *p = htop(h); 1030 1031 switch (cmd) { 1032 case CLSET_XID: 1033 p->cku_xid = *((uint32_t *)arg); 1034 return (TRUE); 1035 1036 case CLGET_XID: 1037 *((uint32_t *)arg) = p->cku_xid; 1038 return (TRUE); 1039 1040 case CLSET_BCAST: 1041 p->cku_bcast = *((uint32_t *)arg); 1042 return (TRUE); 1043 1044 case CLGET_BCAST: 1045 *((uint32_t *)arg) = p->cku_bcast; 1046 return (TRUE); 1047 1048 default: 1049 return (FALSE); 1050 } 1051 } 1052 1053 /* 1054 * Destroy rpc handle. 1055 * Frees the space used for output buffer, private data, and handle 1056 * structure, and the file pointer/TLI data on last reference. 1057 */ 1058 static void 1059 clnt_clts_kdestroy(CLIENT *h) 1060 { 1061 /* LINTED pointer alignment */ 1062 struct cku_private *p = htop(h); 1063 calllist_t *call = &p->cku_call; 1064 1065 int plen; 1066 1067 RPCLOG(8, "clnt_clts_kdestroy h: %p\n", (void *)h); 1068 RPCLOG(8, "clnt_clts_kdestroy h: xid=0x%x\n", p->cku_xid); 1069 1070 if (p->cku_endpnt != NULL) 1071 endpnt_rele(p->cku_endpnt); 1072 1073 cv_destroy(&call->call_cv); 1074 mutex_destroy(&call->call_lock); 1075 1076 plen = strlen(p->cku_config.knc_protofmly) + 1; 1077 kmem_free(p->cku_config.knc_protofmly, plen); 1078 kmem_free(p->cku_addr.buf, p->cku_addr.maxlen); 1079 kmem_free(p, sizeof (*p)); 1080 } 1081 1082 /* 1083 * The connectionless (CLTS) kRPC endpoint management subsystem. 1084 * 1085 * Because endpoints are potentially shared among threads making RPC calls, 1086 * they are managed in a pool according to type (endpnt_type_t). Each 1087 * endpnt_type_t points to a list of usable endpoints through the e_pool 1088 * field, which is of type list_t. list_t is a doubly-linked list. 1089 * The number of endpoints in the pool is stored in the e_cnt field of 1090 * endpnt_type_t and the endpoints are reference counted using the e_ref field 1091 * in the endpnt_t structure. 1092 * 1093 * As an optimization, endpoints that have no references are also linked 1094 * to an idle list via e_ilist which is also of type list_t. When a thread 1095 * calls endpnt_get() to obtain a transport endpoint, the idle list is first 1096 * consulted and if such an endpoint exists, it is removed from the idle list 1097 * and returned to the caller. 1098 * 1099 * If the idle list is empty, then a check is made to see if more endpoints 1100 * can be created. If so, we proceed and create a new endpoint which is added 1101 * to the pool and returned to the caller. If we have reached the limit and 1102 * cannot make a new endpoint then one is returned to the caller via round- 1103 * robin policy. 1104 * 1105 * When an endpoint is placed on the idle list by a thread calling 1106 * endpnt_rele(), it is timestamped and then a reaper taskq is scheduled to 1107 * be dispatched if one hasn't already been. When the timer fires, the 1108 * taskq traverses the idle list and checks to see which endpoints are 1109 * eligible to be closed. It determines this by checking if the timestamp 1110 * when the endpoint was released has exceeded the the threshold for how long 1111 * it should stay alive. 1112 * 1113 * endpnt_t structures remain persistent until the memory reclaim callback, 1114 * endpnt_reclaim(), is invoked. 1115 * 1116 * Here is an example of how the data structures would be laid out by the 1117 * subsystem: 1118 * 1119 * endpnt_type_t 1120 * 1121 * loopback inet 1122 * _______________ ______________ 1123 * | e_next |----------------------->| e_next |---->> 1124 * | e_pool |<---+ | e_pool |<----+ 1125 * | e_ilist |<---+--+ | e_ilist |<----+--+ 1126 * +->| e_pcurr |----+--+--+ +->| e_pcurr |-----+--+--+ 1127 * | | ... | | | | | | ... | | | | 1128 * | | e_itimer (90) | | | | | | e_itimer (0) | | | | 1129 * | | e_cnt (1) | | | | | | e_cnt (3) | | | | 1130 * | +---------------+ | | | | +--------------+ | | | 1131 * | | | | | | | | 1132 * | endpnt_t | | | | | | | 1133 * | ____________ | | | | ____________ | | | 1134 * | | e_node |<------+ | | | | e_node |<------+ | | 1135 * | | e_idle |<---------+ | | | e_idle | | | | 1136 * +--| e_type |<------------+ +--| e_type | | | | 1137 * | e_tiptr | | | e_tiptr | | | | 1138 * | ... | | | ... | | | | 1139 * | e_lock | | | e_lock | | | | 1140 * | ... | | | ... | | | | 1141 * | e_ref (0) | | | e_ref (2) | | | | 1142 * | e_itime | | | e_itime | | | | 1143 * +------------+ | +------------+ | | | 1144 * | | | | 1145 * | | | | 1146 * | ____________ | | | 1147 * | | e_node |<------+ | | 1148 * | | e_idle |<------+--+ | 1149 * +--| e_type | | | 1150 * | | e_tiptr | | | 1151 * | | ... | | | 1152 * | | e_lock | | | 1153 * | | ... | | | 1154 * | | e_ref (0) | | | 1155 * | | e_itime | | | 1156 * | +------------+ | | 1157 * | | | 1158 * | | | 1159 * | ____________ | | 1160 * | | e_node |<------+ | 1161 * | | e_idle | | 1162 * +--| e_type |<------------+ 1163 * | e_tiptr | 1164 * | ... | 1165 * | e_lock | 1166 * | ... | 1167 * | e_ref (1) | 1168 * | e_itime | 1169 * +------------+ 1170 * 1171 * Endpoint locking strategy: 1172 * 1173 * The following functions manipulate lists which hold the endpoint and the 1174 * endpoints themselves: 1175 * 1176 * endpnt_get()/check_endpnt()/endpnt_rele()/endpnt_reap()/do_endpnt_reclaim() 1177 * 1178 * Lock description follows: 1179 * 1180 * endpnt_type_lock: Global reader/writer lock which protects accesses to the 1181 * endpnt_type_list. 1182 * 1183 * e_plock: Lock defined in the endpnt_type_t. It is intended to 1184 * protect accesses to the pool of endopints (e_pool) for a given 1185 * endpnt_type_t. 1186 * 1187 * e_ilock: Lock defined in endpnt_type_t. It is intended to protect accesses 1188 * to the idle list (e_ilist) of available endpoints for a given 1189 * endpnt_type_t. It also protects access to the e_itimer, e_async_cv, 1190 * and e_async_count fields in endpnt_type_t. 1191 * 1192 * e_lock: Lock defined in the endpnt structure. It is intended to protect 1193 * flags, cv, and ref count. 1194 * 1195 * The order goes as follows so as not to induce deadlock. 1196 * 1197 * endpnt_type_lock -> e_plock -> e_ilock -> e_lock 1198 * 1199 * Interaction with Zones and shutting down: 1200 * 1201 * endpnt_type_ts are uniquely identified by the (e_zoneid, e_rdev, e_protofmly) 1202 * tuple, which means that a zone may not reuse another zone's idle endpoints 1203 * without first doing a t_kclose(). 1204 * 1205 * A zone's endpnt_type_ts are destroyed when a zone is shut down; e_async_cv 1206 * and e_async_count are used to keep track of the threads in endpnt_taskq 1207 * trying to reap endpnt_ts in the endpnt_type_t. 1208 */ 1209 1210 /* 1211 * Allocate and initialize an endpnt_type_t 1212 */ 1213 static struct endpnt_type * 1214 endpnt_type_create(struct knetconfig *config) 1215 { 1216 struct endpnt_type *etype; 1217 1218 /* 1219 * Allocate a new endpoint type to hang a list of 1220 * endpoints off of it. 1221 */ 1222 etype = kmem_alloc(sizeof (struct endpnt_type), KM_SLEEP); 1223 etype->e_next = NULL; 1224 etype->e_pcurr = NULL; 1225 etype->e_itimer = 0; 1226 etype->e_cnt = 0; 1227 1228 (void) strncpy(etype->e_protofmly, config->knc_protofmly, KNC_STRSIZE); 1229 mutex_init(&etype->e_plock, NULL, MUTEX_DEFAULT, NULL); 1230 mutex_init(&etype->e_ilock, NULL, MUTEX_DEFAULT, NULL); 1231 etype->e_rdev = config->knc_rdev; 1232 etype->e_zoneid = getzoneid(); 1233 etype->e_async_count = 0; 1234 cv_init(&etype->e_async_cv, NULL, CV_DEFAULT, NULL); 1235 1236 list_create(&etype->e_pool, sizeof (endpnt_t), 1237 offsetof(endpnt_t, e_node)); 1238 list_create(&etype->e_ilist, sizeof (endpnt_t), 1239 offsetof(endpnt_t, e_idle)); 1240 1241 /* 1242 * Check to see if we need to create a taskq for endpoint 1243 * reaping 1244 */ 1245 mutex_enter(&endpnt_taskq_lock); 1246 if (taskq_created == FALSE) { 1247 taskq_created = TRUE; 1248 mutex_exit(&endpnt_taskq_lock); 1249 ASSERT(endpnt_taskq == NULL); 1250 endpnt_taskq = taskq_create("clts_endpnt_taskq", 1, 1251 minclsyspri, 200, INT_MAX, 0); 1252 } else 1253 mutex_exit(&endpnt_taskq_lock); 1254 1255 return (etype); 1256 } 1257 1258 /* 1259 * Free an endpnt_type_t 1260 */ 1261 static void 1262 endpnt_type_free(struct endpnt_type *etype) 1263 { 1264 mutex_destroy(&etype->e_plock); 1265 mutex_destroy(&etype->e_ilock); 1266 list_destroy(&etype->e_pool); 1267 list_destroy(&etype->e_ilist); 1268 kmem_free(etype, sizeof (endpnt_type_t)); 1269 } 1270 1271 /* 1272 * Check the endpoint to ensure that it is suitable for use. 1273 * 1274 * Possible return values: 1275 * 1276 * return (1) - Endpoint is established, but needs to be re-opened. 1277 * return (0) && *newp == NULL - Endpoint is established, but unusable. 1278 * return (0) && *newp != NULL - Endpoint is established and usable. 1279 */ 1280 static int 1281 check_endpnt(struct endpnt *endp, struct endpnt **newp) 1282 { 1283 *newp = endp; 1284 1285 mutex_enter(&endp->e_lock); 1286 ASSERT(endp->e_ref >= 1); 1287 1288 /* 1289 * The first condition we check for is if the endpoint has been 1290 * allocated, but is unusable either because it has been closed or 1291 * has been marked stale. Only *one* thread will be allowed to 1292 * execute the then clause. This is enforced becuase the first thread 1293 * to check this condition will clear the flags, so that subsequent 1294 * thread(s) checking this endpoint will move on. 1295 */ 1296 if ((endp->e_flags & ENDPNT_ESTABLISHED) && 1297 (!(endp->e_flags & ENDPNT_BOUND) || 1298 (endp->e_flags & ENDPNT_STALE))) { 1299 /* 1300 * Clear the flags here since they will be 1301 * set again by this thread. They need to be 1302 * individually cleared because we want to maintain 1303 * the state for ENDPNT_ONIDLE. 1304 */ 1305 endp->e_flags &= ~(ENDPNT_ESTABLISHED | 1306 ENDPNT_WAITING | ENDPNT_BOUND | ENDPNT_STALE); 1307 mutex_exit(&endp->e_lock); 1308 return (1); 1309 } 1310 1311 /* 1312 * The second condition is meant for any thread that is waiting for 1313 * an endpoint to become established. It will cv_wait() until 1314 * the condition for the endpoint has been changed to ENDPNT_BOUND or 1315 * ENDPNT_STALE. 1316 */ 1317 while (!(endp->e_flags & ENDPNT_BOUND) && 1318 !(endp->e_flags & ENDPNT_STALE)) { 1319 endp->e_flags |= ENDPNT_WAITING; 1320 cv_wait(&endp->e_cv, &endp->e_lock); 1321 } 1322 1323 ASSERT(endp->e_flags & ENDPNT_ESTABLISHED); 1324 1325 /* 1326 * The last case we check for is if the endpoint has been marked stale. 1327 * If this is the case then set *newp to NULL and return, so that the 1328 * caller is notified of the error and can take appropriate action. 1329 */ 1330 if (endp->e_flags & ENDPNT_STALE) { 1331 endp->e_ref--; 1332 *newp = NULL; 1333 } 1334 mutex_exit(&endp->e_lock); 1335 return (0); 1336 } 1337 1338 #ifdef DEBUG 1339 /* 1340 * Provide a fault injection setting to test error conditions. 1341 */ 1342 static int endpnt_get_return_null = 0; 1343 #endif 1344 1345 /* 1346 * Returns a handle (struct endpnt *) to an open and bound endpoint 1347 * specified by the knetconfig passed in. Returns NULL if no valid endpoint 1348 * can be obtained. 1349 */ 1350 static struct endpnt * 1351 endpnt_get(struct knetconfig *config) 1352 { 1353 struct endpnt_type *n_etype = NULL; 1354 struct endpnt_type *np = NULL; 1355 struct endpnt *new = NULL; 1356 struct endpnt *endp = NULL; 1357 struct endpnt *next = NULL; 1358 TIUSER *tiptr = NULL; 1359 int rtries = BINDRESVPORT_RETRIES; 1360 int i = 0; 1361 int error; 1362 int retval; 1363 zoneid_t zoneid = getzoneid(); 1364 1365 RPCLOG(1, "endpnt_get: protofmly %s, ", config->knc_protofmly); 1366 RPCLOG(1, "rdev %ld\n", config->knc_rdev); 1367 1368 #ifdef DEBUG 1369 /* 1370 * Inject fault if desired. Pretend we have a stale endpoint 1371 * and return NULL. 1372 */ 1373 if (endpnt_get_return_null > 0) { 1374 endpnt_get_return_null--; 1375 return (NULL); 1376 } 1377 #endif 1378 rw_enter(&endpnt_type_lock, RW_READER); 1379 1380 top: 1381 for (np = endpnt_type_list; np != NULL; np = np->e_next) 1382 if ((np->e_zoneid == zoneid) && 1383 (np->e_rdev == config->knc_rdev) && 1384 (strcmp(np->e_protofmly, 1385 config->knc_protofmly) == 0)) 1386 break; 1387 1388 if (np == NULL && n_etype != NULL) { 1389 ASSERT(rw_write_held(&endpnt_type_lock)); 1390 1391 /* 1392 * Link the endpoint type onto the list 1393 */ 1394 n_etype->e_next = endpnt_type_list; 1395 endpnt_type_list = n_etype; 1396 np = n_etype; 1397 n_etype = NULL; 1398 } 1399 1400 if (np == NULL) { 1401 /* 1402 * The logic here is that we were unable to find an 1403 * endpnt_type_t that matched our criteria, so we allocate a 1404 * new one. Because kmem_alloc() needs to be called with 1405 * KM_SLEEP, we drop our locks so that we don't induce 1406 * deadlock. After allocating and initializing the 1407 * endpnt_type_t, we reaquire the lock and go back to check 1408 * if this entry needs to be added to the list. Since we do 1409 * some operations without any locking other threads may 1410 * have been looking for the same endpnt_type_t and gone 1411 * through this code path. We check for this case and allow 1412 * one thread to link its endpnt_type_t to the list and the 1413 * other threads will simply free theirs. 1414 */ 1415 rw_exit(&endpnt_type_lock); 1416 n_etype = endpnt_type_create(config); 1417 1418 /* 1419 * We need to reaquire the lock with RW_WRITER here so that 1420 * we can safely link the new endpoint type onto the list. 1421 */ 1422 rw_enter(&endpnt_type_lock, RW_WRITER); 1423 goto top; 1424 } 1425 1426 rw_exit(&endpnt_type_lock); 1427 /* 1428 * If n_etype is not NULL, then another thread was able to 1429 * insert an endpnt_type_t of this type onto the list before 1430 * we did. Go ahead and free ours. 1431 */ 1432 if (n_etype != NULL) 1433 endpnt_type_free(n_etype); 1434 1435 mutex_enter(&np->e_ilock); 1436 /* 1437 * The algorithm to hand out endpoints is to first 1438 * give out those that are idle if such endpoints 1439 * exist. Otherwise, create a new one if we haven't 1440 * reached the max threshold. Finally, we give out 1441 * endpoints in a pseudo LRU fashion (round-robin). 1442 * 1443 * Note: The idle list is merely a hint of those endpoints 1444 * that should be idle. There exists a window after the 1445 * endpoint is released and before it is linked back onto the 1446 * idle list where a thread could get a reference to it and 1447 * use it. This is okay, since the reference counts will 1448 * still be consistent. 1449 */ 1450 if ((endp = (endpnt_t *)list_head(&np->e_ilist)) != NULL) { 1451 timeout_id_t t_id = 0; 1452 1453 mutex_enter(&endp->e_lock); 1454 endp->e_ref++; 1455 endp->e_itime = 0; 1456 endp->e_flags &= ~ENDPNT_ONIDLE; 1457 mutex_exit(&endp->e_lock); 1458 1459 /* 1460 * Pop the endpoint off the idle list and hand it off 1461 */ 1462 list_remove(&np->e_ilist, endp); 1463 1464 if (np->e_itimer != 0) { 1465 t_id = np->e_itimer; 1466 np->e_itimer = 0; 1467 } 1468 mutex_exit(&np->e_ilock); 1469 /* 1470 * Reset the idle timer if it has been set 1471 */ 1472 if (t_id != (timeout_id_t)0) 1473 (void) untimeout(t_id); 1474 1475 if (check_endpnt(endp, &new) == 0) 1476 return (new); 1477 } else if (np->e_cnt >= clnt_clts_max_endpoints) { 1478 /* 1479 * There are no idle endpoints currently, so 1480 * create a new one if we have not reached the maximum or 1481 * hand one out in round-robin. 1482 */ 1483 mutex_exit(&np->e_ilock); 1484 mutex_enter(&np->e_plock); 1485 endp = np->e_pcurr; 1486 mutex_enter(&endp->e_lock); 1487 endp->e_ref++; 1488 mutex_exit(&endp->e_lock); 1489 1490 ASSERT(endp != NULL); 1491 /* 1492 * Advance the pointer to the next eligible endpoint, if 1493 * necessary. 1494 */ 1495 if (np->e_cnt > 1) { 1496 next = (endpnt_t *)list_next(&np->e_pool, np->e_pcurr); 1497 if (next == NULL) 1498 next = (endpnt_t *)list_head(&np->e_pool); 1499 np->e_pcurr = next; 1500 } 1501 1502 mutex_exit(&np->e_plock); 1503 1504 /* 1505 * We need to check to see if this endpoint is bound or 1506 * not. If it is in progress then just wait until 1507 * the set up is complete 1508 */ 1509 if (check_endpnt(endp, &new) == 0) 1510 return (new); 1511 } else { 1512 mutex_exit(&np->e_ilock); 1513 mutex_enter(&np->e_plock); 1514 1515 /* 1516 * Allocate a new endpoint to use. If we can't allocate any 1517 * more memory then use one that is already established if any 1518 * such endpoints exist. 1519 */ 1520 new = kmem_cache_alloc(endpnt_cache, KM_NOSLEEP); 1521 if (new == NULL) { 1522 RPCLOG0(1, "endpnt_get: kmem_cache_alloc failed\n"); 1523 /* 1524 * Try to recover by using an existing endpoint. 1525 */ 1526 if (np->e_cnt <= 0) { 1527 mutex_exit(&np->e_plock); 1528 return (NULL); 1529 } 1530 endp = np->e_pcurr; 1531 if ((next = list_next(&np->e_pool, np->e_pcurr)) != 1532 NULL) 1533 np->e_pcurr = next; 1534 ASSERT(endp != NULL); 1535 mutex_enter(&endp->e_lock); 1536 endp->e_ref++; 1537 mutex_exit(&endp->e_lock); 1538 mutex_exit(&np->e_plock); 1539 1540 if (check_endpnt(endp, &new) == 0) 1541 return (new); 1542 } else { 1543 /* 1544 * Partially init an endpoint structure and put 1545 * it on the list, so that other interested threads 1546 * know that one is being created 1547 */ 1548 bzero(new, sizeof (struct endpnt)); 1549 1550 cv_init(&new->e_cv, NULL, CV_DEFAULT, NULL); 1551 mutex_init(&new->e_lock, NULL, MUTEX_DEFAULT, NULL); 1552 new->e_ref = 1; 1553 new->e_type = np; 1554 1555 /* 1556 * Link the endpoint into the pool. 1557 */ 1558 list_insert_head(&np->e_pool, new); 1559 np->e_cnt++; 1560 if (np->e_pcurr == NULL) 1561 np->e_pcurr = new; 1562 mutex_exit(&np->e_plock); 1563 } 1564 } 1565 1566 /* 1567 * The transport should be opened with sufficient privs 1568 */ 1569 error = t_kopen(NULL, config->knc_rdev, FREAD|FWRITE|FNDELAY, &tiptr, 1570 kcred); 1571 if (error) { 1572 RPCLOG(1, "endpnt_get: t_kopen: %d\n", error); 1573 goto bad; 1574 } 1575 1576 new->e_tiptr = tiptr; 1577 rpc_poptimod(tiptr->fp->f_vnode); 1578 1579 /* 1580 * Allow the kernel to push the module on behalf of the user. 1581 */ 1582 error = strioctl(tiptr->fp->f_vnode, I_PUSH, (intptr_t)"rpcmod", 0, 1583 K_TO_K, kcred, &retval); 1584 if (error) { 1585 RPCLOG(1, "endpnt_get: kstr_push on rpcmod failed %d\n", error); 1586 goto bad; 1587 } 1588 1589 error = strioctl(tiptr->fp->f_vnode, RPC_CLIENT, 0, 0, K_TO_K, 1590 kcred, &retval); 1591 if (error) { 1592 RPCLOG(1, "endpnt_get: strioctl failed %d\n", error); 1593 goto bad; 1594 } 1595 1596 /* 1597 * Connectionless data flow should bypass the stream head. 1598 */ 1599 new->e_wq = tiptr->fp->f_vnode->v_stream->sd_wrq->q_next; 1600 1601 error = strioctl(tiptr->fp->f_vnode, I_PUSH, (intptr_t)"timod", 0, 1602 K_TO_K, kcred, &retval); 1603 if (error) { 1604 RPCLOG(1, "endpnt_get: kstr_push on timod failed %d\n", error); 1605 goto bad; 1606 } 1607 1608 /* 1609 * Attempt to bind the endpoint. If we fail then propogate 1610 * error back to calling subsystem, so that it can be handled 1611 * appropriately. 1612 */ 1613 if (clnt_clts_do_bindresvport && 1614 (strcmp(config->knc_protofmly, NC_INET) == 0 || 1615 strcmp(config->knc_protofmly, NC_INET6) == 0)) { 1616 1617 while ((error = 1618 bindresvport(new->e_tiptr, NULL, NULL, FALSE)) != 0) { 1619 RPCLOG(1, 1620 "endpnt_get: bindresvport error %d\n", 1621 error); 1622 if (error != EPROTO) { 1623 if (rtries-- <= 0) 1624 goto bad; 1625 1626 delay(hz << i++); 1627 continue; 1628 } 1629 1630 (void) t_kclose(new->e_tiptr, 1); 1631 /* 1632 * reopen with all privileges 1633 */ 1634 error = t_kopen(NULL, config->knc_rdev, 1635 FREAD|FWRITE|FNDELAY, 1636 &new->e_tiptr, kcred); 1637 if (error) { 1638 RPCLOG(1, "endpnt_get: t_kopen: %d\n", error); 1639 new->e_tiptr = NULL; 1640 goto bad; 1641 } 1642 } 1643 } else if ((error = t_kbind(new->e_tiptr, NULL, NULL)) != 0) { 1644 RPCLOG(1, "endpnt_get: t_kbind failed: %d\n", error); 1645 goto bad; 1646 } 1647 1648 /* 1649 * Set the flags and notify and waiters that we have an established 1650 * endpoint. 1651 */ 1652 mutex_enter(&new->e_lock); 1653 new->e_flags |= ENDPNT_ESTABLISHED; 1654 new->e_flags |= ENDPNT_BOUND; 1655 if (new->e_flags & ENDPNT_WAITING) { 1656 cv_broadcast(&new->e_cv); 1657 new->e_flags &= ~ENDPNT_WAITING; 1658 } 1659 mutex_exit(&new->e_lock); 1660 1661 return (new); 1662 1663 bad: 1664 ASSERT(new != NULL); 1665 /* 1666 * mark this endpoint as stale and notify any threads waiting 1667 * on this endpoint that it will be going away. 1668 */ 1669 mutex_enter(&new->e_lock); 1670 if (new->e_ref > 0) { 1671 new->e_flags |= ENDPNT_ESTABLISHED; 1672 new->e_flags |= ENDPNT_STALE; 1673 if (new->e_flags & ENDPNT_WAITING) { 1674 cv_broadcast(&new->e_cv); 1675 new->e_flags &= ~ENDPNT_WAITING; 1676 } 1677 } 1678 new->e_ref--; 1679 new->e_tiptr = NULL; 1680 mutex_exit(&new->e_lock); 1681 1682 /* 1683 * If there was a transport endopoint opened, then close it. 1684 */ 1685 if (tiptr != NULL) 1686 (void) t_kclose(tiptr, 1); 1687 1688 return (NULL); 1689 } 1690 1691 /* 1692 * Release a referece to the endpoint 1693 */ 1694 static void 1695 endpnt_rele(struct endpnt *sp) 1696 { 1697 mutex_enter(&sp->e_lock); 1698 ASSERT(sp->e_ref > 0); 1699 sp->e_ref--; 1700 /* 1701 * If the ref count is zero, then start the idle timer and link 1702 * the endpoint onto the idle list. 1703 */ 1704 if (sp->e_ref == 0) { 1705 sp->e_itime = gethrestime_sec(); 1706 1707 /* 1708 * Check to see if the endpoint is already linked to the idle 1709 * list, so that we don't try to reinsert it. 1710 */ 1711 if (sp->e_flags & ENDPNT_ONIDLE) { 1712 mutex_exit(&sp->e_lock); 1713 mutex_enter(&sp->e_type->e_ilock); 1714 endpnt_reap_settimer(sp->e_type); 1715 mutex_exit(&sp->e_type->e_ilock); 1716 return; 1717 } 1718 1719 sp->e_flags |= ENDPNT_ONIDLE; 1720 mutex_exit(&sp->e_lock); 1721 mutex_enter(&sp->e_type->e_ilock); 1722 list_insert_tail(&sp->e_type->e_ilist, sp); 1723 endpnt_reap_settimer(sp->e_type); 1724 mutex_exit(&sp->e_type->e_ilock); 1725 } else 1726 mutex_exit(&sp->e_lock); 1727 } 1728 1729 static void 1730 endpnt_reap_settimer(endpnt_type_t *etp) 1731 { 1732 if (etp->e_itimer == (timeout_id_t)0) 1733 etp->e_itimer = timeout(endpnt_reap_dispatch, (void *)etp, 1734 clnt_clts_taskq_dispatch_interval); 1735 } 1736 1737 static void 1738 endpnt_reap_dispatch(void *a) 1739 { 1740 endpnt_type_t *etp = a; 1741 1742 /* 1743 * The idle timer has fired, so dispatch the taskq to close the 1744 * endpoint. 1745 */ 1746 if (taskq_dispatch(endpnt_taskq, (task_func_t *)endpnt_reap, etp, 1747 TQ_NOSLEEP) == NULL) 1748 return; 1749 mutex_enter(&etp->e_ilock); 1750 etp->e_async_count++; 1751 mutex_exit(&etp->e_ilock); 1752 } 1753 1754 /* 1755 * Traverse the idle list and close those endpoints that have reached their 1756 * timeout interval. 1757 */ 1758 static void 1759 endpnt_reap(endpnt_type_t *etp) 1760 { 1761 struct endpnt *e; 1762 struct endpnt *next_node = NULL; 1763 1764 mutex_enter(&etp->e_ilock); 1765 e = list_head(&etp->e_ilist); 1766 while (e != NULL) { 1767 next_node = list_next(&etp->e_ilist, e); 1768 1769 mutex_enter(&e->e_lock); 1770 if (e->e_ref > 0) { 1771 mutex_exit(&e->e_lock); 1772 e = next_node; 1773 continue; 1774 } 1775 1776 ASSERT(e->e_ref == 0); 1777 if (e->e_itime > 0 && 1778 (e->e_itime + clnt_clts_endpoint_reap_interval) < 1779 gethrestime_sec()) { 1780 e->e_flags &= ~ENDPNT_BOUND; 1781 (void) t_kclose(e->e_tiptr, 1); 1782 e->e_tiptr = NULL; 1783 e->e_itime = 0; 1784 } 1785 mutex_exit(&e->e_lock); 1786 e = next_node; 1787 } 1788 etp->e_itimer = 0; 1789 if (--etp->e_async_count == 0) 1790 cv_signal(&etp->e_async_cv); 1791 mutex_exit(&etp->e_ilock); 1792 } 1793 1794 static void 1795 endpnt_reclaim(zoneid_t zoneid) 1796 { 1797 struct endpnt_type *np; 1798 struct endpnt *e; 1799 struct endpnt *next_node = NULL; 1800 list_t free_list; 1801 int rcnt = 0; 1802 1803 list_create(&free_list, sizeof (endpnt_t), offsetof(endpnt_t, e_node)); 1804 1805 RPCLOG0(1, "endpnt_reclaim: reclaim callback started\n"); 1806 rw_enter(&endpnt_type_lock, RW_READER); 1807 for (np = endpnt_type_list; np != NULL; np = np->e_next) { 1808 if (zoneid != ALL_ZONES && zoneid != np->e_zoneid) 1809 continue; 1810 1811 mutex_enter(&np->e_plock); 1812 RPCLOG(1, "endpnt_reclaim: protofmly %s, ", 1813 np->e_protofmly); 1814 RPCLOG(1, "rdev %ld\n", np->e_rdev); 1815 RPCLOG(1, "endpnt_reclaim: found %d endpoint(s)\n", 1816 np->e_cnt); 1817 1818 if (np->e_cnt == 0) { 1819 mutex_exit(&np->e_plock); 1820 continue; 1821 } 1822 1823 /* 1824 * The nice thing about maintaining an idle list is that if 1825 * there are any endpoints to reclaim, they are going to be 1826 * on this list. Just go through and reap the one's that 1827 * have ref counts of zero. 1828 */ 1829 mutex_enter(&np->e_ilock); 1830 e = list_head(&np->e_ilist); 1831 while (e != NULL) { 1832 next_node = list_next(&np->e_ilist, e); 1833 mutex_enter(&e->e_lock); 1834 if (e->e_ref > 0) { 1835 mutex_exit(&e->e_lock); 1836 e = next_node; 1837 continue; 1838 } 1839 ASSERT(e->e_ref == 0); 1840 mutex_exit(&e->e_lock); 1841 1842 list_remove(&np->e_ilist, e); 1843 list_remove(&np->e_pool, e); 1844 list_insert_head(&free_list, e); 1845 1846 rcnt++; 1847 np->e_cnt--; 1848 e = next_node; 1849 } 1850 mutex_exit(&np->e_ilock); 1851 /* 1852 * Reset the current pointer to be safe 1853 */ 1854 if ((e = (struct endpnt *)list_head(&np->e_pool)) != NULL) 1855 np->e_pcurr = e; 1856 else { 1857 ASSERT(np->e_cnt == 0); 1858 np->e_pcurr = NULL; 1859 } 1860 1861 mutex_exit(&np->e_plock); 1862 } 1863 rw_exit(&endpnt_type_lock); 1864 1865 while ((e = list_head(&free_list)) != NULL) { 1866 list_remove(&free_list, e); 1867 if (e->e_tiptr != NULL) 1868 (void) t_kclose(e->e_tiptr, 1); 1869 1870 cv_destroy(&e->e_cv); 1871 mutex_destroy(&e->e_lock); 1872 kmem_cache_free(endpnt_cache, e); 1873 } 1874 list_destroy(&free_list); 1875 RPCLOG(1, "endpnt_reclaim: reclaimed %d endpoint(s)\n", rcnt); 1876 } 1877 1878 /* 1879 * Endpoint reclaim zones destructor callback routine. 1880 * 1881 * After reclaiming any cached entries, we basically go through the endpnt_type 1882 * list, canceling outstanding timeouts and free'ing data structures. 1883 */ 1884 /* ARGSUSED */ 1885 static void 1886 endpnt_destructor(zoneid_t zoneid, void *a) 1887 { 1888 struct endpnt_type **npp; 1889 struct endpnt_type *np; 1890 struct endpnt_type *free_list = NULL; 1891 timeout_id_t t_id = 0; 1892 extern void clcleanup_zone(zoneid_t); 1893 extern void clcleanup4_zone(zoneid_t); 1894 1895 /* Make sure NFS client handles are released. */ 1896 clcleanup_zone(zoneid); 1897 clcleanup4_zone(zoneid); 1898 1899 endpnt_reclaim(zoneid); 1900 /* 1901 * We don't need to be holding on to any locks across the call to 1902 * endpnt_reclaim() and the code below; we know that no-one can 1903 * be holding open connections for this zone (all processes and kernel 1904 * threads are gone), so nothing could be adding anything to the list. 1905 */ 1906 rw_enter(&endpnt_type_lock, RW_WRITER); 1907 npp = &endpnt_type_list; 1908 while ((np = *npp) != NULL) { 1909 if (np->e_zoneid != zoneid) { 1910 npp = &np->e_next; 1911 continue; 1912 } 1913 mutex_enter(&np->e_plock); 1914 mutex_enter(&np->e_ilock); 1915 if (np->e_itimer != 0) { 1916 t_id = np->e_itimer; 1917 np->e_itimer = 0; 1918 } 1919 ASSERT(np->e_cnt == 0); 1920 ASSERT(list_head(&np->e_pool) == NULL); 1921 ASSERT(list_head(&np->e_ilist) == NULL); 1922 1923 mutex_exit(&np->e_ilock); 1924 mutex_exit(&np->e_plock); 1925 1926 /* 1927 * untimeout() any outstanding timers that have not yet fired. 1928 */ 1929 if (t_id != (timeout_id_t)0) 1930 (void) untimeout(t_id); 1931 *npp = np->e_next; 1932 np->e_next = free_list; 1933 free_list = np; 1934 } 1935 rw_exit(&endpnt_type_lock); 1936 1937 while (free_list != NULL) { 1938 np = free_list; 1939 free_list = free_list->e_next; 1940 /* 1941 * Wait for threads in endpnt_taskq trying to reap endpnt_ts in 1942 * the endpnt_type_t. 1943 */ 1944 mutex_enter(&np->e_ilock); 1945 while (np->e_async_count > 0) 1946 cv_wait(&np->e_async_cv, &np->e_ilock); 1947 cv_destroy(&np->e_async_cv); 1948 mutex_destroy(&np->e_plock); 1949 mutex_destroy(&np->e_ilock); 1950 list_destroy(&np->e_pool); 1951 list_destroy(&np->e_ilist); 1952 kmem_free(np, sizeof (endpnt_type_t)); 1953 } 1954 } 1955 1956 /* 1957 * Endpoint reclaim kmem callback routine. 1958 */ 1959 /* ARGSUSED */ 1960 static void 1961 endpnt_repossess(void *a) 1962 { 1963 /* 1964 * Reclaim idle endpnt's from all zones. 1965 */ 1966 if (endpnt_taskq != NULL) 1967 (void) taskq_dispatch(endpnt_taskq, 1968 (task_func_t *)endpnt_reclaim, (void *)ALL_ZONES, 1969 TQ_NOSLEEP); 1970 } 1971 1972 /* 1973 * RPC request dispatch routine. Constructs a datagram message and wraps it 1974 * around the RPC request to pass downstream. 1975 */ 1976 static int 1977 clnt_clts_dispatch_send(queue_t *q, mblk_t *mp, struct netbuf *addr, 1978 calllist_t *cp, uint_t xid) 1979 { 1980 mblk_t *bp; 1981 int msgsz; 1982 struct T_unitdata_req *udreq; 1983 1984 /* 1985 * Set up the call record. 1986 */ 1987 cp->call_wq = q; 1988 cp->call_xid = xid; 1989 cp->call_status = RPC_TIMEDOUT; 1990 cp->call_notified = FALSE; 1991 RPCLOG(64, 1992 "clnt_clts_dispatch_send: putting xid 0x%x on " 1993 "dispatch list\n", xid); 1994 cp->call_hash = call_hash(xid, clnt_clts_hash_size); 1995 cp->call_bucket = &clts_call_ht[cp->call_hash]; 1996 call_table_enter(cp); 1997 1998 /* 1999 * Construct the datagram 2000 */ 2001 msgsz = (int)TUNITDATAREQSZ; 2002 while (!(bp = allocb(msgsz + addr->len, BPRI_LO))) { 2003 if (strwaitbuf(msgsz + addr->len, BPRI_LO)) 2004 return (ENOSR); 2005 } 2006 2007 udreq = (struct T_unitdata_req *)bp->b_wptr; 2008 udreq->PRIM_type = T_UNITDATA_REQ; 2009 udreq->DEST_length = addr->len; 2010 2011 if (addr->len) { 2012 bcopy(addr->buf, bp->b_wptr + msgsz, addr->len); 2013 udreq->DEST_offset = (t_scalar_t)msgsz; 2014 msgsz += addr->len; 2015 } else 2016 udreq->DEST_offset = 0; 2017 udreq->OPT_length = 0; 2018 udreq->OPT_offset = 0; 2019 2020 bp->b_datap->db_type = M_PROTO; 2021 bp->b_wptr += msgsz; 2022 2023 /* 2024 * Link the datagram header with the actual data 2025 */ 2026 linkb(bp, mp); 2027 2028 /* 2029 * Send downstream. 2030 */ 2031 put(cp->call_wq, bp); 2032 2033 return (0); 2034 } 2035 2036 /* 2037 * RPC response delivery routine. Deliver the response to the waiting 2038 * thread by matching the xid. 2039 */ 2040 void 2041 clnt_clts_dispatch_notify(mblk_t *mp, int resp_off, zoneid_t zoneid) 2042 { 2043 calllist_t *e = NULL; 2044 call_table_t *chtp; 2045 uint32_t xid; 2046 uint_t hash; 2047 unsigned char *hdr_offset; 2048 mblk_t *resp; 2049 2050 /* 2051 * If the RPC response is not contained in the same mblk as the 2052 * datagram header, then move to the next mblk. 2053 */ 2054 hdr_offset = mp->b_rptr; 2055 resp = mp; 2056 if ((mp->b_wptr - (mp->b_rptr + resp_off)) == 0) 2057 resp = mp->b_cont; 2058 else 2059 resp->b_rptr += resp_off; 2060 2061 ASSERT(resp != NULL); 2062 2063 if ((IS_P2ALIGNED(resp->b_rptr, sizeof (uint32_t))) && 2064 (resp->b_wptr - resp->b_rptr) >= sizeof (xid)) 2065 xid = *((uint32_t *)resp->b_rptr); 2066 else { 2067 int i = 0; 2068 unsigned char *p = (unsigned char *)&xid; 2069 unsigned char *rptr; 2070 mblk_t *tmp = resp; 2071 2072 /* 2073 * Copy the xid, byte-by-byte into xid. 2074 */ 2075 while (tmp) { 2076 rptr = tmp->b_rptr; 2077 while (rptr < tmp->b_wptr) { 2078 *p++ = *rptr++; 2079 if (++i >= sizeof (xid)) 2080 goto done_xid_copy; 2081 } 2082 tmp = tmp->b_cont; 2083 } 2084 2085 /* 2086 * If we got here, we ran out of mblk space before the 2087 * xid could be copied. 2088 */ 2089 ASSERT(tmp == NULL && i < sizeof (xid)); 2090 2091 RPCLOG0(1, 2092 "clnt_dispatch_notify(clts): message less than " 2093 "size of xid\n"); 2094 2095 freemsg(mp); 2096 return; 2097 } 2098 2099 done_xid_copy: 2100 2101 /* 2102 * Reset the read pointer back to the beginning of the protocol 2103 * header if we moved it. 2104 */ 2105 if (mp->b_rptr != hdr_offset) 2106 mp->b_rptr = hdr_offset; 2107 2108 hash = call_hash(xid, clnt_clts_hash_size); 2109 chtp = &clts_call_ht[hash]; 2110 /* call_table_find returns with the hash bucket locked */ 2111 call_table_find(chtp, xid, e); 2112 2113 if (e != NULL) { 2114 mutex_enter(&e->call_lock); 2115 /* 2116 * found thread waiting for this reply. 2117 */ 2118 if (e->call_reply) { 2119 RPCLOG(8, 2120 "clnt_dispatch_notify (clts): discarding old " 2121 "reply for xid 0x%x\n", 2122 xid); 2123 freemsg(e->call_reply); 2124 } 2125 e->call_notified = TRUE; 2126 e->call_reply = mp; 2127 e->call_status = RPC_SUCCESS; 2128 cv_signal(&e->call_cv); 2129 mutex_exit(&e->call_lock); 2130 mutex_exit(&chtp->ct_lock); 2131 } else { 2132 zone_t *zone; 2133 struct rpcstat *rpcstat; 2134 2135 mutex_exit(&chtp->ct_lock); 2136 RPCLOG(8, "clnt_dispatch_notify (clts): no caller for reply " 2137 "0x%x\n", xid); 2138 freemsg(mp); 2139 /* 2140 * This is unfortunate, but we need to lookup the zone so we 2141 * can increment its "rcbadxids" counter. 2142 */ 2143 zone = zone_find_by_id(zoneid); 2144 if (zone == NULL) { 2145 /* 2146 * The zone went away... 2147 */ 2148 return; 2149 } 2150 rpcstat = zone_getspecific(rpcstat_zone_key, zone); 2151 if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) { 2152 /* 2153 * Not interested 2154 */ 2155 zone_rele(zone); 2156 return; 2157 } 2158 RCSTAT_INCR(rpcstat->rpc_clts_client, rcbadxids); 2159 zone_rele(zone); 2160 } 2161 } 2162 2163 /* 2164 * Init routine. Called when rpcmod is loaded. 2165 */ 2166 void 2167 clnt_clts_init(void) 2168 { 2169 endpnt_cache = kmem_cache_create("clnt_clts_endpnt_cache", 2170 sizeof (struct endpnt), 0, NULL, NULL, endpnt_repossess, NULL, 2171 NULL, 0); 2172 2173 rw_init(&endpnt_type_lock, NULL, RW_DEFAULT, NULL); 2174 2175 /* 2176 * Perform simple bounds checking to make sure that the setting is 2177 * reasonable 2178 */ 2179 if (clnt_clts_max_endpoints <= 0) { 2180 if (clnt_clts_do_bindresvport) 2181 clnt_clts_max_endpoints = RESERVED_PORTSPACE; 2182 else 2183 clnt_clts_max_endpoints = NONRESERVED_PORTSPACE; 2184 } 2185 2186 if (clnt_clts_do_bindresvport && 2187 clnt_clts_max_endpoints > RESERVED_PORTSPACE) 2188 clnt_clts_max_endpoints = RESERVED_PORTSPACE; 2189 else if (clnt_clts_max_endpoints > NONRESERVED_PORTSPACE) 2190 clnt_clts_max_endpoints = NONRESERVED_PORTSPACE; 2191 2192 if (clnt_clts_hash_size < DEFAULT_MIN_HASH_SIZE) 2193 clnt_clts_hash_size = DEFAULT_MIN_HASH_SIZE; 2194 2195 /* 2196 * Defer creating the taskq until rpcmod gets pushed. If we are 2197 * in diskless boot mode, rpcmod will get loaded early even before 2198 * thread_create() is available. 2199 */ 2200 endpnt_taskq = NULL; 2201 taskq_created = FALSE; 2202 mutex_init(&endpnt_taskq_lock, NULL, MUTEX_DEFAULT, NULL); 2203 2204 if (clnt_clts_endpoint_reap_interval < DEFAULT_ENDPOINT_REAP_INTERVAL) 2205 clnt_clts_endpoint_reap_interval = 2206 DEFAULT_ENDPOINT_REAP_INTERVAL; 2207 2208 /* 2209 * Dispatch the taskq at an interval which is offset from the 2210 * interval that the endpoints should be reaped. 2211 */ 2212 clnt_clts_taskq_dispatch_interval = 2213 (clnt_clts_endpoint_reap_interval + DEFAULT_INTERVAL_SHIFT) 2214 * hz; 2215 2216 /* 2217 * Initialize the completion queue 2218 */ 2219 clts_call_ht = call_table_init(clnt_clts_hash_size); 2220 /* 2221 * Initialize the zone destructor callback. 2222 */ 2223 zone_key_create(&endpnt_destructor_key, NULL, NULL, endpnt_destructor); 2224 } 2225 2226 void 2227 clnt_clts_fini(void) 2228 { 2229 (void) zone_key_delete(endpnt_destructor_key); 2230 } 2231