1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T 28 * All Rights Reserved 29 */ 30 31 /* 32 * Portions of this source code were derived from Berkeley 4.3 BSD 33 * under license from the Regents of the University of California. 34 */ 35 36 37 /* 38 * Implements a kernel based, client side RPC. 39 */ 40 41 #include <sys/param.h> 42 #include <sys/types.h> 43 #include <sys/systm.h> 44 #include <sys/sysmacros.h> 45 #include <sys/stream.h> 46 #include <sys/strsubr.h> 47 #include <sys/ddi.h> 48 #include <sys/tiuser.h> 49 #include <sys/tihdr.h> 50 #include <sys/t_kuser.h> 51 #include <sys/errno.h> 52 #include <sys/kmem.h> 53 #include <sys/debug.h> 54 #include <sys/kstat.h> 55 #include <sys/t_lock.h> 56 #include <sys/cmn_err.h> 57 #include <sys/conf.h> 58 #include <sys/disp.h> 59 #include <sys/taskq.h> 60 #include <sys/list.h> 61 #include <sys/atomic.h> 62 #include <sys/zone.h> 63 #include <netinet/in.h> 64 #include <rpc/types.h> 65 #include <rpc/xdr.h> 66 #include <rpc/auth.h> 67 #include <rpc/clnt.h> 68 #include <rpc/rpc_msg.h> 69 70 #include <sys/sdt.h> 71 72 static enum clnt_stat clnt_clts_kcallit(CLIENT *, rpcproc_t, xdrproc_t, 73 caddr_t, xdrproc_t, caddr_t, struct timeval); 74 static void clnt_clts_kabort(CLIENT *); 75 static void clnt_clts_kerror(CLIENT *, struct rpc_err *); 76 static bool_t clnt_clts_kfreeres(CLIENT *, xdrproc_t, caddr_t); 77 static bool_t clnt_clts_kcontrol(CLIENT *, int, char *); 78 static void clnt_clts_kdestroy(CLIENT *); 79 static int clnt_clts_ksettimers(CLIENT *, struct rpc_timers *, 80 struct rpc_timers *, int, void (*)(), caddr_t, uint32_t); 81 82 /* 83 * Operations vector for CLTS based RPC 84 */ 85 static struct clnt_ops clts_ops = { 86 clnt_clts_kcallit, /* do rpc call */ 87 clnt_clts_kabort, /* abort call */ 88 clnt_clts_kerror, /* return error status */ 89 clnt_clts_kfreeres, /* free results */ 90 clnt_clts_kdestroy, /* destroy rpc handle */ 91 clnt_clts_kcontrol, /* the ioctl() of rpc */ 92 clnt_clts_ksettimers /* set retry timers */ 93 }; 94 95 /* 96 * Endpoint for CLTS (INET, INET6, loopback, etc.) 
97 */ 98 typedef struct endpnt_type { 99 struct endpnt_type *e_next; /* pointer to next endpoint type */ 100 list_t e_pool; /* list of available endpoints */ 101 list_t e_ilist; /* list of idle endpoints */ 102 struct endpnt *e_pcurr; /* pointer to current endpoint */ 103 char e_protofmly[KNC_STRSIZE]; /* protocol family */ 104 dev_t e_rdev; /* device */ 105 kmutex_t e_plock; /* pool lock */ 106 kmutex_t e_ilock; /* idle list lock */ 107 timeout_id_t e_itimer; /* timer to dispatch the taskq */ 108 uint_t e_cnt; /* number of endpoints in the pool */ 109 zoneid_t e_zoneid; /* zoneid of endpoint type */ 110 kcondvar_t e_async_cv; /* cv for asynchronous reap threads */ 111 uint_t e_async_count; /* count of asynchronous reap threads */ 112 } endpnt_type_t; 113 114 typedef struct endpnt { 115 list_node_t e_node; /* link to the pool */ 116 list_node_t e_idle; /* link to the idle list */ 117 endpnt_type_t *e_type; /* back pointer to endpoint type */ 118 TIUSER *e_tiptr; /* pointer to transport endpoint */ 119 queue_t *e_wq; /* write queue */ 120 uint_t e_flags; /* endpoint flags */ 121 uint_t e_ref; /* ref count on endpoint */ 122 kcondvar_t e_cv; /* condition variable */ 123 kmutex_t e_lock; /* protects cv and flags */ 124 time_t e_itime; /* time when rele'd */ 125 } endpnt_t; 126 127 #define ENDPNT_ESTABLISHED 0x1 /* endpoint is established */ 128 #define ENDPNT_WAITING 0x2 /* thread waiting for endpoint */ 129 #define ENDPNT_BOUND 0x4 /* endpoint is bound */ 130 #define ENDPNT_STALE 0x8 /* endpoint is dead */ 131 #define ENDPNT_ONIDLE 0x10 /* endpoint is on the idle list */ 132 133 static krwlock_t endpnt_type_lock; /* protects endpnt_type_list */ 134 static endpnt_type_t *endpnt_type_list = NULL; /* list of CLTS endpoints */ 135 static struct kmem_cache *endpnt_cache; /* cache of endpnt_t's */ 136 static taskq_t *endpnt_taskq; /* endpnt_t reaper thread */ 137 static bool_t taskq_created; /* flag for endpnt_taskq */ 138 static kmutex_t endpnt_taskq_lock; /* taskq lock */ 139 static zone_key_t endpnt_destructor_key; 140 141 #define DEFAULT_ENDPOINT_REAP_INTERVAL 60 /* 1 minute */ 142 #define DEFAULT_INTERVAL_SHIFT 30 /* 30 seconds */ 143 144 /* 145 * Endpoint tunables 146 */ 147 static int clnt_clts_max_endpoints = -1; 148 static int clnt_clts_hash_size = DEFAULT_HASH_SIZE; 149 static time_t clnt_clts_endpoint_reap_interval = -1; 150 static clock_t clnt_clts_taskq_dispatch_interval; 151 152 /* 153 * Response completion hash queue 154 */ 155 static call_table_t *clts_call_ht; 156 157 /* 158 * Routines for the endpoint manager 159 */ 160 static struct endpnt_type *endpnt_type_create(struct knetconfig *); 161 static void endpnt_type_free(struct endpnt_type *); 162 static int check_endpnt(struct endpnt *, struct endpnt **); 163 static struct endpnt *endpnt_get(struct knetconfig *, int); 164 static void endpnt_rele(struct endpnt *); 165 static void endpnt_reap_settimer(endpnt_type_t *); 166 static void endpnt_reap(endpnt_type_t *); 167 static void endpnt_reap_dispatch(void *); 168 static void endpnt_reclaim(zoneid_t); 169 170 171 /* 172 * Request dipatching function. 173 */ 174 static int clnt_clts_dispatch_send(queue_t *q, mblk_t *, struct netbuf *addr, 175 calllist_t *, uint_t, cred_t *); 176 177 /* 178 * The size of the preserialized RPC header information. 179 */ 180 #define CKU_HDRSIZE 20 181 /* 182 * The initial allocation size. It is small to reduce space requirements. 183 */ 184 #define CKU_INITSIZE 2048 185 /* 186 * The size of additional allocations, if required. 
It is larger to 187 * reduce the number of actual allocations. 188 */ 189 #define CKU_ALLOCSIZE 8192 190 191 /* 192 * Private data per rpc handle. This structure is allocated by 193 * clnt_clts_kcreate, and freed by clnt_clts_kdestroy. 194 */ 195 struct cku_private { 196 CLIENT cku_client; /* client handle */ 197 int cku_retrys; /* request retrys */ 198 calllist_t cku_call; 199 struct endpnt *cku_endpnt; /* open end point */ 200 struct knetconfig cku_config; 201 struct netbuf cku_addr; /* remote address */ 202 struct rpc_err cku_err; /* error status */ 203 XDR cku_outxdr; /* xdr stream for output */ 204 XDR cku_inxdr; /* xdr stream for input */ 205 char cku_rpchdr[CKU_HDRSIZE + 4]; /* rpc header */ 206 struct cred *cku_cred; /* credentials */ 207 struct rpc_timers *cku_timers; /* for estimating RTT */ 208 struct rpc_timers *cku_timeall; /* for estimating RTT */ 209 void (*cku_feedback)(int, int, caddr_t); 210 /* ptr to feedback rtn */ 211 caddr_t cku_feedarg; /* argument for feedback func */ 212 uint32_t cku_xid; /* current XID */ 213 bool_t cku_bcast; /* RPC broadcast hint */ 214 int cku_useresvport; /* Use reserved port */ 215 struct rpc_clts_client *cku_stats; /* counters for the zone */ 216 }; 217 218 static const struct rpc_clts_client { 219 kstat_named_t rccalls; 220 kstat_named_t rcbadcalls; 221 kstat_named_t rcretrans; 222 kstat_named_t rcbadxids; 223 kstat_named_t rctimeouts; 224 kstat_named_t rcnewcreds; 225 kstat_named_t rcbadverfs; 226 kstat_named_t rctimers; 227 kstat_named_t rcnomem; 228 kstat_named_t rccantsend; 229 } clts_rcstat_tmpl = { 230 { "calls", KSTAT_DATA_UINT64 }, 231 { "badcalls", KSTAT_DATA_UINT64 }, 232 { "retrans", KSTAT_DATA_UINT64 }, 233 { "badxids", KSTAT_DATA_UINT64 }, 234 { "timeouts", KSTAT_DATA_UINT64 }, 235 { "newcreds", KSTAT_DATA_UINT64 }, 236 { "badverfs", KSTAT_DATA_UINT64 }, 237 { "timers", KSTAT_DATA_UINT64 }, 238 { "nomem", KSTAT_DATA_UINT64 }, 239 { "cantsend", KSTAT_DATA_UINT64 }, 240 }; 241 242 static uint_t clts_rcstat_ndata = 243 sizeof (clts_rcstat_tmpl) / sizeof (kstat_named_t); 244 245 #define RCSTAT_INCR(s, x) \ 246 atomic_add_64(&(s)->x.value.ui64, 1) 247 248 #define ptoh(p) (&((p)->cku_client)) 249 #define htop(h) ((struct cku_private *)((h)->cl_private)) 250 251 /* 252 * Times to retry 253 */ 254 #define SNDTRIES 4 255 #define REFRESHES 2 /* authentication refreshes */ 256 257 /* 258 * The following is used to determine the global default behavior for 259 * CLTS when binding to a local port. 260 * 261 * If the value is set to 1 the default will be to select a reserved 262 * (aka privileged) port, if the value is zero the default will be to 263 * use non-reserved ports. Users of kRPC may override this by using 264 * CLNT_CONTROL() and CLSET_BINDRESVPORT. 
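 *
 * For illustration only (hypothetical caller, not code in this file),
 * a kRPC consumer that wants a non-reserved source port for a given
 * handle could do something like:
 *
 *	int off = 0;
 *	(void) CLNT_CONTROL(client, CLSET_BINDRESVPORT, (char *)&off);
 *
 * This is handled by clnt_clts_kcontrol() below and overrides the
 * global default for that handle only.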
265 */ 266 static int clnt_clts_do_bindresvport = 1; 267 268 #define BINDRESVPORT_RETRIES 5 269 270 void 271 clnt_clts_stats_init(zoneid_t zoneid, struct rpc_clts_client **statsp) 272 { 273 kstat_t *ksp; 274 kstat_named_t *knp; 275 276 knp = rpcstat_zone_init_common(zoneid, "unix", "rpc_clts_client", 277 (const kstat_named_t *)&clts_rcstat_tmpl, 278 sizeof (clts_rcstat_tmpl)); 279 /* 280 * Backwards compatibility for old kstat clients 281 */ 282 ksp = kstat_create_zone("unix", 0, "rpc_client", "rpc", 283 KSTAT_TYPE_NAMED, clts_rcstat_ndata, 284 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid); 285 if (ksp) { 286 ksp->ks_data = knp; 287 kstat_install(ksp); 288 } 289 *statsp = (struct rpc_clts_client *)knp; 290 } 291 292 void 293 clnt_clts_stats_fini(zoneid_t zoneid, struct rpc_clts_client **statsp) 294 { 295 rpcstat_zone_fini_common(zoneid, "unix", "rpc_clts_client"); 296 kstat_delete_byname_zone("unix", 0, "rpc_client", zoneid); 297 kmem_free(*statsp, sizeof (clts_rcstat_tmpl)); 298 } 299 300 /* 301 * Create an rpc handle for a clts rpc connection. 302 * Allocates space for the handle structure and the private data. 303 */ 304 /* ARGSUSED */ 305 int 306 clnt_clts_kcreate(struct knetconfig *config, struct netbuf *addr, 307 rpcprog_t pgm, rpcvers_t vers, int retrys, struct cred *cred, 308 CLIENT **cl) 309 { 310 CLIENT *h; 311 struct cku_private *p; 312 struct rpc_msg call_msg; 313 int error; 314 int plen; 315 316 if (cl == NULL) 317 return (EINVAL); 318 319 *cl = NULL; 320 error = 0; 321 322 p = kmem_zalloc(sizeof (*p), KM_SLEEP); 323 324 h = ptoh(p); 325 326 /* handle */ 327 h->cl_ops = &clts_ops; 328 h->cl_private = (caddr_t)p; 329 h->cl_auth = authkern_create(); 330 331 /* call message, just used to pre-serialize below */ 332 call_msg.rm_xid = 0; 333 call_msg.rm_direction = CALL; 334 call_msg.rm_call.cb_rpcvers = RPC_MSG_VERSION; 335 call_msg.rm_call.cb_prog = pgm; 336 call_msg.rm_call.cb_vers = vers; 337 338 /* private */ 339 clnt_clts_kinit(h, addr, retrys, cred); 340 341 xdrmem_create(&p->cku_outxdr, p->cku_rpchdr, CKU_HDRSIZE, XDR_ENCODE); 342 343 /* pre-serialize call message header */ 344 if (!xdr_callhdr(&p->cku_outxdr, &call_msg)) { 345 error = EINVAL; /* XXX */ 346 goto bad; 347 } 348 349 p->cku_config.knc_rdev = config->knc_rdev; 350 p->cku_config.knc_semantics = config->knc_semantics; 351 plen = strlen(config->knc_protofmly) + 1; 352 p->cku_config.knc_protofmly = kmem_alloc(plen, KM_SLEEP); 353 bcopy(config->knc_protofmly, p->cku_config.knc_protofmly, plen); 354 p->cku_useresvport = -1; /* value is has not been set */ 355 356 cv_init(&p->cku_call.call_cv, NULL, CV_DEFAULT, NULL); 357 mutex_init(&p->cku_call.call_lock, NULL, MUTEX_DEFAULT, NULL); 358 359 *cl = h; 360 return (0); 361 362 bad: 363 auth_destroy(h->cl_auth); 364 kmem_free(p->cku_addr.buf, addr->maxlen); 365 kmem_free(p, sizeof (struct cku_private)); 366 367 return (error); 368 } 369 370 void 371 clnt_clts_kinit(CLIENT *h, struct netbuf *addr, int retrys, cred_t *cred) 372 { 373 /* LINTED pointer alignment */ 374 struct cku_private *p = htop(h); 375 struct rpcstat *rsp; 376 377 rsp = zone_getspecific(rpcstat_zone_key, rpc_zone()); 378 ASSERT(rsp != NULL); 379 380 p->cku_retrys = retrys; 381 382 if (p->cku_addr.maxlen < addr->len) { 383 if (p->cku_addr.maxlen != 0 && p->cku_addr.buf != NULL) 384 kmem_free(p->cku_addr.buf, p->cku_addr.maxlen); 385 386 p->cku_addr.buf = kmem_zalloc(addr->maxlen, KM_SLEEP); 387 p->cku_addr.maxlen = addr->maxlen; 388 } 389 390 p->cku_addr.len = addr->len; 391 bcopy(addr->buf, 
p->cku_addr.buf, addr->len); 392 393 p->cku_cred = cred; 394 p->cku_xid = 0; 395 p->cku_timers = NULL; 396 p->cku_timeall = NULL; 397 p->cku_feedback = NULL; 398 p->cku_bcast = FALSE; 399 p->cku_call.call_xid = 0; 400 p->cku_call.call_hash = 0; 401 p->cku_call.call_notified = FALSE; 402 p->cku_call.call_next = NULL; 403 p->cku_call.call_prev = NULL; 404 p->cku_call.call_reply = NULL; 405 p->cku_call.call_wq = NULL; 406 p->cku_stats = rsp->rpc_clts_client; 407 } 408 409 /* 410 * set the timers. Return current retransmission timeout. 411 */ 412 static int 413 clnt_clts_ksettimers(CLIENT *h, struct rpc_timers *t, struct rpc_timers *all, 414 int minimum, void (*feedback)(int, int, caddr_t), caddr_t arg, 415 uint32_t xid) 416 { 417 /* LINTED pointer alignment */ 418 struct cku_private *p = htop(h); 419 int value; 420 421 p->cku_feedback = feedback; 422 p->cku_feedarg = arg; 423 p->cku_timers = t; 424 p->cku_timeall = all; 425 if (xid) 426 p->cku_xid = xid; 427 value = all->rt_rtxcur; 428 value += t->rt_rtxcur; 429 if (value < minimum) 430 return (minimum); 431 RCSTAT_INCR(p->cku_stats, rctimers); 432 return (value); 433 } 434 435 /* 436 * Time out back off function. tim is in HZ 437 */ 438 #define MAXTIMO (20 * hz) 439 #define backoff(tim) (((tim) < MAXTIMO) ? dobackoff(tim) : (tim)) 440 #define dobackoff(tim) ((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1)) 441 442 #define RETRY_POLL_TIMO 30 443 444 /* 445 * Call remote procedure. 446 * Most of the work of rpc is done here. We serialize what is left 447 * of the header (some was pre-serialized in the handle), serialize 448 * the arguments, and send it off. We wait for a reply or a time out. 449 * Timeout causes an immediate return, other packet problems may cause 450 * a retry on the receive. When a good packet is received we deserialize 451 * it, and check verification. A bad reply code will cause one retry 452 * with full (longhand) credentials. 
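 *
 * A rough sketch of the loop below (not a separate interface):
 *
 *	call_again:
 *		serialize the header and arguments (AUTH_MARSHALL, or
 *		    AUTH_WRAP for RPCSEC_GSS) and dupmsg() a copy in
 *		    case we need to retransmit
 *		endpnt_get() a source endpoint if we do not hold one
 *		clnt_clts_dispatch_send() the datagram downstream
 *	tryread:
 *		cv_timedwait[_sig]() on call_cv for a reply or timeout
 *		decode with xdr_replymsg(), then AUTH_VALIDATE() and
 *		    AUTH_UNWRAP() the results
 *		on broadcast RPC_PROCUNAVAIL or a bad verifier: goto tryread
 *		on an authentication error: AUTH_REFRESH() and goto call_again
 *		on a timeout or other soft error: backoff() the timeout and
 *		    retry while stries-- > 0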
453 */ 454 enum clnt_stat 455 clnt_clts_kcallit_addr(CLIENT *h, rpcproc_t procnum, xdrproc_t xdr_args, 456 caddr_t argsp, xdrproc_t xdr_results, caddr_t resultsp, 457 struct timeval wait, struct netbuf *sin) 458 { 459 /* LINTED pointer alignment */ 460 struct cku_private *p = htop(h); 461 XDR *xdrs; 462 int stries = p->cku_retrys; 463 int refreshes = REFRESHES; /* number of times to refresh cred */ 464 int round_trip; /* time the RPC */ 465 int error; 466 int hdrsz; 467 mblk_t *mp; 468 mblk_t *mpdup; 469 mblk_t *resp = NULL; 470 mblk_t *tmp; 471 calllist_t *call = &p->cku_call; 472 clock_t ori_timout, timout; 473 bool_t interrupted; 474 enum clnt_stat status; 475 struct rpc_msg reply_msg; 476 enum clnt_stat re_status; 477 endpnt_t *endpt; 478 479 RCSTAT_INCR(p->cku_stats, rccalls); 480 481 RPCLOG(2, "clnt_clts_kcallit_addr: wait.tv_sec: %ld\n", wait.tv_sec); 482 RPCLOG(2, "clnt_clts_kcallit_addr: wait.tv_usec: %ld\n", wait.tv_usec); 483 484 timout = TIMEVAL_TO_TICK(&wait); 485 ori_timout = timout; 486 487 if (p->cku_xid == 0) { 488 p->cku_xid = alloc_xid(); 489 if (p->cku_endpnt != NULL) 490 endpnt_rele(p->cku_endpnt); 491 p->cku_endpnt = NULL; 492 } 493 call->call_zoneid = rpc_zoneid(); 494 495 mpdup = NULL; 496 call_again: 497 498 if (mpdup == NULL) { 499 500 while ((mp = allocb(CKU_INITSIZE, BPRI_LO)) == NULL) { 501 if (strwaitbuf(CKU_INITSIZE, BPRI_LO)) { 502 p->cku_err.re_status = RPC_SYSTEMERROR; 503 p->cku_err.re_errno = ENOSR; 504 goto done; 505 } 506 } 507 508 xdrs = &p->cku_outxdr; 509 xdrmblk_init(xdrs, mp, XDR_ENCODE, CKU_ALLOCSIZE); 510 511 if (h->cl_auth->ah_cred.oa_flavor != RPCSEC_GSS) { 512 /* 513 * Copy in the preserialized RPC header 514 * information. 515 */ 516 bcopy(p->cku_rpchdr, mp->b_rptr, CKU_HDRSIZE); 517 518 /* 519 * transaction id is the 1st thing in the output 520 * buffer. 521 */ 522 /* LINTED pointer alignment */ 523 (*(uint32_t *)(mp->b_rptr)) = p->cku_xid; 524 525 /* Skip the preserialized stuff. */ 526 XDR_SETPOS(xdrs, CKU_HDRSIZE); 527 528 /* Serialize dynamic stuff into the output buffer. */ 529 if ((!XDR_PUTINT32(xdrs, (int32_t *)&procnum)) || 530 (!AUTH_MARSHALL(h->cl_auth, xdrs, p->cku_cred)) || 531 (!(*xdr_args)(xdrs, argsp))) { 532 freemsg(mp); 533 p->cku_err.re_status = RPC_CANTENCODEARGS; 534 p->cku_err.re_errno = EIO; 535 goto done; 536 } 537 } else { 538 uint32_t *uproc = (uint32_t *) 539 &p->cku_rpchdr[CKU_HDRSIZE]; 540 IXDR_PUT_U_INT32(uproc, procnum); 541 542 (*(uint32_t *)(&p->cku_rpchdr[0])) = p->cku_xid; 543 XDR_SETPOS(xdrs, 0); 544 545 /* Serialize the procedure number and the arguments. */ 546 if (!AUTH_WRAP(h->cl_auth, (caddr_t)p->cku_rpchdr, 547 CKU_HDRSIZE+4, xdrs, xdr_args, argsp)) { 548 freemsg(mp); 549 p->cku_err.re_status = RPC_CANTENCODEARGS; 550 p->cku_err.re_errno = EIO; 551 goto done; 552 } 553 } 554 } else 555 mp = mpdup; 556 557 mpdup = dupmsg(mp); 558 if (mpdup == NULL) { 559 freemsg(mp); 560 p->cku_err.re_status = RPC_SYSTEMERROR; 561 p->cku_err.re_errno = ENOSR; 562 goto done; 563 } 564 565 /* 566 * Grab an endpnt only if the endpoint is NULL. We could be retrying 567 * the request and in this case we want to go through the same 568 * source port, so that the duplicate request cache may detect a 569 * retry. 
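 *
 * (Server side duplicate request caches are typically keyed on the xid
 * together with the caller's address and port and the program, version
 * and procedure, so a retransmission that arrives from a different
 * source port would look like a brand new request to the server.)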
570 */ 571 572 if (p->cku_endpnt == NULL) 573 p->cku_endpnt = endpnt_get(&p->cku_config, p->cku_useresvport); 574 575 if (p->cku_endpnt == NULL) { 576 freemsg(mp); 577 p->cku_err.re_status = RPC_SYSTEMERROR; 578 p->cku_err.re_errno = ENOSR; 579 goto done; 580 } 581 582 round_trip = ddi_get_lbolt(); 583 584 error = clnt_clts_dispatch_send(p->cku_endpnt->e_wq, mp, 585 &p->cku_addr, call, p->cku_xid, p->cku_cred); 586 587 if (error != 0) { 588 freemsg(mp); 589 p->cku_err.re_status = RPC_CANTSEND; 590 p->cku_err.re_errno = error; 591 RCSTAT_INCR(p->cku_stats, rccantsend); 592 goto done1; 593 } 594 595 RPCLOG(64, "clnt_clts_kcallit_addr: sent call for xid 0x%x\n", 596 p->cku_xid); 597 598 /* 599 * There are two reasons for which we go back to to tryread. 600 * 601 * a) In case the status is RPC_PROCUNAVAIL and we sent out a 602 * broadcast we should not get any invalid messages with the 603 * RPC_PROCUNAVAIL error back. Some broken RPC implementations 604 * send them and for this we have to ignore them ( as we would 605 * have never received them ) and look for another message 606 * which might contain the valid response because we don't know 607 * how many broken implementations are in the network. So we are 608 * going to loop until 609 * - we received a valid response 610 * - we have processed all invalid responses and 611 * got a time out when we try to receive again a 612 * message. 613 * 614 * b) We will jump back to tryread also in case we failed 615 * within the AUTH_VALIDATE. In this case we should move 616 * on and loop until we received a valid response or we 617 * have processed all responses with broken authentication 618 * and we got a time out when we try to receive a message. 619 */ 620 tryread: 621 mutex_enter(&call->call_lock); 622 interrupted = FALSE; 623 if (call->call_notified == FALSE) { 624 klwp_t *lwp = ttolwp(curthread); 625 clock_t cv_wait_ret = 1; /* init to > 0 */ 626 clock_t cv_timout = timout; 627 628 if (lwp != NULL) 629 lwp->lwp_nostop++; 630 631 cv_timout += ddi_get_lbolt(); 632 633 if (h->cl_nosignal) 634 while ((cv_wait_ret = 635 cv_timedwait(&call->call_cv, 636 &call->call_lock, cv_timout)) > 0 && 637 call->call_notified == FALSE) 638 ; 639 else 640 while ((cv_wait_ret = 641 cv_timedwait_sig(&call->call_cv, 642 &call->call_lock, cv_timout)) > 0 && 643 call->call_notified == FALSE) 644 ; 645 646 if (cv_wait_ret == 0) 647 interrupted = TRUE; 648 649 if (lwp != NULL) 650 lwp->lwp_nostop--; 651 } 652 resp = call->call_reply; 653 call->call_reply = NULL; 654 status = call->call_status; 655 /* 656 * We have to reset the call_notified here. In case we have 657 * to do a retry ( e.g. in case we got a RPC_PROCUNAVAIL 658 * error ) we need to set this to false to ensure that 659 * we will wait for the next message. When the next message 660 * is going to arrive the function clnt_clts_dispatch_notify 661 * will set this to true again. 662 */ 663 call->call_notified = FALSE; 664 mutex_exit(&call->call_lock); 665 666 if (status == RPC_TIMEDOUT) { 667 if (interrupted) { 668 /* 669 * We got interrupted, bail out 670 */ 671 p->cku_err.re_status = RPC_INTR; 672 p->cku_err.re_errno = EINTR; 673 goto done1; 674 } else { 675 /* 676 * It's possible that our response arrived 677 * right after we timed out. Check to see 678 * if it has arrived before we remove the 679 * calllist from the dispatch queue. 
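 *
 * clnt_clts_dispatch_notify() delivers call_reply and sets
 * call_notified while holding call_lock, so re-checking the
 * flag under call_lock below is enough to close this race.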
680 */ 681 mutex_enter(&call->call_lock); 682 if (call->call_notified == TRUE) { 683 resp = call->call_reply; 684 call->call_reply = NULL; 685 mutex_exit(&call->call_lock); 686 RPCLOG(8, "clnt_clts_kcallit_addr: " 687 "response received for request " 688 "w/xid 0x%x after timeout\n", 689 p->cku_xid); 690 goto getresponse; 691 } 692 mutex_exit(&call->call_lock); 693 694 RPCLOG(8, "clnt_clts_kcallit_addr: " 695 "request w/xid 0x%x timedout " 696 "waiting for reply\n", p->cku_xid); 697 #if 0 /* XXX not yet */ 698 /* 699 * Timeout may be due to a dead gateway. Send 700 * an ioctl downstream advising deletion of 701 * route when we reach the half-way point to 702 * timing out. 703 */ 704 if (stries == p->cku_retrys/2) { 705 t_kadvise(p->cku_endpnt->e_tiptr, 706 (uchar_t *)p->cku_addr.buf, 707 p->cku_addr.len); 708 } 709 #endif /* not yet */ 710 p->cku_err.re_status = RPC_TIMEDOUT; 711 p->cku_err.re_errno = ETIMEDOUT; 712 RCSTAT_INCR(p->cku_stats, rctimeouts); 713 goto done1; 714 } 715 } 716 717 getresponse: 718 /* 719 * Check to see if a response arrived. If it one is 720 * present then proceed to process the reponse. Otherwise 721 * fall through to retry or retransmit the request. This 722 * is probably not the optimal thing to do, but since we 723 * are most likely dealing with a unrealiable transport it 724 * is the safe thing to so. 725 */ 726 if (resp == NULL) { 727 p->cku_err.re_status = RPC_CANTRECV; 728 p->cku_err.re_errno = EIO; 729 goto done1; 730 } 731 732 /* 733 * Prepare the message for further processing. We need to remove 734 * the datagram header and copy the source address if necessary. No 735 * need to verify the header since rpcmod took care of that. 736 */ 737 /* 738 * Copy the source address if the caller has supplied a netbuf. 739 */ 740 if (sin != NULL) { 741 union T_primitives *pptr; 742 743 pptr = (union T_primitives *)resp->b_rptr; 744 bcopy(resp->b_rptr + pptr->unitdata_ind.SRC_offset, sin->buf, 745 pptr->unitdata_ind.SRC_length); 746 sin->len = pptr->unitdata_ind.SRC_length; 747 } 748 749 /* 750 * Pop off the datagram header. 751 */ 752 hdrsz = resp->b_wptr - resp->b_rptr; 753 if ((resp->b_wptr - (resp->b_rptr + hdrsz)) == 0) { 754 tmp = resp; 755 resp = resp->b_cont; 756 tmp->b_cont = NULL; 757 freeb(tmp); 758 } else { 759 unsigned char *ud_off = resp->b_rptr; 760 resp->b_rptr += hdrsz; 761 tmp = dupb(resp); 762 if (tmp == NULL) { 763 p->cku_err.re_status = RPC_SYSTEMERROR; 764 p->cku_err.re_errno = ENOSR; 765 freemsg(resp); 766 goto done1; 767 } 768 tmp->b_cont = resp->b_cont; 769 resp->b_rptr = ud_off; 770 freeb(resp); 771 resp = tmp; 772 } 773 774 round_trip = ddi_get_lbolt() - round_trip; 775 /* 776 * Van Jacobson timer algorithm here, only if NOT a retransmission. 
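 *
 * A sketch of the update performed below, with all values in clock
 * ticks (rt_srtt is effectively kept scaled by 8 and rt_deviate by 4,
 * the same fixed-point trick used by the classic BSD TCP timer code):
 *
 *	err        = rtt - (rt_srtt >> 3);        rt_srtt    += err;
 *	err        = |err| - (rt_deviate >> 2);   rt_deviate += err;
 *	rt_rtxcur  = ((rt_srtt >> 2) + rt_deviate) >> 1;
 *
 * which works out to roughly SRTT + 2 * (mean deviation) for the new
 * retransmit timeout.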
777 */ 778 if (p->cku_timers != NULL && stries == p->cku_retrys) { 779 int rt; 780 781 rt = round_trip; 782 rt -= (p->cku_timers->rt_srtt >> 3); 783 p->cku_timers->rt_srtt += rt; 784 if (rt < 0) 785 rt = - rt; 786 rt -= (p->cku_timers->rt_deviate >> 2); 787 p->cku_timers->rt_deviate += rt; 788 p->cku_timers->rt_rtxcur = 789 (clock_t)((p->cku_timers->rt_srtt >> 2) + 790 p->cku_timers->rt_deviate) >> 1; 791 792 rt = round_trip; 793 rt -= (p->cku_timeall->rt_srtt >> 3); 794 p->cku_timeall->rt_srtt += rt; 795 if (rt < 0) 796 rt = - rt; 797 rt -= (p->cku_timeall->rt_deviate >> 2); 798 p->cku_timeall->rt_deviate += rt; 799 p->cku_timeall->rt_rtxcur = 800 (clock_t)((p->cku_timeall->rt_srtt >> 2) + 801 p->cku_timeall->rt_deviate) >> 1; 802 if (p->cku_feedback != NULL) { 803 (*p->cku_feedback)(FEEDBACK_OK, procnum, 804 p->cku_feedarg); 805 } 806 } 807 808 /* 809 * Process reply 810 */ 811 xdrs = &(p->cku_inxdr); 812 xdrmblk_init(xdrs, resp, XDR_DECODE, 0); 813 814 reply_msg.rm_direction = REPLY; 815 reply_msg.rm_reply.rp_stat = MSG_ACCEPTED; 816 reply_msg.acpted_rply.ar_stat = SUCCESS; 817 reply_msg.acpted_rply.ar_verf = _null_auth; 818 /* 819 * xdr_results will be done in AUTH_UNWRAP. 820 */ 821 reply_msg.acpted_rply.ar_results.where = NULL; 822 reply_msg.acpted_rply.ar_results.proc = xdr_void; 823 824 /* 825 * Decode and validate the response. 826 */ 827 if (!xdr_replymsg(xdrs, &reply_msg)) { 828 p->cku_err.re_status = RPC_CANTDECODERES; 829 p->cku_err.re_errno = EIO; 830 (void) xdr_rpc_free_verifier(xdrs, &reply_msg); 831 goto done1; 832 } 833 834 _seterr_reply(&reply_msg, &(p->cku_err)); 835 836 re_status = p->cku_err.re_status; 837 if (re_status == RPC_SUCCESS) { 838 /* 839 * Reply is good, check auth. 840 */ 841 if (!AUTH_VALIDATE(h->cl_auth, 842 &reply_msg.acpted_rply.ar_verf)) { 843 p->cku_err.re_status = RPC_AUTHERROR; 844 p->cku_err.re_why = AUTH_INVALIDRESP; 845 RCSTAT_INCR(p->cku_stats, rcbadverfs); 846 (void) xdr_rpc_free_verifier(xdrs, &reply_msg); 847 goto tryread; 848 } 849 if (!AUTH_UNWRAP(h->cl_auth, xdrs, xdr_results, resultsp)) { 850 p->cku_err.re_status = RPC_CANTDECODERES; 851 p->cku_err.re_errno = EIO; 852 } 853 (void) xdr_rpc_free_verifier(xdrs, &reply_msg); 854 goto done1; 855 } 856 /* set errno in case we can't recover */ 857 if (re_status != RPC_VERSMISMATCH && 858 re_status != RPC_AUTHERROR && re_status != RPC_PROGVERSMISMATCH) 859 p->cku_err.re_errno = EIO; 860 /* 861 * Determine whether or not we're doing an RPC 862 * broadcast. Some server implementations don't 863 * follow RFC 1050, section 7.4.2 in that they 864 * don't remain silent when they see a proc 865 * they don't support. Therefore we keep trying 866 * to receive on RPC_PROCUNAVAIL, hoping to get 867 * a valid response from a compliant server. 868 */ 869 if (re_status == RPC_PROCUNAVAIL && p->cku_bcast) { 870 (void) xdr_rpc_free_verifier(xdrs, &reply_msg); 871 goto tryread; 872 } 873 if (re_status == RPC_AUTHERROR) { 874 /* 875 * Maybe our credential need to be refreshed 876 */ 877 if (refreshes > 0 && 878 AUTH_REFRESH(h->cl_auth, &reply_msg, p->cku_cred)) { 879 /* 880 * The credential is refreshed. Try the request again. 881 * Even if stries == 0, we still retry as long as 882 * refreshes > 0. This prevents a soft authentication 883 * error turning into a hard one at an upper level. 
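 *
 * (The retry re-enters call_again with mpdup reset, so the request is
 * re-marshalled with the refreshed credential.  The xid is left
 * unchanged here, unlike the AUTH_TOOWEAK case below which forces a
 * fresh xid and a new endpoint.)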
884 */ 885 refreshes--; 886 RCSTAT_INCR(p->cku_stats, rcbadcalls); 887 RCSTAT_INCR(p->cku_stats, rcnewcreds); 888 889 (void) xdr_rpc_free_verifier(xdrs, &reply_msg); 890 freemsg(mpdup); 891 call_table_remove(call); 892 mutex_enter(&call->call_lock); 893 if (call->call_reply != NULL) { 894 freemsg(call->call_reply); 895 call->call_reply = NULL; 896 } 897 mutex_exit(&call->call_lock); 898 899 freemsg(resp); 900 mpdup = NULL; 901 goto call_again; 902 } 903 /* 904 * We have used the client handle to do an AUTH_REFRESH 905 * and the RPC status may be set to RPC_SUCCESS; 906 * Let's make sure to set it to RPC_AUTHERROR. 907 */ 908 p->cku_err.re_status = RPC_CANTDECODERES; 909 910 /* 911 * Map recoverable and unrecoverable 912 * authentication errors to appropriate errno 913 */ 914 switch (p->cku_err.re_why) { 915 case AUTH_TOOWEAK: 916 /* 917 * Could be an nfsportmon failure, set 918 * useresvport and try again. 919 */ 920 if (p->cku_useresvport != 1) { 921 p->cku_useresvport = 1; 922 (void) xdr_rpc_free_verifier(xdrs, &reply_msg); 923 freemsg(mpdup); 924 925 call_table_remove(call); 926 mutex_enter(&call->call_lock); 927 if (call->call_reply != NULL) { 928 freemsg(call->call_reply); 929 call->call_reply = NULL; 930 } 931 mutex_exit(&call->call_lock); 932 933 freemsg(resp); 934 mpdup = NULL; 935 endpt = p->cku_endpnt; 936 if (endpt->e_tiptr != NULL) { 937 mutex_enter(&endpt->e_lock); 938 endpt->e_flags &= ~ENDPNT_BOUND; 939 (void) t_kclose(endpt->e_tiptr, 1); 940 endpt->e_tiptr = NULL; 941 mutex_exit(&endpt->e_lock); 942 943 } 944 945 p->cku_xid = alloc_xid(); 946 endpnt_rele(p->cku_endpnt); 947 p->cku_endpnt = NULL; 948 goto call_again; 949 } 950 /* FALLTHRU */ 951 case AUTH_BADCRED: 952 case AUTH_BADVERF: 953 case AUTH_INVALIDRESP: 954 case AUTH_FAILED: 955 case RPCSEC_GSS_NOCRED: 956 case RPCSEC_GSS_FAILED: 957 p->cku_err.re_errno = EACCES; 958 break; 959 case AUTH_REJECTEDCRED: 960 case AUTH_REJECTEDVERF: 961 default: 962 p->cku_err.re_errno = EIO; 963 break; 964 } 965 RPCLOG(1, "clnt_clts_kcallit : authentication failed " 966 "with RPC_AUTHERROR of type %d\n", 967 p->cku_err.re_why); 968 } 969 970 (void) xdr_rpc_free_verifier(xdrs, &reply_msg); 971 972 done1: 973 call_table_remove(call); 974 mutex_enter(&call->call_lock); 975 if (call->call_reply != NULL) { 976 freemsg(call->call_reply); 977 call->call_reply = NULL; 978 } 979 mutex_exit(&call->call_lock); 980 RPCLOG(64, "clnt_clts_kcallit_addr: xid 0x%x taken off dispatch list", 981 p->cku_xid); 982 983 done: 984 if (resp != NULL) { 985 freemsg(resp); 986 resp = NULL; 987 } 988 989 if ((p->cku_err.re_status != RPC_SUCCESS) && 990 (p->cku_err.re_status != RPC_INTR) && 991 (p->cku_err.re_status != RPC_UDERROR) && 992 !IS_UNRECOVERABLE_RPC(p->cku_err.re_status)) { 993 if (p->cku_feedback != NULL && stries == p->cku_retrys) { 994 (*p->cku_feedback)(FEEDBACK_REXMIT1, procnum, 995 p->cku_feedarg); 996 } 997 998 timout = backoff(timout); 999 if (p->cku_timeall != (struct rpc_timers *)0) 1000 p->cku_timeall->rt_rtxcur = timout; 1001 1002 if (p->cku_err.re_status == RPC_SYSTEMERROR || 1003 p->cku_err.re_status == RPC_CANTSEND) { 1004 /* 1005 * Errors due to lack of resources, wait a bit 1006 * and try again. 
1007 */ 1008 (void) delay(hz/10); 1009 } 1010 if (stries-- > 0) { 1011 RCSTAT_INCR(p->cku_stats, rcretrans); 1012 goto call_again; 1013 } 1014 } 1015 1016 if (mpdup != NULL) 1017 freemsg(mpdup); 1018 1019 if (p->cku_err.re_status != RPC_SUCCESS) { 1020 RCSTAT_INCR(p->cku_stats, rcbadcalls); 1021 } 1022 1023 /* 1024 * Allow the endpoint to be held by the client handle in case this 1025 * RPC was not successful. A retry may occur at a higher level and 1026 * in this case we may want to send the request over the same 1027 * source port. 1028 * Endpoint is also released for one-way RPC: no reply, nor retransmit 1029 * is expected. 1030 */ 1031 if ((p->cku_err.re_status == RPC_SUCCESS || 1032 (p->cku_err.re_status == RPC_TIMEDOUT && ori_timout == 0)) && 1033 p->cku_endpnt != NULL) { 1034 endpnt_rele(p->cku_endpnt); 1035 p->cku_endpnt = NULL; 1036 } else { 1037 DTRACE_PROBE2(clnt_clts_kcallit_done, int, p->cku_err.re_status, 1038 struct endpnt *, p->cku_endpnt); 1039 } 1040 1041 return (p->cku_err.re_status); 1042 } 1043 1044 static enum clnt_stat 1045 clnt_clts_kcallit(CLIENT *h, rpcproc_t procnum, xdrproc_t xdr_args, 1046 caddr_t argsp, xdrproc_t xdr_results, caddr_t resultsp, 1047 struct timeval wait) 1048 { 1049 return (clnt_clts_kcallit_addr(h, procnum, xdr_args, argsp, 1050 xdr_results, resultsp, wait, NULL)); 1051 } 1052 1053 /* 1054 * Return error info on this handle. 1055 */ 1056 static void 1057 clnt_clts_kerror(CLIENT *h, struct rpc_err *err) 1058 { 1059 /* LINTED pointer alignment */ 1060 struct cku_private *p = htop(h); 1061 1062 *err = p->cku_err; 1063 } 1064 1065 static bool_t 1066 clnt_clts_kfreeres(CLIENT *h, xdrproc_t xdr_res, caddr_t res_ptr) 1067 { 1068 /* LINTED pointer alignment */ 1069 struct cku_private *p = htop(h); 1070 XDR *xdrs; 1071 1072 xdrs = &(p->cku_outxdr); 1073 xdrs->x_op = XDR_FREE; 1074 return ((*xdr_res)(xdrs, res_ptr)); 1075 } 1076 1077 /*ARGSUSED*/ 1078 static void 1079 clnt_clts_kabort(CLIENT *h) 1080 { 1081 } 1082 1083 static bool_t 1084 clnt_clts_kcontrol(CLIENT *h, int cmd, char *arg) 1085 { 1086 /* LINTED pointer alignment */ 1087 struct cku_private *p = htop(h); 1088 1089 switch (cmd) { 1090 case CLSET_XID: 1091 p->cku_xid = *((uint32_t *)arg); 1092 return (TRUE); 1093 1094 case CLGET_XID: 1095 *((uint32_t *)arg) = p->cku_xid; 1096 return (TRUE); 1097 1098 case CLSET_BCAST: 1099 p->cku_bcast = *((uint32_t *)arg); 1100 return (TRUE); 1101 1102 case CLGET_BCAST: 1103 *((uint32_t *)arg) = p->cku_bcast; 1104 return (TRUE); 1105 case CLSET_BINDRESVPORT: 1106 if (arg == NULL) 1107 return (FALSE); 1108 1109 if (*(int *)arg != 1 && *(int *)arg != 0) 1110 return (FALSE); 1111 1112 p->cku_useresvport = *(int *)arg; 1113 1114 return (TRUE); 1115 1116 case CLGET_BINDRESVPORT: 1117 if (arg == NULL) 1118 return (FALSE); 1119 1120 *(int *)arg = p->cku_useresvport; 1121 1122 return (TRUE); 1123 1124 default: 1125 return (FALSE); 1126 } 1127 } 1128 1129 /* 1130 * Destroy rpc handle. 1131 * Frees the space used for output buffer, private data, and handle 1132 * structure, and the file pointer/TLI data on last reference. 
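 *
 * For illustration, a typical consumer pairs this with
 * clnt_clts_kcreate() roughly as follows (hypothetical caller, error
 * handling elided):
 *
 *	CLIENT *cl;
 *
 *	if (clnt_clts_kcreate(&knconf, &addr, prog, vers, retries,
 *	    cred, &cl) == 0) {
 *		status = CLNT_CALL(cl, proc, xdr_args, (caddr_t)&args,
 *		    xdr_res, (caddr_t)&res, wait);
 *		...
 *		CLNT_DESTROY(cl);
 *	}
 *
 * CLNT_DESTROY() resolves through the cl_destroy slot of clts_ops to
 * this function.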
1133 */ 1134 static void 1135 clnt_clts_kdestroy(CLIENT *h) 1136 { 1137 /* LINTED pointer alignment */ 1138 struct cku_private *p = htop(h); 1139 calllist_t *call = &p->cku_call; 1140 1141 int plen; 1142 1143 RPCLOG(8, "clnt_clts_kdestroy h: %p\n", (void *)h); 1144 RPCLOG(8, "clnt_clts_kdestroy h: xid=0x%x\n", p->cku_xid); 1145 1146 if (p->cku_endpnt != NULL) 1147 endpnt_rele(p->cku_endpnt); 1148 1149 cv_destroy(&call->call_cv); 1150 mutex_destroy(&call->call_lock); 1151 1152 plen = strlen(p->cku_config.knc_protofmly) + 1; 1153 kmem_free(p->cku_config.knc_protofmly, plen); 1154 kmem_free(p->cku_addr.buf, p->cku_addr.maxlen); 1155 kmem_free(p, sizeof (*p)); 1156 } 1157 1158 /* 1159 * The connectionless (CLTS) kRPC endpoint management subsystem. 1160 * 1161 * Because endpoints are potentially shared among threads making RPC calls, 1162 * they are managed in a pool according to type (endpnt_type_t). Each 1163 * endpnt_type_t points to a list of usable endpoints through the e_pool 1164 * field, which is of type list_t. list_t is a doubly-linked list. 1165 * The number of endpoints in the pool is stored in the e_cnt field of 1166 * endpnt_type_t and the endpoints are reference counted using the e_ref field 1167 * in the endpnt_t structure. 1168 * 1169 * As an optimization, endpoints that have no references are also linked 1170 * to an idle list via e_ilist which is also of type list_t. When a thread 1171 * calls endpnt_get() to obtain a transport endpoint, the idle list is first 1172 * consulted and if such an endpoint exists, it is removed from the idle list 1173 * and returned to the caller. 1174 * 1175 * If the idle list is empty, then a check is made to see if more endpoints 1176 * can be created. If so, we proceed and create a new endpoint which is added 1177 * to the pool and returned to the caller. If we have reached the limit and 1178 * cannot make a new endpoint then one is returned to the caller via round- 1179 * robin policy. 1180 * 1181 * When an endpoint is placed on the idle list by a thread calling 1182 * endpnt_rele(), it is timestamped and then a reaper taskq is scheduled to 1183 * be dispatched if one hasn't already been. When the timer fires, the 1184 * taskq traverses the idle list and checks to see which endpoints are 1185 * eligible to be closed. It determines this by checking if the timestamp 1186 * when the endpoint was released has exceeded the the threshold for how long 1187 * it should stay alive. 1188 * 1189 * endpnt_t structures remain persistent until the memory reclaim callback, 1190 * endpnt_reclaim(), is invoked. 1191 * 1192 * Here is an example of how the data structures would be laid out by the 1193 * subsystem: 1194 * 1195 * endpnt_type_t 1196 * 1197 * loopback inet 1198 * _______________ ______________ 1199 * | e_next |----------------------->| e_next |---->> 1200 * | e_pool |<---+ | e_pool |<----+ 1201 * | e_ilist |<---+--+ | e_ilist |<----+--+ 1202 * +->| e_pcurr |----+--+--+ +->| e_pcurr |-----+--+--+ 1203 * | | ... | | | | | | ... | | | | 1204 * | | e_itimer (90) | | | | | | e_itimer (0) | | | | 1205 * | | e_cnt (1) | | | | | | e_cnt (3) | | | | 1206 * | +---------------+ | | | | +--------------+ | | | 1207 * | | | | | | | | 1208 * | endpnt_t | | | | | | | 1209 * | ____________ | | | | ____________ | | | 1210 * | | e_node |<------+ | | | | e_node |<------+ | | 1211 * | | e_idle |<---------+ | | | e_idle | | | | 1212 * +--| e_type |<------------+ +--| e_type | | | | 1213 * | e_tiptr | | | e_tiptr | | | | 1214 * | ... | | | ... 
| | | | 1215 * | e_lock | | | e_lock | | | | 1216 * | ... | | | ... | | | | 1217 * | e_ref (0) | | | e_ref (2) | | | | 1218 * | e_itime | | | e_itime | | | | 1219 * +------------+ | +------------+ | | | 1220 * | | | | 1221 * | | | | 1222 * | ____________ | | | 1223 * | | e_node |<------+ | | 1224 * | | e_idle |<------+--+ | 1225 * +--| e_type | | | 1226 * | | e_tiptr | | | 1227 * | | ... | | | 1228 * | | e_lock | | | 1229 * | | ... | | | 1230 * | | e_ref (0) | | | 1231 * | | e_itime | | | 1232 * | +------------+ | | 1233 * | | | 1234 * | | | 1235 * | ____________ | | 1236 * | | e_node |<------+ | 1237 * | | e_idle | | 1238 * +--| e_type |<------------+ 1239 * | e_tiptr | 1240 * | ... | 1241 * | e_lock | 1242 * | ... | 1243 * | e_ref (1) | 1244 * | e_itime | 1245 * +------------+ 1246 * 1247 * Endpoint locking strategy: 1248 * 1249 * The following functions manipulate lists which hold the endpoint and the 1250 * endpoints themselves: 1251 * 1252 * endpnt_get()/check_endpnt()/endpnt_rele()/endpnt_reap()/do_endpnt_reclaim() 1253 * 1254 * Lock description follows: 1255 * 1256 * endpnt_type_lock: Global reader/writer lock which protects accesses to the 1257 * endpnt_type_list. 1258 * 1259 * e_plock: Lock defined in the endpnt_type_t. It is intended to 1260 * protect accesses to the pool of endopints (e_pool) for a given 1261 * endpnt_type_t. 1262 * 1263 * e_ilock: Lock defined in endpnt_type_t. It is intended to protect accesses 1264 * to the idle list (e_ilist) of available endpoints for a given 1265 * endpnt_type_t. It also protects access to the e_itimer, e_async_cv, 1266 * and e_async_count fields in endpnt_type_t. 1267 * 1268 * e_lock: Lock defined in the endpnt structure. It is intended to protect 1269 * flags, cv, and ref count. 1270 * 1271 * The order goes as follows so as not to induce deadlock. 1272 * 1273 * endpnt_type_lock -> e_plock -> e_ilock -> e_lock 1274 * 1275 * Interaction with Zones and shutting down: 1276 * 1277 * endpnt_type_ts are uniquely identified by the (e_zoneid, e_rdev, e_protofmly) 1278 * tuple, which means that a zone may not reuse another zone's idle endpoints 1279 * without first doing a t_kclose(). 1280 * 1281 * A zone's endpnt_type_ts are destroyed when a zone is shut down; e_async_cv 1282 * and e_async_count are used to keep track of the threads in endpnt_taskq 1283 * trying to reap endpnt_ts in the endpnt_type_t. 1284 */ 1285 1286 /* 1287 * Allocate and initialize an endpnt_type_t 1288 */ 1289 static struct endpnt_type * 1290 endpnt_type_create(struct knetconfig *config) 1291 { 1292 struct endpnt_type *etype; 1293 1294 /* 1295 * Allocate a new endpoint type to hang a list of 1296 * endpoints off of it. 
1297 */ 1298 etype = kmem_alloc(sizeof (struct endpnt_type), KM_SLEEP); 1299 etype->e_next = NULL; 1300 etype->e_pcurr = NULL; 1301 etype->e_itimer = 0; 1302 etype->e_cnt = 0; 1303 1304 (void) strncpy(etype->e_protofmly, config->knc_protofmly, KNC_STRSIZE); 1305 mutex_init(&etype->e_plock, NULL, MUTEX_DEFAULT, NULL); 1306 mutex_init(&etype->e_ilock, NULL, MUTEX_DEFAULT, NULL); 1307 etype->e_rdev = config->knc_rdev; 1308 etype->e_zoneid = rpc_zoneid(); 1309 etype->e_async_count = 0; 1310 cv_init(&etype->e_async_cv, NULL, CV_DEFAULT, NULL); 1311 1312 list_create(&etype->e_pool, sizeof (endpnt_t), 1313 offsetof(endpnt_t, e_node)); 1314 list_create(&etype->e_ilist, sizeof (endpnt_t), 1315 offsetof(endpnt_t, e_idle)); 1316 1317 /* 1318 * Check to see if we need to create a taskq for endpoint 1319 * reaping 1320 */ 1321 mutex_enter(&endpnt_taskq_lock); 1322 if (taskq_created == FALSE) { 1323 taskq_created = TRUE; 1324 mutex_exit(&endpnt_taskq_lock); 1325 ASSERT(endpnt_taskq == NULL); 1326 endpnt_taskq = taskq_create("clts_endpnt_taskq", 1, 1327 minclsyspri, 200, INT_MAX, 0); 1328 } else 1329 mutex_exit(&endpnt_taskq_lock); 1330 1331 return (etype); 1332 } 1333 1334 /* 1335 * Free an endpnt_type_t 1336 */ 1337 static void 1338 endpnt_type_free(struct endpnt_type *etype) 1339 { 1340 mutex_destroy(&etype->e_plock); 1341 mutex_destroy(&etype->e_ilock); 1342 list_destroy(&etype->e_pool); 1343 list_destroy(&etype->e_ilist); 1344 kmem_free(etype, sizeof (endpnt_type_t)); 1345 } 1346 1347 /* 1348 * Check the endpoint to ensure that it is suitable for use. 1349 * 1350 * Possible return values: 1351 * 1352 * return (1) - Endpoint is established, but needs to be re-opened. 1353 * return (0) && *newp == NULL - Endpoint is established, but unusable. 1354 * return (0) && *newp != NULL - Endpoint is established and usable. 1355 */ 1356 static int 1357 check_endpnt(struct endpnt *endp, struct endpnt **newp) 1358 { 1359 *newp = endp; 1360 1361 mutex_enter(&endp->e_lock); 1362 ASSERT(endp->e_ref >= 1); 1363 1364 /* 1365 * The first condition we check for is if the endpoint has been 1366 * allocated, but is unusable either because it has been closed or 1367 * has been marked stale. Only *one* thread will be allowed to 1368 * execute the then clause. This is enforced because the first thread 1369 * to check this condition will clear the flags, so that subsequent 1370 * thread(s) checking this endpoint will move on. 1371 */ 1372 if ((endp->e_flags & ENDPNT_ESTABLISHED) && 1373 (!(endp->e_flags & ENDPNT_BOUND) || 1374 (endp->e_flags & ENDPNT_STALE))) { 1375 /* 1376 * Clear the flags here since they will be 1377 * set again by this thread. They need to be 1378 * individually cleared because we want to maintain 1379 * the state for ENDPNT_ONIDLE. 1380 */ 1381 endp->e_flags &= ~(ENDPNT_ESTABLISHED | 1382 ENDPNT_WAITING | ENDPNT_BOUND | ENDPNT_STALE); 1383 mutex_exit(&endp->e_lock); 1384 return (1); 1385 } 1386 1387 /* 1388 * The second condition is meant for any thread that is waiting for 1389 * an endpoint to become established. It will cv_wait() until 1390 * the condition for the endpoint has been changed to ENDPNT_BOUND or 1391 * ENDPNT_STALE. 1392 */ 1393 while (!(endp->e_flags & ENDPNT_BOUND) && 1394 !(endp->e_flags & ENDPNT_STALE)) { 1395 endp->e_flags |= ENDPNT_WAITING; 1396 cv_wait(&endp->e_cv, &endp->e_lock); 1397 } 1398 1399 ASSERT(endp->e_flags & ENDPNT_ESTABLISHED); 1400 1401 /* 1402 * The last case we check for is if the endpoint has been marked stale. 
1403 * If this is the case then set *newp to NULL and return, so that the 1404 * caller is notified of the error and can take appropriate action. 1405 */ 1406 if (endp->e_flags & ENDPNT_STALE) { 1407 endp->e_ref--; 1408 *newp = NULL; 1409 } 1410 mutex_exit(&endp->e_lock); 1411 return (0); 1412 } 1413 1414 #ifdef DEBUG 1415 /* 1416 * Provide a fault injection setting to test error conditions. 1417 */ 1418 static int endpnt_get_return_null = 0; 1419 #endif 1420 1421 /* 1422 * Returns a handle (struct endpnt *) to an open and bound endpoint 1423 * specified by the knetconfig passed in. Returns NULL if no valid endpoint 1424 * can be obtained. 1425 */ 1426 static struct endpnt * 1427 endpnt_get(struct knetconfig *config, int useresvport) 1428 { 1429 struct endpnt_type *n_etype = NULL; 1430 struct endpnt_type *np = NULL; 1431 struct endpnt *new = NULL; 1432 struct endpnt *endp = NULL; 1433 struct endpnt *next = NULL; 1434 TIUSER *tiptr = NULL; 1435 int rtries = BINDRESVPORT_RETRIES; 1436 int i = 0; 1437 int error; 1438 int retval; 1439 zoneid_t zoneid = rpc_zoneid(); 1440 cred_t *cr; 1441 1442 RPCLOG(1, "endpnt_get: protofmly %s, ", config->knc_protofmly); 1443 RPCLOG(1, "rdev %ld\n", config->knc_rdev); 1444 1445 #ifdef DEBUG 1446 /* 1447 * Inject fault if desired. Pretend we have a stale endpoint 1448 * and return NULL. 1449 */ 1450 if (endpnt_get_return_null > 0) { 1451 endpnt_get_return_null--; 1452 return (NULL); 1453 } 1454 #endif 1455 rw_enter(&endpnt_type_lock, RW_READER); 1456 1457 top: 1458 for (np = endpnt_type_list; np != NULL; np = np->e_next) 1459 if ((np->e_zoneid == zoneid) && 1460 (np->e_rdev == config->knc_rdev) && 1461 (strcmp(np->e_protofmly, 1462 config->knc_protofmly) == 0)) 1463 break; 1464 1465 if (np == NULL && n_etype != NULL) { 1466 ASSERT(rw_write_held(&endpnt_type_lock)); 1467 1468 /* 1469 * Link the endpoint type onto the list 1470 */ 1471 n_etype->e_next = endpnt_type_list; 1472 endpnt_type_list = n_etype; 1473 np = n_etype; 1474 n_etype = NULL; 1475 } 1476 1477 if (np == NULL) { 1478 /* 1479 * The logic here is that we were unable to find an 1480 * endpnt_type_t that matched our criteria, so we allocate a 1481 * new one. Because kmem_alloc() needs to be called with 1482 * KM_SLEEP, we drop our locks so that we don't induce 1483 * deadlock. After allocating and initializing the 1484 * endpnt_type_t, we reaquire the lock and go back to check 1485 * if this entry needs to be added to the list. Since we do 1486 * some operations without any locking other threads may 1487 * have been looking for the same endpnt_type_t and gone 1488 * through this code path. We check for this case and allow 1489 * one thread to link its endpnt_type_t to the list and the 1490 * other threads will simply free theirs. 1491 */ 1492 rw_exit(&endpnt_type_lock); 1493 n_etype = endpnt_type_create(config); 1494 1495 /* 1496 * We need to reaquire the lock with RW_WRITER here so that 1497 * we can safely link the new endpoint type onto the list. 1498 */ 1499 rw_enter(&endpnt_type_lock, RW_WRITER); 1500 goto top; 1501 } 1502 1503 rw_exit(&endpnt_type_lock); 1504 /* 1505 * If n_etype is not NULL, then another thread was able to 1506 * insert an endpnt_type_t of this type onto the list before 1507 * we did. Go ahead and free ours. 1508 */ 1509 if (n_etype != NULL) 1510 endpnt_type_free(n_etype); 1511 1512 mutex_enter(&np->e_ilock); 1513 /* 1514 * The algorithm to hand out endpoints is to first 1515 * give out those that are idle if such endpoints 1516 * exist. 
Otherwise, create a new one if we haven't 1517 * reached the max threshold. Finally, we give out 1518 * endpoints in a pseudo LRU fashion (round-robin). 1519 * 1520 * Note: The idle list is merely a hint of those endpoints 1521 * that should be idle. There exists a window after the 1522 * endpoint is released and before it is linked back onto the 1523 * idle list where a thread could get a reference to it and 1524 * use it. This is okay, since the reference counts will 1525 * still be consistent. 1526 */ 1527 if ((endp = (endpnt_t *)list_head(&np->e_ilist)) != NULL) { 1528 timeout_id_t t_id = 0; 1529 1530 mutex_enter(&endp->e_lock); 1531 endp->e_ref++; 1532 endp->e_itime = 0; 1533 endp->e_flags &= ~ENDPNT_ONIDLE; 1534 mutex_exit(&endp->e_lock); 1535 1536 /* 1537 * Pop the endpoint off the idle list and hand it off 1538 */ 1539 list_remove(&np->e_ilist, endp); 1540 1541 if (np->e_itimer != 0) { 1542 t_id = np->e_itimer; 1543 np->e_itimer = 0; 1544 } 1545 mutex_exit(&np->e_ilock); 1546 /* 1547 * Reset the idle timer if it has been set 1548 */ 1549 if (t_id != (timeout_id_t)0) 1550 (void) untimeout(t_id); 1551 1552 if (check_endpnt(endp, &new) == 0) 1553 return (new); 1554 } else if (np->e_cnt >= clnt_clts_max_endpoints) { 1555 /* 1556 * There are no idle endpoints currently, so 1557 * create a new one if we have not reached the maximum or 1558 * hand one out in round-robin. 1559 */ 1560 mutex_exit(&np->e_ilock); 1561 mutex_enter(&np->e_plock); 1562 endp = np->e_pcurr; 1563 mutex_enter(&endp->e_lock); 1564 endp->e_ref++; 1565 mutex_exit(&endp->e_lock); 1566 1567 ASSERT(endp != NULL); 1568 /* 1569 * Advance the pointer to the next eligible endpoint, if 1570 * necessary. 1571 */ 1572 if (np->e_cnt > 1) { 1573 next = (endpnt_t *)list_next(&np->e_pool, np->e_pcurr); 1574 if (next == NULL) 1575 next = (endpnt_t *)list_head(&np->e_pool); 1576 np->e_pcurr = next; 1577 } 1578 1579 mutex_exit(&np->e_plock); 1580 1581 /* 1582 * We need to check to see if this endpoint is bound or 1583 * not. If it is in progress then just wait until 1584 * the set up is complete 1585 */ 1586 if (check_endpnt(endp, &new) == 0) 1587 return (new); 1588 } else { 1589 mutex_exit(&np->e_ilock); 1590 mutex_enter(&np->e_plock); 1591 1592 /* 1593 * Allocate a new endpoint to use. If we can't allocate any 1594 * more memory then use one that is already established if any 1595 * such endpoints exist. 1596 */ 1597 new = kmem_cache_alloc(endpnt_cache, KM_NOSLEEP); 1598 if (new == NULL) { 1599 RPCLOG0(1, "endpnt_get: kmem_cache_alloc failed\n"); 1600 /* 1601 * Try to recover by using an existing endpoint. 1602 */ 1603 if (np->e_cnt <= 0) { 1604 mutex_exit(&np->e_plock); 1605 return (NULL); 1606 } 1607 endp = np->e_pcurr; 1608 if ((next = list_next(&np->e_pool, np->e_pcurr)) != 1609 NULL) 1610 np->e_pcurr = next; 1611 ASSERT(endp != NULL); 1612 mutex_enter(&endp->e_lock); 1613 endp->e_ref++; 1614 mutex_exit(&endp->e_lock); 1615 mutex_exit(&np->e_plock); 1616 1617 if (check_endpnt(endp, &new) == 0) 1618 return (new); 1619 } else { 1620 /* 1621 * Partially init an endpoint structure and put 1622 * it on the list, so that other interested threads 1623 * know that one is being created 1624 */ 1625 bzero(new, sizeof (struct endpnt)); 1626 1627 cv_init(&new->e_cv, NULL, CV_DEFAULT, NULL); 1628 mutex_init(&new->e_lock, NULL, MUTEX_DEFAULT, NULL); 1629 new->e_ref = 1; 1630 new->e_type = np; 1631 1632 /* 1633 * Link the endpoint into the pool. 
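 *
 * Note that the new endpoint is linked onto the pool before it has
 * been opened and bound; another thread that picks it up in the
 * meantime will block in check_endpnt() until ENDPNT_BOUND or
 * ENDPNT_STALE is set further down.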
1634 */ 1635 list_insert_head(&np->e_pool, new); 1636 np->e_cnt++; 1637 if (np->e_pcurr == NULL) 1638 np->e_pcurr = new; 1639 mutex_exit(&np->e_plock); 1640 } 1641 } 1642 1643 /* 1644 * The transport should be opened with sufficient privs 1645 */ 1646 cr = zone_kcred(); 1647 error = t_kopen(NULL, config->knc_rdev, FREAD|FWRITE|FNDELAY, &tiptr, 1648 cr); 1649 if (error) { 1650 RPCLOG(1, "endpnt_get: t_kopen: %d\n", error); 1651 goto bad; 1652 } 1653 1654 new->e_tiptr = tiptr; 1655 rpc_poptimod(tiptr->fp->f_vnode); 1656 1657 /* 1658 * Allow the kernel to push the module on behalf of the user. 1659 */ 1660 error = strioctl(tiptr->fp->f_vnode, I_PUSH, (intptr_t)"rpcmod", 0, 1661 K_TO_K, cr, &retval); 1662 if (error) { 1663 RPCLOG(1, "endpnt_get: kstr_push on rpcmod failed %d\n", error); 1664 goto bad; 1665 } 1666 1667 error = strioctl(tiptr->fp->f_vnode, RPC_CLIENT, 0, 0, K_TO_K, 1668 cr, &retval); 1669 if (error) { 1670 RPCLOG(1, "endpnt_get: strioctl failed %d\n", error); 1671 goto bad; 1672 } 1673 1674 /* 1675 * Connectionless data flow should bypass the stream head. 1676 */ 1677 new->e_wq = tiptr->fp->f_vnode->v_stream->sd_wrq->q_next; 1678 1679 error = strioctl(tiptr->fp->f_vnode, I_PUSH, (intptr_t)"timod", 0, 1680 K_TO_K, cr, &retval); 1681 if (error) { 1682 RPCLOG(1, "endpnt_get: kstr_push on timod failed %d\n", error); 1683 goto bad; 1684 } 1685 1686 /* 1687 * Attempt to bind the endpoint. If we fail then propogate 1688 * error back to calling subsystem, so that it can be handled 1689 * appropriately. 1690 * If the caller has not specified reserved port usage then 1691 * take the system default. 1692 */ 1693 if (useresvport == -1) 1694 useresvport = clnt_clts_do_bindresvport; 1695 1696 if (useresvport && 1697 (strcmp(config->knc_protofmly, NC_INET) == 0 || 1698 strcmp(config->knc_protofmly, NC_INET6) == 0)) { 1699 1700 while ((error = 1701 bindresvport(new->e_tiptr, NULL, NULL, FALSE)) != 0) { 1702 RPCLOG(1, 1703 "endpnt_get: bindresvport error %d\n", error); 1704 if (error != EPROTO) { 1705 if (rtries-- <= 0) 1706 goto bad; 1707 1708 delay(hz << i++); 1709 continue; 1710 } 1711 1712 (void) t_kclose(new->e_tiptr, 1); 1713 /* 1714 * reopen with all privileges 1715 */ 1716 error = t_kopen(NULL, config->knc_rdev, 1717 FREAD|FWRITE|FNDELAY, 1718 &new->e_tiptr, cr); 1719 if (error) { 1720 RPCLOG(1, "endpnt_get: t_kopen: %d\n", error); 1721 new->e_tiptr = NULL; 1722 goto bad; 1723 } 1724 } 1725 } else if ((error = t_kbind(new->e_tiptr, NULL, NULL)) != 0) { 1726 RPCLOG(1, "endpnt_get: t_kbind failed: %d\n", error); 1727 goto bad; 1728 } 1729 1730 /* 1731 * Set the flags and notify and waiters that we have an established 1732 * endpoint. 1733 */ 1734 mutex_enter(&new->e_lock); 1735 new->e_flags |= ENDPNT_ESTABLISHED; 1736 new->e_flags |= ENDPNT_BOUND; 1737 if (new->e_flags & ENDPNT_WAITING) { 1738 cv_broadcast(&new->e_cv); 1739 new->e_flags &= ~ENDPNT_WAITING; 1740 } 1741 mutex_exit(&new->e_lock); 1742 1743 return (new); 1744 1745 bad: 1746 ASSERT(new != NULL); 1747 /* 1748 * mark this endpoint as stale and notify any threads waiting 1749 * on this endpoint that it will be going away. 
1750 */ 1751 mutex_enter(&new->e_lock); 1752 if (new->e_ref > 0) { 1753 new->e_flags |= ENDPNT_ESTABLISHED; 1754 new->e_flags |= ENDPNT_STALE; 1755 if (new->e_flags & ENDPNT_WAITING) { 1756 cv_broadcast(&new->e_cv); 1757 new->e_flags &= ~ENDPNT_WAITING; 1758 } 1759 } 1760 new->e_ref--; 1761 new->e_tiptr = NULL; 1762 mutex_exit(&new->e_lock); 1763 1764 /* 1765 * If there was a transport endopoint opened, then close it. 1766 */ 1767 if (tiptr != NULL) 1768 (void) t_kclose(tiptr, 1); 1769 1770 return (NULL); 1771 } 1772 1773 /* 1774 * Release a referece to the endpoint 1775 */ 1776 static void 1777 endpnt_rele(struct endpnt *sp) 1778 { 1779 mutex_enter(&sp->e_lock); 1780 ASSERT(sp->e_ref > 0); 1781 sp->e_ref--; 1782 /* 1783 * If the ref count is zero, then start the idle timer and link 1784 * the endpoint onto the idle list. 1785 */ 1786 if (sp->e_ref == 0) { 1787 sp->e_itime = gethrestime_sec(); 1788 1789 /* 1790 * Check to see if the endpoint is already linked to the idle 1791 * list, so that we don't try to reinsert it. 1792 */ 1793 if (sp->e_flags & ENDPNT_ONIDLE) { 1794 mutex_exit(&sp->e_lock); 1795 mutex_enter(&sp->e_type->e_ilock); 1796 endpnt_reap_settimer(sp->e_type); 1797 mutex_exit(&sp->e_type->e_ilock); 1798 return; 1799 } 1800 1801 sp->e_flags |= ENDPNT_ONIDLE; 1802 mutex_exit(&sp->e_lock); 1803 mutex_enter(&sp->e_type->e_ilock); 1804 list_insert_tail(&sp->e_type->e_ilist, sp); 1805 endpnt_reap_settimer(sp->e_type); 1806 mutex_exit(&sp->e_type->e_ilock); 1807 } else 1808 mutex_exit(&sp->e_lock); 1809 } 1810 1811 static void 1812 endpnt_reap_settimer(endpnt_type_t *etp) 1813 { 1814 if (etp->e_itimer == (timeout_id_t)0) 1815 etp->e_itimer = timeout(endpnt_reap_dispatch, (void *)etp, 1816 clnt_clts_taskq_dispatch_interval); 1817 } 1818 1819 static void 1820 endpnt_reap_dispatch(void *a) 1821 { 1822 endpnt_type_t *etp = a; 1823 1824 /* 1825 * The idle timer has fired, so dispatch the taskq to close the 1826 * endpoint. 1827 */ 1828 if (taskq_dispatch(endpnt_taskq, (task_func_t *)endpnt_reap, etp, 1829 TQ_NOSLEEP) == NULL) 1830 return; 1831 mutex_enter(&etp->e_ilock); 1832 etp->e_async_count++; 1833 mutex_exit(&etp->e_ilock); 1834 } 1835 1836 /* 1837 * Traverse the idle list and close those endpoints that have reached their 1838 * timeout interval. 
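 *
 * Roughly, an endpoint is torn down here when
 *
 *	e_ref == 0 && e_itime != 0 &&
 *	    gethrestime_sec() > e_itime + clnt_clts_endpoint_reap_interval
 *
 * Only the underlying transport is closed (t_kclose() and the
 * ENDPNT_BOUND flag cleared); the endpnt_t itself stays in the pool
 * until endpnt_reclaim() frees it.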
1839 */ 1840 static void 1841 endpnt_reap(endpnt_type_t *etp) 1842 { 1843 struct endpnt *e; 1844 struct endpnt *next_node = NULL; 1845 1846 mutex_enter(&etp->e_ilock); 1847 e = list_head(&etp->e_ilist); 1848 while (e != NULL) { 1849 next_node = list_next(&etp->e_ilist, e); 1850 1851 mutex_enter(&e->e_lock); 1852 if (e->e_ref > 0) { 1853 mutex_exit(&e->e_lock); 1854 e = next_node; 1855 continue; 1856 } 1857 1858 ASSERT(e->e_ref == 0); 1859 if (e->e_itime > 0 && 1860 (e->e_itime + clnt_clts_endpoint_reap_interval) < 1861 gethrestime_sec()) { 1862 e->e_flags &= ~ENDPNT_BOUND; 1863 (void) t_kclose(e->e_tiptr, 1); 1864 e->e_tiptr = NULL; 1865 e->e_itime = 0; 1866 } 1867 mutex_exit(&e->e_lock); 1868 e = next_node; 1869 } 1870 etp->e_itimer = 0; 1871 if (--etp->e_async_count == 0) 1872 cv_signal(&etp->e_async_cv); 1873 mutex_exit(&etp->e_ilock); 1874 } 1875 1876 static void 1877 endpnt_reclaim(zoneid_t zoneid) 1878 { 1879 struct endpnt_type *np; 1880 struct endpnt *e; 1881 struct endpnt *next_node = NULL; 1882 list_t free_list; 1883 int rcnt = 0; 1884 1885 list_create(&free_list, sizeof (endpnt_t), offsetof(endpnt_t, e_node)); 1886 1887 RPCLOG0(1, "endpnt_reclaim: reclaim callback started\n"); 1888 rw_enter(&endpnt_type_lock, RW_READER); 1889 for (np = endpnt_type_list; np != NULL; np = np->e_next) { 1890 if (zoneid != ALL_ZONES && zoneid != np->e_zoneid) 1891 continue; 1892 1893 mutex_enter(&np->e_plock); 1894 RPCLOG(1, "endpnt_reclaim: protofmly %s, ", 1895 np->e_protofmly); 1896 RPCLOG(1, "rdev %ld\n", np->e_rdev); 1897 RPCLOG(1, "endpnt_reclaim: found %d endpoint(s)\n", 1898 np->e_cnt); 1899 1900 if (np->e_cnt == 0) { 1901 mutex_exit(&np->e_plock); 1902 continue; 1903 } 1904 1905 /* 1906 * The nice thing about maintaining an idle list is that if 1907 * there are any endpoints to reclaim, they are going to be 1908 * on this list. Just go through and reap the one's that 1909 * have ref counts of zero. 1910 */ 1911 mutex_enter(&np->e_ilock); 1912 e = list_head(&np->e_ilist); 1913 while (e != NULL) { 1914 next_node = list_next(&np->e_ilist, e); 1915 mutex_enter(&e->e_lock); 1916 if (e->e_ref > 0) { 1917 mutex_exit(&e->e_lock); 1918 e = next_node; 1919 continue; 1920 } 1921 ASSERT(e->e_ref == 0); 1922 mutex_exit(&e->e_lock); 1923 1924 list_remove(&np->e_ilist, e); 1925 list_remove(&np->e_pool, e); 1926 list_insert_head(&free_list, e); 1927 1928 rcnt++; 1929 np->e_cnt--; 1930 e = next_node; 1931 } 1932 mutex_exit(&np->e_ilock); 1933 /* 1934 * Reset the current pointer to be safe 1935 */ 1936 if ((e = (struct endpnt *)list_head(&np->e_pool)) != NULL) 1937 np->e_pcurr = e; 1938 else { 1939 ASSERT(np->e_cnt == 0); 1940 np->e_pcurr = NULL; 1941 } 1942 1943 mutex_exit(&np->e_plock); 1944 } 1945 rw_exit(&endpnt_type_lock); 1946 1947 while ((e = list_head(&free_list)) != NULL) { 1948 list_remove(&free_list, e); 1949 if (e->e_tiptr != NULL) 1950 (void) t_kclose(e->e_tiptr, 1); 1951 1952 cv_destroy(&e->e_cv); 1953 mutex_destroy(&e->e_lock); 1954 kmem_cache_free(endpnt_cache, e); 1955 } 1956 list_destroy(&free_list); 1957 RPCLOG(1, "endpnt_reclaim: reclaimed %d endpoint(s)\n", rcnt); 1958 } 1959 1960 /* 1961 * Endpoint reclaim zones destructor callback routine. 1962 * 1963 * After reclaiming any cached entries, we basically go through the endpnt_type 1964 * list, canceling outstanding timeouts and free'ing data structures. 

/*
 * Endpoint reclaim zone destructor callback routine.
 *
 * After reclaiming any cached entries, we basically go through the
 * endpnt_type list, canceling outstanding timeouts and freeing data
 * structures.
 */
/* ARGSUSED */
static void
endpnt_destructor(zoneid_t zoneid, void *a)
{
        struct endpnt_type **npp;
        struct endpnt_type *np;
        struct endpnt_type *free_list = NULL;
        timeout_id_t t_id = 0;
        extern void clcleanup_zone(zoneid_t);
        extern void clcleanup4_zone(zoneid_t);

        /* Make sure NFS client handles are released. */
        clcleanup_zone(zoneid);
        clcleanup4_zone(zoneid);

        endpnt_reclaim(zoneid);
        /*
         * We don't need to be holding on to any locks across the call to
         * endpnt_reclaim() and the code below; we know that no-one can
         * be holding open connections for this zone (all processes and kernel
         * threads are gone), so nothing could be adding anything to the list.
         */
        rw_enter(&endpnt_type_lock, RW_WRITER);
        npp = &endpnt_type_list;
        while ((np = *npp) != NULL) {
                if (np->e_zoneid != zoneid) {
                        npp = &np->e_next;
                        continue;
                }
                mutex_enter(&np->e_plock);
                mutex_enter(&np->e_ilock);
                if (np->e_itimer != 0) {
                        t_id = np->e_itimer;
                        np->e_itimer = 0;
                }
                ASSERT(np->e_cnt == 0);
                ASSERT(list_head(&np->e_pool) == NULL);
                ASSERT(list_head(&np->e_ilist) == NULL);

                mutex_exit(&np->e_ilock);
                mutex_exit(&np->e_plock);

                /*
                 * untimeout() any outstanding timers that have not yet fired.
                 */
                if (t_id != (timeout_id_t)0)
                        (void) untimeout(t_id);
                *npp = np->e_next;
                np->e_next = free_list;
                free_list = np;
        }
        rw_exit(&endpnt_type_lock);

        while (free_list != NULL) {
                np = free_list;
                free_list = free_list->e_next;
                /*
                 * Wait for threads in endpnt_taskq trying to reap endpnt_ts
                 * in the endpnt_type_t.
                 */
                mutex_enter(&np->e_ilock);
                while (np->e_async_count > 0)
                        cv_wait(&np->e_async_cv, &np->e_ilock);
                cv_destroy(&np->e_async_cv);
                mutex_destroy(&np->e_plock);
                mutex_destroy(&np->e_ilock);
                list_destroy(&np->e_pool);
                list_destroy(&np->e_ilist);
                kmem_free(np, sizeof (endpnt_type_t));
        }
}

/*
 * Endpoint reclaim kmem callback routine.
 */
/* ARGSUSED */
static void
endpnt_repossess(void *a)
{
        /*
         * Reclaim idle endpnt's from all zones.
         */
        if (endpnt_taskq != NULL)
                (void) taskq_dispatch(endpnt_taskq,
                    (task_func_t *)endpnt_reclaim, (void *)ALL_ZONES,
                    TQ_NOSLEEP);
}

/*
 * RPC request dispatch routine.  Constructs a datagram message and wraps it
 * around the RPC request to pass downstream.
 */
static int
clnt_clts_dispatch_send(queue_t *q, mblk_t *mp, struct netbuf *addr,
        calllist_t *cp, uint_t xid, cred_t *cr)
{
        mblk_t *bp;
        int msgsz;
        struct T_unitdata_req *udreq;

        /*
         * Set up the call record.
         */
        cp->call_wq = q;
        cp->call_xid = xid;
        cp->call_status = RPC_TIMEDOUT;
        cp->call_notified = FALSE;
        RPCLOG(64,
            "clnt_clts_dispatch_send: putting xid 0x%x on "
            "dispatch list\n", xid);
        cp->call_hash = call_hash(xid, clnt_clts_hash_size);
        cp->call_bucket = &clts_call_ht[cp->call_hash];
        call_table_enter(cp);

        /*
         * Construct the datagram.
         */
        msgsz = (int)TUNITDATAREQSZ;
        /*
         * Note: if the receiver uses SCM_UCRED/getpeerucred the pid will
         * appear as -1.
         */
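        /*
         * Allocate an mblk big enough for the T_UNITDATA_REQ header plus
         * the destination address.  If the allocation fails, block in
         * strwaitbuf() until buffers may be available and retry; if the
         * wait itself fails, give up and return ENOSR.
         */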
        while (!(bp = allocb_cred(msgsz + addr->len, cr, NOPID))) {
                if (strwaitbuf(msgsz + addr->len, BPRI_LO))
                        return (ENOSR);
        }

        udreq = (struct T_unitdata_req *)bp->b_wptr;
        udreq->PRIM_type = T_UNITDATA_REQ;
        udreq->DEST_length = addr->len;

        if (addr->len) {
                bcopy(addr->buf, bp->b_wptr + msgsz, addr->len);
                udreq->DEST_offset = (t_scalar_t)msgsz;
                msgsz += addr->len;
        } else
                udreq->DEST_offset = 0;
        udreq->OPT_length = 0;
        udreq->OPT_offset = 0;

        bp->b_datap->db_type = M_PROTO;
        bp->b_wptr += msgsz;

        /*
         * Link the datagram header with the actual data.
         */
        linkb(bp, mp);

        /*
         * Send downstream.
         */
        if (canput(cp->call_wq)) {
                put(cp->call_wq, bp);
                return (0);
        }

        return (EIO);
}

/*
 * RPC response delivery routine.  Deliver the response to the waiting
 * thread by matching the xid.
 */
void
clnt_clts_dispatch_notify(mblk_t *mp, int resp_off, zoneid_t zoneid)
{
        calllist_t *e = NULL;
        call_table_t *chtp;
        uint32_t xid;
        uint_t hash;
        unsigned char *hdr_offset;
        mblk_t *resp;

        /*
         * If the RPC response is not contained in the same mblk as the
         * datagram header, then move to the next mblk.
         */
        hdr_offset = mp->b_rptr;
        resp = mp;
        if ((mp->b_wptr - (mp->b_rptr + resp_off)) == 0)
                resp = mp->b_cont;
        else
                resp->b_rptr += resp_off;

        ASSERT(resp != NULL);

        if ((IS_P2ALIGNED(resp->b_rptr, sizeof (uint32_t))) &&
            (resp->b_wptr - resp->b_rptr) >= sizeof (xid))
                xid = *((uint32_t *)resp->b_rptr);
        else {
                int i = 0;
                unsigned char *p = (unsigned char *)&xid;
                unsigned char *rptr;
                mblk_t *tmp = resp;

                /*
                 * Copy the xid, byte-by-byte, into xid.
                 */
                while (tmp) {
                        rptr = tmp->b_rptr;
                        while (rptr < tmp->b_wptr) {
                                *p++ = *rptr++;
                                if (++i >= sizeof (xid))
                                        goto done_xid_copy;
                        }
                        tmp = tmp->b_cont;
                }

                /*
                 * If we got here, we ran out of mblk space before the
                 * xid could be copied.
                 */
                ASSERT(tmp == NULL && i < sizeof (xid));

                RPCLOG0(1,
                    "clnt_dispatch_notify(clts): message less than "
                    "size of xid\n");

                freemsg(mp);
                return;
        }

done_xid_copy:

        /*
         * Reset the read pointer back to the beginning of the protocol
         * header if we moved it.
         */
        if (mp->b_rptr != hdr_offset)
                mp->b_rptr = hdr_offset;

        hash = call_hash(xid, clnt_clts_hash_size);
        chtp = &clts_call_ht[hash];
        /* call_table_find returns with the hash bucket locked */
        call_table_find(chtp, xid, e);

        if (e != NULL) {
                mutex_enter(&e->call_lock);

                /*
                 * Verify that the reply is coming in on the same zone
                 * that it was sent from.
                 */
                if (e->call_zoneid != zoneid) {
                        mutex_exit(&e->call_lock);
                        mutex_exit(&chtp->ct_lock);
                        RPCLOG0(8, "clnt_dispatch_notify (clts): incorrect "
                            "zoneid\n");
                        freemsg(mp);
                        return;
                }

                /*
                 * Found a thread waiting for this reply.
                 */
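                /*
                 * Hand the reply to the caller while both e->call_lock and
                 * the bucket's ct_lock are still held.  Any stale reply
                 * already buffered for this xid is freed first, so the
                 * waiter woken by cv_signal() only sees the most recent
                 * response.
                 */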
                if (e->call_reply) {
                        RPCLOG(8,
                            "clnt_dispatch_notify (clts): discarding old "
                            "reply for xid 0x%x\n",
                            xid);
                        freemsg(e->call_reply);
                }
                e->call_notified = TRUE;
                e->call_reply = mp;
                e->call_status = RPC_SUCCESS;
                cv_signal(&e->call_cv);
                mutex_exit(&e->call_lock);
                mutex_exit(&chtp->ct_lock);
        } else {
                zone_t *zone;
                struct rpcstat *rpcstat;

                mutex_exit(&chtp->ct_lock);
                RPCLOG(8, "clnt_dispatch_notify (clts): no caller for reply "
                    "0x%x\n", xid);
                freemsg(mp);
                /*
                 * This is unfortunate, but we need to look up the zone so we
                 * can increment its "rcbadxids" counter.
                 */
                zone = zone_find_by_id(zoneid);
                if (zone == NULL) {
                        /*
                         * The zone went away...
                         */
                        return;
                }
                rpcstat = zone_getspecific(rpcstat_zone_key, zone);
                if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) {
                        /*
                         * Not interested.
                         */
                        zone_rele(zone);
                        return;
                }
                RCSTAT_INCR(rpcstat->rpc_clts_client, rcbadxids);
                zone_rele(zone);
        }
}

/*
 * Init routine.  Called when rpcmod is loaded.
 */
void
clnt_clts_init(void)
{
        endpnt_cache = kmem_cache_create("clnt_clts_endpnt_cache",
            sizeof (struct endpnt), 0, NULL, NULL, endpnt_repossess, NULL,
            NULL, 0);

        rw_init(&endpnt_type_lock, NULL, RW_DEFAULT, NULL);

        /*
         * Perform simple bounds checking to make sure that the setting is
         * reasonable.
         */
        if (clnt_clts_max_endpoints <= 0) {
                if (clnt_clts_do_bindresvport)
                        clnt_clts_max_endpoints = RESERVED_PORTSPACE;
                else
                        clnt_clts_max_endpoints = NONRESERVED_PORTSPACE;
        }

        if (clnt_clts_do_bindresvport &&
            clnt_clts_max_endpoints > RESERVED_PORTSPACE)
                clnt_clts_max_endpoints = RESERVED_PORTSPACE;
        else if (clnt_clts_max_endpoints > NONRESERVED_PORTSPACE)
                clnt_clts_max_endpoints = NONRESERVED_PORTSPACE;

        if (clnt_clts_hash_size < DEFAULT_MIN_HASH_SIZE)
                clnt_clts_hash_size = DEFAULT_MIN_HASH_SIZE;

        /*
         * Defer creating the taskq until rpcmod gets pushed.  If we are
         * in diskless boot mode, rpcmod will get loaded early even before
         * thread_create() is available.
         */
        endpnt_taskq = NULL;
        taskq_created = FALSE;
        mutex_init(&endpnt_taskq_lock, NULL, MUTEX_DEFAULT, NULL);

        if (clnt_clts_endpoint_reap_interval < DEFAULT_ENDPOINT_REAP_INTERVAL)
                clnt_clts_endpoint_reap_interval =
                    DEFAULT_ENDPOINT_REAP_INTERVAL;

        /*
         * Dispatch the taskq at an interval which is offset from the
         * interval at which the endpoints should be reaped.
         */
        clnt_clts_taskq_dispatch_interval =
            (clnt_clts_endpoint_reap_interval + DEFAULT_INTERVAL_SHIFT) * hz;

        /*
         * Initialize the completion queue.
         */
        clts_call_ht = call_table_init(clnt_clts_hash_size);
        /*
         * Initialize the zone destructor callback.
         */
        zone_key_create(&endpnt_destructor_key, NULL, NULL, endpnt_destructor);
}

void
clnt_clts_fini(void)
{
        (void) zone_key_delete(endpnt_destructor_key);
}