/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
 */

/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T
 * All Rights Reserved
 */

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

/*
 * Implements a kernel based, client side RPC.
 */

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/stream.h>
#include <sys/strsubr.h>
#include <sys/ddi.h>
#include <sys/tiuser.h>
#include <sys/tihdr.h>
#include <sys/t_kuser.h>
#include <sys/errno.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/kstat.h>
#include <sys/t_lock.h>
#include <sys/cmn_err.h>
#include <sys/conf.h>
#include <sys/disp.h>
#include <sys/taskq.h>
#include <sys/list.h>
#include <sys/atomic.h>
#include <sys/zone.h>
#include <netinet/in.h>
#include <rpc/types.h>
#include <rpc/xdr.h>
#include <rpc/auth.h>
#include <rpc/clnt.h>
#include <rpc/rpc_msg.h>

#include <sys/sdt.h>

static enum clnt_stat clnt_clts_kcallit(CLIENT *, rpcproc_t, xdrproc_t,
    caddr_t, xdrproc_t, caddr_t, struct timeval);
static void clnt_clts_kabort(CLIENT *);
static void clnt_clts_kerror(CLIENT *, struct rpc_err *);
static bool_t clnt_clts_kfreeres(CLIENT *, xdrproc_t, caddr_t);
static bool_t clnt_clts_kcontrol(CLIENT *, int, char *);
static void clnt_clts_kdestroy(CLIENT *);
static int clnt_clts_ksettimers(CLIENT *, struct rpc_timers *,
    struct rpc_timers *, int, void (*)(), caddr_t, uint32_t);

/*
 * Operations vector for CLTS based RPC
 */
static struct clnt_ops clts_ops = {
	clnt_clts_kcallit,	/* do rpc call */
	clnt_clts_kabort,	/* abort call */
	clnt_clts_kerror,	/* return error status */
	clnt_clts_kfreeres,	/* free results */
	clnt_clts_kdestroy,	/* destroy rpc handle */
	clnt_clts_kcontrol,	/* the ioctl() of rpc */
	clnt_clts_ksettimers	/* set retry timers */
};
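/*
 * Illustrative sketch (not part of the original source): consumers do not
 * invoke the clts_ops routines directly.  They obtain a handle (e.g. from
 * clnt_clts_kcreate() below) and go through the generic CLNT_* macros in
 * <rpc/clnt.h>, which indirect through this ops vector:
 *
 *	enum clnt_stat stat;
 *
 *	stat = CLNT_CALL(cl, proc, xdr_args, (caddr_t)&args,
 *	    xdr_res, (caddr_t)&res, wait);
 *	...
 *	CLNT_DESTROY(cl);
 */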
/*
 * Endpoint for CLTS (INET, INET6, loopback, etc.)
 */
typedef struct endpnt_type {
	struct endpnt_type *e_next;	/* pointer to next endpoint type */
	list_t		e_pool;		/* list of available endpoints */
	list_t		e_ilist;	/* list of idle endpoints */
	struct endpnt	*e_pcurr;	/* pointer to current endpoint */
	char		e_protofmly[KNC_STRSIZE];	/* protocol family */
	dev_t		e_rdev;		/* device */
	kmutex_t	e_plock;	/* pool lock */
	kmutex_t	e_ilock;	/* idle list lock */
	timeout_id_t	e_itimer;	/* timer to dispatch the taskq */
	uint_t		e_cnt;		/* number of endpoints in the pool */
	zoneid_t	e_zoneid;	/* zoneid of endpoint type */
	kcondvar_t	e_async_cv;	/* cv for asynchronous reap threads */
	uint_t		e_async_count;	/* count of asynchronous reap threads */
} endpnt_type_t;

typedef struct endpnt {
	list_node_t	e_node;		/* link to the pool */
	list_node_t	e_idle;		/* link to the idle list */
	endpnt_type_t	*e_type;	/* back pointer to endpoint type */
	TIUSER		*e_tiptr;	/* pointer to transport endpoint */
	queue_t		*e_wq;		/* write queue */
	uint_t		e_flags;	/* endpoint flags */
	uint_t		e_ref;		/* ref count on endpoint */
	kcondvar_t	e_cv;		/* condition variable */
	kmutex_t	e_lock;		/* protects cv and flags */
	time_t		e_itime;	/* time when rele'd */
} endpnt_t;

#define	ENDPNT_ESTABLISHED	0x1	/* endpoint is established */
#define	ENDPNT_WAITING		0x2	/* thread waiting for endpoint */
#define	ENDPNT_BOUND		0x4	/* endpoint is bound */
#define	ENDPNT_STALE		0x8	/* endpoint is dead */
#define	ENDPNT_ONIDLE		0x10	/* endpoint is on the idle list */

static krwlock_t	endpnt_type_lock;	/* protects endpnt_type_list */
static endpnt_type_t	*endpnt_type_list = NULL; /* list of CLTS endpoints */
static struct kmem_cache	*endpnt_cache;	/* cache of endpnt_t's */
static taskq_t		*endpnt_taskq;		/* endpnt_t reaper thread */
static bool_t		taskq_created;		/* flag for endpnt_taskq */
static kmutex_t		endpnt_taskq_lock;	/* taskq lock */
static zone_key_t	endpnt_destructor_key;

#define	DEFAULT_ENDPOINT_REAP_INTERVAL 60	/* 1 minute */
#define	DEFAULT_INTERVAL_SHIFT 30		/* 30 seconds */

/*
 * Endpoint tunables
 */
static int	clnt_clts_max_endpoints = -1;
static int	clnt_clts_hash_size = DEFAULT_HASH_SIZE;
static time_t	clnt_clts_endpoint_reap_interval = -1;
static clock_t	clnt_clts_taskq_dispatch_interval;

/*
 * Response completion hash queue
 */
static call_table_t *clts_call_ht;

/*
 * Routines for the endpoint manager
 */
static struct endpnt_type *endpnt_type_create(struct knetconfig *);
static void endpnt_type_free(struct endpnt_type *);
static int check_endpnt(struct endpnt *, struct endpnt **);
static struct endpnt *endpnt_get(struct knetconfig *, int);
static void endpnt_rele(struct endpnt *);
static void endpnt_reap_settimer(endpnt_type_t *);
static void endpnt_reap(endpnt_type_t *);
static void endpnt_reap_dispatch(void *);
static void endpnt_reclaim(zoneid_t);

/*
 * Request dispatching function.
 */
static int clnt_clts_dispatch_send(queue_t *q, mblk_t *, struct netbuf *addr,
    calllist_t *, uint_t, cred_t *);

/*
 * The size of the preserialized RPC header information.
 */
#define	CKU_HDRSIZE	20
/*
 * The initial allocation size. It is small to reduce space requirements.
 */
#define	CKU_INITSIZE	2048
/*
 * The size of additional allocations, if required. It is larger to
 * reduce the number of actual allocations.
 */
#define	CKU_ALLOCSIZE	8192
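/*
 * Added note (worked arithmetic, not in the original source): the header
 * pre-serialized by xdr_callhdr() consists of five 4-byte XDR words --
 * xid, direction, rpcvers, prog and vers -- hence CKU_HDRSIZE is
 * 5 * 4 == 20 bytes.  cku_rpchdr[] below is CKU_HDRSIZE + 4 bytes long so
 * that the RPCSEC_GSS path can append the 4-byte procedure number to the
 * header before handing it to AUTH_WRAP().
 */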
/*
 * Private data per rpc handle. This structure is allocated by
 * clnt_clts_kcreate, and freed by clnt_clts_kdestroy.
 */
struct cku_private {
	CLIENT			 cku_client;	/* client handle */
	int			 cku_retrys;	/* request retries */
	calllist_t		 cku_call;
	struct endpnt		*cku_endpnt;	/* open end point */
	struct knetconfig	 cku_config;
	struct netbuf		 cku_addr;	/* remote address */
	struct rpc_err		 cku_err;	/* error status */
	XDR			 cku_outxdr;	/* xdr stream for output */
	XDR			 cku_inxdr;	/* xdr stream for input */
	char			 cku_rpchdr[CKU_HDRSIZE + 4]; /* rpc header */
	struct cred		*cku_cred;	/* credentials */
	struct rpc_timers	*cku_timers;	/* for estimating RTT */
	struct rpc_timers	*cku_timeall;	/* for estimating RTT */
	void			 (*cku_feedback)(int, int, caddr_t);
						/* ptr to feedback rtn */
	caddr_t			 cku_feedarg;	/* argument for feedback func */
	uint32_t		 cku_xid;	/* current XID */
	bool_t			 cku_bcast;	/* RPC broadcast hint */
	int			 cku_useresvport; /* Use reserved port */
	struct rpc_clts_client	*cku_stats;	/* counters for the zone */
};

static const struct rpc_clts_client {
	kstat_named_t	rccalls;
	kstat_named_t	rcbadcalls;
	kstat_named_t	rcretrans;
	kstat_named_t	rcbadxids;
	kstat_named_t	rctimeouts;
	kstat_named_t	rcnewcreds;
	kstat_named_t	rcbadverfs;
	kstat_named_t	rctimers;
	kstat_named_t	rcnomem;
	kstat_named_t	rccantsend;
} clts_rcstat_tmpl = {
	{ "calls",	KSTAT_DATA_UINT64 },
	{ "badcalls",	KSTAT_DATA_UINT64 },
	{ "retrans",	KSTAT_DATA_UINT64 },
	{ "badxids",	KSTAT_DATA_UINT64 },
	{ "timeouts",	KSTAT_DATA_UINT64 },
	{ "newcreds",	KSTAT_DATA_UINT64 },
	{ "badverfs",	KSTAT_DATA_UINT64 },
	{ "timers",	KSTAT_DATA_UINT64 },
	{ "nomem",	KSTAT_DATA_UINT64 },
	{ "cantsend",	KSTAT_DATA_UINT64 },
};

static uint_t clts_rcstat_ndata =
	sizeof (clts_rcstat_tmpl) / sizeof (kstat_named_t);

#define	RCSTAT_INCR(s, x)	\
	atomic_inc_64(&(s)->x.value.ui64)

#define	ptoh(p)		(&((p)->cku_client))
#define	htop(h)		((struct cku_private *)((h)->cl_private))

/*
 * Times to retry
 */
#define	SNDTRIES	4
#define	REFRESHES	2	/* authentication refreshes */
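/*
 * Added note (assumes standard kstat tooling, not in the original source):
 * the counters above surface in userland through kstat(1M), e.g.
 * "kstat -m unix -n rpc_clts_client", as well as through the backwards
 * compatible "unix:0:rpc_client" kstat created by clnt_clts_stats_init()
 * below.
 */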
/*
 * The following is used to determine the global default behavior for
 * CLTS when binding to a local port.
 *
 * If the value is set to 1 the default will be to select a reserved
 * (aka privileged) port, if the value is zero the default will be to
 * use non-reserved ports. Users of kRPC may override this by using
 * CLNT_CONTROL() and CLSET_BINDRESVPORT.
 */
static int clnt_clts_do_bindresvport = 1;

#define	BINDRESVPORT_RETRIES 5

void
clnt_clts_stats_init(zoneid_t zoneid, struct rpc_clts_client **statsp)
{
	kstat_t *ksp;
	kstat_named_t *knp;

	knp = rpcstat_zone_init_common(zoneid, "unix", "rpc_clts_client",
	    (const kstat_named_t *)&clts_rcstat_tmpl,
	    sizeof (clts_rcstat_tmpl));
	/*
	 * Backwards compatibility for old kstat clients
	 */
	ksp = kstat_create_zone("unix", 0, "rpc_client", "rpc",
	    KSTAT_TYPE_NAMED, clts_rcstat_ndata,
	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid);
	if (ksp) {
		ksp->ks_data = knp;
		kstat_install(ksp);
	}
	*statsp = (struct rpc_clts_client *)knp;
}

void
clnt_clts_stats_fini(zoneid_t zoneid, struct rpc_clts_client **statsp)
{
	rpcstat_zone_fini_common(zoneid, "unix", "rpc_clts_client");
	kstat_delete_byname_zone("unix", 0, "rpc_client", zoneid);
	kmem_free(*statsp, sizeof (clts_rcstat_tmpl));
}

/*
 * Create an rpc handle for a clts rpc connection.
 * Allocates space for the handle structure and the private data.
 */
/* ARGSUSED */
int
clnt_clts_kcreate(struct knetconfig *config, struct netbuf *addr,
    rpcprog_t pgm, rpcvers_t vers, int retrys, struct cred *cred,
    CLIENT **cl)
{
	CLIENT *h;
	struct cku_private *p;
	struct rpc_msg call_msg;
	int error;
	int plen;

	if (cl == NULL)
		return (EINVAL);

	*cl = NULL;
	error = 0;

	p = kmem_zalloc(sizeof (*p), KM_SLEEP);

	h = ptoh(p);

	/* handle */
	h->cl_ops = &clts_ops;
	h->cl_private = (caddr_t)p;
	h->cl_auth = authkern_create();

	/* call message, just used to pre-serialize below */
	call_msg.rm_xid = 0;
	call_msg.rm_direction = CALL;
	call_msg.rm_call.cb_rpcvers = RPC_MSG_VERSION;
	call_msg.rm_call.cb_prog = pgm;
	call_msg.rm_call.cb_vers = vers;

	/* private */
	clnt_clts_kinit(h, addr, retrys, cred);

	xdrmem_create(&p->cku_outxdr, p->cku_rpchdr, CKU_HDRSIZE, XDR_ENCODE);

	/* pre-serialize call message header */
	if (!xdr_callhdr(&p->cku_outxdr, &call_msg)) {
		XDR_DESTROY(&p->cku_outxdr);
		error = EINVAL;		/* XXX */
		goto bad;
	}
	XDR_DESTROY(&p->cku_outxdr);

	p->cku_config.knc_rdev = config->knc_rdev;
	p->cku_config.knc_semantics = config->knc_semantics;
	plen = strlen(config->knc_protofmly) + 1;
	p->cku_config.knc_protofmly = kmem_alloc(plen, KM_SLEEP);
	bcopy(config->knc_protofmly, p->cku_config.knc_protofmly, plen);
	p->cku_useresvport = -1; /* value has not been set */

	cv_init(&p->cku_call.call_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&p->cku_call.call_lock, NULL, MUTEX_DEFAULT, NULL);

	*cl = h;
	return (0);

bad:
	auth_destroy(h->cl_auth);
	kmem_free(p->cku_addr.buf, addr->maxlen);
	kmem_free(p, sizeof (struct cku_private));

	return (error);
}
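/*
 * Illustrative sketch (hypothetical consumer, not part of the original
 * source): a kernel subsystem would typically create a handle and may
 * force reserved-port binding before issuing calls; the prog/vers values
 * here are placeholders:
 *
 *	CLIENT *cl;
 *	int resv = 1;
 *
 *	if (clnt_clts_kcreate(&knconf, &addr, prog, vers, 4, kcred,
 *	    &cl) == 0) {
 *		(void) CLNT_CONTROL(cl, CLSET_BINDRESVPORT, (char *)&resv);
 *		... issue CLNT_CALL()s, then CLNT_DESTROY(cl) ...
 *	}
 */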
void
clnt_clts_kinit(CLIENT *h, struct netbuf *addr, int retrys, cred_t *cred)
{
	/* LINTED pointer alignment */
	struct cku_private *p = htop(h);
	struct rpcstat *rsp;

	rsp = zone_getspecific(rpcstat_zone_key, rpc_zone());
	ASSERT(rsp != NULL);

	p->cku_retrys = retrys;

	if (p->cku_addr.maxlen < addr->len) {
		if (p->cku_addr.maxlen != 0 && p->cku_addr.buf != NULL)
			kmem_free(p->cku_addr.buf, p->cku_addr.maxlen);

		p->cku_addr.buf = kmem_zalloc(addr->maxlen, KM_SLEEP);
		p->cku_addr.maxlen = addr->maxlen;
	}

	p->cku_addr.len = addr->len;
	bcopy(addr->buf, p->cku_addr.buf, addr->len);

	p->cku_cred = cred;
	p->cku_xid = 0;
	p->cku_timers = NULL;
	p->cku_timeall = NULL;
	p->cku_feedback = NULL;
	p->cku_bcast = FALSE;
	p->cku_call.call_xid = 0;
	p->cku_call.call_hash = 0;
	p->cku_call.call_notified = FALSE;
	p->cku_call.call_next = NULL;
	p->cku_call.call_prev = NULL;
	p->cku_call.call_reply = NULL;
	p->cku_call.call_wq = NULL;
	p->cku_stats = rsp->rpc_clts_client;
}

/*
 * Set the timers. Returns the current retransmission timeout.
 */
static int
clnt_clts_ksettimers(CLIENT *h, struct rpc_timers *t, struct rpc_timers *all,
    int minimum, void (*feedback)(int, int, caddr_t), caddr_t arg,
    uint32_t xid)
{
	/* LINTED pointer alignment */
	struct cku_private *p = htop(h);
	int value;

	p->cku_feedback = feedback;
	p->cku_feedarg = arg;
	p->cku_timers = t;
	p->cku_timeall = all;
	if (xid)
		p->cku_xid = xid;
	value = all->rt_rtxcur;
	value += t->rt_rtxcur;
	if (value < minimum)
		return (minimum);
	RCSTAT_INCR(p->cku_stats, rctimers);
	return (value);
}

/*
 * Timeout backoff function. tim is in HZ
 */
#define	MAXTIMO	(20 * hz)
#define	backoff(tim)	(((tim) < MAXTIMO) ? dobackoff(tim) : (tim))
#define	dobackoff(tim)	((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1))

#define	RETRY_POLL_TIMO	30
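/*
 * Added note (worked example, not in the original source): with hz == 100,
 * a 1 second initial timeout that keeps failing is retransmitted after
 * 1, 2, 4, 8 and 16 seconds, after which dobackoff() clamps the interval
 * at MAXTIMO (20 * hz), so every later retry waits 20 seconds.
 */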
/*
 * Call remote procedure.
 * Most of the work of rpc is done here. We serialize what is left
 * of the header (some was pre-serialized in the handle), serialize
 * the arguments, and send it off. We wait for a reply or a time out.
 * Timeout causes an immediate return; other packet problems may cause
 * a retry on the receive. When a good packet is received we deserialize
 * it, and check verification. A bad reply code will cause one retry
 * with full (longhand) credentials.
 */
enum clnt_stat
clnt_clts_kcallit_addr(CLIENT *h, rpcproc_t procnum, xdrproc_t xdr_args,
    caddr_t argsp, xdrproc_t xdr_results, caddr_t resultsp,
    struct timeval wait, struct netbuf *sin)
{
	/* LINTED pointer alignment */
	struct cku_private *p = htop(h);
	XDR *xdrs;
	int stries = p->cku_retrys;
	int refreshes = REFRESHES;	/* number of times to refresh cred */
	int round_trip;			/* time the RPC */
	int error;
	mblk_t *mp;
	mblk_t *mpdup;
	mblk_t *resp = NULL;
	mblk_t *tmp;
	calllist_t *call = &p->cku_call;
	clock_t ori_timout, timout;
	bool_t interrupted;
	enum clnt_stat status;
	struct rpc_msg reply_msg;
	enum clnt_stat re_status;
	endpnt_t *endpt;

	RCSTAT_INCR(p->cku_stats, rccalls);

	RPCLOG(2, "clnt_clts_kcallit_addr: wait.tv_sec: %ld\n", wait.tv_sec);
	RPCLOG(2, "clnt_clts_kcallit_addr: wait.tv_usec: %ld\n", wait.tv_usec);

	timout = TIMEVAL_TO_TICK(&wait);
	ori_timout = timout;

	if (p->cku_xid == 0) {
		p->cku_xid = alloc_xid();
		if (p->cku_endpnt != NULL)
			endpnt_rele(p->cku_endpnt);
		p->cku_endpnt = NULL;
	}
	call->call_zoneid = rpc_zoneid();

	mpdup = NULL;
call_again:

	if (mpdup == NULL) {

		while ((mp = allocb(CKU_INITSIZE, BPRI_LO)) == NULL) {
			if (strwaitbuf(CKU_INITSIZE, BPRI_LO)) {
				p->cku_err.re_status = RPC_SYSTEMERROR;
				p->cku_err.re_errno = ENOSR;
				goto done;
			}
		}

		xdrs = &p->cku_outxdr;
		xdrmblk_init(xdrs, mp, XDR_ENCODE, CKU_ALLOCSIZE);

		if (h->cl_auth->ah_cred.oa_flavor != RPCSEC_GSS) {
			/*
			 * Copy in the preserialized RPC header
			 * information.
			 */
			bcopy(p->cku_rpchdr, mp->b_rptr, CKU_HDRSIZE);

			/*
			 * transaction id is the 1st thing in the output
			 * buffer.
			 */
			/* LINTED pointer alignment */
			(*(uint32_t *)(mp->b_rptr)) = p->cku_xid;

			/* Skip the preserialized stuff. */
			XDR_SETPOS(xdrs, CKU_HDRSIZE);

			/* Serialize dynamic stuff into the output buffer. */
			if ((!XDR_PUTINT32(xdrs, (int32_t *)&procnum)) ||
			    (!AUTH_MARSHALL(h->cl_auth, xdrs, p->cku_cred)) ||
			    (!(*xdr_args)(xdrs, argsp))) {
				XDR_DESTROY(xdrs);
				freemsg(mp);
				p->cku_err.re_status = RPC_CANTENCODEARGS;
				p->cku_err.re_errno = EIO;
				goto done;
			}
		} else {
			uint32_t *uproc = (uint32_t *)
			    &p->cku_rpchdr[CKU_HDRSIZE];
			IXDR_PUT_U_INT32(uproc, procnum);

			(*(uint32_t *)(&p->cku_rpchdr[0])) = p->cku_xid;
			XDR_SETPOS(xdrs, 0);

			/* Serialize the procedure number and the arguments. */
			if (!AUTH_WRAP(h->cl_auth, (caddr_t)p->cku_rpchdr,
			    CKU_HDRSIZE+4, xdrs, xdr_args, argsp)) {
				XDR_DESTROY(xdrs);
				freemsg(mp);
				p->cku_err.re_status = RPC_CANTENCODEARGS;
				p->cku_err.re_errno = EIO;
				goto done;
			}
		}

		XDR_DESTROY(xdrs);
	} else
		mp = mpdup;

	mpdup = dupmsg(mp);
	if (mpdup == NULL) {
		freemsg(mp);
		p->cku_err.re_status = RPC_SYSTEMERROR;
		p->cku_err.re_errno = ENOSR;
		goto done;
	}
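	/*
	 * Added note (layout sketch for the non-RPCSEC_GSS path, not in the
	 * original source): at this point mp holds the fully marshalled
	 * request,
	 *
	 *	[xid][call header][proc][cred][verf][args]
	 *
	 * and mpdup references the same data blocks (dupmsg() does not
	 * copy), so a retransmission can resend the identical bytes without
	 * re-encoding the arguments.
	 */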
	/*
	 * Grab an endpnt only if the endpoint is NULL. We could be retrying
	 * the request and in this case we want to go through the same
	 * source port, so that the duplicate request cache may detect a
	 * retry.
	 */

	if (p->cku_endpnt == NULL)
		p->cku_endpnt = endpnt_get(&p->cku_config, p->cku_useresvport);

	if (p->cku_endpnt == NULL) {
		freemsg(mp);
		p->cku_err.re_status = RPC_SYSTEMERROR;
		p->cku_err.re_errno = ENOSR;
		goto done;
	}

	round_trip = ddi_get_lbolt();

	error = clnt_clts_dispatch_send(p->cku_endpnt->e_wq, mp,
	    &p->cku_addr, call, p->cku_xid, p->cku_cred);

	if (error != 0) {
		freemsg(mp);
		p->cku_err.re_status = RPC_CANTSEND;
		p->cku_err.re_errno = error;
		RCSTAT_INCR(p->cku_stats, rccantsend);
		goto done1;
	}

	RPCLOG(64, "clnt_clts_kcallit_addr: sent call for xid 0x%x\n",
	    p->cku_xid);

	/*
	 * There are two reasons for which we go back to tryread.
	 *
	 * a) In case the status is RPC_PROCUNAVAIL and we sent out a
	 *    broadcast we should not get any invalid messages with the
	 *    RPC_PROCUNAVAIL error back. Some broken RPC implementations
	 *    send them and for this we have to ignore them (as we would
	 *    never have received them) and look for another message
	 *    which might contain the valid response because we don't know
	 *    how many broken implementations are in the network. So we are
	 *    going to loop until
	 *    - we received a valid response
	 *    - we have processed all invalid responses and
	 *      got a time out when we try to receive again a
	 *      message.
	 *
	 * b) We will jump back to tryread also in case we failed
	 *    within the AUTH_VALIDATE. In this case we should move
	 *    on and loop until we received a valid response or we
	 *    have processed all responses with broken authentication
	 *    and we got a time out when we try to receive a message.
	 */
tryread:
	mutex_enter(&call->call_lock);
	interrupted = FALSE;
	if (call->call_notified == FALSE) {
		klwp_t *lwp = ttolwp(curthread);
		clock_t cv_wait_ret = 1; /* init to > 0 */
		clock_t cv_timout = timout;

		if (lwp != NULL)
			lwp->lwp_nostop++;

		cv_timout += ddi_get_lbolt();

		if (h->cl_nosignal)
			while ((cv_wait_ret =
			    cv_timedwait(&call->call_cv,
			    &call->call_lock, cv_timout)) > 0 &&
			    call->call_notified == FALSE)
				;
		else
			while ((cv_wait_ret =
			    cv_timedwait_sig(&call->call_cv,
			    &call->call_lock, cv_timout)) > 0 &&
			    call->call_notified == FALSE)
				;

		if (cv_wait_ret == 0)
			interrupted = TRUE;

		if (lwp != NULL)
			lwp->lwp_nostop--;
	}
	resp = call->call_reply;
	call->call_reply = NULL;
	status = call->call_status;
	/*
	 * We have to reset call_notified here. In case we have
	 * to do a retry (e.g. in case we got a RPC_PROCUNAVAIL
	 * error) we need to set this to false to ensure that
	 * we will wait for the next message. When the next message
	 * is going to arrive the function clnt_clts_dispatch_notify
	 * will set this to true again.
	 */
	call->call_notified = FALSE;
	call->call_status = RPC_TIMEDOUT;
	mutex_exit(&call->call_lock);
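	/*
	 * Added note (not in the original source): call_notified and
	 * call_reply are set by clnt_clts_dispatch_notify() under call_lock
	 * when rpcmod delivers a reply with a matching xid, which is why the
	 * cv_timedwait() loops above re-check call_notified after every
	 * wakeup.
	 */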
	if (status == RPC_TIMEDOUT) {
		if (interrupted) {
			/*
			 * We got interrupted, bail out
			 */
			p->cku_err.re_status = RPC_INTR;
			p->cku_err.re_errno = EINTR;
			goto done1;
		} else {
			RPCLOG(8, "clnt_clts_kcallit_addr: "
			    "request w/xid 0x%x timedout "
			    "waiting for reply\n", p->cku_xid);
#if 0 /* XXX not yet */
			/*
			 * Timeout may be due to a dead gateway. Send
			 * an ioctl downstream advising deletion of
			 * route when we reach the half-way point to
			 * timing out.
			 */
			if (stries == p->cku_retrys/2) {
				t_kadvise(p->cku_endpnt->e_tiptr,
				    (uchar_t *)p->cku_addr.buf,
				    p->cku_addr.len);
			}
#endif /* not yet */
			p->cku_err.re_status = RPC_TIMEDOUT;
			p->cku_err.re_errno = ETIMEDOUT;
			RCSTAT_INCR(p->cku_stats, rctimeouts);
			goto done1;
		}
	}

	ASSERT(resp != NULL);

	/*
	 * Prepare the message for further processing. We need to remove
	 * the datagram header and copy the source address if necessary. No
	 * need to verify the header since rpcmod took care of that.
	 */
	/*
	 * Copy the source address if the caller has supplied a netbuf.
	 */
	if (sin != NULL) {
		union T_primitives *pptr;

		pptr = (union T_primitives *)resp->b_rptr;
		bcopy(resp->b_rptr + pptr->unitdata_ind.SRC_offset, sin->buf,
		    pptr->unitdata_ind.SRC_length);
		sin->len = pptr->unitdata_ind.SRC_length;
	}

	/*
	 * Pop off the datagram header.
	 * It was retained in rpcmodrput().
	 */
	tmp = resp;
	resp = resp->b_cont;
	tmp->b_cont = NULL;
	freeb(tmp);

	round_trip = ddi_get_lbolt() - round_trip;
	/*
	 * Van Jacobson timer algorithm here, only if NOT a retransmission.
	 */
	if (p->cku_timers != NULL && stries == p->cku_retrys) {
		int rt;

		rt = round_trip;
		rt -= (p->cku_timers->rt_srtt >> 3);
		p->cku_timers->rt_srtt += rt;
		if (rt < 0)
			rt = -rt;
		rt -= (p->cku_timers->rt_deviate >> 2);
		p->cku_timers->rt_deviate += rt;
		p->cku_timers->rt_rtxcur =
		    (clock_t)((p->cku_timers->rt_srtt >> 2) +
		    p->cku_timers->rt_deviate) >> 1;

		rt = round_trip;
		rt -= (p->cku_timeall->rt_srtt >> 3);
		p->cku_timeall->rt_srtt += rt;
		if (rt < 0)
			rt = -rt;
		rt -= (p->cku_timeall->rt_deviate >> 2);
		p->cku_timeall->rt_deviate += rt;
		p->cku_timeall->rt_rtxcur =
		    (clock_t)((p->cku_timeall->rt_srtt >> 2) +
		    p->cku_timeall->rt_deviate) >> 1;
		if (p->cku_feedback != NULL) {
			(*p->cku_feedback)(FEEDBACK_OK, procnum,
			    p->cku_feedarg);
		}
	}
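	/*
	 * Added note (derivation, not in the original source): rt_srtt is
	 * kept scaled by 8 and rt_deviate by 4, so the fixed-point updates
	 * above implement the classic Van Jacobson estimators
	 *
	 *	srtt    += (rtt - srtt) / 8
	 *	deviate += (|rtt - srtt| - deviate) / 4
	 *	rtxcur   = srtt + 2 * deviate
	 *
	 * expressed in unscaled clock ticks.
	 */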
	/*
	 * Process reply
	 */
	xdrs = &(p->cku_inxdr);
	xdrmblk_init(xdrs, resp, XDR_DECODE, 0);

	reply_msg.rm_direction = REPLY;
	reply_msg.rm_reply.rp_stat = MSG_ACCEPTED;
	reply_msg.acpted_rply.ar_stat = SUCCESS;
	reply_msg.acpted_rply.ar_verf = _null_auth;
	/*
	 * xdr_results will be done in AUTH_UNWRAP.
	 */
	reply_msg.acpted_rply.ar_results.where = NULL;
	reply_msg.acpted_rply.ar_results.proc = xdr_void;

	/*
	 * Decode and validate the response.
	 */
	if (!xdr_replymsg(xdrs, &reply_msg)) {
		p->cku_err.re_status = RPC_CANTDECODERES;
		p->cku_err.re_errno = EIO;
		(void) xdr_rpc_free_verifier(xdrs, &reply_msg);
		XDR_DESTROY(xdrs);
		goto done1;
	}

	_seterr_reply(&reply_msg, &(p->cku_err));

	re_status = p->cku_err.re_status;
	if (re_status == RPC_SUCCESS) {
		/*
		 * Reply is good, check auth.
		 */
		if (!AUTH_VALIDATE(h->cl_auth,
		    &reply_msg.acpted_rply.ar_verf)) {
			p->cku_err.re_status = RPC_AUTHERROR;
			p->cku_err.re_why = AUTH_INVALIDRESP;
			RCSTAT_INCR(p->cku_stats, rcbadverfs);
			(void) xdr_rpc_free_verifier(xdrs, &reply_msg);
			XDR_DESTROY(xdrs);
			goto tryread;
		}
		if (!AUTH_UNWRAP(h->cl_auth, xdrs, xdr_results, resultsp)) {
			p->cku_err.re_status = RPC_CANTDECODERES;
			p->cku_err.re_errno = EIO;
		}
		(void) xdr_rpc_free_verifier(xdrs, &reply_msg);
		XDR_DESTROY(xdrs);
		goto done1;
	}
	/* set errno in case we can't recover */
	if (re_status != RPC_VERSMISMATCH &&
	    re_status != RPC_AUTHERROR && re_status != RPC_PROGVERSMISMATCH)
		p->cku_err.re_errno = EIO;
	/*
	 * Determine whether or not we're doing an RPC
	 * broadcast. Some server implementations don't
	 * follow RFC 1050, section 7.4.2 in that they
	 * don't remain silent when they see a proc
	 * they don't support. Therefore we keep trying
	 * to receive on RPC_PROCUNAVAIL, hoping to get
	 * a valid response from a compliant server.
	 */
	if (re_status == RPC_PROCUNAVAIL && p->cku_bcast) {
		(void) xdr_rpc_free_verifier(xdrs, &reply_msg);
		XDR_DESTROY(xdrs);
		goto tryread;
	}
	if (re_status == RPC_AUTHERROR) {

		(void) xdr_rpc_free_verifier(xdrs, &reply_msg);
		XDR_DESTROY(xdrs);
		call_table_remove(call);
		if (call->call_reply != NULL) {
			freemsg(call->call_reply);
			call->call_reply = NULL;
		}

		/*
		 * Maybe our credentials need to be refreshed
		 */
		if (refreshes > 0 &&
		    AUTH_REFRESH(h->cl_auth, &reply_msg, p->cku_cred)) {
			/*
			 * The credential is refreshed. Try the request again.
			 * Even if stries == 0, we still retry as long as
			 * refreshes > 0. This prevents a soft authentication
			 * error turning into a hard one at an upper level.
			 */
			refreshes--;
			RCSTAT_INCR(p->cku_stats, rcbadcalls);
			RCSTAT_INCR(p->cku_stats, rcnewcreds);

			freemsg(mpdup);
			mpdup = NULL;
			freemsg(resp);
			resp = NULL;
			goto call_again;
		}
		/*
		 * We have used the client handle to do an AUTH_REFRESH
		 * and the RPC status may be set to RPC_SUCCESS;
		 * Let's make sure to set it to RPC_AUTHERROR.
		 */
		p->cku_err.re_status = RPC_CANTDECODERES;

		/*
		 * Map recoverable and unrecoverable
		 * authentication errors to appropriate errno
		 */
		switch (p->cku_err.re_why) {
		case AUTH_TOOWEAK:
			/*
			 * Could be an nfsportmon failure, set
			 * useresvport and try again.
			 */
			if (p->cku_useresvport != 1) {
				p->cku_useresvport = 1;

				freemsg(mpdup);
				mpdup = NULL;
				freemsg(resp);
				resp = NULL;

				endpt = p->cku_endpnt;
				if (endpt->e_tiptr != NULL) {
					mutex_enter(&endpt->e_lock);
					endpt->e_flags &= ~ENDPNT_BOUND;
					(void) t_kclose(endpt->e_tiptr, 1);
					endpt->e_tiptr = NULL;
					mutex_exit(&endpt->e_lock);
				}

				p->cku_xid = alloc_xid();
				endpnt_rele(p->cku_endpnt);
				p->cku_endpnt = NULL;
				goto call_again;
			}
			/* FALLTHRU */
		case AUTH_BADCRED:
		case AUTH_BADVERF:
		case AUTH_INVALIDRESP:
		case AUTH_FAILED:
		case RPCSEC_GSS_NOCRED:
		case RPCSEC_GSS_FAILED:
			p->cku_err.re_errno = EACCES;
			break;
		case AUTH_REJECTEDCRED:
		case AUTH_REJECTEDVERF:
		default:
			p->cku_err.re_errno = EIO;
			break;
		}
		RPCLOG(1, "clnt_clts_kcallit: authentication failed "
		    "with RPC_AUTHERROR of type %d\n",
		    p->cku_err.re_why);
		goto done;
	}

	(void) xdr_rpc_free_verifier(xdrs, &reply_msg);
	XDR_DESTROY(xdrs);

done1:
	call_table_remove(call);
	if (call->call_reply != NULL) {
		freemsg(call->call_reply);
		call->call_reply = NULL;
	}
	RPCLOG(64, "clnt_clts_kcallit_addr: xid 0x%x taken off dispatch list",
	    p->cku_xid);

done:
	if (resp != NULL) {
		freemsg(resp);
		resp = NULL;
	}

	if ((p->cku_err.re_status != RPC_SUCCESS) &&
	    (p->cku_err.re_status != RPC_INTR) &&
	    (p->cku_err.re_status != RPC_UDERROR) &&
	    !IS_UNRECOVERABLE_RPC(p->cku_err.re_status)) {
		if (p->cku_feedback != NULL && stries == p->cku_retrys) {
			(*p->cku_feedback)(FEEDBACK_REXMIT1, procnum,
			    p->cku_feedarg);
		}

		timout = backoff(timout);
		if (p->cku_timeall != (struct rpc_timers *)0)
			p->cku_timeall->rt_rtxcur = timout;

		if (p->cku_err.re_status == RPC_SYSTEMERROR ||
		    p->cku_err.re_status == RPC_CANTSEND) {
			/*
			 * Errors due to lack of resources, wait a bit
			 * and try again.
			 */
			(void) delay(hz/10);
		}
		if (stries-- > 0) {
			RCSTAT_INCR(p->cku_stats, rcretrans);
			goto call_again;
		}
	}

	if (mpdup != NULL)
		freemsg(mpdup);

	if (p->cku_err.re_status != RPC_SUCCESS) {
		RCSTAT_INCR(p->cku_stats, rcbadcalls);
	}

	/*
	 * Allow the endpoint to be held by the client handle in case this
	 * RPC was not successful. A retry may occur at a higher level and
	 * in this case we may want to send the request over the same
	 * source port.
	 * The endpoint is also released for a one-way RPC: no reply or
	 * retransmit is expected.
	 */
	if ((p->cku_err.re_status == RPC_SUCCESS ||
	    (p->cku_err.re_status == RPC_TIMEDOUT && ori_timout == 0)) &&
	    p->cku_endpnt != NULL) {
		endpnt_rele(p->cku_endpnt);
		p->cku_endpnt = NULL;
	} else {
		DTRACE_PROBE2(clnt_clts_kcallit_done, int, p->cku_err.re_status,
		    struct endpnt *, p->cku_endpnt);
	}

	return (p->cku_err.re_status);
}

static enum clnt_stat
clnt_clts_kcallit(CLIENT *h, rpcproc_t procnum, xdrproc_t xdr_args,
    caddr_t argsp, xdrproc_t xdr_results, caddr_t resultsp,
    struct timeval wait)
{
	return (clnt_clts_kcallit_addr(h, procnum, xdr_args, argsp,
	    xdr_results, resultsp, wait, NULL));
}
/*
 * Return error info on this handle.
 */
static void
clnt_clts_kerror(CLIENT *h, struct rpc_err *err)
{
	/* LINTED pointer alignment */
	struct cku_private *p = htop(h);

	*err = p->cku_err;
}

/*ARGSUSED*/
static bool_t
clnt_clts_kfreeres(CLIENT *h, xdrproc_t xdr_res, caddr_t res_ptr)
{
	xdr_free(xdr_res, res_ptr);

	return (TRUE);
}

/*ARGSUSED*/
static void
clnt_clts_kabort(CLIENT *h)
{
}

static bool_t
clnt_clts_kcontrol(CLIENT *h, int cmd, char *arg)
{
	/* LINTED pointer alignment */
	struct cku_private *p = htop(h);

	switch (cmd) {
	case CLSET_XID:
		p->cku_xid = *((uint32_t *)arg);
		return (TRUE);

	case CLGET_XID:
		*((uint32_t *)arg) = p->cku_xid;
		return (TRUE);

	case CLSET_BCAST:
		p->cku_bcast = *((uint32_t *)arg);
		return (TRUE);

	case CLGET_BCAST:
		*((uint32_t *)arg) = p->cku_bcast;
		return (TRUE);

	case CLSET_BINDRESVPORT:
		if (arg == NULL)
			return (FALSE);

		if (*(int *)arg != 1 && *(int *)arg != 0)
			return (FALSE);

		p->cku_useresvport = *(int *)arg;

		return (TRUE);

	case CLGET_BINDRESVPORT:
		if (arg == NULL)
			return (FALSE);

		*(int *)arg = p->cku_useresvport;

		return (TRUE);

	default:
		return (FALSE);
	}
}
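/*
 * Illustrative sketch (not part of the original source): the requests
 * above are reached through CLNT_CONTROL(), e.g. to pin the XID used for
 * the next call or to query the reserved-port setting:
 *
 *	uint32_t xid = 42;
 *	int resv;
 *
 *	(void) CLNT_CONTROL(cl, CLSET_XID, (char *)&xid);
 *	(void) CLNT_CONTROL(cl, CLGET_BINDRESVPORT, (char *)&resv);
 */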
/*
 * Destroy rpc handle.
 * Frees the space used for output buffer, private data, and handle
 * structure, and the file pointer/TLI data on last reference.
 */
static void
clnt_clts_kdestroy(CLIENT *h)
{
	/* LINTED pointer alignment */
	struct cku_private *p = htop(h);
	calllist_t *call = &p->cku_call;

	int plen;

	RPCLOG(8, "clnt_clts_kdestroy h: %p\n", (void *)h);
	RPCLOG(8, "clnt_clts_kdestroy h: xid=0x%x\n", p->cku_xid);

	if (p->cku_endpnt != NULL)
		endpnt_rele(p->cku_endpnt);

	cv_destroy(&call->call_cv);
	mutex_destroy(&call->call_lock);

	plen = strlen(p->cku_config.knc_protofmly) + 1;
	kmem_free(p->cku_config.knc_protofmly, plen);
	kmem_free(p->cku_addr.buf, p->cku_addr.maxlen);
	kmem_free(p, sizeof (*p));
}

/*
 * The connectionless (CLTS) kRPC endpoint management subsystem.
 *
 * Because endpoints are potentially shared among threads making RPC calls,
 * they are managed in a pool according to type (endpnt_type_t). Each
 * endpnt_type_t points to a list of usable endpoints through the e_pool
 * field, which is of type list_t. list_t is a doubly-linked list.
 * The number of endpoints in the pool is stored in the e_cnt field of
 * endpnt_type_t and the endpoints are reference counted using the e_ref field
 * in the endpnt_t structure.
 *
 * As an optimization, endpoints that have no references are also linked
 * to an idle list via e_ilist which is also of type list_t. When a thread
 * calls endpnt_get() to obtain a transport endpoint, the idle list is first
 * consulted and if such an endpoint exists, it is removed from the idle list
 * and returned to the caller.
 *
 * If the idle list is empty, then a check is made to see if more endpoints
 * can be created. If so, we proceed and create a new endpoint which is added
 * to the pool and returned to the caller. If we have reached the limit and
 * cannot make a new endpoint then one is returned to the caller via
 * round-robin policy.
 *
 * When an endpoint is placed on the idle list by a thread calling
 * endpnt_rele(), it is timestamped and then a reaper taskq is scheduled to
 * be dispatched if one hasn't already been. When the timer fires, the
 * taskq traverses the idle list and checks to see which endpoints are
 * eligible to be closed. It determines this by checking if the timestamp
 * when the endpoint was released has exceeded the threshold for how long
 * it should stay alive.
 *
 * endpnt_t structures remain persistent until the memory reclaim callback,
 * endpnt_reclaim(), is invoked.
 *
 * Here is an example of how the data structures would be laid out by the
 * subsystem:
 *
 *                       endpnt_type_t
 *
 *      loopback                                       inet
 *    _______________                              _______________
 *   | e_next        |--------------------------->| e_next        |---->>
 *   | e_pool        |<---+                       | e_pool        |<---+
 *   | e_ilist       |<---+--+                    | e_ilist       |<---+--+
 * +>| e_pcurr       |----+--+--+              +>| e_pcurr       |----+--+--+
 * | | ...           |    |  |  |              | | ...           |    |  |  |
 * | | e_itimer (90) |    |  |  |              | | e_itimer (0)  |    |  |  |
 * | | e_cnt (1)     |    |  |  |              | | e_cnt (3)     |    |  |  |
 * | +---------------+    |  |  |              | +---------------+    |  |  |
 * |                      |  |  |              |                      |  |  |
 * |  endpnt_t            |  |  |              |                      |  |  |
 * |  ____________        |  |  |              |  ____________        |  |  |
 * |  | e_node    |<------+  |  |              |  | e_node    |<------+  |  |
 * |  | e_idle    |<---------+  |              |  | e_idle    |       |  |  |
 * +--| e_type    |<------------+              +--| e_type    |       |  |  |
 *    | e_tiptr   |                            |  | e_tiptr   |       |  |  |
 *    | ...       |                            |  | ...       |       |  |  |
 *    | e_lock    |                            |  | e_lock    |       |  |  |
 *    | ...       |                            |  | ...       |       |  |  |
 *    | e_ref (0) |                            |  | e_ref (2) |       |  |  |
 *    | e_itime   |                            |  | e_itime   |       |  |  |
 *    +-----------+                            |  +-----------+       |  |  |
 *                                             |                      |  |  |
 *                                             |                      |  |  |
 *                                             |  ____________        |  |  |
 *                                             |  | e_node    |<------+  |  |
 *                                             |  | e_idle    |<------+--+  |
 *                                             +--| e_type    |       |     |
 *                                             |  | e_tiptr   |       |     |
 *                                             |  | ...       |       |     |
 *                                             |  | e_lock    |       |     |
 *                                             |  | ...       |       |     |
 *                                             |  | e_ref (0) |       |     |
 *                                             |  | e_itime   |       |     |
 *                                             |  +-----------+       |     |
 *                                             |                      |     |
 *                                             |                      |     |
 *                                             |  ____________        |     |
 *                                             |  | e_node    |<------+     |
 *                                             |  | e_idle    |             |
 *                                             +--| e_type    |<------------+
 *                                                | e_tiptr   |
 *                                                | ...       |
 *                                                | e_lock    |
 *                                                | ...       |
 *                                                | e_ref (1) |
 *                                                | e_itime   |
 *                                                +-----------+
 *
 * Endpoint locking strategy:
 *
 * The following functions manipulate lists which hold the endpoint and the
 * endpoints themselves:
 *
 * endpnt_get()/check_endpnt()/endpnt_rele()/endpnt_reap()/do_endpnt_reclaim()
 *
 * Lock description follows:
 *
 * endpnt_type_lock: Global reader/writer lock which protects accesses to the
 *	endpnt_type_list.
 *
 * e_plock: Lock defined in the endpnt_type_t. It is intended to
 *	protect accesses to the pool of endpoints (e_pool) for a given
 *	endpnt_type_t.
 *
 * e_ilock: Lock defined in endpnt_type_t. It is intended to protect accesses
 *	to the idle list (e_ilist) of available endpoints for a given
 *	endpnt_type_t. It also protects access to the e_itimer, e_async_cv,
 *	and e_async_count fields in endpnt_type_t.
 *
 * e_lock: Lock defined in the endpnt structure. It is intended to protect
 *	flags, cv, and ref count.
 *
 * The order goes as follows so as not to induce deadlock.
 *
 * endpnt_type_lock -> e_plock -> e_ilock -> e_lock
 *
 * Interaction with Zones and shutting down:
 *
 * endpnt_type_ts are uniquely identified by the (e_zoneid, e_rdev, e_protofmly)
 * tuple, which means that a zone may not reuse another zone's idle endpoints
 * without first doing a t_kclose().
 *
 * A zone's endpnt_type_ts are destroyed when a zone is shut down; e_async_cv
 * and e_async_count are used to keep track of the threads in endpnt_taskq
 * trying to reap endpnt_ts in the endpnt_type_t.
 */
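/*
 * Illustrative sketch (not part of the original source): a thread needing
 * several of these locks at once must acquire them in the documented
 * order, e.g.
 *
 *	rw_enter(&endpnt_type_lock, RW_READER);
 *	mutex_enter(&np->e_plock);
 *	mutex_enter(&np->e_ilock);
 *	mutex_enter(&endp->e_lock);
 *
 * and release them in the opposite order.
 */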
/*
 * Allocate and initialize an endpnt_type_t
 */
static struct endpnt_type *
endpnt_type_create(struct knetconfig *config)
{
	struct endpnt_type *etype;

	/*
	 * Allocate a new endpoint type to hang a list of
	 * endpoints off of it.
	 */
	etype = kmem_alloc(sizeof (struct endpnt_type), KM_SLEEP);
	etype->e_next = NULL;
	etype->e_pcurr = NULL;
	etype->e_itimer = 0;
	etype->e_cnt = 0;

	(void) strncpy(etype->e_protofmly, config->knc_protofmly, KNC_STRSIZE);
	mutex_init(&etype->e_plock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&etype->e_ilock, NULL, MUTEX_DEFAULT, NULL);
	etype->e_rdev = config->knc_rdev;
	etype->e_zoneid = rpc_zoneid();
	etype->e_async_count = 0;
	cv_init(&etype->e_async_cv, NULL, CV_DEFAULT, NULL);

	list_create(&etype->e_pool, sizeof (endpnt_t),
	    offsetof(endpnt_t, e_node));
	list_create(&etype->e_ilist, sizeof (endpnt_t),
	    offsetof(endpnt_t, e_idle));

	/*
	 * Check to see if we need to create a taskq for endpoint
	 * reaping
	 */
	mutex_enter(&endpnt_taskq_lock);
	if (taskq_created == FALSE) {
		taskq_created = TRUE;
		mutex_exit(&endpnt_taskq_lock);
		ASSERT(endpnt_taskq == NULL);
		endpnt_taskq = taskq_create("clts_endpnt_taskq", 1,
		    minclsyspri, 200, INT_MAX, 0);
	} else
		mutex_exit(&endpnt_taskq_lock);

	return (etype);
}

/*
 * Free an endpnt_type_t
 */
static void
endpnt_type_free(struct endpnt_type *etype)
{
	mutex_destroy(&etype->e_plock);
	mutex_destroy(&etype->e_ilock);
	list_destroy(&etype->e_pool);
	list_destroy(&etype->e_ilist);
	kmem_free(etype, sizeof (endpnt_type_t));
}

/*
 * Check the endpoint to ensure that it is suitable for use.
 *
 * Possible return values:
 *
 * return (1) - Endpoint is established, but needs to be re-opened.
 * return (0) && *newp == NULL - Endpoint is established, but unusable.
 * return (0) && *newp != NULL - Endpoint is established and usable.
 */
static int
check_endpnt(struct endpnt *endp, struct endpnt **newp)
{
	*newp = endp;

	mutex_enter(&endp->e_lock);
	ASSERT(endp->e_ref >= 1);

	/*
	 * The first condition we check for is if the endpoint has been
	 * allocated, but is unusable either because it has been closed or
	 * has been marked stale. Only *one* thread will be allowed to
	 * execute the then clause. This is enforced because the first thread
	 * to check this condition will clear the flags, so that subsequent
	 * thread(s) checking this endpoint will move on.
	 */
	if ((endp->e_flags & ENDPNT_ESTABLISHED) &&
	    (!(endp->e_flags & ENDPNT_BOUND) ||
	    (endp->e_flags & ENDPNT_STALE))) {
		/*
		 * Clear the flags here since they will be
		 * set again by this thread. They need to be
		 * individually cleared because we want to maintain
		 * the state for ENDPNT_ONIDLE.
		 */
		endp->e_flags &= ~(ENDPNT_ESTABLISHED |
		    ENDPNT_WAITING | ENDPNT_BOUND | ENDPNT_STALE);
		mutex_exit(&endp->e_lock);
		return (1);
	}

	/*
	 * The second condition is meant for any thread that is waiting for
	 * an endpoint to become established. It will cv_wait() until
	 * the condition for the endpoint has been changed to ENDPNT_BOUND or
	 * ENDPNT_STALE.
	 */
	while (!(endp->e_flags & ENDPNT_BOUND) &&
	    !(endp->e_flags & ENDPNT_STALE)) {
		endp->e_flags |= ENDPNT_WAITING;
		cv_wait(&endp->e_cv, &endp->e_lock);
	}

	ASSERT(endp->e_flags & ENDPNT_ESTABLISHED);

	/*
	 * The last case we check for is if the endpoint has been marked stale.
	 * If this is the case then set *newp to NULL and return, so that the
	 * caller is notified of the error and can take appropriate action.
	 */
	if (endp->e_flags & ENDPNT_STALE) {
		endp->e_ref--;
		*newp = NULL;
	}
	mutex_exit(&endp->e_lock);
	return (0);
}

#ifdef DEBUG
/*
 * Provide a fault injection setting to test error conditions.
 */
static int endpnt_get_return_null = 0;
#endif

/*
 * Returns a handle (struct endpnt *) to an open and bound endpoint
 * specified by the knetconfig passed in. Returns NULL if no valid endpoint
 * can be obtained.
 */
static struct endpnt *
endpnt_get(struct knetconfig *config, int useresvport)
{
	struct endpnt_type *n_etype = NULL;
	struct endpnt_type *np = NULL;
	struct endpnt *new = NULL;
	struct endpnt *endp = NULL;
	struct endpnt *next = NULL;
	TIUSER *tiptr = NULL;
	int rtries = BINDRESVPORT_RETRIES;
	int i = 0;
	int error;
	int retval;
	zoneid_t zoneid = rpc_zoneid();
	cred_t *cr;

	RPCLOG(1, "endpnt_get: protofmly %s, ", config->knc_protofmly);
	RPCLOG(1, "rdev %ld\n", config->knc_rdev);

#ifdef DEBUG
	/*
	 * Inject fault if desired. Pretend we have a stale endpoint
	 * and return NULL.
	 */
	if (endpnt_get_return_null > 0) {
		endpnt_get_return_null--;
		return (NULL);
	}
#endif
	rw_enter(&endpnt_type_lock, RW_READER);

top:
	for (np = endpnt_type_list; np != NULL; np = np->e_next)
		if ((np->e_zoneid == zoneid) &&
		    (np->e_rdev == config->knc_rdev) &&
		    (strcmp(np->e_protofmly,
		    config->knc_protofmly) == 0))
			break;

	if (np == NULL && n_etype != NULL) {
		ASSERT(rw_write_held(&endpnt_type_lock));

		/*
		 * Link the endpoint type onto the list
		 */
		n_etype->e_next = endpnt_type_list;
		endpnt_type_list = n_etype;
		np = n_etype;
		n_etype = NULL;
	}

	if (np == NULL) {
		/*
		 * The logic here is that we were unable to find an
		 * endpnt_type_t that matched our criteria, so we allocate a
		 * new one. Because kmem_alloc() needs to be called with
		 * KM_SLEEP, we drop our locks so that we don't induce
		 * deadlock. After allocating and initializing the
		 * endpnt_type_t, we reacquire the lock and go back to check
		 * if this entry needs to be added to the list. Since we do
		 * some operations without any locking, other threads may
		 * have been looking for the same endpnt_type_t and gone
		 * through this code path. We check for this case and allow
		 * one thread to link its endpnt_type_t to the list and the
		 * other threads will simply free theirs.
		 */
		rw_exit(&endpnt_type_lock);
		n_etype = endpnt_type_create(config);

		/*
		 * We need to reacquire the lock with RW_WRITER here so that
		 * we can safely link the new endpoint type onto the list.
		 */
		rw_enter(&endpnt_type_lock, RW_WRITER);
		goto top;
	}
	rw_exit(&endpnt_type_lock);
	/*
	 * If n_etype is not NULL, then another thread was able to
	 * insert an endpnt_type_t of this type onto the list before
	 * we did. Go ahead and free ours.
	 */
	if (n_etype != NULL)
		endpnt_type_free(n_etype);

	mutex_enter(&np->e_ilock);
	/*
	 * The algorithm to hand out endpoints is to first
	 * give out those that are idle if such endpoints
	 * exist. Otherwise, create a new one if we haven't
	 * reached the max threshold. Finally, we give out
	 * endpoints in a pseudo LRU fashion (round-robin).
	 *
	 * Note: The idle list is merely a hint of those endpoints
	 * that should be idle. There exists a window after the
	 * endpoint is released and before it is linked back onto the
	 * idle list where a thread could get a reference to it and
	 * use it. This is okay, since the reference counts will
	 * still be consistent.
	 */
	if ((endp = (endpnt_t *)list_head(&np->e_ilist)) != NULL) {
		timeout_id_t t_id = 0;

		mutex_enter(&endp->e_lock);
		endp->e_ref++;
		endp->e_itime = 0;
		endp->e_flags &= ~ENDPNT_ONIDLE;
		mutex_exit(&endp->e_lock);

		/*
		 * Pop the endpoint off the idle list and hand it off
		 */
		list_remove(&np->e_ilist, endp);

		if (np->e_itimer != 0) {
			t_id = np->e_itimer;
			np->e_itimer = 0;
		}
		mutex_exit(&np->e_ilock);
		/*
		 * Reset the idle timer if it has been set
		 */
		if (t_id != (timeout_id_t)0)
			(void) untimeout(t_id);

		if (check_endpnt(endp, &new) == 0)
			return (new);
	} else if (np->e_cnt >= clnt_clts_max_endpoints) {
		/*
		 * There are no idle endpoints currently, so
		 * create a new one if we have not reached the maximum or
		 * hand one out in round-robin.
		 */
		mutex_exit(&np->e_ilock);
		mutex_enter(&np->e_plock);
		endp = np->e_pcurr;
		mutex_enter(&endp->e_lock);
		endp->e_ref++;
		mutex_exit(&endp->e_lock);

		ASSERT(endp != NULL);
		/*
		 * Advance the pointer to the next eligible endpoint, if
		 * necessary.
		 */
		if (np->e_cnt > 1) {
			next = (endpnt_t *)list_next(&np->e_pool, np->e_pcurr);
			if (next == NULL)
				next = (endpnt_t *)list_head(&np->e_pool);
			np->e_pcurr = next;
		}

		mutex_exit(&np->e_plock);

		/*
		 * We need to check to see if this endpoint is bound or
		 * not. If it is in progress then just wait until
		 * the set up is complete
		 */
		if (check_endpnt(endp, &new) == 0)
			return (new);
	} else {
		mutex_exit(&np->e_ilock);
		mutex_enter(&np->e_plock);

		/*
		 * Allocate a new endpoint to use. If we can't allocate any
		 * more memory then use one that is already established if any
		 * such endpoints exist.
		 */
		new = kmem_cache_alloc(endpnt_cache, KM_NOSLEEP);
		if (new == NULL) {
			RPCLOG0(1, "endpnt_get: kmem_cache_alloc failed\n");
			/*
			 * Try to recover by using an existing endpoint.
			 */
			if (np->e_cnt <= 0) {
				mutex_exit(&np->e_plock);
				return (NULL);
			}
			endp = np->e_pcurr;
			if ((next = list_next(&np->e_pool, np->e_pcurr)) !=
			    NULL)
				np->e_pcurr = next;
			ASSERT(endp != NULL);
			mutex_enter(&endp->e_lock);
			endp->e_ref++;
			mutex_exit(&endp->e_lock);
			mutex_exit(&np->e_plock);

			if (check_endpnt(endp, &new) == 0)
				return (new);
		} else {
			/*
			 * Partially init an endpoint structure and put
			 * it on the list, so that other interested threads
			 * know that one is being created
			 */
			bzero(new, sizeof (struct endpnt));

			cv_init(&new->e_cv, NULL, CV_DEFAULT, NULL);
			mutex_init(&new->e_lock, NULL, MUTEX_DEFAULT, NULL);
			new->e_ref = 1;
			new->e_type = np;

			/*
			 * Link the endpoint into the pool.
			 */
			list_insert_head(&np->e_pool, new);
			np->e_cnt++;
			if (np->e_pcurr == NULL)
				np->e_pcurr = new;
			mutex_exit(&np->e_plock);
		}
	}

	/*
	 * The transport should be opened with sufficient privs
	 */
	cr = zone_kcred();
	error = t_kopen(NULL, config->knc_rdev, FREAD|FWRITE|FNDELAY, &tiptr,
	    cr);
	if (error) {
		RPCLOG(1, "endpnt_get: t_kopen: %d\n", error);
		goto bad;
	}

	new->e_tiptr = tiptr;
	rpc_poptimod(tiptr->fp->f_vnode);

	/*
	 * Allow the kernel to push the module on behalf of the user.
	 */
	error = strioctl(tiptr->fp->f_vnode, I_PUSH, (intptr_t)"rpcmod", 0,
	    K_TO_K, cr, &retval);
	if (error) {
		RPCLOG(1, "endpnt_get: kstr_push on rpcmod failed %d\n", error);
		goto bad;
	}

	error = strioctl(tiptr->fp->f_vnode, RPC_CLIENT, 0, 0, K_TO_K,
	    cr, &retval);
	if (error) {
		RPCLOG(1, "endpnt_get: strioctl failed %d\n", error);
		goto bad;
	}

	/*
	 * Connectionless data flow should bypass the stream head.
	 */
	new->e_wq = tiptr->fp->f_vnode->v_stream->sd_wrq->q_next;

	error = strioctl(tiptr->fp->f_vnode, I_PUSH, (intptr_t)"timod", 0,
	    K_TO_K, cr, &retval);
	if (error) {
		RPCLOG(1, "endpnt_get: kstr_push on timod failed %d\n", error);
		goto bad;
	}

	/*
	 * Attempt to bind the endpoint. If we fail then propagate the
	 * error back to the calling subsystem, so that it can be handled
	 * appropriately.
	 * If the caller has not specified reserved port usage then
	 * take the system default.
	 */
	if (useresvport == -1)
		useresvport = clnt_clts_do_bindresvport;

	if (useresvport &&
	    (strcmp(config->knc_protofmly, NC_INET) == 0 ||
	    strcmp(config->knc_protofmly, NC_INET6) == 0)) {

		while ((error =
		    bindresvport(new->e_tiptr, NULL, NULL, FALSE)) != 0) {
			RPCLOG(1,
			    "endpnt_get: bindresvport error %d\n", error);
			if (error != EPROTO) {
				if (rtries-- <= 0)
					goto bad;

				delay(hz << i++);
				continue;
			}

			(void) t_kclose(new->e_tiptr, 1);
			/*
			 * reopen with all privileges
			 */
			error = t_kopen(NULL, config->knc_rdev,
			    FREAD|FWRITE|FNDELAY,
			    &new->e_tiptr, cr);
			if (error) {
				RPCLOG(1, "endpnt_get: t_kopen: %d\n", error);
				new->e_tiptr = NULL;
				goto bad;
			}
		}
	} else if ((error = t_kbind(new->e_tiptr, NULL, NULL)) != 0) {
		RPCLOG(1, "endpnt_get: t_kbind failed: %d\n", error);
		goto bad;
	}
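	/*
	 * Added note (worked example, not in the original source): for
	 * transient (non-EPROTO) bindresvport() failures the loop above
	 * backs off with delay(hz << i++), i.e. waits of roughly 1, 2, 4,
	 * 8 and 16 seconds across the BINDRESVPORT_RETRIES (5) attempts
	 * before giving up.
	 */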
	/*
	 * Set the flags and notify any waiters that we have an established
	 * endpoint.
	 */
	mutex_enter(&new->e_lock);
	new->e_flags |= ENDPNT_ESTABLISHED;
	new->e_flags |= ENDPNT_BOUND;
	if (new->e_flags & ENDPNT_WAITING) {
		cv_broadcast(&new->e_cv);
		new->e_flags &= ~ENDPNT_WAITING;
	}
	mutex_exit(&new->e_lock);

	return (new);

bad:
	ASSERT(new != NULL);
	/*
	 * mark this endpoint as stale and notify any threads waiting
	 * on this endpoint that it will be going away.
	 */
	mutex_enter(&new->e_lock);
	if (new->e_ref > 0) {
		new->e_flags |= ENDPNT_ESTABLISHED;
		new->e_flags |= ENDPNT_STALE;
		if (new->e_flags & ENDPNT_WAITING) {
			cv_broadcast(&new->e_cv);
			new->e_flags &= ~ENDPNT_WAITING;
		}
	}
	new->e_ref--;
	new->e_tiptr = NULL;
	mutex_exit(&new->e_lock);

	/*
	 * If there was a transport endpoint opened, then close it.
	 */
	if (tiptr != NULL)
		(void) t_kclose(tiptr, 1);

	return (NULL);
}

/*
 * Release a reference to the endpoint
 */
static void
endpnt_rele(struct endpnt *sp)
{
	mutex_enter(&sp->e_lock);
	ASSERT(sp->e_ref > 0);
	sp->e_ref--;
	/*
	 * If the ref count is zero, then start the idle timer and link
	 * the endpoint onto the idle list.
	 */
	if (sp->e_ref == 0) {
		sp->e_itime = gethrestime_sec();

		/*
		 * Check to see if the endpoint is already linked to the idle
		 * list, so that we don't try to reinsert it.
		 */
		if (sp->e_flags & ENDPNT_ONIDLE) {
			mutex_exit(&sp->e_lock);
			mutex_enter(&sp->e_type->e_ilock);
			endpnt_reap_settimer(sp->e_type);
			mutex_exit(&sp->e_type->e_ilock);
			return;
		}

		sp->e_flags |= ENDPNT_ONIDLE;
		mutex_exit(&sp->e_lock);
		mutex_enter(&sp->e_type->e_ilock);
		list_insert_tail(&sp->e_type->e_ilist, sp);
		endpnt_reap_settimer(sp->e_type);
		mutex_exit(&sp->e_type->e_ilock);
	} else
		mutex_exit(&sp->e_lock);
}

static void
endpnt_reap_settimer(endpnt_type_t *etp)
{
	if (etp->e_itimer == (timeout_id_t)0)
		etp->e_itimer = timeout(endpnt_reap_dispatch, (void *)etp,
		    clnt_clts_taskq_dispatch_interval);
}

static void
endpnt_reap_dispatch(void *a)
{
	endpnt_type_t *etp = a;

	/*
	 * The idle timer has fired, so dispatch the taskq to close the
	 * endpoint.
	 */
	if (taskq_dispatch(endpnt_taskq, (task_func_t *)endpnt_reap, etp,
	    TQ_NOSLEEP) == TASKQID_INVALID)
		return;
	mutex_enter(&etp->e_ilock);
	etp->e_async_count++;
	mutex_exit(&etp->e_ilock);
}
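/*
 * Added note (not in the original source): the reap pipeline is
 * endpnt_rele() -> endpnt_reap_settimer() -> timeout() ->
 * endpnt_reap_dispatch() -> endpnt_taskq -> endpnt_reap().
 * e_async_count tracks the dispatched taskq jobs so that zone shutdown
 * can wait for them on e_async_cv.
 */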
/*
 * Traverse the idle list and close those endpoints that have reached their
 * timeout interval.
 */
static void
endpnt_reap(endpnt_type_t *etp)
{
	struct endpnt *e;
	struct endpnt *next_node = NULL;

	mutex_enter(&etp->e_ilock);
	e = list_head(&etp->e_ilist);
	while (e != NULL) {
		next_node = list_next(&etp->e_ilist, e);

		mutex_enter(&e->e_lock);
		if (e->e_ref > 0) {
			mutex_exit(&e->e_lock);
			e = next_node;
			continue;
		}

		ASSERT(e->e_ref == 0);
		if (e->e_itime > 0 &&
		    (e->e_itime + clnt_clts_endpoint_reap_interval) <
		    gethrestime_sec()) {
			e->e_flags &= ~ENDPNT_BOUND;
			(void) t_kclose(e->e_tiptr, 1);
			e->e_tiptr = NULL;
			e->e_itime = 0;
		}
		mutex_exit(&e->e_lock);
		e = next_node;
	}
	etp->e_itimer = 0;
	if (--etp->e_async_count == 0)
		cv_signal(&etp->e_async_cv);
	mutex_exit(&etp->e_ilock);
}

static void
endpnt_reclaim(zoneid_t zoneid)
{
	struct endpnt_type *np;
	struct endpnt *e;
	struct endpnt *next_node = NULL;
	list_t free_list;
	int rcnt = 0;

	list_create(&free_list, sizeof (endpnt_t), offsetof(endpnt_t, e_node));

	RPCLOG0(1, "endpnt_reclaim: reclaim callback started\n");
	rw_enter(&endpnt_type_lock, RW_READER);
	for (np = endpnt_type_list; np != NULL; np = np->e_next) {
		if (zoneid != ALL_ZONES && zoneid != np->e_zoneid)
			continue;

		mutex_enter(&np->e_plock);
		RPCLOG(1, "endpnt_reclaim: protofmly %s, ",
		    np->e_protofmly);
		RPCLOG(1, "rdev %ld\n", np->e_rdev);
		RPCLOG(1, "endpnt_reclaim: found %d endpoint(s)\n",
		    np->e_cnt);

		if (np->e_cnt == 0) {
			mutex_exit(&np->e_plock);
			continue;
		}

		/*
		 * The nice thing about maintaining an idle list is that if
		 * there are any endpoints to reclaim, they are going to be
		 * on this list. Just go through and reap the ones that
		 * have ref counts of zero.
		 */
		mutex_enter(&np->e_ilock);
		e = list_head(&np->e_ilist);
		while (e != NULL) {
			next_node = list_next(&np->e_ilist, e);
			mutex_enter(&e->e_lock);
			if (e->e_ref > 0) {
				mutex_exit(&e->e_lock);
				e = next_node;
				continue;
			}
			ASSERT(e->e_ref == 0);
			mutex_exit(&e->e_lock);

			list_remove(&np->e_ilist, e);
			list_remove(&np->e_pool, e);
			list_insert_head(&free_list, e);

			rcnt++;
			np->e_cnt--;
			e = next_node;
		}
		mutex_exit(&np->e_ilock);
		/*
		 * Reset the current pointer to be safe
		 */
		if ((e = (struct endpnt *)list_head(&np->e_pool)) != NULL)
			np->e_pcurr = e;
		else {
			ASSERT(np->e_cnt == 0);
			np->e_pcurr = NULL;
		}

		mutex_exit(&np->e_plock);
	}
	rw_exit(&endpnt_type_lock);

	while ((e = list_head(&free_list)) != NULL) {
		list_remove(&free_list, e);
		if (e->e_tiptr != NULL)
			(void) t_kclose(e->e_tiptr, 1);

		cv_destroy(&e->e_cv);
		mutex_destroy(&e->e_lock);
		kmem_cache_free(endpnt_cache, e);
	}
	list_destroy(&free_list);
	RPCLOG(1, "endpnt_reclaim: reclaimed %d endpoint(s)\n", rcnt);
}
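/*
 * Added note (assumption, not in the original source): endpnt_destructor
 * below is presumably registered via zone_key_create() using
 * endpnt_destructor_key during subsystem initialization elsewhere in this
 * file, so that it runs once per zone as the zone shuts down.
 */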
/*
 * Endpoint reclaim zones destructor callback routine.
 *
 * After reclaiming any cached entries, we walk the endpnt_type list,
 * canceling outstanding timeouts and freeing data structures.
 */
/* ARGSUSED */
static void
endpnt_destructor(zoneid_t zoneid, void *a)
{
	struct endpnt_type **npp;
	struct endpnt_type *np;
	struct endpnt_type *free_list = NULL;
	timeout_id_t t_id = 0;
	extern void clcleanup_zone(zoneid_t);
	extern void clcleanup4_zone(zoneid_t);

	/* Make sure NFS client handles are released. */
	clcleanup_zone(zoneid);
	clcleanup4_zone(zoneid);

	endpnt_reclaim(zoneid);
	/*
	 * We don't need to be holding on to any locks across the call to
	 * endpnt_reclaim() and the code below; we know that no one can
	 * be holding open connections for this zone (all processes and kernel
	 * threads are gone), so nothing could be adding anything to the list.
	 */
	rw_enter(&endpnt_type_lock, RW_WRITER);
	npp = &endpnt_type_list;
	while ((np = *npp) != NULL) {
		if (np->e_zoneid != zoneid) {
			npp = &np->e_next;
			continue;
		}
		mutex_enter(&np->e_plock);
		mutex_enter(&np->e_ilock);
		if (np->e_itimer != 0) {
			t_id = np->e_itimer;
			np->e_itimer = 0;
		}
		ASSERT(np->e_cnt == 0);
		ASSERT(list_head(&np->e_pool) == NULL);
		ASSERT(list_head(&np->e_ilist) == NULL);

		mutex_exit(&np->e_ilock);
		mutex_exit(&np->e_plock);

		/*
		 * untimeout() any outstanding timers that have not yet fired.
		 */
		if (t_id != (timeout_id_t)0)
			(void) untimeout(t_id);
		*npp = np->e_next;
		np->e_next = free_list;
		free_list = np;
	}
	rw_exit(&endpnt_type_lock);

	while (free_list != NULL) {
		np = free_list;
		free_list = free_list->e_next;
		/*
		 * Wait for threads in endpnt_taskq trying to reap endpnt_ts in
		 * the endpnt_type_t.
		 */
		mutex_enter(&np->e_ilock);
		while (np->e_async_count > 0)
			cv_wait(&np->e_async_cv, &np->e_ilock);
		cv_destroy(&np->e_async_cv);
		mutex_destroy(&np->e_plock);
		mutex_destroy(&np->e_ilock);
		list_destroy(&np->e_pool);
		list_destroy(&np->e_ilist);
		kmem_free(np, sizeof (endpnt_type_t));
	}
}

/*
 * Endpoint reclaim kmem callback routine.
 */
/* ARGSUSED */
static void
endpnt_repossess(void *a)
{
	/*
	 * Reclaim idle endpnt's from all zones.
	 */
	if (endpnt_taskq != NULL)
		(void) taskq_dispatch(endpnt_taskq,
		    (task_func_t *)(uintptr_t)endpnt_reclaim, (void *)ALL_ZONES,
		    TQ_NOSLEEP);
}
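
/*
 * Illustrative sketch (comment only, not compiled): endpnt_repossess()
 * runs because it is installed as the reclaim callback when the endpnt
 * cache is created in clnt_clts_init() below; when the system is short
 * on memory, kmem invokes it and idle endpoints are handed back.  The
 * call mirrors the real one in clnt_clts_init():
 *
 *	endpnt_cache = kmem_cache_create("clnt_clts_endpnt_cache",
 *	    sizeof (struct endpnt), 0,
 *	    NULL, NULL,			(no constructor/destructor)
 *	    endpnt_repossess,		(reclaim callback)
 *	    NULL, NULL, 0);
 */
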
/*
 * RPC request dispatch routine. Constructs a datagram message and wraps it
 * around the RPC request to pass downstream.
 */
static int
clnt_clts_dispatch_send(queue_t *q, mblk_t *mp, struct netbuf *addr,
	calllist_t *cp, uint_t xid, cred_t *cr)
{
	mblk_t *bp;
	int msgsz;
	struct T_unitdata_req *udreq;

	/*
	 * Set up the call record.
	 */
	cp->call_wq = q;
	cp->call_xid = xid;
	cp->call_status = RPC_TIMEDOUT;
	cp->call_notified = FALSE;
	RPCLOG(64,
	    "clnt_clts_dispatch_send: putting xid 0x%x on "
	    "dispatch list\n", xid);
	cp->call_hash = call_hash(xid, clnt_clts_hash_size);
	cp->call_bucket = &clts_call_ht[cp->call_hash];
	call_table_enter(cp);

	/*
	 * Construct the datagram
	 */
	msgsz = (int)TUNITDATAREQSZ;
	/*
	 * Note: if the receiver uses SCM_UCRED/getpeerucred the pid will
	 * appear as -1.
	 */
	while (!(bp = allocb_cred(msgsz + addr->len, cr, NOPID))) {
		if (strwaitbuf(msgsz + addr->len, BPRI_LO))
			return (ENOSR);
	}

	udreq = (struct T_unitdata_req *)bp->b_wptr;
	udreq->PRIM_type = T_UNITDATA_REQ;
	udreq->DEST_length = addr->len;

	if (addr->len) {
		bcopy(addr->buf, bp->b_wptr + msgsz, addr->len);
		udreq->DEST_offset = (t_scalar_t)msgsz;
		msgsz += addr->len;
	} else
		udreq->DEST_offset = 0;
	udreq->OPT_length = 0;
	udreq->OPT_offset = 0;

	bp->b_datap->db_type = M_PROTO;
	bp->b_wptr += msgsz;

	/*
	 * Link the datagram header with the actual data
	 */
	linkb(bp, mp);

	/*
	 * Send downstream.
	 */
	if (canput(cp->call_wq)) {
		put(cp->call_wq, bp);
		return (0);
	}

	return (EIO);
}
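
/*
 * Illustrative sketch (comment only): the shape of the message that
 * clnt_clts_dispatch_send() puts downstream when addr->len is non-zero.
 * The destination address is copied directly behind the T_unitdata_req
 * header in the same mblk, and the serialized call is linked on with
 * linkb():
 *
 *	bp (M_PROTO)                                   mp (RPC request)
 *	+-----------------------+--------------------+ +------------------+
 *	| struct T_unitdata_req | destination address|>| XDR-encoded call |
 *	+-----------------------+--------------------+ +------------------+
 *
 * so that, just before linkb(bp, mp):
 *
 *	udreq->DEST_offset == TUNITDATAREQSZ
 *	udreq->DEST_length == addr->len
 *	bp->b_wptr - bp->b_rptr == TUNITDATAREQSZ + addr->len
 */
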
/*
 * RPC response delivery routine. Deliver the response to the waiting
 * thread by matching the xid.
 */
void
clnt_clts_dispatch_notify(mblk_t *mp, int resp_off, zoneid_t zoneid)
{
	calllist_t *e = NULL;
	call_table_t *chtp;
	uint32_t xid;
	uint_t hash;
	unsigned char *hdr_offset;
	mblk_t *resp;

	/*
	 * If the RPC response is not contained in the same mblk as the
	 * datagram header, then move to the next mblk.
	 */
	hdr_offset = mp->b_rptr;
	resp = mp;
	if ((mp->b_wptr - (mp->b_rptr + resp_off)) == 0)
		resp = mp->b_cont;
	else
		resp->b_rptr += resp_off;

	ASSERT(resp != NULL);

	if ((IS_P2ALIGNED(resp->b_rptr, sizeof (uint32_t))) &&
	    (resp->b_wptr - resp->b_rptr) >= sizeof (xid))
		xid = *((uint32_t *)resp->b_rptr);
	else {
		int i = 0;
		unsigned char *p = (unsigned char *)&xid;
		unsigned char *rptr;
		mblk_t *tmp = resp;

		/*
		 * Copy the xid, byte-by-byte into xid.
		 */
		while (tmp) {
			rptr = tmp->b_rptr;
			while (rptr < tmp->b_wptr) {
				*p++ = *rptr++;
				if (++i >= sizeof (xid))
					goto done_xid_copy;
			}
			tmp = tmp->b_cont;
		}

		/*
		 * If we got here, we ran out of mblk space before the
		 * xid could be copied.
		 */
		ASSERT(tmp == NULL && i < sizeof (xid));

		RPCLOG0(1,
		    "clnt_dispatch_notify(clts): message less than "
		    "size of xid\n");

		freemsg(mp);
		return;
	}

done_xid_copy:

	/*
	 * Reset the read pointer back to the beginning of the protocol
	 * header if we moved it.
	 */
	if (mp->b_rptr != hdr_offset)
		mp->b_rptr = hdr_offset;

	hash = call_hash(xid, clnt_clts_hash_size);
	chtp = &clts_call_ht[hash];
	/* call_table_find returns with the hash bucket locked */
	call_table_find(chtp, xid, e);

	if (e != NULL) {
		mutex_enter(&e->call_lock);

		/*
		 * Verify that the reply is coming in on the same zone
		 * that it was sent from.
		 */
		if (e->call_zoneid != zoneid) {
			mutex_exit(&e->call_lock);
			mutex_exit(&chtp->ct_lock);
			RPCLOG0(8, "clnt_dispatch_notify (clts): incorrect "
			    "zoneid\n");
			freemsg(mp);
			return;
		}

		/*
		 * Found a thread waiting for this reply.
		 */
		if (e->call_reply) {
			RPCLOG(8,
			    "clnt_dispatch_notify (clts): discarding old "
			    "reply for xid 0x%x\n",
			    xid);
			freemsg(e->call_reply);
		}
		e->call_notified = TRUE;
		e->call_reply = mp;
		e->call_status = RPC_SUCCESS;
		cv_signal(&e->call_cv);
		mutex_exit(&e->call_lock);
		mutex_exit(&chtp->ct_lock);
	} else {
		zone_t *zone;
		struct rpcstat *rpcstat;

		mutex_exit(&chtp->ct_lock);
		RPCLOG(8, "clnt_dispatch_notify (clts): no caller for reply "
		    "0x%x\n", xid);
		freemsg(mp);
		/*
		 * This is unfortunate, but we need to look up the zone so we
		 * can increment its "rcbadxids" counter.
		 */
		zone = zone_find_by_id(zoneid);
		if (zone == NULL) {
			/*
			 * The zone went away...
			 */
			return;
		}
		rpcstat = zone_getspecific(rpcstat_zone_key, zone);
		if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) {
			/*
			 * Not interested
			 */
			zone_rele(zone);
			return;
		}
		RCSTAT_INCR(rpcstat->rpc_clts_client, rcbadxids);
		zone_rele(zone);
	}
}

/*
 * Init routine. Called when rpcmod is loaded.
 */
void
clnt_clts_init(void)
{
	endpnt_cache = kmem_cache_create("clnt_clts_endpnt_cache",
	    sizeof (struct endpnt), 0, NULL, NULL, endpnt_repossess, NULL,
	    NULL, 0);

	rw_init(&endpnt_type_lock, NULL, RW_DEFAULT, NULL);

	/*
	 * Perform simple bounds checking to make sure that the setting is
	 * reasonable.
	 */
	if (clnt_clts_max_endpoints <= 0) {
		if (clnt_clts_do_bindresvport)
			clnt_clts_max_endpoints = RESERVED_PORTSPACE;
		else
			clnt_clts_max_endpoints = NONRESERVED_PORTSPACE;
	}

	if (clnt_clts_do_bindresvport &&
	    clnt_clts_max_endpoints > RESERVED_PORTSPACE)
		clnt_clts_max_endpoints = RESERVED_PORTSPACE;
	else if (clnt_clts_max_endpoints > NONRESERVED_PORTSPACE)
		clnt_clts_max_endpoints = NONRESERVED_PORTSPACE;

	if (clnt_clts_hash_size < DEFAULT_MIN_HASH_SIZE)
		clnt_clts_hash_size = DEFAULT_MIN_HASH_SIZE;

	/*
	 * Defer creating the taskq until rpcmod gets pushed. If we are
	 * in diskless boot mode, rpcmod will get loaded early even before
	 * thread_create() is available.
	 */
	endpnt_taskq = NULL;
	taskq_created = FALSE;
	mutex_init(&endpnt_taskq_lock, NULL, MUTEX_DEFAULT, NULL);

	if (clnt_clts_endpoint_reap_interval < DEFAULT_ENDPOINT_REAP_INTERVAL)
		clnt_clts_endpoint_reap_interval =
		    DEFAULT_ENDPOINT_REAP_INTERVAL;

	/*
	 * Dispatch the taskq at an interval which is offset from the
	 * interval that the endpoints should be reaped.
	 */
	clnt_clts_taskq_dispatch_interval =
	    (clnt_clts_endpoint_reap_interval + DEFAULT_INTERVAL_SHIFT) * hz;

	/*
	 * Initialize the completion queue.
	 */
	clts_call_ht = call_table_init(clnt_clts_hash_size);
	/*
	 * Initialize the zone destructor callback.
	 */
	zone_key_create(&endpnt_destructor_key, NULL, NULL, endpnt_destructor);
}

void
clnt_clts_fini(void)
{
	(void) zone_key_delete(endpnt_destructor_key);
}
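
/*
 * Illustrative sketch (comment only): the bounds checking in
 * clnt_clts_init() exists because the tunables can be overridden before
 * the module loads, e.g. from /etc/system.  The lines below are
 * hypothetical examples, not recommendations, and assume these symbols
 * are resolved through rpcmod:
 *
 *	set rpcmod:clnt_clts_hash_size = 1024
 *	set rpcmod:clnt_clts_endpoint_reap_interval = 120
 *	set rpcmod:clnt_clts_max_endpoints = 512
 */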