/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2008, The Ohio State University. All rights reserved.
 *
 * Portions of this source code is developed by the team members of
 * The Ohio State University's Network-Based Computing Laboratory (NBCL),
 * headed by Professor Dhabaleswar K. (DK) Panda.
 *
 * Acknowledgements to contributions from developers:
 *   Ranjit Noronha: noronha@cse.ohio-state.edu
 *   Lei Chai      : chail@cse.ohio-state.edu
 *   Weikuan Yu    : yuw@cse.ohio-state.edu
 *
 */

/*
 * RDMA transport framework (RDMATF) support routines: plugin module
 * registration, chunk-list (clist) management, and buffer helpers used
 * by the RPC-over-RDMA client and server code.
 */

#include <sys/systm.h>
#include <sys/kstat.h>
#include <sys/modctl.h>
#include <sys/sdt.h>
#include <rpc/rpc_rdma.h>

#include <sys/ib/ibtl/ibti.h>

/* Minimum chunk size handed to the RDMA transport (tunable). */
uint_t rdma_minchunk = RDMA_MINCHUNK;

/*
 * Globals
 */
int rdma_modloaded = 0;		/* flag to load RDMA plugin modules */
int rdma_dev_available = 0;	/* if any RDMA device is loaded */
kmutex_t rdma_modload_lock;	/* protects rdma_modloaded flag */

/* Condition variable/state used to notify the NFS service of HCA events. */
rdma_svc_wait_t rdma_wait;

rdma_registry_t	*rdma_mod_head = NULL;	/* head for RDMA modules */
krwlock_t	rdma_lock;		/* protects rdma_mod_head list */
ldi_ident_t rpcmod_li = NULL;	/* identifies us with ldi_ framework */

/* kmem cache backing clist_alloc()/clist_free(); created in rdma_modload(). */
kmem_cache_t *clist_cache = NULL;

/*
 * Statics
 */
ldi_handle_t rpcib_handle = NULL;

/*
 * Externs
 */
extern	kstat_named_t	*rdmarcstat_ptr;
extern	uint_t		rdmarcstat_ndata;
extern	kstat_named_t	*rdmarsstat_ptr;
extern	uint_t		rdmarsstat_ndata;

void rdma_kstat_init();

/*
 * RDMATF module registration routine.
 * This routine is expected to be called by the init routine in
 * the plugin modules.
 */
rdma_stat
rdma_register_mod(rdma_mod_t *mod)
{
	rdma_registry_t **mp, *m;

	/* Reject plugins built against a different RDMATF interface rev. */
	if (mod->rdma_version != RDMATF_VERS) {
		return (RDMA_BADVERS);
	}

	rw_enter(&rdma_lock, RW_WRITER);
	/*
	 * Ensure not already registered
	 */
	mp = &rdma_mod_head;
	while (*mp != NULL) {
		if (strncmp((*mp)->r_mod->rdma_api, mod->rdma_api,
		    KNC_STRSIZE) == 0) {
			if ((*mp)->r_mod_state == RDMA_MOD_INACTIVE) {
				/*
				 * Reactivate an entry left behind by a
				 * previous rdma_unregister_mod(): refresh
				 * the ops vector and device count from the
				 * reloaded plugin rather than allocating a
				 * new registry entry.
				 */
				(*mp)->r_mod_state = RDMA_MOD_ACTIVE;
				(*mp)->r_mod->rdma_ops = mod->rdma_ops;
				(*mp)->r_mod->rdma_count = mod->rdma_count;
				goto announce_hca;
			}
			rw_exit(&rdma_lock);
			return (RDMA_REG_EXIST);
		}
		mp = &((*mp)->r_next);
	}

	/*
	 * New one, create and add to registry
	 */
	m = kmem_alloc(sizeof (rdma_registry_t), KM_SLEEP);
	m->r_mod = kmem_alloc(sizeof (rdma_mod_t), KM_SLEEP);
	*m->r_mod = *mod;
	m->r_next = NULL;
	/* Deep-copy the api name; zalloc + explicit NUL guarantees termination. */
	m->r_mod->rdma_api = kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
	(void) strncpy(m->r_mod->rdma_api, mod->rdma_api, KNC_STRSIZE);
	m->r_mod->rdma_api[KNC_STRSIZE - 1] = '\0';
	m->r_mod_state = RDMA_MOD_ACTIVE;
	*mp = m;	/* link at the tail found by the scan above */

announce_hca:
	rw_exit(&rdma_lock);
	/*
	 * Start the nfs service on the rdma xprts.
	 * (this notification mechanism will need to change when we support
	 * multiple hcas and have support for multiple rdma plugins).
	 */
	mutex_enter(&rdma_wait.svc_lock);
	rdma_wait.svc_stat = RDMA_HCA_ATTACH;
	cv_signal(&rdma_wait.svc_cv);
	mutex_exit(&rdma_wait.svc_lock);

	return (RDMA_SUCCESS);
}

/*
 * RDMATF module unregistration routine.
 * This routine is expected to be called by the fini routine in
 * the plugin modules.
 */
rdma_stat
rdma_unregister_mod(rdma_mod_t *mod)
{
	rdma_registry_t **m, *mmod = NULL;

	rw_enter(&rdma_lock, RW_WRITER);

	m = &rdma_mod_head;
	while (*m != NULL) {
		if (strncmp((*m)->r_mod->rdma_api, mod->rdma_api,
		    KNC_STRSIZE) != 0) {
			m = &((*m)->r_next);
			continue;
		}
		/*
		 * Check if any device attached, if so return error
		 *
		 * NOTE(review): this tests the caller-supplied mod's
		 * rdma_count, not the registry copy ((*m)->r_mod) --
		 * presumably the plugin keeps its own count current;
		 * confirm against the plugin fini paths.
		 */
		if (mod->rdma_count != 0) {
			rw_exit(&rdma_lock);
			return (RDMA_FAILED);
		}
		/*
		 * Found entry. Mark it inactive.  The registry entry is
		 * kept (not freed) so a later rdma_register_mod() can
		 * reactivate it in place.
		 */
		mmod = *m;
		mmod->r_mod->rdma_count = 0;
		mmod->r_mod_state = RDMA_MOD_INACTIVE;
		break;
	}

	/*
	 * NOTE(review): these flags are cleared even when no matching
	 * entry was found above; rdma_modload() will then reload the
	 * plugin on next use.
	 */
	rdma_modloaded = 0;
	rdma_dev_available = 0;
	rw_exit(&rdma_lock);

	/*
	 * Stop the nfs service running on the rdma xprts.
	 * (this notification mechanism will need to change when we support
	 * multiple hcas and have support for multiple rdma plugins).
	 */
	mutex_enter(&rdma_wait.svc_lock);
	rdma_wait.svc_stat = RDMA_HCA_DETACH;
	cv_signal(&rdma_wait.svc_cv);
	mutex_exit(&rdma_wait.svc_lock);

	/*
	 * Success is returned whether or not a matching entry was found.
	 */
	return (RDMA_SUCCESS);
}

/*
 * Allocate a zeroed chunk-list entry from the clist kmem cache.
 * May sleep (KM_SLEEP); caller frees via clist_free().
 */
struct clist *
clist_alloc(void)
{
	struct clist *clp;

	clp = kmem_cache_alloc(clist_cache, KM_SLEEP);

	bzero(clp, sizeof (*clp));

	return (clp);
}

/*
 * Creates a new chunk list entry, and
 * adds it to the end of a chunk list.
212 */ 213 void 214 clist_add(struct clist **clp, uint32_t xdroff, int len, 215 struct mrc *shandle, caddr_t saddr, 216 struct mrc *dhandle, caddr_t daddr) 217 { 218 struct clist *cl; 219 220 /* Find the end of the list */ 221 222 while (*clp != NULL) 223 clp = &((*clp)->c_next); 224 225 cl = clist_alloc(); 226 cl->c_xdroff = xdroff; 227 cl->c_len = len; 228 cl->w.c_saddr = (uint64_t)(uintptr_t)saddr; 229 if (shandle) 230 cl->c_smemhandle = *shandle; 231 cl->u.c_daddr = (uint64_t)(uintptr_t)daddr; 232 if (dhandle) 233 cl->c_dmemhandle = *dhandle; 234 cl->c_next = NULL; 235 236 *clp = cl; 237 } 238 239 rdma_stat 240 clist_register(CONN *conn, struct clist *cl, clist_dstsrc dstsrc) 241 { 242 struct clist *c; 243 int status; 244 245 for (c = cl; c; c = c->c_next) { 246 if (c->c_len <= 0) 247 continue; 248 switch (dstsrc) { 249 case CLIST_REG_SOURCE: 250 status = RDMA_REGMEMSYNC(conn, 251 (caddr_t)(struct as *)cl->c_adspc, 252 (caddr_t)(uintptr_t)c->w.c_saddr3, c->c_len, 253 &c->c_smemhandle, (void **)&c->c_ssynchandle, 254 (void *)c->rb_longbuf.rb_private); 255 break; 256 case CLIST_REG_DST: 257 status = RDMA_REGMEMSYNC(conn, 258 (caddr_t)(struct as *)cl->c_adspc, 259 (caddr_t)(uintptr_t)c->u.c_daddr3, c->c_len, 260 &c->c_dmemhandle, (void **)&c->c_dsynchandle, 261 (void *)c->rb_longbuf.rb_private); 262 break; 263 default: 264 return (RDMA_INVAL); 265 } 266 if (status != RDMA_SUCCESS) { 267 (void) clist_deregister(conn, cl, dstsrc); 268 return (status); 269 } 270 } 271 272 return (RDMA_SUCCESS); 273 } 274 275 rdma_stat 276 clist_deregister(CONN *conn, struct clist *cl, clist_dstsrc dstsrc) 277 { 278 struct clist *c; 279 280 for (c = cl; c; c = c->c_next) { 281 switch (dstsrc) { 282 case CLIST_REG_SOURCE: 283 if (c->c_smemhandle.mrc_rmr != 0) { 284 (void) RDMA_DEREGMEMSYNC(conn, 285 (caddr_t)(uintptr_t)c->w.c_saddr3, 286 c->c_smemhandle, 287 (void *)(uintptr_t)c->c_ssynchandle, 288 (void *)c->rb_longbuf.rb_private); 289 c->c_smemhandle.mrc_rmr = 0; 290 c->c_ssynchandle 
= NULL; 291 } 292 break; 293 case CLIST_REG_DST: 294 if (c->c_dmemhandle.mrc_rmr != 0) { 295 (void) RDMA_DEREGMEMSYNC(conn, 296 (caddr_t)(uintptr_t)c->u.c_daddr3, 297 c->c_dmemhandle, 298 (void *)(uintptr_t)c->c_dsynchandle, 299 (void *)c->rb_longbuf.rb_private); 300 c->c_dmemhandle.mrc_rmr = 0; 301 c->c_dsynchandle = NULL; 302 } 303 break; 304 default: 305 return (RDMA_INVAL); 306 } 307 } 308 309 return (RDMA_SUCCESS); 310 } 311 312 rdma_stat 313 clist_syncmem(CONN *conn, struct clist *cl, clist_dstsrc dstsrc) 314 { 315 struct clist *c; 316 rdma_stat status; 317 318 c = cl; 319 switch (dstsrc) { 320 case CLIST_REG_SOURCE: 321 while (c != NULL) { 322 if (c->c_ssynchandle) { 323 status = RDMA_SYNCMEM(conn, 324 (void *)(uintptr_t)c->c_ssynchandle, 325 (caddr_t)(uintptr_t)c->w.c_saddr3, 326 c->c_len, 0); 327 if (status != RDMA_SUCCESS) 328 return (status); 329 } 330 c = c->c_next; 331 } 332 break; 333 case CLIST_REG_DST: 334 while (c != NULL) { 335 if (c->c_ssynchandle) { 336 status = RDMA_SYNCMEM(conn, 337 (void *)(uintptr_t)c->c_dsynchandle, 338 (caddr_t)(uintptr_t)c->u.c_daddr3, 339 c->c_len, 1); 340 if (status != RDMA_SUCCESS) 341 return (status); 342 } 343 c = c->c_next; 344 } 345 break; 346 default: 347 return (RDMA_INVAL); 348 } 349 350 return (RDMA_SUCCESS); 351 } 352 353 /* 354 * Frees up entries in chunk list 355 */ 356 void 357 clist_free(struct clist *cl) 358 { 359 struct clist *c = cl; 360 361 while (c != NULL) { 362 cl = cl->c_next; 363 kmem_cache_free(clist_cache, c); 364 c = cl; 365 } 366 } 367 368 rdma_stat 369 rdma_clnt_postrecv(CONN *conn, uint32_t xid) 370 { 371 struct clist *cl = NULL; 372 rdma_stat retval; 373 rdma_buf_t rbuf = {0}; 374 375 rbuf.type = RECV_BUFFER; 376 if (RDMA_BUF_ALLOC(conn, &rbuf)) { 377 return (RDMA_NORESOURCE); 378 } 379 380 clist_add(&cl, 0, rbuf.len, &rbuf.handle, rbuf.addr, 381 NULL, NULL); 382 retval = RDMA_CLNT_RECVBUF(conn, cl, xid); 383 clist_free(cl); 384 385 return (retval); 386 } 387 388 rdma_stat 389 
rdma_clnt_postrecv_remove(CONN *conn, uint32_t xid) 390 { 391 return (RDMA_CLNT_RECVBUF_REMOVE(conn, xid)); 392 } 393 394 rdma_stat 395 rdma_svc_postrecv(CONN *conn) 396 { 397 struct clist *cl = NULL; 398 rdma_stat retval; 399 rdma_buf_t rbuf = {0}; 400 401 rbuf.type = RECV_BUFFER; 402 if (RDMA_BUF_ALLOC(conn, &rbuf)) { 403 retval = RDMA_NORESOURCE; 404 } else { 405 clist_add(&cl, 0, rbuf.len, &rbuf.handle, rbuf.addr, 406 NULL, NULL); 407 retval = RDMA_SVC_RECVBUF(conn, cl); 408 clist_free(cl); 409 } 410 return (retval); 411 } 412 413 rdma_stat 414 rdma_buf_alloc(CONN *conn, rdma_buf_t *rbuf) 415 { 416 return (RDMA_BUF_ALLOC(conn, rbuf)); 417 } 418 419 void 420 rdma_buf_free(CONN *conn, rdma_buf_t *rbuf) 421 { 422 if (!rbuf || rbuf->addr == NULL) { 423 return; 424 } 425 RDMA_BUF_FREE(conn, rbuf); 426 bzero(rbuf, sizeof (rdma_buf_t)); 427 } 428 429 /* 430 * Caller is holding rdma_modload_lock mutex 431 */ 432 int 433 rdma_modload() 434 { 435 int status; 436 ASSERT(MUTEX_HELD(&rdma_modload_lock)); 437 /* 438 * Load all available RDMA plugins which right now is only IB plugin. 439 * If no IB hardware is present, then quit right away. 440 * ENODEV -- For no device on the system 441 * EPROTONOSUPPORT -- For module not avilable either due to failure to 442 * load or some other reason. 443 */ 444 rdma_modloaded = 1; 445 if (ibt_hw_is_present() == 0) { 446 rdma_dev_available = 0; 447 return (ENODEV); 448 } 449 450 rdma_dev_available = 1; 451 if (rpcmod_li == NULL) 452 return (EPROTONOSUPPORT); 453 454 status = ldi_open_by_name("/devices/ib/rpcib@0:rpcib", 455 FREAD | FWRITE, kcred, 456 &rpcib_handle, rpcmod_li); 457 458 if (status != 0) 459 return (EPROTONOSUPPORT); 460 461 462 /* 463 * We will need to reload the plugin module after it was unregistered 464 * but the resources below need to allocated only the first time. 
465 */ 466 if (!clist_cache) { 467 clist_cache = kmem_cache_create("rdma_clist", 468 sizeof (struct clist), _POINTER_ALIGNMENT, NULL, 469 NULL, NULL, NULL, 0, 0); 470 rdma_kstat_init(); 471 } 472 473 (void) ldi_close(rpcib_handle, FREAD|FWRITE, kcred); 474 475 return (0); 476 } 477 478 void 479 rdma_kstat_init(void) 480 { 481 kstat_t *ksp; 482 483 /* 484 * The RDMA framework doesn't know how to deal with Zones, and is 485 * only available in the global zone. 486 */ 487 ASSERT(INGLOBALZONE(curproc)); 488 ksp = kstat_create_zone("unix", 0, "rpc_rdma_client", "rpc", 489 KSTAT_TYPE_NAMED, rdmarcstat_ndata, 490 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, GLOBAL_ZONEID); 491 if (ksp) { 492 ksp->ks_data = (void *) rdmarcstat_ptr; 493 kstat_install(ksp); 494 } 495 496 ksp = kstat_create_zone("unix", 0, "rpc_rdma_server", "rpc", 497 KSTAT_TYPE_NAMED, rdmarsstat_ndata, 498 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, GLOBAL_ZONEID); 499 if (ksp) { 500 ksp->ks_data = (void *) rdmarsstat_ptr; 501 kstat_install(ksp); 502 } 503 } 504 505 rdma_stat 506 rdma_kwait(void) 507 { 508 int ret; 509 rdma_stat stat; 510 511 mutex_enter(&rdma_wait.svc_lock); 512 513 ret = cv_wait_sig(&rdma_wait.svc_cv, &rdma_wait.svc_lock); 514 515 /* 516 * If signalled by a hca attach/detach, pass the right 517 * stat back. 518 */ 519 520 if (ret) 521 stat = rdma_wait.svc_stat; 522 else 523 stat = RDMA_INTR; 524 525 mutex_exit(&rdma_wait.svc_lock); 526 527 return (stat); 528 } 529