xref: /illumos-gate/usr/src/uts/common/rpc/rdma_subr.c (revision 7b4e981c32b1b233ce13a79cac81c8e75937d3f5)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Copyright (c) 2008, The Ohio State University. All rights reserved.
28  *
29  * Portions of this source code is developed by the team members of
30  * The Ohio State University's Network-Based Computing Laboratory (NBCL),
31  * headed by Professor Dhabaleswar K. (DK) Panda.
32  *
 * Acknowledgements to contributions from developers:
34  *   Ranjit Noronha: noronha@cse.ohio-state.edu
35  *   Lei Chai      : chail@cse.ohio-state.edu
36  *   Weikuan Yu    : yuw@cse.ohio-state.edu
37  *
38  */
39 
40 #include <sys/systm.h>
41 #include <sys/kstat.h>
42 #include <sys/modctl.h>
43 #include <sys/sdt.h>
44 #include <rpc/rpc_rdma.h>
45 
46 #include <sys/ib/ibtl/ibti.h>
47 
uint_t rdma_minchunk = RDMA_MINCHUNK;	/* tunable; defaults to RDMA_MINCHUNK */

/*
 * Globals
 */
int rdma_modloaded = 0;		/* flag to load RDMA plugin modules */
int rdma_dev_available = 0;	/* if any RDMA device is loaded */
kmutex_t rdma_modload_lock;	/* protects rdma_modloaded flag */

/* cv/lock/status used to notify the NFS service of HCA attach/detach */
rdma_svc_wait_t rdma_wait;

rdma_registry_t	*rdma_mod_head = NULL;	/* head for RDMA modules */
krwlock_t	rdma_lock;		/* protects rdma_mod_head list */
ldi_ident_t rpcmod_li = NULL;	/* identifies us with ldi_ framework */

kmem_cache_t *clist_cache = NULL;	/* allocation cache for struct clist */

/*
 * Statics
 */
ldi_handle_t rpcib_handle = NULL;	/* LDI handle onto the rpcib driver */

/*
 * Externs
 */
extern	kstat_named_t	*rdmarcstat_ptr;	/* RPC/RDMA client stats */
extern	uint_t		rdmarcstat_ndata;
extern	kstat_named_t	*rdmarsstat_ptr;	/* RPC/RDMA server stats */
extern	uint_t		rdmarsstat_ndata;

void rdma_kstat_init();
79 
/*
 * RDMATF module registration routine.
 * This routine is expected to be called by the init routine in
 * the plugin modules.
 *
 * Returns RDMA_BADVERS when the plugin was built against a different
 * RDMATF version, RDMA_REG_EXIST when an ACTIVE module with the same
 * rdma_api name is already registered, and RDMA_SUCCESS otherwise.
 */
rdma_stat
rdma_register_mod(rdma_mod_t *mod)
{
	rdma_registry_t **mp, *m;

	if (mod->rdma_version != RDMATF_VERS) {
		return (RDMA_BADVERS);
	}

	rw_enter(&rdma_lock, RW_WRITER);
	/*
	 * Ensure not already registered
	 */
	mp = &rdma_mod_head;
	while (*mp != NULL) {
		if (strncmp((*mp)->r_mod->rdma_api, mod->rdma_api,
		    KNC_STRSIZE) == 0) {
			if ((*mp)->r_mod_state == RDMA_MOD_INACTIVE) {
				/*
				 * An entry left behind by a previous
				 * unregister; reactivate it with the new
				 * ops vector and device count.
				 */
				(*mp)->r_mod_state = RDMA_MOD_ACTIVE;
				(*mp)->r_mod->rdma_ops = mod->rdma_ops;
				(*mp)->r_mod->rdma_count = mod->rdma_count;
				goto announce_hca;
			}
			rw_exit(&rdma_lock);
			return (RDMA_REG_EXIST);
		}
		mp = &((*mp)->r_next);
	}

	/*
	 * New one, create and add to registry
	 */
	m = kmem_alloc(sizeof (rdma_registry_t), KM_SLEEP);
	m->r_mod = kmem_alloc(sizeof (rdma_mod_t), KM_SLEEP);
	*m->r_mod = *mod;
	m->r_next = NULL;
	/* keep a private, always NUL-terminated copy of the api name */
	m->r_mod->rdma_api = kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
	(void) strncpy(m->r_mod->rdma_api, mod->rdma_api, KNC_STRSIZE);
	m->r_mod->rdma_api[KNC_STRSIZE - 1] = '\0';
	m->r_mod_state = RDMA_MOD_ACTIVE;
	*mp = m;

announce_hca:
	rw_exit(&rdma_lock);
	/*
	 * Start the nfs service on the rdma xprts.
	 * (this notification mechanism will need to change when we support
	 * multiple hcas and have support for multiple rdma plugins).
	 */
	mutex_enter(&rdma_wait.svc_lock);
	rdma_wait.svc_stat = RDMA_HCA_ATTACH;
	cv_signal(&rdma_wait.svc_cv);
	mutex_exit(&rdma_wait.svc_lock);

	return (RDMA_SUCCESS);
}
141 
/*
 * RDMATF module unregistration routine.
 * This routine is expected to be called by the fini routine in
 * the plugin modules.
 *
 * Fails with RDMA_FAILED when the plugin still reports attached
 * devices; otherwise marks the registry entry inactive (the entry is
 * kept so a later re-register can reactivate it) and wakes the NFS
 * service so it stops using the rdma xprts.
 */
rdma_stat
rdma_unregister_mod(rdma_mod_t *mod)
{
	rdma_registry_t **m, *mmod = NULL;

	rw_enter(&rdma_lock, RW_WRITER);

	m = &rdma_mod_head;
	while (*m != NULL) {
		if (strncmp((*m)->r_mod->rdma_api, mod->rdma_api,
		    KNC_STRSIZE) != 0) {
			m = &((*m)->r_next);
			continue;
		}
		/*
		 * Check if any device attached, if so return error.
		 * NOTE(review): this tests the caller-supplied mod, not
		 * the registry copy — presumably the plugin keeps its own
		 * rdma_mod_t current; verify against plugin callers.
		 */
		if (mod->rdma_count != 0) {
			rw_exit(&rdma_lock);
			return (RDMA_FAILED);
		}
		/*
		 * Found entry. Mark it inactive.
		 */
		mmod = *m;
		mmod->r_mod->rdma_count = 0;
		mmod->r_mod_state = RDMA_MOD_INACTIVE;
		break;
	}

	/* force a reload attempt the next time rdma_modload() is called */
	rdma_modloaded = 0;
	rdma_dev_available = 0;
	rw_exit(&rdma_lock);

	/*
	 * Stop the nfs service running on the rdma xprts.
	 * (this notification mechanism will need to change when we support
	 * multiple hcas and have support for multiple rdma plugins).
	 */
	mutex_enter(&rdma_wait.svc_lock);
	rdma_wait.svc_stat = RDMA_HCA_DETACH;
	cv_signal(&rdma_wait.svc_cv);
	mutex_exit(&rdma_wait.svc_lock);

	/*
	 * RDMA_SUCCESS is returned whether or not a matching registry
	 * entry was found above.
	 */
	return (RDMA_SUCCESS);
}
196 
197 struct clist *
198 clist_alloc(void)
199 {
200 	struct clist *clp;
201 
202 	clp = kmem_cache_alloc(clist_cache, KM_SLEEP);
203 
204 	bzero(clp, sizeof (*clp));
205 
206 	return (clp);
207 }
208 
209 uint32_t
210 clist_len(struct clist *cl)
211 {
212 	uint32_t len = 0;
213 	while (cl) {
214 		len += cl->c_len;
215 		cl = cl->c_next;
216 	}
217 	return (len);
218 }
219 
220 void
221 clist_zero_len(struct clist *cl)
222 {
223 	while (cl != NULL) {
224 		if (cl->c_dmemhandle.mrc_rmr == 0)
225 			break;
226 		cl->c_len = 0;
227 		cl = cl->c_next;
228 	}
229 }
230 
231 /*
232  * Creates a new chunk list entry, and
233  * adds it to the end of a chunk list.
234  */
235 void
236 clist_add(struct clist **clp, uint32_t xdroff, int len,
237     struct mrc *shandle, caddr_t saddr,
238     struct mrc *dhandle, caddr_t daddr)
239 {
240 	struct clist *cl;
241 
242 	/* Find the end of the list */
243 
244 	while (*clp != NULL)
245 		clp = &((*clp)->c_next);
246 
247 	cl = clist_alloc();
248 	cl->c_xdroff = xdroff;
249 	cl->c_len = len;
250 	cl->w.c_saddr = (uint64_t)(uintptr_t)saddr;
251 	if (shandle)
252 		cl->c_smemhandle = *shandle;
253 	cl->u.c_daddr = (uint64_t)(uintptr_t)daddr;
254 	if (dhandle)
255 		cl->c_dmemhandle = *dhandle;
256 	cl->c_next = NULL;
257 
258 	*clp = cl;
259 }
260 
/*
 * Register the memory described by every entry of chunk list cl with
 * the RDMA plugin for conn, either as transfer source
 * (CLIST_REG_SOURCE) or destination (CLIST_REG_DST).  Entries with a
 * non-positive length are skipped.  On any failure the whole list is
 * deregistered again and the failing status is returned.
 */
rdma_stat
clist_register(CONN *conn, struct clist *cl, clist_dstsrc dstsrc)
{
	struct clist *c;
	int status;

	for (c = cl; c; c = c->c_next) {
		if (c->c_len <= 0)
			continue;

		/* remember how this entry was registered for deregister */
		c->c_regtype = dstsrc;

		switch (dstsrc) {
		case CLIST_REG_SOURCE:
			status = RDMA_REGMEMSYNC(conn,
			    (caddr_t)(struct as *)c->c_adspc,
			    (caddr_t)(uintptr_t)c->w.c_saddr3, c->c_len,
			    &c->c_smemhandle, (void **)&c->c_ssynchandle,
			    (void *)c->rb_longbuf.rb_private);
			break;
		case CLIST_REG_DST:
			status = RDMA_REGMEMSYNC(conn,
			    (caddr_t)(struct as *)c->c_adspc,
			    (caddr_t)(uintptr_t)c->u.c_daddr3, c->c_len,
			    &c->c_dmemhandle, (void **)&c->c_dsynchandle,
			    (void *)c->rb_longbuf.rb_private);
			break;
		default:
			return (RDMA_INVAL);
		}
		if (status != RDMA_SUCCESS) {
			/* roll back the registrations done so far */
			(void) clist_deregister(conn, cl);
			return (status);
		}
	}

	return (RDMA_SUCCESS);
}
299 
/*
 * Deregister every registered entry of chunk list cl with the RDMA
 * plugin for conn, according to the c_regtype recorded by
 * clist_register().  Entries whose memory handle is already zero (or
 * that were never registered) are skipped, so the routine is safe to
 * call repeatedly and on partially registered lists.  Always returns
 * RDMA_SUCCESS.
 */
rdma_stat
clist_deregister(CONN *conn, struct clist *cl)
{
	struct clist *c;

	for (c = cl; c; c = c->c_next) {
		switch (c->c_regtype) {
		case CLIST_REG_SOURCE:
			if (c->c_smemhandle.mrc_rmr != 0) {
				(void) RDMA_DEREGMEMSYNC(conn,
				    (caddr_t)(uintptr_t)c->w.c_saddr3,
				    c->c_smemhandle,
				    (void *)(uintptr_t)c->c_ssynchandle,
				    (void *)c->rb_longbuf.rb_private);
				/* zero the handles to mark it unregistered */
				c->c_smemhandle.mrc_rmr = 0;
				c->c_ssynchandle = 0;
			}
			break;
		case CLIST_REG_DST:
			if (c->c_dmemhandle.mrc_rmr != 0) {
				(void) RDMA_DEREGMEMSYNC(conn,
				    (caddr_t)(uintptr_t)c->u.c_daddr3,
				    c->c_dmemhandle,
				    (void *)(uintptr_t)c->c_dsynchandle,
				    (void *)c->rb_longbuf.rb_private);
				/* zero the handles to mark it unregistered */
				c->c_dmemhandle.mrc_rmr = 0;
				c->c_dsynchandle = 0;
			}
			break;
		default:
			/* clist unregistered. continue */
			break;
		}
	}

	return (RDMA_SUCCESS);
}
337 
338 rdma_stat
339 clist_syncmem(CONN *conn, struct clist *cl, clist_dstsrc dstsrc)
340 {
341 	struct clist *c;
342 	rdma_stat status;
343 
344 	c = cl;
345 	switch (dstsrc) {
346 	case CLIST_REG_SOURCE:
347 		while (c != NULL) {
348 			if (c->c_ssynchandle) {
349 				status = RDMA_SYNCMEM(conn,
350 				    (void *)(uintptr_t)c->c_ssynchandle,
351 				    (caddr_t)(uintptr_t)c->w.c_saddr3,
352 				    c->c_len, 0);
353 				if (status != RDMA_SUCCESS)
354 					return (status);
355 			}
356 			c = c->c_next;
357 		}
358 		break;
359 	case CLIST_REG_DST:
360 		while (c != NULL) {
361 			if (c->c_ssynchandle) {
362 				status = RDMA_SYNCMEM(conn,
363 				    (void *)(uintptr_t)c->c_dsynchandle,
364 				    (caddr_t)(uintptr_t)c->u.c_daddr3,
365 				    c->c_len, 1);
366 				if (status != RDMA_SUCCESS)
367 					return (status);
368 			}
369 			c = c->c_next;
370 		}
371 		break;
372 	default:
373 		return (RDMA_INVAL);
374 	}
375 
376 	return (RDMA_SUCCESS);
377 }
378 
379 /*
380  * Frees up entries in chunk list
381  */
382 void
383 clist_free(struct clist *cl)
384 {
385 	struct clist *c = cl;
386 
387 	while (c != NULL) {
388 		cl = cl->c_next;
389 		kmem_cache_free(clist_cache, c);
390 		c = cl;
391 	}
392 }
393 
/*
 * Allocate a receive buffer on conn and hand it to the plugin via
 * RDMA_CLNT_RECVBUF for the reply to the RPC call identified by xid.
 * The temporary clist describing the buffer is freed before
 * returning.  Returns RDMA_NORESOURCE if no buffer could be
 * allocated, otherwise the plugin's status.
 */
rdma_stat
rdma_clnt_postrecv(CONN *conn, uint32_t xid)
{
	struct clist *cl = NULL;
	rdma_stat retval;
	rdma_buf_t rbuf = {0};

	rbuf.type = RECV_BUFFER;
	if (RDMA_BUF_ALLOC(conn, &rbuf)) {
		return (RDMA_NORESOURCE);
	}

	clist_add(&cl, 0, rbuf.len, &rbuf.handle, rbuf.addr,
	    NULL, NULL);
	retval = RDMA_CLNT_RECVBUF(conn, cl, xid);
	clist_free(cl);

	return (retval);
}
413 
/*
 * Ask the plugin to remove the receive buffer previously posted on
 * conn for the RPC call identified by xid.
 */
rdma_stat
rdma_clnt_postrecv_remove(CONN *conn, uint32_t xid)
{
	return (RDMA_CLNT_RECVBUF_REMOVE(conn, xid));
}
419 
420 rdma_stat
421 rdma_svc_postrecv(CONN *conn)
422 {
423 	struct clist *cl = NULL;
424 	rdma_stat retval;
425 	rdma_buf_t rbuf = {0};
426 
427 	rbuf.type = RECV_BUFFER;
428 	if (RDMA_BUF_ALLOC(conn, &rbuf)) {
429 		retval = RDMA_NORESOURCE;
430 	} else {
431 		clist_add(&cl, 0, rbuf.len, &rbuf.handle, rbuf.addr,
432 		    NULL, NULL);
433 		retval = RDMA_SVC_RECVBUF(conn, cl);
434 		clist_free(cl);
435 	}
436 	return (retval);
437 }
438 
/*
 * Thin wrapper: allocate an RDMA buffer of rbuf->type on conn via
 * the plugin's buffer allocator.
 */
rdma_stat
rdma_buf_alloc(CONN *conn, rdma_buf_t *rbuf)
{
	return (RDMA_BUF_ALLOC(conn, rbuf));
}
444 
445 void
446 rdma_buf_free(CONN *conn, rdma_buf_t *rbuf)
447 {
448 	if (!rbuf || rbuf->addr == NULL) {
449 		return;
450 	}
451 	RDMA_BUF_FREE(conn, rbuf);
452 	bzero(rbuf, sizeof (rdma_buf_t));
453 }
454 
/*
 * Caller is holding rdma_modload_lock mutex
 */
int
rdma_modload()
{
	int status;
	ASSERT(MUTEX_HELD(&rdma_modload_lock));
	/*
	 * Load all available RDMA plugins which right now is only IB plugin.
	 * If no IB hardware is present, then quit right away.
	 * ENODEV -- For no device on the system
	 * EPROTONOSUPPORT -- For module not available either due to failure to
	 * load or some other reason.
	 */
	rdma_modloaded = 1;
	if (ibt_hw_is_present() == 0) {
		rdma_dev_available = 0;
		return (ENODEV);
	}

	rdma_dev_available = 1;
	if (rpcmod_li == NULL)
		return (EPROTONOSUPPORT);

	/*
	 * NOTE(review): opening the rpcib device presumably forces the
	 * rpcib module to load and attach (which registers the plugin);
	 * the handle itself is closed again below -- confirm.
	 */
	status = ldi_open_by_name("/devices/ib/rpcib@0:rpcib",
	    FREAD | FWRITE, kcred,
	    &rpcib_handle, rpcmod_li);

	if (status != 0)
		return (EPROTONOSUPPORT);


	/*
	 * We will need to reload the plugin module after it was unregistered
	 * but the resources below need to allocated only the first time.
	 */
	if (!clist_cache) {
		clist_cache = kmem_cache_create("rdma_clist",
		    sizeof (struct clist), _POINTER_ALIGNMENT, NULL,
		    NULL, NULL, NULL, 0, 0);
		rdma_kstat_init();
	}

	(void) ldi_close(rpcib_handle, FREAD|FWRITE, kcred);

	return (0);
}
503 
/*
 * Create and install the virtual, writable "rpc_rdma_client" and
 * "rpc_rdma_server" named kstats, backed by the rdmarcstat/rdmarsstat
 * arrays declared elsewhere.  kstat_create_zone() failures are
 * silently ignored (the corresponding stats just stay unpublished).
 */
void
rdma_kstat_init(void)
{
	kstat_t *ksp;

	/*
	 * The RDMA framework doesn't know how to deal with Zones, and is
	 * only available in the global zone.
	 */
	ASSERT(INGLOBALZONE(curproc));
	ksp = kstat_create_zone("unix", 0, "rpc_rdma_client", "rpc",
	    KSTAT_TYPE_NAMED, rdmarcstat_ndata,
	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, GLOBAL_ZONEID);
	if (ksp) {
		ksp->ks_data = (void *) rdmarcstat_ptr;
		kstat_install(ksp);
	}

	ksp = kstat_create_zone("unix", 0, "rpc_rdma_server", "rpc",
	    KSTAT_TYPE_NAMED, rdmarsstat_ndata,
	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, GLOBAL_ZONEID);
	if (ksp) {
		ksp->ks_data = (void *) rdmarsstat_ptr;
		kstat_install(ksp);
	}
}
530 
/*
 * Block until an HCA attach/detach notification is signalled on
 * rdma_wait (see rdma_register_mod()/rdma_unregister_mod()) and
 * return the posted status (RDMA_HCA_ATTACH or RDMA_HCA_DETACH).
 * Returns RDMA_INTR if the wait was interrupted by a signal
 * (cv_wait_sig() returned 0).
 */
rdma_stat
rdma_kwait(void)
{
	int ret;
	rdma_stat stat;

	mutex_enter(&rdma_wait.svc_lock);

	ret = cv_wait_sig(&rdma_wait.svc_cv, &rdma_wait.svc_lock);

	/*
	 * If signalled by a hca attach/detach, pass the right
	 * stat back.
	 */

	if (ret)
		stat =  rdma_wait.svc_stat;
	else
		stat = RDMA_INTR;

	mutex_exit(&rdma_wait.svc_lock);

	return (stat);
}
555