/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/cmn_err.h>
#include <sys/uio.h>
#include <sys/stropts.h>
#include <sys/strsun.h>
#include <sys/systm.h>
#include <sys/socketvar.h>
#include <fs/sockfs/sodirect.h>

/*
 * In support of on-board asynchronous DMA hardware (e.g. Intel I/OAT)
 * we use a consolidation-private KAPI to allow the protocol to start
 * an asynchronous copyout to a user-land receive-side buffer (uioa)
 * when a blocking socket read (e.g. read, recv, ...) is pending.
 *
 * In broad strokes, this is what happens. When recv is called, we
 * first determine whether it would be beneficial to use uioa, and if
 * so set up the required state (all done by sod_rcv_init()).
 * The protocol can only initiate asynchronous copyout if the receive
 * queue is empty, so the first thing we do is drain any previously
 * queued data (using sod_uioa_so_init()). Once the copyouts (if any)
 * have been scheduled, we wait for the receive to be satisfied.
 * During that time any new mblks that are enqueued will be scheduled
 * for asynchronous copyout (sod_uioa_mblk_init()). When the receive
 * has been satisfied, we wait for all scheduled copyout operations to
 * complete before returning to the user (sod_rcv_done()).
 */
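
/*
 * A minimal sketch of the call sequence described above, as it might
 * appear in a hypothetical receive path (illustrative only, not actual
 * sockfs code; "uiop" is the caller-supplied uio_t and "so" the sonode
 * being read):
 *
 *	uio_t *suiop;
 *
 *	suiop = sod_rcv_init(so, flags, &uiop);
 *	... queued data is drained via sod_uioa_so_init(), and any
 *	    newly arriving mblks are scheduled for asynchronous
 *	    copyout via sod_uioa_mblk_init(), until the receive is
 *	    satisfied ...
 *	mutex_enter(&so->so_lock);
 *	error = sod_rcv_done(so, suiop, uiop);
 *	mutex_exit(&so->so_lock);
 */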

static struct kmem_cache *sock_sod_cache;

/*
 * This function is called at the beginning of recvmsg().
 *
 * If I/OAT is enabled on this sonode, initialize the uioa state machine
 * with state UIOA_ALLOC.
 */
uio_t *
sod_rcv_init(struct sonode *so, int flags, struct uio **uiopp)
{
	struct uio *suiop;
	struct uio *uiop;
	sodirect_t *sodp = so->so_direct;

	if (sodp == NULL)
		return (NULL);

	suiop = NULL;
	uiop = *uiopp;

	mutex_enter(&so->so_lock);
	if (uiop->uio_resid >= uioasync.mincnt &&
	    sodp != NULL && sodp->sod_enabled &&
	    uioasync.enabled && !(flags & MSG_PEEK) &&
	    !so->so_proto_props.sopp_loopback && so->so_filter_active == 0 &&
	    !(so->so_state & SS_CANTRCVMORE)) {
		/*
		 * The I/O is large enough for the uioa minimum, this is an
		 * sodirect socket with sodirect and uioa both enabled, I/O
		 * will actually be done, and we are not at EOF, so
		 * initialize the sodirect_t uioa_t with "uiop".
		 */
		if (!uioainit(uiop, &sodp->sod_uioa)) {
			/*
			 * Successful uioainit(), so the uio_t part of the
			 * uioa_t will be used for all uio_t work to follow;
			 * we return the original "uiop" in "suiop".
			 */
			suiop = uiop;
			*uiopp = (uio_t *)&sodp->sod_uioa;
			/*
			 * Before returning to the caller the passed-in uio_t
			 * "uiop" will be updated via a call to uioafini()
			 * below.
			 *
			 * Note, uioa_state isn't set to UIOA_ENABLED here,
			 * as we first have to uioamove() any currently
			 * queued M_DATA mblk_t(s), so it will be done later.
			 */
		}
	}
	mutex_exit(&so->so_lock);

	return (suiop);
}

/*
 * This function is called at the end of recvmsg(); it finalizes all the
 * I/OAT operations and resets the uioa state to UIOA_ALLOC.
 */
int
sod_rcv_done(struct sonode *so, struct uio *suiop, struct uio *uiop)
{
	int error = 0;
	sodirect_t *sodp = so->so_direct;
	mblk_t *mp;

	if (sodp == NULL) {
		return (0);
	}

	ASSERT(MUTEX_HELD(&so->so_lock));
	/* Finish any sodirect and uioa processing */
	if (suiop != NULL) {
		/* Finish any uioa_t processing */

		ASSERT(uiop == (uio_t *)&sodp->sod_uioa);
		error = uioafini(suiop, (uioa_t *)uiop);
		if ((mp = sodp->sod_uioafh) != NULL) {
			sodp->sod_uioafh = NULL;
			sodp->sod_uioaft = NULL;
			freemsg(mp);
		}
	}
	ASSERT(sodp->sod_uioafh == NULL);

	return (error);
}

/*
 * Schedule a uioamove() on an mblk. This is done as mblks are enqueued
 * by the protocol on the socket's rcv queue.
 *
 * Caller must be holding so_lock.
 */
void
sod_uioa_mblk_init(struct sodirect_s *sodp, mblk_t *mp, size_t msg_size)
{
	uioa_t *uioap = &sodp->sod_uioa;
	mblk_t *mp1 = mp;
	mblk_t *lmp = NULL;

	ASSERT(DB_TYPE(mp) == M_DATA);
	ASSERT(msg_size == msgdsize(mp));

	if (uioap->uioa_state & UIOA_ENABLED) {
		/* Uioa is enabled */

		if (msg_size > uioap->uio_resid) {
			/*
			 * There isn't enough uio space for the mblk_t chain,
			 * so disable uioa such that this and any additional
			 * mblk_t data is handled by the socket, and schedule
			 * the socket for wakeup to finish this uioa.
			 */
			uioap->uioa_state &= UIOA_CLR;
			uioap->uioa_state |= UIOA_FINI;
			return;
		}
		do {
			uint32_t	len = MBLKL(mp1);

			if (!uioamove(mp1->b_rptr, len, UIO_READ, uioap)) {
				/* Scheduled, mark dblk_t as such */
				DB_FLAGS(mp1) |= DBLK_UIOA;
			} else {
				/* Error, turn off async processing */
				uioap->uioa_state &= UIOA_CLR;
				uioap->uioa_state |= UIOA_FINI;
				break;
			}
			lmp = mp1;
		} while ((mp1 = mp1->b_cont) != NULL);

		if (mp1 != NULL || uioap->uio_resid == 0) {
			/* Break the mblk chain if necessary. */
			if (mp1 != NULL && lmp != NULL) {
				mp->b_next = mp1;
				lmp->b_cont = NULL;
			}
		}
	}
}
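
/*
 * A minimal sketch of how a protocol enqueue path might use the above
 * for an M_DATA chain "mp" (hypothetical; the queue-append step is
 * elided and this is not actual sockfs code):
 *
 *	mutex_enter(&so->so_lock);
 *	if (so->so_direct != NULL)
 *		sod_uioa_mblk_init(so->so_direct, mp, msgdsize(mp));
 *	... append "mp" to the socket's rcv queue ...
 *	mutex_exit(&so->so_lock);
 */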

/*
 * This function is called on an mblk that has been successfully
 * uioamoved().
 */
void
sod_uioa_mblk_done(sodirect_t *sodp, mblk_t *bp)
{
	if (bp != NULL && (bp->b_datap->db_flags & DBLK_UIOA)) {
		/*
		 * A uioa-flagged mblk_t chain, already uio processed;
		 * add it to the sodirect uioa pending free list.
		 *
		 * Note, a b_cont chain headed by a DBLK_UIOA enabled
		 * mblk_t must have all mblk_t(s) DBLK_UIOA enabled.
		 */
		mblk_t	*bpt;

		ASSERT(sodp != NULL);
		bpt = sodp->sod_uioaft;

		/*
		 * Add the first mblk_t of the "bp" chain to the current
		 * sodirect uioa free list tail mblk_t, if any, else the
		 * list is empty so "bp" becomes the new head.
		 */
		if (bpt == NULL)
			sodp->sod_uioafh = bp;
		else
			bpt->b_cont = bp;

		/*
		 * Walk the mblk_t "bp" chain to find the tail, and adjust
		 * the rptr of each to reflect that uioamove() has consumed
		 * all data.
		 */
		bpt = bp;
		for (;;) {
			ASSERT(bpt->b_datap->db_flags & DBLK_UIOA);

			bpt->b_rptr = bpt->b_wptr;
			if (bpt->b_cont == NULL)
				break;
			bpt = bpt->b_cont;
		}
		/* New sodirect uioa free list tail */
		sodp->sod_uioaft = bpt;

		/* Only dequeue once with data returned per uioa_t */
		if (sodp->sod_uioa.uioa_state & UIOA_ENABLED) {
			sodp->sod_uioa.uioa_state &= UIOA_CLR;
			sodp->sod_uioa.uioa_state |= UIOA_FINI;
		}
	}
}
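
/*
 * Note: mblk_t(s) placed on the sod_uioafh/sod_uioaft pending free list
 * above are not freed here; they are freed by sod_rcv_done() once
 * uioafini() has been called, i.e. once all scheduled asynchronous
 * copyouts for this uioa_t have completed.
 */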

/*
 * When transitioning from the UIOA_INIT state to the UIOA_ENABLED state
 * in recvmsg(), call this function on a non-STREAMS socket to schedule
 * uioamove() on the data that has already been queued in this socket.
 */
void
sod_uioa_so_init(struct sonode *so, struct sodirect_s *sodp, struct uio *uiop)
{
	uioa_t	*uioap = (uioa_t *)uiop;
	mblk_t	*lbp;
	mblk_t	*wbp;
	mblk_t	*bp;
	int	len;
	int	error;
	boolean_t in_rcv_q = B_TRUE;

	ASSERT(MUTEX_HELD(&so->so_lock));
	ASSERT(&sodp->sod_uioa == uioap);

	/*
	 * Walk the first b_cont chain in the receive queue (so_rcv_q_head)
	 * and schedule any M_DATA mblk_t's for uio asynchronous move.
	 */
	bp = so->so_rcv_q_head;

again:
	/* Walk the chain */
	lbp = NULL;
	wbp = bp;

	do {
		if (bp == NULL)
			break;

		if (wbp->b_datap->db_type != M_DATA) {
			/* Not M_DATA, no more uioa */
			goto nouioa;
		}
		if ((len = wbp->b_wptr - wbp->b_rptr) > 0) {
			/* Have an M_DATA mblk_t with data */
			if (len > uioap->uio_resid || (so->so_oobmark > 0 &&
			    len + uioap->uioa_mbytes >= so->so_oobmark)) {
				/* Not enough uio space, or beyond oobmark */
				goto nouioa;
			}
			ASSERT(!(wbp->b_datap->db_flags & DBLK_UIOA));
			error = uioamove(wbp->b_rptr, len,
			    UIO_READ, uioap);
			if (!error) {
				/* Scheduled, mark dblk_t as such */
				wbp->b_datap->db_flags |= DBLK_UIOA;
			} else {
				/* Break the mblk chain */
				goto nouioa;
			}
		}
		/* Save last wbp processed */
		lbp = wbp;
	} while ((wbp = wbp->b_cont) != NULL);

	if (in_rcv_q && (bp == NULL || bp->b_next == NULL)) {
		/*
		 * We get here only once, to process the sonode dump area
		 * if so_rcv_q_head is NULL or all the mblks have been
		 * successfully uioamove()d.
		 */
		in_rcv_q = B_FALSE;

		/* move to dump area */
		bp = so->so_rcv_head;
		goto again;
	}

	return;

nouioa:
	/* No more uioa */
	uioap->uioa_state &= UIOA_CLR;
	uioap->uioa_state |= UIOA_FINI;

	/*
	 * If we processed 1 or more mblk_t(s) then we need to split the
	 * current mblk_t chain in 2, so that all the uioamove()d mblk_t(s)
	 * are in the current chain and the rest are in the following new
	 * chain.
	 */
	if (lbp != NULL) {
		/* New end of current chain */
		lbp->b_cont = NULL;

		/* Insert new chain wbp after bp */
		if ((wbp->b_next = bp->b_next) == NULL) {
			if (in_rcv_q)
				so->so_rcv_q_last_head = wbp;
			else
				so->so_rcv_last_head = wbp;
		}
		bp->b_next = wbp;
		bp->b_next->b_prev = bp->b_prev;
		bp->b_prev = lbp;
	}
}
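
/*
 * For reference, the receive queues walked above are b_next lists of
 * b_cont chains, each b_cont chain holding one message; a rough sketch:
 *
 *	so_rcv_q_head -> mblk --b_cont--> mblk --b_cont--> mblk
 *	                   |
 *	                 b_next
 *	                   |
 *	                   v
 *	                 mblk --b_cont--> mblk
 *
 * The "dump area" headed by so_rcv_head has the same shape. The split
 * performed in the nouioa case above inserts a new b_next element so
 * that only fully uioamove()d mblk_t(s) remain in the first b_cont
 * chain.
 */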

/*
 * Initialize sodirect data structures on a socket.
 */
void
sod_sock_init(struct sonode *so)
{
	sodirect_t	*sodp;

	ASSERT(so->so_direct == NULL);

	so->so_state |= SS_SODIRECT;

	sodp = kmem_cache_alloc(sock_sod_cache, KM_SLEEP);
	sodp->sod_enabled = B_TRUE;
	sodp->sod_uioafh = NULL;
	sodp->sod_uioaft = NULL;
	/*
	 * The remainder of the sod_uioa members are left uninitialized;
	 * they will be initialized later by uioainit() before uioa
	 * is enabled.
	 */
	sodp->sod_uioa.uioa_state = UIOA_ALLOC;
	so->so_direct = sodp;
}
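
/*
 * For reference, the sod_uioa.uioa_state lifecycle as described by the
 * comments in this file (transitions not visible here, e.g. inside
 * uioainit()/uioafini(), are assumed):
 *
 *	UIOA_ALLOC	set above when the socket is created
 *	UIOA_INIT	set by uioainit() from sod_rcv_init()
 *	UIOA_ENABLED	set by the receive path when it schedules
 *			already-queued data (see sod_uioa_so_init())
 *	UIOA_FINI	set when no more data may be uioamove()d
 *	UIOA_ALLOC	restored by sod_rcv_done() at the end of
 *			recvmsg()
 */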

void
sod_sock_fini(struct sonode *so)
{
	sodirect_t *sodp = so->so_direct;

	ASSERT(sodp->sod_uioafh == NULL);

	so->so_direct = NULL;
	kmem_cache_free(sock_sod_cache, sodp);
}

/*
 * Init the sodirect kmem cache while sockfs is loading.
 */
int
sod_init()
{
	/* Allocate sodirect_t kmem_cache */
	sock_sod_cache = kmem_cache_create("sock_sod_cache",
	    sizeof (sodirect_t), 0, NULL, NULL, NULL, NULL, NULL, 0);

	return (0);
}

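/*
 * Dequeue any already-uioamove()d mblk_t chain(s) from the socket's
 * receive queues and pass them to sod_uioa_mblk_done(), then return
 * uioa_mbytes, the byte count accumulated by uioamove() for this
 * uioa_t.
 */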
ssize_t
sod_uioa_mblk(struct sonode *so, mblk_t *mp)
{
	sodirect_t *sodp = so->so_direct;

	ASSERT(sodp != NULL);
	ASSERT(MUTEX_HELD(&so->so_lock));

	ASSERT(sodp->sod_enabled);
	ASSERT(sodp->sod_uioa.uioa_state != (UIOA_ALLOC|UIOA_INIT));

	ASSERT(sodp->sod_uioa.uioa_state & (UIOA_ENABLED|UIOA_FINI));

	if (mp == NULL && so->so_rcv_q_head != NULL) {
		mp = so->so_rcv_q_head;
		ASSERT(mp->b_prev != NULL);
		mp->b_prev = NULL;
		so->so_rcv_q_head = mp->b_next;
		if (so->so_rcv_q_head == NULL) {
			so->so_rcv_q_last_head = NULL;
		}
		mp->b_next = NULL;
	}

	sod_uioa_mblk_done(sodp, mp);

	if (so->so_rcv_q_head == NULL && so->so_rcv_head != NULL &&
	    DB_TYPE(so->so_rcv_head) == M_DATA &&
	    (DB_FLAGS(so->so_rcv_head) & DBLK_UIOA)) {
		/* more arrived */
		ASSERT(so->so_rcv_q_head == NULL);
		mp = so->so_rcv_head;
		so->so_rcv_head = mp->b_next;
		if (so->so_rcv_head == NULL)
			so->so_rcv_last_head = NULL;
		mp->b_prev = mp->b_next = NULL;
		sod_uioa_mblk_done(sodp, mp);
	}

#ifdef DEBUG
	if (so->so_rcv_q_head != NULL) {
		mblk_t *m = so->so_rcv_q_head;
		while (m != NULL) {
			if (DB_FLAGS(m) & DBLK_UIOA) {
				cmn_err(CE_PANIC, "Unexpected I/OAT mblk %p"
				    " in so_rcv_q_head.\n", (void *)m);
			}
			m = m->b_next;
		}
	}
	if (so->so_rcv_head != NULL) {
		mblk_t *m = so->so_rcv_head;
		while (m != NULL) {
			if (DB_FLAGS(m) & DBLK_UIOA) {
				cmn_err(CE_PANIC, "Unexpected I/OAT mblk %p"
				    " in so_rcv_head.\n", (void *)m);
			}
			m = m->b_next;
		}
	}
#endif
	return (sodp->sod_uioa.uioa_mbytes);
}
465