xref: /freebsd/sys/kern/sys_socket.c (revision 1390bba42caf53a00fa370f3844cd7b3725ed4ec)
/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/aio.h>
#include <sys/domain.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/sigio.h>
#include <sys/signal.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/filio.h>			/* XXX */
#include <sys/sockio.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/taskqueue.h>
#include <sys/uio.h>
#include <sys/ucred.h>
#include <sys/un.h>
#include <sys/unpcb.h>
#include <sys/user.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>
#include <net/vnet.h>

#include <netinet/in.h>
#include <netinet/in_pcb.h>

#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_extern.h>
#include <vm/vm_map.h>

static SYSCTL_NODE(_kern_ipc, OID_AUTO, aio, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "socket AIO stats");

static int empty_results;
SYSCTL_INT(_kern_ipc_aio, OID_AUTO, empty_results, CTLFLAG_RD, &empty_results,
    0, "socket operation returned EAGAIN");

static int empty_retries;
SYSCTL_INT(_kern_ipc_aio, OID_AUTO, empty_retries, CTLFLAG_RD, &empty_retries,
    0, "socket operation retries");

static fo_rdwr_t soo_read;
static fo_rdwr_t soo_write;
static fo_ioctl_t soo_ioctl;
static fo_poll_t soo_poll;
static fo_kqfilter_t soo_kqfilter;
static fo_stat_t soo_stat;
static fo_close_t soo_close;
static fo_fdclose_t soo_fdclose;
static fo_chmod_t soo_chmod;
static fo_fill_kinfo_t soo_fill_kinfo;
static fo_aio_queue_t soo_aio_queue;

static void	soo_aio_cancel(struct kaiocb *job);

const struct fileops socketops = {
	.fo_read = soo_read,
	.fo_write = soo_write,
	.fo_truncate = invfo_truncate,
	.fo_ioctl = soo_ioctl,
	.fo_poll = soo_poll,
	.fo_kqfilter = soo_kqfilter,
	.fo_stat = soo_stat,
	.fo_close = soo_close,
	.fo_fdclose = soo_fdclose,
	.fo_chmod = soo_chmod,
	.fo_chown = invfo_chown,
	.fo_sendfile = invfo_sendfile,
	.fo_fill_kinfo = soo_fill_kinfo,
	.fo_aio_queue = soo_aio_queue,
	.fo_cmp = file_kcmp_generic,
	.fo_flags = DFLAG_PASSABLE
};
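
/*
 * socketops is installed as fp->f_ops on every file that wraps a
 * socket, so generic descriptor system calls are dispatched through
 * it: read(2) lands in soo_read(), ioctl(2) in soo_ioctl(), and so
 * on.  Operations that have no meaning for sockets (truncate, chown,
 * sendfile with a socket as the source) point at the invfo_* stubs,
 * which simply fail.  DFLAG_PASSABLE marks the descriptor as eligible
 * for passing over a unix(4) socket with SCM_RIGHTS.
 */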

static int
soo_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
	struct socket *so = fp->f_data;
	int error;

#ifdef MAC
	error = mac_socket_check_receive(active_cred, so);
	if (error)
		return (error);
#endif
	error = soreceive(so, 0, uio, 0, 0, 0);
	return (error);
}

static int
soo_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
	struct socket *so = fp->f_data;
	int error;

#ifdef MAC
	error = mac_socket_check_send(active_cred, so);
	if (error)
		return (error);
#endif
	error = sousrsend(so, NULL, uio, NULL, 0, NULL);
	return (error);
}
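
/*
 * Both I/O entry points are deliberately thin: after the optional
 * MAC(9) access check they hand the uio straight to the socket layer
 * (soreceive() for reads, sousrsend() for writes), so blocking,
 * buffering, and protocol behavior all live below this file.
 */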

static int
soo_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred,
    struct thread *td)
{
	struct socket *so = fp->f_data;
	int error = 0;

	switch (cmd) {
	case FIONBIO:
		SOCK_LOCK(so);
		if (*(int *)data)
			so->so_state |= SS_NBIO;
		else
			so->so_state &= ~SS_NBIO;
		SOCK_UNLOCK(so);
		break;

	case FIOASYNC:
		if (*(int *)data) {
			SOCK_LOCK(so);
			so->so_state |= SS_ASYNC;
			if (SOLISTENING(so)) {
				so->sol_sbrcv_flags |= SB_ASYNC;
				so->sol_sbsnd_flags |= SB_ASYNC;
			} else {
				SOCK_RECVBUF_LOCK(so);
				so->so_rcv.sb_flags |= SB_ASYNC;
				SOCK_RECVBUF_UNLOCK(so);
				SOCK_SENDBUF_LOCK(so);
				so->so_snd.sb_flags |= SB_ASYNC;
				SOCK_SENDBUF_UNLOCK(so);
			}
			SOCK_UNLOCK(so);
		} else {
			SOCK_LOCK(so);
			so->so_state &= ~SS_ASYNC;
			if (SOLISTENING(so)) {
				so->sol_sbrcv_flags &= ~SB_ASYNC;
				so->sol_sbsnd_flags &= ~SB_ASYNC;
			} else {
				SOCK_RECVBUF_LOCK(so);
				so->so_rcv.sb_flags &= ~SB_ASYNC;
				SOCK_RECVBUF_UNLOCK(so);
				SOCK_SENDBUF_LOCK(so);
				so->so_snd.sb_flags &= ~SB_ASYNC;
				SOCK_SENDBUF_UNLOCK(so);
			}
			SOCK_UNLOCK(so);
		}
		break;

	case FIONREAD:
		SOCK_RECVBUF_LOCK(so);
		if (SOLISTENING(so)) {
			error = EINVAL;
		} else {
			*(int *)data = sbavail(&so->so_rcv) - so->so_rcv.sb_ctl;
		}
		SOCK_RECVBUF_UNLOCK(so);
		break;

	case FIONWRITE:
		/* Unlocked read. */
		if (SOLISTENING(so)) {
			error = EINVAL;
		} else {
			*(int *)data = sbavail(&so->so_snd);
		}
		break;

	case FIONSPACE:
		/* Unlocked read. */
		if (SOLISTENING(so)) {
			error = EINVAL;
		} else {
			if ((so->so_snd.sb_hiwat < sbused(&so->so_snd)) ||
			    (so->so_snd.sb_mbmax < so->so_snd.sb_mbcnt)) {
				*(int *)data = 0;
			} else {
				*(int *)data = sbspace(&so->so_snd);
			}
		}
		break;

	case FIOSETOWN:
		error = fsetown(*(int *)data, &so->so_sigio);
		break;

	case FIOGETOWN:
		*(int *)data = fgetown(&so->so_sigio);
		break;

	case SIOCSPGRP:
		error = fsetown(-(*(int *)data), &so->so_sigio);
		break;

	case SIOCGPGRP:
		*(int *)data = -fgetown(&so->so_sigio);
		break;

	case SIOCATMARK:
		/* Unlocked read. */
		if (SOLISTENING(so)) {
			error = EINVAL;
		} else {
			*(int *)data = (so->so_rcv.sb_state & SBS_RCVATMARK) != 0;
		}
		break;
	default:
		/*
		 * Interface/routing/protocol specific ioctls: interface
		 * and routing ioctls should have a different entry point,
		 * since a socket is unnecessary for them.
		 */
		if (IOCGROUP(cmd) == 'i')
			error = ifioctl(so, cmd, data, td);
		else if (IOCGROUP(cmd) == 'r') {
			CURVNET_SET(so->so_vnet);
			error = rtioctl_fib(cmd, data, so->so_fibnum);
			CURVNET_RESTORE();
		} else {
			CURVNET_SET(so->so_vnet);
			error = so->so_proto->pr_control(so, cmd, data, 0, td);
			CURVNET_RESTORE();
		}
		break;
	}
	return (error);
}
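
/*
 * Illustrative userland use of one of the requests handled above (a
 * sketch, not part of this file): FIONREAD reports the bytes of data
 * available to read, with control data (sb_ctl) excluded:
 *
 *	int nread;
 *
 *	if (ioctl(s, FIONREAD, &nread) == 0 && nread > 0)
 *		n = recv(s, buf, sizeof(buf), 0);
 *
 * Anything not recognized here is routed by ioctl group: 'i' to the
 * interface layer, 'r' to the routing layer, and everything else to
 * the socket's own protocol via pr_control().
 */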

static int
soo_poll(struct file *fp, int events, struct ucred *active_cred,
    struct thread *td)
{
	struct socket *so = fp->f_data;
#ifdef MAC
	int error;

	error = mac_socket_check_poll(active_cred, so);
	if (error)
		return (error);
#endif
	return (so->so_proto->pr_sopoll(so, events, td));
}

static int
soo_kqfilter(struct file *fp, struct knote *kn)
{
	struct socket *so = fp->f_data;

	return (so->so_proto->pr_kqfilter(so, kn));
}

static int
soo_stat(struct file *fp, struct stat *ub, struct ucred *active_cred)
{
	struct socket *so = fp->f_data;
	int error = 0;

	bzero((caddr_t)ub, sizeof (*ub));
	ub->st_mode = S_IFSOCK;
#ifdef MAC
	error = mac_socket_check_stat(active_cred, so);
	if (error)
		return (error);
#endif
	SOCK_LOCK(so);
	if (!SOLISTENING(so)) {
		struct sockbuf *sb;

		/*
		 * If SBS_CANTRCVMORE is set, but there's still data left
		 * in the receive buffer, the socket is still readable.
		 */
		sb = &so->so_rcv;
		SOCK_RECVBUF_LOCK(so);
		if ((sb->sb_state & SBS_CANTRCVMORE) == 0 || sbavail(sb))
			ub->st_mode |= S_IRUSR | S_IRGRP | S_IROTH;
		ub->st_size = sbavail(sb) - sb->sb_ctl;
		SOCK_RECVBUF_UNLOCK(so);

		sb = &so->so_snd;
		SOCK_SENDBUF_LOCK(so);
		if ((sb->sb_state & SBS_CANTSENDMORE) == 0)
			ub->st_mode |= S_IWUSR | S_IWGRP | S_IWOTH;
		SOCK_SENDBUF_UNLOCK(so);
	}
	ub->st_uid = so->so_cred->cr_uid;
	ub->st_gid = so->so_cred->cr_gid;
	if (so->so_proto->pr_sense)
		error = so->so_proto->pr_sense(so, ub);
	SOCK_UNLOCK(so);
	return (error);
}
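
/*
 * The result above is what fstat(2) reports for a socket: st_mode is
 * S_IFSOCK with read/write permission bits reflecting whether the
 * socket can still receive or send, and st_size is the number of
 * readable data bytes queued in the receive buffer.
 */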

/*
 * API socket close on file pointer.  We call soclose() to close the socket
 * (including initiating closing protocols).  soclose() will sorele() the
 * file reference but the actual socket will not go away until the socket's
 * ref count hits 0.
 */
static int
soo_close(struct file *fp, struct thread *td)
{
	int error = 0;
	struct socket *so;

	so = fp->f_data;
	fp->f_ops = &badfileops;
	fp->f_data = NULL;

	if (so)
		error = soclose(so);
	return (error);
}

static void
soo_fdclose(struct file *fp, int fd __unused, struct thread *td)
{
	struct socket *so;

	so = fp->f_data;
	if (so->so_proto->pr_fdclose != NULL)
		so->so_proto->pr_fdclose(so);
}

static int
soo_chmod(struct file *fp, mode_t mode, struct ucred *cred, struct thread *td)
{
	struct socket *so;
	int error;

	so = fp->f_data;
	if (so->so_proto->pr_chmod != NULL)
		error = so->so_proto->pr_chmod(so, mode, cred, td);
	else
		error = EINVAL;
	return (error);
}

static int
soo_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
{
	struct sockaddr_storage ss = { .ss_len = sizeof(ss) };
	struct unpcb *unpcb;
	struct socket *so;
	int error;

	kif->kf_type = KF_TYPE_SOCKET;
	so = fp->f_data;
	CURVNET_SET(so->so_vnet);
	kif->kf_un.kf_sock.kf_sock_domain0 =
	    so->so_proto->pr_domain->dom_family;
	kif->kf_un.kf_sock.kf_sock_type0 = so->so_type;
	kif->kf_un.kf_sock.kf_sock_protocol0 = so->so_proto->pr_protocol;
	kif->kf_un.kf_sock.kf_sock_pcb = (uintptr_t)so->so_pcb;
	switch (kif->kf_un.kf_sock.kf_sock_domain0) {
	case AF_INET:
	case AF_INET6:
		/* XXX: kf_sock_inpcb is obsolete.  It may be removed. */
		kif->kf_un.kf_sock.kf_sock_inpcb = (uintptr_t)so->so_pcb;
		kif->kf_un.kf_sock.kf_sock_rcv_sb_state =
		    so->so_rcv.sb_state;
		kif->kf_un.kf_sock.kf_sock_snd_sb_state =
		    so->so_snd.sb_state;
		kif->kf_un.kf_sock.kf_sock_sendq =
		    sbused(&so->so_snd);
		kif->kf_un.kf_sock.kf_sock_recvq =
		    sbused(&so->so_rcv);
		break;
	case AF_UNIX:
		if (so->so_pcb != NULL) {
			unpcb = (struct unpcb *)(so->so_pcb);
			if (unpcb->unp_conn) {
				kif->kf_un.kf_sock.kf_sock_unpconn =
				    (uintptr_t)unpcb->unp_conn;
				kif->kf_un.kf_sock.kf_sock_rcv_sb_state =
				    so->so_rcv.sb_state;
				kif->kf_un.kf_sock.kf_sock_snd_sb_state =
				    so->so_snd.sb_state;
				kif->kf_un.kf_sock.kf_sock_sendq =
				    sbused(&so->so_snd);
				kif->kf_un.kf_sock.kf_sock_recvq =
				    sbused(&so->so_rcv);
			}
		}
		break;
	}
	error = sosockaddr(so, (struct sockaddr *)&ss);
	if (error == 0 &&
	    ss.ss_len <= sizeof(kif->kf_un.kf_sock.kf_sa_local)) {
		bcopy(&ss, &kif->kf_un.kf_sock.kf_sa_local, ss.ss_len);
	}
	ss.ss_len = sizeof(ss);
	error = sopeeraddr(so, (struct sockaddr *)&ss);
	if (error == 0 &&
	    ss.ss_len <= sizeof(kif->kf_un.kf_sock.kf_sa_peer)) {
		bcopy(&ss, &kif->kf_un.kf_sock.kf_sa_peer, ss.ss_len);
	}
	strncpy(kif->kf_path, so->so_proto->pr_domain->dom_name,
	    sizeof(kif->kf_path));
	CURVNET_RESTORE();
	return (0);
}
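
/*
 * soo_fill_kinfo() populates the kinfo_file record exported to
 * userland (e.g. via the kern.proc.filedesc sysctl consumed by
 * libprocstat), so the domain, type, queue depths, and local/peer
 * addresses that tools such as procstat(1) display for a socket
 * descriptor come from here.
 */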

/*
 * Use the 'backend3' field in AIO jobs to store the amount of data
 * completed by the AIO job so far.
 */
#define	aio_done	backend3

static STAILQ_HEAD(, task) soaio_jobs;
static struct mtx soaio_jobs_lock;
static struct task soaio_kproc_task;
static int soaio_starting, soaio_idle, soaio_queued;
static struct unrhdr *soaio_kproc_unr;

static int soaio_max_procs = MAX_AIO_PROCS;
SYSCTL_INT(_kern_ipc_aio, OID_AUTO, max_procs, CTLFLAG_RW, &soaio_max_procs, 0,
    "Maximum number of kernel processes to use for async socket IO");

static int soaio_num_procs;
SYSCTL_INT(_kern_ipc_aio, OID_AUTO, num_procs, CTLFLAG_RD, &soaio_num_procs, 0,
    "Number of active kernel processes for async socket IO");

static int soaio_target_procs = TARGET_AIO_PROCS;
SYSCTL_INT(_kern_ipc_aio, OID_AUTO, target_procs, CTLFLAG_RD,
    &soaio_target_procs, 0,
    "Preferred number of ready kernel processes for async socket IO");

static int soaio_lifetime;
SYSCTL_INT(_kern_ipc_aio, OID_AUTO, lifetime, CTLFLAG_RW, &soaio_lifetime, 0,
    "Maximum lifetime for idle aiod");

static void
soaio_kproc_loop(void *arg)
{
	struct proc *p;
	struct vmspace *myvm;
	struct task *task;
	int error, id, pending;

	id = (intptr_t)arg;

	/*
	 * Grab an extra reference on the daemon's vmspace so that it
	 * doesn't get freed by jobs that switch to a different
	 * vmspace.
	 */
	p = curproc;
	myvm = vmspace_acquire_ref(p);

	mtx_lock(&soaio_jobs_lock);
	MPASS(soaio_starting > 0);
	soaio_starting--;
	for (;;) {
		while (!STAILQ_EMPTY(&soaio_jobs)) {
			task = STAILQ_FIRST(&soaio_jobs);
			STAILQ_REMOVE_HEAD(&soaio_jobs, ta_link);
			soaio_queued--;
			pending = task->ta_pending;
			task->ta_pending = 0;
			mtx_unlock(&soaio_jobs_lock);

			task->ta_func(task->ta_context, pending);

			mtx_lock(&soaio_jobs_lock);
		}
		MPASS(soaio_queued == 0);

		if (p->p_vmspace != myvm) {
			mtx_unlock(&soaio_jobs_lock);
			vmspace_switch_aio(myvm);
			mtx_lock(&soaio_jobs_lock);
			continue;
		}

		soaio_idle++;
		error = mtx_sleep(&soaio_idle, &soaio_jobs_lock, 0, "-",
		    soaio_lifetime);
		soaio_idle--;
		if (error == EWOULDBLOCK && STAILQ_EMPTY(&soaio_jobs) &&
		    soaio_num_procs > soaio_target_procs)
			break;
	}
	soaio_num_procs--;
	mtx_unlock(&soaio_jobs_lock);
	free_unr(soaio_kproc_unr, id);
	kproc_exit(0);
}
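
/*
 * Each soaiod worker runs the loop above: drain the shared job queue,
 * switch back to its own vmspace once the queue is empty (jobs may
 * have left it in a user process's vmspace), then sleep.  If the
 * timed sleep expires (EWOULDBLOCK from mtx_sleep() after
 * soaio_lifetime ticks) with nothing queued and more than
 * soaio_target_procs workers alive, the worker exits.
 */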

static void
soaio_kproc_create(void *context, int pending)
{
	struct proc *p;
	int error, id;

	mtx_lock(&soaio_jobs_lock);
	for (;;) {
		if (soaio_num_procs < soaio_target_procs) {
			/* Must create */
		} else if (soaio_num_procs >= soaio_max_procs) {
			/*
			 * Hit the limit on kernel processes, don't
			 * create another one.
			 */
			break;
		} else if (soaio_queued <= soaio_idle + soaio_starting) {
			/*
			 * No more AIO jobs waiting for a process to be
			 * created, so stop.
			 */
			break;
		}
		soaio_starting++;
		mtx_unlock(&soaio_jobs_lock);

		id = alloc_unr(soaio_kproc_unr);
		error = kproc_create(soaio_kproc_loop, (void *)(intptr_t)id,
		    &p, 0, 0, "soaiod%d", id);
		if (error != 0) {
			free_unr(soaio_kproc_unr, id);
			mtx_lock(&soaio_jobs_lock);
			soaio_starting--;
			break;
		}

		mtx_lock(&soaio_jobs_lock);
		soaio_num_procs++;
	}
	mtx_unlock(&soaio_jobs_lock);
}
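
/*
 * The spawn policy: always grow the pool up to soaio_target_procs,
 * never past soaio_max_procs, and in between only while more jobs are
 * queued than there are workers already idle or being started.  This
 * runs as a task on taskqueue_thread, so kproc_create() is called
 * from a context where sleeping is safe.
 */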

void
soaio_enqueue(struct task *task)
{

	mtx_lock(&soaio_jobs_lock);
	MPASS(task->ta_pending == 0);
	task->ta_pending++;
	STAILQ_INSERT_TAIL(&soaio_jobs, task, ta_link);
	soaio_queued++;
	if (soaio_queued <= soaio_idle)
		wakeup_one(&soaio_idle);
	else if (soaio_num_procs < soaio_max_procs)
		taskqueue_enqueue(taskqueue_thread, &soaio_kproc_task);
	mtx_unlock(&soaio_jobs_lock);
}
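
/*
 * Enqueueing prefers an existing idle worker (wakeup_one() on the
 * soaio_idle channel that soaio_kproc_loop() sleeps on); only when
 * every worker is busy and the pool is below soaio_max_procs does it
 * schedule soaio_kproc_task to spawn more.
 */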

static void
soaio_init(void *dummy __unused)
{

	soaio_lifetime = AIOD_LIFETIME_DEFAULT;
	STAILQ_INIT(&soaio_jobs);
	mtx_init(&soaio_jobs_lock, "soaio jobs", NULL, MTX_DEF);
	soaio_kproc_unr = new_unrhdr(1, INT_MAX, NULL);
	TASK_INIT(&soaio_kproc_task, 0, soaio_kproc_create, NULL);
}
SYSINIT(soaio, SI_SUB_VFS, SI_ORDER_ANY, soaio_init, NULL);

static __inline int
soaio_ready(struct socket *so, struct sockbuf *sb)
{
	return (sb == &so->so_rcv ? soreadable(so) : sowriteable(so));
}

static void
soaio_process_job(struct socket *so, sb_which which, struct kaiocb *job)
{
	struct ucred *td_savedcred;
	struct thread *td;
	struct sockbuf *sb = sobuf(so, which);
#ifdef MAC
	struct file *fp = job->fd_file;
#endif
	size_t cnt, done, job_total_nbytes __diagused;
	long ru_before;
	int error, flags;

	SOCK_BUF_UNLOCK(so, which);
	aio_switch_vmspace(job);
	td = curthread;
retry:
	td_savedcred = td->td_ucred;
	td->td_ucred = job->cred;

	job_total_nbytes = job->uiop->uio_resid + job->aio_done;
	done = job->aio_done;
	cnt = job->uiop->uio_resid;
	job->uiop->uio_offset = 0;
	job->uiop->uio_td = td;
	flags = MSG_NBIO;

	/*
	 * For resource usage accounting, only count a completed request
	 * as a single message to avoid counting multiple calls to
	 * sosend/soreceive on a blocking socket.
	 */

	if (sb == &so->so_rcv) {
		ru_before = td->td_ru.ru_msgrcv;
#ifdef MAC
		error = mac_socket_check_receive(fp->f_cred, so);
		if (error == 0)
#endif
			error = soreceive(so, NULL, job->uiop, NULL, NULL,
			    &flags);
		if (td->td_ru.ru_msgrcv != ru_before)
			job->msgrcv = 1;
	} else {
		if (!TAILQ_EMPTY(&sb->sb_aiojobq))
			flags |= MSG_MORETOCOME;
		ru_before = td->td_ru.ru_msgsnd;
#ifdef MAC
		error = mac_socket_check_send(fp->f_cred, so);
		if (error == 0)
#endif
			error = sousrsend(so, NULL, job->uiop, NULL, flags,
			    job->userproc);
		if (td->td_ru.ru_msgsnd != ru_before)
			job->msgsnd = 1;
	}

	done += cnt - job->uiop->uio_resid;
	job->aio_done = done;
	td->td_ucred = td_savedcred;

	if (error == EWOULDBLOCK) {
		/*
		 * The request was either partially completed or not
		 * completed at all due to racing with a read() or
		 * write() on the socket.  If the socket is
		 * non-blocking, return with any partial completion.
		 * If the socket is blocking or if no progress has
		 * been made, requeue this request at the head of the
		 * queue to try again when the socket is ready.
		 */
		MPASS(done != job_total_nbytes);
		SOCK_BUF_LOCK(so, which);
		if (done == 0 || !(so->so_state & SS_NBIO)) {
			empty_results++;
			if (soaio_ready(so, sb)) {
				empty_retries++;
				SOCK_BUF_UNLOCK(so, which);
				goto retry;
			}

			if (!aio_set_cancel_function(job, soo_aio_cancel)) {
				SOCK_BUF_UNLOCK(so, which);
				if (done != 0)
					aio_complete(job, done, 0);
				else
					aio_cancel(job);
				SOCK_BUF_LOCK(so, which);
			} else {
				TAILQ_INSERT_HEAD(&sb->sb_aiojobq, job, list);
			}
			return;
		}
		SOCK_BUF_UNLOCK(so, which);
	}
	if (done != 0 && (error == ERESTART || error == EINTR ||
	    error == EWOULDBLOCK))
		error = 0;
	if (error)
		aio_complete(job, -1, error);
	else
		aio_complete(job, done, 0);
	SOCK_BUF_LOCK(so, which);
}
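
/*
 * Note on the retry path above: the job is executed with MSG_NBIO, so
 * a worker never blocks inside soreceive()/sousrsend().  EWOULDBLOCK
 * therefore means "buffer not ready right now"; the empty_results and
 * empty_retries counters exported under kern.ipc.aio record how often
 * that happens and how often an immediate retry was possible.
 */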

static void
soaio_process_sb(struct socket *so, sb_which which)
{
	struct kaiocb *job;
	struct sockbuf *sb = sobuf(so, which);

	CURVNET_SET(so->so_vnet);
	SOCK_BUF_LOCK(so, which);
	while (!TAILQ_EMPTY(&sb->sb_aiojobq) && soaio_ready(so, sb)) {
		job = TAILQ_FIRST(&sb->sb_aiojobq);
		TAILQ_REMOVE(&sb->sb_aiojobq, job, list);
		if (!aio_clear_cancel_function(job))
			continue;

		soaio_process_job(so, which, job);
	}

	/*
	 * If there are still pending requests, the socket must not be
	 * ready, so set SB_AIO to request a wakeup when the socket
	 * becomes ready.
	 */
	if (!TAILQ_EMPTY(&sb->sb_aiojobq))
		sb->sb_flags |= SB_AIO;
	sb->sb_flags &= ~SB_AIO_RUNNING;
	SOCK_BUF_UNLOCK(so, which);

	sorele(so);
	CURVNET_RESTORE();
}

void
soaio_rcv(void *context, int pending)
{
	struct socket *so;

	so = context;
	soaio_process_sb(so, SO_RCV);
}

void
soaio_snd(void *context, int pending)
{
	struct socket *so;

	so = context;
	soaio_process_sb(so, SO_SND);
}

void
sowakeup_aio(struct socket *so, sb_which which)
{
	struct sockbuf *sb = sobuf(so, which);

	SOCK_BUF_LOCK_ASSERT(so, which);

	sb->sb_flags &= ~SB_AIO;
	if (sb->sb_flags & SB_AIO_RUNNING)
		return;
	sb->sb_flags |= SB_AIO_RUNNING;
	soref(so);
	soaio_enqueue(&sb->sb_aiotask);
}
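
/*
 * sowakeup_aio() is invoked from the socket wakeup path when a buffer
 * with SB_AIO set becomes ready.  SB_AIO_RUNNING makes the enqueue
 * idempotent: at most one sb_aiotask (soaio_rcv or soaio_snd) is in
 * flight per buffer, and the soref() taken here is dropped by the
 * matching sorele() at the end of soaio_process_sb().
 */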

static void
soo_aio_cancel(struct kaiocb *job)
{
	struct socket *so;
	struct sockbuf *sb;
	long done;
	int opcode;
	sb_which which;

	so = job->fd_file->f_data;
	opcode = job->uaiocb.aio_lio_opcode;
	if (opcode & LIO_READ) {
		sb = &so->so_rcv;
		which = SO_RCV;
	} else {
		MPASS(opcode & LIO_WRITE);
		sb = &so->so_snd;
		which = SO_SND;
	}

	SOCK_BUF_LOCK(so, which);
	if (!aio_cancel_cleared(job))
		TAILQ_REMOVE(&sb->sb_aiojobq, job, list);
	if (TAILQ_EMPTY(&sb->sb_aiojobq))
		sb->sb_flags &= ~SB_AIO;
	SOCK_BUF_UNLOCK(so, which);

	done = job->aio_done;
	if (done != 0)
		aio_complete(job, done, 0);
	else
		aio_cancel(job);
}

static int
soo_aio_queue(struct file *fp, struct kaiocb *job)
{
	struct socket *so = fp->f_data;

	return (so->so_proto->pr_aio_queue(so, job));
}

int
soaio_queue_generic(struct socket *so, struct kaiocb *job)
{
	struct sockbuf *sb;
	sb_which which;

	/* Lock through the socket, since this may be a listening socket. */
	switch (job->uaiocb.aio_lio_opcode & (LIO_WRITE | LIO_READ)) {
	case LIO_READ:
		SOCK_RECVBUF_LOCK(so);
		sb = &so->so_rcv;
		which = SO_RCV;
		break;
	case LIO_WRITE:
		SOCK_SENDBUF_LOCK(so);
		sb = &so->so_snd;
		which = SO_SND;
		break;
	default:
		return (EINVAL);
	}

	if (SOLISTENING(so)) {
		SOCK_BUF_UNLOCK(so, which);
		return (EINVAL);
	}

	if (!aio_set_cancel_function(job, soo_aio_cancel))
		panic("new job was cancelled");
	TAILQ_INSERT_TAIL(&sb->sb_aiojobq, job, list);
	if (!(sb->sb_flags & SB_AIO_RUNNING)) {
		if (soaio_ready(so, sb))
			sowakeup_aio(so, which);
		else
			sb->sb_flags |= SB_AIO;
	}
	SOCK_BUF_UNLOCK(so, which);
	return (0);
}
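
/*
 * soaio_queue_generic() is the generic pr_aio_queue implementation
 * used by protocols that do not provide their own; soo_aio_queue()
 * above reaches it through the protocol switch.  An illustrative
 * userland request that ends up here (a sketch, not part of this
 * file):
 *
 *	struct aiocb cb = { 0 };
 *
 *	cb.aio_fildes = s;		// a connected socket
 *	cb.aio_buf = buf;
 *	cb.aio_nbytes = sizeof(buf);
 *	if (aio_read(&cb) == 0)
 *		... poll with aio_error() and reap with aio_return() ...
 */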
869