xref: /illumos-gate/usr/src/lib/libc/port/sys/epoll.c (revision dd72704bd9e794056c558153663c739e2012d721)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2017 Joyent, Inc.
14  * Copyright 2020 Oxide Computer Company
15  */
16 
17 #include <sys/types.h>
18 #include <sys/epoll.h>
19 #include <sys/devpoll.h>
20 #include <unistd.h>
21 #include <errno.h>
22 #include <fcntl.h>
23 #include <poll.h>
24 
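/*
 * Events that are passed to /dev/poll unmodified.  Their epoll values must
 * be identical to the values of the corresponding poll events, so fail the
 * build if any of them differ.
 */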
28 #if EPOLLIN != POLLIN
29 #error value of EPOLLIN does not match value of POLLIN
30 #endif
31 
32 #if EPOLLPRI != POLLPRI
33 #error value of EPOLLPRI does not match value of POLLPRI
34 #endif
35 
36 #if EPOLLOUT != POLLOUT
37 #error value of EPOLLOUT does not match value of POLLOUT
38 #endif
39 
40 #if EPOLLRDNORM != POLLRDNORM
41 #error value of EPOLLRDNORM does not match value of POLLRDNORM
42 #endif
43 
44 #if EPOLLRDBAND != POLLRDBAND
45 #error value of EPOLLRDBAND does not match value of POLLRDBAND
46 #endif
47 
48 #if EPOLLERR != POLLERR
49 #error value of EPOLLERR does not match value of POLLERR
50 #endif
51 
52 #if EPOLLHUP != POLLHUP
53 #error value of EPOLLHUP does not match value of POLLHUP
54 #endif
55 
/*
 * Event bits that callers may set but that are stripped from a request
 * before it is handed to /dev/poll.  They are accepted without error and
 * are never reported back.
 */
#define	EPOLLIGNORED	(EPOLLEXCLUSIVE | EPOLLWAKEUP | EPOLLMSG)
61 
/*
 * Event bits that occupy a different bit position in epoll than in poll.
 * Each one is cleared from the request and replaced with its POLL*
 * counterpart rather than being passed through as-is.
 */
#define	EPOLLSWIZZLED	\
	(EPOLLET | EPOLLONESHOT | EPOLLRDHUP | EPOLLWRNORM | EPOLLWRBAND)
67 
/*
 * epoll_wait/epoll_pwait treat any negative timeout as "block until an event
 * arrives or a signal interrupts the wait".  poll(4D) only does that for a
 * timeout of exactly -1; any other negative value times out immediately, as
 * though 0 had been passed.  To bridge the difference, every timeout below
 * -1 is mapped to -1 and all other values are passed through unchanged.
 */
#define	EPOLL_TIMEOUT_CLAMP(t)	(((t) >= -1) ? (t) : -1)
76 
77 int
78 epoll_create(int size)
79 {
80 	int fd;
81 
82 	/*
83 	 * From the epoll_create() man page:  "Since Linux 2.6.8, the size
84 	 * argument is ignored, but must be greater than zero."  You keep using
85 	 * that word "ignored"...
86 	 */
87 	if (size <= 0) {
88 		errno = EINVAL;
89 		return (-1);
90 	}
91 
92 	if ((fd = open("/dev/poll", O_RDWR)) == -1)
93 		return (-1);
94 
95 	if (ioctl(fd, DP_EPOLLCOMPAT, 0) == -1) {
96 		(void) close(fd);
97 		return (-1);
98 	}
99 
100 	return (fd);
101 }
102 
103 int
104 epoll_create1(int flags)
105 {
106 	int fd, oflags = O_RDWR;
107 
108 	if (flags & EPOLL_CLOEXEC) {
109 		oflags |= O_CLOEXEC;
110 		flags ^= EPOLL_CLOEXEC;
111 	}
112 	/* Reject unrecognized flags */
113 	if (flags != 0) {
114 		errno = EINVAL;
115 		return (-1);
116 	}
117 
118 	if ((fd = open("/dev/poll", oflags)) == -1)
119 		return (-1);
120 
121 	if (ioctl(fd, DP_EPOLLCOMPAT, 0) == -1) {
122 		(void) close(fd);
123 		return (-1);
124 	}
125 
126 	return (fd);
127 }
128 
129 int
130 epoll_ctl(int epfd, int op, int fd, struct epoll_event *event)
131 {
132 	dvpoll_epollfd_t epoll[2];
133 	uint32_t events, ev = 0;
134 	int i = 0, res;
135 
136 	epoll[i].dpep_pollfd.fd = fd;
137 
138 	switch (op) {
139 	case EPOLL_CTL_DEL:
140 		ev = POLLREMOVE;
141 		break;
142 
143 	case EPOLL_CTL_MOD:
144 		/* EPOLLEXCLUSIVE is prohibited for modify operations */
145 		if ((event->events & EPOLLEXCLUSIVE) != 0) {
146 			errno = EINVAL;
147 			return (-1);
148 		}
149 		/*
150 		 * In the modify case, we pass down two events:  one to
151 		 * remove the event and another to add it back.
152 		 */
153 		epoll[i++].dpep_pollfd.events = POLLREMOVE;
154 		epoll[i].dpep_pollfd.fd = fd;
155 		/* FALLTHROUGH */
156 
157 	case EPOLL_CTL_ADD:
158 		/*
159 		 * Mask off the events that we ignore, and then swizzle the
160 		 * events for which our values differ from their epoll(7)
161 		 * equivalents.
162 		 */
163 		events = event->events;
164 		ev = events & ~(EPOLLIGNORED | EPOLLSWIZZLED);
165 
166 		if (events & EPOLLRDHUP)
167 			ev |= POLLRDHUP;
168 
169 		if (events & EPOLLET)
170 			ev |= POLLET;
171 
172 		if (events & EPOLLONESHOT)
173 			ev |= POLLONESHOT;
174 
175 		if (events & EPOLLWRNORM)
176 			ev |= POLLWRNORM;
177 
178 		if (events & EPOLLWRBAND)
179 			ev |= POLLWRBAND;
180 
181 		epoll[i].dpep_data = event->data.u64;
182 		break;
183 
184 	default:
185 		errno = EOPNOTSUPP;
186 		return (-1);
187 	}
188 
189 	epoll[i].dpep_pollfd.events = ev;
190 retry:
191 	res = write(epfd, epoll, sizeof (epoll[0]) * (i + 1));
192 
193 	if (res == -1) {
194 		if (errno == EINTR) {
195 			/*
196 			 * Linux does not document EINTR as an allowed error
197 			 * for epoll_ctl.  The write must be retried if it is
198 			 * not done automatically via SA_RESTART.
199 			 */
200 			goto retry;
201 		}
202 		if (errno == ELOOP) {
203 			/*
204 			 * Convert the specific /dev/poll error about an fd
205 			 * loop into what is expected from the Linux epoll
206 			 * interface.
207 			 */
208 			errno = EINVAL;
209 		}
210 		return (-1);
211 	}
212 	return (0);
213 }
214 
215 int
216 epoll_wait(int epfd, struct epoll_event *events,
217     int maxevents, int timeout)
218 {
219 	struct dvpoll arg;
220 
221 	if (maxevents <= 0) {
222 		errno = EINVAL;
223 		return (-1);
224 	}
225 
226 	arg.dp_nfds = maxevents;
227 	arg.dp_timeout = EPOLL_TIMEOUT_CLAMP(timeout);
228 	arg.dp_fds = (pollfd_t *)events;
229 
230 	return (ioctl(epfd, DP_POLL, &arg));
231 }
232 
233 int
234 epoll_pwait(int epfd, struct epoll_event *events,
235     int maxevents, int timeout, const sigset_t *sigmask)
236 {
237 	struct dvpoll arg;
238 
239 	if (maxevents <= 0) {
240 		errno = EINVAL;
241 		return (-1);
242 	}
243 
244 	arg.dp_nfds = maxevents;
245 	arg.dp_timeout = EPOLL_TIMEOUT_CLAMP(timeout);
246 	arg.dp_fds = (pollfd_t *)events;
247 	arg.dp_setp = (sigset_t *)sigmask;
248 
249 	return (ioctl(epfd, DP_PPOLL, &arg));
250 }
251