// SPDX-License-Identifier: GPL-2.0-only
/*
 *  fs/eventfd.c
 *
 *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
 *
 */

#include <linux/file.h>
#include <linux/poll.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/sched/signal.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/anon_inodes.h>
#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/kref.h>
#include <linux/eventfd.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/idr.h>
#include <linux/uio.h>

static DEFINE_IDA(eventfd_ida);

struct eventfd_ctx {
	struct kref kref;
	wait_queue_head_t wqh;
	/*
	 * Every time that a write(2) is performed on an eventfd, the
	 * value of the __u64 being written is added to "count" and a
	 * wakeup is performed on "wqh". A read(2) will return the "count"
	 * value to userspace, and will reset "count" to zero. The kernel
	 * side eventfd_signal() also adds to the "count" counter and
	 * issues a wakeup.
	 */
	__u64 count;
	unsigned int flags;
	int id;
};

__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, unsigned mask)
{
	unsigned long flags;

	/*
	 * Deadlock or stack overflow issues can happen if we recurse here
	 * through waitqueue wakeup handlers. If the caller uses potentially
	 * nested waitqueues with custom wakeup handlers, then it should
	 * check eventfd_signal_allowed() before calling this function. If
	 * it returns false, the eventfd_signal() call should be deferred to a
	 * safe context.
	 */
	if (WARN_ON_ONCE(current->in_eventfd))
		return 0;

	spin_lock_irqsave(&ctx->wqh.lock, flags);
	current->in_eventfd = 1;
	if (ULLONG_MAX - ctx->count < n)
		n = ULLONG_MAX - ctx->count;
	ctx->count += n;
	if (waitqueue_active(&ctx->wqh))
		wake_up_locked_poll(&ctx->wqh, EPOLLIN | mask);
	current->in_eventfd = 0;
	spin_unlock_irqrestore(&ctx->wqh.lock, flags);

	return n;
}

/**
 * eventfd_signal - Adds @n to the eventfd counter.
 * @ctx: [in] Pointer to the eventfd context.
 * @n: [in] Value of the counter to be added to the eventfd internal counter.
 *          The value cannot be negative.
 *
 * This function is supposed to be called by the kernel in paths that do not
 * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
 * value, and we signal this as an overflow condition by returning EPOLLERR
 * to poll(2).
 *
 * Returns the amount by which the counter was incremented. This will be less
 * than @n if the counter has overflowed.
 */
__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
{
	return eventfd_signal_mask(ctx, n, 0);
}
EXPORT_SYMBOL_GPL(eventfd_signal);

static void eventfd_free_ctx(struct eventfd_ctx *ctx)
{
	if (ctx->id >= 0)
		ida_simple_remove(&eventfd_ida, ctx->id);
	kfree(ctx);
}

static void eventfd_free(struct kref *kref)
{
	struct eventfd_ctx *ctx = container_of(kref, struct eventfd_ctx, kref);

	eventfd_free_ctx(ctx);
}

/**
 * eventfd_ctx_put - Releases a reference to the internal eventfd context.
 * @ctx: [in] Pointer to eventfd context.
 *
 * The eventfd context reference must have been previously acquired either
 * with eventfd_ctx_fdget() or eventfd_ctx_fileget().
 */
void eventfd_ctx_put(struct eventfd_ctx *ctx)
{
	kref_put(&ctx->kref, eventfd_free);
}
EXPORT_SYMBOL_GPL(eventfd_ctx_put);
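
/*
 * Illustrative sketch (not part of this file): how a driver might consume
 * the eventfd_ctx_fdget() / eventfd_signal() / eventfd_ctx_put() API from
 * a non-sleeping path. "my_dev" and the my_dev_* helpers are hypothetical
 * names used only for this example.
 *
 *	struct my_dev {
 *		struct eventfd_ctx *trigger;
 *	};
 *
 *	// Setup: userspace hands us an eventfd file descriptor.
 *	static int my_dev_set_trigger(struct my_dev *dev, int fd)
 *	{
 *		struct eventfd_ctx *ctx = eventfd_ctx_fdget(fd);
 *
 *		if (IS_ERR(ctx))
 *			return PTR_ERR(ctx);
 *		dev->trigger = ctx;
 *		return 0;
 *	}
 *
 *	// Non-sleeping context (e.g. an interrupt handler): notify userspace
 *	// by adding 1 to the counter, which wakes any poll(2)/read(2) waiter.
 *	static irqreturn_t my_dev_hw_irq(int irq, void *data)
 *	{
 *		struct my_dev *dev = data;
 *
 *		eventfd_signal(dev->trigger, 1);
 *		return IRQ_HANDLED;
 *	}
 *
 *	// Teardown: drop the context reference taken by eventfd_ctx_fdget().
 *	static void my_dev_clear_trigger(struct my_dev *dev)
 *	{
 *		eventfd_ctx_put(dev->trigger);
 *		dev->trigger = NULL;
 *	}
 */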

static int eventfd_release(struct inode *inode, struct file *file)
{
	struct eventfd_ctx *ctx = file->private_data;

	wake_up_poll(&ctx->wqh, EPOLLHUP);
	eventfd_ctx_put(ctx);
	return 0;
}

static __poll_t eventfd_poll(struct file *file, poll_table *wait)
{
	struct eventfd_ctx *ctx = file->private_data;
	__poll_t events = 0;
	u64 count;

	poll_wait(file, &ctx->wqh, wait);

	/*
	 * All writes to ctx->count occur within ctx->wqh.lock. This read
	 * can be done outside ctx->wqh.lock because we know that poll_wait
	 * takes that lock (through add_wait_queue) if our caller will sleep.
	 *
	 * The read _can_ therefore seep into add_wait_queue's critical
	 * section, but cannot move above it! add_wait_queue's spin_lock acts
	 * as an acquire barrier and ensures that the read is ordered properly
	 * against the writes. The following CAN happen and is safe:
	 *
	 *     poll                               write
	 *     -----------------                  ------------
	 *     lock ctx->wqh.lock (in poll_wait)
	 *     count = ctx->count
	 *     __add_wait_queue
	 *     unlock ctx->wqh.lock
	 *                                        lock ctx->wqh.lock
	 *                                        ctx->count += n
	 *                                        if (waitqueue_active)
	 *                                          wake_up_locked_poll
	 *                                        unlock ctx->wqh.lock
	 *     eventfd_poll returns 0
	 *
	 * but the following, which would miss a wakeup, cannot happen:
	 *
	 *     poll                               write
	 *     -----------------                  ------------
	 *     count = ctx->count (INVALID!)
	 *                                        lock ctx->wqh.lock
	 *                                        ctx->count += n
	 *                                        **waitqueue_active is false**
	 *                                        **no wake_up_locked_poll!**
	 *                                        unlock ctx->wqh.lock
	 *     lock ctx->wqh.lock (in poll_wait)
	 *     __add_wait_queue
	 *     unlock ctx->wqh.lock
	 *     eventfd_poll returns 0
	 */
	count = READ_ONCE(ctx->count);

	if (count > 0)
		events |= EPOLLIN;
	if (count == ULLONG_MAX)
		events |= EPOLLERR;
	if (ULLONG_MAX - 1 > count)
		events |= EPOLLOUT;

	return events;
}

void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
{
	lockdep_assert_held(&ctx->wqh.lock);

	*cnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count;
	ctx->count -= *cnt;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_do_read);

/**
 * eventfd_ctx_remove_wait_queue - Reads the current counter and removes the wait queue entry.
 * @ctx: [in] Pointer to eventfd context.
 * @wait: [in] Wait queue to be removed.
 * @cnt: [out] Pointer to the 64-bit counter value.
 *
 * Returns %0 if successful, or the following error codes:
 *
 * -EAGAIN      : The operation would have blocked.
 *
 * This is used to atomically remove a wait queue entry from the eventfd wait
 * queue head, and read/reset the counter value.
 */
int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait,
				  __u64 *cnt)
{
	unsigned long flags;

	spin_lock_irqsave(&ctx->wqh.lock, flags);
	eventfd_ctx_do_read(ctx, cnt);
	__remove_wait_queue(&ctx->wqh, wait);
	if (*cnt != 0 && waitqueue_active(&ctx->wqh))
		wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
	spin_unlock_irqrestore(&ctx->wqh.lock, flags);

	return *cnt != 0 ? 0 : -EAGAIN;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue);
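
/*
 * Illustrative sketch (assumed names, not part of this file): a consumer
 * that registered its own wait queue entry on the eventfd (for example an
 * irqfd-style user) can detach it and drain the counter atomically at
 * teardown. "irqfd" and its "wait" member are assumptions for the example:
 *
 *	__u64 cnt;
 *	int ret = eventfd_ctx_remove_wait_queue(ctx, &irqfd->wait, &cnt);
 *
 *	if (ret == -EAGAIN)
 *		pr_debug("eventfd counter was zero, nothing was pending\n");
 *	else
 *		pr_debug("drained %llu from the eventfd counter\n",
 *			 (unsigned long long)cnt);
 */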

static ssize_t eventfd_read(struct kiocb *iocb, struct iov_iter *to)
{
	struct file *file = iocb->ki_filp;
	struct eventfd_ctx *ctx = file->private_data;
	__u64 ucnt = 0;
	DECLARE_WAITQUEUE(wait, current);

	if (iov_iter_count(to) < sizeof(ucnt))
		return -EINVAL;
	spin_lock_irq(&ctx->wqh.lock);
	if (!ctx->count) {
		if ((file->f_flags & O_NONBLOCK) ||
		    (iocb->ki_flags & IOCB_NOWAIT)) {
			spin_unlock_irq(&ctx->wqh.lock);
			return -EAGAIN;
		}
		__add_wait_queue(&ctx->wqh, &wait);
		for (;;) {
			set_current_state(TASK_INTERRUPTIBLE);
			if (ctx->count)
				break;
			if (signal_pending(current)) {
				__remove_wait_queue(&ctx->wqh, &wait);
				__set_current_state(TASK_RUNNING);
				spin_unlock_irq(&ctx->wqh.lock);
				return -ERESTARTSYS;
			}
			spin_unlock_irq(&ctx->wqh.lock);
			schedule();
			spin_lock_irq(&ctx->wqh.lock);
		}
		__remove_wait_queue(&ctx->wqh, &wait);
		__set_current_state(TASK_RUNNING);
	}
	eventfd_ctx_do_read(ctx, &ucnt);
	current->in_eventfd = 1;
	if (waitqueue_active(&ctx->wqh))
		wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
	current->in_eventfd = 0;
	spin_unlock_irq(&ctx->wqh.lock);
	if (unlikely(copy_to_iter(&ucnt, sizeof(ucnt), to) != sizeof(ucnt)))
		return -EFAULT;

	return sizeof(ucnt);
}

static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
			     loff_t *ppos)
{
	struct eventfd_ctx *ctx = file->private_data;
	ssize_t res;
	__u64 ucnt;
	DECLARE_WAITQUEUE(wait, current);

	if (count < sizeof(ucnt))
		return -EINVAL;
	if (copy_from_user(&ucnt, buf, sizeof(ucnt)))
		return -EFAULT;
	if (ucnt == ULLONG_MAX)
		return -EINVAL;
	spin_lock_irq(&ctx->wqh.lock);
	res = -EAGAIN;
	if (ULLONG_MAX - ctx->count > ucnt)
		res = sizeof(ucnt);
	else if (!(file->f_flags & O_NONBLOCK)) {
		__add_wait_queue(&ctx->wqh, &wait);
		for (res = 0;;) {
			set_current_state(TASK_INTERRUPTIBLE);
			if (ULLONG_MAX - ctx->count > ucnt) {
				res = sizeof(ucnt);
				break;
			}
			if (signal_pending(current)) {
				res = -ERESTARTSYS;
				break;
			}
			spin_unlock_irq(&ctx->wqh.lock);
			schedule();
			spin_lock_irq(&ctx->wqh.lock);
		}
		__remove_wait_queue(&ctx->wqh, &wait);
		__set_current_state(TASK_RUNNING);
	}
	if (likely(res > 0)) {
		ctx->count += ucnt;
		current->in_eventfd = 1;
		if (waitqueue_active(&ctx->wqh))
			wake_up_locked_poll(&ctx->wqh, EPOLLIN);
		current->in_eventfd = 0;
	}
	spin_unlock_irq(&ctx->wqh.lock);

	return res;
}

#ifdef CONFIG_PROC_FS
static void eventfd_show_fdinfo(struct seq_file *m, struct file *f)
{
	struct eventfd_ctx *ctx = f->private_data;

	spin_lock_irq(&ctx->wqh.lock);
	seq_printf(m, "eventfd-count: %16llx\n",
		   (unsigned long long)ctx->count);
	spin_unlock_irq(&ctx->wqh.lock);
	seq_printf(m, "eventfd-id: %d\n", ctx->id);
}
#endif

static const struct file_operations eventfd_fops = {
#ifdef CONFIG_PROC_FS
	.show_fdinfo	= eventfd_show_fdinfo,
#endif
	.release	= eventfd_release,
	.poll		= eventfd_poll,
	.read_iter	= eventfd_read,
	.write		= eventfd_write,
	.llseek		= noop_llseek,
};

/**
 * eventfd_fget - Acquire a reference of an eventfd file descriptor.
 * @fd: [in] Eventfd file descriptor.
 *
 * Returns a pointer to the eventfd file structure in case of success, or the
 * following error pointers:
 *
 * -EBADF    : Invalid @fd file descriptor.
 * -EINVAL   : The @fd file descriptor is not an eventfd file.
 */
struct file *eventfd_fget(int fd)
{
	struct file *file;

	file = fget(fd);
	if (!file)
		return ERR_PTR(-EBADF);
	if (file->f_op != &eventfd_fops) {
		fput(file);
		return ERR_PTR(-EINVAL);
	}

	return file;
}
EXPORT_SYMBOL_GPL(eventfd_fget);

/**
 * eventfd_ctx_fdget - Acquires a reference to the internal eventfd context.
 * @fd: [in] Eventfd file descriptor.
 *
 * Returns a pointer to the internal eventfd context, otherwise the error
 * pointers returned by the following functions:
 *
 * eventfd_fget
 */
struct eventfd_ctx *eventfd_ctx_fdget(int fd)
{
	struct eventfd_ctx *ctx;
	struct fd f = fdget(fd);
	if (!f.file)
		return ERR_PTR(-EBADF);
	ctx = eventfd_ctx_fileget(f.file);
	fdput(f);
	return ctx;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_fdget);

/**
 * eventfd_ctx_fileget - Acquires a reference to the internal eventfd context.
 * @file: [in] Eventfd file pointer.
 *
 * Returns a pointer to the internal eventfd context, otherwise the error
 * pointer:
 *
 * -EINVAL   : The @file is not an eventfd file.
 */
struct eventfd_ctx *eventfd_ctx_fileget(struct file *file)
{
	struct eventfd_ctx *ctx;

	if (file->f_op != &eventfd_fops)
		return ERR_PTR(-EINVAL);

	ctx = file->private_data;
	kref_get(&ctx->kref);
	return ctx;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_fileget);

static int do_eventfd(unsigned int count, int flags)
{
	struct eventfd_ctx *ctx;
	struct file *file;
	int fd;

	/* Check the EFD_* constants for consistency.  */
	BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
	BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);

	if (flags & ~EFD_FLAGS_SET)
		return -EINVAL;

	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;

	kref_init(&ctx->kref);
	init_waitqueue_head(&ctx->wqh);
	ctx->count = count;
	ctx->flags = flags;
	ctx->id = ida_simple_get(&eventfd_ida, 0, 0, GFP_KERNEL);

	flags &= EFD_SHARED_FCNTL_FLAGS;
	flags |= O_RDWR;
	fd = get_unused_fd_flags(flags);
	if (fd < 0)
		goto err;

	file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx, flags);
	if (IS_ERR(file)) {
		put_unused_fd(fd);
		fd = PTR_ERR(file);
		goto err;
	}

	file->f_mode |= FMODE_NOWAIT;
	fd_install(fd, file);
	return fd;
err:
	eventfd_free_ctx(ctx);
	return fd;
}

SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
{
	return do_eventfd(count, flags);
}

SYSCALL_DEFINE1(eventfd, unsigned int, count)
{
	return do_eventfd(count, 0);
}
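
/*
 * Illustrative userspace sketch (not part of the kernel build): the counter
 * semantics implemented above, as seen through the eventfd(2) syscall.
 * Error handling is omitted for brevity.
 *
 *	#include <sys/eventfd.h>
 *	#include <unistd.h>
 *	#include <stdint.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		uint64_t val;
 *		int fd = eventfd(0, EFD_CLOEXEC);
 *
 *		if (fd < 0)
 *			return 1;
 *
 *		val = 3;
 *		write(fd, &val, sizeof(val));	// count += 3, wakes readers
 *		val = 4;
 *		write(fd, &val, sizeof(val));	// count += 4 -> 7
 *
 *		read(fd, &val, sizeof(val));	// returns 7, resets count to 0
 *		printf("%llu\n", (unsigned long long)val);
 *
 *		close(fd);
 *		return 0;
 *	}
 *
 * With EFD_SEMAPHORE, each read() would instead return 1 and decrement the
 * counter by 1.
 */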