1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * fs/eventfd.c 4 * 5 * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org> 6 * 7 */ 8 9 #include <linux/file.h> 10 #include <linux/poll.h> 11 #include <linux/init.h> 12 #include <linux/fs.h> 13 #include <linux/sched/signal.h> 14 #include <linux/kernel.h> 15 #include <linux/slab.h> 16 #include <linux/list.h> 17 #include <linux/spinlock.h> 18 #include <linux/anon_inodes.h> 19 #include <linux/syscalls.h> 20 #include <linux/export.h> 21 #include <linux/kref.h> 22 #include <linux/eventfd.h> 23 #include <linux/proc_fs.h> 24 #include <linux/seq_file.h> 25 #include <linux/idr.h> 26 #include <linux/uio.h> 27 28 DEFINE_PER_CPU(int, eventfd_wake_count); 29 EXPORT_PER_CPU_SYMBOL_GPL(eventfd_wake_count); 30 31 static DEFINE_IDA(eventfd_ida); 32 33 struct eventfd_ctx { 34 struct kref kref; 35 wait_queue_head_t wqh; 36 /* 37 * Every time that a write(2) is performed on an eventfd, the 38 * value of the __u64 being written is added to "count" and a 39 * wakeup is performed on "wqh". A read(2) will return the "count" 40 * value to userspace, and will reset "count" to zero. The kernel 41 * side eventfd_signal() also, adds to the "count" counter and 42 * issue a wakeup. 43 */ 44 __u64 count; 45 unsigned int flags; 46 int id; 47 }; 48 49 /** 50 * eventfd_signal - Adds @n to the eventfd counter. 51 * @ctx: [in] Pointer to the eventfd context. 52 * @n: [in] Value of the counter to be added to the eventfd internal counter. 53 * The value cannot be negative. 54 * 55 * This function is supposed to be called by the kernel in paths that do not 56 * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX 57 * value, and we signal this as overflow condition by returning a EPOLLERR 58 * to poll(2). 59 * 60 * Returns the amount by which the counter was incremented. This will be less 61 * than @n if the counter has overflowed. 62 */ 63 __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n) 64 { 65 unsigned long flags; 66 67 /* 68 * Deadlock or stack overflow issues can happen if we recurse here 69 * through waitqueue wakeup handlers. If the caller users potentially 70 * nested waitqueues with custom wakeup handlers, then it should 71 * check eventfd_signal_count() before calling this function. If 72 * it returns true, the eventfd_signal() call should be deferred to a 73 * safe context. 74 */ 75 if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count))) 76 return 0; 77 78 spin_lock_irqsave(&ctx->wqh.lock, flags); 79 this_cpu_inc(eventfd_wake_count); 80 if (ULLONG_MAX - ctx->count < n) 81 n = ULLONG_MAX - ctx->count; 82 ctx->count += n; 83 if (waitqueue_active(&ctx->wqh)) 84 wake_up_locked_poll(&ctx->wqh, EPOLLIN); 85 this_cpu_dec(eventfd_wake_count); 86 spin_unlock_irqrestore(&ctx->wqh.lock, flags); 87 88 return n; 89 } 90 EXPORT_SYMBOL_GPL(eventfd_signal); 91 92 static void eventfd_free_ctx(struct eventfd_ctx *ctx) 93 { 94 if (ctx->id >= 0) 95 ida_simple_remove(&eventfd_ida, ctx->id); 96 kfree(ctx); 97 } 98 99 static void eventfd_free(struct kref *kref) 100 { 101 struct eventfd_ctx *ctx = container_of(kref, struct eventfd_ctx, kref); 102 103 eventfd_free_ctx(ctx); 104 } 105 106 /** 107 * eventfd_ctx_put - Releases a reference to the internal eventfd context. 108 * @ctx: [in] Pointer to eventfd context. 109 * 110 * The eventfd context reference must have been previously acquired either 111 * with eventfd_ctx_fdget() or eventfd_ctx_fileget(). 112 */ 113 void eventfd_ctx_put(struct eventfd_ctx *ctx) 114 { 115 kref_put(&ctx->kref, eventfd_free); 116 } 117 EXPORT_SYMBOL_GPL(eventfd_ctx_put); 118 119 static int eventfd_release(struct inode *inode, struct file *file) 120 { 121 struct eventfd_ctx *ctx = file->private_data; 122 123 wake_up_poll(&ctx->wqh, EPOLLHUP); 124 eventfd_ctx_put(ctx); 125 return 0; 126 } 127 128 static __poll_t eventfd_poll(struct file *file, poll_table *wait) 129 { 130 struct eventfd_ctx *ctx = file->private_data; 131 __poll_t events = 0; 132 u64 count; 133 134 poll_wait(file, &ctx->wqh, wait); 135 136 /* 137 * All writes to ctx->count occur within ctx->wqh.lock. This read 138 * can be done outside ctx->wqh.lock because we know that poll_wait 139 * takes that lock (through add_wait_queue) if our caller will sleep. 140 * 141 * The read _can_ therefore seep into add_wait_queue's critical 142 * section, but cannot move above it! add_wait_queue's spin_lock acts 143 * as an acquire barrier and ensures that the read be ordered properly 144 * against the writes. The following CAN happen and is safe: 145 * 146 * poll write 147 * ----------------- ------------ 148 * lock ctx->wqh.lock (in poll_wait) 149 * count = ctx->count 150 * __add_wait_queue 151 * unlock ctx->wqh.lock 152 * lock ctx->qwh.lock 153 * ctx->count += n 154 * if (waitqueue_active) 155 * wake_up_locked_poll 156 * unlock ctx->qwh.lock 157 * eventfd_poll returns 0 158 * 159 * but the following, which would miss a wakeup, cannot happen: 160 * 161 * poll write 162 * ----------------- ------------ 163 * count = ctx->count (INVALID!) 164 * lock ctx->qwh.lock 165 * ctx->count += n 166 * **waitqueue_active is false** 167 * **no wake_up_locked_poll!** 168 * unlock ctx->qwh.lock 169 * lock ctx->wqh.lock (in poll_wait) 170 * __add_wait_queue 171 * unlock ctx->wqh.lock 172 * eventfd_poll returns 0 173 */ 174 count = READ_ONCE(ctx->count); 175 176 if (count > 0) 177 events |= EPOLLIN; 178 if (count == ULLONG_MAX) 179 events |= EPOLLERR; 180 if (ULLONG_MAX - 1 > count) 181 events |= EPOLLOUT; 182 183 return events; 184 } 185 186 void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt) 187 { 188 lockdep_assert_held(&ctx->wqh.lock); 189 190 *cnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count; 191 ctx->count -= *cnt; 192 } 193 EXPORT_SYMBOL_GPL(eventfd_ctx_do_read); 194 195 /** 196 * eventfd_ctx_remove_wait_queue - Read the current counter and removes wait queue. 197 * @ctx: [in] Pointer to eventfd context. 198 * @wait: [in] Wait queue to be removed. 199 * @cnt: [out] Pointer to the 64-bit counter value. 200 * 201 * Returns %0 if successful, or the following error codes: 202 * 203 * -EAGAIN : The operation would have blocked. 204 * 205 * This is used to atomically remove a wait queue entry from the eventfd wait 206 * queue head, and read/reset the counter value. 207 */ 208 int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait, 209 __u64 *cnt) 210 { 211 unsigned long flags; 212 213 spin_lock_irqsave(&ctx->wqh.lock, flags); 214 eventfd_ctx_do_read(ctx, cnt); 215 __remove_wait_queue(&ctx->wqh, wait); 216 if (*cnt != 0 && waitqueue_active(&ctx->wqh)) 217 wake_up_locked_poll(&ctx->wqh, EPOLLOUT); 218 spin_unlock_irqrestore(&ctx->wqh.lock, flags); 219 220 return *cnt != 0 ? 0 : -EAGAIN; 221 } 222 EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue); 223 224 static ssize_t eventfd_read(struct kiocb *iocb, struct iov_iter *to) 225 { 226 struct file *file = iocb->ki_filp; 227 struct eventfd_ctx *ctx = file->private_data; 228 __u64 ucnt = 0; 229 DECLARE_WAITQUEUE(wait, current); 230 231 if (iov_iter_count(to) < sizeof(ucnt)) 232 return -EINVAL; 233 spin_lock_irq(&ctx->wqh.lock); 234 if (!ctx->count) { 235 if ((file->f_flags & O_NONBLOCK) || 236 (iocb->ki_flags & IOCB_NOWAIT)) { 237 spin_unlock_irq(&ctx->wqh.lock); 238 return -EAGAIN; 239 } 240 __add_wait_queue(&ctx->wqh, &wait); 241 for (;;) { 242 set_current_state(TASK_INTERRUPTIBLE); 243 if (ctx->count) 244 break; 245 if (signal_pending(current)) { 246 __remove_wait_queue(&ctx->wqh, &wait); 247 __set_current_state(TASK_RUNNING); 248 spin_unlock_irq(&ctx->wqh.lock); 249 return -ERESTARTSYS; 250 } 251 spin_unlock_irq(&ctx->wqh.lock); 252 schedule(); 253 spin_lock_irq(&ctx->wqh.lock); 254 } 255 __remove_wait_queue(&ctx->wqh, &wait); 256 __set_current_state(TASK_RUNNING); 257 } 258 eventfd_ctx_do_read(ctx, &ucnt); 259 if (waitqueue_active(&ctx->wqh)) 260 wake_up_locked_poll(&ctx->wqh, EPOLLOUT); 261 spin_unlock_irq(&ctx->wqh.lock); 262 if (unlikely(copy_to_iter(&ucnt, sizeof(ucnt), to) != sizeof(ucnt))) 263 return -EFAULT; 264 265 return sizeof(ucnt); 266 } 267 268 static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count, 269 loff_t *ppos) 270 { 271 struct eventfd_ctx *ctx = file->private_data; 272 ssize_t res; 273 __u64 ucnt; 274 DECLARE_WAITQUEUE(wait, current); 275 276 if (count < sizeof(ucnt)) 277 return -EINVAL; 278 if (copy_from_user(&ucnt, buf, sizeof(ucnt))) 279 return -EFAULT; 280 if (ucnt == ULLONG_MAX) 281 return -EINVAL; 282 spin_lock_irq(&ctx->wqh.lock); 283 res = -EAGAIN; 284 if (ULLONG_MAX - ctx->count > ucnt) 285 res = sizeof(ucnt); 286 else if (!(file->f_flags & O_NONBLOCK)) { 287 __add_wait_queue(&ctx->wqh, &wait); 288 for (res = 0;;) { 289 set_current_state(TASK_INTERRUPTIBLE); 290 if (ULLONG_MAX - ctx->count > ucnt) { 291 res = sizeof(ucnt); 292 break; 293 } 294 if (signal_pending(current)) { 295 res = -ERESTARTSYS; 296 break; 297 } 298 spin_unlock_irq(&ctx->wqh.lock); 299 schedule(); 300 spin_lock_irq(&ctx->wqh.lock); 301 } 302 __remove_wait_queue(&ctx->wqh, &wait); 303 __set_current_state(TASK_RUNNING); 304 } 305 if (likely(res > 0)) { 306 ctx->count += ucnt; 307 if (waitqueue_active(&ctx->wqh)) 308 wake_up_locked_poll(&ctx->wqh, EPOLLIN); 309 } 310 spin_unlock_irq(&ctx->wqh.lock); 311 312 return res; 313 } 314 315 #ifdef CONFIG_PROC_FS 316 static void eventfd_show_fdinfo(struct seq_file *m, struct file *f) 317 { 318 struct eventfd_ctx *ctx = f->private_data; 319 320 spin_lock_irq(&ctx->wqh.lock); 321 seq_printf(m, "eventfd-count: %16llx\n", 322 (unsigned long long)ctx->count); 323 spin_unlock_irq(&ctx->wqh.lock); 324 seq_printf(m, "eventfd-id: %d\n", ctx->id); 325 } 326 #endif 327 328 static const struct file_operations eventfd_fops = { 329 #ifdef CONFIG_PROC_FS 330 .show_fdinfo = eventfd_show_fdinfo, 331 #endif 332 .release = eventfd_release, 333 .poll = eventfd_poll, 334 .read_iter = eventfd_read, 335 .write = eventfd_write, 336 .llseek = noop_llseek, 337 }; 338 339 /** 340 * eventfd_fget - Acquire a reference of an eventfd file descriptor. 341 * @fd: [in] Eventfd file descriptor. 342 * 343 * Returns a pointer to the eventfd file structure in case of success, or the 344 * following error pointer: 345 * 346 * -EBADF : Invalid @fd file descriptor. 347 * -EINVAL : The @fd file descriptor is not an eventfd file. 348 */ 349 struct file *eventfd_fget(int fd) 350 { 351 struct file *file; 352 353 file = fget(fd); 354 if (!file) 355 return ERR_PTR(-EBADF); 356 if (file->f_op != &eventfd_fops) { 357 fput(file); 358 return ERR_PTR(-EINVAL); 359 } 360 361 return file; 362 } 363 EXPORT_SYMBOL_GPL(eventfd_fget); 364 365 /** 366 * eventfd_ctx_fdget - Acquires a reference to the internal eventfd context. 367 * @fd: [in] Eventfd file descriptor. 368 * 369 * Returns a pointer to the internal eventfd context, otherwise the error 370 * pointers returned by the following functions: 371 * 372 * eventfd_fget 373 */ 374 struct eventfd_ctx *eventfd_ctx_fdget(int fd) 375 { 376 struct eventfd_ctx *ctx; 377 struct fd f = fdget(fd); 378 if (!f.file) 379 return ERR_PTR(-EBADF); 380 ctx = eventfd_ctx_fileget(f.file); 381 fdput(f); 382 return ctx; 383 } 384 EXPORT_SYMBOL_GPL(eventfd_ctx_fdget); 385 386 /** 387 * eventfd_ctx_fileget - Acquires a reference to the internal eventfd context. 388 * @file: [in] Eventfd file pointer. 389 * 390 * Returns a pointer to the internal eventfd context, otherwise the error 391 * pointer: 392 * 393 * -EINVAL : The @fd file descriptor is not an eventfd file. 394 */ 395 struct eventfd_ctx *eventfd_ctx_fileget(struct file *file) 396 { 397 struct eventfd_ctx *ctx; 398 399 if (file->f_op != &eventfd_fops) 400 return ERR_PTR(-EINVAL); 401 402 ctx = file->private_data; 403 kref_get(&ctx->kref); 404 return ctx; 405 } 406 EXPORT_SYMBOL_GPL(eventfd_ctx_fileget); 407 408 static int do_eventfd(unsigned int count, int flags) 409 { 410 struct eventfd_ctx *ctx; 411 struct file *file; 412 int fd; 413 414 /* Check the EFD_* constants for consistency. */ 415 BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC); 416 BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK); 417 418 if (flags & ~EFD_FLAGS_SET) 419 return -EINVAL; 420 421 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); 422 if (!ctx) 423 return -ENOMEM; 424 425 kref_init(&ctx->kref); 426 init_waitqueue_head(&ctx->wqh); 427 ctx->count = count; 428 ctx->flags = flags; 429 ctx->id = ida_simple_get(&eventfd_ida, 0, 0, GFP_KERNEL); 430 431 flags &= EFD_SHARED_FCNTL_FLAGS; 432 flags |= O_RDWR; 433 fd = get_unused_fd_flags(flags); 434 if (fd < 0) 435 goto err; 436 437 file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx, flags); 438 if (IS_ERR(file)) { 439 put_unused_fd(fd); 440 fd = PTR_ERR(file); 441 goto err; 442 } 443 444 file->f_mode |= FMODE_NOWAIT; 445 fd_install(fd, file); 446 return fd; 447 err: 448 eventfd_free_ctx(ctx); 449 return fd; 450 } 451 452 SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags) 453 { 454 return do_eventfd(count, flags); 455 } 456 457 SYSCALL_DEFINE1(eventfd, unsigned int, count) 458 { 459 return do_eventfd(count, 0); 460 } 461 462