xref: /linux/fs/notify/fanotify/fanotify_user.c (revision fba784cc9e3d22a530211ef3ec60d04562349cb4)
1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/fanotify.h>
3 #include <linux/fcntl.h>
4 #include <linux/file.h>
5 #include <linux/fs.h>
6 #include <linux/anon_inodes.h>
7 #include <linux/fsnotify_backend.h>
8 #include <linux/init.h>
9 #include <linux/mount.h>
10 #include <linux/namei.h>
11 #include <linux/poll.h>
12 #include <linux/security.h>
13 #include <linux/syscalls.h>
14 #include <linux/slab.h>
15 #include <linux/types.h>
16 #include <linux/uaccess.h>
17 #include <linux/compat.h>
18 #include <linux/sched/signal.h>
19 #include <linux/memcontrol.h>
20 #include <linux/statfs.h>
21 #include <linux/exportfs.h>
22 
23 #include <asm/ioctls.h>
24 
25 #include "../fsnotify.h"
26 #include "../fdinfo.h"
27 #include "fanotify.h"
28 
29 #define FANOTIFY_DEFAULT_MAX_EVENTS	16384
30 #define FANOTIFY_OLD_DEFAULT_MAX_MARKS	8192
31 #define FANOTIFY_DEFAULT_MAX_GROUPS	128
32 #define FANOTIFY_DEFAULT_FEE_POOL_SIZE	32
33 
34 /*
35  * The legacy fanotify marks limit (8192) is per group; we introduced a tunable
36  * limit of marks per user, similar to inotify.  Effectively, the legacy limit
37  * of fanotify marks per user is <max marks per group> * <max groups per user>.
38  * This default limit (1M) also happens to match the increased limit of inotify
39  * max_user_watches since v5.10.
40  */
41 #define FANOTIFY_DEFAULT_MAX_USER_MARKS	\
42 	(FANOTIFY_OLD_DEFAULT_MAX_MARKS * FANOTIFY_DEFAULT_MAX_GROUPS)
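/*
 * For illustration, the default above works out to:
 *
 *	FANOTIFY_DEFAULT_MAX_USER_MARKS = 8192 * 128 = 1048576 (~1M)
 */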
43 
44 /*
45  * Most of the memory cost of adding an inode mark is pinning the marked inode.
46  * The size of the filesystem inode struct is not uniform across filesystems,
47  * so double the size of a VFS inode is used as a conservative approximation.
48  */
49 #define INODE_MARK_COST	(2 * sizeof(struct inode))
50 
51 /* configurable via /proc/sys/fs/fanotify/ */
52 static int fanotify_max_queued_events __read_mostly;
53 
54 #ifdef CONFIG_SYSCTL
55 
56 #include <linux/sysctl.h>
57 
58 static long ft_zero = 0;
59 static long ft_int_max = INT_MAX;
60 
61 static const struct ctl_table fanotify_table[] = {
62 	{
63 		.procname	= "max_user_groups",
64 		.data	= &init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS],
65 		.maxlen		= sizeof(long),
66 		.mode		= 0644,
67 		.proc_handler	= proc_doulongvec_minmax,
68 		.extra1		= &ft_zero,
69 		.extra2		= &ft_int_max,
70 	},
71 	{
72 		.procname	= "max_user_marks",
73 		.data	= &init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS],
74 		.maxlen		= sizeof(long),
75 		.mode		= 0644,
76 		.proc_handler	= proc_doulongvec_minmax,
77 		.extra1		= &ft_zero,
78 		.extra2		= &ft_int_max,
79 	},
80 	{
81 		.procname	= "max_queued_events",
82 		.data		= &fanotify_max_queued_events,
83 		.maxlen		= sizeof(int),
84 		.mode		= 0644,
85 		.proc_handler	= proc_dointvec_minmax,
86 		.extra1		= SYSCTL_ZERO
87 	},
88 };
89 
90 static void __init fanotify_sysctls_init(void)
91 {
92 	register_sysctl("fs/fanotify", fanotify_table);
93 }
94 #else
95 #define fanotify_sysctls_init() do { } while (0)
96 #endif /* CONFIG_SYSCTL */
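/*
 * For illustration, with CONFIG_SYSCTL the table above is exposed as:
 *
 *	/proc/sys/fs/fanotify/max_user_groups
 *	/proc/sys/fs/fanotify/max_user_marks
 *	/proc/sys/fs/fanotify/max_queued_events
 *
 * e.g. "echo 32768 > /proc/sys/fs/fanotify/max_queued_events" (a sketch;
 * the value is arbitrary) raises the queue limit used by groups created
 * afterwards without FAN_UNLIMITED_QUEUE.
 */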
97 
98 /*
99  * All flags that may be specified in parameter event_f_flags of fanotify_init.
100  *
101  * Internal and external open flags are stored together in field f_flags of
102  * struct file. Only external open flags shall be allowed in event_f_flags.
103  * Internal flags like FMODE_EXEC shall be excluded.
104  */
105 #define	FANOTIFY_INIT_ALL_EVENT_F_BITS				( \
106 		O_ACCMODE	| O_APPEND	| O_NONBLOCK	| \
107 		__O_SYNC	| O_DSYNC	| O_CLOEXEC     | \
108 		O_LARGEFILE	| O_NOATIME	)
109 
110 extern const struct fsnotify_ops fanotify_fsnotify_ops;
111 
112 struct kmem_cache *fanotify_mark_cache __ro_after_init;
113 struct kmem_cache *fanotify_fid_event_cachep __ro_after_init;
114 struct kmem_cache *fanotify_path_event_cachep __ro_after_init;
115 struct kmem_cache *fanotify_perm_event_cachep __ro_after_init;
116 struct kmem_cache *fanotify_mnt_event_cachep __ro_after_init;
117 
118 #define FANOTIFY_EVENT_ALIGN 4
119 #define FANOTIFY_FID_INFO_HDR_LEN \
120 	(sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle))
121 #define FANOTIFY_PIDFD_INFO_LEN \
122 	sizeof(struct fanotify_event_info_pidfd)
123 #define FANOTIFY_ERROR_INFO_LEN \
124 	(sizeof(struct fanotify_event_info_error))
125 #define FANOTIFY_RANGE_INFO_LEN \
126 	(sizeof(struct fanotify_event_info_range))
127 #define FANOTIFY_MNT_INFO_LEN \
128 	(sizeof(struct fanotify_event_info_mnt))
129 
130 static int fanotify_fid_info_len(int fh_len, int name_len)
131 {
132 	int info_len = fh_len;
133 
134 	if (name_len)
135 		info_len += name_len + 1;
136 
137 	return roundup(FANOTIFY_FID_INFO_HDR_LEN + info_len,
138 		       FANOTIFY_EVENT_ALIGN);
139 }
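/*
 * A worked sizing example (a sketch, assuming a build where
 * sizeof(struct fanotify_event_info_fid) == 12 and
 * sizeof(struct file_handle) == 8, so FANOTIFY_FID_INFO_HDR_LEN == 20):
 *
 *	fanotify_fid_info_len(8, 3)
 *		== roundup(20 + 8 + (3 + 1), 4)	(fh + name + NUL)
 *		== 32
 */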
140 
141 /* FAN_RENAME may have one or two dir+name info records */
142 static int fanotify_dir_name_info_len(struct fanotify_event *event)
143 {
144 	struct fanotify_info *info = fanotify_event_info(event);
145 	int dir_fh_len = fanotify_event_dir_fh_len(event);
146 	int dir2_fh_len = fanotify_event_dir2_fh_len(event);
147 	int info_len = 0;
148 
149 	if (dir_fh_len)
150 		info_len += fanotify_fid_info_len(dir_fh_len,
151 						  info->name_len);
152 	if (dir2_fh_len)
153 		info_len += fanotify_fid_info_len(dir2_fh_len,
154 						  info->name2_len);
155 
156 	return info_len;
157 }
158 
159 static size_t fanotify_event_len(unsigned int info_mode,
160 				 struct fanotify_event *event)
161 {
162 	size_t event_len = FAN_EVENT_METADATA_LEN;
163 	int fh_len;
164 	int dot_len = 0;
165 
166 	if (fanotify_is_error_event(event->mask))
167 		event_len += FANOTIFY_ERROR_INFO_LEN;
168 
169 	if (fanotify_event_has_any_dir_fh(event)) {
170 		event_len += fanotify_dir_name_info_len(event);
171 	} else if ((info_mode & FAN_REPORT_NAME) &&
172 		   (event->mask & FAN_ONDIR)) {
173 		/*
174 		 * With group flag FAN_REPORT_NAME, if name was not recorded in
175 		 * event on a directory, we will report the name ".".
176 		 */
177 		dot_len = 1;
178 	}
179 
180 	if (fanotify_event_has_object_fh(event)) {
181 		fh_len = fanotify_event_object_fh_len(event);
182 		event_len += fanotify_fid_info_len(fh_len, dot_len);
183 	}
184 	if (fanotify_is_mnt_event(event->mask))
185 		event_len += FANOTIFY_MNT_INFO_LEN;
186 
187 	if (info_mode & FAN_REPORT_PIDFD)
188 		event_len += FANOTIFY_PIDFD_INFO_LEN;
189 
190 	if (fanotify_event_has_access_range(event))
191 		event_len += FANOTIFY_RANGE_INFO_LEN;
192 
193 	return event_len;
194 }
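/*
 * For illustration: a plain path event from a group with no
 * FANOTIFY_INFO_MODES flags takes none of the branches above, so the
 * reported length is just FAN_EVENT_METADATA_LEN, i.e. the fixed
 * sizeof(struct fanotify_event_metadata) (24 bytes in the current UAPI).
 */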
195 
196 /*
197  * Remove a hashed event from the merge hash table.
198  */
199 static void fanotify_unhash_event(struct fsnotify_group *group,
200 				  struct fanotify_event *event)
201 {
202 	assert_spin_locked(&group->notification_lock);
203 
204 	pr_debug("%s: group=%p event=%p bucket=%u\n", __func__,
205 		 group, event, fanotify_event_hash_bucket(group, event));
206 
207 	if (WARN_ON_ONCE(hlist_unhashed(&event->merge_list)))
208 		return;
209 
210 	hlist_del_init(&event->merge_list);
211 }
212 
213 /*
214  * Get a fanotify notification event if one exists and is small
215  * enough to fit in "count". Return an error pointer if the count
216  * is not large enough. When a permission event is dequeued, its state is
217  * updated accordingly.
218  */
219 static struct fanotify_event *get_one_event(struct fsnotify_group *group,
220 					    size_t count)
221 {
222 	size_t event_size;
223 	struct fanotify_event *event = NULL;
224 	struct fsnotify_event *fsn_event;
225 	unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES);
226 
227 	pr_debug("%s: group=%p count=%zd\n", __func__, group, count);
228 
229 	spin_lock(&group->notification_lock);
230 	fsn_event = fsnotify_peek_first_event(group);
231 	if (!fsn_event)
232 		goto out;
233 
234 	event = FANOTIFY_E(fsn_event);
235 	event_size = fanotify_event_len(info_mode, event);
236 
237 	if (event_size > count) {
238 		event = ERR_PTR(-EINVAL);
239 		goto out;
240 	}
241 
242 	/*
243 	 * We held the notification_lock the whole time, so this is the
244 	 * same event we peeked above.
245 	 */
246 	fsnotify_remove_first_event(group);
247 	if (fanotify_is_perm_event(event->mask))
248 		FANOTIFY_PERM(event)->state = FAN_EVENT_REPORTED;
249 	if (fanotify_is_hashed_event(event->mask))
250 		fanotify_unhash_event(group, event);
251 out:
252 	spin_unlock(&group->notification_lock);
253 	return event;
254 }
255 
256 static int create_fd(struct fsnotify_group *group, const struct path *path,
257 		     struct file **file)
258 {
259 	int client_fd;
260 	struct file *new_file;
261 
262 	client_fd = get_unused_fd_flags(group->fanotify_data.f_flags);
263 	if (client_fd < 0)
264 		return client_fd;
265 
266 	/*
267 	 * We provide an fd for the userspace program, so it can access the
268 	 * file without generating fanotify events itself.
269 	 */
270 	new_file = dentry_open_nonotify(path, group->fanotify_data.f_flags,
271 					current_cred());
272 	if (IS_ERR(new_file)) {
273 		put_unused_fd(client_fd);
274 		client_fd = PTR_ERR(new_file);
275 	} else {
276 		*file = new_file;
277 	}
278 
279 	return client_fd;
280 }
281 
282 static int process_access_response_info(const char __user *info,
283 					size_t info_len,
284 				struct fanotify_response_info_audit_rule *friar)
285 {
286 	if (info_len != sizeof(*friar))
287 		return -EINVAL;
288 
289 	if (copy_from_user(friar, info, sizeof(*friar)))
290 		return -EFAULT;
291 
292 	if (friar->hdr.type != FAN_RESPONSE_INFO_AUDIT_RULE)
293 		return -EINVAL;
294 	if (friar->hdr.pad != 0)
295 		return -EINVAL;
296 	if (friar->hdr.len != sizeof(*friar))
297 		return -EINVAL;
298 
299 	return info_len;
300 }
301 
302 /*
303  * Finish processing of permission event by setting it to ANSWERED state and
304  * drop group->notification_lock.
305  */
306 static void finish_permission_event(struct fsnotify_group *group,
307 				    struct fanotify_perm_event *event, u32 response,
308 				    struct fanotify_response_info_audit_rule *friar)
309 				    __releases(&group->notification_lock)
310 {
311 	bool destroy = false;
312 
313 	assert_spin_locked(&group->notification_lock);
314 	event->response = response & ~FAN_INFO;
315 	if (response & FAN_INFO)
316 		memcpy(&event->audit_rule, friar, sizeof(*friar));
317 
318 	if (event->state == FAN_EVENT_CANCELED)
319 		destroy = true;
320 	else
321 		event->state = FAN_EVENT_ANSWERED;
322 	spin_unlock(&group->notification_lock);
323 	if (destroy)
324 		fsnotify_destroy_event(group, &event->fae.fse);
325 }
326 
327 static int process_access_response(struct fsnotify_group *group,
328 				   struct fanotify_response *response_struct,
329 				   const char __user *info,
330 				   size_t info_len)
331 {
332 	struct fanotify_perm_event *event;
333 	int fd = response_struct->fd;
334 	u32 response = response_struct->response;
335 	int errno = fanotify_get_response_errno(response);
336 	int ret = info_len;
337 	struct fanotify_response_info_audit_rule friar;
338 
339 	pr_debug("%s: group=%p fd=%d response=%x errno=%d buf=%p size=%zu\n",
340 		 __func__, group, fd, response, errno, info, info_len);
341 	/*
342 	 * Make sure the response is valid; if it is invalid we do nothing and
343 	 * either userspace can send a valid response or we will clean it up
344 	 * after the timeout.
345 	 */
346 	if (response & ~FANOTIFY_RESPONSE_VALID_MASK)
347 		return -EINVAL;
348 
349 	switch (response & FANOTIFY_RESPONSE_ACCESS) {
350 	case FAN_ALLOW:
351 		if (errno)
352 			return -EINVAL;
353 		break;
354 	case FAN_DENY:
355 		/* Custom errno is supported only for pre-content groups */
356 		if (errno && group->priority != FSNOTIFY_PRIO_PRE_CONTENT)
357 			return -EINVAL;
358 
359 		/*
360 		 * Limit errno to values expected on open(2)/read(2)/write(2)
361 		 * of regular files.
362 		 */
363 		switch (errno) {
364 		case 0:
365 		case EIO:
366 		case EPERM:
367 		case EBUSY:
368 		case ETXTBSY:
369 		case EAGAIN:
370 		case ENOSPC:
371 		case EDQUOT:
372 			break;
373 		default:
374 			return -EINVAL;
375 		}
376 		break;
377 	default:
378 		return -EINVAL;
379 	}
380 
381 	if ((response & FAN_AUDIT) && !FAN_GROUP_FLAG(group, FAN_ENABLE_AUDIT))
382 		return -EINVAL;
383 
384 	if (response & FAN_INFO) {
385 		ret = process_access_response_info(info, info_len, &friar);
386 		if (ret < 0)
387 			return ret;
388 		if (fd == FAN_NOFD)
389 			return ret;
390 	} else {
391 		ret = 0;
392 	}
393 
394 	if (fd < 0)
395 		return -EINVAL;
396 
397 	spin_lock(&group->notification_lock);
398 	list_for_each_entry(event, &group->fanotify_data.access_list,
399 			    fae.fse.list) {
400 		if (event->fd != fd)
401 			continue;
402 
403 		list_del_init(&event->fae.fse.list);
404 		finish_permission_event(group, event, response, &friar);
405 		wake_up(&group->fanotify_data.access_waitq);
406 		return ret;
407 	}
408 	spin_unlock(&group->notification_lock);
409 
410 	return -ENOENT;
411 }
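/*
 * Userspace counterpart (a minimal sketch, assuming the listener already
 * read a permission event and saved the fd from its metadata):
 *
 *	#include <sys/fanotify.h>
 *	#include <unistd.h>
 *
 *	static int fan_allow(int fanotify_fd, int event_fd)
 *	{
 *		struct fanotify_response resp = {
 *			.fd = event_fd,
 *			.response = FAN_ALLOW,	// or FAN_DENY
 *		};
 *
 *		// Lands in process_access_response() above.
 *		if (write(fanotify_fd, &resp, sizeof(resp)) != (ssize_t)sizeof(resp))
 *			return -1;
 *		return 0;
 *	}
 */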
412 
413 static size_t copy_mnt_info_to_user(struct fanotify_event *event,
414 				    char __user *buf, int count)
415 {
416 	struct fanotify_event_info_mnt info = { };
417 
418 	info.hdr.info_type = FAN_EVENT_INFO_TYPE_MNT;
419 	info.hdr.len = FANOTIFY_MNT_INFO_LEN;
420 
421 	if (WARN_ON(count < info.hdr.len))
422 		return -EFAULT;
423 
424 	info.mnt_id = FANOTIFY_ME(event)->mnt_id;
425 
426 	if (copy_to_user(buf, &info, sizeof(info)))
427 		return -EFAULT;
428 
429 	return info.hdr.len;
430 }
431 
432 static size_t copy_error_info_to_user(struct fanotify_event *event,
433 				      char __user *buf, int count)
434 {
435 	struct fanotify_event_info_error info = { };
436 	struct fanotify_error_event *fee = FANOTIFY_EE(event);
437 
438 	info.hdr.info_type = FAN_EVENT_INFO_TYPE_ERROR;
439 	info.hdr.len = FANOTIFY_ERROR_INFO_LEN;
440 
441 	if (WARN_ON(count < info.hdr.len))
442 		return -EFAULT;
443 
444 	info.error = fee->error;
445 	info.error_count = fee->err_count;
446 
447 	if (copy_to_user(buf, &info, sizeof(info)))
448 		return -EFAULT;
449 
450 	return info.hdr.len;
451 }
452 
453 static int copy_fid_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh,
454 				 int info_type, const char *name,
455 				 size_t name_len,
456 				 char __user *buf, size_t count)
457 {
458 	struct fanotify_event_info_fid info = { };
459 	struct file_handle handle = { };
460 	unsigned char bounce[FANOTIFY_INLINE_FH_LEN], *fh_buf;
461 	size_t fh_len = fh ? fh->len : 0;
462 	size_t info_len = fanotify_fid_info_len(fh_len, name_len);
463 	size_t len = info_len;
464 
465 	pr_debug("%s: fh_len=%zu name_len=%zu, info_len=%zu, count=%zu\n",
466 		 __func__, fh_len, name_len, info_len, count);
467 
468 	if (WARN_ON_ONCE(len < sizeof(info) || len > count))
469 		return -EFAULT;
470 
471 	/*
472 	 * Copy event info fid header followed by variable sized file handle
473 	 * and optionally followed by variable sized filename.
474 	 */
475 	switch (info_type) {
476 	case FAN_EVENT_INFO_TYPE_FID:
477 	case FAN_EVENT_INFO_TYPE_DFID:
478 		if (WARN_ON_ONCE(name_len))
479 			return -EFAULT;
480 		break;
481 	case FAN_EVENT_INFO_TYPE_DFID_NAME:
482 	case FAN_EVENT_INFO_TYPE_OLD_DFID_NAME:
483 	case FAN_EVENT_INFO_TYPE_NEW_DFID_NAME:
484 		if (WARN_ON_ONCE(!name || !name_len))
485 			return -EFAULT;
486 		break;
487 	default:
488 		return -EFAULT;
489 	}
490 
491 	info.hdr.info_type = info_type;
492 	info.hdr.len = len;
493 	info.fsid = *fsid;
494 	if (copy_to_user(buf, &info, sizeof(info)))
495 		return -EFAULT;
496 
497 	buf += sizeof(info);
498 	len -= sizeof(info);
499 	if (WARN_ON_ONCE(len < sizeof(handle)))
500 		return -EFAULT;
501 
502 	handle.handle_type = fh->type;
503 	handle.handle_bytes = fh_len;
504 
505 	/* Mangle handle_type for bad file_handle */
506 	if (!fh_len)
507 		handle.handle_type = FILEID_INVALID;
508 
509 	if (copy_to_user(buf, &handle, sizeof(handle)))
510 		return -EFAULT;
511 
512 	buf += sizeof(handle);
513 	len -= sizeof(handle);
514 	if (WARN_ON_ONCE(len < fh_len))
515 		return -EFAULT;
516 
517 	/*
518 	 * For an inline fh and inline file name, copy through stack to exclude
519 	 * the copy from usercopy hardening protections.
520 	 */
521 	fh_buf = fanotify_fh_buf(fh);
522 	if (fh_len <= FANOTIFY_INLINE_FH_LEN) {
523 		memcpy(bounce, fh_buf, fh_len);
524 		fh_buf = bounce;
525 	}
526 	if (copy_to_user(buf, fh_buf, fh_len))
527 		return -EFAULT;
528 
529 	buf += fh_len;
530 	len -= fh_len;
531 
532 	if (name_len) {
533 		/* Copy the filename with terminating null */
534 		name_len++;
535 		if (WARN_ON_ONCE(len < name_len))
536 			return -EFAULT;
537 
538 		if (copy_to_user(buf, name, name_len))
539 			return -EFAULT;
540 
541 		buf += name_len;
542 		len -= name_len;
543 	}
544 
545 	/* Pad with 0's */
546 	WARN_ON_ONCE(len < 0 || len >= FANOTIFY_EVENT_ALIGN);
547 	if (len > 0 && clear_user(buf, len))
548 		return -EFAULT;
549 
550 	return info_len;
551 }
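/*
 * Resulting layout of one fid info record in the user buffer, as emitted
 * above:
 *
 *	struct fanotify_event_info_fid	hdr.len covers the whole record
 *	struct file_handle		handle_type, handle_bytes
 *	fh bytes			handle_bytes of opaque handle data
 *	name + '\0'			only for the *_DFID_NAME info types
 *	zero padding			up to FANOTIFY_EVENT_ALIGN
 */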
552 
553 static int copy_pidfd_info_to_user(int pidfd,
554 				   char __user *buf,
555 				   size_t count)
556 {
557 	struct fanotify_event_info_pidfd info = { };
558 	size_t info_len = FANOTIFY_PIDFD_INFO_LEN;
559 
560 	if (WARN_ON_ONCE(info_len > count))
561 		return -EFAULT;
562 
563 	info.hdr.info_type = FAN_EVENT_INFO_TYPE_PIDFD;
564 	info.hdr.len = info_len;
565 	info.pidfd = pidfd;
566 
567 	if (copy_to_user(buf, &info, info_len))
568 		return -EFAULT;
569 
570 	return info_len;
571 }
572 
573 static size_t copy_range_info_to_user(struct fanotify_event *event,
574 				      char __user *buf, int count)
575 {
576 	struct fanotify_perm_event *pevent = FANOTIFY_PERM(event);
577 	struct fanotify_event_info_range info = { };
578 	size_t info_len = FANOTIFY_RANGE_INFO_LEN;
579 
580 	if (WARN_ON_ONCE(info_len > count))
581 		return -EFAULT;
582 
583 	if (WARN_ON_ONCE(!pevent->ppos))
584 		return -EINVAL;
585 
586 	info.hdr.info_type = FAN_EVENT_INFO_TYPE_RANGE;
587 	info.hdr.len = info_len;
588 	info.offset = *(pevent->ppos);
589 	info.count = pevent->count;
590 
591 	if (copy_to_user(buf, &info, info_len))
592 		return -EFAULT;
593 
594 	return info_len;
595 }
596 
597 static int copy_info_records_to_user(struct fanotify_event *event,
598 				     struct fanotify_info *info,
599 				     unsigned int info_mode, int pidfd,
600 				     char __user *buf, size_t count)
601 {
602 	int ret, total_bytes = 0, info_type = 0;
603 	unsigned int fid_mode = info_mode & FANOTIFY_FID_BITS;
604 	unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD;
605 
606 	/*
607 	 * Event info records order is as follows:
608 	 * 1. dir fid + name
609 	 * 2. (optional) new dir fid + new name
610 	 * 3. (optional) child fid
611 	 */
612 	if (fanotify_event_has_dir_fh(event)) {
613 		info_type = info->name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME :
614 					     FAN_EVENT_INFO_TYPE_DFID;
615 
616 		/* FAN_RENAME uses special info types */
617 		if (event->mask & FAN_RENAME)
618 			info_type = FAN_EVENT_INFO_TYPE_OLD_DFID_NAME;
619 
620 		ret = copy_fid_info_to_user(fanotify_event_fsid(event),
621 					    fanotify_info_dir_fh(info),
622 					    info_type,
623 					    fanotify_info_name(info),
624 					    info->name_len, buf, count);
625 		if (ret < 0)
626 			return ret;
627 
628 		buf += ret;
629 		count -= ret;
630 		total_bytes += ret;
631 	}
632 
633 	/* New dir fid+name may be reported in addition to old dir fid+name */
634 	if (fanotify_event_has_dir2_fh(event)) {
635 		info_type = FAN_EVENT_INFO_TYPE_NEW_DFID_NAME;
636 		ret = copy_fid_info_to_user(fanotify_event_fsid(event),
637 					    fanotify_info_dir2_fh(info),
638 					    info_type,
639 					    fanotify_info_name2(info),
640 					    info->name2_len, buf, count);
641 		if (ret < 0)
642 			return ret;
643 
644 		buf += ret;
645 		count -= ret;
646 		total_bytes += ret;
647 	}
648 
649 	if (fanotify_event_has_object_fh(event)) {
650 		const char *dot = NULL;
651 		int dot_len = 0;
652 
653 		if (fid_mode == FAN_REPORT_FID || info_type) {
654 			/*
655 			 * With only group flag FAN_REPORT_FID, only type FID is
656 			 * reported. Second info record type is always FID.
657 			 */
658 			info_type = FAN_EVENT_INFO_TYPE_FID;
659 		} else if ((fid_mode & FAN_REPORT_NAME) &&
660 			   (event->mask & FAN_ONDIR)) {
661 			/*
662 			 * With group flag FAN_REPORT_NAME, if name was not
663 			 * recorded in an event on a directory, report the name
664 			 * "." with info type DFID_NAME.
665 			 */
666 			info_type = FAN_EVENT_INFO_TYPE_DFID_NAME;
667 			dot = ".";
668 			dot_len = 1;
669 		} else if ((event->mask & ALL_FSNOTIFY_DIRENT_EVENTS) ||
670 			   (event->mask & FAN_ONDIR)) {
671 			/*
672 			 * With group flag FAN_REPORT_DIR_FID, a single info
673 			 * record has type DFID for directory entry modification
674 			 * event and for event on a directory.
675 			 */
676 			info_type = FAN_EVENT_INFO_TYPE_DFID;
677 		} else {
678 			/*
679 			 * With group flags FAN_REPORT_DIR_FID|FAN_REPORT_FID,
680 			 * a single info record has type FID for event on a
681 			 * non-directory, when there is no directory to report.
682 			 * For example, on FAN_DELETE_SELF event.
683 			 */
684 			info_type = FAN_EVENT_INFO_TYPE_FID;
685 		}
686 
687 		ret = copy_fid_info_to_user(fanotify_event_fsid(event),
688 					    fanotify_event_object_fh(event),
689 					    info_type, dot, dot_len,
690 					    buf, count);
691 		if (ret < 0)
692 			return ret;
693 
694 		buf += ret;
695 		count -= ret;
696 		total_bytes += ret;
697 	}
698 
699 	if (pidfd_mode) {
700 		ret = copy_pidfd_info_to_user(pidfd, buf, count);
701 		if (ret < 0)
702 			return ret;
703 
704 		buf += ret;
705 		count -= ret;
706 		total_bytes += ret;
707 	}
708 
709 	if (fanotify_is_error_event(event->mask)) {
710 		ret = copy_error_info_to_user(event, buf, count);
711 		if (ret < 0)
712 			return ret;
713 		buf += ret;
714 		count -= ret;
715 		total_bytes += ret;
716 	}
717 
718 	if (fanotify_event_has_access_range(event)) {
719 		ret = copy_range_info_to_user(event, buf, count);
720 		if (ret < 0)
721 			return ret;
722 		buf += ret;
723 		count -= ret;
724 		total_bytes += ret;
725 	}
726 
727 	if (fanotify_is_mnt_event(event->mask)) {
728 		ret = copy_mnt_info_to_user(event, buf, count);
729 		if (ret < 0)
730 			return ret;
731 		buf += ret;
732 		count -= ret;
733 		total_bytes += ret;
734 	}
735 
736 	return total_bytes;
737 }
738 
739 static ssize_t copy_event_to_user(struct fsnotify_group *group,
740 				  struct fanotify_event *event,
741 				  char __user *buf, size_t count)
742 {
743 	struct fanotify_event_metadata metadata;
744 	const struct path *path = fanotify_event_path(event);
745 	struct fanotify_info *info = fanotify_event_info(event);
746 	unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES);
747 	unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD;
748 	struct file *f = NULL, *pidfd_file = NULL;
749 	int ret, pidfd = -ESRCH, fd = -EBADF;
750 
751 	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
752 
753 	metadata.event_len = fanotify_event_len(info_mode, event);
754 	metadata.metadata_len = FAN_EVENT_METADATA_LEN;
755 	metadata.vers = FANOTIFY_METADATA_VERSION;
756 	metadata.reserved = 0;
757 	metadata.mask = event->mask & FANOTIFY_OUTGOING_EVENTS;
758 	metadata.pid = pid_vnr(event->pid);
759 	/*
760 	 * For an unprivileged listener, event->pid can be used to identify the
761 	 * events generated by the listener process itself, without disclosing
762 	 * the pids of other processes.
763 	 */
764 	if (FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) &&
765 	    task_tgid(current) != event->pid)
766 		metadata.pid = 0;
767 
768 	/*
769 	 * For now, fid mode is required for an unprivileged listener and
770 	 * fid mode does not report fd in events.  Keep this check anyway
771 	 * for safety in case fid mode requirement is relaxed in the future
772 	 * to allow an unprivileged listener to get events with no fd and no fid.
773 	 */
774 	if (!FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) &&
775 	    path && path->mnt && path->dentry) {
776 		fd = create_fd(group, path, &f);
777 		/*
778 		 * Opening an fd from dentry can fail for several reasons.
779 		 * For example, when tasks are gone and we try to open their
780 		 * /proc files or we try to open a WRONLY file like in sysfs
781 		 * or when trying to open a file that was deleted on the
782 		 * remote network server.
783 		 *
784 		 * For a group with FAN_REPORT_FD_ERROR, we will send the
785 		 * event with the error instead of the open fd; otherwise
786 		 * userspace may not get the error at all.
787 		 * In any case, userspace will not know which file failed to
788 		 * open, so add a debug print for further investigation.
789 		 */
790 		if (fd < 0) {
791 			pr_debug("fanotify: create_fd(%pd2) failed err=%d\n",
792 				 path->dentry, fd);
793 			if (!FAN_GROUP_FLAG(group, FAN_REPORT_FD_ERROR)) {
794 				/*
795 				 * Historically, we've handled EOPENSTALE in a
796 				 * special way and silently dropped such
797 				 * events. Now we have to keep it to maintain
798 				 * backward compatibility...
799 				 */
800 				if (fd == -EOPENSTALE)
801 					fd = 0;
802 				return fd;
803 			}
804 		}
805 	}
806 	if (FAN_GROUP_FLAG(group, FAN_REPORT_FD_ERROR))
807 		metadata.fd = fd;
808 	else
809 		metadata.fd = fd >= 0 ? fd : FAN_NOFD;
810 
811 	if (pidfd_mode) {
812 		/*
813 		 * Complain if the FAN_REPORT_PIDFD and FAN_REPORT_TID mutual
814 		 * exclusion is ever lifted. At the time of incorporating pidfd
815 		 * support within fanotify, the pidfd API only supported the
816 		 * creation of pidfds for thread-group leaders.
817 		 */
818 		WARN_ON_ONCE(FAN_GROUP_FLAG(group, FAN_REPORT_TID));
819 
820 		/*
821 		 * The PIDTYPE_TGID check for an event->pid is performed
822 		 * preemptively in an attempt to catch out cases where the event
823 		 * listener reads events after the event generating process has
824 		 * already terminated.  Depending on flag FAN_REPORT_FD_ERROR,
825 		 * report either -ESRCH or FAN_NOPIDFD to the event listener in
826 		 * those cases with all other pidfd creation errors reported as
827 		 * the error code itself or as FAN_EPIDFD.
828 		 */
829 		if (metadata.pid && pid_has_task(event->pid, PIDTYPE_TGID))
830 			pidfd = pidfd_prepare(event->pid, 0, &pidfd_file);
831 
832 		if (!FAN_GROUP_FLAG(group, FAN_REPORT_FD_ERROR) && pidfd < 0)
833 			pidfd = pidfd == -ESRCH ? FAN_NOPIDFD : FAN_EPIDFD;
834 	}
835 
836 	ret = -EFAULT;
837 	/*
838 	 * Sanity check copy size in case get_one_event() and
839 	 * event_len sizes ever get out of sync.
840 	 */
841 	if (WARN_ON_ONCE(metadata.event_len > count))
842 		goto out_close_fd;
843 
844 	if (copy_to_user(buf, &metadata, FAN_EVENT_METADATA_LEN))
845 		goto out_close_fd;
846 
847 	buf += FAN_EVENT_METADATA_LEN;
848 	count -= FAN_EVENT_METADATA_LEN;
849 
850 	ret = copy_info_records_to_user(event, info, info_mode, pidfd,
851 					buf, count);
852 	if (ret < 0)
853 		goto out_close_fd;
854 
855 	if (f)
856 		fd_install(fd, f);
857 
858 	if (pidfd_file)
859 		fd_install(pidfd, pidfd_file);
860 
861 	if (fanotify_is_perm_event(event->mask))
862 		FANOTIFY_PERM(event)->fd = fd;
863 
864 	return metadata.event_len;
865 
866 out_close_fd:
867 	if (f) {
868 		put_unused_fd(fd);
869 		fput(f);
870 	}
871 
872 	if (pidfd_file) {
873 		put_unused_fd(pidfd);
874 		fput(pidfd_file);
875 	}
876 
877 	return ret;
878 }
879 
880 /* fanotify userspace file descriptor functions */
881 static __poll_t fanotify_poll(struct file *file, poll_table *wait)
882 {
883 	struct fsnotify_group *group = file->private_data;
884 	__poll_t ret = 0;
885 
886 	poll_wait(file, &group->notification_waitq, wait);
887 	spin_lock(&group->notification_lock);
888 	if (!fsnotify_notify_queue_is_empty(group))
889 		ret = EPOLLIN | EPOLLRDNORM;
890 	spin_unlock(&group->notification_lock);
891 
892 	return ret;
893 }
894 
895 static ssize_t fanotify_read(struct file *file, char __user *buf,
896 			     size_t count, loff_t *pos)
897 {
898 	struct fsnotify_group *group;
899 	struct fanotify_event *event;
900 	char __user *start;
901 	int ret;
902 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
903 
904 	start = buf;
905 	group = file->private_data;
906 
907 	pr_debug("%s: group=%p\n", __func__, group);
908 
909 	add_wait_queue(&group->notification_waitq, &wait);
910 	while (1) {
911 		/*
912 		 * User can supply an arbitrarily large buffer. Avoid softlockups
913 		 * in case there are lots of available events.
914 		 */
915 		cond_resched();
916 		event = get_one_event(group, count);
917 		if (IS_ERR(event)) {
918 			ret = PTR_ERR(event);
919 			break;
920 		}
921 
922 		if (!event) {
923 			ret = -EAGAIN;
924 			if (file->f_flags & O_NONBLOCK)
925 				break;
926 
927 			ret = -ERESTARTSYS;
928 			if (signal_pending(current))
929 				break;
930 
931 			if (start != buf)
932 				break;
933 
934 			wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
935 			continue;
936 		}
937 
938 		ret = copy_event_to_user(group, event, buf, count);
939 
940 		/*
941 		 * Permission events get queued to wait for response.  Other
942 		 * events can be destroyed now.
943 		 */
944 		if (!fanotify_is_perm_event(event->mask)) {
945 			fsnotify_destroy_event(group, &event->fse);
946 		} else {
947 			if (ret <= 0 || FANOTIFY_PERM(event)->fd < 0) {
948 				spin_lock(&group->notification_lock);
949 				finish_permission_event(group,
950 					FANOTIFY_PERM(event), FAN_DENY, NULL);
951 				wake_up(&group->fanotify_data.access_waitq);
952 			} else {
953 				spin_lock(&group->notification_lock);
954 				list_add_tail(&event->fse.list,
955 					&group->fanotify_data.access_list);
956 				spin_unlock(&group->notification_lock);
957 			}
958 		}
959 		if (ret < 0)
960 			break;
961 		buf += ret;
962 		count -= ret;
963 	}
964 	remove_wait_queue(&group->notification_waitq, &wait);
965 
966 	if (start != buf && ret != -EFAULT)
967 		ret = buf - start;
968 	return ret;
969 }
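/*
 * Userspace counterpart (a minimal sketch using the UAPI iteration
 * macros; the buffer size is arbitrary):
 *
 *	#include <sys/fanotify.h>
 *	#include <unistd.h>
 *
 *	static void drain_events(int fanotify_fd)
 *	{
 *		char buf[4096];
 *		ssize_t len = read(fanotify_fd, buf, sizeof(buf));
 *		struct fanotify_event_metadata *md =
 *			(struct fanotify_event_metadata *)buf;
 *
 *		while (len > 0 && FAN_EVENT_OK(md, len)) {
 *			// Inspect md->mask, md->pid, md->fd here.
 *			if (md->fd >= 0)
 *				close(md->fd);
 *			md = FAN_EVENT_NEXT(md, len);
 *		}
 *	}
 */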
970 
971 static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
972 {
973 	struct fanotify_response response;
974 	struct fsnotify_group *group;
975 	int ret;
976 	const char __user *info_buf = buf + sizeof(struct fanotify_response);
977 	size_t info_len;
978 
979 	if (!IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS))
980 		return -EINVAL;
981 
982 	group = file->private_data;
983 
984 	pr_debug("%s: group=%p count=%zu\n", __func__, group, count);
985 
986 	if (count < sizeof(response))
987 		return -EINVAL;
988 
989 	if (copy_from_user(&response, buf, sizeof(response)))
990 		return -EFAULT;
991 
992 	info_len = count - sizeof(response);
993 
994 	ret = process_access_response(group, &response, info_buf, info_len);
995 	if (ret < 0)
996 		count = ret;
997 	else
998 		count = sizeof(response) + ret;
999 
1000 	return count;
1001 }
1002 
1003 static int fanotify_release(struct inode *ignored, struct file *file)
1004 {
1005 	struct fsnotify_group *group = file->private_data;
1006 	struct fsnotify_event *fsn_event;
1007 
1008 	/*
1009 	 * Stop new events from arriving in the notification queue. Since
1010 	 * userspace cannot use the fanotify fd anymore, no event can enter or
1011 	 * leave the access_list by now either.
1012 	 */
1013 	fsnotify_group_stop_queueing(group);
1014 
1015 	/*
1016 	 * Process all permission events on access_list and notification queue
1017 	 * and simulate reply from userspace.
1018 	 */
1019 	spin_lock(&group->notification_lock);
1020 	while (!list_empty(&group->fanotify_data.access_list)) {
1021 		struct fanotify_perm_event *event;
1022 
1023 		event = list_first_entry(&group->fanotify_data.access_list,
1024 				struct fanotify_perm_event, fae.fse.list);
1025 		list_del_init(&event->fae.fse.list);
1026 		finish_permission_event(group, event, FAN_ALLOW, NULL);
1027 		spin_lock(&group->notification_lock);
1028 	}
1029 
1030 	/*
1031 	 * Destroy all non-permission events. For permission events just
1032 	 * dequeue them and set the response. They will be freed once the
1033 	 * response is consumed and fanotify_get_response() returns.
1034 	 */
1035 	while ((fsn_event = fsnotify_remove_first_event(group))) {
1036 		struct fanotify_event *event = FANOTIFY_E(fsn_event);
1037 
1038 		if (!(event->mask & FANOTIFY_PERM_EVENTS)) {
1039 			spin_unlock(&group->notification_lock);
1040 			fsnotify_destroy_event(group, fsn_event);
1041 		} else {
1042 			finish_permission_event(group, FANOTIFY_PERM(event),
1043 						FAN_ALLOW, NULL);
1044 		}
1045 		spin_lock(&group->notification_lock);
1046 	}
1047 	spin_unlock(&group->notification_lock);
1048 
1049 	/* Response for all permission events is set, wake up waiters */
1050 	wake_up(&group->fanotify_data.access_waitq);
1051 
1052 	/* matches the fanotify_init->fsnotify_alloc_group */
1053 	fsnotify_destroy_group(group);
1054 
1055 	return 0;
1056 }
1057 
1058 static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1059 {
1060 	struct fsnotify_group *group;
1061 	struct fsnotify_event *fsn_event;
1062 	void __user *p;
1063 	int ret = -ENOTTY;
1064 	size_t send_len = 0;
1065 
1066 	group = file->private_data;
1067 
1068 	p = (void __user *) arg;
1069 
1070 	switch (cmd) {
1071 	case FIONREAD:
1072 		spin_lock(&group->notification_lock);
1073 		list_for_each_entry(fsn_event, &group->notification_list, list)
1074 			send_len += FAN_EVENT_METADATA_LEN;
1075 		spin_unlock(&group->notification_lock);
1076 		ret = put_user(send_len, (int __user *) p);
1077 		break;
1078 	}
1079 
1080 	return ret;
1081 }
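/*
 * Userspace counterpart (a minimal sketch): FIONREAD lets a listener size
 * its read buffer, noting that the count above only sums
 * FAN_EVENT_METADATA_LEN per queued event and excludes the variable-length
 * info records:
 *
 *	#include <sys/ioctl.h>
 *
 *	int pending = 0;
 *	ioctl(fanotify_fd, FIONREAD, &pending);
 */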
1082 
1083 static const struct file_operations fanotify_fops = {
1084 	.show_fdinfo	= fanotify_show_fdinfo,
1085 	.poll		= fanotify_poll,
1086 	.read		= fanotify_read,
1087 	.write		= fanotify_write,
1088 	.fasync		= NULL,
1089 	.release	= fanotify_release,
1090 	.unlocked_ioctl	= fanotify_ioctl,
1091 	.compat_ioctl	= compat_ptr_ioctl,
1092 	.llseek		= noop_llseek,
1093 };
1094 
1095 static int fanotify_find_path(int dfd, const char __user *filename,
1096 			      struct path *path, unsigned int flags, __u64 mask,
1097 			      unsigned int obj_type)
1098 {
1099 	int ret;
1100 
1101 	pr_debug("%s: dfd=%d filename=%p flags=%x\n", __func__,
1102 		 dfd, filename, flags);
1103 
1104 	if (filename == NULL) {
1105 		CLASS(fd, f)(dfd);
1106 
1107 		if (fd_empty(f))
1108 			return -EBADF;
1109 
1110 		if ((flags & FAN_MARK_ONLYDIR) &&
1111 		    !(S_ISDIR(file_inode(fd_file(f))->i_mode)))
1112 			return -ENOTDIR;
1113 
1114 		*path = fd_file(f)->f_path;
1115 		path_get(path);
1116 	} else {
1117 		unsigned int lookup_flags = 0;
1118 
1119 		if (!(flags & FAN_MARK_DONT_FOLLOW))
1120 			lookup_flags |= LOOKUP_FOLLOW;
1121 		if (flags & FAN_MARK_ONLYDIR)
1122 			lookup_flags |= LOOKUP_DIRECTORY;
1123 
1124 		ret = user_path_at(dfd, filename, lookup_flags, path);
1125 		if (ret)
1126 			goto out;
1127 	}
1128 
1129 	/* you can only watch an inode if you have read permissions on it */
1130 	ret = path_permission(path, MAY_READ);
1131 	if (ret) {
1132 		path_put(path);
1133 		goto out;
1134 	}
1135 
1136 	ret = security_path_notify(path, mask, obj_type);
1137 	if (ret)
1138 		path_put(path);
1139 
1140 out:
1141 	return ret;
1142 }
1143 
1144 static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
1145 					    __u32 mask, unsigned int flags,
1146 					    __u32 umask, int *destroy)
1147 {
1148 	__u32 oldmask, newmask;
1149 
1150 	/* umask bits cannot be removed by user */
1151 	mask &= ~umask;
1152 	spin_lock(&fsn_mark->lock);
1153 	oldmask = fsnotify_calc_mask(fsn_mark);
1154 	if (!(flags & FANOTIFY_MARK_IGNORE_BITS)) {
1155 		fsn_mark->mask &= ~mask;
1156 	} else {
1157 		fsn_mark->ignore_mask &= ~mask;
1158 	}
1159 	newmask = fsnotify_calc_mask(fsn_mark);
1160 	/*
1161 	 * We need to keep the mark around even if the remaining mask cannot
1162 	 * result in any events (e.g. mask == FAN_ONDIR) to support incremental
1163 	 * changes to the mask.
1164 	 * Destroy mark when only umask bits remain.
1165 	 */
1166 	*destroy = !((fsn_mark->mask | fsn_mark->ignore_mask) & ~umask);
1167 	spin_unlock(&fsn_mark->lock);
1168 
1169 	return oldmask & ~newmask;
1170 }
1171 
1172 static int fanotify_remove_mark(struct fsnotify_group *group,
1173 				void *obj, unsigned int obj_type, __u32 mask,
1174 				unsigned int flags, __u32 umask)
1175 {
1176 	struct fsnotify_mark *fsn_mark = NULL;
1177 	__u32 removed;
1178 	int destroy_mark;
1179 
1180 	fsnotify_group_lock(group);
1181 	fsn_mark = fsnotify_find_mark(obj, obj_type, group);
1182 	if (!fsn_mark) {
1183 		fsnotify_group_unlock(group);
1184 		return -ENOENT;
1185 	}
1186 
1187 	removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
1188 						 umask, &destroy_mark);
1189 	if (removed & fsnotify_conn_mask(fsn_mark->connector))
1190 		fsnotify_recalc_mask(fsn_mark->connector);
1191 	if (destroy_mark)
1192 		fsnotify_detach_mark(fsn_mark);
1193 	fsnotify_group_unlock(group);
1194 	if (destroy_mark)
1195 		fsnotify_free_mark(fsn_mark);
1196 
1197 	/* matches the fsnotify_find_mark() */
1198 	fsnotify_put_mark(fsn_mark);
1199 	return 0;
1200 }
1201 
1202 static bool fanotify_mark_update_flags(struct fsnotify_mark *fsn_mark,
1203 				       unsigned int fan_flags)
1204 {
1205 	bool want_iref = !(fan_flags & FAN_MARK_EVICTABLE);
1206 	unsigned int ignore = fan_flags & FANOTIFY_MARK_IGNORE_BITS;
1207 	bool recalc = false;
1208 
1209 	/*
1210 	 * When using FAN_MARK_IGNORE for the first time, mark starts using
1211 	 * independent event flags in ignore mask.  After that, trying to
1212 	 * update the ignore mask with the old FAN_MARK_IGNORED_MASK API
1213 	 * will result in EEXIST error.
1214 	 */
1215 	if (ignore == FAN_MARK_IGNORE)
1216 		fsn_mark->flags |= FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS;
1217 
1218 	/*
1219 	 * Setting FAN_MARK_IGNORED_SURV_MODIFY for the first time may lead to
1220 	 * the removal of the FS_MODIFY bit in calculated mask if it was set
1221 	 * because of an ignore mask that is now going to survive FS_MODIFY.
1222 	 */
1223 	if (ignore && (fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
1224 	    !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) {
1225 		fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
1226 		if (!(fsn_mark->mask & FS_MODIFY))
1227 			recalc = true;
1228 	}
1229 
1230 	if (fsn_mark->connector->type != FSNOTIFY_OBJ_TYPE_INODE ||
1231 	    want_iref == !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF))
1232 		return recalc;
1233 
1234 	/*
1235 	 * NO_IREF may be removed from a mark, but not added.
1236 	 * When removed, fsnotify_recalc_mask() will take the inode ref.
1237 	 */
1238 	WARN_ON_ONCE(!want_iref);
1239 	fsn_mark->flags &= ~FSNOTIFY_MARK_FLAG_NO_IREF;
1240 
1241 	return true;
1242 }
1243 
1244 static bool fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
1245 				      __u32 mask, unsigned int fan_flags)
1246 {
1247 	bool recalc;
1248 
1249 	spin_lock(&fsn_mark->lock);
1250 	if (!(fan_flags & FANOTIFY_MARK_IGNORE_BITS))
1251 		fsn_mark->mask |= mask;
1252 	else
1253 		fsn_mark->ignore_mask |= mask;
1254 
1255 	recalc = fsnotify_calc_mask(fsn_mark) &
1256 		~fsnotify_conn_mask(fsn_mark->connector);
1257 
1258 	recalc |= fanotify_mark_update_flags(fsn_mark, fan_flags);
1259 	spin_unlock(&fsn_mark->lock);
1260 
1261 	return recalc;
1262 }
1263 
1264 struct fan_fsid {
1265 	struct super_block *sb;
1266 	__kernel_fsid_t id;
1267 	bool weak;
1268 };
1269 
1270 static int fanotify_set_mark_fsid(struct fsnotify_group *group,
1271 				  struct fsnotify_mark *mark,
1272 				  struct fan_fsid *fsid)
1273 {
1274 	struct fsnotify_mark_connector *conn;
1275 	struct fsnotify_mark *old;
1276 	struct super_block *old_sb = NULL;
1277 
1278 	FANOTIFY_MARK(mark)->fsid = fsid->id;
1279 	mark->flags |= FSNOTIFY_MARK_FLAG_HAS_FSID;
1280 	if (fsid->weak)
1281 		mark->flags |= FSNOTIFY_MARK_FLAG_WEAK_FSID;
1282 
1283 	/* First mark added will determine if group is single or multi fsid */
1284 	if (list_empty(&group->marks_list))
1285 		return 0;
1286 
1287 	/* Find sb of an existing mark */
1288 	list_for_each_entry(old, &group->marks_list, g_list) {
1289 		conn = READ_ONCE(old->connector);
1290 		if (!conn)
1291 			continue;
1292 		old_sb = fsnotify_connector_sb(conn);
1293 		if (old_sb)
1294 			break;
1295 	}
1296 
1297 	/* Only detached marks left? */
1298 	if (!old_sb)
1299 		return 0;
1300 
1301 	/* Do not allow mixing of marks with weak and strong fsid */
1302 	if ((mark->flags ^ old->flags) & FSNOTIFY_MARK_FLAG_WEAK_FSID)
1303 		return -EXDEV;
1304 
1305 	/* Allow mixing of marks with strong fsid from different fs */
1306 	if (!fsid->weak)
1307 		return 0;
1308 
1309 	/* Do not allow mixing marks with weak fsid from different fs */
1310 	if (old_sb != fsid->sb)
1311 		return -EXDEV;
1312 
1313 	/* Do not allow mixing marks from different btrfs sub-volumes */
1314 	if (!fanotify_fsid_equal(&FANOTIFY_MARK(old)->fsid,
1315 				 &FANOTIFY_MARK(mark)->fsid))
1316 		return -EXDEV;
1317 
1318 	return 0;
1319 }
1320 
1321 static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
1322 						   void *obj,
1323 						   unsigned int obj_type,
1324 						   unsigned int fan_flags,
1325 						   struct fan_fsid *fsid)
1326 {
1327 	struct ucounts *ucounts = group->fanotify_data.ucounts;
1328 	struct fanotify_mark *fan_mark;
1329 	struct fsnotify_mark *mark;
1330 	int ret;
1331 
1332 	/*
1333 	 * Enforce per-user marks limits in all containing user namespaces.
1334 	 * A group with FAN_UNLIMITED_MARKS does not contribute to mark count
1335 	 * in the limited groups account.
1336 	 */
1337 	if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS) &&
1338 	    !inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_FANOTIFY_MARKS))
1339 		return ERR_PTR(-ENOSPC);
1340 
1341 	fan_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
1342 	if (!fan_mark) {
1343 		ret = -ENOMEM;
1344 		goto out_dec_ucounts;
1345 	}
1346 
1347 	mark = &fan_mark->fsn_mark;
1348 	fsnotify_init_mark(mark, group);
1349 	if (fan_flags & FAN_MARK_EVICTABLE)
1350 		mark->flags |= FSNOTIFY_MARK_FLAG_NO_IREF;
1351 
1352 	/* Cache fsid of filesystem containing the marked object */
1353 	if (fsid) {
1354 		ret = fanotify_set_mark_fsid(group, mark, fsid);
1355 		if (ret)
1356 			goto out_put_mark;
1357 	} else {
1358 		fan_mark->fsid.val[0] = fan_mark->fsid.val[1] = 0;
1359 	}
1360 
1361 	ret = fsnotify_add_mark_locked(mark, obj, obj_type, 0);
1362 	if (ret)
1363 		goto out_put_mark;
1364 
1365 	return mark;
1366 
1367 out_put_mark:
1368 	fsnotify_put_mark(mark);
1369 out_dec_ucounts:
1370 	if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS))
1371 		dec_ucount(ucounts, UCOUNT_FANOTIFY_MARKS);
1372 	return ERR_PTR(ret);
1373 }
1374 
1375 static int fanotify_group_init_error_pool(struct fsnotify_group *group)
1376 {
1377 	if (mempool_initialized(&group->fanotify_data.error_events_pool))
1378 		return 0;
1379 
1380 	return mempool_init_kmalloc_pool(&group->fanotify_data.error_events_pool,
1381 					 FANOTIFY_DEFAULT_FEE_POOL_SIZE,
1382 					 sizeof(struct fanotify_error_event));
1383 }
1384 
1385 static int fanotify_may_update_existing_mark(struct fsnotify_mark *fsn_mark,
1386 					     __u32 mask, unsigned int fan_flags)
1387 {
1388 	/*
1389 	 * A non-evictable mark cannot be downgraded to an evictable mark.
1390 	 */
1391 	if (fan_flags & FAN_MARK_EVICTABLE &&
1392 	    !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF))
1393 		return -EEXIST;
1394 
1395 	/*
1396 	 * New ignore mask semantics cannot be downgraded to old semantics.
1397 	 */
1398 	if (fan_flags & FAN_MARK_IGNORED_MASK &&
1399 	    fsn_mark->flags & FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS)
1400 		return -EEXIST;
1401 
1402 	/*
1403 	 * An ignore mask that survives modify could never be downgraded to not
1404 	 * survive modify.  With new FAN_MARK_IGNORE semantics we make that rule
1405 	 * explicit and return an error when trying to update the ignore mask
1406 	 * without the original FAN_MARK_IGNORED_SURV_MODIFY value.
1407 	 */
1408 	if (fan_flags & FAN_MARK_IGNORE &&
1409 	    !(fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
1410 	    fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)
1411 		return -EEXIST;
1412 
1413 	/* For now pre-content events are not generated for directories */
1414 	mask |= fsn_mark->mask;
1415 	if (mask & FANOTIFY_PRE_CONTENT_EVENTS && mask & FAN_ONDIR)
1416 		return -EEXIST;
1417 
1418 	return 0;
1419 }
1420 
1421 static int fanotify_add_mark(struct fsnotify_group *group,
1422 			     void *obj, unsigned int obj_type,
1423 			     __u32 mask, unsigned int fan_flags,
1424 			     struct fan_fsid *fsid)
1425 {
1426 	struct fsnotify_mark *fsn_mark;
1427 	bool recalc;
1428 	int ret = 0;
1429 
1430 	fsnotify_group_lock(group);
1431 	fsn_mark = fsnotify_find_mark(obj, obj_type, group);
1432 	if (!fsn_mark) {
1433 		fsn_mark = fanotify_add_new_mark(group, obj, obj_type,
1434 						 fan_flags, fsid);
1435 		if (IS_ERR(fsn_mark)) {
1436 			fsnotify_group_unlock(group);
1437 			return PTR_ERR(fsn_mark);
1438 		}
1439 	}
1440 
1441 	/*
1442 	 * Check if the requested mark flags conflict with flags of an existing mark.
1443 	 */
1444 	ret = fanotify_may_update_existing_mark(fsn_mark, mask, fan_flags);
1445 	if (ret)
1446 		goto out;
1447 
1448 	/*
1449 	 * Error events are pre-allocated per group, only if strictly
1450 	 * needed (i.e. FAN_FS_ERROR was requested).
1451 	 */
1452 	if (!(fan_flags & FANOTIFY_MARK_IGNORE_BITS) &&
1453 	    (mask & FAN_FS_ERROR)) {
1454 		ret = fanotify_group_init_error_pool(group);
1455 		if (ret)
1456 			goto out;
1457 	}
1458 
1459 	recalc = fanotify_mark_add_to_mask(fsn_mark, mask, fan_flags);
1460 	if (recalc)
1461 		fsnotify_recalc_mask(fsn_mark->connector);
1462 
1463 out:
1464 	fsnotify_group_unlock(group);
1465 
1466 	fsnotify_put_mark(fsn_mark);
1467 	return ret;
1468 }
1469 
1470 static struct fsnotify_event *fanotify_alloc_overflow_event(void)
1471 {
1472 	struct fanotify_event *oevent;
1473 
1474 	oevent = kmalloc(sizeof(*oevent), GFP_KERNEL_ACCOUNT);
1475 	if (!oevent)
1476 		return NULL;
1477 
1478 	fanotify_init_event(oevent, 0, FS_Q_OVERFLOW);
1479 	oevent->type = FANOTIFY_EVENT_TYPE_OVERFLOW;
1480 
1481 	return &oevent->fse;
1482 }
1483 
1484 static struct hlist_head *fanotify_alloc_merge_hash(void)
1485 {
1486 	struct hlist_head *hash;
1487 
1488 	hash = kmalloc(sizeof(struct hlist_head) << FANOTIFY_HTABLE_BITS,
1489 		       GFP_KERNEL_ACCOUNT);
1490 	if (!hash)
1491 		return NULL;
1492 
1493 	__hash_init(hash, FANOTIFY_HTABLE_SIZE);
1494 
1495 	return hash;
1496 }
1497 
1498 /* fanotify syscalls */
1499 SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
1500 {
1501 	struct fsnotify_group *group;
1502 	int f_flags, fd;
1503 	unsigned int fid_mode = flags & FANOTIFY_FID_BITS;
1504 	unsigned int class = flags & FANOTIFY_CLASS_BITS;
1505 	unsigned int internal_flags = 0;
1506 	struct file *file;
1507 
1508 	pr_debug("%s: flags=%x event_f_flags=%x\n",
1509 		 __func__, flags, event_f_flags);
1510 
1511 	if (!capable(CAP_SYS_ADMIN)) {
1512 		/*
1513 		 * An unprivileged user can set up a fanotify group with
1514 		 * limited functionality - an unprivileged group is limited to
1515 		 * notification events with file handles and it cannot use
1516 		 * unlimited queue/marks.
1517 		 */
1518 		if ((flags & FANOTIFY_ADMIN_INIT_FLAGS) || !fid_mode)
1519 			return -EPERM;
1520 
1521 		/*
1522 		 * Setting the internal flag FANOTIFY_UNPRIV on the group
1523 		 * prevents setting mount/filesystem marks on this group and
1524 		 * prevents reporting pid and open fd in events.
1525 		 */
1526 		internal_flags |= FANOTIFY_UNPRIV;
1527 	}
1528 
1529 #ifdef CONFIG_AUDITSYSCALL
1530 	if (flags & ~(FANOTIFY_INIT_FLAGS | FAN_ENABLE_AUDIT))
1531 #else
1532 	if (flags & ~FANOTIFY_INIT_FLAGS)
1533 #endif
1534 		return -EINVAL;
1535 
1536 	/*
1537 	 * A pidfd can only be returned for a thread-group leader; thus
1538 	 * FAN_REPORT_PIDFD and FAN_REPORT_TID need to remain mutually
1539 	 * exclusive.
1540 	 */
1541 	if ((flags & FAN_REPORT_PIDFD) && (flags & FAN_REPORT_TID))
1542 		return -EINVAL;
1543 
1544 	/* Don't allow mixing mnt events with inode events for now */
1545 	if (flags & FAN_REPORT_MNT) {
1546 		if (class != FAN_CLASS_NOTIF)
1547 			return -EINVAL;
1548 		if (flags & (FANOTIFY_FID_BITS | FAN_REPORT_FD_ERROR))
1549 			return -EINVAL;
1550 	}
1551 
1552 	if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS)
1553 		return -EINVAL;
1554 
1555 	switch (event_f_flags & O_ACCMODE) {
1556 	case O_RDONLY:
1557 	case O_RDWR:
1558 	case O_WRONLY:
1559 		break;
1560 	default:
1561 		return -EINVAL;
1562 	}
1563 
1564 	if (fid_mode && class != FAN_CLASS_NOTIF)
1565 		return -EINVAL;
1566 
1567 	/*
1568 	 * Child name is reported with parent fid so requires dir fid.
1569 	 * We can report both child fid and dir fid with or without name.
1570 	 */
1571 	if ((fid_mode & FAN_REPORT_NAME) && !(fid_mode & FAN_REPORT_DIR_FID))
1572 		return -EINVAL;
1573 
1574 	/*
1575 	 * FAN_REPORT_TARGET_FID requires FAN_REPORT_NAME and FAN_REPORT_FID
1576 	 * and is used as an indication to report both dir and child fid on all
1577 	 * dirent events.
1578 	 */
1579 	if ((fid_mode & FAN_REPORT_TARGET_FID) &&
1580 	    (!(fid_mode & FAN_REPORT_NAME) || !(fid_mode & FAN_REPORT_FID)))
1581 		return -EINVAL;
1582 
1583 	f_flags = O_RDWR;
1584 	if (flags & FAN_CLOEXEC)
1585 		f_flags |= O_CLOEXEC;
1586 	if (flags & FAN_NONBLOCK)
1587 		f_flags |= O_NONBLOCK;
1588 
1589 	/* fsnotify_alloc_group takes a ref.  Dropped in fanotify_release */
1590 	group = fsnotify_alloc_group(&fanotify_fsnotify_ops,
1591 				     FSNOTIFY_GROUP_USER);
1592 	if (IS_ERR(group)) {
1593 		return PTR_ERR(group);
1594 	}
1595 
1596 	/* Enforce groups limits per user in all containing user ns */
1597 	group->fanotify_data.ucounts = inc_ucount(current_user_ns(),
1598 						  current_euid(),
1599 						  UCOUNT_FANOTIFY_GROUPS);
1600 	if (!group->fanotify_data.ucounts) {
1601 		fd = -EMFILE;
1602 		goto out_destroy_group;
1603 	}
1604 
1605 	group->fanotify_data.flags = flags | internal_flags;
1606 	group->memcg = get_mem_cgroup_from_mm(current->mm);
1607 
1608 	group->fanotify_data.merge_hash = fanotify_alloc_merge_hash();
1609 	if (!group->fanotify_data.merge_hash) {
1610 		fd = -ENOMEM;
1611 		goto out_destroy_group;
1612 	}
1613 
1614 	group->overflow_event = fanotify_alloc_overflow_event();
1615 	if (unlikely(!group->overflow_event)) {
1616 		fd = -ENOMEM;
1617 		goto out_destroy_group;
1618 	}
1619 
1620 	if (force_o_largefile())
1621 		event_f_flags |= O_LARGEFILE;
1622 	group->fanotify_data.f_flags = event_f_flags;
1623 	init_waitqueue_head(&group->fanotify_data.access_waitq);
1624 	INIT_LIST_HEAD(&group->fanotify_data.access_list);
1625 	switch (class) {
1626 	case FAN_CLASS_NOTIF:
1627 		group->priority = FSNOTIFY_PRIO_NORMAL;
1628 		break;
1629 	case FAN_CLASS_CONTENT:
1630 		group->priority = FSNOTIFY_PRIO_CONTENT;
1631 		break;
1632 	case FAN_CLASS_PRE_CONTENT:
1633 		group->priority = FSNOTIFY_PRIO_PRE_CONTENT;
1634 		break;
1635 	default:
1636 		fd = -EINVAL;
1637 		goto out_destroy_group;
1638 	}
1639 
1640 	if (flags & FAN_UNLIMITED_QUEUE) {
1641 		fd = -EPERM;
1642 		if (!capable(CAP_SYS_ADMIN))
1643 			goto out_destroy_group;
1644 		group->max_events = UINT_MAX;
1645 	} else {
1646 		group->max_events = fanotify_max_queued_events;
1647 	}
1648 
1649 	if (flags & FAN_UNLIMITED_MARKS) {
1650 		fd = -EPERM;
1651 		if (!capable(CAP_SYS_ADMIN))
1652 			goto out_destroy_group;
1653 	}
1654 
1655 	if (flags & FAN_ENABLE_AUDIT) {
1656 		fd = -EPERM;
1657 		if (!capable(CAP_AUDIT_WRITE))
1658 			goto out_destroy_group;
1659 	}
1660 
1661 	fd = get_unused_fd_flags(f_flags);
1662 	if (fd < 0)
1663 		goto out_destroy_group;
1664 
1665 	file = anon_inode_getfile_fmode("[fanotify]", &fanotify_fops, group,
1666 					f_flags, FMODE_NONOTIFY);
1667 	if (IS_ERR(file)) {
1668 		put_unused_fd(fd);
1669 		fd = PTR_ERR(file);
1670 		goto out_destroy_group;
1671 	}
1672 	fd_install(fd, file);
1673 	return fd;
1674 
1675 out_destroy_group:
1676 	fsnotify_destroy_group(group);
1677 	return fd;
1678 }
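/*
 * Userspace counterpart (a minimal sketch of a privileged call that passes
 * the checks above):
 *
 *	#include <sys/fanotify.h>
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *
 *	int fd = fanotify_init(FAN_CLASS_NOTIF | FAN_CLOEXEC | FAN_REPORT_FID,
 *			       O_RDONLY | O_LARGEFILE);
 *	if (fd < 0)
 *		perror("fanotify_init");	// EPERM, EINVAL, EMFILE, ENOMEM ...
 */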
1679 
1680 static int fanotify_test_fsid(struct dentry *dentry, unsigned int flags,
1681 			      struct fan_fsid *fsid)
1682 {
1683 	unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
1684 	__kernel_fsid_t root_fsid;
1685 	int err;
1686 
1687 	/*
1688 	 * Make sure dentry is not of a filesystem with zero fsid (e.g. fuse).
1689 	 */
1690 	err = vfs_get_fsid(dentry, &fsid->id);
1691 	if (err)
1692 		return err;
1693 
1694 	fsid->sb = dentry->d_sb;
1695 	if (!fsid->id.val[0] && !fsid->id.val[1]) {
1696 		err = -ENODEV;
1697 		goto weak;
1698 	}
1699 
1700 	/*
1701 	 * Make sure dentry is not of a filesystem subvolume (e.g. btrfs)
1702 	 * which uses a different fsid than sb root.
1703 	 */
1704 	err = vfs_get_fsid(dentry->d_sb->s_root, &root_fsid);
1705 	if (err)
1706 		return err;
1707 
1708 	if (!fanotify_fsid_equal(&root_fsid, &fsid->id)) {
1709 		err = -EXDEV;
1710 		goto weak;
1711 	}
1712 
1713 	fsid->weak = false;
1714 	return 0;
1715 
1716 weak:
1717 	/* Allow weak fsid when marking inodes */
1718 	fsid->weak = true;
1719 	return (mark_type == FAN_MARK_INODE) ? 0 : err;
1720 }
1721 
1722 /* Check if filesystem can encode a unique fid */
1723 static int fanotify_test_fid(struct dentry *dentry, unsigned int flags)
1724 {
1725 	unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
1726 	const struct export_operations *nop = dentry->d_sb->s_export_op;
1727 
1728 	/*
1729 	 * We need to make sure that the filesystem supports encoding of
1730 	 * file handles so user can use name_to_handle_at() to compare fids
1731 	 * reported with events to the file handle of watched objects.
1732 	 */
1733 	if (!exportfs_can_encode_fid(nop))
1734 		return -EOPNOTSUPP;
1735 
1736 	/*
1737 	 * For sb/mount mark, we also need to make sure that the filesystem
1738 	 * supports decoding file handles, so user has a way to map back the
1739 	 * reported fids to filesystem objects.
1740 	 */
1741 	if (mark_type != FAN_MARK_INODE && !exportfs_can_decode_fh(nop))
1742 		return -EOPNOTSUPP;
1743 
1744 	return 0;
1745 }
1746 
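/*
 * Sketch of the userspace flow that the encode/decode checks above
 * enable (illustrative only; the watched path is an example):
 * obtaining a handle to compare against fids reported in events.
 *
 *	struct file_handle *fh = malloc(sizeof(*fh) + MAX_HANDLE_SZ);
 *	int mount_id;
 *
 *	fh->handle_bytes = MAX_HANDLE_SZ;
 *	name_to_handle_at(AT_FDCWD, "/watched/file", fh, &mount_id, 0);
 *	// compare fh->handle_type and fh->f_handle with the handle in
 *	// the event's struct fanotify_event_info_fid record
 */
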
1747 static int fanotify_events_supported(struct fsnotify_group *group,
1748 				     const struct path *path, __u64 mask,
1749 				     unsigned int flags)
1750 {
1751 	unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
1752 	bool is_dir = d_is_dir(path->dentry);
1753 	/* Strict validation of events in non-dir inode mask with v5.17+ APIs */
1754 	bool strict_dir_events = FAN_GROUP_FLAG(group, FAN_REPORT_TARGET_FID) ||
1755 				 (mask & FAN_RENAME) ||
1756 				 (flags & FAN_MARK_IGNORE);
1757 
1758 	/*
1759 	 * Filesystems need to opt in to pre-content events (a.k.a. HSM)
1760 	 * and they are only supported on regular files and directories.
1761 	 */
1762 	if (mask & FANOTIFY_PRE_CONTENT_EVENTS) {
1763 		if (!(path->mnt->mnt_sb->s_iflags & SB_I_ALLOW_HSM))
1764 			return -EOPNOTSUPP;
1765 		if (!is_dir && !d_is_reg(path->dentry))
1766 			return -EINVAL;
1767 	}
1768 
1769 	/*
1770 	 * Some filesystems such as 'proc' acquire unusual locks when opening
1771 	 * files. For them, fanotify permission events can easily deadlock the
1772 	 * system: the open done when reporting a fanotify event blocks on this
1773 	 * "unusual" lock, while another process holding the lock waits for the
1774 	 * fanotify permission event to be answered. Just disallow
1775 	 * permission events for such filesystems.
1776 	 */
1777 	if (mask & FANOTIFY_PERM_EVENTS &&
1778 	    path->mnt->mnt_sb->s_type->fs_flags & FS_DISALLOW_NOTIFY_PERM)
1779 		return -EINVAL;
1780 
1781 	/*
1782 	 * mount and sb marks are not allowed on kernel internal pseudo fs,
1783 	 * like pipe_mnt, because that would subscribe to events on all the
1784 	 * anonymous pipes in the system.
1785 	 *
1786 	 * SB_NOUSER covers all of the internal pseudo fs whose objects are not
1787 	 * exposed to user's mount namespace, but there are other SB_KERNMOUNT
1788 	 * fs, like nsfs, debugfs, for which the value of allowing sb and mount
1789 	 * marks is questionable. For now we leave them alone.
1790 	 */
1791 	if (mark_type != FAN_MARK_INODE &&
1792 	    path->mnt->mnt_sb->s_flags & SB_NOUSER)
1793 		return -EINVAL;
1794 
1795 	/*
1796 	 * We shouldn't have allowed setting dirent events and the directory
1797 	 * flags FAN_ONDIR and FAN_EVENT_ON_CHILD in the mask of a non-dir
1798 	 * inode, but since we have always allowed it, error only with new APIs.
1799 	 */
1800 	if (strict_dir_events && mark_type == FAN_MARK_INODE &&
1801 	    !is_dir && (mask & FANOTIFY_DIRONLY_EVENT_BITS))
1802 		return -ENOTDIR;
1803 
1804 	return 0;
1805 }
1806 
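/*
 * Example of the strict non-dir validation above (a hypothetical
 * userspace sketch): in a group initialized with FAN_REPORT_TARGET_FID,
 * marking a regular file with a dirent event is rejected,
 *
 *	fanotify_mark(fd, FAN_MARK_ADD, FAN_CREATE,
 *		      AT_FDCWD, "/etc/hostname");	// ENOTDIR
 *
 * while a group using only pre-v5.17 APIs keeps the historic behavior
 * and accepts the mark even though it can never produce an event.
 */
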
1807 static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
1808 			    int dfd, const char  __user *pathname)
1809 {
1810 	struct inode *inode = NULL;
1811 	struct fsnotify_group *group;
1812 	struct path path;
1813 	struct fan_fsid __fsid, *fsid = NULL;
1814 	u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS;
1815 	unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
1816 	unsigned int mark_cmd = flags & FANOTIFY_MARK_CMD_BITS;
1817 	unsigned int ignore = flags & FANOTIFY_MARK_IGNORE_BITS;
1818 	unsigned int obj_type, fid_mode;
1819 	void *obj = NULL;
1820 	u32 umask = 0;
1821 	int ret;
1822 
1823 	pr_debug("%s: fanotify_fd=%d flags=%x dfd=%d pathname=%p mask=%llx\n",
1824 		 __func__, fanotify_fd, flags, dfd, pathname, mask);
1825 
1826 	/* We only use the lower 32 bits of the mask for now. */
1827 	if (upper_32_bits(mask))
1828 		return -EINVAL;
1829 
1830 	if (flags & ~FANOTIFY_MARK_FLAGS)
1831 		return -EINVAL;
1832 
1833 	switch (mark_type) {
1834 	case FAN_MARK_INODE:
1835 		obj_type = FSNOTIFY_OBJ_TYPE_INODE;
1836 		break;
1837 	case FAN_MARK_MOUNT:
1838 		obj_type = FSNOTIFY_OBJ_TYPE_VFSMOUNT;
1839 		break;
1840 	case FAN_MARK_FILESYSTEM:
1841 		obj_type = FSNOTIFY_OBJ_TYPE_SB;
1842 		break;
1843 	case FAN_MARK_MNTNS:
1844 		obj_type = FSNOTIFY_OBJ_TYPE_MNTNS;
1845 		break;
1846 	default:
1847 		return -EINVAL;
1848 	}
1849 
1850 	switch (mark_cmd) {
1851 	case FAN_MARK_ADD:
1852 	case FAN_MARK_REMOVE:
1853 		if (!mask)
1854 			return -EINVAL;
1855 		break;
1856 	case FAN_MARK_FLUSH:
1857 		if (flags & ~(FANOTIFY_MARK_TYPE_BITS | FAN_MARK_FLUSH))
1858 			return -EINVAL;
1859 		break;
1860 	default:
1861 		return -EINVAL;
1862 	}
1863 
1864 	if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS))
1865 		valid_mask |= FANOTIFY_PERM_EVENTS;
1866 
1867 	if (mask & ~valid_mask)
1868 		return -EINVAL;
1869 
1871 	/* We don't allow FAN_MARK_IGNORE & FAN_MARK_IGNORED_MASK together */
1872 	if (ignore == (FAN_MARK_IGNORE | FAN_MARK_IGNORED_MASK))
1873 		return -EINVAL;
1874 
1875 	/*
1876 	 * Event flags (FAN_ONDIR, FAN_EVENT_ON_CHILD) have no effect with
1877 	 * FAN_MARK_IGNORED_MASK.
1878 	 */
1879 	if (ignore == FAN_MARK_IGNORED_MASK) {
1880 		mask &= ~FANOTIFY_EVENT_FLAGS;
1881 		umask = FANOTIFY_EVENT_FLAGS;
1882 	}
1883 
1884 	CLASS(fd, f)(fanotify_fd);
1885 	if (fd_empty(f))
1886 		return -EBADF;
1887 
1888 	/* verify that this is indeed an fanotify instance */
1889 	if (unlikely(fd_file(f)->f_op != &fanotify_fops))
1890 		return -EINVAL;
1891 	group = fd_file(f)->private_data;
1892 
1893 	/* Only report mount events on mnt namespace */
1894 	if (FAN_GROUP_FLAG(group, FAN_REPORT_MNT)) {
1895 		if (mask & ~FANOTIFY_MOUNT_EVENTS)
1896 			return -EINVAL;
1897 		if (mark_type != FAN_MARK_MNTNS)
1898 			return -EINVAL;
1899 	} else {
1900 		if (mask & FANOTIFY_MOUNT_EVENTS)
1901 			return -EINVAL;
1902 		if (mark_type == FAN_MARK_MNTNS)
1903 			return -EINVAL;
1904 	}
1905 
1906 	/*
1907 	 * An unprivileged user is not allowed to set up mount or filesystem
1908 	 * marks.  This also includes setting up such marks by a group that
1909 	 * was initialized by an unprivileged user.
1910 	 */
1911 	if ((!capable(CAP_SYS_ADMIN) ||
1912 	     FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV)) &&
1913 	    mark_type != FAN_MARK_INODE)
1914 		return -EPERM;
1915 
1916 	/*
1917 	 * Permission events are not allowed for FAN_CLASS_NOTIF.
1918 	 * Pre-content permission events are not allowed for FAN_CLASS_CONTENT.
1919 	 */
1920 	if (mask & FANOTIFY_PERM_EVENTS &&
1921 	    group->priority == FSNOTIFY_PRIO_NORMAL)
1922 		return -EINVAL;
1923 	else if (mask & FANOTIFY_PRE_CONTENT_EVENTS &&
1924 		 group->priority == FSNOTIFY_PRIO_CONTENT)
1925 		return -EINVAL;
1926 
1927 	if (mask & FAN_FS_ERROR &&
1928 	    mark_type != FAN_MARK_FILESYSTEM)
1929 		return -EINVAL;
1930 
1931 	/*
1932 	 * Evictable is only relevant for inode marks, because only an inode
1933 	 * object can be evicted under memory pressure.
1934 	 */
1935 	if (flags & FAN_MARK_EVICTABLE &&
1936 	     mark_type != FAN_MARK_INODE)
1937 		return -EINVAL;
1938 
1939 	/*
1940 	 * Events that do not carry enough information to report
1941 	 * event->fd require a group that supports reporting fid.  Those
1942 	 * events are not supported on a mount mark, because they do not
1943 	 * carry enough information (i.e. path) to be filtered by mount
1944 	 * point.
1945 	 */
1946 	fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
1947 	if (mask & ~(FANOTIFY_FD_EVENTS|FANOTIFY_MOUNT_EVENTS|FANOTIFY_EVENT_FLAGS) &&
1948 	    (!fid_mode || mark_type == FAN_MARK_MOUNT))
1949 		return -EINVAL;
1950 
1951 	/*
1952 	 * FAN_RENAME uses special info type records to report the old and
1953 	 * new parent+name.  Reporting only old and new parent id is less
1954 	 * useful and was not implemented.
1955 	 */
1956 	if (mask & FAN_RENAME && !(fid_mode & FAN_REPORT_NAME))
1957 		return -EINVAL;
1958 
1959 	/* Pre-content events are not currently generated for directories. */
1960 	if (mask & FANOTIFY_PRE_CONTENT_EVENTS && mask & FAN_ONDIR)
1961 		return -EINVAL;
1962 
1963 	if (mark_cmd == FAN_MARK_FLUSH) {
1964 		fsnotify_clear_marks_by_group(group, obj_type);
1965 		return 0;
1966 	}
1967 
1968 	ret = fanotify_find_path(dfd, pathname, &path, flags,
1969 			(mask & ALL_FSNOTIFY_EVENTS), obj_type);
1970 	if (ret)
1971 		return ret;
1972 
1973 	if (mark_cmd == FAN_MARK_ADD) {
1974 		ret = fanotify_events_supported(group, &path, mask, flags);
1975 		if (ret)
1976 			goto path_put_and_out;
1977 	}
1978 
1979 	if (fid_mode) {
1980 		ret = fanotify_test_fsid(path.dentry, flags, &__fsid);
1981 		if (ret)
1982 			goto path_put_and_out;
1983 
1984 		ret = fanotify_test_fid(path.dentry, flags);
1985 		if (ret)
1986 			goto path_put_and_out;
1987 
1988 		fsid = &__fsid;
1989 	}
1990 
1991 	/* inode held in place by reference to path; group by fget on fd */
1992 	if (obj_type == FSNOTIFY_OBJ_TYPE_INODE) {
1993 		inode = path.dentry->d_inode;
1994 		obj = inode;
1995 	} else if (obj_type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) {
1996 		obj = path.mnt;
1997 	} else if (obj_type == FSNOTIFY_OBJ_TYPE_SB) {
1998 		obj = path.mnt->mnt_sb;
1999 	} else if (obj_type == FSNOTIFY_OBJ_TYPE_MNTNS) {
2000 		obj = mnt_ns_from_dentry(path.dentry);
2001 	}
2002 
2003 	ret = -EINVAL;
2004 	if (!obj)
2005 		goto path_put_and_out;
2006 
2007 	/*
2008 	 * If some other task has this inode open for write we should not add
2009 	 * an ignore mask, unless that ignore mask is supposed to survive
2010 	 * modification changes anyway.
2011 	 */
2012 	if (mark_cmd == FAN_MARK_ADD && (flags & FANOTIFY_MARK_IGNORE_BITS) &&
2013 	    !(flags & FAN_MARK_IGNORED_SURV_MODIFY)) {
2014 		ret = !inode ? -EINVAL : -EISDIR;
2015 		/* FAN_MARK_IGNORE requires SURV_MODIFY for sb/mount/dir marks */
2016 		if (ignore == FAN_MARK_IGNORE &&
2017 		    (!inode || S_ISDIR(inode->i_mode)))
2018 			goto path_put_and_out;
2019 
2020 		ret = 0;
2021 		if (inode && inode_is_open_for_write(inode))
2022 			goto path_put_and_out;
2023 	}
2024 
2025 	/* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */
2026 	if (!inode || !S_ISDIR(inode->i_mode)) {
2027 		mask &= ~FAN_EVENT_ON_CHILD;
2028 		umask = FAN_EVENT_ON_CHILD;
2029 		/*
2030 		 * If group needs to report parent fid, register for getting
2031 		 * events with parent/name info for non-directory.
2032 		 */
2033 		if ((fid_mode & FAN_REPORT_DIR_FID) &&
2034 		    (flags & FAN_MARK_ADD) && !ignore)
2035 			mask |= FAN_EVENT_ON_CHILD;
2036 	}
2037 
2038 	/* create/update an inode mark */
2039 	switch (mark_cmd) {
2040 	case FAN_MARK_ADD:
2041 		ret = fanotify_add_mark(group, obj, obj_type, mask, flags,
2042 					fsid);
2043 		break;
2044 	case FAN_MARK_REMOVE:
2045 		ret = fanotify_remove_mark(group, obj, obj_type, mask, flags,
2046 					   umask);
2047 		break;
2048 	default:
2049 		ret = -EINVAL;
2050 	}
2051 
2052 path_put_and_out:
2053 	path_put(&path);
2054 	return ret;
2055 }
2056 
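/*
 * Typical call through the libc wrapper (an illustrative sketch; the
 * path is an example): add an inode mark watching writes to one file.
 *
 *	fanotify_mark(fanotify_fd, FAN_MARK_ADD,
 *		      FAN_MODIFY | FAN_CLOSE_WRITE,
 *		      AT_FDCWD, "/var/log/syslog");
 */
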
2057 #ifndef CONFIG_ARCH_SPLIT_ARG64
2058 SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags,
2059 			      __u64, mask, int, dfd,
2060 			      const char  __user *, pathname)
2061 {
2062 	return do_fanotify_mark(fanotify_fd, flags, mask, dfd, pathname);
2063 }
2064 #endif
2065 
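/*
 * On 32-bit ABIs the 64-bit mask does not fit in one syscall argument
 * register, so it is passed as two 32-bit halves: SC_ARG64() declares
 * the pair (ordered by endianness) and SC_VAL64() reassembles the
 * __u64 before calling do_fanotify_mark().
 */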
2066 #if defined(CONFIG_ARCH_SPLIT_ARG64) || defined(CONFIG_COMPAT)
2067 SYSCALL32_DEFINE6(fanotify_mark,
2068 				int, fanotify_fd, unsigned int, flags,
2069 				SC_ARG64(mask), int, dfd,
2070 				const char  __user *, pathname)
2071 {
2072 	return do_fanotify_mark(fanotify_fd, flags, SC_VAL64(__u64, mask),
2073 				dfd, pathname);
2074 }
2075 #endif
2076 
2077 /*
2078  * fanotify_user_setup - Our initialization function.  Note that we cannot
2079  * return an error because we have compiled-in VFS hooks.  So an (unlikely)
2080  * failure here must result in panic().
2081  */
2082 static int __init fanotify_user_setup(void)
2083 {
2084 	struct sysinfo si;
2085 	int max_marks;
2086 
2087 	si_meminfo(&si);
2088 	/*
2089 	 * Allow up to 1% of addressable memory to be accounted for per-user
2090 	 * marks, limited to the range [8192, 1048576]. Mount and sb marks are
2091 	 * a lot cheaper than inode marks, but there is no reason for a user
2092 	 * to have many of those, so calculate the limit by inode mark cost.
2093 	 */
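	/*
	 * Worked example (illustrative figures; sizeof(struct inode)
	 * varies by configuration): with 16GiB of RAM, 4KiB pages and
	 * an INODE_MARK_COST of ~2KiB, 1% of totalram is ~41943 pages
	 * (~164MiB), allowing ~84000 marks, inside the clamp below.
	 */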
2094 	max_marks = (((si.totalram - si.totalhigh) / 100) << PAGE_SHIFT) /
2095 		    INODE_MARK_COST;
2096 	max_marks = clamp(max_marks, FANOTIFY_OLD_DEFAULT_MAX_MARKS,
2097 				     FANOTIFY_DEFAULT_MAX_USER_MARKS);
2098 
2099 	BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS);
2100 	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 14);
2101 	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 11);
2102 
2103 	fanotify_mark_cache = KMEM_CACHE(fanotify_mark,
2104 					 SLAB_PANIC|SLAB_ACCOUNT);
2105 	fanotify_fid_event_cachep = KMEM_CACHE(fanotify_fid_event,
2106 					       SLAB_PANIC);
2107 	fanotify_path_event_cachep = KMEM_CACHE(fanotify_path_event,
2108 						SLAB_PANIC);
2109 	if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) {
2110 		fanotify_perm_event_cachep =
2111 			KMEM_CACHE(fanotify_perm_event, SLAB_PANIC);
2112 	}
2113 	fanotify_mnt_event_cachep = KMEM_CACHE(fanotify_mnt_event, SLAB_PANIC);
2114 
2115 	fanotify_max_queued_events = FANOTIFY_DEFAULT_MAX_EVENTS;
2116 	init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS] =
2117 					FANOTIFY_DEFAULT_MAX_GROUPS;
2118 	init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS] = max_marks;
2119 	fanotify_sysctls_init();
2120 
2121 	return 0;
2122 }
2123 device_initcall(fanotify_user_setup);
2124