xref: /linux/fs/notify/fanotify/fanotify_user.c (revision 364eeb79a213fcf9164208b53764223ad522d6b3)
1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/fanotify.h>
3 #include <linux/fcntl.h>
4 #include <linux/file.h>
5 #include <linux/fs.h>
6 #include <linux/anon_inodes.h>
7 #include <linux/fsnotify_backend.h>
8 #include <linux/init.h>
9 #include <linux/mount.h>
10 #include <linux/namei.h>
11 #include <linux/poll.h>
12 #include <linux/security.h>
13 #include <linux/syscalls.h>
14 #include <linux/slab.h>
15 #include <linux/types.h>
16 #include <linux/uaccess.h>
17 #include <linux/compat.h>
18 #include <linux/sched/signal.h>
19 #include <linux/memcontrol.h>
20 #include <linux/statfs.h>
21 #include <linux/exportfs.h>
22 
23 #include <asm/ioctls.h>
24 
25 #include "../fsnotify.h"
26 #include "../fdinfo.h"
27 #include "fanotify.h"
28 
/* Default limit on queued events per notification group */
#define FANOTIFY_DEFAULT_MAX_EVENTS	16384
/* Legacy per-group marks limit, kept for the per-user default below */
#define FANOTIFY_OLD_DEFAULT_MAX_MARKS	8192
/* Default limit on fanotify groups per user */
#define FANOTIFY_DEFAULT_MAX_GROUPS	128
/* Pool size for error events ("FEE" presumably fanotify error event — confirm in fanotify.h) */
#define FANOTIFY_DEFAULT_FEE_POOL_SIZE	32

/*
 * Legacy fanotify marks limits (8192) is per group and we introduced a tunable
 * limit of marks per user, similar to inotify.  Effectively, the legacy limit
 * of fanotify marks per user is <max marks per group> * <max groups per user>.
 * This default limit (1M) also happens to match the increased limit of inotify
 * max_user_watches since v5.10.
 */
#define FANOTIFY_DEFAULT_MAX_USER_MARKS	\
	(FANOTIFY_OLD_DEFAULT_MAX_MARKS * FANOTIFY_DEFAULT_MAX_GROUPS)

/*
 * Most of the memory cost of adding an inode mark is pinning the marked inode.
 * The size of the filesystem inode struct is not uniform across filesystems,
 * so double the size of a VFS inode is used as a conservative approximation.
 */
#define INODE_MARK_COST	(2 * sizeof(struct inode))
50 
/* configurable via /proc/sys/fs/fanotify/ */
static int fanotify_max_queued_events __read_mostly;

#ifdef CONFIG_SYSCTL

#include <linux/sysctl.h>

/* Clamp values for the long-typed sysctl knobs below */
static long ft_zero = 0;
static long ft_int_max = INT_MAX;

/* Knobs exposed under /proc/sys/fs/fanotify/ */
static struct ctl_table fanotify_table[] = {
	{
		.procname	= "max_user_groups",
		.data	= &init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS],
		.maxlen		= sizeof(long),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
		.extra1		= &ft_zero,
		.extra2		= &ft_int_max,
	},
	{
		.procname	= "max_user_marks",
		.data	= &init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS],
		.maxlen		= sizeof(long),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
		.extra1		= &ft_zero,
		.extra2		= &ft_int_max,
	},
	{
		/* int-typed; only a lower bound (0) is enforced */
		.procname	= "max_queued_events",
		.data		= &fanotify_max_queued_events,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO
	},
};

/* Register the table above; called once from module init */
static void __init fanotify_sysctls_init(void)
{
	register_sysctl("fs/fanotify", fanotify_table);
}
#else
#define fanotify_sysctls_init() do { } while (0)
#endif /* CONFIG_SYSCTL */
97 
/*
 * All flags that may be specified in parameter event_f_flags of fanotify_init.
 *
 * Internal and external open flags are stored together in field f_flags of
 * struct file. Only external open flags shall be allowed in event_f_flags.
 * Internal flags like FMODE_NONOTIFY, FMODE_EXEC, FMODE_NOCMTIME shall be
 * excluded.
 */
#define	FANOTIFY_INIT_ALL_EVENT_F_BITS				( \
		O_ACCMODE	| O_APPEND	| O_NONBLOCK	| \
		__O_SYNC	| O_DSYNC	| O_CLOEXEC     | \
		O_LARGEFILE	| O_NOATIME	)
110 
/* Backend callbacks for this group type (defined outside this file) */
extern const struct fsnotify_ops fanotify_fsnotify_ops;

/* Slab caches for marks and the different event flavors */
struct kmem_cache *fanotify_mark_cache __ro_after_init;
struct kmem_cache *fanotify_fid_event_cachep __ro_after_init;
struct kmem_cache *fanotify_path_event_cachep __ro_after_init;
struct kmem_cache *fanotify_perm_event_cachep __ro_after_init;

/* Variable-size info records are padded to this alignment when copied out */
#define FANOTIFY_EVENT_ALIGN 4
#define FANOTIFY_FID_INFO_HDR_LEN \
	(sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle))
#define FANOTIFY_PIDFD_INFO_HDR_LEN \
	sizeof(struct fanotify_event_info_pidfd)
#define FANOTIFY_ERROR_INFO_LEN \
	(sizeof(struct fanotify_event_info_error))
125 
126 static int fanotify_fid_info_len(int fh_len, int name_len)
127 {
128 	int info_len = fh_len;
129 
130 	if (name_len)
131 		info_len += name_len + 1;
132 
133 	return roundup(FANOTIFY_FID_INFO_HDR_LEN + info_len,
134 		       FANOTIFY_EVENT_ALIGN);
135 }
136 
137 /* FAN_RENAME may have one or two dir+name info records */
138 static int fanotify_dir_name_info_len(struct fanotify_event *event)
139 {
140 	struct fanotify_info *info = fanotify_event_info(event);
141 	int dir_fh_len = fanotify_event_dir_fh_len(event);
142 	int dir2_fh_len = fanotify_event_dir2_fh_len(event);
143 	int info_len = 0;
144 
145 	if (dir_fh_len)
146 		info_len += fanotify_fid_info_len(dir_fh_len,
147 						  info->name_len);
148 	if (dir2_fh_len)
149 		info_len += fanotify_fid_info_len(dir2_fh_len,
150 						  info->name2_len);
151 
152 	return info_len;
153 }
154 
/*
 * Number of bytes a read() of this event will produce: the fixed metadata
 * header plus any variable-size info records requested by info_mode.
 * Must agree with what copy_event_to_user() actually emits.
 */
static size_t fanotify_event_len(unsigned int info_mode,
				 struct fanotify_event *event)
{
	size_t event_len = FAN_EVENT_METADATA_LEN;
	int fh_len;
	int dot_len = 0;

	/* Without any info mode, only the metadata header is reported */
	if (!info_mode)
		return event_len;

	if (fanotify_is_error_event(event->mask))
		event_len += FANOTIFY_ERROR_INFO_LEN;

	if (fanotify_event_has_any_dir_fh(event)) {
		event_len += fanotify_dir_name_info_len(event);
	} else if ((info_mode & FAN_REPORT_NAME) &&
		   (event->mask & FAN_ONDIR)) {
		/*
		 * With group flag FAN_REPORT_NAME, if name was not recorded in
		 * event on a directory, we will report the name ".".
		 */
		dot_len = 1;
	}

	if (info_mode & FAN_REPORT_PIDFD)
		event_len += FANOTIFY_PIDFD_INFO_HDR_LEN;

	if (fanotify_event_has_object_fh(event)) {
		fh_len = fanotify_event_object_fh_len(event);
		/* dot_len accounts for the "." name appended to the fid record */
		event_len += fanotify_fid_info_len(fh_len, dot_len);
	}

	return event_len;
}
189 
/*
 * Remove an hashed event from merge hash table.
 * Caller must hold group->notification_lock.
 */
static void fanotify_unhash_event(struct fsnotify_group *group,
				  struct fanotify_event *event)
{
	assert_spin_locked(&group->notification_lock);

	pr_debug("%s: group=%p event=%p bucket=%u\n", __func__,
		 group, event, fanotify_event_hash_bucket(group, event));

	/* Should only be called for events that are currently hashed */
	if (WARN_ON_ONCE(hlist_unhashed(&event->merge_list)))
		return;

	hlist_del_init(&event->merge_list);
}
206 
/*
 * Get an fanotify notification event if one exists and is small
 * enough to fit in "count". Return an error pointer if the count
 * is not large enough. When permission event is dequeued, its state is
 * updated accordingly.
 *
 * Returns NULL when the queue is empty, ERR_PTR(-EINVAL) when the first
 * event does not fit in count, otherwise the dequeued event.
 */
static struct fanotify_event *get_one_event(struct fsnotify_group *group,
					    size_t count)
{
	size_t event_size;
	struct fanotify_event *event = NULL;
	struct fsnotify_event *fsn_event;
	unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES);

	pr_debug("%s: group=%p count=%zd\n", __func__, group, count);

	spin_lock(&group->notification_lock);
	fsn_event = fsnotify_peek_first_event(group);
	if (!fsn_event)
		goto out;	/* empty queue: NULL, not an error */

	event = FANOTIFY_E(fsn_event);
	event_size = fanotify_event_len(info_mode, event);

	/* User buffer cannot hold even this single event */
	if (event_size > count) {
		event = ERR_PTR(-EINVAL);
		goto out;
	}

	/*
	 * Held the notification_lock the whole time, so this is the
	 * same event we peeked above.
	 */
	fsnotify_remove_first_event(group);
	if (fanotify_is_perm_event(event->mask))
		FANOTIFY_PERM(event)->state = FAN_EVENT_REPORTED;
	if (fanotify_is_hashed_event(event->mask))
		fanotify_unhash_event(group, event);
out:
	spin_unlock(&group->notification_lock);
	return event;
}
249 
/*
 * Allocate an unused fd and open *path for the listener. On success the
 * fd number is returned and *file holds the struct file, which the caller
 * must later attach with fd_install() (or put_unused_fd()/fput() on abort).
 * On failure a negative errno is returned and *file is untouched.
 */
static int create_fd(struct fsnotify_group *group, const struct path *path,
		     struct file **file)
{
	int client_fd;
	struct file *new_file;

	client_fd = get_unused_fd_flags(group->fanotify_data.f_flags);
	if (client_fd < 0)
		return client_fd;

	/*
	 * we need a new file handle for the userspace program so it can read even if it was
	 * originally opened O_WRONLY.
	 */
	new_file = dentry_open(path,
			       group->fanotify_data.f_flags | __FMODE_NONOTIFY,
			       current_cred());
	if (IS_ERR(new_file)) {
		/*
		 * we still send an event even if we can't open the file.  this
		 * can happen when say tasks are gone and we try to open their
		 * /proc files or we try to open a WRONLY file like in sysfs
		 * we just send the errno to userspace since there isn't much
		 * else we can do.
		 */
		put_unused_fd(client_fd);
		client_fd = PTR_ERR(new_file);
	} else {
		*file = new_file;
	}

	return client_fd;
}
283 
284 static int process_access_response_info(const char __user *info,
285 					size_t info_len,
286 				struct fanotify_response_info_audit_rule *friar)
287 {
288 	if (info_len != sizeof(*friar))
289 		return -EINVAL;
290 
291 	if (copy_from_user(friar, info, sizeof(*friar)))
292 		return -EFAULT;
293 
294 	if (friar->hdr.type != FAN_RESPONSE_INFO_AUDIT_RULE)
295 		return -EINVAL;
296 	if (friar->hdr.pad != 0)
297 		return -EINVAL;
298 	if (friar->hdr.len != sizeof(*friar))
299 		return -EINVAL;
300 
301 	return info_len;
302 }
303 
/*
 * Finish processing of permission event by setting it to ANSWERED state and
 * drop group->notification_lock.
 *
 * If the waiter already gave up (FAN_EVENT_CANCELED), nobody will consume
 * the answer, so the event is destroyed here instead (after dropping the
 * lock, since fsnotify_destroy_event() may sleep/relock).
 */
static void finish_permission_event(struct fsnotify_group *group,
				    struct fanotify_perm_event *event, u32 response,
				    struct fanotify_response_info_audit_rule *friar)
				    __releases(&group->notification_lock)
{
	bool destroy = false;

	assert_spin_locked(&group->notification_lock);
	/* FAN_INFO is a transport flag, not part of the stored response */
	event->response = response & ~FAN_INFO;
	if (response & FAN_INFO)
		memcpy(&event->audit_rule, friar, sizeof(*friar));

	if (event->state == FAN_EVENT_CANCELED)
		destroy = true;
	else
		event->state = FAN_EVENT_ANSWERED;
	spin_unlock(&group->notification_lock);
	if (destroy)
		fsnotify_destroy_event(group, &event->fae.fse);
}
328 
/*
 * Handle one userspace response to a pending permission event. Validates
 * the response flags, optionally consumes a FAN_INFO record, then finds
 * the matching event on access_list by fd and completes it.
 * Returns the number of info bytes consumed (>= 0) or a negative errno.
 */
static int process_access_response(struct fsnotify_group *group,
				   struct fanotify_response *response_struct,
				   const char __user *info,
				   size_t info_len)
{
	struct fanotify_perm_event *event;
	int fd = response_struct->fd;
	u32 response = response_struct->response;
	int ret = info_len;
	struct fanotify_response_info_audit_rule friar;

	pr_debug("%s: group=%p fd=%d response=%u buf=%p size=%zu\n", __func__,
		 group, fd, response, info, info_len);
	/*
	 * make sure the response is valid, if invalid we do nothing and either
	 * userspace can send a valid response or we will clean it up after the
	 * timeout
	 */
	if (response & ~FANOTIFY_RESPONSE_VALID_MASK)
		return -EINVAL;

	/* Exactly one of FAN_ALLOW / FAN_DENY must be set */
	switch (response & FANOTIFY_RESPONSE_ACCESS) {
	case FAN_ALLOW:
	case FAN_DENY:
		break;
	default:
		return -EINVAL;
	}

	/* FAN_AUDIT only valid if the group opted in at init time */
	if ((response & FAN_AUDIT) && !FAN_GROUP_FLAG(group, FAN_ENABLE_AUDIT))
		return -EINVAL;

	if (response & FAN_INFO) {
		ret = process_access_response_info(info, info_len, &friar);
		if (ret < 0)
			return ret;
		/* FAN_NOFD with FAN_INFO means "validate info only" */
		if (fd == FAN_NOFD)
			return ret;
	} else {
		ret = 0;
	}

	if (fd < 0)
		return -EINVAL;

	spin_lock(&group->notification_lock);
	list_for_each_entry(event, &group->fanotify_data.access_list,
			    fae.fse.list) {
		if (event->fd != fd)
			continue;

		list_del_init(&event->fae.fse.list);
		/* finish_permission_event() drops notification_lock */
		finish_permission_event(group, event, response, &friar);
		wake_up(&group->fanotify_data.access_waitq);
		return ret;
	}
	spin_unlock(&group->notification_lock);

	/* No pending permission event with this fd */
	return -ENOENT;
}
389 
390 static size_t copy_error_info_to_user(struct fanotify_event *event,
391 				      char __user *buf, int count)
392 {
393 	struct fanotify_event_info_error info = { };
394 	struct fanotify_error_event *fee = FANOTIFY_EE(event);
395 
396 	info.hdr.info_type = FAN_EVENT_INFO_TYPE_ERROR;
397 	info.hdr.len = FANOTIFY_ERROR_INFO_LEN;
398 
399 	if (WARN_ON(count < info.hdr.len))
400 		return -EFAULT;
401 
402 	info.error = fee->error;
403 	info.error_count = fee->err_count;
404 
405 	if (copy_to_user(buf, &info, sizeof(info)))
406 		return -EFAULT;
407 
408 	return info.hdr.len;
409 }
410 
/*
 * Copy one fid info record to the user buffer: fixed header, then the
 * variable-size file handle, then (for *_DFID_NAME types) a null-terminated
 * name, then zero padding up to FANOTIFY_EVENT_ALIGN.
 * Returns the total (aligned) record length or -EFAULT.
 */
static int copy_fid_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh,
				 int info_type, const char *name,
				 size_t name_len,
				 char __user *buf, size_t count)
{
	struct fanotify_event_info_fid info = { };
	struct file_handle handle = { };
	unsigned char bounce[FANOTIFY_INLINE_FH_LEN], *fh_buf;
	size_t fh_len = fh ? fh->len : 0;
	size_t info_len = fanotify_fid_info_len(fh_len, name_len);
	size_t len = info_len;	/* bytes remaining to emit, decremented below */

	pr_debug("%s: fh_len=%zu name_len=%zu, info_len=%zu, count=%zu\n",
		 __func__, fh_len, name_len, info_len, count);

	if (WARN_ON_ONCE(len < sizeof(info) || len > count))
		return -EFAULT;

	/*
	 * Copy event info fid header followed by variable sized file handle
	 * and optionally followed by variable sized filename.
	 */
	switch (info_type) {
	case FAN_EVENT_INFO_TYPE_FID:
	case FAN_EVENT_INFO_TYPE_DFID:
		/* These types never carry a name */
		if (WARN_ON_ONCE(name_len))
			return -EFAULT;
		break;
	case FAN_EVENT_INFO_TYPE_DFID_NAME:
	case FAN_EVENT_INFO_TYPE_OLD_DFID_NAME:
	case FAN_EVENT_INFO_TYPE_NEW_DFID_NAME:
		/* These types always carry a name */
		if (WARN_ON_ONCE(!name || !name_len))
			return -EFAULT;
		break;
	default:
		return -EFAULT;
	}

	info.hdr.info_type = info_type;
	info.hdr.len = len;
	info.fsid = *fsid;
	if (copy_to_user(buf, &info, sizeof(info)))
		return -EFAULT;

	buf += sizeof(info);
	len -= sizeof(info);
	if (WARN_ON_ONCE(len < sizeof(handle)))
		return -EFAULT;

	handle.handle_type = fh->type;
	handle.handle_bytes = fh_len;

	/* Mangle handle_type for bad file_handle */
	if (!fh_len)
		handle.handle_type = FILEID_INVALID;

	if (copy_to_user(buf, &handle, sizeof(handle)))
		return -EFAULT;

	buf += sizeof(handle);
	len -= sizeof(handle);
	if (WARN_ON_ONCE(len < fh_len))
		return -EFAULT;

	/*
	 * For an inline fh and inline file name, copy through stack to exclude
	 * the copy from usercopy hardening protections.
	 */
	fh_buf = fanotify_fh_buf(fh);
	if (fh_len <= FANOTIFY_INLINE_FH_LEN) {
		memcpy(bounce, fh_buf, fh_len);
		fh_buf = bounce;
	}
	if (copy_to_user(buf, fh_buf, fh_len))
		return -EFAULT;

	buf += fh_len;
	len -= fh_len;

	if (name_len) {
		/* Copy the filename with terminating null */
		name_len++;
		if (WARN_ON_ONCE(len < name_len))
			return -EFAULT;

		if (copy_to_user(buf, name, name_len))
			return -EFAULT;

		buf += name_len;
		len -= name_len;
	}

	/* Pad with 0's */
	/* NOTE(review): len is size_t, so the "len < 0" half is vacuous */
	WARN_ON_ONCE(len < 0 || len >= FANOTIFY_EVENT_ALIGN);
	if (len > 0 && clear_user(buf, len))
		return -EFAULT;

	return info_len;
}
510 
511 static int copy_pidfd_info_to_user(int pidfd,
512 				   char __user *buf,
513 				   size_t count)
514 {
515 	struct fanotify_event_info_pidfd info = { };
516 	size_t info_len = FANOTIFY_PIDFD_INFO_HDR_LEN;
517 
518 	if (WARN_ON_ONCE(info_len > count))
519 		return -EFAULT;
520 
521 	info.hdr.info_type = FAN_EVENT_INFO_TYPE_PIDFD;
522 	info.hdr.len = info_len;
523 	info.pidfd = pidfd;
524 
525 	if (copy_to_user(buf, &info, info_len))
526 		return -EFAULT;
527 
528 	return info_len;
529 }
530 
/*
 * Emit all info records for one event into the user buffer, in the fixed
 * order documented below. Returns total bytes written or a negative errno
 * from the first failing record.
 */
static int copy_info_records_to_user(struct fanotify_event *event,
				     struct fanotify_info *info,
				     unsigned int info_mode, int pidfd,
				     char __user *buf, size_t count)
{
	int ret, total_bytes = 0, info_type = 0;
	unsigned int fid_mode = info_mode & FANOTIFY_FID_BITS;
	unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD;

	/*
	 * Event info records order is as follows:
	 * 1. dir fid + name
	 * 2. (optional) new dir fid + new name
	 * 3. (optional) child fid
	 */
	if (fanotify_event_has_dir_fh(event)) {
		info_type = info->name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME :
					     FAN_EVENT_INFO_TYPE_DFID;

		/* FAN_RENAME uses special info types */
		if (event->mask & FAN_RENAME)
			info_type = FAN_EVENT_INFO_TYPE_OLD_DFID_NAME;

		ret = copy_fid_info_to_user(fanotify_event_fsid(event),
					    fanotify_info_dir_fh(info),
					    info_type,
					    fanotify_info_name(info),
					    info->name_len, buf, count);
		if (ret < 0)
			return ret;

		buf += ret;
		count -= ret;
		total_bytes += ret;
	}

	/* New dir fid+name may be reported in addition to old dir fid+name */
	if (fanotify_event_has_dir2_fh(event)) {
		info_type = FAN_EVENT_INFO_TYPE_NEW_DFID_NAME;
		ret = copy_fid_info_to_user(fanotify_event_fsid(event),
					    fanotify_info_dir2_fh(info),
					    info_type,
					    fanotify_info_name2(info),
					    info->name2_len, buf, count);
		if (ret < 0)
			return ret;

		buf += ret;
		count -= ret;
		total_bytes += ret;
	}

	if (fanotify_event_has_object_fh(event)) {
		const char *dot = NULL;
		int dot_len = 0;

		if (fid_mode == FAN_REPORT_FID || info_type) {
			/*
			 * With only group flag FAN_REPORT_FID only type FID is
			 * reported. Second info record type is always FID.
			 */
			info_type = FAN_EVENT_INFO_TYPE_FID;
		} else if ((fid_mode & FAN_REPORT_NAME) &&
			   (event->mask & FAN_ONDIR)) {
			/*
			 * With group flag FAN_REPORT_NAME, if name was not
			 * recorded in an event on a directory, report the name
			 * "." with info type DFID_NAME.
			 */
			info_type = FAN_EVENT_INFO_TYPE_DFID_NAME;
			dot = ".";
			dot_len = 1;
		} else if ((event->mask & ALL_FSNOTIFY_DIRENT_EVENTS) ||
			   (event->mask & FAN_ONDIR)) {
			/*
			 * With group flag FAN_REPORT_DIR_FID, a single info
			 * record has type DFID for directory entry modification
			 * event and for event on a directory.
			 */
			info_type = FAN_EVENT_INFO_TYPE_DFID;
		} else {
			/*
			 * With group flags FAN_REPORT_DIR_FID|FAN_REPORT_FID,
			 * a single info record has type FID for event on a
			 * non-directory, when there is no directory to report.
			 * For example, on FAN_DELETE_SELF event.
			 */
			info_type = FAN_EVENT_INFO_TYPE_FID;
		}

		ret = copy_fid_info_to_user(fanotify_event_fsid(event),
					    fanotify_event_object_fh(event),
					    info_type, dot, dot_len,
					    buf, count);
		if (ret < 0)
			return ret;

		buf += ret;
		count -= ret;
		total_bytes += ret;
	}

	if (pidfd_mode) {
		ret = copy_pidfd_info_to_user(pidfd, buf, count);
		if (ret < 0)
			return ret;

		buf += ret;
		count -= ret;
		total_bytes += ret;
	}

	if (fanotify_is_error_event(event->mask)) {
		ret = copy_error_info_to_user(event, buf, count);
		if (ret < 0)
			return ret;
		buf += ret;
		count -= ret;
		total_bytes += ret;
	}

	return total_bytes;
}
654 
/*
 * Serialize one event (metadata + info records) into the user buffer.
 * May create an event fd and/or a pidfd; they are only fd_install()ed
 * after everything was copied successfully, and torn down on failure.
 * Returns the event length or a negative errno.
 */
static ssize_t copy_event_to_user(struct fsnotify_group *group,
				  struct fanotify_event *event,
				  char __user *buf, size_t count)
{
	struct fanotify_event_metadata metadata;
	const struct path *path = fanotify_event_path(event);
	struct fanotify_info *info = fanotify_event_info(event);
	unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES);
	unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD;
	struct file *f = NULL, *pidfd_file = NULL;
	int ret, pidfd = FAN_NOPIDFD, fd = FAN_NOFD;

	pr_debug("%s: group=%p event=%p\n", __func__, group, event);

	metadata.event_len = fanotify_event_len(info_mode, event);
	metadata.metadata_len = FAN_EVENT_METADATA_LEN;
	metadata.vers = FANOTIFY_METADATA_VERSION;
	metadata.reserved = 0;
	metadata.mask = event->mask & FANOTIFY_OUTGOING_EVENTS;
	metadata.pid = pid_vnr(event->pid);
	/*
	 * For an unprivileged listener, event->pid can be used to identify the
	 * events generated by the listener process itself, without disclosing
	 * the pids of other processes.
	 */
	if (FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) &&
	    task_tgid(current) != event->pid)
		metadata.pid = 0;

	/*
	 * For now, fid mode is required for an unprivileged listener and
	 * fid mode does not report fd in events.  Keep this check anyway
	 * for safety in case fid mode requirement is relaxed in the future
	 * to allow unprivileged listener to get events with no fd and no fid.
	 */
	if (!FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) &&
	    path && path->mnt && path->dentry) {
		fd = create_fd(group, path, &f);
		if (fd < 0)
			return fd;
	}
	metadata.fd = fd;

	if (pidfd_mode) {
		/*
		 * Complain if the FAN_REPORT_PIDFD and FAN_REPORT_TID mutual
		 * exclusion is ever lifted. At the time of incoporating pidfd
		 * support within fanotify, the pidfd API only supported the
		 * creation of pidfds for thread-group leaders.
		 */
		WARN_ON_ONCE(FAN_GROUP_FLAG(group, FAN_REPORT_TID));

		/*
		 * The PIDTYPE_TGID check for an event->pid is performed
		 * preemptively in an attempt to catch out cases where the event
		 * listener reads events after the event generating process has
		 * already terminated. Report FAN_NOPIDFD to the event listener
		 * in those cases, with all other pidfd creation errors being
		 * reported as FAN_EPIDFD.
		 */
		if (metadata.pid == 0 ||
		    !pid_has_task(event->pid, PIDTYPE_TGID)) {
			pidfd = FAN_NOPIDFD;
		} else {
			pidfd = pidfd_prepare(event->pid, 0, &pidfd_file);
			if (pidfd < 0)
				pidfd = FAN_EPIDFD;
		}
	}

	ret = -EFAULT;
	/*
	 * Sanity check copy size in case get_one_event() and
	 * event_len sizes ever get out of sync.
	 */
	if (WARN_ON_ONCE(metadata.event_len > count))
		goto out_close_fd;

	if (copy_to_user(buf, &metadata, FAN_EVENT_METADATA_LEN))
		goto out_close_fd;

	buf += FAN_EVENT_METADATA_LEN;
	count -= FAN_EVENT_METADATA_LEN;

	/* Remember the fd so a later response can be matched to this event */
	if (fanotify_is_perm_event(event->mask))
		FANOTIFY_PERM(event)->fd = fd;

	if (info_mode) {
		ret = copy_info_records_to_user(event, info, info_mode, pidfd,
						buf, count);
		if (ret < 0)
			goto out_close_fd;
	}

	/* Point of no return: publish the fds to the listener's fd table */
	if (f)
		fd_install(fd, f);

	if (pidfd_file)
		fd_install(pidfd, pidfd_file);

	return metadata.event_len;

out_close_fd:
	/* Unwind fds that were reserved but never installed */
	if (fd != FAN_NOFD) {
		put_unused_fd(fd);
		fput(f);
	}

	if (pidfd >= 0) {
		put_unused_fd(pidfd);
		fput(pidfd_file);
	}

	return ret;
}
770 
/* fanotify userspace file descriptor functions */
772 static __poll_t fanotify_poll(struct file *file, poll_table *wait)
773 {
774 	struct fsnotify_group *group = file->private_data;
775 	__poll_t ret = 0;
776 
777 	poll_wait(file, &group->notification_waitq, wait);
778 	spin_lock(&group->notification_lock);
779 	if (!fsnotify_notify_queue_is_empty(group))
780 		ret = EPOLLIN | EPOLLRDNORM;
781 	spin_unlock(&group->notification_lock);
782 
783 	return ret;
784 }
785 
/*
 * read() on a fanotify fd: copy as many whole events as fit into the user
 * buffer. Blocks (unless O_NONBLOCK) only when nothing was copied yet.
 * Permission events are parked on access_list to await a written response;
 * all other events are destroyed after being copied out.
 */
static ssize_t fanotify_read(struct file *file, char __user *buf,
			     size_t count, loff_t *pos)
{
	struct fsnotify_group *group;
	struct fanotify_event *event;
	char __user *start;
	int ret;
	DEFINE_WAIT_FUNC(wait, woken_wake_function);

	start = buf;
	group = file->private_data;

	pr_debug("%s: group=%p\n", __func__, group);

	add_wait_queue(&group->notification_waitq, &wait);
	while (1) {
		/*
		 * User can supply arbitrarily large buffer. Avoid softlockups
		 * in case there are lots of available events.
		 */
		cond_resched();
		event = get_one_event(group, count);
		if (IS_ERR(event)) {
			/* Remaining buffer too small for the next event */
			ret = PTR_ERR(event);
			break;
		}

		if (!event) {
			/* Queue is empty: return, or wait if nothing copied yet */
			ret = -EAGAIN;
			if (file->f_flags & O_NONBLOCK)
				break;

			ret = -ERESTARTSYS;
			if (signal_pending(current))
				break;

			if (start != buf)
				break;

			wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
			continue;
		}

		ret = copy_event_to_user(group, event, buf, count);
		if (unlikely(ret == -EOPENSTALE)) {
			/*
			 * We cannot report events with stale fd so drop it.
			 * Setting ret to 0 will continue the event loop and
			 * do the right thing if there are no more events to
			 * read (i.e. return bytes read, -EAGAIN or wait).
			 */
			ret = 0;
		}

		/*
		 * Permission events get queued to wait for response.  Other
		 * events can be destroyed now.
		 */
		if (!fanotify_is_perm_event(event->mask)) {
			fsnotify_destroy_event(group, &event->fse);
		} else {
			if (ret <= 0) {
				/* Copy failed: auto-deny and wake the waiter.
				 * finish_permission_event() drops the lock. */
				spin_lock(&group->notification_lock);
				finish_permission_event(group,
					FANOTIFY_PERM(event), FAN_DENY, NULL);
				wake_up(&group->fanotify_data.access_waitq);
			} else {
				/* Park the event until userspace responds */
				spin_lock(&group->notification_lock);
				list_add_tail(&event->fse.list,
					&group->fanotify_data.access_list);
				spin_unlock(&group->notification_lock);
			}
		}
		if (ret < 0)
			break;
		buf += ret;
		count -= ret;
	}
	remove_wait_queue(&group->notification_waitq, &wait);

	/* Partial success wins over a late error, except EFAULT */
	if (start != buf && ret != -EFAULT)
		ret = buf - start;
	return ret;
}
870 
871 static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
872 {
873 	struct fanotify_response response;
874 	struct fsnotify_group *group;
875 	int ret;
876 	const char __user *info_buf = buf + sizeof(struct fanotify_response);
877 	size_t info_len;
878 
879 	if (!IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS))
880 		return -EINVAL;
881 
882 	group = file->private_data;
883 
884 	pr_debug("%s: group=%p count=%zu\n", __func__, group, count);
885 
886 	if (count < sizeof(response))
887 		return -EINVAL;
888 
889 	if (copy_from_user(&response, buf, sizeof(response)))
890 		return -EFAULT;
891 
892 	info_len = count - sizeof(response);
893 
894 	ret = process_access_response(group, &response, info_buf, info_len);
895 	if (ret < 0)
896 		count = ret;
897 	else
898 		count = sizeof(response) + ret;
899 
900 	return count;
901 }
902 
/*
 * Final fput() of the fanotify fd: flush all pending events, auto-allow
 * any outstanding permission events so their waiters can proceed, then
 * drop the group reference taken at fanotify_init time.
 */
static int fanotify_release(struct inode *ignored, struct file *file)
{
	struct fsnotify_group *group = file->private_data;
	struct fsnotify_event *fsn_event;

	/*
	 * Stop new events from arriving in the notification queue. since
	 * userspace cannot use fanotify fd anymore, no event can enter or
	 * leave access_list by now either.
	 */
	fsnotify_group_stop_queueing(group);

	/*
	 * Process all permission events on access_list and notification queue
	 * and simulate reply from userspace.
	 */
	spin_lock(&group->notification_lock);
	while (!list_empty(&group->fanotify_data.access_list)) {
		struct fanotify_perm_event *event;

		event = list_first_entry(&group->fanotify_data.access_list,
				struct fanotify_perm_event, fae.fse.list);
		list_del_init(&event->fae.fse.list);
		/* finish_permission_event() drops the lock; retake it */
		finish_permission_event(group, event, FAN_ALLOW, NULL);
		spin_lock(&group->notification_lock);
	}

	/*
	 * Destroy all non-permission events. For permission events just
	 * dequeue them and set the response. They will be freed once the
	 * response is consumed and fanotify_get_response() returns.
	 */
	while ((fsn_event = fsnotify_remove_first_event(group))) {
		struct fanotify_event *event = FANOTIFY_E(fsn_event);

		if (!(event->mask & FANOTIFY_PERM_EVENTS)) {
			/* Drop the lock: destroying the event may sleep */
			spin_unlock(&group->notification_lock);
			fsnotify_destroy_event(group, fsn_event);
		} else {
			/* finish_permission_event() drops the lock */
			finish_permission_event(group, FANOTIFY_PERM(event),
						FAN_ALLOW, NULL);
		}
		spin_lock(&group->notification_lock);
	}
	spin_unlock(&group->notification_lock);

	/* Response for all permission events is set, wakeup waiters */
	wake_up(&group->fanotify_data.access_waitq);

	/* matches the fanotify_init->fsnotify_alloc_group */
	fsnotify_destroy_group(group);

	return 0;
}
957 
/*
 * ioctl() on a fanotify fd. Only FIONREAD is supported: it reports the
 * queued metadata bytes (FAN_EVENT_METADATA_LEN per event; variable-size
 * info records are not included in this count).
 */
static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct fsnotify_group *group;
	struct fsnotify_event *fsn_event;
	void __user *p;
	int ret = -ENOTTY;
	size_t send_len = 0;

	group = file->private_data;

	p = (void __user *) arg;

	switch (cmd) {
	case FIONREAD:
		spin_lock(&group->notification_lock);
		list_for_each_entry(fsn_event, &group->notification_list, list)
			send_len += FAN_EVENT_METADATA_LEN;
		spin_unlock(&group->notification_lock);
		ret = put_user(send_len, (int __user *) p);
		break;
	}

	return ret;
}
982 
/* File operations backing the fd returned by fanotify_init(2) */
static const struct file_operations fanotify_fops = {
	.show_fdinfo	= fanotify_show_fdinfo,
	.poll		= fanotify_poll,
	.read		= fanotify_read,
	.write		= fanotify_write,
	.fasync		= NULL,
	.release	= fanotify_release,
	.unlocked_ioctl	= fanotify_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.llseek		= noop_llseek,
};
994 
/*
 * Resolve the object to mark, from either dfd (when filename is NULL) or
 * a path lookup relative to dfd. On success *path holds a reference the
 * caller must path_put(); also checks read permission and LSM approval.
 * Returns 0 or a negative errno.
 */
static int fanotify_find_path(int dfd, const char __user *filename,
			      struct path *path, unsigned int flags, __u64 mask,
			      unsigned int obj_type)
{
	int ret;

	pr_debug("%s: dfd=%d filename=%p flags=%x\n", __func__,
		 dfd, filename, flags);

	if (filename == NULL) {
		/* Mark the object referenced by dfd itself */
		CLASS(fd, f)(dfd);

		if (fd_empty(f))
			return -EBADF;

		if ((flags & FAN_MARK_ONLYDIR) &&
		    !(S_ISDIR(file_inode(fd_file(f))->i_mode)))
			return -ENOTDIR;

		/* Take our own path reference; CLASS(fd) drops f at scope exit */
		*path = fd_file(f)->f_path;
		path_get(path);
	} else {
		unsigned int lookup_flags = 0;

		if (!(flags & FAN_MARK_DONT_FOLLOW))
			lookup_flags |= LOOKUP_FOLLOW;
		if (flags & FAN_MARK_ONLYDIR)
			lookup_flags |= LOOKUP_DIRECTORY;

		ret = user_path_at(dfd, filename, lookup_flags, path);
		if (ret)
			goto out;
	}

	/* you can only watch an inode if you have read permissions on it */
	ret = path_permission(path, MAY_READ);
	if (ret) {
		path_put(path);
		goto out;
	}

	ret = security_path_notify(path, mask, obj_type);
	if (ret)
		path_put(path);

out:
	return ret;
}
1043 
/*
 * Clear bits from a mark's event mask (or ignore mask, depending on flags).
 * Returns the event bits that were actually turned off by this call, and
 * sets *destroy when only umask bits remain so the caller can free the mark.
 */
static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
					    __u32 mask, unsigned int flags,
					    __u32 umask, int *destroy)
{
	__u32 oldmask, newmask;

	/* umask bits cannot be removed by user */
	mask &= ~umask;
	spin_lock(&fsn_mark->lock);
	oldmask = fsnotify_calc_mask(fsn_mark);
	if (!(flags & FANOTIFY_MARK_IGNORE_BITS)) {
		fsn_mark->mask &= ~mask;
	} else {
		fsn_mark->ignore_mask &= ~mask;
	}
	newmask = fsnotify_calc_mask(fsn_mark);
	/*
	 * We need to keep the mark around even if remaining mask cannot
	 * result in any events (e.g. mask == FAN_ONDIR) to support incremental
	 * changes to the mask.
	 * Destroy mark when only umask bits remain.
	 */
	*destroy = !((fsn_mark->mask | fsn_mark->ignore_mask) & ~umask);
	spin_unlock(&fsn_mark->lock);

	return oldmask & ~newmask;
}
1071 
/*
 * Handle FAN_MARK_REMOVE: remove @mask bits from the group's mark on the
 * object (@obj of @obj_type) and destroy the mark if only @umask bits
 * remain in it.
 *
 * Returns 0 on success or -ENOENT if the group has no mark on the object.
 */
static int fanotify_remove_mark(struct fsnotify_group *group,
				void *obj, unsigned int obj_type, __u32 mask,
				unsigned int flags, __u32 umask)
{
	struct fsnotify_mark *fsn_mark = NULL;
	__u32 removed;
	int destroy_mark;

	fsnotify_group_lock(group);
	fsn_mark = fsnotify_find_mark(obj, obj_type, group);
	if (!fsn_mark) {
		fsnotify_group_unlock(group);
		return -ENOENT;
	}

	removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
						 umask, &destroy_mark);
	/* Only recalc the object's mask if event bits were actually removed */
	if (removed & fsnotify_conn_mask(fsn_mark->connector))
		fsnotify_recalc_mask(fsn_mark->connector);
	if (destroy_mark)
		fsnotify_detach_mark(fsn_mark);
	fsnotify_group_unlock(group);
	/* Free must happen outside the group lock */
	if (destroy_mark)
		fsnotify_free_mark(fsn_mark);

	/* matches the fsnotify_find_mark() */
	fsnotify_put_mark(fsn_mark);
	return 0;
}
1101 
/*
 * Update the internal flags of @fsn_mark according to the @fan_flags of an
 * add/update request.  Called with fsn_mark->lock held (see
 * fanotify_mark_add_to_mask()).
 *
 * Returns true if the object's combined mask needs to be recalculated.
 */
static bool fanotify_mark_update_flags(struct fsnotify_mark *fsn_mark,
				       unsigned int fan_flags)
{
	bool want_iref = !(fan_flags & FAN_MARK_EVICTABLE);
	unsigned int ignore = fan_flags & FANOTIFY_MARK_IGNORE_BITS;
	bool recalc = false;

	/*
	 * When using FAN_MARK_IGNORE for the first time, mark starts using
	 * independent event flags in ignore mask.  After that, trying to
	 * update the ignore mask with the old FAN_MARK_IGNORED_MASK API
	 * will result in EEXIST error.
	 */
	if (ignore == FAN_MARK_IGNORE)
		fsn_mark->flags |= FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS;

	/*
	 * Setting FAN_MARK_IGNORED_SURV_MODIFY for the first time may lead to
	 * the removal of the FS_MODIFY bit in calculated mask if it was set
	 * because of an ignore mask that is now going to survive FS_MODIFY.
	 */
	if (ignore && (fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
	    !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) {
		fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
		if (!(fsn_mark->mask & FS_MODIFY))
			recalc = true;
	}

	/* Only inode marks can carry the NO_IREF (evictable) flag */
	if (fsn_mark->connector->type != FSNOTIFY_OBJ_TYPE_INODE ||
	    want_iref == !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF))
		return recalc;

	/*
	 * NO_IREF may be removed from a mark, but not added.
	 * When removed, fsnotify_recalc_mask() will take the inode ref.
	 */
	WARN_ON_ONCE(!want_iref);
	fsn_mark->flags &= ~FSNOTIFY_MARK_FLAG_NO_IREF;

	return true;
}
1143 
/*
 * Add @mask bits to either the event mask or the ignore mask of @fsn_mark,
 * depending on FANOTIFY_MARK_IGNORE_BITS in @fan_flags, and update the
 * mark's flags.
 *
 * Returns true if the change may affect the object's combined mask, in
 * which case the caller should call fsnotify_recalc_mask().
 */
static bool fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
				      __u32 mask, unsigned int fan_flags)
{
	bool recalc;

	spin_lock(&fsn_mark->lock);
	if (!(fan_flags & FANOTIFY_MARK_IGNORE_BITS))
		fsn_mark->mask |= mask;
	else
		fsn_mark->ignore_mask |= mask;

	/* Recalc needed if the mark now has bits the connector lacks */
	recalc = fsnotify_calc_mask(fsn_mark) &
		~fsnotify_conn_mask(fsn_mark->connector);

	recalc |= fanotify_mark_update_flags(fsn_mark, fan_flags);
	spin_unlock(&fsn_mark->lock);

	return recalc;
}
1163 
/*
 * Filesystem id (fsid) sampled from a marked object, used when the group
 * reports events with file handles (fid modes).
 */
struct fan_fsid {
	struct super_block *sb;	/* sb the fsid was sampled from */
	__kernel_fsid_t id;	/* the fsid itself */
	bool weak;		/* fsid is zero or differs from sb root fsid */
};
1169 
/*
 * Record @fsid on a new @mark and verify it is consistent with the fsid of
 * the group's existing marks:
 *  - marks with weak and strong fsid must not be mixed in one group;
 *  - marks with weak fsid must all be on the same filesystem (sb);
 *  - marks with weak fsid must not span different btrfs sub-volumes.
 * Marks with strong fsid from different filesystems may be mixed freely.
 *
 * Returns 0 if the mark may be added to the group, -EXDEV otherwise.
 */
static int fanotify_set_mark_fsid(struct fsnotify_group *group,
				  struct fsnotify_mark *mark,
				  struct fan_fsid *fsid)
{
	struct fsnotify_mark_connector *conn;
	struct fsnotify_mark *old;
	struct super_block *old_sb = NULL;

	FANOTIFY_MARK(mark)->fsid = fsid->id;
	mark->flags |= FSNOTIFY_MARK_FLAG_HAS_FSID;
	if (fsid->weak)
		mark->flags |= FSNOTIFY_MARK_FLAG_WEAK_FSID;

	/* First mark added will determine if group is single or multi fsid */
	if (list_empty(&group->marks_list))
		return 0;

	/* Find sb of an existing mark */
	list_for_each_entry(old, &group->marks_list, g_list) {
		/* connector may be cleared concurrently, sample it once */
		conn = READ_ONCE(old->connector);
		if (!conn)
			continue;
		old_sb = fsnotify_connector_sb(conn);
		if (old_sb)
			break;
	}

	/* Only detached marks left? */
	if (!old_sb)
		return 0;

	/* Do not allow mixing of marks with weak and strong fsid */
	if ((mark->flags ^ old->flags) & FSNOTIFY_MARK_FLAG_WEAK_FSID)
		return -EXDEV;

	/* Allow mixing of marks with strong fsid from different fs */
	if (!fsid->weak)
		return 0;

	/* Do not allow mixing marks with weak fsid from different fs */
	if (old_sb != fsid->sb)
		return -EXDEV;

	/* Do not allow mixing marks from different btrfs sub-volumes */
	if (!fanotify_fsid_equal(&FANOTIFY_MARK(old)->fsid,
				 &FANOTIFY_MARK(mark)->fsid))
		return -EXDEV;

	return 0;
}
1220 
/*
 * Allocate a new mark for @group and attach it to the object (@obj of
 * @obj_type).  Called with the group lock held (uses
 * fsnotify_add_mark_locked()).
 *
 * The mark is charged against the user's UCOUNT_FANOTIFY_MARKS limit
 * unless the group was created with FAN_UNLIMITED_MARKS; the charge is
 * rolled back on any failure.
 *
 * Returns the new mark (caller is responsible for eventually dropping its
 * reference with fsnotify_put_mark()) or an ERR_PTR() on failure.
 */
static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
						   void *obj,
						   unsigned int obj_type,
						   unsigned int fan_flags,
						   struct fan_fsid *fsid)
{
	struct ucounts *ucounts = group->fanotify_data.ucounts;
	struct fanotify_mark *fan_mark;
	struct fsnotify_mark *mark;
	int ret;

	/*
	 * Enforce per user marks limits per user in all containing user ns.
	 * A group with FAN_UNLIMITED_MARKS does not contribute to mark count
	 * in the limited groups account.
	 */
	if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS) &&
	    !inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_FANOTIFY_MARKS))
		return ERR_PTR(-ENOSPC);

	fan_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
	if (!fan_mark) {
		ret = -ENOMEM;
		goto out_dec_ucounts;
	}

	mark = &fan_mark->fsn_mark;
	fsnotify_init_mark(mark, group);
	if (fan_flags & FAN_MARK_EVICTABLE)
		mark->flags |= FSNOTIFY_MARK_FLAG_NO_IREF;

	/* Cache fsid of filesystem containing the marked object */
	if (fsid) {
		ret = fanotify_set_mark_fsid(group, mark, fsid);
		if (ret)
			goto out_put_mark;
	} else {
		fan_mark->fsid.val[0] = fan_mark->fsid.val[1] = 0;
	}

	ret = fsnotify_add_mark_locked(mark, obj, obj_type, 0);
	if (ret)
		goto out_put_mark;

	return mark;

out_put_mark:
	fsnotify_put_mark(mark);
out_dec_ucounts:
	if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS))
		dec_ucount(ucounts, UCOUNT_FANOTIFY_MARKS);
	return ERR_PTR(ret);
}
1274 
1275 static int fanotify_group_init_error_pool(struct fsnotify_group *group)
1276 {
1277 	if (mempool_initialized(&group->fanotify_data.error_events_pool))
1278 		return 0;
1279 
1280 	return mempool_init_kmalloc_pool(&group->fanotify_data.error_events_pool,
1281 					 FANOTIFY_DEFAULT_FEE_POOL_SIZE,
1282 					 sizeof(struct fanotify_error_event));
1283 }
1284 
1285 static int fanotify_may_update_existing_mark(struct fsnotify_mark *fsn_mark,
1286 					      unsigned int fan_flags)
1287 {
1288 	/*
1289 	 * Non evictable mark cannot be downgraded to evictable mark.
1290 	 */
1291 	if (fan_flags & FAN_MARK_EVICTABLE &&
1292 	    !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF))
1293 		return -EEXIST;
1294 
1295 	/*
1296 	 * New ignore mask semantics cannot be downgraded to old semantics.
1297 	 */
1298 	if (fan_flags & FAN_MARK_IGNORED_MASK &&
1299 	    fsn_mark->flags & FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS)
1300 		return -EEXIST;
1301 
1302 	/*
1303 	 * An ignore mask that survives modify could never be downgraded to not
1304 	 * survive modify.  With new FAN_MARK_IGNORE semantics we make that rule
1305 	 * explicit and return an error when trying to update the ignore mask
1306 	 * without the original FAN_MARK_IGNORED_SURV_MODIFY value.
1307 	 */
1308 	if (fan_flags & FAN_MARK_IGNORE &&
1309 	    !(fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
1310 	    fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)
1311 		return -EEXIST;
1312 
1313 	return 0;
1314 }
1315 
/*
 * Handle FAN_MARK_ADD: find the group's existing mark on the object or
 * create a new one, then add @mask bits to its event or ignore mask.
 *
 * Pre-allocates the group's error event pool when FAN_FS_ERROR is first
 * requested.  Returns 0 on success or a negative error code.
 */
static int fanotify_add_mark(struct fsnotify_group *group,
			     void *obj, unsigned int obj_type,
			     __u32 mask, unsigned int fan_flags,
			     struct fan_fsid *fsid)
{
	struct fsnotify_mark *fsn_mark;
	bool recalc;
	int ret = 0;

	fsnotify_group_lock(group);
	fsn_mark = fsnotify_find_mark(obj, obj_type, group);
	if (!fsn_mark) {
		fsn_mark = fanotify_add_new_mark(group, obj, obj_type,
						 fan_flags, fsid);
		if (IS_ERR(fsn_mark)) {
			fsnotify_group_unlock(group);
			return PTR_ERR(fsn_mark);
		}
	}

	/*
	 * Check if requested mark flags conflict with an existing mark flags.
	 */
	ret = fanotify_may_update_existing_mark(fsn_mark, fan_flags);
	if (ret)
		goto out;

	/*
	 * Error events are pre-allocated per group, only if strictly
	 * needed (i.e. FAN_FS_ERROR was requested).
	 */
	if (!(fan_flags & FANOTIFY_MARK_IGNORE_BITS) &&
	    (mask & FAN_FS_ERROR)) {
		ret = fanotify_group_init_error_pool(group);
		if (ret)
			goto out;
	}

	recalc = fanotify_mark_add_to_mask(fsn_mark, mask, fan_flags);
	if (recalc)
		fsnotify_recalc_mask(fsn_mark->connector);

out:
	fsnotify_group_unlock(group);

	/* Drop ref from fsnotify_find_mark()/fanotify_add_new_mark() */
	fsnotify_put_mark(fsn_mark);
	return ret;
}
1364 
1365 static struct fsnotify_event *fanotify_alloc_overflow_event(void)
1366 {
1367 	struct fanotify_event *oevent;
1368 
1369 	oevent = kmalloc(sizeof(*oevent), GFP_KERNEL_ACCOUNT);
1370 	if (!oevent)
1371 		return NULL;
1372 
1373 	fanotify_init_event(oevent, 0, FS_Q_OVERFLOW);
1374 	oevent->type = FANOTIFY_EVENT_TYPE_OVERFLOW;
1375 
1376 	return &oevent->fse;
1377 }
1378 
1379 static struct hlist_head *fanotify_alloc_merge_hash(void)
1380 {
1381 	struct hlist_head *hash;
1382 
1383 	hash = kmalloc(sizeof(struct hlist_head) << FANOTIFY_HTABLE_BITS,
1384 		       GFP_KERNEL_ACCOUNT);
1385 	if (!hash)
1386 		return NULL;
1387 
1388 	__hash_init(hash, FANOTIFY_HTABLE_SIZE);
1389 
1390 	return hash;
1391 }
1392 
1393 /* fanotify syscalls */
/*
 * fanotify_init() syscall - create a new fanotify group.
 *
 * @flags: FAN_* init flags selecting the notification class, fid report
 *	   modes and queue/marks limits.
 * @event_f_flags: O_* flags applied to file descriptors created for events.
 *
 * Unprivileged users may only create limited (FANOTIFY_UNPRIV) groups that
 * report file handles.  Returns the new group fd or a negative error code.
 */
SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
{
	struct fsnotify_group *group;
	int f_flags, fd;
	unsigned int fid_mode = flags & FANOTIFY_FID_BITS;
	unsigned int class = flags & FANOTIFY_CLASS_BITS;
	unsigned int internal_flags = 0;

	pr_debug("%s: flags=%x event_f_flags=%x\n",
		 __func__, flags, event_f_flags);

	if (!capable(CAP_SYS_ADMIN)) {
		/*
		 * An unprivileged user can setup an fanotify group with
		 * limited functionality - an unprivileged group is limited to
		 * notification events with file handles and it cannot use
		 * unlimited queue/marks.
		 */
		if ((flags & FANOTIFY_ADMIN_INIT_FLAGS) || !fid_mode)
			return -EPERM;

		/*
		 * Setting the internal flag FANOTIFY_UNPRIV on the group
		 * prevents setting mount/filesystem marks on this group and
		 * prevents reporting pid and open fd in events.
		 */
		internal_flags |= FANOTIFY_UNPRIV;
	}

#ifdef CONFIG_AUDITSYSCALL
	if (flags & ~(FANOTIFY_INIT_FLAGS | FAN_ENABLE_AUDIT))
#else
	if (flags & ~FANOTIFY_INIT_FLAGS)
#endif
		return -EINVAL;

	/*
	 * A pidfd can only be returned for a thread-group leader; thus
	 * FAN_REPORT_PIDFD and FAN_REPORT_TID need to remain mutually
	 * exclusive.
	 */
	if ((flags & FAN_REPORT_PIDFD) && (flags & FAN_REPORT_TID))
		return -EINVAL;

	if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS)
		return -EINVAL;

	switch (event_f_flags & O_ACCMODE) {
	case O_RDONLY:
	case O_RDWR:
	case O_WRONLY:
		break;
	default:
		return -EINVAL;
	}

	/* fid modes are only valid with the plain notification class */
	if (fid_mode && class != FAN_CLASS_NOTIF)
		return -EINVAL;

	/*
	 * Child name is reported with parent fid so requires dir fid.
	 * We can report both child fid and dir fid with or without name.
	 */
	if ((fid_mode & FAN_REPORT_NAME) && !(fid_mode & FAN_REPORT_DIR_FID))
		return -EINVAL;

	/*
	 * FAN_REPORT_TARGET_FID requires FAN_REPORT_NAME and FAN_REPORT_FID
	 * and is used as an indication to report both dir and child fid on all
	 * dirent events.
	 */
	if ((fid_mode & FAN_REPORT_TARGET_FID) &&
	    (!(fid_mode & FAN_REPORT_NAME) || !(fid_mode & FAN_REPORT_FID)))
		return -EINVAL;

	f_flags = O_RDWR | __FMODE_NONOTIFY;
	if (flags & FAN_CLOEXEC)
		f_flags |= O_CLOEXEC;
	if (flags & FAN_NONBLOCK)
		f_flags |= O_NONBLOCK;

	/* fsnotify_alloc_group takes a ref.  Dropped in fanotify_release */
	group = fsnotify_alloc_group(&fanotify_fsnotify_ops,
				     FSNOTIFY_GROUP_USER);
	if (IS_ERR(group)) {
		return PTR_ERR(group);
	}

	/* Enforce groups limits per user in all containing user ns */
	group->fanotify_data.ucounts = inc_ucount(current_user_ns(),
						  current_euid(),
						  UCOUNT_FANOTIFY_GROUPS);
	if (!group->fanotify_data.ucounts) {
		fd = -EMFILE;
		goto out_destroy_group;
	}

	group->fanotify_data.flags = flags | internal_flags;
	group->memcg = get_mem_cgroup_from_mm(current->mm);

	group->fanotify_data.merge_hash = fanotify_alloc_merge_hash();
	if (!group->fanotify_data.merge_hash) {
		fd = -ENOMEM;
		goto out_destroy_group;
	}

	group->overflow_event = fanotify_alloc_overflow_event();
	if (unlikely(!group->overflow_event)) {
		fd = -ENOMEM;
		goto out_destroy_group;
	}

	if (force_o_largefile())
		event_f_flags |= O_LARGEFILE;
	group->fanotify_data.f_flags = event_f_flags;
	init_waitqueue_head(&group->fanotify_data.access_waitq);
	INIT_LIST_HEAD(&group->fanotify_data.access_list);
	switch (class) {
	case FAN_CLASS_NOTIF:
		group->priority = FSNOTIFY_PRIO_NORMAL;
		break;
	case FAN_CLASS_CONTENT:
		group->priority = FSNOTIFY_PRIO_CONTENT;
		break;
	case FAN_CLASS_PRE_CONTENT:
		group->priority = FSNOTIFY_PRIO_PRE_CONTENT;
		break;
	default:
		fd = -EINVAL;
		goto out_destroy_group;
	}

	if (flags & FAN_UNLIMITED_QUEUE) {
		fd = -EPERM;
		if (!capable(CAP_SYS_ADMIN))
			goto out_destroy_group;
		group->max_events = UINT_MAX;
	} else {
		group->max_events = fanotify_max_queued_events;
	}

	if (flags & FAN_UNLIMITED_MARKS) {
		fd = -EPERM;
		if (!capable(CAP_SYS_ADMIN))
			goto out_destroy_group;
	}

	if (flags & FAN_ENABLE_AUDIT) {
		fd = -EPERM;
		if (!capable(CAP_AUDIT_WRITE))
			goto out_destroy_group;
	}

	fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags);
	if (fd < 0)
		goto out_destroy_group;

	return fd;

out_destroy_group:
	fsnotify_destroy_group(group);
	return fd;
}
1557 
/*
 * Sample the fsid of @dentry's filesystem into @fsid for reporting fid
 * info with events, and classify it as strong or weak.
 *
 * The fsid is "weak" when it is zero (e.g. fuse) or when it differs from
 * the fsid of the sb root (e.g. btrfs sub-volume).  A weak fsid is only
 * allowed when marking inodes; for other mark types the corresponding
 * error (-ENODEV or -EXDEV) is returned.
 */
static int fanotify_test_fsid(struct dentry *dentry, unsigned int flags,
			      struct fan_fsid *fsid)
{
	unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
	__kernel_fsid_t root_fsid;
	int err;

	/*
	 * Make sure dentry is not of a filesystem with zero fsid (e.g. fuse).
	 */
	err = vfs_get_fsid(dentry, &fsid->id);
	if (err)
		return err;

	fsid->sb = dentry->d_sb;
	if (!fsid->id.val[0] && !fsid->id.val[1]) {
		err = -ENODEV;
		goto weak;
	}

	/*
	 * Make sure dentry is not of a filesystem subvolume (e.g. btrfs)
	 * which uses a different fsid than sb root.
	 */
	err = vfs_get_fsid(dentry->d_sb->s_root, &root_fsid);
	if (err)
		return err;

	if (!fanotify_fsid_equal(&root_fsid, &fsid->id)) {
		err = -EXDEV;
		goto weak;
	}

	fsid->weak = false;
	return 0;

weak:
	/* Allow weak fsid when marking inodes */
	fsid->weak = true;
	return (mark_type == FAN_MARK_INODE) ? 0 : err;
}
1599 
1600 /* Check if filesystem can encode a unique fid */
1601 static int fanotify_test_fid(struct dentry *dentry, unsigned int flags)
1602 {
1603 	unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
1604 	const struct export_operations *nop = dentry->d_sb->s_export_op;
1605 
1606 	/*
1607 	 * We need to make sure that the filesystem supports encoding of
1608 	 * file handles so user can use name_to_handle_at() to compare fids
1609 	 * reported with events to the file handle of watched objects.
1610 	 */
1611 	if (!exportfs_can_encode_fid(nop))
1612 		return -EOPNOTSUPP;
1613 
1614 	/*
1615 	 * For sb/mount mark, we also need to make sure that the filesystem
1616 	 * supports decoding file handles, so user has a way to map back the
1617 	 * reported fids to filesystem objects.
1618 	 */
1619 	if (mark_type != FAN_MARK_INODE && !exportfs_can_decode_fh(nop))
1620 		return -EOPNOTSUPP;
1621 
1622 	return 0;
1623 }
1624 
/*
 * Check whether the events in @mask may be requested on the object at
 * @path with the given mark @flags.  Returns 0 if supported, -EINVAL or
 * -ENOTDIR otherwise.
 */
static int fanotify_events_supported(struct fsnotify_group *group,
				     const struct path *path, __u64 mask,
				     unsigned int flags)
{
	unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
	/* Strict validation of events in non-dir inode mask with v5.17+ APIs */
	bool strict_dir_events = FAN_GROUP_FLAG(group, FAN_REPORT_TARGET_FID) ||
				 (mask & FAN_RENAME) ||
				 (flags & FAN_MARK_IGNORE);

	/*
	 * Some filesystems such as 'proc' acquire unusual locks when opening
	 * files. For them fanotify permission events have high chances of
	 * deadlocking the system - open done when reporting fanotify event
	 * blocks on this "unusual" lock while another process holding the lock
	 * waits for fanotify permission event to be answered. Just disallow
	 * permission events for such filesystems.
	 */
	if (mask & FANOTIFY_PERM_EVENTS &&
	    path->mnt->mnt_sb->s_type->fs_flags & FS_DISALLOW_NOTIFY_PERM)
		return -EINVAL;

	/*
	 * mount and sb marks are not allowed on kernel internal pseudo fs,
	 * like pipe_mnt, because that would subscribe to events on all the
	 * anonymous pipes in the system.
	 *
	 * SB_NOUSER covers all of the internal pseudo fs whose objects are not
	 * exposed to user's mount namespace, but there are other SB_KERNMOUNT
	 * fs, like nsfs, debugfs, for which the value of allowing sb and mount
	 * mark is questionable. For now we leave them alone.
	 */
	if (mark_type != FAN_MARK_INODE &&
	    path->mnt->mnt_sb->s_flags & SB_NOUSER)
		return -EINVAL;

	/*
	 * We shouldn't have allowed setting dirent events and the directory
	 * flags FAN_ONDIR and FAN_EVENT_ON_CHILD in mask of non-dir inode,
	 * but because we always allowed it, error only when using new APIs.
	 */
	if (strict_dir_events && mark_type == FAN_MARK_INODE &&
	    !d_is_dir(path->dentry) && (mask & FANOTIFY_DIRONLY_EVENT_BITS))
		return -ENOTDIR;

	return 0;
}
1672 
/*
 * Common implementation of the fanotify_mark() syscall: add, remove or
 * flush marks of the group identified by @fanotify_fd on an inode, mount
 * or filesystem object.
 *
 * @flags selects the mark type, the command (add/remove/flush) and mark
 * modifiers; @mask holds the requested event bits; @dfd and @pathname
 * identify the object to mark (see fanotify_find_path()).
 *
 * Returns 0 on success or a negative error code.
 */
static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
			    int dfd, const char  __user *pathname)
{
	struct inode *inode = NULL;
	struct vfsmount *mnt = NULL;
	struct fsnotify_group *group;
	struct path path;
	struct fan_fsid __fsid, *fsid = NULL;
	u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS;
	unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
	unsigned int mark_cmd = flags & FANOTIFY_MARK_CMD_BITS;
	unsigned int ignore = flags & FANOTIFY_MARK_IGNORE_BITS;
	unsigned int obj_type, fid_mode;
	void *obj;
	u32 umask = 0;
	int ret;

	pr_debug("%s: fanotify_fd=%d flags=%x dfd=%d pathname=%p mask=%llx\n",
		 __func__, fanotify_fd, flags, dfd, pathname, mask);

	/* we only use the lower 32 bits as of right now. */
	if (upper_32_bits(mask))
		return -EINVAL;

	if (flags & ~FANOTIFY_MARK_FLAGS)
		return -EINVAL;

	/* Map the mark type flag to the fsnotify object type */
	switch (mark_type) {
	case FAN_MARK_INODE:
		obj_type = FSNOTIFY_OBJ_TYPE_INODE;
		break;
	case FAN_MARK_MOUNT:
		obj_type = FSNOTIFY_OBJ_TYPE_VFSMOUNT;
		break;
	case FAN_MARK_FILESYSTEM:
		obj_type = FSNOTIFY_OBJ_TYPE_SB;
		break;
	default:
		return -EINVAL;
	}

	switch (mark_cmd) {
	case FAN_MARK_ADD:
	case FAN_MARK_REMOVE:
		if (!mask)
			return -EINVAL;
		break;
	case FAN_MARK_FLUSH:
		if (flags & ~(FANOTIFY_MARK_TYPE_BITS | FAN_MARK_FLUSH))
			return -EINVAL;
		break;
	default:
		return -EINVAL;
	}

	if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS))
		valid_mask |= FANOTIFY_PERM_EVENTS;

	if (mask & ~valid_mask)
		return -EINVAL;


	/* We don't allow FAN_MARK_IGNORE & FAN_MARK_IGNORED_MASK together */
	if (ignore == (FAN_MARK_IGNORE | FAN_MARK_IGNORED_MASK))
		return -EINVAL;

	/*
	 * Event flags (FAN_ONDIR, FAN_EVENT_ON_CHILD) have no effect with
	 * FAN_MARK_IGNORED_MASK.
	 */
	if (ignore == FAN_MARK_IGNORED_MASK) {
		mask &= ~FANOTIFY_EVENT_FLAGS;
		umask = FANOTIFY_EVENT_FLAGS;
	}

	CLASS(fd, f)(fanotify_fd);
	if (fd_empty(f))
		return -EBADF;

	/* verify that this is indeed an fanotify instance */
	if (unlikely(fd_file(f)->f_op != &fanotify_fops))
		return -EINVAL;
	group = fd_file(f)->private_data;

	/*
	 * An unprivileged user is not allowed to setup mount nor filesystem
	 * marks.  This also includes setting up such marks by a group that
	 * was initialized by an unprivileged user.
	 */
	if ((!capable(CAP_SYS_ADMIN) ||
	     FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV)) &&
	    mark_type != FAN_MARK_INODE)
		return -EPERM;

	/*
	 * Permission events require minimum priority FAN_CLASS_CONTENT.
	 */
	if (mask & FANOTIFY_PERM_EVENTS &&
	    group->priority < FSNOTIFY_PRIO_CONTENT)
		return -EINVAL;

	/* FAN_FS_ERROR is only meaningful for whole-filesystem marks */
	if (mask & FAN_FS_ERROR &&
	    mark_type != FAN_MARK_FILESYSTEM)
		return -EINVAL;

	/*
	 * Evictable is only relevant for inode marks, because only inode object
	 * can be evicted on memory pressure.
	 */
	if (flags & FAN_MARK_EVICTABLE &&
	     mark_type != FAN_MARK_INODE)
		return -EINVAL;

	/*
	 * Events that do not carry enough information to report
	 * event->fd require a group that supports reporting fid.  Those
	 * events are not supported on a mount mark, because they do not
	 * carry enough information (i.e. path) to be filtered by mount
	 * point.
	 */
	fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
	if (mask & ~(FANOTIFY_FD_EVENTS|FANOTIFY_EVENT_FLAGS) &&
	    (!fid_mode || mark_type == FAN_MARK_MOUNT))
		return -EINVAL;

	/*
	 * FAN_RENAME uses special info type records to report the old and
	 * new parent+name.  Reporting only old and new parent id is less
	 * useful and was not implemented.
	 */
	if (mask & FAN_RENAME && !(fid_mode & FAN_REPORT_NAME))
		return -EINVAL;

	if (mark_cmd == FAN_MARK_FLUSH) {
		if (mark_type == FAN_MARK_MOUNT)
			fsnotify_clear_vfsmount_marks_by_group(group);
		else if (mark_type == FAN_MARK_FILESYSTEM)
			fsnotify_clear_sb_marks_by_group(group);
		else
			fsnotify_clear_inode_marks_by_group(group);
		return 0;
	}

	ret = fanotify_find_path(dfd, pathname, &path, flags,
			(mask & ALL_FSNOTIFY_EVENTS), obj_type);
	if (ret)
		return ret;

	if (mark_cmd == FAN_MARK_ADD) {
		ret = fanotify_events_supported(group, &path, mask, flags);
		if (ret)
			goto path_put_and_out;
	}

	if (fid_mode) {
		ret = fanotify_test_fsid(path.dentry, flags, &__fsid);
		if (ret)
			goto path_put_and_out;

		ret = fanotify_test_fid(path.dentry, flags);
		if (ret)
			goto path_put_and_out;

		fsid = &__fsid;
	}

	/* inode held in place by reference to path; group by fget on fd */
	if (mark_type == FAN_MARK_INODE) {
		inode = path.dentry->d_inode;
		obj = inode;
	} else {
		mnt = path.mnt;
		if (mark_type == FAN_MARK_MOUNT)
			obj = mnt;
		else
			obj = mnt->mnt_sb;
	}

	/*
	 * If some other task has this inode open for write we should not add
	 * an ignore mask, unless that ignore mask is supposed to survive
	 * modification changes anyway.
	 */
	if (mark_cmd == FAN_MARK_ADD && (flags & FANOTIFY_MARK_IGNORE_BITS) &&
	    !(flags & FAN_MARK_IGNORED_SURV_MODIFY)) {
		ret = mnt ? -EINVAL : -EISDIR;
		/* FAN_MARK_IGNORE requires SURV_MODIFY for sb/mount/dir marks */
		if (ignore == FAN_MARK_IGNORE &&
		    (mnt || S_ISDIR(inode->i_mode)))
			goto path_put_and_out;

		ret = 0;
		if (inode && inode_is_open_for_write(inode))
			goto path_put_and_out;
	}

	/* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */
	if (mnt || !S_ISDIR(inode->i_mode)) {
		mask &= ~FAN_EVENT_ON_CHILD;
		umask = FAN_EVENT_ON_CHILD;
		/*
		 * If group needs to report parent fid, register for getting
		 * events with parent/name info for non-directory.
		 */
		if ((fid_mode & FAN_REPORT_DIR_FID) &&
		    (flags & FAN_MARK_ADD) && !ignore)
			mask |= FAN_EVENT_ON_CHILD;
	}

	/* create/update an inode mark */
	switch (mark_cmd) {
	case FAN_MARK_ADD:
		ret = fanotify_add_mark(group, obj, obj_type, mask, flags,
					fsid);
		break;
	case FAN_MARK_REMOVE:
		ret = fanotify_remove_mark(group, obj, obj_type, mask, flags,
					   umask);
		break;
	default:
		ret = -EINVAL;
	}

path_put_and_out:
	path_put(&path);
	return ret;
}
1900 
#ifndef CONFIG_ARCH_SPLIT_ARG64
/* Native entry point: the 64-bit mask fits in a single syscall argument. */
SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags,
			      __u64, mask, int, dfd,
			      const char  __user *, pathname)
{
	return do_fanotify_mark(fanotify_fd, flags, mask, dfd, pathname);
}
#endif
1909 
#if defined(CONFIG_ARCH_SPLIT_ARG64) || defined(CONFIG_COMPAT)
/*
 * 32-bit/compat entry point: the 64-bit mask is passed split across two
 * arguments (SC_ARG64) and reassembled with SC_VAL64 before calling the
 * common implementation.
 */
SYSCALL32_DEFINE6(fanotify_mark,
				int, fanotify_fd, unsigned int, flags,
				SC_ARG64(mask), int, dfd,
				const char  __user *, pathname)
{
	return do_fanotify_mark(fanotify_fd, flags, SC_VAL64(__u64, mask),
				dfd, pathname);
}
#endif
1920 
1921 /*
1922  * fanotify_user_setup - Our initialization function.  Note that we cannot return
1923  * error because we have compiled-in VFS hooks.  So an (unlikely) failure here
1924  * must result in panic().
1925  */
static int __init fanotify_user_setup(void)
{
	struct sysinfo si;
	int max_marks;

	si_meminfo(&si);
	/*
	 * Allow up to 1% of addressable memory to be accounted for per user
	 * marks limited to the range [8192, 1048576]. mount and sb marks are
	 * a lot cheaper than inode marks, but there is no reason for a user
	 * to have many of those, so calculate by the cost of inode marks.
	 */
	max_marks = (((si.totalram - si.totalhigh) / 100) << PAGE_SHIFT) /
		    INODE_MARK_COST;
	max_marks = clamp(max_marks, FANOTIFY_OLD_DEFAULT_MAX_MARKS,
				     FANOTIFY_DEFAULT_MAX_USER_MARKS);

	/* Catch new UAPI flags at build time when the counts change */
	BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS);
	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 12);
	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 11);

	/* SLAB_PANIC: allocation failure here is fatal (see comment above) */
	fanotify_mark_cache = KMEM_CACHE(fanotify_mark,
					 SLAB_PANIC|SLAB_ACCOUNT);
	fanotify_fid_event_cachep = KMEM_CACHE(fanotify_fid_event,
					       SLAB_PANIC);
	fanotify_path_event_cachep = KMEM_CACHE(fanotify_path_event,
						SLAB_PANIC);
	if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) {
		fanotify_perm_event_cachep =
			KMEM_CACHE(fanotify_perm_event, SLAB_PANIC);
	}

	fanotify_max_queued_events = FANOTIFY_DEFAULT_MAX_EVENTS;
	init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS] =
					FANOTIFY_DEFAULT_MAX_GROUPS;
	init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS] = max_marks;
	fanotify_sysctls_init();

	return 0;
}
device_initcall(fanotify_user_setup);
1967