xref: /linux/fs/ceph/locks.c (revision 470ac62dfa5732c149adce2cbce84ac678de701f)
1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/ceph/ceph_debug.h>
3 
4 #include <linux/file.h>
5 #include <linux/namei.h>
6 #include <linux/random.h>
7 
8 #include "super.h"
9 #include "mds_client.h"
10 #include <linux/ceph/pagelist.h>
11 
12 static u64 lock_secret;
13 static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
14                                          struct ceph_mds_request *req);
15 
16 static inline u64 secure_addr(void *addr)
17 {
18 	u64 v = lock_secret ^ (u64)(unsigned long)addr;
19 	/*
20 	 * Set the most significant bit, so that MDS knows the 'owner'
21 	 * is sufficient to identify the owner of lock. (old code uses
22 	 * both 'owner' and 'pid')
23 	 */
24 	v |= (1ULL << 63);
25 	return v;
26 }
27 
28 void __init ceph_flock_init(void)
29 {
30 	get_random_bytes(&lock_secret, sizeof(lock_secret));
31 }
32 
33 static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
34 {
35 	struct inode *inode = file_inode(dst->fl_file);
36 	atomic_inc(&ceph_inode(inode)->i_filelock_ref);
37 	dst->fl_u.ceph.inode = igrab(inode);
38 }
39 
40 /*
41  * Do not use the 'fl->fl_file' in release function, which
42  * is possibly already released by another thread.
43  */
44 static void ceph_fl_release_lock(struct file_lock *fl)
45 {
46 	struct inode *inode = fl->fl_u.ceph.inode;
47 	struct ceph_inode_info *ci;
48 
49 	/*
50 	 * If inode is NULL it should be a request file_lock,
51 	 * nothing we can do.
52 	 */
53 	if (!inode)
54 		return;
55 
56 	ci = ceph_inode(inode);
57 	if (atomic_dec_and_test(&ci->i_filelock_ref)) {
58 		/* clear error when all locks are released */
59 		spin_lock(&ci->i_ceph_lock);
60 		ci->i_ceph_flags &= ~CEPH_I_ERROR_FILELOCK;
61 		spin_unlock(&ci->i_ceph_lock);
62 	}
63 	fl->fl_u.ceph.inode = NULL;
64 	iput(inode);
65 }
66 
67 static const struct file_lock_operations ceph_fl_lock_ops = {
68 	.fl_copy_lock = ceph_fl_copy_lock,
69 	.fl_release_private = ceph_fl_release_lock,
70 };
71 
72 /*
73  * Implement fcntl and flock locking functions.
74  */
75 static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
76 			     int cmd, u8 wait, struct file_lock *fl)
77 {
78 	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
79 	struct ceph_mds_request *req;
80 	int err;
81 	u64 length = 0;
82 	u64 owner;
83 
84 	if (operation == CEPH_MDS_OP_SETFILELOCK) {
85 		/*
86 		 * increasing i_filelock_ref closes race window between
87 		 * handling request reply and adding file_lock struct to
88 		 * inode. Otherwise, auth caps may get trimmed in the
89 		 * window. Caller function will decrease the counter.
90 		 */
91 		fl->fl_ops = &ceph_fl_lock_ops;
92 		fl->fl_ops->fl_copy_lock(fl, NULL);
93 	}
94 
95 	if (operation != CEPH_MDS_OP_SETFILELOCK || cmd == CEPH_LOCK_UNLOCK)
96 		wait = 0;
97 
98 	req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS);
99 	if (IS_ERR(req))
100 		return PTR_ERR(req);
101 	req->r_inode = inode;
102 	ihold(inode);
103 	req->r_num_caps = 1;
104 
105 	/* mds requires start and length rather than start and end */
106 	if (LLONG_MAX == fl->fl_end)
107 		length = 0;
108 	else
109 		length = fl->fl_end - fl->fl_start + 1;
110 
111 	owner = secure_addr(fl->fl_owner);
112 
113 	dout("ceph_lock_message: rule: %d, op: %d, owner: %llx, pid: %llu, "
114 	     "start: %llu, length: %llu, wait: %d, type: %d\n", (int)lock_type,
115 	     (int)operation, owner, (u64)fl->fl_pid, fl->fl_start, length,
116 	     wait, fl->fl_type);
117 
118 	req->r_args.filelock_change.rule = lock_type;
119 	req->r_args.filelock_change.type = cmd;
120 	req->r_args.filelock_change.owner = cpu_to_le64(owner);
121 	req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid);
122 	req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start);
123 	req->r_args.filelock_change.length = cpu_to_le64(length);
124 	req->r_args.filelock_change.wait = wait;
125 
126 	err = ceph_mdsc_submit_request(mdsc, inode, req);
127 	if (!err)
128 		err = ceph_mdsc_wait_request(mdsc, req, wait ?
129 					ceph_lock_wait_for_completion : NULL);
130 	if (!err && operation == CEPH_MDS_OP_GETFILELOCK) {
131 		fl->fl_pid = -le64_to_cpu(req->r_reply_info.filelock_reply->pid);
132 		if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type)
133 			fl->fl_type = F_RDLCK;
134 		else if (CEPH_LOCK_EXCL == req->r_reply_info.filelock_reply->type)
135 			fl->fl_type = F_WRLCK;
136 		else
137 			fl->fl_type = F_UNLCK;
138 
139 		fl->fl_start = le64_to_cpu(req->r_reply_info.filelock_reply->start);
140 		length = le64_to_cpu(req->r_reply_info.filelock_reply->start) +
141 						 le64_to_cpu(req->r_reply_info.filelock_reply->length);
142 		if (length >= 1)
143 			fl->fl_end = length -1;
144 		else
145 			fl->fl_end = 0;
146 
147 	}
148 	ceph_mdsc_put_request(req);
149 	dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
150 	     "length: %llu, wait: %d, type: %d, err code %d\n", (int)lock_type,
151 	     (int)operation, (u64)fl->fl_pid, fl->fl_start,
152 	     length, wait, fl->fl_type, err);
153 	return err;
154 }
155 
156 static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
157                                          struct ceph_mds_request *req)
158 {
159 	struct ceph_mds_request *intr_req;
160 	struct inode *inode = req->r_inode;
161 	int err, lock_type;
162 
163 	BUG_ON(req->r_op != CEPH_MDS_OP_SETFILELOCK);
164 	if (req->r_args.filelock_change.rule == CEPH_LOCK_FCNTL)
165 		lock_type = CEPH_LOCK_FCNTL_INTR;
166 	else if (req->r_args.filelock_change.rule == CEPH_LOCK_FLOCK)
167 		lock_type = CEPH_LOCK_FLOCK_INTR;
168 	else
169 		BUG_ON(1);
170 	BUG_ON(req->r_args.filelock_change.type == CEPH_LOCK_UNLOCK);
171 
172 	err = wait_for_completion_interruptible(&req->r_completion);
173 	if (!err)
174 		return 0;
175 
176 	dout("ceph_lock_wait_for_completion: request %llu was interrupted\n",
177 	     req->r_tid);
178 
179 	mutex_lock(&mdsc->mutex);
180 	if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
181 		err = 0;
182 	} else {
183 		/*
184 		 * ensure we aren't running concurrently with
185 		 * ceph_fill_trace or ceph_readdir_prepopulate, which
186 		 * rely on locks (dir mutex) held by our caller.
187 		 */
188 		mutex_lock(&req->r_fill_mutex);
189 		req->r_err = err;
190 		set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
191 		mutex_unlock(&req->r_fill_mutex);
192 
193 		if (!req->r_session) {
194 			// haven't sent the request
195 			err = 0;
196 		}
197 	}
198 	mutex_unlock(&mdsc->mutex);
199 	if (!err)
200 		return 0;
201 
202 	intr_req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETFILELOCK,
203 					    USE_AUTH_MDS);
204 	if (IS_ERR(intr_req))
205 		return PTR_ERR(intr_req);
206 
207 	intr_req->r_inode = inode;
208 	ihold(inode);
209 	intr_req->r_num_caps = 1;
210 
211 	intr_req->r_args.filelock_change = req->r_args.filelock_change;
212 	intr_req->r_args.filelock_change.rule = lock_type;
213 	intr_req->r_args.filelock_change.type = CEPH_LOCK_UNLOCK;
214 
215 	err = ceph_mdsc_do_request(mdsc, inode, intr_req);
216 	ceph_mdsc_put_request(intr_req);
217 
218 	if (err && err != -ERESTARTSYS)
219 		return err;
220 
221 	wait_for_completion_killable(&req->r_safe_completion);
222 	return 0;
223 }
224 
225 static int try_unlock_file(struct file *file, struct file_lock *fl)
226 {
227 	int err;
228 	unsigned int orig_flags = fl->fl_flags;
229 	fl->fl_flags |= FL_EXISTS;
230 	err = locks_lock_file_wait(file, fl);
231 	fl->fl_flags = orig_flags;
232 	if (err == -ENOENT) {
233 		if (!(orig_flags & FL_EXISTS))
234 			err = 0;
235 		return err;
236 	}
237 	return 1;
238 }
239 
240 /*
241  * Attempt to set an fcntl lock.
242  * For now, this just goes away to the server. Later it may be more awesome.
243  */
244 int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
245 {
246 	struct inode *inode = file_inode(file);
247 	struct ceph_inode_info *ci = ceph_inode(inode);
248 	int err = 0;
249 	u16 op = CEPH_MDS_OP_SETFILELOCK;
250 	u8 wait = 0;
251 	u8 lock_cmd;
252 
253 	if (!(fl->fl_flags & FL_POSIX))
254 		return -ENOLCK;
255 
256 	if (ceph_inode_is_shutdown(inode))
257 		return -ESTALE;
258 
259 	dout("ceph_lock, fl_owner: %p\n", fl->fl_owner);
260 
261 	/* set wait bit as appropriate, then make command as Ceph expects it*/
262 	if (IS_GETLK(cmd))
263 		op = CEPH_MDS_OP_GETFILELOCK;
264 	else if (IS_SETLKW(cmd))
265 		wait = 1;
266 
267 	spin_lock(&ci->i_ceph_lock);
268 	if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) {
269 		err = -EIO;
270 	}
271 	spin_unlock(&ci->i_ceph_lock);
272 	if (err < 0) {
273 		if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK == fl->fl_type)
274 			posix_lock_file(file, fl, NULL);
275 		return err;
276 	}
277 
278 	if (F_RDLCK == fl->fl_type)
279 		lock_cmd = CEPH_LOCK_SHARED;
280 	else if (F_WRLCK == fl->fl_type)
281 		lock_cmd = CEPH_LOCK_EXCL;
282 	else
283 		lock_cmd = CEPH_LOCK_UNLOCK;
284 
285 	if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK == fl->fl_type) {
286 		err = try_unlock_file(file, fl);
287 		if (err <= 0)
288 			return err;
289 	}
290 
291 	err = ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, lock_cmd, wait, fl);
292 	if (!err) {
293 		if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK != fl->fl_type) {
294 			dout("mds locked, locking locally\n");
295 			err = posix_lock_file(file, fl, NULL);
296 			if (err) {
297 				/* undo! This should only happen if
298 				 * the kernel detects local
299 				 * deadlock. */
300 				ceph_lock_message(CEPH_LOCK_FCNTL, op, inode,
301 						  CEPH_LOCK_UNLOCK, 0, fl);
302 				dout("got %d on posix_lock_file, undid lock\n",
303 				     err);
304 			}
305 		}
306 	}
307 	return err;
308 }
309 
310 int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
311 {
312 	struct inode *inode = file_inode(file);
313 	struct ceph_inode_info *ci = ceph_inode(inode);
314 	int err = 0;
315 	u8 wait = 0;
316 	u8 lock_cmd;
317 
318 	if (!(fl->fl_flags & FL_FLOCK))
319 		return -ENOLCK;
320 
321 	if (ceph_inode_is_shutdown(inode))
322 		return -ESTALE;
323 
324 	dout("ceph_flock, fl_file: %p\n", fl->fl_file);
325 
326 	spin_lock(&ci->i_ceph_lock);
327 	if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) {
328 		err = -EIO;
329 	}
330 	spin_unlock(&ci->i_ceph_lock);
331 	if (err < 0) {
332 		if (F_UNLCK == fl->fl_type)
333 			locks_lock_file_wait(file, fl);
334 		return err;
335 	}
336 
337 	if (IS_SETLKW(cmd))
338 		wait = 1;
339 
340 	if (F_RDLCK == fl->fl_type)
341 		lock_cmd = CEPH_LOCK_SHARED;
342 	else if (F_WRLCK == fl->fl_type)
343 		lock_cmd = CEPH_LOCK_EXCL;
344 	else
345 		lock_cmd = CEPH_LOCK_UNLOCK;
346 
347 	if (F_UNLCK == fl->fl_type) {
348 		err = try_unlock_file(file, fl);
349 		if (err <= 0)
350 			return err;
351 	}
352 
353 	err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
354 				inode, lock_cmd, wait, fl);
355 	if (!err && F_UNLCK != fl->fl_type) {
356 		err = locks_lock_file_wait(file, fl);
357 		if (err) {
358 			ceph_lock_message(CEPH_LOCK_FLOCK,
359 					  CEPH_MDS_OP_SETFILELOCK,
360 					  inode, CEPH_LOCK_UNLOCK, 0, fl);
361 			dout("got %d on locks_lock_file_wait, undid lock\n", err);
362 		}
363 	}
364 	return err;
365 }
366 
367 /*
368  * Fills in the passed counter variables, so you can prepare pagelist metadata
369  * before calling ceph_encode_locks.
370  */
371 void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
372 {
373 	struct file_lock *lock;
374 	struct file_lock_context *ctx;
375 
376 	*fcntl_count = 0;
377 	*flock_count = 0;
378 
379 	ctx = locks_inode_context(inode);
380 	if (ctx) {
381 		spin_lock(&ctx->flc_lock);
382 		list_for_each_entry(lock, &ctx->flc_posix, fl_list)
383 			++(*fcntl_count);
384 		list_for_each_entry(lock, &ctx->flc_flock, fl_list)
385 			++(*flock_count);
386 		spin_unlock(&ctx->flc_lock);
387 	}
388 	dout("counted %d flock locks and %d fcntl locks\n",
389 	     *flock_count, *fcntl_count);
390 }
391 
392 /*
393  * Given a pointer to a lock, convert it to a ceph filelock
394  */
395 static int lock_to_ceph_filelock(struct file_lock *lock,
396 				 struct ceph_filelock *cephlock)
397 {
398 	int err = 0;
399 	cephlock->start = cpu_to_le64(lock->fl_start);
400 	cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1);
401 	cephlock->client = cpu_to_le64(0);
402 	cephlock->pid = cpu_to_le64((u64)lock->fl_pid);
403 	cephlock->owner = cpu_to_le64(secure_addr(lock->fl_owner));
404 
405 	switch (lock->fl_type) {
406 	case F_RDLCK:
407 		cephlock->type = CEPH_LOCK_SHARED;
408 		break;
409 	case F_WRLCK:
410 		cephlock->type = CEPH_LOCK_EXCL;
411 		break;
412 	case F_UNLCK:
413 		cephlock->type = CEPH_LOCK_UNLOCK;
414 		break;
415 	default:
416 		dout("Have unknown lock type %d\n", lock->fl_type);
417 		err = -EINVAL;
418 	}
419 
420 	return err;
421 }
422 
423 /*
424  * Encode the flock and fcntl locks for the given inode into the ceph_filelock
425  * array. Must be called with inode->i_lock already held.
426  * If we encounter more of a specific lock type than expected, return -ENOSPC.
427  */
428 int ceph_encode_locks_to_buffer(struct inode *inode,
429 				struct ceph_filelock *flocks,
430 				int num_fcntl_locks, int num_flock_locks)
431 {
432 	struct file_lock *lock;
433 	struct file_lock_context *ctx = locks_inode_context(inode);
434 	int err = 0;
435 	int seen_fcntl = 0;
436 	int seen_flock = 0;
437 	int l = 0;
438 
439 	dout("encoding %d flock and %d fcntl locks\n", num_flock_locks,
440 	     num_fcntl_locks);
441 
442 	if (!ctx)
443 		return 0;
444 
445 	spin_lock(&ctx->flc_lock);
446 	list_for_each_entry(lock, &ctx->flc_posix, fl_list) {
447 		++seen_fcntl;
448 		if (seen_fcntl > num_fcntl_locks) {
449 			err = -ENOSPC;
450 			goto fail;
451 		}
452 		err = lock_to_ceph_filelock(lock, &flocks[l]);
453 		if (err)
454 			goto fail;
455 		++l;
456 	}
457 	list_for_each_entry(lock, &ctx->flc_flock, fl_list) {
458 		++seen_flock;
459 		if (seen_flock > num_flock_locks) {
460 			err = -ENOSPC;
461 			goto fail;
462 		}
463 		err = lock_to_ceph_filelock(lock, &flocks[l]);
464 		if (err)
465 			goto fail;
466 		++l;
467 	}
468 fail:
469 	spin_unlock(&ctx->flc_lock);
470 	return err;
471 }
472 
473 /*
474  * Copy the encoded flock and fcntl locks into the pagelist.
475  * Format is: #fcntl locks, sequential fcntl locks, #flock locks,
476  * sequential flock locks.
477  * Returns zero on success.
478  */
479 int ceph_locks_to_pagelist(struct ceph_filelock *flocks,
480 			   struct ceph_pagelist *pagelist,
481 			   int num_fcntl_locks, int num_flock_locks)
482 {
483 	int err = 0;
484 	__le32 nlocks;
485 
486 	nlocks = cpu_to_le32(num_fcntl_locks);
487 	err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks));
488 	if (err)
489 		goto out_fail;
490 
491 	if (num_fcntl_locks > 0) {
492 		err = ceph_pagelist_append(pagelist, flocks,
493 					   num_fcntl_locks * sizeof(*flocks));
494 		if (err)
495 			goto out_fail;
496 	}
497 
498 	nlocks = cpu_to_le32(num_flock_locks);
499 	err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks));
500 	if (err)
501 		goto out_fail;
502 
503 	if (num_flock_locks > 0) {
504 		err = ceph_pagelist_append(pagelist, &flocks[num_fcntl_locks],
505 					   num_flock_locks * sizeof(*flocks));
506 	}
507 out_fail:
508 	return err;
509 }
510