xref: /linux/fs/ceph/locks.c (revision 89aa593010135660991d05c92528c2c9163d5900)
1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/ceph/ceph_debug.h>
3 
4 #include <linux/file.h>
5 #include <linux/namei.h>
6 #include <linux/random.h>
7 
8 #include "super.h"
9 #include "mds_client.h"
10 #include <linux/ceph/pagelist.h>
11 
12 static u64 lock_secret;
13 static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
14                                          struct ceph_mds_request *req);
15 
16 static inline u64 secure_addr(void *addr)
17 {
18 	u64 v = lock_secret ^ (u64)(unsigned long)addr;
19 	/*
20 	 * Set the most significant bit, so that MDS knows the 'owner'
21 	 * is sufficient to identify the owner of lock. (old code uses
22 	 * both 'owner' and 'pid')
23 	 */
24 	v |= (1ULL << 63);
25 	return v;
26 }
27 
28 void __init ceph_flock_init(void)
29 {
30 	get_random_bytes(&lock_secret, sizeof(lock_secret));
31 }
32 
33 static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
34 {
35 	struct inode *inode = file_inode(src->fl_file);
36 	atomic_inc(&ceph_inode(inode)->i_filelock_ref);
37 }
38 
39 static void ceph_fl_release_lock(struct file_lock *fl)
40 {
41 	struct inode *inode = file_inode(fl->fl_file);
42 	atomic_dec(&ceph_inode(inode)->i_filelock_ref);
43 }
44 
45 static const struct file_lock_operations ceph_fl_lock_ops = {
46 	.fl_copy_lock = ceph_fl_copy_lock,
47 	.fl_release_private = ceph_fl_release_lock,
48 };
49 
50 /**
51  * Implement fcntl and flock locking functions.
52  */
53 static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
54 			     int cmd, u8 wait, struct file_lock *fl)
55 {
56 	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
57 	struct ceph_mds_request *req;
58 	int err;
59 	u64 length = 0;
60 	u64 owner;
61 
62 	if (operation == CEPH_MDS_OP_SETFILELOCK) {
63 		/*
64 		 * increasing i_filelock_ref closes race window between
65 		 * handling request reply and adding file_lock struct to
66 		 * inode. Otherwise, auth caps may get trimmed in the
67 		 * window. Caller function will decrease the counter.
68 		 */
69 		fl->fl_ops = &ceph_fl_lock_ops;
70 		atomic_inc(&ceph_inode(inode)->i_filelock_ref);
71 	}
72 
73 	if (operation != CEPH_MDS_OP_SETFILELOCK || cmd == CEPH_LOCK_UNLOCK)
74 		wait = 0;
75 
76 	req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS);
77 	if (IS_ERR(req))
78 		return PTR_ERR(req);
79 	req->r_inode = inode;
80 	ihold(inode);
81 	req->r_num_caps = 1;
82 
83 	/* mds requires start and length rather than start and end */
84 	if (LLONG_MAX == fl->fl_end)
85 		length = 0;
86 	else
87 		length = fl->fl_end - fl->fl_start + 1;
88 
89 	owner = secure_addr(fl->fl_owner);
90 
91 	dout("ceph_lock_message: rule: %d, op: %d, owner: %llx, pid: %llu, "
92 	     "start: %llu, length: %llu, wait: %d, type: %d", (int)lock_type,
93 	     (int)operation, owner, (u64)fl->fl_pid, fl->fl_start, length,
94 	     wait, fl->fl_type);
95 
96 	req->r_args.filelock_change.rule = lock_type;
97 	req->r_args.filelock_change.type = cmd;
98 	req->r_args.filelock_change.owner = cpu_to_le64(owner);
99 	req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid);
100 	req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start);
101 	req->r_args.filelock_change.length = cpu_to_le64(length);
102 	req->r_args.filelock_change.wait = wait;
103 
104 	if (wait)
105 		req->r_wait_for_completion = ceph_lock_wait_for_completion;
106 
107 	err = ceph_mdsc_do_request(mdsc, inode, req);
108 
109 	if (operation == CEPH_MDS_OP_GETFILELOCK) {
110 		fl->fl_pid = -le64_to_cpu(req->r_reply_info.filelock_reply->pid);
111 		if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type)
112 			fl->fl_type = F_RDLCK;
113 		else if (CEPH_LOCK_EXCL == req->r_reply_info.filelock_reply->type)
114 			fl->fl_type = F_WRLCK;
115 		else
116 			fl->fl_type = F_UNLCK;
117 
118 		fl->fl_start = le64_to_cpu(req->r_reply_info.filelock_reply->start);
119 		length = le64_to_cpu(req->r_reply_info.filelock_reply->start) +
120 						 le64_to_cpu(req->r_reply_info.filelock_reply->length);
121 		if (length >= 1)
122 			fl->fl_end = length -1;
123 		else
124 			fl->fl_end = 0;
125 
126 	}
127 	ceph_mdsc_put_request(req);
128 	dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
129 	     "length: %llu, wait: %d, type: %d, err code %d", (int)lock_type,
130 	     (int)operation, (u64)fl->fl_pid, fl->fl_start,
131 	     length, wait, fl->fl_type, err);
132 	return err;
133 }
134 
135 static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
136                                          struct ceph_mds_request *req)
137 {
138 	struct ceph_mds_request *intr_req;
139 	struct inode *inode = req->r_inode;
140 	int err, lock_type;
141 
142 	BUG_ON(req->r_op != CEPH_MDS_OP_SETFILELOCK);
143 	if (req->r_args.filelock_change.rule == CEPH_LOCK_FCNTL)
144 		lock_type = CEPH_LOCK_FCNTL_INTR;
145 	else if (req->r_args.filelock_change.rule == CEPH_LOCK_FLOCK)
146 		lock_type = CEPH_LOCK_FLOCK_INTR;
147 	else
148 		BUG_ON(1);
149 	BUG_ON(req->r_args.filelock_change.type == CEPH_LOCK_UNLOCK);
150 
151 	err = wait_for_completion_interruptible(&req->r_completion);
152 	if (!err)
153 		return 0;
154 
155 	dout("ceph_lock_wait_for_completion: request %llu was interrupted\n",
156 	     req->r_tid);
157 
158 	mutex_lock(&mdsc->mutex);
159 	if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
160 		err = 0;
161 	} else {
162 		/*
163 		 * ensure we aren't running concurrently with
164 		 * ceph_fill_trace or ceph_readdir_prepopulate, which
165 		 * rely on locks (dir mutex) held by our caller.
166 		 */
167 		mutex_lock(&req->r_fill_mutex);
168 		req->r_err = err;
169 		set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
170 		mutex_unlock(&req->r_fill_mutex);
171 
172 		if (!req->r_session) {
173 			// haven't sent the request
174 			err = 0;
175 		}
176 	}
177 	mutex_unlock(&mdsc->mutex);
178 	if (!err)
179 		return 0;
180 
181 	intr_req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETFILELOCK,
182 					    USE_AUTH_MDS);
183 	if (IS_ERR(intr_req))
184 		return PTR_ERR(intr_req);
185 
186 	intr_req->r_inode = inode;
187 	ihold(inode);
188 	intr_req->r_num_caps = 1;
189 
190 	intr_req->r_args.filelock_change = req->r_args.filelock_change;
191 	intr_req->r_args.filelock_change.rule = lock_type;
192 	intr_req->r_args.filelock_change.type = CEPH_LOCK_UNLOCK;
193 
194 	err = ceph_mdsc_do_request(mdsc, inode, intr_req);
195 	ceph_mdsc_put_request(intr_req);
196 
197 	if (err && err != -ERESTARTSYS)
198 		return err;
199 
200 	wait_for_completion_killable(&req->r_safe_completion);
201 	return 0;
202 }
203 
204 /**
205  * Attempt to set an fcntl lock.
206  * For now, this just goes away to the server. Later it may be more awesome.
207  */
208 int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
209 {
210 	struct inode *inode = file_inode(file);
211 	int err;
212 	u16 op = CEPH_MDS_OP_SETFILELOCK;
213 	u8 lock_cmd;
214 	u8 wait = 0;
215 
216 	if (!(fl->fl_flags & FL_POSIX))
217 		return -ENOLCK;
218 	/* No mandatory locks */
219 	if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK)
220 		return -ENOLCK;
221 
222 	dout("ceph_lock, fl_owner: %p", fl->fl_owner);
223 
224 	/* set wait bit as appropriate, then make command as Ceph expects it*/
225 	if (IS_GETLK(cmd))
226 		op = CEPH_MDS_OP_GETFILELOCK;
227 	else if (IS_SETLKW(cmd))
228 		wait = 1;
229 
230 	if (op == CEPH_MDS_OP_SETFILELOCK) {
231 		/*
232 		 * increasing i_filelock_ref closes race window between
233 		 * handling request reply and adding file_lock struct to
234 		 * inode. Otherwise, i_auth_cap may get trimmed in the
235 		 * window. Caller function will decrease the counter.
236 		 */
237 		fl->fl_ops = &ceph_fl_lock_ops;
238 		atomic_inc(&ceph_inode(inode)->i_filelock_ref);
239 	}
240 
241 	if (F_RDLCK == fl->fl_type)
242 		lock_cmd = CEPH_LOCK_SHARED;
243 	else if (F_WRLCK == fl->fl_type)
244 		lock_cmd = CEPH_LOCK_EXCL;
245 	else
246 		lock_cmd = CEPH_LOCK_UNLOCK;
247 
248 	err = ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, lock_cmd, wait, fl);
249 	if (!err) {
250 		if (op != CEPH_MDS_OP_GETFILELOCK) {
251 			dout("mds locked, locking locally");
252 			err = posix_lock_file(file, fl, NULL);
253 			if (err && (CEPH_MDS_OP_SETFILELOCK == op)) {
254 				/* undo! This should only happen if
255 				 * the kernel detects local
256 				 * deadlock. */
257 				ceph_lock_message(CEPH_LOCK_FCNTL, op, inode,
258 						  CEPH_LOCK_UNLOCK, 0, fl);
259 				dout("got %d on posix_lock_file, undid lock",
260 				     err);
261 			}
262 		}
263 	}
264 	return err;
265 }
266 
267 int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
268 {
269 	struct inode *inode = file_inode(file);
270 	int err;
271 	u8 lock_cmd;
272 	u8 wait = 0;
273 
274 	if (!(fl->fl_flags & FL_FLOCK))
275 		return -ENOLCK;
276 	/* No mandatory locks */
277 	if (fl->fl_type & LOCK_MAND)
278 		return -EOPNOTSUPP;
279 
280 	dout("ceph_flock, fl_file: %p", fl->fl_file);
281 
282 	/* see comment in ceph_lock */
283 	fl->fl_ops = &ceph_fl_lock_ops;
284 	atomic_inc(&ceph_inode(inode)->i_filelock_ref);
285 
286 	if (IS_SETLKW(cmd))
287 		wait = 1;
288 
289 	if (F_RDLCK == fl->fl_type)
290 		lock_cmd = CEPH_LOCK_SHARED;
291 	else if (F_WRLCK == fl->fl_type)
292 		lock_cmd = CEPH_LOCK_EXCL;
293 	else
294 		lock_cmd = CEPH_LOCK_UNLOCK;
295 
296 	err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
297 				inode, lock_cmd, wait, fl);
298 	if (!err) {
299 		err = locks_lock_file_wait(file, fl);
300 		if (err) {
301 			ceph_lock_message(CEPH_LOCK_FLOCK,
302 					  CEPH_MDS_OP_SETFILELOCK,
303 					  inode, CEPH_LOCK_UNLOCK, 0, fl);
304 			dout("got %d on locks_lock_file_wait, undid lock", err);
305 		}
306 	}
307 	return err;
308 }
309 
310 /*
311  * Fills in the passed counter variables, so you can prepare pagelist metadata
312  * before calling ceph_encode_locks.
313  */
314 void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
315 {
316 	struct file_lock *lock;
317 	struct file_lock_context *ctx;
318 
319 	*fcntl_count = 0;
320 	*flock_count = 0;
321 
322 	ctx = inode->i_flctx;
323 	if (ctx) {
324 		spin_lock(&ctx->flc_lock);
325 		list_for_each_entry(lock, &ctx->flc_posix, fl_list)
326 			++(*fcntl_count);
327 		list_for_each_entry(lock, &ctx->flc_flock, fl_list)
328 			++(*flock_count);
329 		spin_unlock(&ctx->flc_lock);
330 	}
331 	dout("counted %d flock locks and %d fcntl locks",
332 	     *flock_count, *fcntl_count);
333 }
334 
335 /**
336  * Encode the flock and fcntl locks for the given inode into the ceph_filelock
337  * array. Must be called with inode->i_lock already held.
338  * If we encounter more of a specific lock type than expected, return -ENOSPC.
339  */
340 int ceph_encode_locks_to_buffer(struct inode *inode,
341 				struct ceph_filelock *flocks,
342 				int num_fcntl_locks, int num_flock_locks)
343 {
344 	struct file_lock *lock;
345 	struct file_lock_context *ctx = inode->i_flctx;
346 	int err = 0;
347 	int seen_fcntl = 0;
348 	int seen_flock = 0;
349 	int l = 0;
350 
351 	dout("encoding %d flock and %d fcntl locks", num_flock_locks,
352 	     num_fcntl_locks);
353 
354 	if (!ctx)
355 		return 0;
356 
357 	spin_lock(&ctx->flc_lock);
358 	list_for_each_entry(lock, &ctx->flc_posix, fl_list) {
359 		++seen_fcntl;
360 		if (seen_fcntl > num_fcntl_locks) {
361 			err = -ENOSPC;
362 			goto fail;
363 		}
364 		err = lock_to_ceph_filelock(lock, &flocks[l]);
365 		if (err)
366 			goto fail;
367 		++l;
368 	}
369 	list_for_each_entry(lock, &ctx->flc_flock, fl_list) {
370 		++seen_flock;
371 		if (seen_flock > num_flock_locks) {
372 			err = -ENOSPC;
373 			goto fail;
374 		}
375 		err = lock_to_ceph_filelock(lock, &flocks[l]);
376 		if (err)
377 			goto fail;
378 		++l;
379 	}
380 fail:
381 	spin_unlock(&ctx->flc_lock);
382 	return err;
383 }
384 
385 /**
386  * Copy the encoded flock and fcntl locks into the pagelist.
387  * Format is: #fcntl locks, sequential fcntl locks, #flock locks,
388  * sequential flock locks.
389  * Returns zero on success.
390  */
391 int ceph_locks_to_pagelist(struct ceph_filelock *flocks,
392 			   struct ceph_pagelist *pagelist,
393 			   int num_fcntl_locks, int num_flock_locks)
394 {
395 	int err = 0;
396 	__le32 nlocks;
397 
398 	nlocks = cpu_to_le32(num_fcntl_locks);
399 	err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks));
400 	if (err)
401 		goto out_fail;
402 
403 	err = ceph_pagelist_append(pagelist, flocks,
404 				   num_fcntl_locks * sizeof(*flocks));
405 	if (err)
406 		goto out_fail;
407 
408 	nlocks = cpu_to_le32(num_flock_locks);
409 	err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks));
410 	if (err)
411 		goto out_fail;
412 
413 	err = ceph_pagelist_append(pagelist,
414 				   &flocks[num_fcntl_locks],
415 				   num_flock_locks * sizeof(*flocks));
416 out_fail:
417 	return err;
418 }
419 
420 /*
421  * Given a pointer to a lock, convert it to a ceph filelock
422  */
423 int lock_to_ceph_filelock(struct file_lock *lock,
424 			  struct ceph_filelock *cephlock)
425 {
426 	int err = 0;
427 	cephlock->start = cpu_to_le64(lock->fl_start);
428 	cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1);
429 	cephlock->client = cpu_to_le64(0);
430 	cephlock->pid = cpu_to_le64((u64)lock->fl_pid);
431 	cephlock->owner = cpu_to_le64(secure_addr(lock->fl_owner));
432 
433 	switch (lock->fl_type) {
434 	case F_RDLCK:
435 		cephlock->type = CEPH_LOCK_SHARED;
436 		break;
437 	case F_WRLCK:
438 		cephlock->type = CEPH_LOCK_EXCL;
439 		break;
440 	case F_UNLCK:
441 		cephlock->type = CEPH_LOCK_UNLOCK;
442 		break;
443 	default:
444 		dout("Have unknown lock type %d", lock->fl_type);
445 		err = -EINVAL;
446 	}
447 
448 	return err;
449 }
450