// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>

#include <linux/file.h>
#include <linux/namei.h>
#include <linux/random.h>

#include "super.h"
#include "mds_client.h"
#include <linux/filelock.h>
#include <linux/ceph/pagelist.h>

/* Per-boot random cookie mixed into lock-owner tokens (see secure_addr). */
static u64 lock_secret;
static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
					 struct ceph_mds_request *req);

/*
 * Turn an in-kernel lock-owner pointer into an opaque 64-bit token that
 * can be sent to the MDS without exposing a raw kernel address.
 */
static inline u64 secure_addr(void *addr)
{
	u64 v = lock_secret ^ (u64)(unsigned long)addr;
	/*
	 * Set the most significant bit, so that MDS knows the 'owner'
	 * is sufficient to identify the owner of lock. (old code uses
	 * both 'owner' and 'pid')
	 */
	v |= (1ULL << 63);
	return v;
}

/* Initialize the per-boot lock secret once at module init. */
void __init ceph_flock_init(void)
{
	get_random_bytes(&lock_secret, sizeof(lock_secret));
}

/*
 * fl_copy_lock op: pin the inode (igrab) and bump i_filelock_ref so
 * the inode's filelock state stays alive while the copied lock exists;
 * both references are dropped in ceph_fl_release_lock().
 */
static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
{
	struct inode *inode = file_inode(dst->fl_file);
	atomic_inc(&ceph_inode(inode)->i_filelock_ref);
	dst->fl_u.ceph.inode = igrab(inode);
}

/*
 * Do not use the 'fl->fl_file' in release function, which
 * is possibly already released by another thread.
 */
static void ceph_fl_release_lock(struct file_lock *fl)
{
	struct inode *inode = fl->fl_u.ceph.inode;
	struct ceph_inode_info *ci;

	/*
	 * If inode is NULL it should be a request file_lock,
	 * nothing we can do.
	 */
	if (!inode)
		return;

	ci = ceph_inode(inode);
	if (atomic_dec_and_test(&ci->i_filelock_ref)) {
		/* clear error when all locks are released */
		spin_lock(&ci->i_ceph_lock);
		ci->i_ceph_flags &= ~CEPH_I_ERROR_FILELOCK;
		spin_unlock(&ci->i_ceph_lock);
	}
	fl->fl_u.ceph.inode = NULL;
	iput(inode);
}

static const struct file_lock_operations ceph_fl_lock_ops = {
	.fl_copy_lock = ceph_fl_copy_lock,
	.fl_release_private = ceph_fl_release_lock,
};

/*
 * Implement fcntl and flock locking functions.
 */

/*
 * Send a file-lock request to the auth MDS for @inode.
 *
 * @lock_type: CEPH_LOCK_FCNTL or CEPH_LOCK_FLOCK (protocol "rule")
 * @operation: CEPH_MDS_OP_GETFILELOCK or CEPH_MDS_OP_SETFILELOCK
 * @cmd:       CEPH_LOCK_SHARED / CEPH_LOCK_EXCL / CEPH_LOCK_UNLOCK
 * @wait:      nonzero to block on the MDS for a contended SETFILELOCK
 *
 * For GETFILELOCK, the reply's conflicting-lock info is written back
 * into @fl.  Returns 0 or a negative errno.
 */
static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
			     int cmd, u8 wait, struct file_lock *fl)
{
	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
	struct ceph_client *cl = mdsc->fsc->client;
	struct ceph_mds_request *req;
	int err;
	u64 length = 0;
	u64 owner;

	if (operation == CEPH_MDS_OP_SETFILELOCK) {
		/*
		 * increasing i_filelock_ref closes race window between
		 * handling request reply and adding file_lock struct to
		 * inode. Otherwise, auth caps may get trimmed in the
		 * window. Caller function will decrease the counter.
		 */
		fl->fl_ops = &ceph_fl_lock_ops;
		fl->fl_ops->fl_copy_lock(fl, NULL);
	}

	/* only a blocking SETFILELOCK that isn't an unlock may wait */
	if (operation != CEPH_MDS_OP_SETFILELOCK || cmd == CEPH_LOCK_UNLOCK)
		wait = 0;

	req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS);
	if (IS_ERR(req))
		return PTR_ERR(req);
	req->r_inode = inode;
	ihold(inode);
	req->r_num_caps = 1;

	/* mds requires start and length rather than start and end */
	if (LLONG_MAX == fl->fl_end)
		length = 0;
	else
		length = fl->fl_end - fl->fl_start + 1;

	owner = secure_addr(fl->fl_owner);

	doutc(cl, "rule: %d, op: %d, owner: %llx, pid: %llu, "
	      "start: %llu, length: %llu, wait: %d, type: %d\n",
	      (int)lock_type, (int)operation, owner, (u64)fl->fl_pid,
	      fl->fl_start, length, wait, fl->fl_type);

	req->r_args.filelock_change.rule = lock_type;
	req->r_args.filelock_change.type = cmd;
	req->r_args.filelock_change.owner = cpu_to_le64(owner);
	req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid);
	req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start);
	req->r_args.filelock_change.length = cpu_to_le64(length);
	req->r_args.filelock_change.wait = wait;

	err = ceph_mdsc_submit_request(mdsc, inode, req);
	if (!err)
		err = ceph_mdsc_wait_request(mdsc, req, wait ?
					ceph_lock_wait_for_completion : NULL);
	if (!err && operation == CEPH_MDS_OP_GETFILELOCK) {
		/*
		 * NOTE(review): the MDS-reported pid is negated here,
		 * presumably to flag that the owner lives on another
		 * node — confirm against MDS protocol docs.
		 */
		fl->fl_pid = -le64_to_cpu(req->r_reply_info.filelock_reply->pid);
		if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type)
			fl->fl_type = F_RDLCK;
		else if (CEPH_LOCK_EXCL == req->r_reply_info.filelock_reply->type)
			fl->fl_type = F_WRLCK;
		else
			fl->fl_type = F_UNLCK;

		/* convert (start, length) from the reply back to (start, end) */
		fl->fl_start = le64_to_cpu(req->r_reply_info.filelock_reply->start);
		length = le64_to_cpu(req->r_reply_info.filelock_reply->start) +
			 le64_to_cpu(req->r_reply_info.filelock_reply->length);
		if (length >= 1)
			fl->fl_end = length -1;
		else
			fl->fl_end = 0;

	}
	ceph_mdsc_put_request(req);
	doutc(cl, "rule: %d, op: %d, pid: %llu, start: %llu, "
	      "length: %llu, wait: %d, type: %d, err code %d\n",
	      (int)lock_type, (int)operation, (u64)fl->fl_pid,
	      fl->fl_start, length, wait, fl->fl_type, err);
	return err;
}

/*
 * Wait callback for a blocking SETFILELOCK request.  If the wait is
 * interrupted by a signal, abort the request and tell the MDS to
 * cancel the pending lock attempt via an FCNTL_INTR/FLOCK_INTR unlock
 * request, then wait for the original request to become "safe".
 */
static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
					 struct ceph_mds_request *req)
{
	struct ceph_client *cl = mdsc->fsc->client;
	struct ceph_mds_request *intr_req;
	struct inode *inode = req->r_inode;
	int err, lock_type;

	BUG_ON(req->r_op != CEPH_MDS_OP_SETFILELOCK);
	if (req->r_args.filelock_change.rule == CEPH_LOCK_FCNTL)
		lock_type = CEPH_LOCK_FCNTL_INTR;
	else if (req->r_args.filelock_change.rule == CEPH_LOCK_FLOCK)
		lock_type = CEPH_LOCK_FLOCK_INTR;
	else
		BUG_ON(1);
	BUG_ON(req->r_args.filelock_change.type == CEPH_LOCK_UNLOCK);

	err = wait_for_completion_interruptible(&req->r_completion);
	if (!err)
		return 0;

	doutc(cl, "request %llu was interrupted\n", req->r_tid);

	mutex_lock(&mdsc->mutex);
	if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
		/* reply arrived while we were being interrupted */
		err = 0;
	} else {
		/*
		 * ensure we aren't running concurrently with
		 * ceph_fill_trace or ceph_readdir_prepopulate, which
		 * rely on locks (dir mutex) held by our caller.
		 */
		mutex_lock(&req->r_fill_mutex);
		req->r_err = err;
		set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
		mutex_unlock(&req->r_fill_mutex);

		if (!req->r_session) {
			// haven't sent the request
			err = 0;
		}
	}
	mutex_unlock(&mdsc->mutex);
	if (!err)
		return 0;

	/* the MDS saw the request: send an *_INTR unlock to cancel it */
	intr_req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETFILELOCK,
					    USE_AUTH_MDS);
	if (IS_ERR(intr_req))
		return PTR_ERR(intr_req);

	intr_req->r_inode = inode;
	ihold(inode);
	intr_req->r_num_caps = 1;

	/* same lock parameters as the original request, but as an unlock */
	intr_req->r_args.filelock_change = req->r_args.filelock_change;
	intr_req->r_args.filelock_change.rule = lock_type;
	intr_req->r_args.filelock_change.type = CEPH_LOCK_UNLOCK;

	err = ceph_mdsc_do_request(mdsc, inode, intr_req);
	ceph_mdsc_put_request(intr_req);

	if (err && err != -ERESTARTSYS)
		return err;

	wait_for_completion_killable(&req->r_safe_completion);
	return 0;
}

/*
 * Try to remove the lock locally first.  A return of 1 means the
 * caller should go on and send the unlock to the MDS as well; <= 0 is
 * a final result (-ENOENT only when the caller itself passed
 * FL_EXISTS and no matching lock was found).
 */
static int try_unlock_file(struct file *file, struct file_lock *fl)
{
	int err;
	unsigned int orig_flags = fl->fl_flags;
	fl->fl_flags |= FL_EXISTS;
	err = locks_lock_file_wait(file, fl);
	fl->fl_flags = orig_flags;
	if (err == -ENOENT) {
		if (!(orig_flags & FL_EXISTS))
			err = 0;
		return err;
	}
	return 1;
}

/*
 * Attempt to set an fcntl lock.
 * For now, this just goes away to the server. Later it may be more awesome.
 */
int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
{
	struct inode *inode = file_inode(file);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_client *cl = ceph_inode_to_client(inode);
	int err = 0;
	u16 op = CEPH_MDS_OP_SETFILELOCK;
	u8 wait = 0;
	u8 lock_cmd;

	if (!(fl->fl_flags & FL_POSIX))
		return -ENOLCK;

	if (ceph_inode_is_shutdown(inode))
		return -ESTALE;

	doutc(cl, "fl_owner: %p\n", fl->fl_owner);

	/* set wait bit as appropriate, then make command as Ceph expects it*/
	if (IS_GETLK(cmd))
		op = CEPH_MDS_OP_GETFILELOCK;
	else if (IS_SETLKW(cmd))
		wait = 1;

	spin_lock(&ci->i_ceph_lock);
	if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) {
		err = -EIO;
	}
	spin_unlock(&ci->i_ceph_lock);
	if (err < 0) {
		/* inode is in lock-error state: still drop any local record */
		if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK == fl->fl_type)
			posix_lock_file(file, fl, NULL);
		return err;
	}

	if (F_RDLCK == fl->fl_type)
		lock_cmd = CEPH_LOCK_SHARED;
	else if (F_WRLCK == fl->fl_type)
		lock_cmd = CEPH_LOCK_EXCL;
	else
		lock_cmd = CEPH_LOCK_UNLOCK;

	/*
	 * For unlock, drop the local lock first; skip the MDS round
	 * trip entirely when nothing was held (try_unlock_file <= 0).
	 */
	if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK == fl->fl_type) {
		err = try_unlock_file(file, fl);
		if (err <= 0)
			return err;
	}

	err = ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, lock_cmd, wait, fl);
	if (!err) {
		if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK != fl->fl_type) {
			doutc(cl, "locking locally\n");
			err = posix_lock_file(file, fl, NULL);
			if (err) {
				/* undo! This should only happen if
				 * the kernel detects local
				 * deadlock. */
				ceph_lock_message(CEPH_LOCK_FCNTL, op, inode,
						  CEPH_LOCK_UNLOCK, 0, fl);
				doutc(cl, "got %d on posix_lock_file, undid lock\n",
				      err);
			}
		}
	}
	return err;
}

/*
 * Set or clear a BSD flock: establish the lock on the MDS first, then
 * locally via the VFS; the MDS lock is undone if the local step fails.
 */
int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
{
	struct inode *inode = file_inode(file);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_client *cl = ceph_inode_to_client(inode);
	int err = 0;
	u8 wait = 0;
	u8 lock_cmd;

	if (!(fl->fl_flags & FL_FLOCK))
		return -ENOLCK;

	if (ceph_inode_is_shutdown(inode))
		return -ESTALE;

	doutc(cl, "fl_file: %p\n", fl->fl_file);

	spin_lock(&ci->i_ceph_lock);
	if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) {
		err = -EIO;
	}
	spin_unlock(&ci->i_ceph_lock);
	if (err < 0) {
		/* inode is in lock-error state: still drop any local record */
		if (F_UNLCK == fl->fl_type)
			locks_lock_file_wait(file, fl);
		return err;
	}

	if (IS_SETLKW(cmd))
		wait = 1;

	if (F_RDLCK == fl->fl_type)
		lock_cmd = CEPH_LOCK_SHARED;
	else if (F_WRLCK == fl->fl_type)
		lock_cmd = CEPH_LOCK_EXCL;
	else
		lock_cmd = CEPH_LOCK_UNLOCK;

	/* for unlock, drop locally first; skip the MDS if nothing held */
	if (F_UNLCK == fl->fl_type) {
		err = try_unlock_file(file, fl);
		if (err <= 0)
			return err;
	}

	err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
				inode, lock_cmd, wait, fl);
	if (!err && F_UNLCK != fl->fl_type) {
		err = locks_lock_file_wait(file, fl);
		if (err) {
			/* local lock failed; undo the server-side lock */
			ceph_lock_message(CEPH_LOCK_FLOCK,
					  CEPH_MDS_OP_SETFILELOCK,
					  inode, CEPH_LOCK_UNLOCK, 0, fl);
			doutc(cl, "got %d on locks_lock_file_wait, undid lock\n",
			      err);
		}
	}
	return err;
}

/*
 * Fills in the passed counter variables, so you can prepare pagelist metadata
 * before calling ceph_encode_locks.
 */
void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
{
	struct ceph_client *cl = ceph_inode_to_client(inode);
	struct file_lock *lock;
	struct file_lock_context *ctx;

	*fcntl_count = 0;
	*flock_count = 0;

	ctx = locks_inode_context(inode);
	if (ctx) {
		/*
		 * NOTE(review): counts are a snapshot — flc_lock is
		 * dropped before return, so the lists may change after.
		 */
		spin_lock(&ctx->flc_lock);
		list_for_each_entry(lock, &ctx->flc_posix, fl_list)
			++(*fcntl_count);
		list_for_each_entry(lock, &ctx->flc_flock, fl_list)
			++(*flock_count);
		spin_unlock(&ctx->flc_lock);
	}
	doutc(cl, "counted %d flock locks and %d fcntl locks\n",
	      *flock_count, *fcntl_count);
}

/*
 * Given a pointer to a lock, convert it to a ceph filelock.
 * Returns 0 on success, -EINVAL for an unrecognized fl_type.
 */
static int lock_to_ceph_filelock(struct inode *inode,
				 struct file_lock *lock,
				 struct ceph_filelock *cephlock)
{
	struct ceph_client *cl = ceph_inode_to_client(inode);
	int err = 0;

	/* wire format uses (start, length); fl uses (start, end) inclusive */
	cephlock->start = cpu_to_le64(lock->fl_start);
	cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1);
	cephlock->client = cpu_to_le64(0);
	cephlock->pid = cpu_to_le64((u64)lock->fl_pid);
	cephlock->owner = cpu_to_le64(secure_addr(lock->fl_owner));

	switch (lock->fl_type) {
	case F_RDLCK:
		cephlock->type = CEPH_LOCK_SHARED;
		break;
	case F_WRLCK:
		cephlock->type = CEPH_LOCK_EXCL;
		break;
	case F_UNLCK:
		cephlock->type = CEPH_LOCK_UNLOCK;
		break;
	default:
		doutc(cl, "Have unknown lock type %d\n", lock->fl_type);
		err = -EINVAL;
	}

	return err;
}

/*
 * Encode the flock and fcntl locks for the given inode into the ceph_filelock
 * array. Must be called with inode->i_lock already held.
 * If we encounter more of a specific lock type than expected, return -ENOSPC.
 */
int ceph_encode_locks_to_buffer(struct inode *inode,
				struct ceph_filelock *flocks,
				int num_fcntl_locks, int num_flock_locks)
{
	struct file_lock *lock;
	struct file_lock_context *ctx = locks_inode_context(inode);
	struct ceph_client *cl = ceph_inode_to_client(inode);
	int err = 0;
	int seen_fcntl = 0;
	int seen_flock = 0;
	int l = 0;	/* next free slot in flocks[] */

	doutc(cl, "encoding %d flock and %d fcntl locks\n", num_flock_locks,
	      num_fcntl_locks);

	/* no lock context means no locks to encode */
	if (!ctx)
		return 0;

	spin_lock(&ctx->flc_lock);
	/* fcntl (POSIX) locks first, then flock locks, contiguously */
	list_for_each_entry(lock, &ctx->flc_posix, fl_list) {
		++seen_fcntl;
		if (seen_fcntl > num_fcntl_locks) {
			/* more locks than the caller sized the buffer for */
			err = -ENOSPC;
			goto fail;
		}
		err = lock_to_ceph_filelock(inode, lock, &flocks[l]);
		if (err)
			goto fail;
		++l;
	}
	list_for_each_entry(lock, &ctx->flc_flock, fl_list) {
		++seen_flock;
		if (seen_flock > num_flock_locks) {
			err = -ENOSPC;
			goto fail;
		}
		err = lock_to_ceph_filelock(inode, lock, &flocks[l]);
		if (err)
			goto fail;
		++l;
	}
fail:
	spin_unlock(&ctx->flc_lock);
	return err;
}

/*
 * Copy the encoded flock and fcntl locks into the pagelist.
 * Format is: #fcntl locks, sequential fcntl locks, #flock locks,
 * sequential flock locks.
 * Returns zero on success.
488 */ 489 int ceph_locks_to_pagelist(struct ceph_filelock *flocks, 490 struct ceph_pagelist *pagelist, 491 int num_fcntl_locks, int num_flock_locks) 492 { 493 int err = 0; 494 __le32 nlocks; 495 496 nlocks = cpu_to_le32(num_fcntl_locks); 497 err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks)); 498 if (err) 499 goto out_fail; 500 501 if (num_fcntl_locks > 0) { 502 err = ceph_pagelist_append(pagelist, flocks, 503 num_fcntl_locks * sizeof(*flocks)); 504 if (err) 505 goto out_fail; 506 } 507 508 nlocks = cpu_to_le32(num_flock_locks); 509 err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks)); 510 if (err) 511 goto out_fail; 512 513 if (num_flock_locks > 0) { 514 err = ceph_pagelist_append(pagelist, &flocks[num_fcntl_locks], 515 num_flock_locks * sizeof(*flocks)); 516 } 517 out_fail: 518 return err; 519 } 520