// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>

#include <linux/file.h>
#include <linux/namei.h>
#include <linux/random.h>

#include "super.h"
#include "mds_client.h"
#include <linux/filelock.h>
#include <linux/ceph/pagelist.h>

/*
 * Random value mixed into every lock owner by secure_addr().
 * Filled in once by ceph_flock_init().
 */
static u64 lock_secret;
static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
					 struct ceph_mds_request *req);

/*
 * Convert a lock owner pointer into the 64-bit owner value used in MDS
 * requests and encoded locks: the pointer is XORed with lock_secret and
 * the top bit is forced on.
 */
static inline u64 secure_addr(void *addr)
{
	u64 v = lock_secret ^ (u64)(unsigned long)addr;
	/*
	 * Set the most significant bit, so that MDS knows the 'owner'
	 * is sufficient to identify the owner of lock. (old code uses
	 * both 'owner' and 'pid')
	 */
	v |= (1ULL << 63);
	return v;
}

/* Seed lock_secret with random bytes. */
void __init ceph_flock_init(void)
{
	get_random_bytes(&lock_secret, sizeof(lock_secret));
}

/*
 * fl_copy_lock hook (also invoked directly by ceph_lock_message() with
 * src == NULL).
 *
 * Bumps i_filelock_ref on the inode behind dst's file and stores an
 * igrab()'d inode pointer in dst->fl_u.ceph.inode so that
 * ceph_fl_release_lock() can find the inode without touching the file.
 * 'src' is not used.
 */
static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
{
	struct inode *inode = file_inode(dst->c.flc_file);
	atomic_inc(&ceph_inode(inode)->i_filelock_ref);
	dst->fl_u.ceph.inode = igrab(inode);
}

/*
 * fl_release_private hook: drops the i_filelock_ref count and the inode
 * reference recorded by ceph_fl_copy_lock().
 *
 * Do not use 'fl->c.flc_file' in the release function; the file is
 * possibly already released by another thread. The inode pointer saved
 * in fl->fl_u.ceph.inode is used instead.
 */
static void ceph_fl_release_lock(struct file_lock *fl)
{
	struct inode *inode = fl->fl_u.ceph.inode;
	struct ceph_inode_info *ci;

	/*
	 * If inode is NULL it should be a request file_lock,
	 * nothing we can do.
	 */
	if (!inode)
		return;

	ci = ceph_inode(inode);
	if (atomic_dec_and_test(&ci->i_filelock_ref)) {
		/* clear error when all locks are released */
		spin_lock(&ci->i_ceph_lock);
		ci->i_ceph_flags &= ~CEPH_I_ERROR_FILELOCK;
		spin_unlock(&ci->i_ceph_lock);
	}
	/* Clear the pointer before dropping the reference it stood for. */
	fl->fl_u.ceph.inode = NULL;
	iput(inode);
}

/* Lock operations installed on file_locks by ceph_lock_message(). */
static const struct file_lock_operations ceph_fl_lock_ops = {
	.fl_copy_lock = ceph_fl_copy_lock,
	.fl_release_private = ceph_fl_release_lock,
};

/*
 * Implement fcntl and flock locking functions.
 *
 * ceph_lock_message() builds one file-lock request, sends it to the MDS
 * and waits for the reply.
 *
 * @lock_type: lock rule placed in the request (callers in this file pass
 *             CEPH_LOCK_FCNTL or CEPH_LOCK_FLOCK)
 * @operation: CEPH_MDS_OP_SETFILELOCK or CEPH_MDS_OP_GETFILELOCK
 * @inode:     inode the lock applies to
 * @cmd:       CEPH_LOCK_SHARED, CEPH_LOCK_EXCL or CEPH_LOCK_UNLOCK
 * @wait:      non-zero to block until the lock is granted; forced to 0
 *             unless this is a SETFILELOCK that is not an unlock
 * @fl:        lock description; for GETFILELOCK it is overwritten with
 *             the pid, type and range found in the reply
 *
 * Returns 0 on success, or a negative error from request creation,
 * submission or waiting.
 */
static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
			     int cmd, u8 wait, struct file_lock *fl)
{
	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
	struct ceph_client *cl = mdsc->fsc->client;
	struct ceph_mds_request *req;
	int err;
	u64 length = 0;
	u64 owner;

	if (operation == CEPH_MDS_OP_SETFILELOCK) {
		/*
		 * increasing i_filelock_ref closes race window between
		 * handling request reply and adding file_lock struct to
		 * inode. Otherwise, auth caps may get trimmed in the
		 * window. Caller function will decrease the counter.
		 */
		fl->fl_ops = &ceph_fl_lock_ops;
		fl->fl_ops->fl_copy_lock(fl, NULL);
	}

	/* Only a SETFILELOCK that actually takes a lock may block. */
	if (operation != CEPH_MDS_OP_SETFILELOCK || cmd == CEPH_LOCK_UNLOCK)
		wait = 0;

	req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS);
	/*
	 * NOTE(review): on this early return the references taken by
	 * fl_copy_lock() above are not dropped here; presumably they are
	 * released later through fl_release_private - confirm.
	 */
	if (IS_ERR(req))
		return PTR_ERR(req);
	req->r_inode = inode;
	/* extra inode reference accompanying req->r_inode */
	ihold(inode);
	req->r_num_caps = 1;

	/*
	 * mds requires start and length rather than start and end;
	 * an end of LLONG_MAX is sent as length 0.
	 */
	if (LLONG_MAX == fl->fl_end)
		length = 0;
	else
		length = fl->fl_end - fl->fl_start + 1;

	owner = secure_addr(fl->c.flc_owner);

	doutc(cl, "rule: %d, op: %d, owner: %llx, pid: %llu, "
		    "start: %llu, length: %llu, wait: %d, type: %d\n",
		    (int)lock_type, (int)operation, owner,
		    (u64) fl->c.flc_pid,
		    fl->fl_start, length, wait, fl->c.flc_type);

	/* Multi-byte request fields are stored little-endian. */
	req->r_args.filelock_change.rule = lock_type;
	req->r_args.filelock_change.type = cmd;
	req->r_args.filelock_change.owner = cpu_to_le64(owner);
	req->r_args.filelock_change.pid = cpu_to_le64((u64) fl->c.flc_pid);
	req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start);
	req->r_args.filelock_change.length = cpu_to_le64(length);
	req->r_args.filelock_change.wait = wait;

	err = ceph_mdsc_submit_request(mdsc, inode, req);
	/*
	 * Blocking requests use the interruptible wait helper below;
	 * everything else passes NULL as the wait function.
	 */
	if (!err)
		err = ceph_mdsc_wait_request(mdsc, req, wait ?
					ceph_lock_wait_for_completion : NULL);
	if (!err && operation == CEPH_MDS_OP_GETFILELOCK) {
		/*
		 * The pid from the reply is stored negated.
		 * NOTE(review): presumably this marks the holder as not being
		 * a local pid - confirm.
		 */
		fl->c.flc_pid = -le64_to_cpu(req->r_reply_info.filelock_reply->pid);
		if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type)
			fl->c.flc_type = F_RDLCK;
		else if (CEPH_LOCK_EXCL == req->r_reply_info.filelock_reply->type)
			fl->c.flc_type = F_WRLCK;
		else
			fl->c.flc_type = F_UNLCK;

		fl->fl_start = le64_to_cpu(req->r_reply_info.filelock_reply->start);
		/*
		 * From here on 'length' holds start + length (one past the
		 * last locked byte), not a length; the debug message at the
		 * end of the function prints that value.
		 */
		length = le64_to_cpu(req->r_reply_info.filelock_reply->start) +
						 le64_to_cpu(req->r_reply_info.filelock_reply->length);
		if (length >= 1)
			fl->fl_end = length -1;
		else
			fl->fl_end = 0;

	}
	ceph_mdsc_put_request(req);
	doutc(cl, "rule: %d, op: %d, pid: %llu, start: %llu, "
	      "length: %llu, wait: %d, type: %d, err code %d\n",
	      (int)lock_type, (int)operation, (u64) fl->c.flc_pid,
	      fl->fl_start, length, wait, fl->c.flc_type, err);
	return err;
}

/*
 * Wait function handed to ceph_mdsc_wait_request() for blocking
 * SETFILELOCK requests.
 *
 * Waits interruptibly for the request to complete. If the wait is
 * interrupted and the request has already been sent but has no result
 * yet, the request is marked aborted and a second SETFILELOCK request
 * (rule CEPH_LOCK_*_INTR, type CEPH_LOCK_UNLOCK) is sent, after which
 * the function waits (killable) on the original request's
 * r_safe_completion.
 *
 * Returns 0 when the request completed, when a result had already
 * arrived at interruption time, or when the request had not been sent
 * yet; otherwise a negative error.
 */
static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
					 struct ceph_mds_request *req)
{
	struct ceph_client *cl = mdsc->fsc->client;
	struct ceph_mds_request *intr_req;
	struct inode *inode = req->r_inode;
	int err, lock_type;

	/* Only blocking lock-acquire requests are expected here. */
	BUG_ON(req->r_op != CEPH_MDS_OP_SETFILELOCK);
	/* Pick the interrupt rule matching the original request's rule. */
	if (req->r_args.filelock_change.rule == CEPH_LOCK_FCNTL)
		lock_type = CEPH_LOCK_FCNTL_INTR;
	else if (req->r_args.filelock_change.rule == CEPH_LOCK_FLOCK)
		lock_type = CEPH_LOCK_FLOCK_INTR;
	else
		BUG_ON(1);
	BUG_ON(req->r_args.filelock_change.type == CEPH_LOCK_UNLOCK);

	err = wait_for_completion_interruptible(&req->r_completion);
	if (!err)
		return 0;

	doutc(cl, "request %llu was interrupted\n", req->r_tid);

	mutex_lock(&mdsc->mutex);
	if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
		/* A result arrived despite the interruption; report success. */
		err = 0;
	} else {
		/*
		 * ensure we aren't running concurrently with
		 * ceph_fill_trace or ceph_readdir_prepopulate, which
		 * rely on locks (dir mutex) held by our caller.
		 */
		mutex_lock(&req->r_fill_mutex);
		req->r_err = err;
		set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
		mutex_unlock(&req->r_fill_mutex);

		if (!req->r_session) {
			// haven't sent the request
			err = 0;
		}
	}
	mutex_unlock(&mdsc->mutex);
	if (!err)
		return 0;

	/* The request is in flight: send an unlock with the interrupt rule. */
	intr_req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETFILELOCK,
					    USE_AUTH_MDS);
	if (IS_ERR(intr_req))
		return PTR_ERR(intr_req);

	intr_req->r_inode = inode;
	ihold(inode);
	intr_req->r_num_caps = 1;

	/* Same owner/pid/range as the original; only rule and type differ. */
	intr_req->r_args.filelock_change = req->r_args.filelock_change;
	intr_req->r_args.filelock_change.rule = lock_type;
	intr_req->r_args.filelock_change.type = CEPH_LOCK_UNLOCK;

	err = ceph_mdsc_do_request(mdsc, inode, intr_req);
	ceph_mdsc_put_request(intr_req);

	/* -ERESTARTSYS from the interrupt request is not treated as fatal. */
	if (err && err != -ERESTARTSYS)
		return err;

	err = wait_for_completion_killable(&req->r_safe_completion);
	if (err)
		return err;

	return 0;
}

/*
 * Perform the unlock described by 'fl' on the local lock state, with
 * FL_EXISTS temporarily added to the lock flags. The caller's flags are
 * restored before returning.
 *
 * Returns:
 *   1        the local call returned something other than -ENOENT; the
 *            callers in this file then go on to message the MDS
 *   0        the local call returned -ENOENT and the caller had not set
 *            FL_EXISTS itself
 *   -ENOENT  the local call returned -ENOENT and the caller had set
 *            FL_EXISTS
 *
 * NOTE(review): any local result other than -ENOENT, including other
 * errors, is reported as 1 - confirm this is intended.
 */
static int try_unlock_file(struct file *file, struct file_lock *fl)
{
	int err;
	unsigned int orig_flags = fl->c.flc_flags;
	fl->c.flc_flags |= FL_EXISTS;
	err = locks_lock_file_wait(file, fl);
	fl->c.flc_flags = orig_flags;
	if (err == -ENOENT) {
		if (!(orig_flags & FL_EXISTS))
			err = 0;
		return err;
	}
	return 1;
}

/*
 * Attempt to set an fcntl lock.
 * For now, this just goes away to the server. Later it may be more awesome.
 *
 * @file: file the lock is requested on
 * @cmd:  fcntl command, classified with IS_GETLK() / IS_SETLKW()
 * @fl:   lock description; must have FL_POSIX set
 *
 * Returns 0 on success, -ENOLCK if @fl is not a POSIX lock, -ESTALE if
 * the inode is shut down, -EIO if the inode has CEPH_I_ERROR_FILELOCK
 * set, or an error from the MDS request or the local lock call.
 */
int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
{
	struct inode *inode = file_inode(file);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_client *cl = ceph_inode_to_client(inode);
	int err = 0;
	u16 op = CEPH_MDS_OP_SETFILELOCK;
	u8 wait = 0;
	u8 lock_cmd;

	if (!(fl->c.flc_flags & FL_POSIX))
		return -ENOLCK;

	if (ceph_inode_is_shutdown(inode))
		return -ESTALE;

	doutc(cl, "fl_owner: %p\n", fl->c.flc_owner);

	/* set wait bit as appropriate, then make command as Ceph expects it */
	if (IS_GETLK(cmd))
		op = CEPH_MDS_OP_GETFILELOCK;
	else if (IS_SETLKW(cmd))
		wait = 1;

	spin_lock(&ci->i_ceph_lock);
	if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) {
		err = -EIO;
	}
	spin_unlock(&ci->i_ceph_lock);
	if (err < 0) {
		/*
		 * File locks are in an error state: an unlock is still
		 * applied locally, but the MDS is not contacted and -EIO is
		 * returned.
		 */
		if (op == CEPH_MDS_OP_SETFILELOCK && lock_is_unlock(fl))
			posix_lock_file(file, fl, NULL);
		return err;
	}

	/* Translate the lock type into the Ceph lock command. */
	if (lock_is_read(fl))
		lock_cmd = CEPH_LOCK_SHARED;
	else if (lock_is_write(fl))
		lock_cmd = CEPH_LOCK_EXCL;
	else
		lock_cmd = CEPH_LOCK_UNLOCK;

	/*
	 * Unlocks are applied locally first; the MDS is only messaged when
	 * try_unlock_file() returns a positive value.
	 */
	if (op == CEPH_MDS_OP_SETFILELOCK && lock_is_unlock(fl)) {
		err = try_unlock_file(file, fl);
		if (err <= 0)
			return err;
	}

	err = ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, lock_cmd, wait, fl);
	if (!err) {
		/* The MDS granted the lock; now record it locally as well. */
		if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK != fl->c.flc_type) {
			doutc(cl, "locking locally\n");
			err = posix_lock_file(file, fl, NULL);
			if (err) {
				/* undo! This should only happen if
				 * the kernel detects local
				 * deadlock.
				 *
				 * The result of the undo request is ignored.
				 * NOTE(review): op is SETFILELOCK here, so
				 * ceph_lock_message() runs fl_copy_lock() on
				 * this fl a second time - confirm the extra
				 * i_filelock_ref / inode reference is
				 * balanced.
				 */
				ceph_lock_message(CEPH_LOCK_FCNTL, op, inode,
						  CEPH_LOCK_UNLOCK, 0, fl);
				doutc(cl, "got %d on posix_lock_file, undid lock\n",
				      err);
			}
		}
	}
	return err;
}

/*
 * Attempt to set or release a flock lock. Mirrors ceph_lock(), but uses
 * the CEPH_LOCK_FLOCK rule, always issues CEPH_MDS_OP_SETFILELOCK, and
 * applies local state through locks_lock_file_wait().
 *
 * @file: file the lock is requested on
 * @cmd:  lock command; IS_SETLKW(cmd) selects a blocking request
 * @fl:   lock description; must have FL_FLOCK set
 *
 * Returns 0 on success, -ENOLCK if @fl is not a flock lock, -ESTALE if
 * the inode is shut down, -EIO if the inode has CEPH_I_ERROR_FILELOCK
 * set, or an error from the MDS request or the local lock call.
 */
int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
{
	struct inode *inode = file_inode(file);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_client *cl = ceph_inode_to_client(inode);
	int err = 0;
	u8 wait = 0;
	u8 lock_cmd;

	if (!(fl->c.flc_flags & FL_FLOCK))
		return -ENOLCK;

	if (ceph_inode_is_shutdown(inode))
		return -ESTALE;

	doutc(cl, "fl_file: %p\n", fl->c.flc_file);

	spin_lock(&ci->i_ceph_lock);
	if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) {
		err = -EIO;
	}
	spin_unlock(&ci->i_ceph_lock);
	if (err < 0) {
		/*
		 * Error state: an unlock is still applied locally, but the
		 * MDS is not contacted and -EIO is returned.
		 */
		if (lock_is_unlock(fl))
			locks_lock_file_wait(file, fl);
		return err;
	}

	if (IS_SETLKW(cmd))
		wait = 1;

	/* Translate the lock type into the Ceph lock command. */
	if (lock_is_read(fl))
		lock_cmd = CEPH_LOCK_SHARED;
	else if (lock_is_write(fl))
		lock_cmd = CEPH_LOCK_EXCL;
	else
		lock_cmd = CEPH_LOCK_UNLOCK;

	/*
	 * Unlocks are applied locally first; the MDS is only messaged when
	 * try_unlock_file() returns a positive value.
	 */
	if (lock_is_unlock(fl)) {
		err = try_unlock_file(file, fl);
		if (err <= 0)
			return err;
	}

	err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
				inode, lock_cmd, wait, fl);
	if (!err && F_UNLCK != fl->c.flc_type) {
		err = locks_lock_file_wait(file, fl);
		if (err) {
			/*
			 * Local locking failed after the MDS granted the
			 * lock: ask the MDS to drop it again. The result of
			 * the undo request is ignored.
			 * NOTE(review): this re-enters the SETFILELOCK branch
			 * of ceph_lock_message(), so fl_copy_lock() runs on
			 * this fl a second time - confirm the references are
			 * balanced.
			 */
			ceph_lock_message(CEPH_LOCK_FLOCK,
					  CEPH_MDS_OP_SETFILELOCK,
					  inode, CEPH_LOCK_UNLOCK, 0, fl);
			doutc(cl, "got %d on locks_lock_file_wait, undid lock\n",
			      err);
		}
	}
	return err;
}

/*
 * Fills in the passed counter variables, so you can prepare pagelist metadata
 * before calling ceph_encode_locks_to_buffer.
 *
 * Both counters are set to 0 first and stay 0 when the inode has no
 * lock context. The lists are walked under ctx->flc_lock.
 */
void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
{
	struct ceph_client *cl = ceph_inode_to_client(inode);
	struct file_lock *lock;
	struct file_lock_context *ctx;

	*fcntl_count = 0;
	*flock_count = 0;

	ctx = locks_inode_context(inode);
	if (ctx) {
		spin_lock(&ctx->flc_lock);
		for_each_file_lock(lock, &ctx->flc_posix)
			++(*fcntl_count);
		for_each_file_lock(lock, &ctx->flc_flock)
			++(*flock_count);
		spin_unlock(&ctx->flc_lock);
	}
	doutc(cl, "counted %d flock locks and %d fcntl locks\n",
	      *flock_count, *fcntl_count);
}

/*
 * Given a pointer to a lock, convert it to a ceph filelock
 *
 * Fills in start, length, client (always 0), pid, owner and type of
 * @cephlock. Returns 0, or -EINVAL when the lock type is not F_RDLCK,
 * F_WRLCK or F_UNLCK (the other fields have been written by then).
 */
static int lock_to_ceph_filelock(struct inode *inode,
				 struct file_lock *lock,
				 struct ceph_filelock *cephlock)
{
	struct ceph_client *cl = ceph_inode_to_client(inode);
	int err = 0;

	cephlock->start = cpu_to_le64(lock->fl_start);
	/*
	 * NOTE(review): unlike ceph_lock_message(), an end of LLONG_MAX is
	 * not mapped to length 0 here - confirm whether that difference is
	 * intended.
	 */
	cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1);
	cephlock->client = cpu_to_le64(0);
	cephlock->pid = cpu_to_le64((u64) lock->c.flc_pid);
	cephlock->owner = cpu_to_le64(secure_addr(lock->c.flc_owner));

	switch (lock->c.flc_type) {
	case F_RDLCK:
		cephlock->type = CEPH_LOCK_SHARED;
		break;
	case F_WRLCK:
		cephlock->type = CEPH_LOCK_EXCL;
		break;
	case F_UNLCK:
		cephlock->type = CEPH_LOCK_UNLOCK;
		break;
	default:
		doutc(cl, "Have unknown lock type %d\n",
		      lock->c.flc_type);
		err = -EINVAL;
	}

	return err;
}

/*
 * Encode the flock and fcntl locks for the given inode into the ceph_filelock
 * array. Takes ctx->flc_lock itself while walking the lock lists.
 * If we encounter more of a specific lock type than expected, return -ENOSPC.
 *
 * fcntl locks are written first, starting at flocks[0]; flock locks are
 * written directly after the fcntl locks that were actually found.
 * Returns 0 on success (including when the inode has no lock context),
 * -ENOSPC as described above, or the error from lock_to_ceph_filelock().
 *
 * NOTE(review): ceph_locks_to_pagelist() reads the flock entries from
 * &flocks[num_fcntl_locks], which matches this layout only when exactly
 * num_fcntl_locks fcntl locks were found - confirm callers handle a
 * smaller count.
 */
int ceph_encode_locks_to_buffer(struct inode *inode,
				struct ceph_filelock *flocks,
				int num_fcntl_locks, int num_flock_locks)
{
	struct file_lock *lock;
	struct file_lock_context *ctx = locks_inode_context(inode);
	struct ceph_client *cl = ceph_inode_to_client(inode);
	int err = 0;
	int seen_fcntl = 0;
	int seen_flock = 0;
	/* index of the next free slot in flocks[] */
	int l = 0;

	doutc(cl, "encoding %d flock and %d fcntl locks\n", num_flock_locks,
	      num_fcntl_locks);

	if (!ctx)
		return 0;

	spin_lock(&ctx->flc_lock);
	for_each_file_lock(lock, &ctx->flc_posix) {
		++seen_fcntl;
		if (seen_fcntl > num_fcntl_locks) {
			err = -ENOSPC;
			goto fail;
		}
		err = lock_to_ceph_filelock(inode, lock, &flocks[l]);
		if (err)
			goto fail;
		++l;
	}
	for_each_file_lock(lock, &ctx->flc_flock) {
		++seen_flock;
		if (seen_flock > num_flock_locks) {
			err = -ENOSPC;
			goto fail;
		}
		err = lock_to_ceph_filelock(inode, lock, &flocks[l]);
		if (err)
			goto fail;
		++l;
	}
	/* Reached on success as well; it only drops the spinlock. */
fail:
	spin_unlock(&ctx->flc_lock);
	return err;
}

/*
 * Copy the encoded flock and fcntl locks into the pagelist.
 * Format is: #fcntl locks, sequential fcntl locks, #flock locks,
 * sequential flock locks.
 * Returns zero on success.
 *
 * Each count is appended as a little-endian 32-bit value. The fcntl
 * entries are taken from flocks[0..num_fcntl_locks) and the flock entries
 * from flocks[num_fcntl_locks..). On failure the error from
 * ceph_pagelist_append() is returned.
 */
int ceph_locks_to_pagelist(struct ceph_filelock *flocks,
			   struct ceph_pagelist *pagelist,
			   int num_fcntl_locks, int num_flock_locks)
{
	int err = 0;
	__le32 nlocks;

	nlocks = cpu_to_le32(num_fcntl_locks);
	err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks));
	if (err)
		goto out_fail;

	if (num_fcntl_locks > 0) {
		err = ceph_pagelist_append(pagelist, flocks,
					   num_fcntl_locks * sizeof(*flocks));
		if (err)
			goto out_fail;
	}

	nlocks = cpu_to_le32(num_flock_locks);
	err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks));
	if (err)
		goto out_fail;

	if (num_flock_locks > 0) {
		err = ceph_pagelist_append(pagelist, &flocks[num_fcntl_locks],
					   num_flock_locks * sizeof(*flocks));
	}
out_fail:
	return err;
}