// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>

#include <linux/file.h>
#include <linux/namei.h>
#include <linux/random.h>

#include "super.h"
#include "mds_client.h"
#include <linux/filelock.h>
#include <linux/ceph/pagelist.h>

/* boot-time random value used to obfuscate lock owner pointers (see secure_addr) */
static u64 lock_secret;
static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
                                         struct ceph_mds_request *req);

/*
 * Derive the on-wire lock 'owner' id from the kernel's fl_owner pointer.
 * XOR-ing with the random lock_secret avoids sending the raw kernel
 * address to the MDS while keeping the mapping stable for this boot.
 */
static inline u64 secure_addr(void *addr)
{
	u64 v = lock_secret ^ (u64)(unsigned long)addr;
	/*
	 * Set the most significant bit, so that MDS knows the 'owner'
	 * is sufficient to identify the owner of lock. (old code uses
	 * both 'owner' and 'pid')
	 */
	v |= (1ULL << 63);
	return v;
}

/* One-time init: seed the owner-obfuscation secret. */
void __init ceph_flock_init(void)
{
	get_random_bytes(&lock_secret, sizeof(lock_secret));
}

/*
 * fl_copy_lock callback: each copy of a ceph file_lock holds a reference
 * on i_filelock_ref and pins the inode (released in ceph_fl_release_lock).
 * 'src' is unused; the inode comes from the destination lock's file.
 */
static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
{
	struct inode *inode = file_inode(dst->c.flc_file);
	atomic_inc(&ceph_inode(inode)->i_filelock_ref);
	dst->fl_u.ceph.inode = igrab(inode);
}

/*
 * Do not use the 'fl->fl_file' in release function, which
 * is possibly already released by another thread.
 */
static void ceph_fl_release_lock(struct file_lock *fl)
{
	struct inode *inode = fl->fl_u.ceph.inode;
	struct ceph_inode_info *ci;

	/*
	 * If inode is NULL it should be a request file_lock,
	 * nothing we can do.
	 */
	if (!inode)
		return;

	ci = ceph_inode(inode);
	if (atomic_dec_and_test(&ci->i_filelock_ref)) {
		/* clear error when all locks are released */
		spin_lock(&ci->i_ceph_lock);
		ci->i_ceph_flags &= ~CEPH_I_ERROR_FILELOCK;
		spin_unlock(&ci->i_ceph_lock);
	}
	/* drop the inode reference taken by ceph_fl_copy_lock */
	fl->fl_u.ceph.inode = NULL;
	iput(inode);
}

static const struct file_lock_operations ceph_fl_lock_ops = {
	.fl_copy_lock = ceph_fl_copy_lock,
	.fl_release_private = ceph_fl_release_lock,
};

/*
 * Implement fcntl and flock locking functions.
 */
/*
 * Build and send a file-lock request (SETFILELOCK/GETFILELOCK) to the
 * auth MDS and wait for the reply.  For GETFILELOCK, the conflicting
 * lock (if any) from the reply is decoded back into *fl.
 * Returns 0 on success or a negative errno.
 */
static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
			     int cmd, u8 wait, struct file_lock *fl)
{
	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
	struct ceph_client *cl = mdsc->fsc->client;
	struct ceph_mds_request *req;
	int err;
	u64 length = 0;
	u64 owner;

	if (operation == CEPH_MDS_OP_SETFILELOCK) {
		/*
		 * increasing i_filelock_ref closes race window between
		 * handling request reply and adding file_lock struct to
		 * inode. Otherwise, auth caps may get trimmed in the
		 * window. Caller function will decrease the counter.
		 */
		fl->fl_ops = &ceph_fl_lock_ops;
		fl->fl_ops->fl_copy_lock(fl, NULL);
	}

	/* only blocking SETFILELOCK (non-unlock) requests wait on the MDS */
	if (operation != CEPH_MDS_OP_SETFILELOCK || cmd == CEPH_LOCK_UNLOCK)
		wait = 0;

	req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS);
	if (IS_ERR(req))
		return PTR_ERR(req);
	req->r_inode = inode;
	ihold(inode);
	req->r_num_caps = 1;

	/* mds requires start and length rather than start and end */
	if (LLONG_MAX == fl->fl_end)
		length = 0;	/* 0 length means "to EOF" on the wire */
	else
		length = fl->fl_end - fl->fl_start + 1;

	owner = secure_addr(fl->c.flc_owner);

	doutc(cl, "rule: %d, op: %d, owner: %llx, pid: %llu, "
	      "start: %llu, length: %llu, wait: %d, type: %d\n",
	      (int)lock_type, (int)operation, owner,
	      (u64) fl->c.flc_pid,
	      fl->fl_start, length, wait, fl->c.flc_type);

	req->r_args.filelock_change.rule = lock_type;
	req->r_args.filelock_change.type = cmd;
	req->r_args.filelock_change.owner = cpu_to_le64(owner);
	req->r_args.filelock_change.pid = cpu_to_le64((u64) fl->c.flc_pid);
	req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start);
	req->r_args.filelock_change.length = cpu_to_le64(length);
	req->r_args.filelock_change.wait = wait;

	err = ceph_mdsc_submit_request(mdsc, inode, req);
	if (!err)
		err = ceph_mdsc_wait_request(mdsc, req, wait ?
					ceph_lock_wait_for_completion : NULL);
	if (!err && operation == CEPH_MDS_OP_GETFILELOCK) {
		/* negated pid marks the owner as a remote (MDS-reported) task */
		fl->c.flc_pid = -le64_to_cpu(req->r_reply_info.filelock_reply->pid);
		if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type)
			fl->c.flc_type = F_RDLCK;
		else if (CEPH_LOCK_EXCL == req->r_reply_info.filelock_reply->type)
			fl->c.flc_type = F_WRLCK;
		else
			fl->c.flc_type = F_UNLCK;

		/* convert wire start+length back to VFS start/end */
		fl->fl_start = le64_to_cpu(req->r_reply_info.filelock_reply->start);
		length = le64_to_cpu(req->r_reply_info.filelock_reply->start) +
						 le64_to_cpu(req->r_reply_info.filelock_reply->length);
		if (length >= 1)
			fl->fl_end = length -1;
		else
			fl->fl_end = 0;

	}
	ceph_mdsc_put_request(req);
	doutc(cl, "rule: %d, op: %d, pid: %llu, start: %llu, "
	      "length: %llu, wait: %d, type: %d, err code %d\n",
	      (int)lock_type, (int)operation, (u64) fl->c.flc_pid,
	      fl->fl_start, length, wait, fl->c.flc_type, err);
	return err;
}

/*
 * Called while a blocking SETFILELOCK request waits for completion.
 * If the wait is interrupted by a signal, abort the original request
 * (when safe) and send a *_INTR unlock so the MDS drops the pending
 * lock attempt, then wait for the original request to become safe.
 */
static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
                                         struct ceph_mds_request *req)
{
	struct ceph_client *cl = mdsc->fsc->client;
	struct ceph_mds_request *intr_req;
	struct inode *inode = req->r_inode;
	int err, lock_type;

	BUG_ON(req->r_op != CEPH_MDS_OP_SETFILELOCK);
	if (req->r_args.filelock_change.rule == CEPH_LOCK_FCNTL)
		lock_type = CEPH_LOCK_FCNTL_INTR;
	else if (req->r_args.filelock_change.rule == CEPH_LOCK_FLOCK)
		lock_type = CEPH_LOCK_FLOCK_INTR;
	else
		BUG_ON(1);
	BUG_ON(req->r_args.filelock_change.type == CEPH_LOCK_UNLOCK);

	err = wait_for_completion_interruptible(&req->r_completion);
	if (!err)
		return 0;

	doutc(cl, "request %llu was interrupted\n", req->r_tid);

	mutex_lock(&mdsc->mutex);
	if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
		/* reply arrived while we were being interrupted: no abort needed */
		err = 0;
	} else {
		/*
		 * ensure we aren't running concurrently with
		 * ceph_fill_trace or ceph_readdir_prepopulate, which
		 * rely on locks (dir mutex) held by our caller.
		 */
		mutex_lock(&req->r_fill_mutex);
		req->r_err = err;
		set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
		mutex_unlock(&req->r_fill_mutex);

		if (!req->r_session) {
			// haven't sent the request
			err = 0;
		}
	}
	mutex_unlock(&mdsc->mutex);
	if (!err)
		return 0;

	/* request was sent and is now aborted: tell the MDS to drop it */
	intr_req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETFILELOCK,
					    USE_AUTH_MDS);
	if (IS_ERR(intr_req))
		return PTR_ERR(intr_req);

	intr_req->r_inode = inode;
	ihold(inode);
	intr_req->r_num_caps = 1;

	intr_req->r_args.filelock_change = req->r_args.filelock_change;
	intr_req->r_args.filelock_change.rule = lock_type;
	intr_req->r_args.filelock_change.type = CEPH_LOCK_UNLOCK;

	err = ceph_mdsc_do_request(mdsc, inode, intr_req);
	ceph_mdsc_put_request(intr_req);

	if (err && err != -ERESTARTSYS)
		return err;

	/* make sure the original request can't race with a later lock */
	wait_for_completion_killable(&req->r_safe_completion);
	return 0;
}

/*
 * Try to drop the lock locally first.  Returns 1 if the caller should
 * continue and send the unlock to the MDS, 0 or a negative errno if
 * handling is already complete (no matching local lock existed).
 */
static int try_unlock_file(struct file *file, struct file_lock *fl)
{
	int err;
	unsigned int orig_flags = fl->c.flc_flags;
	/* FL_EXISTS makes the VFS report -ENOENT when no lock matched */
	fl->c.flc_flags |= FL_EXISTS;
	err = locks_lock_file_wait(file, fl);
	fl->c.flc_flags = orig_flags;
	if (err == -ENOENT) {
		if (!(orig_flags & FL_EXISTS))
			err = 0;
		return err;
	}
	return 1;
}

/*
 * Attempt to set an fcntl lock.
 * For now, this just goes away to the server. Later it may be more awesome.
 */
/*
 * VFS ->lock handler for POSIX (fcntl) locks.  Talks to the MDS first,
 * then mirrors the result into the local VFS lock state so conflicts
 * can be detected locally as well.  Returns 0 or a negative errno.
 */
int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
{
	struct inode *inode = file_inode(file);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_client *cl = ceph_inode_to_client(inode);
	int err = 0;
	u16 op = CEPH_MDS_OP_SETFILELOCK;
	u8 wait = 0;
	u8 lock_cmd;

	if (!(fl->c.flc_flags & FL_POSIX))
		return -ENOLCK;

	if (ceph_inode_is_shutdown(inode))
		return -ESTALE;

	doutc(cl, "fl_owner: %p\n", fl->c.flc_owner);

	/* set wait bit as appropriate, then make command as Ceph expects it*/
	if (IS_GETLK(cmd))
		op = CEPH_MDS_OP_GETFILELOCK;
	else if (IS_SETLKW(cmd))
		wait = 1;

	/* i_ceph_lock guards the CEPH_I_ERROR_FILELOCK flag */
	spin_lock(&ci->i_ceph_lock);
	if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) {
		err = -EIO;
	}
	spin_unlock(&ci->i_ceph_lock);
	if (err < 0) {
		/* even in the error state, drop local state on unlock */
		if (op == CEPH_MDS_OP_SETFILELOCK && lock_is_unlock(fl))
			posix_lock_file(file, fl, NULL);
		return err;
	}

	if (lock_is_read(fl))
		lock_cmd = CEPH_LOCK_SHARED;
	else if (lock_is_write(fl))
		lock_cmd = CEPH_LOCK_EXCL;
	else
		lock_cmd = CEPH_LOCK_UNLOCK;

	if (op == CEPH_MDS_OP_SETFILELOCK && lock_is_unlock(fl)) {
		err = try_unlock_file(file, fl);
		if (err <= 0)
			return err;
	}

	err = ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, lock_cmd, wait, fl);
	if (!err) {
		/* MDS granted the lock: record it in the local VFS state too */
		if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK != fl->c.flc_type) {
			doutc(cl, "locking locally\n");
			err = posix_lock_file(file, fl, NULL);
			if (err) {
				/* undo! This should only happen if
				 * the kernel detects local
				 * deadlock. */
				ceph_lock_message(CEPH_LOCK_FCNTL, op, inode,
						  CEPH_LOCK_UNLOCK, 0, fl);
				doutc(cl, "got %d on posix_lock_file, undid lock\n",
				      err);
			}
		}
	}
	return err;
}

/*
 * VFS ->flock handler.  Same structure as ceph_lock() but always a
 * SETFILELOCK operation, using the FLOCK rule.
 */
int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
{
	struct inode *inode = file_inode(file);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_client *cl = ceph_inode_to_client(inode);
	int err = 0;
	u8 wait = 0;
	u8 lock_cmd;

	if (!(fl->c.flc_flags & FL_FLOCK))
		return -ENOLCK;

	if (ceph_inode_is_shutdown(inode))
		return -ESTALE;

	doutc(cl, "fl_file: %p\n", fl->c.flc_file);

	/* i_ceph_lock guards the CEPH_I_ERROR_FILELOCK flag */
	spin_lock(&ci->i_ceph_lock);
	if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) {
		err = -EIO;
	}
	spin_unlock(&ci->i_ceph_lock);
	if (err < 0) {
		/* even in the error state, drop local state on unlock */
		if (lock_is_unlock(fl))
			locks_lock_file_wait(file, fl);
		return err;
	}

	if (IS_SETLKW(cmd))
		wait = 1;

	if (lock_is_read(fl))
		lock_cmd = CEPH_LOCK_SHARED;
	else if (lock_is_write(fl))
		lock_cmd = CEPH_LOCK_EXCL;
	else
		lock_cmd = CEPH_LOCK_UNLOCK;

	if (lock_is_unlock(fl)) {
		err = try_unlock_file(file, fl);
		if (err <= 0)
			return err;
	}

	err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
				inode, lock_cmd, wait, fl);
	if (!err && F_UNLCK != fl->c.flc_type) {
		/* mirror the MDS-granted lock locally; undo on failure */
		err = locks_lock_file_wait(file, fl);
		if (err) {
			ceph_lock_message(CEPH_LOCK_FLOCK,
					  CEPH_MDS_OP_SETFILELOCK,
					  inode, CEPH_LOCK_UNLOCK, 0, fl);
			doutc(cl, "got %d on locks_lock_file_wait, undid lock\n",
			      err);
		}
	}
	return err;
}

/*
 * Fills in the passed counter variables, so you can prepare pagelist metadata
 * before calling ceph_encode_locks.
 */
/*
 * Count the POSIX (fcntl) and flock locks currently held on @inode,
 * taking flc_lock so the lists can't change underneath us.
 */
void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
{
	struct ceph_client *cl = ceph_inode_to_client(inode);
	struct file_lock *lock;
	struct file_lock_context *ctx;

	*fcntl_count = 0;
	*flock_count = 0;

	/* no context means no locks have ever been taken on this inode */
	ctx = locks_inode_context(inode);
	if (ctx) {
		spin_lock(&ctx->flc_lock);
		for_each_file_lock(lock, &ctx->flc_posix)
			++(*fcntl_count);
		for_each_file_lock(lock, &ctx->flc_flock)
			++(*flock_count);
		spin_unlock(&ctx->flc_lock);
	}
	doutc(cl, "counted %d flock locks and %d fcntl locks\n",
	      *flock_count, *fcntl_count);
}

/*
 * Given a pointer to a lock, convert it to a ceph filelock
 * (wire format: start+length, obfuscated owner, little-endian fields).
 * Returns 0 on success, -EINVAL for an unrecognized lock type.
 */
static int lock_to_ceph_filelock(struct inode *inode,
				 struct file_lock *lock,
				 struct ceph_filelock *cephlock)
{
	struct ceph_client *cl = ceph_inode_to_client(inode);
	int err = 0;

	cephlock->start = cpu_to_le64(lock->fl_start);
	cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1);
	cephlock->client = cpu_to_le64(0);
	cephlock->pid = cpu_to_le64((u64) lock->c.flc_pid);
	/* same owner mangling as ceph_lock_message() so the MDS can match them */
	cephlock->owner = cpu_to_le64(secure_addr(lock->c.flc_owner));

	switch (lock->c.flc_type) {
	case F_RDLCK:
		cephlock->type = CEPH_LOCK_SHARED;
		break;
	case F_WRLCK:
		cephlock->type = CEPH_LOCK_EXCL;
		break;
	case F_UNLCK:
		cephlock->type = CEPH_LOCK_UNLOCK;
		break;
	default:
		doutc(cl, "Have unknown lock type %d\n",
		      lock->c.flc_type);
		err = -EINVAL;
	}

	return err;
}

/*
 * Encode the flock and fcntl locks for the given inode into the ceph_filelock
 * array. Must be called with inode->i_lock already held.
 * If we encounter more of a specific lock type than expected, return -ENOSPC.
 */
/*
 * Walk both lock lists under flc_lock and encode each lock with
 * lock_to_ceph_filelock().  @num_fcntl_locks/@num_flock_locks are the
 * capacities the caller sized @flocks for (via ceph_count_locks);
 * exceeding either yields -ENOSPC since the buffer was preallocated.
 */
int ceph_encode_locks_to_buffer(struct inode *inode,
				struct ceph_filelock *flocks,
				int num_fcntl_locks, int num_flock_locks)
{
	struct file_lock *lock;
	struct file_lock_context *ctx = locks_inode_context(inode);
	struct ceph_client *cl = ceph_inode_to_client(inode);
	int err = 0;
	int seen_fcntl = 0;
	int seen_flock = 0;
	int l = 0;	/* next free slot in flocks[] */

	doutc(cl, "encoding %d flock and %d fcntl locks\n", num_flock_locks,
	      num_fcntl_locks);

	if (!ctx)
		return 0;

	spin_lock(&ctx->flc_lock);
	for_each_file_lock(lock, &ctx->flc_posix) {
		++seen_fcntl;
		if (seen_fcntl > num_fcntl_locks) {
			err = -ENOSPC;
			goto fail;
		}
		err = lock_to_ceph_filelock(inode, lock, &flocks[l]);
		if (err)
			goto fail;
		++l;
	}
	for_each_file_lock(lock, &ctx->flc_flock) {
		++seen_flock;
		if (seen_flock > num_flock_locks) {
			err = -ENOSPC;
			goto fail;
		}
		err = lock_to_ceph_filelock(inode, lock, &flocks[l]);
		if (err)
			goto fail;
		++l;
	}
fail:
	spin_unlock(&ctx->flc_lock);
	return err;
}

/*
 * Copy the encoded flock and fcntl locks into the pagelist.
 * Format is: #fcntl locks, sequential fcntl locks, #flock locks,
 * sequential flock locks.
 * Returns zero on success.
490 */ 491 int ceph_locks_to_pagelist(struct ceph_filelock *flocks, 492 struct ceph_pagelist *pagelist, 493 int num_fcntl_locks, int num_flock_locks) 494 { 495 int err = 0; 496 __le32 nlocks; 497 498 nlocks = cpu_to_le32(num_fcntl_locks); 499 err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks)); 500 if (err) 501 goto out_fail; 502 503 if (num_fcntl_locks > 0) { 504 err = ceph_pagelist_append(pagelist, flocks, 505 num_fcntl_locks * sizeof(*flocks)); 506 if (err) 507 goto out_fail; 508 } 509 510 nlocks = cpu_to_le32(num_flock_locks); 511 err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks)); 512 if (err) 513 goto out_fail; 514 515 if (num_flock_locks > 0) { 516 err = ceph_pagelist_append(pagelist, &flocks[num_fcntl_locks], 517 num_flock_locks * sizeof(*flocks)); 518 } 519 out_fail: 520 return err; 521 } 522