1 // SPDX-License-Identifier: GPL-2.0 2 #include <linux/ceph/ceph_debug.h> 3 4 #include <linux/bvec.h> 5 #include <linux/crc32c.h> 6 #include <linux/net.h> 7 #include <linux/socket.h> 8 #include <net/sock.h> 9 10 #include <linux/ceph/ceph_features.h> 11 #include <linux/ceph/decode.h> 12 #include <linux/ceph/libceph.h> 13 #include <linux/ceph/messenger.h> 14 15 /* static tag bytes (protocol control messages) */ 16 static char tag_msg = CEPH_MSGR_TAG_MSG; 17 static char tag_ack = CEPH_MSGR_TAG_ACK; 18 static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE; 19 static char tag_keepalive2 = CEPH_MSGR_TAG_KEEPALIVE2; 20 21 /* 22 * If @buf is NULL, discard up to @len bytes. 23 */ 24 static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len) 25 { 26 struct kvec iov = {buf, len}; 27 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; 28 int r; 29 30 if (!buf) 31 msg.msg_flags |= MSG_TRUNC; 32 33 iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, len); 34 r = sock_recvmsg(sock, &msg, msg.msg_flags); 35 if (r == -EAGAIN) 36 r = 0; 37 return r; 38 } 39 40 static int ceph_tcp_recvpage(struct socket *sock, struct page *page, 41 int page_offset, size_t length) 42 { 43 struct bio_vec bvec; 44 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; 45 int r; 46 47 BUG_ON(page_offset + length > PAGE_SIZE); 48 bvec_set_page(&bvec, page, length, page_offset); 49 iov_iter_bvec(&msg.msg_iter, ITER_DEST, &bvec, 1, length); 50 r = sock_recvmsg(sock, &msg, msg.msg_flags); 51 if (r == -EAGAIN) 52 r = 0; 53 return r; 54 } 55 56 /* 57 * write something. @more is true if caller will be sending more data 58 * shortly. 
59 */ 60 static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov, 61 size_t kvlen, size_t len, bool more) 62 { 63 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; 64 int r; 65 66 if (more) 67 msg.msg_flags |= MSG_MORE; 68 else 69 msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */ 70 71 r = kernel_sendmsg(sock, &msg, iov, kvlen, len); 72 if (r == -EAGAIN) 73 r = 0; 74 return r; 75 } 76 77 /* 78 * @more: MSG_MORE or 0. 79 */ 80 static int ceph_tcp_sendpage(struct socket *sock, struct page *page, 81 int offset, size_t size, int more) 82 { 83 struct msghdr msg = { 84 .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL | more, 85 }; 86 struct bio_vec bvec; 87 int ret; 88 89 /* 90 * MSG_SPLICE_PAGES cannot properly handle pages with page_count == 0, 91 * we need to fall back to sendmsg if that's the case. 92 * 93 * Same goes for slab pages: skb_can_coalesce() allows 94 * coalescing neighboring slab objects into a single frag which 95 * triggers one of hardened usercopy checks. 96 */ 97 if (sendpage_ok(page)) 98 msg.msg_flags |= MSG_SPLICE_PAGES; 99 100 bvec_set_page(&bvec, page, size, offset); 101 iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size); 102 103 ret = sock_sendmsg(sock, &msg); 104 if (ret == -EAGAIN) 105 ret = 0; 106 107 return ret; 108 } 109 110 static void con_out_kvec_reset(struct ceph_connection *con) 111 { 112 BUG_ON(con->v1.out_skip); 113 114 con->v1.out_kvec_left = 0; 115 con->v1.out_kvec_bytes = 0; 116 con->v1.out_kvec_cur = &con->v1.out_kvec[0]; 117 } 118 119 static void con_out_kvec_add(struct ceph_connection *con, 120 size_t size, void *data) 121 { 122 int index = con->v1.out_kvec_left; 123 124 BUG_ON(con->v1.out_skip); 125 BUG_ON(index >= ARRAY_SIZE(con->v1.out_kvec)); 126 127 con->v1.out_kvec[index].iov_len = size; 128 con->v1.out_kvec[index].iov_base = data; 129 con->v1.out_kvec_left++; 130 con->v1.out_kvec_bytes += size; 131 } 132 133 /* 134 * Chop off a kvec from the end. 
Return residual number of bytes for 135 * that kvec, i.e. how many bytes would have been written if the kvec 136 * hadn't been nuked. 137 */ 138 static int con_out_kvec_skip(struct ceph_connection *con) 139 { 140 int skip = 0; 141 142 if (con->v1.out_kvec_bytes > 0) { 143 skip = con->v1.out_kvec_cur[con->v1.out_kvec_left - 1].iov_len; 144 BUG_ON(con->v1.out_kvec_bytes < skip); 145 BUG_ON(!con->v1.out_kvec_left); 146 con->v1.out_kvec_bytes -= skip; 147 con->v1.out_kvec_left--; 148 } 149 150 return skip; 151 } 152 153 static size_t sizeof_footer(struct ceph_connection *con) 154 { 155 return (con->peer_features & CEPH_FEATURE_MSG_AUTH) ? 156 sizeof(struct ceph_msg_footer) : 157 sizeof(struct ceph_msg_footer_old); 158 } 159 160 static void prepare_message_data(struct ceph_msg *msg, u32 data_len) 161 { 162 /* Initialize data cursor if it's not a sparse read */ 163 u64 len = msg->sparse_read_total ? : data_len; 164 165 ceph_msg_data_cursor_init(&msg->cursor, msg, len); 166 } 167 168 /* 169 * Prepare footer for currently outgoing message, and finish things 170 * off. Assumes out_kvec* are already valid.. we just add on to the end. 171 */ 172 static void prepare_write_message_footer(struct ceph_connection *con, 173 struct ceph_msg *m) 174 { 175 m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE; 176 177 dout("prepare_write_message_footer %p\n", con); 178 con_out_kvec_add(con, sizeof_footer(con), &m->footer); 179 if (con->peer_features & CEPH_FEATURE_MSG_AUTH) { 180 if (con->ops->sign_message) 181 con->ops->sign_message(m); 182 else 183 m->footer.sig = 0; 184 } else { 185 m->old_footer.flags = m->footer.flags; 186 } 187 con->v1.out_more = m->more_to_follow; 188 con->v1.out_msg_done = true; 189 } 190 191 /* 192 * Prepare headers for the next outgoing message. 193 */ 194 static void prepare_write_message(struct ceph_connection *con, 195 struct ceph_msg *m) 196 { 197 u32 crc; 198 199 con_out_kvec_reset(con); 200 con->v1.out_msg_done = false; 201 202 /* Sneak an ack in there first? 
If we can get it into the same 203 * TCP packet that's a good thing. */ 204 if (con->in_seq > con->in_seq_acked) { 205 con->in_seq_acked = con->in_seq; 206 con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); 207 con->v1.out_temp_ack = cpu_to_le64(con->in_seq_acked); 208 con_out_kvec_add(con, sizeof(con->v1.out_temp_ack), 209 &con->v1.out_temp_ack); 210 } 211 212 dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n", 213 m, con->out_seq, le16_to_cpu(m->hdr.type), 214 le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len), 215 m->data_length); 216 WARN_ON(m->front.iov_len != le32_to_cpu(m->hdr.front_len)); 217 WARN_ON(m->data_length != le32_to_cpu(m->hdr.data_len)); 218 219 /* tag + hdr + front + middle */ 220 con_out_kvec_add(con, sizeof (tag_msg), &tag_msg); 221 con_out_kvec_add(con, sizeof(con->v1.out_hdr), &con->v1.out_hdr); 222 con_out_kvec_add(con, m->front.iov_len, m->front.iov_base); 223 224 if (m->middle) 225 con_out_kvec_add(con, m->middle->vec.iov_len, 226 m->middle->vec.iov_base); 227 228 /* fill in hdr crc and finalize hdr */ 229 crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc)); 230 m->hdr.crc = cpu_to_le32(crc); 231 memcpy(&con->v1.out_hdr, &m->hdr, sizeof(con->v1.out_hdr)); 232 233 /* fill in front and middle crc, footer */ 234 crc = crc32c(0, m->front.iov_base, m->front.iov_len); 235 m->footer.front_crc = cpu_to_le32(crc); 236 if (m->middle) { 237 crc = crc32c(0, m->middle->vec.iov_base, 238 m->middle->vec.iov_len); 239 m->footer.middle_crc = cpu_to_le32(crc); 240 } else 241 m->footer.middle_crc = 0; 242 dout("%s front_crc %u middle_crc %u\n", __func__, 243 le32_to_cpu(m->footer.front_crc), 244 le32_to_cpu(m->footer.middle_crc)); 245 m->footer.flags = 0; 246 247 /* is there a data payload? 
*/ 248 m->footer.data_crc = 0; 249 if (m->data_length) { 250 prepare_message_data(m, m->data_length); 251 con->v1.out_more = 1; /* data + footer will follow */ 252 } else { 253 /* no, queue up footer too and be done */ 254 prepare_write_message_footer(con, m); 255 } 256 257 ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); 258 } 259 260 /* 261 * Prepare an ack. 262 */ 263 static void prepare_write_ack(struct ceph_connection *con) 264 { 265 dout("prepare_write_ack %p %llu -> %llu\n", con, 266 con->in_seq_acked, con->in_seq); 267 con->in_seq_acked = con->in_seq; 268 269 con_out_kvec_reset(con); 270 271 con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); 272 273 con->v1.out_temp_ack = cpu_to_le64(con->in_seq_acked); 274 con_out_kvec_add(con, sizeof(con->v1.out_temp_ack), 275 &con->v1.out_temp_ack); 276 277 con->v1.out_more = 1; /* more will follow.. eventually.. */ 278 ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); 279 } 280 281 /* 282 * Prepare to share the seq during handshake 283 */ 284 static void prepare_write_seq(struct ceph_connection *con) 285 { 286 dout("prepare_write_seq %p %llu -> %llu\n", con, 287 con->in_seq_acked, con->in_seq); 288 con->in_seq_acked = con->in_seq; 289 290 con_out_kvec_reset(con); 291 292 con->v1.out_temp_ack = cpu_to_le64(con->in_seq_acked); 293 con_out_kvec_add(con, sizeof(con->v1.out_temp_ack), 294 &con->v1.out_temp_ack); 295 296 ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); 297 } 298 299 /* 300 * Prepare to write keepalive byte. 
301 */ 302 static void prepare_write_keepalive(struct ceph_connection *con) 303 { 304 dout("prepare_write_keepalive %p\n", con); 305 con_out_kvec_reset(con); 306 if (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2) { 307 struct timespec64 now; 308 309 ktime_get_real_ts64(&now); 310 con_out_kvec_add(con, sizeof(tag_keepalive2), &tag_keepalive2); 311 ceph_encode_timespec64(&con->v1.out_temp_keepalive2, &now); 312 con_out_kvec_add(con, sizeof(con->v1.out_temp_keepalive2), 313 &con->v1.out_temp_keepalive2); 314 } else { 315 con_out_kvec_add(con, sizeof(tag_keepalive), &tag_keepalive); 316 } 317 ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); 318 } 319 320 /* 321 * Connection negotiation. 322 */ 323 324 static int get_connect_authorizer(struct ceph_connection *con) 325 { 326 struct ceph_auth_handshake *auth; 327 int auth_proto; 328 329 if (!con->ops->get_authorizer) { 330 con->v1.auth = NULL; 331 con->v1.out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN; 332 con->v1.out_connect.authorizer_len = 0; 333 return 0; 334 } 335 336 auth = con->ops->get_authorizer(con, &auth_proto, con->v1.auth_retry); 337 if (IS_ERR(auth)) 338 return PTR_ERR(auth); 339 340 con->v1.auth = auth; 341 con->v1.out_connect.authorizer_protocol = cpu_to_le32(auth_proto); 342 con->v1.out_connect.authorizer_len = 343 cpu_to_le32(auth->authorizer_buf_len); 344 return 0; 345 } 346 347 /* 348 * We connected to a peer and are saying hello. 
349 */ 350 static void prepare_write_banner(struct ceph_connection *con) 351 { 352 con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER); 353 con_out_kvec_add(con, sizeof (con->msgr->my_enc_addr), 354 &con->msgr->my_enc_addr); 355 356 con->v1.out_more = 0; 357 ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); 358 } 359 360 static void __prepare_write_connect(struct ceph_connection *con) 361 { 362 con_out_kvec_add(con, sizeof(con->v1.out_connect), 363 &con->v1.out_connect); 364 if (con->v1.auth) 365 con_out_kvec_add(con, con->v1.auth->authorizer_buf_len, 366 con->v1.auth->authorizer_buf); 367 368 con->v1.out_more = 0; 369 ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); 370 } 371 372 static int prepare_write_connect(struct ceph_connection *con) 373 { 374 unsigned int global_seq = ceph_get_global_seq(con->msgr, 0); 375 int proto; 376 int ret; 377 378 switch (con->peer_name.type) { 379 case CEPH_ENTITY_TYPE_MON: 380 proto = CEPH_MONC_PROTOCOL; 381 break; 382 case CEPH_ENTITY_TYPE_OSD: 383 proto = CEPH_OSDC_PROTOCOL; 384 break; 385 case CEPH_ENTITY_TYPE_MDS: 386 proto = CEPH_MDSC_PROTOCOL; 387 break; 388 default: 389 BUG(); 390 } 391 392 dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, 393 con->v1.connect_seq, global_seq, proto); 394 395 con->v1.out_connect.features = 396 cpu_to_le64(from_msgr(con->msgr)->supported_features); 397 con->v1.out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); 398 con->v1.out_connect.connect_seq = cpu_to_le32(con->v1.connect_seq); 399 con->v1.out_connect.global_seq = cpu_to_le32(global_seq); 400 con->v1.out_connect.protocol_version = cpu_to_le32(proto); 401 con->v1.out_connect.flags = 0; 402 403 ret = get_connect_authorizer(con); 404 if (ret) 405 return ret; 406 407 __prepare_write_connect(con); 408 return 0; 409 } 410 411 /* 412 * write as much of pending kvecs to the socket as we can. 
413 * 1 -> done 414 * 0 -> socket full, but more to do 415 * <0 -> error 416 */ 417 static int write_partial_kvec(struct ceph_connection *con) 418 { 419 int ret; 420 421 dout("write_partial_kvec %p %d left\n", con, con->v1.out_kvec_bytes); 422 while (con->v1.out_kvec_bytes > 0) { 423 ret = ceph_tcp_sendmsg(con->sock, con->v1.out_kvec_cur, 424 con->v1.out_kvec_left, 425 con->v1.out_kvec_bytes, 426 con->v1.out_more); 427 if (ret <= 0) 428 goto out; 429 con->v1.out_kvec_bytes -= ret; 430 if (!con->v1.out_kvec_bytes) 431 break; /* done */ 432 433 /* account for full iov entries consumed */ 434 while (ret >= con->v1.out_kvec_cur->iov_len) { 435 BUG_ON(!con->v1.out_kvec_left); 436 ret -= con->v1.out_kvec_cur->iov_len; 437 con->v1.out_kvec_cur++; 438 con->v1.out_kvec_left--; 439 } 440 /* and for a partially-consumed entry */ 441 if (ret) { 442 con->v1.out_kvec_cur->iov_len -= ret; 443 con->v1.out_kvec_cur->iov_base += ret; 444 } 445 } 446 con->v1.out_kvec_left = 0; 447 ret = 1; 448 out: 449 dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con, 450 con->v1.out_kvec_bytes, con->v1.out_kvec_left, ret); 451 return ret; /* done! */ 452 } 453 454 /* 455 * Write as much message data payload as we can. If we finish, queue 456 * up the footer. 457 * 1 -> done, footer is now queued in out_kvec[]. 458 * 0 -> socket full, but more to do 459 * <0 -> error 460 */ 461 static int write_partial_message_data(struct ceph_connection *con, 462 struct ceph_msg *msg) 463 { 464 struct ceph_msg_data_cursor *cursor = &msg->cursor; 465 bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); 466 u32 crc; 467 468 dout("%s %p msg %p\n", __func__, con, msg); 469 470 if (!msg->num_data_items) 471 return -EINVAL; 472 473 /* 474 * Iterate through each page that contains data to be 475 * written, and send as much as possible for each. 476 * 477 * If we are calculating the data crc (the default), we will 478 * need to map the page. 
If we have no pages, they have 479 * been revoked, so use the zero page. 480 */ 481 crc = do_datacrc ? le32_to_cpu(msg->footer.data_crc) : 0; 482 while (cursor->total_resid) { 483 struct page *page; 484 size_t page_offset; 485 size_t length; 486 int ret; 487 488 if (!cursor->resid) { 489 ceph_msg_data_advance(cursor, 0); 490 continue; 491 } 492 493 page = ceph_msg_data_next(cursor, &page_offset, &length); 494 ret = ceph_tcp_sendpage(con->sock, page, page_offset, length, 495 MSG_MORE); 496 if (ret <= 0) { 497 if (do_datacrc) 498 msg->footer.data_crc = cpu_to_le32(crc); 499 500 return ret; 501 } 502 if (do_datacrc && cursor->need_crc) 503 crc = ceph_crc32c_page(crc, page, page_offset, length); 504 ceph_msg_data_advance(cursor, (size_t)ret); 505 } 506 507 dout("%s %p msg %p done\n", __func__, con, msg); 508 509 /* prepare and queue up footer, too */ 510 if (do_datacrc) 511 msg->footer.data_crc = cpu_to_le32(crc); 512 else 513 msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; 514 con_out_kvec_reset(con); 515 prepare_write_message_footer(con, msg); 516 517 return 1; /* must return > 0 to indicate success */ 518 } 519 520 /* 521 * write some zeros 522 */ 523 static int write_partial_skip(struct ceph_connection *con) 524 { 525 int ret; 526 527 dout("%s %p %d left\n", __func__, con, con->v1.out_skip); 528 while (con->v1.out_skip > 0) { 529 size_t size = min(con->v1.out_skip, (int)PAGE_SIZE); 530 531 ret = ceph_tcp_sendpage(con->sock, ceph_zero_page, 0, size, 532 MSG_MORE); 533 if (ret <= 0) 534 goto out; 535 con->v1.out_skip -= ret; 536 } 537 ret = 1; 538 out: 539 return ret; 540 } 541 542 /* 543 * Prepare to read connection handshake, or an ack. 
544 */ 545 static void prepare_read_banner(struct ceph_connection *con) 546 { 547 dout("prepare_read_banner %p\n", con); 548 con->v1.in_base_pos = 0; 549 } 550 551 static void prepare_read_connect(struct ceph_connection *con) 552 { 553 dout("prepare_read_connect %p\n", con); 554 con->v1.in_base_pos = 0; 555 } 556 557 static void prepare_read_ack(struct ceph_connection *con) 558 { 559 dout("prepare_read_ack %p\n", con); 560 con->v1.in_base_pos = 0; 561 } 562 563 static void prepare_read_seq(struct ceph_connection *con) 564 { 565 dout("prepare_read_seq %p\n", con); 566 con->v1.in_base_pos = 0; 567 con->v1.in_tag = CEPH_MSGR_TAG_SEQ; 568 } 569 570 static void prepare_read_tag(struct ceph_connection *con) 571 { 572 dout("prepare_read_tag %p\n", con); 573 con->v1.in_base_pos = 0; 574 con->v1.in_tag = CEPH_MSGR_TAG_READY; 575 } 576 577 static void prepare_read_keepalive_ack(struct ceph_connection *con) 578 { 579 dout("prepare_read_keepalive_ack %p\n", con); 580 con->v1.in_base_pos = 0; 581 } 582 583 /* 584 * Prepare to read a message. 
*/
static int prepare_read_message(struct ceph_connection *con)
{
	dout("prepare_read_message %p\n", con);
	/* a previous incoming message must not still be attached */
	BUG_ON(con->in_msg != NULL);
	con->v1.in_base_pos = 0;
	/* all three running crcs start from zero */
	con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0;
	return 0;
}

/*
 * Resumable read of one fixed-size object.
 *
 * @end:    value con->v1.in_base_pos has once @object is fully read
 * @size:   size of @object in bytes
 * @object: destination buffer
 *
 * Returns 1 once in_base_pos has reached @end, otherwise whatever
 * ceph_tcp_recvmsg() returned (0 or a negative error).
 */
static int read_partial(struct ceph_connection *con,
			int end, int size, void *object)
{
	while (con->v1.in_base_pos < end) {
		/* bytes of this object still missing / already received */
		int left = end - con->v1.in_base_pos;
		int have = size - left;
		int ret = ceph_tcp_recvmsg(con->sock, object + have, left);
		if (ret <= 0)
			return ret;
		con->v1.in_base_pos += ret;
	}
	return 1;
}

/*
 * Read all or part of the connect-side handshake on a new connection
 *
 * Three objects are read back to back: the banner string, the peer's
 * own address and the address the peer sees us at.  Each address is
 * decoded right after its read_partial() reports completion.
 *
 * Returns the result of the last read_partial() call.
 */
static int read_partial_banner(struct ceph_connection *con)
{
	int size;
	int end;
	int ret;

	dout("read_partial_banner %p at %d\n", con, con->v1.in_base_pos);

	/* peer's banner */
	size = strlen(CEPH_BANNER);
	end = size;
	ret = read_partial(con, end, size, con->v1.in_banner);
	if (ret <= 0)
		goto out;

	size = sizeof(con->v1.actual_peer_addr);
	end += size;
	ret = read_partial(con, end, size, &con->v1.actual_peer_addr);
	if (ret <= 0)
		goto out;
	ceph_decode_banner_addr(&con->v1.actual_peer_addr);

	size = sizeof(con->v1.peer_addr_for_me);
	end += size;
	ret = read_partial(con, end, size, &con->v1.peer_addr_for_me);
	if (ret <= 0)
		goto out;
	ceph_decode_banner_addr(&con->v1.peer_addr_for_me);

out:
	return ret;
}

/*
 * Read all or part of the connect reply, plus the authorizer reply
 * that follows it when the connection has an authorizer.
 *
 * Returns the result of the last read_partial() call, or -EINVAL if
 * the announced authorizer reply does not fit in the reply buffer.
 */
static int read_partial_connect(struct ceph_connection *con)
{
	int size;
	int end;
	int ret;

	dout("read_partial_connect %p at %d\n", con, con->v1.in_base_pos);

	size = sizeof(con->v1.in_reply);
	end = size;
	ret = read_partial(con, end, size, &con->v1.in_reply);
	if (ret <= 0)
		goto out;

	if (con->v1.auth) {
		/* the length comes from the peer: bound it before reading */
		size = le32_to_cpu(con->v1.in_reply.authorizer_len);
		if (size > con->v1.auth->authorizer_reply_buf_len) {
			pr_err("authorizer reply too big: %d > %zu\n", size,
			       con->v1.auth->authorizer_reply_buf_len);
			ret = -EINVAL;
			goto out;
		}

		end += size;
		ret = read_partial(con, end, size,
				   con->v1.auth->authorizer_reply_buf);
		if (ret <= 0)
			goto out;
	}

	dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n",
	     con, con->v1.in_reply.tag,
	     le32_to_cpu(con->v1.in_reply.connect_seq),
	     le32_to_cpu(con->v1.in_reply.global_seq));
out:
	return ret;
}

/*
 * Verify the hello banner looks okay.
 *
 * Returns 0 if the received banner matches CEPH_BANNER, -1 (with
 * con->error_msg set) otherwise.
 */
static int verify_hello(struct ceph_connection *con)
{
	if (memcmp(con->v1.in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
		pr_err("connect to %s got bad banner\n",
		       ceph_pr_addr(&con->peer_addr));
		con->error_msg = "protocol error, bad banner";
		return -1;
	}
	return 0;
}

/*
 * Act on a fully received banner: check the banner string, check that
 * the peer is the one we meant to reach, and adopt the address the
 * peer reported for us if our own address is still blank.
 *
 * Returns 0 on success, -1 on a bad banner or wrong peer.
 */
static int process_banner(struct ceph_connection *con)
{
	struct ceph_entity_addr *my_addr = &con->msgr->inst.addr;

	dout("process_banner on %p\n", con);

	if (verify_hello(con) < 0)
		return -1;

	/*
	 * Make sure the other end is who we wanted.  note that the other
	 * end may not yet know their ip address, so if it's 0.0.0.0, give
	 * them the benefit of the doubt.
	 */
	if (memcmp(&con->peer_addr, &con->v1.actual_peer_addr,
		   sizeof(con->peer_addr)) != 0 &&
	    !(ceph_addr_is_blank(&con->v1.actual_peer_addr) &&
	      con->v1.actual_peer_addr.nonce == con->peer_addr.nonce)) {
		pr_warn("wrong peer, want %s/%u, got %s/%u\n",
			ceph_pr_addr(&con->peer_addr),
			le32_to_cpu(con->peer_addr.nonce),
			ceph_pr_addr(&con->v1.actual_peer_addr),
			le32_to_cpu(con->v1.actual_peer_addr.nonce));
		con->error_msg = "wrong peer at address";
		return -1;
	}

	/*
	 * did we learn our address?
	 */
	if (ceph_addr_is_blank(my_addr)) {
		memcpy(&my_addr->in_addr,
		       &con->v1.peer_addr_for_me.in_addr,
		       sizeof(con->v1.peer_addr_for_me.in_addr));
		ceph_addr_set_port(my_addr, 0);
		ceph_encode_my_addr(con->msgr);
		dout("process_banner learned my addr is %s\n",
		     ceph_pr_addr(my_addr));
	}

	return 0;
}

/*
 * Act on a fully received connect reply.
 *
 * An authorizer challenge is answered by re-sending the connect
 * message; a non-empty authorizer reply is verified.  After that the
 * reply tag decides what happens: fatal mismatches return -1 with
 * con->error_msg set, the retry/reset tags queue a fresh connect
 * message, and SEQ/READY move the connection to CEPH_CON_S_OPEN.
 *
 * Returns 0 when the handshake may continue, a negative value on
 * failure.  con->mutex is dropped and retaken around ->peer_reset();
 * -EAGAIN is returned if the connection state changed meanwhile.
 */
static int process_connect(struct ceph_connection *con)
{
	u64 sup_feat = from_msgr(con->msgr)->supported_features;
	u64 req_feat = from_msgr(con->msgr)->required_features;
	u64 server_feat = le64_to_cpu(con->v1.in_reply.features);
	int ret;

	dout("process_connect on %p tag %d\n", con, con->v1.in_tag);

	if (con->v1.auth) {
		int len = le32_to_cpu(con->v1.in_reply.authorizer_len);

		/*
		 * Any connection that defines ->get_authorizer()
		 * should also define ->add_authorizer_challenge() and
		 * ->verify_authorizer_reply().
		 *
		 * See get_connect_authorizer().
		 */
		if (con->v1.in_reply.tag ==
		    CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER) {
			ret = con->ops->add_authorizer_challenge(
				con, con->v1.auth->authorizer_reply_buf, len);
			if (ret < 0)
				return ret;

			/* re-send the same connect message, now with the challenge */
			con_out_kvec_reset(con);
			__prepare_write_connect(con);
			prepare_read_connect(con);
			return 0;
		}

		if (len) {
			ret = con->ops->verify_authorizer_reply(con);
			if (ret < 0) {
				con->error_msg = "bad authorize reply";
				return ret;
			}
		}
	}

	switch (con->v1.in_reply.tag) {
	case CEPH_MSGR_TAG_FEATURES:
		pr_err("%s%lld %s feature set mismatch,"
		       " my %llx < server's %llx, missing %llx\n",
		       ENTITY_NAME(con->peer_name),
		       ceph_pr_addr(&con->peer_addr),
		       sup_feat, server_feat, server_feat & ~sup_feat);
		con->error_msg = "missing required protocol features";
		return -1;

	case CEPH_MSGR_TAG_BADPROTOVER:
		pr_err("%s%lld %s protocol version mismatch,"
		       " my %d != server's %d\n",
		       ENTITY_NAME(con->peer_name),
		       ceph_pr_addr(&con->peer_addr),
		       le32_to_cpu(con->v1.out_connect.protocol_version),
		       le32_to_cpu(con->v1.in_reply.protocol_version));
		con->error_msg = "protocol version mismatch";
		return -1;

	case CEPH_MSGR_TAG_BADAUTHORIZER:
		con->v1.auth_retry++;
		dout("process_connect %p got BADAUTHORIZER attempt %d\n", con,
		     con->v1.auth_retry);
		/* give up on the second rejection */
		if (con->v1.auth_retry == 2) {
			con->error_msg = "connect authorization failure";
			return -1;
		}
		con_out_kvec_reset(con);
		ret = prepare_write_connect(con);
		if (ret < 0)
			return ret;
		prepare_read_connect(con);
		break;

	case CEPH_MSGR_TAG_RESETSESSION:
		/*
		 * If we connected with a large connect_seq but the peer
		 * has no record of a session with us (no connection, or
		 * connect_seq == 0), they will send RESETSESSION to indicate
		 * that they must have reset their session, and may have
		 * dropped messages.
		 */
		dout("process_connect got RESET peer seq %u\n",
		     le32_to_cpu(con->v1.in_reply.connect_seq));
		pr_info("%s%lld %s session reset\n",
			ENTITY_NAME(con->peer_name),
			ceph_pr_addr(&con->peer_addr));
		ceph_con_reset_session(con);
		con_out_kvec_reset(con);
		ret = prepare_write_connect(con);
		if (ret < 0)
			return ret;
		prepare_read_connect(con);

		/* Tell ceph about it. */
		mutex_unlock(&con->mutex);
		if (con->ops->peer_reset)
			con->ops->peer_reset(con);
		mutex_lock(&con->mutex);
		/* the state may have changed while the mutex was dropped */
		if (con->state != CEPH_CON_S_V1_CONNECT_MSG)
			return -EAGAIN;
		break;

	case CEPH_MSGR_TAG_RETRY_SESSION:
		/*
		 * If we sent a smaller connect_seq than the peer has, try
		 * again with a larger value.
		 */
		dout("process_connect got RETRY_SESSION my seq %u, peer %u\n",
		     le32_to_cpu(con->v1.out_connect.connect_seq),
		     le32_to_cpu(con->v1.in_reply.connect_seq));
		con->v1.connect_seq = le32_to_cpu(con->v1.in_reply.connect_seq);
		con_out_kvec_reset(con);
		ret = prepare_write_connect(con);
		if (ret < 0)
			return ret;
		prepare_read_connect(con);
		break;

	case CEPH_MSGR_TAG_RETRY_GLOBAL:
		/*
		 * If we sent a smaller global_seq than the peer has, try
		 * again with a larger value.
		 */
		dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
		     con->v1.peer_global_seq,
		     le32_to_cpu(con->v1.in_reply.global_seq));
		ceph_get_global_seq(con->msgr,
				    le32_to_cpu(con->v1.in_reply.global_seq));
		con_out_kvec_reset(con);
		ret = prepare_write_connect(con);
		if (ret < 0)
			return ret;
		prepare_read_connect(con);
		break;

	case CEPH_MSGR_TAG_SEQ:
	case CEPH_MSGR_TAG_READY:
		/* refuse a peer that lacks any feature we require */
		if (req_feat & ~server_feat) {
			pr_err("%s%lld %s protocol feature mismatch,"
			       " my required %llx > server's %llx, need %llx\n",
			       ENTITY_NAME(con->peer_name),
			       ceph_pr_addr(&con->peer_addr),
			       req_feat, server_feat, req_feat & ~server_feat);
			con->error_msg = "missing required protocol features";
			return -1;
		}

		WARN_ON(con->state != CEPH_CON_S_V1_CONNECT_MSG);
		con->state = CEPH_CON_S_OPEN;
		con->v1.auth_retry = 0;    /* we authenticated; clear flag */
		con->v1.peer_global_seq =
			le32_to_cpu(con->v1.in_reply.global_seq);
		con->v1.connect_seq++;
		con->peer_features = server_feat;
		dout("process_connect got READY gseq %d cseq %d (%d)\n",
		     con->v1.peer_global_seq,
		     le32_to_cpu(con->v1.in_reply.connect_seq),
		     con->v1.connect_seq);
		WARN_ON(con->v1.connect_seq !=
			le32_to_cpu(con->v1.in_reply.connect_seq));

		if (con->v1.in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
			ceph_con_flag_set(con, CEPH_CON_F_LOSSYTX);

		con->delay = 0;      /* reset backoff memory */

		/* with TAG_SEQ both sides exchange their in_seq next */
		if (con->v1.in_reply.tag == CEPH_MSGR_TAG_SEQ) {
			prepare_write_seq(con);
			prepare_read_seq(con);
		} else {
			prepare_read_tag(con);
		}
		break;

	case CEPH_MSGR_TAG_WAIT:
		/*
		 * If there is a connection race (we are opening
		 * connections to each other), one of us may just have
		 * to WAIT.  This shouldn't happen if we are the
		 * client.
		 */
		con->error_msg = "protocol error, got WAIT as client";
		return -1;

	default:
		con->error_msg = "protocol error, garbage tag during connect";
		return -1;
	}
	return 0;
}

/*
 * read (part of) an ack
 *
 * Returns the result of read_partial() on con->v1.in_temp_ack.
 */
static int read_partial_ack(struct ceph_connection *con)
{
	int size = sizeof(con->v1.in_temp_ack);
	int end = size;

	return read_partial(con, end, size, &con->v1.in_temp_ack);
}

/*
 * We can finally discard anything that's been acked.
 *
 * The received sequence number is applied to sent messages when the
 * current tag is CEPH_MSGR_TAG_ACK and to requeued messages
 * otherwise; afterwards we go back to reading a tag.
 */
static void process_ack(struct ceph_connection *con)
{
	u64 ack = le64_to_cpu(con->v1.in_temp_ack);

	if (con->v1.in_tag == CEPH_MSGR_TAG_ACK)
		ceph_con_discard_sent(con, ack);
	else
		ceph_con_discard_requeued(con, ack);

	prepare_read_tag(con);
}

/*
 * Resumable read into @section until it holds @sec_len bytes;
 * section->iov_len tracks how much has arrived so far.  Once the
 * section is complete its whole contents are folded into *@crc.
 *
 * Returns 1 when the section is complete, otherwise whatever
 * ceph_tcp_recvmsg() returned (0 or a negative error).
 */
static int read_partial_message_chunk(struct ceph_connection *con,
				      struct kvec *section,
				      unsigned int sec_len, u32 *crc)
{
	int ret, left;

	BUG_ON(!section);

	while (section->iov_len < sec_len) {
		BUG_ON(section->iov_base == NULL);
		left = sec_len - section->iov_len;
		ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base +
				       section->iov_len, left);
		if (ret <= 0)
			return ret;
		section->iov_len += ret;
	}
	if (section->iov_len == sec_len)
		*crc = crc32c(*crc, section->iov_base, section->iov_len);

	return 1;
}

/*
 * Same as read_partial_message_chunk(), but *@crc is reset to 0
 * first, so it ends up covering this section alone.
 */
static inline int read_partial_message_section(struct ceph_connection *con,
					       struct kvec *section,
					       unsigned int sec_len, u32 *crc)
{
	*crc = 0;
	return read_partial_message_chunk(con, section, sec_len, crc);
}

/*
 * Read the rest of the current sparse-read extent (cursor->sr_resid
 * bytes) into the message's data pages, folding what arrives into
 * *@crc.  With the RXBOUNCE option set the data is received into
 * con->bounce_page (allocated on first use) and copied into place
 * after it has been checksummed.
 *
 * Returns 1 when the extent is complete, 0 or a negative error from
 * ceph_tcp_recvpage(), or -ENOMEM if the bounce page cannot be
 * allocated.
 */
static int read_partial_sparse_msg_extent(struct ceph_connection *con, u32 *crc)
{
	struct ceph_msg_data_cursor *cursor = &con->in_msg->cursor;
	bool do_bounce = ceph_test_opt(from_msgr(con->msgr), RXBOUNCE);

	if (do_bounce && unlikely(!con->bounce_page)) {
		con->bounce_page = alloc_page(GFP_NOIO);
		if (!con->bounce_page) {
			pr_err("failed to allocate bounce page\n");
			return -ENOMEM;
		}
	}

	while (cursor->sr_resid > 0) {
		struct page *page, *rpage;
		size_t off, len;
		int ret;

		page = ceph_msg_data_next(cursor, &off, &len);
		/* receive target: the bounce page, or the data page itself */
		rpage = do_bounce ? con->bounce_page : page;

		/* clamp to what remains in extent */
		len = min_t(int, len, cursor->sr_resid);
		ret = ceph_tcp_recvpage(con->sock, rpage, (int)off, len);
		if (ret <= 0)
			return ret;
		*crc = ceph_crc32c_page(*crc, rpage, off, ret);
		ceph_msg_data_advance(cursor, (size_t)ret);
		cursor->sr_resid -= ret;
		if (do_bounce)
			memcpy_page(page, off, rpage, off, ret);
	}
	return 1;
}

/*
 * Read (part of) the data payload of a sparse-read reply.
 *
 * Each pass first finishes whatever is pending -- a buffer handed out
 * by ->sparse_read() in con->v1.in_sr_kvec, or the current extent --
 * and then asks ->sparse_read() what comes next.  A positive return
 * from ->sparse_read() is stored in in_sr_len as the length to read;
 * zero ends the loop successfully and a negative value is passed on
 * as the error.
 *
 * Returns > 0 on completion, 0 if the socket ran dry, < 0 on error.
 * The running data crc is kept in con->in_data_crc across calls
 * unless the NOCRC option is set.
 */
static int read_partial_sparse_msg_data(struct ceph_connection *con)
{
	struct ceph_msg_data_cursor *cursor = &con->in_msg->cursor;
	bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
	u32 crc = 0;
	int ret = 1;

	if (do_datacrc)
		crc = con->in_data_crc;

	while (cursor->total_resid) {
		if (con->v1.in_sr_kvec.iov_base)
			ret = read_partial_message_chunk(con,
							 &con->v1.in_sr_kvec,
							 con->v1.in_sr_len,
							 &crc);
		else if (cursor->sr_resid > 0)
			ret = read_partial_sparse_msg_extent(con, &crc);
		if (ret <= 0)
			break;

		/* pending piece is done: clear it and fetch the next one */
		memset(&con->v1.in_sr_kvec, 0, sizeof(con->v1.in_sr_kvec));
		ret = con->ops->sparse_read(con, cursor,
				(char **)&con->v1.in_sr_kvec.iov_base);
		if (ret <= 0) {
			ret = ret ? ret : 1; /* must return > 0 to indicate success */
			break;
		}
		con->v1.in_sr_len = ret;
	}

	if (do_datacrc)
		con->in_data_crc = crc;

	return ret;
}

/*
 * Read (part of) the data payload directly into the message's data
 * pages.  Unless the NOCRC option is set, the running data crc is
 * kept in con->in_data_crc so the read can be resumed.
 *
 * Returns 1 when the payload is complete, otherwise whatever
 * ceph_tcp_recvpage() returned (0 or a negative error).
 */
static int read_partial_msg_data(struct ceph_connection *con)
{
	struct ceph_msg_data_cursor *cursor = &con->in_msg->cursor;
	bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
	struct page *page;
	size_t page_offset;
	size_t length;
	u32 crc = 0;
	int ret;

	if (do_datacrc)
		crc = con->in_data_crc;
	while (cursor->total_resid) {
		if (!cursor->resid) {
			/* current data item is used up, move to the next */
			ceph_msg_data_advance(cursor, 0);
			continue;
		}

		page = ceph_msg_data_next(cursor, &page_offset, &length);
		ret = ceph_tcp_recvpage(con->sock, page, page_offset, length);
		if (ret <= 0) {
			/* stash the crc so the next call can pick it up */
			if (do_datacrc)
				con->in_data_crc = crc;

			return ret;
		}

		if (do_datacrc)
			crc = ceph_crc32c_page(crc, page, page_offset, ret);
		ceph_msg_data_advance(cursor, (size_t)ret);
	}
	if (do_datacrc)
		con->in_data_crc = crc;

	return 1;	/* must return > 0 to indicate success */
}

/*
 * Like read_partial_msg_data(), but every piece is received into
 * con->bounce_page (allocated on first use), checksummed there and
 * then copied into the destination page.  The crc is computed here
 * regardless of the NOCRC option.
 *
 * Returns 1 when the payload is complete, 0 or a negative error from
 * ceph_tcp_recvpage(), or -ENOMEM if the bounce page cannot be
 * allocated.
 */
static int read_partial_msg_data_bounce(struct ceph_connection *con)
{
	struct ceph_msg_data_cursor *cursor = &con->in_msg->cursor;
	struct page *page;
	size_t off, len;
	u32 crc;
	int ret;

	if (unlikely(!con->bounce_page)) {
		con->bounce_page = alloc_page(GFP_NOIO);
		if (!con->bounce_page) {
			pr_err("failed to allocate bounce page\n");
			return -ENOMEM;
		}
	}

	crc = con->in_data_crc;
	while (cursor->total_resid) {
		if (!cursor->resid) {
			/* current data item is used up, move to the next */
			ceph_msg_data_advance(cursor, 0);
			continue;
		}

		page = ceph_msg_data_next(cursor, &off, &len);
		/* always receive at offset 0 of the bounce page */
		ret = ceph_tcp_recvpage(con->sock, con->bounce_page, 0, len);
		if (ret <= 0) {
			con->in_data_crc = crc;
			return ret;
		}

		crc = crc32c(crc, page_address(con->bounce_page), ret);
		memcpy_to_page(page, off, page_address(con->bounce_page), ret);

		ceph_msg_data_advance(cursor, ret);
	}
	con->in_data_crc = crc;

	return 1;	/* must return > 0 to indicate success */
}

/*
 * read (part of) a message.
 */
static int read_partial_message(struct ceph_connection *con)
{
	struct ceph_msg *m = con->in_msg;
	int size;
	int end;
	int ret;
	unsigned int front_len, middle_len, data_len;
	bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
	bool need_sign = (con->peer_features & CEPH_FEATURE_MSG_AUTH);
	u64 seq;
	u32 crc;

	dout("read_partial_message con %p msg %p\n", con, m);

	/* header */
	size = sizeof(con->v1.in_hdr);
	end = size;
	ret = read_partial(con, end, size, &con->v1.in_hdr);
	if (ret <= 0)
		return ret;

	crc = crc32c(0, &con->v1.in_hdr, offsetof(struct ceph_msg_header, crc));
	if (cpu_to_le32(crc) != con->v1.in_hdr.crc) {
		pr_err("read_partial_message bad hdr crc %u != expected %u\n",
		       crc, con->v1.in_hdr.crc);
		return -EBADMSG;
	}

	front_len = le32_to_cpu(con->v1.in_hdr.front_len);
	if (front_len > CEPH_MSG_MAX_FRONT_LEN)
		return -EIO;
	middle_len = le32_to_cpu(con->v1.in_hdr.middle_len);
	if (middle_len > CEPH_MSG_MAX_MIDDLE_LEN)
		return -EIO;
	data_len = le32_to_cpu(con->v1.in_hdr.data_len);
	if (data_len > CEPH_MSG_MAX_DATA_LEN)
		return -EIO;

	/* verify seq# */
	seq = le64_to_cpu(con->v1.in_hdr.seq);
	if ((s64)seq - (s64)con->in_seq < 1) {
		pr_info("skipping %s%lld %s seq %lld expected %lld\n",
			ENTITY_NAME(con->peer_name),
			ceph_pr_addr(&con->peer_addr),
			seq, con->in_seq + 1);
		con->v1.in_base_pos = -front_len - middle_len - data_len -
				      sizeof_footer(con);
		con->v1.in_tag = CEPH_MSGR_TAG_READY;
		return 1;
	} else if ((s64)seq -
(s64)con->in_seq > 1) { 1193 pr_err("read_partial_message bad seq %lld expected %lld\n", 1194 seq, con->in_seq + 1); 1195 con->error_msg = "bad message sequence # for incoming message"; 1196 return -EBADE; 1197 } 1198 1199 /* allocate message? */ 1200 if (!con->in_msg) { 1201 int skip = 0; 1202 1203 dout("got hdr type %d front %d data %d\n", con->v1.in_hdr.type, 1204 front_len, data_len); 1205 ret = ceph_con_in_msg_alloc(con, &con->v1.in_hdr, &skip); 1206 if (ret < 0) 1207 return ret; 1208 1209 BUG_ON((!con->in_msg) ^ skip); 1210 if (skip) { 1211 /* skip this message */ 1212 dout("alloc_msg said skip message\n"); 1213 con->v1.in_base_pos = -front_len - middle_len - 1214 data_len - sizeof_footer(con); 1215 con->v1.in_tag = CEPH_MSGR_TAG_READY; 1216 con->in_seq++; 1217 return 1; 1218 } 1219 1220 BUG_ON(!con->in_msg); 1221 BUG_ON(con->in_msg->con != con); 1222 m = con->in_msg; 1223 m->front.iov_len = 0; /* haven't read it yet */ 1224 if (m->middle) 1225 m->middle->vec.iov_len = 0; 1226 1227 /* prepare for data payload, if any */ 1228 1229 if (data_len) 1230 prepare_message_data(con->in_msg, data_len); 1231 } 1232 1233 /* front */ 1234 ret = read_partial_message_section(con, &m->front, front_len, 1235 &con->in_front_crc); 1236 if (ret <= 0) 1237 return ret; 1238 1239 /* middle */ 1240 if (m->middle) { 1241 ret = read_partial_message_section(con, &m->middle->vec, 1242 middle_len, 1243 &con->in_middle_crc); 1244 if (ret <= 0) 1245 return ret; 1246 } 1247 1248 /* (page) data */ 1249 if (data_len) { 1250 if (!m->num_data_items) 1251 return -EIO; 1252 1253 if (m->sparse_read_total) 1254 ret = read_partial_sparse_msg_data(con); 1255 else if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) 1256 ret = read_partial_msg_data_bounce(con); 1257 else 1258 ret = read_partial_msg_data(con); 1259 if (ret <= 0) 1260 return ret; 1261 } 1262 1263 /* footer */ 1264 size = sizeof_footer(con); 1265 end += size; 1266 ret = read_partial(con, end, size, &m->footer); 1267 if (ret <= 0) 1268 
return ret; 1269 1270 if (!need_sign) { 1271 m->footer.flags = m->old_footer.flags; 1272 m->footer.sig = 0; 1273 } 1274 1275 dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n", 1276 m, front_len, m->footer.front_crc, middle_len, 1277 m->footer.middle_crc, data_len, m->footer.data_crc); 1278 1279 /* crc ok? */ 1280 if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) { 1281 pr_err("read_partial_message %p front crc %u != exp. %u\n", 1282 m, con->in_front_crc, m->footer.front_crc); 1283 return -EBADMSG; 1284 } 1285 if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) { 1286 pr_err("read_partial_message %p middle crc %u != exp %u\n", 1287 m, con->in_middle_crc, m->footer.middle_crc); 1288 return -EBADMSG; 1289 } 1290 if (do_datacrc && 1291 (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 && 1292 con->in_data_crc != le32_to_cpu(m->footer.data_crc)) { 1293 pr_err("read_partial_message %p data crc %u != exp. %u\n", m, 1294 con->in_data_crc, le32_to_cpu(m->footer.data_crc)); 1295 return -EBADMSG; 1296 } 1297 1298 if (need_sign && con->ops->check_message_signature && 1299 con->ops->check_message_signature(m)) { 1300 pr_err("read_partial_message %p signature check failed\n", m); 1301 return -EBADMSG; 1302 } 1303 1304 return 1; /* done! */ 1305 } 1306 1307 static int read_keepalive_ack(struct ceph_connection *con) 1308 { 1309 struct ceph_timespec ceph_ts; 1310 size_t size = sizeof(ceph_ts); 1311 int ret = read_partial(con, size, size, &ceph_ts); 1312 if (ret <= 0) 1313 return ret; 1314 ceph_decode_timespec64(&con->last_keepalive_ack, &ceph_ts); 1315 prepare_read_tag(con); 1316 return 1; 1317 } 1318 1319 /* 1320 * Read what we can from the socket. 
1321 */ 1322 int ceph_con_v1_try_read(struct ceph_connection *con) 1323 { 1324 int ret = -1; 1325 1326 more: 1327 dout("try_read start %p state %d\n", con, con->state); 1328 if (con->state != CEPH_CON_S_V1_BANNER && 1329 con->state != CEPH_CON_S_V1_CONNECT_MSG && 1330 con->state != CEPH_CON_S_OPEN) 1331 return 0; 1332 1333 BUG_ON(!con->sock); 1334 1335 dout("try_read tag %d in_base_pos %d\n", con->v1.in_tag, 1336 con->v1.in_base_pos); 1337 1338 if (con->state == CEPH_CON_S_V1_BANNER) { 1339 ret = read_partial_banner(con); 1340 if (ret <= 0) 1341 goto out; 1342 ret = process_banner(con); 1343 if (ret < 0) 1344 goto out; 1345 1346 con->state = CEPH_CON_S_V1_CONNECT_MSG; 1347 1348 /* 1349 * Received banner is good, exchange connection info. 1350 * Do not reset out_kvec, as sending our banner raced 1351 * with receiving peer banner after connect completed. 1352 */ 1353 ret = prepare_write_connect(con); 1354 if (ret < 0) 1355 goto out; 1356 prepare_read_connect(con); 1357 1358 /* Send connection info before awaiting response */ 1359 goto out; 1360 } 1361 1362 if (con->state == CEPH_CON_S_V1_CONNECT_MSG) { 1363 ret = read_partial_connect(con); 1364 if (ret <= 0) 1365 goto out; 1366 ret = process_connect(con); 1367 if (ret < 0) 1368 goto out; 1369 goto more; 1370 } 1371 1372 WARN_ON(con->state != CEPH_CON_S_OPEN); 1373 1374 if (con->v1.in_base_pos < 0) { 1375 /* 1376 * skipping + discarding content. 1377 */ 1378 ret = ceph_tcp_recvmsg(con->sock, NULL, -con->v1.in_base_pos); 1379 if (ret <= 0) 1380 goto out; 1381 dout("skipped %d / %d bytes\n", ret, -con->v1.in_base_pos); 1382 con->v1.in_base_pos += ret; 1383 if (con->v1.in_base_pos) 1384 goto more; 1385 } 1386 if (con->v1.in_tag == CEPH_MSGR_TAG_READY) { 1387 /* 1388 * what's next? 
1389 */ 1390 ret = ceph_tcp_recvmsg(con->sock, &con->v1.in_tag, 1); 1391 if (ret <= 0) 1392 goto out; 1393 dout("try_read got tag %d\n", con->v1.in_tag); 1394 switch (con->v1.in_tag) { 1395 case CEPH_MSGR_TAG_MSG: 1396 prepare_read_message(con); 1397 break; 1398 case CEPH_MSGR_TAG_ACK: 1399 prepare_read_ack(con); 1400 break; 1401 case CEPH_MSGR_TAG_KEEPALIVE2_ACK: 1402 prepare_read_keepalive_ack(con); 1403 break; 1404 case CEPH_MSGR_TAG_CLOSE: 1405 ceph_con_close_socket(con); 1406 con->state = CEPH_CON_S_CLOSED; 1407 goto out; 1408 default: 1409 goto bad_tag; 1410 } 1411 } 1412 if (con->v1.in_tag == CEPH_MSGR_TAG_MSG) { 1413 ret = read_partial_message(con); 1414 if (ret <= 0) { 1415 switch (ret) { 1416 case -EBADMSG: 1417 con->error_msg = "bad crc/signature"; 1418 fallthrough; 1419 case -EBADE: 1420 ret = -EIO; 1421 break; 1422 case -EIO: 1423 con->error_msg = "io error"; 1424 break; 1425 } 1426 goto out; 1427 } 1428 if (con->v1.in_tag == CEPH_MSGR_TAG_READY) 1429 goto more; 1430 ceph_con_process_message(con); 1431 if (con->state == CEPH_CON_S_OPEN) 1432 prepare_read_tag(con); 1433 goto more; 1434 } 1435 if (con->v1.in_tag == CEPH_MSGR_TAG_ACK || 1436 con->v1.in_tag == CEPH_MSGR_TAG_SEQ) { 1437 /* 1438 * the final handshake seq exchange is semantically 1439 * equivalent to an ACK 1440 */ 1441 ret = read_partial_ack(con); 1442 if (ret <= 0) 1443 goto out; 1444 process_ack(con); 1445 goto more; 1446 } 1447 if (con->v1.in_tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) { 1448 ret = read_keepalive_ack(con); 1449 if (ret <= 0) 1450 goto out; 1451 goto more; 1452 } 1453 1454 out: 1455 dout("try_read done on %p ret %d\n", con, ret); 1456 return ret; 1457 1458 bad_tag: 1459 pr_err("try_read bad tag %d\n", con->v1.in_tag); 1460 con->error_msg = "protocol error, garbage tag"; 1461 ret = -1; 1462 goto out; 1463 } 1464 1465 /* 1466 * Write something to the socket. Called in a worker thread when the 1467 * socket appears to be writeable and we have something ready to send. 
1468 */ 1469 int ceph_con_v1_try_write(struct ceph_connection *con) 1470 { 1471 struct ceph_msg *msg; 1472 int ret = 1; 1473 1474 dout("try_write start %p state %d\n", con, con->state); 1475 if (con->state != CEPH_CON_S_PREOPEN && 1476 con->state != CEPH_CON_S_V1_BANNER && 1477 con->state != CEPH_CON_S_V1_CONNECT_MSG && 1478 con->state != CEPH_CON_S_OPEN) 1479 return 0; 1480 1481 /* open the socket first? */ 1482 if (con->state == CEPH_CON_S_PREOPEN) { 1483 BUG_ON(con->sock); 1484 con->state = CEPH_CON_S_V1_BANNER; 1485 1486 con_out_kvec_reset(con); 1487 prepare_write_banner(con); 1488 prepare_read_banner(con); 1489 1490 BUG_ON(con->in_msg); 1491 con->v1.in_tag = CEPH_MSGR_TAG_READY; 1492 dout("try_write initiating connect on %p new state %d\n", 1493 con, con->state); 1494 ret = ceph_tcp_connect(con); 1495 if (ret < 0) { 1496 con->error_msg = "connect error"; 1497 goto out; 1498 } 1499 } 1500 1501 more: 1502 dout("try_write out_kvec_bytes %d\n", con->v1.out_kvec_bytes); 1503 BUG_ON(!con->sock); 1504 1505 /* kvec data queued? */ 1506 if (con->v1.out_kvec_left) { 1507 ret = write_partial_kvec(con); 1508 if (ret <= 0) 1509 goto out; 1510 } 1511 if (con->v1.out_skip) { 1512 ret = write_partial_skip(con); 1513 if (ret <= 0) 1514 goto out; 1515 } 1516 1517 /* msg pages? */ 1518 msg = con->out_msg; 1519 if (msg) { 1520 if (con->v1.out_msg_done) { 1521 ceph_msg_put(msg); 1522 con->out_msg = NULL; /* we're done with this one */ 1523 goto do_next; 1524 } 1525 1526 ret = write_partial_message_data(con, msg); 1527 if (ret == 1) 1528 goto more; /* we need to send the footer, too! */ 1529 if (ret == 0) 1530 goto out; 1531 if (ret < 0) { 1532 dout("try_write write_partial_message_data err %d\n", 1533 ret); 1534 goto out; 1535 } 1536 } 1537 1538 do_next: 1539 if (con->state == CEPH_CON_S_OPEN) { 1540 if (ceph_con_flag_test_and_clear(con, 1541 CEPH_CON_F_KEEPALIVE_PENDING)) { 1542 prepare_write_keepalive(con); 1543 goto more; 1544 } 1545 /* is anything else pending? 
*/ 1546 if ((msg = ceph_con_get_out_msg(con)) != NULL) { 1547 prepare_write_message(con, msg); 1548 goto more; 1549 } 1550 if (con->in_seq > con->in_seq_acked) { 1551 prepare_write_ack(con); 1552 goto more; 1553 } 1554 } 1555 1556 /* Nothing to do! */ 1557 ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING); 1558 dout("try_write nothing else to write.\n"); 1559 ret = 0; 1560 out: 1561 dout("try_write done on %p ret %d\n", con, ret); 1562 return ret; 1563 } 1564 1565 void ceph_con_v1_revoke(struct ceph_connection *con, struct ceph_msg *msg) 1566 { 1567 WARN_ON(con->v1.out_skip); 1568 /* footer */ 1569 if (con->v1.out_msg_done) { 1570 con->v1.out_skip += con_out_kvec_skip(con); 1571 } else { 1572 WARN_ON(!msg->data_length); 1573 con->v1.out_skip += sizeof_footer(con); 1574 } 1575 /* data, middle, front */ 1576 if (msg->data_length) 1577 con->v1.out_skip += msg->cursor.total_resid; 1578 if (msg->middle) 1579 con->v1.out_skip += con_out_kvec_skip(con); 1580 con->v1.out_skip += con_out_kvec_skip(con); 1581 1582 dout("%s con %p out_kvec_bytes %d out_skip %d\n", __func__, con, 1583 con->v1.out_kvec_bytes, con->v1.out_skip); 1584 } 1585 1586 void ceph_con_v1_revoke_incoming(struct ceph_connection *con) 1587 { 1588 unsigned int front_len = le32_to_cpu(con->v1.in_hdr.front_len); 1589 unsigned int middle_len = le32_to_cpu(con->v1.in_hdr.middle_len); 1590 unsigned int data_len = le32_to_cpu(con->v1.in_hdr.data_len); 1591 1592 /* skip rest of message */ 1593 con->v1.in_base_pos = con->v1.in_base_pos - 1594 sizeof(struct ceph_msg_header) - 1595 front_len - 1596 middle_len - 1597 data_len - 1598 sizeof(struct ceph_msg_footer); 1599 1600 con->v1.in_tag = CEPH_MSGR_TAG_READY; 1601 con->in_seq++; 1602 1603 dout("%s con %p in_base_pos %d\n", __func__, con, con->v1.in_base_pos); 1604 } 1605 1606 bool ceph_con_v1_opened(struct ceph_connection *con) 1607 { 1608 return con->v1.connect_seq; 1609 } 1610 1611 void ceph_con_v1_reset_session(struct ceph_connection *con) 1612 { 1613 
con->v1.connect_seq = 0; 1614 con->v1.peer_global_seq = 0; 1615 } 1616 1617 void ceph_con_v1_reset_protocol(struct ceph_connection *con) 1618 { 1619 con->v1.out_skip = 0; 1620 } 1621