1 // SPDX-License-Identifier: GPL-2.0 2 #include <linux/ceph/ceph_debug.h> 3 4 #include <linux/bvec.h> 5 #include <linux/crc32c.h> 6 #include <linux/net.h> 7 #include <linux/socket.h> 8 #include <net/sock.h> 9 10 #include <linux/ceph/ceph_features.h> 11 #include <linux/ceph/decode.h> 12 #include <linux/ceph/libceph.h> 13 #include <linux/ceph/messenger.h> 14 15 /* static tag bytes (protocol control messages) */ 16 static char tag_msg = CEPH_MSGR_TAG_MSG; 17 static char tag_ack = CEPH_MSGR_TAG_ACK; 18 static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE; 19 static char tag_keepalive2 = CEPH_MSGR_TAG_KEEPALIVE2; 20 21 /* 22 * If @buf is NULL, discard up to @len bytes. 23 */ 24 static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len) 25 { 26 struct kvec iov = {buf, len}; 27 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; 28 int r; 29 30 if (!buf) 31 msg.msg_flags |= MSG_TRUNC; 32 33 iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, len); 34 r = sock_recvmsg(sock, &msg, msg.msg_flags); 35 if (r == -EAGAIN) 36 r = 0; 37 return r; 38 } 39 40 static int ceph_tcp_recvpage(struct socket *sock, struct page *page, 41 int page_offset, size_t length) 42 { 43 struct bio_vec bvec; 44 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; 45 int r; 46 47 BUG_ON(page_offset + length > PAGE_SIZE); 48 bvec_set_page(&bvec, page, length, page_offset); 49 iov_iter_bvec(&msg.msg_iter, ITER_DEST, &bvec, 1, length); 50 r = sock_recvmsg(sock, &msg, msg.msg_flags); 51 if (r == -EAGAIN) 52 r = 0; 53 return r; 54 } 55 56 /* 57 * write something. @more is true if caller will be sending more data 58 * shortly. 59 */ 60 static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov, 61 size_t kvlen, size_t len, bool more) 62 { 63 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; 64 int r; 65 66 if (more) 67 msg.msg_flags |= MSG_MORE; 68 else 69 msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */ 70 71 r = kernel_sendmsg(sock, &msg, iov, kvlen, len); 72 if (r == -EAGAIN) 73 r = 0; 74 return r; 75 } 76 77 /* 78 * @more: MSG_MORE or 0. 79 */ 80 static int ceph_tcp_sendpage(struct socket *sock, struct page *page, 81 int offset, size_t size, int more) 82 { 83 struct msghdr msg = { 84 .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL | more, 85 }; 86 struct bio_vec bvec; 87 int ret; 88 89 /* 90 * MSG_SPLICE_PAGES cannot properly handle pages with page_count == 0, 91 * we need to fall back to sendmsg if that's the case. 92 * 93 * Same goes for slab pages: skb_can_coalesce() allows 94 * coalescing neighboring slab objects into a single frag which 95 * triggers one of hardened usercopy checks. 96 */ 97 if (sendpage_ok(page)) 98 msg.msg_flags |= MSG_SPLICE_PAGES; 99 100 bvec_set_page(&bvec, page, size, offset); 101 iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size); 102 103 ret = sock_sendmsg(sock, &msg); 104 if (ret == -EAGAIN) 105 ret = 0; 106 107 return ret; 108 } 109 110 static void con_out_kvec_reset(struct ceph_connection *con) 111 { 112 BUG_ON(con->v1.out_skip); 113 114 con->v1.out_kvec_left = 0; 115 con->v1.out_kvec_bytes = 0; 116 con->v1.out_kvec_cur = &con->v1.out_kvec[0]; 117 } 118 119 static void con_out_kvec_add(struct ceph_connection *con, 120 size_t size, void *data) 121 { 122 int index = con->v1.out_kvec_left; 123 124 BUG_ON(con->v1.out_skip); 125 BUG_ON(index >= ARRAY_SIZE(con->v1.out_kvec)); 126 127 con->v1.out_kvec[index].iov_len = size; 128 con->v1.out_kvec[index].iov_base = data; 129 con->v1.out_kvec_left++; 130 con->v1.out_kvec_bytes += size; 131 } 132 133 /* 134 * Chop off a kvec from the end. Return residual number of bytes for 135 * that kvec, i.e. how many bytes would have been written if the kvec 136 * hadn't been nuked. 137 */ 138 static int con_out_kvec_skip(struct ceph_connection *con) 139 { 140 int skip = 0; 141 142 if (con->v1.out_kvec_bytes > 0) { 143 skip = con->v1.out_kvec_cur[con->v1.out_kvec_left - 1].iov_len; 144 BUG_ON(con->v1.out_kvec_bytes < skip); 145 BUG_ON(!con->v1.out_kvec_left); 146 con->v1.out_kvec_bytes -= skip; 147 con->v1.out_kvec_left--; 148 } 149 150 return skip; 151 } 152 153 static size_t sizeof_footer(struct ceph_connection *con) 154 { 155 return (con->peer_features & CEPH_FEATURE_MSG_AUTH) ? 156 sizeof(struct ceph_msg_footer) : 157 sizeof(struct ceph_msg_footer_old); 158 } 159 160 static void prepare_message_data(struct ceph_msg *msg, u32 data_len) 161 { 162 /* Initialize data cursor if it's not a sparse read */ 163 if (!msg->sparse_read) 164 ceph_msg_data_cursor_init(&msg->cursor, msg, data_len); 165 } 166 167 /* 168 * Prepare footer for currently outgoing message, and finish things 169 * off. Assumes out_kvec* are already valid.. we just add on to the end. 170 */ 171 static void prepare_write_message_footer(struct ceph_connection *con) 172 { 173 struct ceph_msg *m = con->out_msg; 174 175 m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE; 176 177 dout("prepare_write_message_footer %p\n", con); 178 con_out_kvec_add(con, sizeof_footer(con), &m->footer); 179 if (con->peer_features & CEPH_FEATURE_MSG_AUTH) { 180 if (con->ops->sign_message) 181 con->ops->sign_message(m); 182 else 183 m->footer.sig = 0; 184 } else { 185 m->old_footer.flags = m->footer.flags; 186 } 187 con->v1.out_more = m->more_to_follow; 188 con->v1.out_msg_done = true; 189 } 190 191 /* 192 * Prepare headers for the next outgoing message. 193 */ 194 static void prepare_write_message(struct ceph_connection *con) 195 { 196 struct ceph_msg *m; 197 u32 crc; 198 199 con_out_kvec_reset(con); 200 con->v1.out_msg_done = false; 201 202 /* Sneak an ack in there first? If we can get it into the same 203 * TCP packet that's a good thing. */ 204 if (con->in_seq > con->in_seq_acked) { 205 con->in_seq_acked = con->in_seq; 206 con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); 207 con->v1.out_temp_ack = cpu_to_le64(con->in_seq_acked); 208 con_out_kvec_add(con, sizeof(con->v1.out_temp_ack), 209 &con->v1.out_temp_ack); 210 } 211 212 ceph_con_get_out_msg(con); 213 m = con->out_msg; 214 215 dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n", 216 m, con->out_seq, le16_to_cpu(m->hdr.type), 217 le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len), 218 m->data_length); 219 WARN_ON(m->front.iov_len != le32_to_cpu(m->hdr.front_len)); 220 WARN_ON(m->data_length != le32_to_cpu(m->hdr.data_len)); 221 222 /* tag + hdr + front + middle */ 223 con_out_kvec_add(con, sizeof (tag_msg), &tag_msg); 224 con_out_kvec_add(con, sizeof(con->v1.out_hdr), &con->v1.out_hdr); 225 con_out_kvec_add(con, m->front.iov_len, m->front.iov_base); 226 227 if (m->middle) 228 con_out_kvec_add(con, m->middle->vec.iov_len, 229 m->middle->vec.iov_base); 230 231 /* fill in hdr crc and finalize hdr */ 232 crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc)); 233 con->out_msg->hdr.crc = cpu_to_le32(crc); 234 memcpy(&con->v1.out_hdr, &con->out_msg->hdr, sizeof(con->v1.out_hdr)); 235 236 /* fill in front and middle crc, footer */ 237 crc = crc32c(0, m->front.iov_base, m->front.iov_len); 238 con->out_msg->footer.front_crc = cpu_to_le32(crc); 239 if (m->middle) { 240 crc = crc32c(0, m->middle->vec.iov_base, 241 m->middle->vec.iov_len); 242 con->out_msg->footer.middle_crc = cpu_to_le32(crc); 243 } else 244 con->out_msg->footer.middle_crc = 0; 245 dout("%s front_crc %u middle_crc %u\n", __func__, 246 le32_to_cpu(con->out_msg->footer.front_crc), 247 le32_to_cpu(con->out_msg->footer.middle_crc)); 248 con->out_msg->footer.flags = 0; 249 250 /* is there a data payload? */ 251 con->out_msg->footer.data_crc = 0; 252 if (m->data_length) { 253 prepare_message_data(con->out_msg, m->data_length); 254 con->v1.out_more = 1; /* data + footer will follow */ 255 } else { 256 /* no, queue up footer too and be done */ 257 prepare_write_message_footer(con); 258 } 259 260 ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); 261 } 262 263 /* 264 * Prepare an ack. 265 */ 266 static void prepare_write_ack(struct ceph_connection *con) 267 { 268 dout("prepare_write_ack %p %llu -> %llu\n", con, 269 con->in_seq_acked, con->in_seq); 270 con->in_seq_acked = con->in_seq; 271 272 con_out_kvec_reset(con); 273 274 con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); 275 276 con->v1.out_temp_ack = cpu_to_le64(con->in_seq_acked); 277 con_out_kvec_add(con, sizeof(con->v1.out_temp_ack), 278 &con->v1.out_temp_ack); 279 280 con->v1.out_more = 1; /* more will follow.. eventually.. */ 281 ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); 282 } 283 284 /* 285 * Prepare to share the seq during handshake 286 */ 287 static void prepare_write_seq(struct ceph_connection *con) 288 { 289 dout("prepare_write_seq %p %llu -> %llu\n", con, 290 con->in_seq_acked, con->in_seq); 291 con->in_seq_acked = con->in_seq; 292 293 con_out_kvec_reset(con); 294 295 con->v1.out_temp_ack = cpu_to_le64(con->in_seq_acked); 296 con_out_kvec_add(con, sizeof(con->v1.out_temp_ack), 297 &con->v1.out_temp_ack); 298 299 ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); 300 } 301 302 /* 303 * Prepare to write keepalive byte. 304 */ 305 static void prepare_write_keepalive(struct ceph_connection *con) 306 { 307 dout("prepare_write_keepalive %p\n", con); 308 con_out_kvec_reset(con); 309 if (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2) { 310 struct timespec64 now; 311 312 ktime_get_real_ts64(&now); 313 con_out_kvec_add(con, sizeof(tag_keepalive2), &tag_keepalive2); 314 ceph_encode_timespec64(&con->v1.out_temp_keepalive2, &now); 315 con_out_kvec_add(con, sizeof(con->v1.out_temp_keepalive2), 316 &con->v1.out_temp_keepalive2); 317 } else { 318 con_out_kvec_add(con, sizeof(tag_keepalive), &tag_keepalive); 319 } 320 ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); 321 } 322 323 /* 324 * Connection negotiation. 325 */ 326 327 static int get_connect_authorizer(struct ceph_connection *con) 328 { 329 struct ceph_auth_handshake *auth; 330 int auth_proto; 331 332 if (!con->ops->get_authorizer) { 333 con->v1.auth = NULL; 334 con->v1.out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN; 335 con->v1.out_connect.authorizer_len = 0; 336 return 0; 337 } 338 339 auth = con->ops->get_authorizer(con, &auth_proto, con->v1.auth_retry); 340 if (IS_ERR(auth)) 341 return PTR_ERR(auth); 342 343 con->v1.auth = auth; 344 con->v1.out_connect.authorizer_protocol = cpu_to_le32(auth_proto); 345 con->v1.out_connect.authorizer_len = 346 cpu_to_le32(auth->authorizer_buf_len); 347 return 0; 348 } 349 350 /* 351 * We connected to a peer and are saying hello. 352 */ 353 static void prepare_write_banner(struct ceph_connection *con) 354 { 355 con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER); 356 con_out_kvec_add(con, sizeof (con->msgr->my_enc_addr), 357 &con->msgr->my_enc_addr); 358 359 con->v1.out_more = 0; 360 ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); 361 } 362 363 static void __prepare_write_connect(struct ceph_connection *con) 364 { 365 con_out_kvec_add(con, sizeof(con->v1.out_connect), 366 &con->v1.out_connect); 367 if (con->v1.auth) 368 con_out_kvec_add(con, con->v1.auth->authorizer_buf_len, 369 con->v1.auth->authorizer_buf); 370 371 con->v1.out_more = 0; 372 ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); 373 } 374 375 static int prepare_write_connect(struct ceph_connection *con) 376 { 377 unsigned int global_seq = ceph_get_global_seq(con->msgr, 0); 378 int proto; 379 int ret; 380 381 switch (con->peer_name.type) { 382 case CEPH_ENTITY_TYPE_MON: 383 proto = CEPH_MONC_PROTOCOL; 384 break; 385 case CEPH_ENTITY_TYPE_OSD: 386 proto = CEPH_OSDC_PROTOCOL; 387 break; 388 case CEPH_ENTITY_TYPE_MDS: 389 proto = CEPH_MDSC_PROTOCOL; 390 break; 391 default: 392 BUG(); 393 } 394 395 dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, 396 con->v1.connect_seq, global_seq, proto); 397 398 con->v1.out_connect.features = 399 cpu_to_le64(from_msgr(con->msgr)->supported_features); 400 con->v1.out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); 401 con->v1.out_connect.connect_seq = cpu_to_le32(con->v1.connect_seq); 402 con->v1.out_connect.global_seq = cpu_to_le32(global_seq); 403 con->v1.out_connect.protocol_version = cpu_to_le32(proto); 404 con->v1.out_connect.flags = 0; 405 406 ret = get_connect_authorizer(con); 407 if (ret) 408 return ret; 409 410 __prepare_write_connect(con); 411 return 0; 412 } 413 414 /* 415 * write as much of pending kvecs to the socket as we can. 416 * 1 -> done 417 * 0 -> socket full, but more to do 418 * <0 -> error 419 */ 420 static int write_partial_kvec(struct ceph_connection *con) 421 { 422 int ret; 423 424 dout("write_partial_kvec %p %d left\n", con, con->v1.out_kvec_bytes); 425 while (con->v1.out_kvec_bytes > 0) { 426 ret = ceph_tcp_sendmsg(con->sock, con->v1.out_kvec_cur, 427 con->v1.out_kvec_left, 428 con->v1.out_kvec_bytes, 429 con->v1.out_more); 430 if (ret <= 0) 431 goto out; 432 con->v1.out_kvec_bytes -= ret; 433 if (!con->v1.out_kvec_bytes) 434 break; /* done */ 435 436 /* account for full iov entries consumed */ 437 while (ret >= con->v1.out_kvec_cur->iov_len) { 438 BUG_ON(!con->v1.out_kvec_left); 439 ret -= con->v1.out_kvec_cur->iov_len; 440 con->v1.out_kvec_cur++; 441 con->v1.out_kvec_left--; 442 } 443 /* and for a partially-consumed entry */ 444 if (ret) { 445 con->v1.out_kvec_cur->iov_len -= ret; 446 con->v1.out_kvec_cur->iov_base += ret; 447 } 448 } 449 con->v1.out_kvec_left = 0; 450 ret = 1; 451 out: 452 dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con, 453 con->v1.out_kvec_bytes, con->v1.out_kvec_left, ret); 454 return ret; /* done! */ 455 } 456 457 /* 458 * Write as much message data payload as we can. If we finish, queue 459 * up the footer. 460 * 1 -> done, footer is now queued in out_kvec[]. 461 * 0 -> socket full, but more to do 462 * <0 -> error 463 */ 464 static int write_partial_message_data(struct ceph_connection *con) 465 { 466 struct ceph_msg *msg = con->out_msg; 467 struct ceph_msg_data_cursor *cursor = &msg->cursor; 468 bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); 469 u32 crc; 470 471 dout("%s %p msg %p\n", __func__, con, msg); 472 473 if (!msg->num_data_items) 474 return -EINVAL; 475 476 /* 477 * Iterate through each page that contains data to be 478 * written, and send as much as possible for each. 479 * 480 * If we are calculating the data crc (the default), we will 481 * need to map the page. If we have no pages, they have 482 * been revoked, so use the zero page. 483 */ 484 crc = do_datacrc ? le32_to_cpu(msg->footer.data_crc) : 0; 485 while (cursor->total_resid) { 486 struct page *page; 487 size_t page_offset; 488 size_t length; 489 int ret; 490 491 if (!cursor->resid) { 492 ceph_msg_data_advance(cursor, 0); 493 continue; 494 } 495 496 page = ceph_msg_data_next(cursor, &page_offset, &length); 497 ret = ceph_tcp_sendpage(con->sock, page, page_offset, length, 498 MSG_MORE); 499 if (ret <= 0) { 500 if (do_datacrc) 501 msg->footer.data_crc = cpu_to_le32(crc); 502 503 return ret; 504 } 505 if (do_datacrc && cursor->need_crc) 506 crc = ceph_crc32c_page(crc, page, page_offset, length); 507 ceph_msg_data_advance(cursor, (size_t)ret); 508 } 509 510 dout("%s %p msg %p done\n", __func__, con, msg); 511 512 /* prepare and queue up footer, too */ 513 if (do_datacrc) 514 msg->footer.data_crc = cpu_to_le32(crc); 515 else 516 msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; 517 con_out_kvec_reset(con); 518 prepare_write_message_footer(con); 519 520 return 1; /* must return > 0 to indicate success */ 521 } 522 523 /* 524 * write some zeros 525 */ 526 static int write_partial_skip(struct ceph_connection *con) 527 { 528 int ret; 529 530 dout("%s %p %d left\n", __func__, con, con->v1.out_skip); 531 while (con->v1.out_skip > 0) { 532 size_t size = min(con->v1.out_skip, (int)PAGE_SIZE); 533 534 ret = ceph_tcp_sendpage(con->sock, ceph_zero_page, 0, size, 535 MSG_MORE); 536 if (ret <= 0) 537 goto out; 538 con->v1.out_skip -= ret; 539 } 540 ret = 1; 541 out: 542 return ret; 543 } 544 545 /* 546 * Prepare to read connection handshake, or an ack. 547 */ 548 static void prepare_read_banner(struct ceph_connection *con) 549 { 550 dout("prepare_read_banner %p\n", con); 551 con->v1.in_base_pos = 0; 552 } 553 554 static void prepare_read_connect(struct ceph_connection *con) 555 { 556 dout("prepare_read_connect %p\n", con); 557 con->v1.in_base_pos = 0; 558 } 559 560 static void prepare_read_ack(struct ceph_connection *con) 561 { 562 dout("prepare_read_ack %p\n", con); 563 con->v1.in_base_pos = 0; 564 } 565 566 static void prepare_read_seq(struct ceph_connection *con) 567 { 568 dout("prepare_read_seq %p\n", con); 569 con->v1.in_base_pos = 0; 570 con->v1.in_tag = CEPH_MSGR_TAG_SEQ; 571 } 572 573 static void prepare_read_tag(struct ceph_connection *con) 574 { 575 dout("prepare_read_tag %p\n", con); 576 con->v1.in_base_pos = 0; 577 con->v1.in_tag = CEPH_MSGR_TAG_READY; 578 } 579 580 static void prepare_read_keepalive_ack(struct ceph_connection *con) 581 { 582 dout("prepare_read_keepalive_ack %p\n", con); 583 con->v1.in_base_pos = 0; 584 } 585 586 /* 587 * Prepare to read a message. 588 */ 589 static int prepare_read_message(struct ceph_connection *con) 590 { 591 dout("prepare_read_message %p\n", con); 592 BUG_ON(con->in_msg != NULL); 593 con->v1.in_base_pos = 0; 594 con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0; 595 return 0; 596 } 597 598 static int read_partial(struct ceph_connection *con, 599 int end, int size, void *object) 600 { 601 while (con->v1.in_base_pos < end) { 602 int left = end - con->v1.in_base_pos; 603 int have = size - left; 604 int ret = ceph_tcp_recvmsg(con->sock, object + have, left); 605 if (ret <= 0) 606 return ret; 607 con->v1.in_base_pos += ret; 608 } 609 return 1; 610 } 611 612 /* 613 * Read all or part of the connect-side handshake on a new connection 614 */ 615 static int read_partial_banner(struct ceph_connection *con) 616 { 617 int size; 618 int end; 619 int ret; 620 621 dout("read_partial_banner %p at %d\n", con, con->v1.in_base_pos); 622 623 /* peer's banner */ 624 size = strlen(CEPH_BANNER); 625 end = size; 626 ret = read_partial(con, end, size, con->v1.in_banner); 627 if (ret <= 0) 628 goto out; 629 630 size = sizeof(con->v1.actual_peer_addr); 631 end += size; 632 ret = read_partial(con, end, size, &con->v1.actual_peer_addr); 633 if (ret <= 0) 634 goto out; 635 ceph_decode_banner_addr(&con->v1.actual_peer_addr); 636 637 size = sizeof(con->v1.peer_addr_for_me); 638 end += size; 639 ret = read_partial(con, end, size, &con->v1.peer_addr_for_me); 640 if (ret <= 0) 641 goto out; 642 ceph_decode_banner_addr(&con->v1.peer_addr_for_me); 643 644 out: 645 return ret; 646 } 647 648 static int read_partial_connect(struct ceph_connection *con) 649 { 650 int size; 651 int end; 652 int ret; 653 654 dout("read_partial_connect %p at %d\n", con, con->v1.in_base_pos); 655 656 size = sizeof(con->v1.in_reply); 657 end = size; 658 ret = read_partial(con, end, size, &con->v1.in_reply); 659 if (ret <= 0) 660 goto out; 661 662 if (con->v1.auth) { 663 size = le32_to_cpu(con->v1.in_reply.authorizer_len); 664 if (size > con->v1.auth->authorizer_reply_buf_len) { 665 pr_err("authorizer reply too big: %d > %zu\n", size, 666 con->v1.auth->authorizer_reply_buf_len); 667 ret = -EINVAL; 668 goto out; 669 } 670 671 end += size; 672 ret = read_partial(con, end, size, 673 con->v1.auth->authorizer_reply_buf); 674 if (ret <= 0) 675 goto out; 676 } 677 678 dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n", 679 con, con->v1.in_reply.tag, 680 le32_to_cpu(con->v1.in_reply.connect_seq), 681 le32_to_cpu(con->v1.in_reply.global_seq)); 682 out: 683 return ret; 684 } 685 686 /* 687 * Verify the hello banner looks okay. 688 */ 689 static int verify_hello(struct ceph_connection *con) 690 { 691 if (memcmp(con->v1.in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) { 692 pr_err("connect to %s got bad banner\n", 693 ceph_pr_addr(&con->peer_addr)); 694 con->error_msg = "protocol error, bad banner"; 695 return -1; 696 } 697 return 0; 698 } 699 700 static int process_banner(struct ceph_connection *con) 701 { 702 struct ceph_entity_addr *my_addr = &con->msgr->inst.addr; 703 704 dout("process_banner on %p\n", con); 705 706 if (verify_hello(con) < 0) 707 return -1; 708 709 /* 710 * Make sure the other end is who we wanted. note that the other 711 * end may not yet know their ip address, so if it's 0.0.0.0, give 712 * them the benefit of the doubt. 713 */ 714 if (memcmp(&con->peer_addr, &con->v1.actual_peer_addr, 715 sizeof(con->peer_addr)) != 0 && 716 !(ceph_addr_is_blank(&con->v1.actual_peer_addr) && 717 con->v1.actual_peer_addr.nonce == con->peer_addr.nonce)) { 718 pr_warn("wrong peer, want %s/%u, got %s/%u\n", 719 ceph_pr_addr(&con->peer_addr), 720 le32_to_cpu(con->peer_addr.nonce), 721 ceph_pr_addr(&con->v1.actual_peer_addr), 722 le32_to_cpu(con->v1.actual_peer_addr.nonce)); 723 con->error_msg = "wrong peer at address"; 724 return -1; 725 } 726 727 /* 728 * did we learn our address? 729 */ 730 if (ceph_addr_is_blank(my_addr)) { 731 memcpy(&my_addr->in_addr, 732 &con->v1.peer_addr_for_me.in_addr, 733 sizeof(con->v1.peer_addr_for_me.in_addr)); 734 ceph_addr_set_port(my_addr, 0); 735 ceph_encode_my_addr(con->msgr); 736 dout("process_banner learned my addr is %s\n", 737 ceph_pr_addr(my_addr)); 738 } 739 740 return 0; 741 } 742 743 static int process_connect(struct ceph_connection *con) 744 { 745 u64 sup_feat = from_msgr(con->msgr)->supported_features; 746 u64 req_feat = from_msgr(con->msgr)->required_features; 747 u64 server_feat = le64_to_cpu(con->v1.in_reply.features); 748 int ret; 749 750 dout("process_connect on %p tag %d\n", con, con->v1.in_tag); 751 752 if (con->v1.auth) { 753 int len = le32_to_cpu(con->v1.in_reply.authorizer_len); 754 755 /* 756 * Any connection that defines ->get_authorizer() 757 * should also define ->add_authorizer_challenge() and 758 * ->verify_authorizer_reply(). 759 * 760 * See get_connect_authorizer(). 761 */ 762 if (con->v1.in_reply.tag == 763 CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER) { 764 ret = con->ops->add_authorizer_challenge( 765 con, con->v1.auth->authorizer_reply_buf, len); 766 if (ret < 0) 767 return ret; 768 769 con_out_kvec_reset(con); 770 __prepare_write_connect(con); 771 prepare_read_connect(con); 772 return 0; 773 } 774 775 if (len) { 776 ret = con->ops->verify_authorizer_reply(con); 777 if (ret < 0) { 778 con->error_msg = "bad authorize reply"; 779 return ret; 780 } 781 } 782 } 783 784 switch (con->v1.in_reply.tag) { 785 case CEPH_MSGR_TAG_FEATURES: 786 pr_err("%s%lld %s feature set mismatch," 787 " my %llx < server's %llx, missing %llx\n", 788 ENTITY_NAME(con->peer_name), 789 ceph_pr_addr(&con->peer_addr), 790 sup_feat, server_feat, server_feat & ~sup_feat); 791 con->error_msg = "missing required protocol features"; 792 return -1; 793 794 case CEPH_MSGR_TAG_BADPROTOVER: 795 pr_err("%s%lld %s protocol version mismatch," 796 " my %d != server's %d\n", 797 ENTITY_NAME(con->peer_name), 798 ceph_pr_addr(&con->peer_addr), 799 le32_to_cpu(con->v1.out_connect.protocol_version), 800 le32_to_cpu(con->v1.in_reply.protocol_version)); 801 con->error_msg = "protocol version mismatch"; 802 return -1; 803 804 case CEPH_MSGR_TAG_BADAUTHORIZER: 805 con->v1.auth_retry++; 806 dout("process_connect %p got BADAUTHORIZER attempt %d\n", con, 807 con->v1.auth_retry); 808 if (con->v1.auth_retry == 2) { 809 con->error_msg = "connect authorization failure"; 810 return -1; 811 } 812 con_out_kvec_reset(con); 813 ret = prepare_write_connect(con); 814 if (ret < 0) 815 return ret; 816 prepare_read_connect(con); 817 break; 818 819 case CEPH_MSGR_TAG_RESETSESSION: 820 /* 821 * If we connected with a large connect_seq but the peer 822 * has no record of a session with us (no connection, or 823 * connect_seq == 0), they will send RESETSESION to indicate 824 * that they must have reset their session, and may have 825 * dropped messages. 826 */ 827 dout("process_connect got RESET peer seq %u\n", 828 le32_to_cpu(con->v1.in_reply.connect_seq)); 829 pr_info("%s%lld %s session reset\n", 830 ENTITY_NAME(con->peer_name), 831 ceph_pr_addr(&con->peer_addr)); 832 ceph_con_reset_session(con); 833 con_out_kvec_reset(con); 834 ret = prepare_write_connect(con); 835 if (ret < 0) 836 return ret; 837 prepare_read_connect(con); 838 839 /* Tell ceph about it. */ 840 mutex_unlock(&con->mutex); 841 if (con->ops->peer_reset) 842 con->ops->peer_reset(con); 843 mutex_lock(&con->mutex); 844 if (con->state != CEPH_CON_S_V1_CONNECT_MSG) 845 return -EAGAIN; 846 break; 847 848 case CEPH_MSGR_TAG_RETRY_SESSION: 849 /* 850 * If we sent a smaller connect_seq than the peer has, try 851 * again with a larger value. 852 */ 853 dout("process_connect got RETRY_SESSION my seq %u, peer %u\n", 854 le32_to_cpu(con->v1.out_connect.connect_seq), 855 le32_to_cpu(con->v1.in_reply.connect_seq)); 856 con->v1.connect_seq = le32_to_cpu(con->v1.in_reply.connect_seq); 857 con_out_kvec_reset(con); 858 ret = prepare_write_connect(con); 859 if (ret < 0) 860 return ret; 861 prepare_read_connect(con); 862 break; 863 864 case CEPH_MSGR_TAG_RETRY_GLOBAL: 865 /* 866 * If we sent a smaller global_seq than the peer has, try 867 * again with a larger value. 868 */ 869 dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n", 870 con->v1.peer_global_seq, 871 le32_to_cpu(con->v1.in_reply.global_seq)); 872 ceph_get_global_seq(con->msgr, 873 le32_to_cpu(con->v1.in_reply.global_seq)); 874 con_out_kvec_reset(con); 875 ret = prepare_write_connect(con); 876 if (ret < 0) 877 return ret; 878 prepare_read_connect(con); 879 break; 880 881 case CEPH_MSGR_TAG_SEQ: 882 case CEPH_MSGR_TAG_READY: 883 if (req_feat & ~server_feat) { 884 pr_err("%s%lld %s protocol feature mismatch," 885 " my required %llx > server's %llx, need %llx\n", 886 ENTITY_NAME(con->peer_name), 887 ceph_pr_addr(&con->peer_addr), 888 req_feat, server_feat, req_feat & ~server_feat); 889 con->error_msg = "missing required protocol features"; 890 return -1; 891 } 892 893 WARN_ON(con->state != CEPH_CON_S_V1_CONNECT_MSG); 894 con->state = CEPH_CON_S_OPEN; 895 con->v1.auth_retry = 0; /* we authenticated; clear flag */ 896 con->v1.peer_global_seq = 897 le32_to_cpu(con->v1.in_reply.global_seq); 898 con->v1.connect_seq++; 899 con->peer_features = server_feat; 900 dout("process_connect got READY gseq %d cseq %d (%d)\n", 901 con->v1.peer_global_seq, 902 le32_to_cpu(con->v1.in_reply.connect_seq), 903 con->v1.connect_seq); 904 WARN_ON(con->v1.connect_seq != 905 le32_to_cpu(con->v1.in_reply.connect_seq)); 906 907 if (con->v1.in_reply.flags & CEPH_MSG_CONNECT_LOSSY) 908 ceph_con_flag_set(con, CEPH_CON_F_LOSSYTX); 909 910 con->delay = 0; /* reset backoff memory */ 911 912 if (con->v1.in_reply.tag == CEPH_MSGR_TAG_SEQ) { 913 prepare_write_seq(con); 914 prepare_read_seq(con); 915 } else { 916 prepare_read_tag(con); 917 } 918 break; 919 920 case CEPH_MSGR_TAG_WAIT: 921 /* 922 * If there is a connection race (we are opening 923 * connections to each other), one of us may just have 924 * to WAIT. This shouldn't happen if we are the 925 * client. 926 */ 927 con->error_msg = "protocol error, got WAIT as client"; 928 return -1; 929 930 default: 931 con->error_msg = "protocol error, garbage tag during connect"; 932 return -1; 933 } 934 return 0; 935 } 936 937 /* 938 * read (part of) an ack 939 */ 940 static int read_partial_ack(struct ceph_connection *con) 941 { 942 int size = sizeof(con->v1.in_temp_ack); 943 int end = size; 944 945 return read_partial(con, end, size, &con->v1.in_temp_ack); 946 } 947 948 /* 949 * We can finally discard anything that's been acked. 950 */ 951 static void process_ack(struct ceph_connection *con) 952 { 953 u64 ack = le64_to_cpu(con->v1.in_temp_ack); 954 955 if (con->v1.in_tag == CEPH_MSGR_TAG_ACK) 956 ceph_con_discard_sent(con, ack); 957 else 958 ceph_con_discard_requeued(con, ack); 959 960 prepare_read_tag(con); 961 } 962 963 static int read_partial_message_chunk(struct ceph_connection *con, 964 struct kvec *section, 965 unsigned int sec_len, u32 *crc) 966 { 967 int ret, left; 968 969 BUG_ON(!section); 970 971 while (section->iov_len < sec_len) { 972 BUG_ON(section->iov_base == NULL); 973 left = sec_len - section->iov_len; 974 ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base + 975 section->iov_len, left); 976 if (ret <= 0) 977 return ret; 978 section->iov_len += ret; 979 } 980 if (section->iov_len == sec_len) 981 *crc = crc32c(*crc, section->iov_base, section->iov_len); 982 983 return 1; 984 } 985 986 static inline int read_partial_message_section(struct ceph_connection *con, 987 struct kvec *section, 988 unsigned int sec_len, u32 *crc) 989 { 990 *crc = 0; 991 return read_partial_message_chunk(con, section, sec_len, crc); 992 } 993 994 static int read_sparse_msg_extent(struct ceph_connection *con, u32 *crc) 995 { 996 struct ceph_msg_data_cursor *cursor = &con->in_msg->cursor; 997 bool do_bounce = ceph_test_opt(from_msgr(con->msgr), RXBOUNCE); 998 999 if (do_bounce && unlikely(!con->bounce_page)) { 1000 con->bounce_page = alloc_page(GFP_NOIO); 1001 if (!con->bounce_page) { 1002 pr_err("failed to allocate bounce page\n"); 1003 return -ENOMEM; 1004 } 1005 } 1006 1007 while (cursor->sr_resid > 0) { 1008 struct page *page, *rpage; 1009 size_t off, len; 1010 int ret; 1011 1012 page = ceph_msg_data_next(cursor, &off, &len); 1013 rpage = do_bounce ? con->bounce_page : page; 1014 1015 /* clamp to what remains in extent */ 1016 len = min_t(int, len, cursor->sr_resid); 1017 ret = ceph_tcp_recvpage(con->sock, rpage, (int)off, len); 1018 if (ret <= 0) 1019 return ret; 1020 *crc = ceph_crc32c_page(*crc, rpage, off, ret); 1021 ceph_msg_data_advance(cursor, (size_t)ret); 1022 cursor->sr_resid -= ret; 1023 if (do_bounce) 1024 memcpy_page(page, off, rpage, off, ret); 1025 } 1026 return 1; 1027 } 1028 1029 static int read_sparse_msg_data(struct ceph_connection *con) 1030 { 1031 struct ceph_msg_data_cursor *cursor = &con->in_msg->cursor; 1032 bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); 1033 u32 crc = 0; 1034 int ret = 1; 1035 1036 if (do_datacrc) 1037 crc = con->in_data_crc; 1038 1039 do { 1040 if (con->v1.in_sr_kvec.iov_base) 1041 ret = read_partial_message_chunk(con, 1042 &con->v1.in_sr_kvec, 1043 con->v1.in_sr_len, 1044 &crc); 1045 else if (cursor->sr_resid > 0) 1046 ret = read_sparse_msg_extent(con, &crc); 1047 1048 if (ret <= 0) { 1049 if (do_datacrc) 1050 con->in_data_crc = crc; 1051 return ret; 1052 } 1053 1054 memset(&con->v1.in_sr_kvec, 0, sizeof(con->v1.in_sr_kvec)); 1055 ret = con->ops->sparse_read(con, cursor, 1056 (char **)&con->v1.in_sr_kvec.iov_base); 1057 con->v1.in_sr_len = ret; 1058 } while (ret > 0); 1059 1060 if (do_datacrc) 1061 con->in_data_crc = crc; 1062 1063 return ret < 0 ? ret : 1; /* must return > 0 to indicate success */ 1064 } 1065 1066 static int read_partial_msg_data(struct ceph_connection *con) 1067 { 1068 struct ceph_msg_data_cursor *cursor = &con->in_msg->cursor; 1069 bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); 1070 struct page *page; 1071 size_t page_offset; 1072 size_t length; 1073 u32 crc = 0; 1074 int ret; 1075 1076 if (do_datacrc) 1077 crc = con->in_data_crc; 1078 while (cursor->total_resid) { 1079 if (!cursor->resid) { 1080 ceph_msg_data_advance(cursor, 0); 1081 continue; 1082 } 1083 1084 page = ceph_msg_data_next(cursor, &page_offset, &length); 1085 ret = ceph_tcp_recvpage(con->sock, page, page_offset, length); 1086 if (ret <= 0) { 1087 if (do_datacrc) 1088 con->in_data_crc = crc; 1089 1090 return ret; 1091 } 1092 1093 if (do_datacrc) 1094 crc = ceph_crc32c_page(crc, page, page_offset, ret); 1095 ceph_msg_data_advance(cursor, (size_t)ret); 1096 } 1097 if (do_datacrc) 1098 con->in_data_crc = crc; 1099 1100 return 1; /* must return > 0 to indicate success */ 1101 } 1102 1103 static int read_partial_msg_data_bounce(struct ceph_connection *con) 1104 { 1105 struct ceph_msg_data_cursor *cursor = &con->in_msg->cursor; 1106 struct page *page; 1107 size_t off, len; 1108 u32 crc; 1109 int ret; 1110 1111 if (unlikely(!con->bounce_page)) { 1112 con->bounce_page = alloc_page(GFP_NOIO); 1113 if (!con->bounce_page) { 1114 pr_err("failed to allocate bounce page\n"); 1115 return -ENOMEM; 1116 } 1117 } 1118 1119 crc = con->in_data_crc; 1120 while (cursor->total_resid) { 1121 if (!cursor->resid) { 1122 ceph_msg_data_advance(cursor, 0); 1123 continue; 1124 } 1125 1126 page = ceph_msg_data_next(cursor, &off, &len); 1127 ret = ceph_tcp_recvpage(con->sock, con->bounce_page, 0, len); 1128 if (ret <= 0) { 1129 con->in_data_crc = crc; 1130 return ret; 1131 } 1132 1133 crc = crc32c(crc, page_address(con->bounce_page), ret); 1134 memcpy_to_page(page, off, page_address(con->bounce_page), ret); 1135 1136 ceph_msg_data_advance(cursor, ret); 1137 } 1138 con->in_data_crc = crc; 1139 1140 return 1; /* must return > 0 to indicate success */ 1141 } 1142 1143 /* 1144 * read (part of) a message. 1145 */ 1146 static int read_partial_message(struct ceph_connection *con) 1147 { 1148 struct ceph_msg *m = con->in_msg; 1149 int size; 1150 int end; 1151 int ret; 1152 unsigned int front_len, middle_len, data_len; 1153 bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); 1154 bool need_sign = (con->peer_features & CEPH_FEATURE_MSG_AUTH); 1155 u64 seq; 1156 u32 crc; 1157 1158 dout("read_partial_message con %p msg %p\n", con, m); 1159 1160 /* header */ 1161 size = sizeof(con->v1.in_hdr); 1162 end = size; 1163 ret = read_partial(con, end, size, &con->v1.in_hdr); 1164 if (ret <= 0) 1165 return ret; 1166 1167 crc = crc32c(0, &con->v1.in_hdr, offsetof(struct ceph_msg_header, crc)); 1168 if (cpu_to_le32(crc) != con->v1.in_hdr.crc) { 1169 pr_err("read_partial_message bad hdr crc %u != expected %u\n", 1170 crc, con->v1.in_hdr.crc); 1171 return -EBADMSG; 1172 } 1173 1174 front_len = le32_to_cpu(con->v1.in_hdr.front_len); 1175 if (front_len > CEPH_MSG_MAX_FRONT_LEN) 1176 return -EIO; 1177 middle_len = le32_to_cpu(con->v1.in_hdr.middle_len); 1178 if (middle_len > CEPH_MSG_MAX_MIDDLE_LEN) 1179 return -EIO; 1180 data_len = le32_to_cpu(con->v1.in_hdr.data_len); 1181 if (data_len > CEPH_MSG_MAX_DATA_LEN) 1182 return -EIO; 1183 1184 /* verify seq# */ 1185 seq = le64_to_cpu(con->v1.in_hdr.seq); 1186 if ((s64)seq - (s64)con->in_seq < 1) { 1187 pr_info("skipping %s%lld %s seq %lld expected %lld\n", 1188 ENTITY_NAME(con->peer_name), 1189 ceph_pr_addr(&con->peer_addr), 1190 seq, con->in_seq + 1); 1191 con->v1.in_base_pos = -front_len - middle_len - data_len - 1192 sizeof_footer(con); 1193 con->v1.in_tag = CEPH_MSGR_TAG_READY; 1194 return 1; 1195 } else if ((s64)seq - (s64)con->in_seq > 1) { 1196 pr_err("read_partial_message bad seq %lld expected %lld\n", 1197 seq, con->in_seq + 1); 1198 con->error_msg = "bad message sequence # for incoming message"; 1199 return -EBADE; 1200 } 1201 1202 /* allocate message? */ 1203 if (!con->in_msg) { 1204 int skip = 0; 1205 1206 dout("got hdr type %d front %d data %d\n", con->v1.in_hdr.type, 1207 front_len, data_len); 1208 ret = ceph_con_in_msg_alloc(con, &con->v1.in_hdr, &skip); 1209 if (ret < 0) 1210 return ret; 1211 1212 BUG_ON((!con->in_msg) ^ skip); 1213 if (skip) { 1214 /* skip this message */ 1215 dout("alloc_msg said skip message\n"); 1216 con->v1.in_base_pos = -front_len - middle_len - 1217 data_len - sizeof_footer(con); 1218 con->v1.in_tag = CEPH_MSGR_TAG_READY; 1219 con->in_seq++; 1220 return 1; 1221 } 1222 1223 BUG_ON(!con->in_msg); 1224 BUG_ON(con->in_msg->con != con); 1225 m = con->in_msg; 1226 m->front.iov_len = 0; /* haven't read it yet */ 1227 if (m->middle) 1228 m->middle->vec.iov_len = 0; 1229 1230 /* prepare for data payload, if any */ 1231 1232 if (data_len) 1233 prepare_message_data(con->in_msg, data_len); 1234 } 1235 1236 /* front */ 1237 ret = read_partial_message_section(con, &m->front, front_len, 1238 &con->in_front_crc); 1239 if (ret <= 0) 1240 return ret; 1241 1242 /* middle */ 1243 if (m->middle) { 1244 ret = read_partial_message_section(con, &m->middle->vec, 1245 middle_len, 1246 &con->in_middle_crc); 1247 if (ret <= 0) 1248 return ret; 1249 } 1250 1251 /* (page) data */ 1252 if (data_len) { 1253 if (!m->num_data_items) 1254 return -EIO; 1255 1256 if (m->sparse_read) 1257 ret = read_sparse_msg_data(con); 1258 else if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) 1259 ret = read_partial_msg_data_bounce(con); 1260 else 1261 ret = read_partial_msg_data(con); 1262 if (ret <= 0) 1263 return ret; 1264 } 1265 1266 /* footer */ 1267 size = sizeof_footer(con); 1268 end += size; 1269 ret = read_partial(con, end, size, &m->footer); 1270 if (ret <= 0) 1271 return ret; 1272 1273 if (!need_sign) { 1274 m->footer.flags = m->old_footer.flags; 1275 m->footer.sig = 0; 1276 } 1277 1278 dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n", 1279 m, front_len, m->footer.front_crc, middle_len, 1280 m->footer.middle_crc, data_len, m->footer.data_crc); 1281 1282 /* crc ok? */ 1283 if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) { 1284 pr_err("read_partial_message %p front crc %u != exp. %u\n", 1285 m, con->in_front_crc, m->footer.front_crc); 1286 return -EBADMSG; 1287 } 1288 if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) { 1289 pr_err("read_partial_message %p middle crc %u != exp %u\n", 1290 m, con->in_middle_crc, m->footer.middle_crc); 1291 return -EBADMSG; 1292 } 1293 if (do_datacrc && 1294 (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 && 1295 con->in_data_crc != le32_to_cpu(m->footer.data_crc)) { 1296 pr_err("read_partial_message %p data crc %u != exp. %u\n", m, 1297 con->in_data_crc, le32_to_cpu(m->footer.data_crc)); 1298 return -EBADMSG; 1299 } 1300 1301 if (need_sign && con->ops->check_message_signature && 1302 con->ops->check_message_signature(m)) { 1303 pr_err("read_partial_message %p signature check failed\n", m); 1304 return -EBADMSG; 1305 } 1306 1307 return 1; /* done! */ 1308 } 1309 1310 static int read_keepalive_ack(struct ceph_connection *con) 1311 { 1312 struct ceph_timespec ceph_ts; 1313 size_t size = sizeof(ceph_ts); 1314 int ret = read_partial(con, size, size, &ceph_ts); 1315 if (ret <= 0) 1316 return ret; 1317 ceph_decode_timespec64(&con->last_keepalive_ack, &ceph_ts); 1318 prepare_read_tag(con); 1319 return 1; 1320 } 1321 1322 /* 1323 * Read what we can from the socket. 1324 */ 1325 int ceph_con_v1_try_read(struct ceph_connection *con) 1326 { 1327 int ret = -1; 1328 1329 more: 1330 dout("try_read start %p state %d\n", con, con->state); 1331 if (con->state != CEPH_CON_S_V1_BANNER && 1332 con->state != CEPH_CON_S_V1_CONNECT_MSG && 1333 con->state != CEPH_CON_S_OPEN) 1334 return 0; 1335 1336 BUG_ON(!con->sock); 1337 1338 dout("try_read tag %d in_base_pos %d\n", con->v1.in_tag, 1339 con->v1.in_base_pos); 1340 1341 if (con->state == CEPH_CON_S_V1_BANNER) { 1342 ret = read_partial_banner(con); 1343 if (ret <= 0) 1344 goto out; 1345 ret = process_banner(con); 1346 if (ret < 0) 1347 goto out; 1348 1349 con->state = CEPH_CON_S_V1_CONNECT_MSG; 1350 1351 /* 1352 * Received banner is good, exchange connection info. 1353 * Do not reset out_kvec, as sending our banner raced 1354 * with receiving peer banner after connect completed. 1355 */ 1356 ret = prepare_write_connect(con); 1357 if (ret < 0) 1358 goto out; 1359 prepare_read_connect(con); 1360 1361 /* Send connection info before awaiting response */ 1362 goto out; 1363 } 1364 1365 if (con->state == CEPH_CON_S_V1_CONNECT_MSG) { 1366 ret = read_partial_connect(con); 1367 if (ret <= 0) 1368 goto out; 1369 ret = process_connect(con); 1370 if (ret < 0) 1371 goto out; 1372 goto more; 1373 } 1374 1375 WARN_ON(con->state != CEPH_CON_S_OPEN); 1376 1377 if (con->v1.in_base_pos < 0) { 1378 /* 1379 * skipping + discarding content. 1380 */ 1381 ret = ceph_tcp_recvmsg(con->sock, NULL, -con->v1.in_base_pos); 1382 if (ret <= 0) 1383 goto out; 1384 dout("skipped %d / %d bytes\n", ret, -con->v1.in_base_pos); 1385 con->v1.in_base_pos += ret; 1386 if (con->v1.in_base_pos) 1387 goto more; 1388 } 1389 if (con->v1.in_tag == CEPH_MSGR_TAG_READY) { 1390 /* 1391 * what's next? 1392 */ 1393 ret = ceph_tcp_recvmsg(con->sock, &con->v1.in_tag, 1); 1394 if (ret <= 0) 1395 goto out; 1396 dout("try_read got tag %d\n", con->v1.in_tag); 1397 switch (con->v1.in_tag) { 1398 case CEPH_MSGR_TAG_MSG: 1399 prepare_read_message(con); 1400 break; 1401 case CEPH_MSGR_TAG_ACK: 1402 prepare_read_ack(con); 1403 break; 1404 case CEPH_MSGR_TAG_KEEPALIVE2_ACK: 1405 prepare_read_keepalive_ack(con); 1406 break; 1407 case CEPH_MSGR_TAG_CLOSE: 1408 ceph_con_close_socket(con); 1409 con->state = CEPH_CON_S_CLOSED; 1410 goto out; 1411 default: 1412 goto bad_tag; 1413 } 1414 } 1415 if (con->v1.in_tag == CEPH_MSGR_TAG_MSG) { 1416 ret = read_partial_message(con); 1417 if (ret <= 0) { 1418 switch (ret) { 1419 case -EBADMSG: 1420 con->error_msg = "bad crc/signature"; 1421 fallthrough; 1422 case -EBADE: 1423 ret = -EIO; 1424 break; 1425 case -EIO: 1426 con->error_msg = "io error"; 1427 break; 1428 } 1429 goto out; 1430 } 1431 if (con->v1.in_tag == CEPH_MSGR_TAG_READY) 1432 goto more; 1433 ceph_con_process_message(con); 1434 if (con->state == CEPH_CON_S_OPEN) 1435 prepare_read_tag(con); 1436 goto more; 1437 } 1438 if (con->v1.in_tag == CEPH_MSGR_TAG_ACK || 1439 con->v1.in_tag == CEPH_MSGR_TAG_SEQ) { 1440 /* 1441 * the final handshake seq exchange is semantically 1442 * equivalent to an ACK 1443 */ 1444 ret = read_partial_ack(con); 1445 if (ret <= 0) 1446 goto out; 1447 process_ack(con); 1448 goto more; 1449 } 1450 if (con->v1.in_tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) { 1451 ret = read_keepalive_ack(con); 1452 if (ret <= 0) 1453 goto out; 1454 goto more; 1455 } 1456 1457 out: 1458 dout("try_read done on %p ret %d\n", con, ret); 1459 return ret; 1460 1461 bad_tag: 1462 pr_err("try_read bad tag %d\n", con->v1.in_tag); 1463 con->error_msg = "protocol error, garbage tag"; 1464 ret = -1; 1465 goto out; 1466 } 1467 1468 /* 1469 * Write something to the socket. Called in a worker thread when the 1470 * socket appears to be writeable and we have something ready to send. 1471 */ 1472 int ceph_con_v1_try_write(struct ceph_connection *con) 1473 { 1474 int ret = 1; 1475 1476 dout("try_write start %p state %d\n", con, con->state); 1477 if (con->state != CEPH_CON_S_PREOPEN && 1478 con->state != CEPH_CON_S_V1_BANNER && 1479 con->state != CEPH_CON_S_V1_CONNECT_MSG && 1480 con->state != CEPH_CON_S_OPEN) 1481 return 0; 1482 1483 /* open the socket first? */ 1484 if (con->state == CEPH_CON_S_PREOPEN) { 1485 BUG_ON(con->sock); 1486 con->state = CEPH_CON_S_V1_BANNER; 1487 1488 con_out_kvec_reset(con); 1489 prepare_write_banner(con); 1490 prepare_read_banner(con); 1491 1492 BUG_ON(con->in_msg); 1493 con->v1.in_tag = CEPH_MSGR_TAG_READY; 1494 dout("try_write initiating connect on %p new state %d\n", 1495 con, con->state); 1496 ret = ceph_tcp_connect(con); 1497 if (ret < 0) { 1498 con->error_msg = "connect error"; 1499 goto out; 1500 } 1501 } 1502 1503 more: 1504 dout("try_write out_kvec_bytes %d\n", con->v1.out_kvec_bytes); 1505 BUG_ON(!con->sock); 1506 1507 /* kvec data queued? */ 1508 if (con->v1.out_kvec_left) { 1509 ret = write_partial_kvec(con); 1510 if (ret <= 0) 1511 goto out; 1512 } 1513 if (con->v1.out_skip) { 1514 ret = write_partial_skip(con); 1515 if (ret <= 0) 1516 goto out; 1517 } 1518 1519 /* msg pages? */ 1520 if (con->out_msg) { 1521 if (con->v1.out_msg_done) { 1522 ceph_msg_put(con->out_msg); 1523 con->out_msg = NULL; /* we're done with this one */ 1524 goto do_next; 1525 } 1526 1527 ret = write_partial_message_data(con); 1528 if (ret == 1) 1529 goto more; /* we need to send the footer, too! */ 1530 if (ret == 0) 1531 goto out; 1532 if (ret < 0) { 1533 dout("try_write write_partial_message_data err %d\n", 1534 ret); 1535 goto out; 1536 } 1537 } 1538 1539 do_next: 1540 if (con->state == CEPH_CON_S_OPEN) { 1541 if (ceph_con_flag_test_and_clear(con, 1542 CEPH_CON_F_KEEPALIVE_PENDING)) { 1543 prepare_write_keepalive(con); 1544 goto more; 1545 } 1546 /* is anything else pending? */ 1547 if (!list_empty(&con->out_queue)) { 1548 prepare_write_message(con); 1549 goto more; 1550 } 1551 if (con->in_seq > con->in_seq_acked) { 1552 prepare_write_ack(con); 1553 goto more; 1554 } 1555 } 1556 1557 /* Nothing to do! */ 1558 ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING); 1559 dout("try_write nothing else to write.\n"); 1560 ret = 0; 1561 out: 1562 dout("try_write done on %p ret %d\n", con, ret); 1563 return ret; 1564 } 1565 1566 void ceph_con_v1_revoke(struct ceph_connection *con) 1567 { 1568 struct ceph_msg *msg = con->out_msg; 1569 1570 WARN_ON(con->v1.out_skip); 1571 /* footer */ 1572 if (con->v1.out_msg_done) { 1573 con->v1.out_skip += con_out_kvec_skip(con); 1574 } else { 1575 WARN_ON(!msg->data_length); 1576 con->v1.out_skip += sizeof_footer(con); 1577 } 1578 /* data, middle, front */ 1579 if (msg->data_length) 1580 con->v1.out_skip += msg->cursor.total_resid; 1581 if (msg->middle) 1582 con->v1.out_skip += con_out_kvec_skip(con); 1583 con->v1.out_skip += con_out_kvec_skip(con); 1584 1585 dout("%s con %p out_kvec_bytes %d out_skip %d\n", __func__, con, 1586 con->v1.out_kvec_bytes, con->v1.out_skip); 1587 } 1588 1589 void ceph_con_v1_revoke_incoming(struct ceph_connection *con) 1590 { 1591 unsigned int front_len = le32_to_cpu(con->v1.in_hdr.front_len); 1592 unsigned int middle_len = le32_to_cpu(con->v1.in_hdr.middle_len); 1593 unsigned int data_len = le32_to_cpu(con->v1.in_hdr.data_len); 1594 1595 /* skip rest of message */ 1596 con->v1.in_base_pos = con->v1.in_base_pos - 1597 sizeof(struct ceph_msg_header) - 1598 front_len - 1599 middle_len - 1600 data_len - 1601 sizeof(struct ceph_msg_footer); 1602 1603 con->v1.in_tag = CEPH_MSGR_TAG_READY; 1604 con->in_seq++; 1605 1606 dout("%s con %p in_base_pos %d\n", __func__, con, con->v1.in_base_pos); 1607 } 1608 1609 bool ceph_con_v1_opened(struct ceph_connection *con) 1610 { 1611 return con->v1.connect_seq; 1612 } 1613 1614 void ceph_con_v1_reset_session(struct ceph_connection *con) 1615 { 1616 con->v1.connect_seq = 0; 1617 con->v1.peer_global_seq = 0; 1618 } 1619 1620 void ceph_con_v1_reset_protocol(struct ceph_connection *con) 1621 { 1622 con->v1.out_skip = 0; 1623 } 1624